GitHub Repository: hrydgard/ppsspp
Path: blob/master/Core/MIPS/x86/RegCacheFPU.cpp
// Copyright (c) 2012- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#include "ppsspp_config.h"
#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)

#include <cstring>
#include "Common/Math/SIMDHeaders.h"
#include "Common/Log.h"
#include "Common/x64Emitter.h"
#include "Core/MIPS/MIPSAnalyst.h"
#include "Core/MIPS/x86/Jit.h"
#include "Core/MIPS/x86/RegCache.h"
#include "Core/MIPS/x86/RegCacheFPU.h"

using namespace Gen;
using namespace X64JitConstants;

FPURegCache::FPURegCache() {
	vregs = regs + 32;
}

void FPURegCache::Start(MIPSState *mipsState, MIPSComp::JitState *js, MIPSComp::JitOptions *jo, MIPSAnalyst::AnalysisResults &stats, bool useRip) {
	mips_ = mipsState;
	useRip_ = useRip;
	if (!initialReady) {
		SetupInitialRegs();
		initialReady = true;
	}

	memcpy(xregs, xregsInitial, sizeof(xregs));
	memcpy(regs, regsInitial, sizeof(regs));
	pendingFlush = false;

	js_ = js;
	jo_ = jo;
}
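// Register index layout used throughout this cache: entries 0-31 are the MIPS FPRs,
// entries 32-159 are the 128 VFPU registers (vregs points at regs + 32), and the
// remaining entries up to NUM_MIPS_FPRS are JIT temporaries.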
void FPURegCache::SetupInitialRegs() {
	for (int i = 0; i < NUM_X_FPREGS; i++) {
		memset(xregsInitial[i].mipsRegs, -1, sizeof(xregsInitial[i].mipsRegs));
		xregsInitial[i].dirty = false;
	}
	memset(regsInitial, 0, sizeof(regsInitial));
	OpArg base = GetDefaultLocation(0);
	for (int i = 0; i < 32; i++) {
		regsInitial[i].location = base;
		base.IncreaseOffset(sizeof(float));
	}
	for (int i = 32; i < 32 + 128; i++) {
		regsInitial[i].location = GetDefaultLocation(i);
	}
	base = GetDefaultLocation(32 + 128);
	for (int i = 32 + 128; i < NUM_MIPS_FPRS; i++) {
		regsInitial[i].location = base;
		base.IncreaseOffset(sizeof(float));
	}
}

void FPURegCache::SpillLock(int p1, int p2, int p3, int p4) {
	regs[p1].locked++;
	if (p2 != 0xFF) regs[p2].locked++;
	if (p3 != 0xFF) regs[p3].locked++;
	if (p4 != 0xFF) regs[p4].locked++;
}

void FPURegCache::SpillLockV(const u8 *vec, VectorSize sz) {
	for (int i = 0; i < GetNumVectorElements(sz); i++) {
		vregs[vec[i]].locked++;
	}
}

void FPURegCache::SpillLockV(int vec, VectorSize sz) {
	u8 r[4];
	GetVectorRegs(r, sz, vec);
	SpillLockV(r, sz);
}

void FPURegCache::ReleaseSpillLockV(const u8 *vec, VectorSize sz) {
	for (int i = 0; i < GetNumVectorElements(sz); i++) {
		vregs[vec[i]].locked = 0;
	}
}

void FPURegCache::ReduceSpillLock(int mipsreg) {
	regs[mipsreg].locked--;
}

void FPURegCache::ReduceSpillLockV(const u8 *vec, VectorSize sz) {
	for (int i = 0; i < GetNumVectorElements(sz); i++) {
		vregs[vec[i]].locked--;
	}
}
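// Moves the value cached for oldreg into newreg's slot: oldreg is written back to memory,
// any existing mapping for newreg is discarded, and newreg then takes over the XMM register.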
void FPURegCache::FlushRemap(int oldreg, int newreg) {
	OpArg oldLocation = regs[oldreg].location;
	_assert_msg_(oldLocation.IsSimpleReg(), "FlushRemap: Must already be in an x86 SSE register");
	_assert_msg_(regs[oldreg].lane == 0, "FlushRemap only supports FPR registers");

	X64Reg xr = oldLocation.GetSimpleReg();
	if (oldreg == newreg) {
		xregs[xr].dirty = true;
		return;
	}

	StoreFromRegister(oldreg);

	// Now, if newreg already was mapped somewhere, get rid of that.
	DiscardR(newreg);

	// Now, take over the old register.
	regs[newreg].location = oldLocation;
	regs[newreg].away = true;
	regs[newreg].locked = true;
	regs[newreg].lane = 0;
	xregs[xr].mipsReg = newreg;
	xregs[xr].dirty = true;
}
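// The V variants below address VFPU registers (index + 32 in the unified reg array).
// MAP_NOINIT skips loading the current value from memory, MAP_DIRTY marks the mapped
// register as needing a write-back, and MAP_NOLOCK drops the spill lock again after mapping.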
void FPURegCache::MapRegV(int vreg, int flags) {
	MapReg(vreg + 32, (flags & MAP_NOINIT) != MAP_NOINIT, (flags & MAP_DIRTY) != 0);
}

void FPURegCache::MapRegsV(int vec, VectorSize sz, int flags) {
	u8 r[4];
	GetVectorRegs(r, sz, vec);
	SpillLockV(r, sz);
	for (int i = 0; i < GetNumVectorElements(sz); i++) {
		MapReg(r[i] + 32, (flags & MAP_NOINIT) != MAP_NOINIT, (flags & MAP_DIRTY) != 0);
	}
	if ((flags & MAP_NOLOCK) != 0) {
		// We have to lock so the sz won't spill, so we unlock after.
		// If they were already locked, we only reduce the lock we added above.
		ReduceSpillLockV(r, sz);
	}
}

void FPURegCache::MapRegsV(const u8 *r, VectorSize sz, int flags) {
	SpillLockV(r, sz);
	for (int i = 0; i < GetNumVectorElements(sz); i++) {
		MapReg(r[i] + 32, (flags & MAP_NOINIT) != MAP_NOINIT, (flags & MAP_DIRTY) != 0);
	}
	if ((flags & MAP_NOLOCK) != 0) {
		// We have to lock so the sz won't spill, so we unlock after.
		// If they were already locked, we only reduce the lock we added above.
		ReduceSpillLockV(r, sz);
	}
}
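// For SIMD (VS) mappings, lane is 1-based: lane == i + 1 means the vreg lives in
// element i of the XMM register, while lane == 0 means a plain scalar mapping.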
bool FPURegCache::IsMappedVS(const u8 *v, VectorSize vsz) {
	const int n = GetNumVectorElements(vsz);

	// Make sure the first reg is at least mapped in the right place.
	if (!IsMappedVS(v[0]))
		return false;
	if (vregs[v[0]].lane != 1)
		return false;

	// And make sure the rest are mapped to the same reg in the right positions.
	X64Reg xr = VSX(v);
	for (int i = 1; i < n; ++i) {
		u8 vi = v[i];
		if (!IsMappedVS(vi) || VSX(&vi) != xr)
			return false;
		if (vregs[vi].lane != i + 1)
			return false;
	}
	// TODO: Optimize this case? It happens.
	for (int i = n; i < 4; ++i) {
		if (xregs[xr].mipsRegs[i] != -1) {
			return false;
		}
	}
	return true;
}

void FPURegCache::MapRegsVS(const u8 *r, VectorSize vsz, int flags) {
	const int n = GetNumVectorElements(vsz);

	_dbg_assert_msg_(jo_->enableVFPUSIMD, "Should not map simd regs when option is off.");

	if (!TryMapRegsVS(r, vsz, flags)) {
		// TODO: Could be more optimal.
		for (int i = 0; i < n; ++i) {
			StoreFromRegisterV(r[i]);
		}
		if (!TryMapRegsVS(r, vsz, flags)) {
			_dbg_assert_msg_(false, "MapRegsVS() failed on second try.");
		}
	}
}
bool FPURegCache::CanMapVS(const u8 *v, VectorSize vsz) {
	const int n = GetNumVectorElements(vsz);

	if (!jo_->enableVFPUSIMD) {
		return false;
	}

	if (IsMappedVS(v, vsz)) {
		return true;
	} else if (vregs[v[0]].lane != 0) {
		const MIPSCachedFPReg &v0 = vregs[v[0]];
		_dbg_assert_msg_(v0.away, "Must be away when lane != 0");
		_dbg_assert_msg_(v0.location.IsSimpleReg(), "Must be in a register when lane != 0");

		// Already in a different simd set.
		return false;
	}

	if (vregs[v[0]].locked) {
		// If it's locked, we can't mess with it.
		return false;
	}

	// Next, fail if any of the other regs are in simd currently.
	// TODO: Only if locked? Not sure if it will be worth breaking them anyway.
	for (int i = 1; i < n; ++i) {
		if (vregs[v[i]].lane != 0) {
			return false;
		}
		// If it's locked, in simd or not, we can't use it.
		if (vregs[v[i]].locked) {
			return false;
		}
		_assert_msg_(!vregs[v[i]].location.IsImm(), "Cannot handle imms in fp cache.");
	}

	return true;
}
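// Attempts to map the n vregs in v into a single XMM register. If they are already
// mapped as a SIMD group it only updates dirty/lock state; a single element is mapped
// normally but tracked as lane 1; otherwise the values are gathered via LoadRegsVS
// (or a free register is grabbed for MAP_NOINIT) and the bookkeeping is rewritten.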
bool FPURegCache::TryMapRegsVS(const u8 *v, VectorSize vsz, int flags) {
	const int n = GetNumVectorElements(vsz);

	if (!CanMapVS(v, vsz)) {
		return false;
	}

	if (IsMappedVS(v, vsz)) {
		// Already mapped then, perfect. Just mark dirty.
		if ((flags & MAP_DIRTY) != 0)
			xregs[VSX(v)].dirty = true;
		if ((flags & MAP_NOLOCK) == 0)
			SpillLockV(v, vsz);
		return true;
	}

	// At this point, some or all are in single regs or memory, and they're not locked there.

	if (n == 1) {
		// Single is easy, just map normally but track as a SIMD reg.
		// This way V/VS can warn about improper usage properly.
		MapRegV(v[0], flags);
		X64Reg vx = VX(v[0]);
		if (vx == INVALID_REG)
			return false;

		vregs[v[0]].lane = 1;
		if ((flags & MAP_DIRTY) != 0)
			xregs[vx].dirty = true;
		if ((flags & MAP_NOLOCK) == 0)
			SpillLockV(v, vsz);
		Invariant();
		return true;
	}

	X64Reg xr;
	if ((flags & MAP_NOINIT) != MAP_NOINIT) {
		xr = LoadRegsVS(v, n);
	} else {
		xr = GetFreeXReg();
	}

	// Victory, now let's clean up everything.
	OpArg newloc = Gen::R(xr);
	bool dirty = (flags & MAP_DIRTY) != 0;
	for (int i = 0; i < n; ++i) {
		MIPSCachedFPReg &vr = vregs[v[i]];
		if (vr.away) {
			// Clear the xreg it was in before.
			X64Reg oldXReg = vr.location.GetSimpleReg();
			if (oldXReg != xr) {
				xregs[oldXReg].mipsReg = -1;
			}
			if (xregs[oldXReg].dirty) {
				// Inherit the "dirtiness" (ultimately set below for all regs.)
				dirty = true;
				xregs[oldXReg].dirty = false;
			}
		}
		xregs[xr].mipsRegs[i] = v[i] + 32;
		vr.location = newloc;
		vr.lane = i + 1;
		vr.away = true;
	}
	xregs[xr].dirty = dirty;

	if ((flags & MAP_NOLOCK) == 0) {
		SpillLockV(v, vsz);
	}

	Invariant();
	return true;
}
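// Gathers the n vregs into one XMM register. If their memory locations are sequential
// and not everything is already in registers, a single MOVQ/MOVAPS/MOVUPS load is used;
// otherwise the vector is assembled from scalar MOVSS loads combined with UNPCKLPS/SHUFPS.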
X64Reg FPURegCache::LoadRegsVS(const u8 *v, int n) {
	int regsAvail = 0;
	int regsLoaded = 0;
	X64Reg xrs[4] = {INVALID_REG, INVALID_REG, INVALID_REG, INVALID_REG};
	bool xrsLoaded[4] = {false, false, false, false};

	_dbg_assert_msg_(n >= 2 && n <= 4, "LoadRegsVS is only implemented for simd loads.");

	for (int i = 0; i < n; ++i) {
		const MIPSCachedFPReg &mr = vregs[v[i]];
		if (mr.away) {
			X64Reg mrx = mr.location.GetSimpleReg();
			// If it's not simd, or lanes 1+ are clear, we can use it.
			if (mr.lane == 0 || xregs[mrx].mipsRegs[1] == -1) {
				// Okay, there's nothing else in this reg, so we can use it.
				xrsLoaded[i] = true;
				xrs[i] = mrx;
				++regsLoaded;
				++regsAvail;
			} else if (mr.lane != 0) {
				_dbg_assert_msg_(false, "LoadRegsVS is not able to handle simd remapping yet, store first.");
			}
		}
	}

	if (regsAvail < n) {
		// Try to grab some without spilling.
		X64Reg xrFree[4];
		int obtained = GetFreeXRegs(xrFree, n - regsAvail, false);
		int pos = 0;
		for (int i = 0; i < n && pos < obtained; ++i) {
			if (xrs[i] == INVALID_REG) {
				// Okay, it's not loaded but we have a reg for this slot.
				xrs[i] = xrFree[pos++];
				++regsAvail;
			}
		}
	}

	// Let's also check if the memory addresses are sequential.
	int sequential = 1;
	for (int i = 1; i < n; ++i) {
		if (v[i] < 128 && v[i - 1] < 128) {
			if (voffset[v[i]] != voffset[v[i - 1]] + 1) {
				break;
			}
		} else if (v[i] >= 128 && v[i - 1] >= 128) {
			if (v[i] != v[i - 1] + 1) {
				break;
			}
		} else {
			// Temps can't be sequential with non-temps.
			break;
		}
		++sequential;
	}

	// Did we end up with enough regs?
	// TODO: Not handling the case of some regs avail and some loaded right now.
	if (regsAvail < n && (sequential != n || regsLoaded == n || regsAvail == 0)) {
		regsAvail = GetFreeXRegs(xrs, 2, true);
		_dbg_assert_msg_(regsAvail >= 2, "Ran out of fp regs for loading simd regs with.");
		_dbg_assert_msg_(xrs[0] != xrs[1], "Regs for simd load are the same, bad things await.");
		// We spilled, so we assume that all our regs are screwed up now anyway.
		for (int i = 0; i < 4; ++i) {
			xrsLoaded[i] = false;
		}
		for (int i = 2; i < n; ++i) {
			xrs[i] = INVALID_REG;
		}
		regsLoaded = 0;
	}

	// If they're sequential, and we wouldn't need to store them all, use a single load.
	// But if they're already loaded, we'd have to store, not worth it.
	X64Reg res = INVALID_REG;
	if (sequential == n && regsLoaded < n) {
		// TODO: What should we do if some are in regs? Better to assemble?
		for (int i = 0; i < n; ++i) {
			StoreFromRegisterV(v[i]);
		}

		// Grab any available reg.
		for (int i = 0; i < n; ++i) {
			if (xrs[i] != INVALID_REG) {
				res = xrs[i];
				break;
			}
		}
		const float *f = v[0] < 128 ? &mips_->v[voffset[v[0]]] : &mips_->tempValues[v[0] - 128];
		if (((intptr_t)f & 0x7) == 0 && n == 2) {
			emit->MOVQ_xmm(res, vregs[v[0]].location);
		} else if (((intptr_t)f & 0xf) == 0) {
			// On modern processors, MOVUPS on aligned is fast, but maybe not on older ones.
			emit->MOVAPS(res, vregs[v[0]].location);
		} else {
			emit->MOVUPS(res, vregs[v[0]].location);
		}
	} else if (regsAvail >= n) {
		// Have enough regs, potentially all in regs.
		auto loadXR = [&](int l) {
			if (!xrsLoaded[l] && n >= l + 1) {
				emit->MOVSS(xrs[l], vregs[v[l]].location);
			}
		};
		// The order here is intentional.
		loadXR(3);
		loadXR(1);
		loadXR(2);
		loadXR(0);
		if (n == 4) {
			// This gives us [w, y] in the y reg.
			emit->UNPCKLPS(xrs[1], Gen::R(xrs[3]));
		}
		if (n >= 3) {
			// This gives us [z, x]. Then we combine with y.
			emit->UNPCKLPS(xrs[0], Gen::R(xrs[2]));
		}
		if (n >= 2) {
			emit->UNPCKLPS(xrs[0], Gen::R(xrs[1]));
		}
		res = xrs[0];
	} else {
		_dbg_assert_msg_(n > 2, "2 should not be possible here.");

		// Available regs are less than n, and some may be loaded.
		// Let's grab the most optimal unloaded ones.
		X64Reg xr1 = n == 3 ? xrs[1] : xrs[3];
		X64Reg xr2 = xrs[2];
		if (xr1 == INVALID_REG) {
			// Not one of the available ones. Grab another.
			for (int i = n - 1; i >= 0; --i) {
				if (xrs[i] != INVALID_REG && xrs[i] != xr2) {
					StoreFromRegisterV(v[i]);
					xr1 = xrs[i];
					break;
				}
			}
		}
		if (xr2 == INVALID_REG) {
			// Not one of the available ones. Grab another.
			for (int i = n - 1; i >= 0; --i) {
				if (xrs[i] != INVALID_REG && xrs[i] != xr1) {
					StoreFromRegisterV(v[i]);
					xr2 = xrs[i];
					break;
				}
			}
		}

		if (n == 3) {
			if (!vregs[v[2]].location.IsSimpleReg(xr2))
				emit->MOVSS(xr2, vregs[v[2]].location);
			if (!vregs[v[1]].location.IsSimpleReg(xr1))
				emit->MOVSS(xr1, vregs[v[1]].location);
			emit->SHUFPS(xr1, Gen::R(xr2), _MM_SHUFFLE(3, 0, 0, 0));
			emit->MOVSS(xr2, vregs[v[0]].location);
			emit->MOVSS(xr1, Gen::R(xr2));
		} else if (n == 4) {
			if (!vregs[v[2]].location.IsSimpleReg(xr2))
				emit->MOVSS(xr2, vregs[v[2]].location);
			if (!vregs[v[3]].location.IsSimpleReg(xr1))
				emit->MOVSS(xr1, vregs[v[3]].location);
			emit->UNPCKLPS(xr2, Gen::R(xr1));
			emit->MOVSS(xr1, vregs[v[1]].location);
			emit->SHUFPS(xr1, Gen::R(xr2), _MM_SHUFFLE(1, 0, 0, 3));
			emit->MOVSS(xr2, vregs[v[0]].location);
			emit->MOVSS(xr1, Gen::R(xr2));
		}
		res = xr1;
	}

	return res;
}
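// Maps the operands of a vd = op(vs) style instruction as SIMD registers, with
// avoidLoad meaning vd's old contents don't need to be loaded (it will be overwritten).
// The spill locks taken while mapping are released again before returning.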
bool FPURegCache::TryMapDirtyInVS(const u8 *vd, VectorSize vdsz, const u8 *vs, VectorSize vssz, bool avoidLoad) {
	// Don't waste time mapping if some will for sure fail.
	if (!CanMapVS(vd, vdsz) || !CanMapVS(vs, vssz)) {
		return false;
	}
	// But, they could still fail based on overlap. Hopefully not common...
	bool success = TryMapRegsVS(vs, vssz, 0);
	if (success) {
		success = TryMapRegsVS(vd, vdsz, avoidLoad ? MAP_NOINIT : MAP_DIRTY);
	}
	ReleaseSpillLockV(vs, vssz);
	ReleaseSpillLockV(vd, vdsz);

	_dbg_assert_msg_(!success || IsMappedVS(vd, vdsz), "vd should be mapped now");
	_dbg_assert_msg_(!success || IsMappedVS(vs, vssz), "vs should be mapped now");

	return success;
}

bool FPURegCache::TryMapDirtyInInVS(const u8 *vd, VectorSize vdsz, const u8 *vs, VectorSize vssz, const u8 *vt, VectorSize vtsz, bool avoidLoad) {
	// Don't waste time mapping if some will for sure fail.
	if (!CanMapVS(vd, vdsz) || !CanMapVS(vs, vssz) || !CanMapVS(vt, vtsz)) {
		return false;
	}

	// But, they could still fail based on overlap. Hopefully not common...
	bool success = TryMapRegsVS(vs, vssz, 0);
	if (success) {
		success = TryMapRegsVS(vt, vtsz, 0);
	}
	if (success) {
		success = TryMapRegsVS(vd, vdsz, avoidLoad ? MAP_NOINIT : MAP_DIRTY);
	}
	ReleaseSpillLockV(vd, vdsz);
	ReleaseSpillLockV(vs, vssz);
	ReleaseSpillLockV(vt, vtsz);

	_dbg_assert_msg_(!success || IsMappedVS(vd, vdsz), "vd should be mapped now");
	_dbg_assert_msg_(!success || IsMappedVS(vs, vssz), "vs should be mapped now");
	_dbg_assert_msg_(!success || IsMappedVS(vt, vtsz), "vt should be mapped now");

	return success;
}
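// "Simple" here means making sure the given vregs are not tied up in a SIMD mapping,
// so scalar (non-SIMD) code can operate on them: a lone lane is demoted to a scalar
// mapping, anything else is stored back to memory or discarded.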
void FPURegCache::SimpleRegsV(const u8 *v, VectorSize vsz, int flags) {
	const int n = GetNumVectorElements(vsz);
	// TODO: Could be more optimal (in case of Discard or etc.)
	for (int i = 0; i < n; ++i) {
		SimpleRegV(v[i], flags);
	}
}

void FPURegCache::SimpleRegsV(const u8 *v, MatrixSize msz, int flags) {
	const int n = GetMatrixSide(msz);
	// TODO: Could be more optimal (in case of Discard or etc.)
	for (int i = 0; i < n; ++i) {
		for (int j = 0; j < n; ++j) {
			SimpleRegV(v[j * 4 + i], flags);
		}
	}
}

void FPURegCache::SimpleRegV(const u8 v, int flags) {
	MIPSCachedFPReg &vr = vregs[v];
	// Special optimization: if it's in a single simd, we can keep it there.
	if (vr.lane == 1 && xregs[VSX(&v)].mipsRegs[1] == -1) {
		if (flags & MAP_DIRTY) {
			xregs[VSX(&v)].dirty = true;
		}
		// Just change the lane to 0.
		vr.lane = 0;
	} else if (vr.lane != 0) {
		// This will never end up in a register this way, so ignore dirty.
		if ((flags & MAP_NOINIT) == MAP_NOINIT) {
			// This will discard only this reg, and store the others.
			DiscardV(v);
		} else {
			StoreFromRegisterV(v);
		}
	} else if (vr.away) {
		// There are no immediates in the FPR reg file, so we already had this in a register. Make dirty as necessary.
		if (flags & MAP_DIRTY) {
			xregs[VX(v)].dirty = true;
		}
		_assert_msg_(vr.location.IsSimpleReg(), "not loaded and not simple.");
	}
	Invariant();
}

void FPURegCache::ReleaseSpillLock(int mipsreg) {
	regs[mipsreg].locked = 0;
}

void FPURegCache::ReleaseSpillLocks() {
	for (int i = 0; i < NUM_MIPS_FPRS; i++)
		regs[i].locked = 0;
	for (int i = TEMP0; i < TEMP0 + NUM_X86_FPU_TEMPS; ++i)
		DiscardR(i);
}
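// Maps MIPS FPR/VFPU register i into an XMM register. doLoad loads the current value
// from memory; makeDirty marks the XMM register as needing a write-back on flush.
// A register that is currently part of a SIMD group is stored out first.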
void FPURegCache::MapReg(const int i, bool doLoad, bool makeDirty) {
	pendingFlush = true;
	_assert_msg_(!regs[i].location.IsImm(), "WTF - FPURegCache::MapReg - imm");
	_assert_msg_(i >= 0 && i < NUM_MIPS_FPRS, "WTF - FPURegCache::MapReg - invalid mips reg %d", i);

	if (!regs[i].away) {
		// Reg is at home in the memory register file. Let's pull it out.
		X64Reg xr = GetFreeXReg();
		_assert_msg_(xr < NUM_X_FPREGS, "WTF - FPURegCache::MapReg - invalid reg %d", (int)xr);
		xregs[xr].mipsReg = i;
		xregs[xr].dirty = makeDirty;
		OpArg newloc = ::Gen::R(xr);
		if (doLoad) {
			emit->MOVSS(xr, regs[i].location);
		}
		regs[i].location = newloc;
		regs[i].lane = 0;
		regs[i].away = true;
	} else if (regs[i].lane != 0) {
		// Well, darn. This means we need to flush it.
		// TODO: This could be more optimal. Also check flags.
		StoreFromRegister(i);
		MapReg(i, doLoad, makeDirty);
	} else {
		// There are no immediates in the FPR reg file, so we already had this in a register. Make dirty as necessary.
		xregs[RX(i)].dirty |= makeDirty;
		_assert_msg_(regs[i].location.IsSimpleReg(), "not loaded and not simple.");
	}
	Invariant();
}
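// Returns a SHUFPS immediate that swaps the requested lane with lane 0, leaving the
// other lanes in place, so a single element can then be written out with MOVSS.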
static int MMShuffleSwapTo0(int lane) {
	if (lane == 0) {
		return _MM_SHUFFLE(3, 2, 1, 0);
	} else if (lane == 1) {
		return _MM_SHUFFLE(3, 2, 0, 1);
	} else if (lane == 2) {
		return _MM_SHUFFLE(3, 0, 1, 2);
	} else if (lane == 3) {
		return _MM_SHUFFLE(0, 2, 1, 3);
	} else {
		_assert_msg_(false, "MMShuffleSwapTo0: Invalid lane %d", lane);
		return 0;
	}
}
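// Writes a cached register back to its memory slot and unmaps it. For SIMD mappings,
// consecutive lanes whose memory locations are sequential are written with a single
// MOVQ/MOVAPS/MOVUPS; any remaining lanes are shuffled to lane 0 and stored with MOVSS.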
void FPURegCache::StoreFromRegister(int i) {
	_assert_msg_(!regs[i].location.IsImm(), "WTF - FPURegCache::StoreFromRegister - it's an imm");
	_assert_msg_(i >= 0 && i < NUM_MIPS_FPRS, "WTF - FPURegCache::StoreFromRegister - invalid mipsreg %i PC=%08x", i, js_->compilerPC);

	if (regs[i].away) {
		X64Reg xr = regs[i].location.GetSimpleReg();
		_assert_msg_(xr < NUM_X_FPREGS, "WTF - FPURegCache::StoreFromRegister - invalid reg: x %i (mr: %i). PC=%08x", (int)xr, i, js_->compilerPC);
		if (regs[i].lane != 0) {
			const int *mri = xregs[xr].mipsRegs;
			int seq = 1;
			for (int j = 1; j < 4; ++j) {
				if (mri[j] == -1) {
					break;
				}
				if (mri[j] - 32 >= 128 && mri[j - 1] - 32 >= 128 && mri[j] == mri[j - 1] + 1) {
					seq++;
				} else if (mri[j] - 32 < 128 && mri[j - 1] - 32 < 128 && voffset[mri[j] - 32] == voffset[mri[j - 1] - 32] + 1) {
					seq++;
				} else {
					break;
				}
			}

			const float *f = mri[0] - 32 < 128 ? &mips_->v[voffset[mri[0] - 32]] : &mips_->tempValues[mri[0] - 32 - 128];
			int align = (intptr_t)f & 0xf;

			// If we can do a multistore...
			if ((seq == 2 && (align & 0x7) == 0) || seq == 4) {
				OpArg newLoc = GetDefaultLocation(mri[0]);
				if (xregs[xr].dirty) {
					if (seq == 4 && align == 0)
						emit->MOVAPS(newLoc, xr);
					else if (seq == 4)
						emit->MOVUPS(newLoc, xr);
					else
						emit->MOVQ_xmm(newLoc, xr);
				}
				for (int j = 0; j < seq; ++j) {
					int mr = xregs[xr].mipsRegs[j];
					if (mr == -1) {
						continue;
					}
					OpArg newLoc = GetDefaultLocation(mr);
					regs[mr].location = newLoc;
					regs[mr].away = false;
					regs[mr].lane = 0;
					xregs[xr].mipsRegs[j] = -1;
				}
			} else {
				seq = 0;
			}
			// Store the rest.
			for (int j = seq; j < 4; ++j) {
				int mr = xregs[xr].mipsRegs[j];
				if (mr == -1) {
					continue;
				}
				if (j != 0 && xregs[xr].dirty) {
					emit->SHUFPS(xr, Gen::R(xr), MMShuffleSwapTo0(j));
				}
				OpArg newLoc = GetDefaultLocation(mr);
				if (xregs[xr].dirty) {
					emit->MOVSS(newLoc, xr);
				}
				regs[mr].location = newLoc;
				regs[mr].away = false;
				regs[mr].lane = 0;
				xregs[xr].mipsRegs[j] = -1;
			}
		} else {
			OpArg newLoc = GetDefaultLocation(i);
			xregs[xr].mipsReg = -1;
			if (xregs[xr].dirty) {
				emit->MOVSS(newLoc, xr);
			}
			regs[i].location = newLoc;
		}
		xregs[xr].dirty = false;
		regs[i].away = false;
	} else {
		// _assert_msg_(false,"already stored");
	}
	Invariant();
}

void FPURegCache::DiscardR(int i) {
	_assert_msg_(!regs[i].location.IsImm(), "FPU can't handle imm yet.");
	if (regs[i].away) {
		X64Reg xr = regs[i].location.GetSimpleReg();
		_assert_msg_(xr < NUM_X_FPREGS, "DiscardR: MipsReg had bad X64Reg");
		// Note that we DO NOT write it back here. That's the whole point of Discard.
		if (regs[i].lane != 0) {
			// But we can't just discard all of them in SIMD, just the one lane.
			// TODO: Potentially this could be more optimal (MOVQ or etc.)
			xregs[xr].mipsRegs[regs[i].lane - 1] = -1;
			regs[i].lane = 0;
			for (int j = 0; j < 4; ++j) {
				int mr = xregs[xr].mipsRegs[j];
				if (mr == -1) {
					continue;
				}
				if (j != 0 && xregs[xr].dirty) {
					emit->SHUFPS(xr, Gen::R(xr), MMShuffleSwapTo0(j));
				}

				OpArg newLoc = GetDefaultLocation(mr);
				if (xregs[xr].dirty) {
					emit->MOVSS(newLoc, xr);
				}
				regs[mr].location = newLoc;
				regs[mr].away = false;
				regs[mr].lane = 0;
				xregs[xr].mipsRegs[j] = -1;
			}
		} else {
			xregs[xr].mipsReg = -1;
		}
		xregs[xr].dirty = false;
		regs[i].location = GetDefaultLocation(i);
		regs[i].away = false;
		regs[i].tempLocked = false;
	} else {
		// _assert_msg_(false,"already stored");
		regs[i].tempLocked = false;
	}
	Invariant();
}

void FPURegCache::DiscardVS(int vreg) {
	_assert_msg_(!vregs[vreg].location.IsImm(), "FPU can't handle imm yet.");

	if (vregs[vreg].away) {
		_assert_msg_(vregs[vreg].lane != 0, "VS expects a SIMD reg.");
		X64Reg xr = vregs[vreg].location.GetSimpleReg();
		_assert_msg_(xr < NUM_X_FPREGS, "DiscardR: MipsReg had bad X64Reg");
		// Note that we DO NOT write it back here. That's the whole point of Discard.
		for (int i = 0; i < 4; ++i) {
			int mr = xregs[xr].mipsRegs[i];
			if (mr != -1) {
				regs[mr].location = GetDefaultLocation(mr);
				regs[mr].away = false;
				regs[mr].tempLocked = false;
				regs[mr].lane = 0;
			}
			xregs[xr].mipsRegs[i] = -1;
		}
		xregs[xr].dirty = false;
	} else {
		vregs[vreg].tempLocked = false;
	}
	Invariant();
}

bool FPURegCache::IsTempX(X64Reg xr) {
	return xregs[xr].mipsReg >= TEMP0;
}

int FPURegCache::GetTempR() {
	pendingFlush = true;
	for (int r = TEMP0; r < TEMP0 + NUM_X86_FPU_TEMPS; ++r) {
		if (!regs[r].away && !regs[r].tempLocked) {
			regs[r].tempLocked = true;
			return r;
		}
	}

	_assert_msg_(false, "Regcache ran out of temp regs, might need to DiscardR() some.");
	return -1;
}
int FPURegCache::GetTempVS(u8 *v, VectorSize vsz) {
	pendingFlush = true;
	const int n = GetNumVectorElements(vsz);

	// Let's collect regs as we go, but try for n free in a row.
	int found = 0;
	for (int r = TEMP0; r <= TEMP0 + NUM_X86_FPU_TEMPS - n; ++r) {
		if (regs[r].away || regs[r].tempLocked) {
			continue;
		}

		// How many free siblings does this have?
		int seq = 1;
		for (int i = 1; i < n; ++i) {
			if (regs[r + i].away || regs[r + i].tempLocked) {
				break;
			}
			++seq;
		}

		if (seq == n) {
			// Got 'em. Exactly as many as we need.
			for (int i = 0; i < n; ++i) {
				v[i] = r + i - 32;
			}
			found = n;
			break;
		}

		if (found < n) {
			v[found++] = r - 32;
		}
	}

	if (found != n) {
		_assert_msg_(false, "Regcache ran out of temp regs, might need to DiscardR() some.");
		return -1;
	}

	for (int i = 0; i < n; ++i) {
		regs[v[i] + 32].tempLocked = true;
	}

	return 0; // ??
}
void FPURegCache::Flush() {
	if (!pendingFlush) {
		return;
	}
	for (int i = 0; i < NUM_MIPS_FPRS; i++) {
		_assert_msg_(!regs[i].locked, "Somebody forgot to unlock MIPS reg %d.", i);
		if (regs[i].away) {
			if (regs[i].location.IsSimpleReg()) {
				X64Reg xr = RX(i);
				StoreFromRegister(i);
				xregs[xr].dirty = false;
			} else if (regs[i].location.IsImm()) {
				StoreFromRegister(i);
			} else {
				_assert_msg_(false, "Jit64 - Flush unhandled case, reg %i PC: %08x", i, mips_->pc);
			}
		}
	}
	pendingFlush = false;
	Invariant();
}
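// Returns the in-memory home of a register: FPRs are addressed with a small offset
// from the context register, while VFPU regs and temps use RIP-relative addressing
// when available (falling back to MIPSSTATE_VAR_ELEM32 otherwise).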
OpArg FPURegCache::GetDefaultLocation(int reg) const {
	if (reg < 32) {
		// Smaller than RIP addressing since we can use a byte offset.
		return MDisp(CTXREG, reg * 4);
	} else if (reg < 32 + 128) {
		// Here, RIP has the advantage so let's use it when possible
		if (useRip_) {
			return M(&mips_->v[voffset[reg - 32]]);  // rip accessible
		} else {
			return MIPSSTATE_VAR_ELEM32(v[0], voffset[reg - 32]);
		}
	} else {
		if (useRip_) {
			return M(&mips_->tempValues[reg - 32 - 128]);  // rip accessible
		} else {
			return MIPSSTATE_VAR_ELEM32(tempValues[0], reg - 32 - 128);
		}
	}
}

void FPURegCache::Invariant() const {
#if 0
	_assert_msg_(SanityCheck() == 0, "Sanity check failed: %d", SanityCheck());
#endif
}

static int GetMRMtx(int mr) {
	if (mr < 32)
		return -1;
	if (mr >= 128 + 32)
		return -1;
	return ((mr - 32) >> 2) & 7;
}

static int GetMRRow(int mr) {
	if (mr < 32)
		return -1;
	if (mr >= 128 + 32)
		return -1;
	return ((mr - 32) >> 0) & 3;
}

static int GetMRCol(int mr) {
	if (mr < 32)
		return -1;
	if (mr >= 128 + 32)
		return -1;
	return ((mr - 32) >> 5) & 3;
}

static bool IsMRTemp(int mr) {
	return mr >= 128 + 32;
}
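// Validates the cache's bookkeeping. Returns 0 when everything is consistent;
// each nonzero return value identifies which invariant was violated, which makes
// the failed check easy to find when Invariant() is enabled above.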
int FPURegCache::SanityCheck() const {
	for (int i = 0; i < NUM_MIPS_FPRS; i++) {
		const MIPSCachedFPReg &mr = regs[i];

		// FPR can never have imms.
		if (mr.location.IsImm())
			return 1;

		bool reallyAway = mr.location.IsSimpleReg();
		if (reallyAway != mr.away)
			return 2;

		if (mr.lane < 0 || mr.lane > 4)
			return 3;
		if (mr.lane != 0 && !reallyAway)
			return 4;

		if (mr.away) {
			Gen::X64Reg simple = mr.location.GetSimpleReg();
			if (mr.lane == 0) {
				if (xregs[simple].mipsReg != i)
					return 5;
				for (int j = 1; j < 4; ++j) {
					if (xregs[simple].mipsRegs[j] != -1)
						return 6;
				}
			} else {
				if (xregs[simple].mipsRegs[mr.lane - 1] != i)
					return 7;
			}
		}
	}

	for (int i = 0; i < NUM_X_FPREGS; ++i) {
		const X64CachedFPReg &xr = xregs[i];
		bool hasReg = xr.mipsReg != -1;
		if (!hasReg && xr.dirty)
			return 8;

		bool hasMoreRegs = hasReg;
		int mtx = -2;
		int row = -2;
		int col = -2;
		bool rowMatched = true;
		bool colMatched = true;
		for (int j = 0; j < 4; ++j) {
			if (xr.mipsRegs[j] == -1) {
				hasMoreRegs = false;
				continue;
			}
			if (xr.mipsRegs[j] >= NUM_MIPS_FPRS) {
				return 13;
			}
			// We can't have a hole in the middle / front.
			if (!hasMoreRegs)
				return 9;

			const MIPSCachedFPReg &mr = regs[xr.mipsRegs[j]];
			if (!mr.location.IsSimpleReg(X64Reg(i)))
				return 10;

			if (!IsMRTemp(xr.mipsRegs[j])) {
				if (mtx == -2)
					mtx = GetMRMtx(xr.mipsRegs[j]);
				else if (mtx != GetMRMtx(xr.mipsRegs[j]))
					return 11;

				if (row == -2)
					row = GetMRRow(xr.mipsRegs[j]);
				else if (row != GetMRRow(xr.mipsRegs[j]))
					rowMatched = false;

				if (col == -2)
					col = GetMRCol(xr.mipsRegs[j]);
				else if (col != GetMRCol(xr.mipsRegs[j]))
					colMatched = false;
			}
		}
		if (!rowMatched && !colMatched) {
			return 12;
		}
	}

	return 0;
}
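// Note that XMM0 and XMM1 never appear in the allocation order below, so they are
// never handed out by the cache; presumably they are kept free as scratch registers
// for the emitted code.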
const int *FPURegCache::GetAllocationOrder(int &count) {
	static const int allocationOrder[] = {
#if PPSSPP_ARCH(AMD64)
		XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15, XMM2, XMM3, XMM4, XMM5
#elif PPSSPP_ARCH(X86)
		XMM2, XMM3, XMM4, XMM5, XMM6, XMM7,
#endif
	};
	count = sizeof(allocationOrder) / sizeof(int);
	return allocationOrder;
}

X64Reg FPURegCache::GetFreeXReg() {
	X64Reg res;
	int obtained = GetFreeXRegs(&res, 1);

	_assert_msg_(obtained == 1, "Regcache ran out of regs");
	return res;
}

int FPURegCache::GetFreeXRegs(X64Reg *res, int n, bool spill) {
	pendingFlush = true;
	int aCount;
	const int *aOrder = GetAllocationOrder(aCount);

	_dbg_assert_msg_(n <= NUM_X_FPREGS - 2, "Cannot obtain that many regs.");

	int r = 0;

	for (int i = 0; i < aCount; i++) {
		X64Reg xr = (X64Reg)aOrder[i];
		if (xregs[xr].mipsReg == -1) {
			res[r++] = (X64Reg)xr;
			if (r >= n) {
				break;
			}
		}
	}

	if (r < n && spill) {
		// Okay, not found :(... Force grab one.
		// TODO - add a pass to grab xregs whose mipsreg is not used in the next 3 instructions.
		for (int i = 0; i < aCount; i++) {
			X64Reg xr = (X64Reg)aOrder[i];
			int preg = xregs[xr].mipsReg;
			_assert_msg_(preg >= -1 && preg < NUM_MIPS_FPRS, "WTF - FPURegCache::GetFreeXRegs - invalid mips reg %d in xr %d", preg, (int)xr);

			// We're only spilling here, so don't overlap.
			if (preg != -1 && !regs[preg].locked) {
				StoreFromRegister(preg);
				res[r++] = xr;
				if (r >= n) {
					break;
				}
			}
		}
	}

	for (int i = r; i < n; ++i) {
		res[i] = INVALID_REG;
	}
	return r;
}

void FPURegCache::FlushX(X64Reg reg) {
	if (reg >= NUM_X_FPREGS) {
		_assert_msg_(false, "Flushing non existent reg");
	} else if (xregs[reg].mipsReg != -1) {
		StoreFromRegister(xregs[reg].mipsReg);
	}
}

void FPURegCache::GetState(FPURegCacheState &state) const {
	memcpy(state.regs, regs, sizeof(regs));
	memcpy(state.xregs, xregs, sizeof(xregs));
}

void FPURegCache::RestoreState(const FPURegCacheState& state) {
	memcpy(regs, state.regs, sizeof(regs));
	memcpy(xregs, state.xregs, sizeof(xregs));
	pendingFlush = true;
}

#endif  // PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)