GitHub Repository: hrydgard/ppsspp
Path: blob/master/Core/MIPS/x86/CompVFPU.cpp
1
// Copyright (c) 2012- PPSSPP Project.
2
3
// This program is free software: you can redistribute it and/or modify
4
// it under the terms of the GNU General Public License as published by
5
// the Free Software Foundation, version 2.0 or later versions.
6
7
// This program is distributed in the hope that it will be useful,
8
// but WITHOUT ANY WARRANTY; without even the implied warranty of
9
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
// GNU General Public License 2.0 for more details.
11
12
// A copy of the GPL 2.0 should have been included with the program.
13
// If not, see http://www.gnu.org/licenses/
14
15
// Official git repository and contact information can be found at
16
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17
18
// Table 13.10 in http://agner.org/optimize/optimizing_assembly.pdf is cool - generate constants with
19
// short instruction sequences. Surprisingly many are possible.
20
21
#include "ppsspp_config.h"
22
23
#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
24
#include <cmath>
25
#include <limits>
26
#include "Common/Math/math_util.h"
27
28
#include "Common/CPUDetect.h"
29
#include "Common/Math/SIMDHeaders.h"
30
#include "Common/Log.h"
31
#include "Core/Compatibility.h"
32
#include "Core/Config.h"
33
#include "Core/MemMap.h"
34
#include "Core/Reporting.h"
35
#include "Core/System.h"
36
#include "Core/MIPS/MIPSAnalyst.h"
37
#include "Core/MIPS/MIPSCodeUtils.h"
38
#include "Core/MIPS/MIPSVFPUUtils.h"
39
#include "Core/MIPS/x86/Jit.h"
40
#include "Core/MIPS/x86/RegCache.h"
41
42
// All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly.
43
// Currently known non-working ones should have DISABLE.
44
45
// #define CONDITIONAL_DISABLE { fpr.ReleaseSpillLocks(); Comp_Generic(op); return; }
46
#define CONDITIONAL_DISABLE(flag) if (jo.Disabled(JitDisable::flag)) { Comp_Generic(op); return; }
47
#define DISABLE { fpr.ReleaseSpillLocks(); Comp_Generic(op); return; }
48
49
#define _RS MIPS_GET_RS(op)
50
#define _RT MIPS_GET_RT(op)
51
#define _RD MIPS_GET_RD(op)
52
#define _FS MIPS_GET_FS(op)
53
#define _FT MIPS_GET_FT(op)
54
#define _FD MIPS_GET_FD(op)
55
#define _SA MIPS_GET_SA(op)
56
#define _POS ((op>> 6) & 0x1F)
57
#define _SIZE ((op>>11) & 0x1F)
58
#define _IMM16 (signed short)(op & 0xFFFF)
59
#define _IMM26 (op & 0x03FFFFFF)
60
61
namespace MIPSComp
62
{
63
using namespace Gen;
64
using namespace X64JitConstants;
65
66
static const float one = 1.0f;
67
static const float minus_one = -1.0f;
68
69
alignas(16) const u32 noSignMask[4] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
70
alignas(16) const u32 signBitAll[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
71
alignas(16) const u32 signBitLower[4] = {0x80000000, 0, 0, 0};
72
alignas(16) const float oneOneOneOne[4] = {1.0f, 1.0f, 1.0f, 1.0f};
73
alignas(16) const u32 fourinfnan[4] = {0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000};
74
alignas(16) const float identityMatrix[4][4] = { { 1.0f, 0, 0, 0 }, { 0, 1.0f, 0, 0 }, { 0, 0, 1.0f, 0 }, { 0, 0, 0, 1.0f} };
75
76
void Jit::Comp_VPFX(MIPSOpcode op)
77
{
78
CONDITIONAL_DISABLE(VFPU_XFER);
79
int data = op & 0xFFFFF;
80
int regnum = (op >> 24) & 3;
81
switch (regnum) {
82
case 0: // S
83
js.prefixS = data;
84
js.prefixSFlag = JitState::PREFIX_KNOWN_DIRTY;
85
break;
86
case 1: // T
87
js.prefixT = data;
88
js.prefixTFlag = JitState::PREFIX_KNOWN_DIRTY;
89
break;
90
case 2: // D
91
js.prefixD = data & 0x00000FFF;
92
js.prefixDFlag = JitState::PREFIX_KNOWN_DIRTY;
93
break;
94
}
95
}
96
97
void Jit::ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz) {
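// S/T prefix bit layout, as decoded below: bits 0-7 select the source lane (2 bits per lane), bits 8-11
// apply abs(), bits 12-15 substitute a constant, bits 16-19 negate. 0xE4 (binary 11'10'01'00) is the
// identity swizzle with no modifiers, so there's nothing to do.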
98
if (prefix == 0xE4) return;
99
100
int n = GetNumVectorElements(sz);
101
u8 origV[4];
102
static const float constantArray[8] = {0.f, 1.f, 2.f, 0.5f, 3.f, 1.f/3.f, 0.25f, 1.f/6.f};
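// When a lane's "constants" bit is set, regnum + (abs << 2) below indexes this table: {0, 1, 2, 1/2} for
// abs == 0, or {3, 1/3, 1/4, 1/6} when the abs bit doubles as the high selector bit.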
103
104
for (int i = 0; i < n; i++)
105
origV[i] = vregs[i];
106
107
for (int i = 0; i < n; i++) {
108
int regnum = (prefix >> (i*2)) & 3;
109
int abs = (prefix >> (8+i)) & 1;
110
int negate = (prefix >> (16+i)) & 1;
111
int constants = (prefix >> (12+i)) & 1;
112
113
// Unchanged, hurray.
114
if (!constants && regnum == i && !abs && !negate)
115
continue;
116
117
// This puts the value into a temp reg, so we won't write the modified value back.
118
vregs[i] = fpr.GetTempV();
119
fpr.MapRegV(vregs[i], MAP_NOINIT | MAP_DIRTY);
120
121
if (!constants) {
122
// Prefix may say "z, z, z, z" but if this is a pair, we force to x.
123
// TODO: But some ops seem to use const 0 instead?
124
if (regnum >= n) {
125
ERROR_LOG_REPORT(Log::CPU, "Invalid VFPU swizzle: %08x / %d", prefix, sz);
126
regnum = 0;
127
}
128
fpr.SimpleRegV(origV[regnum], 0);
129
MOVSS(fpr.VX(vregs[i]), fpr.V(origV[regnum]));
130
if (abs) {
131
if (RipAccessible(&noSignMask)) {
132
ANDPS(fpr.VX(vregs[i]), M(&noSignMask)); // rip accessible
133
} else {
134
MOV(PTRBITS, R(TEMPREG), ImmPtr(&noSignMask));
135
ANDPS(fpr.VX(vregs[i]), MatR(TEMPREG));
136
}
137
}
138
} else {
139
if (RipAccessible(constantArray)) {
140
MOVSS(fpr.VX(vregs[i]), M(&constantArray[regnum + (abs << 2)])); // rip accessible
141
} else {
142
MOV(PTRBITS, R(TEMPREG), ImmPtr(&constantArray[regnum + (abs << 2)]));
143
MOVSS(fpr.VX(vregs[i]), MatR(TEMPREG));
144
}
145
}
146
147
if (negate) {
148
if (RipAccessible(&signBitLower)) {
149
XORPS(fpr.VX(vregs[i]), M(&signBitLower)); // rip accessible
150
} else {
151
MOV(PTRBITS, R(TEMPREG), ImmPtr(&signBitLower));
152
XORPS(fpr.VX(vregs[i]), MatR(TEMPREG));
153
}
154
}
155
// TODO: This probably means it will swap out soon, inefficiently...
156
fpr.ReleaseSpillLockV(vregs[i]);
157
}
158
}
159
160
void Jit::GetVectorRegsPrefixD(u8 *regs, VectorSize sz, int vectorReg) {
161
_assert_(js.prefixDFlag & JitState::PREFIX_KNOWN);
162
163
GetVectorRegs(regs, sz, vectorReg);
164
if (js.prefixD == 0)
165
return;
166
167
int n = GetNumVectorElements(sz);
168
for (int i = 0; i < n; i++) {
169
// Hopefully this is rare, we'll just write it into a reg we drop.
170
if (js.VfpuWriteMask(i))
171
regs[i] = fpr.GetTempV();
172
}
173
}
174
175
void Jit::ApplyPrefixD(const u8 *vregs, VectorSize sz) {
176
_assert_(js.prefixDFlag & JitState::PREFIX_KNOWN);
177
if (!js.prefixD) return;
178
179
int n = GetNumVectorElements(sz);
180
for (int i = 0; i < n; i++) {
181
if (js.VfpuWriteMask(i))
182
continue;
183
184
int sat = (js.prefixD >> (i * 2)) & 3;
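// Per-lane saturation mode from the D prefix: 1 clamps to [0.0, 1.0], 3 clamps to [-1.0, 1.0]; any other
// value leaves the lane untouched.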
185
if (sat == 1) {
186
fpr.MapRegV(vregs[i], MAP_DIRTY);
187
188
// Zero out XMM0 if it was <= +0.0f (but skip NAN.)
189
MOVSS(R(XMM0), fpr.VX(vregs[i]));
190
XORPS(XMM1, R(XMM1));
191
CMPLESS(XMM0, R(XMM1));
192
ANDNPS(XMM0, fpr.V(vregs[i]));
193
194
// Retain a NAN in XMM0 (must be second operand.)
195
MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
196
MOVSS(fpr.VX(vregs[i]), MatR(TEMPREG));
197
MINSS(fpr.VX(vregs[i]), R(XMM0));
198
} else if (sat == 3) {
199
fpr.MapRegV(vregs[i], MAP_DIRTY);
200
201
// Check for < -1.0f, but careful of NANs.
202
MOV(PTRBITS, R(TEMPREG), ImmPtr(&minus_one));
203
MOVSS(XMM1, MatR(TEMPREG));
204
MOVSS(R(XMM0), fpr.VX(vregs[i]));
205
CMPLESS(XMM0, R(XMM1));
206
// If it was NOT less, the three ops below do nothing.
207
// Otherwise, they replace the value with -1.0f.
208
ANDPS(XMM1, R(XMM0));
209
ANDNPS(XMM0, fpr.V(vregs[i]));
210
ORPS(XMM0, R(XMM1));
211
212
// Retain a NAN in XMM0 (must be second operand.)
213
MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
214
MOVSS(fpr.VX(vregs[i]), MatR(TEMPREG));
215
MINSS(fpr.VX(vregs[i]), R(XMM0));
216
}
217
}
218
}
219
220
// Vector regs can overlap in all sorts of swizzled ways.
221
// This does allow a single overlap in sregs[i].
222
bool IsOverlapSafeAllowS(int dreg, int di, int sn, const u8 sregs[], int tn = 0, const u8 tregs[] = NULL) {
223
for (int i = 0; i < sn; ++i) {
224
if (sregs[i] == dreg && i != di)
225
return false;
226
}
227
for (int i = 0; i < tn; ++i) {
228
if (tregs[i] == dreg)
229
return false;
230
}
231
232
// Hurray, no overlap, we can write directly.
233
return true;
234
}
235
236
bool IsOverlapSafe(int dreg, int di, int sn, const u8 sregs[], int tn = 0, const u8 tregs[] = NULL) {
237
return IsOverlapSafeAllowS(dreg, di, sn, sregs, tn, tregs) && sregs[di] != dreg;
238
}
239
240
void Jit::Comp_SV(MIPSOpcode op) {
241
CONDITIONAL_DISABLE(LSU_VFPU);
242
243
s32 imm = (signed short)(op&0xFFFC);
244
int vt = ((op >> 16) & 0x1f) | ((op & 3) << 5);
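// The 7-bit VFPU register index is split across the encoding: bits 16-20 hold the low five bits and the
// low two bits of the opcode supply the top two.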
245
MIPSGPReg rs = _RS;
246
247
CheckMemoryBreakpoint(0, rs, imm);
248
249
switch (op >> 26) {
250
case 50: //lv.s // VI(vt) = Memory::Read_U32(addr);
251
{
252
gpr.Lock(rs);
253
fpr.MapRegV(vt, MAP_DIRTY | MAP_NOINIT);
254
255
JitSafeMem safe(this, rs, imm);
256
OpArg src;
257
if (safe.PrepareRead(src, 4)) {
258
MOVSS(fpr.VX(vt), safe.NextFastAddress(0));
259
}
260
if (safe.PrepareSlowRead(safeMemFuncs.readU32)) {
261
MOVD_xmm(fpr.VX(vt), R(EAX));
262
}
263
safe.Finish();
264
265
gpr.UnlockAll();
266
fpr.ReleaseSpillLocks();
267
}
268
break;
269
270
case 58: //sv.s // Memory::Write_U32(VI(vt), addr);
271
{
272
gpr.Lock(rs);
273
274
fpr.MapRegV(vt, 0);
275
276
JitSafeMem safe(this, rs, imm);
277
OpArg dest;
278
if (safe.PrepareWrite(dest, 4)) {
279
MOVSS(safe.NextFastAddress(0), fpr.VX(vt));
280
}
281
if (safe.PrepareSlowWrite()) {
282
MOVSS(MIPSSTATE_VAR(temp), fpr.VX(vt));
283
safe.DoSlowWrite(safeMemFuncs.writeU32, MIPSSTATE_VAR(temp), 0);
284
}
285
safe.Finish();
286
287
fpr.ReleaseSpillLocks();
288
gpr.UnlockAll();
289
}
290
break;
291
292
default:
293
DISABLE;
294
}
295
}
296
297
void Jit::Comp_SVQ(MIPSOpcode op) {
298
CONDITIONAL_DISABLE(LSU_VFPU);
299
300
int imm = (signed short)(op&0xFFFC);
301
int vt = (((op >> 16) & 0x1f)) | ((op&1) << 5);
302
MIPSGPReg rs = _RS;
303
304
CheckMemoryBreakpoint(0, rs, imm);
305
306
switch (op >> 26) {
307
case 53: //lvl.q/lvr.q
308
{
309
if (!g_Config.bFastMemory) {
310
DISABLE;
311
}
312
DISABLE; // The code below isn't quite working, so we fall back to interpreter for now.
313
314
gpr.MapReg(rs, true, false);
315
gpr.FlushLockX(ECX);
316
u8 vregs[4];
317
GetVectorRegs(vregs, V_Quad, vt);
318
MOV(32, R(EAX), gpr.R(rs));
319
ADD(32, R(EAX), Imm32(imm));
320
#ifdef MASKED_PSP_MEMORY
321
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
322
#endif
323
MOV(32, R(ECX), R(EAX));
324
SHR(32, R(EAX), Imm8(2));
325
AND(32, R(EAX), Imm32(0x3));
326
CMP(32, R(EAX), Imm32(0));
327
FixupBranch next = J_CC(CC_NE);
328
329
auto PSPMemAddr = [](X64Reg scaled, int offset) {
330
#if PPSSPP_ARCH(X86)
331
return MDisp(scaled, (u32)Memory::base + offset);
332
#else
333
return MComplex(MEMBASEREG, scaled, 1, offset);
334
#endif
335
};
336
337
fpr.MapRegsV(vregs, V_Quad, MAP_DIRTY);
338
339
// Offset = 0
340
MOVSS(fpr.RX(vregs[3]), PSPMemAddr(EAX, 0));
341
342
FixupBranch skip0 = J();
343
SetJumpTarget(next);
344
CMP(32, R(EAX), Imm32(1));
345
next = J_CC(CC_NE);
346
347
// Offset = 1
348
MOVSS(fpr.RX(vregs[3]), PSPMemAddr(EAX, 4));
349
MOVSS(fpr.RX(vregs[2]), PSPMemAddr(EAX, 0));
350
351
FixupBranch skip1 = J();
352
SetJumpTarget(next);
353
CMP(32, R(EAX), Imm32(2));
354
next = J_CC(CC_NE);
355
356
// Offset = 2
357
MOVSS(fpr.RX(vregs[3]), PSPMemAddr(EAX, 8));
358
MOVSS(fpr.RX(vregs[2]), PSPMemAddr(EAX, 4));
359
MOVSS(fpr.RX(vregs[1]), PSPMemAddr(EAX, 0));
360
361
FixupBranch skip2 = J();
362
SetJumpTarget(next);
363
CMP(32, R(EAX), Imm32(3));
364
next = J_CC(CC_NE);
365
366
// Offset = 3
367
MOVSS(fpr.RX(vregs[3]), PSPMemAddr(EAX, 12));
368
MOVSS(fpr.RX(vregs[2]), PSPMemAddr(EAX, 8));
369
MOVSS(fpr.RX(vregs[1]), PSPMemAddr(EAX, 4));
370
MOVSS(fpr.RX(vregs[0]), PSPMemAddr(EAX, 0));
371
372
SetJumpTarget(next);
373
SetJumpTarget(skip0);
374
SetJumpTarget(skip1);
375
SetJumpTarget(skip2);
376
377
gpr.UnlockAll();
378
fpr.ReleaseSpillLocks();
379
}
380
break;
381
382
case 54: //lv.q
383
{
384
gpr.Lock(rs);
385
// This must be in a reg or an immediate.
386
// Otherwise, it'll get put in EAX and we'll clobber that during NextSlowRead().
387
if (!gpr.IsImm(rs))
388
gpr.MapReg(rs, true, false);
389
390
u8 vregs[4];
391
GetVectorRegs(vregs, V_Quad, vt);
392
393
if (fpr.TryMapRegsVS(vregs, V_Quad, MAP_NOINIT | MAP_DIRTY)) {
394
JitSafeMem safe(this, rs, imm);
395
OpArg src;
396
if (safe.PrepareRead(src, 16)) {
397
// Should be safe, since lv.q must be aligned, but let's try to avoid crashing in safe mode.
398
if (g_Config.bFastMemory) {
399
MOVAPS(fpr.VSX(vregs), safe.NextFastAddress(0));
400
} else {
401
MOVUPS(fpr.VSX(vregs), safe.NextFastAddress(0));
402
}
403
}
404
if (safe.PrepareSlowRead(safeMemFuncs.readU32)) {
405
for (int i = 0; i < 4; i++) {
406
safe.NextSlowRead(safeMemFuncs.readU32, i * 4);
407
// We use XMM0 as a temporary since MOVSS and MOVD would clear the higher bits.
408
MOVD_xmm(XMM0, R(EAX));
409
MOVSS(fpr.VSX(vregs), R(XMM0));
410
// Rotate things so we can read in the next higher float.
411
// By the end (4 rotates), they'll all be back into place.
412
SHUFPS(fpr.VSX(vregs), fpr.VS(vregs), _MM_SHUFFLE(0, 3, 2, 1));
413
}
414
}
415
safe.Finish();
416
gpr.UnlockAll();
417
fpr.ReleaseSpillLocks();
418
return;
419
}
420
421
fpr.MapRegsV(vregs, V_Quad, MAP_DIRTY | MAP_NOINIT);
422
423
JitSafeMem safe(this, rs, imm);
424
OpArg src;
425
if (safe.PrepareRead(src, 16)) {
426
// Just copy 4 words the easiest way while not wasting registers.
427
for (int i = 0; i < 4; i++)
428
MOVSS(fpr.VX(vregs[i]), safe.NextFastAddress(i * 4));
429
}
430
if (safe.PrepareSlowRead(safeMemFuncs.readU32)) {
431
for (int i = 0; i < 4; i++) {
432
safe.NextSlowRead(safeMemFuncs.readU32, i * 4);
433
MOVD_xmm(fpr.VX(vregs[i]), R(EAX));
434
}
435
}
436
safe.Finish();
437
438
gpr.UnlockAll();
439
fpr.ReleaseSpillLocks();
440
}
441
break;
442
443
case 62: //sv.q
444
{
445
gpr.Lock(rs);
446
// This must be in a reg or an immediate.
447
// Otherwise, it'll get put in EAX and we'll clobber that during NextSlowRead().
448
if (!gpr.IsImm(rs))
449
gpr.MapReg(rs, true, false);
450
451
u8 vregs[4];
452
GetVectorRegs(vregs, V_Quad, vt);
453
454
if (fpr.TryMapRegsVS(vregs, V_Quad, 0)) {
455
JitSafeMem safe(this, rs, imm);
456
OpArg dest;
457
if (safe.PrepareWrite(dest, 16)) {
458
// Should be safe, since sv.q must be aligned, but let's try to avoid crashing in safe mode.
459
if (g_Config.bFastMemory) {
460
MOVAPS(safe.NextFastAddress(0), fpr.VSX(vregs));
461
} else {
462
MOVUPS(safe.NextFastAddress(0), fpr.VSX(vregs));
463
}
464
}
465
if (safe.PrepareSlowWrite()) {
466
MOVAPS(XMM0, fpr.VS(vregs));
467
for (int i = 0; i < 4; i++) {
468
MOVSS(MIPSSTATE_VAR(temp), XMM0);
469
SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 3, 2, 1));
470
safe.DoSlowWrite(safeMemFuncs.writeU32, MIPSSTATE_VAR(temp), i * 4);
471
}
472
}
473
safe.Finish();
474
gpr.UnlockAll();
475
fpr.ReleaseSpillLocks();
476
return;
477
}
478
479
// Even if we don't use real SIMD, there are still 8 or 16 scalar float registers.
480
fpr.MapRegsV(vregs, V_Quad, 0);
481
482
JitSafeMem safe(this, rs, imm);
483
OpArg dest;
484
if (safe.PrepareWrite(dest, 16)) {
485
for (int i = 0; i < 4; i++)
486
MOVSS(safe.NextFastAddress(i * 4), fpr.VX(vregs[i]));
487
}
488
if (safe.PrepareSlowWrite()) {
489
for (int i = 0; i < 4; i++) {
490
MOVSS(MIPSSTATE_VAR(temp), fpr.VX(vregs[i]));
491
safe.DoSlowWrite(safeMemFuncs.writeU32, MIPSSTATE_VAR(temp), i * 4);
492
}
493
}
494
safe.Finish();
495
496
gpr.UnlockAll();
497
fpr.ReleaseSpillLocks();
498
}
499
break;
500
501
default:
502
DISABLE;
503
break;
504
}
505
}
506
507
void Jit::Comp_VVectorInit(MIPSOpcode op) {
508
CONDITIONAL_DISABLE(VFPU_XFER);
509
510
if (js.HasUnknownPrefix())
511
DISABLE;
512
513
VectorSize sz = GetVecSize(op);
514
int type = (op >> 16) & 0xF;
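// type 6 = vzero (fill the destination with 0.0f), type 7 = vone (fill with 1.0f); anything else falls back
// to the interpreter via DISABLE.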
515
u8 dregs[4];
516
GetVectorRegsPrefixD(dregs, sz, _VD);
517
518
if (fpr.TryMapRegsVS(dregs, sz, MAP_NOINIT | MAP_DIRTY)) {
519
if (type == 6) {
520
XORPS(fpr.VSX(dregs), fpr.VS(dregs));
521
} else if (type == 7) {
522
if (RipAccessible(&oneOneOneOne)) {
523
MOVAPS(fpr.VSX(dregs), M(&oneOneOneOne)); // rip accessible
524
} else {
525
MOV(PTRBITS, R(TEMPREG), ImmPtr(&oneOneOneOne));
526
MOVAPS(fpr.VSX(dregs), MatR(TEMPREG));
527
}
528
} else {
529
DISABLE;
530
}
531
ApplyPrefixD(dregs, sz);
532
fpr.ReleaseSpillLocks();
533
return;
534
}
535
536
switch (type) {
537
case 6: // v=zeros; break; //vzero
538
XORPS(XMM0, R(XMM0));
539
break;
540
case 7: // v=ones; break; //vone
541
if (RipAccessible(&one)) {
542
MOVSS(XMM0, M(&one)); // rip accessible
543
} else {
544
MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
545
MOVSS(XMM0, MatR(TEMPREG));
546
}
547
break;
548
default:
549
DISABLE;
550
break;
551
}
552
553
int n = GetNumVectorElements(sz);
554
fpr.MapRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
555
for (int i = 0; i < n; ++i)
556
MOVSS(fpr.VX(dregs[i]), R(XMM0));
557
ApplyPrefixD(dregs, sz);
558
559
fpr.ReleaseSpillLocks();
560
}
561
562
void Jit::Comp_VIdt(MIPSOpcode op) {
563
CONDITIONAL_DISABLE(VFPU_XFER);
564
if (js.HasUnknownPrefix())
565
DISABLE;
566
567
int vd = _VD;
568
VectorSize sz = GetVecSize(op);
569
int n = GetNumVectorElements(sz);
570
571
u8 dregs[4];
572
GetVectorRegsPrefixD(dregs, sz, _VD);
573
if (fpr.TryMapRegsVS(dregs, sz, MAP_NOINIT | MAP_DIRTY)) {
574
int row = vd & (n - 1);
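// vidt loads one row of the identity matrix: the low bits of vd select which lane receives the 1.0f.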
575
if (RipAccessible(identityMatrix)) {
576
MOVAPS(fpr.VSX(dregs), M(identityMatrix[row])); // rip accessible
577
} else {
578
MOV(PTRBITS, R(TEMPREG), ImmPtr(&identityMatrix[row]));
579
MOVAPS(fpr.VSX(dregs), MatR(TEMPREG));
580
}
581
ApplyPrefixD(dregs, sz);
582
fpr.ReleaseSpillLocks();
583
return;
584
}
585
586
XORPS(XMM0, R(XMM0));
587
if (RipAccessible(&one)) {
588
MOVSS(XMM1, M(&one)); // rip accessible
589
} else {
590
MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
591
MOVSS(XMM1, MatR(TEMPREG));
592
}
593
fpr.MapRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
594
switch (sz) {
595
case V_Pair:
596
MOVSS(fpr.VX(dregs[0]), R((vd&1)==0 ? XMM1 : XMM0));
597
MOVSS(fpr.VX(dregs[1]), R((vd&1)==1 ? XMM1 : XMM0));
598
break;
599
case V_Quad:
600
MOVSS(fpr.VX(dregs[0]), R((vd&3)==0 ? XMM1 : XMM0));
601
MOVSS(fpr.VX(dregs[1]), R((vd&3)==1 ? XMM1 : XMM0));
602
MOVSS(fpr.VX(dregs[2]), R((vd&3)==2 ? XMM1 : XMM0));
603
MOVSS(fpr.VX(dregs[3]), R((vd&3)==3 ? XMM1 : XMM0));
604
break;
605
default:
606
_dbg_assert_msg_(false, "Trying to interpret instruction that can't be interpreted");
607
break;
608
}
609
ApplyPrefixD(dregs, sz);
610
fpr.ReleaseSpillLocks();
611
}
612
613
void Jit::Comp_VDot(MIPSOpcode op) {
614
CONDITIONAL_DISABLE(VFPU_VEC);
615
616
if (js.HasUnknownPrefix())
617
DISABLE;
618
619
VectorSize sz = GetVecSize(op);
620
int n = GetNumVectorElements(sz);
621
622
// TODO: Force read one of them into regs? probably not.
623
u8 sregs[4], tregs[4], dregs[1];
624
GetVectorRegsPrefixS(sregs, sz, _VS);
625
GetVectorRegsPrefixT(tregs, sz, _VT);
626
GetVectorRegsPrefixD(dregs, V_Single, _VD);
627
628
// With SSE2, these won't really give any performance benefit on their own, but may reduce
629
// conversion costs from/to SIMD form. However, the SSE4.1 DPPS may be worth it.
630
// Benchmarking will have to decide whether to enable this on < SSE4.1. Also a HADDPS version
631
// for SSE3 could be written.
632
if (fpr.TryMapDirtyInInVS(dregs, V_Single, sregs, sz, tregs, sz)) {
633
switch (sz) {
634
case V_Pair:
635
if (cpu_info.bSSE4_1) {
636
if (fpr.VSX(dregs) != fpr.VSX(sregs) && fpr.VSX(dregs) != fpr.VSX(tregs)) {
637
MOVAPS(fpr.VSX(dregs), fpr.VS(sregs));
638
DPPS(fpr.VSX(dregs), fpr.VS(tregs), 0x31);
639
} else {
640
MOVAPS(XMM0, fpr.VS(sregs));
641
DPPS(XMM0, fpr.VS(tregs), 0x31);
642
MOVAPS(fpr.VSX(dregs), R(XMM0));
643
}
644
} else {
645
MOVAPS(XMM0, fpr.VS(sregs));
646
MULPS(XMM0, fpr.VS(tregs));
647
MOVAPS(R(XMM1), XMM0);
648
SHUFPS(XMM1, R(XMM0), _MM_SHUFFLE(1, 1, 1, 1));
649
ADDPS(XMM1, R(XMM0));
650
MOVAPS(fpr.VS(dregs), XMM1);
651
}
652
break;
653
case V_Triple:
654
if (cpu_info.bSSE4_1) {
655
if (fpr.VSX(dregs) != fpr.VSX(sregs) && fpr.VSX(dregs) != fpr.VSX(tregs)) {
656
MOVAPS(fpr.VSX(dregs), fpr.VS(sregs));
657
DPPS(fpr.VSX(dregs), fpr.VS(tregs), 0x71);
658
} else {
659
MOVAPS(XMM0, fpr.VS(sregs));
660
DPPS(XMM0, fpr.VS(tregs), 0x71);
661
MOVAPS(fpr.VSX(dregs), R(XMM0));
662
}
663
} else {
664
MOVAPS(XMM0, fpr.VS(sregs));
665
MULPS(XMM0, fpr.VS(tregs));
666
MOVAPS(R(XMM1), XMM0);
667
SHUFPS(XMM1, R(XMM0), _MM_SHUFFLE(3, 2, 1, 1));
668
ADDSS(XMM1, R(XMM0));
669
SHUFPS(XMM0, R(XMM1), _MM_SHUFFLE(3, 2, 2, 2));
670
ADDSS(XMM1, R(XMM0));
671
MOVAPS(fpr.VS(dregs), XMM1);
672
}
673
break;
674
case V_Quad:
675
if (cpu_info.bSSE4_1) {
676
if (fpr.VSX(dregs) != fpr.VSX(sregs) && fpr.VSX(dregs) != fpr.VSX(tregs)) {
677
MOVAPS(fpr.VSX(dregs), fpr.VS(sregs));
678
DPPS(fpr.VSX(dregs), fpr.VS(tregs), 0xF1);
679
} else {
680
MOVAPS(XMM0, fpr.VS(sregs));
681
DPPS(XMM0, fpr.VS(tregs), 0xF1);
682
MOVAPS(fpr.VSX(dregs), R(XMM0));
683
}
684
} /* else if (cpu_info.bSSE3) { // This is slower than the SSE2 solution on my Ivy!
685
MOVAPS(XMM0, fpr.VS(sregs));
686
MOVAPS(XMM1, fpr.VS(tregs));
687
HADDPS(XMM0, R(XMM1));
688
HADDPS(XMM0, R(XMM0));
689
MOVAPS(fpr.VSX(dregs), R(XMM0));
690
} */ else {
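// SSE2 fallback for the quad dot product: multiply lane-wise, add the pair-swapped copy, then add the
// lane-reversed copy; the full sum ends up in lane 0.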
691
MOVAPS(XMM0, fpr.VS(sregs));
692
MOVAPS(XMM1, fpr.VS(tregs));
693
MULPS(XMM0, R(XMM1));
694
MOVAPS(XMM1, R(XMM0));
695
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(2, 3, 0, 1));
696
ADDPS(XMM0, R(XMM1));
697
MOVAPS(XMM1, R(XMM0));
698
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 1, 2, 3));
699
ADDSS(XMM0, R(XMM1));
700
MOVAPS(fpr.VSX(dregs), R(XMM0));
701
}
702
break;
703
default:
704
DISABLE;
705
}
706
ApplyPrefixD(dregs, V_Single);
707
fpr.ReleaseSpillLocks();
708
return;
709
}
710
711
// Flush SIMD.
712
fpr.SimpleRegsV(sregs, sz, 0);
713
fpr.SimpleRegsV(tregs, sz, 0);
714
fpr.SimpleRegsV(dregs, V_Single, MAP_DIRTY | MAP_NOINIT);
715
716
X64Reg tempxreg = XMM0;
717
if (IsOverlapSafe(dregs[0], 0, n, sregs, n, tregs)) {
718
fpr.MapRegsV(dregs, V_Single, MAP_DIRTY | MAP_NOINIT);
719
tempxreg = fpr.VX(dregs[0]);
720
}
721
722
// Need to start with +0.0f so it doesn't result in -0.0f.
723
MOVSS(tempxreg, fpr.V(sregs[0]));
724
MULSS(tempxreg, fpr.V(tregs[0]));
725
for (int i = 1; i < n; i++)
726
{
727
// sum += s[i]*t[i];
728
MOVSS(XMM1, fpr.V(sregs[i]));
729
MULSS(XMM1, fpr.V(tregs[i]));
730
ADDSS(tempxreg, R(XMM1));
731
}
732
733
if (!fpr.V(dregs[0]).IsSimpleReg(tempxreg)) {
734
fpr.MapRegsV(dregs, V_Single, MAP_DIRTY | MAP_NOINIT);
735
MOVSS(fpr.V(dregs[0]), tempxreg);
736
}
737
738
ApplyPrefixD(dregs, V_Single);
739
740
fpr.ReleaseSpillLocks();
741
}
742
743
744
void Jit::Comp_VHdp(MIPSOpcode op) {
745
CONDITIONAL_DISABLE(VFPU_VEC);
746
747
if (js.HasUnknownPrefix())
748
DISABLE;
749
750
VectorSize sz = GetVecSize(op);
751
int n = GetNumVectorElements(sz);
752
753
u8 sregs[4], tregs[4], dregs[1];
754
GetVectorRegsPrefixS(sregs, sz, _VS);
755
GetVectorRegsPrefixT(tregs, sz, _VT);
756
GetVectorRegsPrefixD(dregs, V_Single, _VD);
757
758
// Flush SIMD.
759
fpr.SimpleRegsV(sregs, sz, 0);
760
fpr.SimpleRegsV(tregs, sz, 0);
761
fpr.SimpleRegsV(dregs, V_Single, MAP_DIRTY | MAP_NOINIT);
762
763
X64Reg tempxreg = XMM0;
764
if (IsOverlapSafe(dregs[0], 0, n, sregs, n, tregs)) {
765
fpr.MapRegsV(dregs, V_Single, MAP_DIRTY | MAP_NOINIT);
766
tempxreg = fpr.VX(dregs[0]);
767
}
768
769
// Need to start with +0.0f so it doesn't result in -0.0f.
770
MOVSS(tempxreg, fpr.V(sregs[0]));
771
MULSS(tempxreg, fpr.V(tregs[0]));
772
for (int i = 1; i < n; i++) {
773
// sum += (i == n-1) ? t[i] : s[i]*t[i];
774
if (i == n - 1) {
775
ADDSS(tempxreg, fpr.V(tregs[i]));
776
} else {
777
MOVSS(XMM1, fpr.V(sregs[i]));
778
MULSS(XMM1, fpr.V(tregs[i]));
779
ADDSS(tempxreg, R(XMM1));
780
}
781
}
782
783
if (!fpr.V(dregs[0]).IsSimpleReg(tempxreg)) {
784
fpr.MapRegsV(dregs, V_Single, MAP_DIRTY | MAP_NOINIT);
785
MOVSS(fpr.V(dregs[0]), tempxreg);
786
}
787
788
ApplyPrefixD(dregs, V_Single);
789
790
fpr.ReleaseSpillLocks();
791
}
792
793
void Jit::Comp_VCrossQuat(MIPSOpcode op) {
794
CONDITIONAL_DISABLE(VFPU_VEC);
795
796
if (js.HasUnknownPrefix())
797
DISABLE;
798
799
VectorSize sz = GetVecSize(op);
800
801
u8 sregs[4], tregs[4], dregs[4];
802
GetVectorRegs(sregs, sz, _VS);
803
GetVectorRegs(tregs, sz, _VT);
804
GetVectorRegs(dregs, sz, _VD);
805
806
if (sz == V_Triple) {
807
// Cross product vcrsp.t
808
if (fpr.TryMapDirtyInInVS(dregs, sz, sregs, sz, tregs, sz)) {
809
MOVAPS(XMM0, fpr.VS(tregs));
810
MOVAPS(XMM1, fpr.VS(sregs));
811
SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 0, 2, 1));
812
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3, 0, 2, 1));
813
MULPS(XMM0, fpr.VS(sregs));
814
MULPS(XMM1, fpr.VS(tregs));
815
SUBPS(XMM0, R(XMM1));
816
SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 0, 2, 1));
817
MOVAPS(fpr.VS(dregs), XMM0);
818
fpr.ReleaseSpillLocks();
819
return;
820
}
821
822
// Flush SIMD.
823
fpr.SimpleRegsV(sregs, sz, 0);
824
fpr.SimpleRegsV(tregs, sz, 0);
825
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
826
827
fpr.MapRegsV(sregs, sz, 0);
828
829
// Compute X
830
MOVSS(XMM0, fpr.V(sregs[1]));
831
MULSS(XMM0, fpr.V(tregs[2]));
832
MOVSS(XMM1, fpr.V(sregs[2]));
833
MULSS(XMM1, fpr.V(tregs[1]));
834
SUBSS(XMM0, R(XMM1));
835
MOVSS(fpr.V(dregs[0]), XMM0);
836
837
// Compute Y
838
MOVSS(XMM0, fpr.V(sregs[2]));
839
MULSS(XMM0, fpr.V(tregs[0]));
840
MOVSS(XMM1, fpr.V(sregs[0]));
841
MULSS(XMM1, fpr.V(tregs[2]));
842
SUBSS(XMM0, R(XMM1));
843
MOVSS(fpr.V(dregs[1]), XMM0);
844
845
// Compute Z
846
MOVSS(XMM0, fpr.V(sregs[0]));
847
MULSS(XMM0, fpr.V(tregs[1]));
848
MOVSS(XMM1, fpr.V(sregs[1]));
849
MULSS(XMM1, fpr.V(tregs[0]));
850
SUBSS(XMM0, R(XMM1));
851
MOVSS(fpr.V(dregs[2]), XMM0);
852
} else if (sz == V_Quad) {
853
// Flush SIMD.
854
fpr.SimpleRegsV(sregs, sz, 0);
855
fpr.SimpleRegsV(tregs, sz, 0);
856
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
857
858
// Quaternion product vqmul.q
859
fpr.MapRegsV(sregs, sz, 0);
860
861
// Compute X
862
// d[0] = s[0] * t[3] + s[1] * t[2] - s[2] * t[1] + s[3] * t[0];
863
MOVSS(XMM0, fpr.V(sregs[0]));
864
MULSS(XMM0, fpr.V(tregs[3]));
865
MOVSS(XMM1, fpr.V(sregs[1]));
866
MULSS(XMM1, fpr.V(tregs[2]));
867
ADDSS(XMM0, R(XMM1));
868
MOVSS(XMM1, fpr.V(sregs[2]));
869
MULSS(XMM1, fpr.V(tregs[1]));
870
SUBSS(XMM0, R(XMM1));
871
MOVSS(XMM1, fpr.V(sregs[3]));
872
MULSS(XMM1, fpr.V(tregs[0]));
873
ADDSS(XMM0, R(XMM1));
874
MOVSS(fpr.V(dregs[0]), XMM0);
875
876
// Compute Y
877
//d[1] = s[1] * t[3] + s[2] * t[0] + s[3] * t[1] - s[0] * t[2];
878
MOVSS(XMM0, fpr.V(sregs[1]));
879
MULSS(XMM0, fpr.V(tregs[3]));
880
MOVSS(XMM1, fpr.V(sregs[2]));
881
MULSS(XMM1, fpr.V(tregs[0]));
882
ADDSS(XMM0, R(XMM1));
883
MOVSS(XMM1, fpr.V(sregs[3]));
884
MULSS(XMM1, fpr.V(tregs[1]));
885
ADDSS(XMM0, R(XMM1));
886
MOVSS(XMM1, fpr.V(sregs[0]));
887
MULSS(XMM1, fpr.V(tregs[2]));
888
SUBSS(XMM0, R(XMM1));
889
MOVSS(fpr.V(dregs[1]), XMM0);
890
891
// Compute Z
892
//d[2] = s[0] * t[1] - s[1] * t[0] + s[2] * t[3] + s[3] * t[2];
893
MOVSS(XMM0, fpr.V(sregs[0]));
894
MULSS(XMM0, fpr.V(tregs[1]));
895
MOVSS(XMM1, fpr.V(sregs[1]));
896
MULSS(XMM1, fpr.V(tregs[0]));
897
SUBSS(XMM0, R(XMM1));
898
MOVSS(XMM1, fpr.V(sregs[2]));
899
MULSS(XMM1, fpr.V(tregs[3]));
900
ADDSS(XMM0, R(XMM1));
901
MOVSS(XMM1, fpr.V(sregs[3]));
902
MULSS(XMM1, fpr.V(tregs[2]));
903
ADDSS(XMM0, R(XMM1));
904
MOVSS(fpr.V(dregs[2]), XMM0);
905
906
// Compute W
907
//d[3] = -s[0] * t[0] - s[1] * t[1] - s[2] * t[2] + s[3] * t[3];
908
MOVSS(XMM0, fpr.V(sregs[3]));
909
MULSS(XMM0, fpr.V(tregs[3]));
910
MOVSS(XMM1, fpr.V(sregs[1]));
911
MULSS(XMM1, fpr.V(tregs[1]));
912
SUBSS(XMM0, R(XMM1));
913
MOVSS(XMM1, fpr.V(sregs[2]));
914
MULSS(XMM1, fpr.V(tregs[2]));
915
SUBSS(XMM0, R(XMM1));
916
MOVSS(XMM1, fpr.V(sregs[0]));
917
MULSS(XMM1, fpr.V(tregs[0]));
918
SUBSS(XMM0, R(XMM1));
919
MOVSS(fpr.V(dregs[3]), XMM0);
920
}
921
922
fpr.ReleaseSpillLocks();
923
}
924
925
void Jit::Comp_Vcmov(MIPSOpcode op) {
926
CONDITIONAL_DISABLE(VFPU_COMP);
927
928
if (js.HasUnknownPrefix())
929
DISABLE;
930
931
VectorSize sz = GetVecSize(op);
932
int n = GetNumVectorElements(sz);
933
934
u8 sregs[4], dregs[4];
935
GetVectorRegsPrefixS(sregs, sz, _VS);
936
GetVectorRegsPrefixD(dregs, sz, _VD);
937
int tf = (op >> 19) & 1;
938
int imm3 = (op >> 16) & 7;
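// tf selects the sense: with tf == 0 the copy happens when the tested CC bit is set (vcmovt), with tf == 1
// when it is clear (vcmovf). imm3 < 6 tests that single VFPU_CC bit to gate the whole vector; otherwise each
// lane i is gated by CC bit i.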
939
940
// Flush SIMD.
941
fpr.SimpleRegsV(sregs, sz, 0);
942
943
for (int i = 0; i < n; ++i) {
944
// Simplification: Disable if overlap unsafe
945
if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs)) {
946
DISABLE;
947
}
948
}
949
950
if (imm3 < 6) {
951
gpr.MapReg(MIPS_REG_VFPUCC, true, false);
952
fpr.MapRegsV(dregs, sz, MAP_DIRTY);
953
// Test one bit of CC. This bit decides whether none or all subregisters are copied.
954
TEST(32, gpr.R(MIPS_REG_VFPUCC), Imm32(1 << imm3));
955
FixupBranch skip = J_CC(tf ? CC_NZ : CC_Z, true);
956
for (int i = 0; i < n; i++) {
957
MOVSS(fpr.VX(dregs[i]), fpr.V(sregs[i]));
958
}
959
SetJumpTarget(skip);
960
} else {
961
gpr.MapReg(MIPS_REG_VFPUCC, true, false);
962
fpr.MapRegsV(dregs, sz, MAP_DIRTY);
963
// Look at the bottom four bits of CC to individually decide if the subregisters should be copied.
964
for (int i = 0; i < n; i++) {
965
TEST(32, gpr.R(MIPS_REG_VFPUCC), Imm32(1 << i));
966
FixupBranch skip = J_CC(tf ? CC_NZ : CC_Z, true);
967
MOVSS(fpr.VX(dregs[i]), fpr.V(sregs[i]));
968
SetJumpTarget(skip);
969
}
970
}
971
972
ApplyPrefixD(dregs, sz);
973
974
fpr.ReleaseSpillLocks();
975
}
976
977
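// Scalar fallbacks for vmin/vmax, called from Comp_VecDo3 only when UCOMISS reports an unordered (NaN)
// operand. The jitted code stashes the s operand's raw bits in currentMIPS->temp and passes t's raw bits as
// the argument; the comparison is then done on the IEEE-754 bit patterns (sign-magnitude) as described below.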
static s32 DoVminSS(s32 treg) {
978
s32 sreg = currentMIPS->temp;
979
980
// If both are negative, we flip the comparison (floats are sign-magnitude, not two's complement.)
981
if (sreg < 0 && treg < 0) {
982
// If at least one side is NAN, we take the highest mantissa bits.
983
return treg < sreg ? sreg : treg;
984
} else {
985
// Otherwise, we take the lowest value (negative or lowest mantissa.)
986
return treg > sreg ? sreg : treg;
987
}
988
}
989
990
static s32 DoVmaxSS(s32 treg) {
991
s32 sreg = currentMIPS->temp;
992
993
// This is the same logic as vmin, just reversed.
994
if (sreg < 0 && treg < 0) {
995
return treg < sreg ? treg : sreg;
996
} else {
997
return treg > sreg ? treg : sreg;
998
}
999
}
1000
1001
void Jit::Comp_VecDo3(MIPSOpcode op) {
1002
CONDITIONAL_DISABLE(VFPU_VEC);
1003
1004
if (js.HasUnknownPrefix())
1005
DISABLE;
1006
1007
// Check that we can support the ops, and prepare temporary values for ops that need it.
1008
bool allowSIMD = true;
1009
switch (op >> 26) {
1010
case 24: //VFPU0
1011
switch ((op >> 23) & 7) {
1012
case 0: // d[i] = s[i] + t[i]; break; //vadd
1013
case 1: // d[i] = s[i] - t[i]; break; //vsub
1014
case 7: // d[i] = s[i] / t[i]; break; //vdiv
1015
break;
1016
default:
1017
DISABLE;
1018
}
1019
break;
1020
case 25: //VFPU1
1021
switch ((op >> 23) & 7) {
1022
case 0: // d[i] = s[i] * t[i]; break; //vmul
1023
break;
1024
default:
1025
DISABLE;
1026
}
1027
break;
1028
case 27: //VFPU3
1029
switch ((op >> 23) & 7) {
1030
case 2: // vmin
1031
case 3: // vmax
1032
allowSIMD = false;
1033
break;
1034
case 6: // vsge
1035
case 7: // vslt
1036
break;
1037
default:
1038
DISABLE;
1039
}
1040
break;
1041
default:
1042
DISABLE;
1043
break;
1044
}
1045
1046
VectorSize sz = GetVecSize(op);
1047
int n = GetNumVectorElements(sz);
1048
1049
u8 sregs[4], tregs[4], dregs[4];
1050
GetVectorRegsPrefixS(sregs, sz, _VS);
1051
GetVectorRegsPrefixT(tregs, sz, _VT);
1052
GetVectorRegsPrefixD(dregs, sz, _VD);
1053
1054
if (allowSIMD && fpr.TryMapDirtyInInVS(dregs, sz, sregs, sz, tregs, sz)) {
1055
void (XEmitter::*opFunc)(X64Reg, OpArg) = nullptr;
1056
bool symmetric = false;
1057
switch (op >> 26) {
1058
case 24: //VFPU0
1059
switch ((op >> 23) & 7) {
1060
case 0: // d[i] = s[i] + t[i]; break; //vadd
1061
opFunc = &XEmitter::ADDPS;
1062
symmetric = true;
1063
break;
1064
case 1: // d[i] = s[i] - t[i]; break; //vsub
1065
opFunc = &XEmitter::SUBPS;
1066
break;
1067
case 7: // d[i] = s[i] / t[i]; break; //vdiv
1068
opFunc = &XEmitter::DIVPS;
1069
break;
1070
}
1071
break;
1072
case 25: //VFPU1
1073
switch ((op >> 23) & 7)
1074
{
1075
case 0: // d[i] = s[i] * t[i]; break; //vmul
1076
opFunc = &XEmitter::MULPS;
1077
symmetric = true;
1078
break;
1079
}
1080
break;
1081
case 27: //VFPU3
1082
switch ((op >> 23) & 7)
1083
{
1084
case 2: // vmin
1085
// TODO: Mishandles NaN. Disabled for now.
1086
MOVAPS(XMM1, fpr.VS(sregs));
1087
MINPS(XMM1, fpr.VS(tregs));
1088
MOVAPS(fpr.VSX(dregs), R(XMM1));
1089
break;
1090
case 3: // vmax
1091
// TODO: Mishandles NaN. Disabled for now.
1092
MOVAPS(XMM1, fpr.VS(sregs));
1093
MAXPS(XMM1, fpr.VS(tregs));
1094
MOVAPS(fpr.VSX(dregs), R(XMM1));
1095
break;
1096
case 6: // vsge
1097
MOVAPS(XMM0, fpr.VS(tregs));
1098
MOVAPS(XMM1, fpr.VS(sregs));
1099
CMPPS(XMM0, R(XMM1), CMP_ORD);
1100
CMPPS(XMM1, fpr.VS(tregs), CMP_NLT);
1101
1102
ANDPS(XMM1, R(XMM0));
1103
MOV(PTRBITS, R(TEMPREG), ImmPtr(&oneOneOneOne));
1104
ANDPS(XMM1, MatR(TEMPREG));
1105
MOVAPS(fpr.VSX(dregs), R(XMM1));
1106
break;
1107
case 7: // vslt
1108
MOVAPS(XMM1, fpr.VS(sregs));
1109
CMPPS(XMM1, fpr.VS(tregs), CMP_LT);
1110
MOV(PTRBITS, R(TEMPREG), ImmPtr(&oneOneOneOne));
1111
ANDPS(XMM1, MatR(TEMPREG));
1112
MOVAPS(fpr.VSX(dregs), R(XMM1));
1113
break;
1114
}
1115
break;
1116
}
1117
1118
if (opFunc != nullptr) {
1119
if (fpr.VSX(dregs) != fpr.VSX(tregs)) {
1120
if (fpr.VSX(dregs) != fpr.VSX(sregs)) {
1121
MOVAPS(fpr.VSX(dregs), fpr.VS(sregs));
1122
}
1123
(this->*opFunc)(fpr.VSX(dregs), fpr.VS(tregs));
1124
} else if (symmetric) {
1125
// We already know d = t.
1126
(this->*opFunc)(fpr.VSX(dregs), fpr.VS(sregs));
1127
} else {
1128
MOVAPS(XMM1, fpr.VS(sregs));
1129
(this->*opFunc)(XMM1, fpr.VS(tregs));
1130
MOVAPS(fpr.VSX(dregs), R(XMM1));
1131
}
1132
}
1133
1134
ApplyPrefixD(dregs, sz);
1135
fpr.ReleaseSpillLocks();
1136
return;
1137
}
1138
1139
// Flush SIMD.
1140
fpr.SimpleRegsV(sregs, sz, 0);
1141
fpr.SimpleRegsV(tregs, sz, 0);
1142
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
1143
1144
X64Reg tempxregs[4];
1145
for (int i = 0; i < n; ++i)
1146
{
1147
if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs, n, tregs))
1148
{
1149
// On 32-bit we only have 6 xregs for mips regs, use XMM0/XMM1 if possible.
1150
// But for vmin/vmax/vsge, we need XMM0/XMM1, so avoid.
1151
if (i < 2 && (op >> 26) != 27)
1152
tempxregs[i] = (X64Reg) (XMM0 + i);
1153
else
1154
{
1155
int reg = fpr.GetTempV();
1156
fpr.MapRegV(reg, MAP_NOINIT | MAP_DIRTY);
1157
fpr.SpillLockV(reg);
1158
tempxregs[i] = fpr.VX(reg);
1159
}
1160
}
1161
else
1162
{
1163
fpr.MapRegV(dregs[i], dregs[i] == sregs[i] ? MAP_DIRTY : MAP_NOINIT);
1164
fpr.SpillLockV(dregs[i]);
1165
tempxregs[i] = fpr.VX(dregs[i]);
1166
}
1167
}
1168
1169
for (int i = 0; i < n; ++i)
1170
{
1171
if (!fpr.V(sregs[i]).IsSimpleReg(tempxregs[i]))
1172
MOVSS(tempxregs[i], fpr.V(sregs[i]));
1173
}
1174
1175
for (int i = 0; i < n; ++i) {
1176
switch (op >> 26) {
1177
case 24: //VFPU0
1178
switch ((op >> 23) & 7) {
1179
case 0: // d[i] = s[i] + t[i]; break; //vadd
1180
ADDSS(tempxregs[i], fpr.V(tregs[i]));
1181
break;
1182
case 1: // d[i] = s[i] - t[i]; break; //vsub
1183
SUBSS(tempxregs[i], fpr.V(tregs[i]));
1184
break;
1185
case 7: // d[i] = s[i] / t[i]; break; //vdiv
1186
DIVSS(tempxregs[i], fpr.V(tregs[i]));
1187
break;
1188
}
1189
break;
1190
case 25: //VFPU1
1191
switch ((op >> 23) & 7)
1192
{
1193
case 0: // d[i] = s[i] * t[i]; break; //vmul
1194
MULSS(tempxregs[i], fpr.V(tregs[i]));
1195
break;
1196
}
1197
break;
1198
case 27: //VFPU3
1199
switch ((op >> 23) & 7)
1200
{
1201
case 2: // vmin
1202
{
1203
MOVSS(XMM0, fpr.V(tregs[i]));
1204
UCOMISS(tempxregs[i], R(XMM0));
1205
FixupBranch skip = J_CC(CC_NP, true);
1206
1207
MOVSS(MIPSSTATE_VAR(temp), tempxregs[i]);
1208
MOVD_xmm(R(EAX), XMM0);
1209
CallProtectedFunction(&DoVminSS, R(EAX));
1210
MOVD_xmm(tempxregs[i], R(EAX));
1211
FixupBranch finish = J();
1212
1213
SetJumpTarget(skip);
1214
MINSS(tempxregs[i], R(XMM0));
1215
SetJumpTarget(finish);
1216
}
1217
break;
1218
case 3: // vmax
1219
{
1220
MOVSS(XMM0, fpr.V(tregs[i]));
1221
UCOMISS(tempxregs[i], R(XMM0));
1222
FixupBranch skip = J_CC(CC_NP, true);
1223
1224
MOVSS(MIPSSTATE_VAR(temp), tempxregs[i]);
1225
MOVD_xmm(R(EAX), XMM0);
1226
CallProtectedFunction(&DoVmaxSS, R(EAX));
1227
MOVD_xmm(tempxregs[i], R(EAX));
1228
FixupBranch finish = J();
1229
1230
SetJumpTarget(skip);
1231
MAXSS(tempxregs[i], R(XMM0));
1232
SetJumpTarget(finish);
1233
}
1234
break;
1235
case 6: // vsge
1236
// We can't just reverse, because of 0/-0.
1237
MOVSS(XMM0, fpr.V(tregs[i]));
1238
MOVSS(XMM1, R(tempxregs[i]));
1239
CMPORDSS(XMM1, R(XMM0));
1240
CMPNLTSS(tempxregs[i], R(XMM0));
1241
ANDPS(tempxregs[i], R(XMM1));
1242
MOV(PTRBITS, R(TEMPREG), ImmPtr(&oneOneOneOne));
1243
ANDPS(tempxregs[i], MatR(TEMPREG));
1244
break;
1245
case 7: // vslt
1246
CMPLTSS(tempxregs[i], fpr.V(tregs[i]));
1247
MOV(PTRBITS, R(TEMPREG), ImmPtr(&oneOneOneOne));
1248
ANDPS(tempxregs[i], MatR(TEMPREG));
1249
break;
1250
}
1251
break;
1252
}
1253
}
1254
1255
for (int i = 0; i < n; ++i)
1256
{
1257
if (!fpr.V(dregs[i]).IsSimpleReg(tempxregs[i]))
1258
MOVSS(fpr.V(dregs[i]), tempxregs[i]);
1259
}
1260
1261
ApplyPrefixD(dregs, sz);
1262
1263
fpr.ReleaseSpillLocks();
1264
}
1265
1266
alignas(16) static const u32 vcmpMask[4][4] = {
1267
{0x00000031, 0x00000000, 0x00000000, 0x00000000},
1268
{0x00000011, 0x00000012, 0x00000000, 0x00000000},
1269
{0x00000011, 0x00000012, 0x00000014, 0x00000000},
1270
{0x00000011, 0x00000012, 0x00000014, 0x00000018},
1271
};
1272
1273
void Jit::Comp_Vcmp(MIPSOpcode op) {
1274
CONDITIONAL_DISABLE(VFPU_COMP);
1275
1276
if (js.HasUnknownPrefix())
1277
DISABLE;
1278
1279
VectorSize sz = GetVecSize(op);
1280
int n = GetNumVectorElements(sz);
1281
1282
VCondition cond = (VCondition)(op & 0xF);
1283
1284
u8 sregs[4], tregs[4];
1285
GetVectorRegsPrefixS(sregs, sz, _VS);
1286
GetVectorRegsPrefixT(tregs, sz, _VT);
1287
1288
// Some, we just fall back to the interpreter.
1289
switch (cond) {
1290
case VC_EI: // c = my_isinf(s[i]); break;
1291
case VC_NI: // c = !my_isinf(s[i]); break;
1292
DISABLE;
1293
break;
1294
case VC_ES: // c = my_isnan(s[i]) || my_isinf(s[i]); break; // Tekken Dark Resurrection
1295
case VC_NS: // c = !my_isnan(s[i]) && !my_isinf(s[i]); break;
1296
case VC_EN: // c = my_isnan(s[i]); break;
1297
case VC_NN: // c = !my_isnan(s[i]); break;
1298
if (_VS != _VT)
1299
DISABLE;
1300
break;
1301
default:
1302
break;
1303
}
1304
1305
// First, let's get the trivial ones.
1306
1307
static const int true_bits[4] = {0x31, 0x33, 0x37, 0x3f};
1308
1309
if (cond == VC_TR) {
1310
gpr.MapReg(MIPS_REG_VFPUCC, true, true);
1311
OR(32, gpr.R(MIPS_REG_VFPUCC), Imm32(true_bits[n-1]));
1312
return;
1313
} else if (cond == VC_FL) {
1314
gpr.MapReg(MIPS_REG_VFPUCC, true, true);
1315
AND(32, gpr.R(MIPS_REG_VFPUCC), Imm32(~true_bits[n-1]));
1316
return;
1317
}
1318
1319
if (n > 1)
1320
gpr.FlushLockX(ECX);
1321
1322
// Start with zero in each lane for the compare to zero.
1323
if (cond == VC_EZ || cond == VC_NZ) {
1324
XORPS(XMM0, R(XMM0));
1325
if (n > 1) {
1326
XORPS(XMM1, R(XMM1));
1327
}
1328
}
1329
1330
bool inverse = false;
1331
1332
if (cond == VC_GE || cond == VC_GT) {
1333
// We flip, and we need them in regs so we don't clear the high lanes.
1334
fpr.SimpleRegsV(sregs, sz, 0);
1335
fpr.MapRegsV(tregs, sz, 0);
1336
} else {
1337
fpr.SimpleRegsV(tregs, sz, 0);
1338
fpr.MapRegsV(sregs, sz, 0);
1339
}
1340
1341
// We go backwards because it's more convenient to put things in the right lanes.
1342
int affected_bits = (1 << 4) | (1 << 5); // 4 and 5
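// VFPU_CC layout: bits 0-3 are the per-lane results, bit 4 is the OR ("any") and bit 5 the AND ("all") of
// the lanes. affected_bits collects which CC bits this compare writes; lane bits are OR'd in below.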
1343
for (int i = n - 1; i >= 0; --i) {
1344
// Alternate between XMM0 and XMM1
1345
X64Reg reg = i == 1 || i == 3 ? XMM1 : XMM0;
1346
if ((i == 0 || i == 1) && n > 2) {
1347
// We need to swap lanes... this also puts them in the right place.
1348
SHUFPS(reg, R(reg), _MM_SHUFFLE(3, 2, 0, 1));
1349
}
1350
1351
// Let's only handle the easy ones, and fall back on the interpreter for the rest.
1352
bool compareTwo = false;
1353
bool compareToZero = false;
1354
int comparison = -1;
1355
bool flip = false;
1356
1357
switch (cond) {
1358
case VC_ES:
1359
comparison = -1; // We will do the compare at the end. XMM1 will have the bits.
1360
MOVSS(reg, fpr.V(sregs[i]));
1361
break;
1362
1363
case VC_NS:
1364
comparison = -1; // We will do the compare at the end. XMM1 will have the bits.
1365
MOVSS(reg, fpr.V(sregs[i]));
1366
// Note that we do this all at once at the end.
1367
inverse = true;
1368
break;
1369
1370
case VC_EN:
1371
comparison = CMP_UNORD;
1372
compareTwo = true;
1373
break;
1374
1375
case VC_NN:
1376
comparison = CMP_UNORD;
1377
compareTwo = true;
1378
// Note that we do this all at once at the end.
1379
inverse = true;
1380
break;
1381
1382
case VC_EQ: // c = s[i] == t[i]; break;
1383
comparison = CMP_EQ;
1384
compareTwo = true;
1385
break;
1386
1387
case VC_LT: // c = s[i] < t[i]; break;
1388
comparison = CMP_LT;
1389
compareTwo = true;
1390
break;
1391
1392
case VC_LE: // c = s[i] <= t[i]; break;
1393
comparison = CMP_LE;
1394
compareTwo = true;
1395
break;
1396
1397
case VC_NE: // c = s[i] != t[i]; break;
1398
comparison = CMP_NEQ;
1399
compareTwo = true;
1400
break;
1401
1402
case VC_GE: // c = s[i] >= t[i]; break;
1403
comparison = CMP_LE;
1404
flip = true;
1405
compareTwo = true;
1406
break;
1407
1408
case VC_GT: // c = s[i] > t[i]; break;
1409
comparison = CMP_LT;
1410
flip = true;
1411
compareTwo = true;
1412
break;
1413
1414
case VC_EZ: // c = s[i] == 0.0f || s[i] == -0.0f; break;
1415
comparison = CMP_EQ;
1416
compareToZero = true;
1417
break;
1418
1419
case VC_NZ: // c = s[i] != 0; break;
1420
comparison = CMP_NEQ;
1421
compareToZero = true;
1422
break;
1423
1424
default:
1425
DISABLE;
1426
}
1427
1428
if (comparison != -1) {
1429
if (compareTwo) {
1430
if (!flip) {
1431
MOVSS(reg, fpr.V(sregs[i]));
1432
CMPSS(reg, fpr.V(tregs[i]), comparison);
1433
} else {
1434
MOVSS(reg, fpr.V(tregs[i]));
1435
CMPSS(reg, fpr.V(sregs[i]), comparison);
1436
}
1437
} else if (compareToZero) {
1438
CMPSS(reg, fpr.V(sregs[i]), comparison);
1439
}
1440
}
1441
1442
affected_bits |= 1 << i;
1443
}
1444
1445
if (n > 1) {
1446
XOR(32, R(ECX), R(ECX));
1447
1448
// This combines them together.
1449
UNPCKLPS(XMM0, R(XMM1));
1450
1451
// Finalize the comparison for ES/NS.
1452
if (cond == VC_ES || cond == VC_NS) {
1453
MOV(PTRBITS, R(TEMPREG), ImmPtr(&fourinfnan));
1454
ANDPS(XMM0, MatR(TEMPREG));
1455
PCMPEQD(XMM0, MatR(TEMPREG)); // Integer comparison
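// Masking with 0x7F800000 keeps only the exponent; an all-ones exponent means inf or NaN, i.e. the ES case.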
1456
// It's inverted below for NS.
1457
}
1458
1459
if (inverse) {
1460
// The canonical way to generate a bunch of ones, see https://stackoverflow.com/questions/35085059/what-are-the-best-instruction-sequences-to-generate-vector-constants-on-the-fly
1461
PCMPEQW(XMM1, R(XMM1));
1462
XORPS(XMM0, R(XMM1));
1463
}
1464
MOV(PTRBITS, R(TEMPREG), ImmPtr(&vcmpMask[n - 1]));
1465
ANDPS(XMM0, MatR(TEMPREG));
1466
MOVAPS(MIPSSTATE_VAR(vcmpResult), XMM0);
1467
1468
MOV(32, R(TEMPREG), MIPSSTATE_VAR(vcmpResult[0]));
1469
for (int i = 1; i < n; ++i) {
1470
OR(32, R(TEMPREG), MIPSSTATE_VAR_ELEM32(vcmpResult[0], i));
1471
}
1472
1473
// Aggregate the bits. Urgh, expensive. Can optimize for the case of one comparison,
1474
// which is the most common after all.
1475
CMP(32, R(TEMPREG), Imm8(affected_bits & 0x1F));
1476
SETcc(CC_E, R(ECX));
1477
SHL(32, R(ECX), Imm8(5));
1478
OR(32, R(TEMPREG), R(ECX));
1479
} else {
1480
// Finalize the comparison for ES/NS.
1481
if (cond == VC_ES || cond == VC_NS) {
1482
MOV(PTRBITS, R(TEMPREG), ImmPtr(&fourinfnan));
1483
ANDPS(XMM0, MatR(TEMPREG));
1484
PCMPEQD(XMM0, MatR(TEMPREG)); // Integer comparison
1485
// It's inverted below for NS.
1486
}
1487
1488
MOVD_xmm(R(TEMPREG), XMM0);
1489
if (inverse) {
1490
XOR(32, R(TEMPREG), Imm32(0xFFFFFFFF));
1491
}
1492
AND(32, R(TEMPREG), Imm32(0x31));
1493
}
1494
1495
gpr.UnlockAllX();
1496
gpr.MapReg(MIPS_REG_VFPUCC, true, true);
1497
AND(32, gpr.R(MIPS_REG_VFPUCC), Imm32(~affected_bits));
1498
OR(32, gpr.R(MIPS_REG_VFPUCC), R(TEMPREG));
1499
1500
fpr.ReleaseSpillLocks();
1501
}
1502
1503
// There are no immediates for floating point, so we need to load these
1504
// from RAM. Might as well have a table ready.
1505
extern const float mulTableVi2f[32] = {
1506
1.0f/(1UL<<0),1.0f/(1UL<<1),1.0f/(1UL<<2),1.0f/(1UL<<3),
1507
1.0f/(1UL<<4),1.0f/(1UL<<5),1.0f/(1UL<<6),1.0f/(1UL<<7),
1508
1.0f/(1UL<<8),1.0f/(1UL<<9),1.0f/(1UL<<10),1.0f/(1UL<<11),
1509
1.0f/(1UL<<12),1.0f/(1UL<<13),1.0f/(1UL<<14),1.0f/(1UL<<15),
1510
1.0f/(1UL<<16),1.0f/(1UL<<17),1.0f/(1UL<<18),1.0f/(1UL<<19),
1511
1.0f/(1UL<<20),1.0f/(1UL<<21),1.0f/(1UL<<22),1.0f/(1UL<<23),
1512
1.0f/(1UL<<24),1.0f/(1UL<<25),1.0f/(1UL<<26),1.0f/(1UL<<27),
1513
1.0f/(1UL<<28),1.0f/(1UL<<29),1.0f/(1UL<<30),1.0f/(1UL<<31),
1514
};
1515
1516
void Jit::Comp_Vi2f(MIPSOpcode op) {
1517
CONDITIONAL_DISABLE(VFPU_VEC);
1518
1519
if (js.HasUnknownPrefix())
1520
DISABLE;
1521
1522
VectorSize sz = GetVecSize(op);
1523
int n = GetNumVectorElements(sz);
1524
1525
int imm = (op >> 16) & 0x1f;
1526
const float *mult = &mulTableVi2f[imm];
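// vi2f treats the source as fixed point with 'imm' fractional bits: convert the integer to float below,
// then scale by 1/2^imm from the table above (skipped when imm == 0, since the factor is 1.0).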
1527
1528
u8 sregs[4], dregs[4];
1529
GetVectorRegsPrefixS(sregs, sz, _VS);
1530
GetVectorRegsPrefixD(dregs, sz, _VD);
1531
1532
// Flush SIMD.
1533
fpr.SimpleRegsV(sregs, sz, 0);
1534
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
1535
1536
int tempregs[4];
1537
for (int i = 0; i < n; ++i) {
1538
if (!IsOverlapSafe(dregs[i], i, n, sregs)) {
1539
tempregs[i] = fpr.GetTempV();
1540
} else {
1541
tempregs[i] = dregs[i];
1542
}
1543
}
1544
1545
if (*mult != 1.0f) {
1546
if (RipAccessible(mult)) {
1547
MOVSS(XMM1, M(mult)); // rip accessible
1548
} else {
1549
MOV(PTRBITS, R(TEMPREG), ImmPtr(mult));
1550
MOVSS(XMM1, MatR(TEMPREG));
1551
}
1552
}
1553
for (int i = 0; i < n; i++) {
1554
fpr.MapRegV(tempregs[i], sregs[i] == dregs[i] ? MAP_DIRTY : MAP_NOINIT);
1555
if (fpr.V(sregs[i]).IsSimpleReg()) {
1556
CVTDQ2PS(fpr.VX(tempregs[i]), fpr.V(sregs[i]));
1557
} else {
1558
MOVSS(fpr.VX(tempregs[i]), fpr.V(sregs[i]));
1559
CVTDQ2PS(fpr.VX(tempregs[i]), R(fpr.VX(tempregs[i])));
1560
}
1561
if (*mult != 1.0f)
1562
MULSS(fpr.VX(tempregs[i]), R(XMM1));
1563
}
1564
1565
for (int i = 0; i < n; ++i) {
1566
if (dregs[i] != tempregs[i]) {
1567
fpr.MapRegV(dregs[i], MAP_DIRTY | MAP_NOINIT);
1568
MOVSS(fpr.VX(dregs[i]), fpr.V(tempregs[i]));
1569
}
1570
}
1571
1572
ApplyPrefixD(dregs, sz);
1573
fpr.ReleaseSpillLocks();
1574
}
1575
1576
// Planning for true SIMD
1577
1578
// Sequence for gathering sparse registers into one SIMD:
1579
// MOVSS(XMM0, fpr.R(sregs[0]));
1580
// MOVSS(XMM1, fpr.R(sregs[1]));
1581
// MOVSS(XMM2, fpr.R(sregs[2]));
1582
// MOVSS(XMM3, fpr.R(sregs[3]));
1583
// SHUFPS(XMM0, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0)); // XMM0 = S1 S1 S0 S0
1584
// SHUFPS(XMM2, R(XMM3), _MM_SHUFFLE(0, 0, 0, 0)); // XMM2 = S3 S3 S2 S2
1585
// SHUFPS(XMM0, R(XMM2), _MM_SHUFFLE(2, 0, 2, 0)); // XMM0 = S3 S2 S1 S0
1586
// Some punpckwd etc would also work.
1587
// Alternatively, MOVSS and three PINSRD (SSE4) with mem source.
1588
// Why PINSRD instead of INSERTPS?
1589
// http://software.intel.com/en-us/blogs/2009/01/07/using-sse41-for-mp3-encoding-quantization
1590
1591
// Sequence for scattering a SIMD register to sparse registers:
1592
// (Very serial though, better methods may be possible)
1593
// MOVSS(fpr.R(sregs[0]), XMM0);
1594
// SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 3, 2, 1));
1595
// MOVSS(fpr.R(sregs[1]), XMM0);
1596
// SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 3, 2, 1));
1597
// MOVSS(fpr.R(sregs[2]), XMM0);
1598
// SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 3, 2, 1));
1599
// MOVSS(fpr.R(sregs[3]), XMM0);
1600
// On SSE4 we should use EXTRACTPS.
1601
1602
// Translation of ryg's half_to_float5_SSE2
1603
void Jit::Comp_Vh2f(MIPSOpcode op) {
1604
CONDITIONAL_DISABLE(VFPU_VEC);
1605
if (js.HasUnknownPrefix())
1606
DISABLE;
1607
1608
#define SSE_CONST4(name, val) alignas(16) static const u32 name[4] = { (val), (val), (val), (val) }
1609
1610
SSE_CONST4(mask_nosign, 0x7fff);
1611
SSE_CONST4(nan_mantissa, 0x800003ff);
1612
SSE_CONST4(magic, (254 - 15) << 23);
1613
SSE_CONST4(was_infnan, 0x7bff);
1614
SSE_CONST4(exp_infnan, 255 << 23);
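// Constants for the half-to-float conversion below: mask_nosign strips the sign bit; after the <<13 shift,
// multiplying by 'magic' (2^(127-15) as a float) rebiases the exponent from the half bias (15) to the float
// bias (127); was_infnan/exp_infnan detect inputs with an all-ones exponent and force their float exponent
// to 255, while nan_mantissa preserves the sign and original mantissa bits of such inputs.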
1615
1616
OpArg mask_nosign_arg, nan_mantissa_arg, magic_arg, was_infnan_arg, exp_infnan_arg;
1617
if (RipAccessible(mask_nosign)) {
1618
mask_nosign_arg = M(&mask_nosign[0]);
1619
nan_mantissa_arg = M(&nan_mantissa[0]);
1620
magic_arg = M(&magic[0]);
1621
was_infnan_arg = M(&was_infnan[0]);
1622
exp_infnan_arg = M(&exp_infnan[0]);
1623
} else {
1624
MOV(PTRBITS, R(TEMPREG), ImmPtr(&mask_nosign[0]));
1625
mask_nosign_arg = MAccessibleDisp(TEMPREG, &mask_nosign[0], &mask_nosign[0]);
1626
nan_mantissa_arg = MAccessibleDisp(TEMPREG, &mask_nosign[0], &nan_mantissa[0]);
1627
magic_arg = MAccessibleDisp(TEMPREG, &mask_nosign[0], &magic[0]);
1628
was_infnan_arg = MAccessibleDisp(TEMPREG, &mask_nosign[0], &was_infnan[0]);
1629
exp_infnan_arg = MAccessibleDisp(TEMPREG, &mask_nosign[0], &exp_infnan[0]);
1630
}
1631
1632
#undef SSE_CONST4
1633
VectorSize sz = GetVecSize(op);
1634
VectorSize outsize;
1635
switch (sz) {
1636
case V_Single:
1637
outsize = V_Pair;
1638
break;
1639
case V_Pair:
1640
outsize = V_Quad;
1641
break;
1642
default:
1643
DISABLE;
1644
}
1645
1646
u8 sregs[4], dregs[4];
1647
GetVectorRegsPrefixS(sregs, sz, _VS);
1648
GetVectorRegsPrefixD(dregs, outsize, _VD);
1649
1650
// Flush SIMD.
1651
fpr.SimpleRegsV(sregs, sz, 0);
1652
1653
// Force ourselves an extra xreg as temp space.
1654
X64Reg tempR = fpr.GetFreeXReg();
1655
1656
MOVSS(XMM0, fpr.V(sregs[0]));
1657
if (sz != V_Single) {
1658
MOVSS(XMM1, fpr.V(sregs[1]));
1659
PUNPCKLDQ(XMM0, R(XMM1));
1660
}
1661
XORPS(XMM1, R(XMM1));
1662
PUNPCKLWD(XMM0, R(XMM1));
1663
1664
// OK, 16 bits in each word.
1665
// Let's go. Deep magic here.
1666
MOVAPS(XMM1, R(XMM0));
1667
ANDPS(XMM0, mask_nosign_arg); // xmm0 = expmant
1668
XORPS(XMM1, R(XMM0)); // xmm1 = justsign = expmant ^ xmm0
1669
MOVAPS(tempR, R(XMM0));
1670
PSLLD(XMM0, 13);
1671
MULPS(XMM0, magic_arg); // xmm0 = scaled
1672
PSLLD(XMM1, 16); // xmm1 = sign
1673
ORPS(XMM0, R(XMM1));
1674
1675
// Now create a NAN mask, adding in the sign.
1676
ORPS(XMM1, R(tempR)); // xmm1 = sign + original mantissa.
1677
ANDPS(XMM1, nan_mantissa_arg); // xmm1 = original mantissa
1678
PCMPGTD(tempR, was_infnan_arg); // xmm2 = b_wasinfnan
1679
ORPS(XMM1, exp_infnan_arg); // xmm1 = infnan result
1680
ANDPS(XMM1, R(tempR)); // xmm1 = infnan result OR zero if not infnan
1681
ANDNPS(tempR, R(XMM0)); // tempR = result OR zero if infnan
1682
ORPS(XMM1, R(tempR));
1683
1684
fpr.MapRegsV(dregs, outsize, MAP_NOINIT | MAP_DIRTY);
1685
1686
// TODO: Could apply D-prefix in parallel here...
1687
1688
MOVSS(fpr.V(dregs[0]), XMM1);
1689
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3, 3, 2, 1));
1690
MOVSS(fpr.V(dregs[1]), XMM1);
1691
1692
if (sz != V_Single) {
1693
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3, 3, 2, 1));
1694
MOVSS(fpr.V(dregs[2]), XMM1);
1695
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3, 3, 2, 1));
1696
MOVSS(fpr.V(dregs[3]), XMM1);
1697
}
1698
1699
ApplyPrefixD(dregs, outsize);
1700
gpr.UnlockAllX();
1701
fpr.ReleaseSpillLocks();
1702
}
1703
1704
// The goal is to map (reversed byte order for clarity):
1705
// AABBCCDD -> 000000AA 000000BB 000000CC 000000DD
1706
alignas(16) static s8 vc2i_shuffle[16] = { -1, -1, -1, 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3 };
1707
// AABBCCDD -> AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD
1708
alignas(16) static s8 vuc2i_shuffle[16] = { 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3 };
1709
1710
void Jit::Comp_Vx2i(MIPSOpcode op) {
1711
CONDITIONAL_DISABLE(VFPU_VEC);
1712
if (js.HasUnknownPrefix())
1713
DISABLE;
1714
1715
int bits = ((op >> 16) & 2) == 0 ? 8 : 16; // vuc2i/vc2i (0/1), vus2i/vs2i (2/3)
1716
bool unsignedOp = ((op >> 16) & 1) == 0; // vuc2i (0), vus2i (2)
1717
1718
// vs2i or vus2i unpack pairs of 16-bit integers into 32-bit integers, with the values
1719
// at the top. vus2i shifts it an extra bit right afterward.
1720
// vc2i and vuc2i unpack quads of 8-bit integers into 32-bit integers, with the values
1721
// at the top too. vuc2i is a bit special (see below.)
1722
// Let's do this similarly as h2f - we do a solution that works for both singles and pairs
1723
// then use it for both.
1724
1725
VectorSize sz = GetVecSize(op);
1726
VectorSize outsize;
1727
if (bits == 8) {
1728
outsize = V_Quad;
1729
} else {
1730
switch (sz) {
1731
case V_Single:
1732
outsize = V_Pair;
1733
break;
1734
case V_Pair:
1735
outsize = V_Quad;
1736
break;
1737
default:
1738
DISABLE;
1739
}
1740
}
1741
1742
u8 sregs[4], dregs[4];
1743
GetVectorRegsPrefixS(sregs, sz, _VS);
1744
GetVectorRegsPrefixD(dregs, outsize, _VD);
1745
1746
// Flush SIMD.
1747
fpr.SimpleRegsV(sregs, sz, 0);
1748
1749
if (bits == 16) {
1750
MOVSS(XMM1, fpr.V(sregs[0]));
1751
if (sz != V_Single) {
1752
MOVSS(XMM0, fpr.V(sregs[1]));
1753
PUNPCKLDQ(XMM1, R(XMM0));
1754
}
1755
1756
// Unpack 16-bit words into 32-bit words, upper position, and we're done!
1757
PXOR(XMM0, R(XMM0));
1758
PUNPCKLWD(XMM0, R(XMM1));
1759
} else if (bits == 8) {
1760
if (unsignedOp) {
1761
// vuc2i is a bit special. It spreads out the bits like this:
1762
// s[0] = 0xDDCCBBAA -> d[0] = (0xAAAAAAAA >> 1), d[1] = (0xBBBBBBBB >> 1), etc.
1763
MOVSS(XMM0, fpr.V(sregs[0]));
1764
if (cpu_info.bSSSE3 && RipAccessible(vuc2i_shuffle)) {
1765
// Not really different speed. Generates a bit less code.
1766
PSHUFB(XMM0, M(&vuc2i_shuffle[0])); // rip accessible
1767
} else {
1768
// First, we change 0xDDCCBBAA to 0xDDDDCCCCBBBBAAAA.
1769
PUNPCKLBW(XMM0, R(XMM0));
1770
// Now, interleave each 16 bits so they're all 32 bits wide.
1771
PUNPCKLWD(XMM0, R(XMM0));
1772
}
1773
} else {
1774
if (cpu_info.bSSSE3 && RipAccessible(vc2i_shuffle)) {
1775
MOVSS(XMM0, fpr.V(sregs[0]));
1776
PSHUFB(XMM0, M(&vc2i_shuffle[0]));
1777
} else {
1778
PXOR(XMM1, R(XMM1));
1779
MOVSS(XMM0, fpr.V(sregs[0]));
1780
PUNPCKLBW(XMM1, R(XMM0));
1781
PXOR(XMM0, R(XMM0));
1782
PUNPCKLWD(XMM0, R(XMM1));
1783
}
1784
}
1785
}
1786
1787
// At this point we have the regs in the 4 lanes.
1788
// In the "u" mode, we need to shift it out of the sign bit.
1789
if (unsignedOp) {
1790
PSRLD(XMM0, 1);
1791
}
1792
1793
if (fpr.TryMapRegsVS(dregs, outsize, MAP_NOINIT | MAP_DIRTY)) {
1794
MOVAPS(fpr.VSX(dregs), R(XMM0));
1795
} else {
1796
// Done! TODO: The rest of this should be possible to extract into a function.
1797
fpr.MapRegsV(dregs, outsize, MAP_NOINIT | MAP_DIRTY);
1798
1799
// TODO: Could apply D-prefix in parallel here...
1800
1801
MOVSS(fpr.V(dregs[0]), XMM0);
1802
PSRLDQ(XMM0, 4);
1803
MOVSS(fpr.V(dregs[1]), XMM0);
1804
1805
if (outsize != V_Pair) {
1806
PSRLDQ(XMM0, 4);
1807
MOVSS(fpr.V(dregs[2]), XMM0);
1808
PSRLDQ(XMM0, 4);
1809
MOVSS(fpr.V(dregs[3]), XMM0);
1810
}
1811
}
1812
1813
ApplyPrefixD(dregs, outsize);
1814
gpr.UnlockAllX();
1815
fpr.ReleaseSpillLocks();
1816
}
1817
1818
extern const double mulTableVf2i[32] = {
1819
(1ULL<<0),(1ULL<<1),(1ULL<<2),(1ULL<<3),
1820
(1ULL<<4),(1ULL<<5),(1ULL<<6),(1ULL<<7),
1821
(1ULL<<8),(1ULL<<9),(1ULL<<10),(1ULL<<11),
1822
(1ULL<<12),(1ULL<<13),(1ULL<<14),(1ULL<<15),
1823
(1ULL<<16),(1ULL<<17),(1ULL<<18),(1ULL<<19),
1824
(1ULL<<20),(1ULL<<21),(1ULL<<22),(1ULL<<23),
1825
(1ULL<<24),(1ULL<<25),(1ULL<<26),(1ULL<<27),
1826
(1ULL<<28),(1ULL<<29),(1ULL<<30),(1ULL<<31),
1827
};
1828
1829
static const double maxMinIntAsDouble[2] = { (double)0x7fffffff, (double)(int)0x80000000 }; // note: (double)(int)0x80000000 is -2147483648.0, not +2147483648.0
1830
1831
void Jit::Comp_Vf2i(MIPSOpcode op) {
1832
CONDITIONAL_DISABLE(VFPU_VEC);
1833
if (js.HasUnknownPrefix())
1834
DISABLE;
1835
1836
VectorSize sz = GetVecSize(op);
1837
int n = GetNumVectorElements(sz);
1838
1839
int imm = (op >> 16) & 0x1f;
1840
const double *mult = &mulTableVf2i[imm];
1841
1842
int setMXCSR = -1;
1843
int rmode = (op >> 21) & 0x1f;
1844
switch (rmode) {
1845
case 17:
1846
break; //z - truncate. Easy to support.
1847
case 16:
1848
setMXCSR = 0;
1849
break;
1850
case 18:
1851
setMXCSR = 2;
1852
break;
1853
case 19:
1854
setMXCSR = 1;
1855
break;
1856
}
1857
1858
// Small optimization: 0 is our default mode anyway.
1859
if (setMXCSR == 0 && !js.hasSetRounding) {
1860
setMXCSR = -1;
1861
}
1862
// Except for truncate, we need to update MXCSR to our preferred rounding mode.
1863
if (setMXCSR != -1) {
1864
STMXCSR(MIPSSTATE_VAR(mxcsrTemp));
1865
MOV(32, R(TEMPREG), MIPSSTATE_VAR(mxcsrTemp));
1866
AND(32, R(TEMPREG), Imm32(~(3 << 13)));
1867
if (setMXCSR != 0) {
1868
OR(32, R(TEMPREG), Imm32(setMXCSR << 13));
1869
}
1870
MOV(32, MIPSSTATE_VAR(temp), R(TEMPREG));
1871
LDMXCSR(MIPSSTATE_VAR(temp));
1872
}
1873
1874
u8 sregs[4], dregs[4];
1875
GetVectorRegsPrefixS(sregs, sz, _VS);
1876
GetVectorRegsPrefixD(dregs, sz, _VD);
1877
1878
// Really tricky to SIMD due to double precision requirement...
1879
1880
// Flush SIMD.
1881
fpr.SimpleRegsV(sregs, sz, 0);
1882
fpr.SimpleRegsV(dregs, sz, MAP_DIRTY | MAP_NOINIT);
1883
1884
u8 tempregs[4];
1885
for (int i = 0; i < n; ++i) {
1886
if (!IsOverlapSafe(dregs[i], i, n, sregs)) {
1887
tempregs[i] = fpr.GetTempV();
1888
} else {
1889
tempregs[i] = dregs[i];
1890
}
1891
}
1892
1893
if (*mult != 1.0f) {
1894
if (RipAccessible(mult)) {
1895
MOVSD(XMM1, M(mult)); // rip accessible
1896
} else {
1897
MOV(PTRBITS, R(TEMPREG), ImmPtr(mult));
1898
MOVSD(XMM1, MatR(TEMPREG));
1899
}
1900
}
1901
1902
fpr.MapRegsV(tempregs, sz, MAP_DIRTY | MAP_NOINIT);
1903
for (int i = 0; i < n; i++) {
1904
// Need to do this in double precision to clamp correctly, as float
1905
// doesn't have enough precision to represent e.g. 0x7fffffff exactly.
1906
MOVSS(XMM0, fpr.V(sregs[i]));
1907
CVTSS2SD(XMM0, R(XMM0)); // convert to double precision
1908
if (*mult != 1.0f) {
1909
MULSD(XMM0, R(XMM1));
1910
}
1911
MOV(PTRBITS, R(TEMPREG), ImmPtr(maxMinIntAsDouble));
1912
MINSD(XMM0, MDisp(TEMPREG, 0));
1913
MAXSD(XMM0, MDisp(TEMPREG, sizeof(double)));
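// Clamp first: an out-of-range double would otherwise make CVTSD2SI return the
// "integer indefinite" value 0x80000000, so MINSD against (double)INT_MAX and MAXSD
// against (double)INT_MIN keep the result in range.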
1914
// We've set the rounding mode above, so this part's easy.
1915
switch ((op >> 21) & 0x1f) {
1916
case 16: CVTSD2SI(TEMPREG, R(XMM0)); break; //n
1917
case 17: CVTTSD2SI(TEMPREG, R(XMM0)); break; //z - truncate
1918
case 18: CVTSD2SI(TEMPREG, R(XMM0)); break; //u
1919
case 19: CVTSD2SI(TEMPREG, R(XMM0)); break; //d
1920
}
1921
MOVD_xmm(fpr.VX(tempregs[i]), R(TEMPREG));
1922
}
1923
1924
for (int i = 0; i < n; ++i) {
1925
if (dregs[i] != tempregs[i]) {
1926
fpr.MapRegV(dregs[i], MAP_DIRTY | MAP_NOINIT);
1927
MOVSS(fpr.VX(dregs[i]), fpr.V(tempregs[i]));
1928
fpr.DiscardV(tempregs[i]);
1929
}
1930
}
1931
1932
if (setMXCSR != -1) {
1933
LDMXCSR(MIPSSTATE_VAR(mxcsrTemp));
1934
}
1935
1936
ApplyPrefixD(dregs, sz);
1937
fpr.ReleaseSpillLocks();
1938
}
1939
1940
void Jit::Comp_Vcst(MIPSOpcode op) {
1941
CONDITIONAL_DISABLE(VFPU_XFER);
1942
1943
if (js.HasUnknownPrefix())
1944
DISABLE;
1945
1946
int conNum = (op >> 16) & 0x1f;
1947
int vd = _VD;
1948
1949
VectorSize sz = GetVecSize(op);
1950
int n = GetNumVectorElements(sz);
1951
1952
u8 dregs[4];
1953
GetVectorRegsPrefixD(dregs, sz, vd);
1954
1955
if (RipAccessible(cst_constants)) {
1956
MOVSS(XMM0, M(&cst_constants[conNum])); // rip accessible
1957
} else {
1958
MOV(PTRBITS, R(TEMPREG), ImmPtr(&cst_constants[conNum]));
1959
MOVSS(XMM0, MatR(TEMPREG));
1960
}
1961
1962
if (fpr.TryMapRegsVS(dregs, sz, MAP_NOINIT | MAP_DIRTY)) {
1963
SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0,0,0,0));
1964
MOVAPS(fpr.VS(dregs), XMM0);
1965
fpr.ReleaseSpillLocks();
1966
return;
1967
}
1968
1969
fpr.MapRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
1970
for (int i = 0; i < n; i++) {
1971
MOVSS(fpr.V(dregs[i]), XMM0);
1972
}
1973
ApplyPrefixD(dregs, sz);
1974
fpr.ReleaseSpillLocks();
1975
}
1976
1977
void Jit::Comp_Vsgn(MIPSOpcode op) {
1978
CONDITIONAL_DISABLE(VFPU_VEC);
1979
1980
if (js.HasUnknownPrefix())
1981
DISABLE;
1982
1983
VectorSize sz = GetVecSize(op);
1984
int n = GetNumVectorElements(sz);
1985
1986
u8 sregs[4], dregs[4];
1987
GetVectorRegsPrefixS(sregs, sz, _VS);
1988
GetVectorRegsPrefixD(dregs, sz, _VD);
1989
1990
// Flush SIMD.
1991
fpr.SimpleRegsV(sregs, sz, 0);
1992
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
1993
1994
X64Reg tempxregs[4];
1995
for (int i = 0; i < n; ++i) {
1996
if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs)) {
1997
int reg = fpr.GetTempV();
1998
fpr.MapRegV(reg, MAP_NOINIT | MAP_DIRTY);
1999
fpr.SpillLockV(reg);
2000
tempxregs[i] = fpr.VX(reg);
2001
} else {
2002
fpr.MapRegV(dregs[i], dregs[i] == sregs[i] ? MAP_DIRTY : MAP_NOINIT);
2003
fpr.SpillLockV(dregs[i]);
2004
tempxregs[i] = fpr.VX(dregs[i]);
2005
}
2006
}
2007
2008
// It would be nice to have more temp regs here, so we could keep signBitLower and oneOneOneOne in registers...
2009
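// In scalar terms, each lane computes roughly:
//   d[i] = (s[i] == 0.0f) ? 0.0f : copysignf(1.0f, s[i]);
// i.e. keep the sign bit, replace the rest with 1.0f, and zero the result when the
// input compared equal to zero.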
for (int i = 0; i < n; ++i) {
2010
XORPS(XMM0, R(XMM0));
2011
CMPEQSS(XMM0, fpr.V(sregs[i])); // XMM0 = s[i] == 0.0f
2012
MOVSS(XMM1, fpr.V(sregs[i]));
2013
// Preserve sign bit, replace rest with ones
2014
if (RipAccessible(signBitLower)) {
2015
ANDPS(XMM1, M(&signBitLower)); // rip accessible
2016
ORPS(XMM1, M(&oneOneOneOne)); // rip accessible
2017
} else {
2018
MOV(PTRBITS, R(TEMPREG), ImmPtr(&signBitLower));
2019
ANDPS(XMM1, MatR(TEMPREG));
2020
MOV(PTRBITS, R(TEMPREG), ImmPtr(&oneOneOneOne));
2021
ORPS(XMM1, MatR(TEMPREG));
2022
}
2023
// If really was equal to zero, zap. Note that ANDN negates the destination.
2024
ANDNPS(XMM0, R(XMM1));
2025
MOVAPS(tempxregs[i], R(XMM0));
2026
}
2027
2028
for (int i = 0; i < n; ++i) {
2029
if (!fpr.V(dregs[i]).IsSimpleReg(tempxregs[i]))
2030
MOVSS(fpr.V(dregs[i]), tempxregs[i]);
2031
}
2032
2033
ApplyPrefixD(dregs, sz);
2034
2035
fpr.ReleaseSpillLocks();
2036
}
2037
2038
void Jit::Comp_Vocp(MIPSOpcode op) {
2039
CONDITIONAL_DISABLE(VFPU_VEC);
2040
2041
if (js.HasUnknownPrefix())
2042
DISABLE;
2043
2044
VectorSize sz = GetVecSize(op);
2045
int n = GetNumVectorElements(sz);
2046
2047
// This is a hack that modifies prefixes. We eat them later, so just overwrite.
2048
// S prefix forces the negate flags.
2049
js.prefixS |= 0x000F0000;
2050
// T prefix forces constants on and regnum to 1.
2051
// That means negate still works, and abs activates a different constant.
2052
js.prefixT = (js.prefixT & ~0x000000FF) | 0x00000055 | 0x0000F000;
2053
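// Net effect of the prefix trick above: each lane ends up computing roughly
//   d[i] = 1.0f - s[i];
// i.e. the constant 1.0 (forced via the T prefix) plus the negated source.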
2054
u8 sregs[4], tregs[4], dregs[4];
2055
// Actually uses the T prefixes (despite being VS.)
2056
GetVectorRegsPrefixS(sregs, sz, _VS);
2057
if (js.prefixT != 0x0000F055)
2058
GetVectorRegsPrefixT(tregs, sz, _VS);
2059
GetVectorRegsPrefixD(dregs, sz, _VD);
2060
2061
// Flush SIMD.
2062
fpr.SimpleRegsV(sregs, sz, 0);
2063
if (js.prefixT != 0x0000F055)
2064
fpr.SimpleRegsV(tregs, sz, 0);
2065
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
2066
2067
X64Reg tempxregs[4];
2068
for (int i = 0; i < n; ++i) {
2069
if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs)) {
2070
int reg = fpr.GetTempV();
2071
fpr.MapRegV(reg, MAP_NOINIT | MAP_DIRTY);
2072
fpr.SpillLockV(reg);
2073
tempxregs[i] = fpr.VX(reg);
2074
} else {
2075
fpr.MapRegV(dregs[i], dregs[i] == sregs[i] ? MAP_DIRTY : MAP_NOINIT);
2076
fpr.SpillLockV(dregs[i]);
2077
tempxregs[i] = fpr.VX(dregs[i]);
2078
}
2079
}
2080
2081
if (js.prefixT == 0x0000F055) {
2082
MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
2083
MOVSS(XMM1, MatR(TEMPREG));
2084
}
2085
for (int i = 0; i < n; ++i) {
2086
if (js.prefixT == 0x0000F055) {
2087
MOVSS(XMM0, R(XMM1));
2088
} else {
2089
MOVSS(XMM0, fpr.V(tregs[i]));
2090
}
2091
ADDSS(XMM0, fpr.V(sregs[i]));
2092
MOVSS(tempxregs[i], R(XMM0));
2093
}
2094
2095
for (int i = 0; i < n; ++i) {
2096
if (!fpr.V(dregs[i]).IsSimpleReg(tempxregs[i]))
2097
MOVSS(fpr.V(dregs[i]), tempxregs[i]);
2098
}
2099
2100
ApplyPrefixD(dregs, sz);
2101
2102
fpr.ReleaseSpillLocks();
2103
}
2104
2105
void Jit::Comp_Vbfy(MIPSOpcode op) {
2106
CONDITIONAL_DISABLE(VFPU_VEC);
2107
if (js.HasUnknownPrefix())
2108
DISABLE;
2109
2110
VectorSize sz = GetVecSize(op);
2111
int n = GetNumVectorElements(sz);
2112
if (n != 2 && n != 4) {
2113
DISABLE;
2114
}
2115
2116
u8 sregs[4], dregs[4];
2117
GetVectorRegsPrefixS(sregs, sz, _VS);
2118
GetVectorRegsPrefixD(dregs, sz, _VD);
2119
// Flush SIMD.
2120
fpr.SimpleRegsV(sregs, sz, 0);
2121
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
2122
2123
X64Reg tempxregs[4];
2124
for (int i = 0; i < n; ++i) {
2125
if (!IsOverlapSafe(dregs[i], i, n, sregs)) {
2126
int reg = fpr.GetTempV();
2127
fpr.MapRegV(reg, MAP_NOINIT | MAP_DIRTY);
2128
fpr.SpillLockV(reg);
2129
tempxregs[i] = fpr.VX(reg);
2130
} else {
2131
fpr.MapRegV(dregs[i], dregs[i] == sregs[i] ? MAP_DIRTY : MAP_NOINIT);
2132
fpr.SpillLockV(dregs[i]);
2133
tempxregs[i] = fpr.VX(dregs[i]);
2134
}
2135
}
2136
2137
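// Butterfly semantics, in scalar terms:
//   vbfy1: d = { s0+s1, s0-s1, s2+s3, s2-s3 }  (pair: only the first two)
//   vbfy2: d = { s0+s2, s1+s3, s0-s2, s1-s3 }  (quad only)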
int subop = (op >> 16) & 0x1F;
2138
if (subop == 3) {
2139
// vbfy2
2140
MOVSS(tempxregs[0], fpr.V(sregs[0]));
2141
MOVSS(tempxregs[1], fpr.V(sregs[1]));
2142
MOVSS(tempxregs[2], fpr.V(sregs[0]));
2143
MOVSS(tempxregs[3], fpr.V(sregs[1]));
2144
ADDSS(tempxregs[0], fpr.V(sregs[2]));
2145
ADDSS(tempxregs[1], fpr.V(sregs[3]));
2146
SUBSS(tempxregs[2], fpr.V(sregs[2]));
2147
SUBSS(tempxregs[3], fpr.V(sregs[3]));
2148
} else if (subop == 2) {
2149
// vbfy1
2150
MOVSS(tempxregs[0], fpr.V(sregs[0]));
2151
MOVSS(tempxregs[1], fpr.V(sregs[0]));
2152
ADDSS(tempxregs[0], fpr.V(sregs[1]));
2153
SUBSS(tempxregs[1], fpr.V(sregs[1]));
2154
if (n == 4) {
2155
MOVSS(tempxregs[2], fpr.V(sregs[2]));
2156
MOVSS(tempxregs[3], fpr.V(sregs[2]));
2157
ADDSS(tempxregs[2], fpr.V(sregs[3]));
2158
SUBSS(tempxregs[3], fpr.V(sregs[3]));
2159
}
2160
} else {
2161
DISABLE;
2162
}
2163
2164
for (int i = 0; i < n; ++i) {
2165
if (!fpr.V(dregs[i]).IsSimpleReg(tempxregs[i]))
2166
MOVSS(fpr.V(dregs[i]), tempxregs[i]);
2167
}
2168
2169
ApplyPrefixD(dregs, sz);
2170
2171
fpr.ReleaseSpillLocks();
2172
}
2173
2174
union u32float {
2175
u32 u;
2176
float f;
2177
2178
operator float() const {
2179
return f;
2180
}
2181
2182
inline u32float &operator *=(const float &other) {
2183
f *= other;
2184
return *this;
2185
}
2186
};
2187
2188
#if PPSSPP_ARCH(AMD64)
2189
typedef float SinCosArg;
2190
#else
2191
typedef u32float SinCosArg;
2192
#endif
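// On 32-bit x86 the helpers below receive the angle as raw float bits in a 4-byte
// stack argument (see specialFuncCallHelper), so u32float lets those bits be
// reinterpreted as a float. On x86-64 the argument arrives in XMM0 as a plain float.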
2193
2194
void SinCos(SinCosArg angle, float *output) {
2195
vfpu_sincos(angle, output[0], output[1]);
2196
}
2197
2198
void SinOnly(SinCosArg angle, float *output) {
2199
output[0] = vfpu_sin(angle);
2200
}
2201
2202
void NegSinOnly(SinCosArg angle, float *output) {
2203
output[0] = -vfpu_sin(angle);
2204
}
2205
2206
void CosOnly(SinCosArg angle, float *output) {
2207
output[1] = vfpu_cos(angle);
2208
}
2209
2210
void ASinScaled(SinCosArg sine, float *output) {
2211
output[0] = vfpu_asin(sine);
2212
}
2213
2214
void SinCosNegSin(SinCosArg angle, float *output) {
2215
vfpu_sincos(angle, output[0], output[1]);
2216
output[0] = -output[0];
2217
}
2218
2219
void Exp2(SinCosArg arg, float *output) {
2220
output[0] = vfpu_exp2(arg);
2221
}
2222
2223
void Log2(SinCosArg arg, float *output) {
2224
output[0] = vfpu_log2(arg);
2225
}
2226
2227
void RExp2(SinCosArg arg, float *output) {
2228
output[0] = vfpu_rexp2(arg);
2229
}
2230
2231
void Jit::Comp_VV2Op(MIPSOpcode op) {
2232
CONDITIONAL_DISABLE(VFPU_VEC);
2233
2234
if (js.HasUnknownPrefix())
2235
DISABLE;
2236
2237
auto specialFuncCallHelper = [this](void (*specialFunc)(SinCosArg, float *output), u8 sreg) {
2238
#if PPSSPP_ARCH(AMD64)
2239
MOVSS(XMM0, fpr.V(sreg));
2240
// TODO: This reg might be different on Linux...
2241
#ifdef _WIN32
2242
LEA(64, RDX, MIPSSTATE_VAR(sincostemp[0]));
2243
#else
2244
LEA(64, RDI, MIPSSTATE_VAR(sincostemp[0]));
2245
#endif
2246
ABI_CallFunction(thunks.ProtectFunction((const void *)specialFunc, 0));
2247
#else
2248
// Sigh, passing floats with cdecl isn't pretty, ends up on the stack.
2249
if (fpr.V(sreg).IsSimpleReg()) {
2250
MOVD_xmm(R(EAX), fpr.VX(sreg));
2251
} else {
2252
MOV(32, R(EAX), fpr.V(sreg));
2253
}
2254
CallProtectedFunction((const void *)specialFunc, R(EAX), Imm32((uint32_t)(uintptr_t)&mips_->sincostemp[0]));
2255
#endif
2256
};
2257
2258
// Pre-processing: Eliminate silly no-op VMOVs, common in Wipeout Pure
2259
if (((op >> 16) & 0x1f) == 0 && _VS == _VD && js.HasNoPrefix()) {
2260
return;
2261
}
2262
2263
VectorSize sz = GetVecSize(op);
2264
int n = GetNumVectorElements(sz);
2265
2266
u8 sregs[4], dregs[4];
2267
GetVectorRegsPrefixS(sregs, sz, _VS);
2268
GetVectorRegsPrefixD(dregs, sz, _VD);
2269
2270
bool canSIMD = false;
2271
// Some can be SIMD'd.
2272
switch ((op >> 16) & 0x1f) {
2273
case 0: // vmov
2274
case 1: // vabs
2275
case 2: // vneg
2276
canSIMD = true;
2277
break;
2278
}
2279
2280
if (canSIMD && fpr.TryMapDirtyInVS(dregs, sz, sregs, sz)) {
2281
switch ((op >> 16) & 0x1f) {
2282
case 0: // vmov
2283
MOVAPS(fpr.VSX(dregs), fpr.VS(sregs));
2284
break;
2285
case 1: // vabs
2286
if (dregs[0] != sregs[0])
2287
MOVAPS(fpr.VSX(dregs), fpr.VS(sregs));
2288
if (RipAccessible(&noSignMask)) {
2289
ANDPS(fpr.VSX(dregs), M(&noSignMask)); // rip accessible
2290
} else {
2291
MOV(PTRBITS, R(TEMPREG), ImmPtr(&noSignMask));
2292
ANDPS(fpr.VSX(dregs), MatR(TEMPREG));
2293
}
2294
break;
2295
case 2: // vneg
2296
if (dregs[0] != sregs[0])
2297
MOVAPS(fpr.VSX(dregs), fpr.VS(sregs));
2298
if (RipAccessible(&signBitAll)) {
2299
XORPS(fpr.VSX(dregs), M(&signBitAll)); // rip accessible
2300
} else {
2301
MOV(PTRBITS, R(TEMPREG), ImmPtr(&signBitAll));
2302
XORPS(fpr.VSX(dregs), MatR(TEMPREG));
2303
}
2304
break;
2305
}
2306
ApplyPrefixD(dregs, sz);
2307
fpr.ReleaseSpillLocks();
2308
return;
2309
}
2310
2311
// Flush SIMD.
2312
fpr.SimpleRegsV(sregs, sz, 0);
2313
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
2314
2315
X64Reg tempxregs[4];
2316
for (int i = 0; i < n; ++i)
2317
{
2318
if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs))
2319
{
2320
int reg = fpr.GetTempV();
2321
fpr.MapRegV(reg, MAP_NOINIT | MAP_DIRTY);
2322
fpr.SpillLockV(reg);
2323
tempxregs[i] = fpr.VX(reg);
2324
}
2325
else
2326
{
2327
fpr.MapRegV(dregs[i], dregs[i] == sregs[i] ? MAP_DIRTY : MAP_NOINIT);
2328
fpr.SpillLockV(dregs[i]);
2329
tempxregs[i] = fpr.VX(dregs[i]);
2330
}
2331
}
2332
2333
// Warning: sregs[i] and tempxregs[i] may be the same reg.
2334
// Helps for vmov, hurts for vrcp, etc.
2335
for (int i = 0; i < n; ++i)
2336
{
2337
switch ((op >> 16) & 0x1f)
2338
{
2339
case 0: // d[i] = s[i]; break; //vmov
2340
// Probably for swizzle.
2341
if (!fpr.V(sregs[i]).IsSimpleReg(tempxregs[i]))
2342
MOVSS(tempxregs[i], fpr.V(sregs[i]));
2343
break;
2344
case 1: // d[i] = fabsf(s[i]); break; //vabs
2345
if (!fpr.V(sregs[i]).IsSimpleReg(tempxregs[i]))
2346
MOVSS(tempxregs[i], fpr.V(sregs[i]));
2347
if (RipAccessible(&noSignMask)) {
2348
ANDPS(tempxregs[i], M(&noSignMask)); // rip accessible
2349
} else {
2350
MOV(PTRBITS, R(TEMPREG), ImmPtr(&noSignMask));
2351
ANDPS(tempxregs[i], MatR(TEMPREG));
2352
}
2353
break;
2354
case 2: // d[i] = -s[i]; break; //vneg
2355
if (!fpr.V(sregs[i]).IsSimpleReg(tempxregs[i]))
2356
MOVSS(tempxregs[i], fpr.V(sregs[i]));
2357
if (RipAccessible(&signBitLower)) {
2358
XORPS(tempxregs[i], M(&signBitLower)); // rip accessible
2359
} else {
2360
MOV(PTRBITS, R(TEMPREG), ImmPtr(&signBitLower));
2361
XORPS(tempxregs[i], MatR(TEMPREG));
2362
}
2363
break;
2364
case 4: // if (s[i] < 0) d[i] = 0; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break; // vsat0
2365
if (!fpr.V(sregs[i]).IsSimpleReg(tempxregs[i]))
2366
MOVSS(tempxregs[i], fpr.V(sregs[i]));
2367
2368
// Zero out XMM0 if it was <= +0.0f (but skip NAN.)
2369
MOVSS(R(XMM0), tempxregs[i]);
2370
XORPS(XMM1, R(XMM1));
2371
CMPLESS(XMM0, R(XMM1));
2372
ANDNPS(XMM0, R(tempxregs[i]));
2373
2374
// Retain a NAN in XMM0 (must be second operand.)
2375
MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
2376
MOVSS(tempxregs[i], MatR(TEMPREG));
2377
MINSS(tempxregs[i], R(XMM0));
2378
break;
2379
case 5: // if (s[i] < -1.0f) d[i] = -1.0f; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break; // vsat1
2380
if (!fpr.V(sregs[i]).IsSimpleReg(tempxregs[i]))
2381
MOVSS(tempxregs[i], fpr.V(sregs[i]));
2382
2383
// Check for < -1.0f, but careful of NANs.
2384
MOV(PTRBITS, R(TEMPREG), ImmPtr(&minus_one));
2385
MOVSS(XMM1, MatR(TEMPREG));
2386
MOVSS(R(XMM0), tempxregs[i]);
2387
CMPLESS(XMM0, R(XMM1));
2388
// If it was NOT less, the three ops below do nothing.
2389
// Otherwise, they replace the value with -1.0f.
2390
ANDPS(XMM1, R(XMM0));
2391
ANDNPS(XMM0, R(tempxregs[i]));
2392
ORPS(XMM0, R(XMM1));
2393
2394
// Retain a NAN in XMM0 (must be second operand.)
2395
MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
2396
MOVSS(tempxregs[i], MatR(TEMPREG));
2397
MINSS(tempxregs[i], R(XMM0));
2398
break;
2399
case 16: // d[i] = 1.0f / s[i]; break; //vrcp
2400
if (RipAccessible(&one)) {
2401
MOVSS(XMM0, M(&one)); // rip accessible
2402
} else {
2403
MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
2404
MOVSS(XMM0, MatR(TEMPREG));
2405
}
2406
DIVSS(XMM0, fpr.V(sregs[i]));
2407
MOVSS(tempxregs[i], R(XMM0));
2408
break;
2409
case 17: // d[i] = 1.0f / sqrtf(s[i]); break; //vrsq
2410
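// Uses a full-precision SQRTSS + DIVSS rather than RSQRTSS, presumably since RSQRTSS
// is only accurate to roughly 12 bits.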
SQRTSS(XMM0, fpr.V(sregs[i]));
2411
if (RipAccessible(&one)) {
2412
MOVSS(tempxregs[i], M(&one)); // rip accessible
2413
} else {
2414
MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
2415
MOVSS(tempxregs[i], MatR(TEMPREG));
2416
}
2417
DIVSS(tempxregs[i], R(XMM0));
2418
break;
2419
case 18: // d[i] = sinf((float)M_PI_2 * s[i]); break; //vsin
2420
specialFuncCallHelper(&SinOnly, sregs[i]);
2421
MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
2422
break;
2423
case 19: // d[i] = cosf((float)M_PI_2 * s[i]); break; //vcos
2424
specialFuncCallHelper(&CosOnly, sregs[i]);
2425
MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[1]));
2426
break;
2427
case 20: // d[i] = powf(2.0f, s[i]); break; //vexp2
2428
specialFuncCallHelper(&Exp2, sregs[i]);
2429
MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
2430
break;
2431
case 21: // d[i] = logf(s[i])/log(2.0f); break; //vlog2
2432
specialFuncCallHelper(&Log2, sregs[i]);
2433
MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
2434
break;
2435
case 22: // d[i] = sqrtf(s[i]); break; //vsqrt
2436
SQRTSS(tempxregs[i], fpr.V(sregs[i]));
2437
MOV(PTRBITS, R(TEMPREG), ImmPtr(&noSignMask));
2438
ANDPS(tempxregs[i], MatR(TEMPREG));
2439
break;
2440
case 23: // d[i] = asinf(s[i]) / M_PI_2; break; //vasin
2441
specialFuncCallHelper(&ASinScaled, sregs[i]);
2442
MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
2443
break;
2444
case 24: // d[i] = -1.0f / s[i]; break; // vnrcp
2445
// Rare so let's not bother checking for RipAccessible.
2446
MOV(PTRBITS, R(TEMPREG), ImmPtr(&minus_one));
2447
MOVSS(XMM0, MatR(TEMPREG));
2448
DIVSS(XMM0, fpr.V(sregs[i]));
2449
MOVSS(tempxregs[i], R(XMM0));
2450
break;
2451
case 26: // d[i] = -sinf((float)M_PI_2 * s[i]); break; // vnsin
2452
specialFuncCallHelper(&NegSinOnly, sregs[i]);
2453
MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
2454
break;
2455
case 28: // d[i] = 1.0f / expf(s[i] * (float)M_LOG2E); break; // vrexp2
2456
specialFuncCallHelper(&RExp2, sregs[i]);
2457
MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
2458
break;
2459
}
2460
}
2461
for (int i = 0; i < n; ++i)
2462
{
2463
if (!fpr.V(dregs[i]).IsSimpleReg(tempxregs[i]))
2464
MOVSS(fpr.V(dregs[i]), tempxregs[i]);
2465
}
2466
2467
ApplyPrefixD(dregs, sz);
2468
2469
fpr.ReleaseSpillLocks();
2470
}
2471
2472
void Jit::Comp_Mftv(MIPSOpcode op) {
2473
CONDITIONAL_DISABLE(VFPU_XFER);
2474
2475
int imm = op & 0xFF;
2476
MIPSGPReg rt = _RT;
2477
switch ((op >> 21) & 0x1f)
2478
{
2479
case 3: //mfv / mfvc
2480
// rt = 0, imm = 255 appears to be used as a CPU interlock by some games.
2481
if (rt != MIPS_REG_ZERO) {
2482
if (imm < 128) { //R(rt) = VI(imm);
2483
fpr.SimpleRegV(imm, 0);
2484
if (fpr.V(imm).IsSimpleReg()) {
2485
fpr.MapRegV(imm, 0);
2486
gpr.MapReg(rt, false, true);
2487
MOVD_xmm(gpr.R(rt), fpr.VX(imm));
2488
} else {
2489
// Let's not bother mapping the vreg.
2490
gpr.MapReg(rt, false, true);
2491
MOV(32, gpr.R(rt), fpr.V(imm));
2492
}
2493
} else if (imm < 128 + VFPU_CTRL_MAX) { //mfvc
2494
if (imm - 128 == VFPU_CTRL_CC) {
2495
if (gpr.IsImm(MIPS_REG_VFPUCC)) {
2496
gpr.SetImm(rt, gpr.GetImm(MIPS_REG_VFPUCC));
2497
} else {
2498
gpr.Lock(rt, MIPS_REG_VFPUCC);
2499
gpr.MapReg(rt, false, true);
2500
gpr.MapReg(MIPS_REG_VFPUCC, true, false);
2501
MOV(32, gpr.R(rt), gpr.R(MIPS_REG_VFPUCC));
2502
gpr.UnlockAll();
2503
}
2504
} else {
2505
// In case we have a saved prefix.
2506
FlushPrefixV();
2507
gpr.MapReg(rt, false, true);
2508
MOV(32, gpr.R(rt), MIPSSTATE_VAR_ELEM32(vfpuCtrl[0], imm - 128));
2509
}
2510
} else {
2511
//ERROR - maybe this value too needs to be treated as an "interlock" value?
2512
_dbg_assert_msg_(false,"mfv - invalid register");
2513
}
2514
}
2515
break;
2516
2517
case 7: //mtv
2518
if (imm < 128) { // VI(imm) = R(rt);
2519
fpr.MapRegV(imm, MAP_DIRTY | MAP_NOINIT);
2520
// Let's not bother mapping rt if we don't have to.
2521
if (gpr.IsImm(rt) && gpr.GetImm(rt) == 0) {
2522
XORPS(fpr.VX(imm), fpr.V(imm));
2523
} else {
2524
gpr.KillImmediate(rt, true, false);
2525
MOVD_xmm(fpr.VX(imm), gpr.R(rt));
2526
}
2527
} else if (imm < 128 + VFPU_CTRL_MAX) { //mtvc //currentMIPS->vfpuCtrl[imm - 128] = R(rt);
2528
if (imm - 128 == VFPU_CTRL_CC) {
2529
if (gpr.IsImm(rt)) {
2530
gpr.SetImm(MIPS_REG_VFPUCC, gpr.GetImm(rt));
2531
} else {
2532
gpr.Lock(rt, MIPS_REG_VFPUCC);
2533
gpr.MapReg(rt, true, false);
2534
gpr.MapReg(MIPS_REG_VFPUCC, false, true);
2535
MOV(32, gpr.R(MIPS_REG_VFPUCC), gpr.R(rt));
2536
gpr.UnlockAll();
2537
}
2538
} else {
2539
gpr.MapReg(rt, true, false);
2540
MOV(32, MIPSSTATE_VAR_ELEM32(vfpuCtrl[0], imm - 128), gpr.R(rt));
2541
}
2542
2543
// TODO: Optimization if rt is Imm?
2544
if (imm - 128 == VFPU_CTRL_SPREFIX) {
2545
js.prefixSFlag = JitState::PREFIX_UNKNOWN;
2546
js.blockWrotePrefixes = true;
2547
} else if (imm - 128 == VFPU_CTRL_TPREFIX) {
2548
js.prefixTFlag = JitState::PREFIX_UNKNOWN;
2549
js.blockWrotePrefixes = true;
2550
} else if (imm - 128 == VFPU_CTRL_DPREFIX) {
2551
js.prefixDFlag = JitState::PREFIX_UNKNOWN;
2552
js.blockWrotePrefixes = true;
2553
}
2554
} else {
2555
//ERROR
2556
_dbg_assert_msg_(false,"mtv - invalid register");
2557
}
2558
break;
2559
2560
default:
2561
DISABLE;
2562
}
2563
}
2564
2565
void Jit::Comp_Vmfvc(MIPSOpcode op) {
2566
CONDITIONAL_DISABLE(VFPU_XFER);
2567
int vd = _VD;
2568
int imm = (op >> 8) & 0x7F;
2569
if (imm < VFPU_CTRL_MAX) {
2570
fpr.MapRegV(vd, MAP_DIRTY | MAP_NOINIT);
2571
if (imm == VFPU_CTRL_CC) {
2572
gpr.MapReg(MIPS_REG_VFPUCC, true, false);
2573
MOVD_xmm(fpr.VX(vd), gpr.R(MIPS_REG_VFPUCC));
2574
} else {
2575
MOVSS(fpr.VX(vd), MIPSSTATE_VAR_ELEM32(vfpuCtrl[0], imm));
2576
}
2577
fpr.ReleaseSpillLocks();
2578
} else {
2579
fpr.MapRegV(vd, MAP_DIRTY | MAP_NOINIT);
2580
XORPS(fpr.VX(vd), fpr.V(vd));
2581
fpr.ReleaseSpillLocks();
2582
}
2583
}
2584
2585
void Jit::Comp_Vmtvc(MIPSOpcode op) {
2586
CONDITIONAL_DISABLE(VFPU_XFER);
2587
int vs = _VS;
2588
int imm = op & 0x7F;
2589
if (imm < VFPU_CTRL_MAX) {
2590
fpr.MapRegV(vs, 0);
2591
if (imm == VFPU_CTRL_CC) {
2592
gpr.MapReg(MIPS_REG_VFPUCC, false, true);
2593
MOVD_xmm(gpr.R(MIPS_REG_VFPUCC), fpr.VX(vs));
2594
} else {
2595
MOVSS(MIPSSTATE_VAR_ELEM32(vfpuCtrl[0], imm), fpr.VX(vs));
2596
}
2597
fpr.ReleaseSpillLocks();
2598
2599
if (imm == VFPU_CTRL_SPREFIX) {
2600
js.prefixSFlag = JitState::PREFIX_UNKNOWN;
2601
js.blockWrotePrefixes = true;
2602
} else if (imm == VFPU_CTRL_TPREFIX) {
2603
js.prefixTFlag = JitState::PREFIX_UNKNOWN;
2604
js.blockWrotePrefixes = true;
2605
} else if (imm == VFPU_CTRL_DPREFIX) {
2606
js.prefixDFlag = JitState::PREFIX_UNKNOWN;
2607
js.blockWrotePrefixes = true;
2608
}
2609
}
2610
}
2611
2612
void Jit::Comp_VMatrixInit(MIPSOpcode op) {
2613
CONDITIONAL_DISABLE(VFPU_XFER);
2614
2615
if (js.HasUnknownPrefix())
2616
DISABLE;
2617
2618
MatrixSize sz = GetMtxSize(op);
2619
int n = GetMatrixSide(sz);
2620
2621
// Not really about trying here, it will work if enabled.
2622
if (jo.enableVFPUSIMD) {
2623
VectorSize vsz = GetVectorSize(sz);
2624
u8 vecs[4];
2625
GetMatrixColumns(_VD, sz, vecs);
2626
switch ((op >> 16) & 0xF) {
2627
case 3:
2628
MOV(PTRBITS, R(TEMPREG), ImmPtr(&identityMatrix[0]));
2629
break;
2630
case 7:
2631
MOV(PTRBITS, R(TEMPREG), ImmPtr(&oneOneOneOne));
2632
MOVAPS(XMM0, MatR(TEMPREG));
2633
break;
2634
}
2635
2636
for (int i = 0; i < n; i++) {
2637
u8 vec[4];
2638
GetVectorRegs(vec, vsz, vecs[i]);
2639
fpr.MapRegsVS(vec, vsz, MAP_NOINIT | MAP_DIRTY);
2640
switch ((op >> 16) & 0xF) {
2641
case 3:
2642
MOVAPS(fpr.VSX(vec), MDisp(TEMPREG, 16 * i));
2643
break;
2644
case 6:
2645
XORPS(fpr.VSX(vec), fpr.VS(vec));
2646
break;
2647
case 7:
2648
MOVAPS(fpr.VSX(vec), R(XMM0));
2649
break;
2650
}
2651
}
2652
fpr.ReleaseSpillLocks();
2653
return;
2654
}
2655
2656
u8 dregs[16];
2657
GetMatrixRegs(dregs, sz, _VD);
2658
2659
// Flush SIMD.
2660
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
2661
2662
switch ((op >> 16) & 0xF) {
2663
case 3: // vmidt
2664
XORPS(XMM0, R(XMM0));
2665
MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
2666
MOVSS(XMM1, MatR(TEMPREG));
2667
for (int a = 0; a < n; a++) {
2668
for (int b = 0; b < n; b++) {
2669
MOVSS(fpr.V(dregs[a * 4 + b]), a == b ? XMM1 : XMM0);
2670
}
2671
}
2672
break;
2673
case 6: // vmzero
2674
XORPS(XMM0, R(XMM0));
2675
for (int a = 0; a < n; a++) {
2676
for (int b = 0; b < n; b++) {
2677
MOVSS(fpr.V(dregs[a * 4 + b]), XMM0);
2678
}
2679
}
2680
break;
2681
case 7: // vmone
2682
MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
2683
MOVSS(XMM0, MatR(TEMPREG));
2684
for (int a = 0; a < n; a++) {
2685
for (int b = 0; b < n; b++) {
2686
MOVSS(fpr.V(dregs[a * 4 + b]), XMM0);
2687
}
2688
}
2689
break;
2690
}
2691
2692
fpr.ReleaseSpillLocks();
2693
}
2694
2695
void Jit::Comp_Vmmov(MIPSOpcode op) {
2696
CONDITIONAL_DISABLE(VFPU_MTX_VMMOV);
2697
2698
// TODO: This probably ignores prefixes?
2699
if (js.HasUnknownPrefix())
2700
DISABLE;
2701
2702
MatrixSize sz = GetMtxSize(op);
2703
int n = GetMatrixSide(sz);
2704
2705
if (jo.enableVFPUSIMD) {
2706
VectorSize vsz = GetVectorSize(sz);
2707
u8 dest[4][4];
2708
MatrixOverlapType overlap = GetMatrixOverlap(_VD, _VS, sz);
2709
2710
u8 vecs[4];
2711
if (overlap == OVERLAP_NONE) {
2712
GetMatrixColumns(_VD, sz, vecs);
2713
for (int i = 0; i < n; ++i) {
2714
GetVectorRegs(dest[i], vsz, vecs[i]);
2715
}
2716
} else {
2717
for (int i = 0; i < n; ++i) {
2718
fpr.GetTempVS(dest[i], vsz);
2719
}
2720
}
2721
2722
GetMatrixColumns(_VS, sz, vecs);
2723
for (int i = 0; i < n; i++) {
2724
u8 vec[4];
2725
GetVectorRegs(vec, vsz, vecs[i]);
2726
fpr.MapRegsVS(vec, vsz, 0);
2727
fpr.MapRegsVS(dest[i], vsz, MAP_NOINIT);
2728
MOVAPS(fpr.VSX(dest[i]), fpr.VS(vec));
2729
fpr.ReleaseSpillLocks();
2730
}
2731
2732
if (overlap != OVERLAP_NONE) {
2733
// Okay, move from the temps to VD now.
2734
GetMatrixColumns(_VD, sz, vecs);
2735
for (int i = 0; i < n; i++) {
2736
u8 vec[4];
2737
GetVectorRegs(vec, vsz, vecs[i]);
2738
fpr.MapRegsVS(vec, vsz, MAP_NOINIT);
2739
fpr.MapRegsVS(dest[i], vsz, 0);
2740
MOVAPS(fpr.VSX(vec), fpr.VS(dest[i]));
2741
fpr.ReleaseSpillLocks();
2742
}
2743
}
2744
2745
fpr.ReleaseSpillLocks();
2746
return;
2747
}
2748
2749
u8 sregs[16], dregs[16];
2750
GetMatrixRegs(sregs, sz, _VS);
2751
GetMatrixRegs(dregs, sz, _VD);
2752
2753
// Flush SIMD.
2754
fpr.SimpleRegsV(sregs, sz, 0);
2755
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
2756
2757
// TODO: gas doesn't allow overlap, what does the PSP do?
2758
// Potentially detect overlap or the safe direction to move in, or just DISABLE?
2759
// This is far from optimal; it blows the regcache every time.
2760
u8 tempregs[16];
2761
for (int a = 0; a < n; a++) {
2762
for (int b = 0; b < n; b++) {
2763
u8 temp = (u8) fpr.GetTempV();
2764
fpr.MapRegV(temp, MAP_NOINIT | MAP_DIRTY);
2765
MOVSS(fpr.VX(temp), fpr.V(sregs[a * 4 + b]));
2766
fpr.StoreFromRegisterV(temp);
2767
tempregs[a * 4 + b] = temp;
2768
}
2769
}
2770
for (int a = 0; a < n; a++) {
2771
for (int b = 0; b < n; b++) {
2772
u8 temp = tempregs[a * 4 + b];
2773
fpr.MapRegV(temp, 0);
2774
MOVSS(fpr.V(dregs[a * 4 + b]), fpr.VX(temp));
2775
}
2776
}
2777
2778
fpr.ReleaseSpillLocks();
2779
}
2780
2781
void Jit::Comp_VScl(MIPSOpcode op) {
2782
CONDITIONAL_DISABLE(VFPU_VEC);
2783
2784
if (js.HasUnknownPrefix())
2785
DISABLE;
2786
2787
VectorSize sz = GetVecSize(op);
2788
int n = GetNumVectorElements(sz);
2789
2790
u8 sregs[4], dregs[4], scale;
2791
GetVectorRegsPrefixS(sregs, sz, _VS);
2792
GetVectorRegsPrefixT(&scale, V_Single, _VT);
2793
GetVectorRegsPrefixD(dregs, sz, _VD);
2794
2795
if (fpr.TryMapDirtyInInVS(dregs, sz, sregs, sz, &scale, V_Single, true)) {
2796
MOVSS(XMM0, fpr.VS(&scale));
2797
if (sz != V_Single)
2798
SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0));
2799
if (dregs[0] != sregs[0]) {
2800
MOVAPS(fpr.VSX(dregs), fpr.VS(sregs));
2801
}
2802
MULPS(fpr.VSX(dregs), R(XMM0));
2803
ApplyPrefixD(dregs, sz);
2804
fpr.ReleaseSpillLocks();
2805
return;
2806
}
2807
2808
// Flush SIMD.
2809
fpr.SimpleRegsV(sregs, sz, 0);
2810
fpr.SimpleRegsV(&scale, V_Single, 0);
2811
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
2812
2813
// Move to XMM0 early, so we don't have to worry about overlap with scale.
2814
MOVSS(XMM0, fpr.V(scale));
2815
2816
X64Reg tempxregs[4];
2817
for (int i = 0; i < n; ++i) {
2818
if (dregs[i] != scale || !IsOverlapSafeAllowS(dregs[i], i, n, sregs)) {
2819
int reg = fpr.GetTempV();
2820
fpr.MapRegV(reg, MAP_NOINIT | MAP_DIRTY);
2821
fpr.SpillLockV(reg);
2822
tempxregs[i] = fpr.VX(reg);
2823
} else {
2824
fpr.MapRegV(dregs[i], dregs[i] == sregs[i] ? MAP_DIRTY : MAP_NOINIT);
2825
fpr.SpillLockV(dregs[i]);
2826
tempxregs[i] = fpr.VX(dregs[i]);
2827
}
2828
}
2829
for (int i = 0; i < n; ++i) {
2830
if (!fpr.V(sregs[i]).IsSimpleReg(tempxregs[i]))
2831
MOVSS(tempxregs[i], fpr.V(sregs[i]));
2832
MULSS(tempxregs[i], R(XMM0));
2833
}
2834
for (int i = 0; i < n; ++i) {
2835
if (!fpr.V(dregs[i]).IsSimpleReg(tempxregs[i]))
2836
MOVSS(fpr.V(dregs[i]), tempxregs[i]);
2837
}
2838
ApplyPrefixD(dregs, sz);
2839
2840
fpr.ReleaseSpillLocks();
2841
}
2842
2843
void Jit::Comp_Vmmul(MIPSOpcode op) {
2844
CONDITIONAL_DISABLE(VFPU_MTX_VMMUL);
2845
if (!js.HasNoPrefix()) {
2846
DISABLE;
2847
}
2848
2849
if (PSP_CoreParameter().compat.flags().MoreAccurateVMMUL) {
2850
// Fall back to interpreter, which has the accurate implementation.
2851
// Later we might do something more optimized here.
2852
DISABLE;
2853
}
2854
2855
MatrixSize sz = GetMtxSize(op);
2856
VectorSize vsz = GetVectorSize(sz);
2857
int n = GetMatrixSide(sz);
2858
2859
MatrixOverlapType soverlap = GetMatrixOverlap(_VS, _VD, sz);
2860
MatrixOverlapType toverlap = GetMatrixOverlap(_VT, _VD, sz);
2861
// If these overlap, we won't be able to map T as singles.
2862
MatrixOverlapType stoverlap = GetMatrixOverlap(_VS, _VT, sz);
2863
2864
if (jo.enableVFPUSIMD && !soverlap && !toverlap && !stoverlap) {
2865
u8 scols[4], dcols[4], tregs[16];
2866
2867
int vs = _VS;
2868
int vd = _VD;
2869
int vt = _VT;
2870
2871
bool transposeDest = false;
2872
bool transposeS = false;
2873
2874
if ((vd & 0x20) && sz == M_4x4) {
2875
vd ^= 0x20;
2876
transposeDest = true;
2877
}
2878
2879
// Our algorithm needs a transposed S (which is the usual).
2880
if (!(vs & 0x20) && sz == M_4x4) {
2881
vs ^= 0x20;
2882
transposeS = true;
2883
}
2884
2885
// The T matrix we will address individually.
2886
GetMatrixColumns(vd, sz, dcols);
2887
GetMatrixRows(vs, sz, scols);
2888
memset(tregs, 255, sizeof(tregs));
2889
GetMatrixRegs(tregs, sz, vt);
2890
for (int i = 0; i < 16; i++) {
2891
if (tregs[i] != 255)
2892
fpr.StoreFromRegisterV(tregs[i]);
2893
}
2894
2895
u8 scol[4][4];
2896
2897
// Map all of S's columns into registers.
2898
for (int i = 0; i < n; i++) {
2899
if (transposeS){
2900
fpr.StoreFromRegisterV(scols[i]);
2901
}
2902
GetVectorRegs(scol[i], vsz, scols[i]);
2903
fpr.MapRegsVS(scol[i], vsz, 0);
2904
fpr.SpillLockV(scols[i], vsz);
2905
}
2906
2907
// Shorter than manually stuffing the registers. But it feels like there's room for optimization here...
2908
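// (This is essentially the standard 4x4 transpose built from UNPCKLPS/UNPCKHPS, much
// like _MM_TRANSPOSE4_PS, routed through the reg cache with XMM0 as scratch.)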
auto transposeInPlace = [=](u8 col[4][4]) {
2909
MOVAPS(XMM0, fpr.VS(col[0]));
2910
UNPCKLPS(fpr.VSX(col[0]), fpr.VS(col[2]));
2911
UNPCKHPS(XMM0, fpr.VS(col[2]));
2912
2913
MOVAPS(fpr.VSX(col[2]), fpr.VS(col[1]));
2914
UNPCKLPS(fpr.VSX(col[1]), fpr.VS(col[3]));
2915
UNPCKHPS(fpr.VSX(col[2]), fpr.VS(col[3]));
2916
2917
MOVAPS(fpr.VSX(col[3]), fpr.VS(col[0]));
2918
UNPCKLPS(fpr.VSX(col[0]), fpr.VS(col[1]));
2919
UNPCKHPS(fpr.VSX(col[3]), fpr.VS(col[1]));
2920
2921
MOVAPS(fpr.VSX(col[1]), R(XMM0));
2922
UNPCKLPS(fpr.VSX(col[1]), fpr.VS(col[2]));
2923
UNPCKHPS(XMM0, fpr.VS(col[2]));
2924
2925
MOVAPS(fpr.VSX(col[2]), fpr.VS(col[1]));
2926
MOVAPS(fpr.VSX(col[1]), fpr.VS(col[3]));
2927
MOVAPS(fpr.VSX(col[3]), R(XMM0));
2928
};
2929
2930
// Some games pass in S as an E matrix (transposed). Let's just transpose the data before we do the multiplication instead.
2931
// This is shorter than trying to combine a discontinuous matrix with lots of shufps.
2932
if (transposeS) {
2933
transposeInPlace(scol);
2934
}
2935
2936
// Now, work our way through the matrix, loading things as we go.
2937
// TODO: With more temp registers, can generate much more efficient code.
2938
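// Each destination column is a linear combination of S's columns:
//   dcol[i] = sum over j of tregs[4*i + j] * scol[j]
// built by broadcasting each T element (SHUFPS 0,0,0,0) and accumulating with
// MULPS/ADDPS.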
for (int i = 0; i < n; i++) {
2939
MOVSS(XMM1, fpr.V(tregs[4 * i])); // TODO: AVX broadcastss to replace this and the SHUFPS
2940
MOVSS(XMM0, fpr.V(tregs[4 * i + 1]));
2941
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0));
2942
SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0));
2943
MULPS(XMM1, fpr.VS(scol[0]));
2944
MULPS(XMM0, fpr.VS(scol[1]));
2945
ADDPS(XMM1, R(XMM0));
2946
for (int j = 2; j < n; j++) {
2947
MOVSS(XMM0, fpr.V(tregs[4 * i + j]));
2948
SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0));
2949
MULPS(XMM0, fpr.VS(scol[j]));
2950
ADDPS(XMM1, R(XMM0));
2951
}
2952
// Map the D column.
2953
u8 dcol[4];
2954
GetVectorRegs(dcol, vsz, dcols[i]);
2955
#if !PPSSPP_ARCH(AMD64)
2956
fpr.MapRegsVS(dcol, vsz, MAP_DIRTY | MAP_NOINIT | MAP_NOLOCK);
2957
#else
2958
fpr.MapRegsVS(dcol, vsz, MAP_DIRTY | MAP_NOINIT);
2959
#endif
2960
MOVAPS(fpr.VS(dcol), XMM1);
2961
}
2962
if (transposeS){
2963
for (int i = 0; i < n; i++){
2964
fpr.DiscardVS(scols[i]);
2965
}
2966
}
2967
2968
#if !PPSSPP_ARCH(AMD64)
2969
fpr.ReleaseSpillLocks();
2970
#endif
2971
if (transposeDest) {
2972
u8 dcol[4][4];
2973
for (int i = 0; i < n; i++) {
2974
GetVectorRegs(dcol[i], vsz, dcols[i]);
2975
fpr.MapRegsVS(dcol[i], vsz, MAP_DIRTY);
2976
}
2977
transposeInPlace(dcol);
2978
}
2979
fpr.ReleaseSpillLocks();
2980
return;
2981
}
2982
2983
u8 sregs[16], tregs[16], dregs[16];
2984
GetMatrixRegs(sregs, sz, _VS);
2985
GetMatrixRegs(tregs, sz, _VT);
2986
GetMatrixRegs(dregs, sz, _VD);
2987
2988
// Flush SIMD.
2989
fpr.SimpleRegsV(sregs, sz, 0);
2990
fpr.SimpleRegsV(tregs, sz, 0);
2991
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
2992
2993
// Rough overlap check.
2994
bool overlap = false;
2995
if (GetMtx(_VS) == GetMtx(_VD) || GetMtx(_VT) == GetMtx(_VD)) {
2996
// Potential overlap (guaranteed for 3x3 or more).
2997
overlap = true;
2998
}
2999
3000
if (overlap) {
3001
u8 tempregs[16];
3002
for (int a = 0; a < n; a++) {
3003
for (int b = 0; b < n; b++) {
3004
MOVSS(XMM0, fpr.V(sregs[b * 4]));
3005
MULSS(XMM0, fpr.V(tregs[a * 4]));
3006
for (int c = 1; c < n; c++) {
3007
MOVSS(XMM1, fpr.V(sregs[b * 4 + c]));
3008
MULSS(XMM1, fpr.V(tregs[a * 4 + c]));
3009
ADDSS(XMM0, R(XMM1));
3010
}
3011
u8 temp = (u8) fpr.GetTempV();
3012
fpr.MapRegV(temp, MAP_NOINIT | MAP_DIRTY);
3013
MOVSS(fpr.VX(temp), R(XMM0));
3014
fpr.StoreFromRegisterV(temp);
3015
tempregs[a * 4 + b] = temp;
3016
}
3017
}
3018
for (int a = 0; a < n; a++) {
3019
for (int b = 0; b < n; b++) {
3020
u8 temp = tempregs[a * 4 + b];
3021
fpr.MapRegV(temp, 0);
3022
MOVSS(fpr.V(dregs[a * 4 + b]), fpr.VX(temp));
3023
}
3024
}
3025
} else {
3026
for (int a = 0; a < n; a++) {
3027
for (int b = 0; b < n; b++) {
3028
MOVSS(XMM0, fpr.V(sregs[b * 4]));
3029
MULSS(XMM0, fpr.V(tregs[a * 4]));
3030
for (int c = 1; c < n; c++) {
3031
MOVSS(XMM1, fpr.V(sregs[b * 4 + c]));
3032
MULSS(XMM1, fpr.V(tregs[a * 4 + c]));
3033
ADDSS(XMM0, R(XMM1));
3034
}
3035
MOVSS(fpr.V(dregs[a * 4 + b]), XMM0);
3036
}
3037
}
3038
}
3039
fpr.ReleaseSpillLocks();
3040
}
3041
3042
void Jit::Comp_Vmscl(MIPSOpcode op) {
3043
CONDITIONAL_DISABLE(VFPU_MTX_VMSCL);
3044
3045
// TODO: This op probably ignores prefixes?
3046
if (js.HasUnknownPrefix())
3047
DISABLE;
3048
3049
MatrixSize sz = GetMtxSize(op);
3050
int n = GetMatrixSide(sz);
3051
3052
u8 sregs[16], dregs[16], scale;
3053
GetMatrixRegs(sregs, sz, _VS);
3054
GetVectorRegs(&scale, V_Single, _VT);
3055
GetMatrixRegs(dregs, sz, _VD);
3056
3057
// Flush SIMD.
3058
fpr.SimpleRegsV(sregs, sz, 0);
3059
fpr.SimpleRegsV(&scale, V_Single, 0);
3060
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
3061
3062
// Move to XMM0 early, so we don't have to worry about overlap with scale.
3063
MOVSS(XMM0, fpr.V(scale));
3064
3065
// TODO: test overlap, optimize.
3066
u8 tempregs[16];
3067
for (int a = 0; a < n; a++) {
3068
for (int b = 0; b < n; b++) {
3069
u8 temp = (u8) fpr.GetTempV();
3070
fpr.MapRegV(temp, MAP_NOINIT | MAP_DIRTY);
3071
MOVSS(fpr.VX(temp), fpr.V(sregs[a * 4 + b]));
3072
MULSS(fpr.VX(temp), R(XMM0));
3073
fpr.StoreFromRegisterV(temp);
3074
tempregs[a * 4 + b] = temp;
3075
}
3076
}
3077
for (int a = 0; a < n; a++) {
3078
for (int b = 0; b < n; b++) {
3079
u8 temp = tempregs[a * 4 + b];
3080
fpr.MapRegV(temp, 0);
3081
MOVSS(fpr.V(dregs[a * 4 + b]), fpr.VX(temp));
3082
}
3083
}
3084
3085
fpr.ReleaseSpillLocks();
3086
}
3087
3088
void Jit::Comp_Vtfm(MIPSOpcode op) {
3089
CONDITIONAL_DISABLE(VFPU_MTX_VTFM);
3090
3091
// TODO: This probably ignores prefixes? Or maybe uses D?
3092
if (js.HasUnknownPrefix())
3093
DISABLE;
3094
3095
VectorSize sz = GetVecSize(op);
3096
MatrixSize msz = GetMtxSize(op);
3097
int n = GetNumVectorElements(sz);
3098
int ins = (op >> 23) & 7;
3099
3100
bool homogenous = false;
3101
if (n == ins) {
3102
n++;
3103
sz = (VectorSize)((int)(sz)+1);
3104
msz = (MatrixSize)((int)(msz)+1);
3105
homogenous = true;
3106
}
3107
// Otherwise, n should already be ins + 1.
3108
else if (n != ins + 1) {
3109
DISABLE;
3110
}
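// "Homogenous" (the vhtfm variants) means the last element of the input vector is an
// implicit 1.0, so below the final matrix term is added without a multiply.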
3111
3112
if (jo.enableVFPUSIMD) {
3113
u8 scols[4], dcol[4], tregs[4];
3114
3115
int vs = _VS;
3116
int vd = _VD;
3117
int vt = _VT; // vector!
3118
3119
// The T matrix we will address individually.
3120
GetVectorRegs(dcol, sz, vd);
3121
GetMatrixRows(vs, msz, scols);
3122
GetVectorRegs(tregs, sz, vt);
3123
for (int i = 0; i < n; i++) {
3124
fpr.StoreFromRegisterV(tregs[i]);
3125
}
3126
3127
// We need the T regs in individual regs, but they could overlap with S regs.
3128
// If that happens, we copy the T reg to a temp.
3129
auto flushConflictingTRegsToTemps = [&](u8 regs[4]) {
3130
for (int i = 0; i < n; ++i) {
3131
for (int j = 0; j < n; ++j) {
3132
if (regs[i] != tregs[j]) {
3133
continue;
3134
}
3135
3136
// They match. Let's replace this treg with a temp reg.
3137
// Note that it will spill if there's contention, unfortunately...
3138
tregs[j] = fpr.GetTempV();
3139
fpr.MapRegV(tregs[j], MAP_NOINIT);
3140
MOVSS(fpr.VX(tregs[j]), fpr.V(regs[i]));
3141
}
3142
}
3143
};
3144
3145
u8 scol[4][4];
3146
3147
// Map all of S's columns into registers.
3148
for (int i = 0; i < n; i++) {
3149
GetVectorRegs(scol[i], sz, scols[i]);
3150
flushConflictingTRegsToTemps(scol[i]);
3151
fpr.MapRegsVS(scol[i], sz, 0);
3152
}
3153
3154
// Now, work our way through the matrix, loading things as we go.
3155
// TODO: With more temp registers, can generate much more efficient code.
3156
MOVSS(XMM1, fpr.V(tregs[0])); // TODO: AVX broadcastss to replace this and the SHUFPS (but take care of temps, unless we force store them.)
3157
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0));
3158
MULPS(XMM1, fpr.VS(scol[0]));
3159
for (int j = 1; j < n; j++) {
3160
if (!homogenous || j != n - 1) {
3161
MOVSS(XMM0, fpr.V(tregs[j]));
3162
SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0));
3163
MULPS(XMM0, fpr.VS(scol[j]));
3164
ADDPS(XMM1, R(XMM0));
3165
} else {
3166
ADDPS(XMM1, fpr.VS(scol[j]));
3167
}
3168
}
3169
// Map the D column. Release first in case of overlap.
3170
for (int i = 0; i < n; i++) {
3171
fpr.ReleaseSpillLockV(scol[i], sz);
3172
}
3173
fpr.MapRegsVS(dcol, sz, MAP_DIRTY | MAP_NOINIT);
3174
MOVAPS(fpr.VS(dcol), XMM1);
3175
fpr.ReleaseSpillLocks();
3176
return;
3177
}
3178
3179
u8 sregs[16], dregs[4], tregs[4];
3180
GetMatrixRegs(sregs, msz, _VS);
3181
GetVectorRegs(tregs, sz, _VT);
3182
GetVectorRegs(dregs, sz, _VD);
3183
3184
// Flush SIMD.
3185
fpr.SimpleRegsV(sregs, msz, 0);
3186
fpr.SimpleRegsV(tregs, sz, 0);
3187
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
3188
3189
// TODO: test overlap, optimize.
3190
u8 tempregs[4];
3191
for (int i = 0; i < n; i++) {
3192
MOVSS(XMM0, fpr.V(sregs[i * 4]));
3193
MULSS(XMM0, fpr.V(tregs[0]));
3194
for (int k = 1; k < n; k++)
3195
{
3196
MOVSS(XMM1, fpr.V(sregs[i * 4 + k]));
3197
if (!homogenous || k != n - 1)
3198
MULSS(XMM1, fpr.V(tregs[k]));
3199
ADDSS(XMM0, R(XMM1));
3200
}
3201
3202
u8 temp = (u8) fpr.GetTempV();
3203
fpr.MapRegV(temp, MAP_NOINIT | MAP_DIRTY);
3204
MOVSS(fpr.VX(temp), R(XMM0));
3205
fpr.StoreFromRegisterV(temp);
3206
tempregs[i] = temp;
3207
}
3208
for (int i = 0; i < n; i++) {
3209
u8 temp = tempregs[i];
3210
fpr.MapRegV(temp, 0);
3211
MOVSS(fpr.V(dregs[i]), fpr.VX(temp));
3212
}
3213
3214
fpr.ReleaseSpillLocks();
3215
}
3216
3217
void Jit::Comp_VCrs(MIPSOpcode op) {
3218
DISABLE;
3219
}
3220
3221
void Jit::Comp_VDet(MIPSOpcode op) {
3222
DISABLE;
3223
}
3224
3225
// The goal is to map (reversed byte order for clarity):
3226
// 000000AA 000000BB 000000CC 000000DD -> AABBCCDD
3227
alignas(16) static const s8 vi2xc_shuffle[16] = { 3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
3228
// 0000AAAA 0000BBBB 0000CCCC 0000DDDD -> AAAABBBB CCCCDDDD
3229
alignas(16) static const s8 vi2xs_shuffle[16] = { 2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 };
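// Example (vi2c on a quad): inputs 0x12000000, 0x34000000, 0x56000000, 0x78000000 pack
// into the single word 0x78563412: each lane contributes its top byte (top 16 bits for
// vi2s, per the second shuffle).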
3230
3231
void Jit::Comp_Vi2x(MIPSOpcode op) {
3232
CONDITIONAL_DISABLE(VFPU_VEC);
3233
if (js.HasUnknownPrefix())
3234
DISABLE;
3235
3236
int bits = ((op >> 16) & 2) == 0 ? 8 : 16; // vi2uc/vi2c (0/1), vi2us/vi2s (2/3)
3237
bool unsignedOp = ((op >> 16) & 1) == 0; // vi2uc (0), vi2us (2)
3238
3239
// These instructions pack pairs or quads of integers into 32 bits.
3240
// The unsigned (u) versions skip the sign bit when packing.
3241
3242
VectorSize sz = GetVecSize(op);
3243
VectorSize outsize;
3244
if (bits == 8) {
3245
outsize = V_Single;
3246
if (sz != V_Quad) {
3247
DISABLE;
3248
}
3249
} else {
3250
switch (sz) {
3251
case V_Pair:
3252
outsize = V_Single;
3253
break;
3254
case V_Quad:
3255
outsize = V_Pair;
3256
break;
3257
default:
3258
DISABLE;
3259
}
3260
}
3261
3262
u8 sregs[4], dregs[4];
3263
GetVectorRegsPrefixS(sregs, sz, _VS);
3264
GetVectorRegsPrefixD(dregs, outsize, _VD);
3265
3266
// Flush SIMD.
3267
fpr.SimpleRegsV(sregs, sz, 0);
3268
fpr.SimpleRegsV(dregs, outsize, MAP_NOINIT | MAP_DIRTY);
3269
3270
// First, let's assemble the sregs into lanes of a single xmm reg.
3271
// For quad inputs, we need somewhere for the bottom regs. Ideally dregs[0].
3272
X64Reg dst0 = XMM0;
3273
if (sz == V_Quad) {
3274
int vreg = dregs[0];
3275
if (!IsOverlapSafeAllowS(dregs[0], 0, 4, sregs)) {
3276
// Will be discarded on release.
3277
vreg = fpr.GetTempV();
3278
}
3279
fpr.MapRegV(vreg, vreg == sregs[0] ? MAP_DIRTY : MAP_NOINIT);
3280
fpr.SpillLockV(vreg);
3281
dst0 = fpr.VX(vreg);
3282
} else {
3283
// Pair, let's check if we should use dregs[0] directly. No temp needed.
3284
int vreg = dregs[0];
3285
if (IsOverlapSafeAllowS(dregs[0], 0, 2, sregs)) {
3286
fpr.MapRegV(vreg, vreg == sregs[0] ? MAP_DIRTY : MAP_NOINIT);
3287
fpr.SpillLockV(vreg);
3288
dst0 = fpr.VX(vreg);
3289
}
3290
}
3291
3292
if (!fpr.V(sregs[0]).IsSimpleReg(dst0)) {
3293
MOVSS(dst0, fpr.V(sregs[0]));
3294
}
3295
MOVSS(XMM1, fpr.V(sregs[1]));
3296
// With this, we have the lower half in dst0.
3297
PUNPCKLDQ(dst0, R(XMM1));
3298
if (sz == V_Quad) {
3299
MOVSS(XMM0, fpr.V(sregs[2]));
3300
MOVSS(XMM1, fpr.V(sregs[3]));
3301
PUNPCKLDQ(XMM0, R(XMM1));
3302
// Now we need to combine XMM0 into dst0.
3303
PUNPCKLQDQ(dst0, R(XMM0));
3304
} else {
3305
// Otherwise, we need to zero out the top 2.
3306
// We expect XMM1 to be zero below.
3307
PXOR(XMM1, R(XMM1));
3308
PUNPCKLQDQ(dst0, R(XMM1));
3309
}
3310
3311
// For "u" type ops, we clamp to zero and shift off the sign bit first.
3312
if (unsignedOp) {
3313
if (cpu_info.bSSE4_1) {
3314
if (sz == V_Quad) {
3315
// Zeroed in the other case above.
3316
PXOR(XMM1, R(XMM1));
3317
}
3318
PMAXSD(dst0, R(XMM1));
3319
PSLLD(dst0, 1);
3320
} else {
3321
// Get a mask of the sign bit in dst0, then AND in the shifted values. This clamps negatives to 0.
3322
MOVDQA(XMM1, R(dst0));
3323
PSRAD(dst0, 31);
3324
PSLLD(XMM1, 1);
3325
PANDN(dst0, R(XMM1));
3326
}
3327
}
3328
3329
// At this point, everything is aligned in the high bits of our lanes.
3330
if (cpu_info.bSSSE3) {
3331
if (RipAccessible(vi2xc_shuffle)) {
3332
PSHUFB(dst0, bits == 8 ? M(vi2xc_shuffle) : M(vi2xs_shuffle)); // rip accessible
3333
} else {
3334
MOV(PTRBITS, R(TEMPREG), bits == 8 ? ImmPtr(vi2xc_shuffle) : ImmPtr(vi2xs_shuffle));
3335
PSHUFB(dst0, MatR(TEMPREG));
3336
}
3337
} else {
3338
// Let's *arithmetically* shift in the sign so we can use saturating packs.
3339
PSRAD(dst0, 32 - bits);
3340
// XMM1 used for the high part just so there's no dependency. It contains garbage or 0.
3341
PACKSSDW(dst0, R(XMM1));
3342
if (bits == 8) {
3343
PACKSSWB(dst0, R(XMM1));
3344
}
3345
}
3346
3347
if (!fpr.V(dregs[0]).IsSimpleReg(dst0)) {
3348
MOVSS(fpr.V(dregs[0]), dst0);
3349
}
3350
if (outsize == V_Pair) {
3351
fpr.MapRegV(dregs[1], MAP_NOINIT | MAP_DIRTY);
3352
MOVDQA(fpr.V(dregs[1]), dst0);
3353
// Shift out the lower result to get the result we want.
3354
PSRLDQ(fpr.VX(dregs[1]), 4);
3355
}
3356
3357
ApplyPrefixD(dregs, outsize);
3358
fpr.ReleaseSpillLocks();
3359
}
3360
3361
alignas(16) static const float vavg_table[4] = { 1.0f, 1.0f / 2.0f, 1.0f / 3.0f, 1.0f / 4.0f };
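// vfad just sums the lanes; vavg also multiplies by 1/n, hence this reciprocal table
// indexed by n - 1.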
3362
3363
void Jit::Comp_Vhoriz(MIPSOpcode op) {
3364
CONDITIONAL_DISABLE(VFPU_VEC);
3365
3366
if (js.HasUnknownPrefix())
3367
DISABLE;
3368
3369
VectorSize sz = GetVecSize(op);
3370
int n = GetNumVectorElements(sz);
3371
3372
u8 sregs[4], dregs[1];
3373
GetVectorRegsPrefixS(sregs, sz, _VS);
3374
GetVectorRegsPrefixD(dregs, V_Single, _VD);
3375
if (fpr.TryMapDirtyInVS(dregs, V_Single, sregs, sz)) {
3376
if (cpu_info.bSSE4_1) {
3377
MOV(PTRBITS, R(TEMPREG), ImmPtr(&oneOneOneOne));
3378
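// DPPS against a vector of ones is a horizontal sum: the high nibble of the immediate
// selects which source lanes participate (0x3_ = two, 0x7_ = three, 0xF_ = all four)
// and the low nibble (_1) writes the sum to lane 0 only.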
switch (sz) {
3379
case V_Pair:
3380
MOVAPS(XMM0, fpr.VS(sregs));
3381
DPPS(XMM0, MatR(TEMPREG), 0x31);
3382
MOVAPS(fpr.VSX(dregs), R(XMM0));
3383
break;
3384
case V_Triple:
3385
MOVAPS(XMM0, fpr.VS(sregs));
3386
DPPS(XMM0, MatR(TEMPREG), 0x71);
3387
MOVAPS(fpr.VSX(dregs), R(XMM0));
3388
break;
3389
case V_Quad:
3390
XORPS(XMM1, R(XMM1));
3391
MOVAPS(XMM0, fpr.VS(sregs));
3392
DPPS(XMM0, MatR(TEMPREG), 0xF1);
3393
// In every other case, +0.0 is selected by the mask and added.
3394
// But, here we need to manually add it to the result.
3395
ADDPS(XMM0, R(XMM1));
3396
MOVAPS(fpr.VSX(dregs), R(XMM0));
3397
break;
3398
default:
3399
DISABLE;
3400
}
3401
} else {
3402
switch (sz) {
3403
case V_Pair:
3404
XORPS(XMM1, R(XMM1));
3405
MOVAPS(XMM0, fpr.VS(sregs));
3406
ADDPS(XMM1, R(XMM0));
3407
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3, 2, 1, 1));
3408
ADDPS(XMM0, R(XMM1));
3409
MOVAPS(fpr.VSX(dregs), R(XMM0));
3410
break;
3411
case V_Triple:
3412
XORPS(XMM1, R(XMM1));
3413
MOVAPS(XMM0, fpr.VS(sregs));
3414
ADDPS(XMM1, R(XMM0));
3415
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3, 2, 1, 1));
3416
ADDPS(XMM0, R(XMM1));
3417
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3, 2, 1, 2));
3418
ADDPS(XMM0, R(XMM1));
3419
MOVAPS(fpr.VSX(dregs), R(XMM0));
3420
break;
3421
case V_Quad:
3422
XORPS(XMM1, R(XMM1));
3423
MOVAPS(XMM0, fpr.VS(sregs));
3424
// This flips the sign of any -0.000.
3425
ADDPS(XMM0, R(XMM1));
3426
MOVHLPS(XMM1, XMM0);
3427
ADDPS(XMM0, R(XMM1));
3428
MOVAPS(XMM1, R(XMM0));
3429
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(1, 1, 1, 1));
3430
ADDPS(XMM0, R(XMM1));
3431
MOVAPS(fpr.VSX(dregs), R(XMM0));
3432
break;
3433
default:
3434
DISABLE;
3435
}
3436
}
3437
if (((op >> 16) & 31) == 7) { // vavg
3438
MOV(PTRBITS, R(TEMPREG), ImmPtr(&vavg_table[n - 1]));
3439
MULSS(fpr.VSX(dregs), MatR(TEMPREG));
3440
}
3441
ApplyPrefixD(dregs, V_Single);
3442
fpr.ReleaseSpillLocks();
3443
return;
3444
}
3445
3446
// Flush SIMD.
3447
fpr.SimpleRegsV(sregs, sz, 0);
3448
fpr.SimpleRegsV(dregs, V_Single, MAP_NOINIT | MAP_DIRTY);
3449
3450
X64Reg reg = XMM0;
3451
if (IsOverlapSafe(dregs[0], 0, n, sregs)) {
3452
fpr.MapRegV(dregs[0], dregs[0] == sregs[0] ? MAP_DIRTY : MAP_NOINIT);
3453
fpr.SpillLockV(dregs[0]);
3454
reg = fpr.VX(dregs[0]);
3455
}
3456
3457
// We have to start at +0.000 in case any values are -0.000.
3458
XORPS(reg, R(reg));
3459
for (int i = 0; i < n; ++i) {
3460
ADDSS(reg, fpr.V(sregs[i]));
3461
}
3462
3463
switch ((op >> 16) & 31) {
3464
case 6: // vfad
3465
break;
3466
case 7: // vavg
3467
MOV(PTRBITS, R(TEMPREG), ImmPtr(&vavg_table[n - 1]));
3468
MULSS(reg, MatR(TEMPREG));
3469
break;
3470
}
3471
3472
if (reg == XMM0) {
3473
MOVSS(fpr.V(dregs[0]), XMM0);
3474
}
3475
3476
ApplyPrefixD(dregs, V_Single);
3477
fpr.ReleaseSpillLocks();
3478
}
3479
3480
void Jit::Comp_Viim(MIPSOpcode op) {
3481
CONDITIONAL_DISABLE(VFPU_XFER);
3482
3483
if (js.HasUnknownPrefix())
3484
DISABLE;
3485
3486
u8 dreg;
3487
GetVectorRegs(&dreg, V_Single, _VT);
3488
3489
// Flush SIMD.
3490
fpr.SimpleRegsV(&dreg, V_Single, MAP_NOINIT | MAP_DIRTY);
3491
3492
s32 imm = SignExtend16ToS32(op);
3493
FP32 fp;
3494
fp.f = (float)imm;
3495
MOV(32, R(TEMPREG), Imm32(fp.u));
3496
fpr.MapRegV(dreg, MAP_DIRTY | MAP_NOINIT);
3497
MOVD_xmm(fpr.VX(dreg), R(TEMPREG));
3498
3499
ApplyPrefixD(&dreg, V_Single);
3500
fpr.ReleaseSpillLocks();
3501
}
3502
3503
void Jit::Comp_Vfim(MIPSOpcode op) {
3504
CONDITIONAL_DISABLE(VFPU_XFER);
3505
3506
if (js.HasUnknownPrefix())
3507
DISABLE;
3508
3509
u8 dreg;
3510
GetVectorRegs(&dreg, V_Single, _VT);
3511
3512
// Flush SIMD.
3513
fpr.SimpleRegsV(&dreg, V_Single, MAP_NOINIT | MAP_DIRTY);
3514
3515
FP16 half;
3516
half.u = op & 0xFFFF;
3517
FP32 fval = half_to_float_fast5(half);
3518
MOV(32, R(TEMPREG), Imm32(fval.u));
3519
fpr.MapRegV(dreg, MAP_DIRTY | MAP_NOINIT);
3520
MOVD_xmm(fpr.VX(dreg), R(TEMPREG));
3521
3522
ApplyPrefixD(&dreg, V_Single);
3523
fpr.ReleaseSpillLocks();
3524
}
3525
3526
void Jit::CompVrotShuffle(u8 *dregs, int imm, int n, bool negSin) {
3527
char what[4] = { '0', '0', '0', '0' };
3528
if (((imm >> 2) & 3) == (imm & 3)) {
3529
for (int i = 0; i < 4; i++)
3530
what[i] = 'S';
3531
}
3532
what[(imm >> 2) & 3] = 'S';
3533
what[imm & 3] = 'C';
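// So: imm bits 2-3 pick the lane that receives sin, bits 0-1 the lane that receives
// cos, remaining lanes are zero; if the two coincide, every lane gets sin except the
// cos lane. Bit 4 (negate the sine) is handled by the caller via negSin.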
3534
3535
// TODO: shufps SIMD version
3536
3537
for (int i = 0; i < n; i++) {
3538
fpr.MapRegV(dregs[i], MAP_DIRTY | MAP_NOINIT);
3539
switch (what[i]) {
3540
case 'C': MOVSS(fpr.V(dregs[i]), XMM1); break;
3541
case 'S':
3542
MOVSS(fpr.V(dregs[i]), XMM0);
3543
if (negSin) {
3544
if (RipAccessible(&signBitLower)) {
3545
XORPS(fpr.VX(dregs[i]), M(&signBitLower)); // rip accessible
3546
} else {
3547
MOV(PTRBITS, R(TEMPREG), ImmPtr(&signBitLower));
3548
XORPS(fpr.VX(dregs[i]), MatR(TEMPREG));
3549
}
3550
}
3551
break;
3552
case '0':
3553
{
3554
XORPS(fpr.VX(dregs[i]), fpr.V(dregs[i]));
3555
break;
3556
}
3557
default:
3558
ERROR_LOG(Log::JIT, "Bad what in vrot");
3559
break;
3560
}
3561
}
3562
}
3563
3564
// Very heavily used by FF:CC
3565
void Jit::Comp_VRot(MIPSOpcode op) {
3566
CONDITIONAL_DISABLE(VFPU_VEC);
3567
if (js.HasUnknownPrefix()) {
3568
DISABLE;
3569
}
3570
if (!js.HasNoPrefix()) {
3571
// Prefixes work strangely for this, see IRCompVFPU.
3572
WARN_LOG_REPORT(Log::JIT, "vrot instruction using prefixes at %08x", GetCompilerPC());
3573
DISABLE;
3574
}
3575
3576
int vd = _VD;
3577
int vs = _VS;
3578
3579
VectorSize sz = GetVecSize(op);
3580
int n = GetNumVectorElements(sz);
3581
3582
u8 dregs[4];
3583
u8 dregs2[4];
3584
3585
MIPSOpcode nextOp = GetOffsetInstruction(1);
3586
int vd2 = -1;
3587
int imm2 = -1;
3588
if ((nextOp >> 26) == 60 && ((nextOp >> 21) & 0x1F) == 29 && _VS == MIPS_GET_VS(nextOp)) {
3589
// Pair of vrot with the same angle argument. Let's join them (can share sin/cos results).
3590
vd2 = MIPS_GET_VD(nextOp);
3591
imm2 = (nextOp >> 16) & 0x1f;
3592
// NOTICE_LOG(Log::JIT, "Joint VFPU at %08x", js.blockStart);
3593
}
3594
3595
u8 sreg;
3596
GetVectorRegs(dregs, sz, vd);
3597
if (vd2 >= 0)
3598
GetVectorRegs(dregs2, sz, vd2);
3599
GetVectorRegs(&sreg, V_Single, vs);
3600
3601
// Flush SIMD.
3602
fpr.SimpleRegsV(&sreg, V_Single, 0);
3603
3604
int imm = (op >> 16) & 0x1f;
3605
3606
gpr.FlushBeforeCall();
3607
fpr.Flush();
3608
3609
bool negSin1 = (imm & 0x10) ? true : false;
3610
3611
#if PPSSPP_ARCH(AMD64)
3612
#ifdef _WIN32
3613
LEA(64, RDX, MIPSSTATE_VAR(sincostemp));
3614
#else
3615
LEA(64, RDI, MIPSSTATE_VAR(sincostemp));
3616
#endif
3617
MOVSS(XMM0, fpr.V(sreg));
3618
ABI_CallFunction(negSin1 ? (const void *)&SinCosNegSin : (const void *)&SinCos);
3619
#else
3620
// Sigh, passing floats with cdecl isn't pretty, ends up on the stack.
3621
ABI_CallFunctionAC(negSin1 ? (const void *)&SinCosNegSin : (const void *)&SinCos, fpr.V(sreg), (uintptr_t)mips_->sincostemp);
3622
#endif
3623
3624
MOVSS(XMM0, MIPSSTATE_VAR(sincostemp[0]));
3625
MOVSS(XMM1, MIPSSTATE_VAR(sincostemp[1]));
3626
3627
CompVrotShuffle(dregs, imm, n, false);
3628
if (vd2 != -1) {
3629
// If the negsin setting differs between the two joint invocations, we need to flip the second one.
3630
bool negSin2 = (imm2 & 0x10) ? true : false;
3631
CompVrotShuffle(dregs2, imm2, n, negSin1 != negSin2);
3632
EatInstruction(nextOp);
3633
}
3634
fpr.ReleaseSpillLocks();
3635
}
3636
3637
void Jit::Comp_ColorConv(MIPSOpcode op) {
3638
CONDITIONAL_DISABLE(VFPU_VEC);
3639
if (js.HasUnknownPrefix())
3640
DISABLE;
3641
3642
int vd = _VD;
3643
int vs = _VS;
3644
3645
DISABLE;
3646
#if 0
3647
VectorSize sz = V_Quad;
3648
int n = GetNumVectorElements(sz);
3649
3650
switch ((op >> 16) & 3) {
3651
case 1:
3652
break;
3653
default:
3654
DISABLE;
3655
}
3656
3657
u8 sregs[4];
3658
u8 dregs[1];
3659
// WARNING: Prefixes.
3660
GetVectorRegs(sregs, sz, vs);
3661
GetVectorRegs(dregs, V_Pair, vd);
3662
3663
if (fpr.TryMapDirtyInVS(dregs, V_Single, sregs, sz)) {
3664
switch ((op >> 16) & 3) {
3665
case 1: // 4444
3666
{
3667
//int a = ((in >> 24) & 0xFF) >> 4;
3668
//int b = ((in >> 16) & 0xFF) >> 4;
3669
//int g = ((in >> 8) & 0xFF) >> 4;
3670
//int r = ((in)& 0xFF) >> 4;
3671
//col = (a << 12) | (b << 8) | (g << 4) | (r);
3672
//PACKUSW
3673
break;
3674
}
3675
case 2: // 5551
3676
{
3677
//int a = ((in >> 24) & 0xFF) >> 7;
3678
//int b = ((in >> 16) & 0xFF) >> 3;
3679
//int g = ((in >> 8) & 0xFF) >> 3;
3680
//int r = ((in)& 0xFF) >> 3;
3681
//col = (a << 15) | (b << 10) | (g << 5) | (r);
3682
break;
3683
}
3684
case 3: // 565
3685
{
3686
//int b = ((in >> 16) & 0xFF) >> 3;
3687
//int g = ((in >> 8) & 0xFF) >> 2;
3688
//int r = ((in)& 0xFF) >> 3;
3689
//col = (b << 11) | (g << 5) | (r);
3690
break;
3691
}
3692
}
3693
DISABLE;
3694
3695
// Flush SIMD.
3696
fpr.SimpleRegsV(&sreg, V_Pair, MAP_NOINIT | MAP_DIRTY);
3697
fpr.SimpleRegsV(&dreg, V_Pair, MAP_NOINIT | MAP_DIRTY);
3698
#endif
3699
3700
}
3701
}
3702
3703
#endif // PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
3704
3705