GitHub Repository: hrydgard/ppsspp
Path: blob/master/Core/MIPS/x86/CompVFPU.cpp
1
// Copyright (c) 2012- PPSSPP Project.
2
3
// This program is free software: you can redistribute it and/or modify
4
// it under the terms of the GNU General Public License as published by
5
// the Free Software Foundation, version 2.0 or later versions.
6
7
// This program is distributed in the hope that it will be useful,
8
// but WITHOUT ANY WARRANTY; without even the implied warranty of
9
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
// GNU General Public License 2.0 for more details.
11
12
// A copy of the GPL 2.0 should have been included with the program.
13
// If not, see http://www.gnu.org/licenses/
14
15
// Official git repository and contact information can be found at
16
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17
18
// Table 13.10 in http://agner.org/optimize/optimizing_assembly.pdf is cool - generate constants with
19
// short instruction sequences. Surprisingly many are possible.
20
21
#include "ppsspp_config.h"
22
23
#if PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
24
#include <cmath>
25
#include <limits>
26
#include "Common/Math/math_util.h"
27
28
#include "Common/CPUDetect.h"
29
#include "Common/Math/SIMDHeaders.h"
30
#include "Common/Log.h"
31
#include "Core/Compatibility.h"
32
#include "Core/Config.h"
33
#include "Core/MemMap.h"
34
#include "Core/Reporting.h"
35
#include "Core/System.h"
36
#include "Core/MIPS/MIPSAnalyst.h"
37
#include "Core/MIPS/MIPSCodeUtils.h"
38
#include "Core/MIPS/MIPSVFPUUtils.h"
39
#include "Core/MIPS/x86/Jit.h"
40
#include "Core/MIPS/x86/RegCache.h"
41
42
// All functions should have CONDITIONAL_DISABLE, so we can narrow things down to a file quickly.
43
// Currently known non-working ones should have DISABLE.
44
45
// #define CONDITIONAL_DISABLE { fpr.ReleaseSpillLocks(); Comp_Generic(op); return; }
46
#define CONDITIONAL_DISABLE(flag) if (jo.Disabled(JitDisable::flag)) { Comp_Generic(op); return; }
47
#define DISABLE { fpr.ReleaseSpillLocks(); Comp_Generic(op); return; }
48
49
#define _RS MIPS_GET_RS(op)
50
#define _RT MIPS_GET_RT(op)
51
#define _RD MIPS_GET_RD(op)
52
#define _FS MIPS_GET_FS(op)
53
#define _FT MIPS_GET_FT(op)
54
#define _FD MIPS_GET_FD(op)
55
#define _SA MIPS_GET_SA(op)
56
#define _POS ((op>> 6) & 0x1F)
57
#define _SIZE ((op>>11) & 0x1F)
58
#define _IMM16 (signed short)(op & 0xFFFF)
59
#define _IMM26 (op & 0x03FFFFFF)
60
61
namespace MIPSComp
62
{
63
using namespace Gen;
64
using namespace X64JitConstants;
65
66
static const float one = 1.0f;
67
static const float minus_one = -1.0f;
68
69
alignas(16) const u32 noSignMask[4] = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
70
alignas(16) const u32 signBitAll[4] = {0x80000000, 0x80000000, 0x80000000, 0x80000000};
71
alignas(16) const u32 signBitLower[4] = {0x80000000, 0, 0, 0};
72
alignas(16) const float oneOneOneOne[4] = {1.0f, 1.0f, 1.0f, 1.0f};
73
alignas(16) const u32 fourinfnan[4] = {0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000};
74
alignas(16) const float identityMatrix[4][4] = { { 1.0f, 0, 0, 0 }, { 0, 1.0f, 0, 0 }, { 0, 0, 1.0f, 0 }, { 0, 0, 0, 1.0f} };
75
76
void Jit::Comp_VPFX(MIPSOpcode op)
77
{
78
CONDITIONAL_DISABLE(VFPU_XFER);
79
int data = op & 0xFFFFF;
80
int regnum = (op >> 24) & 3;
81
switch (regnum) {
82
case 0: // S
83
js.prefixS = data;
84
js.prefixSFlag = JitState::PREFIX_KNOWN_DIRTY;
85
break;
86
case 1: // T
87
js.prefixT = data;
88
js.prefixTFlag = JitState::PREFIX_KNOWN_DIRTY;
89
break;
90
case 2: // D
91
js.prefixD = data & 0x00000FFF;
92
js.prefixDFlag = JitState::PREFIX_KNOWN_DIRTY;
93
break;
94
}
95
}
96
97
void Jit::ApplyPrefixST(u8 *vregs, u32 prefix, VectorSize sz) {
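// S/T prefix bit layout, as decoded below: bits 0-7 select the source lane (2 bits per lane), bits 8-11
// apply abs(), bits 12-15 substitute a constant, bits 16-19 negate. 0xE4 (binary 11'10'01'00) is the
// identity swizzle with no modifiers, so there's nothing to do.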
98
if (prefix == 0xE4) return;
99
100
int n = GetNumVectorElements(sz);
101
u8 origV[4];
102
static const float constantArray[8] = {0.f, 1.f, 2.f, 0.5f, 3.f, 1.f/3.f, 0.25f, 1.f/6.f};
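// When a lane's "constants" bit is set, regnum + (abs << 2) below indexes this table: {0, 1, 2, 1/2} for
// abs == 0, or {3, 1/3, 1/4, 1/6} when the abs bit doubles as the high selector bit.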
103
104
for (int i = 0; i < n; i++)
105
origV[i] = vregs[i];
106
107
for (int i = 0; i < n; i++) {
108
int regnum = (prefix >> (i*2)) & 3;
109
int abs = (prefix >> (8+i)) & 1;
110
int negate = (prefix >> (16+i)) & 1;
111
int constants = (prefix >> (12+i)) & 1;
112
113
// Unchanged, hurray.
114
if (!constants && regnum == i && !abs && !negate)
115
continue;
116
117
// This puts the value into a temp reg, so we won't write the modified value back.
118
vregs[i] = fpr.GetTempV();
119
fpr.MapRegV(vregs[i], MAP_NOINIT | MAP_DIRTY);
120
121
if (!constants) {
122
// Prefix may say "z, z, z, z" but if this is a pair, we force to x.
123
// TODO: But some ops seem to use const 0 instead?
124
if (regnum >= n) {
125
ERROR_LOG_REPORT(Log::CPU, "Invalid VFPU swizzle: %08x / %d", prefix, sz);
126
regnum = 0;
127
}
128
fpr.SimpleRegV(origV[regnum], 0);
129
MOVSS(fpr.VX(vregs[i]), fpr.V(origV[regnum]));
130
if (abs) {
131
if (RipAccessible(&noSignMask)) {
132
ANDPS(fpr.VX(vregs[i]), M(&noSignMask)); // rip accessible
133
} else {
134
MOV(PTRBITS, R(TEMPREG), ImmPtr(&noSignMask));
135
ANDPS(fpr.VX(vregs[i]), MatR(TEMPREG));
136
}
137
}
138
} else {
139
if (RipAccessible(constantArray)) {
140
MOVSS(fpr.VX(vregs[i]), M(&constantArray[regnum + (abs << 2)])); // rip accessible
141
} else {
142
MOV(PTRBITS, R(TEMPREG), ImmPtr(&constantArray[regnum + (abs << 2)]));
143
MOVSS(fpr.VX(vregs[i]), MatR(TEMPREG));
144
}
145
}
146
147
if (negate) {
148
if (RipAccessible(&signBitLower)) {
149
XORPS(fpr.VX(vregs[i]), M(&signBitLower)); // rip accessible
150
} else {
151
MOV(PTRBITS, R(TEMPREG), ImmPtr(&signBitLower));
152
XORPS(fpr.VX(vregs[i]), MatR(TEMPREG));
153
}
154
}
155
// TODO: This probably means it will swap out soon, inefficiently...
156
fpr.ReleaseSpillLockV(vregs[i]);
157
}
158
}
159
160
void Jit::GetVectorRegsPrefixD(u8 *regs, VectorSize sz, int vectorReg) {
161
_assert_(js.prefixDFlag & JitState::PREFIX_KNOWN);
162
163
GetVectorRegs(regs, sz, vectorReg);
164
if (js.prefixD == 0)
165
return;
166
167
int n = GetNumVectorElements(sz);
168
for (int i = 0; i < n; i++) {
169
// Hopefully this is rare, we'll just write it into a reg we drop.
170
if (js.VfpuWriteMask(i))
171
regs[i] = fpr.GetTempV();
172
}
173
}
174
175
void Jit::ApplyPrefixD(const u8 *vregs, VectorSize sz) {
176
_assert_(js.prefixDFlag & JitState::PREFIX_KNOWN);
177
if (!js.prefixD) return;
178
179
int n = GetNumVectorElements(sz);
180
for (int i = 0; i < n; i++) {
181
if (js.VfpuWriteMask(i))
182
continue;
183
184
int sat = (js.prefixD >> (i * 2)) & 3;
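// Per-lane saturation mode from the D prefix: 1 clamps to [0.0, 1.0], 3 clamps to [-1.0, 1.0]; any other
// value leaves the lane untouched.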
185
if (sat == 1) {
186
fpr.MapRegV(vregs[i], MAP_DIRTY);
187
188
// Zero out XMM0 if it was <= +0.0f (but skip NAN.)
189
MOVSS(R(XMM0), fpr.VX(vregs[i]));
190
XORPS(XMM1, R(XMM1));
191
CMPLESS(XMM0, R(XMM1));
192
ANDNPS(XMM0, fpr.V(vregs[i]));
193
194
// Retain a NAN in XMM0 (must be second operand.)
195
MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
196
MOVSS(fpr.VX(vregs[i]), MatR(TEMPREG));
197
MINSS(fpr.VX(vregs[i]), R(XMM0));
198
} else if (sat == 3) {
199
fpr.MapRegV(vregs[i], MAP_DIRTY);
200
201
// Check for < -1.0f, but careful of NANs.
202
MOV(PTRBITS, R(TEMPREG), ImmPtr(&minus_one));
203
MOVSS(XMM1, MatR(TEMPREG));
204
MOVSS(R(XMM0), fpr.VX(vregs[i]));
205
CMPLESS(XMM0, R(XMM1));
206
// If it was NOT less, the three ops below do nothing.
207
// Otherwise, they replace the value with -1.0f.
208
ANDPS(XMM1, R(XMM0));
209
ANDNPS(XMM0, fpr.V(vregs[i]));
210
ORPS(XMM0, R(XMM1));
211
212
// Retain a NAN in XMM0 (must be second operand.)
213
MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
214
MOVSS(fpr.VX(vregs[i]), MatR(TEMPREG));
215
MINSS(fpr.VX(vregs[i]), R(XMM0));
216
}
217
}
218
}
219
220
// Vector regs can overlap in all sorts of swizzled ways.
221
// This does allow a single overlap in sregs[i].
222
bool IsOverlapSafeAllowS(int dreg, int di, int sn, const u8 sregs[], int tn = 0, const u8 tregs[] = NULL) {
223
for (int i = 0; i < sn; ++i) {
224
if (sregs[i] == dreg && i != di)
225
return false;
226
}
227
for (int i = 0; i < tn; ++i) {
228
if (tregs[i] == dreg)
229
return false;
230
}
231
232
// Hurray, no overlap, we can write directly.
233
return true;
234
}
235
236
bool IsOverlapSafe(int dreg, int di, int sn, const u8 sregs[], int tn = 0, const u8 tregs[] = NULL) {
237
return IsOverlapSafeAllowS(dreg, di, sn, sregs, tn, tregs) && sregs[di] != dreg;
238
}
239
240
void Jit::Comp_SV(MIPSOpcode op) {
241
CONDITIONAL_DISABLE(LSU_VFPU);
242
243
s32 imm = (signed short)(op&0xFFFC);
244
int vt = ((op >> 16) & 0x1f) | ((op & 3) << 5);
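// The 7-bit VFPU register index is split across the encoding: bits 16-20 hold the low five bits and the
// low two bits of the opcode supply the top two.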
245
MIPSGPReg rs = _RS;
246
247
CheckMemoryBreakpoint(0, rs, imm);
248
249
switch (op >> 26) {
250
case 50: //lv.s // VI(vt) = Memory::Read_U32(addr);
251
{
252
gpr.Lock(rs);
253
fpr.MapRegV(vt, MAP_DIRTY | MAP_NOINIT);
254
255
JitSafeMem safe(this, rs, imm);
256
OpArg src;
257
if (safe.PrepareRead(src, 4)) {
258
MOVSS(fpr.VX(vt), safe.NextFastAddress(0));
259
}
260
if (safe.PrepareSlowRead(safeMemFuncs.readU32)) {
261
MOVD_xmm(fpr.VX(vt), R(EAX));
262
}
263
safe.Finish();
264
265
gpr.UnlockAll();
266
fpr.ReleaseSpillLocks();
267
}
268
break;
269
270
case 58: //sv.s // Memory::Write_U32(VI(vt), addr);
271
{
272
gpr.Lock(rs);
273
274
fpr.MapRegV(vt, 0);
275
276
JitSafeMem safe(this, rs, imm);
277
OpArg dest;
278
if (safe.PrepareWrite(dest, 4)) {
279
MOVSS(safe.NextFastAddress(0), fpr.VX(vt));
280
}
281
if (safe.PrepareSlowWrite()) {
282
MOVSS(MIPSSTATE_VAR(temp), fpr.VX(vt));
283
safe.DoSlowWrite(safeMemFuncs.writeU32, MIPSSTATE_VAR(temp), 0);
284
}
285
safe.Finish();
286
287
fpr.ReleaseSpillLocks();
288
gpr.UnlockAll();
289
}
290
break;
291
292
default:
293
DISABLE;
294
}
295
}
296
297
void Jit::Comp_SVQ(MIPSOpcode op) {
298
CONDITIONAL_DISABLE(LSU_VFPU);
299
300
int imm = (signed short)(op&0xFFFC);
301
int vt = (((op >> 16) & 0x1f)) | ((op&1) << 5);
302
MIPSGPReg rs = _RS;
303
304
CheckMemoryBreakpoint(0, rs, imm);
305
306
switch (op >> 26) {
307
case 53: //lvl.q/lvr.q
308
{
309
if (!g_Config.bFastMemory) {
310
DISABLE;
311
}
312
DISABLE; // The code below isn't quite working, so we fall back to interpreter for now.
313
314
gpr.MapReg(rs, true, false);
315
gpr.FlushLockX(ECX);
316
u8 vregs[4];
317
GetVectorRegs(vregs, V_Quad, vt);
318
MOV(32, R(EAX), gpr.R(rs));
319
ADD(32, R(EAX), Imm32(imm));
320
#ifdef MASKED_PSP_MEMORY
321
AND(32, R(EAX), Imm32(Memory::MEMVIEW32_MASK));
322
#endif
323
MOV(32, R(ECX), R(EAX));
324
SHR(32, R(EAX), Imm8(2));
325
AND(32, R(EAX), Imm32(0x3));
326
CMP(32, R(EAX), Imm32(0));
327
FixupBranch next = J_CC(CC_NE);
328
329
auto PSPMemAddr = [](X64Reg scaled, int offset) {
330
#if PPSSPP_ARCH(X86)
331
return MDisp(scaled, (u32)Memory::base + offset);
332
#else
333
return MComplex(MEMBASEREG, scaled, 1, offset);
334
#endif
335
};
336
337
fpr.MapRegsV(vregs, V_Quad, MAP_DIRTY);
338
339
// Offset = 0
340
MOVSS(fpr.RX(vregs[3]), PSPMemAddr(EAX, 0));
341
342
FixupBranch skip0 = J();
343
SetJumpTarget(next);
344
CMP(32, R(EAX), Imm32(1));
345
next = J_CC(CC_NE);
346
347
// Offset = 1
348
MOVSS(fpr.RX(vregs[3]), PSPMemAddr(EAX, 4));
349
MOVSS(fpr.RX(vregs[2]), PSPMemAddr(EAX, 0));
350
351
FixupBranch skip1 = J();
352
SetJumpTarget(next);
353
CMP(32, R(EAX), Imm32(2));
354
next = J_CC(CC_NE);
355
356
// Offset = 2
357
MOVSS(fpr.RX(vregs[3]), PSPMemAddr(EAX, 8));
358
MOVSS(fpr.RX(vregs[2]), PSPMemAddr(EAX, 4));
359
MOVSS(fpr.RX(vregs[1]), PSPMemAddr(EAX, 0));
360
361
FixupBranch skip2 = J();
362
SetJumpTarget(next);
363
CMP(32, R(EAX), Imm32(3));
364
next = J_CC(CC_NE);
365
366
// Offset = 3
367
MOVSS(fpr.RX(vregs[3]), PSPMemAddr(EAX, 12));
368
MOVSS(fpr.RX(vregs[2]), PSPMemAddr(EAX, 8));
369
MOVSS(fpr.RX(vregs[1]), PSPMemAddr(EAX, 4));
370
MOVSS(fpr.RX(vregs[0]), PSPMemAddr(EAX, 0));
371
372
SetJumpTarget(next);
373
SetJumpTarget(skip0);
374
SetJumpTarget(skip1);
375
SetJumpTarget(skip2);
376
377
gpr.UnlockAll();
378
fpr.ReleaseSpillLocks();
379
}
380
break;
381
382
case 54: //lv.q
383
{
384
gpr.Lock(rs);
385
// This must be in a reg or an immediate.
386
// Otherwise, it'll get put in EAX and we'll clobber that during NextSlowRead().
387
if (!gpr.IsImm(rs))
388
gpr.MapReg(rs, true, false);
389
390
u8 vregs[4];
391
GetVectorRegs(vregs, V_Quad, vt);
392
393
if (fpr.TryMapRegsVS(vregs, V_Quad, MAP_NOINIT | MAP_DIRTY)) {
394
JitSafeMem safe(this, rs, imm);
395
OpArg src;
396
if (safe.PrepareRead(src, 16)) {
397
// Should be safe, since lv.q must be aligned, but let's try to avoid crashing in safe mode.
398
if (g_Config.bFastMemory) {
399
MOVAPS(fpr.VSX(vregs), safe.NextFastAddress(0));
400
} else {
401
MOVUPS(fpr.VSX(vregs), safe.NextFastAddress(0));
402
}
403
}
404
if (safe.PrepareSlowRead(safeMemFuncs.readU32)) {
405
for (int i = 0; i < 4; i++) {
406
safe.NextSlowRead(safeMemFuncs.readU32, i * 4);
407
// We use XMM0 as a temporary since MOVSS and MOVD would clear the higher bits.
408
MOVD_xmm(XMM0, R(EAX));
409
MOVSS(fpr.VSX(vregs), R(XMM0));
410
// Rotate things so we can read in the next higher float.
411
// By the end (4 rotates), they'll all be back into place.
412
SHUFPS(fpr.VSX(vregs), fpr.VS(vregs), _MM_SHUFFLE(0, 3, 2, 1));
413
}
414
}
415
safe.Finish();
416
gpr.UnlockAll();
417
fpr.ReleaseSpillLocks();
418
return;
419
}
420
421
fpr.MapRegsV(vregs, V_Quad, MAP_DIRTY | MAP_NOINIT);
422
423
JitSafeMem safe(this, rs, imm);
424
OpArg src;
425
if (safe.PrepareRead(src, 16)) {
426
// Just copy 4 words the easiest way while not wasting registers.
427
for (int i = 0; i < 4; i++)
428
MOVSS(fpr.VX(vregs[i]), safe.NextFastAddress(i * 4));
429
}
430
if (safe.PrepareSlowRead(safeMemFuncs.readU32)) {
431
for (int i = 0; i < 4; i++) {
432
safe.NextSlowRead(safeMemFuncs.readU32, i * 4);
433
MOVD_xmm(fpr.VX(vregs[i]), R(EAX));
434
}
435
}
436
safe.Finish();
437
438
gpr.UnlockAll();
439
fpr.ReleaseSpillLocks();
440
}
441
break;
442
443
case 62: //sv.q
444
{
445
gpr.Lock(rs);
446
// This must be in a reg or an immediate.
447
// Otherwise, it'll get put in EAX and we'll clobber that during NextSlowRead().
448
if (!gpr.IsImm(rs))
449
gpr.MapReg(rs, true, false);
450
451
u8 vregs[4];
452
GetVectorRegs(vregs, V_Quad, vt);
453
454
if (fpr.TryMapRegsVS(vregs, V_Quad, 0)) {
455
JitSafeMem safe(this, rs, imm);
456
OpArg dest;
457
if (safe.PrepareWrite(dest, 16)) {
458
// Should be safe, since sv.q must be aligned, but let's try to avoid crashing in safe mode.
459
if (g_Config.bFastMemory) {
460
MOVAPS(safe.NextFastAddress(0), fpr.VSX(vregs));
461
} else {
462
MOVUPS(safe.NextFastAddress(0), fpr.VSX(vregs));
463
}
464
}
465
if (safe.PrepareSlowWrite()) {
466
MOVAPS(XMM0, fpr.VS(vregs));
467
for (int i = 0; i < 4; i++) {
468
MOVSS(MIPSSTATE_VAR(temp), XMM0);
469
SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 3, 2, 1));
470
safe.DoSlowWrite(safeMemFuncs.writeU32, MIPSSTATE_VAR(temp), i * 4);
471
}
472
}
473
safe.Finish();
474
gpr.UnlockAll();
475
fpr.ReleaseSpillLocks();
476
return;
477
}
478
479
// Even if we don't use real SIMD, there are still 8 or 16 scalar float registers.
480
fpr.MapRegsV(vregs, V_Quad, 0);
481
482
JitSafeMem safe(this, rs, imm);
483
OpArg dest;
484
if (safe.PrepareWrite(dest, 16)) {
485
for (int i = 0; i < 4; i++)
486
MOVSS(safe.NextFastAddress(i * 4), fpr.VX(vregs[i]));
487
}
488
if (safe.PrepareSlowWrite()) {
489
for (int i = 0; i < 4; i++) {
490
MOVSS(MIPSSTATE_VAR(temp), fpr.VX(vregs[i]));
491
safe.DoSlowWrite(safeMemFuncs.writeU32, MIPSSTATE_VAR(temp), i * 4);
492
}
493
}
494
safe.Finish();
495
496
gpr.UnlockAll();
497
fpr.ReleaseSpillLocks();
498
}
499
break;
500
501
default:
502
DISABLE;
503
break;
504
}
505
}
506
507
void Jit::Comp_VVectorInit(MIPSOpcode op) {
508
CONDITIONAL_DISABLE(VFPU_XFER);
509
510
if (js.HasUnknownPrefix())
511
DISABLE;
512
513
VectorSize sz = GetVecSize(op);
514
int type = (op >> 16) & 0xF;
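// type 6 = vzero (fill the destination with 0.0f), type 7 = vone (fill with 1.0f); anything else falls back
// to the interpreter via DISABLE.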
515
u8 dregs[4];
516
GetVectorRegsPrefixD(dregs, sz, _VD);
517
518
if (fpr.TryMapRegsVS(dregs, sz, MAP_NOINIT | MAP_DIRTY)) {
519
if (type == 6) {
520
XORPS(fpr.VSX(dregs), fpr.VS(dregs));
521
} else if (type == 7) {
522
if (RipAccessible(&oneOneOneOne)) {
523
MOVAPS(fpr.VSX(dregs), M(&oneOneOneOne)); // rip accessible
524
} else {
525
MOV(PTRBITS, R(TEMPREG), ImmPtr(&oneOneOneOne));
526
MOVAPS(fpr.VSX(dregs), MatR(TEMPREG));
527
}
528
} else {
529
DISABLE;
530
}
531
ApplyPrefixD(dregs, sz);
532
fpr.ReleaseSpillLocks();
533
return;
534
}
535
536
switch (type) {
537
case 6: // v=zeros; break; //vzero
538
XORPS(XMM0, R(XMM0));
539
break;
540
case 7: // v=ones; break; //vone
541
if (RipAccessible(&one)) {
542
MOVSS(XMM0, M(&one)); // rip accessible
543
} else {
544
MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
545
MOVSS(XMM0, MatR(TEMPREG));
546
}
547
break;
548
default:
549
DISABLE;
550
break;
551
}
552
553
int n = GetNumVectorElements(sz);
554
fpr.MapRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
555
for (int i = 0; i < n; ++i)
556
MOVSS(fpr.VX(dregs[i]), R(XMM0));
557
ApplyPrefixD(dregs, sz);
558
559
fpr.ReleaseSpillLocks();
560
}
561
562
void Jit::Comp_VIdt(MIPSOpcode op) {
563
CONDITIONAL_DISABLE(VFPU_XFER);
564
if (js.HasUnknownPrefix())
565
DISABLE;
566
567
int vd = _VD;
568
VectorSize sz = GetVecSize(op);
569
int n = GetNumVectorElements(sz);
570
571
u8 dregs[4];
572
GetVectorRegsPrefixD(dregs, sz, _VD);
573
if (fpr.TryMapRegsVS(dregs, sz, MAP_NOINIT | MAP_DIRTY)) {
574
int row = vd & (n - 1);
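// vidt loads one row of the identity matrix: the low bits of vd select which lane receives the 1.0f.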
575
if (RipAccessible(identityMatrix)) {
576
MOVAPS(fpr.VSX(dregs), M(identityMatrix[row])); // rip accessible
577
} else {
578
MOV(PTRBITS, R(TEMPREG), ImmPtr(&identityMatrix[row]));
579
MOVAPS(fpr.VSX(dregs), MatR(TEMPREG));
580
}
581
ApplyPrefixD(dregs, sz);
582
fpr.ReleaseSpillLocks();
583
return;
584
}
585
586
XORPS(XMM0, R(XMM0));
587
if (RipAccessible(&one)) {
588
MOVSS(XMM1, M(&one)); // rip accessible
589
} else {
590
MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
591
MOVSS(XMM1, MatR(TEMPREG));
592
}
593
fpr.MapRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
594
switch (sz) {
595
case V_Pair:
596
MOVSS(fpr.VX(dregs[0]), R((vd&1)==0 ? XMM1 : XMM0));
597
MOVSS(fpr.VX(dregs[1]), R((vd&1)==1 ? XMM1 : XMM0));
598
break;
599
case V_Quad:
600
MOVSS(fpr.VX(dregs[0]), R((vd&3)==0 ? XMM1 : XMM0));
601
MOVSS(fpr.VX(dregs[1]), R((vd&3)==1 ? XMM1 : XMM0));
602
MOVSS(fpr.VX(dregs[2]), R((vd&3)==2 ? XMM1 : XMM0));
603
MOVSS(fpr.VX(dregs[3]), R((vd&3)==3 ? XMM1 : XMM0));
604
break;
605
default:
606
_dbg_assert_msg_(false, "Trying to interpret instruction that can't be interpreted");
607
break;
608
}
609
ApplyPrefixD(dregs, sz);
610
fpr.ReleaseSpillLocks();
611
}
612
613
void Jit::Comp_VDot(MIPSOpcode op) {
614
CONDITIONAL_DISABLE(VFPU_VEC);
615
616
if (js.HasUnknownPrefix())
617
DISABLE;
618
619
VectorSize sz = GetVecSize(op);
620
int n = GetNumVectorElements(sz);
621
622
// TODO: Force read one of them into regs? probably not.
623
u8 sregs[4], tregs[4], dregs[1];
624
GetVectorRegsPrefixS(sregs, sz, _VS);
625
GetVectorRegsPrefixT(tregs, sz, _VT);
626
GetVectorRegsPrefixD(dregs, V_Single, _VD);
627
628
// With SSE2, these won't really give any performance benefit on their own, but may reduce
629
// conversion costs from/to SIMD form. However, the SSE4.1 DPPS may be worth it.
630
// Benchmarking will have to decide whether to enable this on < SSE4.1. Also a HADDPS version
631
// for SSE3 could be written.
632
if (fpr.TryMapDirtyInInVS(dregs, V_Single, sregs, sz, tregs, sz)) {
633
switch (sz) {
634
case V_Pair:
635
if (cpu_info.bSSE4_1) {
636
if (fpr.VSX(dregs) != fpr.VSX(sregs) && fpr.VSX(dregs) != fpr.VSX(tregs)) {
637
MOVAPS(fpr.VSX(dregs), fpr.VS(sregs));
638
DPPS(fpr.VSX(dregs), fpr.VS(tregs), 0x31);
639
} else {
640
MOVAPS(XMM0, fpr.VS(sregs));
641
DPPS(XMM0, fpr.VS(tregs), 0x31);
642
MOVAPS(fpr.VSX(dregs), R(XMM0));
643
}
644
} else {
645
MOVAPS(XMM0, fpr.VS(sregs));
646
MULPS(XMM0, fpr.VS(tregs));
647
MOVAPS(R(XMM1), XMM0);
648
SHUFPS(XMM1, R(XMM0), _MM_SHUFFLE(1, 1, 1, 1));
649
ADDPS(XMM1, R(XMM0));
650
MOVAPS(fpr.VS(dregs), XMM1);
651
}
652
break;
653
case V_Triple:
654
if (cpu_info.bSSE4_1) {
655
if (fpr.VSX(dregs) != fpr.VSX(sregs) && fpr.VSX(dregs) != fpr.VSX(tregs)) {
656
MOVAPS(fpr.VSX(dregs), fpr.VS(sregs));
657
DPPS(fpr.VSX(dregs), fpr.VS(tregs), 0x71);
658
} else {
659
MOVAPS(XMM0, fpr.VS(sregs));
660
DPPS(XMM0, fpr.VS(tregs), 0x71);
661
MOVAPS(fpr.VSX(dregs), R(XMM0));
662
}
663
} else {
664
MOVAPS(XMM0, fpr.VS(sregs));
665
MULPS(XMM0, fpr.VS(tregs));
666
MOVAPS(R(XMM1), XMM0);
667
SHUFPS(XMM1, R(XMM0), _MM_SHUFFLE(3, 2, 1, 1));
668
ADDSS(XMM1, R(XMM0));
669
SHUFPS(XMM0, R(XMM1), _MM_SHUFFLE(3, 2, 2, 2));
670
ADDSS(XMM1, R(XMM0));
671
MOVAPS(fpr.VS(dregs), XMM1);
672
}
673
break;
674
case V_Quad:
675
if (cpu_info.bSSE4_1) {
676
if (fpr.VSX(dregs) != fpr.VSX(sregs) && fpr.VSX(dregs) != fpr.VSX(tregs)) {
677
MOVAPS(fpr.VSX(dregs), fpr.VS(sregs));
678
DPPS(fpr.VSX(dregs), fpr.VS(tregs), 0xF1);
679
} else {
680
MOVAPS(XMM0, fpr.VS(sregs));
681
DPPS(XMM0, fpr.VS(tregs), 0xF1);
682
MOVAPS(fpr.VSX(dregs), R(XMM0));
683
}
684
} /* else if (cpu_info.bSSE3) { // This is slower than the SSE2 solution on my Ivy!
685
MOVAPS(XMM0, fpr.VS(sregs));
686
MOVAPS(XMM1, fpr.VS(tregs));
687
HADDPS(XMM0, R(XMM1));
688
HADDPS(XMM0, R(XMM0));
689
MOVAPS(fpr.VSX(dregs), R(XMM0));
690
} */ else {
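// SSE2 fallback for the quad dot product: multiply lane-wise, add the pair-swapped copy, then add the
// lane-reversed copy; the full sum ends up in lane 0.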
691
MOVAPS(XMM0, fpr.VS(sregs));
692
MOVAPS(XMM1, fpr.VS(tregs));
693
MULPS(XMM0, R(XMM1));
694
MOVAPS(XMM1, R(XMM0));
695
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(2, 3, 0, 1));
696
ADDPS(XMM0, R(XMM1));
697
MOVAPS(XMM1, R(XMM0));
698
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 1, 2, 3));
699
ADDSS(XMM0, R(XMM1));
700
MOVAPS(fpr.VSX(dregs), R(XMM0));
701
}
702
break;
703
default:
704
DISABLE;
705
}
706
ApplyPrefixD(dregs, V_Single);
707
fpr.ReleaseSpillLocks();
708
return;
709
}
710
711
// Flush SIMD.
712
fpr.SimpleRegsV(sregs, sz, 0);
713
fpr.SimpleRegsV(tregs, sz, 0);
714
fpr.SimpleRegsV(dregs, V_Single, MAP_DIRTY | MAP_NOINIT);
715
716
X64Reg tempxreg = XMM0;
717
if (IsOverlapSafe(dregs[0], 0, n, sregs, n, tregs)) {
718
fpr.MapRegsV(dregs, V_Single, MAP_DIRTY | MAP_NOINIT);
719
tempxreg = fpr.VX(dregs[0]);
720
}
721
722
// Need to start with +0.0f so it doesn't result in -0.0f.
723
MOVSS(tempxreg, fpr.V(sregs[0]));
724
MULSS(tempxreg, fpr.V(tregs[0]));
725
for (int i = 1; i < n; i++)
726
{
727
// sum += s[i]*t[i];
728
MOVSS(XMM1, fpr.V(sregs[i]));
729
MULSS(XMM1, fpr.V(tregs[i]));
730
ADDSS(tempxreg, R(XMM1));
731
}
732
733
if (!fpr.V(dregs[0]).IsSimpleReg(tempxreg)) {
734
fpr.MapRegsV(dregs, V_Single, MAP_DIRTY | MAP_NOINIT);
735
MOVSS(fpr.V(dregs[0]), tempxreg);
736
}
737
738
ApplyPrefixD(dregs, V_Single);
739
740
fpr.ReleaseSpillLocks();
741
}
742
743
744
void Jit::Comp_VHdp(MIPSOpcode op) {
745
CONDITIONAL_DISABLE(VFPU_VEC);
746
747
if (js.HasUnknownPrefix())
748
DISABLE;
749
750
VectorSize sz = GetVecSize(op);
751
int n = GetNumVectorElements(sz);
752
753
u8 sregs[4], tregs[4], dregs[1];
754
GetVectorRegsPrefixS(sregs, sz, _VS);
755
GetVectorRegsPrefixT(tregs, sz, _VT);
756
GetVectorRegsPrefixD(dregs, V_Single, _VD);
757
758
// Flush SIMD.
759
fpr.SimpleRegsV(sregs, sz, 0);
760
fpr.SimpleRegsV(tregs, sz, 0);
761
fpr.SimpleRegsV(dregs, V_Single, MAP_DIRTY | MAP_NOINIT);
762
763
X64Reg tempxreg = XMM0;
764
if (IsOverlapSafe(dregs[0], 0, n, sregs, n, tregs)) {
765
fpr.MapRegsV(dregs, V_Single, MAP_DIRTY | MAP_NOINIT);
766
tempxreg = fpr.VX(dregs[0]);
767
}
768
769
// Need to start with +0.0f so it doesn't result in -0.0f.
770
MOVSS(tempxreg, fpr.V(sregs[0]));
771
MULSS(tempxreg, fpr.V(tregs[0]));
772
for (int i = 1; i < n; i++) {
773
// sum += (i == n-1) ? t[i] : s[i]*t[i];
774
if (i == n - 1) {
775
ADDSS(tempxreg, fpr.V(tregs[i]));
776
} else {
777
MOVSS(XMM1, fpr.V(sregs[i]));
778
MULSS(XMM1, fpr.V(tregs[i]));
779
ADDSS(tempxreg, R(XMM1));
780
}
781
}
782
783
if (!fpr.V(dregs[0]).IsSimpleReg(tempxreg)) {
784
fpr.MapRegsV(dregs, V_Single, MAP_DIRTY | MAP_NOINIT);
785
MOVSS(fpr.V(dregs[0]), tempxreg);
786
}
787
788
ApplyPrefixD(dregs, V_Single);
789
790
fpr.ReleaseSpillLocks();
791
}
792
793
void Jit::Comp_VCrossQuat(MIPSOpcode op) {
794
CONDITIONAL_DISABLE(VFPU_VEC);
795
796
if (js.HasUnknownPrefix())
797
DISABLE;
798
799
VectorSize sz = GetVecSize(op);
800
801
u8 sregs[4], tregs[4], dregs[4];
802
GetVectorRegs(sregs, sz, _VS);
803
GetVectorRegs(tregs, sz, _VT);
804
GetVectorRegs(dregs, sz, _VD);
805
806
if (sz == V_Triple) {
807
// Cross product vcrsp.t
808
if (fpr.TryMapDirtyInInVS(dregs, sz, sregs, sz, tregs, sz)) {
809
MOVAPS(XMM0, fpr.VS(tregs));
810
MOVAPS(XMM1, fpr.VS(sregs));
811
SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 0, 2, 1));
812
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3, 0, 2, 1));
813
MULPS(XMM0, fpr.VS(sregs));
814
MULPS(XMM1, fpr.VS(tregs));
815
SUBPS(XMM0, R(XMM1));
816
SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 0, 2, 1));
817
MOVAPS(fpr.VS(dregs), XMM0);
818
fpr.ReleaseSpillLocks();
819
return;
820
}
821
822
// Flush SIMD.
823
fpr.SimpleRegsV(sregs, sz, 0);
824
fpr.SimpleRegsV(tregs, sz, 0);
825
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
826
827
fpr.MapRegsV(sregs, sz, 0);
828
829
// Compute X
830
MOVSS(XMM0, fpr.V(sregs[1]));
831
MULSS(XMM0, fpr.V(tregs[2]));
832
MOVSS(XMM1, fpr.V(sregs[2]));
833
MULSS(XMM1, fpr.V(tregs[1]));
834
SUBSS(XMM0, R(XMM1));
835
MOVSS(fpr.V(dregs[0]), XMM0);
836
837
// Compute Y
838
MOVSS(XMM0, fpr.V(sregs[2]));
839
MULSS(XMM0, fpr.V(tregs[0]));
840
MOVSS(XMM1, fpr.V(sregs[0]));
841
MULSS(XMM1, fpr.V(tregs[2]));
842
SUBSS(XMM0, R(XMM1));
843
MOVSS(fpr.V(dregs[1]), XMM0);
844
845
// Compute Z
846
MOVSS(XMM0, fpr.V(sregs[0]));
847
MULSS(XMM0, fpr.V(tregs[1]));
848
MOVSS(XMM1, fpr.V(sregs[1]));
849
MULSS(XMM1, fpr.V(tregs[0]));
850
SUBSS(XMM0, R(XMM1));
851
MOVSS(fpr.V(dregs[2]), XMM0);
852
} else if (sz == V_Quad) {
853
// Flush SIMD.
854
fpr.SimpleRegsV(sregs, sz, 0);
855
fpr.SimpleRegsV(tregs, sz, 0);
856
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
857
858
// Quaternion product vqmul.q
859
fpr.MapRegsV(sregs, sz, 0);
860
861
// Compute X
862
// d[0] = s[0] * t[3] + s[1] * t[2] - s[2] * t[1] + s[3] * t[0];
863
MOVSS(XMM0, fpr.V(sregs[0]));
864
MULSS(XMM0, fpr.V(tregs[3]));
865
MOVSS(XMM1, fpr.V(sregs[1]));
866
MULSS(XMM1, fpr.V(tregs[2]));
867
ADDSS(XMM0, R(XMM1));
868
MOVSS(XMM1, fpr.V(sregs[2]));
869
MULSS(XMM1, fpr.V(tregs[1]));
870
SUBSS(XMM0, R(XMM1));
871
MOVSS(XMM1, fpr.V(sregs[3]));
872
MULSS(XMM1, fpr.V(tregs[0]));
873
ADDSS(XMM0, R(XMM1));
874
MOVSS(fpr.V(dregs[0]), XMM0);
875
876
// Compute Y
877
//d[1] = s[1] * t[3] + s[2] * t[0] + s[3] * t[1] - s[0] * t[2];
878
MOVSS(XMM0, fpr.V(sregs[1]));
879
MULSS(XMM0, fpr.V(tregs[3]));
880
MOVSS(XMM1, fpr.V(sregs[2]));
881
MULSS(XMM1, fpr.V(tregs[0]));
882
ADDSS(XMM0, R(XMM1));
883
MOVSS(XMM1, fpr.V(sregs[3]));
884
MULSS(XMM1, fpr.V(tregs[1]));
885
ADDSS(XMM0, R(XMM1));
886
MOVSS(XMM1, fpr.V(sregs[0]));
887
MULSS(XMM1, fpr.V(tregs[2]));
888
SUBSS(XMM0, R(XMM1));
889
MOVSS(fpr.V(dregs[1]), XMM0);
890
891
// Compute Z
892
//d[2] = s[0] * t[1] - s[1] * t[0] + s[2] * t[3] + s[3] * t[2];
893
MOVSS(XMM0, fpr.V(sregs[0]));
894
MULSS(XMM0, fpr.V(tregs[1]));
895
MOVSS(XMM1, fpr.V(sregs[1]));
896
MULSS(XMM1, fpr.V(tregs[0]));
897
SUBSS(XMM0, R(XMM1));
898
MOVSS(XMM1, fpr.V(sregs[2]));
899
MULSS(XMM1, fpr.V(tregs[3]));
900
ADDSS(XMM0, R(XMM1));
901
MOVSS(XMM1, fpr.V(sregs[3]));
902
MULSS(XMM1, fpr.V(tregs[2]));
903
ADDSS(XMM0, R(XMM1));
904
MOVSS(fpr.V(dregs[2]), XMM0);
905
906
// Compute W
907
//d[3] = -s[0] * t[0] - s[1] * t[1] - s[2] * t[2] + s[3] * t[3];
908
MOVSS(XMM0, fpr.V(sregs[3]));
909
MULSS(XMM0, fpr.V(tregs[3]));
910
MOVSS(XMM1, fpr.V(sregs[1]));
911
MULSS(XMM1, fpr.V(tregs[1]));
912
SUBSS(XMM0, R(XMM1));
913
MOVSS(XMM1, fpr.V(sregs[2]));
914
MULSS(XMM1, fpr.V(tregs[2]));
915
SUBSS(XMM0, R(XMM1));
916
MOVSS(XMM1, fpr.V(sregs[0]));
917
MULSS(XMM1, fpr.V(tregs[0]));
918
SUBSS(XMM0, R(XMM1));
919
MOVSS(fpr.V(dregs[3]), XMM0);
920
}
921
922
fpr.ReleaseSpillLocks();
923
}
924
925
void Jit::Comp_Vcmov(MIPSOpcode op) {
926
CONDITIONAL_DISABLE(VFPU_COMP);
927
928
if (js.HasUnknownPrefix())
929
DISABLE;
930
931
VectorSize sz = GetVecSize(op);
932
int n = GetNumVectorElements(sz);
933
934
u8 sregs[4], dregs[4];
935
GetVectorRegsPrefixS(sregs, sz, _VS);
936
GetVectorRegsPrefixD(dregs, sz, _VD);
937
int tf = (op >> 19) & 1;
938
int imm3 = (op >> 16) & 7;
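// tf selects the sense: with tf == 0 the copy happens when the tested CC bit is set (vcmovt), with tf == 1
// when it is clear (vcmovf). imm3 < 6 tests that single VFPU_CC bit to gate the whole vector; otherwise each
// lane i is gated by CC bit i.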
939
940
// Flush SIMD.
941
fpr.SimpleRegsV(sregs, sz, 0);
942
943
for (int i = 0; i < n; ++i) {
944
// Simplification: Disable if overlap unsafe
945
if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs)) {
946
DISABLE;
947
}
948
}
949
950
if (imm3 < 6) {
951
gpr.MapReg(MIPS_REG_VFPUCC, true, false);
952
fpr.MapRegsV(dregs, sz, MAP_DIRTY);
953
// Test one bit of CC. This bit decides whether none or all subregisters are copied.
954
TEST(32, gpr.R(MIPS_REG_VFPUCC), Imm32(1 << imm3));
955
FixupBranch skip = J_CC(tf ? CC_NZ : CC_Z, true);
956
for (int i = 0; i < n; i++) {
957
MOVSS(fpr.VX(dregs[i]), fpr.V(sregs[i]));
958
}
959
SetJumpTarget(skip);
960
} else {
961
gpr.MapReg(MIPS_REG_VFPUCC, true, false);
962
fpr.MapRegsV(dregs, sz, MAP_DIRTY);
963
// Look at the bottom four bits of CC to individually decide if the subregisters should be copied.
964
for (int i = 0; i < n; i++) {
965
TEST(32, gpr.R(MIPS_REG_VFPUCC), Imm32(1 << i));
966
FixupBranch skip = J_CC(tf ? CC_NZ : CC_Z, true);
967
MOVSS(fpr.VX(dregs[i]), fpr.V(sregs[i]));
968
SetJumpTarget(skip);
969
}
970
}
971
972
ApplyPrefixD(dregs, sz);
973
974
fpr.ReleaseSpillLocks();
975
}
976
977
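// Scalar fallbacks for vmin/vmax, called from Comp_VecDo3 only when UCOMISS reports an unordered (NaN)
// operand. The jitted code stashes the s operand's raw bits in currentMIPS->temp and passes t's raw bits as
// the argument; the comparison is then done on the IEEE-754 bit patterns (sign-magnitude) as described below.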
static s32 DoVminSS(s32 treg) {
978
s32 sreg = currentMIPS->temp;
979
980
// If both are negative, we flip the comparison (floats are sign-magnitude, not two's complement.)
981
if (sreg < 0 && treg < 0) {
982
// If at least one side is NAN, we take the highest mantissa bits.
983
return treg < sreg ? sreg : treg;
984
} else {
985
// Otherwise, we take the lowest value (negative or lowest mantissa.)
986
return treg > sreg ? sreg : treg;
987
}
988
}
989
990
static s32 DoVmaxSS(s32 treg) {
991
s32 sreg = currentMIPS->temp;
992
993
// This is the same logic as vmin, just reversed.
994
if (sreg < 0 && treg < 0) {
995
return treg < sreg ? treg : sreg;
996
} else {
997
return treg > sreg ? treg : sreg;
998
}
999
}
1000
1001
void Jit::Comp_VecDo3(MIPSOpcode op) {
1002
CONDITIONAL_DISABLE(VFPU_VEC);
1003
1004
if (js.HasUnknownPrefix())
1005
DISABLE;
1006
1007
// Check that we can support the ops, and prepare temporary values for ops that need it.
1008
bool allowSIMD = true;
1009
switch (op >> 26) {
1010
case 24: //VFPU0
1011
switch ((op >> 23) & 7) {
1012
case 0: // d[i] = s[i] + t[i]; break; //vadd
1013
case 1: // d[i] = s[i] - t[i]; break; //vsub
1014
case 7: // d[i] = s[i] / t[i]; break; //vdiv
1015
break;
1016
default:
1017
DISABLE;
1018
}
1019
break;
1020
case 25: //VFPU1
1021
switch ((op >> 23) & 7) {
1022
case 0: // d[i] = s[i] * t[i]; break; //vmul
1023
break;
1024
default:
1025
DISABLE;
1026
}
1027
break;
1028
case 27: //VFPU3
1029
switch ((op >> 23) & 7) {
1030
case 2: // vmin
1031
case 3: // vmax
1032
allowSIMD = false;
1033
break;
1034
case 6: // vsge
1035
case 7: // vslt
1036
break;
1037
default:
1038
DISABLE;
1039
}
1040
break;
1041
default:
1042
DISABLE;
1043
break;
1044
}
1045
1046
VectorSize sz = GetVecSize(op);
1047
int n = GetNumVectorElements(sz);
1048
1049
u8 sregs[4], tregs[4], dregs[4];
1050
GetVectorRegsPrefixS(sregs, sz, _VS);
1051
GetVectorRegsPrefixT(tregs, sz, _VT);
1052
GetVectorRegsPrefixD(dregs, sz, _VD);
1053
1054
if (allowSIMD && fpr.TryMapDirtyInInVS(dregs, sz, sregs, sz, tregs, sz)) {
1055
void (XEmitter::*opFunc)(X64Reg, OpArg) = nullptr;
1056
bool symmetric = false;
1057
switch (op >> 26) {
1058
case 24: //VFPU0
1059
switch ((op >> 23) & 7) {
1060
case 0: // d[i] = s[i] + t[i]; break; //vadd
1061
opFunc = &XEmitter::ADDPS;
1062
symmetric = true;
1063
break;
1064
case 1: // d[i] = s[i] - t[i]; break; //vsub
1065
opFunc = &XEmitter::SUBPS;
1066
break;
1067
case 7: // d[i] = s[i] / t[i]; break; //vdiv
1068
opFunc = &XEmitter::DIVPS;
1069
break;
1070
}
1071
break;
1072
case 25: //VFPU1
1073
switch ((op >> 23) & 7)
1074
{
1075
case 0: // d[i] = s[i] * t[i]; break; //vmul
1076
opFunc = &XEmitter::MULPS;
1077
symmetric = true;
1078
break;
1079
}
1080
break;
1081
case 27: //VFPU3
1082
switch ((op >> 23) & 7)
1083
{
1084
case 2: // vmin
1085
// TODO: Mishandles NaN. Disabled for now.
1086
MOVAPS(XMM1, fpr.VS(sregs));
1087
MINPS(XMM1, fpr.VS(tregs));
1088
MOVAPS(fpr.VSX(dregs), R(XMM1));
1089
break;
1090
case 3: // vmax
1091
// TODO: Mishandles NaN. Disabled for now.
1092
MOVAPS(XMM1, fpr.VS(sregs));
1093
MAXPS(XMM1, fpr.VS(tregs));
1094
MOVAPS(fpr.VSX(dregs), R(XMM1));
1095
break;
1096
case 6: // vsge
1097
MOVAPS(XMM0, fpr.VS(tregs));
1098
MOVAPS(XMM1, fpr.VS(sregs));
1099
CMPPS(XMM0, R(XMM1), CMP_ORD);
1100
CMPPS(XMM1, fpr.VS(tregs), CMP_NLT);
1101
1102
ANDPS(XMM1, R(XMM0));
1103
MOV(PTRBITS, R(TEMPREG), ImmPtr(&oneOneOneOne));
1104
ANDPS(XMM1, MatR(TEMPREG));
1105
MOVAPS(fpr.VSX(dregs), R(XMM1));
1106
break;
1107
case 7: // vslt
1108
MOVAPS(XMM1, fpr.VS(sregs));
1109
CMPPS(XMM1, fpr.VS(tregs), CMP_LT);
1110
MOV(PTRBITS, R(TEMPREG), ImmPtr(&oneOneOneOne));
1111
ANDPS(XMM1, MatR(TEMPREG));
1112
MOVAPS(fpr.VSX(dregs), R(XMM1));
1113
break;
1114
}
1115
break;
1116
}
1117
1118
if (opFunc != nullptr) {
1119
if (fpr.VSX(dregs) != fpr.VSX(tregs)) {
1120
if (fpr.VSX(dregs) != fpr.VSX(sregs)) {
1121
MOVAPS(fpr.VSX(dregs), fpr.VS(sregs));
1122
}
1123
(this->*opFunc)(fpr.VSX(dregs), fpr.VS(tregs));
1124
} else if (symmetric) {
1125
// We already know d = t.
1126
(this->*opFunc)(fpr.VSX(dregs), fpr.VS(sregs));
1127
} else {
1128
MOVAPS(XMM1, fpr.VS(sregs));
1129
(this->*opFunc)(XMM1, fpr.VS(tregs));
1130
MOVAPS(fpr.VSX(dregs), R(XMM1));
1131
}
1132
}
1133
1134
ApplyPrefixD(dregs, sz);
1135
fpr.ReleaseSpillLocks();
1136
return;
1137
}
1138
1139
// Flush SIMD.
1140
fpr.SimpleRegsV(sregs, sz, 0);
1141
fpr.SimpleRegsV(tregs, sz, 0);
1142
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
1143
1144
X64Reg tempxregs[4];
1145
for (int i = 0; i < n; ++i)
1146
{
1147
if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs, n, tregs))
1148
{
1149
// On 32-bit we only have 6 xregs for mips regs, use XMM0/XMM1 if possible.
1150
// But for vmin/vmax/vsge, we need XMM0/XMM1, so avoid.
1151
if (i < 2 && (op >> 26) != 27)
1152
tempxregs[i] = (X64Reg) (XMM0 + i);
1153
else
1154
{
1155
int reg = fpr.GetTempV();
1156
fpr.MapRegV(reg, MAP_NOINIT | MAP_DIRTY);
1157
fpr.SpillLockV(reg);
1158
tempxregs[i] = fpr.VX(reg);
1159
}
1160
}
1161
else
1162
{
1163
fpr.MapRegV(dregs[i], dregs[i] == sregs[i] ? MAP_DIRTY : MAP_NOINIT);
1164
fpr.SpillLockV(dregs[i]);
1165
tempxregs[i] = fpr.VX(dregs[i]);
1166
}
1167
}
1168
1169
for (int i = 0; i < n; ++i)
1170
{
1171
if (!fpr.V(sregs[i]).IsSimpleReg(tempxregs[i]))
1172
MOVSS(tempxregs[i], fpr.V(sregs[i]));
1173
}
1174
1175
for (int i = 0; i < n; ++i) {
1176
switch (op >> 26) {
1177
case 24: //VFPU0
1178
switch ((op >> 23) & 7) {
1179
case 0: // d[i] = s[i] + t[i]; break; //vadd
1180
ADDSS(tempxregs[i], fpr.V(tregs[i]));
1181
break;
1182
case 1: // d[i] = s[i] - t[i]; break; //vsub
1183
SUBSS(tempxregs[i], fpr.V(tregs[i]));
1184
break;
1185
case 7: // d[i] = s[i] / t[i]; break; //vdiv
1186
DIVSS(tempxregs[i], fpr.V(tregs[i]));
1187
break;
1188
}
1189
break;
1190
case 25: //VFPU1
1191
switch ((op >> 23) & 7)
1192
{
1193
case 0: // d[i] = s[i] * t[i]; break; //vmul
1194
MULSS(tempxregs[i], fpr.V(tregs[i]));
1195
break;
1196
}
1197
break;
1198
case 27: //VFPU3
1199
switch ((op >> 23) & 7)
1200
{
1201
case 2: // vmin
1202
{
1203
MOVSS(XMM0, fpr.V(tregs[i]));
1204
UCOMISS(tempxregs[i], R(XMM0));
1205
FixupBranch skip = J_CC(CC_NP, true);
1206
1207
MOVSS(MIPSSTATE_VAR(temp), tempxregs[i]);
1208
MOVD_xmm(R(EAX), XMM0);
1209
CallProtectedFunction(&DoVminSS, R(EAX));
1210
MOVD_xmm(tempxregs[i], R(EAX));
1211
FixupBranch finish = J();
1212
1213
SetJumpTarget(skip);
1214
MINSS(tempxregs[i], R(XMM0));
1215
SetJumpTarget(finish);
1216
}
1217
break;
1218
case 3: // vmax
1219
{
1220
MOVSS(XMM0, fpr.V(tregs[i]));
1221
UCOMISS(tempxregs[i], R(XMM0));
1222
FixupBranch skip = J_CC(CC_NP, true);
1223
1224
MOVSS(MIPSSTATE_VAR(temp), tempxregs[i]);
1225
MOVD_xmm(R(EAX), XMM0);
1226
CallProtectedFunction(&DoVmaxSS, R(EAX));
1227
MOVD_xmm(tempxregs[i], R(EAX));
1228
FixupBranch finish = J();
1229
1230
SetJumpTarget(skip);
1231
MAXSS(tempxregs[i], R(XMM0));
1232
SetJumpTarget(finish);
1233
}
1234
break;
1235
case 6: // vsge
1236
// We can't just reverse, because of 0/-0.
1237
MOVSS(XMM0, fpr.V(tregs[i]));
1238
MOVSS(XMM1, R(tempxregs[i]));
1239
CMPORDSS(XMM1, R(XMM0));
1240
CMPNLTSS(tempxregs[i], R(XMM0));
1241
ANDPS(tempxregs[i], R(XMM1));
1242
MOV(PTRBITS, R(TEMPREG), ImmPtr(&oneOneOneOne));
1243
ANDPS(tempxregs[i], MatR(TEMPREG));
1244
break;
1245
case 7: // vslt
1246
CMPLTSS(tempxregs[i], fpr.V(tregs[i]));
1247
MOV(PTRBITS, R(TEMPREG), ImmPtr(&oneOneOneOne));
1248
ANDPS(tempxregs[i], MatR(TEMPREG));
1249
break;
1250
}
1251
break;
1252
}
1253
}
1254
1255
for (int i = 0; i < n; ++i)
1256
{
1257
if (!fpr.V(dregs[i]).IsSimpleReg(tempxregs[i]))
1258
MOVSS(fpr.V(dregs[i]), tempxregs[i]);
1259
}
1260
1261
ApplyPrefixD(dregs, sz);
1262
1263
fpr.ReleaseSpillLocks();
1264
}
1265
1266
alignas(16) static const u32 vcmpMask[4][4] = {
1267
{0x00000031, 0x00000000, 0x00000000, 0x00000000},
1268
{0x00000011, 0x00000012, 0x00000000, 0x00000000},
1269
{0x00000011, 0x00000012, 0x00000014, 0x00000000},
1270
{0x00000011, 0x00000012, 0x00000014, 0x00000018},
1271
};
1272
1273
void Jit::Comp_Vcmp(MIPSOpcode op) {
1274
CONDITIONAL_DISABLE(VFPU_COMP);
1275
1276
if (js.HasUnknownPrefix())
1277
DISABLE;
1278
1279
VectorSize sz = GetVecSize(op);
1280
int n = GetNumVectorElements(sz);
1281
1282
VCondition cond = (VCondition)(op & 0xF);
1283
1284
u8 sregs[4], tregs[4];
1285
GetVectorRegsPrefixS(sregs, sz, _VS);
1286
GetVectorRegsPrefixT(tregs, sz, _VT);
1287
1288
// Some, we just fall back to the interpreter.
1289
switch (cond) {
1290
case VC_EI: // c = my_isinf(s[i]); break;
1291
case VC_NI: // c = !my_isinf(s[i]); break;
1292
DISABLE;
1293
break;
1294
case VC_ES: // c = my_isnan(s[i]) || my_isinf(s[i]); break; // Tekken Dark Resurrection
1295
case VC_NS: // c = !my_isnan(s[i]) && !my_isinf(s[i]); break;
1296
case VC_EN: // c = my_isnan(s[i]); break;
1297
case VC_NN: // c = !my_isnan(s[i]); break;
1298
if (_VS != _VT)
1299
DISABLE;
1300
break;
1301
default:
1302
break;
1303
}
1304
1305
// First, let's get the trivial ones.
1306
1307
static const int true_bits[4] = {0x31, 0x33, 0x37, 0x3f};
1308
1309
if (cond == VC_TR) {
1310
gpr.MapReg(MIPS_REG_VFPUCC, true, true);
1311
OR(32, gpr.R(MIPS_REG_VFPUCC), Imm32(true_bits[n-1]));
1312
return;
1313
} else if (cond == VC_FL) {
1314
gpr.MapReg(MIPS_REG_VFPUCC, true, true);
1315
AND(32, gpr.R(MIPS_REG_VFPUCC), Imm32(~true_bits[n-1]));
1316
return;
1317
}
1318
1319
if (n > 1)
1320
gpr.FlushLockX(ECX);
1321
1322
// Start with zero in each lane for the compare to zero.
1323
if (cond == VC_EZ || cond == VC_NZ) {
1324
XORPS(XMM0, R(XMM0));
1325
if (n > 1) {
1326
XORPS(XMM1, R(XMM1));
1327
}
1328
}
1329
1330
bool inverse = false;
1331
1332
if (cond == VC_GE || cond == VC_GT) {
1333
// We flip, and we need them in regs so we don't clear the high lanes.
1334
fpr.SimpleRegsV(sregs, sz, 0);
1335
fpr.MapRegsV(tregs, sz, 0);
1336
} else {
1337
fpr.SimpleRegsV(tregs, sz, 0);
1338
fpr.MapRegsV(sregs, sz, 0);
1339
}
1340
1341
// We go backwards because it's more convenient to put things in the right lanes.
1342
int affected_bits = (1 << 4) | (1 << 5); // 4 and 5
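// VFPU_CC layout: bits 0-3 are the per-lane results, bit 4 is the OR ("any") and bit 5 the AND ("all") of
// the lanes. affected_bits collects which CC bits this compare writes; lane bits are OR'd in below.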
1343
for (int i = n - 1; i >= 0; --i) {
1344
// Alternate between XMM0 and XMM1
1345
X64Reg reg = i == 1 || i == 3 ? XMM1 : XMM0;
1346
if ((i == 0 || i == 1) && n > 2) {
1347
// We need to swap lanes... this also puts them in the right place.
1348
SHUFPS(reg, R(reg), _MM_SHUFFLE(3, 2, 0, 1));
1349
}
1350
1351
// Let's only handle the easy ones, and fall back on the interpreter for the rest.
1352
bool compareTwo = false;
1353
bool compareToZero = false;
1354
int comparison = -1;
1355
bool flip = false;
1356
1357
switch (cond) {
1358
case VC_ES:
1359
comparison = -1; // We will do the compare at the end. XMM1 will have the bits.
1360
MOVSS(reg, fpr.V(sregs[i]));
1361
break;
1362
1363
case VC_NS:
1364
comparison = -1; // We will do the compare at the end. XMM1 will have the bits.
1365
MOVSS(reg, fpr.V(sregs[i]));
1366
// Note that we do this all at once at the end.
1367
inverse = true;
1368
break;
1369
1370
case VC_EN:
1371
comparison = CMP_UNORD;
1372
compareTwo = true;
1373
break;
1374
1375
case VC_NN:
1376
comparison = CMP_UNORD;
1377
compareTwo = true;
1378
// Note that we do this all at once at the end.
1379
inverse = true;
1380
break;
1381
1382
case VC_EQ: // c = s[i] == t[i]; break;
1383
comparison = CMP_EQ;
1384
compareTwo = true;
1385
break;
1386
1387
case VC_LT: // c = s[i] < t[i]; break;
1388
comparison = CMP_LT;
1389
compareTwo = true;
1390
break;
1391
1392
case VC_LE: // c = s[i] <= t[i]; break;
1393
comparison = CMP_LE;
1394
compareTwo = true;
1395
break;
1396
1397
case VC_NE: // c = s[i] != t[i]; break;
1398
comparison = CMP_NEQ;
1399
compareTwo = true;
1400
break;
1401
1402
case VC_GE: // c = s[i] >= t[i]; break;
1403
comparison = CMP_LE;
1404
flip = true;
1405
compareTwo = true;
1406
break;
1407
1408
case VC_GT: // c = s[i] > t[i]; break;
1409
comparison = CMP_LT;
1410
flip = true;
1411
compareTwo = true;
1412
break;
1413
1414
case VC_EZ: // c = s[i] == 0.0f || s[i] == -0.0f; break;
1415
comparison = CMP_EQ;
1416
compareToZero = true;
1417
break;
1418
1419
case VC_NZ: // c = s[i] != 0; break;
1420
comparison = CMP_NEQ;
1421
compareToZero = true;
1422
break;
1423
1424
default:
1425
DISABLE;
1426
}
1427
1428
if (comparison != -1) {
1429
if (compareTwo) {
1430
if (!flip) {
1431
MOVSS(reg, fpr.V(sregs[i]));
1432
CMPSS(reg, fpr.V(tregs[i]), comparison);
1433
} else {
1434
MOVSS(reg, fpr.V(tregs[i]));
1435
CMPSS(reg, fpr.V(sregs[i]), comparison);
1436
}
1437
} else if (compareToZero) {
1438
CMPSS(reg, fpr.V(sregs[i]), comparison);
1439
}
1440
}
1441
1442
affected_bits |= 1 << i;
1443
}
1444
1445
if (n > 1) {
1446
XOR(32, R(ECX), R(ECX));
1447
1448
// This combines them together.
1449
UNPCKLPS(XMM0, R(XMM1));
1450
1451
// Finalize the comparison for ES/NS.
1452
if (cond == VC_ES || cond == VC_NS) {
1453
MOV(PTRBITS, R(TEMPREG), ImmPtr(&fourinfnan));
1454
ANDPS(XMM0, MatR(TEMPREG));
1455
PCMPEQD(XMM0, MatR(TEMPREG)); // Integer comparison
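// Masking with 0x7F800000 keeps only the exponent; an all-ones exponent means inf or NaN, i.e. the ES case.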
1456
// It's inverted below for NS.
1457
}
1458
1459
if (inverse) {
1460
// The canonical way to generate a bunch of ones, see https://stackoverflow.com/questions/35085059/what-are-the-best-instruction-sequences-to-generate-vector-constants-on-the-fly
1461
PCMPEQW(XMM1, R(XMM1));
1462
XORPS(XMM0, R(XMM1));
1463
}
1464
MOV(PTRBITS, R(TEMPREG), ImmPtr(&vcmpMask[n - 1]));
1465
ANDPS(XMM0, MatR(TEMPREG));
1466
MOVAPS(MIPSSTATE_VAR(vcmpResult), XMM0);
1467
1468
MOV(32, R(TEMPREG), MIPSSTATE_VAR(vcmpResult[0]));
1469
for (int i = 1; i < n; ++i) {
1470
OR(32, R(TEMPREG), MIPSSTATE_VAR_ELEM32(vcmpResult[0], i));
1471
}
1472
1473
// Aggregate the bits. Urgh, expensive. Can optimize for the case of one comparison,
1474
// which is the most common after all.
1475
CMP(32, R(TEMPREG), Imm8(affected_bits & 0x1F));
1476
SETcc(CC_E, R(ECX));
1477
SHL(32, R(ECX), Imm8(5));
1478
OR(32, R(TEMPREG), R(ECX));
1479
} else {
1480
// Finalize the comparison for ES/NS.
1481
if (cond == VC_ES || cond == VC_NS) {
1482
MOV(PTRBITS, R(TEMPREG), ImmPtr(&fourinfnan));
1483
ANDPS(XMM0, MatR(TEMPREG));
1484
PCMPEQD(XMM0, MatR(TEMPREG)); // Integer comparison
1485
// It's inverted below for NS.
1486
}
1487
1488
MOVD_xmm(R(TEMPREG), XMM0);
1489
if (inverse) {
1490
XOR(32, R(TEMPREG), Imm32(0xFFFFFFFF));
1491
}
1492
AND(32, R(TEMPREG), Imm32(0x31));
1493
}
1494
1495
gpr.UnlockAllX();
1496
gpr.MapReg(MIPS_REG_VFPUCC, true, true);
1497
AND(32, gpr.R(MIPS_REG_VFPUCC), Imm32(~affected_bits));
1498
OR(32, gpr.R(MIPS_REG_VFPUCC), R(TEMPREG));
1499
1500
fpr.ReleaseSpillLocks();
1501
}
1502
1503
// There are no immediates for floating point, so we need to load these
1504
// from RAM. Might as well have a table ready.
1505
extern const float mulTableVi2f[32] = {
1506
1.0f/(1UL<<0),1.0f/(1UL<<1),1.0f/(1UL<<2),1.0f/(1UL<<3),
1507
1.0f/(1UL<<4),1.0f/(1UL<<5),1.0f/(1UL<<6),1.0f/(1UL<<7),
1508
1.0f/(1UL<<8),1.0f/(1UL<<9),1.0f/(1UL<<10),1.0f/(1UL<<11),
1509
1.0f/(1UL<<12),1.0f/(1UL<<13),1.0f/(1UL<<14),1.0f/(1UL<<15),
1510
1.0f/(1UL<<16),1.0f/(1UL<<17),1.0f/(1UL<<18),1.0f/(1UL<<19),
1511
1.0f/(1UL<<20),1.0f/(1UL<<21),1.0f/(1UL<<22),1.0f/(1UL<<23),
1512
1.0f/(1UL<<24),1.0f/(1UL<<25),1.0f/(1UL<<26),1.0f/(1UL<<27),
1513
1.0f/(1UL<<28),1.0f/(1UL<<29),1.0f/(1UL<<30),1.0f/(1UL<<31),
1514
};
1515
1516
void Jit::Comp_Vi2f(MIPSOpcode op) {
1517
CONDITIONAL_DISABLE(VFPU_VEC);
1518
1519
if (js.HasUnknownPrefix())
1520
DISABLE;
1521
1522
VectorSize sz = GetVecSize(op);
1523
int n = GetNumVectorElements(sz);
1524
1525
int imm = (op >> 16) & 0x1f;
1526
const float *mult = &mulTableVi2f[imm];
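// vi2f treats the source as fixed point with 'imm' fractional bits: convert the integer to float below,
// then scale by 1/2^imm from the table above (skipped when imm == 0, since the factor is 1.0).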
1527
1528
u8 sregs[4], dregs[4];
1529
GetVectorRegsPrefixS(sregs, sz, _VS);
1530
GetVectorRegsPrefixD(dregs, sz, _VD);
1531
1532
// Flush SIMD.
1533
fpr.SimpleRegsV(sregs, sz, 0);
1534
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
1535
1536
int tempregs[4];
1537
for (int i = 0; i < n; ++i) {
1538
if (!IsOverlapSafe(dregs[i], i, n, sregs)) {
1539
tempregs[i] = fpr.GetTempV();
1540
} else {
1541
tempregs[i] = dregs[i];
1542
}
1543
}
1544
1545
if (*mult != 1.0f) {
1546
if (RipAccessible(mult)) {
1547
MOVSS(XMM1, M(mult)); // rip accessible
1548
} else {
1549
MOV(PTRBITS, R(TEMPREG), ImmPtr(mult));
1550
MOVSS(XMM1, MatR(TEMPREG));
1551
}
1552
}
1553
for (int i = 0; i < n; i++) {
1554
fpr.MapRegV(tempregs[i], sregs[i] == dregs[i] ? MAP_DIRTY : MAP_NOINIT);
1555
if (fpr.V(sregs[i]).IsSimpleReg()) {
1556
CVTDQ2PS(fpr.VX(tempregs[i]), fpr.V(sregs[i]));
1557
} else {
1558
MOVSS(fpr.VX(tempregs[i]), fpr.V(sregs[i]));
1559
CVTDQ2PS(fpr.VX(tempregs[i]), R(fpr.VX(tempregs[i])));
1560
}
1561
if (*mult != 1.0f)
1562
MULSS(fpr.VX(tempregs[i]), R(XMM1));
1563
}
1564
1565
for (int i = 0; i < n; ++i) {
1566
if (dregs[i] != tempregs[i]) {
1567
fpr.MapRegV(dregs[i], MAP_DIRTY | MAP_NOINIT);
1568
MOVSS(fpr.VX(dregs[i]), fpr.V(tempregs[i]));
1569
}
1570
}
1571
1572
ApplyPrefixD(dregs, sz);
1573
fpr.ReleaseSpillLocks();
1574
}
1575
1576
// Planning for true SIMD
1577
1578
// Sequence for gathering sparse registers into one SIMD:
1579
// MOVSS(XMM0, fpr.R(sregs[0]));
1580
// MOVSS(XMM1, fpr.R(sregs[1]));
1581
// MOVSS(XMM2, fpr.R(sregs[2]));
1582
// MOVSS(XMM3, fpr.R(sregs[3]));
1583
// SHUFPS(XMM0, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0)); // XMM0 = S1 S1 S0 S0
1584
// SHUFPS(XMM2, R(XMM3), _MM_SHUFFLE(0, 0, 0, 0)); // XMM2 = S3 S3 S2 S2
1585
// SHUFPS(XMM0, R(XMM2), _MM_SHUFFLE(2, 0, 2, 0)); // XMM0 = S3 S2 S1 S0
1586
// Some punpckwd etc would also work.
1587
// Alternatively, MOVSS and three PINSRD (SSE4) with mem source.
1588
// Why PINSRD instead of INSERTPS?
1589
// http://software.intel.com/en-us/blogs/2009/01/07/using-sse41-for-mp3-encoding-quantization
1590
1591
// Sequence for scattering a SIMD register to sparse registers:
1592
// (Very serial though, better methods may be possible)
1593
// MOVSS(fpr.R(sregs[0]), XMM0);
1594
// SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 3, 2, 1));
1595
// MOVSS(fpr.R(sregs[1]), XMM0);
1596
// SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 3, 2, 1));
1597
// MOVSS(fpr.R(sregs[2]), XMM0);
1598
// SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(3, 3, 2, 1));
1599
// MOVSS(fpr.R(sregs[3]), XMM0);
1600
// On SSE4 we should use EXTRACTPS.
1601
1602
// Translation of ryg's half_to_float5_SSE2
1603
void Jit::Comp_Vh2f(MIPSOpcode op) {
1604
CONDITIONAL_DISABLE(VFPU_VEC);
1605
if (js.HasUnknownPrefix())
1606
DISABLE;
1607
1608
#define SSE_CONST4(name, val) alignas(16) static const u32 name[4] = { (val), (val), (val), (val) }
1609
1610
SSE_CONST4(mask_nosign, 0x7fff);
1611
SSE_CONST4(nan_mantissa, 0x800003ff);
1612
SSE_CONST4(magic, (254 - 15) << 23);
1613
SSE_CONST4(was_infnan, 0x7bff);
1614
SSE_CONST4(exp_infnan, 255 << 23);
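// Constants for the half-to-float conversion below: mask_nosign strips the sign bit; after the <<13 shift,
// multiplying by 'magic' (2^(127-15) as a float) rebiases the exponent from the half bias (15) to the float
// bias (127); was_infnan/exp_infnan detect inputs with an all-ones exponent and force their float exponent
// to 255, while nan_mantissa preserves the sign and original mantissa bits of such inputs.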
1615
1616
OpArg mask_nosign_arg, nan_mantissa_arg, magic_arg, was_infnan_arg, exp_infnan_arg;
1617
if (RipAccessible(mask_nosign)) {
1618
mask_nosign_arg = M(&mask_nosign[0]);
1619
nan_mantissa_arg = M(&nan_mantissa[0]);
1620
magic_arg = M(&magic[0]);
1621
was_infnan_arg = M(&was_infnan[0]);
1622
exp_infnan_arg = M(&exp_infnan[0]);
1623
} else {
1624
MOV(PTRBITS, R(TEMPREG), ImmPtr(&mask_nosign[0]));
1625
mask_nosign_arg = MAccessibleDisp(TEMPREG, &mask_nosign[0], &mask_nosign[0]);
1626
nan_mantissa_arg = MAccessibleDisp(TEMPREG, &mask_nosign[0], &nan_mantissa[0]);
1627
magic_arg = MAccessibleDisp(TEMPREG, &mask_nosign[0], &magic[0]);
1628
was_infnan_arg = MAccessibleDisp(TEMPREG, &mask_nosign[0], &was_infnan[0]);
1629
exp_infnan_arg = MAccessibleDisp(TEMPREG, &mask_nosign[0], &exp_infnan[0]);
1630
}
1631
1632
#undef SSE_CONST4
1633
VectorSize sz = GetVecSize(op);
1634
VectorSize outsize;
1635
switch (sz) {
1636
case V_Single:
1637
outsize = V_Pair;
1638
break;
1639
case V_Pair:
1640
outsize = V_Quad;
1641
break;
1642
default:
1643
DISABLE;
1644
}
1645
1646
u8 sregs[4], dregs[4];
1647
GetVectorRegsPrefixS(sregs, sz, _VS);
1648
GetVectorRegsPrefixD(dregs, outsize, _VD);
1649
1650
// Flush SIMD.
1651
fpr.SimpleRegsV(sregs, sz, 0);
1652
1653
// Force ourselves an extra xreg as temp space.
1654
X64Reg tempR = fpr.GetFreeXReg();
1655
1656
MOVSS(XMM0, fpr.V(sregs[0]));
1657
if (sz != V_Single) {
1658
MOVSS(XMM1, fpr.V(sregs[1]));
1659
PUNPCKLDQ(XMM0, R(XMM1));
1660
}
1661
XORPS(XMM1, R(XMM1));
1662
PUNPCKLWD(XMM0, R(XMM1));
1663
1664
// OK, 16 bits in each word.
1665
// Let's go. Deep magic here.
1666
MOVAPS(XMM1, R(XMM0));
1667
ANDPS(XMM0, mask_nosign_arg); // xmm0 = expmant
1668
XORPS(XMM1, R(XMM0)); // xmm1 = justsign = expmant ^ xmm0
1669
MOVAPS(tempR, R(XMM0));
1670
PSLLD(XMM0, 13);
1671
MULPS(XMM0, magic_arg); // xmm0 = scaled
1672
PSLLD(XMM1, 16); // xmm1 = sign
1673
ORPS(XMM0, R(XMM1));
1674
1675
// Now create a NAN mask, adding in the sign.
1676
ORPS(XMM1, R(tempR)); // xmm1 = sign + original mantissa.
1677
ANDPS(XMM1, nan_mantissa_arg); // xmm1 = original mantissa
1678
PCMPGTD(tempR, was_infnan_arg); // xmm2 = b_wasinfnan
1679
ORPS(XMM1, exp_infnan_arg); // xmm1 = infnan result
1680
ANDPS(XMM1, R(tempR)); // xmm1 = infnan result OR zero if not infnan
1681
ANDNPS(tempR, R(XMM0)); // tempR = result OR zero if infnan
1682
ORPS(XMM1, R(tempR));
1683
1684
fpr.MapRegsV(dregs, outsize, MAP_NOINIT | MAP_DIRTY);
1685
1686
// TODO: Could apply D-prefix in parallel here...
1687
1688
MOVSS(fpr.V(dregs[0]), XMM1);
1689
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3, 3, 2, 1));
1690
MOVSS(fpr.V(dregs[1]), XMM1);
1691
1692
if (sz != V_Single) {
1693
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3, 3, 2, 1));
1694
MOVSS(fpr.V(dregs[2]), XMM1);
1695
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3, 3, 2, 1));
1696
MOVSS(fpr.V(dregs[3]), XMM1);
1697
}
1698
1699
ApplyPrefixD(dregs, outsize);
1700
gpr.UnlockAllX();
1701
fpr.ReleaseSpillLocks();
1702
}
1703
1704
// The goal is to map (reversed byte order for clarity):
1705
// AABBCCDD -> 000000AA 000000BB 000000CC 000000DD
1706
alignas(16) static s8 vc2i_shuffle[16] = { -1, -1, -1, 0, -1, -1, -1, 1, -1, -1, -1, 2, -1, -1, -1, 3 };
1707
// AABBCCDD -> AAAAAAAA BBBBBBBB CCCCCCCC DDDDDDDD
1708
alignas(16) static s8 vuc2i_shuffle[16] = { 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3 };
1709
1710
void Jit::Comp_Vx2i(MIPSOpcode op) {
1711
CONDITIONAL_DISABLE(VFPU_VEC);
1712
if (js.HasUnknownPrefix())
1713
DISABLE;
1714
1715
int bits = ((op >> 16) & 2) == 0 ? 8 : 16; // vuc2i/vc2i (0/1), vus2i/vs2i (2/3)
1716
bool unsignedOp = ((op >> 16) & 1) == 0; // vuc2i (0), vus2i (2)
1717
1718
// vs2i or vus2i unpack pairs of 16-bit integers into 32-bit integers, with the values
1719
// at the top. vus2i shifts it an extra bit right afterward.
1720
// vc2i and vuc2i unpack quads of 8-bit integers into 32-bit integers, with the values
1721
// at the top too. vuc2i is a bit special (see below.)
1722
// Let's do this similarly as h2f - we do a solution that works for both singles and pairs
1723
// then use it for both.
1724
1725
VectorSize sz = GetVecSize(op);
1726
VectorSize outsize;
1727
if (bits == 8) {
1728
outsize = V_Quad;
1729
} else {
1730
switch (sz) {
1731
case V_Single:
1732
outsize = V_Pair;
1733
break;
1734
case V_Pair:
1735
outsize = V_Quad;
1736
break;
1737
default:
1738
DISABLE;
1739
}
1740
}
1741
1742
u8 sregs[4], dregs[4];
1743
GetVectorRegsPrefixS(sregs, sz, _VS);
1744
GetVectorRegsPrefixD(dregs, outsize, _VD);
1745
1746
// Flush SIMD.
1747
fpr.SimpleRegsV(sregs, sz, 0);
1748
1749
if (bits == 16) {
1750
MOVSS(XMM1, fpr.V(sregs[0]));
1751
if (sz != V_Single) {
1752
MOVSS(XMM0, fpr.V(sregs[1]));
1753
PUNPCKLDQ(XMM1, R(XMM0));
1754
}
1755
1756
// Unpack 16-bit words into 32-bit words, upper position, and we're done!
1757
PXOR(XMM0, R(XMM0));
1758
PUNPCKLWD(XMM0, R(XMM1));
1759
} else if (bits == 8) {
1760
if (unsignedOp) {
1761
// vuc2i is a bit special. It spreads out the bits like this:
1762
// s[0] = 0xDDCCBBAA -> d[0] = (0xAAAAAAAA >> 1), d[1] = (0xBBBBBBBB >> 1), etc.
1763
MOVSS(XMM0, fpr.V(sregs[0]));
1764
if (cpu_info.bSSSE3 && RipAccessible(vuc2i_shuffle)) {
1765
// Not really different speed. Generates a bit less code.
1766
PSHUFB(XMM0, M(&vuc2i_shuffle[0])); // rip accessible
1767
} else {
1768
// First, we change 0xDDCCBBAA to 0xDDDDCCCCBBBBAAAA.
1769
PUNPCKLBW(XMM0, R(XMM0));
1770
// Now, interleave each 16 bits so they're all 32 bits wide.
1771
PUNPCKLWD(XMM0, R(XMM0));
1772
}
1773
} else {
1774
if (cpu_info.bSSSE3 && RipAccessible(vc2i_shuffle)) {
1775
MOVSS(XMM0, fpr.V(sregs[0]));
1776
PSHUFB(XMM0, M(&vc2i_shuffle[0]));
1777
} else {
1778
PXOR(XMM1, R(XMM1));
1779
MOVSS(XMM0, fpr.V(sregs[0]));
1780
PUNPCKLBW(XMM1, R(XMM0));
1781
PXOR(XMM0, R(XMM0));
1782
PUNPCKLWD(XMM0, R(XMM1));
1783
}
1784
}
1785
}
1786
1787
// At this point we have the regs in the 4 lanes.
1788
// In the "u" mode, we need to shift it out of the sign bit.
1789
if (unsignedOp) {
1790
PSRLD(XMM0, 1);
1791
}
1792
1793
if (fpr.TryMapRegsVS(dregs, outsize, MAP_NOINIT | MAP_DIRTY)) {
1794
MOVAPS(fpr.VSX(dregs), R(XMM0));
1795
} else {
1796
// Done! TODO: The rest of this should be possible to extract into a function.
1797
fpr.MapRegsV(dregs, outsize, MAP_NOINIT | MAP_DIRTY);
1798
1799
// TODO: Could apply D-prefix in parallel here...
1800
1801
MOVSS(fpr.V(dregs[0]), XMM0);
1802
PSRLDQ(XMM0, 4);
1803
MOVSS(fpr.V(dregs[1]), XMM0);
1804
1805
if (outsize != V_Pair) {
1806
PSRLDQ(XMM0, 4);
1807
MOVSS(fpr.V(dregs[2]), XMM0);
1808
PSRLDQ(XMM0, 4);
1809
MOVSS(fpr.V(dregs[3]), XMM0);
1810
}
1811
}
1812
1813
ApplyPrefixD(dregs, outsize);
1814
gpr.UnlockAllX();
1815
fpr.ReleaseSpillLocks();
1816
}
1817
1818
extern const double mulTableVf2i[32] = {
1819
(1ULL<<0),(1ULL<<1),(1ULL<<2),(1ULL<<3),
1820
(1ULL<<4),(1ULL<<5),(1ULL<<6),(1ULL<<7),
1821
(1ULL<<8),(1ULL<<9),(1ULL<<10),(1ULL<<11),
1822
(1ULL<<12),(1ULL<<13),(1ULL<<14),(1ULL<<15),
1823
(1ULL<<16),(1ULL<<17),(1ULL<<18),(1ULL<<19),
1824
(1ULL<<20),(1ULL<<21),(1ULL<<22),(1ULL<<23),
1825
(1ULL<<24),(1ULL<<25),(1ULL<<26),(1ULL<<27),
1826
(1ULL<<28),(1ULL<<29),(1ULL<<30),(1ULL<<31),
1827
};
1828
1829
static const double maxMinIntAsDouble[2] = { (double)0x7fffffff, (double)(int)0x80000000 }; // note: (double)(int)0x80000000 is -2147483648.0, not +2147483648.0
1830
1831
void Jit::Comp_Vf2i(MIPSOpcode op) {
1832
CONDITIONAL_DISABLE(VFPU_VEC);
1833
if (js.HasUnknownPrefix())
1834
DISABLE;
1835
1836
VectorSize sz = GetVecSize(op);
1837
int n = GetNumVectorElements(sz);
1838
1839
int imm = (op >> 16) & 0x1f;
1840
const double *mult = &mulTableVf2i[imm];
1841
1842
int setMXCSR = -1;
1843
int rmode = (op >> 21) & 0x1f;
1844
switch (rmode) {
1845
case 17:
1846
break; //z - truncate. Easy to support.
1847
case 16:
1848
setMXCSR = 0;
1849
break;
1850
case 18:
1851
setMXCSR = 2;
1852
break;
1853
case 19:
1854
setMXCSR = 1;
1855
break;
1856
}
1857
1858
// Small optimization: 0 is our default mode anyway.
1859
if (setMXCSR == 0 && !js.hasSetRounding) {
1860
setMXCSR = -1;
1861
}
1862
// Except for truncate, we need to update MXCSR to our preferred rounding mode.
1863
if (setMXCSR != -1) {
1864
STMXCSR(MIPSSTATE_VAR(mxcsrTemp));
1865
MOV(32, R(TEMPREG), MIPSSTATE_VAR(mxcsrTemp));
1866
AND(32, R(TEMPREG), Imm32(~(3 << 13)));
1867
if (setMXCSR != 0) {
1868
OR(32, R(TEMPREG), Imm32(setMXCSR << 13));
1869
}
1870
MOV(32, MIPSSTATE_VAR(temp), R(TEMPREG));
1871
LDMXCSR(MIPSSTATE_VAR(temp));
1872
}
1873
1874
u8 sregs[4], dregs[4];
1875
GetVectorRegsPrefixS(sregs, sz, _VS);
1876
GetVectorRegsPrefixD(dregs, sz, _VD);
1877
1878
// Really tricky to SIMD due to double precision requirement...
1879
1880
// Flush SIMD.
1881
fpr.SimpleRegsV(sregs, sz, 0);
1882
fpr.SimpleRegsV(dregs, sz, MAP_DIRTY | MAP_NOINIT);
1883
1884
u8 tempregs[4];
1885
for (int i = 0; i < n; ++i) {
1886
if (!IsOverlapSafe(dregs[i], i, n, sregs)) {
1887
tempregs[i] = fpr.GetTempV();
1888
} else {
1889
tempregs[i] = dregs[i];
1890
}
1891
}
1892
1893
if (*mult != 1.0f) {
1894
if (RipAccessible(mult)) {
1895
MOVSD(XMM1, M(mult)); // rip accessible
1896
} else {
1897
MOV(PTRBITS, R(TEMPREG), ImmPtr(mult));
1898
MOVSD(XMM1, MatR(TEMPREG));
1899
}
1900
}
1901
1902
fpr.MapRegsV(tempregs, sz, MAP_DIRTY | MAP_NOINIT);
1903
for (int i = 0; i < n; i++) {
1904
// Need to do this in double precision to clamp correctly, as float
1905
// doesn't have enough precision to represent e.g. 0x7fffffff exactly.
1906
MOVSS(XMM0, fpr.V(sregs[i]));
1907
CVTSS2SD(XMM0, R(XMM0)); // convert to double precision
1908
if (*mult != 1.0f) {
1909
MULSD(XMM0, R(XMM1));
1910
}
1911
MOV(PTRBITS, R(TEMPREG), ImmPtr(maxMinIntAsDouble));
1912
MINSD(XMM0, MDisp(TEMPREG, 0));
1913
MAXSD(XMM0, MDisp(TEMPREG, sizeof(double)));
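// Clamp first: an out-of-range double would otherwise make CVTSD2SI return the
// "integer indefinite" value 0x80000000, so MINSD against (double)INT_MAX and MAXSD
// against (double)INT_MIN keep the result in range.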
1914
// We've set the rounding mode above, so this part's easy.
1915
switch ((op >> 21) & 0x1f) {
1916
case 16: CVTSD2SI(TEMPREG, R(XMM0)); break; //n
1917
case 17: CVTTSD2SI(TEMPREG, R(XMM0)); break; //z - truncate
1918
case 18: CVTSD2SI(TEMPREG, R(XMM0)); break; //u
1919
case 19: CVTSD2SI(TEMPREG, R(XMM0)); break; //d
1920
}
1921
MOVD_xmm(fpr.VX(tempregs[i]), R(TEMPREG));
1922
}
1923
1924
for (int i = 0; i < n; ++i) {
1925
if (dregs[i] != tempregs[i]) {
1926
fpr.MapRegV(dregs[i], MAP_DIRTY | MAP_NOINIT);
1927
MOVSS(fpr.VX(dregs[i]), fpr.V(tempregs[i]));
1928
fpr.DiscardV(tempregs[i]);
1929
}
1930
}
1931
1932
if (setMXCSR != -1) {
1933
LDMXCSR(MIPSSTATE_VAR(mxcsrTemp));
1934
}
1935
1936
ApplyPrefixD(dregs, sz);
1937
fpr.ReleaseSpillLocks();
1938
}
1939
1940
void Jit::Comp_Vcst(MIPSOpcode op) {
1941
CONDITIONAL_DISABLE(VFPU_XFER);
1942
1943
if (js.HasUnknownPrefix())
1944
DISABLE;
1945
1946
int conNum = (op >> 16) & 0x1f;
1947
int vd = _VD;
1948
1949
VectorSize sz = GetVecSize(op);
1950
int n = GetNumVectorElements(sz);
1951
1952
u8 dregs[4];
1953
GetVectorRegsPrefixD(dregs, sz, vd);
1954
1955
if (RipAccessible(cst_constants)) {
1956
MOVSS(XMM0, M(&cst_constants[conNum])); // rip accessible
1957
} else {
1958
MOV(PTRBITS, R(TEMPREG), ImmPtr(&cst_constants[conNum]));
1959
MOVSS(XMM0, MatR(TEMPREG));
1960
}
1961
1962
if (fpr.TryMapRegsVS(dregs, sz, MAP_NOINIT | MAP_DIRTY)) {
1963
SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0,0,0,0));
1964
MOVAPS(fpr.VS(dregs), XMM0);
1965
fpr.ReleaseSpillLocks();
1966
return;
1967
}
1968
1969
fpr.MapRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
1970
for (int i = 0; i < n; i++) {
1971
MOVSS(fpr.V(dregs[i]), XMM0);
1972
}
1973
ApplyPrefixD(dregs, sz);
1974
fpr.ReleaseSpillLocks();
1975
}
1976
1977
void Jit::Comp_Vsgn(MIPSOpcode op) {
1978
CONDITIONAL_DISABLE(VFPU_VEC);
1979
1980
if (js.HasUnknownPrefix())
1981
DISABLE;
1982
1983
VectorSize sz = GetVecSize(op);
1984
int n = GetNumVectorElements(sz);
1985
1986
u8 sregs[4], dregs[4];
1987
GetVectorRegsPrefixS(sregs, sz, _VS);
1988
GetVectorRegsPrefixD(dregs, sz, _VD);
1989
1990
// Flush SIMD.
1991
fpr.SimpleRegsV(sregs, sz, 0);
1992
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
1993
1994
X64Reg tempxregs[4];
1995
for (int i = 0; i < n; ++i) {
1996
if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs)) {
1997
int reg = fpr.GetTempV();
1998
fpr.MapRegV(reg, MAP_NOINIT | MAP_DIRTY);
1999
fpr.SpillLockV(reg);
2000
tempxregs[i] = fpr.VX(reg);
2001
} else {
2002
fpr.MapRegV(dregs[i], dregs[i] == sregs[i] ? MAP_DIRTY : MAP_NOINIT);
2003
fpr.SpillLockV(dregs[i]);
2004
tempxregs[i] = fpr.VX(dregs[i]);
2005
}
2006
}
2007
2008
// It would be nice to have more temp regs here, so we could keep signBitLower and oneOneOneOne in registers...
2009
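// In scalar terms, each lane computes roughly:
//   d[i] = (s[i] == 0.0f) ? 0.0f : copysignf(1.0f, s[i]);
// i.e. keep the sign bit, replace the rest with 1.0f, and zero the result when the
// input compared equal to zero.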
for (int i = 0; i < n; ++i) {
2010
XORPS(XMM0, R(XMM0));
2011
CMPEQSS(XMM0, fpr.V(sregs[i])); // XMM0 = s[i] == 0.0f
2012
MOVSS(XMM1, fpr.V(sregs[i]));
2013
// Preserve sign bit, replace rest with ones
2014
if (RipAccessible(signBitLower)) {
2015
ANDPS(XMM1, M(&signBitLower)); // rip accessible
2016
ORPS(XMM1, M(&oneOneOneOne)); // rip accessible
2017
} else {
2018
MOV(PTRBITS, R(TEMPREG), ImmPtr(&signBitLower));
2019
ANDPS(XMM1, MatR(TEMPREG));
2020
MOV(PTRBITS, R(TEMPREG), ImmPtr(&oneOneOneOne));
2021
ORPS(XMM1, MatR(TEMPREG));
2022
}
2023
// If really was equal to zero, zap. Note that ANDN negates the destination.
2024
ANDNPS(XMM0, R(XMM1));
2025
MOVAPS(tempxregs[i], R(XMM0));
2026
}
2027
2028
for (int i = 0; i < n; ++i) {
2029
if (!fpr.V(dregs[i]).IsSimpleReg(tempxregs[i]))
2030
MOVSS(fpr.V(dregs[i]), tempxregs[i]);
2031
}
2032
2033
ApplyPrefixD(dregs, sz);
2034
2035
fpr.ReleaseSpillLocks();
2036
}
2037
2038
void Jit::Comp_Vocp(MIPSOpcode op) {
2039
CONDITIONAL_DISABLE(VFPU_VEC);
2040
2041
if (js.HasUnknownPrefix())
2042
DISABLE;
2043
2044
VectorSize sz = GetVecSize(op);
2045
int n = GetNumVectorElements(sz);
2046
2047
// This is a hack that modifies prefixes. We eat them later, so just overwrite.
2048
// S prefix forces the negate flags.
2049
js.prefixS |= 0x000F0000;
2050
// T prefix forces constants on and regnum to 1.
2051
// That means negate still works, and abs activates a different constant.
2052
js.prefixT = (js.prefixT & ~0x000000FF) | 0x00000055 | 0x0000F000;
2053
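// Net effect of the prefix trick above: each lane ends up computing roughly
//   d[i] = 1.0f - s[i];
// i.e. the constant 1.0 (forced via the T prefix) plus the negated source.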
2054
u8 sregs[4], tregs[4], dregs[4];
2055
// Actually uses the T prefixes (despite being VS.)
2056
GetVectorRegsPrefixS(sregs, sz, _VS);
2057
if (js.prefixT != 0x0000F055)
2058
GetVectorRegsPrefixT(tregs, sz, _VS);
2059
GetVectorRegsPrefixD(dregs, sz, _VD);
2060
2061
// Flush SIMD.
2062
fpr.SimpleRegsV(sregs, sz, 0);
2063
if (js.prefixT != 0x0000F055)
2064
fpr.SimpleRegsV(tregs, sz, 0);
2065
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
2066
2067
X64Reg tempxregs[4];
2068
for (int i = 0; i < n; ++i) {
2069
if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs)) {
2070
int reg = fpr.GetTempV();
2071
fpr.MapRegV(reg, MAP_NOINIT | MAP_DIRTY);
2072
fpr.SpillLockV(reg);
2073
tempxregs[i] = fpr.VX(reg);
2074
} else {
2075
fpr.MapRegV(dregs[i], dregs[i] == sregs[i] ? MAP_DIRTY : MAP_NOINIT);
2076
fpr.SpillLockV(dregs[i]);
2077
tempxregs[i] = fpr.VX(dregs[i]);
2078
}
2079
}
2080
2081
if (js.prefixT == 0x0000F055) {
2082
MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
2083
MOVSS(XMM1, MatR(TEMPREG));
2084
}
2085
for (int i = 0; i < n; ++i) {
2086
if (js.prefixT == 0x0000F055) {
2087
MOVSS(XMM0, R(XMM1));
2088
} else {
2089
MOVSS(XMM0, fpr.V(tregs[i]));
2090
}
2091
ADDSS(XMM0, fpr.V(sregs[i]));
2092
MOVSS(tempxregs[i], R(XMM0));
2093
}
2094
2095
for (int i = 0; i < n; ++i) {
2096
if (!fpr.V(dregs[i]).IsSimpleReg(tempxregs[i]))
2097
MOVSS(fpr.V(dregs[i]), tempxregs[i]);
2098
}
2099
2100
ApplyPrefixD(dregs, sz);
2101
2102
fpr.ReleaseSpillLocks();
2103
}
2104
2105
void Jit::Comp_Vbfy(MIPSOpcode op) {
2106
CONDITIONAL_DISABLE(VFPU_VEC);
2107
if (js.HasUnknownPrefix())
2108
DISABLE;
2109
2110
VectorSize sz = GetVecSize(op);
2111
int n = GetNumVectorElements(sz);
2112
if (n != 2 && n != 4) {
2113
DISABLE;
2114
}
2115
2116
u8 sregs[4], dregs[4];
2117
GetVectorRegsPrefixS(sregs, sz, _VS);
2118
GetVectorRegsPrefixD(dregs, sz, _VD);
2119
// Flush SIMD.
2120
fpr.SimpleRegsV(sregs, sz, 0);
2121
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
2122
2123
X64Reg tempxregs[4];
2124
for (int i = 0; i < n; ++i) {
2125
if (!IsOverlapSafe(dregs[i], i, n, sregs)) {
2126
int reg = fpr.GetTempV();
2127
fpr.MapRegV(reg, MAP_NOINIT | MAP_DIRTY);
2128
fpr.SpillLockV(reg);
2129
tempxregs[i] = fpr.VX(reg);
2130
} else {
2131
fpr.MapRegV(dregs[i], dregs[i] == sregs[i] ? MAP_DIRTY : MAP_NOINIT);
2132
fpr.SpillLockV(dregs[i]);
2133
tempxregs[i] = fpr.VX(dregs[i]);
2134
}
2135
}
2136
2137
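// Butterfly semantics, in scalar terms:
//   vbfy1: d = { s0+s1, s0-s1, s2+s3, s2-s3 }  (pair: only the first two)
//   vbfy2: d = { s0+s2, s1+s3, s0-s2, s1-s3 }  (quad only)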
int subop = (op >> 16) & 0x1F;
2138
if (subop == 3) {
2139
// vbfy2
2140
MOVSS(tempxregs[0], fpr.V(sregs[0]));
2141
MOVSS(tempxregs[1], fpr.V(sregs[1]));
2142
MOVSS(tempxregs[2], fpr.V(sregs[0]));
2143
MOVSS(tempxregs[3], fpr.V(sregs[1]));
2144
ADDSS(tempxregs[0], fpr.V(sregs[2]));
2145
ADDSS(tempxregs[1], fpr.V(sregs[3]));
2146
SUBSS(tempxregs[2], fpr.V(sregs[2]));
2147
SUBSS(tempxregs[3], fpr.V(sregs[3]));
2148
} else if (subop == 2) {
2149
// vbfy1
2150
MOVSS(tempxregs[0], fpr.V(sregs[0]));
2151
MOVSS(tempxregs[1], fpr.V(sregs[0]));
2152
ADDSS(tempxregs[0], fpr.V(sregs[1]));
2153
SUBSS(tempxregs[1], fpr.V(sregs[1]));
2154
if (n == 4) {
2155
MOVSS(tempxregs[2], fpr.V(sregs[2]));
2156
MOVSS(tempxregs[3], fpr.V(sregs[2]));
2157
ADDSS(tempxregs[2], fpr.V(sregs[3]));
2158
SUBSS(tempxregs[3], fpr.V(sregs[3]));
2159
}
2160
} else {
2161
DISABLE;
2162
}
2163
2164
for (int i = 0; i < n; ++i) {
2165
if (!fpr.V(dregs[i]).IsSimpleReg(tempxregs[i]))
2166
MOVSS(fpr.V(dregs[i]), tempxregs[i]);
2167
}
2168
2169
ApplyPrefixD(dregs, sz);
2170
2171
fpr.ReleaseSpillLocks();
2172
}
2173
2174
union u32float {
2175
u32 u;
2176
float f;
2177
2178
operator float() const {
2179
return f;
2180
}
2181
2182
inline u32float &operator *=(const float &other) {
2183
f *= other;
2184
return *this;
2185
}
2186
};
2187
2188
#if PPSSPP_ARCH(AMD64)
2189
typedef float SinCosArg;
2190
#else
2191
typedef u32float SinCosArg;
2192
#endif
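// On 32-bit x86 the helpers below receive the angle as raw float bits in a 4-byte
// stack argument (see specialFuncCallHelper), so u32float lets those bits be
// reinterpreted as a float. On x86-64 the argument arrives in XMM0 as a plain float.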
2193
2194
void SinCos(SinCosArg angle, float *output) {
2195
vfpu_sincos(angle, output[0], output[1]);
2196
}
2197
2198
void SinOnly(SinCosArg angle, float *output) {
2199
output[0] = vfpu_sin(angle);
2200
}
2201
2202
void NegSinOnly(SinCosArg angle, float *output) {
2203
output[0] = -vfpu_sin(angle);
2204
}
2205
2206
void CosOnly(SinCosArg angle, float *output) {
2207
output[1] = vfpu_cos(angle);
2208
}
2209
2210
void ASinScaled(SinCosArg sine, float *output) {
2211
output[0] = vfpu_asin(sine);
2212
}
2213
2214
void SinCosNegSin(SinCosArg angle, float *output) {
2215
vfpu_sincos(angle, output[0], output[1]);
2216
output[0] = -output[0];
2217
}
2218
2219
void Exp2(SinCosArg arg, float *output) {
2220
output[0] = vfpu_exp2(arg);
2221
}
2222
2223
void Log2(SinCosArg arg, float *output) {
2224
output[0] = vfpu_log2(arg);
2225
}
2226
2227
void RExp2(SinCosArg arg, float *output) {
2228
output[0] = vfpu_rexp2(arg);
2229
}
2230
2231
void Jit::Comp_VV2Op(MIPSOpcode op) {
2232
CONDITIONAL_DISABLE(VFPU_VEC);
2233
2234
if (js.HasUnknownPrefix())
2235
DISABLE;
2236
2237
auto specialFuncCallHelper = [this](void (*specialFunc)(SinCosArg, float *output), u8 sreg) {
2238
#if PPSSPP_ARCH(AMD64)
2239
MOVSS(XMM0, fpr.V(sreg));
2240
// TODO: This reg might be different on Linux...
2241
#ifdef _WIN32
2242
LEA(64, RDX, MIPSSTATE_VAR(sincostemp[0]));
2243
#else
2244
LEA(64, RDI, MIPSSTATE_VAR(sincostemp[0]));
2245
#endif
2246
ABI_CallFunction(thunks.ProtectFunction((const void *)specialFunc, 0));
2247
#else
2248
// Sigh, passing floats with cdecl isn't pretty, ends up on the stack.
2249
if (fpr.V(sreg).IsSimpleReg()) {
2250
MOVD_xmm(R(EAX), fpr.VX(sreg));
2251
} else {
2252
MOV(32, R(EAX), fpr.V(sreg));
2253
}
2254
CallProtectedFunction((const void *)specialFunc, R(EAX), Imm32((uint32_t)(uintptr_t)&mips_->sincostemp[0]));
2255
#endif
2256
};
2257
2258
// Pre-processing: Eliminate silly no-op VMOVs, common in Wipeout Pure
2259
if (((op >> 16) & 0x1f) == 0 && _VS == _VD && js.HasNoPrefix()) {
2260
return;
2261
}
2262
2263
VectorSize sz = GetVecSize(op);
2264
int n = GetNumVectorElements(sz);
2265
2266
u8 sregs[4], dregs[4];
2267
GetVectorRegsPrefixS(sregs, sz, _VS);
2268
GetVectorRegsPrefixD(dregs, sz, _VD);
2269
2270
bool canSIMD = false;
2271
// Some can be SIMD'd.
2272
switch ((op >> 16) & 0x1f) {
2273
case 0: // vmov
2274
case 1: // vabs
2275
case 2: // vneg
2276
canSIMD = true;
2277
break;
2278
}
2279
2280
if (canSIMD && fpr.TryMapDirtyInVS(dregs, sz, sregs, sz)) {
2281
switch ((op >> 16) & 0x1f) {
2282
case 0: // vmov
2283
MOVAPS(fpr.VSX(dregs), fpr.VS(sregs));
2284
break;
2285
case 1: // vabs
2286
if (dregs[0] != sregs[0])
2287
MOVAPS(fpr.VSX(dregs), fpr.VS(sregs));
2288
if (RipAccessible(&noSignMask)) {
2289
ANDPS(fpr.VSX(dregs), M(&noSignMask)); // rip accessible
2290
} else {
2291
MOV(PTRBITS, R(TEMPREG), ImmPtr(&noSignMask));
2292
ANDPS(fpr.VSX(dregs), MatR(TEMPREG));
2293
}
2294
break;
2295
case 2: // vneg
2296
if (dregs[0] != sregs[0])
2297
MOVAPS(fpr.VSX(dregs), fpr.VS(sregs));
2298
if (RipAccessible(&signBitAll)) {
2299
XORPS(fpr.VSX(dregs), M(&signBitAll)); // rip accessible
2300
} else {
2301
MOV(PTRBITS, R(TEMPREG), ImmPtr(&signBitAll));
2302
XORPS(fpr.VSX(dregs), MatR(TEMPREG));
2303
}
2304
break;
2305
}
2306
ApplyPrefixD(dregs, sz);
2307
fpr.ReleaseSpillLocks();
2308
return;
2309
}
2310
2311
// Flush SIMD.
2312
fpr.SimpleRegsV(sregs, sz, 0);
2313
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
2314
2315
X64Reg tempxregs[4];
2316
for (int i = 0; i < n; ++i)
2317
{
2318
if (!IsOverlapSafeAllowS(dregs[i], i, n, sregs))
2319
{
2320
int reg = fpr.GetTempV();
2321
fpr.MapRegV(reg, MAP_NOINIT | MAP_DIRTY);
2322
fpr.SpillLockV(reg);
2323
tempxregs[i] = fpr.VX(reg);
2324
}
2325
else
2326
{
2327
fpr.MapRegV(dregs[i], dregs[i] == sregs[i] ? MAP_DIRTY : MAP_NOINIT);
2328
fpr.SpillLockV(dregs[i]);
2329
tempxregs[i] = fpr.VX(dregs[i]);
2330
}
2331
}
2332
2333
// Warning: sregs[i] and tempxregs[i] may be the same reg.
2334
// Helps for vmov, hurts for vrcp, etc.
2335
for (int i = 0; i < n; ++i)
2336
{
2337
switch ((op >> 16) & 0x1f)
2338
{
2339
case 0: // d[i] = s[i]; break; //vmov
2340
// Probably for swizzle.
2341
if (!fpr.V(sregs[i]).IsSimpleReg(tempxregs[i]))
2342
MOVSS(tempxregs[i], fpr.V(sregs[i]));
2343
break;
2344
case 1: // d[i] = fabsf(s[i]); break; //vabs
2345
if (!fpr.V(sregs[i]).IsSimpleReg(tempxregs[i]))
2346
MOVSS(tempxregs[i], fpr.V(sregs[i]));
2347
if (RipAccessible(&noSignMask)) {
2348
ANDPS(tempxregs[i], M(&noSignMask)); // rip accessible
2349
} else {
2350
MOV(PTRBITS, R(TEMPREG), ImmPtr(&noSignMask));
2351
ANDPS(tempxregs[i], MatR(TEMPREG));
2352
}
2353
break;
2354
case 2: // d[i] = -s[i]; break; //vneg
2355
if (!fpr.V(sregs[i]).IsSimpleReg(tempxregs[i]))
2356
MOVSS(tempxregs[i], fpr.V(sregs[i]));
2357
if (RipAccessible(&signBitLower)) {
2358
XORPS(tempxregs[i], M(&signBitLower)); // rip accessible
2359
} else {
2360
MOV(PTRBITS, R(TEMPREG), ImmPtr(&signBitLower));
2361
XORPS(tempxregs[i], MatR(TEMPREG));
2362
}
2363
break;
2364
case 4: // if (s[i] < 0) d[i] = 0; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break; // vsat0
2365
if (!fpr.V(sregs[i]).IsSimpleReg(tempxregs[i]))
2366
MOVSS(tempxregs[i], fpr.V(sregs[i]));
2367
2368
// Zero out XMM0 if it was <= +0.0f (but skip NAN.)
2369
MOVSS(R(XMM0), tempxregs[i]);
2370
XORPS(XMM1, R(XMM1));
2371
CMPLESS(XMM0, R(XMM1));
2372
ANDNPS(XMM0, R(tempxregs[i]));
2373
2374
// Retain a NAN in XMM0 (must be second operand.)
2375
MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
2376
MOVSS(tempxregs[i], MatR(TEMPREG));
2377
MINSS(tempxregs[i], R(XMM0));
2378
break;
2379
case 5: // if (s[i] < -1.0f) d[i] = -1.0f; else {if(s[i] > 1.0f) d[i] = 1.0f; else d[i] = s[i];} break; // vsat1
2380
if (!fpr.V(sregs[i]).IsSimpleReg(tempxregs[i]))
2381
MOVSS(tempxregs[i], fpr.V(sregs[i]));
2382
2383
// Check for < -1.0f, but careful of NANs.
2384
MOV(PTRBITS, R(TEMPREG), ImmPtr(&minus_one));
2385
MOVSS(XMM1, MatR(TEMPREG));
2386
MOVSS(R(XMM0), tempxregs[i]);
2387
CMPLESS(XMM0, R(XMM1));
2388
// If it was NOT less, the three ops below do nothing.
2389
// Otherwise, they replace the value with -1.0f.
2390
ANDPS(XMM1, R(XMM0));
2391
ANDNPS(XMM0, R(tempxregs[i]));
2392
ORPS(XMM0, R(XMM1));
2393
2394
// Retain a NAN in XMM0 (must be second operand.)
2395
MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
2396
MOVSS(tempxregs[i], MatR(TEMPREG));
2397
MINSS(tempxregs[i], R(XMM0));
2398
break;
2399
case 16: // d[i] = 1.0f / s[i]; break; //vrcp
2400
if (RipAccessible(&one)) {
2401
MOVSS(XMM0, M(&one)); // rip accessible
2402
} else {
2403
MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
2404
MOVSS(XMM0, MatR(TEMPREG));
2405
}
2406
DIVSS(XMM0, fpr.V(sregs[i]));
2407
MOVSS(tempxregs[i], R(XMM0));
2408
break;
2409
case 17: // d[i] = 1.0f / sqrtf(s[i]); break; //vrsq
2410
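// Uses a full-precision SQRTSS + DIVSS rather than RSQRTSS, presumably since RSQRTSS
// is only accurate to roughly 12 bits.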
SQRTSS(XMM0, fpr.V(sregs[i]));
2411
if (RipAccessible(&one)) {
2412
MOVSS(tempxregs[i], M(&one)); // rip accessible
2413
} else {
2414
MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
2415
MOVSS(tempxregs[i], MatR(TEMPREG));
2416
}
2417
DIVSS(tempxregs[i], R(XMM0));
2418
break;
2419
case 18: // d[i] = sinf((float)M_PI_2 * s[i]); break; //vsin
2420
specialFuncCallHelper(&SinOnly, sregs[i]);
2421
MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
2422
break;
2423
case 19: // d[i] = cosf((float)M_PI_2 * s[i]); break; //vcos
2424
specialFuncCallHelper(&CosOnly, sregs[i]);
2425
MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[1]));
2426
break;
2427
case 20: // d[i] = powf(2.0f, s[i]); break; //vexp2
2428
specialFuncCallHelper(&Exp2, sregs[i]);
2429
MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
2430
break;
2431
case 21: // d[i] = logf(s[i])/log(2.0f); break; //vlog2
2432
specialFuncCallHelper(&Log2, sregs[i]);
2433
MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
2434
break;
2435
case 22: // d[i] = sqrtf(s[i]); break; //vsqrt
2436
SQRTSS(tempxregs[i], fpr.V(sregs[i]));
2437
MOV(PTRBITS, R(TEMPREG), ImmPtr(&noSignMask));
2438
ANDPS(tempxregs[i], MatR(TEMPREG));
2439
break;
2440
case 23: // d[i] = asinf(s[i]) / M_PI_2; break; //vasin
2441
specialFuncCallHelper(&ASinScaled, sregs[i]);
2442
MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
2443
break;
2444
case 24: // d[i] = -1.0f / s[i]; break; // vnrcp
2445
// Rare so let's not bother checking for RipAccessible.
2446
MOV(PTRBITS, R(TEMPREG), ImmPtr(&minus_one));
2447
MOVSS(XMM0, MatR(TEMPREG));
2448
DIVSS(XMM0, fpr.V(sregs[i]));
2449
MOVSS(tempxregs[i], R(XMM0));
2450
break;
2451
case 26: // d[i] = -sinf((float)M_PI_2 * s[i]); break; // vnsin
2452
specialFuncCallHelper(&NegSinOnly, sregs[i]);
2453
MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
2454
break;
2455
case 28: // d[i] = 1.0f / expf(s[i] * (float)M_LOG2E); break; // vrexp2
2456
specialFuncCallHelper(&RExp2, sregs[i]);
2457
MOVSS(tempxregs[i], MIPSSTATE_VAR(sincostemp[0]));
2458
break;
2459
}
2460
}
2461
for (int i = 0; i < n; ++i)
2462
{
2463
if (!fpr.V(dregs[i]).IsSimpleReg(tempxregs[i]))
2464
MOVSS(fpr.V(dregs[i]), tempxregs[i]);
2465
}
2466
2467
ApplyPrefixD(dregs, sz);
2468
2469
fpr.ReleaseSpillLocks();
2470
}
2471
2472
void Jit::Comp_Mftv(MIPSOpcode op) {
2473
CONDITIONAL_DISABLE(VFPU_XFER);
2474
2475
int imm = op & 0xFF;
2476
MIPSGPReg rt = _RT;
2477
switch ((op >> 21) & 0x1f)
2478
{
2479
case 3: //mfv / mfvc
2480
// rt = 0, imm = 255 appears to be used as a CPU interlock by some games.
2481
if (rt != MIPS_REG_ZERO) {
2482
if (imm < 128) { //R(rt) = VI(imm);
2483
fpr.SimpleRegV(imm, 0);
2484
if (fpr.V(imm).IsSimpleReg()) {
2485
fpr.MapRegV(imm, 0);
2486
gpr.MapReg(rt, false, true);
2487
MOVD_xmm(gpr.R(rt), fpr.VX(imm));
2488
} else {
2489
// Let's not bother mapping the vreg.
2490
gpr.MapReg(rt, false, true);
2491
MOV(32, gpr.R(rt), fpr.V(imm));
2492
}
2493
} else if (imm < 128 + VFPU_CTRL_MAX) { //mfvc
2494
if (imm - 128 == VFPU_CTRL_CC) {
2495
if (gpr.IsImm(MIPS_REG_VFPUCC)) {
2496
gpr.SetImm(rt, gpr.GetImm(MIPS_REG_VFPUCC));
2497
} else {
2498
gpr.Lock(rt, MIPS_REG_VFPUCC);
2499
gpr.MapReg(rt, false, true);
2500
gpr.MapReg(MIPS_REG_VFPUCC, true, false);
2501
MOV(32, gpr.R(rt), gpr.R(MIPS_REG_VFPUCC));
2502
gpr.UnlockAll();
2503
}
2504
} else {
2505
// In case we have a saved prefix.
2506
FlushPrefixV();
2507
gpr.MapReg(rt, false, true);
2508
MOV(32, gpr.R(rt), MIPSSTATE_VAR_ELEM32(vfpuCtrl[0], imm - 128));
2509
}
2510
} else {
2511
//ERROR - maybe this value too needs to be treated as an "interlock" value?
2512
_dbg_assert_msg_(false,"mfv - invalid register");
2513
}
2514
}
2515
break;
2516
2517
case 7: //mtv
2518
if (imm < 128) { // VI(imm) = R(rt);
2519
fpr.MapRegV(imm, MAP_DIRTY | MAP_NOINIT);
2520
// Let's not bother mapping rt if we don't have to.
2521
if (gpr.IsImm(rt) && gpr.GetImm(rt) == 0) {
2522
XORPS(fpr.VX(imm), fpr.V(imm));
2523
} else {
2524
gpr.KillImmediate(rt, true, false);
2525
MOVD_xmm(fpr.VX(imm), gpr.R(rt));
2526
}
2527
} else if (imm < 128 + VFPU_CTRL_MAX) { //mtvc //currentMIPS->vfpuCtrl[imm - 128] = R(rt);
2528
if (imm - 128 == VFPU_CTRL_CC) {
2529
if (gpr.IsImm(rt)) {
2530
gpr.SetImm(MIPS_REG_VFPUCC, gpr.GetImm(rt));
2531
} else {
2532
gpr.Lock(rt, MIPS_REG_VFPUCC);
2533
gpr.MapReg(rt, true, false);
2534
gpr.MapReg(MIPS_REG_VFPUCC, false, true);
2535
MOV(32, gpr.R(MIPS_REG_VFPUCC), gpr.R(rt));
2536
gpr.UnlockAll();
2537
}
2538
} else {
2539
gpr.MapReg(rt, true, false);
2540
MOV(32, MIPSSTATE_VAR_ELEM32(vfpuCtrl[0], imm - 128), gpr.R(rt));
2541
}
2542
2543
// TODO: Optimization if rt is Imm?
2544
if (imm - 128 == VFPU_CTRL_SPREFIX) {
2545
js.prefixSFlag = JitState::PREFIX_UNKNOWN;
2546
js.blockWrotePrefixes = true;
2547
} else if (imm - 128 == VFPU_CTRL_TPREFIX) {
2548
js.prefixTFlag = JitState::PREFIX_UNKNOWN;
2549
js.blockWrotePrefixes = true;
2550
} else if (imm - 128 == VFPU_CTRL_DPREFIX) {
2551
js.prefixDFlag = JitState::PREFIX_UNKNOWN;
2552
js.blockWrotePrefixes = true;
2553
}
2554
} else {
2555
//ERROR
2556
_dbg_assert_msg_(false,"mtv - invalid register");
2557
}
2558
break;
2559
2560
default:
2561
DISABLE;
2562
}
2563
}
2564
2565
void Jit::Comp_Vmfvc(MIPSOpcode op) {
2566
CONDITIONAL_DISABLE(VFPU_XFER);
2567
int vd = _VD;
2568
int imm = (op >> 8) & 0x7F;
2569
if (imm < VFPU_CTRL_MAX) {
2570
fpr.MapRegV(vd, MAP_DIRTY | MAP_NOINIT);
2571
if (imm == VFPU_CTRL_CC) {
2572
gpr.MapReg(MIPS_REG_VFPUCC, true, false);
2573
MOVD_xmm(fpr.VX(vd), gpr.R(MIPS_REG_VFPUCC));
2574
} else {
2575
MOVSS(fpr.VX(vd), MIPSSTATE_VAR_ELEM32(vfpuCtrl[0], imm));
2576
}
2577
fpr.ReleaseSpillLocks();
2578
} else {
2579
fpr.MapRegV(vd, MAP_DIRTY | MAP_NOINIT);
2580
XORPS(fpr.VX(vd), fpr.V(vd));
2581
fpr.ReleaseSpillLocks();
2582
}
2583
}
2584
2585
void Jit::Comp_Vmtvc(MIPSOpcode op) {
2586
CONDITIONAL_DISABLE(VFPU_XFER);
2587
int vs = _VS;
2588
int imm = op & 0x7F;
2589
if (imm < VFPU_CTRL_MAX) {
2590
fpr.MapRegV(vs, 0);
2591
if (imm == VFPU_CTRL_CC) {
2592
gpr.MapReg(MIPS_REG_VFPUCC, false, true);
2593
MOVD_xmm(gpr.R(MIPS_REG_VFPUCC), fpr.VX(vs));
2594
} else {
2595
MOVSS(MIPSSTATE_VAR_ELEM32(vfpuCtrl[0], imm), fpr.VX(vs));
2596
}
2597
fpr.ReleaseSpillLocks();
2598
2599
if (imm == VFPU_CTRL_SPREFIX) {
2600
js.prefixSFlag = JitState::PREFIX_UNKNOWN;
2601
js.blockWrotePrefixes = true;
2602
} else if (imm == VFPU_CTRL_TPREFIX) {
2603
js.prefixTFlag = JitState::PREFIX_UNKNOWN;
2604
js.blockWrotePrefixes = true;
2605
} else if (imm == VFPU_CTRL_DPREFIX) {
2606
js.prefixDFlag = JitState::PREFIX_UNKNOWN;
2607
js.blockWrotePrefixes = true;
2608
}
2609
}
2610
}
2611
2612
void Jit::Comp_VMatrixInit(MIPSOpcode op) {
2613
CONDITIONAL_DISABLE(VFPU_XFER);
2614
2615
if (js.HasUnknownPrefix())
2616
DISABLE;
2617
2618
MatrixSize sz = GetMtxSize(op);
2619
int n = GetMatrixSide(sz);
2620
2621
// Not really about trying here, it will work if enabled.
2622
if (jo.enableVFPUSIMD) {
2623
VectorSize vsz = GetVectorSize(sz);
2624
u8 vecs[4];
2625
GetMatrixColumns(_VD, sz, vecs);
2626
switch ((op >> 16) & 0xF) {
2627
case 3:
2628
MOV(PTRBITS, R(TEMPREG), ImmPtr(&identityMatrix[0]));
2629
break;
2630
case 7:
2631
MOV(PTRBITS, R(TEMPREG), ImmPtr(&oneOneOneOne));
2632
MOVAPS(XMM0, MatR(TEMPREG));
2633
break;
2634
}
2635
2636
for (int i = 0; i < n; i++) {
2637
u8 vec[4];
2638
GetVectorRegs(vec, vsz, vecs[i]);
2639
fpr.MapRegsVS(vec, vsz, MAP_NOINIT | MAP_DIRTY);
2640
switch ((op >> 16) & 0xF) {
2641
case 3:
2642
MOVAPS(fpr.VSX(vec), MDisp(TEMPREG, 16 * i));
2643
break;
2644
case 6:
2645
XORPS(fpr.VSX(vec), fpr.VS(vec));
2646
break;
2647
case 7:
2648
MOVAPS(fpr.VSX(vec), R(XMM0));
2649
break;
2650
}
2651
}
2652
fpr.ReleaseSpillLocks();
2653
return;
2654
}
2655
2656
u8 dregs[16];
2657
GetMatrixRegs(dregs, sz, _VD);
2658
2659
// Flush SIMD.
2660
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
2661
2662
switch ((op >> 16) & 0xF) {
2663
case 3: // vmidt
2664
XORPS(XMM0, R(XMM0));
2665
MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
2666
MOVSS(XMM1, MatR(TEMPREG));
2667
for (int a = 0; a < n; a++) {
2668
for (int b = 0; b < n; b++) {
2669
MOVSS(fpr.V(dregs[a * 4 + b]), a == b ? XMM1 : XMM0);
2670
}
2671
}
2672
break;
2673
case 6: // vmzero
2674
XORPS(XMM0, R(XMM0));
2675
for (int a = 0; a < n; a++) {
2676
for (int b = 0; b < n; b++) {
2677
MOVSS(fpr.V(dregs[a * 4 + b]), XMM0);
2678
}
2679
}
2680
break;
2681
case 7: // vmone
2682
MOV(PTRBITS, R(TEMPREG), ImmPtr(&one));
2683
MOVSS(XMM0, MatR(TEMPREG));
2684
for (int a = 0; a < n; a++) {
2685
for (int b = 0; b < n; b++) {
2686
MOVSS(fpr.V(dregs[a * 4 + b]), XMM0);
2687
}
2688
}
2689
break;
2690
}
2691
2692
fpr.ReleaseSpillLocks();
2693
}
2694
2695
void Jit::Comp_Vmmov(MIPSOpcode op) {
2696
CONDITIONAL_DISABLE(VFPU_MTX_VMMOV);
2697
2698
// TODO: This probably ignores prefixes?
2699
if (js.HasUnknownPrefix())
2700
DISABLE;
2701
2702
MatrixSize sz = GetMtxSize(op);
2703
int n = GetMatrixSide(sz);
2704
2705
if (jo.enableVFPUSIMD) {
2706
VectorSize vsz = GetVectorSize(sz);
2707
u8 dest[4][4];
2708
MatrixOverlapType overlap = GetMatrixOverlap(_VD, _VS, sz);
2709
2710
u8 vecs[4];
2711
if (overlap == OVERLAP_NONE) {
2712
GetMatrixColumns(_VD, sz, vecs);
2713
for (int i = 0; i < n; ++i) {
2714
GetVectorRegs(dest[i], vsz, vecs[i]);
2715
}
2716
} else {
2717
for (int i = 0; i < n; ++i) {
2718
fpr.GetTempVS(dest[i], vsz);
2719
}
2720
}
2721
2722
GetMatrixColumns(_VS, sz, vecs);
2723
for (int i = 0; i < n; i++) {
2724
u8 vec[4];
2725
GetVectorRegs(vec, vsz, vecs[i]);
2726
fpr.MapRegsVS(vec, vsz, 0);
2727
fpr.MapRegsVS(dest[i], vsz, MAP_NOINIT);
2728
MOVAPS(fpr.VSX(dest[i]), fpr.VS(vec));
2729
fpr.ReleaseSpillLocks();
2730
}
2731
2732
if (overlap != OVERLAP_NONE) {
2733
// Okay, move from the temps to VD now.
2734
GetMatrixColumns(_VD, sz, vecs);
2735
for (int i = 0; i < n; i++) {
2736
u8 vec[4];
2737
GetVectorRegs(vec, vsz, vecs[i]);
2738
fpr.MapRegsVS(vec, vsz, MAP_NOINIT);
2739
fpr.MapRegsVS(dest[i], vsz, 0);
2740
MOVAPS(fpr.VSX(vec), fpr.VS(dest[i]));
2741
fpr.ReleaseSpillLocks();
2742
}
2743
}
2744
2745
fpr.ReleaseSpillLocks();
2746
return;
2747
}
2748
2749
u8 sregs[16], dregs[16];
2750
GetMatrixRegs(sregs, sz, _VS);
2751
GetMatrixRegs(dregs, sz, _VD);
2752
2753
// Flush SIMD.
2754
fpr.SimpleRegsV(sregs, sz, 0);
2755
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
2756
2757
// TODO: gas doesn't allow overlap, what does the PSP do?
2758
// Potentially detect overlap or the safe direction to move in, or just DISABLE?
2759
// This is far from optimal; it blows the regcache every time.
2760
u8 tempregs[16];
2761
for (int a = 0; a < n; a++) {
2762
for (int b = 0; b < n; b++) {
2763
u8 temp = (u8) fpr.GetTempV();
2764
fpr.MapRegV(temp, MAP_NOINIT | MAP_DIRTY);
2765
MOVSS(fpr.VX(temp), fpr.V(sregs[a * 4 + b]));
2766
fpr.StoreFromRegisterV(temp);
2767
tempregs[a * 4 + b] = temp;
2768
}
2769
}
2770
for (int a = 0; a < n; a++) {
2771
for (int b = 0; b < n; b++) {
2772
u8 temp = tempregs[a * 4 + b];
2773
fpr.MapRegV(temp, 0);
2774
MOVSS(fpr.V(dregs[a * 4 + b]), fpr.VX(temp));
2775
}
2776
}
2777
2778
fpr.ReleaseSpillLocks();
2779
}
2780
2781
void Jit::Comp_VScl(MIPSOpcode op) {
2782
CONDITIONAL_DISABLE(VFPU_VEC);
2783
2784
if (js.HasUnknownPrefix())
2785
DISABLE;
2786
2787
VectorSize sz = GetVecSize(op);
2788
int n = GetNumVectorElements(sz);
2789
2790
u8 sregs[4], dregs[4], scale;
2791
GetVectorRegsPrefixS(sregs, sz, _VS);
2792
GetVectorRegsPrefixT(&scale, V_Single, _VT);
2793
GetVectorRegsPrefixD(dregs, sz, _VD);
2794
2795
if (fpr.TryMapDirtyInInVS(dregs, sz, sregs, sz, &scale, V_Single, true)) {
2796
MOVSS(XMM0, fpr.VS(&scale));
2797
if (sz != V_Single)
2798
SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0));
2799
if (dregs[0] != sregs[0]) {
2800
MOVAPS(fpr.VSX(dregs), fpr.VS(sregs));
2801
}
2802
MULPS(fpr.VSX(dregs), R(XMM0));
2803
ApplyPrefixD(dregs, sz);
2804
fpr.ReleaseSpillLocks();
2805
return;
2806
}
2807
2808
// Flush SIMD.
2809
fpr.SimpleRegsV(sregs, sz, 0);
2810
fpr.SimpleRegsV(&scale, V_Single, 0);
2811
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
2812
2813
// Move to XMM0 early, so we don't have to worry about overlap with scale.
2814
MOVSS(XMM0, fpr.V(scale));
2815
2816
X64Reg tempxregs[4];
2817
for (int i = 0; i < n; ++i) {
2818
if (dregs[i] != scale || !IsOverlapSafeAllowS(dregs[i], i, n, sregs)) {
2819
int reg = fpr.GetTempV();
2820
fpr.MapRegV(reg, MAP_NOINIT | MAP_DIRTY);
2821
fpr.SpillLockV(reg);
2822
tempxregs[i] = fpr.VX(reg);
2823
} else {
2824
fpr.MapRegV(dregs[i], dregs[i] == sregs[i] ? MAP_DIRTY : MAP_NOINIT);
2825
fpr.SpillLockV(dregs[i]);
2826
tempxregs[i] = fpr.VX(dregs[i]);
2827
}
2828
}
2829
for (int i = 0; i < n; ++i) {
2830
if (!fpr.V(sregs[i]).IsSimpleReg(tempxregs[i]))
2831
MOVSS(tempxregs[i], fpr.V(sregs[i]));
2832
MULSS(tempxregs[i], R(XMM0));
2833
}
2834
for (int i = 0; i < n; ++i) {
2835
if (!fpr.V(dregs[i]).IsSimpleReg(tempxregs[i]))
2836
MOVSS(fpr.V(dregs[i]), tempxregs[i]);
2837
}
2838
ApplyPrefixD(dregs, sz);
2839
2840
fpr.ReleaseSpillLocks();
2841
}
2842
2843
void Jit::Comp_Vmmul(MIPSOpcode op) {
2844
CONDITIONAL_DISABLE(VFPU_MTX_VMMUL);
2845
if (!js.HasNoPrefix()) {
2846
DISABLE;
2847
}
2848
2849
if (PSP_CoreParameter().compat.flags().MoreAccurateVMMUL) {
2850
// Fall back to interpreter, which has the accurate implementation.
2851
// Later we might do something more optimized here.
2852
DISABLE;
2853
}
2854
2855
MatrixSize sz = GetMtxSize(op);
2856
VectorSize vsz = GetVectorSize(sz);
2857
int n = GetMatrixSide(sz);
2858
2859
MatrixOverlapType soverlap = GetMatrixOverlap(_VS, _VD, sz);
2860
MatrixOverlapType toverlap = GetMatrixOverlap(_VT, _VD, sz);
2861
// If these overlap, we won't be able to map T as singles.
2862
MatrixOverlapType stoverlap = GetMatrixOverlap(_VS, _VT, sz);
2863
2864
if (jo.enableVFPUSIMD && !soverlap && !toverlap && !stoverlap) {
2865
u8 scols[4], dcols[4], tregs[16];
2866
2867
int vs = _VS;
2868
int vd = _VD;
2869
int vt = _VT;
2870
2871
bool transposeDest = false;
2872
bool transposeS = false;
2873
2874
if ((vd & 0x20) && sz == M_4x4) {
2875
vd ^= 0x20;
2876
transposeDest = true;
2877
}
2878
2879
// Our algorithm needs a transposed S (which is the usual).
2880
if (!(vs & 0x20) && sz == M_4x4) {
2881
vs ^= 0x20;
2882
transposeS = true;
2883
}
2884
2885
// The T matrix we will address individually.
2886
GetMatrixColumns(vd, sz, dcols);
2887
GetMatrixRows(vs, sz, scols);
2888
memset(tregs, 255, sizeof(tregs));
2889
GetMatrixRegs(tregs, sz, vt);
2890
for (int i = 0; i < 16; i++) {
2891
if (tregs[i] != 255)
2892
fpr.StoreFromRegisterV(tregs[i]);
2893
}
2894
2895
u8 scol[4][4];
2896
2897
// Map all of S's columns into registers.
2898
for (int i = 0; i < n; i++) {
2899
if (transposeS){
2900
fpr.StoreFromRegisterV(scols[i]);
2901
}
2902
GetVectorRegs(scol[i], vsz, scols[i]);
2903
fpr.MapRegsVS(scol[i], vsz, 0);
2904
fpr.SpillLockV(scols[i], vsz);
2905
}
2906
2907
// Shorter than manually stuffing the registers. But it feels like there's room for optimization here...
2908
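// (This is essentially the standard 4x4 transpose built from UNPCKLPS/UNPCKHPS, much
// like _MM_TRANSPOSE4_PS, routed through the reg cache with XMM0 as scratch.)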
auto transposeInPlace = [=](u8 col[4][4]) {
2909
MOVAPS(XMM0, fpr.VS(col[0]));
2910
UNPCKLPS(fpr.VSX(col[0]), fpr.VS(col[2]));
2911
UNPCKHPS(XMM0, fpr.VS(col[2]));
2912
2913
MOVAPS(fpr.VSX(col[2]), fpr.VS(col[1]));
2914
UNPCKLPS(fpr.VSX(col[1]), fpr.VS(col[3]));
2915
UNPCKHPS(fpr.VSX(col[2]), fpr.VS(col[3]));
2916
2917
MOVAPS(fpr.VSX(col[3]), fpr.VS(col[0]));
2918
UNPCKLPS(fpr.VSX(col[0]), fpr.VS(col[1]));
2919
UNPCKHPS(fpr.VSX(col[3]), fpr.VS(col[1]));
2920
2921
MOVAPS(fpr.VSX(col[1]), R(XMM0));
2922
UNPCKLPS(fpr.VSX(col[1]), fpr.VS(col[2]));
2923
UNPCKHPS(XMM0, fpr.VS(col[2]));
2924
2925
MOVAPS(fpr.VSX(col[2]), fpr.VS(col[1]));
2926
MOVAPS(fpr.VSX(col[1]), fpr.VS(col[3]));
2927
MOVAPS(fpr.VSX(col[3]), R(XMM0));
2928
};
2929
2930
// Some games pass in S as an E matrix (transposed). Let's just transpose the data before we do the multiplication instead.
2931
// This is shorter than trying to combine a discontinuous matrix with lots of shufps.
2932
if (transposeS) {
2933
transposeInPlace(scol);
2934
}
2935
2936
// Now, work our way through the matrix, loading things as we go.
2937
// TODO: With more temp registers, can generate much more efficient code.
2938
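// Each destination column is a linear combination of S's columns:
//   dcol[i] = sum over j of tregs[4*i + j] * scol[j]
// built by broadcasting each T element (SHUFPS 0,0,0,0) and accumulating with
// MULPS/ADDPS.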
for (int i = 0; i < n; i++) {
2939
MOVSS(XMM1, fpr.V(tregs[4 * i])); // TODO: AVX broadcastss to replace this and the SHUFPS
2940
MOVSS(XMM0, fpr.V(tregs[4 * i + 1]));
2941
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0));
2942
SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0));
2943
MULPS(XMM1, fpr.VS(scol[0]));
2944
MULPS(XMM0, fpr.VS(scol[1]));
2945
ADDPS(XMM1, R(XMM0));
2946
for (int j = 2; j < n; j++) {
2947
MOVSS(XMM0, fpr.V(tregs[4 * i + j]));
2948
SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0));
2949
MULPS(XMM0, fpr.VS(scol[j]));
2950
ADDPS(XMM1, R(XMM0));
2951
}
2952
// Map the D column.
2953
u8 dcol[4];
2954
GetVectorRegs(dcol, vsz, dcols[i]);
2955
#if !PPSSPP_ARCH(AMD64)
2956
fpr.MapRegsVS(dcol, vsz, MAP_DIRTY | MAP_NOINIT | MAP_NOLOCK);
2957
#else
2958
fpr.MapRegsVS(dcol, vsz, MAP_DIRTY | MAP_NOINIT);
2959
#endif
2960
MOVAPS(fpr.VS(dcol), XMM1);
2961
}
2962
if (transposeS){
2963
for (int i = 0; i < n; i++){
2964
fpr.DiscardVS(scols[i]);
2965
}
2966
}
2967
2968
#if !PPSSPP_ARCH(AMD64)
2969
fpr.ReleaseSpillLocks();
2970
#endif
2971
if (transposeDest) {
2972
u8 dcol[4][4];
2973
for (int i = 0; i < n; i++) {
2974
GetVectorRegs(dcol[i], vsz, dcols[i]);
2975
fpr.MapRegsVS(dcol[i], vsz, MAP_DIRTY);
2976
}
2977
transposeInPlace(dcol);
2978
}
2979
fpr.ReleaseSpillLocks();
2980
return;
2981
}
2982
2983
u8 sregs[16], tregs[16], dregs[16];
2984
GetMatrixRegs(sregs, sz, _VS);
2985
GetMatrixRegs(tregs, sz, _VT);
2986
GetMatrixRegs(dregs, sz, _VD);
2987
2988
// Flush SIMD.
2989
fpr.SimpleRegsV(sregs, sz, 0);
2990
fpr.SimpleRegsV(tregs, sz, 0);
2991
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
2992
2993
// Rough overlap check.
2994
bool overlap = false;
2995
if (GetMtx(_VS) == GetMtx(_VD) || GetMtx(_VT) == GetMtx(_VD)) {
2996
// Potential overlap (guaranteed for 3x3 or more).
2997
overlap = true;
2998
}
2999
3000
if (overlap) {
3001
u8 tempregs[16];
3002
for (int a = 0; a < n; a++) {
3003
for (int b = 0; b < n; b++) {
3004
MOVSS(XMM0, fpr.V(sregs[b * 4]));
3005
MULSS(XMM0, fpr.V(tregs[a * 4]));
3006
for (int c = 1; c < n; c++) {
3007
MOVSS(XMM1, fpr.V(sregs[b * 4 + c]));
3008
MULSS(XMM1, fpr.V(tregs[a * 4 + c]));
3009
ADDSS(XMM0, R(XMM1));
3010
}
3011
u8 temp = (u8) fpr.GetTempV();
3012
fpr.MapRegV(temp, MAP_NOINIT | MAP_DIRTY);
3013
MOVSS(fpr.VX(temp), R(XMM0));
3014
fpr.StoreFromRegisterV(temp);
3015
tempregs[a * 4 + b] = temp;
3016
}
3017
}
3018
for (int a = 0; a < n; a++) {
3019
for (int b = 0; b < n; b++) {
3020
u8 temp = tempregs[a * 4 + b];
3021
fpr.MapRegV(temp, 0);
3022
MOVSS(fpr.V(dregs[a * 4 + b]), fpr.VX(temp));
3023
}
3024
}
3025
} else {
3026
for (int a = 0; a < n; a++) {
3027
for (int b = 0; b < n; b++) {
3028
MOVSS(XMM0, fpr.V(sregs[b * 4]));
3029
MULSS(XMM0, fpr.V(tregs[a * 4]));
3030
for (int c = 1; c < n; c++) {
3031
MOVSS(XMM1, fpr.V(sregs[b * 4 + c]));
3032
MULSS(XMM1, fpr.V(tregs[a * 4 + c]));
3033
ADDSS(XMM0, R(XMM1));
3034
}
3035
MOVSS(fpr.V(dregs[a * 4 + b]), XMM0);
3036
}
3037
}
3038
}
3039
fpr.ReleaseSpillLocks();
3040
}
3041
3042
void Jit::Comp_Vmscl(MIPSOpcode op) {
3043
CONDITIONAL_DISABLE(VFPU_MTX_VMSCL);
3044
3045
// TODO: This op probably ignores prefixes?
3046
if (js.HasUnknownPrefix())
3047
DISABLE;
3048
3049
MatrixSize sz = GetMtxSize(op);
3050
int n = GetMatrixSide(sz);
3051
3052
u8 sregs[16], dregs[16], scale;
3053
GetMatrixRegs(sregs, sz, _VS);
3054
GetVectorRegs(&scale, V_Single, _VT);
3055
GetMatrixRegs(dregs, sz, _VD);
3056
3057
// Flush SIMD.
3058
fpr.SimpleRegsV(sregs, sz, 0);
3059
fpr.SimpleRegsV(&scale, V_Single, 0);
3060
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
3061
3062
// Move to XMM0 early, so we don't have to worry about overlap with scale.
3063
MOVSS(XMM0, fpr.V(scale));
3064
3065
// TODO: test overlap, optimize.
3066
u8 tempregs[16];
3067
for (int a = 0; a < n; a++) {
3068
for (int b = 0; b < n; b++) {
3069
u8 temp = (u8) fpr.GetTempV();
3070
fpr.MapRegV(temp, MAP_NOINIT | MAP_DIRTY);
3071
MOVSS(fpr.VX(temp), fpr.V(sregs[a * 4 + b]));
3072
MULSS(fpr.VX(temp), R(XMM0));
3073
fpr.StoreFromRegisterV(temp);
3074
tempregs[a * 4 + b] = temp;
3075
}
3076
}
3077
for (int a = 0; a < n; a++) {
3078
for (int b = 0; b < n; b++) {
3079
u8 temp = tempregs[a * 4 + b];
3080
fpr.MapRegV(temp, 0);
3081
MOVSS(fpr.V(dregs[a * 4 + b]), fpr.VX(temp));
3082
}
3083
}
3084
3085
fpr.ReleaseSpillLocks();
3086
}
3087
3088
void Jit::Comp_Vtfm(MIPSOpcode op) {
3089
CONDITIONAL_DISABLE(VFPU_MTX_VTFM);
3090
3091
// TODO: This probably ignores prefixes? Or maybe uses D?
3092
if (js.HasUnknownPrefix())
3093
DISABLE;
3094
3095
VectorSize sz = GetVecSize(op);
3096
MatrixSize msz = GetMtxSize(op);
3097
int n = GetNumVectorElements(sz);
3098
int ins = (op >> 23) & 7;
3099
3100
bool homogenous = false;
3101
if (n == ins) {
3102
n++;
3103
sz = (VectorSize)((int)(sz)+1);
3104
msz = (MatrixSize)((int)(msz)+1);
3105
homogenous = true;
3106
}
3107
// Otherwise, n should already be ins + 1.
3108
else if (n != ins + 1) {
3109
DISABLE;
3110
}
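// "Homogenous" (the vhtfm variants) means the last element of the input vector is an
// implicit 1.0, so below the final matrix term is added without a multiply.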
3111
3112
if (jo.enableVFPUSIMD) {
3113
u8 scols[4], dcol[4], tregs[4];
3114
3115
int vs = _VS;
3116
int vd = _VD;
3117
int vt = _VT; // vector!
3118
3119
// The T matrix we will address individually.
3120
GetVectorRegs(dcol, sz, vd);
3121
GetMatrixRows(vs, msz, scols);
3122
GetVectorRegs(tregs, sz, vt);
3123
for (int i = 0; i < n; i++) {
3124
fpr.StoreFromRegisterV(tregs[i]);
3125
}
3126
3127
// We need the T regs in individual regs, but they could overlap with S regs.
3128
// If that happens, we copy the T reg to a temp.
3129
auto flushConflictingTRegsToTemps = [&](u8 regs[4]) {
3130
for (int i = 0; i < n; ++i) {
3131
for (int j = 0; j < n; ++j) {
3132
if (regs[i] != tregs[j]) {
3133
continue;
3134
}
3135
3136
// They match. Let's replace this treg with a temp reg.
3137
// Note that it will spill if there's contention, unfortunately...
3138
tregs[j] = fpr.GetTempV();
3139
fpr.MapRegV(tregs[j], MAP_NOINIT);
3140
MOVSS(fpr.VX(tregs[j]), fpr.V(regs[i]));
3141
}
3142
}
3143
};
3144
3145
u8 scol[4][4];
3146
3147
// Map all of S's columns into registers.
3148
for (int i = 0; i < n; i++) {
3149
GetVectorRegs(scol[i], sz, scols[i]);
3150
flushConflictingTRegsToTemps(scol[i]);
3151
fpr.MapRegsVS(scol[i], sz, 0);
3152
}
3153
3154
// Now, work our way through the matrix, loading things as we go.
3155
// TODO: With more temp registers, can generate much more efficient code.
3156
MOVSS(XMM1, fpr.V(tregs[0])); // TODO: AVX broadcastss to replace this and the SHUFPS (but take care of temps, unless we force store them.)
3157
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(0, 0, 0, 0));
3158
MULPS(XMM1, fpr.VS(scol[0]));
3159
for (int j = 1; j < n; j++) {
3160
if (!homogenous || j != n - 1) {
3161
MOVSS(XMM0, fpr.V(tregs[j]));
3162
SHUFPS(XMM0, R(XMM0), _MM_SHUFFLE(0, 0, 0, 0));
3163
MULPS(XMM0, fpr.VS(scol[j]));
3164
ADDPS(XMM1, R(XMM0));
3165
} else {
3166
ADDPS(XMM1, fpr.VS(scol[j]));
3167
}
3168
}
3169
// Map the D column. Release first in case of overlap.
3170
for (int i = 0; i < n; i++) {
3171
fpr.ReleaseSpillLockV(scol[i], sz);
3172
}
3173
fpr.MapRegsVS(dcol, sz, MAP_DIRTY | MAP_NOINIT);
3174
MOVAPS(fpr.VS(dcol), XMM1);
3175
fpr.ReleaseSpillLocks();
3176
return;
3177
}
3178
3179
u8 sregs[16], dregs[4], tregs[4];
3180
GetMatrixRegs(sregs, msz, _VS);
3181
GetVectorRegs(tregs, sz, _VT);
3182
GetVectorRegs(dregs, sz, _VD);
3183
3184
// Flush SIMD.
3185
fpr.SimpleRegsV(sregs, msz, 0);
3186
fpr.SimpleRegsV(tregs, sz, 0);
3187
fpr.SimpleRegsV(dregs, sz, MAP_NOINIT | MAP_DIRTY);
3188
3189
// TODO: test overlap, optimize.
3190
u8 tempregs[4];
3191
for (int i = 0; i < n; i++) {
3192
MOVSS(XMM0, fpr.V(sregs[i * 4]));
3193
MULSS(XMM0, fpr.V(tregs[0]));
3194
for (int k = 1; k < n; k++)
3195
{
3196
MOVSS(XMM1, fpr.V(sregs[i * 4 + k]));
3197
if (!homogenous || k != n - 1)
3198
MULSS(XMM1, fpr.V(tregs[k]));
3199
ADDSS(XMM0, R(XMM1));
3200
}
3201
3202
u8 temp = (u8) fpr.GetTempV();
3203
fpr.MapRegV(temp, MAP_NOINIT | MAP_DIRTY);
3204
MOVSS(fpr.VX(temp), R(XMM0));
3205
fpr.StoreFromRegisterV(temp);
3206
tempregs[i] = temp;
3207
}
3208
for (int i = 0; i < n; i++) {
3209
u8 temp = tempregs[i];
3210
fpr.MapRegV(temp, 0);
3211
MOVSS(fpr.V(dregs[i]), fpr.VX(temp));
3212
}
3213
3214
fpr.ReleaseSpillLocks();
3215
}
3216
3217
void Jit::Comp_VCrs(MIPSOpcode op) {
3218
DISABLE;
3219
}
3220
3221
void Jit::Comp_VDet(MIPSOpcode op) {
3222
DISABLE;
3223
}
3224
3225
// The goal is to map (reversed byte order for clarity):
3226
// 000000AA 000000BB 000000CC 000000DD -> AABBCCDD
3227
alignas(16) static const s8 vi2xc_shuffle[16] = { 3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 };
3228
// 0000AAAA 0000BBBB 0000CCCC 0000DDDD -> AAAABBBB CCCCDDDD
3229
alignas(16) static const s8 vi2xs_shuffle[16] = { 2, 3, 6, 7, 10, 11, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1 };
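// Example (vi2c on a quad): inputs 0x12000000, 0x34000000, 0x56000000, 0x78000000 pack
// into the single word 0x78563412: each lane contributes its top byte (top 16 bits for
// vi2s, per the second shuffle).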
3230
3231
void Jit::Comp_Vi2x(MIPSOpcode op) {
3232
CONDITIONAL_DISABLE(VFPU_VEC);
3233
if (js.HasUnknownPrefix())
3234
DISABLE;
3235
3236
int bits = ((op >> 16) & 2) == 0 ? 8 : 16; // vi2uc/vi2c (0/1), vi2us/vi2s (2/3)
3237
bool unsignedOp = ((op >> 16) & 1) == 0; // vi2uc (0), vi2us (2)
3238
3239
// These instructions pack pairs or quads of integers into 32 bits.
3240
// The unsigned (u) versions skip the sign bit when packing.
3241
3242
VectorSize sz = GetVecSize(op);
3243
VectorSize outsize;
3244
if (bits == 8) {
3245
outsize = V_Single;
3246
if (sz != V_Quad) {
3247
DISABLE;
3248
}
3249
} else {
3250
switch (sz) {
3251
case V_Pair:
3252
outsize = V_Single;
3253
break;
3254
case V_Quad:
3255
outsize = V_Pair;
3256
break;
3257
default:
3258
DISABLE;
3259
}
3260
}
3261
3262
u8 sregs[4], dregs[4];
3263
GetVectorRegsPrefixS(sregs, sz, _VS);
3264
GetVectorRegsPrefixD(dregs, outsize, _VD);
3265
3266
// Flush SIMD.
3267
fpr.SimpleRegsV(sregs, sz, 0);
3268
fpr.SimpleRegsV(dregs, outsize, MAP_NOINIT | MAP_DIRTY);
3269
3270
// First, let's assemble the sregs into lanes of a single xmm reg.
3271
// For quad inputs, we need somewhere for the bottom regs. Ideally dregs[0].
3272
X64Reg dst0 = XMM0;
3273
if (sz == V_Quad) {
3274
int vreg = dregs[0];
3275
if (!IsOverlapSafeAllowS(dregs[0], 0, 4, sregs)) {
3276
// Will be discarded on release.
3277
vreg = fpr.GetTempV();
3278
}
3279
fpr.MapRegV(vreg, vreg == sregs[0] ? MAP_DIRTY : MAP_NOINIT);
3280
fpr.SpillLockV(vreg);
3281
dst0 = fpr.VX(vreg);
3282
} else {
3283
// Pair, let's check if we should use dregs[0] directly. No temp needed.
3284
int vreg = dregs[0];
3285
if (IsOverlapSafeAllowS(dregs[0], 0, 2, sregs)) {
3286
fpr.MapRegV(vreg, vreg == sregs[0] ? MAP_DIRTY : MAP_NOINIT);
3287
fpr.SpillLockV(vreg);
3288
dst0 = fpr.VX(vreg);
3289
}
3290
}
3291
3292
if (!fpr.V(sregs[0]).IsSimpleReg(dst0)) {
3293
MOVSS(dst0, fpr.V(sregs[0]));
3294
}
3295
MOVSS(XMM1, fpr.V(sregs[1]));
3296
// With this, we have the lower half in dst0.
3297
PUNPCKLDQ(dst0, R(XMM1));
3298
if (sz == V_Quad) {
3299
MOVSS(XMM0, fpr.V(sregs[2]));
3300
MOVSS(XMM1, fpr.V(sregs[3]));
3301
PUNPCKLDQ(XMM0, R(XMM1));
3302
// Now we need to combine XMM0 into dst0.
3303
PUNPCKLQDQ(dst0, R(XMM0));
3304
} else {
3305
// Otherwise, we need to zero out the top 2.
3306
// We expect XMM1 to be zero below.
3307
PXOR(XMM1, R(XMM1));
3308
PUNPCKLQDQ(dst0, R(XMM1));
3309
}
3310
3311
// For "u" type ops, we clamp to zero and shift off the sign bit first.
3312
if (unsignedOp) {
3313
if (cpu_info.bSSE4_1) {
3314
if (sz == V_Quad) {
3315
// Zeroed in the other case above.
3316
PXOR(XMM1, R(XMM1));
3317
}
3318
PMAXSD(dst0, R(XMM1));
3319
PSLLD(dst0, 1);
3320
} else {
3321
// Get a mask of the sign bit in dst0, then AND in the shifted values. This clamps negatives to 0.
3322
MOVDQA(XMM1, R(dst0));
3323
PSRAD(dst0, 31);
3324
PSLLD(XMM1, 1);
3325
PANDN(dst0, R(XMM1));
3326
}
3327
}
3328
3329
// At this point, everything is aligned in the high bits of our lanes.
3330
if (cpu_info.bSSSE3) {
3331
if (RipAccessible(vi2xc_shuffle)) {
3332
PSHUFB(dst0, bits == 8 ? M(vi2xc_shuffle) : M(vi2xs_shuffle)); // rip accessible
3333
} else {
3334
MOV(PTRBITS, R(TEMPREG), bits == 8 ? ImmPtr(vi2xc_shuffle) : ImmPtr(vi2xs_shuffle));
3335
PSHUFB(dst0, MatR(TEMPREG));
3336
}
3337
} else {
3338
// Let's *arithmetically* shift in the sign so we can use saturating packs.
3339
PSRAD(dst0, 32 - bits);
3340
// XMM1 used for the high part just so there's no dependency. It contains garbage or 0.
3341
PACKSSDW(dst0, R(XMM1));
3342
if (bits == 8) {
3343
PACKSSWB(dst0, R(XMM1));
3344
}
3345
}
3346
3347
if (!fpr.V(dregs[0]).IsSimpleReg(dst0)) {
3348
MOVSS(fpr.V(dregs[0]), dst0);
3349
}
3350
if (outsize == V_Pair) {
3351
fpr.MapRegV(dregs[1], MAP_NOINIT | MAP_DIRTY);
3352
MOVDQA(fpr.V(dregs[1]), dst0);
3353
// Shift out the lower result to get the result we want.
3354
PSRLDQ(fpr.VX(dregs[1]), 4);
3355
}
3356
3357
ApplyPrefixD(dregs, outsize);
3358
fpr.ReleaseSpillLocks();
3359
}
3360
3361
alignas(16) static const float vavg_table[4] = { 1.0f, 1.0f / 2.0f, 1.0f / 3.0f, 1.0f / 4.0f };
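// vfad just sums the lanes; vavg also multiplies by 1/n, hence this reciprocal table
// indexed by n - 1.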
3362
3363
void Jit::Comp_Vhoriz(MIPSOpcode op) {
3364
CONDITIONAL_DISABLE(VFPU_VEC);
3365
3366
if (js.HasUnknownPrefix())
3367
DISABLE;
3368
3369
VectorSize sz = GetVecSize(op);
3370
int n = GetNumVectorElements(sz);
3371
3372
u8 sregs[4], dregs[1];
3373
GetVectorRegsPrefixS(sregs, sz, _VS);
3374
GetVectorRegsPrefixD(dregs, V_Single, _VD);
3375
if (fpr.TryMapDirtyInVS(dregs, V_Single, sregs, sz)) {
3376
if (cpu_info.bSSE4_1) {
3377
MOV(PTRBITS, R(TEMPREG), ImmPtr(&oneOneOneOne));
3378
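// DPPS against a vector of ones is a horizontal sum: the high nibble of the immediate
// selects which source lanes participate (0x3_ = two, 0x7_ = three, 0xF_ = all four)
// and the low nibble (_1) writes the sum to lane 0 only.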
switch (sz) {
3379
case V_Pair:
3380
MOVAPS(XMM0, fpr.VS(sregs));
3381
DPPS(XMM0, MatR(TEMPREG), 0x31);
3382
MOVAPS(fpr.VSX(dregs), R(XMM0));
3383
break;
3384
case V_Triple:
3385
MOVAPS(XMM0, fpr.VS(sregs));
3386
DPPS(XMM0, MatR(TEMPREG), 0x71);
3387
MOVAPS(fpr.VSX(dregs), R(XMM0));
3388
break;
3389
case V_Quad:
3390
XORPS(XMM1, R(XMM1));
3391
MOVAPS(XMM0, fpr.VS(sregs));
3392
DPPS(XMM0, MatR(TEMPREG), 0xF1);
3393
// In every other case, +0.0 is selected by the mask and added.
3394
// But, here we need to manually add it to the result.
3395
ADDPS(XMM0, R(XMM1));
3396
MOVAPS(fpr.VSX(dregs), R(XMM0));
3397
break;
3398
default:
3399
DISABLE;
3400
}
3401
} else {
3402
switch (sz) {
3403
case V_Pair:
3404
XORPS(XMM1, R(XMM1));
3405
MOVAPS(XMM0, fpr.VS(sregs));
3406
ADDPS(XMM1, R(XMM0));
3407
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3, 2, 1, 1));
3408
ADDPS(XMM0, R(XMM1));
3409
MOVAPS(fpr.VSX(dregs), R(XMM0));
3410
break;
3411
case V_Triple:
3412
XORPS(XMM1, R(XMM1));
3413
MOVAPS(XMM0, fpr.VS(sregs));
3414
ADDPS(XMM1, R(XMM0));
3415
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3, 2, 1, 1));
3416
ADDPS(XMM0, R(XMM1));
3417
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(3, 2, 1, 2));
3418
ADDPS(XMM0, R(XMM1));
3419
MOVAPS(fpr.VSX(dregs), R(XMM0));
3420
break;
3421
case V_Quad:
3422
XORPS(XMM1, R(XMM1));
3423
MOVAPS(XMM0, fpr.VS(sregs));
3424
// This flips the sign of any -0.000.
3425
ADDPS(XMM0, R(XMM1));
3426
MOVHLPS(XMM1, XMM0);
3427
ADDPS(XMM0, R(XMM1));
3428
MOVAPS(XMM1, R(XMM0));
3429
SHUFPS(XMM1, R(XMM1), _MM_SHUFFLE(1, 1, 1, 1));
3430
ADDPS(XMM0, R(XMM1));
3431
MOVAPS(fpr.VSX(dregs), R(XMM0));
3432
break;
3433
default:
3434
DISABLE;
3435
}
3436
}
3437
if (((op >> 16) & 31) == 7) { // vavg
3438
MOV(PTRBITS, R(TEMPREG), ImmPtr(&vavg_table[n - 1]));
3439
MULSS(fpr.VSX(dregs), MatR(TEMPREG));
3440
}
3441
ApplyPrefixD(dregs, V_Single);
3442
fpr.ReleaseSpillLocks();
3443
return;
3444
}
3445
3446
// Flush SIMD.
3447
fpr.SimpleRegsV(sregs, sz, 0);
3448
fpr.SimpleRegsV(dregs, V_Single, MAP_NOINIT | MAP_DIRTY);
3449
3450
X64Reg reg = XMM0;
3451
if (IsOverlapSafe(dregs[0], 0, n, sregs)) {
3452
fpr.MapRegV(dregs[0], dregs[0] == sregs[0] ? MAP_DIRTY : MAP_NOINIT);
3453
fpr.SpillLockV(dregs[0]);
3454
reg = fpr.VX(dregs[0]);
3455
}
3456
3457
// We have to start at +0.000 in case any values are -0.000.
3458
XORPS(reg, R(reg));
3459
for (int i = 0; i < n; ++i) {
3460
ADDSS(reg, fpr.V(sregs[i]));
3461
}
3462
3463
switch ((op >> 16) & 31) {
3464
case 6: // vfad
3465
break;
3466
case 7: // vavg
3467
MOV(PTRBITS, R(TEMPREG), ImmPtr(&vavg_table[n - 1]));
3468
MULSS(reg, MatR(TEMPREG));
3469
break;
3470
}
3471
3472
if (reg == XMM0) {
3473
MOVSS(fpr.V(dregs[0]), XMM0);
3474
}
3475
3476
ApplyPrefixD(dregs, V_Single);
3477
fpr.ReleaseSpillLocks();
3478
}
3479
3480
void Jit::Comp_Viim(MIPSOpcode op) {
3481
CONDITIONAL_DISABLE(VFPU_XFER);
3482
3483
if (js.HasUnknownPrefix())
3484
DISABLE;
3485
3486
u8 dreg;
3487
GetVectorRegs(&dreg, V_Single, _VT);
3488
3489
// Flush SIMD.
3490
fpr.SimpleRegsV(&dreg, V_Single, MAP_NOINIT | MAP_DIRTY);
3491
3492
s32 imm = SignExtend16ToS32(op);
3493
FP32 fp;
3494
fp.f = (float)imm;
3495
MOV(32, R(TEMPREG), Imm32(fp.u));
3496
fpr.MapRegV(dreg, MAP_DIRTY | MAP_NOINIT);
3497
MOVD_xmm(fpr.VX(dreg), R(TEMPREG));
3498
3499
ApplyPrefixD(&dreg, V_Single);
3500
fpr.ReleaseSpillLocks();
3501
}
3502
3503
void Jit::Comp_Vfim(MIPSOpcode op) {
3504
CONDITIONAL_DISABLE(VFPU_XFER);
3505
3506
if (js.HasUnknownPrefix())
3507
DISABLE;
3508
3509
u8 dreg;
3510
GetVectorRegs(&dreg, V_Single, _VT);
3511
3512
// Flush SIMD.
3513
fpr.SimpleRegsV(&dreg, V_Single, MAP_NOINIT | MAP_DIRTY);
3514
3515
FP16 half;
3516
half.u = op & 0xFFFF;
3517
FP32 fval = half_to_float_fast5(half);
3518
MOV(32, R(TEMPREG), Imm32(fval.u));
3519
fpr.MapRegV(dreg, MAP_DIRTY | MAP_NOINIT);
3520
MOVD_xmm(fpr.VX(dreg), R(TEMPREG));
3521
3522
ApplyPrefixD(&dreg, V_Single);
3523
fpr.ReleaseSpillLocks();
3524
}
3525
3526
void Jit::CompVrotShuffle(u8 *dregs, int imm, int n, bool negSin) {
3527
char what[4] = { '0', '0', '0', '0' };
3528
if (((imm >> 2) & 3) == (imm & 3)) {
3529
for (int i = 0; i < 4; i++)
3530
what[i] = 'S';
3531
}
3532
what[(imm >> 2) & 3] = 'S';
3533
what[imm & 3] = 'C';
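// So: imm bits 2-3 pick the lane that receives sin, bits 0-1 the lane that receives
// cos, remaining lanes are zero; if the two coincide, every lane gets sin except the
// cos lane. Bit 4 (negate the sine) is handled by the caller via negSin.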
3534
3535
// TODO: shufps SIMD version
3536
3537
for (int i = 0; i < n; i++) {
3538
fpr.MapRegV(dregs[i], MAP_DIRTY | MAP_NOINIT);
3539
switch (what[i]) {
3540
case 'C': MOVSS(fpr.V(dregs[i]), XMM1); break;
3541
case 'S':
3542
MOVSS(fpr.V(dregs[i]), XMM0);
3543
if (negSin) {
3544
if (RipAccessible(&signBitLower)) {
3545
XORPS(fpr.VX(dregs[i]), M(&signBitLower)); // rip accessible
3546
} else {
3547
MOV(PTRBITS, R(TEMPREG), ImmPtr(&signBitLower));
3548
XORPS(fpr.VX(dregs[i]), MatR(TEMPREG));
3549
}
3550
}
3551
break;
3552
case '0':
3553
{
3554
XORPS(fpr.VX(dregs[i]), fpr.V(dregs[i]));
3555
break;
3556
}
3557
default:
3558
ERROR_LOG(Log::JIT, "Bad what in vrot");
3559
break;
3560
}
3561
}
3562
}
3563
3564
// Very heavily used by FF:CC
3565
void Jit::Comp_VRot(MIPSOpcode op) {
3566
CONDITIONAL_DISABLE(VFPU_VEC);
3567
if (js.HasUnknownPrefix()) {
3568
DISABLE;
3569
}
3570
if (!js.HasNoPrefix()) {
3571
// Prefixes work strangely for this, see IRCompVFPU.
3572
WARN_LOG_REPORT(Log::JIT, "vrot instruction using prefixes at %08x", GetCompilerPC());
3573
DISABLE;
3574
}
3575
3576
int vd = _VD;
3577
int vs = _VS;
3578
3579
VectorSize sz = GetVecSize(op);
3580
int n = GetNumVectorElements(sz);
3581
3582
u8 dregs[4];
3583
u8 dregs2[4];
3584
3585
MIPSOpcode nextOp = GetOffsetInstruction(1);
3586
int vd2 = -1;
3587
int imm2 = -1;
3588
if ((nextOp >> 26) == 60 && ((nextOp >> 21) & 0x1F) == 29 && _VS == MIPS_GET_VS(nextOp)) {
3589
// Pair of vrot with the same angle argument. Let's join them (can share sin/cos results).
3590
vd2 = MIPS_GET_VD(nextOp);
3591
imm2 = (nextOp >> 16) & 0x1f;
3592
// NOTICE_LOG(Log::JIT, "Joint VFPU at %08x", js.blockStart);
3593
}
3594
3595
u8 sreg;
3596
GetVectorRegs(dregs, sz, vd);
3597
if (vd2 >= 0)
3598
GetVectorRegs(dregs2, sz, vd2);
3599
GetVectorRegs(&sreg, V_Single, vs);
3600
3601
// Flush SIMD.
3602
fpr.SimpleRegsV(&sreg, V_Single, 0);
3603
3604
int imm = (op >> 16) & 0x1f;
3605
3606
gpr.FlushBeforeCall();
3607
fpr.Flush();
3608
3609
bool negSin1 = (imm & 0x10) ? true : false;
3610
3611
#if PPSSPP_ARCH(AMD64)
3612
#ifdef _WIN32
3613
LEA(64, RDX, MIPSSTATE_VAR(sincostemp));
3614
#else
3615
LEA(64, RDI, MIPSSTATE_VAR(sincostemp));
3616
#endif
3617
MOVSS(XMM0, fpr.V(sreg));
3618
ABI_CallFunction(negSin1 ? (const void *)&SinCosNegSin : (const void *)&SinCos);
3619
#else
3620
// Sigh, passing floats with cdecl isn't pretty, ends up on the stack.
3621
ABI_CallFunctionAC(negSin1 ? (const void *)&SinCosNegSin : (const void *)&SinCos, fpr.V(sreg), (uintptr_t)mips_->sincostemp);
3622
#endif
3623
3624
MOVSS(XMM0, MIPSSTATE_VAR(sincostemp[0]));
3625
MOVSS(XMM1, MIPSSTATE_VAR(sincostemp[1]));
3626
3627
CompVrotShuffle(dregs, imm, n, false);
3628
if (vd2 != -1) {
3629
// If the negsin setting differs between the two joint invocations, we need to flip the second one.
3630
bool negSin2 = (imm2 & 0x10) ? true : false;
3631
CompVrotShuffle(dregs2, imm2, n, negSin1 != negSin2);
3632
EatInstruction(nextOp);
3633
}
3634
fpr.ReleaseSpillLocks();
3635
}
3636
3637
void Jit::Comp_ColorConv(MIPSOpcode op) {
3638
CONDITIONAL_DISABLE(VFPU_VEC);
3639
if (js.HasUnknownPrefix())
3640
DISABLE;
3641
3642
int vd = _VD;
3643
int vs = _VS;
3644
3645
DISABLE;
3646
#if 0
3647
VectorSize sz = V_Quad;
3648
int n = GetNumVectorElements(sz);
3649
3650
switch ((op >> 16) & 3) {
3651
case 1:
3652
break;
3653
default:
3654
DISABLE;
3655
}
3656
3657
u8 sregs[4];
3658
u8 dregs[1];
3659
// WARNING: Prefixes.
3660
GetVectorRegs(sregs, sz, vs);
3661
GetVectorRegs(dregs, V_Pair, vd);
3662
3663
if (fpr.TryMapDirtyInVS(dregs, V_Single, sregs, sz)) {
3664
switch ((op >> 16) & 3) {
3665
case 1: // 4444
3666
{
3667
//int a = ((in >> 24) & 0xFF) >> 4;
3668
//int b = ((in >> 16) & 0xFF) >> 4;
3669
//int g = ((in >> 8) & 0xFF) >> 4;
3670
//int r = ((in)& 0xFF) >> 4;
3671
//col = (a << 12) | (b << 8) | (g << 4) | (r);
3672
//PACKUSW
3673
break;
3674
}
3675
case 2: // 5551
3676
{
3677
//int a = ((in >> 24) & 0xFF) >> 7;
3678
//int b = ((in >> 16) & 0xFF) >> 3;
3679
//int g = ((in >> 8) & 0xFF) >> 3;
3680
//int r = ((in)& 0xFF) >> 3;
3681
//col = (a << 15) | (b << 10) | (g << 5) | (r);
3682
break;
3683
}
3684
case 3: // 565
3685
{
3686
//int b = ((in >> 16) & 0xFF) >> 3;
3687
//int g = ((in >> 8) & 0xFF) >> 2;
3688
//int r = ((in)& 0xFF) >> 3;
3689
//col = (b << 11) | (g << 5) | (r);
3690
break;
3691
}
3692
}
3693
DISABLE;
3694
3695
// Flush SIMD.
3696
fpr.SimpleRegsV(&sreg, V_Pair, MAP_NOINIT | MAP_DIRTY);
3697
fpr.SimpleRegsV(&dreg, V_Pair, MAP_NOINIT | MAP_DIRTY);
3698
#endif
3699
3700
}
3701
}
3702
3703
#endif // PPSSPP_ARCH(X86) || PPSSPP_ARCH(AMD64)
3704
3705