// Copyright (c) 2012- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#include <cstdint>
#include <cstring>

#include "Common/BitScan.h"
#include "Common/File/VFS/VFS.h"
#include "Common/Math/SIMDHeaders.h"
#include "Common/StringUtils.h"
#include "Core/Reporting.h"
#include "Core/MIPS/MIPS.h"
#include "Core/MIPS/MIPSVFPUUtils.h"
#include "Core/MIPS/MIPSVFPUFallbacks.h"

#ifdef _MSC_VER
#pragma warning(disable: 4146)
#endif

union float2int {
	uint32_t i;
	float f;
};

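// Added note: as the switches below decode it, a VFPU register operand packs a
// column number in bits 0-1, a matrix number in bits 2-4, and (depending on the
// access size) a row number and/or a transpose flag in bits 5-6.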
void GetVectorRegs(u8 regs[4], VectorSize N, int vectorReg) {
	int mtx = (vectorReg >> 2) & 7;
	int col = vectorReg & 3;
	int row = 0;
	int length = 0;
	int transpose = (vectorReg>>5) & 1;

	switch (N) {
	case V_Single: transpose = 0; row=(vectorReg>>5)&3; length = 1; break;
	case V_Pair: row=(vectorReg>>5)&2; length = 2; break;
	case V_Triple: row=(vectorReg>>6)&1; length = 3; break;
	case V_Quad: row=(vectorReg>>5)&2; length = 4; break;
	default: _assert_msg_(false, "%s: Bad vector size", __FUNCTION__);
	}

	for (int i = 0; i < length; i++) {
		int index = mtx * 4;
		if (transpose)
			index += ((row+i)&3) + col*32;
		else
			index += col + ((row+i)&3)*32;
		regs[i] = index;
	}
}

void GetMatrixRegs(u8 regs[16], MatrixSize N, int matrixReg) {
	int mtx = (matrixReg >> 2) & 7;
	int col = matrixReg & 3;

	int row = 0;
	int side = 0;
	int transpose = (matrixReg >> 5) & 1;

	switch (N) {
	case M_1x1: transpose = 0; row = (matrixReg >> 5) & 3; side = 1; break;
	case M_2x2: row = (matrixReg >> 5) & 2; side = 2; break;
	case M_3x3: row = (matrixReg >> 6) & 1; side = 3; break;
	case M_4x4: row = (matrixReg >> 5) & 2; side = 4; break;
	default: _assert_msg_(false, "%s: Bad matrix size", __FUNCTION__);
	}

	for (int i = 0; i < side; i++) {
		for (int j = 0; j < side; j++) {
			int index = mtx * 4;
			if (transpose)
				index += ((row+i)&3) + ((col+j)&3)*32;
			else
				index += ((col+j)&3) + ((row+i)&3)*32;
			regs[j*4 + i] = index;
		}
	}
}

int GetMatrixName(int matrix, MatrixSize msize, int column, int row, bool transposed) {
	// TODO: Fix (?)
	int name = (matrix * 4) | (transposed << 5);
	switch (msize) {
	case M_4x4:
		if (row || column) {
			ERROR_LOG(Log::JIT, "GetMatrixName: Invalid row %i or column %i for size %i", row, column, msize);
		}
		break;

	case M_3x3:
		if (row & ~2) {
			ERROR_LOG(Log::JIT, "GetMatrixName: Invalid row %i for size %i", row, msize);
		}
		if (column & ~2) {
			ERROR_LOG(Log::JIT, "GetMatrixName: Invalid col %i for size %i", column, msize);
		}
		name |= (row << 6) | column;
		break;

	case M_2x2:
		if (row & ~2) {
			ERROR_LOG(Log::JIT, "GetMatrixName: Invalid row %i for size %i", row, msize);
		}
		if (column & ~2) {
			ERROR_LOG(Log::JIT, "GetMatrixName: Invalid col %i for size %i", column, msize);
		}
		name |= (row << 5) | column;
		break;

	default: _assert_msg_(false, "%s: Bad matrix size", __FUNCTION__);
	}

	return name;
}

int GetColumnName(int matrix, MatrixSize msize, int column, int offset) {
	return matrix * 4 + column + offset * 32;
}

int GetRowName(int matrix, MatrixSize msize, int column, int offset) {
	return 0x20 | (matrix * 4 + column + offset * 32);
}

void GetMatrixColumns(int matrixReg, MatrixSize msize, u8 vecs[4]) {
	int n = GetMatrixSide(msize);

	int col = matrixReg & 3;
	int row = (matrixReg >> 5) & 2;
	int transpose = (matrixReg >> 5) & 1;

	for (int i = 0; i < n; i++) {
		vecs[i] = (transpose << 5) | (row << 5) | (matrixReg & 0x1C) | (i + col);
	}
}

void GetMatrixRows(int matrixReg, MatrixSize msize, u8 vecs[4]) {
	int n = GetMatrixSide(msize);
	int col = matrixReg & 3;
	int row = (matrixReg >> 5) & 2;

	int swappedCol = row ? (msize == M_3x3 ? 1 : 2) : 0;
	int swappedRow = col ? 2 : 0;
	int transpose = ((matrixReg >> 5) & 1) ^ 1;

	for (int i = 0; i < n; i++) {
		vecs[i] = (transpose << 5) | (swappedRow << 5) | (matrixReg & 0x1C) | (i + swappedCol);
	}
}

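// Added note: ReadVector/WriteVector address currentMIPS->v directly. As the index
// math below implies, each of the 8 matrices occupies 16 consecutive floats and each
// column of a matrix is stored contiguously (index = mtx * 16 + col * 4 + row).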
void ReadVector(float *rd, VectorSize size, int reg) {
	int row;
	int length;
	switch (size) {
	case V_Single: rd[0] = currentMIPS->v[voffset[reg]]; return; // transpose = 0; row=(reg>>5)&3; length = 1; break;
	case V_Pair: row=(reg>>5)&2; length = 2; break;
	case V_Triple: row=(reg>>6)&1; length = 3; break;
	case V_Quad: row=(reg>>5)&2; length = 4; break;
	default: length = 0; break;
	}
	int transpose = (reg >> 5) & 1;
	const int mtx = ((reg << 2) & 0x70);
	const int col = reg & 3;
	// NOTE: We now skip the voffset lookups.
	if (transpose) {
		const int base = mtx + col;
		for (int i = 0; i < length; i++)
			rd[i] = currentMIPS->v[base + ((row + i) & 3) * 4];
	} else {
		const int base = mtx + col * 4;
		for (int i = 0; i < length; i++)
			rd[i] = currentMIPS->v[base + ((row + i) & 3)];
	}
}

void WriteVector(const float *rd, VectorSize size, int reg) {
	int row;
	int length;

	switch (size) {
	case V_Single: if (!currentMIPS->VfpuWriteMask(0)) currentMIPS->v[voffset[reg]] = rd[0]; return; // transpose = 0; row=(reg>>5)&3; length = 1; break;
	case V_Pair: row=(reg>>5)&2; length = 2; break;
	case V_Triple: row=(reg>>6)&1; length = 3; break;
	case V_Quad: row=(reg>>5)&2; length = 4; break;
	default: length = 0; break;
	}

	const int mtx = ((reg << 2) & 0x70);
	const int col = reg & 3;
	bool transpose = (reg >> 5) & 1;
	// NOTE: We now skip the voffset lookups.
	if (transpose) {
		const int base = mtx + col;
		if (currentMIPS->VfpuWriteMask() == 0) {
			for (int i = 0; i < length; i++)
				currentMIPS->v[base + ((row+i) & 3) * 4] = rd[i];
		} else {
			for (int i = 0; i < length; i++) {
				if (!currentMIPS->VfpuWriteMask(i)) {
					currentMIPS->v[base + ((row+i) & 3) * 4] = rd[i];
				}
			}
		}
	} else {
		const int base = mtx + col * 4;
		if (currentMIPS->VfpuWriteMask() == 0) {
			for (int i = 0; i < length; i++)
				currentMIPS->v[base + ((row + i) & 3)] = rd[i];
		} else {
			for (int i = 0; i < length; i++) {
				if (!currentMIPS->VfpuWriteMask(i)) {
					currentMIPS->v[base + ((row + i) & 3)] = rd[i];
				}
			}
		}
	}
}

u32 VFPURewritePrefix(int ctrl, u32 remove, u32 add) {
	u32 prefix = currentMIPS->vfpuCtrl[ctrl];
	return (prefix & ~remove) | add;
}

void ReadMatrix(float *rd, MatrixSize size, int reg) {
	int row = 0;
	int side = 0;
	int transpose = (reg >> 5) & 1;

	switch (size) {
	case M_1x1: transpose = 0; row = (reg >> 5) & 3; side = 1; break;
	case M_2x2: row = (reg >> 5) & 2; side = 2; break;
	case M_3x3: row = (reg >> 6) & 1; side = 3; break;
	case M_4x4: row = (reg >> 5) & 2; side = 4; break;
	default: side = 0; break;
	}

	int mtx = (reg >> 2) & 7;
	int col = reg & 3;

	// The voffset ordering is now integrated in these formulas,
	// eliminating a table lookup.
	const float *v = currentMIPS->v + (size_t)mtx * 16;
	if (transpose) {
		if (side == 4 && col == 0 && row == 0) {
			// Fast path: Simple 4x4 transpose. TODO: Optimize.
			for (int j = 0; j < 4; j++) {
				for (int i = 0; i < 4; i++) {
					rd[j * 4 + i] = v[i * 4 + j];
				}
			}
		} else {
			for (int j = 0; j < side; j++) {
				for (int i = 0; i < side; i++) {
					int index = ((row + i) & 3) * 4 + ((col + j) & 3);
					rd[j * 4 + i] = v[index];
				}
			}
		}
	} else {
		if (side == 4 && col == 0 && row == 0) {
			// Fast path
			memcpy(rd, v, sizeof(float) * 16);  // rd[j * 4 + i] = v[j * 4 + i];
		} else {
			for (int j = 0; j < side; j++) {
				for (int i = 0; i < side; i++) {
					int index = ((col + j) & 3) * 4 + ((row + i) & 3);
					rd[j * 4 + i] = v[index];
				}
			}
		}
	}
}

void WriteMatrix(const float *rd, MatrixSize size, int reg) {
	int mtx = (reg>>2)&7;
	int col = reg&3;

	int row;
	int side;
	int transpose = (reg >> 5) & 1;

	switch (size) {
	case M_1x1: transpose = 0; row = (reg >> 5) & 3; side = 1; break;
	case M_2x2: row = (reg >> 5) & 2; side = 2; break;
	case M_3x3: row = (reg >> 6) & 1; side = 3; break;
	case M_4x4: row = (reg >> 5) & 2; side = 4; break;
	default: side = 0;
	}

	if (currentMIPS->VfpuWriteMask() != 0) {
		ERROR_LOG_REPORT(Log::CPU, "Write mask used with vfpu matrix instruction.");
	}

	// The voffset ordering is now integrated in these formulas,
	// eliminating a table lookup.
	float *v = currentMIPS->v + (size_t)mtx * 16;
	if (transpose) {
		if (side == 4 && row == 0 && col == 0 && currentMIPS->VfpuWriteMask() == 0x0) {
			// Fast path: Simple 4x4 transpose. TODO: Optimize.
			for (int j = 0; j < side; j++) {
				for (int i = 0; i < side; i++) {
					v[i * 4 + j] = rd[j * 4 + i];
				}
			}
		} else {
			for (int j = 0; j < side; j++) {
				for (int i = 0; i < side; i++) {
					if (j != side - 1 || !currentMIPS->VfpuWriteMask(i)) {
						int index = ((row + i) & 3) * 4 + ((col + j) & 3);
						v[index] = rd[j * 4 + i];
					}
				}
			}
		}
	} else {
		if (side == 4 && row == 0 && col == 0 && currentMIPS->VfpuWriteMask() == 0x0) {
			memcpy(v, rd, sizeof(float) * 16);  // v[j * 4 + i] = rd[j * 4 + i];
		} else {
			for (int j = 0; j < side; j++) {
				for (int i = 0; i < side; i++) {
					if (j != side - 1 || !currentMIPS->VfpuWriteMask(i)) {
						int index = ((col + j) & 3) * 4 + ((row + i) & 3);
						v[index] = rd[j * 4 + i];
					}
				}
			}
		}
	}
}

int GetVectorOverlap(int vec1, VectorSize size1, int vec2, VectorSize size2) {
	// Different matrices? Can't overlap, return early.
	if (((vec1 >> 2) & 7) != ((vec2 >> 2) & 7))
		return 0;

	int n1 = GetNumVectorElements(size1);
	int n2 = GetNumVectorElements(size2);
	u8 regs1[4];
	u8 regs2[4];
	GetVectorRegs(regs1, size1, vec1);
	GetVectorRegs(regs2, size1, vec2);
	int count = 0;
	for (int i = 0; i < n1; i++) {
		for (int j = 0; j < n2; j++) {
			if (regs1[i] == regs2[j])
				count++;
		}
	}
	return count;
}

VectorSize GetHalfVectorSizeSafe(VectorSize sz) {
	switch (sz) {
	case V_Pair: return V_Single;
	case V_Quad: return V_Pair;
	default: return V_Invalid;
	}
}

VectorSize GetHalfVectorSize(VectorSize sz) {
	VectorSize res = GetHalfVectorSizeSafe(sz);
	_assert_msg_(res != V_Invalid, "%s: Bad vector size", __FUNCTION__);
	return res;
}

VectorSize GetDoubleVectorSizeSafe(VectorSize sz) {
	switch (sz) {
	case V_Single: return V_Pair;
	case V_Pair: return V_Quad;
	default: return V_Invalid;
	}
}

VectorSize GetDoubleVectorSize(VectorSize sz) {
	VectorSize res = GetDoubleVectorSizeSafe(sz);
	_assert_msg_(res != V_Invalid, "%s: Bad vector size", __FUNCTION__);
	return res;
}

VectorSize GetVectorSizeSafe(MatrixSize sz) {
	switch (sz) {
	case M_1x1: return V_Single;
	case M_2x2: return V_Pair;
	case M_3x3: return V_Triple;
	case M_4x4: return V_Quad;
	default: return V_Invalid;
	}
}

VectorSize GetVectorSize(MatrixSize sz) {
	VectorSize res = GetVectorSizeSafe(sz);
	_assert_msg_(res != V_Invalid, "%s: Bad vector size", __FUNCTION__);
	return res;
}

MatrixSize GetMatrixSizeSafe(VectorSize sz) {
	switch (sz) {
	case V_Single: return M_1x1;
	case V_Pair: return M_2x2;
	case V_Triple: return M_3x3;
	case V_Quad: return M_4x4;
	default: return M_Invalid;
	}
}

MatrixSize GetMatrixSize(VectorSize sz) {
	MatrixSize res = GetMatrixSizeSafe(sz);
	_assert_msg_(res != M_Invalid, "%s: Bad vector size", __FUNCTION__);
	return res;
}

VectorSize MatrixVectorSizeSafe(MatrixSize sz) {
	switch (sz) {
	case M_1x1: return V_Single;
	case M_2x2: return V_Pair;
	case M_3x3: return V_Triple;
	case M_4x4: return V_Quad;
	default: return V_Invalid;
	}
}

VectorSize MatrixVectorSize(MatrixSize sz) {
	VectorSize res = MatrixVectorSizeSafe(sz);
	_assert_msg_(res != V_Invalid, "%s: Bad matrix size", __FUNCTION__);
	return res;
}

int GetMatrixSideSafe(MatrixSize sz) {
	switch (sz) {
	case M_1x1: return 1;
	case M_2x2: return 2;
	case M_3x3: return 3;
	case M_4x4: return 4;
	default: return 0;
	}
}

int GetMatrixSide(MatrixSize sz) {
	int res = GetMatrixSideSafe(sz);
	_assert_msg_(res != 0, "%s: Bad matrix size", __FUNCTION__);
	return res;
}

// TODO: Optimize
MatrixOverlapType GetMatrixOverlap(int mtx1, int mtx2, MatrixSize msize) {
	int n = GetMatrixSide(msize);

	if (mtx1 == mtx2)
		return OVERLAP_EQUAL;

	u8 m1[16];
	u8 m2[16];
	GetMatrixRegs(m1, msize, mtx1);
	GetMatrixRegs(m2, msize, mtx2);

	// Simply do an exhaustive search.
	for (int x = 0; x < n; x++) {
		for (int y = 0; y < n; y++) {
			int val = m1[y * 4 + x];
			for (int a = 0; a < n; a++) {
				for (int b = 0; b < n; b++) {
					if (m2[a * 4 + b] == val) {
						return OVERLAP_PARTIAL;
					}
				}
			}
		}
	}

	return OVERLAP_NONE;
}

std::string GetVectorNotation(int reg, VectorSize size) {
	int mtx = (reg>>2)&7;
	int col = reg&3;
	int row = 0;
	int transpose = (reg>>5)&1;
	char c;
	switch (size) {
	case V_Single: transpose=0; c='S'; row=(reg>>5)&3; break;
	case V_Pair: c='C'; row=(reg>>5)&2; break;
	case V_Triple: c='C'; row=(reg>>6)&1; break;
	case V_Quad: c='C'; row=(reg>>5)&2; break;
	default: c='?'; break;
	}
	if (transpose && c == 'C') c='R';
	if (transpose)
		return StringFromFormat("%c%i%i%i", c, mtx, row, col);
	return StringFromFormat("%c%i%i%i", c, mtx, col, row);
}

std::string GetMatrixNotation(int reg, MatrixSize size) {
	int mtx = (reg>>2)&7;
	int col = reg&3;
	int row = 0;
	int transpose = (reg>>5)&1;
	char c;
	switch (size)
	{
	case M_2x2: c='M'; row=(reg>>5)&2; break;
	case M_3x3: c='M'; row=(reg>>6)&1; break;
	case M_4x4: c='M'; row=(reg>>5)&2; break;
	default: c='?'; break;
	}
	if (transpose && c=='M') c='E';
	if (transpose)
		return StringFromFormat("%c%i%i%i", c, mtx, row, col);
	return StringFromFormat("%c%i%i%i", c, mtx, col, row);
}

bool GetVFPUCtrlMask(int reg, u32 *mask) {
	switch (reg) {
	case VFPU_CTRL_SPREFIX:
	case VFPU_CTRL_TPREFIX:
		*mask = 0x000FFFFF;
		return true;
	case VFPU_CTRL_DPREFIX:
		*mask = 0x00000FFF;
		return true;
	case VFPU_CTRL_CC:
		*mask = 0x0000003F;
		return true;
	case VFPU_CTRL_INF4:
		*mask = 0xFFFFFFFF;
		return true;
	case VFPU_CTRL_RSV5:
	case VFPU_CTRL_RSV6:
	case VFPU_CTRL_REV:
		// Don't change anything, these regs are read only.
		return false;
	case VFPU_CTRL_RCX0:
	case VFPU_CTRL_RCX1:
	case VFPU_CTRL_RCX2:
	case VFPU_CTRL_RCX3:
	case VFPU_CTRL_RCX4:
	case VFPU_CTRL_RCX5:
	case VFPU_CTRL_RCX6:
	case VFPU_CTRL_RCX7:
		*mask = 0x3FFFFFFF;
		return true;
	default:
		return false;
	}
}

float Float16ToFloat32(unsigned short l)
{
	float2int f2i;

	unsigned short float16 = l;
	unsigned int sign = (float16 >> VFPU_SH_FLOAT16_SIGN) & VFPU_MASK_FLOAT16_SIGN;
	int exponent = (float16 >> VFPU_SH_FLOAT16_EXP) & VFPU_MASK_FLOAT16_EXP;
	unsigned int fraction = float16 & VFPU_MASK_FLOAT16_FRAC;

	float f;
	if (exponent == VFPU_FLOAT16_EXP_MAX)
	{
		f2i.i = sign << 31;
		f2i.i |= 255 << 23;
		f2i.i |= fraction;
		f = f2i.f;
	}
	else if (exponent == 0 && fraction == 0)
	{
		f = sign == 1 ? -0.0f : 0.0f;
	}
	else
	{
		if (exponent == 0)
		{
			do
			{
				fraction <<= 1;
				exponent--;
			}
			while (!(fraction & (VFPU_MASK_FLOAT16_FRAC + 1)));

			fraction &= VFPU_MASK_FLOAT16_FRAC;
		}

		/* Convert to 32-bit single-precision IEEE754. */
		f2i.i = sign << 31;
		f2i.i |= (exponent + 112) << 23;
		f2i.i |= fraction << 13;
		f = f2i.f;
	}
	return f;
}

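// Added sanity-check example (assuming the VFPU_* constants above describe the usual
// IEEE half-float field layout): Float16ToFloat32(0x3C00) decodes to sign 0,
// exponent 15, fraction 0, so f2i.i becomes (15 + 112) << 23 = 0x3F800000, i.e. 1.0f.
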
// Reference C++ version.
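// Added outline (inferred from the code below, not an authoritative spec): each
// product a[i]*b[i] is split into sign/exponent/mantissa, every mantissa is aligned
// to the largest product exponent, and the sum is accumulated in integer arithmetic
// with only EXTRA_BITS of guard precision, discarding low-order bits much like the
// PSP hardware does.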
static float vfpu_dot_cpp(const float a[4], const float b[4]) {
	static const int EXTRA_BITS = 2;
	float2int result;
	float2int src[2];

	int32_t exps[4];
	int32_t mants[4];
	int32_t signs[4];
	int32_t max_exp = 0;
	int32_t last_inf = -1;

	for (int i = 0; i < 4; i++) {
		src[0].f = a[i];
		src[1].f = b[i];

		int32_t aexp = get_uexp(src[0].i);
		int32_t bexp = get_uexp(src[1].i);
		int32_t amant = get_mant(src[0].i) << EXTRA_BITS;
		int32_t bmant = get_mant(src[1].i) << EXTRA_BITS;

		exps[i] = aexp + bexp - 127;
		if (aexp == 255) {
			// INF * 0 = NAN
			if ((src[0].i & 0x007FFFFF) != 0 || bexp == 0) {
				result.i = 0x7F800001;
				return result.f;
			}
			mants[i] = get_mant(0) << EXTRA_BITS;
			exps[i] = 255;
		} else if (bexp == 255) {
			if ((src[1].i & 0x007FFFFF) != 0 || aexp == 0) {
				result.i = 0x7F800001;
				return result.f;
			}
			mants[i] = get_mant(0) << EXTRA_BITS;
			exps[i] = 255;
		} else {
			// TODO: Adjust precision?
			uint64_t adjust = (uint64_t)amant * (uint64_t)bmant;
			mants[i] = (adjust >> (23 + EXTRA_BITS)) & 0x7FFFFFFF;
		}
		signs[i] = get_sign(src[0].i) ^ get_sign(src[1].i);

		if (exps[i] > max_exp) {
			max_exp = exps[i];
		}
		if (exps[i] >= 255) {
			// Infinity minus infinity is not a real number.
			if (last_inf != -1 && signs[i] != last_inf) {
				result.i = 0x7F800001;
				return result.f;
			}
			last_inf = signs[i];
		}
	}

	int32_t mant_sum = 0;
	for (int i = 0; i < 4; i++) {
		int exp = max_exp - exps[i];
		if (exp >= 32) {
			mants[i] = 0;
		} else {
			mants[i] >>= exp;
		}
		if (signs[i]) {
			mants[i] = -mants[i];
		}
		mant_sum += mants[i];
	}

	uint32_t sign_sum = 0;
	if (mant_sum < 0) {
		sign_sum = 0x80000000;
		mant_sum = -mant_sum;
	}

	// Truncate off the extra bits now. We want to zero them for rounding purposes.
	mant_sum >>= EXTRA_BITS;

	if (mant_sum == 0 || max_exp <= 0) {
		return 0.0f;
	}

	int8_t shift = (int8_t)clz32_nonzero(mant_sum) - 8;
	if (shift < 0) {
		// Round to even if we'd shift away a 0.5.
		const uint32_t round_bit = 1 << (-shift - 1);
		if ((mant_sum & round_bit) && (mant_sum & (round_bit << 1))) {
			mant_sum += round_bit;
			shift = (int8_t)clz32_nonzero(mant_sum) - 8;
		} else if ((mant_sum & round_bit) && (mant_sum & (round_bit - 1))) {
			mant_sum += round_bit;
			shift = (int8_t)clz32_nonzero(mant_sum) - 8;
		}
		mant_sum >>= -shift;
		max_exp += -shift;
	} else {
		mant_sum <<= shift;
		max_exp -= shift;
	}
	_dbg_assert_msg_((mant_sum & 0x00800000) != 0, "Mantissa wrong: %08x", mant_sum);

	if (max_exp >= 255) {
		max_exp = 255;
		mant_sum = 0;
	} else if (max_exp <= 0) {
		return 0.0f;
	}

	result.i = sign_sum | (max_exp << 23) | (mant_sum & 0x007FFFFF);
	return result.f;
}

#if PPSSPP_ARCH(SSE2)

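// Added note: mulhi32x4 returns the high 32 bits of each unsigned 32x32-bit lane
// product. SSE2's _mm_mul_epu32 only multiplies the even lanes, so the odd lanes are
// shuffled into even positions, multiplied separately, and the results interleaved.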
static inline __m128i mulhi32x4(__m128i a, __m128i b) {
	__m128i m02 = _mm_mul_epu32(a, b);
	__m128i m13 = _mm_mul_epu32(
		_mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 1, 1)),
		_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)));
	__m128i m = _mm_unpacklo_epi32(
		_mm_shuffle_epi32(m02, _MM_SHUFFLE(3, 2, 3, 1)),
		_mm_shuffle_epi32(m13, _MM_SHUFFLE(3, 2, 3, 1)));
	return m;
}

// Values of rounding_mode:
// -1 - detect at runtime
//  0 - assume round-to-nearest-ties-to-even
//  1 - round yourself in integer math
template<int rounding_mode=-1>
static float vfpu_dot_sse2(const float a[4], const float b[4])
{
	static const int EXTRA_BITS = 2;

	bool is_default_rounding_mode = (rounding_mode == 0);
	if(rounding_mode == -1)
	{
		volatile float test05 = 5.9604644775390625e-08f; // 0.5*2^-23
		volatile float test15 = 1.78813934326171875e-07f; // 1.5*2^-23
		const float res15 = 1.0000002384185791015625f; // 1+2^-22
		test05 += 1.0f;
		test15 += 1.0f;
		is_default_rounding_mode = (test05 == 1.0f && test15 == res15);
	}
	__m128 A = _mm_loadu_ps(a);
	__m128 B = _mm_loadu_ps(b);
	// Extract exponents.
	__m128 exp_mask = _mm_castsi128_ps(_mm_set1_epi32(0x7F800000));
	__m128 eA = _mm_and_ps(A, exp_mask);
	__m128 eB = _mm_and_ps(B, exp_mask);
	__m128i exps = _mm_srli_epi32(_mm_add_epi32(
		_mm_castps_si128(eA),
		_mm_castps_si128(eB)), 23);
	// Find maximum exponent, stored as float32 in [1;2),
	// so we can use _mm_max_ps() with normal arguments.
	__m128 t = _mm_or_ps(_mm_castsi128_ps(exps), _mm_set1_ps(1.0f));
	t = _mm_max_ps(t, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(t), _MM_SHUFFLE(2, 3, 0, 1))));
	t = _mm_max_ps(t, _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(t), _MM_SHUFFLE(1, 0, 3, 2))));
	t = _mm_max_ps(t, _mm_castsi128_ps(_mm_set1_epi32(0x3F80007F)));
	int32_t mexp = _mm_cvtsi128_si32(_mm_castps_si128(t)) & 511;
	// NOTE: mexp is doubly-biased, same for exps.
	int32_t max_exp = mexp - 127;
	// Fall back on anything weird.
	__m128 finiteA = _mm_sub_ps(A, A);
	__m128 finiteB = _mm_sub_ps(B, B);
	finiteA = _mm_cmpeq_ps(finiteA, finiteA);
	finiteB = _mm_cmpeq_ps(finiteB, finiteB);
	if(max_exp >= 255 || _mm_movemask_ps(_mm_and_ps(finiteA, finiteB)) != 15) return vfpu_dot_cpp(a, b);
	// Extract significands.
	__m128i mA = _mm_or_si128(_mm_and_si128(_mm_castps_si128(A), _mm_set1_epi32(0x007FFFFF)), _mm_set1_epi32(0x00800000));
	__m128i mB = _mm_or_si128(_mm_and_si128(_mm_castps_si128(B), _mm_set1_epi32(0x007FFFFF)), _mm_set1_epi32(0x00800000));
	// Multiply.
	// NOTE: vfpu_dot does multiplication as
	// ((x<<EXTRA_BITS)*(y<<EXTRA_BITS))>>(23+EXTRA_BITS),
	// here we do (x*y)>>(23-EXTRA_BITS-1),
	// which produces twice the result (neither expression
	// overflows in our case). We need that because our
	// variable-shift scheme (below) must shift by at least 1 bit.
	static const int s = 32 - (23 - EXTRA_BITS - 1), s0 = s / 2, s1 = s - s0;
	// We compute ((x*y)>>shift) as
	// (((x*y)<<(32-shift))>>32), which we express as
	// (((x<<s0)*(y<<s1))>>32) (neither shift overflows).
	__m128i m = mulhi32x4(_mm_slli_epi32(mA, s0), _mm_slli_epi32(mB, s1));
	// Shift according to max_exp. Since SSE2 doesn't have
	// variable per-lane shifts, we multiply *again*:
	// x>>y turns into (x*(1<<(32-y)))>>32.
	// We compute 1<<(32-y) using floating-point casts.
	// NOTE: the cast for 1<<31 produces the correct value,
	// since the _mm_cvttps_epi32 error code just happens
	// to be 0x80000000.
	// So (since we pre-multiplied m by 2), we need
	// (m>>1)>>(mexp-exps),
	// i.e. m>>(mexp+1-exps),
	// i.e. (m<<(32-(mexp+1-exps)))>>32,
	// i.e. (m<<(exps-(mexp-31)))>>32.
	__m128i amounts = _mm_sub_epi32(exps, _mm_set1_epi32(mexp - 31));
	// Clamp by 0. Both zero and negative amounts produce zero,
	// since they correspond to right-shifting by 32 or more bits.
	amounts = _mm_and_si128(amounts, _mm_cmpgt_epi32(amounts, _mm_set1_epi32(0)));
	// Set up multipliers.
	__m128i bits = _mm_add_epi32(_mm_set1_epi32(0x3F800000), _mm_slli_epi32(amounts, 23));
	__m128i muls = _mm_cvttps_epi32(_mm_castsi128_ps(bits));
	m = mulhi32x4(m, muls);
	// Extract signs.
	__m128i signs = _mm_cmpgt_epi32(
		_mm_set1_epi32(0),
		_mm_xor_si128(_mm_castps_si128(A), _mm_castps_si128(B)));
	// Apply signs to m.
	m = _mm_sub_epi32(_mm_xor_si128(m, signs), signs);
	// Horizontal sum.
	// See https://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-sse-vector-sum-or-other-reduction
	__m128i h64 = _mm_shuffle_epi32(m, _MM_SHUFFLE(1, 0, 3, 2));
	__m128i s64 = _mm_add_epi32(h64, m);
	__m128i h32 = _mm_shufflelo_epi16(s64, _MM_SHUFFLE(1, 0, 3, 2));
	__m128i s32 = _mm_add_epi32(s64, h32);
	int32_t mant_sum = _mm_cvtsi128_si32(s32);

	// The rest is scalar.
	uint32_t sign_sum = 0;
	if (mant_sum < 0) {
		sign_sum = 0x80000000;
		mant_sum = -mant_sum;
	}

	// Truncate off the extra bits now. We want to zero them for rounding purposes.
	mant_sum >>= EXTRA_BITS;

	if (mant_sum == 0 || max_exp <= 0) {
		return 0.0f;
	}

	if(is_default_rounding_mode)
	{
		float2int r;
		r.f = (float)mant_sum;
		mant_sum = (r.i & 0x007FFFFF) | 0x00800000;
		max_exp += (r.i >> 23) - 0x96;
	}
	else
	{
		int8_t shift = (int8_t)clz32_nonzero(mant_sum) - 8;
		if (shift < 0) {
			// Round to even if we'd shift away a 0.5.
			const uint32_t round_bit = 1 << (-shift - 1);
			if ((mant_sum & round_bit) && (mant_sum & (round_bit << 1))) {
				mant_sum += round_bit;
				shift = (int8_t)clz32_nonzero(mant_sum) - 8;
			} else if ((mant_sum & round_bit) && (mant_sum & (round_bit - 1))) {
				mant_sum += round_bit;
				shift = (int8_t)clz32_nonzero(mant_sum) - 8;
			}
			mant_sum >>= -shift;
			max_exp += -shift;
		} else {
			mant_sum <<= shift;
			max_exp -= shift;
		}
		_dbg_assert_msg_((mant_sum & 0x00800000) != 0, "Mantissa wrong: %08x", mant_sum);
	}

	if (max_exp >= 255) {
		max_exp = 255;
		mant_sum = 0;
	} else if (max_exp <= 0) {
		return 0.0f;
	}

	float2int result;
	result.i = sign_sum | (max_exp << 23) | (mant_sum & 0x007FFFFF);
	return result.f;
}

#endif // PPSSPP_ARCH(SSE2)

float vfpu_dot(const float a[4], const float b[4]) {
#if PPSSPP_ARCH(SSE2)
	return vfpu_dot_sse2(a, b);
#else
	return vfpu_dot_cpp(a, b);
#endif
}

//==============================================================================
// The code below attempts to exactly match the behaviour of the PSP's vrnd
// instructions. See the investigation starting around
// https://github.com/hrydgard/ppsspp/issues/16946#issuecomment-1467261209
// for details.

// Redundant currently, since MIPSState::Init() already
// does this on its own, but left as-is to be self-contained.
void vrnd_init_default(uint32_t *rcx) {
	rcx[0] = 0x00000001;
	rcx[1] = 0x00000002;
	rcx[2] = 0x00000004;
	rcx[3] = 0x00000008;
	rcx[4] = 0x00000000;
	rcx[5] = 0x00000000;
	rcx[6] = 0x00000000;
	rcx[7] = 0x00000000;
}

void vrnd_init(uint32_t seed, uint32_t *rcx) {
	for(int i = 0; i < 8; ++i) rcx[i] =
		0x3F800000u |                          // 1.0f mask.
		((seed >> ((i / 4) * 16)) & 0xFFFFu) | // lower or upper half of the seed.
		(((seed >> (4 * i)) & 0xF) << 16);     // remaining nibble.

}

uint32_t vrnd_generate(uint32_t *rcx) {
	// The actual RNG state appears to be 5 parts
	// (32-bit each) stored into the registers as follows:
	uint32_t A = (rcx[0] & 0xFFFFu) | (rcx[4] << 16);
	uint32_t B = (rcx[1] & 0xFFFFu) | (rcx[5] << 16);
	uint32_t C = (rcx[2] & 0xFFFFu) | (rcx[6] << 16);
	uint32_t D = (rcx[3] & 0xFFFFu) | (rcx[7] << 16);
	uint32_t E = (((rcx[0] >> 16) & 0xF) << 0) |
		(((rcx[1] >> 16) & 0xF) << 4) |
		(((rcx[2] >> 16) & 0xF) << 8) |
		(((rcx[3] >> 16) & 0xF) << 12) |
		(((rcx[4] >> 16) & 0xF) << 16) |
		(((rcx[5] >> 16) & 0xF) << 20) |
		(((rcx[6] >> 16) & 0xF) << 24) |
		(((rcx[7] >> 16) & 0xF) << 28);
	// Update.
	// LCG with classic parameters.
	A = 69069u * A + 1u; // NOTE: decimal constants.
	// Xorshift, with classic parameters. Algorithm "xor" from p. 4 of Marsaglia, "Xorshift RNGs".
	B ^= B << 13;
	B ^= B >> 17;
	B ^= B << 5;
	// Sequence similar to Pell numbers ( https://en.wikipedia.org/wiki/Pell_number ),
	// except with different starting values, and an occasional increment (E).
	uint32_t t = 2u * D + C + E;
	// NOTE: the details of how E-part is set are somewhat of a guess
	// at the moment. The expression below looks weird, but does match
	// the available test data.
	E = uint32_t((uint64_t(C) + uint64_t(D >> 1) + uint64_t(E)) >> 32);
	C = D;
	D = t;
	// Store.
	rcx[0] = 0x3F800000u | (((E >> 0) & 0xF) << 16) | (A & 0xFFFFu);
	rcx[1] = 0x3F800000u | (((E >> 4) & 0xF) << 16) | (B & 0xFFFFu);
	rcx[2] = 0x3F800000u | (((E >> 8) & 0xF) << 16) | (C & 0xFFFFu);
	rcx[3] = 0x3F800000u | (((E >> 12) & 0xF) << 16) | (D & 0xFFFFu);
	rcx[4] = 0x3F800000u | (((E >> 16) & 0xF) << 16) | (A >> 16);
	rcx[5] = 0x3F800000u | (((E >> 20) & 0xF) << 16) | (B >> 16);
	rcx[6] = 0x3F800000u | (((E >> 24) & 0xF) << 16) | (C >> 16);
	rcx[7] = 0x3F800000u | (((E >> 28) & 0xF) << 16) | (D >> 16);
	// Return value.
	return A + B + D;
}

//==============================================================================
// The code below attempts to exactly match the output of several of the PSP's
// VFPU functions. To keep the lookup tables small, the code is somewhat gnarly.
// The lookup tables sometimes store deltas from (explicitly computable) estimates,
// so that they fit in smaller integer types.
// See https://github.com/hrydgard/ppsspp/issues/16946 for details.

// Lookup tables.
// Note: these are never unloaded, and stay till program termination.
static uint32_t *vfpu_sin_lut8192=nullptr;
static int8_t (*vfpu_sin_lut_delta)[2]=nullptr;
static int16_t *vfpu_sin_lut_interval_delta=nullptr;
static uint8_t *vfpu_sin_lut_exceptions=nullptr;

static int8_t (*vfpu_sqrt_lut)[2]=nullptr;

static int8_t (*vfpu_rsqrt_lut)[2]=nullptr;

static uint32_t *vfpu_exp2_lut65536=nullptr;
static uint8_t (*vfpu_exp2_lut)[2]=nullptr;

static uint32_t *vfpu_log2_lut65536=nullptr;
static uint32_t *vfpu_log2_lut65536_quadratic=nullptr;
static uint8_t (*vfpu_log2_lut)[131072][2]=nullptr;

static int32_t (*vfpu_asin_lut65536)[3]=nullptr;
static uint64_t *vfpu_asin_lut_deltas=nullptr;
static uint16_t *vfpu_asin_lut_indices=nullptr;

static int8_t (*vfpu_rcp_lut)[2]=nullptr;

template<typename T>
static inline bool load_vfpu_table(T *&ptr, const char *filename, size_t expected_size) {
#if COMMON_BIG_ENDIAN
	// Tables are little-endian.
#error Byteswap for VFPU tables not implemented
#endif
	if (ptr) return true; // Already loaded.
	size_t size = 0u;
	INFO_LOG(Log::CPU, "Loading '%s'...", filename);
	ptr = reinterpret_cast<decltype(&*ptr)>(g_VFS.ReadFile(filename, &size));
	if (!ptr || size != expected_size) {
		ERROR_LOG(Log::CPU, "Error loading '%s' (size=%u, expected: %u)", filename, (unsigned)size, (unsigned)expected_size);
		delete[] ptr;
		ptr = nullptr;
		return false;
	}
	INFO_LOG(Log::CPU, "Successfully loaded '%s'", filename);
	return true;
}

#define LOAD_TABLE(name, expected_size)\
	load_vfpu_table(name,"vfpu/" #name ".dat",expected_size)

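// Added note: LOAD_TABLE(vfpu_sin_lut8192, 4100) expands to
// load_vfpu_table(vfpu_sin_lut8192, "vfpu/vfpu_sin_lut8192.dat", 4100),
// i.e. each table is read through g_VFS and must match its expected size exactly.
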
// Note: PSP sin/cos output only has 22 significant
// binary digits.
static inline uint32_t vfpu_sin_quantum(uint32_t x) {
	return x < 1u << 22?
		1u:
		1u << (32 - 22 - clz32_nonzero(x));
}

static inline uint32_t vfpu_sin_truncate_bits(u32 x) {
	return x & -vfpu_sin_quantum(x);
}

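// Added note (inferred from the callers below): arg is the reduced angle as a 1.23
// fixed-point value in [0, 1] in the VFPU's angle units (where 1.0 is a quarter
// turn, so 0x00800000 == 1.0), and the result is sin scaled by 2^28
// (0x10000000 == 1.0).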
static inline uint32_t vfpu_sin_fixed(uint32_t arg) {
	// Handle endpoints.
	if(arg == 0u) return 0u;
	if(arg == 0x00800000) return 0x10000000;
	// Get endpoints for 8192-wide interval.
	uint32_t L = vfpu_sin_lut8192[(arg >> 13) + 0];
	uint32_t H = vfpu_sin_lut8192[(arg >> 13) + 1];
	// Approximate endpoints for 64-wide interval via lerp.
	uint32_t A = L+(((H - L)*(((arg >> 6) & 127) + 0)) >> 7);
	uint32_t B = L+(((H - L)*(((arg >> 6) & 127) + 1)) >> 7);
	// Adjust endpoints from deltas, and increase working precision.
	uint64_t a = (uint64_t(A) << 5) + uint64_t(vfpu_sin_lut_delta[arg >> 6][0]) * vfpu_sin_quantum(A);
	uint64_t b = (uint64_t(B) << 5) + uint64_t(vfpu_sin_lut_delta[arg >> 6][1]) * vfpu_sin_quantum(B);
	// Compute approximation via lerp. Is off by at most 1 quantum.
	uint32_t v = uint32_t(((a * (64 - (arg & 63)) + b * (arg & 63)) >> 6) >> 5);
	v = vfpu_sin_truncate_bits(v);
	// Look up exceptions via binary search.
	// Note: vfpu_sin_lut_interval_delta stores
	// deltas from interval estimation.
	uint32_t lo = ((169u * ((arg >> 7) + 0)) >> 7)+uint32_t(vfpu_sin_lut_interval_delta[(arg >> 7) + 0]) + 16384u;
	uint32_t hi = ((169u * ((arg >> 7) + 1)) >> 7)+uint32_t(vfpu_sin_lut_interval_delta[(arg >> 7) + 1]) + 16384u;
	while(lo < hi) {
		uint32_t m = (lo + hi) / 2;
		// Note: vfpu_sin_lut_exceptions stores
		// index&127 (for each initial interval the
		// upper bits of index are the same, namely
		// arg&-128), plus direction (0 for +1, and
		// 128 for -1).
		uint32_t b = vfpu_sin_lut_exceptions[m];
		uint32_t e = (arg & -128u)+(b & 127u);
		if(e == arg) {
			v += vfpu_sin_quantum(v) * (b >> 7 ? -1u : +1u);
			break;
		}
		else if(e < arg) lo = m + 1;
		else hi = m;
	}
	return v;
}

float vfpu_sin(float x) {
	static bool loaded =
		LOAD_TABLE(vfpu_sin_lut8192, 4100)&&
		LOAD_TABLE(vfpu_sin_lut_delta, 262144)&&
		LOAD_TABLE(vfpu_sin_lut_interval_delta, 131074)&&
		LOAD_TABLE(vfpu_sin_lut_exceptions, 86938);
	if (!loaded)
		return vfpu_sin_fallback(x);
	uint32_t bits;
	memcpy(&bits, &x, sizeof(x));
	uint32_t sign = bits & 0x80000000u;
	uint32_t exponent = (bits >> 23) & 0xFFu;
	uint32_t significand = (bits & 0x007FFFFFu) | 0x00800000u;
	if(exponent == 0xFFu) {
		// NOTE: this bitpattern is a signaling
		// NaN on x86, so maybe just return
		// a normal qNaN?
		float y;
		bits = sign ^ 0x7F800001u;
		memcpy(&y, &bits, sizeof(y));
		return y;
	}
	if(exponent < 0x7Fu) {
		if(exponent < 0x7Fu-23u) significand = 0u;
		else significand >>= (0x7F - exponent);
	}
	else if(exponent > 0x7Fu) {
		// There is weirdness for large exponents.
		if(exponent - 0x7Fu >= 25u && exponent - 0x7Fu < 32u) significand = 0u;
		else if((exponent & 0x9Fu) == 0x9Fu) significand = 0u;
		else significand <<= (exponent - 0x7Fu);
	}
	sign ^= ((significand << 7) & 0x80000000u);
	significand &= 0x00FFFFFFu;
	if(significand > 0x00800000u) significand = 0x01000000u - significand;
	uint32_t ret = vfpu_sin_fixed(significand);
	return (sign ? -1.0f : +1.0f) * float(int32_t(ret)) * 3.7252903e-09f; // 0x1p-28f
}

float vfpu_cos(float x) {
	static bool loaded =
		LOAD_TABLE(vfpu_sin_lut8192, 4100)&&
		LOAD_TABLE(vfpu_sin_lut_delta, 262144)&&
		LOAD_TABLE(vfpu_sin_lut_interval_delta, 131074)&&
		LOAD_TABLE(vfpu_sin_lut_exceptions, 86938);
	if (!loaded)
		return vfpu_cos_fallback(x);
	uint32_t bits;
	memcpy(&bits, &x, sizeof(x));
	bits &= 0x7FFFFFFFu;
	uint32_t sign = 0u;
	uint32_t exponent = (bits >> 23) & 0xFFu;
	uint32_t significand = (bits & 0x007FFFFFu) | 0x00800000u;
	if(exponent == 0xFFu) {
		// NOTE: this bitpattern is a signaling
		// NaN on x86, so maybe just return
		// a normal qNaN?
		float y;
		bits = sign ^ 0x7F800001u;
		memcpy(&y, &bits, sizeof(y));
		return y;
	}
	if(exponent < 0x7Fu) {
		if(exponent < 0x7Fu - 23u) significand = 0u;
		else significand >>= (0x7F - exponent);
	}
	else if(exponent > 0x7Fu) {
		// There is weirdness for large exponents.
		if(exponent - 0x7Fu >= 25u && exponent - 0x7Fu < 32u) significand = 0u;
		else if((exponent & 0x9Fu) == 0x9Fu) significand = 0u;
		else significand <<= (exponent - 0x7Fu);
	}
	sign ^= ((significand << 7) & 0x80000000u);
	significand &= 0x00FFFFFFu;
	if(significand >= 0x00800000u) {
		significand = 0x01000000u - significand;
		sign ^= 0x80000000u;
	}
	uint32_t ret = vfpu_sin_fixed(0x00800000u - significand);
	return (sign ? -1.0f : +1.0f) * float(int32_t(ret)) * 3.7252903e-09f; // 0x1p-28f
}

void vfpu_sincos(float a, float &s, float &c) {
	// Just invoke both sin and cos.
	// Suboptimal but whatever.
	s = vfpu_sin(a);
	c = vfpu_cos(a);
}

// Integer square root of 2^23*x (rounded to zero).
// Input is in 2^23 <= x < 2^25, and representable in float.
static inline uint32_t isqrt23(uint32_t x) {
#if 0
	// Reference version.
	int dir = fesetround(FE_TOWARDZERO);
	uint32_t ret = uint32_t(int32_t(sqrtf(float(int32_t(x)) * 8388608.0f)));
	fesetround(dir);
	return ret;
#elif 1
	// Double version.
	// Verified to produce correct result for all valid inputs,
	// in all rounding modes, both in double and double-extended (x87)
	// precision.
	// Requires correctly-rounded sqrt (which on any IEEE-754 system
	// it should be).
	return uint32_t(int32_t(sqrt(double(x) * 8388608.0)));
#else
	// Pure integer version, if you don't like floating point.
	// Based on code from Hacker's Delight. See isqrt4 in
	// https://github.com/hcs0/Hackers-Delight/blob/master/isqrt.c.txt
	// Relatively slow.
	uint64_t t = uint64_t(x) << 23, m, y, b;
	m = 0x4000000000000000ull;
	y = 0;
	while(m != 0) // Do 32 times.
	{
		b = y | m;
		y = y >> 1;
		if(t >= b)
		{
			t = t - b;
			y = y | m;
		}
		m = m >> 2;
	}
	return y;
#endif
}

// Returns floating-point bitpattern.
static inline uint32_t vfpu_sqrt_fixed(uint32_t x) {
	// Endpoints of input.
	uint32_t lo = (x + 0u) & -64u;
	uint32_t hi = (x + 64u) & -64u;
	// Convert input to 9.23 fixed-point.
	lo = (lo >= 0x00400000u ? 4u * lo : 0x00800000u + 2u * lo);
	hi = (hi >= 0x00400000u ? 4u * hi : 0x00800000u + 2u * hi);
	// Estimate endpoints of output.
	uint32_t A = 0x3F000000u + isqrt23(lo);
	uint32_t B = 0x3F000000u + isqrt23(hi);
	// Apply deltas, and increase the working precision.
	uint64_t a = (uint64_t(A) << 4) + uint64_t(vfpu_sqrt_lut[x >> 6][0]);
	uint64_t b = (uint64_t(B) << 4) + uint64_t(vfpu_sqrt_lut[x >> 6][1]);
	uint32_t ret = uint32_t((a + (((b - a) * (x & 63)) >> 6)) >> 4);
	// Truncate lower 2 bits.
	ret &= -4u;
	return ret;
}

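// Added note: vfpu_sqrt splits the input into exponent and mantissa. vfpu_sqrt_fixed()
// handles the mantissa together with the low exponent bit (folded into the table index
// below), and half of the remaining exponent is then added back onto the result.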
float vfpu_sqrt(float x) {
	static bool loaded =
		LOAD_TABLE(vfpu_sqrt_lut, 262144);
	if (!loaded)
		return vfpu_sqrt_fallback(x);
	uint32_t bits;
	memcpy(&bits, &x, sizeof(bits));
	if((bits & 0x7FFFFFFFu) <= 0x007FFFFFu) {
		// Denormals (and zeroes) get +0, regardless
		// of sign.
		return +0.0f;
	}
	if(bits >> 31) {
		// Other negatives get NaN.
		bits = 0x7F800001u;
		memcpy(&x, &bits, sizeof(x));
		return x;
	}
	if((bits >> 23) == 255u) {
		// Inf/NaN gets Inf/NaN.
		bits = 0x7F800000u + ((bits & 0x007FFFFFu) != 0u);
		memcpy(&x, &bits, sizeof(bits));
		return x;
	}
	int32_t exponent = int32_t(bits >> 23) - 127;
	// Bottom bit of exponent (inverted) + significand (except bottom bit).
	uint32_t index = ((bits + 0x00800000u) >> 1) & 0x007FFFFFu;
	bits = vfpu_sqrt_fixed(index);
	bits += uint32_t(exponent >> 1) << 23;
	memcpy(&x, &bits, sizeof(bits));
	return x;
}

// Returns floor(2^33/sqrt(x)), for 2^22 <= x < 2^24.
static inline uint32_t rsqrt_floor22(uint32_t x) {
#if 1
	// Verified correct in all rounding directions,
	// by exhaustive search.
	return uint32_t(8589934592.0 / sqrt(double(x))); // 0x1p33
#else
	// Pure integer version, if you don't like floating point.
	// Based on code from Hacker's Delight. See isqrt4 in
	// https://github.com/hcs0/Hackers-Delight/blob/master/isqrt.c.txt
	// Relatively slow.
	uint64_t t = uint64_t(x) << 22, m, y, b;
	m = 0x4000000000000000ull;
	y = 0;
	while(m != 0) // Do 32 times.
	{
		b = y | m;
		y = y >> 1;
		if(t >= b)
		{
			t = t - b;
			y = y | m;
		}
		m = m >> 2;
	}
	y = (1ull << 44) / y;
	// Decrement if y > floor(2^33 / sqrt(x)).
	// This hack works because exhaustive
	// search (on [2^22;2^24]) says it does.
	if((y * y >> 3) * x > (1ull << 63) - 3ull * (((y & 7) == 6) << 21)) --y;
	return uint32_t(y);
#endif
}

// Returns floating-point bitpattern.
static inline uint32_t vfpu_rsqrt_fixed(uint32_t x) {
	// Endpoints of input.
	uint32_t lo = (x + 0u) & -64u;
	uint32_t hi = (x + 64u) & -64u;
	// Convert input to 10.22 fixed-point.
	lo = (lo >= 0x00400000u ? 2u * lo : 0x00400000u + lo);
	hi = (hi >= 0x00400000u ? 2u * hi : 0x00400000u + hi);
	// Estimate endpoints of output.
	uint32_t A = 0x3E800000u + 4u * rsqrt_floor22(lo);
	uint32_t B = 0x3E800000u + 4u * rsqrt_floor22(hi);
	// Apply deltas, and increase the working precision.
	uint64_t a = (uint64_t(A) << 4) + uint64_t(vfpu_rsqrt_lut[x >> 6][0]);
	uint64_t b = (uint64_t(B) << 4) + uint64_t(vfpu_rsqrt_lut[x >> 6][1]);
	// Evaluate via lerp.
	uint32_t ret = uint32_t((a + (((b - a) * (x & 63)) >> 6)) >> 4);
	// Truncate lower 2 bits.
	ret &= -4u;
	return ret;
}

float vfpu_rsqrt(float x) {
	static bool loaded =
		LOAD_TABLE(vfpu_rsqrt_lut, 262144);
	if (!loaded)
		return vfpu_rsqrt_fallback(x);
	uint32_t bits;
	memcpy(&bits, &x, sizeof(bits));
	if((bits & 0x7FFFFFFFu) <= 0x007FFFFFu) {
		// Denormals (and zeroes) get inf of the same sign.
		bits = 0x7F800000u | (bits & 0x80000000u);
		memcpy(&x, &bits, sizeof(x));
		return x;
	}
	if(bits >> 31) {
		// Other negatives get negative NaN.
		bits = 0xFF800001u;
		memcpy(&x, &bits, sizeof(x));
		return x;
	}
	if((bits >> 23) == 255u) {
		// inf gets 0, NaN gets NaN.
		bits = ((bits & 0x007FFFFFu) ? 0x7F800001u : 0u);
		memcpy(&x, &bits, sizeof(bits));
		return x;
	}
	int32_t exponent = int32_t(bits >> 23) - 127;
	// Bottom bit of exponent (inverted) + significand (except bottom bit).
	uint32_t index = ((bits + 0x00800000u) >> 1) & 0x007FFFFFu;
	bits = vfpu_rsqrt_fixed(index);
	bits -= uint32_t(exponent >> 1) << 23;
	memcpy(&x, &bits, sizeof(bits));
	return x;
}

static inline uint32_t vfpu_asin_quantum(uint32_t x) {
	return x < 1u << 23?
		1u:
		1u << (32 - 23 - clz32_nonzero(x));
}

static inline uint32_t vfpu_asin_truncate_bits(uint32_t x) {
	return x & -vfpu_asin_quantum(x);
}

// Input is fixed 9.23, output is fixed 2.30.
static inline uint32_t vfpu_asin_approx(uint32_t x) {
	const int32_t *C = vfpu_asin_lut65536[x >> 16];
	x &= 0xFFFFu;
	return vfpu_asin_truncate_bits(uint32_t((((((int64_t(C[2]) * x) >> 16) + int64_t(C[1])) * x) >> 16) + C[0]));
}

// Input is fixed 9.23, output is fixed 2.30.
static uint32_t vfpu_asin_fixed(uint32_t x) {
	if(x == 0u) return 0u;
	if(x == 1u << 23) return 1u << 30;
	uint32_t ret = vfpu_asin_approx(x);
	uint32_t index = vfpu_asin_lut_indices[x / 21u];
	uint64_t deltas = vfpu_asin_lut_deltas[index];
	return ret + (3u - uint32_t((deltas >> (3u * (x % 21u))) & 7u)) * vfpu_asin_quantum(ret);
}

float vfpu_asin(float x) {
	static bool loaded =
		LOAD_TABLE(vfpu_asin_lut65536, 1536)&&
		LOAD_TABLE(vfpu_asin_lut_indices, 798916)&&
		LOAD_TABLE(vfpu_asin_lut_deltas, 517448);
	if (!loaded)
		return vfpu_asin_fallback(x);

	uint32_t bits;
	memcpy(&bits, &x, sizeof(x));
	uint32_t sign = bits & 0x80000000u;
	bits = bits & 0x7FFFFFFFu;
	if(bits > 0x3F800000u) {
		bits = 0x7F800001u ^ sign;
		memcpy(&x, &bits, sizeof(x));
		return x;
	}

	bits = vfpu_asin_fixed(uint32_t(int32_t(fabsf(x) * 8388608.0f))); // 0x1p23
	x = float(int32_t(bits)) * 9.31322574615478515625e-10f; // 0x1p-30
	if(sign) x = -x;
	return x;
}

static inline uint32_t vfpu_exp2_approx(uint32_t x) {
	if(x == 0x00800000u) return 0x00800000u;
	uint32_t a = vfpu_exp2_lut65536[x >> 16];
	x &= 0x0000FFFFu;
	uint32_t b = uint32_t(((2977151143ull * x) >> 23) + ((1032119999ull * (x * x)) >> 46));
	return (a + uint32_t((uint64_t(a + (1u << 23)) * uint64_t(b)) >> 32)) & -4u;
}

static inline uint32_t vfpu_exp2_fixed(uint32_t x) {
	if(x == 0u) return 0u;
	if(x == 0x00800000u) return 0x00800000u;
	uint32_t A = vfpu_exp2_approx((x     ) & -64u);
	uint32_t B = vfpu_exp2_approx((x + 64) & -64u);
	uint64_t a = (A << 4) + vfpu_exp2_lut[x >> 6][0] - 64u;
	uint64_t b = (B << 4) + vfpu_exp2_lut[x >> 6][1] - 64u;
	uint32_t y = uint32_t((a + (((b - a) * (x & 63)) >> 6)) >> 4);
	y &= -4u;
	return y;
}

float vfpu_exp2(float x) {
	static bool loaded =
		LOAD_TABLE(vfpu_exp2_lut65536, 512)&&
		LOAD_TABLE(vfpu_exp2_lut, 262144);
	if (!loaded)
		return vfpu_exp2_fallback(x);
	int32_t bits;
	memcpy(&bits, &x, sizeof(bits));
	if((bits & 0x7FFFFFFF) <= 0x007FFFFF) {
		// Denormals are treated as 0.
		return 1.0f;
	}
	if(x != x) {
		// NaN gets NaN.
		bits = 0x7F800001u;
		memcpy(&x, &bits, sizeof(x));
		return x;
	}
	if(x <= -126.0f) {
		// Small numbers get 0 (exp2(-126) is smallest positive non-denormal).
		// But yes, -126.0f produces +0.0f.
		return 0.0f;
	}
	if(x >= +128.0f) {
		// Large numbers get infinity.
		bits = 0x7F800000u;
		memcpy(&x, &bits, sizeof(x));
		return x;
	}
	bits = int32_t(x * 0x1p23f);
	if(x < 0.0f) --bits; // Yes, really.
	bits = int32_t(0x3F800000) + (bits & int32_t(0xFF800000)) + int32_t(vfpu_exp2_fixed(bits & int32_t(0x007FFFFF)));
	memcpy(&x, &bits, sizeof(bits));
	return x;
}

float vfpu_rexp2(float x) {
	return vfpu_exp2(-x);
}

// Input fixed 9.23, output fixed 10.22.
// Returns log2(1+x).
static inline uint32_t vfpu_log2_approx(uint32_t x) {
	uint32_t a = vfpu_log2_lut65536[(x >> 16) + 0];
	uint32_t b = vfpu_log2_lut65536[(x >> 16) + 1];
	uint32_t c = vfpu_log2_lut65536_quadratic[x >> 16];
	x &= 0xFFFFu;
	uint64_t ret = uint64_t(a) * (0x10000u - x) + uint64_t(b) * x;
	uint64_t d = (uint64_t(c) * x * (0x10000u - x)) >> 40;
	ret += d;
	return uint32_t(ret >> 16);
}

// Matches PSP output on all known values.
float vfpu_log2(float x) {
	static bool loaded =
		LOAD_TABLE(vfpu_log2_lut65536, 516)&&
		LOAD_TABLE(vfpu_log2_lut65536_quadratic, 512)&&
		LOAD_TABLE(vfpu_log2_lut, 2097152);
	if (!loaded)
		return vfpu_log2_fallback(x);
	uint32_t bits;
	memcpy(&bits, &x, sizeof(bits));
	if((bits & 0x7FFFFFFFu) <= 0x007FFFFFu) {
		// Denormals (and zeroes) get -inf.
		bits = 0xFF800000u;
		memcpy(&x, &bits, sizeof(x));
		return x;
	}
	if(bits & 0x80000000u) {
		// Other negatives get NaN.
		bits = 0x7F800001u;
		memcpy(&x, &bits, sizeof(x));
		return x;
	}
	if((bits >> 23) == 255u) {
		// NaN gets NaN, +inf gets +inf.
		bits = 0x7F800000u + ((bits & 0x007FFFFFu) != 0);
		memcpy(&x, &bits, sizeof(x));
		return x;
	}
	uint32_t e = (bits & 0x7F800000u) - 0x3F800000u;
	uint32_t i = bits & 0x007FFFFFu;
	if(e >> 31 && i >= 0x007FFE00u) {
		// Process 1-2^{-14}<=x*2^n<1 (for n>0) separately,
		// since the table doesn't give the right answer.
		float c = float(int32_t(~e) >> 23);
		// Note: if c is 0 the sign of -0 output is correct.
		return i < 0x007FFEF7u ? // 1-265*2^{-24}
			-3.05175781e-05f - c:
			-0.0f - c;
	}
	int d = (e < 0x01000000u ? 0 : 8 - clz32_nonzero(e) - int(e >> 31));
	//assert(d >= 0 && d < 8);
	uint32_t q = 1u << d;
	uint32_t A = vfpu_log2_approx((i     ) & -64u) & -q;
	uint32_t B = vfpu_log2_approx((i + 64) & -64u) & -q;
	uint64_t a = (A << 6) + (uint64_t(vfpu_log2_lut[d][i >> 6][0]) - 80ull) * q;
	uint64_t b = (B << 6) + (uint64_t(vfpu_log2_lut[d][i >> 6][1]) - 80ull) * q;
	uint32_t v = uint32_t((a + (((b - a) * (i & 63)) >> 6)) >> 6);
	v &= -q;
	bits = e ^ (2u * v);
	x = float(int32_t(bits)) * 1.1920928955e-7f; // 0x1p-23f
	return x;
}

static inline uint32_t vfpu_rcp_approx(uint32_t i) {
	return 0x3E800000u + (uint32_t((1ull << 47) / ((1ull << 23) + i)) & -4u);
}

float vfpu_rcp(float x) {
	static bool loaded =
		LOAD_TABLE(vfpu_rcp_lut, 262144);
	if (!loaded)
		return vfpu_rcp_fallback(x);
	uint32_t bits;
	memcpy(&bits, &x, sizeof(bits));
	uint32_t s = bits & 0x80000000u;
	uint32_t e = bits & 0x7F800000u;
	uint32_t i = bits & 0x007FFFFFu;
	if((bits & 0x7FFFFFFFu) > 0x7E800000u) {
		bits = (e == 0x7F800000u && i ? s ^ 0x7F800001u : s);
		memcpy(&x, &bits, sizeof(x));
		return x;
	}
	if(e == 0u) {
		bits = s ^ 0x7F800000u;
		memcpy(&x, &bits, sizeof(x));
		return x;
	}
	uint32_t A = vfpu_rcp_approx((i     ) & -64u);
	uint32_t B = vfpu_rcp_approx((i + 64) & -64u);
	uint64_t a = (uint64_t(A) << 6) + uint64_t(vfpu_rcp_lut[i >> 6][0]) * 4u;
	uint64_t b = (uint64_t(B) << 6) + uint64_t(vfpu_rcp_lut[i >> 6][1]) * 4u;
	uint32_t v = uint32_t((a + (((b - a) * (i & 63)) >> 6)) >> 6);
	v &= -4u;
	bits = s + (0x3F800000u - e) + v;
	memcpy(&x, &bits, sizeof(x));
	return x;
}

//==============================================================================

void InitVFPU() {
#if 0
	// Load all in advance.
	LOAD_TABLE(vfpu_asin_lut65536          ,    1536);
	LOAD_TABLE(vfpu_asin_lut_deltas        ,  517448);
	LOAD_TABLE(vfpu_asin_lut_indices       ,  798916);
	LOAD_TABLE(vfpu_exp2_lut65536          ,     512);
	LOAD_TABLE(vfpu_exp2_lut               ,  262144);
	LOAD_TABLE(vfpu_log2_lut65536          ,     516);
	LOAD_TABLE(vfpu_log2_lut65536_quadratic,     512);
	LOAD_TABLE(vfpu_log2_lut               , 2097152);
	LOAD_TABLE(vfpu_rcp_lut                ,  262144);
	LOAD_TABLE(vfpu_rsqrt_lut              ,  262144);
	LOAD_TABLE(vfpu_sin_lut8192            ,    4100);
	LOAD_TABLE(vfpu_sin_lut_delta          ,  262144);
	LOAD_TABLE(vfpu_sin_lut_exceptions     ,   86938);
	LOAD_TABLE(vfpu_sin_lut_interval_delta ,  131074);
	LOAD_TABLE(vfpu_sqrt_lut               ,  262144);
#endif
}