GitHub Repository: hrydgard/ppsspp
Path: blob/master/GPU/Common/TextureDecoder.cpp
// Copyright (c) 2012- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#include "ppsspp_config.h"

#include "ext/xxhash.h"

#include "Common/Common.h"
#include "Common/Log.h"
#include "Common/Math/SIMDHeaders.h"

#include "GPU/GPUState.h"
#include "GPU/Common/TextureDecoder.h"

const u8 textureBitsPerPixel[16] = {
    16, //GE_TFMT_5650,
    16, //GE_TFMT_5551,
    16, //GE_TFMT_4444,
    32, //GE_TFMT_8888,
    4,  //GE_TFMT_CLUT4,
    8,  //GE_TFMT_CLUT8,
    16, //GE_TFMT_CLUT16,
    32, //GE_TFMT_CLUT32,
    4,  //GE_TFMT_DXT1,
    8,  //GE_TFMT_DXT3,
    8,  //GE_TFMT_DXT5,
    0,  // INVALID,
    0,  // INVALID,
    0,  // INVALID,
    0,  // INVALID,
    0,  // INVALID,
};

#ifdef _M_SSE

static u32 QuickTexHashSSE2(const void *checkp, u32 size) {
    u32 check = 0;

    if (((intptr_t)checkp & 0xf) == 0 && (size & 0x3f) == 0) {
        __m128i cursor = _mm_set1_epi32(0);
        __m128i cursor2 = _mm_set_epi16(0x0001U, 0x0083U, 0x4309U, 0x4d9bU, 0xb651U, 0x4b73U, 0x9bd9U, 0xc00bU);
        __m128i update = _mm_set1_epi16(0x2455U);
        const __m128i *p = (const __m128i *)checkp;
        for (u32 i = 0; i < size / 16; i += 4) {
            __m128i chunk = _mm_mullo_epi16(_mm_load_si128(&p[i]), cursor2);
            cursor = _mm_add_epi16(cursor, chunk);
            cursor = _mm_xor_si128(cursor, _mm_load_si128(&p[i + 1]));
            cursor = _mm_add_epi32(cursor, _mm_load_si128(&p[i + 2]));
            chunk = _mm_mullo_epi16(_mm_load_si128(&p[i + 3]), cursor2);
            cursor = _mm_xor_si128(cursor, chunk);
            cursor2 = _mm_add_epi16(cursor2, update);
        }
        cursor = _mm_add_epi32(cursor, cursor2);
        // Add the four parts into the low i32.
        cursor = _mm_add_epi32(cursor, _mm_srli_si128(cursor, 8));
        cursor = _mm_add_epi32(cursor, _mm_srli_si128(cursor, 4));
        check = _mm_cvtsi128_si32(cursor);
    } else {
        const u32 *p = (const u32 *)checkp;
        for (u32 i = 0; i < size / 8; ++i) {
            check += *p++;
            check ^= *p++;
        }
    }

    return check;
}
#endif

#if PPSSPP_ARCH(ARM_NEON)

alignas(16) static const u16 QuickTexHashInitial[8] = { 0xc00bU, 0x9bd9U, 0x4b73U, 0xb651U, 0x4d9bU, 0x4309U, 0x0083U, 0x0001U };

static u32 QuickTexHashNEON(const void *checkp, u32 size) {
    u32 check = 0;

    if (((intptr_t)checkp & 0xf) == 0 && (size & 0x3f) == 0) {
#if PPSSPP_PLATFORM(IOS) || PPSSPP_ARCH(ARM64) || defined(_MSC_VER) || !PPSSPP_ARCH(ARMV7)
        uint32x4_t cursor = vdupq_n_u32(0);
        uint16x8_t cursor2 = vld1q_u16(QuickTexHashInitial);
        uint16x8_t update = vdupq_n_u16(0x2455U);

        const u32 *p = (const u32 *)checkp;
        const u32 *pend = p + size / 4;
        while (p < pend) {
            cursor = vreinterpretq_u32_u16(vmlaq_u16(vreinterpretq_u16_u32(cursor), vreinterpretq_u16_u32(vld1q_u32(&p[4 * 0])), cursor2));
            cursor = veorq_u32(cursor, vld1q_u32(&p[4 * 1]));
            cursor = vaddq_u32(cursor, vld1q_u32(&p[4 * 2]));
            cursor = veorq_u32(cursor, vreinterpretq_u32_u16(vmulq_u16(vreinterpretq_u16_u32(vld1q_u32(&p[4 * 3])), cursor2)));
            cursor2 = vaddq_u16(cursor2, update);

            p += 4 * 4;
        }

        cursor = vaddq_u32(cursor, vreinterpretq_u32_u16(cursor2));
        uint32x2_t mixed = vadd_u32(vget_high_u32(cursor), vget_low_u32(cursor));
        check = vget_lane_u32(mixed, 0) + vget_lane_u32(mixed, 1);
#else
        // TODO: Why does this crash on iOS, but only certain devices?
        // It's faster than the above, but I guess it sucks to be using an iPhone.
        // As of 2020 clang, it's still faster by ~1.4%.

        // d0/d1 (q0) - cursor
        // d2/d3 (q1) - cursor2
        // d4/d5 (q2) - update
        // d16-d23 (q8-q11) - memory transfer
        asm volatile (
            // Initialize cursor.
            "vmov.i32 q0, #0\n"

            // Initialize cursor2.
            "movw r0, 0xc00b\n"
            "movt r0, 0x9bd9\n"
            "movw r1, 0x4b73\n"
            "movt r1, 0xb651\n"
            "vmov d2, r0, r1\n"
            "movw r0, 0x4d9b\n"
            "movt r0, 0x4309\n"
            "movw r1, 0x0083\n"
            "movt r1, 0x0001\n"
            "vmov d3, r0, r1\n"

            // Initialize update.
            "movw r0, 0x2455\n"
            "vdup.i16 q2, r0\n"

            // This is where we end.
            "add r0, %1, %2\n"

            // Okay, do the memory hashing.
            "QuickTexHashNEON_next:\n"
            "pld [%2, #0xc0]\n"
            "vldmia %2!, {d16-d23}\n"
            "vmla.i16 q0, q1, q8\n"
            "vmul.i16 q11, q11, q1\n"
            "veor.i32 q0, q0, q9\n"
            "cmp %2, r0\n"
            "vadd.i32 q0, q0, q10\n"
            "vadd.i16 q1, q1, q2\n"
            "veor.i32 q0, q0, q11\n"
            "blo QuickTexHashNEON_next\n"

            // Now let's get the result.
            "vadd.i32 q0, q0, q1\n"
            "vadd.i32 d0, d0, d1\n"
            "vmov r0, r1, d0\n"
            "add %0, r0, r1\n"

            : "=r"(check)
            : "r"(size), "r"(checkp)
            : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "cc"
            );
#endif
    } else {
        const u32 size_u32 = size / 4;
        const u32 *p = (const u32 *)checkp;
        for (u32 i = 0; i < size_u32; i += 4) {
            check += p[i + 0];
            check ^= p[i + 1];
            check += p[i + 2];
            check ^= p[i + 3];
        }
    }

    return check;
}

#endif // PPSSPP_ARCH(ARM_NEON)

#if PPSSPP_ARCH(LOONGARCH64_LSX)

alignas(16) static const u16 QuickTexHashInitial[8] = { 0xc00bU, 0x9bd9U, 0x4b73U, 0xb651U, 0x4d9bU, 0x4309U, 0x0083U, 0x0001U };

static u32 QuickTexHashLSX(const void *checkp, u32 size) {
    u32 check = 0;

    if (((intptr_t)checkp & 0xf) == 0 && (size & 0x3f) == 0) {
        __m128i cursor = __lsx_vrepli_d(0);
        __m128i cursor2 = __lsx_vld(QuickTexHashInitial, 0);
        __m128i update = __lsx_vreplgr2vr_h(0x2455U);
        const __m128i *p = (const __m128i *)checkp;
        for (u32 i = 0; i < size / 16; i += 4) {
            __m128i chunk = __lsx_vmul_h(__lsx_vld(&p[i], 0), cursor2);
            cursor = __lsx_vadd_h(cursor, chunk);
            cursor = __lsx_vxor_v(cursor, __lsx_vld(&p[i + 1], 0));
            cursor = __lsx_vadd_w(cursor, __lsx_vld(&p[i + 2], 0));
            chunk = __lsx_vmul_h(__lsx_vld(&p[i + 3], 0), cursor2);
            cursor = __lsx_vxor_v(cursor, chunk);
            cursor2 = __lsx_vadd_h(cursor2, update);
        }
        cursor = __lsx_vadd_w(cursor, cursor2);
        // Add the four parts into the low i32.
        cursor = __lsx_vadd_w(cursor, __lsx_vbsrl_v(cursor, 8));
        cursor = __lsx_vadd_w(cursor, __lsx_vbsrl_v(cursor, 4));
        check = __lsx_vpickve2gr_w(cursor, 0);
    } else {
        const u32 *p = (const u32 *)checkp;
        for (u32 i = 0; i < size / 8; ++i) {
            check += *p++;
            check ^= *p++;
        }
    }

    return check;
}

#endif // PPSSPP_ARCH(LOONGARCH64_LSX)

// Masks to downalign bufw to 16 bytes, and wrap at 2048.
static const u32 textureAlignMask16[16] = {
    0x7FF & ~(((8 * 16) / 16) - 1), //GE_TFMT_5650,
    0x7FF & ~(((8 * 16) / 16) - 1), //GE_TFMT_5551,
    0x7FF & ~(((8 * 16) / 16) - 1), //GE_TFMT_4444,
    0x7FF & ~(((8 * 16) / 32) - 1), //GE_TFMT_8888,
    0x7FF & ~(((8 * 16) / 4) - 1),  //GE_TFMT_CLUT4,
    0x7FF & ~(((8 * 16) / 8) - 1),  //GE_TFMT_CLUT8,
    0x7FF & ~(((8 * 16) / 16) - 1), //GE_TFMT_CLUT16,
    0x7FF & ~(((8 * 16) / 32) - 1), //GE_TFMT_CLUT32,
    0x7FF, //GE_TFMT_DXT1,
    0x7FF, //GE_TFMT_DXT3,
    0x7FF, //GE_TFMT_DXT5,
    0, // INVALID,
    0, // INVALID,
    0, // INVALID,
    0, // INVALID,
    0, // INVALID,
};

u32 GetTextureBufw(int level, u32 texaddr, GETextureFormat format) {
    // This is a hack to allow us to draw the huge PPGe texture, which is always in kernel RAM.
    if (texaddr >= PSP_GetKernelMemoryBase() && texaddr < PSP_GetKernelMemoryEnd())
        return gstate.texbufwidth[level] & 0x1FFF;

    u32 bufw = gstate.texbufwidth[level] & textureAlignMask16[format];
    if (bufw == 0 && format <= GE_TFMT_DXT5) {
        // If it's less than 16 bytes, use 16 bytes.
        bufw = (8 * 16) / textureBitsPerPixel[format];
    }
    return bufw;
}
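
// Worked example (illustration added here, not from the original source): for GE_TFMT_CLUT4,
// (8 * 16) / 4 = 32, so the mask above is 0x7FF & ~31 = 0x7E0. A raw texbufwidth of 100 pixels
// is wrapped to 11 bits and rounded down to 96, a multiple of 32 pixels (16 bytes at 4 bpp).
// If the masked value comes out as 0, the fallback in GetTextureBufw picks the 16-byte minimum,
// (8 * 16) / textureBitsPerPixel[format] = 32 pixels for CLUT4.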

// Matches QuickTexHashNEON/SSE, see #7029.
static u32 QuickTexHashNonSSE(const void *checkp, u32 size) {
    u32 check = 0;

    if (((intptr_t)checkp & 0xf) == 0 && (size & 0x3f) == 0) {
        static const u16 cursor2_initial[8] = {0xc00bU, 0x9bd9U, 0x4b73U, 0xb651U, 0x4d9bU, 0x4309U, 0x0083U, 0x0001U};
        union u32x4_u16x8 {
#if defined(__GNUC__)
            uint32_t x32 __attribute__((vector_size(16)));
            uint16_t x16 __attribute__((vector_size(16)));
#else
            u32 x32[4];
            u16 x16[8];
#endif
        };
        u32x4_u16x8 cursor{};
        u32x4_u16x8 cursor2;
        static const u16 update[8] = {0x2455U, 0x2455U, 0x2455U, 0x2455U, 0x2455U, 0x2455U, 0x2455U, 0x2455U};

        for (u32 j = 0; j < 8; ++j) {
            cursor2.x16[j] = cursor2_initial[j];
        }

        const u32x4_u16x8 *p = (const u32x4_u16x8 *)checkp;
        for (u32 i = 0; i < size / 16; i += 4) {
            for (u32 j = 0; j < 8; ++j) {
                const u16 temp = p[i + 0].x16[j] * cursor2.x16[j];
                cursor.x16[j] += temp;
            }
            for (u32 j = 0; j < 4; ++j) {
                cursor.x32[j] ^= p[i + 1].x32[j];
                cursor.x32[j] += p[i + 2].x32[j];
            }
            for (u32 j = 0; j < 8; ++j) {
                const u16 temp = p[i + 3].x16[j] * cursor2.x16[j];
                cursor.x16[j] ^= temp;
            }
            for (u32 j = 0; j < 8; ++j) {
                cursor2.x16[j] += update[j];
            }
        }

        for (u32 j = 0; j < 4; ++j) {
            cursor.x32[j] += cursor2.x32[j];
        }
        check = cursor.x32[0] + cursor.x32[1] + cursor.x32[2] + cursor.x32[3];
    } else {
        const u32 *p = (const u32 *)checkp;
        for (u32 i = 0; i < size / 8; ++i) {
            check += *p++;
            check ^= *p++;
        }
    }

    return check;
}

u32 StableQuickTexHash(const void *checkp, u32 size) {
#if defined(_M_SSE)
    return QuickTexHashSSE2(checkp, size);
#elif PPSSPP_ARCH(ARM_NEON)
    return QuickTexHashNEON(checkp, size);
#elif PPSSPP_ARCH(LOONGARCH64_LSX)
    return QuickTexHashLSX(checkp, size);
#else
    return QuickTexHashNonSSE(checkp, size);
#endif
}
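
// Usage sketch (illustrative; the helper below is hypothetical and not part of this file).
// The SSE2, NEON, LSX and scalar variants above are kept bit-for-bit compatible (see the
// #7029 note), so a hash computed on one platform can be compared with one computed on
// another. The fast SIMD path is only taken for 16-byte-aligned pointers and sizes that are
// a multiple of 64 bytes; other inputs fall back to the simple add/xor loop.
#if 0
static u32 HashTextureDataExample(const u8 *texData, u32 sizeInBytes) {
    // Same value regardless of which of the four implementations was compiled in.
    return StableQuickTexHash(texData, sizeInBytes);
}
#endif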

void DoSwizzleTex16(const u32 *ysrcp, u8 *texptr, int bxc, int byc, u32 pitch) {
    // ysrcp is in 32-bit units, so this is convenient.
    const u32 pitchBy32 = pitch >> 2;
#ifdef _M_SSE
    if (((uintptr_t)ysrcp & 0xF) == 0 && (pitch & 0xF) == 0) {
        __m128i *dest = (__m128i *)texptr;
        // The pitch parameter is in bytes, so shift down for 128-bit.
        // Note: it's always aligned to 16 bytes, so this is safe.
        const u32 pitchBy128 = pitch >> 4;
        for (int by = 0; by < byc; by++) {
            const __m128i *xsrc = (const __m128i *)ysrcp;
            for (int bx = 0; bx < bxc; bx++) {
                const __m128i *src = xsrc;
                for (int n = 0; n < 2; n++) {
                    // Textures are always 16-byte aligned so this is fine.
                    __m128i temp1 = _mm_load_si128(src);
                    src += pitchBy128;
                    __m128i temp2 = _mm_load_si128(src);
                    src += pitchBy128;
                    __m128i temp3 = _mm_load_si128(src);
                    src += pitchBy128;
                    __m128i temp4 = _mm_load_si128(src);
                    src += pitchBy128;

                    _mm_store_si128(dest, temp1);
                    _mm_store_si128(dest + 1, temp2);
                    _mm_store_si128(dest + 2, temp3);
                    _mm_store_si128(dest + 3, temp4);
                    dest += 4;
                }
                xsrc++;
            }
            ysrcp += pitchBy32 * 8;
        }
    } else
#endif
    {
        u32 *dest = (u32 *)texptr;
        for (int by = 0; by < byc; by++) {
            const u32 *xsrc = ysrcp;
            for (int bx = 0; bx < bxc; bx++) {
                const u32 *src = xsrc;
                for (int n = 0; n < 8; n++) {
                    memcpy(dest, src, 16);
                    src += pitchBy32;
                    dest += 4;
                }
                xsrc += 4;
            }
            ysrcp += pitchBy32 * 8;
        }
    }
}

void DoUnswizzleTex16(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch) {
    // ydestp is in 32-bit units, so this is convenient.
    const u32 pitchBy32 = pitch >> 2;

#ifdef _M_SSE
    // This check is pretty much a given, right?
    if (((uintptr_t)ydestp & 0xF) == 0 && (pitch & 0xF) == 0) {
        const __m128i *src = (const __m128i *)texptr;
        // The pitch parameter is in bytes, so shift down for 128-bit.
        // Note: it's always aligned to 16 bytes, so this is safe.
        const u32 pitchBy128 = pitch >> 4;
        for (int by = 0; by < byc; by++) {
            __m128i *xdest = (__m128i *)ydestp;
            for (int bx = 0; bx < bxc; bx++) {
                __m128i *dest = xdest;
                for (int n = 0; n < 2; n++) {
                    // Textures are always 16-byte aligned so this is fine.
                    __m128i temp1 = _mm_load_si128(src);
                    __m128i temp2 = _mm_load_si128(src + 1);
                    __m128i temp3 = _mm_load_si128(src + 2);
                    __m128i temp4 = _mm_load_si128(src + 3);
                    _mm_store_si128(dest, temp1);
                    dest += pitchBy128;
                    _mm_store_si128(dest, temp2);
                    dest += pitchBy128;
                    _mm_store_si128(dest, temp3);
                    dest += pitchBy128;
                    _mm_store_si128(dest, temp4);
                    dest += pitchBy128;
                    src += 4;
                }
                xdest++;
            }
            ydestp += pitchBy32 * 8;
        }
    } else
#elif PPSSPP_ARCH(ARM_NEON)
    if (((uintptr_t)ydestp & 0xF) == 0 && (pitch & 0xF) == 0) {
        const u32 *src = (const u32 *)texptr;
        for (int by = 0; by < byc; by++) {
            u32 *xdest = ydestp;
            for (int bx = 0; bx < bxc; bx++) {
                u32 *dest = xdest;
                for (int n = 0; n < 2; n++) {
                    // Textures are always 16-byte aligned so this is fine.
                    uint32x4_t temp1 = vld1q_u32(src);
                    uint32x4_t temp2 = vld1q_u32(src + 4);
                    uint32x4_t temp3 = vld1q_u32(src + 8);
                    uint32x4_t temp4 = vld1q_u32(src + 12);
                    vst1q_u32(dest, temp1);
                    dest += pitchBy32;
                    vst1q_u32(dest, temp2);
                    dest += pitchBy32;
                    vst1q_u32(dest, temp3);
                    dest += pitchBy32;
                    vst1q_u32(dest, temp4);
                    dest += pitchBy32;
                    src += 16;
                }
                xdest += 4;
            }
            ydestp += pitchBy32 * 8;
        }
    } else
#endif
    {
        const u32 *src = (const u32 *)texptr;
        for (int by = 0; by < byc; by++) {
            u32 *xdest = ydestp;
            for (int bx = 0; bx < bxc; bx++) {
                u32 *dest = xdest;
                for (int n = 0; n < 8; n++) {
                    memcpy(dest, src, 16);
                    dest += pitchBy32;
                    src += 4;
                }
                xdest += 4;
            }
            ydestp += pitchBy32 * 8;
        }
    }
}
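
// Illustrative sketch (an assumption about typical call sites, not code from this file):
// a PSP swizzle block is 16 bytes wide and 8 rows tall, so callers generally derive the
// block counts from the row size in bytes and the texture height. The helper name and
// parameters below are hypothetical.
#if 0
static void UnswizzleLevelExample(const u8 *texptr, u32 *dest, int bufwPixels, int height, int bitsPerPixel) {
    const u32 rowBytes = (u32)(bufwPixels * bitsPerPixel) / 8;  // pitch in bytes, assumed 16-byte aligned
    const int bxc = (int)(rowBytes / 16);                       // 16-byte-wide blocks per block row
    const int byc = (height + 7) / 8;                           // 8-row-tall block rows
    DoUnswizzleTex16(texptr, dest, bxc, byc, rowBytes);
}
#endif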

// S3TC / DXT Decoder
class DXTDecoder {
public:
    inline void DecodeColors(const DXT1Block *src, bool ignore1bitAlpha);
    inline void DecodeAlphaDXT5(const DXT5Block *src);
    inline void WriteColorsDXT1(u32 *dst, const DXT1Block *src, int pitch, int width, int height);
    inline void WriteColorsDXT3(u32 *dst, const DXT3Block *src, int pitch, int width, int height);
    inline void WriteColorsDXT5(u32 *dst, const DXT5Block *src, int pitch, int width, int height);

    bool AnyNonFullAlpha() const { return anyNonFullAlpha_; }

protected:
    u32 colors_[4];
    u8 alpha_[8];
    bool alphaMode_ = false;
    bool anyNonFullAlpha_ = false;
};

static inline u32 makecol(int r, int g, int b, int a) {
    return (a << 24) | (b << 16) | (g << 8) | r;
}

static inline int mix_2_3(int c1, int c2) {
    return (c1 + c1 + c2) / 3;
}

// This could probably be done faster by decoding two or four blocks at a time with SSE/NEON.
void DXTDecoder::DecodeColors(const DXT1Block *src, bool ignore1bitAlpha) {
    u16 c1 = src->color1;
    u16 c2 = src->color2;
    int blue1 = (c1 << 3) & 0xF8;
    int blue2 = (c2 << 3) & 0xF8;
    int green1 = (c1 >> 3) & 0xFC;
    int green2 = (c2 >> 3) & 0xFC;
    int red1 = (c1 >> 8) & 0xF8;
    int red2 = (c2 >> 8) & 0xF8;

    // Keep alpha zero for non-DXT1 to skip masking the colors.
    int alpha = ignore1bitAlpha ? 0 : 255;

    colors_[0] = makecol(red1, green1, blue1, alpha);
    colors_[1] = makecol(red2, green2, blue2, alpha);
    if (c1 > c2) {
        colors_[2] = makecol(mix_2_3(red1, red2), mix_2_3(green1, green2), mix_2_3(blue1, blue2), alpha);
        colors_[3] = makecol(mix_2_3(red2, red1), mix_2_3(green2, green1), mix_2_3(blue2, blue1), alpha);
    } else {
        // Average - these are always left shifted, so no need to worry about ties.
        int red3 = (red1 + red2) / 2;
        int green3 = (green1 + green2) / 2;
        int blue3 = (blue1 + blue2) / 2;
        colors_[2] = makecol(red3, green3, blue3, alpha);
        colors_[3] = makecol(0, 0, 0, 0);
        if (alpha == 255) {
            alphaMode_ = true;
        }
    }
}
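
// Worked example (illustration only, restating the math above): for color1 = 0x07E0 (pure
// green in RGB565), blue1 = (0x07E0 << 3) & 0xF8 = 0x00, green1 = (0x07E0 >> 3) & 0xFC = 0xFC,
// and red1 = (0x07E0 >> 8) & 0xF8 = 0x00. The low bits are not replicated into the result, so
// a fully saturated 565 channel decodes to 0xF8/0xFC here rather than 0xFF.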

static inline u8 lerp8(const DXT5Block *src, int n) {
    // These weights multiply alpha1/alpha2 up to 8.8 fixed point.
    int alpha1 = (src->alpha1 * ((7 - n) << 8)) / 7;
    int alpha2 = (src->alpha2 * (n << 8)) / 7;
    return (u8)((alpha1 + alpha2 + 31) >> 8);
}

static inline u8 lerp6(const DXT5Block *src, int n) {
    int alpha1 = (src->alpha1 * ((5 - n) << 8)) / 5;
    int alpha2 = (src->alpha2 * (n << 8)) / 5;
    return (u8)((alpha1 + alpha2 + 31) >> 8);
}
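
// Equivalently (restating the math above, not new behavior): lerp8(src, n) is roughly
// (alpha1 * (7 - n) + alpha2 * n) / 7 and lerp6(src, n) is roughly
// (alpha1 * (5 - n) + alpha2 * n) / 5, computed in 8.8 fixed point with +31 (about +0.12)
// applied as a small rounding bias before the final shift down by 8.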

void DXTDecoder::DecodeAlphaDXT5(const DXT5Block *src) {
    alpha_[0] = src->alpha1;
    alpha_[1] = src->alpha2;
    if (alpha_[0] > alpha_[1]) {
        alpha_[2] = lerp8(src, 1);
        alpha_[3] = lerp8(src, 2);
        alpha_[4] = lerp8(src, 3);
        alpha_[5] = lerp8(src, 4);
        alpha_[6] = lerp8(src, 5);
        alpha_[7] = lerp8(src, 6);
    } else {
        alpha_[2] = lerp6(src, 1);
        alpha_[3] = lerp6(src, 2);
        alpha_[4] = lerp6(src, 3);
        alpha_[5] = lerp6(src, 4);
        alpha_[6] = 0;
        alpha_[7] = 255;
    }
}

void DXTDecoder::WriteColorsDXT1(u32 *dst, const DXT1Block *src, int pitch, int width, int height) {
    bool anyColor3 = false;
    for (int y = 0; y < height; y++) {
        int colordata = src->lines[y];
        for (int x = 0; x < width; x++) {
            int col = colordata & 3;
            if (col == 3) {
                anyColor3 = true;
            }
            dst[x] = colors_[col];
            colordata >>= 2;
        }
        dst += pitch;
    }

    if (alphaMode_ && anyColor3) {
        anyNonFullAlpha_ = true;
    }
}

void DXTDecoder::WriteColorsDXT3(u32 *dst, const DXT3Block *src, int pitch, int width, int height) {
    for (int y = 0; y < height; y++) {
        int colordata = src->color.lines[y];
        u32 alphadata = src->alphaLines[y];
        for (int x = 0; x < width; x++) {
            dst[x] = colors_[colordata & 3] | (alphadata << 28);
            colordata >>= 2;
            alphadata >>= 4;
        }
        dst += pitch;
    }
}

void DXTDecoder::WriteColorsDXT5(u32 *dst, const DXT5Block *src, int pitch, int width, int height) {
    // 48 bits, 3 bit index per pixel, 12 bits per line.
    u64 allAlpha = ((u64)(u16)src->alphadata1 << 32) | (u32)src->alphadata2;

    for (int y = 0; y < height; y++) {
        uint32_t colordata = src->color.lines[y];
        uint32_t alphadata = allAlpha >> (12 * y);
        for (int x = 0; x < width; x++) {
            dst[x] = colors_[colordata & 3] | (alpha_[alphadata & 7] << 24);
            colordata >>= 2;
            alphadata >>= 3;
        }
        dst += pitch;
    }
}

uint32_t GetDXTTexelColor(const DXT1Block *src, int x, int y, int alpha) {
    _dbg_assert_(x >= 0 && x < 4);
    _dbg_assert_(y >= 0 && y < 4);

    uint16_t c1 = src->color1;
    uint16_t c2 = src->color2;
    int blue1 = (c1 << 3) & 0xF8;
    int blue2 = (c2 << 3) & 0xF8;
    int green1 = (c1 >> 3) & 0xFC;
    int green2 = (c2 >> 3) & 0xFC;
    int red1 = (c1 >> 8) & 0xF8;
    int red2 = (c2 >> 8) & 0xF8;

    int colorIndex = (src->lines[y] >> (x * 2)) & 3;
    if (colorIndex == 0) {
        return makecol(red1, green1, blue1, alpha);
    } else if (colorIndex == 1) {
        return makecol(red2, green2, blue2, alpha);
    } else if (c1 > c2) {
        if (colorIndex == 2) {
            return makecol(mix_2_3(red1, red2), mix_2_3(green1, green2), mix_2_3(blue1, blue2), alpha);
        }
        return makecol(mix_2_3(red2, red1), mix_2_3(green2, green1), mix_2_3(blue2, blue1), alpha);
    } else if (colorIndex == 3) {
        return makecol(0, 0, 0, 0);
    }

    // Average - these are always left shifted, so no need to worry about ties.
    int red3 = (red1 + red2) / 2;
    int green3 = (green1 + green2) / 2;
    int blue3 = (blue1 + blue2) / 2;
    return makecol(red3, green3, blue3, alpha);
}

uint32_t GetDXT1Texel(const DXT1Block *src, int x, int y) {
    return GetDXTTexelColor(src, x, y, 255);
}

uint32_t GetDXT3Texel(const DXT3Block *src, int x, int y) {
    uint32_t color = GetDXTTexelColor(&src->color, x, y, 0);
    u32 alpha = (src->alphaLines[y] >> (x * 4)) & 0xF;
    return color | (alpha << 28);
}

uint32_t GetDXT5Texel(const DXT5Block *src, int x, int y) {
    uint32_t color = GetDXTTexelColor(&src->color, x, y, 0);
    uint64_t alphadata = ((uint64_t)(uint16_t)src->alphadata1 << 32) | (uint32_t)src->alphadata2;
    int alphaIndex = (alphadata >> (y * 12 + x * 3)) & 7;

    if (alphaIndex == 0) {
        return color | (src->alpha1 << 24);
    } else if (alphaIndex == 1) {
        return color | (src->alpha2 << 24);
    } else if (src->alpha1 > src->alpha2) {
        return color | (lerp8(src, alphaIndex - 1) << 24);
    } else if (alphaIndex == 6) {
        return color;
    } else if (alphaIndex == 7) {
        return color | 0xFF000000;
    }
    return color | (lerp6(src, alphaIndex - 1) << 24);
}

// This could probably be done faster by decoding two or four blocks at a time with SSE/NEON.
void DecodeDXT1Block(u32 *dst, const DXT1Block *src, int pitch, int width, int height, u32 *alpha) {
    DXTDecoder dxt;
    dxt.DecodeColors(src, false);
    dxt.WriteColorsDXT1(dst, src, pitch, width, height);
    *alpha &= dxt.AnyNonFullAlpha() ? 0 : 1;
}

void DecodeDXT3Block(u32 *dst, const DXT3Block *src, int pitch, int width, int height) {
    DXTDecoder dxt;
    dxt.DecodeColors(&src->color, true);
    dxt.WriteColorsDXT3(dst, src, pitch, width, height);
}

void DecodeDXT5Block(u32 *dst, const DXT5Block *src, int pitch, int width, int height) {
    DXTDecoder dxt;
    dxt.DecodeColors(&src->color, true);
    dxt.DecodeAlphaDXT5(src);
    dxt.WriteColorsDXT5(dst, src, pitch, width, height);
}
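
// Illustrative sketch (an assumption about how these block decoders might be driven; the
// helper below is hypothetical and not part of this file): decoding a whole DXT1 level by
// walking its 4x4 blocks, clamping the last block row/column for non-multiple-of-4 sizes.
#if 0
static void DecodeDXT1LevelExample(u32 *dst, const DXT1Block *blocks, int width, int height, int pitchInPixels) {
    u32 everyPixelFullAlpha = 1;  // cleared by DecodeDXT1Block if any decoded texel is transparent
    const int blocksWide = (width + 3) / 4;
    for (int y = 0; y < height; y += 4) {
        for (int x = 0; x < width; x += 4) {
            const DXT1Block *blk = blocks + (y / 4) * blocksWide + (x / 4);
            const int w = (width - x) < 4 ? (width - x) : 4;
            const int h = (height - y) < 4 ? (height - y) : 4;
            DecodeDXT1Block(dst + y * pitchInPixels + x, blk, pitchInPixels, w, h, &everyPixelFullAlpha);
        }
    }
}
#endif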

#ifdef _M_SSE
inline u32 SSEReduce32And(__m128i value) {
    value = _mm_and_si128(value, _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
    value = _mm_and_si128(value, _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 1, 1, 1)));
    return _mm_cvtsi128_si32(value);
}
inline u32 SSEReduce16And(__m128i value) {
    u32 mask = SSEReduce32And(value);
    return mask & (mask >> 16);
}
#endif

#if PPSSPP_ARCH(ARM_NEON)
inline u32 NEONReduce32And(uint32x4_t value) {
    // TODO: Maybe a shuffle and a vector AND, or something?
    return vgetq_lane_u32(value, 0) & vgetq_lane_u32(value, 1) & vgetq_lane_u32(value, 2) & vgetq_lane_u32(value, 3);
}
inline u32 NEONReduce16And(uint16x8_t value) {
    uint32x4_t value32 = vreinterpretq_u32_u16(value);
    // TODO: Maybe a shuffle and a vector AND, or something?
    u32 mask = vgetq_lane_u32(value32, 0) & vgetq_lane_u32(value32, 1) & vgetq_lane_u32(value32, 2) & vgetq_lane_u32(value32, 3);
    return mask & (mask >> 16);
}
#endif

// TODO: SSE/SIMD
// At least on x86, the compiler actually SIMDs these pretty well.
void CopyAndSumMask16(u16 *dst, const u16 *src, int width, u32 *outMask) {
    u16 mask = 0xFFFF;
#ifdef _M_SSE
    if (width >= 8) {
        __m128i wideMask = _mm_set1_epi32(0xFFFFFFFF);
        while (width >= 8) {
            __m128i color = _mm_loadu_si128((__m128i *)src);
            wideMask = _mm_and_si128(wideMask, color);
            _mm_storeu_si128((__m128i *)dst, color);
            src += 8;
            dst += 8;
            width -= 8;
        }
        mask = SSEReduce16And(wideMask);
    }
#elif PPSSPP_ARCH(ARM_NEON)
    if (width >= 8) {
        uint16x8_t wideMask = vdupq_n_u16(0xFFFF);
        while (width >= 8) {
            uint16x8_t colors = vld1q_u16(src);
            wideMask = vandq_u16(wideMask, colors);
            vst1q_u16(dst, colors);
            src += 8;
            dst += 8;
            width -= 8;
        }
        mask = NEONReduce16And(wideMask);
    }
#endif

    DO_NOT_VECTORIZE_LOOP
    for (int i = 0; i < width; i++) {
        u16 color = src[i];
        mask &= color;
        dst[i] = color;
    }
    *outMask &= (u32)mask;
}

// Used in video playback, so it's nice for this to be fast.
void CopyAndSumMask32(u32 *dst, const u32 *src, int width, u32 *outMask) {
    u32 mask = 0xFFFFFFFF;
#ifdef _M_SSE
    if (width >= 4) {
        __m128i wideMask = _mm_set1_epi32(0xFFFFFFFF);
        while (width >= 4) {
            __m128i color = _mm_loadu_si128((__m128i *)src);
            wideMask = _mm_and_si128(wideMask, color);
            _mm_storeu_si128((__m128i *)dst, color);
            src += 4;
            dst += 4;
            width -= 4;
        }
        mask = SSEReduce32And(wideMask);
    }
#elif PPSSPP_ARCH(ARM_NEON)
    if (width >= 4) {
        uint32x4_t wideMask = vdupq_n_u32(0xFFFFFFFF);
        while (width >= 4) {
            uint32x4_t colors = vld1q_u32(src);
            wideMask = vandq_u32(wideMask, colors);
            vst1q_u32(dst, colors);
            src += 4;
            dst += 4;
            width -= 4;
        }
        mask = NEONReduce32And(wideMask);
    }
#endif

    DO_NOT_VECTORIZE_LOOP
    for (int i = 0; i < width; i++) {
        u32 color = src[i];
        mask &= color;
        dst[i] = color;
    }
    *outMask &= (u32)mask;
}

void CheckMask16(const u16 *src, int width, u32 *outMask) {
    u16 mask = 0xFFFF;
#ifdef _M_SSE
    if (width >= 8) {
        __m128i wideMask = _mm_set1_epi32(0xFFFFFFFF);
        while (width >= 8) {
            wideMask = _mm_and_si128(wideMask, _mm_loadu_si128((__m128i *)src));
            src += 8;
            width -= 8;
        }
        mask = SSEReduce16And(wideMask);
    }
#elif PPSSPP_ARCH(ARM_NEON)
    if (width >= 8) {
        uint16x8_t wideMask = vdupq_n_u16(0xFFFF);
        while (width >= 8) {
            wideMask = vandq_u16(wideMask, vld1q_u16(src));
            src += 8;
            width -= 8;
        }
        mask = NEONReduce16And(wideMask);
    }
#endif

    DO_NOT_VECTORIZE_LOOP
    for (int i = 0; i < width; i++) {
        mask &= src[i];
    }
    *outMask &= (u32)mask;
}

void CheckMask32(const u32 *src, int width, u32 *outMask) {
    u32 mask = 0xFFFFFFFF;
#ifdef _M_SSE
    if (width >= 4) {
        __m128i wideMask = _mm_set1_epi32(0xFFFFFFFF);
        while (width >= 4) {
            wideMask = _mm_and_si128(wideMask, _mm_loadu_si128((__m128i *)src));
            src += 4;
            width -= 4;
        }
        mask = SSEReduce32And(wideMask);
    }
#elif PPSSPP_ARCH(ARM_NEON)
    if (width >= 4) {
        uint32x4_t wideMask = vdupq_n_u32(0xFFFFFFFF);
        while (width >= 4) {
            wideMask = vandq_u32(wideMask, vld1q_u32(src));
            src += 4;
            width -= 4;
        }
        mask = NEONReduce32And(wideMask);
    }
#endif

    DO_NOT_VECTORIZE_LOOP
    for (int i = 0; i < width; i++) {
        mask &= src[i];
    }
    *outMask &= (u32)mask;
}
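
// Usage sketch (assumption; the helper name and the way the mask is interpreted here are
// illustrative, not taken from this file): outMask accumulates a bitwise AND of every pixel
// scanned, so after walking an 8888 image a top byte of 0xFF means the alpha channel was
// fully opaque everywhere.
#if 0
static bool IsFullAlpha8888Example(const u32 *pixels, int width, int numRows, int strideInPixels) {
    u32 mask = 0xFFFFFFFF;
    for (int y = 0; y < numRows; y++) {
        CheckMask32(pixels + y * strideInPixels, width, &mask);
    }
    return (mask & 0xFF000000) == 0xFF000000;
}
#endif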