GitHub Repository: hrydgard/ppsspp
Path: blob/master/GPU/Common/TextureDecoder.cpp
// Copyright (c) 2012- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#include "ppsspp_config.h"

#include "ext/xxhash.h"

#include "Common/Common.h"
#include "Common/Log.h"
#include "Common/Math/SIMDHeaders.h"

#include "GPU/GPUState.h"
#include "GPU/Common/TextureDecoder.h"

const u8 textureBitsPerPixel[16] = {
    16, //GE_TFMT_5650,
    16, //GE_TFMT_5551,
    16, //GE_TFMT_4444,
    32, //GE_TFMT_8888,
    4,  //GE_TFMT_CLUT4,
    8,  //GE_TFMT_CLUT8,
    16, //GE_TFMT_CLUT16,
    32, //GE_TFMT_CLUT32,
    4,  //GE_TFMT_DXT1,
    8,  //GE_TFMT_DXT3,
    8,  //GE_TFMT_DXT5,
    0,  // INVALID,
    0,  // INVALID,
    0,  // INVALID,
    0,  // INVALID,
    0,  // INVALID,
};

#ifdef _M_SSE

static u32 QuickTexHashSSE2(const void *checkp, u32 size) {
    u32 check = 0;

    if (((intptr_t)checkp & 0xf) == 0 && (size & 0x3f) == 0) {
        __m128i cursor = _mm_set1_epi32(0);
        __m128i cursor2 = _mm_set_epi16(0x0001U, 0x0083U, 0x4309U, 0x4d9bU, 0xb651U, 0x4b73U, 0x9bd9U, 0xc00bU);
        __m128i update = _mm_set1_epi16(0x2455U);
        const __m128i *p = (const __m128i *)checkp;
        for (u32 i = 0; i < size / 16; i += 4) {
            __m128i chunk = _mm_mullo_epi16(_mm_load_si128(&p[i]), cursor2);
            cursor = _mm_add_epi16(cursor, chunk);
            cursor = _mm_xor_si128(cursor, _mm_load_si128(&p[i + 1]));
            cursor = _mm_add_epi32(cursor, _mm_load_si128(&p[i + 2]));
            chunk = _mm_mullo_epi16(_mm_load_si128(&p[i + 3]), cursor2);
            cursor = _mm_xor_si128(cursor, chunk);
            cursor2 = _mm_add_epi16(cursor2, update);
        }
        cursor = _mm_add_epi32(cursor, cursor2);
        // Add the four parts into the low i32.
        cursor = _mm_add_epi32(cursor, _mm_srli_si128(cursor, 8));
        cursor = _mm_add_epi32(cursor, _mm_srli_si128(cursor, 4));
        check = _mm_cvtsi128_si32(cursor);
    } else {
        const u32 *p = (const u32 *)checkp;
        for (u32 i = 0; i < size / 8; ++i) {
            check += *p++;
            check ^= *p++;
        }
    }

    return check;
}
#endif

#if PPSSPP_ARCH(ARM_NEON)

alignas(16) static const u16 QuickTexHashInitial[8] = { 0xc00bU, 0x9bd9U, 0x4b73U, 0xb651U, 0x4d9bU, 0x4309U, 0x0083U, 0x0001U };

static u32 QuickTexHashNEON(const void *checkp, u32 size) {
    u32 check = 0;

    if (((intptr_t)checkp & 0xf) == 0 && (size & 0x3f) == 0) {
#if PPSSPP_PLATFORM(IOS) || PPSSPP_ARCH(ARM64) || defined(_MSC_VER) || !PPSSPP_ARCH(ARMV7)
        uint32x4_t cursor = vdupq_n_u32(0);
        uint16x8_t cursor2 = vld1q_u16(QuickTexHashInitial);
        uint16x8_t update = vdupq_n_u16(0x2455U);

        const u32 *p = (const u32 *)checkp;
        const u32 *pend = p + size / 4;
        while (p < pend) {
            cursor = vreinterpretq_u32_u16(vmlaq_u16(vreinterpretq_u16_u32(cursor), vreinterpretq_u16_u32(vld1q_u32(&p[4 * 0])), cursor2));
            cursor = veorq_u32(cursor, vld1q_u32(&p[4 * 1]));
            cursor = vaddq_u32(cursor, vld1q_u32(&p[4 * 2]));
            cursor = veorq_u32(cursor, vreinterpretq_u32_u16(vmulq_u16(vreinterpretq_u16_u32(vld1q_u32(&p[4 * 3])), cursor2)));
            cursor2 = vaddq_u16(cursor2, update);

            p += 4 * 4;
        }

        cursor = vaddq_u32(cursor, vreinterpretq_u32_u16(cursor2));
        uint32x2_t mixed = vadd_u32(vget_high_u32(cursor), vget_low_u32(cursor));
        check = vget_lane_u32(mixed, 0) + vget_lane_u32(mixed, 1);
#else
        // TODO: Why does this crash on iOS, but only certain devices?
        // It's faster than the above, but I guess it sucks to be using an iPhone.
        // As of 2020 clang, it's still faster by ~1.4%.

        // d0/d1 (q0) - cursor
        // d2/d3 (q1) - cursor2
        // d4/d5 (q2) - update
        // d16-d23 (q8-q11) - memory transfer
        asm volatile (
            // Initialize cursor.
            "vmov.i32 q0, #0\n"

            // Initialize cursor2.
            "movw r0, 0xc00b\n"
            "movt r0, 0x9bd9\n"
            "movw r1, 0x4b73\n"
            "movt r1, 0xb651\n"
            "vmov d2, r0, r1\n"
            "movw r0, 0x4d9b\n"
            "movt r0, 0x4309\n"
            "movw r1, 0x0083\n"
            "movt r1, 0x0001\n"
            "vmov d3, r0, r1\n"

            // Initialize update.
            "movw r0, 0x2455\n"
            "vdup.i16 q2, r0\n"

            // This is where we end.
            "add r0, %1, %2\n"

            // Okay, do the memory hashing.
            "QuickTexHashNEON_next:\n"
            "pld [%2, #0xc0]\n"
            "vldmia %2!, {d16-d23}\n"
            "vmla.i16 q0, q1, q8\n"
            "vmul.i16 q11, q11, q1\n"
            "veor.i32 q0, q0, q9\n"
            "cmp %2, r0\n"
            "vadd.i32 q0, q0, q10\n"
            "vadd.i16 q1, q1, q2\n"
            "veor.i32 q0, q0, q11\n"
            "blo QuickTexHashNEON_next\n"

            // Now let's get the result.
            "vadd.i32 q0, q0, q1\n"
            "vadd.i32 d0, d0, d1\n"
            "vmov r0, r1, d0\n"
            "add %0, r0, r1\n"

            : "=r"(check)
            : "r"(size), "r"(checkp)
            : "r0", "r1", "d0", "d1", "d2", "d3", "d4", "d5", "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23", "cc"
            );
#endif
    } else {
        const u32 size_u32 = size / 4;
        const u32 *p = (const u32 *)checkp;
        for (u32 i = 0; i < size_u32; i += 4) {
            check += p[i + 0];
            check ^= p[i + 1];
            check += p[i + 2];
            check ^= p[i + 3];
        }
    }

    return check;
}

#endif // PPSSPP_ARCH(ARM_NEON)

#if PPSSPP_ARCH(LOONGARCH64_LSX)

alignas(16) static const u16 QuickTexHashInitial[8] = { 0xc00bU, 0x9bd9U, 0x4b73U, 0xb651U, 0x4d9bU, 0x4309U, 0x0083U, 0x0001U };

static u32 QuickTexHashLSX(const void *checkp, u32 size) {
    u32 check = 0;

    if (((intptr_t)checkp & 0xf) == 0 && (size & 0x3f) == 0) {
        __m128i cursor = __lsx_vrepli_d(0);
        __m128i cursor2 = __lsx_vld(QuickTexHashInitial, 0);
        __m128i update = __lsx_vreplgr2vr_h(0x2455U);
        const __m128i *p = (const __m128i *)checkp;
        for (u32 i = 0; i < size / 16; i += 4) {
            __m128i chunk = __lsx_vmul_h(__lsx_vld(&p[i], 0), cursor2);
            cursor = __lsx_vadd_h(cursor, chunk);
            cursor = __lsx_vxor_v(cursor, __lsx_vld(&p[i + 1], 0));
            cursor = __lsx_vadd_w(cursor, __lsx_vld(&p[i + 2], 0));
            chunk = __lsx_vmul_h(__lsx_vld(&p[i + 3], 0), cursor2);
            cursor = __lsx_vxor_v(cursor, chunk);
            cursor2 = __lsx_vadd_h(cursor2, update);
        }
        cursor = __lsx_vadd_w(cursor, cursor2);
        // Add the four parts into the low i32.
        cursor = __lsx_vadd_w(cursor, __lsx_vbsrl_v(cursor, 8));
        cursor = __lsx_vadd_w(cursor, __lsx_vbsrl_v(cursor, 4));
        check = __lsx_vpickve2gr_w(cursor, 0);
    } else {
        const u32 *p = (const u32 *)checkp;
        for (u32 i = 0; i < size / 8; ++i) {
            check += *p++;
            check ^= *p++;
        }
    }

    return check;
}

#endif // PPSSPP_ARCH(LOONGARCH64_LSX)

// Masks to downalign bufw to 16 bytes, and wrap at 2048.
static const u32 textureAlignMask16[16] = {
    0x7FF & ~(((8 * 16) / 16) - 1), //GE_TFMT_5650,
    0x7FF & ~(((8 * 16) / 16) - 1), //GE_TFMT_5551,
    0x7FF & ~(((8 * 16) / 16) - 1), //GE_TFMT_4444,
    0x7FF & ~(((8 * 16) / 32) - 1), //GE_TFMT_8888,
    0x7FF & ~(((8 * 16) / 4) - 1),  //GE_TFMT_CLUT4,
    0x7FF & ~(((8 * 16) / 8) - 1),  //GE_TFMT_CLUT8,
    0x7FF & ~(((8 * 16) / 16) - 1), //GE_TFMT_CLUT16,
    0x7FF & ~(((8 * 16) / 32) - 1), //GE_TFMT_CLUT32,
    0x7FF, //GE_TFMT_DXT1,
    0x7FF, //GE_TFMT_DXT3,
    0x7FF, //GE_TFMT_DXT5,
    0, // INVALID,
    0, // INVALID,
    0, // INVALID,
    0, // INVALID,
    0, // INVALID,
};

u32 GetTextureBufw(int level, u32 texaddr, GETextureFormat format) {
    // This is a hack to allow us to draw the huge PPGe texture, which is always in kernel RAM.
    if (texaddr >= PSP_GetKernelMemoryBase() && texaddr < PSP_GetKernelMemoryEnd())
        return gstate.texbufwidth[level] & 0x1FFF;

    u32 bufw = gstate.texbufwidth[level] & textureAlignMask16[format];
    if (bufw == 0 && format <= GE_TFMT_DXT5) {
        // If it's less than 16 bytes, use 16 bytes.
        bufw = (8 * 16) / textureBitsPerPixel[format];
    }
    return bufw;
}
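
// Worked example (illustration added here, not from the original source): for GE_TFMT_CLUT4,
// (8 * 16) / 4 = 32, so the mask above is 0x7FF & ~31 = 0x7E0. A raw texbufwidth of 100 pixels
// is wrapped to 11 bits and rounded down to 96, a multiple of 32 pixels (16 bytes at 4 bpp).
// If the masked value comes out as 0, the fallback in GetTextureBufw picks the 16-byte minimum,
// (8 * 16) / textureBitsPerPixel[format] = 32 pixels for CLUT4.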

// Matches QuickTexHashNEON/SSE, see #7029.
static u32 QuickTexHashNonSSE(const void *checkp, u32 size) {
    u32 check = 0;

    if (((intptr_t)checkp & 0xf) == 0 && (size & 0x3f) == 0) {
        static const u16 cursor2_initial[8] = {0xc00bU, 0x9bd9U, 0x4b73U, 0xb651U, 0x4d9bU, 0x4309U, 0x0083U, 0x0001U};
        union u32x4_u16x8 {
#if defined(__GNUC__)
            uint32_t x32 __attribute__((vector_size(16)));
            uint16_t x16 __attribute__((vector_size(16)));
#else
            u32 x32[4];
            u16 x16[8];
#endif
        };
        u32x4_u16x8 cursor{};
        u32x4_u16x8 cursor2;
        static const u16 update[8] = {0x2455U, 0x2455U, 0x2455U, 0x2455U, 0x2455U, 0x2455U, 0x2455U, 0x2455U};

        for (u32 j = 0; j < 8; ++j) {
            cursor2.x16[j] = cursor2_initial[j];
        }

        const u32x4_u16x8 *p = (const u32x4_u16x8 *)checkp;
        for (u32 i = 0; i < size / 16; i += 4) {
            for (u32 j = 0; j < 8; ++j) {
                const u16 temp = p[i + 0].x16[j] * cursor2.x16[j];
                cursor.x16[j] += temp;
            }
            for (u32 j = 0; j < 4; ++j) {
                cursor.x32[j] ^= p[i + 1].x32[j];
                cursor.x32[j] += p[i + 2].x32[j];
            }
            for (u32 j = 0; j < 8; ++j) {
                const u16 temp = p[i + 3].x16[j] * cursor2.x16[j];
                cursor.x16[j] ^= temp;
            }
            for (u32 j = 0; j < 8; ++j) {
                cursor2.x16[j] += update[j];
            }
        }

        for (u32 j = 0; j < 4; ++j) {
            cursor.x32[j] += cursor2.x32[j];
        }
        check = cursor.x32[0] + cursor.x32[1] + cursor.x32[2] + cursor.x32[3];
    } else {
        const u32 *p = (const u32 *)checkp;
        for (u32 i = 0; i < size / 8; ++i) {
            check += *p++;
            check ^= *p++;
        }
    }

    return check;
}

u32 StableQuickTexHash(const void *checkp, u32 size) {
#if defined(_M_SSE)
    return QuickTexHashSSE2(checkp, size);
#elif PPSSPP_ARCH(ARM_NEON)
    return QuickTexHashNEON(checkp, size);
#elif PPSSPP_ARCH(LOONGARCH64_LSX)
    return QuickTexHashLSX(checkp, size);
#else
    return QuickTexHashNonSSE(checkp, size);
#endif
}
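
// Usage sketch (illustrative; the helper below is hypothetical and not part of this file).
// The SSE2, NEON, LSX and scalar variants above are kept bit-for-bit compatible (see the
// #7029 note), so a hash computed on one platform can be compared with one computed on
// another. The fast SIMD path is only taken for 16-byte-aligned pointers and sizes that are
// a multiple of 64 bytes; other inputs fall back to the simple add/xor loop.
#if 0
static u32 HashTextureDataExample(const u8 *texData, u32 sizeInBytes) {
    // Same value regardless of which of the four implementations was compiled in.
    return StableQuickTexHash(texData, sizeInBytes);
}
#endif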

void DoSwizzleTex16(const u32 *ysrcp, u8 *texptr, int bxc, int byc, u32 pitch) {
    // ysrcp is in 32-bit units, so this is convenient.
    const u32 pitchBy32 = pitch >> 2;
#ifdef _M_SSE
    if (((uintptr_t)ysrcp & 0xF) == 0 && (pitch & 0xF) == 0) {
        __m128i *dest = (__m128i *)texptr;
        // The pitch parameter is in bytes, so shift down for 128-bit.
        // Note: it's always aligned to 16 bytes, so this is safe.
        const u32 pitchBy128 = pitch >> 4;
        for (int by = 0; by < byc; by++) {
            const __m128i *xsrc = (const __m128i *)ysrcp;
            for (int bx = 0; bx < bxc; bx++) {
                const __m128i *src = xsrc;
                for (int n = 0; n < 2; n++) {
                    // Textures are always 16-byte aligned so this is fine.
                    __m128i temp1 = _mm_load_si128(src);
                    src += pitchBy128;
                    __m128i temp2 = _mm_load_si128(src);
                    src += pitchBy128;
                    __m128i temp3 = _mm_load_si128(src);
                    src += pitchBy128;
                    __m128i temp4 = _mm_load_si128(src);
                    src += pitchBy128;

                    _mm_store_si128(dest, temp1);
                    _mm_store_si128(dest + 1, temp2);
                    _mm_store_si128(dest + 2, temp3);
                    _mm_store_si128(dest + 3, temp4);
                    dest += 4;
                }
                xsrc++;
            }
            ysrcp += pitchBy32 * 8;
        }
    } else
#endif
    {
        u32 *dest = (u32 *)texptr;
        for (int by = 0; by < byc; by++) {
            const u32 *xsrc = ysrcp;
            for (int bx = 0; bx < bxc; bx++) {
                const u32 *src = xsrc;
                for (int n = 0; n < 8; n++) {
                    memcpy(dest, src, 16);
                    src += pitchBy32;
                    dest += 4;
                }
                xsrc += 4;
            }
            ysrcp += pitchBy32 * 8;
        }
    }
}

void DoUnswizzleTex16(const u8 *texptr, u32 *ydestp, int bxc, int byc, u32 pitch) {
    // ydestp is in 32-bit units, so this is convenient.
    const u32 pitchBy32 = pitch >> 2;

#ifdef _M_SSE
    // This check is pretty much a given, right?
    if (((uintptr_t)ydestp & 0xF) == 0 && (pitch & 0xF) == 0) {
        const __m128i *src = (const __m128i *)texptr;
        // The pitch parameter is in bytes, so shift down for 128-bit.
        // Note: it's always aligned to 16 bytes, so this is safe.
        const u32 pitchBy128 = pitch >> 4;
        for (int by = 0; by < byc; by++) {
            __m128i *xdest = (__m128i *)ydestp;
            for (int bx = 0; bx < bxc; bx++) {
                __m128i *dest = xdest;
                for (int n = 0; n < 2; n++) {
                    // Textures are always 16-byte aligned so this is fine.
                    __m128i temp1 = _mm_load_si128(src);
                    __m128i temp2 = _mm_load_si128(src + 1);
                    __m128i temp3 = _mm_load_si128(src + 2);
                    __m128i temp4 = _mm_load_si128(src + 3);
                    _mm_store_si128(dest, temp1);
                    dest += pitchBy128;
                    _mm_store_si128(dest, temp2);
                    dest += pitchBy128;
                    _mm_store_si128(dest, temp3);
                    dest += pitchBy128;
                    _mm_store_si128(dest, temp4);
                    dest += pitchBy128;
                    src += 4;
                }
                xdest++;
            }
            ydestp += pitchBy32 * 8;
        }
    } else
#elif PPSSPP_ARCH(ARM_NEON)
    if (((uintptr_t)ydestp & 0xF) == 0 && (pitch & 0xF) == 0) {
        const u32 *src = (const u32 *)texptr;
        for (int by = 0; by < byc; by++) {
            u32 *xdest = ydestp;
            for (int bx = 0; bx < bxc; bx++) {
                u32 *dest = xdest;
                for (int n = 0; n < 2; n++) {
                    // Textures are always 16-byte aligned so this is fine.
                    uint32x4_t temp1 = vld1q_u32(src);
                    uint32x4_t temp2 = vld1q_u32(src + 4);
                    uint32x4_t temp3 = vld1q_u32(src + 8);
                    uint32x4_t temp4 = vld1q_u32(src + 12);
                    vst1q_u32(dest, temp1);
                    dest += pitchBy32;
                    vst1q_u32(dest, temp2);
                    dest += pitchBy32;
                    vst1q_u32(dest, temp3);
                    dest += pitchBy32;
                    vst1q_u32(dest, temp4);
                    dest += pitchBy32;
                    src += 16;
                }
                xdest += 4;
            }
            ydestp += pitchBy32 * 8;
        }
    } else
#endif
    {
        const u32 *src = (const u32 *)texptr;
        for (int by = 0; by < byc; by++) {
            u32 *xdest = ydestp;
            for (int bx = 0; bx < bxc; bx++) {
                u32 *dest = xdest;
                for (int n = 0; n < 8; n++) {
                    memcpy(dest, src, 16);
                    dest += pitchBy32;
                    src += 4;
                }
                xdest += 4;
            }
            ydestp += pitchBy32 * 8;
        }
    }
}
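
// Illustrative sketch (an assumption about typical call sites, not code from this file):
// a PSP swizzle block is 16 bytes wide and 8 rows tall, so callers generally derive the
// block counts from the row size in bytes and the texture height. The helper name and
// parameters below are hypothetical.
#if 0
static void UnswizzleLevelExample(const u8 *texptr, u32 *dest, int bufwPixels, int height, int bitsPerPixel) {
    const u32 rowBytes = (u32)(bufwPixels * bitsPerPixel) / 8;  // pitch in bytes, assumed 16-byte aligned
    const int bxc = (int)(rowBytes / 16);                       // 16-byte-wide blocks per block row
    const int byc = (height + 7) / 8;                           // 8-row-tall block rows
    DoUnswizzleTex16(texptr, dest, bxc, byc, rowBytes);
}
#endif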

// S3TC / DXT Decoder
class DXTDecoder {
public:
    inline void DecodeColors(const DXT1Block *src, bool ignore1bitAlpha);
    inline void DecodeAlphaDXT5(const DXT5Block *src);
    inline void WriteColorsDXT1(u32 *dst, const DXT1Block *src, int pitch, int width, int height);
    inline void WriteColorsDXT3(u32 *dst, const DXT3Block *src, int pitch, int width, int height);
    inline void WriteColorsDXT5(u32 *dst, const DXT5Block *src, int pitch, int width, int height);

    bool AnyNonFullAlpha() const { return anyNonFullAlpha_; }

protected:
    u32 colors_[4];
    u8 alpha_[8];
    bool alphaMode_ = false;
    bool anyNonFullAlpha_ = false;
};

static inline u32 makecol(int r, int g, int b, int a) {
    return (a << 24) | (b << 16) | (g << 8) | r;
}

static inline int mix_2_3(int c1, int c2) {
    return (c1 + c1 + c2) / 3;
}

// This could probably be done faster by decoding two or four blocks at a time with SSE/NEON.
void DXTDecoder::DecodeColors(const DXT1Block *src, bool ignore1bitAlpha) {
    u16 c1 = src->color1;
    u16 c2 = src->color2;
    int blue1 = (c1 << 3) & 0xF8;
    int blue2 = (c2 << 3) & 0xF8;
    int green1 = (c1 >> 3) & 0xFC;
    int green2 = (c2 >> 3) & 0xFC;
    int red1 = (c1 >> 8) & 0xF8;
    int red2 = (c2 >> 8) & 0xF8;

    // Keep alpha zero for non-DXT1 to skip masking the colors.
    int alpha = ignore1bitAlpha ? 0 : 255;

    colors_[0] = makecol(red1, green1, blue1, alpha);
    colors_[1] = makecol(red2, green2, blue2, alpha);
    if (c1 > c2) {
        colors_[2] = makecol(mix_2_3(red1, red2), mix_2_3(green1, green2), mix_2_3(blue1, blue2), alpha);
        colors_[3] = makecol(mix_2_3(red2, red1), mix_2_3(green2, green1), mix_2_3(blue2, blue1), alpha);
    } else {
        // Average - these are always left shifted, so no need to worry about ties.
        int red3 = (red1 + red2) / 2;
        int green3 = (green1 + green2) / 2;
        int blue3 = (blue1 + blue2) / 2;
        colors_[2] = makecol(red3, green3, blue3, alpha);
        colors_[3] = makecol(0, 0, 0, 0);
        if (alpha == 255) {
            alphaMode_ = true;
        }
    }
}
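
// Worked example (illustration only, restating the math above): for color1 = 0x07E0 (pure
// green in RGB565), blue1 = (0x07E0 << 3) & 0xF8 = 0x00, green1 = (0x07E0 >> 3) & 0xFC = 0xFC,
// and red1 = (0x07E0 >> 8) & 0xF8 = 0x00. The low bits are not replicated into the result, so
// a fully saturated 565 channel decodes to 0xF8/0xFC here rather than 0xFF.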

static inline u8 lerp8(const DXT5Block *src, int n) {
    // These weights multiply alpha1/alpha2 up to 8.8 fixed point.
    int alpha1 = (src->alpha1 * ((7 - n) << 8)) / 7;
    int alpha2 = (src->alpha2 * (n << 8)) / 7;
    return (u8)((alpha1 + alpha2 + 31) >> 8);
}

static inline u8 lerp6(const DXT5Block *src, int n) {
    int alpha1 = (src->alpha1 * ((5 - n) << 8)) / 5;
    int alpha2 = (src->alpha2 * (n << 8)) / 5;
    return (u8)((alpha1 + alpha2 + 31) >> 8);
}
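
// Equivalently (restating the math above, not new behavior): lerp8(src, n) is roughly
// (alpha1 * (7 - n) + alpha2 * n) / 7 and lerp6(src, n) is roughly
// (alpha1 * (5 - n) + alpha2 * n) / 5, computed in 8.8 fixed point with +31 (about +0.12)
// applied as a small rounding bias before the final shift down by 8.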

void DXTDecoder::DecodeAlphaDXT5(const DXT5Block *src) {
    alpha_[0] = src->alpha1;
    alpha_[1] = src->alpha2;
    if (alpha_[0] > alpha_[1]) {
        alpha_[2] = lerp8(src, 1);
        alpha_[3] = lerp8(src, 2);
        alpha_[4] = lerp8(src, 3);
        alpha_[5] = lerp8(src, 4);
        alpha_[6] = lerp8(src, 5);
        alpha_[7] = lerp8(src, 6);
    } else {
        alpha_[2] = lerp6(src, 1);
        alpha_[3] = lerp6(src, 2);
        alpha_[4] = lerp6(src, 3);
        alpha_[5] = lerp6(src, 4);
        alpha_[6] = 0;
        alpha_[7] = 255;
    }
}

void DXTDecoder::WriteColorsDXT1(u32 *dst, const DXT1Block *src, int pitch, int width, int height) {
    bool anyColor3 = false;
    for (int y = 0; y < height; y++) {
        int colordata = src->lines[y];
        for (int x = 0; x < width; x++) {
            int col = colordata & 3;
            if (col == 3) {
                anyColor3 = true;
            }
            dst[x] = colors_[col];
            colordata >>= 2;
        }
        dst += pitch;
    }

    if (alphaMode_ && anyColor3) {
        anyNonFullAlpha_ = true;
    }
}

void DXTDecoder::WriteColorsDXT3(u32 *dst, const DXT3Block *src, int pitch, int width, int height) {
    for (int y = 0; y < height; y++) {
        int colordata = src->color.lines[y];
        u32 alphadata = src->alphaLines[y];
        for (int x = 0; x < width; x++) {
            dst[x] = colors_[colordata & 3] | (alphadata << 28);
            colordata >>= 2;
            alphadata >>= 4;
        }
        dst += pitch;
    }
}

void DXTDecoder::WriteColorsDXT5(u32 *dst, const DXT5Block *src, int pitch, int width, int height) {
    // 48 bits, 3 bit index per pixel, 12 bits per line.
    u64 allAlpha = ((u64)(u16)src->alphadata1 << 32) | (u32)src->alphadata2;

    for (int y = 0; y < height; y++) {
        uint32_t colordata = src->color.lines[y];
        uint32_t alphadata = allAlpha >> (12 * y);
        for (int x = 0; x < width; x++) {
            dst[x] = colors_[colordata & 3] | (alpha_[alphadata & 7] << 24);
            colordata >>= 2;
            alphadata >>= 3;
        }
        dst += pitch;
    }
}

uint32_t GetDXTTexelColor(const DXT1Block *src, int x, int y, int alpha) {
    _dbg_assert_(x >= 0 && x < 4);
    _dbg_assert_(y >= 0 && y < 4);

    uint16_t c1 = src->color1;
    uint16_t c2 = src->color2;
    int blue1 = (c1 << 3) & 0xF8;
    int blue2 = (c2 << 3) & 0xF8;
    int green1 = (c1 >> 3) & 0xFC;
    int green2 = (c2 >> 3) & 0xFC;
    int red1 = (c1 >> 8) & 0xF8;
    int red2 = (c2 >> 8) & 0xF8;

    int colorIndex = (src->lines[y] >> (x * 2)) & 3;
    if (colorIndex == 0) {
        return makecol(red1, green1, blue1, alpha);
    } else if (colorIndex == 1) {
        return makecol(red2, green2, blue2, alpha);
    } else if (c1 > c2) {
        if (colorIndex == 2) {
            return makecol(mix_2_3(red1, red2), mix_2_3(green1, green2), mix_2_3(blue1, blue2), alpha);
        }
        return makecol(mix_2_3(red2, red1), mix_2_3(green2, green1), mix_2_3(blue2, blue1), alpha);
    } else if (colorIndex == 3) {
        return makecol(0, 0, 0, 0);
    }

    // Average - these are always left shifted, so no need to worry about ties.
    int red3 = (red1 + red2) / 2;
    int green3 = (green1 + green2) / 2;
    int blue3 = (blue1 + blue2) / 2;
    return makecol(red3, green3, blue3, alpha);
}

uint32_t GetDXT1Texel(const DXT1Block *src, int x, int y) {
    return GetDXTTexelColor(src, x, y, 255);
}

uint32_t GetDXT3Texel(const DXT3Block *src, int x, int y) {
    uint32_t color = GetDXTTexelColor(&src->color, x, y, 0);
    u32 alpha = (src->alphaLines[y] >> (x * 4)) & 0xF;
    return color | (alpha << 28);
}

uint32_t GetDXT5Texel(const DXT5Block *src, int x, int y) {
    uint32_t color = GetDXTTexelColor(&src->color, x, y, 0);
    uint64_t alphadata = ((uint64_t)(uint16_t)src->alphadata1 << 32) | (uint32_t)src->alphadata2;
    int alphaIndex = (alphadata >> (y * 12 + x * 3)) & 7;

    if (alphaIndex == 0) {
        return color | (src->alpha1 << 24);
    } else if (alphaIndex == 1) {
        return color | (src->alpha2 << 24);
    } else if (src->alpha1 > src->alpha2) {
        return color | (lerp8(src, alphaIndex - 1) << 24);
    } else if (alphaIndex == 6) {
        return color;
    } else if (alphaIndex == 7) {
        return color | 0xFF000000;
    }
    return color | (lerp6(src, alphaIndex - 1) << 24);
}

// This could probably be done faster by decoding two or four blocks at a time with SSE/NEON.
void DecodeDXT1Block(u32 *dst, const DXT1Block *src, int pitch, int width, int height, u32 *alpha) {
    DXTDecoder dxt;
    dxt.DecodeColors(src, false);
    dxt.WriteColorsDXT1(dst, src, pitch, width, height);
    *alpha &= dxt.AnyNonFullAlpha() ? 0 : 1;
}

void DecodeDXT3Block(u32 *dst, const DXT3Block *src, int pitch, int width, int height) {
    DXTDecoder dxt;
    dxt.DecodeColors(&src->color, true);
    dxt.WriteColorsDXT3(dst, src, pitch, width, height);
}

void DecodeDXT5Block(u32 *dst, const DXT5Block *src, int pitch, int width, int height) {
    DXTDecoder dxt;
    dxt.DecodeColors(&src->color, true);
    dxt.DecodeAlphaDXT5(src);
    dxt.WriteColorsDXT5(dst, src, pitch, width, height);
}
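
// Illustrative sketch (an assumption about how these block decoders might be driven; the
// helper below is hypothetical and not part of this file): decoding a whole DXT1 level by
// walking its 4x4 blocks, clamping the last block row/column for non-multiple-of-4 sizes.
#if 0
static void DecodeDXT1LevelExample(u32 *dst, const DXT1Block *blocks, int width, int height, int pitchInPixels) {
    u32 everyPixelFullAlpha = 1;  // cleared by DecodeDXT1Block if any decoded texel is transparent
    const int blocksWide = (width + 3) / 4;
    for (int y = 0; y < height; y += 4) {
        for (int x = 0; x < width; x += 4) {
            const DXT1Block *blk = blocks + (y / 4) * blocksWide + (x / 4);
            const int w = (width - x) < 4 ? (width - x) : 4;
            const int h = (height - y) < 4 ? (height - y) : 4;
            DecodeDXT1Block(dst + y * pitchInPixels + x, blk, pitchInPixels, w, h, &everyPixelFullAlpha);
        }
    }
}
#endif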

#ifdef _M_SSE
inline u32 SSEReduce32And(__m128i value) {
    value = _mm_and_si128(value, _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 0, 3, 2)));
    value = _mm_and_si128(value, _mm_shuffle_epi32(value, _MM_SHUFFLE(1, 1, 1, 1)));
    return _mm_cvtsi128_si32(value);
}
inline u32 SSEReduce16And(__m128i value) {
    u32 mask = SSEReduce32And(value);
    return mask & (mask >> 16);
}
#endif

#if PPSSPP_ARCH(ARM_NEON)
inline u32 NEONReduce32And(uint32x4_t value) {
    // TODO: Maybe a shuffle and a vector AND, or something?
    return vgetq_lane_u32(value, 0) & vgetq_lane_u32(value, 1) & vgetq_lane_u32(value, 2) & vgetq_lane_u32(value, 3);
}
inline u32 NEONReduce16And(uint16x8_t value) {
    uint32x4_t value32 = vreinterpretq_u32_u16(value);
    // TODO: Maybe a shuffle and a vector AND, or something?
    u32 mask = vgetq_lane_u32(value32, 0) & vgetq_lane_u32(value32, 1) & vgetq_lane_u32(value32, 2) & vgetq_lane_u32(value32, 3);
    return mask & (mask >> 16);
}
#endif

// TODO: SSE/SIMD
// At least on x86, the compiler actually SIMDs these pretty well.
void CopyAndSumMask16(u16 *dst, const u16 *src, int width, u32 *outMask) {
    u16 mask = 0xFFFF;
#ifdef _M_SSE
    if (width >= 8) {
        __m128i wideMask = _mm_set1_epi32(0xFFFFFFFF);
        while (width >= 8) {
            __m128i color = _mm_loadu_si128((__m128i *)src);
            wideMask = _mm_and_si128(wideMask, color);
            _mm_storeu_si128((__m128i *)dst, color);
            src += 8;
            dst += 8;
            width -= 8;
        }
        mask = SSEReduce16And(wideMask);
    }
#elif PPSSPP_ARCH(ARM_NEON)
    if (width >= 8) {
        uint16x8_t wideMask = vdupq_n_u16(0xFFFF);
        while (width >= 8) {
            uint16x8_t colors = vld1q_u16(src);
            wideMask = vandq_u16(wideMask, colors);
            vst1q_u16(dst, colors);
            src += 8;
            dst += 8;
            width -= 8;
        }
        mask = NEONReduce16And(wideMask);
    }
#endif

    DO_NOT_VECTORIZE_LOOP
    for (int i = 0; i < width; i++) {
        u16 color = src[i];
        mask &= color;
        dst[i] = color;
    }
    *outMask &= (u32)mask;
}

// Used in video playback, so it's nice for this to be fast.
void CopyAndSumMask32(u32 *dst, const u32 *src, int width, u32 *outMask) {
    u32 mask = 0xFFFFFFFF;
#ifdef _M_SSE
    if (width >= 4) {
        __m128i wideMask = _mm_set1_epi32(0xFFFFFFFF);
        while (width >= 4) {
            __m128i color = _mm_loadu_si128((__m128i *)src);
            wideMask = _mm_and_si128(wideMask, color);
            _mm_storeu_si128((__m128i *)dst, color);
            src += 4;
            dst += 4;
            width -= 4;
        }
        mask = SSEReduce32And(wideMask);
    }
#elif PPSSPP_ARCH(ARM_NEON)
    if (width >= 4) {
        uint32x4_t wideMask = vdupq_n_u32(0xFFFFFFFF);
        while (width >= 4) {
            uint32x4_t colors = vld1q_u32(src);
            wideMask = vandq_u32(wideMask, colors);
            vst1q_u32(dst, colors);
            src += 4;
            dst += 4;
            width -= 4;
        }
        mask = NEONReduce32And(wideMask);
    }
#endif

    DO_NOT_VECTORIZE_LOOP
    for (int i = 0; i < width; i++) {
        u32 color = src[i];
        mask &= color;
        dst[i] = color;
    }
    *outMask &= (u32)mask;
}

void CheckMask16(const u16 *src, int width, u32 *outMask) {
    u16 mask = 0xFFFF;
#ifdef _M_SSE
    if (width >= 8) {
        __m128i wideMask = _mm_set1_epi32(0xFFFFFFFF);
        while (width >= 8) {
            wideMask = _mm_and_si128(wideMask, _mm_loadu_si128((__m128i *)src));
            src += 8;
            width -= 8;
        }
        mask = SSEReduce16And(wideMask);
    }
#elif PPSSPP_ARCH(ARM_NEON)
    if (width >= 8) {
        uint16x8_t wideMask = vdupq_n_u16(0xFFFF);
        while (width >= 8) {
            wideMask = vandq_u16(wideMask, vld1q_u16(src));
            src += 8;
            width -= 8;
        }
        mask = NEONReduce16And(wideMask);
    }
#endif

    DO_NOT_VECTORIZE_LOOP
    for (int i = 0; i < width; i++) {
        mask &= src[i];
    }
    *outMask &= (u32)mask;
}

void CheckMask32(const u32 *src, int width, u32 *outMask) {
    u32 mask = 0xFFFFFFFF;
#ifdef _M_SSE
    if (width >= 4) {
        __m128i wideMask = _mm_set1_epi32(0xFFFFFFFF);
        while (width >= 4) {
            wideMask = _mm_and_si128(wideMask, _mm_loadu_si128((__m128i *)src));
            src += 4;
            width -= 4;
        }
        mask = SSEReduce32And(wideMask);
    }
#elif PPSSPP_ARCH(ARM_NEON)
    if (width >= 4) {
        uint32x4_t wideMask = vdupq_n_u32(0xFFFFFFFF);
        while (width >= 4) {
            wideMask = vandq_u32(wideMask, vld1q_u32(src));
            src += 4;
            width -= 4;
        }
        mask = NEONReduce32And(wideMask);
    }
#endif

    DO_NOT_VECTORIZE_LOOP
    for (int i = 0; i < width; i++) {
        mask &= src[i];
    }
    *outMask &= (u32)mask;
}
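
// Usage sketch (assumption; the helper name and the way the mask is interpreted here are
// illustrative, not taken from this file): outMask accumulates a bitwise AND of every pixel
// scanned, so after walking an 8888 image a top byte of 0xFF means the alpha channel was
// fully opaque everywhere.
#if 0
static bool IsFullAlpha8888Example(const u32 *pixels, int width, int numRows, int strideInPixels) {
    u32 mask = 0xFFFFFFFF;
    for (int y = 0; y < numRows; y++) {
        CheckMask32(pixels + y * strideInPixels, width, &mask);
    }
    return (mask & 0xFF000000) == 0xFF000000;
}
#endif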