// Copyright (c) 2015- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.

#include "ppsspp_config.h"
#include "Common/Data/Convert/ColorConv.h"
#include "Common/Data/Convert/SmallDataConvert.h"
#include "Common/Common.h"
#include "Common/CPUDetect.h"
#include "Common/Math/SIMDHeaders.h"

void ConvertBGRA8888ToRGBA8888(u32 *dst, const u32 *src, u32 numPixels) {
#if PPSSPP_ARCH(SSE2)
	const __m128i maskGA = _mm_set1_epi32(0xFF00FF00);

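	// In little-endian u32 form, BGRA8888 reads 0xAARRGGBB and RGBA8888 reads
	// 0xAABBGGRR: G and A (selected by maskGA) stay in place, while the
	// andnot'd R/B pair is swapped by shifting 16 bits each way below.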
	const __m128i *srcp = (const __m128i *)src;
	__m128i *dstp = (__m128i *)dst;
	u32 sseChunks = numPixels / 4;
	if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF)) {
		sseChunks = 0;
	}
	for (u32 i = 0; i < sseChunks; ++i) {
		__m128i c = _mm_load_si128(&srcp[i]);
		__m128i rb = _mm_andnot_si128(maskGA, c);
		c = _mm_and_si128(c, maskGA);

		__m128i b = _mm_srli_epi32(rb, 16);
		__m128i r = _mm_slli_epi32(rb, 16);
		c = _mm_or_si128(_mm_or_si128(c, r), b);
		_mm_store_si128(&dstp[i], c);
	}
	// The remainder starts right after those done via SSE.
	u32 i = sseChunks * 4;
#else
	u32 i = 0;
#endif
	for (; i < numPixels; i++) {
		const u32 c = src[i];
		dst[i] = ((c >> 16) & 0x000000FF) |
			(c & 0xFF00FF00) |
			((c << 16) & 0x00FF0000);
	}
}

void ConvertBGRA8888ToRGB888(u8 *dst, const u32 *src, u32 numPixels) {
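	// In a little-endian u32, BGRA8888 reads 0xAARRGGBB, so R sits in bits 16-23.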
	for (uint32_t x = 0; x < numPixels; ++x) {
		uint32_t c = src[x];
		dst[x * 3 + 0] = (c >> 16) & 0xFF;
		dst[x * 3 + 1] = (c >> 8) & 0xFF;
		dst[x * 3 + 2] = (c >> 0) & 0xFF;
	}
}

#if PPSSPP_ARCH(SSE2)
// fp64's improved SSE2 version, see #19751. SSE4 no longer required here.
static inline void ConvertRGBA8888ToRGBA5551(__m128i *dstp, const __m128i *srcp, u32 sseChunks) {
	const __m128i maskRB = _mm_set1_epi32(0x00F800F8);
	const __m128i maskGA = _mm_set1_epi32(0x8000F800);
	const __m128i mulRB = _mm_set1_epi32(0x04000001);
	const __m128i mulGA = _mm_set1_epi32(0x00400001);

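	// _mm_madd_epi16 computes hi16 * mulHi + lo16 * mulLo per 32-bit lane, so
	// it merges each lane's two halves in a single op: with mulRB = 0x04000001
	// the result is (B << 10) | R, and with mulGA = 0x00400001 it is
	// (A << 10) | G (A arrives pre-shifted by 4 from the >> 11).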
	for (u32 i = 0; i < sseChunks; i += 2) {
		__m128i c0 = _mm_load_si128(&srcp[i + 0]);
		__m128i c1 = _mm_load_si128(&srcp[i + 1]);

		__m128i rb0 = _mm_and_si128(c0, maskRB);  // 00000000bbbbb00000000000rrrrr000 (each 32-bit lane)
		__m128i rb1 = _mm_and_si128(c1, maskRB);  // 00000000bbbbb00000000000rrrrr000
		__m128i ga0 = _mm_and_si128(c0, maskGA);  // a000000000000000ggggg00000000000
		__m128i ga1 = _mm_and_si128(c1, maskGA);  // a000000000000000ggggg00000000000
		rb0 = _mm_madd_epi16(_mm_srli_epi32(rb0, 3), mulRB);   // 00000000000000000bbbbb00000rrrrr
		rb1 = _mm_madd_epi16(_mm_srli_epi32(rb1, 3), mulRB);   // 00000000000000000bbbbb00000rrrrr
		ga0 = _mm_madd_epi16(_mm_srli_epi32(ga0, 11), mulGA);  // 000000000000000000000a00000ggggg
		ga1 = _mm_madd_epi16(_mm_srli_epi32(ga1, 11), mulGA);  // 000000000000000000000a00000ggggg
		__m128i rb = _mm_packs_epi32(rb0, rb1);
		__m128i ga = _mm_slli_epi32(_mm_packs_epi32(ga0, ga1), 5);

		_mm_store_si128(&dstp[i / 2], _mm_or_si128(ga, rb));
	}
}
#endif

void ConvertRGBA8888ToRGBA5551(u16 *dst, const u32 *src, u32 numPixels) {
#if PPSSPP_ARCH(SSE2)
	const __m128i *srcp = (const __m128i *)src;
	__m128i *dstp = (__m128i *)dst;
	u32 sseChunks = (numPixels / 4) & ~1;
	// The SSE2 helper above processes chunks in pairs, so keep the count even.
	if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF)) {
		sseChunks = 0;
	} else {
		ConvertRGBA8888ToRGBA5551(dstp, srcp, sseChunks);
	}

	// The remainder starts right after those done via SSE.
	u32 i = sseChunks * 4;
#else
	u32 i = 0;
#endif
	for (; i < numPixels; i++) {
		dst[i] = RGBA8888toRGBA5551(src[i]);
	}
}

#if PPSSPP_ARCH(SSE2)
/*
#if defined(__GNUC__) || defined(__clang__) || defined(__INTEL_COMPILER)
[[gnu::target("sse4.1")]]
#endif
*/
static inline void ConvertBGRA8888ToRGBA5551(__m128i *dstp, const __m128i *srcp, u32 sseChunks) {
	const __m128i maskAG = _mm_set1_epi32(0x8000F800);
	const __m128i maskRB = _mm_set1_epi32(0x00F800F8);
	const __m128i mask = _mm_set1_epi32(0x0000FFFF);

	for (u32 i = 0; i < sseChunks; i += 2) {
		__m128i c1 = _mm_load_si128(&srcp[i + 0]);
		__m128i c2 = _mm_load_si128(&srcp[i + 1]);
		__m128i ag, rb;

		ag = _mm_and_si128(c1, maskAG);
		ag = _mm_or_si128(_mm_srli_epi32(ag, 16), _mm_srli_epi32(ag, 6));
		rb = _mm_and_si128(c1, maskRB);
		rb = _mm_or_si128(_mm_srli_epi32(rb, 19), _mm_slli_epi32(rb, 7));
		c1 = _mm_and_si128(_mm_or_si128(ag, rb), mask);

		ag = _mm_and_si128(c2, maskAG);
		ag = _mm_or_si128(_mm_srli_epi32(ag, 16), _mm_srli_epi32(ag, 6));
		rb = _mm_and_si128(c2, maskRB);
		rb = _mm_or_si128(_mm_srli_epi32(rb, 19), _mm_slli_epi32(rb, 7));
		c2 = _mm_and_si128(_mm_or_si128(ag, rb), mask);

		// Unfortunately no good SSE2 way to do _mm_packus_epi32.
		// We can approximate it with a few shuffles.
#if 0
		_mm_store_si128(&dstp[i / 2], _mm_packus_epi32(c1, c2));
#else
		// SSE2 path.
		_mm_store_si128(&dstp[i / 2], _mm_packu2_epi32_SSE2(c1, c2));
#endif
	}
}
#endif

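#if 0
// For reference, a minimal sketch of how _mm_packus_epi32 can be emulated on
// plain SSE2 with the classic bias trick. The actual helper used above,
// _mm_packu2_epi32_SSE2 from SIMDHeaders.h, may be implemented differently.
// This variant assumes each 32-bit input already fits in 16 bits, which holds
// here since c1/c2 are masked with 0x0000FFFF.
static inline __m128i PackU32ToU16Example(__m128i a, __m128i b) {
	const __m128i bias32 = _mm_set1_epi32(0x8000);
	const __m128i bias16 = _mm_set1_epi16((int16_t)0x8000);
	// Bias [0, 0xFFFF] down into the signed range [-0x8000, 0x7FFF] so the
	// signed saturating pack preserves every value.
	a = _mm_sub_epi32(a, bias32);
	b = _mm_sub_epi32(b, bias32);
	// XOR flips the sign bit back, which equals adding 0x8000 mod 0x10000.
	return _mm_xor_si128(_mm_packs_epi32(a, b), bias16);
}
#endif
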
void ConvertBGRA8888ToRGBA5551(u16 *dst, const u32 *src, u32 numPixels) {
#if PPSSPP_ARCH(SSE2)
	const __m128i *srcp = (const __m128i *)src;
	__m128i *dstp = (__m128i *)dst;
	u32 sseChunks = (numPixels / 4) & ~1;
	if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF)) {
		sseChunks = 0;
	} else {
		ConvertBGRA8888ToRGBA5551(dstp, srcp, sseChunks);
	}
	// The remainder starts right after those done via SSE.
	u32 i = sseChunks * 4;
#else
	u32 i = 0;
#endif
	for (; i < numPixels; i++) {
		dst[i] = BGRA8888toRGBA5551(src[i]);
	}
}

void ConvertBGRA8888ToRGB565(u16 *dst, const u32 *src, u32 numPixels) {
	for (u32 i = 0; i < numPixels; i++) {
		dst[i] = BGRA8888toRGB565(src[i]);
	}
}

void ConvertBGRA8888ToRGBA4444(u16 *dst, const u32 *src, u32 numPixels) {
	for (u32 i = 0; i < numPixels; i++) {
		dst[i] = BGRA8888toRGBA4444(src[i]);
	}
}

void ConvertRGBA8888ToRGB565(u16 *dst, const u32 *src, u32 numPixels) {
	for (u32 x = 0; x < numPixels; ++x) {
		dst[x] = RGBA8888toRGB565(src[x]);
	}
}

void ConvertRGBA8888ToRGBA4444(u16 *dst, const u32 *src, u32 numPixels) {
	for (u32 x = 0; x < numPixels; ++x) {
		dst[x] = RGBA8888toRGBA4444(src[x]);
	}
}

void ConvertRGBA8888ToRGB888(u8 *dst, const u32 *src, u32 numPixels) {
	for (uint32_t x = 0; x < numPixels; ++x) {
		memcpy(dst + x * 3, src + x, 3);
	}
}

void ConvertRGB565ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) {
#if PPSSPP_ARCH(SSE2)
	const __m128i mask5 = _mm_set1_epi16(0x001f);
	const __m128i mask6 = _mm_set1_epi16(0x003f);
	const __m128i mask8 = _mm_set1_epi16(0x00ff);

	const __m128i *srcp = (const __m128i *)src;
	__m128i *dstp = (__m128i *)dst32;
	u32 sseChunks = numPixels / 8;
	if (((intptr_t)src & 0xF) || ((intptr_t)dst32 & 0xF)) {
		sseChunks = 0;
	}
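	// The 5->8 and 6->8 expansions below use (x << 3) | (x >> 2) and
	// (x << 2) | (x >> 4): replicating the top bits into the low bits maps
	// 0x1F and 0x3F exactly to 0xFF, mirroring the scalar Convert5To8 and
	// Convert6To8 used in the tail loop.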
	for (u32 i = 0; i < sseChunks; ++i) {
		const __m128i c = _mm_load_si128(&srcp[i]);

		// Swizzle, resulting in RR00 RR00.
		__m128i r = _mm_and_si128(c, mask5);
		r = _mm_or_si128(_mm_slli_epi16(r, 3), _mm_srli_epi16(r, 2));
		r = _mm_and_si128(r, mask8);

		// This one becomes 00GG 00GG.
		__m128i g = _mm_and_si128(_mm_srli_epi16(c, 5), mask6);
		g = _mm_or_si128(_mm_slli_epi16(g, 2), _mm_srli_epi16(g, 4));
		g = _mm_slli_epi16(g, 8);

		// Almost done, we aim for BB00 BB00 again here.
		__m128i b = _mm_and_si128(_mm_srli_epi16(c, 11), mask5);
		b = _mm_or_si128(_mm_slli_epi16(b, 3), _mm_srli_epi16(b, 2));
		b = _mm_and_si128(b, mask8);

		// Always set alpha to 00FF 00FF.
		__m128i a = _mm_slli_epi16(mask8, 8);

		// Now combine them, RRGG RRGG and BBAA BBAA, and then interleave.
		const __m128i rg = _mm_or_si128(r, g);
		const __m128i ba = _mm_or_si128(b, a);
		_mm_store_si128(&dstp[i * 2 + 0], _mm_unpacklo_epi16(rg, ba));
		_mm_store_si128(&dstp[i * 2 + 1], _mm_unpackhi_epi16(rg, ba));
	}
	u32 i = sseChunks * 8;
#else
	u32 i = 0;
#endif

	u8 *dst = (u8 *)dst32;
	for (u32 x = i; x < numPixels; x++) {
		u16 col = src[x];
		dst[x * 4] = Convert5To8((col) & 0x1f);
		dst[x * 4 + 1] = Convert6To8((col >> 5) & 0x3f);
		dst[x * 4 + 2] = Convert5To8((col >> 11) & 0x1f);
		dst[x * 4 + 3] = 255;
	}
}

void ConvertRGBA5551ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) {
#if PPSSPP_ARCH(SSE2)
	const __m128i mask5 = _mm_set1_epi16(0x001f);
	const __m128i mask8 = _mm_set1_epi16(0x00ff);

	const __m128i *srcp = (const __m128i *)src;
	__m128i *dstp = (__m128i *)dst32;
	u32 sseChunks = numPixels / 8;
	if (((intptr_t)src & 0xF) || ((intptr_t)dst32 & 0xF)) {
		sseChunks = 0;
	}
	for (u32 i = 0; i < sseChunks; ++i) {
		const __m128i c = _mm_load_si128(&srcp[i]);

		// Swizzle, resulting in RR00 RR00.
		__m128i r = _mm_and_si128(c, mask5);
		r = _mm_or_si128(_mm_slli_epi16(r, 3), _mm_srli_epi16(r, 2));
		r = _mm_and_si128(r, mask8);

		// This one becomes 00GG 00GG.
		__m128i g = _mm_and_si128(_mm_srli_epi16(c, 5), mask5);
		g = _mm_or_si128(_mm_slli_epi16(g, 3), _mm_srli_epi16(g, 2));
		g = _mm_slli_epi16(g, 8);

		// Almost done, we aim for BB00 BB00 again here.
		__m128i b = _mm_and_si128(_mm_srli_epi16(c, 10), mask5);
		b = _mm_or_si128(_mm_slli_epi16(b, 3), _mm_srli_epi16(b, 2));
		b = _mm_and_si128(b, mask8);

		// 1 bit A to 00AA 00AA.
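		// The arithmetic shift replicates bit 15 (alpha) across the whole
		// 16-bit lane, giving 0x0000 or 0xFFFF; the << 8 then leaves 0x00 or
		// 0xFF in the high byte where A belongs.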
		__m128i a = _mm_srai_epi16(c, 15);
		a = _mm_slli_epi16(a, 8);

		// Now combine them, RRGG RRGG and BBAA BBAA, and then interleave.
		const __m128i rg = _mm_or_si128(r, g);
		const __m128i ba = _mm_or_si128(b, a);
		_mm_store_si128(&dstp[i * 2 + 0], _mm_unpacklo_epi16(rg, ba));
		_mm_store_si128(&dstp[i * 2 + 1], _mm_unpackhi_epi16(rg, ba));
	}
	u32 i = sseChunks * 8;
#else
	u32 i = 0;
#endif

	u8 *dst = (u8 *)dst32;
	for (u32 x = i; x < numPixels; x++) {
		u16 col = src[x];
		dst[x * 4] = Convert5To8((col) & 0x1f);
		dst[x * 4 + 1] = Convert5To8((col >> 5) & 0x1f);
		dst[x * 4 + 2] = Convert5To8((col >> 10) & 0x1f);
		dst[x * 4 + 3] = (col >> 15) ? 255 : 0;
	}
}

void ConvertRGBA4444ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) {
#if PPSSPP_ARCH(SSE2)
	const __m128i mask4 = _mm_set1_epi16(0x000f);

	const __m128i *srcp = (const __m128i *)src;
	__m128i *dstp = (__m128i *)dst32;
	u32 sseChunks = numPixels / 8;
	if (((intptr_t)src & 0xF) || ((intptr_t)dst32 & 0xF)) {
		sseChunks = 0;
	}
	for (u32 i = 0; i < sseChunks; ++i) {
		const __m128i c = _mm_load_si128(&srcp[i]);

		// Let's just grab R000 R000, without swizzling yet.
		__m128i r = _mm_and_si128(c, mask4);
		// And then 00G0 00G0.
		__m128i g = _mm_and_si128(_mm_srli_epi16(c, 4), mask4);
		g = _mm_slli_epi16(g, 8);
		// Now B000 B000.
		__m128i b = _mm_and_si128(_mm_srli_epi16(c, 8), mask4);
		// And lastly 00A0 00A0. No mask needed, we have a wall.
		__m128i a = _mm_srli_epi16(c, 12);
		a = _mm_slli_epi16(a, 8);

		// We swizzle after combining - R0G0 R0G0 and B0A0 B0A0 -> RRGG RRGG and BBAA BBAA.
		__m128i rg = _mm_or_si128(r, g);
		__m128i ba = _mm_or_si128(b, a);
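		// OR'ing each nibble with itself shifted left 4 replicates it across
		// the byte (0xF -> 0xFF), matching the scalar Convert4To8 expansion.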
		rg = _mm_or_si128(rg, _mm_slli_epi16(rg, 4));
		ba = _mm_or_si128(ba, _mm_slli_epi16(ba, 4));

		// And then we can store.
		_mm_store_si128(&dstp[i * 2 + 0], _mm_unpacklo_epi16(rg, ba));
		_mm_store_si128(&dstp[i * 2 + 1], _mm_unpackhi_epi16(rg, ba));
	}
	u32 i = sseChunks * 8;
#else
	u32 i = 0;
#endif

	u8 *dst = (u8 *)dst32;
	for (u32 x = i; x < numPixels; x++) {
		u16 col = src[x];
		dst[x * 4] = Convert4To8(col & 0xf);
		dst[x * 4 + 1] = Convert4To8((col >> 4) & 0xf);
		dst[x * 4 + 2] = Convert4To8((col >> 8) & 0xf);
		dst[x * 4 + 3] = Convert4To8(col >> 12);
	}
}

void ConvertBGR565ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) {
	u8 *dst = (u8 *)dst32;
	for (u32 x = 0; x < numPixels; x++) {
		u16 col = src[x];
		dst[x * 4] = Convert5To8((col >> 11) & 0x1f);
		dst[x * 4 + 1] = Convert6To8((col >> 5) & 0x3f);
		dst[x * 4 + 2] = Convert5To8((col) & 0x1f);
		dst[x * 4 + 3] = 255;
	}
}

void ConvertABGR1555ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) {
	u8 *dst = (u8 *)dst32;
	for (u32 x = 0; x < numPixels; x++) {
		u16 col = src[x];
		dst[x * 4] = Convert5To8((col >> 11) & 0x1f);
		dst[x * 4 + 1] = Convert5To8((col >> 6) & 0x1f);
		dst[x * 4 + 2] = Convert5To8((col >> 1) & 0x1f);
		dst[x * 4 + 3] = (col & 1) ? 255 : 0;
	}
}

void ConvertABGR4444ToRGBA8888(u32 *dst32, const u16 *src, u32 numPixels) {
	u8 *dst = (u8 *)dst32;
	for (u32 x = 0; x < numPixels; x++) {
		u16 col = src[x];
		dst[x * 4] = Convert4To8(col >> 12);
		dst[x * 4 + 1] = Convert4To8((col >> 8) & 0xf);
		dst[x * 4 + 2] = Convert4To8((col >> 4) & 0xf);
		dst[x * 4 + 3] = Convert4To8(col & 0xf);
	}
}

void ConvertRGBA4444ToBGRA8888(u32 *dst, const u16 *src, u32 numPixels) {
	for (u32 x = 0; x < numPixels; x++) {
		u16 c = src[x];
		u32 r = Convert4To8(c & 0x000f);
		u32 g = Convert4To8((c >> 4) & 0x000f);
		u32 b = Convert4To8((c >> 8) & 0x000f);
		u32 a = Convert4To8((c >> 12) & 0x000f);

		dst[x] = (a << 24) | (r << 16) | (g << 8) | b;
	}
}

void ConvertRGBA5551ToBGRA8888(u32 *dst, const u16 *src, u32 numPixels) {
	for (u32 x = 0; x < numPixels; x++) {
		u16 c = src[x];
		u32 r = Convert5To8(c & 0x001f);
		u32 g = Convert5To8((c >> 5) & 0x001f);
		u32 b = Convert5To8((c >> 10) & 0x001f);
		// Sign-extending c replicates the alpha bit (bit 15) into the top bits,
		// so the mask yields 0xFF000000 when A is set and 0 otherwise.
		u32 a = SignExtend16ToU32(c) & 0xff000000;

		dst[x] = a | (r << 16) | (g << 8) | b;
	}
}

void ConvertRGB565ToBGRA8888(u32 *dst, const u16 *src, u32 numPixels) {
	for (u32 x = 0; x < numPixels; x++) {
		u16 c = src[x];
		u32 r = Convert5To8(c & 0x001f);
		u32 g = Convert6To8((c >> 5) & 0x003f);
		u32 b = Convert5To8((c >> 11) & 0x001f);

		dst[x] = 0xFF000000 | (r << 16) | (g << 8) | b;
	}
}

void ConvertRGBA4444ToABGR4444(u16 *dst, const u16 *src, u32 numPixels) {
#if PPSSPP_ARCH(SSE2)
	const __m128i mask00F0 = _mm_set1_epi16(0x00F0);

	const __m128i *srcp = (const __m128i *)src;
	__m128i *dstp = (__m128i *)dst;
	u32 sseChunks = numPixels / 8;
	if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF)) {
		sseChunks = 0;
	}
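	// The conversion is a full nibble reversal within each 16-bit pixel: the
	// four shifted-and-masked terms below move nibble 3 to 0, 2 to 1, 1 to 2,
	// and 0 to 3.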
	for (u32 i = 0; i < sseChunks; ++i) {
		const __m128i c = _mm_load_si128(&srcp[i]);
		__m128i v = _mm_srli_epi16(c, 12);
		v = _mm_or_si128(v, _mm_and_si128(_mm_srli_epi16(c, 4), mask00F0));
		v = _mm_or_si128(v, _mm_slli_epi16(_mm_and_si128(c, mask00F0), 4));
		v = _mm_or_si128(v, _mm_slli_epi16(c, 12));
		_mm_store_si128(&dstp[i], v);
	}
	// The remainder is done in chunks of 2, SSE was chunks of 8.
	u32 i = sseChunks * 8 / 2;
#elif PPSSPP_ARCH(ARM_NEON)
	const uint16x8_t mask00F0 = vdupq_n_u16(0x00F0);

	if (((uintptr_t)dst & 15) == 0 && ((uintptr_t)src & 15) == 0) {
		u32 simdable = (numPixels / 8) * 8;
		for (u32 i = 0; i < simdable; i += 8) {
			uint16x8_t c = vld1q_u16(src);

			const uint16x8_t a = vshrq_n_u16(c, 12);
			const uint16x8_t b = vandq_u16(vshrq_n_u16(c, 4), mask00F0);
			const uint16x8_t g = vshlq_n_u16(vandq_u16(c, mask00F0), 4);
			const uint16x8_t r = vshlq_n_u16(c, 12);

			uint16x8_t res = vorrq_u16(vorrq_u16(r, g), vorrq_u16(b, a));
			vst1q_u16(dst, res);

			src += 8;
			dst += 8;
		}
		numPixels -= simdable;
	}
	u32 i = 0; // already moved the pointers forward
#else
	u32 i = 0;
#endif

	const u32 *src32 = (const u32 *)src;
	u32 *dst32 = (u32 *)dst;
	for (; i < numPixels / 2; i++) {
		const u32 c = src32[i];
		dst32[i] = ((c >> 12) & 0x000F000F) |
			((c >> 4) & 0x00F000F0) |
			((c << 4) & 0x0F000F00) |
			((c << 12) & 0xF000F000);
	}

	if (numPixels & 1) {
		const u32 i = numPixels - 1;
		const u16 c = src[i];
		dst[i] = ((c >> 12) & 0x000F) |
			((c >> 4) & 0x00F0) |
			((c << 4) & 0x0F00) |
			((c << 12) & 0xF000);
	}
}

void ConvertRGBA5551ToABGR1555(u16 *dst, const u16 *src, u32 numPixels) {
#if PPSSPP_ARCH(SSE2)
	const __m128i maskB = _mm_set1_epi16(0x003E);
	const __m128i maskG = _mm_set1_epi16(0x07C0);

	const __m128i *srcp = (const __m128i *)src;
	__m128i *dstp = (__m128i *)dst;
	u32 sseChunks = numPixels / 8;
	if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF)) {
		sseChunks = 0;
	}
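	// The channel order is mirrored while each channel keeps its internal bit
	// order: A moves from bit 15 to bit 0, B from bits 14-10 to 5-1, G from
	// bits 9-5 to 10-6, and R from bits 4-0 to 15-11.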
	for (u32 i = 0; i < sseChunks; ++i) {
		const __m128i c = _mm_load_si128(&srcp[i]);
		__m128i v = _mm_srli_epi16(c, 15);
		v = _mm_or_si128(v, _mm_and_si128(_mm_srli_epi16(c, 9), maskB));
		v = _mm_or_si128(v, _mm_and_si128(_mm_slli_epi16(c, 1), maskG));
		v = _mm_or_si128(v, _mm_slli_epi16(c, 11));
		_mm_store_si128(&dstp[i], v);
	}
	// The remainder is done in chunks of 2, SSE was chunks of 8.
	u32 i = sseChunks * 8 / 2;
#elif PPSSPP_ARCH(ARM_NEON)
	const uint16x8_t maskB = vdupq_n_u16(0x003E);
	const uint16x8_t maskG = vdupq_n_u16(0x07C0);

	if (((uintptr_t)dst & 15) == 0 && ((uintptr_t)src & 15) == 0) {
		u32 simdable = (numPixels / 8) * 8;
		for (u32 i = 0; i < simdable; i += 8) {
			uint16x8_t c = vld1q_u16(src);

			const uint16x8_t a = vshrq_n_u16(c, 15);
			const uint16x8_t b = vandq_u16(vshrq_n_u16(c, 9), maskB);
			const uint16x8_t g = vandq_u16(vshlq_n_u16(c, 1), maskG);
			const uint16x8_t r = vshlq_n_u16(c, 11);

			uint16x8_t res = vorrq_u16(vorrq_u16(r, g), vorrq_u16(b, a));
			vst1q_u16(dst, res);

			src += 8;
			dst += 8;
		}
		numPixels -= simdable;
	}
	u32 i = 0;
#else
	u32 i = 0;
#endif

	const u32 *src32 = (const u32 *)src;
	u32 *dst32 = (u32 *)dst;
	for (; i < numPixels / 2; i++) {
		const u32 c = src32[i];
		dst32[i] = ((c >> 15) & 0x00010001) |
			((c >> 9) & 0x003E003E) |
			((c << 1) & 0x07C007C0) |
			((c << 11) & 0xF800F800);
	}

	if (numPixels & 1) {
		const u32 i = numPixels - 1;
		const u16 c = src[i];
		dst[i] = ((c >> 15) & 0x0001) |
			((c >> 9) & 0x003E) |
			((c << 1) & 0x07C0) |
			((c << 11) & 0xF800);
	}
}

void ConvertRGB565ToBGR565(u16 *dst, const u16 *src, u32 numPixels) {
#if PPSSPP_ARCH(SSE2)
	const __m128i maskG = _mm_set1_epi16(0x07E0);

	const __m128i *srcp = (const __m128i *)src;
	__m128i *dstp = (__m128i *)dst;
	u32 sseChunks = numPixels / 8;
	if (((intptr_t)src & 0xF) || ((intptr_t)dst & 0xF)) {
		sseChunks = 0;
	}
	for (u32 i = 0; i < sseChunks; ++i) {
		const __m128i c = _mm_load_si128(&srcp[i]);
		__m128i v = _mm_srli_epi16(c, 11);
		v = _mm_or_si128(v, _mm_and_si128(c, maskG));
		v = _mm_or_si128(v, _mm_slli_epi16(c, 11));
		_mm_store_si128(&dstp[i], v);
	}
	// The remainder is done in chunks of 2, SSE was chunks of 8.
	u32 i = sseChunks * 8 / 2;
#elif PPSSPP_ARCH(ARM_NEON)
	const uint16x8_t maskG = vdupq_n_u16(0x07E0);

	if (((uintptr_t)dst & 15) == 0 && ((uintptr_t)src & 15) == 0) {
		u32 simdable = (numPixels / 8) * 8;
		for (u32 i = 0; i < simdable; i += 8) {
			uint16x8_t c = vld1q_u16(src);

			const uint16x8_t b = vshrq_n_u16(c, 11);
			const uint16x8_t g = vandq_u16(c, maskG);
			const uint16x8_t r = vshlq_n_u16(c, 11);

			uint16x8_t res = vorrq_u16(vorrq_u16(r, g), b);
			vst1q_u16(dst, res);

			src += 8;
			dst += 8;
		}
		numPixels -= simdable;
	}

	u32 i = 0;
#else
	u32 i = 0;
#endif

	// TODO: Add a 64-bit loop too.
	const u32 *src32 = (const u32 *)src;
	u32 *dst32 = (u32 *)dst;
	for (; i < numPixels / 2; i++) {
		const u32 c = src32[i];
		dst32[i] = ((c >> 11) & 0x001F001F) |
			((c >> 0) & 0x07E007E0) |
			((c << 11) & 0xF800F800);
	}

	if (numPixels & 1) {
		const u32 i = numPixels - 1;
		const u16 c = src[i];
		dst[i] = ((c >> 11) & 0x001F) |
			((c >> 0) & 0x07E0) |
			((c << 11) & 0xF800);
	}
}

void ConvertBGRA5551ToABGR1555(u16 *dst, const u16 *src, u32 numPixels) {
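	// Since only alpha moves (bit 15 <-> bit 0), this is a rotate-left-by-1 of
	// each 16-bit pixel, done two pixels at a time through u32 loads below.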
	const u32 *src32 = (const u32 *)src;
	u32 *dst32 = (u32 *)dst;
	for (u32 i = 0; i < numPixels / 2; i++) {
		const u32 c = src32[i];
		dst32[i] = ((c >> 15) & 0x00010001) | ((c << 1) & 0xFFFEFFFE);
	}

	if (numPixels & 1) {
		const u32 i = numPixels - 1;
		const u16 c = src[i];
		dst[i] = (c >> 15) | (c << 1);
	}
}