Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
hrydgard
GitHub Repository: hrydgard/ppsspp
Path: blob/master/GPU/Software/DrawPixel.cpp
3187 views
1
// Copyright (c) 2013- PPSSPP Project.
2
3
// This program is free software: you can redistribute it and/or modify
4
// it under the terms of the GNU General Public License as published by
5
// the Free Software Foundation, version 2.0 or later versions.
6
7
// This program is distributed in the hope that it will be useful,
8
// but WITHOUT ANY WARRANTY; without even the implied warranty of
9
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
// GNU General Public License 2.0 for more details.
11
12
// A copy of the GPL 2.0 should have been included with the program.
13
// If not, see http://www.gnu.org/licenses/
14
15
// Official git repository and contact information can be found at
16
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17
18
#include "ppsspp_config.h"
19
#include <mutex>
20
#include "Common/Common.h"
21
#include "Common/Data/Convert/ColorConv.h"
22
#include "Core/Config.h"
23
#include "GPU/Software/BinManager.h"
24
#include "GPU/Software/DrawPixel.h"
25
#include "GPU/Software/FuncId.h"
26
#include "GPU/Software/Rasterizer.h"
27
#include "GPU/Software/SoftGpu.h"
28
29
using namespace Math3D;
30
31
namespace Rasterizer {
32
33
std::mutex jitCacheLock;
34
PixelJitCache *jitCache = nullptr;
35
36
void Init() {
37
jitCache = new PixelJitCache();
38
}
39
40
void FlushJit() {
41
jitCache->Flush();
42
}
43
44
void Shutdown() {
45
delete jitCache;
46
jitCache = nullptr;
47
}
48
49
bool DescribeCodePtr(const u8 *ptr, std::string &name) {
50
if (!jitCache->IsInSpace(ptr)) {
51
return false;
52
}
53
54
name = jitCache->DescribeCodePtr(ptr);
55
return true;
56
}
57
58
// Reads the stencil value stored in the framebuffer pixel at (x, y), as an 8-bit value.
// Stencil lives in the alpha bits of the pixel, so the result depends on the format.
static inline u8 GetPixelStencil(GEBufferFormat fmt, int fbStride, int x, int y) {
	switch (fmt) {
	case GE_FORMAT_565:
		// Always treated as 0 for comparison purposes.
		return 0;

	case GE_FORMAT_5551:
		// Single bit: reported as either fully set or fully clear.
		return (fb.Get16(x, y, fbStride) & 0x8000) != 0 ? 0xFF : 0;

	case GE_FORMAT_4444:
		// Top nibble, expanded to 8 bits.
		return Convert4To8(fb.Get16(x, y, fbStride) >> 12);

	default:
		// Anything else is read as a 32-bit pixel with stencil in the top byte.
		return fb.Get32(x, y, fbStride) >> 24;
	}
}
70
71
// Stores `value` into the stencil (alpha) bits of the framebuffer pixel at (x, y).
// Bits set in targetWriteMask are protected and keep their previous contents; the color
// bits of the pixel are always preserved.
static inline void SetPixelStencil(GEBufferFormat fmt, int fbStride, uint32_t targetWriteMask, int x, int y, u8 value) {
	switch (fmt) {
	case GE_FORMAT_565:
		// Do nothing
		return;

	case GE_FORMAT_5551:
		// Only bit 15 holds stencil; skip the write entirely when it is masked off.
		if ((targetWriteMask & 0x8000) == 0) {
			const u16 colorBits = fb.Get16(x, y, fbStride) & ~0x8000;
			const u16 stencilBit = (value & 0x80) << 8;
			const u16 merged = colorBits | stencilBit;
			fb.Set16(x, y, fbStride, merged);
		}
		return;

	case GE_FORMAT_4444: {
		// Keep the 12 color bits plus any masked stencil bits.
		const u16 keepMask = targetWriteMask | 0x0FFF;
		const u16 kept = fb.Get16(x, y, fbStride) & keepMask;
		const u16 incoming = ((u16)value << 8) & ~keepMask;
		const u16 merged = kept | incoming;
		fb.Set16(x, y, fbStride, merged);
		return;
	}

	default: {
		// 32-bit pixel: keep the 24 color bits plus any masked stencil bits.
		const u32 keepMask = targetWriteMask | 0x00FFFFFF;
		const u32 kept = fb.Get32(x, y, fbStride) & keepMask;
		const u32 incoming = ((u32)value << 24) & ~keepMask;
		fb.Set32(x, y, fbStride, kept | incoming);
		return;
	}
	}
}
92
93
// Reads the 16-bit depth buffer value at (x, y), using the given depth buffer stride.
static inline u16 GetPixelDepth(int x, int y, int stride) {
	const u16 depth = depthbuf.Get16(x, y, stride);
	return depth;
}
96
97
static inline void SetPixelDepth(int x, int y, int stride, u16 value) {
98
depthbuf.Set16(x, y, stride, value);
99
}
100
101
// NOTE: These likely aren't endian safe
102
// Reads the framebuffer pixel at (x, y) and returns it converted to RGBA8888.
// Formats not listed here yield 0.
static inline u32 GetPixelColor(GEBufferFormat fmt, int fbStride, int x, int y) {
	if (fmt == GE_FORMAT_8888)
		return fb.Get32(x, y, fbStride);

	if (fmt == GE_FORMAT_565) {
		// A should be zero for the purposes of alpha blending.
		return RGB565ToRGBA8888(fb.Get16(x, y, fbStride)) & 0x00FFFFFF;
	}

	if (fmt == GE_FORMAT_5551)
		return RGBA5551ToRGBA8888(fb.Get16(x, y, fbStride));

	if (fmt == GE_FORMAT_4444)
		return RGBA4444ToRGBA8888(fb.Get16(x, y, fbStride));

	return 0;
}
121
122
// Converts an RGBA8888 color to the framebuffer format and stores it at (x, y).
// `value` is the new color and `old_value` the previous one, both as RGBA8888.
// targetWriteMask is expressed in the framebuffer's own format: bits set in it are
// taken from the old pixel instead of the new one.  Unknown formats write nothing.
static inline void SetPixelColor(GEBufferFormat fmt, int fbStride, int x, int y, u32 value, u32 old_value, u32 targetWriteMask) {
	// Takes protected bits from `previous`, everything else from `fresh`.
	const auto applyMask = [targetWriteMask](u32 fresh, u32 previous) -> u32 {
		return (fresh & ~targetWriteMask) | (previous & targetWriteMask);
	};

	switch (fmt) {
	case GE_FORMAT_565: {
		u32 packed = RGBA8888ToRGB565(value);
		// The old pixel only needs converting when some bits are actually protected.
		if (targetWriteMask != 0)
			packed = applyMask(packed, RGBA8888ToRGB565(old_value));
		fb.Set16(x, y, fbStride, packed);
		break;
	}

	case GE_FORMAT_5551: {
		u32 packed = RGBA8888ToRGBA5551(value);
		if (targetWriteMask != 0)
			packed = applyMask(packed, RGBA8888ToRGBA5551(old_value));
		fb.Set16(x, y, fbStride, packed);
		break;
	}

	case GE_FORMAT_4444: {
		u32 packed = RGBA8888ToRGBA4444(value);
		if (targetWriteMask != 0)
			packed = applyMask(packed, RGBA8888ToRGBA4444(old_value));
		fb.Set16(x, y, fbStride, packed);
		break;
	}

	case GE_FORMAT_8888:
		// No conversion needed, so the mask is applied unconditionally.
		fb.Set32(x, y, fbStride, applyMask(value, old_value));
		break;

	default:
		break;
	}
}
160
161
static inline bool AlphaTestPassed(const PixelFuncID &pixelID, int alpha) {
162
const u8 ref = pixelID.alphaTestRef;
163
if (pixelID.hasAlphaTestMask)
164
alpha &= pixelID.cached.alphaTestMask;
165
166
switch (pixelID.AlphaTestFunc()) {
167
case GE_COMP_NEVER:
168
return false;
169
170
case GE_COMP_ALWAYS:
171
return true;
172
173
case GE_COMP_EQUAL:
174
return (alpha == ref);
175
176
case GE_COMP_NOTEQUAL:
177
return (alpha != ref);
178
179
case GE_COMP_LESS:
180
return (alpha < ref);
181
182
case GE_COMP_LEQUAL:
183
return (alpha <= ref);
184
185
case GE_COMP_GREATER:
186
return (alpha > ref);
187
188
case GE_COMP_GEQUAL:
189
return (alpha >= ref);
190
}
191
return true;
192
}
193
194
static inline bool ColorTestPassed(const PixelFuncID &pixelID, const Vec3<int> &color) {
195
const u32 mask = pixelID.cached.colorTestMask;
196
const u32 c = color.ToRGB() & mask;
197
const u32 ref = pixelID.cached.colorTestRef;
198
switch (pixelID.cached.colorTestFunc) {
199
case GE_COMP_NEVER:
200
return false;
201
202
case GE_COMP_ALWAYS:
203
return true;
204
205
case GE_COMP_EQUAL:
206
return c == ref;
207
208
case GE_COMP_NOTEQUAL:
209
return c != ref;
210
211
default:
212
return true;
213
}
214
}
215
216
static inline bool StencilTestPassed(const PixelFuncID &pixelID, u8 stencil) {
217
if (pixelID.hasStencilTestMask)
218
stencil &= pixelID.cached.stencilTestMask;
219
u8 ref = pixelID.stencilTestRef;
220
switch (pixelID.StencilTestFunc()) {
221
case GE_COMP_NEVER:
222
return false;
223
224
case GE_COMP_ALWAYS:
225
return true;
226
227
case GE_COMP_EQUAL:
228
return ref == stencil;
229
230
case GE_COMP_NOTEQUAL:
231
return ref != stencil;
232
233
case GE_COMP_LESS:
234
return ref < stencil;
235
236
case GE_COMP_LEQUAL:
237
return ref <= stencil;
238
239
case GE_COMP_GREATER:
240
return ref > stencil;
241
242
case GE_COMP_GEQUAL:
243
return ref >= stencil;
244
}
245
return true;
246
}
247
248
// Computes the new 8-bit stencil value for a stencil operation.
// old_stencil is the current value expanded to 8 bits; stencilReplace is what REPLACE
// stores.  INCR and DECR step by one unit of the framebuffer format's stencil precision
// and saturate rather than wrap.  Unknown operations keep the old value.
static inline u8 ApplyStencilOp(GEBufferFormat fmt, uint8_t stencilReplace, GEStencilOp op, u8 old_stencil) {
	switch (op) {
	case GE_STENCILOP_KEEP:
		return old_stencil;

	case GE_STENCILOP_ZERO:
		return 0;

	case GE_STENCILOP_REPLACE:
		return stencilReplace;

	case GE_STENCILOP_INVERT:
		return ~old_stencil;

	case GE_STENCILOP_INCR:
		if (fmt == GE_FORMAT_8888)
			return old_stencil != 0xFF ? old_stencil + 1 : old_stencil;
		// A single stencil bit can only end up set.
		if (fmt == GE_FORMAT_5551)
			return 0xFF;
		// Four stencil bits live in the top nibble, so one step is 0x10.
		if (fmt == GE_FORMAT_4444)
			return old_stencil < 0xF0 ? old_stencil + 0x10 : old_stencil;
		return old_stencil;

	case GE_STENCILOP_DECR:
		if (fmt == GE_FORMAT_4444)
			return old_stencil >= 0x10 ? old_stencil - 0x10 : old_stencil;
		// A single stencil bit can only end up clear.
		if (fmt == GE_FORMAT_5551)
			return 0;
		// Every remaining format steps by one.
		return old_stencil != 0 ? old_stencil - 1 : old_stencil;
	}

	return old_stencil;
}
299
300
// Compares an incoming depth value `z` against the depth buffer value at (x, y).
// The incoming value is the left operand, e.g. LESS passes when z < stored depth.
// Unknown comparison functions fail.
static inline bool DepthTestPassed(GEComparison func, int x, int y, int stride, u16 z) {
	const u16 stored = GetPixelDepth(x, y, stride);

	switch (func) {
	case GE_COMP_NEVER:
		return false;
	case GE_COMP_ALWAYS:
		return true;
	case GE_COMP_EQUAL:
		return z == stored;
	case GE_COMP_NOTEQUAL:
		return z != stored;
	case GE_COMP_LESS:
		return z < stored;
	case GE_COMP_LEQUAL:
		return z <= stored;
	case GE_COMP_GREATER:
		return z > stored;
	case GE_COMP_GEQUAL:
		return z >= stored;
	default:
		return false;
	}
}
332
333
// Externally visible wrapper around the file-local DepthTestPassed().
bool CheckDepthTestPassed(GEComparison func, int x, int y, int stride, u16 z) {
	const bool passed = DepthTestPassed(func, x, y, stride, z);
	return passed;
}
336
337
// Applies a logic op between the new color (source) and the framebuffer color (destination).
// Only the low 24 bits (RGB) take part; the top byte of new_color (alpha/stencil) is
// always carried through unchanged.  Unknown ops behave like COPY.
static inline u32 ApplyLogicOp(GELogicOp op, u32 old_color, u32 new_color) {
	const u32 RGB_MASK = 0x00FFFFFF;
	// All of the operations here intentionally preserve alpha/stencil.
	const u32 alpha = new_color & 0xFF000000;
	const u32 src = new_color & RGB_MASK;
	const u32 dst = old_color & RGB_MASK;

	// Bits above the low 24 may be set by the inverting ops; they are masked off below.
	u32 rgb = src;
	switch (op) {
	case GE_LOGIC_CLEAR:
		rgb = 0;
		break;
	case GE_LOGIC_AND:
		rgb = src & dst;
		break;
	case GE_LOGIC_AND_REVERSE:
		rgb = src & ~dst;
		break;
	case GE_LOGIC_COPY:
		// Source passes through untouched.
		break;
	case GE_LOGIC_AND_INVERTED:
		rgb = ~src & dst;
		break;
	case GE_LOGIC_NOOP:
		rgb = dst;
		break;
	case GE_LOGIC_XOR:
		rgb = src ^ dst;
		break;
	case GE_LOGIC_OR:
		rgb = src | dst;
		break;
	case GE_LOGIC_NOR:
		rgb = ~(src | dst);
		break;
	case GE_LOGIC_EQUIV:
		rgb = ~(src ^ dst);
		break;
	case GE_LOGIC_INVERTED:
		rgb = ~dst;
		break;
	case GE_LOGIC_OR_REVERSE:
		rgb = src | ~dst;
		break;
	case GE_LOGIC_COPY_INVERTED:
		rgb = ~src;
		break;
	case GE_LOGIC_OR_INVERTED:
		rgb = ~src | dst;
		break;
	case GE_LOGIC_NAND:
		rgb = ~(src & dst);
		break;
	case GE_LOGIC_SET:
		rgb = RGB_MASK;
		break;
	}

	return alpha | (rgb & RGB_MASK);
}
407
408
// Returns the RGB factor that the source color is multiplied by during blending.
// `source` is the incoming fragment, `dst` the framebuffer color, and `fix` the fixed
// color used for FIX.  Doubled alpha factors are not clamped and may exceed 255; the
// doubled inverse factors are clamped so they never go below 0.
static inline Vec3<int> GetSourceFactor(PixelBlendFactor factor, const Vec4<int> &source, const Vec4<int> &dst, uint32_t fix) {
	switch (factor) {
	case PixelBlendFactor::ZERO:
		return Vec3<int>::AssignToAll(0);

	case PixelBlendFactor::ONE:
		return Vec3<int>::AssignToAll(255);

	case PixelBlendFactor::OTHERCOLOR:
		// For the source factor, the "other" color is the destination.
		return dst.rgb();

	case PixelBlendFactor::INVOTHERCOLOR:
		return Vec3<int>::AssignToAll(255) - dst.rgb();

	case PixelBlendFactor::SRCALPHA:
#if defined(_M_SSE)
		return Vec3<int>(_mm_shuffle_epi32(source.ivec, _MM_SHUFFLE(3, 3, 3, 3)));
#elif PPSSPP_ARCH(ARM64_NEON)
		return Vec3<int>(vdupq_laneq_s32(source.ivec, 3));
#else
		return Vec3<int>::AssignToAll(source.a());
#endif

	case PixelBlendFactor::INVSRCALPHA:
#if defined(_M_SSE)
		return Vec3<int>(_mm_sub_epi32(_mm_set1_epi32(255), _mm_shuffle_epi32(source.ivec, _MM_SHUFFLE(3, 3, 3, 3))));
#elif PPSSPP_ARCH(ARM64_NEON)
		return Vec3<int>(vsubq_s32(vdupq_n_s32(255), vdupq_laneq_s32(source.ivec, 3)));
#else
		return Vec3<int>::AssignToAll(255 - source.a());
#endif

	case PixelBlendFactor::DOUBLESRCALPHA:
		return Vec3<int>::AssignToAll(2 * source.a());

	case PixelBlendFactor::DOUBLEINVSRCALPHA: {
		const int doubled = 2 * source.a();
		return Vec3<int>::AssignToAll(255 - std::min(doubled, 255));
	}

	case PixelBlendFactor::DSTALPHA:
		return Vec3<int>::AssignToAll(dst.a());

	case PixelBlendFactor::INVDSTALPHA:
		return Vec3<int>::AssignToAll(255 - dst.a());

	case PixelBlendFactor::DOUBLEDSTALPHA:
		return Vec3<int>::AssignToAll(2 * dst.a());

	case PixelBlendFactor::DOUBLEINVDSTALPHA: {
		const int doubled = 2 * dst.a();
		return Vec3<int>::AssignToAll(255 - std::min(doubled, 255));
	}

	case PixelBlendFactor::FIX:
	default:
		// All other source factors (> 10) are treated as FIXA.
		return Vec3<int>::FromRGB(fix);
	}
}
464
465
// Returns the RGB factor that the destination color is multiplied by during blending.
// `source` is the incoming fragment, `dst` the framebuffer color, and `fix` the fixed
// color used for FIX.  Doubled alpha factors are not clamped and may exceed 255; the
// doubled inverse factors are clamped so they never go below 0.
static inline Vec3<int> GetDestFactor(PixelBlendFactor factor, const Vec4<int> &source, const Vec4<int> &dst, uint32_t fix) {
	switch (factor) {
	case PixelBlendFactor::ZERO:
		return Vec3<int>::AssignToAll(0);

	case PixelBlendFactor::ONE:
		return Vec3<int>::AssignToAll(255);

	case PixelBlendFactor::OTHERCOLOR:
		// For the destination factor, the "other" color is the source.
		return source.rgb();

	case PixelBlendFactor::INVOTHERCOLOR:
		return Vec3<int>::AssignToAll(255) - source.rgb();

	case PixelBlendFactor::SRCALPHA:
#if defined(_M_SSE)
		return Vec3<int>(_mm_shuffle_epi32(source.ivec, _MM_SHUFFLE(3, 3, 3, 3)));
#elif PPSSPP_ARCH(ARM64_NEON)
		return Vec3<int>(vdupq_laneq_s32(source.ivec, 3));
#else
		return Vec3<int>::AssignToAll(source.a());
#endif

	case PixelBlendFactor::INVSRCALPHA:
#if defined(_M_SSE)
		return Vec3<int>(_mm_sub_epi32(_mm_set1_epi32(255), _mm_shuffle_epi32(source.ivec, _MM_SHUFFLE(3, 3, 3, 3))));
#elif PPSSPP_ARCH(ARM64_NEON)
		return Vec3<int>(vsubq_s32(vdupq_n_s32(255), vdupq_laneq_s32(source.ivec, 3)));
#else
		return Vec3<int>::AssignToAll(255 - source.a());
#endif

	case PixelBlendFactor::DOUBLESRCALPHA:
		return Vec3<int>::AssignToAll(2 * source.a());

	case PixelBlendFactor::DOUBLEINVSRCALPHA: {
		const int doubled = 2 * source.a();
		return Vec3<int>::AssignToAll(255 - std::min(doubled, 255));
	}

	case PixelBlendFactor::DSTALPHA:
		return Vec3<int>::AssignToAll(dst.a());

	case PixelBlendFactor::INVDSTALPHA:
		return Vec3<int>::AssignToAll(255 - dst.a());

	case PixelBlendFactor::DOUBLEDSTALPHA:
		return Vec3<int>::AssignToAll(2 * dst.a());

	case PixelBlendFactor::DOUBLEINVDSTALPHA: {
		const int doubled = 2 * dst.a();
		return Vec3<int>::AssignToAll(255 - std::min(doubled, 255));
	}

	case PixelBlendFactor::FIX:
	default:
		// All other dest factors (> 10) are treated as FIXB.
		return Vec3<int>::FromRGB(fix);
	}
}
521
522
// Removed inline here - it was never chosen to be inlined by the compiler anyway, too complex.
523
// Blends the incoming fragment (`source`) with the framebuffer color (`dst`) using the
// blend equation and factors selected by pixelID.  Only RGB is produced.
// The result is not guaranteed to lie in 0-255: doubled factors can push it higher, and
// on the NEON and plain C++ paths the subtract equations can go negative.
// Unknown equations return the source color unchanged.
static Vec3<int> AlphaBlendingResult(const PixelFuncID &pixelID, const Vec4<int> &source, const Vec4<int> &dst) {
	// Note: These factors cannot go below 0, but they can go above 255 when doubling.
	Vec3<int> srcfactor = GetSourceFactor(pixelID.AlphaBlendSrc(), source, dst, pixelID.cached.alphaBlendSrc);
	Vec3<int> dstfactor = GetDestFactor(pixelID.AlphaBlendDst(), source, dst, pixelID.cached.alphaBlendDst);

	const auto equation = pixelID.AlphaBlendEq();
	const bool isAdd = equation == GE_BLENDMODE_MUL_AND_ADD;
	const bool isSubtract = equation == GE_BLENDMODE_MUL_AND_SUBTRACT;
	const bool isReverseSubtract = equation == GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE;

	// The three multiply equations share the products (source * srcfactor) and
	// (dst * dstfactor), and differ only in how the two are combined.
	if (isAdd || isSubtract || isReverseSubtract) {
#if defined(_M_SSE)
		// We switch to 16 bit to use mulhi, and we use 4 bits of decimal to make the 16 bit shift free.
		const __m128i half = _mm_set1_epi16(1 << 3);
		const __m128i zero = _mm_setzero_si128();

		const __m128i srcColor = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(source.ivec, source.ivec), 4), half);
		const __m128i srcScale = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(srcfactor.ivec, srcfactor.ivec), 4), half);
		const __m128i srcTerm = _mm_mulhi_epi16(srcColor, srcScale);

		const __m128i dstColor = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dst.ivec, dst.ivec), 4), half);
		const __m128i dstScale = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dstfactor.ivec, dstfactor.ivec), 4), half);
		const __m128i dstTerm = _mm_mulhi_epi16(dstColor, dstScale);

		__m128i combined;
		if (isAdd) {
			combined = _mm_adds_epi16(srcTerm, dstTerm);
		} else if (isSubtract) {
			// Negative differences are clamped to zero before widening.
			combined = _mm_max_epi16(_mm_subs_epi16(srcTerm, dstTerm), zero);
		} else {
			combined = _mm_max_epi16(_mm_subs_epi16(dstTerm, srcTerm), zero);
		}
		return Vec3<int>(_mm_unpacklo_epi16(combined, zero));
#elif PPSSPP_ARCH(ARM64_NEON)
		const int32x4_t half = vdupq_n_s32(1);

		const int32x4_t srcColor = vaddq_s32(vshlq_n_s32(source.ivec, 1), half);
		const int32x4_t srcScale = vaddq_s32(vshlq_n_s32(srcfactor.ivec, 1), half);
		const int32x4_t srcTerm = vshrq_n_s32(vmulq_s32(srcColor, srcScale), 10);

		const int32x4_t dstColor = vaddq_s32(vshlq_n_s32(dst.ivec, 1), half);
		const int32x4_t dstScale = vaddq_s32(vshlq_n_s32(dstfactor.ivec, 1), half);
		const int32x4_t dstTerm = vshrq_n_s32(vmulq_s32(dstColor, dstScale), 10);

		if (isAdd)
			return Vec3<int>(vaddq_s32(srcTerm, dstTerm));
		if (isSubtract)
			return Vec3<int>(vqsubq_s32(srcTerm, dstTerm));
		return Vec3<int>(vqsubq_s32(dstTerm, srcTerm));
#else
		// Both operands get one extra bit holding a half, hence the divide by 4 * 256.
		static constexpr Vec3<int> half = Vec3<int>::AssignToAll(1);
		Vec3<int> srcTerm = ((source.rgb() * 2 + half) * (srcfactor * 2 + half)) / 1024;
		Vec3<int> dstTerm = ((dst.rgb() * 2 + half) * (dstfactor * 2 + half)) / 1024;

		if (isAdd)
			return srcTerm + dstTerm;
		if (isSubtract)
			return srcTerm - dstTerm;
		return dstTerm - srcTerm;
#endif
	}

	// The remaining equations ignore the blend factors entirely.
	switch (equation) {
	case GE_BLENDMODE_MIN:
#if PPSSPP_ARCH(ARM64_NEON)
		return Vec3<int>(vminq_s32(source.ivec, dst.ivec));
#else
		return Vec3<int>(std::min(source.r(), dst.r()),
			std::min(source.g(), dst.g()),
			std::min(source.b(), dst.b()));
#endif

	case GE_BLENDMODE_MAX:
#if PPSSPP_ARCH(ARM64_NEON)
		return Vec3<int>(vmaxq_s32(source.ivec, dst.ivec));
#else
		return Vec3<int>(std::max(source.r(), dst.r()),
			std::max(source.g(), dst.g()),
			std::max(source.b(), dst.b()));
#endif

	case GE_BLENDMODE_ABSDIFF:
#if PPSSPP_ARCH(ARM64_NEON)
		return Vec3<int>(vabdq_s32(source.ivec, dst.ivec));
#else
		return Vec3<int>(::abs(source.r() - dst.r()),
			::abs(source.g() - dst.g()),
			::abs(source.b() - dst.b()));
#endif

	default:
		return source.rgb();
	}
}
663
664
template <bool clearMode, GEBufferFormat fbFormat>
665
void SOFTRAST_CALL DrawSinglePixel(int x, int y, int z, int fog, Vec4IntArg color_in, const PixelFuncID &pixelID) {
666
Vec4<int> prim_color = Vec4<int>(color_in).Clamp(0, 255);
667
// Depth range test - applied in clear mode, if not through mode.
668
if (pixelID.applyDepthRange && !pixelID.earlyZChecks)
669
if (z < pixelID.cached.minz || z > pixelID.cached.maxz)
670
return;
671
672
if (pixelID.AlphaTestFunc() != GE_COMP_ALWAYS && !clearMode)
673
if (!AlphaTestPassed(pixelID, prim_color.a()))
674
return;
675
676
// Fog is applied prior to color test.
677
if (pixelID.applyFog && !clearMode) {
678
Vec3<int> fogColor = Vec3<int>::FromRGB(pixelID.cached.fogColor);
679
// This is very similar to the BLEND texfunc, and simply always rounds up.
680
static constexpr Vec3<int> roundup = Vec3<int>::AssignToAll(255);
681
fogColor = (prim_color.rgb() * fog + fogColor * (255 - fog) + roundup) / 256;
682
prim_color.r() = fogColor.r();
683
prim_color.g() = fogColor.g();
684
prim_color.b() = fogColor.b();
685
}
686
687
if (pixelID.colorTest && !clearMode)
688
if (!ColorTestPassed(pixelID, prim_color.rgb()))
689
return;
690
691
// In clear mode, it uses the alpha color as stencil.
692
uint32_t targetWriteMask = pixelID.applyColorWriteMask ? pixelID.cached.colorWriteMask : 0;
693
u8 stencil = clearMode ? prim_color.a() : GetPixelStencil(fbFormat, pixelID.cached.framebufStride, x, y);
694
if (clearMode) {
695
if (pixelID.DepthClear())
696
SetPixelDepth(x, y, pixelID.cached.depthbufStride, z);
697
} else if (pixelID.stencilTest) {
698
const uint8_t stencilReplace = pixelID.hasStencilTestMask ? pixelID.cached.stencilRef : pixelID.stencilTestRef;
699
if (!StencilTestPassed(pixelID, stencil)) {
700
stencil = ApplyStencilOp(fbFormat, stencilReplace, pixelID.SFail(), stencil);
701
SetPixelStencil(fbFormat, pixelID.cached.framebufStride, targetWriteMask, x, y, stencil);
702
return;
703
}
704
705
// Also apply depth at the same time. If disabled, same as passing.
706
if (!pixelID.earlyZChecks && pixelID.DepthTestFunc() != GE_COMP_ALWAYS && !DepthTestPassed(pixelID.DepthTestFunc(), x, y, pixelID.cached.depthbufStride, z)) {
707
stencil = ApplyStencilOp(fbFormat, stencilReplace, pixelID.ZFail(), stencil);
708
SetPixelStencil(fbFormat, pixelID.cached.framebufStride, targetWriteMask, x, y, stencil);
709
return;
710
}
711
712
stencil = ApplyStencilOp(fbFormat, stencilReplace, pixelID.ZPass(), stencil);
713
} else if (!pixelID.earlyZChecks) {
714
if (pixelID.DepthTestFunc() != GE_COMP_ALWAYS && !DepthTestPassed(pixelID.DepthTestFunc(), x, y, pixelID.cached.depthbufStride, z)) {
715
return;
716
}
717
}
718
719
if (pixelID.depthWrite && !clearMode)
720
SetPixelDepth(x, y, pixelID.cached.depthbufStride, z);
721
722
const u32 old_color = GetPixelColor(fbFormat, pixelID.cached.framebufStride, x, y);
723
u32 new_color;
724
725
// Dithering happens before the logic op and regardless of framebuffer format or clear mode.
726
// We do it while alpha blending because it happens before clamping.
727
if (pixelID.alphaBlend && !clearMode) {
728
const Vec4<int> dst = Vec4<int>::FromRGBA(old_color);
729
Vec3<int> blended = AlphaBlendingResult(pixelID, prim_color, dst);
730
if (pixelID.dithering) {
731
blended += Vec3<int>::AssignToAll(pixelID.cached.ditherMatrix[(y & 3) * 4 + (x & 3)]);
732
}
733
734
// ToRGB() always automatically clamps.
735
new_color = blended.ToRGB();
736
new_color |= stencil << 24;
737
} else {
738
if (pixelID.dithering) {
739
// We'll discard alpha anyway.
740
prim_color += Vec4<int>::AssignToAll(pixelID.cached.ditherMatrix[(y & 3) * 4 + (x & 3)]);
741
}
742
743
#if defined(_M_SSE) || PPSSPP_ARCH(ARM64_NEON)
744
new_color = Vec3<int>(prim_color.ivec).ToRGB();
745
new_color |= stencil << 24;
746
#else
747
new_color = Vec4<int>(prim_color.r(), prim_color.g(), prim_color.b(), stencil).ToRGBA();
748
#endif
749
}
750
751
// Logic ops are applied after blending (if blending is enabled.)
752
if (pixelID.applyLogicOp && !clearMode) {
753
// Logic ops don't affect stencil, which happens inside ApplyLogicOp.
754
new_color = ApplyLogicOp(pixelID.cached.logicOp, old_color, new_color);
755
}
756
757
if (clearMode) {
758
if (!pixelID.ColorClear())
759
new_color = (new_color & 0xFF000000) | (old_color & 0x00FFFFFF);
760
if (!pixelID.StencilClear())
761
new_color = (new_color & 0x00FFFFFF) | (old_color & 0xFF000000);
762
}
763
764
SetPixelColor(fbFormat, pixelID.cached.framebufStride, x, y, new_color, old_color, targetWriteMask);
765
}
766
767
// Returns the function used to draw a single pixel for the given state.
// Prefers a JIT-compiled function and falls back to the generic C++ implementation
// when the JIT cache has none to offer.
SingleFunc GetSingleFunc(const PixelFuncID &id, BinManager *binner) {
	if (SingleFunc compiled = jitCache->GetSingle(id, binner))
		return compiled;

	return jitCache->GenericSingle(id);
}
775
776
// Picks the DrawSinglePixel instantiation that matches the id's clear mode and
// framebuffer format.  Asserts and returns nullptr for a format outside the four
// supported ones.
SingleFunc PixelJitCache::GenericSingle(const PixelFuncID &id) {
	const bool clearing = id.clearMode;

	switch (id.fbFormat) {
	case GE_FORMAT_565:
		if (clearing)
			return &DrawSinglePixel<true, GE_FORMAT_565>;
		return &DrawSinglePixel<false, GE_FORMAT_565>;

	case GE_FORMAT_5551:
		if (clearing)
			return &DrawSinglePixel<true, GE_FORMAT_5551>;
		return &DrawSinglePixel<false, GE_FORMAT_5551>;

	case GE_FORMAT_4444:
		if (clearing)
			return &DrawSinglePixel<true, GE_FORMAT_4444>;
		return &DrawSinglePixel<false, GE_FORMAT_4444>;

	case GE_FORMAT_8888:
		if (clearing)
			return &DrawSinglePixel<true, GE_FORMAT_8888>;
		return &DrawSinglePixel<false, GE_FORMAT_8888>;
	}

	_assert_(false);
	return nullptr;
}
802
803
thread_local PixelJitCache::LastCache PixelJitCache::lastSingle_;
804
int PixelJitCache::clearGen_ = 0;
805
806
// 256k should be plenty of space for plenty of variations.
807
// Sets up the code block (1024 * 64 * 4 bytes) and the function cache (initial size 64),
// resets this thread's last-lookup entry, and bumps the clear generation.
PixelJitCache::PixelJitCache()
	: CodeBlock(1024 * 64 * 4), cache_(64) {
	lastSingle_.gen = -1;
	++clearGen_;
}
811
812
void PixelJitCache::Clear() {
813
clearGen_++;
814
CodeBlock::Clear();
815
cache_.Clear();
816
addresses_.clear();
817
818
constBlendHalf_11_4s_ = nullptr;
819
constBlendInvert_11_4s_ = nullptr;
820
}
821
822
std::string PixelJitCache::DescribeCodePtr(const u8 *ptr) {
823
constexpr bool USE_IDS = false;
824
ptrdiff_t dist = 0x7FFFFFFF;
825
if (USE_IDS) {
826
PixelFuncID found{};
827
for (const auto &it : addresses_) {
828
ptrdiff_t it_dist = ptr - it.second;
829
if (it_dist >= 0 && it_dist < dist) {
830
found = it.first;
831
dist = it_dist;
832
}
833
}
834
835
return DescribePixelFuncID(found);
836
}
837
838
return CodeBlock::DescribeCodePtr(ptr);
839
}
840
841
void PixelJitCache::Flush() {
842
std::unique_lock<std::mutex> guard(jitCacheLock);
843
for (const auto &queued : compileQueue_) {
844
// Might've been compiled after enqueue, but before now.
845
size_t queuedKey = std::hash<PixelFuncID>()(queued);
846
if (!cache_.ContainsKey(queuedKey))
847
Compile(queued);
848
}
849
compileQueue_.clear();
850
}
851
852
// Returns the JIT-compiled single-pixel function for `id`, compiling it if needed.
// Returns nullptr when the JIT is disabled in the config, when no function ends up in
// the cache, or when `binner` is null and the function is not compiled yet.  In that
// last case the id is queued so it can be compiled later.
SingleFunc PixelJitCache::GetSingle(const PixelFuncID &id, BinManager *binner) {
	if (!g_Config.bSoftwareRenderingJit)
		return nullptr;

	// Fast path: this thread's most recent lookup, no lock required.
	const size_t key = std::hash<PixelFuncID>()(id);
	if (lastSingle_.Match(key, clearGen_))
		return lastSingle_.func;

	std::unique_lock<std::mutex> guard(jitCacheLock);
	SingleFunc found;
	if (cache_.Get(key, &found)) {
		lastSingle_.Set(key, found, clearGen_);
		return found;
	}

	if (binner == nullptr) {
		// Can't compile, let's try to do it later when there's an opportunity.
		compileQueue_.insert(id);
		return nullptr;
	}

	// The binner is flushed with the lock released, then the lock is taken again.
	guard.unlock();
	binner->Flush("compile");
	guard.lock();

	for (const auto &pending : compileQueue_) {
		// Might've been compiled after enqueue, but before now.
		const size_t pendingKey = std::hash<PixelFuncID>()(pending);
		if (cache_.ContainsKey(pendingKey))
			continue;
		Compile(pending);
	}
	compileQueue_.clear();

	// Might've been in the queue.
	if (!cache_.ContainsKey(key))
		Compile(id);

	if (!cache_.Get(key, &found))
		return nullptr;

	lastSingle_.Set(key, found, clearGen_);
	return found;
}
896
897
void PixelJitCache::Compile(const PixelFuncID &id) {
898
// x64 is typically 200-500 bytes, but let's be safe.
899
if (GetSpaceLeft() < 65536) {
900
Clear();
901
}
902
903
#if PPSSPP_ARCH(AMD64) && !PPSSPP_PLATFORM(UWP)
904
addresses_[id] = GetCodePointer();
905
SingleFunc func = CompileSingle(id);
906
cache_.Insert(std::hash<PixelFuncID>()(id), func);
907
#endif
908
}
909
910
// Derives blending facts from a pixel function id and records them in `state`.
// Flags are only ever set to true here, except dstFactorIsInverse and readsDstPixel,
// which are assigned a computed value in the cases noted below.  Fields that are not
// touched keep whatever value the caller passed in.
void ComputePixelBlendState(PixelBlendState &state, const PixelFuncID &id) {
	// Only the multiply equations use the blend factors; MIN, MAX and ABSDIFF do not.
	const auto equation = id.AlphaBlendEq();
	if (equation == GE_BLENDMODE_MUL_AND_ADD ||
		equation == GE_BLENDMODE_MUL_AND_SUBTRACT ||
		equation == GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE) {
		state.usesFactors = true;
	}

	if (!state.usesFactors)
		return;

	const auto srcFactor = id.AlphaBlendSrc();
	const auto dstFactor = id.AlphaBlendDst();

	// Source factor: note what it needs from the destination pixel or the source color.
	switch (srcFactor) {
	case PixelBlendFactor::DSTALPHA:
	case PixelBlendFactor::INVDSTALPHA:
	case PixelBlendFactor::DOUBLEDSTALPHA:
	case PixelBlendFactor::DOUBLEINVDSTALPHA:
		state.usesDstAlpha = true;
		break;

	case PixelBlendFactor::OTHERCOLOR:
	case PixelBlendFactor::INVOTHERCOLOR:
		state.dstColorAsFactor = true;
		break;

	case PixelBlendFactor::SRCALPHA:
	case PixelBlendFactor::INVSRCALPHA:
	case PixelBlendFactor::DOUBLESRCALPHA:
	case PixelBlendFactor::DOUBLEINVSRCALPHA:
		state.srcColorAsFactor = true;
		break;

	default:
		break;
	}

	// Destination factor.  For the inverse factors, dstFactorIsInverse records whether
	// the source factor is the matching non-inverted one.
	switch (dstFactor) {
	case PixelBlendFactor::OTHERCOLOR:
	case PixelBlendFactor::INVOTHERCOLOR:
	case PixelBlendFactor::SRCALPHA:
	case PixelBlendFactor::DOUBLESRCALPHA:
		state.srcColorAsFactor = true;
		break;

	case PixelBlendFactor::INVSRCALPHA:
		state.dstFactorIsInverse = srcFactor == PixelBlendFactor::SRCALPHA;
		state.srcColorAsFactor = true;
		break;

	case PixelBlendFactor::DOUBLEINVSRCALPHA:
		state.dstFactorIsInverse = srcFactor == PixelBlendFactor::DOUBLESRCALPHA;
		state.srcColorAsFactor = true;
		break;

	case PixelBlendFactor::DSTALPHA:
	case PixelBlendFactor::DOUBLEDSTALPHA:
		state.usesDstAlpha = true;
		break;

	case PixelBlendFactor::INVDSTALPHA:
		state.dstFactorIsInverse = srcFactor == PixelBlendFactor::DSTALPHA;
		state.usesDstAlpha = true;
		break;

	case PixelBlendFactor::DOUBLEINVDSTALPHA:
		state.dstFactorIsInverse = srcFactor == PixelBlendFactor::DOUBLEDSTALPHA;
		state.usesDstAlpha = true;
		break;

	case PixelBlendFactor::ZERO:
		// Relies on the flags set while examining the source factor above.
		state.readsDstPixel = state.dstColorAsFactor || state.usesDstAlpha;
		break;

	default:
		break;
	}
}
997
998
};
999
1000