CoCalc -- DrawPixel.cpp

GitHub Repository: hrydgard/ppsspp
Path: blob/master/GPU/Software/DrawPixel.cpp
3187 views
1
// Copyright (c) 2013- PPSSPP Project.
2

3
// This program is free software: you can redistribute it and/or modify
4
// it under the terms of the GNU General Public License as published by
5
// the Free Software Foundation, version 2.0 or later versions.
6

7
// This program is distributed in the hope that it will be useful,
8
// but WITHOUT ANY WARRANTY; without even the implied warranty of
9
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
// GNU General Public License 2.0 for more details.
11

12
// A copy of the GPL 2.0 should have been included with the program.
13
// If not, see http://www.gnu.org/licenses/
14

15
// Official git repository and contact information can be found at
16
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17

18
#include "ppsspp_config.h"
19
#include <mutex>
20
#include "Common/Common.h"
21
#include "Common/Data/Convert/ColorConv.h"
22
#include "Core/Config.h"
23
#include "GPU/Software/BinManager.h"
24
#include "GPU/Software/DrawPixel.h"
25
#include "GPU/Software/FuncId.h"
26
#include "GPU/Software/Rasterizer.h"
27
#include "GPU/Software/SoftGpu.h"
28

29
using namespace Math3D;
30

31
namespace Rasterizer {
32

33
// Guards the pixel JIT cache's lookup table and compile queue; taken in PixelJitCache::Flush() and PixelJitCache::GetSingle().
std::mutex jitCacheLock;
34
// The pixel JIT cache instance used by this file: allocated by Init(), deleted and reset to nullptr by Shutdown().
PixelJitCache *jitCache = nullptr;
35

36
void Init() {
37
	jitCache = new PixelJitCache();
38
}
39

40
void FlushJit() {
41
	jitCache->Flush();
42
}
43

44
void Shutdown() {
45
	delete jitCache;
46
	jitCache = nullptr;
47
}
48

49
bool DescribeCodePtr(const u8 *ptr, std::string &name) {
50
	if (!jitCache->IsInSpace(ptr)) {
51
		return false;
52
	}
53

54
	name = jitCache->DescribeCodePtr(ptr);
55
	return true;
56
}
57

58
// Reads the stencil of the framebuffer pixel at (x, y) as an 8-bit value.
// fmt selects how the stencil is stored; fbStride is passed through to the framebuffer accessors.
static inline u8 GetPixelStencil(GEBufferFormat fmt, int fbStride, int x, int y) {
	switch (fmt) {
	case GE_FORMAT_565:
		// This format stores no stencil; it always compares as 0.
		return 0;

	case GE_FORMAT_5551: {
		// One stencil bit (the top bit), reported as all-ones or all-zeros.
		const bool stencilBit = (fb.Get16(x, y, fbStride) & 0x8000) != 0;
		return stencilBit ? 0xFF : 0;
	}

	case GE_FORMAT_4444:
		// Top nibble, expanded through Convert4To8.
		return Convert4To8(fb.Get16(x, y, fbStride) >> 12);

	default:
		// 32-bit pixels (and anything else): stencil is the top byte.
		return fb.Get32(x, y, fbStride) >> 24;
	}
}
70

71
// Stores an 8-bit stencil value into the framebuffer pixel at (x, y), leaving the color bits alone.
// Bits that are set in targetWriteMask are write-protected and keep their old contents.
static inline void SetPixelStencil(GEBufferFormat fmt, int fbStride, uint32_t targetWriteMask, int x, int y, u8 value) {
	switch (fmt) {
	case GE_FORMAT_565:
		// No stencil storage in this format.
		return;

	case GE_FORMAT_5551: {
		// Single stencil bit; skip entirely when it's write-protected.
		if ((targetWriteMask & 0x8000) != 0) {
			return;
		}
		u16 updated = fb.Get16(x, y, fbStride) & ~0x8000;
		// Only the top bit of the 8-bit stencil survives.
		updated |= (value & 0x80) << 8;
		fb.Set16(x, y, fbStride, updated);
		return;
	}

	case GE_FORMAT_4444: {
		// Keep all color bits plus any protected stencil bits.
		const u16 keepMask = targetWriteMask | 0x0FFF;
		u16 updated = fb.Get16(x, y, fbStride) & keepMask;
		// value << 8 lines the stencil's high nibble up with bits 12-15.
		updated |= ((u16)value << 8) & ~keepMask;
		fb.Set16(x, y, fbStride, updated);
		return;
	}

	default: {
		// 32-bit pixels: stencil lives in the top byte.
		const u32 keepMask = targetWriteMask | 0x00FFFFFF;
		u32 updated = fb.Get32(x, y, fbStride) & keepMask;
		updated |= ((u32)value << 24) & ~keepMask;
		fb.Set32(x, y, fbStride, updated);
		return;
	}
	}
}
92

93
// Fetches the 16-bit depth buffer value stored at (x, y).
static inline u16 GetPixelDepth(int x, int y, int stride) {
	const u16 storedDepth = depthbuf.Get16(x, y, stride);
	return storedDepth;
}
96

97
static inline void SetPixelDepth(int x, int y, int stride, u16 value) {
98
	depthbuf.Set16(x, y, stride, value);
99
}
100

101
// NOTE: These likely aren't endian safe
102
// Reads the framebuffer pixel at (x, y) and returns it expanded to 8888.
// Unknown formats read as 0.
static inline u32 GetPixelColor(GEBufferFormat fmt, int fbStride, int x, int y) {
	if (fmt == GE_FORMAT_8888) {
		return fb.Get32(x, y, fbStride);
	}

	if (fmt == GE_FORMAT_565) {
		// This format has no alpha; force it to zero so blending sees A = 0.
		const u32 expanded = RGB565ToRGBA8888(fb.Get16(x, y, fbStride));
		return expanded & 0x00FFFFFF;
	}

	if (fmt == GE_FORMAT_5551) {
		return RGBA5551ToRGBA8888(fb.Get16(x, y, fbStride));
	}

	if (fmt == GE_FORMAT_4444) {
		return RGBA4444ToRGBA8888(fb.Get16(x, y, fbStride));
	}

	return 0;
}
121

122
// Writes an 8888 color to the framebuffer pixel at (x, y), converting to the target format.
//
// value:           new color in 8888.
// old_value:       previous color in 8888, used to restore write-protected bits.
// targetWriteMask: bits (in the TARGET format's layout) that must keep their old contents.
static inline void SetPixelColor(GEBufferFormat fmt, int fbStride, int x, int y, u32 value, u32 old_value, u32 targetWriteMask) {
	switch (fmt) {
	case GE_FORMAT_565: {
		u32 packed = RGBA8888ToRGB565(value);
		if (targetWriteMask != 0) {
			// Only bother converting the old color when something is protected.
			const u32 packedOld = RGBA8888ToRGB565(old_value);
			packed = (packed & ~targetWriteMask) | (packedOld & targetWriteMask);
		}
		fb.Set16(x, y, fbStride, packed);
		break;
	}

	case GE_FORMAT_5551: {
		u32 packed = RGBA8888ToRGBA5551(value);
		if (targetWriteMask != 0) {
			const u32 packedOld = RGBA8888ToRGBA5551(old_value);
			packed = (packed & ~targetWriteMask) | (packedOld & targetWriteMask);
		}
		fb.Set16(x, y, fbStride, packed);
		break;
	}

	case GE_FORMAT_4444: {
		u32 packed = RGBA8888ToRGBA4444(value);
		if (targetWriteMask != 0) {
			const u32 packedOld = RGBA8888ToRGBA4444(old_value);
			packed = (packed & ~targetWriteMask) | (packedOld & targetWriteMask);
		}
		fb.Set16(x, y, fbStride, packed);
		break;
	}

	case GE_FORMAT_8888: {
		// No conversion needed; the mask applies directly.
		const u32 merged = (value & ~targetWriteMask) | (old_value & targetWriteMask);
		fb.Set32(x, y, fbStride, merged);
		break;
	}

	default:
		// Unknown format: write nothing.
		break;
	}
}
160

161
static inline bool AlphaTestPassed(const PixelFuncID &pixelID, int alpha) {
162
	const u8 ref = pixelID.alphaTestRef;
163
	if (pixelID.hasAlphaTestMask)
164
		alpha &= pixelID.cached.alphaTestMask;
165

166
	switch (pixelID.AlphaTestFunc()) {
167
	case GE_COMP_NEVER:
168
		return false;
169

170
	case GE_COMP_ALWAYS:
171
		return true;
172

173
	case GE_COMP_EQUAL:
174
		return (alpha == ref);
175

176
	case GE_COMP_NOTEQUAL:
177
		return (alpha != ref);
178

179
	case GE_COMP_LESS:
180
		return (alpha < ref);
181

182
	case GE_COMP_LEQUAL:
183
		return (alpha <= ref);
184

185
	case GE_COMP_GREATER:
186
		return (alpha > ref);
187

188
	case GE_COMP_GEQUAL:
189
		return (alpha >= ref);
190
	}
191
	return true;
192
}
193

194
static inline bool ColorTestPassed(const PixelFuncID &pixelID, const Vec3<int> &color) {
195
	const u32 mask = pixelID.cached.colorTestMask;
196
	const u32 c = color.ToRGB() & mask;
197
	const u32 ref = pixelID.cached.colorTestRef;
198
	switch (pixelID.cached.colorTestFunc) {
199
	case GE_COMP_NEVER:
200
		return false;
201

202
	case GE_COMP_ALWAYS:
203
		return true;
204

205
	case GE_COMP_EQUAL:
206
		return c == ref;
207

208
	case GE_COMP_NOTEQUAL:
209
		return c != ref;
210

211
	default:
212
		return true;
213
	}
214
}
215

216
static inline bool StencilTestPassed(const PixelFuncID &pixelID, u8 stencil) {
217
	if (pixelID.hasStencilTestMask)
218
		stencil &= pixelID.cached.stencilTestMask;
219
	u8 ref = pixelID.stencilTestRef;
220
	switch (pixelID.StencilTestFunc()) {
221
	case GE_COMP_NEVER:
222
		return false;
223

224
	case GE_COMP_ALWAYS:
225
		return true;
226

227
	case GE_COMP_EQUAL:
228
		return ref == stencil;
229

230
	case GE_COMP_NOTEQUAL:
231
		return ref != stencil;
232

233
	case GE_COMP_LESS:
234
		return ref < stencil;
235

236
	case GE_COMP_LEQUAL:
237
		return ref <= stencil;
238

239
	case GE_COMP_GREATER:
240
		return ref > stencil;
241

242
	case GE_COMP_GEQUAL:
243
		return ref >= stencil;
244
	}
245
	return true;
246
}
247

248
// Computes the new 8-bit stencil value after applying a stencil operation.
//
// fmt:            framebuffer format; decides the step size and limits for INCR / DECR.
// stencilReplace: value used by GE_STENCILOP_REPLACE.
// op:             operation to apply.
// old_stencil:    current stencil value.
//
// KEEP, and any unrecognized op, returns old_stencil unchanged.
static inline u8 ApplyStencilOp(GEBufferFormat fmt, uint8_t stencilReplace, GEStencilOp op, u8 old_stencil) {
	if (op == GE_STENCILOP_ZERO)
		return 0;
	if (op == GE_STENCILOP_REPLACE)
		return stencilReplace;
	if (op == GE_STENCILOP_INVERT)
		return ~old_stencil;

	if (op == GE_STENCILOP_INCR) {
		// Saturating increment, stepping by one unit of the format's stencil precision.
		if (fmt == GE_FORMAT_8888)
			return old_stencil != 0xFF ? old_stencil + 1 : old_stencil;
		if (fmt == GE_FORMAT_5551)
			return 0xFF;
		if (fmt == GE_FORMAT_4444)
			return old_stencil < 0xF0 ? old_stencil + 0x10 : old_stencil;
		// Other formats: unchanged.
		return old_stencil;
	}

	if (op == GE_STENCILOP_DECR) {
		// Saturating decrement; formats other than 4444 / 5551 step by 1.
		if (fmt == GE_FORMAT_4444)
			return old_stencil >= 0x10 ? old_stencil - 0x10 : old_stencil;
		if (fmt == GE_FORMAT_5551)
			return 0;
		return old_stencil != 0 ? old_stencil - 1 : old_stencil;
	}

	return old_stencil;
}
299

300
// Compares a fragment depth z against the depth buffer value at (x, y) using func.
// The fragment's z is the left operand, e.g. GE_COMP_LESS passes when z < stored depth.
// Unrecognized functions fail the test.
static inline bool DepthTestPassed(GEComparison func, int x, int y, int stride, u16 z) {
	const u16 stored = GetPixelDepth(x, y, stride);

	switch (func) {
	case GE_COMP_ALWAYS:
		return true;

	case GE_COMP_EQUAL:
		return z == stored;
	case GE_COMP_NOTEQUAL:
		return z != stored;

	case GE_COMP_LESS:
		return z < stored;
	case GE_COMP_LEQUAL:
		return z <= stored;

	case GE_COMP_GREATER:
		return z > stored;
	case GE_COMP_GEQUAL:
		return z >= stored;

	case GE_COMP_NEVER:
	default:
		break;
	}

	return false;
}
332

333
// Externally visible wrapper around the file-local DepthTestPassed(); same arguments, same result.
bool CheckDepthTestPassed(GEComparison func, int x, int y, int stride, u16 z) {
	const bool passed = DepthTestPassed(func, x, y, stride, z);
	return passed;
}
336

337
// Applies a logic op between the new color and the framebuffer's old color.
// Only the low 24 bits (RGB) take part; the top byte (alpha/stencil) of new_color is
// always carried through untouched. An unrecognized op leaves new_color as it is.
static inline u32 ApplyLogicOp(GELogicOp op, u32 old_color, u32 new_color) {
	const u32 RGB_BITS = 0x00FFFFFF;

	const u32 keptTop = new_color & 0xFF000000;
	const u32 src = new_color & RGB_BITS;
	const u32 dst = old_color & RGB_BITS;

	// Start from "copy" so COPY and unknown ops fall out naturally.
	u32 rgb = src;

	switch (op) {
	case GE_LOGIC_CLEAR:
		rgb = 0;
		break;
	case GE_LOGIC_SET:
		rgb = RGB_BITS;
		break;

	case GE_LOGIC_COPY:
		break;
	case GE_LOGIC_COPY_INVERTED:
		rgb = ~src;
		break;

	case GE_LOGIC_NOOP:
		rgb = dst;
		break;
	case GE_LOGIC_INVERTED:
		rgb = ~dst;
		break;

	case GE_LOGIC_AND:
		rgb = src & dst;
		break;
	case GE_LOGIC_AND_REVERSE:
		rgb = src & ~dst;
		break;
	case GE_LOGIC_AND_INVERTED:
		rgb = ~src & dst;
		break;
	case GE_LOGIC_NAND:
		rgb = ~(src & dst);
		break;

	case GE_LOGIC_OR:
		rgb = src | dst;
		break;
	case GE_LOGIC_OR_REVERSE:
		rgb = src | ~dst;
		break;
	case GE_LOGIC_OR_INVERTED:
		rgb = ~src | dst;
		break;
	case GE_LOGIC_NOR:
		rgb = ~(src | dst);
		break;

	case GE_LOGIC_XOR:
		rgb = src ^ dst;
		break;
	case GE_LOGIC_EQUIV:
		rgb = ~(src ^ dst);
		break;
	}

	// Inversions above may have set the top byte; strip it before recombining.
	return keptTop | (rgb & RGB_BITS);
}
407

408
// Picks the per-channel factor that the SOURCE color gets multiplied by when blending.
//
// factor: which factor to produce.
// source: incoming fragment color; dst: color currently in the framebuffer.
// fix:    packed RGB constant used for FIX (and for any unlisted factor).
//
// The DOUBLE*ALPHA factors are not clamped, so channels can be larger than 255.
static inline Vec3<int> GetSourceFactor(PixelBlendFactor factor, const Vec4<int> &source, const Vec4<int> &dst, uint32_t fix) {
	switch (factor) {
	case PixelBlendFactor::ZERO:
		return Vec3<int>::AssignToAll(0);

	case PixelBlendFactor::ONE:
		return Vec3<int>::AssignToAll(255);

	case PixelBlendFactor::OTHERCOLOR:
		// From the source's point of view, the "other" color is the destination.
		return dst.rgb();

	case PixelBlendFactor::INVOTHERCOLOR:
		return Vec3<int>::AssignToAll(255) - dst.rgb();

	case PixelBlendFactor::SRCALPHA:
		// Broadcast source alpha (lane 3) to every lane.
#if defined(_M_SSE)
		return Vec3<int>(_mm_shuffle_epi32(source.ivec, _MM_SHUFFLE(3, 3, 3, 3)));
#elif PPSSPP_ARCH(ARM64_NEON)
		return Vec3<int>(vdupq_laneq_s32(source.ivec, 3));
#else
		return Vec3<int>::AssignToAll(source.a());
#endif

	case PixelBlendFactor::INVSRCALPHA:
		// 255 - source alpha, in every lane.
#if defined(_M_SSE)
		return Vec3<int>(_mm_sub_epi32(_mm_set1_epi32(255), _mm_shuffle_epi32(source.ivec, _MM_SHUFFLE(3, 3, 3, 3))));
#elif PPSSPP_ARCH(ARM64_NEON)
		return Vec3<int>(vsubq_s32(vdupq_n_s32(255), vdupq_laneq_s32(source.ivec, 3)));
#else
		return Vec3<int>::AssignToAll(255 - source.a());
#endif

	case PixelBlendFactor::DSTALPHA:
		return Vec3<int>::AssignToAll(dst.a());

	case PixelBlendFactor::INVDSTALPHA:
		return Vec3<int>::AssignToAll(255 - dst.a());

	case PixelBlendFactor::DOUBLESRCALPHA: {
		const int doubled = 2 * source.a();
		return Vec3<int>::AssignToAll(doubled);
	}

	case PixelBlendFactor::DOUBLEINVSRCALPHA: {
		// The doubled alpha is clamped before inverting, so this never goes negative.
		const int doubled = 2 * source.a();
		return Vec3<int>::AssignToAll(255 - std::min(doubled, 255));
	}

	case PixelBlendFactor::DOUBLEDSTALPHA: {
		const int doubled = 2 * dst.a();
		return Vec3<int>::AssignToAll(doubled);
	}

	case PixelBlendFactor::DOUBLEINVDSTALPHA: {
		const int doubled = 2 * dst.a();
		return Vec3<int>::AssignToAll(255 - std::min(doubled, 255));
	}

	case PixelBlendFactor::FIX:
	default:
		// Every other source factor value (> 10) behaves like FIXA.
		return Vec3<int>::FromRGB(fix);
	}
}
464

465
// Picks the per-channel factor that the DESTINATION color gets multiplied by when blending.
//
// factor: which factor to produce.
// source: incoming fragment color; dst: color currently in the framebuffer.
// fix:    packed RGB constant used for FIX (and for any unlisted factor).
//
// The DOUBLE*ALPHA factors are not clamped, so channels can be larger than 255.
static inline Vec3<int> GetDestFactor(PixelBlendFactor factor, const Vec4<int> &source, const Vec4<int> &dst, uint32_t fix) {
	switch (factor) {
	case PixelBlendFactor::ZERO:
		return Vec3<int>::AssignToAll(0);

	case PixelBlendFactor::ONE:
		return Vec3<int>::AssignToAll(255);

	case PixelBlendFactor::OTHERCOLOR:
		// From the destination's point of view, the "other" color is the source.
		return source.rgb();

	case PixelBlendFactor::INVOTHERCOLOR:
		return Vec3<int>::AssignToAll(255) - source.rgb();

	case PixelBlendFactor::SRCALPHA:
		// Broadcast source alpha (lane 3) to every lane.
#if defined(_M_SSE)
		return Vec3<int>(_mm_shuffle_epi32(source.ivec, _MM_SHUFFLE(3, 3, 3, 3)));
#elif PPSSPP_ARCH(ARM64_NEON)
		return Vec3<int>(vdupq_laneq_s32(source.ivec, 3));
#else
		return Vec3<int>::AssignToAll(source.a());
#endif

	case PixelBlendFactor::INVSRCALPHA:
		// 255 - source alpha, in every lane.
#if defined(_M_SSE)
		return Vec3<int>(_mm_sub_epi32(_mm_set1_epi32(255), _mm_shuffle_epi32(source.ivec, _MM_SHUFFLE(3, 3, 3, 3))));
#elif PPSSPP_ARCH(ARM64_NEON)
		return Vec3<int>(vsubq_s32(vdupq_n_s32(255), vdupq_laneq_s32(source.ivec, 3)));
#else
		return Vec3<int>::AssignToAll(255 - source.a());
#endif

	case PixelBlendFactor::DSTALPHA:
		return Vec3<int>::AssignToAll(dst.a());

	case PixelBlendFactor::INVDSTALPHA:
		return Vec3<int>::AssignToAll(255 - dst.a());

	case PixelBlendFactor::DOUBLESRCALPHA: {
		const int doubled = 2 * source.a();
		return Vec3<int>::AssignToAll(doubled);
	}

	case PixelBlendFactor::DOUBLEINVSRCALPHA: {
		// The doubled alpha is clamped before inverting, so this never goes negative.
		const int doubled = 2 * source.a();
		return Vec3<int>::AssignToAll(255 - std::min(doubled, 255));
	}

	case PixelBlendFactor::DOUBLEDSTALPHA: {
		const int doubled = 2 * dst.a();
		return Vec3<int>::AssignToAll(doubled);
	}

	case PixelBlendFactor::DOUBLEINVDSTALPHA: {
		const int doubled = 2 * dst.a();
		return Vec3<int>::AssignToAll(255 - std::min(doubled, 255));
	}

	case PixelBlendFactor::FIX:
	default:
		// Every other dest factor value (> 10) behaves like FIXB.
		return Vec3<int>::FromRGB(fix);
	}
}
521

522
// Removed inline here - it was never chosen to be inlined by the compiler anyway, too complex.
523
// Blends a source fragment with the destination pixel according to pixelID's blend
// equation and factors, returning the resulting RGB.
//
// The result is NOT clamped to 0-255 here (the scalar subtract paths can go negative and
// the add paths can exceed 255); the caller packs it with ToRGB().
// An unrecognized equation returns the source RGB unchanged.
static Vec3<int> AlphaBlendingResult(const PixelFuncID &pixelID, const Vec4<int> &source, const Vec4<int> &dst) {
	// Factors are never negative, but doubling can push them above 255.
	Vec3<int> srcfactor = GetSourceFactor(pixelID.AlphaBlendSrc(), source, dst, pixelID.cached.alphaBlendSrc);
	Vec3<int> dstfactor = GetDestFactor(pixelID.AlphaBlendDst(), source, dst, pixelID.cached.alphaBlendDst);

	switch (pixelID.AlphaBlendEq()) {
	case GE_BLENDMODE_MUL_AND_ADD:
	{
		// result = source * srcfactor + dst * dstfactor
#if defined(_M_SSE)
		// Work in 16-bit lanes with 4 fractional bits, so mulhi's built-in 16-bit shift does the scaling for free.
		const __m128i rounding = _mm_set1_epi16(1 << 3);

		const __m128i srcColor16 = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(source.ivec, source.ivec), 4), rounding);
		const __m128i srcFactor16 = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(srcfactor.ivec, srcfactor.ivec), 4), rounding);
		const __m128i srcProduct = _mm_mulhi_epi16(srcColor16, srcFactor16);

		const __m128i dstColor16 = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dst.ivec, dst.ivec), 4), rounding);
		const __m128i dstFactor16 = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dstfactor.ivec, dstfactor.ivec), 4), rounding);
		const __m128i dstProduct = _mm_mulhi_epi16(dstColor16, dstFactor16);

		return Vec3<int>(_mm_unpacklo_epi16(_mm_adds_epi16(srcProduct, dstProduct), _mm_setzero_si128()));
#elif PPSSPP_ARCH(ARM64_NEON)
		const int32x4_t rounding = vdupq_n_s32(1);

		const int32x4_t srcColor2x = vaddq_s32(vshlq_n_s32(source.ivec, 1), rounding);
		const int32x4_t srcFactor2x = vaddq_s32(vshlq_n_s32(srcfactor.ivec, 1), rounding);
		const int32x4_t srcProduct = vshrq_n_s32(vmulq_s32(srcColor2x, srcFactor2x), 10);

		const int32x4_t dstColor2x = vaddq_s32(vshlq_n_s32(dst.ivec, 1), rounding);
		const int32x4_t dstFactor2x = vaddq_s32(vshlq_n_s32(dstfactor.ivec, 1), rounding);
		const int32x4_t dstProduct = vshrq_n_s32(vmulq_s32(dstColor2x, dstFactor2x), 10);

		return Vec3<int>(vaddq_s32(srcProduct, dstProduct));
#else
		// (2c + 1) * (2f + 1) / 1024
		static constexpr Vec3<int> one = Vec3<int>::AssignToAll(1);
		Vec3<int> srcProduct = ((source.rgb() * 2 + one) * (srcfactor * 2 + one)) / 1024;
		Vec3<int> dstProduct = ((dst.rgb() * 2 + one) * (dstfactor * 2 + one)) / 1024;
		return srcProduct + dstProduct;
#endif
	}

	case GE_BLENDMODE_MUL_AND_SUBTRACT:
	{
		// result = source * srcfactor - dst * dstfactor
#if defined(_M_SSE)
		const __m128i rounding = _mm_set1_epi16(1 << 3);

		const __m128i srcColor16 = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(source.ivec, source.ivec), 4), rounding);
		const __m128i srcFactor16 = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(srcfactor.ivec, srcfactor.ivec), 4), rounding);
		const __m128i srcProduct = _mm_mulhi_epi16(srcColor16, srcFactor16);

		const __m128i dstColor16 = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dst.ivec, dst.ivec), 4), rounding);
		const __m128i dstFactor16 = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dstfactor.ivec, dstfactor.ivec), 4), rounding);
		const __m128i dstProduct = _mm_mulhi_epi16(dstColor16, dstFactor16);

		// Floor at zero before widening back to 32-bit lanes.
		return Vec3<int>(_mm_unpacklo_epi16(_mm_max_epi16(_mm_subs_epi16(srcProduct, dstProduct), _mm_setzero_si128()), _mm_setzero_si128()));
#elif PPSSPP_ARCH(ARM64_NEON)
		const int32x4_t rounding = vdupq_n_s32(1);

		const int32x4_t srcColor2x = vaddq_s32(vshlq_n_s32(source.ivec, 1), rounding);
		const int32x4_t srcFactor2x = vaddq_s32(vshlq_n_s32(srcfactor.ivec, 1), rounding);
		const int32x4_t srcProduct = vshrq_n_s32(vmulq_s32(srcColor2x, srcFactor2x), 10);

		const int32x4_t dstColor2x = vaddq_s32(vshlq_n_s32(dst.ivec, 1), rounding);
		const int32x4_t dstFactor2x = vaddq_s32(vshlq_n_s32(dstfactor.ivec, 1), rounding);
		const int32x4_t dstProduct = vshrq_n_s32(vmulq_s32(dstColor2x, dstFactor2x), 10);

		return Vec3<int>(vqsubq_s32(srcProduct, dstProduct));
#else
		static constexpr Vec3<int> one = Vec3<int>::AssignToAll(1);
		Vec3<int> srcProduct = ((source.rgb() * 2 + one) * (srcfactor * 2 + one)) / 1024;
		Vec3<int> dstProduct = ((dst.rgb() * 2 + one) * (dstfactor * 2 + one)) / 1024;
		return srcProduct - dstProduct;
#endif
	}

	case GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE:
	{
		// result = dst * dstfactor - source * srcfactor
#if defined(_M_SSE)
		const __m128i rounding = _mm_set1_epi16(1 << 3);

		const __m128i srcColor16 = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(source.ivec, source.ivec), 4), rounding);
		const __m128i srcFactor16 = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(srcfactor.ivec, srcfactor.ivec), 4), rounding);
		const __m128i srcProduct = _mm_mulhi_epi16(srcColor16, srcFactor16);

		const __m128i dstColor16 = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dst.ivec, dst.ivec), 4), rounding);
		const __m128i dstFactor16 = _mm_add_epi16(_mm_slli_epi16(_mm_packs_epi32(dstfactor.ivec, dstfactor.ivec), 4), rounding);
		const __m128i dstProduct = _mm_mulhi_epi16(dstColor16, dstFactor16);

		// Floor at zero before widening back to 32-bit lanes.
		return Vec3<int>(_mm_unpacklo_epi16(_mm_max_epi16(_mm_subs_epi16(dstProduct, srcProduct), _mm_setzero_si128()), _mm_setzero_si128()));
#elif PPSSPP_ARCH(ARM64_NEON)
		const int32x4_t rounding = vdupq_n_s32(1);

		const int32x4_t srcColor2x = vaddq_s32(vshlq_n_s32(source.ivec, 1), rounding);
		const int32x4_t srcFactor2x = vaddq_s32(vshlq_n_s32(srcfactor.ivec, 1), rounding);
		const int32x4_t srcProduct = vshrq_n_s32(vmulq_s32(srcColor2x, srcFactor2x), 10);

		const int32x4_t dstColor2x = vaddq_s32(vshlq_n_s32(dst.ivec, 1), rounding);
		const int32x4_t dstFactor2x = vaddq_s32(vshlq_n_s32(dstfactor.ivec, 1), rounding);
		const int32x4_t dstProduct = vshrq_n_s32(vmulq_s32(dstColor2x, dstFactor2x), 10);

		return Vec3<int>(vqsubq_s32(dstProduct, srcProduct));
#else
		static constexpr Vec3<int> one = Vec3<int>::AssignToAll(1);
		Vec3<int> srcProduct = ((source.rgb() * 2 + one) * (srcfactor * 2 + one)) / 1024;
		Vec3<int> dstProduct = ((dst.rgb() * 2 + one) * (dstfactor * 2 + one)) / 1024;
		return dstProduct - srcProduct;
#endif
	}

	// The remaining equations ignore the factors and work on the raw colors.
	case GE_BLENDMODE_MIN:
#if PPSSPP_ARCH(ARM64_NEON)
		return Vec3<int>(vminq_s32(source.ivec, dst.ivec));
#else
		return Vec3<int>(
			std::min(source.r(), dst.r()),
			std::min(source.g(), dst.g()),
			std::min(source.b(), dst.b()));
#endif

	case GE_BLENDMODE_MAX:
#if PPSSPP_ARCH(ARM64_NEON)
		return Vec3<int>(vmaxq_s32(source.ivec, dst.ivec));
#else
		return Vec3<int>(
			std::max(source.r(), dst.r()),
			std::max(source.g(), dst.g()),
			std::max(source.b(), dst.b()));
#endif

	case GE_BLENDMODE_ABSDIFF:
#if PPSSPP_ARCH(ARM64_NEON)
		return Vec3<int>(vabdq_s32(source.ivec, dst.ivec));
#else
		return Vec3<int>(
			::abs(source.r() - dst.r()),
			::abs(source.g() - dst.g()),
			::abs(source.b() - dst.b()));
#endif

	default:
		return source.rgb();
	}
}
663

664
template <bool clearMode, GEBufferFormat fbFormat>
665
void SOFTRAST_CALL DrawSinglePixel(int x, int y, int z, int fog, Vec4IntArg color_in, const PixelFuncID &pixelID) {
666
	Vec4<int> prim_color = Vec4<int>(color_in).Clamp(0, 255);
667
	// Depth range test - applied in clear mode, if not through mode.
668
	if (pixelID.applyDepthRange && !pixelID.earlyZChecks)
669
		if (z < pixelID.cached.minz || z > pixelID.cached.maxz)
670
			return;
671

672
	if (pixelID.AlphaTestFunc() != GE_COMP_ALWAYS && !clearMode)
673
		if (!AlphaTestPassed(pixelID, prim_color.a()))
674
			return;
675

676
	// Fog is applied prior to color test.
677
	if (pixelID.applyFog && !clearMode) {
678
		Vec3<int> fogColor = Vec3<int>::FromRGB(pixelID.cached.fogColor);
679
		// This is very similar to the BLEND texfunc, and simply always rounds up.
680
		static constexpr Vec3<int> roundup = Vec3<int>::AssignToAll(255);
681
		fogColor = (prim_color.rgb() * fog + fogColor * (255 - fog) + roundup) / 256;
682
		prim_color.r() = fogColor.r();
683
		prim_color.g() = fogColor.g();
684
		prim_color.b() = fogColor.b();
685
	}
686

687
	if (pixelID.colorTest && !clearMode)
688
		if (!ColorTestPassed(pixelID, prim_color.rgb()))
689
			return;
690

691
	// In clear mode, it uses the alpha color as stencil.
692
	uint32_t targetWriteMask = pixelID.applyColorWriteMask ? pixelID.cached.colorWriteMask : 0;
693
	u8 stencil = clearMode ? prim_color.a() : GetPixelStencil(fbFormat, pixelID.cached.framebufStride, x, y);
694
	if (clearMode) {
695
		if (pixelID.DepthClear())
696
			SetPixelDepth(x, y, pixelID.cached.depthbufStride, z);
697
	} else if (pixelID.stencilTest) {
698
		const uint8_t stencilReplace = pixelID.hasStencilTestMask ? pixelID.cached.stencilRef : pixelID.stencilTestRef;
699
		if (!StencilTestPassed(pixelID, stencil)) {
700
			stencil = ApplyStencilOp(fbFormat, stencilReplace, pixelID.SFail(), stencil);
701
			SetPixelStencil(fbFormat, pixelID.cached.framebufStride, targetWriteMask, x, y, stencil);
702
			return;
703
		}
704

705
		// Also apply depth at the same time.  If disabled, same as passing.
706
		if (!pixelID.earlyZChecks && pixelID.DepthTestFunc() != GE_COMP_ALWAYS && !DepthTestPassed(pixelID.DepthTestFunc(), x, y, pixelID.cached.depthbufStride, z)) {
707
			stencil = ApplyStencilOp(fbFormat, stencilReplace, pixelID.ZFail(), stencil);
708
			SetPixelStencil(fbFormat, pixelID.cached.framebufStride, targetWriteMask, x, y, stencil);
709
			return;
710
		}
711

712
		stencil = ApplyStencilOp(fbFormat, stencilReplace, pixelID.ZPass(), stencil);
713
	} else if (!pixelID.earlyZChecks) {
714
		if (pixelID.DepthTestFunc() != GE_COMP_ALWAYS && !DepthTestPassed(pixelID.DepthTestFunc(), x, y, pixelID.cached.depthbufStride, z)) {
715
			return;
716
		}
717
	}
718

719
	if (pixelID.depthWrite && !clearMode)
720
		SetPixelDepth(x, y, pixelID.cached.depthbufStride, z);
721

722
	const u32 old_color = GetPixelColor(fbFormat, pixelID.cached.framebufStride, x, y);
723
	u32 new_color;
724

725
	// Dithering happens before the logic op and regardless of framebuffer format or clear mode.
726
	// We do it while alpha blending because it happens before clamping.
727
	if (pixelID.alphaBlend && !clearMode) {
728
		const Vec4<int> dst = Vec4<int>::FromRGBA(old_color);
729
		Vec3<int> blended = AlphaBlendingResult(pixelID, prim_color, dst);
730
		if (pixelID.dithering) {
731
			blended += Vec3<int>::AssignToAll(pixelID.cached.ditherMatrix[(y & 3) * 4 + (x & 3)]);
732
		}
733

734
		// ToRGB() always automatically clamps.
735
		new_color = blended.ToRGB();
736
		new_color |= stencil << 24;
737
	} else {
738
		if (pixelID.dithering) {
739
			// We'll discard alpha anyway.
740
			prim_color += Vec4<int>::AssignToAll(pixelID.cached.ditherMatrix[(y & 3) * 4 + (x & 3)]);
741
		}
742

743
#if defined(_M_SSE) || PPSSPP_ARCH(ARM64_NEON)
744
		new_color = Vec3<int>(prim_color.ivec).ToRGB();
745
		new_color |= stencil << 24;
746
#else
747
		new_color = Vec4<int>(prim_color.r(), prim_color.g(), prim_color.b(), stencil).ToRGBA();
748
#endif
749
	}
750

751
	// Logic ops are applied after blending (if blending is enabled.)
752
	if (pixelID.applyLogicOp && !clearMode) {
753
		// Logic ops don't affect stencil, which happens inside ApplyLogicOp.
754
		new_color = ApplyLogicOp(pixelID.cached.logicOp, old_color, new_color);
755
	}
756

757
	if (clearMode) {
758
		if (!pixelID.ColorClear())
759
			new_color = (new_color & 0xFF000000) | (old_color & 0x00FFFFFF);
760
		if (!pixelID.StencilClear())
761
			new_color = (new_color & 0x00FFFFFF) | (old_color & 0xFF000000);
762
	}
763

764
	SetPixelColor(fbFormat, pixelID.cached.framebufStride, x, y, new_color, old_color, targetWriteMask);
765
}
766

767
// Returns the function to use for drawing single pixels with the given state.
// Prefers a JIT-compiled function; when the JIT cache returns none, falls back to
// the matching generic C++ implementation.
SingleFunc GetSingleFunc(const PixelFuncID &id, BinManager *binner) {
	if (SingleFunc jitted = jitCache->GetSingle(id, binner)) {
		return jitted;
	}
	return jitCache->GenericSingle(id);
}
775

776
// Selects the DrawSinglePixel instantiation matching id's clear mode and framebuffer format.
// Asserts and returns nullptr if the format is not one of the four known ones.
SingleFunc PixelJitCache::GenericSingle(const PixelFuncID &id) {
	const bool wantClear = id.clearMode;

	switch (id.fbFormat) {
	case GE_FORMAT_565:
		if (wantClear)
			return &DrawSinglePixel<true, GE_FORMAT_565>;
		return &DrawSinglePixel<false, GE_FORMAT_565>;

	case GE_FORMAT_5551:
		if (wantClear)
			return &DrawSinglePixel<true, GE_FORMAT_5551>;
		return &DrawSinglePixel<false, GE_FORMAT_5551>;

	case GE_FORMAT_4444:
		if (wantClear)
			return &DrawSinglePixel<true, GE_FORMAT_4444>;
		return &DrawSinglePixel<false, GE_FORMAT_4444>;

	case GE_FORMAT_8888:
		if (wantClear)
			return &DrawSinglePixel<true, GE_FORMAT_8888>;
		return &DrawSinglePixel<false, GE_FORMAT_8888>;
	}

	// Not a format we know how to draw to.
	_assert_(false);
	return nullptr;
}
802

803
// Per-thread record of the last GetSingle() result; GetSingle() checks it before taking jitCacheLock.
thread_local PixelJitCache::LastCache PixelJitCache::lastSingle_;
804
// Generation counter, incremented by the constructor and by Clear(); passed to lastSingle_.Match()/Set() alongside the key.
int PixelJitCache::clearGen_ = 0;
805

806
// 256k should be plenty of space for plenty of variations.
807
PixelJitCache::PixelJitCache() : CodeBlock(1024 * 64 * 4), cache_(64) {
	// Reset the constructing thread's last-lookup record to a generation that is never current...
	lastSingle_.gen = -1;
	// ...and start a new generation for this cache.
	++clearGen_;
}
811

812
void PixelJitCache::Clear() {
813
	clearGen_++;
814
	CodeBlock::Clear();
815
	cache_.Clear();
816
	addresses_.clear();
817

818
	constBlendHalf_11_4s_ = nullptr;
819
	constBlendInvert_11_4s_ = nullptr;
820
}
821

822
std::string PixelJitCache::DescribeCodePtr(const u8 *ptr) {
823
	constexpr bool USE_IDS = false;
824
	ptrdiff_t dist = 0x7FFFFFFF;
825
	if (USE_IDS) {
826
		PixelFuncID found{};
827
		for (const auto &it : addresses_) {
828
			ptrdiff_t it_dist = ptr - it.second;
829
			if (it_dist >= 0 && it_dist < dist) {
830
				found = it.first;
831
				dist = it_dist;
832
			}
833
		}
834

835
		return DescribePixelFuncID(found);
836
	}
837

838
	return CodeBlock::DescribeCodePtr(ptr);
839
}
840

841
void PixelJitCache::Flush() {
842
	std::unique_lock<std::mutex> guard(jitCacheLock);
843
	for (const auto &queued : compileQueue_) {
844
		// Might've been compiled after enqueue, but before now.
845
		size_t queuedKey = std::hash<PixelFuncID>()(queued);
846
		if (!cache_.ContainsKey(queuedKey))
847
			Compile(queued);
848
	}
849
	compileQueue_.clear();
850
}
851

852
// Looks up (compiling if necessary and possible) the JIT function for a pixel state.
//
// id:     pixel state to find a function for.
// binner: used to flush pending work before compiling; when null, nothing is compiled
//         now and the id is queued for a later Flush() instead.
//
// Returns nullptr when the JIT is disabled in the config, when compilation had to be
// deferred, or when no function ended up in the cache.
SingleFunc PixelJitCache::GetSingle(const PixelFuncID &id, BinManager *binner) {
	if (!g_Config.bSoftwareRenderingJit) {
		return nullptr;
	}

	// Cheapest check first: this thread's last lookup, no lock required.
	const size_t idKey = std::hash<PixelFuncID>()(id);
	if (lastSingle_.Match(idKey, clearGen_)) {
		return lastSingle_.func;
	}

	std::unique_lock<std::mutex> lock(jitCacheLock);
	SingleFunc found;
	if (cache_.Get(idKey, &found)) {
		lastSingle_.Set(idKey, found, clearGen_);
		return found;
	}

	if (binner == nullptr) {
		// No way to compile right now; remember it for the next opportunity.
		compileQueue_.insert(id);
		return nullptr;
	}

	// The lock is released around the flush.
	// NOTE(review): presumably so work triggered by the flush can take jitCacheLock itself - confirm.
	lock.unlock();
	binner->Flush("compile");
	lock.lock();

	// Drain whatever was queued earlier.
	for (const auto &pending : compileQueue_) {
		const size_t pendingKey = std::hash<PixelFuncID>()(pending);
		// It may already have been compiled between being queued and now.
		if (cache_.ContainsKey(pendingKey))
			continue;
		Compile(pending);
	}
	compileQueue_.clear();

	// The requested id may itself have been part of the queue.
	if (!cache_.ContainsKey(idKey)) {
		Compile(id);
	}

	if (!cache_.Get(idKey, &found)) {
		return nullptr;
	}
	lastSingle_.Set(idKey, found, clearGen_);
	return found;
}
896

897
void PixelJitCache::Compile(const PixelFuncID &id) {
898
	// x64 is typically 200-500 bytes, but let's be safe.
899
	if (GetSpaceLeft() < 65536) {
900
		Clear();
901
	}
902

903
#if PPSSPP_ARCH(AMD64) && !PPSSPP_PLATFORM(UWP)
904
	addresses_[id] = GetCodePointer();
905
	SingleFunc func = CompileSingle(id);
906
	cache_.Insert(std::hash<PixelFuncID>()(id), func);
907
#endif
908
}
909

910
// Fills in the parts of a PixelBlendState that can be derived from id's blend equation
// and factors. Fields are only ever set here when relevant; anything not mentioned keeps
// the value it had in `state` on entry.
void ComputePixelBlendState(PixelBlendState &state, const PixelFuncID &id) {
	// Only the multiply-based equations use the blend factors; MIN / MAX / ABSDIFF don't.
	const auto equation = id.AlphaBlendEq();
	if (equation == GE_BLENDMODE_MUL_AND_ADD ||
		equation == GE_BLENDMODE_MUL_AND_SUBTRACT ||
		equation == GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE) {
		state.usesFactors = true;
	}

	if (!state.usesFactors) {
		return;
	}

	// What the source factor needs.
	switch (id.AlphaBlendSrc()) {
	case PixelBlendFactor::OTHERCOLOR:
	case PixelBlendFactor::INVOTHERCOLOR:
		state.dstColorAsFactor = true;
		break;

	case PixelBlendFactor::SRCALPHA:
	case PixelBlendFactor::INVSRCALPHA:
	case PixelBlendFactor::DOUBLESRCALPHA:
	case PixelBlendFactor::DOUBLEINVSRCALPHA:
		state.srcColorAsFactor = true;
		break;

	case PixelBlendFactor::DSTALPHA:
	case PixelBlendFactor::INVDSTALPHA:
	case PixelBlendFactor::DOUBLEDSTALPHA:
	case PixelBlendFactor::DOUBLEINVDSTALPHA:
		state.usesDstAlpha = true;
		break;

	default:
		break;
	}

	// What the dest factor needs. dstFactorIsInverse records whether the dest factor is
	// the exact inverse of the source factor.
	switch (id.AlphaBlendDst()) {
	case PixelBlendFactor::OTHERCOLOR:
	case PixelBlendFactor::INVOTHERCOLOR:
	case PixelBlendFactor::SRCALPHA:
	case PixelBlendFactor::DOUBLESRCALPHA:
		state.srcColorAsFactor = true;
		break;

	case PixelBlendFactor::INVSRCALPHA:
		state.srcColorAsFactor = true;
		state.dstFactorIsInverse = id.AlphaBlendSrc() == PixelBlendFactor::SRCALPHA;
		break;

	case PixelBlendFactor::DOUBLEINVSRCALPHA:
		state.srcColorAsFactor = true;
		state.dstFactorIsInverse = id.AlphaBlendSrc() == PixelBlendFactor::DOUBLESRCALPHA;
		break;

	case PixelBlendFactor::DSTALPHA:
	case PixelBlendFactor::DOUBLEDSTALPHA:
		state.usesDstAlpha = true;
		break;

	case PixelBlendFactor::INVDSTALPHA:
		state.usesDstAlpha = true;
		state.dstFactorIsInverse = id.AlphaBlendSrc() == PixelBlendFactor::DSTALPHA;
		break;

	case PixelBlendFactor::DOUBLEINVDSTALPHA:
		state.usesDstAlpha = true;
		state.dstFactorIsInverse = id.AlphaBlendSrc() == PixelBlendFactor::DOUBLEDSTALPHA;
		break;

	case PixelBlendFactor::ZERO:
		// With a zero dest factor, the dest pixel only matters if the source side uses it.
		state.readsDstPixel = state.dstColorAsFactor || state.usesDstAlpha;
		break;

	default:
		break;
	}
}
997

998
};
999

1000
Product

Resources

Company