// Source: GitHub repository hrydgard/ppsspp, path GPU/Software/DrawPixelX86.cpp (branch master).
// Copyright (c) 2017- PPSSPP Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0 or later versions.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official git repository and contact information can be found at
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17
18
#include "ppsspp_config.h"
19
20
#if PPSSPP_ARCH(AMD64)
21
22
#include "Common/x64Emitter.h"
23
#include "Common/CPUDetect.h"
24
#include "Common/LogReporting.h"
25
#include "Common/Math/SIMDHeaders.h"
26
#include "GPU/GPUState.h"
27
#include "GPU/Software/DrawPixel.h"
28
#include "GPU/Software/SoftGpu.h"
29
#include "GPU/ge_constants.h"
30
31
using namespace Gen;
32
33
namespace Rasterizer {
34
35
// Compiles the single-pixel draw routine described by id.
// Emits the constant pool, the prolog, each pipeline stage in order, and the epilog.
// Returns the entry point, or nullptr (after resetting the code pointer) when any stage fails.
SingleFunc PixelJitCache::CompileSingle(const PixelFuncID &id) {
	// Register the incoming arguments with the reg cache; spilling them is disallowed.
	regCache_.SetupABI({
		RegCache::GEN_ARG_X,
		RegCache::GEN_ARG_Y,
		RegCache::GEN_ARG_Z,
		RegCache::GEN_ARG_FOG,
		RegCache::VEC_ARG_COLOR,
		RegCache::GEN_ARG_ID,
	});

	BeginWrite(64);
	Describe("Init");
	WriteConstantPool(id);

	// Remember this position: on failure the code pointer is reset back to it.
	const u8 *rewindPos = AlignCode16();
	EndWrite();
	bool ok = true;

#if PPSSPP_PLATFORM(WINDOWS)
	// RET + Windows reserves space to save args, half of 1 xmm + 4 ints before the id.
	_assert_(!regCache_.Has(RegCache::GEN_ARG_ID));
	// Only the stencil test mask variant saves the extra callee-saved registers.
	const int stackSpace = id.hasStencilTestMask
		? WriteProlog(0, { XMM6, XMM7, XMM8, XMM9, XMM10, XMM11, XMM12, XMM13, XMM14, XMM15 }, { R12, R13, R14, R15 })
		: WriteProlog(0, {}, {});
	stackIDOffset_ = stackSpace + 8 + 8 + 4 * PTRBITS / 8;
#else
	// Elsewhere the id arrives in a register, so there is no stack offset to track.
	_assert_(regCache_.Has(RegCache::GEN_ARG_ID));
	WriteProlog(0, {}, {});
	stackIDOffset_ = -1;
#endif

	// Depth range comes first.
	ok = ok && Jit_ApplyDepthRange(id);

	// Clamp the color next (it may affect the alpha test, and later stages expect it clamped.)
	// Packing down to 4x8-bit performs the clamp; that is the format everything else expects.
	Describe("ClampColor");
	X64Reg colorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
	PACKSSDW(colorReg, R(colorReg));
	PACKUSWB(colorReg, R(colorReg));
	regCache_.Unlock(colorReg, RegCache::VEC_ARG_COLOR);
	colorIs16Bit_ = false;

	ok = ok && Jit_AlphaTest(id);
	// Fog goes before the color test. Possibly before the alpha test too, but alpha is unaffected by it.
	ok = ok && Jit_ApplyFog(id);
	ok = ok && Jit_ColorTest(id);

	// Clear mode skips the stencil/depth tests entirely.
	if (!id.clearMode) {
		if (id.stencilTest)
			ok = ok && Jit_StencilAndDepthTest(id);
		else
			ok = ok && Jit_DepthTest(id);
	}
	ok = ok && Jit_WriteDepth(id);

	ok = ok && Jit_AlphaBlend(id);
	ok = ok && Jit_Dither(id);
	ok = ok && Jit_WriteColor(id);

	// Every discard branch recorded by the stages lands here, right before the epilog.
	for (auto &pendingDiscard : discards_) {
		SetJumpTarget(pendingDiscard);
	}
	discards_.clear();

	if (regCache_.Has(RegCache::GEN_ARG_ID))
		regCache_.ForceRelease(RegCache::GEN_ARG_ID);

	if (ok) {
		const u8 *entry = WriteFinalizedEpilog();
		regCache_.Reset(true);
		return (SingleFunc)entry;
	}

	ERROR_LOG_REPORT(Log::G3D, "Could not compile pixel func: %s", DescribePixelFuncID(id).c_str());

	regCache_.Reset(false);
	EndWrite();
	ResetCodePtr(GetOffset(rewindPos));
	return nullptr;
}
117
118
// Returns a register holding the PixelFuncID pointer.
// Uses the argument register when the ABI provides one; otherwise uses GEN_ID,
// loading it from the stack slot at stackIDOffset_ the first time it is needed.
RegCache::Reg PixelJitCache::GetPixelID() {
	if (regCache_.Has(RegCache::GEN_ARG_ID))
		return regCache_.Find(RegCache::GEN_ARG_ID);

	if (regCache_.Has(RegCache::GEN_ID))
		return regCache_.Find(RegCache::GEN_ID);

	// Not in a register yet: allocate one and load the pointer from the stack.
	X64Reg idReg = regCache_.Alloc(RegCache::GEN_ID);
	_assert_(stackIDOffset_ != -1);
	MOV(PTRBITS, R(idReg), MDisp(RSP, stackIDOffset_));
	return idReg;
}
129
130
// Unlocks a register obtained from GetPixelID(), using whichever purpose it is held under.
void PixelJitCache::UnlockPixelID(RegCache::Reg &r) {
	const bool idIsArg = regCache_.Has(RegCache::GEN_ARG_ID);
	regCache_.Unlock(r, idIsArg ? RegCache::GEN_ARG_ID : RegCache::GEN_ID);
}
136
137
// Returns a register holding the address of the destination pixel in the color buffer.
// The address is computed on first use and found in the reg cache afterwards.
// With standard stride and no dithering, the X/Y argument registers are consumed and
// repurposed (Y becomes GEN_COLOR_OFF, X becomes GEN_DEPTH_OFF or is released).
RegCache::Reg PixelJitCache::GetColorOff(const PixelFuncID &id) {
	if (regCache_.Has(RegCache::GEN_COLOR_OFF))
		return regCache_.Find(RegCache::GEN_COLOR_OFF);

	Describe("GetColorOff");
	// 8888 pixels are 4 bytes wide, every other format is 2.
	const int bytesPerPixel = id.FBFormat() == GE_FORMAT_8888 ? 4 : 2;

	if (id.useStandardStride && !id.dithering) {
		const bool depthTested = id.DepthTestFunc() != GE_COMP_ALWAYS && !id.earlyZChecks;
		const bool wantDepthOff = id.depthWrite || depthTested;
		X64Reg depthPtrReg = INVALID_REG;
		X64Reg yReg = regCache_.Find(RegCache::GEN_ARG_Y);
		X64Reg xReg = regCache_.Find(RegCache::GEN_ARG_X);

		// Here xReg is turned into the pixel index (y * 512 + x) and yReg's value is thrown away.
		SHL(32, R(yReg), Imm8(9));
		ADD(32, R(xReg), R(yReg));

		// Load the color buffer base pointer into yReg.
		if (wantDepthOff) {
			_assert_msg_(Accessible(&fb.data, &depthbuf.data), "fb.data and depthbuf.data too far apart: %p %p (fb=%08x d=%08x)", fb.data, depthbuf.data, gstate.getFrameBufAddress(), gstate.getDepthBufAddress());
			depthPtrReg = regCache_.Alloc(RegCache::GEN_DEPTH_OFF);
			if (RipAccessible(&fb.data) && RipAccessible(&depthbuf.data)) {
				MOV(PTRBITS, R(yReg), M(&fb.data));
			} else {
				// Keep &fb.data in depthPtrReg so depthbuf.data can be reached relative to it below.
				MOV(PTRBITS, R(depthPtrReg), ImmPtr(&fb.data));
				MOV(PTRBITS, R(yReg), MatR(depthPtrReg));
			}
		} else if (RipAccessible(&fb.data)) {
			MOV(PTRBITS, R(yReg), M(&fb.data));
		} else {
			MOV(PTRBITS, R(yReg), ImmPtr(&fb.data));
			MOV(PTRBITS, R(yReg), MatR(yReg));
		}
		LEA(PTRBITS, yReg, MComplex(yReg, xReg, bytesPerPixel, 0));
		// yReg now holds the color address, so relabel it as GEN_COLOR_OFF.
		regCache_.Unlock(yReg, RegCache::GEN_ARG_Y);
		regCache_.Change(RegCache::GEN_ARG_Y, RegCache::GEN_COLOR_OFF);
		// It cannot be recalculated (the inputs are gone), so force it to be retained.
		regCache_.ForceRetain(RegCache::GEN_COLOR_OFF);

		if (!wantDepthOff) {
			// Depth is never touched, so the index register can simply be dropped.
			regCache_.Unlock(xReg, RegCache::GEN_ARG_X);
			regCache_.ForceRelease(RegCache::GEN_ARG_X);
		} else {
			// Also compute the depth address (16-bit entries) into xReg.
			if (RipAccessible(&fb.data) && RipAccessible(&depthbuf.data)) {
				MOV(PTRBITS, R(depthPtrReg), M(&depthbuf.data));
			} else {
				MOV(PTRBITS, R(depthPtrReg), MAccessibleDisp(depthPtrReg, &fb.data, &depthbuf.data));
			}
			LEA(PTRBITS, xReg, MComplex(depthPtrReg, xReg, 2, 0));
			regCache_.Release(depthPtrReg, RegCache::GEN_DEPTH_OFF);

			// Same treatment as color: relabel xReg as GEN_DEPTH_OFF and force retain it.
			regCache_.Unlock(xReg, RegCache::GEN_ARG_X);
			regCache_.Change(RegCache::GEN_ARG_X, RegCache::GEN_DEPTH_OFF);
			regCache_.ForceRetain(RegCache::GEN_DEPTH_OFF);
		}

		return regCache_.Find(RegCache::GEN_COLOR_OFF);
	}

	// General path: compute y * stride + x into a fresh register, leaving the args intact.
	X64Reg yReg = regCache_.Find(RegCache::GEN_ARG_Y);
	X64Reg offReg = regCache_.Alloc(RegCache::GEN_COLOR_OFF);
	if (id.useStandardStride) {
		// Stride is 512, so shift instead of multiplying.
		MOV(32, R(offReg), R(yReg));
		SHL(32, R(offReg), Imm8(9));
	} else {
		if (regCache_.Has(RegCache::GEN_ARG_ID) || regCache_.Has(RegCache::GEN_ID)) {
			X64Reg idReg = GetPixelID();
			MOVZX(32, 16, offReg, MDisp(idReg, offsetof(PixelFuncID, cached.framebufStride)));
			UnlockPixelID(idReg);
		} else {
			// No id register is live: read the id pointer from the stack through offReg itself.
			_assert_(stackIDOffset_ != -1);
			MOV(PTRBITS, R(offReg), MDisp(RSP, stackIDOffset_));
			MOVZX(32, 16, offReg, MDisp(offReg, offsetof(PixelFuncID, cached.framebufStride)));
		}

		IMUL(32, offReg, R(yReg));
	}
	regCache_.Unlock(yReg, RegCache::GEN_ARG_Y);

	X64Reg xReg = regCache_.Find(RegCache::GEN_ARG_X);
	ADD(32, R(offReg), R(xReg));
	regCache_.Unlock(xReg, RegCache::GEN_ARG_X);

	// Finally, turn the pixel index into an address relative to the color buffer base.
	X64Reg baseReg = regCache_.Alloc(RegCache::GEN_TEMP_HELPER);
	if (RipAccessible(&fb.data)) {
		MOV(PTRBITS, R(baseReg), M(&fb.data));
	} else {
		MOV(PTRBITS, R(baseReg), ImmPtr(&fb.data));
		MOV(PTRBITS, R(baseReg), MatR(baseReg));
	}
	LEA(PTRBITS, offReg, MComplex(baseReg, offReg, bytesPerPixel, 0));
	regCache_.Release(baseReg, RegCache::GEN_TEMP_HELPER);

	return offReg;
}
235
236
// Returns a register holding the address of the destination entry in the depth buffer.
// Computed on first use; afterwards it is found in the reg cache as GEN_DEPTH_OFF.
RegCache::Reg PixelJitCache::GetDepthOff(const PixelFuncID &id) {
	if (regCache_.Has(RegCache::GEN_DEPTH_OFF))
		return regCache_.Find(RegCache::GEN_DEPTH_OFF);

	// When color and depth both use a 512 stride, the offsets match.
	if (id.useStandardStride && !id.dithering) {
		// GetColorOff() calculates both in one go, so trigger it and drop its lock.
		X64Reg colorAddrReg = GetColorOff(id);
		regCache_.Unlock(colorAddrReg, RegCache::GEN_COLOR_OFF);
		return regCache_.Find(RegCache::GEN_DEPTH_OFF);
	}

	Describe("GetDepthOff");
	X64Reg yReg = regCache_.Find(RegCache::GEN_ARG_Y);
	X64Reg offReg = regCache_.Alloc(RegCache::GEN_DEPTH_OFF);
	if (id.useStandardStride) {
		// Stride is 512, so shift instead of multiplying.
		MOV(32, R(offReg), R(yReg));
		SHL(32, R(offReg), Imm8(9));
	} else {
		if (regCache_.Has(RegCache::GEN_ARG_ID) || regCache_.Has(RegCache::GEN_ID)) {
			X64Reg idReg = GetPixelID();
			MOVZX(32, 16, offReg, MDisp(idReg, offsetof(PixelFuncID, cached.depthbufStride)));
			UnlockPixelID(idReg);
		} else {
			// No id register is live: read the id pointer from the stack through offReg itself.
			_assert_(stackIDOffset_ != -1);
			MOV(PTRBITS, R(offReg), MDisp(RSP, stackIDOffset_));
			MOVZX(32, 16, offReg, MDisp(offReg, offsetof(PixelFuncID, cached.depthbufStride)));
		}

		IMUL(32, offReg, R(yReg));
	}
	regCache_.Unlock(yReg, RegCache::GEN_ARG_Y);

	X64Reg xReg = regCache_.Find(RegCache::GEN_ARG_X);
	ADD(32, R(offReg), R(xReg));
	regCache_.Unlock(xReg, RegCache::GEN_ARG_X);

	// Turn the index into an address: depth entries are 2 bytes each.
	X64Reg baseReg = regCache_.Alloc(RegCache::GEN_TEMP_HELPER);
	if (RipAccessible(&depthbuf.data)) {
		MOV(PTRBITS, R(baseReg), M(&depthbuf.data));
	} else {
		MOV(PTRBITS, R(baseReg), ImmPtr(&depthbuf.data));
		MOV(PTRBITS, R(baseReg), MatR(baseReg));
	}
	LEA(PTRBITS, offReg, MComplex(baseReg, offReg, 2, 0));
	regCache_.Release(baseReg, RegCache::GEN_TEMP_HELPER);

	return offReg;
}
285
286
287
// Loads the destination pixel's stencil value, expanded to 8 bits, into a GEN_STENCIL register.
// Returns INVALID_REG for 565, which has no stencil bits (the value is a fixed zero).
RegCache::Reg PixelJitCache::GetDestStencil(const PixelFuncID &id) {
	const auto fmt = id.FBFormat();
	if (fmt == GE_FORMAT_565)
		return INVALID_REG;

	X64Reg dstReg = GetColorOff(id);
	Describe("GetDestStencil");
	X64Reg stencilReg = regCache_.Alloc(RegCache::GEN_STENCIL);
	switch (fmt) {
	case GE_FORMAT_8888:
		// The top byte of the pixel is used directly.
		MOVZX(32, 8, stencilReg, MDisp(dstReg, 3));
		break;

	case GE_FORMAT_5551:
		// Arithmetic shift smears the top bit across the low byte: 0x00 or 0xFF.
		MOVZX(32, 8, stencilReg, MDisp(dstReg, 1));
		SAR(8, R(stencilReg), Imm8(7));
		break;

	case GE_FORMAT_4444: {
		// Move the high nibble down, then duplicate it into both nibbles.
		MOVZX(32, 8, stencilReg, MDisp(dstReg, 1));
		SHR(32, R(stencilReg), Imm8(4));
		X64Reg nibbleReg = regCache_.Alloc(RegCache::GEN_TEMP_HELPER);
		MOV(32, R(nibbleReg), R(stencilReg));
		SHL(32, R(nibbleReg), Imm8(4));
		OR(32, R(stencilReg), R(nibbleReg));
		regCache_.Release(nibbleReg, RegCache::GEN_TEMP_HELPER);
		break;
	}

	default:
		break;
	}
	regCache_.Unlock(dstReg, RegCache::GEN_COLOR_OFF);

	return stencilReg;
}
313
314
void PixelJitCache::Discard() {
315
discards_.push_back(J(true));
316
}
317
318
// Emits a conditional jump (taken when cc holds) that discards the pixel.
// The branch is recorded in discards_ so its target can be set later.
void PixelJitCache::Discard(Gen::CCFlags cc) {
	FixupBranch toDiscard = J_CC(cc, true);
	discards_.push_back(toDiscard);
}
321
322
void PixelJitCache::WriteConstantPool(const PixelFuncID &id) {
323
// This is used to add a fixed point 0.5 (as s.11.4) for blend factors to multiply accurately.
324
WriteSimpleConst8x16(constBlendHalf_11_4s_, 1 << 3);
325
326
// This is used for shifted blend factors, to inverse them.
327
WriteSimpleConst8x16(constBlendInvert_11_4s_, 0xFF << 4);
328
}
329
330
bool PixelJitCache::Jit_ApplyDepthRange(const PixelFuncID &id) {
331
if (id.applyDepthRange && !id.earlyZChecks) {
332
Describe("ApplyDepthR");
333
X64Reg argZReg = regCache_.Find(RegCache::GEN_ARG_Z);
334
X64Reg idReg = GetPixelID();
335
336
// We expanded this to 32 bits, so it's convenient to compare.
337
CMP(32, R(argZReg), MDisp(idReg, offsetof(PixelFuncID, cached.minz)));
338
Discard(CC_L);
339
340
// We load the low 16 bits, but compare all 32 of z. Above handles < 0.
341
CMP(32, R(argZReg), MDisp(idReg, offsetof(PixelFuncID, cached.maxz)));
342
Discard(CC_G);
343
344
UnlockPixelID(idReg);
345
regCache_.Unlock(argZReg, RegCache::GEN_ARG_Z);
346
}
347
348
// Since this is early on, try to free up the z reg if we don't need it anymore.
349
if (id.clearMode && !id.DepthClear())
350
regCache_.ForceRelease(RegCache::GEN_ARG_Z);
351
else if (!id.clearMode && !id.depthWrite && (id.DepthTestFunc() == GE_COMP_ALWAYS || id.earlyZChecks))
352
regCache_.ForceRelease(RegCache::GEN_ARG_Z);
353
354
return true;
355
}
356
357
bool PixelJitCache::Jit_AlphaTest(const PixelFuncID &id) {
358
// Take care of ALWAYS/NEVER first. ALWAYS is common, means disabled.
359
Describe("AlphaTest");
360
switch (id.AlphaTestFunc()) {
361
case GE_COMP_NEVER:
362
Discard();
363
return true;
364
365
case GE_COMP_ALWAYS:
366
return true;
367
368
default:
369
break;
370
}
371
372
// Load alpha into its own general reg.
373
X64Reg alphaReg;
374
if (regCache_.Has(RegCache::GEN_SRC_ALPHA)) {
375
alphaReg = regCache_.Find(RegCache::GEN_SRC_ALPHA);
376
} else {
377
alphaReg = regCache_.Alloc(RegCache::GEN_SRC_ALPHA);
378
_assert_(!colorIs16Bit_);
379
X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
380
MOVD_xmm(R(alphaReg), argColorReg);
381
regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);
382
SHR(32, R(alphaReg), Imm8(24));
383
}
384
385
if (id.hasAlphaTestMask) {
386
// Unfortunate, we'll need pixelID to load the mask.
387
// Note: we leave the ALPHA purpose untouched and free it, because later code may reuse.
388
X64Reg idReg = GetPixelID();
389
X64Reg maskedReg = regCache_.Alloc(RegCache::GEN_TEMP0);
390
391
MOVZX(32, 8, maskedReg, MDisp(idReg, offsetof(PixelFuncID, cached.alphaTestMask)));
392
UnlockPixelID(idReg);
393
AND(32, R(maskedReg), R(alphaReg));
394
regCache_.Unlock(alphaReg, RegCache::GEN_SRC_ALPHA);
395
396
// Okay now do the rest using the masked reg, which we modified.
397
alphaReg = maskedReg;
398
}
399
400
// We hardcode the ref into this jit func.
401
CMP(8, R(alphaReg), Imm8(id.alphaTestRef));
402
if (id.hasAlphaTestMask)
403
regCache_.Release(alphaReg, RegCache::GEN_TEMP0);
404
else
405
regCache_.Unlock(alphaReg, RegCache::GEN_SRC_ALPHA);
406
407
switch (id.AlphaTestFunc()) {
408
case GE_COMP_NEVER:
409
case GE_COMP_ALWAYS:
410
break;
411
412
case GE_COMP_EQUAL:
413
Discard(CC_NE);
414
break;
415
416
case GE_COMP_NOTEQUAL:
417
Discard(CC_E);
418
break;
419
420
case GE_COMP_LESS:
421
Discard(CC_AE);
422
break;
423
424
case GE_COMP_LEQUAL:
425
Discard(CC_A);
426
break;
427
428
case GE_COMP_GREATER:
429
Discard(CC_BE);
430
break;
431
432
case GE_COMP_GEQUAL:
433
Discard(CC_B);
434
break;
435
}
436
437
return true;
438
}
439
440
bool PixelJitCache::Jit_ColorTest(const PixelFuncID &id) {
441
if (!id.colorTest || id.clearMode)
442
return true;
443
444
// We'll have 4 with fog released, so we're using them all...
445
Describe("ColorTest");
446
X64Reg idReg = GetPixelID();
447
X64Reg funcReg = regCache_.Alloc(RegCache::GEN_TEMP0);
448
X64Reg maskReg = regCache_.Alloc(RegCache::GEN_TEMP1);
449
X64Reg refReg = regCache_.Alloc(RegCache::GEN_TEMP2);
450
451
// First, load the registers: mask and ref.
452
MOV(32, R(maskReg), MDisp(idReg, offsetof(PixelFuncID, cached.colorTestMask)));
453
MOV(32, R(refReg), MDisp(idReg, offsetof(PixelFuncID, cached.colorTestRef)));
454
455
X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
456
if (colorIs16Bit_) {
457
// If it's expanded, we need to clamp anyway if it was fogged.
458
PACKUSWB(argColorReg, R(argColorReg));
459
colorIs16Bit_ = false;
460
}
461
462
// Temporarily abuse funcReg to grab the color into maskReg.
463
MOVD_xmm(R(funcReg), argColorReg);
464
AND(32, R(maskReg), R(funcReg));
465
regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);
466
467
// Now that we're setup, get the func and follow it.
468
MOVZX(32, 8, funcReg, MDisp(idReg, offsetof(PixelFuncID, cached.colorTestFunc)));
469
UnlockPixelID(idReg);
470
471
CMP(8, R(funcReg), Imm8(GE_COMP_ALWAYS));
472
// Discard for GE_COMP_NEVER...
473
Discard(CC_B);
474
FixupBranch skip = J_CC(CC_E);
475
476
CMP(8, R(funcReg), Imm8(GE_COMP_EQUAL));
477
FixupBranch doEqual = J_CC(CC_E);
478
regCache_.Release(funcReg, RegCache::GEN_TEMP0);
479
480
// The not equal path here... if they are equal, we discard.
481
CMP(32, R(refReg), R(maskReg));
482
Discard(CC_E);
483
FixupBranch skip2 = J();
484
485
SetJumpTarget(doEqual);
486
CMP(32, R(refReg), R(maskReg));
487
Discard(CC_NE);
488
489
regCache_.Release(maskReg, RegCache::GEN_TEMP1);
490
regCache_.Release(refReg, RegCache::GEN_TEMP2);
491
492
SetJumpTarget(skip);
493
SetJumpTarget(skip2);
494
495
return true;
496
}
497
498
bool PixelJitCache::Jit_ApplyFog(const PixelFuncID &id) {
499
if (!id.applyFog) {
500
// Okay, anyone can use the fog register then.
501
regCache_.ForceRelease(RegCache::GEN_ARG_FOG);
502
return true;
503
}
504
505
// Load fog and expand to 16 bit. Ignore the high 8 bits, which'll match up with A.
506
Describe("ApplyFog");
507
X64Reg fogColorReg = regCache_.Alloc(RegCache::VEC_TEMP1);
508
X64Reg idReg = GetPixelID();
509
if (cpu_info.bSSE4_1) {
510
PMOVZXBW(fogColorReg, MDisp(idReg, offsetof(PixelFuncID, cached.fogColor)));
511
} else {
512
X64Reg zeroReg = GetZeroVec();
513
MOVD_xmm(fogColorReg, MDisp(idReg, offsetof(PixelFuncID, cached.fogColor)));
514
PUNPCKLBW(fogColorReg, R(zeroReg));
515
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
516
}
517
UnlockPixelID(idReg);
518
519
// Load a set of 255s at 16 bit into a reg for later...
520
X64Reg invertReg = regCache_.Alloc(RegCache::VEC_TEMP2);
521
PCMPEQW(invertReg, R(invertReg));
522
PSRLW(invertReg, 8);
523
524
// Expand (we clamped) color to 16 bit as well, so we can multiply with fog.
525
X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
526
if (!colorIs16Bit_) {
527
if (cpu_info.bSSE4_1) {
528
PMOVZXBW(argColorReg, R(argColorReg));
529
} else {
530
X64Reg zeroReg = GetZeroVec();
531
PUNPCKLBW(argColorReg, R(zeroReg));
532
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
533
}
534
colorIs16Bit_ = true;
535
}
536
537
// Save A so we can put it back, we don't "fog" A.
538
X64Reg alphaReg;
539
if (regCache_.Has(RegCache::GEN_SRC_ALPHA)) {
540
alphaReg = regCache_.Find(RegCache::GEN_SRC_ALPHA);
541
} else {
542
alphaReg = regCache_.Alloc(RegCache::GEN_SRC_ALPHA);
543
PEXTRW(alphaReg, argColorReg, 3);
544
}
545
546
// Okay, let's broadcast fog to an XMM.
547
X64Reg fogMultReg = regCache_.Alloc(RegCache::VEC_TEMP3);
548
X64Reg argFogReg = regCache_.Find(RegCache::GEN_ARG_FOG);
549
MOVD_xmm(fogMultReg, R(argFogReg));
550
PSHUFLW(fogMultReg, R(fogMultReg), _MM_SHUFFLE(0, 0, 0, 0));
551
regCache_.Unlock(argFogReg, RegCache::GEN_ARG_FOG);
552
// We can free up the actual fog reg now.
553
regCache_.ForceRelease(RegCache::GEN_ARG_FOG);
554
555
// Our goal here is to calculate this formula:
556
// (argColor * fog + fogColor * (255 - fog) + 255) / 256
557
558
// Now we multiply the existing color by fog...
559
PMULLW(argColorReg, R(fogMultReg));
560
// Before inversing, let's add that 255 we loaded in as well, since we have it.
561
PADDW(argColorReg, R(invertReg));
562
// And then inverse the fog value using those 255s, and multiply by fog color.
563
PSUBW(invertReg, R(fogMultReg));
564
PMULLW(fogColorReg, R(invertReg));
565
// At this point, argColorReg and fogColorReg are multiplied at 16-bit, so we need to sum.
566
PADDW(argColorReg, R(fogColorReg));
567
regCache_.Release(fogColorReg, RegCache::VEC_TEMP1);
568
regCache_.Release(invertReg, RegCache::VEC_TEMP2);
569
regCache_.Release(fogMultReg, RegCache::VEC_TEMP3);
570
571
// Now we simply divide by 256, or in other words shift by 8.
572
PSRLW(argColorReg, 8);
573
574
// Okay, put A back in, we'll shrink it to 8888 when needed.
575
PINSRW(argColorReg, R(alphaReg), 3);
576
regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);
577
578
// We most likely won't use alphaReg again.
579
regCache_.Unlock(alphaReg, RegCache::GEN_SRC_ALPHA);
580
581
return true;
582
}
583
584
bool PixelJitCache::Jit_StencilAndDepthTest(const PixelFuncID &id) {
585
_assert_(!id.clearMode && id.stencilTest);
586
587
X64Reg stencilReg = GetDestStencil(id);
588
Describe("StencilAndDepth");
589
X64Reg maskedReg = stencilReg;
590
if (id.hasStencilTestMask && stencilReg != INVALID_REG) {
591
X64Reg idReg = GetPixelID();
592
maskedReg = regCache_.Alloc(RegCache::GEN_TEMP0);
593
MOV(32, R(maskedReg), R(stencilReg));
594
AND(8, R(maskedReg), MDisp(idReg, offsetof(PixelFuncID, cached.stencilTestMask)));
595
UnlockPixelID(idReg);
596
}
597
598
bool success = true;
599
success = success && Jit_StencilTest(id, stencilReg, maskedReg);
600
if (maskedReg != stencilReg)
601
regCache_.Release(maskedReg, RegCache::GEN_TEMP0);
602
603
// Next up, the depth test.
604
if (stencilReg == INVALID_REG) {
605
// Just use the standard one, since we don't need to write stencil.
606
// We also don't need to worry about cleanup either.
607
return success && Jit_DepthTest(id);
608
}
609
610
success = success && Jit_DepthTestForStencil(id, stencilReg);
611
success = success && Jit_ApplyStencilOp(id, id.ZPass(), stencilReg);
612
613
// At this point, stencilReg can't be spilled. It contains the updated value.
614
regCache_.Unlock(stencilReg, RegCache::GEN_STENCIL);
615
regCache_.ForceRetain(RegCache::GEN_STENCIL);
616
617
return success;
618
}
619
620
bool PixelJitCache::Jit_StencilTest(const PixelFuncID &id, RegCache::Reg stencilReg, RegCache::Reg maskedReg) {
621
Describe("StencilTest");
622
623
bool hasFixedResult = false;
624
bool fixedResult = false;
625
FixupBranch toPass;
626
if (stencilReg == INVALID_REG) {
627
// This means stencil is a fixed value 0.
628
hasFixedResult = true;
629
switch (id.StencilTestFunc()) {
630
case GE_COMP_NEVER: fixedResult = false; break;
631
case GE_COMP_ALWAYS: fixedResult = true; break;
632
case GE_COMP_EQUAL: fixedResult = id.stencilTestRef == 0; break;
633
case GE_COMP_NOTEQUAL: fixedResult = id.stencilTestRef != 0; break;
634
case GE_COMP_LESS: fixedResult = false; break;
635
case GE_COMP_LEQUAL: fixedResult = id.stencilTestRef == 0; break;
636
case GE_COMP_GREATER: fixedResult = id.stencilTestRef != 0; break;
637
case GE_COMP_GEQUAL: fixedResult = true; break;
638
}
639
} else if (id.StencilTestFunc() == GE_COMP_ALWAYS) {
640
// Fairly common, skip the CMP.
641
hasFixedResult = true;
642
fixedResult = true;
643
} else {
644
// Reversed here because of the imm, so tests below are reversed.
645
CMP(8, R(maskedReg), Imm8(id.stencilTestRef));
646
switch (id.StencilTestFunc()) {
647
case GE_COMP_NEVER:
648
hasFixedResult = true;
649
fixedResult = false;
650
break;
651
652
case GE_COMP_ALWAYS:
653
_assert_(false);
654
break;
655
656
case GE_COMP_EQUAL:
657
toPass = J_CC(CC_E);
658
break;
659
660
case GE_COMP_NOTEQUAL:
661
toPass = J_CC(CC_NE);
662
break;
663
664
case GE_COMP_LESS:
665
toPass = J_CC(CC_A);
666
break;
667
668
case GE_COMP_LEQUAL:
669
toPass = J_CC(CC_AE);
670
break;
671
672
case GE_COMP_GREATER:
673
toPass = J_CC(CC_B);
674
break;
675
676
case GE_COMP_GEQUAL:
677
toPass = J_CC(CC_BE);
678
break;
679
}
680
}
681
682
if (hasFixedResult && !fixedResult && stencilReg == INVALID_REG) {
683
Discard();
684
return true;
685
}
686
687
bool hadColorOffReg = regCache_.Has(RegCache::GEN_COLOR_OFF);
688
bool hadIdReg = regCache_.Has(RegCache::GEN_ID);
689
690
bool success = true;
691
if (stencilReg != INVALID_REG && (!hasFixedResult || !fixedResult)) {
692
// This is the fail path.
693
success = success && Jit_ApplyStencilOp(id, id.SFail(), stencilReg);
694
success = success && Jit_WriteStencilOnly(id, stencilReg);
695
696
Discard();
697
}
698
699
// If we allocated either id or colorOff in the conditional, forget.
700
if (!hadColorOffReg && regCache_.Has(RegCache::GEN_COLOR_OFF))
701
regCache_.Change(RegCache::GEN_COLOR_OFF, RegCache::GEN_INVALID);
702
if (!hadIdReg && regCache_.Has(RegCache::GEN_ID))
703
regCache_.Change(RegCache::GEN_ID, RegCache::GEN_INVALID);
704
705
if (!hasFixedResult)
706
SetJumpTarget(toPass);
707
return success;
708
}
709
710
bool PixelJitCache::Jit_DepthTestForStencil(const PixelFuncID &id, RegCache::Reg stencilReg) {
711
if (id.DepthTestFunc() == GE_COMP_ALWAYS || id.earlyZChecks)
712
return true;
713
714
X64Reg depthOffReg = GetDepthOff(id);
715
Describe("DepthTestStencil");
716
X64Reg argZReg = regCache_.Find(RegCache::GEN_ARG_Z);
717
CMP(16, R(argZReg), MatR(depthOffReg));
718
regCache_.Unlock(depthOffReg, RegCache::GEN_DEPTH_OFF);
719
regCache_.Unlock(argZReg, RegCache::GEN_ARG_Z);
720
721
// We discard the opposite of the passing test.
722
FixupBranch skip;
723
switch (id.DepthTestFunc()) {
724
case GE_COMP_NEVER:
725
// Shouldn't happen, just do an extra CMP.
726
CMP(32, R(RAX), R(RAX));
727
// This is just to have a skip that is valid.
728
skip = J_CC(CC_NE);
729
break;
730
731
case GE_COMP_ALWAYS:
732
// Shouldn't happen, just do an extra CMP.
733
CMP(32, R(RAX), R(RAX));
734
skip = J_CC(CC_E);
735
break;
736
737
case GE_COMP_EQUAL:
738
skip = J_CC(CC_E);
739
break;
740
741
case GE_COMP_NOTEQUAL:
742
skip = J_CC(CC_NE);
743
break;
744
745
case GE_COMP_LESS:
746
skip = J_CC(CC_B);
747
break;
748
749
case GE_COMP_LEQUAL:
750
skip = J_CC(CC_BE);
751
break;
752
753
case GE_COMP_GREATER:
754
skip = J_CC(CC_A);
755
break;
756
757
case GE_COMP_GEQUAL:
758
skip = J_CC(CC_AE);
759
break;
760
}
761
762
bool hadColorOffReg = regCache_.Has(RegCache::GEN_COLOR_OFF);
763
bool hadIdReg = regCache_.Has(RegCache::GEN_ID);
764
765
bool success = true;
766
success = success && Jit_ApplyStencilOp(id, id.ZFail(), stencilReg);
767
success = success && Jit_WriteStencilOnly(id, stencilReg);
768
Discard();
769
770
// If we allocated either id or colorOff in the conditional, forget.
771
if (!hadColorOffReg && regCache_.Has(RegCache::GEN_COLOR_OFF))
772
regCache_.Change(RegCache::GEN_COLOR_OFF, RegCache::GEN_INVALID);
773
if (!hadIdReg && regCache_.Has(RegCache::GEN_ID))
774
regCache_.Change(RegCache::GEN_ID, RegCache::GEN_INVALID);
775
776
SetJumpTarget(skip);
777
778
// Like in Jit_DepthTest(), at this point we may not need this reg anymore.
779
if (!id.depthWrite)
780
regCache_.ForceRelease(RegCache::GEN_ARG_Z);
781
782
return success;
783
}
784
785
bool PixelJitCache::Jit_ApplyStencilOp(const PixelFuncID &id, GEStencilOp op, RegCache::Reg stencilReg) {
786
_assert_(stencilReg != INVALID_REG);
787
788
Describe("ApplyStencil");
789
FixupBranch skip;
790
switch (op) {
791
case GE_STENCILOP_KEEP:
792
// Nothing to do.
793
break;
794
795
case GE_STENCILOP_ZERO:
796
XOR(32, R(stencilReg), R(stencilReg));
797
break;
798
799
case GE_STENCILOP_REPLACE:
800
if (id.hasStencilTestMask) {
801
// Load the unmasked value.
802
X64Reg idReg = GetPixelID();
803
MOVZX(32, 8, stencilReg, MDisp(idReg, offsetof(PixelFuncID, cached.stencilRef)));
804
UnlockPixelID(idReg);
805
} else {
806
MOV(8, R(stencilReg), Imm8(id.stencilTestRef));
807
}
808
break;
809
810
case GE_STENCILOP_INVERT:
811
NOT(8, R(stencilReg));
812
break;
813
814
case GE_STENCILOP_INCR:
815
switch (id.fbFormat) {
816
case GE_FORMAT_565:
817
break;
818
819
case GE_FORMAT_5551:
820
MOV(8, R(stencilReg), Imm8(0xFF));
821
break;
822
823
case GE_FORMAT_4444:
824
CMP(8, R(stencilReg), Imm8(0xF0));
825
skip = J_CC(CC_AE);
826
ADD(8, R(stencilReg), Imm8(0x11));
827
SetJumpTarget(skip);
828
break;
829
830
case GE_FORMAT_8888:
831
CMP(8, R(stencilReg), Imm8(0xFF));
832
skip = J_CC(CC_E);
833
ADD(8, R(stencilReg), Imm8(0x01));
834
SetJumpTarget(skip);
835
break;
836
}
837
break;
838
839
case GE_STENCILOP_DECR:
840
switch (id.fbFormat) {
841
case GE_FORMAT_565:
842
break;
843
844
case GE_FORMAT_5551:
845
XOR(32, R(stencilReg), R(stencilReg));
846
break;
847
848
case GE_FORMAT_4444:
849
CMP(8, R(stencilReg), Imm8(0x11));
850
skip = J_CC(CC_B);
851
SUB(8, R(stencilReg), Imm8(0x11));
852
SetJumpTarget(skip);
853
break;
854
855
case GE_FORMAT_8888:
856
CMP(8, R(stencilReg), Imm8(0x00));
857
skip = J_CC(CC_E);
858
SUB(8, R(stencilReg), Imm8(0x01));
859
SetJumpTarget(skip);
860
break;
861
}
862
break;
863
}
864
865
return true;
866
}
867
868
bool PixelJitCache::Jit_WriteStencilOnly(const PixelFuncID &id, RegCache::Reg stencilReg) {
869
_assert_(stencilReg != INVALID_REG);
870
871
// It's okay to destroy stencilReg here, we know we're the last writing it.
872
X64Reg colorOffReg = GetColorOff(id);
873
Describe("WriteStencil");
874
if (id.applyColorWriteMask) {
875
X64Reg idReg = GetPixelID();
876
X64Reg maskReg = regCache_.Alloc(RegCache::GEN_TEMP5);
877
878
switch (id.fbFormat) {
879
case GE_FORMAT_565:
880
break;
881
882
case GE_FORMAT_5551:
883
// Read the high 8 bits of the 16-bit color mask.
884
MOVZX(32, 8, maskReg, MDisp(idReg, offsetof(PixelFuncID, cached.colorWriteMask) + 1));
885
OR(8, R(maskReg), Imm8(0x7F));
886
887
// Poor man's BIC...
888
NOT(32, R(stencilReg));
889
OR(32, R(stencilReg), R(maskReg));
890
NOT(32, R(stencilReg));
891
892
AND(8, MDisp(colorOffReg, 1), R(maskReg));
893
OR(8, MDisp(colorOffReg, 1), R(stencilReg));
894
break;
895
896
case GE_FORMAT_4444:
897
// Read the high 8 bits of the 16-bit color mask.
898
MOVZX(32, 8, maskReg, MDisp(idReg, offsetof(PixelFuncID, cached.colorWriteMask) + 1));
899
OR(8, R(maskReg), Imm8(0x0F));
900
901
// Poor man's BIC...
902
NOT(32, R(stencilReg));
903
OR(32, R(stencilReg), R(maskReg));
904
NOT(32, R(stencilReg));
905
906
AND(8, MDisp(colorOffReg, 1), R(maskReg));
907
OR(8, MDisp(colorOffReg, 1), R(stencilReg));
908
break;
909
910
case GE_FORMAT_8888:
911
// Read the high 8 bits of the 32-bit color mask.
912
MOVZX(32, 8, maskReg, MDisp(idReg, offsetof(PixelFuncID, cached.colorWriteMask) + 3));
913
914
// Poor man's BIC...
915
NOT(32, R(stencilReg));
916
OR(32, R(stencilReg), R(maskReg));
917
NOT(32, R(stencilReg));
918
919
AND(8, MDisp(colorOffReg, 3), R(maskReg));
920
OR(8, MDisp(colorOffReg, 3), R(stencilReg));
921
break;
922
}
923
924
regCache_.Release(maskReg, RegCache::GEN_TEMP5);
925
UnlockPixelID(idReg);
926
} else {
927
switch (id.fbFormat) {
928
case GE_FORMAT_565:
929
break;
930
931
case GE_FORMAT_5551:
932
AND(8, R(stencilReg), Imm8(0x80));
933
AND(8, MDisp(colorOffReg, 1), Imm8(0x7F));
934
OR(8, MDisp(colorOffReg, 1), R(stencilReg));
935
break;
936
937
case GE_FORMAT_4444:
938
AND(8, MDisp(colorOffReg, 1), Imm8(0x0F));
939
AND(8, R(stencilReg), Imm8(0xF0));
940
OR(8, MDisp(colorOffReg, 1), R(stencilReg));
941
break;
942
943
case GE_FORMAT_8888:
944
MOV(8, MDisp(colorOffReg, 3), R(stencilReg));
945
break;
946
}
947
}
948
949
regCache_.Unlock(colorOffReg, RegCache::GEN_COLOR_OFF);
950
return true;
951
}
952
953
bool PixelJitCache::Jit_DepthTest(const PixelFuncID &id) {
954
if (id.DepthTestFunc() == GE_COMP_ALWAYS || id.earlyZChecks)
955
return true;
956
957
if (id.DepthTestFunc() == GE_COMP_NEVER) {
958
Discard();
959
// This should be uncommon, just keep going to have shared cleanup...
960
}
961
962
X64Reg depthOffReg = GetDepthOff(id);
963
Describe("DepthTest");
964
X64Reg argZReg = regCache_.Find(RegCache::GEN_ARG_Z);
965
CMP(16, R(argZReg), MatR(depthOffReg));
966
regCache_.Unlock(depthOffReg, RegCache::GEN_DEPTH_OFF);
967
regCache_.Unlock(argZReg, RegCache::GEN_ARG_Z);
968
969
// We discard the opposite of the passing test.
970
switch (id.DepthTestFunc()) {
971
case GE_COMP_NEVER:
972
case GE_COMP_ALWAYS:
973
break;
974
975
case GE_COMP_EQUAL:
976
Discard(CC_NE);
977
break;
978
979
case GE_COMP_NOTEQUAL:
980
Discard(CC_E);
981
break;
982
983
case GE_COMP_LESS:
984
Discard(CC_AE);
985
break;
986
987
case GE_COMP_LEQUAL:
988
Discard(CC_A);
989
break;
990
991
case GE_COMP_GREATER:
992
Discard(CC_BE);
993
break;
994
995
case GE_COMP_GEQUAL:
996
Discard(CC_B);
997
break;
998
}
999
1000
// If we're not writing, we don't need Z anymore. We'll free GEN_DEPTH_OFF in Jit_WriteDepth().
1001
if (!id.depthWrite)
1002
regCache_.ForceRelease(RegCache::GEN_ARG_Z);
1003
1004
return true;
1005
}
1006
1007
bool PixelJitCache::Jit_WriteDepth(const PixelFuncID &id) {
1008
// Clear mode shares depthWrite for DepthClear().
1009
if (id.depthWrite) {
1010
X64Reg depthOffReg = GetDepthOff(id);
1011
Describe("WriteDepth");
1012
X64Reg argZReg = regCache_.Find(RegCache::GEN_ARG_Z);
1013
MOV(16, MatR(depthOffReg), R(argZReg));
1014
regCache_.Unlock(depthOffReg, RegCache::GEN_DEPTH_OFF);
1015
regCache_.Unlock(argZReg, RegCache::GEN_ARG_Z);
1016
regCache_.ForceRelease(RegCache::GEN_ARG_Z);
1017
}
1018
1019
// We can free up this reg if we force locked it.
1020
if (regCache_.Has(RegCache::GEN_DEPTH_OFF)) {
1021
regCache_.ForceRelease(RegCache::GEN_DEPTH_OFF);
1022
}
1023
1024
return true;
1025
}
1026
1027
bool PixelJitCache::Jit_AlphaBlend(const PixelFuncID &id) {
1028
if (!id.alphaBlend)
1029
return true;
1030
1031
// Check if we need to load and prep factors.
1032
PixelBlendState blendState;
1033
ComputePixelBlendState(blendState, id);
1034
1035
bool success = true;
1036
1037
// Step 1: Load and expand dest color.
1038
X64Reg dstReg = regCache_.Alloc(RegCache::VEC_TEMP0);
1039
if (!blendState.readsDstPixel) {
1040
// Let's load colorOff just for registers to be consistent.
1041
X64Reg colorOff = GetColorOff(id);
1042
regCache_.Unlock(colorOff, RegCache::GEN_COLOR_OFF);
1043
1044
PXOR(dstReg, R(dstReg));
1045
} else if (id.FBFormat() == GE_FORMAT_8888) {
1046
X64Reg colorOff = GetColorOff(id);
1047
Describe("AlphaBlend");
1048
MOVD_xmm(dstReg, MatR(colorOff));
1049
regCache_.Unlock(colorOff, RegCache::GEN_COLOR_OFF);
1050
} else {
1051
X64Reg colorOff = GetColorOff(id);
1052
Describe("AlphaBlend");
1053
X64Reg dstGenReg = regCache_.Alloc(RegCache::GEN_TEMP0);
1054
MOVZX(32, 16, dstGenReg, MatR(colorOff));
1055
regCache_.Unlock(colorOff, RegCache::GEN_COLOR_OFF);
1056
1057
X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
1058
X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
1059
1060
switch (id.fbFormat) {
1061
case GE_FORMAT_565:
1062
success = success && Jit_ConvertFrom565(id, dstGenReg, temp1Reg, temp2Reg);
1063
break;
1064
1065
case GE_FORMAT_5551:
1066
success = success && Jit_ConvertFrom5551(id, dstGenReg, temp1Reg, temp2Reg, blendState.usesDstAlpha);
1067
break;
1068
1069
case GE_FORMAT_4444:
1070
success = success && Jit_ConvertFrom4444(id, dstGenReg, temp1Reg, temp2Reg, blendState.usesDstAlpha);
1071
break;
1072
1073
case GE_FORMAT_8888:
1074
break;
1075
}
1076
1077
Describe("AlphaBlend");
1078
MOVD_xmm(dstReg, R(dstGenReg));
1079
1080
regCache_.Release(dstGenReg, RegCache::GEN_TEMP0);
1081
regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
1082
regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
1083
}
1084
1085
// Step 2: Load and apply factors.
1086
X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
1087
if (blendState.usesFactors) {
1088
X64Reg srcFactorReg = regCache_.Alloc(RegCache::VEC_TEMP1);
1089
X64Reg dstFactorReg = regCache_.Alloc(RegCache::VEC_TEMP2);
1090
1091
// We apply these at 16-bit, because they can be doubled and have a half offset.
1092
if (cpu_info.bSSE4_1) {
1093
if (!colorIs16Bit_)
1094
PMOVZXBW(argColorReg, R(argColorReg));
1095
PMOVZXBW(dstReg, R(dstReg));
1096
} else {
1097
X64Reg zeroReg = GetZeroVec();
1098
if (!colorIs16Bit_)
1099
PUNPCKLBW(argColorReg, R(zeroReg));
1100
PUNPCKLBW(dstReg, R(zeroReg));
1101
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
1102
}
1103
colorIs16Bit_ = true;
1104
1105
// Skip multiplying by factors if we can.
1106
bool multiplySrc = id.AlphaBlendSrc() != PixelBlendFactor::ZERO && id.AlphaBlendSrc() != PixelBlendFactor::ONE;
1107
bool multiplyDst = id.AlphaBlendDst() != PixelBlendFactor::ZERO && id.AlphaBlendDst() != PixelBlendFactor::ONE;
1108
// We also shift left by 4, so mulhi gives us a free shift
1109
// We also need to add a half bit later, so this gives us space.
1110
if (multiplySrc || blendState.srcColorAsFactor)
1111
PSLLW(argColorReg, 4);
1112
if (multiplyDst || blendState.dstColorAsFactor || blendState.usesDstAlpha)
1113
PSLLW(dstReg, 4);
1114
1115
// Okay, now grab our factors. Don't bother if they're known values.
1116
if (id.AlphaBlendSrc() < PixelBlendFactor::ZERO)
1117
success = success && Jit_BlendFactor(id, srcFactorReg, dstReg, id.AlphaBlendSrc());
1118
if (id.AlphaBlendDst() < PixelBlendFactor::ZERO)
1119
success = success && Jit_DstBlendFactor(id, srcFactorReg, dstFactorReg, dstReg);
1120
1121
X64Reg halfReg = INVALID_REG;
1122
if (multiplySrc || multiplyDst) {
1123
halfReg = regCache_.Alloc(RegCache::VEC_TEMP3);
1124
// We'll use this several times, so load into a reg.
1125
MOVDQA(halfReg, M(constBlendHalf_11_4s_));
1126
}
1127
1128
// Add in the half bit to the factors and color values, then multiply.
1129
// We take the high 16 bits to get a free right shift by 16.
1130
if (multiplySrc) {
1131
POR(srcFactorReg, R(halfReg));
1132
POR(argColorReg, R(halfReg));
1133
PMULHUW(argColorReg, R(srcFactorReg));
1134
} else if (id.AlphaBlendSrc() == PixelBlendFactor::ZERO) {
1135
PXOR(argColorReg, R(argColorReg));
1136
} else if (id.AlphaBlendSrc() == PixelBlendFactor::ONE) {
1137
if (blendState.srcColorAsFactor)
1138
PSRLW(argColorReg, 4);
1139
}
1140
1141
if (multiplyDst) {
1142
POR(dstFactorReg, R(halfReg));
1143
POR(dstReg, R(halfReg));
1144
PMULHUW(dstReg, R(dstFactorReg));
1145
} else if (id.AlphaBlendDst() == PixelBlendFactor::ZERO) {
1146
// No need to add or subtract zero, unless we're negating.
1147
// This is common for bloom preparation.
1148
if (id.AlphaBlendEq() == GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE)
1149
PXOR(dstReg, R(dstReg));
1150
} else if (id.AlphaBlendDst() == PixelBlendFactor::ONE) {
1151
if (blendState.dstColorAsFactor || blendState.usesDstAlpha)
1152
PSRLW(dstReg, 4);
1153
}
1154
1155
regCache_.Release(srcFactorReg, RegCache::VEC_TEMP1);
1156
regCache_.Release(dstFactorReg, RegCache::VEC_TEMP2);
1157
if (halfReg != INVALID_REG)
1158
regCache_.Release(halfReg, RegCache::VEC_TEMP3);
1159
} else if (colorIs16Bit_) {
1160
// If it's expanded, shrink and clamp for our min/max/absdiff handling.
1161
PACKUSWB(argColorReg, R(argColorReg));
1162
colorIs16Bit_ = false;
1163
}
1164
1165
// Step 3: Apply equation.
1166
// Note: below, we completely ignore what happens to the alpha bits.
1167
// It won't matter, since we'll replace those with stencil anyway.
1168
X64Reg tempReg = regCache_.Alloc(RegCache::VEC_TEMP1);
1169
switch (id.AlphaBlendEq()) {
1170
case GE_BLENDMODE_MUL_AND_ADD:
1171
if (id.AlphaBlendDst() != PixelBlendFactor::ZERO)
1172
PADDUSW(argColorReg, R(dstReg));
1173
break;
1174
1175
case GE_BLENDMODE_MUL_AND_SUBTRACT:
1176
if (id.AlphaBlendDst() != PixelBlendFactor::ZERO)
1177
PSUBUSW(argColorReg, R(dstReg));
1178
break;
1179
1180
case GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE:
1181
if (cpu_info.bAVX) {
1182
VPSUBUSW(128, argColorReg, dstReg, R(argColorReg));
1183
} else {
1184
MOVDQA(tempReg, R(argColorReg));
1185
MOVDQA(argColorReg, R(dstReg));
1186
PSUBUSW(argColorReg, R(tempReg));
1187
}
1188
break;
1189
1190
case GE_BLENDMODE_MIN:
1191
PMINUB(argColorReg, R(dstReg));
1192
break;
1193
1194
case GE_BLENDMODE_MAX:
1195
PMAXUB(argColorReg, R(dstReg));
1196
break;
1197
1198
case GE_BLENDMODE_ABSDIFF:
1199
// Calculate A=(dst-src < 0 ? 0 : dst-src) and B=(src-dst < 0 ? 0 : src-dst)...
1200
MOVDQA(tempReg, R(dstReg));
1201
PSUBUSB(tempReg, R(argColorReg));
1202
PSUBUSB(argColorReg, R(dstReg));
1203
1204
// Now, one of those must be zero, and the other one is the result (could also be zero.)
1205
POR(argColorReg, R(tempReg));
1206
break;
1207
}
1208
1209
regCache_.Release(dstReg, RegCache::VEC_TEMP0);
1210
regCache_.Release(tempReg, RegCache::VEC_TEMP1);
1211
regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);
1212
1213
return success;
1214
}
1215
1216
bool PixelJitCache::Jit_BlendFactor(const PixelFuncID &id, RegCache::Reg factorReg, RegCache::Reg dstReg, PixelBlendFactor factor) {
1217
X64Reg idReg = INVALID_REG;
1218
X64Reg tempReg = INVALID_REG;
1219
X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
1220
1221
// Everything below expects an expanded 16-bit color
1222
_assert_(colorIs16Bit_);
1223
1224
// Between source and dest factors, only DSTCOLOR, INVDSTCOLOR, and FIXA differ.
1225
// In those cases, it uses SRCCOLOR, INVSRCCOLOR, and FIXB respectively.
1226
1227
// Load the invert constant first off, if needed.
1228
switch (factor) {
1229
case PixelBlendFactor::INVOTHERCOLOR:
1230
case PixelBlendFactor::INVSRCALPHA:
1231
case PixelBlendFactor::INVDSTALPHA:
1232
case PixelBlendFactor::DOUBLEINVSRCALPHA:
1233
case PixelBlendFactor::DOUBLEINVDSTALPHA:
1234
MOVDQA(factorReg, M(constBlendInvert_11_4s_));
1235
break;
1236
1237
default:
1238
break;
1239
}
1240
1241
switch (factor) {
1242
case PixelBlendFactor::OTHERCOLOR:
1243
MOVDQA(factorReg, R(dstReg));
1244
break;
1245
1246
case PixelBlendFactor::INVOTHERCOLOR:
1247
PSUBUSW(factorReg, R(dstReg));
1248
break;
1249
1250
case PixelBlendFactor::SRCALPHA:
1251
PSHUFLW(factorReg, R(argColorReg), _MM_SHUFFLE(3, 3, 3, 3));
1252
break;
1253
1254
case PixelBlendFactor::INVSRCALPHA:
1255
tempReg = regCache_.Alloc(RegCache::VEC_TEMP3);
1256
1257
PSHUFLW(tempReg, R(argColorReg), _MM_SHUFFLE(3, 3, 3, 3));
1258
PSUBUSW(factorReg, R(tempReg));
1259
break;
1260
1261
case PixelBlendFactor::DSTALPHA:
1262
PSHUFLW(factorReg, R(dstReg), _MM_SHUFFLE(3, 3, 3, 3));
1263
break;
1264
1265
case PixelBlendFactor::INVDSTALPHA:
1266
tempReg = regCache_.Alloc(RegCache::VEC_TEMP3);
1267
1268
PSHUFLW(tempReg, R(dstReg), _MM_SHUFFLE(3, 3, 3, 3));
1269
PSUBUSW(factorReg, R(tempReg));
1270
break;
1271
1272
case PixelBlendFactor::DOUBLESRCALPHA:
1273
PSHUFLW(factorReg, R(argColorReg), _MM_SHUFFLE(3, 3, 3, 3));
1274
PSLLW(factorReg, 1);
1275
break;
1276
1277
case PixelBlendFactor::DOUBLEINVSRCALPHA:
1278
tempReg = regCache_.Alloc(RegCache::VEC_TEMP3);
1279
1280
PSHUFLW(tempReg, R(argColorReg), _MM_SHUFFLE(3, 3, 3, 3));
1281
PSLLW(tempReg, 1);
1282
PSUBUSW(factorReg, R(tempReg));
1283
break;
1284
1285
case PixelBlendFactor::DOUBLEDSTALPHA:
1286
PSHUFLW(factorReg, R(dstReg), _MM_SHUFFLE(3, 3, 3, 3));
1287
PSLLW(factorReg, 1);
1288
break;
1289
1290
case PixelBlendFactor::DOUBLEINVDSTALPHA:
1291
tempReg = regCache_.Alloc(RegCache::VEC_TEMP3);
1292
1293
PSHUFLW(tempReg, R(dstReg), _MM_SHUFFLE(3, 3, 3, 3));
1294
PSLLW(tempReg, 1);
1295
PSUBUSW(factorReg, R(tempReg));
1296
break;
1297
1298
case PixelBlendFactor::ZERO:
1299
// Special value meaning zero.
1300
PXOR(factorReg, R(factorReg));
1301
break;
1302
1303
case PixelBlendFactor::ONE:
1304
// Special value meaning all 255s.
1305
PCMPEQD(factorReg, R(factorReg));
1306
PSLLW(factorReg, 8);
1307
PSRLW(factorReg, 4);
1308
break;
1309
1310
case PixelBlendFactor::FIX:
1311
default:
1312
idReg = GetPixelID();
1313
if (cpu_info.bSSE4_1) {
1314
PMOVZXBW(factorReg, MDisp(idReg, offsetof(PixelFuncID, cached.alphaBlendSrc)));
1315
} else {
1316
X64Reg zeroReg = GetZeroVec();
1317
MOVD_xmm(factorReg, MDisp(idReg, offsetof(PixelFuncID, cached.alphaBlendSrc)));
1318
PUNPCKLBW(factorReg, R(zeroReg));
1319
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
1320
}
1321
// Round it out by shifting into place.
1322
PSLLW(factorReg, 4);
1323
break;
1324
}
1325
1326
if (idReg != INVALID_REG)
1327
UnlockPixelID(idReg);
1328
if (tempReg != INVALID_REG)
1329
regCache_.Release(tempReg, RegCache::VEC_TEMP3);
1330
regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);
1331
1332
return true;
1333
}
1334
1335
bool PixelJitCache::Jit_DstBlendFactor(const PixelFuncID &id, RegCache::Reg srcFactorReg, RegCache::Reg dstFactorReg, RegCache::Reg dstReg) {
1336
bool success = true;
1337
X64Reg idReg = INVALID_REG;
1338
X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
1339
1340
// Everything below expects an expanded 16-bit color
1341
_assert_(colorIs16Bit_);
1342
1343
PixelBlendState blendState;
1344
ComputePixelBlendState(blendState, id);
1345
1346
// We might be able to reuse srcFactorReg for dst, in some cases.
1347
switch (id.AlphaBlendDst()) {
1348
case PixelBlendFactor::OTHERCOLOR:
1349
MOVDQA(dstFactorReg, R(argColorReg));
1350
break;
1351
1352
case PixelBlendFactor::INVOTHERCOLOR:
1353
MOVDQA(dstFactorReg, M(constBlendInvert_11_4s_));
1354
PSUBUSW(dstFactorReg, R(argColorReg));
1355
break;
1356
1357
case PixelBlendFactor::SRCALPHA:
1358
case PixelBlendFactor::INVSRCALPHA:
1359
case PixelBlendFactor::DSTALPHA:
1360
case PixelBlendFactor::INVDSTALPHA:
1361
case PixelBlendFactor::DOUBLESRCALPHA:
1362
case PixelBlendFactor::DOUBLEINVSRCALPHA:
1363
case PixelBlendFactor::DOUBLEDSTALPHA:
1364
case PixelBlendFactor::DOUBLEINVDSTALPHA:
1365
case PixelBlendFactor::ZERO:
1366
case PixelBlendFactor::ONE:
1367
// These are all equivalent for src factor, so reuse that logic.
1368
if (id.AlphaBlendSrc() == id.AlphaBlendDst()) {
1369
MOVDQA(dstFactorReg, R(srcFactorReg));
1370
} else if (blendState.dstFactorIsInverse) {
1371
MOVDQA(dstFactorReg, M(constBlendInvert_11_4s_));
1372
PSUBUSW(dstFactorReg, R(srcFactorReg));
1373
} else {
1374
success = success && Jit_BlendFactor(id, dstFactorReg, dstReg, id.AlphaBlendDst());
1375
}
1376
break;
1377
1378
case PixelBlendFactor::FIX:
1379
default:
1380
idReg = GetPixelID();
1381
if (cpu_info.bSSE4_1) {
1382
PMOVZXBW(dstFactorReg, MDisp(idReg, offsetof(PixelFuncID, cached.alphaBlendDst)));
1383
} else {
1384
X64Reg zeroReg = GetZeroVec();
1385
MOVD_xmm(dstFactorReg, MDisp(idReg, offsetof(PixelFuncID, cached.alphaBlendDst)));
1386
PUNPCKLBW(dstFactorReg, R(zeroReg));
1387
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
1388
}
1389
// Round it out by shifting into place.
1390
PSLLW(dstFactorReg, 4);
1391
break;
1392
}
1393
1394
if (idReg != INVALID_REG)
1395
UnlockPixelID(idReg);
1396
regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);
1397
1398
return success;
1399
}
1400
1401
bool PixelJitCache::Jit_Dither(const PixelFuncID &id) {
1402
if (!id.dithering)
1403
return true;
1404
1405
Describe("Dither");
1406
X64Reg valueReg = regCache_.Alloc(RegCache::GEN_TEMP0);
1407
1408
// Load the row dither matrix entry (will still need to get the X.)
1409
X64Reg argYReg = regCache_.Find(RegCache::GEN_ARG_Y);
1410
MOV(32, R(valueReg), R(argYReg));
1411
AND(32, R(valueReg), Imm8(3));
1412
1413
// At this point, we're done with depth and y, so let's grab GEN_COLOR_OFF and retain it.
1414
// Then we can modify x and throw it away too, which is our actual goal.
1415
X64Reg colorOffReg = GetColorOff(id);
1416
Describe("Dither");
1417
regCache_.Unlock(colorOffReg, RegCache::GEN_COLOR_OFF);
1418
regCache_.ForceRetain(RegCache::GEN_COLOR_OFF);
1419
// And get rid of y, we can use for other regs.
1420
regCache_.Unlock(argYReg, RegCache::GEN_ARG_Y);
1421
regCache_.ForceRelease(RegCache::GEN_ARG_Y);
1422
1423
X64Reg argXReg = regCache_.Find(RegCache::GEN_ARG_X);
1424
AND(32, R(argXReg), Imm32(3));
1425
1426
// Sum up (x + y * 4) + ditherMatrix offset to valueReg.
1427
LEA(32, valueReg, MComplex(argXReg, valueReg, 4, offsetof(PixelFuncID, cached.ditherMatrix)));
1428
1429
// Okay, now abuse argXReg to read the PixelFuncID pointer on the stack.
1430
if (regCache_.Has(RegCache::GEN_ARG_ID) || regCache_.Has(RegCache::GEN_ID)) {
1431
X64Reg idReg = GetPixelID();
1432
MOVSX(32, 8, valueReg, MRegSum(idReg, valueReg));
1433
UnlockPixelID(idReg);
1434
} else {
1435
_assert_(stackIDOffset_ != -1);
1436
MOV(PTRBITS, R(argXReg), MDisp(RSP, stackIDOffset_));
1437
MOVSX(32, 8, valueReg, MRegSum(argXReg, valueReg));
1438
}
1439
regCache_.Unlock(argXReg, RegCache::GEN_ARG_X);
1440
regCache_.ForceRelease(RegCache::GEN_ARG_X);
1441
1442
// Copy that value into a vec to add to the color.
1443
X64Reg vecValueReg = regCache_.Alloc(RegCache::VEC_TEMP0);
1444
MOVD_xmm(vecValueReg, R(valueReg));
1445
regCache_.Release(valueReg, RegCache::GEN_TEMP0);
1446
1447
// Now we want to broadcast RGB in 16-bit, but keep A as 0.
1448
// Luckily, we know that third lane (in 16-bit) is zero from MOVD clearing it.
1449
// We use 16-bit because we need a signed add, but we also want to saturate.
1450
PSHUFLW(vecValueReg, R(vecValueReg), _MM_SHUFFLE(2, 0, 0, 0));
1451
1452
// With that, now let's convert the color to 16 bit...
1453
X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
1454
if (!colorIs16Bit_) {
1455
if (cpu_info.bSSE4_1) {
1456
PMOVZXBW(argColorReg, R(argColorReg));
1457
} else {
1458
X64Reg zeroReg = GetZeroVec();
1459
PUNPCKLBW(argColorReg, R(zeroReg));
1460
regCache_.Unlock(zeroReg, RegCache::VEC_ZERO);
1461
}
1462
colorIs16Bit_ = true;
1463
}
1464
// And simply add the dither values.
1465
PADDSW(argColorReg, R(vecValueReg));
1466
regCache_.Release(vecValueReg, RegCache::VEC_TEMP0);
1467
regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);
1468
1469
return true;
1470
}
1471
1472
bool PixelJitCache::Jit_WriteColor(const PixelFuncID &id) {
1473
X64Reg colorOff = GetColorOff(id);
1474
Describe("WriteColor");
1475
if (regCache_.Has(RegCache::GEN_ARG_X)) {
1476
// We normally toss x and y during dithering or useStandardStride with no dithering.
1477
// Free up the regs now to get more reg space.
1478
regCache_.ForceRelease(RegCache::GEN_ARG_X);
1479
regCache_.ForceRelease(RegCache::GEN_ARG_Y);
1480
1481
// But make sure we don't lose GEN_COLOR_OFF, we'll be lost without that now.
1482
regCache_.ForceRetain(RegCache::GEN_COLOR_OFF);
1483
}
1484
1485
// Convert back to 8888 and clamp.
1486
X64Reg argColorReg = regCache_.Find(RegCache::VEC_ARG_COLOR);
1487
if (colorIs16Bit_) {
1488
PACKUSWB(argColorReg, R(argColorReg));
1489
colorIs16Bit_ = false;
1490
}
1491
1492
if (id.clearMode) {
1493
bool drawingDone = false;
1494
if (!id.ColorClear() && !id.StencilClear())
1495
drawingDone = true;
1496
if (!id.ColorClear() && id.FBFormat() == GE_FORMAT_565)
1497
drawingDone = true;
1498
1499
bool success = true;
1500
if (!id.ColorClear() && !drawingDone) {
1501
// Let's reuse Jit_WriteStencilOnly for this path.
1502
X64Reg alphaReg;
1503
if (regCache_.Has(RegCache::GEN_SRC_ALPHA)) {
1504
alphaReg = regCache_.Find(RegCache::GEN_SRC_ALPHA);
1505
} else {
1506
alphaReg = regCache_.Alloc(RegCache::GEN_SRC_ALPHA);
1507
MOVD_xmm(R(alphaReg), argColorReg);
1508
SHR(32, R(alphaReg), Imm8(24));
1509
}
1510
success = Jit_WriteStencilOnly(id, alphaReg);
1511
regCache_.Release(alphaReg, RegCache::GEN_SRC_ALPHA);
1512
1513
drawingDone = true;
1514
}
1515
1516
if (drawingDone) {
1517
regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);
1518
regCache_.ForceRelease(RegCache::VEC_ARG_COLOR);
1519
regCache_.Unlock(colorOff, RegCache::GEN_COLOR_OFF);
1520
regCache_.ForceRelease(RegCache::GEN_COLOR_OFF);
1521
return success;
1522
}
1523
1524
// In this case, we're clearing only color or only color and stencil. Proceed.
1525
}
1526
1527
X64Reg colorReg = regCache_.Alloc(RegCache::GEN_TEMP0);
1528
MOVD_xmm(R(colorReg), argColorReg);
1529
regCache_.Unlock(argColorReg, RegCache::VEC_ARG_COLOR);
1530
regCache_.ForceRelease(RegCache::VEC_ARG_COLOR);
1531
1532
X64Reg stencilReg = INVALID_REG;
1533
if (regCache_.Has(RegCache::GEN_STENCIL))
1534
stencilReg = regCache_.Find(RegCache::GEN_STENCIL);
1535
1536
X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP1);
1537
X64Reg temp2Reg = regCache_.Alloc(RegCache::GEN_TEMP2);
1538
bool convertAlpha = id.clearMode && id.StencilClear();
1539
bool writeAlpha = convertAlpha || stencilReg != INVALID_REG;
1540
uint32_t fixedKeepMask = 0x00000000;
1541
1542
bool success = true;
1543
1544
// Step 1: Load the color into colorReg.
1545
switch (id.fbFormat) {
1546
case GE_FORMAT_565:
1547
// In this case, stencil doesn't matter.
1548
success = success && Jit_ConvertTo565(id, colorReg, temp1Reg, temp2Reg);
1549
break;
1550
1551
case GE_FORMAT_5551:
1552
success = success && Jit_ConvertTo5551(id, colorReg, temp1Reg, temp2Reg, convertAlpha);
1553
1554
if (stencilReg != INVALID_REG) {
1555
// Truncate off the top bit of the stencil.
1556
SHR(32, R(stencilReg), Imm8(7));
1557
SHL(32, R(stencilReg), Imm8(15));
1558
} else if (!writeAlpha) {
1559
fixedKeepMask = 0x8000;
1560
}
1561
break;
1562
1563
case GE_FORMAT_4444:
1564
success = success && Jit_ConvertTo4444(id, colorReg, temp1Reg, temp2Reg, convertAlpha);
1565
1566
if (stencilReg != INVALID_REG) {
1567
// Truncate off the top bit of the stencil.
1568
SHR(32, R(stencilReg), Imm8(4));
1569
SHL(32, R(stencilReg), Imm8(12));
1570
} else if (!writeAlpha) {
1571
fixedKeepMask = 0xF000;
1572
}
1573
break;
1574
1575
case GE_FORMAT_8888:
1576
if (stencilReg != INVALID_REG) {
1577
SHL(32, R(stencilReg), Imm8(24));
1578
// Clear out the alpha bits so we can fit the stencil.
1579
AND(32, R(colorReg), Imm32(0x00FFFFFF));
1580
} else if (!writeAlpha) {
1581
fixedKeepMask = 0xFF000000;
1582
}
1583
break;
1584
}
1585
1586
// Step 2: Load write mask if needed.
1587
// Note that we apply the write mask at the destination bit depth.
1588
Describe("WriteColor");
1589
X64Reg maskReg = INVALID_REG;
1590
if (id.applyColorWriteMask) {
1591
maskReg = regCache_.Alloc(RegCache::GEN_TEMP3);
1592
// Load the pre-converted and combined write mask.
1593
if (regCache_.Has(RegCache::GEN_ARG_ID) || regCache_.Has(RegCache::GEN_ID)) {
1594
X64Reg idReg = GetPixelID();
1595
MOV(32, R(maskReg), MDisp(idReg, offsetof(PixelFuncID, cached.colorWriteMask)));
1596
UnlockPixelID(idReg);
1597
} else {
1598
_assert_(stackIDOffset_ != -1);
1599
MOV(PTRBITS, R(maskReg), MDisp(RSP, stackIDOffset_));
1600
MOV(32, R(maskReg), MDisp(maskReg, offsetof(PixelFuncID, cached.colorWriteMask)));
1601
}
1602
}
1603
1604
// We've run out of regs, let's live without temp2 from here on.
1605
regCache_.Release(temp2Reg, RegCache::GEN_TEMP2);
1606
1607
// Step 3: Apply logic op, combine stencil.
1608
skipStandardWrites_.clear();
1609
if (id.applyLogicOp) {
1610
// Note: we combine stencil during logic op, because it's a bit complex to retain.
1611
success = success && Jit_ApplyLogicOp(id, colorReg, maskReg);
1612
} else if (stencilReg != INVALID_REG) {
1613
OR(32, R(colorReg), R(stencilReg));
1614
}
1615
1616
// Step 4: Write and apply write mask.
1617
Describe("WriteColor");
1618
switch (id.fbFormat) {
1619
case GE_FORMAT_565:
1620
case GE_FORMAT_5551:
1621
case GE_FORMAT_4444:
1622
if (maskReg != INVALID_REG) {
1623
// Zero all other bits, then flip maskReg to clear the bits we're keeping in colorReg.
1624
AND(16, MatR(colorOff), R(maskReg));
1625
if (cpu_info.bBMI1) {
1626
ANDN(32, colorReg, maskReg, R(colorReg));
1627
} else {
1628
NOT(32, R(maskReg));
1629
AND(32, R(colorReg), R(maskReg));
1630
}
1631
OR(16, MatR(colorOff), R(colorReg));
1632
} else if (fixedKeepMask == 0) {
1633
MOV(16, MatR(colorOff), R(colorReg));
1634
} else {
1635
// Clear the non-stencil bits and or in the color.
1636
AND(16, MatR(colorOff), Imm16((uint16_t)fixedKeepMask));
1637
OR(16, MatR(colorOff), R(colorReg));
1638
}
1639
break;
1640
1641
case GE_FORMAT_8888:
1642
if (maskReg != INVALID_REG) {
1643
// Zero all other bits, then flip maskReg to clear the bits we're keeping in colorReg.
1644
AND(32, MatR(colorOff), R(maskReg));
1645
if (cpu_info.bBMI1) {
1646
ANDN(32, colorReg, maskReg, R(colorReg));
1647
} else {
1648
NOT(32, R(maskReg));
1649
AND(32, R(colorReg), R(maskReg));
1650
}
1651
OR(32, MatR(colorOff), R(colorReg));
1652
} else if (fixedKeepMask == 0) {
1653
MOV(32, MatR(colorOff), R(colorReg));
1654
} else if (fixedKeepMask == 0xFF000000) {
1655
// We want to set 24 bits only, since we're not changing stencil.
1656
// For now, let's do two writes rather than reading in the old stencil.
1657
MOV(16, MatR(colorOff), R(colorReg));
1658
SHR(32, R(colorReg), Imm8(16));
1659
MOV(8, MDisp(colorOff, 2), R(colorReg));
1660
} else {
1661
AND(32, MatR(colorOff), Imm32(fixedKeepMask));
1662
OR(32, MatR(colorOff), R(colorReg));
1663
}
1664
break;
1665
}
1666
1667
for (FixupBranch &fixup : skipStandardWrites_)
1668
SetJumpTarget(fixup);
1669
skipStandardWrites_.clear();
1670
1671
regCache_.Unlock(colorOff, RegCache::GEN_COLOR_OFF);
1672
regCache_.ForceRelease(RegCache::GEN_COLOR_OFF);
1673
regCache_.Release(colorReg, RegCache::GEN_TEMP0);
1674
regCache_.Release(temp1Reg, RegCache::GEN_TEMP1);
1675
if (maskReg != INVALID_REG)
1676
regCache_.Release(maskReg, RegCache::GEN_TEMP3);
1677
if (stencilReg != INVALID_REG) {
1678
regCache_.Unlock(stencilReg, RegCache::GEN_STENCIL);
1679
regCache_.ForceRelease(RegCache::GEN_STENCIL);
1680
}
1681
1682
return success;
1683
}
1684
1685
bool PixelJitCache::Jit_ApplyLogicOp(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg maskReg) {
1686
Describe("LogicOp");
1687
X64Reg logicOpReg = regCache_.Alloc(RegCache::GEN_TEMP4);
1688
if (regCache_.Has(RegCache::GEN_ARG_ID) || regCache_.Has(RegCache::GEN_ID)) {
1689
X64Reg idReg = GetPixelID();
1690
MOVZX(32, 8, logicOpReg, MDisp(idReg, offsetof(PixelFuncID, cached.logicOp)));
1691
UnlockPixelID(idReg);
1692
} else {
1693
_assert_(stackIDOffset_ != -1);
1694
MOV(PTRBITS, R(logicOpReg), MDisp(RSP, stackIDOffset_));
1695
MOVZX(32, 8, logicOpReg, MDisp(logicOpReg, offsetof(PixelFuncID, cached.logicOp)));
1696
}
1697
1698
X64Reg stencilReg = INVALID_REG;
1699
if (regCache_.Has(RegCache::GEN_STENCIL))
1700
stencilReg = regCache_.Find(RegCache::GEN_STENCIL);
1701
1702
// Should already be allocated.
1703
X64Reg colorOff = regCache_.Find(RegCache::GEN_COLOR_OFF);
1704
X64Reg temp1Reg = regCache_.Alloc(RegCache::GEN_TEMP5);
1705
1706
// We'll use these in several cases, so prepare.
1707
int bits = id.fbFormat == GE_FORMAT_8888 ? 32 : 16;
1708
OpArg stencilMask, notStencilMask;
1709
switch (id.fbFormat) {
1710
case GE_FORMAT_565:
1711
stencilMask = Imm16(0);
1712
notStencilMask = Imm16(0xFFFF);
1713
break;
1714
case GE_FORMAT_5551:
1715
stencilMask = Imm16(0x8000);
1716
notStencilMask = Imm16(0x7FFF);
1717
break;
1718
case GE_FORMAT_4444:
1719
stencilMask = Imm16(0xF000);
1720
notStencilMask = Imm16(0x0FFF);
1721
break;
1722
case GE_FORMAT_8888:
1723
stencilMask = Imm32(0xFF000000);
1724
notStencilMask = Imm32(0x00FFFFFF);
1725
break;
1726
}
1727
1728
std::vector<FixupBranch> finishes;
1729
finishes.reserve(11);
1730
FixupBranch skipTable = J(true);
1731
const u8 *tableValues[16]{};
1732
1733
tableValues[GE_LOGIC_CLEAR] = GetCodePointer();
1734
if (stencilReg != INVALID_REG) {
1735
// If clearing and setting the stencil, that's easy - stencilReg has it.
1736
MOV(32, R(colorReg), R(stencilReg));
1737
finishes.push_back(J(true));
1738
} else if (maskReg != INVALID_REG) {
1739
// Just and out the unmasked bits (stencil already included in maskReg.)
1740
AND(bits, MatR(colorOff), R(maskReg));
1741
skipStandardWrites_.push_back(J(true));
1742
} else {
1743
// Otherwise, no mask, just AND the stencil bits to zero the rest.
1744
AND(bits, MatR(colorOff), stencilMask);
1745
skipStandardWrites_.push_back(J(true));
1746
}
1747
1748
tableValues[GE_LOGIC_AND] = GetCodePointer();
1749
if (stencilReg != INVALID_REG && maskReg != INVALID_REG) {
1750
// Since we're ANDing, set the mask bits (AND will keep them as-is.)
1751
OR(32, R(colorReg), R(maskReg));
1752
OR(32, R(colorReg), R(stencilReg));
1753
1754
// To apply stencil, we'll OR the stencil unmasked bits in memory, so our AND keeps them.
1755
NOT(32, R(maskReg));
1756
AND(bits, R(maskReg), stencilMask);
1757
OR(bits, MatR(colorOff), R(maskReg));
1758
} else if (stencilReg != INVALID_REG) {
1759
OR(32, R(colorReg), R(stencilReg));
1760
// No mask, so just or in the stencil bits so our AND can set any we want.
1761
OR(bits, MatR(colorOff), stencilMask);
1762
} else if (maskReg != INVALID_REG) {
1763
// Force in the mask (which includes all stencil bits) so both are kept as-is.
1764
OR(32, R(colorReg), R(maskReg));
1765
} else {
1766
// Force on the stencil bits so they AND and keep the existing value.
1767
if (stencilMask.GetImmValue() != 0)
1768
OR(bits, R(colorReg), stencilMask);
1769
}
1770
// Now the AND, which applies stencil and the logic op.
1771
AND(bits, MatR(colorOff), R(colorReg));
1772
skipStandardWrites_.push_back(J(true));
1773
1774
tableValues[GE_LOGIC_AND_REVERSE] = GetCodePointer();
1775
// Reverse memory in a temp reg so we can apply the write mask easily.
1776
MOV(bits, R(temp1Reg), MatR(colorOff));
1777
if (cpu_info.bBMI1) {
1778
ANDN(32, colorReg, temp1Reg, R(colorReg));
1779
} else {
1780
NOT(32, R(temp1Reg));
1781
AND(32, R(colorReg), R(temp1Reg));
1782
}
1783
// Now add in the stencil bits (must be zero before, since we used AND.)
1784
if (stencilReg != INVALID_REG) {
1785
OR(32, R(colorReg), R(stencilReg));
1786
}
1787
finishes.push_back(J(true));
1788
1789
tableValues[GE_LOGIC_COPY] = GetCodePointer();
1790
// This is just a standard write, nothing complex.
1791
if (stencilReg != INVALID_REG) {
1792
OR(32, R(colorReg), R(stencilReg));
1793
}
1794
finishes.push_back(J(true));
1795
1796
tableValues[GE_LOGIC_AND_INVERTED] = GetCodePointer();
1797
if (stencilReg != INVALID_REG) {
1798
// Set the stencil bits, so they're zero when we invert.
1799
OR(bits, R(colorReg), stencilMask);
1800
NOT(32, R(colorReg));
1801
OR(32, R(colorReg), R(stencilReg));
1802
1803
if (maskReg != INVALID_REG) {
1804
// This way our AND will keep all those bits.
1805
OR(32, R(colorReg), R(maskReg));
1806
1807
// To apply stencil, we'll OR the stencil unmasked bits in memory, so our AND keeps them.
1808
NOT(32, R(maskReg));
1809
AND(bits, R(maskReg), stencilMask);
1810
OR(bits, MatR(colorOff), R(maskReg));
1811
} else {
1812
// Force memory to take our stencil bits by ORing for the AND.
1813
OR(bits, MatR(colorOff), stencilMask);
1814
}
1815
} else if (maskReg != INVALID_REG) {
1816
NOT(32, R(colorReg));
1817
// This way our AND will keep all those bits.
1818
OR(32, R(colorReg), R(maskReg));
1819
} else {
1820
// Invert our color, but then add in stencil bits so the AND keeps them.
1821
NOT(32, R(colorReg));
1822
// We only do this for 8888 since the rest will have had 0 stencil bits (which turned to 1s.)
1823
if (id.FBFormat() == GE_FORMAT_8888)
1824
OR(bits, R(colorReg), stencilMask);
1825
}
1826
AND(bits, MatR(colorOff), R(colorReg));
1827
skipStandardWrites_.push_back(J(true));
1828
1829
tableValues[GE_LOGIC_NOOP] = GetCodePointer();
1830
if (stencilReg != INVALID_REG && maskReg != INVALID_REG) {
1831
// Start by clearing masked bits from stencilReg.
1832
if (cpu_info.bBMI1) {
1833
ANDN(32, stencilReg, maskReg, R(stencilReg));
1834
} else {
1835
NOT(32, R(maskReg));
1836
AND(32, R(stencilReg), R(maskReg));
1837
NOT(32, R(maskReg));
1838
}
1839
1840
// Now mask out the stencil bits we're writing from memory.
1841
OR(bits, R(maskReg), notStencilMask);
1842
AND(bits, MatR(colorOff), R(maskReg));
1843
1844
// Now set those remaining stencil bits.
1845
OR(bits, MatR(colorOff), R(stencilReg));
1846
skipStandardWrites_.push_back(J(true));
1847
} else if (stencilReg != INVALID_REG) {
1848
// Clear and set just the stencil bits.
1849
AND(bits, MatR(colorOff), notStencilMask);
1850
OR(bits, MatR(colorOff), R(stencilReg));
1851
skipStandardWrites_.push_back(J(true));
1852
} else {
1853
Discard();
1854
}
1855
1856
tableValues[GE_LOGIC_XOR] = GetCodePointer();
1857
XOR(bits, R(colorReg), MatR(colorOff));
1858
if (stencilReg != INVALID_REG) {
1859
// Purge out the stencil bits from the XOR and copy ours in.
1860
AND(bits, R(colorReg), notStencilMask);
1861
OR(32, R(colorReg), R(stencilReg));
1862
} else if (maskReg == INVALID_REG && stencilMask.GetImmValue() != 0) {
1863
// XOR might've set some bits, and without a maskReg we won't clear them.
1864
AND(bits, R(colorReg), notStencilMask);
1865
}
1866
finishes.push_back(J(true));
1867
1868
tableValues[GE_LOGIC_OR] = GetCodePointer();
1869
if (stencilReg != INVALID_REG && maskReg != INVALID_REG) {
1870
OR(32, R(colorReg), R(stencilReg));
1871
1872
// Clear the bits we should be masking out.
1873
if (cpu_info.bBMI1) {
1874
ANDN(32, colorReg, maskReg, R(colorReg));
1875
} else {
1876
NOT(32, R(maskReg));
1877
AND(32, R(colorReg), R(maskReg));
1878
NOT(32, R(maskReg));
1879
}
1880
1881
// Clear all the unmasked stencil bits, so we can set our own.
1882
OR(bits, R(maskReg), notStencilMask);
1883
AND(bits, MatR(colorOff), R(maskReg));
1884
} else if (stencilReg != INVALID_REG) {
1885
OR(32, R(colorReg), R(stencilReg));
1886
// AND out the stencil bits so we set our own.
1887
AND(bits, MatR(colorOff), notStencilMask);
1888
} else if (maskReg != INVALID_REG) {
1889
// Clear the bits we should be masking out.
1890
if (cpu_info.bBMI1) {
1891
ANDN(32, colorReg, maskReg, R(colorReg));
1892
} else {
1893
NOT(32, R(maskReg));
1894
AND(32, R(colorReg), R(maskReg));
1895
}
1896
} else if (id.FBFormat() == GE_FORMAT_8888) {
1897
// We only need to do this for 8888, the others already have 0 stencil.
1898
AND(bits, R(colorReg), notStencilMask);
1899
}
1900
// Now the OR, which applies stencil and the logic op itself.
1901
OR(bits, MatR(colorOff), R(colorReg));
1902
skipStandardWrites_.push_back(J(true));
1903
1904
tableValues[GE_LOGIC_NOR] = GetCodePointer();
1905
OR(bits, R(colorReg), MatR(colorOff));
1906
NOT(32, R(colorReg));
1907
if (stencilReg != INVALID_REG) {
1908
AND(bits, R(colorReg), notStencilMask);
1909
OR(32, R(colorReg), R(stencilReg));
1910
} else if (maskReg == INVALID_REG && stencilMask.GetImmValue() != 0) {
1911
// We need to clear the stencil bits since the standard write logic assumes they're zero.
1912
AND(bits, R(colorReg), notStencilMask);
1913
}
1914
finishes.push_back(J(true));
1915
1916
tableValues[GE_LOGIC_EQUIV] = GetCodePointer();
1917
XOR(bits, R(colorReg), MatR(colorOff));
1918
NOT(32, R(colorReg));
1919
if (stencilReg != INVALID_REG) {
1920
AND(bits, R(colorReg), notStencilMask);
1921
OR(32, R(colorReg), R(stencilReg));
1922
} else if (maskReg == INVALID_REG && stencilMask.GetImmValue() != 0) {
1923
// We need to clear the stencil bits since the standard write logic assumes they're zero.
1924
AND(bits, R(colorReg), notStencilMask);
1925
}
1926
finishes.push_back(J(true));
1927
1928
tableValues[GE_LOGIC_INVERTED] = GetCodePointer();
1929
// We just toss our color entirely.
1930
MOV(bits, R(colorReg), MatR(colorOff));
1931
NOT(32, R(colorReg));
1932
if (stencilReg != INVALID_REG) {
1933
AND(bits, R(colorReg), notStencilMask);
1934
OR(32, R(colorReg), R(stencilReg));
1935
} else if (maskReg == INVALID_REG && stencilMask.GetImmValue() != 0) {
1936
// We need to clear the stencil bits since the standard write logic assumes they're zero.
1937
AND(bits, R(colorReg), notStencilMask);
1938
}
1939
finishes.push_back(J(true));
1940
1941
tableValues[GE_LOGIC_OR_REVERSE] = GetCodePointer();
1942
// Reverse in a temp reg so we can mask properly.
1943
MOV(bits, R(temp1Reg), MatR(colorOff));
1944
NOT(32, R(temp1Reg));
1945
OR(32, R(colorReg), R(temp1Reg));
1946
if (stencilReg != INVALID_REG) {
1947
AND(bits, R(colorReg), notStencilMask);
1948
OR(32, R(colorReg), R(stencilReg));
1949
} else if (maskReg == INVALID_REG && stencilMask.GetImmValue() != 0) {
1950
// We need to clear the stencil bits since the standard write logic assumes they're zero.
1951
AND(bits, R(colorReg), notStencilMask);
1952
}
1953
finishes.push_back(J(true));
1954
1955
tableValues[GE_LOGIC_COPY_INVERTED] = GetCodePointer();
1956
NOT(32, R(colorReg));
1957
if (stencilReg != INVALID_REG) {
1958
AND(bits, R(colorReg), notStencilMask);
1959
OR(32, R(colorReg), R(stencilReg));
1960
} else if (maskReg == INVALID_REG && stencilMask.GetImmValue() != 0) {
1961
// We need to clear the stencil bits since the standard write logic assumes they're zero.
1962
AND(bits, R(colorReg), notStencilMask);
1963
}
1964
finishes.push_back(J(true));
1965
1966
tableValues[GE_LOGIC_OR_INVERTED] = GetCodePointer();
1967
NOT(32, R(colorReg));
1968
if (stencilReg != INVALID_REG && maskReg != INVALID_REG) {
1969
AND(bits, R(colorReg), notStencilMask);
1970
OR(32, R(colorReg), R(stencilReg));
1971
1972
// Clear the bits we should be masking out.
1973
if (cpu_info.bBMI1) {
1974
ANDN(32, colorReg, maskReg, R(colorReg));
1975
} else {
1976
NOT(32, R(maskReg));
1977
AND(32, R(colorReg), R(maskReg));
1978
NOT(32, R(maskReg));
1979
}
1980
1981
// Clear all the unmasked stencil bits, so we can set our own.
1982
OR(bits, R(maskReg), notStencilMask);
1983
AND(bits, MatR(colorOff), R(maskReg));
1984
} else if (stencilReg != INVALID_REG) {
1985
AND(bits, R(colorReg), notStencilMask);
1986
OR(32, R(colorReg), R(stencilReg));
1987
// AND out the stencil bits so we set our own.
1988
AND(bits, MatR(colorOff), notStencilMask);
1989
} else if (maskReg != INVALID_REG) {
1990
// Clear the bits we should be masking out.
1991
NOT(32, R(maskReg));
1992
AND(32, R(colorReg), R(maskReg));
1993
} else if (id.FBFormat() == GE_FORMAT_8888) {
1994
// We only need to do this for 8888, the others already have 0 stencil.
1995
AND(bits, R(colorReg), notStencilMask);
1996
}
1997
OR(bits, MatR(colorOff), R(colorReg));
1998
skipStandardWrites_.push_back(J(true));
1999
2000
tableValues[GE_LOGIC_NAND] = GetCodePointer();
2001
AND(bits, R(temp1Reg), MatR(colorOff));
2002
NOT(32, R(colorReg));
2003
if (stencilReg != INVALID_REG) {
2004
AND(bits, R(colorReg), notStencilMask);
2005
OR(32, R(colorReg), R(stencilReg));
2006
} else if (maskReg == INVALID_REG && stencilMask.GetImmValue() != 0) {
2007
// We need to clear the stencil bits since the standard write logic assumes they're zero.
2008
AND(bits, R(colorReg), notStencilMask);
2009
}
2010
finishes.push_back(J(true));
2011
2012
tableValues[GE_LOGIC_SET] = GetCodePointer();
2013
if (stencilReg != INVALID_REG && maskReg != INVALID_REG) {
2014
OR(32, R(colorReg), R(stencilReg));
2015
OR(bits, R(colorReg), notStencilMask);
2016
finishes.push_back(J(true));
2017
} else if (stencilReg != INVALID_REG) {
2018
// Set bits directly in stencilReg, and then put in memory.
2019
OR(bits, R(stencilReg), notStencilMask);
2020
MOV(bits, MatR(colorOff), R(stencilReg));
2021
skipStandardWrites_.push_back(J(true));
2022
} else if (maskReg != INVALID_REG) {
2023
// OR in the bits we're allowed to write (won't be any stencil.)
2024
NOT(32, R(maskReg));
2025
OR(bits, MatR(colorOff), R(maskReg));
2026
skipStandardWrites_.push_back(J(true));
2027
} else {
2028
OR(bits, MatR(colorOff), notStencilMask);
2029
skipStandardWrites_.push_back(J(true));
2030
}
2031
2032
const u8 *tablePtr = GetCodePointer();
2033
for (int i = 0; i < 16; ++i) {
2034
Write64((uintptr_t)tableValues[i]);
2035
}
2036
2037
SetJumpTarget(skipTable);
2038
LEA(64, temp1Reg, M(tablePtr));
2039
JMPptr(MComplex(temp1Reg, logicOpReg, 8, 0));
2040
2041
for (FixupBranch &fixup : finishes)
2042
SetJumpTarget(fixup);
2043
2044
regCache_.Unlock(colorOff, RegCache::GEN_COLOR_OFF);
2045
regCache_.Release(logicOpReg, RegCache::GEN_TEMP4);
2046
regCache_.Release(temp1Reg, RegCache::GEN_TEMP5);
2047
if (stencilReg != INVALID_REG)
2048
regCache_.Unlock(stencilReg, RegCache::GEN_STENCIL);
2049
2050
return true;
2051
}
2052
2053
bool PixelJitCache::Jit_ConvertTo565(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg) {
2054
Describe("ConvertTo565");
2055
2056
if (cpu_info.bBMI2_fast) {
2057
MOV(32, R(temp1Reg), Imm32(0x00F8FCF8));
2058
PEXT(32, colorReg, colorReg, R(temp1Reg));
2059
return true;
2060
}
2061
2062
// Assemble the 565 color, starting with R...
2063
MOV(32, R(temp1Reg), R(colorReg));
2064
SHR(32, R(temp1Reg), Imm8(3));
2065
AND(16, R(temp1Reg), Imm16(0x1F << 0));
2066
2067
// For G, move right 5 (because the top 6 are offset by 10.)
2068
MOV(32, R(temp2Reg), R(colorReg));
2069
SHR(32, R(temp2Reg), Imm8(5));
2070
AND(16, R(temp2Reg), Imm16(0x3F << 5));
2071
OR(32, R(temp1Reg), R(temp2Reg));
2072
2073
// And finally B, move right 8 (top 5 are offset by 19.)
2074
SHR(32, R(colorReg), Imm8(8));
2075
AND(16, R(colorReg), Imm16(0x1F << 11));
2076
OR(32, R(colorReg), R(temp1Reg));
2077
2078
return true;
2079
}
2080
2081
bool PixelJitCache::Jit_ConvertTo5551(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg, bool keepAlpha) {
2082
Describe("ConvertTo5551");
2083
2084
if (cpu_info.bBMI2_fast) {
2085
MOV(32, R(temp1Reg), Imm32(keepAlpha ? 0x80F8F8F8 : 0x00F8F8F8));
2086
PEXT(32, colorReg, colorReg, R(temp1Reg));
2087
return true;
2088
}
2089
2090
// This is R, pretty simple.
2091
MOV(32, R(temp1Reg), R(colorReg));
2092
SHR(32, R(temp1Reg), Imm8(3));
2093
AND(16, R(temp1Reg), Imm16(0x1F << 0));
2094
2095
// G moves right 6, to match the top 5 at 11.
2096
MOV(32, R(temp2Reg), R(colorReg));
2097
SHR(32, R(temp2Reg), Imm8(6));
2098
AND(16, R(temp2Reg), Imm16(0x1F << 5));
2099
OR(32, R(temp1Reg), R(temp2Reg));
2100
2101
if (keepAlpha) {
2102
// Grab A into tempReg2 before handling B.
2103
MOV(32, R(temp2Reg), R(colorReg));
2104
SHR(32, R(temp2Reg), Imm8(31));
2105
SHL(32, R(temp2Reg), Imm8(15));
2106
}
2107
2108
// B moves right 9, to match the top 5 at 19.
2109
SHR(32, R(colorReg), Imm8(9));
2110
AND(16, R(colorReg), Imm16(0x1F << 10));
2111
OR(32, R(colorReg), R(temp1Reg));
2112
2113
if (keepAlpha)
2114
OR(32, R(colorReg), R(temp2Reg));
2115
2116
return true;
2117
}
2118
2119
bool PixelJitCache::Jit_ConvertTo4444(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg, bool keepAlpha) {
2120
Describe("ConvertTo4444");
2121
2122
if (cpu_info.bBMI2_fast) {
2123
MOV(32, R(temp1Reg), Imm32(keepAlpha ? 0xF0F0F0F0 : 0x00F0F0F0));
2124
PEXT(32, colorReg, colorReg, R(temp1Reg));
2125
return true;
2126
}
2127
2128
// Shift and mask out R.
2129
MOV(32, R(temp1Reg), R(colorReg));
2130
SHR(32, R(temp1Reg), Imm8(4));
2131
AND(16, R(temp1Reg), Imm16(0xF << 0));
2132
2133
// Shift G into position and mask.
2134
MOV(32, R(temp2Reg), R(colorReg));
2135
SHR(32, R(temp2Reg), Imm8(8));
2136
AND(16, R(temp2Reg), Imm16(0xF << 4));
2137
OR(32, R(temp1Reg), R(temp2Reg));
2138
2139
if (keepAlpha) {
2140
// Grab A into tempReg2 before handling B.
2141
MOV(32, R(temp2Reg), R(colorReg));
2142
SHR(32, R(temp2Reg), Imm8(28));
2143
SHL(32, R(temp2Reg), Imm8(12));
2144
}
2145
2146
// B moves right 12, to match the top 4 at 20.
2147
SHR(32, R(colorReg), Imm8(12));
2148
AND(16, R(colorReg), Imm16(0xF << 8));
2149
OR(32, R(colorReg), R(temp1Reg));
2150
2151
if (keepAlpha)
2152
OR(32, R(colorReg), R(temp2Reg));
2153
2154
return true;
2155
}
2156
2157
bool PixelJitCache::Jit_ConvertFrom565(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg) {
2158
Describe("ConvertFrom565");
2159
2160
if (cpu_info.bBMI2_fast) {
2161
// Start off with the high bits.
2162
MOV(32, R(temp1Reg), Imm32(0x00F8FCF8));
2163
PDEP(32, temp1Reg, colorReg, R(temp1Reg));
2164
2165
// Now grab the low bits (they end up packed.)
2166
MOV(32, R(temp2Reg), Imm32(0x0000E61C));
2167
PEXT(32, colorReg, colorReg, R(temp2Reg));
2168
// And spread them back out.
2169
MOV(32, R(temp2Reg), Imm32(0x00070307));
2170
PDEP(32, colorReg, colorReg, R(temp2Reg));
2171
2172
// Finally put the high bits in, we're done.
2173
OR(32, R(colorReg), R(temp1Reg));
2174
return true;
2175
}
2176
2177
// Filter out red only into temp1.
2178
MOV(32, R(temp1Reg), R(colorReg));
2179
AND(16, R(temp1Reg), Imm16(0x1F << 0));
2180
// Move it left to the top of the 8 bits.
2181
SHL(32, R(temp1Reg), Imm8(3));
2182
2183
// Now we bring in blue, since it's also 5 like red.
2184
MOV(32, R(temp2Reg), R(colorReg));
2185
AND(16, R(temp2Reg), Imm16(0x1F << 11));
2186
// Shift blue into place, 8 left (at 19), and merge back to temp1.
2187
SHL(32, R(temp2Reg), Imm8(8));
2188
OR(32, R(temp1Reg), R(temp2Reg));
2189
2190
// Make a copy back in temp2, and shift left 1 so we can swizzle together with G.
2191
OR(32, R(temp2Reg), R(temp1Reg));
2192
SHL(32, R(temp2Reg), Imm8(1));
2193
2194
// We go to green last because it's the different one. Put it in place.
2195
AND(16, R(colorReg), Imm16(0x3F << 5));
2196
SHL(32, R(colorReg), Imm8(5));
2197
// Combine with temp2 (for swizzling), then merge in temp1 (R+B pre-swizzle.)
2198
OR(32, R(temp2Reg), R(colorReg));
2199
OR(32, R(colorReg), R(temp1Reg));
2200
2201
// Now shift and mask temp2 for swizzle.
2202
SHR(32, R(temp2Reg), Imm8(6));
2203
AND(32, R(temp2Reg), Imm32(0x00070307));
2204
// And then OR that in too. We're done.
2205
OR(32, R(colorReg), R(temp2Reg));
2206
2207
return true;
2208
}
2209
2210
bool PixelJitCache::Jit_ConvertFrom5551(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg, bool keepAlpha) {
2211
Describe("ConvertFrom5551");
2212
2213
if (cpu_info.bBMI2_fast) {
2214
// First, grab the top bits.
2215
MOV(32, R(temp1Reg), Imm32(keepAlpha ? 0x01F8F8F8 : 0x00F8F8F8));
2216
PDEP(32, colorReg, colorReg, R(temp1Reg));
2217
2218
// Now make the swizzle bits.
2219
MOV(32, R(temp2Reg), R(colorReg));
2220
SHR(32, R(temp2Reg), Imm8(5));
2221
AND(32, R(temp2Reg), Imm32(0x00070707));
2222
2223
if (keepAlpha) {
2224
// Sign extend the alpha bit to 8 bits.
2225
SHL(32, R(colorReg), Imm8(7));
2226
SAR(32, R(colorReg), Imm8(7));
2227
}
2228
2229
OR(32, R(colorReg), R(temp2Reg));
2230
return true;
2231
}
2232
2233
// Filter out red only into temp1.
2234
MOV(32, R(temp1Reg), R(colorReg));
2235
AND(16, R(temp1Reg), Imm16(0x1F << 0));
2236
// Move it left to the top of the 8 bits.
2237
SHL(32, R(temp1Reg), Imm8(3));
2238
2239
// Add in green and shift into place (top bits.)
2240
MOV(32, R(temp2Reg), R(colorReg));
2241
AND(16, R(temp2Reg), Imm16(0x1F << 5));
2242
SHL(32, R(temp2Reg), Imm8(6));
2243
OR(32, R(temp1Reg), R(temp2Reg));
2244
2245
if (keepAlpha) {
2246
// Now take blue and alpha together.
2247
AND(16, R(colorReg), Imm16(0x8000 | (0x1F << 10)));
2248
// We move all the way left, then sign extend right to expand alpha.
2249
SHL(32, R(colorReg), Imm8(16));
2250
SAR(32, R(colorReg), Imm8(7));
2251
} else {
2252
AND(16, R(colorReg), Imm16(0x1F << 10));
2253
SHL(32, R(colorReg), Imm8(9));
2254
}
2255
2256
// Combine both together, we still need to swizzle.
2257
OR(32, R(colorReg), R(temp1Reg));
2258
OR(32, R(temp1Reg), R(colorReg));
2259
// Now for swizzle, we'll mask carefully to avoid overflow.
2260
SHR(32, R(temp1Reg), Imm8(5));
2261
AND(32, R(temp1Reg), Imm32(0x00070707));
2262
2263
// Then finally merge in the swizzle bits.
2264
OR(32, R(colorReg), R(temp1Reg));
2265
return true;
2266
}
2267
2268
bool PixelJitCache::Jit_ConvertFrom4444(const PixelFuncID &id, RegCache::Reg colorReg, RegCache::Reg temp1Reg, RegCache::Reg temp2Reg, bool keepAlpha) {
2269
Describe("ConvertFrom4444");
2270
2271
if (cpu_info.bBMI2_fast) {
2272
// First, spread the bits out with spaces.
2273
MOV(32, R(temp1Reg), Imm32(keepAlpha ? 0xF0F0F0F0 : 0x00F0F0F0));
2274
PDEP(32, colorReg, colorReg, R(temp1Reg));
2275
2276
// Now swizzle the low bits in.
2277
MOV(32, R(temp1Reg), R(colorReg));
2278
SHR(32, R(temp1Reg), Imm8(4));
2279
OR(32, R(colorReg), R(temp1Reg));
2280
return true;
2281
}
2282
2283
// Move red into position within temp1.
2284
MOV(32, R(temp1Reg), R(colorReg));
2285
AND(16, R(temp1Reg), Imm16(0xF << 0));
2286
SHL(32, R(temp1Reg), Imm8(4));
2287
2288
// Green is just as simple.
2289
MOV(32, R(temp2Reg), R(colorReg));
2290
AND(16, R(temp2Reg), Imm16(0xF << 4));
2291
SHL(32, R(temp2Reg), Imm8(8));
2292
OR(32, R(temp1Reg), R(temp2Reg));
2293
2294
// Blue isn't last this time, but it's next.
2295
MOV(32, R(temp2Reg), R(colorReg));
2296
AND(16, R(temp2Reg), Imm16(0xF << 8));
2297
SHL(32, R(temp2Reg), Imm8(12));
2298
OR(32, R(temp1Reg), R(temp2Reg));
2299
2300
if (keepAlpha) {
2301
// Last but not least, alpha.
2302
AND(16, R(colorReg), Imm16(0xF << 12));
2303
SHL(32, R(colorReg), Imm8(16));
2304
OR(32, R(colorReg), R(temp1Reg));
2305
2306
// Copy to temp1 again for swizzling.
2307
OR(32, R(temp1Reg), R(colorReg));
2308
} else {
2309
// Overwrite colorReg (we need temp1 as a copy anyway.)
2310
MOV(32, R(colorReg), R(temp1Reg));
2311
}
2312
2313
// Masking isn't necessary here since everything is 4 wide.
2314
SHR(32, R(temp1Reg), Imm8(4));
2315
OR(32, R(colorReg), R(temp1Reg));
2316
return true;
2317
}
2318
2319
};
2320
2321
#endif
2322
2323