CoCalc -- GPUStateUtils.cpp

GitHub Repository: hrydgard/ppsspp
Path: blob/master/GPU/Common/GPUStateUtils.cpp
³¹⁸⁷ views
1
// Copyright (c) 2015- PPSSPP Project.
2

3
// This program is free software: you can redistribute it and/or modify
4
// it under the terms of the GNU General Public License as published by
5
// the Free Software Foundation, version 2.0 or later versions.
6

7
// This program is distributed in the hope that it will be useful,
8
// but WITHOUT ANY WARRANTY; without even the implied warranty of
9
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
// GNU General Public License 2.0 for more details.
11

12
// A copy of the GPL 2.0 should have been included with the program.
13
// If not, see http://www.gnu.org/licenses/
14

15
// Official git repository and contact information can be found at
16
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17

18
#include "ppsspp_config.h"
19
#include <algorithm>
20
#include <limits>
21

22
#include "Core/ConfigValues.h"
23
#include "Core/System.h"
24
#include "Core/Config.h"
25
#include "Core/Reporting.h"
26

27
#include "GPU/ge_constants.h"
28
#include "GPU/GPUState.h"
29
#include "GPU/Math3D.h"
30
#include "GPU/Common/PresentationCommon.h"
31

32
#include "GPU/Common/GPUStateUtils.h"
33

34
bool IsStencilTestOutputDisabled() {
35
	// The mask applies on all stencil ops.
36
	if (gstate.isStencilTestEnabled() && (gstate.pmska & 0xFF) != 0xFF) {
37
		if (gstate_c.framebufFormat == GE_FORMAT_565) {
38
			return true;
39
		}
40
		return gstate.getStencilOpZPass() == GE_STENCILOP_KEEP && gstate.getStencilOpZFail() == GE_STENCILOP_KEEP && gstate.getStencilOpSFail() == GE_STENCILOP_KEEP;
41
	}
42
	return true;
43
}
44

45
bool NeedsTestDiscard() {
46
	// We assume this is called only when enabled and not trivially true (may also be for color testing.)
47
	if (gstate.isStencilTestEnabled() && (gstate.pmska & 0xFF) != 0xFF)
48
		return true;
49
	if (gstate.isDepthTestEnabled() && gstate.isDepthWriteEnabled())
50
		return true;
51
	if (!gstate.isAlphaBlendEnabled())
52
		return true;
53
	if (gstate.getBlendFuncA() != GE_SRCBLEND_SRCALPHA && gstate.getBlendFuncA() != GE_SRCBLEND_DOUBLESRCALPHA)
54
		return true;
55
	// GE_DSTBLEND_DOUBLEINVSRCALPHA is actually inverse double src alpha, and doubling zero is still zero.
56
	if (gstate.getBlendFuncB() != GE_DSTBLEND_INVSRCALPHA && gstate.getBlendFuncB() != GE_DSTBLEND_DOUBLEINVSRCALPHA) {
57
		if (gstate.getBlendFuncB() != GE_DSTBLEND_FIXB || gstate.getFixB() != 0xFFFFFF)
58
			return true;
59
	}
60
	if (gstate.getBlendEq() != GE_BLENDMODE_MUL_AND_ADD && gstate.getBlendEq() != GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE)
61
		return true;
62
	if (gstate.isLogicOpEnabled() && gstate.getLogicOp() != GE_LOGIC_COPY)
63
		return true;
64

65
	return false;
66
}
67

68
bool IsAlphaTestTriviallyTrue() {
69
	switch (gstate.getAlphaTestFunction()) {
70
	case GE_COMP_NEVER:
71
		return false;
72

73
	case GE_COMP_ALWAYS:
74
		return true;
75

76
	case GE_COMP_GEQUAL:
77
		if (gstate_c.vertexFullAlpha && (gstate_c.textureFullAlpha || !gstate.isTextureAlphaUsed()))
78
			return true;  // If alpha is full, it doesn't matter what the ref value is.
79
		return gstate.getAlphaTestRef() == 0;
80

81
		// Non-zero check. If we have no depth testing (and thus no depth writing), and an alpha func that will result in no change if zero alpha, get rid of the alpha test.
82
		// Speeds up Lumines by a LOT on PowerVR.
83
	case GE_COMP_NOTEQUAL:
84
		if (gstate.getAlphaTestRef() == 255) {
85
			// Likely to be rare. Let's just skip the vertexFullAlpha optimization here instead of adding
86
			// complicated code to discard the draw or whatnot.
87
			return false;
88
		}
89
		// Fallthrough on purpose
90
		[[fallthrough]];
91
	case GE_COMP_GREATER:
92
	{
93
		// If the texture and vertex only use 1.0 alpha, then the ref value doesn't matter.
94
		if (gstate_c.vertexFullAlpha && (gstate_c.textureFullAlpha || !gstate.isTextureAlphaUsed()))
95
			return true;
96
		return gstate.getAlphaTestRef() == 0 && !NeedsTestDiscard();
97
	}
98

99
	case GE_COMP_LEQUAL:
100
		return gstate.getAlphaTestRef() == 255;
101

102
	case GE_COMP_EQUAL:
103
	case GE_COMP_LESS:
104
		return false;
105

106
	default:
107
		return false;
108
	}
109
}
110

111
bool IsAlphaTestAgainstZero() {
112
	return gstate.getAlphaTestRef() == 0 && gstate.getAlphaTestMask() == 0xFF;
113
}
114

115
bool IsColorTestAgainstZero() {
116
	return gstate.getColorTestRef() == 0 && gstate.getColorTestMask() == 0xFFFFFF;
117
}
118

119
bool IsColorTestTriviallyTrue() {
120
	switch (gstate.getColorTestFunction()) {
121
	case GE_COMP_NEVER:
122
		return false;
123

124
	case GE_COMP_ALWAYS:
125
		return true;
126

127
	case GE_COMP_EQUAL:
128
	case GE_COMP_NOTEQUAL:
129
		return false;
130
	default:
131
		return false;
132
	}
133
}
134

135
bool IsDepthTestEffectivelyDisabled() {
136
	if (!gstate.isDepthTestEnabled())
137
		return true;
138
	// We can ignore stencil, because ALWAYS and disabled choose the same stencil path.
139
	if (gstate.getDepthTestFunction() != GE_COMP_ALWAYS)
140
		return false;
141
	return !gstate.isDepthWriteEnabled();
142
}
143

144
// Indexed by the source blend factor value (0-15). An entry is false for the
// factors that are derived from source alpha, and true for everything else.
const bool nonAlphaSrcFactors[16] = {
	/*  0 */ true,   // GE_SRCBLEND_DSTCOLOR
	/*  1 */ true,   // GE_SRCBLEND_INVDSTCOLOR
	/*  2 */ false,  // GE_SRCBLEND_SRCALPHA
	/*  3 */ false,  // GE_SRCBLEND_INVSRCALPHA
	/*  4 */ true,   // GE_SRCBLEND_DSTALPHA
	/*  5 */ true,   // GE_SRCBLEND_INVDSTALPHA
	/*  6 */ false,  // GE_SRCBLEND_DOUBLESRCALPHA
	/*  7 */ false,  // GE_SRCBLEND_DOUBLEINVSRCALPHA
	/*  8 */ true,   // GE_SRCBLEND_DOUBLEDSTALPHA
	/*  9 */ true,   // GE_SRCBLEND_DOUBLEINVDSTALPHA
	/* 10 */ true,   // GE_SRCBLEND_FIXA
	// Indices 11-15 have no factor name attached in this table.
	/* 11 */ true,
	/* 12 */ true,
	/* 13 */ true,
	/* 14 */ true,
	/* 15 */ true,
};
162

163
// Indexed by the destination blend factor value (0-15). An entry is false for
// the factors that are derived from source alpha, and true for everything else.
const bool nonAlphaDestFactors[16] = {
	/*  0 */ true,   // GE_DSTBLEND_SRCCOLOR
	/*  1 */ true,   // GE_DSTBLEND_INVSRCCOLOR
	/*  2 */ false,  // GE_DSTBLEND_SRCALPHA
	/*  3 */ false,  // GE_DSTBLEND_INVSRCALPHA
	/*  4 */ true,   // GE_DSTBLEND_DSTALPHA
	/*  5 */ true,   // GE_DSTBLEND_INVDSTALPHA
	/*  6 */ false,  // GE_DSTBLEND_DOUBLESRCALPHA
	/*  7 */ false,  // GE_DSTBLEND_DOUBLEINVSRCALPHA
	/*  8 */ true,   // GE_DSTBLEND_DOUBLEDSTALPHA
	/*  9 */ true,   // GE_DSTBLEND_DOUBLEINVDSTALPHA
	/* 10 */ true,   // GE_DSTBLEND_FIXB
	// Indices 11-15 have no factor name attached in this table.
	/* 11 */ true,
	/* 12 */ true,
	/* 13 */ true,
	/* 14 */ true,
	/* 15 */ true,
};
181

182
// Chooses how (or whether) the output alpha should be replaced by the stencil value.
//
// replaceBlend: the blend replacement mode already selected for this draw.
// Returns REPLACE_ALPHA_NO when stencil output is disabled or in clear mode,
// REPLACE_ALPHA_YES when alpha can simply be replaced, and
// REPLACE_ALPHA_DUALSOURCE when the blend factors need the original alpha and
// dual source blending is available.
ReplaceAlphaType ReplaceAlphaWithStencil(ReplaceBlendType replaceBlend) {
	if (IsStencilTestOutputDisabled() || gstate.isModeClear()) {
		return REPLACE_ALPHA_NO;
	}

	if (replaceBlend == REPLACE_BLEND_NO || replaceBlend == REPLACE_BLEND_READ_FRAMEBUFFER) {
		// NOTE(review): replaceBlend is NO or READ_FRAMEBUFFER here, so this BLUE_TO_ALPHA
		// comparison cannot match (assuming the enum values are distinct). It is kept to
		// mirror the existing logic; confirm whether blue-to-alpha was meant to be handled earlier.
		return replaceBlend == ReplaceBlendType::REPLACE_BLEND_BLUE_TO_ALPHA ? REPLACE_ALPHA_NO : REPLACE_ALPHA_YES;
	}

	// Blending stays active in some form. If neither factor is flagged as source-alpha based
	// in the tables, alpha can be overwritten directly.
	const bool factorsIgnoreSrcAlpha =
		nonAlphaSrcFactors[gstate.getBlendFuncA()] && nonAlphaDestFactors[gstate.getBlendFuncB()];
	if (factorsIgnoreSrcAlpha) {
		return REPLACE_ALPHA_YES;
	}

	// Otherwise both values are needed at once, which requires dual source blending.
	return gstate_c.Use(GPU_USE_DUALSOURCE_BLEND) ? REPLACE_ALPHA_DUALSOURCE : REPLACE_ALPHA_NO;
}
205

206
// Maps the current zpass stencil op and framebuffer format to the kind of
// value that should be written in place of alpha.
// Falls back to STENCIL_VALUE_KEEP for any format or op not handled below.
StencilValueType ReplaceAlphaWithStencilType() {
	const auto format = gstate_c.framebufFormat;

	if (format == GE_FORMAT_565) {
		// There's never a stencil value.  Maybe the right alpha is 1?
		return STENCIL_VALUE_ONE;
	}

	if (format == GE_FORMAT_5551) {
		// Technically, this should only ever use zero/one.
		switch (gstate.getStencilOpZPass()) {
		case GE_STENCILOP_REPLACE:
			return (gstate.getStencilTestRef() & 0x80) != 0 ? STENCIL_VALUE_ONE : STENCIL_VALUE_ZERO;

		// Decrementing always zeros, since there's only one bit.
		case GE_STENCILOP_DECR:
		case GE_STENCILOP_ZERO:
			return STENCIL_VALUE_ZERO;

		// Incrementing always fills, since there's only one bit.
		case GE_STENCILOP_INCR:
			return STENCIL_VALUE_ONE;

		case GE_STENCILOP_INVERT:
			return STENCIL_VALUE_INVERT;

		case GE_STENCILOP_KEEP:
		default:
			return STENCIL_VALUE_KEEP;
		}
	}

	const bool handledFormat =
		format == GE_FORMAT_4444 ||
		format == GE_FORMAT_8888 ||
		format == GE_FORMAT_INVALID ||
		format == GE_FORMAT_DEPTH16 ||
		format == GE_FORMAT_CLUT8;
	if (!handledFormat) {
		return STENCIL_VALUE_KEEP;
	}

	// Only 4444 uses the 4-bit increment/decrement variants.
	const bool fourBit = format == GE_FORMAT_4444;
	switch (gstate.getStencilOpZPass()) {
	case GE_STENCILOP_REPLACE:
		// TODO: Could detect zero here and force ZERO - less uniform updates?
		return STENCIL_VALUE_UNIFORM;

	case GE_STENCILOP_ZERO:
		return STENCIL_VALUE_ZERO;

	case GE_STENCILOP_DECR:
		return fourBit ? STENCIL_VALUE_DECR_4 : STENCIL_VALUE_DECR_8;

	case GE_STENCILOP_INCR:
		return fourBit ? STENCIL_VALUE_INCR_4 : STENCIL_VALUE_INCR_8;

	case GE_STENCILOP_INVERT:
		return STENCIL_VALUE_INVERT;

	case GE_STENCILOP_KEEP:
	default:
		return STENCIL_VALUE_KEEP;
	}
}
265

266
// Picks the strategy used to reproduce the current GE blend state.
//
// bufferFormat: format of the render target; with GE_FORMAT_565 several
// dest-alpha based cases can use standard blending instead of a framebuffer read.
// Returns one of the REPLACE_BLEND_* modes (NO, STANDARD, PRE_SRC, 2X_ALPHA,
// PRE_SRC_2X_ALPHA, READ_FRAMEBUFFER or BLUE_TO_ALPHA).
ReplaceBlendType ReplaceBlendWithShader(GEBufferFormat bufferFormat) {
	if (gstate_c.blueToAlpha) {
		return REPLACE_BLEND_BLUE_TO_ALPHA;
	}

	if (!gstate.isAlphaBlendEnabled() || gstate.isModeClear()) {
		return REPLACE_BLEND_NO;
	}

	// Let's get the non-factor modes out of the way first.
	const GEBlendMode eq = gstate.getBlendEq();
	switch (eq) {
	case GE_BLENDMODE_ABSDIFF:
		return REPLACE_BLEND_READ_FRAMEBUFFER;

	case GE_BLENDMODE_MIN:
	case GE_BLENDMODE_MAX:
		return gstate_c.Use(GPU_USE_BLEND_MINMAX) ? REPLACE_BLEND_STANDARD : REPLACE_BLEND_READ_FRAMEBUFFER;

	case GE_BLENDMODE_MUL_AND_ADD:
	case GE_BLENDMODE_MUL_AND_SUBTRACT:
	case GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE:
		// Factor-based equations, handled below.
		break;

	default:
		// Other blend equations simply don't blend on hardware.
		return REPLACE_BLEND_NO;
	}

	const GEBlendSrcFactor funcA = gstate.getBlendFuncA();
	const GEBlendDstFactor funcB = gstate.getBlendFuncB();

	// With a 565 target, dest-alpha based factors don't need a framebuffer read.
	// Otherwise being accurate (clamping) means reading the dst color/alpha.
	const bool is565 = bufferFormat == GE_FORMAT_565;
	const ReplaceBlendType standardIf565 = is565 ? REPLACE_BLEND_STANDARD : REPLACE_BLEND_READ_FRAMEBUFFER;
	const ReplaceBlendType doubleAlphaIf565 = is565 ? REPLACE_BLEND_2X_ALPHA : REPLACE_BLEND_READ_FRAMEBUFFER;

	const bool dstIsSrcColor = funcB == GE_DSTBLEND_SRCCOLOR || funcB == GE_DSTBLEND_INVSRCCOLOR;
	const bool dstIsDoubleDstAlpha = funcB == GE_DSTBLEND_DOUBLEDSTALPHA || funcB == GE_DSTBLEND_DOUBLEINVDSTALPHA;
	const bool dstIsDoubleSrcAlpha = funcB == GE_DSTBLEND_DOUBLESRCALPHA || funcB == GE_DSTBLEND_DOUBLEINVSRCALPHA;

	switch (funcA) {
	case GE_SRCBLEND_DOUBLESRCALPHA:
	case GE_SRCBLEND_DOUBLEINVSRCALPHA:
		// 2x alpha in the source function and not in the dest = source color doubling.
		// Even dest alpha is safe, since we're moving the * 2.0 into the src color.
		if (dstIsSrcColor) {
			// When inversing, alpha clamping isn't an issue.
			// Otherwise we can't double: we need the source color to be correct, and
			// doubling only alpha would clamp the src alpha incorrectly.
			return funcA == GE_SRCBLEND_DOUBLEINVSRCALPHA ? REPLACE_BLEND_2X_ALPHA : REPLACE_BLEND_READ_FRAMEBUFFER;
		}
		if (dstIsDoubleDstAlpha) {
			return doubleAlphaIf565;
		}
		if (funcB == GE_DSTBLEND_DOUBLESRCALPHA) {
			// We can't technically do this correctly (due to clamping) without reading the dst color.
			// Using a copy isn't accurate either, though, when there's overlap.
			if (gstate_c.Use(GPU_USE_FRAMEBUFFER_FETCH))
				return REPLACE_BLEND_READ_FRAMEBUFFER;
			return REPLACE_BLEND_PRE_SRC_2X_ALPHA;
		}
		if (funcB == GE_DSTBLEND_DOUBLEINVSRCALPHA) {
			// For the inverse, doubling alpha is safe, because it will clamp correctly.
			return REPLACE_BLEND_PRE_SRC_2X_ALPHA;
		}
		// SRCALPHA, INVSRCALPHA, DSTALPHA, INVDSTALPHA, FIXB and anything else.
		// TODO: Could use vertexFullAlpha, but it's not calculated yet.
		// This outputs the original alpha for the dest factor.
		return REPLACE_BLEND_PRE_SRC;

	case GE_SRCBLEND_DOUBLEDSTALPHA:
	case GE_SRCBLEND_DOUBLEINVDSTALPHA:
		// Doubled dst alpha in the source factor. Inverse double dst alpha is tricky: doubling the
		// src color is probably the wrong direction, halving might be more correct.
		// Both variants resolve the same way: only a 565 target avoids reading the framebuffer.
		if (dstIsDoubleSrcAlpha) {
			// Double both src (for dst alpha) and alpha (for dst factor.)
			return doubleAlphaIf565;
		}
		// Every other dest factor: standard blending for 565, otherwise read the framebuffer.
		return standardIf565;

	case GE_SRCBLEND_DSTCOLOR:
	case GE_SRCBLEND_INVDSTCOLOR:
	case GE_SRCBLEND_SRCALPHA:
	case GE_SRCBLEND_INVSRCALPHA:
	case GE_SRCBLEND_DSTALPHA:
	case GE_SRCBLEND_INVDSTALPHA:
	{
		const bool srcUsesSrcAlpha = funcA == GE_SRCBLEND_SRCALPHA || funcA == GE_SRCBLEND_INVSRCALPHA;
		// When the src factor itself uses src alpha, the src color is premultiplied in the shader
		// so that doubling alpha doesn't mess with the other factor's components.
		const ReplaceBlendType doubled = srcUsesSrcAlpha ? REPLACE_BLEND_PRE_SRC_2X_ALPHA : REPLACE_BLEND_2X_ALPHA;

		if (funcB == GE_DSTBLEND_DOUBLESRCALPHA) {
			// Can't safely double alpha, will clamp.  However, a copy may easily be worse due to
			// overlap (it seems to cause problems in Silent Hill games), so it's only used with
			// framebuffer fetch. Otherwise we hope the doubling will not clamp too badly.
			// This is similar to the L.A. Rush case below and will not be accurate.
			// One example where dst alpha/color is used in the src factor is MotorStorm.
			if (gstate_c.Use(GPU_USE_FRAMEBUFFER_FETCH))
				return REPLACE_BLEND_READ_FRAMEBUFFER;
			return doubled;
		}
		if (funcB == GE_DSTBLEND_DOUBLEINVSRCALPHA) {
			// For inverse, things are simpler.  Clamping isn't an issue.
			return doubled;
		}
		if (dstIsDoubleDstAlpha) {
			return standardIf565;
		}
		return REPLACE_BLEND_STANDARD;
	}

	case GE_SRCBLEND_FIXA:
	default:
		break;
	}

	// From here on, funcA is FIXA or a value not listed above.
	switch (funcB) {
	case GE_DSTBLEND_DOUBLESRCALPHA:
		// L.A. Rush ends up here (detail textures at the end of the frame). It uses FIXA = 0 (no src color contribution)
		// but I still can't find a way to replicate the formula.
		// If our framebuffer was floating point we could make it work (since that turns off clamping before blending)
		// by just doubling src_alpha in the shader.
		//
		// It might be possible to replicate it if we implement a 2-pass decomposition:
		// * First pass just does:
		//   src=ZERO dst=SRC_ALPHA.
		// * Second pass renders with white input color. To double the resulting destination color:
		//   src=DST_COLOR dst=ONE
		return REPLACE_BLEND_READ_FRAMEBUFFER;

	case GE_DSTBLEND_DOUBLEINVSRCALPHA:
		// Doubling alpha is safe for the inverse, will clamp to zero either way.
		return REPLACE_BLEND_2X_ALPHA;

	case GE_DSTBLEND_DOUBLEDSTALPHA:
	case GE_DSTBLEND_DOUBLEINVDSTALPHA:
		// Alpha is irrelevant with the 565 format.
		return standardIf565;

	case GE_DSTBLEND_SRCCOLOR:
	case GE_DSTBLEND_INVSRCCOLOR:
	case GE_DSTBLEND_SRCALPHA:
	case GE_DSTBLEND_INVSRCALPHA:
	case GE_DSTBLEND_DSTALPHA:
	case GE_DSTBLEND_INVDSTALPHA:
		return REPLACE_BLEND_STANDARD;

	case GE_DSTBLEND_FIXB:
	default:
		break;
	}

	// Fixed factors on both sides (or unlisted values): inspect the constants.
	const auto fixA = gstate.getFixA();
	const auto fixB = gstate.getFixB();
	const bool fixAIsOne = fixA == 0xFFFFFF;
	const bool fixAIsZero = fixA == 0x000000;
	const bool fixBIsOne = fixB == 0xFFFFFF;
	const bool fixBIsZero = fixB == 0x000000;

	if (fixAIsOne && fixBIsZero) {
		// Some games specify this. Some GPUs may prefer blending off entirely.
		return REPLACE_BLEND_NO;
	}
	if (fixAIsOne || fixAIsZero || fixBIsOne || fixBIsZero) {
		// We can represent this with standard factors.
		return REPLACE_BLEND_STANDARD;
	}
	// Multiply the src color in the shader, that way it's always accurate.
	return REPLACE_BLEND_PRE_SRC;
}
520

521
// Depth slice factor returned for accurate depth without depth clamp.
static constexpr float DEPTH_SLICE_FACTOR_HIGH = 4.0f;
// Depth slice factor returned when scaling 24-bit depth down to 16-bit values.
static constexpr float DEPTH_SLICE_FACTOR_16BIT = 256.0f;
523

524
// The supported flag combinations. TODO: Maybe they should be distilled down into an enum.
525
//
526
// 0 - "Old"-style GL depth.
527
//     Or "Non-accurate depth" : effectively ignore minz / maxz. Map Z values based on viewport, which clamps.
528
//     This skews depth in many instances. Depth can be inverted in this mode if viewport says.
529
//     This is completely wrong, but works in some cases (probably because some game devs assumed it was how it worked)
530
//     and avoids some depth clamp issues.
531
//
532
// GPU_USE_ACCURATE_DEPTH:
533
//     Accurate depth: Z in the framebuffer matches the range of Z used on the PSP linearly in some way. We choose
534
//     a centered range, to simulate clamping by letting otherwise out-of-range pixels survive the 0 and 1 cutoffs.
535
//     Clip depth based on minz/maxz, and viewport is just a means to scale and center the value, not clipping or mapping to stored values.
536
//
537
// GPU_USE_ACCURATE_DEPTH | GPU_USE_DEPTH_CLAMP:
538
//     Variant of GPU_USE_ACCURATE_DEPTH, just the range is the nice and convenient 0-1 since we can use
539
//     hardware depth clamp. Only viable in accurate depth mode; clamps depth and therefore uses the
//     full 0-1 range. Using the full 0-1 range is not what "accurate" means; it's implied by depth
//     clamp (which also means we're clamping.)
540
//
541
// GPU_USE_ACCURATE_DEPTH | GPU_SCALE_DEPTH_FROM_24BIT_TO_16BIT:
542
// GPU_USE_ACCURATE_DEPTH | GPU_SCALE_DEPTH_FROM_24BIT_TO_16BIT | GPU_USE_DEPTH_CLAMP:
543
//     Only viable in accurate depth mode, means to use a range of the 24-bit depth values available
544
//     from the GPU to represent the 16-bit values the PSP had, to try to make everything round and
545
//     z-fight (close to) the same way as on hardware, cheaply (cheaper than rounding depth in fragment shader).
546
//     We automatically switch to this if Z tests for equality are used.
547
//     Depth clamp has no effect on the depth scaling here if set, though will still be enabled
548
//     and clamp wildly out of line values.
549
//
550
// Any other combinations of these particular flags are bogus (like for example a lonely GPU_USE_DEPTH_CLAMP).
551

552
// Returns the depth slice factor for the given GPU use flags.
//
// useFlags: bitmask; only GPU_USE_ACCURATE_DEPTH, GPU_SCALE_DEPTH_FROM_24BIT_TO_16BIT
// and GPU_USE_DEPTH_CLAMP are inspected.
// Returns 1.0 for old-style depth or accurate depth with clamp, DEPTH_SLICE_FACTOR_16BIT
// when scaling to 16-bit, and DEPTH_SLICE_FACTOR_HIGH for plain accurate depth.
float DepthSliceFactor(u32 useFlags) {
	const bool accurateDepth = (useFlags & GPU_USE_ACCURATE_DEPTH) != 0;
	if (!accurateDepth) {
		// Old style depth.
		return 1.0f;
	}

	const bool scaleTo16Bit = (useFlags & GPU_SCALE_DEPTH_FROM_24BIT_TO_16BIT) != 0;
	if (scaleTo16Bit) {
		// Accurate depth but 16-bit resolution, so squish. Takes priority over depth clamp.
		return DEPTH_SLICE_FACTOR_16BIT;
	}

	// With clamping available the full range can be used; otherwise standard accurate depth.
	const bool depthClamp = (useFlags & GPU_USE_DEPTH_CLAMP) != 0;
	return depthClamp ? 1.0f : DEPTH_SLICE_FACTOR_HIGH;
}
569

570
// See class DepthScaleFactors for how to apply.
571
DepthScaleFactors GetDepthScaleFactors(u32 useFlags) {
572
	if (!(useFlags & GPU_USE_ACCURATE_DEPTH)) {
573
		return DepthScaleFactors(0.0f, 65535.0f);
574
	}
575

576
	if (useFlags & GPU_SCALE_DEPTH_FROM_24BIT_TO_16BIT) {
577
		const double offset = 0.5 * (DEPTH_SLICE_FACTOR_16BIT - 1.0) / DEPTH_SLICE_FACTOR_16BIT;
578
		// Use one bit for each value, rather than 1.0 / (65535.0 * 256.0).
579
		const double scale = 16777215.0;
580
		return DepthScaleFactors(offset, scale);
581
	} else if (useFlags & GPU_USE_DEPTH_CLAMP) {
582
		return DepthScaleFactors(0.0f, 65535.0f);
583
	} else {
584
		const double offset = 0.5f * (DEPTH_SLICE_FACTOR_HIGH - 1.0f) * (1.0f / DEPTH_SLICE_FACTOR_HIGH);
585
		return DepthScaleFactors(offset, (float)(DEPTH_SLICE_FACTOR_HIGH * 65535.0));
586
	}
587
}
588

589
void ConvertViewportAndScissor(bool useBufferedRendering, float renderWidth, float renderHeight, int bufferWidth, int bufferHeight, ViewportAndScissor &out) {
590
	out.throughMode = gstate.isModeThrough();
591

592
	float renderWidthFactor, renderHeightFactor;
593
	float renderX = 0.0f, renderY = 0.0f;
594
	float displayOffsetX, displayOffsetY;
595
	if (useBufferedRendering) {
596
		displayOffsetX = 0.0f;
597
		displayOffsetY = 0.0f;
598
		renderWidthFactor = (float)renderWidth / (float)bufferWidth;
599
		renderHeightFactor = (float)renderHeight / (float)bufferHeight;
600
	} else {
601
		float pixelW = PSP_CoreParameter().pixelWidth;
602
		float pixelH = PSP_CoreParameter().pixelHeight;
603
		FRect frame = GetScreenFrame(pixelW, pixelH);
604
		FRect rc;
605
		CalculateDisplayOutputRect(&rc, 480, 272, frame, ROTATION_LOCKED_HORIZONTAL);
606
		displayOffsetX = rc.x;
607
		displayOffsetY = rc.y;
608
		renderWidth = rc.w;
609
		renderHeight = rc.h;
610
		renderWidthFactor = renderWidth / 480.0f;
611
		renderHeightFactor = renderHeight / 272.0f;
612
	}
613

614
	// We take care negative offsets of in the projection matrix.
615
	// These come from split framebuffers (Killzone).
616
	// TODO: Might be safe to do get rid of this here and do the same for positive offsets?
617
	renderX = std::max(gstate_c.curRTOffsetX, 0);
618
	renderY = std::max(gstate_c.curRTOffsetY, 0);
619

620
	// Scissor
621
	int scissorX1 = gstate.getScissorX1();
622
	int scissorY1 = gstate.getScissorY1();
623
	int scissorX2 = gstate.getScissorX2() + 1;
624
	int scissorY2 = gstate.getScissorY2() + 1;
625

626
	if (scissorX2 < scissorX1 || scissorY2 < scissorY1) {
627
		out.scissorX = 0;
628
		out.scissorY = 0;
629
		out.scissorW = 0;
630
		out.scissorH = 0;
631
	} else {
632
		out.scissorX = (renderX * renderWidthFactor) + displayOffsetX + scissorX1 * renderWidthFactor;
633
		out.scissorY = (renderY * renderHeightFactor) + displayOffsetY + scissorY1 * renderHeightFactor;
634
		out.scissorW = (scissorX2 - scissorX1) * renderWidthFactor;
635
		out.scissorH = (scissorY2 - scissorY1) * renderHeightFactor;
636
	}
637

638
	int curRTWidth = gstate_c.curRTWidth;
639
	int curRTHeight = gstate_c.curRTHeight;
640

641
	float offsetX = gstate.getOffsetX();
642
	float offsetY = gstate.getOffsetY();
643

644
	DepthScaleFactors depthScale = GetDepthScaleFactors(gstate_c.UseFlags());
645

646
	if (out.throughMode) {
647
		// If renderX/renderY are offset to compensate for a split framebuffer,
648
		// applying the offset to the viewport isn't enough, since the viewport clips.
649
		// We need to apply either directly to the vertices, or to the "through" projection matrix.
650
		out.viewportX = renderX * renderWidthFactor + displayOffsetX;
651
		out.viewportY = renderY * renderHeightFactor + displayOffsetY;
652
		out.viewportW = curRTWidth * renderWidthFactor;
653
		out.viewportH = curRTHeight * renderHeightFactor;
654
		out.depthRangeMin = depthScale.EncodeFromU16(0.0f);
655
		out.depthRangeMax = depthScale.EncodeFromU16(65536.0f);
656
	} else {
657
		// These we can turn into a glViewport call, offset by offsetX and offsetY. Math after.
658
		float vpXScale = gstate.getViewportXScale();
659
		float vpXCenter = gstate.getViewportXCenter();
660
		float vpYScale = gstate.getViewportYScale();
661
		float vpYCenter = gstate.getViewportYCenter();
662

663
		// The viewport transform appears to go like this:
664
		// Xscreen = -offsetX + vpXCenter + vpXScale * Xview
665
		// Yscreen = -offsetY + vpYCenter + vpYScale * Yview
666
		// Zscreen = vpZCenter + vpZScale * Zview
667

668
		// The viewport is normally centered at 2048,2048 but can also be centered at other locations.
669
		// Offset is subtracted from the viewport center and is also set to values in those ranges, and is set so that the viewport will cover
670
		// the desired screen area ([0-480)x[0-272)), so 1808,1912.
671

672
		// This means that to get the analogue glViewport we must:
673
		float vpX0 = vpXCenter - offsetX - fabsf(vpXScale);
674
		float vpY0 = vpYCenter - offsetY - fabsf(vpYScale);
675
		gstate_c.vpWidth = vpXScale * 2.0f;
676
		gstate_c.vpHeight = vpYScale * 2.0f;
677

678
		float vpWidth = fabsf(gstate_c.vpWidth);
679
		float vpHeight = fabsf(gstate_c.vpHeight);
680

681
		float left = renderX + vpX0;
682
		float top = renderY + vpY0;
683
		float right = left + vpWidth;
684
		float bottom = top + vpHeight;
685

686
		out.widthScale = 1.0f;
687
		out.xOffset = 0.0f;
688
		out.heightScale = 1.0f;
689
		out.yOffset = 0.0f;
690

691
		// If we're within the bounds, we want clipping the viewport way.  So leave it be.
692
		{
693
			float overageLeft = std::max(-left, 0.0f);
694
			float overageRight = std::max(right - bufferWidth, 0.0f);
695

696
			// Expand viewport to cover scissor region. The viewport doesn't clip on the PSP.
697
			if (right < scissorX2) {
698
				overageRight -= scissorX2 - right;
699
			}
700
			if (left > scissorX1) {
701
				overageLeft += scissorX1 - left;
702
			}
703

704
			// Our center drifted by the difference in overages.
705
			float drift = overageRight - overageLeft;
706

707
			if (overageLeft != 0.0f || overageRight != 0.0f) {
708
				left += overageLeft;
709
				right -= overageRight;
710

711
				// Protect against the viewport being entirely outside the scissor.
712
				// Emit a tiny but valid viewport. Really, we should probably emit a flag to ignore draws.
713
				if (right <= left) {
714
					right = left + 1.0f;
715
				}
716

717
				out.widthScale = vpWidth / (right - left);
718
				out.xOffset = drift / (right - left);
719
			}
720
		}
721

722
		{
723
			float overageTop = std::max(-top, 0.0f);
724
			float overageBottom = std::max(bottom - bufferHeight, 0.0f);
725

726
			// Expand viewport to cover scissor region. The viewport doesn't clip on the PSP.
727
			if (bottom < scissorY2) {
728
				overageBottom -= scissorY2 - bottom;
729
			}
730
			if (top > scissorY1) {
731
				overageTop += scissorY1 - top;
732
			}
733
			// Our center drifted by the difference in overages.
734
			float drift = overageBottom - overageTop;
735

736
			if (overageTop != 0.0f || overageBottom != 0.0f) {
737
				top += overageTop;
738
				bottom -= overageBottom;
739

740
				// Protect against the viewport being entirely outside the scissor.
741
				// Emit a tiny but valid  viewport. Really, we should probably emit a flag to ignore draws.
742
				if (bottom <= top) {
743
					bottom = top + 1.0f;
744
				}
745

746
				out.heightScale = vpHeight / (bottom - top);
747
				out.yOffset = drift / (bottom - top);
748
			}
749
		}
750

751
		out.viewportX = left * renderWidthFactor + displayOffsetX;
752
		out.viewportY = top * renderHeightFactor + displayOffsetY;
753
		out.viewportW = (right - left) * renderWidthFactor;
754
		out.viewportH = (bottom - top) * renderHeightFactor;
755

756
		// The depth viewport parameters are the same, but we handle it a bit differently.
757
		// When clipping is enabled, depth is clamped to [0, 65535].  And minz/maxz discard.
758
		// So, we apply the depth range as minz/maxz, and transform for the viewport.
759
		float vpZScale = gstate.getViewportZScale();
760
		float vpZCenter = gstate.getViewportZCenter();
761
		// TODO: This clip the entire draw if minz > maxz.
762
		float minz = gstate.getDepthRangeMin();
763
		float maxz = gstate.getDepthRangeMax();
764

765
		if (gstate.isDepthClampEnabled() && (minz == 0 || maxz == 65535)) {
766
			// Here, we should "clamp."  But clamping per fragment would be slow.
767
			// So, instead, we just increase the available range and hope.
768
			// If depthSliceFactor is 4, it means (75% / 2) of the depth lies in each direction.
769
			float fullDepthRange = 65535.0f * (depthScale.Scale() - 1.0f) * (1.0f / 2.0f);
770
			if (minz == 0) {
771
				minz -= fullDepthRange;
772
			}
773
			if (maxz == 65535) {
774
				maxz += fullDepthRange;
775
			}
776
		} else if (maxz == 65535) {
777
			// This means clamp isn't enabled, but we still want to allow values up to 65535.99.
778
			// If DepthSliceFactor() is 1.0, though, this would make out.depthRangeMax exceed 1.
779
			// Since that would clamp, it would make Z=1234 not match between draws when maxz changes.
780
			if (depthScale.Scale() > 1.0f)
781
				maxz = 65535.99f;
782
		}
783

784
		// Okay.  So, in our shader, -1 will map to minz, and +1 will map to maxz.
785
		float halfActualZRange = (maxz - minz) * (1.0f / 2.0f);
786
		out.depthScale = halfActualZRange < std::numeric_limits<float>::epsilon() ? 1.0f : vpZScale / halfActualZRange;
787
		// This adjusts the center from halfActualZRange to vpZCenter.
788
		out.zOffset = halfActualZRange < std::numeric_limits<float>::epsilon() ? 0.0f : (vpZCenter - (minz + halfActualZRange)) / halfActualZRange;
789

790
		if (!gstate_c.Use(GPU_USE_ACCURATE_DEPTH)) {
791
			out.depthScale = 1.0f;
792
			out.zOffset = 0.0f;
793
			out.depthRangeMin = depthScale.EncodeFromU16(vpZCenter - vpZScale);
794
			out.depthRangeMax = depthScale.EncodeFromU16(vpZCenter + vpZScale);
795
		} else {
796
			out.depthRangeMin = depthScale.EncodeFromU16(minz);
797
			out.depthRangeMax = depthScale.EncodeFromU16(maxz);
798
		}
799

800
		// OpenGL will clamp these for us anyway, and Direct3D will error if not clamped.
801
		// Of course, if this happens we've skewed out.depthScale/out.zOffset and may get z-fighting.
802
		out.depthRangeMin = std::max(out.depthRangeMin, 0.0f);
803
		out.depthRangeMax = std::min(out.depthRangeMax, 1.0f);
804
	}
805
}
806

807
void UpdateCachedViewportState(const ViewportAndScissor &vpAndScissor) {
808
	if (vpAndScissor.throughMode)
809
		return;
810

811
	bool scaleChanged = gstate_c.vpWidthScale != vpAndScissor.widthScale || gstate_c.vpHeightScale != vpAndScissor.heightScale;
812
	bool offsetChanged = gstate_c.vpXOffset != vpAndScissor.xOffset || gstate_c.vpYOffset != vpAndScissor.yOffset;
813
	bool depthChanged = gstate_c.vpDepthScale != vpAndScissor.depthScale || gstate_c.vpZOffset != vpAndScissor.zOffset;
814
	if (scaleChanged || offsetChanged || depthChanged) {
815
		gstate_c.vpWidthScale = vpAndScissor.widthScale;
816
		gstate_c.vpHeightScale = vpAndScissor.heightScale;
817
		gstate_c.vpDepthScale = vpAndScissor.depthScale;
818
		gstate_c.vpXOffset = vpAndScissor.xOffset;
819
		gstate_c.vpYOffset = vpAndScissor.yOffset;
820
		gstate_c.vpZOffset = vpAndScissor.zOffset;
821

822
		gstate_c.Dirty(DIRTY_PROJMATRIX);
823
		if (depthChanged) {
824
			gstate_c.Dirty(DIRTY_DEPTHRANGE);
825
		}
826
	}
827
}
828

829
// Host blend factor for each PSP source blend function, indexed by the GEBlendSrcFactor value.
// The "double" alpha functions are mapped to the plain (non-doubled) factor.
// Note that ConvertBlendState resolves FIXA through blendColor2Func() instead of this table.
static const BlendFactor genericALookup[11] = {
	BlendFactor::DST_COLOR,
	BlendFactor::ONE_MINUS_DST_COLOR,
	BlendFactor::SRC_ALPHA,
	BlendFactor::ONE_MINUS_SRC_ALPHA,
	BlendFactor::DST_ALPHA,
	BlendFactor::ONE_MINUS_DST_ALPHA,
	BlendFactor::SRC_ALPHA,			// GE_SRCBLEND_DOUBLESRCALPHA
	BlendFactor::ONE_MINUS_SRC_ALPHA,		// GE_SRCBLEND_DOUBLEINVSRCALPHA
	BlendFactor::DST_ALPHA,			// GE_SRCBLEND_DOUBLEDSTALPHA
	BlendFactor::ONE_MINUS_DST_ALPHA,		// GE_SRCBLEND_DOUBLEINVDSTALPHA
	BlendFactor::CONSTANT_COLOR,		// FIXA
};
842

843
// Host blend factor for each PSP destination blend function, indexed by the GEBlendDstFactor value.
// The "double" alpha functions are mapped to the plain (non-doubled) factor.
// Note that ConvertBlendState resolves FIXB through blendColor2Func() instead of this table.
static const BlendFactor genericBLookup[11] = {
	BlendFactor::SRC_COLOR,
	BlendFactor::ONE_MINUS_SRC_COLOR,
	BlendFactor::SRC_ALPHA,
	BlendFactor::ONE_MINUS_SRC_ALPHA,
	BlendFactor::DST_ALPHA,
	BlendFactor::ONE_MINUS_DST_ALPHA,
	BlendFactor::SRC_ALPHA,			// GE_DSTBLEND_DOUBLESRCALPHA
	BlendFactor::ONE_MINUS_SRC_ALPHA,		// GE_DSTBLEND_DOUBLEINVSRCALPHA
	BlendFactor::DST_ALPHA,			// GE_DSTBLEND_DOUBLEDSTALPHA
	BlendFactor::ONE_MINUS_DST_ALPHA,		// GE_DSTBLEND_DOUBLEINVDSTALPHA
	BlendFactor::CONSTANT_COLOR,		// FIXB
};
856

857
// Blend equation per GEBlendMode value, used by ConvertBlendState when GPU_USE_BLEND_MINMAX
// is not available: MIN, MAX and ABSDIFF all fall back to ADD.
static const BlendEq eqLookupNoMinMax[] = {
	BlendEq::ADD,
	BlendEq::SUBTRACT,
	BlendEq::REVERSE_SUBTRACT,
	BlendEq::ADD,			// GE_BLENDMODE_MIN
	BlendEq::ADD,			// GE_BLENDMODE_MAX
	BlendEq::ADD,			// GE_BLENDMODE_ABSDIFF
	BlendEq::ADD,
	BlendEq::ADD,
};
867

868
// Blend equation per GEBlendMode value, used by ConvertBlendState when GPU_USE_BLEND_MINMAX
// is available.  ABSDIFF has no entry of its own here and is mapped to MAX.
static const BlendEq eqLookup[] = {
	BlendEq::ADD,
	BlendEq::SUBTRACT,
	BlendEq::REVERSE_SUBTRACT,
	BlendEq::MIN,			// GE_BLENDMODE_MIN
	BlendEq::MAX,			// GE_BLENDMODE_MAX
	BlendEq::MAX,			// GE_BLENDMODE_ABSDIFF
	BlendEq::ADD,
	BlendEq::ADD,
};
878

879
// Replaces the source-alpha based factors with their SRC1 (dual source) counterparts.
// Any other factor is returned unchanged.
static BlendFactor toDualSource(BlendFactor blendfunc) {
	switch (blendfunc) {
	case BlendFactor::SRC_ALPHA:
		return BlendFactor::SRC1_ALPHA;
	case BlendFactor::ONE_MINUS_SRC_ALPHA:
		return BlendFactor::ONE_MINUS_SRC1_ALPHA;
	default:
		return blendfunc;
	}
}
889

890
// Tries to express a fixed blend color as a plain ONE or ZERO factor.
//
// fix:    the 24-bit fixed color.
// approx: set to true whenever fix is neither exactly 0xFFFFFF nor exactly 0, i.e. whenever a
//         returned ONE/ZERO would only be an approximation.  It is never reset to false here,
//         and it is also set when INVALID is returned.
//
// Returns ONE/ZERO for exact or near (within 0.01 per channel) matches, otherwise
// BlendFactor::INVALID to tell the caller that a real constant color is needed.
static BlendFactor blendColor2Func(u32 fix, bool &approx) {
	if (fix == 0xFFFFFF)
		return BlendFactor::ONE;
	if (fix == 0)
		return BlendFactor::ZERO;

	// Otherwise, it's approximate if we pick ONE/ZERO.
	approx = true;

	const Vec3f fix3 = Vec3f::FromRGB(fix);
	if (fix3.x >= 0.99 && fix3.y >= 0.99 && fix3.z >= 0.99)
		return BlendFactor::ONE;
	else if (fix3.x <= 0.01 && fix3.y <= 0.01 && fix3.z <= 0.01)
		return BlendFactor::ZERO;
	return BlendFactor::INVALID;
}
906

907
// abs is a quagmire of compiler incompatibilities, so...
// Returns the absolute value of x.
inline int iabs(int x) {
	return x >= 0 ? x : -x;
}
911

912
// Returns true when each of the three low 8-bit channels of a and b differ by at most margin.
// Bits 24-31 are ignored.
// The per-channel difference is computed on ints so it never relies on unsigned wrap-around
// being converted back to a signed value.
static inline bool blendColorSimilar(uint32_t a, uint32_t b, int margin = 25) {   // 25 ~= 0.1 * 255
	for (int shift = 0; shift < 24; shift += 8) {
		const int channelA = static_cast<int>((a >> shift) & 0xff);
		const int channelB = static_cast<int>((b >> shift) & 0xff);
		const int diff = channelA >= channelB ? channelA - channelB : channelB - channelA;
		if (diff > margin)
			return false;
	}
	return true;
}
920

921
// Try to simulate some common logic ops by using blend, if needed.
922
// The shader might also need modification, the below function SimulateLogicOpShaderTypeIfNeeded
923
// takes care of that.
924
static bool SimulateLogicOpIfNeeded(BlendFactor &srcBlend, BlendFactor &dstBlend, BlendEq &blendEq) {
925
	if (!gstate.isLogicOpEnabled())
926
		return false;
927

928
	// Note: our shader solution applies logic ops BEFORE blending, not correctly after.
929
	// This is however fine for the most common ones, like CLEAR/NOOP/SET, etc.
930
	if (!gstate_c.Use(GPU_USE_LOGIC_OP)) {
931
		switch (gstate.getLogicOp()) {
932
		case GE_LOGIC_CLEAR:
933
			srcBlend = BlendFactor::ZERO;
934
			dstBlend = BlendFactor::ZERO;
935
			blendEq = BlendEq::ADD;
936
			return true;
937
		case GE_LOGIC_AND:
938
		case GE_LOGIC_AND_REVERSE:
939
			WARN_LOG_REPORT_ONCE(d3dLogicOpAnd, Log::G3D, "Unsupported AND logic op: %x", gstate.getLogicOp());
940
			break;
941
		case GE_LOGIC_COPY:
942
			// This is the same as off.
943
			break;
944
		case GE_LOGIC_COPY_INVERTED:
945
			// Handled in the shader.
946
			break;
947
		case GE_LOGIC_AND_INVERTED:
948
		case GE_LOGIC_NOR:
949
		case GE_LOGIC_NAND:
950
		case GE_LOGIC_EQUIV:
951
			// Handled in the shader.
952
			WARN_LOG_REPORT_ONCE(d3dLogicOpAndInverted, Log::G3D, "Attempted invert for logic op: %x", gstate.getLogicOp());
953
			break;
954
		case GE_LOGIC_INVERTED:
955
			srcBlend = BlendFactor::ONE;
956
			dstBlend = BlendFactor::ONE;
957
			blendEq = BlendEq::SUBTRACT;
958
			WARN_LOG_REPORT_ONCE(d3dLogicOpInverted, Log::G3D, "Attempted inverse for logic op: %x", gstate.getLogicOp());
959
			return true;
960
		case GE_LOGIC_NOOP:
961
			srcBlend = BlendFactor::ZERO;
962
			dstBlend = BlendFactor::ONE;
963
			blendEq = BlendEq::ADD;
964
			return true;
965
		case GE_LOGIC_XOR:
966
			WARN_LOG_REPORT_ONCE(d3dLogicOpOrXor, Log::G3D, "Unsupported XOR logic op: %x", gstate.getLogicOp());
967
			break;
968
		case GE_LOGIC_OR:
969
		case GE_LOGIC_OR_INVERTED:
970
			// Inverted in shader.
971
			srcBlend = BlendFactor::ONE;
972
			dstBlend = BlendFactor::ONE;
973
			blendEq = BlendEq::ADD;
974
			WARN_LOG_REPORT_ONCE(d3dLogicOpOr, Log::G3D, "Attempted or for logic op: %x", gstate.getLogicOp());
975
			return true;
976
		case GE_LOGIC_OR_REVERSE:
977
			WARN_LOG_REPORT_ONCE(d3dLogicOpOrReverse, Log::G3D, "Unsupported OR REVERSE logic op: %x", gstate.getLogicOp());
978
			break;
979
		case GE_LOGIC_SET:
980
			srcBlend = BlendFactor::ONE;
981
			dstBlend = BlendFactor::ONE;
982
			blendEq = BlendEq::ADD;
983
			WARN_LOG_REPORT_ONCE(d3dLogicOpSet, Log::G3D, "Attempted set for logic op: %x", gstate.getLogicOp());
984
			return true;
985
		}
986
	} else {
987
		// Even if we support hardware logic ops, alpha is handled wrong.
988
		// It's better to override blending for the simple cases.
989
		switch (gstate.getLogicOp()) {
990
		case GE_LOGIC_CLEAR:
991
			srcBlend = BlendFactor::ZERO;
992
			dstBlend = BlendFactor::ZERO;
993
			blendEq = BlendEq::ADD;
994
			return true;
995
		case GE_LOGIC_NOOP:
996
			srcBlend = BlendFactor::ZERO;
997
			dstBlend = BlendFactor::ONE;
998
			blendEq = BlendEq::ADD;
999
			return true;
1000

1001
		default:
1002
			// Let's hope hardware gets it right.
1003
			return false;
1004
		}
1005
	}
1006
	return false;
1007
}
1008

1009
// Choose the shader part of the above logic op fallback simulation.
1010
SimulateLogicOpType SimulateLogicOpShaderTypeIfNeeded() {
1011
	if (!gstate_c.Use(GPU_USE_LOGIC_OP) && gstate.isLogicOpEnabled()) {
1012
		switch (gstate.getLogicOp()) {
1013
		case GE_LOGIC_COPY_INVERTED:
1014
		case GE_LOGIC_AND_INVERTED:
1015
		case GE_LOGIC_OR_INVERTED:
1016
		case GE_LOGIC_NOR:
1017
		case GE_LOGIC_NAND:
1018
		case GE_LOGIC_EQUIV:
1019
			return LOGICOPTYPE_INVERT;
1020
		case GE_LOGIC_INVERTED:
1021
			return LOGICOPTYPE_ONE;
1022
		case GE_LOGIC_SET:
1023
			return LOGICOPTYPE_ONE;
1024
		default:
1025
			return LOGICOPTYPE_NORMAL;
1026
		}
1027
	}
1028
	return LOGICOPTYPE_NORMAL;
1029
}
1030

1031
// Sets up blendState for the case where the game's own blending is not applied.
// Blending may still get enabled purely so the alpha channel can carry a stencil
// INCR/DECR/INVERT; the stencil type is only looked up when replaceAlphaWithStencil is
// REPLACE_ALPHA_YES, otherwise it is treated as KEEP.
void ApplyStencilReplaceAndLogicOpIgnoreBlend(ReplaceAlphaType replaceAlphaWithStencil, GenericBlendState &blendState) {
	StencilValueType stencilType = STENCIL_VALUE_KEEP;
	if (replaceAlphaWithStencil == REPLACE_ALPHA_YES) {
		stencilType = ReplaceAlphaWithStencilType();
	}

	// Normally, we would add src + 0 with blending off, but the logic op may have us do differently.
	// Nothing in this function currently modifies these three, so they are the color-channel setup.
	const BlendFactor srcBlend = BlendFactor::ONE;
	const BlendFactor dstBlend = BlendFactor::ZERO;
	const BlendEq blendEq = BlendEq::ADD;

	// We're not blending, but we may still want to "blend" for stencil.
	// This is only useful for INCR/DECR/INVERT.  Others can write directly.
	switch (stencilType) {
	case STENCIL_VALUE_INCR_4:
	case STENCIL_VALUE_INCR_8:
		// We'll add the incremented value output by the shader.
		blendState.blendEnabled = true;
		blendState.setFactors(srcBlend, dstBlend, BlendFactor::ONE, BlendFactor::ONE);
		blendState.setEquation(blendEq, BlendEq::ADD);
		break;

	case STENCIL_VALUE_DECR_4:
	case STENCIL_VALUE_DECR_8:
		// We'll subtract the incremented value output by the shader.
		blendState.blendEnabled = true;
		blendState.setFactors(srcBlend, dstBlend, BlendFactor::ONE, BlendFactor::ONE);
		blendState.setEquation(blendEq, BlendEq::SUBTRACT);
		break;

	case STENCIL_VALUE_INVERT:
		// The shader will output one, and reverse subtracting will essentially invert.
		blendState.blendEnabled = true;
		blendState.setFactors(srcBlend, dstBlend, BlendFactor::ONE, BlendFactor::ONE);
		blendState.setEquation(blendEq, BlendEq::REVERSE_SUBTRACT);
		break;

	default:
		// With the defaults above (ONE, ZERO, ADD) blending is a no-op and can stay off.
		if (srcBlend == BlendFactor::ONE && dstBlend == BlendFactor::ZERO && blendEq == BlendEq::ADD) {
			blendState.blendEnabled = false;
		} else {
			blendState.blendEnabled = true;
			blendState.setFactors(srcBlend, dstBlend, BlendFactor::ONE, BlendFactor::ZERO);
			blendState.setEquation(blendEq, BlendEq::ADD);
		}
		break;
	}
}
1079

1080
// Tells the Convert*State helpers below whether a shader-side framebuffer read may be used.
enum class FBReadSetting {
	// The framebuffer read path is being forced on by the caller.
	Forced,
	// The helpers may opt into a framebuffer read when they need one.
	Allowed,
	// The helpers must not request a framebuffer read and fall back to approximations instead.
	Disallowed,
};
1085

1086
// If we can we emulate the colorMask by simply toggling the full R G B A masks offered
1087
// by modern hardware, we do that. This is 99.9% of the time.
1088
// When that's not enough, we fall back on a technique similar to shader blending,
1089
// we read from the framebuffer (or a copy of it).
1090
// We also prepare uniformMask so that if doing this in the shader gets forced-on,
1091
// we have the right mask already.
1092
static void ConvertMaskState(GenericMaskState &maskState, FBReadSetting useShader) {
1093
	if (gstate_c.blueToAlpha) {
1094
		maskState.applyFramebufferRead = false;
1095
		maskState.uniformMask = 0xFF000000;
1096
		maskState.channelMask = 0x8;
1097
		return;
1098
	}
1099

1100
	// Invert to convert masks from the PSP's format where 1 is don't draw to PC where 1 is draw.
1101
	uint32_t colorMask = ~((gstate.pmskc & 0xFFFFFF) | (gstate.pmska << 24));
1102

1103
	maskState.uniformMask = colorMask;
1104
	maskState.applyFramebufferRead = false;
1105
	maskState.channelMask = 0;
1106
	for (int i = 0; i < 4; i++) {
1107
		uint32_t channelMask = (colorMask >> (i * 8)) & 0xFF;
1108
		switch (channelMask) {
1109
		case 0x0:
1110
			break;
1111
		case 0xFF:
1112
			maskState.channelMask |= 1 << i;
1113
			break;
1114
		default:
1115
			if (useShader != FBReadSetting::Disallowed && PSP_CoreParameter().compat.flags().ShaderColorBitmask) {
1116
				// Shaders can emulate masking accurately. Let's make use of that.
1117
				maskState.applyFramebufferRead = true;
1118
				maskState.channelMask |= 1 << i;
1119
			} else {
1120
				// Use the old inaccurate heuristic.
1121
				if (channelMask >= 128) {
1122
					maskState.channelMask |= 1 << i;
1123
				}
1124
			}
1125
		}
1126
	}
1127

1128
	// Let's not write to alpha if stencil isn't enabled.
1129
	// Also if the stencil type is set to KEEP, we shouldn't write to the stencil/alpha channel.
1130
	if (IsStencilTestOutputDisabled() || ReplaceAlphaWithStencilType() == STENCIL_VALUE_KEEP) {
1131
		maskState.channelMask &= ~8;
1132
		maskState.uniformMask &= ~0xFF000000;
1133
	}
1134

1135
	// For 5551, only the top alpha bit matters.  We might even want to swizzle 4444.
1136
	// Alpha should correctly read as 255 from a 5551 texture.
1137
	if (gstate.FrameBufFormat() == GE_FORMAT_5551) {
1138
		if ((maskState.uniformMask & 0x80000000) != 0)
1139
			maskState.uniformMask |= 0xFF000000;
1140
		else
1141
			maskState.uniformMask &= ~0xFF000000;
1142
	}
1143
}
1144

1145
// Called even if AlphaBlendEnable == false - it also deals with stencil-related blend state.
1146
static void ConvertBlendState(GenericBlendState &blendState, FBReadSetting useFBRead) {
1147
	// Blending is a bit complex to emulate.  This is due to several reasons:
1148
	//
1149
	//  * Doubled blend modes (src, dst, inversed) aren't supported in OpenGL.
1150
	//    If possible, we double the src color or src alpha in the shader to account for these.
1151
	//    These may clip incorrectly, so we avoid unfortunately.
1152
	//  * OpenGL only has one arbitrary fixed color.  We premultiply the other in the shader.
1153
	//  * The written output alpha should actually be the stencil value.  Alpha is not written.
1154
	//
1155
	// If we can't apply blending, we make a copy of the framebuffer and do it manually.
1156

1157
	blendState.applyFramebufferRead = false;
1158
	blendState.dirtyShaderBlendFixValues = false;
1159
	blendState.useBlendColor = false;
1160

1161
	ReplaceBlendType replaceBlend = ReplaceBlendWithShader(gstate_c.framebufFormat);
1162
	if (useFBRead == FBReadSetting::Forced) {
1163
		// Enforce blend replacement if enabled. If not, shouldn't do anything of course.
1164
		replaceBlend = gstate.isAlphaBlendEnabled() ? REPLACE_BLEND_READ_FRAMEBUFFER : REPLACE_BLEND_NO;
1165
	}
1166

1167
	blendState.replaceBlend = replaceBlend;
1168

1169
	blendState.simulateLogicOpType = SimulateLogicOpShaderTypeIfNeeded();
1170

1171
	ReplaceAlphaType replaceAlphaWithStencil = ReplaceAlphaWithStencil(replaceBlend);
1172
	blendState.replaceAlphaWithStencil = replaceAlphaWithStencil;
1173

1174
	bool usePreSrc = false;
1175

1176
	bool blueToAlpha = false;
1177

1178
	switch (replaceBlend) {
1179
	case REPLACE_BLEND_NO:
1180
		// We may still want to do something about stencil -> alpha.
1181
		ApplyStencilReplaceAndLogicOpIgnoreBlend(replaceAlphaWithStencil, blendState);
1182

1183
		if (useFBRead == FBReadSetting::Forced) {
1184
			// If this is true, the logic and mask replacements will be applied, at least. In that case,
1185
			// we should not apply any logic op simulation.
1186
			blendState.simulateLogicOpType = LOGICOPTYPE_NORMAL;
1187
		}
1188
		return;
1189

1190
	case REPLACE_BLEND_BLUE_TO_ALPHA:
1191
		blueToAlpha = true;
1192
		blendState.blendEnabled = gstate.isAlphaBlendEnabled();
1193
		// We'll later convert the color blend to blend in the alpha channel.
1194
		break;
1195

1196
	case REPLACE_BLEND_READ_FRAMEBUFFER:
1197
		blendState.blendEnabled = true;
1198
		blendState.applyFramebufferRead = true;
1199
		blendState.simulateLogicOpType = LOGICOPTYPE_NORMAL;
1200
		break;
1201

1202
	case REPLACE_BLEND_PRE_SRC:
1203
	case REPLACE_BLEND_PRE_SRC_2X_ALPHA:
1204
		blendState.blendEnabled = true;
1205
		usePreSrc = true;
1206
		break;
1207

1208
	case REPLACE_BLEND_STANDARD:
1209
	case REPLACE_BLEND_2X_ALPHA:
1210
	case REPLACE_BLEND_2X_SRC:
1211
		blendState.blendEnabled = true;
1212
		break;
1213
	}
1214

1215
	const GEBlendMode blendFuncEq = gstate.getBlendEq();
1216
	GEBlendSrcFactor blendFuncA = gstate.getBlendFuncA();
1217
	GEBlendDstFactor blendFuncB = gstate.getBlendFuncB();
1218
	const u32 fixA = gstate.getFixA();
1219
	const u32 fixB = gstate.getFixB();
1220

1221
	if (blendFuncA > GE_SRCBLEND_FIXA)
1222
		blendFuncA = GE_SRCBLEND_FIXA;
1223
	if (blendFuncB > GE_DSTBLEND_FIXB)
1224
		blendFuncB = GE_DSTBLEND_FIXB;
1225

1226
	int constantAlpha = 255;
1227
	BlendFactor constantAlphaGL = BlendFactor::ONE;
1228
	if (!IsStencilTestOutputDisabled() && replaceAlphaWithStencil == REPLACE_ALPHA_NO) {
1229
		switch (ReplaceAlphaWithStencilType()) {
1230
		case STENCIL_VALUE_UNIFORM:
1231
			constantAlpha = gstate.getStencilTestRef();
1232
			break;
1233

1234
		case STENCIL_VALUE_INCR_4:
1235
		case STENCIL_VALUE_DECR_4:
1236
			constantAlpha = 16;
1237
			break;
1238

1239
		case STENCIL_VALUE_INCR_8:
1240
		case STENCIL_VALUE_DECR_8:
1241
			constantAlpha = 1;
1242
			break;
1243

1244
		default:
1245
			break;
1246
		}
1247

1248
		// Otherwise it will stay GL_ONE.
1249
		if (constantAlpha <= 0) {
1250
			constantAlphaGL = BlendFactor::ZERO;
1251
		} else if (constantAlpha < 255) {
1252
			constantAlphaGL = BlendFactor::CONSTANT_ALPHA;
1253
		}
1254
	}
1255

1256
	// Shortcut by using GL_ONE where possible, no need to set blendcolor
1257
	bool approxFuncA = false;
1258
	BlendFactor glBlendFuncA = blendFuncA == GE_SRCBLEND_FIXA ? blendColor2Func(fixA, approxFuncA) : genericALookup[blendFuncA];
1259
	bool approxFuncB = false;
1260
	BlendFactor glBlendFuncB = blendFuncB == GE_DSTBLEND_FIXB ? blendColor2Func(fixB, approxFuncB) : genericBLookup[blendFuncB];
1261

1262
	if (gstate_c.framebufFormat == GE_FORMAT_565) {
1263
		if (blendFuncA == GE_SRCBLEND_DSTALPHA || blendFuncA == GE_SRCBLEND_DOUBLEDSTALPHA) {
1264
			glBlendFuncA = BlendFactor::ZERO;
1265
		}
1266
		if (blendFuncA == GE_SRCBLEND_INVDSTALPHA || blendFuncA == GE_SRCBLEND_DOUBLEINVDSTALPHA) {
1267
			glBlendFuncA = BlendFactor::ONE;
1268
		}
1269
		if (blendFuncB == GE_DSTBLEND_DSTALPHA || blendFuncB == GE_DSTBLEND_DOUBLEDSTALPHA) {
1270
			glBlendFuncB = BlendFactor::ZERO;
1271
		}
1272
		if (blendFuncB == GE_DSTBLEND_INVDSTALPHA || blendFuncB == GE_DSTBLEND_DOUBLEINVDSTALPHA) {
1273
			glBlendFuncB = BlendFactor::ONE;
1274
		}
1275
	}
1276

1277
	if (usePreSrc) {
1278
		glBlendFuncA = BlendFactor::ONE;
1279
		// Need to pull in the fixed color. TODO: If it hasn't changed, no need to dirty.
1280
		if (blendFuncA == GE_SRCBLEND_FIXA) {
1281
			blendState.dirtyShaderBlendFixValues = true;
1282
		}
1283
	}
1284

1285
	if (replaceAlphaWithStencil == REPLACE_ALPHA_DUALSOURCE) {
1286
		glBlendFuncA = toDualSource(glBlendFuncA);
1287
		glBlendFuncB = toDualSource(glBlendFuncB);
1288
	}
1289

1290
	if (blendFuncA == GE_SRCBLEND_FIXA || blendFuncB == GE_DSTBLEND_FIXB) {
1291
		if (glBlendFuncA == BlendFactor::INVALID && glBlendFuncB != BlendFactor::INVALID) {
1292
			// Can use blendcolor trivially.
1293
			blendState.setBlendColor(fixA, constantAlpha);
1294
			glBlendFuncA = BlendFactor::CONSTANT_COLOR;
1295
		} else if (glBlendFuncA != BlendFactor::INVALID && glBlendFuncB == BlendFactor::INVALID) {
1296
			// Can use blendcolor trivially.
1297
			blendState.setBlendColor(fixB, constantAlpha);
1298
			glBlendFuncB = BlendFactor::CONSTANT_COLOR;
1299
		} else if (glBlendFuncA == BlendFactor::INVALID && glBlendFuncB == BlendFactor::INVALID) {
1300
			if (blendColorSimilar(fixA, 0xFFFFFF ^ fixB)) {
1301
				glBlendFuncA = BlendFactor::CONSTANT_COLOR;
1302
				glBlendFuncB = BlendFactor::ONE_MINUS_CONSTANT_COLOR;
1303
				blendState.setBlendColor(fixA, constantAlpha);
1304
			} else if (blendColorSimilar(fixA, fixB)) {
1305
				glBlendFuncA = BlendFactor::CONSTANT_COLOR;
1306
				glBlendFuncB = BlendFactor::CONSTANT_COLOR;
1307
				blendState.setBlendColor(fixA, constantAlpha);
1308
			} else {
1309
				DEBUG_LOG(Log::G3D, "ERROR INVALID blendcolorstate: FixA=%06x FixB=%06x FuncA=%i FuncB=%i", fixA, fixB, blendFuncA, blendFuncB);
1310
				// Let's approximate, at least.  Close is better than totally off.
1311
				const bool nearZeroA = blendColorSimilar(fixA, 0, 64);
1312
				const bool nearZeroB = blendColorSimilar(fixB, 0, 64);
1313
				if (nearZeroA || blendColorSimilar(fixA, 0xFFFFFF, 64)) {
1314
					glBlendFuncA = nearZeroA ? BlendFactor::ZERO : BlendFactor::ONE;
1315
					glBlendFuncB = BlendFactor::CONSTANT_COLOR;
1316
					blendState.setBlendColor(fixB, constantAlpha);
1317
				} else {
1318
					// We need to pick something.  Let's go with A as the fixed color.
1319
					glBlendFuncA = BlendFactor::CONSTANT_COLOR;
1320
					glBlendFuncB = nearZeroB ? BlendFactor::ZERO : BlendFactor::ONE;
1321
					blendState.setBlendColor(fixA, constantAlpha);
1322
				}
1323
			}
1324
		} else {
1325
			// We optimized both, but that's probably not necessary, so let's pick one to be constant.
1326
			if (blendFuncA == GE_SRCBLEND_FIXA && !usePreSrc && approxFuncA) {
1327
				glBlendFuncA = BlendFactor::CONSTANT_COLOR;
1328
				blendState.setBlendColor(fixA, constantAlpha);
1329
			} else if (approxFuncB) {
1330
				glBlendFuncB = BlendFactor::CONSTANT_COLOR;
1331
				blendState.setBlendColor(fixB, constantAlpha);
1332
			} else {
1333
				if (constantAlphaGL == BlendFactor::CONSTANT_ALPHA) {
1334
					blendState.defaultBlendColor(constantAlpha);
1335
				}
1336
			}
1337
		}
1338
	} else {
1339
		if (constantAlphaGL == BlendFactor::CONSTANT_ALPHA) {
1340
			blendState.defaultBlendColor(constantAlpha);
1341
		}
1342
	}
1343

1344
	// Some Android devices (especially old Mali, it seems) composite badly if there's alpha in the backbuffer.
1345
	// So in non-buffered rendering, we will simply consider the dest alpha to be zero in blending equations.
1346
#if PPSSPP_PLATFORM(ANDROID)
1347
	if (g_Config.bSkipBufferEffects) {
1348
		if (glBlendFuncA == BlendFactor::DST_ALPHA) glBlendFuncA = BlendFactor::ZERO;
1349
		if (glBlendFuncB == BlendFactor::DST_ALPHA) glBlendFuncB = BlendFactor::ZERO;
1350
		if (glBlendFuncA == BlendFactor::ONE_MINUS_DST_ALPHA) glBlendFuncA = BlendFactor::ONE;
1351
		if (glBlendFuncB == BlendFactor::ONE_MINUS_DST_ALPHA) glBlendFuncB = BlendFactor::ONE;
1352
	}
1353
#endif
1354

1355
	// At this point, through all paths above, glBlendFuncA and glBlendFuncB will be set right somehow.
1356
	BlendEq colorEq;
1357
	if (gstate_c.Use(GPU_USE_BLEND_MINMAX)) {
1358
		colorEq = eqLookup[blendFuncEq];
1359
	} else {
1360
		colorEq = eqLookupNoMinMax[blendFuncEq];
1361
	}
1362

1363
	// The stencil-to-alpha in fragment shader doesn't apply here (blending is enabled), and we shouldn't
1364
	// do any blending in the alpha channel as that doesn't seem to happen on PSP.  So, we attempt to
1365
	// apply the stencil to the alpha, since that's what should be stored.
1366
	BlendEq alphaEq = BlendEq::ADD;
1367
	if (replaceAlphaWithStencil != REPLACE_ALPHA_NO) {
1368
		// Let the fragment shader take care of it.
1369
		switch (ReplaceAlphaWithStencilType()) {
1370
		case STENCIL_VALUE_INCR_4:
1371
		case STENCIL_VALUE_INCR_8:
1372
			// We'll add the increment value.
1373
			blendState.setFactors(glBlendFuncA, glBlendFuncB, BlendFactor::ONE, BlendFactor::ONE);
1374
			break;
1375

1376
		case STENCIL_VALUE_DECR_4:
1377
		case STENCIL_VALUE_DECR_8:
1378
			// Like add with a small value, but subtracting.
1379
			blendState.setFactors(glBlendFuncA, glBlendFuncB, BlendFactor::ONE, BlendFactor::ONE);
1380
			alphaEq = BlendEq::SUBTRACT;
1381
			break;
1382

1383
		case STENCIL_VALUE_INVERT:
1384
			// This will subtract by one, effectively inverting the bits.
1385
			blendState.setFactors(glBlendFuncA, glBlendFuncB, BlendFactor::ONE, BlendFactor::ONE);
1386
			alphaEq = BlendEq::REVERSE_SUBTRACT;
1387
			break;
1388

1389
		default:
1390
			blendState.setFactors(glBlendFuncA, glBlendFuncB, BlendFactor::ONE, BlendFactor::ZERO);
1391
			break;
1392
		}
1393
	} else if (!IsStencilTestOutputDisabled()) {
1394
		StencilValueType stencilValue = ReplaceAlphaWithStencilType();
1395
		if (stencilValue == STENCIL_VALUE_UNIFORM && constantAlpha == 0x00) {
1396
			stencilValue = STENCIL_VALUE_ZERO;
1397
		} else if (stencilValue == STENCIL_VALUE_UNIFORM && constantAlpha == 0xFF) {
1398
			stencilValue = STENCIL_VALUE_ONE;
1399
		}
1400
		switch (stencilValue) {
1401
		case STENCIL_VALUE_KEEP:
1402
			blendState.setFactors(glBlendFuncA, glBlendFuncB, BlendFactor::ZERO, BlendFactor::ONE);
1403
			break;
1404
		case STENCIL_VALUE_ONE:
1405
			// This won't give one but it's our best shot...
1406
			blendState.setFactors(glBlendFuncA, glBlendFuncB, BlendFactor::ONE, BlendFactor::ONE);
1407
			break;
1408
		case STENCIL_VALUE_ZERO:
1409
			blendState.setFactors(glBlendFuncA, glBlendFuncB, BlendFactor::ZERO, BlendFactor::ZERO);
1410
			break;
1411
		case STENCIL_VALUE_UNIFORM:
1412
			// This won't give a correct value (it multiplies) but it may be better than random values.
1413
			blendState.setFactors(glBlendFuncA, glBlendFuncB, constantAlphaGL, BlendFactor::ZERO);
1414
			break;
1415
		case STENCIL_VALUE_INCR_4:
1416
		case STENCIL_VALUE_INCR_8:
1417
			// This won't give a correct value always, but it will try to increase at least.
1418
			blendState.setFactors(glBlendFuncA, glBlendFuncB, constantAlphaGL, BlendFactor::ONE);
1419
			break;
1420
		case STENCIL_VALUE_DECR_4:
1421
		case STENCIL_VALUE_DECR_8:
1422
			// This won't give a correct value always, but it will try to decrease at least.
1423
			blendState.setFactors(glBlendFuncA, glBlendFuncB, constantAlphaGL, BlendFactor::ONE);
1424
			alphaEq = BlendEq::SUBTRACT;
1425
			break;
1426
		case STENCIL_VALUE_INVERT:
1427
			blendState.setFactors(glBlendFuncA, glBlendFuncB, BlendFactor::ONE, BlendFactor::ONE);
1428
			// If the output alpha is near 1, this will basically invert.  It's our best shot.
1429
			alphaEq = BlendEq::REVERSE_SUBTRACT;
1430
			break;
1431
		}
1432
	} else if (blueToAlpha) {
1433
		blendState.setFactors(BlendFactor::ZERO, BlendFactor::ZERO, BlendFactor::ONE, glBlendFuncB);
1434
		blendState.setEquation(BlendEq::ADD, colorEq);
1435
		return;
1436
	} else {
1437
		// Retain the existing value when stencil testing is off.
1438
		blendState.setFactors(glBlendFuncA, glBlendFuncB, BlendFactor::ZERO, BlendFactor::ONE);
1439
	}
1440

1441
	blendState.setEquation(colorEq, alphaEq);
1442
}
1443

1444
// Decides how the current logic op gets applied and records it in logicOpState.
//
// logicSupported:        hardware logic ops may be used.
// shaderBitOpsSupported: the shader can do bitwise ops, so it can emulate the logic op
//                        through a framebuffer read.
// useFBRead:             whether a framebuffer read is forced, allowed or disallowed.
//
// Outcomes: no logic op at all, hardware logic op (logicOpEnabled), shader emulation
// (applyFramebufferRead with logicOp set), or nothing here so the simulate fallback applies.
static void ConvertLogicOpState(GenericLogicState &logicOpState, bool logicSupported, bool shaderBitOpsSupported, FBReadSetting useFBRead) {
	// TODO: We can get more detailed with checks here. Some logic ops don't involve the destination at all.
	// Several can be trivially supported even without any bitwise logic.
	if (!gstate.isLogicOpEnabled() || gstate.getLogicOp() == GE_LOGIC_COPY) {
		// No matter what, don't need to do anything.
		logicOpState.logicOpEnabled = false;
		logicOpState.logicOp = GE_LOGIC_COPY;
		logicOpState.applyFramebufferRead = useFBRead == FBReadSetting::Forced;
		return;
	}

	// TODO: Brave story uses GE_INVERTED, this is easy to convert to a blend function - unless blend is also enabled simultaneously.

	if (useFBRead == FBReadSetting::Forced && shaderBitOpsSupported) {
		// We have to emulate logic ops in the shader.
		logicOpState.logicOpEnabled = false;  // Don't use any hardware logic op, supported or not.
		logicOpState.applyFramebufferRead = true;
		logicOpState.logicOp = gstate.getLogicOp();
	} else if (logicSupported) {
		// We can use hardware logic ops directly.  The early return above already handled the
		// disabled and COPY cases, so the logic op is known to be enabled at this point.
		logicOpState.applyFramebufferRead = false;
		logicOpState.logicOpEnabled = true;
		logicOpState.logicOp = gstate.getLogicOp();
	} else if (shaderBitOpsSupported && useFBRead != FBReadSetting::Disallowed) {
		// D3D11 and some OpenGL versions will end up here.
		// Logic ops not supported, bitops supported. Let's punt to the shader.
		// We should possibly always do this and never use the hardware ops, since they'll mishandle the alpha channel..
		logicOpState.logicOpEnabled = false;  // Don't use any hardware logic op, supported or not.
		logicOpState.applyFramebufferRead = true;
		logicOpState.logicOp = gstate.getLogicOp();
	} else {
		// In this case, the SIMULATE fallback should kick in.
		// Need to make sure this is checking for the same things though...
		logicOpState.logicOpEnabled = false;
		logicOpState.logicOp = GE_LOGIC_COPY;
		logicOpState.applyFramebufferRead = false;
	}
}
1487

1488
// Rewrites the stencil test and ops in state for 5551 framebuffers, where the buffer value is
// treated as either zero or 255.  Comparisons are reduced to "== 0", "!= 0", ALWAYS or NEVER
// where that is safe, and some stencil ops are swapped for simpler equivalents.
static void ConvertStencilFunc5551(GenericStencilFuncState &state) {
	// Flaws:
	// - INVERT should convert 1, 5, 0xFF to 0.  Currently it won't always.
	// - INCR twice shouldn't change the value.
	// - REPLACE should write 0 for 0x00 - 0x7F, and non-zero for 0x80 - 0xFF.
	// - Write mask may need double checking, but likely only the top bit matters.

	// Whether any op is REPLACE, i.e. whether the ref value also ends up being written.
	const bool usesRef = state.sFail == GE_STENCILOP_REPLACE || state.zFail == GE_STENCILOP_REPLACE || state.zPass == GE_STENCILOP_REPLACE;
	const u8 maskedRef = state.testRef & state.testMask;
	// The ref collapsed to what REPLACE should store: 0xFF when the top bit is set, else 0x00.
	const u8 usedRef = (state.testRef & 0x80) != 0 ? 0xFF : 0x00;

	auto rewriteFunc = [&](GEComparison func, u8 ref) {
		// We can only safely rewrite if it doesn't use the ref, or if the ref is the same.
		if (!usesRef || usedRef == ref) {
			state.testFunc = func;
			state.testRef = ref;
			state.testMask = 0xFF;
		}
	};
	auto rewriteRef = [&](bool always) {
		state.testFunc = always ? GE_COMP_ALWAYS : GE_COMP_NEVER;
		if (usesRef) {
			// Rewrite the ref (for REPLACE) to 0x00 or 0xFF (the "best" values) if safe.
			// This will only be called if the test doesn't need the ref.
			state.testRef = usedRef;
			// Nuke the mask as well, since this is always/never, just for consistency.
			state.testMask = 0xFF;
		} else {
			// Not used, so let's make the ref 0xFF which is a useful value later.
			state.testRef = 0xFF;
			state.testMask = 0xFF;
		}
	};

	// For 5551, we treat any non-zero value in the buffer as 255.  Only zero is treated as zero.
	// See: https://github.com/hrydgard/ppsspp/pull/4150#issuecomment-26211193
	switch (state.testFunc) {
	case GE_COMP_NEVER:
	case GE_COMP_ALWAYS:
		// Fine as is.
		rewriteRef(state.testFunc == GE_COMP_ALWAYS);
		break;
	case GE_COMP_EQUAL: // maskedRef == maskedBuffer
		if (maskedRef == 0) {
			// Remove any mask, we might have bits less than 255 but that should not match.
			rewriteFunc(GE_COMP_EQUAL, 0);
		} else if (maskedRef == (0xFF & state.testMask) && state.testMask != 0) {
			// Equal to 255, for our buffer, means not equal to zero.
			rewriteFunc(GE_COMP_NOTEQUAL, 0);
		} else {
			// This should never pass, regardless of buffer value.  Only 0 and 255 are directly equal.
			rewriteRef(false);
		}
		break;
	case GE_COMP_NOTEQUAL: // maskedRef != maskedBuffer
		if (maskedRef == 0) {
			// Remove the mask, since our buffer might not be exactly 255.
			rewriteFunc(GE_COMP_NOTEQUAL, 0);
		} else if (maskedRef == (0xFF & state.testMask) && state.testMask != 0) {
			// The only value != 255 is 0, in our buffer.
			rewriteFunc(GE_COMP_EQUAL, 0);
		} else {
			// Every other value evaluates as not equal, always.
			rewriteRef(true);
		}
		break;
	case GE_COMP_LESS: // maskedRef < maskedBuffer
		if (maskedRef == (0xFF & state.testMask) && state.testMask != 0) {
			// No possible value is less than 255.
			rewriteRef(false);
		} else {
			// "0 < (0 or 255)" and "254 < (0 or 255)" can only work for non zero.
			rewriteFunc(GE_COMP_NOTEQUAL, 0);
		}
		break;
	case GE_COMP_LEQUAL: // maskedRef <= maskedBuffer
		if (maskedRef == 0) {
			// 0 is <= every possible value.
			rewriteRef(true);
		} else {
			// "1 <= (0 or 255)" and "255 <= (0 or 255)" simply mean, anything but zero.
			rewriteFunc(GE_COMP_NOTEQUAL, 0);
		}
		break;
	case GE_COMP_GREATER: // maskedRef > maskedBuffer
		if (maskedRef > 0) {
			// "1 > (0 or 255)" and "255 > (0 or 255)" can only match 0.
			rewriteFunc(GE_COMP_EQUAL, 0);
		} else {
			// 0 is never greater than any possible value.
			rewriteRef(false);
		}
		break;
	case GE_COMP_GEQUAL: // maskedRef >= maskedBuffer
		if (maskedRef == (0xFF & state.testMask) && state.testMask != 0) {
			// 255 is >= every possible value.
			rewriteRef(true);
		} else {
			// "0 >= (0 or 255)" and "254 >= (0 or 255)" are the same, equal to zero.
			rewriteFunc(GE_COMP_EQUAL, 0);
		}
		break;
	}

	// Replaces every occurrence of the stencil op `from` among sFail/zFail/zPass with `to`.
	auto rewriteOps = [&](GEStencilOp from, GEStencilOp to) {
		if (state.sFail == from)
			state.sFail = to;
		if (state.zFail == from)
			state.zFail = to;
		if (state.zPass == from)
			state.zPass = to;
	};

	// Decrement always zeros, so let's rewrite those to be safe (even if it's not 1.)
	rewriteOps(GE_STENCILOP_DECR, GE_STENCILOP_ZERO);

	if (state.testFunc == GE_COMP_NOTEQUAL && state.testRef == 0 && state.testMask != 0) {
		// If it's != 0 (as optimized above), then we can rewrite INVERT to ZERO.
		// With 1 bit of stencil, INVERT != 0 can only make it 0.
		rewriteOps(GE_STENCILOP_INVERT, GE_STENCILOP_ZERO);
	}
	if (state.testFunc == GE_COMP_EQUAL && state.testRef == 0 && state.testMask != 0) {
		// If it's == 0 (as optimized above), then we can rewrite INCR to INVERT.
		// Otherwise we get 1, which we mostly handle, but won't INVERT correctly.
		rewriteOps(GE_STENCILOP_INCR, GE_STENCILOP_INVERT);
	}
	if (!usesRef && state.testRef == 0xFF) {
		// Safe to use REPLACE instead of INCR.
		rewriteOps(GE_STENCILOP_INCR, GE_STENCILOP_REPLACE);
	}
}
1619

1620
// Collapses the stencil write mask to all-or-nothing for the 5551 format:
// a mask value of 0x80 or above becomes 0xff, anything lower becomes 0x00.
static void ConvertStencilMask5551(GenericStencilFuncState &state) {
	if (state.writeMask >= 0x80) {
		state.writeMask = 0xff;
	} else {
		state.writeMask = 0x00;
	}
}
1623

1624
// Fills `state` from the current gstate stencil registers, then adjusts the result
// for the framebuffer format recorded in gstate_c.
void ConvertStencilFuncState(GenericStencilFuncState &state) {
	state.enabled = gstate.isStencilTestEnabled();
	// The PSP stores the bits that must NOT be written, so flip it around.
	// This is filled in even when the test is off, because clears use the mask too.
	state.writeMask = (~gstate.getStencilWriteMask()) & 0xFF;

	if (!state.enabled) {
		// Nothing but the write mask is needed; 5551 still wants it collapsed.
		if (gstate_c.framebufFormat == GE_FORMAT_5551) {
			ConvertStencilMask5551(state);
		}
		return;
	}

	// Test setup.
	state.testFunc = gstate.getStencilTestFunction();
	state.testRef = gstate.getStencilTestRef();
	state.testMask = gstate.getStencilTestMask();

	// Operations for each outcome.
	state.sFail = gstate.getStencilOpSFail();
	state.zFail = gstate.getStencilOpZFail();
	state.zPass = gstate.getStencilOpZPass();

	// When every reachable outcome performs the same op, the draw is typically a
	// stencil-only write (usually REPLACE/REPLACE/REPLACE.)
	const bool depthTest = gstate.isDepthTestEnabled();
	const bool zFailSameAsSFail = !depthTest || state.sFail == state.zFail;
	const bool zPassSameAsSFail = state.sFail == state.zPass;
	if (zFailSameAsSFail && zPassSameAsSFail) {
		// The stencil should end up in alpha here, so if color is already masked off
		// and depth won't be written, force the test to ALWAYS.
		const bool depthWrite = gstate.isDepthWriteEnabled();
		const bool rgbMasked = (gstate.getColorMask() & 0x00FFFFFF) == 0x00FFFFFF;
		const bool depthGetsWritten = depthTest && depthWrite;
		if (rgbMasked && !depthGetsWritten) {
			state.testFunc = GE_COMP_ALWAYS;
		}
	}

	if (gstate_c.framebufFormat == GE_FORMAT_565) {
		// No writable stencil bits for 565.
		state.writeMask = 0;
	} else if (gstate_c.framebufFormat == GE_FORMAT_5551) {
		// Mask first, then the function/op simplification for 5551.
		ConvertStencilMask5551(state);
		ConvertStencilFunc5551(state);
	}
	// Any other format is left alone: not much useful to do for 4444, and 8888 is fine as is.
}
1667

1668
void GenericMaskState::Log() {
1669
	WARN_LOG(Log::G3D, "Mask: %08x %01X readfb=%d", uniformMask, channelMask, applyFramebufferRead);
1670
}
1671

1672
void GenericBlendState::Log() {
1673
	WARN_LOG(Log::G3D, "Blend: hwenable=%d readfb=%d replblend=%d replalpha=%d",
1674
		blendEnabled, applyFramebufferRead, replaceBlend, (int)replaceAlphaWithStencil);
1675
}
1676

1677
void ComputedPipelineState::Convert(bool shaderBitOpsSupported, bool fbReadAllowed) {
1678
	// Passing on the previous applyFramebufferRead as forceFrameBuffer read in the next one,
1679
	// thus propagating forward.
1680
	FBReadSetting readFB = (fbReadAllowed && shaderBitOpsSupported) ? FBReadSetting::Allowed : FBReadSetting::Disallowed;
1681
	ConvertMaskState(maskState, readFB);
1682
	readFB = maskState.applyFramebufferRead ? FBReadSetting::Forced : (fbReadAllowed ? FBReadSetting::Allowed : FBReadSetting::Disallowed);
1683
	ConvertLogicOpState(logicState, gstate_c.Use(GPU_USE_LOGIC_OP), shaderBitOpsSupported, readFB);
1684
	readFB = logicState.applyFramebufferRead ? FBReadSetting::Forced : (fbReadAllowed ? FBReadSetting::Allowed : FBReadSetting::Disallowed);
1685
	ConvertBlendState(blendState, readFB);
1686

1687
	// Note: If the blend state decided it had to use framebuffer reads,
1688
	// we need to make sure that both mask and logic also use it, otherwise things will go wrong.
1689
	if (blendState.applyFramebufferRead || logicState.applyFramebufferRead) {
1690
		_dbg_assert_(fbReadAllowed);
1691
		maskState.ConvertToShaderBlend();
1692
		logicState.ConvertToShaderBlend();
1693
	} else {
1694
		// If it isn't a read, we may need to change blending to apply the logic op.
1695
		logicState.ApplyToBlendState(blendState);
1696
	}
1697
}
1698

1699
// Gives SimulateLogicOpIfNeeded the chance to adjust the color blend factors and equation.
// When it returns true, blending is switched on if it was off, and this state's
// own logic op is reset to a disabled plain copy.
void GenericLogicState::ApplyToBlendState(GenericBlendState &blendState) {
	if (!SimulateLogicOpIfNeeded(blendState.srcColor, blendState.dstColor, blendState.eqColor)) {
		// Blend state and logic op both stay as they were.
		return;
	}

	if (!blendState.blendEnabled) {
		// Blending was off, so enable it and give alpha ONE/ZERO/ADD settings.
		blendState.blendEnabled = true;
		blendState.eqAlpha = BlendEq::ADD;
		blendState.srcAlpha = BlendFactor::ONE;
		blendState.dstAlpha = BlendFactor::ZERO;
	}

	logicOp = GE_LOGIC_COPY;
	logicOpEnabled = false;
}
1712

1713
Product

Resources

Company