CoCalc -- FragmentShaderGenerator.cpp

GitHub Repository: hrydgard/ppsspp
Path: blob/master/GPU/Common/FragmentShaderGenerator.cpp
³¹⁸⁶ views
1
// Copyright (c) 2012- PPSSPP Project.
2

3
// This program is free software: you can redistribute it and/or modify
4
// it under the terms of the GNU General Public License as published by
5
// the Free Software Foundation, version 2.0 or later versions.
6

7
// This program is distributed in the hope that it will be useful,
8
// but WITHOUT ANY WARRANTY; without even the implied warranty of
9
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
// GNU General Public License 2.0 for more details.
11

12
// A copy of the GPL 2.0 should have been included with the program.
13
// If not, see http://www.gnu.org/licenses/
14

15
// Official git repository and contact information can be found at
16
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17

18
#include <cstdio>
19
#include <sstream>
20

21
#include "Common/Log.h"
22
#include "Common/StringUtils.h"
23
#include "Common/GPU/OpenGL/GLFeatures.h"
24
#include "Common/GPU/ShaderWriter.h"
25
#include "Common/GPU/thin3d.h"
26
#include "Core/Compatibility.h"
27
#include "Core/Config.h"
28
#include "Core/System.h"
29
#include "GPU/Common/GPUStateUtils.h"
30
#include "GPU/Common/ShaderId.h"
31
#include "GPU/Common/ShaderUniforms.h"
32
#include "GPU/Common/FragmentShaderGenerator.h"
33
#include "GPU/Vulkan/DrawEngineVulkan.h"
34
#include "GPU/ge_constants.h"
35
#include "GPU/GPUState.h"
36

37
// Convenience wrapper: WRITE(p, fmt, ...) expands to p.F(fmt, ...), i.e. it appends
// printf-style formatted text through the writer object `p` (a ShaderWriter in this file).
#define WRITE(p, ...) p.F(__VA_ARGS__)
38

39
// Sampler metadata passed to ShaderWriter::ApplySamplerMetadata() when the shader ID does NOT
// have FS_BIT_SAMPLE_ARRAY_TEXTURE set. Each entry pairs a numeric index (presumably the
// sampler slot - confirm against SamplerDef) with the sampler name used in the generated
// shader source: "tex" (main texture), "fbotex" (framebuffer copy read for shader blending /
// masking) and "pal" (palette used by shader depal).
// Only "fbotex" is flagged ARRAY_ON_VULKAN here, matching the Vulkan path which always
// declares fbotex as a sampler2DArray.
static const SamplerDef samplersMono[3] = {
40
	{ 0, "tex" },
41
	{ 1, "fbotex", SamplerFlags::ARRAY_ON_VULKAN },
42
	{ 2, "pal" },
43
};
44

45
// Sampler metadata passed to ShaderWriter::ApplySamplerMetadata() when the shader ID has
// FS_BIT_SAMPLE_ARRAY_TEXTURE set. Note that selection is driven by that array-texture bit,
// not directly by FS_BIT_STEREO, despite the name.
// Differs from samplersMono only in that "tex" is also flagged ARRAY_ON_VULKAN, matching the
// Vulkan path which declares tex as a sampler2DArray in this mode. "pal" stays a plain
// (non-array) sampler.
static const SamplerDef samplersStereo[3] = {
46
	{ 0, "tex", SamplerFlags::ARRAY_ON_VULKAN },
47
	{ 1, "fbotex", SamplerFlags::ARRAY_ON_VULKAN },
48
	{ 2, "pal" },
49
};
50

51
bool GenerateFragmentShader(const FShaderID &id, char *buffer, const ShaderLanguageDesc &compat, Draw::Bugs bugs, uint64_t *uniformMask, FragmentShaderFlags *fragmentShaderFlags, std::string *errorString) {
52
	*uniformMask = 0;
53
	*fragmentShaderFlags = (FragmentShaderFlags)0;
54
	errorString->clear();
55

56
	bool useStereo = id.Bit(FS_BIT_STEREO);
57
	bool highpFog = false;
58
	bool highpTexcoord = false;
59
	bool enableFragmentTestCache = gstate_c.Use(GPU_USE_FRAGMENT_TEST_CACHE);
60

61
	if (compat.gles) {
62
		// PowerVR needs highp to do the fog in MHU correctly.
63
		// Others don't, and some can't handle highp in the fragment shader.
64
		highpFog = (gl_extensions.bugs & BUG_PVR_SHADER_PRECISION_BAD) ? true : false;
65
		highpTexcoord = highpFog;
66
	}
67

68
	bool texture3D = id.Bit(FS_BIT_3D_TEXTURE);
69
	bool arrayTexture = id.Bit(FS_BIT_SAMPLE_ARRAY_TEXTURE);
70

71
	ReplaceAlphaType stencilToAlpha = static_cast<ReplaceAlphaType>(id.Bits(FS_BIT_STENCIL_TO_ALPHA, 2));
72

73
	std::vector<const char*> extensions;
74
	if (ShaderLanguageIsOpenGL(compat.shaderLanguage)) {
75
		if (stencilToAlpha == REPLACE_ALPHA_DUALSOURCE && gl_extensions.EXT_blend_func_extended) {
76
			extensions.push_back("#extension GL_EXT_blend_func_extended : require");
77
		}
78
		if (gl_extensions.EXT_gpu_shader4) {
79
			extensions.push_back("#extension GL_EXT_gpu_shader4 : enable");
80
		}
81
		if (compat.framebufferFetchExtension) {
82
			extensions.push_back(compat.framebufferFetchExtension);
83
		}
84
		if (gl_extensions.OES_texture_3D && texture3D) {
85
			extensions.push_back("#extension GL_OES_texture_3D: enable");
86
		}
87
	} 
88

89
	ShaderWriterFlags flags = ShaderWriterFlags::NONE;
90
	if (useStereo) {
91
		flags |= ShaderWriterFlags::FS_AUTO_STEREO;
92
	}
93

94
	ShaderWriter p(buffer, compat, ShaderStage::Fragment, extensions, flags);
95
	p.F("// %s\n", FragmentShaderDesc(id).c_str());
96

97
	p.ApplySamplerMetadata(arrayTexture ? samplersStereo : samplersMono);
98

99
	bool lmode = id.Bit(FS_BIT_LMODE);
100
	bool doTexture = id.Bit(FS_BIT_DO_TEXTURE);
101
	bool enableFog = id.Bit(FS_BIT_ENABLE_FOG);
102
	bool enableAlphaTest = id.Bit(FS_BIT_ALPHA_TEST);
103

104
	bool alphaTestAgainstZero = id.Bit(FS_BIT_ALPHA_AGAINST_ZERO);
105
	bool testForceToZero = id.Bit(FS_BIT_TEST_DISCARD_TO_ZERO);
106
	bool enableColorTest = id.Bit(FS_BIT_COLOR_TEST);
107
	bool colorTestAgainstZero = id.Bit(FS_BIT_COLOR_AGAINST_ZERO);
108
	bool doTextureProjection = id.Bit(FS_BIT_DO_TEXTURE_PROJ);
109

110
	bool ubershader = id.Bit(FS_BIT_UBERSHADER);
111
	// ubershader-controlled bits. If ubershader is on, these will not be used below (and will be false).
112
	bool useTexAlpha = id.Bit(FS_BIT_TEXALPHA);
113
	bool enableColorDouble = id.Bit(FS_BIT_DOUBLE_COLOR);
114

115
	if (texture3D && arrayTexture) {
116
		*errorString = "Invalid combination of 3D texture and array texture, shouldn't happen";
117
		return false;
118
	}
119
	if (compat.shaderLanguage != ShaderLanguage::GLSL_VULKAN && arrayTexture) {
120
		*errorString = "We only do array textures for framebuffers in Vulkan.";
121
		return false;
122
	}
123

124
	bool flatBug = bugs.Has(Draw::Bugs::BROKEN_FLAT_IN_SHADER) && g_Config.bVendorBugChecksEnabled;
125

126
	bool doFlatShading = id.Bit(FS_BIT_FLATSHADE) && !flatBug;
127
	if (doFlatShading) {
128
		*fragmentShaderFlags |= FragmentShaderFlags::USES_FLAT_SHADING;
129
	}
130

131
	ShaderDepalMode shaderDepalMode = (ShaderDepalMode)id.Bits(FS_BIT_SHADER_DEPAL_MODE, 2);
132
	if (texture3D) {
133
		shaderDepalMode = ShaderDepalMode::OFF;
134
	}
135
	if (!compat.bitwiseOps && shaderDepalMode != ShaderDepalMode::OFF) {
136
		*errorString = "depal requires bitwise ops";
137
		return false;
138
	}
139
	bool bgraTexture = id.Bit(FS_BIT_BGRA_TEXTURE);
140
	bool colorWriteMask = id.Bit(FS_BIT_COLOR_WRITEMASK) && compat.bitwiseOps;
141

142
	GEComparison alphaTestFunc = (GEComparison)id.Bits(FS_BIT_ALPHA_TEST_FUNC, 3);
143
	GEComparison colorTestFunc = (GEComparison)id.Bits(FS_BIT_COLOR_TEST_FUNC, 2);
144
	bool needShaderTexClamp = id.Bit(FS_BIT_SHADER_TEX_CLAMP);
145

146
	GETexFunc texFunc = (GETexFunc)id.Bits(FS_BIT_TEXFUNC, 3);
147

148
	ReplaceBlendType replaceBlend = static_cast<ReplaceBlendType>(id.Bits(FS_BIT_REPLACE_BLEND, 3));
149

150
	bool blueToAlpha = false;
151
	if (replaceBlend == ReplaceBlendType::REPLACE_BLEND_BLUE_TO_ALPHA) {
152
		blueToAlpha = true;
153
	}
154

155
	bool isModeClear = id.Bit(FS_BIT_CLEARMODE);
156

157
	const char *shading = "";
158
	if (compat.glslES30 || compat.shaderLanguage == ShaderLanguage::GLSL_VULKAN) {
159
		shading = doFlatShading ? "flat" : "";
160
	}
161

162
	bool forceDepthWritesOff = id.Bit(FS_BIT_DEPTH_TEST_NEVER);
163

164
	bool useDiscardStencilBugWorkaround = id.Bit(FS_BIT_NO_DEPTH_CANNOT_DISCARD_STENCIL) && !forceDepthWritesOff;
165

166
	GEBlendSrcFactor replaceBlendFuncA = (GEBlendSrcFactor)id.Bits(FS_BIT_BLENDFUNC_A, 4);
167
	GEBlendDstFactor replaceBlendFuncB = (GEBlendDstFactor)id.Bits(FS_BIT_BLENDFUNC_B, 4);
168
	GEBlendMode replaceBlendEq = (GEBlendMode)id.Bits(FS_BIT_BLENDEQ, 3);
169
	StencilValueType replaceAlphaWithStencilType = (StencilValueType)id.Bits(FS_BIT_REPLACE_ALPHA_WITH_STENCIL_TYPE, 4);
170

171
	// Distinct from the logic op simulation support.
172
	GELogicOp replaceLogicOpType = isModeClear ? GE_LOGIC_COPY : (GELogicOp)id.Bits(FS_BIT_REPLACE_LOGIC_OP, 4);
173
	bool replaceLogicOp = replaceLogicOpType != GE_LOGIC_COPY && compat.bitwiseOps;
174

175
	bool needFramebufferRead = replaceBlend == REPLACE_BLEND_READ_FRAMEBUFFER || colorWriteMask || replaceLogicOp;
176

177
	bool fetchFramebuffer = needFramebufferRead && id.Bit(FS_BIT_USE_FRAMEBUFFER_FETCH);
178
	bool readFramebufferTex = needFramebufferRead && !id.Bit(FS_BIT_USE_FRAMEBUFFER_FETCH);
179

180
	if (fetchFramebuffer && (compat.shaderLanguage != GLSL_3xx || !compat.lastFragData)) {
181
		*errorString = "framebuffer fetch requires GLSL 3xx";
182
		return false;
183
	}
184

185
	bool needFragCoord = readFramebufferTex || gstate_c.Use(GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT);
186
	bool writeDepth = gstate_c.Use(GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT) && !forceDepthWritesOff;
187

188
	// TODO: We could have a separate mechanism to support more ops using the shader blending mechanism,
189
// on hardware that can do proper bit math in fragment shaders.
190
	SimulateLogicOpType simulateLogicOpType = (SimulateLogicOpType)id.Bits(FS_BIT_SIMULATE_LOGIC_OP_TYPE, 2);
191

192
	if (shaderDepalMode != ShaderDepalMode::OFF && !doTexture) {
193
		*errorString = "depal requires a texture";
194
		return false;
195
	}
196

197
	// Currently only used by Vulkan.
198
	std::vector<SamplerDef> samplers;
199

200
	if (compat.shaderLanguage == ShaderLanguage::GLSL_VULKAN) {
201
		if (useDiscardStencilBugWorkaround && !writeDepth) {
202
			WRITE(p, "layout (depth_unchanged) out float gl_FragDepth;\n");
203
		}
204

205
		WRITE(p, "layout (std140, set = 0, binding = %d) uniform baseUBO {\n%s};\n", DRAW_BINDING_DYNUBO_BASE, ub_baseStr);
206
		if (doTexture) {
207
			WRITE(p, "layout (set = 0, binding = %d) uniform %s%s tex;\n", DRAW_BINDING_TEXTURE, texture3D ? "sampler3D" : "sampler2D", arrayTexture ? "Array" : "");
208
		}
209

210
		if (readFramebufferTex) {
211
			// The framebuffer texture is always bound as an array.
212
			p.F("layout (set = 0, binding = %d) uniform sampler2DArray fbotex;\n", DRAW_BINDING_2ND_TEXTURE);
213
		}
214

215
		if (shaderDepalMode != ShaderDepalMode::OFF) {
216
			WRITE(p, "layout (set = 0, binding = %d) uniform sampler2D pal;\n", DRAW_BINDING_DEPAL_TEXTURE);
217
		}
218

219
		// Note: the precision qualifiers must match the vertex shader!
220
		WRITE(p, "layout (location = 1) %s in lowp vec4 v_color0;\n", shading);
221
		if (lmode) {
222
			WRITE(p, "layout (location = 2) %s in lowp vec3 v_color1;\n", shading);
223
		}
224
		WRITE(p, "layout (location = 3) in highp float v_fogdepth;\n");
225
		if (doTexture) {
226
			WRITE(p, "layout (location = 0) in highp vec3 v_texcoord;\n");
227
		}
228

229
		if (enableAlphaTest && !alphaTestAgainstZero) {
230
			WRITE(p, "int roundAndScaleTo255i(in highp float x) { return int(floor(x * 255.0 + 0.5)); }\n");
231
		}
232
		if (enableColorTest && !colorTestAgainstZero) {
233
			WRITE(p, "uint roundAndScaleTo8x4(in highp vec3 x) { uvec3 u = uvec3(floor(x * 255.0 + 0.5)); return u.r | (u.g << 8) | (u.b << 16); }\n");
234
			WRITE(p, "uint packFloatsTo8x4(in vec3 x) { uvec3 u = uvec3(x); return u.r | (u.g << 8) | (u.b << 16); }\n");
235
		}
236

237
		WRITE(p, "layout (location = 0, index = 0) out vec4 fragColor0;\n");
238
		if (stencilToAlpha == REPLACE_ALPHA_DUALSOURCE) {
239
			WRITE(p, "layout (location = 0, index = 1) out vec4 fragColor1;\n");
240
		}
241
	} else if (compat.shaderLanguage == HLSL_D3D11) {
242
		{
243
			WRITE(p, "SamplerState texSamp : register(s0);\n");
244
			if (texture3D) {
245
				WRITE(p, "Texture3D<vec4> tex : register(t0);\n");
246
			} else {
247
				WRITE(p, "Texture2D<vec4> tex : register(t0);\n");
248
			}
249
			if (readFramebufferTex) {
250
				// No sampler required, we Load
251
				WRITE(p, "Texture2D<vec4> fbotex : register(t1);\n");
252
			}
253

254
			if (shaderDepalMode != ShaderDepalMode::OFF) {
255
				WRITE(p, "SamplerState palSamp : register(s3);\n");
256
				WRITE(p, "Texture2D<vec4> pal : register(t3);\n");
257
				WRITE(p, "float2 textureSize(Texture2D<float4> tex, int mip) { float2 size; tex.GetDimensions(size.x, size.y); return size; }\n");
258
			}
259

260
			WRITE(p, "cbuffer base : register(b0) {\n%s};\n", ub_baseStr);
261
		}
262

263
		if (enableAlphaTest) {
264
			if (compat.shaderLanguage == HLSL_D3D11) {
265
				WRITE(p, "int roundAndScaleTo255i(float x) { return int(floor(x * 255.0f + 0.5f)); }\n");
266
			} else {
267
				// D3D11 level 9 gets to take this path.
268
				WRITE(p, "float roundAndScaleTo255f(float x) { return floor(x * 255.0f + 0.5f); }\n");
269
			}
270
		}
271
		if (enableColorTest) {
272
			if (compat.shaderLanguage == HLSL_D3D11) {
273
				WRITE(p, "uint roundAndScaleTo8x4(float3 x) { uvec3 u = (floor(x * 255.0f + 0.5f)); return u.r | (u.g << 8) | (u.b << 16); }\n");
274
				WRITE(p, "uint packFloatsTo8x4(in vec3 x) { uvec3 u = uvec3(x); return u.r | (u.g << 8) | (u.b << 16); }\n");
275
			} else {
276
				WRITE(p, "vec3 roundAndScaleTo255v(float3 x) { return floor(x * 255.0f + 0.5f); }\n");
277
			}
278
		}
279

280
		WRITE(p, "struct PS_IN {\n");
281
		if (doTexture || compat.shaderLanguage == HLSL_D3D11) {
282
			// In D3D11, if we always have a texcoord in the VS, we always need it in the PS too for the structs to match.
283
			WRITE(p, "  vec3 v_texcoord: TEXCOORD0;\n");
284
		}
285
		const char *colorInterpolation = doFlatShading && compat.shaderLanguage == HLSL_D3D11 ? "nointerpolation " : "";
286
		WRITE(p, "  %svec4 v_color0: COLOR0;\n", colorInterpolation);
287
		if (lmode) {
288
			WRITE(p, "  vec3 v_color1: COLOR1;\n");
289
		}
290
		WRITE(p, "  float v_fogdepth: TEXCOORD1;\n");
291
		if (needFragCoord) {
292
			if (compat.shaderLanguage == HLSL_D3D11) {
293
				WRITE(p, "  vec4 pixelPos : SV_POSITION;\n");
294
			}
295
		}
296
		WRITE(p, "};\n");
297

298
		if (compat.shaderLanguage == HLSL_D3D11) {
299
			WRITE(p, "struct PS_OUT {\n");
300
			if (stencilToAlpha == REPLACE_ALPHA_DUALSOURCE) {
301
				WRITE(p, "  vec4 target : SV_Target0;\n");
302
				WRITE(p, "  vec4 target1 : SV_Target1;\n");
303
			} else {
304
				WRITE(p, "  vec4 target : SV_Target;\n");
305
			}
306
			if (writeDepth) {
307
				WRITE(p, "  float depth : SV_Depth;\n");
308
			}
309
			WRITE(p, "};\n");
310
		}
311
	} else if (ShaderLanguageIsOpenGL(compat.shaderLanguage)) {
312
		if ((shaderDepalMode != ShaderDepalMode::OFF || colorWriteMask) && gl_extensions.IsGLES) {
313
			WRITE(p, "precision highp int;\n");
314
		}
315

316
		if (doTexture) {
317
			if (texture3D) {
318
				// For whatever reason, a precision specifier is required here.
319
				WRITE(p, "uniform lowp sampler3D tex;\n");
320
			} else {
321
				WRITE(p, "uniform sampler2D tex;\n");
322
			}
323
			*uniformMask |= DIRTY_TEX_ALPHA_MUL;
324
			if (ubershader) {
325
				WRITE(p, "uniform vec2 u_texNoAlphaMul;\n");
326
			}
327
		}
328

329
		if (readFramebufferTex) {
330
			if (!compat.texelFetch) {
331
				WRITE(p, "uniform vec2 u_fbotexSize;\n");
332
			}
333
			WRITE(p, "uniform sampler2D fbotex;\n");
334
		}
335

336
		if (!isModeClear && replaceBlend > REPLACE_BLEND_STANDARD) {
337
			*uniformMask |= DIRTY_SHADERBLEND;
338
			if (replaceBlendFuncA >= GE_SRCBLEND_FIXA) {
339
				WRITE(p, "uniform vec3 u_blendFixA;\n");
340
			}
341
			if (replaceBlendFuncB >= GE_DSTBLEND_FIXB) {
342
				WRITE(p, "uniform vec3 u_blendFixB;\n");
343
			}
344
		}
345

346
		if (needShaderTexClamp && doTexture) {
347
			*uniformMask |= DIRTY_TEXCLAMP;
348
			WRITE(p, "uniform vec4 u_texclamp;\n");
349
			WRITE(p, "uniform vec2 u_texclampoff;\n");
350
		}
351

352
		// TODO: Can get rid of some of this in the != 0 cases.
353
		if (enableAlphaTest || enableColorTest) {
354
			if (enableFragmentTestCache) {
355
				WRITE(p, "uniform sampler2D testtex;\n");
356
			} else {
357
				*uniformMask |= DIRTY_ALPHACOLORREF;
358
				if (compat.bitwiseOps) {
359
					WRITE(p, "uniform uint u_alphacolorref;\n");
360
				} else {
361
					WRITE(p, "uniform vec4 u_alphacolorref;\n");
362
				}
363
				if (compat.bitwiseOps && ((enableColorTest && !colorTestAgainstZero) || (enableAlphaTest && !alphaTestAgainstZero))) {
364
					*uniformMask |= DIRTY_ALPHACOLORMASK;
365
					WRITE(p, "uniform uint u_alphacolormask;\n");
366
				}
367
			}
368
		}
369

370
		if (shaderDepalMode != ShaderDepalMode::OFF) {
371
			WRITE(p, "uniform sampler2D pal;\n");
372
			WRITE(p, "uniform uint u_depal_mask_shift_off_fmt;\n");
373
			*uniformMask |= DIRTY_DEPAL;
374
		}
375

376
		if (colorWriteMask) {
377
			WRITE(p, "uniform uint u_colorWriteMask;\n");
378
			*uniformMask |= DIRTY_COLORWRITEMASK;
379
		}
380

381
		if (stencilToAlpha && replaceAlphaWithStencilType == STENCIL_VALUE_UNIFORM) {
382
			*uniformMask |= DIRTY_STENCILREPLACEVALUE;
383
			WRITE(p, "uniform float u_stencilReplaceValue;\n");
384
		}
385
		if (doTexture && texFunc == GE_TEXFUNC_BLEND) {
386
			*uniformMask |= DIRTY_TEXENV;
387
			WRITE(p, "uniform vec3 u_texenv;\n");
388
		}
389

390
		if (texture3D) {
391
			*uniformMask |= DIRTY_MIPBIAS;
392
			WRITE(p, "uniform float u_mipBias;\n");
393
		}
394

395
		WRITE(p, "%s %s lowp vec4 v_color0;\n", shading, compat.varying_fs);
396
		if (lmode) {
397
			WRITE(p, "%s %s lowp vec3 v_color1;\n", shading, compat.varying_fs);
398
		}
399
		if (enableFog) {
400
			*uniformMask |= DIRTY_FOGCOLOR;
401
			WRITE(p, "uniform vec3 u_fogcolor;\n");
402
		}
403
		WRITE(p, "%s %s float v_fogdepth;\n", compat.varying_fs, highpFog ? "highp" : "mediump");
404
		if (doTexture) {
405
			WRITE(p, "%s %s vec3 v_texcoord;\n", compat.varying_fs, highpTexcoord ? "highp" : "mediump");
406
		}
407

408
		if (!enableFragmentTestCache) {
409
			if (enableAlphaTest && !alphaTestAgainstZero) {
410
				if (compat.bitwiseOps) {
411
					WRITE(p, "int roundAndScaleTo255i(in float x) { return int(floor(x * 255.0 + 0.5)); }\n");
412
				} else if (gl_extensions.gpuVendor == GPU_VENDOR_IMGTEC) {
413
					WRITE(p, "float roundTo255thf(in mediump float x) { mediump float y = x + (0.5/255.0); return y - fract(y * 255.0) * (1.0 / 255.0); }\n");
414
				} else {
415
					WRITE(p, "float roundAndScaleTo255f(in float x) { return floor(x * 255.0 + 0.5); }\n");
416
				}
417
			}
418
			if (enableColorTest && !colorTestAgainstZero) {
419
				if (compat.bitwiseOps) {
420
					WRITE(p, "uint roundAndScaleTo8x4(in vec3 x) { uvec3 u = uvec3(floor(x * 255.92)); return u.r | (u.g << 0x8u) | (u.b << 0x10u); }\n");
421
					WRITE(p, "uint packFloatsTo8x4(in vec3 x) { uvec3 u = uvec3(x); return u.r | (u.g << 0x8u) | (u.b << 0x10u); }\n");
422
				} else if (gl_extensions.gpuVendor == GPU_VENDOR_IMGTEC) {
423
					WRITE(p, "vec3 roundTo255thv(in vec3 x) { vec3 y = x + (0.5/255.0); return y - fract(y * 255.0) * (1.0 / 255.0); }\n");
424
				} else {
425
					WRITE(p, "vec3 roundAndScaleTo255v(in vec3 x) { return floor(x * 255.0 + 0.5); }\n");
426
				}
427
			}
428
		}
429

430
		if (!strcmp(compat.fragColor0, "fragColor0")) {
431
			const char *qualifierColor0 = "out";
432
			if (fetchFramebuffer && compat.lastFragData && !strcmp(compat.lastFragData, compat.fragColor0)) {
433
				qualifierColor0 = "inout";
434
			}
435
			// Output the output color definitions.
436
			if (stencilToAlpha == REPLACE_ALPHA_DUALSOURCE) {
437
				WRITE(p, "%s vec4 fragColor0;\n", qualifierColor0);
438
				WRITE(p, "out vec4 fragColor1;\n");
439
			} else {
440
				WRITE(p, "%s vec4 fragColor0;\n", qualifierColor0);
441
			}
442
		}
443
	}
444

445
	bool hasPackUnorm4x8 = false;
446
	if (compat.shaderLanguage == GLSL_VULKAN) {
447
		hasPackUnorm4x8 = true;
448
	} else if (ShaderLanguageIsOpenGL(compat.shaderLanguage)) {
449
		if (compat.gles) {
450
			hasPackUnorm4x8 = compat.glslVersionNumber >= 310;
451
		} else {
452
			hasPackUnorm4x8 = compat.glslVersionNumber >= 400;
453
		}
454
	}
455

456
	const char *packSuffix = "";
457
	if (!hasPackUnorm4x8) {
458
		packSuffix = "R";
459
	}
460

461
	// Provide implementations of packUnorm4x8 and unpackUnorm4x8 if not available.
462
	if ((colorWriteMask || replaceLogicOp) && !hasPackUnorm4x8) {
463
		WRITE(p, "uint packUnorm4x8%s(%svec4 v) {\n", packSuffix, compat.shaderLanguage == GLSL_VULKAN ? "highp " : "");
464
		WRITE(p, "  highp vec4 f = clamp(v, 0.0, 1.0);\n");
465
		WRITE(p, "  uvec4 u = uvec4(255.0 * f);\n");
466
		WRITE(p, "  return u.x | (u.y << 0x8u) | (u.z << 0x10u) | (u.w << 0x18u);\n");
467
		WRITE(p, "}\n");
468

469
		WRITE(p, "vec4 unpackUnorm4x8%s(highp uint x) {\n", packSuffix);
470
		WRITE(p, "  highp uvec4 u = uvec4(x & 0xFFu, (x >> 0x8u) & 0xFFu, (x >> 0x10u) & 0xFFu, (x >> 0x18u) & 0xFFu);\n");
471
		WRITE(p, "  highp vec4 f = vec4(u);\n");
472
		WRITE(p, "  return f * (1.0 / 255.0);\n");
473
		WRITE(p, "}\n");
474
	}
475

476
	if (compat.bitwiseOps && enableColorTest) {
477
		p.C("uvec3 unpackUVec3(highp uint x) {\n");
478
		p.C("  return uvec3(x & 0xFFu, (x >> 0x8u) & 0xFFu, (x >> 0x10u) & 0xFFu);\n");
479
		p.C("}\n");
480
	}
481

482
	// PowerVR needs a custom modulo function. For some reason, this has far higher precision than the builtin one.
483
	if ((gl_extensions.bugs & BUG_PVR_SHADER_PRECISION_BAD) && needShaderTexClamp) {
484
		WRITE(p, "float mymod(float a, float b) { return a - b * floor(a / b); }\n");
485
	}
486

487
	if (compat.shaderLanguage == HLSL_D3D11) {
488
		WRITE(p, "PS_OUT main( PS_IN In ) {\n");
489
		WRITE(p, "  PS_OUT outfragment;\n");
490
		if (needFragCoord) {
491
			WRITE(p, "  vec4 gl_FragCoord = In.pixelPos;\n");
492
		}
493
		if (writeDepth) {
494
			WRITE(p, "  float gl_FragDepth;\n");
495
		}
496
	} else {
497
		WRITE(p, "void main() {\n");
498
	}
499

500
	if (compat.shaderLanguage == HLSL_D3D11) {
501
		WRITE(p, "  vec4 v_color0 = In.v_color0;\n");
502
		if (lmode) {
503
			WRITE(p, "  vec3 v_color1 = In.v_color1;\n");
504
		}
505
		if (enableFog) {
506
			WRITE(p, "  float v_fogdepth = In.v_fogdepth;\n");
507
		}
508
		if (doTexture) {
509
			WRITE(p, "  vec3 v_texcoord = In.v_texcoord;\n");
510
		}
511
	}
512

513
	// Two things read from the old framebuffer - shader replacement blending and bit-level masking.
514
	if (readFramebufferTex) {
515
		if (compat.shaderLanguage == HLSL_D3D11) {
516
			WRITE(p, "  vec4 destColor = fbotex.Load(int3((int)gl_FragCoord.x, (int)gl_FragCoord.y, 0));\n");
517
		} else if (compat.shaderLanguage == GLSL_VULKAN) {
518
			WRITE(p, "  lowp vec4 destColor = %s(fbotex, ivec3(gl_FragCoord.x, gl_FragCoord.y, %s), 0);\n", compat.texelFetch, useStereo ? "float(gl_ViewIndex)" : "0");
519
		} else if (!compat.texelFetch) {
520
			WRITE(p, "  lowp vec4 destColor = %s(fbotex, gl_FragCoord.xy * u_fbotexSize.xy);\n", compat.texture);
521
		} else {
522
			WRITE(p, "  lowp vec4 destColor = %s(fbotex, ivec2(gl_FragCoord.x, gl_FragCoord.y), 0);\n", compat.texelFetch);
523
		}
524
	} else if (fetchFramebuffer) {
525
		// If we have EXT_shader_framebuffer_fetch / ARM_shader_framebuffer_fetch, we skip the blit.
526
		// We can just read the prev value more directly.
527
		if (compat.shaderLanguage == GLSL_3xx) {
528
			WRITE(p, "  lowp vec4 destColor = %s;\n", compat.lastFragData);
529
		} else if (compat.shaderLanguage == GLSL_VULKAN) {
530
			WRITE(p, "  lowp vec4 destColor = subpassLoad(inputColor);\n");
531
		} else {
532
			_assert_msg_(false, "Need fetch destColor, but not a compatible language");
533
		}
534
	}
535

536
	if (isModeClear) {
537
		// Clear mode does not allow any fancy shading.
538
		WRITE(p, "  vec4 v = v_color0;\n");
539
	} else {
540
		const char *secondary = "";
541
		// Secondary color for specular on top of texture
542
		if (lmode) {
543
			WRITE(p, "  vec4 s = vec4(v_color1, 0.0);\n");
544
			secondary = " + s";
545
		}
546

547
		if (doTexture) {
548
			char texcoord[64] = "v_texcoord";
549
			// TODO: Not sure the right way to do this for projection.
550
			// This path destroys resolution on older PowerVR no matter what I do if projection is needed,
551
			// so we disable it on SGX 540 and lesser, and live with the consequences.
552
			bool terriblePrecision = (gl_extensions.bugs & BUG_PVR_SHADER_PRECISION_TERRIBLE) != 0;
553
			bool clampDisabled = doTextureProjection && terriblePrecision;
554
			// Also with terrible precision we can't do wrapping without destroying the image. See #9189
555
			if (terriblePrecision && (!id.Bit(FS_BIT_CLAMP_S) || !id.Bit(FS_BIT_CLAMP_T))) {
556
				clampDisabled = true;
557
			}
558
			if (needShaderTexClamp && !clampDisabled) {
559
				// We may be clamping inside a larger surface (tex = 64x64, buffer=480x272).
560
				// We may also be wrapping in such a surface, or either one in a too-small surface.
561
				// Obviously, clamping to a smaller surface won't work.  But better to clamp to something.
562
				std::string ucoord = "v_texcoord.x";
563
				std::string vcoord = "v_texcoord.y";
564
				if (doTextureProjection) {
565
					ucoord = "(v_texcoord.x / v_texcoord.z)";
566
					vcoord = "(v_texcoord.y / v_texcoord.z)";
567
				}
568

569
				std::string modulo = (gl_extensions.bugs & BUG_PVR_SHADER_PRECISION_BAD) ? "mymod" : "mod";
570

571
				if (id.Bit(FS_BIT_CLAMP_S)) {
572
					ucoord = "clamp(" + ucoord + ", u_texclamp.z, u_texclamp.x - u_texclamp.z)";
573
				} else {
574
					ucoord = modulo + "(" + ucoord + ", u_texclamp.x)";
575
				}
576
				if (id.Bit(FS_BIT_CLAMP_T)) {
577
					vcoord = "clamp(" + vcoord + ", u_texclamp.w, u_texclamp.y - u_texclamp.w)";
578
				} else {
579
					vcoord = modulo + "(" + vcoord + ", u_texclamp.y)";
580
				}
581
				ucoord = "(" + ucoord + " + u_texclampoff.x)";
582
				vcoord = "(" + vcoord + " + u_texclampoff.y)";
583

584
				WRITE(p, "  vec2 fixedcoord = vec2(%s, %s);\n", ucoord.c_str(), vcoord.c_str());
585
				truncate_cpy(texcoord, "fixedcoord");
586
				// We already projected it.
587
				doTextureProjection = false;
588
			}
589

590
			switch (shaderDepalMode) {
591
			case ShaderDepalMode::OFF:
592
				if (compat.shaderLanguage == HLSL_D3D11) {
593
					if (texture3D) {
594
						if (doTextureProjection) {
595
							WRITE(p, "  vec4 t = tex.Sample(texSamp, vec3(v_texcoord.xy / v_texcoord.z, u_mipBias))%s;\n", bgraTexture ? ".bgra" : "");
596
						} else {
597
							WRITE(p, "  vec4 t = tex.Sample(texSamp, vec3(%s.xy, u_mipBias))%s;\n", texcoord, bgraTexture ? ".bgra" : "");
598
						}
599
					} else {
600
						if (doTextureProjection) {
601
							WRITE(p, "  vec4 t = tex.Sample(texSamp, v_texcoord.xy / v_texcoord.z)%s;\n", bgraTexture ? ".bgra" : "");
602
						} else {
603
							WRITE(p, "  vec4 t = tex.Sample(texSamp, %s.xy)%s;\n", texcoord, bgraTexture ? ".bgra" : "");
604
						}
605
					}
606
				} else {
607
					// Note that here we're relying on the filter to be linear. We would have to otherwise to do two samples and manually filter in Z.
608
					// Let's add that if we run into a case...
609
					if (texture3D) {
610
						if (doTextureProjection) {
611
							WRITE(p, "  vec4 t = %sProj(tex, vec4(%s.xy, u_mipBias, %s.z));\n", compat.texture3D, texcoord, texcoord);
612
						} else {
613
							WRITE(p, "  vec4 t = %s(tex, vec3(%s.xy, u_mipBias));\n", compat.texture3D, texcoord);
614
						}
615
					} else if (arrayTexture) {
616
						_dbg_assert_(compat.shaderLanguage == GLSL_VULKAN);
617
						// Used for stereo rendering.
618
						const char *arrayIndex = useStereo ? "float(gl_ViewIndex)" : "0.0";
619
						if (doTextureProjection) {
620
							// There's no textureProj for array textures, so we need to emulate it.
621
							// Should be fine on any Vulkan-compatible hardware.
622
							WRITE(p, "  vec2 uv_proj = (%s.xy) / (%s.z);\n", texcoord, texcoord);
623
							WRITE(p, "  vec4 t = %s(tex, vec3(uv_proj, %s));\n", compat.texture, texcoord, arrayIndex);
624
						} else {
625
							WRITE(p, "  vec4 t = %s(tex, vec3(%s.xy, %s));\n", compat.texture, texcoord, arrayIndex);
626
						}
627
					} else {
628
						if (doTextureProjection) {
629
							WRITE(p, "  vec4 t = %sProj(tex, %s);\n", compat.texture, texcoord);
630
						} else {
631
							WRITE(p, "  vec4 t = %s(tex, %s.xy);\n", compat.texture, texcoord);
632
						}
633
					}
634
				}
635
				break;
636
			case ShaderDepalMode::SMOOTHED:
637
				// Specific mode for Test Drive. Fixes the banding.
638
				if (doTextureProjection) {
639
					// We don't use textureProj because we need better control and it's probably not much of a savings anyway.
640
					// However it is good for precision on older hardware like PowerVR.
641
					p.F("  vec2 uv = %s.xy/%s.z;\n  vec2 uv_round;\n", texcoord, texcoord);
642
				} else {
643
					p.F("  vec2 uv = %s.xy;\n  vec2 uv_round;\n", texcoord);
644
				}
645
				// Restrictions on this are checked before setting the smoothed flag.
646
				// Only RGB565 and RGBA5551 are supported, and only the specific shifts hitting the
647
				// channels directly.
648
				// Also, since we know the CLUT is smooth, we do not need to do the bilinear filter manually, we can just
649
				// lookup with the filtered value once.
650
				p.F("  vec4 t = ").SampleTexture2D("tex", "uv").C(";\n");
651
				p.C("  uint depalShift = (u_depal_mask_shift_off_fmt >> 0x8u) & 0xFFu;\n");
652
				p.C("  uint depalOffset = ((u_depal_mask_shift_off_fmt >> 0x10u) & 0xFFu) << 0x4u;\n");
653
				p.C("  uint depalFmt = (u_depal_mask_shift_off_fmt >> 0x18u) & 0x3u;\n");
654
				p.C("  float index0 = t.r;\n");
655
				p.C("  float factor = 31.0 / 256.0;\n");
656
				p.C("  if (depalFmt == 0x0u) {\n");  // yes, different versions of Test Drive use different formats. Could do compile time by adding more compat flags but meh.
657
				p.C("    if (depalShift == 0x5u) { index0 = t.g; factor = 63.0 / 256.0; }\n");
658
				p.C("    else if (depalShift == 0xBu) { index0 = t.b; }\n");
659
				p.C("  } else {\n");
660
				p.C("    if (depalShift == 0x5u) { index0 = t.g; }\n");
661
				p.C("    else if (depalShift == 0xAu) { index0 = t.b; }\n");
662
				p.C("  }\n");
663
				p.C("  float offset = float(depalOffset) / 256.0;\n");
664
				p.F("  t = ").SampleTexture2D("pal", "vec2((index0 * factor + offset) * 0.5 + 0.5 / 512.0, 0.0)").C(";\n");  // 0.5 for 512-entry CLUT.
665
				break;
666
			case ShaderDepalMode::NORMAL:
667
				if (doTextureProjection) {
668
					// We don't use textureProj because we need better control and it's probably not much of a savings anyway.
669
					// However it is good for precision on older hardware like PowerVR.
670
					WRITE(p, "  vec2 uv = %s.xy/%s.z;\n  vec2 uv_round;\n", texcoord, texcoord);
671
				} else {
672
					WRITE(p, "  vec2 uv = %s.xy;\n  vec2 uv_round;\n", texcoord);
673
				}
674
				WRITE(p, "  vec2 tsize = vec2(textureSize(tex, 0).xy);\n");
675
				WRITE(p, "  vec2 fraction;\n");
676
				WRITE(p, "  bool bilinear = (u_depal_mask_shift_off_fmt >> 0x2Fu) != 0x0u;\n");
677
				WRITE(p, "  if (bilinear) {\n");
678
				WRITE(p, "    uv_round = uv * tsize - vec2(0.5, 0.5);\n");
679
				WRITE(p, "    fraction = fract(uv_round);\n");
680
				WRITE(p, "    uv_round = (uv_round - fraction + vec2(0.5, 0.5)) / tsize;\n");  // We want to take our four point samples at pixel centers.
681
				WRITE(p, "  } else {\n");
682
				WRITE(p, "    uv_round = uv;\n");
683
				WRITE(p, "  }\n");
684
				p.C("  highp vec4 t = ").SampleTexture2D("tex", "uv_round").C(";\n");
685
				p.C("  highp vec4 t1 = ").SampleTexture2DOffset("tex", "uv_round", 1, 0).C(";\n");
686
				p.C("  highp vec4 t2 = ").SampleTexture2DOffset("tex", "uv_round", 0, 1).C(";\n");
687
				p.C("  highp vec4 t3 = ").SampleTexture2DOffset("tex", "uv_round", 1, 1).C(";\n");
688
				WRITE(p, "  uint depalMask = (u_depal_mask_shift_off_fmt & 0xFFu);\n");
689
				WRITE(p, "  uint depalShift = (u_depal_mask_shift_off_fmt >> 0x8u) & 0xFFu;\n");
690
				WRITE(p, "  uint depalOffset = ((u_depal_mask_shift_off_fmt >> 0x10u) & 0xFFu) << 0x4u;\n");
691
				WRITE(p, "  uint depalFmt = (u_depal_mask_shift_off_fmt >> 0x18u) & 0x3u;\n");
692
				WRITE(p, "  uvec4 col; uint index0; uint index1; uint index2; uint index3;\n");
693
				WRITE(p, "  switch (int(depalFmt)) {\n");  // We might want to include fmt in the shader ID if this is a performance issue.
694
				WRITE(p, "  case 0:\n");  // 565
695
				WRITE(p, "    col = uvec4(t.rgb * vec3(31.99, 63.99, 31.99), 0);\n");
696
				WRITE(p, "    index0 = (col.b << 0xBu) | (col.g << 0x5u) | (col.r);\n");
697
				WRITE(p, "    if (bilinear) {\n");
698
				WRITE(p, "      col = uvec4(t1.rgb * vec3(31.99, 63.99, 31.99), 0);\n");
699
				WRITE(p, "      index1 = (col.b << 0xBu) | (col.g << 0x5u) | (col.r);\n");
700
				WRITE(p, "      col = uvec4(t2.rgb * vec3(31.99, 63.99, 31.99), 0);\n");
701
				WRITE(p, "      index2 = (col.b << 0xBu) | (col.g << 0x5u) | (col.r);\n");
702
				WRITE(p, "      col = uvec4(t3.rgb * vec3(31.99, 63.99, 31.99), 0);\n");
703
				WRITE(p, "      index3 = (col.b << 0xBu) | (col.g << 0x5u) | (col.r);\n");
704
				WRITE(p, "    }\n");
705
				WRITE(p, "    break;\n");
706
				WRITE(p, "  case 1:\n");  // 5551
707
				WRITE(p, "    col = uvec4(t.rgba * vec4(31.99, 31.99, 31.99, 1.0));\n");
708
				WRITE(p, "    index0 = (col.a << 0xFu) | (col.b << 0xAu) | (col.g << 0x5u) | (col.r);\n");
709
				WRITE(p, "    if (bilinear) {\n");
710
				WRITE(p, "      col = uvec4(t1.rgba * vec4(31.99, 31.99, 31.99, 1.0));\n");
711
				WRITE(p, "      index1 = (col.a << 0xFu) | (col.b << 0xAu) | (col.g << 0x5u) | (col.r);\n");
712
				WRITE(p, "      col = uvec4(t2.rgba * vec4(31.99, 31.99, 31.99, 1.0));\n");
713
				WRITE(p, "      index2 = (col.a << 0xFu) | (col.b << 0xAu) | (col.g << 0x5u) | (col.r);\n");
714
				WRITE(p, "      col = uvec4(t3.rgba * vec4(31.99, 31.99, 31.99, 1.0));\n");
715
				WRITE(p, "      index3 = (col.a << 0xFu) | (col.b << 0xAu) | (col.g << 0x5u) | (col.r);\n");
716
				WRITE(p, "    }\n");
717
				WRITE(p, "    break;\n");
718
				WRITE(p, "  case 2:\n");  // 4444
719
				WRITE(p, "    col = uvec4(t.rgba * 15.99);\n");
720
				WRITE(p, "    index0 = (col.a << 0xCu) | (col.b << 0x8u) | (col.g << 0x4u) | (col.r);\n");
721
				WRITE(p, "    if (bilinear) {\n");
722
				WRITE(p, "      col = uvec4(t1.rgba * 15.99);\n");
723
				WRITE(p, "      index1 = (col.a << 0xCu) | (col.b << 0x8u) | (col.g << 0x4u) | (col.r);\n");
724
				WRITE(p, "      col = uvec4(t2.rgba * 15.99);\n");
725
				WRITE(p, "      index2 = (col.a << 0xCu) | (col.b << 0x8u) | (col.g << 0x4u) | (col.r);\n");
726
				WRITE(p, "      col = uvec4(t3.rgba * 15.99);\n");
727
				WRITE(p, "      index3 = (col.a << 0xCu) | (col.b << 0x8u) | (col.g << 0x4u) | (col.r);\n");
728
				WRITE(p, "    }\n");
729
				WRITE(p, "    break;\n");
730
				WRITE(p, "  case 3:\n");  // 8888
731
				WRITE(p, "    col = uvec4(t.rgba * 255.99);\n");
732
				WRITE(p, "    index0 = (col.a << 0x18u) | (col.b << 0x10u) | (col.g << 0x8u) | (col.r);\n");
733
				WRITE(p, "    if (bilinear) {\n");
734
				WRITE(p, "      col = uvec4(t1.rgba * 255.99);\n");
735
				WRITE(p, "      index1 = (col.a << 0x18u) | (col.b << 0x10u) | (col.g << 0x8u) | (col.r);\n");
736
				WRITE(p, "      col = uvec4(t2.rgba * 255.99);\n");
737
				WRITE(p, "      index2 = (col.a << 0x18u) | (col.b << 0x10u) | (col.g << 0x8u) | (col.r);\n");
738
				WRITE(p, "      col = uvec4(t3.rgba * 255.99);\n");
739
				WRITE(p, "      index3 = (col.a << 0x18u) | (col.b << 0x10u) | (col.g << 0x8u) | (col.r);\n");
740
				WRITE(p, "    }\n");
741
				WRITE(p, "    break;\n");
742
				WRITE(p, "  };\n");
743
				WRITE(p, "  index0 = ((index0 >> depalShift) & depalMask) | depalOffset;\n");
744
				p.C("  t = ").LoadTexture2D("pal", "ivec2(index0, 0)", 0).C(";\n");
745
				WRITE(p, "  if (bilinear && !(index0 == index1 && index1 == index2 && index2 == index3)) {\n");
746
				WRITE(p, "    index1 = ((index1 >> depalShift) & depalMask) | depalOffset;\n");
747
				WRITE(p, "    index2 = ((index2 >> depalShift) & depalMask) | depalOffset;\n");
748
				WRITE(p, "    index3 = ((index3 >> depalShift) & depalMask) | depalOffset;\n");
749
				p.C("  t1 = ").LoadTexture2D("pal", "ivec2(index1, 0)", 0).C(";\n");
750
				p.C("  t2 = ").LoadTexture2D("pal", "ivec2(index2, 0)", 0).C(";\n");
751
				p.C("  t3 = ").LoadTexture2D("pal", "ivec2(index3, 0)", 0).C(";\n");
752
				WRITE(p, "    t = mix(t, t1, fraction.x);\n");
753
				WRITE(p, "    t2 = mix(t2, t3, fraction.x);\n");
754
				WRITE(p, "    t = mix(t, t2, fraction.y);\n");
755
				WRITE(p, "  }\n");
756
				break;
757
			case ShaderDepalMode::CLUT8_8888:
758
				if (doTextureProjection) {
759
					// We don't use textureProj because we need better control and it's probably not much of a savings anyway.
760
					// However it is good for precision on older hardware like PowerVR.
761
					p.F("  vec2 uv = %s.xy/%s.z;\n  vec2 uv_round;\n", texcoord, texcoord);
762
				} else {
763
					p.F("  vec2 uv = %s.xy;\n  vec2 uv_round;\n", texcoord);
764
				}
765
				p.C("  vec2 tsize = vec2(textureSize(tex, 0).xy);\n");
766
				p.C("  uv_round = floor(uv * tsize);\n");
767
				p.C("  int component = int(uv_round.x) & 3;\n");
768
				p.C("  uv_round.x *= 0.25;\n");
769
				p.C("  uv_round /= tsize;\n");
770
				p.C("  vec4 t = ").SampleTexture2D("tex", "uv_round").C(";\n");
771
				p.C("  int index;\n");
772
				p.C("  switch (component) {\n");
773
				p.C("  case 0: index = int(t.x * 254.99); break;\n");  // TODO: Not sure why 254.99 instead of 255.99, but it's currently needed.
774
				p.C("  case 1: index = int(t.y * 254.99); break;\n");
775
				p.C("  case 2: index = int(t.z * 254.99); break;\n");
776
				p.C("  case 3: index = int(t.w * 254.99); break;\n");
777
				p.C("  }\n");
778
				p.C("  t = ").LoadTexture2D("pal", "ivec2(index, 0)", 0).C(";\n");
779
				break;
780
			}
781

782
			WRITE(p, "  vec4 p = v_color0;\n");
783

784
			if (texFunc != GE_TEXFUNC_REPLACE) {
785
				if (ubershader) {
786
					WRITE(p, "  t.a = max(t.a, u_texNoAlphaMul.x);\n");
787
				} else if (!useTexAlpha) {
788
					WRITE(p, "  t.a = 1.0;\n");
789
				}
790
			}
791

792
			switch (texFunc) {
793
			case GE_TEXFUNC_MODULATE:
794
				WRITE(p, "  vec4 v = p * t%s;\n", secondary);
795
				break;
796
			case GE_TEXFUNC_DECAL:
797
				WRITE(p, "  vec4 v = vec4(mix(p.rgb, t.rgb, t.a), p.a)%s;\n", secondary);
798
				break;
799
			case GE_TEXFUNC_BLEND:
800
				WRITE(p, "  vec4 v = vec4(mix(p.rgb, u_texenv.rgb, t.rgb), p.a * t.a)%s;\n", secondary);
801
				break;
802
			case GE_TEXFUNC_REPLACE:
803
				WRITE(p, "  vec4 r = t;\n");
804
				if (ubershader) {
805
					WRITE(p, "  r.a = mix(r.a, p.a, u_texNoAlphaMul.x);\n");
806
				} else if (!useTexAlpha) {
807
					WRITE(p, "  r.a = p.a;\n");
808
				}
809
				WRITE(p, "  vec4 v = r%s;\n", secondary);
810
				break;
811
			case GE_TEXFUNC_ADD:
812
			case GE_TEXFUNC_UNKNOWN1:
813
			case GE_TEXFUNC_UNKNOWN2:
814
			case GE_TEXFUNC_UNKNOWN3:
815
				WRITE(p, "  vec4 v = vec4(p.rgb + t.rgb, p.a * t.a)%s;\n", secondary);
816
				break;
817
			default:
818
				// Doesn't happen
819
				WRITE(p, "  vec4 v = p%s;\n", secondary); break;
820
				break;
821
			}
822

823
			// This happens before fog is applied.
824
			*uniformMask |= DIRTY_TEX_ALPHA_MUL;
825

826
			// We only need a clamp if the color will be further processed. Otherwise the hardware color conversion will clamp for us.
827
			if (ubershader) {
828
				if (enableFog || enableColorTest || replaceBlend != REPLACE_BLEND_NO || simulateLogicOpType != LOGICOPTYPE_NORMAL || colorWriteMask || blueToAlpha) {
829
					WRITE(p, "  v.rgb = clamp(v.rgb * u_texNoAlphaMul.y, 0.0, 1.0);\n");
830
				} else {
831
					WRITE(p, "  v.rgb *= u_texNoAlphaMul.y;\n");
832
				}
833
			} else if (enableColorDouble) {
834
				p.C("  v.rgb = clamp(v.rgb * 2.0, 0.0, 1.0);\n");
835
			}
836
		} else {
837
			// No texture mapping
838
			WRITE(p, "  vec4 v = v_color0%s;\n", secondary);
839
		}
840

841
		if (enableFog) {
842
			WRITE(p, "  float fogCoef = clamp(v_fogdepth, 0.0, 1.0);\n");
843
			WRITE(p, "  v = mix(vec4(u_fogcolor, v.a), v, fogCoef);\n");
844
		}
845

846
		// Texture access is at half texels [0.5/256, 255.5/256], but colors are normalized [0, 255].
847
		// So we have to scale to account for the difference.
848
		char alphaTestXCoord[64] = "0";
849
		if (enableFragmentTestCache) {
850
			if (enableColorTest && !colorTestAgainstZero) {
851
				WRITE(p, "  vec4 vScale256 = v * %f + %f;\n", 255.0 / 256.0, 0.5 / 256.0);
852
				truncate_cpy(alphaTestXCoord, "vScale256.a");
853
			} else if (enableAlphaTest && !alphaTestAgainstZero) {
854
				snprintf(alphaTestXCoord, sizeof(alphaTestXCoord), "v.a * %f + %f", 255.0 / 256.0, 0.5 / 256.0);
855
			}
856
		}
857

858
		const char *discardStatement = testForceToZero ? "v.a = 0.0;" : "DISCARD;";
859
		if (enableAlphaTest) {
860
			*fragmentShaderFlags |= FragmentShaderFlags::USES_DISCARD;
861

862
			if (alphaTestAgainstZero) {
863
				// When testing against 0 (extremely common), we can avoid some math.
864
				// 0.002 is approximately half of 1.0 / 255.0.
865
				if (alphaTestFunc == GE_COMP_NOTEQUAL || alphaTestFunc == GE_COMP_GREATER) {
866
					WRITE(p, "  if (v.a < 0.002) %s\n", discardStatement);
867
				} else if (alphaTestFunc != GE_COMP_NEVER) {
868
					// Anything else is a test for == 0.  Happens sometimes, actually...
869
					WRITE(p, "  if (v.a > 0.002) %s\n", discardStatement);
870
				} else {
871
					// NEVER has been logged as used by games, although it makes little sense - statically failing.
872
					// Maybe we could discard the drawcall, but it's pretty rare.  Let's just statically discard here.
873
					WRITE(p, "  %s\n", discardStatement);
874
				}
875
			} else if (enableFragmentTestCache) {
876
				WRITE(p, "  float aResult = %s(testtex, vec2(%s, 0)).a;\n", compat.texture, alphaTestXCoord);
877
				WRITE(p, "  if (aResult < 0.5) %s\n", discardStatement);
878
			} else {
879
				const char *alphaTestFuncs[] = { "#", "#", " != ", " == ", " >= ", " > ", " <= ", " < " };
880
				if (alphaTestFuncs[alphaTestFunc][0] != '#') {
881
					if (compat.bitwiseOps) {
882
						WRITE(p, "  if ((roundAndScaleTo255i(v.a) & int(u_alphacolormask >> 0x18u)) %s int(u_alphacolorref >> 0x18u)) %s\n", alphaTestFuncs[alphaTestFunc], discardStatement);
883
					} else if (gl_extensions.gpuVendor == GPU_VENDOR_IMGTEC) {
884
						// Work around bad PVR driver problem where equality check + discard just doesn't work.
885
						if (alphaTestFunc != GE_COMP_NOTEQUAL) {
886
							WRITE(p, "  if (roundTo255thf(v.a) %s u_alphacolorref.a) %s\n", alphaTestFuncs[alphaTestFunc], discardStatement);
887
						}
888
					} else {
889
						WRITE(p, "  if (roundAndScaleTo255f(v.a) %s u_alphacolorref.a) %s\n", alphaTestFuncs[alphaTestFunc], discardStatement);
890
					}
891
				} else {
892
					// This means NEVER.  See above.
893
					WRITE(p, "  %s\n", discardStatement);
894
				}
895
			}
896
		}
897

898
		if (enableColorTest) {
899
			*fragmentShaderFlags |= FragmentShaderFlags::USES_DISCARD;
900

901
			if (colorTestAgainstZero) {
902
				// When testing against 0 (common), we can avoid some math.
903
				// 0.002 is approximately half of 1.0 / 255.0.
904
				if (colorTestFunc == GE_COMP_NOTEQUAL) {
905
					if (compat.shaderLanguage == GLSL_VULKAN) {
906
						// Old workaround for Adreno driver bug. We could make this the main path actually
907
						// since the math is roughly equivalent given the non-negative inputs.
908
						WRITE(p, "  if (v.r + v.g + v.b < 0.002) %s\n", discardStatement);
909
					} else {
910
						WRITE(p, "  if (v.r < 0.002 && v.g < 0.002 && v.b < 0.002) %s\n", discardStatement);
911
					}
912
				} else if (colorTestFunc != GE_COMP_NEVER) {
913
					if (compat.shaderLanguage == GLSL_VULKAN) {
914
						// See the GE_COMP_NOTEQUAL case.
915
						WRITE(p, "  if (v.r + v.g + v.b > 0.002) %s\n", discardStatement);
916
					} else {
917
						// Anything else is a test for == 0.
918
						WRITE(p, "  if (v.r > 0.002 || v.g > 0.002 || v.b > 0.002) %s\n", discardStatement);
919
					}
920
				} else {
921
					// NEVER has been logged as used by games, although it makes little sense - statically failing.
922
					// Maybe we could discard the drawcall, but it's pretty rare.  Let's just statically discard here.
923
					WRITE(p, "  %s\n", discardStatement);
924
				}
925
			} else if (enableFragmentTestCache) {
926
				WRITE(p, "  float rResult = %s(testtex, vec2(vScale256.r, 0)).r;\n", compat.texture);
927
				WRITE(p, "  float gResult = %s(testtex, vec2(vScale256.g, 0)).g;\n", compat.texture);
928
				WRITE(p, "  float bResult = %s(testtex, vec2(vScale256.b, 0)).b;\n", compat.texture);
929
				if (colorTestFunc == GE_COMP_EQUAL) {
930
					// Equal means all parts must be equal (so discard if any is not.)
931
					WRITE(p, "  if (rResult < 0.5 || gResult < 0.5 || bResult < 0.5) %s\n", discardStatement);
932
				} else {
933
					// Not equal means any part must be not equal.
934
					WRITE(p, "  if (rResult < 0.5 && gResult < 0.5 && bResult < 0.5) %s\n", discardStatement);
935
				}
936
			} else {
937
				const char *colorTestFuncs[] = { "#", "#", " != ", " == " };
938
				const char *test = colorTestFuncs[colorTestFunc];
939
				if (test[0] != '#') {
940
					// TODO: Unify these paths better.
941
					if (compat.bitwiseOps) {
942
						WRITE(p, "  uint v_uint = roundAndScaleTo8x4(v.rgb);\n");
943
						WRITE(p, "  uint v_masked = v_uint & u_alphacolormask;\n");
944
						WRITE(p, "  uint colorTestRef = (u_alphacolorref & u_alphacolormask) & 0xFFFFFFu;\n");
945
						WRITE(p, "  if (v_masked %s colorTestRef) %s\n", test, discardStatement);
946
					} else if (gl_extensions.gpuVendor == GPU_VENDOR_IMGTEC) {
947
						WRITE(p, "  if (roundTo255thv(v.rgb) %s u_alphacolorref.rgb) %s\n", test, discardStatement);
948
					} else {
949
						WRITE(p, "  if (roundAndScaleTo255v(v.rgb) %s u_alphacolorref.rgb) %s\n", test, discardStatement);
950
					}
951
				} else {
952
					WRITE(p, "  %s\n", discardStatement);
953
				}
954
			}
955
		}
956

957
		if (replaceBlend == REPLACE_BLEND_2X_SRC) {
958
			WRITE(p, "  v.rgb = v.rgb * 2.0;\n");
959
		}
960

961
		// In some cases we need to replicate the first half of the blend equation here.
962
		// In case of blue-to-alpha, it's since we overwrite alpha with blue before the actual blend equation runs.
963
		if (replaceBlend == REPLACE_BLEND_PRE_SRC || replaceBlend == REPLACE_BLEND_PRE_SRC_2X_ALPHA || replaceBlend == REPLACE_BLEND_BLUE_TO_ALPHA) {
964
			const char *srcFactor = "ERROR";
965
			switch (replaceBlendFuncA) {
966
			case GE_SRCBLEND_DSTCOLOR:          srcFactor = "ERROR"; break;
967
			case GE_SRCBLEND_INVDSTCOLOR:       srcFactor = "ERROR"; break;
968
			case GE_SRCBLEND_SRCALPHA:          srcFactor = "splat3(v.a)"; break;
969
			case GE_SRCBLEND_INVSRCALPHA:       srcFactor = "splat3(1.0 - v.a)"; break;
970
			case GE_SRCBLEND_DSTALPHA:          srcFactor = "ERROR"; break;
971
			case GE_SRCBLEND_INVDSTALPHA:       srcFactor = "ERROR"; break;
972
			case GE_SRCBLEND_DOUBLESRCALPHA:    srcFactor = "splat3(v.a * 2.0)"; break;
973
			case GE_SRCBLEND_DOUBLEINVSRCALPHA: srcFactor = "splat3(1.0 - v.a * 2.0)"; break;
974
			// PRE_SRC for REPLACE_BLEND_PRE_SRC_2X_ALPHA means "double the src."
975
			// It's close to the same, but clamping can still be an issue.
976
			case GE_SRCBLEND_DOUBLEDSTALPHA:    srcFactor = "splat3(2.0)"; break;
977
			case GE_SRCBLEND_DOUBLEINVDSTALPHA: srcFactor = "ERROR"; break;
978
			case GE_SRCBLEND_FIXA:              srcFactor = "u_blendFixA"; break;
979
			default:                            srcFactor = "u_blendFixA"; break;
980
			}
981

982
			if (!strcmp(srcFactor, "ERROR")) {
983
				*errorString = "Bad replaceblend src factor";
984
				return false;
985
			}
986

987
			WRITE(p, "  v.rgb = v.rgb * %s;\n", srcFactor);
988
		}
989

990
		if (replaceBlend == REPLACE_BLEND_READ_FRAMEBUFFER) {
991
			const char *srcFactor = nullptr;
992
			const char *dstFactor = nullptr;
993

994
			switch (replaceBlendFuncA) {
995
			case GE_SRCBLEND_DSTCOLOR:          srcFactor = "destColor.rgb"; break;
996
			case GE_SRCBLEND_INVDSTCOLOR:       srcFactor = "(splat3(1.0) - destColor.rgb)"; break;
997
			case GE_SRCBLEND_SRCALPHA:          srcFactor = "v.aaa"; break;
998
			case GE_SRCBLEND_INVSRCALPHA:       srcFactor = "splat3(1.0 - v.a)"; break;
999
			case GE_SRCBLEND_DSTALPHA:          srcFactor = "destColor.aaa"; break;
1000
			case GE_SRCBLEND_INVDSTALPHA:       srcFactor = "(splat3(1.0) - destColor.aaa)"; break;
1001
			case GE_SRCBLEND_DOUBLESRCALPHA:    srcFactor = "v.aaa * 2.0"; break;
1002
			case GE_SRCBLEND_DOUBLEINVSRCALPHA: srcFactor = "(splat3(1.0) - v.aaa * 2.0)"; break;
1003
			case GE_SRCBLEND_DOUBLEDSTALPHA:    srcFactor = "destColor.aaa * 2.0"; break;
1004
			case GE_SRCBLEND_DOUBLEINVDSTALPHA: srcFactor = "(splat3(1.0) - destColor.aaa * 2.0)"; break;
1005
			case GE_SRCBLEND_FIXA:              srcFactor = "u_blendFixA"; break;
1006
			default:                            srcFactor = "u_blendFixA"; break;
1007
			}
1008
			switch (replaceBlendFuncB) {
1009
			case GE_DSTBLEND_SRCCOLOR:          dstFactor = "v.rgb"; break;
1010
			case GE_DSTBLEND_INVSRCCOLOR:       dstFactor = "(splat3(1.0) - v.rgb)"; break;
1011
			case GE_DSTBLEND_SRCALPHA:          dstFactor = "v.aaa"; break;
1012
			case GE_DSTBLEND_INVSRCALPHA:       dstFactor = "(splat3(1.0) - v.aaa)"; break;
1013
			case GE_DSTBLEND_DSTALPHA:          dstFactor = "destColor.aaa"; break;
1014
			case GE_DSTBLEND_INVDSTALPHA:       dstFactor = "(splat3(1.0) - destColor.aaa)"; break;
1015
			case GE_DSTBLEND_DOUBLESRCALPHA:    dstFactor = "v.aaa * 2.0"; break;
1016
			case GE_DSTBLEND_DOUBLEINVSRCALPHA: dstFactor = "(splat3(1.0) - v.aaa * 2.0)"; break;
1017
			case GE_DSTBLEND_DOUBLEDSTALPHA:    dstFactor = "destColor.aaa * 2.0"; break;
1018
			case GE_DSTBLEND_DOUBLEINVDSTALPHA: dstFactor = "(splat3(1.0) - destColor.aaa * 2.0)"; break;
1019
			case GE_DSTBLEND_FIXB:              dstFactor = "u_blendFixB"; break;
1020
			default:                            dstFactor = "u_blendFixB"; break;
1021
			}
1022

1023
			switch (replaceBlendEq) {
1024
			case GE_BLENDMODE_MUL_AND_ADD:
1025
				WRITE(p, "  v.rgb = v.rgb * %s + destColor.rgb * %s;\n", srcFactor, dstFactor);
1026
				break;
1027
			case GE_BLENDMODE_MUL_AND_SUBTRACT:
1028
				WRITE(p, "  v.rgb = v.rgb * %s - destColor.rgb * %s;\n", srcFactor, dstFactor);
1029
				break;
1030
			case GE_BLENDMODE_MUL_AND_SUBTRACT_REVERSE:
1031
				WRITE(p, "  v.rgb = destColor.rgb * %s - v.rgb * %s;\n", dstFactor, srcFactor);
1032
				break;
1033
			case GE_BLENDMODE_MIN:
1034
				WRITE(p, "  v.rgb = min(v.rgb, destColor.rgb);\n");
1035
				break;
1036
			case GE_BLENDMODE_MAX:
1037
				WRITE(p, "  v.rgb = max(v.rgb, destColor.rgb);\n");
1038
				break;
1039
			case GE_BLENDMODE_ABSDIFF:
1040
				WRITE(p, "  v.rgb = abs(v.rgb - destColor.rgb);\n");
1041
				break;
1042
			default:
1043
				*errorString = StringFromFormat("Bad replace blend eq: %d", (int)replaceBlendEq);
1044
				return false;
1045
			}
1046
		}
1047

1048
		if (replaceBlend == REPLACE_BLEND_2X_ALPHA || replaceBlend == REPLACE_BLEND_PRE_SRC_2X_ALPHA) {
1049
			WRITE(p, "  v.a *= 2.0;\n");
1050
		}
1051
	}
1052

1053
	char replacedAlpha[64] = "0.0";
1054
	if (stencilToAlpha != REPLACE_ALPHA_NO) {
1055
		switch (replaceAlphaWithStencilType) {
1056
		case STENCIL_VALUE_UNIFORM:
1057
			truncate_cpy(replacedAlpha, "u_stencilReplaceValue");
1058
			break;
1059

1060
		case STENCIL_VALUE_ZERO:
1061
			truncate_cpy(replacedAlpha, "0.0");
1062
			break;
1063

1064
		case STENCIL_VALUE_ONE:
1065
		case STENCIL_VALUE_INVERT:
1066
			// In invert, we subtract by one, but we want to output one here.
1067
			truncate_cpy(replacedAlpha, "1.0");
1068
			break;
1069

1070
		case STENCIL_VALUE_INCR_4:
1071
		case STENCIL_VALUE_DECR_4:
1072
			// We're adding/subtracting, just by the smallest value in 4-bit.
1073
			snprintf(replacedAlpha, sizeof(replacedAlpha), "%f", 1.0 / 15.0);
1074
			break;
1075

1076
		case STENCIL_VALUE_INCR_8:
1077
		case STENCIL_VALUE_DECR_8:
1078
			// We're adding/subtracting, just by the smallest value in 8-bit.
1079
			snprintf(replacedAlpha, sizeof(replacedAlpha), "%f", 1.0 / 255.0);
1080
			break;
1081

1082
		case STENCIL_VALUE_KEEP:
1083
			// Do nothing. We'll mask out the alpha using color mask.
1084
			break;
1085
		}
1086
	}
1087

1088
	switch (stencilToAlpha) {
1089
	case REPLACE_ALPHA_DUALSOURCE:
1090
		WRITE(p, "  %s = vec4(v.rgb, %s);\n", compat.fragColor0, replacedAlpha);
1091
		WRITE(p, "  %s = vec4(0.0, 0.0, 0.0, v.a);\n", compat.fragColor1);
1092
		break;
1093

1094
	case REPLACE_ALPHA_YES:
1095
		WRITE(p, "  %s = vec4(v.rgb, %s);\n", compat.fragColor0, replacedAlpha);
1096
		break;
1097

1098
	case REPLACE_ALPHA_NO:
1099
		WRITE(p, "  %s = v;\n", compat.fragColor0);
1100
		break;
1101

1102
	default:
1103
		*errorString = "Bad stencil-to-alpha type, corrupt ID?";
1104
		return false;
1105
	}
1106

1107
	switch (simulateLogicOpType) {
1108
	case LOGICOPTYPE_ONE:
1109
		WRITE(p, "  %s.rgb = splat3(1.0);\n", compat.fragColor0);
1110
		break;
1111
	case LOGICOPTYPE_INVERT:
1112
		WRITE(p, "  %s.rgb = splat3(1.0) - %s.rgb;\n", compat.fragColor0, compat.fragColor0);
1113
		break;
1114
	case LOGICOPTYPE_NORMAL:
1115
		break;
1116

1117
	default:
1118
		*errorString = "Bad logic op type, corrupt ID?";
1119
		return false;
1120
	}
1121

1122
	// Final color computed - apply logic ops and bitwise color write mask, through shader blending, if specified.
1123
	if (colorWriteMask || replaceLogicOp) {
1124
		WRITE(p, "  highp uint v32 = packUnorm4x8%s(%s);\n", packSuffix, compat.fragColor0);
1125
		WRITE(p, "  highp uint d32 = packUnorm4x8%s(destColor);\n", packSuffix);
1126

1127
		// v32 is both the "s" to the logical operation, and the value that we'll merge to the destination with masking later.
1128
		// d32 is the "d" to the logical operation.
1129
		// NOTE: Alpha of v32 needs to be preserved. Same equations as in the software renderer.
1130
		switch (replaceLogicOpType) {
1131
		case GE_LOGIC_CLEAR:         p.C("  v32 &= 0xFF000000u;\n"); break;
1132
		case GE_LOGIC_AND:           p.C("  v32 = v32 & (d32 | 0xFF000000u);\n"); break;
1133
		case GE_LOGIC_AND_REVERSE:   p.C("  v32 = v32 & (~d32 | 0xFF000000u);\n"); break;
1134
		case GE_LOGIC_COPY: break;  // source to dest, do nothing. Will be set to this, if not used.
1135
		case GE_LOGIC_AND_INVERTED:  p.C("  v32 = (~v32 & (d32 & 0x00FFFFFFu)) | (v32 & 0xFF000000u);\n"); break;
1136
		case GE_LOGIC_NOOP:          p.C("  v32 = (d32 & 0x00FFFFFFu) | (v32 & 0xFF000000u);\n"); break;
1137
		case GE_LOGIC_XOR:           p.C("  v32 = v32 ^ (d32 & 0x00FFFFFFu);\n"); break;
1138
		case GE_LOGIC_OR:            p.C("  v32 = v32 | (d32 & 0x00FFFFFFu);\n"); break;
1139
		case GE_LOGIC_NOR:           p.C("  v32 = (~(v32 | d32) & 0x00FFFFFFu) | (v32 & 0xFF000000u);\n"); break;
1140
		case GE_LOGIC_EQUIV:         p.C("  v32 = (~(v32 ^ d32) & 0x00FFFFFFu) | (v32 & 0xFF000000u);\n"); break;
1141
		case GE_LOGIC_INVERTED:      p.C("  v32 = (~d32 & 0x00FFFFFFu) | (v32 & 0xFF000000u);\n"); break;
1142
		case GE_LOGIC_OR_REVERSE:    p.C("  v32 = v32 | (~d32 & 0x00FFFFFFu);\n"); break;
1143
		case GE_LOGIC_COPY_INVERTED: p.C("  v32 = (~v32 & 0x00FFFFFFu) | (v32 & 0xFF000000u);\n"); break;
1144
		case GE_LOGIC_OR_INVERTED:   p.C("  v32 = ((~v32 | d32) & 0x00FFFFFFu) | (v32 & 0xFF000000u);\n"); break;
1145
		case GE_LOGIC_NAND:          p.C("  v32 = (~(v32 & d32) & 0x00FFFFFFu) | (v32 & 0xFF000000u);\n"); break;
1146
		case GE_LOGIC_SET:           p.C("  v32 |= 0x00FFFFFFu;\n"); break;
1147
		}
1148

1149
		// Note that the mask has already been flipped to the PC way - 1 means write.
1150
		if (colorWriteMask) {
1151
			if (stencilToAlpha != REPLACE_ALPHA_NO)
1152
				WRITE(p, "  v32 = (v32 & u_colorWriteMask) | (d32 & ~u_colorWriteMask);\n");
1153
			else
1154
				WRITE(p, "  v32 = (v32 & u_colorWriteMask & 0x00FFFFFFu) | (d32 & (~u_colorWriteMask | 0xFF000000u));\n");
1155
		}
1156
		WRITE(p, "  %s = unpackUnorm4x8%s(v32);\n", compat.fragColor0, packSuffix);
1157
	}
1158

1159
	if (blueToAlpha) {
1160
		WRITE(p, "  %s = vec4(0.0, 0.0, 0.0, %s.z);  // blue to alpha\n", compat.fragColor0, compat.fragColor0);
1161
	}
1162

1163
	if (gstate_c.Use(GPU_ROUND_FRAGMENT_DEPTH_TO_16BIT)) {
1164
		DepthScaleFactors depthScale = GetDepthScaleFactors(gstate_c.UseFlags());
1165

1166
		const double scale = depthScale.ScaleU16();
1167

1168
		WRITE(p, "  highp float z = gl_FragCoord.z;\n");
1169
		if (gstate_c.Use(GPU_USE_ACCURATE_DEPTH)) {
1170
			// We center the depth with an offset, but only its fraction matters.
1171
			// When (DepthSliceFactor() - 1) is odd, it will be 0.5, otherwise 0.
1172
			if (((int)(depthScale.Scale() - 1.0f) & 1) == 1) {
1173
				WRITE(p, "  z = (floor((z * %f) - (1.0 / 2.0)) + (1.0 / 2.0)) * (1.0 / %f);\n", scale, scale);
1174
			} else {
1175
				WRITE(p, "  z = floor(z * %f) * (1.0 / %f);\n", scale, scale);
1176
			}
1177
		} else {
1178
			WRITE(p, "  z = (1.0 / 65535.0) * floor(z * 65535.0);\n");
1179
		}
1180
		WRITE(p, "  gl_FragDepth = z;\n");
1181
	} else if (useDiscardStencilBugWorkaround) {
1182
		// Adreno and some Mali drivers apply early frag tests even with discard in the shader,
1183
		// when only stencil is used. The exact situation seems to vary by driver.
1184
		// Writing depth prevents the bug for both vendors, even with depth_unchanged specified.
1185
		// This doesn't make a ton of sense, but empirically does work.
1186
		WRITE(p, "  gl_FragDepth = gl_FragCoord.z;\n");
1187
	}
1188

1189
	if (compat.shaderLanguage == HLSL_D3D11) {
1190
		if (writeDepth) {
1191
			WRITE(p, "  outfragment.depth = gl_FragDepth;\n");
1192
		}
1193
		WRITE(p, "  return outfragment;\n");
1194
	}
1195

1196
	WRITE(p, "}\n");
1197

1198
	return true;
1199
}
1200

1201

1202
Product

Resources

Company