GitHub Repository: hrydgard/ppsspp
Path: blob/master/GPU/Common/DepthRaster.cpp
#include <algorithm>
#include <cstring>
#include <cstdint>

#include "Common/Math/CrossSIMD.h"
#include "GPU/Common/DepthRaster.h"
#include "GPU/Math3D.h"
#include "Common/Math/math_util.h"
#include "GPU/Common/VertexDecoderCommon.h"

DepthScissor DepthScissor::Tile(int tile, int numTiles) const {
	if (numTiles == 1) {
		return *this;
	}
	// First tiling algorithm: Split into vertical slices.
	int w = x2 - x1;
	int tileW = (w / numTiles) & ~3; // Round to four pixels.
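	// Example with illustrative values: if x1 = 0, x2 = 480 and numTiles = 3, then
	// tileW = (480 / 3) & ~3 = 160, so tile 0 spans x = 0..160, tile 1 spans 160..320 and
	// tile 2 spans 320..480. The last tile always ends at the original x2, so any remainder
	// lost to the rounding is absorbed there.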

	// TODO: Should round x1 to four pixels as well (except for the first tile).

	DepthScissor scissor;
	scissor.x1 = x1 + tileW * tile;
	scissor.x2 = (tile == numTiles - 1) ? x2 : (x1 + tileW * (tile + 1));
	scissor.y1 = y1;
	scissor.y2 = y2;
	return scissor;
}

// x1/x2 etc are the scissor rect.
static void DepthRasterRect(uint16_t *dest, int stride, const DepthScissor scissor, int v1x, int v1y, int v2x, int v2y, short depthValue, ZCompareMode compareMode) {
	// Swap coordinates if needed; we don't back-face-cull rects.
	// We also ignore the UV rotation here.
	if (v1x > v2x) {
		std::swap(v1x, v2x);
	}
	if (v1y > v2y) {
		std::swap(v1y, v2y);
	}

	if (v1x < scissor.x1) {
		v1x = scissor.x1;
	}
	if (v2x > scissor.x2) {
		v2x = scissor.x2 + 1; // PSP scissors are inclusive
	}
	if (v1x >= v2x) {
		return;
	}

	if (v1y < scissor.y1) {
		v1y = scissor.y1;
	}
	if (v2y > scissor.y2) {
		v2y = scissor.y2 + 1;
	}
	if (v1y >= v2y) {
		return;
	}
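
	// At this point the rect has been clamped to the scissor and rejected if empty. The loops below
	// treat v2x/v2y as exclusive bounds, which is why the clamp above adds 1 when snapping to the
	// inclusive PSP scissor edge.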

	Vec8U16 valueX8 = Vec8U16::Splat(depthValue);
	for (int y = v1y; y < v2y; y++) {
		uint16_t *ptr = (uint16_t *)(dest + stride * y + v1x);
		int w = v2x - v1x;
		switch (compareMode) {
		case ZCompareMode::Always:
			if (depthValue == 0) {
				memset(ptr, 0, w * 2);
			} else {
				while (w >= 8) {
					valueX8.Store(ptr);
					ptr += 8;
					w -= 8;
				}
				// Non-simd trailer.
				while (w > 0) {
					*ptr++ = depthValue;
					w--;
				}
			}
			break;
		default:
			// TODO
			break;
		}
	}
}

alignas(16) static const int zero123[4] = {0, 1, 2, 3};

enum class TriangleStat {
	OK,
	NoPixels,
	SmallOrBackface,
};

constexpr int MIN_TWICE_TRI_AREA = 10;

// A mix of ideas from Intel's sample and ryg's rasterizer blog series.
template<ZCompareMode compareMode, bool lowQ>
void DepthRaster4Triangles(int stats[3], uint16_t *depthBuf, int stride, DepthScissor scissor, const int *tx, const int *ty, const float *tz) {
	// Triangle setup. This is done using SIMD, four triangles at a time.
	// 16x16->32 multiplications are doable on SSE2, which should be all we need.

	// We use 4x1 SIMD tiles for simplicity. 2x2 would be ideal but stores/loads get annoying.

	// NOTE: Triangles are stored in groups of 4.
	Vec4S32 x0 = Vec4S32::LoadAligned(tx);
	Vec4S32 y0 = Vec4S32::LoadAligned(ty);
	Vec4S32 x1 = Vec4S32::LoadAligned(tx + 4);
	Vec4S32 y1 = Vec4S32::LoadAligned(ty + 4);
	Vec4S32 x2 = Vec4S32::LoadAligned(tx + 8);
	Vec4S32 y2 = Vec4S32::LoadAligned(ty + 8);

	if (lowQ) {
		y0 &= Vec4S32::Splat(~1);
		y1 &= Vec4S32::Splat(~1);
		y2 &= Vec4S32::Splat(~1);
	}
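
	// lowQ halves the vertical resolution: Y coordinates are snapped to even rows here, the
	// rasterizer steps two rows at a time, and each 4-pixel result is stored to two adjacent rows.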

	// FixupAfterMinMax is just 16->32 sign extension, in case the current platform (like SSE2) just has 16-bit min/max operations.
	Vec4S32 minX = x0.Min16(x1).Min16(x2).Max16(Vec4S32::Splat(scissor.x1)).FixupAfterMinMax();
	Vec4S32 maxX = x0.Max16(x1).Max16(x2).Min16(Vec4S32::Splat(scissor.x2)).FixupAfterMinMax();
	Vec4S32 minY = y0.Min16(y1).Min16(y2).Max16(Vec4S32::Splat(scissor.y1)).FixupAfterMinMax();
	Vec4S32 maxY = y0.Max16(y1).Max16(y2).Min16(Vec4S32::Splat(scissor.y2)).FixupAfterMinMax();

	Vec4S32 triArea = (x1 - x0).Mul16(y2 - y0) - (x2 - x0).Mul16(y1 - y0);
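	// triArea is twice the signed area of each triangle (the z of the cross product of two edge
	// vectors). With the winding set up by the clip pass, degenerate or back-facing triangles come
	// out zero or negative and are rejected against MIN_TWICE_TRI_AREA in the per-triangle loop below.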

	// Edge setup
	Vec4S32 A12 = y1 - y2;
	Vec4S32 B12 = x2 - x1;
	Vec4S32 C12 = x1.Mul16(y2) - y1.Mul16(x2);

	Vec4S32 A20 = y2 - y0;
	Vec4S32 B20 = x0 - x2;
	Vec4S32 C20 = x2.Mul16(y0) - y2.Mul16(x0);

	Vec4S32 A01 = y0 - y1;
	Vec4S32 B01 = x1 - x0;
	Vec4S32 C01 = x0.Mul16(y1) - y0.Mul16(x1);
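	// Each (A, B, C) triple defines an edge function E(x, y) = A*x + B*y + C for one edge,
	// non-negative on the interior side for the winding that passes the area check. Evaluated per
	// pixel these become w0/w1/w2 below, which double as unnormalized barycentric weights; stepping
	// one pixel in x or y just adds A or B, which is what the step deltas exploit.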

	constexpr int stepXSize = 4;
	constexpr int stepYSize = lowQ ? 2 : 1;

	constexpr int stepXShift = 2;
	constexpr int stepYShift = lowQ ? 1 : 0;

	// Step deltas
	Vec4S32 stepX12 = A12.Shl<stepXShift>();
	Vec4S32 stepY12 = B12.Shl<stepYShift>();
	Vec4S32 stepX20 = A20.Shl<stepXShift>();
	Vec4S32 stepY20 = B20.Shl<stepYShift>();
	Vec4S32 stepX01 = A01.Shl<stepXShift>();
	Vec4S32 stepY01 = B01.Shl<stepYShift>();

	// Prepare to interpolate Z
	Vec4F32 oneOverTriArea = Vec4F32FromS32(triArea).Recip();
	Vec4F32 zbase = Vec4F32::LoadAligned(tz);
	Vec4F32 z_20 = (Vec4F32::LoadAligned(tz + 4) - zbase) * oneOverTriArea;
	Vec4F32 z_01 = (Vec4F32::LoadAligned(tz + 8) - zbase) * oneOverTriArea;
	Vec4F32 zdx = z_20 * Vec4F32FromS32(stepX20) + z_01 * Vec4F32FromS32(stepX01);
	Vec4F32 zdy = z_20 * Vec4F32FromS32(stepY20) + z_01 * Vec4F32FromS32(stepY01);
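	// Z is interpolated barycentrically: per pixel, z = zbase + w1 * z_20 + w2 * z_01, where w1/w2
	// are the unnormalized edge weights and the 1/triArea factor has already been folded into
	// z_20/z_01. zdx/zdy are the corresponding increments for one step in x or y.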

	// Shared setup is done, now loop per-triangle in the group of four.
	for (int t = 0; t < 4; t++) {
		// Check for bad triangle.
		// Using operator[] on the vectors actually seems to result in pretty good code.
		if (maxX[t] <= minX[t] || maxY[t] <= minY[t]) {
			// No pixels, or outside screen.
			// Most of these are now gone in the initial pass, but not all since we cull
			// in 4-groups there.
			stats[(int)TriangleStat::NoPixels]++;
			continue;
		}

		if (triArea[t] < MIN_TWICE_TRI_AREA) {
			stats[(int)TriangleStat::SmallOrBackface]++; // Or zero area.
			continue;
		}

		const int minXT = minX[t] & ~3;
		const int maxXT = maxX[t] & ~3;

		const int minYT = minY[t];
		const int maxYT = maxY[t];

		// Convert to wide registers.
		Vec4S32 initialX = Vec4S32::Splat(minXT) + Vec4S32::LoadAligned(zero123);
		int initialY = minY[t];
		_dbg_assert_(A12[t] < 32767);
		_dbg_assert_(A12[t] > -32767);
		_dbg_assert_(A20[t] < 32767);
		_dbg_assert_(A20[t] > -32767);
		_dbg_assert_(A01[t] < 32767);
		_dbg_assert_(A01[t] > -32767);

		// TODO: The latter subexpression can be broken out of this loop, but reduces block size flexibility.
		Vec4S32 w0_row = Vec4S32::Splat(A12[t]).Mul16(initialX) + Vec4S32::Splat(B12[t] * initialY + C12[t]);
		Vec4S32 w1_row = Vec4S32::Splat(A20[t]).Mul16(initialX) + Vec4S32::Splat(B20[t] * initialY + C20[t]);
		Vec4S32 w2_row = Vec4S32::Splat(A01[t]).Mul16(initialX) + Vec4S32::Splat(B01[t] * initialY + C01[t]);

		Vec4F32 zrow = Vec4F32::Splat(zbase[t]) + Vec4F32FromS32(w1_row) * z_20[t] + Vec4F32FromS32(w2_row) * z_01[t];
		Vec4F32 zdeltaX = Vec4F32::Splat(zdx[t]);
		Vec4F32 zdeltaY = Vec4F32::Splat(zdy[t]);

		Vec4S32 oneStepX12 = Vec4S32::Splat(stepX12[t]);
		Vec4S32 oneStepY12 = Vec4S32::Splat(stepY12[t]);
		Vec4S32 oneStepX20 = Vec4S32::Splat(stepX20[t]);
		Vec4S32 oneStepY20 = Vec4S32::Splat(stepY20[t]);
		Vec4S32 oneStepX01 = Vec4S32::Splat(stepX01[t]);
		Vec4S32 oneStepY01 = Vec4S32::Splat(stepY01[t]);
		// Rasterize
		for (int y = minYT; y <= maxYT; y += stepYSize, w0_row += oneStepY12, w1_row += oneStepY20, w2_row += oneStepY01, zrow += zdeltaY) {
			// Barycentric coordinates at start of row
			Vec4S32 w0 = w0_row;
			Vec4S32 w1 = w1_row;
			Vec4S32 w2 = w2_row;
			Vec4F32 zs = zrow;

			uint16_t *rowPtr = depthBuf + stride * y;

			for (int x = minXT; x <= maxXT; x += stepXSize, w0 += oneStepX12, w1 += oneStepX20, w2 += oneStepX01, zs += zdeltaX) {
				// If p is on or inside all edges for any of the four pixels, render those pixels.
				Vec4S32 signCalc = w0 | w1 | w2;
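				// OR-ing the three weights means a lane's sign bit is set if any of them is negative
				// there, i.e. the pixel is outside at least one edge. A lane with a clear sign bit is
				// a covered pixel, and AnyZeroSignBit below tells us whether any of these four pixels
				// needs to be considered at all.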

				// TODO: Check if this check is profitable. Maybe only for big triangles?
				if (!AnyZeroSignBit(signCalc)) {
					continue;
				}

				Vec4U16 bufferValues = Vec4U16::Load(rowPtr + x);
				Vec4U16 shortMaskInv = SignBits32ToMaskU16(signCalc);
				// Now, the mask is all ones (0xFFFF) where we should preserve the contents of the depth buffer.

				Vec4U16 shortZ = Vec4U16::FromVec4F32(zs);

				// This switch is on a templated constant, so should collapse away.
				Vec4U16 writeVal;
				switch (compareMode) {
				case ZCompareMode::Greater:
					// To implement the greater/greater-than comparison, we can combine mask and max.
					// Unfortunately there's no unsigned max on SSE2; it's synthesized by xoring 0x8000 on input and output.
					// We use AndNot to zero out Z results, before doing Max with the buffer.
					writeVal = shortZ.AndNot(shortMaskInv).Max(bufferValues);
					break;
				case ZCompareMode::Less:
					// This time, we OR the mask and use .Min.
					writeVal = (shortZ | shortMaskInv).Min(bufferValues);
					break;
				case ZCompareMode::Always: // UNTESTED
					// This could be replaced with a vblend operation.
					writeVal = ((bufferValues & shortMaskInv) | shortZ.AndNot(shortMaskInv));
					break;
				}
				writeVal.Store(rowPtr + x);
				if (lowQ) {
					writeVal.Store(rowPtr + stride + x);
				}
			}
		}

		stats[(int)TriangleStat::OK]++;
	}
}

// This will always run on the main thread. Though, we might consider moving the transforms out and just storing verts instead?
void DecodeAndTransformForDepthRaster(float *dest, const float *worldviewproj, const void *vertexData, int indexLowerBound, int indexUpperBound, const VertexDecoder *dec, u32 vertTypeID) {
	// TODO: Ditch skinned and morphed prims for now since we don't have a fast way to skin without running the full decoder.
	_dbg_assert_((vertTypeID & (GE_VTYPE_WEIGHT_MASK | GE_VTYPE_MORPHCOUNT_MASK)) == 0);

	int vertexStride = dec->VertexSize();
	int offset = dec->posoff;

	Mat4F32 mat(worldviewproj);

	const u8 *startPtr = (const u8 *)vertexData + indexLowerBound * vertexStride;
	int count = indexUpperBound - indexLowerBound + 1;
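
	// All three position formats below end up as floats in the same range: the 16-bit and 8-bit
	// paths scale by 1.0f / 32768.f and 1.0f / 128.0f respectively to undo the fixed-point
	// encoding, so every case feeds equivalent positions into the shared matrix transform.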

	switch (vertTypeID & GE_VTYPE_POS_MASK) {
	case GE_VTYPE_POS_FLOAT:
		for (int i = 0; i < count; i++) {
			const float *data = (const float *)(startPtr + i * vertexStride + offset);
			Vec4F32::Load(data).AsVec3ByMatrix44(mat).Store(dest + i * 4);
		}
		break;
	case GE_VTYPE_POS_16BIT:
		for (int i = 0; i < count; i++) {
			const s16 *data = ((const s16 *)((const s8 *)startPtr + i * vertexStride + offset));
			Vec4F32::LoadConvertS16(data).Mul(1.0f / 32768.f).AsVec3ByMatrix44(mat).Store(dest + i * 4);
		}
		break;
	case GE_VTYPE_POS_8BIT:
		for (int i = 0; i < count; i++) {
			const s8 *data = (const s8 *)startPtr + i * vertexStride + offset;
			Vec4F32::LoadConvertS8(data).Mul(1.0f / 128.0f).AsVec3ByMatrix44(mat).Store(dest + i * 4);
		}
		break;
	}
}

void TransformPredecodedForDepthRaster(float *dest, const float *worldviewproj, const void *decodedVertexData, const VertexDecoder *dec, int count) {
	// TODO: Ditch skinned and morphed prims for now since we don't have a fast way to skin without running the full decoder.
	_dbg_assert_((dec->VertexType() & (GE_VTYPE_WEIGHT_MASK | GE_VTYPE_MORPHCOUNT_MASK)) == 0);

	int vertexStride = dec->GetDecVtxFmt().stride;
	int offset = dec->GetDecVtxFmt().posoff;

	Mat4F32 mat(worldviewproj);

	const u8 *startPtr = (const u8 *)decodedVertexData;
	// Decoded position format is always float3.
	for (int i = 0; i < count; i++) {
		const float *data = (const float *)(startPtr + i * vertexStride + offset);
		Vec4F32::Load(data).AsVec3ByMatrix44(mat).Store(dest + i * 4);
	}
}

void ConvertPredecodedThroughForDepthRaster(float *dest, const void *decodedVertexData, const VertexDecoder *dec, int count) {
	// TODO: Ditch skinned and morphed prims for now since we don't have a fast way to skin without running the full decoder.
	_dbg_assert_((dec->VertexType() & (GE_VTYPE_WEIGHT_MASK | GE_VTYPE_MORPHCOUNT_MASK)) == 0);

	int vertexStride = dec->GetDecVtxFmt().stride;
	int offset = dec->GetDecVtxFmt().posoff;

	const u8 *startPtr = (const u8 *)decodedVertexData;
	// Decoded position format is always float3.
	for (int i = 0; i < count; i++) {
		const float *data = (const float *)(startPtr + i * vertexStride + offset);
		// Just pass the position straight through - this is through mode!
		// A W of one makes projection a no-op, without branching.
		Vec4F32::Load(data).WithLane3One().Store(dest + i * 4);
	}
}

int DepthRasterClipIndexedRectangles(int *tx, int *ty, float *tz, const float *transformed, const uint16_t *indexBuffer, const DepthDraw &draw, const DepthScissor scissor) {
	int outCount = 0;
	const int count = draw.vertexCount;
	for (int i = 0; i < count; i += 2) {
		const float *verts[2] = {
			transformed + indexBuffer[i] * 4,
			transformed + indexBuffer[i + 1] * 4,
		};

		// Check if any vertex is behind the 0 plane.
		if (verts[0][3] < 0.0f || verts[1][3] < 0.0f) {
			// Ditch this rectangle.
			continue;
		}

		// These names are wrong .. until we transpose.
		// TODO: Maybe combine two rects here at a time. But hardly relevant for performance.
		Vec4F32 x = Vec4F32::Load(verts[0]);
		Vec4F32 y = Vec4F32::Load(verts[1]);
		Vec4F32 z = Vec4F32::Zero();
		Vec4F32 w = Vec4F32::Zero();
		Vec4F32::Transpose(x, y, z, w);
		// Now the names are accurate! Since we only have two vertices, the third and fourth member of each vector is zero
		// and will not be stored (well it will be stored, but it'll be overwritten by the next vertex).
		Vec4F32 recipW = w.Recip();

		x *= recipW;
		y *= recipW;
		z *= recipW;

		Vec4S32FromF32(x).Store2(tx + outCount);
		Vec4S32FromF32(y).Store2(ty + outCount);
		z.Clamp(0.0f, 65535.0f).Store2(tz + outCount);
		outCount += 2;
	}
	return outCount;
}

int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *transformed, const uint16_t *indexBuffer, const DepthDraw &draw, const DepthScissor scissor) {
	int outCount = 0;

	int flipCull = 0;
	if (draw.cullEnabled && draw.cullMode == GE_CULL_CW) {
		flipCull = 3;
	}
	const bool cullEnabled = draw.cullEnabled;
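
	// flipCull == 3 relies on the XOR below: 1 ^ 3 == 2 and 2 ^ 3 == 1, so the second and third
	// vertex of each triangle get swapped, flipping the winding for CW culling while vertex 0 stays put.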

	static const float zerovec[4] = {0.0f, 0.0f, 0.0f, 1.0f};

	int collected = 0;
	int planeCulled = 0;
	int boxCulled = 0;
	const float *verts[12]; // four triangles at a time!
	const int count = draw.vertexCount;

	// Not exactly the same guardband as on the real PSP, but good enough to prevent 16-bit overflow in raster.
	// This is slightly off-center since we are already in screen space, but whatever.
	Vec4S32 guardBandTopLeft = Vec4S32::Splat(-4096);
	Vec4S32 guardBandBottomRight = Vec4S32::Splat(4096);

	Vec4S32 scissorX1 = Vec4S32::Splat((float)scissor.x1);
	Vec4S32 scissorY1 = Vec4S32::Splat((float)scissor.y1);
	Vec4S32 scissorX2 = Vec4S32::Splat((float)scissor.x2);
	Vec4S32 scissorY2 = Vec4S32::Splat((float)scissor.y2);

	// Add cheap pre-projection pre-checks for bad triangles here. Not much we can do safely other than checking W.
	auto validVert = [](const float *v) -> bool {
		if (v[3] <= 0.0f || v[2] <= 0.0f) {
			return false;
		}
		/*
		if (v[2] >= 65535.0f * v[3]) {
			return false;
		}*/
		return true;
	};
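
	// validVert stands in for near-plane clipping: vertices behind the camera (w <= 0) or with a
	// non-positive z are rejected, and a triangle touching such a vertex is dropped entirely rather
	// than clipped, which keeps this fast path simple.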

	for (int i = 0; i < count; i += 3) {
		// Collect valid triangles into buffer.
		const float *v0 = transformed + indexBuffer[i] * 4;
		const float *v1 = transformed + indexBuffer[i + (1 ^ flipCull)] * 4;
		const float *v2 = transformed + indexBuffer[i + (2 ^ flipCull)] * 4;
		// Don't collect triangle if any vertex is beyond the planes.
		// TODO: Optimize this somehow.
		if (validVert(v0) && validVert(v1) && validVert(v2)) {
			verts[collected] = v0;
			verts[collected + 1] = v1;
			verts[collected + 2] = v2;
			collected += 3;
		} else {
			planeCulled++;
		}

		if (i >= count - 3 && collected != 12) {
			// Last iteration. Zero out any remaining triangles.
			for (int j = collected; j < 12; j++) {
				verts[j] = zerovec;
			}
			collected = 12;
		}

		if (collected != 12) {
			// Fetch more!
			continue;
		}
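
		// We now have 12 vertex pointers, i.e. a full batch of four triangles. The final batch may
		// have been padded with zerovec entries; those become degenerate triangles that the
		// bounding-box and area checks below reject, so they never reach the rasterizer.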

		collected = 0;

		// These names are wrong .. until we transpose.
		Vec4F32 x0 = Vec4F32::Load(verts[0]);
		Vec4F32 x1 = Vec4F32::Load(verts[1]);
		Vec4F32 x2 = Vec4F32::Load(verts[2]);
		Vec4F32 y0 = Vec4F32::Load(verts[3]);
		Vec4F32 y1 = Vec4F32::Load(verts[4]);
		Vec4F32 y2 = Vec4F32::Load(verts[5]);
		Vec4F32 z0 = Vec4F32::Load(verts[6]);
		Vec4F32 z1 = Vec4F32::Load(verts[7]);
		Vec4F32 z2 = Vec4F32::Load(verts[8]);
		Vec4F32 w0 = Vec4F32::Load(verts[9]);
		Vec4F32 w1 = Vec4F32::Load(verts[10]);
		Vec4F32 w2 = Vec4F32::Load(verts[11]);

		Vec4F32::Transpose(x0, y0, z0, w0);
		Vec4F32::Transpose(x1, y1, z1, w1);
		Vec4F32::Transpose(x2, y2, z2, w2);
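		// Each Load above grabbed one whole vertex as (x, y, z, w); the loads are arranged so that
		// x0/y0/z0/w0 come from vertex 0 of the four triangles, x1/... from vertex 1, and so on.
		// After the 4x4 transposes we are in structure-of-arrays form: x0 holds the four x
		// coordinates of vertex 0 across the batch, w2 the four w's of vertex 2, etc., so the
		// projection and culling below handle four triangles per operation.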

		// Now the names are accurate!

		// Let's project all three vertices, for all four triangles.
		Vec4F32 recipW0 = w0.Recip();
		Vec4F32 recipW1 = w1.Recip();
		Vec4F32 recipW2 = w2.Recip();
		x0 *= recipW0;
		y0 *= recipW0;
		z0 *= recipW0;
		x1 *= recipW1;
		y1 *= recipW1;
		z1 *= recipW1;
		x2 *= recipW2;
		y2 *= recipW2;
		z2 *= recipW2;

		// Check bounding box size. Cast to integer for crude rounding (and to approximately match the rasterizer).
		Vec4S32 minX = Vec4S32FromF32(x0.Min(x1.Min(x2)));
		Vec4S32 minY = Vec4S32FromF32(y0.Min(y1.Min(y2)));
		Vec4S32 maxX = Vec4S32FromF32(x0.Max(x1.Max(x2)));
		Vec4S32 maxY = Vec4S32FromF32(y0.Max(y1.Max(y2)));

		// If all are equal in any dimension, all four triangles are tiny nonsense and can be skipped early.
		Vec4S32 eqMask = minX.CompareEq(maxX) | minY.CompareEq(maxY);

		// Otherwise we just proceed to triangle setup with all four for now.
		// We could also save the computed boxes for later..
		// TODO: Merge into below checks? Though nice with an early out.
		if (!AnyZeroSignBit(eqMask)) {
			boxCulled += 4;
			continue;
		}

		// Create a mask to kill coordinates of triangles that poke outside the guardband (or are just empty).
		Vec4S32 inGuardBand =
			((minX.CompareGt(guardBandTopLeft) & maxX.CompareLt(guardBandBottomRight)) &
			(minY.CompareGt(guardBandTopLeft) & maxY.CompareLt(guardBandBottomRight))).AndNot(eqMask);

		// Create another mask to kill off-screen triangles. Not perfectly accurate.
		inGuardBand &= (maxX.CompareGt(scissorX1) & minX.CompareLt(scissorX2)) & (maxY.CompareGt(scissorY1) & minY.CompareLt(scissorY2));

		// It's enough to smash one coordinate to make future checks (like the tri area check) fail.
		x0 &= inGuardBand;
		x1 &= inGuardBand;
		x2 &= inGuardBand;

		// Floating point double triangle area. Can't be reused for the integer-snapped raster reliably (though may work...)
		// Still good for culling early and pretty cheap to compute.
		Vec4F32 doubleTriArea = (x1 - x0) * (y2 - y0) - (x2 - x0) * (y1 - y0) - Vec4F32::Splat((float)(MIN_TWICE_TRI_AREA));
		if (!AnyZeroSignBit(doubleTriArea)) {
			gpuStats.numDepthRasterEarlySize += 4;
			continue;
		}
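
		// Getting here means at least one lane had doubleTriArea above the threshold, i.e. at least
		// one of the four triangles is front-facing and big enough. The others are not filtered out
		// here; they get rejected per-triangle by the area check in DepthRaster4Triangles.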

		// Note: If any triangle is outside the guardband, (just) its X coords get zeroed, and it'll later get rejected.
		Vec4S32FromF32(x0).Store(tx + outCount);
		Vec4S32FromF32(x1).Store(tx + outCount + 4);
		Vec4S32FromF32(x2).Store(tx + outCount + 8);
		Vec4S32FromF32(y0).Store(ty + outCount);
		Vec4S32FromF32(y1).Store(ty + outCount + 4);
		Vec4S32FromF32(y2).Store(ty + outCount + 8);
		z0.Store(tz + outCount);
		z1.Store(tz + outCount + 4);
		z2.Store(tz + outCount + 8);

#ifdef _DEBUG
		for (int i = 0; i < 12; i++) {
			_dbg_assert_(tx[outCount + i] < 32767);
			_dbg_assert_(tx[outCount + i] >= -32768);
			_dbg_assert_(ty[outCount + i] < 32767);
			_dbg_assert_(ty[outCount + i] >= -32768);
		}
#endif

		outCount += 12;

		if (!cullEnabled) {
			// If culling is off, store the triangles again with the second and third vertices swapped, so the opposite winding is rasterized too.
			(Vec4S32FromF32(x0) & inGuardBand).Store(tx + outCount);
			(Vec4S32FromF32(x2) & inGuardBand).Store(tx + outCount + 4);
			(Vec4S32FromF32(x1) & inGuardBand).Store(tx + outCount + 8);
			Vec4S32FromF32(y0).Store(ty + outCount);
			Vec4S32FromF32(y2).Store(ty + outCount + 4);
			Vec4S32FromF32(y1).Store(ty + outCount + 8);
			z0.Store(tz + outCount);
			z2.Store(tz + outCount + 4);
			z1.Store(tz + outCount + 8);

			outCount += 12;
		}
	}

	gpuStats.numDepthRasterZCulled += planeCulled;
	gpuStats.numDepthEarlyBoxCulled += boxCulled;
	return outCount;
}

// Rasterizes screen-space vertices.
void DepthRasterScreenVerts(uint16_t *depth, int depthStride, const int *tx, const int *ty, const float *tz, int count, const DepthDraw &draw, const DepthScissor scissor, bool lowQ) {
	// Prim should now be either TRIANGLES or RECTs.
	_dbg_assert_(draw.prim == GE_PRIM_RECTANGLES || draw.prim == GE_PRIM_TRIANGLES);

	switch (draw.prim) {
	case GE_PRIM_RECTANGLES:
		for (int i = 0; i < count; i += 2) {
			uint16_t z = (uint16_t)tz[i + 1]; // depth from second vertex
			// TODO: Should clip coordinates to the scissor rectangle.
			// We remove the subpixel information here.
			DepthRasterRect(depth, depthStride, scissor, tx[i], ty[i], tx[i + 1], ty[i + 1], z, draw.compareMode);
		}
		gpuStats.numDepthRasterPrims += count / 2;
		break;
	case GE_PRIM_TRIANGLES:
	{
		int stats[3]{};
		// Batches of 4 triangles, as output by the clip function.
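		// count here is in vertices: each batch is 12 entries (4 triangles x 3 vertices) in the
		// layout produced by DepthRasterClipIndexedTriangles, hence the loops striding by 12.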
		if (lowQ) {
			switch (draw.compareMode) {
			case ZCompareMode::Greater:
			{
				for (int i = 0; i < count; i += 12) {
					DepthRaster4Triangles<ZCompareMode::Greater, true>(stats, depth, depthStride, scissor, &tx[i], &ty[i], &tz[i]);
				}
				break;
			}
			case ZCompareMode::Less:
			{
				for (int i = 0; i < count; i += 12) {
					DepthRaster4Triangles<ZCompareMode::Less, true>(stats, depth, depthStride, scissor, &tx[i], &ty[i], &tz[i]);
				}
				break;
			}
			case ZCompareMode::Always:
			{
				for (int i = 0; i < count; i += 12) {
					DepthRaster4Triangles<ZCompareMode::Always, true>(stats, depth, depthStride, scissor, &tx[i], &ty[i], &tz[i]);
				}
				break;
			}
			}
		} else {
			switch (draw.compareMode) {
			case ZCompareMode::Greater:
			{
				for (int i = 0; i < count; i += 12) {
					DepthRaster4Triangles<ZCompareMode::Greater, false>(stats, depth, depthStride, scissor, &tx[i], &ty[i], &tz[i]);
				}
				break;
			}
			case ZCompareMode::Less:
			{
				for (int i = 0; i < count; i += 12) {
					DepthRaster4Triangles<ZCompareMode::Less, false>(stats, depth, depthStride, scissor, &tx[i], &ty[i], &tz[i]);
				}
				break;
			}
			case ZCompareMode::Always:
			{
				for (int i = 0; i < count; i += 12) {
					DepthRaster4Triangles<ZCompareMode::Always, false>(stats, depth, depthStride, scissor, &tx[i], &ty[i], &tz[i]);
				}
				break;
			}
			}
		}
		gpuStats.numDepthRasterNoPixels += stats[(int)TriangleStat::NoPixels];
		gpuStats.numDepthRasterTooSmall += stats[(int)TriangleStat::SmallOrBackface];
		gpuStats.numDepthRasterPrims += stats[(int)TriangleStat::OK];
		break;
	}
	default:
		_dbg_assert_(false);
	}
}