GitHub Repository: hrydgard/ppsspp
Path: blob/master/GPU/Common/DepthRaster.cpp
#include <algorithm>
#include <cstring>
#include <cstdint>

#include "Common/Math/CrossSIMD.h"
#include "GPU/Common/DepthRaster.h"
#include "GPU/Math3D.h"
#include "Common/Math/math_util.h"
#include "GPU/Common/VertexDecoderCommon.h"

DepthScissor DepthScissor::Tile(int tile, int numTiles) const {
	if (numTiles == 1) {
		return *this;
	}
	// First tiling algorithm: Split into vertical slices.
	int w = x2 - x1;
	int tileW = (w / numTiles) & ~3; // Round to four pixels.
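	// Example with illustrative values: if x1 = 0, x2 = 480 and numTiles = 3, then
	// tileW = (480 / 3) & ~3 = 160, so tile 0 spans x = 0..160, tile 1 spans 160..320 and
	// tile 2 spans 320..480. The last tile always ends at the original x2, so any remainder
	// lost to the rounding is absorbed there.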

	// TODO: Should round x1 to four pixels as well (except for the first tile).

	DepthScissor scissor;
	scissor.x1 = x1 + tileW * tile;
	scissor.x2 = (tile == numTiles - 1) ? x2 : (x1 + tileW * (tile + 1));
	scissor.y1 = y1;
	scissor.y2 = y2;
	return scissor;
}

// x1/x2 etc are the scissor rect.
static void DepthRasterRect(uint16_t *dest, int stride, const DepthScissor scissor, int v1x, int v1y, int v2x, int v2y, short depthValue, ZCompareMode compareMode) {
	// Swap coordinates if needed; we don't back-face-cull rects.
	// We also ignore the UV rotation here.
	if (v1x > v2x) {
		std::swap(v1x, v2x);
	}
	if (v1y > v2y) {
		std::swap(v1y, v2y);
	}

	if (v1x < scissor.x1) {
		v1x = scissor.x1;
	}
	if (v2x > scissor.x2) {
		v2x = scissor.x2 + 1; // PSP scissors are inclusive
	}
	if (v1x >= v2x) {
		return;
	}

	if (v1y < scissor.y1) {
		v1y = scissor.y1;
	}
	if (v2y > scissor.y2) {
		v2y = scissor.y2 + 1;
	}
	if (v1y >= v2y) {
		return;
	}
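
	// At this point the rect has been clamped to the scissor and rejected if empty. The loops below
	// treat v2x/v2y as exclusive bounds, which is why the clamp above adds 1 when snapping to the
	// inclusive PSP scissor edge.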

	Vec8U16 valueX8 = Vec8U16::Splat(depthValue);
	for (int y = v1y; y < v2y; y++) {
		uint16_t *ptr = (uint16_t *)(dest + stride * y + v1x);
		int w = v2x - v1x;
		switch (compareMode) {
		case ZCompareMode::Always:
			if (depthValue == 0) {
				memset(ptr, 0, w * 2);
			} else {
				while (w >= 8) {
					valueX8.Store(ptr);
					ptr += 8;
					w -= 8;
				}
				// Non-simd trailer.
				while (w > 0) {
					*ptr++ = depthValue;
					w--;
				}
			}
			break;
		default:
			// TODO
			break;
		}
	}
}

alignas(16) static const int zero123[4] = {0, 1, 2, 3};

enum class TriangleStat {
	OK,
	NoPixels,
	SmallOrBackface,
};

constexpr int MIN_TWICE_TRI_AREA = 10;

// A mix of ideas from Intel's sample and ryg's rasterizer blog series.
template<ZCompareMode compareMode, bool lowQ>
void DepthRaster4Triangles(int stats[3], uint16_t *depthBuf, int stride, DepthScissor scissor, const int *tx, const int *ty, const float *tz) {
	// Triangle setup. This is done using SIMD, four triangles at a time.
	// 16x16->32 multiplications are doable on SSE2, which should be all we need.

	// We use 4x1 SIMD tiles for simplicity. 2x2 would be ideal but stores/loads get annoying.

	// NOTE: Triangles are stored in groups of 4.
	Vec4S32 x0 = Vec4S32::LoadAligned(tx);
	Vec4S32 y0 = Vec4S32::LoadAligned(ty);
	Vec4S32 x1 = Vec4S32::LoadAligned(tx + 4);
	Vec4S32 y1 = Vec4S32::LoadAligned(ty + 4);
	Vec4S32 x2 = Vec4S32::LoadAligned(tx + 8);
	Vec4S32 y2 = Vec4S32::LoadAligned(ty + 8);

	if (lowQ) {
		y0 &= Vec4S32::Splat(~1);
		y1 &= Vec4S32::Splat(~1);
		y2 &= Vec4S32::Splat(~1);
	}
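
	// lowQ halves the vertical resolution: Y coordinates are snapped to even rows here, the
	// rasterizer steps two rows at a time, and each 4-pixel result is stored to two adjacent rows.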

	// FixupAfterMinMax is just 16->32 sign extension, in case the current platform (like SSE2) just has 16-bit min/max operations.
	Vec4S32 minX = x0.Min16(x1).Min16(x2).Max16(Vec4S32::Splat(scissor.x1)).FixupAfterMinMax();
	Vec4S32 maxX = x0.Max16(x1).Max16(x2).Min16(Vec4S32::Splat(scissor.x2)).FixupAfterMinMax();
	Vec4S32 minY = y0.Min16(y1).Min16(y2).Max16(Vec4S32::Splat(scissor.y1)).FixupAfterMinMax();
	Vec4S32 maxY = y0.Max16(y1).Max16(y2).Min16(Vec4S32::Splat(scissor.y2)).FixupAfterMinMax();

	Vec4S32 triArea = (x1 - x0).Mul16(y2 - y0) - (x2 - x0).Mul16(y1 - y0);
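	// triArea is twice the signed area of each triangle (the z of the cross product of two edge
	// vectors). With the winding set up by the clip pass, degenerate or back-facing triangles come
	// out zero or negative and are rejected against MIN_TWICE_TRI_AREA in the per-triangle loop below.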

	// Edge setup
	Vec4S32 A12 = y1 - y2;
	Vec4S32 B12 = x2 - x1;
	Vec4S32 C12 = x1.Mul16(y2) - y1.Mul16(x2);

	Vec4S32 A20 = y2 - y0;
	Vec4S32 B20 = x0 - x2;
	Vec4S32 C20 = x2.Mul16(y0) - y2.Mul16(x0);

	Vec4S32 A01 = y0 - y1;
	Vec4S32 B01 = x1 - x0;
	Vec4S32 C01 = x0.Mul16(y1) - y0.Mul16(x1);
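	// Each (A, B, C) triple defines an edge function E(x, y) = A*x + B*y + C for one edge,
	// non-negative on the interior side for the winding that passes the area check. Evaluated per
	// pixel these become w0/w1/w2 below, which double as unnormalized barycentric weights; stepping
	// one pixel in x or y just adds A or B, which is what the step deltas exploit.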

	constexpr int stepXSize = 4;
	constexpr int stepYSize = lowQ ? 2 : 1;

	constexpr int stepXShift = 2;
	constexpr int stepYShift = lowQ ? 1 : 0;

	// Step deltas
	Vec4S32 stepX12 = A12.Shl<stepXShift>();
	Vec4S32 stepY12 = B12.Shl<stepYShift>();
	Vec4S32 stepX20 = A20.Shl<stepXShift>();
	Vec4S32 stepY20 = B20.Shl<stepYShift>();
	Vec4S32 stepX01 = A01.Shl<stepXShift>();
	Vec4S32 stepY01 = B01.Shl<stepYShift>();

	// Prepare to interpolate Z
	Vec4F32 oneOverTriArea = Vec4F32FromS32(triArea).Recip();
	Vec4F32 zbase = Vec4F32::LoadAligned(tz);
	Vec4F32 z_20 = (Vec4F32::LoadAligned(tz + 4) - zbase) * oneOverTriArea;
	Vec4F32 z_01 = (Vec4F32::LoadAligned(tz + 8) - zbase) * oneOverTriArea;
	Vec4F32 zdx = z_20 * Vec4F32FromS32(stepX20) + z_01 * Vec4F32FromS32(stepX01);
	Vec4F32 zdy = z_20 * Vec4F32FromS32(stepY20) + z_01 * Vec4F32FromS32(stepY01);
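	// Z is interpolated barycentrically: per pixel, z = zbase + w1 * z_20 + w2 * z_01, where w1/w2
	// are the unnormalized edge weights and the 1/triArea factor has already been folded into
	// z_20/z_01. zdx/zdy are the corresponding increments for one step in x or y.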

	// Shared setup is done, now loop per-triangle in the group of four.
	for (int t = 0; t < 4; t++) {
		// Check for bad triangle.
		// Using operator[] on the vectors actually seems to result in pretty good code.
		if (maxX[t] <= minX[t] || maxY[t] <= minY[t]) {
			// No pixels, or outside screen.
			// Most of these are now gone in the initial pass, but not all since we cull
			// in 4-groups there.
			stats[(int)TriangleStat::NoPixels]++;
			continue;
		}

		if (triArea[t] < MIN_TWICE_TRI_AREA) {
			stats[(int)TriangleStat::SmallOrBackface]++; // Or zero area.
			continue;
		}

		const int minXT = minX[t] & ~3;
		const int maxXT = maxX[t] & ~3;

		const int minYT = minY[t];
		const int maxYT = maxY[t];

		// Convert to wide registers.
		Vec4S32 initialX = Vec4S32::Splat(minXT) + Vec4S32::LoadAligned(zero123);
		int initialY = minY[t];
		_dbg_assert_(A12[t] < 32767);
		_dbg_assert_(A12[t] > -32767);
		_dbg_assert_(A20[t] < 32767);
		_dbg_assert_(A20[t] > -32767);
		_dbg_assert_(A01[t] < 32767);
		_dbg_assert_(A01[t] > -32767);

		// TODO: The latter subexpression can be broken out of this loop, but reduces block size flexibility.
		Vec4S32 w0_row = Vec4S32::Splat(A12[t]).Mul16(initialX) + Vec4S32::Splat(B12[t] * initialY + C12[t]);
		Vec4S32 w1_row = Vec4S32::Splat(A20[t]).Mul16(initialX) + Vec4S32::Splat(B20[t] * initialY + C20[t]);
		Vec4S32 w2_row = Vec4S32::Splat(A01[t]).Mul16(initialX) + Vec4S32::Splat(B01[t] * initialY + C01[t]);

		Vec4F32 zrow = Vec4F32::Splat(zbase[t]) + Vec4F32FromS32(w1_row) * z_20[t] + Vec4F32FromS32(w2_row) * z_01[t];
		Vec4F32 zdeltaX = Vec4F32::Splat(zdx[t]);
		Vec4F32 zdeltaY = Vec4F32::Splat(zdy[t]);

		Vec4S32 oneStepX12 = Vec4S32::Splat(stepX12[t]);
		Vec4S32 oneStepY12 = Vec4S32::Splat(stepY12[t]);
		Vec4S32 oneStepX20 = Vec4S32::Splat(stepX20[t]);
		Vec4S32 oneStepY20 = Vec4S32::Splat(stepY20[t]);
		Vec4S32 oneStepX01 = Vec4S32::Splat(stepX01[t]);
		Vec4S32 oneStepY01 = Vec4S32::Splat(stepY01[t]);
		// Rasterize
		for (int y = minYT; y <= maxYT; y += stepYSize, w0_row += oneStepY12, w1_row += oneStepY20, w2_row += oneStepY01, zrow += zdeltaY) {
			// Barycentric coordinates at start of row
			Vec4S32 w0 = w0_row;
			Vec4S32 w1 = w1_row;
			Vec4S32 w2 = w2_row;
			Vec4F32 zs = zrow;

			uint16_t *rowPtr = depthBuf + stride * y;

			for (int x = minXT; x <= maxXT; x += stepXSize, w0 += oneStepX12, w1 += oneStepX20, w2 += oneStepX01, zs += zdeltaX) {
				// If p is on or inside all edges for any of the four pixels, render those pixels.
				Vec4S32 signCalc = w0 | w1 | w2;
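				// OR-ing the three weights means a lane's sign bit is set if any of them is negative
				// there, i.e. the pixel is outside at least one edge. A lane with a clear sign bit is
				// a covered pixel, and AnyZeroSignBit below tells us whether any of these four pixels
				// needs to be considered at all.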

				// TODO: Check if this check is profitable. Maybe only for big triangles?
				if (!AnyZeroSignBit(signCalc)) {
					continue;
				}

				Vec4U16 bufferValues = Vec4U16::Load(rowPtr + x);
				Vec4U16 shortMaskInv = SignBits32ToMaskU16(signCalc);
				// Now, the mask is all ones (0xFFFF) where we should preserve the contents of the depth buffer.

				Vec4U16 shortZ = Vec4U16::FromVec4F32(zs);

				// This switch is on a templated constant, so should collapse away.
				Vec4U16 writeVal;
				switch (compareMode) {
				case ZCompareMode::Greater:
					// To implement the greater/greater-than comparison, we can combine mask and max.
					// Unfortunately there's no unsigned max on SSE2; it's synthesized by xoring 0x8000 on input and output.
					// We use AndNot to zero out Z results, before doing Max with the buffer.
					writeVal = shortZ.AndNot(shortMaskInv).Max(bufferValues);
					break;
				case ZCompareMode::Less:
					// This time, we OR the mask and use .Min.
					writeVal = (shortZ | shortMaskInv).Min(bufferValues);
					break;
				case ZCompareMode::Always: // UNTESTED
					// This could be replaced with a vblend operation.
					writeVal = ((bufferValues & shortMaskInv) | shortZ.AndNot(shortMaskInv));
					break;
				}
				writeVal.Store(rowPtr + x);
				if (lowQ) {
					writeVal.Store(rowPtr + stride + x);
				}
			}
		}

		stats[(int)TriangleStat::OK]++;
	}
}

// This will always run on the main thread. Though, we might consider moving the transforms out and just storing verts instead?
void DecodeAndTransformForDepthRaster(float *dest, const float *worldviewproj, const void *vertexData, int indexLowerBound, int indexUpperBound, const VertexDecoder *dec, u32 vertTypeID) {
	// TODO: Ditch skinned and morphed prims for now since we don't have a fast way to skin without running the full decoder.
	_dbg_assert_((vertTypeID & (GE_VTYPE_WEIGHT_MASK | GE_VTYPE_MORPHCOUNT_MASK)) == 0);

	int vertexStride = dec->VertexSize();
	int offset = dec->posoff;

	Mat4F32 mat(worldviewproj);

	const u8 *startPtr = (const u8 *)vertexData + indexLowerBound * vertexStride;
	int count = indexUpperBound - indexLowerBound + 1;
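
	// All three position formats below end up as floats in the same range: the 16-bit and 8-bit
	// paths scale by 1.0f / 32768.f and 1.0f / 128.0f respectively to undo the fixed-point
	// encoding, so every case feeds equivalent positions into the shared matrix transform.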

	switch (vertTypeID & GE_VTYPE_POS_MASK) {
	case GE_VTYPE_POS_FLOAT:
		for (int i = 0; i < count; i++) {
			const float *data = (const float *)(startPtr + i * vertexStride + offset);
			Vec4F32::Load(data).AsVec3ByMatrix44(mat).Store(dest + i * 4);
		}
		break;
	case GE_VTYPE_POS_16BIT:
		for (int i = 0; i < count; i++) {
			const s16 *data = ((const s16 *)((const s8 *)startPtr + i * vertexStride + offset));
			Vec4F32::LoadConvertS16(data).Mul(1.0f / 32768.f).AsVec3ByMatrix44(mat).Store(dest + i * 4);
		}
		break;
	case GE_VTYPE_POS_8BIT:
		for (int i = 0; i < count; i++) {
			const s8 *data = (const s8 *)startPtr + i * vertexStride + offset;
			Vec4F32::LoadConvertS8(data).Mul(1.0f / 128.0f).AsVec3ByMatrix44(mat).Store(dest + i * 4);
		}
		break;
	}
}

void TransformPredecodedForDepthRaster(float *dest, const float *worldviewproj, const void *decodedVertexData, const VertexDecoder *dec, int count) {
	// TODO: Ditch skinned and morphed prims for now since we don't have a fast way to skin without running the full decoder.
	_dbg_assert_((dec->VertexType() & (GE_VTYPE_WEIGHT_MASK | GE_VTYPE_MORPHCOUNT_MASK)) == 0);

	int vertexStride = dec->GetDecVtxFmt().stride;
	int offset = dec->GetDecVtxFmt().posoff;

	Mat4F32 mat(worldviewproj);

	const u8 *startPtr = (const u8 *)decodedVertexData;
	// Decoded position format is always float3.
	for (int i = 0; i < count; i++) {
		const float *data = (const float *)(startPtr + i * vertexStride + offset);
		Vec4F32::Load(data).AsVec3ByMatrix44(mat).Store(dest + i * 4);
	}
}

void ConvertPredecodedThroughForDepthRaster(float *dest, const void *decodedVertexData, const VertexDecoder *dec, int count) {
	// TODO: Ditch skinned and morphed prims for now since we don't have a fast way to skin without running the full decoder.
	_dbg_assert_((dec->VertexType() & (GE_VTYPE_WEIGHT_MASK | GE_VTYPE_MORPHCOUNT_MASK)) == 0);

	int vertexStride = dec->GetDecVtxFmt().stride;
	int offset = dec->GetDecVtxFmt().posoff;

	const u8 *startPtr = (const u8 *)decodedVertexData;
	// Decoded position format is always float3.
	for (int i = 0; i < count; i++) {
		const float *data = (const float *)(startPtr + i * vertexStride + offset);
		// Just pass the position straight through - this is through mode!
		// A W of one makes projection a no-op, without branching.
		Vec4F32::Load(data).WithLane3One().Store(dest + i * 4);
	}
}

int DepthRasterClipIndexedRectangles(int *tx, int *ty, float *tz, const float *transformed, const uint16_t *indexBuffer, const DepthDraw &draw, const DepthScissor scissor) {
	int outCount = 0;
	const int count = draw.vertexCount;
	for (int i = 0; i < count; i += 2) {
		const float *verts[2] = {
			transformed + indexBuffer[i] * 4,
			transformed + indexBuffer[i + 1] * 4,
		};

		// Check if any vertex is behind the 0 plane.
		if (verts[0][3] < 0.0f || verts[1][3] < 0.0f) {
			// Ditch this rectangle.
			continue;
		}

		// These names are wrong .. until we transpose.
		// TODO: Maybe combine two rects here at a time. But hardly relevant for performance.
		Vec4F32 x = Vec4F32::Load(verts[0]);
		Vec4F32 y = Vec4F32::Load(verts[1]);
		Vec4F32 z = Vec4F32::Zero();
		Vec4F32 w = Vec4F32::Zero();
		Vec4F32::Transpose(x, y, z, w);
		// Now the names are accurate! Since we only have two vertices, the third and fourth member of each vector is zero
		// and will not be stored (well it will be stored, but it'll be overwritten by the next vertex).
		Vec4F32 recipW = w.Recip();

		x *= recipW;
		y *= recipW;
		z *= recipW;

		Vec4S32FromF32(x).Store2(tx + outCount);
		Vec4S32FromF32(y).Store2(ty + outCount);
		z.Clamp(0.0f, 65535.0f).Store2(tz + outCount);
		outCount += 2;
	}
	return outCount;
}

int DepthRasterClipIndexedTriangles(int *tx, int *ty, float *tz, const float *transformed, const uint16_t *indexBuffer, const DepthDraw &draw, const DepthScissor scissor) {
	int outCount = 0;

	int flipCull = 0;
	if (draw.cullEnabled && draw.cullMode == GE_CULL_CW) {
		flipCull = 3;
	}
	const bool cullEnabled = draw.cullEnabled;
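
	// flipCull == 3 relies on the XOR below: 1 ^ 3 == 2 and 2 ^ 3 == 1, so the second and third
	// vertex of each triangle get swapped, flipping the winding for CW culling while vertex 0 stays put.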

	static const float zerovec[4] = {0.0f, 0.0f, 0.0f, 1.0f};

	int collected = 0;
	int planeCulled = 0;
	int boxCulled = 0;
	const float *verts[12]; // four triangles at a time!
	const int count = draw.vertexCount;

	// Not exactly the same guardband as on the real PSP, but good enough to prevent 16-bit overflow in raster.
	// This is slightly off-center since we are already in screen space, but whatever.
	Vec4S32 guardBandTopLeft = Vec4S32::Splat(-4096);
	Vec4S32 guardBandBottomRight = Vec4S32::Splat(4096);

	Vec4S32 scissorX1 = Vec4S32::Splat((float)scissor.x1);
	Vec4S32 scissorY1 = Vec4S32::Splat((float)scissor.y1);
	Vec4S32 scissorX2 = Vec4S32::Splat((float)scissor.x2);
	Vec4S32 scissorY2 = Vec4S32::Splat((float)scissor.y2);

	// Add cheap pre-projection pre-checks for bad triangles here. Not much we can do safely other than checking W.
	auto validVert = [](const float *v) -> bool {
		if (v[3] <= 0.0f || v[2] <= 0.0f) {
			return false;
		}
		/*
		if (v[2] >= 65535.0f * v[3]) {
			return false;
		}*/
		return true;
	};
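
	// validVert stands in for near-plane clipping: vertices behind the camera (w <= 0) or with a
	// non-positive z are rejected, and a triangle touching such a vertex is dropped entirely rather
	// than clipped, which keeps this fast path simple.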

	for (int i = 0; i < count; i += 3) {
		// Collect valid triangles into buffer.
		const float *v0 = transformed + indexBuffer[i] * 4;
		const float *v1 = transformed + indexBuffer[i + (1 ^ flipCull)] * 4;
		const float *v2 = transformed + indexBuffer[i + (2 ^ flipCull)] * 4;
		// Don't collect triangle if any vertex is beyond the planes.
		// TODO: Optimize this somehow.
		if (validVert(v0) && validVert(v1) && validVert(v2)) {
			verts[collected] = v0;
			verts[collected + 1] = v1;
			verts[collected + 2] = v2;
			collected += 3;
		} else {
			planeCulled++;
		}

		if (i >= count - 3 && collected != 12) {
			// Last iteration. Zero out any remaining triangles.
			for (int j = collected; j < 12; j++) {
				verts[j] = zerovec;
			}
			collected = 12;
		}

		if (collected != 12) {
			// Fetch more!
			continue;
		}
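
		// We now have 12 vertex pointers, i.e. a full batch of four triangles. The final batch may
		// have been padded with zerovec entries; those become degenerate triangles that the
		// bounding-box and area checks below reject, so they never reach the rasterizer.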

		collected = 0;

		// These names are wrong .. until we transpose.
		Vec4F32 x0 = Vec4F32::Load(verts[0]);
		Vec4F32 x1 = Vec4F32::Load(verts[1]);
		Vec4F32 x2 = Vec4F32::Load(verts[2]);
		Vec4F32 y0 = Vec4F32::Load(verts[3]);
		Vec4F32 y1 = Vec4F32::Load(verts[4]);
		Vec4F32 y2 = Vec4F32::Load(verts[5]);
		Vec4F32 z0 = Vec4F32::Load(verts[6]);
		Vec4F32 z1 = Vec4F32::Load(verts[7]);
		Vec4F32 z2 = Vec4F32::Load(verts[8]);
		Vec4F32 w0 = Vec4F32::Load(verts[9]);
		Vec4F32 w1 = Vec4F32::Load(verts[10]);
		Vec4F32 w2 = Vec4F32::Load(verts[11]);

		Vec4F32::Transpose(x0, y0, z0, w0);
		Vec4F32::Transpose(x1, y1, z1, w1);
		Vec4F32::Transpose(x2, y2, z2, w2);
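		// Each Load above grabbed one whole vertex as (x, y, z, w); the loads are arranged so that
		// x0/y0/z0/w0 come from vertex 0 of the four triangles, x1/... from vertex 1, and so on.
		// After the 4x4 transposes we are in structure-of-arrays form: x0 holds the four x
		// coordinates of vertex 0 across the batch, w2 the four w's of vertex 2, etc., so the
		// projection and culling below handle four triangles per operation.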

		// Now the names are accurate!

		// Let's project all three vertices, for all four triangles.
		Vec4F32 recipW0 = w0.Recip();
		Vec4F32 recipW1 = w1.Recip();
		Vec4F32 recipW2 = w2.Recip();
		x0 *= recipW0;
		y0 *= recipW0;
		z0 *= recipW0;
		x1 *= recipW1;
		y1 *= recipW1;
		z1 *= recipW1;
		x2 *= recipW2;
		y2 *= recipW2;
		z2 *= recipW2;

		// Check bounding box size. Cast to integer for crude rounding (and to approximately match the rasterizer).
		Vec4S32 minX = Vec4S32FromF32(x0.Min(x1.Min(x2)));
		Vec4S32 minY = Vec4S32FromF32(y0.Min(y1.Min(y2)));
		Vec4S32 maxX = Vec4S32FromF32(x0.Max(x1.Max(x2)));
		Vec4S32 maxY = Vec4S32FromF32(y0.Max(y1.Max(y2)));

		// If all are equal in any dimension, all four triangles are tiny nonsense and can be skipped early.
		Vec4S32 eqMask = minX.CompareEq(maxX) | minY.CompareEq(maxY);

		// Otherwise we just proceed to triangle setup with all four for now.
		// We could also save the computed boxes for later..
		// TODO: Merge into below checks? Though nice with an early out.
		if (!AnyZeroSignBit(eqMask)) {
			boxCulled += 4;
			continue;
		}

		// Create a mask to kill coordinates of triangles that poke outside the guardband (or are just empty).
		Vec4S32 inGuardBand =
			((minX.CompareGt(guardBandTopLeft) & maxX.CompareLt(guardBandBottomRight)) &
			(minY.CompareGt(guardBandTopLeft) & maxY.CompareLt(guardBandBottomRight))).AndNot(eqMask);

		// Create another mask to kill off-screen triangles. Not perfectly accurate.
		inGuardBand &= (maxX.CompareGt(scissorX1) & minX.CompareLt(scissorX2)) & (maxY.CompareGt(scissorY1) & minY.CompareLt(scissorY2));

		// It's enough to smash one coordinate to make future checks (like the tri area check) fail.
		x0 &= inGuardBand;
		x1 &= inGuardBand;
		x2 &= inGuardBand;

		// Floating point double triangle area. Can't be reused for the integer-snapped raster reliably (though may work...)
		// Still good for culling early and pretty cheap to compute.
		Vec4F32 doubleTriArea = (x1 - x0) * (y2 - y0) - (x2 - x0) * (y1 - y0) - Vec4F32::Splat((float)(MIN_TWICE_TRI_AREA));
		if (!AnyZeroSignBit(doubleTriArea)) {
			gpuStats.numDepthRasterEarlySize += 4;
			continue;
		}
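
		// Getting here means at least one lane had doubleTriArea above the threshold, i.e. at least
		// one of the four triangles is front-facing and big enough. The others are not filtered out
		// here; they get rejected per-triangle by the area check in DepthRaster4Triangles.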

		// Note: If any triangle is outside the guardband, (just) its X coords get zeroed, and it'll later get rejected.
		Vec4S32FromF32(x0).Store(tx + outCount);
		Vec4S32FromF32(x1).Store(tx + outCount + 4);
		Vec4S32FromF32(x2).Store(tx + outCount + 8);
		Vec4S32FromF32(y0).Store(ty + outCount);
		Vec4S32FromF32(y1).Store(ty + outCount + 4);
		Vec4S32FromF32(y2).Store(ty + outCount + 8);
		z0.Store(tz + outCount);
		z1.Store(tz + outCount + 4);
		z2.Store(tz + outCount + 8);

#ifdef _DEBUG
		for (int i = 0; i < 12; i++) {
			_dbg_assert_(tx[outCount + i] < 32767);
			_dbg_assert_(tx[outCount + i] >= -32768);
			_dbg_assert_(ty[outCount + i] < 32767);
			_dbg_assert_(ty[outCount + i] >= -32768);
		}
#endif

		outCount += 12;

		if (!cullEnabled) {
			// If culling is off, store the triangles again with the second and third vertices swapped, so the opposite winding is rasterized too.
			(Vec4S32FromF32(x0) & inGuardBand).Store(tx + outCount);
			(Vec4S32FromF32(x2) & inGuardBand).Store(tx + outCount + 4);
			(Vec4S32FromF32(x1) & inGuardBand).Store(tx + outCount + 8);
			Vec4S32FromF32(y0).Store(ty + outCount);
			Vec4S32FromF32(y2).Store(ty + outCount + 4);
			Vec4S32FromF32(y1).Store(ty + outCount + 8);
			z0.Store(tz + outCount);
			z2.Store(tz + outCount + 4);
			z1.Store(tz + outCount + 8);

			outCount += 12;
		}
	}

	gpuStats.numDepthRasterZCulled += planeCulled;
	gpuStats.numDepthEarlyBoxCulled += boxCulled;
	return outCount;
}

// Rasterizes screen-space vertices.
void DepthRasterScreenVerts(uint16_t *depth, int depthStride, const int *tx, const int *ty, const float *tz, int count, const DepthDraw &draw, const DepthScissor scissor, bool lowQ) {
	// Prim should now be either TRIANGLES or RECTs.
	_dbg_assert_(draw.prim == GE_PRIM_RECTANGLES || draw.prim == GE_PRIM_TRIANGLES);

	switch (draw.prim) {
	case GE_PRIM_RECTANGLES:
		for (int i = 0; i < count; i += 2) {
			uint16_t z = (uint16_t)tz[i + 1]; // depth from second vertex
			// TODO: Should clip coordinates to the scissor rectangle.
			// We remove the subpixel information here.
			DepthRasterRect(depth, depthStride, scissor, tx[i], ty[i], tx[i + 1], ty[i + 1], z, draw.compareMode);
		}
		gpuStats.numDepthRasterPrims += count / 2;
		break;
	case GE_PRIM_TRIANGLES:
	{
		int stats[3]{};
		// Batches of 4 triangles, as output by the clip function.
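		// count here is in vertices: each batch is 12 entries (4 triangles x 3 vertices) in the
		// layout produced by DepthRasterClipIndexedTriangles, hence the loops striding by 12.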
		if (lowQ) {
			switch (draw.compareMode) {
			case ZCompareMode::Greater:
			{
				for (int i = 0; i < count; i += 12) {
					DepthRaster4Triangles<ZCompareMode::Greater, true>(stats, depth, depthStride, scissor, &tx[i], &ty[i], &tz[i]);
				}
				break;
			}
			case ZCompareMode::Less:
			{
				for (int i = 0; i < count; i += 12) {
					DepthRaster4Triangles<ZCompareMode::Less, true>(stats, depth, depthStride, scissor, &tx[i], &ty[i], &tz[i]);
				}
				break;
			}
			case ZCompareMode::Always:
			{
				for (int i = 0; i < count; i += 12) {
					DepthRaster4Triangles<ZCompareMode::Always, true>(stats, depth, depthStride, scissor, &tx[i], &ty[i], &tz[i]);
				}
				break;
			}
			}
		} else {
			switch (draw.compareMode) {
			case ZCompareMode::Greater:
			{
				for (int i = 0; i < count; i += 12) {
					DepthRaster4Triangles<ZCompareMode::Greater, false>(stats, depth, depthStride, scissor, &tx[i], &ty[i], &tz[i]);
				}
				break;
			}
			case ZCompareMode::Less:
			{
				for (int i = 0; i < count; i += 12) {
					DepthRaster4Triangles<ZCompareMode::Less, false>(stats, depth, depthStride, scissor, &tx[i], &ty[i], &tz[i]);
				}
				break;
			}
			case ZCompareMode::Always:
			{
				for (int i = 0; i < count; i += 12) {
					DepthRaster4Triangles<ZCompareMode::Always, false>(stats, depth, depthStride, scissor, &tx[i], &ty[i], &tz[i]);
				}
				break;
			}
			}
		}
		gpuStats.numDepthRasterNoPixels += stats[(int)TriangleStat::NoPixels];
		gpuStats.numDepthRasterTooSmall += stats[(int)TriangleStat::SmallOrBackface];
		gpuStats.numDepthRasterPrims += stats[(int)TriangleStat::OK];
		break;
	}
	default:
		_dbg_assert_(false);
	}
}