Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
hrydgard
GitHub Repository: hrydgard/ppsspp
Path: blob/master/GPU/Software/Sampler.cpp
3186 views
1
// Copyright (c) 2017- PPSSPP Project.
2
3
// This program is free software: you can redistribute it and/or modify
4
// it under the terms of the GNU General Public License as published by
5
// the Free Software Foundation, version 2.0 or later versions.
6
7
// This program is distributed in the hope that it will be useful,
8
// but WITHOUT ANY WARRANTY; without even the implied warranty of
9
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
// GNU General Public License 2.0 for more details.
11
12
// A copy of the GPL 2.0 should have been included with the program.
13
// If not, see http://www.gnu.org/licenses/
14
15
// Official git repository and contact information can be found at
16
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17
18
#include "ppsspp_config.h"
19
#include <unordered_map>
20
#include <mutex>
21
#include "Common/Common.h"
22
#include "Common/Data/Convert/ColorConv.h"
23
#include "Common/LogReporting.h"
24
#include "Common/Math/SIMDHeaders.h"
25
#include "Core/Config.h"
26
#include "GPU/Common/TextureDecoder.h"
27
#include "GPU/Software/BinManager.h"
28
#include "GPU/Software/Rasterizer.h"
29
#include "GPU/Software/RasterizerRegCache.h"
30
#include "GPU/Software/Sampler.h"
31
32
using namespace Math3D;
33
using namespace Rasterizer;
34
35
namespace Sampler {
36
37
// C++ reference implementations, used whenever the JIT is disabled or a
// compiled sampler isn't available yet.
static Vec4IntResult SOFTRAST_CALL SampleNearest(float s, float t, Vec4IntArg prim_color, const u8 *const *tptr, const uint16_t *bufw, int level, int levelFrac, const SamplerID &samplerID);
static Vec4IntResult SOFTRAST_CALL SampleLinear(float s, float t, Vec4IntArg prim_color, const u8 *const *tptr, const uint16_t *bufw, int level, int levelFrac, const SamplerID &samplerID);
static Vec4IntResult SOFTRAST_CALL SampleFetch(int u, int v, const u8 *tptr, int bufw, int level, const SamplerID &samplerID);

// Guards access to the shared sampler JIT cache (lookups, compiles, the queue).
std::mutex jitCacheLock;
SamplerJitCache *jitCache = nullptr;
43
44
// Creates the global sampler JIT cache. Must run before FlushJit()/Shutdown()
// or the Get*Func() helpers, which dereference jitCache unconditionally.
void Init() {
	jitCache = new SamplerJitCache();
}
47
48
// Compiles any samplers that were queued because they couldn't be compiled
// at request time (see SamplerJitCache::GetByID with a null binner).
void FlushJit() {
	jitCache->Flush();
}
51
52
void Shutdown() {
53
delete jitCache;
54
jitCache = nullptr;
55
}
56
57
bool DescribeCodePtr(const u8 *ptr, std::string &name) {
58
if (!jitCache->IsInSpace(ptr)) {
59
return false;
60
}
61
62
name = jitCache->DescribeCodePtr(ptr);
63
return true;
64
}
65
66
// Returns the best available nearest-filtering sampler for this ID:
// a JIT-compiled one if possible, else the C++ reference implementation.
NearestFunc GetNearestFunc(SamplerID id, BinManager *binner) {
	// Normalize the key for the nearest variant.
	id.linear = false;
	if (NearestFunc jitted = jitCache->GetNearest(id, binner))
		return jitted;

	// JIT disabled or not compiled yet - fall back to the C++ path.
	return &SampleNearest;
}
75
76
// Returns the best available bilinear sampler for this ID:
// a JIT-compiled one if possible, else the C++ reference implementation.
LinearFunc GetLinearFunc(SamplerID id, BinManager *binner) {
	// Normalize the key for the linear variant.
	id.linear = true;
	if (LinearFunc jitted = jitCache->GetLinear(id, binner))
		return jitted;

	// JIT disabled or not compiled yet - fall back to the C++ path.
	return &SampleLinear;
}
85
86
// Returns the best available single-texel fetch function for this ID:
// a JIT-compiled one if possible, else the C++ reference implementation.
FetchFunc GetFetchFunc(SamplerID id, BinManager *binner) {
	// Normalize the key for the fetch variant.
	id.fetch = true;
	if (FetchFunc jitted = jitCache->GetFetch(id, binner))
		return jitted;

	// JIT disabled or not compiled yet - fall back to the C++ path.
	return &SampleFetch;
}
95
96
// Per-thread memo of the most recent lookup for each sampler kind, so the hot
// path can skip the lock. Entries are validated against clearGen_.
thread_local SamplerJitCache::LastCache SamplerJitCache::lastFetch_;
thread_local SamplerJitCache::LastCache SamplerJitCache::lastNearest_;
thread_local SamplerJitCache::LastCache SamplerJitCache::lastLinear_;
// Bumped on construction and on every Clear() to invalidate the memos above.
int SamplerJitCache::clearGen_ = 0;
100
101
// 256k of code space should be enough (1024 * 64 * 4 bytes).
SamplerJitCache::SamplerJitCache() : Rasterizer::CodeBlock(1024 * 64 * 4), cache_(64) {
	// Start the thread-local memos invalid, and bump the generation so memos
	// left over from a previous cache instance can't match either.
	lastFetch_.gen = -1;
	lastNearest_.gen = -1;
	lastLinear_.gen = -1;
	clearGen_++;
}
108
109
// Throws away all generated code and lookup state so compilation starts fresh.
void SamplerJitCache::Clear() {
	// Invalidate every thread-local LastCache (they compare against clearGen_).
	clearGen_++;
	CodeBlock::Clear();
	cache_.Clear();
	addresses_.clear();

	// These constant pools were emitted into the code block that was just
	// cleared, so the cached pointers are stale; null them so they're re-emitted.
	const10All16_ = nullptr;
	const10Low_ = nullptr;
	const10All8_ = nullptr;

	constWidthHeight256f_ = nullptr;
	constWidthMinus1i_ = nullptr;
	constHeightMinus1i_ = nullptr;

	constOnes32_ = nullptr;
	constOnes16_ = nullptr;
	constUNext_ = nullptr;
	constVNext_ = nullptr;

	const5551Swizzle_ = nullptr;
	const5650Swizzle_ = nullptr;
}
131
132
std::string SamplerJitCache::DescribeCodePtr(const u8 *ptr) {
133
constexpr bool USE_IDS = false;
134
ptrdiff_t dist = 0x7FFFFFFF;
135
if (USE_IDS) {
136
SamplerID found{};
137
for (const auto &it : addresses_) {
138
ptrdiff_t it_dist = ptr - it.second;
139
if (it_dist >= 0 && it_dist < dist) {
140
found = it.first;
141
dist = it_dist;
142
}
143
}
144
145
return DescribeSamplerID(found);
146
}
147
148
return CodeBlock::DescribeCodePtr(ptr);
149
}
150
151
void SamplerJitCache::Flush() {
152
std::unique_lock<std::mutex> guard(jitCacheLock);
153
for (const auto &queued : compileQueue_) {
154
// Might've been compiled after enqueue, but before now.
155
size_t queuedKey = std::hash<SamplerID>()(queued);
156
if (!cache_.ContainsKey(queuedKey))
157
Compile(queued);
158
}
159
compileQueue_.clear();
160
}
161
162
// Looks up (and if possible compiles) the sampler for 'key'/'id'.
// Returns nullptr when it can't compile right now (no binner, or the target
// doesn't support the JIT); the request is queued for a later Flush().
NearestFunc SamplerJitCache::GetByID(const SamplerID &id, size_t key, BinManager *binner) {
	std::unique_lock<std::mutex> guard(jitCacheLock);

	NearestFunc func;
	if (cache_.Get(key, &func)) {
		return func;
	}

	if (!binner) {
		// Can't compile, let's try to do it later when there's an opportunity.
		compileQueue_.insert(id);
		return nullptr;
	}

	// Drop the lock while flushing the binner - drawing may itself need
	// sampler lookups, and holding the lock across that could deadlock.
	guard.unlock();
	binner->Flush("compile");
	guard.lock();

	// Drain the deferred queue first, since we hold the lock anyway.
	for (const auto &queued : compileQueue_) {
		// Might've been compiled after enqueue, but before now.
		size_t queuedKey = std::hash<SamplerID>()(queued);
		if (!cache_.ContainsKey(queuedKey))
			Compile(queued);
	}
	compileQueue_.clear();

	// The queue drain may have already produced our key (Compile inserts
	// several related variants at once.)
	if (!cache_.ContainsKey(key))
		Compile(id);

	// Okay, should be there now.
	if (cache_.Get(key, &func)) {
		return func;
	} else {
		// Compile produced nothing (e.g. unsupported architecture.)
		return nullptr;
	}
}
198
199
// Cached lookup of a compiled nearest sampler; nullptr when the JIT is off
// or no compiled code is available.
NearestFunc SamplerJitCache::GetNearest(const SamplerID &id, BinManager *binner) {
	if (!g_Config.bSoftwareRenderingJit)
		return nullptr;

	const size_t key = std::hash<SamplerID>()(id);
	// Refresh the thread-local memo on miss, then answer from it.
	if (!lastNearest_.Match(key, clearGen_)) {
		NearestFunc compiled = GetByID(id, key, binner);
		lastNearest_.Set(key, compiled, clearGen_);
	}
	return (NearestFunc)lastNearest_.func;
}
211
212
// Cached lookup of a compiled linear sampler; nullptr when the JIT is off
// or no compiled code is available.
LinearFunc SamplerJitCache::GetLinear(const SamplerID &id, BinManager *binner) {
	if (!g_Config.bSoftwareRenderingJit)
		return nullptr;

	const size_t key = std::hash<SamplerID>()(id);
	// Refresh the thread-local memo on miss, then answer from it.
	if (!lastLinear_.Match(key, clearGen_)) {
		NearestFunc compiled = GetByID(id, key, binner);
		lastLinear_.Set(key, compiled, clearGen_);
	}
	return (LinearFunc)lastLinear_.func;
}
224
225
// Cached lookup of a compiled fetch function; nullptr when the JIT is off
// or no compiled code is available.
FetchFunc SamplerJitCache::GetFetch(const SamplerID &id, BinManager *binner) {
	if (!g_Config.bSoftwareRenderingJit)
		return nullptr;

	const size_t key = std::hash<SamplerID>()(id);
	// Refresh the thread-local memo on miss, then answer from it.
	if (!lastFetch_.Match(key, clearGen_)) {
		NearestFunc compiled = GetByID(id, key, binner);
		lastFetch_.Set(key, compiled, clearGen_);
	}
	return (FetchFunc)lastFetch_.func;
}
237
238
// Compiles all three variants (fetch, nearest, linear) for this SamplerID and
// inserts them into the cache. Only implemented for x86-64 (non-UWP); on other
// targets nothing is emitted and callers fall back to the C++ samplers.
void SamplerJitCache::Compile(const SamplerID &id) {
	// This should be sufficient space for three variants.
	if (GetSpaceLeft() < 16384) {
		Clear();
	}

	// We compile them together so the cache can't possibly be cleared in between.
	// We might vary between nearest and linear, so we can't clear between.
#if PPSSPP_ARCH(AMD64) && !PPSSPP_PLATFORM(UWP)
	SamplerID fetchID = id;
	fetchID.linear = false;
	fetchID.fetch = true;
	// Record the start address before emitting, for DescribeCodePtr.
	addresses_[fetchID] = GetCodePointer();
	cache_.Insert(std::hash<SamplerID>()(fetchID), (NearestFunc)CompileFetch(fetchID));

	SamplerID nearestID = id;
	nearestID.linear = false;
	nearestID.fetch = false;
	addresses_[nearestID] = GetCodePointer();
	cache_.Insert(std::hash<SamplerID>()(nearestID), (NearestFunc)CompileNearest(nearestID));

	SamplerID linearID = id;
	linearID.linear = true;
	linearID.fetch = false;
	addresses_[linearID] = GetCodePointer();
	cache_.Insert(std::hash<SamplerID>()(linearID), (NearestFunc)CompileLinear(linearID));
#endif
}
266
267
// Byte offset of texel (u, v) within a texture buffer, for either linear or
// PSP-swizzled layout. row_pitch_pixels is the buffer stride in texels.
template <uint32_t texel_size_bits>
static inline int GetPixelDataOffset(uint32_t row_pitch_pixels, uint32_t u, uint32_t v, bool swizzled) {
	// Linear layout: simple row-major addressing in bytes.
	if (!swizzled) {
		const uint32_t row_bytes = (row_pitch_pixels * texel_size_bits) >> 3;
		return (int)(v * row_bytes + ((u * texel_size_bits) >> 3));
	}

	// Swizzled layout: 32-bit tiles grouped into 4 (horizontal) x 8 (vertical)
	// tile blocks.
	constexpr uint32_t tile_bits = 32;
	constexpr uint32_t block_w = 4;
	constexpr uint32_t block_h = 8;
	constexpr uint32_t texels_per_tile = tile_bits / texel_size_bits;

	const uint32_t tile_u = u / texels_per_tile;
	const uint32_t tiles_per_row = row_pitch_pixels * texel_size_bits / tile_bits;

	uint32_t tile_idx = (v % block_h) * block_w;
	// TODO: not sure if the *texel_size_bits/8 factor is correct
	tile_idx += (v / block_h) * (tiles_per_row * block_h);
	tile_idx += tile_u % block_w;
	tile_idx += (tile_u / block_w) * (block_w * block_h);

	return (int)(tile_idx * (tile_bits / 8) + ((u % texels_per_tile) * texel_size_bits) / 8);
}
286
287
// Converts a CLUT index into an RGBA8888 color using the palette cached in
// samplerID. When the CLUT isn't shared across mips, each level addresses its
// own 16-entry slice of the palette.
static inline u32 LookupColor(unsigned int index, unsigned int level, const SamplerID &samplerID) {
	const int clutSharingOffset = samplerID.useSharedClut ? 0 : level * 16;

	switch (samplerID.ClutFmt()) {
	case GE_CMODE_16BIT_BGR5650:
		return RGB565ToRGBA8888(samplerID.cached.clut16[index + clutSharingOffset]);

	case GE_CMODE_16BIT_ABGR5551:
		return RGBA5551ToRGBA8888(samplerID.cached.clut16[index + clutSharingOffset]);

	case GE_CMODE_16BIT_ABGR4444:
		return RGBA4444ToRGBA8888(samplerID.cached.clut16[index + clutSharingOffset]);

	case GE_CMODE_32BIT_ABGR8888:
		// Already in the output format, no conversion needed.
		return samplerID.cached.clut32[index + clutSharingOffset];

	default:
		ERROR_LOG_REPORT(Log::G3D, "Software: Unsupported palette format: %x", samplerID.ClutFmt());
		return 0;
	}
}
308
309
// Applies the GE CLUT index transform (shift, mask, offset - decoded from the
// cached clutFormat register value) to a raw texel index. Without any
// transform active, the index is simply truncated to 8 bits.
uint32_t TransformClutIndex(uint32_t index, const SamplerID &samplerID) {
	if (samplerID.hasClutShift || samplerID.hasClutMask || samplerID.hasClutOffset) {
		const uint8_t shift = (samplerID.cached.clutFormat >> 2) & 0x1F;
		const uint8_t mask = (samplerID.cached.clutFormat >> 8) & 0xFF;
		const uint16_t offset = ((samplerID.cached.clutFormat >> 16) & 0x1F) << 4;
		// We need to wrap any entries beyond the first 1024 bytes.
		// (0xFF entries of 4 bytes for 32-bit CLUTs, 0x1FF of 2 bytes otherwise.)
		const uint16_t offsetMask = samplerID.ClutFmt() == GE_CMODE_32BIT_ABGR8888 ? 0xFF : 0x1FF;

		return ((index >> shift) & mask) | (offset & offsetMask);
	}
	return index & 0xFF;
}
321
322
// Result of a nearest sample: up to four RGBA8888 texels (a 2x2 quad for
// bilinear filtering). Aligned so the array can be loaded with one SIMD load.
struct Nearest4 {
	alignas(16) u32 v[4];

	// Single-texel convenience; callers that sampled one texel read v[0].
	operator u32() const {
		return v[0];
	}
};
329
330
// Fetches N texels (N = 1 or 4) at integer coordinates u[i]/v[i] from srcptr,
// decoding whatever texture format samplerID specifies into RGBA8888.
// texbufw is the buffer stride in texels; level selects the CLUT slice for
// CLUT4 textures. A null srcptr yields black (all zero).
template <int N>
inline static Nearest4 SOFTRAST_CALL SampleNearest(const int u[N], const int v[N], const u8 *srcptr, uint16_t texbufw, int level, const SamplerID &samplerID) {
	Nearest4 res;
	if (!srcptr) {
		memset(res.v, 0, sizeof(res.v));
		return res;
	}

	// TODO: Should probably check if textures are aligned properly...

	switch (samplerID.TexFmt()) {
	case GE_TFMT_4444:
		for (int i = 0; i < N; ++i) {
			const u8 *src = srcptr + GetPixelDataOffset<16>(texbufw, u[i], v[i], samplerID.swizzle);
			res.v[i] = RGBA4444ToRGBA8888(*(const u16 *)src);
		}
		return res;

	case GE_TFMT_5551:
		for (int i = 0; i < N; ++i) {
			const u8 *src = srcptr + GetPixelDataOffset<16>(texbufw, u[i], v[i], samplerID.swizzle);
			res.v[i] = RGBA5551ToRGBA8888(*(const u16 *)src);
		}
		return res;

	case GE_TFMT_5650:
		for (int i = 0; i < N; ++i) {
			const u8 *src = srcptr + GetPixelDataOffset<16>(texbufw, u[i], v[i], samplerID.swizzle);
			res.v[i] = RGB565ToRGBA8888(*(const u16 *)src);
		}
		return res;

	case GE_TFMT_8888:
		// Already RGBA8888; copy through.
		for (int i = 0; i < N; ++i) {
			const u8 *src = srcptr + GetPixelDataOffset<32>(texbufw, u[i], v[i], samplerID.swizzle);
			res.v[i] = *(const u32 *)src;
		}
		return res;

	case GE_TFMT_CLUT32:
		for (int i = 0; i < N; ++i) {
			const u8 *src = srcptr + GetPixelDataOffset<32>(texbufw, u[i], v[i], samplerID.swizzle);
			// Assembled byte-by-byte to avoid alignment assumptions.
			u32 val = src[0] + (src[1] << 8) + (src[2] << 16) + (src[3] << 24);
			res.v[i] = LookupColor(TransformClutIndex(val, samplerID), 0, samplerID);
		}
		return res;

	case GE_TFMT_CLUT16:
		for (int i = 0; i < N; ++i) {
			const u8 *src = srcptr + GetPixelDataOffset<16>(texbufw, u[i], v[i], samplerID.swizzle);
			u16 val = src[0] + (src[1] << 8);
			res.v[i] = LookupColor(TransformClutIndex(val, samplerID), 0, samplerID);
		}
		return res;

	case GE_TFMT_CLUT8:
		for (int i = 0; i < N; ++i) {
			const u8 *src = srcptr + GetPixelDataOffset<8>(texbufw, u[i], v[i], samplerID.swizzle);
			u8 val = *src;
			res.v[i] = LookupColor(TransformClutIndex(val, samplerID), 0, samplerID);
		}
		return res;

	case GE_TFMT_CLUT4:
		for (int i = 0; i < N; ++i) {
			const u8 *src = srcptr + GetPixelDataOffset<4>(texbufw, u[i], v[i], samplerID.swizzle);
			// Two indices per byte; odd u takes the high nibble.
			u8 val = (u[i] & 1) ? (src[0] >> 4) : (src[0] & 0xF);
			// Only CLUT4 uses separate mipmap palettes.
			res.v[i] = LookupColor(TransformClutIndex(val, samplerID), level, samplerID);
		}
		return res;

	case GE_TFMT_DXT1:
		// DXT: 4x4 compressed blocks; texbufw >> 2 blocks per row.
		for (int i = 0; i < N; ++i) {
			const DXT1Block *block = (const DXT1Block *)srcptr + (v[i] >> 2) * (texbufw >> 2) + (u[i] >> 2);
			res.v[i] = GetDXT1Texel(block, u[i] & 3, v[i] & 3);
		}
		return res;

	case GE_TFMT_DXT3:
		for (int i = 0; i < N; ++i) {
			const DXT3Block *block = (const DXT3Block *)srcptr + (v[i] >> 2) * (texbufw >> 2) + (u[i] >> 2);
			res.v[i] = GetDXT3Texel(block, u[i] & 3, v[i] & 3);
		}
		return res;

	case GE_TFMT_DXT5:
		for (int i = 0; i < N; ++i) {
			const DXT5Block *block = (const DXT5Block *)srcptr + (v[i] >> 2) * (texbufw >> 2) + (u[i] >> 2);
			res.v[i] = GetDXT5Texel(block, u[i] & 3, v[i] & 3);
		}
		return res;

	default:
		ERROR_LOG_REPORT(Log::G3D, "Software: Unsupported texture format: %x", samplerID.TexFmt());
		memset(res.v, 0, sizeof(res.v));
		return res;
	}
}
429
430
// Clamps a texel coordinate to [0, height - 1], additionally capped at 511
// (the hardware's maximum texture coordinate) for oversized dimensions.
static inline int ClampUV(int v, int height) {
	if (v >= height - 1)
		return height - 1;
	if (v < 0)
		return 0;
	return v > 511 ? 511 : v;
}
439
440
// Wraps (repeats) a texel coordinate. Relies on power-of-two dimensions so
// the (height - 1) mask acts as a modulo; 511 caps to the max texture size.
static inline int WrapUV(int v, int height) {
	return (v & 511) & (height - 1);
}
443
444
template <int N>
445
static inline void ApplyTexelClamp(int out_u[N], int out_v[N], const int u[N], const int v[N], int width, int height, const SamplerID &samplerID) {
446
if (samplerID.clampS) {
447
for (int i = 0; i < N; ++i) {
448
out_u[i] = ClampUV(u[i], width);
449
}
450
} else {
451
for (int i = 0; i < N; ++i) {
452
out_u[i] = WrapUV(u[i], width);
453
}
454
}
455
if (samplerID.clampT) {
456
for (int i = 0; i < N; ++i) {
457
out_v[i] = ClampUV(v[i], height);
458
}
459
} else {
460
for (int i = 0; i < N; ++i) {
461
out_v[i] = WrapUV(v[i], height);
462
}
463
}
464
}
465
466
// Converts normalized (s, t) into integer texel coordinates for mip 'level',
// then applies the sampler's clamp/wrap mode on each axis.
static inline void GetTexelCoordinates(int level, float s, float t, int &out_u, int &out_v, const SamplerID &samplerID) {
	int width = samplerID.cached.sizes[level].w;
	int height = samplerID.cached.sizes[level].h;

	// Scale into 24.8 fixed point...
	int base_u = (int)(s * width * 256.0f);
	int base_v = (int)(t * height * 256.0f);

	// ...then drop the 8 fractional bits (nearest filtering truncates.)
	base_u >>= 8;
	base_v >>= 8;

	ApplyTexelClamp<1>(&out_u, &out_v, &base_u, &base_v, width, height, samplerID);
}
478
479
// Combines the interpolated vertex color (prim_color) with a sampled texel
// (texcolor) according to the texture function in samplerID - modulate,
// decal, blend, replace, or add - with optional RGB color doubling.
// Channels are 0-255 integer lanes. When useTextureAlpha is false, the
// texture's alpha is ignored and the prim alpha is passed through.
Vec4IntResult SOFTRAST_CALL GetTextureFunctionOutput(Vec4IntArg prim_color_in, Vec4IntArg texcolor_in, const SamplerID &samplerID) {
	const Vec4<int> prim_color = prim_color_in;
	const Vec4<int> texcolor = texcolor_in;

	Vec3<int> out_rgb;
	int out_a;

	bool rgba = samplerID.useTextureAlpha;

	switch (samplerID.TexFunc()) {
	case GE_TEXFUNC_MODULATE:
	{
#if defined(_M_SSE)
		// Modulate weights slightly on the tex color, by adding one to prim and dividing by 256.
		// Both factors are pre-shifted left by 4 so that mulhi (>> 16) nets a >> 8.
		const __m128i p = _mm_slli_epi16(_mm_packs_epi32(prim_color.ivec, prim_color.ivec), 4);
		const __m128i pboost = _mm_add_epi16(p, _mm_set1_epi16(1 << 4));
		__m128i t = _mm_slli_epi16(_mm_packs_epi32(texcolor.ivec, texcolor.ivec), 4);
		if (samplerID.useColorDoubling) {
			// Double only the RGB words; keep the alpha word untouched.
			const __m128i amask = _mm_set_epi16(-1, 0, 0, 0, -1, 0, 0, 0);
			const __m128i a = _mm_and_si128(t, amask);
			const __m128i rgb = _mm_andnot_si128(amask, t);
			t = _mm_or_si128(_mm_slli_epi16(rgb, 1), a);
		}
		const __m128i b = _mm_mulhi_epi16(pboost, t);
		out_rgb.ivec = _mm_unpacklo_epi16(b, _mm_setzero_si128());

		if (rgba) {
			// Alpha was modulated along with RGB above, return all four lanes.
			return ToVec4IntResult(Vec4<int>(out_rgb.ivec));
		} else {
			out_a = prim_color.a();
		}
#elif PPSSPP_ARCH(ARM64_NEON)
		int32x4_t pboost = vaddq_s32(prim_color.ivec, vdupq_n_s32(1));
		int32x4_t t = texcolor.ivec;
		if (samplerID.useColorDoubling) {
			// Shift RGB left by 1 (double); alpha lane shifts by 0.
			static const int32_t rgbDouble[4] = { 1, 1, 1, 0 };
			t = vshlq_s32(t, vld1q_s32(rgbDouble));
		}
		out_rgb.ivec = vshrq_n_s32(vmulq_s32(pboost, t), 8);

		if (rgba) {
			return ToVec4IntResult(Vec4<int>(out_rgb.ivec));
		}
		out_a = prim_color.a();
#else
		// Scalar reference: (prim + 1) * tex / 256, optionally doubled.
		if (samplerID.useColorDoubling) {
			out_rgb = ((prim_color.rgb() + Vec3<int>::AssignToAll(1)) * texcolor.rgb() * 2) / 256;
		} else {
			out_rgb = (prim_color.rgb() + Vec3<int>::AssignToAll(1)) * texcolor.rgb() / 256;
		}
		out_a = (rgba) ? ((prim_color.a() + 1) * texcolor.a() / 256) : prim_color.a();
#endif
		break;
	}

	case GE_TEXFUNC_DECAL:
		if (rgba) {
			// Lerp prim -> tex by the texture's alpha.
			int t = texcolor.a();
			int invt = 255 - t;
			// Both colors are boosted here, making the alpha have more weight.
			Vec3<int> one = Vec3<int>::AssignToAll(1);
			out_rgb = ((prim_color.rgb() + one) * invt + (texcolor.rgb() + one) * t);
			// Keep the bits of accuracy when doubling.
			if (samplerID.useColorDoubling)
				out_rgb /= 128;
			else
				out_rgb /= 256;
		} else {
			// Without texture alpha, decal is just the texture color.
			if (samplerID.useColorDoubling)
				out_rgb = texcolor.rgb() * 2;
			else
				out_rgb = texcolor.rgb();
		}
		out_a = prim_color.a();
		break;

	case GE_TEXFUNC_BLEND:
	{
		// Lerp prim -> texenv constant color by the texture color per channel.
		const Vec3<int> const255(255, 255, 255);
		const Vec3<int> texenv = Vec3<int>::FromRGB(samplerID.cached.texBlendColor);

		// Unlike the others (and even alpha), this one simply always rounds up.
		const Vec3<int> roundup = Vec3<int>::AssignToAll(255);
		out_rgb = ((const255 - texcolor.rgb()) * prim_color.rgb() + texcolor.rgb() * texenv + roundup);
		// Must divide by less to keep the precision for doubling to be accurate.
		if (samplerID.useColorDoubling)
			out_rgb /= 128;
		else
			out_rgb /= 256;

		out_a = (rgba) ? ((prim_color.a() + 1) * texcolor.a() / 256) : prim_color.a();
		break;
	}

	case GE_TEXFUNC_REPLACE:
		out_rgb = texcolor.rgb();
		// Doubling even happens for replace.
		if (samplerID.useColorDoubling)
			out_rgb *= 2;
		out_a = (rgba) ? texcolor.a() : prim_color.a();
		break;

	case GE_TEXFUNC_ADD:
	case GE_TEXFUNC_UNKNOWN1:
	case GE_TEXFUNC_UNKNOWN2:
	case GE_TEXFUNC_UNKNOWN3:
		// Don't need to clamp afterward, we always clamp before tests.
		out_rgb = prim_color.rgb() + texcolor.rgb();
		if (samplerID.useColorDoubling)
			out_rgb *= 2;

		// Alpha is still blended the common way.
		out_a = (rgba) ? ((prim_color.a() + 1) * texcolor.a() / 256) : prim_color.a();
		break;
	}

	return ToVec4IntResult(Vec4<int>(out_rgb, out_a));
}
597
598
// Reference nearest-filtered sampler. levelFrac is a 4-bit blend fraction
// toward mip level + 1; the texture function is applied to the final color.
static Vec4IntResult SOFTRAST_CALL SampleNearest(float s, float t, Vec4IntArg prim_color, const u8 *const *tptr, const uint16_t *bufw, int level, int levelFrac, const SamplerID &samplerID) {
	int u, v;

	// Nearest filtering only. Round texcoords.
	GetTexelCoordinates(level, s, t, u, v, samplerID);
	Vec4<int> c0 = Vec4<int>::FromRGBA(SampleNearest<1>(&u, &v, tptr[0], bufw[0], level, samplerID).v[0]);

	if (levelFrac) {
		// Also sample the next mip and blend in sixteenths.
		GetTexelCoordinates(level + 1, s, t, u, v, samplerID);
		Vec4<int> c1 = Vec4<int>::FromRGBA(SampleNearest<1>(&u, &v, tptr[1], bufw[1], level + 1, samplerID).v[0]);

		c0 = (c1 * levelFrac + c0 * (16 - levelFrac)) >> 4;
	}

	return GetTextureFunctionOutput(prim_color, ToVec4IntArg(c0), samplerID);
}
614
615
// Reference single-texel fetch at integer (u, v): no filtering and no texture
// function, just the decoded RGBA texel as integer lanes.
static Vec4IntResult SOFTRAST_CALL SampleFetch(int u, int v, const u8 *tptr, int bufw, int level, const SamplerID &samplerID) {
	Nearest4 c = SampleNearest<1>(&u, &v, tptr, bufw, level, samplerID);
	return ToVec4IntResult(Vec4<int>::FromRGBA(c.v[0]));
}
619
620
// Clamps or wraps four texel coordinates at once against a texture dimension.
// Clamp caps at min(width - 1, 511); wrap masks (power-of-two widths), also
// capped at 511 - matching the scalar ClampUV/WrapUV helpers.
static inline Vec4IntResult SOFTRAST_CALL ApplyTexelClampQuad(bool clamp, Vec4IntArg vec, int width) {
	Vec4<int> result = vec;
#ifdef _M_SSE
	if (clamp) {
		// First, clamp to zero.
		__m128i negmask = _mm_cmpgt_epi32(_mm_setzero_si128(), result.ivec);
		result.ivec = _mm_andnot_si128(negmask, result.ivec);

		// Now the high bound.
		__m128i bound = _mm_set1_epi32(width > 512 ? 511 : width - 1);
		__m128i goodmask = _mm_cmpgt_epi32(bound, result.ivec);
		// Clear the ones that were too high, then or in the high bound to those.
		result.ivec = _mm_and_si128(goodmask, result.ivec);
		result.ivec = _mm_or_si128(result.ivec, _mm_andnot_si128(goodmask, bound));
	} else {
		result.ivec = _mm_and_si128(result.ivec, _mm_set1_epi32((width - 1) & 511));
	}
#elif PPSSPP_ARCH(ARM64_NEON)
	if (clamp) {
		// Let's start by clamping to the maximum.
		result.ivec = vminq_s32(result.ivec, vdupq_n_s32(width > 512 ? 511 : width - 1));
		// And then to zero.
		result.ivec = vmaxq_s32(result.ivec, vdupq_n_s32(0));
	} else {
		result.ivec = vandq_s32(result.ivec, vdupq_n_s32((width - 1) & 511));
	}
#else
	// Scalar fallback: handle each of the four coordinates individually.
	if (clamp) {
		for (int i = 0; i < 4; ++i) {
			result[i] = ClampUV(result[i], width);
		}
	} else {
		for (int i = 0; i < 4; ++i) {
			result[i] = WrapUV(result[i], width);
		}
	}
#endif

	return ToVec4IntResult(result);
}
660
661
// Builds the four S/U coordinates of a bilinear 2x2 quad (u, u+1, u, u+1)
// and clamps/wraps each one.
static inline Vec4IntResult SOFTRAST_CALL ApplyTexelClampQuadS(bool clamp, int u, int width) {
#ifdef _M_SSE
	__m128i uvec = _mm_add_epi32(_mm_set1_epi32(u), _mm_set_epi32(1, 0, 1, 0));
	return ApplyTexelClampQuad(clamp, uvec, width);
#elif PPSSPP_ARCH(ARM64_NEON)
	static const int32_t u2[4] = { 0, 1, 0, 1 };
	int32x4_t uvec = vaddq_s32(vdupq_n_s32(u), vld1q_s32(u2));
	return ApplyTexelClampQuad(clamp, uvec, width);
#else
	Vec4<int> result = Vec4<int>::AssignToAll(u) + Vec4<int>(0, 1, 0, 1);
	return ApplyTexelClampQuad(clamp, ToVec4IntArg(result), width);
#endif
}
674
675
// Builds the four T/V coordinates of a bilinear 2x2 quad (v, v, v+1, v+1)
// and clamps/wraps each one.
static inline Vec4IntResult SOFTRAST_CALL ApplyTexelClampQuadT(bool clamp, int v, int height) {
#ifdef _M_SSE
	__m128i vvec = _mm_add_epi32(_mm_set1_epi32(v), _mm_set_epi32(1, 1, 0, 0));
	return ApplyTexelClampQuad(clamp, vvec, height);
#elif PPSSPP_ARCH(ARM64_NEON)
	static const int32_t v2[4] = { 0, 0, 1, 1 };
	int32x4_t vvec = vaddq_s32(vdupq_n_s32(v), vld1q_s32(v2));
	return ApplyTexelClampQuad(clamp, vvec, height);
#else
	Vec4<int> result = Vec4<int>::AssignToAll(v) + Vec4<int>(0, 0, 1, 1);
	return ApplyTexelClampQuad(clamp, ToVec4IntArg(result), height);
#endif
}
688
689
// Computes the four S/U texel coordinates for bilinear sampling at 'level',
// and the 4-bit horizontal blend fraction. The -128 recenters by half a texel
// in 24.8 fixed point before splitting into integer texel + fraction.
static inline Vec4IntResult SOFTRAST_CALL GetTexelCoordinatesQuadS(int level, float in_s, int &frac_u, const SamplerID &samplerID) {
	int width = samplerID.cached.sizes[level].w;

	int base_u = (int)(in_s * width * 256) - 128;
	frac_u = (int)(base_u >> 4) & 0x0F;
	base_u >>= 8;

	// Need to generate and individually wrap/clamp the four sample coordinates. Ugh.
	return ApplyTexelClampQuadS(samplerID.clampS, base_u, width);
}
699
700
// Computes the four T/V texel coordinates for bilinear sampling at 'level',
// and the 4-bit vertical blend fraction. The -128 recenters by half a texel
// in 24.8 fixed point before splitting into integer texel + fraction.
static inline Vec4IntResult SOFTRAST_CALL GetTexelCoordinatesQuadT(int level, float in_t, int &frac_v, const SamplerID &samplerID) {
	int height = samplerID.cached.sizes[level].h;

	int base_v = (int)(in_t * height * 256) - 128;
	frac_v = (int)(base_v >> 4) & 0x0F;
	base_v >>= 8;

	// Need to generate and individually wrap/clamp the four sample coordinates. Ugh.
	return ApplyTexelClampQuadT(samplerID.clampT, base_v, height);
}
710
711
// Bilinearly samples one mip level at (s, t): fetches the 2x2 texel quad
// (order: tl, tr, bl, br) and blends with the 4-bit u/v fractions.
static Vec4IntResult SOFTRAST_CALL SampleLinearLevel(float s, float t, const u8 *const *tptr, const uint16_t *bufw, int texlevel, const SamplerID &samplerID) {
	int frac_u, frac_v;
	const Vec4<int> u = GetTexelCoordinatesQuadS(texlevel, s, frac_u, samplerID);
	const Vec4<int> v = GetTexelCoordinatesQuadT(texlevel, t, frac_v, samplerID);
	Nearest4 c = SampleNearest<4>(u.AsArray(), v.AsArray(), tptr[0], bufw[0], texlevel, samplerID);
#ifdef _M_SSE
	__m128i zero = _mm_setzero_si128();
	__m128i samples = _mm_loadu_si128((const __m128i*)(c.v));
	// Widen to 16-bit lanes: top = texels 0/1 (tl, tr), bot = texels 2/3 (bl, br).
	__m128i top = _mm_unpacklo_epi8(samples, zero);
	__m128i bot = _mm_unpackhi_epi8(samples, zero);
	// We want mul_u = { 0x10 - frac_u (x4), frac_u (x4) }. A direct variable
	// _mm_setr_epi16 compiles well on GCC/clang but not MSVC, so instead
	// (0x10 - frac_u) is built as (frac_u ^ 0xF) + 1 in the low four lanes,
	// which REQUIRES 0 <= frac_u < 0x10.
	__m128i mul_u = _mm_set1_epi16(frac_u);
	mul_u = _mm_xor_si128(mul_u, _mm_setr_epi16(0xF, 0xF, 0xF, 0xF, 0x0, 0x0, 0x0, 0x0));
	mul_u = _mm_add_epi16(mul_u, _mm_setr_epi16(0x1, 0x1, 0x1, 0x1, 0x0, 0x0, 0x0, 0x0));
	// Vertical blend first (weights sum to 16)...
	top = _mm_mullo_epi16(top, _mm_set1_epi16(0x10 - frac_v));
	bot = _mm_mullo_epi16(bot, _mm_set1_epi16(frac_v));
	__m128i sum = _mm_add_epi16(top, bot);
	// ...then weight horizontally and fold the right half onto the left.
	sum = _mm_mullo_epi16(sum, mul_u);
	sum = _mm_add_epi16(sum, _mm_shuffle_epi32(sum, _MM_SHUFFLE(3, 2, 3, 2)));
	// Divide out both 4-bit weights (>> 8 == >> (4 + 4)).
	sum = _mm_srli_epi16(sum, 8);
	sum = _mm_unpacklo_epi16(sum, zero);
	return sum;
#else
	Vec4<int> texcolor_tl = Vec4<int>::FromRGBA(c.v[0]);
	Vec4<int> texcolor_tr = Vec4<int>::FromRGBA(c.v[1]);
	Vec4<int> texcolor_bl = Vec4<int>::FromRGBA(c.v[2]);
	Vec4<int> texcolor_br = Vec4<int>::FromRGBA(c.v[3]);
	Vec4<int> top = texcolor_tl * (0x10 - frac_u) + texcolor_tr * frac_u;
	Vec4<int> bot = texcolor_bl * (0x10 - frac_u) + texcolor_br * frac_u;
	return ToVec4IntResult((top * (0x10 - frac_v) + bot * frac_v) >> (4 + 4));
#endif
}
747
748
// Reference bilinear sampler. levelFrac is a 4-bit blend fraction toward mip
// texlevel + 1; the texture function is applied to the final color.
static Vec4IntResult SOFTRAST_CALL SampleLinear(float s, float t, Vec4IntArg prim_color, const u8 *const *tptr, const uint16_t *bufw, int texlevel, int levelFrac, const SamplerID &samplerID) {
	Vec4<int> c0 = SampleLinearLevel(s, t, tptr, bufw, texlevel, samplerID);
	if (levelFrac) {
		// Blend with the next mip in sixteenths (trilinear filtering.)
		const Vec4<int> c1 = SampleLinearLevel(s, t, tptr + 1, bufw + 1, texlevel + 1, samplerID);
		c0 = (c1 * levelFrac + c0 * (16 - levelFrac)) >> 4;
	}
	return GetTextureFunctionOutput(prim_color, ToVec4IntArg(c0), samplerID);
}
756
757
};
758
759