Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
hrydgard
GitHub Repository: hrydgard/ppsspp
Path: blob/master/GPU/Common/TextureScalerCommon.cpp
3186 views
1
// Copyright (c) 2012- PPSSPP Project.
2
3
// This program is free software: you can redistribute it and/or modify
4
// it under the terms of the GNU General Public License as published by
5
// the Free Software Foundation, version 2.0 or later versions.
6
7
// This program is distributed in the hope that it will be useful,
8
// but WITHOUT ANY WARRANTY; without even the implied warranty of
9
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
// GNU General Public License 2.0 for more details.
11
12
// A copy of the GPL 2.0 should have been included with the program.
13
// If not, see http://www.gnu.org/licenses/
14
15
// Official git repository and contact information can be found at
16
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17
18
#include <algorithm>
19
#include <cstddef>
20
#include <cstring>
21
#include <cmath>
22
23
#include "GPU/Common/TextureScalerCommon.h"
24
25
#include "Core/Config.h"
26
#include "Common/Common.h"
27
#include "Common/Log.h"
28
#include "Common/Math/SIMDHeaders.h"
29
#include "Common/Thread/ParallelLoop.h"
30
#include "ext/xbrz/xbrz.h"
31
32
// Report the time and throughput for each larger scaling operation in the log
33
//#define SCALING_MEASURE_TIME
34
#include "Common/TimeUtil.h"
35
36
/////////////////////////////////////// Helper Functions (mostly math for parallelization)
37
38
namespace {
39
//////////////////////////////////////////////////////////////////// Various image processing
40
41
// Channel extraction for packed 32-bit pixels: R is the lowest byte, A the highest.
// The argument is parenthesized so compound expressions (e.g. a | b) extract correctly.
#define R(_col) (((_col) >>  0) & 0xFF)
#define G(_col) (((_col) >>  8) & 0xFF)
#define B(_col) (((_col) >> 16) & 0xFF)
#define A(_col) (((_col) >> 24) & 0xFF)

// Sum of the absolute per-channel differences between two packed pixels (an int).
#define DISTANCE(_p1, _p2) ( \
	abs(static_cast<int>(R(_p1)) - static_cast<int>(R(_p2))) + \
	abs(static_cast<int>(G(_p1)) - static_cast<int>(G(_p2))) + \
	abs(static_cast<int>(B(_p1)) - static_cast<int>(B(_p2))) + \
	abs(static_cast<int>(A(_p1)) - static_cast<int>(A(_p2))) )

// Per-channel weighted blend of two packed pixels: _factors[0] weights _p0 and
// _factors[1] weights _p1, both on a 0..255 scale.
// this is sadly much faster than an inline function with a loop, at least in VC10
// The whole expansion is wrapped in parentheses so it can be used inside larger expressions.
#define MIX_PIXELS(_p0, _p1, _factors) ( \
	( (R(_p0)*(_factors)[0] + R(_p1)*(_factors)[1])/255 <<  0 ) | \
	( (G(_p0)*(_factors)[0] + G(_p1)*(_factors)[1])/255 <<  8 ) | \
	( (B(_p0)*(_factors)[0] + B(_p1)*(_factors)[1])/255 << 16 ) | \
	( (A(_p0)*(_factors)[0] + A(_p1)*(_factors)[1])/255 << 24 ) )

// Edge length (in pixels) of the tiles used by the blocked loops below.
#define BLOCK_SIZE 32
57
58
// 3x3 convolution with Neumann boundary conditions, parallelizable
59
// quite slow, could be sped up a lot
60
// especially handling of separable kernels
61
void convolve3x3(const u32 *data, u32 *out, const int kernel[3][3], int width, int height, int l, int u) {
62
for (int yb = 0; yb < (u - l) / BLOCK_SIZE + 1; ++yb) {
63
for (int xb = 0; xb < width / BLOCK_SIZE + 1; ++xb) {
64
for (int y = l + yb*BLOCK_SIZE; y < l + (yb + 1)*BLOCK_SIZE && y < u; ++y) {
65
for (int x = xb*BLOCK_SIZE; x < (xb + 1)*BLOCK_SIZE && x < width; ++x) {
66
int val = 0;
67
for (int yoff = -1; yoff <= 1; ++yoff) {
68
int yy = std::max(std::min(y + yoff, height - 1), 0);
69
for (int xoff = -1; xoff <= 1; ++xoff) {
70
int xx = std::max(std::min(x + xoff, width - 1), 0);
71
val += data[yy*width + xx] * kernel[yoff + 1][xoff + 1];
72
}
73
}
74
out[y*width + x] = abs(val);
75
}
76
}
77
}
78
}
79
}
80
81
// deposterization: smoothes posterized gradients from low-color-depth (e.g. 444, 565, compressed) sources
82
void deposterizeH(const u32 *data, u32 *out, int w, int l, int u) {
83
static const int T = 8;
84
for (int y = l; y < u; ++y) {
85
for (int x = 0; x < w; ++x) {
86
int inpos = y*w + x;
87
u32 center = data[inpos];
88
if (x == 0 || x == w - 1) {
89
out[y*w + x] = center;
90
continue;
91
}
92
u32 left = data[inpos - 1];
93
u32 right = data[inpos + 1];
94
out[y*w + x] = 0;
95
for (int c = 0; c < 4; ++c) {
96
u8 lc = ((left >> c * 8) & 0xFF);
97
u8 cc = ((center >> c * 8) & 0xFF);
98
u8 rc = ((right >> c * 8) & 0xFF);
99
if ((lc != rc) && ((lc == cc && abs((int)((int)rc) - cc) <= T) || (rc == cc && abs((int)((int)lc) - cc) <= T))) {
100
// blend this component
101
out[y*w + x] |= ((rc + lc) / 2) << (c * 8);
102
} else {
103
// no change for this component
104
out[y*w + x] |= cc << (c * 8);
105
}
106
}
107
}
108
}
109
}
110
void deposterizeV(const u32 *data, u32 *out, int w, int h, int l, int u) {
111
static const int T = 8;
112
for (int xb = 0; xb < w / BLOCK_SIZE + 1; ++xb) {
113
for (int y = l; y < u; ++y) {
114
for (int x = xb*BLOCK_SIZE; x < (xb + 1)*BLOCK_SIZE && x < w; ++x) {
115
u32 center = data[y * w + x];
116
if (y == 0 || y == h - 1) {
117
out[y*w + x] = center;
118
continue;
119
}
120
u32 upper = data[(y - 1) * w + x];
121
u32 lower = data[(y + 1) * w + x];
122
out[y*w + x] = 0;
123
for (int c = 0; c < 4; ++c) {
124
u8 uc = ((upper >> c * 8) & 0xFF);
125
u8 cc = ((center >> c * 8) & 0xFF);
126
u8 lc = ((lower >> c * 8) & 0xFF);
127
if ((uc != lc) && ((uc == cc && abs((int)((int)lc) - cc) <= T) || (lc == cc && abs((int)((int)uc) - cc) <= T))) {
128
// blend this component
129
out[y*w + x] |= ((lc + uc) / 2) << (c * 8);
130
} else {
131
// no change for this component
132
out[y*w + x] |= cc << (c * 8);
133
}
134
}
135
}
136
}
137
}
138
}
139
140
// generates a distance mask value for each pixel in data
141
// higher values -> larger distance to the surrounding pixels
142
void generateDistanceMask(const u32 *data, u32 *out, int width, int height, int l, int u) {
143
for (int yb = 0; yb < (u - l) / BLOCK_SIZE + 1; ++yb) {
144
for (int xb = 0; xb < width / BLOCK_SIZE + 1; ++xb) {
145
for (int y = l + yb*BLOCK_SIZE; y < l + (yb + 1)*BLOCK_SIZE && y < u; ++y) {
146
for (int x = xb*BLOCK_SIZE; x < (xb + 1)*BLOCK_SIZE && x < width; ++x) {
147
const u32 center = data[y*width + x];
148
u32 dist = 0;
149
for (int yoff = -1; yoff <= 1; ++yoff) {
150
int yy = y + yoff;
151
if (yy == height || yy == -1) {
152
dist += 1200; // assume distance at borders, usually makes for better result
153
continue;
154
}
155
for (int xoff = -1; xoff <= 1; ++xoff) {
156
if (yoff == 0 && xoff == 0) continue;
157
int xx = x + xoff;
158
if (xx == width || xx == -1) {
159
dist += 400; // assume distance at borders, usually makes for better result
160
continue;
161
}
162
dist += DISTANCE(data[yy*width + xx], center);
163
}
164
}
165
out[y*width + x] = dist;
166
}
167
}
168
}
169
}
170
}
171
172
// mix two images based on a mask
173
void mix(u32 *data, const u32 *source, const u32 *mask, u32 maskmax, int width, int l, int u) {
174
for (int y = l; y < u; ++y) {
175
for (int x = 0; x < width; ++x) {
176
int pos = y*width + x;
177
u8 mixFactors[2] = { 0, static_cast<u8>((std::min(mask[pos], maskmax) * 255) / maskmax) };
178
mixFactors[0] = 255 - mixFactors[1];
179
data[pos] = MIX_PIXELS(data[pos], source[pos], mixFactors);
180
if (A(source[pos]) == 0) data[pos] = data[pos] & 0x00FFFFFF; // xBRZ always does a better job with hard alpha
181
}
182
}
183
}
184
185
//////////////////////////////////////////////////////////////////// Bicubic scaling
186
187
// Code for the cubic upscaler is pasted below as-is.
188
// WARNING: different codestyle.
189
190
// NOTE: in several places memcpy is used instead of type punning,
191
// to avoid strict aliasing problems. This may produce suboptimal
192
// code, especially on MSVC.
193
194
// Loads a sample (4 bytes) from image into 'output'.
195
static void load_sample(ptrdiff_t w, ptrdiff_t h, ptrdiff_t s, const u8 *pixels, int wrap_mode, ptrdiff_t x, ptrdiff_t y, u8 *output) {
196
// Check if the sample is inside. NOTE: for b>=0
197
// the expression (UNSIGNED)a<(UNSIGNED)b is
198
// equivalent to a>=0&&a<b.
199
static_assert(sizeof(ptrdiff_t) == sizeof(size_t), "Assumes ptrdiff_t same width as size_t");
200
201
if((size_t)x >= (size_t)w || (size_t)y >= (size_t)h) {
202
switch(wrap_mode) {
203
case 0: // Wrap
204
if(!((w & (w-1)) | (h & (h-1)))) {
205
// Both w and h are powers of 2.
206
x &= w-1;
207
y &= h-1;
208
} else {
209
// For e.g. 1x1 images we might need to wrap several
210
// times, hence 'while', instead of 'if'. Probably
211
// still faster, than modulo.
212
while(x < 0) x += w;
213
while(y < 0) y += h;
214
while(x >= w) x -= w;
215
while(y >= h) y -= h;
216
}
217
break;
218
case 1: // Clamp
219
if(x < 0) x = 0;
220
if(y < 0) y = 0;
221
if(x >= w) x = w-1;
222
if(y >= h) y = h-1;
223
break;
224
case 2: // Zero
225
memset(output, 0, 4);
226
return;
227
}
228
}
229
memcpy(output, pixels + s*y + 4*x, 4);
230
}
231
232
#define BLOCK 8
233
234
static void init_block(
235
ptrdiff_t w, ptrdiff_t h,
236
ptrdiff_t src_stride, const u8 *src_pixels,
237
int wrap_mode, ptrdiff_t factor, float B, float C,
238
ptrdiff_t x0, ptrdiff_t y0,
239
float (*cx)[4], float (*cy)[4],
240
ptrdiff_t *lx, ptrdiff_t *ly, ptrdiff_t *lx0, ptrdiff_t *ly0, ptrdiff_t *sx, ptrdiff_t *sy,
241
u8 (*src)[(BLOCK+4)*4]) {
242
// Precomputed coefficients for pixel weights
243
// in the Mitchell-Netravali filter:
244
// output = SUM(wij*pixel[i]*t^j)
245
// where t is distance from pixel[1] to the
246
// sampling position.
247
float w00 = B/6.0f , w01 = -C-0.5f*B, w02 = 2.0f*C+0.5f*B , w03 = -C-B/6.0f ;
248
float w10 = 1.0f-B/3.0f,/*w11 = 0.0f ,*/w12 = C+2.0f*B-3.0f , w13 = -C-1.5f*B+2.0f;
249
float w20 = B/6.0f , w21 = C+0.5f*B, w22 = -2.0f*C-2.5f*B+3.0f, w23 = C+1.5f*B-2.0f;
250
float /*w30 = 0.0f , w31 = 0.0f ,*/w32 = -C , w33 = C+B/6.0f ;
251
// Express the sampling position as a rational
252
// number num/den-1 (off by one, so that num is
253
// always positive, since the C language does
254
// not do Euclidean division). Sampling points
255
// for both src and dst are assumed at pixel centers.
256
ptrdiff_t den = 2*factor;
257
float inv_den = 1.0f/(float)den;
258
for(int dir = 0; dir < 2; ++dir) {
259
ptrdiff_t num = (dir ? 2*y0+1+factor : 2*x0+1+factor);
260
ptrdiff_t *l = (dir ? ly : lx), *l0 = (dir ? ly0 : lx0), *s = (dir ? sy : sx);
261
float (*c)[4] = (dir ? cy : cx);
262
(*l0) = num/den-2;
263
num = num%den;
264
for(ptrdiff_t i = 0, j = 0; i < BLOCK; ++i) {
265
l[i] = j; // i-th dst pixel accesses src pixels (l0+l[i])..(l0+l[i]+3) in {x|y} direction.
266
float t = (float)num*inv_den; // Fractional part of the sampling position.
267
// Write out pixel weights.
268
c[i][0] = ((w03*t+w02)*t +w01 )*t +w00 ;
269
c[i][1] = ((w13*t+w12)*t/*+w11*/)*t +w10 ;
270
c[i][2] = ((w23*t+w22)*t +w21 )*t +w20 ;
271
c[i][3] = ((w33*t+w32)*t/*+w31*/)*t/*+w30*/;
272
// Increment the sampling position.
273
if((num += 2) >= den) {num -= den; j += 1;}
274
}
275
(*s) = l[BLOCK-1]+4; // Total sampled src pixels in {x|y} direction.
276
}
277
// Get a local copy of the source pixels.
278
if((*lx0) >=0 && (*ly0) >= 0 && *lx0 + (*sx) <= w && *ly0 + (*sy) <= h) {
279
for(ptrdiff_t iy = 0; iy < (*sy); ++iy)
280
memcpy(src[iy], src_pixels+src_stride*((*ly0) + iy) + 4*(*lx0), (size_t)(4*(*sx)));
281
}
282
else {
283
for(ptrdiff_t iy = 0; iy < (*sy); ++iy) for(ptrdiff_t ix = 0; ix < (*sx); ++ix)
284
load_sample(w, h, src_stride, src_pixels, wrap_mode, (*lx0) + ix, (*ly0) + iy, src[iy] + 4*ix);
285
}
286
}
287
288
static void upscale_block_c(
289
ptrdiff_t w, ptrdiff_t h,
290
ptrdiff_t src_stride, const u8 *src_pixels,
291
int wrap_mode, ptrdiff_t factor, float B, float C,
292
ptrdiff_t x0, ptrdiff_t y0,
293
u8 *dst_pixels) {
294
float cx[BLOCK][4], cy[BLOCK][4];
295
ptrdiff_t lx[BLOCK], ly[BLOCK], lx0, ly0, sx, sy;
296
u8 src[BLOCK+4][(BLOCK+4)*4];
297
float buf[2][BLOCK+4][BLOCK+4][4];
298
init_block(
299
w, h, src_stride, src_pixels, wrap_mode, factor, B, C, x0, y0,
300
cx, cy, lx, ly, &lx0, &ly0, &sx, &sy, src);
301
// Unpack source pixels.
302
for(ptrdiff_t iy = 0; iy < sy; ++iy)
303
for(ptrdiff_t ix = 0; ix < sx; ++ix)
304
for(ptrdiff_t k = 0; k < 4; ++k)
305
buf[0][iy][ix][k] = (float)(int)src[iy][4*ix + k];
306
// Horizontal pass.
307
for(ptrdiff_t ix = 0; ix < BLOCK; ++ix) {
308
#define S(i) (buf[0][iy][lx[ix] + i][k])
309
float C0 = cx[ix][0], C1 = cx[ix][1], C2 = cx[ix][2], C3 = cx[ix][3];
310
for(ptrdiff_t iy = 0; iy < sy; ++iy)
311
for(ptrdiff_t k = 0; k < 4; ++k)
312
buf[1][iy][ix][k] = S(0)*C0 + S(1)*C1 + S(2)*C2 + S(3)*C3;
313
#undef S
314
}
315
// Vertical pass.
316
for(ptrdiff_t iy = 0; iy < BLOCK; ++iy) {
317
#define S(i) (buf[1][ly[iy]+i][ix][k])
318
float C0 = cy[iy][0], C1 = cy[iy][1], C2 = cy[iy][2], C3 = cy[iy][3];
319
for(ptrdiff_t ix = 0; ix < BLOCK; ++ix)
320
for(ptrdiff_t k = 0; k < 4; ++k)
321
buf[0][iy][ix][k] = S(0)*C0 + S(1)*C1 + S(2)*C2 + S(3)*C3;
322
#undef S
323
}
324
// Pack destination pixels.
325
for(ptrdiff_t iy = 0; iy < BLOCK; ++iy)
326
for(ptrdiff_t ix = 0; ix < BLOCK; ++ix) {
327
u8 pixel[4];
328
for(ptrdiff_t k = 0; k < 4; ++k) {
329
float C = buf[0][iy][ix][k];
330
if(!(C>0.0f)) C = 0.0f;
331
if(C>255.0f) C = 255.0f;
332
pixel[k] = (u8)(int)(C + 0.5f);
333
}
334
memcpy(dst_pixels + 4*(BLOCK*iy + ix), pixel, 4);
335
}
336
}
337
338
#if defined(_M_SSE)

#if defined(__GNUC__)
#define ALIGNED(n) __attribute__((aligned(n)))
#elif defined(_MSC_VER)
#define ALIGNED(n) __declspec(align(n))
#else
// For our use case, ALIGNED is a hint, not a requirement,
// so it's fine to ignore it.
#define ALIGNED(n)
#endif

// SSE2 implementation: upscales one BLOCK x BLOCK destination tile starting at
// destination pixel (x0, y0) and writes it, tightly packed with 4 bytes per pixel,
// to dst_pixels. Each pixel's four channels are processed together in one vector.
static void upscale_block_sse2(
	ptrdiff_t w, ptrdiff_t h,
	ptrdiff_t src_stride, const u8 *src_pixels,
	int wrap_mode, ptrdiff_t factor, float B, float C,
	ptrdiff_t x0, ptrdiff_t y0,
	u8 *dst_pixels) {
	float cx[BLOCK][4], cy[BLOCK][4];
	ptrdiff_t lx[BLOCK], ly[BLOCK], lx0, ly0, sx, sy;
	ALIGNED(16) u8 src[BLOCK+4][(BLOCK+4)*4];
	ALIGNED(16) float buf[2][BLOCK+4][BLOCK+4][4];

	init_block(
		w, h, src_stride, src_pixels, wrap_mode, factor, B, C, x0, y0,
		cx, cy, lx, ly, &lx0, &ly0, &sx, &sy, src);

	// Unpack source pixels: widen the four bytes to 32-bit lanes, then convert to float.
	const __m128i zeroBytes = _mm_setzero_si128();
	for (ptrdiff_t row = 0; row < sy; ++row) {
		for (ptrdiff_t col = 0; col < sx; ++col) {
			int packed;
			memcpy(&packed, src[row] + 4*col, 4);
			__m128i lanes = _mm_cvtsi32_si128(packed);
			lanes = _mm_unpacklo_epi8(lanes, zeroBytes);
			lanes = _mm_unpacklo_epi8(lanes, zeroBytes);
			_mm_storeu_ps(buf[0][row][col], _mm_cvtepi32_ps(lanes));
		}
	}

	// Horizontal pass: buf[0] -> buf[1].
	for (ptrdiff_t col = 0; col < BLOCK; ++col) {
		const __m128 k0 = _mm_set1_ps(cx[col][0]);
		const __m128 k1 = _mm_set1_ps(cx[col][1]);
		const __m128 k2 = _mm_set1_ps(cx[col][2]);
		const __m128 k3 = _mm_set1_ps(cx[col][3]);
		const ptrdiff_t first = lx[col];
		for (ptrdiff_t row = 0; row < sy; ++row) {
			const __m128 t0 = _mm_mul_ps(_mm_loadu_ps(buf[0][row][first + 0]), k0);
			const __m128 t1 = _mm_mul_ps(_mm_loadu_ps(buf[0][row][first + 1]), k1);
			const __m128 t2 = _mm_mul_ps(_mm_loadu_ps(buf[0][row][first + 2]), k2);
			const __m128 t3 = _mm_mul_ps(_mm_loadu_ps(buf[0][row][first + 3]), k3);
			// Summed as t0 + (t1 + (t2 + t3)).
			_mm_storeu_ps(buf[1][row][col], _mm_add_ps(t0, _mm_add_ps(t1, _mm_add_ps(t2, t3))));
		}
	}

	// Vertical pass: buf[1] -> buf[0].
	for (ptrdiff_t row = 0; row < BLOCK; ++row) {
		const __m128 k0 = _mm_set1_ps(cy[row][0]);
		const __m128 k1 = _mm_set1_ps(cy[row][1]);
		const __m128 k2 = _mm_set1_ps(cy[row][2]);
		const __m128 k3 = _mm_set1_ps(cy[row][3]);
		const ptrdiff_t first = ly[row];
		for (ptrdiff_t col = 0; col < BLOCK; ++col) {
			const __m128 t0 = _mm_mul_ps(_mm_loadu_ps(buf[1][first + 0][col]), k0);
			const __m128 t1 = _mm_mul_ps(_mm_loadu_ps(buf[1][first + 1][col]), k1);
			const __m128 t2 = _mm_mul_ps(_mm_loadu_ps(buf[1][first + 2][col]), k2);
			const __m128 t3 = _mm_mul_ps(_mm_loadu_ps(buf[1][first + 3][col]), k3);
			// Summed as t0 + (t1 + (t2 + t3)).
			_mm_storeu_ps(buf[0][row][col], _mm_add_ps(t0, _mm_add_ps(t1, _mm_add_ps(t2, t3))));
		}
	}

	// Pack destination pixels: clamp to [0, 255], add 0.5 and truncate, then narrow to bytes.
	for (ptrdiff_t row = 0; row < BLOCK; ++row) {
		for (ptrdiff_t col = 0; col < BLOCK; ++col) {
			__m128 value = _mm_loadu_ps(buf[0][row][col]);
			value = _mm_min_ps(_mm_max_ps(value, _mm_set1_ps(0.0f)), _mm_set1_ps(255.0f));
			value = _mm_add_ps(value, _mm_set1_ps(0.5f));
			__m128i narrowed = _mm_cvttps_epi32(value);
			narrowed = _mm_packus_epi16(narrowed, narrowed);
			narrowed = _mm_packus_epi16(narrowed, narrowed);
			const int packed = _mm_cvtsi128_si32(narrowed);
			memcpy(dst_pixels + 4*(BLOCK*row + col), &packed, 4);
		}
	}
}
#endif // defined(_M_SSE)
417
418
static void upscale_cubic(
419
ptrdiff_t width, ptrdiff_t height, ptrdiff_t src_stride_in_bytes, const void *src_pixels,
420
ptrdiff_t dst_stride_in_bytes, void *dst_pixels,
421
ptrdiff_t scale, float B, float C, int wrap_mode,
422
ptrdiff_t x0, ptrdiff_t y0, ptrdiff_t x1, ptrdiff_t y1) {
423
u8 pixels[BLOCK*BLOCK*4];
424
for(ptrdiff_t y = y0; y < y1; y+= BLOCK)
425
for(ptrdiff_t x = x0; x < x1; x+= BLOCK) {
426
#if defined(_M_SSE)
427
upscale_block_sse2(width, height, src_stride_in_bytes, (const u8*)src_pixels, wrap_mode, scale, B, C, x, y, pixels);
428
#else
429
upscale_block_c (width, height, src_stride_in_bytes, (const u8*)src_pixels, wrap_mode, scale, B, C, x, y, pixels);
430
#endif
431
for(ptrdiff_t iy = 0, ny = (y1-y < BLOCK ? y1-y : BLOCK), nx = (x1-x < BLOCK ? x1-x : BLOCK); iy < ny; ++iy)
432
memcpy((u8*)dst_pixels + dst_stride_in_bytes*(y+iy) + 4*x, pixels + BLOCK*4*iy, (size_t)(4*nx));
433
}
434
}
435
436
// End of pasted cubic upscaler.
437
438
void scaleBicubicBSpline(int factor, const u32 *data, u32 *out, int w, int h, int l, int u) {
439
const float B = 1.0f, C = 0.0f;
440
const int wrap_mode = 1; // Clamp
441
upscale_cubic(
442
w, h, w*4, data,
443
factor*w*4, out,
444
factor, B, C, wrap_mode,
445
0, factor*l, factor*w, factor*u);
446
}
447
448
void scaleBicubicMitchell(int factor, const u32 *data, u32 *out, int w, int h, int l, int u) {
449
const float B = 0.0f, C = 0.5f; // Actually, Catmull-Rom
450
const int wrap_mode = 1; // Clamp
451
upscale_cubic(
452
w, h, w*4, data,
453
factor*w*4, out,
454
factor, B, C, wrap_mode,
455
0, factor*l, factor*w, factor*u);
456
}
457
458
//////////////////////////////////////////////////////////////////// Bilinear scaling
459
460
// Blend weights for integral bilinear upscaling, indexed as
// [factor - 2][sub-pixel index][0 = weight of the neighbouring pixel, 1 = weight of the center pixel].
// Unused sub-pixel slots are zero-filled.
static const u8 BILINEAR_FACTORS[4][3][2] = {
	// x2
	{ { 44, 211 }, { 0, 0 }, { 0, 0 } },
	// x3
	{ { 64, 191 }, { 0, 255 }, { 0, 0 } },
	// x4
	{ { 77, 178 }, { 26, 229 }, { 0, 0 } },
	// x5
	{ { 102, 153 }, { 51, 204 }, { 0, 255 } },
};
466
// integral bilinear upscaling by factor f, horizontal part
467
template<int f>
468
void bilinearHt(const u32 *data, u32 *out, int w, int l, int u) {
469
static_assert(f > 1 && f <= 5, "Bilinear scaling only implemented for factors 2 to 5");
470
int outw = w*f;
471
for (int y = l; y < u; ++y) {
472
for (int x = 0; x < w; ++x) {
473
int inpos = y*w + x;
474
u32 left = data[inpos - (x == 0 ? 0 : 1)];
475
u32 center = data[inpos];
476
u32 right = data[inpos + (x == w - 1 ? 0 : 1)];
477
int i = 0;
478
for (; i < f / 2 + f % 2; ++i) { // first half of the new pixels + center, hope the compiler unrolls this
479
out[y*outw + x*f + i] = MIX_PIXELS(left, center, BILINEAR_FACTORS[f - 2][i]);
480
}
481
for (; i < f; ++i) { // second half of the new pixels, hope the compiler unrolls this
482
out[y*outw + x*f + i] = MIX_PIXELS(right, center, BILINEAR_FACTORS[f - 2][f - 1 - i]);
483
}
484
}
485
}
486
}
487
void bilinearH(int factor, const u32 *data, u32 *out, int w, int l, int u) {
488
switch (factor) {
489
case 2: bilinearHt<2>(data, out, w, l, u); break;
490
case 3: bilinearHt<3>(data, out, w, l, u); break;
491
case 4: bilinearHt<4>(data, out, w, l, u); break;
492
case 5: bilinearHt<5>(data, out, w, l, u); break;
493
default: ERROR_LOG(Log::G3D, "Bilinear upsampling only implemented for factors 2 to 5");
494
}
495
}
496
// integral bilinear upscaling by factor f, vertical part
497
// gl/gu == global lower and upper bound
498
template<int f>
499
void bilinearVt(const u32 *data, u32 *out, int w, int gl, int gu, int l, int u) {
500
static_assert(f>1 && f <= 5, "Bilinear scaling only implemented for 2x, 3x, 4x, and 5x");
501
int outw = w*f;
502
for (int xb = 0; xb < outw / BLOCK_SIZE + 1; ++xb) {
503
for (int y = l; y < u; ++y) {
504
u32 uy = y - (y == gl ? 0 : 1);
505
u32 ly = y + (y == gu - 1 ? 0 : 1);
506
for (int x = xb*BLOCK_SIZE; x < (xb + 1)*BLOCK_SIZE && x < outw; ++x) {
507
u32 upper = data[uy * outw + x];
508
u32 center = data[y * outw + x];
509
u32 lower = data[ly * outw + x];
510
int i = 0;
511
for (; i < f / 2 + f % 2; ++i) { // first half of the new pixels + center, hope the compiler unrolls this
512
out[(y*f + i)*outw + x] = MIX_PIXELS(upper, center, BILINEAR_FACTORS[f - 2][i]);
513
}
514
for (; i < f; ++i) { // second half of the new pixels, hope the compiler unrolls this
515
out[(y*f + i)*outw + x] = MIX_PIXELS(lower, center, BILINEAR_FACTORS[f - 2][f - 1 - i]);
516
}
517
}
518
}
519
}
520
}
521
void bilinearV(int factor, const u32 *data, u32 *out, int w, int gl, int gu, int l, int u) {
522
switch (factor) {
523
case 2: bilinearVt<2>(data, out, w, gl, gu, l, u); break;
524
case 3: bilinearVt<3>(data, out, w, gl, gu, l, u); break;
525
case 4: bilinearVt<4>(data, out, w, gl, gu, l, u); break;
526
case 5: bilinearVt<5>(data, out, w, gl, gu, l, u); break;
527
default: ERROR_LOG(Log::G3D, "Bilinear upsampling only implemented for factors 2 to 5");
528
}
529
}
530
531
#undef BLOCK_SIZE
532
#undef MIX_PIXELS
533
#undef DISTANCE
534
#undef R
535
#undef G
536
#undef B
537
#undef A
538
539
}
540
541
/////////////////////////////////////// Texture Scaler
542
543
// Nothing to set up here; initBicubicWeights() used to be called from this constructor.
TextureScalerCommon::TextureScalerCommon() = default;
546
547
// No explicit cleanup is performed here.
TextureScalerCommon::~TextureScalerCommon() = default;
549
550
bool TextureScalerCommon::IsEmptyOrFlat(const u32 *data, int pixels) {
551
u32 ref = data[0];
552
// TODO: SIMD-ify this (although, for most textures we'll get out very early)
553
for (int i = 1; i < pixels; ++i) {
554
if (data[i] != ref)
555
return false;
556
}
557
return true;
558
}
559
560
// Always produces a width * factor by height * factor image in 'out' and reports its
// size through scaledWidth / scaledHeight.
// Non-flat textures are forwarded to ScaleInto(). Flat textures (every pixel equal)
// skip the scaler and are filled with the single colour directly.
void TextureScalerCommon::ScaleAlways(u32 *out, u32 *src, int width, int height, int *scaledWidth, int *scaledHeight, int factor) {
	if (!IsEmptyOrFlat(src, width * height)) {
		ScaleInto(out, src, width, height, scaledWidth, scaledHeight, factor);
		return;
	}

	// This means it was a flat texture.  Vulkan wants the size up front, so we need to make it happen.
	const u32 pixel = *src;

	*scaledWidth = width * factor;
	*scaledHeight = height * factor;

	// Widen before multiplying so the pixel count is computed in size_t rather than int.
	const size_t pixelCount = static_cast<size_t>(*scaledWidth) * static_cast<size_t>(*scaledHeight);

	// ABCD.  If A = D, and AB = CD, then they must all be equal (B = C, etc.)
	const bool allBytesEqual = (pixel & 0x000000FF) == (pixel >> 24) && (pixel & 0x0000FFFF) == (pixel >> 16);
	if (allBytesEqual) {
		// Every byte of the output is the same, so a plain memset is enough.
		memset(out, pixel & 0xFF, pixelCount * sizeof(u32));
	} else {
		std::fill_n(out, pixelCount, pixel);
	}
}
583
584
// Scales 'src' (width x height) by 'factor' into outputBuf using the scaling type
// selected in g_Config, optionally deposterizing the input first.
// Writes the scaled dimensions to scaledWidth / scaledHeight and always returns true;
// an unknown scaling type is only reported through the error log.
bool TextureScalerCommon::ScaleInto(u32 *outputBuf, u32 *src, int width, int height, int *scaledWidth, int *scaledHeight, int factor) {
#ifdef SCALING_MEASURE_TIME
	const double t_start = time_now_d();
#endif

	// deposterize
	u32 *scaleSource = src;
	if (g_Config.bTexDeposterize) {
		bufDeposter.resize(width * height);
		u32 *deposterized = bufDeposter.data();
		DePosterize(src, deposterized, width, height);
		scaleSource = deposterized;
	}

	// scale
	switch (g_Config.iTexScalingType) {
	case XBRZ:
		ScaleXBRZ(factor, scaleSource, outputBuf, width, height);
		break;

	case HYBRID:
		ScaleHybrid(factor, scaleSource, outputBuf, width, height);
		break;

	case BICUBIC:
		ScaleBicubicMitchell(factor, scaleSource, outputBuf, width, height);
		break;

	case HYBRID_BICUBIC:
		ScaleHybrid(factor, scaleSource, outputBuf, width, height, true);
		break;

	default:
		ERROR_LOG(Log::G3D, "Unknown scaling type: %d", g_Config.iTexScalingType);
	}

	// update values accordingly
	*scaledWidth = width * factor;
	*scaledHeight = height * factor;

#ifdef SCALING_MEASURE_TIME
	// Only report larger scaling operations.
	const int scaledPixels = *scaledWidth * *scaledHeight;
	if (scaledPixels > 64 * 64 * factor*factor) {
		const double t = time_now_d() - t_start;
		NOTICE_LOG(Log::G3D, "TextureScaler: processed %9d pixels in %6.5lf seconds. (%9.2lf Mpixels/second)",
			scaledPixels, t, (scaledPixels) / (t * 1000 * 1000));
	}
#endif

	return true;
}
630
631
// Scales the texture behind 'data' by 'factor' into the internal output buffer.
// On success 'data' is redirected to that buffer and true is returned.
// Flat textures (every pixel equal) are left untouched and false is returned.
bool TextureScalerCommon::Scale(u32* &data, int width, int height, int *scaledWidth, int *scaledHeight, int factor) {
	// prevent processing empty or flat textures (this happens a lot in some games)
	// doesn't hurt the standard case, will be very quick for textures with actual texture
	const bool flat = IsEmptyOrFlat(data, width*height);
	if (flat) {
		DEBUG_LOG(Log::G3D, "TextureScaler: early exit -- empty/flat texture");
		return false;
	}

	// used to store the upscaled image
	bufOutput.resize(width * height * (factor * factor));
	u32 *const scaled = bufOutput.data();

	const bool ok = ScaleInto(scaled, data, width, height, scaledWidth, scaledHeight, factor);
	if (ok) {
		data = scaled;
	}
	return ok;
}
648
649
const int MIN_LINES_PER_THREAD = 4;
650
651
void TextureScalerCommon::ScaleXBRZ(int factor, u32* source, u32* dest, int width, int height) {
652
xbrz::ScalerCfg cfg;
653
ParallelRangeLoop(&g_threadManager, std::bind(&xbrz::scale, factor, source, dest, width, height, xbrz::ColorFormat::ARGB, cfg, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
654
}
655
656
void TextureScalerCommon::ScaleBilinear(int factor, u32* source, u32* dest, int width, int height) {
657
bufTmp1.resize(width * height * factor);
658
u32 *tmpBuf = bufTmp1.data();
659
ParallelRangeLoop(&g_threadManager, std::bind(&bilinearH, factor, source, tmpBuf, width, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
660
ParallelRangeLoop(&g_threadManager, std::bind(&bilinearV, factor, tmpBuf, dest, width, 0, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
661
}
662
663
void TextureScalerCommon::ScaleBicubicBSpline(int factor, u32* source, u32* dest, int width, int height) {
664
ParallelRangeLoop(&g_threadManager,std::bind(&scaleBicubicBSpline, factor, source, dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
665
}
666
667
void TextureScalerCommon::ScaleBicubicMitchell(int factor, u32* source, u32* dest, int width, int height) {
668
ParallelRangeLoop(&g_threadManager,std::bind(&scaleBicubicMitchell, factor, source, dest, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
669
}
670
671
void TextureScalerCommon::ScaleHybrid(int factor, u32* source, u32* dest, int width, int height, bool bicubic) {
672
// Basic algorithm:
673
// 1) determine a feature mask C based on a sobel-ish filter + splatting, and upscale that mask bilinearly
674
// 2) generate 2 scaled images: A - using Bilinear filtering, B - using xBRZ
675
// 3) output = A*C + B*(1-C)
676
677
const static int KERNEL_SPLAT[3][3] = {
678
{ 1, 1, 1 }, { 1, 1, 1 }, { 1, 1, 1 }
679
};
680
681
bufTmp1.resize(width*height);
682
bufTmp2.resize(width*height*factor*factor);
683
bufTmp3.resize(width*height*factor*factor);
684
685
ParallelRangeLoop(&g_threadManager,std::bind(&generateDistanceMask, source, bufTmp1.data(), width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
686
ParallelRangeLoop(&g_threadManager,std::bind(&convolve3x3, bufTmp1.data(), bufTmp2.data(), KERNEL_SPLAT, width, height, std::placeholders::_1, std::placeholders::_2), 0, height, MIN_LINES_PER_THREAD);
687
ScaleBilinear(factor, bufTmp2.data(), bufTmp3.data(), width, height);
688
// mask C is now in bufTmp3
689
690
ScaleXBRZ(factor, source, bufTmp2.data(), width, height);
691
// xBRZ upscaled source is in bufTmp2
692
693
if (bicubic) ScaleBicubicBSpline(factor, source, dest, width, height);
694
else ScaleBilinear(factor, source, dest, width, height);
695
// Upscaled source is in dest
696
697
// Now we can mix it all together
698
// The factor 8192 was found through practical testing on a variety of textures
699
ParallelRangeLoop(&g_threadManager,std::bind(&mix, dest, bufTmp2.data(), bufTmp3.data(), 8192, width*factor, std::placeholders::_1, std::placeholders::_2), 0, height*factor, MIN_LINES_PER_THREAD);
700
}
701
702
// Deposterizes 'source' (width x height) into 'dest' by running the horizontal and
// vertical passes twice. bufTmp3 holds the intermediate result of each horizontal pass;
// the second round reads the first round's output back from 'dest'.
void TextureScalerCommon::DePosterize(u32* source, u32* dest, int width, int height) {
	bufTmp3.resize(width*height);
	u32 *tmp = bufTmp3.data();

	const u32 *roundInput = source;
	for (int round = 0; round < 2; ++round) {
		auto horizontalPass = [roundInput, tmp, width](int lower, int upper) {
			deposterizeH(roundInput, tmp, width, lower, upper);
		};
		ParallelRangeLoop(&g_threadManager, horizontalPass, 0, height, MIN_LINES_PER_THREAD);

		auto verticalPass = [tmp, dest, width, height](int lower, int upper) {
			deposterizeV(tmp, dest, width, height, lower, upper);
		};
		ParallelRangeLoop(&g_threadManager, verticalPass, 0, height, MIN_LINES_PER_THREAD);

		// The next round refines what has just been written to dest.
		roundInput = dest;
	}
}
709
710