Path: blob/master/thirdparty/libwebp/src/dsp/lossless_avx2.c
// Copyright 2025 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// AVX2 variant of methods for lossless decoder
//
// Author: Vincent Rabaud ([email protected])

#include "src/dsp/dsp.h"

#if defined(WEBP_USE_AVX2)

#include <stddef.h>
#include <immintrin.h>

#include "src/dsp/cpu.h"
#include "src/dsp/lossless.h"
#include "src/webp/format_constants.h"
#include "src/webp/types.h"

//------------------------------------------------------------------------------
// Predictor Transform

static WEBP_INLINE void Average2_m256i(const __m256i* const a0,
                                       const __m256i* const a1,
                                       __m256i* const avg) {
  // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
  const __m256i ones = _mm256_set1_epi8(1);
  const __m256i avg1 = _mm256_avg_epu8(*a0, *a1);
  const __m256i one = _mm256_and_si256(_mm256_xor_si256(*a0, *a1), ones);
  *avg = _mm256_sub_epi8(avg1, one);
}
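
// Editorial note (illustrative, not part of the upstream code):
// _mm256_avg_epu8 computes the *rounded* per-byte average ((a + b + 1) >> 1);
// subtracting the parity bit (a ^ b) & 1 == (a + b) & 1 yields the truncated
// average without ever widening to 9 bits. A scalar sketch of the identity
// (Average2Scalar is a hypothetical name, not a libwebp function):
//
//   static uint8_t Average2Scalar(uint8_t a, uint8_t b) {
//     const uint8_t rounded = (uint8_t)((a + b + 1) >> 1);  // _mm256_avg_epu8
//     return (uint8_t)(rounded - ((a ^ b) & 1));  // == (uint8_t)((a + b) >> 1)
//   }
//
// E.g. a = 3, b = 4: rounded = 4, parity = 1, result = 3 == (3 + 4) >> 1.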

// Batch versions of those functions.

// Predictor0: ARGB_BLACK.
static void PredictorAdd0_AVX2(const uint32_t* in, const uint32_t* upper,
                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  const __m256i black = _mm256_set1_epi32((int)ARGB_BLACK);
  for (i = 0; i + 8 <= num_pixels; i += 8) {
    const __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);
    const __m256i res = _mm256_add_epi8(src, black);
    _mm256_storeu_si256((__m256i*)&out[i], res);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_SSE[0](in + i, NULL, num_pixels - i, out + i);
  }
  (void)upper;
}

// Predictor1: left.
static void PredictorAdd1_AVX2(const uint32_t* in, const uint32_t* upper,
                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  __m256i prev = _mm256_set1_epi32((int)out[-1]);
  for (i = 0; i + 8 <= num_pixels; i += 8) {
    // h | g | f | e | d | c | b | a
    const __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);
    // g | f | e | 0 | c | b | a | 0
    const __m256i shift0 = _mm256_slli_si256(src, 4);
    // g + h | f + g | e + f | e | c + d | b + c | a + b | a
    const __m256i sum0 = _mm256_add_epi8(src, shift0);
    // e + f | e | 0 | 0 | a + b | a | 0 | 0
    const __m256i shift1 = _mm256_slli_si256(sum0, 8);
    // e + f + g + h | e + f + g | e + f | e | a + b + c + d | a + b + c |
    // a + b | a
    const __m256i sum1 = _mm256_add_epi8(sum0, shift1);
    // Add a + b + c + d to the upper lane.
    const int32_t sum_abcd = _mm256_extract_epi32(sum1, 3);
    const __m256i sum2 = _mm256_add_epi8(
        sum1,
        _mm256_set_epi32(sum_abcd, sum_abcd, sum_abcd, sum_abcd, 0, 0, 0, 0));

    const __m256i res = _mm256_add_epi8(sum2, prev);
    _mm256_storeu_si256((__m256i*)&out[i], res);
    // replicate last res output in prev.
    prev = _mm256_permutevar8x32_epi32(
        res, _mm256_set_epi32(7, 7, 7, 7, 7, 7, 7, 7));
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_SSE[1](in + i, upper + i, num_pixels - i, out + i);
  }
}
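
// Editorial note (illustrative, not part of the upstream code): Predictor1 is
// the serial recurrence out[i] = in[i] + out[i - 1] on each byte channel, mod
// 256, i.e. a prefix sum of the residuals seeded with out[-1]. The shifted
// adds above evaluate that prefix sum for the 4 pixels of a 128-bit lane in
// two log-steps instead of four dependent additions, and the extract and
// broadcast of a + b + c + d then fixes up the upper lane. Scalar form of the
// recurrence being vectorized:
//
//   for (int i = 0; i < num_pixels; ++i) {
//     out[i] = VP8LAddPixels(in[i], out[i - 1]);  // per-channel add, mod 256
//   }
//
// (VP8LAddPixels is the scalar per-channel adder from lossless_common.h.)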

// Macro that adds 32-bit integers from IN using mod 256 arithmetic
// per 8 bit channel.
#define GENERATE_PREDICTOR_1(X, IN)                                           \
  static void PredictorAdd##X##_AVX2(const uint32_t* in,                      \
                                     const uint32_t* upper, int num_pixels,   \
                                     uint32_t* WEBP_RESTRICT out) {           \
    int i;                                                                    \
    for (i = 0; i + 8 <= num_pixels; i += 8) {                                \
      const __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);         \
      const __m256i other = _mm256_loadu_si256((const __m256i*)&(IN));        \
      const __m256i res = _mm256_add_epi8(src, other);                        \
      _mm256_storeu_si256((__m256i*)&out[i], res);                            \
    }                                                                         \
    if (i != num_pixels) {                                                    \
      VP8LPredictorsAdd_SSE[(X)](in + i, upper + i, num_pixels - i, out + i); \
    }                                                                         \
  }

// Predictor2: Top.
GENERATE_PREDICTOR_1(2, upper[i])
// Predictor3: Top-right.
GENERATE_PREDICTOR_1(3, upper[i + 1])
// Predictor4: Top-left.
GENERATE_PREDICTOR_1(4, upper[i - 1])
#undef GENERATE_PREDICTOR_1

// Due to averages with integers, values cannot be accumulated in parallel for
// predictors 5 to 7.

#define GENERATE_PREDICTOR_2(X, IN)                                           \
  static void PredictorAdd##X##_AVX2(const uint32_t* in,                      \
                                     const uint32_t* upper, int num_pixels,   \
                                     uint32_t* WEBP_RESTRICT out) {           \
    int i;                                                                    \
    for (i = 0; i + 8 <= num_pixels; i += 8) {                                \
      const __m256i Tother = _mm256_loadu_si256((const __m256i*)&(IN));       \
      const __m256i T = _mm256_loadu_si256((const __m256i*)&upper[i]);        \
      const __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);         \
      __m256i avg, res;                                                       \
      Average2_m256i(&T, &Tother, &avg);                                      \
      res = _mm256_add_epi8(avg, src);                                        \
      _mm256_storeu_si256((__m256i*)&out[i], res);                            \
    }                                                                         \
    if (i != num_pixels) {                                                    \
      VP8LPredictorsAdd_SSE[(X)](in + i, upper + i, num_pixels - i, out + i); \
    }                                                                         \
  }
// Predictor8: average TL T.
GENERATE_PREDICTOR_2(8, upper[i - 1])
// Predictor9: average T TR.
GENERATE_PREDICTOR_2(9, upper[i + 1])
#undef GENERATE_PREDICTOR_2

// Predictor10: average of (average of (L, TL), average of (T, TR)).
#define DO_PRED10(OUT)                                  \
  do {                                                  \
    __m256i avgLTL, avg;                                \
    Average2_m256i(&L, &TL, &avgLTL);                   \
    Average2_m256i(&avgTTR, &avgLTL, &avg);             \
    L = _mm256_add_epi8(avg, src);                      \
    out[i + (OUT)] = (uint32_t)_mm256_cvtsi256_si32(L); \
  } while (0)

#define DO_PRED10_SHIFT                                          \
  do {                                                           \
    /* Rotate the pre-computed values for the next iteration. */ \
    avgTTR = _mm256_srli_si256(avgTTR, 4);                       \
    TL = _mm256_srli_si256(TL, 4);                               \
    src = _mm256_srli_si256(src, 4);                             \
  } while (0)

static void PredictorAdd10_AVX2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i, j;
  __m256i L = _mm256_setr_epi32((int)out[-1], 0, 0, 0, 0, 0, 0, 0);
  for (i = 0; i + 8 <= num_pixels; i += 8) {
    __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);
    __m256i TL = _mm256_loadu_si256((const __m256i*)&upper[i - 1]);
    const __m256i T = _mm256_loadu_si256((const __m256i*)&upper[i]);
    const __m256i TR = _mm256_loadu_si256((const __m256i*)&upper[i + 1]);
    __m256i avgTTR;
    Average2_m256i(&T, &TR, &avgTTR);
    {
      const __m256i avgTTR_bak = avgTTR;
      const __m256i TL_bak = TL;
      const __m256i src_bak = src;
      for (j = 0; j < 4; ++j) {
        DO_PRED10(j);
        DO_PRED10_SHIFT;
      }
      avgTTR = _mm256_permute2x128_si256(avgTTR_bak, avgTTR_bak, 1);
      TL = _mm256_permute2x128_si256(TL_bak, TL_bak, 1);
      src = _mm256_permute2x128_si256(src_bak, src_bak, 1);
      for (; j < 8; ++j) {
        DO_PRED10(j);
        DO_PRED10_SHIFT;
      }
    }
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_SSE[10](in + i, upper + i, num_pixels - i, out + i);
  }
}
#undef DO_PRED10
#undef DO_PRED10_SHIFT
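
// Editorial note (illustrative, not part of the upstream code): predictors
// 10-12 depend on the just-decoded left pixel L, so the loops above
// pre-compute 8 pixels of inputs and then peel off one pixel per DO_PRED*
// step. Since _mm256_srli_si256 only shifts within each 128-bit lane, pixels
// 0-3 are consumed from the low lane first; _mm256_permute2x128_si256(x, x, 1)
// then swaps the two lanes so pixels 4-7 can be consumed the same way:
//
//   x                                  = 7 6 5 4 | 3 2 1 0   // two lanes
//   _mm256_srli_si256(x, 4)            = 0 7 6 5 | 0 3 2 1   // per-lane shift
//   _mm256_permute2x128_si256(x, x, 1) = 3 2 1 0 | 7 6 5 4   // lane swap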

// Predictor11: select.
#define DO_PRED11(OUT)                                                        \
  do {                                                                        \
    const __m256i L_lo = _mm256_unpacklo_epi32(L, T);                         \
    const __m256i TL_lo = _mm256_unpacklo_epi32(TL, T);                       \
    const __m256i pb = _mm256_sad_epu8(L_lo, TL_lo); /* pb = sum |L-TL| */    \
    const __m256i mask = _mm256_cmpgt_epi32(pb, pa);                          \
    const __m256i A = _mm256_and_si256(mask, L);                              \
    const __m256i B = _mm256_andnot_si256(mask, T);                           \
    const __m256i pred = _mm256_or_si256(A, B); /* pred = (pb > pa)? L : T */ \
    L = _mm256_add_epi8(src, pred);                                           \
    out[i + (OUT)] = (uint32_t)_mm256_cvtsi256_si32(L);                       \
  } while (0)

#define DO_PRED11_SHIFT                                        \
  do {                                                         \
    /* Shift the pre-computed value for the next iteration. */ \
    T = _mm256_srli_si256(T, 4);                               \
    TL = _mm256_srli_si256(TL, 4);                             \
    src = _mm256_srli_si256(src, 4);                           \
    pa = _mm256_srli_si256(pa, 4);                             \
  } while (0)

static void PredictorAdd11_AVX2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i, j;
  __m256i pa;
  __m256i L = _mm256_setr_epi32((int)out[-1], 0, 0, 0, 0, 0, 0, 0);
  for (i = 0; i + 8 <= num_pixels; i += 8) {
    __m256i T = _mm256_loadu_si256((const __m256i*)&upper[i]);
    __m256i TL = _mm256_loadu_si256((const __m256i*)&upper[i - 1]);
    __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);
    {
      // We can unpack with any value on the upper 32 bits, provided it's the
      // same on both operands (so that their sum of abs diff is zero). Here we
      // use T.
      const __m256i T_lo = _mm256_unpacklo_epi32(T, T);
      const __m256i TL_lo = _mm256_unpacklo_epi32(TL, T);
      const __m256i T_hi = _mm256_unpackhi_epi32(T, T);
      const __m256i TL_hi = _mm256_unpackhi_epi32(TL, T);
      const __m256i s_lo = _mm256_sad_epu8(T_lo, TL_lo);
      const __m256i s_hi = _mm256_sad_epu8(T_hi, TL_hi);
      pa = _mm256_packs_epi32(s_lo, s_hi);  // pa = sum |T-TL|
    }
    {
      const __m256i T_bak = T;
      const __m256i TL_bak = TL;
      const __m256i src_bak = src;
      const __m256i pa_bak = pa;
      for (j = 0; j < 4; ++j) {
        DO_PRED11(j);
        DO_PRED11_SHIFT;
      }
      T = _mm256_permute2x128_si256(T_bak, T_bak, 1);
      TL = _mm256_permute2x128_si256(TL_bak, TL_bak, 1);
      src = _mm256_permute2x128_si256(src_bak, src_bak, 1);
      pa = _mm256_permute2x128_si256(pa_bak, pa_bak, 1);
      for (; j < 8; ++j) {
        DO_PRED11(j);
        DO_PRED11_SHIFT;
      }
    }
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_SSE[11](in + i, upper + i, num_pixels - i, out + i);
  }
}
#undef DO_PRED11
#undef DO_PRED11_SHIFT

// Predictor12: ClampedAddSubtractFull.
#define DO_PRED12(DIFF, OUT)                              \
  do {                                                    \
    const __m256i all = _mm256_add_epi16(L, (DIFF));      \
    const __m256i alls = _mm256_packus_epi16(all, all);   \
    const __m256i res = _mm256_add_epi8(src, alls);       \
    out[i + (OUT)] = (uint32_t)_mm256_cvtsi256_si32(res); \
    L = _mm256_unpacklo_epi8(res, zero);                  \
  } while (0)

#define DO_PRED12_SHIFT(DIFF, LANE)                            \
  do {                                                         \
    /* Shift the pre-computed value for the next iteration. */ \
    if ((LANE) == 0) (DIFF) = _mm256_srli_si256(DIFF, 8);      \
    src = _mm256_srli_si256(src, 4);                           \
  } while (0)

static void PredictorAdd12_AVX2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  const __m256i zero = _mm256_setzero_si256();
  const __m256i L8 = _mm256_setr_epi32((int)out[-1], 0, 0, 0, 0, 0, 0, 0);
  __m256i L = _mm256_unpacklo_epi8(L8, zero);
  for (i = 0; i + 8 <= num_pixels; i += 8) {
    // Load 8 pixels at a time.
    __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);
    const __m256i T = _mm256_loadu_si256((const __m256i*)&upper[i]);
    const __m256i T_lo = _mm256_unpacklo_epi8(T, zero);
    const __m256i T_hi = _mm256_unpackhi_epi8(T, zero);
    const __m256i TL = _mm256_loadu_si256((const __m256i*)&upper[i - 1]);
    const __m256i TL_lo = _mm256_unpacklo_epi8(TL, zero);
    const __m256i TL_hi = _mm256_unpackhi_epi8(TL, zero);
    __m256i diff_lo = _mm256_sub_epi16(T_lo, TL_lo);
    __m256i diff_hi = _mm256_sub_epi16(T_hi, TL_hi);
    const __m256i diff_lo_bak = diff_lo;
    const __m256i diff_hi_bak = diff_hi;
    const __m256i src_bak = src;
    DO_PRED12(diff_lo, 0);
    DO_PRED12_SHIFT(diff_lo, 0);
    DO_PRED12(diff_lo, 1);
    DO_PRED12_SHIFT(diff_lo, 0);
    DO_PRED12(diff_hi, 2);
    DO_PRED12_SHIFT(diff_hi, 0);
    DO_PRED12(diff_hi, 3);
    DO_PRED12_SHIFT(diff_hi, 0);

    // Process the upper lane.
    diff_lo = _mm256_permute2x128_si256(diff_lo_bak, diff_lo_bak, 1);
    diff_hi = _mm256_permute2x128_si256(diff_hi_bak, diff_hi_bak, 1);
    src = _mm256_permute2x128_si256(src_bak, src_bak, 1);

    DO_PRED12(diff_lo, 4);
    DO_PRED12_SHIFT(diff_lo, 0);
    DO_PRED12(diff_lo, 5);
    DO_PRED12_SHIFT(diff_lo, 1);
    DO_PRED12(diff_hi, 6);
    DO_PRED12_SHIFT(diff_hi, 0);
    DO_PRED12(diff_hi, 7);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_SSE[12](in + i, upper + i, num_pixels - i, out + i);
  }
}
#undef DO_PRED12
#undef DO_PRED12_SHIFT

// Due to averages with integers, values cannot be accumulated in parallel for
// predictor 13.

//------------------------------------------------------------------------------
// Subtract-Green Transform

static void AddGreenToBlueAndRed_AVX2(const uint32_t* const src, int num_pixels,
                                      uint32_t* dst) {
  int i;
  const __m256i kCstShuffle = _mm256_set_epi8(
      -1, 29, -1, 29, -1, 25, -1, 25, -1, 21, -1, 21, -1, 17, -1, 17, -1, 13,
      -1, 13, -1, 9, -1, 9, -1, 5, -1, 5, -1, 1, -1, 1);
  for (i = 0; i + 8 <= num_pixels; i += 8) {
    const __m256i in = _mm256_loadu_si256((const __m256i*)&src[i]);  // argb
    const __m256i in_0g0g = _mm256_shuffle_epi8(in, kCstShuffle);    // 0g0g
    const __m256i out = _mm256_add_epi8(in, in_0g0g);
    _mm256_storeu_si256((__m256i*)&dst[i], out);
  }
  // Fall through and finish off with SSE.
  if (i != num_pixels) {
    VP8LAddGreenToBlueAndRed_SSE(src + i, num_pixels - i, dst + i);
  }
}
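
// Editorial note (illustrative, not part of the upstream code): the
// subtract-green transform stores red and blue as offsets from green, so the
// inverse just adds green back to both channels, mod 256. kCstShuffle copies
// each pixel's green byte into the red and blue byte positions (the -1
// indices produce zeros in the alpha and green positions), so a single
// _mm256_add_epi8 restores 8 pixels at once. Scalar equivalent for one ARGB
// pixel:
//
//   const uint32_t argb = src[i];
//   const uint32_t green = (argb >> 8) & 0xff;
//   const uint32_t red = (((argb >> 16) & 0xff) + green) & 0xff;
//   const uint32_t blue = ((argb & 0xff) + green) & 0xff;
//   dst[i] = (argb & 0xff00ff00u) | (red << 16) | blue;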

//------------------------------------------------------------------------------
// Color Transform

static void TransformColorInverse_AVX2(const VP8LMultipliers* const m,
                                       const uint32_t* const src,
                                       int num_pixels, uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 5.
#define CST(X) (((int16_t)(m->X << 8)) >> 5)  // sign-extend
  const __m256i mults_rb =
      _mm256_set1_epi32((int)((uint32_t)CST(green_to_red) << 16 |
                              (CST(green_to_blue) & 0xffff)));
  const __m256i mults_b2 = _mm256_set1_epi32(CST(red_to_blue));
#undef CST
  const __m256i mask_ag = _mm256_set1_epi32((int)0xff00ff00);
  const __m256i perm1 = _mm256_setr_epi8(
      -1, 1, -1, 1, -1, 5, -1, 5, -1, 9, -1, 9, -1, 13, -1, 13, -1, 17, -1, 17,
      -1, 21, -1, 21, -1, 25, -1, 25, -1, 29, -1, 29);
  const __m256i perm2 = _mm256_setr_epi8(
      -1, 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1, 18, -1,
      -1, -1, 22, -1, -1, -1, 26, -1, -1, -1, 30, -1, -1);
  int i;
  for (i = 0; i + 8 <= num_pixels; i += 8) {
    const __m256i A = _mm256_loadu_si256((const __m256i*)(src + i));
    const __m256i B = _mm256_shuffle_epi8(A, perm1);  // argb -> g0g0
    const __m256i C = _mm256_mulhi_epi16(B, mults_rb);
    const __m256i D = _mm256_add_epi8(A, C);
    const __m256i E = _mm256_shuffle_epi8(D, perm2);
    const __m256i F = _mm256_mulhi_epi16(E, mults_b2);
    const __m256i G = _mm256_add_epi8(D, F);
    const __m256i out = _mm256_blendv_epi8(G, A, mask_ag);
    _mm256_storeu_si256((__m256i*)&dst[i], out);
  }
  // Fall back to the SSE version for the left-overs.
  if (i != num_pixels) {
    VP8LTransformColorInverse_SSE(m, src + i, num_pixels - i, dst + i);
  }
}

//------------------------------------------------------------------------------
// Color-space conversion functions

static void ConvertBGRAToRGBA_AVX2(const uint32_t* WEBP_RESTRICT src,
                                   int num_pixels, uint8_t* WEBP_RESTRICT dst) {
  const __m256i* in = (const __m256i*)src;
  __m256i* out = (__m256i*)dst;
  while (num_pixels >= 8) {
    const __m256i A = _mm256_loadu_si256(in++);
    const __m256i B = _mm256_shuffle_epi8(
        A,
        _mm256_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2,
                        15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2));
    _mm256_storeu_si256(out++, B);
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGBA_SSE((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}

//------------------------------------------------------------------------------
// Entry point

extern void VP8LDspInitAVX2(void);

WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitAVX2(void) {
  VP8LPredictorsAdd[0] = PredictorAdd0_AVX2;
  VP8LPredictorsAdd[1] = PredictorAdd1_AVX2;
  VP8LPredictorsAdd[2] = PredictorAdd2_AVX2;
  VP8LPredictorsAdd[3] = PredictorAdd3_AVX2;
  VP8LPredictorsAdd[4] = PredictorAdd4_AVX2;
  VP8LPredictorsAdd[8] = PredictorAdd8_AVX2;
  VP8LPredictorsAdd[9] = PredictorAdd9_AVX2;
  VP8LPredictorsAdd[10] = PredictorAdd10_AVX2;
  VP8LPredictorsAdd[11] = PredictorAdd11_AVX2;
  VP8LPredictorsAdd[12] = PredictorAdd12_AVX2;

  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_AVX2;
  VP8LTransformColorInverse = TransformColorInverse_AVX2;
  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_AVX2;
}

#else  // !WEBP_USE_AVX2

WEBP_DSP_INIT_STUB(VP8LDspInitAVX2)

#endif  // WEBP_USE_AVX2
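
// Editorial note (illustrative, not part of the upstream code): this file only
// defines the AVX2 entry point. The generic VP8LDspInit() in
// src/dsp/lossless.c installs the portable and SSE variants first and then,
// roughly as sketched below, overrides the pointers when the CPU reports AVX2
// support:
//
//   if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kAVX2)) {
//     VP8LDspInitAVX2();
//   }
//
// That ordering is why the leftover-pixel paths above can safely call the
// VP8L*_SSE fall-backs: those pointers are populated before this init runs.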