Path: blob/master/thirdparty/libwebp/src/dsp/lossless_avx2.c
// Copyright 2025 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// AVX2 variant of methods for lossless decoder
//
// Author: Vincent Rabaud ([email protected])

#include "src/dsp/dsp.h"

#if defined(WEBP_USE_AVX2)

#include <stddef.h>
#include <immintrin.h>

#include "src/dsp/cpu.h"
#include "src/dsp/lossless.h"
#include "src/webp/format_constants.h"
#include "src/webp/types.h"

//------------------------------------------------------------------------------
// Predictor Transform

static WEBP_INLINE void Average2_m256i(const __m256i* const a0,
                                       const __m256i* const a1,
                                       __m256i* const avg) {
  // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
  const __m256i ones = _mm256_set1_epi8(1);
  const __m256i avg1 = _mm256_avg_epu8(*a0, *a1);
  const __m256i one = _mm256_and_si256(_mm256_xor_si256(*a0, *a1), ones);
  *avg = _mm256_sub_epi8(avg1, one);
}
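
// Editorial note (illustrative, not part of the upstream code):
// _mm256_avg_epu8 computes the *rounded* per-byte average ((a + b + 1) >> 1);
// subtracting the parity bit (a ^ b) & 1 == (a + b) & 1 yields the truncated
// average without ever widening to 9 bits. A scalar sketch of the identity
// (Average2Scalar is a hypothetical name, not a libwebp function):
//
//   static uint8_t Average2Scalar(uint8_t a, uint8_t b) {
//     const uint8_t rounded = (uint8_t)((a + b + 1) >> 1);  // _mm256_avg_epu8
//     return (uint8_t)(rounded - ((a ^ b) & 1));  // == (uint8_t)((a + b) >> 1)
//   }
//
// E.g. a = 3, b = 4: rounded = 4, parity = 1, result = 3 == (3 + 4) >> 1.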

// Batch versions of those functions.

// Predictor0: ARGB_BLACK.
static void PredictorAdd0_AVX2(const uint32_t* in, const uint32_t* upper,
                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  const __m256i black = _mm256_set1_epi32((int)ARGB_BLACK);
  for (i = 0; i + 8 <= num_pixels; i += 8) {
    const __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);
    const __m256i res = _mm256_add_epi8(src, black);
    _mm256_storeu_si256((__m256i*)&out[i], res);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_SSE[0](in + i, NULL, num_pixels - i, out + i);
  }
  (void)upper;
}

// Predictor1: left.
static void PredictorAdd1_AVX2(const uint32_t* in, const uint32_t* upper,
                               int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  __m256i prev = _mm256_set1_epi32((int)out[-1]);
  for (i = 0; i + 8 <= num_pixels; i += 8) {
    // h | g | f | e | d | c | b | a
    const __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);
    // g | f | e | 0 | c | b | a | 0
    const __m256i shift0 = _mm256_slli_si256(src, 4);
    // g + h | f + g | e + f | e | c + d | b + c | a + b | a
    const __m256i sum0 = _mm256_add_epi8(src, shift0);
    // e + f | e | 0 | 0 | a + b | a | 0 | 0
    const __m256i shift1 = _mm256_slli_si256(sum0, 8);
    // e + f + g + h | e + f + g | e + f | e | a + b + c + d | a + b + c |
    // a + b | a
    const __m256i sum1 = _mm256_add_epi8(sum0, shift1);
    // Add a + b + c + d to the upper lane.
    const int32_t sum_abcd = _mm256_extract_epi32(sum1, 3);
    const __m256i sum2 = _mm256_add_epi8(
        sum1,
        _mm256_set_epi32(sum_abcd, sum_abcd, sum_abcd, sum_abcd, 0, 0, 0, 0));

    const __m256i res = _mm256_add_epi8(sum2, prev);
    _mm256_storeu_si256((__m256i*)&out[i], res);
    // replicate last res output in prev.
    prev = _mm256_permutevar8x32_epi32(
        res, _mm256_set_epi32(7, 7, 7, 7, 7, 7, 7, 7));
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_SSE[1](in + i, upper + i, num_pixels - i, out + i);
  }
}
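
// Editorial note (illustrative, not part of the upstream code): Predictor1 is
// the serial recurrence out[i] = in[i] + out[i - 1] on each byte channel, mod
// 256, i.e. a prefix sum of the residuals seeded with out[-1]. The shifted
// adds above evaluate that prefix sum for the 4 pixels of a 128-bit lane in
// two log-steps instead of four dependent additions, and the extract and
// broadcast of a + b + c + d then fixes up the upper lane. Scalar form of the
// recurrence being vectorized:
//
//   for (int i = 0; i < num_pixels; ++i) {
//     out[i] = VP8LAddPixels(in[i], out[i - 1]);  // per-channel add, mod 256
//   }
//
// (VP8LAddPixels is the scalar per-channel adder from lossless_common.h.)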

// Macro that adds 32-bit integers from IN using mod 256 arithmetic
// per 8 bit channel.
#define GENERATE_PREDICTOR_1(X, IN)                                           \
  static void PredictorAdd##X##_AVX2(const uint32_t* in,                      \
                                     const uint32_t* upper, int num_pixels,   \
                                     uint32_t* WEBP_RESTRICT out) {           \
    int i;                                                                    \
    for (i = 0; i + 8 <= num_pixels; i += 8) {                                \
      const __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);         \
      const __m256i other = _mm256_loadu_si256((const __m256i*)&(IN));        \
      const __m256i res = _mm256_add_epi8(src, other);                        \
      _mm256_storeu_si256((__m256i*)&out[i], res);                            \
    }                                                                         \
    if (i != num_pixels) {                                                    \
      VP8LPredictorsAdd_SSE[(X)](in + i, upper + i, num_pixels - i, out + i); \
    }                                                                         \
  }

// Predictor2: Top.
GENERATE_PREDICTOR_1(2, upper[i])
// Predictor3: Top-right.
GENERATE_PREDICTOR_1(3, upper[i + 1])
// Predictor4: Top-left.
GENERATE_PREDICTOR_1(4, upper[i - 1])
#undef GENERATE_PREDICTOR_1

// Due to averages with integers, values cannot be accumulated in parallel for
// predictors 5 to 7.

#define GENERATE_PREDICTOR_2(X, IN)                                           \
  static void PredictorAdd##X##_AVX2(const uint32_t* in,                      \
                                     const uint32_t* upper, int num_pixels,   \
                                     uint32_t* WEBP_RESTRICT out) {           \
    int i;                                                                    \
    for (i = 0; i + 8 <= num_pixels; i += 8) {                                \
      const __m256i Tother = _mm256_loadu_si256((const __m256i*)&(IN));       \
      const __m256i T = _mm256_loadu_si256((const __m256i*)&upper[i]);        \
      const __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);         \
      __m256i avg, res;                                                       \
      Average2_m256i(&T, &Tother, &avg);                                      \
      res = _mm256_add_epi8(avg, src);                                        \
      _mm256_storeu_si256((__m256i*)&out[i], res);                            \
    }                                                                         \
    if (i != num_pixels) {                                                    \
      VP8LPredictorsAdd_SSE[(X)](in + i, upper + i, num_pixels - i, out + i); \
    }                                                                         \
  }
// Predictor8: average TL T.
GENERATE_PREDICTOR_2(8, upper[i - 1])
// Predictor9: average T TR.
GENERATE_PREDICTOR_2(9, upper[i + 1])
#undef GENERATE_PREDICTOR_2

// Predictor10: average of (average of (L, TL), average of (T, TR)).
#define DO_PRED10(OUT)                                  \
  do {                                                  \
    __m256i avgLTL, avg;                                \
    Average2_m256i(&L, &TL, &avgLTL);                   \
    Average2_m256i(&avgTTR, &avgLTL, &avg);             \
    L = _mm256_add_epi8(avg, src);                      \
    out[i + (OUT)] = (uint32_t)_mm256_cvtsi256_si32(L); \
  } while (0)

#define DO_PRED10_SHIFT                                          \
  do {                                                           \
    /* Rotate the pre-computed values for the next iteration. */ \
    avgTTR = _mm256_srli_si256(avgTTR, 4);                       \
    TL = _mm256_srli_si256(TL, 4);                               \
    src = _mm256_srli_si256(src, 4);                             \
  } while (0)

static void PredictorAdd10_AVX2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i, j;
  __m256i L = _mm256_setr_epi32((int)out[-1], 0, 0, 0, 0, 0, 0, 0);
  for (i = 0; i + 8 <= num_pixels; i += 8) {
    __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);
    __m256i TL = _mm256_loadu_si256((const __m256i*)&upper[i - 1]);
    const __m256i T = _mm256_loadu_si256((const __m256i*)&upper[i]);
    const __m256i TR = _mm256_loadu_si256((const __m256i*)&upper[i + 1]);
    __m256i avgTTR;
    Average2_m256i(&T, &TR, &avgTTR);
    {
      const __m256i avgTTR_bak = avgTTR;
      const __m256i TL_bak = TL;
      const __m256i src_bak = src;
      for (j = 0; j < 4; ++j) {
        DO_PRED10(j);
        DO_PRED10_SHIFT;
      }
      avgTTR = _mm256_permute2x128_si256(avgTTR_bak, avgTTR_bak, 1);
      TL = _mm256_permute2x128_si256(TL_bak, TL_bak, 1);
      src = _mm256_permute2x128_si256(src_bak, src_bak, 1);
      for (; j < 8; ++j) {
        DO_PRED10(j);
        DO_PRED10_SHIFT;
      }
    }
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_SSE[10](in + i, upper + i, num_pixels - i, out + i);
  }
}
#undef DO_PRED10
#undef DO_PRED10_SHIFT
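
// Editorial note (illustrative, not part of the upstream code): predictors
// 10-12 depend on the just-decoded left pixel L, so the loops above
// pre-compute 8 pixels of inputs and then peel off one pixel per DO_PRED*
// step. Since _mm256_srli_si256 only shifts within each 128-bit lane, pixels
// 0-3 are consumed from the low lane first; _mm256_permute2x128_si256(x, x, 1)
// then swaps the two lanes so pixels 4-7 can be consumed the same way:
//
//   x                                  = 7 6 5 4 | 3 2 1 0   // two lanes
//   _mm256_srli_si256(x, 4)            = 0 7 6 5 | 0 3 2 1   // per-lane shift
//   _mm256_permute2x128_si256(x, x, 1) = 3 2 1 0 | 7 6 5 4   // lane swap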

// Predictor11: select.
#define DO_PRED11(OUT)                                                        \
  do {                                                                        \
    const __m256i L_lo = _mm256_unpacklo_epi32(L, T);                         \
    const __m256i TL_lo = _mm256_unpacklo_epi32(TL, T);                       \
    const __m256i pb = _mm256_sad_epu8(L_lo, TL_lo); /* pb = sum |L-TL| */    \
    const __m256i mask = _mm256_cmpgt_epi32(pb, pa);                          \
    const __m256i A = _mm256_and_si256(mask, L);                              \
    const __m256i B = _mm256_andnot_si256(mask, T);                           \
    const __m256i pred = _mm256_or_si256(A, B); /* pred = (pb > pa)? L : T */ \
    L = _mm256_add_epi8(src, pred);                                           \
    out[i + (OUT)] = (uint32_t)_mm256_cvtsi256_si32(L);                       \
  } while (0)

#define DO_PRED11_SHIFT                                        \
  do {                                                         \
    /* Shift the pre-computed value for the next iteration. */ \
    T = _mm256_srli_si256(T, 4);                               \
    TL = _mm256_srli_si256(TL, 4);                             \
    src = _mm256_srli_si256(src, 4);                           \
    pa = _mm256_srli_si256(pa, 4);                             \
  } while (0)

static void PredictorAdd11_AVX2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i, j;
  __m256i pa;
  __m256i L = _mm256_setr_epi32((int)out[-1], 0, 0, 0, 0, 0, 0, 0);
  for (i = 0; i + 8 <= num_pixels; i += 8) {
    __m256i T = _mm256_loadu_si256((const __m256i*)&upper[i]);
    __m256i TL = _mm256_loadu_si256((const __m256i*)&upper[i - 1]);
    __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);
    {
      // We can unpack with any value on the upper 32 bits, provided it's the
      // same on both operands (so that their sum of abs diff is zero). Here we
      // use T.
      const __m256i T_lo = _mm256_unpacklo_epi32(T, T);
      const __m256i TL_lo = _mm256_unpacklo_epi32(TL, T);
      const __m256i T_hi = _mm256_unpackhi_epi32(T, T);
      const __m256i TL_hi = _mm256_unpackhi_epi32(TL, T);
      const __m256i s_lo = _mm256_sad_epu8(T_lo, TL_lo);
      const __m256i s_hi = _mm256_sad_epu8(T_hi, TL_hi);
      pa = _mm256_packs_epi32(s_lo, s_hi);  // pa = sum |T-TL|
    }
    {
      const __m256i T_bak = T;
      const __m256i TL_bak = TL;
      const __m256i src_bak = src;
      const __m256i pa_bak = pa;
      for (j = 0; j < 4; ++j) {
        DO_PRED11(j);
        DO_PRED11_SHIFT;
      }
      T = _mm256_permute2x128_si256(T_bak, T_bak, 1);
      TL = _mm256_permute2x128_si256(TL_bak, TL_bak, 1);
      src = _mm256_permute2x128_si256(src_bak, src_bak, 1);
      pa = _mm256_permute2x128_si256(pa_bak, pa_bak, 1);
      for (; j < 8; ++j) {
        DO_PRED11(j);
        DO_PRED11_SHIFT;
      }
    }
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_SSE[11](in + i, upper + i, num_pixels - i, out + i);
  }
}
#undef DO_PRED11
#undef DO_PRED11_SHIFT

// Predictor12: ClampedAddSubtractFull.
#define DO_PRED12(DIFF, OUT)                              \
  do {                                                    \
    const __m256i all = _mm256_add_epi16(L, (DIFF));      \
    const __m256i alls = _mm256_packus_epi16(all, all);   \
    const __m256i res = _mm256_add_epi8(src, alls);       \
    out[i + (OUT)] = (uint32_t)_mm256_cvtsi256_si32(res); \
    L = _mm256_unpacklo_epi8(res, zero);                  \
  } while (0)

#define DO_PRED12_SHIFT(DIFF, LANE)                            \
  do {                                                         \
    /* Shift the pre-computed value for the next iteration. */ \
    if ((LANE) == 0) (DIFF) = _mm256_srli_si256(DIFF, 8);      \
    src = _mm256_srli_si256(src, 4);                           \
  } while (0)

static void PredictorAdd12_AVX2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* WEBP_RESTRICT out) {
  int i;
  const __m256i zero = _mm256_setzero_si256();
  const __m256i L8 = _mm256_setr_epi32((int)out[-1], 0, 0, 0, 0, 0, 0, 0);
  __m256i L = _mm256_unpacklo_epi8(L8, zero);
  for (i = 0; i + 8 <= num_pixels; i += 8) {
    // Load 8 pixels at a time.
    __m256i src = _mm256_loadu_si256((const __m256i*)&in[i]);
    const __m256i T = _mm256_loadu_si256((const __m256i*)&upper[i]);
    const __m256i T_lo = _mm256_unpacklo_epi8(T, zero);
    const __m256i T_hi = _mm256_unpackhi_epi8(T, zero);
    const __m256i TL = _mm256_loadu_si256((const __m256i*)&upper[i - 1]);
    const __m256i TL_lo = _mm256_unpacklo_epi8(TL, zero);
    const __m256i TL_hi = _mm256_unpackhi_epi8(TL, zero);
    __m256i diff_lo = _mm256_sub_epi16(T_lo, TL_lo);
    __m256i diff_hi = _mm256_sub_epi16(T_hi, TL_hi);
    const __m256i diff_lo_bak = diff_lo;
    const __m256i diff_hi_bak = diff_hi;
    const __m256i src_bak = src;
    DO_PRED12(diff_lo, 0);
    DO_PRED12_SHIFT(diff_lo, 0);
    DO_PRED12(diff_lo, 1);
    DO_PRED12_SHIFT(diff_lo, 0);
    DO_PRED12(diff_hi, 2);
    DO_PRED12_SHIFT(diff_hi, 0);
    DO_PRED12(diff_hi, 3);
    DO_PRED12_SHIFT(diff_hi, 0);

    // Process the upper lane.
    diff_lo = _mm256_permute2x128_si256(diff_lo_bak, diff_lo_bak, 1);
    diff_hi = _mm256_permute2x128_si256(diff_hi_bak, diff_hi_bak, 1);
    src = _mm256_permute2x128_si256(src_bak, src_bak, 1);

    DO_PRED12(diff_lo, 4);
    DO_PRED12_SHIFT(diff_lo, 0);
    DO_PRED12(diff_lo, 5);
    DO_PRED12_SHIFT(diff_lo, 1);
    DO_PRED12(diff_hi, 6);
    DO_PRED12_SHIFT(diff_hi, 0);
    DO_PRED12(diff_hi, 7);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_SSE[12](in + i, upper + i, num_pixels - i, out + i);
  }
}
#undef DO_PRED12
#undef DO_PRED12_SHIFT

// Due to averages with integers, values cannot be accumulated in parallel for
// predictor 13.

//------------------------------------------------------------------------------
// Subtract-Green Transform

static void AddGreenToBlueAndRed_AVX2(const uint32_t* const src, int num_pixels,
                                      uint32_t* dst) {
  int i;
  const __m256i kCstShuffle = _mm256_set_epi8(
      -1, 29, -1, 29, -1, 25, -1, 25, -1, 21, -1, 21, -1, 17, -1, 17, -1, 13,
      -1, 13, -1, 9, -1, 9, -1, 5, -1, 5, -1, 1, -1, 1);
  for (i = 0; i + 8 <= num_pixels; i += 8) {
    const __m256i in = _mm256_loadu_si256((const __m256i*)&src[i]);  // argb
    const __m256i in_0g0g = _mm256_shuffle_epi8(in, kCstShuffle);    // 0g0g
    const __m256i out = _mm256_add_epi8(in, in_0g0g);
    _mm256_storeu_si256((__m256i*)&dst[i], out);
  }
  // Fall through and finish off with SSE.
  if (i != num_pixels) {
    VP8LAddGreenToBlueAndRed_SSE(src + i, num_pixels - i, dst + i);
  }
}
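
// Editorial note (illustrative, not part of the upstream code): the
// subtract-green transform stores red and blue as offsets from green, so the
// inverse just adds green back to both channels, mod 256. kCstShuffle copies
// each pixel's green byte into the red and blue byte positions (the -1
// indices produce zeros in the alpha and green positions), so a single
// _mm256_add_epi8 restores 8 pixels at once. Scalar equivalent for one ARGB
// pixel:
//
//   const uint32_t argb = src[i];
//   const uint32_t green = (argb >> 8) & 0xff;
//   const uint32_t red = (((argb >> 16) & 0xff) + green) & 0xff;
//   const uint32_t blue = ((argb & 0xff) + green) & 0xff;
//   dst[i] = (argb & 0xff00ff00u) | (red << 16) | blue;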

//------------------------------------------------------------------------------
// Color Transform

static void TransformColorInverse_AVX2(const VP8LMultipliers* const m,
                                       const uint32_t* const src,
                                       int num_pixels, uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 5.
#define CST(X) (((int16_t)(m->X << 8)) >> 5)  // sign-extend
  const __m256i mults_rb =
      _mm256_set1_epi32((int)((uint32_t)CST(green_to_red) << 16 |
                              (CST(green_to_blue) & 0xffff)));
  const __m256i mults_b2 = _mm256_set1_epi32(CST(red_to_blue));
#undef CST
  const __m256i mask_ag = _mm256_set1_epi32((int)0xff00ff00);
  const __m256i perm1 = _mm256_setr_epi8(
      -1, 1, -1, 1, -1, 5, -1, 5, -1, 9, -1, 9, -1, 13, -1, 13, -1, 17, -1, 17,
      -1, 21, -1, 21, -1, 25, -1, 25, -1, 29, -1, 29);
  const __m256i perm2 = _mm256_setr_epi8(
      -1, 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1, 18, -1,
      -1, -1, 22, -1, -1, -1, 26, -1, -1, -1, 30, -1, -1);
  int i;
  for (i = 0; i + 8 <= num_pixels; i += 8) {
    const __m256i A = _mm256_loadu_si256((const __m256i*)(src + i));
    const __m256i B = _mm256_shuffle_epi8(A, perm1);  // argb -> g0g0
    const __m256i C = _mm256_mulhi_epi16(B, mults_rb);
    const __m256i D = _mm256_add_epi8(A, C);
    const __m256i E = _mm256_shuffle_epi8(D, perm2);
    const __m256i F = _mm256_mulhi_epi16(E, mults_b2);
    const __m256i G = _mm256_add_epi8(D, F);
    const __m256i out = _mm256_blendv_epi8(G, A, mask_ag);
    _mm256_storeu_si256((__m256i*)&dst[i], out);
  }
  // Fall back to the SSE version for the left-overs.
  if (i != num_pixels) {
    VP8LTransformColorInverse_SSE(m, src + i, num_pixels - i, dst + i);
  }
}

//------------------------------------------------------------------------------
// Color-space conversion functions

static void ConvertBGRAToRGBA_AVX2(const uint32_t* WEBP_RESTRICT src,
                                   int num_pixels, uint8_t* WEBP_RESTRICT dst) {
  const __m256i* in = (const __m256i*)src;
  __m256i* out = (__m256i*)dst;
  while (num_pixels >= 8) {
    const __m256i A = _mm256_loadu_si256(in++);
    const __m256i B = _mm256_shuffle_epi8(
        A,
        _mm256_set_epi8(15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2,
                        15, 12, 13, 14, 11, 8, 9, 10, 7, 4, 5, 6, 3, 0, 1, 2));
    _mm256_storeu_si256(out++, B);
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGBA_SSE((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}

//------------------------------------------------------------------------------
// Entry point

extern void VP8LDspInitAVX2(void);

WEBP_TSAN_IGNORE_FUNCTION void VP8LDspInitAVX2(void) {
  VP8LPredictorsAdd[0] = PredictorAdd0_AVX2;
  VP8LPredictorsAdd[1] = PredictorAdd1_AVX2;
  VP8LPredictorsAdd[2] = PredictorAdd2_AVX2;
  VP8LPredictorsAdd[3] = PredictorAdd3_AVX2;
  VP8LPredictorsAdd[4] = PredictorAdd4_AVX2;
  VP8LPredictorsAdd[8] = PredictorAdd8_AVX2;
  VP8LPredictorsAdd[9] = PredictorAdd9_AVX2;
  VP8LPredictorsAdd[10] = PredictorAdd10_AVX2;
  VP8LPredictorsAdd[11] = PredictorAdd11_AVX2;
  VP8LPredictorsAdd[12] = PredictorAdd12_AVX2;

  VP8LAddGreenToBlueAndRed = AddGreenToBlueAndRed_AVX2;
  VP8LTransformColorInverse = TransformColorInverse_AVX2;
  VP8LConvertBGRAToRGBA = ConvertBGRAToRGBA_AVX2;
}

#else  // !WEBP_USE_AVX2

WEBP_DSP_INIT_STUB(VP8LDspInitAVX2)

#endif  // WEBP_USE_AVX2
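
// Editorial note (illustrative, not part of the upstream code): this file only
// defines the AVX2 entry point. The generic VP8LDspInit() in
// src/dsp/lossless.c installs the portable and SSE variants first and then,
// roughly as sketched below, overrides the pointers when the CPU reports AVX2
// support:
//
//   if (VP8GetCPUInfo != NULL && VP8GetCPUInfo(kAVX2)) {
//     VP8LDspInitAVX2();
//   }
//
// That ordering is why the leftover-pixel paths above can safely call the
// VP8L*_SSE fall-backs: those pointers are populated before this init runs.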