Path: blob/master/Common/Data/Convert/SmallDataConvert.h
#pragma once

#include <cstdint>
#include <cstring>
#include <cmath>

#include "Common/Common.h"
#include "ppsspp_config.h"
#include "Common/Math/SIMDHeaders.h"

extern const float one_over_255_x4[4];
extern const float exactly_255_x4[4];

// Utilities useful for filling in std140-layout uniform buffers, and similar.
// NEON intrinsics: https://developer.arm.com/documentation/den0018/a/NEON-Intrinsics?lang=en

// LSBs in f[0], etc.
inline void Uint8x4ToFloat4(float f[4], uint32_t u) {
#ifdef _M_SSE
	__m128i zero = _mm_setzero_si128();
	__m128i value = _mm_set1_epi32(u);
	__m128i value32 = _mm_unpacklo_epi16(_mm_unpacklo_epi8(value, zero), zero);
	__m128 fvalues = _mm_mul_ps(_mm_cvtepi32_ps(value32), _mm_load_ps(one_over_255_x4));
	_mm_storeu_ps(f, fvalues);
#elif PPSSPP_ARCH(ARM_NEON)
	const uint8x8_t value = (uint8x8_t)vdup_n_u32(u);
	const uint16x8_t value16 = vmovl_u8(value);
	const uint32x4_t value32 = vmovl_u16(vget_low_u16(value16));
	const float32x4_t valueFloat = vmulq_f32(vcvtq_f32_u32(value32), vdupq_n_f32(1.0f / 255.0f));
	vst1q_f32(f, valueFloat);
#else
	f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);
	f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f);
	f[2] = ((u >> 16) & 0xFF) * (1.0f / 255.0f);
	f[3] = ((u >> 24) & 0xFF) * (1.0f / 255.0f);
#endif
}

inline uint32_t Float4ToUint8x4(const float f[4]) {
#ifdef _M_SSE
	__m128i zero = _mm_setzero_si128();
	__m128 value = _mm_mul_ps(_mm_loadu_ps(f), _mm_load_ps(exactly_255_x4));
	__m128i ivalue = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(value), zero), zero);
	return _mm_cvtsi128_si32(ivalue);
#elif PPSSPP_ARCH(ARM_NEON)
	const float32x4_t value = vmulq_f32(vld1q_f32(f), vdupq_n_f32(255.0f));
	uint32x4_t ivalue32 = vcvtq_u32_f32(value);
	uint16x4_t ivalue16 = vqmovn_u32(ivalue32);
	uint8x8_t ivalue8 = vqmovn_u16(vcombine_u16(ivalue16, ivalue16));  // Is there no way to avoid the combine here?
	uint32x2_t outValue32 = vreinterpret_u32_u8(ivalue8);
	return vget_lane_u32(outValue32, 0);
#else
	int i4[4];
	for (int i = 0; i < 4; i++) {
		if (f[i] > 1.0f) {
			i4[i] = 255;
		} else if (f[i] < 0.0f) {
			i4[i] = 0;
		} else {
			i4[i] = (int)(f[i] * 255.0f);
		}
	}
	return i4[0] | (i4[1] << 8) | (i4[2] << 16) | (i4[3] << 24);
#endif
}

inline uint32_t Float4ToUint8x4_NoClamp(const float f[4]) {
#ifdef _M_SSE
	// Does actually clamp, no way to avoid it with the pack ops!
	__m128i zero = _mm_setzero_si128();
	__m128 value = _mm_mul_ps(_mm_loadu_ps(f), _mm_load_ps(exactly_255_x4));
	__m128i ivalue = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(value), zero), zero);
	return _mm_cvtsi128_si32(ivalue);
#elif PPSSPP_ARCH(ARM_NEON)
	const float32x4_t value = vmulq_f32(vld1q_f32(f), vdupq_n_f32(255.0f));
	uint32x4_t ivalue32 = vcvtq_u32_f32(value);
	uint16x4_t ivalue16 = vqmovn_u32(ivalue32);
	uint8x8_t ivalue8 = vqmovn_u16(vcombine_u16(ivalue16, ivalue16));  // Is there no way to avoid the combine here?
	uint32x2_t outValue32 = vreinterpret_u32_u8(ivalue8);
	return vget_lane_u32(outValue32, 0);
#else
	uint32_t i4[4];
	for (int i = 0; i < 4; i++) {
		i4[i] = (int)(f[i] * 255.0f);
	}
	return i4[0] | (i4[1] << 8) | (i4[2] << 16) | (i4[3] << 24);
#endif
}
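
// A minimal usage sketch (illustrative only, not part of this header): unpacking a
// packed color into floats for a uniform buffer, then packing it back. Note that the
// SSE path rounds to nearest, while the NEON and scalar paths truncate toward zero,
// so an exact round trip is not guaranteed for every value.
//
//   float f[4];
//   Uint8x4ToFloat4(f, 0xFF8040C0u);  // f = { 0xC0, 0x40, 0x80, 0xFF }, each scaled by 1/255
//   uint32_t u = Float4ToUint8x4(f);  // repacks, clamping anything outside [0.0, 1.0]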

inline void Uint8x3ToFloat4_AlphaUint8(float f[4], uint32_t u, uint8_t alpha) {
#if defined(_M_SSE) || PPSSPP_ARCH(ARM_NEON)
	Uint8x4ToFloat4(f, (u & 0xFFFFFF) | ((uint32_t)alpha << 24));
#else
	f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);
	f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f);
	f[2] = ((u >> 16) & 0xFF) * (1.0f / 255.0f);
	f[3] = alpha * (1.0f / 255.0f);
#endif
}

inline void Uint8x3ToFloat4(float f[4], uint32_t u) {
#if defined(_M_SSE) || PPSSPP_ARCH(ARM_NEON)
	Uint8x4ToFloat4(f, u & 0xFFFFFF);
#else
	f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);
	f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f);
	f[2] = ((u >> 16) & 0xFF) * (1.0f / 255.0f);
	f[3] = 0.0f;  // Matches the SIMD path, which masks out the top byte.
#endif
}

inline void Uint8x3ToFloat3(float f[4], uint32_t u) {
#if defined(_M_SSE) || PPSSPP_ARCH(ARM_NEON)
	float temp[4];
	Uint8x4ToFloat4(temp, u & 0xFFFFFF);
	f[0] = temp[0];
	f[1] = temp[1];
	f[2] = temp[2];
#else
	f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);
	f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f);
	f[2] = ((u >> 16) & 0xFF) * (1.0f / 255.0f);
#endif
}

inline void Uint8x3ToInt4(int i[4], uint32_t u) {
	i[0] = ((u >> 0) & 0xFF);
	i[1] = ((u >> 8) & 0xFF);
	i[2] = ((u >> 16) & 0xFF);
	i[3] = 0;
}

inline void Uint8x3ToInt4_Alpha(int i[4], uint32_t u, uint8_t alpha) {
	i[0] = ((u >> 0) & 0xFF);
	i[1] = ((u >> 8) & 0xFF);
	i[2] = ((u >> 16) & 0xFF);
	i[3] = alpha;
}

inline void Uint8x3ToFloat4_Alpha(float f[4], uint32_t u, float alpha) {
	f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);
	f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f);
	f[2] = ((u >> 16) & 0xFF) * (1.0f / 255.0f);
	f[3] = alpha;
}

inline void Uint8x1ToFloat4(float f[4], uint32_t u) {
	f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);
	f[1] = 0.0f;
	f[2] = 0.0f;
	f[3] = 0.0f;
}

// These are just for readability.

inline void CopyFloat2(float dest[2], const float src[2]) {
	dest[0] = src[0];
	dest[1] = src[1];
}

inline void CopyFloat3(float dest[3], const float src[3]) {
	dest[0] = src[0];
	dest[1] = src[1];
	dest[2] = src[2];
}

inline void CopyFloat4(float dest[4], const float src[4]) {
#ifdef _M_SSE
	_mm_storeu_ps(dest, _mm_loadu_ps(src));
#else
	dest[0] = src[0];
	dest[1] = src[1];
	dest[2] = src[2];
	dest[3] = src[3];
#endif
}

inline void CopyFloat1To4(float dest[4], const float src) {
#ifdef _M_SSE
	_mm_storeu_ps(dest, _mm_set_ss(src));
#else
	dest[0] = src;
	dest[1] = 0.0f;
	dest[2] = 0.0f;
	dest[3] = 0.0f;
#endif
}

inline void CopyFloat2To4(float dest[4], const float src[2]) {
	dest[0] = src[0];
	dest[1] = src[1];
	dest[2] = 0.0f;
	dest[3] = 0.0f;
}

inline void CopyFloat3To4(float dest[4], const float src[3]) {
	dest[0] = src[0];
	dest[1] = src[1];
	dest[2] = src[2];
	dest[3] = 0.0f;
}

inline void CopyMatrix4x4(float dest[16], const float src[16]) {
	memcpy(dest, src, sizeof(float) * 16);
}

// Note: the SIMD paths read 16 bytes, i.e. one uint32_t past src's three declared elements.
inline void ExpandFloat24x3ToFloat4(float dest[4], const uint32_t src[3]) {
#ifdef _M_SSE
	__m128i values = _mm_slli_epi32(_mm_loadu_si128((const __m128i *)src), 8);
	_mm_storeu_si128((__m128i *)dest, values);
#elif PPSSPP_ARCH(ARM_NEON)
	const uint32x4_t values = vshlq_n_u32(vld1q_u32(src), 8);
	vst1q_u32((uint32_t *)dest, values);
#else
	uint32_t temp[4] = { src[0] << 8, src[1] << 8, src[2] << 8, 0 };
	memcpy(dest, temp, sizeof(float) * 4);
#endif
}
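
// Illustrative sketch of the expansion above: the PSP stores each float as its top
// 24 bits, so shifting left by 8 rebuilds a normal IEEE 754 float whose low 8
// mantissa bits are zero. The fourth (padding) element below is an assumption added
// so the example stays safe with the 16-byte SIMD loads:
//
//   uint32_t f24[4] = { 0x3F8000, 0x400000, 0xBF8000, 0 };  // 1.0f, 2.0f, -1.0f as float24
//   float f[4];
//   ExpandFloat24x3ToFloat4(f, f24);  // f = { 1.0f, 2.0f, -1.0f, 0.0f }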

// Note: If the length is 0.0, it's left as 0.0 instead of being normalized. This is important.
inline void ExpandFloat24x3ToFloat4AndNormalize(float dest[4], const uint32_t src[3]) {
	float temp[4];
	ExpandFloat24x3ToFloat4(temp, src);
	// TODO: Reuse code from NormalizedOr001 and optimize
	float x = temp[0];
	float y = temp[1];
	float z = temp[2];
	float len = sqrtf(x * x + y * y + z * z);
	if (len != 0.0f)
		len = 1.0f / len;
	dest[0] = x * len;
	dest[1] = y * len;
	dest[2] = z * len;
	dest[3] = 0.0f;
}

inline uint32_t BytesToUint32(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
	return a | (b << 8) | (c << 16) | ((uint32_t)d << 24);
}

constexpr int32_t SignExtend8ToS32(uint32_t value) {
	// Extends the sign bit (bit 7) into the upper 24 bits.
	return (int8_t)(value & 0xFF);
}

constexpr uint32_t SignExtend8ToU32(uint32_t value) {
	// Just treat the bits as unsigned.
	return (uint32_t)SignExtend8ToS32(value);
}

constexpr int32_t SignExtend16ToS32(uint32_t value) {
	// Same as SignExtend8ToS32, but from bit 15.
	return (int16_t)(value & 0xFFFF);
}

constexpr uint32_t SignExtend16ToU32(uint32_t value) {
	return (uint32_t)SignExtend16ToS32(value);
}
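
// A few illustrative compile-time checks (a sketch, not part of the original API):
// since the sign-extension helpers are constexpr, the expected bit patterns can be
// verified with static_assert.
static_assert(SignExtend8ToS32(0x7F) == 127, "positive values pass through unchanged");
static_assert(SignExtend8ToS32(0x80) == -128, "bit 7 is propagated into the upper bits");
static_assert(SignExtend16ToU32(0x8000) == 0xFFFF8000u, "the same bits, viewed as unsigned");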