Path: blob/master/Common/Data/Convert/SmallDataConvert.h
#pragma once

#include <cstdint>
#include <cstring>
#include <cmath>

#include "Common/Common.h"
#include "ppsspp_config.h"
#include "Common/Math/SIMDHeaders.h"

extern const float one_over_255_x4[4];
extern const float exactly_255_x4[4];

// Utilities useful for filling in std140-layout uniform buffers, and similar.
// NEON intrinsics: https://developer.arm.com/documentation/den0018/a/NEON-Intrinsics?lang=en

// LSBs in f[0], etc.
inline void Uint8x4ToFloat4(float f[4], uint32_t u) {
#ifdef _M_SSE
	__m128i zero = _mm_setzero_si128();
	__m128i value = _mm_set1_epi32(u);
	__m128i value32 = _mm_unpacklo_epi16(_mm_unpacklo_epi8(value, zero), zero);
	__m128 fvalues = _mm_mul_ps(_mm_cvtepi32_ps(value32), _mm_load_ps(one_over_255_x4));
	_mm_storeu_ps(f, fvalues);
#elif PPSSPP_ARCH(ARM_NEON)
	const uint8x8_t value = (uint8x8_t)vdup_n_u32(u);
	const uint16x8_t value16 = vmovl_u8(value);
	const uint32x4_t value32 = vmovl_u16(vget_low_u16(value16));
	const float32x4_t valueFloat = vmulq_f32(vcvtq_f32_u32(value32), vdupq_n_f32(1.0f / 255.0f));
	vst1q_f32(f, valueFloat);
#else
	f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);
	f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f);
	f[2] = ((u >> 16) & 0xFF) * (1.0f / 255.0f);
	f[3] = ((u >> 24) & 0xFF) * (1.0f / 255.0f);
#endif
}

inline uint32_t Float4ToUint8x4(const float f[4]) {
#ifdef _M_SSE
	__m128i zero = _mm_setzero_si128();
	__m128 value = _mm_mul_ps(_mm_loadu_ps(f), _mm_load_ps(exactly_255_x4));
	__m128i ivalue = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(value), zero), zero);
	return _mm_cvtsi128_si32(ivalue);
#elif PPSSPP_ARCH(ARM_NEON)
	const float32x4_t value = vmulq_f32(vld1q_f32(f), vdupq_n_f32(255.0f));
	uint32x4_t ivalue32 = vcvtq_u32_f32(value);
	uint16x4_t ivalue16 = vqmovn_u32(ivalue32);
	uint8x8_t ivalue8 = vqmovn_u16(vcombine_u16(ivalue16, ivalue16));  // Is there no way to avoid the combine here?
	uint32x2_t outValue32 = vreinterpret_u32_u8(ivalue8);
	return vget_lane_u32(outValue32, 0);
#else
	int i4[4];
	for (int i = 0; i < 4; i++) {
		if (f[i] > 1.0f) {
			i4[i] = 255;
		} else if (f[i] < 0.0f) {
			i4[i] = 0;
		} else {
			i4[i] = (int)(f[i] * 255.0f);
		}
	}
	return i4[0] | (i4[1] << 8) | (i4[2] << 16) | (i4[3] << 24);
#endif
}

inline uint32_t Float4ToUint8x4_NoClamp(const float f[4]) {
#ifdef _M_SSE
	// Does actually clamp, no way to avoid it with the pack ops!
	__m128i zero = _mm_setzero_si128();
	__m128 value = _mm_mul_ps(_mm_loadu_ps(f), _mm_load_ps(exactly_255_x4));
	__m128i ivalue = _mm_packus_epi16(_mm_packs_epi32(_mm_cvtps_epi32(value), zero), zero);
	return _mm_cvtsi128_si32(ivalue);
#elif PPSSPP_ARCH(ARM_NEON)
	const float32x4_t value = vmulq_f32(vld1q_f32(f), vdupq_n_f32(255.0f));
	uint32x4_t ivalue32 = vcvtq_u32_f32(value);
	uint16x4_t ivalue16 = vqmovn_u32(ivalue32);
	uint8x8_t ivalue8 = vqmovn_u16(vcombine_u16(ivalue16, ivalue16));  // Is there no way to avoid the combine here?
	uint32x2_t outValue32 = vreinterpret_u32_u8(ivalue8);
	return vget_lane_u32(outValue32, 0);
#else
	uint32_t i4[4];
	for (int i = 0; i < 4; i++) {
		i4[i] = (int)(f[i] * 255.0f);
	}
	return i4[0] | (i4[1] << 8) | (i4[2] << 16) | (i4[3] << 24);
#endif
}
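
// A minimal usage sketch (illustrative only, not part of this header): unpacking a
// packed color into floats for a uniform buffer, then packing it back. Note that the
// SSE path rounds to nearest, while the NEON and scalar paths truncate toward zero,
// so an exact round trip is not guaranteed for every value.
//
//   float f[4];
//   Uint8x4ToFloat4(f, 0xFF8040C0u);  // f = { 0xC0, 0x40, 0x80, 0xFF }, each scaled by 1/255
//   uint32_t u = Float4ToUint8x4(f);  // repacks, clamping anything outside [0.0, 1.0]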

inline void Uint8x3ToFloat4_AlphaUint8(float f[4], uint32_t u, uint8_t alpha) {
#if defined(_M_SSE) || PPSSPP_ARCH(ARM_NEON)
	Uint8x4ToFloat4(f, (u & 0xFFFFFF) | ((uint32_t)alpha << 24));
#else
	f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);
	f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f);
	f[2] = ((u >> 16) & 0xFF) * (1.0f / 255.0f);
	f[3] = alpha * (1.0f / 255.0f);
#endif
}

inline void Uint8x3ToFloat4(float f[4], uint32_t u) {
#if defined(_M_SSE) || PPSSPP_ARCH(ARM_NEON)
	Uint8x4ToFloat4(f, u & 0xFFFFFF);
#else
	f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);
	f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f);
	f[2] = ((u >> 16) & 0xFF) * (1.0f / 255.0f);
	f[3] = 0.0f;  // Matches the SIMD path, which masks out the top byte.
#endif
}

inline void Uint8x3ToFloat3(float f[4], uint32_t u) {
#if defined(_M_SSE) || PPSSPP_ARCH(ARM_NEON)
	float temp[4];
	Uint8x4ToFloat4(temp, u & 0xFFFFFF);
	f[0] = temp[0];
	f[1] = temp[1];
	f[2] = temp[2];
#else
	f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);
	f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f);
	f[2] = ((u >> 16) & 0xFF) * (1.0f / 255.0f);
#endif
}

inline void Uint8x3ToInt4(int i[4], uint32_t u) {
	i[0] = ((u >> 0) & 0xFF);
	i[1] = ((u >> 8) & 0xFF);
	i[2] = ((u >> 16) & 0xFF);
	i[3] = 0;
}

inline void Uint8x3ToInt4_Alpha(int i[4], uint32_t u, uint8_t alpha) {
	i[0] = ((u >> 0) & 0xFF);
	i[1] = ((u >> 8) & 0xFF);
	i[2] = ((u >> 16) & 0xFF);
	i[3] = alpha;
}

inline void Uint8x3ToFloat4_Alpha(float f[4], uint32_t u, float alpha) {
	f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);
	f[1] = ((u >> 8) & 0xFF) * (1.0f / 255.0f);
	f[2] = ((u >> 16) & 0xFF) * (1.0f / 255.0f);
	f[3] = alpha;
}

inline void Uint8x1ToFloat4(float f[4], uint32_t u) {
	f[0] = ((u >> 0) & 0xFF) * (1.0f / 255.0f);
	f[1] = 0.0f;
	f[2] = 0.0f;
	f[3] = 0.0f;
}

// These are just for readability.

inline void CopyFloat2(float dest[2], const float src[2]) {
	dest[0] = src[0];
	dest[1] = src[1];
}

inline void CopyFloat3(float dest[3], const float src[3]) {
	dest[0] = src[0];
	dest[1] = src[1];
	dest[2] = src[2];
}

inline void CopyFloat4(float dest[4], const float src[4]) {
#ifdef _M_SSE
	_mm_storeu_ps(dest, _mm_loadu_ps(src));
#else
	dest[0] = src[0];
	dest[1] = src[1];
	dest[2] = src[2];
	dest[3] = src[3];
#endif
}

inline void CopyFloat1To4(float dest[4], const float src) {
#ifdef _M_SSE
	_mm_storeu_ps(dest, _mm_set_ss(src));
#else
	dest[0] = src;
	dest[1] = 0.0f;
	dest[2] = 0.0f;
	dest[3] = 0.0f;
#endif
}

inline void CopyFloat2To4(float dest[4], const float src[2]) {
	dest[0] = src[0];
	dest[1] = src[1];
	dest[2] = 0.0f;
	dest[3] = 0.0f;
}

inline void CopyFloat3To4(float dest[4], const float src[3]) {
	dest[0] = src[0];
	dest[1] = src[1];
	dest[2] = src[2];
	dest[3] = 0.0f;
}

inline void CopyMatrix4x4(float dest[16], const float src[16]) {
	memcpy(dest, src, sizeof(float) * 16);
}

// Note: the SIMD paths read 16 bytes, i.e. one uint32_t past src's three declared elements.
inline void ExpandFloat24x3ToFloat4(float dest[4], const uint32_t src[3]) {
#ifdef _M_SSE
	__m128i values = _mm_slli_epi32(_mm_loadu_si128((const __m128i *)src), 8);
	_mm_storeu_si128((__m128i *)dest, values);
#elif PPSSPP_ARCH(ARM_NEON)
	const uint32x4_t values = vshlq_n_u32(vld1q_u32(src), 8);
	vst1q_u32((uint32_t *)dest, values);
#else
	uint32_t temp[4] = { src[0] << 8, src[1] << 8, src[2] << 8, 0 };
	memcpy(dest, temp, sizeof(float) * 4);
#endif
}
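
// Illustrative sketch of the expansion above: the PSP stores each float as its top
// 24 bits, so shifting left by 8 rebuilds a normal IEEE 754 float whose low 8
// mantissa bits are zero. The fourth (padding) element below is an assumption added
// so the example stays safe with the 16-byte SIMD loads:
//
//   uint32_t f24[4] = { 0x3F8000, 0x400000, 0xBF8000, 0 };  // 1.0f, 2.0f, -1.0f as float24
//   float f[4];
//   ExpandFloat24x3ToFloat4(f, f24);  // f = { 1.0f, 2.0f, -1.0f, 0.0f }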

// Note: If the length is 0.0, it's left as 0.0 instead of being normalized. This is important.
inline void ExpandFloat24x3ToFloat4AndNormalize(float dest[4], const uint32_t src[3]) {
	float temp[4];
	ExpandFloat24x3ToFloat4(temp, src);
	// TODO: Reuse code from NormalizedOr001 and optimize
	float x = temp[0];
	float y = temp[1];
	float z = temp[2];
	float len = sqrtf(x * x + y * y + z * z);
	if (len != 0.0f)
		len = 1.0f / len;
	dest[0] = x * len;
	dest[1] = y * len;
	dest[2] = z * len;
	dest[3] = 0.0f;
}

inline uint32_t BytesToUint32(uint8_t a, uint8_t b, uint8_t c, uint8_t d) {
	return a | (b << 8) | (c << 16) | ((uint32_t)d << 24);
}

constexpr int32_t SignExtend8ToS32(uint32_t value) {
	// Extends the sign bit (bit 7) into the upper 24 bits.
	return (int8_t)(value & 0xFF);
}

constexpr uint32_t SignExtend8ToU32(uint32_t value) {
	// Just treat the bits as unsigned.
	return (uint32_t)SignExtend8ToS32(value);
}

constexpr int32_t SignExtend16ToS32(uint32_t value) {
	// Same as SignExtend8ToS32, but from bit 15.
	return (int16_t)(value & 0xFFFF);
}

constexpr uint32_t SignExtend16ToU32(uint32_t value) {
	return (uint32_t)SignExtend16ToS32(value);
}
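
// A few illustrative compile-time checks (a sketch, not part of the original API):
// since the sign-extension helpers are constexpr, the expected bit patterns can be
// verified with static_assert.
static_assert(SignExtend8ToS32(0x7F) == 127, "positive values pass through unchanged");
static_assert(SignExtend8ToS32(0x80) == -128, "bit 7 is propagated into the upper bits");
static_assert(SignExtend16ToU32(0x8000) == 0xFFFF8000u, "the same bits, viewed as unsigned");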