GitHub Repository: hrydgard/ppsspp
Path: blob/master/Common/Math/CrossSIMD.h
// CrossSIMD
//
// This file will contain cross-instruction-set SIMD instruction wrappers.
//
// This specific file (and a future CrossSIMD.cpp) is in the public domain or under the MIT license, unlike most of the rest of the emulator.

#pragma once

#include "Common/Math/SIMDHeaders.h"

#define TEST_FALLBACK 0

#if PPSSPP_ARCH(SSE2) && !TEST_FALLBACK

// The point of this, as opposed to a float4 array, is to almost force the compiler
// to keep the matrix in registers, rather than loading on every access.
struct Mat4F32 {
	Mat4F32() {}
	Mat4F32(const float *matrix) {
		col0 = _mm_loadu_ps(matrix);
		col1 = _mm_loadu_ps(matrix + 4);
		col2 = _mm_loadu_ps(matrix + 8);
		col3 = _mm_loadu_ps(matrix + 12);
	}
	void Store(float *m) {
		_mm_storeu_ps(m, col0);
		_mm_storeu_ps(m + 4, col1);
		_mm_storeu_ps(m + 8, col2);
		_mm_storeu_ps(m + 12, col3);
	}

	// Unlike the old one, this one is careful about not loading out-of-range data.
	// The last two loads overlap.
	static Mat4F32 Load4x3(const float *m) {
		Mat4F32 result;
		alignas(16) static const uint32_t mask[4] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0 };
		alignas(16) static const float onelane3[4] = { 0.0f, 0.0f, 0.0f, 1.0f };
		__m128 mask1110 = _mm_loadu_ps((const float *)mask);
		result.col0 = _mm_and_ps(_mm_loadu_ps(m), mask1110);
		result.col1 = _mm_and_ps(_mm_loadu_ps(m + 3), mask1110);
		result.col2 = _mm_and_ps(_mm_loadu_ps(m + 6), mask1110);
		__m128 lastCol = _mm_loadu_ps(m + 8);
		result.col3 = _mm_or_ps(_mm_and_ps(_mm_shuffle_ps(lastCol, lastCol, _MM_SHUFFLE(3, 3, 2, 1)), mask1110), _mm_load_ps(onelane3));
		return result;
	}

	__m128 col0;
	__m128 col1;
	__m128 col2;
	__m128 col3;
};
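
// Note on Load4x3's access pattern: the 12 input floats are the matrix columns packed
// tightly (m[0..2] = col0, m[3..5] = col1, m[6..8] = col2, m[9..11] = col3). The loads at
// m + 3 and m + 6 each read one extra float that mask1110 clears, and the final load
// starts at m + 8 so that it overlaps the previous one instead of reading past m[11];
// the shuffle then moves m[9..11] into lanes 0-2 before the 1.0f is OR'd into lane 3.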

// The columns are spread out between the data*. This is just intermediate storage for multiplication.
struct Mat4x3F32 {
	Mat4x3F32(const float *matrix) {
		data0 = _mm_loadu_ps(matrix);
		data1 = _mm_loadu_ps(matrix + 4);
		data2 = _mm_loadu_ps(matrix + 8);
	}

	__m128 data0;
	__m128 data1;
	__m128 data2;
};

inline Mat4F32 Mul4x4By4x4(Mat4F32 a, Mat4F32 b) {
	Mat4F32 result;

	__m128 r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.col0, 0));
	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.col0, 1)));
	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.col0, 2)));
	result.col0 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_splat_lane_ps(a.col0, 3)));

	r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.col1, 0));
	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.col1, 1)));
	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.col1, 2)));
	result.col1 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_splat_lane_ps(a.col1, 3)));

	r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.col2, 0));
	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.col2, 1)));
	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.col2, 2)));
	result.col2 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_splat_lane_ps(a.col2, 3)));

	r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.col3, 0));
	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.col3, 1)));
	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.col3, 2)));
	result.col3 = _mm_add_ps(r_col, _mm_mul_ps(b.col3, _mm_splat_lane_ps(a.col3, 3)));

	return result;
}

inline Mat4F32 Mul4x3By4x4(Mat4x3F32 a, Mat4F32 b) {
	Mat4F32 result;

	__m128 r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.data0, 0));
	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.data0, 1)));
	result.col0 = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.data0, 2)));

	r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.data0, 3));
	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.data1, 0)));
	result.col1 = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.data1, 1)));

	r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.data1, 2));
	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.data1, 3)));
	result.col2 = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.data2, 0)));

	r_col = _mm_mul_ps(b.col0, _mm_splat_lane_ps(a.data2, 1));
	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col1, _mm_splat_lane_ps(a.data2, 2)));
	r_col = _mm_add_ps(r_col, _mm_mul_ps(b.col2, _mm_splat_lane_ps(a.data2, 3)));

	// The last entry has an implied 1.0f.
	result.col3 = _mm_add_ps(r_col, b.col3);
	return result;
}
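
// Note: Mul4x3By4x4 treats the 4x3 matrix as a 4x4 whose last row is (0, 0, 0, 1).
// That is why each of col0..col2 accumulates only three products, and why col3 adds
// b.col3 without scaling: the missing fourth coefficient is the implied 1.0f above.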

struct Vec4S32 {
	__m128i v;

	static Vec4S32 Zero() { return Vec4S32{ _mm_setzero_si128() }; }
	static Vec4S32 Splat(int lane) { return Vec4S32{ _mm_set1_epi32(lane) }; }

	static Vec4S32 Load(const int *src) { return Vec4S32{ _mm_loadu_si128((const __m128i *)src) }; }
	static Vec4S32 LoadAligned(const int *src) { return Vec4S32{ _mm_load_si128((const __m128i *)src) }; }
	void Store(int *dst) { _mm_storeu_si128((__m128i *)dst, v); }
	void Store2(int *dst) { _mm_storel_epi64((__m128i *)dst, v); }
	void StoreAligned(int *dst) { _mm_store_si128((__m128i *)dst, v); }

	Vec4S32 SignBits32ToMask() {
		return Vec4S32{
			_mm_srai_epi32(v, 31)
		};
	}

	// Reads 16 bits from both operands, produces a 32-bit result per lane.
	// On SSE2, much faster than _mm_mullo_epi32_SSE2.
	// On NEON though, it'll read the full 32 bits, so beware.
	// See https://fgiesen.wordpress.com/2016/04/03/sse-mind-the-gap/.
	Vec4S32 Mul16(Vec4S32 other) const {
		// Note that we only need to mask one of the inputs, so we get zeroes - multiplying
		// by zero is zero, so it doesn't matter what the upper halfword of each 32-bit word is
		// in the other register.
		return Vec4S32{ _mm_madd_epi16(v, _mm_and_si128(other.v, _mm_set1_epi32(0x0000FFFF))) };
	}
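
	// Worked example of the masking above (illustrative): per 32-bit lane, _mm_madd_epi16
	// computes lo16(a)*lo16(b) + hi16(a)*hi16(b) as signed 16-bit products. Masking 'other'
	// with 0x0000FFFF forces hi16(b) to zero, so the lane result collapses to
	// lo16(a) * lo16(b), no matter what hi16(a) contains. E.g. a lane of 0xABCD0005
	// multiplied by a lane of 0x00000007 yields 5 * 7 = 35.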

	Vec4S32 SignExtend16() const { return Vec4S32{ _mm_srai_epi32(_mm_slli_epi32(v, 16), 16) }; }
	// NOTE: These can be done in sequence, but when done, you must FixupAfterMinMax to get valid output.
	Vec4S32 Min16(Vec4S32 other) const { return Vec4S32{ _mm_min_epi16(v, other.v) }; }
	Vec4S32 Max16(Vec4S32 other) const { return Vec4S32{ _mm_max_epi16(v, other.v) }; }
	Vec4S32 FixupAfterMinMax() const { return SignExtend16(); }
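
	// Illustrative usage sketch (hypothetical values): clamping to a 16-bit range.
	//   Vec4S32 clamped = value.Max16(minBound).Min16(maxBound).FixupAfterMinMax();
	// The epi16 min/max compare each halfword independently, so the upper halfword of a
	// lane is not guaranteed to be a correct sign extension afterwards; FixupAfterMinMax
	// (SignExtend16 on this backend) restores proper 32-bit lane values.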

	Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ _mm_add_epi32(v, other.v) }; }
	Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ _mm_sub_epi32(v, other.v) }; }
	Vec4S32 operator |(Vec4S32 other) const { return Vec4S32{ _mm_or_si128(v, other.v) }; }
	Vec4S32 operator &(Vec4S32 other) const { return Vec4S32{ _mm_and_si128(v, other.v) }; }
	Vec4S32 operator ^(Vec4S32 other) const { return Vec4S32{ _mm_xor_si128(v, other.v) }; }
	// TODO: andnot
	void operator +=(Vec4S32 other) { v = _mm_add_epi32(v, other.v); }
	void operator -=(Vec4S32 other) { v = _mm_sub_epi32(v, other.v); }
	void operator &=(Vec4S32 other) { v = _mm_and_si128(v, other.v); }
	void operator |=(Vec4S32 other) { v = _mm_or_si128(v, other.v); }
	void operator ^=(Vec4S32 other) { v = _mm_xor_si128(v, other.v); }

	Vec4S32 AndNot(Vec4S32 inverted) const { return Vec4S32{ _mm_andnot_si128(inverted.v, v) }; } // NOTE: with _mm_andnot, the first parameter is inverted, and then and is performed.
	Vec4S32 Mul(Vec4S32 other) const { return *this * other; }

	template<int imm>
	Vec4S32 Shl() const { return Vec4S32{ imm == 0 ? v : _mm_slli_epi32(v, imm) }; }

	// NOTE: May be slow.
	int operator[](size_t index) const { return ((int *)&v)[index]; }

	// NOTE: This uses a CrossSIMD wrapper if we don't compile with SSE4 support, and is thus slow.
	Vec4S32 operator *(Vec4S32 other) const { return Vec4S32{ _mm_mullo_epi32_SSE2(v, other.v) }; } // (ab3,ab2,ab1,ab0)

	Vec4S32 CompareEq(Vec4S32 other) const { return Vec4S32{ _mm_cmpeq_epi32(v, other.v) }; }
	Vec4S32 CompareLt(Vec4S32 other) const { return Vec4S32{ _mm_cmplt_epi32(v, other.v) }; }
	Vec4S32 CompareGt(Vec4S32 other) const { return Vec4S32{ _mm_cmpgt_epi32(v, other.v) }; }
};

inline bool AnyZeroSignBit(Vec4S32 value) {
	return _mm_movemask_ps(_mm_castsi128_ps(value.v)) != 0xF;
}
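
// Note: _mm_movemask_ps packs the four sign bits into bits 0..3, so the test above is true
// whenever at least one lane has its sign bit clear. For example, a compare result of
// { -1, -1, 0, -1 } gives a movemask of 0b1011, which is != 0xF.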

struct Vec4F32 {
	__m128 v;

	static Vec4F32 Zero() { return Vec4F32{ _mm_setzero_ps() }; }
	static Vec4F32 Splat(float lane) { return Vec4F32{ _mm_set1_ps(lane) }; }

	static Vec4F32 Load(const float *src) { return Vec4F32{ _mm_loadu_ps(src) }; }
	static Vec4F32 LoadAligned(const float *src) { return Vec4F32{ _mm_load_ps(src) }; }
	static Vec4F32 LoadS8Norm(const int8_t *src) {
		__m128i value = _mm_set1_epi32(*((uint32_t *)src));
		__m128i value32 = _mm_unpacklo_epi16(_mm_unpacklo_epi8(value, value), value);
		// Sign extension. A bit ugly without SSE4.
		value32 = _mm_srai_epi32(value32, 24);
		return Vec4F32 { _mm_mul_ps(_mm_cvtepi32_ps(value32), _mm_set1_ps(1.0f / 128.0f)) };
	}
	static Vec4F32 LoadS16Norm(const int16_t *src) { // Divides by 32768.0f
		__m128i bits = _mm_castpd_si128(_mm_load_sd((const double *)src));
		// Sign extension. A bit ugly without SSE4.
		bits = _mm_srai_epi32(_mm_unpacklo_epi16(bits, bits), 16);
		return Vec4F32 { _mm_mul_ps(_mm_cvtepi32_ps(bits), _mm_set1_ps(1.0f / 32768.0f)) };
	}
	void Store(float *dst) { _mm_storeu_ps(dst, v); }
	void Store2(float *dst) { _mm_storel_epi64((__m128i *)dst, _mm_castps_si128(v)); }
	void StoreAligned(float *dst) { _mm_store_ps(dst, v); }
	void Store3(float *dst) {
		// TODO: There might be better ways.
		_mm_store_sd((double *)dst, _mm_castps_pd(v));  // Store the low two floats (8 bytes, unaligned-safe); the third is stored below.
		_mm_store_ss(dst + 2, _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)));
	}

	static Vec4F32 LoadConvertS16(const int16_t *src) { // Note: will load 8 bytes
		__m128i value = _mm_loadl_epi64((const __m128i *)src);
		// 16-bit to 32-bit, use the upper words and an arithmetic shift right to sign extend
		return Vec4F32{ _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(value, value), 16)) };
	}

	static Vec4F32 LoadConvertS8(const int8_t *src) { // Note: will load 8 bytes
		__m128i value = _mm_loadl_epi64((const __m128i *)src);
		__m128i value16 = _mm_unpacklo_epi8(value, value);
		// 16-bit to 32-bit, use the upper words and an arithmetic shift right to sign extend
		return Vec4F32{ _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(value16, value16), 24)) };
	}

	static Vec4F32 LoadF24x3_One(const uint32_t *src) {
		alignas(16) static const uint32_t mask[4] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0 };
		alignas(16) static const float onelane3[4] = { 0.0f, 0.0f, 0.0f, 1.0f };

		__m128 value = _mm_castsi128_ps(_mm_slli_epi32(_mm_loadu_si128((const __m128i *)src), 8));
		return Vec4F32{ _mm_or_ps(_mm_and_ps(value, _mm_load_ps((const float *)mask)), _mm_load_ps(onelane3)) };
	}
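
	// Note: the 24-bit values in src are the top 24 bits of an IEEE-754 float with the low
	// 8 mantissa bits dropped, so shifting left by 8 reconstructs the float directly.
	// Example: 0x3F8000 << 8 = 0x3F800000 = 1.0f. Lane 3 is forced to 1.0f by onelane3.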

	static Vec4F32 FromVec4S32(Vec4S32 other) { return Vec4F32{ _mm_cvtepi32_ps(other.v) }; }

	Vec4F32 operator +(Vec4F32 other) const { return Vec4F32{ _mm_add_ps(v, other.v) }; }
	Vec4F32 operator -(Vec4F32 other) const { return Vec4F32{ _mm_sub_ps(v, other.v) }; }
	Vec4F32 operator *(Vec4F32 other) const { return Vec4F32{ _mm_mul_ps(v, other.v) }; }
	Vec4F32 Min(Vec4F32 other) const { return Vec4F32{ _mm_min_ps(v, other.v) }; }
	Vec4F32 Max(Vec4F32 other) const { return Vec4F32{ _mm_max_ps(v, other.v) }; }
	void operator +=(Vec4F32 other) { v = _mm_add_ps(v, other.v); }
	void operator -=(Vec4F32 other) { v = _mm_sub_ps(v, other.v); }
	void operator *=(Vec4F32 other) { v = _mm_mul_ps(v, other.v); }
	void operator /=(Vec4F32 other) { v = _mm_div_ps(v, other.v); }
	void operator &=(Vec4S32 other) { v = _mm_and_ps(v, _mm_castsi128_ps(other.v)); }
	Vec4F32 operator *(float f) const { return Vec4F32{ _mm_mul_ps(v, _mm_set1_ps(f)) }; }
	// NOTE: May be slow.
	float operator[](size_t index) const { return ((float *)&v)[index]; }

	Vec4F32 Mul(float f) const { return Vec4F32{ _mm_mul_ps(v, _mm_set1_ps(f)) }; }
	Vec4F32 RecipApprox() const { return Vec4F32{ _mm_rcp_ps(v) }; }
	Vec4F32 Recip() const { return Vec4F32{ _mm_div_ps(_mm_set1_ps(1.0f), v) }; }

	Vec4F32 Clamp(float lower, float higher) const {
		return Vec4F32{
			_mm_min_ps(_mm_max_ps(v, _mm_set1_ps(lower)), _mm_set1_ps(higher))
		};
	}

	Vec4F32 WithLane3Zero() const {
		alignas(16) static const uint32_t mask[4] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0 };
		return Vec4F32{ _mm_and_ps(v, _mm_load_ps((const float *)mask)) };
	}

	Vec4F32 WithLane3One() const {
		alignas(16) static const uint32_t mask[4] = { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0 };
		alignas(16) static const float onelane3[4] = { 0.0f, 0.0f, 0.0f, 1.0f };
		return Vec4F32{ _mm_or_ps(_mm_and_ps(v, _mm_load_ps((const float *)mask)), _mm_load_ps((const float *)onelane3)) };
	}

	inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) {
		return Vec4F32{ _mm_add_ps(
			_mm_add_ps(
				_mm_mul_ps(m.col0, _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0))),
				_mm_mul_ps(m.col1, _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)))
			),
			_mm_add_ps(
				_mm_mul_ps(m.col2, _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2))),
				m.col3)
			)
		};
	}

	static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
		_MM_TRANSPOSE4_PS(col0.v, col1.v, col2.v, col3.v);
	}

	// This is here because ARM64 can do this very efficiently.
	static void LoadTranspose(const float *src, Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
		col0.v = _mm_loadu_ps(src);
		col1.v = _mm_loadu_ps(src + 4);
		col2.v = _mm_loadu_ps(src + 8);
		col3.v = _mm_loadu_ps(src + 12);
		_MM_TRANSPOSE4_PS(col0.v, col1.v, col2.v, col3.v);
	}

	Vec4S32 CompareEq(Vec4F32 other) const { return Vec4S32{ _mm_castps_si128(_mm_cmpeq_ps(v, other.v)) }; }
	Vec4S32 CompareLt(Vec4F32 other) const { return Vec4S32{ _mm_castps_si128(_mm_cmplt_ps(v, other.v)) }; }
	Vec4S32 CompareGt(Vec4F32 other) const { return Vec4S32{ _mm_castps_si128(_mm_cmpgt_ps(v, other.v)) }; }
};

inline Vec4S32 Vec4S32FromF32(Vec4F32 f) { return Vec4S32{ _mm_cvttps_epi32(f.v) }; }
inline Vec4F32 Vec4F32FromS32(Vec4S32 f) { return Vec4F32{ _mm_cvtepi32_ps(f.v) }; }

inline bool AnyZeroSignBit(Vec4F32 value) {
	return _mm_movemask_ps(value.v) != 0xF;
}

// Make sure the W component of scale is 1.0f.
inline void ScaleInplace(Mat4F32 &m, Vec4F32 scale) {
	m.col0 = _mm_mul_ps(m.col0, scale.v);
	m.col1 = _mm_mul_ps(m.col1, scale.v);
	m.col2 = _mm_mul_ps(m.col2, scale.v);
	m.col3 = _mm_mul_ps(m.col3, scale.v);
}

inline void TranslateAndScaleInplace(Mat4F32 &m, Vec4F32 scale, Vec4F32 translate) {
	m.col0 = _mm_add_ps(_mm_mul_ps(m.col0, scale.v), _mm_mul_ps(translate.v, _mm_shuffle_ps(m.col0, m.col0, _MM_SHUFFLE(3,3,3,3))));
	m.col1 = _mm_add_ps(_mm_mul_ps(m.col1, scale.v), _mm_mul_ps(translate.v, _mm_shuffle_ps(m.col1, m.col1, _MM_SHUFFLE(3,3,3,3))));
	m.col2 = _mm_add_ps(_mm_mul_ps(m.col2, scale.v), _mm_mul_ps(translate.v, _mm_shuffle_ps(m.col2, m.col2, _MM_SHUFFLE(3,3,3,3))));
	m.col3 = _mm_add_ps(_mm_mul_ps(m.col3, scale.v), _mm_mul_ps(translate.v, _mm_shuffle_ps(m.col3, m.col3, _MM_SHUFFLE(3,3,3,3))));
}
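
// Note: as with the NEON version below, this assumes scale.w == 1.0f and translate.w == 0.0f.
// Each column then becomes col * scale + translate * col.w, which is the same as replacing
// m with D * m where D has the scale on its diagonal and the translation in its last column.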

struct Vec4U16 {
	__m128i v; // we only use the lower 64 bits.

	static Vec4U16 Zero() { return Vec4U16{ _mm_setzero_si128() }; }
	// static Vec4U16 AllOnes() { return Vec4U16{ _mm_cmpeq_epi16(_mm_setzero_si128(), _mm_setzero_si128()) }; }

	static Vec4U16 Load(const uint16_t *mem) { return Vec4U16{ _mm_loadl_epi64((__m128i *)mem) }; }
	void Store(uint16_t *mem) { _mm_storel_epi64((__m128i *)mem, v); }

	// NOTE: 16-bit signed saturation! Will work for a lot of things, but not all.
	static Vec4U16 FromVec4S32(Vec4S32 v) {
		return Vec4U16{ _mm_packu_epi32_SSE2(v.v) };
	}
	static Vec4U16 FromVec4F32(Vec4F32 v) {
		return Vec4U16{ _mm_packu_epi32_SSE2(_mm_cvtps_epi32(v.v)) };
	}

	Vec4U16 operator |(Vec4U16 other) const { return Vec4U16{ _mm_or_si128(v, other.v) }; }
	Vec4U16 operator &(Vec4U16 other) const { return Vec4U16{ _mm_and_si128(v, other.v) }; }
	Vec4U16 operator ^(Vec4U16 other) const { return Vec4U16{ _mm_xor_si128(v, other.v) }; }

	Vec4U16 Max(Vec4U16 other) const { return Vec4U16{ _mm_max_epu16_SSE2(v, other.v) }; }
	Vec4U16 Min(Vec4U16 other) const { return Vec4U16{ _mm_min_epu16_SSE2(v, other.v) }; }
	Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ _mm_cmplt_epu16(v, other.v) }; }

	inline Vec4U16 AndNot(Vec4U16 inverted) {
		return Vec4U16{
			_mm_andnot_si128(inverted.v, v) // NOTE: with _mm_andnot, the first parameter is inverted, and then and is performed.
		};
	}
};

struct Vec8U16 {
	__m128i v;

	static Vec8U16 Zero() { return Vec8U16{ _mm_setzero_si128() }; }
	static Vec8U16 Splat(uint16_t value) { return Vec8U16{ _mm_set1_epi16((int16_t)value) }; }

	static Vec8U16 Load(const uint16_t *mem) { return Vec8U16{ _mm_loadu_si128((__m128i *)mem) }; }
	void Store(uint16_t *mem) { _mm_storeu_si128((__m128i *)mem, v); }
};

inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) {
	__m128i temp = _mm_srai_epi32(v.v, 31);
	return Vec4U16 {
		_mm_packs_epi32(temp, temp)
	};
}

#elif PPSSPP_ARCH(ARM_NEON) && !TEST_FALLBACK

struct Mat4F32 {
	Mat4F32() {}
	Mat4F32(const float *matrix) {
		col0 = vld1q_f32(matrix);
		col1 = vld1q_f32(matrix + 4);
		col2 = vld1q_f32(matrix + 8);
		col3 = vld1q_f32(matrix + 12);
	}
	void Store(float *m) {
		vst1q_f32(m, col0);
		vst1q_f32(m + 4, col1);
		vst1q_f32(m + 8, col2);
		vst1q_f32(m + 12, col3);
	}

	// NOTE: Unlike the SSE2 version above, this is not fully careful about out-of-range reads:
	// the loads overlap by one element and lane 3 is overwritten afterwards, but the last
	// load still reads one float past the end of the 4x3 matrix (see the TODO below).
	static Mat4F32 Load4x3(const float *m) {
		Mat4F32 result;
		result.col0 = vsetq_lane_f32(0.0f, vld1q_f32(m), 3);
		result.col1 = vsetq_lane_f32(0.0f, vld1q_f32(m + 3), 3);
		result.col2 = vsetq_lane_f32(0.0f, vld1q_f32(m + 6), 3);
		result.col3 = vsetq_lane_f32(1.0f, vld1q_f32(m + 9), 3); // TODO: Fix this out of bounds read
		return result;
	}

	float32x4_t col0;
	float32x4_t col1;
	float32x4_t col2;
	float32x4_t col3;
};

// The columns are spread out between the data*. This is just intermediate storage for multiplication.
struct Mat4x3F32 {
	Mat4x3F32(const float *matrix) {
		data0 = vld1q_f32(matrix);
		data1 = vld1q_f32(matrix + 4);
		data2 = vld1q_f32(matrix + 8);
	}

	float32x4_t data0;
	float32x4_t data1;
	float32x4_t data2;
};

inline Mat4F32 Mul4x4By4x4(Mat4F32 a, Mat4F32 b) {
	Mat4F32 result;

	float32x4_t r_col = vmulq_laneq_f32(b.col0, a.col0, 0);
	r_col = vfmaq_laneq_f32(r_col, b.col1, a.col0, 1);
	r_col = vfmaq_laneq_f32(r_col, b.col2, a.col0, 2);
	result.col0 = vfmaq_laneq_f32(r_col, b.col3, a.col0, 3);

	r_col = vmulq_laneq_f32(b.col0, a.col1, 0);
	r_col = vfmaq_laneq_f32(r_col, b.col1, a.col1, 1);
	r_col = vfmaq_laneq_f32(r_col, b.col2, a.col1, 2);
	result.col1 = vfmaq_laneq_f32(r_col, b.col3, a.col1, 3);

	r_col = vmulq_laneq_f32(b.col0, a.col2, 0);
	r_col = vfmaq_laneq_f32(r_col, b.col1, a.col2, 1);
	r_col = vfmaq_laneq_f32(r_col, b.col2, a.col2, 2);
	result.col2 = vfmaq_laneq_f32(r_col, b.col3, a.col2, 3);

	r_col = vmulq_laneq_f32(b.col0, a.col3, 0);
	r_col = vfmaq_laneq_f32(r_col, b.col1, a.col3, 1);
	r_col = vfmaq_laneq_f32(r_col, b.col2, a.col3, 2);
	result.col3 = vfmaq_laneq_f32(r_col, b.col3, a.col3, 3);

	return result;
}

inline Mat4F32 Mul4x3By4x4(Mat4x3F32 a, Mat4F32 b) {
	Mat4F32 result;

	float32x4_t r_col = vmulq_laneq_f32(b.col0, a.data0, 0);
	r_col = vfmaq_laneq_f32(r_col, b.col1, a.data0, 1);
	result.col0 = vfmaq_laneq_f32(r_col, b.col2, a.data0, 2);

	r_col = vmulq_laneq_f32(b.col0, a.data0, 3);
	r_col = vfmaq_laneq_f32(r_col, b.col1, a.data1, 0);
	result.col1 = vfmaq_laneq_f32(r_col, b.col2, a.data1, 1);

	r_col = vmulq_laneq_f32(b.col0, a.data1, 2);
	r_col = vfmaq_laneq_f32(r_col, b.col1, a.data1, 3);
	result.col2 = vfmaq_laneq_f32(r_col, b.col2, a.data2, 0);

	r_col = vmulq_laneq_f32(b.col0, a.data2, 1);
	r_col = vfmaq_laneq_f32(r_col, b.col1, a.data2, 2);
	r_col = vfmaq_laneq_f32(r_col, b.col2, a.data2, 3);

	// The last entry has an implied 1.0f.
	result.col3 = vaddq_f32(r_col, b.col3);
	return result;
}

struct Vec4S32 {
	int32x4_t v;

	static Vec4S32 Zero() { return Vec4S32{ vdupq_n_s32(0) }; }
	static Vec4S32 Splat(int lane) { return Vec4S32{ vdupq_n_s32(lane) }; }

	static Vec4S32 Load(const int *src) { return Vec4S32{ vld1q_s32(src) }; }
	static Vec4S32 LoadAligned(const int *src) { return Vec4S32{ vld1q_s32(src) }; }
	void Store(int *dst) { vst1q_s32(dst, v); }
	void Store2(int *dst) { vst1_s32(dst, vget_low_s32(v)); }
	void StoreAligned(int *dst) { vst1q_s32(dst, v); }

	// Warning: Unlike on x86, this is a full 32-bit multiplication.
	Vec4S32 Mul16(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; }

	Vec4S32 SignExtend16() const { return Vec4S32{ vshrq_n_s32(vshlq_n_s32(v, 16), 16) }; }
	// NOTE: These can be done in sequence, but when done, you must FixupAfterMinMax to get valid output (on SSE2 at least).
	Vec4S32 Min16(Vec4S32 other) const { return Vec4S32{ vminq_s32(v, other.v) }; }
	Vec4S32 Max16(Vec4S32 other) const { return Vec4S32{ vmaxq_s32(v, other.v) }; }
	Vec4S32 FixupAfterMinMax() const { return Vec4S32{ v }; }

	// NOTE: May be slow.
	int operator[](size_t index) const { return ((int *)&v)[index]; }

	Vec4S32 operator +(Vec4S32 other) const { return Vec4S32{ vaddq_s32(v, other.v) }; }
	Vec4S32 operator -(Vec4S32 other) const { return Vec4S32{ vsubq_s32(v, other.v) }; }
	Vec4S32 operator *(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; }
	Vec4S32 operator |(Vec4S32 other) const { return Vec4S32{ vorrq_s32(v, other.v) }; }
	Vec4S32 operator &(Vec4S32 other) const { return Vec4S32{ vandq_s32(v, other.v) }; }
	Vec4S32 operator ^(Vec4S32 other) const { return Vec4S32{ veorq_s32(v, other.v) }; }
	Vec4S32 AndNot(Vec4S32 inverted) const { return Vec4S32{ vandq_s32(v, vmvnq_s32(inverted.v)) }; }
	Vec4S32 Mul(Vec4S32 other) const { return Vec4S32{ vmulq_s32(v, other.v) }; }
	void operator &=(Vec4S32 other) { v = vandq_s32(v, other.v); }

	template<int imm>
	Vec4S32 Shl() const { return Vec4S32{ vshlq_n_s32(v, imm) }; }

	void operator +=(Vec4S32 other) { v = vaddq_s32(v, other.v); }
	void operator -=(Vec4S32 other) { v = vsubq_s32(v, other.v); }

	Vec4S32 CompareEq(Vec4S32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vceqq_s32(v, other.v)) }; }
	Vec4S32 CompareLt(Vec4S32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcltq_s32(v, other.v)) }; }
	Vec4S32 CompareGt(Vec4S32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcgtq_s32(v, other.v)) }; }
	Vec4S32 CompareGtZero() const { return Vec4S32{ vreinterpretq_s32_u32(vcgtq_s32(v, vdupq_n_s32(0))) }; }
};

struct Vec4F32 {
	float32x4_t v;

	static Vec4F32 Zero() { return Vec4F32{ vdupq_n_f32(0.0f) }; }
	static Vec4F32 Splat(float lane) { return Vec4F32{ vdupq_n_f32(lane) }; }

	static Vec4F32 Load(const float *src) { return Vec4F32{ vld1q_f32(src) }; }
	static Vec4F32 LoadS8Norm(const int8_t *src) {
		const int8x8_t value = (int8x8_t)vdup_n_u32(*((uint32_t *)src));
		const int16x8_t value16 = vmovl_s8(value);
		return Vec4F32 { vcvtq_n_f32_s32(vmovl_s16(vget_low_s16(value16)), 7) };
	}
	static Vec4F32 LoadS16Norm(const int16_t *src) { // Divides by 32768.0f
		return Vec4F32 { vcvtq_n_f32_s32(vmovl_s16(vld1_s16(src)), 15) };
	}
	static Vec4F32 LoadAligned(const float *src) { return Vec4F32{ vld1q_f32(src) }; }
	void Store(float *dst) { vst1q_f32(dst, v); }
	void Store2(float *dst) { vst1_f32(dst, vget_low_f32(v)); }
	void StoreAligned(float *dst) { vst1q_f32(dst, v); }
	void Store3(float *dst) {
		// TODO: There might be better ways. Try to avoid this when possible.
		vst1_f32(dst, vget_low_f32(v));
		dst[2] = vgetq_lane_f32(v, 2);
	}

	static Vec4F32 LoadConvertS16(const int16_t *src) {
		int16x4_t value = vld1_s16(src);
		return Vec4F32{ vcvtq_f32_s32(vmovl_s16(value)) };
	}

	static Vec4F32 LoadConvertS8(const int8_t *src) { // Note: will load 8 bytes, not 4. Only the first 4 bytes will be used.
		int8x8_t value = vld1_s8(src);
		int16x4_t value16 = vget_low_s16(vmovl_s8(value));
		return Vec4F32{ vcvtq_f32_s32(vmovl_s16(value16)) };
	}

	static Vec4F32 LoadF24x3_One(const uint32_t *src) {
		return Vec4F32{ vsetq_lane_f32(1.0f, vreinterpretq_f32_u32(vshlq_n_u32(vld1q_u32(src), 8)), 3) };
	}

	static Vec4F32 FromVec4S32(Vec4S32 other) {
		return Vec4F32{ vcvtq_f32_s32(other.v) };
	}

	// NOTE: May be slow.
	float operator[](size_t index) const { return ((float *)&v)[index]; }

	Vec4F32 operator +(Vec4F32 other) const { return Vec4F32{ vaddq_f32(v, other.v) }; }
	Vec4F32 operator -(Vec4F32 other) const { return Vec4F32{ vsubq_f32(v, other.v) }; }
	Vec4F32 operator *(Vec4F32 other) const { return Vec4F32{ vmulq_f32(v, other.v) }; }
	Vec4F32 Min(Vec4F32 other) const { return Vec4F32{ vminq_f32(v, other.v) }; }
	Vec4F32 Max(Vec4F32 other) const { return Vec4F32{ vmaxq_f32(v, other.v) }; }
	void operator +=(Vec4F32 other) { v = vaddq_f32(v, other.v); }
	void operator -=(Vec4F32 other) { v = vsubq_f32(v, other.v); }
	void operator *=(Vec4F32 other) { v = vmulq_f32(v, other.v); }
#if PPSSPP_ARCH(ARM64_NEON)
	void operator /=(Vec4F32 other) { v = vdivq_f32(v, other.v); }
#else
	// ARM32 doesn't have vdivq.
	void operator /=(Vec4F32 other) { v = vmulq_f32(v, other.Recip().v); }
#endif
	void operator &=(Vec4S32 other) { v = vreinterpretq_f32_s32(vandq_s32(vreinterpretq_s32_f32(v), other.v)); }
	Vec4F32 operator *(float f) const { return Vec4F32{ vmulq_f32(v, vdupq_n_f32(f)) }; }

	Vec4F32 Mul(float f) const { return Vec4F32{ vmulq_f32(v, vdupq_n_f32(f)) }; }

	Vec4F32 Recip() const {
		float32x4_t recip = vrecpeq_f32(v);
		// Use a couple Newton-Raphson steps to refine the estimate.
		// To save one iteration at the expense of accuracy, use RecipApprox().
		recip = vmulq_f32(vrecpsq_f32(v, recip), recip);
		recip = vmulq_f32(vrecpsq_f32(v, recip), recip);
		return Vec4F32{ recip };
	}
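
	// Note: vrecpsq_f32(v, r) computes (2 - v * r), so each step above is one Newton-Raphson
	// refinement r' = r * (2 - v * r), roughly doubling the number of accurate bits of the
	// initial vrecpeq_f32 estimate.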
591
592
Vec4F32 RecipApprox() const {
593
float32x4_t recip = vrecpeq_f32(v);
594
// To approximately match the precision of x86-64's rcpps, do a single iteration.
595
recip = vmulq_f32(vrecpsq_f32(v, recip), recip);
596
return Vec4F32{ recip };
597
}
598
599
Vec4F32 Clamp(float lower, float higher) const {
600
return Vec4F32{
601
vminq_f32(vmaxq_f32(v, vdupq_n_f32(lower)), vdupq_n_f32(higher))
602
};
603
}
604
605
Vec4F32 WithLane3Zero() const {
606
return Vec4F32{ vsetq_lane_f32(0.0f, v, 3) };
607
}
608
609
Vec4F32 WithLane3One() const {
610
return Vec4F32{ vsetq_lane_f32(1.0f, v, 3) };
611
}
612
613
Vec4S32 CompareEq(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vceqq_f32(v, other.v)) }; }
614
Vec4S32 CompareLt(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcltq_f32(v, other.v)) }; }
615
Vec4S32 CompareGt(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcgtq_f32(v, other.v)) }; }
616
Vec4S32 CompareLe(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcleq_f32(v, other.v)) }; }
617
Vec4S32 CompareGe(Vec4F32 other) const { return Vec4S32{ vreinterpretq_s32_u32(vcgeq_f32(v, other.v)) }; }
618
619
// One of many possible solutions. Sometimes we could also use vld4q_f32 probably..
620
static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
621
#if PPSSPP_ARCH(ARM64_NEON)
622
// Only works on ARM64
623
float32x4_t temp0 = vzip1q_f32(col0.v, col2.v);
624
float32x4_t temp1 = vzip2q_f32(col0.v, col2.v);
625
float32x4_t temp2 = vzip1q_f32(col1.v, col3.v);
626
float32x4_t temp3 = vzip2q_f32(col1.v, col3.v);
627
col0.v = vzip1q_f32(temp0, temp2);
628
col1.v = vzip2q_f32(temp0, temp2);
629
col2.v = vzip1q_f32(temp1, temp3);
630
col3.v = vzip2q_f32(temp1, temp3);
631
#else
632
float32x4x2_t col01 = vtrnq_f32(col0.v, col1.v);
633
float32x4x2_t col23 = vtrnq_f32(col2.v, col3.v);
634
col0.v = vcombine_f32(vget_low_f32(col01.val[0]), vget_low_f32(col23.val[0]));
635
col1.v = vcombine_f32(vget_low_f32(col01.val[1]), vget_low_f32(col23.val[1]));
636
col2.v = vcombine_f32(vget_high_f32(col01.val[0]), vget_high_f32(col23.val[0]));
637
col3.v = vcombine_f32(vget_high_f32(col01.val[1]), vget_high_f32(col23.val[1]));
638
#endif
639
}
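
	// Note on the ARM64 path: with columns c0..c3, vzip1q(c0, c2) = (c0[0], c2[0], c0[1], c2[1])
	// and vzip1q(c1, c3) = (c1[0], c3[0], c1[1], c3[1]); zipping those two again gives
	// (c0[0], c1[0], c2[0], c3[0]), i.e. the first row, and likewise for the other rows.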
640
641
static void LoadTranspose(const float *src, Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
642
// The optimizer hopefully gets rid of the copies below.
643
float32x4x4_t r = vld4q_f32(src);
644
col0.v = r.val[0];
645
col1.v = r.val[1];
646
col2.v = r.val[2];
647
col3.v = r.val[3];
648
}
649
650
inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) {
651
#if PPSSPP_ARCH(ARM64_NEON)
652
float32x4_t sum = vaddq_f32(
653
vaddq_f32(vmulq_laneq_f32(m.col0, v, 0), vmulq_laneq_f32(m.col1, v, 1)),
654
vaddq_f32(vmulq_laneq_f32(m.col2, v, 2), m.col3));
655
#else
656
float32x4_t sum = vaddq_f32(
657
vaddq_f32(vmulq_lane_f32(m.col0, vget_low_f32(v), 0), vmulq_lane_f32(m.col1, vget_low_f32(v), 1)),
658
vaddq_f32(vmulq_lane_f32(m.col2, vget_high_f32(v), 0), m.col3));
659
#endif
660
return Vec4F32{ sum };
661
}
662
};
663
664
inline Vec4S32 Vec4S32FromF32(Vec4F32 f) { return Vec4S32{ vcvtq_s32_f32(f.v) }; }
665
inline Vec4F32 Vec4F32FromS32(Vec4S32 s) { return Vec4F32{ vcvtq_f32_s32(s.v) }; }
666
667
// Make sure the W component of scale is 1.0f.
668
inline void ScaleInplace(Mat4F32 &m, Vec4F32 scale) {
669
m.col0 = vmulq_f32(m.col0, scale.v);
670
m.col1 = vmulq_f32(m.col1, scale.v);
671
m.col2 = vmulq_f32(m.col2, scale.v);
672
m.col3 = vmulq_f32(m.col3, scale.v);
673
}
674
675
// Make sure the W component of scale is 1.0f, and the W component of translate should be 0.
676
inline void TranslateAndScaleInplace(Mat4F32 &m, Vec4F32 scale, Vec4F32 translate) {
677
m.col0 = vaddq_f32(vmulq_f32(m.col0, scale.v), vmulq_laneq_f32(translate.v, m.col0, 3));
678
m.col1 = vaddq_f32(vmulq_f32(m.col1, scale.v), vmulq_laneq_f32(translate.v, m.col1, 3));
679
m.col2 = vaddq_f32(vmulq_f32(m.col2, scale.v), vmulq_laneq_f32(translate.v, m.col2, 3));
680
m.col3 = vaddq_f32(vmulq_f32(m.col3, scale.v), vmulq_laneq_f32(translate.v, m.col3, 3));
681
}
682
683
inline bool AnyZeroSignBit(Vec4S32 value) {
684
#if PPSSPP_ARCH(ARM64_NEON)
685
// Shortcut on arm64
686
return vmaxvq_s32(value.v) >= 0;
687
#else
688
// Very suboptimal, let's optimize later.
689
int32x2_t prod = vand_s32(vget_low_s32(value.v), vget_high_s32(value.v));
690
int mask = vget_lane_s32(prod, 0) & vget_lane_s32(prod, 1);
691
return (mask & 0x80000000) == 0;
692
#endif
693
}
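
// Note: vmaxvq_s32 reduces the vector to its maximum lane, and that maximum is >= 0 exactly
// when at least one lane is non-negative (sign bit clear), matching the movemask-based SSE2
// version of AnyZeroSignBit above.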
694
695
inline bool AnyZeroSignBit(Vec4F32 value) {
696
int32x4_t ival = vreinterpretq_s32_f32(value.v);
697
#if PPSSPP_ARCH(ARM64_NEON)
698
// Shortcut on arm64
699
return vmaxvq_s32(ival) >= 0;
700
#else
701
int32x2_t prod = vand_s32(vget_low_s32(ival), vget_high_s32(ival));
702
int mask = vget_lane_s32(prod, 0) & vget_lane_s32(prod, 1);
703
return (mask & 0x80000000) == 0;
704
#endif
705
}
706
707
struct Vec4U16 {
708
uint16x4_t v; // 64 bits.
709
710
static Vec4U16 Zero() { return Vec4U16{ vdup_n_u16(0) }; }
711
static Vec4U16 Splat(uint16_t value) { return Vec4U16{ vdup_n_u16(value) }; }
712
713
static Vec4U16 Load(const uint16_t *mem) { return Vec4U16{ vld1_u16(mem) }; }
714
void Store(uint16_t *mem) { vst1_u16(mem, v); }
715
716
static Vec4U16 FromVec4S32(Vec4S32 v) {
717
return Vec4U16{ vmovn_u32(vreinterpretq_u32_s32(v.v)) };
718
}
719
static Vec4U16 FromVec4F32(Vec4F32 v) {
720
return Vec4U16{ vmovn_u32(vreinterpretq_u32_s32(vcvtq_s32_f32(v.v))) };
721
}
722
723
Vec4U16 operator |(Vec4U16 other) const { return Vec4U16{ vorr_u16(v, other.v) }; }
724
Vec4U16 operator &(Vec4U16 other) const { return Vec4U16{ vand_u16(v, other.v) }; }
725
Vec4U16 operator ^(Vec4U16 other) const { return Vec4U16{ veor_u16(v, other.v) }; }
726
727
Vec4U16 Max(Vec4U16 other) const { return Vec4U16{ vmax_u16(v, other.v) }; }
728
Vec4U16 Min(Vec4U16 other) const { return Vec4U16{ vmin_u16(v, other.v) }; }
729
Vec4U16 CompareLT(Vec4U16 other) { return Vec4U16{ vclt_u16(v, other.v) }; }
730
731
Vec4U16 AndNot(Vec4U16 inverted) { return Vec4U16{ vand_u16(v, vmvn_u16(inverted.v)) }; }
732
};
733
734
inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) {
735
int32x4_t sign_mask = vshrq_n_s32(v.v, 31);
736
uint16x4_t result = vreinterpret_u16_s16(vmovn_s32(sign_mask));
737
return Vec4U16{ result };
738
}
739
740
struct Vec8U16 {
741
uint16x8_t v;
742
743
static Vec8U16 Zero() { return Vec8U16{ vdupq_n_u16(0) }; }
744
static Vec8U16 Splat(uint16_t value) { return Vec8U16{ vdupq_n_u16(value) }; }
745
746
static Vec8U16 Load(const uint16_t *mem) { return Vec8U16{ vld1q_u16(mem) }; }
747
void Store(uint16_t *mem) { vst1q_u16(mem, v); }
748
};
749
750
#else

#define CROSSSIMD_SLOW 1

// Fake SIMD by using scalar.

struct Mat4F32 {
	Mat4F32() {}
	Mat4F32(const float *src) {
		memcpy(m, src, sizeof(m));
	}
	void Store(float *dest) {
		memcpy(dest, m, sizeof(m));
	}
	static Mat4F32 Load4x3(const float *src) {
		Mat4F32 mat;
		mat.m[0] = src[0];
		mat.m[1] = src[1];
		mat.m[2] = src[2];
		mat.m[3] = 0.0f;
		mat.m[4] = src[3];
		mat.m[5] = src[4];
		mat.m[6] = src[5];
		mat.m[7] = 0.0f;
		mat.m[8] = src[6];
		mat.m[9] = src[7];
		mat.m[10] = src[8];
		mat.m[11] = 0.0f;
		mat.m[12] = src[9];
		mat.m[13] = src[10];
		mat.m[14] = src[11];
		mat.m[15] = 1.0f;
		return mat;
	}

	// cols are consecutive
	float m[16];
};

// The columns are consecutive but missing the last row (implied 0,0,0,1).
// This is just intermediate storage for multiplication.
struct Mat4x3F32 {
	Mat4x3F32(const float *matrix) {
		memcpy(m, matrix, 12 * sizeof(float));
	}
	float m[12];
};

struct Vec4S32 {
	int32_t v[4];

	static Vec4S32 Zero() { return Vec4S32{}; }
	static Vec4S32 Splat(int lane) { return Vec4S32{ { lane, lane, lane, lane } }; }

	static Vec4S32 Load(const int *src) { return Vec4S32{ { src[0], src[1], src[2], src[3] } }; }
	static Vec4S32 LoadAligned(const int *src) { return Load(src); }
	void Store(int *dst) { memcpy(dst, v, sizeof(v)); }
	void Store2(int *dst) { memcpy(dst, v, sizeof(v[0]) * 2); }
	void StoreAligned(int *dst) { memcpy(dst, v, sizeof(v)); }

	// Warning: Unlike on x86 SSE2, this is a full 32-bit multiplication.
	Vec4S32 Mul16(Vec4S32 other) const { return Vec4S32{ { v[0] * other.v[0], v[1] * other.v[1], v[2] * other.v[2], v[3] * other.v[3] } }; }

	Vec4S32 SignExtend16() const {
		Vec4S32 tmp;
		for (int i = 0; i < 4; i++) {
			tmp.v[i] = (int32_t)(int16_t)v[i];
		}
		return tmp;
	}
	// NOTE: These can be done in sequence, but when done, you must FixupAfterMinMax to get valid output (on SSE2 at least).
	Vec4S32 Min16(Vec4S32 other) const {
		Vec4S32 tmp;
		for (int i = 0; i < 4; i++) {
			tmp.v[i] = other.v[i] < v[i] ? other.v[i] : v[i];
		}
		return tmp;
	}
	Vec4S32 Max16(Vec4S32 other) const {
		Vec4S32 tmp;
		for (int i = 0; i < 4; i++) {
			tmp.v[i] = other.v[i] > v[i] ? other.v[i] : v[i];
		}
		return tmp;
	}
	Vec4S32 FixupAfterMinMax() const { return *this; }

	int operator[](size_t index) const { return v[index]; }

	Vec4S32 operator +(Vec4S32 other) const {
		return Vec4S32{ { v[0] + other.v[0], v[1] + other.v[1], v[2] + other.v[2], v[3] + other.v[3], } };
	}
	Vec4S32 operator -(Vec4S32 other) const {
		return Vec4S32{ { v[0] - other.v[0], v[1] - other.v[1], v[2] - other.v[2], v[3] - other.v[3], } };
	}
	Vec4S32 operator *(Vec4S32 other) const {
		return Vec4S32{ { v[0] * other.v[0], v[1] * other.v[1], v[2] * other.v[2], v[3] * other.v[3], } };
	}
	// TODO: Can optimize the bitwise ones with 64-bit operations.
	Vec4S32 operator |(Vec4S32 other) const {
		return Vec4S32{ { v[0] | other.v[0], v[1] | other.v[1], v[2] | other.v[2], v[3] | other.v[3], } };
	}
	Vec4S32 operator &(Vec4S32 other) const {
		return Vec4S32{ { v[0] & other.v[0], v[1] & other.v[1], v[2] & other.v[2], v[3] & other.v[3], } };
	}
	Vec4S32 operator ^(Vec4S32 other) const {
		return Vec4S32{ { v[0] ^ other.v[0], v[1] ^ other.v[1], v[2] ^ other.v[2], v[3] ^ other.v[3], } };
	}
	Vec4S32 AndNot(Vec4S32 other) const {
		return Vec4S32{ { v[0] & ~other.v[0], v[1] & ~other.v[1], v[2] & ~other.v[2], v[3] & ~other.v[3], } };
	}
	Vec4S32 Mul(Vec4S32 other) const { return *this * other; }

	void operator &=(Vec4S32 other) { for (int i = 0; i < 4; i++) v[i] &= other.v[i]; }
	void operator +=(Vec4S32 other) { for (int i = 0; i < 4; i++) v[i] += other.v[i]; }
	void operator -=(Vec4S32 other) { for (int i = 0; i < 4; i++) v[i] -= other.v[i]; }

	template<int imm>
	Vec4S32 Shl() const { return Vec4S32{ { v[0] << imm, v[1] << imm, v[2] << imm, v[3] << imm } }; }

	Vec4S32 CompareEq(Vec4S32 other) const {
		Vec4S32 out;
		for (int i = 0; i < 4; i++) {
			out.v[i] = v[i] == other.v[i] ? 0xFFFFFFFF : 0;
		}
		return out;
	}
	Vec4S32 CompareLt(Vec4S32 other) const {
		Vec4S32 out;
		for (int i = 0; i < 4; i++) {
			out.v[i] = v[i] < other.v[i] ? 0xFFFFFFFF : 0;
		}
		return out;
	}
	Vec4S32 CompareGt(Vec4S32 other) const {
		Vec4S32 out;
		for (int i = 0; i < 4; i++) {
			out.v[i] = v[i] > other.v[i] ? 0xFFFFFFFF : 0;
		}
		return out;
	}
	Vec4S32 CompareGtZero() const {
		Vec4S32 out;
		for (int i = 0; i < 4; i++) {
			out.v[i] = v[i] > 0 ? 0xFFFFFFFF : 0;
		}
		return out;
	}
};

struct Vec4F32 {
	float v[4];

	static Vec4F32 Zero() { return Vec4F32{}; }
	static Vec4F32 Splat(float lane) { return Vec4F32{ { lane, lane, lane, lane } }; }

	static Vec4F32 Load(const float *src) { return Vec4F32{ { src[0], src[1], src[2], src[3] } }; }
	static Vec4F32 LoadAligned(const float *src) { return Vec4F32{ { src[0], src[1], src[2], src[3] } }; }
	static Vec4F32 LoadS8Norm(const int8_t *src) {
		Vec4F32 temp;
		for (int i = 0; i < 4; i++) {
			temp.v[i] = (float)src[i] * (1.0f / 128.0f);
		}
		return temp;
	}
	static Vec4F32 LoadS16Norm(const int16_t *src) { // Divides by 32768.0f
		Vec4F32 temp;
		for (int i = 0; i < 4; i++) {
			temp.v[i] = (float)src[i] * (1.0f / 32768.0f);
		}
		return temp;
	}
	void Store(float *dst) { memcpy(dst, v, sizeof(v)); }
	void Store2(float *dst) { memcpy(dst, v, sizeof(v[0]) * 2); }
	void StoreAligned(float *dst) { memcpy(dst, v, sizeof(v)); }
	void Store3(float *dst) {
		memcpy(dst, v, sizeof(v[0]) * 3);
	}

	static Vec4F32 LoadConvertS16(const int16_t *src) {
		Vec4F32 temp;
		for (int i = 0; i < 4; i++) {
			temp.v[i] = (float)src[i];
		}
		return temp;
	}

	static Vec4F32 LoadConvertS8(const int8_t *src) { // Note: will load 8 bytes, not 4. Only the first 4 bytes will be used.
		Vec4F32 temp;
		for (int i = 0; i < 4; i++) {
			temp.v[i] = (float)src[i];
		}
		return temp;
	}
944
945
static Vec4F32 LoadF24x3_One(const uint32_t *src) {
946
uint32_t shifted[4] = { src[0] << 8, src[1] << 8, src[2] << 8, 0 };
947
Vec4F32 temp;
948
memcpy(temp.v, shifted, sizeof(temp.v));
949
return temp;
950
}
951
952
static Vec4F32 FromVec4S32(Vec4S32 src) {
953
Vec4F32 temp;
954
for (int i = 0; i < 4; i++) {
955
temp.v[i] = (float)src[i];
956
}
957
return temp;
958
}
959
960
float operator[](size_t index) const { return v[index]; }
961
962
Vec4F32 operator +(Vec4F32 other) const {
963
return Vec4F32{ { v[0] + other.v[0], v[1] + other.v[1], v[2] + other.v[2], v[3] + other.v[3], } };
964
}
965
Vec4F32 operator -(Vec4F32 other) const {
966
return Vec4F32{ { v[0] - other.v[0], v[1] - other.v[1], v[2] - other.v[2], v[3] - other.v[3], } };
967
}
968
Vec4F32 operator *(Vec4F32 other) const {
969
return Vec4F32{ { v[0] * other.v[0], v[1] * other.v[1], v[2] * other.v[2], v[3] * other.v[3], } };
970
}
971
Vec4F32 Min(Vec4F32 other) const {
972
Vec4F32 temp;
973
for (int i = 0; i < 4; i++) {
974
temp.v[i] = v[i] < other.v[i] ? v[i] : other.v[i];
975
}
976
return temp;
977
}
978
Vec4F32 Max(Vec4F32 other) const {
979
Vec4F32 temp;
980
for (int i = 0; i < 4; i++) {
981
temp.v[i] = v[i] > other.v[i] ? v[i] : other.v[i];
982
}
983
return temp;
984
}
985
void operator +=(Vec4F32 other) {
986
for (int i = 0; i < 4; i++) {
987
v[i] += other.v[i];
988
}
989
}
990
void operator -=(Vec4F32 other) {
991
for (int i = 0; i < 4; i++) {
992
v[i] -= other.v[i];
993
}
994
}
995
void operator *=(Vec4F32 other) {
996
for (int i = 0; i < 4; i++) {
997
v[i] *= other.v[i];
998
}
999
}
1000
void operator /=(Vec4F32 other) {
1001
for (int i = 0; i < 4; i++) {
1002
v[i] /= other.v[i];
1003
}
1004
}
1005
void operator &=(Vec4S32 other) {
1006
// TODO: This can be done simpler, although with some ugly casts.
1007
for (int i = 0; i < 4; i++) {
1008
uint32_t val;
1009
memcpy(&val, &v[i], 4);
1010
val &= other.v[i];
1011
memcpy(&v[i], &val, 4);
1012
}
1013
}
	Vec4F32 operator *(float f) const {
		return Vec4F32{ { v[0] * f, v[1] * f, v[2] * f, v[3] * f } };
	}

	Vec4F32 Mul(float f) const {
		return Vec4F32{ { v[0] * f, v[1] * f, v[2] * f, v[3] * f } };
	}

	Vec4F32 Recip() const {
		return Vec4F32{ { 1.0f / v[0], 1.0f / v[1], 1.0f / v[2], 1.0f / v[3] } };
	}

	Vec4F32 RecipApprox() const {
		return Vec4F32{ { 1.0f / v[0], 1.0f / v[1], 1.0f / v[2], 1.0f / v[3] } };
	}

	Vec4F32 Clamp(float lower, float higher) const {
		Vec4F32 temp;
		for (int i = 0; i < 4; i++) {
			if (v[i] > higher) {
				temp.v[i] = higher;
			} else if (v[i] < lower) {
				temp.v[i] = lower;
			} else {
				temp.v[i] = v[i];
			}
		}
		return temp;
	}

	Vec4F32 WithLane3Zero() const {
		return Vec4F32{ { v[0], v[1], v[2], 0.0f } };
	}

	Vec4F32 WithLane3One() const {
		return Vec4F32{ { v[0], v[1], v[2], 1.0f } };
	}

	Vec4S32 CompareEq(Vec4F32 other) const {
		Vec4S32 temp;
		for (int i = 0; i < 4; i++) {
			temp.v[i] = v[i] == other.v[i] ? 0xFFFFFFFF : 0;
		}
		return temp;
	}
	Vec4S32 CompareLt(Vec4F32 other) const {
		Vec4S32 temp;
		for (int i = 0; i < 4; i++) {
			temp.v[i] = v[i] < other.v[i] ? 0xFFFFFFFF : 0;
		}
		return temp;
	}
	Vec4S32 CompareGt(Vec4F32 other) const {
		Vec4S32 temp;
		for (int i = 0; i < 4; i++) {
			temp.v[i] = v[i] > other.v[i] ? 0xFFFFFFFF : 0;
		}
		return temp;
	}
	Vec4S32 CompareLe(Vec4F32 other) const {
		Vec4S32 temp;
		for (int i = 0; i < 4; i++) {
			temp.v[i] = v[i] <= other.v[i] ? 0xFFFFFFFF : 0;
		}
		return temp;
	}
	Vec4S32 CompareGe(Vec4F32 other) const {
		Vec4S32 temp;
		for (int i = 0; i < 4; i++) {
			temp.v[i] = v[i] >= other.v[i] ? 0xFFFFFFFF : 0;
		}
		return temp;
	}

	// In-place transpose.
	static void Transpose(Vec4F32 &col0, Vec4F32 &col1, Vec4F32 &col2, Vec4F32 &col3) {
		float m[16];
		for (int i = 0; i < 4; i++) {
			m[0 + i] = col0.v[i];
			m[4 + i] = col1.v[i];
			m[8 + i] = col2.v[i];
			m[12 + i] = col3.v[i];
		}
		for (int i = 0; i < 4; i++) {
			col0.v[i] = m[i * 4 + 0];
			col1.v[i] = m[i * 4 + 1];
			col2.v[i] = m[i * 4 + 2];
			col3.v[i] = m[i * 4 + 3];
		}
	}

	inline Vec4F32 AsVec3ByMatrix44(const Mat4F32 &m) {
		float x = m.m[0] * v[0] + m.m[4] * v[1] + m.m[8] * v[2] + m.m[12];
		float y = m.m[1] * v[0] + m.m[5] * v[1] + m.m[9] * v[2] + m.m[13];
		float z = m.m[2] * v[0] + m.m[6] * v[1] + m.m[10] * v[2] + m.m[14];

		// Compute w the same way the SIMD versions do, instead of hardcoding 1.0f.
		float w = m.m[3] * v[0] + m.m[7] * v[1] + m.m[11] * v[2] + m.m[15];
		return Vec4F32{ { x, y, z, w } };
	}
};

inline bool AnyZeroSignBit(Vec4S32 value) {
	for (int i = 0; i < 4; i++) {
		if (value.v[i] >= 0) {
			return true;
		}
	}
	return false;
}

inline bool AnyZeroSignBit(Vec4F32 value) {
	for (int i = 0; i < 4; i++) {
		if (value.v[i] >= 0.0f) {
			return true;
		}
	}
	return false;
}

struct Vec4U16 {
	uint16_t v[4]; // 64 bits.

	static Vec4U16 Zero() { return Vec4U16{}; }
	static Vec4U16 Splat(uint16_t lane) { return Vec4U16{ { lane, lane, lane, lane } }; }

	static Vec4U16 Load(const uint16_t *mem) { return Vec4U16{ { mem[0], mem[1], mem[2], mem[3] } }; }
	void Store(uint16_t *mem) { memcpy(mem, v, sizeof(v)); }

	static Vec4U16 FromVec4S32(Vec4S32 v) {
		return Vec4U16{ { (uint16_t)v.v[0], (uint16_t)v.v[1], (uint16_t)v.v[2], (uint16_t)v.v[3] } };
	}
	static Vec4U16 FromVec4F32(Vec4F32 v) {
		return Vec4U16{ { (uint16_t)v.v[0], (uint16_t)v.v[1], (uint16_t)v.v[2], (uint16_t)v.v[3] } };
	}

	Vec4U16 operator |(Vec4U16 other) const { return Vec4U16{ { (uint16_t)(v[0] | other.v[0]), (uint16_t)(v[1] | other.v[1]), (uint16_t)(v[2] | other.v[2]), (uint16_t)(v[3] | other.v[3]), } }; }
	Vec4U16 operator &(Vec4U16 other) const { return Vec4U16{ { (uint16_t)(v[0] & other.v[0]), (uint16_t)(v[1] & other.v[1]), (uint16_t)(v[2] & other.v[2]), (uint16_t)(v[3] & other.v[3]), } }; }
	Vec4U16 operator ^(Vec4U16 other) const { return Vec4U16{ { (uint16_t)(v[0] ^ other.v[0]), (uint16_t)(v[1] ^ other.v[1]), (uint16_t)(v[2] ^ other.v[2]), (uint16_t)(v[3] ^ other.v[3]), } }; }

	Vec4U16 Max(Vec4U16 other) const {
		Vec4U16 temp;
		for (int i = 0; i < 4; i++) {
			temp.v[i] = v[i] > other.v[i] ? v[i] : other.v[i];
		}
		return temp;
	}
	Vec4U16 Min(Vec4U16 other) const {
		Vec4U16 temp;
		for (int i = 0; i < 4; i++) {
			temp.v[i] = v[i] < other.v[i] ? v[i] : other.v[i];
		}
		return temp;
	}
	Vec4U16 CompareLT(Vec4U16 other) const {
		Vec4U16 temp;
		for (int i = 0; i < 4; i++) {
			temp.v[i] = v[i] < other.v[i] ? 0xFFFF : 0;
		}
		return temp;
	}
	Vec4U16 AndNot(Vec4U16 other) const {
		Vec4U16 temp;
		for (int i = 0; i < 4; i++) {
			temp.v[i] = v[i] & ~other.v[i];
		}
		return temp;
	}
};

struct Vec8U16 {
	uint16_t v[8];

	static Vec8U16 Zero() { return Vec8U16{}; }
	static Vec8U16 Splat(uint16_t value) { return Vec8U16{ {
		value, value, value, value, value, value, value, value,
	} }; }

	static Vec8U16 Load(const uint16_t *mem) { Vec8U16 tmp; memcpy(tmp.v, mem, sizeof(v)); return tmp; }
	void Store(uint16_t *mem) { memcpy(mem, v, sizeof(v)); }
};

inline Vec4U16 SignBits32ToMaskU16(Vec4S32 v) {
	return Vec4U16{ { (uint16_t)(v.v[0] >> 31), (uint16_t)(v.v[1] >> 31), (uint16_t)(v.v[2] >> 31), (uint16_t)(v.v[3] >> 31), } };
}

inline Vec4S32 Vec4S32FromF32(Vec4F32 f) {
	return Vec4S32{ { (int32_t)f.v[0], (int32_t)f.v[1], (int32_t)f.v[2], (int32_t)f.v[3] } };
}

inline Vec4F32 Vec4F32FromS32(Vec4S32 f) {
	return Vec4F32{ { (float)f.v[0], (float)f.v[1], (float)f.v[2], (float)f.v[3] } };
}

// Make sure the W component of scale is 1.0f.
inline void ScaleInplace(Mat4F32 &m, Vec4F32 scale) {
	for (int i = 0; i < 4; i++) {
		m.m[i * 4 + 0] *= scale.v[0];
		m.m[i * 4 + 1] *= scale.v[1];
		m.m[i * 4 + 2] *= scale.v[2];
		m.m[i * 4 + 3] *= scale.v[3];
	}
}

inline void TranslateAndScaleInplace(Mat4F32 &m, Vec4F32 scale, Vec4F32 translate) {
	for (int i = 0; i < 4; i++) {
		m.m[i * 4 + 0] = m.m[i * 4 + 0] * scale.v[0] + translate.v[0] * m.m[i * 4 + 3];
		m.m[i * 4 + 1] = m.m[i * 4 + 1] * scale.v[1] + translate.v[1] * m.m[i * 4 + 3];
		m.m[i * 4 + 2] = m.m[i * 4 + 2] * scale.v[2] + translate.v[2] * m.m[i * 4 + 3];
		m.m[i * 4 + 3] = m.m[i * 4 + 3] * scale.v[3] + translate.v[3] * m.m[i * 4 + 3];
	}
}

inline Mat4F32 Mul4x4By4x4(Mat4F32 a, Mat4F32 b) {
	Mat4F32 result;
	for (int j = 0; j < 4; j++) {
		for (int i = 0; i < 4; i++) {
			float sum = 0.0f;
			for (int k = 0; k < 4; k++) {
				sum += b.m[k * 4 + i] * a.m[j * 4 + k];
			}
			result.m[j * 4 + i] = sum;
		}
	}
	return result;
}
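
// Note: with columns stored consecutively, element (row i, column j) lives at m[j * 4 + i],
// so the loops above compute result(i, j) = sum over k of b(i, k) * a(k, j), i.e. result = b * a
// when the matrices act on column vectors, matching the per-column accumulation of the SIMD versions.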

inline Mat4F32 Mul4x3By4x4(Mat4x3F32 a, Mat4F32 b) {
	Mat4F32 result;

	for (int j = 0; j < 4; j++) {
		for (int i = 0; i < 4; i++) {
			float sum = 0.0f;
			for (int k = 0; k < 3; k++) {
				sum += b.m[k * 4 + i] * a.m[j * 3 + k];
			}
			if (j == 3) {
				sum += b.m[12 + i];
			}
			result.m[j * 4 + i] = sum;
		}
	}
	return result;
}

#endif