// CoCalc -- Math3D.h
//
// GitHub Repository: hrydgard/ppsspp
// Path: blob/master/GPU/Math3D.h
// 3185 views
1
// Copyright (c) 2012- PPSSPP Project.
2

3
// This program is free software: you can redistribute it and/or modify
4
// it under the terms of the GNU General Public License as published by
5
// the Free Software Foundation, version 2.0 or later versions.
6

7
// This program is distributed in the hope that it will be useful,
8
// but WITHOUT ANY WARRANTY; without even the implied warranty of
9
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
10
// GNU General Public License 2.0 for more details.
11

12
// A copy of the GPL 2.0 should have been included with the program.
13
// If not, see http://www.gnu.org/licenses/
14

15
// Official git repository and contact information can be found at
16
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17

18
#pragma once
19

20
#include "ppsspp_config.h"
21
#include <cmath>
22
#include <cstring>
23

24
#include "Common/Common.h"
25
#include "Core/Util/AudioFormat.h"  // for clamp_u8
26
#include "Common/Math/fast/fast_matrix.h"
27
#include "Common/Math/SIMDHeaders.h"
28

29
#if PPSSPP_PLATFORM(WINDOWS) && (defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER))
30
#define MATH3D_CALL __vectorcall
31
#else
32
#define MATH3D_CALL
33
#endif
34

35
// There's probably a better place to define these macros.
36
#if PPSSPP_ARCH(X86)
37
// On 32-bit x86, MSVC does not guarantee alignment for
38
// SSE arguments passed on stack (Compiler Error C2719), see e.g.:
39
//   https://stackoverflow.com/questions/10484422/msvc-cannot-send-function-parameters-of-16byte-alignment-on-x86
40
//   https://stackoverflow.com/questions/28488986/formal-parameter-with-declspecalign16-wont-be-aligned
41
// So, as a workaround, "dangerous" cases are loaded via loadu* on 32-bit x86.
42
// Compilers are decently ok at eliminating these extra loads, at least
43
// in trivial cases.
44
// NOTE: not to be outdone, GCC has its own flavor of broken, see e.g.:
45
//   http://www.peterstock.co.uk/games/mingw_sse/
46
//   https://github.com/nothings/stb/issues/81
47
// which is probably worse since it breaks alignment of locals and/or
48
// spills, but that, hopefully, does not affect PPSSPP (modern GCC+Linux
49
// is 16-byte aligned on x86, and MinGW is not a supported PPSSPP target).
50
// NOTE: weird double-casts add a bit of type-safety.
51
#define SAFE_M128(v)  _mm_loadu_ps   (reinterpret_cast<const float*>  (static_cast<const __m128*> (&(v))))
52
#define SAFE_M128I(v) _mm_loadu_si128(reinterpret_cast<const __m128i*>(static_cast<const __m128i*>(&(v))))
53
#else // x64, FWIW also works for non-x86.
54
#define SAFE_M128(v)  (v)
55
#define SAFE_M128I(v) (v)
56
#endif
57

58
namespace Math3D {
59

60
// Helper for Vec classes to clamp values.
61
// Helper for Vec classes: limits v to the range [low, high].
// The upper bound is checked before the lower one, so a value above high
// yields high even when low > high. A value that compares neither greater
// than high nor less than low is returned unchanged.
template<typename T>
inline static T VecClamp(const T &v, const T &low, const T &high)
{
	return (v > high) ? high : ((v < low) ? low : v);
}
70

71
// Two-component vector of T with public x/y fields. The fields are declared
// back to back in one struct, which is what AsArray() and operator[] rely on.
template<typename T>
class Vec2 {
public:
	struct {
		T x,y;
	};

	// Pointer to x; index 1 through this pointer is y.
	T* AsArray() { return &x; }
	const T* AsArray() const { return &x; }

	// Leaves both components uninitialized.
	Vec2() {}
	// Reads a[0] and a[1].
	Vec2(const T a[2]) : x(a[0]), y(a[1]) {}
	Vec2(const T& _x, const T& _y) : x(_x), y(_y) {}

	// Component-wise C-style conversion to another element type.
	template<typename T2>
	Vec2<T2> Cast() const
	{
		return Vec2<T2>((T2)x, (T2)y);
	}

	// Returns a vector with both components set to f.
	static Vec2 AssignToAll(const T& f)
	{
		return Vec2<T>(f, f);
	}

	// Copies x, y into a[0], a[1]. Const-qualified (it does not modify the
	// vector) so that it can be called on const vectors as well.
	void Write(T a[2]) const
	{
		a[0] = x; a[1] = y;
	}

	// Component-wise arithmetic.
	Vec2 operator +(const Vec2& other) const
	{
		return Vec2(x+other.x, y+other.y);
	}
	void operator += (const Vec2 &other)
	{
		x+=other.x; y+=other.y;
	}
	Vec2 operator -(const Vec2& other) const
	{
		return Vec2(x-other.x, y-other.y);
	}
	void operator -= (const Vec2& other)
	{
		x-=other.x; y-=other.y;
	}
	Vec2 operator -() const
	{
		return Vec2(-x,-y);
	}
	// Component-wise product (not a dot product).
	Vec2 operator * (const Vec2& other) const
	{
		return Vec2(x*other.x, y*other.y);
	}
	// Scaling by a scalar of arbitrary type V.
	template<typename V>
	Vec2 operator * (const V& f) const
	{
		return Vec2(x*f,y*f);
	}
	template<typename V>
	void operator *= (const V& f)
	{
		x*=f; y*=f;
	}
	template<typename V>
	Vec2 operator / (const V& f) const
	{
		return Vec2(x/f,y/f);
	}
	template<typename V>
	void operator /= (const V& f)
	{
		*this = *this / f;
	}

	// Squared length: x*x + y*y.
	T Length2() const
	{
		return x*x + y*y;
	}

	// Clamps each component to [l, h] via VecClamp.
	Vec2 Clamp(const T &l, const T &h) const
	{
		return Vec2(VecClamp(x, l, h), VecClamp(y, l, h));
	}

	// Only implemented for T=float
	float Length() const;
	void SetLength(const float l);
	Vec2 WithLength(const float l) const;
	float Distance2To(const Vec2 &other) const;
	Vec2 Normalized() const;
	float Normalize(); // returns the previous length, which is often useful

	// Indexed access: 0 -> x, 1 -> y. No bounds checking is performed.
	T& operator [] (int i) //allow vector[1] = 3   (vector.y=3)
	{
		return *((&x) + i);
	}
	T operator [] (const int i) const
	{
		return *((&x) + i);
	}

	void SetZero()
	{
		x=0; y=0;
	}

	// Common aliases: UV (texel coordinates), ST (texture coordinates)
	T& u() { return x; }
	T& v() { return y; }
	T& s() { return x; }
	T& t() { return y; }

	const T& u() const { return x; }
	const T& v() const { return y; }
	const T& s() const { return x; }
	const T& t() const { return y; }

	// swizzlers - create a subvector of specific components
	const Vec2 yx() const { return Vec2(y, x); }
	const Vec2 vu() const { return Vec2(y, x); }
	const Vec2 ts() const { return Vec2(y, x); }
};
194

195
template<typename T>
196
class Vec3Packed;
197

198
template<typename T>
199
class Vec3
200
{
201
public:
202
	union
203
	{
204
		struct
205
		{
206
			T x,y,z;
207
		};
208
#if defined(_M_SSE)
209
		__m128i ivec;
210
		__m128 vec;
211
#elif PPSSPP_ARCH(ARM_NEON)
212
		int32x4_t ivec;
213
		float32x4_t vec;
214
#endif
215
	};
216

217
	T* AsArray() { return &x; }
218
	const T* AsArray() const { return &x; }
219

220
	Vec3() {}
221
	Vec3(const T a[3]) : x(a[0]), y(a[1]), z(a[2]) {}
222
	constexpr Vec3(const T& _x, const T& _y, const T& _z) : x(_x), y(_y), z(_z) {}
223
	Vec3(const Vec2<T>& _xy, const T& _z) : x(_xy.x), y(_xy.y), z(_z) {}
224
#if defined(_M_SSE)
225
	constexpr Vec3(const __m128 &_vec) : vec(_vec) {}
226
	constexpr Vec3(const __m128i &_ivec) : ivec(_ivec) {}
227
	Vec3(const Vec3Packed<T> &_xyz) {
228
		vec = _mm_loadu_ps(_xyz.AsArray());
229
	}
230
#elif PPSSPP_ARCH(ARM_NEON)
231
	Vec3(const float32x4_t &_vec) : vec(_vec) {}
232
#if !defined(_MSC_VER)
233
	Vec3(const int32x4_t &_ivec) : ivec(_ivec) {}
234
#endif
235
	Vec3(const Vec3Packed<T> &_xyz) {
236
		vec = vld1q_f32(_xyz.AsArray());
237
	}
238
#else
239
	Vec3(const Vec3Packed<T> &_xyz) : x(_xyz.x), y(_xyz.y), z(_xyz.z) {}
240
#endif
241

242
	template<typename T2>
243
	constexpr Vec3<T2> Cast() const
244
	{
245
		return Vec3<T2>((T2)x, (T2)y, (T2)z);
246
	}
247

248
	// Only implemented for T=int and T=float
249
	static Vec3 FromRGB(unsigned int rgb);
250
	unsigned int ToRGB() const; // alpha bits set to zero
251

252
	static constexpr Vec3 AssignToAll(const T& f)
253
	{
254
		return Vec3<T>(f, f, f);
255
	}
256

257
	void Write(T a[3])
258
	{
259
		a[0] = x; a[1] = y; a[2] = z;
260
	}
261

262
	Vec3 operator +(const Vec3 &other) const
263
	{
264
		return Vec3(x+other.x, y+other.y, z+other.z);
265
	}
266
	void operator += (const Vec3 &other)
267
	{
268
		x+=other.x; y+=other.y; z+=other.z;
269
	}
270
	Vec3 operator -(const Vec3 &other) const
271
	{
272
		return Vec3(x-other.x, y-other.y, z-other.z);
273
	}
274
	void operator -= (const Vec3 &other)
275
	{
276
		x-=other.x; y-=other.y; z-=other.z;
277
	}
278
	Vec3 operator -() const
279
	{
280
		return Vec3(-x,-y,-z);
281
	}
282
	Vec3 operator * (const Vec3 &other) const
283
	{
284
		return Vec3(x*other.x, y*other.y, z*other.z);
285
	}
286
	template<typename V>
287
	Vec3 operator * (const V& f) const
288
	{
289
		return Vec3(x*f,y*f,z*f);
290
	}
291
	template<typename V>
292
	void operator *= (const V& f)
293
	{
294
		x*=f; y*=f; z*=f;
295
	}
296
	template<typename V>
297
	Vec3 operator / (const V& f) const
298
	{
299
		return Vec3(x/f,y/f,z/f);
300
	}
301
	template<typename V>
302
	void operator /= (const V& f)
303
	{
304
		*this = *this / f;
305
	}
306

307
	bool operator ==(const Vec3 &other) const {
308
		return x == other.x && y == other.y && z == other.z;
309
	}
310

311
	T Length2() const
312
	{
313
		return x*x + y*y + z*z;
314
	}
315

316
	Vec3 Clamp(const T &l, const T &h) const
317
	{
318
		return Vec3(VecClamp(x, l, h), VecClamp(y, l, h), VecClamp(z, l, h));
319
	}
320

321
	// Only implemented for T=float
322
	float Length() const;
323
	void SetLength(const float l);
324
	Vec3 WithLength(const float l) const;
325
	float Distance2To(const Vec3 &other) const;
326
	Vec3 Normalized(bool useSSE4 = false) const;
327
	Vec3 NormalizedOr001(bool useSSE4 = false) const;
328
	float Normalize(); // returns the previous length, which is often useful
329
	float NormalizeOr001();
330

331
	T& operator [] (int i) //allow vector[2] = 3   (vector.z=3)
332
	{
333
		return *((&x) + i);
334
	}
335
	T operator [] (const int i) const
336
	{
337
		return *((&x) + i);
338
	}
339

340
	void SetZero()
341
	{
342
		x=0; y=0; z=0;
343
	}
344

345
	// Common aliases: UVW (texel coordinates), RGB (colors), STQ (texture coordinates)
346
	T& u() { return x; }
347
	T& v() { return y; }
348
	T& w() { return z; }
349

350
	T& r() { return x; }
351
	T& g() { return y; }
352
	T& b() { return z; }
353

354
	T& s() { return x; }
355
	T& t() { return y; }
356
	T& q() { return z; }
357

358
	const T& u() const { return x; }
359
	const T& v() const { return y; }
360
	const T& w() const { return z; }
361

362
	const T& r() const { return x; }
363
	const T& g() const { return y; }
364
	const T& b() const { return z; }
365

366
	const T& s() const { return x; }
367
	const T& t() const { return y; }
368
	const T& q() const { return z; }
369

370
	// swizzlers - create a subvector of specific components
371
	// e.g. Vec2 uv() { return Vec2(x,y); }
372
	// _DEFINE_SWIZZLER2 defines a single such function, DEFINE_SWIZZLER2 defines all of them for all component names (x<->r) and permutations (xy<->yx)
373
#define _DEFINE_SWIZZLER2(a, b, name) const Vec2<T> name() const { return Vec2<T>(a, b); }
374
#define DEFINE_SWIZZLER2(a, b, a2, b2, a3, b3, a4, b4) \
375
	_DEFINE_SWIZZLER2(a, b, a##b); \
376
	_DEFINE_SWIZZLER2(a, b, a2##b2); \
377
	_DEFINE_SWIZZLER2(a, b, a3##b3); \
378
	_DEFINE_SWIZZLER2(a, b, a4##b4); \
379
	_DEFINE_SWIZZLER2(b, a, b##a); \
380
	_DEFINE_SWIZZLER2(b, a, b2##a2); \
381
	_DEFINE_SWIZZLER2(b, a, b3##a3); \
382
	_DEFINE_SWIZZLER2(b, a, b4##a4);
383

384
	DEFINE_SWIZZLER2(x, y, r, g, u, v, s, t);
385
	DEFINE_SWIZZLER2(x, z, r, b, u, w, s, q);
386
	DEFINE_SWIZZLER2(y, z, g, b, v, w, t, q);
387
#undef DEFINE_SWIZZLER2
388
#undef _DEFINE_SWIZZLER2
389
};
390

391
template<typename T>
392
class Vec3Packed
393
{
394
public:
395
	union
396
	{
397
		struct
398
		{
399
			T x,y,z;
400
		};
401
	};
402

403
	T* AsArray() { return &x; }
404
	const T* AsArray() const { return &x; }
405

406
	Vec3Packed() {}
407
	Vec3Packed(const T a[3]) : x(a[0]), y(a[1]), z(a[2]) {}
408
	Vec3Packed(const T& _x, const T& _y, const T& _z) : x(_x), y(_y), z(_z) {}
409
	Vec3Packed(const Vec2<T>& _xy, const T& _z) : x(_xy.x), y(_xy.y), z(_z) {}
410
	Vec3Packed(const Vec3<T>& _xyz) {
411
		memcpy(&x, _xyz.AsArray(), sizeof(float) * 3);
412
	}
413

414
	template<typename T2>
415
	Vec3Packed<T2> Cast() const
416
	{
417
		return Vec3Packed<T2>((T2)x, (T2)y, (T2)z);
418
	}
419

420
	// Only implemented for T=int and T=float
421
	static Vec3Packed FromRGB(unsigned int rgb);
422
	unsigned int ToRGB() const; // alpha bits set to zero
423

424
	static Vec3Packed AssignToAll(const T& f)
425
	{
426
		return Vec3Packed<T>(f, f, f);
427
	}
428

429
	void Write(T a[3])
430
	{
431
		a[0] = x; a[1] = y; a[2] = z;
432
	}
433

434
	Vec3Packed operator +(const Vec3Packed &other) const
435
	{
436
		return Vec3Packed(x+other.x, y+other.y, z+other.z);
437
	}
438
	void operator += (const Vec3Packed &other)
439
	{
440
		x+=other.x; y+=other.y; z+=other.z;
441
	}
442
	Vec3Packed operator -(const Vec3Packed &other) const
443
	{
444
		return Vec3Packed(x-other.x, y-other.y, z-other.z);
445
	}
446
	void operator -= (const Vec3Packed &other)
447
	{
448
		x-=other.x; y-=other.y; z-=other.z;
449
	}
450
	Vec3Packed operator -() const
451
	{
452
		return Vec3Packed(-x,-y,-z);
453
	}
454
	Vec3Packed operator * (const Vec3Packed &other) const
455
	{
456
		return Vec3Packed(x*other.x, y*other.y, z*other.z);
457
	}
458
	template<typename V>
459
	Vec3Packed operator * (const V& f) const
460
	{
461
		return Vec3Packed(x*f,y*f,z*f);
462
	}
463
	template<typename V>
464
	void operator *= (const V& f)
465
	{
466
		x*=f; y*=f; z*=f;
467
	}
468
	template<typename V>
469
	Vec3Packed operator / (const V& f) const
470
	{
471
		return Vec3Packed(x/f,y/f,z/f);
472
	}
473
	template<typename V>
474
	void operator /= (const V& f)
475
	{
476
		*this = *this / f;
477
	}
478

479
	T Length2() const
480
	{
481
		return x*x + y*y + z*z;
482
	}
483

484
	Vec3Packed Clamp(const T &l, const T &h) const
485
	{
486
		return Vec3Packed(VecClamp(x, l, h), VecClamp(y, l, h), VecClamp(z, l, h));
487
	}
488

489
	// Only implemented for T=float
490
	float Length() const;
491
	void SetLength(const float l);
492
	Vec3Packed WithLength(const float l) const;
493
	float Distance2To(const Vec3Packed &other) const;
494
	Vec3Packed Normalized() const;
495
	float Normalize(); // returns the previous length, which is often useful
496

497
	T& operator [] (int i) //allow vector[2] = 3   (vector.z=3)
498
	{
499
		return *((&x) + i);
500
	}
501
	T operator [] (const int i) const
502
	{
503
		return *((&x) + i);
504
	}
505

506
	void SetZero()
507
	{
508
		x=0; y=0; z=0;
509
	}
510

511
	// Common aliases: UVW (texel coordinates), RGB (colors), STQ (texture coordinates)
512
	T& u() { return x; }
513
	T& v() { return y; }
514
	T& w() { return z; }
515

516
	T& r() { return x; }
517
	T& g() { return y; }
518
	T& b() { return z; }
519

520
	T& s() { return x; }
521
	T& t() { return y; }
522
	T& q() { return z; }
523

524
	const T& u() const { return x; }
525
	const T& v() const { return y; }
526
	const T& w() const { return z; }
527

528
	const T& r() const { return x; }
529
	const T& g() const { return y; }
530
	const T& b() const { return z; }
531

532
	const T& s() const { return x; }
533
	const T& t() const { return y; }
534
	const T& q() const { return z; }
535

536
	// swizzlers - create a subvector of specific components
537
	// e.g. Vec2 uv() { return Vec2(x,y); }
538
	// _DEFINE_SWIZZLER2 defines a single such function, DEFINE_SWIZZLER2 defines all of them for all component names (x<->r) and permutations (xy<->yx)
539
#define _DEFINE_SWIZZLER2(a, b, name) const Vec2<T> name() const { return Vec2<T>(a, b); }
540
#define DEFINE_SWIZZLER2(a, b, a2, b2, a3, b3, a4, b4) \
541
	_DEFINE_SWIZZLER2(a, b, a##b); \
542
	_DEFINE_SWIZZLER2(a, b, a2##b2); \
543
	_DEFINE_SWIZZLER2(a, b, a3##b3); \
544
	_DEFINE_SWIZZLER2(a, b, a4##b4); \
545
	_DEFINE_SWIZZLER2(b, a, b##a); \
546
	_DEFINE_SWIZZLER2(b, a, b2##a2); \
547
	_DEFINE_SWIZZLER2(b, a, b3##a3); \
548
	_DEFINE_SWIZZLER2(b, a, b4##a4);
549

550
	DEFINE_SWIZZLER2(x, y, r, g, u, v, s, t);
551
	DEFINE_SWIZZLER2(x, z, r, b, u, w, s, q);
552
	DEFINE_SWIZZLER2(y, z, g, b, v, w, t, q);
553
#undef DEFINE_SWIZZLER2
554
#undef _DEFINE_SWIZZLER2
555
};
556

557
template<typename T>
558
class Vec4
559
{
560
public:
561
	union
562
	{
563
		struct
564
		{
565
			T x,y,z,w;
566
		};
567
#if defined(_M_SSE)
568
		__m128i ivec;
569
		__m128 vec;
570
#elif PPSSPP_ARCH(ARM_NEON)
571
		int32x4_t ivec;
572
		float32x4_t vec;
573
#endif
574
	};
575

576
	T* AsArray() { return &x; }
577
	const T* AsArray() const { return &x; }
578

579
	Vec4() {}
580
	Vec4(const T a[4]) : x(a[0]), y(a[1]), z(a[2]), w(a[3]) {}
581
	Vec4(const T& _x, const T& _y, const T& _z, const T& _w) : x(_x), y(_y), z(_z), w(_w) {}
582
	Vec4(const Vec2<T>& _xy, const T& _z, const T& _w) : x(_xy.x), y(_xy.y), z(_z), w(_w) {}
583
	Vec4(const Vec3<T>& _xyz, const T& _w) : x(_xyz.x), y(_xyz.y), z(_xyz.z), w(_w) {}
584
#if defined(_M_SSE)
585
	Vec4(const __m128 &_vec) : vec(_vec) {}
586
	Vec4(const __m128i &_ivec) : ivec(_ivec) {}
587
#elif PPSSPP_ARCH(ARM_NEON)
588
	Vec4(const float32x4_t &_vec) : vec(_vec) {}
589
#if !defined(_MSC_VER)
590
	Vec4(const int32x4_t &_ivec) : ivec(_ivec) {}
591
#endif
592
#endif
593

594
	template<typename T2>
595
	Vec4<T2> Cast() const {
596
		if constexpr (std::is_same<T, float>::value && std::is_same<T2, int>::value) {
597
#if defined(_M_SSE)
598
			return _mm_cvtps_epi32(SAFE_M128(vec));
599
#elif PPSSPP_ARCH(ARM_NEON)
600
			return vcvtq_s32_f32(vec);
601
#endif
602
		}
603
		if constexpr (std::is_same<T, int>::value && std::is_same<T2, float>::value) {
604
#if defined(_M_SSE)
605
			return _mm_cvtepi32_ps(SAFE_M128I(ivec));
606
#elif PPSSPP_ARCH(ARM_NEON)
607
			return vcvtq_f32_s32(ivec);
608
#endif
609
		}
610
		return Vec4<T2>((T2)x, (T2)y, (T2)z, (T2)w);
611
	}
612

613
	// Only implemented for T=int and T=float
614
	static Vec4 FromRGBA(unsigned int rgba);
615
	static Vec4 FromRGBA(const u8 *rgba);
616
	unsigned int ToRGBA() const;
617
	void ToRGBA(u8 *rgba) const;
618

619
	static Vec4 AssignToAll(const T& f)
620
	{
621
		return Vec4<T>(f, f, f, f);
622
	}
623

624
	void Write(T a[4])
625
	{
626
		a[0] = x; a[1] = y; a[2] = z; a[3] = w;
627
	}
628

629
	Vec4 operator +(const Vec4& other) const
630
	{
631
		return Vec4(x+other.x, y+other.y, z+other.z, w+other.w);
632
	}
633
	void operator += (const Vec4& other)
634
	{
635
		x+=other.x; y+=other.y; z+=other.z; w+=other.w;
636
	}
637
	Vec4 operator -(const Vec4 &other) const
638
	{
639
		return Vec4(x-other.x, y-other.y, z-other.z, w-other.w);
640
	}
641
	void operator -= (const Vec4 &other)
642
	{
643
		x-=other.x; y-=other.y; z-=other.z; w-=other.w;
644
	}
645
	Vec4 operator -() const
646
	{
647
		return Vec4(-x,-y,-z,-w);
648
	}
649
	Vec4 operator * (const Vec4 &other) const
650
	{
651
		return Vec4(x*other.x, y*other.y, z*other.z, w*other.w);
652
	}
653
	Vec4 operator | (const Vec4 &other) const
654
	{
655
		return Vec4(x | other.x, y | other.y, z | other.z, w | other.w);
656
	}
657
	Vec4 operator & (const Vec4 &other) const
658
	{
659
		return Vec4(x & other.x, y & other.y, z & other.z, w & other.w);
660
	}
661
	Vec4 operator << (const int amount) const
662
	{
663
		// NOTE: x*(1<<amount), etc., might be safer, since
664
		// left-shifting negatives is UB pre-C++20.
665
		return Vec4(x << amount, y << amount, z << amount, w << amount);
666
	}
667
	Vec4 operator >> (const int amount) const
668
	{
669
		return Vec4(x >> amount, y >> amount, z >> amount, w >> amount);
670
	}
671
	template<typename V>
672
	Vec4 operator * (const V& f) const
673
	{
674
		return Vec4(x*f,y*f,z*f,w*f);
675
	}
676
	template<typename V>
677
	void operator *= (const V& f)
678
	{
679
		x*=f; y*=f; z*=f; w*=f;
680
	}
681
	template<typename V>
682
	Vec4 operator / (const V& f) const
683
	{
684
		return Vec4(x/f,y/f,z/f,w/f);
685
	}
686
	template<typename V>
687
	void operator /= (const V& f)
688
	{
689
		*this = *this / f;
690
	}
691

692
	bool operator ==(const Vec4 &other) const {
693
		return x == other.x && y == other.y && z == other.z && w == other.w;
694
	}
695

696
	T Length2() const
697
	{
698
		return x*x + y*y + z*z + w*w;
699
	}
700

701
	Vec4 Clamp(const T &l, const T &h) const
702
	{
703
		return Vec4(VecClamp(x, l, h), VecClamp(y, l, h), VecClamp(z, l, h), VecClamp(w, l, h));
704
	}
705

706
	Vec4 Reciprocal() const
707
	{
708
		const T one = 1.0f;
709
		return Vec4(one / x, one / y, one / z, one / w);
710
	}
711

712
	// Only implemented for T=float
713
	float Length() const;
714
	void SetLength(const float l);
715
	Vec4 WithLength(const float l) const;
716
	float Distance2To(const Vec4 &other) const;
717
	Vec4 Normalized() const;
718
	float Normalize(); // returns the previous length, which is often useful
719

720
	T& operator [] (int i) //allow vector[2] = 3   (vector.z=3)
721
	{
722
		return *((&x) + i);
723
	}
724
	T operator [] (const int i) const
725
	{
726
		return *((&x) + i);
727
	}
728

729
	void SetZero()
730
	{
731
		x=0; y=0; z=0; w=0;
732
	}
733

734
	// Common alias: RGBA (colors)
735
	T& r() { return x; }
736
	T& g() { return y; }
737
	T& b() { return z; }
738
	T& a() { return w; }
739

740
	const T& r() const { return x; }
741
	const T& g() const { return y; }
742
	const T& b() const { return z; }
743
	const T& a() const { return w; }
744

745
	// swizzlers - create a subvector of specific components
746
	// e.g. Vec2 uv() { return Vec2(x,y); }
747
	// _DEFINE_SWIZZLER2 defines a single such function, DEFINE_SWIZZLER2 defines all of them for all component names (x<->r) and permutations (xy<->yx)
748
#define _DEFINE_SWIZZLER2(a, b, name) const Vec2<T> name() const { return Vec2<T>(a, b); }
749
#define DEFINE_SWIZZLER2(a, b, a2, b2) \
750
	_DEFINE_SWIZZLER2(a, b, a##b); \
751
	_DEFINE_SWIZZLER2(a, b, a2##b2); \
752
	_DEFINE_SWIZZLER2(b, a, b##a); \
753
	_DEFINE_SWIZZLER2(b, a, b2##a2);
754

755
	DEFINE_SWIZZLER2(x, y, r, g);
756
	DEFINE_SWIZZLER2(x, z, r, b);
757
	DEFINE_SWIZZLER2(x, w, r, a);
758
	DEFINE_SWIZZLER2(y, z, g, b);
759
	DEFINE_SWIZZLER2(y, w, g, a);
760
	DEFINE_SWIZZLER2(z, w, b, a);
761
#undef DEFINE_SWIZZLER2
762
#undef _DEFINE_SWIZZLER2
763

764
#define _DEFINE_SWIZZLER3(a, b, c, name) const Vec3<T> name() const { return Vec3<T>(a, b, c); }
765
#define DEFINE_SWIZZLER3(a, b, c, a2, b2, c2) \
766
	_DEFINE_SWIZZLER3(a, b, c, a##b##c); \
767
	_DEFINE_SWIZZLER3(a, c, b, a##c##b); \
768
	_DEFINE_SWIZZLER3(b, a, c, b##a##c); \
769
	_DEFINE_SWIZZLER3(b, c, a, b##c##a); \
770
	_DEFINE_SWIZZLER3(c, a, b, c##a##b); \
771
	_DEFINE_SWIZZLER3(c, b, a, c##b##a); \
772
	_DEFINE_SWIZZLER3(a, b, c, a2##b2##c2); \
773
	_DEFINE_SWIZZLER3(a, c, b, a2##c2##b2); \
774
	_DEFINE_SWIZZLER3(b, a, c, b2##a2##c2); \
775
	_DEFINE_SWIZZLER3(b, c, a, b2##c2##a2); \
776
	_DEFINE_SWIZZLER3(c, a, b, c2##a2##b2); \
777
	_DEFINE_SWIZZLER3(c, b, a, c2##b2##a2);
778

779
	DEFINE_SWIZZLER3(x, y, z, r, g, b);
780
	DEFINE_SWIZZLER3(x, y, w, r, g, a);
781
	DEFINE_SWIZZLER3(x, z, w, r, b, a);
782
	DEFINE_SWIZZLER3(y, z, w, g, b, a);
783
#undef DEFINE_SWIZZLER3
784
#undef _DEFINE_SWIZZLER3
785
};
786

787

788
template<typename BaseType>
789
class Mat3x3
790
{
791
public:
792
	// Convention: first three values = first column
793
	Mat3x3(const BaseType values[])
794
	{
795
		for (unsigned int i = 0; i < 3*3; ++i)
796
		{
797
			this->values[i] = values[i];
798
		}
799
	}
800

801
	Mat3x3(BaseType _00, BaseType _01, BaseType _02, BaseType _10, BaseType _11, BaseType _12, BaseType _20, BaseType _21, BaseType _22)
802
	{
803
		values[0] = _00;
804
		values[1] = _01;
805
		values[2] = _02;
806
		values[3] = _10;
807
		values[4] = _11;
808
		values[5] = _12;
809
		values[6] = _20;
810
		values[7] = _21;
811
		values[8] = _22;
812
	}
813

814
	template<typename T>
815
	Vec3<T> operator * (const Vec3<T>& vec) const
816
	{
817
		Vec3<T> ret;
818
		ret.x = values[0]*vec.x + values[3]*vec.y + values[6]*vec.z;
819
		ret.y = values[1]*vec.x + values[4]*vec.y + values[7]*vec.z;
820
		ret.z = values[2]*vec.x + values[5]*vec.y + values[8]*vec.z;
821
		return ret;
822
	}
823

824
	Mat3x3 Inverse() const
825
	{
826
		float a = values[0];
827
		float b = values[1];
828
		float c = values[2];
829
		float d = values[3];
830
		float e = values[4];
831
		float f = values[5];
832
		float g = values[6];
833
		float h = values[7];
834
		float i = values[8];
835
		return Mat3x3(e*i-f*h, f*g-d*i, d*h-e*g,
836
						c*h-b*i, a*i-c*g, b*g-a*h,
837
						b*f-c*e, c*d-a*f, a*e-b*d) / Det();
838
	}
839

840
	BaseType Det() const
841
	{
842
		return values[0]*values[4]*values[8] + values[3]*values[7]*values[2] +
843
				values[6]*values[1]*values[5] - values[2]*values[4]*values[6] -
844
				values[5]*values[7]*values[0] - values[8]*values[1]*values[3];
845
	}
846

847
	Mat3x3 operator / (const BaseType& val) const
848
	{
849
		return Mat3x3(values[0]/val, values[1]/val, values[2]/val,
850
						values[3]/val, values[4]/val, values[5]/val,
851
						values[6]/val, values[7]/val, values[8]/val);
852
	}
853

854
private:
855
	BaseType values[3*3];
856
};
857

858

859
template<typename BaseType>
860
class Mat4x4
861
{
862
public:
863
	// Convention: first four values in arrow = first column
864
	Mat4x4(const BaseType values[])
865
	{
866
		for (unsigned int i = 0; i < 4*4; ++i)
867
		{
868
			this->values[i] = values[i];
869
		}
870
	}
871

872
	template<typename T>
873
	Vec4<T> operator * (const Vec4<T>& vec) const
874
	{
875
		Vec4<T> ret;
876
		ret.x = values[0]*vec.x + values[4]*vec.y + values[8]*vec.z + values[12]*vec.w;
877
		ret.y = values[1]*vec.x + values[5]*vec.y + values[9]*vec.z + values[13]*vec.w;
878
		ret.z = values[2]*vec.x + values[6]*vec.y + values[10]*vec.z + values[14]*vec.w;
879
		ret.w = values[3]*vec.x + values[7]*vec.y + values[11]*vec.z + values[15]*vec.w;
880
		return ret;
881
	}
882

883
private:
884
	BaseType values[4*4];
885
};
886

887
}; // namespace Math3D
888

889
// Global-scope shorthands for the float instantiations of the Math3D vectors.
using Vec2f = Math3D::Vec2<float>;
using Vec3f = Math3D::Vec3<float>;
using Vec3Packedf = Math3D::Vec3Packed<float>;
using Vec4f = Math3D::Vec4<float>;
893

894
#if defined(_M_SSE)
895
template<unsigned i>
896
float MATH3D_CALL vectorGetByIndex(__m128 v) {
897
	// shuffle V so that the element that we want is moved to the bottom
898
	return _mm_cvtss_f32(_mm_shuffle_ps(v, v, _MM_SHUFFLE(i, i, i, i)));
899
}
900
#endif
901

902
#if defined(_M_SSE)
// Computes col0*x + col1*y + col2*z + col3 for a 4x3 matrix stored as four
// consecutive 3-float columns (m[0..2], m[3..5], m[6..8], m[9..11]).
// x, y, and z should be broadcast.  Should only be used through Vec3f version.
// Every column is fetched with a 4-float unaligned load, so lane 3 of the
// result is built from neighbouring columns' data, and the last load (m + 9)
// reads one float past the 12 matrix values.
// Note that this will read an extra float from the matrix, so it better not be at the end of an allocation!
inline __m128 MATH3D_CALL Vec3ByMatrix43Internal(__m128 x, __m128 y, __m128 z, const float m[12]) {
	const __m128 col0 = _mm_loadu_ps(m);
	const __m128 col1 = _mm_loadu_ps(m + 3);
	const __m128 col2 = _mm_loadu_ps(m + 6);
	const __m128 col3 = _mm_loadu_ps(m + 9);
	const __m128 xyTerms = _mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y));
	const __m128 zTermPlusCol3 = _mm_add_ps(_mm_mul_ps(col2, z), col3);
	return _mm_add_ps(xyTerms, zTermPlusCol3);
}
#elif PPSSPP_ARCH(ARM64_NEON)
// Computes col0*vec[0] + col1*vec[1] + col2*vec[2] + col3 for a 4x3 matrix
// stored as four consecutive 3-float columns (12 floats; lane 3 of vec is unused).
// Note that the last load (m + 9) reads one float past the 12 matrix values,
// so the matrix better not be at the end of an allocation!
inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[12]) {
	const float32x4_t col0 = vld1q_f32(m);
	const float32x4_t col1 = vld1q_f32(m + 3);
	const float32x4_t col2 = vld1q_f32(m + 6);
	const float32x4_t col3 = vld1q_f32(m + 9);
	const float32x4_t xyTerms = vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1));
	const float32x4_t zTermPlusCol3 = vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3);
	return vaddq_f32(xyTerms, zTermPlusCol3);
}
#elif PPSSPP_ARCH(ARM_NEON)
// Computes col0*vec[0] + col1*vec[1] + col2*vec[2] + col3 for a 4x3 matrix
// stored as four consecutive 3-float columns (12 floats; lane 3 of vec is unused).
// Note that the last load (m + 9) reads one float past the 12 matrix values,
// so the matrix better not be at the end of an allocation!
inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[12]) {
	const float32x4_t col0 = vld1q_f32(m);
	const float32x4_t col1 = vld1q_f32(m + 3);
	const float32x4_t col2 = vld1q_f32(m + 6);
	const float32x4_t col3 = vld1q_f32(m + 9);
	// Lane selection works on 64-bit halves here: x and y live in the low
	// half, z is lane 0 of the high half.
	const float32x4_t xyTerms = vaddq_f32(vmulq_lane_f32(col0, vget_low_f32(vec), 0), vmulq_lane_f32(col1, vget_low_f32(vec), 1));
	const float32x4_t zTermPlusCol3 = vaddq_f32(vmulq_lane_f32(col2, vget_high_f32(vec), 0), col3);
	return vaddq_f32(xyTerms, zTermPlusCol3);
}
#endif
938

939
// Transforms the point v by the 4x3 matrix m (four consecutive 3-float
// columns; the fourth column is added as a translation) and writes the three
// resulting components to vecOut.
// v and vecOut must point to different memory.
inline void Vec3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) {
#if defined(_M_SSE)
	const __m128 sum = Vec3ByMatrix43Internal(_mm_set1_ps(v[0]), _mm_set1_ps(v[1]), _mm_set1_ps(v[2]), m);
	// Not sure what the best way to store 3 elements is. Ideally, we should
	// probably store all four.
	vecOut[0] = _mm_cvtss_f32(sum);
	vecOut[1] = vectorGetByIndex<1>(sum);
	vecOut[2] = vectorGetByIndex<2>(sum);
#elif PPSSPP_ARCH(ARM_NEON)
	// Pad the input to four lanes so it can be loaded as one register.
	float padded[4] = {v[0], v[1], v[2], 1.0f};
	const float32x4_t sum = Vec3ByMatrix43Internal(vld1q_f32(padded), m);
	vecOut[0] = vgetq_lane_f32(sum, 0);
	vecOut[1] = vgetq_lane_f32(sum, 1);
	vecOut[2] = vgetq_lane_f32(sum, 2);
#else
	// Scalar fallback: one output row per statement.
	vecOut[0] = v[0] * m[0] + v[1] * m[3] + v[2] * m[6] + m[9];
	vecOut[1] = v[0] * m[1] + v[1] * m[4] + v[2] * m[7] + m[10];
	vecOut[2] = v[0] * m[2] + v[1] * m[5] + v[2] * m[8] + m[11];
#endif
}
963

964
// Vec3f flavour of the 4x3 point transform: returns m applied to v, with the
// fourth matrix column added as a translation.
inline Vec3f MATH3D_CALL Vec3ByMatrix43(const Vec3f v, const float m[12]) {
#if defined(_M_SSE)
	// SAFE_M128 guards against a misaligned by-value argument on 32-bit x86.
	const __m128 src = SAFE_M128(v.vec);
	// Broadcast each of the three components across a full register.
	const __m128 xxxx = _mm_shuffle_ps(src, src, _MM_SHUFFLE(0, 0, 0, 0));
	const __m128 yyyy = _mm_shuffle_ps(src, src, _MM_SHUFFLE(1, 1, 1, 1));
	const __m128 zzzz = _mm_shuffle_ps(src, src, _MM_SHUFFLE(2, 2, 2, 2));
	return Vec3ByMatrix43Internal(xxxx, yyyy, zzzz, m);
#elif PPSSPP_ARCH(ARM_NEON)
	return Vec3ByMatrix43Internal(v.vec, m);
#else
	// No SIMD available: defer to the float-array overload.
	Vec3f result;
	Vec3ByMatrix43(result.AsArray(), v.AsArray(), m);
	return result;
#endif
}
979

980
#if defined(_M_SSE)
// Computes col0*x + col1*y + col2*z + col3 for a 4x4 matrix stored as four
// consecutive 4-float columns.
// x, y, and z should be broadcast.  Should only be used through Vec3f version.
inline __m128 MATH3D_CALL Vec3ByMatrix44Internal(__m128 x, __m128 y, __m128 z, const float m[16]) {
	const __m128 col0 = _mm_loadu_ps(m);
	const __m128 col1 = _mm_loadu_ps(m + 4);
	const __m128 col2 = _mm_loadu_ps(m + 8);
	const __m128 col3 = _mm_loadu_ps(m + 12);
	const __m128 xyTerms = _mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y));
	const __m128 zTermPlusCol3 = _mm_add_ps(_mm_mul_ps(col2, z), col3);
	return _mm_add_ps(xyTerms, zTermPlusCol3);
}
#elif PPSSPP_ARCH(ARM64_NEON)
// Computes col0*vec[0] + col1*vec[1] + col2*vec[2] + col3 for a 4x4 matrix
// stored as four consecutive 4-float columns (lane 3 of vec is unused).
inline float32x4_t Vec3ByMatrix44Internal(float32x4_t vec, const float m[16]) {
	const float32x4_t col0 = vld1q_f32(m);
	const float32x4_t col1 = vld1q_f32(m + 4);
	const float32x4_t col2 = vld1q_f32(m + 8);
	const float32x4_t col3 = vld1q_f32(m + 12);
	const float32x4_t xyTerms = vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1));
	const float32x4_t zTermPlusCol3 = vaddq_f32(vmulq_laneq_f32(col2, vec, 2), col3);
	return vaddq_f32(xyTerms, zTermPlusCol3);
}
#elif PPSSPP_ARCH(ARM_NEON)
// Computes col0*vec[0] + col1*vec[1] + col2*vec[2] + col3 for a 4x4 matrix
// stored as four consecutive 4-float columns (lane 3 of vec is unused).
inline float32x4_t Vec3ByMatrix44Internal(float32x4_t vec, const float m[16]) {
	const float32x4_t col0 = vld1q_f32(m);
	const float32x4_t col1 = vld1q_f32(m + 4);
	const float32x4_t col2 = vld1q_f32(m + 8);
	const float32x4_t col3 = vld1q_f32(m + 12);
	// Lane selection works on 64-bit halves here: x and y live in the low
	// half, z is lane 0 of the high half.
	const float32x4_t xyTerms = vaddq_f32(vmulq_lane_f32(col0, vget_low_f32(vec), 0), vmulq_lane_f32(col1, vget_low_f32(vec), 1));
	const float32x4_t zTermPlusCol3 = vaddq_f32(vmulq_lane_f32(col2, vget_high_f32(vec), 0), col3);
	return vaddq_f32(xyTerms, zTermPlusCol3);
}
#endif
1015

1016
// Transforms the point v by the 4x4 matrix m (four consecutive 4-float
// columns; the fourth column is added unscaled) and writes all four resulting
// components to vecOut.
inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16]) {
#if defined(_M_SSE)
	const __m128 sum = Vec3ByMatrix44Internal(_mm_set1_ps(v[0]), _mm_set1_ps(v[1]), _mm_set1_ps(v[2]), m);
	_mm_storeu_ps(vecOut, sum);
#elif PPSSPP_ARCH(ARM_NEON)
	// Pad the input to four lanes so it can be loaded as one register.
	float padded[4] = {v[0], v[1], v[2], 1.0f};
	const float32x4_t sum = Vec3ByMatrix44Internal(vld1q_f32(padded), m);
	vst1q_f32(vecOut, sum);
#else
	// Scalar fallback: one output row per statement.
	vecOut[0] = v[0] * m[0] + v[1] * m[4] + v[2] * m[8] + m[12];
	vecOut[1] = v[0] * m[1] + v[1] * m[5] + v[2] * m[9] + m[13];
	vecOut[2] = v[0] * m[2] + v[1] * m[6] + v[2] * m[10] + m[14];
	vecOut[3] = v[0] * m[3] + v[1] * m[7] + v[2] * m[11] + m[15];
#endif
}
1034

1035
// Vec3f flavour of the 4x4 point transform: returns the four-component result
// of applying m to v, with the fourth matrix column added unscaled.
inline Vec4f MATH3D_CALL Vec3ByMatrix44(const Vec3f v, const float m[16]) {
#if defined(_M_SSE)
	// SAFE_M128 guards against a misaligned by-value argument on 32-bit x86.
	const __m128 src = SAFE_M128(v.vec);
	// Broadcast each of the three components across a full register.
	const __m128 xxxx = _mm_shuffle_ps(src, src, _MM_SHUFFLE(0, 0, 0, 0));
	const __m128 yyyy = _mm_shuffle_ps(src, src, _MM_SHUFFLE(1, 1, 1, 1));
	const __m128 zzzz = _mm_shuffle_ps(src, src, _MM_SHUFFLE(2, 2, 2, 2));
	return Vec3ByMatrix44Internal(xxxx, yyyy, zzzz, m);
#elif PPSSPP_ARCH(ARM_NEON)
	return Vec3ByMatrix44Internal(v.vec, m);
#else
	// No SIMD available: defer to the float-array overload.
	Vec4f result;
	Vec3ByMatrix44(result.AsArray(), v.AsArray(), m);
	return result;
#endif
}
1050

1051
#if defined(_M_SSE)
1052
// x, y, and z should be broadcast.  Should only be used through Vec3f version.
1053
inline __m128 MATH3D_CALL Norm3ByMatrix43Internal(__m128 x, __m128 y, __m128 z, const float m[12]) {
1054
	__m128 col0 = _mm_loadu_ps(m);
1055
	__m128 col1 = _mm_loadu_ps(m + 3);
1056
	__m128 col2 = _mm_loadu_ps(m + 6);
1057
	__m128 sum = _mm_add_ps(
1058
		_mm_add_ps(_mm_mul_ps(col0, x), _mm_mul_ps(col1, y)),
1059
		_mm_mul_ps(col2, z));
1060
	return sum;
1061
}
1062
#elif PPSSPP_ARCH(ARM64_NEON)
1063
// ARM64 NEON variant.  Computes m[0..3] * vec[0] + m[3..6] * vec[1] + m[6..9] * vec[2].
// m is a 12-float (4x3) matrix; the three 4-float loads overlap, so only
// lanes 0-2 of the result are meaningful.  Lane 3 of vec is never read.
inline float32x4_t Norm3ByMatrix43Internal(float32x4_t vec, const float m[12]) {
	const float32x4_t col0 = vld1q_f32(m);
	const float32x4_t col1 = vld1q_f32(m + 3);
	const float32x4_t col2 = vld1q_f32(m + 6);
	const float32x4_t xyTerms = vaddq_f32(vmulq_laneq_f32(col0, vec, 0), vmulq_laneq_f32(col1, vec, 1));
	return vaddq_f32(xyTerms, vmulq_laneq_f32(col2, vec, 2));
}
1072
#elif PPSSPP_ARCH(ARM_NEON)
1073
// 32-bit NEON variant.  Computes m[0..3] * vec[0] + m[3..6] * vec[1] + m[6..9] * vec[2].
// m is a 12-float (4x3) matrix; the three 4-float loads overlap, so only
// lanes 0-2 of the result are meaningful.  Lane 3 of vec is never read.
inline float32x4_t Norm3ByMatrix43Internal(float32x4_t vec, const float m[12]) {
	const float32x2_t xy = vget_low_f32(vec);
	const float32x2_t zw = vget_high_f32(vec);
	const float32x4_t col0 = vld1q_f32(m);
	const float32x4_t col1 = vld1q_f32(m + 3);
	const float32x4_t col2 = vld1q_f32(m + 6);
	const float32x4_t xyTerms = vaddq_f32(vmulq_lane_f32(col0, xy, 0), vmulq_lane_f32(col1, xy, 1));
	return vaddq_f32(xyTerms, vmulq_lane_f32(col2, zw, 0));
}
1082
#endif
1083

1084
// Multiplies the 3-float vector v by the first 9 floats of the 12-float matrix m
// (no m[9..11] term is added) and writes 3 floats to vecOut.
inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) {
#if defined(_M_SSE)
	__m128 x = _mm_set1_ps(v[0]);
	__m128 y = _mm_set1_ps(v[1]);
	__m128 z = _mm_set1_ps(v[2]);
	__m128 sum = Norm3ByMatrix43Internal(x, y, z, m);
	// Only three lanes are valid output; store them one by one so vecOut[3] is never touched.
	vecOut[0] = _mm_cvtss_f32(sum);
	vecOut[1] = vectorGetByIndex<1>(sum);
	vecOut[2] = vectorGetByIndex<2>(sum);
#elif PPSSPP_ARCH(ARM_NEON)
	// v only holds 3 floats: a direct 128-bit load would read one float past its end.
	// Stage it in a padded buffer first (lane 3 is ignored by the helper).
	const float padded[4] = {v[0], v[1], v[2], 0.0f};
	float32x4_t sum = Norm3ByMatrix43Internal(vld1q_f32(padded), m);
	vecOut[0] = vgetq_lane_f32(sum, 0);
	vecOut[1] = vgetq_lane_f32(sum, 1);
	vecOut[2] = vgetq_lane_f32(sum, 2);
#else
	vecOut[0] = v[0] * m[0] + v[1] * m[3] + v[2] * m[6];
	vecOut[1] = v[0] * m[1] + v[1] * m[4] + v[2] * m[7];
	vecOut[2] = v[0] * m[2] + v[1] * m[5] + v[2] * m[8];
#endif
}
1104

1105
// Vec3f overload of Norm3ByMatrix43: takes the vector by value and returns the result.
inline Vec3f MATH3D_CALL Norm3ByMatrix43(const Vec3f v, const float m[12]) {
#if defined(_M_SSE)
	const __m128 src = SAFE_M128(v.vec);
	// Splat x, y and z across all lanes, as the internal helper requires.
	const __m128 xxxx = _mm_shuffle_ps(src, src, _MM_SHUFFLE(0, 0, 0, 0));
	const __m128 yyyy = _mm_shuffle_ps(src, src, _MM_SHUFFLE(1, 1, 1, 1));
	const __m128 zzzz = _mm_shuffle_ps(src, src, _MM_SHUFFLE(2, 2, 2, 2));
	return Norm3ByMatrix43Internal(xxxx, yyyy, zzzz, m);
#elif PPSSPP_ARCH(ARM_NEON)
	return Norm3ByMatrix43Internal(v.vec, m);
#else
	// No SIMD available: defer to the float-array overload.
	Vec3f result;
	Norm3ByMatrix43(result.AsArray(), v.AsArray(), m);
	return result;
#endif
}
1120

1121
inline void Matrix4ByMatrix4(float out[16], const float a[16], const float b[16]) {
1122
	fast_matrix_mul_4x4(out, b, a);
1123
}
1124

1125
// Expands 12 floats (four groups of 3) into 16 floats (four groups of 4).
// The extra fourth element is 0 for the first three groups and 1 for the last.
inline void ConvertMatrix4x3To4x4(float *m4x4, const float *m4x3) {
	for (int group = 0; group < 4; group++) {
		const float *src = m4x3 + group * 3;
		float *dst = m4x4 + group * 4;
		dst[0] = src[0];
		dst[1] = src[1];
		dst[2] = src[2];
		dst[3] = (group == 3) ? 1.0f : 0.0f;
	}
}
1143

1144
// Writes the 12 floats of m4x3 into the first 12 floats of m4x4 in transposed order
// (m4x4[row * 4 + col] = m4x3[col * 3 + row]) and fills m4x4[12..15] with 0, 0, 0, 1.
inline void ConvertMatrix4x3To4x4Transposed(float *m4x4, const float *m4x3) {
#if PPSSPP_ARCH(ARM_NEON)
	// A 3-way de-interleaving load (vld3q) yields exactly the three transposed rows.
	const float32x4x3_t rows = vld3q_f32(m4x3);
	vst1q_f32(m4x4 + 0, rows.val[0]);
	vst1q_f32(m4x4 + 4, rows.val[1]);
	vst1q_f32(m4x4 + 8, rows.val[2]);
#else
	for (int row = 0; row < 3; row++) {
		for (int col = 0; col < 4; col++) {
			m4x4[row * 4 + col] = m4x3[col * 3 + row];
		}
	}
#endif
	float *lastRow = m4x4 + 12;
	lastRow[0] = 0.0f;
	lastRow[1] = 0.0f;
	lastRow[2] = 0.0f;
	lastRow[3] = 1.0f;
}
1170

1171
// Transposes 12 floats.  Source indices (hex) shown at their destination positions:
//   0 3 6 9
//   1 4 7 A
//   2 5 8 B
// i.e. out[row * 4 + col] = m4x3[col * 3 + row].
// Despite the parameter name, only the first 12 floats of m4x4 are written.
// NEON builds do this with a single vld3q load; other builds use scalar copies.
inline void ConvertMatrix4x3To3x4Transposed(float *m4x4, const float *m4x3) {
#if PPSSPP_ARCH(ARM_NEON)
	// A 3-way de-interleaving load (vld3q) yields exactly the three transposed rows.
	const float32x4x3_t rows = vld3q_f32(m4x3);
	vst1q_f32(m4x4 + 0, rows.val[0]);
	vst1q_f32(m4x4 + 4, rows.val[1]);
	vst1q_f32(m4x4 + 8, rows.val[2]);
#else
	for (int row = 0; row < 3; row++) {
		for (int col = 0; col < 4; col++) {
			m4x4[row * 4 + col] = m4x3[col * 3 + row];
		}
	}
#endif
}
1201

1202
// Writes the transpose of the 4x4 matrix `in` to `out` (out[i * 4 + j] = in[j * 4 + i]).
// Safe to call with out == in (exact in-place transpose): each mirrored pair is
// read in full before either element is written.  Partially overlapping buffers
// are not supported.
inline void Transpose4x4(float out[16], const float in[16]) {
	for (int i = 0; i < 4; i++) {
		// The diagonal maps onto itself.
		out[i * 4 + i] = in[i * 4 + i];
		for (int j = i + 1; j < 4; j++) {
			const float upper = in[i * 4 + j];
			const float lower = in[j * 4 + i];
			out[i * 4 + j] = lower;
			out[j * 4 + i] = upper;
		}
	}
}
1209

1210
namespace Math3D {
1211

1212
template<typename T>
1213
inline T Dot(const Vec2<T>& a, const Vec2<T>& b)
1214
{
1215
	return a.x*b.x + a.y*b.y;
1216
}
1217

1218
template<typename T>
1219
inline T Dot(const Vec3<T>& a, const Vec3<T>& b)
1220
{
1221
	return a.x*b.x + a.y*b.y + a.z*b.z;
1222
}
1223

1224
template<typename T>
1225
inline T Dot(const Vec4<T>& a, const Vec4<T>& b)
1226
{
1227
	return a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w;
1228
}
1229

1230
template<typename T>
1231
inline Vec3<T> Cross(const Vec3<T>& a, const Vec3<T>& b)
1232
{
1233
	return Vec3<T>(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
1234
}
1235

1236
template<typename T>
1237
inline Vec3Packed<T> Cross(const Vec3Packed<T>& a, const Vec3Packed<T>& b)
1238
{
1239
	return Vec3Packed<T>(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
1240
}
1241

1242
template<>
1243
inline Vec3<float> Vec3<float>::FromRGB(unsigned int rgb)
1244
{
1245
#if defined(_M_SSE)
1246
	__m128i z = _mm_setzero_si128();
1247
	__m128i c = _mm_cvtsi32_si128(rgb);
1248
	c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
1249
	return Vec3<float>(_mm_mul_ps(_mm_cvtepi32_ps(c), _mm_set_ps1(1.0f / 255.0f)));
1250
#elif PPSSPP_ARCH(ARM_NEON)
1251
	uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgb));
1252
	uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
1253
	return Vec3<float>(vmulq_f32(vcvtq_f32_u32(u), vdupq_n_f32(1.0f / 255.0f)));
1254
#else
1255
	return Vec3((rgb & 0xFF) * (1.0f/255.0f),
1256
				((rgb >> 8) & 0xFF) * (1.0f/255.0f),
1257
				((rgb >> 16) & 0xFF) * (1.0f/255.0f));
1258
#endif
1259
}
1260

1261
template<>
1262
inline Vec3<int> Vec3<int>::FromRGB(unsigned int rgb)
1263
{
1264
#if defined(_M_SSE)
1265
	__m128i z = _mm_setzero_si128();
1266
	__m128i c = _mm_cvtsi32_si128(rgb);
1267
	c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
1268
	return Vec3<int>(c);
1269
#elif PPSSPP_ARCH(ARM_NEON)
1270
	uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgb));
1271
	uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
1272
	return Vec3<int>(vreinterpretq_s32_u32(u));
1273
#else
1274
	return Vec3(rgb & 0xFF, (rgb >> 8) & 0xFF, (rgb >> 16) & 0xFF);
1275
#endif
1276
}
1277

1278
template<>
1279
__forceinline unsigned int Vec3<float>::ToRGB() const
1280
{
1281
#if defined(_M_SSE)
1282
	__m128i c = _mm_cvtps_epi32(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(255.0f)));
1283
	__m128i c16 = _mm_packs_epi32(c, c);
1284
	return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)) & 0x00FFFFFF;
1285
#elif PPSSPP_ARCH(ARM_NEON)
1286
	uint16x4_t c16 = vqmovun_s32(vcvtq_s32_f32(vmulq_f32(vsetq_lane_f32(0.0f, vec, 3), vdupq_n_f32(255.0f))));
1287
	uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
1288
	return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
1289
#else
1290
	return (clamp_u8((int)(r() * 255.f)) << 0) |
1291
			(clamp_u8((int)(g() * 255.f)) << 8) |
1292
			(clamp_u8((int)(b() * 255.f)) << 16);
1293
#endif
1294
}
1295

1296
template<>
1297
__forceinline unsigned int Vec3<int>::ToRGB() const
1298
{
1299
#if defined(_M_SSE)
1300
	__m128i c16 = _mm_packs_epi32(SAFE_M128I(ivec), SAFE_M128I(ivec));
1301
	return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)) & 0x00FFFFFF;
1302
#elif PPSSPP_ARCH(ARM_NEON)
1303
	uint16x4_t c16 = vqmovun_s32(vsetq_lane_s32(0, ivec, 3));
1304
	uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
1305
	return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
1306
#else
1307
	return clamp_u8(r()) | (clamp_u8(g()) << 8) | (clamp_u8(b()) << 16);
1308
#endif
1309
}
1310

1311
template<>
1312
inline Vec4<float> Vec4<float>::FromRGBA(unsigned int rgba)
1313
{
1314
#if defined(_M_SSE)
1315
	__m128i z = _mm_setzero_si128();
1316
	__m128i c = _mm_cvtsi32_si128(rgba);
1317
	c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
1318
	return Vec4<float>(_mm_mul_ps(_mm_cvtepi32_ps(c), _mm_set_ps1(1.0f / 255.0f)));
1319
#elif PPSSPP_ARCH(ARM_NEON)
1320
	uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgba));
1321
	uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
1322
	return Vec4<float>(vmulq_f32(vcvtq_f32_u32(u), vdupq_n_f32(1.0f / 255.0f)));
1323
#else
1324
	return Vec4((rgba & 0xFF) * (1.0f/255.0f),
1325
				((rgba >> 8) & 0xFF) * (1.0f/255.0f),
1326
				((rgba >> 16) & 0xFF) * (1.0f/255.0f),
1327
				((rgba >> 24) & 0xFF) * (1.0f/255.0f));
1328
#endif
1329
}
1330

1331
template<typename T>
1332
inline Vec4<T> Vec4<T>::FromRGBA(const u8 *rgba)
1333
{
1334
	return Vec4<T>::FromRGBA(*(unsigned int *)rgba);
1335
}
1336

1337
template<>
1338
inline Vec4<int> Vec4<int>::FromRGBA(unsigned int rgba)
1339
{
1340
#if defined(_M_SSE)
1341
	__m128i z = _mm_setzero_si128();
1342
	__m128i c = _mm_cvtsi32_si128(rgba);
1343
	c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
1344
	return Vec4<int>(c);
1345
#elif PPSSPP_ARCH(ARM_NEON)
1346
	uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgba));
1347
	uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
1348
	return Vec4<int>(vreinterpretq_s32_u32(u));
1349
#else
1350
	return Vec4(rgba & 0xFF, (rgba >> 8) & 0xFF, (rgba >> 16) & 0xFF, (rgba >> 24) & 0xFF);
1351
#endif
1352
}
1353

1354
template<>
1355
__forceinline unsigned int Vec4<float>::ToRGBA() const
1356
{
1357
#if defined(_M_SSE)
1358
	__m128i c = _mm_cvtps_epi32(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(255.0f)));
1359
	__m128i c16 = _mm_packs_epi32(c, c);
1360
	return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16));
1361
#elif PPSSPP_ARCH(ARM_NEON)
1362
	uint16x4_t c16 = vqmovun_s32(vcvtq_s32_f32(vmulq_f32(vec, vdupq_n_f32(255.0f))));
1363
	uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
1364
	return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
1365
#else
1366
	return (clamp_u8((int)(r() * 255.f)) << 0) |
1367
			(clamp_u8((int)(g() * 255.f)) << 8) |
1368
			(clamp_u8((int)(b() * 255.f)) << 16) |
1369
			(clamp_u8((int)(a() * 255.f)) << 24);
1370
#endif
1371
}
1372

1373
template<>
1374
__forceinline unsigned int Vec4<int>::ToRGBA() const
1375
{
1376
#if defined(_M_SSE)
1377
	__m128i c16 = _mm_packs_epi32(SAFE_M128I(ivec), SAFE_M128I(ivec));
1378
	return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16));
1379
#elif PPSSPP_ARCH(ARM_NEON)
1380
	uint16x4_t c16 = vqmovun_s32(ivec);
1381
	uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
1382
	return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
1383
#else
1384
	return clamp_u8(r()) | (clamp_u8(g()) << 8) | (clamp_u8(b()) << 16) | (clamp_u8(a()) << 24);
1385
#endif
1386
}
1387

1388
template<typename T>
1389
__forceinline void Vec4<T>::ToRGBA(u8 *rgba) const
1390
{
1391
	*(u32 *)rgba = ToRGBA();
1392
}
1393

1394
#if defined(_M_SSE)
1395
// Specialized for SIMD optimization
1396

1397
// Vec3<float> operation
1398
template<>
1399
inline void Vec3<float>::operator += (const Vec3<float> &other) {
1400
	vec = _mm_add_ps(SAFE_M128(vec), SAFE_M128(other.vec));
1401
}
1402

1403
template<>
1404
inline Vec3<float> Vec3<float>::operator + (const Vec3 &other) const {
1405
	return Vec3<float>(_mm_add_ps(SAFE_M128(vec), SAFE_M128(other.vec)));
1406
}
1407

1408
template<>
1409
inline void Vec3<float>::operator -= (const Vec3<float> &other) {
1410
	vec = _mm_sub_ps(SAFE_M128(vec), SAFE_M128(other.vec));
1411
}
1412

1413
template<>
1414
inline Vec3<float> Vec3<float>::operator - (const Vec3 &other) const {
1415
	return Vec3<float>(_mm_sub_ps(SAFE_M128(vec), SAFE_M128(other.vec)));
1416
}
1417

1418
template<>
1419
inline Vec3<float> Vec3<float>::operator * (const Vec3 &other) const {
1420
	return Vec3<float>(_mm_mul_ps(SAFE_M128(vec), SAFE_M128(other.vec)));
1421
}
1422

1423
template<> template<>
1424
inline Vec3<float> Vec3<float>::operator * (const float &other) const {
1425
	return Vec3<float>(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(other)));
1426
}
1427

1428
// Vec4<int> operation
1429
template<>
1430
inline Vec4<int> Vec4<int>::operator + (const Vec4 &other) const {
1431
	return Vec4<int>(_mm_add_epi32(SAFE_M128I(ivec), SAFE_M128I(other.ivec)));
1432
}
1433

1434
template<>
1435
inline Vec4<int> Vec4<int>::operator * (const Vec4 &other) const {
1436
	__m128i a = SAFE_M128I(ivec);
1437
	__m128i b = SAFE_M128I(other.ivec);
1438
	// Intel in its immense wisdom decided that
1439
	// SSE2 does not get _mm_mullo_epi32(),
1440
	// so we do it this way. This is what clang does,
1441
	// which seems about as good as it gets.
1442
	__m128i m02 = _mm_mul_epu32(a, b);
1443
	__m128i m13 = _mm_mul_epu32(
1444
		_mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 1, 1)),
1445
		_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)));
1446
	__m128i ret = _mm_unpacklo_epi32(
1447
		_mm_shuffle_epi32(m02, _MM_SHUFFLE(3, 2, 2, 0)),
1448
		_mm_shuffle_epi32(m13, _MM_SHUFFLE(3, 2, 2, 0)));
1449
	return Vec4<int>(ret);
1450
}
1451

1452
template<> template<>
1453
inline Vec4<int> Vec4<int>::operator * (const int &other) const {
1454
	return (*this) * Vec4<int>(_mm_set1_epi32(other));
1455
}
1456

1457
template<>
1458
inline Vec4<int> Vec4<int>::operator | (const Vec4 &other) const {
1459
	return Vec4<int>(_mm_or_si128(SAFE_M128I(ivec), SAFE_M128I(other.ivec)));
1460
}
1461

1462
template<>
1463
inline Vec4<int> Vec4<int>::operator & (const Vec4 &other) const {
1464
	return Vec4<int>(_mm_and_si128(SAFE_M128I(ivec), SAFE_M128I(other.ivec)));
1465
}
1466

1467
// NOTE: modern GCC, clang, and MSVC are all ok with
1468
// non-compile-time-const amount for _mm_slli_epi32/_mm_srli_epi32.
1469
template<>
1470
inline Vec4<int> Vec4<int>::operator << (const int amount) const {
1471
	return Vec4<int>(_mm_slli_epi32(SAFE_M128I(ivec), amount));
1472
}
1473

1474
template<>
1475
inline Vec4<int> Vec4<int>::operator >> (const int amount) const {
1476
	return Vec4<int>(_mm_srli_epi32(SAFE_M128I(ivec), amount));
1477
}
1478

1479
// Vec4<float> operation
1480
template<>
1481
inline void Vec4<float>::operator += (const Vec4<float> &other) {
1482
	vec = _mm_add_ps(SAFE_M128(vec), SAFE_M128(other.vec));
1483
}
1484

1485
template<>
1486
inline Vec4<float> Vec4<float>::operator + (const Vec4 &other) const {
1487
	return Vec4<float>(_mm_add_ps(SAFE_M128(vec), SAFE_M128(other.vec)));
1488
}
1489

1490
template<>
1491
inline Vec4<float> Vec4<float>::operator * (const Vec4 &other) const {
1492
	return Vec4<float>(_mm_mul_ps(SAFE_M128(vec), SAFE_M128(other.vec)));
1493
}
1494

1495
template<> template<>
1496
inline Vec4<float> Vec4<float>::operator * (const float &other) const {
1497
	return Vec4<float>(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(other)));
1498
}
1499

1500
// Vec3<float> cross product
1501
template<>
1502
inline Vec3<float> Cross(const Vec3<float> &a, const Vec3<float> &b)
1503
{
1504
#if PPSSPP_ARCH(X86)
1505
	__m128 avec = _mm_loadu_ps(&a.x);
1506
	__m128 bvec = _mm_loadu_ps(&b.x);
1507
#else
1508
	__m128 avec = a.vec;
1509
	__m128 bvec = b.vec;
1510
#endif
1511
	const __m128 left = _mm_mul_ps(_mm_shuffle_ps(avec, avec, _MM_SHUFFLE(3, 0, 2, 1)), _mm_shuffle_ps(bvec, bvec, _MM_SHUFFLE(3, 1, 0, 2)));
1512
	const __m128 right = _mm_mul_ps(_mm_shuffle_ps(avec, avec, _MM_SHUFFLE(3, 1, 0, 2)), _mm_shuffle_ps(bvec, bvec, _MM_SHUFFLE(3, 0, 2, 1)));
1513
	return _mm_sub_ps(left, right);
1514
}
1515
#endif
1516

1517
}; // namespace Math3D
1518

1519
// Linear interpolation with a float weight: t = 0.0 yields begin, t = 1.0 yields end.
template<typename X>
inline X Lerp(const X& begin, const X& end, const float t)
{
	const float beginWeight = 1.f - t;
	return begin * beginWeight + end * t;
}
1525

1526
// Linear interpolation with an integer weight: t = 0 yields begin, t = base yields end.
// The weighted sum is divided by base at the very end.
template<typename X, int base>
inline X LerpInt(const X& begin, const X& end, const int t)
{
	const int beginWeight = base - t;
	return (begin * beginWeight + end * t) / base;
}
1532

1533
Product

Resources

Company