Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
hrydgard
GitHub Repository: hrydgard/ppsspp
Path: blob/master/GPU/Math3D.h
3185 views
1
// Copyright (c) 2012- PPSSPP Project.
2
3
// This program is free software: you can redistribute it and/or modify
4
// it under the terms of the GNU General Public License as published by
5
// the Free Software Foundation, version 2.0 or later versions.
6
7
// This program is distributed in the hope that it will be useful,
8
// but WITHOUT ANY WARRANTY; without even the implied warranty of
9
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10
// GNU General Public License 2.0 for more details.
11
12
// A copy of the GPL 2.0 should have been included with the program.
13
// If not, see http://www.gnu.org/licenses/
14
15
// Official git repository and contact information can be found at
16
// https://github.com/hrydgard/ppsspp and http://www.ppsspp.org/.
17
18
#pragma once
19
20
#include "ppsspp_config.h"

#include <cmath>
#include <cstring>
#include <type_traits>

#include "Common/Common.h"
#include "Core/Util/AudioFormat.h" // for clamp_u8
#include "Common/Math/fast/fast_matrix.h"
#include "Common/Math/SIMDHeaders.h"
28
29
// Calling convention applied to the helpers in this header that pass SIMD
// values: __vectorcall on Windows when building with MSVC, clang or the Intel
// compiler, and nothing (the default convention) everywhere else.
#if !(PPSSPP_PLATFORM(WINDOWS) && (defined(_MSC_VER) || defined(__clang__) || defined(__INTEL_COMPILER)))
#define MATH3D_CALL
#else
#define MATH3D_CALL __vectorcall
#endif
34
35
// There's probably a better place to define these macros.
//
// SAFE_M128(v) / SAFE_M128I(v) produce the value of an __m128 / __m128i
// object. Everywhere except 32-bit x86 this is just (v).
#if !(PPSSPP_ARCH(X86))
// x64, FWIW also works for non-x86.
#define SAFE_M128(v) (v)
#define SAFE_M128I(v) (v)
#else
// On 32-bit x86, MSVC does not guarantee alignment for SSE arguments passed
// on the stack (Compiler Error C2719), see e.g.:
// https://stackoverflow.com/questions/10484422/msvc-cannot-send-function-parameters-of-16byte-alignment-on-x86
// https://stackoverflow.com/questions/28488986/formal-parameter-with-declspecalign16-wont-be-aligned
// As a workaround, the "dangerous" cases are read with unaligned loads
// (loadu*) on 32-bit x86. Compilers are decently ok at eliminating these
// extra loads, at least in trivial cases.
// NOTE: not to be outdone, GCC has its own flavor of broken, see e.g.:
// http://www.peterstock.co.uk/games/mingw_sse/
// https://github.com/nothings/stb/issues/81
// which is probably worse since it breaks alignment of locals and/or spills,
// but that, hopefully, does not affect PPSSPP (modern GCC+Linux is 16-byte
// aligned on x86, and MinGW is not a supported PPSSPP target).
// NOTE: the odd-looking double casts add a bit of type-safety: the inner
// static_cast only compiles when v really has the expected vector type.
#define SAFE_M128(v) _mm_loadu_ps(reinterpret_cast<const float *>(static_cast<const __m128 *>(&(v))))
#define SAFE_M128I(v) _mm_loadu_si128(reinterpret_cast<const __m128i *>(static_cast<const __m128i *>(&(v))))
#endif
57
58
namespace Math3D {
59
60
// Helper for the Vec classes: limits v to the range [low, high].
// The upper bound is checked first, so any v greater than high yields high
// (even if low happens to be greater than high).
template<typename T>
inline static T VecClamp(const T &v, const T &low, const T &high)
{
	return (v > high) ? high : ((v < low) ? low : v);
}
70
71
// Generic two-component vector. The components are stored as x and y and are
// also reachable through the texture-style aliases u/v and s/t.
template<typename T>
class Vec2 {
public:
	struct {
		T x, y;
	};

	// Pointer to x; operator[] relies on y being stored directly after it.
	T *AsArray() { return &x; }
	const T *AsArray() const { return &x; }

	// The default constructor leaves both components uninitialized.
	Vec2() {}
	Vec2(const T a[2]) : x(a[0]), y(a[1]) {}
	Vec2(const T &_x, const T &_y) : x(_x), y(_y) {}

	// Converts each component to T2.
	template<typename T2>
	Vec2<T2> Cast() const {
		return Vec2<T2>(static_cast<T2>(x), static_cast<T2>(y));
	}

	// Builds a vector whose components are both f.
	static Vec2 AssignToAll(const T &f) { return Vec2(f, f); }

	// Copies the components to a[0] and a[1].
	void Write(T a[2]) {
		a[0] = x;
		a[1] = y;
	}

	// Component-wise arithmetic with another vector.
	Vec2 operator +(const Vec2 &rhs) const { return Vec2(x + rhs.x, y + rhs.y); }
	void operator +=(const Vec2 &rhs) {
		x += rhs.x;
		y += rhs.y;
	}
	Vec2 operator -(const Vec2 &rhs) const { return Vec2(x - rhs.x, y - rhs.y); }
	void operator -=(const Vec2 &rhs) {
		x -= rhs.x;
		y -= rhs.y;
	}
	Vec2 operator -() const { return Vec2(-x, -y); }
	Vec2 operator *(const Vec2 &rhs) const { return Vec2(x * rhs.x, y * rhs.y); }

	// Multiplication / division of every component by a single value f.
	template<typename V>
	Vec2 operator *(const V &f) const { return Vec2(x * f, y * f); }
	template<typename V>
	void operator *=(const V &f) {
		x *= f;
		y *= f;
	}
	template<typename V>
	Vec2 operator /(const V &f) const { return Vec2(x / f, y / f); }
	template<typename V>
	void operator /=(const V &f) { *this = *this / f; }

	// Squared length (x*x + y*y).
	T Length2() const { return x * x + y * y; }

	// Clamps each component to [l, h] via VecClamp.
	Vec2 Clamp(const T &l, const T &h) const {
		return Vec2(VecClamp(x, l, h), VecClamp(y, l, h));
	}

	// Only implemented for T=float
	float Length() const;
	void SetLength(const float l);
	Vec2 WithLength(const float l) const;
	float Distance2To(const Vec2 &other) const;
	Vec2 Normalized() const;
	float Normalize(); // returns the previous length, which is often useful

	// Indexed access: 0 is x, 1 is y (so vector[1] = 3 sets vector.y).
	// No bounds checking is performed.
	T &operator [](int i) { return (&x)[i]; }
	T operator [](const int i) const { return (&x)[i]; }

	void SetZero() {
		x = 0;
		y = 0;
	}

	// Common aliases: UV (texel coordinates), ST (texture coordinates)
	T &u() { return x; }
	T &v() { return y; }
	T &s() { return x; }
	T &t() { return y; }

	const T &u() const { return x; }
	const T &v() const { return y; }
	const T &s() const { return x; }
	const T &t() const { return y; }

	// Swizzlers: return a copy with the two components swapped.
	const Vec2 yx() const { return Vec2(y, x); }
	const Vec2 vu() const { return Vec2(y, x); }
	const Vec2 ts() const { return Vec2(y, x); }
};
194
195
template<typename T>
196
class Vec3Packed;
197
198
template<typename T>
199
class Vec3
200
{
201
public:
202
union
203
{
204
struct
205
{
206
T x,y,z;
207
};
208
#if defined(_M_SSE)
209
__m128i ivec;
210
__m128 vec;
211
#elif PPSSPP_ARCH(ARM_NEON)
212
int32x4_t ivec;
213
float32x4_t vec;
214
#endif
215
};
216
217
T* AsArray() { return &x; }
218
const T* AsArray() const { return &x; }
219
220
Vec3() {}
221
Vec3(const T a[3]) : x(a[0]), y(a[1]), z(a[2]) {}
222
constexpr Vec3(const T& _x, const T& _y, const T& _z) : x(_x), y(_y), z(_z) {}
223
Vec3(const Vec2<T>& _xy, const T& _z) : x(_xy.x), y(_xy.y), z(_z) {}
224
#if defined(_M_SSE)
225
constexpr Vec3(const __m128 &_vec) : vec(_vec) {}
226
constexpr Vec3(const __m128i &_ivec) : ivec(_ivec) {}
227
Vec3(const Vec3Packed<T> &_xyz) {
228
vec = _mm_loadu_ps(_xyz.AsArray());
229
}
230
#elif PPSSPP_ARCH(ARM_NEON)
231
Vec3(const float32x4_t &_vec) : vec(_vec) {}
232
#if !defined(_MSC_VER)
233
Vec3(const int32x4_t &_ivec) : ivec(_ivec) {}
234
#endif
235
Vec3(const Vec3Packed<T> &_xyz) {
236
vec = vld1q_f32(_xyz.AsArray());
237
}
238
#else
239
Vec3(const Vec3Packed<T> &_xyz) : x(_xyz.x), y(_xyz.y), z(_xyz.z) {}
240
#endif
241
242
template<typename T2>
243
constexpr Vec3<T2> Cast() const
244
{
245
return Vec3<T2>((T2)x, (T2)y, (T2)z);
246
}
247
248
// Only implemented for T=int and T=float
249
static Vec3 FromRGB(unsigned int rgb);
250
unsigned int ToRGB() const; // alpha bits set to zero
251
252
static constexpr Vec3 AssignToAll(const T& f)
253
{
254
return Vec3<T>(f, f, f);
255
}
256
257
void Write(T a[3])
258
{
259
a[0] = x; a[1] = y; a[2] = z;
260
}
261
262
Vec3 operator +(const Vec3 &other) const
263
{
264
return Vec3(x+other.x, y+other.y, z+other.z);
265
}
266
void operator += (const Vec3 &other)
267
{
268
x+=other.x; y+=other.y; z+=other.z;
269
}
270
Vec3 operator -(const Vec3 &other) const
271
{
272
return Vec3(x-other.x, y-other.y, z-other.z);
273
}
274
void operator -= (const Vec3 &other)
275
{
276
x-=other.x; y-=other.y; z-=other.z;
277
}
278
Vec3 operator -() const
279
{
280
return Vec3(-x,-y,-z);
281
}
282
Vec3 operator * (const Vec3 &other) const
283
{
284
return Vec3(x*other.x, y*other.y, z*other.z);
285
}
286
template<typename V>
287
Vec3 operator * (const V& f) const
288
{
289
return Vec3(x*f,y*f,z*f);
290
}
291
template<typename V>
292
void operator *= (const V& f)
293
{
294
x*=f; y*=f; z*=f;
295
}
296
template<typename V>
297
Vec3 operator / (const V& f) const
298
{
299
return Vec3(x/f,y/f,z/f);
300
}
301
template<typename V>
302
void operator /= (const V& f)
303
{
304
*this = *this / f;
305
}
306
307
bool operator ==(const Vec3 &other) const {
308
return x == other.x && y == other.y && z == other.z;
309
}
310
311
T Length2() const
312
{
313
return x*x + y*y + z*z;
314
}
315
316
Vec3 Clamp(const T &l, const T &h) const
317
{
318
return Vec3(VecClamp(x, l, h), VecClamp(y, l, h), VecClamp(z, l, h));
319
}
320
321
// Only implemented for T=float
322
float Length() const;
323
void SetLength(const float l);
324
Vec3 WithLength(const float l) const;
325
float Distance2To(const Vec3 &other) const;
326
Vec3 Normalized(bool useSSE4 = false) const;
327
Vec3 NormalizedOr001(bool useSSE4 = false) const;
328
float Normalize(); // returns the previous length, which is often useful
329
float NormalizeOr001();
330
331
T& operator [] (int i) //allow vector[2] = 3 (vector.z=3)
332
{
333
return *((&x) + i);
334
}
335
T operator [] (const int i) const
336
{
337
return *((&x) + i);
338
}
339
340
void SetZero()
341
{
342
x=0; y=0; z=0;
343
}
344
345
// Common aliases: UVW (texel coordinates), RGB (colors), STQ (texture coordinates)
346
T& u() { return x; }
347
T& v() { return y; }
348
T& w() { return z; }
349
350
T& r() { return x; }
351
T& g() { return y; }
352
T& b() { return z; }
353
354
T& s() { return x; }
355
T& t() { return y; }
356
T& q() { return z; }
357
358
const T& u() const { return x; }
359
const T& v() const { return y; }
360
const T& w() const { return z; }
361
362
const T& r() const { return x; }
363
const T& g() const { return y; }
364
const T& b() const { return z; }
365
366
const T& s() const { return x; }
367
const T& t() const { return y; }
368
const T& q() const { return z; }
369
370
// swizzlers - create a subvector of specific components
371
// e.g. Vec2 uv() { return Vec2(x,y); }
372
// _DEFINE_SWIZZLER2 defines a single such function, DEFINE_SWIZZLER2 defines all of them for all component names (x<->r) and permutations (xy<->yx)
373
#define _DEFINE_SWIZZLER2(a, b, name) const Vec2<T> name() const { return Vec2<T>(a, b); }
374
#define DEFINE_SWIZZLER2(a, b, a2, b2, a3, b3, a4, b4) \
375
_DEFINE_SWIZZLER2(a, b, a##b); \
376
_DEFINE_SWIZZLER2(a, b, a2##b2); \
377
_DEFINE_SWIZZLER2(a, b, a3##b3); \
378
_DEFINE_SWIZZLER2(a, b, a4##b4); \
379
_DEFINE_SWIZZLER2(b, a, b##a); \
380
_DEFINE_SWIZZLER2(b, a, b2##a2); \
381
_DEFINE_SWIZZLER2(b, a, b3##a3); \
382
_DEFINE_SWIZZLER2(b, a, b4##a4);
383
384
DEFINE_SWIZZLER2(x, y, r, g, u, v, s, t);
385
DEFINE_SWIZZLER2(x, z, r, b, u, w, s, q);
386
DEFINE_SWIZZLER2(y, z, g, b, v, w, t, q);
387
#undef DEFINE_SWIZZLER2
388
#undef _DEFINE_SWIZZLER2
389
};
390
391
template<typename T>
392
class Vec3Packed
393
{
394
public:
395
union
396
{
397
struct
398
{
399
T x,y,z;
400
};
401
};
402
403
T* AsArray() { return &x; }
404
const T* AsArray() const { return &x; }
405
406
Vec3Packed() {}
407
Vec3Packed(const T a[3]) : x(a[0]), y(a[1]), z(a[2]) {}
408
Vec3Packed(const T& _x, const T& _y, const T& _z) : x(_x), y(_y), z(_z) {}
409
Vec3Packed(const Vec2<T>& _xy, const T& _z) : x(_xy.x), y(_xy.y), z(_z) {}
410
Vec3Packed(const Vec3<T>& _xyz) {
411
memcpy(&x, _xyz.AsArray(), sizeof(float) * 3);
412
}
413
414
template<typename T2>
415
Vec3Packed<T2> Cast() const
416
{
417
return Vec3Packed<T2>((T2)x, (T2)y, (T2)z);
418
}
419
420
// Only implemented for T=int and T=float
421
static Vec3Packed FromRGB(unsigned int rgb);
422
unsigned int ToRGB() const; // alpha bits set to zero
423
424
static Vec3Packed AssignToAll(const T& f)
425
{
426
return Vec3Packed<T>(f, f, f);
427
}
428
429
void Write(T a[3])
430
{
431
a[0] = x; a[1] = y; a[2] = z;
432
}
433
434
Vec3Packed operator +(const Vec3Packed &other) const
435
{
436
return Vec3Packed(x+other.x, y+other.y, z+other.z);
437
}
438
void operator += (const Vec3Packed &other)
439
{
440
x+=other.x; y+=other.y; z+=other.z;
441
}
442
Vec3Packed operator -(const Vec3Packed &other) const
443
{
444
return Vec3Packed(x-other.x, y-other.y, z-other.z);
445
}
446
void operator -= (const Vec3Packed &other)
447
{
448
x-=other.x; y-=other.y; z-=other.z;
449
}
450
Vec3Packed operator -() const
451
{
452
return Vec3Packed(-x,-y,-z);
453
}
454
Vec3Packed operator * (const Vec3Packed &other) const
455
{
456
return Vec3Packed(x*other.x, y*other.y, z*other.z);
457
}
458
template<typename V>
459
Vec3Packed operator * (const V& f) const
460
{
461
return Vec3Packed(x*f,y*f,z*f);
462
}
463
template<typename V>
464
void operator *= (const V& f)
465
{
466
x*=f; y*=f; z*=f;
467
}
468
template<typename V>
469
Vec3Packed operator / (const V& f) const
470
{
471
return Vec3Packed(x/f,y/f,z/f);
472
}
473
template<typename V>
474
void operator /= (const V& f)
475
{
476
*this = *this / f;
477
}
478
479
T Length2() const
480
{
481
return x*x + y*y + z*z;
482
}
483
484
Vec3Packed Clamp(const T &l, const T &h) const
485
{
486
return Vec3Packed(VecClamp(x, l, h), VecClamp(y, l, h), VecClamp(z, l, h));
487
}
488
489
// Only implemented for T=float
490
float Length() const;
491
void SetLength(const float l);
492
Vec3Packed WithLength(const float l) const;
493
float Distance2To(const Vec3Packed &other) const;
494
Vec3Packed Normalized() const;
495
float Normalize(); // returns the previous length, which is often useful
496
497
T& operator [] (int i) //allow vector[2] = 3 (vector.z=3)
498
{
499
return *((&x) + i);
500
}
501
T operator [] (const int i) const
502
{
503
return *((&x) + i);
504
}
505
506
void SetZero()
507
{
508
x=0; y=0; z=0;
509
}
510
511
// Common aliases: UVW (texel coordinates), RGB (colors), STQ (texture coordinates)
512
T& u() { return x; }
513
T& v() { return y; }
514
T& w() { return z; }
515
516
T& r() { return x; }
517
T& g() { return y; }
518
T& b() { return z; }
519
520
T& s() { return x; }
521
T& t() { return y; }
522
T& q() { return z; }
523
524
const T& u() const { return x; }
525
const T& v() const { return y; }
526
const T& w() const { return z; }
527
528
const T& r() const { return x; }
529
const T& g() const { return y; }
530
const T& b() const { return z; }
531
532
const T& s() const { return x; }
533
const T& t() const { return y; }
534
const T& q() const { return z; }
535
536
// swizzlers - create a subvector of specific components
537
// e.g. Vec2 uv() { return Vec2(x,y); }
538
// _DEFINE_SWIZZLER2 defines a single such function, DEFINE_SWIZZLER2 defines all of them for all component names (x<->r) and permutations (xy<->yx)
539
#define _DEFINE_SWIZZLER2(a, b, name) const Vec2<T> name() const { return Vec2<T>(a, b); }
540
#define DEFINE_SWIZZLER2(a, b, a2, b2, a3, b3, a4, b4) \
541
_DEFINE_SWIZZLER2(a, b, a##b); \
542
_DEFINE_SWIZZLER2(a, b, a2##b2); \
543
_DEFINE_SWIZZLER2(a, b, a3##b3); \
544
_DEFINE_SWIZZLER2(a, b, a4##b4); \
545
_DEFINE_SWIZZLER2(b, a, b##a); \
546
_DEFINE_SWIZZLER2(b, a, b2##a2); \
547
_DEFINE_SWIZZLER2(b, a, b3##a3); \
548
_DEFINE_SWIZZLER2(b, a, b4##a4);
549
550
DEFINE_SWIZZLER2(x, y, r, g, u, v, s, t);
551
DEFINE_SWIZZLER2(x, z, r, b, u, w, s, q);
552
DEFINE_SWIZZLER2(y, z, g, b, v, w, t, q);
553
#undef DEFINE_SWIZZLER2
554
#undef _DEFINE_SWIZZLER2
555
};
556
557
template<typename T>
558
class Vec4
559
{
560
public:
561
union
562
{
563
struct
564
{
565
T x,y,z,w;
566
};
567
#if defined(_M_SSE)
568
__m128i ivec;
569
__m128 vec;
570
#elif PPSSPP_ARCH(ARM_NEON)
571
int32x4_t ivec;
572
float32x4_t vec;
573
#endif
574
};
575
576
T* AsArray() { return &x; }
577
const T* AsArray() const { return &x; }
578
579
Vec4() {}
580
Vec4(const T a[4]) : x(a[0]), y(a[1]), z(a[2]), w(a[3]) {}
581
Vec4(const T& _x, const T& _y, const T& _z, const T& _w) : x(_x), y(_y), z(_z), w(_w) {}
582
Vec4(const Vec2<T>& _xy, const T& _z, const T& _w) : x(_xy.x), y(_xy.y), z(_z), w(_w) {}
583
Vec4(const Vec3<T>& _xyz, const T& _w) : x(_xyz.x), y(_xyz.y), z(_xyz.z), w(_w) {}
584
#if defined(_M_SSE)
585
Vec4(const __m128 &_vec) : vec(_vec) {}
586
Vec4(const __m128i &_ivec) : ivec(_ivec) {}
587
#elif PPSSPP_ARCH(ARM_NEON)
588
Vec4(const float32x4_t &_vec) : vec(_vec) {}
589
#if !defined(_MSC_VER)
590
Vec4(const int32x4_t &_ivec) : ivec(_ivec) {}
591
#endif
592
#endif
593
594
template<typename T2>
595
Vec4<T2> Cast() const {
596
if constexpr (std::is_same<T, float>::value && std::is_same<T2, int>::value) {
597
#if defined(_M_SSE)
598
return _mm_cvtps_epi32(SAFE_M128(vec));
599
#elif PPSSPP_ARCH(ARM_NEON)
600
return vcvtq_s32_f32(vec);
601
#endif
602
}
603
if constexpr (std::is_same<T, int>::value && std::is_same<T2, float>::value) {
604
#if defined(_M_SSE)
605
return _mm_cvtepi32_ps(SAFE_M128I(ivec));
606
#elif PPSSPP_ARCH(ARM_NEON)
607
return vcvtq_f32_s32(ivec);
608
#endif
609
}
610
return Vec4<T2>((T2)x, (T2)y, (T2)z, (T2)w);
611
}
612
613
// Only implemented for T=int and T=float
614
static Vec4 FromRGBA(unsigned int rgba);
615
static Vec4 FromRGBA(const u8 *rgba);
616
unsigned int ToRGBA() const;
617
void ToRGBA(u8 *rgba) const;
618
619
static Vec4 AssignToAll(const T& f)
620
{
621
return Vec4<T>(f, f, f, f);
622
}
623
624
void Write(T a[4])
625
{
626
a[0] = x; a[1] = y; a[2] = z; a[3] = w;
627
}
628
629
Vec4 operator +(const Vec4& other) const
630
{
631
return Vec4(x+other.x, y+other.y, z+other.z, w+other.w);
632
}
633
void operator += (const Vec4& other)
634
{
635
x+=other.x; y+=other.y; z+=other.z; w+=other.w;
636
}
637
Vec4 operator -(const Vec4 &other) const
638
{
639
return Vec4(x-other.x, y-other.y, z-other.z, w-other.w);
640
}
641
void operator -= (const Vec4 &other)
642
{
643
x-=other.x; y-=other.y; z-=other.z; w-=other.w;
644
}
645
Vec4 operator -() const
646
{
647
return Vec4(-x,-y,-z,-w);
648
}
649
Vec4 operator * (const Vec4 &other) const
650
{
651
return Vec4(x*other.x, y*other.y, z*other.z, w*other.w);
652
}
653
Vec4 operator | (const Vec4 &other) const
654
{
655
return Vec4(x | other.x, y | other.y, z | other.z, w | other.w);
656
}
657
Vec4 operator & (const Vec4 &other) const
658
{
659
return Vec4(x & other.x, y & other.y, z & other.z, w & other.w);
660
}
661
Vec4 operator << (const int amount) const
662
{
663
// NOTE: x*(1<<amount), etc., might be safer, since
664
// left-shifting negatives is UB pre-C++20.
665
return Vec4(x << amount, y << amount, z << amount, w << amount);
666
}
667
Vec4 operator >> (const int amount) const
668
{
669
return Vec4(x >> amount, y >> amount, z >> amount, w >> amount);
670
}
671
template<typename V>
672
Vec4 operator * (const V& f) const
673
{
674
return Vec4(x*f,y*f,z*f,w*f);
675
}
676
template<typename V>
677
void operator *= (const V& f)
678
{
679
x*=f; y*=f; z*=f; w*=f;
680
}
681
template<typename V>
682
Vec4 operator / (const V& f) const
683
{
684
return Vec4(x/f,y/f,z/f,w/f);
685
}
686
template<typename V>
687
void operator /= (const V& f)
688
{
689
*this = *this / f;
690
}
691
692
bool operator ==(const Vec4 &other) const {
693
return x == other.x && y == other.y && z == other.z && w == other.w;
694
}
695
696
T Length2() const
697
{
698
return x*x + y*y + z*z + w*w;
699
}
700
701
Vec4 Clamp(const T &l, const T &h) const
702
{
703
return Vec4(VecClamp(x, l, h), VecClamp(y, l, h), VecClamp(z, l, h), VecClamp(w, l, h));
704
}
705
706
Vec4 Reciprocal() const
707
{
708
const T one = 1.0f;
709
return Vec4(one / x, one / y, one / z, one / w);
710
}
711
712
// Only implemented for T=float
713
float Length() const;
714
void SetLength(const float l);
715
Vec4 WithLength(const float l) const;
716
float Distance2To(const Vec4 &other) const;
717
Vec4 Normalized() const;
718
float Normalize(); // returns the previous length, which is often useful
719
720
T& operator [] (int i) //allow vector[2] = 3 (vector.z=3)
721
{
722
return *((&x) + i);
723
}
724
T operator [] (const int i) const
725
{
726
return *((&x) + i);
727
}
728
729
void SetZero()
730
{
731
x=0; y=0; z=0; w=0;
732
}
733
734
// Common alias: RGBA (colors)
735
T& r() { return x; }
736
T& g() { return y; }
737
T& b() { return z; }
738
T& a() { return w; }
739
740
const T& r() const { return x; }
741
const T& g() const { return y; }
742
const T& b() const { return z; }
743
const T& a() const { return w; }
744
745
// swizzlers - create a subvector of specific components
746
// e.g. Vec2 uv() { return Vec2(x,y); }
747
// _DEFINE_SWIZZLER2 defines a single such function, DEFINE_SWIZZLER2 defines all of them for all component names (x<->r) and permutations (xy<->yx)
748
#define _DEFINE_SWIZZLER2(a, b, name) const Vec2<T> name() const { return Vec2<T>(a, b); }
749
#define DEFINE_SWIZZLER2(a, b, a2, b2) \
750
_DEFINE_SWIZZLER2(a, b, a##b); \
751
_DEFINE_SWIZZLER2(a, b, a2##b2); \
752
_DEFINE_SWIZZLER2(b, a, b##a); \
753
_DEFINE_SWIZZLER2(b, a, b2##a2);
754
755
DEFINE_SWIZZLER2(x, y, r, g);
756
DEFINE_SWIZZLER2(x, z, r, b);
757
DEFINE_SWIZZLER2(x, w, r, a);
758
DEFINE_SWIZZLER2(y, z, g, b);
759
DEFINE_SWIZZLER2(y, w, g, a);
760
DEFINE_SWIZZLER2(z, w, b, a);
761
#undef DEFINE_SWIZZLER2
762
#undef _DEFINE_SWIZZLER2
763
764
#define _DEFINE_SWIZZLER3(a, b, c, name) const Vec3<T> name() const { return Vec3<T>(a, b, c); }
765
#define DEFINE_SWIZZLER3(a, b, c, a2, b2, c2) \
766
_DEFINE_SWIZZLER3(a, b, c, a##b##c); \
767
_DEFINE_SWIZZLER3(a, c, b, a##c##b); \
768
_DEFINE_SWIZZLER3(b, a, c, b##a##c); \
769
_DEFINE_SWIZZLER3(b, c, a, b##c##a); \
770
_DEFINE_SWIZZLER3(c, a, b, c##a##b); \
771
_DEFINE_SWIZZLER3(c, b, a, c##b##a); \
772
_DEFINE_SWIZZLER3(a, b, c, a2##b2##c2); \
773
_DEFINE_SWIZZLER3(a, c, b, a2##c2##b2); \
774
_DEFINE_SWIZZLER3(b, a, c, b2##a2##c2); \
775
_DEFINE_SWIZZLER3(b, c, a, b2##c2##a2); \
776
_DEFINE_SWIZZLER3(c, a, b, c2##a2##b2); \
777
_DEFINE_SWIZZLER3(c, b, a, c2##b2##a2);
778
779
DEFINE_SWIZZLER3(x, y, z, r, g, b);
780
DEFINE_SWIZZLER3(x, y, w, r, g, a);
781
DEFINE_SWIZZLER3(x, z, w, r, b, a);
782
DEFINE_SWIZZLER3(y, z, w, g, b, a);
783
#undef DEFINE_SWIZZLER3
784
#undef _DEFINE_SWIZZLER3
785
};
786
787
788
template<typename BaseType>
789
class Mat3x3
790
{
791
public:
792
// Convention: first three values = first column
793
Mat3x3(const BaseType values[])
794
{
795
for (unsigned int i = 0; i < 3*3; ++i)
796
{
797
this->values[i] = values[i];
798
}
799
}
800
801
Mat3x3(BaseType _00, BaseType _01, BaseType _02, BaseType _10, BaseType _11, BaseType _12, BaseType _20, BaseType _21, BaseType _22)
802
{
803
values[0] = _00;
804
values[1] = _01;
805
values[2] = _02;
806
values[3] = _10;
807
values[4] = _11;
808
values[5] = _12;
809
values[6] = _20;
810
values[7] = _21;
811
values[8] = _22;
812
}
813
814
template<typename T>
815
Vec3<T> operator * (const Vec3<T>& vec) const
816
{
817
Vec3<T> ret;
818
ret.x = values[0]*vec.x + values[3]*vec.y + values[6]*vec.z;
819
ret.y = values[1]*vec.x + values[4]*vec.y + values[7]*vec.z;
820
ret.z = values[2]*vec.x + values[5]*vec.y + values[8]*vec.z;
821
return ret;
822
}
823
824
Mat3x3 Inverse() const
825
{
826
float a = values[0];
827
float b = values[1];
828
float c = values[2];
829
float d = values[3];
830
float e = values[4];
831
float f = values[5];
832
float g = values[6];
833
float h = values[7];
834
float i = values[8];
835
return Mat3x3(e*i-f*h, f*g-d*i, d*h-e*g,
836
c*h-b*i, a*i-c*g, b*g-a*h,
837
b*f-c*e, c*d-a*f, a*e-b*d) / Det();
838
}
839
840
BaseType Det() const
841
{
842
return values[0]*values[4]*values[8] + values[3]*values[7]*values[2] +
843
values[6]*values[1]*values[5] - values[2]*values[4]*values[6] -
844
values[5]*values[7]*values[0] - values[8]*values[1]*values[3];
845
}
846
847
Mat3x3 operator / (const BaseType& val) const
848
{
849
return Mat3x3(values[0]/val, values[1]/val, values[2]/val,
850
values[3]/val, values[4]/val, values[5]/val,
851
values[6]/val, values[7]/val, values[8]/val);
852
}
853
854
private:
855
BaseType values[3*3];
856
};
857
858
859
template<typename BaseType>
860
class Mat4x4
861
{
862
public:
863
// Convention: first four values in arrow = first column
864
Mat4x4(const BaseType values[])
865
{
866
for (unsigned int i = 0; i < 4*4; ++i)
867
{
868
this->values[i] = values[i];
869
}
870
}
871
872
template<typename T>
873
Vec4<T> operator * (const Vec4<T>& vec) const
874
{
875
Vec4<T> ret;
876
ret.x = values[0]*vec.x + values[4]*vec.y + values[8]*vec.z + values[12]*vec.w;
877
ret.y = values[1]*vec.x + values[5]*vec.y + values[9]*vec.z + values[13]*vec.w;
878
ret.z = values[2]*vec.x + values[6]*vec.y + values[10]*vec.z + values[14]*vec.w;
879
ret.w = values[3]*vec.x + values[7]*vec.y + values[11]*vec.z + values[15]*vec.w;
880
return ret;
881
}
882
883
private:
884
BaseType values[4*4];
885
};
886
887
}; // namespace Math3D
888
889
// Shorthand names for the float instantiations of the vector classes.
using Vec2f = Math3D::Vec2<float>;
using Vec3f = Math3D::Vec3<float>;
using Vec3Packedf = Math3D::Vec3Packed<float>;
using Vec4f = Math3D::Vec4<float>;
893
894
#if defined(_M_SSE)
// Returns lane i of v as a scalar float.
template<unsigned i>
float MATH3D_CALL vectorGetByIndex(__m128 v) {
	// Move the wanted element to the bottom lane (the shuffle copies it to
	// every lane), then extract the bottom lane.
	const __m128 splat = _mm_shuffle_ps(v, v, _MM_SHUFFLE(i, i, i, i));
	return _mm_cvtss_f32(splat);
}
#endif
901
902
// Vec3ByMatrix43Internal: computes (col0 * x + col1 * y) + (col2 * z + col3),
// where the four columns start at m, m + 3, m + 6 and m + 9. Every column is
// fetched with a 4-float load, so the last load touches m[12].
#if defined(_M_SSE)
// x, y, and z should be broadcast. Should only be used through Vec3f version.
// Note that this will read an extra float from the matrix, so it better not be at the end of an allocation!
inline __m128 MATH3D_CALL Vec3ByMatrix43Internal(__m128 x, __m128 y, __m128 z, const float m[12]) {
	const __m128 xTerm = _mm_mul_ps(_mm_loadu_ps(m), x);
	const __m128 yTerm = _mm_mul_ps(_mm_loadu_ps(m + 3), y);
	const __m128 zTerm = _mm_mul_ps(_mm_loadu_ps(m + 6), z);
	const __m128 translation = _mm_loadu_ps(m + 9);
	return _mm_add_ps(_mm_add_ps(xTerm, yTerm), _mm_add_ps(zTerm, translation));
}
#elif PPSSPP_ARCH(ARM64_NEON)
// vec carries x, y, z in lanes 0..2. The parameter is declared m[16], but
// only m[0]..m[12] are read.
inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
	const float32x4_t xTerm = vmulq_laneq_f32(vld1q_f32(m), vec, 0);
	const float32x4_t yTerm = vmulq_laneq_f32(vld1q_f32(m + 3), vec, 1);
	const float32x4_t zTerm = vmulq_laneq_f32(vld1q_f32(m + 6), vec, 2);
	const float32x4_t translation = vld1q_f32(m + 9);
	return vaddq_f32(vaddq_f32(xTerm, yTerm), vaddq_f32(zTerm, translation));
}
#elif PPSSPP_ARCH(ARM_NEON)
// vec carries x, y, z in lanes 0..2. The parameter is declared m[16], but
// only m[0]..m[12] are read.
inline float32x4_t Vec3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
	const float32x4_t xTerm = vmulq_lane_f32(vld1q_f32(m), vget_low_f32(vec), 0);
	const float32x4_t yTerm = vmulq_lane_f32(vld1q_f32(m + 3), vget_low_f32(vec), 1);
	const float32x4_t zTerm = vmulq_lane_f32(vld1q_f32(m + 6), vget_high_f32(vec), 0);
	const float32x4_t translation = vld1q_f32(m + 9);
	return vaddq_f32(vaddq_f32(xTerm, yTerm), vaddq_f32(zTerm, translation));
}
#endif
938
939
// Transforms the 3-element vector v by the 4x3 matrix m (three 3-float
// columns followed by a 3-float column that is added to the result) and
// writes the three results to vecOut.
// v and vecOut must point to different memory.
inline void Vec3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) {
#if defined(_M_SSE)
	const __m128 result = Vec3ByMatrix43Internal(_mm_set1_ps(v[0]), _mm_set1_ps(v[1]), _mm_set1_ps(v[2]), m);
	// Not sure what the best way to store 3 elements is. Ideally, we should
	// probably store all four.
	vecOut[0] = _mm_cvtss_f32(result);
	vecOut[1] = vectorGetByIndex<1>(result);
	vecOut[2] = vectorGetByIndex<2>(result);
#elif PPSSPP_ARCH(ARM_NEON)
	// Stage the input in a 4-float buffer so a full vector load is possible.
	float padded[4] = {v[0], v[1], v[2], 1.0f};
	const float32x4_t result = Vec3ByMatrix43Internal(vld1q_f32(padded), m);
	vecOut[0] = vgetq_lane_f32(result, 0);
	vecOut[1] = vgetq_lane_f32(result, 1);
	vecOut[2] = vgetq_lane_f32(result, 2);
#else
	for (int row = 0; row < 3; ++row)
		vecOut[row] = v[0] * m[row] + v[1] * m[3 + row] + v[2] * m[6 + row] + m[9 + row];
#endif
}
963
964
// Vec3f flavour of Vec3ByMatrix43: transforms v by the 4x3 matrix m and
// returns the result.
inline Vec3f MATH3D_CALL Vec3ByMatrix43(const Vec3f v, const float m[12]) {
#if defined(_M_SSE)
	const __m128 src = SAFE_M128(v.vec);
	// Broadcast each of the first three lanes into its own register.
	const __m128 xxxx = _mm_shuffle_ps(src, src, _MM_SHUFFLE(0, 0, 0, 0));
	const __m128 yyyy = _mm_shuffle_ps(src, src, _MM_SHUFFLE(1, 1, 1, 1));
	const __m128 zzzz = _mm_shuffle_ps(src, src, _MM_SHUFFLE(2, 2, 2, 2));
	return Vec3ByMatrix43Internal(xxxx, yyyy, zzzz, m);
#elif PPSSPP_ARCH(ARM_NEON)
	return Vec3ByMatrix43Internal(v.vec, m);
#else
	// No SIMD: defer to the float-array version.
	Vec3f result;
	Vec3ByMatrix43(result.AsArray(), v.AsArray(), m);
	return result;
#endif
}
979
980
// Vec3ByMatrix44Internal: computes (col0 * x + col1 * y) + (col2 * z + col3),
// where the four 4-float columns start at m, m + 4, m + 8 and m + 12.
#if defined(_M_SSE)
// x, y, and z should be broadcast. Should only be used through Vec3f version.
inline __m128 MATH3D_CALL Vec3ByMatrix44Internal(__m128 x, __m128 y, __m128 z, const float m[16]) {
	const __m128 xTerm = _mm_mul_ps(_mm_loadu_ps(m), x);
	const __m128 yTerm = _mm_mul_ps(_mm_loadu_ps(m + 4), y);
	const __m128 zTerm = _mm_mul_ps(_mm_loadu_ps(m + 8), z);
	const __m128 translation = _mm_loadu_ps(m + 12);
	return _mm_add_ps(_mm_add_ps(xTerm, yTerm), _mm_add_ps(zTerm, translation));
}
#elif PPSSPP_ARCH(ARM64_NEON)
// vec carries x, y, z in lanes 0..2.
inline float32x4_t Vec3ByMatrix44Internal(float32x4_t vec, const float m[16]) {
	const float32x4_t xTerm = vmulq_laneq_f32(vld1q_f32(m), vec, 0);
	const float32x4_t yTerm = vmulq_laneq_f32(vld1q_f32(m + 4), vec, 1);
	const float32x4_t zTerm = vmulq_laneq_f32(vld1q_f32(m + 8), vec, 2);
	const float32x4_t translation = vld1q_f32(m + 12);
	return vaddq_f32(vaddq_f32(xTerm, yTerm), vaddq_f32(zTerm, translation));
}
#elif PPSSPP_ARCH(ARM_NEON)
// vec carries x, y, z in lanes 0..2.
inline float32x4_t Vec3ByMatrix44Internal(float32x4_t vec, const float m[16]) {
	const float32x4_t xTerm = vmulq_lane_f32(vld1q_f32(m), vget_low_f32(vec), 0);
	const float32x4_t yTerm = vmulq_lane_f32(vld1q_f32(m + 4), vget_low_f32(vec), 1);
	const float32x4_t zTerm = vmulq_lane_f32(vld1q_f32(m + 8), vget_high_f32(vec), 0);
	const float32x4_t translation = vld1q_f32(m + 12);
	return vaddq_f32(vaddq_f32(xTerm, yTerm), vaddq_f32(zTerm, translation));
}
#endif
1015
1016
// Transforms the 3-element vector v by the 4x4 matrix m (four 4-float
// columns, the last one being added to the result) and writes all four
// results to vecOut.
inline void Vec3ByMatrix44(float vecOut[4], const float v[3], const float m[16]) {
#if defined(_M_SSE)
	const __m128 result = Vec3ByMatrix44Internal(_mm_set1_ps(v[0]), _mm_set1_ps(v[1]), _mm_set1_ps(v[2]), m);
	_mm_storeu_ps(vecOut, result);
#elif PPSSPP_ARCH(ARM_NEON)
	// Stage the input in a 4-float buffer so a full vector load is possible.
	float padded[4] = {v[0], v[1], v[2], 1.0f};
	const float32x4_t result = Vec3ByMatrix44Internal(vld1q_f32(padded), m);
	vst1q_f32(vecOut, result);
#else
	for (int row = 0; row < 4; ++row)
		vecOut[row] = v[0] * m[row] + v[1] * m[4 + row] + v[2] * m[8 + row] + m[12 + row];
#endif
}
1034
1035
// Vec3f flavour of Vec3ByMatrix44: transforms v by the 4x4 matrix m and
// returns the four-component result.
inline Vec4f MATH3D_CALL Vec3ByMatrix44(const Vec3f v, const float m[16]) {
#if defined(_M_SSE)
	const __m128 src = SAFE_M128(v.vec);
	// Broadcast each of the first three lanes into its own register.
	const __m128 xxxx = _mm_shuffle_ps(src, src, _MM_SHUFFLE(0, 0, 0, 0));
	const __m128 yyyy = _mm_shuffle_ps(src, src, _MM_SHUFFLE(1, 1, 1, 1));
	const __m128 zzzz = _mm_shuffle_ps(src, src, _MM_SHUFFLE(2, 2, 2, 2));
	return Vec3ByMatrix44Internal(xxxx, yyyy, zzzz, m);
#elif PPSSPP_ARCH(ARM_NEON)
	return Vec3ByMatrix44Internal(v.vec, m);
#else
	// No SIMD: defer to the float-array version.
	Vec4f result;
	Vec3ByMatrix44(result.AsArray(), v.AsArray(), m);
	return result;
#endif
}
1050
1051
// Norm3ByMatrix43Internal: computes (col0 * x + col1 * y) + col2 * z, where
// the columns start at m, m + 3 and m + 6. Unlike Vec3ByMatrix43Internal, the
// column at m + 9 is not added.
#if defined(_M_SSE)
// x, y, and z should be broadcast. Should only be used through Vec3f version.
inline __m128 MATH3D_CALL Norm3ByMatrix43Internal(__m128 x, __m128 y, __m128 z, const float m[12]) {
	const __m128 xTerm = _mm_mul_ps(_mm_loadu_ps(m), x);
	const __m128 yTerm = _mm_mul_ps(_mm_loadu_ps(m + 3), y);
	const __m128 zTerm = _mm_mul_ps(_mm_loadu_ps(m + 6), z);
	return _mm_add_ps(_mm_add_ps(xTerm, yTerm), zTerm);
}
#elif PPSSPP_ARCH(ARM64_NEON)
// vec carries x, y, z in lanes 0..2.
inline float32x4_t Norm3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
	const float32x4_t xTerm = vmulq_laneq_f32(vld1q_f32(m), vec, 0);
	const float32x4_t yTerm = vmulq_laneq_f32(vld1q_f32(m + 3), vec, 1);
	const float32x4_t zTerm = vmulq_laneq_f32(vld1q_f32(m + 6), vec, 2);
	return vaddq_f32(vaddq_f32(xTerm, yTerm), zTerm);
}
#elif PPSSPP_ARCH(ARM_NEON)
// vec carries x, y, z in lanes 0..2.
inline float32x4_t Norm3ByMatrix43Internal(float32x4_t vec, const float m[16]) {
	const float32x4_t xTerm = vmulq_lane_f32(vld1q_f32(m), vget_low_f32(vec), 0);
	const float32x4_t yTerm = vmulq_lane_f32(vld1q_f32(m + 3), vget_low_f32(vec), 1);
	const float32x4_t zTerm = vmulq_lane_f32(vld1q_f32(m + 6), vget_high_f32(vec), 0);
	return vaddq_f32(vaddq_f32(xTerm, yTerm), zTerm);
}
#endif
1083
1084
// Multiplies the 3-component vector v by the first 9 floats of the 4x3 matrix m
// (the last three floats of m are ignored) and writes the result to vecOut.
// vecOut may alias v: every path reads the input before writing the output.
inline void Norm3ByMatrix43(float vecOut[3], const float v[3], const float m[12]) {
#if defined(_M_SSE)
	__m128 x = _mm_set1_ps(v[0]);
	__m128 y = _mm_set1_ps(v[1]);
	__m128 z = _mm_set1_ps(v[2]);
	__m128 sum = Norm3ByMatrix43Internal(x, y, z, m);
	// Store lane by lane: a full 4-float store would write past vecOut[2].
	vecOut[0] = _mm_cvtss_f32(sum);
	vecOut[1] = vectorGetByIndex<1>(sum);
	vecOut[2] = vectorGetByIndex<2>(sum);
#elif PPSSPP_ARCH(ARM_NEON)
	// v only holds three floats, so pad it before doing a 4-lane load
	// (loading straight from v would read one float past its end).
	const float vecIn[4] = {v[0], v[1], v[2], 0.0f};
	float32x4_t sum = Norm3ByMatrix43Internal(vld1q_f32(vecIn), m);
	vecOut[0] = vgetq_lane_f32(sum, 0);
	vecOut[1] = vgetq_lane_f32(sum, 1);
	vecOut[2] = vgetq_lane_f32(sum, 2);
#else
	// Read the whole input up front so vecOut == v works here too.
	const float x = v[0];
	const float y = v[1];
	const float z = v[2];
	vecOut[0] = x * m[0] + y * m[3] + z * m[6];
	vecOut[1] = x * m[1] + y * m[4] + z * m[7];
	vecOut[2] = x * m[2] + y * m[5] + z * m[8];
#endif
}
1104
1105
// Vec3f flavor of Norm3ByMatrix43: multiplies v by the first 9 floats of the
// 4x3 matrix m (no translation) and returns the result.
inline Vec3f MATH3D_CALL Norm3ByMatrix43(const Vec3f v, const float m[12]) {
#if defined(_M_SSE)
	// v is passed by value, so go through SAFE_M128 (32-bit x86 stack alignment workaround).
	const __m128 vv = SAFE_M128(v.vec);
	// The helper expects each component broadcast across a whole register.
	__m128 x = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(0, 0, 0, 0));
	__m128 y = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(1, 1, 1, 1));
	__m128 z = _mm_shuffle_ps(vv, vv, _MM_SHUFFLE(2, 2, 2, 2));
	return Norm3ByMatrix43Internal(x, y, z, m);
#elif PPSSPP_ARCH(ARM_NEON)
	return Norm3ByMatrix43Internal(v.vec, m);
#else
	Vec3f vecOut;
	Norm3ByMatrix43(vecOut.AsArray(), v.AsArray(), m);
	return vecOut;
#endif
}
1120
1121
inline void Matrix4ByMatrix4(float out[16], const float a[16], const float b[16]) {
1122
fast_matrix_mul_4x4(out, b, a);
1123
}
1124
1125
// Expands a 4x3 matrix (four groups of three floats) into a 4x4 matrix
// (four groups of four floats). The added fourth element of each group is 0,
// except for the last group where it is 1.
// m4x3 must hold 12 floats, m4x4 must have room for 16.
inline void ConvertMatrix4x3To4x4(float *m4x4, const float *m4x3) {
	for (int group = 0; group < 4; group++) {
		const float *src = m4x3 + group * 3;
		float *dst = m4x4 + group * 4;
		dst[0] = src[0];
		dst[1] = src[1];
		dst[2] = src[2];
		dst[3] = group == 3 ? 1.0f : 0.0f;
	}
}
1143
1144
// Converts a 4x3 matrix (four groups of three floats) into a transposed 4x4 matrix:
// output group i (i = 0..2) holds element i of each input group, and the last
// output group is (0, 0, 0, 1).
// m4x3 must hold 12 floats, m4x4 must have room for 16.
inline void ConvertMatrix4x3To4x4Transposed(float *m4x4, const float *m4x3) {
#if PPSSPP_ARCH(ARM_NEON)
	// vld3q is a perfect match here!
	// It de-interleaves by 3: val[k] = { m4x3[k], m4x3[k + 3], m4x3[k + 6], m4x3[k + 9] }.
	float32x4x3_t packed = vld3q_f32(m4x3);
	vst1q_f32(m4x4, packed.val[0]);
	vst1q_f32(m4x4 + 4, packed.val[1]);
	vst1q_f32(m4x4 + 8, packed.val[2]);
#else
	m4x4[0] = m4x3[0];
	m4x4[1] = m4x3[3];
	m4x4[2] = m4x3[6];
	m4x4[3] = m4x3[9];
	m4x4[4] = m4x3[1];
	m4x4[5] = m4x3[4];
	m4x4[6] = m4x3[7];
	m4x4[7] = m4x3[10];
	m4x4[8] = m4x3[2];
	m4x4[9] = m4x3[5];
	m4x4[10] = m4x3[8];
	m4x4[11] = m4x3[11];
#endif
	// The last group is the same on every path.
	m4x4[12] = 0.0f;
	m4x4[13] = 0.0f;
	m4x4[14] = 0.0f;
	m4x4[15] = 1.0f;
}
1170
1171
// Transposes a 4x3 matrix (four groups of three floats) into a 3x4 one
// (three groups of four floats). Indices in hex:
// 0369
// 147A
// 258B
// ->>-
// 0123
// 4567
// 89AB
// NEON does this with a single de-interleaving load. Elsewhere it is done
// element by element, which should be pretty fast anyway.
// Despite the parameter name, only the first 12 floats of m4x4 are written.
inline void ConvertMatrix4x3To3x4Transposed(float *m4x4, const float *m4x3) {
#if PPSSPP_ARCH(ARM_NEON)
	// vld3q is a perfect match here!
	// It de-interleaves by 3: val[k] = { m4x3[k], m4x3[k + 3], m4x3[k + 6], m4x3[k + 9] }.
	float32x4x3_t packed = vld3q_f32(m4x3);
	vst1q_f32(m4x4, packed.val[0]);
	vst1q_f32(m4x4 + 4, packed.val[1]);
	vst1q_f32(m4x4 + 8, packed.val[2]);
#else
	m4x4[0] = m4x3[0];
	m4x4[1] = m4x3[3];
	m4x4[2] = m4x3[6];
	m4x4[3] = m4x3[9];
	m4x4[4] = m4x3[1];
	m4x4[5] = m4x3[4];
	m4x4[6] = m4x3[7];
	m4x4[7] = m4x3[10];
	m4x4[8] = m4x3[2];
	m4x4[9] = m4x3[5];
	m4x4[10] = m4x3[8];
	m4x4[11] = m4x3[11];
#endif
}
1201
1202
// Writes the transpose of the 4x4 matrix `in` to `out` (16 floats each).
// `out` may be the same array as `in`, in which case the transpose is done in place.
// Partially overlapping arrays are not supported.
inline void Transpose4x4(float out[16], const float in[16]) {
	if (out == in) {
		// In place: a straight copy would overwrite elements before they are read,
		// so swap each pair across the diagonal instead.
		for (int i = 0; i < 4; i++) {
			for (int j = i + 1; j < 4; j++) {
				const float tmp = out[i * 4 + j];
				out[i * 4 + j] = out[j * 4 + i];
				out[j * 4 + i] = tmp;
			}
		}
		return;
	}

	for (int i = 0; i < 4; i++) {
		for (int j = 0; j < 4; j++) {
			out[i * 4 + j] = in[j * 4 + i];
		}
	}
}
1209
1210
namespace Math3D {
1211
1212
template<typename T>
1213
inline T Dot(const Vec2<T>& a, const Vec2<T>& b)
1214
{
1215
return a.x*b.x + a.y*b.y;
1216
}
1217
1218
template<typename T>
1219
inline T Dot(const Vec3<T>& a, const Vec3<T>& b)
1220
{
1221
return a.x*b.x + a.y*b.y + a.z*b.z;
1222
}
1223
1224
template<typename T>
1225
inline T Dot(const Vec4<T>& a, const Vec4<T>& b)
1226
{
1227
return a.x*b.x + a.y*b.y + a.z*b.z + a.w*b.w;
1228
}
1229
1230
template<typename T>
1231
inline Vec3<T> Cross(const Vec3<T>& a, const Vec3<T>& b)
1232
{
1233
return Vec3<T>(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
1234
}
1235
1236
template<typename T>
1237
inline Vec3Packed<T> Cross(const Vec3Packed<T>& a, const Vec3Packed<T>& b)
1238
{
1239
return Vec3Packed<T>(a.y*b.z-a.z*b.y, a.z*b.x-a.x*b.z, a.x*b.y-a.y*b.x);
1240
}
1241
1242
template<>
1243
inline Vec3<float> Vec3<float>::FromRGB(unsigned int rgb)
1244
{
1245
#if defined(_M_SSE)
1246
__m128i z = _mm_setzero_si128();
1247
__m128i c = _mm_cvtsi32_si128(rgb);
1248
c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
1249
return Vec3<float>(_mm_mul_ps(_mm_cvtepi32_ps(c), _mm_set_ps1(1.0f / 255.0f)));
1250
#elif PPSSPP_ARCH(ARM_NEON)
1251
uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgb));
1252
uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
1253
return Vec3<float>(vmulq_f32(vcvtq_f32_u32(u), vdupq_n_f32(1.0f / 255.0f)));
1254
#else
1255
return Vec3((rgb & 0xFF) * (1.0f/255.0f),
1256
((rgb >> 8) & 0xFF) * (1.0f/255.0f),
1257
((rgb >> 16) & 0xFF) * (1.0f/255.0f));
1258
#endif
1259
}
1260
1261
template<>
1262
inline Vec3<int> Vec3<int>::FromRGB(unsigned int rgb)
1263
{
1264
#if defined(_M_SSE)
1265
__m128i z = _mm_setzero_si128();
1266
__m128i c = _mm_cvtsi32_si128(rgb);
1267
c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
1268
return Vec3<int>(c);
1269
#elif PPSSPP_ARCH(ARM_NEON)
1270
uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgb));
1271
uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
1272
return Vec3<int>(vreinterpretq_s32_u32(u));
1273
#else
1274
return Vec3(rgb & 0xFF, (rgb >> 8) & 0xFF, (rgb >> 16) & 0xFF);
1275
#endif
1276
}
1277
1278
template<>
1279
__forceinline unsigned int Vec3<float>::ToRGB() const
1280
{
1281
#if defined(_M_SSE)
1282
__m128i c = _mm_cvtps_epi32(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(255.0f)));
1283
__m128i c16 = _mm_packs_epi32(c, c);
1284
return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)) & 0x00FFFFFF;
1285
#elif PPSSPP_ARCH(ARM_NEON)
1286
uint16x4_t c16 = vqmovun_s32(vcvtq_s32_f32(vmulq_f32(vsetq_lane_f32(0.0f, vec, 3), vdupq_n_f32(255.0f))));
1287
uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
1288
return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
1289
#else
1290
return (clamp_u8((int)(r() * 255.f)) << 0) |
1291
(clamp_u8((int)(g() * 255.f)) << 8) |
1292
(clamp_u8((int)(b() * 255.f)) << 16);
1293
#endif
1294
}
1295
1296
template<>
1297
__forceinline unsigned int Vec3<int>::ToRGB() const
1298
{
1299
#if defined(_M_SSE)
1300
__m128i c16 = _mm_packs_epi32(SAFE_M128I(ivec), SAFE_M128I(ivec));
1301
return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16)) & 0x00FFFFFF;
1302
#elif PPSSPP_ARCH(ARM_NEON)
1303
uint16x4_t c16 = vqmovun_s32(vsetq_lane_s32(0, ivec, 3));
1304
uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
1305
return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
1306
#else
1307
return clamp_u8(r()) | (clamp_u8(g()) << 8) | (clamp_u8(b()) << 16);
1308
#endif
1309
}
1310
1311
template<>
1312
inline Vec4<float> Vec4<float>::FromRGBA(unsigned int rgba)
1313
{
1314
#if defined(_M_SSE)
1315
__m128i z = _mm_setzero_si128();
1316
__m128i c = _mm_cvtsi32_si128(rgba);
1317
c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
1318
return Vec4<float>(_mm_mul_ps(_mm_cvtepi32_ps(c), _mm_set_ps1(1.0f / 255.0f)));
1319
#elif PPSSPP_ARCH(ARM_NEON)
1320
uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgba));
1321
uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
1322
return Vec4<float>(vmulq_f32(vcvtq_f32_u32(u), vdupq_n_f32(1.0f / 255.0f)));
1323
#else
1324
return Vec4((rgba & 0xFF) * (1.0f/255.0f),
1325
((rgba >> 8) & 0xFF) * (1.0f/255.0f),
1326
((rgba >> 16) & 0xFF) * (1.0f/255.0f),
1327
((rgba >> 24) & 0xFF) * (1.0f/255.0f));
1328
#endif
1329
}
1330
1331
template<typename T>
1332
inline Vec4<T> Vec4<T>::FromRGBA(const u8 *rgba)
1333
{
1334
return Vec4<T>::FromRGBA(*(unsigned int *)rgba);
1335
}
1336
1337
template<>
1338
inline Vec4<int> Vec4<int>::FromRGBA(unsigned int rgba)
1339
{
1340
#if defined(_M_SSE)
1341
__m128i z = _mm_setzero_si128();
1342
__m128i c = _mm_cvtsi32_si128(rgba);
1343
c = _mm_unpacklo_epi16(_mm_unpacklo_epi8(c, z), z);
1344
return Vec4<int>(c);
1345
#elif PPSSPP_ARCH(ARM_NEON)
1346
uint8x8_t c = vreinterpret_u8_u32(vdup_n_u32(rgba));
1347
uint32x4_t u = vmovl_u16(vget_low_u16(vmovl_u8(c)));
1348
return Vec4<int>(vreinterpretq_s32_u32(u));
1349
#else
1350
return Vec4(rgba & 0xFF, (rgba >> 8) & 0xFF, (rgba >> 16) & 0xFF, (rgba >> 24) & 0xFF);
1351
#endif
1352
}
1353
1354
template<>
1355
__forceinline unsigned int Vec4<float>::ToRGBA() const
1356
{
1357
#if defined(_M_SSE)
1358
__m128i c = _mm_cvtps_epi32(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(255.0f)));
1359
__m128i c16 = _mm_packs_epi32(c, c);
1360
return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16));
1361
#elif PPSSPP_ARCH(ARM_NEON)
1362
uint16x4_t c16 = vqmovun_s32(vcvtq_s32_f32(vmulq_f32(vec, vdupq_n_f32(255.0f))));
1363
uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
1364
return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
1365
#else
1366
return (clamp_u8((int)(r() * 255.f)) << 0) |
1367
(clamp_u8((int)(g() * 255.f)) << 8) |
1368
(clamp_u8((int)(b() * 255.f)) << 16) |
1369
(clamp_u8((int)(a() * 255.f)) << 24);
1370
#endif
1371
}
1372
1373
template<>
1374
__forceinline unsigned int Vec4<int>::ToRGBA() const
1375
{
1376
#if defined(_M_SSE)
1377
__m128i c16 = _mm_packs_epi32(SAFE_M128I(ivec), SAFE_M128I(ivec));
1378
return _mm_cvtsi128_si32(_mm_packus_epi16(c16, c16));
1379
#elif PPSSPP_ARCH(ARM_NEON)
1380
uint16x4_t c16 = vqmovun_s32(ivec);
1381
uint8x8_t c8 = vqmovn_u16(vcombine_u16(c16, c16));
1382
return vget_lane_u32(vreinterpret_u32_u8(c8), 0);
1383
#else
1384
return clamp_u8(r()) | (clamp_u8(g()) << 8) | (clamp_u8(b()) << 16) | (clamp_u8(a()) << 24);
1385
#endif
1386
}
1387
1388
template<typename T>
1389
__forceinline void Vec4<T>::ToRGBA(u8 *rgba) const
1390
{
1391
*(u32 *)rgba = ToRGBA();
1392
}
1393
1394
#if defined(_M_SSE)
1395
// Specialized for SIMD optimization
1396
1397
// Vec3<float> operation
1398
template<>
1399
inline void Vec3<float>::operator += (const Vec3<float> &other) {
1400
vec = _mm_add_ps(SAFE_M128(vec), SAFE_M128(other.vec));
1401
}
1402
1403
template<>
1404
inline Vec3<float> Vec3<float>::operator + (const Vec3 &other) const {
1405
return Vec3<float>(_mm_add_ps(SAFE_M128(vec), SAFE_M128(other.vec)));
1406
}
1407
1408
template<>
1409
inline void Vec3<float>::operator -= (const Vec3<float> &other) {
1410
vec = _mm_sub_ps(SAFE_M128(vec), SAFE_M128(other.vec));
1411
}
1412
1413
template<>
1414
inline Vec3<float> Vec3<float>::operator - (const Vec3 &other) const {
1415
return Vec3<float>(_mm_sub_ps(SAFE_M128(vec), SAFE_M128(other.vec)));
1416
}
1417
1418
template<>
1419
inline Vec3<float> Vec3<float>::operator * (const Vec3 &other) const {
1420
return Vec3<float>(_mm_mul_ps(SAFE_M128(vec), SAFE_M128(other.vec)));
1421
}
1422
1423
template<> template<>
1424
inline Vec3<float> Vec3<float>::operator * (const float &other) const {
1425
return Vec3<float>(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(other)));
1426
}
1427
1428
// Vec4<int> operation
1429
template<>
1430
inline Vec4<int> Vec4<int>::operator + (const Vec4 &other) const {
1431
return Vec4<int>(_mm_add_epi32(SAFE_M128I(ivec), SAFE_M128I(other.ivec)));
1432
}
1433
1434
template<>
1435
inline Vec4<int> Vec4<int>::operator * (const Vec4 &other) const {
1436
__m128i a = SAFE_M128I(ivec);
1437
__m128i b = SAFE_M128I(other.ivec);
1438
// Intel in its immense wisdom decided that
1439
// SSE2 does not get _mm_mullo_epi32(),
1440
// so we do it this way. This is what clang does,
1441
// which seems about as good as it gets.
1442
__m128i m02 = _mm_mul_epu32(a, b);
1443
__m128i m13 = _mm_mul_epu32(
1444
_mm_shuffle_epi32(a, _MM_SHUFFLE(3, 3, 1, 1)),
1445
_mm_shuffle_epi32(b, _MM_SHUFFLE(3, 3, 1, 1)));
1446
__m128i ret = _mm_unpacklo_epi32(
1447
_mm_shuffle_epi32(m02, _MM_SHUFFLE(3, 2, 2, 0)),
1448
_mm_shuffle_epi32(m13, _MM_SHUFFLE(3, 2, 2, 0)));
1449
return Vec4<int>(ret);
1450
}
1451
1452
template<> template<>
1453
inline Vec4<int> Vec4<int>::operator * (const int &other) const {
1454
return (*this) * Vec4<int>(_mm_set1_epi32(other));
1455
}
1456
1457
template<>
1458
inline Vec4<int> Vec4<int>::operator | (const Vec4 &other) const {
1459
return Vec4<int>(_mm_or_si128(SAFE_M128I(ivec), SAFE_M128I(other.ivec)));
1460
}
1461
1462
template<>
1463
inline Vec4<int> Vec4<int>::operator & (const Vec4 &other) const {
1464
return Vec4<int>(_mm_and_si128(SAFE_M128I(ivec), SAFE_M128I(other.ivec)));
1465
}
1466
1467
// NOTE: modern GCC, clang, and MSVC are all ok with
1468
// non-compile-time-const amount for _mm_slli_epi32/_mm_srli_epi32.
1469
template<>
1470
inline Vec4<int> Vec4<int>::operator << (const int amount) const {
1471
return Vec4<int>(_mm_slli_epi32(SAFE_M128I(ivec), amount));
1472
}
1473
1474
template<>
1475
inline Vec4<int> Vec4<int>::operator >> (const int amount) const {
1476
return Vec4<int>(_mm_srli_epi32(SAFE_M128I(ivec), amount));
1477
}
1478
1479
// Vec4<float> operation
1480
template<>
1481
inline void Vec4<float>::operator += (const Vec4<float> &other) {
1482
vec = _mm_add_ps(SAFE_M128(vec), SAFE_M128(other.vec));
1483
}
1484
1485
template<>
1486
inline Vec4<float> Vec4<float>::operator + (const Vec4 &other) const {
1487
return Vec4<float>(_mm_add_ps(SAFE_M128(vec), SAFE_M128(other.vec)));
1488
}
1489
1490
template<>
1491
inline Vec4<float> Vec4<float>::operator * (const Vec4 &other) const {
1492
return Vec4<float>(_mm_mul_ps(SAFE_M128(vec), SAFE_M128(other.vec)));
1493
}
1494
1495
template<> template<>
1496
inline Vec4<float> Vec4<float>::operator * (const float &other) const {
1497
return Vec4<float>(_mm_mul_ps(SAFE_M128(vec), _mm_set_ps1(other)));
1498
}
1499
1500
// Vec3<float> cross product
1501
template<>
1502
inline Vec3<float> Cross(const Vec3<float> &a, const Vec3<float> &b)
1503
{
1504
#if PPSSPP_ARCH(X86)
1505
__m128 avec = _mm_loadu_ps(&a.x);
1506
__m128 bvec = _mm_loadu_ps(&b.x);
1507
#else
1508
__m128 avec = a.vec;
1509
__m128 bvec = b.vec;
1510
#endif
1511
const __m128 left = _mm_mul_ps(_mm_shuffle_ps(avec, avec, _MM_SHUFFLE(3, 0, 2, 1)), _mm_shuffle_ps(bvec, bvec, _MM_SHUFFLE(3, 1, 0, 2)));
1512
const __m128 right = _mm_mul_ps(_mm_shuffle_ps(avec, avec, _MM_SHUFFLE(3, 1, 0, 2)), _mm_shuffle_ps(bvec, bvec, _MM_SHUFFLE(3, 0, 2, 1)));
1513
return _mm_sub_ps(left, right);
1514
}
1515
#endif
1516
1517
}; // namespace Math3D
1518
1519
// linear interpolation via float: 0.0=begin, 1.0=end
// Computed as begin*(1-t) + end*t; t is not clamped, so values outside
// that range extrapolate.
template<typename X>
inline X Lerp(const X& begin, const X& end, const float t)
{
	return begin*(1.f-t) + end*t;
}
1525
1526
// linear interpolation via int: 0=begin, base=end
// Computed as (begin*(base-t) + end*t) / base; t is not clamped.
template<typename X, int base>
inline X LerpInt(const X& begin, const X& end, const int t)
{
	// base is the divisor below, so reject zero at compile time.
	static_assert(base != 0, "LerpInt: base must be non-zero");
	return (begin*(base-t) + end*t) / base;
}
1532
1533