GitHub Repository: hrydgard/ppsspp
Path: blob/master/Common/Math/SIMDHeaders.h
#pragma once

// SIMD headers
// Let's include these in one consistent way across the code base.
// Here we'll also add wrappers that paper over differences between different versions
// of an instruction set, like NEON vs ASIMD (64-bit).

#include "ppsspp_config.h"

#include <stdint.h>
#include <string.h>

#ifdef __clang__
// Weird how you can't just use #pragma in a macro.
#define DO_NOT_VECTORIZE_LOOP _Pragma("clang loop vectorize(disable)")
#else
#define DO_NOT_VECTORIZE_LOOP
#endif
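// Example (illustrative only; dst, src, count and bias are made-up names):
// place the macro directly before a loop that clang should not auto-vectorize.
//
//   DO_NOT_VECTORIZE_LOOP
//   for (int i = 0; i < count; i++)
//       dst[i] = src[i] + bias;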

#if PPSSPP_ARCH(SSE2)
#include <emmintrin.h>
#endif

#if PPSSPP_ARCH(ARM_NEON)
#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
#endif

#if PPSSPP_ARCH(LOONGARCH64)
#if PPSSPP_ARCH(LOONGARCH64_LSX)
#include <lsxintrin.h>
#endif
#endif

// Basic types

#if PPSSPP_ARCH(ARM64_NEON)

// No special ones here.

#elif PPSSPP_ARCH(ARM_NEON)

// Compatibility wrappers making ARM64 NEON code run on ARM32.
// With optimization on, these should compile down to the optimal code.

static inline float32x4_t vmulq_laneq_f32(float32x4_t a, float32x4_t b, int lane) {
	switch (lane & 3) {
	case 0: return vmulq_lane_f32(a, vget_low_f32(b), 0);
	case 1: return vmulq_lane_f32(a, vget_low_f32(b), 1);
	case 2: return vmulq_lane_f32(a, vget_high_f32(b), 0);
	default: return vmulq_lane_f32(a, vget_high_f32(b), 1);
	}
}

static inline float32x4_t vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t c, int lane) {
	switch (lane & 3) {
	case 0: return vmlaq_lane_f32(a, b, vget_low_f32(c), 0);
	case 1: return vmlaq_lane_f32(a, b, vget_low_f32(c), 1);
	case 2: return vmlaq_lane_f32(a, b, vget_high_f32(c), 0);
	default: return vmlaq_lane_f32(a, b, vget_high_f32(c), 1);
	}
}

#define vfmaq_laneq_f32 vmlaq_laneq_f32

static inline uint32x4_t vcgezq_f32(float32x4_t v) {
	return vcgeq_f32(v, vdupq_n_f32(0.0f));
}
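
// Example (illustrative only; the function name is made up): with the wrappers
// above, ARM64-style "laneq" code can be written once and also build for 32-bit
// NEON. When the lane index is a compile-time constant, the switch folds away.
//
//   static inline float32x4_t ScaleByLane3(float32x4_t v, float32x4_t scale) {
//       return vmulq_laneq_f32(v, scale, 3);  // multiply all elements by scale.w
//   }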

#endif

#if PPSSPP_ARCH(SSE2)

#if defined __SSE4_2__
# define _M_SSE 0x402
#elif defined __SSE4_1__
# define _M_SSE 0x401
#elif defined __SSSE3__
# define _M_SSE 0x301
#elif defined __SSE3__
# define _M_SSE 0x300
#elif defined __SSE2__
# define _M_SSE 0x200
#elif !defined(__GNUC__) && (defined(_M_X64) || defined(_M_IX86))
# define _M_SSE 0x402
#endif
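
// Example (illustrative only; a and b are placeholder variables): code elsewhere
// can gate on the detected SSE level, e.g.
//
//   #if _M_SSE >= 0x401
//       __m128i prod = _mm_mullo_epi32(a, b);       // native SSE4.1 instruction
//   #else
//       __m128i prod = _mm_mullo_epi32_SSE2(a, b);  // SSE2 fallback defined below
//   #endif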

// These are SSE2 versions of SSE4.1 instructions, for compatibility and ease of
// writing code.
// We may later figure out how to use the appropriate ones depending on compile flags.

inline __m128i _mm_mullo_epi32_SSE2(const __m128i v0, const __m128i v1) {
	__m128i a13 = _mm_shuffle_epi32(v0, 0xF5);            // (-,a3,-,a1)
	__m128i b13 = _mm_shuffle_epi32(v1, 0xF5);            // (-,b3,-,b1)
	__m128i prod02 = _mm_mul_epu32(v0, v1);               // (-,a2*b2,-,a0*b0)
	__m128i prod13 = _mm_mul_epu32(a13, b13);             // (-,a3*b3,-,a1*b1)
	__m128i prod01 = _mm_unpacklo_epi32(prod02, prod13);  // (-,-,a1*b1,a0*b0)
	__m128i prod23 = _mm_unpackhi_epi32(prod02, prod13);  // (-,-,a3*b3,a2*b2)
	return _mm_unpacklo_epi64(prod01, prod23);
}
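
// Worked example (illustrative only; variable names are made up): elementwise
// 32-bit multiply on a plain SSE2 target, keeping the low 32 bits of each product.
//
//   __m128i a = _mm_set_epi32(4, 3, 2, 1);
//   __m128i b = _mm_set_epi32(40, 30, 20, 10);
//   __m128i p = _mm_mullo_epi32_SSE2(a, b);  // lanes, low to high: 10, 60, 120, 160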

// Unsigned 16-bit max/min: flip the sign bits so the signed max/min sees the
// unsigned ordering, then flip back.
inline __m128i _mm_max_epu16_SSE2(const __m128i v0, const __m128i v1) {
	return _mm_xor_si128(
		_mm_max_epi16(
			_mm_xor_si128(v0, _mm_set1_epi16((int16_t)0x8000)),
			_mm_xor_si128(v1, _mm_set1_epi16((int16_t)0x8000))),
		_mm_set1_epi16((int16_t)0x8000));
}

inline __m128i _mm_min_epu16_SSE2(const __m128i v0, const __m128i v1) {
	return _mm_xor_si128(
		_mm_min_epi16(
			_mm_xor_si128(v0, _mm_set1_epi16((int16_t)0x8000)),
			_mm_xor_si128(v1, _mm_set1_epi16((int16_t)0x8000))),
		_mm_set1_epi16((int16_t)0x8000));
}
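
// Worked example (illustrative): unsigned max of 0xFFFF and 0x0001.
//   Interpreted as signed, these are -1 and 1, so a plain _mm_max_epi16 would
//   wrongly pick 0x0001. After XOR with 0x8000 they become 0x7FFF and 0x8001
//   (= -32767); the signed max picks 0x7FFF, and XOR with 0x8000 again restores
//   0xFFFF, the correct unsigned maximum.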

// SSE2 replacement for half of a _mm_packus_epi32 but without the saturation.
inline __m128i _mm_packu_epi32_SSE2(const __m128i v0) {
	__m128i temp = _mm_shufflelo_epi16(v0, _MM_SHUFFLE(3, 3, 2, 0));
	__m128 temp2 = _mm_castsi128_ps(_mm_shufflehi_epi16(temp, _MM_SHUFFLE(3, 3, 2, 0)));
	return _mm_castps_si128(_mm_shuffle_ps(temp2, temp2, _MM_SHUFFLE(3, 3, 2, 0)));
}

#define _mm_splat_lane_ps(v, l) _mm_shuffle_ps((v), (v), _MM_SHUFFLE(l, l, l, l))
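
// Example (illustrative only; v is a made-up name): broadcast one lane of a float
// vector to all four lanes. The lane index must be a compile-time constant.
//
//   __m128 splat_y = _mm_splat_lane_ps(v, 1);  // { v.y, v.y, v.y, v.y }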

#ifdef __cplusplus

alignas(16) static const uint32_t g_sign32[4] = { 0x00008000, 0x00008000, 0x00008000, 0x00008000 };
alignas(16) static const uint32_t g_sign16[4] = { 0x80008000, 0x80008000, 0x80008000, 0x80008000 };

// Alternate solution to the above, not sure if faster or slower.
// SSE2 replacement for half of _mm_packus_epi32 but without the saturation.
// Not ideal! pshufb would make this faster but that's SSSE3.
inline __m128i _mm_packu1_epi32_SSE2(const __m128i v0) {
	// Toggle the sign bit, pack, then toggle back.
	__m128i toggled = _mm_sub_epi32(v0, _mm_load_si128((const __m128i *)g_sign32));
	__m128i temp = _mm_packs_epi32(toggled, toggled);
	__m128i restored = _mm_add_epi16(temp, _mm_load_si128((const __m128i *)g_sign16));
	return restored;
}
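
// Worked example (illustrative): packing the 32-bit value 0xC000 (49152) into 16 bits
// without losing it to saturation.
//   0xC000 - 0x8000           = 0x4000  (now inside the signed 16-bit range, no clamping)
//   _mm_packs_epi32           -> 0x4000  (signed saturation is a no-op here)
//   0x4000 + 0x8000 (16-bit)  = 0xC000  (the original low 16 bits, unchanged)
// A plain _mm_packs_epi32 on the raw value would instead have clamped 0xC000 to 0x7FFF.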

#endif

// SSE2 replacement for the entire _mm_packus_epi32 but without the saturation.
// Not ideal! pshufb would make this faster but that's SSSE3.
inline __m128i _mm_packu2_epi32_SSE2(const __m128i v0, const __m128i v1) {
	__m128i a0 = _mm_shufflelo_epi16(v0, _MM_SHUFFLE(3, 3, 2, 0));
	__m128 packed0 = _mm_castsi128_ps(_mm_shufflehi_epi16(a0, _MM_SHUFFLE(3, 3, 2, 0)));
	__m128i a1 = _mm_shufflelo_epi16(v1, _MM_SHUFFLE(3, 3, 2, 0));
	__m128 packed1 = _mm_castsi128_ps(_mm_shufflehi_epi16(a1, _MM_SHUFFLE(3, 3, 2, 0)));
	return _mm_castps_si128(_mm_shuffle_ps(packed0, packed1, _MM_SHUFFLE(2, 0, 2, 0)));
}

// The below are not real SSE instructions in any generation, but should exist.

// Return 0xFFFF where x <= y, else 0x0000.
inline __m128i _mm_cmple_epu16(__m128i x, __m128i y) {
	return _mm_cmpeq_epi16(_mm_subs_epu16(x, y), _mm_setzero_si128());
}

// Return 0xFFFF where x >= y, else 0x0000.
inline __m128i _mm_cmpge_epu16(__m128i x, __m128i y) {
	return _mm_cmple_epu16(y, x);
}

// Return 0xFFFF where x > y, else 0x0000.
inline __m128i _mm_cmpgt_epu16(__m128i x, __m128i y) {
	return _mm_andnot_si128(_mm_cmpeq_epi16(x, y), _mm_cmple_epu16(y, x));
}

// Return 0xFFFF where x < y, else 0x0000.
inline __m128i _mm_cmplt_epu16(__m128i x, __m128i y) {
	return _mm_cmpgt_epu16(y, x);
}
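
// Example (illustrative only; v, lo and hi are made-up names): the results are
// full-width lane masks, so they combine directly with _mm_and_si128 / _mm_or_si128,
// e.g. to test whether unsigned 16-bit values fall inside a range:
//
//   __m128i in_range = _mm_and_si128(_mm_cmpge_epu16(v, lo), _mm_cmple_epu16(v, hi));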

#endif