GitHub Repository: hrydgard/ppsspp
Path: blob/master/Common/Math/SIMDHeaders.h
#pragma once

// SIMD headers
// Let's include these in one consistent way across the code base.
// Here we'll also add wrappers that paper over differences between different versions
// of an instruction set, like NEON vs ASIMD (64-bit).

#include "ppsspp_config.h"

#include <stdint.h>
#include <string.h>

#ifdef __clang__
// Weird how you can't just use #pragma in a macro.
#define DO_NOT_VECTORIZE_LOOP _Pragma("clang loop vectorize(disable)")
#else
#define DO_NOT_VECTORIZE_LOOP
#endif
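// Example (illustrative only; dst, src, count and bias are made-up names):
// place the macro directly before a loop that clang should not auto-vectorize.
//
//   DO_NOT_VECTORIZE_LOOP
//   for (int i = 0; i < count; i++)
//       dst[i] = src[i] + bias;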

#if PPSSPP_ARCH(SSE2)
#include <emmintrin.h>
#endif

#if PPSSPP_ARCH(ARM_NEON)
#if defined(_MSC_VER) && PPSSPP_ARCH(ARM64)
#include <arm64_neon.h>
#else
#include <arm_neon.h>
#endif
#endif

#if PPSSPP_ARCH(LOONGARCH64)
#if PPSSPP_ARCH(LOONGARCH64_LSX)
#include <lsxintrin.h>
#endif
#endif

// Basic types

#if PPSSPP_ARCH(ARM64_NEON)

// No special ones here.

#elif PPSSPP_ARCH(ARM_NEON)

// Compatibility wrappers making ARM64 NEON code run on ARM32.
// With optimization on, these should compile down to the optimal code.

static inline float32x4_t vmulq_laneq_f32(float32x4_t a, float32x4_t b, int lane) {
	switch (lane & 3) {
	case 0: return vmulq_lane_f32(a, vget_low_f32(b), 0);
	case 1: return vmulq_lane_f32(a, vget_low_f32(b), 1);
	case 2: return vmulq_lane_f32(a, vget_high_f32(b), 0);
	default: return vmulq_lane_f32(a, vget_high_f32(b), 1);
	}
}

static inline float32x4_t vmlaq_laneq_f32(float32x4_t a, float32x4_t b, float32x4_t c, int lane) {
	switch (lane & 3) {
	case 0: return vmlaq_lane_f32(a, b, vget_low_f32(c), 0);
	case 1: return vmlaq_lane_f32(a, b, vget_low_f32(c), 1);
	case 2: return vmlaq_lane_f32(a, b, vget_high_f32(c), 0);
	default: return vmlaq_lane_f32(a, b, vget_high_f32(c), 1);
	}
}

#define vfmaq_laneq_f32 vmlaq_laneq_f32

static inline uint32x4_t vcgezq_f32(float32x4_t v) {
	return vcgeq_f32(v, vdupq_n_f32(0.0f));
}
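
// Example (illustrative only; the function name is made up): with the wrappers
// above, ARM64-style "laneq" code can be written once and also build for 32-bit
// NEON. When the lane index is a compile-time constant, the switch folds away.
//
//   static inline float32x4_t ScaleByLane3(float32x4_t v, float32x4_t scale) {
//       return vmulq_laneq_f32(v, scale, 3);  // multiply all elements by scale.w
//   }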

#endif

#if PPSSPP_ARCH(SSE2)

#if defined __SSE4_2__
# define _M_SSE 0x402
#elif defined __SSE4_1__
# define _M_SSE 0x401
#elif defined __SSSE3__
# define _M_SSE 0x301
#elif defined __SSE3__
# define _M_SSE 0x300
#elif defined __SSE2__
# define _M_SSE 0x200
#elif !defined(__GNUC__) && (defined(_M_X64) || defined(_M_IX86))
# define _M_SSE 0x402
#endif
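
// Example (illustrative only; a and b are placeholder variables): code elsewhere
// can gate on the detected SSE level, e.g.
//
//   #if _M_SSE >= 0x401
//       __m128i prod = _mm_mullo_epi32(a, b);       // native SSE4.1 instruction
//   #else
//       __m128i prod = _mm_mullo_epi32_SSE2(a, b);  // SSE2 fallback defined below
//   #endif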

// These are SSE2 versions of SSE4.1 instructions, for compatibility and ease of
// writing code.
// We may later figure out how to use the appropriate ones depending on compile flags.

inline __m128i _mm_mullo_epi32_SSE2(const __m128i v0, const __m128i v1) {
	__m128i a13 = _mm_shuffle_epi32(v0, 0xF5);            // (-,a3,-,a1)
	__m128i b13 = _mm_shuffle_epi32(v1, 0xF5);            // (-,b3,-,b1)
	__m128i prod02 = _mm_mul_epu32(v0, v1);               // (-,a2*b2,-,a0*b0)
	__m128i prod13 = _mm_mul_epu32(a13, b13);             // (-,a3*b3,-,a1*b1)
	__m128i prod01 = _mm_unpacklo_epi32(prod02, prod13);  // (-,-,a1*b1,a0*b0)
	__m128i prod23 = _mm_unpackhi_epi32(prod02, prod13);  // (-,-,a3*b3,a2*b2)
	return _mm_unpacklo_epi64(prod01, prod23);
}
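
// Worked example (illustrative only; variable names are made up): elementwise
// 32-bit multiply on a plain SSE2 target, keeping the low 32 bits of each product.
//
//   __m128i a = _mm_set_epi32(4, 3, 2, 1);
//   __m128i b = _mm_set_epi32(40, 30, 20, 10);
//   __m128i p = _mm_mullo_epi32_SSE2(a, b);  // lanes, low to high: 10, 60, 120, 160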

// Unsigned 16-bit max/min: flip the sign bits so the signed max/min sees the
// unsigned ordering, then flip back.
inline __m128i _mm_max_epu16_SSE2(const __m128i v0, const __m128i v1) {
	return _mm_xor_si128(
		_mm_max_epi16(
			_mm_xor_si128(v0, _mm_set1_epi16((int16_t)0x8000)),
			_mm_xor_si128(v1, _mm_set1_epi16((int16_t)0x8000))),
		_mm_set1_epi16((int16_t)0x8000));
}

inline __m128i _mm_min_epu16_SSE2(const __m128i v0, const __m128i v1) {
	return _mm_xor_si128(
		_mm_min_epi16(
			_mm_xor_si128(v0, _mm_set1_epi16((int16_t)0x8000)),
			_mm_xor_si128(v1, _mm_set1_epi16((int16_t)0x8000))),
		_mm_set1_epi16((int16_t)0x8000));
}
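
// Worked example (illustrative): unsigned max of 0xFFFF and 0x0001.
//   Interpreted as signed, these are -1 and 1, so a plain _mm_max_epi16 would
//   wrongly pick 0x0001. After XOR with 0x8000 they become 0x7FFF and 0x8001
//   (= -32767); the signed max picks 0x7FFF, and XOR with 0x8000 again restores
//   0xFFFF, the correct unsigned maximum.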

// SSE2 replacement for half of a _mm_packus_epi32 but without the saturation.
inline __m128i _mm_packu_epi32_SSE2(const __m128i v0) {
	__m128i temp = _mm_shufflelo_epi16(v0, _MM_SHUFFLE(3, 3, 2, 0));
	__m128 temp2 = _mm_castsi128_ps(_mm_shufflehi_epi16(temp, _MM_SHUFFLE(3, 3, 2, 0)));
	return _mm_castps_si128(_mm_shuffle_ps(temp2, temp2, _MM_SHUFFLE(3, 3, 2, 0)));
}

#define _mm_splat_lane_ps(v, l) _mm_shuffle_ps((v), (v), _MM_SHUFFLE(l, l, l, l))
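
// Example (illustrative only; v is a made-up name): broadcast one lane of a float
// vector to all four lanes. The lane index must be a compile-time constant.
//
//   __m128 splat_y = _mm_splat_lane_ps(v, 1);  // { v.y, v.y, v.y, v.y }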

#ifdef __cplusplus

alignas(16) static const uint32_t g_sign32[4] = { 0x00008000, 0x00008000, 0x00008000, 0x00008000 };
alignas(16) static const uint32_t g_sign16[4] = { 0x80008000, 0x80008000, 0x80008000, 0x80008000 };

// Alternate solution to the above, not sure if faster or slower.
// SSE2 replacement for half of _mm_packus_epi32 but without the saturation.
// Not ideal! pshufb would make this faster but that's SSSE3.
inline __m128i _mm_packu1_epi32_SSE2(const __m128i v0) {
	// Toggle the sign bit, pack, then toggle back.
	__m128i toggled = _mm_sub_epi32(v0, _mm_load_si128((const __m128i *)g_sign32));
	__m128i temp = _mm_packs_epi32(toggled, toggled);
	__m128i restored = _mm_add_epi16(temp, _mm_load_si128((const __m128i *)g_sign16));
	return restored;
}
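
// Worked example (illustrative): packing the 32-bit value 0xC000 (49152) into 16 bits
// without losing it to saturation.
//   0xC000 - 0x8000           = 0x4000  (now inside the signed 16-bit range, no clamping)
//   _mm_packs_epi32           -> 0x4000  (signed saturation is a no-op here)
//   0x4000 + 0x8000 (16-bit)  = 0xC000  (the original low 16 bits, unchanged)
// A plain _mm_packs_epi32 on the raw value would instead have clamped 0xC000 to 0x7FFF.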

#endif

// SSE2 replacement for the entire _mm_packus_epi32 but without the saturation.
// Not ideal! pshufb would make this faster but that's SSSE3.
inline __m128i _mm_packu2_epi32_SSE2(const __m128i v0, const __m128i v1) {
	__m128i a0 = _mm_shufflelo_epi16(v0, _MM_SHUFFLE(3, 3, 2, 0));
	__m128 packed0 = _mm_castsi128_ps(_mm_shufflehi_epi16(a0, _MM_SHUFFLE(3, 3, 2, 0)));
	__m128i a1 = _mm_shufflelo_epi16(v1, _MM_SHUFFLE(3, 3, 2, 0));
	__m128 packed1 = _mm_castsi128_ps(_mm_shufflehi_epi16(a1, _MM_SHUFFLE(3, 3, 2, 0)));
	return _mm_castps_si128(_mm_shuffle_ps(packed0, packed1, _MM_SHUFFLE(2, 0, 2, 0)));
}

// The below are not real SSE instructions in any generation, but should exist.

// Return 0xFFFF where x <= y, else 0x0000.
inline __m128i _mm_cmple_epu16(__m128i x, __m128i y) {
	return _mm_cmpeq_epi16(_mm_subs_epu16(x, y), _mm_setzero_si128());
}

// Return 0xFFFF where x >= y, else 0x0000.
inline __m128i _mm_cmpge_epu16(__m128i x, __m128i y) {
	return _mm_cmple_epu16(y, x);
}

// Return 0xFFFF where x > y, else 0x0000.
inline __m128i _mm_cmpgt_epu16(__m128i x, __m128i y) {
	return _mm_andnot_si128(_mm_cmpeq_epi16(x, y), _mm_cmple_epu16(y, x));
}

// Return 0xFFFF where x < y, else 0x0000.
inline __m128i _mm_cmplt_epu16(__m128i x, __m128i y) {
	return _mm_cmpgt_epu16(y, x);
}
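
// Example (illustrative only; v, lo and hi are made-up names): the results are
// full-width lane masks, so they combine directly with _mm_and_si128 / _mm_or_si128,
// e.g. to test whether unsigned 16-bit values fall inside a range:
//
//   __m128i in_range = _mm_and_si128(_mm_cmpge_epu16(v, lo), _mm_cmple_epu16(v, hi));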

#endif