#ifndef SSE2NEON_H
#define SSE2NEON_H

// This header file provides a simple API translation layer
// between SSE intrinsics and their corresponding Arm/AArch64 NEON versions
//
// Contributors to this work are:
//   John W. Ratcliff <[email protected]>
//   Brandon Rowlett <[email protected]>
//   Ken Fast <[email protected]>
//   Eric van Beurden <[email protected]>
//   Alexander Potylitsin <[email protected]>
//   Hasindu Gamaarachchi <[email protected]>
//   Jim Huang <[email protected]>
//   Mark Cheng <[email protected]>
//   Malcolm James MacLeod <[email protected]>
//   Devin Hussey (easyaspi314) <[email protected]>
//   Sebastian Pop <[email protected]>
//   Developer Ecosystem Engineering <[email protected]>
//   Danila Kutenin <[email protected]>
//   François Turban (JishinMaster) <[email protected]>
//   Pei-Hsuan Hung <[email protected]>
//   Yang-Hao Yuan <[email protected]>
//   Syoyo Fujita <[email protected]>
//   Brecht Van Lommel <[email protected]>
//   Jonathan Hue <[email protected]>
//   Cuda Chen <[email protected]>
//   Aymen Qader <[email protected]>
//   Anthony Roberts <[email protected]>

/*
 * sse2neon is freely redistributable under the MIT License.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/* Tunable configurations */

/* Enable precise implementation of math operations
 * This slows down the computation a bit, but gives results consistent with
 * x86 SSE (e.g. it can fix a hole or NaN pixel in a rendering result).
 */
/* _mm_min|max_ps|ss|pd|sd */
#ifndef SSE2NEON_PRECISE_MINMAX
#define SSE2NEON_PRECISE_MINMAX (0)
#endif
/* _mm_rcp_ps and _mm_div_ps */
#ifndef SSE2NEON_PRECISE_DIV
#define SSE2NEON_PRECISE_DIV (0)
#endif
/* _mm_sqrt_ps and _mm_rsqrt_ps */
#ifndef SSE2NEON_PRECISE_SQRT
#define SSE2NEON_PRECISE_SQRT (0)
#endif
/* _mm_dp_pd */
#ifndef SSE2NEON_PRECISE_DP
#define SSE2NEON_PRECISE_DP (0)
#endif

/* Enable inclusion of windows.h on MSVC platforms
 * This makes _mm_clflush functional on windows, as there is no builtin.
 */
#ifndef SSE2NEON_INCLUDE_WINDOWS_H
#define SSE2NEON_INCLUDE_WINDOWS_H (0)
#endif

/* compiler specific definitions */
#if defined(__GNUC__) || defined(__clang__)
#pragma push_macro("FORCE_INLINE")
#pragma push_macro("ALIGN_STRUCT")
#define FORCE_INLINE static inline __attribute__((always_inline))
#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
#define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
#elif defined(_MSC_VER)
#if _MSVC_TRADITIONAL
#error Using the traditional MSVC preprocessor is not supported! Use /Zc:preprocessor instead.
#endif
#ifndef FORCE_INLINE
#define FORCE_INLINE static inline
#endif
#ifndef ALIGN_STRUCT
#define ALIGN_STRUCT(x) __declspec(align(x))
#endif
#define _sse2neon_likely(x) (x)
#define _sse2neon_unlikely(x) (x)
#else
#pragma message("Macro name collisions may happen with unsupported compilers.")
#endif

/* C language does not allow initializing a variable with a function call. */
#ifdef __cplusplus
#define _sse2neon_const static const
#else
#define _sse2neon_const const
#endif

#include <stdint.h>
#include <stdlib.h>

#if defined(_WIN32)
/* Definitions for _mm_{malloc,free} are provided by <malloc.h>
 * from both MinGW-w64 and MSVC.
 */
#define SSE2NEON_ALLOC_DEFINED
#endif

/* If using MSVC */
#ifdef _MSC_VER
#include <intrin.h>
#if SSE2NEON_INCLUDE_WINDOWS_H
#include <processthreadsapi.h>
#include <windows.h>
#endif

#if !defined(__cplusplus)
#error sse2neon only supports C++ compilation with this compiler
#endif

#ifdef SSE2NEON_ALLOC_DEFINED
#include <malloc.h>
#endif

#if (defined(_M_AMD64) || defined(__x86_64__)) || \
    (defined(_M_ARM64) || defined(__arm64__))
#define SSE2NEON_HAS_BITSCAN64
#endif
#endif

#if defined(__GNUC__) || defined(__clang__)
#define _sse2neon_define0(type, s, body) \
    __extension__({                      \
        type _a = (s);                   \
        body                             \
    })
#define _sse2neon_define1(type, s, body) \
    __extension__({                      \
        type _a = (s);                   \
        body                             \
    })
#define _sse2neon_define2(type, a, b, body) \
    __extension__({                         \
        type _a = (a), _b = (b);            \
        body                                \
    })
#define _sse2neon_return(ret) (ret)
#else
#define _sse2neon_define0(type, a, body) [=](type _a) { body }(a)
#define _sse2neon_define1(type, a, body) [](type _a) { body }(a)
#define _sse2neon_define2(type, a, b, body) \
    [](type _a, type _b) { body }((a), (b))
#define _sse2neon_return(ret) return ret
#endif

#define _sse2neon_init(...) \
    {                       \
        __VA_ARGS__         \
    }

/* Compiler barrier */
#if defined(_MSC_VER)
#define SSE2NEON_BARRIER() _ReadWriteBarrier()
#else
#define SSE2NEON_BARRIER()                     \
    do {                                       \
        __asm__ __volatile__("" ::: "memory"); \
        (void) 0;                              \
    } while (0)
#endif

/* Memory barriers
 * __atomic_thread_fence does not include a compiler barrier; instead,
 * the barrier is part of __atomic_load/__atomic_store's "volatile-like"
 * semantics.
 */
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
#include <stdatomic.h>
#endif

FORCE_INLINE void _sse2neon_smp_mb(void)
{
    SSE2NEON_BARRIER();
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \
    !defined(__STDC_NO_ATOMICS__)
    atomic_thread_fence(memory_order_seq_cst);
#elif defined(__GNUC__) || defined(__clang__)
    __atomic_thread_fence(__ATOMIC_SEQ_CST);
#else /* MSVC */
    __dmb(_ARM64_BARRIER_ISH);
#endif
}
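// Illustrative note (added in this copy; not part of the upstream sse2neon
// header): _sse2neon_smp_mb() combines a compiler barrier with a sequentially
// consistent hardware fence, the kind of full barrier an _mm_mfence-style
// wrapper needs on Arm's weaker memory model, e.g.
//   data = payload;      // plain store (hypothetical variables)
//   _sse2neon_smp_mb();  // make the store visible before the flag below
//   ready = 1;           // hypothetical flag another thread polls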

/* Architecture-specific build options */
/* FIXME: #pragma GCC push_options is only available on GCC */
#if defined(__GNUC__)
#if defined(__arm__) && __ARM_ARCH == 7
/* According to ARM C Language Extensions Architecture specification,
 * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
 * architecture supported.
 */
#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
#endif
#if !defined(__clang__)
#pragma GCC push_options
#pragma GCC target("fpu=neon")
#endif
#elif defined(__aarch64__) || defined(_M_ARM64)
#if !defined(__clang__) && !defined(_MSC_VER)
#pragma GCC push_options
#pragma GCC target("+simd")
#endif
#elif __ARM_ARCH == 8
#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
#error \
    "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON."
#endif
#if !defined(__clang__) && !defined(_MSC_VER)
#pragma GCC push_options
#endif
#else
#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
#endif
#endif

#include <arm_neon.h>
#if (!defined(__aarch64__) && !defined(_M_ARM64)) && (__ARM_ARCH == 8)
#if defined __has_include && __has_include(<arm_acle.h>)
#include <arm_acle.h>
#endif
#endif

/* Apple Silicon cache lines are double the size of what is commonly used by
 * Intel, AMD and other Arm microarchitectures.
 * From sysctl -a on Apple M1:
 * hw.cachelinesize: 128
 */
#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__))
#define SSE2NEON_CACHELINE_SIZE 128
#else
#define SSE2NEON_CACHELINE_SIZE 64
#endif

/* Rounding functions require either Aarch64 instructions or libm fallback */
#if !defined(__aarch64__) && !defined(_M_ARM64)
#include <math.h>
#endif

/* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only
 * or even not accessible in user mode.
 * To write to or access these registers in user mode,
 * we have to perform a syscall instead.
 */
#if (!defined(__aarch64__) && !defined(_M_ARM64))
#include <sys/time.h>
#endif

/* "__has_builtin" can be used to query support for built-in functions
 * provided by gcc/clang and other compilers that support it.
 */
#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
/* Compatibility with gcc <= 9 */
#if defined(__GNUC__) && (__GNUC__ <= 9)
#define __has_builtin(x) HAS##x
#define HAS__builtin_popcount 1
#define HAS__builtin_popcountll 1

// __builtin_shuffle introduced in GCC 4.7.0
#if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7))
#define HAS__builtin_shuffle 1
#else
#define HAS__builtin_shuffle 0
#endif

#define HAS__builtin_shufflevector 0
#define HAS__builtin_nontemporal_store 0
#else
#define __has_builtin(x) 0
#endif
#endif

/**
 * MACRO for shuffle parameter for _mm_shuffle_ps().
 * Argument fp3 is a digit[0123] that represents the fp from argument "b"
 * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
 * for fp2 in result. fp1 is a digit[0123] that represents the fp from
 * argument "a" of mm_shuffle_ps that will be placed in fp1 of result.
 * fp0 is the same for fp0 of result.
 */
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
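// Usage sketch (added in this copy; not part of the upstream sse2neon header):
// _MM_SHUFFLE packs four 2-bit lane selectors into a single immediate, highest
// lane first, which is what _mm_shuffle_ps()/_mm_shuffle_epi32() expect as
// their imm8 argument. For example:
//   _MM_SHUFFLE(3, 2, 1, 0) == 0xE4  // identity: dst = {a[0], a[1], b[2], b[3]}
//   _MM_SHUFFLE(0, 1, 2, 3) == 0x1B  // reversed lane order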

#if __has_builtin(__builtin_shufflevector)
#define _sse2neon_shuffle(type, a, b, ...) \
    __builtin_shufflevector(a, b, __VA_ARGS__)
#elif __has_builtin(__builtin_shuffle)
#define _sse2neon_shuffle(type, a, b, ...) \
    __extension__({                        \
        type tmp = {__VA_ARGS__};          \
        __builtin_shuffle(a, b, tmp);      \
    })
#endif

#ifdef _sse2neon_shuffle
#define vshuffle_s16(a, b, ...) _sse2neon_shuffle(int16x4_t, a, b, __VA_ARGS__)
#define vshuffleq_s16(a, b, ...) _sse2neon_shuffle(int16x8_t, a, b, __VA_ARGS__)
#define vshuffle_s32(a, b, ...) _sse2neon_shuffle(int32x2_t, a, b, __VA_ARGS__)
#define vshuffleq_s32(a, b, ...) _sse2neon_shuffle(int32x4_t, a, b, __VA_ARGS__)
#define vshuffle_s64(a, b, ...) _sse2neon_shuffle(int64x1_t, a, b, __VA_ARGS__)
#define vshuffleq_s64(a, b, ...) _sse2neon_shuffle(int64x2_t, a, b, __VA_ARGS__)
#endif

/* Rounding mode macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04
#define _MM_FROUND_NO_EXC 0x08
#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
#define _MM_ROUND_NEAREST 0x0000
#define _MM_ROUND_DOWN 0x2000
#define _MM_ROUND_UP 0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000
/* Flush zero mode macros. */
#define _MM_FLUSH_ZERO_MASK 0x8000
#define _MM_FLUSH_ZERO_ON 0x8000
#define _MM_FLUSH_ZERO_OFF 0x0000
/* Denormals are zeros mode macros. */
#define _MM_DENORMALS_ZERO_MASK 0x0040
#define _MM_DENORMALS_ZERO_ON 0x0040
#define _MM_DENORMALS_ZERO_OFF 0x0000

/* indicate immediate constant argument in a given range */
#define __constrange(a, b) const

/* A few intrinsics accept traditional data types like ints or floats, but
 * most operate on data types that are specific to SSE.
 * If a vector type ends in d, it contains doubles, and if it does not have
 * a suffix, it contains floats. An integer vector type can contain any type
 * of integer, from chars to shorts to unsigned long longs.
 */
typedef int64x1_t __m64;
typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
// On ARM 32-bit architecture, the float64x2_t is not supported.
// The data type __m128d should be represented in a different way for related
// intrinsic conversion.
#if defined(__aarch64__) || defined(_M_ARM64)
typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
#else
typedef float32x4_t __m128d;
#endif
typedef int64x2_t __m128i; /* 128-bit vector containing integers */

// __int64 is defined in the Intrinsics Guide which maps to different datatype
// in different data model
#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
#if (defined(__x86_64__) || defined(__i386__))
#define __int64 long long
#else
#define __int64 int64_t
#endif
#endif
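// Illustrative note (added in this copy; not part of the upstream sse2neon
// header): because __m128 and __m128i are plain NEON vector typedefs, values
// can move between SSE-style and native NEON code using nothing more than the
// vreinterpretq_* helpers defined below, e.g.
//   __m128 v = _mm_set_ps1(1.0f);               // SSE-style constructor
//   float32x4_t n = vreinterpretq_f32_m128(v);  // view it as a NEON vector
//   n = vmulq_n_f32(n, 2.0f);                   // operate on it with raw NEON
//   v = vreinterpretq_m128_f32(n);              // and hand it back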

/* type-safe casting between types */

#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
#define vreinterpretq_m128_f32(x) (x)
#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)

#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)

#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)

#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
#define vreinterpretq_f32_m128(x) (x)
#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)

#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)

#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)

#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
#define vreinterpretq_m128i_s64(x) (x)

#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)

#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)

#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
#define vreinterpretq_s64_m128i(x) (x)

#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)

#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
#define vreinterpret_m64_s64(x) (x)

#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)

#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)

#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)

#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
#define vreinterpret_s64_m64(x) (x)

#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)

#if defined(__aarch64__) || defined(_M_ARM64)
#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)

#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)

#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
#define vreinterpretq_m128d_f64(x) (x)

#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)

#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)
#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)

#define vreinterpretq_f64_m128d(x) (x)
#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
#else
#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)

#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)

#define vreinterpretq_m128d_f32(x) (x)

#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)

#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)

#define vreinterpretq_f32_m128d(x) (x)
#endif

// A struct is defined in this header file called 'SIMDVec' which can be used
// by applications which attempt to access the contents of an __m128 struct
// directly. It is important to note that accessing the __m128 struct directly
// is bad coding practice by Microsoft: @see:
// https://learn.microsoft.com/en-us/cpp/cpp/m128
//
// However, some legacy source code may try to access the contents of an __m128
// struct directly so the developer can use the SIMDVec as an alias for it. Any
// casting must be done manually by the developer, as you cannot cast or
// otherwise alias the base NEON data type for intrinsic operations.
//
// This union is intended to allow direct access to an __m128 variable using the
// names that the MSVC compiler provides. This union should really only be used
// when trying to access the members of the vector as integer values. GCC/clang
// allow native access to the float members through a simple array access
// operator (in C since 4.6, in C++ since 4.8).
//
// Ideally direct accesses to SIMD vectors should not be used since they can
// cause a performance hit. If it really is needed however, the original __m128
// variable can be aliased with a pointer to this union and used to access
// individual components. The use of this union should be hidden behind a macro
// that is used throughout the codebase to access the members instead of always
// declaring this type of variable.
typedef union ALIGN_STRUCT(16) SIMDVec {
    float m128_f32[4];     // as floats - DON'T USE. Added for convenience.
    int8_t m128_i8[16];    // as signed 8-bit integers.
    int16_t m128_i16[8];   // as signed 16-bit integers.
    int32_t m128_i32[4];   // as signed 32-bit integers.
    int64_t m128_i64[2];   // as signed 64-bit integers.
    uint8_t m128_u8[16];   // as unsigned 8-bit integers.
    uint16_t m128_u16[8];  // as unsigned 16-bit integers.
    uint32_t m128_u32[4];  // as unsigned 32-bit integers.
    uint64_t m128_u64[2];  // as unsigned 64-bit integers.
} SIMDVec;

// casting using SIMDVec
#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
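// Usage sketch (added in this copy; not part of the upstream sse2neon header):
// pulling individual lanes out of an __m128i through the SIMDVec aliasing
// macros above. Prefer the proper extract intrinsics where possible; this is
// only for legacy code that insists on member-style access.
//   __m128i v = _mm_set_epi32(3, 2, 1, 0);
//   uint32_t lane0 = vreinterpretq_nth_u32_m128i(v, 0);  // == 0
//   uint64_t hi64 = vreinterpretq_nth_u64_m128i(v, 1);   // upper 64 bits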

/* SSE macros */
#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode
#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode
#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode
#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode
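// Usage sketch (added in this copy; not part of the upstream sse2neon header):
// these macros mirror the x86 MXCSR helpers, so existing SSE code such as
//   _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);  // flush denormal results to 0
//   unsigned int rm = _MM_GET_ROUNDING_MODE();   // e.g. _MM_ROUND_NEAREST
// keeps working, with the implementations mapping onto the Arm FPCR.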

// Function declaration
// SSE
FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void);
FORCE_INLINE __m128 _mm_move_ss(__m128, __m128);
FORCE_INLINE __m128 _mm_or_ps(__m128, __m128);
FORCE_INLINE __m128 _mm_set_ps1(float);
FORCE_INLINE __m128 _mm_setzero_ps(void);
// SSE2
FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i);
FORCE_INLINE __m128i _mm_castps_si128(__m128);
FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i);
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128);
FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d);
FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i);
FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);
FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t);
FORCE_INLINE __m128d _mm_set_pd(double, double);
FORCE_INLINE __m128i _mm_set1_epi32(int);
FORCE_INLINE __m128i _mm_setzero_si128(void);
// SSE4.1
FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
FORCE_INLINE __m128 _mm_ceil_ps(__m128);
FORCE_INLINE __m128d _mm_floor_pd(__m128d);
FORCE_INLINE __m128 _mm_floor_ps(__m128);
FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
FORCE_INLINE __m128 _mm_round_ps(__m128, int);
// SSE4.2
FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);

/* Backwards compatibility for compilers with lack of specific type support */

// Older gcc does not define vld1q_u8_x4 type
#if defined(__GNUC__) && !defined(__clang__) &&                        \
    ((__GNUC__ <= 13 && defined(__arm__)) ||                           \
     (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
     (__GNUC__ <= 9 && defined(__aarch64__)))
FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
{
    uint8x16x4_t ret;
    ret.val[0] = vld1q_u8(p + 0);
    ret.val[1] = vld1q_u8(p + 16);
    ret.val[2] = vld1q_u8(p + 32);
    ret.val[3] = vld1q_u8(p + 48);
    return ret;
}
#else
// Wraps vld1q_u8_x4
FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
{
    return vld1q_u8_x4(p);
}
#endif

#if !defined(__aarch64__) && !defined(_M_ARM64)
/* emulate vaddv u8 variant */
FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
{
    const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8)));
    return vget_lane_u8(vreinterpret_u8_u64(v1), 0);
}
#else
// Wraps vaddv_u8
FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
{
    return vaddv_u8(v8);
}
#endif

#if !defined(__aarch64__) && !defined(_M_ARM64)
/* emulate vaddvq u8 variant */
FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
{
    uint8x8_t tmp = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
    uint8_t res = 0;
    for (int i = 0; i < 8; ++i)
        res += tmp[i];
    return res;
}
#else
// Wraps vaddvq_u8
FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
{
    return vaddvq_u8(a);
}
#endif

#if !defined(__aarch64__) && !defined(_M_ARM64)
/* emulate vaddvq u16 variant */
FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
{
    uint32x4_t m = vpaddlq_u16(a);
    uint64x2_t n = vpaddlq_u32(m);
    uint64x1_t o = vget_low_u64(n) + vget_high_u64(n);

    return vget_lane_u32((uint32x2_t) o, 0);
}
#else
// Wraps vaddvq_u16
FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
{
    return vaddvq_u16(a);
}
#endif

/* Function Naming Conventions
 * The naming convention of SSE intrinsics is straightforward. A generic SSE
 * intrinsic function is given as follows:
 *   _mm_<name>_<data_type>
 *
 * The parts of this format are given as follows:
 * 1. <name> describes the operation performed by the intrinsic
 * 2. <data_type> identifies the data type of the function's primary arguments
 *
 * This last part, <data_type>, is a little complicated. It identifies the
 * content of the input values, and can be set to any of the following values:
 * + ps - vectors contain floats (ps stands for packed single-precision)
 * + pd - vectors contain doubles (pd stands for packed double-precision)
 * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
 *   signed integers
 * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
 *   unsigned integers
 * + si128 - unspecified 128-bit vector or 256-bit vector
 * + m128/m128i/m128d - identifies input vector types when they are different
 *   than the type of the returned vector
 *
 * For example, _mm_setzero_ps. The _mm implies that the function returns
 * a 128-bit vector. The _ps at the end implies that the argument vectors
 * contain floats.
 *
 * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
 *   // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits
 *   __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
 *   // Set packed 8-bit integers
 *   // 128 bits, 16 chars, per 8 bits
 *   __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,
 *                                  4, 5, 12, 13, 6, 7, 14, 15);
 *   // Shuffle packed 8-bit integers
 *   __m128i v_out = _mm_shuffle_epi8(v_in, v_perm);  // pshufb
 */

/* Constants for use with _mm_prefetch. */
enum _mm_hint {
    _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
    _MM_HINT_T0 = 1,  /* load data to L1 and L2 cache */
    _MM_HINT_T1 = 2,  /* load data to L2 cache only */
    _MM_HINT_T2 = 3,  /* load data to L2 cache only, mark it as NTA */
};
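// Usage sketch (added in this copy; not part of the upstream sse2neon header):
// these hint values are passed straight to _mm_prefetch, e.g.
//   _mm_prefetch((const char *) ptr, _MM_HINT_T0);  // ptr is any address
// On Arm they are typically translated into the corresponding
// __builtin_prefetch locality hints.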

// The bit field mapping to the FPCR (floating-point control register)
typedef struct {
    uint16_t res0;
    uint8_t res1 : 6;
    uint8_t bit22 : 1;
    uint8_t bit23 : 1;
    uint8_t bit24 : 1;
    uint8_t res2 : 7;
#if defined(__aarch64__) || defined(_M_ARM64)
    uint32_t res3;
#endif
} fpcr_bitfield;

// Takes the upper 64 bits of a and places it in the low end of the result
// Takes the lower 64 bits of b and places it into the high end of the result.
FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
{
    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
}

// Takes the lower two 32-bit values from a, swaps them, and places them in the
// low end of the result; takes the upper two 32-bit values from b, swaps them,
// and places them in the high end of the result.
FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
{
    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
    float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
    return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
}

FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
{
    float32x2_t a21 = vget_high_f32(
        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
    float32x2_t b03 = vget_low_f32(
        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
    return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
{
    float32x2_t a03 = vget_low_f32(
        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
    float32x2_t b21 = vget_high_f32(
        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
    return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
}

FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
{
    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
}

FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
{
    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
}

FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
{
    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
    float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
    return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
}

// keeps the low 64 bits of a in the low end and puts the high 64 bits of b in
// the high end
FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
{
    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
}

FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
{
    float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
}

FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
{
    float32x2_t a22 =
        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
{
    float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
    float32x2_t b22 =
        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
    return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
}

FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
{
    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
    float32x2_t a22 =
        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
    float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ? */
    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
}

FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
{
    float32x2_t a33 =
        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
    float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
    return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
{
    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
    return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
{
    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
    float32_t b2 = vgetq_lane_f32(b, 2);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
    return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
{
    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32_t b2 = vgetq_lane_f32(b, 2);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
    return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
}

// For MSVC, we check only if it is ARM64, as every single ARM64 processor
// supported by WoA has crypto extensions. If this changes in the future,
// this can be verified via the runtime-only method of:
// IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE)
#if (defined(_M_ARM64) && !defined(__clang__)) || \
    (defined(__ARM_FEATURE_CRYPTO) &&             \
     (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64)))
// Wraps vmull_p64
FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
{
    poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
    poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
#if defined(_MSC_VER)
    __n64 a1 = {a}, b1 = {b};
    return vreinterpretq_u64_p128(vmull_p64(a1, b1));
#else
    return vreinterpretq_u64_p128(vmull_p64(a, b));
#endif
}
#else  // ARMv7 polyfill
// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
//
// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
// 64-bit->128-bit polynomial multiply.
//
// It needs some work and is somewhat slow, but it is still faster than all
// known scalar methods.
//
// Algorithm adapted to C from
// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
// from "Fast Software Polynomial Multiplication on ARM Processors Using the
// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
// (https://hal.inria.fr/hal-01506572)
static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
{
    poly8x8_t a = vreinterpret_p8_u64(_a);
    poly8x8_t b = vreinterpret_p8_u64(_b);

    // Masks
    uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
                                    vcreate_u8(0x00000000ffffffff));
    uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
                                    vcreate_u8(0x0000000000000000));

    // Do the multiplies, rotating with vext to get all combinations
    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));  // D = A0 * B0
    uint8x16_t e =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));  // E = A0 * B1
    uint8x16_t f =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));  // F = A1 * B0
    uint8x16_t g =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));  // G = A0 * B2
    uint8x16_t h =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));  // H = A2 * B0
    uint8x16_t i =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));  // I = A0 * B3
    uint8x16_t j =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));  // J = A3 * B0
    uint8x16_t k =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));  // L = A0 * B4

    // Add cross products
    uint8x16_t l = veorq_u8(e, f);  // L = E + F
    uint8x16_t m = veorq_u8(g, h);  // M = G + H
    uint8x16_t n = veorq_u8(i, j);  // N = I + J

    // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
    // instructions.
#if defined(__aarch64__)
    uint8x16_t lm_p0 = vreinterpretq_u8_u64(
        vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
    uint8x16_t lm_p1 = vreinterpretq_u8_u64(
        vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
    uint8x16_t nk_p0 = vreinterpretq_u8_u64(
        vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
    uint8x16_t nk_p1 = vreinterpretq_u8_u64(
        vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
#else
    uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
    uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
    uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
    uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
#endif
    // t0 = (L) (P0 + P1) << 8
    // t1 = (M) (P2 + P3) << 16
    uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
    uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
    uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);

    // t2 = (N) (P4 + P5) << 24
    // t3 = (K) (P6 + P7) << 32
    uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
    uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
    uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);

    // De-interleave
#if defined(__aarch64__)
    uint8x16_t t0 = vreinterpretq_u8_u64(
        vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
    uint8x16_t t1 = vreinterpretq_u8_u64(
        vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
    uint8x16_t t2 = vreinterpretq_u8_u64(
        vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
    uint8x16_t t3 = vreinterpretq_u8_u64(
        vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
#else
    uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
    uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
    uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
    uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
#endif
    // Shift the cross products
    uint8x16_t t0_shift = vextq_u8(t0, t0, 15);  // t0 << 8
    uint8x16_t t1_shift = vextq_u8(t1, t1, 14);  // t1 << 16
    uint8x16_t t2_shift = vextq_u8(t2, t2, 13);  // t2 << 24
    uint8x16_t t3_shift = vextq_u8(t3, t3, 12);  // t3 << 32

    // Accumulate the products
    uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
    uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
    uint8x16_t mix = veorq_u8(d, cross1);
    uint8x16_t r = veorq_u8(mix, cross2);
    return vreinterpretq_u64_u8(r);
}
#endif  // ARMv7 polyfill
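// Usage sketch (added in this copy; not part of the upstream sse2neon header):
// either path above gives a 64x64 -> 128-bit carry-less (polynomial) multiply
// of the low halves of its operands, the building block a PCLMULQDQ-style
// _mm_clmulepi64_si128 wrapper is typically assembled from, e.g.
//   uint64x2_t prod = _sse2neon_vmull_p64(vcreate_u64(3), vcreate_u64(5));
//   // 0b11 clmul 0b101 == 0b1111, so vgetq_lane_u64(prod, 0) == 15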

// C equivalent:
//   __m128i _mm_shuffle_epi32_default(__m128i a,
//                                     __constrange(0, 255) int imm) {
//       __m128i ret;
//       ret[0] = a[imm & 0x3];         ret[1] = a[(imm >> 2) & 0x3];
//       ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03];
//       return ret;
//   }
#define _mm_shuffle_epi32_default(a, imm)                                   \
    vreinterpretq_m128i_s32(vsetq_lane_s32(                                 \
        vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3),     \
        vsetq_lane_s32(                                                     \
            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
            vsetq_lane_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a),       \
                                          ((imm) >> 2) & 0x3),              \
                           vmovq_n_s32(vgetq_lane_s32(                      \
                               vreinterpretq_s32_m128i(a), (imm) & (0x3))), \
                           1),                                              \
            2),                                                             \
        3))

// Takes the upper 64 bits of a and places it in the low end of the result
// Takes the lower 64 bits of a and places it into the high end of the result.
FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
{
    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
    return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
}

// Takes the lower two 32-bit values from a, swaps them, and places them in the
// low end of the result; takes the upper two 32-bit values from a, swaps them,
// and places them in the high end of the result.
FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
{
    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
    int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
    return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
}

// rotates the least significant 32 bits into the most significant 32 bits, and
// shifts the rest down
FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
{
    return vreinterpretq_m128i_s32(
        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
}

// rotates the most significant 32 bits into the least significant 32 bits, and
// shifts the rest up
FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
{
    return vreinterpretq_m128i_s32(
        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
}

// gets the lower 64 bits of a, and places it in the upper 64 bits
// gets the lower 64 bits of a and places it in the lower 64 bits
FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
{
    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
    return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
}

// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
// lower 64 bits; gets the lower 64 bits of a, and places it in the upper 64
// bits
FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
{
    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
    return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
}

// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
// upper 64 bits; gets the lower 64 bits of a, swaps the 0 and 1 elements, and
// places it in the lower 64 bits
FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
{
    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
    return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
}

FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
{
    int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
    return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
}

FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
{
    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
    return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
}

FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
{
    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
    int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
    return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
}

#if defined(__aarch64__) || defined(_M_ARM64)
#define _mm_shuffle_epi32_splat(a, imm) \
    vreinterpretq_m128i_s32(vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm)))
#else
#define _mm_shuffle_epi32_splat(a, imm) \
    vreinterpretq_m128i_s32(            \
        vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))))
#endif

// NEON does not support a general purpose permute intrinsic.
// Shuffle single-precision (32-bit) floating-point elements in a using the
// control in imm8, and store the results in dst.
//
// C equivalent:
//   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
//                                 __constrange(0, 255) int imm) {
//       __m128 ret;
//       ret[0] = a[imm & 0x3];         ret[1] = a[(imm >> 2) & 0x3];
//       ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03];
//       return ret;
//   }
//
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps
#define _mm_shuffle_ps_default(a, b, imm)                                      \
    vreinterpretq_m128_f32(vsetq_lane_f32(                                     \
        vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3),         \
        vsetq_lane_f32(                                                        \
            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3),     \
            vsetq_lane_f32(                                                    \
                vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
                vmovq_n_f32(                                                   \
                    vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))), \
                1),                                                            \
            2),                                                                \
        3))

// Shuffle 16-bit integers in the low 64 bits of a using the control in imm8.
// Store the results in the low 64 bits of dst, with the high 64 bits being
// copied from a to dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16
#define _mm_shufflelo_epi16_function(a, imm)                                  \
    _sse2neon_define1(                                                        \
        __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a);              \
        int16x4_t lowBits = vget_low_s16(ret);                                \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0);  \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
                             1);                                              \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
                             2);                                              \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
                             3);                                              \
        _sse2neon_return(vreinterpretq_m128i_s16(ret));)

// Shuffle 16-bit integers in the high 64 bits of a using the control in imm8.
// Store the results in the high 64 bits of dst, with the low 64 bits being
// copied from a to dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16
#define _mm_shufflehi_epi16_function(a, imm)                                   \
    _sse2neon_define1(                                                         \
        __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a);               \
        int16x4_t highBits = vget_high_s16(ret);                               \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4);  \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
                             5);                                               \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
                             6);                                               \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
                             7);                                               \
        _sse2neon_return(vreinterpretq_m128i_s16(ret));)

/* MMX */

// _mm_empty is a no-op on arm
FORCE_INLINE void _mm_empty(void) {}

/* SSE */

// Add packed single-precision (32-bit) floating-point elements in a and b, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_f32(
        vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Add the lower single-precision (32-bit) floating-point element in a and b,
// store the result in the lower element of dst, and copy the upper 3 packed
// elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss
FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
{
    float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
    float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
    // the upper values in the result must be the remnants of <a>.
    return vreinterpretq_m128_f32(vaddq_f32(a, value));
}

// Compute the bitwise AND of packed single-precision (32-bit) floating-point
// elements in a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_s32(
        vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
}

// Compute the bitwise NOT of packed single-precision (32-bit) floating-point
// elements in a and then AND with b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps
FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_s32(
        vbicq_s32(vreinterpretq_s32_m128(b),
                  vreinterpretq_s32_m128(a)));  // *NOTE* argument swap
}

// Average packed unsigned 16-bit integers in a and b, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16
FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
{
    return vreinterpret_m64_u16(
        vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
}

// Average packed unsigned 8-bit integers in a and b, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8
FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
{
    return vreinterpret_m64_u8(
        vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
}

// Compare packed single-precision (32-bit) floating-point elements in a and b
// for equality, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps
FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compare the lower single-precision (32-bit) floating-point elements in a and
// b for equality, store the result in the lower element of dst, and copy the
// upper 3 packed elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss
FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
}

// Compare packed single-precision (32-bit) floating-point elements in a and b
// for greater-than-or-equal, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps
FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compare the lower single-precision (32-bit) floating-point elements in a and
// b for greater-than-or-equal, store the result in the lower element of dst,
// and copy the upper 3 packed elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss
FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpge_ps(a, b));
}

// Compare packed single-precision (32-bit) floating-point elements in a and b
// for greater-than, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps
FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compare the lower single-precision (32-bit) floating-point elements in a and
// b for greater-than, store the result in the lower element of dst, and copy
// the upper 3 packed elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss
FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
}

// Compare packed single-precision (32-bit) floating-point elements in a and b
// for less-than-or-equal, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps
FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compare the lower single-precision (32-bit) floating-point elements in a and
// b for less-than-or-equal, store the result in the lower element of dst, and
// copy the upper 3 packed elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss
FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmple_ps(a, b));
}

// Compare packed single-precision (32-bit) floating-point elements in a and b
// for less-than, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps
FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compare the lower single-precision (32-bit) floating-point elements in a and
// b for less-than, store the result in the lower element of dst, and copy the
// upper 3 packed elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss
FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmplt_ps(a, b));
}

// Compare packed single-precision (32-bit) floating-point elements in a and b
// for not-equal, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps
FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(vmvnq_u32(
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
}

// Compare the lower single-precision (32-bit) floating-point elements in a and
// b for not-equal, store the result in the lower element of dst, and copy the
// upper 3 packed elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss
FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
}

// Compare packed single-precision (32-bit) floating-point elements in a and b
// for not-greater-than-or-equal, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps
FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(vmvnq_u32(
        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
}

// Compare the lower single-precision (32-bit) floating-point elements in a and
// b for not-greater-than-or-equal, store the result in the lower element of
// dst, and copy the upper 3 packed elements from a to the upper elements of
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss
FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpnge_ps(a, b));
}

// Compare packed single-precision (32-bit) floating-point elements in a and b
// for not-greater-than, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps
FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(vmvnq_u32(
        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
}

// Compare the lower single-precision (32-bit) floating-point elements in a and
// b for not-greater-than, store the result in the lower element of dst, and
// copy the upper 3 packed elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss
FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpngt_ps(a, b));
}

// Compare packed single-precision (32-bit) floating-point elements in a and b
// for not-less-than-or-equal, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps
FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(vmvnq_u32(
        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
}

// Compare the lower single-precision (32-bit) floating-point elements in a and
// b for not-less-than-or-equal, store the result in the lower element of dst,
// and copy the upper 3 packed elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss
FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpnle_ps(a, b));
}

// Compare packed single-precision (32-bit) floating-point elements in a and b
// for not-less-than, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps
FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(vmvnq_u32(
        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
}

// Compare the lower single-precision (32-bit) floating-point elements in a and
// b for not-less-than, store the result in the lower element of dst, and copy
// the upper 3 packed elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss
FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpnlt_ps(a, b));
}
1385
1386
// Compare packed single-precision (32-bit) floating-point elements in a and b
1387
// to see if neither is NaN, and store the results in dst.
1388
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps
1389
//
1390
// See also:
1391
// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
1392
// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
1393
FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
1394
{
1395
// Note: NEON does not have ordered compare builtin
1396
// Need to compare a eq a and b eq b to check for NaN
1397
// Do AND of results to get final
1398
uint32x4_t ceqaa =
1399
vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1400
uint32x4_t ceqbb =
1401
vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1402
return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
1403
}
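
// Illustrative usage sketch (hypothetical caller, not part of the translation
// layer): because the a==a / b==b trick above yields an all-ones lane only
// where both inputs are non-NaN, combining it with _mm_movemask_ps flags NaN
// lanes without a scalar loop:
//
//     __m128 v = _mm_set_ps(0.0f, 1.0f, NAN, 2.0f);  // NAN from <math.h>
//     __m128 ord = _mm_cmpord_ps(v, v);              // all-ones lane iff not NaN
//     int nan_mask = ~_mm_movemask_ps(ord) & 0xF;    // nan_mask == 0x2 here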

// Compare the lower single-precision (32-bit) floating-point elements in a and
// b to see if neither is NaN, store the result in the lower element of dst, and
// copy the upper 3 packed elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss
FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpord_ps(a, b));
}

// Compare packed single-precision (32-bit) floating-point elements in a and b
// to see if either is NaN, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps
FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
{
    uint32x4_t f32a =
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
    uint32x4_t f32b =
        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
}

// Compare the lower single-precision (32-bit) floating-point elements in a and
// b to see if either is NaN, store the result in the lower element of dst, and
// copy the upper 3 packed elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss
FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
}

// Compare the lower single-precision (32-bit) floating-point element in a and b
// for equality, and return the boolean result (0 or 1).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss
FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
{
    uint32x4_t a_eq_b =
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
    return vgetq_lane_u32(a_eq_b, 0) & 0x1;
}

// Compare the lower single-precision (32-bit) floating-point element in a and b
// for greater-than-or-equal, and return the boolean result (0 or 1).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss
FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
{
    uint32x4_t a_ge_b =
        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
    return vgetq_lane_u32(a_ge_b, 0) & 0x1;
}

// Compare the lower single-precision (32-bit) floating-point element in a and b
// for greater-than, and return the boolean result (0 or 1).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss
FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
{
    uint32x4_t a_gt_b =
        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
    return vgetq_lane_u32(a_gt_b, 0) & 0x1;
}

// Compare the lower single-precision (32-bit) floating-point element in a and b
// for less-than-or-equal, and return the boolean result (0 or 1).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss
FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
{
    uint32x4_t a_le_b =
        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
    return vgetq_lane_u32(a_le_b, 0) & 0x1;
}

// Compare the lower single-precision (32-bit) floating-point element in a and b
// for less-than, and return the boolean result (0 or 1).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss
FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
{
    uint32x4_t a_lt_b =
        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
    return vgetq_lane_u32(a_lt_b, 0) & 0x1;
}

// Compare the lower single-precision (32-bit) floating-point element in a and b
// for not-equal, and return the boolean result (0 or 1).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss
FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
{
    return !_mm_comieq_ss(a, b);
}

// Convert packed signed 32-bit integers in b to packed single-precision
// (32-bit) floating-point elements, store the results in the lower 2 elements
// of dst, and copy the upper 2 packed elements from a to the upper elements of
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_pi2ps
FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
{
    return vreinterpretq_m128_f32(
        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
                     vget_high_f32(vreinterpretq_f32_m128(a))));
}

// Convert packed single-precision (32-bit) floating-point elements in a to
// packed 32-bit integers, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi
FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
{
#if (defined(__aarch64__) || defined(_M_ARM64)) || \
    defined(__ARM_FEATURE_DIRECTED_ROUNDING)
    return vreinterpret_m64_s32(
        vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
#else
    return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32(
        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)))));
#endif
}

// Convert the signed 32-bit integer b to a single-precision (32-bit)
// floating-point element, store the result in the lower element of dst, and
// copy the upper 3 packed elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss
FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
{
    return vreinterpretq_m128_f32(
        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
}

// Convert the lower single-precision (32-bit) floating-point element in a to a
// 32-bit integer, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si
FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
{
#if (defined(__aarch64__) || defined(_M_ARM64)) || \
    defined(__ARM_FEATURE_DIRECTED_ROUNDING)
    return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),
                          0);
#else
    float32_t data = vgetq_lane_f32(
        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
    return (int32_t) data;
#endif
}

// Convert packed 16-bit integers in a to packed single-precision (32-bit)
// floating-point elements, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi16_ps
FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
{
    return vreinterpretq_m128_f32(
        vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
}

// Convert packed 32-bit integers in b to packed single-precision (32-bit)
// floating-point elements, store the results in the lower 2 elements of dst,
// and copy the upper 2 packed elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_ps
FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
{
    return vreinterpretq_m128_f32(
        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
                     vget_high_f32(vreinterpretq_f32_m128(a))));
}

// Convert packed signed 32-bit integers in a to packed single-precision
// (32-bit) floating-point elements, store the results in the lower 2 elements
// of dst, then convert the packed signed 32-bit integers in b to
// single-precision (32-bit) floating-point elements, and store the results in
// the upper 2 elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32x2_ps
FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
{
    return vreinterpretq_m128_f32(vcvtq_f32_s32(
        vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
}

// Convert the lower packed 8-bit integers in a to packed single-precision
// (32-bit) floating-point elements, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi8_ps
FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
{
    return vreinterpretq_m128_f32(vcvtq_f32_s32(
        vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
}

// Convert packed single-precision (32-bit) floating-point elements in a to
// packed 16-bit integers, and store the results in dst. Note: this intrinsic
// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and
// 0x7FFFFFFF.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi16
FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
{
    return vreinterpret_m64_s16(
        vqmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a))));
}

// Convert packed single-precision (32-bit) floating-point elements in a to
// packed 32-bit integers, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi32
#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)

// Convert packed single-precision (32-bit) floating-point elements in a to
// packed 8-bit integers, and store the results in lower 4 elements of dst.
// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values
// between 0x7F and 0x7FFFFFFF.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi8
FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)
{
    return vreinterpret_m64_s8(vqmovn_s16(
        vcombine_s16(vreinterpret_s16_m64(_mm_cvtps_pi16(a)), vdup_n_s16(0))));
}

// Convert packed unsigned 16-bit integers in a to packed single-precision
// (32-bit) floating-point elements, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu16_ps
FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
{
    return vreinterpretq_m128_f32(
        vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
}

// Convert the lower packed unsigned 8-bit integers in a to packed
// single-precision (32-bit) floating-point elements, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu8_ps
FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
{
    return vreinterpretq_m128_f32(vcvtq_f32_u32(
        vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
}

// Convert the signed 32-bit integer b to a single-precision (32-bit)
// floating-point element, store the result in the lower element of dst, and
// copy the upper 3 packed elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss
#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)

// Convert the signed 64-bit integer b to a single-precision (32-bit)
// floating-point element, store the result in the lower element of dst, and
// copy the upper 3 packed elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss
FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
{
    return vreinterpretq_m128_f32(
        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
}

// Copy the lower single-precision (32-bit) floating-point element of a to dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32
FORCE_INLINE float _mm_cvtss_f32(__m128 a)
{
    return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
}

// Convert the lower single-precision (32-bit) floating-point element in a to a
// 32-bit integer, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32
#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)

// Convert the lower single-precision (32-bit) floating-point element in a to a
// 64-bit integer, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64
FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
{
#if (defined(__aarch64__) || defined(_M_ARM64)) || \
    defined(__ARM_FEATURE_DIRECTED_ROUNDING)
    return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
#else
    float32_t data = vgetq_lane_f32(
        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
    return (int64_t) data;
#endif
}

// Convert packed single-precision (32-bit) floating-point elements in a to
// packed 32-bit integers with truncation, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ps2pi
FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
{
    return vreinterpret_m64_s32(
        vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
}

// Convert the lower single-precision (32-bit) floating-point element in a to a
// 32-bit integer with truncation, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si
FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
{
    return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
}

// Convert packed single-precision (32-bit) floating-point elements in a to
// packed 32-bit integers with truncation, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_pi32
#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)

// Convert the lower single-precision (32-bit) floating-point element in a to a
// 32-bit integer with truncation, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32
#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)

// Convert the lower single-precision (32-bit) floating-point element in a to a
// 64-bit integer with truncation, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si64
FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
{
    return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
}

// Divide packed single-precision (32-bit) floating-point elements in a by
// packed elements in b, and store the results in dst.
// Because ARMv7-A NEON has no division instruction, the fallback approximates
// the reciprocal of b with vrecpe/vrecps (Newton-Raphson refinement) and then
// multiplies a by that reciprocal.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps
FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128_f32(
        vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#else
    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
    // Additional Newton-Raphson iteration for accuracy
    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
#endif
}
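
// Illustrative usage sketch (hypothetical caller): element-wise division of
// four floats; on AArch64 this lowers to FDIV, on ARMv7-A to the reciprocal
// approximation above.
//
//     float a[4] = {1.0f, 2.0f, 3.0f, 4.0f};
//     float b[4] = {2.0f, 4.0f, 8.0f, 16.0f};
//     float q[4];
//     _mm_storeu_ps(q, _mm_div_ps(_mm_loadu_ps(a), _mm_loadu_ps(b)));
//     // q is now (approximately, on ARMv7-A) {0.5f, 0.5f, 0.375f, 0.25f}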

// Divide the lower single-precision (32-bit) floating-point element in a by the
// lower single-precision (32-bit) floating-point element in b, store the result
// in the lower element of dst, and copy the upper 3 packed elements from a to
// the upper elements of dst.
// Warning: on ARMv7-A the result may differ from Intel's and is not
// IEEE-compliant.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss
FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
{
    float32_t value =
        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
    return vreinterpretq_m128_f32(
        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
}

// Extract a 16-bit integer from a, selected with imm8, and store the result in
// the lower element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_pi16
#define _mm_extract_pi16(a, imm) \
    (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))

// Free aligned memory that was allocated with _mm_malloc.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_free
#if !defined(SSE2NEON_ALLOC_DEFINED)
FORCE_INLINE void _mm_free(void *addr)
{
    free(addr);
}
#endif

FORCE_INLINE uint64_t _sse2neon_get_fpcr(void)
{
    uint64_t value;
#if defined(_MSC_VER)
    value = _ReadStatusReg(ARM64_FPCR);
#else
    __asm__ __volatile__("mrs %0, FPCR" : "=r"(value)); /* read */
#endif
    return value;
}

FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value)
{
#if defined(_MSC_VER)
    _WriteStatusReg(ARM64_FPCR, value);
#else
    __asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */
#endif
}

// Macro: Get the flush zero bits from the MXCSR control and status register.
// The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or
// _MM_FLUSH_ZERO_OFF
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE
FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void)
{
    union {
        fpcr_bitfield field;
#if defined(__aarch64__) || defined(_M_ARM64)
        uint64_t value;
#else
        uint32_t value;
#endif
    } r;

#if defined(__aarch64__) || defined(_M_ARM64)
    r.value = _sse2neon_get_fpcr();
#else
    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;
}

// Macro: Get the rounding mode bits from the MXCSR control and status register.
// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
// _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE
FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void)
{
    union {
        fpcr_bitfield field;
#if defined(__aarch64__) || defined(_M_ARM64)
        uint64_t value;
#else
        uint32_t value;
#endif
    } r;

#if defined(__aarch64__) || defined(_M_ARM64)
    r.value = _sse2neon_get_fpcr();
#else
    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    if (r.field.bit22) {
        return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
    } else {
        return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
    }
}

// Copy a to dst, and insert the 16-bit integer i into dst at the location
// specified by imm8.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_pi16
#define _mm_insert_pi16(a, b, imm) \
    vreinterpret_m64_s16(vset_lane_s16((b), vreinterpret_s16_m64(a), (imm)))

// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
// elements) from memory into dst. mem_addr must be aligned on a 16-byte
// boundary or a general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps
FORCE_INLINE __m128 _mm_load_ps(const float *p)
{
    return vreinterpretq_m128_f32(vld1q_f32(p));
}

// Load a single-precision (32-bit) floating-point element from memory into all
// elements of dst.
//
// dst[31:0] := MEM[mem_addr+31:mem_addr]
// dst[63:32] := MEM[mem_addr+31:mem_addr]
// dst[95:64] := MEM[mem_addr+31:mem_addr]
// dst[127:96] := MEM[mem_addr+31:mem_addr]
//
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1
#define _mm_load_ps1 _mm_load1_ps

// Load a single-precision (32-bit) floating-point element from memory into the
// lower of dst, and zero the upper 3 elements. mem_addr does not need to be
// aligned on any particular boundary.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss
FORCE_INLINE __m128 _mm_load_ss(const float *p)
{
    return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
}

// Load a single-precision (32-bit) floating-point element from memory into all
// elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps
FORCE_INLINE __m128 _mm_load1_ps(const float *p)
{
    return vreinterpretq_m128_f32(vld1q_dup_f32(p));
}

// Load 2 single-precision (32-bit) floating-point elements from memory into the
// upper 2 elements of dst, and copy the lower 2 elements from a to dst.
// mem_addr does not need to be aligned on any particular boundary.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pi
FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
{
    return vreinterpretq_m128_f32(
        vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
}

// Load 2 single-precision (32-bit) floating-point elements from memory into the
// lower 2 elements of dst, and copy the upper 2 elements from a to dst.
// mem_addr does not need to be aligned on any particular boundary.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pi
FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
{
    return vreinterpretq_m128_f32(
        vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
}

// Load 4 single-precision (32-bit) floating-point elements from memory into dst
// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
// general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps
FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
{
    float32x4_t v = vrev64q_f32(vld1q_f32(p));
    return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
}
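
// Illustrative sketch of the reversal above (hypothetical data): vrev64q_f32
// swaps within each 64-bit half and vextq_f32(v, v, 2) swaps the halves, so a
// memory image {m0, m1, m2, m3} ends up with dst lane 0 holding m3:
//
//     float ALIGN_STRUCT(16) m[4] = {10.0f, 20.0f, 30.0f, 40.0f};
//     __m128 r = _mm_loadr_ps(m);      // lane 0 = 40.0f, lane 3 = 10.0f
//     float lane0 = _mm_cvtss_f32(r);  // 40.0f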

// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
// elements) from memory into dst. mem_addr does not need to be aligned on any
// particular boundary.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
{
    // For NEON, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
    // equivalent.
    return vreinterpretq_m128_f32(vld1q_f32(p));
}

// Load unaligned 16-bit integer from memory into the first element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16
FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
{
    return vreinterpretq_m128i_s16(
        vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
}

// Load unaligned 64-bit integer from memory into the first element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64
FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
{
    return vreinterpretq_m128i_s64(
        vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
}

// Allocate size bytes of memory, aligned to the alignment specified in align,
// and return a pointer to the allocated memory. _mm_free should be used to free
// memory that is allocated with _mm_malloc.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_malloc
#if !defined(SSE2NEON_ALLOC_DEFINED)
FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
{
    void *ptr;
    if (align == 1)
        return malloc(size);
    if (align == 2 || (sizeof(void *) == 8 && align == 4))
        align = sizeof(void *);
    if (!posix_memalign(&ptr, align, size))
        return ptr;
    return NULL;
}
#endif
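
// Illustrative usage sketch (hypothetical caller): allocate a 16-byte aligned
// scratch buffer suitable for _mm_load_ps/_mm_store_ps and release it with
// _mm_free. posix_memalign requires the alignment to be a multiple of
// sizeof(void *), which is why small alignments are widened above.
//
//     float *buf = (float *) _mm_malloc(64 * sizeof(float), 16);
//     if (buf) {
//         _mm_store_ps(buf, _mm_set1_ps(0.0f));
//         _mm_free(buf);
//     }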

// Conditionally store 8-bit integer elements from a into memory using mask
// (elements are not stored when the highest bit is not set in the corresponding
// element) and a non-temporal memory hint.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmove_si64
FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
{
    int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7);
    __m128 b = _mm_load_ps((const float *) mem_addr);
    int8x8_t masked =
        vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a),
                vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b))));
    vst1_s8((int8_t *) mem_addr, masked);
}
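
// Illustrative sketch (hypothetical values): the arithmetic shift by 7 turns
// each mask byte into 0x00 or 0xFF, so vbsl_s8 keeps the byte already in
// memory where the mask's top bit is clear and takes the byte from a where it
// is set.
//
//     __m64 data = _mm_set_pi8(8, 7, 6, 5, 4, 3, 2, 1);
//     __m64 mask = _mm_set_pi8(0, 0, 0, 0, (char) 0x80, (char) 0x80,
//                              (char) 0x80, (char) 0x80);
//     char buf[16] = {0};
//     _mm_maskmove_si64(data, mask, buf);  // only buf[0..3] receive 1, 2, 3, 4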

// Conditionally store 8-bit integer elements from a into memory using mask
// (elements are not stored when the highest bit is not set in the corresponding
// element) and a non-temporal memory hint.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_maskmovq
#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)

// Compare packed signed 16-bit integers in a and b, and store packed maximum
// values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pi16
FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
{
    return vreinterpret_m64_s16(
        vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
}

// Compare packed single-precision (32-bit) floating-point elements in a and b,
// and store packed maximum values in dst. dst does not follow the IEEE Standard
// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or
// signed-zero values.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps
FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
{
#if SSE2NEON_PRECISE_MINMAX
    float32x4_t _a = vreinterpretq_f32_m128(a);
    float32x4_t _b = vreinterpretq_f32_m128(b);
    return vreinterpretq_m128_f32(vbslq_f32(vcgtq_f32(_a, _b), _a, _b));
#else
    return vreinterpretq_m128_f32(
        vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#endif
}

// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
// values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pu8
FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
{
    return vreinterpret_m64_u8(
        vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
}

// Compare the lower single-precision (32-bit) floating-point elements in a and
// b, store the maximum value in the lower element of dst, and copy the upper 3
// packed elements from a to the upper elements of dst. dst does not follow the
// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when
// inputs are NaN or signed-zero values.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss
FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
{
    float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
    return vreinterpretq_m128_f32(
        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
}

// Compare packed signed 16-bit integers in a and b, and store packed minimum
// values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pi16
FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
{
    return vreinterpret_m64_s16(
        vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
}

// Compare packed single-precision (32-bit) floating-point elements in a and b,
// and store packed minimum values in dst. dst does not follow the IEEE Standard
// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or
// signed-zero values.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps
FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
{
#if SSE2NEON_PRECISE_MINMAX
    float32x4_t _a = vreinterpretq_f32_m128(a);
    float32x4_t _b = vreinterpretq_f32_m128(b);
    return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b));
#else
    return vreinterpretq_m128_f32(
        vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#endif
}

// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
// values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pu8
FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
{
    return vreinterpret_m64_u8(
        vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
}

// Compare the lower single-precision (32-bit) floating-point elements in a and
// b, store the minimum value in the lower element of dst, and copy the upper 3
// packed elements from a to the upper elements of dst. dst does not follow the
// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when
// inputs are NaN or signed-zero values.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss
FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
{
    float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
    return vreinterpretq_m128_f32(
        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
}

// Move the lower single-precision (32-bit) floating-point element from b to the
// lower element of dst, and copy the upper 3 packed elements from a to the
// upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss
FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
{
    return vreinterpretq_m128_f32(
        vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
                       vreinterpretq_f32_m128(a), 0));
}

// Move the upper 2 single-precision (32-bit) floating-point elements from b to
// the lower 2 elements of dst, and copy the upper 2 elements from a to the
// upper 2 elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps
FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128_u64(
        vzip2q_u64(vreinterpretq_u64_m128(b), vreinterpretq_u64_m128(a)));
#else
    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
#endif
}

// Move the lower 2 single-precision (32-bit) floating-point elements from b to
// the upper 2 elements of dst, and copy the lower 2 elements from a to the
// lower 2 elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps
FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
{
    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
}

// Create mask from the most significant bit of each 8-bit element in a, and
// store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pi8
FORCE_INLINE int _mm_movemask_pi8(__m64 a)
{
    uint8x8_t input = vreinterpret_u8_m64(a);
#if defined(__aarch64__) || defined(_M_ARM64)
    static const int8_t shift[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    uint8x8_t tmp = vshr_n_u8(input, 7);
    return vaddv_u8(vshl_u8(tmp, vld1_s8(shift)));
#else
    // Refer to the implementation of `_mm_movemask_epi8`
    uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
    uint32x2_t paired16 =
        vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
    uint8x8_t paired32 =
        vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
    return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
#endif
}

// Set each bit of mask dst based on the most significant bit of the
// corresponding packed single-precision (32-bit) floating-point element in a.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps
FORCE_INLINE int _mm_movemask_ps(__m128 a)
{
    uint32x4_t input = vreinterpretq_u32_m128(a);
#if defined(__aarch64__) || defined(_M_ARM64)
    static const int32_t shift[4] = {0, 1, 2, 3};
    uint32x4_t tmp = vshrq_n_u32(input, 31);
    return vaddvq_u32(vshlq_u32(tmp, vld1q_s32(shift)));
#else
    // Uses the exact same method as _mm_movemask_epi8, see that for details.
    // Shift out everything but the sign bits with a 32-bit unsigned shift
    // right.
    uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
    // Merge the two pairs together with a 64-bit unsigned shift right + add.
    uint8x16_t paired =
        vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
    // Extract the result.
    return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
#endif
}
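
// Illustrative usage sketch (hypothetical caller): combine a packed compare
// with _mm_movemask_ps to branch on which lanes satisfied the predicate.
//
//     __m128 x = _mm_set_ps(4.0f, -3.0f, 2.0f, -1.0f);
//     int neg = _mm_movemask_ps(_mm_cmplt_ps(x, _mm_setzero_ps()));
//     // neg == 0x5: bits 0 and 2 are set because lanes 0 and 2 are negative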

// Multiply packed single-precision (32-bit) floating-point elements in a and b,
// and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_f32(
        vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Multiply the lower single-precision (32-bit) floating-point element in a and
// b, store the result in the lower element of dst, and copy the upper 3 packed
// elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss
FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_mul_ps(a, b));
}

// Multiply the packed unsigned 16-bit integers in a and b, producing
// intermediate 32-bit integers, and store the high 16 bits of the intermediate
// integers in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_pu16
FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
{
    return vreinterpret_m64_u16(vshrn_n_u32(
        vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
}

// Compute the bitwise OR of packed single-precision (32-bit) floating-point
// elements in a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps
FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_s32(
        vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
}

// Average packed unsigned 8-bit integers in a and b, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgb
#define _m_pavgb(a, b) _mm_avg_pu8(a, b)

// Average packed unsigned 16-bit integers in a and b, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgw
#define _m_pavgw(a, b) _mm_avg_pu16(a, b)

// Extract a 16-bit integer from a, selected with imm8, and store the result in
// the lower element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pextrw
#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)

// Copy a to dst, and insert the 16-bit integer i into dst at the location
// specified by imm8.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_pinsrw
#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)

// Compare packed signed 16-bit integers in a and b, and store packed maximum
// values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxsw
#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)

// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
// values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxub
#define _m_pmaxub(a, b) _mm_max_pu8(a, b)

// Compare packed signed 16-bit integers in a and b, and store packed minimum
// values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminsw
#define _m_pminsw(a, b) _mm_min_pi16(a, b)

// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
// values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminub
#define _m_pminub(a, b) _mm_min_pu8(a, b)

// Create mask from the most significant bit of each 8-bit element in a, and
// store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmovmskb
#define _m_pmovmskb(a) _mm_movemask_pi8(a)

// Multiply the packed unsigned 16-bit integers in a and b, producing
// intermediate 32-bit integers, and store the high 16 bits of the intermediate
// integers in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmulhuw
#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)

// Fetch the line of data from memory that contains address p to a location in
// the cache hierarchy specified by the locality hint i.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch
FORCE_INLINE void _mm_prefetch(char const *p, int i)
{
    (void) i;
#if defined(_MSC_VER)
    switch (i) {
    case _MM_HINT_NTA:
        __prefetch2(p, 1);
        break;
    case _MM_HINT_T0:
        __prefetch2(p, 0);
        break;
    case _MM_HINT_T1:
        __prefetch2(p, 2);
        break;
    case _MM_HINT_T2:
        __prefetch2(p, 4);
        break;
    }
#else
    switch (i) {
    case _MM_HINT_NTA:
        __builtin_prefetch(p, 0, 0);
        break;
    case _MM_HINT_T0:
        __builtin_prefetch(p, 0, 3);
        break;
    case _MM_HINT_T1:
        __builtin_prefetch(p, 0, 2);
        break;
    case _MM_HINT_T2:
        __builtin_prefetch(p, 0, 1);
        break;
    }
#endif
}
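
// Illustrative usage sketch (hypothetical loop; process, src and n are made-up
// names): prefetch the block that will be needed a few iterations ahead. On
// GCC/Clang this maps to __builtin_prefetch with the locality levels chosen
// above.
//
//     for (size_t i = 0; i < n; i += 16) {
//         _mm_prefetch((const char *) &src[i + 64], _MM_HINT_T0);
//         process(&src[i]);
//     }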

// Compute the absolute differences of packed unsigned 8-bit integers in a and
// b, then horizontally sum each consecutive 8 differences to produce four
// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
// 16 bits of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_psadbw
#define _m_psadbw(a, b) _mm_sad_pu8(a, b)

// Shuffle 16-bit integers in a using the control in imm8, and store the results
// in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pshufw
#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)

// Compute the approximate reciprocal of packed single-precision (32-bit)
// floating-point elements in a, and store the results in dst. The maximum
// relative error for this approximation is less than 1.5*2^-12.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps
FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
{
    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
    return vreinterpretq_m128_f32(recip);
}
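
// Illustrative sketch (not part of the API; rcp_refined is a hypothetical
// helper): callers that need more accuracy than the rough estimate can run
// extra Newton-Raphson steps themselves, the same refinement _mm_div_ps uses
// on ARMv7-A. vrecpsq_f32 computes (2 - x * r), so multiplying it back in
// roughly doubles the number of correct bits per step.
//
//     static inline __m128 rcp_refined(__m128 x)
//     {
//         float32x4_t r = vreinterpretq_f32_m128(_mm_rcp_ps(x));
//         r = vmulq_f32(r, vrecpsq_f32(r, vreinterpretq_f32_m128(x)));
//         return vreinterpretq_m128_f32(r);
//     }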

// Compute the approximate reciprocal of the lower single-precision (32-bit)
// floating-point element in a, store the result in the lower element of dst,
// and copy the upper 3 packed elements from a to the upper elements of dst. The
// maximum relative error for this approximation is less than 1.5*2^-12.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss
FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
{
    return _mm_move_ss(a, _mm_rcp_ps(a));
}

// Compute the approximate reciprocal square root of packed single-precision
// (32-bit) floating-point elements in a, and store the results in dst. The
// maximum relative error for this approximation is less than 1.5*2^-12.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps
FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
{
    float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));

    // Generate masks for detecting whether input has any 0.0f/-0.0f
    // (which becomes positive/negative infinity by IEEE-754 arithmetic rules).
    const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
    const uint32x4_t neg_inf = vdupq_n_u32(0xFF800000);
    const uint32x4_t has_pos_zero =
        vceqq_u32(pos_inf, vreinterpretq_u32_f32(out));
    const uint32x4_t has_neg_zero =
        vceqq_u32(neg_inf, vreinterpretq_u32_f32(out));

    out = vmulq_f32(
        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));

    // Set output vector element to infinity/negative-infinity if
    // the corresponding input vector element is 0.0f/-0.0f.
    out = vbslq_f32(has_pos_zero, (float32x4_t) pos_inf, out);
    out = vbslq_f32(has_neg_zero, (float32x4_t) neg_inf, out);

    return vreinterpretq_m128_f32(out);
}

// Compute the approximate reciprocal square root of the lower single-precision
// (32-bit) floating-point element in a, store the result in the lower element
// of dst, and copy the upper 3 packed elements from a to the upper elements of
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss
FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
{
    return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
}

// Compute the absolute differences of packed unsigned 8-bit integers in a and
// b, then horizontally sum each consecutive 8 differences to produce four
// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
// 16 bits of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_pu8
FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
{
    uint64x1_t t = vpaddl_u32(vpaddl_u16(
        vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
    return vreinterpret_m64_u16(
        vset_lane_u16((int) vget_lane_u64(t, 0), vdup_n_u16(0), 0));
}
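
// Illustrative sketch (hypothetical data): the chain of widening pairwise adds
// (vpaddl) above folds the eight absolute byte differences into one 16-bit sum
// held in lane 0 of the result:
//
//     __m64 row_a = _mm_set_pi8(9, 8, 7, 6, 5, 4, 3, 2);
//     __m64 row_b = _mm_set_pi8(1, 1, 1, 1, 1, 1, 1, 1);
//     int sad = _mm_extract_pi16(_mm_sad_pu8(row_a, row_b), 0);
//     // sad == 8 + 7 + 6 + 5 + 4 + 3 + 2 + 1 == 36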

// Macro: Set the flush zero bits of the MXCSR control and status register to
// the value in unsigned 32-bit integer a. The flush zero may contain any of the
// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE
FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
{
    // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
    // regardless of the value of the FZ bit.
    union {
        fpcr_bitfield field;
#if defined(__aarch64__) || defined(_M_ARM64)
        uint64_t value;
#else
        uint32_t value;
#endif
    } r;

#if defined(__aarch64__) || defined(_M_ARM64)
    r.value = _sse2neon_get_fpcr();
#else
    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;

#if defined(__aarch64__) || defined(_M_ARM64)
    _sse2neon_set_fpcr(r.value);
#else
    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
#endif
}

// Set packed single-precision (32-bit) floating-point elements in dst with the
// supplied values.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps
FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
{
    float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
    return vreinterpretq_m128_f32(vld1q_f32(data));
}

// Broadcast single-precision (32-bit) floating-point value a to all elements of
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1
FORCE_INLINE __m128 _mm_set_ps1(float _w)
{
    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
}

// Macro: Set the rounding mode bits of the MXCSR control and status register to
// the value in unsigned 32-bit integer a. The rounding mode may contain any of
// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
// _MM_ROUND_TOWARD_ZERO
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
{
    union {
        fpcr_bitfield field;
#if defined(__aarch64__) || defined(_M_ARM64)
        uint64_t value;
#else
        uint32_t value;
#endif
    } r;

#if defined(__aarch64__) || defined(_M_ARM64)
    r.value = _sse2neon_get_fpcr();
#else
    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    switch (rounding) {
    case _MM_ROUND_TOWARD_ZERO:
        r.field.bit22 = 1;
        r.field.bit23 = 1;
        break;
    case _MM_ROUND_DOWN:
        r.field.bit22 = 0;
        r.field.bit23 = 1;
        break;
    case _MM_ROUND_UP:
        r.field.bit22 = 1;
        r.field.bit23 = 0;
        break;
    default: //_MM_ROUND_NEAREST
        r.field.bit22 = 0;
        r.field.bit23 = 0;
    }

#if defined(__aarch64__) || defined(_M_ARM64)
    _sse2neon_set_fpcr(r.value);
#else
    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
#endif
}
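
// Illustrative usage sketch (hypothetical caller): the bit22/bit23 pairs above
// map _MM_ROUND_* onto the FPCR/FPSCR RMode field, so rounding-sensitive
// conversions such as _mm_cvt_ss2si honor the selected mode:
//
//     unsigned int saved = _MM_GET_ROUNDING_MODE();
//     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
//     int t = _mm_cvt_ss2si(_mm_set_ss(1.75f));  // 1 under round-toward-zero
//     _MM_SET_ROUNDING_MODE(saved);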
2455
2456
// Copy single-precision (32-bit) floating-point element a to the lower element
2457
// of dst, and zero the upper 3 elements.
2458
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss
2459
FORCE_INLINE __m128 _mm_set_ss(float a)
2460
{
2461
return vreinterpretq_m128_f32(vsetq_lane_f32(a, vdupq_n_f32(0), 0));
2462
}
2463
2464
// Broadcast single-precision (32-bit) floating-point value a to all elements of
2465
// dst.
2466
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps
2467
FORCE_INLINE __m128 _mm_set1_ps(float _w)
2468
{
2469
return vreinterpretq_m128_f32(vdupq_n_f32(_w));
2470
}
2471
2472
// Set the MXCSR control and status register with the value in unsigned 32-bit
2473
// integer a.
2474
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr
2475
// FIXME: _mm_setcsr() implementation supports changing the rounding mode only.
2476
FORCE_INLINE void _mm_setcsr(unsigned int a)
2477
{
2478
_MM_SET_ROUNDING_MODE(a);
2479
}
2480
2481
// Get the unsigned 32-bit value of the MXCSR control and status register.
2482
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr
2483
// FIXME: _mm_getcsr() implementation supports reading the rounding mode only.
2484
FORCE_INLINE unsigned int _mm_getcsr(void)
2485
{
2486
return _MM_GET_ROUNDING_MODE();
2487
}
2488
2489
// Set packed single-precision (32-bit) floating-point elements in dst with the
2490
// supplied values in reverse order.
2491
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps
2492
FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
2493
{
2494
float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
2495
return vreinterpretq_m128_f32(vld1q_f32(data));
2496
}
2497
2498
// Return vector of type __m128 with all elements set to zero.
2499
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps
2500
FORCE_INLINE __m128 _mm_setzero_ps(void)
2501
{
2502
return vreinterpretq_m128_f32(vdupq_n_f32(0));
2503
}
2504
2505
// Shuffle 16-bit integers in a using the control in imm8, and store the results
2506
// in dst.
2507
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16
2508
#ifdef _sse2neon_shuffle
2509
#define _mm_shuffle_pi16(a, imm) \
2510
vreinterpret_m64_s16(vshuffle_s16( \
2511
vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \
2512
((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3)))
2513
#else
2514
#define _mm_shuffle_pi16(a, imm) \
2515
_sse2neon_define1( \
2516
__m64, a, int16x4_t ret; \
2517
ret = vmov_n_s16( \
2518
vget_lane_s16(vreinterpret_s16_m64(_a), (imm) & (0x3))); \
2519
ret = vset_lane_s16( \
2520
vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 2) & 0x3), ret, \
2521
1); \
2522
ret = vset_lane_s16( \
2523
vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 4) & 0x3), ret, \
2524
2); \
2525
ret = vset_lane_s16( \
2526
vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 6) & 0x3), ret, \
2527
3); \
2528
_sse2neon_return(vreinterpret_m64_s16(ret));)
2529
#endif
2530
2531
// Perform a serializing operation on all store-to-memory instructions that were
2532
// issued prior to this instruction. Guarantees that every store instruction
2533
// that precedes, in program order, is globally visible before any store
2534
// instruction which follows the fence in program order.
2535
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence
2536
FORCE_INLINE void _mm_sfence(void)
2537
{
2538
_sse2neon_smp_mb();
2539
}
2540
2541
// Perform a serializing operation on all load-from-memory and store-to-memory
2542
// instructions that were issued prior to this instruction. Guarantees that
2543
// every memory access that precedes, in program order, the memory fence
2544
// instruction is globally visible before any memory instruction which follows
2545
// the fence in program order.
2546
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence
2547
FORCE_INLINE void _mm_mfence(void)
2548
{
2549
_sse2neon_smp_mb();
2550
}
2551
2552
// Perform a serializing operation on all load-from-memory instructions that
2553
// were issued prior to this instruction. Guarantees that every load instruction
2554
// that precedes, in program order, is globally visible before any load
2555
// instruction which follows the fence in program order.
2556
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence
2557
FORCE_INLINE void _mm_lfence(void)
2558
{
2559
_sse2neon_smp_mb();
2560
}

// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
// int imm)
#ifdef _sse2neon_shuffle
#define _mm_shuffle_ps(a, b, imm) \
    __extension__({ \
        float32x4_t _input1 = vreinterpretq_f32_m128(a); \
        float32x4_t _input2 = vreinterpretq_f32_m128(b); \
        float32x4_t _shuf = \
            vshuffleq_s32(_input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
                          (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
        vreinterpretq_m128_f32(_shuf); \
    })
#else // generic
#define _mm_shuffle_ps(a, b, imm) \
    _sse2neon_define2( \
        __m128, a, b, __m128 ret; switch (imm) { \
            case _MM_SHUFFLE(1, 0, 3, 2): \
                ret = _mm_shuffle_ps_1032(_a, _b); \
                break; \
            case _MM_SHUFFLE(2, 3, 0, 1): \
                ret = _mm_shuffle_ps_2301(_a, _b); \
                break; \
            case _MM_SHUFFLE(0, 3, 2, 1): \
                ret = _mm_shuffle_ps_0321(_a, _b); \
                break; \
            case _MM_SHUFFLE(2, 1, 0, 3): \
                ret = _mm_shuffle_ps_2103(_a, _b); \
                break; \
            case _MM_SHUFFLE(1, 0, 1, 0): \
                ret = _mm_movelh_ps(_a, _b); \
                break; \
            case _MM_SHUFFLE(1, 0, 0, 1): \
                ret = _mm_shuffle_ps_1001(_a, _b); \
                break; \
            case _MM_SHUFFLE(0, 1, 0, 1): \
                ret = _mm_shuffle_ps_0101(_a, _b); \
                break; \
            case _MM_SHUFFLE(3, 2, 1, 0): \
                ret = _mm_shuffle_ps_3210(_a, _b); \
                break; \
            case _MM_SHUFFLE(0, 0, 1, 1): \
                ret = _mm_shuffle_ps_0011(_a, _b); \
                break; \
            case _MM_SHUFFLE(0, 0, 2, 2): \
                ret = _mm_shuffle_ps_0022(_a, _b); \
                break; \
            case _MM_SHUFFLE(2, 2, 0, 0): \
                ret = _mm_shuffle_ps_2200(_a, _b); \
                break; \
            case _MM_SHUFFLE(3, 2, 0, 2): \
                ret = _mm_shuffle_ps_3202(_a, _b); \
                break; \
            case _MM_SHUFFLE(3, 2, 3, 2): \
                ret = _mm_movehl_ps(_b, _a); \
                break; \
            case _MM_SHUFFLE(1, 1, 3, 3): \
                ret = _mm_shuffle_ps_1133(_a, _b); \
                break; \
            case _MM_SHUFFLE(2, 0, 1, 0): \
                ret = _mm_shuffle_ps_2010(_a, _b); \
                break; \
            case _MM_SHUFFLE(2, 0, 0, 1): \
                ret = _mm_shuffle_ps_2001(_a, _b); \
                break; \
            case _MM_SHUFFLE(2, 0, 3, 2): \
                ret = _mm_shuffle_ps_2032(_a, _b); \
                break; \
            default: \
                ret = _mm_shuffle_ps_default(_a, _b, (imm)); \
                break; \
        } _sse2neon_return(ret);)
#endif
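
// The 8-bit selector packs four 2-bit lane indices; _MM_SHUFFLE(z, y, x, w)
// picks dst[0] = a[w], dst[1] = a[x], dst[2] = b[y], dst[3] = b[z].
// Illustrative sketch (values chosen arbitrarily):
//
//   __m128 a = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);
//   __m128 b = _mm_setr_ps(4.0f, 5.0f, 6.0f, 7.0f);
//   __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0));
//   // r = {0.0f, 1.0f, 6.0f, 7.0f}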

// Compute the square root of packed single-precision (32-bit) floating-point
// elements in a, and store the results in dst.
// Due to ARMv7-A NEON's lack of a precise square root intrinsic, we implement
// square root by multiplying input in with its reciprocal square root before
// using the Newton-Raphson method to approximate the results.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
#else
    float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));

    // Test for vrsqrteq_f32(0) -> positive infinity case.
    // Change to zero, so that s * 1/sqrt(s) result is zero too.
    const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
    const uint32x4_t div_by_zero =
        vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
    recip = vreinterpretq_f32_u32(
        vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));

    recip = vmulq_f32(
        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
        recip);
    // Additional Newton-Raphson iteration for accuracy
    recip = vmulq_f32(
        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
        recip);

    // sqrt(s) = s * 1/sqrt(s)
    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
#endif
}
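
// The ARMv7-A path above relies on the identity sqrt(s) = s * 1/sqrt(s) and on
// vrsqrtsq_f32(x, y), which returns (3 - x * y) / 2, so each step refines the
// estimate r' = r * (3 - s * r * r) / 2 (one Newton-Raphson iteration for
// 1/sqrt(s)). Illustrative scalar sketch of the same refinement (the seed
// function is hypothetical):
//
//   float r = initial_rsqrt_estimate(s);   // e.g. what vrsqrte provides
//   r = r * (3.0f - s * r * r) * 0.5f;     // first iteration
//   r = r * (3.0f - s * r * r) * 0.5f;     // second iteration
//   float sqrt_s = s * r;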

// Compute the square root of the lower single-precision (32-bit) floating-point
// element in a, store the result in the lower element of dst, and copy the
// upper 3 packed elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss
FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
{
    float32_t value =
        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
    return vreinterpretq_m128_f32(
        vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
}

// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
// or a general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
{
    vst1q_f32(p, vreinterpretq_f32_m128(a));
}

// Store the lower single-precision (32-bit) floating-point element from a into
// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
// boundary or a general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1
FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
{
    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
    vst1q_f32(p, vdupq_n_f32(a0));
}

// Store the lower single-precision (32-bit) floating-point element from a into
// memory. mem_addr does not need to be aligned on any particular boundary.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss
FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
{
    vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
}

// Store the lower single-precision (32-bit) floating-point element from a into
// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
// boundary or a general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps
#define _mm_store1_ps _mm_store_ps1

// Store the upper 2 single-precision (32-bit) floating-point elements from a
// into memory.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pi
FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
{
    *p = vreinterpret_m64_f32(vget_high_f32(a));
}

// Store the lower 2 single-precision (32-bit) floating-point elements from a
// into memory.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pi
FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
{
    *p = vreinterpret_m64_f32(vget_low_f32(a));
}

// Store 4 single-precision (32-bit) floating-point elements from a into memory
// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
// general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps
FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
{
    float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
    float32x4_t rev = vextq_f32(tmp, tmp, 2);
    vst1q_f32(p, rev);
}
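
// vrev64q_f32 swaps elements within each 64-bit half ({0,1,2,3} -> {1,0,3,2})
// and vextq_f32(tmp, tmp, 2) rotates the vector by two lanes, so the combined
// effect is the full reversal {3,2,1,0}. Illustrative sketch:
//
//   float out[4];
//   _mm_storer_ps(out, _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f));
//   // out = {3.0f, 2.0f, 1.0f, 0.0f}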

// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
// elements) from a into memory. mem_addr does not need to be aligned on any
// particular boundary.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
{
    vst1q_f32(p, vreinterpretq_f32_m128(a));
}

// Stores 16-bits of integer data a at the address p.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16
FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
{
    vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
}

// Stores 64-bits of integer data a at the address p.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64
FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
{
    vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
}

// Store 64-bits of integer data from a into memory using a non-temporal memory
// hint.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pi
FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
{
    vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));
}

// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
// point elements) from a into memory using a non-temporal memory hint.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps
FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
{
#if __has_builtin(__builtin_nontemporal_store)
    __builtin_nontemporal_store(a, (float32x4_t *) p);
#else
    vst1q_f32(p, vreinterpretq_f32_m128(a));
#endif
}

// Subtract packed single-precision (32-bit) floating-point elements in b from
// packed single-precision (32-bit) floating-point elements in a, and store the
// results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_f32(
        vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Subtract the lower single-precision (32-bit) floating-point element in b from
// the lower single-precision (32-bit) floating-point element in a, store the
// result in the lower element of dst, and copy the upper 3 packed elements from
// a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss
FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_sub_ps(a, b));
}

// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
// transposed matrix in these vectors (row0 now contains column 0, etc.).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=MM_TRANSPOSE4_PS
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
    do { \
        float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \
        float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \
        row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \
                            vget_low_f32(ROW23.val[0])); \
        row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \
                            vget_low_f32(ROW23.val[1])); \
        row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \
                            vget_high_f32(ROW23.val[0])); \
        row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \
                            vget_high_f32(ROW23.val[1])); \
    } while (0)
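
// Illustrative usage sketch (the row variables are hypothetical locals):
//
//   __m128 r0 = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);
//   __m128 r1 = _mm_setr_ps(4.0f, 5.0f, 6.0f, 7.0f);
//   __m128 r2 = _mm_setr_ps(8.0f, 9.0f, 10.0f, 11.0f);
//   __m128 r3 = _mm_setr_ps(12.0f, 13.0f, 14.0f, 15.0f);
//   _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
//   // r0 = {0, 4, 8, 12}, r1 = {1, 5, 9, 13},
//   // r2 = {2, 6, 10, 14}, r3 = {3, 7, 11, 15}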

// according to the documentation, these intrinsics behave the same as the
// non-'u' versions. We'll just alias them here.
#define _mm_ucomieq_ss _mm_comieq_ss
#define _mm_ucomige_ss _mm_comige_ss
#define _mm_ucomigt_ss _mm_comigt_ss
#define _mm_ucomile_ss _mm_comile_ss
#define _mm_ucomilt_ss _mm_comilt_ss
#define _mm_ucomineq_ss _mm_comineq_ss

// Return vector of type __m128i with undefined elements.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_undefined_si128
FORCE_INLINE __m128i _mm_undefined_si128(void)
{
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"
#endif
    __m128i a;
#if defined(_MSC_VER)
    a = _mm_setzero_si128();
#endif
    return a;
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic pop
#endif
}

// Return vector of type __m128 with undefined elements.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps
FORCE_INLINE __m128 _mm_undefined_ps(void)
{
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"
#endif
    __m128 a;
#if defined(_MSC_VER)
    a = _mm_setzero_ps();
#endif
    return a;
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic pop
#endif
}

// Unpack and interleave single-precision (32-bit) floating-point elements from
// the high half of a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps
FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128_f32(
        vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#else
    float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
    float32x2x2_t result = vzip_f32(a1, b1);
    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
#endif
}

// Unpack and interleave single-precision (32-bit) floating-point elements from
// the low half of a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps
FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128_f32(
        vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#else
    float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
    float32x2x2_t result = vzip_f32(a1, b1);
    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
#endif
}

// Compute the bitwise XOR of packed single-precision (32-bit) floating-point
// elements in a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps
FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_s32(
        veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
}

/* SSE2 */

// Add packed 16-bit integers in a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16
FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Add packed 32-bit integers in a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32
FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Add packed 64-bit integers in a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64
FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s64(
        vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
}

// Add packed 8-bit integers in a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8
FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s8(
        vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Add packed double-precision (64-bit) floating-point elements in a and b, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd
FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(
        vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2];
    c[0] = da[0] + db[0];
    c[1] = da[1] + db[1];
    return vld1q_f32((float32_t *) c);
#endif
}

// Add the lower double-precision (64-bit) floating-point element in a and b,
// store the result in the lower element of dst, and copy the upper element from
// a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd
FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return _mm_move_sd(a, _mm_add_pd(a, b));
#else
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2];
    c[0] = da[0] + db[0];
    c[1] = da[1];
    return vld1q_f32((float32_t *) c);
#endif
}

// Add 64-bit integers a and b, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_si64
FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
{
    return vreinterpret_m64_s64(
        vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
}

// Add packed signed 16-bit integers in a and b using saturation, and store the
// results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16
FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Add packed signed 8-bit integers in a and b using saturation, and store the
// results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8
FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s8(
        vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Add packed unsigned 16-bit integers in a and b using saturation, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16
FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
}

// Add packed unsigned 8-bit integers in a and b using saturation, and store the
// results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8
FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
}
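
// The vqadd* intrinsics clamp instead of wrapping, matching the x86 saturating
// adds. Illustrative sketch for the signed 16-bit case:
//
//   __m128i x = _mm_set1_epi16(32000);
//   __m128i y = _mm_set1_epi16(1000);
//   __m128i r = _mm_adds_epi16(x, y);   // every lane saturates to 32767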

// Compute the bitwise AND of packed double-precision (64-bit) floating-point
// elements in a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd
FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
{
    return vreinterpretq_m128d_s64(
        vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
}

// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
// and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128
FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Compute the bitwise NOT of packed double-precision (64-bit) floating-point
// elements in a and then AND with b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd
FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
{
    // *NOTE* argument swap
    return vreinterpretq_m128d_s64(
        vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
}

// Compute the bitwise NOT of 128 bits (representing integer data) in a and then
// AND with b, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128
FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vbicq_s32(vreinterpretq_s32_m128i(b),
                  vreinterpretq_s32_m128i(a)));  // *NOTE* argument swap
}
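
// Both andnot variants compute dst = (~a) & b, which is why the NEON BIC
// operands are swapped: vbicq(x, y) evaluates x & ~y. Illustrative sketch:
//
//   __m128i mask = _mm_set1_epi32(0x0000FFFF);
//   __m128i data = _mm_set1_epi32(0x12345678);
//   __m128i r = _mm_andnot_si128(mask, data);   // every lane = 0x12340000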

// Average packed unsigned 16-bit integers in a and b, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16
FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
{
    return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
                                 vreinterpretq_u16_m128i(b));
}

// Average packed unsigned 8-bit integers in a and b, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8
FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
}
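
// vrhadd computes the rounding average (a + b + 1) >> 1 without intermediate
// overflow, which is exactly the x86 PAVGB/PAVGW definition. Illustrative
// sketch:
//
//   __m128i x = _mm_set1_epi8((char) 250);
//   __m128i y = _mm_set1_epi8((char) 251);
//   __m128i r = _mm_avg_epu8(x, y);   // every lane holds (250 + 251 + 1) / 2 = 251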

// Shift a left by imm8 bytes while shifting in zeros, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128
#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)

// Shift a right by imm8 bytes while shifting in zeros, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128
#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)

// Cast vector of type __m128d to type __m128. This intrinsic is only used for
// compilation and does not generate any instructions, thus it has zero latency.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps
FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
{
    return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
}

// Cast vector of type __m128d to type __m128i. This intrinsic is only used for
// compilation and does not generate any instructions, thus it has zero latency.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128
FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
{
    return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
}

// Cast vector of type __m128 to type __m128d. This intrinsic is only used for
// compilation and does not generate any instructions, thus it has zero latency.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd
FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
{
    return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
}

// Cast vector of type __m128 to type __m128i. This intrinsic is only used for
// compilation and does not generate any instructions, thus it has zero latency.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128
FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
{
    return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
}

// Cast vector of type __m128i to type __m128d. This intrinsic is only used for
// compilation and does not generate any instructions, thus it has zero latency.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd
FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
#else
    return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
#endif
}

// Cast vector of type __m128i to type __m128. This intrinsic is only used for
// compilation and does not generate any instructions, thus it has zero latency.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps
FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
{
    return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
}

// Invalidate and flush the cache line that contains p from all levels of the
// cache hierarchy.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush
#if defined(__APPLE__)
#include <libkern/OSCacheControl.h>
#endif
FORCE_INLINE void _mm_clflush(void const *p)
{
    (void) p;

    /* sys_icache_invalidate is supported since macOS 10.5.
     * However, it does not work on non-jailbroken iOS devices, although the
     * compilation is successful.
     */
#if defined(__APPLE__)
    sys_icache_invalidate((void *) (uintptr_t) p, SSE2NEON_CACHELINE_SIZE);
#elif defined(__GNUC__) || defined(__clang__)
    uintptr_t ptr = (uintptr_t) p;
    __builtin___clear_cache((char *) ptr,
                            (char *) ptr + SSE2NEON_CACHELINE_SIZE);
#elif (_MSC_VER) && SSE2NEON_INCLUDE_WINDOWS_H
    FlushInstructionCache(GetCurrentProcess(), p, SSE2NEON_CACHELINE_SIZE);
#endif
}
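
// Illustrative usage sketch (buf is a hypothetical buffer that was just
// written, e.g. freshly generated code):
//
//   unsigned char buf[SSE2NEON_CACHELINE_SIZE];
//   buf[0] = 0xC3;
//   _mm_clflush(buf);   // flush/invalidate the cache line holding buf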

// Compare packed 16-bit integers in a and b for equality, and store the results
// in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16
FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Compare packed 32-bit integers in a and b for equality, and store the results
// in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32
FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u32(
        vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Compare packed 8-bit integers in a and b for equality, and store the results
// in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8
FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// for equality, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd
FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_u64(
        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
    uint32x4_t cmp =
        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
    uint32x4_t swapped = vrev64q_u32(cmp);
    return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
#endif
}
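
// On ARMv7-A there is no 64-bit vector compare, so the fallback above tests
// the two 32-bit halves of each double and folds them with the 64-bit-wise
// reversed copy: a 64-bit lane ends up all-ones only when both of its halves
// matched. Illustrative sketch of the idea for one lane:
//
//   // eq_lo = (a_lo == b_lo) ? 0xFFFFFFFF : 0
//   // eq_hi = (a_hi == b_hi) ? 0xFFFFFFFF : 0
//   // lane  = cmp & vrev64q(cmp) = {eq_lo & eq_hi, eq_hi & eq_lo}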

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for equality, store the result in the lower element of dst, and copy the
// upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd
FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
{
    return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// for greater-than-or-equal, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd
FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_u64(
        vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for greater-than-or-equal, store the result in the lower element of dst,
// and copy the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd
FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return _mm_move_sd(a, _mm_cmpge_pd(a, b));
#else
    // expand "_mm_cmpge_pd()" to reduce unnecessary operations
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = a1;

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare packed signed 16-bit integers in a and b for greater-than, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16
FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Compare packed signed 32-bit integers in a and b for greater-than, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32
FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u32(
        vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Compare packed signed 8-bit integers in a and b for greater-than, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8
FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// for greater-than, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd
FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_u64(
        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for greater-than, store the result in the lower element of dst, and copy
// the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd
FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
#else
    // expand "_mm_cmpgt_pd()" to reduce unnecessary operations
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = a1;

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// for less-than-or-equal, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd
FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_u64(
        vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for less-than-or-equal, store the result in the lower element of dst, and
// copy the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd
FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return _mm_move_sd(a, _mm_cmple_pd(a, b));
#else
    // expand "_mm_cmple_pd()" to reduce unnecessary operations
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = a1;

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare packed signed 16-bit integers in a and b for less-than, and store the
// results in dst. Note: This intrinsic emits the pcmpgtw instruction with the
// order of the operands switched.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16
FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Compare packed signed 32-bit integers in a and b for less-than, and store the
// results in dst. Note: This intrinsic emits the pcmpgtd instruction with the
// order of the operands switched.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32
FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u32(
        vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Compare packed signed 8-bit integers in a and b for less-than, and store the
// results in dst. Note: This intrinsic emits the pcmpgtb instruction with the
// order of the operands switched.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8
FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// for less-than, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd
FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_u64(
        vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for less-than, store the result in the lower element of dst, and copy the
// upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd
FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return _mm_move_sd(a, _mm_cmplt_pd(a, b));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = a1;

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// for not-equal, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd
FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
#else
    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
    uint32x4_t cmp =
        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
    uint32x4_t swapped = vrev64q_u32(cmp);
    return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped)));
#endif
}

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for not-equal, store the result in the lower element of dst, and copy the
// upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd
FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
{
    return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// for not-greater-than-or-equal, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd
FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_u64(veorq_u64(
        vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
        vdupq_n_u64(UINT64_MAX)));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] =
        !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] =
        !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for not-greater-than-or-equal, store the result in the lower element of
// dst, and copy the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd
FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
{
    return _mm_move_sd(a, _mm_cmpnge_pd(a, b));
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// for not-greater-than, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_pd
FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_u64(veorq_u64(
        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
        vdupq_n_u64(UINT64_MAX)));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] =
        !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] =
        !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for not-greater-than, store the result in the lower element of dst, and
// copy the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd
FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
{
    return _mm_move_sd(a, _mm_cmpngt_pd(a, b));
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// for not-less-than-or-equal, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd
FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_u64(veorq_u64(
        vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
        vdupq_n_u64(UINT64_MAX)));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] =
        !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] =
        !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for not-less-than-or-equal, store the result in the lower element of dst,
// and copy the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd
FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
{
    return _mm_move_sd(a, _mm_cmpnle_pd(a, b));
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// for not-less-than, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd
FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_u64(veorq_u64(
        vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
        vdupq_n_u64(UINT64_MAX)));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] =
        !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] =
        !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for not-less-than, store the result in the lower element of dst, and copy
// the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd
FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
{
    return _mm_move_sd(a, _mm_cmpnlt_pd(a, b));
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// to see if neither is NaN, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd
FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // Excluding NaNs, any two floating point numbers can be compared.
    uint64x2_t not_nan_a =
        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
    uint64x2_t not_nan_b =
        vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
    return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
            (*(double *) &b0) == (*(double *) &b0))
               ? ~UINT64_C(0)
               : UINT64_C(0);
    d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
            (*(double *) &b1) == (*(double *) &b1))
               ? ~UINT64_C(0)
               : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
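
// The ordered/unordered predicates rely on the IEEE-754 rule that a NaN never
// compares equal to itself, so (x == x) is a portable "is not NaN" test.
// Illustrative sketch (NAN comes from <math.h>):
//
//   __m128d v = _mm_set_pd(0.0, NAN);   // lower lane NaN, upper lane 0.0
//   __m128d m = _mm_cmpord_pd(v, v);
//   // lane 0 (the NaN) -> all zeros, lane 1 -> all ones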

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b to see if neither is NaN, store the result in the lower element of dst, and
// copy the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd
FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return _mm_move_sd(a, _mm_cmpord_pd(a, b));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t d[2];
    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
            (*(double *) &b0) == (*(double *) &b0))
               ? ~UINT64_C(0)
               : UINT64_C(0);
    d[1] = a1;

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// to see if either is NaN, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd
FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // Two NaNs are not equal in comparison operation.
    uint64x2_t not_nan_a =
        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
    uint64x2_t not_nan_b =
        vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
    return vreinterpretq_m128d_s32(
        vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
            (*(double *) &b0) == (*(double *) &b0))
               ? UINT64_C(0)
               : ~UINT64_C(0);
    d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
            (*(double *) &b1) == (*(double *) &b1))
               ? UINT64_C(0)
               : ~UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b to see if either is NaN, store the result in the lower element of dst, and
// copy the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd
FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t d[2];
    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
            (*(double *) &b0) == (*(double *) &b0))
               ? UINT64_C(0)
               : ~UINT64_C(0);
    d[1] = a1;

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare the lower double-precision (64-bit) floating-point element in a and b
// for greater-than-or-equal, and return the boolean result (0 or 1).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd
FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));

    return (*(double *) &a0 >= *(double *) &b0);
#endif
}

// Compare the lower double-precision (64-bit) floating-point element in a and b
// for greater-than, and return the boolean result (0 or 1).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd
FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));

    return (*(double *) &a0 > *(double *) &b0);
#endif
}

// Compare the lower double-precision (64-bit) floating-point element in a and b
// for less-than-or-equal, and return the boolean result (0 or 1).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd
FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));

    return (*(double *) &a0 <= *(double *) &b0);
#endif
}

// Compare the lower double-precision (64-bit) floating-point element in a and b
// for less-than, and return the boolean result (0 or 1).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd
FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));

    return (*(double *) &a0 < *(double *) &b0);
#endif
}

// Compare the lower double-precision (64-bit) floating-point element in a and b
// for equality, and return the boolean result (0 or 1).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd
FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
#else
    uint32x4_t a_not_nan =
        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a));
    uint32x4_t b_not_nan =
        vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b));
    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
    uint32x4_t a_eq_b =
        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
    uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),
                                       vreinterpretq_u64_u32(a_eq_b));
    return vgetq_lane_u64(and_results, 0) & 0x1;
#endif
}

// Compare the lower double-precision (64-bit) floating-point element in a and b
// for not-equal, and return the boolean result (0 or 1).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd
FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
{
    return !_mm_comieq_sd(a, b);
}

// Convert packed signed 32-bit integers in a to packed double-precision
// (64-bit) floating-point elements, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd
FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(
        vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
#else
    double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
    double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1);
    return _mm_set_pd(a1, a0);
#endif
}

// Convert packed signed 32-bit integers in a to packed single-precision
// (32-bit) floating-point elements, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
{
    return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
}

// Convert packed double-precision (64-bit) floating-point elements in a to
// packed 32-bit integers, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32
FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
{
// vrnd32xq_f64 not supported on clang
#if defined(__ARM_FEATURE_FRINT) && !defined(__clang__)
    float64x2_t rounded = vrnd32xq_f64(vreinterpretq_f64_m128d(a));
    int64x2_t integers = vcvtq_s64_f64(rounded);
    return vreinterpretq_m128i_s32(
        vcombine_s32(vmovn_s64(integers), vdup_n_s32(0)));
#else
    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
    double d0 = ((double *) &rnd)[0];
    double d1 = ((double *) &rnd)[1];
    return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
#endif
}

// Convert packed double-precision (64-bit) floating-point elements in a to
// packed 32-bit integers, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32
FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
{
    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
    double d0 = ((double *) &rnd)[0];
    double d1 = ((double *) &rnd)[1];
    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
    return vreinterpret_m64_s32(vld1_s32(data));
}

// Convert packed double-precision (64-bit) floating-point elements in a to
// packed single-precision (32-bit) floating-point elements, and store the
// results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps
FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
    return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
#else
    float a0 = (float) ((double *) &a)[0];
    float a1 = (float) ((double *) &a)[1];
    return _mm_set_ps(0, 0, a1, a0);
#endif
}

// Convert packed signed 32-bit integers in a to packed double-precision
// (64-bit) floating-point elements, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd
FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(
        vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
#else
    double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0);
    double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1);
    return _mm_set_pd(a1, a0);
#endif
}
3882
3883
// Convert packed single-precision (32-bit) floating-point elements in a to
3884
// packed 32-bit integers, and store the results in dst.
3885
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32
3886
// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
3887
// does not support! It is supported on ARMv8-A however.
3888
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
3889
{
3890
#if defined(__ARM_FEATURE_FRINT)
3891
return vreinterpretq_m128i_s32(vcvtq_s32_f32(vrnd32xq_f32(a)));
3892
#elif (defined(__aarch64__) || defined(_M_ARM64)) || \
3893
defined(__ARM_FEATURE_DIRECTED_ROUNDING)
3894
switch (_MM_GET_ROUNDING_MODE()) {
3895
case _MM_ROUND_NEAREST:
3896
return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
3897
case _MM_ROUND_DOWN:
3898
return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a));
3899
case _MM_ROUND_UP:
3900
return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a));
3901
default: // _MM_ROUND_TOWARD_ZERO
3902
return vreinterpretq_m128i_s32(vcvtq_s32_f32(a));
3903
}
3904
#else
3905
float *f = (float *) &a;
3906
switch (_MM_GET_ROUNDING_MODE()) {
3907
case _MM_ROUND_NEAREST: {
3908
uint32x4_t signmask = vdupq_n_u32(0x80000000);
3909
float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
3910
vdupq_n_f32(0.5f)); /* +/- 0.5 */
3911
int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
3912
vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
3913
int32x4_t r_trunc = vcvtq_s32_f32(
3914
vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
3915
int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
3916
vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
3917
int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
3918
vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
3919
float32x4_t delta = vsubq_f32(
3920
vreinterpretq_f32_m128(a),
3921
vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
3922
uint32x4_t is_delta_half =
3923
vceqq_f32(delta, half); /* delta == +/- 0.5 */
3924
return vreinterpretq_m128i_s32(
3925
vbslq_s32(is_delta_half, r_even, r_normal));
3926
}
3927
case _MM_ROUND_DOWN:
3928
return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),
3929
floorf(f[0]));
3930
case _MM_ROUND_UP:
3931
return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]),
3932
ceilf(f[0]));
3933
default: // _MM_ROUND_TOWARD_ZERO
3934
return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],
3935
(int32_t) f[0]);
3936
}
3937
#endif
3938
}
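// Usage sketch (illustrative values, assuming the default rounding mode
// _MM_ROUND_NEAREST): ties round to the nearest even integer, e.g.
//   _mm_cvtps_epi32(_mm_setr_ps(1.5f, 2.5f, -0.5f, 3.7f)) -> {2, 2, 0, 4}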
3939
3940
// Convert packed single-precision (32-bit) floating-point elements in a to
3941
// packed double-precision (64-bit) floating-point elements, and store the
3942
// results in dst.
3943
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd
3944
FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
3945
{
3946
#if defined(__aarch64__) || defined(_M_ARM64)
3947
return vreinterpretq_m128d_f64(
3948
vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
3949
#else
3950
double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
3951
double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
3952
return _mm_set_pd(a1, a0);
3953
#endif
3954
}
3955
3956
// Copy the lower double-precision (64-bit) floating-point element of a to dst.
3957
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64
3958
FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
3959
{
3960
#if defined(__aarch64__) || defined(_M_ARM64)
3961
return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
3962
#else
3963
return ((double *) &a)[0];
3964
#endif
3965
}
3966
3967
// Convert the lower double-precision (64-bit) floating-point element in a to a
3968
// 32-bit integer, and store the result in dst.
3969
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32
3970
FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
3971
{
3972
#if defined(__aarch64__) || defined(_M_ARM64)
3973
return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
3974
#else
3975
__m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3976
double ret = ((double *) &rnd)[0];
3977
return (int32_t) ret;
3978
#endif
3979
}
3980
3981
// Convert the lower double-precision (64-bit) floating-point element in a to a
3982
// 64-bit integer, and store the result in dst.
3983
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64
3984
FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
3985
{
3986
#if defined(__aarch64__) || defined(_M_ARM64)
3987
return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
3988
#else
3989
__m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3990
double ret = ((double *) &rnd)[0];
3991
return (int64_t) ret;
3992
#endif
3993
}
3994
3995
// Convert the lower double-precision (64-bit) floating-point element in a to a
3996
// 64-bit integer, and store the result in dst.
3997
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64x
3998
#define _mm_cvtsd_si64x _mm_cvtsd_si64
3999
4000
// Convert the lower double-precision (64-bit) floating-point element in b to a
4001
// single-precision (32-bit) floating-point element, store the result in the
4002
// lower element of dst, and copy the upper 3 packed elements from a to the
4003
// upper elements of dst.
4004
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss
4005
FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
4006
{
4007
#if defined(__aarch64__) || defined(_M_ARM64)
4008
return vreinterpretq_m128_f32(vsetq_lane_f32(
4009
vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
4010
vreinterpretq_f32_m128(a), 0));
4011
#else
4012
return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0],
4013
vreinterpretq_f32_m128(a), 0));
4014
#endif
4015
}
4016
4017
// Copy the lower 32-bit integer in a to dst.
4018
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32
4019
FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
4020
{
4021
return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
4022
}
4023
4024
// Copy the lower 64-bit integer in a to dst.
4025
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64
4026
FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
4027
{
4028
return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
4029
}
4030
4031
// Copy the lower 64-bit integer in a to dst.
4032
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
4033
#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4034
4035
// Convert the signed 32-bit integer b to a double-precision (64-bit)
4036
// floating-point element, store the result in the lower element of dst, and
4037
// copy the upper element from a to the upper element of dst.
4038
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd
4039
FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
4040
{
4041
#if defined(__aarch64__) || defined(_M_ARM64)
4042
return vreinterpretq_m128d_f64(
4043
vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
4044
#else
4045
double bf = (double) b;
4046
return vreinterpretq_m128d_s64(
4047
vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
4048
#endif
4049
}
4050
4051
// Copy the lower 64-bit integer in a to dst.
4052
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
4053
#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4054
4055
// Copy 32-bit integer a to the lower elements of dst, and zero the upper
4056
// elements of dst.
4057
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128
4058
FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
4059
{
4060
return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
4061
}
4062
4063
// Convert the signed 64-bit integer b to a double-precision (64-bit)
4064
// floating-point element, store the result in the lower element of dst, and
4065
// copy the upper element from a to the upper element of dst.
4066
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd
4067
FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
4068
{
4069
#if defined(__aarch64__) || defined(_M_ARM64)
4070
return vreinterpretq_m128d_f64(
4071
vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
4072
#else
4073
double bf = (double) b;
4074
return vreinterpretq_m128d_s64(
4075
vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
4076
#endif
4077
}
4078
4079
// Copy 64-bit integer a to the lower element of dst, and zero the upper
4080
// element.
4081
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_si128
4082
FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
4083
{
4084
return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
4085
}
4086
4087
// Copy 64-bit integer a to the lower element of dst, and zero the upper
4088
// element.
4089
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_si128
4090
#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
4091
4092
// Convert the signed 64-bit integer b to a double-precision (64-bit)
4093
// floating-point element, store the result in the lower element of dst, and
4094
// copy the upper element from a to the upper element of dst.
4095
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_sd
4096
#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
4097
4098
// Convert the lower single-precision (32-bit) floating-point element in b to a
4099
// double-precision (64-bit) floating-point element, store the result in the
4100
// lower element of dst, and copy the upper element from a to the upper element
4101
// of dst.
4102
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd
4103
FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
4104
{
4105
double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
4106
#if defined(__aarch64__) || defined(_M_ARM64)
4107
return vreinterpretq_m128d_f64(
4108
vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
4109
#else
4110
return vreinterpretq_m128d_s64(
4111
vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
4112
#endif
4113
}
4114
4115
// Convert packed double-precision (64-bit) floating-point elements in a to
4116
// packed 32-bit integers with truncation, and store the results in dst.
4117
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32
4118
FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
4119
{
4120
double a0 = ((double *) &a)[0];
4121
double a1 = ((double *) &a)[1];
4122
return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);
4123
}
4124
4125
// Convert packed double-precision (64-bit) floating-point elements in a to
4126
// packed 32-bit integers with truncation, and store the results in dst.
4127
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32
4128
FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
4129
{
4130
double a0 = ((double *) &a)[0];
4131
double a1 = ((double *) &a)[1];
4132
int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
4133
return vreinterpret_m64_s32(vld1_s32(data));
4134
}
4135
4136
// Convert packed single-precision (32-bit) floating-point elements in a to
4137
// packed 32-bit integers with truncation, and store the results in dst.
4138
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32
4139
FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
4140
{
4141
return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
4142
}
4143
4144
// Convert the lower double-precision (64-bit) floating-point element in a to a
4145
// 32-bit integer with truncation, and store the result in dst.
4146
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32
4147
FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
4148
{
4149
double ret = *((double *) &a);
4150
return (int32_t) ret;
4151
}
4152
4153
// Convert the lower double-precision (64-bit) floating-point element in a to a
4154
// 64-bit integer with truncation, and store the result in dst.
4155
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64
4156
FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
4157
{
4158
#if defined(__aarch64__) || defined(_M_ARM64)
4159
return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
4160
#else
4161
double ret = *((double *) &a);
4162
return (int64_t) ret;
4163
#endif
4164
}
4165
4166
// Convert the lower double-precision (64-bit) floating-point element in a to a
4167
// 64-bit integer with truncation, and store the result in dst.
4168
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64x
4169
#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
4170
4171
// Divide packed double-precision (64-bit) floating-point elements in a by
4172
// packed elements in b, and store the results in dst.
4173
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd
4174
FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
4175
{
4176
#if defined(__aarch64__) || defined(_M_ARM64)
4177
return vreinterpretq_m128d_f64(
4178
vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4179
#else
4180
double *da = (double *) &a;
4181
double *db = (double *) &b;
4182
double c[2];
4183
c[0] = da[0] / db[0];
4184
c[1] = da[1] / db[1];
4185
return vld1q_f32((float32_t *) c);
4186
#endif
4187
}
4188
4189
// Divide the lower double-precision (64-bit) floating-point element in a by the
4190
// lower double-precision (64-bit) floating-point element in b, store the result
4191
// in the lower element of dst, and copy the upper element from a to the upper
4192
// element of dst.
4193
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd
4194
FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
4195
{
4196
#if defined(__aarch64__) || defined(_M_ARM64)
4197
float64x2_t tmp =
4198
vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
4199
return vreinterpretq_m128d_f64(
4200
vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
4201
#else
4202
return _mm_move_sd(a, _mm_div_pd(a, b));
4203
#endif
4204
}
4205
4206
// Extract a 16-bit integer from a, selected with imm8, and store the result in
4207
// the lower element of dst.
4208
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16
4209
// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
4210
#define _mm_extract_epi16(a, imm) \
4211
vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
4212
4213
// Copy a to dst, and insert the 16-bit integer i into dst at the location
4214
// specified by imm8.
4215
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16
4216
// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
4217
// __constrange(0,8) int imm)
4218
#define _mm_insert_epi16(a, b, imm) \
4219
vreinterpretq_m128i_s16( \
4220
vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm)))
4221
4222
// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point
4223
// elements) from memory into dst. mem_addr must be aligned on a 16-byte
4224
// boundary or a general-protection exception may be generated.
4225
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd
4226
FORCE_INLINE __m128d _mm_load_pd(const double *p)
4227
{
4228
#if defined(__aarch64__) || defined(_M_ARM64)
4229
return vreinterpretq_m128d_f64(vld1q_f64(p));
4230
#else
4231
const float *fp = (const float *) p;
4232
float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
4233
return vreinterpretq_m128d_f32(vld1q_f32(data));
4234
#endif
4235
}
4236
4237
// Load a double-precision (64-bit) floating-point element from memory into both
4238
// elements of dst.
4239
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1
4240
#define _mm_load_pd1 _mm_load1_pd
4241
4242
// Load a double-precision (64-bit) floating-point element from memory into the
4243
// lower of dst, and zero the upper element. mem_addr does not need to be
4244
// aligned on any particular boundary.
4245
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd
4246
FORCE_INLINE __m128d _mm_load_sd(const double *p)
4247
{
4248
#if defined(__aarch64__) || defined(_M_ARM64)
4249
return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
4250
#else
4251
const float *fp = (const float *) p;
4252
float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
4253
return vreinterpretq_m128d_f32(vld1q_f32(data));
4254
#endif
4255
}
4256
4257
// Load 128-bits of integer data from memory into dst. mem_addr must be aligned
4258
// on a 16-byte boundary or a general-protection exception may be generated.
4259
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128
4260
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
4261
{
4262
return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4263
}
4264
4265
// Load a double-precision (64-bit) floating-point element from memory into both
4266
// elements of dst.
4267
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd
4268
FORCE_INLINE __m128d _mm_load1_pd(const double *p)
4269
{
4270
#if defined(__aarch64__) || defined(_M_ARM64)
4271
return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
4272
#else
4273
return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
4274
#endif
4275
}
4276
4277
// Load a double-precision (64-bit) floating-point element from memory into the
4278
// upper element of dst, and copy the lower element from a to dst. mem_addr does
4279
// not need to be aligned on any particular boundary.
4280
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd
4281
FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
4282
{
4283
#if defined(__aarch64__) || defined(_M_ARM64)
4284
return vreinterpretq_m128d_f64(
4285
vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
4286
#else
4287
return vreinterpretq_m128d_f32(vcombine_f32(
4288
vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
4289
#endif
4290
}
4291
4292
// Load 64-bit integer from memory into the first element of dst.
4293
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64
4294
FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
4295
{
4296
/* Load the lower 64 bits of the value pointed to by p into the
4297
* lower 64 bits of the result, zeroing the upper 64 bits of the result.
4298
*/
4299
return vreinterpretq_m128i_s32(
4300
vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
4301
}
4302
4303
// Load a double-precision (64-bit) floating-point element from memory into the
4304
// lower element of dst, and copy the upper element from a to dst. mem_addr does
4305
// not need to be aligned on any particular boundary.
4306
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd
4307
FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
4308
{
4309
#if defined(__aarch64__) || defined(_M_ARM64)
4310
return vreinterpretq_m128d_f64(
4311
vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
4312
#else
4313
return vreinterpretq_m128d_f32(
4314
vcombine_f32(vld1_f32((const float *) p),
4315
vget_high_f32(vreinterpretq_f32_m128d(a))));
4316
#endif
4317
}
4318
4319
// Load 2 double-precision (64-bit) floating-point elements from memory into dst
4320
// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
4321
// general-protection exception may be generated.
4322
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd
4323
FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
4324
{
4325
#if defined(__aarch64__) || defined(_M_ARM64)
4326
float64x2_t v = vld1q_f64(p);
4327
return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
4328
#else
4329
int64x2_t v = vld1q_s64((const int64_t *) p);
4330
return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
4331
#endif
4332
}
4333
4334
// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point
// elements) from memory into dst. mem_addr does not need to be aligned on any
// particular boundary.
4335
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd
4336
FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
4337
{
4338
return _mm_load_pd(p);
4339
}
4340
4341
// Load 128-bits of integer data from memory into dst. mem_addr does not need to
4342
// be aligned on any particular boundary.
4343
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128
4344
FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
4345
{
4346
return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4347
}
4348
4349
// Load unaligned 32-bit integer from memory into the first element of dst.
4350
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32
4351
FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
4352
{
4353
return vreinterpretq_m128i_s32(
4354
vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
4355
}
4356
4357
// Multiply packed signed 16-bit integers in a and b, producing intermediate
4358
// signed 32-bit integers. Horizontally add adjacent pairs of intermediate
4359
// 32-bit integers, and pack the results in dst.
4360
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16
4361
FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
4362
{
4363
int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
4364
vget_low_s16(vreinterpretq_s16_m128i(b)));
4365
#if defined(__aarch64__) || defined(_M_ARM64)
4366
int32x4_t high =
4367
vmull_high_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b));
4368
4369
return vreinterpretq_m128i_s32(vpaddq_s32(low, high));
4370
#else
4371
int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
4372
vget_high_s16(vreinterpretq_s16_m128i(b)));
4373
4374
int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
4375
int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
4376
4377
return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
4378
#endif
4379
}
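// Worked example (illustrative values): with every 16-bit lane of a equal to 1
// and every lane of b equal to 3, each product is 3 and adjacent pairs sum to
// 6, so _mm_madd_epi16(_mm_set1_epi16(1), _mm_set1_epi16(3)) yields four
// 32-bit lanes all equal to 6.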
4380
4381
// Conditionally store 8-bit integer elements from a into memory using mask
4382
// (elements are not stored when the highest bit is not set in the corresponding
4383
// element) and a non-temporal memory hint. mem_addr does not need to be aligned
4384
// on any particular boundary.
4385
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128
4386
FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
4387
{
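// This emulation is a read-modify-write: load the 16 destination bytes,
// select between a and the loaded data using the sign bit of each mask
// byte, then store the blend back. The non-temporal hint is dropped.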
4388
int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
4389
__m128 b = _mm_load_ps((const float *) mem_addr);
4390
int8x16_t masked =
4391
vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a),
4392
vreinterpretq_s8_m128(b));
4393
vst1q_s8((int8_t *) mem_addr, masked);
4394
}
4395
4396
// Compare packed signed 16-bit integers in a and b, and store packed maximum
4397
// values in dst.
4398
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16
4399
FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
4400
{
4401
return vreinterpretq_m128i_s16(
4402
vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4403
}
4404
4405
// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
4406
// values in dst.
4407
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8
4408
FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
4409
{
4410
return vreinterpretq_m128i_u8(
4411
vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
4412
}
4413
4414
// Compare packed double-precision (64-bit) floating-point elements in a and b,
4415
// and store packed maximum values in dst.
4416
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd
4417
FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
4418
{
4419
#if defined(__aarch64__) || defined(_M_ARM64)
4420
#if SSE2NEON_PRECISE_MINMAX
4421
float64x2_t _a = vreinterpretq_f64_m128d(a);
4422
float64x2_t _b = vreinterpretq_f64_m128d(b);
4423
return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b));
4424
#else
4425
return vreinterpretq_m128d_f64(
4426
vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4427
#endif
4428
#else
4429
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
4430
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
4431
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
4432
uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
4433
uint64_t d[2];
4434
d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
4435
d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;
4436
4437
return vreinterpretq_m128d_u64(vld1q_u64(d));
4438
#endif
4439
}
4440
4441
// Compare the lower double-precision (64-bit) floating-point elements in a and
4442
// b, store the maximum value in the lower element of dst, and copy the upper
4443
// element from a to the upper element of dst.
4444
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd
4445
FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
4446
{
4447
#if defined(__aarch64__) || defined(_M_ARM64)
4448
return _mm_move_sd(a, _mm_max_pd(a, b));
4449
#else
4450
double *da = (double *) &a;
4451
double *db = (double *) &b;
4452
double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]};
4453
return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
4454
#endif
4455
}
4456
4457
// Compare packed signed 16-bit integers in a and b, and store packed minimum
4458
// values in dst.
4459
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16
4460
FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
4461
{
4462
return vreinterpretq_m128i_s16(
4463
vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4464
}
4465
4466
// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
4467
// values in dst.
4468
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8
4469
FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
4470
{
4471
return vreinterpretq_m128i_u8(
4472
vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
4473
}
4474
4475
// Compare packed double-precision (64-bit) floating-point elements in a and b,
4476
// and store packed minimum values in dst.
4477
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd
4478
FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
4479
{
4480
#if defined(__aarch64__) || defined(_M_ARM64)
4481
#if SSE2NEON_PRECISE_MINMAX
4482
float64x2_t _a = vreinterpretq_f64_m128d(a);
4483
float64x2_t _b = vreinterpretq_f64_m128d(b);
4484
return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b));
4485
#else
4486
return vreinterpretq_m128d_f64(
4487
vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4488
#endif
4489
#else
4490
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
4491
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
4492
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
4493
uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
4494
uint64_t d[2];
4495
d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
4496
d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
4497
return vreinterpretq_m128d_u64(vld1q_u64(d));
4498
#endif
4499
}
4500
4501
// Compare the lower double-precision (64-bit) floating-point elements in a and
4502
// b, store the minimum value in the lower element of dst, and copy the upper
4503
// element from a to the upper element of dst.
4504
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd
4505
FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
4506
{
4507
#if defined(__aarch64__) || defined(_M_ARM64)
4508
return _mm_move_sd(a, _mm_min_pd(a, b));
4509
#else
4510
double *da = (double *) &a;
4511
double *db = (double *) &b;
4512
double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]};
4513
return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
4514
#endif
4515
}
4516
4517
// Copy the lower 64-bit integer in a to the lower element of dst, and zero the
4518
// upper element.
4519
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64
4520
FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
4521
{
4522
return vreinterpretq_m128i_s64(
4523
vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
4524
}
4525
4526
// Move the lower double-precision (64-bit) floating-point element from b to the
4527
// lower element of dst, and copy the upper element from a to the upper element
4528
// of dst.
4529
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd
4530
FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
4531
{
4532
return vreinterpretq_m128d_f32(
4533
vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
4534
vget_high_f32(vreinterpretq_f32_m128d(a))));
4535
}
4536
4537
// Create mask from the most significant bit of each 8-bit element in a, and
4538
// store the result in dst.
4539
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8
4540
FORCE_INLINE int _mm_movemask_epi8(__m128i a)
4541
{
4542
// Use increasingly wide shifts+adds to collect the sign bits
4543
// together.
4544
// Since the widening shifts would be rather confusing to follow in little
4545
// endian, everything will be illustrated in big endian order instead. This
4546
// has a different result - the bits would actually be reversed on a big
4547
// endian machine.
4548
4549
// Starting input (only half the elements are shown):
4550
// 89 ff 1d c0 00 10 99 33
4551
uint8x16_t input = vreinterpretq_u8_m128i(a);
4552
4553
// Shift out everything but the sign bits with an unsigned shift right.
4554
//
4555
// Bytes of the vector:
4556
// 89 ff 1d c0 00 10 99 33
4557
// \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7)
4558
// | | | | | | | |
4559
// 01 01 00 01 00 00 01 00
4560
//
4561
// Bits of first important lane(s):
4562
// 10001001 (89)
4563
// \______
4564
// |
4565
// 00000001 (01)
4566
uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
4567
4568
// Merge the even lanes together with a 16-bit unsigned shift right + add.
4569
// 'xx' represents garbage data which will be ignored in the final result.
4570
// In the important bytes, the add functions like a binary OR.
4571
//
4572
// 01 01 00 01 00 00 01 00
4573
// \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7))
4574
// \| \| \| \|
4575
// xx 03 xx 01 xx 00 xx 02
4576
//
4577
// 00000001 00000001 (01 01)
4578
// \_______ |
4579
// \|
4580
// xxxxxxxx xxxxxx11 (xx 03)
4581
uint32x4_t paired16 =
4582
vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
4583
4584
// Repeat with a wider 32-bit shift + add.
4585
// xx 03 xx 01 xx 00 xx 02
4586
// \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >>
4587
// 14))
4588
// \| \|
4589
// xx xx xx 0d xx xx xx 02
4590
//
4591
// 00000011 00000001 (03 01)
4592
// \\_____ ||
4593
// '----.\||
4594
// xxxxxxxx xxxx1101 (xx 0d)
4595
uint64x2_t paired32 =
4596
vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
4597
4598
// Last, an even wider 64-bit shift + add to get our result in the low 8 bit
4599
// lanes. xx xx xx 0d xx xx xx 02
4600
// \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >>
4601
// 28))
4602
// \|
4603
// xx xx xx xx xx xx xx d2
4604
//
4605
// 00001101 00000010 (0d 02)
4606
// \ \___ | |
4607
// '---. \| |
4608
// xxxxxxxx 11010010 (xx d2)
4609
uint8x16_t paired64 =
4610
vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
4611
4612
// Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
4613
// xx xx xx xx xx xx xx d2
4614
// || return paired64[0]
4615
// d2
4616
// Note: Little endian would return the correct value 4b (01001011) instead.
4617
return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
4618
}
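// Usage sketch (illustrative values): bit i of the result is the sign bit of
// byte i, so _mm_movemask_epi8(_mm_set1_epi8(-1)) == 0xFFFF and
// _mm_movemask_epi8(_mm_setzero_si128()) == 0.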
4619
4620
// Set each bit of mask dst based on the most significant bit of the
4621
// corresponding packed double-precision (64-bit) floating-point element in a.
4622
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd
4623
FORCE_INLINE int _mm_movemask_pd(__m128d a)
4624
{
4625
uint64x2_t input = vreinterpretq_u64_m128d(a);
4626
uint64x2_t high_bits = vshrq_n_u64(input, 63);
4627
return (int) (vgetq_lane_u64(high_bits, 0) |
4628
(vgetq_lane_u64(high_bits, 1) << 1));
4629
}
4630
4631
// Copy the lower 64-bit integer in a to dst.
4632
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_pi64
4633
FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
4634
{
4635
return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
4636
}
4637
4638
// Copy the 64-bit integer a to the lower element of dst, and zero the upper
4639
// element.
4640
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movpi64_epi64
4641
FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
4642
{
4643
return vreinterpretq_m128i_s64(
4644
vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
4645
}
4646
4647
// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
4648
// a and b, and store the unsigned 64-bit results in dst.
4649
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32
4650
FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
4651
{
4652
// vmull_u32 upcasts instead of masking, so we downcast.
4653
uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
4654
uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
4655
return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
4656
}
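// Note: only the even-indexed 32-bit lanes participate, i.e. the result is
// { (uint64_t) a0 * b0, (uint64_t) a2 * b2 } where a0/a2 and b0/b2 are the
// 32-bit elements 0 and 2 of a and b.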
4657
4658
// Multiply packed double-precision (64-bit) floating-point elements in a and b,
4659
// and store the results in dst.
4660
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd
4661
FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
4662
{
4663
#if defined(__aarch64__) || defined(_M_ARM64)
4664
return vreinterpretq_m128d_f64(
4665
vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4666
#else
4667
double *da = (double *) &a;
4668
double *db = (double *) &b;
4669
double c[2];
4670
c[0] = da[0] * db[0];
4671
c[1] = da[1] * db[1];
4672
return vld1q_f32((float32_t *) c);
4673
#endif
4674
}
4675
4676
// Multiply the lower double-precision (64-bit) floating-point element in a and
4677
// b, store the result in the lower element of dst, and copy the upper element
4678
// from a to the upper element of dst.
4679
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_sd
4680
FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
4681
{
4682
return _mm_move_sd(a, _mm_mul_pd(a, b));
4683
}
4684
4685
// Multiply the low unsigned 32-bit integers from a and b, and store the
4686
// unsigned 64-bit result in dst.
4687
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_su32
4688
FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
4689
{
4690
return vreinterpret_m64_u64(vget_low_u64(
4691
vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
4692
}
4693
4694
// Multiply the packed signed 16-bit integers in a and b, producing intermediate
4695
// 32-bit integers, and store the high 16 bits of the intermediate integers in
4696
// dst.
4697
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16
4698
FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
4699
{
4700
/* FIXME: issue with large values because of result saturation */
4701
// int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
4702
// vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
4703
// vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
4704
int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
4705
int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
4706
int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
4707
int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
4708
int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
4709
int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
4710
uint16x8x2_t r =
4711
vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
4712
return vreinterpretq_m128i_u16(r.val[1]);
4713
}
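// Worked example (illustrative values): 0x4000 * 0x4000 = 0x10000000, so
// _mm_mulhi_epi16(_mm_set1_epi16(0x4000), _mm_set1_epi16(0x4000)) gives
// 0x1000 in every 16-bit lane.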
4714
4715
// Multiply the packed unsigned 16-bit integers in a and b, producing
4716
// intermediate 32-bit integers, and store the high 16 bits of the intermediate
4717
// integers in dst.
4718
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16
4719
FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
4720
{
4721
uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
4722
uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
4723
uint32x4_t ab3210 = vmull_u16(a3210, b3210);
4724
#if defined(__aarch64__) || defined(_M_ARM64)
4725
uint32x4_t ab7654 =
4726
vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
4727
uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
4728
vreinterpretq_u16_u32(ab7654));
4729
return vreinterpretq_m128i_u16(r);
4730
#else
4731
uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
4732
uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
4733
uint32x4_t ab7654 = vmull_u16(a7654, b7654);
4734
uint16x8x2_t r =
4735
vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
4736
return vreinterpretq_m128i_u16(r.val[1]);
4737
#endif
4738
}
4739
4740
// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit
4741
// integers, and store the low 16 bits of the intermediate integers in dst.
4742
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16
4743
FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
4744
{
4745
return vreinterpretq_m128i_s16(
4746
vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4747
}
4748
4749
// Compute the bitwise OR of packed double-precision (64-bit) floating-point
4750
// elements in a and b, and store the results in dst.
4751
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_or_pd
4752
FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
4753
{
4754
return vreinterpretq_m128d_s64(
4755
vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
4756
}
4757
4758
// Compute the bitwise OR of 128 bits (representing integer data) in a and b,
4759
// and store the result in dst.
4760
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128
4761
FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
4762
{
4763
return vreinterpretq_m128i_s32(
4764
vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4765
}
4766
4767
// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
4768
// using signed saturation, and store the results in dst.
4769
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16
4770
FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
4771
{
4772
return vreinterpretq_m128i_s8(
4773
vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
4774
vqmovn_s16(vreinterpretq_s16_m128i(b))));
4775
}
4776
4777
// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
4778
// using signed saturation, and store the results in dst.
4779
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32
4780
FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
4781
{
4782
return vreinterpretq_m128i_s16(
4783
vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
4784
vqmovn_s32(vreinterpretq_s32_m128i(b))));
4785
}
4786
4787
// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
4788
// using unsigned saturation, and store the results in dst.
4789
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16
4790
FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
4791
{
4792
return vreinterpretq_m128i_u8(
4793
vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
4794
vqmovun_s16(vreinterpretq_s16_m128i(b))));
4795
}
4796
4797
// Pause the processor. This is typically used in spin-wait loops and depending
4798
// on the x86 processor, typical values are in the 40-100 cycle range. The
4799
// 'yield' instruction isn't a good fit because it's effectively a nop on most
4800
// Arm cores. Experience with several databases has shown an 'isb' is
4801
// a reasonable approximation.
4802
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause
4803
FORCE_INLINE void _mm_pause(void)
4804
{
4805
#if defined(_MSC_VER)
4806
__isb(_ARM64_BARRIER_SY);
4807
#else
4808
__asm__ __volatile__("isb\n");
4809
#endif
4810
}
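// Typical usage sketch (the flag variable is illustrative):
//   while (!atomic_load_explicit(&flag, memory_order_acquire))
//       _mm_pause(); // back off briefly between polls of the shared flag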
4811
4812
// Compute the absolute differences of packed unsigned 8-bit integers in a and
4813
// b, then horizontally sum each consecutive 8 differences to produce two
4814
// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
4815
// 16 bits of 64-bit elements in dst.
4816
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8
4817
FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
4818
{
4819
uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
4820
return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
4821
}
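// Worked example (illustrative values): if every byte of a is 3 and every byte
// of b is 1, each group of eight absolute differences sums to 16, so both
// 64-bit lanes of the result hold 16.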
4822
4823
// Set packed 16-bit integers in dst with the supplied values.
4824
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16
4825
FORCE_INLINE __m128i _mm_set_epi16(short i7,
4826
short i6,
4827
short i5,
4828
short i4,
4829
short i3,
4830
short i2,
4831
short i1,
4832
short i0)
4833
{
4834
int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
4835
return vreinterpretq_m128i_s16(vld1q_s16(data));
4836
}
4837
4838
// Set packed 32-bit integers in dst with the supplied values.
4839
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32
4840
FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
4841
{
4842
int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
4843
return vreinterpretq_m128i_s32(vld1q_s32(data));
4844
}
4845
4846
// Set packed 64-bit integers in dst with the supplied values.
4847
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64
4848
FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
4849
{
4850
return _mm_set_epi64x(vget_lane_s64(i1, 0), vget_lane_s64(i2, 0));
4851
}
4852
4853
// Set packed 64-bit integers in dst with the supplied values.
4854
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x
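// Note: i1 becomes the upper 64-bit element and i2 the lower one, matching the
// Intel argument order _mm_set_epi64x(e1, e0).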
4855
FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
4856
{
4857
return vreinterpretq_m128i_s64(
4858
vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
4859
}
4860
4861
// Set packed 8-bit integers in dst with the supplied values.
4862
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8
4863
FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
4864
signed char b14,
4865
signed char b13,
4866
signed char b12,
4867
signed char b11,
4868
signed char b10,
4869
signed char b9,
4870
signed char b8,
4871
signed char b7,
4872
signed char b6,
4873
signed char b5,
4874
signed char b4,
4875
signed char b3,
4876
signed char b2,
4877
signed char b1,
4878
signed char b0)
4879
{
4880
int8_t ALIGN_STRUCT(16)
4881
data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
4882
(int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
4883
(int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
4884
(int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
4885
return (__m128i) vld1q_s8(data);
4886
}
4887
4888
// Set packed double-precision (64-bit) floating-point elements in dst with the
4889
// supplied values.
4890
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd
4891
FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
4892
{
4893
double ALIGN_STRUCT(16) data[2] = {e0, e1};
4894
#if defined(__aarch64__) || defined(_M_ARM64)
4895
return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
4896
#else
4897
return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
4898
#endif
4899
}
4900
4901
// Broadcast double-precision (64-bit) floating-point value a to all elements of
4902
// dst.
4903
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1
4904
#define _mm_set_pd1 _mm_set1_pd
4905
4906
// Copy double-precision (64-bit) floating-point element a to the lower element
4907
// of dst, and zero the upper element.
4908
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd
4909
FORCE_INLINE __m128d _mm_set_sd(double a)
4910
{
4911
#if defined(__aarch64__) || defined(_M_ARM64)
4912
return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0));
4913
#else
4914
return _mm_set_pd(0, a);
4915
#endif
4916
}
4917
4918
// Broadcast 16-bit integer a to all elements of dst.
4919
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16
4920
FORCE_INLINE __m128i _mm_set1_epi16(short w)
4921
{
4922
return vreinterpretq_m128i_s16(vdupq_n_s16(w));
4923
}
4924
4925
// Broadcast 32-bit integer a to all elements of dst.
4926
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32
4927
FORCE_INLINE __m128i _mm_set1_epi32(int _i)
4928
{
4929
return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
4930
}
4931
4932
// Broadcast 64-bit integer a to all elements of dst.
4933
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64
4934
FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
4935
{
4936
return vreinterpretq_m128i_s64(vdupq_lane_s64(_i, 0));
4937
}
4938
4939
// Broadcast 64-bit integer a to all elements of dst.
4940
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x
4941
FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
4942
{
4943
return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
4944
}
4945
4946
// Broadcast 8-bit integer a to all elements of dst.
4947
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8
4948
FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
4949
{
4950
return vreinterpretq_m128i_s8(vdupq_n_s8(w));
4951
}
4952
4953
// Broadcast double-precision (64-bit) floating-point value a to all elements of
4954
// dst.
4955
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd
4956
FORCE_INLINE __m128d _mm_set1_pd(double d)
4957
{
4958
#if defined(__aarch64__) || defined(_M_ARM64)
4959
return vreinterpretq_m128d_f64(vdupq_n_f64(d));
4960
#else
4961
return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
4962
#endif
4963
}
4964
4965
// Set packed 16-bit integers in dst with the supplied values in reverse order.
4966
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16
4967
FORCE_INLINE __m128i _mm_setr_epi16(short w0,
4968
short w1,
4969
short w2,
4970
short w3,
4971
short w4,
4972
short w5,
4973
short w6,
4974
short w7)
4975
{
4976
int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
4977
return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
4978
}
4979
4980
// Set packed 32-bit integers in dst with the supplied values in reverse order.
4981
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32
4982
FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
4983
{
4984
int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
4985
return vreinterpretq_m128i_s32(vld1q_s32(data));
4986
}
4987
4988
// Set packed 64-bit integers in dst with the supplied values in reverse order.
4989
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi64
4990
FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
4991
{
4992
return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
4993
}
4994
4995
// Set packed 8-bit integers in dst with the supplied values in reverse order.
4996
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8
4997
FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
4998
signed char b1,
4999
signed char b2,
5000
signed char b3,
5001
signed char b4,
5002
signed char b5,
5003
signed char b6,
5004
signed char b7,
5005
signed char b8,
5006
signed char b9,
5007
signed char b10,
5008
signed char b11,
5009
signed char b12,
5010
signed char b13,
5011
signed char b14,
5012
signed char b15)
5013
{
5014
int8_t ALIGN_STRUCT(16)
5015
data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
5016
(int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
5017
(int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
5018
(int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
5019
return (__m128i) vld1q_s8(data);
5020
}
5021
5022
// Set packed double-precision (64-bit) floating-point elements in dst with the
5023
// supplied values in reverse order.
5024
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd
5025
FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
5026
{
5027
return _mm_set_pd(e0, e1);
5028
}
5029
5030
// Return vector of type __m128d with all elements set to zero.
5031
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd
5032
FORCE_INLINE __m128d _mm_setzero_pd(void)
5033
{
5034
#if defined(__aarch64__) || defined(_M_ARM64)
5035
return vreinterpretq_m128d_f64(vdupq_n_f64(0));
5036
#else
5037
return vreinterpretq_m128d_f32(vdupq_n_f32(0));
5038
#endif
5039
}
5040
5041
// Return vector of type __m128i with all elements set to zero.
5042
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128
5043
FORCE_INLINE __m128i _mm_setzero_si128(void)
5044
{
5045
return vreinterpretq_m128i_s32(vdupq_n_s32(0));
5046
}
5047
5048
// Shuffle 32-bit integers in a using the control in imm8, and store the results
5049
// in dst.
5050
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32
5051
// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
5052
// __constrange(0,255) int imm)
5053
#if defined(_sse2neon_shuffle)
5054
#define _mm_shuffle_epi32(a, imm) \
5055
__extension__({ \
5056
int32x4_t _input = vreinterpretq_s32_m128i(a); \
5057
int32x4_t _shuf = \
5058
vshuffleq_s32(_input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
5059
((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \
5060
vreinterpretq_m128i_s32(_shuf); \
5061
})
5062
#else // generic
5063
#define _mm_shuffle_epi32(a, imm) \
5064
_sse2neon_define1( \
5065
__m128i, a, __m128i ret; switch (imm) { \
5066
case _MM_SHUFFLE(1, 0, 3, 2): \
5067
ret = _mm_shuffle_epi_1032(_a); \
5068
break; \
5069
case _MM_SHUFFLE(2, 3, 0, 1): \
5070
ret = _mm_shuffle_epi_2301(_a); \
5071
break; \
5072
case _MM_SHUFFLE(0, 3, 2, 1): \
5073
ret = _mm_shuffle_epi_0321(_a); \
5074
break; \
5075
case _MM_SHUFFLE(2, 1, 0, 3): \
5076
ret = _mm_shuffle_epi_2103(_a); \
5077
break; \
5078
case _MM_SHUFFLE(1, 0, 1, 0): \
5079
ret = _mm_shuffle_epi_1010(_a); \
5080
break; \
5081
case _MM_SHUFFLE(1, 0, 0, 1): \
5082
ret = _mm_shuffle_epi_1001(_a); \
5083
break; \
5084
case _MM_SHUFFLE(0, 1, 0, 1): \
5085
ret = _mm_shuffle_epi_0101(_a); \
5086
break; \
5087
case _MM_SHUFFLE(2, 2, 1, 1): \
5088
ret = _mm_shuffle_epi_2211(_a); \
5089
break; \
5090
case _MM_SHUFFLE(0, 1, 2, 2): \
5091
ret = _mm_shuffle_epi_0122(_a); \
5092
break; \
5093
case _MM_SHUFFLE(3, 3, 3, 2): \
5094
ret = _mm_shuffle_epi_3332(_a); \
5095
break; \
5096
case _MM_SHUFFLE(0, 0, 0, 0): \
5097
ret = _mm_shuffle_epi32_splat(_a, 0); \
5098
break; \
5099
case _MM_SHUFFLE(1, 1, 1, 1): \
5100
ret = _mm_shuffle_epi32_splat(_a, 1); \
5101
break; \
5102
case _MM_SHUFFLE(2, 2, 2, 2): \
5103
ret = _mm_shuffle_epi32_splat(_a, 2); \
5104
break; \
5105
case _MM_SHUFFLE(3, 3, 3, 3): \
5106
ret = _mm_shuffle_epi32_splat(_a, 3); \
5107
break; \
5108
default: \
5109
ret = _mm_shuffle_epi32_default(_a, (imm)); \
5110
break; \
5111
} _sse2neon_return(ret);)
5112
#endif
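// Usage sketch (a is illustrative): _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 2, 1, 0))
// returns a unchanged, _MM_SHUFFLE(0, 1, 2, 3) reverses the four 32-bit lanes,
// and _MM_SHUFFLE(0, 0, 0, 0) broadcasts lane 0 to every lane.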
5113
5114
// Shuffle double-precision (64-bit) floating-point elements using the control
5115
// in imm8, and store the results in dst.
5116
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd
5117
#ifdef _sse2neon_shuffle
5118
#define _mm_shuffle_pd(a, b, imm8) \
5119
vreinterpretq_m128d_s64( \
5120
vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \
5121
imm8 & 0x1, ((imm8 & 0x2) >> 1) + 2))
5122
#else
5123
#define _mm_shuffle_pd(a, b, imm8) \
5124
_mm_castsi128_pd(_mm_set_epi64x( \
5125
vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
5126
vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
5127
#endif
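// Usage sketch (a and b are illustrative): bit 0 of imm8 picks the element of a
// placed in the low lane and bit 1 picks the element of b placed in the high
// lane, so _mm_shuffle_pd(a, b, 0) gives {a0, b0} and imm8 == 3 gives {a1, b1}.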
5128
5129
// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
5130
// __constrange(0,255) int imm)
5131
#if defined(_sse2neon_shuffle)
5132
#define _mm_shufflehi_epi16(a, imm) \
5133
__extension__({ \
5134
int16x8_t _input = vreinterpretq_s16_m128i(a); \
5135
int16x8_t _shuf = \
5136
vshuffleq_s16(_input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \
5137
(((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
5138
(((imm) >> 6) & 0x3) + 4); \
5139
vreinterpretq_m128i_s16(_shuf); \
5140
})
5141
#else // generic
5142
#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
5143
#endif
5144
5145
// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
5146
// __constrange(0,255) int imm)
5147
#if defined(_sse2neon_shuffle)
5148
#define _mm_shufflelo_epi16(a, imm) \
5149
__extension__({ \
5150
int16x8_t _input = vreinterpretq_s16_m128i(a); \
5151
int16x8_t _shuf = vshuffleq_s16( \
5152
_input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \
5153
(((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
5154
vreinterpretq_m128i_s16(_shuf); \
5155
})
5156
#else // generic
5157
#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
5158
#endif
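// Usage sketch (a is illustrative): _mm_shufflelo_epi16(a, _MM_SHUFFLE(0, 1, 2, 3))
// reverses the four low 16-bit lanes and copies the four high lanes through
// unchanged; _mm_shufflehi_epi16 does the same for the high half.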
5159
5160
// Shift packed 16-bit integers in a left by count while shifting in zeros, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16
FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
{
    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
    if (_sse2neon_unlikely(c & ~15))
        return _mm_setzero_si128();

    int16x8_t vc = vdupq_n_s16((int16_t) c);
    return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
}

// Shift packed 32-bit integers in a left by count while shifting in zeros, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32
FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
{
    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
    if (_sse2neon_unlikely(c & ~31))
        return _mm_setzero_si128();

    int32x4_t vc = vdupq_n_s32((int32_t) c);
    return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
}

// Shift packed 64-bit integers in a left by count while shifting in zeros, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64
FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
{
    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
    if (_sse2neon_unlikely(c & ~63))
        return _mm_setzero_si128();

    int64x2_t vc = vdupq_n_s64((int64_t) c);
    return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
}

// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16
FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
{
    if (_sse2neon_unlikely(imm & ~15))
        return _mm_setzero_si128();
    return vreinterpretq_m128i_s16(
        vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm)));
}

// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32
FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
{
    if (_sse2neon_unlikely(imm & ~31))
        return _mm_setzero_si128();
    return vreinterpretq_m128i_s32(
        vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
}

// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64
FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
{
    if (_sse2neon_unlikely(imm & ~63))
        return _mm_setzero_si128();
    return vreinterpretq_m128i_s64(
        vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
}

// Shift a left by imm8 bytes while shifting in zeros, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128
#define _mm_slli_si128(a, imm) \
    _sse2neon_define1( \
        __m128i, a, int8x16_t ret; \
        if (_sse2neon_unlikely(imm == 0)) ret = vreinterpretq_s8_m128i(_a); \
        else if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \
        else ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(_a), \
                            ((imm <= 0 || imm > 15) ? 0 : (16 - imm))); \
        _sse2neon_return(vreinterpretq_m128i_s8(ret));)

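// Illustrative usage sketch (not part of the upstream sse2neon API; the helper
// name is arbitrary and the block is compiled out with #if 0): _mm_slli_si128
// shifts by whole bytes, not bits, so imm == 4 moves each 32-bit lane up one
// slot and zero-fills the lowest lane.
#if 0
static void sse2neon_example_slli_si128(void)
{
    int32_t out[4];
    __m128i v = _mm_set_epi32(4, 3, 2, 1);  // lanes, low to high: {1, 2, 3, 4}
    __m128i r = _mm_slli_si128(v, 4);       // shift left by 4 bytes
    _mm_storeu_si128((__m128i *) out, r);   // out == {0, 1, 2, 3}
}
#endif
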
// Compute the square root of packed double-precision (64-bit) floating-point
5245
// elements in a, and store the results in dst.
5246
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd
5247
FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
5248
{
5249
#if defined(__aarch64__) || defined(_M_ARM64)
5250
return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
5251
#else
5252
double a0 = sqrt(((double *) &a)[0]);
5253
double a1 = sqrt(((double *) &a)[1]);
5254
return _mm_set_pd(a1, a0);
5255
#endif
5256
}
5257
5258
// Compute the square root of the lower double-precision (64-bit) floating-point
5259
// element in b, store the result in the lower element of dst, and copy the
5260
// upper element from a to the upper element of dst.
5261
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd
5262
FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
5263
{
5264
#if defined(__aarch64__) || defined(_M_ARM64)
5265
return _mm_move_sd(a, _mm_sqrt_pd(b));
5266
#else
5267
return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));
5268
#endif
5269
}
5270
5271
// Shift packed 16-bit integers in a right by count while shifting in sign bits,
5272
// and store the results in dst.
5273
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16
5274
FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
5275
{
5276
int64_t c = vgetq_lane_s64(count, 0);
5277
if (_sse2neon_unlikely(c & ~15))
5278
return _mm_cmplt_epi16(a, _mm_setzero_si128());
5279
return vreinterpretq_m128i_s16(
5280
vshlq_s16((int16x8_t) a, vdupq_n_s16((int) -c)));
5281
}
5282
5283
// Shift packed 32-bit integers in a right by count while shifting in sign bits,
5284
// and store the results in dst.
5285
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32
5286
FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
5287
{
5288
int64_t c = vgetq_lane_s64(count, 0);
5289
if (_sse2neon_unlikely(c & ~31))
5290
return _mm_cmplt_epi32(a, _mm_setzero_si128());
5291
return vreinterpretq_m128i_s32(
5292
vshlq_s32((int32x4_t) a, vdupq_n_s32((int) -c)));
5293
}
5294
5295
// Shift packed 16-bit integers in a right by imm8 while shifting in sign
5296
// bits, and store the results in dst.
5297
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16
5298
FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
5299
{
5300
const int count = (imm & ~15) ? 15 : imm;
5301
return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
5302
}
5303
5304
// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
// and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32
// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
#define _mm_srai_epi32(a, imm) \
    _sse2neon_define0( \
        __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) == 0)) { \
            ret = _a; \
        } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) { \
            ret = vreinterpretq_m128i_s32( \
                vshlq_s32(vreinterpretq_s32_m128i(_a), vdupq_n_s32(-(imm)))); \
        } else { \
            ret = vreinterpretq_m128i_s32( \
                vshrq_n_s32(vreinterpretq_s32_m128i(_a), 31)); \
        } _sse2neon_return(ret);)

// Shift packed 16-bit integers in a right by count while shifting in zeros, and
5321
// store the results in dst.
5322
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16
5323
FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
5324
{
5325
uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5326
if (_sse2neon_unlikely(c & ~15))
5327
return _mm_setzero_si128();
5328
5329
int16x8_t vc = vdupq_n_s16(-(int16_t) c);
5330
return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
5331
}
5332
5333
// Shift packed 32-bit integers in a right by count while shifting in zeros, and
5334
// store the results in dst.
5335
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32
5336
FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
5337
{
5338
uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5339
if (_sse2neon_unlikely(c & ~31))
5340
return _mm_setzero_si128();
5341
5342
int32x4_t vc = vdupq_n_s32(-(int32_t) c);
5343
return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
5344
}
5345
5346
// Shift packed 64-bit integers in a right by count while shifting in zeros, and
5347
// store the results in dst.
5348
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64
5349
FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
5350
{
5351
uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5352
if (_sse2neon_unlikely(c & ~63))
5353
return _mm_setzero_si128();
5354
5355
int64x2_t vc = vdupq_n_s64(-(int64_t) c);
5356
return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
5357
}
5358
5359
// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
5360
// store the results in dst.
5361
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16
5362
#define _mm_srli_epi16(a, imm) \
5363
_sse2neon_define0( \
5364
__m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~15)) { \
5365
ret = _mm_setzero_si128(); \
5366
} else { \
5367
ret = vreinterpretq_m128i_u16( \
5368
vshlq_u16(vreinterpretq_u16_m128i(_a), vdupq_n_s16(-(imm)))); \
5369
} _sse2neon_return(ret);)
5370
5371
// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
5372
// store the results in dst.
5373
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32
5374
// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
5375
#define _mm_srli_epi32(a, imm) \
5376
_sse2neon_define0( \
5377
__m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~31)) { \
5378
ret = _mm_setzero_si128(); \
5379
} else { \
5380
ret = vreinterpretq_m128i_u32( \
5381
vshlq_u32(vreinterpretq_u32_m128i(_a), vdupq_n_s32(-(imm)))); \
5382
} _sse2neon_return(ret);)
5383
5384
// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
5385
// store the results in dst.
5386
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64
5387
#define _mm_srli_epi64(a, imm) \
5388
_sse2neon_define0( \
5389
__m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~63)) { \
5390
ret = _mm_setzero_si128(); \
5391
} else { \
5392
ret = vreinterpretq_m128i_u64( \
5393
vshlq_u64(vreinterpretq_u64_m128i(_a), vdupq_n_s64(-(imm)))); \
5394
} _sse2neon_return(ret);)
5395
5396
// Shift a right by imm8 bytes while shifting in zeros, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128
#define _mm_srli_si128(a, imm) \
    _sse2neon_define1( \
        __m128i, a, int8x16_t ret; \
        if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \
        else ret = vextq_s8(vreinterpretq_s8_m128i(_a), vdupq_n_s8(0), \
                            (imm > 15 ? 0 : imm)); \
        _sse2neon_return(vreinterpretq_m128i_s8(ret));)

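// Illustrative usage sketch (not part of the upstream sse2neon API; the helper
// name is arbitrary and the block is compiled out with #if 0): shifting right
// by 8 bytes discards the two low 32-bit lanes and zero-fills the top.
#if 0
static void sse2neon_example_srli_si128(void)
{
    int32_t out[4];
    __m128i v = _mm_set_epi32(4, 3, 2, 1);  // lanes, low to high: {1, 2, 3, 4}
    __m128i r = _mm_srli_si128(v, 8);       // shift right by 8 bytes
    _mm_storeu_si128((__m128i *) out, r);   // out == {3, 4, 0, 0}
}
#endif
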
// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
5408
// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
5409
// or a general-protection exception may be generated.
5410
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd
5411
FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
5412
{
5413
#if defined(__aarch64__) || defined(_M_ARM64)
5414
vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
5415
#else
5416
vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
5417
#endif
5418
}
5419
5420
// Store the lower double-precision (64-bit) floating-point element from a into
5421
// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
5422
// boundary or a general-protection exception may be generated.
5423
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1
5424
FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
5425
{
5426
#if defined(__aarch64__) || defined(_M_ARM64)
5427
float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
5428
vst1q_f64((float64_t *) mem_addr,
5429
vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
5430
#else
5431
float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));
5432
vst1q_f32((float32_t *) mem_addr,
5433
vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));
5434
#endif
5435
}
5436
5437
// Store the lower double-precision (64-bit) floating-point element from a into
5438
// memory. mem_addr does not need to be aligned on any particular boundary.
5439
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd
5440
FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
5441
{
5442
#if defined(__aarch64__) || defined(_M_ARM64)
5443
vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
5444
#else
5445
vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));
5446
#endif
5447
}
5448
5449
// Store 128-bits of integer data from a into memory. mem_addr must be aligned
5450
// on a 16-byte boundary or a general-protection exception may be generated.
5451
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128
5452
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
5453
{
5454
vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
5455
}
5456
5457
// Store the lower double-precision (64-bit) floating-point element from a into
5458
// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
5459
// boundary or a general-protection exception may be generated.
5460
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=9,526,5601&text=_mm_store1_pd
5461
#define _mm_store1_pd _mm_store_pd1
5462
5463
// Store the upper double-precision (64-bit) floating-point element from a into
5464
// memory.
5465
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd
5466
FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
5467
{
5468
#if defined(__aarch64__) || defined(_M_ARM64)
5469
vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
5470
#else
5471
vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
5472
#endif
5473
}
5474
5475
// Store 64-bit integer from the first element of a into memory.
5476
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64
5477
FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
5478
{
5479
vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b)));
5480
}
5481
5482
// Store the lower double-precision (64-bit) floating-point element from a into
5483
// memory.
5484
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd
5485
FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
5486
{
5487
#if defined(__aarch64__) || defined(_M_ARM64)
5488
vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
5489
#else
5490
vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
5491
#endif
5492
}
5493
5494
// Store 2 double-precision (64-bit) floating-point elements from a into memory
5495
// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
5496
// general-protection exception may be generated.
5497
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd
5498
FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
5499
{
5500
float32x4_t f = vreinterpretq_f32_m128d(a);
5501
_mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));
5502
}
5503
5504
// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
5505
// elements) from a into memory. mem_addr does not need to be aligned on any
5506
// particular boundary.
5507
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd
5508
FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
5509
{
5510
_mm_store_pd(mem_addr, a);
5511
}
5512
5513
// Store 128-bits of integer data from a into memory. mem_addr does not need to
5514
// be aligned on any particular boundary.
5515
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128
5516
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
5517
{
5518
vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
5519
}
5520
5521
// Store 32-bit integer from the first element of a into memory. mem_addr does
5522
// not need to be aligned on any particular boundary.
5523
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32
5524
FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
5525
{
5526
vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0);
5527
}
5528
5529
// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
5530
// elements) from a into memory using a non-temporal memory hint. mem_addr must
5531
// be aligned on a 16-byte boundary or a general-protection exception may be
5532
// generated.
5533
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd
5534
FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
5535
{
5536
#if __has_builtin(__builtin_nontemporal_store)
5537
__builtin_nontemporal_store(a, (__m128d *) p);
5538
#elif defined(__aarch64__) || defined(_M_ARM64)
5539
vst1q_f64(p, vreinterpretq_f64_m128d(a));
5540
#else
5541
vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a));
5542
#endif
5543
}
5544
5545
// Store 128-bits of integer data from a into memory using a non-temporal memory
// hint. mem_addr must be aligned on a 16-byte boundary or a general-protection
// exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128
FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
{
#if __has_builtin(__builtin_nontemporal_store)
    __builtin_nontemporal_store(a, p);
#else
    vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
#endif
}

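// Illustrative usage sketch (not part of the upstream sse2neon API; the helper
// name is arbitrary and the block is compiled out with #if 0): the non-temporal
// hint only matters when the builtin is available; on the plain NEON path the
// store is an ordinary vst1q, so correctness is unchanged. The usual pattern is
// streaming writes to a large buffer that will not be re-read soon, followed by
// _mm_sfence.
#if 0
static void sse2neon_example_stream_copy(int32_t *dst, const int32_t *src, int n)
{
    // Assumes n is a multiple of 4 and dst is 16-byte aligned.
    for (int i = 0; i < n; i += 4) {
        __m128i v = _mm_loadu_si128((const __m128i *) (src + i));
        _mm_stream_si128((__m128i *) (dst + i), v);  // hint: avoid cache pollution
    }
    _mm_sfence();  // order the streaming stores before the data is reused
}
#endif
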
// Store 32-bit integer a into memory using a non-temporal hint to minimize
5559
// cache pollution. If the cache line containing address mem_addr is already in
5560
// the cache, the cache will be updated.
5561
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32
5562
FORCE_INLINE void _mm_stream_si32(int *p, int a)
5563
{
5564
vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
5565
}
5566
5567
// Store 64-bit integer a into memory using a non-temporal hint to minimize
5568
// cache pollution. If the cache line containing address mem_addr is already in
5569
// the cache, the cache will be updated.
5570
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si64
5571
FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a)
5572
{
5573
vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a));
5574
}
5575
5576
// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and
5577
// store the results in dst.
5578
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16
5579
FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
5580
{
5581
return vreinterpretq_m128i_s16(
5582
vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5583
}
5584
5585
// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and
5586
// store the results in dst.
5587
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32
5588
FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
5589
{
5590
return vreinterpretq_m128i_s32(
5591
vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5592
}
5593
5594
// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and
5595
// store the results in dst.
5596
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64
5597
FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
5598
{
5599
return vreinterpretq_m128i_s64(
5600
vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
5601
}
5602
5603
// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
5604
// store the results in dst.
5605
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8
5606
FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
5607
{
5608
return vreinterpretq_m128i_s8(
5609
vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5610
}
5611
5612
// Subtract packed double-precision (64-bit) floating-point elements in b from
5613
// packed double-precision (64-bit) floating-point elements in a, and store the
5614
// results in dst.
5615
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd
5616
FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
5617
{
5618
#if defined(__aarch64__) || defined(_M_ARM64)
5619
return vreinterpretq_m128d_f64(
5620
vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
5621
#else
5622
double *da = (double *) &a;
5623
double *db = (double *) &b;
5624
double c[2];
5625
c[0] = da[0] - db[0];
5626
c[1] = da[1] - db[1];
5627
return vld1q_f32((float32_t *) c);
5628
#endif
5629
}
5630
5631
// Subtract the lower double-precision (64-bit) floating-point element in b from
5632
// the lower double-precision (64-bit) floating-point element in a, store the
5633
// result in the lower element of dst, and copy the upper element from a to the
5634
// upper element of dst.
5635
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd
5636
FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
5637
{
5638
return _mm_move_sd(a, _mm_sub_pd(a, b));
5639
}
5640
5641
// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
5642
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_si64
5643
FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
5644
{
5645
return vreinterpret_m64_s64(
5646
vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
5647
}
5648
5649
// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a
5650
// using saturation, and store the results in dst.
5651
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16
5652
FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
5653
{
5654
return vreinterpretq_m128i_s16(
5655
vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5656
}
5657
5658
// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a
5659
// using saturation, and store the results in dst.
5660
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8
5661
FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
5662
{
5663
return vreinterpretq_m128i_s8(
5664
vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5665
}
5666
5667
// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit
5668
// integers in a using saturation, and store the results in dst.
5669
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16
5670
FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
5671
{
5672
return vreinterpretq_m128i_u16(
5673
vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
5674
}
5675
5676
// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit
// integers in a using saturation, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8
FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
}

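// Illustrative usage sketch (not part of the upstream sse2neon API; the helper
// name is arbitrary and the block is compiled out with #if 0): the saturating
// forms clamp at the type's limits instead of wrapping around.
#if 0
static void sse2neon_example_subs_epu8(void)
{
    uint8_t out[16];
    __m128i a = _mm_set1_epi8(10);
    __m128i b = _mm_set1_epi8(20);
    __m128i r = _mm_subs_epu8(a, b);       // 10 - 20 saturates to 0, no wrap-around
    _mm_storeu_si128((__m128i *) out, r);  // every byte of out is 0
}
#endif
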
#define _mm_ucomieq_sd _mm_comieq_sd
#define _mm_ucomige_sd _mm_comige_sd
#define _mm_ucomigt_sd _mm_comigt_sd
#define _mm_ucomile_sd _mm_comile_sd
#define _mm_ucomilt_sd _mm_comilt_sd
#define _mm_ucomineq_sd _mm_comineq_sd

// Return vector of type __m128d with undefined elements.
5693
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd
5694
FORCE_INLINE __m128d _mm_undefined_pd(void)
5695
{
5696
#if defined(__GNUC__) || defined(__clang__)
5697
#pragma GCC diagnostic push
5698
#pragma GCC diagnostic ignored "-Wuninitialized"
5699
#endif
5700
__m128d a;
5701
#if defined(_MSC_VER)
5702
a = _mm_setzero_pd();
5703
#endif
5704
return a;
5705
#if defined(__GNUC__) || defined(__clang__)
5706
#pragma GCC diagnostic pop
5707
#endif
5708
}
5709
5710
// Unpack and interleave 16-bit integers from the high half of a and b, and
5711
// store the results in dst.
5712
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16
5713
FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
5714
{
5715
#if defined(__aarch64__) || defined(_M_ARM64)
5716
return vreinterpretq_m128i_s16(
5717
vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5718
#else
5719
int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
5720
int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
5721
int16x4x2_t result = vzip_s16(a1, b1);
5722
return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
5723
#endif
5724
}
5725
5726
// Unpack and interleave 32-bit integers from the high half of a and b, and
5727
// store the results in dst.
5728
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32
5729
FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
5730
{
5731
#if defined(__aarch64__) || defined(_M_ARM64)
5732
return vreinterpretq_m128i_s32(
5733
vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5734
#else
5735
int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
5736
int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
5737
int32x2x2_t result = vzip_s32(a1, b1);
5738
return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
5739
#endif
5740
}
5741
5742
// Unpack and interleave 64-bit integers from the high half of a and b, and
5743
// store the results in dst.
5744
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64
5745
FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
5746
{
5747
#if defined(__aarch64__) || defined(_M_ARM64)
5748
return vreinterpretq_m128i_s64(
5749
vzip2q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
5750
#else
5751
int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
5752
int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
5753
return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
5754
#endif
5755
}
5756
5757
// Unpack and interleave 8-bit integers from the high half of a and b, and store
5758
// the results in dst.
5759
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8
5760
FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
5761
{
5762
#if defined(__aarch64__) || defined(_M_ARM64)
5763
return vreinterpretq_m128i_s8(
5764
vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5765
#else
5766
int8x8_t a1 =
5767
vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
5768
int8x8_t b1 =
5769
vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
5770
int8x8x2_t result = vzip_s8(a1, b1);
5771
return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
5772
#endif
5773
}
5774
5775
// Unpack and interleave double-precision (64-bit) floating-point elements from
5776
// the high half of a and b, and store the results in dst.
5777
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd
5778
FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
5779
{
5780
#if defined(__aarch64__) || defined(_M_ARM64)
5781
return vreinterpretq_m128d_f64(
5782
vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
5783
#else
5784
return vreinterpretq_m128d_s64(
5785
vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),
5786
vget_high_s64(vreinterpretq_s64_m128d(b))));
5787
#endif
5788
}
5789
5790
// Unpack and interleave 16-bit integers from the low half of a and b, and store
5791
// the results in dst.
5792
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16
5793
FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
5794
{
5795
#if defined(__aarch64__) || defined(_M_ARM64)
5796
return vreinterpretq_m128i_s16(
5797
vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5798
#else
5799
int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
5800
int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
5801
int16x4x2_t result = vzip_s16(a1, b1);
5802
return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
5803
#endif
5804
}
5805
5806
// Unpack and interleave 32-bit integers from the low half of a and b, and store
5807
// the results in dst.
5808
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32
5809
FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
5810
{
5811
#if defined(__aarch64__) || defined(_M_ARM64)
5812
return vreinterpretq_m128i_s32(
5813
vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5814
#else
5815
int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
5816
int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
5817
int32x2x2_t result = vzip_s32(a1, b1);
5818
return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
5819
#endif
5820
}
5821
5822
// Unpack and interleave 64-bit integers from the low half of a and b, and store
5823
// the results in dst.
5824
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64
5825
FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
5826
{
5827
#if defined(__aarch64__) || defined(_M_ARM64)
5828
return vreinterpretq_m128i_s64(
5829
vzip1q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
5830
#else
5831
int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
5832
int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
5833
return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
5834
#endif
5835
}
5836
5837
// Unpack and interleave 8-bit integers from the low half of a and b, and store
5838
// the results in dst.
5839
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8
5840
FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
5841
{
5842
#if defined(__aarch64__) || defined(_M_ARM64)
5843
return vreinterpretq_m128i_s8(
5844
vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5845
#else
5846
int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
5847
int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
5848
int8x8x2_t result = vzip_s8(a1, b1);
5849
return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
5850
#endif
5851
}
5852
5853
// Unpack and interleave double-precision (64-bit) floating-point elements from
// the low half of a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd
FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(
        vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    return vreinterpretq_m128d_s64(
        vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),
                     vget_low_s64(vreinterpretq_s64_m128d(b))));
#endif
}

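// Illustrative usage sketch (not part of the upstream sse2neon API; the helper
// name is arbitrary and the block is compiled out with #if 0): the unpack
// family interleaves lanes alternately from the chosen half of each source,
// shown here with the 32-bit integer variant.
#if 0
static void sse2neon_example_unpacklo_epi32(void)
{
    int32_t out[4];
    __m128i a = _mm_set_epi32(3, 2, 1, 0);  // lanes, low to high: {0, 1, 2, 3}
    __m128i b = _mm_set_epi32(7, 6, 5, 4);  // lanes, low to high: {4, 5, 6, 7}
    __m128i r = _mm_unpacklo_epi32(a, b);   // interleave the low halves
    _mm_storeu_si128((__m128i *) out, r);   // out == {0, 4, 1, 5}
}
#endif
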
// Compute the bitwise XOR of packed double-precision (64-bit) floating-point
5869
// elements in a and b, and store the results in dst.
5870
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd
5871
FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
5872
{
5873
return vreinterpretq_m128d_s64(
5874
veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
5875
}
5876
5877
// Compute the bitwise XOR of 128 bits (representing integer data) in a and b,
5878
// and store the result in dst.
5879
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128
5880
FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
5881
{
5882
return vreinterpretq_m128i_s32(
5883
veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5884
}
5885
5886
/* SSE3 */
5887
5888
// Alternatively add and subtract packed double-precision (64-bit)
5889
// floating-point elements in a to/from packed elements in b, and store the
5890
// results in dst.
5891
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd
5892
FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
5893
{
5894
_sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f);
5895
#if defined(__aarch64__) || defined(_M_ARM64)
5896
return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
5897
vreinterpretq_f64_m128d(b),
5898
vreinterpretq_f64_m128d(mask)));
5899
#else
5900
return _mm_add_pd(_mm_mul_pd(b, mask), a);
5901
#endif
5902
}
5903
5904
// Alternatively add and subtract packed single-precision (32-bit)
// floating-point elements in a to/from packed elements in b, and store the
// results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=addsub_ps
FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
{
    _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);
#if (defined(__aarch64__) || defined(_M_ARM64)) || \
    defined(__ARM_FEATURE_FMA) /* VFPv4+ */
    return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
                                            vreinterpretq_f32_m128(mask),
                                            vreinterpretq_f32_m128(b)));
#else
    return _mm_add_ps(_mm_mul_ps(b, mask), a);
#endif
}

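// Illustrative usage sketch (not part of the upstream sse2neon API; the helper
// name is arbitrary and the block is compiled out with #if 0): even lanes are
// subtracted, odd lanes are added, which is why the mask above is
// {-1, +1, -1, +1}.
#if 0
static void sse2neon_example_addsub_ps(void)
{
    float out[4];
    __m128 a = _mm_setr_ps(10.0f, 10.0f, 10.0f, 10.0f);
    __m128 b = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 r = _mm_addsub_ps(a, b);  // lanes: a0-b0, a1+b1, a2-b2, a3+b3
    _mm_storeu_ps(out, r);           // out == {9.0f, 12.0f, 7.0f, 14.0f}
}
#endif
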
// Horizontally add adjacent pairs of double-precision (64-bit) floating-point
5922
// elements in a and b, and pack the results in dst.
5923
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd
5924
FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
5925
{
5926
#if defined(__aarch64__) || defined(_M_ARM64)
5927
return vreinterpretq_m128d_f64(
5928
vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
5929
#else
5930
double *da = (double *) &a;
5931
double *db = (double *) &b;
5932
double c[] = {da[0] + da[1], db[0] + db[1]};
5933
return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
5934
#endif
5935
}
5936
5937
// Horizontally add adjacent pairs of single-precision (32-bit) floating-point
// elements in a and b, and pack the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps
FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128_f32(
        vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#else
    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(
        vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
#endif
}

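// Illustrative usage sketch (not part of the upstream sse2neon API; the helper
// name is arbitrary and the block is compiled out with #if 0): the low two
// output lanes are pairwise sums of a, the high two are pairwise sums of b.
#if 0
static void sse2neon_example_hadd_ps(void)
{
    float out[4];
    __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 b = _mm_setr_ps(10.0f, 20.0f, 30.0f, 40.0f);
    __m128 r = _mm_hadd_ps(a, b);  // {a0+a1, a2+a3, b0+b1, b2+b3}
    _mm_storeu_ps(out, r);         // out == {3.0f, 7.0f, 30.0f, 70.0f}
}
#endif
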
// Horizontally subtract adjacent pairs of double-precision (64-bit)
5956
// floating-point elements in a and b, and pack the results in dst.
5957
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd
5958
FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
5959
{
5960
#if defined(__aarch64__) || defined(_M_ARM64)
5961
float64x2_t a = vreinterpretq_f64_m128d(_a);
5962
float64x2_t b = vreinterpretq_f64_m128d(_b);
5963
return vreinterpretq_m128d_f64(
5964
vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b)));
5965
#else
5966
double *da = (double *) &_a;
5967
double *db = (double *) &_b;
5968
double c[] = {da[0] - da[1], db[0] - db[1]};
5969
return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
5970
#endif
5971
}
5972
5973
// Horizontally subtract adjacent pairs of single-precision (32-bit)
5974
// floating-point elements in a and b, and pack the results in dst.
5975
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps
5976
FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
5977
{
5978
float32x4_t a = vreinterpretq_f32_m128(_a);
5979
float32x4_t b = vreinterpretq_f32_m128(_b);
5980
#if defined(__aarch64__) || defined(_M_ARM64)
5981
return vreinterpretq_m128_f32(
5982
vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));
5983
#else
5984
float32x4x2_t c = vuzpq_f32(a, b);
5985
return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
5986
#endif
5987
}
5988
5989
// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
5990
// may perform better than _mm_loadu_si128 when the data crosses a cache line
5991
// boundary.
5992
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128
5993
#define _mm_lddqu_si128 _mm_loadu_si128
5994
5995
// Load a double-precision (64-bit) floating-point element from memory into both
5996
// elements of dst.
5997
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd
5998
#define _mm_loaddup_pd _mm_load1_pd
5999
6000
// Duplicate the low double-precision (64-bit) floating-point element from a,
6001
// and store the results in dst.
6002
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd
6003
FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
6004
{
6005
#if defined(__aarch64__) || defined(_M_ARM64)
6006
return vreinterpretq_m128d_f64(
6007
vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
6008
#else
6009
return vreinterpretq_m128d_u64(
6010
vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));
6011
#endif
6012
}
6013
6014
// Duplicate odd-indexed single-precision (32-bit) floating-point elements
6015
// from a, and store the results in dst.
6016
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps
6017
FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
6018
{
6019
#if defined(__aarch64__) || defined(_M_ARM64)
6020
return vreinterpretq_m128_f32(
6021
vtrn2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
6022
#elif defined(_sse2neon_shuffle)
6023
return vreinterpretq_m128_f32(vshuffleq_s32(
6024
vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
6025
#else
6026
float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
6027
float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
6028
float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
6029
return vreinterpretq_m128_f32(vld1q_f32(data));
6030
#endif
6031
}
6032
6033
// Duplicate even-indexed single-precision (32-bit) floating-point elements
6034
// from a, and store the results in dst.
6035
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps
6036
FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
6037
{
6038
#if defined(__aarch64__) || defined(_M_ARM64)
6039
return vreinterpretq_m128_f32(
6040
vtrn1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
6041
#elif defined(_sse2neon_shuffle)
6042
return vreinterpretq_m128_f32(vshuffleq_s32(
6043
vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
6044
#else
6045
float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
6046
float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
6047
float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
6048
return vreinterpretq_m128_f32(vld1q_f32(data));
6049
#endif
6050
}
6051
6052
/* SSSE3 */
6053
6054
// Compute the absolute value of packed signed 16-bit integers in a, and store
6055
// the unsigned results in dst.
6056
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16
6057
FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
6058
{
6059
return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
6060
}
6061
6062
// Compute the absolute value of packed signed 32-bit integers in a, and store
6063
// the unsigned results in dst.
6064
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32
6065
FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
6066
{
6067
return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
6068
}
6069
6070
// Compute the absolute value of packed signed 8-bit integers in a, and store
6071
// the unsigned results in dst.
6072
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8
6073
FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
6074
{
6075
return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
6076
}
6077
6078
// Compute the absolute value of packed signed 16-bit integers in a, and store
6079
// the unsigned results in dst.
6080
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi16
6081
FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
6082
{
6083
return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
6084
}
6085
6086
// Compute the absolute value of packed signed 32-bit integers in a, and store
6087
// the unsigned results in dst.
6088
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi32
6089
FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
6090
{
6091
return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
6092
}
6093
6094
// Compute the absolute value of packed signed 8-bit integers in a, and store
6095
// the unsigned results in dst.
6096
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi8
6097
FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
6098
{
6099
return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
6100
}
6101
6102
// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift
// the result right by imm8 bytes, and store the low 16 bytes in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8
#if defined(__GNUC__) && !defined(__clang__)
#define _mm_alignr_epi8(a, b, imm) \
    __extension__({ \
        uint8x16_t _a = vreinterpretq_u8_m128i(a); \
        uint8x16_t _b = vreinterpretq_u8_m128i(b); \
        __m128i ret; \
        if (_sse2neon_unlikely((imm) & ~31)) \
            ret = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \
        else if (imm >= 16) \
            ret = _mm_srli_si128(a, imm >= 16 ? imm - 16 : 0); \
        else \
            ret = \
                vreinterpretq_m128i_u8(vextq_u8(_b, _a, imm < 16 ? imm : 0)); \
        ret; \
    })

#else
#define _mm_alignr_epi8(a, b, imm) \
    _sse2neon_define2( \
        __m128i, a, b, uint8x16_t __a = vreinterpretq_u8_m128i(_a); \
        uint8x16_t __b = vreinterpretq_u8_m128i(_b); __m128i ret; \
        if (_sse2neon_unlikely((imm) & ~31)) ret = \
            vreinterpretq_m128i_u8(vdupq_n_u8(0)); \
        else if (imm >= 16) ret = \
            _mm_srli_si128(_a, imm >= 16 ? imm - 16 : 0); \
        else ret = \
            vreinterpretq_m128i_u8(vextq_u8(__b, __a, imm < 16 ? imm : 0)); \
        _sse2neon_return(ret);)

#endif

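// Illustrative usage sketch (not part of the upstream sse2neon API; the helper
// name is arbitrary and the block is compiled out with #if 0): b forms the low
// 16 bytes of the 32-byte concatenation and a the high 16 bytes, so imm == 4
// drops the low 4 bytes of b and pulls in the low 4 bytes of a at the top.
#if 0
static void sse2neon_example_alignr_epi8(void)
{
    int32_t out[4];
    __m128i a = _mm_set_epi32(7, 6, 5, 4);  // high block, lanes: {4, 5, 6, 7}
    __m128i b = _mm_set_epi32(3, 2, 1, 0);  // low block, lanes:  {0, 1, 2, 3}
    __m128i r = _mm_alignr_epi8(a, b, 4);   // shift the 32-byte pair right by 4 bytes
    _mm_storeu_si128((__m128i *) out, r);   // out == {1, 2, 3, 4}
}
#endif
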
// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
6137
// the result right by imm8 bytes, and store the low 8 bytes in dst.
6138
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_pi8
6139
#define _mm_alignr_pi8(a, b, imm) \
6140
_sse2neon_define2( \
6141
__m64, a, b, __m64 ret; if (_sse2neon_unlikely((imm) >= 16)) { \
6142
ret = vreinterpret_m64_s8(vdup_n_s8(0)); \
6143
} else { \
6144
uint8x8_t tmp_low; \
6145
uint8x8_t tmp_high; \
6146
if ((imm) >= 8) { \
6147
const int idx = (imm) -8; \
6148
tmp_low = vreinterpret_u8_m64(_a); \
6149
tmp_high = vdup_n_u8(0); \
6150
ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
6151
} else { \
6152
const int idx = (imm); \
6153
tmp_low = vreinterpret_u8_m64(_b); \
6154
tmp_high = vreinterpret_u8_m64(_a); \
6155
ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
6156
} \
6157
} _sse2neon_return(ret);)
6158
6159
// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
6160
// signed 16-bit results in dst.
6161
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16
6162
FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
6163
{
6164
int16x8_t a = vreinterpretq_s16_m128i(_a);
6165
int16x8_t b = vreinterpretq_s16_m128i(_b);
6166
#if defined(__aarch64__) || defined(_M_ARM64)
6167
return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
6168
#else
6169
return vreinterpretq_m128i_s16(
6170
vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
6171
vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
6172
#endif
6173
}
6174
6175
// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
6176
// signed 32-bit results in dst.
6177
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32
6178
FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
6179
{
6180
int32x4_t a = vreinterpretq_s32_m128i(_a);
6181
int32x4_t b = vreinterpretq_s32_m128i(_b);
6182
#if defined(__aarch64__) || defined(_M_ARM64)
6183
return vreinterpretq_m128i_s32(vpaddq_s32(a, b));
6184
#else
6185
return vreinterpretq_m128i_s32(
6186
vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
6187
vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
6188
#endif
6189
}
6190
6191
// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
6192
// signed 16-bit results in dst.
6193
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi16
6194
FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
6195
{
6196
return vreinterpret_m64_s16(
6197
vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
6198
}
6199
6200
// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
6201
// signed 32-bit results in dst.
6202
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi32
6203
FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
6204
{
6205
return vreinterpret_m64_s32(
6206
vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
6207
}
6208
6209
// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
6210
// saturation, and pack the signed 16-bit results in dst.
6211
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16
6212
FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
6213
{
6214
#if defined(__aarch64__) || defined(_M_ARM64)
6215
int16x8_t a = vreinterpretq_s16_m128i(_a);
6216
int16x8_t b = vreinterpretq_s16_m128i(_b);
6217
return vreinterpretq_s64_s16(
6218
vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6219
#else
6220
int32x4_t a = vreinterpretq_s32_m128i(_a);
6221
int32x4_t b = vreinterpretq_s32_m128i(_b);
6222
// Interleave using vshrn/vmovn
6223
// [a0|a2|a4|a6|b0|b2|b4|b6]
6224
// [a1|a3|a5|a7|b1|b3|b5|b7]
6225
int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
6226
int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
6227
// Saturated add
6228
return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
6229
#endif
6230
}
6231
6232
// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
6233
// saturation, and pack the signed 16-bit results in dst.
6234
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_pi16
6235
FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
6236
{
6237
int16x4_t a = vreinterpret_s16_m64(_a);
6238
int16x4_t b = vreinterpret_s16_m64(_b);
6239
#if defined(__aarch64__) || defined(_M_ARM64)
6240
return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6241
#else
6242
int16x4x2_t res = vuzp_s16(a, b);
6243
return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));
6244
#endif
6245
}
6246
6247
// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
6248
// the signed 16-bit results in dst.
6249
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16
6250
FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
6251
{
6252
int16x8_t a = vreinterpretq_s16_m128i(_a);
6253
int16x8_t b = vreinterpretq_s16_m128i(_b);
6254
#if defined(__aarch64__) || defined(_M_ARM64)
6255
return vreinterpretq_m128i_s16(
6256
vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6257
#else
6258
int16x8x2_t c = vuzpq_s16(a, b);
6259
return vreinterpretq_m128i_s16(vsubq_s16(c.val[0], c.val[1]));
6260
#endif
6261
}
6262
6263
// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
6264
// the signed 32-bit results in dst.
6265
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32
6266
FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
6267
{
6268
int32x4_t a = vreinterpretq_s32_m128i(_a);
6269
int32x4_t b = vreinterpretq_s32_m128i(_b);
6270
#if defined(__aarch64__) || defined(_M_ARM64)
6271
return vreinterpretq_m128i_s32(
6272
vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b)));
6273
#else
6274
int32x4x2_t c = vuzpq_s32(a, b);
6275
return vreinterpretq_m128i_s32(vsubq_s32(c.val[0], c.val[1]));
6276
#endif
6277
}
6278
6279
// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
6280
// the signed 16-bit results in dst.
6281
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pi16
6282
FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
6283
{
6284
int16x4_t a = vreinterpret_s16_m64(_a);
6285
int16x4_t b = vreinterpret_s16_m64(_b);
6286
#if defined(__aarch64__) || defined(_M_ARM64)
6287
return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6288
#else
6289
int16x4x2_t c = vuzp_s16(a, b);
6290
return vreinterpret_m64_s16(vsub_s16(c.val[0], c.val[1]));
6291
#endif
6292
}
6293
6294
// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
6295
// the signed 32-bit results in dst.
6296
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_hsub_pi32
6297
FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
6298
{
6299
int32x2_t a = vreinterpret_s32_m64(_a);
6300
int32x2_t b = vreinterpret_s32_m64(_b);
6301
#if defined(__aarch64__) || defined(_M_ARM64)
6302
return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b)));
6303
#else
6304
int32x2x2_t c = vuzp_s32(a, b);
6305
return vreinterpret_m64_s32(vsub_s32(c.val[0], c.val[1]));
6306
#endif
6307
}
6308
6309
// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
6310
// using saturation, and pack the signed 16-bit results in dst.
6311
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16
6312
FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
6313
{
6314
int16x8_t a = vreinterpretq_s16_m128i(_a);
6315
int16x8_t b = vreinterpretq_s16_m128i(_b);
6316
#if defined(__aarch64__) || defined(_M_ARM64)
6317
return vreinterpretq_m128i_s16(
6318
vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6319
#else
6320
int16x8x2_t c = vuzpq_s16(a, b);
6321
return vreinterpretq_m128i_s16(vqsubq_s16(c.val[0], c.val[1]));
6322
#endif
6323
}
6324
6325
// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
6326
// using saturation, and pack the signed 16-bit results in dst.
6327
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_pi16
6328
FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
6329
{
6330
int16x4_t a = vreinterpret_s16_m64(_a);
6331
int16x4_t b = vreinterpret_s16_m64(_b);
6332
#if defined(__aarch64__) || defined(_M_ARM64)
6333
return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6334
#else
6335
int16x4x2_t c = vuzp_s16(a, b);
6336
return vreinterpret_m64_s16(vqsub_s16(c.val[0], c.val[1]));
6337
#endif
6338
}
6339
6340
// Vertically multiply each unsigned 8-bit integer from a with the corresponding
// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
// and pack the saturated results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16
FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    uint8x16_t a = vreinterpretq_u8_m128i(_a);
    int8x16_t b = vreinterpretq_s8_m128i(_b);
    int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
                             vmovl_s8(vget_low_s8(b)));
    int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
                             vmovl_s8(vget_high_s8(b)));
    return vreinterpretq_m128i_s16(
        vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
#else
    // This would be much simpler if x86 would choose to zero extend OR sign
    // extend, not both. This could probably be optimized better.
    uint16x8_t a = vreinterpretq_u16_m128i(_a);
    int16x8_t b = vreinterpretq_s16_m128i(_b);

    // Zero extend a
    int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
    int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));

    // Sign extend by shifting left then shifting right.
    int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
    int16x8_t b_odd = vshrq_n_s16(b, 8);

    // multiply
    int16x8_t prod1 = vmulq_s16(a_even, b_even);
    int16x8_t prod2 = vmulq_s16(a_odd, b_odd);

    // saturated add
    return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
#endif
}

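// Illustrative usage sketch (not part of the upstream sse2neon API; the helper
// name is arbitrary and the block is compiled out with #if 0): the first
// operand is read as unsigned bytes, the second as signed bytes, and each pair
// of adjacent products is added with signed saturation into a 16-bit lane.
#if 0
static void sse2neon_example_maddubs_epi16(void)
{
    int16_t out[8];
    __m128i a = _mm_set1_epi8(2);          // treated as unsigned 8-bit values
    __m128i b = _mm_set1_epi8(-3);         // treated as signed 8-bit values
    __m128i r = _mm_maddubs_epi16(a, b);   // each pair: 2*(-3) + 2*(-3) = -12
    _mm_storeu_si128((__m128i *) out, r);  // every 16-bit lane of out is -12
}
#endif
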
// Vertically multiply each unsigned 8-bit integer from a with the corresponding
6380
// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
6381
// Horizontally add adjacent pairs of intermediate signed 16-bit integers, and
6382
// pack the saturated results in dst.
6383
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_pi16
6384
FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
6385
{
6386
uint16x4_t a = vreinterpret_u16_m64(_a);
6387
int16x4_t b = vreinterpret_s16_m64(_b);
6388
6389
// Zero extend a
6390
int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8));
6391
int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff)));
6392
6393
// Sign extend by shifting left then shifting right.
6394
int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8);
6395
int16x4_t b_odd = vshr_n_s16(b, 8);
6396
6397
// multiply
6398
int16x4_t prod1 = vmul_s16(a_even, b_even);
6399
int16x4_t prod2 = vmul_s16(a_odd, b_odd);
6400
6401
// saturated add
6402
return vreinterpret_m64_s16(vqadd_s16(prod1, prod2));
6403
}
6404
6405
// Multiply packed signed 16-bit integers in a and b, producing intermediate
// signed 32-bit integers. Shift right by 15 bits while rounding up, and store
// the packed 16-bit integers in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16
FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
{
    // Has issues due to saturation
    // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));

    // Multiply
    int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
                                 vget_low_s16(vreinterpretq_s16_m128i(b)));
    int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
                                 vget_high_s16(vreinterpretq_s16_m128i(b)));

    // Rounding narrowing shift right
    // narrow = (int16_t)((mul + 16384) >> 15);
    int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
    int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);

    // Join together
    return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
}

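// Worked example (illustrative only; the helper name is arbitrary and the block
// is compiled out with #if 0): each lane computes
// (int16_t)(((int32_t) a * b + 0x4000) >> 15), i.e. a Q15 fixed-point multiply
// with round-to-nearest.
#if 0
static void sse2neon_example_mulhrs_epi16(void)
{
    int16_t out[8];
    __m128i a = _mm_set1_epi16(0x4000);    // 0.5 in Q15
    __m128i b = _mm_set1_epi16(0x2000);    // 0.25 in Q15
    __m128i r = _mm_mulhrs_epi16(a, b);    // (0x4000 * 0x2000 + 0x4000) >> 15
    _mm_storeu_si128((__m128i *) out, r);  // every lane is 0x1000 (0.125 in Q15)
}
#endif
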
// Multiply packed signed 16-bit integers in a and b, producing intermediate
6430
// signed 32-bit integers. Truncate each intermediate integer to the 18 most
6431
// significant bits, round by adding 1, and store bits [16:1] to dst.
6432
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_pi16
6433
FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
6434
{
6435
int32x4_t mul_extend =
6436
vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b)));
6437
6438
// Rounding narrowing shift right
6439
return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15));
6440
}
6441
6442
// Shuffle packed 8-bit integers in a according to shuffle control mask in the
// corresponding 8-bit element of b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8
FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
{
    int8x16_t tbl = vreinterpretq_s8_m128i(a);   // input a
    uint8x16_t idx = vreinterpretq_u8_m128i(b);  // input b
    uint8x16_t idx_masked =
        vandq_u8(idx, vdupq_n_u8(0x8F));  // avoid using meaningless bits
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
#elif defined(__GNUC__)
    int8x16_t ret;
    // %e and %f represent the even and odd D registers
    // respectively.
    __asm__ __volatile__(
        "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
        "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
        : [ret] "=&w"(ret)
        : [tbl] "w"(tbl), [idx] "w"(idx_masked));
    return vreinterpretq_m128i_s8(ret);
#else
    // Generic NEON fallback (32-bit Arm without GNU-style inline assembly):
    // split the table into two D registers and look up with vtbl2.
    int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
    return vreinterpretq_m128i_s8(
        vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
                    vtbl2_s8(a_split, vget_high_u8(idx_masked))));
#endif
}
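
// Usage sketch (illustrative only): _mm_shuffle_epi8 acts as a 16-entry byte
// table lookup, so reversing the byte order of a vector can be written as
// follows, assuming the _mm_setr_epi8 helper defined earlier in this header:
//
//   __m128i bytes = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
//                                 8, 9, 10, 11, 12, 13, 14, 15);
//   __m128i rev   = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8,
//                                 7, 6, 5, 4, 3, 2, 1, 0);
//   __m128i out   = _mm_shuffle_epi8(bytes, rev);  // bytes in reverse order
//
// An index byte with its most significant bit set zeroes the corresponding
// output byte, matching the SSSE3 behavior emulated above.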
6471
6472
// Shuffle packed 8-bit integers in a according to shuffle control mask in the
6473
// corresponding 8-bit element of b, and store the results in dst.
6474
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi8
6475
FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
6476
{
6477
const int8x8_t controlMask =
6478
vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t) (0x1 << 7 | 0x07)));
6479
int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask);
6480
return vreinterpret_m64_s8(res);
6481
}
6482
6483
// Negate packed 16-bit integers in a when the corresponding signed
// 16-bit integer in b is negative, and store the results in dst.
// Elements in dst are zeroed out when the corresponding element
// in b is zero.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16
6488
FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
6489
{
6490
int16x8_t a = vreinterpretq_s16_m128i(_a);
6491
int16x8_t b = vreinterpretq_s16_m128i(_b);
6492
6493
// signed shift right: faster than vclt
6494
// (b < 0) ? 0xFFFF : 0
6495
uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
6496
// (b == 0) ? 0xFFFF : 0
6497
#if defined(__aarch64__) || defined(_M_ARM64)
6498
int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
6499
#else
6500
int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
6501
#endif
6502
6503
    // bitwise select either a or negative 'a' (vnegq_s16(a) equals negative
    // 'a') based on ltMask
6505
int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
6506
// res = masked & (~zeroMask)
6507
int16x8_t res = vbicq_s16(masked, zeroMask);
6508
return vreinterpretq_m128i_s16(res);
6509
}
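
// Usage sketch (illustrative only): each lane of a is copied, negated, or
// zeroed depending on the sign of the matching lane in b. For example,
// assuming the _mm_setr_epi16 helper defined earlier in this header:
//
//   __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
//   __m128i b = _mm_setr_epi16(-1, 0, 2, -3, 5, 0, -7, 9);
//   __m128i r = _mm_sign_epi16(a, b);  // r = {-1, 0, 3, -4, 5, 0, -7, 8}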
6510
6511
// Negate packed 32-bit integers in a when the corresponding signed
// 32-bit integer in b is negative, and store the results in dst.
// Elements in dst are zeroed out when the corresponding element
// in b is zero.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32
6516
FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
6517
{
6518
int32x4_t a = vreinterpretq_s32_m128i(_a);
6519
int32x4_t b = vreinterpretq_s32_m128i(_b);
6520
6521
// signed shift right: faster than vclt
6522
// (b < 0) ? 0xFFFFFFFF : 0
6523
uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
6524
6525
// (b == 0) ? 0xFFFFFFFF : 0
6526
#if defined(__aarch64__) || defined(_M_ARM64)
6527
int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
6528
#else
6529
int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
6530
#endif
6531
6532
    // bitwise select either a or negative 'a' (vnegq_s32(a) equals negative
    // 'a') based on ltMask
6534
int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
6535
// res = masked & (~zeroMask)
6536
int32x4_t res = vbicq_s32(masked, zeroMask);
6537
return vreinterpretq_m128i_s32(res);
6538
}
6539
6540
// Negate packed 8-bit integers in a when the corresponding signed
// 8-bit integer in b is negative, and store the results in dst.
// Elements in dst are zeroed out when the corresponding element
// in b is zero.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8
6545
FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
6546
{
6547
int8x16_t a = vreinterpretq_s8_m128i(_a);
6548
int8x16_t b = vreinterpretq_s8_m128i(_b);
6549
6550
// signed shift right: faster than vclt
6551
// (b < 0) ? 0xFF : 0
6552
uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
6553
6554
// (b == 0) ? 0xFF : 0
6555
#if defined(__aarch64__) || defined(_M_ARM64)
6556
int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
6557
#else
6558
int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
6559
#endif
6560
6561
    // bitwise select either a or negative 'a' (vnegq_s8(a) returns negative
    // 'a') based on ltMask
6563
int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
6564
// res = masked & (~zeroMask)
6565
int8x16_t res = vbicq_s8(masked, zeroMask);
6566
6567
return vreinterpretq_m128i_s8(res);
6568
}
6569
6570
// Negate packed 16-bit integers in a when the corresponding signed 16-bit
// integer in b is negative, and store the results in dst. Elements in dst are
// zeroed out when the corresponding element in b is zero.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi16
6574
FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
6575
{
6576
int16x4_t a = vreinterpret_s16_m64(_a);
6577
int16x4_t b = vreinterpret_s16_m64(_b);
6578
6579
// signed shift right: faster than vclt
6580
// (b < 0) ? 0xFFFF : 0
6581
uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
6582
6583
// (b == 0) ? 0xFFFF : 0
6584
#if defined(__aarch64__) || defined(_M_ARM64)
6585
int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
6586
#else
6587
int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
6588
#endif
6589
6590
    // bitwise select either a or negative 'a' (vneg_s16(a) returns negative
    // 'a') based on ltMask
6592
int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
6593
// res = masked & (~zeroMask)
6594
int16x4_t res = vbic_s16(masked, zeroMask);
6595
6596
return vreinterpret_m64_s16(res);
6597
}
6598
6599
// Negate packed 32-bit integers in a when the corresponding signed 32-bit
// integer in b is negative, and store the results in dst. Elements in dst are
// zeroed out when the corresponding element in b is zero.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi32
6603
FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
6604
{
6605
int32x2_t a = vreinterpret_s32_m64(_a);
6606
int32x2_t b = vreinterpret_s32_m64(_b);
6607
6608
// signed shift right: faster than vclt
6609
// (b < 0) ? 0xFFFFFFFF : 0
6610
uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
6611
6612
// (b == 0) ? 0xFFFFFFFF : 0
6613
#if defined(__aarch64__) || defined(_M_ARM64)
6614
int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
6615
#else
6616
int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
6617
#endif
6618
6619
    // bitwise select either a or negative 'a' (vneg_s32(a) returns negative
    // 'a') based on ltMask
6621
int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
6622
// res = masked & (~zeroMask)
6623
int32x2_t res = vbic_s32(masked, zeroMask);
6624
6625
return vreinterpret_m64_s32(res);
6626
}
6627
6628
// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
// in b is negative, and store the results in dst. Elements in dst are zeroed
// out when the corresponding element in b is zero.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi8
6632
FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
6633
{
6634
int8x8_t a = vreinterpret_s8_m64(_a);
6635
int8x8_t b = vreinterpret_s8_m64(_b);
6636
6637
// signed shift right: faster than vclt
6638
// (b < 0) ? 0xFF : 0
6639
uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
6640
6641
// (b == 0) ? 0xFF : 0
6642
#if defined(__aarch64__) || defined(_M_ARM64)
6643
int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
6644
#else
6645
int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
6646
#endif
6647
6648
    // bitwise select either a or negative 'a' (vneg_s8(a) returns negative
    // 'a') based on ltMask
6650
int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
6651
// res = masked & (~zeroMask)
6652
int8x8_t res = vbic_s8(masked, zeroMask);
6653
6654
return vreinterpret_m64_s8(res);
6655
}
6656
6657
/* SSE4.1 */
6658
6659
// Blend packed 16-bit integers from a and b using control mask imm8, and store
6660
// the results in dst.
6661
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16
6662
// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
6663
// __constrange(0,255) int imm)
6664
#define _mm_blend_epi16(a, b, imm) \
6665
_sse2neon_define2( \
6666
__m128i, a, b, \
6667
const uint16_t _mask[8] = \
6668
_sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0, \
6669
((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0, \
6670
((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0, \
6671
((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0, \
6672
((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0, \
6673
((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0, \
6674
((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0, \
6675
((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0); \
6676
uint16x8_t _mask_vec = vld1q_u16(_mask); \
6677
uint16x8_t __a = vreinterpretq_u16_m128i(_a); \
6678
uint16x8_t __b = vreinterpretq_u16_m128i(_b); _sse2neon_return( \
6679
vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, __b, __a)));)
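
// Usage sketch (illustrative only): bit i of the immediate selects 16-bit lane
// i from b, with the remaining lanes taken from a. For example, assuming the
// _mm_set1_epi16 helper defined earlier in this header:
//
//   __m128i a = _mm_set1_epi16(0);
//   __m128i b = _mm_set1_epi16(-1);
//   __m128i r = _mm_blend_epi16(a, b, 0x0F);
//   // lanes 0-3 of r come from b (-1), lanes 4-7 from a (0)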
6680
6681
// Blend packed double-precision (64-bit) floating-point elements from a and b
6682
// using control mask imm8, and store the results in dst.
6683
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd
6684
#define _mm_blend_pd(a, b, imm) \
6685
_sse2neon_define2( \
6686
__m128d, a, b, \
6687
const uint64_t _mask[2] = \
6688
_sse2neon_init(((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \
6689
((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)); \
6690
uint64x2_t _mask_vec = vld1q_u64(_mask); \
6691
uint64x2_t __a = vreinterpretq_u64_m128d(_a); \
6692
uint64x2_t __b = vreinterpretq_u64_m128d(_b); _sse2neon_return( \
6693
vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, __b, __a)));)
6694
6695
// Blend packed single-precision (32-bit) floating-point elements from a and b
// using control mask imm8, and store the results in dst.
6697
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps
6698
FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
6699
{
6700
const uint32_t ALIGN_STRUCT(16)
6701
data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
6702
((imm8) & (1 << 1)) ? UINT32_MAX : 0,
6703
((imm8) & (1 << 2)) ? UINT32_MAX : 0,
6704
((imm8) & (1 << 3)) ? UINT32_MAX : 0};
6705
uint32x4_t mask = vld1q_u32(data);
6706
float32x4_t a = vreinterpretq_f32_m128(_a);
6707
float32x4_t b = vreinterpretq_f32_m128(_b);
6708
return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
6709
}
6710
6711
// Blend packed 8-bit integers from a and b using mask, and store the results in
6712
// dst.
6713
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8
6714
FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
6715
{
6716
// Use a signed shift right to create a mask with the sign bit
6717
uint8x16_t mask =
6718
vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
6719
uint8x16_t a = vreinterpretq_u8_m128i(_a);
6720
uint8x16_t b = vreinterpretq_u8_m128i(_b);
6721
return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
6722
}
6723
6724
// Blend packed double-precision (64-bit) floating-point elements from a and b
6725
// using mask, and store the results in dst.
6726
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd
6727
FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
6728
{
6729
uint64x2_t mask =
6730
vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
6731
#if defined(__aarch64__) || defined(_M_ARM64)
6732
float64x2_t a = vreinterpretq_f64_m128d(_a);
6733
float64x2_t b = vreinterpretq_f64_m128d(_b);
6734
return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
6735
#else
6736
uint64x2_t a = vreinterpretq_u64_m128d(_a);
6737
uint64x2_t b = vreinterpretq_u64_m128d(_b);
6738
return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a));
6739
#endif
6740
}
6741
6742
// Blend packed single-precision (32-bit) floating-point elements from a and b
6743
// using mask, and store the results in dst.
6744
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps
6745
FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
6746
{
6747
// Use a signed shift right to create a mask with the sign bit
6748
uint32x4_t mask =
6749
vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));
6750
float32x4_t a = vreinterpretq_f32_m128(_a);
6751
float32x4_t b = vreinterpretq_f32_m128(_b);
6752
return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
6753
}
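
// Usage sketch (illustrative only): paired with a comparison, _mm_blendv_ps is
// a branchless per-lane select. For example, clamping negative lanes to zero,
// assuming the SSE helpers defined earlier in this header:
//
//   __m128 x    = _mm_setr_ps(-1.0f, 2.0f, -3.0f, 4.0f);
//   __m128 zero = _mm_setzero_ps();
//   __m128 mask = _mm_cmplt_ps(x, zero);         // all-ones where x < 0
//   __m128 r    = _mm_blendv_ps(x, zero, mask);  // r = {0, 2, 0, 4}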
6754
6755
// Round the packed double-precision (64-bit) floating-point elements in a up
6756
// to an integer value, and store the results as packed double-precision
6757
// floating-point elements in dst.
6758
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd
6759
FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
6760
{
6761
#if defined(__aarch64__) || defined(_M_ARM64)
6762
return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
6763
#else
6764
double *f = (double *) &a;
6765
return _mm_set_pd(ceil(f[1]), ceil(f[0]));
6766
#endif
6767
}
6768
6769
// Round the packed single-precision (32-bit) floating-point elements in a up to
6770
// an integer value, and store the results as packed single-precision
6771
// floating-point elements in dst.
6772
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps
6773
FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
6774
{
6775
#if (defined(__aarch64__) || defined(_M_ARM64)) || \
6776
defined(__ARM_FEATURE_DIRECTED_ROUNDING)
6777
return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
6778
#else
6779
float *f = (float *) &a;
6780
return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0]));
6781
#endif
6782
}
6783
6784
// Round the lower double-precision (64-bit) floating-point element in b up to
6785
// an integer value, store the result as a double-precision floating-point
6786
// element in the lower element of dst, and copy the upper element from a to the
6787
// upper element of dst.
6788
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd
6789
FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
6790
{
6791
return _mm_move_sd(a, _mm_ceil_pd(b));
6792
}
6793
6794
// Round the lower single-precision (32-bit) floating-point element in b up to
6795
// an integer value, store the result as a single-precision floating-point
6796
// element in the lower element of dst, and copy the upper 3 packed elements
6797
// from a to the upper elements of dst.
6798
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss
6799
FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
6800
{
6801
return _mm_move_ss(a, _mm_ceil_ps(b));
6802
}
6803
6804
// Compare packed 64-bit integers in a and b for equality, and store the results
6805
// in dst.
6806
FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
6807
{
6808
#if defined(__aarch64__) || defined(_M_ARM64)
6809
return vreinterpretq_m128i_u64(
6810
vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
6811
#else
6812
// ARMv7 lacks vceqq_u64
6813
// (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
6814
uint32x4_t cmp =
6815
vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
6816
uint32x4_t swapped = vrev64q_u32(cmp);
6817
return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
6818
#endif
6819
}
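
// Usage sketch (illustrative only): each 64-bit lane becomes all-ones on
// equality and all-zeros otherwise, so the result is typically consumed by a
// movemask or a blend. For example, assuming the SSE helpers defined earlier
// in this header:
//
//   __m128i a  = _mm_set_epi64x(7, 42);
//   __m128i b  = _mm_set_epi64x(8, 42);
//   __m128i eq = _mm_cmpeq_epi64(a, b);
//   int mask   = _mm_movemask_epi8(eq);  // 0x00FF: only the low lane matches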
6820
6821
// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store
6822
// the results in dst.
6823
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32
6824
FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
6825
{
6826
return vreinterpretq_m128i_s32(
6827
vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
6828
}
6829
6830
// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store
6831
// the results in dst.
6832
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64
6833
FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
6834
{
6835
int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */
6836
int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
6837
int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
6838
return vreinterpretq_m128i_s64(s64x2);
6839
}
6840
6841
// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store
6842
// the results in dst.
6843
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64
6844
FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
6845
{
6846
return vreinterpretq_m128i_s64(
6847
vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
6848
}
6849
6850
// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store
6851
// the results in dst.
6852
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16
6853
FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
6854
{
6855
int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
6856
int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
6857
return vreinterpretq_m128i_s16(s16x8);
6858
}
6859
6860
// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store
6861
// the results in dst.
6862
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32
6863
FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
6864
{
6865
int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
6866
int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
6867
int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
6868
return vreinterpretq_m128i_s32(s32x4);
6869
}
6870
6871
// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit
6872
// integers, and store the results in dst.
6873
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64
6874
FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
6875
{
6876
int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */
6877
int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */
6878
int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
6879
int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
6880
return vreinterpretq_m128i_s64(s64x2);
6881
}
6882
6883
// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers,
6884
// and store the results in dst.
6885
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32
6886
FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
6887
{
6888
return vreinterpretq_m128i_u32(
6889
vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
6890
}
6891
6892
// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers,
6893
// and store the results in dst.
6894
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64
6895
FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
6896
{
6897
uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */
6898
uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
6899
uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
6900
return vreinterpretq_m128i_u64(u64x2);
6901
}
6902
6903
// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers,
6904
// and store the results in dst.
6905
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64
6906
FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
6907
{
6908
return vreinterpretq_m128i_u64(
6909
vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
6910
}
6911
6912
// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers,
6913
// and store the results in dst.
6914
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16
6915
FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
6916
{
6917
uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx HGFE DCBA */
6918
uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */
6919
return vreinterpretq_m128i_u16(u16x8);
6920
}
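
// Usage sketch (illustrative only): the _mm_cvtepu8_*/_mm_cvtepi8_* family
// widens only the low bytes of the source vector. For example, assuming the
// _mm_setr_epi8 helper defined earlier in this header:
//
//   __m128i bytes = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8,
//                                 9, 10, 11, 12, 13, 14, 15, 16);
//   __m128i words = _mm_cvtepu8_epi16(bytes);
//   // 16-bit lanes of words: {1, 2, 3, 4, 5, 6, 7, 8}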
6921
6922
// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers,
6923
// and store the results in dst.
6924
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32
6925
FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
6926
{
6927
uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */
6928
uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
6929
uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
6930
return vreinterpretq_m128i_u32(u32x4);
6931
}
6932
6933
// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed
6934
// 64-bit integers, and store the results in dst.
6935
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64
6936
FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
6937
{
6938
uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */
6939
uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */
6940
uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
6941
uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
6942
return vreinterpretq_m128i_u64(u64x2);
6943
}
6944
6945
// Conditionally multiply the packed double-precision (64-bit) floating-point
6946
// elements in a and b using the high 4 bits in imm8, sum the four products, and
6947
// conditionally store the sum in dst using the low 4 bits of imm8.
6948
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd
6949
FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
6950
{
6951
// Generate mask value from constant immediate bit value
6952
const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0;
6953
const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0;
6954
#if !SSE2NEON_PRECISE_DP
6955
const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0;
6956
const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0;
6957
#endif
6958
// Conditional multiplication
6959
#if !SSE2NEON_PRECISE_DP
6960
__m128d mul = _mm_mul_pd(a, b);
6961
const __m128d mulMask =
6962
_mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask));
6963
__m128d tmp = _mm_and_pd(mul, mulMask);
6964
#else
6965
#if defined(__aarch64__) || defined(_M_ARM64)
6966
double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
6967
vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
6968
: 0;
6969
double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) *
6970
vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
6971
: 0;
6972
#else
6973
double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0;
6974
double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0;
6975
#endif
6976
__m128d tmp = _mm_set_pd(d1, d0);
6977
#endif
6978
// Sum the products
6979
#if defined(__aarch64__) || defined(_M_ARM64)
6980
double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
6981
#else
6982
double sum = *((double *) &tmp) + *(((double *) &tmp) + 1);
6983
#endif
6984
// Conditionally store the sum
6985
const __m128d sumMask =
6986
_mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask));
6987
__m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask);
6988
return res;
6989
}
6990
6991
// Conditionally multiply the packed single-precision (32-bit) floating-point
6992
// elements in a and b using the high 4 bits in imm8, sum the four products,
6993
// and conditionally store the sum in dst using the low 4 bits of imm.
6994
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps
6995
FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
6996
{
6997
float32x4_t elementwise_prod = _mm_mul_ps(a, b);
6998
6999
#if defined(__aarch64__) || defined(_M_ARM64)
7000
/* shortcuts */
7001
if (imm == 0xFF) {
7002
return _mm_set1_ps(vaddvq_f32(elementwise_prod));
7003
}
7004
7005
if ((imm & 0x0F) == 0x0F) {
7006
if (!(imm & (1 << 4)))
7007
elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 0);
7008
if (!(imm & (1 << 5)))
7009
elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 1);
7010
if (!(imm & (1 << 6)))
7011
elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 2);
7012
if (!(imm & (1 << 7)))
7013
elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 3);
7014
7015
return _mm_set1_ps(vaddvq_f32(elementwise_prod));
7016
}
7017
#endif
7018
7019
float s = 0.0f;
7020
7021
if (imm & (1 << 4))
7022
s += vgetq_lane_f32(elementwise_prod, 0);
7023
if (imm & (1 << 5))
7024
s += vgetq_lane_f32(elementwise_prod, 1);
7025
if (imm & (1 << 6))
7026
s += vgetq_lane_f32(elementwise_prod, 2);
7027
if (imm & (1 << 7))
7028
s += vgetq_lane_f32(elementwise_prod, 3);
7029
7030
const float32_t res[4] = {
7031
(imm & 0x1) ? s : 0.0f,
7032
(imm & 0x2) ? s : 0.0f,
7033
(imm & 0x4) ? s : 0.0f,
7034
(imm & 0x8) ? s : 0.0f,
7035
};
7036
return vreinterpretq_m128_f32(vld1q_f32(res));
7037
}
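
// Usage sketch (illustrative only): a full four-element dot product broadcast
// to every lane uses imm8 = 0xFF (multiply all lanes, store the sum in all
// lanes). For example, assuming the SSE helpers defined earlier in this header:
//
//   __m128 a   = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
//   __m128 b   = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
//   __m128 dot = _mm_dp_ps(a, b, 0xFF);
//   float r    = _mm_cvtss_f32(dot);  // 70.0f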
7038
7039
// Extract a 32-bit integer from a, selected with imm8, and store the result in
7040
// dst.
7041
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32
7042
// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
7043
#define _mm_extract_epi32(a, imm) \
7044
vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
7045
7046
// Extract a 64-bit integer from a, selected with imm8, and store the result in
7047
// dst.
7048
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi64
7049
// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
7050
#define _mm_extract_epi64(a, imm) \
7051
vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
7052
7053
// Extract an 8-bit integer from a, selected with imm8, and store the result in
7054
// the lower element of dst. FORCE_INLINE int _mm_extract_epi8(__m128i a,
7055
// __constrange(0,16) int imm)
7056
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8
7057
#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
7058
7059
// Extract a single-precision (32-bit) floating-point element from a, selected
// with imm8, and store the result in dst.
// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
7061
#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
7062
7063
// Round the packed double-precision (64-bit) floating-point elements in a down
7064
// to an integer value, and store the results as packed double-precision
7065
// floating-point elements in dst.
7066
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd
7067
FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
7068
{
7069
#if defined(__aarch64__) || defined(_M_ARM64)
7070
return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
7071
#else
7072
double *f = (double *) &a;
7073
return _mm_set_pd(floor(f[1]), floor(f[0]));
7074
#endif
7075
}
7076
7077
// Round the packed single-precision (32-bit) floating-point elements in a down
7078
// to an integer value, and store the results as packed single-precision
7079
// floating-point elements in dst.
7080
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps
7081
FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
7082
{
7083
#if (defined(__aarch64__) || defined(_M_ARM64)) || \
7084
defined(__ARM_FEATURE_DIRECTED_ROUNDING)
7085
return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
7086
#else
7087
float *f = (float *) &a;
7088
return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0]));
7089
#endif
7090
}
7091
7092
// Round the lower double-precision (64-bit) floating-point element in b down to
7093
// an integer value, store the result as a double-precision floating-point
7094
// element in the lower element of dst, and copy the upper element from a to the
7095
// upper element of dst.
7096
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd
7097
FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
7098
{
7099
return _mm_move_sd(a, _mm_floor_pd(b));
7100
}
7101
7102
// Round the lower single-precision (32-bit) floating-point element in b down to
7103
// an integer value, store the result as a single-precision floating-point
7104
// element in the lower element of dst, and copy the upper 3 packed elements
7105
// from a to the upper elements of dst.
7106
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss
7107
FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
7108
{
7109
return _mm_move_ss(a, _mm_floor_ps(b));
7110
}
7111
7112
// Copy a to dst, and insert the 32-bit integer i into dst at the location
7113
// specified by imm8.
7114
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32
7115
// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
7116
// __constrange(0,4) int imm)
7117
#define _mm_insert_epi32(a, b, imm) \
7118
vreinterpretq_m128i_s32( \
7119
vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm)))
7120
7121
// Copy a to dst, and insert the 64-bit integer i into dst at the location
7122
// specified by imm8.
7123
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi64
7124
// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
7125
// __constrange(0,2) int imm)
7126
#define _mm_insert_epi64(a, b, imm) \
7127
vreinterpretq_m128i_s64( \
7128
vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm)))
7129
7130
// Copy a to dst, and insert the lower 8-bit integer from i into dst at the
7131
// location specified by imm8.
7132
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8
7133
// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
7134
// __constrange(0,16) int imm)
7135
#define _mm_insert_epi8(a, b, imm) \
7136
vreinterpretq_m128i_s8(vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm)))
7137
7138
// Copy a to tmp, then insert a single-precision (32-bit) floating-point
7139
// element from b into tmp using the control in imm8. Store tmp to dst using
7140
// the mask in imm8 (elements are zeroed out when the corresponding bit is set).
7141
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps
7142
#define _mm_insert_ps(a, b, imm8) \
7143
_sse2neon_define2( \
7144
__m128, a, b, \
7145
float32x4_t tmp1 = \
7146
vsetq_lane_f32(vgetq_lane_f32(_b, (imm8 >> 6) & 0x3), \
7147
vreinterpretq_f32_m128(_a), 0); \
7148
float32x4_t tmp2 = \
7149
vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), \
7150
vreinterpretq_f32_m128(_a), ((imm8 >> 4) & 0x3)); \
7151
const uint32_t data[4] = \
7152
_sse2neon_init(((imm8) & (1 << 0)) ? UINT32_MAX : 0, \
7153
((imm8) & (1 << 1)) ? UINT32_MAX : 0, \
7154
((imm8) & (1 << 2)) ? UINT32_MAX : 0, \
7155
((imm8) & (1 << 3)) ? UINT32_MAX : 0); \
7156
uint32x4_t mask = vld1q_u32(data); \
7157
float32x4_t all_zeros = vdupq_n_f32(0); \
7158
\
7159
_sse2neon_return(vreinterpretq_m128_f32( \
7160
vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))));)
7161
7162
// Compare packed signed 32-bit integers in a and b, and store packed maximum
7163
// values in dst.
7164
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32
7165
FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
7166
{
7167
return vreinterpretq_m128i_s32(
7168
vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7169
}
7170
7171
// Compare packed signed 8-bit integers in a and b, and store packed maximum
7172
// values in dst.
7173
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8
7174
FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
7175
{
7176
return vreinterpretq_m128i_s8(
7177
vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
7178
}
7179
7180
// Compare packed unsigned 16-bit integers in a and b, and store packed maximum
7181
// values in dst.
7182
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16
7183
FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
7184
{
7185
return vreinterpretq_m128i_u16(
7186
vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
7187
}
7188
7189
// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
7190
// values in dst.
7191
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32
7192
FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
7193
{
7194
return vreinterpretq_m128i_u32(
7195
vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
7196
}
7197
7198
// Compare packed signed 32-bit integers in a and b, and store packed minimum
7199
// values in dst.
7200
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32
7201
FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
7202
{
7203
return vreinterpretq_m128i_s32(
7204
vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7205
}
7206
7207
// Compare packed signed 8-bit integers in a and b, and store packed minimum
7208
// values in dst.
7209
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8
7210
FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
7211
{
7212
return vreinterpretq_m128i_s8(
7213
vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
7214
}
7215
7216
// Compare packed unsigned 16-bit integers in a and b, and store packed minimum
7217
// values in dst.
7218
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16
7219
FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
7220
{
7221
return vreinterpretq_m128i_u16(
7222
vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
7223
}
7224
7225
// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
7226
// values in dst.
7227
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu32
7228
FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
7229
{
7230
return vreinterpretq_m128i_u32(
7231
vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
7232
}
7233
7234
// Horizontally compute the minimum amongst the packed unsigned 16-bit integers
7235
// in a, store the minimum and index in dst, and zero the remaining bits in dst.
7236
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16
7237
FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
7238
{
7239
__m128i dst;
7240
uint16_t min, idx = 0;
7241
#if defined(__aarch64__) || defined(_M_ARM64)
7242
// Find the minimum value
7243
min = vminvq_u16(vreinterpretq_u16_m128i(a));
7244
7245
// Get the index of the minimum value
7246
static const uint16_t idxv[] = {0, 1, 2, 3, 4, 5, 6, 7};
7247
uint16x8_t minv = vdupq_n_u16(min);
7248
uint16x8_t cmeq = vceqq_u16(minv, vreinterpretq_u16_m128i(a));
7249
idx = vminvq_u16(vornq_u16(vld1q_u16(idxv), cmeq));
7250
#else
7251
// Find the minimum value
7252
__m64 tmp;
7253
tmp = vreinterpret_m64_u16(
7254
vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
7255
vget_high_u16(vreinterpretq_u16_m128i(a))));
7256
tmp = vreinterpret_m64_u16(
7257
vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
7258
tmp = vreinterpret_m64_u16(
7259
vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
7260
min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
7261
// Get the index of the minimum value
7262
int i;
7263
for (i = 0; i < 8; i++) {
7264
if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
7265
idx = (uint16_t) i;
7266
break;
7267
}
7268
a = _mm_srli_si128(a, 2);
7269
}
7270
#endif
7271
// Generate result
7272
dst = _mm_setzero_si128();
7273
dst = vreinterpretq_m128i_u16(
7274
vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
7275
dst = vreinterpretq_m128i_u16(
7276
vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
7277
return dst;
7278
}
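
// Usage sketch (illustrative only): the minimum value ends up in 16-bit lane 0
// and its index in lane 1, so both can be read back with _mm_extract_epi16.
// For example, assuming the SSE helpers defined earlier in this header:
//
//   __m128i v   = _mm_setr_epi16(9, 4, 7, 4, 8, 6, 5, 3);
//   __m128i mp  = _mm_minpos_epu16(v);
//   int min_val = _mm_extract_epi16(mp, 0);  // 3
//   int min_idx = _mm_extract_epi16(mp, 1);  // 7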
7279
7280
// Compute the sum of absolute differences (SADs) of quadruplets of unsigned
// 8-bit integers in a compared to those in b, and store the 16-bit results in
// dst. Eight SADs are performed using one quadruplet from b and eight
// quadruplets from a. One quadruplet is selected from b starting at the
// offset specified in imm8. Eight quadruplets are formed from sequential 8-bit
// integers selected from a starting at the offset specified in imm8.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8
7287
FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
7288
{
7289
uint8x16_t _a, _b;
7290
7291
switch (imm & 0x4) {
7292
case 0:
7293
// do nothing
7294
_a = vreinterpretq_u8_m128i(a);
7295
break;
7296
case 4:
7297
_a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a),
7298
vreinterpretq_u32_m128i(a), 1));
7299
break;
7300
default:
7301
#if defined(__GNUC__) || defined(__clang__)
7302
__builtin_unreachable();
7303
#elif defined(_MSC_VER)
7304
__assume(0);
7305
#endif
7306
break;
7307
}
7308
7309
switch (imm & 0x3) {
7310
case 0:
7311
_b = vreinterpretq_u8_u32(
7312
vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0)));
7313
break;
7314
case 1:
7315
_b = vreinterpretq_u8_u32(
7316
vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1)));
7317
break;
7318
case 2:
7319
_b = vreinterpretq_u8_u32(
7320
vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2)));
7321
break;
7322
case 3:
7323
_b = vreinterpretq_u8_u32(
7324
vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3)));
7325
break;
7326
default:
7327
#if defined(__GNUC__) || defined(__clang__)
7328
__builtin_unreachable();
7329
#elif defined(_MSC_VER)
7330
__assume(0);
7331
#endif
7332
break;
7333
}
7334
7335
int16x8_t c04, c15, c26, c37;
7336
uint8x8_t low_b = vget_low_u8(_b);
7337
c04 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a), low_b));
7338
uint8x16_t _a_1 = vextq_u8(_a, _a, 1);
7339
c15 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_1), low_b));
7340
uint8x16_t _a_2 = vextq_u8(_a, _a, 2);
7341
c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b));
7342
uint8x16_t _a_3 = vextq_u8(_a, _a, 3);
7343
c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b));
7344
#if defined(__aarch64__) || defined(_M_ARM64)
7345
// |0|4|2|6|
7346
c04 = vpaddq_s16(c04, c26);
7347
// |1|5|3|7|
7348
c15 = vpaddq_s16(c15, c37);
7349
7350
int32x4_t trn1_c =
7351
vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
7352
int32x4_t trn2_c =
7353
vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
7354
return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c),
7355
vreinterpretq_s16_s32(trn2_c)));
7356
#else
7357
int16x4_t c01, c23, c45, c67;
7358
c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15));
7359
c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37));
7360
c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15));
7361
c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37));
7362
7363
return vreinterpretq_m128i_s16(
7364
vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67)));
7365
#endif
7366
}
7367
7368
// Multiply the low signed 32-bit integers from each packed 64-bit element in
7369
// a and b, and store the signed 64-bit results in dst.
7370
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32
7371
FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
7372
{
7373
// vmull_s32 upcasts instead of masking, so we downcast.
7374
int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
7375
int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
7376
return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
7377
}
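
// Usage sketch (illustrative only): _mm_mul_epi32 reads only the low 32-bit
// half of each 64-bit element and produces full 64-bit products, unlike
// _mm_mullo_epi32 below, which keeps just the low 32 bits of every product.
// For example, assuming the _mm_setr_epi32 helper defined earlier in this
// header:
//
//   __m128i a = _mm_setr_epi32(100000, 9, 3, 9);
//   __m128i b = _mm_setr_epi32(100000, 9, -2, 9);
//   __m128i p = _mm_mul_epi32(a, b);
//   // 64-bit lanes of p: {10000000000, -6}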
7378
7379
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit
7380
// integers, and store the low 32 bits of the intermediate integers in dst.
7381
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32
7382
FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
7383
{
7384
return vreinterpretq_m128i_s32(
7385
vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7386
}
7387
7388
// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
7389
// using unsigned saturation, and store the results in dst.
7390
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32
7391
FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
7392
{
7393
return vreinterpretq_m128i_u16(
7394
vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
7395
vqmovun_s32(vreinterpretq_s32_m128i(b))));
7396
}
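
// Usage sketch (illustrative only): values outside the unsigned 16-bit range
// saturate rather than wrap. For example, assuming the _mm_setr_epi32 helper
// defined earlier in this header:
//
//   __m128i a = _mm_setr_epi32(-5, 0, 1000, 70000);
//   __m128i b = _mm_setr_epi32(1, 2, 3, 4);
//   __m128i r = _mm_packus_epi32(a, b);
//   // low four 16-bit lanes: {0, 0, 1000, 65535}; high four: {1, 2, 3, 4}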
7397
7398
// Round the packed double-precision (64-bit) floating-point elements in a using
7399
// the rounding parameter, and store the results as packed double-precision
7400
// floating-point elements in dst.
7401
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd
7402
FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
7403
{
7404
#if defined(__aarch64__) || defined(_M_ARM64)
7405
switch (rounding) {
7406
case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
7407
return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
7408
case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
7409
return _mm_floor_pd(a);
7410
case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
7411
return _mm_ceil_pd(a);
7412
case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
7413
return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));
7414
default: //_MM_FROUND_CUR_DIRECTION
7415
return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));
7416
}
7417
#else
7418
double *v_double = (double *) &a;
7419
7420
if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
7421
(rounding == _MM_FROUND_CUR_DIRECTION &&
7422
_MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
7423
double res[2], tmp;
7424
for (int i = 0; i < 2; i++) {
7425
tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];
7426
double roundDown = floor(tmp); // Round down value
7427
double roundUp = ceil(tmp); // Round up value
7428
double diffDown = tmp - roundDown;
7429
double diffUp = roundUp - tmp;
7430
if (diffDown < diffUp) {
7431
/* If it's closer to the round down value, then use it */
7432
res[i] = roundDown;
7433
} else if (diffDown > diffUp) {
7434
/* If it's closer to the round up value, then use it */
7435
res[i] = roundUp;
7436
} else {
7437
/* If it's equidistant between round up and round down value,
7438
* pick the one which is an even number */
7439
double half = roundDown / 2;
7440
if (half != floor(half)) {
7441
/* If the round down value is odd, return the round up value
7442
*/
7443
res[i] = roundUp;
7444
} else {
7445
/* If the round up value is odd, return the round down value
7446
*/
7447
res[i] = roundDown;
7448
}
7449
}
7450
res[i] = (v_double[i] < 0) ? -res[i] : res[i];
7451
}
7452
return _mm_set_pd(res[1], res[0]);
7453
} else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
7454
(rounding == _MM_FROUND_CUR_DIRECTION &&
7455
_MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
7456
return _mm_floor_pd(a);
7457
} else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
7458
(rounding == _MM_FROUND_CUR_DIRECTION &&
7459
_MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
7460
return _mm_ceil_pd(a);
7461
}
7462
return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),
7463
v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0]));
7464
#endif
7465
}
7466
7467
// Round the packed single-precision (32-bit) floating-point elements in a using
7468
// the rounding parameter, and store the results as packed single-precision
7469
// floating-point elements in dst.
7470
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ps
7471
FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
7472
{
7473
#if (defined(__aarch64__) || defined(_M_ARM64)) || \
7474
defined(__ARM_FEATURE_DIRECTED_ROUNDING)
7475
switch (rounding) {
7476
case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
7477
return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
7478
case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
7479
return _mm_floor_ps(a);
7480
case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
7481
return _mm_ceil_ps(a);
7482
case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
7483
return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
7484
default: //_MM_FROUND_CUR_DIRECTION
7485
return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
7486
}
7487
#else
7488
float *v_float = (float *) &a;
7489
7490
if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
7491
(rounding == _MM_FROUND_CUR_DIRECTION &&
7492
_MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
7493
uint32x4_t signmask = vdupq_n_u32(0x80000000);
7494
float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
7495
vdupq_n_f32(0.5f)); /* +/- 0.5 */
7496
int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
7497
vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
7498
int32x4_t r_trunc = vcvtq_s32_f32(
7499
vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
7500
int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
7501
vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
7502
int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
7503
vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
7504
float32x4_t delta = vsubq_f32(
7505
vreinterpretq_f32_m128(a),
7506
vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
7507
uint32x4_t is_delta_half =
7508
vceqq_f32(delta, half); /* delta == +/- 0.5 */
7509
return vreinterpretq_m128_f32(
7510
vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));
7511
} else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
7512
(rounding == _MM_FROUND_CUR_DIRECTION &&
7513
_MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
7514
return _mm_floor_ps(a);
7515
} else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
7516
(rounding == _MM_FROUND_CUR_DIRECTION &&
7517
_MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
7518
return _mm_ceil_ps(a);
7519
}
7520
return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),
7521
v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),
7522
v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),
7523
v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0]));
7524
#endif
7525
}
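
// Usage sketch (illustrative only): the rounding constants combine with
// _MM_FROUND_NO_EXC just as on x86. For example, assuming the _mm_setr_ps
// helper defined earlier in this header:
//
//   __m128 v = _mm_setr_ps(1.25f, 2.5f, -2.5f, 3.75f);
//   __m128 n = _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
//   // n = {1.0f, 2.0f, -2.0f, 4.0f}  (ties round to even)
//   __m128 t = _mm_round_ps(v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
//   // t = {1.0f, 2.0f, -2.0f, 3.0f}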
7526
7527
// Round the lower double-precision (64-bit) floating-point element in b using
7528
// the rounding parameter, store the result as a double-precision floating-point
7529
// element in the lower element of dst, and copy the upper element from a to the
7530
// upper element of dst.
7531
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd
7532
FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
7533
{
7534
return _mm_move_sd(a, _mm_round_pd(b, rounding));
7535
}
7536
7537
// Round the lower single-precision (32-bit) floating-point element in b using
7538
// the rounding parameter, store the result as a single-precision floating-point
7539
// element in the lower element of dst, and copy the upper 3 packed elements
7540
// from a to the upper elements of dst. Rounding is done according to the
7541
// rounding[3:0] parameter, which can be one of:
7542
// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and
7543
// suppress exceptions
7544
// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and
7545
// suppress exceptions
7546
// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress
7547
// exceptions
7548
// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress
7549
// exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see
7550
// _MM_SET_ROUNDING_MODE
7551
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss
7552
FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
7553
{
7554
return _mm_move_ss(a, _mm_round_ps(b, rounding));
7555
}
7556
7557
// Load 128-bits of integer data from memory into dst using a non-temporal
7558
// memory hint. mem_addr must be aligned on a 16-byte boundary or a
7559
// general-protection exception may be generated.
7560
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128
7561
FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
7562
{
7563
#if __has_builtin(__builtin_nontemporal_store)
7564
return __builtin_nontemporal_load(p);
7565
#else
7566
return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
7567
#endif
7568
}
7569
7570
// Compute the bitwise NOT of a and then AND with a 128-bit vector containing
7571
// all 1's, and return 1 if the result is zero, otherwise return 0.
7572
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones
7573
FORCE_INLINE int _mm_test_all_ones(__m128i a)
7574
{
7575
return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
7576
~(uint64_t) 0;
7577
}
7578
7579
// Compute the bitwise AND of 128 bits (representing integer data) in a and
7580
// mask, and return 1 if the result is zero, otherwise return 0.
7581
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros
7582
FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
7583
{
7584
int64x2_t a_and_mask =
7585
vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
7586
return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1));
7587
}
7588
7589
// Compute the bitwise AND of 128 bits (representing integer data) in a and
7590
// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute
7591
// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is
7592
// zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
7593
// otherwise return 0.
7594
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_mix_ones_zeros
7595
FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
7596
{
7597
uint64x2_t zf =
7598
vandq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
7599
uint64x2_t cf =
7600
vbicq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
7601
uint64x2_t result = vandq_u64(zf, cf);
7602
return !(vgetq_lane_u64(result, 0) | vgetq_lane_u64(result, 1));
7603
}
7604
7605
// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
7606
// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
7607
// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
7608
// otherwise set CF to 0. Return the CF value.
7609
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128
7610
FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
7611
{
7612
int64x2_t s64 =
7613
vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a));
7614
return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
7615
}
7616
7617
// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
7618
// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
7619
// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
7620
// otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
7621
// otherwise return 0.
7622
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128
7623
#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)
7624
7625
// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
7626
// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
7627
// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
7628
// otherwise set CF to 0. Return the ZF value.
7629
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128
7630
FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
7631
{
7632
int64x2_t s64 =
7633
vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
7634
return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
7635
}
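
// Usage sketch (illustrative only): _mm_testz_si128 is the vector analogue of
// "(a & b) == 0". For example, assuming the _mm_set1_epi32 helper defined
// earlier in this header:
//
//   __m128i flags = _mm_set1_epi32(0x2);
//   __m128i probe = _mm_set1_epi32(0x1);
//   int none_set  = _mm_testz_si128(flags, probe);  // 1: no common bits set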
7636
7637
/* SSE4.2 */
7638
7639
static const uint16_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask16b[8] = {
7640
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
7641
};
7642
static const uint8_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask8b[16] = {
7643
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
7644
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
7645
};
7646
7647
/* specify the source data format */
7648
#define _SIDD_UBYTE_OPS 0x00 /* unsigned 8-bit characters */
7649
#define _SIDD_UWORD_OPS 0x01 /* unsigned 16-bit characters */
7650
#define _SIDD_SBYTE_OPS 0x02 /* signed 8-bit characters */
7651
#define _SIDD_SWORD_OPS 0x03 /* signed 16-bit characters */
7652
7653
/* specify the comparison operation */
7654
#define _SIDD_CMP_EQUAL_ANY 0x00 /* compare equal any: strchr */
7655
#define _SIDD_CMP_RANGES 0x04 /* compare ranges */
7656
#define _SIDD_CMP_EQUAL_EACH 0x08 /* compare equal each: strcmp */
7657
#define _SIDD_CMP_EQUAL_ORDERED 0x0C /* compare equal ordered */
7658
7659
/* specify the polarity */
7660
#define _SIDD_POSITIVE_POLARITY 0x00
7661
#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
7662
#define _SIDD_NEGATIVE_POLARITY 0x10 /* negate results */
7663
#define _SIDD_MASKED_NEGATIVE_POLARITY \
7664
0x30 /* negate results only before end of string */
7665
7666
/* specify the output selection in _mm_cmpXstri */
7667
#define _SIDD_LEAST_SIGNIFICANT 0x00
7668
#define _SIDD_MOST_SIGNIFICANT 0x40
7669
7670
/* specify the output selection in _mm_cmpXstrm */
7671
#define _SIDD_BIT_MASK 0x00
7672
#define _SIDD_UNIT_MASK 0x40
7673

/* Pattern Matching for C macros.
 * https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms
 */

/* catenate */
#define SSE2NEON_PRIMITIVE_CAT(a, ...) a##__VA_ARGS__
#define SSE2NEON_CAT(a, b) SSE2NEON_PRIMITIVE_CAT(a, b)

#define SSE2NEON_IIF(c) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_IIF_, c)
/* run the 2nd parameter */
#define SSE2NEON_IIF_0(t, ...) __VA_ARGS__
/* run the 1st parameter */
#define SSE2NEON_IIF_1(t, ...) t

#define SSE2NEON_COMPL(b) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_COMPL_, b)
#define SSE2NEON_COMPL_0 1
#define SSE2NEON_COMPL_1 0

#define SSE2NEON_DEC(x) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_DEC_, x)
#define SSE2NEON_DEC_1 0
#define SSE2NEON_DEC_2 1
#define SSE2NEON_DEC_3 2
#define SSE2NEON_DEC_4 3
#define SSE2NEON_DEC_5 4
#define SSE2NEON_DEC_6 5
#define SSE2NEON_DEC_7 6
#define SSE2NEON_DEC_8 7
#define SSE2NEON_DEC_9 8
#define SSE2NEON_DEC_10 9
#define SSE2NEON_DEC_11 10
#define SSE2NEON_DEC_12 11
#define SSE2NEON_DEC_13 12
#define SSE2NEON_DEC_14 13
#define SSE2NEON_DEC_15 14
#define SSE2NEON_DEC_16 15

/* detection */
#define SSE2NEON_CHECK_N(x, n, ...) n
#define SSE2NEON_CHECK(...) SSE2NEON_CHECK_N(__VA_ARGS__, 0, )
#define SSE2NEON_PROBE(x) x, 1,

#define SSE2NEON_NOT(x) SSE2NEON_CHECK(SSE2NEON_PRIMITIVE_CAT(SSE2NEON_NOT_, x))
#define SSE2NEON_NOT_0 SSE2NEON_PROBE(~)

#define SSE2NEON_BOOL(x) SSE2NEON_COMPL(SSE2NEON_NOT(x))
#define SSE2NEON_IF(c) SSE2NEON_IIF(SSE2NEON_BOOL(c))

#define SSE2NEON_EAT(...)
#define SSE2NEON_EXPAND(...) __VA_ARGS__
#define SSE2NEON_WHEN(c) SSE2NEON_IF(c)(SSE2NEON_EXPAND, SSE2NEON_EAT)

/* recursion */
/* deferred expression */
#define SSE2NEON_EMPTY()
#define SSE2NEON_DEFER(id) id SSE2NEON_EMPTY()
#define SSE2NEON_OBSTRUCT(...) __VA_ARGS__ SSE2NEON_DEFER(SSE2NEON_EMPTY)()
#define SSE2NEON_EXPAND(...) __VA_ARGS__

#define SSE2NEON_EVAL(...) \
    SSE2NEON_EVAL1(SSE2NEON_EVAL1(SSE2NEON_EVAL1(__VA_ARGS__)))
#define SSE2NEON_EVAL1(...) \
    SSE2NEON_EVAL2(SSE2NEON_EVAL2(SSE2NEON_EVAL2(__VA_ARGS__)))
#define SSE2NEON_EVAL2(...) \
    SSE2NEON_EVAL3(SSE2NEON_EVAL3(SSE2NEON_EVAL3(__VA_ARGS__)))
#define SSE2NEON_EVAL3(...) __VA_ARGS__

#define SSE2NEON_REPEAT(count, macro, ...) \
    SSE2NEON_WHEN(count) \
    (SSE2NEON_OBSTRUCT(SSE2NEON_REPEAT_INDIRECT)()( \
        SSE2NEON_DEC(count), macro, \
        __VA_ARGS__) SSE2NEON_OBSTRUCT(macro)(SSE2NEON_DEC(count), \
                                              __VA_ARGS__))
#define SSE2NEON_REPEAT_INDIRECT() SSE2NEON_REPEAT
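
/* A rough sketch of how the machinery above is used: wrapping SSE2NEON_REPEAT
 * in SSE2NEON_EVAL forces the deferred recursion to expand, so
 *
 *   SSE2NEON_EVAL(SSE2NEON_REPEAT(3, SSE2NEON_FILL_LANE, u8))
 *
 * expands (conceptually) to
 *
 *   SSE2NEON_FILL_LANE(0, u8) SSE2NEON_FILL_LANE(1, u8) SSE2NEON_FILL_LANE(2, u8)
 *
 * i.e. the macro body repeated once per lane, with indices 0 .. count-1.
 */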

#define SSE2NEON_SIZE_OF_byte 8
#define SSE2NEON_NUMBER_OF_LANES_byte 16
#define SSE2NEON_SIZE_OF_word 16
#define SSE2NEON_NUMBER_OF_LANES_word 8

#define SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE(i, type) \
    mtx[i] = vreinterpretq_m128i_##type(vceqq_##type( \
        vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)), \
        vreinterpretq_##type##_m128i(a)));

#define SSE2NEON_FILL_LANE(i, type) \
    vec_b[i] = \
        vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i));

#define PCMPSTR_RANGES(a, b, mtx, data_type_prefix, type_prefix, size, \
                       number_of_lanes, byte_or_word) \
    do { \
        SSE2NEON_CAT( \
            data_type_prefix, \
            SSE2NEON_CAT(size, \
                         SSE2NEON_CAT(x, SSE2NEON_CAT(number_of_lanes, _t)))) \
        vec_b[number_of_lanes]; \
        __m128i mask = SSE2NEON_IIF(byte_or_word)( \
            vreinterpretq_m128i_u16(vdupq_n_u16(0xff)), \
            vreinterpretq_m128i_u32(vdupq_n_u32(0xffff))); \
        SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, SSE2NEON_FILL_LANE, \
                                      SSE2NEON_CAT(type_prefix, size))) \
        for (int i = 0; i < number_of_lanes; i++) { \
            mtx[i] = SSE2NEON_CAT(vreinterpretq_m128i_u, \
                                  size)(SSE2NEON_CAT(vbslq_u, size)( \
                SSE2NEON_CAT(vreinterpretq_u, \
                             SSE2NEON_CAT(size, _m128i))(mask), \
                SSE2NEON_CAT(vcgeq_, SSE2NEON_CAT(type_prefix, size))( \
                    vec_b[i], \
                    SSE2NEON_CAT( \
                        vreinterpretq_, \
                        SSE2NEON_CAT(type_prefix, \
                                     SSE2NEON_CAT(size, _m128i(a))))), \
                SSE2NEON_CAT(vcleq_, SSE2NEON_CAT(type_prefix, size))( \
                    vec_b[i], \
                    SSE2NEON_CAT( \
                        vreinterpretq_, \
                        SSE2NEON_CAT(type_prefix, \
                                     SSE2NEON_CAT(size, _m128i(a))))))); \
        } \
    } while (0)

#define PCMPSTR_EQ(a, b, mtx, size, number_of_lanes) \
    do { \
        SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, \
                                      SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE, \
                                      SSE2NEON_CAT(u, size))) \
    } while (0)

#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type) \
    static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \
                                                int lb) \
    { \
        __m128i mtx[16]; \
        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \
        return SSE2NEON_CAT( \
            _sse2neon_aggregate_equal_any_, \
            SSE2NEON_CAT( \
                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \
                                             type))))(la, lb, mtx); \
    }

#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word) \
    static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \
                                                 int lb) \
    { \
        __m128i mtx[16]; \
        PCMPSTR_RANGES( \
            a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word); \
        return SSE2NEON_CAT( \
            _sse2neon_aggregate_ranges_, \
            SSE2NEON_CAT( \
                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \
                                             type))))(la, lb, mtx); \
    }

#define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type) \
    static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la, \
                                                    __m128i b, int lb) \
    { \
        __m128i mtx[16]; \
        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \
        return SSE2NEON_CAT( \
            _sse2neon_aggregate_equal_ordered_, \
            SSE2NEON_CAT( \
                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
                SSE2NEON_CAT(x, \
                             SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type))))( \
            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx); \
    }

static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16])
{
    int res = 0;
    int m = (1 << la) - 1;
    uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
    uint8x16_t vec = vcombine_u8(t_lo, t_hi);
    for (int j = 0; j < lb; j++) {
        mtx[j] = vreinterpretq_m128i_u8(
            vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
        mtx[j] = vreinterpretq_m128i_u8(
            vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
        int tmp = _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0;
        res |= (tmp << j);
    }
    return res;
}

static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16])
{
    int res = 0;
    int m = (1 << la) - 1;
    uint16x8_t vec =
        vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
    for (int j = 0; j < lb; j++) {
        mtx[j] = vreinterpretq_m128i_u16(
            vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
        mtx[j] = vreinterpretq_m128i_u16(
            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
        int tmp = _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0;
        res |= (tmp << j);
    }
    return res;
}

/* clang-format off */
#define SSE2NEON_GENERATE_CMP_EQUAL_ANY(prefix) \
    prefix##IMPL(byte) \
    prefix##IMPL(word)
/* clang-format on */

SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_)
static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
{
    int res = 0;
    int m = (1 << la) - 1;
    uint16x8_t vec =
        vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
    for (int j = 0; j < lb; j++) {
        mtx[j] = vreinterpretq_m128i_u16(
            vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
        mtx[j] = vreinterpretq_m128i_u16(
            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
        __m128i tmp = vreinterpretq_m128i_u32(
            vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16));
        uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]),
                                       vreinterpretq_u32_m128i(tmp));
#if defined(__aarch64__) || defined(_M_ARM64)
        int t = vaddvq_u32(vec_res) ? 1 : 0;
#else
        uint64x2_t sumh = vpaddlq_u32(vec_res);
        int t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
#endif
        res |= (t << j);
    }
    return res;
}

static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
{
    int res = 0;
    int m = (1 << la) - 1;
    uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
    uint8x16_t vec = vcombine_u8(t_lo, t_hi);
    for (int j = 0; j < lb; j++) {
        mtx[j] = vreinterpretq_m128i_u8(
            vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
        mtx[j] = vreinterpretq_m128i_u8(
            vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
        __m128i tmp = vreinterpretq_m128i_u16(
            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8));
        uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]),
                                       vreinterpretq_u16_m128i(tmp));
        int t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0;
        res |= (t << j);
    }
    return res;
}

#define SSE2NEON_CMP_RANGES_IS_BYTE 1
#define SSE2NEON_CMP_RANGES_IS_WORD 0

/* clang-format off */
#define SSE2NEON_GENERATE_CMP_RANGES(prefix) \
    prefix##IMPL(byte, uint, u, prefix##IS_BYTE) \
    prefix##IMPL(byte, int, s, prefix##IS_BYTE) \
    prefix##IMPL(word, uint, u, prefix##IS_WORD) \
    prefix##IMPL(word, int, s, prefix##IS_WORD)
/* clang-format on */

SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_)

#undef SSE2NEON_CMP_RANGES_IS_BYTE
#undef SSE2NEON_CMP_RANGES_IS_WORD

static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb)
{
    uint8x16_t mtx =
        vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b));
    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
    int m1 = 0x10000 - (1 << la);
    int tb = 0x10000 - (1 << lb);
    uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi;
    uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi;
    vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
    vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask);
    vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask);
    vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask);
    vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask);
    tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask);
    tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask);

    res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx));
    res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx));
    res_lo = vbsl_u8(vec1_lo, tmp_lo, res_lo);
    res_hi = vbsl_u8(vec1_hi, tmp_hi, res_hi);
    res_lo = vand_u8(res_lo, vec_mask);
    res_hi = vand_u8(res_hi, vec_mask);

    int res = _sse2neon_vaddv_u8(res_lo) + (_sse2neon_vaddv_u8(res_hi) << 8);
    return res;
}

static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb)
{
    uint16x8_t mtx =
        vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
    int m1 = 0x100 - (1 << la);
    int tb = 0x100 - (1 << lb);
    uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b);
    uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask);
    uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask);
    uint16x8_t tmp = vtstq_u16(vdupq_n_u16(tb), vec_mask);
    mtx = vbslq_u16(vec0, vdupq_n_u16(0), mtx);
    mtx = vbslq_u16(vec1, tmp, mtx);
    mtx = vandq_u16(mtx, vec_mask);
    return _sse2neon_vaddvq_u16(mtx);
}

#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE 1
#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0

#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type) \
    static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes( \
        int bound, int la, int lb, __m128i mtx[16]) \
    { \
        int res = 0; \
        int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la); \
        uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)( \
            vld1_u##size(_sse2neon_cmpestr_mask##size##b), \
            vld1q_u##size(_sse2neon_cmpestr_mask##size##b)); \
        uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)( \
            vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask), \
                             vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \
            vtstq_u##size(vdupq_n_u##size(m1), vec_mask)); \
        uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \
        uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0); \
        for (int j = 0; j < lb; j++) { \
            mtx[j] = vreinterpretq_m128i_u##size(vbslq_u##size( \
                vec1, vec_minusone, vreinterpretq_u##size##_m128i(mtx[j]))); \
        } \
        for (int j = lb; j < bound; j++) { \
            mtx[j] = vreinterpretq_m128i_u##size( \
                vbslq_u##size(vec1, vec_minusone, vec_zero)); \
        } \
        unsigned SSE2NEON_IIF(data_type)(char, short) *ptr = \
            (unsigned SSE2NEON_IIF(data_type)(char, short) *) mtx; \
        for (int i = 0; i < bound; i++) { \
            int val = 1; \
            for (int j = 0, k = i; j < bound - i && k < bound; j++, k++) \
                val &= ptr[k * bound + j]; \
            res += val << i; \
        } \
        return res; \
    }

/* clang-format off */
#define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix) \
    prefix##IMPL(8, 16, prefix##IS_UBYTE) \
    prefix##IMPL(16, 8, prefix##IS_UWORD)
/* clang-format on */

SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(SSE2NEON_AGGREGATE_EQUAL_ORDER_)

#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE
#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD

/* clang-format off */
#define SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(prefix) \
    prefix##IMPL(byte) \
    prefix##IMPL(word)
/* clang-format on */

SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(SSE2NEON_CMP_EQUAL_ORDERED_)
#define SSE2NEON_CMPESTR_LIST \
    _(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any) \
    _(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any) \
    _(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any) \
    _(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any) \
    _(CMP_UBYTE_RANGES, cmp_ubyte_ranges) \
    _(CMP_UWORD_RANGES, cmp_uword_ranges) \
    _(CMP_SBYTE_RANGES, cmp_sbyte_ranges) \
    _(CMP_SWORD_RANGES, cmp_sword_ranges) \
    _(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each) \
    _(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each) \
    _(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each) \
    _(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each) \
    _(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
    _(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \
    _(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
    _(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered)

enum {
#define _(name, func_suffix) name,
    SSE2NEON_CMPESTR_LIST
#undef _
};
typedef int (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb);
static cmpestr_func_t _sse2neon_cmpfunc_table[] = {
#define _(name, func_suffix) _sse2neon_##func_suffix,
    SSE2NEON_CMPESTR_LIST
#undef _
};

FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
{
    switch (imm8 & 0x30) {
    case _SIDD_NEGATIVE_POLARITY:
        res ^= 0xffffffff;
        break;
    case _SIDD_MASKED_NEGATIVE_POLARITY:
        res ^= (1 << lb) - 1;
        break;
    default:
        break;
    }

    return res & ((bound == 8) ? 0xFF : 0xFFFF);
}
FORCE_INLINE int _sse2neon_clz(unsigned int x)
{
#ifdef _MSC_VER
    unsigned long cnt = 0;
    if (_BitScanReverse(&cnt, x))
        return 31 - cnt;
    return 32;
#else
    return x != 0 ? __builtin_clz(x) : 32;
#endif
}

FORCE_INLINE int _sse2neon_ctz(unsigned int x)
{
#ifdef _MSC_VER
    unsigned long cnt = 0;
    if (_BitScanForward(&cnt, x))
        return cnt;
    return 32;
#else
    return x != 0 ? __builtin_ctz(x) : 32;
#endif
}

FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
{
#ifdef _MSC_VER
    unsigned long cnt;
#if defined(SSE2NEON_HAS_BITSCAN64)
    if (_BitScanForward64(&cnt, x))
        return (int) (cnt);
#else
    if (_BitScanForward(&cnt, (unsigned long) (x)))
        return (int) cnt;
    if (_BitScanForward(&cnt, (unsigned long) (x >> 32)))
        return (int) (cnt + 32);
#endif /* SSE2NEON_HAS_BITSCAN64 */
    return 64;
#else /* assume GNU compatible compilers */
    return x != 0 ? __builtin_ctzll(x) : 64;
#endif
}

#define SSE2NEON_MIN(x, y) (x) < (y) ? (x) : (y)

#define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \
    const int var = (imm & 0x01) ? 8 : 16
#define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \
    int tmp1 = la ^ (la >> 31); \
    la = tmp1 - (la >> 31); \
    int tmp2 = lb ^ (lb >> 31); \
    lb = tmp2 - (lb >> 31); \
    la = SSE2NEON_MIN(la, bound); \
    lb = SSE2NEON_MIN(lb, bound)

// Compare all pairs of characters in strings a and b,
// then aggregate the result.
// Since the only difference between PCMPESTR* and PCMPISTR* is how the string
// lengths are determined, we use SSE2NEON_CMP{E,I}STRX_LEN_PAIR to obtain the
// lengths of strings a and b.
#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE) \
    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); \
    SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb); \
    int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \
    r2 = _sse2neon_sido_negative(r2, lb, imm8, bound)

#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8) \
    return (r2 == 0) ? bound \
                     : ((imm8 & 0x40) ? (31 - _sse2neon_clz(r2)) \
                                      : _sse2neon_ctz(r2))
#define SSE2NEON_CMPSTR_GENERATE_MASK(dst) \
    __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \
    if (imm8 & 0x40) { \
        if (bound == 8) { \
            uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2), \
                                       vld1q_u16(_sse2neon_cmpestr_mask16b)); \
            dst = vreinterpretq_m128i_u16(vbslq_u16( \
                tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst))); \
        } else { \
            uint8x16_t vec_r2 = \
                vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8)); \
            uint8x16_t tmp = \
                vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b)); \
            dst = vreinterpretq_m128i_u8( \
                vbslq_u8(tmp, vdupq_n_u8(-1), vreinterpretq_u8_m128i(dst))); \
        } \
    } else { \
        if (bound == 16) { \
            dst = vreinterpretq_m128i_u16( \
                vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \
        } else { \
            dst = vreinterpretq_m128i_u8( \
                vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0)); \
        } \
    } \
    return dst
// Compare packed strings in a and b with lengths la and lb using the control
// in imm8, and returns 1 if b did not contain a null character and the
// resulting mask was zero, and 0 otherwise.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestra
FORCE_INLINE int _mm_cmpestra(__m128i a,
                              int la,
                              __m128i b,
                              int lb,
                              const int imm8)
{
    int lb_cpy = lb;
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
    return !r2 & (lb_cpy > bound);
}

// Compare packed strings in a and b with lengths la and lb using the control in
// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrc
FORCE_INLINE int _mm_cmpestrc(__m128i a,
                              int la,
                              __m128i b,
                              int lb,
                              const int imm8)
{
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
    return r2 != 0;
}

// Compare packed strings in a and b with lengths la and lb using the control
// in imm8, and store the generated index in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri
FORCE_INLINE int _mm_cmpestri(__m128i a,
                              int la,
                              __m128i b,
                              int lb,
                              const int imm8)
{
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
    SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);
}
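
/* Usage sketch for _mm_cmpestri (illustrative; the buffers below are
 * hypothetical, not defined by this header):
 *
 *   const char hay[16] = "hello, world";   // zero-padded to 16 bytes
 *   const char set[16] = "o";
 *   __m128i a = _mm_loadu_si128((const __m128i *) set);
 *   __m128i b = _mm_loadu_si128((const __m128i *) hay);
 *   // index of the first 'o' within the first 12 bytes of hay
 *   int idx = _mm_cmpestri(a, 1, b, 12,
 *                          _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY);
 *   // idx == 4 here; a result equal to the upper bound (16 for bytes,
 *   // 8 for words) means no match within the explicit lengths
 */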

// Compare packed strings in a and b with lengths la and lb using the control
// in imm8, and store the generated mask in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm
FORCE_INLINE __m128i
_mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8)
{
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
    SSE2NEON_CMPSTR_GENERATE_MASK(dst);
}

// Compare packed strings in a and b with lengths la and lb using the control in
// imm8, and returns bit 0 of the resulting bit mask.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestro
FORCE_INLINE int _mm_cmpestro(__m128i a,
                              int la,
                              __m128i b,
                              int lb,
                              const int imm8)
{
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
    return r2 & 1;
}

// Compare packed strings in a and b with lengths la and lb using the control in
// imm8, and returns 1 if any character in a was null, and 0 otherwise.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrs
FORCE_INLINE int _mm_cmpestrs(__m128i a,
                              int la,
                              __m128i b,
                              int lb,
                              const int imm8)
{
    (void) a;
    (void) b;
    (void) lb;
    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
    return la <= (bound - 1);
}

// Compare packed strings in a and b with lengths la and lb using the control in
// imm8, and returns 1 if any character in b was null, and 0 otherwise.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrz
FORCE_INLINE int _mm_cmpestrz(__m128i a,
                              int la,
                              __m128i b,
                              int lb,
                              const int imm8)
{
    (void) a;
    (void) b;
    (void) la;
    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
    return lb <= (bound - 1);
}

#define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8) \
    do { \
        if (imm8 & 0x01) { \
            uint16x8_t equal_mask_##str = \
                vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \
            uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \
            uint64_t matches_##str = \
                vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \
            len = _sse2neon_ctzll(matches_##str) >> 3; \
        } else { \
            uint16x8_t equal_mask_##str = vreinterpretq_u16_u8( \
                vceqq_u8(vreinterpretq_u8_m128i(str), vdupq_n_u8(0))); \
            uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \
            uint64_t matches_##str = \
                vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \
            len = _sse2neon_ctzll(matches_##str) >> 2; \
        } \
    } while (0)
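
/* The macro above derives the implicit string length with a common NEON
 * narrowing-shift trick: vceqq_* produces an all-ones lane per zero element,
 * vshrn_n_u16(..., 4) packs those lanes into a 64-bit value carrying 4 bits
 * per element, and the count of trailing zero bits divided by 4 (>> 2) for
 * bytes, or by 8 (>> 3) for words, is the index of the first NUL. A rough
 * scalar equivalent for byte data, shown only for illustration:
 *
 *   static inline int example_strlen16(const uint8_t *p)
 *   {
 *       int len = 0;
 *       while (len < 16 && p[len] != 0)   // stop at the first NUL, cap at 16
 *           len++;
 *       return len;
 *   }
 */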

#define SSE2NEON_CMPISTRX_LEN_PAIR(a, b, la, lb) \
    int la, lb; \
    do { \
        SSE2NEON_CMPISTRX_LENGTH(a, la, imm8); \
        SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8); \
    } while (0)

// Compare packed strings with implicit lengths in a and b using the control in
// imm8, and returns 1 if b did not contain a null character and the resulting
// mask was zero, and 0 otherwise.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistra
FORCE_INLINE int _mm_cmpistra(__m128i a, __m128i b, const int imm8)
{
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
    return !r2 & (lb >= bound);
}

// Compare packed strings with implicit lengths in a and b using the control in
// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrc
FORCE_INLINE int _mm_cmpistrc(__m128i a, __m128i b, const int imm8)
{
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
    return r2 != 0;
}

// Compare packed strings with implicit lengths in a and b using the control in
// imm8, and store the generated index in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistri
FORCE_INLINE int _mm_cmpistri(__m128i a, __m128i b, const int imm8)
{
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
    SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);
}

// Compare packed strings with implicit lengths in a and b using the control in
// imm8, and store the generated mask in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrm
FORCE_INLINE __m128i _mm_cmpistrm(__m128i a, __m128i b, const int imm8)
{
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
    SSE2NEON_CMPSTR_GENERATE_MASK(dst);
}

// Compare packed strings with implicit lengths in a and b using the control in
// imm8, and returns bit 0 of the resulting bit mask.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistro
FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8)
{
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
    return r2 & 1;
}

// Compare packed strings with implicit lengths in a and b using the control in
// imm8, and returns 1 if any character in a was null, and 0 otherwise.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrs
FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8)
{
    (void) b;
    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
    int la;
    SSE2NEON_CMPISTRX_LENGTH(a, la, imm8);
    return la <= (bound - 1);
}

// Compare packed strings with implicit lengths in a and b using the control in
// imm8, and returns 1 if any character in b was null, and 0 otherwise.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrz
FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8)
{
    (void) a;
    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
    int lb;
    SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8);
    return lb <= (bound - 1);
}

// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
// in b for greater than.
FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_u64(
        vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
#else
    return vreinterpretq_m128i_s64(vshrq_n_s64(
        vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)),
        63));
#endif
}
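
/* A note on the ARMv7-A fallback above: vqsubq_s64(b, a) computes b - a with
 * saturation, so the sign bit stays correct even when the exact difference
 * would overflow 64 bits; spreading that sign bit across the lane with
 * vshrq_n_s64(..., 63) then yields all-ones exactly where a > b, i.e. the
 * same mask vcgtq_s64 produces on AArch64. For example (illustrative), with
 * a = {INT64_MAX, 0} and b = {INT64_MIN, 1} the result lanes are {~0, 0}.
 */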
// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 16-bit integer v, and stores the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u16
FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));
#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
    (defined(_M_ARM64) && !defined(__clang__))
    crc = __crc32ch(crc, v);
#else
    crc = _mm_crc32_u8(crc, v & 0xff);
    crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
#endif
    return crc;
}

// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 32-bit integer v, and stores the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u32
FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));
#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
    (defined(_M_ARM64) && !defined(__clang__))
    crc = __crc32cw(crc, v);
#else
    crc = _mm_crc32_u16(crc, v & 0xffff);
    crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
#endif
    return crc;
}

// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 64-bit integer v, and stores the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u64
FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));
#elif (defined(_M_ARM64) && !defined(__clang__))
    crc = __crc32cd((uint32_t) crc, v);
#else
    crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff);
    crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);
#endif
    return crc;
}

// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 8-bit integer v, and stores the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u8
FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));
#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
    (defined(_M_ARM64) && !defined(__clang__))
    crc = __crc32cb(crc, v);
#else
    crc ^= v;
    for (int bit = 0; bit < 8; bit++) {
        if (crc & 1)
            crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
        else
            crc = (crc >> 1);
    }
#endif
    return crc;
}
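
/* Usage sketch: the intrinsics above chain together to checksum a buffer with
 * CRC-32C (Castagnoli, reflected polynomial 0x82f63b78). The helper below is
 * illustrative only; its name and structure are an assumption, not part of
 * the SSE API being emulated:
 *
 *   static inline uint32_t example_crc32c(const uint8_t *p, size_t n)
 *   {
 *       uint32_t crc = 0xFFFFFFFF;
 *       while (n >= 8) {
 *           uint64_t v;
 *           memcpy(&v, p, 8);   // avoid unaligned access issues
 *           crc = (uint32_t) _mm_crc32_u64(crc, v);
 *           p += 8;
 *           n -= 8;
 *       }
 *       while (n--)
 *           crc = _mm_crc32_u8(crc, *p++);
 *       return crc ^ 0xFFFFFFFF;
 *   }
 */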

/* AES */

#if !defined(__ARM_FEATURE_CRYPTO) && (!defined(_M_ARM64) || defined(__clang__))
/* clang-format off */
#define SSE2NEON_AES_SBOX(w) \
    { \
        w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
        w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
        w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
        w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
        w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
        w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
        w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
        w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
        w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
        w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
        w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
        w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
        w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
        w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
        w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
        w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
        w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
        w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
        w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
        w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
        w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
        w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
        w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
        w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
        w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
        w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
        w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
        w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
        w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
        w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
        w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
        w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
        w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
        w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
        w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
        w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
        w(0xb0), w(0x54), w(0xbb), w(0x16) \
    }
#define SSE2NEON_AES_RSBOX(w) \
    { \
        w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), \
        w(0x38), w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), \
        w(0xd7), w(0xfb), w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), \
        w(0x2f), w(0xff), w(0x87), w(0x34), w(0x8e), w(0x43), w(0x44), \
        w(0xc4), w(0xde), w(0xe9), w(0xcb), w(0x54), w(0x7b), w(0x94), \
        w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d), w(0xee), w(0x4c), \
        w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e), w(0x08), \
        w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2), \
        w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), \
        w(0x25), w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), \
        w(0x98), w(0x16), w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), \
        w(0x65), w(0xb6), w(0x92), w(0x6c), w(0x70), w(0x48), w(0x50), \
        w(0xfd), w(0xed), w(0xb9), w(0xda), w(0x5e), w(0x15), w(0x46), \
        w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84), w(0x90), w(0xd8), \
        w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a), w(0xf7), \
        w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06), \
        w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), \
        w(0x02), w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), \
        w(0x8a), w(0x6b), w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), \
        w(0x67), w(0xdc), w(0xea), w(0x97), w(0xf2), w(0xcf), w(0xce), \
        w(0xf0), w(0xb4), w(0xe6), w(0x73), w(0x96), w(0xac), w(0x74), \
        w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85), w(0xe2), w(0xf9), \
        w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e), w(0x47), \
        w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89), \
        w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), \
        w(0x1b), w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), \
        w(0x79), w(0x20), w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), \
        w(0xcd), w(0x5a), w(0xf4), w(0x1f), w(0xdd), w(0xa8), w(0x33), \
        w(0x88), w(0x07), w(0xc7), w(0x31), w(0xb1), w(0x12), w(0x10), \
        w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f), w(0x60), w(0x51), \
        w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d), w(0x2d), \
        w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef), \
        w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), \
        w(0xb0), w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), \
        w(0x99), w(0x61), w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), \
        w(0x77), w(0xd6), w(0x26), w(0xe1), w(0x69), w(0x14), w(0x63), \
        w(0x55), w(0x21), w(0x0c), w(0x7d) \
    }
/* clang-format on */

/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
#define SSE2NEON_AES_H0(x) (x)
static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0);
static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
#undef SSE2NEON_AES_H0

/* x_time function and matrix multiply function */
#if !defined(__aarch64__) && !defined(_M_ARM64)
#define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
#define SSE2NEON_MULTIPLY(x, y) \
    (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^ \
     ((y >> 2 & 1) * SSE2NEON_XT(SSE2NEON_XT(x))) ^ \
     ((y >> 3 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))) ^ \
     ((y >> 4 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x))))))
#endif

// In the absence of crypto extensions, implement aesenc using regular NEON
// intrinsics instead. See:
// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
// and https://www.workofard.com/2017/07/ghash-for-low-end-cores/
// for more information.
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    static const uint8_t shift_rows[] = {
        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
    };
    static const uint8_t ror32by8[] = {
        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
    };

    uint8x16_t v;
    uint8x16_t w = vreinterpretq_u8_m128i(a);

    /* shift rows */
    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));

    /* sub bytes */
    // The 256-byte S-box is split into four 64-byte tables that are looked up
    // one after another. Each subsequent lookup uses the next 64-byte slice,
    // so the index passed to `vqtbx4q_u8()` is offset down by the same
    // constant as the slice that was loaded; out-of-range indices leave the
    // destination lanes untouched.
    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
    // 'w - 0x40' is equivalent to 'vsubq_u8(w, vdupq_n_u8(0x40))'
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);

    /* mix columns */
    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));

    /* add round key */
    return vreinterpretq_m128i_u8(w) ^ RoundKey;

#else /* ARMv7-A implementation for a table-based AES */
#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \
    (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
     ((uint32_t) (b1) << 8) | (uint32_t) (b0))
// multiplying 'x' by 2 in GF(2^8)
#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
// multiplying 'x' by 3 in GF(2^8)
#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
#define SSE2NEON_AES_U0(p) \
    SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
#define SSE2NEON_AES_U1(p) \
    SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
#define SSE2NEON_AES_U2(p) \
    SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
#define SSE2NEON_AES_U3(p) \
    SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))

    // this generates a table containing every possible permutation of
    // shift_rows() and sub_bytes() with mix_columns().
    static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
        SSE2NEON_AES_SBOX(SSE2NEON_AES_U0),
        SSE2NEON_AES_SBOX(SSE2NEON_AES_U1),
        SSE2NEON_AES_SBOX(SSE2NEON_AES_U2),
        SSE2NEON_AES_SBOX(SSE2NEON_AES_U3),
    };
#undef SSE2NEON_AES_B2W
#undef SSE2NEON_AES_F2
#undef SSE2NEON_AES_F3
#undef SSE2NEON_AES_U0
#undef SSE2NEON_AES_U1
#undef SSE2NEON_AES_U2
#undef SSE2NEON_AES_U3

    uint32_t x0 = _mm_cvtsi128_si32(a);  // get a[31:0]
    uint32_t x1 =
        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));  // get a[63:32]
    uint32_t x2 =
        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xAA));  // get a[95:64]
    uint32_t x3 =
        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));  // get a[127:96]

    // finish the modulo addition step in mix_columns()
    __m128i out = _mm_set_epi32(
        (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
         aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
        (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
         aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
        (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
         aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
        (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
         aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));

    return _mm_xor_si128(out, RoundKey);
#endif
}
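
/* Usage sketch: with an expanded AES-128 key schedule (11 round keys), a full
 * block encryption built on the intrinsics in this file would look roughly
 * like the helper below. The `rk` array and helper name are hypothetical:
 *
 *   static inline __m128i example_aes128_encrypt(__m128i block,
 *                                                const __m128i rk[11])
 *   {
 *       block = _mm_xor_si128(block, rk[0]);        // initial AddRoundKey
 *       for (int i = 1; i < 10; i++)
 *           block = _mm_aesenc_si128(block, rk[i]); // rounds 1..9
 *       return _mm_aesenclast_si128(block, rk[10]); // final round, no MixColumns
 *   }
 */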

// Perform one round of an AES decryption flow on data (state) in a using the
// round key in RoundKey, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
{
#if defined(__aarch64__)
    static const uint8_t inv_shift_rows[] = {
        0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
        0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
    };
    static const uint8_t ror32by8[] = {
        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
    };

    uint8x16_t v;
    uint8x16_t w = vreinterpretq_u8_m128i(a);

    // inverse shift rows
    w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));

    // inverse sub bytes
    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);

    // inverse mix columns
    // multiplying 'v' by 4 in GF(2^8)
    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
    w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
    v ^= w;
    v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);

    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) &
                                 0x1b);  // multiplying 'v' by 2 in GF(2^8)
    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));

    // add round key
    return vreinterpretq_m128i_u8(w) ^ RoundKey;

#else /* ARMv7-A NEON implementation */
    /* FIXME: optimized for NEON */
    uint8_t i, e, f, g, h, v[4][4];
    uint8_t *_a = (uint8_t *) &a;
    for (i = 0; i < 16; ++i) {
        v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
    }

    // inverse mix columns
    for (i = 0; i < 4; ++i) {
        e = v[i][0];
        f = v[i][1];
        g = v[i][2];
        h = v[i][3];

        v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
                  SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
        v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
                  SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
        v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
                  SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
        v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
                  SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
    }

    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
#endif
}
// Perform the last round of an AES encryption flow on data (state) in a using
// the round key in RoundKey, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
{
#if defined(__aarch64__)
    static const uint8_t shift_rows[] = {
        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
    };

    uint8x16_t v;
    uint8x16_t w = vreinterpretq_u8_m128i(a);

    // shift rows
    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));

    // sub bytes
    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);

    // add round key
    return vreinterpretq_m128i_u8(v) ^ RoundKey;

#else /* ARMv7-A implementation */
    uint8_t v[16] = {
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 0)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 5)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 10)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 15)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 4)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 9)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 14)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 3)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 8)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 13)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 2)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 7)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 12)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 1)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 6)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)],
    };

    return vreinterpretq_m128i_u8(vld1q_u8(v)) ^ RoundKey;
#endif
}

// Perform the last round of an AES decryption flow on data (state) in a using
// the round key in RoundKey, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
{
#if defined(__aarch64__)
    static const uint8_t inv_shift_rows[] = {
        0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
        0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
    };

    uint8x16_t v;
    uint8x16_t w = vreinterpretq_u8_m128i(a);

    // inverse shift rows
    w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));

    // inverse sub bytes
    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);

    // add round key
    return vreinterpretq_m128i_u8(v) ^ RoundKey;

#else /* ARMv7-A NEON implementation */
    /* FIXME: optimized for NEON */
    uint8_t v[4][4];
    uint8_t *_a = (uint8_t *) &a;
    for (int i = 0; i < 16; ++i) {
        v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
    }

    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
#endif
}

// Perform the InvMixColumns transformation on a and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
{
#if defined(__aarch64__)
    static const uint8_t ror32by8[] = {
        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
    };
    uint8x16_t v = vreinterpretq_u8_m128i(a);
    uint8x16_t w;

    // multiplying 'v' by 4 in GF(2^8)
    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
    w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
    v ^= w;
    v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);

    // multiplying 'v' by 2 in GF(2^8)
    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
    return vreinterpretq_m128i_u8(w);

#else /* ARMv7-A NEON implementation */
    uint8_t i, e, f, g, h, v[4][4];
    vst1q_u8((uint8_t *) v, vreinterpretq_u8_m128i(a));
    for (i = 0; i < 4; ++i) {
        e = v[i][0];
        f = v[i][1];
        g = v[i][2];
        h = v[i][3];

        v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
                  SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
        v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
                  SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
        v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
                  SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
        v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
                  SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
    }

    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v));
#endif
}
// Assist in expanding the AES cipher key by computing steps towards generating
// a round key for encryption cipher using data from a and an 8-bit round
// constant specified in imm8, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
//
// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
// This instruction generates a round key for AES encryption. See
// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
// for details.
FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
{
#if defined(__aarch64__)
    uint8x16_t _a = vreinterpretq_u8_m128i(a);
    uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0);

    uint32x4_t v_u32 = vreinterpretq_u32_u8(v);
    uint32x4_t ror_v = vorrq_u32(vshrq_n_u32(v_u32, 8), vshlq_n_u32(v_u32, 24));
    uint32x4_t ror_xor_v = veorq_u32(ror_v, vdupq_n_u32(rcon));

    return vreinterpretq_m128i_u32(vtrn2q_u32(v_u32, ror_xor_v));

#else /* ARMv7-A NEON implementation */
    uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));
    uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));
    for (int i = 0; i < 4; ++i) {
        ((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]];
        ((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]];
    }
    return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
                         ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
#endif
}
#undef SSE2NEON_AES_SBOX
#undef SSE2NEON_AES_RSBOX

#if defined(__aarch64__)
#undef SSE2NEON_XT
#undef SSE2NEON_MULTIPLY
#endif
#else /* __ARM_FEATURE_CRYPTO */
// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
// AESMC and then manually applying the real key as an xor operation. This
// unfortunately means an additional xor op; the compiler should be able to
// optimize this away for repeated calls however. See
// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
// for more details.
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(veorq_u8(
        vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
        vreinterpretq_u8_m128i(b)));
}

// Perform one round of an AES decryption flow on data (state) in a using the
// round key in RoundKey, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
{
    return vreinterpretq_m128i_u8(veorq_u8(
        vaesimcq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
        vreinterpretq_u8_m128i(RoundKey)));
}

// Perform the last round of an AES encryption flow on data (state) in a using
// the round key in RoundKey, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
{
    return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
                             vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
                         RoundKey);
}

// Perform the last round of an AES decryption flow on data (state) in a using
// the round key in RoundKey, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
{
    return vreinterpretq_m128i_u8(
        veorq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)),
                 vreinterpretq_u8_m128i(RoundKey)));
}

// Perform the InvMixColumns transformation on a and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
{
    return vreinterpretq_m128i_u8(vaesimcq_u8(vreinterpretq_u8_m128i(a)));
}

// Assist in expanding the AES cipher key by computing steps towards generating
// a round key for encryption cipher using data from a and an 8-bit round
// constant specified in imm8, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
{
    // AESE does ShiftRows and SubBytes on A
    uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));

#ifndef _MSC_VER
    uint8x16_t dest = {
        // Undo ShiftRows step from AESE and extract X1 and X3
        u8[0x4], u8[0x1], u8[0xE], u8[0xB],  // SubBytes(X1)
        u8[0x1], u8[0xE], u8[0xB], u8[0x4],  // ROT(SubBytes(X1))
        u8[0xC], u8[0x9], u8[0x6], u8[0x3],  // SubBytes(X3)
        u8[0x9], u8[0x6], u8[0x3], u8[0xC],  // ROT(SubBytes(X3))
    };
    uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
    return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
#else
    // We have to do this hack because MSVC is strictly adhering to the CPP
    // standard, in particular C++03 8.5.1 sub-section 15, which states that
    // unions must be initialized by their first member type.

    // As per the Windows ARM64 ABI, it is always little endian, so this works
    __n128 dest{
        ((uint64_t) u8.n128_u8[0x4] << 0) | ((uint64_t) u8.n128_u8[0x1] << 8) |
            ((uint64_t) u8.n128_u8[0xE] << 16) |
            ((uint64_t) u8.n128_u8[0xB] << 24) |
            ((uint64_t) u8.n128_u8[0x1] << 32) |
            ((uint64_t) u8.n128_u8[0xE] << 40) |
            ((uint64_t) u8.n128_u8[0xB] << 48) |
            ((uint64_t) u8.n128_u8[0x4] << 56),
        ((uint64_t) u8.n128_u8[0xC] << 0) | ((uint64_t) u8.n128_u8[0x9] << 8) |
            ((uint64_t) u8.n128_u8[0x6] << 16) |
            ((uint64_t) u8.n128_u8[0x3] << 24) |
            ((uint64_t) u8.n128_u8[0x9] << 32) |
            ((uint64_t) u8.n128_u8[0x6] << 40) |
            ((uint64_t) u8.n128_u8[0x3] << 48) |
            ((uint64_t) u8.n128_u8[0xC] << 56)};

    dest.n128_u32[1] = dest.n128_u32[1] ^ rcon;
    dest.n128_u32[3] = dest.n128_u32[3] ^ rcon;

    return dest;
#endif
}
#endif
/* Others */

// Perform a carry-less multiplication of two 64-bit integers, selected from a
// and b according to imm8, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128
FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
{
    uint64x2_t a = vreinterpretq_u64_m128i(_a);
    uint64x2_t b = vreinterpretq_u64_m128i(_b);
    switch (imm & 0x11) {
    case 0x00:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
    case 0x01:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
    case 0x10:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
    case 0x11:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
    default:
        abort();
    }
}
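
/* As the switch above shows, bit 0 of imm8 selects the 64-bit half of a and
 * bit 4 selects the half of b, matching PCLMULQDQ. For example (illustrative):
 *
 *   __m128i lo_lo = _mm_clmulepi64_si128(a, b, 0x00); // a[63:0]   * b[63:0]
 *   __m128i hi_hi = _mm_clmulepi64_si128(a, b, 0x11); // a[127:64] * b[127:64]
 *
 * where "*" denotes carry-less (GF(2)[x]) multiplication yielding a 128-bit
 * product.
 */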

FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode(void)
{
    union {
        fpcr_bitfield field;
#if defined(__aarch64__) || defined(_M_ARM64)
        uint64_t value;
#else
        uint32_t value;
#endif
    } r;

#if defined(__aarch64__) || defined(_M_ARM64)
    r.value = _sse2neon_get_fpcr();
#else
    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF;
}

// Count the number of bits set to 1 in unsigned 32-bit integer a, and
// return that count in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32
FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
#if __has_builtin(__builtin_popcount)
    return __builtin_popcount(a);
#elif defined(_MSC_VER)
    return _CountOneBits(a);
#else
    return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
#endif
#else
    uint32_t count = 0;
    uint8x8_t input_val, count8x8_val;
    uint16x4_t count16x4_val;
    uint32x2_t count32x2_val;

    input_val = vld1_u8((uint8_t *) &a);
    count8x8_val = vcnt_u8(input_val);
    count16x4_val = vpaddl_u8(count8x8_val);
    count32x2_val = vpaddl_u16(count16x4_val);

    vst1_u32(&count, count32x2_val);
    return count;
#endif
}

// Count the number of bits set to 1 in unsigned 64-bit integer a, and
// return that count in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64
FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
#if __has_builtin(__builtin_popcountll)
    return __builtin_popcountll(a);
#elif defined(_MSC_VER)
    return _CountOneBits64(a);
#else
    return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
#endif
#else
    uint64_t count = 0;
    uint8x8_t input_val, count8x8_val;
    uint16x4_t count16x4_val;
    uint32x2_t count32x2_val;
    uint64x1_t count64x1_val;

    input_val = vld1_u8((uint8_t *) &a);
    count8x8_val = vcnt_u8(input_val);
    count16x4_val = vpaddl_u8(count8x8_val);
    count32x2_val = vpaddl_u16(count16x4_val);
    count64x1_val = vpaddl_u32(count32x2_val);
    vst1_u64(&count, count64x1_val);
    return count;
#endif
}
FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
{
    // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
    // regardless of the value of the FZ bit.
    union {
        fpcr_bitfield field;
#if defined(__aarch64__) || defined(_M_ARM64)
        uint64_t value;
#else
        uint32_t value;
#endif
    } r;

#if defined(__aarch64__) || defined(_M_ARM64)
    r.value = _sse2neon_get_fpcr();
#else
    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON;

#if defined(__aarch64__) || defined(_M_ARM64)
    _sse2neon_set_fpcr(r.value);
#else
    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
#endif
}

// Return the current 64-bit value of the processor's time-stamp counter.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
FORCE_INLINE uint64_t _rdtsc(void)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    uint64_t val;

    /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the
     * system counter is at least 56 bits wide; from Armv8.6, the counter
     * must be 64 bits wide. So the system counter could be less than 64
     * bits wide, in which case it is reported with the
     * 'cap_user_time_short' flag set.
     */
#if defined(_MSC_VER)
    val = _ReadStatusReg(ARM64_SYSREG(3, 3, 14, 0, 2));
#else
    __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val));
#endif

    return val;
#else
    uint32_t pmccntr, pmuseren, pmcntenset;
    // Read the user mode Performance Monitoring Unit (PMU)
    // User Enable Register (PMUSERENR) access permissions.
    __asm__ __volatile__("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
    if (pmuseren & 1) { // Allows reading PMUSERENR for user mode code.
        __asm__ __volatile__("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
        if (pmcntenset & 0x80000000UL) { // Is it counting?
            __asm__ __volatile__("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
            // The counter is set up to count every 64th cycle
            return (uint64_t) (pmccntr) << 6;
        }
    }

    // Fallback to syscall as we can't enable PMUSERENR in user mode.
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (uint64_t) (tv.tv_sec) * 1000000 + tv.tv_usec;
#endif
}
#if defined(__GNUC__) || defined(__clang__)
#pragma pop_macro("ALIGN_STRUCT")
#pragma pop_macro("FORCE_INLINE")
#endif

#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC pop_options
#endif

#endif