#ifndef SSE2NEON_H
#define SSE2NEON_H

// This header file provides a simple API translation layer
// between SSE intrinsics and their corresponding Arm/AArch64 NEON versions
//
// Contributors to this work are:
//   John W. Ratcliff <[email protected]>
//   Brandon Rowlett <[email protected]>
//   Ken Fast <[email protected]>
//   Eric van Beurden <[email protected]>
//   Alexander Potylitsin <[email protected]>
//   Hasindu Gamaarachchi <[email protected]>
//   Jim Huang <[email protected]>
//   Mark Cheng <[email protected]>
//   Malcolm James MacLeod <[email protected]>
//   Devin Hussey (easyaspi314) <[email protected]>
//   Sebastian Pop <[email protected]>
//   Developer Ecosystem Engineering <[email protected]>
//   Danila Kutenin <[email protected]>
//   François Turban (JishinMaster) <[email protected]>
//   Pei-Hsuan Hung <[email protected]>
//   Yang-Hao Yuan <[email protected]>
//   Syoyo Fujita <[email protected]>
//   Brecht Van Lommel <[email protected]>
//   Jonathan Hue <[email protected]>
//   Cuda Chen <[email protected]>
//   Aymen Qader <[email protected]>
//   Anthony Roberts <[email protected]>

/*
 * sse2neon is freely redistributable under the MIT License.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

/* Tunable configurations */

/* Enable precise implementation of math operations
 * This slows down the computation a bit, but gives results consistent with
 * x86 SSE (e.g. it can fix a hole or NaN pixel in a rendering result).
 */
/* _mm_min|max_ps|ss|pd|sd */
#ifndef SSE2NEON_PRECISE_MINMAX
#define SSE2NEON_PRECISE_MINMAX (0)
#endif
/* _mm_rcp_ps and _mm_div_ps */
#ifndef SSE2NEON_PRECISE_DIV
#define SSE2NEON_PRECISE_DIV (0)
#endif
/* _mm_sqrt_ps and _mm_rsqrt_ps */
#ifndef SSE2NEON_PRECISE_SQRT
#define SSE2NEON_PRECISE_SQRT (0)
#endif
/* _mm_dp_pd */
#ifndef SSE2NEON_PRECISE_DP
#define SSE2NEON_PRECISE_DP (0)
#endif

/* Enable inclusion of windows.h on MSVC platforms
 * This makes _mm_clflush functional on windows, as there is no builtin.
 */
#ifndef SSE2NEON_INCLUDE_WINDOWS_H
#define SSE2NEON_INCLUDE_WINDOWS_H (0)
#endif

/* compiler specific definitions */
#if defined(__GNUC__) || defined(__clang__)
#pragma push_macro("FORCE_INLINE")
#pragma push_macro("ALIGN_STRUCT")
#define FORCE_INLINE static inline __attribute__((always_inline))
#define ALIGN_STRUCT(x) __attribute__((aligned(x)))
#define _sse2neon_likely(x) __builtin_expect(!!(x), 1)
#define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0)
#elif defined(_MSC_VER)
#if _MSVC_TRADITIONAL
#error Using the traditional MSVC preprocessor is not supported! Use /Zc:preprocessor instead.
#endif
#ifndef FORCE_INLINE
#define FORCE_INLINE static inline
#endif
#ifndef ALIGN_STRUCT
#define ALIGN_STRUCT(x) __declspec(align(x))
#endif
#define _sse2neon_likely(x) (x)
#define _sse2neon_unlikely(x) (x)
#else
#pragma message("Macro name collisions may happen with unsupported compilers.")
#endif

/* C language does not allow initializing a variable with a function call. */
#ifdef __cplusplus
#define _sse2neon_const static const
#else
#define _sse2neon_const const
#endif

#include <stdint.h>
#include <stdlib.h>

#if defined(_WIN32)
/* Definitions for _mm_{malloc,free} are provided by <malloc.h>
 * from both MinGW-w64 and MSVC.
 */
#define SSE2NEON_ALLOC_DEFINED
#endif

/* If using MSVC */
#ifdef _MSC_VER
#include <intrin.h>
#if SSE2NEON_INCLUDE_WINDOWS_H
#include <processthreadsapi.h>
#include <windows.h>
#endif

#if !defined(__cplusplus)
#error sse2neon only supports C++ compilation with this compiler
#endif

#ifdef SSE2NEON_ALLOC_DEFINED
#include <malloc.h>
#endif

#if (defined(_M_AMD64) || defined(__x86_64__)) || \
    (defined(_M_ARM64) || defined(__arm64__))
#define SSE2NEON_HAS_BITSCAN64
#endif
#endif

#if defined(__GNUC__) || defined(__clang__)
#define _sse2neon_define0(type, s, body) \
    __extension__({                      \
        type _a = (s);                   \
        body                             \
    })
#define _sse2neon_define1(type, s, body) \
    __extension__({                      \
        type _a = (s);                   \
        body                             \
    })
#define _sse2neon_define2(type, a, b, body) \
    __extension__({                         \
        type _a = (a), _b = (b);            \
        body                                \
    })
#define _sse2neon_return(ret) (ret)
#else
#define _sse2neon_define0(type, a, body) [=](type _a) { body }(a)
#define _sse2neon_define1(type, a, body) [](type _a) { body }(a)
#define _sse2neon_define2(type, a, b, body) \
    [](type _a, type _b) { body }((a), (b))
#define _sse2neon_return(ret) return ret
#endif

#define _sse2neon_init(...) \
    {                       \
        __VA_ARGS__         \
    }

/* Compiler barrier */
#if defined(_MSC_VER)
#define SSE2NEON_BARRIER() _ReadWriteBarrier()
#else
#define SSE2NEON_BARRIER()                     \
    do {                                       \
        __asm__ __volatile__("" ::: "memory"); \
        (void) 0;                              \
    } while (0)
#endif

/* Memory barriers
 * __atomic_thread_fence does not include a compiler barrier; instead,
 * the barrier is part of __atomic_load/__atomic_store's "volatile-like"
 * semantics.
 */
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
#include <stdatomic.h>
#endif

FORCE_INLINE void _sse2neon_smp_mb(void)
{
    SSE2NEON_BARRIER();
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \
    !defined(__STDC_NO_ATOMICS__)
    atomic_thread_fence(memory_order_seq_cst);
#elif defined(__GNUC__) || defined(__clang__)
    __atomic_thread_fence(__ATOMIC_SEQ_CST);
#else /* MSVC */
    __dmb(_ARM64_BARRIER_ISH);
#endif
}
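// Illustrative note (added in this copy; not part of the upstream sse2neon
// header): _sse2neon_smp_mb() combines a compiler barrier with a sequentially
// consistent hardware fence, the kind of full barrier an _mm_mfence-style
// wrapper needs on Arm's weaker memory model, e.g.
//   data = payload;      // plain store (hypothetical variables)
//   _sse2neon_smp_mb();  // make the store visible before the flag below
//   ready = 1;           // hypothetical flag another thread polls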

/* Architecture-specific build options */
/* FIXME: #pragma GCC push_options is only available on GCC */
#if defined(__GNUC__)
#if defined(__arm__) && __ARM_ARCH == 7
/* According to ARM C Language Extensions Architecture specification,
 * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
 * architecture supported.
 */
#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
#endif
#if !defined(__clang__)
#pragma GCC push_options
#pragma GCC target("fpu=neon")
#endif
#elif defined(__aarch64__) || defined(_M_ARM64)
#if !defined(__clang__) && !defined(_MSC_VER)
#pragma GCC push_options
#pragma GCC target("+simd")
#endif
#elif __ARM_ARCH == 8
#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
#error \
    "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON."
#endif
#if !defined(__clang__) && !defined(_MSC_VER)
#pragma GCC push_options
#endif
#else
#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
#endif
#endif

#include <arm_neon.h>
#if (!defined(__aarch64__) && !defined(_M_ARM64)) && (__ARM_ARCH == 8)
#if defined __has_include && __has_include(<arm_acle.h>)
#include <arm_acle.h>
#endif
#endif

/* Apple Silicon cache lines are double the size of what is commonly used by
 * Intel, AMD and other Arm microarchitectures.
 * From sysctl -a on Apple M1:
 * hw.cachelinesize: 128
 */
#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__))
#define SSE2NEON_CACHELINE_SIZE 128
#else
#define SSE2NEON_CACHELINE_SIZE 64
#endif

/* Rounding functions require either Aarch64 instructions or libm fallback */
#if !defined(__aarch64__) && !defined(_M_ARM64)
#include <math.h>
#endif

/* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only
 * or even not accessible in user mode.
 * To write to or access these registers in user mode,
 * we have to perform a syscall instead.
 */
#if (!defined(__aarch64__) && !defined(_M_ARM64))
#include <sys/time.h>
#endif

/* "__has_builtin" can be used to query support for built-in functions
 * provided by gcc/clang and other compilers that support it.
 */
#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
/* Compatibility with gcc <= 9 */
#if defined(__GNUC__) && (__GNUC__ <= 9)
#define __has_builtin(x) HAS##x
#define HAS__builtin_popcount 1
#define HAS__builtin_popcountll 1

// __builtin_shuffle introduced in GCC 4.7.0
#if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7))
#define HAS__builtin_shuffle 1
#else
#define HAS__builtin_shuffle 0
#endif

#define HAS__builtin_shufflevector 0
#define HAS__builtin_nontemporal_store 0
#else
#define __has_builtin(x) 0
#endif
#endif

/**
 * MACRO for shuffle parameter for _mm_shuffle_ps().
 * Argument fp3 is a digit[0123] that represents the fp from argument "b"
 * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
 * for fp2 in result. fp1 is a digit[0123] that represents the fp from
 * argument "a" of mm_shuffle_ps that will be placed in fp1 of result.
 * fp0 is the same for fp0 of result.
 */
#define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0)))
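// Usage sketch (added in this copy; not part of the upstream sse2neon header):
// _MM_SHUFFLE packs four 2-bit lane selectors into a single immediate, highest
// lane first, which is what _mm_shuffle_ps()/_mm_shuffle_epi32() expect as
// their imm8 argument. For example:
//   _MM_SHUFFLE(3, 2, 1, 0) == 0xE4  // identity: dst = {a[0], a[1], b[2], b[3]}
//   _MM_SHUFFLE(0, 1, 2, 3) == 0x1B  // reversed lane order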

#if __has_builtin(__builtin_shufflevector)
#define _sse2neon_shuffle(type, a, b, ...) \
    __builtin_shufflevector(a, b, __VA_ARGS__)
#elif __has_builtin(__builtin_shuffle)
#define _sse2neon_shuffle(type, a, b, ...) \
    __extension__({                        \
        type tmp = {__VA_ARGS__};          \
        __builtin_shuffle(a, b, tmp);      \
    })
#endif

#ifdef _sse2neon_shuffle
#define vshuffle_s16(a, b, ...) _sse2neon_shuffle(int16x4_t, a, b, __VA_ARGS__)
#define vshuffleq_s16(a, b, ...) _sse2neon_shuffle(int16x8_t, a, b, __VA_ARGS__)
#define vshuffle_s32(a, b, ...) _sse2neon_shuffle(int32x2_t, a, b, __VA_ARGS__)
#define vshuffleq_s32(a, b, ...) _sse2neon_shuffle(int32x4_t, a, b, __VA_ARGS__)
#define vshuffle_s64(a, b, ...) _sse2neon_shuffle(int64x1_t, a, b, __VA_ARGS__)
#define vshuffleq_s64(a, b, ...) _sse2neon_shuffle(int64x2_t, a, b, __VA_ARGS__)
#endif

/* Rounding mode macros. */
#define _MM_FROUND_TO_NEAREST_INT 0x00
#define _MM_FROUND_TO_NEG_INF 0x01
#define _MM_FROUND_TO_POS_INF 0x02
#define _MM_FROUND_TO_ZERO 0x03
#define _MM_FROUND_CUR_DIRECTION 0x04
#define _MM_FROUND_NO_EXC 0x08
#define _MM_FROUND_RAISE_EXC 0x00
#define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC)
#define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC)
#define _MM_ROUND_NEAREST 0x0000
#define _MM_ROUND_DOWN 0x2000
#define _MM_ROUND_UP 0x4000
#define _MM_ROUND_TOWARD_ZERO 0x6000
/* Flush zero mode macros. */
#define _MM_FLUSH_ZERO_MASK 0x8000
#define _MM_FLUSH_ZERO_ON 0x8000
#define _MM_FLUSH_ZERO_OFF 0x0000
/* Denormals are zeros mode macros. */
#define _MM_DENORMALS_ZERO_MASK 0x0040
#define _MM_DENORMALS_ZERO_ON 0x0040
#define _MM_DENORMALS_ZERO_OFF 0x0000

/* indicate immediate constant argument in a given range */
#define __constrange(a, b) const

/* A few intrinsics accept traditional data types like ints or floats, but
 * most operate on data types that are specific to SSE.
 * If a vector type ends in d, it contains doubles, and if it does not have
 * a suffix, it contains floats. An integer vector type can contain any type
 * of integer, from chars to shorts to unsigned long longs.
 */
typedef int64x1_t __m64;
typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */
// On ARM 32-bit architecture, the float64x2_t is not supported.
// The data type __m128d should be represented in a different way for related
// intrinsic conversion.
#if defined(__aarch64__) || defined(_M_ARM64)
typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */
#else
typedef float32x4_t __m128d;
#endif
typedef int64x2_t __m128i; /* 128-bit vector containing integers */

// __int64 is defined in the Intrinsics Guide which maps to different datatype
// in different data model
#if !(defined(_WIN32) || defined(_WIN64) || defined(__int64))
#if (defined(__x86_64__) || defined(__i386__))
#define __int64 long long
#else
#define __int64 int64_t
#endif
#endif
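// Illustrative note (added in this copy; not part of the upstream sse2neon
// header): because __m128 and __m128i are plain NEON vector typedefs, values
// can move between SSE-style and native NEON code using nothing more than the
// vreinterpretq_* helpers defined below, e.g.
//   __m128 v = _mm_set_ps1(1.0f);               // SSE-style constructor
//   float32x4_t n = vreinterpretq_f32_m128(v);  // view it as a NEON vector
//   n = vmulq_n_f32(n, 2.0f);                   // operate on it with raw NEON
//   v = vreinterpretq_m128_f32(n);              // and hand it back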

/* type-safe casting between types */

#define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x)
#define vreinterpretq_m128_f32(x) (x)
#define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x)

#define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x)
#define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x)
#define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x)
#define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x)

#define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x)
#define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x)
#define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x)
#define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x)

#define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x)
#define vreinterpretq_f32_m128(x) (x)
#define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x)

#define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x)
#define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x)
#define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x)
#define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x)

#define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x)
#define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x)
#define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x)
#define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x)

#define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x)
#define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x)
#define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x)
#define vreinterpretq_m128i_s64(x) (x)

#define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x)
#define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x)
#define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x)
#define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x)

#define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x)
#define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x)

#define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x)
#define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x)
#define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x)
#define vreinterpretq_s64_m128i(x) (x)

#define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x)
#define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x)
#define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x)
#define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x)

#define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x)
#define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x)
#define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x)
#define vreinterpret_m64_s64(x) (x)

#define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x)
#define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x)
#define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x)
#define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x)

#define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x)
#define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x)
#define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x)

#define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x)
#define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x)
#define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x)
#define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x)

#define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x)
#define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x)
#define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x)
#define vreinterpret_s64_m64(x) (x)

#define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x)

#if defined(__aarch64__) || defined(_M_ARM64)
#define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x)
#define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x)

#define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x)

#define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x)
#define vreinterpretq_m128d_f64(x) (x)

#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x)

#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x)
#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x)

#define vreinterpretq_f64_m128d(x) (x)
#define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x)
#else
#define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x)
#define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x)

#define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x)
#define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x)

#define vreinterpretq_m128d_f32(x) (x)

#define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x)

#define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x)
#define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x)

#define vreinterpretq_f32_m128d(x) (x)
#endif

// A struct is defined in this header file called 'SIMDVec' which can be used
// by applications which attempt to access the contents of an __m128 struct
// directly. It is important to note that accessing the __m128 struct directly
// is bad coding practice by Microsoft: @see:
// https://learn.microsoft.com/en-us/cpp/cpp/m128
//
// However, some legacy source code may try to access the contents of an __m128
// struct directly so the developer can use the SIMDVec as an alias for it. Any
// casting must be done manually by the developer, as you cannot cast or
// otherwise alias the base NEON data type for intrinsic operations.
//
// This union is intended to allow direct access to an __m128 variable using the
// names that the MSVC compiler provides. This union should really only be used
// when trying to access the members of the vector as integer values. GCC/clang
// allow native access to the float members through a simple array access
// operator (in C since 4.6, in C++ since 4.8).
//
// Ideally direct accesses to SIMD vectors should not be used since they can
// cause a performance hit. If it really is needed however, the original __m128
// variable can be aliased with a pointer to this union and used to access
// individual components. The use of this union should be hidden behind a macro
// that is used throughout the codebase to access the members instead of always
// declaring this type of variable.
typedef union ALIGN_STRUCT(16) SIMDVec {
    float m128_f32[4];     // as floats - DON'T USE. Added for convenience.
    int8_t m128_i8[16];    // as signed 8-bit integers.
    int16_t m128_i16[8];   // as signed 16-bit integers.
    int32_t m128_i32[4];   // as signed 32-bit integers.
    int64_t m128_i64[2];   // as signed 64-bit integers.
    uint8_t m128_u8[16];   // as unsigned 8-bit integers.
    uint16_t m128_u16[8];  // as unsigned 16-bit integers.
    uint32_t m128_u32[4];  // as unsigned 32-bit integers.
    uint64_t m128_u64[2];  // as unsigned 64-bit integers.
} SIMDVec;

// casting using SIMDVec
#define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n])
#define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n])
#define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n])
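// Usage sketch (added in this copy; not part of the upstream sse2neon header):
// pulling individual lanes out of an __m128i through the SIMDVec aliasing
// macros above. Prefer the proper extract intrinsics where possible; this is
// only for legacy code that insists on member-style access.
//   __m128i v = _mm_set_epi32(3, 2, 1, 0);
//   uint32_t lane0 = vreinterpretq_nth_u32_m128i(v, 0);  // == 0
//   uint64_t hi64 = vreinterpretq_nth_u64_m128i(v, 1);   // upper 64 bits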

/* SSE macros */
#define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode
#define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode
#define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode
#define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode
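// Usage sketch (added in this copy; not part of the upstream sse2neon header):
// these macros mirror the x86 MXCSR helpers, so existing SSE code such as
//   _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);  // flush denormal results to 0
//   unsigned int rm = _MM_GET_ROUNDING_MODE();   // e.g. _MM_ROUND_NEAREST
// keeps working, with the implementations mapping onto the Arm FPCR.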

// Function declaration
// SSE
FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void);
FORCE_INLINE __m128 _mm_move_ss(__m128, __m128);
FORCE_INLINE __m128 _mm_or_ps(__m128, __m128);
FORCE_INLINE __m128 _mm_set_ps1(float);
FORCE_INLINE __m128 _mm_setzero_ps(void);
// SSE2
FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i);
FORCE_INLINE __m128i _mm_castps_si128(__m128);
FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i);
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128);
FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d);
FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i);
FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int);
FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t);
FORCE_INLINE __m128d _mm_set_pd(double, double);
FORCE_INLINE __m128i _mm_set1_epi32(int);
FORCE_INLINE __m128i _mm_setzero_si128(void);
// SSE4.1
FORCE_INLINE __m128d _mm_ceil_pd(__m128d);
FORCE_INLINE __m128 _mm_ceil_ps(__m128);
FORCE_INLINE __m128d _mm_floor_pd(__m128d);
FORCE_INLINE __m128 _mm_floor_ps(__m128);
FORCE_INLINE __m128d _mm_round_pd(__m128d, int);
FORCE_INLINE __m128 _mm_round_ps(__m128, int);
// SSE4.2
FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t);

/* Backwards compatibility for compilers with lack of specific type support */

// Older gcc does not define vld1q_u8_x4 type
#if defined(__GNUC__) && !defined(__clang__) &&                        \
    ((__GNUC__ <= 13 && defined(__arm__)) ||                           \
     (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \
     (__GNUC__ <= 9 && defined(__aarch64__)))
FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
{
    uint8x16x4_t ret;
    ret.val[0] = vld1q_u8(p + 0);
    ret.val[1] = vld1q_u8(p + 16);
    ret.val[2] = vld1q_u8(p + 32);
    ret.val[3] = vld1q_u8(p + 48);
    return ret;
}
#else
// Wraps vld1q_u8_x4
FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p)
{
    return vld1q_u8_x4(p);
}
#endif

#if !defined(__aarch64__) && !defined(_M_ARM64)
/* emulate vaddv u8 variant */
FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
{
    const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8)));
    return vget_lane_u8(vreinterpret_u8_u64(v1), 0);
}
#else
// Wraps vaddv_u8
FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8)
{
    return vaddv_u8(v8);
}
#endif

#if !defined(__aarch64__) && !defined(_M_ARM64)
/* emulate vaddvq u8 variant */
FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
{
    uint8x8_t tmp = vpadd_u8(vget_low_u8(a), vget_high_u8(a));
    uint8_t res = 0;
    for (int i = 0; i < 8; ++i)
        res += tmp[i];
    return res;
}
#else
// Wraps vaddvq_u8
FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a)
{
    return vaddvq_u8(a);
}
#endif

#if !defined(__aarch64__) && !defined(_M_ARM64)
/* emulate vaddvq u16 variant */
FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
{
    uint32x4_t m = vpaddlq_u16(a);
    uint64x2_t n = vpaddlq_u32(m);
    uint64x1_t o = vget_low_u64(n) + vget_high_u64(n);

    return vget_lane_u32((uint32x2_t) o, 0);
}
#else
// Wraps vaddvq_u16
FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a)
{
    return vaddvq_u16(a);
}
#endif

/* Function Naming Conventions
 * The naming convention of SSE intrinsics is straightforward. A generic SSE
 * intrinsic function is given as follows:
 *   _mm_<name>_<data_type>
 *
 * The parts of this format are given as follows:
 * 1. <name> describes the operation performed by the intrinsic
 * 2. <data_type> identifies the data type of the function's primary arguments
 *
 * This last part, <data_type>, is a little complicated. It identifies the
 * content of the input values, and can be set to any of the following values:
 * + ps - vectors contain floats (ps stands for packed single-precision)
 * + pd - vectors contain doubles (pd stands for packed double-precision)
 * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
 *   signed integers
 * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
 *   unsigned integers
 * + si128 - unspecified 128-bit vector or 256-bit vector
 * + m128/m128i/m128d - identifies input vector types when they are different
 *   than the type of the returned vector
 *
 * For example, _mm_setzero_ps. The _mm implies that the function returns
 * a 128-bit vector. The _ps at the end implies that the argument vectors
 * contain floats.
 *
 * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
 *   // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits
 *   __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
 *   // Set packed 8-bit integers
 *   // 128 bits, 16 chars, per 8 bits
 *   __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,
 *                                  4, 5, 12, 13, 6, 7, 14, 15);
 *   // Shuffle packed 8-bit integers
 *   __m128i v_out = _mm_shuffle_epi8(v_in, v_perm);  // pshufb
 */

/* Constants for use with _mm_prefetch. */
enum _mm_hint {
    _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
    _MM_HINT_T0 = 1,  /* load data to L1 and L2 cache */
    _MM_HINT_T1 = 2,  /* load data to L2 cache only */
    _MM_HINT_T2 = 3,  /* load data to L2 cache only, mark it as NTA */
};
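// Usage sketch (added in this copy; not part of the upstream sse2neon header):
// these hint values are passed straight to _mm_prefetch, e.g.
//   _mm_prefetch((const char *) ptr, _MM_HINT_T0);  // ptr is any address
// On Arm they are typically translated into the corresponding
// __builtin_prefetch locality hints.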

// The bit field mapping to the FPCR (floating-point control register)
typedef struct {
    uint16_t res0;
    uint8_t res1 : 6;
    uint8_t bit22 : 1;
    uint8_t bit23 : 1;
    uint8_t bit24 : 1;
    uint8_t res2 : 7;
#if defined(__aarch64__) || defined(_M_ARM64)
    uint32_t res3;
#endif
} fpcr_bitfield;

// Takes the upper 64 bits of a and places it in the low end of the result
// Takes the lower 64 bits of b and places it into the high end of the result.
FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
{
    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
}

// Takes the lower two 32-bit values from a, swaps them, and places them in the
// low end of the result; takes the upper two 32-bit values from b, swaps them,
// and places them in the high end of the result.
FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b)
{
    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
    float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b)));
    return vreinterpretq_m128_f32(vcombine_f32(a01, b23));
}

FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b)
{
    float32x2_t a21 = vget_high_f32(
        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
    float32x2_t b03 = vget_low_f32(
        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
    return vreinterpretq_m128_f32(vcombine_f32(a21, b03));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b)
{
    float32x2_t a03 = vget_low_f32(
        vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3));
    float32x2_t b21 = vget_high_f32(
        vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3));
    return vreinterpretq_m128_f32(vcombine_f32(a03, b21));
}

FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b)
{
    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
}

FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b)
{
    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a01, b10));
}

FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b)
{
    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
    float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b)));
    return vreinterpretq_m128_f32(vcombine_f32(a01, b01));
}

// keeps the low 64 bits of a in the low end and puts the high 64 bits of b in
// the high end
FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b)
{
    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a10, b32));
}

FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b)
{
    float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    return vreinterpretq_m128_f32(vcombine_f32(a11, b00));
}

FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b)
{
    float32x2_t a22 =
        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    return vreinterpretq_m128_f32(vcombine_f32(a22, b00));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b)
{
    float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0);
    float32x2_t b22 =
        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0);
    return vreinterpretq_m128_f32(vcombine_f32(a00, b22));
}

FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b)
{
    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
    float32x2_t a22 =
        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0);
    float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ? */
    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a02, b32));
}

FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b)
{
    float32x2_t a33 =
        vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1);
    float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1);
    return vreinterpretq_m128_f32(vcombine_f32(a33, b11));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b)
{
    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
    return vreinterpretq_m128_f32(vcombine_f32(a10, b20));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b)
{
    float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a)));
    float32_t b2 = vgetq_lane_f32(b, 2);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
    return vreinterpretq_m128_f32(vcombine_f32(a01, b20));
}

FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b)
{
    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32_t b2 = vgetq_lane_f32(b, 2);
    float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0);
    float32x2_t b20 = vset_lane_f32(b2, b00, 1);
    return vreinterpretq_m128_f32(vcombine_f32(a32, b20));
}

// For MSVC, we check only if it is ARM64, as every single ARM64 processor
// supported by WoA has crypto extensions. If this changes in the future,
// this can be verified via the runtime-only method of:
// IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE)
#if (defined(_M_ARM64) && !defined(__clang__)) || \
    (defined(__ARM_FEATURE_CRYPTO) &&             \
     (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64)))
// Wraps vmull_p64
FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
{
    poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0);
    poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0);
#if defined(_MSC_VER)
    __n64 a1 = {a}, b1 = {b};
    return vreinterpretq_u64_p128(vmull_p64(a1, b1));
#else
    return vreinterpretq_u64_p128(vmull_p64(a, b));
#endif
}
#else  // ARMv7 polyfill
// ARMv7/some A64 lacks vmull_p64, but it has vmull_p8.
//
// vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a
// 64-bit->128-bit polynomial multiply.
//
// It needs some work and is somewhat slow, but it is still faster than all
// known scalar methods.
//
// Algorithm adapted to C from
// https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted
// from "Fast Software Polynomial Multiplication on ARM Processors Using the
// NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab
// (https://hal.inria.fr/hal-01506572)
static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b)
{
    poly8x8_t a = vreinterpret_p8_u64(_a);
    poly8x8_t b = vreinterpret_p8_u64(_b);

    // Masks
    uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff),
                                    vcreate_u8(0x00000000ffffffff));
    uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff),
                                    vcreate_u8(0x0000000000000000));

    // Do the multiplies, rotating with vext to get all combinations
    uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b));  // D = A0 * B0
    uint8x16_t e =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1)));  // E = A0 * B1
    uint8x16_t f =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b));  // F = A1 * B0
    uint8x16_t g =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2)));  // G = A0 * B2
    uint8x16_t h =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b));  // H = A2 * B0
    uint8x16_t i =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3)));  // I = A0 * B3
    uint8x16_t j =
        vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b));  // J = A3 * B0
    uint8x16_t k =
        vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4)));  // L = A0 * B4

    // Add cross products
    uint8x16_t l = veorq_u8(e, f);  // L = E + F
    uint8x16_t m = veorq_u8(g, h);  // M = G + H
    uint8x16_t n = veorq_u8(i, j);  // N = I + J

    // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL
    // instructions.
#if defined(__aarch64__)
    uint8x16_t lm_p0 = vreinterpretq_u8_u64(
        vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
    uint8x16_t lm_p1 = vreinterpretq_u8_u64(
        vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m)));
    uint8x16_t nk_p0 = vreinterpretq_u8_u64(
        vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
    uint8x16_t nk_p1 = vreinterpretq_u8_u64(
        vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k)));
#else
    uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m));
    uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m));
    uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k));
    uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k));
#endif
    // t0 = (L) (P0 + P1) << 8
    // t1 = (M) (P2 + P3) << 16
    uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1);
    uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32);
    uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h);

    // t2 = (N) (P4 + P5) << 24
    // t3 = (K) (P6 + P7) << 32
    uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1);
    uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00);
    uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h);

    // De-interleave
#if defined(__aarch64__)
    uint8x16_t t0 = vreinterpretq_u8_u64(
        vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
    uint8x16_t t1 = vreinterpretq_u8_u64(
        vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h)));
    uint8x16_t t2 = vreinterpretq_u8_u64(
        vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
    uint8x16_t t3 = vreinterpretq_u8_u64(
        vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h)));
#else
    uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h));
    uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h));
    uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h));
    uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h));
#endif
    // Shift the cross products
    uint8x16_t t0_shift = vextq_u8(t0, t0, 15);  // t0 << 8
    uint8x16_t t1_shift = vextq_u8(t1, t1, 14);  // t1 << 16
    uint8x16_t t2_shift = vextq_u8(t2, t2, 13);  // t2 << 24
    uint8x16_t t3_shift = vextq_u8(t3, t3, 12);  // t3 << 32

    // Accumulate the products
    uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift);
    uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift);
    uint8x16_t mix = veorq_u8(d, cross1);
    uint8x16_t r = veorq_u8(mix, cross2);
    return vreinterpretq_u64_u8(r);
}
#endif  // ARMv7 polyfill
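// Usage sketch (added in this copy; not part of the upstream sse2neon header):
// either path above gives a 64x64 -> 128-bit carry-less (polynomial) multiply
// of the low halves of its operands, the building block a PCLMULQDQ-style
// _mm_clmulepi64_si128 wrapper is typically assembled from, e.g.
//   uint64x2_t prod = _sse2neon_vmull_p64(vcreate_u64(3), vcreate_u64(5));
//   // 0b11 clmul 0b101 == 0b1111, so vgetq_lane_u64(prod, 0) == 15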

// C equivalent:
//   __m128i _mm_shuffle_epi32_default(__m128i a,
//                                     __constrange(0, 255) int imm) {
//       __m128i ret;
//       ret[0] = a[imm & 0x3];         ret[1] = a[(imm >> 2) & 0x3];
//       ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03];
//       return ret;
//   }
#define _mm_shuffle_epi32_default(a, imm)                                   \
    vreinterpretq_m128i_s32(vsetq_lane_s32(                                 \
        vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3),     \
        vsetq_lane_s32(                                                     \
            vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \
            vsetq_lane_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a),       \
                                          ((imm) >> 2) & 0x3),              \
                           vmovq_n_s32(vgetq_lane_s32(                      \
                               vreinterpretq_s32_m128i(a), (imm) & (0x3))), \
                           1),                                              \
            2),                                                             \
        3))

// Takes the upper 64 bits of a and places it in the low end of the result
// Takes the lower 64 bits of a and places it into the high end of the result.
FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a)
{
    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
    return vreinterpretq_m128i_s32(vcombine_s32(a32, a10));
}

// Takes the lower two 32-bit values from a, swaps them, and places them in the
// low end of the result; takes the upper two 32-bit values from a, swaps them,
// and places them in the high end of the result.
FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a)
{
    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
    int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a)));
    return vreinterpretq_m128i_s32(vcombine_s32(a01, a23));
}

// rotates the least significant 32 bits into the most significant 32 bits, and
// shifts the rest down
FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a)
{
    return vreinterpretq_m128i_s32(
        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1));
}

// rotates the most significant 32 bits into the least significant 32 bits, and
// shifts the rest up
FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a)
{
    return vreinterpretq_m128i_s32(
        vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3));
}

// gets the lower 64 bits of a, and places it in the upper 64 bits
// gets the lower 64 bits of a and places it in the lower 64 bits
FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a)
{
    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
    return vreinterpretq_m128i_s32(vcombine_s32(a10, a10));
}

// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
// lower 64 bits; gets the lower 64 bits of a, and places it in the upper 64
// bits
FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a)
{
    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
    int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a));
    return vreinterpretq_m128i_s32(vcombine_s32(a01, a10));
}

// gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the
// upper 64 bits; gets the lower 64 bits of a, swaps the 0 and 1 elements, and
// places it in the lower 64 bits
FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a)
{
    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
    return vreinterpretq_m128i_s32(vcombine_s32(a01, a01));
}

FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a)
{
    int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1);
    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
    return vreinterpretq_m128i_s32(vcombine_s32(a11, a22));
}

FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a)
{
    int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0);
    int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a)));
    return vreinterpretq_m128i_s32(vcombine_s32(a22, a01));
}

FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a)
{
    int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a));
    int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1);
    return vreinterpretq_m128i_s32(vcombine_s32(a32, a33));
}

#if defined(__aarch64__) || defined(_M_ARM64)
#define _mm_shuffle_epi32_splat(a, imm) \
    vreinterpretq_m128i_s32(vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm)))
#else
#define _mm_shuffle_epi32_splat(a, imm) \
    vreinterpretq_m128i_s32(            \
        vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))))
#endif

// NEON does not support a general purpose permute intrinsic.
// Shuffle single-precision (32-bit) floating-point elements in a using the
// control in imm8, and store the results in dst.
//
// C equivalent:
//   __m128 _mm_shuffle_ps_default(__m128 a, __m128 b,
//                                 __constrange(0, 255) int imm) {
//       __m128 ret;
//       ret[0] = a[imm & 0x3];         ret[1] = a[(imm >> 2) & 0x3];
//       ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03];
//       return ret;
//   }
//
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps
#define _mm_shuffle_ps_default(a, b, imm)                                      \
    vreinterpretq_m128_f32(vsetq_lane_f32(                                     \
        vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3),         \
        vsetq_lane_f32(                                                        \
            vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3),     \
            vsetq_lane_f32(                                                    \
                vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \
                vmovq_n_f32(                                                   \
                    vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))), \
                1),                                                            \
            2),                                                                \
        3))

// Shuffle 16-bit integers in the low 64 bits of a using the control in imm8.
// Store the results in the low 64 bits of dst, with the high 64 bits being
// copied from a to dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16
#define _mm_shufflelo_epi16_function(a, imm)                                  \
    _sse2neon_define1(                                                        \
        __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a);              \
        int16x4_t lowBits = vget_low_s16(ret);                                \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0);  \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \
                             1);                                              \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \
                             2);                                              \
        ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \
                             3);                                              \
        _sse2neon_return(vreinterpretq_m128i_s16(ret));)

// Shuffle 16-bit integers in the high 64 bits of a using the control in imm8.
// Store the results in the high 64 bits of dst, with the low 64 bits being
// copied from a to dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16
#define _mm_shufflehi_epi16_function(a, imm)                                   \
    _sse2neon_define1(                                                         \
        __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a);               \
        int16x4_t highBits = vget_high_s16(ret);                               \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4);  \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \
                             5);                                               \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \
                             6);                                               \
        ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \
                             7);                                               \
        _sse2neon_return(vreinterpretq_m128i_s16(ret));)

/* MMX */

// _mm_empty is a no-op on arm
FORCE_INLINE void _mm_empty(void) {}

/* SSE */

// Add packed single-precision (32-bit) floating-point elements in a and b, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps
FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_f32(
        vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Add the lower single-precision (32-bit) floating-point element in a and b,
// store the result in the lower element of dst, and copy the upper 3 packed
// elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss
FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b)
{
    float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
    float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
    // the upper values in the result must be the remnants of <a>.
    return vreinterpretq_m128_f32(vaddq_f32(a, value));
}

// Compute the bitwise AND of packed single-precision (32-bit) floating-point
// elements in a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps
FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_s32(
        vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
}

// Compute the bitwise NOT of packed single-precision (32-bit) floating-point
// elements in a and then AND with b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps
FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_s32(
        vbicq_s32(vreinterpretq_s32_m128(b),
                  vreinterpretq_s32_m128(a)));  // *NOTE* argument swap
}

// Average packed unsigned 16-bit integers in a and b, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16
FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b)
{
    return vreinterpret_m64_u16(
        vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)));
}

// Average packed unsigned 8-bit integers in a and b, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8
FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b)
{
    return vreinterpret_m64_u8(
        vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
}

// Compare packed single-precision (32-bit) floating-point elements in a and b
// for equality, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps
FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compare the lower single-precision (32-bit) floating-point elements in a and
// b for equality, store the result in the lower element of dst, and copy the
// upper 3 packed elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss
FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpeq_ps(a, b));
}

// Compare packed single-precision (32-bit) floating-point elements in a and b
// for greater-than-or-equal, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps
FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compare the lower single-precision (32-bit) floating-point elements in a and
// b for greater-than-or-equal, store the result in the lower element of dst,
// and copy the upper 3 packed elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss
FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpge_ps(a, b));
}

// Compare packed single-precision (32-bit) floating-point elements in a and b
// for greater-than, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps
FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compare the lower single-precision (32-bit) floating-point elements in a and
// b for greater-than, store the result in the lower element of dst, and copy
// the upper 3 packed elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss
FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpgt_ps(a, b));
}

// Compare packed single-precision (32-bit) floating-point elements in a and b
// for less-than-or-equal, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps
FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compare the lower single-precision (32-bit) floating-point elements in a and
// b for less-than-or-equal, store the result in the lower element of dst, and
// copy the upper 3 packed elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss
FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmple_ps(a, b));
}

// Compare packed single-precision (32-bit) floating-point elements in a and b
// for less-than, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps
FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(
        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Compare the lower single-precision (32-bit) floating-point elements in a and
// b for less-than, store the result in the lower element of dst, and copy the
// upper 3 packed elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss
FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmplt_ps(a, b));
}

// Compare packed single-precision (32-bit) floating-point elements in a and b
// for not-equal, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps
FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(vmvnq_u32(
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
}

// Compare the lower single-precision (32-bit) floating-point elements in a and
// b for not-equal, store the result in the lower element of dst, and copy the
// upper 3 packed elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss
FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpneq_ps(a, b));
}

// Compare packed single-precision (32-bit) floating-point elements in a and b
// for not-greater-than-or-equal, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps
FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(vmvnq_u32(
        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
}

// Compare the lower single-precision (32-bit) floating-point elements in a and
// b for not-greater-than-or-equal, store the result in the lower element of
// dst, and copy the upper 3 packed elements from a to the upper elements of
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss
FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpnge_ps(a, b));
}

// Compare packed single-precision (32-bit) floating-point elements in a and b
// for not-greater-than, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps
FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(vmvnq_u32(
        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
}

// Compare the lower single-precision (32-bit) floating-point elements in a and
// b for not-greater-than, store the result in the lower element of dst, and
// copy the upper 3 packed elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss
FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpngt_ps(a, b));
}

// Compare packed single-precision (32-bit) floating-point elements in a and b
// for not-less-than-or-equal, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps
FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(vmvnq_u32(
        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
}

// Compare the lower single-precision (32-bit) floating-point elements in a and
// b for not-less-than-or-equal, store the result in the lower element of dst,
// and copy the upper 3 packed elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss
FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpnle_ps(a, b));
}

// Compare packed single-precision (32-bit) floating-point elements in a and b
// for not-less-than, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps
FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_u32(vmvnq_u32(
        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))));
}

// Compare the lower single-precision (32-bit) floating-point elements in a and
// b for not-less-than, store the result in the lower element of dst, and copy
// the upper 3 packed elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss
FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpnlt_ps(a, b));
}
1385
1386
// Compare packed single-precision (32-bit) floating-point elements in a and b
1387
// to see if neither is NaN, and store the results in dst.
1388
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps
1389
//
1390
// See also:
1391
// http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean
1392
// http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics
1393
FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b)
1394
{
1395
// Note: NEON does not have ordered compare builtin
1396
// Need to compare a eq a and b eq b to check for NaN
1397
// Do AND of results to get final
1398
uint32x4_t ceqaa =
1399
vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
1400
uint32x4_t ceqbb =
1401
vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
1402
return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb));
1403
}
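
// Illustrative usage sketch (hypothetical caller, not part of the translation
// layer): because the a==a / b==b trick above yields an all-ones lane only
// where both inputs are non-NaN, combining it with _mm_movemask_ps flags NaN
// lanes without a scalar loop:
//
//     __m128 v = _mm_set_ps(0.0f, 1.0f, NAN, 2.0f);  // NAN from <math.h>
//     __m128 ord = _mm_cmpord_ps(v, v);              // all-ones lane iff not NaN
//     int nan_mask = ~_mm_movemask_ps(ord) & 0xF;    // nan_mask == 0x2 here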

// Compare the lower single-precision (32-bit) floating-point elements in a and
// b to see if neither is NaN, store the result in the lower element of dst, and
// copy the upper 3 packed elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss
FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpord_ps(a, b));
}

// Compare packed single-precision (32-bit) floating-point elements in a and b
// to see if either is NaN, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps
FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b)
{
    uint32x4_t f32a =
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a));
    uint32x4_t f32b =
        vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b)));
}

// Compare the lower single-precision (32-bit) floating-point elements in a and
// b to see if either is NaN, store the result in the lower element of dst, and
// copy the upper 3 packed elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss
FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_cmpunord_ps(a, b));
}

// Compare the lower single-precision (32-bit) floating-point element in a and b
// for equality, and return the boolean result (0 or 1).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss
FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b)
{
    uint32x4_t a_eq_b =
        vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
    return vgetq_lane_u32(a_eq_b, 0) & 0x1;
}

// Compare the lower single-precision (32-bit) floating-point element in a and b
// for greater-than-or-equal, and return the boolean result (0 or 1).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss
FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b)
{
    uint32x4_t a_ge_b =
        vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
    return vgetq_lane_u32(a_ge_b, 0) & 0x1;
}

// Compare the lower single-precision (32-bit) floating-point element in a and b
// for greater-than, and return the boolean result (0 or 1).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss
FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b)
{
    uint32x4_t a_gt_b =
        vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
    return vgetq_lane_u32(a_gt_b, 0) & 0x1;
}

// Compare the lower single-precision (32-bit) floating-point element in a and b
// for less-than-or-equal, and return the boolean result (0 or 1).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss
FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b)
{
    uint32x4_t a_le_b =
        vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
    return vgetq_lane_u32(a_le_b, 0) & 0x1;
}

// Compare the lower single-precision (32-bit) floating-point element in a and b
// for less-than, and return the boolean result (0 or 1).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss
FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b)
{
    uint32x4_t a_lt_b =
        vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b));
    return vgetq_lane_u32(a_lt_b, 0) & 0x1;
}

// Compare the lower single-precision (32-bit) floating-point element in a and b
// for not-equal, and return the boolean result (0 or 1).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss
FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b)
{
    return !_mm_comieq_ss(a, b);
}

// Convert packed signed 32-bit integers in b to packed single-precision
// (32-bit) floating-point elements, store the results in the lower 2 elements
// of dst, and copy the upper 2 packed elements from a to the upper elements of
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_pi2ps
FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b)
{
    return vreinterpretq_m128_f32(
        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
                     vget_high_f32(vreinterpretq_f32_m128(a))));
}

// Convert packed single-precision (32-bit) floating-point elements in a to
// packed 32-bit integers, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi
FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a)
{
#if (defined(__aarch64__) || defined(_M_ARM64)) || \
    defined(__ARM_FEATURE_DIRECTED_ROUNDING)
    return vreinterpret_m64_s32(
        vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a)))));
#else
    return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32(
        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)))));
#endif
}

// Convert the signed 32-bit integer b to a single-precision (32-bit)
// floating-point element, store the result in the lower element of dst, and
// copy the upper 3 packed elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss
FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b)
{
    return vreinterpretq_m128_f32(
        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
}

// Convert the lower single-precision (32-bit) floating-point element in a to a
// 32-bit integer, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si
FORCE_INLINE int _mm_cvt_ss2si(__m128 a)
{
#if (defined(__aarch64__) || defined(_M_ARM64)) || \
    defined(__ARM_FEATURE_DIRECTED_ROUNDING)
    return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))),
                          0);
#else
    float32_t data = vgetq_lane_f32(
        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
    return (int32_t) data;
#endif
}

// Convert packed 16-bit integers in a to packed single-precision (32-bit)
// floating-point elements, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi16_ps
FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a)
{
    return vreinterpretq_m128_f32(
        vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a))));
}

// Convert packed 32-bit integers in b to packed single-precision (32-bit)
// floating-point elements, store the results in the lower 2 elements of dst,
// and copy the upper 2 packed elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_ps
FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b)
{
    return vreinterpretq_m128_f32(
        vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)),
                     vget_high_f32(vreinterpretq_f32_m128(a))));
}

// Convert packed signed 32-bit integers in a to packed single-precision
// (32-bit) floating-point elements, store the results in the lower 2 elements
// of dst, then convert the packed signed 32-bit integers in b to
// single-precision (32-bit) floating-point elements, and store the results in
// the upper 2 elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32x2_ps
FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b)
{
    return vreinterpretq_m128_f32(vcvtq_f32_s32(
        vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))));
}

// Convert the lower packed 8-bit integers in a to packed single-precision
// (32-bit) floating-point elements, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi8_ps
FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a)
{
    return vreinterpretq_m128_f32(vcvtq_f32_s32(
        vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a))))));
}

// Convert packed single-precision (32-bit) floating-point elements in a to
// packed 16-bit integers, and store the results in dst. Note: this intrinsic
// will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and
// 0x7FFFFFFF.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi16
FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a)
{
    return vreinterpret_m64_s16(
        vqmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a))));
}

// Convert packed single-precision (32-bit) floating-point elements in a to
// packed 32-bit integers, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi32
#define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a)

// Convert packed single-precision (32-bit) floating-point elements in a to
// packed 8-bit integers, and store the results in lower 4 elements of dst.
// Note: this intrinsic will generate 0x7F, rather than 0x80, for input values
// between 0x7F and 0x7FFFFFFF.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi8
FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a)
{
    return vreinterpret_m64_s8(vqmovn_s16(
        vcombine_s16(vreinterpret_s16_m64(_mm_cvtps_pi16(a)), vdup_n_s16(0))));
}

// Convert packed unsigned 16-bit integers in a to packed single-precision
// (32-bit) floating-point elements, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu16_ps
FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a)
{
    return vreinterpretq_m128_f32(
        vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a))));
}

// Convert the lower packed unsigned 8-bit integers in a to packed
// single-precision (32-bit) floating-point elements, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu8_ps
FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a)
{
    return vreinterpretq_m128_f32(vcvtq_f32_u32(
        vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a))))));
}

// Convert the signed 32-bit integer b to a single-precision (32-bit)
// floating-point element, store the result in the lower element of dst, and
// copy the upper 3 packed elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss
#define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b)

// Convert the signed 64-bit integer b to a single-precision (32-bit)
// floating-point element, store the result in the lower element of dst, and
// copy the upper 3 packed elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss
FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b)
{
    return vreinterpretq_m128_f32(
        vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0));
}

// Copy the lower single-precision (32-bit) floating-point element of a to dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32
FORCE_INLINE float _mm_cvtss_f32(__m128 a)
{
    return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
}

// Convert the lower single-precision (32-bit) floating-point element in a to a
// 32-bit integer, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32
#define _mm_cvtss_si32(a) _mm_cvt_ss2si(a)

// Convert the lower single-precision (32-bit) floating-point element in a to a
// 64-bit integer, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64
FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
{
#if (defined(__aarch64__) || defined(_M_ARM64)) || \
    defined(__ARM_FEATURE_DIRECTED_ROUNDING)
    return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
#else
    float32_t data = vgetq_lane_f32(
        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
    return (int64_t) data;
#endif
}

// Convert packed single-precision (32-bit) floating-point elements in a to
// packed 32-bit integers with truncation, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ps2pi
FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
{
    return vreinterpret_m64_s32(
        vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
}

// Convert the lower single-precision (32-bit) floating-point element in a to a
// 32-bit integer with truncation, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si
FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
{
    return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
}

// Convert packed single-precision (32-bit) floating-point elements in a to
// packed 32-bit integers with truncation, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_pi32
#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)

// Convert the lower single-precision (32-bit) floating-point element in a to a
// 32-bit integer with truncation, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32
#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)

// Convert the lower single-precision (32-bit) floating-point element in a to a
// 64-bit integer with truncation, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si64
FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
{
    return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
}

// Divide packed single-precision (32-bit) floating-point elements in a by
// packed elements in b, and store the results in dst.
// Because ARMv7-A NEON has no division instruction, the fallback approximates
// the reciprocal of b with vrecpe/vrecps (Newton-Raphson refinement) and then
// multiplies a by that reciprocal.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps
FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128_f32(
        vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#else
    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
    // Additional Newton-Raphson iteration for accuracy
    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
#endif
}
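
// Illustrative usage sketch (hypothetical caller): element-wise division of
// four floats; on AArch64 this lowers to FDIV, on ARMv7-A to the reciprocal
// approximation above.
//
//     float a[4] = {1.0f, 2.0f, 3.0f, 4.0f};
//     float b[4] = {2.0f, 4.0f, 8.0f, 16.0f};
//     float q[4];
//     _mm_storeu_ps(q, _mm_div_ps(_mm_loadu_ps(a), _mm_loadu_ps(b)));
//     // q is now (approximately, on ARMv7-A) {0.5f, 0.5f, 0.375f, 0.25f}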

// Divide the lower single-precision (32-bit) floating-point element in a by the
// lower single-precision (32-bit) floating-point element in b, store the result
// in the lower element of dst, and copy the upper 3 packed elements from a to
// the upper elements of dst.
// Warning: on ARMv7-A the result may differ from Intel's and is not
// IEEE-compliant.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss
FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b)
{
    float32_t value =
        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0);
    return vreinterpretq_m128_f32(
        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
}

// Extract a 16-bit integer from a, selected with imm8, and store the result in
// the lower element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_pi16
#define _mm_extract_pi16(a, imm) \
    (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm))

// Free aligned memory that was allocated with _mm_malloc.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_free
#if !defined(SSE2NEON_ALLOC_DEFINED)
FORCE_INLINE void _mm_free(void *addr)
{
    free(addr);
}
#endif

FORCE_INLINE uint64_t _sse2neon_get_fpcr(void)
{
    uint64_t value;
#if defined(_MSC_VER)
    value = _ReadStatusReg(ARM64_FPCR);
#else
    __asm__ __volatile__("mrs %0, FPCR" : "=r"(value)); /* read */
#endif
    return value;
}

FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value)
{
#if defined(_MSC_VER)
    _WriteStatusReg(ARM64_FPCR, value);
#else
    __asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */
#endif
}

// Macro: Get the flush zero bits from the MXCSR control and status register.
// The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or
// _MM_FLUSH_ZERO_OFF
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE
FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void)
{
    union {
        fpcr_bitfield field;
#if defined(__aarch64__) || defined(_M_ARM64)
        uint64_t value;
#else
        uint32_t value;
#endif
    } r;

#if defined(__aarch64__) || defined(_M_ARM64)
    r.value = _sse2neon_get_fpcr();
#else
    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF;
}

// Macro: Get the rounding mode bits from the MXCSR control and status register.
// The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST,
// _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE
FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void)
{
    union {
        fpcr_bitfield field;
#if defined(__aarch64__) || defined(_M_ARM64)
        uint64_t value;
#else
        uint32_t value;
#endif
    } r;

#if defined(__aarch64__) || defined(_M_ARM64)
    r.value = _sse2neon_get_fpcr();
#else
    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    if (r.field.bit22) {
        return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP;
    } else {
        return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST;
    }
}

// Copy a to dst, and insert the 16-bit integer i into dst at the location
// specified by imm8.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_pi16
#define _mm_insert_pi16(a, b, imm) \
    vreinterpret_m64_s16(vset_lane_s16((b), vreinterpret_s16_m64(a), (imm)))

// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
// elements) from memory into dst. mem_addr must be aligned on a 16-byte
// boundary or a general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps
FORCE_INLINE __m128 _mm_load_ps(const float *p)
{
    return vreinterpretq_m128_f32(vld1q_f32(p));
}

// Load a single-precision (32-bit) floating-point element from memory into all
// elements of dst.
//
// dst[31:0] := MEM[mem_addr+31:mem_addr]
// dst[63:32] := MEM[mem_addr+31:mem_addr]
// dst[95:64] := MEM[mem_addr+31:mem_addr]
// dst[127:96] := MEM[mem_addr+31:mem_addr]
//
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1
#define _mm_load_ps1 _mm_load1_ps

// Load a single-precision (32-bit) floating-point element from memory into the
// lower of dst, and zero the upper 3 elements. mem_addr does not need to be
// aligned on any particular boundary.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss
FORCE_INLINE __m128 _mm_load_ss(const float *p)
{
    return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0));
}

// Load a single-precision (32-bit) floating-point element from memory into all
// elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps
FORCE_INLINE __m128 _mm_load1_ps(const float *p)
{
    return vreinterpretq_m128_f32(vld1q_dup_f32(p));
}

// Load 2 single-precision (32-bit) floating-point elements from memory into the
// upper 2 elements of dst, and copy the lower 2 elements from a to dst.
// mem_addr does not need to be aligned on any particular boundary.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pi
FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p)
{
    return vreinterpretq_m128_f32(
        vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p)));
}

// Load 2 single-precision (32-bit) floating-point elements from memory into the
// lower 2 elements of dst, and copy the upper 2 elements from a to dst.
// mem_addr does not need to be aligned on any particular boundary.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pi
FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p)
{
    return vreinterpretq_m128_f32(
        vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a)));
}

// Load 4 single-precision (32-bit) floating-point elements from memory into dst
// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
// general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps
FORCE_INLINE __m128 _mm_loadr_ps(const float *p)
{
    float32x4_t v = vrev64q_f32(vld1q_f32(p));
    return vreinterpretq_m128_f32(vextq_f32(v, v, 2));
}
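
// Illustrative sketch of the reversal above (hypothetical data): vrev64q_f32
// swaps within each 64-bit half and vextq_f32(v, v, 2) swaps the halves, so a
// memory image {m0, m1, m2, m3} ends up with dst lane 0 holding m3:
//
//     float ALIGN_STRUCT(16) m[4] = {10.0f, 20.0f, 30.0f, 40.0f};
//     __m128 r = _mm_loadr_ps(m);      // lane 0 = 40.0f, lane 3 = 10.0f
//     float lane0 = _mm_cvtss_f32(r);  // 40.0f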

// Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point
// elements) from memory into dst. mem_addr does not need to be aligned on any
// particular boundary.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps
FORCE_INLINE __m128 _mm_loadu_ps(const float *p)
{
    // For NEON, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are
    // equivalent.
    return vreinterpretq_m128_f32(vld1q_f32(p));
}

// Load unaligned 16-bit integer from memory into the first element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16
FORCE_INLINE __m128i _mm_loadu_si16(const void *p)
{
    return vreinterpretq_m128i_s16(
        vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0));
}

// Load unaligned 64-bit integer from memory into the first element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64
FORCE_INLINE __m128i _mm_loadu_si64(const void *p)
{
    return vreinterpretq_m128i_s64(
        vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0)));
}

// Allocate size bytes of memory, aligned to the alignment specified in align,
// and return a pointer to the allocated memory. _mm_free should be used to free
// memory that is allocated with _mm_malloc.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_malloc
#if !defined(SSE2NEON_ALLOC_DEFINED)
FORCE_INLINE void *_mm_malloc(size_t size, size_t align)
{
    void *ptr;
    if (align == 1)
        return malloc(size);
    if (align == 2 || (sizeof(void *) == 8 && align == 4))
        align = sizeof(void *);
    if (!posix_memalign(&ptr, align, size))
        return ptr;
    return NULL;
}
#endif
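
// Illustrative usage sketch (hypothetical caller): allocate a 16-byte aligned
// scratch buffer suitable for _mm_load_ps/_mm_store_ps and release it with
// _mm_free. posix_memalign requires the alignment to be a multiple of
// sizeof(void *), which is why small alignments are widened above.
//
//     float *buf = (float *) _mm_malloc(64 * sizeof(float), 16);
//     if (buf) {
//         _mm_store_ps(buf, _mm_set1_ps(0.0f));
//         _mm_free(buf);
//     }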

// Conditionally store 8-bit integer elements from a into memory using mask
// (elements are not stored when the highest bit is not set in the corresponding
// element) and a non-temporal memory hint.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmove_si64
FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr)
{
    int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7);
    __m128 b = _mm_load_ps((const float *) mem_addr);
    int8x8_t masked =
        vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a),
                vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b))));
    vst1_s8((int8_t *) mem_addr, masked);
}
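
// Illustrative sketch (hypothetical values): the arithmetic shift by 7 turns
// each mask byte into 0x00 or 0xFF, so vbsl_s8 keeps the byte already in
// memory where the mask's top bit is clear and takes the byte from a where it
// is set.
//
//     __m64 data = _mm_set_pi8(8, 7, 6, 5, 4, 3, 2, 1);
//     __m64 mask = _mm_set_pi8(0, 0, 0, 0, (char) 0x80, (char) 0x80,
//                              (char) 0x80, (char) 0x80);
//     char buf[16] = {0};
//     _mm_maskmove_si64(data, mask, buf);  // only buf[0..3] receive 1, 2, 3, 4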

// Conditionally store 8-bit integer elements from a into memory using mask
// (elements are not stored when the highest bit is not set in the corresponding
// element) and a non-temporal memory hint.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_maskmovq
#define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr)

// Compare packed signed 16-bit integers in a and b, and store packed maximum
// values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pi16
FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b)
{
    return vreinterpret_m64_s16(
        vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
}

// Compare packed single-precision (32-bit) floating-point elements in a and b,
// and store packed maximum values in dst. dst does not follow the IEEE Standard
// for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or
// signed-zero values.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps
FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b)
{
#if SSE2NEON_PRECISE_MINMAX
    float32x4_t _a = vreinterpretq_f32_m128(a);
    float32x4_t _b = vreinterpretq_f32_m128(b);
    return vreinterpretq_m128_f32(vbslq_f32(vcgtq_f32(_a, _b), _a, _b));
#else
    return vreinterpretq_m128_f32(
        vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#endif
}

// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
// values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pu8
FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b)
{
    return vreinterpret_m64_u8(
        vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
}

// Compare the lower single-precision (32-bit) floating-point elements in a and
// b, store the maximum value in the lower element of dst, and copy the upper 3
// packed elements from a to the upper elements of dst. dst does not follow the
// IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when
// inputs are NaN or signed-zero values.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss
FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b)
{
    float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0);
    return vreinterpretq_m128_f32(
        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
}

// Compare packed signed 16-bit integers in a and b, and store packed minimum
// values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pi16
FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b)
{
    return vreinterpret_m64_s16(
        vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
}

// Compare packed single-precision (32-bit) floating-point elements in a and b,
// and store packed minimum values in dst. dst does not follow the IEEE Standard
// for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or
// signed-zero values.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps
FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b)
{
#if SSE2NEON_PRECISE_MINMAX
    float32x4_t _a = vreinterpretq_f32_m128(a);
    float32x4_t _b = vreinterpretq_f32_m128(b);
    return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b));
#else
    return vreinterpretq_m128_f32(
        vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#endif
}

// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
// values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pu8
FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b)
{
    return vreinterpret_m64_u8(
        vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)));
}

// Compare the lower single-precision (32-bit) floating-point elements in a and
// b, store the minimum value in the lower element of dst, and copy the upper 3
// packed elements from a to the upper elements of dst. dst does not follow the
// IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when
// inputs are NaN or signed-zero values.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss
FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b)
{
    float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0);
    return vreinterpretq_m128_f32(
        vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0));
}

// Move the lower single-precision (32-bit) floating-point element from b to the
// lower element of dst, and copy the upper 3 packed elements from a to the
// upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss
FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b)
{
    return vreinterpretq_m128_f32(
        vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0),
                       vreinterpretq_f32_m128(a), 0));
}

// Move the upper 2 single-precision (32-bit) floating-point elements from b to
// the lower 2 elements of dst, and copy the upper 2 elements from a to the
// upper 2 elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps
FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128_u64(
        vzip2q_u64(vreinterpretq_u64_m128(b), vreinterpretq_u64_m128(a)));
#else
    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
#endif
}

// Move the lower 2 single-precision (32-bit) floating-point elements from b to
// the upper 2 elements of dst, and copy the lower 2 elements from a to the
// lower 2 elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps
FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
{
    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
}

// Create mask from the most significant bit of each 8-bit element in a, and
// store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pi8
FORCE_INLINE int _mm_movemask_pi8(__m64 a)
{
    uint8x8_t input = vreinterpret_u8_m64(a);
#if defined(__aarch64__) || defined(_M_ARM64)
    static const int8_t shift[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    uint8x8_t tmp = vshr_n_u8(input, 7);
    return vaddv_u8(vshl_u8(tmp, vld1_s8(shift)));
#else
    // Refer to the implementation of `_mm_movemask_epi8`
    uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
    uint32x2_t paired16 =
        vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
    uint8x8_t paired32 =
        vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
    return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
#endif
}

// Set each bit of mask dst based on the most significant bit of the
// corresponding packed single-precision (32-bit) floating-point element in a.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps
FORCE_INLINE int _mm_movemask_ps(__m128 a)
{
    uint32x4_t input = vreinterpretq_u32_m128(a);
#if defined(__aarch64__) || defined(_M_ARM64)
    static const int32_t shift[4] = {0, 1, 2, 3};
    uint32x4_t tmp = vshrq_n_u32(input, 31);
    return vaddvq_u32(vshlq_u32(tmp, vld1q_s32(shift)));
#else
    // Uses the exact same method as _mm_movemask_epi8, see that for details.
    // Shift out everything but the sign bits with a 32-bit unsigned shift
    // right.
    uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
    // Merge the two pairs together with a 64-bit unsigned shift right + add.
    uint8x16_t paired =
        vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
    // Extract the result.
    return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
#endif
}
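
// Illustrative usage sketch (hypothetical caller): combine a packed compare
// with _mm_movemask_ps to branch on which lanes satisfied the predicate.
//
//     __m128 x = _mm_set_ps(4.0f, -3.0f, 2.0f, -1.0f);
//     int neg = _mm_movemask_ps(_mm_cmplt_ps(x, _mm_setzero_ps()));
//     // neg == 0x5: bits 0 and 2 are set because lanes 0 and 2 are negative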

// Multiply packed single-precision (32-bit) floating-point elements in a and b,
// and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_f32(
        vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Multiply the lower single-precision (32-bit) floating-point element in a and
// b, store the result in the lower element of dst, and copy the upper 3 packed
// elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss
FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_mul_ps(a, b));
}

// Multiply the packed unsigned 16-bit integers in a and b, producing
// intermediate 32-bit integers, and store the high 16 bits of the intermediate
// integers in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_pu16
FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b)
{
    return vreinterpret_m64_u16(vshrn_n_u32(
        vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16));
}

// Compute the bitwise OR of packed single-precision (32-bit) floating-point
// elements in a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps
FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_s32(
        vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
}

// Average packed unsigned 8-bit integers in a and b, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgb
#define _m_pavgb(a, b) _mm_avg_pu8(a, b)

// Average packed unsigned 16-bit integers in a and b, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgw
#define _m_pavgw(a, b) _mm_avg_pu16(a, b)

// Extract a 16-bit integer from a, selected with imm8, and store the result in
// the lower element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pextrw
#define _m_pextrw(a, imm) _mm_extract_pi16(a, imm)

// Copy a to dst, and insert the 16-bit integer i into dst at the location
// specified by imm8.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_pinsrw
#define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm)

// Compare packed signed 16-bit integers in a and b, and store packed maximum
// values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxsw
#define _m_pmaxsw(a, b) _mm_max_pi16(a, b)

// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
// values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxub
#define _m_pmaxub(a, b) _mm_max_pu8(a, b)

// Compare packed signed 16-bit integers in a and b, and store packed minimum
// values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminsw
#define _m_pminsw(a, b) _mm_min_pi16(a, b)

// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
// values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminub
#define _m_pminub(a, b) _mm_min_pu8(a, b)

// Create mask from the most significant bit of each 8-bit element in a, and
// store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmovmskb
#define _m_pmovmskb(a) _mm_movemask_pi8(a)

// Multiply the packed unsigned 16-bit integers in a and b, producing
// intermediate 32-bit integers, and store the high 16 bits of the intermediate
// integers in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmulhuw
#define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b)

// Fetch the line of data from memory that contains address p to a location in
// the cache hierarchy specified by the locality hint i.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch
FORCE_INLINE void _mm_prefetch(char const *p, int i)
{
    (void) i;
#if defined(_MSC_VER)
    switch (i) {
    case _MM_HINT_NTA:
        __prefetch2(p, 1);
        break;
    case _MM_HINT_T0:
        __prefetch2(p, 0);
        break;
    case _MM_HINT_T1:
        __prefetch2(p, 2);
        break;
    case _MM_HINT_T2:
        __prefetch2(p, 4);
        break;
    }
#else
    switch (i) {
    case _MM_HINT_NTA:
        __builtin_prefetch(p, 0, 0);
        break;
    case _MM_HINT_T0:
        __builtin_prefetch(p, 0, 3);
        break;
    case _MM_HINT_T1:
        __builtin_prefetch(p, 0, 2);
        break;
    case _MM_HINT_T2:
        __builtin_prefetch(p, 0, 1);
        break;
    }
#endif
}
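
// Illustrative usage sketch (hypothetical loop; process, src and n are made-up
// names): prefetch the block that will be needed a few iterations ahead. On
// GCC/Clang this maps to __builtin_prefetch with the locality levels chosen
// above.
//
//     for (size_t i = 0; i < n; i += 16) {
//         _mm_prefetch((const char *) &src[i + 64], _MM_HINT_T0);
//         process(&src[i]);
//     }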

// Compute the absolute differences of packed unsigned 8-bit integers in a and
// b, then horizontally sum each consecutive 8 differences to produce four
// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
// 16 bits of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_psadbw
#define _m_psadbw(a, b) _mm_sad_pu8(a, b)

// Shuffle 16-bit integers in a using the control in imm8, and store the results
// in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pshufw
#define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm)

// Compute the approximate reciprocal of packed single-precision (32-bit)
// floating-point elements in a, and store the results in dst. The maximum
// relative error for this approximation is less than 1.5*2^-12.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps
FORCE_INLINE __m128 _mm_rcp_ps(__m128 in)
{
    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in));
    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in)));
    return vreinterpretq_m128_f32(recip);
}
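
// Illustrative sketch (not part of the API; rcp_refined is a hypothetical
// helper): callers that need more accuracy than the rough estimate can run
// extra Newton-Raphson steps themselves, the same refinement _mm_div_ps uses
// on ARMv7-A. vrecpsq_f32 computes (2 - x * r), so multiplying it back in
// roughly doubles the number of correct bits per step.
//
//     static inline __m128 rcp_refined(__m128 x)
//     {
//         float32x4_t r = vreinterpretq_f32_m128(_mm_rcp_ps(x));
//         r = vmulq_f32(r, vrecpsq_f32(r, vreinterpretq_f32_m128(x)));
//         return vreinterpretq_m128_f32(r);
//     }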

// Compute the approximate reciprocal of the lower single-precision (32-bit)
// floating-point element in a, store the result in the lower element of dst,
// and copy the upper 3 packed elements from a to the upper elements of dst. The
// maximum relative error for this approximation is less than 1.5*2^-12.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss
FORCE_INLINE __m128 _mm_rcp_ss(__m128 a)
{
    return _mm_move_ss(a, _mm_rcp_ps(a));
}

// Compute the approximate reciprocal square root of packed single-precision
// (32-bit) floating-point elements in a, and store the results in dst. The
// maximum relative error for this approximation is less than 1.5*2^-12.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps
FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in)
{
    float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in));

    // Generate masks for detecting whether input has any 0.0f/-0.0f
    // (which becomes positive/negative infinity by IEEE-754 arithmetic rules).
    const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
    const uint32x4_t neg_inf = vdupq_n_u32(0xFF800000);
    const uint32x4_t has_pos_zero =
        vceqq_u32(pos_inf, vreinterpretq_u32_f32(out));
    const uint32x4_t has_neg_zero =
        vceqq_u32(neg_inf, vreinterpretq_u32_f32(out));

    out = vmulq_f32(
        out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out));

    // Set output vector element to infinity/negative-infinity if
    // the corresponding input vector element is 0.0f/-0.0f.
    out = vbslq_f32(has_pos_zero, (float32x4_t) pos_inf, out);
    out = vbslq_f32(has_neg_zero, (float32x4_t) neg_inf, out);

    return vreinterpretq_m128_f32(out);
}

// Compute the approximate reciprocal square root of the lower single-precision
// (32-bit) floating-point element in a, store the result in the lower element
// of dst, and copy the upper 3 packed elements from a to the upper elements of
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss
FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in)
{
    return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0);
}

// Compute the absolute differences of packed unsigned 8-bit integers in a and
// b, then horizontally sum each consecutive 8 differences to produce four
// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
// 16 bits of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_pu8
FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b)
{
    uint64x1_t t = vpaddl_u32(vpaddl_u16(
        vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b)))));
    return vreinterpret_m64_u16(
        vset_lane_u16((int) vget_lane_u64(t, 0), vdup_n_u16(0), 0));
}
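
// Illustrative sketch (hypothetical data): the chain of widening pairwise adds
// (vpaddl) above folds the eight absolute byte differences into one 16-bit sum
// held in lane 0 of the result:
//
//     __m64 row_a = _mm_set_pi8(9, 8, 7, 6, 5, 4, 3, 2);
//     __m64 row_b = _mm_set_pi8(1, 1, 1, 1, 1, 1, 1, 1);
//     int sad = _mm_extract_pi16(_mm_sad_pu8(row_a, row_b), 0);
//     // sad == 8 + 7 + 6 + 5 + 4 + 3 + 2 + 1 == 36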

// Macro: Set the flush zero bits of the MXCSR control and status register to
// the value in unsigned 32-bit integer a. The flush zero may contain any of the
// following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE
FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag)
{
    // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
    // regardless of the value of the FZ bit.
    union {
        fpcr_bitfield field;
#if defined(__aarch64__) || defined(_M_ARM64)
        uint64_t value;
#else
        uint32_t value;
#endif
    } r;

#if defined(__aarch64__) || defined(_M_ARM64)
    r.value = _sse2neon_get_fpcr();
#else
    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON;

#if defined(__aarch64__) || defined(_M_ARM64)
    _sse2neon_set_fpcr(r.value);
#else
    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
#endif
}

// Set packed single-precision (32-bit) floating-point elements in dst with the
// supplied values.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps
FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x)
{
    float ALIGN_STRUCT(16) data[4] = {x, y, z, w};
    return vreinterpretq_m128_f32(vld1q_f32(data));
}

// Broadcast single-precision (32-bit) floating-point value a to all elements of
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1
FORCE_INLINE __m128 _mm_set_ps1(float _w)
{
    return vreinterpretq_m128_f32(vdupq_n_f32(_w));
}

// Macro: Set the rounding mode bits of the MXCSR control and status register to
// the value in unsigned 32-bit integer a. The rounding mode may contain any of
// the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP,
// _MM_ROUND_TOWARD_ZERO
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE
FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding)
{
    union {
        fpcr_bitfield field;
#if defined(__aarch64__) || defined(_M_ARM64)
        uint64_t value;
#else
        uint32_t value;
#endif
    } r;

#if defined(__aarch64__) || defined(_M_ARM64)
    r.value = _sse2neon_get_fpcr();
#else
    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    switch (rounding) {
    case _MM_ROUND_TOWARD_ZERO:
        r.field.bit22 = 1;
        r.field.bit23 = 1;
        break;
    case _MM_ROUND_DOWN:
        r.field.bit22 = 0;
        r.field.bit23 = 1;
        break;
    case _MM_ROUND_UP:
        r.field.bit22 = 1;
        r.field.bit23 = 0;
        break;
    default: //_MM_ROUND_NEAREST
        r.field.bit22 = 0;
        r.field.bit23 = 0;
    }

#if defined(__aarch64__) || defined(_M_ARM64)
    _sse2neon_set_fpcr(r.value);
#else
    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
#endif
}
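
// Illustrative usage sketch (hypothetical caller): the bit22/bit23 pairs above
// map _MM_ROUND_* onto the FPCR/FPSCR RMode field, so rounding-sensitive
// conversions such as _mm_cvt_ss2si honor the selected mode:
//
//     unsigned int saved = _MM_GET_ROUNDING_MODE();
//     _MM_SET_ROUNDING_MODE(_MM_ROUND_TOWARD_ZERO);
//     int t = _mm_cvt_ss2si(_mm_set_ss(1.75f));  // 1 under round-toward-zero
//     _MM_SET_ROUNDING_MODE(saved);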
2455
2456
// Copy single-precision (32-bit) floating-point element a to the lower element
2457
// of dst, and zero the upper 3 elements.
2458
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss
2459
FORCE_INLINE __m128 _mm_set_ss(float a)
2460
{
2461
return vreinterpretq_m128_f32(vsetq_lane_f32(a, vdupq_n_f32(0), 0));
2462
}
2463
2464
// Broadcast single-precision (32-bit) floating-point value a to all elements of
2465
// dst.
2466
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps
2467
FORCE_INLINE __m128 _mm_set1_ps(float _w)
2468
{
2469
return vreinterpretq_m128_f32(vdupq_n_f32(_w));
2470
}
2471
2472
// Set the MXCSR control and status register with the value in unsigned 32-bit
2473
// integer a.
2474
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr
2475
// FIXME: _mm_setcsr() implementation supports changing the rounding mode only.
2476
FORCE_INLINE void _mm_setcsr(unsigned int a)
2477
{
2478
_MM_SET_ROUNDING_MODE(a);
2479
}
2480
2481
// Get the unsigned 32-bit value of the MXCSR control and status register.
2482
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr
2483
// FIXME: _mm_getcsr() implementation supports reading the rounding mode only.
2484
FORCE_INLINE unsigned int _mm_getcsr(void)
2485
{
2486
return _MM_GET_ROUNDING_MODE();
2487
}
2488
2489
// Set packed single-precision (32-bit) floating-point elements in dst with the
2490
// supplied values in reverse order.
2491
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps
2492
FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x)
2493
{
2494
float ALIGN_STRUCT(16) data[4] = {w, z, y, x};
2495
return vreinterpretq_m128_f32(vld1q_f32(data));
2496
}
2497
2498
// Return vector of type __m128 with all elements set to zero.
2499
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps
2500
FORCE_INLINE __m128 _mm_setzero_ps(void)
2501
{
2502
return vreinterpretq_m128_f32(vdupq_n_f32(0));
2503
}
2504
2505
// Shuffle 16-bit integers in a using the control in imm8, and store the results
2506
// in dst.
2507
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16
2508
#ifdef _sse2neon_shuffle
2509
#define _mm_shuffle_pi16(a, imm) \
2510
vreinterpret_m64_s16(vshuffle_s16( \
2511
vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \
2512
((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3)))
2513
#else
2514
#define _mm_shuffle_pi16(a, imm) \
2515
_sse2neon_define1( \
2516
__m64, a, int16x4_t ret; \
2517
ret = vmov_n_s16( \
2518
vget_lane_s16(vreinterpret_s16_m64(_a), (imm) & (0x3))); \
2519
ret = vset_lane_s16( \
2520
vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 2) & 0x3), ret, \
2521
1); \
2522
ret = vset_lane_s16( \
2523
vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 4) & 0x3), ret, \
2524
2); \
2525
ret = vset_lane_s16( \
2526
vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 6) & 0x3), ret, \
2527
3); \
2528
_sse2neon_return(vreinterpret_m64_s16(ret));)
2529
#endif
2530
2531
// Perform a serializing operation on all store-to-memory instructions that were
2532
// issued prior to this instruction. Guarantees that every store instruction
2533
// that precedes, in program order, is globally visible before any store
2534
// instruction which follows the fence in program order.
2535
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence
2536
FORCE_INLINE void _mm_sfence(void)
2537
{
2538
_sse2neon_smp_mb();
2539
}
2540
2541
// Perform a serializing operation on all load-from-memory and store-to-memory
2542
// instructions that were issued prior to this instruction. Guarantees that
2543
// every memory access that precedes, in program order, the memory fence
2544
// instruction is globally visible before any memory instruction which follows
2545
// the fence in program order.
2546
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence
2547
FORCE_INLINE void _mm_mfence(void)
2548
{
2549
_sse2neon_smp_mb();
2550
}
2551
2552
// Perform a serializing operation on all load-from-memory instructions that
2553
// were issued prior to this instruction. Guarantees that every load instruction
2554
// that precedes, in program order, is globally visible before any load
2555
// instruction which follows the fence in program order.
2556
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence
2557
FORCE_INLINE void _mm_lfence(void)
2558
{
2559
_sse2neon_smp_mb();
2560
}

// FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255)
// int imm)
#ifdef _sse2neon_shuffle
#define _mm_shuffle_ps(a, b, imm) \
    __extension__({ \
        float32x4_t _input1 = vreinterpretq_f32_m128(a); \
        float32x4_t _input2 = vreinterpretq_f32_m128(b); \
        float32x4_t _shuf = \
            vshuffleq_s32(_input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \
                          (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \
        vreinterpretq_m128_f32(_shuf); \
    })
#else // generic
#define _mm_shuffle_ps(a, b, imm) \
    _sse2neon_define2( \
        __m128, a, b, __m128 ret; switch (imm) { \
            case _MM_SHUFFLE(1, 0, 3, 2): \
                ret = _mm_shuffle_ps_1032(_a, _b); \
                break; \
            case _MM_SHUFFLE(2, 3, 0, 1): \
                ret = _mm_shuffle_ps_2301(_a, _b); \
                break; \
            case _MM_SHUFFLE(0, 3, 2, 1): \
                ret = _mm_shuffle_ps_0321(_a, _b); \
                break; \
            case _MM_SHUFFLE(2, 1, 0, 3): \
                ret = _mm_shuffle_ps_2103(_a, _b); \
                break; \
            case _MM_SHUFFLE(1, 0, 1, 0): \
                ret = _mm_movelh_ps(_a, _b); \
                break; \
            case _MM_SHUFFLE(1, 0, 0, 1): \
                ret = _mm_shuffle_ps_1001(_a, _b); \
                break; \
            case _MM_SHUFFLE(0, 1, 0, 1): \
                ret = _mm_shuffle_ps_0101(_a, _b); \
                break; \
            case _MM_SHUFFLE(3, 2, 1, 0): \
                ret = _mm_shuffle_ps_3210(_a, _b); \
                break; \
            case _MM_SHUFFLE(0, 0, 1, 1): \
                ret = _mm_shuffle_ps_0011(_a, _b); \
                break; \
            case _MM_SHUFFLE(0, 0, 2, 2): \
                ret = _mm_shuffle_ps_0022(_a, _b); \
                break; \
            case _MM_SHUFFLE(2, 2, 0, 0): \
                ret = _mm_shuffle_ps_2200(_a, _b); \
                break; \
            case _MM_SHUFFLE(3, 2, 0, 2): \
                ret = _mm_shuffle_ps_3202(_a, _b); \
                break; \
            case _MM_SHUFFLE(3, 2, 3, 2): \
                ret = _mm_movehl_ps(_b, _a); \
                break; \
            case _MM_SHUFFLE(1, 1, 3, 3): \
                ret = _mm_shuffle_ps_1133(_a, _b); \
                break; \
            case _MM_SHUFFLE(2, 0, 1, 0): \
                ret = _mm_shuffle_ps_2010(_a, _b); \
                break; \
            case _MM_SHUFFLE(2, 0, 0, 1): \
                ret = _mm_shuffle_ps_2001(_a, _b); \
                break; \
            case _MM_SHUFFLE(2, 0, 3, 2): \
                ret = _mm_shuffle_ps_2032(_a, _b); \
                break; \
            default: \
                ret = _mm_shuffle_ps_default(_a, _b, (imm)); \
                break; \
        } _sse2neon_return(ret);)
#endif
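
// The 8-bit selector packs four 2-bit lane indices; _MM_SHUFFLE(z, y, x, w)
// picks dst[0] = a[w], dst[1] = a[x], dst[2] = b[y], dst[3] = b[z].
// Illustrative sketch (values chosen arbitrarily):
//
//   __m128 a = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);
//   __m128 b = _mm_setr_ps(4.0f, 5.0f, 6.0f, 7.0f);
//   __m128 r = _mm_shuffle_ps(a, b, _MM_SHUFFLE(3, 2, 1, 0));
//   // r = {0.0f, 1.0f, 6.0f, 7.0f}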

// Compute the square root of packed single-precision (32-bit) floating-point
// elements in a, and store the results in dst.
// Due to ARMv7-A NEON's lack of a precise square root intrinsic, we implement
// square root by multiplying input in with its reciprocal square root before
// using the Newton-Raphson method to approximate the results.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps
FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in)));
#else
    float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in));

    // Test for vrsqrteq_f32(0) -> positive infinity case.
    // Change to zero, so that s * 1/sqrt(s) result is zero too.
    const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
    const uint32x4_t div_by_zero =
        vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
    recip = vreinterpretq_f32_u32(
        vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));

    recip = vmulq_f32(
        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
        recip);
    // Additional Newton-Raphson iteration for accuracy
    recip = vmulq_f32(
        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
        recip);

    // sqrt(s) = s * 1/sqrt(s)
    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
#endif
}
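
// The ARMv7-A path above relies on the identity sqrt(s) = s * 1/sqrt(s) and on
// vrsqrtsq_f32(x, y), which returns (3 - x * y) / 2, so each step refines the
// estimate r' = r * (3 - s * r * r) / 2 (one Newton-Raphson iteration for
// 1/sqrt(s)). Illustrative scalar sketch of the same refinement (the seed
// function is hypothetical):
//
//   float r = initial_rsqrt_estimate(s);   // e.g. what vrsqrte provides
//   r = r * (3.0f - s * r * r) * 0.5f;     // first iteration
//   r = r * (3.0f - s * r * r) * 0.5f;     // second iteration
//   float sqrt_s = s * r;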

// Compute the square root of the lower single-precision (32-bit) floating-point
// element in a, store the result in the lower element of dst, and copy the
// upper 3 packed elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss
FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
{
    float32_t value =
        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
    return vreinterpretq_m128_f32(
        vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
}

// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
// or a general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
{
    vst1q_f32(p, vreinterpretq_f32_m128(a));
}

// Store the lower single-precision (32-bit) floating-point element from a into
// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
// boundary or a general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1
FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
{
    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
    vst1q_f32(p, vdupq_n_f32(a0));
}

// Store the lower single-precision (32-bit) floating-point element from a into
// memory. mem_addr does not need to be aligned on any particular boundary.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss
FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
{
    vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
}

// Store the lower single-precision (32-bit) floating-point element from a into
// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
// boundary or a general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps
#define _mm_store1_ps _mm_store_ps1

// Store the upper 2 single-precision (32-bit) floating-point elements from a
// into memory.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pi
FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
{
    *p = vreinterpret_m64_f32(vget_high_f32(a));
}

// Store the lower 2 single-precision (32-bit) floating-point elements from a
// into memory.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pi
FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
{
    *p = vreinterpret_m64_f32(vget_low_f32(a));
}

// Store 4 single-precision (32-bit) floating-point elements from a into memory
// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
// general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps
FORCE_INLINE void _mm_storer_ps(float *p, __m128 a)
{
    float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a));
    float32x4_t rev = vextq_f32(tmp, tmp, 2);
    vst1q_f32(p, rev);
}
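
// vrev64q_f32 swaps elements within each 64-bit half ({0,1,2,3} -> {1,0,3,2})
// and vextq_f32(tmp, tmp, 2) rotates the vector by two lanes, so the combined
// effect is the full reversal {3,2,1,0}. Illustrative sketch:
//
//   float out[4];
//   _mm_storer_ps(out, _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f));
//   // out = {3.0f, 2.0f, 1.0f, 0.0f}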

// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
// elements) from a into memory. mem_addr does not need to be aligned on any
// particular boundary.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps
FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a)
{
    vst1q_f32(p, vreinterpretq_f32_m128(a));
}

// Stores 16-bits of integer data a at the address p.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16
FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a)
{
    vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0);
}

// Stores 64-bits of integer data a at the address p.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64
FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a)
{
    vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0);
}

// Store 64-bits of integer data from a into memory using a non-temporal memory
// hint.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pi
FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a)
{
    vst1_s64((int64_t *) p, vreinterpret_s64_m64(a));
}

// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-
// point elements) from a into memory using a non-temporal memory hint.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps
FORCE_INLINE void _mm_stream_ps(float *p, __m128 a)
{
#if __has_builtin(__builtin_nontemporal_store)
    __builtin_nontemporal_store(a, (float32x4_t *) p);
#else
    vst1q_f32(p, vreinterpretq_f32_m128(a));
#endif
}

// Subtract packed single-precision (32-bit) floating-point elements in b from
// packed single-precision (32-bit) floating-point elements in a, and store the
// results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps
FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_f32(
        vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Subtract the lower single-precision (32-bit) floating-point element in b from
// the lower single-precision (32-bit) floating-point element in a, store the
// result in the lower element of dst, and copy the upper 3 packed elements from
// a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss
FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b)
{
    return _mm_move_ss(a, _mm_sub_ps(a, b));
}

// Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision
// (32-bit) floating-point elements in row0, row1, row2, and row3, and store the
// transposed matrix in these vectors (row0 now contains column 0, etc.).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=MM_TRANSPOSE4_PS
#define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
    do { \
        float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \
        float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \
        row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \
                            vget_low_f32(ROW23.val[0])); \
        row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \
                            vget_low_f32(ROW23.val[1])); \
        row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \
                            vget_high_f32(ROW23.val[0])); \
        row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \
                            vget_high_f32(ROW23.val[1])); \
    } while (0)
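
// Illustrative usage sketch (the row variables are hypothetical locals):
//
//   __m128 r0 = _mm_setr_ps(0.0f, 1.0f, 2.0f, 3.0f);
//   __m128 r1 = _mm_setr_ps(4.0f, 5.0f, 6.0f, 7.0f);
//   __m128 r2 = _mm_setr_ps(8.0f, 9.0f, 10.0f, 11.0f);
//   __m128 r3 = _mm_setr_ps(12.0f, 13.0f, 14.0f, 15.0f);
//   _MM_TRANSPOSE4_PS(r0, r1, r2, r3);
//   // r0 = {0, 4, 8, 12}, r1 = {1, 5, 9, 13},
//   // r2 = {2, 6, 10, 14}, r3 = {3, 7, 11, 15}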

// according to the documentation, these intrinsics behave the same as the
// non-'u' versions. We'll just alias them here.
#define _mm_ucomieq_ss _mm_comieq_ss
#define _mm_ucomige_ss _mm_comige_ss
#define _mm_ucomigt_ss _mm_comigt_ss
#define _mm_ucomile_ss _mm_comile_ss
#define _mm_ucomilt_ss _mm_comilt_ss
#define _mm_ucomineq_ss _mm_comineq_ss

// Return vector of type __m128i with undefined elements.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_undefined_si128
FORCE_INLINE __m128i _mm_undefined_si128(void)
{
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"
#endif
    __m128i a;
#if defined(_MSC_VER)
    a = _mm_setzero_si128();
#endif
    return a;
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic pop
#endif
}

// Return vector of type __m128 with undefined elements.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps
FORCE_INLINE __m128 _mm_undefined_ps(void)
{
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wuninitialized"
#endif
    __m128 a;
#if defined(_MSC_VER)
    a = _mm_setzero_ps();
#endif
    return a;
#if defined(__GNUC__) || defined(__clang__)
#pragma GCC diagnostic pop
#endif
}

// Unpack and interleave single-precision (32-bit) floating-point elements from
// the high half of a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps
FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128_f32(
        vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#else
    float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b));
    float32x2x2_t result = vzip_f32(a1, b1);
    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
#endif
}

// Unpack and interleave single-precision (32-bit) floating-point elements from
// the low half of a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps
FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128_f32(
        vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#else
    float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b));
    float32x2x2_t result = vzip_f32(a1, b1);
    return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1]));
#endif
}

// Compute the bitwise XOR of packed single-precision (32-bit) floating-point
// elements in a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps
FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_s32(
        veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b)));
}

/* SSE2 */

// Add packed 16-bit integers in a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16
FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Add packed 32-bit integers in a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32
FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Add packed 64-bit integers in a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64
FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s64(
        vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
}

// Add packed 8-bit integers in a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8
FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s8(
        vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Add packed double-precision (64-bit) floating-point elements in a and b, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd
FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(
        vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2];
    c[0] = da[0] + db[0];
    c[1] = da[1] + db[1];
    return vld1q_f32((float32_t *) c);
#endif
}

// Add the lower double-precision (64-bit) floating-point element in a and b,
// store the result in the lower element of dst, and copy the upper element from
// a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd
FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return _mm_move_sd(a, _mm_add_pd(a, b));
#else
    double *da = (double *) &a;
    double *db = (double *) &b;
    double c[2];
    c[0] = da[0] + db[0];
    c[1] = da[1];
    return vld1q_f32((float32_t *) c);
#endif
}

// Add 64-bit integers a and b, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_si64
FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b)
{
    return vreinterpret_m64_s64(
        vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
}

// Add packed signed 16-bit integers in a and b using saturation, and store the
// results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16
FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s16(
        vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Add packed signed 8-bit integers in a and b using saturation, and store the
// results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8
FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s8(
        vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Add packed unsigned 16-bit integers in a and b using saturation, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16
FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
}

// Add packed unsigned 8-bit integers in a and b using saturation, and store the
// results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8
FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
}
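
// The vqadd* intrinsics clamp instead of wrapping, matching the x86 saturating
// adds. Illustrative sketch for the signed 16-bit case:
//
//   __m128i x = _mm_set1_epi16(32000);
//   __m128i y = _mm_set1_epi16(1000);
//   __m128i r = _mm_adds_epi16(x, y);   // every lane saturates to 32767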

// Compute the bitwise AND of packed double-precision (64-bit) floating-point
// elements in a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd
FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b)
{
    return vreinterpretq_m128d_s64(
        vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
}

// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
// and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128
FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Compute the bitwise NOT of packed double-precision (64-bit) floating-point
// elements in a and then AND with b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd
FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b)
{
    // *NOTE* argument swap
    return vreinterpretq_m128d_s64(
        vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a)));
}

// Compute the bitwise NOT of 128 bits (representing integer data) in a and then
// AND with b, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128
FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vbicq_s32(vreinterpretq_s32_m128i(b),
                  vreinterpretq_s32_m128i(a)));  // *NOTE* argument swap
}
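
// Both andnot variants compute dst = (~a) & b, which is why the NEON BIC
// operands are swapped: vbicq(x, y) evaluates x & ~y. Illustrative sketch:
//
//   __m128i mask = _mm_set1_epi32(0x0000FFFF);
//   __m128i data = _mm_set1_epi32(0x12345678);
//   __m128i r = _mm_andnot_si128(mask, data);   // every lane = 0x12340000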

// Average packed unsigned 16-bit integers in a and b, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16
FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b)
{
    return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a),
                                 vreinterpretq_u16_m128i(b));
}

// Average packed unsigned 8-bit integers in a and b, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8
FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
}
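
// vrhadd computes the rounding average (a + b + 1) >> 1 without intermediate
// overflow, which is exactly the x86 PAVGB/PAVGW definition. Illustrative
// sketch:
//
//   __m128i x = _mm_set1_epi8((char) 250);
//   __m128i y = _mm_set1_epi8((char) 251);
//   __m128i r = _mm_avg_epu8(x, y);   // every lane holds (250 + 251 + 1) / 2 = 251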

// Shift a left by imm8 bytes while shifting in zeros, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128
#define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm)

// Shift a right by imm8 bytes while shifting in zeros, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128
#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)

// Cast vector of type __m128d to type __m128. This intrinsic is only used for
// compilation and does not generate any instructions, thus it has zero latency.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps
FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
{
    return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
}

// Cast vector of type __m128d to type __m128i. This intrinsic is only used for
// compilation and does not generate any instructions, thus it has zero latency.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128
FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
{
    return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
}

// Cast vector of type __m128 to type __m128d. This intrinsic is only used for
// compilation and does not generate any instructions, thus it has zero latency.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd
FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
{
    return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
}

// Cast vector of type __m128 to type __m128i. This intrinsic is only used for
// compilation and does not generate any instructions, thus it has zero latency.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128
FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
{
    return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
}

// Cast vector of type __m128i to type __m128d. This intrinsic is only used for
// compilation and does not generate any instructions, thus it has zero latency.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd
FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
#else
    return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
#endif
}

// Cast vector of type __m128i to type __m128. This intrinsic is only used for
// compilation and does not generate any instructions, thus it has zero latency.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps
FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
{
    return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
}

// Invalidate and flush the cache line that contains p from all levels of the
// cache hierarchy.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush
#if defined(__APPLE__)
#include <libkern/OSCacheControl.h>
#endif
FORCE_INLINE void _mm_clflush(void const *p)
{
    (void) p;

    /* sys_icache_invalidate is supported since macOS 10.5.
     * However, it does not work on non-jailbroken iOS devices, although the
     * compilation is successful.
     */
#if defined(__APPLE__)
    sys_icache_invalidate((void *) (uintptr_t) p, SSE2NEON_CACHELINE_SIZE);
#elif defined(__GNUC__) || defined(__clang__)
    uintptr_t ptr = (uintptr_t) p;
    __builtin___clear_cache((char *) ptr,
                            (char *) ptr + SSE2NEON_CACHELINE_SIZE);
#elif (_MSC_VER) && SSE2NEON_INCLUDE_WINDOWS_H
    FlushInstructionCache(GetCurrentProcess(), p, SSE2NEON_CACHELINE_SIZE);
#endif
}
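
// Illustrative usage sketch (buf is a hypothetical buffer that was just
// written, e.g. freshly generated code):
//
//   unsigned char buf[SSE2NEON_CACHELINE_SIZE];
//   buf[0] = 0xC3;
//   _mm_clflush(buf);   // flush/invalidate the cache line holding buf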

// Compare packed 16-bit integers in a and b for equality, and store the results
// in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16
FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Compare packed 32-bit integers in a and b for equality, and store the results
// in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32
FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u32(
        vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Compare packed 8-bit integers in a and b for equality, and store the results
// in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8
FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// for equality, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd
FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_u64(
        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
    uint32x4_t cmp =
        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
    uint32x4_t swapped = vrev64q_u32(cmp);
    return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped));
#endif
}
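
// On ARMv7-A there is no 64-bit vector compare, so the fallback above tests
// the two 32-bit halves of each double and folds them with the 64-bit-wise
// reversed copy: a 64-bit lane ends up all-ones only when both of its halves
// matched. Illustrative sketch of the idea for one lane:
//
//   // eq_lo = (a_lo == b_lo) ? 0xFFFFFFFF : 0
//   // eq_hi = (a_hi == b_hi) ? 0xFFFFFFFF : 0
//   // lane  = cmp & vrev64q(cmp) = {eq_lo & eq_hi, eq_hi & eq_lo}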

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for equality, store the result in the lower element of dst, and copy the
// upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd
FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b)
{
    return _mm_move_sd(a, _mm_cmpeq_pd(a, b));
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// for greater-than-or-equal, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd
FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_u64(
        vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for greater-than-or-equal, store the result in the lower element of dst,
// and copy the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd
FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return _mm_move_sd(a, _mm_cmpge_pd(a, b));
#else
    // expand "_mm_cmpge_pd()" to reduce unnecessary operations
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = a1;

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare packed signed 16-bit integers in a and b for greater-than, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16
FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Compare packed signed 32-bit integers in a and b for greater-than, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32
FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u32(
        vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Compare packed signed 8-bit integers in a and b for greater-than, and store
// the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8
FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// for greater-than, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd
FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_u64(
        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for greater-than, store the result in the lower element of dst, and copy
// the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd
FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return _mm_move_sd(a, _mm_cmpgt_pd(a, b));
#else
    // expand "_mm_cmpgt_pd()" to reduce unnecessary operations
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = a1;

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// for less-than-or-equal, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd
FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_u64(
        vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for less-than-or-equal, store the result in the lower element of dst, and
// copy the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd
FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return _mm_move_sd(a, _mm_cmple_pd(a, b));
#else
    // expand "_mm_cmple_pd()" to reduce unnecessary operations
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = a1;

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare packed signed 16-bit integers in a and b for less-than, and store the
// results in dst. Note: This intrinsic emits the pcmpgtw instruction with the
// order of the operands switched.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16
FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
}

// Compare packed signed 32-bit integers in a and b for less-than, and store the
// results in dst. Note: This intrinsic emits the pcmpgtd instruction with the
// order of the operands switched.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32
FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u32(
        vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Compare packed signed 8-bit integers in a and b for less-than, and store the
// results in dst. Note: This intrinsic emits the pcmpgtb instruction with the
// order of the operands switched.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8
FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// for less-than, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd
FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_u64(
        vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for less-than, store the result in the lower element of dst, and copy the
// upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd
FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return _mm_move_sd(a, _mm_cmplt_pd(a, b));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] = a1;

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// for not-equal, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd
FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64(
        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)))));
#else
    // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
    uint32x4_t cmp =
        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
    uint32x4_t swapped = vrev64q_u32(cmp);
    return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped)));
#endif
}

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for not-equal, store the result in the lower element of dst, and copy the
// upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd
FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b)
{
    return _mm_move_sd(a, _mm_cmpneq_pd(a, b));
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// for not-greater-than-or-equal, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd
FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_u64(veorq_u64(
        vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
        vdupq_n_u64(UINT64_MAX)));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] =
        !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] =
        !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for not-greater-than-or-equal, store the result in the lower element of
// dst, and copy the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd
FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b)
{
    return _mm_move_sd(a, _mm_cmpnge_pd(a, b));
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// for not-greater-than, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_pd
FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_u64(veorq_u64(
        vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
        vdupq_n_u64(UINT64_MAX)));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] =
        !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] =
        !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for not-greater-than, store the result in the lower element of dst, and
// copy the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd
FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b)
{
    return _mm_move_sd(a, _mm_cmpngt_pd(a, b));
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// for not-less-than-or-equal, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd
FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_u64(veorq_u64(
        vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
        vdupq_n_u64(UINT64_MAX)));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] =
        !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] =
        !((*(double *) &a1) <= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for not-less-than-or-equal, store the result in the lower element of dst,
// and copy the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd
FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b)
{
    return _mm_move_sd(a, _mm_cmpnle_pd(a, b));
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// for not-less-than, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd
FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_u64(veorq_u64(
        vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)),
        vdupq_n_u64(UINT64_MAX)));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] =
        !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0);
    d[1] =
        !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b for not-less-than, store the result in the lower element of dst, and copy
// the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd
FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b)
{
    return _mm_move_sd(a, _mm_cmpnlt_pd(a, b));
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// to see if neither is NaN, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd
FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // Excluding NaNs, any two floating point numbers can be compared.
    uint64x2_t not_nan_a =
        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
    uint64x2_t not_nan_b =
        vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
    return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
            (*(double *) &b0) == (*(double *) &b0))
               ? ~UINT64_C(0)
               : UINT64_C(0);
    d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
            (*(double *) &b1) == (*(double *) &b1))
               ? ~UINT64_C(0)
               : UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}
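
// The ordered/unordered predicates rely on the IEEE-754 rule that a NaN never
// compares equal to itself, so (x == x) is a portable "is not NaN" test.
// Illustrative sketch (NAN comes from <math.h>):
//
//   __m128d v = _mm_set_pd(0.0, NAN);   // lower lane NaN, upper lane 0.0
//   __m128d m = _mm_cmpord_pd(v, v);
//   // lane 0 (the NaN) -> all zeros, lane 1 -> all ones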

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b to see if neither is NaN, store the result in the lower element of dst, and
// copy the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd
FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return _mm_move_sd(a, _mm_cmpord_pd(a, b));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t d[2];
    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
            (*(double *) &b0) == (*(double *) &b0))
               ? ~UINT64_C(0)
               : UINT64_C(0);
    d[1] = a1;

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare packed double-precision (64-bit) floating-point elements in a and b
// to see if either is NaN, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd
FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    // Two NaNs are not equal in comparison operation.
    uint64x2_t not_nan_a =
        vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a));
    uint64x2_t not_nan_b =
        vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b));
    return vreinterpretq_m128d_s32(
        vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b))));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
    uint64_t d[2];
    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
            (*(double *) &b0) == (*(double *) &b0))
               ? UINT64_C(0)
               : ~UINT64_C(0);
    d[1] = ((*(double *) &a1) == (*(double *) &a1) &&
            (*(double *) &b1) == (*(double *) &b1))
               ? UINT64_C(0)
               : ~UINT64_C(0);

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare the lower double-precision (64-bit) floating-point elements in a and
// b to see if either is NaN, store the result in the lower element of dst, and
// copy the upper element from a to the upper element of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd
FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return _mm_move_sd(a, _mm_cmpunord_pd(a, b));
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
    uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
    uint64_t d[2];
    d[0] = ((*(double *) &a0) == (*(double *) &a0) &&
            (*(double *) &b0) == (*(double *) &b0))
               ? UINT64_C(0)
               : ~UINT64_C(0);
    d[1] = a1;

    return vreinterpretq_m128d_u64(vld1q_u64(d));
#endif
}

// Compare the lower double-precision (64-bit) floating-point element in a and b
// for greater-than-or-equal, and return the boolean result (0 or 1).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd
FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1;
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));

    return (*(double *) &a0 >= *(double *) &b0);
#endif
}

// Compare the lower double-precision (64-bit) floating-point element in a and b
// for greater-than, and return the boolean result (0 or 1).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd
FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1;
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));

    return (*(double *) &a0 > *(double *) &b0);
#endif
}

// Compare the lower double-precision (64-bit) floating-point element in a and b
// for less-than-or-equal, and return the boolean result (0 or 1).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd
FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1;
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));

    return (*(double *) &a0 <= *(double *) &b0);
#endif
}

// Compare the lower double-precision (64-bit) floating-point element in a and b
// for less-than, and return the boolean result (0 or 1).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd
FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1;
#else
    uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
    uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));

    return (*(double *) &a0 < *(double *) &b0);
#endif
}

// Compare the lower double-precision (64-bit) floating-point element in a and b
// for equality, and return the boolean result (0 or 1).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd
FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1;
#else
    uint32x4_t a_not_nan =
        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a));
    uint32x4_t b_not_nan =
        vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b));
    uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
    uint32x4_t a_eq_b =
        vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b));
    uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan),
                                       vreinterpretq_u64_u32(a_eq_b));
    return vgetq_lane_u64(and_results, 0) & 0x1;
#endif
}

// Compare the lower double-precision (64-bit) floating-point element in a and b
// for not-equal, and return the boolean result (0 or 1).
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd
FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b)
{
    return !_mm_comieq_sd(a, b);
}

// Convert packed signed 32-bit integers in a to packed double-precision
// (64-bit) floating-point elements, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd
FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(
        vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))));
#else
    double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
    double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1);
    return _mm_set_pd(a1, a0);
#endif
}

// Convert packed signed 32-bit integers in a to packed single-precision
// (32-bit) floating-point elements, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps
FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a)
{
    return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a)));
}

// Convert packed double-precision (64-bit) floating-point elements in a to
// packed 32-bit integers, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32
FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a)
{
// vrnd32xq_f64 not supported on clang
#if defined(__ARM_FEATURE_FRINT) && !defined(__clang__)
    float64x2_t rounded = vrnd32xq_f64(vreinterpretq_f64_m128d(a));
    int64x2_t integers = vcvtq_s64_f64(rounded);
    return vreinterpretq_m128i_s32(
        vcombine_s32(vmovn_s64(integers), vdup_n_s32(0)));
#else
    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
    double d0 = ((double *) &rnd)[0];
    double d1 = ((double *) &rnd)[1];
    return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0);
#endif
}

// Convert packed double-precision (64-bit) floating-point elements in a to
// packed 32-bit integers, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32
FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a)
{
    __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
    double d0 = ((double *) &rnd)[0];
    double d1 = ((double *) &rnd)[1];
    int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1};
    return vreinterpret_m64_s32(vld1_s32(data));
}

// Convert packed double-precision (64-bit) floating-point elements in a to
// packed single-precision (32-bit) floating-point elements, and store the
// results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps
FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a));
    return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0)));
#else
    float a0 = (float) ((double *) &a)[0];
    float a1 = (float) ((double *) &a)[1];
    return _mm_set_ps(0, 0, a1, a0);
#endif
}

// Convert packed signed 32-bit integers in a to packed double-precision
// (64-bit) floating-point elements, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd
FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(
        vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a))));
#else
    double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0);
    double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1);
    return _mm_set_pd(a1, a0);
#endif
}
3882
3883
// Convert packed single-precision (32-bit) floating-point elements in a to
3884
// packed 32-bit integers, and store the results in dst.
3885
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32
3886
// *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A
3887
// does not support! It is supported on ARMv8-A however.
3888
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a)
3889
{
3890
#if defined(__ARM_FEATURE_FRINT)
3891
return vreinterpretq_m128i_s32(vcvtq_s32_f32(vrnd32xq_f32(a)));
3892
#elif (defined(__aarch64__) || defined(_M_ARM64)) || \
3893
defined(__ARM_FEATURE_DIRECTED_ROUNDING)
3894
switch (_MM_GET_ROUNDING_MODE()) {
3895
case _MM_ROUND_NEAREST:
3896
return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a));
3897
case _MM_ROUND_DOWN:
3898
return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a));
3899
case _MM_ROUND_UP:
3900
return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a));
3901
default: // _MM_ROUND_TOWARD_ZERO
3902
return vreinterpretq_m128i_s32(vcvtq_s32_f32(a));
3903
}
3904
#else
3905
float *f = (float *) &a;
3906
switch (_MM_GET_ROUNDING_MODE()) {
3907
case _MM_ROUND_NEAREST: {
3908
uint32x4_t signmask = vdupq_n_u32(0x80000000);
3909
float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
3910
vdupq_n_f32(0.5f)); /* +/- 0.5 */
3911
int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
3912
vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
3913
int32x4_t r_trunc = vcvtq_s32_f32(
3914
vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
3915
int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
3916
vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
3917
int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
3918
vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
3919
float32x4_t delta = vsubq_f32(
3920
vreinterpretq_f32_m128(a),
3921
vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
3922
uint32x4_t is_delta_half =
3923
vceqq_f32(delta, half); /* delta == +/- 0.5 */
3924
return vreinterpretq_m128i_s32(
3925
vbslq_s32(is_delta_half, r_even, r_normal));
3926
}
3927
case _MM_ROUND_DOWN:
3928
return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]),
3929
floorf(f[0]));
3930
case _MM_ROUND_UP:
3931
return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]),
3932
ceilf(f[0]));
3933
default: // _MM_ROUND_TOWARD_ZERO
3934
return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1],
3935
(int32_t) f[0]);
3936
}
3937
#endif
3938
}
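// Usage sketch (illustrative values, assuming the default rounding mode
// _MM_ROUND_NEAREST): ties round to the nearest even integer, e.g.
//   _mm_cvtps_epi32(_mm_setr_ps(1.5f, 2.5f, -0.5f, 3.7f)) -> {2, 2, 0, 4}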
3939
3940
// Convert packed single-precision (32-bit) floating-point elements in a to
3941
// packed double-precision (64-bit) floating-point elements, and store the
3942
// results in dst.
3943
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd
3944
FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a)
3945
{
3946
#if defined(__aarch64__) || defined(_M_ARM64)
3947
return vreinterpretq_m128d_f64(
3948
vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a))));
3949
#else
3950
double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
3951
double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
3952
return _mm_set_pd(a1, a0);
3953
#endif
3954
}
3955
3956
// Copy the lower double-precision (64-bit) floating-point element of a to dst.
3957
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64
3958
FORCE_INLINE double _mm_cvtsd_f64(__m128d a)
3959
{
3960
#if defined(__aarch64__) || defined(_M_ARM64)
3961
return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0);
3962
#else
3963
return ((double *) &a)[0];
3964
#endif
3965
}
3966
3967
// Convert the lower double-precision (64-bit) floating-point element in a to a
3968
// 32-bit integer, and store the result in dst.
3969
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32
3970
FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a)
3971
{
3972
#if defined(__aarch64__) || defined(_M_ARM64)
3973
return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
3974
#else
3975
__m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3976
double ret = ((double *) &rnd)[0];
3977
return (int32_t) ret;
3978
#endif
3979
}
3980
3981
// Convert the lower double-precision (64-bit) floating-point element in a to a
3982
// 64-bit integer, and store the result in dst.
3983
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64
3984
FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a)
3985
{
3986
#if defined(__aarch64__) || defined(_M_ARM64)
3987
return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0);
3988
#else
3989
__m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION);
3990
double ret = ((double *) &rnd)[0];
3991
return (int64_t) ret;
3992
#endif
3993
}
3994
3995
// Convert the lower double-precision (64-bit) floating-point element in a to a
3996
// 64-bit integer, and store the result in dst.
3997
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64x
3998
#define _mm_cvtsd_si64x _mm_cvtsd_si64
3999
4000
// Convert the lower double-precision (64-bit) floating-point element in b to a
4001
// single-precision (32-bit) floating-point element, store the result in the
4002
// lower element of dst, and copy the upper 3 packed elements from a to the
4003
// upper elements of dst.
4004
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss
4005
FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b)
4006
{
4007
#if defined(__aarch64__) || defined(_M_ARM64)
4008
return vreinterpretq_m128_f32(vsetq_lane_f32(
4009
vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0),
4010
vreinterpretq_f32_m128(a), 0));
4011
#else
4012
return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0],
4013
vreinterpretq_f32_m128(a), 0));
4014
#endif
4015
}
4016
4017
// Copy the lower 32-bit integer in a to dst.
4018
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32
4019
FORCE_INLINE int _mm_cvtsi128_si32(__m128i a)
4020
{
4021
return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0);
4022
}
4023
4024
// Copy the lower 64-bit integer in a to dst.
4025
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64
4026
FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a)
4027
{
4028
return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0);
4029
}
4030
4031
// Copy the lower 64-bit integer in a to dst.
4032
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
4033
#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4034
4035
// Convert the signed 32-bit integer b to a double-precision (64-bit)
4036
// floating-point element, store the result in the lower element of dst, and
4037
// copy the upper element from a to the upper element of dst.
4038
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd
4039
FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b)
4040
{
4041
#if defined(__aarch64__) || defined(_M_ARM64)
4042
return vreinterpretq_m128d_f64(
4043
vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
4044
#else
4045
double bf = (double) b;
4046
return vreinterpretq_m128d_s64(
4047
vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
4048
#endif
4049
}
4050
4051
// Copy the lower 64-bit integer in a to dst.
4052
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x
4053
#define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a)
4054
4055
// Copy 32-bit integer a to the lower elements of dst, and zero the upper
4056
// elements of dst.
4057
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128
4058
FORCE_INLINE __m128i _mm_cvtsi32_si128(int a)
4059
{
4060
return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0));
4061
}
4062
4063
// Convert the signed 64-bit integer b to a double-precision (64-bit)
4064
// floating-point element, store the result in the lower element of dst, and
4065
// copy the upper element from a to the upper element of dst.
4066
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd
4067
FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b)
4068
{
4069
#if defined(__aarch64__) || defined(_M_ARM64)
4070
return vreinterpretq_m128d_f64(
4071
vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0));
4072
#else
4073
double bf = (double) b;
4074
return vreinterpretq_m128d_s64(
4075
vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0));
4076
#endif
4077
}
4078
4079
// Copy 64-bit integer a to the lower element of dst, and zero the upper
4080
// element.
4081
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_si128
4082
FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a)
4083
{
4084
return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0));
4085
}
4086
4087
// Copy 64-bit integer a to the lower element of dst, and zero the upper
4088
// element.
4089
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_si128
4090
#define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a)
4091
4092
// Convert the signed 64-bit integer b to a double-precision (64-bit)
4093
// floating-point element, store the result in the lower element of dst, and
4094
// copy the upper element from a to the upper element of dst.
4095
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_sd
4096
#define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b)
4097
4098
// Convert the lower single-precision (32-bit) floating-point element in b to a
4099
// double-precision (64-bit) floating-point element, store the result in the
4100
// lower element of dst, and copy the upper element from a to the upper element
4101
// of dst.
4102
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd
4103
FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b)
4104
{
4105
double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0);
4106
#if defined(__aarch64__) || defined(_M_ARM64)
4107
return vreinterpretq_m128d_f64(
4108
vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0));
4109
#else
4110
return vreinterpretq_m128d_s64(
4111
vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0));
4112
#endif
4113
}
4114
4115
// Convert packed double-precision (64-bit) floating-point elements in a to
4116
// packed 32-bit integers with truncation, and store the results in dst.
4117
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32
4118
FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a)
4119
{
4120
double a0 = ((double *) &a)[0];
4121
double a1 = ((double *) &a)[1];
4122
return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0);
4123
}
4124
4125
// Convert packed double-precision (64-bit) floating-point elements in a to
4126
// packed 32-bit integers with truncation, and store the results in dst.
4127
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32
4128
FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a)
4129
{
4130
double a0 = ((double *) &a)[0];
4131
double a1 = ((double *) &a)[1];
4132
int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1};
4133
return vreinterpret_m64_s32(vld1_s32(data));
4134
}
4135
4136
// Convert packed single-precision (32-bit) floating-point elements in a to
4137
// packed 32-bit integers with truncation, and store the results in dst.
4138
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32
4139
FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a)
4140
{
4141
return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)));
4142
}
4143
4144
// Convert the lower double-precision (64-bit) floating-point element in a to a
4145
// 32-bit integer with truncation, and store the result in dst.
4146
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32
4147
FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a)
4148
{
4149
double ret = *((double *) &a);
4150
return (int32_t) ret;
4151
}
4152
4153
// Convert the lower double-precision (64-bit) floating-point element in a to a
4154
// 64-bit integer with truncation, and store the result in dst.
4155
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64
4156
FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a)
4157
{
4158
#if defined(__aarch64__) || defined(_M_ARM64)
4159
return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0);
4160
#else
4161
double ret = *((double *) &a);
4162
return (int64_t) ret;
4163
#endif
4164
}
4165
4166
// Convert the lower double-precision (64-bit) floating-point element in a to a
4167
// 64-bit integer with truncation, and store the result in dst.
4168
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64x
4169
#define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a)
4170
4171
// Divide packed double-precision (64-bit) floating-point elements in a by
4172
// packed elements in b, and store the results in dst.
4173
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd
4174
FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b)
4175
{
4176
#if defined(__aarch64__) || defined(_M_ARM64)
4177
return vreinterpretq_m128d_f64(
4178
vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4179
#else
4180
double *da = (double *) &a;
4181
double *db = (double *) &b;
4182
double c[2];
4183
c[0] = da[0] / db[0];
4184
c[1] = da[1] / db[1];
4185
return vld1q_f32((float32_t *) c);
4186
#endif
4187
}
4188
4189
// Divide the lower double-precision (64-bit) floating-point element in a by the
4190
// lower double-precision (64-bit) floating-point element in b, store the result
4191
// in the lower element of dst, and copy the upper element from a to the upper
4192
// element of dst.
4193
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd
4194
FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b)
4195
{
4196
#if defined(__aarch64__) || defined(_M_ARM64)
4197
float64x2_t tmp =
4198
vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b));
4199
return vreinterpretq_m128d_f64(
4200
vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1));
4201
#else
4202
return _mm_move_sd(a, _mm_div_pd(a, b));
4203
#endif
4204
}
4205
4206
// Extract a 16-bit integer from a, selected with imm8, and store the result in
4207
// the lower element of dst.
4208
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16
4209
// FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm)
4210
#define _mm_extract_epi16(a, imm) \
4211
vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm))
4212
4213
// Copy a to dst, and insert the 16-bit integer i into dst at the location
4214
// specified by imm8.
4215
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16
4216
// FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b,
4217
// __constrange(0,8) int imm)
4218
#define _mm_insert_epi16(a, b, imm) \
4219
vreinterpretq_m128i_s16( \
4220
vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm)))
4221
4222
// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point
4223
// elements) from memory into dst. mem_addr must be aligned on a 16-byte
4224
// boundary or a general-protection exception may be generated.
4225
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd
4226
FORCE_INLINE __m128d _mm_load_pd(const double *p)
4227
{
4228
#if defined(__aarch64__) || defined(_M_ARM64)
4229
return vreinterpretq_m128d_f64(vld1q_f64(p));
4230
#else
4231
const float *fp = (const float *) p;
4232
float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]};
4233
return vreinterpretq_m128d_f32(vld1q_f32(data));
4234
#endif
4235
}
4236
4237
// Load a double-precision (64-bit) floating-point element from memory into both
4238
// elements of dst.
4239
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1
4240
#define _mm_load_pd1 _mm_load1_pd
4241
4242
// Load a double-precision (64-bit) floating-point element from memory into the
4243
// lower of dst, and zero the upper element. mem_addr does not need to be
4244
// aligned on any particular boundary.
4245
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd
4246
FORCE_INLINE __m128d _mm_load_sd(const double *p)
4247
{
4248
#if defined(__aarch64__) || defined(_M_ARM64)
4249
return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0));
4250
#else
4251
const float *fp = (const float *) p;
4252
float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0};
4253
return vreinterpretq_m128d_f32(vld1q_f32(data));
4254
#endif
4255
}
4256
4257
// Load 128-bits of integer data from memory into dst. mem_addr must be aligned
4258
// on a 16-byte boundary or a general-protection exception may be generated.
4259
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128
4260
FORCE_INLINE __m128i _mm_load_si128(const __m128i *p)
4261
{
4262
return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4263
}
4264
4265
// Load a double-precision (64-bit) floating-point element from memory into both
4266
// elements of dst.
4267
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd
4268
FORCE_INLINE __m128d _mm_load1_pd(const double *p)
4269
{
4270
#if defined(__aarch64__) || defined(_M_ARM64)
4271
return vreinterpretq_m128d_f64(vld1q_dup_f64(p));
4272
#else
4273
return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p));
4274
#endif
4275
}
4276
4277
// Load a double-precision (64-bit) floating-point element from memory into the
4278
// upper element of dst, and copy the lower element from a to dst. mem_addr does
4279
// not need to be aligned on any particular boundary.
4280
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd
4281
FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p)
4282
{
4283
#if defined(__aarch64__) || defined(_M_ARM64)
4284
return vreinterpretq_m128d_f64(
4285
vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p)));
4286
#else
4287
return vreinterpretq_m128d_f32(vcombine_f32(
4288
vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p)));
4289
#endif
4290
}
4291
4292
// Load 64-bit integer from memory into the first element of dst.
4293
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64
4294
FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p)
4295
{
4296
/* Load the lower 64 bits of the value pointed to by p into the
4297
* lower 64 bits of the result, zeroing the upper 64 bits of the result.
4298
*/
4299
return vreinterpretq_m128i_s32(
4300
vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0)));
4301
}
4302
4303
// Load a double-precision (64-bit) floating-point element from memory into the
4304
// lower element of dst, and copy the upper element from a to dst. mem_addr does
4305
// not need to be aligned on any particular boundary.
4306
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd
4307
FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p)
4308
{
4309
#if defined(__aarch64__) || defined(_M_ARM64)
4310
return vreinterpretq_m128d_f64(
4311
vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a))));
4312
#else
4313
return vreinterpretq_m128d_f32(
4314
vcombine_f32(vld1_f32((const float *) p),
4315
vget_high_f32(vreinterpretq_f32_m128d(a))));
4316
#endif
4317
}
4318
4319
// Load 2 double-precision (64-bit) floating-point elements from memory into dst
4320
// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
4321
// general-protection exception may be generated.
4322
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd
4323
FORCE_INLINE __m128d _mm_loadr_pd(const double *p)
4324
{
4325
#if defined(__aarch64__) || defined(_M_ARM64)
4326
float64x2_t v = vld1q_f64(p);
4327
return vreinterpretq_m128d_f64(vextq_f64(v, v, 1));
4328
#else
4329
int64x2_t v = vld1q_s64((const int64_t *) p);
4330
return vreinterpretq_m128d_s64(vextq_s64(v, v, 1));
4331
#endif
4332
}
4333
4334
// Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point
// elements) from memory into dst. mem_addr does not need to be aligned on any
// particular boundary.
4335
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd
4336
FORCE_INLINE __m128d _mm_loadu_pd(const double *p)
4337
{
4338
return _mm_load_pd(p);
4339
}
4340
4341
// Load 128-bits of integer data from memory into dst. mem_addr does not need to
4342
// be aligned on any particular boundary.
4343
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128
4344
FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p)
4345
{
4346
return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p));
4347
}
4348
4349
// Load unaligned 32-bit integer from memory into the first element of dst.
4350
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32
4351
FORCE_INLINE __m128i _mm_loadu_si32(const void *p)
4352
{
4353
return vreinterpretq_m128i_s32(
4354
vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0));
4355
}
4356
4357
// Multiply packed signed 16-bit integers in a and b, producing intermediate
4358
// signed 32-bit integers. Horizontally add adjacent pairs of intermediate
4359
// 32-bit integers, and pack the results in dst.
4360
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16
4361
FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b)
4362
{
4363
int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
4364
vget_low_s16(vreinterpretq_s16_m128i(b)));
4365
#if defined(__aarch64__) || defined(_M_ARM64)
4366
int32x4_t high =
4367
vmull_high_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b));
4368
4369
return vreinterpretq_m128i_s32(vpaddq_s32(low, high));
4370
#else
4371
int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
4372
vget_high_s16(vreinterpretq_s16_m128i(b)));
4373
4374
int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low));
4375
int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high));
4376
4377
return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum));
4378
#endif
4379
}
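// Worked example (illustrative values): with every 16-bit lane of a equal to 1
// and every lane of b equal to 3, each product is 3 and adjacent pairs sum to
// 6, so _mm_madd_epi16(_mm_set1_epi16(1), _mm_set1_epi16(3)) yields four
// 32-bit lanes all equal to 6.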
4380
4381
// Conditionally store 8-bit integer elements from a into memory using mask
4382
// (elements are not stored when the highest bit is not set in the corresponding
4383
// element) and a non-temporal memory hint. mem_addr does not need to be aligned
4384
// on any particular boundary.
4385
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128
4386
FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr)
4387
{
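// This emulation is a read-modify-write: load the 16 destination bytes,
// select between a and the loaded data using the sign bit of each mask
// byte, then store the blend back. The non-temporal hint is dropped.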
4388
int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7);
4389
__m128 b = _mm_load_ps((const float *) mem_addr);
4390
int8x16_t masked =
4391
vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a),
4392
vreinterpretq_s8_m128(b));
4393
vst1q_s8((int8_t *) mem_addr, masked);
4394
}
4395
4396
// Compare packed signed 16-bit integers in a and b, and store packed maximum
4397
// values in dst.
4398
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16
4399
FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b)
4400
{
4401
return vreinterpretq_m128i_s16(
4402
vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4403
}
4404
4405
// Compare packed unsigned 8-bit integers in a and b, and store packed maximum
4406
// values in dst.
4407
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8
4408
FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b)
4409
{
4410
return vreinterpretq_m128i_u8(
4411
vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
4412
}
4413
4414
// Compare packed double-precision (64-bit) floating-point elements in a and b,
4415
// and store packed maximum values in dst.
4416
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd
4417
FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b)
4418
{
4419
#if defined(__aarch64__) || defined(_M_ARM64)
4420
#if SSE2NEON_PRECISE_MINMAX
4421
float64x2_t _a = vreinterpretq_f64_m128d(a);
4422
float64x2_t _b = vreinterpretq_f64_m128d(b);
4423
return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b));
4424
#else
4425
return vreinterpretq_m128d_f64(
4426
vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4427
#endif
4428
#else
4429
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
4430
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
4431
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
4432
uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
4433
uint64_t d[2];
4434
d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0;
4435
d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1;
4436
4437
return vreinterpretq_m128d_u64(vld1q_u64(d));
4438
#endif
4439
}
4440
4441
// Compare the lower double-precision (64-bit) floating-point elements in a and
4442
// b, store the maximum value in the lower element of dst, and copy the upper
4443
// element from a to the upper element of dst.
4444
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd
4445
FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b)
4446
{
4447
#if defined(__aarch64__) || defined(_M_ARM64)
4448
return _mm_move_sd(a, _mm_max_pd(a, b));
4449
#else
4450
double *da = (double *) &a;
4451
double *db = (double *) &b;
4452
double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]};
4453
return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
4454
#endif
4455
}
4456
4457
// Compare packed signed 16-bit integers in a and b, and store packed minimum
4458
// values in dst.
4459
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16
4460
FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b)
4461
{
4462
return vreinterpretq_m128i_s16(
4463
vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4464
}
4465
4466
// Compare packed unsigned 8-bit integers in a and b, and store packed minimum
4467
// values in dst.
4468
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8
4469
FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b)
4470
{
4471
return vreinterpretq_m128i_u8(
4472
vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
4473
}
4474
4475
// Compare packed double-precision (64-bit) floating-point elements in a and b,
4476
// and store packed minimum values in dst.
4477
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd
4478
FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b)
4479
{
4480
#if defined(__aarch64__) || defined(_M_ARM64)
4481
#if SSE2NEON_PRECISE_MINMAX
4482
float64x2_t _a = vreinterpretq_f64_m128d(a);
4483
float64x2_t _b = vreinterpretq_f64_m128d(b);
4484
return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b));
4485
#else
4486
return vreinterpretq_m128d_f64(
4487
vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4488
#endif
4489
#else
4490
uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a));
4491
uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a));
4492
uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b));
4493
uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b));
4494
uint64_t d[2];
4495
d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0;
4496
d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1;
4497
return vreinterpretq_m128d_u64(vld1q_u64(d));
4498
#endif
4499
}
4500
4501
// Compare the lower double-precision (64-bit) floating-point elements in a and
4502
// b, store the minimum value in the lower element of dst, and copy the upper
4503
// element from a to the upper element of dst.
4504
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd
4505
FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b)
4506
{
4507
#if defined(__aarch64__) || defined(_M_ARM64)
4508
return _mm_move_sd(a, _mm_min_pd(a, b));
4509
#else
4510
double *da = (double *) &a;
4511
double *db = (double *) &b;
4512
double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]};
4513
return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c));
4514
#endif
4515
}
4516
4517
// Copy the lower 64-bit integer in a to the lower element of dst, and zero the
4518
// upper element.
4519
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64
4520
FORCE_INLINE __m128i _mm_move_epi64(__m128i a)
4521
{
4522
return vreinterpretq_m128i_s64(
4523
vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1));
4524
}
4525
4526
// Move the lower double-precision (64-bit) floating-point element from b to the
4527
// lower element of dst, and copy the upper element from a to the upper element
4528
// of dst.
4529
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd
4530
FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b)
4531
{
4532
return vreinterpretq_m128d_f32(
4533
vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)),
4534
vget_high_f32(vreinterpretq_f32_m128d(a))));
4535
}
4536
4537
// Create mask from the most significant bit of each 8-bit element in a, and
4538
// store the result in dst.
4539
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8
4540
FORCE_INLINE int _mm_movemask_epi8(__m128i a)
4541
{
4542
// Use increasingly wide shifts+adds to collect the sign bits
4543
// together.
4544
// Since the widening shifts would be rather confusing to follow in little
4545
// endian, everything will be illustrated in big endian order instead. This
4546
// has a different result - the bits would actually be reversed on a big
4547
// endian machine.
4548
4549
// Starting input (only half the elements are shown):
4550
// 89 ff 1d c0 00 10 99 33
4551
uint8x16_t input = vreinterpretq_u8_m128i(a);
4552
4553
// Shift out everything but the sign bits with an unsigned shift right.
4554
//
4555
// Bytes of the vector:
4556
// 89 ff 1d c0 00 10 99 33
4557
// \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7)
4558
// | | | | | | | |
4559
// 01 01 00 01 00 00 01 00
4560
//
4561
// Bits of first important lane(s):
4562
// 10001001 (89)
4563
// \______
4564
// |
4565
// 00000001 (01)
4566
uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7));
4567
4568
// Merge the even lanes together with a 16-bit unsigned shift right + add.
4569
// 'xx' represents garbage data which will be ignored in the final result.
4570
// In the important bytes, the add functions like a binary OR.
4571
//
4572
// 01 01 00 01 00 00 01 00
4573
// \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7))
4574
// \| \| \| \|
4575
// xx 03 xx 01 xx 00 xx 02
4576
//
4577
// 00000001 00000001 (01 01)
4578
// \_______ |
4579
// \|
4580
// xxxxxxxx xxxxxx11 (xx 03)
4581
uint32x4_t paired16 =
4582
vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7));
4583
4584
// Repeat with a wider 32-bit shift + add.
4585
// xx 03 xx 01 xx 00 xx 02
4586
// \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >>
4587
// 14))
4588
// \| \|
4589
// xx xx xx 0d xx xx xx 02
4590
//
4591
// 00000011 00000001 (03 01)
4592
// \\_____ ||
4593
// '----.\||
4594
// xxxxxxxx xxxx1101 (xx 0d)
4595
uint64x2_t paired32 =
4596
vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14));
4597
4598
// Last, an even wider 64-bit shift + add to get our result in the low 8 bit
4599
// lanes. xx xx xx 0d xx xx xx 02
4600
// \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >>
4601
// 28))
4602
// \|
4603
// xx xx xx xx xx xx xx d2
4604
//
4605
// 00001101 00000010 (0d 02)
4606
// \ \___ | |
4607
// '---. \| |
4608
// xxxxxxxx 11010010 (xx d2)
4609
uint8x16_t paired64 =
4610
vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28));
4611
4612
// Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts.
4613
// xx xx xx xx xx xx xx d2
4614
// || return paired64[0]
4615
// d2
4616
// Note: Little endian would return the correct value 4b (01001011) instead.
4617
return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8);
4618
}
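// Usage sketch (illustrative values): bit i of the result is the sign bit of
// byte i, so _mm_movemask_epi8(_mm_set1_epi8(-1)) == 0xFFFF and
// _mm_movemask_epi8(_mm_setzero_si128()) == 0.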
4619
4620
// Set each bit of mask dst based on the most significant bit of the
4621
// corresponding packed double-precision (64-bit) floating-point element in a.
4622
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd
4623
FORCE_INLINE int _mm_movemask_pd(__m128d a)
4624
{
4625
uint64x2_t input = vreinterpretq_u64_m128d(a);
4626
uint64x2_t high_bits = vshrq_n_u64(input, 63);
4627
return (int) (vgetq_lane_u64(high_bits, 0) |
4628
(vgetq_lane_u64(high_bits, 1) << 1));
4629
}
4630
4631
// Copy the lower 64-bit integer in a to dst.
4632
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_pi64
4633
FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a)
4634
{
4635
return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a)));
4636
}
4637
4638
// Copy the 64-bit integer a to the lower element of dst, and zero the upper
4639
// element.
4640
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movpi64_epi64
4641
FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a)
4642
{
4643
return vreinterpretq_m128i_s64(
4644
vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0)));
4645
}
4646
4647
// Multiply the low unsigned 32-bit integers from each packed 64-bit element in
4648
// a and b, and store the unsigned 64-bit results in dst.
4649
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32
4650
FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b)
4651
{
4652
// vmull_u32 upcasts instead of masking, so we downcast.
4653
uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a));
4654
uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b));
4655
return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo));
4656
}
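// Note: only the even-indexed 32-bit lanes participate, i.e. the result is
// { (uint64_t) a0 * b0, (uint64_t) a2 * b2 } where a0/a2 and b0/b2 are the
// 32-bit elements 0 and 2 of a and b.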
4657
4658
// Multiply packed double-precision (64-bit) floating-point elements in a and b,
4659
// and store the results in dst.
4660
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd
4661
FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b)
4662
{
4663
#if defined(__aarch64__) || defined(_M_ARM64)
4664
return vreinterpretq_m128d_f64(
4665
vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
4666
#else
4667
double *da = (double *) &a;
4668
double *db = (double *) &b;
4669
double c[2];
4670
c[0] = da[0] * db[0];
4671
c[1] = da[1] * db[1];
4672
return vld1q_f32((float32_t *) c);
4673
#endif
4674
}
4675
4676
// Multiply the lower double-precision (64-bit) floating-point element in a and
4677
// b, store the result in the lower element of dst, and copy the upper element
4678
// from a to the upper element of dst.
4679
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_sd
4680
FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b)
4681
{
4682
return _mm_move_sd(a, _mm_mul_pd(a, b));
4683
}
4684
4685
// Multiply the low unsigned 32-bit integers from a and b, and store the
4686
// unsigned 64-bit result in dst.
4687
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_su32
4688
FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b)
4689
{
4690
return vreinterpret_m64_u64(vget_low_u64(
4691
vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b))));
4692
}
4693
4694
// Multiply the packed signed 16-bit integers in a and b, producing intermediate
4695
// 32-bit integers, and store the high 16 bits of the intermediate integers in
4696
// dst.
4697
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16
4698
FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b)
4699
{
4700
/* FIXME: issue with large values because of result saturation */
4701
// int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a),
4702
// vreinterpretq_s16_m128i(b)); /* =2*a*b */ return
4703
// vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1));
4704
int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a));
4705
int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b));
4706
int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
4707
int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a));
4708
int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b));
4709
int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
4710
uint16x8x2_t r =
4711
vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654));
4712
return vreinterpretq_m128i_u16(r.val[1]);
4713
}
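// Worked example (illustrative values): 0x4000 * 0x4000 = 0x10000000, so
// _mm_mulhi_epi16(_mm_set1_epi16(0x4000), _mm_set1_epi16(0x4000)) gives
// 0x1000 in every 16-bit lane.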
4714
4715
// Multiply the packed unsigned 16-bit integers in a and b, producing
4716
// intermediate 32-bit integers, and store the high 16 bits of the intermediate
4717
// integers in dst.
4718
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16
4719
FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b)
4720
{
4721
uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a));
4722
uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b));
4723
uint32x4_t ab3210 = vmull_u16(a3210, b3210);
4724
#if defined(__aarch64__) || defined(_M_ARM64)
4725
uint32x4_t ab7654 =
4726
vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
4727
uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210),
4728
vreinterpretq_u16_u32(ab7654));
4729
return vreinterpretq_m128i_u16(r);
4730
#else
4731
uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a));
4732
uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b));
4733
uint32x4_t ab7654 = vmull_u16(a7654, b7654);
4734
uint16x8x2_t r =
4735
vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654));
4736
return vreinterpretq_m128i_u16(r.val[1]);
4737
#endif
4738
}
4739
4740
// Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit
4741
// integers, and store the low 16 bits of the intermediate integers in dst.
4742
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16
4743
FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b)
4744
{
4745
return vreinterpretq_m128i_s16(
4746
vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
4747
}
4748
4749
// Compute the bitwise OR of packed double-precision (64-bit) floating-point
4750
// elements in a and b, and store the results in dst.
4751
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_or_pd
4752
FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b)
4753
{
4754
return vreinterpretq_m128d_s64(
4755
vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
4756
}
4757
4758
// Compute the bitwise OR of 128 bits (representing integer data) in a and b,
4759
// and store the result in dst.
4760
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128
4761
FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b)
4762
{
4763
return vreinterpretq_m128i_s32(
4764
vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
4765
}
4766
4767
// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
4768
// using signed saturation, and store the results in dst.
4769
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16
4770
FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b)
4771
{
4772
return vreinterpretq_m128i_s8(
4773
vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)),
4774
vqmovn_s16(vreinterpretq_s16_m128i(b))));
4775
}
4776
4777
// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
4778
// using signed saturation, and store the results in dst.
4779
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32
4780
FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b)
4781
{
4782
return vreinterpretq_m128i_s16(
4783
vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)),
4784
vqmovn_s32(vreinterpretq_s32_m128i(b))));
4785
}
4786
4787
// Convert packed signed 16-bit integers from a and b to packed 8-bit integers
4788
// using unsigned saturation, and store the results in dst.
4789
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16
4790
FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b)
4791
{
4792
return vreinterpretq_m128i_u8(
4793
vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)),
4794
vqmovun_s16(vreinterpretq_s16_m128i(b))));
4795
}
4796
4797
// Pause the processor. This is typically used in spin-wait loops and depending
4798
// on the x86 processor, typical values are in the 40-100 cycle range. The
4799
// 'yield' instruction isn't a good fit because it's effectively a nop on most
4800
// Arm cores. Experience with several databases has shown an 'isb' is
4801
// a reasonable approximation.
4802
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause
4803
FORCE_INLINE void _mm_pause(void)
4804
{
4805
#if defined(_MSC_VER)
4806
__isb(_ARM64_BARRIER_SY);
4807
#else
4808
__asm__ __volatile__("isb\n");
4809
#endif
4810
}
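// Typical usage sketch (the flag variable is illustrative):
//   while (!atomic_load_explicit(&flag, memory_order_acquire))
//       _mm_pause(); // back off briefly between polls of the shared flag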
4811
4812
// Compute the absolute differences of packed unsigned 8-bit integers in a and
4813
// b, then horizontally sum each consecutive 8 differences to produce two
4814
// unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low
4815
// 16 bits of 64-bit elements in dst.
4816
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8
4817
FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b)
4818
{
4819
uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b));
4820
return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t)));
4821
}
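// Worked example (illustrative values): if every byte of a is 3 and every byte
// of b is 1, each group of eight absolute differences sums to 16, so both
// 64-bit lanes of the result hold 16.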
4822
4823
// Set packed 16-bit integers in dst with the supplied values.
4824
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16
4825
FORCE_INLINE __m128i _mm_set_epi16(short i7,
4826
short i6,
4827
short i5,
4828
short i4,
4829
short i3,
4830
short i2,
4831
short i1,
4832
short i0)
4833
{
4834
int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7};
4835
return vreinterpretq_m128i_s16(vld1q_s16(data));
4836
}
4837
4838
// Set packed 32-bit integers in dst with the supplied values.
4839
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32
4840
FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0)
4841
{
4842
int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3};
4843
return vreinterpretq_m128i_s32(vld1q_s32(data));
4844
}
4845
4846
// Set packed 64-bit integers in dst with the supplied values.
4847
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64
4848
FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2)
4849
{
4850
return _mm_set_epi64x(vget_lane_s64(i1, 0), vget_lane_s64(i2, 0));
4851
}
4852
4853
// Set packed 64-bit integers in dst with the supplied values.
4854
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x
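// Note: i1 becomes the upper 64-bit element and i2 the lower one, matching the
// Intel argument order _mm_set_epi64x(e1, e0).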
4855
FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2)
4856
{
4857
return vreinterpretq_m128i_s64(
4858
vcombine_s64(vcreate_s64(i2), vcreate_s64(i1)));
4859
}
4860
4861
// Set packed 8-bit integers in dst with the supplied values.
4862
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8
4863
FORCE_INLINE __m128i _mm_set_epi8(signed char b15,
4864
signed char b14,
4865
signed char b13,
4866
signed char b12,
4867
signed char b11,
4868
signed char b10,
4869
signed char b9,
4870
signed char b8,
4871
signed char b7,
4872
signed char b6,
4873
signed char b5,
4874
signed char b4,
4875
signed char b3,
4876
signed char b2,
4877
signed char b1,
4878
signed char b0)
4879
{
4880
int8_t ALIGN_STRUCT(16)
4881
data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
4882
(int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
4883
(int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
4884
(int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
4885
return (__m128i) vld1q_s8(data);
4886
}
4887
4888
// Set packed double-precision (64-bit) floating-point elements in dst with the
4889
// supplied values.
4890
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd
4891
FORCE_INLINE __m128d _mm_set_pd(double e1, double e0)
4892
{
4893
double ALIGN_STRUCT(16) data[2] = {e0, e1};
4894
#if defined(__aarch64__) || defined(_M_ARM64)
4895
return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data));
4896
#else
4897
return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data));
4898
#endif
4899
}
4900
4901
// Broadcast double-precision (64-bit) floating-point value a to all elements of
4902
// dst.
4903
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1
4904
#define _mm_set_pd1 _mm_set1_pd
4905
4906
// Copy double-precision (64-bit) floating-point element a to the lower element
4907
// of dst, and zero the upper element.
4908
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd
4909
FORCE_INLINE __m128d _mm_set_sd(double a)
4910
{
4911
#if defined(__aarch64__) || defined(_M_ARM64)
4912
return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0));
4913
#else
4914
return _mm_set_pd(0, a);
4915
#endif
4916
}
4917
4918
// Broadcast 16-bit integer a to all elements of dst.
4919
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16
4920
FORCE_INLINE __m128i _mm_set1_epi16(short w)
4921
{
4922
return vreinterpretq_m128i_s16(vdupq_n_s16(w));
4923
}
4924
4925
// Broadcast 32-bit integer a to all elements of dst.
4926
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32
4927
FORCE_INLINE __m128i _mm_set1_epi32(int _i)
4928
{
4929
return vreinterpretq_m128i_s32(vdupq_n_s32(_i));
4930
}
4931
4932
// Broadcast 64-bit integer a to all elements of dst.
4933
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64
4934
FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i)
4935
{
4936
return vreinterpretq_m128i_s64(vdupq_lane_s64(_i, 0));
4937
}
4938
4939
// Broadcast 64-bit integer a to all elements of dst.
4940
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x
4941
FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i)
4942
{
4943
return vreinterpretq_m128i_s64(vdupq_n_s64(_i));
4944
}
4945
4946
// Broadcast 8-bit integer a to all elements of dst.
4947
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8
4948
FORCE_INLINE __m128i _mm_set1_epi8(signed char w)
4949
{
4950
return vreinterpretq_m128i_s8(vdupq_n_s8(w));
4951
}
4952
4953
// Broadcast double-precision (64-bit) floating-point value a to all elements of
4954
// dst.
4955
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd
4956
FORCE_INLINE __m128d _mm_set1_pd(double d)
4957
{
4958
#if defined(__aarch64__) || defined(_M_ARM64)
4959
return vreinterpretq_m128d_f64(vdupq_n_f64(d));
4960
#else
4961
return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d));
4962
#endif
4963
}
4964
4965
// Set packed 16-bit integers in dst with the supplied values in reverse order.
4966
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16
4967
FORCE_INLINE __m128i _mm_setr_epi16(short w0,
4968
short w1,
4969
short w2,
4970
short w3,
4971
short w4,
4972
short w5,
4973
short w6,
4974
short w7)
4975
{
4976
int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7};
4977
return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data));
4978
}
4979
4980
// Set packed 32-bit integers in dst with the supplied values in reverse order.
4981
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32
4982
FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0)
4983
{
4984
int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0};
4985
return vreinterpretq_m128i_s32(vld1q_s32(data));
4986
}
4987
4988
// Set packed 64-bit integers in dst with the supplied values in reverse order.
4989
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi64
4990
FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0)
4991
{
4992
return vreinterpretq_m128i_s64(vcombine_s64(e1, e0));
4993
}
4994
4995
// Set packed 8-bit integers in dst with the supplied values in reverse order.
4996
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8
4997
FORCE_INLINE __m128i _mm_setr_epi8(signed char b0,
4998
signed char b1,
4999
signed char b2,
5000
signed char b3,
5001
signed char b4,
5002
signed char b5,
5003
signed char b6,
5004
signed char b7,
5005
signed char b8,
5006
signed char b9,
5007
signed char b10,
5008
signed char b11,
5009
signed char b12,
5010
signed char b13,
5011
signed char b14,
5012
signed char b15)
5013
{
5014
int8_t ALIGN_STRUCT(16)
5015
data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3,
5016
(int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7,
5017
(int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11,
5018
(int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15};
5019
return (__m128i) vld1q_s8(data);
5020
}
5021
5022
// Set packed double-precision (64-bit) floating-point elements in dst with the
5023
// supplied values in reverse order.
5024
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd
5025
FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0)
5026
{
5027
return _mm_set_pd(e0, e1);
5028
}
5029
5030
// Return vector of type __m128d with all elements set to zero.
5031
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd
5032
FORCE_INLINE __m128d _mm_setzero_pd(void)
5033
{
5034
#if defined(__aarch64__) || defined(_M_ARM64)
5035
return vreinterpretq_m128d_f64(vdupq_n_f64(0));
5036
#else
5037
return vreinterpretq_m128d_f32(vdupq_n_f32(0));
5038
#endif
5039
}
5040
5041
// Return vector of type __m128i with all elements set to zero.
5042
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128
5043
FORCE_INLINE __m128i _mm_setzero_si128(void)
5044
{
5045
return vreinterpretq_m128i_s32(vdupq_n_s32(0));
5046
}
5047
5048
// Shuffle 32-bit integers in a using the control in imm8, and store the results
5049
// in dst.
5050
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32
5051
// FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a,
5052
// __constrange(0,255) int imm)
5053
#if defined(_sse2neon_shuffle)
5054
#define _mm_shuffle_epi32(a, imm) \
5055
__extension__({ \
5056
int32x4_t _input = vreinterpretq_s32_m128i(a); \
5057
int32x4_t _shuf = \
5058
vshuffleq_s32(_input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \
5059
((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \
5060
vreinterpretq_m128i_s32(_shuf); \
5061
})
5062
#else // generic
5063
#define _mm_shuffle_epi32(a, imm) \
5064
_sse2neon_define1( \
5065
__m128i, a, __m128i ret; switch (imm) { \
5066
case _MM_SHUFFLE(1, 0, 3, 2): \
5067
ret = _mm_shuffle_epi_1032(_a); \
5068
break; \
5069
case _MM_SHUFFLE(2, 3, 0, 1): \
5070
ret = _mm_shuffle_epi_2301(_a); \
5071
break; \
5072
case _MM_SHUFFLE(0, 3, 2, 1): \
5073
ret = _mm_shuffle_epi_0321(_a); \
5074
break; \
5075
case _MM_SHUFFLE(2, 1, 0, 3): \
5076
ret = _mm_shuffle_epi_2103(_a); \
5077
break; \
5078
case _MM_SHUFFLE(1, 0, 1, 0): \
5079
ret = _mm_shuffle_epi_1010(_a); \
5080
break; \
5081
case _MM_SHUFFLE(1, 0, 0, 1): \
5082
ret = _mm_shuffle_epi_1001(_a); \
5083
break; \
5084
case _MM_SHUFFLE(0, 1, 0, 1): \
5085
ret = _mm_shuffle_epi_0101(_a); \
5086
break; \
5087
case _MM_SHUFFLE(2, 2, 1, 1): \
5088
ret = _mm_shuffle_epi_2211(_a); \
5089
break; \
5090
case _MM_SHUFFLE(0, 1, 2, 2): \
5091
ret = _mm_shuffle_epi_0122(_a); \
5092
break; \
5093
case _MM_SHUFFLE(3, 3, 3, 2): \
5094
ret = _mm_shuffle_epi_3332(_a); \
5095
break; \
5096
case _MM_SHUFFLE(0, 0, 0, 0): \
5097
ret = _mm_shuffle_epi32_splat(_a, 0); \
5098
break; \
5099
case _MM_SHUFFLE(1, 1, 1, 1): \
5100
ret = _mm_shuffle_epi32_splat(_a, 1); \
5101
break; \
5102
case _MM_SHUFFLE(2, 2, 2, 2): \
5103
ret = _mm_shuffle_epi32_splat(_a, 2); \
5104
break; \
5105
case _MM_SHUFFLE(3, 3, 3, 3): \
5106
ret = _mm_shuffle_epi32_splat(_a, 3); \
5107
break; \
5108
default: \
5109
ret = _mm_shuffle_epi32_default(_a, (imm)); \
5110
break; \
5111
} _sse2neon_return(ret);)
5112
#endif
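// Usage sketch (a is illustrative): _mm_shuffle_epi32(a, _MM_SHUFFLE(3, 2, 1, 0))
// returns a unchanged, _MM_SHUFFLE(0, 1, 2, 3) reverses the four 32-bit lanes,
// and _MM_SHUFFLE(0, 0, 0, 0) broadcasts lane 0 to every lane.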
5113
5114
// Shuffle double-precision (64-bit) floating-point elements using the control
5115
// in imm8, and store the results in dst.
5116
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd
5117
#ifdef _sse2neon_shuffle
5118
#define _mm_shuffle_pd(a, b, imm8) \
5119
vreinterpretq_m128d_s64( \
5120
vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \
5121
imm8 & 0x1, ((imm8 & 0x2) >> 1) + 2))
5122
#else
5123
#define _mm_shuffle_pd(a, b, imm8) \
5124
_mm_castsi128_pd(_mm_set_epi64x( \
5125
vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \
5126
vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1)))
5127
#endif
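// Usage sketch (a and b are illustrative): bit 0 of imm8 picks the element of a
// placed in the low lane and bit 1 picks the element of b placed in the high
// lane, so _mm_shuffle_pd(a, b, 0) gives {a0, b0} and imm8 == 3 gives {a1, b1}.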
5128
5129
// FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a,
5130
// __constrange(0,255) int imm)
5131
#if defined(_sse2neon_shuffle)
5132
#define _mm_shufflehi_epi16(a, imm) \
5133
__extension__({ \
5134
int16x8_t _input = vreinterpretq_s16_m128i(a); \
5135
int16x8_t _shuf = \
5136
vshuffleq_s16(_input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \
5137
(((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \
5138
(((imm) >> 6) & 0x3) + 4); \
5139
vreinterpretq_m128i_s16(_shuf); \
5140
})
5141
#else // generic
5142
#define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm))
5143
#endif
5144
5145
// FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a,
5146
// __constrange(0,255) int imm)
5147
#if defined(_sse2neon_shuffle)
5148
#define _mm_shufflelo_epi16(a, imm) \
5149
__extension__({ \
5150
int16x8_t _input = vreinterpretq_s16_m128i(a); \
5151
int16x8_t _shuf = vshuffleq_s16( \
5152
_input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \
5153
(((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \
5154
vreinterpretq_m128i_s16(_shuf); \
5155
})
5156
#else // generic
5157
#define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm))
5158
#endif
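// Usage sketch (a is illustrative): _mm_shufflelo_epi16(a, _MM_SHUFFLE(0, 1, 2, 3))
// reverses the four low 16-bit lanes and copies the four high lanes through
// unchanged; _mm_shufflehi_epi16 does the same for the high half.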
5159
5160
// Shift packed 16-bit integers in a left by count while shifting in zeros, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16
FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count)
{
    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
    if (_sse2neon_unlikely(c & ~15))
        return _mm_setzero_si128();

    int16x8_t vc = vdupq_n_s16((int16_t) c);
    return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc));
}

// Shift packed 32-bit integers in a left by count while shifting in zeros, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32
FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count)
{
    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
    if (_sse2neon_unlikely(c & ~31))
        return _mm_setzero_si128();

    int32x4_t vc = vdupq_n_s32((int32_t) c);
    return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc));
}

// Shift packed 64-bit integers in a left by count while shifting in zeros, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64
FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count)
{
    uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
    if (_sse2neon_unlikely(c & ~63))
        return _mm_setzero_si128();

    int64x2_t vc = vdupq_n_s64((int64_t) c);
    return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc));
}

// Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16
FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm)
{
    if (_sse2neon_unlikely(imm & ~15))
        return _mm_setzero_si128();
    return vreinterpretq_m128i_s16(
        vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm)));
}

// Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32
FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm)
{
    if (_sse2neon_unlikely(imm & ~31))
        return _mm_setzero_si128();
    return vreinterpretq_m128i_s32(
        vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm)));
}

// Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and
// store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64
FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm)
{
    if (_sse2neon_unlikely(imm & ~63))
        return _mm_setzero_si128();
    return vreinterpretq_m128i_s64(
        vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm)));
}

// Shift a left by imm8 bytes while shifting in zeros, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128
#define _mm_slli_si128(a, imm) \
    _sse2neon_define1( \
        __m128i, a, int8x16_t ret; \
        if (_sse2neon_unlikely(imm == 0)) ret = vreinterpretq_s8_m128i(_a); \
        else if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \
        else ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(_a), \
                            ((imm <= 0 || imm > 15) ? 0 : (16 - imm))); \
        _sse2neon_return(vreinterpretq_m128i_s8(ret));)

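// Illustrative usage sketch (not part of the upstream sse2neon API; the helper
// name is arbitrary and the block is compiled out with #if 0): _mm_slli_si128
// shifts by whole bytes, not bits, so imm == 4 moves each 32-bit lane up one
// slot and zero-fills the lowest lane.
#if 0
static void sse2neon_example_slli_si128(void)
{
    int32_t out[4];
    __m128i v = _mm_set_epi32(4, 3, 2, 1);  // lanes, low to high: {1, 2, 3, 4}
    __m128i r = _mm_slli_si128(v, 4);       // shift left by 4 bytes
    _mm_storeu_si128((__m128i *) out, r);   // out == {0, 1, 2, 3}
}
#endif
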
// Compute the square root of packed double-precision (64-bit) floating-point
5245
// elements in a, and store the results in dst.
5246
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd
5247
FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a)
5248
{
5249
#if defined(__aarch64__) || defined(_M_ARM64)
5250
return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a)));
5251
#else
5252
double a0 = sqrt(((double *) &a)[0]);
5253
double a1 = sqrt(((double *) &a)[1]);
5254
return _mm_set_pd(a1, a0);
5255
#endif
5256
}
5257
5258
// Compute the square root of the lower double-precision (64-bit) floating-point
5259
// element in b, store the result in the lower element of dst, and copy the
5260
// upper element from a to the upper element of dst.
5261
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd
5262
FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b)
5263
{
5264
#if defined(__aarch64__) || defined(_M_ARM64)
5265
return _mm_move_sd(a, _mm_sqrt_pd(b));
5266
#else
5267
return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0]));
5268
#endif
5269
}
5270
5271
// Shift packed 16-bit integers in a right by count while shifting in sign bits,
5272
// and store the results in dst.
5273
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16
5274
FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count)
5275
{
5276
int64_t c = vgetq_lane_s64(count, 0);
5277
if (_sse2neon_unlikely(c & ~15))
5278
return _mm_cmplt_epi16(a, _mm_setzero_si128());
5279
return vreinterpretq_m128i_s16(
5280
vshlq_s16((int16x8_t) a, vdupq_n_s16((int) -c)));
5281
}
5282
5283
// Shift packed 32-bit integers in a right by count while shifting in sign bits,
5284
// and store the results in dst.
5285
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32
5286
FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count)
5287
{
5288
int64_t c = vgetq_lane_s64(count, 0);
5289
if (_sse2neon_unlikely(c & ~31))
5290
return _mm_cmplt_epi32(a, _mm_setzero_si128());
5291
return vreinterpretq_m128i_s32(
5292
vshlq_s32((int32x4_t) a, vdupq_n_s32((int) -c)));
5293
}
5294
5295
// Shift packed 16-bit integers in a right by imm8 while shifting in sign
5296
// bits, and store the results in dst.
5297
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16
5298
FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm)
5299
{
5300
const int count = (imm & ~15) ? 15 : imm;
5301
return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count));
5302
}
5303
5304
// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
// and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32
// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
#define _mm_srai_epi32(a, imm) \
    _sse2neon_define0( \
        __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) == 0)) { \
            ret = _a; \
        } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) { \
            ret = vreinterpretq_m128i_s32( \
                vshlq_s32(vreinterpretq_s32_m128i(_a), vdupq_n_s32(-(imm)))); \
        } else { \
            ret = vreinterpretq_m128i_s32( \
                vshrq_n_s32(vreinterpretq_s32_m128i(_a), 31)); \
        } _sse2neon_return(ret);)

// Shift packed 16-bit integers in a right by count while shifting in zeros, and
5321
// store the results in dst.
5322
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16
5323
FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count)
5324
{
5325
uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5326
if (_sse2neon_unlikely(c & ~15))
5327
return _mm_setzero_si128();
5328
5329
int16x8_t vc = vdupq_n_s16(-(int16_t) c);
5330
return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc));
5331
}
5332
5333
// Shift packed 32-bit integers in a right by count while shifting in zeros, and
5334
// store the results in dst.
5335
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32
5336
FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count)
5337
{
5338
uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5339
if (_sse2neon_unlikely(c & ~31))
5340
return _mm_setzero_si128();
5341
5342
int32x4_t vc = vdupq_n_s32(-(int32_t) c);
5343
return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc));
5344
}
5345
5346
// Shift packed 64-bit integers in a right by count while shifting in zeros, and
5347
// store the results in dst.
5348
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64
5349
FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count)
5350
{
5351
uint64_t c = vreinterpretq_nth_u64_m128i(count, 0);
5352
if (_sse2neon_unlikely(c & ~63))
5353
return _mm_setzero_si128();
5354
5355
int64x2_t vc = vdupq_n_s64(-(int64_t) c);
5356
return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc));
5357
}
5358
5359
// Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and
5360
// store the results in dst.
5361
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16
5362
#define _mm_srli_epi16(a, imm) \
5363
_sse2neon_define0( \
5364
__m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~15)) { \
5365
ret = _mm_setzero_si128(); \
5366
} else { \
5367
ret = vreinterpretq_m128i_u16( \
5368
vshlq_u16(vreinterpretq_u16_m128i(_a), vdupq_n_s16(-(imm)))); \
5369
} _sse2neon_return(ret);)
5370
5371
// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
5372
// store the results in dst.
5373
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32
5374
// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
5375
#define _mm_srli_epi32(a, imm) \
5376
_sse2neon_define0( \
5377
__m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~31)) { \
5378
ret = _mm_setzero_si128(); \
5379
} else { \
5380
ret = vreinterpretq_m128i_u32( \
5381
vshlq_u32(vreinterpretq_u32_m128i(_a), vdupq_n_s32(-(imm)))); \
5382
} _sse2neon_return(ret);)
5383
5384
// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
5385
// store the results in dst.
5386
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64
5387
#define _mm_srli_epi64(a, imm) \
5388
_sse2neon_define0( \
5389
__m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~63)) { \
5390
ret = _mm_setzero_si128(); \
5391
} else { \
5392
ret = vreinterpretq_m128i_u64( \
5393
vshlq_u64(vreinterpretq_u64_m128i(_a), vdupq_n_s64(-(imm)))); \
5394
} _sse2neon_return(ret);)
5395
5396
// Shift a right by imm8 bytes while shifting in zeros, and store the results in
// dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128
#define _mm_srli_si128(a, imm) \
    _sse2neon_define1( \
        __m128i, a, int8x16_t ret; \
        if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \
        else ret = vextq_s8(vreinterpretq_s8_m128i(_a), vdupq_n_s8(0), \
                            (imm > 15 ? 0 : imm)); \
        _sse2neon_return(vreinterpretq_m128i_s8(ret));)

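// Illustrative usage sketch (not part of the upstream sse2neon API; the helper
// name is arbitrary and the block is compiled out with #if 0): shifting right
// by 8 bytes discards the two low 32-bit lanes and zero-fills the top.
#if 0
static void sse2neon_example_srli_si128(void)
{
    int32_t out[4];
    __m128i v = _mm_set_epi32(4, 3, 2, 1);  // lanes, low to high: {1, 2, 3, 4}
    __m128i r = _mm_srli_si128(v, 8);       // shift right by 8 bytes
    _mm_storeu_si128((__m128i *) out, r);   // out == {3, 4, 0, 0}
}
#endif
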
// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
5408
// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
5409
// or a general-protection exception may be generated.
5410
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd
5411
FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a)
5412
{
5413
#if defined(__aarch64__) || defined(_M_ARM64)
5414
vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a));
5415
#else
5416
vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a));
5417
#endif
5418
}
5419
5420
// Store the lower double-precision (64-bit) floating-point element from a into
5421
// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
5422
// boundary or a general-protection exception may be generated.
5423
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1
5424
FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a)
5425
{
5426
#if defined(__aarch64__) || defined(_M_ARM64)
5427
float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a));
5428
vst1q_f64((float64_t *) mem_addr,
5429
vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low)));
5430
#else
5431
float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a));
5432
vst1q_f32((float32_t *) mem_addr,
5433
vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low)));
5434
#endif
5435
}
5436
5437
// Store the lower double-precision (64-bit) floating-point element from a into
5438
// memory. mem_addr does not need to be aligned on any particular boundary.
5439
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd
5440
FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a)
5441
{
5442
#if defined(__aarch64__) || defined(_M_ARM64)
5443
vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
5444
#else
5445
vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a)));
5446
#endif
5447
}
5448
5449
// Store 128-bits of integer data from a into memory. mem_addr must be aligned
5450
// on a 16-byte boundary or a general-protection exception may be generated.
5451
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128
5452
FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a)
5453
{
5454
vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
5455
}
5456
5457
// Store the lower double-precision (64-bit) floating-point element from a into
5458
// 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte
5459
// boundary or a general-protection exception may be generated.
5460
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=9,526,5601&text=_mm_store1_pd
5461
#define _mm_store1_pd _mm_store_pd1
5462
5463
// Store the upper double-precision (64-bit) floating-point element from a into
5464
// memory.
5465
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd
5466
FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a)
5467
{
5468
#if defined(__aarch64__) || defined(_M_ARM64)
5469
vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a)));
5470
#else
5471
vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a)));
5472
#endif
5473
}
5474
5475
// Store 64-bit integer from the first element of a into memory.
5476
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64
5477
FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b)
5478
{
5479
vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b)));
5480
}
5481
5482
// Store the lower double-precision (64-bit) floating-point element from a into
5483
// memory.
5484
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd
5485
FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a)
5486
{
5487
#if defined(__aarch64__) || defined(_M_ARM64)
5488
vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a)));
5489
#else
5490
vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a)));
5491
#endif
5492
}
5493
5494
// Store 2 double-precision (64-bit) floating-point elements from a into memory
5495
// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
5496
// general-protection exception may be generated.
5497
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd
5498
FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a)
5499
{
5500
float32x4_t f = vreinterpretq_f32_m128d(a);
5501
_mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2)));
5502
}
5503
5504
// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
5505
// elements) from a into memory. mem_addr does not need to be aligned on any
5506
// particular boundary.
5507
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd
5508
FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a)
5509
{
5510
_mm_store_pd(mem_addr, a);
5511
}
5512
5513
// Store 128-bits of integer data from a into memory. mem_addr does not need to
5514
// be aligned on any particular boundary.
5515
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128
5516
FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a)
5517
{
5518
vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a));
5519
}
5520
5521
// Store 32-bit integer from the first element of a into memory. mem_addr does
5522
// not need to be aligned on any particular boundary.
5523
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32
5524
FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a)
5525
{
5526
vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0);
5527
}
5528
5529
// Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point
5530
// elements) from a into memory using a non-temporal memory hint. mem_addr must
5531
// be aligned on a 16-byte boundary or a general-protection exception may be
5532
// generated.
5533
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd
5534
FORCE_INLINE void _mm_stream_pd(double *p, __m128d a)
5535
{
5536
#if __has_builtin(__builtin_nontemporal_store)
5537
__builtin_nontemporal_store(a, (__m128d *) p);
5538
#elif defined(__aarch64__) || defined(_M_ARM64)
5539
vst1q_f64(p, vreinterpretq_f64_m128d(a));
5540
#else
5541
vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a));
5542
#endif
5543
}
5544
5545
// Store 128-bits of integer data from a into memory using a non-temporal memory
// hint. mem_addr must be aligned on a 16-byte boundary or a general-protection
// exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128
FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a)
{
#if __has_builtin(__builtin_nontemporal_store)
    __builtin_nontemporal_store(a, p);
#else
    vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a));
#endif
}

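// Illustrative usage sketch (not part of the upstream sse2neon API; the helper
// name is arbitrary and the block is compiled out with #if 0): the non-temporal
// hint only matters when the builtin is available; on the plain NEON path the
// store is an ordinary vst1q, so correctness is unchanged. The usual pattern is
// streaming writes to a large buffer that will not be re-read soon, followed by
// _mm_sfence.
#if 0
static void sse2neon_example_stream_copy(int32_t *dst, const int32_t *src, int n)
{
    // Assumes n is a multiple of 4 and dst is 16-byte aligned.
    for (int i = 0; i < n; i += 4) {
        __m128i v = _mm_loadu_si128((const __m128i *) (src + i));
        _mm_stream_si128((__m128i *) (dst + i), v);  // hint: avoid cache pollution
    }
    _mm_sfence();  // order the streaming stores before the data is reused
}
#endif
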
// Store 32-bit integer a into memory using a non-temporal hint to minimize
5559
// cache pollution. If the cache line containing address mem_addr is already in
5560
// the cache, the cache will be updated.
5561
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32
5562
FORCE_INLINE void _mm_stream_si32(int *p, int a)
5563
{
5564
vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0);
5565
}
5566
5567
// Store 64-bit integer a into memory using a non-temporal hint to minimize
5568
// cache pollution. If the cache line containing address mem_addr is already in
5569
// the cache, the cache will be updated.
5570
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si64
5571
FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a)
5572
{
5573
vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a));
5574
}
5575
5576
// Subtract packed 16-bit integers in b from packed 16-bit integers in a, and
5577
// store the results in dst.
5578
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16
5579
FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b)
5580
{
5581
return vreinterpretq_m128i_s16(
5582
vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5583
}
5584
5585
// Subtract packed 32-bit integers in b from packed 32-bit integers in a, and
5586
// store the results in dst.
5587
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32
5588
FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b)
5589
{
5590
return vreinterpretq_m128i_s32(
5591
vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5592
}
5593
5594
// Subtract packed 64-bit integers in b from packed 64-bit integers in a, and
5595
// store the results in dst.
5596
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64
5597
FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b)
5598
{
5599
return vreinterpretq_m128i_s64(
5600
vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
5601
}
5602
5603
// Subtract packed 8-bit integers in b from packed 8-bit integers in a, and
5604
// store the results in dst.
5605
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8
5606
FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b)
5607
{
5608
return vreinterpretq_m128i_s8(
5609
vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5610
}
5611
5612
// Subtract packed double-precision (64-bit) floating-point elements in b from
5613
// packed double-precision (64-bit) floating-point elements in a, and store the
5614
// results in dst.
5615
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd
5616
FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b)
5617
{
5618
#if defined(__aarch64__) || defined(_M_ARM64)
5619
return vreinterpretq_m128d_f64(
5620
vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
5621
#else
5622
double *da = (double *) &a;
5623
double *db = (double *) &b;
5624
double c[2];
5625
c[0] = da[0] - db[0];
5626
c[1] = da[1] - db[1];
5627
return vld1q_f32((float32_t *) c);
5628
#endif
5629
}
5630
5631
// Subtract the lower double-precision (64-bit) floating-point element in b from
5632
// the lower double-precision (64-bit) floating-point element in a, store the
5633
// result in the lower element of dst, and copy the upper element from a to the
5634
// upper element of dst.
5635
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd
5636
FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b)
5637
{
5638
return _mm_move_sd(a, _mm_sub_pd(a, b));
5639
}
5640
5641
// Subtract 64-bit integer b from 64-bit integer a, and store the result in dst.
5642
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_si64
5643
FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b)
5644
{
5645
return vreinterpret_m64_s64(
5646
vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b)));
5647
}
5648
5649
// Subtract packed signed 16-bit integers in b from packed 16-bit integers in a
5650
// using saturation, and store the results in dst.
5651
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16
5652
FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b)
5653
{
5654
return vreinterpretq_m128i_s16(
5655
vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5656
}
5657
5658
// Subtract packed signed 8-bit integers in b from packed 8-bit integers in a
5659
// using saturation, and store the results in dst.
5660
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8
5661
FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b)
5662
{
5663
return vreinterpretq_m128i_s8(
5664
vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5665
}
5666
5667
// Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit
5668
// integers in a using saturation, and store the results in dst.
5669
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16
5670
FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b)
5671
{
5672
return vreinterpretq_m128i_u16(
5673
vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
5674
}
5675
5676
// Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit
// integers in a using saturation, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8
FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(
        vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)));
}

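// Illustrative usage sketch (not part of the upstream sse2neon API; the helper
// name is arbitrary and the block is compiled out with #if 0): the saturating
// forms clamp at the type's limits instead of wrapping around.
#if 0
static void sse2neon_example_subs_epu8(void)
{
    uint8_t out[16];
    __m128i a = _mm_set1_epi8(10);
    __m128i b = _mm_set1_epi8(20);
    __m128i r = _mm_subs_epu8(a, b);       // 10 - 20 saturates to 0, no wrap-around
    _mm_storeu_si128((__m128i *) out, r);  // every byte of out is 0
}
#endif
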
#define _mm_ucomieq_sd _mm_comieq_sd
#define _mm_ucomige_sd _mm_comige_sd
#define _mm_ucomigt_sd _mm_comigt_sd
#define _mm_ucomile_sd _mm_comile_sd
#define _mm_ucomilt_sd _mm_comilt_sd
#define _mm_ucomineq_sd _mm_comineq_sd

// Return vector of type __m128d with undefined elements.
5693
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd
5694
FORCE_INLINE __m128d _mm_undefined_pd(void)
5695
{
5696
#if defined(__GNUC__) || defined(__clang__)
5697
#pragma GCC diagnostic push
5698
#pragma GCC diagnostic ignored "-Wuninitialized"
5699
#endif
5700
__m128d a;
5701
#if defined(_MSC_VER)
5702
a = _mm_setzero_pd();
5703
#endif
5704
return a;
5705
#if defined(__GNUC__) || defined(__clang__)
5706
#pragma GCC diagnostic pop
5707
#endif
5708
}
5709
5710
// Unpack and interleave 16-bit integers from the high half of a and b, and
5711
// store the results in dst.
5712
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16
5713
FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b)
5714
{
5715
#if defined(__aarch64__) || defined(_M_ARM64)
5716
return vreinterpretq_m128i_s16(
5717
vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5718
#else
5719
int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a));
5720
int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b));
5721
int16x4x2_t result = vzip_s16(a1, b1);
5722
return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
5723
#endif
5724
}
5725
5726
// Unpack and interleave 32-bit integers from the high half of a and b, and
5727
// store the results in dst.
5728
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32
5729
FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b)
5730
{
5731
#if defined(__aarch64__) || defined(_M_ARM64)
5732
return vreinterpretq_m128i_s32(
5733
vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5734
#else
5735
int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a));
5736
int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b));
5737
int32x2x2_t result = vzip_s32(a1, b1);
5738
return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
5739
#endif
5740
}
5741
5742
// Unpack and interleave 64-bit integers from the high half of a and b, and
5743
// store the results in dst.
5744
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64
5745
FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b)
5746
{
5747
#if defined(__aarch64__) || defined(_M_ARM64)
5748
return vreinterpretq_m128i_s64(
5749
vzip2q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
5750
#else
5751
int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a));
5752
int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b));
5753
return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h));
5754
#endif
5755
}
5756
5757
// Unpack and interleave 8-bit integers from the high half of a and b, and store
5758
// the results in dst.
5759
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8
5760
FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b)
5761
{
5762
#if defined(__aarch64__) || defined(_M_ARM64)
5763
return vreinterpretq_m128i_s8(
5764
vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5765
#else
5766
int8x8_t a1 =
5767
vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a)));
5768
int8x8_t b1 =
5769
vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b)));
5770
int8x8x2_t result = vzip_s8(a1, b1);
5771
return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
5772
#endif
5773
}
5774
5775
// Unpack and interleave double-precision (64-bit) floating-point elements from
5776
// the high half of a and b, and store the results in dst.
5777
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd
5778
FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b)
5779
{
5780
#if defined(__aarch64__) || defined(_M_ARM64)
5781
return vreinterpretq_m128d_f64(
5782
vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
5783
#else
5784
return vreinterpretq_m128d_s64(
5785
vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)),
5786
vget_high_s64(vreinterpretq_s64_m128d(b))));
5787
#endif
5788
}
5789
5790
// Unpack and interleave 16-bit integers from the low half of a and b, and store
5791
// the results in dst.
5792
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16
5793
FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b)
5794
{
5795
#if defined(__aarch64__) || defined(_M_ARM64)
5796
return vreinterpretq_m128i_s16(
5797
vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)));
5798
#else
5799
int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a));
5800
int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b));
5801
int16x4x2_t result = vzip_s16(a1, b1);
5802
return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1]));
5803
#endif
5804
}
5805
5806
// Unpack and interleave 32-bit integers from the low half of a and b, and store
5807
// the results in dst.
5808
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32
5809
FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b)
5810
{
5811
#if defined(__aarch64__) || defined(_M_ARM64)
5812
return vreinterpretq_m128i_s32(
5813
vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5814
#else
5815
int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a));
5816
int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b));
5817
int32x2x2_t result = vzip_s32(a1, b1);
5818
return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1]));
5819
#endif
5820
}
5821
5822
// Unpack and interleave 64-bit integers from the low half of a and b, and store
5823
// the results in dst.
5824
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64
5825
FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b)
5826
{
5827
#if defined(__aarch64__) || defined(_M_ARM64)
5828
return vreinterpretq_m128i_s64(
5829
vzip1q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
5830
#else
5831
int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a));
5832
int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b));
5833
return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l));
5834
#endif
5835
}
5836
5837
// Unpack and interleave 8-bit integers from the low half of a and b, and store
5838
// the results in dst.
5839
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8
5840
FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b)
5841
{
5842
#if defined(__aarch64__) || defined(_M_ARM64)
5843
return vreinterpretq_m128i_s8(
5844
vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
5845
#else
5846
int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a)));
5847
int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b)));
5848
int8x8x2_t result = vzip_s8(a1, b1);
5849
return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1]));
5850
#endif
5851
}
5852
5853
// Unpack and interleave double-precision (64-bit) floating-point elements from
// the low half of a and b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd
FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(
        vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
#else
    return vreinterpretq_m128d_s64(
        vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)),
                     vget_low_s64(vreinterpretq_s64_m128d(b))));
#endif
}

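// Illustrative usage sketch (not part of the upstream sse2neon API; the helper
// name is arbitrary and the block is compiled out with #if 0): the unpack
// family interleaves lanes alternately from the chosen half of each source,
// shown here with the 32-bit integer variant.
#if 0
static void sse2neon_example_unpacklo_epi32(void)
{
    int32_t out[4];
    __m128i a = _mm_set_epi32(3, 2, 1, 0);  // lanes, low to high: {0, 1, 2, 3}
    __m128i b = _mm_set_epi32(7, 6, 5, 4);  // lanes, low to high: {4, 5, 6, 7}
    __m128i r = _mm_unpacklo_epi32(a, b);   // interleave the low halves
    _mm_storeu_si128((__m128i *) out, r);   // out == {0, 4, 1, 5}
}
#endif
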
// Compute the bitwise XOR of packed double-precision (64-bit) floating-point
5869
// elements in a and b, and store the results in dst.
5870
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd
5871
FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b)
5872
{
5873
return vreinterpretq_m128d_s64(
5874
veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b)));
5875
}
5876
5877
// Compute the bitwise XOR of 128 bits (representing integer data) in a and b,
5878
// and store the result in dst.
5879
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128
5880
FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b)
5881
{
5882
return vreinterpretq_m128i_s32(
5883
veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
5884
}
5885
5886
/* SSE3 */
5887
5888
// Alternatively add and subtract packed double-precision (64-bit)
5889
// floating-point elements in a to/from packed elements in b, and store the
5890
// results in dst.
5891
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd
5892
FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b)
5893
{
5894
_sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f);
5895
#if defined(__aarch64__) || defined(_M_ARM64)
5896
return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a),
5897
vreinterpretq_f64_m128d(b),
5898
vreinterpretq_f64_m128d(mask)));
5899
#else
5900
return _mm_add_pd(_mm_mul_pd(b, mask), a);
5901
#endif
5902
}
5903
5904
// Alternatively add and subtract packed single-precision (32-bit)
// floating-point elements in a to/from packed elements in b, and store the
// results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=addsub_ps
FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b)
{
    _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f);
#if (defined(__aarch64__) || defined(_M_ARM64)) || \
    defined(__ARM_FEATURE_FMA) /* VFPv4+ */
    return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a),
                                            vreinterpretq_f32_m128(mask),
                                            vreinterpretq_f32_m128(b)));
#else
    return _mm_add_ps(_mm_mul_ps(b, mask), a);
#endif
}

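// Illustrative usage sketch (not part of the upstream sse2neon API; the helper
// name is arbitrary and the block is compiled out with #if 0): even lanes are
// subtracted, odd lanes are added, which is why the mask above is
// {-1, +1, -1, +1}.
#if 0
static void sse2neon_example_addsub_ps(void)
{
    float out[4];
    __m128 a = _mm_setr_ps(10.0f, 10.0f, 10.0f, 10.0f);
    __m128 b = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 r = _mm_addsub_ps(a, b);  // lanes: a0-b0, a1+b1, a2-b2, a3+b3
    _mm_storeu_ps(out, r);           // out == {9.0f, 12.0f, 7.0f, 14.0f}
}
#endif
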
// Horizontally add adjacent pairs of double-precision (64-bit) floating-point
5922
// elements in a and b, and pack the results in dst.
5923
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd
5924
FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b)
5925
{
5926
#if defined(__aarch64__) || defined(_M_ARM64)
5927
return vreinterpretq_m128d_f64(
5928
vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)));
5929
#else
5930
double *da = (double *) &a;
5931
double *db = (double *) &b;
5932
double c[] = {da[0] + da[1], db[0] + db[1]};
5933
return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
5934
#endif
5935
}
5936
5937
// Horizontally add adjacent pairs of single-precision (32-bit) floating-point
// elements in a and b, and pack the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps
FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128_f32(
        vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#else
    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a));
    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(
        vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32)));
#endif
}

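// Illustrative usage sketch (not part of the upstream sse2neon API; the helper
// name is arbitrary and the block is compiled out with #if 0): the low two
// output lanes are pairwise sums of a, the high two are pairwise sums of b.
#if 0
static void sse2neon_example_hadd_ps(void)
{
    float out[4];
    __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
    __m128 b = _mm_setr_ps(10.0f, 20.0f, 30.0f, 40.0f);
    __m128 r = _mm_hadd_ps(a, b);  // {a0+a1, a2+a3, b0+b1, b2+b3}
    _mm_storeu_ps(out, r);         // out == {3.0f, 7.0f, 30.0f, 70.0f}
}
#endif
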
// Horizontally subtract adjacent pairs of double-precision (64-bit)
5956
// floating-point elements in a and b, and pack the results in dst.
5957
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd
5958
FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b)
5959
{
5960
#if defined(__aarch64__) || defined(_M_ARM64)
5961
float64x2_t a = vreinterpretq_f64_m128d(_a);
5962
float64x2_t b = vreinterpretq_f64_m128d(_b);
5963
return vreinterpretq_m128d_f64(
5964
vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b)));
5965
#else
5966
double *da = (double *) &_a;
5967
double *db = (double *) &_b;
5968
double c[] = {da[0] - da[1], db[0] - db[1]};
5969
return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c));
5970
#endif
5971
}
5972
5973
// Horizontally subtract adjacent pairs of single-precision (32-bit)
5974
// floating-point elements in a and b, and pack the results in dst.
5975
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps
5976
FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b)
5977
{
5978
float32x4_t a = vreinterpretq_f32_m128(_a);
5979
float32x4_t b = vreinterpretq_f32_m128(_b);
5980
#if defined(__aarch64__) || defined(_M_ARM64)
5981
return vreinterpretq_m128_f32(
5982
vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b)));
5983
#else
5984
float32x4x2_t c = vuzpq_f32(a, b);
5985
return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1]));
5986
#endif
5987
}
5988
5989
// Load 128-bits of integer data from unaligned memory into dst. This intrinsic
5990
// may perform better than _mm_loadu_si128 when the data crosses a cache line
5991
// boundary.
5992
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128
5993
#define _mm_lddqu_si128 _mm_loadu_si128
5994
5995
// Load a double-precision (64-bit) floating-point element from memory into both
5996
// elements of dst.
5997
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd
5998
#define _mm_loaddup_pd _mm_load1_pd
5999
6000
// Duplicate the low double-precision (64-bit) floating-point element from a,
6001
// and store the results in dst.
6002
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd
6003
FORCE_INLINE __m128d _mm_movedup_pd(__m128d a)
6004
{
6005
#if defined(__aarch64__) || defined(_M_ARM64)
6006
return vreinterpretq_m128d_f64(
6007
vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0));
6008
#else
6009
return vreinterpretq_m128d_u64(
6010
vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0)));
6011
#endif
6012
}
6013
6014
// Duplicate odd-indexed single-precision (32-bit) floating-point elements
6015
// from a, and store the results in dst.
6016
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps
6017
FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a)
6018
{
6019
#if defined(__aarch64__) || defined(_M_ARM64)
6020
return vreinterpretq_m128_f32(
6021
vtrn2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
6022
#elif defined(_sse2neon_shuffle)
6023
return vreinterpretq_m128_f32(vshuffleq_s32(
6024
vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3));
6025
#else
6026
float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1);
6027
float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3);
6028
float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3};
6029
return vreinterpretq_m128_f32(vld1q_f32(data));
6030
#endif
6031
}
6032
6033
// Duplicate even-indexed single-precision (32-bit) floating-point elements
6034
// from a, and store the results in dst.
6035
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps
6036
FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a)
6037
{
6038
#if defined(__aarch64__) || defined(_M_ARM64)
6039
return vreinterpretq_m128_f32(
6040
vtrn1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)));
6041
#elif defined(_sse2neon_shuffle)
6042
return vreinterpretq_m128_f32(vshuffleq_s32(
6043
vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2));
6044
#else
6045
float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
6046
float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2);
6047
float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2};
6048
return vreinterpretq_m128_f32(vld1q_f32(data));
6049
#endif
6050
}
6051
6052
/* SSSE3 */
6053
6054
// Compute the absolute value of packed signed 16-bit integers in a, and store
6055
// the unsigned results in dst.
6056
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16
6057
FORCE_INLINE __m128i _mm_abs_epi16(__m128i a)
6058
{
6059
return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a)));
6060
}
6061
6062
// Compute the absolute value of packed signed 32-bit integers in a, and store
6063
// the unsigned results in dst.
6064
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32
6065
FORCE_INLINE __m128i _mm_abs_epi32(__m128i a)
6066
{
6067
return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a)));
6068
}
6069
6070
// Compute the absolute value of packed signed 8-bit integers in a, and store
6071
// the unsigned results in dst.
6072
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8
6073
FORCE_INLINE __m128i _mm_abs_epi8(__m128i a)
6074
{
6075
return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a)));
6076
}
6077
6078
// Compute the absolute value of packed signed 16-bit integers in a, and store
6079
// the unsigned results in dst.
6080
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi16
6081
FORCE_INLINE __m64 _mm_abs_pi16(__m64 a)
6082
{
6083
return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a)));
6084
}
6085
6086
// Compute the absolute value of packed signed 32-bit integers in a, and store
6087
// the unsigned results in dst.
6088
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi32
6089
FORCE_INLINE __m64 _mm_abs_pi32(__m64 a)
6090
{
6091
return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a)));
6092
}
6093
6094
// Compute the absolute value of packed signed 8-bit integers in a, and store
6095
// the unsigned results in dst.
6096
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi8
6097
FORCE_INLINE __m64 _mm_abs_pi8(__m64 a)
6098
{
6099
return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a)));
6100
}
6101
6102
// Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift
// the result right by imm8 bytes, and store the low 16 bytes in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8
#if defined(__GNUC__) && !defined(__clang__)
#define _mm_alignr_epi8(a, b, imm) \
    __extension__({ \
        uint8x16_t _a = vreinterpretq_u8_m128i(a); \
        uint8x16_t _b = vreinterpretq_u8_m128i(b); \
        __m128i ret; \
        if (_sse2neon_unlikely((imm) & ~31)) \
            ret = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \
        else if (imm >= 16) \
            ret = _mm_srli_si128(a, imm >= 16 ? imm - 16 : 0); \
        else \
            ret = \
                vreinterpretq_m128i_u8(vextq_u8(_b, _a, imm < 16 ? imm : 0)); \
        ret; \
    })

#else
#define _mm_alignr_epi8(a, b, imm) \
    _sse2neon_define2( \
        __m128i, a, b, uint8x16_t __a = vreinterpretq_u8_m128i(_a); \
        uint8x16_t __b = vreinterpretq_u8_m128i(_b); __m128i ret; \
        if (_sse2neon_unlikely((imm) & ~31)) ret = \
            vreinterpretq_m128i_u8(vdupq_n_u8(0)); \
        else if (imm >= 16) ret = \
            _mm_srli_si128(_a, imm >= 16 ? imm - 16 : 0); \
        else ret = \
            vreinterpretq_m128i_u8(vextq_u8(__b, __a, imm < 16 ? imm : 0)); \
        _sse2neon_return(ret);)

#endif

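// Illustrative usage sketch (not part of the upstream sse2neon API; the helper
// name is arbitrary and the block is compiled out with #if 0): b forms the low
// 16 bytes of the 32-byte concatenation and a the high 16 bytes, so imm == 4
// drops the low 4 bytes of b and pulls in the low 4 bytes of a at the top.
#if 0
static void sse2neon_example_alignr_epi8(void)
{
    int32_t out[4];
    __m128i a = _mm_set_epi32(7, 6, 5, 4);  // high block, lanes: {4, 5, 6, 7}
    __m128i b = _mm_set_epi32(3, 2, 1, 0);  // low block, lanes:  {0, 1, 2, 3}
    __m128i r = _mm_alignr_epi8(a, b, 4);   // shift the 32-byte pair right by 4 bytes
    _mm_storeu_si128((__m128i *) out, r);   // out == {1, 2, 3, 4}
}
#endif
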
// Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift
6137
// the result right by imm8 bytes, and store the low 8 bytes in dst.
6138
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_pi8
6139
#define _mm_alignr_pi8(a, b, imm) \
6140
_sse2neon_define2( \
6141
__m64, a, b, __m64 ret; if (_sse2neon_unlikely((imm) >= 16)) { \
6142
ret = vreinterpret_m64_s8(vdup_n_s8(0)); \
6143
} else { \
6144
uint8x8_t tmp_low; \
6145
uint8x8_t tmp_high; \
6146
if ((imm) >= 8) { \
6147
const int idx = (imm) -8; \
6148
tmp_low = vreinterpret_u8_m64(_a); \
6149
tmp_high = vdup_n_u8(0); \
6150
ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
6151
} else { \
6152
const int idx = (imm); \
6153
tmp_low = vreinterpret_u8_m64(_b); \
6154
tmp_high = vreinterpret_u8_m64(_a); \
6155
ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \
6156
} \
6157
} _sse2neon_return(ret);)
6158
6159
// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
6160
// signed 16-bit results in dst.
6161
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16
6162
FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b)
6163
{
6164
int16x8_t a = vreinterpretq_s16_m128i(_a);
6165
int16x8_t b = vreinterpretq_s16_m128i(_b);
6166
#if defined(__aarch64__) || defined(_M_ARM64)
6167
return vreinterpretq_m128i_s16(vpaddq_s16(a, b));
6168
#else
6169
return vreinterpretq_m128i_s16(
6170
vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)),
6171
vpadd_s16(vget_low_s16(b), vget_high_s16(b))));
6172
#endif
6173
}
6174
6175
// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
6176
// signed 32-bit results in dst.
6177
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32
6178
FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b)
6179
{
6180
int32x4_t a = vreinterpretq_s32_m128i(_a);
6181
int32x4_t b = vreinterpretq_s32_m128i(_b);
6182
#if defined(__aarch64__) || defined(_M_ARM64)
6183
return vreinterpretq_m128i_s32(vpaddq_s32(a, b));
6184
#else
6185
return vreinterpretq_m128i_s32(
6186
vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)),
6187
vpadd_s32(vget_low_s32(b), vget_high_s32(b))));
6188
#endif
6189
}
6190
6191
// Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the
6192
// signed 16-bit results in dst.
6193
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi16
6194
FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b)
6195
{
6196
return vreinterpret_m64_s16(
6197
vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b)));
6198
}
6199
6200
// Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the
6201
// signed 32-bit results in dst.
6202
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi32
6203
FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b)
6204
{
6205
return vreinterpret_m64_s32(
6206
vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)));
6207
}
6208
6209
// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
6210
// saturation, and pack the signed 16-bit results in dst.
6211
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16
6212
FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b)
6213
{
6214
#if defined(__aarch64__) || defined(_M_ARM64)
6215
int16x8_t a = vreinterpretq_s16_m128i(_a);
6216
int16x8_t b = vreinterpretq_s16_m128i(_b);
6217
return vreinterpretq_s64_s16(
6218
vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6219
#else
6220
int32x4_t a = vreinterpretq_s32_m128i(_a);
6221
int32x4_t b = vreinterpretq_s32_m128i(_b);
6222
// Interleave using vshrn/vmovn
6223
// [a0|a2|a4|a6|b0|b2|b4|b6]
6224
// [a1|a3|a5|a7|b1|b3|b5|b7]
6225
int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b));
6226
int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16));
6227
// Saturated add
6228
return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357));
6229
#endif
6230
}
6231
6232
// Horizontally add adjacent pairs of signed 16-bit integers in a and b using
6233
// saturation, and pack the signed 16-bit results in dst.
6234
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_pi16
6235
FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b)
6236
{
6237
int16x4_t a = vreinterpret_s16_m64(_a);
6238
int16x4_t b = vreinterpret_s16_m64(_b);
6239
#if defined(__aarch64__) || defined(_M_ARM64)
6240
return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6241
#else
6242
int16x4x2_t res = vuzp_s16(a, b);
6243
return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1]));
6244
#endif
6245
}
6246
6247
// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
6248
// the signed 16-bit results in dst.
6249
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16
6250
FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b)
6251
{
6252
int16x8_t a = vreinterpretq_s16_m128i(_a);
6253
int16x8_t b = vreinterpretq_s16_m128i(_b);
6254
#if defined(__aarch64__) || defined(_M_ARM64)
6255
return vreinterpretq_m128i_s16(
6256
vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6257
#else
6258
int16x8x2_t c = vuzpq_s16(a, b);
6259
return vreinterpretq_m128i_s16(vsubq_s16(c.val[0], c.val[1]));
6260
#endif
6261
}
6262
6263
// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
6264
// the signed 32-bit results in dst.
6265
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32
6266
FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b)
6267
{
6268
int32x4_t a = vreinterpretq_s32_m128i(_a);
6269
int32x4_t b = vreinterpretq_s32_m128i(_b);
6270
#if defined(__aarch64__) || defined(_M_ARM64)
6271
return vreinterpretq_m128i_s32(
6272
vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b)));
6273
#else
6274
int32x4x2_t c = vuzpq_s32(a, b);
6275
return vreinterpretq_m128i_s32(vsubq_s32(c.val[0], c.val[1]));
6276
#endif
6277
}
6278
6279
// Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack
6280
// the signed 16-bit results in dst.
6281
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pi16
6282
FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b)
6283
{
6284
int16x4_t a = vreinterpret_s16_m64(_a);
6285
int16x4_t b = vreinterpret_s16_m64(_b);
6286
#if defined(__aarch64__) || defined(_M_ARM64)
6287
return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6288
#else
6289
int16x4x2_t c = vuzp_s16(a, b);
6290
return vreinterpret_m64_s16(vsub_s16(c.val[0], c.val[1]));
6291
#endif
6292
}
6293
6294
// Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack
6295
// the signed 32-bit results in dst.
6296
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_hsub_pi32
6297
FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b)
6298
{
6299
int32x2_t a = vreinterpret_s32_m64(_a);
6300
int32x2_t b = vreinterpret_s32_m64(_b);
6301
#if defined(__aarch64__) || defined(_M_ARM64)
6302
return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b)));
6303
#else
6304
int32x2x2_t c = vuzp_s32(a, b);
6305
return vreinterpret_m64_s32(vsub_s32(c.val[0], c.val[1]));
6306
#endif
6307
}
6308
6309
// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
6310
// using saturation, and pack the signed 16-bit results in dst.
6311
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16
6312
FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b)
6313
{
6314
int16x8_t a = vreinterpretq_s16_m128i(_a);
6315
int16x8_t b = vreinterpretq_s16_m128i(_b);
6316
#if defined(__aarch64__) || defined(_M_ARM64)
6317
return vreinterpretq_m128i_s16(
6318
vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b)));
6319
#else
6320
int16x8x2_t c = vuzpq_s16(a, b);
6321
return vreinterpretq_m128i_s16(vqsubq_s16(c.val[0], c.val[1]));
6322
#endif
6323
}
6324
6325
// Horizontally subtract adjacent pairs of signed 16-bit integers in a and b
6326
// using saturation, and pack the signed 16-bit results in dst.
6327
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_pi16
6328
FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b)
6329
{
6330
int16x4_t a = vreinterpret_s16_m64(_a);
6331
int16x4_t b = vreinterpret_s16_m64(_b);
6332
#if defined(__aarch64__) || defined(_M_ARM64)
6333
return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b)));
6334
#else
6335
int16x4x2_t c = vuzp_s16(a, b);
6336
return vreinterpret_m64_s16(vqsub_s16(c.val[0], c.val[1]));
6337
#endif
6338
}
6339
6340
// Vertically multiply each unsigned 8-bit integer from a with the corresponding
// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
// Horizontally add adjacent pairs of intermediate signed 16-bit integers,
// and pack the saturated results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16
FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    uint8x16_t a = vreinterpretq_u8_m128i(_a);
    int8x16_t b = vreinterpretq_s8_m128i(_b);
    int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))),
                             vmovl_s8(vget_low_s8(b)));
    int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))),
                             vmovl_s8(vget_high_s8(b)));
    return vreinterpretq_m128i_s16(
        vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th)));
#else
    // This would be much simpler if x86 would choose to zero extend OR sign
    // extend, not both. This could probably be optimized better.
    uint16x8_t a = vreinterpretq_u16_m128i(_a);
    int16x8_t b = vreinterpretq_s16_m128i(_b);

    // Zero extend a
    int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8));
    int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00)));

    // Sign extend by shifting left then shifting right.
    int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8);
    int16x8_t b_odd = vshrq_n_s16(b, 8);

    // multiply
    int16x8_t prod1 = vmulq_s16(a_even, b_even);
    int16x8_t prod2 = vmulq_s16(a_odd, b_odd);

    // saturated add
    return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2));
#endif
}

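// Illustrative usage sketch (not part of the upstream sse2neon API; the helper
// name is arbitrary and the block is compiled out with #if 0): the first
// operand is read as unsigned bytes, the second as signed bytes, and each pair
// of adjacent products is added with signed saturation into a 16-bit lane.
#if 0
static void sse2neon_example_maddubs_epi16(void)
{
    int16_t out[8];
    __m128i a = _mm_set1_epi8(2);          // treated as unsigned 8-bit values
    __m128i b = _mm_set1_epi8(-3);         // treated as signed 8-bit values
    __m128i r = _mm_maddubs_epi16(a, b);   // each pair: 2*(-3) + 2*(-3) = -12
    _mm_storeu_si128((__m128i *) out, r);  // every 16-bit lane of out is -12
}
#endif
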
// Vertically multiply each unsigned 8-bit integer from a with the corresponding
6380
// signed 8-bit integer from b, producing intermediate signed 16-bit integers.
6381
// Horizontally add adjacent pairs of intermediate signed 16-bit integers, and
6382
// pack the saturated results in dst.
6383
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_pi16
6384
FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b)
6385
{
6386
uint16x4_t a = vreinterpret_u16_m64(_a);
6387
int16x4_t b = vreinterpret_s16_m64(_b);
6388
6389
// Zero extend a
6390
int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8));
6391
int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff)));
6392
6393
// Sign extend by shifting left then shifting right.
6394
int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8);
6395
int16x4_t b_odd = vshr_n_s16(b, 8);
6396
6397
// multiply
6398
int16x4_t prod1 = vmul_s16(a_even, b_even);
6399
int16x4_t prod2 = vmul_s16(a_odd, b_odd);
6400
6401
// saturated add
6402
return vreinterpret_m64_s16(vqadd_s16(prod1, prod2));
6403
}
6404
6405
// Multiply packed signed 16-bit integers in a and b, producing intermediate
// signed 32-bit integers. Shift right by 15 bits while rounding up, and store
// the packed 16-bit integers in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16
FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b)
{
    // Has issues due to saturation
    // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b));

    // Multiply
    int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)),
                                 vget_low_s16(vreinterpretq_s16_m128i(b)));
    int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)),
                                 vget_high_s16(vreinterpretq_s16_m128i(b)));

    // Rounding narrowing shift right
    // narrow = (int16_t)((mul + 16384) >> 15);
    int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15);
    int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15);

    // Join together
    return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi));
}

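// Worked example (illustrative only; the helper name is arbitrary and the block
// is compiled out with #if 0): each lane computes
// (int16_t)(((int32_t) a * b + 0x4000) >> 15), i.e. a Q15 fixed-point multiply
// with round-to-nearest.
#if 0
static void sse2neon_example_mulhrs_epi16(void)
{
    int16_t out[8];
    __m128i a = _mm_set1_epi16(0x4000);    // 0.5 in Q15
    __m128i b = _mm_set1_epi16(0x2000);    // 0.25 in Q15
    __m128i r = _mm_mulhrs_epi16(a, b);    // (0x4000 * 0x2000 + 0x4000) >> 15
    _mm_storeu_si128((__m128i *) out, r);  // every lane is 0x1000 (0.125 in Q15)
}
#endif
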
// Multiply packed signed 16-bit integers in a and b, producing intermediate
6430
// signed 32-bit integers. Truncate each intermediate integer to the 18 most
6431
// significant bits, round by adding 1, and store bits [16:1] to dst.
6432
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_pi16
6433
FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b)
6434
{
6435
int32x4_t mul_extend =
6436
vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b)));
6437
6438
// Rounding narrowing shift right
6439
return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15));
6440
}
6441
6442
// Shuffle packed 8-bit integers in a according to shuffle control mask in the
// corresponding 8-bit element of b, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8
FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b)
{
    int8x16_t tbl = vreinterpretq_s8_m128i(a);   // input a
    uint8x16_t idx = vreinterpretq_u8_m128i(b);  // input b
    uint8x16_t idx_masked =
        vandq_u8(idx, vdupq_n_u8(0x8F));  // avoid using meaningless bits
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked));
#elif defined(__GNUC__)
    int8x16_t ret;
    // %e and %f represent the even and odd D registers
    // respectively.
    __asm__ __volatile__(
        "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n"
        "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n"
        : [ret] "=&w"(ret)
        : [tbl] "w"(tbl), [idx] "w"(idx_masked));
    return vreinterpretq_m128i_s8(ret);
#else
    // Generic NEON fallback (32-bit Arm without GNU-style inline assembly):
    // split the table into two D registers and look up with vtbl2.
    int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)};
    return vreinterpretq_m128i_s8(
        vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)),
                    vtbl2_s8(a_split, vget_high_u8(idx_masked))));
#endif
}
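
// Usage sketch (illustrative only): _mm_shuffle_epi8 acts as a 16-entry byte
// table lookup, so reversing the byte order of a vector can be written as
// follows, assuming the _mm_setr_epi8 helper defined earlier in this header:
//
//   __m128i bytes = _mm_setr_epi8(0, 1, 2, 3, 4, 5, 6, 7,
//                                 8, 9, 10, 11, 12, 13, 14, 15);
//   __m128i rev   = _mm_setr_epi8(15, 14, 13, 12, 11, 10, 9, 8,
//                                 7, 6, 5, 4, 3, 2, 1, 0);
//   __m128i out   = _mm_shuffle_epi8(bytes, rev);  // bytes in reverse order
//
// An index byte with its most significant bit set zeroes the corresponding
// output byte, matching the SSSE3 behavior emulated above.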
6471
6472
// Shuffle packed 8-bit integers in a according to shuffle control mask in the
6473
// corresponding 8-bit element of b, and store the results in dst.
6474
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi8
6475
FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b)
6476
{
6477
const int8x8_t controlMask =
6478
vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t) (0x1 << 7 | 0x07)));
6479
int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask);
6480
return vreinterpret_m64_s8(res);
6481
}
6482
6483
// Negate packed 16-bit integers in a when the corresponding signed
// 16-bit integer in b is negative, and store the results in dst.
// Elements in dst are zeroed out when the corresponding element
// in b is zero.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16
6488
FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b)
6489
{
6490
int16x8_t a = vreinterpretq_s16_m128i(_a);
6491
int16x8_t b = vreinterpretq_s16_m128i(_b);
6492
6493
// signed shift right: faster than vclt
6494
// (b < 0) ? 0xFFFF : 0
6495
uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15));
6496
// (b == 0) ? 0xFFFF : 0
6497
#if defined(__aarch64__) || defined(_M_ARM64)
6498
int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b));
6499
#else
6500
int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0)));
6501
#endif
6502
6503
    // bitwise select either a or negative 'a' (vnegq_s16(a) equals negative
    // 'a') based on ltMask
6505
int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a);
6506
// res = masked & (~zeroMask)
6507
int16x8_t res = vbicq_s16(masked, zeroMask);
6508
return vreinterpretq_m128i_s16(res);
6509
}
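
// Usage sketch (illustrative only): each lane of a is copied, negated, or
// zeroed depending on the sign of the matching lane in b. For example,
// assuming the _mm_setr_epi16 helper defined earlier in this header:
//
//   __m128i a = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
//   __m128i b = _mm_setr_epi16(-1, 0, 2, -3, 5, 0, -7, 9);
//   __m128i r = _mm_sign_epi16(a, b);  // r = {-1, 0, 3, -4, 5, 0, -7, 8}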
6510
6511
// Negate packed 32-bit integers in a when the corresponding signed
// 32-bit integer in b is negative, and store the results in dst.
// Elements in dst are zeroed out when the corresponding element
// in b is zero.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32
6516
FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b)
6517
{
6518
int32x4_t a = vreinterpretq_s32_m128i(_a);
6519
int32x4_t b = vreinterpretq_s32_m128i(_b);
6520
6521
// signed shift right: faster than vclt
6522
// (b < 0) ? 0xFFFFFFFF : 0
6523
uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31));
6524
6525
// (b == 0) ? 0xFFFFFFFF : 0
6526
#if defined(__aarch64__) || defined(_M_ARM64)
6527
int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b));
6528
#else
6529
int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0)));
6530
#endif
6531
6532
    // bitwise select either a or negative 'a' (vnegq_s32(a) equals negative
    // 'a') based on ltMask
6534
int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a);
6535
// res = masked & (~zeroMask)
6536
int32x4_t res = vbicq_s32(masked, zeroMask);
6537
return vreinterpretq_m128i_s32(res);
6538
}
6539
6540
// Negate packed 8-bit integers in a when the corresponding signed
// 8-bit integer in b is negative, and store the results in dst.
// Elements in dst are zeroed out when the corresponding element
// in b is zero.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8
6545
FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b)
6546
{
6547
int8x16_t a = vreinterpretq_s8_m128i(_a);
6548
int8x16_t b = vreinterpretq_s8_m128i(_b);
6549
6550
// signed shift right: faster than vclt
6551
// (b < 0) ? 0xFF : 0
6552
uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7));
6553
6554
// (b == 0) ? 0xFF : 0
6555
#if defined(__aarch64__) || defined(_M_ARM64)
6556
int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b));
6557
#else
6558
int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0)));
6559
#endif
6560
6561
    // bitwise select either a or negative 'a' (vnegq_s8(a) returns negative
    // 'a') based on ltMask
6563
int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a);
6564
// res = masked & (~zeroMask)
6565
int8x16_t res = vbicq_s8(masked, zeroMask);
6566
6567
return vreinterpretq_m128i_s8(res);
6568
}
6569
6570
// Negate packed 16-bit integers in a when the corresponding signed 16-bit
// integer in b is negative, and store the results in dst. Elements in dst are
// zeroed out when the corresponding element in b is zero.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi16
6574
FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b)
6575
{
6576
int16x4_t a = vreinterpret_s16_m64(_a);
6577
int16x4_t b = vreinterpret_s16_m64(_b);
6578
6579
// signed shift right: faster than vclt
6580
// (b < 0) ? 0xFFFF : 0
6581
uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15));
6582
6583
// (b == 0) ? 0xFFFF : 0
6584
#if defined(__aarch64__) || defined(_M_ARM64)
6585
int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b));
6586
#else
6587
int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0)));
6588
#endif
6589
6590
    // bitwise select either a or negative 'a' (vneg_s16(a) returns negative
    // 'a') based on ltMask
6592
int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a);
6593
// res = masked & (~zeroMask)
6594
int16x4_t res = vbic_s16(masked, zeroMask);
6595
6596
return vreinterpret_m64_s16(res);
6597
}
6598
6599
// Negate packed 32-bit integers in a when the corresponding signed 32-bit
// integer in b is negative, and store the results in dst. Elements in dst are
// zeroed out when the corresponding element in b is zero.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi32
6603
FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b)
6604
{
6605
int32x2_t a = vreinterpret_s32_m64(_a);
6606
int32x2_t b = vreinterpret_s32_m64(_b);
6607
6608
// signed shift right: faster than vclt
6609
// (b < 0) ? 0xFFFFFFFF : 0
6610
uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31));
6611
6612
// (b == 0) ? 0xFFFFFFFF : 0
6613
#if defined(__aarch64__) || defined(_M_ARM64)
6614
int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b));
6615
#else
6616
int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0)));
6617
#endif
6618
6619
    // bitwise select either a or negative 'a' (vneg_s32(a) returns negative
    // 'a') based on ltMask
6621
int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a);
6622
// res = masked & (~zeroMask)
6623
int32x2_t res = vbic_s32(masked, zeroMask);
6624
6625
return vreinterpret_m64_s32(res);
6626
}
6627
6628
// Negate packed 8-bit integers in a when the corresponding signed 8-bit integer
// in b is negative, and store the results in dst. Elements in dst are zeroed
// out when the corresponding element in b is zero.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi8
6632
FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b)
6633
{
6634
int8x8_t a = vreinterpret_s8_m64(_a);
6635
int8x8_t b = vreinterpret_s8_m64(_b);
6636
6637
// signed shift right: faster than vclt
6638
// (b < 0) ? 0xFF : 0
6639
uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7));
6640
6641
// (b == 0) ? 0xFF : 0
6642
#if defined(__aarch64__) || defined(_M_ARM64)
6643
int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b));
6644
#else
6645
int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0)));
6646
#endif
6647
6648
    // bitwise select either a or negative 'a' (vneg_s8(a) returns negative
    // 'a') based on ltMask
6650
int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a);
6651
// res = masked & (~zeroMask)
6652
int8x8_t res = vbic_s8(masked, zeroMask);
6653
6654
return vreinterpret_m64_s8(res);
6655
}
6656
6657
/* SSE4.1 */
6658
6659
// Blend packed 16-bit integers from a and b using control mask imm8, and store
6660
// the results in dst.
6661
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16
6662
// FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b,
6663
// __constrange(0,255) int imm)
6664
#define _mm_blend_epi16(a, b, imm) \
6665
_sse2neon_define2( \
6666
__m128i, a, b, \
6667
const uint16_t _mask[8] = \
6668
_sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0, \
6669
((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0, \
6670
((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0, \
6671
((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0, \
6672
((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0, \
6673
((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0, \
6674
((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0, \
6675
((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0); \
6676
uint16x8_t _mask_vec = vld1q_u16(_mask); \
6677
uint16x8_t __a = vreinterpretq_u16_m128i(_a); \
6678
uint16x8_t __b = vreinterpretq_u16_m128i(_b); _sse2neon_return( \
6679
vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, __b, __a)));)
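
// Usage sketch (illustrative only): bit i of the immediate selects 16-bit lane
// i from b, with the remaining lanes taken from a. For example, assuming the
// _mm_set1_epi16 helper defined earlier in this header:
//
//   __m128i a = _mm_set1_epi16(0);
//   __m128i b = _mm_set1_epi16(-1);
//   __m128i r = _mm_blend_epi16(a, b, 0x0F);
//   // lanes 0-3 of r come from b (-1), lanes 4-7 from a (0)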
6680
6681
// Blend packed double-precision (64-bit) floating-point elements from a and b
6682
// using control mask imm8, and store the results in dst.
6683
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd
6684
#define _mm_blend_pd(a, b, imm) \
6685
_sse2neon_define2( \
6686
__m128d, a, b, \
6687
const uint64_t _mask[2] = \
6688
_sse2neon_init(((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \
6689
((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)); \
6690
uint64x2_t _mask_vec = vld1q_u64(_mask); \
6691
uint64x2_t __a = vreinterpretq_u64_m128d(_a); \
6692
uint64x2_t __b = vreinterpretq_u64_m128d(_b); _sse2neon_return( \
6693
vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, __b, __a)));)
6694
6695
// Blend packed single-precision (32-bit) floating-point elements from a and b
// using control mask imm8, and store the results in dst.
6697
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps
6698
FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8)
6699
{
6700
const uint32_t ALIGN_STRUCT(16)
6701
data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0,
6702
((imm8) & (1 << 1)) ? UINT32_MAX : 0,
6703
((imm8) & (1 << 2)) ? UINT32_MAX : 0,
6704
((imm8) & (1 << 3)) ? UINT32_MAX : 0};
6705
uint32x4_t mask = vld1q_u32(data);
6706
float32x4_t a = vreinterpretq_f32_m128(_a);
6707
float32x4_t b = vreinterpretq_f32_m128(_b);
6708
return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
6709
}
6710
6711
// Blend packed 8-bit integers from a and b using mask, and store the results in
6712
// dst.
6713
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8
6714
FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask)
6715
{
6716
// Use a signed shift right to create a mask with the sign bit
6717
uint8x16_t mask =
6718
vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7));
6719
uint8x16_t a = vreinterpretq_u8_m128i(_a);
6720
uint8x16_t b = vreinterpretq_u8_m128i(_b);
6721
return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a));
6722
}
6723
6724
// Blend packed double-precision (64-bit) floating-point elements from a and b
6725
// using mask, and store the results in dst.
6726
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd
6727
FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask)
6728
{
6729
uint64x2_t mask =
6730
vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63));
6731
#if defined(__aarch64__) || defined(_M_ARM64)
6732
float64x2_t a = vreinterpretq_f64_m128d(_a);
6733
float64x2_t b = vreinterpretq_f64_m128d(_b);
6734
return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a));
6735
#else
6736
uint64x2_t a = vreinterpretq_u64_m128d(_a);
6737
uint64x2_t b = vreinterpretq_u64_m128d(_b);
6738
return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a));
6739
#endif
6740
}
6741
6742
// Blend packed single-precision (32-bit) floating-point elements from a and b
6743
// using mask, and store the results in dst.
6744
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps
6745
FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask)
6746
{
6747
// Use a signed shift right to create a mask with the sign bit
6748
uint32x4_t mask =
6749
vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31));
6750
float32x4_t a = vreinterpretq_f32_m128(_a);
6751
float32x4_t b = vreinterpretq_f32_m128(_b);
6752
return vreinterpretq_m128_f32(vbslq_f32(mask, b, a));
6753
}
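
// Usage sketch (illustrative only): paired with a comparison, _mm_blendv_ps is
// a branchless per-lane select. For example, clamping negative lanes to zero,
// assuming the SSE helpers defined earlier in this header:
//
//   __m128 x    = _mm_setr_ps(-1.0f, 2.0f, -3.0f, 4.0f);
//   __m128 zero = _mm_setzero_ps();
//   __m128 mask = _mm_cmplt_ps(x, zero);         // all-ones where x < 0
//   __m128 r    = _mm_blendv_ps(x, zero, mask);  // r = {0, 2, 0, 4}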
6754
6755
// Round the packed double-precision (64-bit) floating-point elements in a up
6756
// to an integer value, and store the results as packed double-precision
6757
// floating-point elements in dst.
6758
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd
6759
FORCE_INLINE __m128d _mm_ceil_pd(__m128d a)
6760
{
6761
#if defined(__aarch64__) || defined(_M_ARM64)
6762
return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a)));
6763
#else
6764
double *f = (double *) &a;
6765
return _mm_set_pd(ceil(f[1]), ceil(f[0]));
6766
#endif
6767
}
6768
6769
// Round the packed single-precision (32-bit) floating-point elements in a up to
6770
// an integer value, and store the results as packed single-precision
6771
// floating-point elements in dst.
6772
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps
6773
FORCE_INLINE __m128 _mm_ceil_ps(__m128 a)
6774
{
6775
#if (defined(__aarch64__) || defined(_M_ARM64)) || \
6776
defined(__ARM_FEATURE_DIRECTED_ROUNDING)
6777
return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a)));
6778
#else
6779
float *f = (float *) &a;
6780
return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0]));
6781
#endif
6782
}
6783
6784
// Round the lower double-precision (64-bit) floating-point element in b up to
6785
// an integer value, store the result as a double-precision floating-point
6786
// element in the lower element of dst, and copy the upper element from a to the
6787
// upper element of dst.
6788
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd
6789
FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b)
6790
{
6791
return _mm_move_sd(a, _mm_ceil_pd(b));
6792
}
6793
6794
// Round the lower single-precision (32-bit) floating-point element in b up to
6795
// an integer value, store the result as a single-precision floating-point
6796
// element in the lower element of dst, and copy the upper 3 packed elements
6797
// from a to the upper elements of dst.
6798
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss
6799
FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b)
6800
{
6801
return _mm_move_ss(a, _mm_ceil_ps(b));
6802
}
6803
6804
// Compare packed 64-bit integers in a and b for equality, and store the results
6805
// in dst.
6806
FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b)
6807
{
6808
#if defined(__aarch64__) || defined(_M_ARM64)
6809
return vreinterpretq_m128i_u64(
6810
vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b)));
6811
#else
6812
// ARMv7 lacks vceqq_u64
6813
// (a == b) -> (a_lo == b_lo) && (a_hi == b_hi)
6814
uint32x4_t cmp =
6815
vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b));
6816
uint32x4_t swapped = vrev64q_u32(cmp);
6817
return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped));
6818
#endif
6819
}
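
// Usage sketch (illustrative only): each 64-bit lane becomes all-ones on
// equality and all-zeros otherwise, so the result is typically consumed by a
// movemask or a blend. For example, assuming the SSE helpers defined earlier
// in this header:
//
//   __m128i a  = _mm_set_epi64x(7, 42);
//   __m128i b  = _mm_set_epi64x(8, 42);
//   __m128i eq = _mm_cmpeq_epi64(a, b);
//   int mask   = _mm_movemask_epi8(eq);  // 0x00FF: only the low lane matches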
6820
6821
// Sign extend packed 16-bit integers in a to packed 32-bit integers, and store
6822
// the results in dst.
6823
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32
6824
FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a)
6825
{
6826
return vreinterpretq_m128i_s32(
6827
vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a))));
6828
}
6829
6830
// Sign extend packed 16-bit integers in a to packed 64-bit integers, and store
6831
// the results in dst.
6832
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64
6833
FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a)
6834
{
6835
int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */
6836
int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
6837
int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
6838
return vreinterpretq_m128i_s64(s64x2);
6839
}
6840
6841
// Sign extend packed 32-bit integers in a to packed 64-bit integers, and store
6842
// the results in dst.
6843
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64
6844
FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a)
6845
{
6846
return vreinterpretq_m128i_s64(
6847
vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))));
6848
}
6849
6850
// Sign extend packed 8-bit integers in a to packed 16-bit integers, and store
6851
// the results in dst.
6852
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16
6853
FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a)
6854
{
6855
int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
6856
int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
6857
return vreinterpretq_m128i_s16(s16x8);
6858
}
6859
6860
// Sign extend packed 8-bit integers in a to packed 32-bit integers, and store
6861
// the results in dst.
6862
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32
6863
FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a)
6864
{
6865
int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */
6866
int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */
6867
int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */
6868
return vreinterpretq_m128i_s32(s32x4);
6869
}
6870
6871
// Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit
6872
// integers, and store the results in dst.
6873
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64
6874
FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a)
6875
{
6876
int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */
6877
int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */
6878
int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */
6879
int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */
6880
return vreinterpretq_m128i_s64(s64x2);
6881
}
6882
6883
// Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers,
6884
// and store the results in dst.
6885
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32
6886
FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a)
6887
{
6888
return vreinterpretq_m128i_u32(
6889
vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a))));
6890
}
6891
6892
// Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers,
6893
// and store the results in dst.
6894
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64
6895
FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a)
6896
{
6897
uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */
6898
uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
6899
uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
6900
return vreinterpretq_m128i_u64(u64x2);
6901
}
6902
6903
// Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers,
6904
// and store the results in dst.
6905
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64
6906
FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a)
6907
{
6908
return vreinterpretq_m128i_u64(
6909
vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a))));
6910
}
6911
6912
// Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers,
6913
// and store the results in dst.
6914
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16
6915
FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a)
6916
{
6917
uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx HGFE DCBA */
6918
uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */
6919
return vreinterpretq_m128i_u16(u16x8);
6920
}
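
// Usage sketch (illustrative only): the _mm_cvtepu8_*/_mm_cvtepi8_* family
// widens only the low bytes of the source vector. For example, assuming the
// _mm_setr_epi8 helper defined earlier in this header:
//
//   __m128i bytes = _mm_setr_epi8(1, 2, 3, 4, 5, 6, 7, 8,
//                                 9, 10, 11, 12, 13, 14, 15, 16);
//   __m128i words = _mm_cvtepu8_epi16(bytes);
//   // 16-bit lanes of words: {1, 2, 3, 4, 5, 6, 7, 8}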
6921
6922
// Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers,
6923
// and store the results in dst.
6924
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32
6925
FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a)
6926
{
6927
uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */
6928
uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */
6929
uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */
6930
return vreinterpretq_m128i_u32(u32x4);
6931
}
6932
6933
// Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed
6934
// 64-bit integers, and store the results in dst.
6935
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64
6936
FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a)
6937
{
6938
uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */
6939
uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */
6940
uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */
6941
uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */
6942
return vreinterpretq_m128i_u64(u64x2);
6943
}
6944
6945
// Conditionally multiply the packed double-precision (64-bit) floating-point
6946
// elements in a and b using the high 4 bits in imm8, sum the four products, and
6947
// conditionally store the sum in dst using the low 4 bits of imm8.
6948
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd
6949
FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm)
6950
{
6951
// Generate mask value from constant immediate bit value
6952
const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0;
6953
const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0;
6954
#if !SSE2NEON_PRECISE_DP
6955
const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0;
6956
const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0;
6957
#endif
6958
// Conditional multiplication
6959
#if !SSE2NEON_PRECISE_DP
6960
__m128d mul = _mm_mul_pd(a, b);
6961
const __m128d mulMask =
6962
_mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask));
6963
__m128d tmp = _mm_and_pd(mul, mulMask);
6964
#else
6965
#if defined(__aarch64__) || defined(_M_ARM64)
6966
double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) *
6967
vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0)
6968
: 0;
6969
double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) *
6970
vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1)
6971
: 0;
6972
#else
6973
double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0;
6974
double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0;
6975
#endif
6976
__m128d tmp = _mm_set_pd(d1, d0);
6977
#endif
6978
// Sum the products
6979
#if defined(__aarch64__) || defined(_M_ARM64)
6980
double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp));
6981
#else
6982
double sum = *((double *) &tmp) + *(((double *) &tmp) + 1);
6983
#endif
6984
// Conditionally store the sum
6985
const __m128d sumMask =
6986
_mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask));
6987
__m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask);
6988
return res;
6989
}
6990
6991
// Conditionally multiply the packed single-precision (32-bit) floating-point
6992
// elements in a and b using the high 4 bits in imm8, sum the four products,
6993
// and conditionally store the sum in dst using the low 4 bits of imm.
6994
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps
6995
FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm)
6996
{
6997
float32x4_t elementwise_prod = _mm_mul_ps(a, b);
6998
6999
#if defined(__aarch64__) || defined(_M_ARM64)
7000
/* shortcuts */
7001
if (imm == 0xFF) {
7002
return _mm_set1_ps(vaddvq_f32(elementwise_prod));
7003
}
7004
7005
if ((imm & 0x0F) == 0x0F) {
7006
if (!(imm & (1 << 4)))
7007
elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 0);
7008
if (!(imm & (1 << 5)))
7009
elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 1);
7010
if (!(imm & (1 << 6)))
7011
elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 2);
7012
if (!(imm & (1 << 7)))
7013
elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 3);
7014
7015
return _mm_set1_ps(vaddvq_f32(elementwise_prod));
7016
}
7017
#endif
7018
7019
float s = 0.0f;
7020
7021
if (imm & (1 << 4))
7022
s += vgetq_lane_f32(elementwise_prod, 0);
7023
if (imm & (1 << 5))
7024
s += vgetq_lane_f32(elementwise_prod, 1);
7025
if (imm & (1 << 6))
7026
s += vgetq_lane_f32(elementwise_prod, 2);
7027
if (imm & (1 << 7))
7028
s += vgetq_lane_f32(elementwise_prod, 3);
7029
7030
const float32_t res[4] = {
7031
(imm & 0x1) ? s : 0.0f,
7032
(imm & 0x2) ? s : 0.0f,
7033
(imm & 0x4) ? s : 0.0f,
7034
(imm & 0x8) ? s : 0.0f,
7035
};
7036
return vreinterpretq_m128_f32(vld1q_f32(res));
7037
}
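
// Usage sketch (illustrative only): a full four-element dot product broadcast
// to every lane uses imm8 = 0xFF (multiply all lanes, store the sum in all
// lanes). For example, assuming the SSE helpers defined earlier in this header:
//
//   __m128 a   = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
//   __m128 b   = _mm_setr_ps(5.0f, 6.0f, 7.0f, 8.0f);
//   __m128 dot = _mm_dp_ps(a, b, 0xFF);
//   float r    = _mm_cvtss_f32(dot);  // 70.0f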
7038
7039
// Extract a 32-bit integer from a, selected with imm8, and store the result in
7040
// dst.
7041
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32
7042
// FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm)
7043
#define _mm_extract_epi32(a, imm) \
7044
vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm))
7045
7046
// Extract a 64-bit integer from a, selected with imm8, and store the result in
7047
// dst.
7048
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi64
7049
// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm)
7050
#define _mm_extract_epi64(a, imm) \
7051
vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm))
7052
7053
// Extract an 8-bit integer from a, selected with imm8, and store the result in
7054
// the lower element of dst. FORCE_INLINE int _mm_extract_epi8(__m128i a,
7055
// __constrange(0,16) int imm)
7056
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8
7057
#define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm))
7058
7059
// Extract a single-precision (32-bit) floating-point element from a, selected
// with imm8, and store the result in dst.
// FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm)
7061
#define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm))
7062
7063
// Round the packed double-precision (64-bit) floating-point elements in a down
7064
// to an integer value, and store the results as packed double-precision
7065
// floating-point elements in dst.
7066
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd
7067
FORCE_INLINE __m128d _mm_floor_pd(__m128d a)
7068
{
7069
#if defined(__aarch64__) || defined(_M_ARM64)
7070
return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a)));
7071
#else
7072
double *f = (double *) &a;
7073
return _mm_set_pd(floor(f[1]), floor(f[0]));
7074
#endif
7075
}
7076
7077
// Round the packed single-precision (32-bit) floating-point elements in a down
7078
// to an integer value, and store the results as packed single-precision
7079
// floating-point elements in dst.
7080
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps
7081
FORCE_INLINE __m128 _mm_floor_ps(__m128 a)
7082
{
7083
#if (defined(__aarch64__) || defined(_M_ARM64)) || \
7084
defined(__ARM_FEATURE_DIRECTED_ROUNDING)
7085
return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a)));
7086
#else
7087
float *f = (float *) &a;
7088
return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0]));
7089
#endif
7090
}
7091
7092
// Round the lower double-precision (64-bit) floating-point element in b down to
7093
// an integer value, store the result as a double-precision floating-point
7094
// element in the lower element of dst, and copy the upper element from a to the
7095
// upper element of dst.
7096
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd
7097
FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b)
7098
{
7099
return _mm_move_sd(a, _mm_floor_pd(b));
7100
}
7101
7102
// Round the lower single-precision (32-bit) floating-point element in b down to
7103
// an integer value, store the result as a single-precision floating-point
7104
// element in the lower element of dst, and copy the upper 3 packed elements
7105
// from a to the upper elements of dst.
7106
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss
7107
FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b)
7108
{
7109
return _mm_move_ss(a, _mm_floor_ps(b));
7110
}
7111
7112
// Copy a to dst, and insert the 32-bit integer i into dst at the location
7113
// specified by imm8.
7114
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32
7115
// FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b,
7116
// __constrange(0,4) int imm)
7117
#define _mm_insert_epi32(a, b, imm) \
7118
vreinterpretq_m128i_s32( \
7119
vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm)))
7120
7121
// Copy a to dst, and insert the 64-bit integer i into dst at the location
7122
// specified by imm8.
7123
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi64
7124
// FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b,
7125
// __constrange(0,2) int imm)
7126
#define _mm_insert_epi64(a, b, imm) \
7127
vreinterpretq_m128i_s64( \
7128
vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm)))
7129
7130
// Copy a to dst, and insert the lower 8-bit integer from i into dst at the
7131
// location specified by imm8.
7132
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8
7133
// FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b,
7134
// __constrange(0,16) int imm)
7135
#define _mm_insert_epi8(a, b, imm) \
7136
vreinterpretq_m128i_s8(vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm)))
7137
7138
// Copy a to tmp, then insert a single-precision (32-bit) floating-point
7139
// element from b into tmp using the control in imm8. Store tmp to dst using
7140
// the mask in imm8 (elements are zeroed out when the corresponding bit is set).
7141
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps
7142
#define _mm_insert_ps(a, b, imm8) \
7143
_sse2neon_define2( \
7144
__m128, a, b, \
7145
float32x4_t tmp1 = \
7146
vsetq_lane_f32(vgetq_lane_f32(_b, (imm8 >> 6) & 0x3), \
7147
vreinterpretq_f32_m128(_a), 0); \
7148
float32x4_t tmp2 = \
7149
vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), \
7150
vreinterpretq_f32_m128(_a), ((imm8 >> 4) & 0x3)); \
7151
const uint32_t data[4] = \
7152
_sse2neon_init(((imm8) & (1 << 0)) ? UINT32_MAX : 0, \
7153
((imm8) & (1 << 1)) ? UINT32_MAX : 0, \
7154
((imm8) & (1 << 2)) ? UINT32_MAX : 0, \
7155
((imm8) & (1 << 3)) ? UINT32_MAX : 0); \
7156
uint32x4_t mask = vld1q_u32(data); \
7157
float32x4_t all_zeros = vdupq_n_f32(0); \
7158
\
7159
_sse2neon_return(vreinterpretq_m128_f32( \
7160
vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))));)
7161
7162
// Compare packed signed 32-bit integers in a and b, and store packed maximum
7163
// values in dst.
7164
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32
7165
FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b)
7166
{
7167
return vreinterpretq_m128i_s32(
7168
vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7169
}
7170
7171
// Compare packed signed 8-bit integers in a and b, and store packed maximum
7172
// values in dst.
7173
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8
7174
FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b)
7175
{
7176
return vreinterpretq_m128i_s8(
7177
vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
7178
}
7179
7180
// Compare packed unsigned 16-bit integers in a and b, and store packed maximum
7181
// values in dst.
7182
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16
7183
FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b)
7184
{
7185
return vreinterpretq_m128i_u16(
7186
vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
7187
}
7188
7189
// Compare packed unsigned 32-bit integers in a and b, and store packed maximum
7190
// values in dst.
7191
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32
7192
FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b)
7193
{
7194
return vreinterpretq_m128i_u32(
7195
vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
7196
}
7197
7198
// Compare packed signed 32-bit integers in a and b, and store packed minimum
7199
// values in dst.
7200
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32
7201
FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
7202
{
7203
return vreinterpretq_m128i_s32(
7204
vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7205
}
7206
7207
// Compare packed signed 8-bit integers in a and b, and store packed minimum
7208
// values in dst.
7209
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8
7210
FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
7211
{
7212
return vreinterpretq_m128i_s8(
7213
vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
7214
}
7215
7216
// Compare packed unsigned 16-bit integers in a and b, and store packed minimum
7217
// values in dst.
7218
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16
7219
FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
7220
{
7221
return vreinterpretq_m128i_u16(
7222
vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
7223
}
7224
7225
// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
7226
// values in dst.
7227
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu32
7228
FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
7229
{
7230
return vreinterpretq_m128i_u32(
7231
vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
7232
}
7233
7234
// Horizontally compute the minimum amongst the packed unsigned 16-bit integers
7235
// in a, store the minimum and index in dst, and zero the remaining bits in dst.
7236
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16
7237
FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
7238
{
7239
__m128i dst;
7240
uint16_t min, idx = 0;
7241
#if defined(__aarch64__) || defined(_M_ARM64)
7242
// Find the minimum value
7243
min = vminvq_u16(vreinterpretq_u16_m128i(a));
7244
7245
// Get the index of the minimum value
7246
static const uint16_t idxv[] = {0, 1, 2, 3, 4, 5, 6, 7};
7247
uint16x8_t minv = vdupq_n_u16(min);
7248
uint16x8_t cmeq = vceqq_u16(minv, vreinterpretq_u16_m128i(a));
7249
idx = vminvq_u16(vornq_u16(vld1q_u16(idxv), cmeq));
7250
#else
7251
// Find the minimum value
7252
__m64 tmp;
7253
tmp = vreinterpret_m64_u16(
7254
vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
7255
vget_high_u16(vreinterpretq_u16_m128i(a))));
7256
tmp = vreinterpret_m64_u16(
7257
vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
7258
tmp = vreinterpret_m64_u16(
7259
vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
7260
min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
7261
// Get the index of the minimum value
7262
int i;
7263
for (i = 0; i < 8; i++) {
7264
if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
7265
idx = (uint16_t) i;
7266
break;
7267
}
7268
a = _mm_srli_si128(a, 2);
7269
}
7270
#endif
7271
// Generate result
7272
dst = _mm_setzero_si128();
7273
dst = vreinterpretq_m128i_u16(
7274
vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
7275
dst = vreinterpretq_m128i_u16(
7276
vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
7277
return dst;
7278
}
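
// Usage sketch (illustrative only): the minimum value ends up in 16-bit lane 0
// and its index in lane 1, so both can be read back with _mm_extract_epi16.
// For example, assuming the SSE helpers defined earlier in this header:
//
//   __m128i v   = _mm_setr_epi16(9, 4, 7, 4, 8, 6, 5, 3);
//   __m128i mp  = _mm_minpos_epu16(v);
//   int min_val = _mm_extract_epi16(mp, 0);  // 3
//   int min_idx = _mm_extract_epi16(mp, 1);  // 7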
7279
7280
// Compute the sum of absolute differences (SADs) of quadruplets of unsigned
// 8-bit integers in a compared to those in b, and store the 16-bit results in
// dst. Eight SADs are performed using one quadruplet from b and eight
// quadruplets from a. One quadruplet is selected from b starting at the
// offset specified in imm8. Eight quadruplets are formed from sequential 8-bit
// integers selected from a starting at the offset specified in imm8.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8
7287
FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm)
7288
{
7289
uint8x16_t _a, _b;
7290
7291
switch (imm & 0x4) {
7292
case 0:
7293
// do nothing
7294
_a = vreinterpretq_u8_m128i(a);
7295
break;
7296
case 4:
7297
_a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a),
7298
vreinterpretq_u32_m128i(a), 1));
7299
break;
7300
default:
7301
#if defined(__GNUC__) || defined(__clang__)
7302
__builtin_unreachable();
7303
#elif defined(_MSC_VER)
7304
__assume(0);
7305
#endif
7306
break;
7307
}
7308
7309
switch (imm & 0x3) {
7310
case 0:
7311
_b = vreinterpretq_u8_u32(
7312
vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0)));
7313
break;
7314
case 1:
7315
_b = vreinterpretq_u8_u32(
7316
vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1)));
7317
break;
7318
case 2:
7319
_b = vreinterpretq_u8_u32(
7320
vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2)));
7321
break;
7322
case 3:
7323
_b = vreinterpretq_u8_u32(
7324
vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3)));
7325
break;
7326
default:
7327
#if defined(__GNUC__) || defined(__clang__)
7328
__builtin_unreachable();
7329
#elif defined(_MSC_VER)
7330
__assume(0);
7331
#endif
7332
break;
7333
}
7334
7335
int16x8_t c04, c15, c26, c37;
7336
uint8x8_t low_b = vget_low_u8(_b);
7337
c04 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a), low_b));
7338
uint8x16_t _a_1 = vextq_u8(_a, _a, 1);
7339
c15 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_1), low_b));
7340
uint8x16_t _a_2 = vextq_u8(_a, _a, 2);
7341
c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b));
7342
uint8x16_t _a_3 = vextq_u8(_a, _a, 3);
7343
c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b));
7344
#if defined(__aarch64__) || defined(_M_ARM64)
7345
// |0|4|2|6|
7346
c04 = vpaddq_s16(c04, c26);
7347
// |1|5|3|7|
7348
c15 = vpaddq_s16(c15, c37);
7349
7350
int32x4_t trn1_c =
7351
vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
7352
int32x4_t trn2_c =
7353
vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15));
7354
return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c),
7355
vreinterpretq_s16_s32(trn2_c)));
7356
#else
7357
int16x4_t c01, c23, c45, c67;
7358
c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15));
7359
c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37));
7360
c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15));
7361
c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37));
7362
7363
return vreinterpretq_m128i_s16(
7364
vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67)));
7365
#endif
7366
}
7367
7368
// Multiply the low signed 32-bit integers from each packed 64-bit element in
7369
// a and b, and store the signed 64-bit results in dst.
7370
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32
7371
FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b)
7372
{
7373
// vmull_s32 upcasts instead of masking, so we downcast.
7374
int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a));
7375
int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b));
7376
return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo));
7377
}
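
// Usage sketch (illustrative only): _mm_mul_epi32 reads only the low 32-bit
// half of each 64-bit element and produces full 64-bit products, unlike
// _mm_mullo_epi32 below, which keeps just the low 32 bits of every product.
// For example, assuming the _mm_setr_epi32 helper defined earlier in this
// header:
//
//   __m128i a = _mm_setr_epi32(100000, 9, 3, 9);
//   __m128i b = _mm_setr_epi32(100000, 9, -2, 9);
//   __m128i p = _mm_mul_epi32(a, b);
//   // 64-bit lanes of p: {10000000000, -6}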
7378
7379
// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit
7380
// integers, and store the low 32 bits of the intermediate integers in dst.
7381
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32
7382
FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b)
7383
{
7384
return vreinterpretq_m128i_s32(
7385
vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
7386
}
7387
7388
// Convert packed signed 32-bit integers from a and b to packed 16-bit integers
7389
// using unsigned saturation, and store the results in dst.
7390
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32
7391
FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b)
7392
{
7393
return vreinterpretq_m128i_u16(
7394
vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)),
7395
vqmovun_s32(vreinterpretq_s32_m128i(b))));
7396
}
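
// Usage sketch (illustrative only): values outside the unsigned 16-bit range
// saturate rather than wrap. For example, assuming the _mm_setr_epi32 helper
// defined earlier in this header:
//
//   __m128i a = _mm_setr_epi32(-5, 0, 1000, 70000);
//   __m128i b = _mm_setr_epi32(1, 2, 3, 4);
//   __m128i r = _mm_packus_epi32(a, b);
//   // low four 16-bit lanes: {0, 0, 1000, 65535}; high four: {1, 2, 3, 4}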
7397
7398
// Round the packed double-precision (64-bit) floating-point elements in a using
7399
// the rounding parameter, and store the results as packed double-precision
7400
// floating-point elements in dst.
7401
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd
7402
FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding)
7403
{
7404
#if defined(__aarch64__) || defined(_M_ARM64)
7405
switch (rounding) {
7406
case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
7407
return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a)));
7408
case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
7409
return _mm_floor_pd(a);
7410
case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
7411
return _mm_ceil_pd(a);
7412
case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
7413
return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a)));
7414
default: //_MM_FROUND_CUR_DIRECTION
7415
return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)));
7416
}
7417
#else
7418
double *v_double = (double *) &a;
7419
7420
if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
7421
(rounding == _MM_FROUND_CUR_DIRECTION &&
7422
_MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
7423
double res[2], tmp;
7424
for (int i = 0; i < 2; i++) {
7425
tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i];
7426
double roundDown = floor(tmp); // Round down value
7427
double roundUp = ceil(tmp); // Round up value
7428
double diffDown = tmp - roundDown;
7429
double diffUp = roundUp - tmp;
7430
if (diffDown < diffUp) {
7431
/* If it's closer to the round down value, then use it */
7432
res[i] = roundDown;
7433
} else if (diffDown > diffUp) {
7434
/* If it's closer to the round up value, then use it */
7435
res[i] = roundUp;
7436
} else {
7437
/* If it's equidistant between round up and round down value,
7438
* pick the one which is an even number */
7439
double half = roundDown / 2;
7440
if (half != floor(half)) {
7441
/* If the round down value is odd, return the round up value
7442
*/
7443
res[i] = roundUp;
7444
} else {
7445
/* If the round up value is odd, return the round down value
7446
*/
7447
res[i] = roundDown;
7448
}
7449
}
7450
res[i] = (v_double[i] < 0) ? -res[i] : res[i];
7451
}
7452
return _mm_set_pd(res[1], res[0]);
7453
} else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
7454
(rounding == _MM_FROUND_CUR_DIRECTION &&
7455
_MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
7456
return _mm_floor_pd(a);
7457
} else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
7458
(rounding == _MM_FROUND_CUR_DIRECTION &&
7459
_MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
7460
return _mm_ceil_pd(a);
7461
}
7462
return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]),
7463
v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0]));
7464
#endif
7465
}
7466
7467
// Round the packed single-precision (32-bit) floating-point elements in a using
7468
// the rounding parameter, and store the results as packed single-precision
7469
// floating-point elements in dst.
7470
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ps
7471
FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding)
7472
{
7473
#if (defined(__aarch64__) || defined(_M_ARM64)) || \
7474
defined(__ARM_FEATURE_DIRECTED_ROUNDING)
7475
switch (rounding) {
7476
case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC):
7477
return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a)));
7478
case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC):
7479
return _mm_floor_ps(a);
7480
case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC):
7481
return _mm_ceil_ps(a);
7482
case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC):
7483
return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a)));
7484
default: //_MM_FROUND_CUR_DIRECTION
7485
return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a)));
7486
}
7487
#else
7488
float *v_float = (float *) &a;
7489
7490
if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) ||
7491
(rounding == _MM_FROUND_CUR_DIRECTION &&
7492
_MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) {
7493
uint32x4_t signmask = vdupq_n_u32(0x80000000);
7494
float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a),
7495
vdupq_n_f32(0.5f)); /* +/- 0.5 */
7496
int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32(
7497
vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/
7498
int32x4_t r_trunc = vcvtq_s32_f32(
7499
vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */
7500
int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32(
7501
vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
7502
int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
7503
vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
7504
float32x4_t delta = vsubq_f32(
7505
vreinterpretq_f32_m128(a),
7506
vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
7507
uint32x4_t is_delta_half =
7508
vceqq_f32(delta, half); /* delta == +/- 0.5 */
7509
return vreinterpretq_m128_f32(
7510
vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal)));
7511
} else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) ||
7512
(rounding == _MM_FROUND_CUR_DIRECTION &&
7513
_MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) {
7514
return _mm_floor_ps(a);
7515
} else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) ||
7516
(rounding == _MM_FROUND_CUR_DIRECTION &&
7517
_MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) {
7518
return _mm_ceil_ps(a);
7519
}
7520
return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]),
7521
v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]),
7522
v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]),
7523
v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0]));
7524
#endif
7525
}
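
// Usage sketch (illustrative only): the rounding constants combine with
// _MM_FROUND_NO_EXC just as on x86. For example, assuming the _mm_setr_ps
// helper defined earlier in this header:
//
//   __m128 v = _mm_setr_ps(1.25f, 2.5f, -2.5f, 3.75f);
//   __m128 n = _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
//   // n = {1.0f, 2.0f, -2.0f, 4.0f}  (ties round to even)
//   __m128 t = _mm_round_ps(v, _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC);
//   // t = {1.0f, 2.0f, -2.0f, 3.0f}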
7526
7527
// Round the lower double-precision (64-bit) floating-point element in b using
7528
// the rounding parameter, store the result as a double-precision floating-point
7529
// element in the lower element of dst, and copy the upper element from a to the
7530
// upper element of dst.
7531
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd
7532
FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding)
7533
{
7534
return _mm_move_sd(a, _mm_round_pd(b, rounding));
7535
}
7536
7537
// Round the lower single-precision (32-bit) floating-point element in b using
7538
// the rounding parameter, store the result as a single-precision floating-point
7539
// element in the lower element of dst, and copy the upper 3 packed elements
7540
// from a to the upper elements of dst. Rounding is done according to the
7541
// rounding[3:0] parameter, which can be one of:
7542
// (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and
7543
// suppress exceptions
7544
// (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and
7545
// suppress exceptions
7546
// (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress
7547
// exceptions
7548
// (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress
7549
// exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see
7550
// _MM_SET_ROUNDING_MODE
7551
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss
7552
FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding)
7553
{
7554
return _mm_move_ss(a, _mm_round_ps(b, rounding));
7555
}
7556
7557
// Load 128-bits of integer data from memory into dst using a non-temporal
7558
// memory hint. mem_addr must be aligned on a 16-byte boundary or a
7559
// general-protection exception may be generated.
7560
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128
7561
FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p)
7562
{
7563
#if __has_builtin(__builtin_nontemporal_store)
7564
return __builtin_nontemporal_load(p);
7565
#else
7566
return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p));
7567
#endif
7568
}
7569
7570
// Compute the bitwise NOT of a and then AND with a 128-bit vector containing
7571
// all 1's, and return 1 if the result is zero, otherwise return 0.
7572
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones
7573
FORCE_INLINE int _mm_test_all_ones(__m128i a)
7574
{
7575
return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) ==
7576
~(uint64_t) 0;
7577
}
7578
7579
// Compute the bitwise AND of 128 bits (representing integer data) in a and
7580
// mask, and return 1 if the result is zero, otherwise return 0.
7581
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros
7582
FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask)
7583
{
7584
int64x2_t a_and_mask =
7585
vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask));
7586
return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1));
7587
}
7588
7589
// Compute the bitwise AND of 128 bits (representing integer data) in a and
7590
// mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute
7591
// the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is
7592
// zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
7593
// otherwise return 0.
7594
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_mix_ones_zeros
7595
FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask)
7596
{
7597
uint64x2_t zf =
7598
vandq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
7599
uint64x2_t cf =
7600
vbicq_u64(vreinterpretq_u64_m128i(mask), vreinterpretq_u64_m128i(a));
7601
uint64x2_t result = vandq_u64(zf, cf);
7602
return !(vgetq_lane_u64(result, 0) | vgetq_lane_u64(result, 1));
7603
}
7604
7605
// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
7606
// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
7607
// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
7608
// otherwise set CF to 0. Return the CF value.
7609
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128
7610
FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b)
7611
{
7612
int64x2_t s64 =
7613
vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a));
7614
return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
7615
}
7616
7617
// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
7618
// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
7619
// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
7620
// otherwise set CF to 0. Return 1 if both the ZF and CF values are zero,
7621
// otherwise return 0.
7622
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128
7623
#define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b)
7624
7625
// Compute the bitwise AND of 128 bits (representing integer data) in a and b,
7626
// and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the
7627
// bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero,
7628
// otherwise set CF to 0. Return the ZF value.
7629
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128
7630
FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b)
7631
{
7632
int64x2_t s64 =
7633
vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b));
7634
return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1));
7635
}
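
// Usage sketch (illustrative only): _mm_testz_si128 is the vector analogue of
// "(a & b) == 0". For example, assuming the _mm_set1_epi32 helper defined
// earlier in this header:
//
//   __m128i flags = _mm_set1_epi32(0x2);
//   __m128i probe = _mm_set1_epi32(0x1);
//   int none_set  = _mm_testz_si128(flags, probe);  // 1: no common bits set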
7636
7637
/* SSE4.2 */
7638
7639
static const uint16_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask16b[8] = {
7640
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
7641
};
7642
static const uint8_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask8b[16] = {
7643
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
7644
0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
7645
};
7646
7647
/* specify the source data format */
7648
#define _SIDD_UBYTE_OPS 0x00 /* unsigned 8-bit characters */
7649
#define _SIDD_UWORD_OPS 0x01 /* unsigned 16-bit characters */
7650
#define _SIDD_SBYTE_OPS 0x02 /* signed 8-bit characters */
7651
#define _SIDD_SWORD_OPS 0x03 /* signed 16-bit characters */
7652
7653
/* specify the comparison operation */
7654
#define _SIDD_CMP_EQUAL_ANY 0x00 /* compare equal any: strchr */
7655
#define _SIDD_CMP_RANGES 0x04 /* compare ranges */
7656
#define _SIDD_CMP_EQUAL_EACH 0x08 /* compare equal each: strcmp */
7657
#define _SIDD_CMP_EQUAL_ORDERED 0x0C /* compare equal ordered */
7658
7659
/* specify the polarity */
7660
#define _SIDD_POSITIVE_POLARITY 0x00
7661
#define _SIDD_MASKED_POSITIVE_POLARITY 0x20
7662
#define _SIDD_NEGATIVE_POLARITY 0x10 /* negate results */
7663
#define _SIDD_MASKED_NEGATIVE_POLARITY \
7664
0x30 /* negate results only before end of string */
7665
7666
/* specify the output selection in _mm_cmpXstri */
7667
#define _SIDD_LEAST_SIGNIFICANT 0x00
7668
#define _SIDD_MOST_SIGNIFICANT 0x40
7669
7670
/* specify the output selection in _mm_cmpXstrm */
7671
#define _SIDD_BIT_MASK 0x00
7672
#define _SIDD_UNIT_MASK 0x40
7673

/* Pattern Matching for C macros.
 * https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms
 */

/* catenate */
#define SSE2NEON_PRIMITIVE_CAT(a, ...) a##__VA_ARGS__
#define SSE2NEON_CAT(a, b) SSE2NEON_PRIMITIVE_CAT(a, b)

#define SSE2NEON_IIF(c) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_IIF_, c)
/* run the 2nd parameter */
#define SSE2NEON_IIF_0(t, ...) __VA_ARGS__
/* run the 1st parameter */
#define SSE2NEON_IIF_1(t, ...) t

#define SSE2NEON_COMPL(b) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_COMPL_, b)
#define SSE2NEON_COMPL_0 1
#define SSE2NEON_COMPL_1 0

#define SSE2NEON_DEC(x) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_DEC_, x)
#define SSE2NEON_DEC_1 0
#define SSE2NEON_DEC_2 1
#define SSE2NEON_DEC_3 2
#define SSE2NEON_DEC_4 3
#define SSE2NEON_DEC_5 4
#define SSE2NEON_DEC_6 5
#define SSE2NEON_DEC_7 6
#define SSE2NEON_DEC_8 7
#define SSE2NEON_DEC_9 8
#define SSE2NEON_DEC_10 9
#define SSE2NEON_DEC_11 10
#define SSE2NEON_DEC_12 11
#define SSE2NEON_DEC_13 12
#define SSE2NEON_DEC_14 13
#define SSE2NEON_DEC_15 14
#define SSE2NEON_DEC_16 15

/* detection */
#define SSE2NEON_CHECK_N(x, n, ...) n
#define SSE2NEON_CHECK(...) SSE2NEON_CHECK_N(__VA_ARGS__, 0, )
#define SSE2NEON_PROBE(x) x, 1,

#define SSE2NEON_NOT(x) SSE2NEON_CHECK(SSE2NEON_PRIMITIVE_CAT(SSE2NEON_NOT_, x))
#define SSE2NEON_NOT_0 SSE2NEON_PROBE(~)

#define SSE2NEON_BOOL(x) SSE2NEON_COMPL(SSE2NEON_NOT(x))
#define SSE2NEON_IF(c) SSE2NEON_IIF(SSE2NEON_BOOL(c))

#define SSE2NEON_EAT(...)
#define SSE2NEON_EXPAND(...) __VA_ARGS__
#define SSE2NEON_WHEN(c) SSE2NEON_IF(c)(SSE2NEON_EXPAND, SSE2NEON_EAT)

/* recursion */
/* deferred expression */
#define SSE2NEON_EMPTY()
#define SSE2NEON_DEFER(id) id SSE2NEON_EMPTY()
#define SSE2NEON_OBSTRUCT(...) __VA_ARGS__ SSE2NEON_DEFER(SSE2NEON_EMPTY)()
#define SSE2NEON_EXPAND(...) __VA_ARGS__

#define SSE2NEON_EVAL(...) \
    SSE2NEON_EVAL1(SSE2NEON_EVAL1(SSE2NEON_EVAL1(__VA_ARGS__)))
#define SSE2NEON_EVAL1(...) \
    SSE2NEON_EVAL2(SSE2NEON_EVAL2(SSE2NEON_EVAL2(__VA_ARGS__)))
#define SSE2NEON_EVAL2(...) \
    SSE2NEON_EVAL3(SSE2NEON_EVAL3(SSE2NEON_EVAL3(__VA_ARGS__)))
#define SSE2NEON_EVAL3(...) __VA_ARGS__

#define SSE2NEON_REPEAT(count, macro, ...) \
    SSE2NEON_WHEN(count) \
    (SSE2NEON_OBSTRUCT(SSE2NEON_REPEAT_INDIRECT)()( \
        SSE2NEON_DEC(count), macro, \
        __VA_ARGS__) SSE2NEON_OBSTRUCT(macro)(SSE2NEON_DEC(count), \
                                              __VA_ARGS__))
#define SSE2NEON_REPEAT_INDIRECT() SSE2NEON_REPEAT
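
/* A rough sketch of how the machinery above is used: wrapping SSE2NEON_REPEAT
 * in SSE2NEON_EVAL forces the deferred recursion to expand, so
 *
 *   SSE2NEON_EVAL(SSE2NEON_REPEAT(3, SSE2NEON_FILL_LANE, u8))
 *
 * expands (conceptually) to
 *
 *   SSE2NEON_FILL_LANE(0, u8) SSE2NEON_FILL_LANE(1, u8) SSE2NEON_FILL_LANE(2, u8)
 *
 * i.e. the macro body repeated once per lane, with indices 0 .. count-1.
 */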

#define SSE2NEON_SIZE_OF_byte 8
#define SSE2NEON_NUMBER_OF_LANES_byte 16
#define SSE2NEON_SIZE_OF_word 16
#define SSE2NEON_NUMBER_OF_LANES_word 8

#define SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE(i, type) \
    mtx[i] = vreinterpretq_m128i_##type(vceqq_##type( \
        vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)), \
        vreinterpretq_##type##_m128i(a)));

#define SSE2NEON_FILL_LANE(i, type) \
    vec_b[i] = \
        vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i));

#define PCMPSTR_RANGES(a, b, mtx, data_type_prefix, type_prefix, size, \
                       number_of_lanes, byte_or_word) \
    do { \
        SSE2NEON_CAT( \
            data_type_prefix, \
            SSE2NEON_CAT(size, \
                         SSE2NEON_CAT(x, SSE2NEON_CAT(number_of_lanes, _t)))) \
        vec_b[number_of_lanes]; \
        __m128i mask = SSE2NEON_IIF(byte_or_word)( \
            vreinterpretq_m128i_u16(vdupq_n_u16(0xff)), \
            vreinterpretq_m128i_u32(vdupq_n_u32(0xffff))); \
        SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, SSE2NEON_FILL_LANE, \
                                      SSE2NEON_CAT(type_prefix, size))) \
        for (int i = 0; i < number_of_lanes; i++) { \
            mtx[i] = SSE2NEON_CAT(vreinterpretq_m128i_u, \
                                  size)(SSE2NEON_CAT(vbslq_u, size)( \
                SSE2NEON_CAT(vreinterpretq_u, \
                             SSE2NEON_CAT(size, _m128i))(mask), \
                SSE2NEON_CAT(vcgeq_, SSE2NEON_CAT(type_prefix, size))( \
                    vec_b[i], \
                    SSE2NEON_CAT( \
                        vreinterpretq_, \
                        SSE2NEON_CAT(type_prefix, \
                                     SSE2NEON_CAT(size, _m128i(a))))), \
                SSE2NEON_CAT(vcleq_, SSE2NEON_CAT(type_prefix, size))( \
                    vec_b[i], \
                    SSE2NEON_CAT( \
                        vreinterpretq_, \
                        SSE2NEON_CAT(type_prefix, \
                                     SSE2NEON_CAT(size, _m128i(a))))))); \
        } \
    } while (0)

#define PCMPSTR_EQ(a, b, mtx, size, number_of_lanes) \
    do { \
        SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, \
                                      SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE, \
                                      SSE2NEON_CAT(u, size))) \
    } while (0)

#define SSE2NEON_CMP_EQUAL_ANY_IMPL(type) \
    static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \
                                                int lb) \
    { \
        __m128i mtx[16]; \
        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \
        return SSE2NEON_CAT( \
            _sse2neon_aggregate_equal_any_, \
            SSE2NEON_CAT( \
                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \
                                             type))))(la, lb, mtx); \
    }

#define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word) \
    static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \
                                                 int lb) \
    { \
        __m128i mtx[16]; \
        PCMPSTR_RANGES( \
            a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word); \
        return SSE2NEON_CAT( \
            _sse2neon_aggregate_ranges_, \
            SSE2NEON_CAT( \
                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
                SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \
                                             type))))(la, lb, mtx); \
    }

#define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type) \
    static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la, \
                                                    __m128i b, int lb) \
    { \
        __m128i mtx[16]; \
        PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
                   SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \
        return SSE2NEON_CAT( \
            _sse2neon_aggregate_equal_ordered_, \
            SSE2NEON_CAT( \
                SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \
                SSE2NEON_CAT(x, \
                             SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type))))( \
            SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx); \
    }

static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16])
{
    int res = 0;
    int m = (1 << la) - 1;
    uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
    uint8x16_t vec = vcombine_u8(t_lo, t_hi);
    for (int j = 0; j < lb; j++) {
        mtx[j] = vreinterpretq_m128i_u8(
            vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
        mtx[j] = vreinterpretq_m128i_u8(
            vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
        int tmp = _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0;
        res |= (tmp << j);
    }
    return res;
}

static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16])
{
    int res = 0;
    int m = (1 << la) - 1;
    uint16x8_t vec =
        vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
    for (int j = 0; j < lb; j++) {
        mtx[j] = vreinterpretq_m128i_u16(
            vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
        mtx[j] = vreinterpretq_m128i_u16(
            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
        int tmp = _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 1 : 0;
        res |= (tmp << j);
    }
    return res;
}

/* clang-format off */
#define SSE2NEON_GENERATE_CMP_EQUAL_ANY(prefix) \
    prefix##IMPL(byte) \
    prefix##IMPL(word)
/* clang-format on */

SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_)
static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16])
{
    int res = 0;
    int m = (1 << la) - 1;
    uint16x8_t vec =
        vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b));
    for (int j = 0; j < lb; j++) {
        mtx[j] = vreinterpretq_m128i_u16(
            vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j])));
        mtx[j] = vreinterpretq_m128i_u16(
            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15));
        __m128i tmp = vreinterpretq_m128i_u32(
            vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16));
        uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]),
                                       vreinterpretq_u32_m128i(tmp));
#if defined(__aarch64__) || defined(_M_ARM64)
        int t = vaddvq_u32(vec_res) ? 1 : 0;
#else
        uint64x2_t sumh = vpaddlq_u32(vec_res);
        int t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1);
#endif
        res |= (t << j);
    }
    return res;
}

static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16])
{
    int res = 0;
    int m = (1 << la) - 1;
    uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
    uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask);
    uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask);
    uint8x16_t vec = vcombine_u8(t_lo, t_hi);
    for (int j = 0; j < lb; j++) {
        mtx[j] = vreinterpretq_m128i_u8(
            vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j])));
        mtx[j] = vreinterpretq_m128i_u8(
            vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7));
        __m128i tmp = vreinterpretq_m128i_u16(
            vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8));
        uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]),
                                       vreinterpretq_u16_m128i(tmp));
        int t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0;
        res |= (t << j);
    }
    return res;
}

#define SSE2NEON_CMP_RANGES_IS_BYTE 1
#define SSE2NEON_CMP_RANGES_IS_WORD 0

/* clang-format off */
#define SSE2NEON_GENERATE_CMP_RANGES(prefix) \
    prefix##IMPL(byte, uint, u, prefix##IS_BYTE) \
    prefix##IMPL(byte, int, s, prefix##IS_BYTE) \
    prefix##IMPL(word, uint, u, prefix##IS_WORD) \
    prefix##IMPL(word, int, s, prefix##IS_WORD)
/* clang-format on */

SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_)

#undef SSE2NEON_CMP_RANGES_IS_BYTE
#undef SSE2NEON_CMP_RANGES_IS_WORD

static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb)
{
    uint8x16_t mtx =
        vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b));
    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
    int m1 = 0x10000 - (1 << la);
    int tb = 0x10000 - (1 << lb);
    uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi;
    uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi;
    vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b);
    vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask);
    vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask);
    vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask);
    vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask);
    tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask);
    tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask);

    res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx));
    res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx));
    res_lo = vbsl_u8(vec1_lo, tmp_lo, res_lo);
    res_hi = vbsl_u8(vec1_hi, tmp_hi, res_hi);
    res_lo = vand_u8(res_lo, vec_mask);
    res_hi = vand_u8(res_hi, vec_mask);

    int res = _sse2neon_vaddv_u8(res_lo) + (_sse2neon_vaddv_u8(res_hi) << 8);
    return res;
}

static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb)
{
    uint16x8_t mtx =
        vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b));
    int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb));
    int m1 = 0x100 - (1 << la);
    int tb = 0x100 - (1 << lb);
    uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b);
    uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask);
    uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask);
    uint16x8_t tmp = vtstq_u16(vdupq_n_u16(tb), vec_mask);
    mtx = vbslq_u16(vec0, vdupq_n_u16(0), mtx);
    mtx = vbslq_u16(vec1, tmp, mtx);
    mtx = vandq_u16(mtx, vec_mask);
    return _sse2neon_vaddvq_u16(mtx);
}

#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE 1
#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0

#define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type) \
    static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes( \
        int bound, int la, int lb, __m128i mtx[16]) \
    { \
        int res = 0; \
        int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la); \
        uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)( \
            vld1_u##size(_sse2neon_cmpestr_mask##size##b), \
            vld1q_u##size(_sse2neon_cmpestr_mask##size##b)); \
        uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)( \
            vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask), \
                             vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \
            vtstq_u##size(vdupq_n_u##size(m1), vec_mask)); \
        uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \
        uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0); \
        for (int j = 0; j < lb; j++) { \
            mtx[j] = vreinterpretq_m128i_u##size(vbslq_u##size( \
                vec1, vec_minusone, vreinterpretq_u##size##_m128i(mtx[j]))); \
        } \
        for (int j = lb; j < bound; j++) { \
            mtx[j] = vreinterpretq_m128i_u##size( \
                vbslq_u##size(vec1, vec_minusone, vec_zero)); \
        } \
        unsigned SSE2NEON_IIF(data_type)(char, short) *ptr = \
            (unsigned SSE2NEON_IIF(data_type)(char, short) *) mtx; \
        for (int i = 0; i < bound; i++) { \
            int val = 1; \
            for (int j = 0, k = i; j < bound - i && k < bound; j++, k++) \
                val &= ptr[k * bound + j]; \
            res += val << i; \
        } \
        return res; \
    }

/* clang-format off */
#define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix) \
    prefix##IMPL(8, 16, prefix##IS_UBYTE) \
    prefix##IMPL(16, 8, prefix##IS_UWORD)
/* clang-format on */

SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(SSE2NEON_AGGREGATE_EQUAL_ORDER_)

#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE
#undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD

/* clang-format off */
#define SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(prefix) \
    prefix##IMPL(byte) \
    prefix##IMPL(word)
/* clang-format on */

SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(SSE2NEON_CMP_EQUAL_ORDERED_)
#define SSE2NEON_CMPESTR_LIST \
    _(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any) \
    _(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any) \
    _(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any) \
    _(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any) \
    _(CMP_UBYTE_RANGES, cmp_ubyte_ranges) \
    _(CMP_UWORD_RANGES, cmp_uword_ranges) \
    _(CMP_SBYTE_RANGES, cmp_sbyte_ranges) \
    _(CMP_SWORD_RANGES, cmp_sword_ranges) \
    _(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each) \
    _(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each) \
    _(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each) \
    _(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each) \
    _(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
    _(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \
    _(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \
    _(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered)

enum {
#define _(name, func_suffix) name,
    SSE2NEON_CMPESTR_LIST
#undef _
};
typedef int (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb);
static cmpestr_func_t _sse2neon_cmpfunc_table[] = {
#define _(name, func_suffix) _sse2neon_##func_suffix,
    SSE2NEON_CMPESTR_LIST
#undef _
};

FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound)
{
    switch (imm8 & 0x30) {
    case _SIDD_NEGATIVE_POLARITY:
        res ^= 0xffffffff;
        break;
    case _SIDD_MASKED_NEGATIVE_POLARITY:
        res ^= (1 << lb) - 1;
        break;
    default:
        break;
    }

    return res & ((bound == 8) ? 0xFF : 0xFFFF);
}
FORCE_INLINE int _sse2neon_clz(unsigned int x)
{
#ifdef _MSC_VER
    unsigned long cnt = 0;
    if (_BitScanReverse(&cnt, x))
        return 31 - cnt;
    return 32;
#else
    return x != 0 ? __builtin_clz(x) : 32;
#endif
}

FORCE_INLINE int _sse2neon_ctz(unsigned int x)
{
#ifdef _MSC_VER
    unsigned long cnt = 0;
    if (_BitScanForward(&cnt, x))
        return cnt;
    return 32;
#else
    return x != 0 ? __builtin_ctz(x) : 32;
#endif
}

FORCE_INLINE int _sse2neon_ctzll(unsigned long long x)
{
#ifdef _MSC_VER
    unsigned long cnt;
#if defined(SSE2NEON_HAS_BITSCAN64)
    if (_BitScanForward64(&cnt, x))
        return (int) (cnt);
#else
    if (_BitScanForward(&cnt, (unsigned long) (x)))
        return (int) cnt;
    if (_BitScanForward(&cnt, (unsigned long) (x >> 32)))
        return (int) (cnt + 32);
#endif /* SSE2NEON_HAS_BITSCAN64 */
    return 64;
#else /* assume GNU compatible compilers */
    return x != 0 ? __builtin_ctzll(x) : 64;
#endif
}

#define SSE2NEON_MIN(x, y) (x) < (y) ? (x) : (y)

#define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \
    const int var = (imm & 0x01) ? 8 : 16
#define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \
    int tmp1 = la ^ (la >> 31); \
    la = tmp1 - (la >> 31); \
    int tmp2 = lb ^ (lb >> 31); \
    lb = tmp2 - (lb >> 31); \
    la = SSE2NEON_MIN(la, bound); \
    lb = SSE2NEON_MIN(lb, bound)

// Compare all pairs of characters in strings a and b,
// then aggregate the result.
// Since the only difference between PCMPESTR* and PCMPISTR* is how the string
// lengths are determined, we use SSE2NEON_CMP{E,I}STRX_LEN_PAIR to obtain the
// lengths of strings a and b.
#define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE) \
    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); \
    SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb); \
    int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \
    r2 = _sse2neon_sido_negative(r2, lb, imm8, bound)

#define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8) \
    return (r2 == 0) ? bound \
                     : ((imm8 & 0x40) ? (31 - _sse2neon_clz(r2)) \
                                      : _sse2neon_ctz(r2))
#define SSE2NEON_CMPSTR_GENERATE_MASK(dst) \
    __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \
    if (imm8 & 0x40) { \
        if (bound == 8) { \
            uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2), \
                                       vld1q_u16(_sse2neon_cmpestr_mask16b)); \
            dst = vreinterpretq_m128i_u16(vbslq_u16( \
                tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst))); \
        } else { \
            uint8x16_t vec_r2 = \
                vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8)); \
            uint8x16_t tmp = \
                vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b)); \
            dst = vreinterpretq_m128i_u8( \
                vbslq_u8(tmp, vdupq_n_u8(-1), vreinterpretq_u8_m128i(dst))); \
        } \
    } else { \
        if (bound == 16) { \
            dst = vreinterpretq_m128i_u16( \
                vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \
        } else { \
            dst = vreinterpretq_m128i_u8( \
                vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0)); \
        } \
    } \
    return dst
// Compare packed strings in a and b with lengths la and lb using the control
// in imm8, and returns 1 if b did not contain a null character and the
// resulting mask was zero, and 0 otherwise.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestra
FORCE_INLINE int _mm_cmpestra(__m128i a,
                              int la,
                              __m128i b,
                              int lb,
                              const int imm8)
{
    int lb_cpy = lb;
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
    return !r2 & (lb_cpy > bound);
}

// Compare packed strings in a and b with lengths la and lb using the control in
// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrc
FORCE_INLINE int _mm_cmpestrc(__m128i a,
                              int la,
                              __m128i b,
                              int lb,
                              const int imm8)
{
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
    return r2 != 0;
}

// Compare packed strings in a and b with lengths la and lb using the control
// in imm8, and store the generated index in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri
FORCE_INLINE int _mm_cmpestri(__m128i a,
                              int la,
                              __m128i b,
                              int lb,
                              const int imm8)
{
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
    SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);
}
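
/* Usage sketch for _mm_cmpestri (illustrative; the buffers below are
 * hypothetical, not defined by this header):
 *
 *   const char hay[16] = "hello, world";   // zero-padded to 16 bytes
 *   const char set[16] = "o";
 *   __m128i a = _mm_loadu_si128((const __m128i *) set);
 *   __m128i b = _mm_loadu_si128((const __m128i *) hay);
 *   // index of the first 'o' within the first 12 bytes of hay
 *   int idx = _mm_cmpestri(a, 1, b, 12,
 *                          _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY);
 *   // idx == 4 here; a result equal to the upper bound (16 for bytes,
 *   // 8 for words) means no match within the explicit lengths
 */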

// Compare packed strings in a and b with lengths la and lb using the control
// in imm8, and store the generated mask in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm
FORCE_INLINE __m128i
_mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8)
{
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
    SSE2NEON_CMPSTR_GENERATE_MASK(dst);
}

// Compare packed strings in a and b with lengths la and lb using the control in
// imm8, and returns bit 0 of the resulting bit mask.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestro
FORCE_INLINE int _mm_cmpestro(__m128i a,
                              int la,
                              __m128i b,
                              int lb,
                              const int imm8)
{
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX);
    return r2 & 1;
}

// Compare packed strings in a and b with lengths la and lb using the control in
// imm8, and returns 1 if any character in a was null, and 0 otherwise.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrs
FORCE_INLINE int _mm_cmpestrs(__m128i a,
                              int la,
                              __m128i b,
                              int lb,
                              const int imm8)
{
    (void) a;
    (void) b;
    (void) lb;
    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
    return la <= (bound - 1);
}

// Compare packed strings in a and b with lengths la and lb using the control in
// imm8, and returns 1 if any character in b was null, and 0 otherwise.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrz
FORCE_INLINE int _mm_cmpestrz(__m128i a,
                              int la,
                              __m128i b,
                              int lb,
                              const int imm8)
{
    (void) a;
    (void) b;
    (void) la;
    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
    return lb <= (bound - 1);
}

#define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8) \
    do { \
        if (imm8 & 0x01) { \
            uint16x8_t equal_mask_##str = \
                vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \
            uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \
            uint64_t matches_##str = \
                vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \
            len = _sse2neon_ctzll(matches_##str) >> 3; \
        } else { \
            uint16x8_t equal_mask_##str = vreinterpretq_u16_u8( \
                vceqq_u8(vreinterpretq_u8_m128i(str), vdupq_n_u8(0))); \
            uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \
            uint64_t matches_##str = \
                vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \
            len = _sse2neon_ctzll(matches_##str) >> 2; \
        } \
    } while (0)
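
/* The macro above derives the implicit string length with a common NEON
 * narrowing-shift trick: vceqq_* produces an all-ones lane per zero element,
 * vshrn_n_u16(..., 4) packs those lanes into a 64-bit value carrying 4 bits
 * per element, and the count of trailing zero bits divided by 4 (>> 2) for
 * bytes, or by 8 (>> 3) for words, is the index of the first NUL. A rough
 * scalar equivalent for byte data, shown only for illustration:
 *
 *   static inline int example_strlen16(const uint8_t *p)
 *   {
 *       int len = 0;
 *       while (len < 16 && p[len] != 0)   // stop at the first NUL, cap at 16
 *           len++;
 *       return len;
 *   }
 */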

#define SSE2NEON_CMPISTRX_LEN_PAIR(a, b, la, lb) \
    int la, lb; \
    do { \
        SSE2NEON_CMPISTRX_LENGTH(a, la, imm8); \
        SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8); \
    } while (0)

// Compare packed strings with implicit lengths in a and b using the control in
// imm8, and returns 1 if b did not contain a null character and the resulting
// mask was zero, and 0 otherwise.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistra
FORCE_INLINE int _mm_cmpistra(__m128i a, __m128i b, const int imm8)
{
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
    return !r2 & (lb >= bound);
}

// Compare packed strings with implicit lengths in a and b using the control in
// imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrc
FORCE_INLINE int _mm_cmpistrc(__m128i a, __m128i b, const int imm8)
{
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
    return r2 != 0;
}

// Compare packed strings with implicit lengths in a and b using the control in
// imm8, and store the generated index in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistri
FORCE_INLINE int _mm_cmpistri(__m128i a, __m128i b, const int imm8)
{
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
    SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8);
}

// Compare packed strings with implicit lengths in a and b using the control in
// imm8, and store the generated mask in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrm
FORCE_INLINE __m128i _mm_cmpistrm(__m128i a, __m128i b, const int imm8)
{
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
    SSE2NEON_CMPSTR_GENERATE_MASK(dst);
}

// Compare packed strings with implicit lengths in a and b using the control in
// imm8, and returns bit 0 of the resulting bit mask.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistro
FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8)
{
    SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX);
    return r2 & 1;
}

// Compare packed strings with implicit lengths in a and b using the control in
// imm8, and returns 1 if any character in a was null, and 0 otherwise.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrs
FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8)
{
    (void) b;
    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
    int la;
    SSE2NEON_CMPISTRX_LENGTH(a, la, imm8);
    return la <= (bound - 1);
}

// Compare packed strings with implicit lengths in a and b using the control in
// imm8, and returns 1 if any character in b was null, and 0 otherwise.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrz
FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8)
{
    (void) a;
    SSE2NEON_CMPSTR_SET_UPPER(bound, imm8);
    int lb;
    SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8);
    return lb <= (bound - 1);
}

// Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers
// in b for greater than.
FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128i_u64(
        vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)));
#else
    return vreinterpretq_m128i_s64(vshrq_n_s64(
        vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)),
        63));
#endif
}
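
/* A note on the ARMv7-A fallback above: vqsubq_s64(b, a) computes b - a with
 * saturation, so the sign bit stays correct even when the exact difference
 * would overflow 64 bits; spreading that sign bit across the lane with
 * vshrq_n_s64(..., 63) then yields all-ones exactly where a > b, i.e. the
 * same mask vcgtq_s64 produces on AArch64. For example (illustrative), with
 * a = {INT64_MAX, 0} and b = {INT64_MIN, 1} the result lanes are {~0, 0}.
 */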
// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 16-bit integer v, and stores the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u16
FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));
#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
    (defined(_M_ARM64) && !defined(__clang__))
    crc = __crc32ch(crc, v);
#else
    crc = _mm_crc32_u8(crc, v & 0xff);
    crc = _mm_crc32_u8(crc, (v >> 8) & 0xff);
#endif
    return crc;
}

// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 32-bit integer v, and stores the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u32
FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));
#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
    (defined(_M_ARM64) && !defined(__clang__))
    crc = __crc32cw(crc, v);
#else
    crc = _mm_crc32_u16(crc, v & 0xffff);
    crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff);
#endif
    return crc;
}

// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 64-bit integer v, and stores the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u64
FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));
#elif (defined(_M_ARM64) && !defined(__clang__))
    crc = __crc32cd((uint32_t) crc, v);
#else
    crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff);
    crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff);
#endif
    return crc;
}

// Starting with the initial value in crc, accumulates a CRC32 value for
// unsigned 8-bit integer v, and stores the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u8
FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v)
{
#if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32)
    __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t"
                         : [c] "+r"(crc)
                         : [v] "r"(v));
#elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \
    (defined(_M_ARM64) && !defined(__clang__))
    crc = __crc32cb(crc, v);
#else
    crc ^= v;
    for (int bit = 0; bit < 8; bit++) {
        if (crc & 1)
            crc = (crc >> 1) ^ UINT32_C(0x82f63b78);
        else
            crc = (crc >> 1);
    }
#endif
    return crc;
}
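
/* Usage sketch: the intrinsics above chain together to checksum a buffer with
 * CRC-32C (Castagnoli, reflected polynomial 0x82f63b78). The helper below is
 * illustrative only; its name and structure are an assumption, not part of
 * the SSE API being emulated:
 *
 *   static inline uint32_t example_crc32c(const uint8_t *p, size_t n)
 *   {
 *       uint32_t crc = 0xFFFFFFFF;
 *       while (n >= 8) {
 *           uint64_t v;
 *           memcpy(&v, p, 8);   // avoid unaligned access issues
 *           crc = (uint32_t) _mm_crc32_u64(crc, v);
 *           p += 8;
 *           n -= 8;
 *       }
 *       while (n--)
 *           crc = _mm_crc32_u8(crc, *p++);
 *       return crc ^ 0xFFFFFFFF;
 *   }
 */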

/* AES */

#if !defined(__ARM_FEATURE_CRYPTO) && (!defined(_M_ARM64) || defined(__clang__))
/* clang-format off */
#define SSE2NEON_AES_SBOX(w) \
    { \
        w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \
        w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \
        w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \
        w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \
        w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \
        w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \
        w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \
        w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \
        w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \
        w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \
        w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \
        w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \
        w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \
        w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \
        w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \
        w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \
        w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \
        w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \
        w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \
        w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \
        w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \
        w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \
        w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \
        w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \
        w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \
        w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \
        w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \
        w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \
        w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \
        w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \
        w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \
        w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \
        w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \
        w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \
        w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \
        w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \
        w(0xb0), w(0x54), w(0xbb), w(0x16) \
    }
#define SSE2NEON_AES_RSBOX(w) \
    { \
        w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), \
        w(0x38), w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), \
        w(0xd7), w(0xfb), w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), \
        w(0x2f), w(0xff), w(0x87), w(0x34), w(0x8e), w(0x43), w(0x44), \
        w(0xc4), w(0xde), w(0xe9), w(0xcb), w(0x54), w(0x7b), w(0x94), \
        w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d), w(0xee), w(0x4c), \
        w(0x95), w(0x0b), w(0x42), w(0xfa), w(0xc3), w(0x4e), w(0x08), \
        w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2), \
        w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), \
        w(0x25), w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), \
        w(0x98), w(0x16), w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), \
        w(0x65), w(0xb6), w(0x92), w(0x6c), w(0x70), w(0x48), w(0x50), \
        w(0xfd), w(0xed), w(0xb9), w(0xda), w(0x5e), w(0x15), w(0x46), \
        w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84), w(0x90), w(0xd8), \
        w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a), w(0xf7), \
        w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06), \
        w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), \
        w(0x02), w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), \
        w(0x8a), w(0x6b), w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), \
        w(0x67), w(0xdc), w(0xea), w(0x97), w(0xf2), w(0xcf), w(0xce), \
        w(0xf0), w(0xb4), w(0xe6), w(0x73), w(0x96), w(0xac), w(0x74), \
        w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85), w(0xe2), w(0xf9), \
        w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e), w(0x47), \
        w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89), \
        w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), \
        w(0x1b), w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), \
        w(0x79), w(0x20), w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), \
        w(0xcd), w(0x5a), w(0xf4), w(0x1f), w(0xdd), w(0xa8), w(0x33), \
        w(0x88), w(0x07), w(0xc7), w(0x31), w(0xb1), w(0x12), w(0x10), \
        w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f), w(0x60), w(0x51), \
        w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d), w(0x2d), \
        w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef), \
        w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), \
        w(0xb0), w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), \
        w(0x99), w(0x61), w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), \
        w(0x77), w(0xd6), w(0x26), w(0xe1), w(0x69), w(0x14), w(0x63), \
        w(0x55), w(0x21), w(0x0c), w(0x7d) \
    }
/* clang-format on */

/* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */
#define SSE2NEON_AES_H0(x) (x)
static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0);
static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0);
#undef SSE2NEON_AES_H0

/* x_time function and matrix multiply function */
#if !defined(__aarch64__) && !defined(_M_ARM64)
#define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b))
#define SSE2NEON_MULTIPLY(x, y) \
    (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^ \
     ((y >> 2 & 1) * SSE2NEON_XT(SSE2NEON_XT(x))) ^ \
     ((y >> 3 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))) ^ \
     ((y >> 4 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x))))))
#endif

// In the absence of crypto extensions, implement aesenc using regular NEON
// intrinsics instead. See:
// https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/
// and https://www.workofard.com/2017/07/ghash-for-low-end-cores/
// for more information.
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    static const uint8_t shift_rows[] = {
        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
    };
    static const uint8_t ror32by8[] = {
        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
    };

    uint8x16_t v;
    uint8x16_t w = vreinterpretq_u8_m128i(a);

    /* shift rows */
    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));

    /* sub bytes */
    // The 256-byte S-box is split into four 64-byte tables that are looked up
    // one after another. Each subsequent lookup uses the next 64-byte slice,
    // so the index passed to `vqtbx4q_u8()` is offset down by the same
    // constant as the slice that was loaded; out-of-range indices leave the
    // destination lanes untouched.
    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
    // 'w - 0x40' is equivalent to 'vsubq_u8(w, vdupq_n_u8(0x40))'
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);

    /* mix columns */
    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));

    /* add round key */
    return vreinterpretq_m128i_u8(w) ^ RoundKey;

#else /* ARMv7-A implementation for a table-based AES */
#define SSE2NEON_AES_B2W(b0, b1, b2, b3) \
    (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \
     ((uint32_t) (b1) << 8) | (uint32_t) (b0))
// multiplying 'x' by 2 in GF(2^8)
#define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */))
// multiplying 'x' by 3 in GF(2^8)
#define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x)
#define SSE2NEON_AES_U0(p) \
    SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p))
#define SSE2NEON_AES_U1(p) \
    SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p)
#define SSE2NEON_AES_U2(p) \
    SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p)
#define SSE2NEON_AES_U3(p) \
    SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p))

    // this generates a table containing every possible permutation of
    // shift_rows() and sub_bytes() with mix_columns().
    static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = {
        SSE2NEON_AES_SBOX(SSE2NEON_AES_U0),
        SSE2NEON_AES_SBOX(SSE2NEON_AES_U1),
        SSE2NEON_AES_SBOX(SSE2NEON_AES_U2),
        SSE2NEON_AES_SBOX(SSE2NEON_AES_U3),
    };
#undef SSE2NEON_AES_B2W
#undef SSE2NEON_AES_F2
#undef SSE2NEON_AES_F3
#undef SSE2NEON_AES_U0
#undef SSE2NEON_AES_U1
#undef SSE2NEON_AES_U2
#undef SSE2NEON_AES_U3

    uint32_t x0 = _mm_cvtsi128_si32(a);  // get a[31:0]
    uint32_t x1 =
        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));  // get a[63:32]
    uint32_t x2 =
        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xAA));  // get a[95:64]
    uint32_t x3 =
        _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));  // get a[127:96]

    // finish the modulo addition step in mix_columns()
    __m128i out = _mm_set_epi32(
        (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^
         aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]),
        (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^
         aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]),
        (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^
         aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]),
        (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^
         aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24]));

    return _mm_xor_si128(out, RoundKey);
#endif
}
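
/* Usage sketch: with an expanded AES-128 key schedule (11 round keys), a full
 * block encryption built on the intrinsics in this file would look roughly
 * like the helper below. The `rk` array and helper name are hypothetical:
 *
 *   static inline __m128i example_aes128_encrypt(__m128i block,
 *                                                const __m128i rk[11])
 *   {
 *       block = _mm_xor_si128(block, rk[0]);        // initial AddRoundKey
 *       for (int i = 1; i < 10; i++)
 *           block = _mm_aesenc_si128(block, rk[i]); // rounds 1..9
 *       return _mm_aesenclast_si128(block, rk[10]); // final round, no MixColumns
 *   }
 */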

// Perform one round of an AES decryption flow on data (state) in a using the
// round key in RoundKey, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
{
#if defined(__aarch64__)
    static const uint8_t inv_shift_rows[] = {
        0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
        0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
    };
    static const uint8_t ror32by8[] = {
        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
    };

    uint8x16_t v;
    uint8x16_t w = vreinterpretq_u8_m128i(a);

    // inverse shift rows
    w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));

    // inverse sub bytes
    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);

    // inverse mix columns
    // multiplying 'v' by 4 in GF(2^8)
    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
    w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
    v ^= w;
    v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);

    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) &
                                 0x1b);  // multiplying 'v' by 2 in GF(2^8)
    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));

    // add round key
    return vreinterpretq_m128i_u8(w) ^ RoundKey;

#else /* ARMv7-A NEON implementation */
    /* FIXME: optimized for NEON */
    uint8_t i, e, f, g, h, v[4][4];
    uint8_t *_a = (uint8_t *) &a;
    for (i = 0; i < 16; ++i) {
        v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
    }

    // inverse mix columns
    for (i = 0; i < 4; ++i) {
        e = v[i][0];
        f = v[i][1];
        g = v[i][2];
        h = v[i][3];

        v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
                  SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
        v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
                  SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
        v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
                  SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
        v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
                  SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
    }

    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
#endif
}
// Perform the last round of an AES encryption flow on data (state) in a using
// the round key in RoundKey, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
{
#if defined(__aarch64__)
    static const uint8_t shift_rows[] = {
        0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3,
        0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb,
    };

    uint8x16_t v;
    uint8x16_t w = vreinterpretq_u8_m128i(a);

    // shift rows
    w = vqtbl1q_u8(w, vld1q_u8(shift_rows));

    // sub bytes
    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0);

    // add round key
    return vreinterpretq_m128i_u8(v) ^ RoundKey;

#else /* ARMv7-A implementation */
    uint8_t v[16] = {
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 0)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 5)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 10)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 15)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 4)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 9)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 14)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 3)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 8)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 13)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 2)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 7)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 12)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 1)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 6)],
        _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)],
    };

    return vreinterpretq_m128i_u8(vld1q_u8(v)) ^ RoundKey;
#endif
}

// Perform the last round of an AES decryption flow on data (state) in a using
// the round key in RoundKey, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
{
#if defined(__aarch64__)
    static const uint8_t inv_shift_rows[] = {
        0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb,
        0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3,
    };

    uint8x16_t v;
    uint8x16_t w = vreinterpretq_u8_m128i(a);

    // inverse shift rows
    w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows));

    // inverse sub bytes
    v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0);

    // add round key
    return vreinterpretq_m128i_u8(v) ^ RoundKey;

#else /* ARMv7-A NEON implementation */
    /* FIXME: optimized for NEON */
    uint8_t v[4][4];
    uint8_t *_a = (uint8_t *) &a;
    for (int i = 0; i < 16; ++i) {
        v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]];
    }

    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey;
#endif
}

// Perform the InvMixColumns transformation on a and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
{
#if defined(__aarch64__)
    static const uint8_t ror32by8[] = {
        0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4,
        0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc,
    };
    uint8x16_t v = vreinterpretq_u8_m128i(a);
    uint8x16_t w;

    // multiplying 'v' by 4 in GF(2^8)
    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
    w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b);
    v ^= w;
    v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w);

    // multiplying 'v' by 2 in GF(2^8)
    w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b);
    w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v);
    w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8));
    return vreinterpretq_m128i_u8(w);

#else /* ARMv7-A NEON implementation */
    uint8_t i, e, f, g, h, v[4][4];
    vst1q_u8((uint8_t *) v, vreinterpretq_u8_m128i(a));
    for (i = 0; i < 4; ++i) {
        e = v[i][0];
        f = v[i][1];
        g = v[i][2];
        h = v[i][3];

        v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^
                  SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09);
        v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^
                  SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d);
        v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^
                  SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b);
        v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^
                  SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e);
    }

    return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v));
#endif
}
// Assist in expanding the AES cipher key by computing steps towards generating
// a round key for encryption cipher using data from a and an 8-bit round
// constant specified in imm8, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
//
// Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist.
// This instruction generates a round key for AES encryption. See
// https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/
// for details.
FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
{
#if defined(__aarch64__)
    uint8x16_t _a = vreinterpretq_u8_m128i(a);
    uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80);
    v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0);

    uint32x4_t v_u32 = vreinterpretq_u32_u8(v);
    uint32x4_t ror_v = vorrq_u32(vshrq_n_u32(v_u32, 8), vshlq_n_u32(v_u32, 24));
    uint32x4_t ror_xor_v = veorq_u32(ror_v, vdupq_n_u32(rcon));

    return vreinterpretq_m128i_u32(vtrn2q_u32(v_u32, ror_xor_v));

#else /* ARMv7-A NEON implementation */
    uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55));
    uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF));
    for (int i = 0; i < 4; ++i) {
        ((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]];
        ((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]];
    }
    return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3,
                         ((X1 >> 8) | (X1 << 24)) ^ rcon, X1);
#endif
}
#undef SSE2NEON_AES_SBOX
#undef SSE2NEON_AES_RSBOX

#if defined(__aarch64__)
#undef SSE2NEON_XT
#undef SSE2NEON_MULTIPLY
#endif
#else /* __ARM_FEATURE_CRYPTO */
// Implements equivalent of 'aesenc' by combining AESE (with an empty key) and
// AESMC and then manually applying the real key as an xor operation. This
// unfortunately means an additional xor op; the compiler should be able to
// optimize this away for repeated calls however. See
// https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a
// for more details.
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u8(veorq_u8(
        vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
        vreinterpretq_u8_m128i(b)));
}

// Perform one round of an AES decryption flow on data (state) in a using the
// round key in RoundKey, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128
FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey)
{
    return vreinterpretq_m128i_u8(veorq_u8(
        vaesimcq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
        vreinterpretq_u8_m128i(RoundKey)));
}

// Perform the last round of an AES encryption flow on data (state) in a using
// the round key in RoundKey, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128
FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey)
{
    return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8(
                             vreinterpretq_u8_m128i(a), vdupq_n_u8(0))),
                         RoundKey);
}

// Perform the last round of an AES decryption flow on data (state) in a using
// the round key in RoundKey, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128
FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey)
{
    return vreinterpretq_m128i_u8(
        veorq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)),
                 vreinterpretq_u8_m128i(RoundKey)));
}

// Perform the InvMixColumns transformation on a and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128
FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a)
{
    return vreinterpretq_m128i_u8(vaesimcq_u8(vreinterpretq_u8_m128i(a)));
}

// Assist in expanding the AES cipher key by computing steps towards generating
// a round key for encryption cipher using data from a and an 8-bit round
// constant specified in imm8, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128
FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon)
{
    // AESE does ShiftRows and SubBytes on A
    uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0));

#ifndef _MSC_VER
    uint8x16_t dest = {
        // Undo ShiftRows step from AESE and extract X1 and X3
        u8[0x4], u8[0x1], u8[0xE], u8[0xB],  // SubBytes(X1)
        u8[0x1], u8[0xE], u8[0xB], u8[0x4],  // ROT(SubBytes(X1))
        u8[0xC], u8[0x9], u8[0x6], u8[0x3],  // SubBytes(X3)
        u8[0x9], u8[0x6], u8[0x3], u8[0xC],  // ROT(SubBytes(X3))
    };
    uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon};
    return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r);
#else
    // We have to do this hack because MSVC is strictly adhering to the CPP
    // standard, in particular C++03 8.5.1 sub-section 15, which states that
    // unions must be initialized by their first member type.

    // As per the Windows ARM64 ABI, it is always little endian, so this works
    __n128 dest{
        ((uint64_t) u8.n128_u8[0x4] << 0) | ((uint64_t) u8.n128_u8[0x1] << 8) |
            ((uint64_t) u8.n128_u8[0xE] << 16) |
            ((uint64_t) u8.n128_u8[0xB] << 24) |
            ((uint64_t) u8.n128_u8[0x1] << 32) |
            ((uint64_t) u8.n128_u8[0xE] << 40) |
            ((uint64_t) u8.n128_u8[0xB] << 48) |
            ((uint64_t) u8.n128_u8[0x4] << 56),
        ((uint64_t) u8.n128_u8[0xC] << 0) | ((uint64_t) u8.n128_u8[0x9] << 8) |
            ((uint64_t) u8.n128_u8[0x6] << 16) |
            ((uint64_t) u8.n128_u8[0x3] << 24) |
            ((uint64_t) u8.n128_u8[0x9] << 32) |
            ((uint64_t) u8.n128_u8[0x6] << 40) |
            ((uint64_t) u8.n128_u8[0x3] << 48) |
            ((uint64_t) u8.n128_u8[0xC] << 56)};

    dest.n128_u32[1] = dest.n128_u32[1] ^ rcon;
    dest.n128_u32[3] = dest.n128_u32[3] ^ rcon;

    return dest;
#endif
}
#endif
/* Others */

// Perform a carry-less multiplication of two 64-bit integers, selected from a
// and b according to imm8, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128
FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm)
{
    uint64x2_t a = vreinterpretq_u64_m128i(_a);
    uint64x2_t b = vreinterpretq_u64_m128i(_b);
    switch (imm & 0x11) {
    case 0x00:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b)));
    case 0x01:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b)));
    case 0x10:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b)));
    case 0x11:
        return vreinterpretq_m128i_u64(
            _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b)));
    default:
        abort();
    }
}
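
/* As the switch above shows, bit 0 of imm8 selects the 64-bit half of a and
 * bit 4 selects the half of b, matching PCLMULQDQ. For example (illustrative):
 *
 *   __m128i lo_lo = _mm_clmulepi64_si128(a, b, 0x00); // a[63:0]   * b[63:0]
 *   __m128i hi_hi = _mm_clmulepi64_si128(a, b, 0x11); // a[127:64] * b[127:64]
 *
 * where "*" denotes carry-less (GF(2)[x]) multiplication yielding a 128-bit
 * product.
 */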

FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode(void)
{
    union {
        fpcr_bitfield field;
#if defined(__aarch64__) || defined(_M_ARM64)
        uint64_t value;
#else
        uint32_t value;
#endif
    } r;

#if defined(__aarch64__) || defined(_M_ARM64)
    r.value = _sse2neon_get_fpcr();
#else
    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF;
}

// Count the number of bits set to 1 in unsigned 32-bit integer a, and
// return that count in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32
FORCE_INLINE int _mm_popcnt_u32(unsigned int a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
#if __has_builtin(__builtin_popcount)
    return __builtin_popcount(a);
#elif defined(_MSC_VER)
    return _CountOneBits(a);
#else
    return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a)));
#endif
#else
    uint32_t count = 0;
    uint8x8_t input_val, count8x8_val;
    uint16x4_t count16x4_val;
    uint32x2_t count32x2_val;

    input_val = vld1_u8((uint8_t *) &a);
    count8x8_val = vcnt_u8(input_val);
    count16x4_val = vpaddl_u8(count8x8_val);
    count32x2_val = vpaddl_u16(count16x4_val);

    vst1_u32(&count, count32x2_val);
    return count;
#endif
}

// Count the number of bits set to 1 in unsigned 64-bit integer a, and
// return that count in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64
FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
#if __has_builtin(__builtin_popcountll)
    return __builtin_popcountll(a);
#elif defined(_MSC_VER)
    return _CountOneBits64(a);
#else
    return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a)));
#endif
#else
    uint64_t count = 0;
    uint8x8_t input_val, count8x8_val;
    uint16x4_t count16x4_val;
    uint32x2_t count32x2_val;
    uint64x1_t count64x1_val;

    input_val = vld1_u8((uint8_t *) &a);
    count8x8_val = vcnt_u8(input_val);
    count16x4_val = vpaddl_u8(count8x8_val);
    count32x2_val = vpaddl_u16(count16x4_val);
    count64x1_val = vpaddl_u32(count32x2_val);
    vst1_u64(&count, count64x1_val);
    return count;
#endif
}
FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag)
{
    // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting,
    // regardless of the value of the FZ bit.
    union {
        fpcr_bitfield field;
#if defined(__aarch64__) || defined(_M_ARM64)
        uint64_t value;
#else
        uint32_t value;
#endif
    } r;

#if defined(__aarch64__) || defined(_M_ARM64)
    r.value = _sse2neon_get_fpcr();
#else
    __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */
#endif

    r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON;

#if defined(__aarch64__) || defined(_M_ARM64)
    _sse2neon_set_fpcr(r.value);
#else
    __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */
#endif
}

// Return the current 64-bit value of the processor's time-stamp counter.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc
FORCE_INLINE uint64_t _rdtsc(void)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    uint64_t val;

    /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the
     * system counter is at least 56 bits wide; from Armv8.6, the counter
     * must be 64 bits wide. So the system counter could be less than 64
     * bits wide, in which case it is reported with the
     * 'cap_user_time_short' flag set.
     */
#if defined(_MSC_VER)
    val = _ReadStatusReg(ARM64_SYSREG(3, 3, 14, 0, 2));
#else
    __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val));
#endif

    return val;
#else
    uint32_t pmccntr, pmuseren, pmcntenset;
    // Read the user mode Performance Monitoring Unit (PMU)
    // User Enable Register (PMUSERENR) access permissions.
    __asm__ __volatile__("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren));
    if (pmuseren & 1) { // Allows reading PMUSERENR for user mode code.
        __asm__ __volatile__("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset));
        if (pmcntenset & 0x80000000UL) { // Is it counting?
            __asm__ __volatile__("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr));
            // The counter is set up to count every 64th cycle
            return (uint64_t) (pmccntr) << 6;
        }
    }

    // Fallback to syscall as we can't enable PMUSERENR in user mode.
    struct timeval tv;
    gettimeofday(&tv, NULL);
    return (uint64_t) (tv.tv_sec) * 1000000 + tv.tv_usec;
#endif
}
#if defined(__GNUC__) || defined(__clang__)
#pragma pop_macro("ALIGN_STRUCT")
#pragma pop_macro("FORCE_INLINE")
#endif

#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC pop_options
#endif

#endif