/*****************************************************************************
 * macros.h: msa macros
 *****************************************************************************
 * Copyright (C) 2015-2016 x264 project
 *
 * Authors: Rishikesh More <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at [email protected].
 *****************************************************************************/

#ifndef X264_MIPS_MACROS_H
#define X264_MIPS_MACROS_H

#include <stdint.h>
#include <msa.h>

#define LD_B( RTYPE, p_src ) *( ( RTYPE * )( p_src ) )
#define LD_UB( ... ) LD_B( v16u8, __VA_ARGS__ )
#define LD_SB( ... ) LD_B( v16i8, __VA_ARGS__ )

#define LD_H( RTYPE, p_src ) *( ( RTYPE * )( p_src ) )
#define LD_SH( ... ) LD_H( v8i16, __VA_ARGS__ )

#define LD_W( RTYPE, p_src ) *( ( RTYPE * )( p_src ) )
#define LD_SW( ... ) LD_W( v4i32, __VA_ARGS__ )

#define ST_B( RTYPE, in, p_dst ) *( ( RTYPE * )( p_dst ) ) = ( in )
#define ST_UB( ... ) ST_B( v16u8, __VA_ARGS__ )
#define ST_SB( ... ) ST_B( v16i8, __VA_ARGS__ )

#define ST_H( RTYPE, in, p_dst ) *( ( RTYPE * )( p_dst ) ) = ( in )
#define ST_UH( ... ) ST_H( v8u16, __VA_ARGS__ )
#define ST_SH( ... ) ST_H( v8i16, __VA_ARGS__ )
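
/* Usage sketch (editorial note, not part of the upstream header): these
   wrappers just reinterpret the pointer as a vector type, so a 16-byte row
   copy could look like the lines below ('row', 'p_src' and 'p_dst' are
   hypothetical names); the compiler emits MSA vector load/store
   instructions for the dereference.

       v16u8 row = LD_UB( p_src );
       ST_UB( row, p_dst );
*/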

#if ( __mips_isa_rev >= 6 )
#define LH( p_src )                                  \
( {                                                  \
    uint8_t *p_src_m = ( uint8_t * ) ( p_src );      \
    uint16_t u_val_h_m;                              \
                                                     \
    asm volatile (                                   \
        "lh %[u_val_h_m], %[p_src_m] \n\t"           \
                                                     \
        : [u_val_h_m] "=r" ( u_val_h_m )             \
        : [p_src_m] "m" ( *p_src_m )                 \
    );                                               \
                                                     \
    u_val_h_m;                                       \
} )

#define LW( p_src )                                  \
( {                                                  \
    uint8_t *p_src_m = ( uint8_t * ) ( p_src );      \
    uint32_t u_val_w_m;                              \
                                                     \
    asm volatile (                                   \
        "lw %[u_val_w_m], %[p_src_m] \n\t"           \
                                                     \
        : [u_val_w_m] "=r" ( u_val_w_m )             \
        : [p_src_m] "m" ( *p_src_m )                 \
    );                                               \
                                                     \
    u_val_w_m;                                       \
} )

#if ( __mips == 64 )
#define LD( p_src )                                  \
( {                                                  \
    uint8_t *p_src_m = ( uint8_t * ) ( p_src );      \
    uint64_t u_val_d_m = 0;                          \
                                                     \
    asm volatile (                                   \
        "ld %[u_val_d_m], %[p_src_m] \n\t"           \
                                                     \
        : [u_val_d_m] "=r" ( u_val_d_m )             \
        : [p_src_m] "m" ( *p_src_m )                 \
    );                                               \
                                                     \
    u_val_d_m;                                       \
} )
#else // !( __mips == 64 )
#define LD( p_src )                                                 \
( {                                                                 \
    uint8_t *p_src_m = ( uint8_t * ) ( p_src );                     \
    uint32_t u_val0_m, u_val1_m;                                    \
    uint64_t u_val_d_m = 0;                                         \
                                                                    \
    u_val0_m = LW( p_src_m );                                       \
    u_val1_m = LW( p_src_m + 4 );                                   \
                                                                    \
    u_val_d_m = ( uint64_t ) ( u_val1_m );                          \
    u_val_d_m = ( uint64_t ) ( ( u_val_d_m << 32 ) &                \
                               0xFFFFFFFF00000000 );                \
    u_val_d_m = ( uint64_t ) ( u_val_d_m | ( uint64_t ) u_val0_m ); \
                                                                    \
    u_val_d_m;                                                      \
} )
#endif // ( __mips == 64 )

#define SH( u_val, p_dst )                           \
{                                                    \
    uint8_t *p_dst_m = ( uint8_t * ) ( p_dst );      \
    uint16_t u_val_h_m = ( u_val );                  \
                                                     \
    asm volatile (                                   \
        "sh %[u_val_h_m], %[p_dst_m] \n\t"           \
                                                     \
        : [p_dst_m] "=m" ( *p_dst_m )                \
        : [u_val_h_m] "r" ( u_val_h_m )              \
    );                                               \
}

#define SW( u_val, p_dst )                           \
{                                                    \
    uint8_t *p_dst_m = ( uint8_t * ) ( p_dst );      \
    uint32_t u_val_w_m = ( u_val );                  \
                                                     \
    asm volatile (                                   \
        "sw %[u_val_w_m], %[p_dst_m] \n\t"           \
                                                     \
        : [p_dst_m] "=m" ( *p_dst_m )                \
        : [u_val_w_m] "r" ( u_val_w_m )              \
    );                                               \
}

#define SD( u_val, p_dst )                           \
{                                                    \
    uint8_t *p_dst_m = ( uint8_t * ) ( p_dst );      \
    uint64_t u_val_d_m = ( u_val );                  \
                                                     \
    asm volatile (                                   \
        "sd %[u_val_d_m], %[p_dst_m] \n\t"           \
                                                     \
        : [p_dst_m] "=m" ( *p_dst_m )                \
        : [u_val_d_m] "r" ( u_val_d_m )              \
    );                                               \
}

#else // !( __mips_isa_rev >= 6 )
#define LH( p_src )                                  \
( {                                                  \
    uint8_t *p_src_m = ( uint8_t * ) ( p_src );      \
    uint16_t u_val_h_m;                              \
                                                     \
    asm volatile (                                   \
        "ulh %[u_val_h_m], %[p_src_m] \n\t"          \
                                                     \
        : [u_val_h_m] "=r" ( u_val_h_m )             \
        : [p_src_m] "m" ( *p_src_m )                 \
    );                                               \
                                                     \
    u_val_h_m;                                       \
} )

#define LW( p_src )                                  \
( {                                                  \
    uint8_t *p_src_m = ( uint8_t * ) ( p_src );      \
    uint32_t u_val_w_m;                              \
                                                     \
    asm volatile (                                   \
        "ulw %[u_val_w_m], %[p_src_m] \n\t"          \
                                                     \
        : [u_val_w_m] "=r" ( u_val_w_m )             \
        : [p_src_m] "m" ( *p_src_m )                 \
    );                                               \
                                                     \
    u_val_w_m;                                       \
} )

#if ( __mips == 64 )
#define LD( p_src )                                  \
( {                                                  \
    uint8_t *p_src_m = ( uint8_t * ) ( p_src );      \
    uint64_t u_val_d_m = 0;                          \
                                                     \
    asm volatile (                                   \
        "uld %[u_val_d_m], %[p_src_m] \n\t"          \
                                                     \
        : [u_val_d_m] "=r" ( u_val_d_m )             \
        : [p_src_m] "m" ( *p_src_m )                 \
    );                                               \
                                                     \
    u_val_d_m;                                       \
} )
#else // !( __mips == 64 )
#define LD( p_src )                                                 \
( {                                                                 \
    uint8_t *psrc_m1 = ( uint8_t * ) ( p_src );                     \
    uint32_t u_val0_m, u_val1_m;                                    \
    uint64_t u_val_d_m = 0;                                         \
                                                                    \
    u_val0_m = LW( psrc_m1 );                                       \
    u_val1_m = LW( psrc_m1 + 4 );                                   \
                                                                    \
    u_val_d_m = ( uint64_t ) ( u_val1_m );                          \
    u_val_d_m = ( uint64_t ) ( ( u_val_d_m << 32 ) &                \
                               0xFFFFFFFF00000000 );                \
    u_val_d_m = ( uint64_t ) ( u_val_d_m | ( uint64_t ) u_val0_m ); \
                                                                    \
    u_val_d_m;                                                      \
} )
#endif // ( __mips == 64 )

#define SH( u_val, p_dst )                           \
{                                                    \
    uint8_t *p_dst_m = ( uint8_t * ) ( p_dst );      \
    uint16_t u_val_h_m = ( u_val );                  \
                                                     \
    asm volatile (                                   \
        "ush %[u_val_h_m], %[p_dst_m] \n\t"          \
                                                     \
        : [p_dst_m] "=m" ( *p_dst_m )                \
        : [u_val_h_m] "r" ( u_val_h_m )              \
    );                                               \
}

#define SW( u_val, p_dst )                           \
{                                                    \
    uint8_t *p_dst_m = ( uint8_t * ) ( p_dst );      \
    uint32_t u_val_w_m = ( u_val );                  \
                                                     \
    asm volatile (                                   \
        "usw %[u_val_w_m], %[p_dst_m] \n\t"          \
                                                     \
        : [p_dst_m] "=m" ( *p_dst_m )                \
        : [u_val_w_m] "r" ( u_val_w_m )              \
    );                                               \
}

#define SD( u_val, p_dst )                                                \
{                                                                         \
    uint8_t *p_dst_m1 = ( uint8_t * ) ( p_dst );                          \
    uint32_t u_val0_m, u_val1_m;                                          \
                                                                          \
    u_val0_m = ( uint32_t ) ( ( u_val ) & 0x00000000FFFFFFFF );           \
    u_val1_m = ( uint32_t ) ( ( ( u_val ) >> 32 ) & 0x00000000FFFFFFFF ); \
                                                                          \
    SW( u_val0_m, p_dst_m1 );                                             \
    SW( u_val1_m, p_dst_m1 + 4 );                                         \
}

#endif // ( __mips_isa_rev >= 6 )
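
/* Usage sketch (editorial note): LH/LW/LD and SH/SW/SD hide the ISA split
   above -- pre-R6 builds use the unaligned ulh/ulw/uld and ush/usw forms
   (or synthesize 64-bit accesses from two 32-bit ones), while R6 builds use
   the plain instructions. Copying 8 possibly-unaligned bytes might be
   written as follows, with 'p_src'/'p_dst' as hypothetical byte pointers:

       uint64_t u_tmp = LD( p_src );
       SD( u_tmp, p_dst );

   Note that the load macros rely on GCC statement expressions, as does the
   rest of this header.
*/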

/* Description : Load 4 words with stride
   Arguments   : Inputs  - psrc (source pointer to load from)
                         - stride
                 Outputs - out0, out1, out2, out3
   Details     : Load word in 'out0' from (psrc)
                 Load word in 'out1' from (psrc + stride)
                 Load word in 'out2' from (psrc + 2 * stride)
                 Load word in 'out3' from (psrc + 3 * stride)
*/
#define LW4( p_src, stride, out0, out1, out2, out3 ) \
{                                                    \
    out0 = LW( ( p_src ) );                          \
    out1 = LW( ( p_src ) + stride );                 \
    out2 = LW( ( p_src ) + 2 * stride );             \
    out3 = LW( ( p_src ) + 3 * stride );             \
}

/* Description : Store 4 words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Store word from 'in0' to (pdst)
                 Store word from 'in1' to (pdst + stride)
                 Store word from 'in2' to (pdst + 2 * stride)
                 Store word from 'in3' to (pdst + 3 * stride)
*/
#define SW4( in0, in1, in2, in3, p_dst, stride ) \
{                                                \
    SW( in0, ( p_dst ) );                        \
    SW( in1, ( p_dst ) + stride );               \
    SW( in2, ( p_dst ) + 2 * stride );           \
    SW( in3, ( p_dst ) + 3 * stride );           \
}

/* Description : Store 4 double words with stride
   Arguments   : Inputs - in0, in1, in2, in3, pdst, stride
   Details     : Store double word from 'in0' to (pdst)
                 Store double word from 'in1' to (pdst + stride)
                 Store double word from 'in2' to (pdst + 2 * stride)
                 Store double word from 'in3' to (pdst + 3 * stride)
*/
#define SD4( in0, in1, in2, in3, p_dst, stride ) \
{                                                \
    SD( in0, ( p_dst ) );                        \
    SD( in1, ( p_dst ) + stride );               \
    SD( in2, ( p_dst ) + 2 * stride );           \
    SD( in3, ( p_dst ) + 3 * stride );           \
}
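
/* Usage sketch (editorial note): the strided scalar variants move one word
   or double word per row; e.g. a 4x4 byte block copy via GP registers,
   with hypothetical locals:

       uint32_t w0, w1, w2, w3;
       LW4( p_src, i_src_stride, w0, w1, w2, w3 );
       SW4( w0, w1, w2, w3, p_dst, i_dst_stride );
*/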

/* Description : Load vectors with 16 byte elements with stride
   Arguments   : Inputs  - psrc (source pointer to load from)
                         - stride
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Load 16 byte elements in 'out0' from (psrc)
                 Load 16 byte elements in 'out1' from (psrc + stride)
*/
#define LD_B2( RTYPE, p_src, stride, out0, out1 ) \
{                                                 \
    out0 = LD_B( RTYPE, ( p_src ) );              \
    out1 = LD_B( RTYPE, ( p_src ) + stride );     \
}
#define LD_UB2( ... ) LD_B2( v16u8, __VA_ARGS__ )
#define LD_SB2( ... ) LD_B2( v16i8, __VA_ARGS__ )

#define LD_B3( RTYPE, p_src, stride, out0, out1, out2 ) \
{                                                       \
    LD_B2( RTYPE, ( p_src ), stride, out0, out1 );      \
    out2 = LD_B( RTYPE, ( p_src ) + 2 * stride );       \
}
#define LD_UB3( ... ) LD_B3( v16u8, __VA_ARGS__ )
#define LD_SB3( ... ) LD_B3( v16i8, __VA_ARGS__ )

#define LD_B4( RTYPE, p_src, stride, out0, out1, out2, out3 )   \
{                                                               \
    LD_B2( RTYPE, ( p_src ), stride, out0, out1 );              \
    LD_B2( RTYPE, ( p_src ) + 2 * stride, stride, out2, out3 ); \
}
#define LD_UB4( ... ) LD_B4( v16u8, __VA_ARGS__ )
#define LD_SB4( ... ) LD_B4( v16i8, __VA_ARGS__ )

#define LD_B5( RTYPE, p_src, stride, out0, out1, out2, out3, out4 ) \
{                                                                   \
    LD_B4( RTYPE, ( p_src ), stride, out0, out1, out2, out3 );      \
    out4 = LD_B( RTYPE, ( p_src ) + 4 * stride );                   \
}
#define LD_UB5( ... ) LD_B5( v16u8, __VA_ARGS__ )
#define LD_SB5( ... ) LD_B5( v16i8, __VA_ARGS__ )

#define LD_B8( RTYPE, p_src, stride,                                        \
               out0, out1, out2, out3, out4, out5, out6, out7 )             \
{                                                                           \
    LD_B4( RTYPE, ( p_src ), stride, out0, out1, out2, out3 );              \
    LD_B4( RTYPE, ( p_src ) + 4 * stride, stride, out4, out5, out6, out7 ); \
}
#define LD_UB8( ... ) LD_B8( v16u8, __VA_ARGS__ )
#define LD_SB8( ... ) LD_B8( v16i8, __VA_ARGS__ )
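
/* Usage sketch (editorial note): the LD_{U,S}B{2,3,4,5,8} family unrolls
   strided row loads; reading eight 16-byte rows of a block could look like
   this (all names hypothetical):

       v16u8 r0, r1, r2, r3, r4, r5, r6, r7;
       LD_UB8( p_src, i_stride, r0, r1, r2, r3, r4, r5, r6, r7 );
*/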

/* Description : Load vectors with 8 halfword elements with stride
   Arguments   : Inputs  - psrc (source pointer to load from)
                         - stride
                 Outputs - out0, out1
   Details     : Load 8 halfword elements in 'out0' from (psrc)
                 Load 8 halfword elements in 'out1' from (psrc + stride)
*/
#define LD_H2( RTYPE, p_src, stride, out0, out1 ) \
{                                                 \
    out0 = LD_H( RTYPE, ( p_src ) );              \
    out1 = LD_H( RTYPE, ( p_src ) + ( stride ) ); \
}
#define LD_SH2( ... ) LD_H2( v8i16, __VA_ARGS__ )

#define LD_H4( RTYPE, p_src, stride, out0, out1, out2, out3 )   \
{                                                               \
    LD_H2( RTYPE, ( p_src ), stride, out0, out1 );              \
    LD_H2( RTYPE, ( p_src ) + 2 * stride, stride, out2, out3 ); \
}
#define LD_SH4( ... ) LD_H4( v8i16, __VA_ARGS__ )

#define LD_H8( RTYPE, p_src, stride,                                        \
               out0, out1, out2, out3, out4, out5, out6, out7 )             \
{                                                                           \
    LD_H4( RTYPE, ( p_src ), stride, out0, out1, out2, out3 );              \
    LD_H4( RTYPE, ( p_src ) + 4 * stride, stride, out4, out5, out6, out7 ); \
}
#define LD_SH8( ... ) LD_H8( v8i16, __VA_ARGS__ )

/* Description : Load 4x4 block of signed halfword elements from 1D source
                 data into 4 vectors (each vector with 4 signed halfwords)
   Arguments   : Inputs  - psrc
                 Outputs - out0, out1, out2, out3
*/
#define LD4x4_SH( p_src, out0, out1, out2, out3 )                    \
{                                                                    \
    out0 = LD_SH( p_src );                                           \
    out2 = LD_SH( p_src + 8 );                                       \
    out1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out0, ( v2i64 ) out0 ); \
    out3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out2, ( v2i64 ) out2 ); \
}

/* Description : Load 2 vectors of signed word elements with stride
   Arguments   : Inputs  - psrc (source pointer to load from)
                         - stride
                 Outputs - out0, out1
                 Return Type - signed word
*/
#define LD_SW2( p_src, stride, out0, out1 ) \
{                                           \
    out0 = LD_SW( ( p_src ) );              \
    out1 = LD_SW( ( p_src ) + stride );     \
}

/* Description : Store vectors of 16 byte elements with stride
   Arguments   : Inputs - in0, in1, stride
                        - pdst (destination pointer to store to)
   Details     : Store 16 byte elements from 'in0' to (pdst)
                 Store 16 byte elements from 'in1' to (pdst + stride)
*/
#define ST_B2( RTYPE, in0, in1, p_dst, stride ) \
{                                               \
    ST_B( RTYPE, in0, ( p_dst ) );              \
    ST_B( RTYPE, in1, ( p_dst ) + stride );     \
}
#define ST_UB2( ... ) ST_B2( v16u8, __VA_ARGS__ )

#define ST_B4( RTYPE, in0, in1, in2, in3, p_dst, stride )     \
{                                                             \
    ST_B2( RTYPE, in0, in1, ( p_dst ), stride );              \
    ST_B2( RTYPE, in2, in3, ( p_dst ) + 2 * stride, stride ); \
}
#define ST_UB4( ... ) ST_B4( v16u8, __VA_ARGS__ )
#define ST_SB4( ... ) ST_B4( v16i8, __VA_ARGS__ )

#define ST_B8( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,           \
               p_dst, stride )                                          \
{                                                                       \
    ST_B4( RTYPE, in0, in1, in2, in3, p_dst, stride );                  \
    ST_B4( RTYPE, in4, in5, in6, in7, ( p_dst ) + 4 * stride, stride ); \
}
#define ST_UB8( ... ) ST_B8( v16u8, __VA_ARGS__ )

/* Description : Store vectors of 8 halfword elements with stride
   Arguments   : Inputs - in0, in1, stride
                        - pdst (destination pointer to store to)
   Details     : Store 8 halfword elements from 'in0' to (pdst)
                 Store 8 halfword elements from 'in1' to (pdst + stride)
*/
#define ST_H2( RTYPE, in0, in1, p_dst, stride ) \
{                                               \
    ST_H( RTYPE, in0, ( p_dst ) );              \
    ST_H( RTYPE, in1, ( p_dst ) + stride );     \
}
#define ST_SH2( ... ) ST_H2( v8i16, __VA_ARGS__ )

#define ST_H4( RTYPE, in0, in1, in2, in3, p_dst, stride )     \
{                                                             \
    ST_H2( RTYPE, in0, in1, ( p_dst ), stride );              \
    ST_H2( RTYPE, in2, in3, ( p_dst ) + 2 * stride, stride ); \
}
#define ST_SH4( ... ) ST_H4( v8i16, __VA_ARGS__ )

#define ST_H8( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, p_dst, stride ) \
{                                                                             \
    ST_H4( RTYPE, in0, in1, in2, in3, ( p_dst ), stride );                    \
    ST_H4( RTYPE, in4, in5, in6, in7, ( p_dst ) + 4 * stride, stride );       \
}
#define ST_SH8( ... ) ST_H8( v8i16, __VA_ARGS__ )

/* Description : Store 2x4 byte block to destination memory from input vector
   Arguments   : Inputs - in, stidx, pdst, stride
   Details     : Index 'stidx' halfword element from 'in' vector is copied to
                 GP register and stored to (pdst)
                 Index 'stidx+1' halfword element from 'in' vector is copied
                 to GP register and stored to (pdst + stride)
                 Index 'stidx+2' halfword element from 'in' vector is copied
                 to GP register and stored to (pdst + 2 * stride)
                 Index 'stidx+3' halfword element from 'in' vector is copied
                 to GP register and stored to (pdst + 3 * stride)
*/
#define ST2x4_UB( in, stidx, p_dst, stride )                  \
{                                                             \
    uint16_t u_out0_m, u_out1_m, u_out2_m, u_out3_m;          \
    uint8_t *pblk_2x4_m = ( uint8_t * ) ( p_dst );            \
                                                              \
    u_out0_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx ) );     \
    u_out1_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx + 1 ) ); \
    u_out2_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx + 2 ) ); \
    u_out3_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx + 3 ) ); \
                                                              \
    SH( u_out0_m, pblk_2x4_m );                               \
    SH( u_out1_m, pblk_2x4_m + stride );                      \
    SH( u_out2_m, pblk_2x4_m + 2 * stride );                  \
    SH( u_out3_m, pblk_2x4_m + 3 * stride );                  \
}

/* Description : Store 4x4 byte block to destination memory from input vector
   Arguments   : Inputs - in0, in1, idx0, idx1, idx2, idx3, pdst, stride
   Details     : 'idx0' word element from input vector 'in0' is copied to
                 GP register and stored to (pdst)
                 'idx1' word element from input vector 'in0' is copied to
                 GP register and stored to (pdst + stride)
                 'idx2' word element from input vector 'in1' is copied to
                 GP register and stored to (pdst + 2 * stride)
                 'idx3' word element from input vector 'in1' is copied to
                 GP register and stored to (pdst + 3 * stride)
*/
#define ST4x4_UB( in0, in1, idx0, idx1, idx2, idx3, p_dst, stride )    \
{                                                                      \
    uint32_t u_out0_m, u_out1_m, u_out2_m, u_out3_m;                   \
    uint8_t *pblk_4x4_m = ( uint8_t * ) ( p_dst );                     \
                                                                       \
    u_out0_m = __msa_copy_u_w( ( v4i32 ) in0, idx0 );                  \
    u_out1_m = __msa_copy_u_w( ( v4i32 ) in0, idx1 );                  \
    u_out2_m = __msa_copy_u_w( ( v4i32 ) in1, idx2 );                  \
    u_out3_m = __msa_copy_u_w( ( v4i32 ) in1, idx3 );                  \
                                                                       \
    SW4( u_out0_m, u_out1_m, u_out2_m, u_out3_m, pblk_4x4_m, stride ); \
}

#define ST4x8_UB( in0, in1, p_dst, stride )                          \
{                                                                    \
    uint8_t *pblk_4x8 = ( uint8_t * ) ( p_dst );                     \
                                                                     \
    ST4x4_UB( in0, in0, 0, 1, 2, 3, pblk_4x8, stride );              \
    ST4x4_UB( in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride ); \
}

/* Description : Store 8x1 byte block to destination memory from input vector
   Arguments   : Inputs - in, pdst
   Details     : Index 0 double word element from 'in' vector is copied to
                 GP register and stored to (pdst)
*/
#define ST8x1_UB( in, p_dst )                     \
{                                                 \
    uint64_t u_out0_m;                            \
    u_out0_m = __msa_copy_u_d( ( v2i64 ) in, 0 ); \
    SD( u_out0_m, p_dst );                        \
}

/* Description : Store 8x4 byte block to destination memory from input
                 vectors
   Arguments   : Inputs - in0, in1, pdst, stride
   Details     : Index 0 double word element from 'in0' vector is copied to
                 GP register and stored to (pdst)
                 Index 1 double word element from 'in0' vector is copied to
                 GP register and stored to (pdst + stride)
                 Index 0 double word element from 'in1' vector is copied to
                 GP register and stored to (pdst + 2 * stride)
                 Index 1 double word element from 'in1' vector is copied to
                 GP register and stored to (pdst + 3 * stride)
*/
#define ST8x4_UB( in0, in1, p_dst, stride )                            \
{                                                                      \
    uint64_t u_out0_m, u_out1_m, u_out2_m, u_out3_m;                   \
    uint8_t *pblk_8x4_m = ( uint8_t * ) ( p_dst );                     \
                                                                       \
    u_out0_m = __msa_copy_u_d( ( v2i64 ) in0, 0 );                     \
    u_out1_m = __msa_copy_u_d( ( v2i64 ) in0, 1 );                     \
    u_out2_m = __msa_copy_u_d( ( v2i64 ) in1, 0 );                     \
    u_out3_m = __msa_copy_u_d( ( v2i64 ) in1, 1 );                     \
                                                                       \
    SD4( u_out0_m, u_out1_m, u_out2_m, u_out3_m, pblk_8x4_m, stride ); \
}
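
/* Usage sketch (editorial note): ST8x4_UB expects each input vector to hold
   two 8-byte rows in its double word elements 0 and 1, so an 8x4 block kept
   in two vectors is stored as (hypothetical names):

       ST8x4_UB( pred0, pred1, p_dst, i_dst_stride );
*/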

/* Description : Average with rounding: (in0 + in1 + 1) / 2
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from 'in0' is added to the
                 corresponding unsigned byte element from 'in1'. The average
                 with rounding is calculated and written to 'out0'
*/
#define AVER_UB2( RTYPE, in0, in1, in2, in3, out0, out1 )            \
{                                                                    \
    out0 = ( RTYPE ) __msa_aver_u_b( ( v16u8 ) in0, ( v16u8 ) in1 ); \
    out1 = ( RTYPE ) __msa_aver_u_b( ( v16u8 ) in2, ( v16u8 ) in3 ); \
}
#define AVER_UB2_UB( ... ) AVER_UB2( v16u8, __VA_ARGS__ )

#define AVER_UB4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                  out0, out1, out2, out3 )                       \
{                                                                \
    AVER_UB2( RTYPE, in0, in1, in2, in3, out0, out1 );           \
    AVER_UB2( RTYPE, in4, in5, in6, in7, out2, out3 );           \
}
#define AVER_UB4_UB( ... ) AVER_UB4( v16u8, __VA_ARGS__ )
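
/* Usage sketch (editorial note): rounding averages are the typical core of
   bi-prediction; the inputs pair up adjacently, each (a, b) pair producing
   one output (hypothetical names):

       AVER_UB4_UB( a0, b0, a1, b1, a2, b2, a3, b3,
                    avg0, avg1, avg2, avg3 );
*/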

/* Description : Immediate number of elements to slide with zero
   Arguments   : Inputs  - in0, in1, slide_val
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from 'zero_m' vector are slid into 'in0' by
                 the value specified in 'slide_val'
*/
#define SLDI_B2_0( RTYPE, in0, in1, out0, out1, slide_val )    \
{                                                              \
    v16i8 zero_m = { 0 };                                      \
    out0 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) zero_m,           \
                                   ( v16i8 ) in0, slide_val ); \
    out1 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) zero_m,           \
                                   ( v16i8 ) in1, slide_val ); \
}
#define SLDI_B2_0_UB( ... ) SLDI_B2_0( v16u8, __VA_ARGS__ )

/* Description : Immediate number of elements to slide
   Arguments   : Inputs  - in0_0, in0_1, in1_0, in1_1, slide_val
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Byte elements from 'in0_0' vector are slid into 'in1_0' by
                 the value specified in 'slide_val'
*/
#define SLDI_B2( RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val ) \
{                                                                           \
    out0 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) in0_0, ( v16i8 ) in1_0,        \
                                   slide_val );                             \
    out1 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) in0_1, ( v16i8 ) in1_1,        \
                                   slide_val );                             \
}
#define SLDI_B2_UB( ... ) SLDI_B2( v16u8, __VA_ARGS__ )

/* Description : Shuffle byte vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective byte elements from 'in0' & 'in1' are copied to
                 'out0' as per control vector 'mask0'
*/
#define VSHF_B2( RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1 ) \
{                                                                      \
    out0 = ( RTYPE ) __msa_vshf_b( ( v16i8 ) mask0,                    \
                                   ( v16i8 ) in1, ( v16i8 ) in0 );     \
    out1 = ( RTYPE ) __msa_vshf_b( ( v16i8 ) mask1,                    \
                                   ( v16i8 ) in3, ( v16i8 ) in2 );     \
}
#define VSHF_B2_UB( ... ) VSHF_B2( v16u8, __VA_ARGS__ )
#define VSHF_B2_SB( ... ) VSHF_B2( v16i8, __VA_ARGS__ )

/* Description : Shuffle halfword vector elements as per mask vector
   Arguments   : Inputs  - in0, in1, in2, in3, mask0, mask1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Selective halfword elements from 'in0' & 'in1' are copied
                 to 'out0' as per control vector 'mask0'
*/
#define VSHF_H2( RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1 ) \
{                                                                      \
    out0 = ( RTYPE ) __msa_vshf_h( ( v8i16 ) mask0,                    \
                                   ( v8i16 ) in1, ( v8i16 ) in0 );     \
    out1 = ( RTYPE ) __msa_vshf_h( ( v8i16 ) mask1,                    \
                                   ( v8i16 ) in3, ( v8i16 ) in2 );     \
}
#define VSHF_H2_SH( ... ) VSHF_H2( v8i16, __VA_ARGS__ )

/* Description : Dot product of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Unsigned byte elements from 'mult0' are multiplied with
                 unsigned byte elements from 'cnst0' producing a result
                 twice the size of the input, i.e. unsigned halfword.
                 The multiplication results of adjacent odd-even elements
                 are added together and written to the 'out0' vector
*/
#define DOTP_UB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 )        \
{                                                                        \
    out0 = ( RTYPE ) __msa_dotp_u_h( ( v16u8 ) mult0, ( v16u8 ) cnst0 ); \
    out1 = ( RTYPE ) __msa_dotp_u_h( ( v16u8 ) mult1, ( v16u8 ) cnst1 ); \
}
#define DOTP_UB2_UH( ... ) DOTP_UB2( v8u16, __VA_ARGS__ )

#define DOTP_UB4( RTYPE, mult0, mult1, mult2, mult3,           \
                  cnst0, cnst1, cnst2, cnst3,                  \
                  out0, out1, out2, out3 )                     \
{                                                              \
    DOTP_UB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 ); \
    DOTP_UB2( RTYPE, mult2, mult3, cnst2, cnst3, out2, out3 ); \
}
#define DOTP_UB4_UH( ... ) DOTP_UB4( v8u16, __VA_ARGS__ )

/* Description : Dot product & addition of byte vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed byte elements from 'mult0' are multiplied with
                 signed byte elements from 'cnst0' producing a result
                 twice the size of the input, i.e. signed halfword.
                 The multiplication results of adjacent odd-even elements
                 are added to the 'out0' vector
*/
#define DPADD_SB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 )        \
{                                                                         \
    out0 = ( RTYPE ) __msa_dpadd_s_h( ( v8i16 ) out0,                     \
                                      ( v16i8 ) mult0, ( v16i8 ) cnst0 ); \
    out1 = ( RTYPE ) __msa_dpadd_s_h( ( v8i16 ) out1,                     \
                                      ( v16i8 ) mult1, ( v16i8 ) cnst1 ); \
}
#define DPADD_SB2_SH( ... ) DPADD_SB2( v8i16, __VA_ARGS__ )

#define DPADD_SB4( RTYPE, mult0, mult1, mult2, mult3,                   \
                   cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3 ) \
{                                                                       \
    DPADD_SB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 );         \
    DPADD_SB2( RTYPE, mult2, mult3, cnst2, cnst3, out2, out3 );         \
}
#define DPADD_SB4_SH( ... ) DPADD_SB4( v8i16, __VA_ARGS__ )

/* Description : Dot product & addition of halfword vector elements
   Arguments   : Inputs  - mult0, mult1
                           cnst0, cnst1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'mult0' are multiplied with
                 signed halfword elements from 'cnst0' producing a result
                 twice the size of the input, i.e. signed word.
                 The multiplication results of adjacent odd-even elements
                 are added to the 'out0' vector
*/
#define DPADD_SH2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 )        \
{                                                                         \
    out0 = ( RTYPE ) __msa_dpadd_s_w( ( v4i32 ) out0,                     \
                                      ( v8i16 ) mult0, ( v8i16 ) cnst0 ); \
    out1 = ( RTYPE ) __msa_dpadd_s_w( ( v4i32 ) out1,                     \
                                      ( v8i16 ) mult1, ( v8i16 ) cnst1 ); \
}
#define DPADD_SH2_SW( ... ) DPADD_SH2( v4i32, __VA_ARGS__ )
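
/* Usage sketch (editorial note): a multi-tap filter usually accumulates
   with the DPADD_* macros; assuming source and coefficient vectors have
   been pre-interleaved so adjacent bytes form tap pairs (all names
   hypothetical):

       v8i16 sum0 = { 0 }, sum1 = { 0 };
       DPADD_SB2_SH( src0, src1, coef0, coef0, sum0, sum1 ); // taps 0-1
       DPADD_SB2_SH( src2, src3, coef1, coef1, sum0, sum1 ); // taps 2-3
*/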

/* Description : Clips all halfword elements of input vector between min & max
                 out = (in < min) ? min : ((in > max) ? max : in)
   Arguments   : Inputs - in, min, max
                 Output - out_m
                 Return Type - signed halfword
*/
#define CLIP_SH( in, min, max )                              \
( {                                                          \
    v8i16 out_m;                                             \
                                                             \
    out_m = __msa_max_s_h( ( v8i16 ) min, ( v8i16 ) in );    \
    out_m = __msa_min_s_h( ( v8i16 ) max, ( v8i16 ) out_m ); \
    out_m;                                                   \
} )

/* Description : Clips all signed halfword elements of input vector
                 between 0 & 255
   Arguments   : Input  - in
                 Output - out_m
                 Return Type - signed halfword
*/
#define CLIP_SH_0_255( in )                                    \
( {                                                            \
    v8i16 max_m = __msa_ldi_h( 255 );                          \
    v8i16 out_m;                                               \
                                                               \
    out_m = __msa_maxi_s_h( ( v8i16 ) in, 0 );                 \
    out_m = __msa_min_s_h( ( v8i16 ) max_m, ( v8i16 ) out_m ); \
    out_m;                                                     \
} )
#define CLIP_SH2_0_255( in0, in1 ) \
{                                  \
    in0 = CLIP_SH_0_255( in0 );    \
    in1 = CLIP_SH_0_255( in1 );    \
}
#define CLIP_SH4_0_255( in0, in1, in2, in3 ) \
{                                            \
    CLIP_SH2_0_255( in0, in1 );              \
    CLIP_SH2_0_255( in2, in3 );              \
}
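
/* Usage sketch (editorial note): after reconstruction (prediction plus
   residual) halfword results are clipped back to the 8-bit pixel range
   before repacking, e.g. with hypothetical row vectors:

       CLIP_SH2_0_255( rec0, rec1 );
       // then e.g. PCKEV_B2 below to repack halfwords into bytes
*/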

/* Description : Horizontal addition of 4 signed word elements of input vector
   Arguments   : Input  - in (signed word vector)
                 Output - sum_m (i32 sum)
                 Return Type - signed word (GP)
   Details     : 4 signed word elements of 'in' vector are added together and
                 the resulting integer sum is returned
*/
#define HADD_SW_S32( in )                                  \
( {                                                        \
    v2i64 res0_m, res1_m;                                  \
    int32_t i_sum_m;                                       \
                                                           \
    res0_m = __msa_hadd_s_d( ( v4i32 ) in, ( v4i32 ) in ); \
    res1_m = __msa_splati_d( res0_m, 1 );                  \
    res0_m = res0_m + res1_m;                              \
    i_sum_m = __msa_copy_s_w( ( v4i32 ) res0_m, 0 );       \
    i_sum_m;                                               \
} )

/* Description : Horizontal addition of 8 unsigned halfword elements of
                 input vector
   Arguments   : Input  - in (unsigned halfword vector)
                 Output - sum_m (u32 sum)
                 Return Type - unsigned word (GP)
   Details     : 8 unsigned halfword elements of 'in' vector are added
                 together and the resulting integer sum is returned
*/
#define HADD_UH_U32( in )                                     \
( {                                                           \
    v4u32 res_m;                                              \
    v2u64 res0_m, res1_m;                                     \
    uint32_t u_sum_m;                                         \
                                                              \
    res_m = __msa_hadd_u_w( ( v8u16 ) in, ( v8u16 ) in );     \
    res0_m = __msa_hadd_u_d( res_m, res_m );                  \
    res1_m = ( v2u64 ) __msa_splati_d( ( v2i64 ) res0_m, 1 ); \
    res0_m = res0_m + res1_m;                                 \
    u_sum_m = __msa_copy_u_w( ( v4i32 ) res0_m, 0 );          \
    u_sum_m;                                                  \
} )

/* Description : Horizontal addition of signed byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each signed odd byte element from 'in0' is added to the
                 adjacent even signed byte element from 'in0' (pairwise) and
                 the halfword result is written to 'out0'
*/
#define HADD_SB2( RTYPE, in0, in1, out0, out1 )                      \
{                                                                    \
    out0 = ( RTYPE ) __msa_hadd_s_h( ( v16i8 ) in0, ( v16i8 ) in0 ); \
    out1 = ( RTYPE ) __msa_hadd_s_h( ( v16i8 ) in1, ( v16i8 ) in1 ); \
}
#define HADD_SB4( RTYPE, in0, in1, in2, in3, out0, out1, out2, out3 ) \
{                                                                     \
    HADD_SB2( RTYPE, in0, in1, out0, out1 );                          \
    HADD_SB2( RTYPE, in2, in3, out2, out3 );                          \
}
#define HADD_SB4_SH( ... ) HADD_SB4( v8i16, __VA_ARGS__ )

/* Description : Horizontal addition of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned odd byte element from 'in0' is added to the
                 adjacent even unsigned byte element from 'in0' (pairwise)
                 and the halfword result is written to 'out0'
*/
#define HADD_UB2( RTYPE, in0, in1, out0, out1 )                      \
{                                                                    \
    out0 = ( RTYPE ) __msa_hadd_u_h( ( v16u8 ) in0, ( v16u8 ) in0 ); \
    out1 = ( RTYPE ) __msa_hadd_u_h( ( v16u8 ) in1, ( v16u8 ) in1 ); \
}
#define HADD_UB2_UH( ... ) HADD_UB2( v8u16, __VA_ARGS__ )

#define HADD_UB4( RTYPE, in0, in1, in2, in3, out0, out1, out2, out3 ) \
{                                                                     \
    HADD_UB2( RTYPE, in0, in1, out0, out1 );                          \
    HADD_UB2( RTYPE, in2, in3, out2, out3 );                          \
}
#define HADD_UB4_UH( ... ) HADD_UB4( v8u16, __VA_ARGS__ )

/* Description : Horizontal subtraction of unsigned byte vector elements
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Each unsigned even byte element from 'in0' is subtracted
                 from the adjacent odd unsigned byte element of 'in0'
                 (pairwise) and the signed halfword result is written to
                 'out0'
*/
#define HSUB_UB2( RTYPE, in0, in1, out0, out1 )                      \
{                                                                    \
    out0 = ( RTYPE ) __msa_hsub_u_h( ( v16u8 ) in0, ( v16u8 ) in0 ); \
    out1 = ( RTYPE ) __msa_hsub_u_h( ( v16u8 ) in1, ( v16u8 ) in1 ); \
}
#define HSUB_UB2_SH( ... ) HSUB_UB2( v8i16, __VA_ARGS__ )

#define HSUB_UB4( RTYPE, in0, in1, in2, in3, out0, out1, out2, out3 ) \
{                                                                     \
    HSUB_UB2( RTYPE, in0, in1, out0, out1 );                          \
    HSUB_UB2( RTYPE, in2, in3, out2, out3 );                          \
}
#define HSUB_UB4_SH( ... ) HSUB_UB4( v8i16, __VA_ARGS__ )

/* Description : SAD (Sum of Absolute Differences)
   Arguments   : Inputs  - in0, in1, ref0, ref1
                 Outputs - sad_m (halfword vector)
                 Return Type - unsigned halfword
   Details     : The absolute differences of all byte elements from 'in0'
                 with 'ref0' are calculated and kept in 'diff0_m'. The
                 even-odd pairs are then added together to generate 8
                 halfword results.
*/
#define SAD_UB2_UH( in0, in1, ref0, ref1 )                           \
( {                                                                  \
    v16u8 diff0_m, diff1_m;                                          \
    v8u16 sad_m = { 0 };                                             \
                                                                     \
    diff0_m = __msa_asub_u_b( ( v16u8 ) in0, ( v16u8 ) ref0 );       \
    diff1_m = __msa_asub_u_b( ( v16u8 ) in1, ( v16u8 ) ref1 );       \
                                                                     \
    sad_m += __msa_hadd_u_h( ( v16u8 ) diff0_m, ( v16u8 ) diff0_m ); \
    sad_m += __msa_hadd_u_h( ( v16u8 ) diff1_m, ( v16u8 ) diff1_m ); \
                                                                     \
    sad_m;                                                           \
} )
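
/* Usage sketch (editorial note): a block SAD accumulates the per-pair
   halfword sums and reduces them once at the end; for a 16x4 block
   (hypothetical names):

       v16u8 s0, s1, r0, r1;
       v8u16 sad;
       LD_UB2( p_src, i_src_stride, s0, s1 );
       LD_UB2( p_ref, i_ref_stride, r0, r1 );
       sad = SAD_UB2_UH( s0, s1, r0, r1 );
       // ... load the next two rows, add SAD_UB2_UH( ... ) into 'sad' ...
       uint32_t u_sad = HADD_UH_U32( sad );
*/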

/* Description : Set element n of input vector to GPR value
   Arguments   : Inputs - in0, in1, in2, in3 (GP values to insert)
                 Output - out (output vector)
                 Return Type - as per RTYPE
   Details     : Set element 0 in vector 'out' to the value specified in 'in0'
*/
#define INSERT_W2( RTYPE, in0, in1, out )                    \
{                                                            \
    out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 0, in0 ); \
    out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 1, in1 ); \
}
#define INSERT_W2_SB( ... ) INSERT_W2( v16i8, __VA_ARGS__ )

#define INSERT_W4( RTYPE, in0, in1, in2, in3, out )          \
{                                                            \
    out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 0, in0 ); \
    out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 1, in1 ); \
    out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 2, in2 ); \
    out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 3, in3 ); \
}
#define INSERT_W4_UB( ... ) INSERT_W4( v16u8, __VA_ARGS__ )
#define INSERT_W4_SB( ... ) INSERT_W4( v16i8, __VA_ARGS__ )

#define INSERT_D2( RTYPE, in0, in1, out )                    \
{                                                            \
    out = ( RTYPE ) __msa_insert_d( ( v2i64 ) out, 0, in0 ); \
    out = ( RTYPE ) __msa_insert_d( ( v2i64 ) out, 1, in1 ); \
}
#define INSERT_D2_UB( ... ) INSERT_D2( v16u8, __VA_ARGS__ )

/* Description : Interleave even halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'
*/
#define ILVEV_H2( RTYPE, in0, in1, in2, in3, out0, out1 )           \
{                                                                   \
    out0 = ( RTYPE ) __msa_ilvev_h( ( v8i16 ) in1, ( v8i16 ) in0 ); \
    out1 = ( RTYPE ) __msa_ilvev_h( ( v8i16 ) in3, ( v8i16 ) in2 ); \
}
#define ILVEV_H2_UB( ... ) ILVEV_H2( v16u8, __VA_ARGS__ )

/* Description : Interleave even double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'
*/
#define ILVEV_D2( RTYPE, in0, in1, in2, in3, out0, out1 )           \
{                                                                   \
    out0 = ( RTYPE ) __msa_ilvev_d( ( v2i64 ) in1, ( v2i64 ) in0 ); \
    out1 = ( RTYPE ) __msa_ilvev_d( ( v2i64 ) in3, ( v2i64 ) in2 ); \
}
#define ILVEV_D2_UB( ... ) ILVEV_D2( v16u8, __VA_ARGS__ )

/* Description : Interleave left half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of byte elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'.
*/
#define ILVL_B2( RTYPE, in0, in1, in2, in3, out0, out1 )           \
{                                                                  \
    out0 = ( RTYPE ) __msa_ilvl_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \
    out1 = ( RTYPE ) __msa_ilvl_b( ( v16i8 ) in2, ( v16i8 ) in3 ); \
}
#define ILVL_B2_UH( ... ) ILVL_B2( v8u16, __VA_ARGS__ )
#define ILVL_B2_SH( ... ) ILVL_B2( v8i16, __VA_ARGS__ )

#define ILVL_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3 )                       \
{                                                               \
    ILVL_B2( RTYPE, in0, in1, in2, in3, out0, out1 );           \
    ILVL_B2( RTYPE, in4, in5, in6, in7, out2, out3 );           \
}
#define ILVL_B4_UB( ... ) ILVL_B4( v16u8, __VA_ARGS__ )
#define ILVL_B4_SB( ... ) ILVL_B4( v16i8, __VA_ARGS__ )
#define ILVL_B4_UH( ... ) ILVL_B4( v8u16, __VA_ARGS__ )
#define ILVL_B4_SH( ... ) ILVL_B4( v8i16, __VA_ARGS__ )

/* Description : Interleave left half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of halfword elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'.
*/
#define ILVL_H2( RTYPE, in0, in1, in2, in3, out0, out1 )           \
{                                                                  \
    out0 = ( RTYPE ) __msa_ilvl_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \
    out1 = ( RTYPE ) __msa_ilvl_h( ( v8i16 ) in2, ( v8i16 ) in3 ); \
}
#define ILVL_H2_SH( ... ) ILVL_H2( v8i16, __VA_ARGS__ )
#define ILVL_H2_SW( ... ) ILVL_H2( v4i32, __VA_ARGS__ )

#define ILVL_H4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3 )                       \
{                                                               \
    ILVL_H2( RTYPE, in0, in1, in2, in3, out0, out1 );           \
    ILVL_H2( RTYPE, in4, in5, in6, in7, out2, out3 );           \
}
#define ILVL_H4_SW( ... ) ILVL_H4( v4i32, __VA_ARGS__ )

/* Description : Interleave left half of word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Left half of word elements of 'in0' and 'in1' are interleaved
                 and written to 'out0'.
*/
#define ILVL_W2( RTYPE, in0, in1, in2, in3, out0, out1 )           \
{                                                                  \
    out0 = ( RTYPE ) __msa_ilvl_w( ( v4i32 ) in0, ( v4i32 ) in1 ); \
    out1 = ( RTYPE ) __msa_ilvl_w( ( v4i32 ) in2, ( v4i32 ) in3 ); \
}
#define ILVL_W2_SH( ... ) ILVL_W2( v8i16, __VA_ARGS__ )

/* Description : Interleave right half of byte elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'.
*/
#define ILVR_B2( RTYPE, in0, in1, in2, in3, out0, out1 )           \
{                                                                  \
    out0 = ( RTYPE ) __msa_ilvr_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \
    out1 = ( RTYPE ) __msa_ilvr_b( ( v16i8 ) in2, ( v16i8 ) in3 ); \
}
#define ILVR_B2_SB( ... ) ILVR_B2( v16i8, __VA_ARGS__ )
#define ILVR_B2_UH( ... ) ILVR_B2( v8u16, __VA_ARGS__ )
#define ILVR_B2_SH( ... ) ILVR_B2( v8i16, __VA_ARGS__ )

#define ILVR_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3 )                       \
{                                                               \
    ILVR_B2( RTYPE, in0, in1, in2, in3, out0, out1 );           \
    ILVR_B2( RTYPE, in4, in5, in6, in7, out2, out3 );           \
}
#define ILVR_B4_UB( ... ) ILVR_B4( v16u8, __VA_ARGS__ )
#define ILVR_B4_SB( ... ) ILVR_B4( v16i8, __VA_ARGS__ )
#define ILVR_B4_UH( ... ) ILVR_B4( v8u16, __VA_ARGS__ )
#define ILVR_B4_SH( ... ) ILVR_B4( v8i16, __VA_ARGS__ )

/* Description : Interleave right half of halfword elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of halfword elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'.
*/
#define ILVR_H2( RTYPE, in0, in1, in2, in3, out0, out1 )           \
{                                                                  \
    out0 = ( RTYPE ) __msa_ilvr_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \
    out1 = ( RTYPE ) __msa_ilvr_h( ( v8i16 ) in2, ( v8i16 ) in3 ); \
}
#define ILVR_H2_SH( ... ) ILVR_H2( v8i16, __VA_ARGS__ )
#define ILVR_H2_SW( ... ) ILVR_H2( v4i32, __VA_ARGS__ )

#define ILVR_H4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3 )                       \
{                                                               \
    ILVR_H2( RTYPE, in0, in1, in2, in3, out0, out1 );           \
    ILVR_H2( RTYPE, in4, in5, in6, in7, out2, out3 );           \
}
#define ILVR_H4_SH( ... ) ILVR_H4( v8i16, __VA_ARGS__ )
#define ILVR_H4_SW( ... ) ILVR_H4( v4i32, __VA_ARGS__ )

#define ILVR_W2( RTYPE, in0, in1, in2, in3, out0, out1 )           \
{                                                                  \
    out0 = ( RTYPE ) __msa_ilvr_w( ( v4i32 ) in0, ( v4i32 ) in1 ); \
    out1 = ( RTYPE ) __msa_ilvr_w( ( v4i32 ) in2, ( v4i32 ) in3 ); \
}
#define ILVR_W2_SH( ... ) ILVR_W2( v8i16, __VA_ARGS__ )

/* Description : Interleave right half of double word elements from vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of double word elements of 'in0' and 'in1' are
                 interleaved and written to 'out0'.
*/
#define ILVR_D2( RTYPE, in0, in1, in2, in3, out0, out1 )                   \
{                                                                          \
    out0 = ( RTYPE ) __msa_ilvr_d( ( v2i64 ) ( in0 ), ( v2i64 ) ( in1 ) ); \
    out1 = ( RTYPE ) __msa_ilvr_d( ( v2i64 ) ( in2 ), ( v2i64 ) ( in3 ) ); \
}
#define ILVR_D2_UB( ... ) ILVR_D2( v16u8, __VA_ARGS__ )
#define ILVR_D2_SB( ... ) ILVR_D2( v16i8, __VA_ARGS__ )
#define ILVR_D2_SH( ... ) ILVR_D2( v8i16, __VA_ARGS__ )

#define ILVR_D4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                 out0, out1, out2, out3 )                       \
{                                                               \
    ILVR_D2( RTYPE, in0, in1, in2, in3, out0, out1 );           \
    ILVR_D2( RTYPE, in4, in5, in6, in7, out2, out3 );           \
}
#define ILVR_D4_UB( ... ) ILVR_D4( v16u8, __VA_ARGS__ )

/* Description : Interleave both left and right half of input vectors
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Right half of byte elements from 'in0' and 'in1' are
                 interleaved and written to 'out0'; the left halves are
                 interleaved and written to 'out1'
*/
#define ILVRL_B2( RTYPE, in0, in1, out0, out1 )                    \
{                                                                  \
    out0 = ( RTYPE ) __msa_ilvr_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \
    out1 = ( RTYPE ) __msa_ilvl_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \
}
#define ILVRL_B2_UB( ... ) ILVRL_B2( v16u8, __VA_ARGS__ )
#define ILVRL_B2_SB( ... ) ILVRL_B2( v16i8, __VA_ARGS__ )
#define ILVRL_B2_UH( ... ) ILVRL_B2( v8u16, __VA_ARGS__ )
#define ILVRL_B2_SH( ... ) ILVRL_B2( v8i16, __VA_ARGS__ )
#define ILVRL_B2_SW( ... ) ILVRL_B2( v4i32, __VA_ARGS__ )

#define ILVRL_H2( RTYPE, in0, in1, out0, out1 )                    \
{                                                                  \
    out0 = ( RTYPE ) __msa_ilvr_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \
    out1 = ( RTYPE ) __msa_ilvl_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \
}
#define ILVRL_H2_SH( ... ) ILVRL_H2( v8i16, __VA_ARGS__ )
#define ILVRL_H2_SW( ... ) ILVRL_H2( v4i32, __VA_ARGS__ )

#define ILVRL_W2( RTYPE, in0, in1, out0, out1 )                    \
{                                                                  \
    out0 = ( RTYPE ) __msa_ilvr_w( ( v4i32 ) in0, ( v4i32 ) in1 ); \
    out1 = ( RTYPE ) __msa_ilvl_w( ( v4i32 ) in0, ( v4i32 ) in1 ); \
}
#define ILVRL_W2_SH( ... ) ILVRL_W2( v8i16, __VA_ARGS__ )
#define ILVRL_W2_SW( ... ) ILVRL_W2( v4i32, __VA_ARGS__ )

/* Description : Maximum values between signed elements of vector and
                 5-bit signed immediate value are copied to the output vector
   Arguments   : Inputs  - in0, in1, in2, in3, max_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : The maximum of the signed halfword element values from 'in0'
                 and 'max_val' is written in place
*/
#define MAXI_SH2( RTYPE, in0, in1, max_val )                      \
{                                                                 \
    in0 = ( RTYPE ) __msa_maxi_s_h( ( v8i16 ) in0, ( max_val ) ); \
    in1 = ( RTYPE ) __msa_maxi_s_h( ( v8i16 ) in1, ( max_val ) ); \
}
#define MAXI_SH2_UH( ... ) MAXI_SH2( v8u16, __VA_ARGS__ )
#define MAXI_SH2_SH( ... ) MAXI_SH2( v8i16, __VA_ARGS__ )

#define MAXI_SH4( RTYPE, in0, in1, in2, in3, max_val ) \
{                                                      \
    MAXI_SH2( RTYPE, in0, in1, max_val );              \
    MAXI_SH2( RTYPE, in2, in3, max_val );              \
}
#define MAXI_SH4_UH( ... ) MAXI_SH4( v8u16, __VA_ARGS__ )

/* Description : Saturate the halfword element values to the max
                 unsigned value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each unsigned halfword element from 'in0' is saturated to
                 the value generated with the (sat_val + 1) bit range.
                 The results are written in place
*/
#define SAT_UH2( RTYPE, in0, in1, sat_val )                  \
{                                                            \
    in0 = ( RTYPE ) __msa_sat_u_h( ( v8u16 ) in0, sat_val ); \
    in1 = ( RTYPE ) __msa_sat_u_h( ( v8u16 ) in1, sat_val ); \
}
#define SAT_UH2_UH( ... ) SAT_UH2( v8u16, __VA_ARGS__ )

#define SAT_UH4( RTYPE, in0, in1, in2, in3, sat_val ) \
{                                                     \
    SAT_UH2( RTYPE, in0, in1, sat_val );              \
    SAT_UH2( RTYPE, in2, in3, sat_val );              \
}
#define SAT_UH4_UH( ... ) SAT_UH4( v8u16, __VA_ARGS__ )

/* Description : Saturate the halfword element values to the max
                 signed value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each signed halfword element from 'in0' is saturated to the
                 value generated with the (sat_val + 1) bit range
                 The results are written in place
*/
#define SAT_SH2( RTYPE, in0, in1, sat_val )                  \
{                                                            \
    in0 = ( RTYPE ) __msa_sat_s_h( ( v8i16 ) in0, sat_val ); \
    in1 = ( RTYPE ) __msa_sat_s_h( ( v8i16 ) in1, sat_val ); \
}
#define SAT_SH2_SH( ... ) SAT_SH2( v8i16, __VA_ARGS__ )

#define SAT_SH4( RTYPE, in0, in1, in2, in3, sat_val ) \
{                                                     \
    SAT_SH2( RTYPE, in0, in1, sat_val );              \
    SAT_SH2( RTYPE, in2, in3, sat_val );              \
}
#define SAT_SH4_SH( ... ) SAT_SH4( v8i16, __VA_ARGS__ )

/* Description : Saturate the word element values to the max
                 signed value of (sat_val + 1) bits
                 The element data width remains unchanged
   Arguments   : Inputs  - in0, in1, sat_val
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each signed word element from 'in0' is saturated to the
                 value generated with the (sat_val + 1) bit range
                 The results are written in place
*/
#define SAT_SW2( RTYPE, in0, in1, sat_val )                  \
{                                                            \
    in0 = ( RTYPE ) __msa_sat_s_w( ( v4i32 ) in0, sat_val ); \
    in1 = ( RTYPE ) __msa_sat_s_w( ( v4i32 ) in1, sat_val ); \
}
#define SAT_SW2_SW( ... ) SAT_SW2( v4i32, __VA_ARGS__ )

/* Description : Pack even byte elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even byte elements of 'in0' are copied to the left half of
                 'out0' & even byte elements of 'in1' are copied to the right
                 half of 'out0'.
*/
#define PCKEV_B2( RTYPE, in0, in1, in2, in3, out0, out1 )           \
{                                                                   \
    out0 = ( RTYPE ) __msa_pckev_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \
    out1 = ( RTYPE ) __msa_pckev_b( ( v16i8 ) in2, ( v16i8 ) in3 ); \
}
#define PCKEV_B2_SB( ... ) PCKEV_B2( v16i8, __VA_ARGS__ )
#define PCKEV_B2_UB( ... ) PCKEV_B2( v16u8, __VA_ARGS__ )
#define PCKEV_B2_SH( ... ) PCKEV_B2( v8i16, __VA_ARGS__ )
#define PCKEV_B2_SW( ... ) PCKEV_B2( v4i32, __VA_ARGS__ )

#define PCKEV_B3( RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2 ) \
{                                                                         \
    PCKEV_B2( RTYPE, in0, in1, in2, in3, out0, out1 );                    \
    out2 = ( RTYPE ) __msa_pckev_b( ( v16i8 ) in4, ( v16i8 ) in5 );       \
}
#define PCKEV_B3_UB( ... ) PCKEV_B3( v16u8, __VA_ARGS__ )

#define PCKEV_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                  out0, out1, out2, out3 )                       \
{                                                                \
    PCKEV_B2( RTYPE, in0, in1, in2, in3, out0, out1 );           \
    PCKEV_B2( RTYPE, in4, in5, in6, in7, out2, out3 );           \
}
#define PCKEV_B4_SB( ... ) PCKEV_B4( v16i8, __VA_ARGS__ )
#define PCKEV_B4_UB( ... ) PCKEV_B4( v16u8, __VA_ARGS__ )

/* Description : Pack even halfword elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even halfword elements of 'in0' are copied to the left half
                 of 'out0' & even halfword elements of 'in1' are copied to
                 the right half of 'out0'.
*/
#define PCKEV_H2( RTYPE, in0, in1, in2, in3, out0, out1 )           \
{                                                                   \
    out0 = ( RTYPE ) __msa_pckev_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \
    out1 = ( RTYPE ) __msa_pckev_h( ( v8i16 ) in2, ( v8i16 ) in3 ); \
}
#define PCKEV_H2_SH( ... ) PCKEV_H2( v8i16, __VA_ARGS__ )

#define PCKEV_H4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                  out0, out1, out2, out3 )                       \
{                                                                \
    PCKEV_H2( RTYPE, in0, in1, in2, in3, out0, out1 );           \
    PCKEV_H2( RTYPE, in4, in5, in6, in7, out2, out3 );           \
}
#define PCKEV_H4_SH( ... ) PCKEV_H4( v8i16, __VA_ARGS__ )

/* Description : Pack even double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' are copied to the left
                 half of 'out0' & even double word elements of 'in1' are
                 copied to the right half of 'out0'.
*/
#define PCKEV_D2( RTYPE, in0, in1, in2, in3, out0, out1 )           \
{                                                                   \
    out0 = ( RTYPE ) __msa_pckev_d( ( v2i64 ) in0, ( v2i64 ) in1 ); \
    out1 = ( RTYPE ) __msa_pckev_d( ( v2i64 ) in2, ( v2i64 ) in3 ); \
}
#define PCKEV_D2_UB( ... ) PCKEV_D2( v16u8, __VA_ARGS__ )

#define PCKEV_D4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                  out0, out1, out2, out3 )                       \
{                                                                \
    PCKEV_D2( RTYPE, in0, in1, in2, in3, out0, out1 );           \
    PCKEV_D2( RTYPE, in4, in5, in6, in7, out2, out3 );           \
}
#define PCKEV_D4_UB( ... ) PCKEV_D4( v16u8, __VA_ARGS__ )

/* Description : Pack odd byte elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Odd byte elements of 'in0' are copied to the left half of
                 'out0' & odd byte elements of 'in1' are copied to the right
                 half of 'out0'.
*/
#define PCKOD_B2( RTYPE, in0, in1, in2, in3, out0, out1 )           \
{                                                                   \
    out0 = ( RTYPE ) __msa_pckod_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \
    out1 = ( RTYPE ) __msa_pckod_b( ( v16i8 ) in2, ( v16i8 ) in3 ); \
}
#define PCKOD_B2_UB( ... ) PCKOD_B2( v16u8, __VA_ARGS__ )

#define PCKOD_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                  out0, out1, out2, out3 )                       \
{                                                                \
    PCKOD_B2( RTYPE, in0, in1, in2, in3, out0, out1 );           \
    PCKOD_B2( RTYPE, in4, in5, in6, in7, out2, out3 );           \
}
#define PCKOD_B4_UB( ... ) PCKOD_B4( v16u8, __VA_ARGS__ )

/* Description : Pack odd double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Odd double word elements of 'in0' are copied to the left
                 half of 'out0' & odd double word elements of 'in1' are
                 copied to the right half of 'out0'.
*/
#define PCKOD_D2( RTYPE, in0, in1, in2, in3, out0, out1 )           \
{                                                                   \
    out0 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) in0, ( v2i64 ) in1 ); \
    out1 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) in2, ( v2i64 ) in3 ); \
}
#define PCKOD_D2_SH( ... ) PCKOD_D2( v8i16, __VA_ARGS__ )
#define PCKOD_D2_SD( ... ) PCKOD_D2( v2i64, __VA_ARGS__ )

/* Description : Each byte element is logically xor'ed with immediate 128
   Arguments   : Inputs  - in0, in1
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each unsigned byte element from input vector 'in0' is
                 logically xor'ed with 128 and the result is stored in-place.
*/
#define XORI_B2_128( RTYPE, in0, in1 )                  \
{                                                       \
    in0 = ( RTYPE ) __msa_xori_b( ( v16u8 ) in0, 128 ); \
    in1 = ( RTYPE ) __msa_xori_b( ( v16u8 ) in1, 128 ); \
}
#define XORI_B2_128_UB( ... ) XORI_B2_128( v16u8, __VA_ARGS__ )
#define XORI_B2_128_SB( ... ) XORI_B2_128( v16i8, __VA_ARGS__ )

#define XORI_B3_128( RTYPE, in0, in1, in2 )             \
{                                                       \
    XORI_B2_128( RTYPE, in0, in1 );                     \
    in2 = ( RTYPE ) __msa_xori_b( ( v16u8 ) in2, 128 ); \
}
#define XORI_B3_128_SB( ... ) XORI_B3_128( v16i8, __VA_ARGS__ )

#define XORI_B4_128( RTYPE, in0, in1, in2, in3 ) \
{                                                \
    XORI_B2_128( RTYPE, in0, in1 );              \
    XORI_B2_128( RTYPE, in2, in3 );              \
}
#define XORI_B4_128_UB( ... ) XORI_B4_128( v16u8, __VA_ARGS__ )
#define XORI_B4_128_SB( ... ) XORI_B4_128( v16i8, __VA_ARGS__ )

#define XORI_B5_128( RTYPE, in0, in1, in2, in3, in4 ) \
{                                                     \
    XORI_B3_128( RTYPE, in0, in1, in2 );              \
    XORI_B2_128( RTYPE, in3, in4 );                   \
}
#define XORI_B5_128_SB( ... ) XORI_B5_128( v16i8, __VA_ARGS__ )
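
/* Usage sketch (editorial note): XOR with 128 flips bit 7, mapping unsigned
   pixels [0,255] onto signed bytes [-128,127]; since the operation is its
   own inverse, the same macros also remove the bias. This lets unsigned
   pixel data feed the signed dot-product macros above (hypothetical names):

       XORI_B2_128_SB( src0, src1 );  // bias pixels to the signed range
       // ... signed filtering, e.g. DPADD_SB2_SH ...
*/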

/* Description : Addition of signed halfword elements and signed saturation
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Signed halfword elements from 'in0' are added to signed
                 halfword elements of 'in1'. The result is then saturated to
                 the signed halfword range
*/
#define ADDS_SH2( RTYPE, in0, in1, in2, in3, out0, out1 )            \
{                                                                    \
    out0 = ( RTYPE ) __msa_adds_s_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \
    out1 = ( RTYPE ) __msa_adds_s_h( ( v8i16 ) in2, ( v8i16 ) in3 ); \
}
#define ADDS_SH2_SH( ... ) ADDS_SH2( v8i16, __VA_ARGS__ )

#define ADDS_SH4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
                  out0, out1, out2, out3 )                       \
{                                                                \
    ADDS_SH2( RTYPE, in0, in1, in2, in3, out0, out1 );           \
    ADDS_SH2( RTYPE, in4, in5, in6, in7, out2, out3 );           \
}
#define ADDS_SH4_UH( ... ) ADDS_SH4( v8u16, __VA_ARGS__ )

/* Description : Shift left all elements of vector (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in place operation
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is left shifted by 'shift' and
                 the result is written in-place.
*/
#define SLLI_4V( in0, in1, in2, in3, shift ) \
{                                            \
    in0 = in0 << shift;                      \
    in1 = in1 << shift;                      \
    in2 = in2 << shift;                      \
    in3 = in3 << shift;                      \
}

/* Description : Arithmetic shift right all elements of vector
                 (generic for all data types)
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in place operation
                 Return Type - as per input vector RTYPE
   Details     : Each element of vector 'in0' is right shifted by 'shift' and
                 the result is written in-place. 'shift' is a GP variable.
*/
#define SRA_4V( in0, in1, in2, in3, shift ) \
{                                           \
    in0 = in0 >> shift;                     \
    in1 = in1 >> shift;                     \
    in2 = in2 >> shift;                     \
    in3 = in3 >> shift;                     \
}

/* Description : Shift right arithmetic rounded halfwords
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetically
                 by the number of bits the respective element holds in vector
                 'shift'. The last discarded bit is added to the shifted value
                 for rounding and the result is written in-place.
                 'shift' is a vector.
*/
#define SRAR_H2( RTYPE, in0, in1, shift )                           \
{                                                                   \
    in0 = ( RTYPE ) __msa_srar_h( ( v8i16 ) in0, ( v8i16 ) shift ); \
    in1 = ( RTYPE ) __msa_srar_h( ( v8i16 ) in1, ( v8i16 ) shift ); \
}
#define SRAR_H2_SH( ... ) SRAR_H2( v8i16, __VA_ARGS__ )

#define SRAR_H4( RTYPE, in0, in1, in2, in3, shift ) \
{                                                   \
    SRAR_H2( RTYPE, in0, in1, shift );              \
    SRAR_H2( RTYPE, in2, in3, shift );              \
}
#define SRAR_H4_SH( ... ) SRAR_H4( v8i16, __VA_ARGS__ )

/* Description : Shift right logical all halfword elements of vector
   Arguments   : Inputs  - in0, in1, in2, in3, shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right logically by
                 the number of bits the respective element holds in vector
                 'shift' and the result is stored in-place. 'shift' is a
                 vector.
*/
#define SRL_H4( RTYPE, in0, in1, in2, in3, shift )                 \
{                                                                  \
    in0 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in0, ( v8i16 ) shift ); \
    in1 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in1, ( v8i16 ) shift ); \
    in2 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in2, ( v8i16 ) shift ); \
    in3 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in3, ( v8i16 ) shift ); \
}
#define SRL_H4_UH( ... ) SRL_H4( v8u16, __VA_ARGS__ )

/* Description : Shift right arithmetic rounded (immediate)
   Arguments   : Inputs  - in0, in1, shift
                 Outputs - in place operation
                 Return Type - as per RTYPE
   Details     : Each element of vector 'in0' is shifted right arithmetically
                 by the value in 'shift'. The last discarded bit is added to
                 the shifted value for rounding and the result is written
                 in-place. 'shift' is an immediate value.
*/
#define SRARI_H2( RTYPE, in0, in1, shift )                 \
{                                                          \
    in0 = ( RTYPE ) __msa_srari_h( ( v8i16 ) in0, shift ); \
    in1 = ( RTYPE ) __msa_srari_h( ( v8i16 ) in1, shift ); \
}
#define SRARI_H2_UH( ... ) SRARI_H2( v8u16, __VA_ARGS__ )
#define SRARI_H2_SH( ... ) SRARI_H2( v8i16, __VA_ARGS__ )

#define SRARI_H4( RTYPE, in0, in1, in2, in3, shift ) \
{                                                    \
    SRARI_H2( RTYPE, in0, in1, shift );              \
    SRARI_H2( RTYPE, in2, in3, shift );              \
}
#define SRARI_H4_UH( ... ) SRARI_H4( v8u16, __VA_ARGS__ )
#define SRARI_H4_SH( ... ) SRARI_H4( v8i16, __VA_ARGS__ )

#define SRARI_W2( RTYPE, in0, in1, shift )                 \
{                                                          \
    in0 = ( RTYPE ) __msa_srari_w( ( v4i32 ) in0, shift ); \
    in1 = ( RTYPE ) __msa_srari_w( ( v4i32 ) in1, shift ); \
}
#define SRARI_W2_SW( ... ) SRARI_W2( v4i32, __VA_ARGS__ )

#define SRARI_W4( RTYPE, in0, in1, in2, in3, shift ) \
{                                                    \
    SRARI_W2( RTYPE, in0, in1, shift );              \
    SRARI_W2( RTYPE, in2, in3, shift );              \
}
#define SRARI_W4_SW( ... ) SRARI_W4( v4i32, __VA_ARGS__ )
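
/* Usage sketch (editorial note): interpolation filters descale their sums
   with a rounding arithmetic shift; e.g. the H.264 luma half-pel step
   (sum + 16) >> 5 maps onto (hypothetical filter sums):

       SRARI_H2_SH( sum0, sum1, 5 );
*/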
1506
1507
/* Description : Multiplication of pairs of vectors
1508
Arguments : Inputs - in0, in1, in2, in3
1509
Outputs - out0, out1
1510
Details : Each element from 'in0' is multiplied with elements from 'in1'
1511
and the result is written to 'out0'
1512
*/
1513
#define MUL2( in0, in1, in2, in3, out0, out1 ) \
1514
{ \
1515
out0 = in0 * in1; \
1516
out1 = in2 * in3; \
1517
}
1518
#define MUL4( in0, in1, in2, in3, in4, in5, in6, in7, \
1519
out0, out1, out2, out3 ) \
1520
{ \
1521
MUL2( in0, in1, in2, in3, out0, out1 ); \
1522
MUL2( in4, in5, in6, in7, out2, out3 ); \
1523
}
/* Description : Addition of 2 pairs of vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1
   Details     : Each element of 'in0' is added to the corresponding element
                 of 'in1' and the result is written to 'out0';
                 likewise 'in2' + 'in3' is written to 'out1'
*/
#define ADD2( in0, in1, in2, in3, out0, out1 )  \
{                                               \
    out0 = in0 + in1;                           \
    out1 = in2 + in3;                           \
}
#define ADD4( in0, in1, in2, in3, in4, in5, in6, in7,  \
              out0, out1, out2, out3 )                 \
{                                                      \
    ADD2( in0, in1, in2, in3, out0, out1 );            \
    ADD2( in4, in5, in6, in7, out2, out3 );            \
}

#define SUB4( in0, in1, in2, in3, in4, in5, in6, in7,  \
              out0, out1, out2, out3 )                 \
{                                                      \
    out0 = in0 - in1;                                  \
    out1 = in2 - in3;                                  \
    out2 = in4 - in5;                                  \
    out3 = in6 - in7;                                  \
}

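/* Usage sketch (illustrative only; names are hypothetical). Like MUL2/MUL4,
   these are element-wise vector arithmetic with no saturation:
     v8i16 a0, b0, a1, b1, s0, s1;
     ADD2( a0, b0, a1, b1, s0, s1 );  // s0[i] = a0[i] + b0[i]
*/
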
/* Description : Sign extend halfword elements from right half of the vector
   Arguments   : Input  - in  (halfword vector)
                 Output - out (sign extended word vector)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved with the same vector 'in' to
                 generate 4 word elements, keeping the sign intact
*/
#define UNPCK_R_SH_SW( in, out )                           \
{                                                          \
    v8i16 sign_m;                                          \
                                                           \
    sign_m = __msa_clti_s_h( ( v8i16 ) in, 0 );            \
    out = ( v4i32 ) __msa_ilvr_h( sign_m, ( v8i16 ) in );  \
}

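/* Usage sketch (illustrative only). __msa_clti_s_h() yields all-ones in the
   negative lanes, so interleaving it with 'in' widens the four lowest
   halfwords with the correct sign:
     v8i16 h;                // e.g. low lanes { -3, 7, ... }
     v4i32 w;
     UNPCK_R_SH_SW( h, w );  // w = { -3, 7, ... } as 32-bit words
*/
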
/* Description : Zero extend unsigned byte elements to halfword elements
   Arguments   : Input   - in          (unsigned byte vector)
                 Outputs - out0, out1  (unsigned halfword vectors)
                 Return Type - signed halfword
   Details     : Zero extended right half of vector is returned in 'out0'
                 Zero extended left half of vector is returned in 'out1'
*/
#define UNPCK_UB_SH( in, out0, out1 )       \
{                                           \
    v16i8 zero_m = { 0 };                   \
                                            \
    ILVRL_B2_SH( zero_m, in, out0, out1 );  \
}

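/* Usage sketch (illustrative only): widening 16 pixels before arithmetic
   that would overflow in 8 bits:
     v16u8 px;                   // 16 packed pixels
     v8i16 lo, hi;
     UNPCK_UB_SH( px, lo, hi );  // lo = pixels 0..7, hi = pixels 8..15
*/
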
/* Description : Sign extend halfword elements from input vector and return
                 the result in pair of vectors
   Arguments   : Input   - in          (halfword vector)
                 Outputs - out0, out1  (sign extended word vectors)
                 Return Type - signed word
   Details     : Sign bit of halfword elements from input vector 'in' is
                 extracted and interleaved right with the same vector 'in'
                 to generate 4 signed word elements in 'out0'
                 Then interleaved left with the same vector 'in' to
                 generate 4 signed word elements in 'out1'
*/
#define UNPCK_SH_SW( in, out0, out1 )           \
{                                               \
    v8i16 tmp_m;                                \
                                                \
    tmp_m = __msa_clti_s_h( ( v8i16 ) in, 0 );  \
    ILVRL_H2_SW( tmp_m, in, out0, out1 );       \
}

/* Description : Butterfly of 4 input vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
   Details     : Butterfly operation
*/
#define BUTTERFLY_4( in0, in1, in2, in3, out0, out1, out2, out3 )  \
{                                                                  \
    out0 = in0 + in3;                                              \
    out1 = in1 + in2;                                              \
                                                                   \
    out2 = in1 - in2;                                              \
    out3 = in0 - in3;                                              \
}

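/* Worked example (illustrative only): with lane values in0..in3 = 1, 2, 3, 4
   the outputs are { 1+4, 2+3, 2-3, 1-4 } = { 5, 5, -1, -3 }; this is the
   sum/difference stage used by the 4-point forward and inverse transforms.
*/
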
/* Description : Butterfly of 8 input vectors
   Arguments   : Inputs  - in0 ... in7
                 Outputs - out0 ... out7
   Details     : Butterfly operation
*/
#define BUTTERFLY_8( in0, in1, in2, in3, in4, in5, in6, in7,           \
                     out0, out1, out2, out3, out4, out5, out6, out7 )  \
{                                                                      \
    out0 = in0 + in7;                                                  \
    out1 = in1 + in6;                                                  \
    out2 = in2 + in5;                                                  \
    out3 = in3 + in4;                                                  \
                                                                       \
    out4 = in3 - in4;                                                  \
    out5 = in2 - in5;                                                  \
    out6 = in1 - in6;                                                  \
    out7 = in0 - in7;                                                  \
}

/* Description : Transpose input 8x8 byte block
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
*/
#define TRANSPOSE8x8_UB( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,    \
                         out0, out1, out2, out3, out4, out5, out6, out7 )  \
{                                                                          \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                  \
    v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                  \
                                                                           \
    ILVR_B4_SB( in2, in0, in3, in1, in6, in4, in7, in5,                    \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m );                          \
    ILVRL_B2_SB( tmp1_m, tmp0_m, tmp4_m, tmp5_m );                         \
    ILVRL_B2_SB( tmp3_m, tmp2_m, tmp6_m, tmp7_m );                         \
    ILVRL_W2( RTYPE, tmp6_m, tmp4_m, out0, out2 );                         \
    ILVRL_W2( RTYPE, tmp7_m, tmp5_m, out4, out6 );                         \
    SLDI_B2_0( RTYPE, out0, out2, out1, out3, 8 );                         \
    SLDI_B2_0( RTYPE, out4, out6, out5, out7, 8 );                         \
}
#define TRANSPOSE8x8_UB_UB( ... ) TRANSPOSE8x8_UB( v16u8, __VA_ARGS__ )

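/* Usage sketch (illustrative only; names are hypothetical): a transpose
   turns row-wise loads into column vectors, e.g. so a vertical-edge filter
   can operate along rows. All inputs are consumed before any output is
   written, so in-place use is safe:
     v16u8 r0, r1, r2, r3, r4, r5, r6, r7;  // 8 rows, low 8 bytes in use
     TRANSPOSE8x8_UB_UB( r0, r1, r2, r3, r4, r5, r6, r7,
                         r0, r1, r2, r3, r4, r5, r6, r7 );
*/
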
/* Description : Transpose 16x8 block into 8x16 with byte elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7,
                           in8, in9, in10, in11, in12, in13, in14, in15
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - unsigned byte
*/
#define TRANSPOSE16x8_UB_UB( in0, in1, in2, in3, in4, in5, in6, in7,          \
                             in8, in9, in10, in11, in12, in13, in14, in15,    \
                             out0, out1, out2, out3, out4, out5, out6, out7 ) \
{                                                                             \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                     \
    v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                     \
                                                                              \
    ILVEV_D2_UB( in0, in8, in1, in9, out7, out6 );                            \
    ILVEV_D2_UB( in2, in10, in3, in11, out5, out4 );                          \
    ILVEV_D2_UB( in4, in12, in5, in13, out3, out2 );                          \
    ILVEV_D2_UB( in6, in14, in7, in15, out1, out0 );                          \
                                                                              \
    tmp0_m = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out6, ( v16i8 ) out7 );       \
    tmp4_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out6, ( v16i8 ) out7 );       \
    tmp1_m = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out4, ( v16i8 ) out5 );       \
    tmp5_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out4, ( v16i8 ) out5 );       \
    out5 = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out2, ( v16i8 ) out3 );         \
    tmp6_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out2, ( v16i8 ) out3 );       \
    out7 = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out0, ( v16i8 ) out1 );         \
    tmp7_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out0, ( v16i8 ) out1 );       \
                                                                              \
    ILVEV_H2_UB( tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m );                \
    out0 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );     \
    out4 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );     \
                                                                              \
    tmp2_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp1_m, ( v8i16 ) tmp0_m );   \
    tmp3_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) out7, ( v8i16 ) out5 );       \
    out2 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );     \
    out6 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );     \
                                                                              \
    ILVEV_H2_UB( tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m );            \
    out1 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );     \
    out5 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );     \
                                                                              \
    tmp2_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp5_m, ( v8i16 ) tmp4_m );   \
    tmp3_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp7_m, ( v8i16 ) tmp6_m );   \
    out3 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );     \
    out7 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m );     \
}

/* Description : Transpose 4x4 block with halfword elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
*/
#define TRANSPOSE4x4_SH_SH( in0, in1, in2, in3, out0, out1, out2, out3 )  \
{                                                                         \
    v8i16 s0_m, s1_m;                                                     \
                                                                          \
    ILVR_H2_SH( in1, in0, in3, in2, s0_m, s1_m );                         \
    ILVRL_W2_SH( s1_m, s0_m, out0, out2 );                                \
    out1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out0, ( v2i64 ) out0 );      \
    out3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out0, ( v2i64 ) out2 );      \
}

/* Description : Transpose 4x8 block with halfword elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - signed halfword
*/
#define TRANSPOSE4X8_SH_SH( in0, in1, in2, in3, in4, in5, in6, in7,          \
                            out0, out1, out2, out3, out4, out5, out6, out7 ) \
{                                                                            \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                    \
    v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n;                                    \
    v8i16 zero_m = { 0 };                                                    \
                                                                             \
    ILVR_H4_SH( in1, in0, in3, in2, in5, in4, in7, in6,                      \
                tmp0_n, tmp1_n, tmp2_n, tmp3_n );                            \
    ILVRL_W2_SH( tmp1_n, tmp0_n, tmp0_m, tmp2_m );                           \
    ILVRL_W2_SH( tmp3_n, tmp2_n, tmp1_m, tmp3_m );                           \
                                                                             \
    out0 = ( v8i16 ) __msa_ilvr_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp0_m );     \
    out1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp0_m );     \
    out2 = ( v8i16 ) __msa_ilvr_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp2_m );     \
    out3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp2_m );     \
                                                                             \
    out4 = zero_m;                                                           \
    out5 = zero_m;                                                           \
    out6 = zero_m;                                                           \
    out7 = zero_m;                                                           \
}

/* Description : Transpose 8x4 block with halfword elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed halfword
*/
#define TRANSPOSE8X4_SH_SH( in0, in1, in2, in3, out0, out1, out2, out3 )  \
{                                                                         \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                 \
                                                                          \
    ILVR_H2_SH( in1, in0, in3, in2, tmp0_m, tmp1_m );                     \
    ILVL_H2_SH( in1, in0, in3, in2, tmp2_m, tmp3_m );                     \
    ILVR_W2_SH( tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2 );             \
    ILVL_W2_SH( tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3 );             \
}

/* Description : Transpose 8x8 block with halfword elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
                 Outputs - out0, out1, out2, out3, out4, out5, out6, out7
                 Return Type - as per RTYPE
*/
#define TRANSPOSE8x8_H( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,     \
                        out0, out1, out2, out3, out4, out5, out6, out7 )   \
{                                                                          \
    v8i16 s0_m, s1_m;                                                      \
    v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                  \
    v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m;                                  \
                                                                           \
    ILVR_H2_SH( in6, in4, in7, in5, s0_m, s1_m );                          \
    ILVRL_H2_SH( s1_m, s0_m, tmp0_m, tmp1_m );                             \
    ILVL_H2_SH( in6, in4, in7, in5, s0_m, s1_m );                          \
    ILVRL_H2_SH( s1_m, s0_m, tmp2_m, tmp3_m );                             \
    ILVR_H2_SH( in2, in0, in3, in1, s0_m, s1_m );                          \
    ILVRL_H2_SH( s1_m, s0_m, tmp4_m, tmp5_m );                             \
    ILVL_H2_SH( in2, in0, in3, in1, s0_m, s1_m );                          \
    ILVRL_H2_SH( s1_m, s0_m, tmp6_m, tmp7_m );                             \
    PCKEV_D4( RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m,       \
              tmp3_m, tmp7_m, out0, out2, out4, out6 );                    \
    out1 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) tmp0_m, ( v2i64 ) tmp4_m );  \
    out3 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp5_m );  \
    out5 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) tmp2_m, ( v2i64 ) tmp6_m );  \
    out7 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp7_m );  \
}
#define TRANSPOSE8x8_SH_SH( ... ) TRANSPOSE8x8_H( v8i16, __VA_ARGS__ )

/* Description : Transpose 4x4 block with word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - signed word
*/
#define TRANSPOSE4x4_SW_SW( in0, in1, in2, in3, out0, out1, out2, out3 )  \
{                                                                         \
    v4i32 s0_m, s1_m, s2_m, s3_m;                                         \
                                                                          \
    ILVRL_W2_SW( in1, in0, s0_m, s1_m );                                  \
    ILVRL_W2_SW( in3, in2, s2_m, s3_m );                                  \
                                                                          \
    out0 = ( v4i32 ) __msa_ilvr_d( ( v2i64 ) s2_m, ( v2i64 ) s0_m );      \
    out1 = ( v4i32 ) __msa_ilvl_d( ( v2i64 ) s2_m, ( v2i64 ) s0_m );      \
    out2 = ( v4i32 ) __msa_ilvr_d( ( v2i64 ) s3_m, ( v2i64 ) s1_m );      \
    out3 = ( v4i32 ) __msa_ilvl_d( ( v2i64 ) s3_m, ( v2i64 ) s1_m );      \
}

/* Description : Add block 4x4
   Arguments   : Inputs - in0, in1, in2, in3, p_dst, stride
   Details     : Least significant 4 bytes from each input vector are added
                 to the destination bytes, clipped to the range 0..255 and
                 stored back
*/
#define ADDBLK_ST4x4_UB( in0, in1, in2, in3, p_dst, stride )        \
{                                                                   \
    uint32_t src0_m, src1_m, src2_m, src3_m;                        \
    uint32_t out0_m, out1_m, out2_m, out3_m;                        \
    v8i16 inp0_m, inp1_m, res0_m, res1_m;                           \
    v16i8 dst0_m = { 0 };                                           \
    v16i8 dst1_m = { 0 };                                           \
    v16i8 zero_m = { 0 };                                           \
                                                                    \
    ILVR_D2_SH( in1, in0, in3, in2, inp0_m, inp1_m );               \
    LW4( p_dst, stride, src0_m, src1_m, src2_m, src3_m );           \
    INSERT_W2_SB( src0_m, src1_m, dst0_m );                         \
    INSERT_W2_SB( src2_m, src3_m, dst1_m );                         \
    ILVR_B2_SH( zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m );   \
    ADD2( res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m );         \
    CLIP_SH2_0_255( res0_m, res1_m );                               \
    PCKEV_B2_SB( res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m );  \
                                                                    \
    out0_m = __msa_copy_u_w( ( v4i32 ) dst0_m, 0 );                 \
    out1_m = __msa_copy_u_w( ( v4i32 ) dst0_m, 1 );                 \
    out2_m = __msa_copy_u_w( ( v4i32 ) dst1_m, 0 );                 \
    out3_m = __msa_copy_u_w( ( v4i32 ) dst1_m, 1 );                 \
    SW4( out0_m, out1_m, out2_m, out3_m, p_dst, stride );           \
}

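/* Usage sketch (illustrative only; names are hypothetical): the typical
   reconstruction step after a 4x4 inverse transform, adding residual rows
   to the prediction already present at p_dst:
     v8i16 r0, r1, r2, r3;  // 4 residual halfwords in each vector's low half
     ADDBLK_ST4x4_UB( r0, r1, r2, r3, p_dst, i_dst_stride );
*/
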
/* Description : Dot product and addition of 3 signed halfword input vectors
   Arguments   : Inputs - in0, in1, in2, coeff0, coeff1, coeff2
                 Output - out0_m
                 Return Type - signed halfword
   Details     : Dot product of 'in0' with 'coeff0'
                 Dot product of 'in1' with 'coeff1'
                 Dot product of 'in2' with 'coeff2'
                 Addition of all the 3 vector results
                 out0_m = (in0 * coeff0) + (in1 * coeff1) + (in2 * coeff2)
*/
#define DPADD_SH3_SH( in0, in1, in2, coeff0, coeff1, coeff2 )             \
( {                                                                       \
    v8i16 tmp1_m;                                                         \
    v8i16 out0_m;                                                         \
                                                                          \
    out0_m = __msa_dotp_s_h( ( v16i8 ) in0, ( v16i8 ) coeff0 );           \
    out0_m = __msa_dpadd_s_h( out0_m, ( v16i8 ) in1, ( v16i8 ) coeff1 );  \
    tmp1_m = __msa_dotp_s_h( ( v16i8 ) in2, ( v16i8 ) coeff2 );           \
    out0_m = __msa_adds_s_h( out0_m, tmp1_m );                            \
                                                                          \
    out0_m;                                                               \
} )

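/* Usage sketch (illustrative only; names and coefficient layout are
   hypothetical): a 6-tap filter expressed as three byte-pair dot products,
   one pair of samples per pair of coefficients:
     v16i8 d01, d23, d45;  // interleaved sample pairs
     v16i8 c01, c23, c45;  // e.g. {1,-5}, {20,20}, {-5,1} repeated
     v8i16 sum = DPADD_SH3_SH( d01, d23, d45, c01, c23, c45 );
*/
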
/* Description : Pack even elements of input vectors & xor with 128
   Arguments   : Inputs - in0, in1
                 Output - out_m
                 Return Type - unsigned byte
   Details     : Signed byte even elements from 'in0' and 'in1' are packed
                 together in one vector and the resulting vector is xor'ed
                 with 128 to shift the range from signed to unsigned byte
*/
#define PCKEV_XORI128_UB( in0, in1 )                                  \
( {                                                                   \
    v16u8 out_m;                                                      \
    out_m = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in1, ( v16i8 ) in0 );  \
    out_m = ( v16u8 ) __msa_xori_b( ( v16u8 ) out_m, 128 );           \
    out_m;                                                            \
} )

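/* Usage sketch (illustrative only; v0, v1 are hypothetical): xor with 128
   flips the sign bit, mapping the signed range -128..127 onto the unsigned
   range 0..255 (e.g. -128 -> 0, 0 -> 128), which undoes an earlier bias
   into the signed pixel domain:
     v16u8 px = PCKEV_XORI128_UB( v0, v1 );
*/
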
/* Description : Pack even byte elements, extract 0 & 2 index words from
                 pair of results and store 4 words in destination memory
                 as per stride
   Arguments   : Inputs - in0, in1, in2, in3, p_dst, stride
*/
#define PCKEV_ST4x4_UB( in0, in1, in2, in3, p_dst, stride )  \
{                                                            \
    uint32_t out0_m, out1_m, out2_m, out3_m;                 \
    v16i8 tmp0_m, tmp1_m;                                    \
                                                             \
    PCKEV_B2_SB( in1, in0, in3, in2, tmp0_m, tmp1_m );       \
                                                             \
    out0_m = __msa_copy_u_w( ( v4i32 ) tmp0_m, 0 );          \
    out1_m = __msa_copy_u_w( ( v4i32 ) tmp0_m, 2 );          \
    out2_m = __msa_copy_u_w( ( v4i32 ) tmp1_m, 0 );          \
    out3_m = __msa_copy_u_w( ( v4i32 ) tmp1_m, 2 );          \
                                                             \
    SW4( out0_m, out1_m, out2_m, out3_m, p_dst, stride );    \
}

/* Description : Pack even byte elements and store byte vector in destination
                 memory
   Arguments   : Inputs - in0, in1, p_dst
*/
#define PCKEV_ST_SB( in0, in1, p_dst )                      \
{                                                           \
    v16i8 tmp_m;                                            \
    tmp_m = __msa_pckev_b( ( v16i8 ) in1, ( v16i8 ) in0 );  \
    ST_SB( tmp_m, ( p_dst ) );                              \
}

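/* Description : Intermediate 6-tap filter and rounding for AVC half-pel
                 (description added here for reference; derived from the
                 macro body below)
   Arguments   : Inputs - in0, in1, in2, in3, in4, in5
                 Return Type - signed halfword
   Details     : Computes in0 + in5 - 5 * ( in1 + in4 ) + 20 * ( in2 + in3 )
                 in 32-bit precision, rounds the result by 10 bits, saturates
                 it to the signed 8-bit range and packs the words back into
                 one halfword vector
*/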
#define AVC_CALC_DPADD_H_6PIX_2COEFF_SH( in0, in1, in2, in3, in4, in5 )    \
( {                                                                        \
    v4i32 tmp0_m, tmp1_m;                                                  \
    v8i16 out0_m, out1_m, out2_m, out3_m;                                  \
    v8i16 minus5h_m = __msa_ldi_h( -5 );                                   \
    v8i16 plus20h_m = __msa_ldi_h( 20 );                                   \
                                                                           \
    ILVRL_H2_SW( in5, in0, tmp0_m, tmp1_m );                               \
                                                                           \
    tmp0_m = __msa_hadd_s_w( ( v8i16 ) tmp0_m, ( v8i16 ) tmp0_m );         \
    tmp1_m = __msa_hadd_s_w( ( v8i16 ) tmp1_m, ( v8i16 ) tmp1_m );         \
                                                                           \
    ILVRL_H2_SH( in1, in4, out0_m, out1_m );                               \
    DPADD_SH2_SW( out0_m, out1_m, minus5h_m, minus5h_m, tmp0_m, tmp1_m );  \
    ILVRL_H2_SH( in2, in3, out2_m, out3_m );                               \
    DPADD_SH2_SW( out2_m, out3_m, plus20h_m, plus20h_m, tmp0_m, tmp1_m );  \
                                                                           \
    SRARI_W2_SW( tmp0_m, tmp1_m, 10 );                                     \
    SAT_SW2_SW( tmp0_m, tmp1_m, 7 );                                       \
    out0_m = __msa_pckev_h( ( v8i16 ) tmp1_m, ( v8i16 ) tmp0_m );          \
                                                                           \
    out0_m;                                                                \
} )

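/* Description : Horizontal 6-tap filter for AVC half-pel interpolation
                 (description added here for reference; derived from the
                 macro body below)
   Arguments   : Inputs - in (source byte vector),
                          mask0, mask1, mask2 (byte shuffle masks)
                 Return Type - signed halfword
   Details     : Byte pairs gathered by 'mask0' are pair-wise added, pairs
                 gathered by 'mask1' are weighted by -5, and pairs gathered
                 by 'mask2' by +20; with the usual masks this yields, per
                 halfword element,
                 out = ( p0 + p5 ) - 5 * ( p1 + p4 ) + 20 * ( p2 + p3 )
*/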
#define AVC_HORZ_FILTER_SH( in, mask0, mask1, mask2 )     \
( {                                                       \
    v8i16 out0_m, out1_m;                                 \
    v16i8 tmp0_m, tmp1_m;                                 \
    v16i8 minus5b = __msa_ldi_b( -5 );                    \
    v16i8 plus20b = __msa_ldi_b( 20 );                    \
                                                          \
    tmp0_m = __msa_vshf_b( ( v16i8 ) mask0, in, in );     \
    out0_m = __msa_hadd_s_h( tmp0_m, tmp0_m );            \
                                                          \
    tmp0_m = __msa_vshf_b( ( v16i8 ) mask1, in, in );     \
    out0_m = __msa_dpadd_s_h( out0_m, minus5b, tmp0_m );  \
                                                          \
    tmp1_m = __msa_vshf_b( ( v16i8 ) mask2, in, in );     \
    out1_m = __msa_dpadd_s_h( out0_m, plus20b, tmp1_m );  \
                                                          \
    out1_m;                                               \
} )

#endif /* X264_MIPS_MACROS_H */