#ifndef X264_MIPS_MACROS_H
#define X264_MIPS_MACROS_H
#include <stdint.h>
#include <msa.h>
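/* General-purpose helper macros for the MIPS MSA (SIMD) intrinsics.
 * LD_x dereferences p_src as the requested 128-bit vector type;
 * ST_x stores a vector to p_dst. */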
#define LD_B( RTYPE, p_src ) *( ( RTYPE * )( p_src ) )
#define LD_UB( ... ) LD_B( v16u8, __VA_ARGS__ )
#define LD_SB( ... ) LD_B( v16i8, __VA_ARGS__ )
#define LD_H( RTYPE, p_src ) *( ( RTYPE * )( p_src ) )
#define LD_SH( ... ) LD_H( v8i16, __VA_ARGS__ )
#define LD_W( RTYPE, p_src ) *( ( RTYPE * )( p_src ) )
#define LD_SW( ... ) LD_W( v4i32, __VA_ARGS__ )
#define ST_B( RTYPE, in, p_dst ) *( ( RTYPE * )( p_dst ) ) = ( in )
#define ST_UB( ... ) ST_B( v16u8, __VA_ARGS__ )
#define ST_SB( ... ) ST_B( v16i8, __VA_ARGS__ )
#define ST_H( RTYPE, in, p_dst ) *( ( RTYPE * )( p_dst ) ) = ( in )
#define ST_UH( ... ) ST_H( v8u16, __VA_ARGS__ )
#define ST_SH( ... ) ST_H( v8i16, __VA_ARGS__ )
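/* Scalar 16/32/64-bit loads and stores. MIPS release 6 handles unaligned
 * addresses with the plain lh/lw/ld/sh/sw/sd instructions; earlier ISA
 * revisions use the ulh/ulw/uld/ush/usw assembler macros instead, and
 * 32-bit targets compose the 64-bit load from two 32-bit loads. */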
#if ( __mips_isa_rev >= 6 )
#define LH( p_src ) \
( { \
uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \
uint16_t u_val_h_m; \
\
asm volatile ( \
"lh %[u_val_h_m], %[p_src_m] \n\t" \
\
: [u_val_h_m] "=r" ( u_val_h_m ) \
: [p_src_m] "m" ( *p_src_m ) \
); \
\
u_val_h_m; \
} )
#define LW( p_src ) \
( { \
uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \
uint32_t u_val_w_m; \
\
asm volatile ( \
"lw %[u_val_w_m], %[p_src_m] \n\t" \
\
: [u_val_w_m] "=r" ( u_val_w_m ) \
: [p_src_m] "m" ( *p_src_m ) \
); \
\
u_val_w_m; \
} )
#if ( __mips == 64 )
#define LD( p_src ) \
( { \
uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \
uint64_t u_val_d_m = 0; \
\
asm volatile ( \
"ld %[u_val_d_m], %[p_src_m] \n\t" \
\
: [u_val_d_m] "=r" ( u_val_d_m ) \
: [p_src_m] "m" ( *p_src_m ) \
); \
\
u_val_d_m; \
} )
#else
#define LD( p_src ) \
( { \
uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \
uint32_t u_val0_m, u_val1_m; \
uint64_t u_val_d_m = 0; \
\
u_val0_m = LW( p_src_m ); \
u_val1_m = LW( p_src_m + 4 ); \
\
u_val_d_m = ( uint64_t ) ( u_val1_m ); \
u_val_d_m = ( uint64_t ) ( ( u_val_d_m << 32 ) & \
0xFFFFFFFF00000000 ); \
u_val_d_m = ( uint64_t ) ( u_val_d_m | ( uint64_t ) u_val0_m ); \
\
u_val_d_m; \
} )
#endif
#define SH( u_val, p_dst ) \
{ \
uint8_t *p_dst_m = ( uint8_t * ) ( p_dst ); \
uint16_t u_val_h_m = ( u_val ); \
\
asm volatile ( \
"sh %[u_val_h_m], %[p_dst_m] \n\t" \
\
: [p_dst_m] "=m" ( *p_dst_m ) \
: [u_val_h_m] "r" ( u_val_h_m ) \
); \
}
#define SW( u_val, p_dst ) \
{ \
uint8_t *p_dst_m = ( uint8_t * ) ( p_dst ); \
uint32_t u_val_w_m = ( u_val ); \
\
asm volatile ( \
"sw %[u_val_w_m], %[p_dst_m] \n\t" \
\
: [p_dst_m] "=m" ( *p_dst_m ) \
: [u_val_w_m] "r" ( u_val_w_m ) \
); \
}
#define SD( u_val, p_dst ) \
{ \
uint8_t *p_dst_m = ( uint8_t * ) ( p_dst ); \
uint64_t u_val_d_m = ( u_val ); \
\
asm volatile ( \
"sd %[u_val_d_m], %[p_dst_m] \n\t" \
\
: [p_dst_m] "=m" ( *p_dst_m ) \
: [u_val_d_m] "r" ( u_val_d_m ) \
); \
}
#else
#define LH( p_src ) \
( { \
uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \
uint16_t u_val_h_m; \
\
asm volatile ( \
"ulh %[u_val_h_m], %[p_src_m] \n\t" \
\
: [u_val_h_m] "=r" ( u_val_h_m ) \
: [p_src_m] "m" ( *p_src_m ) \
); \
\
u_val_h_m; \
} )
#define LW( p_src ) \
( { \
uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \
uint32_t u_val_w_m; \
\
asm volatile ( \
"ulw %[u_val_w_m], %[p_src_m] \n\t" \
\
: [u_val_w_m] "=r" ( u_val_w_m ) \
: [p_src_m] "m" ( *p_src_m ) \
); \
\
u_val_w_m; \
} )
#if ( __mips == 64 )
#define LD( p_src ) \
( { \
uint8_t *p_src_m = ( uint8_t * ) ( p_src ); \
uint64_t u_val_d_m = 0; \
\
asm volatile ( \
"uld %[u_val_d_m], %[p_src_m] \n\t" \
\
: [u_val_d_m] "=r" ( u_val_d_m ) \
: [p_src_m] "m" ( *p_src_m ) \
); \
\
u_val_d_m; \
} )
#else
#define LD( p_src ) \
( { \
uint8_t *psrc_m1 = ( uint8_t * ) ( p_src ); \
uint32_t u_val0_m, u_val1_m; \
uint64_t u_val_d_m = 0; \
\
u_val0_m = LW( psrc_m1 ); \
u_val1_m = LW( psrc_m1 + 4 ); \
\
u_val_d_m = ( uint64_t ) ( u_val1_m ); \
u_val_d_m = ( uint64_t ) ( ( u_val_d_m << 32 ) & \
0xFFFFFFFF00000000 ); \
u_val_d_m = ( uint64_t ) ( u_val_d_m | ( uint64_t ) u_val0_m ); \
\
u_val_d_m; \
} )
#endif
#define SH( u_val, p_dst ) \
{ \
uint8_t *p_dst_m = ( uint8_t * ) ( p_dst ); \
uint16_t u_val_h_m = ( u_val ); \
\
asm volatile ( \
"ush %[u_val_h_m], %[p_dst_m] \n\t" \
\
: [p_dst_m] "=m" ( *p_dst_m ) \
: [u_val_h_m] "r" ( u_val_h_m ) \
); \
}
#define SW( u_val, p_dst ) \
{ \
uint8_t *p_dst_m = ( uint8_t * ) ( p_dst ); \
uint32_t u_val_w_m = ( u_val ); \
\
asm volatile ( \
"usw %[u_val_w_m], %[p_dst_m] \n\t" \
\
: [p_dst_m] "=m" ( *p_dst_m ) \
: [u_val_w_m] "r" ( u_val_w_m ) \
); \
}
#define SD( u_val, p_dst ) \
{ \
uint8_t *p_dst_m1 = ( uint8_t * ) ( p_dst ); \
uint32_t u_val0_m, u_val1_m; \
\
u_val0_m = ( uint32_t ) ( ( u_val ) & 0x00000000FFFFFFFF ); \
u_val1_m = ( uint32_t ) ( ( ( u_val ) >> 32 ) & 0x00000000FFFFFFFF ); \
\
SW( u_val0_m, p_dst_m1 ); \
SW( u_val1_m, p_dst_m1 + 4 ); \
}
#endif
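/* Load or store four scalars from/to rows spaced 'stride' bytes apart. */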
#define LW4( p_src, stride, out0, out1, out2, out3 ) \
{ \
out0 = LW( ( p_src ) ); \
out1 = LW( ( p_src ) + stride ); \
out2 = LW( ( p_src ) + 2 * stride ); \
out3 = LW( ( p_src ) + 3 * stride ); \
}
#define SW4( in0, in1, in2, in3, p_dst, stride ) \
{ \
    SW( in0, ( p_dst ) );                                                  \
SW( in1, ( p_dst ) + stride ); \
SW( in2, ( p_dst ) + 2 * stride ); \
SW( in3, ( p_dst ) + 3 * stride ); \
}
#define SD4( in0, in1, in2, in3, p_dst, stride ) \
{ \
    SD( in0, ( p_dst ) );                                                  \
SD( in1, ( p_dst ) + stride ); \
SD( in2, ( p_dst ) + 2 * stride ); \
SD( in3, ( p_dst ) + 3 * stride ); \
}
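/* Load 2, 3, 4, 5 or 8 vectors from rows spaced 'stride' bytes apart. */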
#define LD_B2( RTYPE, p_src, stride, out0, out1 ) \
{ \
out0 = LD_B( RTYPE, ( p_src ) ); \
out1 = LD_B( RTYPE, ( p_src ) + stride ); \
}
#define LD_UB2( ... ) LD_B2( v16u8, __VA_ARGS__ )
#define LD_SB2( ... ) LD_B2( v16i8, __VA_ARGS__ )
#define LD_B3( RTYPE, p_src, stride, out0, out1, out2 ) \
{ \
LD_B2( RTYPE, ( p_src ), stride, out0, out1 ); \
out2 = LD_B( RTYPE, ( p_src ) + 2 * stride ); \
}
#define LD_UB3( ... ) LD_B3( v16u8, __VA_ARGS__ )
#define LD_SB3( ... ) LD_B3( v16i8, __VA_ARGS__ )
#define LD_B4( RTYPE, p_src, stride, out0, out1, out2, out3 ) \
{ \
LD_B2( RTYPE, ( p_src ), stride, out0, out1 ); \
    LD_B2( RTYPE, ( p_src ) + 2 * stride, stride, out2, out3 );           \
}
#define LD_UB4( ... ) LD_B4( v16u8, __VA_ARGS__ )
#define LD_SB4( ... ) LD_B4( v16i8, __VA_ARGS__ )
#define LD_B5( RTYPE, p_src, stride, out0, out1, out2, out3, out4 ) \
{ \
LD_B4( RTYPE, ( p_src ), stride, out0, out1, out2, out3 ); \
out4 = LD_B( RTYPE, ( p_src ) + 4 * stride ); \
}
#define LD_UB5( ... ) LD_B5( v16u8, __VA_ARGS__ )
#define LD_SB5( ... ) LD_B5( v16i8, __VA_ARGS__ )
#define LD_B8( RTYPE, p_src, stride, \
out0, out1, out2, out3, out4, out5, out6, out7 ) \
{ \
LD_B4( RTYPE, ( p_src ), stride, out0, out1, out2, out3 ); \
LD_B4( RTYPE, ( p_src ) + 4 * stride, stride, out4, out5, out6, out7 ); \
}
#define LD_UB8( ... ) LD_B8( v16u8, __VA_ARGS__ )
#define LD_SB8( ... ) LD_B8( v16i8, __VA_ARGS__ )
#define LD_H2( RTYPE, p_src, stride, out0, out1 ) \
{ \
out0 = LD_H( RTYPE, ( p_src ) ); \
out1 = LD_H( RTYPE, ( p_src ) + ( stride ) ); \
}
#define LD_SH2( ... ) LD_H2( v8i16, __VA_ARGS__ )
#define LD_H4( RTYPE, p_src, stride, out0, out1, out2, out3 ) \
{ \
LD_H2( RTYPE, ( p_src ), stride, out0, out1 ); \
LD_H2( RTYPE, ( p_src ) + 2 * stride, stride, out2, out3 ); \
}
#define LD_SH4( ... ) LD_H4( v8i16, __VA_ARGS__ )
#define LD_H8( RTYPE, p_src, stride, \
out0, out1, out2, out3, out4, out5, out6, out7 ) \
{ \
LD_H4( RTYPE, ( p_src ), stride, out0, out1, out2, out3 ); \
LD_H4( RTYPE, ( p_src ) + 4 * stride, stride, out4, out5, out6, out7 ); \
}
#define LD_SH8( ... ) LD_H8( v8i16, __VA_ARGS__ )
#define LD4x4_SH( p_src, out0, out1, out2, out3 ) \
{ \
out0 = LD_SH( p_src ); \
out2 = LD_SH( p_src + 8 ); \
out1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out0, ( v2i64 ) out0 ); \
out3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out2, ( v2i64 ) out2 ); \
}
#define LD_SW2( p_src, stride, out0, out1 ) \
{ \
out0 = LD_SW( ( p_src ) ); \
out1 = LD_SW( ( p_src ) + stride ); \
}
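/* Store 2, 4 or 8 vectors to rows spaced 'stride' bytes apart. */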
#define ST_B2( RTYPE, in0, in1, p_dst, stride ) \
{ \
ST_B( RTYPE, in0, ( p_dst ) ); \
ST_B( RTYPE, in1, ( p_dst ) + stride ); \
}
#define ST_UB2( ... ) ST_B2( v16u8, __VA_ARGS__ )
#define ST_B4( RTYPE, in0, in1, in2, in3, p_dst, stride ) \
{ \
ST_B2( RTYPE, in0, in1, ( p_dst ), stride ); \
ST_B2( RTYPE, in2, in3, ( p_dst ) + 2 * stride, stride ); \
}
#define ST_UB4( ... ) ST_B4( v16u8, __VA_ARGS__ )
#define ST_SB4( ... ) ST_B4( v16i8, __VA_ARGS__ )
#define ST_B8( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
p_dst, stride ) \
{ \
ST_B4( RTYPE, in0, in1, in2, in3, p_dst, stride ); \
ST_B4( RTYPE, in4, in5, in6, in7, ( p_dst ) + 4 * stride, stride ); \
}
#define ST_UB8( ... ) ST_B8( v16u8, __VA_ARGS__ )
#define ST_H2( RTYPE, in0, in1, p_dst, stride ) \
{ \
ST_H( RTYPE, in0, ( p_dst ) ); \
ST_H( RTYPE, in1, ( p_dst ) + stride ); \
}
#define ST_SH2( ... ) ST_H2( v8i16, __VA_ARGS__ )
#define ST_H4( RTYPE, in0, in1, in2, in3, p_dst, stride ) \
{ \
ST_H2( RTYPE, in0, in1, ( p_dst ), stride ); \
ST_H2( RTYPE, in2, in3, ( p_dst ) + 2 * stride, stride ); \
}
#define ST_SH4( ... ) ST_H4( v8i16, __VA_ARGS__ )
#define ST_H8( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, p_dst, stride ) \
{ \
ST_H4( RTYPE, in0, in1, in2, in3, ( p_dst ), stride ); \
ST_H4( RTYPE, in4, in5, in6, in7, ( p_dst ) + 4 * stride, stride ); \
}
#define ST_SH8( ... ) ST_H8( v8i16, __VA_ARGS__ )
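/* Store narrow blocks (2x4, 4x4, 4x8, 8x1, 8x4 bytes): vector lanes are
 * copied to general-purpose registers and written out row by row. */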
#define ST2x4_UB( in, stidx, p_dst, stride ) \
{ \
uint16_t u_out0_m, u_out1_m, u_out2_m, u_out3_m; \
uint8_t *pblk_2x4_m = ( uint8_t * ) ( p_dst ); \
\
u_out0_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx ) ); \
u_out1_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx + 1 ) ); \
u_out2_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx + 2 ) ); \
u_out3_m = __msa_copy_u_h( ( v8i16 ) in, ( stidx + 3 ) ); \
\
SH( u_out0_m, pblk_2x4_m ); \
SH( u_out1_m, pblk_2x4_m + stride ); \
SH( u_out2_m, pblk_2x4_m + 2 * stride ); \
SH( u_out3_m, pblk_2x4_m + 3 * stride ); \
}
#define ST4x4_UB( in0, in1, idx0, idx1, idx2, idx3, p_dst, stride ) \
{ \
uint32_t u_out0_m, u_out1_m, u_out2_m, u_out3_m; \
uint8_t *pblk_4x4_m = ( uint8_t * ) ( p_dst ); \
\
u_out0_m = __msa_copy_u_w( ( v4i32 ) in0, idx0 ); \
u_out1_m = __msa_copy_u_w( ( v4i32 ) in0, idx1 ); \
u_out2_m = __msa_copy_u_w( ( v4i32 ) in1, idx2 ); \
u_out3_m = __msa_copy_u_w( ( v4i32 ) in1, idx3 ); \
\
SW4( u_out0_m, u_out1_m, u_out2_m, u_out3_m, pblk_4x4_m, stride ); \
}
#define ST4x8_UB( in0, in1, p_dst, stride ) \
{ \
uint8_t *pblk_4x8 = ( uint8_t * ) ( p_dst ); \
\
ST4x4_UB( in0, in0, 0, 1, 2, 3, pblk_4x8, stride ); \
ST4x4_UB( in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride ); \
}
#define ST8x1_UB( in, p_dst ) \
{ \
uint64_t u_out0_m; \
u_out0_m = __msa_copy_u_d( ( v2i64 ) in, 0 ); \
SD( u_out0_m, p_dst ); \
}
#define ST8x4_UB( in0, in1, p_dst, stride ) \
{ \
uint64_t u_out0_m, u_out1_m, u_out2_m, u_out3_m; \
uint8_t *pblk_8x4_m = ( uint8_t * ) ( p_dst ); \
\
u_out0_m = __msa_copy_u_d( ( v2i64 ) in0, 0 ); \
u_out1_m = __msa_copy_u_d( ( v2i64 ) in0, 1 ); \
u_out2_m = __msa_copy_u_d( ( v2i64 ) in1, 0 ); \
u_out3_m = __msa_copy_u_d( ( v2i64 ) in1, 1 ); \
\
SD4( u_out0_m, u_out1_m, u_out2_m, u_out3_m, pblk_8x4_m, stride ); \
}
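/* Rounding average of unsigned byte vectors: ( a + b + 1 ) >> 1 per element. */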
#define AVER_UB2( RTYPE, in0, in1, in2, in3, out0, out1 ) \
{ \
out0 = ( RTYPE ) __msa_aver_u_b( ( v16u8 ) in0, ( v16u8 ) in1 ); \
out1 = ( RTYPE ) __msa_aver_u_b( ( v16u8 ) in2, ( v16u8 ) in3 ); \
}
#define AVER_UB2_UB( ... ) AVER_UB2( v16u8, __VA_ARGS__ )
#define AVER_UB4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3 ) \
{ \
    AVER_UB2( RTYPE, in0, in1, in2, in3, out0, out1 );                     \
    AVER_UB2( RTYPE, in4, in5, in6, in7, out2, out3 );                     \
}
#define AVER_UB4_UB( ... ) AVER_UB4( v16u8, __VA_ARGS__ )
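/* SLDI_*: slide vector contents down by slide_val bytes, filling the top
 * with zeros (SLDI_B2_0) or with bytes of a second vector (SLDI_B2).
 * VSHF_*: shuffle elements of two source vectors as selected by a mask. */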
#define SLDI_B2_0( RTYPE, in0, in1, out0, out1, slide_val ) \
{ \
v16i8 zero_m = { 0 }; \
out0 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) zero_m, \
( v16i8 ) in0, slide_val ); \
out1 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) zero_m, \
( v16i8 ) in1, slide_val ); \
}
#define SLDI_B2_0_UB( ... ) SLDI_B2_0( v16u8, __VA_ARGS__ )
#define SLDI_B2( RTYPE, in0_0, in0_1, in1_0, in1_1, out0, out1, slide_val ) \
{ \
out0 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) in0_0, ( v16i8 ) in1_0, \
slide_val ); \
out1 = ( RTYPE ) __msa_sldi_b( ( v16i8 ) in0_1, ( v16i8 ) in1_1, \
slide_val ); \
}
#define SLDI_B2_UB( ... ) SLDI_B2( v16u8, __VA_ARGS__ )
#define VSHF_B2( RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1 ) \
{ \
out0 = ( RTYPE ) __msa_vshf_b( ( v16i8 ) mask0, \
( v16i8 ) in1, ( v16i8 ) in0 ); \
out1 = ( RTYPE ) __msa_vshf_b( ( v16i8 ) mask1, \
( v16i8 ) in3, ( v16i8 ) in2 ); \
}
#define VSHF_B2_UB( ... ) VSHF_B2( v16u8, __VA_ARGS__ )
#define VSHF_B2_SB( ... ) VSHF_B2( v16i8, __VA_ARGS__ )
#define VSHF_H2( RTYPE, in0, in1, in2, in3, mask0, mask1, out0, out1 ) \
{ \
out0 = ( RTYPE ) __msa_vshf_h( ( v8i16 ) mask0, \
( v8i16 ) in1, ( v8i16 ) in0 ); \
out1 = ( RTYPE ) __msa_vshf_h( ( v8i16 ) mask1, \
( v8i16 ) in3, ( v8i16 ) in2 ); \
}
#define VSHF_H2_SH( ... ) VSHF_H2( v8i16, __VA_ARGS__ )
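/* Dot products of adjacent element pairs, widening to the next element
 * size; the DPADD variants accumulate into out0/out1. */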
#define DOTP_UB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 ) \
{ \
out0 = ( RTYPE ) __msa_dotp_u_h( ( v16u8 ) mult0, ( v16u8 ) cnst0 ); \
out1 = ( RTYPE ) __msa_dotp_u_h( ( v16u8 ) mult1, ( v16u8 ) cnst1 ); \
}
#define DOTP_UB2_UH( ... ) DOTP_UB2( v8u16, __VA_ARGS__ )
#define DOTP_UB4( RTYPE, mult0, mult1, mult2, mult3, \
cnst0, cnst1, cnst2, cnst3, \
out0, out1, out2, out3 ) \
{ \
DOTP_UB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 ); \
DOTP_UB2( RTYPE, mult2, mult3, cnst2, cnst3, out2, out3 ); \
}
#define DOTP_UB4_UH( ... ) DOTP_UB4( v8u16, __VA_ARGS__ )
#define DPADD_SB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 ) \
{ \
out0 = ( RTYPE ) __msa_dpadd_s_h( ( v8i16 ) out0, \
( v16i8 ) mult0, ( v16i8 ) cnst0 ); \
out1 = ( RTYPE ) __msa_dpadd_s_h( ( v8i16 ) out1, \
( v16i8 ) mult1, ( v16i8 ) cnst1 ); \
}
#define DPADD_SB2_SH( ... ) DPADD_SB2( v8i16, __VA_ARGS__ )
#define DPADD_SB4( RTYPE, mult0, mult1, mult2, mult3, \
cnst0, cnst1, cnst2, cnst3, out0, out1, out2, out3 ) \
{ \
DPADD_SB2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 ); \
DPADD_SB2( RTYPE, mult2, mult3, cnst2, cnst3, out2, out3 ); \
}
#define DPADD_SB4_SH( ... ) DPADD_SB4( v8i16, __VA_ARGS__ )
#define DPADD_SH2( RTYPE, mult0, mult1, cnst0, cnst1, out0, out1 ) \
{ \
out0 = ( RTYPE ) __msa_dpadd_s_w( ( v4i32 ) out0, \
( v8i16 ) mult0, ( v8i16 ) cnst0 ); \
out1 = ( RTYPE ) __msa_dpadd_s_w( ( v4i32 ) out1, \
( v8i16 ) mult1, ( v8i16 ) cnst1 ); \
}
#define DPADD_SH2_SW( ... ) DPADD_SH2( v4i32, __VA_ARGS__ )
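/* Clamp signed halfword elements to [min, max] or to the 0..255 pixel range. */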
#define CLIP_SH( in, min, max ) \
( { \
v8i16 out_m; \
\
out_m = __msa_max_s_h( ( v8i16 ) min, ( v8i16 ) in ); \
out_m = __msa_min_s_h( ( v8i16 ) max, ( v8i16 ) out_m ); \
out_m; \
} )
#define CLIP_SH_0_255( in ) \
( { \
v8i16 max_m = __msa_ldi_h( 255 ); \
v8i16 out_m; \
\
out_m = __msa_maxi_s_h( ( v8i16 ) in, 0 ); \
out_m = __msa_min_s_h( ( v8i16 ) max_m, ( v8i16 ) out_m ); \
out_m; \
} )
#define CLIP_SH2_0_255( in0, in1 ) \
{ \
in0 = CLIP_SH_0_255( in0 ); \
in1 = CLIP_SH_0_255( in1 ); \
}
#define CLIP_SH4_0_255( in0, in1, in2, in3 ) \
{ \
CLIP_SH2_0_255( in0, in1 ); \
CLIP_SH2_0_255( in2, in3 ); \
}
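/* Horizontal reductions: sum a whole vector down to a 32-bit scalar,
 * pairwise widening adds/subtracts, and sum of absolute differences. */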
#define HADD_SW_S32( in ) \
( { \
v2i64 res0_m, res1_m; \
int32_t i_sum_m; \
\
res0_m = __msa_hadd_s_d( ( v4i32 ) in, ( v4i32 ) in ); \
res1_m = __msa_splati_d( res0_m, 1 ); \
res0_m = res0_m + res1_m; \
i_sum_m = __msa_copy_s_w( ( v4i32 ) res0_m, 0 ); \
i_sum_m; \
} )
#define HADD_UH_U32( in ) \
( { \
v4u32 res_m; \
v2u64 res0_m, res1_m; \
uint32_t u_sum_m; \
\
res_m = __msa_hadd_u_w( ( v8u16 ) in, ( v8u16 ) in ); \
res0_m = __msa_hadd_u_d( res_m, res_m ); \
res1_m = ( v2u64 ) __msa_splati_d( ( v2i64 ) res0_m, 1 ); \
res0_m = res0_m + res1_m; \
u_sum_m = __msa_copy_u_w( ( v4i32 ) res0_m, 0 ); \
u_sum_m; \
} )
#define HADD_SB2( RTYPE, in0, in1, out0, out1 ) \
{ \
out0 = ( RTYPE ) __msa_hadd_s_h( ( v16i8 ) in0, ( v16i8 ) in0 ); \
out1 = ( RTYPE ) __msa_hadd_s_h( ( v16i8 ) in1, ( v16i8 ) in1 ); \
}
#define HADD_SB4( RTYPE, in0, in1, in2, in3, out0, out1, out2, out3 ) \
{ \
HADD_SB2( RTYPE, in0, in1, out0, out1 ); \
HADD_SB2( RTYPE, in2, in3, out2, out3 ); \
}
#define HADD_SB4_SH( ... ) HADD_SB4( v8i16, __VA_ARGS__ )
#define HADD_UB2( RTYPE, in0, in1, out0, out1 ) \
{ \
out0 = ( RTYPE ) __msa_hadd_u_h( ( v16u8 ) in0, ( v16u8 ) in0 ); \
out1 = ( RTYPE ) __msa_hadd_u_h( ( v16u8 ) in1, ( v16u8 ) in1 ); \
}
#define HADD_UB2_UH( ... ) HADD_UB2( v8u16, __VA_ARGS__ )
#define HADD_UB4( RTYPE, in0, in1, in2, in3, out0, out1, out2, out3 ) \
{ \
HADD_UB2( RTYPE, in0, in1, out0, out1 ); \
HADD_UB2( RTYPE, in2, in3, out2, out3 ); \
}
#define HADD_UB4_UH( ... ) HADD_UB4( v8u16, __VA_ARGS__ )
#define HSUB_UB2( RTYPE, in0, in1, out0, out1 ) \
{ \
out0 = ( RTYPE ) __msa_hsub_u_h( ( v16u8 ) in0, ( v16u8 ) in0 ); \
out1 = ( RTYPE ) __msa_hsub_u_h( ( v16u8 ) in1, ( v16u8 ) in1 ); \
}
#define HSUB_UB2_SH( ... ) HSUB_UB2( v8i16, __VA_ARGS__ )
#define HSUB_UB4( RTYPE, in0, in1, in2, in3, out0, out1, out2, out3 ) \
{ \
HSUB_UB2( RTYPE, in0, in1, out0, out1 ); \
HSUB_UB2( RTYPE, in2, in3, out2, out3 ); \
}
#define HSUB_UB4_SH( ... ) HSUB_UB4( v8i16, __VA_ARGS__ )
#define SAD_UB2_UH( in0, in1, ref0, ref1 ) \
( { \
v16u8 diff0_m, diff1_m; \
v8u16 sad_m = { 0 }; \
\
diff0_m = __msa_asub_u_b( ( v16u8 ) in0, ( v16u8 ) ref0 ); \
diff1_m = __msa_asub_u_b( ( v16u8 ) in1, ( v16u8 ) ref1 ); \
\
sad_m += __msa_hadd_u_h( ( v16u8 ) diff0_m, ( v16u8 ) diff0_m ); \
sad_m += __msa_hadd_u_h( ( v16u8 ) diff1_m, ( v16u8 ) diff1_m ); \
\
sad_m; \
} )
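/* Insert general-purpose register values into word/doubleword vector lanes. */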
#define INSERT_W2( RTYPE, in0, in1, out ) \
{ \
out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 0, in0 ); \
out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 1, in1 ); \
}
#define INSERT_W2_SB( ... ) INSERT_W2( v16i8, __VA_ARGS__ )
#define INSERT_W4( RTYPE, in0, in1, in2, in3, out ) \
{ \
out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 0, in0 ); \
out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 1, in1 ); \
out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 2, in2 ); \
out = ( RTYPE ) __msa_insert_w( ( v4i32 ) out, 3, in3 ); \
}
#define INSERT_W4_UB( ... ) INSERT_W4( v16u8, __VA_ARGS__ )
#define INSERT_W4_SB( ... ) INSERT_W4( v16i8, __VA_ARGS__ )
#define INSERT_D2( RTYPE, in0, in1, out ) \
{ \
out = ( RTYPE ) __msa_insert_d( ( v2i64 ) out, 0, in0 ); \
out = ( RTYPE ) __msa_insert_d( ( v2i64 ) out, 1, in1 ); \
}
#define INSERT_D2_UB( ... ) INSERT_D2( v16u8, __VA_ARGS__ )
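/* Interleave macros: ILVEV_* merges even-indexed elements, ILVL_*/ILVR_*
 * interleave the left (upper) or right (lower) halves of two vectors, and
 * ILVRL_* produces both halves at once. */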
#define ILVEV_H2( RTYPE, in0, in1, in2, in3, out0, out1 ) \
{ \
out0 = ( RTYPE ) __msa_ilvev_h( ( v8i16 ) in1, ( v8i16 ) in0 ); \
out1 = ( RTYPE ) __msa_ilvev_h( ( v8i16 ) in3, ( v8i16 ) in2 ); \
}
#define ILVEV_H2_UB( ... ) ILVEV_H2( v16u8, __VA_ARGS__ )
#define ILVEV_D2( RTYPE, in0, in1, in2, in3, out0, out1 ) \
{ \
out0 = ( RTYPE ) __msa_ilvev_d( ( v2i64 ) in1, ( v2i64 ) in0 ); \
out1 = ( RTYPE ) __msa_ilvev_d( ( v2i64 ) in3, ( v2i64 ) in2 ); \
}
#define ILVEV_D2_UB( ... ) ILVEV_D2( v16u8, __VA_ARGS__ )
#define ILVL_B2( RTYPE, in0, in1, in2, in3, out0, out1 ) \
{ \
out0 = ( RTYPE ) __msa_ilvl_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \
out1 = ( RTYPE ) __msa_ilvl_b( ( v16i8 ) in2, ( v16i8 ) in3 ); \
}
#define ILVL_B2_UH( ... ) ILVL_B2( v8u16, __VA_ARGS__ )
#define ILVL_B2_SH( ... ) ILVL_B2( v8i16, __VA_ARGS__ )
#define ILVL_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3 ) \
{ \
ILVL_B2( RTYPE, in0, in1, in2, in3, out0, out1 ); \
ILVL_B2( RTYPE, in4, in5, in6, in7, out2, out3 ); \
}
#define ILVL_B4_UB( ... ) ILVL_B4( v16u8, __VA_ARGS__ )
#define ILVL_B4_SB( ... ) ILVL_B4( v16i8, __VA_ARGS__ )
#define ILVL_B4_UH( ... ) ILVL_B4( v8u16, __VA_ARGS__ )
#define ILVL_B4_SH( ... ) ILVL_B4( v8i16, __VA_ARGS__ )
#define ILVL_H2( RTYPE, in0, in1, in2, in3, out0, out1 ) \
{ \
out0 = ( RTYPE ) __msa_ilvl_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \
out1 = ( RTYPE ) __msa_ilvl_h( ( v8i16 ) in2, ( v8i16 ) in3 ); \
}
#define ILVL_H2_SH( ... ) ILVL_H2( v8i16, __VA_ARGS__ )
#define ILVL_H2_SW( ... ) ILVL_H2( v4i32, __VA_ARGS__ )
#define ILVL_H4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3 ) \
{ \
ILVL_H2( RTYPE, in0, in1, in2, in3, out0, out1 ); \
ILVL_H2( RTYPE, in4, in5, in6, in7, out2, out3 ); \
}
#define ILVL_H4_SW( ... ) ILVL_H4( v4i32, __VA_ARGS__ )
#define ILVL_W2( RTYPE, in0, in1, in2, in3, out0, out1 ) \
{ \
out0 = ( RTYPE ) __msa_ilvl_w( ( v4i32 ) in0, ( v4i32 ) in1 ); \
out1 = ( RTYPE ) __msa_ilvl_w( ( v4i32 ) in2, ( v4i32 ) in3 ); \
}
#define ILVL_W2_SH( ... ) ILVL_W2( v8i16, __VA_ARGS__ )
#define ILVR_B2( RTYPE, in0, in1, in2, in3, out0, out1 ) \
{ \
out0 = ( RTYPE ) __msa_ilvr_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \
out1 = ( RTYPE ) __msa_ilvr_b( ( v16i8 ) in2, ( v16i8 ) in3 ); \
}
#define ILVR_B2_SB( ... ) ILVR_B2( v16i8, __VA_ARGS__ )
#define ILVR_B2_UH( ... ) ILVR_B2( v8u16, __VA_ARGS__ )
#define ILVR_B2_SH( ... ) ILVR_B2( v8i16, __VA_ARGS__ )
#define ILVR_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3 ) \
{ \
ILVR_B2( RTYPE, in0, in1, in2, in3, out0, out1 ); \
ILVR_B2( RTYPE, in4, in5, in6, in7, out2, out3 ); \
}
#define ILVR_B4_UB( ... ) ILVR_B4( v16u8, __VA_ARGS__ )
#define ILVR_B4_SB( ... ) ILVR_B4( v16i8, __VA_ARGS__ )
#define ILVR_B4_UH( ... ) ILVR_B4( v8u16, __VA_ARGS__ )
#define ILVR_B4_SH( ... ) ILVR_B4( v8i16, __VA_ARGS__ )
#define ILVR_H2( RTYPE, in0, in1, in2, in3, out0, out1 ) \
{ \
out0 = ( RTYPE ) __msa_ilvr_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \
out1 = ( RTYPE ) __msa_ilvr_h( ( v8i16 ) in2, ( v8i16 ) in3 ); \
}
#define ILVR_H2_SH( ... ) ILVR_H2( v8i16, __VA_ARGS__ )
#define ILVR_H2_SW( ... ) ILVR_H2( v4i32, __VA_ARGS__ )
#define ILVR_H4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3 ) \
{ \
ILVR_H2( RTYPE, in0, in1, in2, in3, out0, out1 ); \
ILVR_H2( RTYPE, in4, in5, in6, in7, out2, out3 ); \
}
#define ILVR_H4_SH( ... ) ILVR_H4( v8i16, __VA_ARGS__ )
#define ILVR_H4_SW( ... ) ILVR_H4( v4i32, __VA_ARGS__ )
#define ILVR_W2( RTYPE, in0, in1, in2, in3, out0, out1 ) \
{ \
out0 = ( RTYPE ) __msa_ilvr_w( ( v4i32 ) in0, ( v4i32 ) in1 ); \
out1 = ( RTYPE ) __msa_ilvr_w( ( v4i32 ) in2, ( v4i32 ) in3 ); \
}
#define ILVR_W2_SH( ... ) ILVR_W2( v8i16, __VA_ARGS__ )
#define ILVR_D2( RTYPE, in0, in1, in2, in3, out0, out1 ) \
{ \
out0 = ( RTYPE ) __msa_ilvr_d( ( v2i64 ) ( in0 ), ( v2i64 ) ( in1 ) ); \
out1 = ( RTYPE ) __msa_ilvr_d( ( v2i64 ) ( in2 ), ( v2i64 ) ( in3 ) ); \
}
#define ILVR_D2_UB( ... ) ILVR_D2( v16u8, __VA_ARGS__ )
#define ILVR_D2_SB( ... ) ILVR_D2( v16i8, __VA_ARGS__ )
#define ILVR_D2_SH( ... ) ILVR_D2( v8i16, __VA_ARGS__ )
#define ILVR_D4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3 ) \
{ \
ILVR_D2( RTYPE, in0, in1, in2, in3, out0, out1 ); \
ILVR_D2( RTYPE, in4, in5, in6, in7, out2, out3 ); \
}
#define ILVR_D4_UB( ... ) ILVR_D4( v16u8, __VA_ARGS__ )
#define ILVRL_B2( RTYPE, in0, in1, out0, out1 ) \
{ \
out0 = ( RTYPE ) __msa_ilvr_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \
out1 = ( RTYPE ) __msa_ilvl_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \
}
#define ILVRL_B2_UB( ... ) ILVRL_B2( v16u8, __VA_ARGS__ )
#define ILVRL_B2_SB( ... ) ILVRL_B2( v16i8, __VA_ARGS__ )
#define ILVRL_B2_UH( ... ) ILVRL_B2( v8u16, __VA_ARGS__ )
#define ILVRL_B2_SH( ... ) ILVRL_B2( v8i16, __VA_ARGS__ )
#define ILVRL_B2_SW( ... ) ILVRL_B2( v4i32, __VA_ARGS__ )
#define ILVRL_H2( RTYPE, in0, in1, out0, out1 ) \
{ \
out0 = ( RTYPE ) __msa_ilvr_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \
out1 = ( RTYPE ) __msa_ilvl_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \
}
#define ILVRL_H2_SH( ... ) ILVRL_H2( v8i16, __VA_ARGS__ )
#define ILVRL_H2_SW( ... ) ILVRL_H2( v4i32, __VA_ARGS__ )
#define ILVRL_W2( RTYPE, in0, in1, out0, out1 ) \
{ \
out0 = ( RTYPE ) __msa_ilvr_w( ( v4i32 ) in0, ( v4i32 ) in1 ); \
out1 = ( RTYPE ) __msa_ilvl_w( ( v4i32 ) in0, ( v4i32 ) in1 ); \
}
#define ILVRL_W2_SH( ... ) ILVRL_W2( v8i16, __VA_ARGS__ )
#define ILVRL_W2_SW( ... ) ILVRL_W2( v4i32, __VA_ARGS__ )
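/* MAXI_* clamps elements from below against an immediate; SAT_* saturates
 * elements to the bit width given by the immediate. */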
#define MAXI_SH2( RTYPE, in0, in1, max_val ) \
{ \
in0 = ( RTYPE ) __msa_maxi_s_h( ( v8i16 ) in0, ( max_val ) ); \
in1 = ( RTYPE ) __msa_maxi_s_h( ( v8i16 ) in1, ( max_val ) ); \
}
#define MAXI_SH2_UH( ... ) MAXI_SH2( v8u16, __VA_ARGS__ )
#define MAXI_SH2_SH( ... ) MAXI_SH2( v8i16, __VA_ARGS__ )
#define MAXI_SH4( RTYPE, in0, in1, in2, in3, max_val ) \
{ \
MAXI_SH2( RTYPE, in0, in1, max_val ); \
MAXI_SH2( RTYPE, in2, in3, max_val ); \
}
#define MAXI_SH4_UH( ... ) MAXI_SH4( v8u16, __VA_ARGS__ )
#define SAT_UH2( RTYPE, in0, in1, sat_val ) \
{ \
in0 = ( RTYPE ) __msa_sat_u_h( ( v8u16 ) in0, sat_val ); \
in1 = ( RTYPE ) __msa_sat_u_h( ( v8u16 ) in1, sat_val ); \
}
#define SAT_UH2_UH( ... ) SAT_UH2( v8u16, __VA_ARGS__ )
#define SAT_UH4( RTYPE, in0, in1, in2, in3, sat_val ) \
{ \
SAT_UH2( RTYPE, in0, in1, sat_val ); \
    SAT_UH2( RTYPE, in2, in3, sat_val );                                   \
}
#define SAT_UH4_UH( ... ) SAT_UH4( v8u16, __VA_ARGS__ )
#define SAT_SH2( RTYPE, in0, in1, sat_val ) \
{ \
in0 = ( RTYPE ) __msa_sat_s_h( ( v8i16 ) in0, sat_val ); \
in1 = ( RTYPE ) __msa_sat_s_h( ( v8i16 ) in1, sat_val ); \
}
#define SAT_SH2_SH( ... ) SAT_SH2( v8i16, __VA_ARGS__ )
#define SAT_SH4( RTYPE, in0, in1, in2, in3, sat_val ) \
{ \
SAT_SH2( RTYPE, in0, in1, sat_val ); \
SAT_SH2( RTYPE, in2, in3, sat_val ); \
}
#define SAT_SH4_SH( ... ) SAT_SH4( v8i16, __VA_ARGS__ )
#define SAT_SW2( RTYPE, in0, in1, sat_val ) \
{ \
in0 = ( RTYPE ) __msa_sat_s_w( ( v4i32 ) in0, sat_val ); \
in1 = ( RTYPE ) __msa_sat_s_w( ( v4i32 ) in1, sat_val ); \
}
#define SAT_SW2_SW( ... ) SAT_SW2( v4i32, __VA_ARGS__ )
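/* Pack the even-indexed (PCKEV) or odd-indexed (PCKOD) elements of two
 * vectors into a single result. */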
#define PCKEV_B2( RTYPE, in0, in1, in2, in3, out0, out1 ) \
{ \
out0 = ( RTYPE ) __msa_pckev_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \
out1 = ( RTYPE ) __msa_pckev_b( ( v16i8 ) in2, ( v16i8 ) in3 ); \
}
#define PCKEV_B2_SB( ... ) PCKEV_B2( v16i8, __VA_ARGS__ )
#define PCKEV_B2_UB( ... ) PCKEV_B2( v16u8, __VA_ARGS__ )
#define PCKEV_B2_SH( ... ) PCKEV_B2( v8i16, __VA_ARGS__ )
#define PCKEV_B2_SW( ... ) PCKEV_B2( v4i32, __VA_ARGS__ )
#define PCKEV_B3( RTYPE, in0, in1, in2, in3, in4, in5, out0, out1, out2 ) \
{ \
PCKEV_B2( RTYPE, in0, in1, in2, in3, out0, out1 ); \
out2 = ( RTYPE ) __msa_pckev_b( ( v16i8 ) in4, ( v16i8 ) in5 ); \
}
#define PCKEV_B3_UB( ... ) PCKEV_B3( v16u8, __VA_ARGS__ )
#define PCKEV_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3 ) \
{ \
PCKEV_B2( RTYPE, in0, in1, in2, in3, out0, out1 ); \
PCKEV_B2( RTYPE, in4, in5, in6, in7, out2, out3 ); \
}
#define PCKEV_B4_SB( ... ) PCKEV_B4( v16i8, __VA_ARGS__ )
#define PCKEV_B4_UB( ... ) PCKEV_B4( v16u8, __VA_ARGS__ )
#define PCKEV_H2( RTYPE, in0, in1, in2, in3, out0, out1 ) \
{ \
out0 = ( RTYPE ) __msa_pckev_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \
out1 = ( RTYPE ) __msa_pckev_h( ( v8i16 ) in2, ( v8i16 ) in3 ); \
}
#define PCKEV_H2_SH( ... ) PCKEV_H2( v8i16, __VA_ARGS__ )
#define PCKEV_H4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3 ) \
{ \
PCKEV_H2( RTYPE, in0, in1, in2, in3, out0, out1 ); \
PCKEV_H2( RTYPE, in4, in5, in6, in7, out2, out3 ); \
}
#define PCKEV_H4_SH( ... ) PCKEV_H4( v8i16, __VA_ARGS__ )
#define PCKEV_D2( RTYPE, in0, in1, in2, in3, out0, out1 ) \
{ \
out0 = ( RTYPE ) __msa_pckev_d( ( v2i64 ) in0, ( v2i64 ) in1 ); \
out1 = ( RTYPE ) __msa_pckev_d( ( v2i64 ) in2, ( v2i64 ) in3 ); \
}
#define PCKEV_D2_UB( ... ) PCKEV_D2( v16u8, __VA_ARGS__ )
#define PCKEV_D4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3 ) \
{ \
PCKEV_D2( RTYPE, in0, in1, in2, in3, out0, out1 ); \
PCKEV_D2( RTYPE, in4, in5, in6, in7, out2, out3 ); \
}
#define PCKEV_D4_UB( ... ) PCKEV_D4( v16u8, __VA_ARGS__ )
#define PCKOD_B2( RTYPE, in0, in1, in2, in3, out0, out1 ) \
{ \
out0 = ( RTYPE ) __msa_pckod_b( ( v16i8 ) in0, ( v16i8 ) in1 ); \
out1 = ( RTYPE ) __msa_pckod_b( ( v16i8 ) in2, ( v16i8 ) in3 ); \
}
#define PCKOD_B2_UB( ... ) PCKOD_B2( v16u8, __VA_ARGS__ )
#define PCKOD_B4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3 ) \
{ \
PCKOD_B2( RTYPE, in0, in1, in2, in3, out0, out1 ); \
PCKOD_B2( RTYPE, in4, in5, in6, in7, out2, out3 ); \
}
#define PCKOD_B4_UB( ... ) PCKOD_B4( v16u8, __VA_ARGS__ )
#define PCKOD_D2( RTYPE, in0, in1, in2, in3, out0, out1 ) \
{ \
out0 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) in0, ( v2i64 ) in1 ); \
out1 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) in2, ( v2i64 ) in3 ); \
}
#define PCKOD_D2_SH( ... ) PCKOD_D2( v8i16, __VA_ARGS__ )
#define PCKOD_D2_SD( ... ) PCKOD_D2( v2i64, __VA_ARGS__ )
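/* XOR every byte with 128 (flip the sign bit), typically used to move pixel
 * data between unsigned and sign-biased signed representations. */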
#define XORI_B2_128( RTYPE, in0, in1 ) \
{ \
in0 = ( RTYPE ) __msa_xori_b( ( v16u8 ) in0, 128 ); \
in1 = ( RTYPE ) __msa_xori_b( ( v16u8 ) in1, 128 ); \
}
#define XORI_B2_128_UB( ... ) XORI_B2_128( v16u8, __VA_ARGS__ )
#define XORI_B2_128_SB( ... ) XORI_B2_128( v16i8, __VA_ARGS__ )
#define XORI_B3_128( RTYPE, in0, in1, in2 ) \
{ \
XORI_B2_128( RTYPE, in0, in1 ); \
in2 = ( RTYPE ) __msa_xori_b( ( v16u8 ) in2, 128 ); \
}
#define XORI_B3_128_SB( ... ) XORI_B3_128( v16i8, __VA_ARGS__ )
#define XORI_B4_128( RTYPE, in0, in1, in2, in3 ) \
{ \
XORI_B2_128( RTYPE, in0, in1 ); \
XORI_B2_128( RTYPE, in2, in3 ); \
}
#define XORI_B4_128_UB( ... ) XORI_B4_128( v16u8, __VA_ARGS__ )
#define XORI_B4_128_SB( ... ) XORI_B4_128( v16i8, __VA_ARGS__ )
#define XORI_B5_128( RTYPE, in0, in1, in2, in3, in4 ) \
{ \
XORI_B3_128( RTYPE, in0, in1, in2 ); \
XORI_B2_128( RTYPE, in3, in4 ); \
}
#define XORI_B5_128_SB( ... ) XORI_B5_128( v16i8, __VA_ARGS__ )
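/* Saturating halfword adds, element-wise shifts, and arithmetic right
 * shifts with rounding (SRAR by vector amount, SRARI by immediate). */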
#define ADDS_SH2( RTYPE, in0, in1, in2, in3, out0, out1 ) \
{ \
out0 = ( RTYPE ) __msa_adds_s_h( ( v8i16 ) in0, ( v8i16 ) in1 ); \
out1 = ( RTYPE ) __msa_adds_s_h( ( v8i16 ) in2, ( v8i16 ) in3 ); \
}
#define ADDS_SH2_SH( ... ) ADDS_SH2( v8i16, __VA_ARGS__ )
#define ADDS_SH4( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3 ) \
{ \
ADDS_SH2( RTYPE, in0, in1, in2, in3, out0, out1 ); \
ADDS_SH2( RTYPE, in4, in5, in6, in7, out2, out3 ); \
}
#define ADDS_SH4_UH( ... ) ADDS_SH4( v8u16, __VA_ARGS__ )
#define SLLI_4V( in0, in1, in2, in3, shift ) \
{ \
in0 = in0 << shift; \
in1 = in1 << shift; \
in2 = in2 << shift; \
in3 = in3 << shift; \
}
#define SRA_4V( in0, in1, in2, in3, shift ) \
{ \
in0 = in0 >> shift; \
in1 = in1 >> shift; \
in2 = in2 >> shift; \
in3 = in3 >> shift; \
}
#define SRAR_H2( RTYPE, in0, in1, shift ) \
{ \
in0 = ( RTYPE ) __msa_srar_h( ( v8i16 ) in0, ( v8i16 ) shift ); \
in1 = ( RTYPE ) __msa_srar_h( ( v8i16 ) in1, ( v8i16 ) shift ); \
}
#define SRAR_H2_SH( ... ) SRAR_H2( v8i16, __VA_ARGS__ )
#define SRAR_H4( RTYPE, in0, in1, in2, in3, shift ) \
{ \
    SRAR_H2( RTYPE, in0, in1, shift );                                     \
    SRAR_H2( RTYPE, in2, in3, shift );                                     \
}
#define SRAR_H4_SH( ... ) SRAR_H4( v8i16, __VA_ARGS__ )
#define SRL_H4( RTYPE, in0, in1, in2, in3, shift ) \
{ \
in0 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in0, ( v8i16 ) shift ); \
in1 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in1, ( v8i16 ) shift ); \
in2 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in2, ( v8i16 ) shift ); \
in3 = ( RTYPE ) __msa_srl_h( ( v8i16 ) in3, ( v8i16 ) shift ); \
}
#define SRL_H4_UH( ... ) SRL_H4( v8u16, __VA_ARGS__ )
#define SRARI_H2( RTYPE, in0, in1, shift ) \
{ \
in0 = ( RTYPE ) __msa_srari_h( ( v8i16 ) in0, shift ); \
in1 = ( RTYPE ) __msa_srari_h( ( v8i16 ) in1, shift ); \
}
#define SRARI_H2_UH( ... ) SRARI_H2( v8u16, __VA_ARGS__ )
#define SRARI_H2_SH( ... ) SRARI_H2( v8i16, __VA_ARGS__ )
#define SRARI_H4( RTYPE, in0, in1, in2, in3, shift ) \
{ \
SRARI_H2( RTYPE, in0, in1, shift ); \
SRARI_H2( RTYPE, in2, in3, shift ); \
}
#define SRARI_H4_UH( ... ) SRARI_H4( v8u16, __VA_ARGS__ )
#define SRARI_H4_SH( ... ) SRARI_H4( v8i16, __VA_ARGS__ )
#define SRARI_W2( RTYPE, in0, in1, shift ) \
{ \
in0 = ( RTYPE ) __msa_srari_w( ( v4i32 ) in0, shift ); \
in1 = ( RTYPE ) __msa_srari_w( ( v4i32 ) in1, shift ); \
}
#define SRARI_W2_SW( ... ) SRARI_W2( v4i32, __VA_ARGS__ )
#define SRARI_W4( RTYPE, in0, in1, in2, in3, shift ) \
{ \
SRARI_W2( RTYPE, in0, in1, shift ); \
SRARI_W2( RTYPE, in2, in3, shift ); \
}
#define SRARI_W4_SW( ... ) SRARI_W4( v4i32, __VA_ARGS__ )
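/* Element-wise multiply, add and subtract over pairs of operands. */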
#define MUL2( in0, in1, in2, in3, out0, out1 ) \
{ \
out0 = in0 * in1; \
out1 = in2 * in3; \
}
#define MUL4( in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3 ) \
{ \
MUL2( in0, in1, in2, in3, out0, out1 ); \
MUL2( in4, in5, in6, in7, out2, out3 ); \
}
#define ADD2( in0, in1, in2, in3, out0, out1 ) \
{ \
out0 = in0 + in1; \
out1 = in2 + in3; \
}
#define ADD4( in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3 ) \
{ \
ADD2( in0, in1, in2, in3, out0, out1 ); \
ADD2( in4, in5, in6, in7, out2, out3 ); \
}
#define SUB4( in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3 ) \
{ \
out0 = in0 - in1; \
out1 = in2 - in3; \
out2 = in4 - in5; \
out3 = in6 - in7; \
}
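/* Unpack/widen: zero-extend bytes to halfwords and sign-extend halfwords
 * to words by interleaving with zero or with a sign mask. */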
#define UNPCK_R_SH_SW( in, out ) \
{ \
v8i16 sign_m; \
\
sign_m = __msa_clti_s_h( ( v8i16 ) in, 0 ); \
out = ( v4i32 ) __msa_ilvr_h( sign_m, ( v8i16 ) in ); \
}
#define UNPCK_UB_SH( in, out0, out1 ) \
{ \
v16i8 zero_m = { 0 }; \
\
ILVRL_B2_SH( zero_m, in, out0, out1 ); \
}
#define UNPCK_SH_SW( in, out0, out1 ) \
{ \
v8i16 tmp_m; \
\
tmp_m = __msa_clti_s_h( ( v8i16 ) in, 0 ); \
ILVRL_H2_SW( tmp_m, in, out0, out1 ); \
}
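/* Butterfly: pairwise sums of mirrored inputs followed by the matching
 * differences. */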
#define BUTTERFLY_4( in0, in1, in2, in3, out0, out1, out2, out3 ) \
{ \
out0 = in0 + in3; \
out1 = in1 + in2; \
\
out2 = in1 - in2; \
out3 = in0 - in3; \
}
#define BUTTERFLY_8( in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3, out4, out5, out6, out7 ) \
{ \
out0 = in0 + in7; \
out1 = in1 + in6; \
out2 = in2 + in5; \
out3 = in3 + in4; \
\
out4 = in3 - in4; \
out5 = in2 - in5; \
out6 = in1 - in6; \
out7 = in0 - in7; \
}
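/* In-register transposes of small blocks (4x4 up to 16x8) of byte,
 * halfword or word elements. */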
#define TRANSPOSE8x8_UB( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3, out4, out5, out6, out7 ) \
{ \
v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
v16i8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
\
ILVR_B4_SB( in2, in0, in3, in1, in6, in4, in7, in5, \
tmp0_m, tmp1_m, tmp2_m, tmp3_m ); \
ILVRL_B2_SB( tmp1_m, tmp0_m, tmp4_m, tmp5_m ); \
ILVRL_B2_SB( tmp3_m, tmp2_m, tmp6_m, tmp7_m ); \
ILVRL_W2( RTYPE, tmp6_m, tmp4_m, out0, out2 ); \
ILVRL_W2( RTYPE, tmp7_m, tmp5_m, out4, out6 ); \
SLDI_B2_0( RTYPE, out0, out2, out1, out3, 8 ); \
SLDI_B2_0( RTYPE, out4, out6, out5, out7, 8 ); \
}
#define TRANSPOSE8x8_UB_UB( ... ) TRANSPOSE8x8_UB( v16u8, __VA_ARGS__ )
#define TRANSPOSE16x8_UB_UB( in0, in1, in2, in3, in4, in5, in6, in7, \
in8, in9, in10, in11, in12, in13, in14, in15, \
out0, out1, out2, out3, out4, out5, out6, out7 ) \
{ \
v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
v16u8 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
\
ILVEV_D2_UB( in0, in8, in1, in9, out7, out6 ); \
ILVEV_D2_UB( in2, in10, in3, in11, out5, out4 ); \
ILVEV_D2_UB( in4, in12, in5, in13, out3, out2 ); \
ILVEV_D2_UB( in6, in14, in7, in15, out1, out0 ); \
\
tmp0_m = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out6, ( v16i8 ) out7 ); \
tmp4_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out6, ( v16i8 ) out7 ); \
tmp1_m = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out4, ( v16i8 ) out5 ); \
tmp5_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out4, ( v16i8 ) out5 ); \
out5 = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out2, ( v16i8 ) out3 ); \
tmp6_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out2, ( v16i8 ) out3 ); \
out7 = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) out0, ( v16i8 ) out1 ); \
tmp7_m = ( v16u8 ) __msa_ilvod_b( ( v16i8 ) out0, ( v16i8 ) out1 ); \
\
ILVEV_H2_UB( tmp0_m, tmp1_m, out5, out7, tmp2_m, tmp3_m ); \
out0 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \
out4 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \
\
tmp2_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp1_m, ( v8i16 ) tmp0_m ); \
tmp3_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) out7, ( v8i16 ) out5 ); \
out2 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \
out6 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \
\
ILVEV_H2_UB( tmp4_m, tmp5_m, tmp6_m, tmp7_m, tmp2_m, tmp3_m ); \
out1 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \
out5 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \
\
    tmp2_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp5_m, ( v8i16 ) tmp4_m ); \
    tmp3_m = ( v16u8 ) __msa_ilvod_h( ( v8i16 ) tmp7_m, ( v8i16 ) tmp6_m ); \
out3 = ( v16u8 ) __msa_ilvev_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \
out7 = ( v16u8 ) __msa_ilvod_w( ( v4i32 ) tmp3_m, ( v4i32 ) tmp2_m ); \
}
#define TRANSPOSE4x4_SH_SH( in0, in1, in2, in3, out0, out1, out2, out3 ) \
{ \
v8i16 s0_m, s1_m; \
\
ILVR_H2_SH( in1, in0, in3, in2, s0_m, s1_m ); \
ILVRL_W2_SH( s1_m, s0_m, out0, out2 ); \
out1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out0, ( v2i64 ) out0 ); \
out3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) out0, ( v2i64 ) out2 ); \
}
#define TRANSPOSE4X8_SH_SH( in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3, out4, out5, out6, out7 ) \
{ \
v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
v8i16 tmp0_n, tmp1_n, tmp2_n, tmp3_n; \
v8i16 zero_m = { 0 }; \
\
ILVR_H4_SH( in1, in0, in3, in2, in5, in4, in7, in6, \
tmp0_n, tmp1_n, tmp2_n, tmp3_n ); \
ILVRL_W2_SH( tmp1_n, tmp0_n, tmp0_m, tmp2_m ); \
ILVRL_W2_SH( tmp3_n, tmp2_n, tmp1_m, tmp3_m ); \
\
out0 = ( v8i16 ) __msa_ilvr_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp0_m ); \
out1 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp0_m ); \
out2 = ( v8i16 ) __msa_ilvr_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp2_m ); \
out3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp2_m ); \
\
out4 = zero_m; \
out5 = zero_m; \
out6 = zero_m; \
out7 = zero_m; \
}
#define TRANSPOSE8X4_SH_SH( in0, in1, in2, in3, out0, out1, out2, out3 ) \
{ \
v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
\
ILVR_H2_SH( in1, in0, in3, in2, tmp0_m, tmp1_m ); \
ILVL_H2_SH( in1, in0, in3, in2, tmp2_m, tmp3_m ); \
ILVR_W2_SH( tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2 ); \
ILVL_W2_SH( tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3 ); \
}
#define TRANSPOSE8x8_H( RTYPE, in0, in1, in2, in3, in4, in5, in6, in7, \
out0, out1, out2, out3, out4, out5, out6, out7 ) \
{ \
v8i16 s0_m, s1_m; \
v8i16 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
v8i16 tmp4_m, tmp5_m, tmp6_m, tmp7_m; \
\
ILVR_H2_SH( in6, in4, in7, in5, s0_m, s1_m ); \
ILVRL_H2_SH( s1_m, s0_m, tmp0_m, tmp1_m ); \
ILVL_H2_SH( in6, in4, in7, in5, s0_m, s1_m ); \
ILVRL_H2_SH( s1_m, s0_m, tmp2_m, tmp3_m ); \
ILVR_H2_SH( in2, in0, in3, in1, s0_m, s1_m ); \
ILVRL_H2_SH( s1_m, s0_m, tmp4_m, tmp5_m ); \
ILVL_H2_SH( in2, in0, in3, in1, s0_m, s1_m ); \
ILVRL_H2_SH( s1_m, s0_m, tmp6_m, tmp7_m ); \
PCKEV_D4( RTYPE, tmp0_m, tmp4_m, tmp1_m, tmp5_m, tmp2_m, tmp6_m, \
tmp3_m, tmp7_m, out0, out2, out4, out6 ); \
out1 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) tmp0_m, ( v2i64 ) tmp4_m ); \
out3 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) tmp1_m, ( v2i64 ) tmp5_m ); \
out5 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) tmp2_m, ( v2i64 ) tmp6_m ); \
out7 = ( RTYPE ) __msa_pckod_d( ( v2i64 ) tmp3_m, ( v2i64 ) tmp7_m ); \
}
#define TRANSPOSE8x8_SH_SH( ... ) TRANSPOSE8x8_H( v8i16, __VA_ARGS__ )
#define TRANSPOSE4x4_SW_SW( in0, in1, in2, in3, out0, out1, out2, out3 ) \
{ \
v4i32 s0_m, s1_m, s2_m, s3_m; \
\
ILVRL_W2_SW( in1, in0, s0_m, s1_m ); \
ILVRL_W2_SW( in3, in2, s2_m, s3_m ); \
\
out0 = ( v4i32 ) __msa_ilvr_d( ( v2i64 ) s2_m, ( v2i64 ) s0_m ); \
out1 = ( v4i32 ) __msa_ilvl_d( ( v2i64 ) s2_m, ( v2i64 ) s0_m ); \
out2 = ( v4i32 ) __msa_ilvr_d( ( v2i64 ) s3_m, ( v2i64 ) s1_m ); \
out3 = ( v4i32 ) __msa_ilvl_d( ( v2i64 ) s3_m, ( v2i64 ) s1_m ); \
}
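/* Add a 4x4 block of halfword residuals to the destination pixels,
 * clip to 0..255 and store the result. */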
#define ADDBLK_ST4x4_UB( in0, in1, in2, in3, p_dst, stride ) \
{ \
uint32_t src0_m, src1_m, src2_m, src3_m; \
uint32_t out0_m, out1_m, out2_m, out3_m; \
v8i16 inp0_m, inp1_m, res0_m, res1_m; \
v16i8 dst0_m = { 0 }; \
v16i8 dst1_m = { 0 }; \
v16i8 zero_m = { 0 }; \
\
    ILVR_D2_SH( in1, in0, in3, in2, inp0_m, inp1_m );                      \
LW4( p_dst, stride, src0_m, src1_m, src2_m, src3_m ); \
INSERT_W2_SB( src0_m, src1_m, dst0_m ); \
INSERT_W2_SB( src2_m, src3_m, dst1_m ); \
ILVR_B2_SH( zero_m, dst0_m, zero_m, dst1_m, res0_m, res1_m ); \
ADD2( res0_m, inp0_m, res1_m, inp1_m, res0_m, res1_m ); \
CLIP_SH2_0_255( res0_m, res1_m ); \
PCKEV_B2_SB( res0_m, res0_m, res1_m, res1_m, dst0_m, dst1_m ); \
\
out0_m = __msa_copy_u_w( ( v4i32 ) dst0_m, 0 ); \
out1_m = __msa_copy_u_w( ( v4i32 ) dst0_m, 1 ); \
out2_m = __msa_copy_u_w( ( v4i32 ) dst1_m, 0 ); \
out3_m = __msa_copy_u_w( ( v4i32 ) dst1_m, 1 ); \
SW4( out0_m, out1_m, out2_m, out3_m, p_dst, stride ); \
}
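/* Fused helpers: three-tap signed dot product, pack-even combined with the
 * xor-128 bias, and pack-even followed by a 4x4 or full-vector store. */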
#define DPADD_SH3_SH( in0, in1, in2, coeff0, coeff1, coeff2 ) \
( { \
v8i16 tmp1_m; \
v8i16 out0_m; \
\
out0_m = __msa_dotp_s_h( ( v16i8 ) in0, ( v16i8 ) coeff0 ); \
out0_m = __msa_dpadd_s_h( out0_m, ( v16i8 ) in1, ( v16i8 ) coeff1 ); \
tmp1_m = __msa_dotp_s_h( ( v16i8 ) in2, ( v16i8 ) coeff2 ); \
out0_m = __msa_adds_s_h( out0_m, tmp1_m ); \
\
out0_m; \
} )
#define PCKEV_XORI128_UB( in0, in1 ) \
( { \
v16u8 out_m; \
out_m = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in1, ( v16i8 ) in0 ); \
out_m = ( v16u8 ) __msa_xori_b( ( v16u8 ) out_m, 128 ); \
out_m; \
} )
#define PCKEV_ST4x4_UB( in0, in1, in2, in3, p_dst, stride ) \
{ \
uint32_t out0_m, out1_m, out2_m, out3_m; \
v16i8 tmp0_m, tmp1_m; \
\
PCKEV_B2_SB( in1, in0, in3, in2, tmp0_m, tmp1_m ); \
\
out0_m = __msa_copy_u_w( ( v4i32 ) tmp0_m, 0 ); \
out1_m = __msa_copy_u_w( ( v4i32 ) tmp0_m, 2 ); \
out2_m = __msa_copy_u_w( ( v4i32 ) tmp1_m, 0 ); \
out3_m = __msa_copy_u_w( ( v4i32 ) tmp1_m, 2 ); \
\
SW4( out0_m, out1_m, out2_m, out3_m, p_dst, stride ); \
}
#define PCKEV_ST_SB( in0, in1, p_dst ) \
{ \
v16i8 tmp_m; \
tmp_m = __msa_pckev_b( ( v16i8 ) in1, ( v16i8 ) in0 ); \
ST_SB( tmp_m, ( p_dst ) ); \
}
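/* Building blocks for the H.264 6-tap ( 1, -5, 20, 20, -5, 1 ) sub-pel luma
 * filter: a six-input multiply-accumulate with rounding and saturation, and
 * a horizontal filter driven by shuffle masks. */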
#define AVC_CALC_DPADD_H_6PIX_2COEFF_SH( in0, in1, in2, in3, in4, in5 ) \
( { \
v4i32 tmp0_m, tmp1_m; \
v8i16 out0_m, out1_m, out2_m, out3_m; \
v8i16 minus5h_m = __msa_ldi_h( -5 ); \
v8i16 plus20h_m = __msa_ldi_h( 20 ); \
\
ILVRL_H2_SW( in5, in0, tmp0_m, tmp1_m ); \
\
tmp0_m = __msa_hadd_s_w( ( v8i16 ) tmp0_m, ( v8i16 ) tmp0_m ); \
tmp1_m = __msa_hadd_s_w( ( v8i16 ) tmp1_m, ( v8i16 ) tmp1_m ); \
\
ILVRL_H2_SH( in1, in4, out0_m, out1_m ); \
DPADD_SH2_SW( out0_m, out1_m, minus5h_m, minus5h_m, tmp0_m, tmp1_m ); \
ILVRL_H2_SH( in2, in3, out2_m, out3_m ); \
DPADD_SH2_SW( out2_m, out3_m, plus20h_m, plus20h_m, tmp0_m, tmp1_m ); \
\
SRARI_W2_SW( tmp0_m, tmp1_m, 10 ); \
SAT_SW2_SW( tmp0_m, tmp1_m, 7 ); \
out0_m = __msa_pckev_h( ( v8i16 ) tmp1_m, ( v8i16 ) tmp0_m ); \
\
out0_m; \
} )
#define AVC_HORZ_FILTER_SH( in, mask0, mask1, mask2 ) \
( { \
v8i16 out0_m, out1_m; \
v16i8 tmp0_m, tmp1_m; \
v16i8 minus5b = __msa_ldi_b( -5 ); \
v16i8 plus20b = __msa_ldi_b( 20 ); \
\
tmp0_m = __msa_vshf_b( ( v16i8 ) mask0, in, in ); \
out0_m = __msa_hadd_s_h( tmp0_m, tmp0_m ); \
\
tmp0_m = __msa_vshf_b( ( v16i8 ) mask1, in, in ); \
out0_m = __msa_dpadd_s_h( out0_m, minus5b, tmp0_m ); \
\
tmp1_m = __msa_vshf_b( ( v16i8 ) ( mask2 ), in, in ); \
out1_m = __msa_dpadd_s_h( out0_m, plus20b, tmp1_m ); \
\
out1_m; \
} )
#endif