/*****************************************************************************
 * mc-c.c: msa motion compensation
 *****************************************************************************
 * Copyright (C) 2015-2016 x264 project
 *
 * Authors: Neha Rana <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at [email protected].
 *****************************************************************************/

#include "common/common.h"
#include "macros.h"
#include "mc.h"

#if !HIGH_BIT_DEPTH
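/* Shuffle-control tables for __msa_vshf_b.  Each 16-byte row pairs up the
 * source pixels that share a tap weight in the 6-tap luma filter (distance
 * 5, 3 and 1 pairs for the +1, -5 and +20 taps respectively); indices of 16
 * and above select bytes from the second source operand, which is how the
 * 4-pixel-wide cases pack two rows into one vector.  The pairing here is
 * inferred from how avc_luma_hz_16w_msa() consumes the masks below. */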
static const uint8_t pu_luma_mask_arr[16 * 8] =
{
    /* 8 width cases */
    0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
    1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
    2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
    /* 4 width cases */
    0, 5, 1, 6, 2, 7, 3, 8, 16, 21, 17, 22, 18, 23, 19, 24,
    1, 4, 2, 5, 3, 6, 4, 7, 17, 20, 18, 21, 19, 22, 20, 23,
    2, 3, 3, 4, 4, 5, 5, 6, 18, 19, 19, 20, 20, 21, 21, 22,
    2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25,
    3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26
};

static const uint8_t pu_chroma_mask_arr[16 * 5] =
{
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    0, 2, 2, 4, 4, 6, 6, 8, 16, 18, 18, 20, 20, 22, 22, 24,
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 16, 17, 17, 18, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 16, 17, 17, 18, 16, 17, 17, 18, 18, 19, 19, 20
};

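/* Prototypes for the MSA-optimised entry points provided by this file.
 * They follow the signatures of the function pointers in x264's motion
 * compensation table (see the mc.h include above) and are presumably wired
 * in by the MIPS-specific initialisation code. */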
void x264_mc_copy_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                           uint8_t *p_src, intptr_t i_src_stride,
                           int32_t i_height );
void x264_mc_copy_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                          uint8_t *p_src, intptr_t i_src_stride,
                          int32_t i_height );
void x264_mc_copy_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src,
                          intptr_t i_src_stride, int32_t i_height );
void x264_memzero_aligned_msa( void *p_dst, size_t n );

void x264_pixel_avg_16x16_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
                               uint8_t *p_pix2, intptr_t i_pix2_stride,
                               uint8_t *p_pix3, intptr_t i_pix3_stride,
                               int32_t i_weight );
void x264_pixel_avg_16x8_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
                              uint8_t *p_pix2, intptr_t i_pix2_stride,
                              uint8_t *p_pix3, intptr_t i_pix3_stride,
                              int32_t i_weight );
void x264_pixel_avg_8x16_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
                              uint8_t *p_pix2, intptr_t i_pix2_stride,
                              uint8_t *p_pix3, intptr_t i_pix3_stride,
                              int32_t i_weight );
void x264_pixel_avg_8x8_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
                             uint8_t *p_pix2, intptr_t i_pix2_stride,
                             uint8_t *p_pix3, intptr_t i_pix3_stride,
                             int32_t i_weight );
void x264_pixel_avg_8x4_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
                             uint8_t *p_pix2, intptr_t i_pix2_stride,
                             uint8_t *p_pix3, intptr_t i_pix3_stride,
                             int32_t i_weight );
void x264_pixel_avg_4x16_msa( uint8_t *p_pix1, intptr_t pix1_stride,
                              uint8_t *p_pix2, intptr_t pix2_stride,
                              uint8_t *p_pix3, intptr_t pix3_stride,
                              int32_t i_weight );
void x264_pixel_avg_4x8_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
                             uint8_t *p_pix2, intptr_t i_pix2_stride,
                             uint8_t *p_pix3, intptr_t i_pix3_stride,
                             int32_t i_weight );
void x264_pixel_avg_4x4_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
                             uint8_t *p_pix2, intptr_t i_pix2_stride,
                             uint8_t *p_pix3, intptr_t i_pix3_stride,
                             int32_t i_weight );
void x264_pixel_avg_4x2_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
                             uint8_t *p_pix2, intptr_t i_pix2_stride,
                             uint8_t *p_pix3, intptr_t i_pix3_stride,
                             int32_t i_weight );

void x264_mc_weight_w20_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                             uint8_t *p_src, intptr_t i_src_stride,
                             const x264_weight_t *pWeight, int32_t i_height );
void x264_mc_weight_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                            uint8_t *p_src, intptr_t i_src_stride,
                            const x264_weight_t *pWeight, int32_t i_height );
void x264_mc_weight_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                            uint8_t *p_src, intptr_t i_src_stride,
                            const x264_weight_t *pWeight, int32_t i_height );
void x264_mc_weight_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                             uint8_t *p_src, intptr_t i_src_stride,
                             const x264_weight_t *pWeight, int32_t i_height );

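/* Width-indexed table of the weighting kernels declared above (rows of
 * 4, 4, 8, 16, 16 and 20 pixels); presumably consumed through x264's
 * weight_fn_t dispatch in the same way as the other platform tables. */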
weight_fn_t x264_mc_weight_wtab_msa[6] =
{
    x264_mc_weight_w4_msa,
    x264_mc_weight_w4_msa,
    x264_mc_weight_w8_msa,
    x264_mc_weight_w16_msa,
    x264_mc_weight_w16_msa,
    x264_mc_weight_w20_msa,
};

void x264_mc_luma_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                       uint8_t *p_src[4], intptr_t i_src_stride,
                       int32_t m_vx, int32_t m_vy,
                       int32_t i_width, int32_t i_height,
                       const x264_weight_t *pWeight );
uint8_t *x264_get_ref_msa( uint8_t *p_dst, intptr_t *p_dst_stride,
                           uint8_t *p_src[4], intptr_t i_src_stride,
                           int32_t m_vx, int32_t m_vy,
                           int32_t i_width, int32_t i_height,
                           const x264_weight_t *pWeight );
void x264_mc_chroma_msa( uint8_t *p_dst_u, uint8_t *p_dst_v,
                         intptr_t i_dst_stride,
                         uint8_t *p_src, intptr_t i_src_stride,
                         int32_t m_vx, int32_t m_vy,
                         int32_t i_width, int32_t i_height );
void x264_hpel_filter_msa( uint8_t *p_dsth, uint8_t *p_dst_v,
                           uint8_t *p_dstc, uint8_t *p_src,
                           intptr_t i_stride, int32_t i_width,
                           int32_t i_height, int16_t *p_buf );

void x264_plane_copy_interleave_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                                     uint8_t *p_src0, intptr_t i_src_stride0,
                                     uint8_t *p_src1, intptr_t i_src_stride1,
                                     int32_t i_width, int32_t i_height );
void x264_plane_copy_deinterleave_msa( uint8_t *p_dst0, intptr_t i_dst_stride0,
                                       uint8_t *p_dst1, intptr_t i_dst_stride1,
                                       uint8_t *p_src, intptr_t i_src_stride,
                                       int32_t i_width, int32_t i_height );
void x264_plane_copy_deinterleave_rgb_msa( uint8_t *p_dst0,
                                           intptr_t i_dst_stride0,
                                           uint8_t *p_dst1,
                                           intptr_t i_dst_stride1,
                                           uint8_t *p_dst2,
                                           intptr_t i_dst_stride2,
                                           uint8_t *p_src,
                                           intptr_t i_src_stride,
                                           int32_t i_src_width, int32_t i_width,
                                           int32_t i_height );
void x264_store_interleave_chroma_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                                       uint8_t *p_src0, uint8_t *p_src1,
                                       int32_t i_height );
void x264_load_deinterleave_chroma_fenc_msa( uint8_t *p_dst, uint8_t *p_src,
                                             intptr_t i_src_stride,
                                             int32_t i_height );
void x264_load_deinterleave_chroma_fdec_msa( uint8_t *p_dst, uint8_t *p_src,
                                             intptr_t i_src_stride,
                                             int32_t i_height );
void x264_frame_init_lowres_core_msa( uint8_t *p_src, uint8_t *p_dst0,
                                      uint8_t *p_dst1, uint8_t *p_dst2,
                                      uint8_t *p_dst3, intptr_t i_src_stride,
                                      intptr_t i_dst_stride, int32_t i_width,
                                      int32_t i_height );

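/* Horizontal 6-tap luma half-pel filter for 16-pixel-wide blocks.  The HADD
 * pass adds the +1 tap pairs and the two DPADD passes apply the -5 and +20
 * taps, i.e. the H.264 kernel (1, -5, 20, 20, -5, 1); SRARI by 5 is the
 * rounded >>5 normalisation, and the XORI-with-128 steps convert between
 * unsigned pixels and the signed range used by the dot products.
 * Scalar equivalent (a sketch, ignoring the vector packing):
 *   dst[x] = clip( ( src[x-2] - 5*src[x-1] + 20*src[x] + 20*src[x+1]
 *                    - 5*src[x+2] + src[x+3] + 16 ) >> 5 );
 */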
static void avc_luma_hz_16w_msa( uint8_t *p_src, int32_t i_src_stride,
                                 uint8_t *p_dst, int32_t i_dst_stride,
                                 int32_t i_height )
{
    uint32_t u_loop_cnt, u_h4w;
    v16u8 dst0;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
    v16i8 minus5b = __msa_ldi_b( -5 );
    v16i8 plus20b = __msa_ldi_b( 20 );

    u_h4w = i_height % 4;
    LD_SB3( &pu_luma_mask_arr[0], 16, mask0, mask1, mask2 );

    for( u_loop_cnt = ( i_height >> 2 ); u_loop_cnt--; )
    {
        LD_SB2( p_src, 8, src0, src1 );
        p_src += i_src_stride;
        LD_SB2( p_src, 8, src2, src3 );
        p_src += i_src_stride;

        XORI_B4_128_SB( src0, src1, src2, src3 );
        VSHF_B2_SB( src0, src0, src1, src1, mask0, mask0, vec0, vec3 );
        VSHF_B2_SB( src2, src2, src3, src3, mask0, mask0, vec6, vec9 );
        VSHF_B2_SB( src0, src0, src1, src1, mask1, mask1, vec1, vec4 );
        VSHF_B2_SB( src2, src2, src3, src3, mask1, mask1, vec7, vec10 );
        VSHF_B2_SB( src0, src0, src1, src1, mask2, mask2, vec2, vec5 );
        VSHF_B2_SB( src2, src2, src3, src3, mask2, mask2, vec8, vec11 );
        HADD_SB4_SH( vec0, vec3, vec6, vec9, res0, res1, res2, res3 );
        DPADD_SB4_SH( vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                      minus5b, res0, res1, res2, res3 );
        DPADD_SB4_SH( vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                      plus20b, res0, res1, res2, res3 );

        LD_SB2( p_src, 8, src4, src5 );
        p_src += i_src_stride;
        LD_SB2( p_src, 8, src6, src7 );
        p_src += i_src_stride;

        XORI_B4_128_SB( src4, src5, src6, src7 );
        VSHF_B2_SB( src4, src4, src5, src5, mask0, mask0, vec0, vec3 );
        VSHF_B2_SB( src6, src6, src7, src7, mask0, mask0, vec6, vec9 );
        VSHF_B2_SB( src4, src4, src5, src5, mask1, mask1, vec1, vec4 );
        VSHF_B2_SB( src6, src6, src7, src7, mask1, mask1, vec7, vec10 );
        VSHF_B2_SB( src4, src4, src5, src5, mask2, mask2, vec2, vec5 );
        VSHF_B2_SB( src6, src6, src7, src7, mask2, mask2, vec8, vec11 );
        HADD_SB4_SH( vec0, vec3, vec6, vec9, res4, res5, res6, res7 );
        DPADD_SB4_SH( vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                      minus5b, res4, res5, res6, res7 );
        DPADD_SB4_SH( vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                      plus20b, res4, res5, res6, res7 );
        SRARI_H4_SH( res0, res1, res2, res3, 5 );
        SRARI_H4_SH( res4, res5, res6, res7, 5 );
        SAT_SH4_SH( res0, res1, res2, res3, 7 );
        SAT_SH4_SH( res4, res5, res6, res7, 7 );
        PCKEV_B4_SB( res1, res0, res3, res2, res5, res4, res7, res6,
                     vec0, vec1, vec2, vec3 );
        XORI_B4_128_SB( vec0, vec1, vec2, vec3 );

        ST_SB4( vec0, vec1, vec2, vec3, p_dst, i_dst_stride );
        p_dst += ( 4 * i_dst_stride );
    }

    for( u_loop_cnt = u_h4w; u_loop_cnt--; )
    {
        LD_SB2( p_src, 8, src0, src1 );
        p_src += i_src_stride;

        XORI_B2_128_SB( src0, src1 );
        VSHF_B2_SB( src0, src0, src1, src1, mask0, mask0, vec0, vec3 );
        VSHF_B2_SB( src0, src0, src1, src1, mask1, mask1, vec1, vec4 );
        VSHF_B2_SB( src0, src0, src1, src1, mask2, mask2, vec2, vec5 );
        res0 = __msa_hadd_s_h( vec0, vec0 );
        DPADD_SB2_SH( vec1, vec2, minus5b, plus20b, res0, res0 );
        res1 = __msa_hadd_s_h( vec3, vec3 );
        DPADD_SB2_SH( vec4, vec5, minus5b, plus20b, res1, res1 );
        SRARI_H2_SH( res0, res1, 5 );
        SAT_SH2_SH( res0, res1, 7 );
        dst0 = PCKEV_XORI128_UB( res0, res1 );
        ST_UB( dst0, p_dst );
        p_dst += i_dst_stride;
    }
}

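/* Vertical 6-tap luma half-pel filter for 16-wide blocks.  The halfword
 * constants 0xfb01, 0x1414 and 0x1fb pack the tap pairs (+1, -5),
 * (+20, +20) and (-5, +1) as signed bytes; they are applied with three
 * dot-product accumulations over row pairs interleaved by ILVR/ILVL, and
 * the five rows loaded up front prime the sliding window that the main
 * loop rotates at the end of each iteration. */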
static void avc_luma_vt_16w_msa( uint8_t *p_src, int32_t i_src_stride,
265
uint8_t *p_dst, int32_t i_dst_stride,
266
int32_t i_height )
267
{
268
uint32_t u_loop_cnt, u_h4w;
269
const int16_t i_filt_const0 = 0xfb01;
270
const int16_t i_filt_const1 = 0x1414;
271
const int16_t i_filt_const2 = 0x1fb;
272
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
273
v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
274
v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
275
v16i8 src65_l, src87_l;
276
v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
277
v16u8 res0, res1, res2, res3;
278
v16i8 filt0, filt1, filt2;
279
280
u_h4w = i_height % 4;
281
filt0 = ( v16i8 ) __msa_fill_h( i_filt_const0 );
282
filt1 = ( v16i8 ) __msa_fill_h( i_filt_const1 );
283
filt2 = ( v16i8 ) __msa_fill_h( i_filt_const2 );
284
285
LD_SB5( p_src, i_src_stride, src0, src1, src2, src3, src4 );
286
p_src += ( 5 * i_src_stride );
287
288
XORI_B5_128_SB( src0, src1, src2, src3, src4 );
289
ILVR_B4_SB( src1, src0, src2, src1, src3, src2, src4, src3,
290
src10_r, src21_r, src32_r, src43_r );
291
ILVL_B4_SB( src1, src0, src2, src1, src3, src2, src4, src3,
292
src10_l, src21_l, src32_l, src43_l );
293
294
for( u_loop_cnt = ( i_height >> 2 ); u_loop_cnt--; )
295
{
296
LD_SB4( p_src, i_src_stride, src5, src6, src7, src8 );
297
p_src += ( 4 * i_src_stride );
298
299
XORI_B4_128_SB( src5, src6, src7, src8 );
300
ILVR_B4_SB( src5, src4, src6, src5, src7, src6, src8, src7,
301
src54_r, src65_r, src76_r, src87_r );
302
ILVL_B4_SB( src5, src4, src6, src5, src7, src6, src8, src7,
303
src54_l, src65_l, src76_l, src87_l );
304
out0_r = DPADD_SH3_SH( src10_r, src32_r, src54_r,
305
filt0, filt1, filt2 );
306
out1_r = DPADD_SH3_SH( src21_r, src43_r, src65_r,
307
filt0, filt1, filt2 );
308
out2_r = DPADD_SH3_SH( src32_r, src54_r, src76_r,
309
filt0, filt1, filt2 );
310
out3_r = DPADD_SH3_SH( src43_r, src65_r, src87_r,
311
filt0, filt1, filt2 );
312
out0_l = DPADD_SH3_SH( src10_l, src32_l, src54_l,
313
filt0, filt1, filt2 );
314
out1_l = DPADD_SH3_SH( src21_l, src43_l, src65_l,
315
filt0, filt1, filt2 );
316
out2_l = DPADD_SH3_SH( src32_l, src54_l, src76_l,
317
filt0, filt1, filt2 );
318
out3_l = DPADD_SH3_SH( src43_l, src65_l, src87_l,
319
filt0, filt1, filt2 );
320
SRARI_H4_SH( out0_r, out1_r, out2_r, out3_r, 5 );
321
SAT_SH4_SH( out0_r, out1_r, out2_r, out3_r, 7 );
322
SRARI_H4_SH( out0_l, out1_l, out2_l, out3_l, 5 );
323
SAT_SH4_SH( out0_l, out1_l, out2_l, out3_l, 7 );
324
PCKEV_B4_UB( out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
325
out3_r, res0, res1, res2, res3 );
326
XORI_B4_128_UB( res0, res1, res2, res3 );
327
328
ST_UB4( res0, res1, res2, res3, p_dst, i_dst_stride );
329
p_dst += ( 4 * i_dst_stride );
330
331
src10_r = src54_r;
332
src32_r = src76_r;
333
src21_r = src65_r;
334
src43_r = src87_r;
335
src10_l = src54_l;
336
src32_l = src76_l;
337
src21_l = src65_l;
338
src43_l = src87_l;
339
src4 = src8;
340
}
341
342
for( u_loop_cnt = u_h4w; u_loop_cnt--; )
343
{
344
src5 = LD_SB( p_src );
345
p_src += ( i_src_stride );
346
src5 = ( v16i8 ) __msa_xori_b( ( v16u8 ) src5, 128 );
347
ILVRL_B2_SB( src5, src4, src54_r, src54_l );
348
out0_r = DPADD_SH3_SH( src10_r, src32_r, src54_r,
349
filt0, filt1, filt2 );
350
out0_l = DPADD_SH3_SH( src10_l, src32_l, src54_l,
351
filt0, filt1, filt2 );
352
SRARI_H2_SH( out0_r, out0_l, 5 );
353
SAT_SH2_SH( out0_r, out0_l, 7 );
354
out0_r = ( v8i16 ) __msa_pckev_b( ( v16i8 ) out0_l, ( v16i8 ) out0_r );
355
res0 = __msa_xori_b( ( v16u8 ) out0_r, 128 );
356
ST_UB( res0, p_dst );
357
p_dst += i_dst_stride;
358
359
src10_r = src21_r;
360
src21_r = src32_r;
361
src32_r = src43_r;
362
src43_r = src54_r;
363
364
src10_l = src21_l;
365
src21_l = src32_l;
366
src32_l = src43_l;
367
src43_l = src54_l;
368
369
src4 = src5;
370
}
371
}
372
373
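/* Centre ("hv") luma half-pel filter, 8-wide: each row is first run through
 * the horizontal 6-tap filter into 16-bit intermediates (AVC_HORZ_FILTER_SH),
 * then six intermediate rows are combined vertically per output row
 * (AVC_CALC_DPADD_H_6PIX_2COEFF_SH) - the usual two-stage scheme for the
 * centre half-pel position; the final normalisation happens inside that
 * macro. */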
static void avc_luma_mid_8w_msa( uint8_t *p_src, int32_t i_src_stride,
374
uint8_t *p_dst, int32_t i_dst_stride,
375
int32_t i_height )
376
{
377
uint32_t u_loop_cnt, u_h4w;
378
uint64_t u_out0;
379
v16i8 tmp0;
380
v16i8 src0, src1, src2, src3, src4;
381
v16i8 mask0, mask1, mask2;
382
v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
383
v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
384
v8i16 dst0, dst1, dst2, dst3;
385
v16u8 out0, out1;
386
387
u_h4w = i_height % 4;
388
LD_SB3( &pu_luma_mask_arr[0], 16, mask0, mask1, mask2 );
389
390
LD_SB5( p_src, i_src_stride, src0, src1, src2, src3, src4 );
391
XORI_B5_128_SB( src0, src1, src2, src3, src4 );
392
p_src += ( 5 * i_src_stride );
393
394
hz_out0 = AVC_HORZ_FILTER_SH( src0, mask0, mask1, mask2 );
395
hz_out1 = AVC_HORZ_FILTER_SH( src1, mask0, mask1, mask2 );
396
hz_out2 = AVC_HORZ_FILTER_SH( src2, mask0, mask1, mask2 );
397
hz_out3 = AVC_HORZ_FILTER_SH( src3, mask0, mask1, mask2 );
398
hz_out4 = AVC_HORZ_FILTER_SH( src4, mask0, mask1, mask2 );
399
400
for( u_loop_cnt = ( i_height >> 2 ); u_loop_cnt--; )
401
{
402
LD_SB4( p_src, i_src_stride, src0, src1, src2, src3 );
403
XORI_B4_128_SB( src0, src1, src2, src3 );
404
p_src += ( 4 * i_src_stride );
405
406
hz_out5 = AVC_HORZ_FILTER_SH( src0, mask0, mask1, mask2 );
407
hz_out6 = AVC_HORZ_FILTER_SH( src1, mask0, mask1, mask2 );
408
hz_out7 = AVC_HORZ_FILTER_SH( src2, mask0, mask1, mask2 );
409
hz_out8 = AVC_HORZ_FILTER_SH( src3, mask0, mask1, mask2 );
410
dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out0, hz_out1, hz_out2,
411
hz_out3, hz_out4, hz_out5 );
412
dst1 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out1, hz_out2, hz_out3,
413
hz_out4, hz_out5, hz_out6 );
414
dst2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out2, hz_out3, hz_out4,
415
hz_out5, hz_out6, hz_out7 );
416
dst3 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out3, hz_out4, hz_out5,
417
hz_out6, hz_out7, hz_out8 );
418
out0 = PCKEV_XORI128_UB( dst0, dst1 );
419
out1 = PCKEV_XORI128_UB( dst2, dst3 );
420
ST8x4_UB( out0, out1, p_dst, i_dst_stride );
421
422
p_dst += ( 4 * i_dst_stride );
423
hz_out3 = hz_out7;
424
hz_out1 = hz_out5;
425
hz_out5 = hz_out4;
426
hz_out4 = hz_out8;
427
hz_out2 = hz_out6;
428
hz_out0 = hz_out5;
429
}
430
431
for( u_loop_cnt = u_h4w; u_loop_cnt--; )
432
{
433
src0 = LD_SB( p_src );
434
p_src += i_src_stride;
435
436
src0 = ( v16i8 ) __msa_xori_b( ( v16u8 ) src0, 128 );
437
hz_out5 = AVC_HORZ_FILTER_SH( src0, mask0, mask1, mask2 );
438
439
dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out0, hz_out1,
440
hz_out2, hz_out3,
441
hz_out4, hz_out5 );
442
443
tmp0 = __msa_pckev_b( ( v16i8 ) ( dst0 ), ( v16i8 ) ( dst0 ) );
444
tmp0 = ( v16i8 ) __msa_xori_b( ( v16u8 ) tmp0, 128 );
445
u_out0 = __msa_copy_u_d( ( v2i64 ) tmp0, 0 );
446
SD( u_out0, p_dst );
447
p_dst += i_dst_stride;
448
449
hz_out0 = hz_out1;
450
hz_out1 = hz_out2;
451
hz_out2 = hz_out3;
452
hz_out3 = hz_out4;
453
hz_out4 = hz_out5;
454
}
455
}
456
457
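/* 16-wide centre half-pel filter, handled as two independent 8-wide halves. */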
static void avc_luma_mid_16w_msa( uint8_t *p_src, int32_t i_src_stride,
458
uint8_t *p_dst, int32_t i_dst_stride,
459
int32_t i_height )
460
{
461
uint32_t u_multiple8_cnt;
462
463
for( u_multiple8_cnt = 2; u_multiple8_cnt--; )
464
{
465
avc_luma_mid_8w_msa( p_src, i_src_stride, p_dst, i_dst_stride,
466
i_height );
467
p_src += 8;
468
p_dst += 8;
469
}
470
}
471
472
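/* 2x2 bilinear chroma filter on an interleaved (U/V paired, NV12-style)
 * source.  pu_chroma_mask_arr[16] gathers one component (even bytes) and
 * mask + 1 the other; the horizontal weights are applied as a byte dot
 * product, the vertical ones as halfword multiplies, and SRARI by 6 is the
 * /64 rounding that matches the 8x8 weight normalisation of H.264 chroma
 * interpolation.  U and V results are written to separate planes. */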
static void avc_interleaved_chroma_hv_2x2_msa( uint8_t *p_src,
473
int32_t i_src_stride,
474
uint8_t *p_dst_u,
475
uint8_t *p_dst_v,
476
int32_t i_dst_stride,
477
uint32_t u_coef_hor0,
478
uint32_t u_coef_hor1,
479
uint32_t u_coef_ver0,
480
uint32_t u_coef_ver1 )
481
{
482
uint16_t u_out0, u_out1, u_out2, u_out3;
483
v16u8 src0, src1, src2, src3, src4;
484
v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
485
v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
486
v16i8 mask;
487
v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
488
v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
489
v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
490
v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
491
v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );
492
v8i16 res0, res1;
493
494
mask = LD_SB( &pu_chroma_mask_arr[16] );
495
496
LD_UB3( p_src, i_src_stride, src0, src1, src2 );
497
VSHF_B2_UB( src0, src1, src1, src2,
498
( mask + 1 ), ( mask + 1 ), src3, src4 );
499
VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 );
500
DOTP_UB4_UH( src0, src1, src3, src4, coeff_hz_vec, coeff_hz_vec,
501
coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
502
res_hz3 );
503
MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
504
coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
505
res_vt3 );
506
ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt2 );
507
SRARI_H2_UH( res_vt0, res_vt2, 6 );
508
SAT_UH2_UH( res_vt0, res_vt2, 7 );
509
PCKEV_B2_SH( res_vt0, res_vt0, res_vt2, res_vt2, res0, res1 );
510
511
u_out0 = __msa_copy_u_h( res0, 0 );
512
u_out1 = __msa_copy_u_h( res0, 2 );
513
u_out2 = __msa_copy_u_h( res1, 0 );
514
u_out3 = __msa_copy_u_h( res1, 2 );
515
516
SH( u_out0, p_dst_u );
517
p_dst_u += i_dst_stride;
518
SH( u_out1, p_dst_u );
519
520
SH( u_out2, p_dst_v );
521
p_dst_v += i_dst_stride;
522
SH( u_out3, p_dst_v );
523
}
524
525
static void avc_interleaved_chroma_hv_2x4_msa( uint8_t *p_src,
526
int32_t i_src_stride,
527
uint8_t *p_dst_u,
528
uint8_t *p_dst_v,
529
int32_t i_dst_stride,
530
uint32_t u_coef_hor0,
531
uint32_t u_coef_hor1,
532
uint32_t u_coef_ver0,
533
uint32_t u_coef_ver1 )
534
{
535
uint16_t u_out0, u_out1, u_out2, u_out3;
536
v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
537
v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
538
v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
539
v16i8 mask;
540
v8i16 res0, res1;
541
v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
542
v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
543
v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
544
v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
545
v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );
546
547
mask = LD_SB( &pu_chroma_mask_arr[16] );
548
549
LD_UB5( p_src, i_src_stride, src0, src1, src2, src3, src4 );
550
551
VSHF_B2_UB( src0, src1, src1, src2,
552
( mask + 1 ), ( mask + 1 ), src5, src6 );
553
VSHF_B2_UB( src2, src3, src3, src4,
554
( mask + 1 ), ( mask + 1 ), src7, src8 );
555
VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 );
556
VSHF_B2_UB( src2, src3, src3, src4, mask, mask, src2, src3 );
557
DOTP_UB4_UH( src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
558
coeff_hz_vec, coeff_hz_vec, res_hz0,
559
res_hz1, res_hz2, res_hz3 );
560
MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
561
coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
562
res_vt3 );
563
ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 );
564
SRARI_H2_UH( res_vt0, res_vt1, 6 );
565
SAT_UH2_UH( res_vt0, res_vt1, 7 );
566
PCKEV_B2_SH( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 );
567
568
u_out0 = __msa_copy_u_h( res0, 0 );
569
u_out1 = __msa_copy_u_h( res0, 2 );
570
u_out2 = __msa_copy_u_h( res1, 0 );
571
u_out3 = __msa_copy_u_h( res1, 2 );
572
573
SH( u_out0, p_dst_u );
574
p_dst_u += i_dst_stride;
575
SH( u_out1, p_dst_u );
576
p_dst_u += i_dst_stride;
577
SH( u_out2, p_dst_u );
578
p_dst_u += i_dst_stride;
579
SH( u_out3, p_dst_u );
580
581
DOTP_UB4_UH( src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
582
coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
583
res_hz3 );
584
MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
585
coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
586
res_vt3 );
587
ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 );
588
SRARI_H2_UH( res_vt0, res_vt1, 6 );
589
SAT_UH2_UH( res_vt0, res_vt1, 7 );
590
PCKEV_B2_SH( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 );
591
592
u_out0 = __msa_copy_u_h( res0, 0 );
593
u_out1 = __msa_copy_u_h( res0, 2 );
594
u_out2 = __msa_copy_u_h( res1, 0 );
595
u_out3 = __msa_copy_u_h( res1, 2 );
596
597
SH( u_out0, p_dst_v );
598
p_dst_v += i_dst_stride;
599
SH( u_out1, p_dst_v );
600
p_dst_v += i_dst_stride;
601
SH( u_out2, p_dst_v );
602
p_dst_v += i_dst_stride;
603
SH( u_out3, p_dst_v );
604
}
605
606
static void avc_interleaved_chroma_hv_2w_msa( uint8_t *p_src,
607
int32_t i_src_stride,
608
uint8_t *p_dst_u,
609
uint8_t *p_dst_v,
610
int32_t i_dst_stride,
611
uint32_t u_coef_hor0,
612
uint32_t u_coef_hor1,
613
uint32_t u_coef_ver0,
614
uint32_t u_coef_ver1,
615
int32_t i_height )
616
{
617
if( 2 == i_height )
618
{
619
avc_interleaved_chroma_hv_2x2_msa( p_src, i_src_stride,
620
p_dst_u, p_dst_v, i_dst_stride,
621
u_coef_hor0, u_coef_hor1,
622
u_coef_ver0, u_coef_ver1 );
623
}
624
else if( 4 == i_height )
625
{
626
avc_interleaved_chroma_hv_2x4_msa( p_src, i_src_stride,
627
p_dst_u, p_dst_v, i_dst_stride,
628
u_coef_hor0, u_coef_hor1,
629
u_coef_ver0, u_coef_ver1 );
630
}
631
}
632
633
static void avc_interleaved_chroma_hv_4x2_msa( uint8_t *p_src,
634
int32_t i_src_stride,
635
uint8_t *p_dst_u,
636
uint8_t *p_dst_v,
637
int32_t i_dst_stride,
638
uint32_t u_coef_hor0,
639
uint32_t u_coef_hor1,
640
uint32_t u_coef_ver0,
641
uint32_t u_coef_ver1 )
642
{
643
uint32_t u_out0, u_out1, u_out2, u_out3;
644
v16u8 src0, src1, src2, src3, src4;
645
v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
646
v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
647
v16i8 mask;
648
v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
649
v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
650
v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
651
v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
652
v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );
653
v4i32 res0, res1;
654
655
mask = LD_SB( &pu_chroma_mask_arr[16] );
656
657
LD_UB3( p_src, i_src_stride, src0, src1, src2 );
658
VSHF_B2_UB( src0, src1, src1, src2,
659
( mask + 1 ), ( mask + 1 ), src3, src4 );
660
VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 );
661
DOTP_UB4_UH( src0, src1, src3, src4, coeff_hz_vec, coeff_hz_vec,
662
coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
663
res_hz3 );
664
MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
665
coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
666
res_vt3 );
667
ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt2 );
668
SRARI_H2_UH( res_vt0, res_vt2, 6 );
669
SAT_UH2_UH( res_vt0, res_vt2, 7 );
670
PCKEV_B2_SW( res_vt0, res_vt0, res_vt2, res_vt2, res0, res1 );
671
672
u_out0 = __msa_copy_u_w( res0, 0 );
673
u_out1 = __msa_copy_u_w( res0, 1 );
674
u_out2 = __msa_copy_u_w( res1, 0 );
675
u_out3 = __msa_copy_u_w( res1, 1 );
676
SW( u_out0, p_dst_u );
677
p_dst_u += i_dst_stride;
678
SW( u_out1, p_dst_u );
679
SW( u_out2, p_dst_v );
680
p_dst_v += i_dst_stride;
681
SW( u_out3, p_dst_v );
682
}
683
684
static void avc_interleaved_chroma_hv_4x4mul_msa( uint8_t *p_src,
685
int32_t i_src_stride,
686
uint8_t *p_dst_u,
687
uint8_t *p_dst_v,
688
int32_t i_dst_stride,
689
uint32_t u_coef_hor0,
690
uint32_t u_coef_hor1,
691
uint32_t u_coef_ver0,
692
uint32_t u_coef_ver1,
693
int32_t i_height )
694
{
695
uint32_t u_row;
696
v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
697
v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
698
v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
699
v16i8 mask;
700
v4i32 res0, res1;
701
v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
702
v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
703
v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
704
v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
705
v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );
706
707
mask = LD_SB( &pu_chroma_mask_arr[16] );
708
709
src0 = LD_UB( p_src );
710
p_src += i_src_stride;
711
712
for( u_row = ( i_height >> 2 ); u_row--; )
713
{
714
LD_UB4( p_src, i_src_stride, src1, src2, src3, src4 );
715
p_src += ( 4 * i_src_stride );
716
717
VSHF_B2_UB( src0, src1, src1, src2,
718
( mask + 1 ), ( mask + 1 ), src5, src6 );
719
VSHF_B2_UB( src2, src3, src3, src4,
720
( mask + 1 ), ( mask + 1 ), src7, src8 );
721
VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 );
722
VSHF_B2_UB( src2, src3, src3, src4, mask, mask, src2, src3 );
723
DOTP_UB4_UH( src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
724
coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
725
res_hz3 );
726
MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
727
coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
728
res_vt3 );
729
ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 );
730
SRARI_H2_UH( res_vt0, res_vt1, 6 );
731
SAT_UH2_UH( res_vt0, res_vt1, 7 );
732
PCKEV_B2_SW( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 );
733
734
ST4x4_UB( res0, res1, 0, 1, 0, 1, p_dst_u, i_dst_stride );
735
p_dst_u += ( 4 * i_dst_stride );
736
737
DOTP_UB4_UH( src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
738
coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
739
res_hz3 );
740
MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
741
coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
742
res_vt3 );
743
ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 );
744
SRARI_H2_UH( res_vt0, res_vt1, 6 );
745
SAT_UH2_UH( res_vt0, res_vt1, 7 );
746
PCKEV_B2_SW( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 );
747
748
ST4x4_UB( res0, res1, 0, 1, 0, 1, p_dst_v, i_dst_stride );
749
p_dst_v += ( 4 * i_dst_stride );
750
src0 = src4;
751
}
752
}
753
754
static void avc_interleaved_chroma_hv_4w_msa( uint8_t *p_src,
755
int32_t i_src_stride,
756
uint8_t *p_dst_u,
757
uint8_t *p_dst_v,
758
int32_t i_dst_stride,
759
uint32_t u_coef_hor0,
760
uint32_t u_coef_hor1,
761
uint32_t u_coef_ver0,
762
uint32_t u_coef_ver1,
763
int32_t i_height )
764
{
765
if( 2 == i_height )
766
{
767
avc_interleaved_chroma_hv_4x2_msa( p_src, i_src_stride,
768
p_dst_u, p_dst_v, i_dst_stride,
769
u_coef_hor0, u_coef_hor1,
770
u_coef_ver0, u_coef_ver1 );
771
}
772
else
773
{
774
avc_interleaved_chroma_hv_4x4mul_msa( p_src, i_src_stride,
775
p_dst_u, p_dst_v, i_dst_stride,
776
u_coef_hor0, u_coef_hor1,
777
u_coef_ver0, u_coef_ver1,
778
i_height );
779
}
780
}
781
782
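/* 8-wide variant of the interleaved chroma H/V filter.  Here the shuffle
 * mask is built inline ({ 0, 2, 2, 4, ... }) to gather one chroma component
 * from the 32 interleaved source bytes loaded per row, with mask + 1
 * selecting the other component; the horizontal dot products of the
 * previous row are carried across iterations in res_hz0/res_hz5. */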
static void avc_interleaved_chroma_hv_8w_msa( uint8_t *p_src,
783
int32_t i_src_stride,
784
uint8_t *p_dst_u,
785
uint8_t *p_dst_v,
786
int32_t i_dst_stride,
787
uint32_t u_coef_hor0,
788
uint32_t u_coef_hor1,
789
uint32_t u_coef_ver0,
790
uint32_t u_coef_ver1,
791
int32_t i_height )
792
{
793
uint32_t u_row;
794
v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
795
v16u8 src10, src11, src12, src13, src14;
796
v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5;
797
v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
798
v16i8 mask = { 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, 16 };
799
v16i8 coeff_hz_vec0, coeff_hz_vec1;
800
v16i8 tmp0, tmp1;
801
v16u8 coeff_hz_vec;
802
v8u16 coeff_vt_vec0, coeff_vt_vec1;
803
804
coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
805
coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
806
coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
807
coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
808
coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );
809
810
LD_UB2( p_src, 16, src0, src13 );
811
p_src += i_src_stride;
812
813
VSHF_B2_UB( src0, src13, src0, src13, ( mask + 1 ), mask, src14, src0 );
814
DOTP_UB2_UH( src0, src14, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz5 );
815
816
for( u_row = ( i_height >> 2 ); u_row--; )
817
{
818
LD_UB4( p_src, i_src_stride, src1, src2, src3, src4 );
819
LD_UB4( p_src + 16, i_src_stride, src5, src6, src7, src8 );
820
p_src += ( 4 * i_src_stride );
821
822
VSHF_B2_UB( src1, src5, src2, src6, mask, mask, src9, src10 );
823
VSHF_B2_UB( src3, src7, src4, src8, mask, mask, src11, src12 );
824
DOTP_UB4_UH( src9, src10, src11, src12, coeff_hz_vec, coeff_hz_vec,
825
coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
826
res_hz4 );
827
MUL4( res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
828
coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
829
res_vt3 );
830
831
res_vt0 += ( res_hz0 * coeff_vt_vec1 );
832
res_vt1 += ( res_hz1 * coeff_vt_vec1 );
833
res_vt2 += ( res_hz2 * coeff_vt_vec1 );
834
res_vt3 += ( res_hz3 * coeff_vt_vec1 );
835
836
SRARI_H4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 6 );
837
SAT_UH4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 7 );
838
PCKEV_B2_SB( res_vt1, res_vt0, res_vt3, res_vt2, tmp0, tmp1 );
839
ST8x4_UB( tmp0, tmp1, p_dst_u, i_dst_stride );
840
p_dst_u += ( 4 * i_dst_stride );
841
res_hz0 = res_hz4;
842
843
VSHF_B2_UB( src1, src5, src2, src6,
844
( mask + 1 ), ( mask + 1 ), src5, src6 );
845
VSHF_B2_UB( src3, src7, src4, src8,
846
( mask + 1 ), ( mask + 1 ), src7, src8 );
847
DOTP_UB4_UH( src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
848
coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
849
res_hz4 );
850
MUL4( res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
851
coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
852
res_vt3 );
853
854
res_vt0 += ( res_hz5 * coeff_vt_vec1 );
855
res_vt1 += ( res_hz1 * coeff_vt_vec1 );
856
res_vt2 += ( res_hz2 * coeff_vt_vec1 );
857
res_vt3 += ( res_hz3 * coeff_vt_vec1 );
858
859
SRARI_H4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 6 );
860
SAT_UH4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 7 );
861
PCKEV_B2_SB( res_vt1, res_vt0, res_vt3, res_vt2, tmp0, tmp1 );
862
ST8x4_UB( tmp0, tmp1, p_dst_v, i_dst_stride );
863
p_dst_v += ( 4 * i_dst_stride );
864
res_hz5 = res_hz4;
865
}
866
}
867
868
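/* Explicit weighted prediction (single reference), 4x2 block: the offset is
 * pre-shifted by log2_denom and a rounding term added, so one logical shift
 * after the multiply yields
 *   dst = clip( ( ( src * weight + rnd ) >> log2_denom ) + offset )
 * with the clamp done by the MAXI/SAT pair.  The wider avc_wgt_opscale_*
 * helpers below follow the same scheme. */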
static void avc_wgt_opscale_4x2_msa( uint8_t *p_src, int32_t i_src_stride,
869
uint8_t *p_dst, int32_t i_dst_stride,
870
int32_t i_log2_denom, int32_t i_weight,
871
int32_t i_offset_in )
872
{
873
uint32_t u_load0, u_load1, u_out0, u_out1;
874
v16u8 zero = { 0 };
875
v16u8 src0, src1;
876
v4i32 dst0, dst1;
877
v8u16 temp0, temp1, wgt, denom, offset, tp0, tp1;
878
v8i16 vec0, vec1;
879
880
i_offset_in <<= ( i_log2_denom );
881
882
if( i_log2_denom )
883
{
884
i_offset_in += ( 1 << ( i_log2_denom - 1 ) );
885
}
886
887
wgt = ( v8u16 ) __msa_fill_h( i_weight );
888
offset = ( v8u16 ) __msa_fill_h( i_offset_in );
889
denom = ( v8u16 ) __msa_fill_h( i_log2_denom );
890
891
u_load0 = LW( p_src );
892
p_src += i_src_stride;
893
u_load1 = LW( p_src );
894
895
src0 = ( v16u8 ) __msa_fill_w( u_load0 );
896
src1 = ( v16u8 ) __msa_fill_w( u_load1 );
897
898
ILVR_B2_UH( zero, src0, zero, src1, temp0, temp1 );
899
MUL2( wgt, temp0, wgt, temp1, temp0, temp1 );
900
ADDS_SH2_SH( temp0, offset, temp1, offset, vec0, vec1 );
901
MAXI_SH2_SH( vec0, vec1, 0 );
902
903
tp0 = ( v8u16 ) __msa_srl_h( vec0, ( v8i16 ) denom );
904
tp1 = ( v8u16 ) __msa_srl_h( vec1, ( v8i16 ) denom );
905
906
SAT_UH2_UH( tp0, tp1, 7 );
907
PCKEV_B2_SW( tp0, tp0, tp1, tp1, dst0, dst1 );
908
909
u_out0 = __msa_copy_u_w( dst0, 0 );
910
u_out1 = __msa_copy_u_w( dst1, 0 );
911
SW( u_out0, p_dst );
912
p_dst += i_dst_stride;
913
SW( u_out1, p_dst );
914
}
915
916
static void avc_wgt_opscale_4x4multiple_msa( uint8_t *p_src,
917
int32_t i_src_stride,
918
uint8_t *p_dst,
919
int32_t i_dst_stride,
920
int32_t i_height,
921
int32_t i_log2_denom,
922
int32_t i_weight,
923
int32_t i_offset_in )
924
{
925
uint8_t u_cnt;
926
uint32_t u_load0, u_load1, u_load2, u_load3;
927
v16u8 zero = { 0 };
928
v16u8 src0, src1, src2, src3;
929
v8u16 temp0, temp1, temp2, temp3;
930
v8u16 wgt, denom, offset;
931
932
i_offset_in <<= ( i_log2_denom );
933
934
if( i_log2_denom )
935
{
936
i_offset_in += ( 1 << ( i_log2_denom - 1 ) );
937
}
938
939
wgt = ( v8u16 ) __msa_fill_h( i_weight );
940
offset = ( v8u16 ) __msa_fill_h( i_offset_in );
941
denom = ( v8u16 ) __msa_fill_h( i_log2_denom );
942
943
for( u_cnt = i_height / 4; u_cnt--; )
944
{
945
LW4( p_src, i_src_stride, u_load0, u_load1, u_load2, u_load3 );
946
p_src += 4 * i_src_stride;
947
948
src0 = ( v16u8 ) __msa_fill_w( u_load0 );
949
src1 = ( v16u8 ) __msa_fill_w( u_load1 );
950
src2 = ( v16u8 ) __msa_fill_w( u_load2 );
951
src3 = ( v16u8 ) __msa_fill_w( u_load3 );
952
953
ILVR_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3,
954
temp0, temp1, temp2, temp3 );
955
MUL4( wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3,
956
temp0, temp1, temp2, temp3 );
957
ADDS_SH4_UH( temp0, offset, temp1, offset, temp2, offset, temp3, offset,
958
temp0, temp1, temp2, temp3 );
959
MAXI_SH4_UH( temp0, temp1, temp2, temp3, 0 );
960
SRL_H4_UH( temp0, temp1, temp2, temp3, denom );
961
SAT_UH4_UH( temp0, temp1, temp2, temp3, 7 );
962
PCKEV_ST4x4_UB( temp0, temp1, temp2, temp3, p_dst, i_dst_stride );
963
p_dst += ( 4 * i_dst_stride );
964
}
965
}
966
967
static void avc_wgt_opscale_4width_msa( uint8_t *p_src, int32_t i_src_stride,
968
uint8_t *p_dst, int32_t i_dst_stride,
969
int32_t i_height, int32_t i_log2_denom,
970
int32_t i_weight, int32_t i_offset_in )
971
{
972
if( 2 == i_height )
973
{
974
avc_wgt_opscale_4x2_msa( p_src, i_src_stride, p_dst, i_dst_stride,
975
i_log2_denom, i_weight, i_offset_in );
976
}
977
else
978
{
979
avc_wgt_opscale_4x4multiple_msa( p_src, i_src_stride,
980
p_dst, i_dst_stride,
981
i_height, i_log2_denom,
982
i_weight, i_offset_in );
983
}
984
}
985
986
static void avc_wgt_opscale_8width_msa( uint8_t *p_src, int32_t i_src_stride,
987
uint8_t *p_dst, int32_t i_dst_stride,
988
int32_t i_height, int32_t i_log2_denom,
989
int32_t i_weight, int32_t i_offset_in )
990
{
991
uint8_t u_cnt;
992
v16u8 zero = { 0 };
993
v16u8 src0, src1, src2, src3;
994
v8u16 temp0, temp1, temp2, temp3;
995
v8u16 wgt, denom, offset;
996
v16i8 out0, out1;
997
998
i_offset_in <<= ( i_log2_denom );
999
1000
if( i_log2_denom )
1001
{
1002
i_offset_in += ( 1 << ( i_log2_denom - 1 ) );
1003
}
1004
1005
wgt = ( v8u16 ) __msa_fill_h( i_weight );
1006
offset = ( v8u16 ) __msa_fill_h( i_offset_in );
1007
denom = ( v8u16 ) __msa_fill_h( i_log2_denom );
1008
1009
for( u_cnt = i_height / 4; u_cnt--; )
1010
{
1011
LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
1012
p_src += 4 * i_src_stride;
1013
1014
ILVR_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3,
1015
temp0, temp1, temp2, temp3 );
1016
MUL4( wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3,
1017
temp0, temp1, temp2, temp3 );
1018
ADDS_SH4_UH( temp0, offset, temp1, offset, temp2, offset, temp3, offset,
1019
temp0, temp1, temp2, temp3 );
1020
MAXI_SH4_UH( temp0, temp1, temp2, temp3, 0 );
1021
SRL_H4_UH( temp0, temp1, temp2, temp3, denom );
1022
SAT_UH4_UH( temp0, temp1, temp2, temp3, 7 );
1023
PCKEV_B2_SB( temp1, temp0, temp3, temp2, out0, out1 );
1024
ST8x4_UB( out0, out1, p_dst, i_dst_stride );
1025
p_dst += ( 4 * i_dst_stride );
1026
}
1027
}
1028
1029
static void avc_wgt_opscale_16width_msa( uint8_t *p_src, int32_t i_src_stride,
1030
uint8_t *p_dst, int32_t i_dst_stride,
1031
int32_t i_height, int32_t i_log2_denom,
1032
int32_t i_weight, int32_t i_offset_in )
1033
{
1034
uint8_t u_cnt;
1035
v16i8 zero = { 0 };
1036
v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
1037
v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
1038
v8u16 wgt, denom, offset;
1039
1040
i_offset_in <<= ( i_log2_denom );
1041
1042
if( i_log2_denom )
1043
{
1044
i_offset_in += ( 1 << ( i_log2_denom - 1 ) );
1045
}
1046
1047
wgt = ( v8u16 ) __msa_fill_h( i_weight );
1048
offset = ( v8u16 ) __msa_fill_h( i_offset_in );
1049
denom = ( v8u16 ) __msa_fill_h( i_log2_denom );
1050
1051
for( u_cnt = i_height / 4; u_cnt--; )
1052
{
1053
LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
1054
p_src += 4 * i_src_stride;
1055
1056
ILVR_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3,
1057
temp0, temp2, temp4, temp6 );
1058
ILVL_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3,
1059
temp1, temp3, temp5, temp7 );
1060
MUL4( wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3,
1061
temp0, temp1, temp2, temp3 );
1062
MUL4( wgt, temp4, wgt, temp5, wgt, temp6, wgt, temp7,
1063
temp4, temp5, temp6, temp7 );
1064
ADDS_SH4_UH( temp0, offset, temp1, offset, temp2, offset, temp3, offset,
1065
temp0, temp1, temp2, temp3 );
1066
ADDS_SH4_UH( temp4, offset, temp5, offset, temp6, offset, temp7, offset,
1067
temp4, temp5, temp6, temp7 );
1068
MAXI_SH4_UH( temp0, temp1, temp2, temp3, 0 );
1069
MAXI_SH4_UH( temp4, temp5, temp6, temp7, 0 );
1070
SRL_H4_UH( temp0, temp1, temp2, temp3, denom );
1071
SRL_H4_UH( temp4, temp5, temp6, temp7, denom );
1072
SAT_UH4_UH( temp0, temp1, temp2, temp3, 7 );
1073
SAT_UH4_UH( temp4, temp5, temp6, temp7, 7 );
1074
PCKEV_B4_UB( temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
1075
dst0, dst1, dst2, dst3 );
1076
1077
ST_UB4( dst0, dst1, dst2, dst3, p_dst, i_dst_stride );
1078
p_dst += 4 * i_dst_stride;
1079
}
1080
}
1081
1082
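/* Bi-directional weighted prediction, "nw" flavour: the two references are
 * scaled by their weights, summed and rounded with a single arithmetic
 * shift by (log2_denom + 1), then clipped to 0..255.  i_offset_in appears
 * to be accepted only for interface symmetry; it is not used in these
 * _nw_ variants. */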
static void avc_biwgt_opscale_4x2_nw_msa( uint8_t *p_src1_in,
1083
int32_t i_src1_stride,
1084
uint8_t *p_src2_in,
1085
int32_t i_src2_stride,
1086
uint8_t *p_dst,
1087
int32_t i_dst_stride,
1088
int32_t i_log2_denom,
1089
int32_t i_src1_weight,
1090
int32_t i_src2_weight,
1091
int32_t i_offset_in )
1092
{
1093
uint32_t u_load0, u_load1, u_out0, u_out1;
1094
v8i16 src1_wgt, src2_wgt;
1095
v16u8 in0, in1, in2, in3;
1096
v8i16 temp0, temp1, temp2, temp3;
1097
v16i8 zero = { 0 };
1098
v8i16 denom = __msa_ldi_h( i_log2_denom + 1 );
1099
1100
src1_wgt = __msa_fill_h( i_src1_weight );
1101
src2_wgt = __msa_fill_h( i_src2_weight );
1102
u_load0 = LW( p_src1_in );
1103
u_load1 = LW( p_src1_in + i_src1_stride );
1104
in0 = ( v16u8 ) __msa_fill_w( u_load0 );
1105
in1 = ( v16u8 ) __msa_fill_w( u_load1 );
1106
u_load0 = LW( p_src2_in );
1107
u_load1 = LW( p_src2_in + i_src2_stride );
1108
in2 = ( v16u8 ) __msa_fill_w( u_load0 );
1109
in3 = ( v16u8 ) __msa_fill_w( u_load1 );
1110
ILVR_B4_SH( zero, in0, zero, in1, zero, in2, zero, in3,
1111
temp0, temp1, temp2, temp3 );
1112
temp0 = ( temp0 * src1_wgt ) + ( temp2 * src2_wgt );
1113
temp1 = ( temp1 * src1_wgt ) + ( temp3 * src2_wgt );
1114
SRAR_H2_SH( temp0, temp1, denom );
1115
CLIP_SH2_0_255( temp0, temp1 );
1116
PCKEV_B2_UB( temp0, temp0, temp1, temp1, in0, in1 );
1117
u_out0 = __msa_copy_u_w( ( v4i32 ) in0, 0 );
1118
u_out1 = __msa_copy_u_w( ( v4i32 ) in1, 0 );
1119
SW( u_out0, p_dst );
1120
p_dst += i_dst_stride;
1121
SW( u_out1, p_dst );
1122
}
1123
1124
static void avc_biwgt_opscale_4x4multiple_nw_msa( uint8_t *p_src1_in,
1125
int32_t i_src1_stride,
1126
uint8_t *p_src2_in,
1127
int32_t i_src2_stride,
1128
uint8_t *p_dst,
1129
int32_t i_dst_stride,
1130
int32_t i_height,
1131
int32_t i_log2_denom,
1132
int32_t i_src1_weight,
1133
int32_t i_src2_weight,
1134
int32_t i_offset_in )
1135
{
1136
uint8_t u_cnt;
1137
uint32_t u_load0, u_load1, u_load2, u_load3;
1138
v8i16 src1_wgt, src2_wgt;
1139
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1140
v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
1141
v16i8 zero = { 0 };
1142
v8i16 denom = __msa_ldi_h( i_log2_denom + 1 );
1143
1144
src1_wgt = __msa_fill_h( i_src1_weight );
1145
src2_wgt = __msa_fill_h( i_src2_weight );
1146
for( u_cnt = i_height / 4; u_cnt--; )
1147
{
1148
LW4( p_src1_in, i_src1_stride, u_load0, u_load1, u_load2, u_load3 );
1149
p_src1_in += ( 4 * i_src1_stride );
1150
src0 = ( v16u8 ) __msa_fill_w( u_load0 );
1151
src1 = ( v16u8 ) __msa_fill_w( u_load1 );
1152
src2 = ( v16u8 ) __msa_fill_w( u_load2 );
1153
src3 = ( v16u8 ) __msa_fill_w( u_load3 );
1154
LW4( p_src2_in, i_src2_stride, u_load0, u_load1, u_load2, u_load3 );
1155
p_src2_in += ( 4 * i_src2_stride );
1156
src4 = ( v16u8 ) __msa_fill_w( u_load0 );
1157
src5 = ( v16u8 ) __msa_fill_w( u_load1 );
1158
src6 = ( v16u8 ) __msa_fill_w( u_load2 );
1159
src7 = ( v16u8 ) __msa_fill_w( u_load3 );
1160
ILVR_B4_SH( zero, src0, zero, src1, zero, src2, zero, src3,
1161
temp0, temp1, temp2, temp3 );
1162
ILVR_B4_SH( zero, src4, zero, src5, zero, src6, zero, src7,
1163
temp4, temp5, temp6, temp7 );
1164
temp0 = ( temp0 * src1_wgt ) + ( temp4 * src2_wgt );
1165
temp1 = ( temp1 * src1_wgt ) + ( temp5 * src2_wgt );
1166
temp2 = ( temp2 * src1_wgt ) + ( temp6 * src2_wgt );
1167
temp3 = ( temp3 * src1_wgt ) + ( temp7 * src2_wgt );
1168
SRAR_H4_SH( temp0, temp1, temp2, temp3, denom );
1169
CLIP_SH4_0_255( temp0, temp1, temp2, temp3 );
1170
PCKEV_ST4x4_UB( temp0, temp1, temp2, temp3, p_dst, i_dst_stride );
1171
p_dst += ( 4 * i_dst_stride );
1172
}
1173
}
1174
1175
static void avc_biwgt_opscale_4width_nw_msa( uint8_t *p_src1_in,
1176
int32_t i_src1_stride,
1177
uint8_t *p_src2_in,
1178
int32_t i_src2_stride,
1179
uint8_t *p_dst,
1180
int32_t i_dst_stride,
1181
int32_t i_height,
1182
int32_t i_log2_denom,
1183
int32_t i_src1_weight,
1184
int32_t i_src2_weight,
1185
int32_t i_offset_in )
1186
{
1187
if( 2 == i_height )
1188
{
1189
avc_biwgt_opscale_4x2_nw_msa( p_src1_in, i_src1_stride,
1190
p_src2_in, i_src2_stride,
1191
p_dst, i_dst_stride,
1192
i_log2_denom, i_src1_weight,
1193
i_src2_weight, i_offset_in );
1194
}
1195
else
1196
{
1197
avc_biwgt_opscale_4x4multiple_nw_msa( p_src1_in, i_src1_stride,
1198
p_src2_in, i_src2_stride,
1199
p_dst, i_dst_stride,
1200
i_height, i_log2_denom,
1201
i_src1_weight, i_src2_weight,
1202
i_offset_in );
1203
}
1204
}
1205
1206
static void avc_biwgt_opscale_8width_nw_msa( uint8_t *p_src1_in,
1207
int32_t i_src1_stride,
1208
uint8_t *p_src2_in,
1209
int32_t i_src2_stride,
1210
uint8_t *p_dst,
1211
int32_t i_dst_stride,
1212
int32_t i_height,
1213
int32_t i_log2_denom,
1214
int32_t i_src1_weight,
1215
int32_t i_src2_weight,
1216
int32_t i_offset_in )
1217
{
1218
uint8_t u_cnt;
1219
v8i16 src1_wgt, src2_wgt;
1220
v16u8 src0, src1, src2, src3;
1221
v16u8 dst0, dst1, dst2, dst3;
1222
v8i16 temp0, temp1, temp2, temp3;
1223
v8i16 res0, res1, res2, res3;
1224
v16i8 zero = { 0 };
1225
v8i16 denom = __msa_ldi_h( i_log2_denom + 1 );
1226
1227
src1_wgt = __msa_fill_h( i_src1_weight );
1228
src2_wgt = __msa_fill_h( i_src2_weight );
1229
1230
for( u_cnt = i_height / 4; u_cnt--; )
1231
{
1232
LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 );
1233
p_src1_in += ( 4 * i_src1_stride );
1234
LD_UB4( p_src2_in, i_src2_stride, dst0, dst1, dst2, dst3 );
1235
p_src2_in += ( 4 * i_src2_stride );
1236
ILVR_B4_SH( zero, src0, zero, src1, zero, src2, zero, src3,
1237
temp0, temp1, temp2, temp3 );
1238
ILVR_B4_SH( zero, dst0, zero, dst1, zero, dst2, zero, dst3,
1239
res0, res1, res2, res3 );
1240
res0 = ( temp0 * src1_wgt ) + ( res0 * src2_wgt );
1241
res1 = ( temp1 * src1_wgt ) + ( res1 * src2_wgt );
1242
res2 = ( temp2 * src1_wgt ) + ( res2 * src2_wgt );
1243
res3 = ( temp3 * src1_wgt ) + ( res3 * src2_wgt );
1244
SRAR_H4_SH( res0, res1, res2, res3, denom );
1245
CLIP_SH4_0_255( res0, res1, res2, res3 );
1246
PCKEV_B4_UB( res0, res0, res1, res1, res2, res2, res3, res3,
1247
dst0, dst1, dst2, dst3 );
1248
ST8x1_UB( dst0, p_dst );
1249
p_dst += i_dst_stride;
1250
ST8x1_UB( dst1, p_dst );
1251
p_dst += i_dst_stride;
1252
ST8x1_UB( dst2, p_dst );
1253
p_dst += i_dst_stride;
1254
ST8x1_UB( dst3, p_dst );
1255
p_dst += i_dst_stride;
1256
}
1257
}
1258
1259
static void avc_biwgt_opscale_16width_nw_msa( uint8_t *p_src1_in,
1260
int32_t i_src1_stride,
1261
uint8_t *p_src2_in,
1262
int32_t i_src2_stride,
1263
uint8_t *p_dst,
1264
int32_t i_dst_stride,
1265
int32_t i_height,
1266
int32_t i_log2_denom,
1267
int32_t i_src1_weight,
1268
int32_t i_src2_weight,
1269
int32_t i_offset_in )
1270
{
1271
uint8_t u_cnt;
1272
v8i16 src1_wgt, src2_wgt;
1273
v16u8 src0, src1, src2, src3;
1274
v16u8 dst0, dst1, dst2, dst3;
1275
v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
1276
v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
1277
v16i8 zero = { 0 };
1278
v8i16 denom = __msa_ldi_h( i_log2_denom + 1 );
1279
1280
src1_wgt = __msa_fill_h( i_src1_weight );
1281
src2_wgt = __msa_fill_h( i_src2_weight );
1282
1283
for( u_cnt = i_height / 4; u_cnt--; )
1284
{
1285
LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 );
1286
p_src1_in += ( 4 * i_src1_stride );
1287
LD_UB4( p_src2_in, i_src2_stride, dst0, dst1, dst2, dst3 );
1288
p_src2_in += ( 4 * i_src2_stride );
1289
ILVRL_B2_SH( zero, src0, temp1, temp0 );
1290
ILVRL_B2_SH( zero, src1, temp3, temp2 );
1291
ILVRL_B2_SH( zero, src2, temp5, temp4 );
1292
ILVRL_B2_SH( zero, src3, temp7, temp6 );
1293
ILVRL_B2_SH( zero, dst0, res1, res0 );
1294
ILVRL_B2_SH( zero, dst1, res3, res2 );
1295
ILVRL_B2_SH( zero, dst2, res5, res4 );
1296
ILVRL_B2_SH( zero, dst3, res7, res6 );
1297
res0 = ( temp0 * src1_wgt ) + ( res0 * src2_wgt );
1298
res1 = ( temp1 * src1_wgt ) + ( res1 * src2_wgt );
1299
res2 = ( temp2 * src1_wgt ) + ( res2 * src2_wgt );
1300
res3 = ( temp3 * src1_wgt ) + ( res3 * src2_wgt );
1301
res4 = ( temp4 * src1_wgt ) + ( res4 * src2_wgt );
1302
res5 = ( temp5 * src1_wgt ) + ( res5 * src2_wgt );
1303
res6 = ( temp6 * src1_wgt ) + ( res6 * src2_wgt );
1304
res7 = ( temp7 * src1_wgt ) + ( res7 * src2_wgt );
1305
SRAR_H4_SH( res0, res1, res2, res3, denom );
1306
SRAR_H4_SH( res4, res5, res6, res7, denom );
1307
CLIP_SH4_0_255( res0, res1, res2, res3 );
1308
CLIP_SH4_0_255( res4, res5, res6, res7 );
1309
PCKEV_B4_UB( res0, res1, res2, res3, res4, res5, res6, res7,
1310
dst0, dst1, dst2, dst3 );
1311
ST_UB4( dst0, dst1, dst2, dst3, p_dst, i_dst_stride );
1312
p_dst += 4 * i_dst_stride;
1313
}
1314
}
1315
1316
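/* Bi-directional weighted prediction with offset: the source pairs are byte
 * interleaved against the packed (weight1, weight2) vector so a single
 * __msa_dpadd_u_h computes w1*p1 + w2*p2 on top of an accumulator that is
 * pre-loaded with the rounding/offset term ((offset + 1) | 1) << log2_denom;
 * a logical shift by (log2_denom + 1) and the MAXI/SAT pair finish the
 * average. */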
static void avc_biwgt_opscale_4x2_msa( uint8_t *p_src1_in,
1317
int32_t i_src1_stride,
1318
uint8_t *p_src2_in,
1319
int32_t i_src2_stride,
1320
uint8_t *p_dst, int32_t i_dst_stride,
1321
int32_t i_log2_denom,
1322
int32_t i_src1_weight,
1323
int32_t i_src2_weight,
1324
int32_t i_offset_in )
1325
{
1326
uint32_t u_load0, u_load1, u_out0, u_out1;
1327
v16u8 src1_wgt, src2_wgt, wgt;
1328
v16i8 in0, in1, in2, in3;
1329
v8u16 temp0, temp1, denom, offset;
1330
1331
i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom;
1332
1333
src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight );
1334
src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight );
1335
offset = ( v8u16 ) __msa_fill_h( i_offset_in );
1336
denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 );
1337
1338
wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt );
1339
1340
u_load0 = LW( p_src1_in );
1341
u_load1 = LW( p_src1_in + i_src1_stride );
1342
in0 = ( v16i8 ) __msa_fill_w( u_load0 );
1343
in1 = ( v16i8 ) __msa_fill_w( u_load1 );
1344
1345
u_load0 = LW( p_src2_in );
1346
u_load1 = LW( p_src2_in + i_src2_stride );
1347
in2 = ( v16i8 ) __msa_fill_w( u_load0 );
1348
in3 = ( v16i8 ) __msa_fill_w( u_load1 );
1349
1350
ILVR_B2_SB( in2, in0, in3, in1, in0, in1 );
1351
1352
temp0 = __msa_dpadd_u_h( offset, wgt, ( v16u8 ) in0 );
1353
temp1 = __msa_dpadd_u_h( offset, wgt, ( v16u8 ) in1 );
1354
temp0 >>= denom;
1355
temp1 >>= denom;
1356
MAXI_SH2_UH( temp0, temp1, 0 );
1357
SAT_UH2_UH( temp0, temp1, 7 );
1358
PCKEV_B2_SB( temp0, temp0, temp1, temp1, in0, in1 );
1359
1360
u_out0 = __msa_copy_u_w( ( v4i32 ) in0, 0 );
1361
u_out1 = __msa_copy_u_w( ( v4i32 ) in1, 0 );
1362
SW( u_out0, p_dst );
1363
p_dst += i_dst_stride;
1364
SW( u_out1, p_dst );
1365
}
1366
1367
static void avc_biwgt_opscale_4x4multiple_msa( uint8_t *p_src1_in,
1368
int32_t i_src1_stride,
1369
uint8_t *p_src2_in,
1370
int32_t i_src2_stride,
1371
uint8_t *p_dst,
1372
int32_t i_dst_stride,
1373
int32_t i_height,
1374
int32_t i_log2_denom,
1375
int32_t i_src1_weight,
1376
int32_t i_src2_weight,
1377
int32_t i_offset_in )
1378
{
1379
uint8_t u_cnt;
1380
uint32_t u_load0, u_load1, u_load2, u_load3;
1381
v16u8 src1_wgt, src2_wgt, wgt;
1382
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1383
v16u8 temp0, temp1, temp2, temp3;
1384
v8u16 res0, res1, res2, res3;
1385
v8u16 denom, offset;
1386
1387
i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom;
1388
1389
src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight );
1390
src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight );
1391
offset = ( v8u16 ) __msa_fill_h( i_offset_in );
1392
denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 );
1393
1394
wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt );
1395
1396
for( u_cnt = i_height / 4; u_cnt--; )
1397
{
1398
LW4( p_src1_in, i_src1_stride, u_load0, u_load1, u_load2, u_load3 );
1399
p_src1_in += ( 4 * i_src1_stride );
1400
1401
src0 = ( v16u8 ) __msa_fill_w( u_load0 );
1402
src1 = ( v16u8 ) __msa_fill_w( u_load1 );
1403
src2 = ( v16u8 ) __msa_fill_w( u_load2 );
1404
src3 = ( v16u8 ) __msa_fill_w( u_load3 );
1405
1406
LW4( p_src2_in, i_src2_stride, u_load0, u_load1, u_load2, u_load3 );
1407
p_src2_in += ( 4 * i_src2_stride );
1408
1409
src4 = ( v16u8 ) __msa_fill_w( u_load0 );
1410
src5 = ( v16u8 ) __msa_fill_w( u_load1 );
1411
src6 = ( v16u8 ) __msa_fill_w( u_load2 );
1412
src7 = ( v16u8 ) __msa_fill_w( u_load3 );
1413
1414
ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
1415
temp0, temp1, temp2, temp3 );
1416
DOTP_UB4_UH( temp0, temp1, temp2, temp3, wgt, wgt, wgt, wgt,
1417
res0, res1, res2, res3 );
1418
ADD4( res0, offset, res1, offset, res2, offset, res3, offset,
1419
res0, res1, res2, res3 );
1420
SRA_4V( res0, res1, res2, res3, denom );
1421
MAXI_SH4_UH( res0, res1, res2, res3, 0 );
1422
SAT_UH4_UH( res0, res1, res2, res3, 7 );
1423
PCKEV_ST4x4_UB( res0, res1, res2, res3, p_dst, i_dst_stride );
1424
p_dst += ( 4 * i_dst_stride );
1425
}
1426
}
1427
1428
static void avc_biwgt_opscale_4width_msa( uint8_t *p_src1_in,
1429
int32_t i_src1_stride,
1430
uint8_t *p_src2_in,
1431
int32_t i_src2_stride,
1432
uint8_t *p_dst,
1433
int32_t i_dst_stride,
1434
int32_t i_height,
1435
int32_t i_log2_denom,
1436
int32_t i_src1_weight,
1437
int32_t i_src2_weight,
1438
int32_t i_offset_in )
1439
{
1440
if( 2 == i_height )
1441
{
1442
avc_biwgt_opscale_4x2_msa( p_src1_in, i_src1_stride,
1443
p_src2_in, i_src2_stride,
1444
p_dst, i_dst_stride,
1445
i_log2_denom, i_src1_weight,
1446
i_src2_weight, i_offset_in );
1447
}
1448
else
1449
{
1450
avc_biwgt_opscale_4x4multiple_msa( p_src1_in, i_src1_stride,
1451
p_src2_in, i_src2_stride,
1452
p_dst, i_dst_stride,
1453
i_height, i_log2_denom,
1454
i_src1_weight,
1455
i_src2_weight, i_offset_in );
1456
}
1457
}
1458
1459
1460
static void avc_biwgt_opscale_8width_msa( uint8_t *p_src1_in,
1461
int32_t i_src1_stride,
1462
uint8_t *p_src2_in,
1463
int32_t i_src2_stride,
1464
uint8_t *p_dst,
1465
int32_t i_dst_stride,
1466
int32_t i_height,
1467
int32_t i_log2_denom,
1468
int32_t i_src1_weight,
1469
int32_t i_src2_weight,
1470
int32_t i_offset_in )
1471
{
1472
uint8_t u_cnt;
1473
v16u8 src1_wgt, src2_wgt, wgt;
1474
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1475
v16u8 temp0, temp1, temp2, temp3;
1476
v8u16 res0, res1, res2, res3;
1477
v8u16 denom, offset;
1478
v16i8 out0, out1;
1479
1480
i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom;
1481
1482
src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight );
1483
src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight );
1484
offset = ( v8u16 ) __msa_fill_h( i_offset_in );
1485
denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 );
1486
1487
wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt );
1488
1489
for( u_cnt = i_height / 4; u_cnt--; )
1490
{
1491
LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 );
1492
p_src1_in += ( 4 * i_src1_stride );
1493
1494
LD_UB4( p_src2_in, i_src2_stride, src4, src5, src6, src7 );
1495
p_src2_in += ( 4 * i_src2_stride );
1496
1497
ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
1498
temp0, temp1, temp2, temp3 );
1499
DOTP_UB4_UH( temp0, temp1, temp2, temp3, wgt, wgt, wgt, wgt,
1500
res0, res1, res2, res3 );
1501
ADD4( res0, offset, res1, offset, res2, offset, res3, offset,
1502
res0, res1, res2, res3 );
1503
SRA_4V( res0, res1, res2, res3, denom );
1504
MAXI_SH4_UH( res0, res1, res2, res3, 0 );
1505
SAT_UH4_UH( res0, res1, res2, res3, 7 );
1506
PCKEV_B2_SB( res1, res0, res3, res2, out0, out1 );
1507
ST8x4_UB( out0, out1, p_dst, i_dst_stride );
1508
p_dst += 4 * i_dst_stride;
1509
}
1510
}
1511
1512
static void avc_biwgt_opscale_16width_msa( uint8_t *p_src1_in,
1513
int32_t i_src1_stride,
1514
uint8_t *p_src2_in,
1515
int32_t i_src2_stride,
1516
uint8_t *p_dst,
1517
int32_t i_dst_stride,
1518
int32_t i_height,
1519
int32_t i_log2_denom,
1520
int32_t i_src1_weight,
1521
int32_t i_src2_weight,
1522
int32_t i_offset_in )
1523
{
1524
uint8_t u_cnt;
1525
v16u8 src1_wgt, src2_wgt, wgt;
1526
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1527
v16u8 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
1528
v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
1529
v8u16 denom, offset;
1530
1531
i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom;
1532
1533
src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight );
1534
src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight );
1535
offset = ( v8u16 ) __msa_fill_h( i_offset_in );
1536
denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 );
1537
1538
wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt );
1539
1540
for( u_cnt = i_height / 4; u_cnt--; )
1541
{
1542
LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 );
1543
p_src1_in += ( 4 * i_src1_stride );
1544
1545
LD_UB4( p_src2_in, i_src2_stride, src4, src5, src6, src7 );
1546
p_src2_in += ( 4 * i_src2_stride );
1547
1548
ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
1549
temp0, temp2, temp4, temp6 );
1550
ILVL_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
1551
temp1, temp3, temp5, temp7 );
1552
DOTP_UB4_UH( temp0, temp1, temp2, temp3, wgt, wgt, wgt, wgt,
1553
res0, res1, res2, res3 );
1554
ADD4( res0, offset, res1, offset, res2, offset, res3, offset,
1555
res0, res1, res2, res3 );
1556
DOTP_UB4_UH( temp4, temp5, temp6, temp7, wgt, wgt, wgt, wgt,
1557
res4, res5, res6, res7 );
1558
ADD4( res4, offset, res5, offset, res6, offset, res7, offset,
1559
res4, res5, res6, res7 );
1560
SRA_4V( res0, res1, res2, res3, denom );
1561
SRA_4V( res4, res5, res6, res7, denom );
1562
MAXI_SH4_UH( res0, res1, res2, res3, 0 );
1563
MAXI_SH4_UH( res4, res5, res6, res7, 0 );
1564
SAT_UH4_UH( res0, res1, res2, res3, 7 );
1565
SAT_UH4_UH( res4, res5, res6, res7, 7 );
1566
PCKEV_B4_UB( res1, res0, res3, res2, res5, res4, res7, res6,
1567
temp0, temp1, temp2, temp3 );
1568
ST_UB4( temp0, temp1, temp2, temp3, p_dst, i_dst_stride );
1569
p_dst += 4 * i_dst_stride;
1570
}
1571
}
1572
1573
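/* Plain block copies: copy_width4/8/16_msa move i_height rows of 4, 8 or 16
 * bytes from p_src to p_dst, unrolled by 2, 4, 8 or 12 rows depending on
 * which multiple i_height is; copy_16multx8mult_msa further below covers
 * 16-pixel-wide columns eight rows at a time. */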
static void copy_width4_msa( uint8_t *p_src, int32_t i_src_stride,
1574
uint8_t *p_dst, int32_t i_dst_stride,
1575
int32_t i_height )
1576
{
1577
int32_t i_cnt;
1578
uint32_t u_src0, u_src1;
1579
1580
for( i_cnt = ( i_height / 2 ); i_cnt--; )
1581
{
1582
u_src0 = LW( p_src );
1583
p_src += i_src_stride;
1584
u_src1 = LW( p_src );
1585
p_src += i_src_stride;
1586
1587
SW( u_src0, p_dst );
1588
p_dst += i_dst_stride;
1589
SW( u_src1, p_dst );
1590
p_dst += i_dst_stride;
1591
}
1592
}
1593
1594
static void copy_width8_msa( uint8_t *p_src, int32_t i_src_stride,
1595
uint8_t *p_dst, int32_t i_dst_stride,
1596
int32_t i_height )
1597
{
1598
int32_t i_cnt;
1599
uint64_t u_out0, u_out1, u_out2, u_out3, u_out4, u_out5, u_out6, u_out7;
1600
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1601
1602
if( 0 == i_height % 12 )
1603
{
1604
for( i_cnt = ( i_height / 12 ); i_cnt--; )
1605
{
1606
LD_UB8( p_src, i_src_stride,
1607
src0, src1, src2, src3, src4, src5, src6, src7 );
1608
p_src += ( 8 * i_src_stride );
1609
1610
u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
1611
u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );
1612
u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 );
1613
u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 );
1614
u_out4 = __msa_copy_u_d( ( v2i64 ) src4, 0 );
1615
u_out5 = __msa_copy_u_d( ( v2i64 ) src5, 0 );
1616
u_out6 = __msa_copy_u_d( ( v2i64 ) src6, 0 );
1617
u_out7 = __msa_copy_u_d( ( v2i64 ) src7, 0 );
1618
1619
SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
1620
p_dst += ( 4 * i_dst_stride );
1621
SD4( u_out4, u_out5, u_out6, u_out7, p_dst, i_dst_stride );
1622
p_dst += ( 4 * i_dst_stride );
1623
1624
LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
1625
p_src += ( 4 * i_src_stride );
1626
1627
u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
1628
u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );
1629
u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 );
1630
u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 );
1631
1632
SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
1633
p_dst += ( 4 * i_dst_stride );
1634
}
1635
}
1636
else if( 0 == i_height % 8 )
1637
{
1638
for( i_cnt = i_height >> 3; i_cnt--; )
1639
{
1640
LD_UB8( p_src, i_src_stride,
1641
src0, src1, src2, src3, src4, src5, src6, src7 );
1642
p_src += ( 8 * i_src_stride );
1643
1644
u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
1645
u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );
1646
u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 );
1647
u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 );
1648
u_out4 = __msa_copy_u_d( ( v2i64 ) src4, 0 );
1649
u_out5 = __msa_copy_u_d( ( v2i64 ) src5, 0 );
1650
u_out6 = __msa_copy_u_d( ( v2i64 ) src6, 0 );
1651
u_out7 = __msa_copy_u_d( ( v2i64 ) src7, 0 );
1652
1653
SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
1654
p_dst += ( 4 * i_dst_stride );
1655
SD4( u_out4, u_out5, u_out6, u_out7, p_dst, i_dst_stride );
1656
p_dst += ( 4 * i_dst_stride );
1657
}
1658
}
1659
else if( 0 == i_height % 4 )
1660
{
1661
for( i_cnt = ( i_height / 4 ); i_cnt--; )
1662
{
1663
LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
1664
p_src += ( 4 * i_src_stride );
1665
u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
1666
u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );
1667
u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 );
1668
u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 );
1669
1670
SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
1671
p_dst += ( 4 * i_dst_stride );
1672
}
1673
}
1674
else if( 0 == i_height % 2 )
1675
{
1676
for( i_cnt = ( i_height / 2 ); i_cnt--; )
1677
{
1678
LD_UB2( p_src, i_src_stride, src0, src1 );
1679
p_src += ( 2 * i_src_stride );
1680
u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
1681
u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );
1682
1683
SD( u_out0, p_dst );
1684
p_dst += i_dst_stride;
1685
SD( u_out1, p_dst );
1686
p_dst += i_dst_stride;
1687
}
1688
}
1689
}
1690
1691
1692
static void copy_16multx8mult_msa( uint8_t *p_src, int32_t i_src_stride,
1693
uint8_t *p_dst, int32_t i_dst_stride,
1694
int32_t i_height, int32_t i_width )
1695
{
1696
int32_t i_cnt, i_loop_cnt;
1697
uint8_t *p_src_tmp, *p_dst_tmp;
1698
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1699
1700
for( i_cnt = ( i_width >> 4 ); i_cnt--; )
1701
{
1702
p_src_tmp = p_src;
1703
p_dst_tmp = p_dst;
1704
1705
for( i_loop_cnt = ( i_height >> 3 ); i_loop_cnt--; )
1706
{
1707
LD_UB8( p_src_tmp, i_src_stride,
1708
src0, src1, src2, src3, src4, src5, src6, src7 );
1709
p_src_tmp += ( 8 * i_src_stride );
1710
1711
ST_UB8( src0, src1, src2, src3, src4, src5, src6, src7,
1712
p_dst_tmp, i_dst_stride );
1713
p_dst_tmp += ( 8 * i_dst_stride );
1714
}
1715
1716
p_src += 16;
1717
p_dst += 16;
1718
}
1719
}
1720
1721
static void copy_width16_msa( uint8_t *p_src, int32_t i_src_stride,
1722
uint8_t *p_dst, int32_t i_dst_stride,
1723
int32_t i_height )
1724
{
1725
int32_t i_cnt;
1726
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1727
1728
if( 0 == i_height % 12 )
1729
{
1730
for( i_cnt = ( i_height / 12 ); i_cnt--; )
1731
{
1732
LD_UB8( p_src, i_src_stride,
1733
src0, src1, src2, src3, src4, src5, src6, src7 );
1734
p_src += ( 8 * i_src_stride );
1735
ST_UB8( src0, src1, src2, src3, src4, src5, src6, src7,
1736
p_dst, i_dst_stride );
1737
p_dst += ( 8 * i_dst_stride );
1738
1739
LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
1740
p_src += ( 4 * i_src_stride );
1741
ST_UB4( src0, src1, src2, src3, p_dst, i_dst_stride );
1742
p_dst += ( 4 * i_dst_stride );
1743
}
1744
}
1745
else if( 0 == i_height % 8 )
1746
{
1747
copy_16multx8mult_msa( p_src, i_src_stride,
1748
p_dst, i_dst_stride, i_height, 16 );
1749
}
1750
else if( 0 == i_height % 4 )
1751
{
1752
for( i_cnt = ( i_height >> 2 ); i_cnt--; )
1753
{
1754
LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
1755
p_src += ( 4 * i_src_stride );
1756
1757
ST_UB4( src0, src1, src2, src3, p_dst, i_dst_stride );
1758
p_dst += ( 4 * i_dst_stride );
1759
}
1760
}
1761
}
1762
1763
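/* avg_src_width4/8/16_msa: per-byte rounded average of two source blocks,
 * ( a + b + 1 ) >> 1 via aver_u.b, written out at the requested width. Used
 * by the pixel_avg entry points when i_weight == 32 and by mc_luma/get_ref
 * for quarter-pel positions that blend two half-pel planes. */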
static void avg_src_width4_msa( uint8_t *p_src1, int32_t i_src1_stride,
1764
uint8_t *p_src2, int32_t i_src2_stride,
1765
uint8_t *p_dst, int32_t i_dst_stride,
1766
int32_t i_height )
1767
{
1768
int32_t i_cnt;
1769
uint32_t u_out0, u_out1;
1770
v16u8 src0, src1, src2, src3;
1771
v16u8 dst0, dst1;
1772
1773
for( i_cnt = ( i_height / 2 ); i_cnt--; )
1774
{
1775
LD_UB2( p_src1, i_src1_stride, src0, src1 );
1776
p_src1 += ( 2 * i_src1_stride );
1777
LD_UB2( p_src2, i_src2_stride, src2, src3 );
1778
p_src2 += ( 2 * i_src2_stride );
1779
1780
AVER_UB2_UB( src0, src2, src1, src3, dst0, dst1 );
1781
1782
u_out0 = __msa_copy_u_w( ( v4i32 ) dst0, 0 );
1783
u_out1 = __msa_copy_u_w( ( v4i32 ) dst1, 0 );
1784
SW( u_out0, p_dst );
1785
p_dst += i_dst_stride;
1786
SW( u_out1, p_dst );
1787
p_dst += i_dst_stride;
1788
}
1789
}
1790
1791
static void avg_src_width8_msa( uint8_t *p_src1, int32_t i_src1_stride,
1792
uint8_t *p_src2, int32_t i_src2_stride,
1793
uint8_t *p_dst, int32_t i_dst_stride,
1794
int32_t i_height )
1795
{
1796
int32_t i_cnt;
1797
uint64_t u_out0, u_out1, u_out2, u_out3;
1798
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1799
v16u8 dst0, dst1, dst2, dst3;
1800
1801
for( i_cnt = ( i_height / 4 ); i_cnt--; )
1802
{
1803
LD_UB4( p_src1, i_src1_stride, src0, src1, src2, src3 );
1804
p_src1 += ( 4 * i_src1_stride );
1805
LD_UB4( p_src2, i_src2_stride, src4, src5, src6, src7 );
1806
p_src2 += ( 4 * i_src2_stride );
1807
1808
AVER_UB4_UB( src0, src4, src1, src5, src2, src6, src3, src7,
1809
dst0, dst1, dst2, dst3 );
1810
1811
u_out0 = __msa_copy_u_d( ( v2i64 ) dst0, 0 );
1812
u_out1 = __msa_copy_u_d( ( v2i64 ) dst1, 0 );
1813
u_out2 = __msa_copy_u_d( ( v2i64 ) dst2, 0 );
1814
u_out3 = __msa_copy_u_d( ( v2i64 ) dst3, 0 );
1815
SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
1816
p_dst += ( 4 * i_dst_stride );
1817
}
1818
}
1819
1820
static void avg_src_width16_msa( uint8_t *p_src1, int32_t i_src1_stride,
1821
uint8_t *p_src2, int32_t i_src2_stride,
1822
uint8_t *p_dst, int32_t i_dst_stride,
1823
int32_t i_height )
1824
{
1825
int32_t i_cnt;
1826
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1827
v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1828
1829
for( i_cnt = ( i_height / 8 ); i_cnt--; )
1830
{
1831
LD_UB8( p_src1, i_src1_stride,
1832
src0, src1, src2, src3, src4, src5, src6, src7 );
1833
p_src1 += ( 8 * i_src1_stride );
1834
LD_UB8( p_src2, i_src2_stride,
1835
dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7 );
1836
p_src2 += ( 8 * i_src2_stride );
1837
1838
AVER_UB4_UB( src0, dst0, src1, dst1, src2, dst2, src3, dst3,
1839
dst0, dst1, dst2, dst3 );
1840
AVER_UB4_UB( src4, dst4, src5, dst5, src6, dst6, src7, dst7,
1841
dst4, dst5, dst6, dst7 );
1842
1843
ST_UB8( dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7,
1844
p_dst, i_dst_stride );
1845
p_dst += ( 8 * i_dst_stride );
1846
}
1847
}
1848
1849
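/* Zero i_height rows of 16 bytes; x264_memzero_aligned_msa below relies on
 * this with i_stride == 16 to clear a contiguous buffer. */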
static void memset_zero_16width_msa( uint8_t *p_src, int32_t i_stride,
1850
int32_t i_height )
1851
{
1852
int8_t i_cnt;
1853
v16u8 zero = { 0 };
1854
1855
for( i_cnt = ( i_height / 2 ); i_cnt--; )
1856
{
1857
ST_UB( zero, p_src );
1858
p_src += i_stride;
1859
ST_UB( zero, p_src );
1860
p_src += i_stride;
1861
}
1862
}
1863
1864
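/* Interleave two planes into one: each output byte pair is ( p_src0[i],
 * p_src1[i] ), as used when packing separate U and V planes into NV12-style
 * chroma. 16- and 8-pixel vector paths per row, with scalar code for the
 * leftover columns and for the last i_height % 4 rows. */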
static void plane_copy_interleave_msa( uint8_t *p_src0, int32_t i_src0_stride,
1865
uint8_t *p_src1, int32_t i_src1_stride,
1866
uint8_t *p_dst, int32_t i_dst_stride,
1867
int32_t i_width, int32_t i_height )
1868
{
1869
int32_t i_loop_width, i_loop_height, i_w_mul8, i_h4w;
1870
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1871
v16u8 vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3;
1872
v16u8 vec_ilv_l0, vec_ilv_l1, vec_ilv_l2, vec_ilv_l3;
1873
1874
i_w_mul8 = i_width - i_width % 8;
1875
i_h4w = i_height - i_height % 4;
1876
1877
for( i_loop_height = ( i_h4w >> 2 ); i_loop_height--; )
1878
{
1879
for( i_loop_width = ( i_width >> 4 ); i_loop_width--; )
1880
{
1881
LD_UB4( p_src0, i_src0_stride, src0, src1, src2, src3 );
1882
LD_UB4( p_src1, i_src1_stride, src4, src5, src6, src7 );
1883
ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
1884
vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3 );
1885
ILVL_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
1886
vec_ilv_l0, vec_ilv_l1, vec_ilv_l2, vec_ilv_l3 );
1887
ST_UB4( vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3,
1888
p_dst, i_dst_stride );
1889
ST_UB4( vec_ilv_l0, vec_ilv_l1, vec_ilv_l2, vec_ilv_l3,
1890
( p_dst + 16 ), i_dst_stride );
1891
p_src0 += 16;
1892
p_src1 += 16;
1893
p_dst += 32;
1894
}
1895
1896
for( i_loop_width = ( i_width % 16 ) >> 3; i_loop_width--; )
1897
{
1898
LD_UB4( p_src0, i_src0_stride, src0, src1, src2, src3 );
1899
LD_UB4( p_src1, i_src1_stride, src4, src5, src6, src7 );
1900
ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
1901
vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3 );
1902
ST_UB4( vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3,
1903
p_dst, i_dst_stride );
1904
p_src0 += 8;
1905
p_src1 += 8;
1906
p_dst += 16;
1907
}
1908
1909
for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
1910
{
1911
p_dst[0] = p_src0[0];
1912
p_dst[1] = p_src1[0];
1913
p_dst[i_dst_stride] = p_src0[i_src0_stride];
1914
p_dst[i_dst_stride + 1] = p_src1[i_src1_stride];
1915
p_dst[2 * i_dst_stride] = p_src0[2 * i_src0_stride];
1916
p_dst[2 * i_dst_stride + 1] = p_src1[2 * i_src1_stride];
1917
p_dst[3 * i_dst_stride] = p_src0[3 * i_src0_stride];
1918
p_dst[3 * i_dst_stride + 1] = p_src1[3 * i_src1_stride];
1919
p_src0 += 1;
1920
p_src1 += 1;
1921
p_dst += 2;
1922
}
1923
1924
p_src0 += ( ( 4 * i_src0_stride ) - i_width );
1925
p_src1 += ( ( 4 * i_src1_stride ) - i_width );
1926
p_dst += ( ( 4 * i_dst_stride ) - ( i_width * 2 ) );
1927
}
1928
1929
for( i_loop_height = i_h4w; i_loop_height < i_height; i_loop_height++ )
1930
{
1931
for( i_loop_width = ( i_width >> 4 ); i_loop_width--; )
1932
{
1933
src0 = LD_UB( p_src0 );
1934
src4 = LD_UB( p_src1 );
1935
ILVRL_B2_UB( src4, src0, vec_ilv_r0, vec_ilv_l0 );
1936
ST_UB2( vec_ilv_r0, vec_ilv_l0, p_dst, 16 );
1937
p_src0 += 16;
1938
p_src1 += 16;
1939
p_dst += 32;
1940
}
1941
1942
for( i_loop_width = ( i_width % 16 ) >> 3; i_loop_width--; )
1943
{
1944
src0 = LD_UB( p_src0 );
1945
src4 = LD_UB( p_src1 );
1946
vec_ilv_r0 = ( v16u8 ) __msa_ilvr_b( ( v16i8 ) src4,
1947
( v16i8 ) src0 );
1948
ST_UB( vec_ilv_r0, p_dst );
1949
p_src0 += 8;
1950
p_src1 += 8;
1951
p_dst += 16;
1952
}
1953
1954
for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
1955
{
1956
p_dst[0] = p_src0[0];
1957
p_dst[1] = p_src1[0];
1958
p_src0 += 1;
1959
p_src1 += 1;
1960
p_dst += 2;
1961
}
1962
1963
p_src0 += ( i_src0_stride - i_width );
1964
p_src1 += ( i_src1_stride - i_width );
1965
p_dst += ( i_dst_stride - ( i_width * 2 ) );
1966
}
1967
}
1968
1969
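/* Inverse of plane_copy_interleave: even bytes of the interleaved source go
 * to p_dst0 and odd bytes to p_dst1 ( pckev.b / pckod.b ), eight rows at a
 * time, with scalar fallbacks for narrow tails and for the remaining rows. */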
static void plane_copy_deinterleave_msa( uint8_t *p_src, int32_t i_src_stride,
1970
uint8_t *p_dst0, int32_t dst0_stride,
1971
uint8_t *p_dst1, int32_t dst1_stride,
1972
int32_t i_width, int32_t i_height )
1973
{
1974
int32_t i_loop_width, i_loop_height, i_w_mul4, i_w_mul8, i_h4w;
1975
uint32_t u_res_w0, u_res_w1;
1976
v16u8 in0, in1, in2, in3, in4, in5, in6, in7;
1977
v16u8 vec_pckev0, vec_pckev1, vec_pckev2, vec_pckev3;
1978
v16u8 vec_pckod0, vec_pckod1, vec_pckod2, vec_pckod3;
1979
uint8_t *p_dst;
1980
1981
i_w_mul8 = i_width - i_width % 8;
1982
i_w_mul4 = i_width - i_width % 4;
1983
i_h4w = i_height - i_height % 8;
1984
1985
for( i_loop_height = ( i_h4w >> 3 ); i_loop_height--; )
1986
{
1987
for( i_loop_width = ( i_w_mul8 >> 3 ); i_loop_width--; )
1988
{
1989
LD_UB8( p_src, i_src_stride,
1990
in0, in1, in2, in3, in4, in5, in6, in7 );
1991
p_src += 16;
1992
PCKEV_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6,
1993
vec_pckev0, vec_pckev1, vec_pckev2, vec_pckev3 );
1994
PCKOD_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6,
1995
vec_pckod0, vec_pckod1, vec_pckod2, vec_pckod3 );
1996
ST8x4_UB( vec_pckev0, vec_pckev1, p_dst0, dst0_stride );
1997
p_dst = p_dst0 + 4 * dst0_stride;
1998
ST8x4_UB( vec_pckev2, vec_pckev3, p_dst, dst0_stride );
1999
ST8x4_UB( vec_pckod0, vec_pckod1, p_dst1, dst1_stride );
2000
p_dst = p_dst1 + 4 * dst1_stride;
2001
ST8x4_UB( vec_pckod2, vec_pckod3, p_dst, dst1_stride );
2002
p_dst0 += 8;
2003
p_dst1 += 8;
2004
}
2005
2006
for( i_loop_width = ( ( i_width % 8 ) >> 2 ); i_loop_width--; )
2007
{
2008
LD_UB8( p_src, i_src_stride,
2009
in0, in1, in2, in3, in4, in5, in6, in7 );
2010
p_src += 8;
2011
PCKEV_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6,
2012
vec_pckev0, vec_pckev1, vec_pckev2, vec_pckev3 );
2013
PCKOD_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6,
2014
vec_pckod0, vec_pckod1, vec_pckod2, vec_pckod3 );
2015
ST4x4_UB( vec_pckev0, vec_pckev1, 0, 2, 0, 2, p_dst0, dst0_stride );
2016
p_dst = p_dst0 + 4 * dst0_stride;
2017
ST4x4_UB( vec_pckev2, vec_pckev3, 0, 2, 0, 2, p_dst, dst0_stride );
2018
ST4x4_UB( vec_pckod0, vec_pckod1, 0, 2, 0, 2, p_dst1, dst1_stride );
2019
p_dst = p_dst1 + 4 * dst1_stride;
2020
ST4x4_UB( vec_pckod2, vec_pckod3, 0, 2, 0, 2, p_dst, dst1_stride );
2021
p_dst0 += 4;
2022
p_dst1 += 4;
2023
}
2024
2025
for( i_loop_width = i_w_mul4; i_loop_width < i_width; i_loop_width++ )
2026
{
2027
p_dst0[0] = p_src[0];
2028
p_dst1[0] = p_src[1];
2029
p_dst0[dst0_stride] = p_src[i_src_stride];
2030
p_dst1[dst1_stride] = p_src[i_src_stride + 1];
2031
p_dst0[2 * dst0_stride] = p_src[2 * i_src_stride];
2032
p_dst1[2 * dst1_stride] = p_src[2 * i_src_stride + 1];
2033
p_dst0[3 * dst0_stride] = p_src[3 * i_src_stride];
2034
p_dst1[3 * dst1_stride] = p_src[3 * i_src_stride + 1];
2035
p_dst0[4 * dst0_stride] = p_src[4 * i_src_stride];
2036
p_dst1[4 * dst1_stride] = p_src[4 * i_src_stride + 1];
2037
p_dst0[5 * dst0_stride] = p_src[5 * i_src_stride];
2038
p_dst1[5 * dst1_stride] = p_src[5 * i_src_stride + 1];
2039
p_dst0[6 * dst0_stride] = p_src[6 * i_src_stride];
2040
p_dst1[6 * dst1_stride] = p_src[6 * i_src_stride + 1];
2041
p_dst0[7 * dst0_stride] = p_src[7 * i_src_stride];
2042
p_dst1[7 * dst1_stride] = p_src[7 * i_src_stride + 1];
2043
p_dst0 += 1;
2044
p_dst1 += 1;
2045
p_src += 2;
2046
}
2047
2048
p_src += ( ( 8 * i_src_stride ) - ( i_width << 1 ) );
2049
p_dst0 += ( ( 8 * dst0_stride ) - i_width );
2050
p_dst1 += ( ( 8 * dst1_stride ) - i_width );
2051
}
2052
2053
for( i_loop_height = i_h4w; i_loop_height < i_height; i_loop_height++ )
2054
{
2055
for( i_loop_width = ( i_w_mul8 >> 3 ); i_loop_width--; )
2056
{
2057
in0 = LD_UB( p_src );
2058
p_src += 16;
2059
vec_pckev0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in0,
2060
( v16i8 ) in0 );
2061
vec_pckod0 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) in0,
2062
( v16i8 ) in0 );
2063
ST8x1_UB( vec_pckev0, p_dst0 );
2064
ST8x1_UB( vec_pckod0, p_dst1 );
2065
p_dst0 += 8;
2066
p_dst1 += 8;
2067
}
2068
2069
for( i_loop_width = ( ( i_width % 8 ) >> 2 ); i_loop_width--; )
2070
{
2071
in0 = LD_UB( p_src );
2072
p_src += 8;
2073
vec_pckev0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in0,
2074
( v16i8 ) in0 );
2075
vec_pckod0 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) in0,
2076
( v16i8 ) in0 );
2077
u_res_w0 = __msa_copy_u_w( ( v4i32 ) vec_pckev0, 0 );
2078
SW( u_res_w0, p_dst0 );
2079
u_res_w1 = __msa_copy_u_w( ( v4i32 ) vec_pckod0, 0 );
2080
SW( u_res_w1, p_dst1 );
2081
p_dst0 += 4;
2082
p_dst1 += 4;
2083
}
2084
2085
for( i_loop_width = i_w_mul4; i_loop_width < i_width; i_loop_width++ )
2086
{
2087
p_dst0[0] = p_src[0];
2088
p_dst1[0] = p_src[1];
2089
p_dst0 += 1;
2090
p_dst1 += 1;
2091
p_src += 2;
2092
}
2093
2094
p_src += ( ( i_src_stride ) - ( i_width << 1 ) );
2095
p_dst0 += ( ( dst0_stride ) - i_width );
2096
p_dst1 += ( ( dst1_stride ) - i_width );
2097
}
2098
}
2099
2100
2101
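/* Deinterleave packed 24-bit RGB: the vshf.b masks select every third byte
 * starting at offsets 0, 1 and 2, so each iteration turns eight 3-byte
 * pixels into 8 bytes for p_dst0, p_dst1 and p_dst2 respectively. */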
static void plane_copy_deinterleave_rgb_msa( uint8_t *p_src,
2102
int32_t i_src_stride,
2103
uint8_t *p_dst0,
2104
int32_t i_dst0_stride,
2105
uint8_t *p_dst1,
2106
int32_t i_dst1_stride,
2107
uint8_t *p_dst2,
2108
int32_t i_dst2_stride,
2109
int32_t i_width,
2110
int32_t i_height )
2111
{
2112
uint8_t *p_src_orig = p_src;
2113
uint8_t *p_dst0_orig = p_dst0;
2114
uint8_t *p_dst1_orig = p_dst1;
2115
uint8_t *p_dst2_orig = p_dst2;
2116
int32_t i_loop_width, i_loop_height, i_w_mul8, i_h_mul4;
2117
v16i8 in0, in1, in2, in3, in4, in5, in6, in7;
2118
v16i8 temp0, temp1, temp2, temp3;
2119
v16i8 mask0 = { 0, 3, 6, 9, 12, 15, 18, 21, 0, 0, 0, 0, 0, 0, 0, 0 };
2120
v16i8 mask1 = { 1, 4, 7, 10, 13, 16, 19, 22, 0, 0, 0, 0, 0, 0, 0, 0 };
2121
v16i8 mask2 = { 2, 5, 8, 11, 14, 17, 20, 23, 0, 0, 0, 0, 0, 0, 0, 0 };
2122
2123
i_w_mul8 = i_width - i_width % 8;
2124
i_h_mul4 = i_height - i_height % 4;
2125
2126
for( i_loop_height = ( i_height >> 2 ); i_loop_height--; )
2127
{
2128
p_src = p_src_orig;
2129
p_dst0 = p_dst0_orig;
2130
p_dst1 = p_dst1_orig;
2131
p_dst2 = p_dst2_orig;
2132
2133
for( i_loop_width = ( i_width >> 3 ); i_loop_width--; )
2134
{
2135
LD_SB4( p_src, i_src_stride, in0, in1, in2, in3 );
2136
LD_SB4( ( p_src + 16 ), i_src_stride, in4, in5, in6, in7 );
2137
2138
VSHF_B2_SB( in0, in4, in1, in5, mask0, mask0, temp0, temp1 );
2139
VSHF_B2_SB( in2, in6, in3, in7, mask0, mask0, temp2, temp3 );
2140
ST8x1_UB( temp0, p_dst0 );
2141
ST8x1_UB( temp1, p_dst0 + i_dst0_stride );
2142
ST8x1_UB( temp2, p_dst0 + 2 * i_dst0_stride );
2143
ST8x1_UB( temp3, p_dst0 + 3 * i_dst0_stride );
2144
2145
VSHF_B2_SB( in0, in4, in1, in5, mask1, mask1, temp0, temp1 );
2146
VSHF_B2_SB( in2, in6, in3, in7, mask1, mask1, temp2, temp3 );
2147
ST8x1_UB( temp0, p_dst1 );
2148
ST8x1_UB( temp1, p_dst1 + i_dst1_stride );
2149
ST8x1_UB( temp2, p_dst1 + 2 * i_dst1_stride );
2150
ST8x1_UB( temp3, p_dst1 + 3 * i_dst1_stride );
2151
2152
VSHF_B2_SB( in0, in4, in1, in5, mask2, mask2, temp0, temp1 );
2153
VSHF_B2_SB( in2, in6, in3, in7, mask2, mask2, temp2, temp3 );
2154
ST8x1_UB( temp0, p_dst2 );
2155
ST8x1_UB( temp1, p_dst2 + i_dst2_stride );
2156
ST8x1_UB( temp2, p_dst2 + 2 * i_dst2_stride );
2157
ST8x1_UB( temp3, p_dst2 + 3 * i_dst2_stride );
2158
2159
p_src += 8 * 3;
2160
p_dst0 += 8;
2161
p_dst1 += 8;
2162
p_dst2 += 8;
2163
}
2164
2165
for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
2166
{
2167
p_dst0_orig[i_loop_width] = p_src_orig[0 + 3 * i_loop_width];
2168
p_dst1_orig[i_loop_width] = p_src_orig[1 + 3 * i_loop_width];
2169
p_dst2_orig[i_loop_width] = p_src_orig[2 + 3 * i_loop_width];
2170
2171
p_dst0_orig[i_loop_width + i_dst0_stride] =
2172
p_src_orig[0 + i_src_stride + 3 * i_loop_width];
2173
p_dst1_orig[i_loop_width + i_dst1_stride] =
2174
p_src_orig[1 + i_src_stride + 3 * i_loop_width];
2175
p_dst2_orig[i_loop_width + i_dst2_stride] =
2176
p_src_orig[2 + i_src_stride + 3 * i_loop_width];
2177
2178
p_dst0_orig[i_loop_width + 2 * i_dst0_stride] =
2179
p_src_orig[0 + 2 * i_src_stride + 3 * i_loop_width];
2180
p_dst1_orig[i_loop_width + 2 * i_dst1_stride] =
2181
p_src_orig[1 + 2 * i_src_stride + 3 * i_loop_width];
2182
p_dst2_orig[i_loop_width + 2 * i_dst2_stride] =
2183
p_src_orig[2 + 2 * i_src_stride + 3 * i_loop_width];
2184
2185
p_dst0_orig[i_loop_width + 3 * i_dst0_stride] =
2186
p_src_orig[0 + 3 * i_src_stride + 3 * i_loop_width];
2187
p_dst1_orig[i_loop_width + 3 * i_dst1_stride] =
2188
p_src_orig[1 + 3 * i_src_stride + 3 * i_loop_width];
2189
p_dst2_orig[i_loop_width + 3 * i_dst2_stride] =
2190
p_src_orig[2 + 3 * i_src_stride + 3 * i_loop_width];
2191
}
2192
2193
p_src_orig += ( 4 * i_src_stride );
2194
p_dst0_orig += ( 4 * i_dst0_stride );
2195
p_dst1_orig += ( 4 * i_dst1_stride );
2196
p_dst2_orig += ( 4 * i_dst2_stride );
2197
}
2198
2199
for( i_loop_height = i_h_mul4; i_loop_height < i_height; i_loop_height++ )
2200
{
2201
p_src = p_src_orig;
2202
p_dst0 = p_dst0_orig;
2203
p_dst1 = p_dst1_orig;
2204
p_dst2 = p_dst2_orig;
2205
2206
for( i_loop_width = ( i_width >> 3 ); i_loop_width--; )
2207
{
2208
in0 = LD_SB( p_src );
2209
in4 = LD_SB( p_src + 16 );
2210
temp0 = __msa_vshf_b( mask0, in4, in0 );
2211
ST8x1_UB( temp0, p_dst0 );
2212
temp0 = __msa_vshf_b( mask1, in4, in0 );
2213
ST8x1_UB( temp0, p_dst1 );
2214
temp0 = __msa_vshf_b( mask2, in4, in0 );
2215
ST8x1_UB( temp0, p_dst2 );
2216
2217
p_src += 8 * 3;
2218
p_dst0 += 8;
2219
p_dst1 += 8;
2220
p_dst2 += 8;
2221
}
2222
2223
for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
2224
{
2225
p_dst0_orig[i_loop_width] = p_src_orig[3 * i_loop_width];
2226
p_dst1_orig[i_loop_width] = p_src_orig[3 * i_loop_width + 1];
2227
p_dst2_orig[i_loop_width] = p_src_orig[3 * i_loop_width + 2];
2228
}
2229
2230
p_src_orig += ( i_src_stride );
2231
p_dst0_orig += ( i_dst0_stride );
2232
p_dst1_orig += ( i_dst1_stride );
2233
p_dst2_orig += ( i_dst2_stride );
2234
}
2235
}
2236
2237
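/* Deinterleave packed 32-bit pixels: halfword pack-even/odd followed by byte
 * pack-even/odd routes byte 0 of each pixel to p_dst0, byte 1 to p_dst1 and
 * byte 2 to p_dst2; the fourth byte ( alpha ) is discarded. */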
static void plane_copy_deinterleave_rgba_msa( uint8_t *p_src,
2238
int32_t i_src_stride,
2239
uint8_t *p_dst0,
2240
int32_t i_dst0_stride,
2241
uint8_t *p_dst1,
2242
int32_t i_dst1_stride,
2243
uint8_t *p_dst2,
2244
int32_t i_dst2_stride,
2245
int32_t i_width,
2246
int32_t i_height )
2247
{
2248
uint8_t *p_src_orig = p_src;
2249
uint8_t *p_dst0_orig = p_dst0;
2250
uint8_t *p_dst1_orig = p_dst1;
2251
uint8_t *p_dst2_orig = p_dst2;
2252
int32_t i_loop_width, i_loop_height, i_w_mul8, i_h_mul4;
2253
v16i8 in0, in1, in2, in3, in4, in5, in6, in7;
2254
v16i8 in8, in9, in10, in11, in12, in13, in14, in15;
2255
v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
2256
v8i16 temp8, temp9, temp10, temp11, temp12, temp13, temp14, temp15;
2257
2258
i_w_mul8 = i_width - i_width % 8;
2259
i_h_mul4 = i_height - i_height % 4;
2260
2261
for( i_loop_height = ( i_height >> 2 ); i_loop_height--; )
2262
{
2263
p_src = p_src_orig;
2264
p_dst0 = p_dst0_orig;
2265
p_dst1 = p_dst1_orig;
2266
p_dst2 = p_dst2_orig;
2267
2268
for( i_loop_width = ( i_width >> 4 ); i_loop_width--; )
2269
{
2270
LD_SB4( p_src, i_src_stride, in0, in1, in2, in3 );
2271
LD_SB4( ( p_src + 16 ), i_src_stride, in4, in5, in6, in7 );
2272
LD_SB4( ( p_src + 32 ), i_src_stride, in8, in9, in10, in11 );
2273
LD_SB4( ( p_src + 48 ), i_src_stride, in12, in13, in14, in15 );
2274
2275
PCKEV_H2_SH( in4, in0, in12, in8, temp0, temp1 );
2276
temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 );
2277
temp3 = __msa_pckod_h( ( v8i16 ) in12, ( v8i16 ) in8 );
2278
PCKEV_H2_SH( in5, in1, in13, in9, temp4, temp5 );
2279
temp6 = __msa_pckod_h( ( v8i16 ) in5, ( v8i16 ) in1 );
2280
temp7 = __msa_pckod_h( ( v8i16 ) in13, ( v8i16 ) in9 );
2281
PCKEV_H2_SH( in6, in2, in14, in10, temp8, temp9 );
2282
temp10 = __msa_pckod_h( ( v8i16 ) in6, ( v8i16 ) in2 );
2283
temp11 = __msa_pckod_h( ( v8i16 ) in14, ( v8i16 ) in10 );
2284
PCKEV_H2_SH( in7, in3, in15, in11, temp12, temp13 );
2285
temp14 = __msa_pckod_h( ( v8i16 ) in7, ( v8i16 ) in3 );
2286
temp15 = __msa_pckod_h( ( v8i16 ) in15, ( v8i16 ) in11 );
2287
PCKEV_B2_SB( temp1, temp0, temp3, temp2, in0, in1 );
2288
in2 = __msa_pckod_b( ( v16i8 ) temp1, ( v16i8 ) temp0 );
2289
PCKEV_B2_SB( temp5, temp4, temp7, temp6, in4, in5 );
2290
in6 = __msa_pckod_b( ( v16i8 ) temp5, ( v16i8 ) temp4 );
2291
PCKEV_B2_SB( temp9, temp8, temp11, temp10, in8, in9 );
2292
in10 = __msa_pckod_b( ( v16i8 ) temp9, ( v16i8 ) temp8 );
2293
PCKEV_B2_SB( temp13, temp12, temp15, temp14, in12, in13 );
2294
in14 = __msa_pckod_b( ( v16i8 ) temp13, ( v16i8 ) temp12 );
2295
ST_SB4( in0, in4, in8, in12, p_dst0, i_dst0_stride );
2296
ST_SB4( in1, in5, in9, in13, p_dst2, i_dst2_stride );
2297
ST_SB4( in2, in6, in10, in14, p_dst1, i_dst1_stride );
2298
2299
p_src += 16 * 4;
2300
p_dst0 += 16;
2301
p_dst1 += 16;
2302
p_dst2 += 16;
2303
}
2304
2305
for( i_loop_width = ( ( i_width % 16 ) >> 3 ); i_loop_width--; )
2306
{
2307
LD_SB4( p_src, i_src_stride, in0, in1, in2, in3 );
2308
LD_SB4( p_src + 16, i_src_stride, in4, in5, in6, in7 );
2309
2310
PCKEV_H2_SH( in4, in0, in5, in1, temp0, temp4 );
2311
temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 );
2312
temp6 = __msa_pckod_h( ( v8i16 ) in5, ( v8i16 ) in1 );
2313
2314
PCKEV_H2_SH( in6, in2, in7, in3, temp8, temp12 );
2315
temp10 = __msa_pckod_h( ( v8i16 ) in6, ( v8i16 ) in2 );
2316
temp14 = __msa_pckod_h( ( v8i16 ) in7, ( v8i16 ) in3 );
2317
2318
PCKEV_B2_SB( temp0, temp0, temp2, temp2, in0, in1 );
2319
in2 = __msa_pckod_b( ( v16i8 ) temp0, ( v16i8 ) temp0 );
2320
PCKEV_B2_SB( temp4, temp4, temp6, temp6, in4, in5 );
2321
in6 = __msa_pckod_b( ( v16i8 ) temp4, ( v16i8 ) temp4 );
2322
PCKEV_B2_SB( temp8, temp8, temp10, temp10, in8, in9 );
2323
in10 = __msa_pckod_b( ( v16i8 ) temp8, ( v16i8 ) temp8 );
2324
PCKEV_B2_SB( temp12, temp12, temp14, temp14, in12, in13 );
2325
in14 = __msa_pckod_b( ( v16i8 ) temp12, ( v16i8 ) temp12 );
2326
2327
ST8x1_UB( in0, p_dst0 );
2328
ST8x1_UB( in4, p_dst0 + i_dst0_stride );
2329
ST8x1_UB( in8, p_dst0 + 2 * i_dst0_stride );
2330
ST8x1_UB( in12, p_dst0 + 3 * i_dst0_stride );
2331
2332
ST8x1_UB( in1, p_dst2 );
2333
ST8x1_UB( in5, p_dst2 + i_dst2_stride );
2334
ST8x1_UB( in9, p_dst2 + 2 * i_dst2_stride );
2335
ST8x1_UB( in13, p_dst2 + 3 * i_dst2_stride );
2336
2337
ST8x1_UB( in2, p_dst1 );
2338
ST8x1_UB( in6, p_dst1 + i_dst1_stride );
2339
ST8x1_UB( in10, p_dst1 + 2 * i_dst1_stride );
2340
ST8x1_UB( in14, p_dst1 + 3 * i_dst1_stride );
2341
2342
p_src += 8 * 4;
2343
p_dst0 += 8;
2344
p_dst1 += 8;
2345
p_dst2 += 8;
2346
}
2347
2348
for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
2349
{
2350
p_dst0_orig[i_loop_width] = p_src_orig[4 * i_loop_width];
2351
p_dst1_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 1];
2352
p_dst2_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 2];
2353
2354
p_dst0_orig[i_dst0_stride + i_loop_width] =
2355
p_src_orig[i_src_stride + 4 * i_loop_width];
2356
p_dst1_orig[i_dst1_stride + i_loop_width] =
2357
p_src_orig[i_src_stride + 4 * i_loop_width + 1];
2358
p_dst2_orig[i_dst2_stride + i_loop_width] =
2359
p_src_orig[i_src_stride + 4 * i_loop_width + 2];
2360
2361
p_dst0_orig[2 * i_dst0_stride + i_loop_width] =
2362
p_src_orig[2 * i_src_stride + 4 * i_loop_width];
2363
p_dst1_orig[2 * i_dst1_stride + i_loop_width] =
2364
p_src_orig[2 * i_src_stride + 4 * i_loop_width + 1];
2365
p_dst2_orig[2 * i_dst2_stride + i_loop_width] =
2366
p_src_orig[2 * i_src_stride + 4 * i_loop_width + 2];
2367
2368
p_dst0_orig[3 * i_dst0_stride + i_loop_width] =
2369
p_src_orig[3 * i_src_stride + 4 * i_loop_width];
2370
p_dst1_orig[3 * i_dst1_stride + i_loop_width] =
2371
p_src_orig[3 * i_src_stride + 4 * i_loop_width + 1];
2372
p_dst2_orig[3 * i_dst2_stride + i_loop_width] =
2373
p_src_orig[3 * i_src_stride + 4 * i_loop_width + 2];
2374
}
2375
2376
p_src_orig += ( 4 * i_src_stride );
2377
p_dst0_orig += ( 4 * i_dst0_stride );
2378
p_dst1_orig += ( 4 * i_dst1_stride );
2379
p_dst2_orig += ( 4 * i_dst2_stride );
2380
}
2381
2382
for( i_loop_height = i_h_mul4; i_loop_height < i_height; i_loop_height++ )
2383
{
2384
p_src = p_src_orig;
2385
p_dst0 = p_dst0_orig;
2386
p_dst1 = p_dst1_orig;
2387
p_dst2 = p_dst2_orig;
2388
2389
for( i_loop_width = ( i_width >> 4 ); i_loop_width--; )
2390
{
2391
LD_SB4( p_src, 16, in0, in4, in8, in12 );
2392
2393
PCKEV_H2_SH( in4, in0, in12, in8, temp0, temp1 );
2394
temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 );
2395
temp3 = __msa_pckod_h( ( v8i16 ) in12, ( v8i16 ) in8 );
2396
PCKEV_B2_SB( temp1, temp0, temp3, temp2, in0, in1 );
2397
in2 = __msa_pckod_b( ( v16i8 ) temp1, ( v16i8 ) temp0 );
2398
ST_SB( in0, p_dst0 );
ST_SB( in1, p_dst2 );
ST_SB( in2, p_dst1 );
2404
2405
p_src += 16 * 4;
2406
p_dst0 += 16;
2407
p_dst1 += 16;
2408
p_dst2 += 16;
2409
}
2410
2411
for( i_loop_width = ( ( i_width % 16 ) >> 3 ); i_loop_width--; )
2412
{
2413
in0 = LD_SB( p_src );
2414
in4 = LD_SB( p_src + 16 );
2415
2416
temp0 = __msa_pckev_h( ( v8i16 ) in4, ( v8i16 ) in0 );
2417
temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 );
2418
PCKEV_B2_SB( temp0, temp0, temp2, temp2, in0, in1 );
2419
in2 = __msa_pckod_b( ( v16i8 ) temp0, ( v16i8 ) temp0 );
2420
ST8x1_UB( in0, p_dst0 );
2421
ST8x1_UB( in1, p_dst2 );
2422
ST8x1_UB( in2, p_dst1 );
2423
2424
p_src += 8 * 4;
2425
p_dst0 += 8;
2426
p_dst1 += 8;
2427
p_dst2 += 8;
2428
}
2429
2430
for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
2431
{
2432
p_dst0_orig[i_loop_width] = p_src_orig[4 * i_loop_width];
2433
p_dst1_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 1];
2434
p_dst2_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 2];
2435
}
2436
2437
p_src_orig += ( i_src_stride );
2438
p_dst0_orig += ( i_dst0_stride );
2439
p_dst1_orig += ( i_dst1_stride );
2440
p_dst2_orig += ( i_dst2_stride );
2441
}
2442
}
2443
2444
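/* Interleave two chroma blocks ( U rows and V rows ) into one interleaved
 * destination, four rows per iteration plus a per-row tail for
 * i_height % 4. */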
static void store_interleave_chroma_msa( uint8_t *p_src0, int32_t i_src0_stride,
2445
uint8_t *p_src1, int32_t i_src1_stride,
2446
uint8_t *p_dst, int32_t i_dst_stride,
2447
int32_t i_height )
2448
{
2449
int32_t i_loop_height, i_h4w;
2450
v16u8 in0, in1, in2, in3, in4, in5, in6, in7;
2451
v16u8 ilvr_vec0, ilvr_vec1, ilvr_vec2, ilvr_vec3;
2452
2453
i_h4w = i_height % 4;
2454
for( i_loop_height = ( i_height >> 2 ); i_loop_height--; )
2455
{
2456
LD_UB4( p_src0, i_src0_stride, in0, in1, in2, in3 );
2457
p_src0 += ( 4 * i_src0_stride );
2458
LD_UB4( p_src1, i_src1_stride, in4, in5, in6, in7 );
2459
p_src1 += ( 4 * i_src1_stride );
2460
ILVR_B4_UB( in4, in0, in5, in1, in6, in2, in7, in3,
2461
ilvr_vec0, ilvr_vec1, ilvr_vec2, ilvr_vec3 );
2462
ST_UB4( ilvr_vec0, ilvr_vec1, ilvr_vec2, ilvr_vec3,
2463
p_dst, i_dst_stride );
2464
p_dst += ( 4 * i_dst_stride );
2465
}
2466
2467
for( i_loop_height = i_h4w; i_loop_height--; )
2468
{
2469
in0 = LD_UB( p_src0 );
2470
p_src0 += ( i_src0_stride );
2471
in1 = LD_UB( p_src1 );
2472
p_src1 += ( i_src1_stride );
2473
ilvr_vec0 = ( v16u8 ) __msa_ilvr_b( ( v16i8 ) in1, ( v16i8 ) in0 );
2474
ST_UB( ilvr_vec0, p_dst );
2475
p_dst += ( i_dst_stride );
2476
}
2477
}
2478
2479
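/* Build the four half-resolution lookahead planes: every output pixel is a
 * rounded average of a 2x2 source neighbourhood, and p_dst0..p_dst3 appear
 * to hold the four half-pel phases ( integer, horizontal, vertical, centre )
 * of the downscaled frame, matching the generic frame_init_lowres_core. */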
static void frame_init_lowres_core_msa( uint8_t *p_src, int32_t i_src_stride,
2480
uint8_t *p_dst0, int32_t dst0_stride,
2481
uint8_t *p_dst1, int32_t dst1_stride,
2482
uint8_t *p_dst2, int32_t dst2_stride,
2483
uint8_t *p_dst3, int32_t dst3_stride,
2484
int32_t i_width, int32_t i_height )
2485
{
2486
int32_t i_loop_width, i_loop_height, i_w16_mul;
2487
v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2488
v16u8 sld1_vec0, sld1_vec1, sld1_vec2, sld1_vec3, sld1_vec4, sld1_vec5;
2489
v16u8 pckev_vec0, pckev_vec1, pckev_vec2;
2490
v16u8 pckod_vec0, pckod_vec1, pckod_vec2;
2491
v16u8 tmp0, tmp1, tmp2, tmp3;
2492
v16u8 res0, res1;
2493
2494
i_w16_mul = i_width - i_width % 16;
2495
for( i_loop_height = i_height; i_loop_height--; )
2496
{
2497
LD_UB3( p_src, i_src_stride, src0, src1, src2 );
2498
p_src += 16;
2499
for( i_loop_width = 0; i_loop_width < ( i_w16_mul >> 4 ); i_loop_width++ )
2500
{
2501
LD_UB3( p_src, i_src_stride, src3, src4, src5 );
2502
p_src += 16;
2503
LD_UB3( p_src, i_src_stride, src6, src7, src8 );
2504
p_src += 16;
2505
PCKEV_B2_UB( src3, src0, src4, src1, pckev_vec0, pckev_vec1 );
2506
PCKOD_B2_UB( src3, src0, src4, src1, pckod_vec0, pckod_vec1 );
2507
pckev_vec2 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) src5,
2508
( v16i8 ) src2 );
2509
pckod_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) src5,
2510
( v16i8 ) src2 );
2511
AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0,
2512
pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1,
2513
tmp0, tmp1, tmp2, tmp3 );
2514
AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 );
2515
ST_UB( res0, p_dst0 );
2516
ST_UB( res1, p_dst2 );
2517
2518
SLDI_B2_UB( src3, src4, src0, src1, sld1_vec0, sld1_vec1, 1 );
2519
SLDI_B2_UB( src5, src6, src2, src3, sld1_vec2, sld1_vec3, 1 );
2520
SLDI_B2_UB( src7, src8, src4, src5, sld1_vec4, sld1_vec5, 1 );
2521
PCKOD_B2_UB( sld1_vec3, sld1_vec0, sld1_vec4, sld1_vec1,
2522
pckev_vec0, pckev_vec1 )
2523
pckev_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) sld1_vec5,
2524
( v16i8 ) sld1_vec2 );
2525
AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0,
2526
pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1,
2527
tmp0, tmp1, tmp2, tmp3 );
2528
AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 );
2529
ST_UB( res0, p_dst1 );
2530
ST_UB( res1, p_dst3 );
2531
2532
src0 = src6;
2533
src1 = src7;
2534
src2 = src8;
2535
p_dst0 += 16;
2536
p_dst1 += 16;
2537
p_dst2 += 16;
2538
p_dst3 += 16;
2539
}
2540
2541
for( i_loop_width = i_w16_mul; i_loop_width < i_width;
2542
i_loop_width += 8 )
2543
{
2544
LD_UB3( p_src, i_src_stride, src3, src4, src5 );
2545
p_src += 16;
2546
PCKEV_B2_UB( src3, src0, src4, src1, pckev_vec0, pckev_vec1 );
2547
PCKOD_B2_UB( src3, src0, src4, src1, pckod_vec0, pckod_vec1 );
2548
pckev_vec2 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) src5,
2549
( v16i8 ) src2 );
2550
pckod_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) src5,
2551
( v16i8 ) src2 );
2552
AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0,
2553
pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1,
2554
tmp0, tmp1, tmp2, tmp3 );
2555
AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 );
2556
ST8x1_UB( res0, p_dst0 );
2557
ST8x1_UB( res1, p_dst2 );
2558
2559
SLDI_B2_UB( src3, src4, src0, src1, sld1_vec0, sld1_vec1, 1 );
2560
SLDI_B2_UB( src5, src3, src2, src3, sld1_vec2, sld1_vec3, 1 );
2561
SLDI_B2_UB( src4, src5, src4, src5, sld1_vec4, sld1_vec5, 1 );
2562
PCKOD_B2_UB( sld1_vec3, sld1_vec0, sld1_vec4, sld1_vec1,
2563
pckev_vec0, pckev_vec1 )
2564
pckev_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) sld1_vec5,
2565
( v16i8 ) sld1_vec2 );
2566
AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0,
2567
pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1,
2568
tmp0, tmp1, tmp2, tmp3 );
2569
AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 );
2570
ST8x1_UB( res0, p_dst1 );
2571
ST8x1_UB( res1, p_dst3 );
2572
p_dst0 += 8;
2573
p_dst1 += 8;
2574
p_dst2 += 8;
2575
p_dst3 += 8;
2576
}
2577
2578
p_src += ( i_src_stride * 2 - ( ( i_width * 2 ) + 16 ) );
2579
p_dst0 += ( dst0_stride - i_width );
2580
p_dst1 += ( dst1_stride - i_width );
2581
p_dst2 += ( dst2_stride - i_width );
2582
p_dst3 += ( dst3_stride - i_width );
2583
}
2584
}
2585
2586
void x264_mc_copy_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride,
2587
uint8_t *p_src, intptr_t i_src_stride,
2588
int32_t i_height )
2589
{
2590
copy_width16_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height );
2591
}
2592
2593
void x264_mc_copy_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src,
2594
intptr_t i_src_stride, int32_t i_height )
2595
{
2596
copy_width8_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height );
2597
}
2598
2599
void x264_mc_copy_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src,
2600
intptr_t i_src_stride, int32_t i_height )
2601
{
2602
copy_width4_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height );
2603
}
2604
2605
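/* pixel_avg entry points. i_weight == 32 is the common unweighted case and
 * maps to a plain rounded average; weights inside [0,63] use bi-weighted
 * prediction with the pair ( i_weight, 64 - i_weight ); out-of-range weights
 * are routed to the separate "_nw" variants defined earlier in this file,
 * which presumably handle the unrestricted weight range. */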
void x264_pixel_avg_16x16_msa( uint8_t *p_pix1, intptr_t pix1_stride,
2606
uint8_t *p_pix2, intptr_t pix2_stride,
2607
uint8_t *p_pix3, intptr_t pix3_stride,
2608
int32_t i_weight )
2609
{
2610
if( 32 == i_weight )
2611
{
2612
avg_src_width16_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
2613
p_pix1, pix1_stride, 16 );
2614
}
2615
else if( i_weight < 0 || i_weight > 63 )
2616
{
2617
avc_biwgt_opscale_16width_nw_msa( p_pix2, pix2_stride,
2618
p_pix3, pix3_stride,
2619
p_pix1, pix1_stride,
2620
16, 5, i_weight,
2621
( 64 - i_weight ), 0 );
2622
}
2623
else
2624
{
2625
avc_biwgt_opscale_16width_msa( p_pix2, pix2_stride,
2626
p_pix3, pix3_stride,
2627
p_pix1, pix1_stride,
2628
16, 5, i_weight,
2629
( 64 - i_weight ), 0 );
2630
}
2631
}
2632
2633
void x264_pixel_avg_16x8_msa( uint8_t *p_pix1, intptr_t pix1_stride,
2634
uint8_t *p_pix2, intptr_t pix2_stride,
2635
uint8_t *p_pix3, intptr_t pix3_stride,
2636
int32_t i_weight )
2637
{
2638
if( 32 == i_weight )
2639
{
2640
avg_src_width16_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
2641
p_pix1, pix1_stride, 8 );
2642
}
2643
else if( i_weight < 0 || i_weight > 63 )
2644
{
2645
avc_biwgt_opscale_16width_nw_msa( p_pix2, pix2_stride,
2646
p_pix3, pix3_stride,
2647
p_pix1, pix1_stride,
2648
8, 5, i_weight,
2649
( 64 - i_weight ), 0 );
2650
}
2651
else
2652
{
2653
avc_biwgt_opscale_16width_msa( p_pix2, pix2_stride,
2654
p_pix3, pix3_stride,
2655
p_pix1, pix1_stride,
2656
8, 5, i_weight,
2657
( 64 - i_weight ), 0 );
2658
}
2659
}
2660
2661
void x264_pixel_avg_8x16_msa( uint8_t *p_pix1, intptr_t pix1_stride,
2662
uint8_t *p_pix2, intptr_t pix2_stride,
2663
uint8_t *p_pix3, intptr_t pix3_stride,
2664
int32_t i_weight )
2665
{
2666
if( 32 == i_weight )
2667
{
2668
avg_src_width8_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
2669
p_pix1, pix1_stride, 16 );
2670
}
2671
else if( i_weight < 0 || i_weight > 63 )
2672
{
2673
avc_biwgt_opscale_8width_nw_msa( p_pix2, pix2_stride,
2674
p_pix3, pix3_stride,
2675
p_pix1, pix1_stride, 16, 5, i_weight,
2676
( 64 - i_weight ), 0 );
2677
}
2678
else
2679
{
2680
avc_biwgt_opscale_8width_msa( p_pix2, pix2_stride,
2681
p_pix3, pix3_stride,
2682
p_pix1, pix1_stride, 16, 5, i_weight,
2683
( 64 - i_weight ), 0 );
2684
}
2685
}
2686
2687
void x264_pixel_avg_8x8_msa( uint8_t *p_pix1, intptr_t pix1_stride,
2688
uint8_t *p_pix2, intptr_t pix2_stride,
2689
uint8_t *p_pix3, intptr_t pix3_stride,
2690
int32_t i_weight )
2691
{
2692
if( 32 == i_weight )
2693
{
2694
avg_src_width8_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
2695
p_pix1, pix1_stride, 8 );
2696
}
2697
else if( i_weight < 0 || i_weight > 63 )
2698
{
2699
avc_biwgt_opscale_8width_nw_msa( p_pix2, pix2_stride,
2700
p_pix3, pix3_stride,
2701
p_pix1, pix1_stride, 8, 5, i_weight,
2702
( 64 - i_weight ), 0 );
2703
}
2704
else
2705
{
2706
avc_biwgt_opscale_8width_msa( p_pix2, pix2_stride,
2707
p_pix3, pix3_stride,
2708
p_pix1, pix1_stride, 8, 5, i_weight,
2709
( 64 - i_weight ), 0 );
2710
}
2711
}
2712
2713
void x264_pixel_avg_8x4_msa( uint8_t *p_pix1, intptr_t pix1_stride,
2714
uint8_t *p_pix2, intptr_t pix2_stride,
2715
uint8_t *p_pix3, intptr_t pix3_stride,
2716
int32_t i_weight )
2717
{
2718
if( 32 == i_weight )
2719
{
2720
avg_src_width8_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
2721
p_pix1, pix1_stride, 4 );
2722
}
2723
else if( i_weight < 0 || i_weight > 63 )
2724
{
2725
avc_biwgt_opscale_8width_nw_msa( p_pix2, pix2_stride,
2726
p_pix3, pix3_stride,
2727
p_pix1, pix1_stride, 4, 5, i_weight,
2728
( 64 - i_weight ), 0 );
2729
}
2730
else
2731
{
2732
avc_biwgt_opscale_8width_msa( p_pix2, pix2_stride,
2733
p_pix3, pix3_stride,
2734
p_pix1, pix1_stride, 4, 5, i_weight,
2735
( 64 - i_weight ), 0 );
2736
}
2737
}
2738
2739
void x264_pixel_avg_4x16_msa( uint8_t *p_pix1, intptr_t pix1_stride,
2740
uint8_t *p_pix2, intptr_t pix2_stride,
2741
uint8_t *p_pix3, intptr_t pix3_stride,
2742
int32_t i_weight )
2743
{
2744
if( 32 == i_weight )
2745
{
2746
avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
2747
p_pix1, pix1_stride, 16 );
2748
}
2749
else if( i_weight < 0 || i_weight > 63 )
2750
{
2751
avc_biwgt_opscale_4width_nw_msa( p_pix2, pix2_stride,
2752
p_pix3, pix3_stride,
2753
p_pix1, pix1_stride, 16, 5, i_weight,
2754
( 64 - i_weight ), 0 );
2755
}
2756
else
2757
{
2758
avc_biwgt_opscale_4width_msa( p_pix2, pix2_stride,
2759
p_pix3, pix3_stride,
2760
p_pix1, pix1_stride, 16, 5, i_weight,
2761
( 64 - i_weight ), 0 );
2762
}
2763
}
2764
2765
void x264_pixel_avg_4x8_msa( uint8_t *p_pix1, intptr_t pix1_stride,
2766
uint8_t *p_pix2, intptr_t pix2_stride,
2767
uint8_t *p_pix3, intptr_t pix3_stride,
2768
int32_t i_weight )
2769
{
2770
if( 32 == i_weight )
2771
{
2772
avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
2773
p_pix1, pix1_stride, 8 );
2774
}
2775
else if( i_weight < 0 || i_weight > 63 )
2776
{
2777
avc_biwgt_opscale_4width_nw_msa( p_pix2, pix2_stride,
2778
p_pix3, pix3_stride,
2779
p_pix1, pix1_stride, 8, 5, i_weight,
2780
( 64 - i_weight ), 0 );
2781
}
2782
else
2783
{
2784
avc_biwgt_opscale_4width_msa( p_pix2, pix2_stride,
2785
p_pix3, pix3_stride,
2786
p_pix1, pix1_stride, 8, 5, i_weight,
2787
( 64 - i_weight ), 0 );
2788
}
2789
}
2790
2791
void x264_pixel_avg_4x4_msa( uint8_t *p_pix1, intptr_t pix1_stride,
2792
uint8_t *p_pix2, intptr_t pix2_stride,
2793
uint8_t *p_pix3, intptr_t pix3_stride,
2794
int32_t i_weight )
2795
{
2796
if( 32 == i_weight )
2797
{
2798
avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
2799
p_pix1, pix1_stride, 4 );
2800
}
2801
else if( i_weight < 0 || i_weight > 63 )
2802
{
2803
avc_biwgt_opscale_4width_nw_msa( p_pix2, pix2_stride,
2804
p_pix3, pix3_stride,
2805
p_pix1, pix1_stride, 4, 5, i_weight,
2806
( 64 - i_weight ), 0 );
2807
}
2808
else
2809
{
2810
avc_biwgt_opscale_4width_msa( p_pix2, pix2_stride,
2811
p_pix3, pix3_stride,
2812
p_pix1, pix1_stride, 4, 5, i_weight,
2813
( 64 - i_weight ), 0 );
2814
}
2815
}
2816
2817
void x264_pixel_avg_4x2_msa( uint8_t *p_pix1, intptr_t pix1_stride,
2818
uint8_t *p_pix2, intptr_t pix2_stride,
2819
uint8_t *p_pix3, intptr_t pix3_stride,
2820
int32_t i_weight )
2821
{
2822
if( 32 == i_weight )
2823
{
2824
avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
2825
p_pix1, pix1_stride, 2 );
2826
}
2827
else if( i_weight < 0 || i_weight > 63 )
2828
{
2829
avc_biwgt_opscale_4x2_nw_msa( p_pix2, pix2_stride,
2830
p_pix3, pix3_stride,
2831
p_pix1, pix1_stride, 5, i_weight,
2832
( 64 - i_weight ), 0 );
2833
}
2834
else
2835
{
2836
avc_biwgt_opscale_4x2_msa( p_pix2, pix2_stride,
2837
p_pix3, pix3_stride,
2838
p_pix1, pix1_stride, 5, i_weight,
2839
( 64 - i_weight ), 0 );
2840
}
2841
}
2842
2843
2844
void x264_memzero_aligned_msa( void *p_dst, size_t n )
2845
{
2846
uint32_t u_tot32_mul_lines = n >> 5;
2847
uint32_t u_remaining = n - ( u_tot32_mul_lines << 5 );
2848
2849
memset_zero_16width_msa( p_dst, 16, ( n / 16 ) );
2850
2851
if( u_remaining )
2852
{
2853
memset( p_dst + ( u_tot32_mul_lines << 5 ), 0, u_remaining );
2854
}
2855
}
2856
2857
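/* Explicit weighted prediction wrappers: apply pWeight->i_scale,
 * pWeight->i_denom and pWeight->i_offset to a single source block at widths
 * 4, 8 and 16; width 20 is handled as 16 + 4. */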
void x264_mc_weight_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride,
2858
uint8_t *p_src, intptr_t i_src_stride,
2859
const x264_weight_t *pWeight, int32_t i_height )
2860
{
2861
int32_t i_log2_denom = pWeight->i_denom;
2862
int32_t i_offset = pWeight->i_offset;
2863
int32_t i_weight = pWeight->i_scale;
2864
2865
avc_wgt_opscale_4width_msa( p_src, i_src_stride, p_dst, i_dst_stride,
2866
i_height, i_log2_denom, i_weight, i_offset );
2867
}
2868
2869
void x264_mc_weight_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride,
2870
uint8_t *p_src, intptr_t i_src_stride,
2871
const x264_weight_t *pWeight, int32_t i_height )
2872
{
2873
int32_t i_log2_denom = pWeight->i_denom;
2874
int32_t i_offset = pWeight->i_offset;
2875
int32_t i_weight = pWeight->i_scale;
2876
2877
avc_wgt_opscale_8width_msa( p_src, i_src_stride, p_dst, i_dst_stride,
2878
i_height, i_log2_denom, i_weight, i_offset );
2879
}
2880
2881
void x264_mc_weight_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride,
2882
uint8_t *p_src, intptr_t i_src_stride,
2883
const x264_weight_t *pWeight, int32_t i_height )
2884
{
2885
int32_t i_log2_denom = pWeight->i_denom;
2886
int32_t i_offset = pWeight->i_offset;
2887
int32_t i_weight = pWeight->i_scale;
2888
2889
avc_wgt_opscale_16width_msa( p_src, i_src_stride, p_dst, i_dst_stride,
2890
i_height, i_log2_denom, i_weight, i_offset );
2891
}
2892
2893
void x264_mc_weight_w20_msa( uint8_t *p_dst, intptr_t i_dst_stride,
2894
uint8_t *p_src, intptr_t i_src_stride,
2895
const x264_weight_t *pWeight, int32_t i_height )
2896
{
2897
x264_mc_weight_w16_msa( p_dst, i_dst_stride, p_src, i_src_stride,
2898
pWeight, i_height );
2899
x264_mc_weight_w4_msa( p_dst + 16, i_dst_stride, p_src + 16, i_src_stride,
2900
pWeight, i_height );
2901
}
2902
2903
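/* Luma motion compensation. The quarter-pel index selects one or two
 * half-pel planes from p_src[4] via the x264_hpel_ref0/ref1 tables; when two
 * planes are involved ( i_qpel_idx & 5 ) their rounded average is taken, and
 * any explicit weight is then applied ( directly from the reference when no
 * averaging is needed ).
 *
 * Worked example with a hypothetical motion vector m_vx = 5, m_vy = 2:
 * i_qpel_idx = ( ( 2 & 3 ) << 2 ) + ( 5 & 3 ) = 9 and
 * i_offset = ( 2 >> 2 ) * i_src_stride + ( 5 >> 2 ) = 1; since 9 & 5 is
 * non-zero, two half-pel planes are averaged. */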
void x264_mc_luma_msa( uint8_t *p_dst, intptr_t i_dst_stride,
2904
uint8_t *p_src[4], intptr_t i_src_stride,
2905
int32_t m_vx, int32_t m_vy,
2906
int32_t i_width, int32_t i_height,
2907
const x264_weight_t *pWeight )
2908
{
2909
int32_t i_qpel_idx;
2910
int32_t i_offset;
2911
uint8_t *p_src1;
2912
2913
i_qpel_idx = ( ( m_vy & 3 ) << 2 ) + ( m_vx & 3 );
2914
i_offset = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 );
2915
p_src1 = p_src[x264_hpel_ref0[i_qpel_idx]] + i_offset +
2916
( 3 == ( m_vy & 3 ) ) * i_src_stride;
2917
2918
if( i_qpel_idx & 5 )
2919
{
2920
uint8_t *p_src2 = p_src[x264_hpel_ref1[i_qpel_idx]] +
2921
i_offset + ( 3 == ( m_vx & 3 ) );
2922
2923
if( 16 == i_width )
2924
{
2925
avg_src_width16_msa( p_src1, i_src_stride, p_src2, i_src_stride,
2926
p_dst, i_dst_stride, i_height );
2927
}
2928
else if( 8 == i_width )
2929
{
2930
avg_src_width8_msa( p_src1, i_src_stride, p_src2, i_src_stride,
2931
p_dst, i_dst_stride, i_height );
2932
}
2933
else if( 4 == i_width )
2934
{
2935
avg_src_width4_msa( p_src1, i_src_stride, p_src2, i_src_stride,
2936
p_dst, i_dst_stride, i_height );
2937
}
2938
2939
if( pWeight->weightfn )
2940
{
2941
if( 16 == i_width )
2942
{
2943
x264_mc_weight_w16_msa( p_dst, i_dst_stride,
2944
p_dst, i_dst_stride,
2945
pWeight, i_height );
2946
}
2947
else if( 8 == i_width )
2948
{
2949
x264_mc_weight_w8_msa( p_dst, i_dst_stride, p_dst, i_dst_stride,
2950
pWeight, i_height );
2951
}
2952
else if( 4 == i_width )
2953
{
2954
x264_mc_weight_w4_msa( p_dst, i_dst_stride, p_dst, i_dst_stride,
2955
pWeight, i_height );
2956
}
2957
}
2958
}
2959
else if( pWeight->weightfn )
2960
{
2961
if( 16 == i_width )
2962
{
2963
x264_mc_weight_w16_msa( p_dst, i_dst_stride, p_src1, i_src_stride,
2964
pWeight, i_height );
2965
}
2966
else if( 8 == i_width )
2967
{
2968
x264_mc_weight_w8_msa( p_dst, i_dst_stride, p_src1, i_src_stride,
2969
pWeight, i_height );
2970
}
2971
else if( 4 == i_width )
2972
{
2973
x264_mc_weight_w4_msa( p_dst, i_dst_stride, p_src1, i_src_stride,
2974
pWeight, i_height );
2975
}
2976
}
2977
else
2978
{
2979
if( 16 == i_width )
2980
{
2981
copy_width16_msa( p_src1, i_src_stride, p_dst, i_dst_stride,
2982
i_height );
2983
}
2984
else if( 8 == i_width )
2985
{
2986
copy_width8_msa( p_src1, i_src_stride, p_dst, i_dst_stride,
2987
i_height );
2988
}
2989
else if( 4 == i_width )
2990
{
2991
copy_width4_msa( p_src1, i_src_stride, p_dst, i_dst_stride,
2992
i_height );
2993
}
2994
}
2995
}
2996
2997
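/* Chroma motion compensation on interleaved ( U, V ) chroma with 1/8-pel
 * accuracy: the bilinear filter coefficients come from the fractional MV
 * parts, ( 8 - ( m_vx & 7 ), m_vx & 7 ) horizontally and
 * ( 8 - ( m_vy & 7 ), m_vy & 7 ) vertically, with the integer part folded
 * into the source pointer. */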
void x264_mc_chroma_msa( uint8_t *p_dst_u, uint8_t *p_dst_v,
2998
intptr_t i_dst_stride,
2999
uint8_t *p_src, intptr_t i_src_stride,
3000
int32_t m_vx, int32_t m_vy,
3001
int32_t i_width, int32_t i_height )
3002
{
3003
int32_t i_d8x = m_vx & 0x07;
3004
int32_t i_d8y = m_vy & 0x07;
3005
int32_t i_coeff_horiz1 = ( 8 - i_d8x );
3006
int32_t i_coeff_vert1 = ( 8 - i_d8y );
3007
int32_t i_coeff_horiz0 = i_d8x;
3008
int32_t i_coeff_vert0 = i_d8y;
3009
3010
p_src += ( m_vy >> 3 ) * i_src_stride + ( m_vx >> 3 ) * 2;
3011
3012
if( 2 == i_width )
3013
{
3014
avc_interleaved_chroma_hv_2w_msa( p_src, i_src_stride,
3015
p_dst_u, p_dst_v, i_dst_stride,
3016
i_coeff_horiz0, i_coeff_horiz1,
3017
i_coeff_vert0, i_coeff_vert1,
3018
i_height );
3019
}
3020
else if( 4 == i_width )
3021
{
3022
avc_interleaved_chroma_hv_4w_msa( p_src, i_src_stride,
3023
p_dst_u, p_dst_v, i_dst_stride,
3024
i_coeff_horiz0, i_coeff_horiz1,
3025
i_coeff_vert0, i_coeff_vert1,
3026
i_height );
3027
}
3028
else if( 8 == i_width )
3029
{
3030
avc_interleaved_chroma_hv_8w_msa( p_src, i_src_stride,
3031
p_dst_u, p_dst_v, i_dst_stride,
3032
i_coeff_horiz0, i_coeff_horiz1,
3033
i_coeff_vert0, i_coeff_vert1,
3034
i_height );
3035
}
3036
}
3037
3038
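/* Generate the three half-pel planes 16 columns at a time: the
 * avc_luma_vt / avc_luma_hz / avc_luma_mid helpers appear to apply the 6-tap
 * H.264 interpolation filter vertically, horizontally and in both
 * directions, writing p_dst_v, p_dsth and p_dstc respectively. */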
void x264_hpel_filter_msa( uint8_t *p_dsth, uint8_t *p_dst_v,
3039
uint8_t *p_dstc, uint8_t *p_src,
3040
intptr_t i_stride, int32_t i_width,
3041
int32_t i_height, int16_t *p_buf )
3042
{
3043
for( int32_t i = 0; i < ( i_width / 16 ); i++ )
3044
{
3045
avc_luma_vt_16w_msa( p_src - 2 - ( 2 * i_stride ), i_stride,
3046
p_dst_v - 2, i_stride, i_height );
3047
avc_luma_mid_16w_msa( p_src - 2 - ( 2 * i_stride ), i_stride,
3048
p_dstc, i_stride, i_height );
3049
avc_luma_hz_16w_msa( p_src - 2, i_stride, p_dsth, i_stride, i_height );
3050
3051
p_src += 16;
3052
p_dst_v += 16;
3053
p_dsth += 16;
3054
p_dstc += 16;
3055
}
3056
}
3057
3058
void x264_plane_copy_interleave_msa( uint8_t *p_dst, intptr_t i_dst_stride,
3059
uint8_t *p_src0, intptr_t i_src_stride0,
3060
uint8_t *p_src1, intptr_t i_src_stride1,
3061
int32_t i_width, int32_t i_height )
3062
{
3063
plane_copy_interleave_msa( p_src0, i_src_stride0, p_src1, i_src_stride1,
3064
p_dst, i_dst_stride, i_width, i_height );
3065
}
3066
3067
void x264_plane_copy_deinterleave_msa( uint8_t *p_dst0, intptr_t i_dst_stride0,
3068
uint8_t *p_dst1, intptr_t i_dst_stride1,
3069
uint8_t *p_src, intptr_t i_src_stride,
3070
int32_t i_width, int32_t i_height )
3071
{
3072
plane_copy_deinterleave_msa( p_src, i_src_stride, p_dst0, i_dst_stride0,
3073
p_dst1, i_dst_stride1, i_width, i_height );
3074
}
3075
3076
void x264_plane_copy_deinterleave_rgb_msa( uint8_t *p_dst0,
3077
intptr_t i_dst_stride0,
3078
uint8_t *p_dst1,
3079
intptr_t i_dst_stride1,
3080
uint8_t *p_dst2,
3081
intptr_t i_dst_stride2,
3082
uint8_t *p_src,
3083
intptr_t i_src_stride,
3084
int32_t i_src_width,
3085
int32_t i_width,
3086
int32_t i_height )
3087
{
3088
if( 3 == i_src_width )
3089
{
3090
plane_copy_deinterleave_rgb_msa( p_src, i_src_stride,
3091
p_dst0, i_dst_stride0,
3092
p_dst1, i_dst_stride1,
3093
p_dst2, i_dst_stride2,
3094
i_width, i_height );
3095
}
3096
else if( 4 == i_src_width )
3097
{
3098
plane_copy_deinterleave_rgba_msa( p_src, i_src_stride,
3099
p_dst0, i_dst_stride0,
3100
p_dst1, i_dst_stride1,
3101
p_dst2, i_dst_stride2,
3102
i_width, i_height );
3103
}
3104
}
3105
3106
void x264_store_interleave_chroma_msa( uint8_t *p_dst, intptr_t i_dst_stride,
3107
uint8_t *p_src0, uint8_t *p_src1,
3108
int32_t i_height )
3109
{
3110
store_interleave_chroma_msa( p_src0, FDEC_STRIDE, p_src1, FDEC_STRIDE,
3111
p_dst, i_dst_stride, i_height );
3112
}
3113
3114
void x264_load_deinterleave_chroma_fenc_msa( uint8_t *p_dst, uint8_t *p_src,
3115
intptr_t i_src_stride,
3116
int32_t i_height )
3117
{
3118
plane_copy_deinterleave_msa( p_src, i_src_stride, p_dst, FENC_STRIDE,
3119
( p_dst + ( FENC_STRIDE / 2 ) ), FENC_STRIDE,
3120
8, i_height );
3121
}
3122
3123
void x264_load_deinterleave_chroma_fdec_msa( uint8_t *p_dst, uint8_t *p_src,
3124
intptr_t i_src_stride,
3125
int32_t i_height )
3126
{
3127
plane_copy_deinterleave_msa( p_src, i_src_stride, p_dst, FDEC_STRIDE,
3128
( p_dst + ( FDEC_STRIDE / 2 ) ), FDEC_STRIDE,
3129
8, i_height );
3130
}
3131
3132
void x264_frame_init_lowres_core_msa( uint8_t *p_src, uint8_t *p_dst0,
3133
uint8_t *p_dst1, uint8_t *p_dst2,
3134
uint8_t *p_dst3, intptr_t i_src_stride,
3135
intptr_t i_dst_stride, int32_t i_width,
3136
int32_t i_height )
3137
{
3138
frame_init_lowres_core_msa( p_src, i_src_stride, p_dst0, i_dst_stride,
3139
p_dst1, i_dst_stride, p_dst2, i_dst_stride,
3140
p_dst3, i_dst_stride, i_width, i_height );
3141
}
3142
3143
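/* get_ref: same half-pel plane selection as x264_mc_luma_msa, but it also
 * handles the additional widths 12 and 20, and the rows beyond the last
 * multiple of four are finished by inline per-row vector code in each
 * branch. */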
uint8_t *x264_get_ref_msa( uint8_t *p_dst, intptr_t *p_dst_stride,
3144
uint8_t *p_src[4], intptr_t i_src_stride,
3145
int32_t m_vx, int32_t m_vy,
3146
int32_t i_width, int32_t i_height,
3147
const x264_weight_t *pWeight )
3148
{
3149
int32_t i_qpel_idx, i_cnt, i_h4w;
3150
int32_t i_offset;
3151
uint8_t *p_src1, *src1_org;
3152
3153
i_qpel_idx = ( ( m_vy & 3 ) << 2 ) + ( m_vx & 3 );
3154
i_offset = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 );
3155
p_src1 = p_src[x264_hpel_ref0[i_qpel_idx]] + i_offset +
3156
( 3 == ( m_vy & 3 ) ) * i_src_stride;
3157
3158
i_h4w = i_height - i_height % 4;
3159
3160
if( i_qpel_idx & 5 )
3161
{
3162
uint8_t *p_src2 = p_src[x264_hpel_ref1[i_qpel_idx]] +
3163
i_offset + ( 3 == ( m_vx & 3 ) );
3164
3165
if( 16 == i_width )
3166
{
3167
avg_src_width16_msa( p_src1, i_src_stride,
3168
p_src2, i_src_stride,
3169
p_dst, *p_dst_stride, i_h4w );
3170
for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
3171
{
3172
v16u8 src_vec1, src_vec2;
3173
v16u8 dst_vec0;
3174
3175
src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride );
3176
src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride );
3177
3178
dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 );
3179
3180
ST_UB( dst_vec0, p_dst + i_cnt * ( *p_dst_stride ) );
3181
}
3182
}
3183
else if( 20 == i_width )
3184
{
3185
avg_src_width16_msa( p_src1, i_src_stride, p_src2, i_src_stride,
3186
p_dst, *p_dst_stride, i_h4w );
3187
avg_src_width4_msa( p_src1 + 16, i_src_stride,
3188
p_src2 + 16, i_src_stride,
3189
p_dst + 16, *p_dst_stride, i_h4w );
3190
3191
for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
3192
{
3193
v16u8 src_vec1, src_vec2, src_vec3, src_vec4;
3194
v16u8 dst_vec0, dst_vec1;
3195
uint32_t temp0;
3196
3197
src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride );
3198
src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride );
3199
src_vec3 = LD_UB( p_src1 + i_cnt * i_src_stride + 16 );
3200
src_vec4 = LD_UB( p_src2 + i_cnt * i_src_stride + 16 );
3201
3202
dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 );
3203
dst_vec1 = __msa_aver_u_b( src_vec3, src_vec4 );
3204
3205
temp0 = __msa_copy_u_w( ( v4i32 ) dst_vec1, 0 );
3206
3207
ST_UB( dst_vec0, p_dst + i_cnt * ( *p_dst_stride ) );
3208
SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) + 16 );
3209
}
3210
}
3211
else if( 12 == i_width )
3212
{
3213
avg_src_width8_msa( p_src1, i_src_stride,
3214
p_src2, i_src_stride,
3215
p_dst, *p_dst_stride, i_h4w );
3216
avg_src_width4_msa( p_src1 + 8, i_src_stride,
3217
p_src2 + 8, i_src_stride,
3218
p_dst + 8, *p_dst_stride, i_h4w );
3219
for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
3220
{
3221
uint32_t temp0;
3222
uint64_t dst0;
3223
v16u8 src_vec1, src_vec2;
3224
v16u8 dst_vec0;
3225
3226
src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride );
3227
src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride );
3228
3229
dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 );
3230
3231
dst0 = __msa_copy_u_d( ( v2i64 ) dst_vec0, 0 );
3232
temp0 = __msa_copy_u_w( ( v4i32 ) dst_vec0, 2 );
3233
3234
SD( dst0, p_dst + i_cnt * ( *p_dst_stride ) );
3235
SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) + 8 );
3236
}
3237
}
3238
else if( 8 == i_width )
3239
{
3240
avg_src_width8_msa( p_src1, i_src_stride,
3241
p_src2, i_src_stride,
3242
p_dst, *p_dst_stride, i_h4w );
3243
for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
3244
{
3245
uint64_t dst0;
3246
v16u8 src_vec1, src_vec2;
3247
v16u8 dst_vec0;
3248
3249
src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride );
3250
src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride );
3251
3252
dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 );
3253
3254
dst0 = __msa_copy_u_d( ( v2i64 ) dst_vec0, 0 );
3255
3256
SD( dst0, p_dst + i_cnt * ( *p_dst_stride ) );
3257
}
3258
}
3259
else if( 4 == i_width )
3260
{
3261
avg_src_width4_msa( p_src1, i_src_stride,
3262
p_src2, i_src_stride,
3263
p_dst, *p_dst_stride, i_h4w );
3264
for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
3265
{
3266
uint32_t temp0;
3267
v16u8 src_vec1, src_vec2;
3268
v16u8 dst_vec0;
3269
3270
src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride );
3271
src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride );
3272
3273
dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 );
3274
temp0 = __msa_copy_u_w( ( v4i32 ) dst_vec0, 0 );
3275
3276
SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) );
3277
}
3278
}
3279
3280
        if( pWeight->weightfn )
        {
            int32_t i_log2_denom;
            int32_t i_offset_val;
            int32_t i_weight;

            i_log2_denom = pWeight->i_denom;
            i_offset_val = pWeight->i_offset;
            i_weight = pWeight->i_scale;

            if( 16 == i_width || 12 == i_width )
            {
                x264_mc_weight_w16_msa( p_dst, *p_dst_stride,
                                        p_dst, *p_dst_stride,
                                        pWeight, i_h4w );
                for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
                {
                    v16i8 zero = {0};
                    v16u8 src_vec0;
                    v16i8 tmp0;
                    v8u16 temp_vec0, temp_vec1;
                    v8u16 wgt, offset_val0;
                    v8i16 denom;

                    i_offset_val <<= ( i_log2_denom );

                    if( i_log2_denom )
                    {
                        i_offset_val += ( 1 << ( i_log2_denom - 1 ) );
                    }

                    wgt = ( v8u16 ) __msa_fill_h( i_weight );
                    offset_val0 = ( v8u16 ) __msa_fill_h( i_offset_val );
                    denom = __msa_fill_h( i_log2_denom );

                    src_vec0 = LD_UB( p_dst + i_cnt * ( *p_dst_stride ) );

                    temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero,
                                                        ( v16i8 ) src_vec0 );
                    temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero,
                                                        ( v16i8 ) src_vec0 );

                    temp_vec0 = wgt * temp_vec0;
                    temp_vec1 = wgt * temp_vec1;

                    temp_vec0 =
                        ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                  ( v8i16 ) offset_val0 );
                    temp_vec1 =
                        ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1,
                                                  ( v8i16 ) offset_val0 );

                    temp_vec0 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                    temp_vec1 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 );

                    temp_vec0 =
                        ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                    temp_vec1 =
                        ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom );

                    temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
                    temp_vec1 = __msa_sat_u_h( temp_vec1, 7 );

                    tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1,
                                          ( v16i8 ) temp_vec0 );
                    ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) );
                }
            }
            else if( 20 == i_width )
            {
                x264_mc_weight_w20_msa( p_dst, *p_dst_stride,
                                        p_dst, *p_dst_stride,
                                        pWeight, i_h4w );
                for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
                {
                    uint32_t temp0;
                    v16i8 zero = {0};
                    v16u8 src_vec0;
                    v16i8 tmp0;
                    v8u16 temp_vec0, temp_vec1;
                    v8u16 wgt;
                    v8i16 denom, offset_val0;

                    i_offset_val <<= ( i_log2_denom );

                    if( i_log2_denom )
                    {
                        i_offset_val += ( 1 << ( i_log2_denom - 1 ) );
                    }

                    wgt = ( v8u16 ) __msa_fill_h( i_weight );
                    offset_val0 = __msa_fill_h( i_offset_val );
                    denom = __msa_fill_h( i_log2_denom );

                    src_vec0 = LD_UB( p_dst + i_cnt * ( *p_dst_stride ) );
                    temp0 = LW( p_dst + i_cnt * ( *p_dst_stride ) + 16 );

                    temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero,
                                                        ( v16i8 ) src_vec0 );
                    temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero,
                                                        ( v16i8 ) src_vec0 );

                    temp_vec0 = wgt * temp_vec0;
                    temp_vec1 = wgt * temp_vec1;

                    temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                          offset_val0 );
                    temp_vec1 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1,
                                                          offset_val0 );

                    temp_vec0 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                    temp_vec1 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 );

                    temp_vec0 =
                        ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                    temp_vec1 =
                        ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom );

                    temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
                    temp_vec1 = __msa_sat_u_h( temp_vec1, 7 );

                    tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1,
                                          ( v16i8 ) temp_vec0 );
                    ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) );

                    src_vec0 = ( v16u8 ) __msa_fill_w( temp0 );
                    temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero,
                                                        ( v16i8 ) src_vec0 );
                    temp_vec0 = wgt * temp_vec0;

                    temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                          offset_val0 );
                    temp_vec0 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                    temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0,
                                                       denom );
                    temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );

                    tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
                                          ( v16i8 ) temp_vec0 );
                    temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 );
                    SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) + 16 );
                }
            }
            else if( 8 == i_width )
            {
                x264_mc_weight_w8_msa( p_dst, *p_dst_stride,
                                       p_dst, *p_dst_stride,
                                       pWeight, i_h4w );
                for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
                {
                    uint64_t temp0;
                    v16i8 zero = {0};
                    v16u8 src_vec0;
                    v16i8 tmp0;
                    v8u16 temp_vec0;
                    v8u16 wgt;
                    v8i16 denom, offset_val0;

                    i_offset_val = i_offset_val << i_log2_denom;

                    if( i_log2_denom )
                    {
                        i_offset_val += ( 1 << ( i_log2_denom - 1 ) );
                    }

                    wgt = ( v8u16 ) __msa_fill_h( i_weight );
                    offset_val0 = __msa_fill_h( i_offset_val );
                    denom = __msa_fill_h( i_log2_denom );

                    src_vec0 = LD_UB( p_dst + i_cnt * ( *p_dst_stride ) );

                    temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero,
                                                        ( v16i8 ) src_vec0 );
                    temp_vec0 = wgt * temp_vec0;

                    temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                          offset_val0 );
                    temp_vec0 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                    temp_vec0 =
                        ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                    temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );

                    tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
                                          ( v16i8 ) temp_vec0 );
                    temp0 = __msa_copy_u_d( ( v2i64 ) tmp0, 0 );
                    SD( temp0, p_dst + i_cnt * ( *p_dst_stride ) );
                }
            }
            else if( 4 == i_width )
            {
                x264_mc_weight_w4_msa( p_dst, *p_dst_stride,
                                       p_dst, *p_dst_stride,
                                       pWeight, i_h4w );
                for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
                {
                    uint32_t temp0;
                    v16i8 zero = {0};
                    v16u8 src_vec0;
                    v16i8 tmp0;
                    v8u16 temp_vec0;
                    v8u16 wgt;
                    v8i16 denom, offset_val0;

                    i_offset_val <<= ( i_log2_denom );

                    if( i_log2_denom )
                    {
                        i_offset_val += ( 1 << ( i_log2_denom - 1 ) );
                    }

                    wgt = ( v8u16 ) __msa_fill_h( i_weight );
                    offset_val0 = __msa_fill_h( i_offset_val );
                    denom = __msa_fill_h( i_log2_denom );

                    temp0 = LW( p_dst + i_cnt * ( *p_dst_stride ) );

                    src_vec0 = ( v16u8 ) __msa_fill_w( temp0 );

                    temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero,
                                                        ( v16i8 ) src_vec0 );
                    temp_vec0 = wgt * temp_vec0;

                    temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                          offset_val0 );
                    temp_vec0 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                    temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0,
                                                       denom );
                    temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );

                    tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
                                          ( v16i8 ) temp_vec0 );
                    temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 );
                    SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) );
                }
            }
        }

        return p_dst;
    }
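    /* No sub-pel averaging needed: apply the weighting directly from the
     * reference plane (p_src1) into p_dst. */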
    else if( pWeight->weightfn )
    {
        int32_t i_offset_val, i_log2_denom, i_weight;

        i_log2_denom = pWeight->i_denom;
        i_offset_val = pWeight->i_offset;
        i_weight = pWeight->i_scale;

        i_h4w = i_height - i_height % 4;

        src1_org = p_src1;

        if( 16 == i_width || 12 == i_width )
        {
            x264_mc_weight_w16_msa( p_dst, *p_dst_stride, p_src1, i_src_stride,
                                    pWeight, i_h4w );
            p_src1 = src1_org + i_h4w * i_src_stride;

            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                v16i8 zero = {0};
                v16u8 src_vec0;
                v16i8 tmp0;
                v8u16 temp_vec0, temp_vec1;
                v8u16 wgt;
                v8i16 denom, offset_val0;

                i_offset_val <<= ( i_log2_denom );

                if( i_log2_denom )
                {
                    i_offset_val += ( 1 << ( i_log2_denom - 1 ) );
                }

                wgt = ( v8u16 ) __msa_fill_h( i_weight );
                offset_val0 = __msa_fill_h( i_offset_val );
                denom = __msa_fill_h( i_log2_denom );

                src_vec0 = LD_UB( p_src1 );
                p_src1 += i_src_stride;

                temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero, ( v16i8 ) src_vec0 );
                temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 );

                temp_vec0 = wgt * temp_vec0;
                temp_vec1 = wgt * temp_vec1;

                temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                      offset_val0 );
                temp_vec1 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1,
                                                      offset_val0 );

                temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                temp_vec1 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 );

                temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                temp_vec1 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom );

                temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
                temp_vec1 = __msa_sat_u_h( temp_vec1, 7 );

                tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1,
                                      ( v16i8 ) temp_vec0 );
                ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) );
            }
        }
        else if( 20 == i_width )
        {
            x264_mc_weight_w20_msa( p_dst, *p_dst_stride, p_src1, i_src_stride,
                                    pWeight, i_h4w );
            p_src1 = src1_org + i_h4w * i_src_stride;

            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                uint32_t temp0;
                v16i8 zero = {0};
                v16u8 src_vec0;
                v16i8 tmp0;
                v8u16 temp_vec0, temp_vec1;
                v8u16 wgt;
                v8i16 denom, offset_val0;

                i_offset_val <<= ( i_log2_denom );

                if( i_log2_denom )
                {
                    i_offset_val += ( 1 << ( i_log2_denom - 1 ) );
                }

                wgt = ( v8u16 ) __msa_fill_h( i_weight );
                offset_val0 = __msa_fill_h( i_offset_val );
                denom = __msa_fill_h( i_log2_denom );

                src_vec0 = LD_UB( p_src1 );
                temp0 = LW( p_src1 + 16 );
                p_src1 += i_src_stride;

                temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero, ( v16i8 ) src_vec0 );
                temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 );

                temp_vec0 = wgt * temp_vec0;
                temp_vec1 = wgt * temp_vec1;

                temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                      offset_val0 );
                temp_vec1 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1,
                                                      offset_val0 );

                temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                temp_vec1 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 );

                temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                temp_vec1 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom );

                temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
                temp_vec1 = __msa_sat_u_h( temp_vec1, 7 );

                tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1,
                                      ( v16i8 ) temp_vec0 );
                ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) );

                src_vec0 = ( v16u8 ) __msa_fill_w( temp0 );
                temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 );
                temp_vec0 = wgt * temp_vec0;

                temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                      offset_val0 );
                temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );

                tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
                                      ( v16i8 ) temp_vec0 );
                temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 );
                SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) + 16 );
            }
        }
        else if( 8 == i_width )
        {
            x264_mc_weight_w8_msa( p_dst, *p_dst_stride, p_src1, i_src_stride,
                                   pWeight, i_h4w );
            p_src1 = src1_org + i_h4w * i_src_stride;

            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                uint64_t u_temp0;
                v16i8 zero = {0};
                v16u8 src_vec0;
                v16i8 tmp0;
                v8u16 temp_vec0;
                v8u16 wgt;
                v8i16 denom, offset_val0;

                i_offset_val = i_offset_val << i_log2_denom;

                if( i_log2_denom )
                {
                    i_offset_val += ( 1 << ( i_log2_denom - 1 ) );
                }

                wgt = ( v8u16 ) __msa_fill_h( i_weight );
                offset_val0 = __msa_fill_h( i_offset_val );
                denom = __msa_fill_h( i_log2_denom );

                src_vec0 = LD_UB( p_src1 );
                p_src1 += i_src_stride;

                temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 );
                temp_vec0 = wgt * temp_vec0;

                temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                      offset_val0 );
                temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );

                tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
                                      ( v16i8 ) temp_vec0 );
                u_temp0 = __msa_copy_u_d( ( v2i64 ) tmp0, 0 );
                SD( u_temp0, p_dst + i_cnt * ( *p_dst_stride ) );
            }
        }
        else if( 4 == i_width )
        {
            x264_mc_weight_w4_msa( p_dst, *p_dst_stride, p_src1, i_src_stride,
                                   pWeight, i_h4w );
            p_src1 = src1_org + i_h4w * i_src_stride;

            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                uint32_t u_temp0;
                v16i8 zero = {0};
                v16u8 src_vec0;
                v16i8 tmp0;
                v8u16 temp_vec0;
                v8u16 wgt;
                v8i16 denom, offset_val0;

                i_offset_val <<= ( i_log2_denom );

                if( i_log2_denom )
                {
                    i_offset_val += ( 1 << ( i_log2_denom - 1 ) );
                }

                wgt = ( v8u16 ) __msa_fill_h( i_weight );
                offset_val0 = __msa_fill_h( i_offset_val );
                denom = __msa_fill_h( i_log2_denom );

                u_temp0 = LW( p_src1 );
                p_src1 += i_src_stride;

                src_vec0 = ( v16u8 ) __msa_fill_w( u_temp0 );

                temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 );
                temp_vec0 = wgt * temp_vec0;

                temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                      offset_val0 );
                temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );

                tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
                                      ( v16i8 ) temp_vec0 );
                u_temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 );
                SW( u_temp0, p_dst + i_cnt * ( *p_dst_stride ) );
            }
        }

        return p_dst;
    }
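    /* Neither averaging nor weighting is required: hand back the reference
     * pointer and its stride unchanged. */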
    else
    {
        *p_dst_stride = i_src_stride;
        return p_src1;
    }
}

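/* Plug the MSA-optimised implementations into the motion-compensation
 * function table when the CPU advertises MSA support. */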
void x264_mc_init_mips( int32_t cpu, x264_mc_functions_t *pf )
{
    if( cpu & X264_CPU_MSA )
    {
        pf->mc_luma = x264_mc_luma_msa;
        pf->mc_chroma = x264_mc_chroma_msa;
        pf->get_ref = x264_get_ref_msa;

        pf->avg[PIXEL_16x16]= x264_pixel_avg_16x16_msa;
        pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_msa;
        pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_msa;
        pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_msa;
        pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_msa;
        pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_msa;
        pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_msa;
        pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_msa;
        pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_msa;

        pf->weight = x264_mc_weight_wtab_msa;
        pf->offsetadd = x264_mc_weight_wtab_msa;
        pf->offsetsub = x264_mc_weight_wtab_msa;

        pf->copy_16x16_unaligned = x264_mc_copy_w16_msa;
        pf->copy[PIXEL_16x16] = x264_mc_copy_w16_msa;
        pf->copy[PIXEL_8x8] = x264_mc_copy_w8_msa;
        pf->copy[PIXEL_4x4] = x264_mc_copy_w4_msa;

        pf->store_interleave_chroma = x264_store_interleave_chroma_msa;
        pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_msa;
        pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_msa;

        pf->plane_copy_interleave = x264_plane_copy_interleave_msa;
        pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_msa;
        pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_msa;

        pf->hpel_filter = x264_hpel_filter_msa;

        pf->memcpy_aligned = memcpy;
        pf->memzero_aligned = x264_memzero_aligned_msa;
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_msa;
    }
}
#endif