/*****************************************************************************
 * mc-c.c: msa motion compensation
 *****************************************************************************
 * Copyright (C) 2015-2016 x264 project
 *
 * Authors: Neha Rana <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at [email protected].
 *****************************************************************************/

#include "common/common.h"
#include "macros.h"
#include "mc.h"

#if !HIGH_BIT_DEPTH
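/* Shuffle-control tables for __msa_vshf_b.  Each 16-byte row pairs up the
 * source pixels that share a tap weight in the 6-tap luma filter (distance
 * 5, 3 and 1 pairs for the +1, -5 and +20 taps respectively); indices of 16
 * and above select bytes from the second source operand, which is how the
 * 4-pixel-wide cases pack two rows into one vector.  The pairing here is
 * inferred from how avc_luma_hz_16w_msa() consumes the masks below. */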
static const uint8_t pu_luma_mask_arr[16 * 8] =
{
    /* 8 width cases */
    0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
    1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
    2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
    /* 4 width cases */
    0, 5, 1, 6, 2, 7, 3, 8, 16, 21, 17, 22, 18, 23, 19, 24,
    1, 4, 2, 5, 3, 6, 4, 7, 17, 20, 18, 21, 19, 22, 20, 23,
    2, 3, 3, 4, 4, 5, 5, 6, 18, 19, 19, 20, 20, 21, 21, 22,
    2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25,
    3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26
};

static const uint8_t pu_chroma_mask_arr[16 * 5] =
{
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    0, 2, 2, 4, 4, 6, 6, 8, 16, 18, 18, 20, 20, 22, 22, 24,
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 16, 17, 17, 18, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 16, 17, 17, 18, 16, 17, 17, 18, 18, 19, 19, 20
};

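/* Prototypes for the MSA-optimised entry points provided by this file.
 * They follow the signatures of the function pointers in x264's motion
 * compensation table (see the mc.h include above) and are presumably wired
 * in by the MIPS-specific initialisation code. */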
void x264_mc_copy_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                           uint8_t *p_src, intptr_t i_src_stride,
                           int32_t i_height );
void x264_mc_copy_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                          uint8_t *p_src, intptr_t i_src_stride,
                          int32_t i_height );
void x264_mc_copy_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src,
                          intptr_t i_src_stride, int32_t i_height );
void x264_memzero_aligned_msa( void *p_dst, size_t n );

void x264_pixel_avg_16x16_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
                               uint8_t *p_pix2, intptr_t i_pix2_stride,
                               uint8_t *p_pix3, intptr_t i_pix3_stride,
                               int32_t i_weight );
void x264_pixel_avg_16x8_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
                              uint8_t *p_pix2, intptr_t i_pix2_stride,
                              uint8_t *p_pix3, intptr_t i_pix3_stride,
                              int32_t i_weight );
void x264_pixel_avg_8x16_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
                              uint8_t *p_pix2, intptr_t i_pix2_stride,
                              uint8_t *p_pix3, intptr_t i_pix3_stride,
                              int32_t i_weight );
void x264_pixel_avg_8x8_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
                             uint8_t *p_pix2, intptr_t i_pix2_stride,
                             uint8_t *p_pix3, intptr_t i_pix3_stride,
                             int32_t i_weight );
void x264_pixel_avg_8x4_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
                             uint8_t *p_pix2, intptr_t i_pix2_stride,
                             uint8_t *p_pix3, intptr_t i_pix3_stride,
                             int32_t i_weight );
void x264_pixel_avg_4x16_msa( uint8_t *p_pix1, intptr_t pix1_stride,
                              uint8_t *p_pix2, intptr_t pix2_stride,
                              uint8_t *p_pix3, intptr_t pix3_stride,
                              int32_t i_weight );
void x264_pixel_avg_4x8_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
                             uint8_t *p_pix2, intptr_t i_pix2_stride,
                             uint8_t *p_pix3, intptr_t i_pix3_stride,
                             int32_t i_weight );
void x264_pixel_avg_4x4_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
                             uint8_t *p_pix2, intptr_t i_pix2_stride,
                             uint8_t *p_pix3, intptr_t i_pix3_stride,
                             int32_t i_weight );
void x264_pixel_avg_4x2_msa( uint8_t *p_pix1, intptr_t i_pix1_stride,
                             uint8_t *p_pix2, intptr_t i_pix2_stride,
                             uint8_t *p_pix3, intptr_t i_pix3_stride,
                             int32_t i_weight );

void x264_mc_weight_w20_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                             uint8_t *p_src, intptr_t i_src_stride,
                             const x264_weight_t *pWeight, int32_t i_height );
void x264_mc_weight_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                            uint8_t *p_src, intptr_t i_src_stride,
                            const x264_weight_t *pWeight, int32_t i_height );
void x264_mc_weight_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                            uint8_t *p_src, intptr_t i_src_stride,
                            const x264_weight_t *pWeight, int32_t i_height );
void x264_mc_weight_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                             uint8_t *p_src, intptr_t i_src_stride,
                             const x264_weight_t *pWeight, int32_t i_height );

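/* Width-indexed table of the weighting kernels declared above (rows of
 * 4, 4, 8, 16, 16 and 20 pixels); presumably consumed through x264's
 * weight_fn_t dispatch in the same way as the other platform tables. */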
weight_fn_t x264_mc_weight_wtab_msa[6] =
{
    x264_mc_weight_w4_msa,
    x264_mc_weight_w4_msa,
    x264_mc_weight_w8_msa,
    x264_mc_weight_w16_msa,
    x264_mc_weight_w16_msa,
    x264_mc_weight_w20_msa,
};

void x264_mc_luma_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                       uint8_t *p_src[4], intptr_t i_src_stride,
                       int32_t m_vx, int32_t m_vy,
                       int32_t i_width, int32_t i_height,
                       const x264_weight_t *pWeight );
uint8_t *x264_get_ref_msa( uint8_t *p_dst, intptr_t *p_dst_stride,
                           uint8_t *p_src[4], intptr_t i_src_stride,
                           int32_t m_vx, int32_t m_vy,
                           int32_t i_width, int32_t i_height,
                           const x264_weight_t *pWeight );
void x264_mc_chroma_msa( uint8_t *p_dst_u, uint8_t *p_dst_v,
                         intptr_t i_dst_stride,
                         uint8_t *p_src, intptr_t i_src_stride,
                         int32_t m_vx, int32_t m_vy,
                         int32_t i_width, int32_t i_height );
void x264_hpel_filter_msa( uint8_t *p_dsth, uint8_t *p_dst_v,
                           uint8_t *p_dstc, uint8_t *p_src,
                           intptr_t i_stride, int32_t i_width,
                           int32_t i_height, int16_t *p_buf );

void x264_plane_copy_interleave_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                                     uint8_t *p_src0, intptr_t i_src_stride0,
                                     uint8_t *p_src1, intptr_t i_src_stride1,
                                     int32_t i_width, int32_t i_height );
void x264_plane_copy_deinterleave_msa( uint8_t *p_dst0, intptr_t i_dst_stride0,
                                       uint8_t *p_dst1, intptr_t i_dst_stride1,
                                       uint8_t *p_src, intptr_t i_src_stride,
                                       int32_t i_width, int32_t i_height );
void x264_plane_copy_deinterleave_rgb_msa( uint8_t *p_dst0,
                                           intptr_t i_dst_stride0,
                                           uint8_t *p_dst1,
                                           intptr_t i_dst_stride1,
                                           uint8_t *p_dst2,
                                           intptr_t i_dst_stride2,
                                           uint8_t *p_src,
                                           intptr_t i_src_stride,
                                           int32_t i_src_width, int32_t i_width,
                                           int32_t i_height );
void x264_store_interleave_chroma_msa( uint8_t *p_dst, intptr_t i_dst_stride,
                                       uint8_t *p_src0, uint8_t *p_src1,
                                       int32_t i_height );
void x264_load_deinterleave_chroma_fenc_msa( uint8_t *p_dst, uint8_t *p_src,
                                             intptr_t i_src_stride,
                                             int32_t i_height );
void x264_load_deinterleave_chroma_fdec_msa( uint8_t *p_dst, uint8_t *p_src,
                                             intptr_t i_src_stride,
                                             int32_t i_height );
void x264_frame_init_lowres_core_msa( uint8_t *p_src, uint8_t *p_dst0,
                                      uint8_t *p_dst1, uint8_t *p_dst2,
                                      uint8_t *p_dst3, intptr_t i_src_stride,
                                      intptr_t i_dst_stride, int32_t i_width,
                                      int32_t i_height );

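/* Horizontal 6-tap luma half-pel filter for 16-pixel-wide blocks.  The HADD
 * pass adds the +1 tap pairs and the two DPADD passes apply the -5 and +20
 * taps, i.e. the H.264 kernel (1, -5, 20, 20, -5, 1); SRARI by 5 is the
 * rounded >>5 normalisation, and the XORI-with-128 steps convert between
 * unsigned pixels and the signed range used by the dot products.
 * Scalar equivalent (a sketch, ignoring the vector packing):
 *   dst[x] = clip( ( src[x-2] - 5*src[x-1] + 20*src[x] + 20*src[x+1]
 *                    - 5*src[x+2] + src[x+3] + 16 ) >> 5 );
 */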
static void avc_luma_hz_16w_msa( uint8_t *p_src, int32_t i_src_stride,
                                 uint8_t *p_dst, int32_t i_dst_stride,
                                 int32_t i_height )
{
    uint32_t u_loop_cnt, u_h4w;
    v16u8 dst0;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
    v16i8 minus5b = __msa_ldi_b( -5 );
    v16i8 plus20b = __msa_ldi_b( 20 );

    u_h4w = i_height % 4;
    LD_SB3( &pu_luma_mask_arr[0], 16, mask0, mask1, mask2 );

    for( u_loop_cnt = ( i_height >> 2 ); u_loop_cnt--; )
    {
        LD_SB2( p_src, 8, src0, src1 );
        p_src += i_src_stride;
        LD_SB2( p_src, 8, src2, src3 );
        p_src += i_src_stride;

        XORI_B4_128_SB( src0, src1, src2, src3 );
        VSHF_B2_SB( src0, src0, src1, src1, mask0, mask0, vec0, vec3 );
        VSHF_B2_SB( src2, src2, src3, src3, mask0, mask0, vec6, vec9 );
        VSHF_B2_SB( src0, src0, src1, src1, mask1, mask1, vec1, vec4 );
        VSHF_B2_SB( src2, src2, src3, src3, mask1, mask1, vec7, vec10 );
        VSHF_B2_SB( src0, src0, src1, src1, mask2, mask2, vec2, vec5 );
        VSHF_B2_SB( src2, src2, src3, src3, mask2, mask2, vec8, vec11 );
        HADD_SB4_SH( vec0, vec3, vec6, vec9, res0, res1, res2, res3 );
        DPADD_SB4_SH( vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                      minus5b, res0, res1, res2, res3 );
        DPADD_SB4_SH( vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                      plus20b, res0, res1, res2, res3 );

        LD_SB2( p_src, 8, src4, src5 );
        p_src += i_src_stride;
        LD_SB2( p_src, 8, src6, src7 );
        p_src += i_src_stride;

        XORI_B4_128_SB( src4, src5, src6, src7 );
        VSHF_B2_SB( src4, src4, src5, src5, mask0, mask0, vec0, vec3 );
        VSHF_B2_SB( src6, src6, src7, src7, mask0, mask0, vec6, vec9 );
        VSHF_B2_SB( src4, src4, src5, src5, mask1, mask1, vec1, vec4 );
        VSHF_B2_SB( src6, src6, src7, src7, mask1, mask1, vec7, vec10 );
        VSHF_B2_SB( src4, src4, src5, src5, mask2, mask2, vec2, vec5 );
        VSHF_B2_SB( src6, src6, src7, src7, mask2, mask2, vec8, vec11 );
        HADD_SB4_SH( vec0, vec3, vec6, vec9, res4, res5, res6, res7 );
        DPADD_SB4_SH( vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                      minus5b, res4, res5, res6, res7 );
        DPADD_SB4_SH( vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                      plus20b, res4, res5, res6, res7 );
        SRARI_H4_SH( res0, res1, res2, res3, 5 );
        SRARI_H4_SH( res4, res5, res6, res7, 5 );
        SAT_SH4_SH( res0, res1, res2, res3, 7 );
        SAT_SH4_SH( res4, res5, res6, res7, 7 );
        PCKEV_B4_SB( res1, res0, res3, res2, res5, res4, res7, res6,
                     vec0, vec1, vec2, vec3 );
        XORI_B4_128_SB( vec0, vec1, vec2, vec3 );

        ST_SB4( vec0, vec1, vec2, vec3, p_dst, i_dst_stride );
        p_dst += ( 4 * i_dst_stride );
    }

    for( u_loop_cnt = u_h4w; u_loop_cnt--; )
    {
        LD_SB2( p_src, 8, src0, src1 );
        p_src += i_src_stride;

        XORI_B2_128_SB( src0, src1 );
        VSHF_B2_SB( src0, src0, src1, src1, mask0, mask0, vec0, vec3 );
        VSHF_B2_SB( src0, src0, src1, src1, mask1, mask1, vec1, vec4 );
        VSHF_B2_SB( src0, src0, src1, src1, mask2, mask2, vec2, vec5 );
        res0 = __msa_hadd_s_h( vec0, vec0 );
        DPADD_SB2_SH( vec1, vec2, minus5b, plus20b, res0, res0 );
        res1 = __msa_hadd_s_h( vec3, vec3 );
        DPADD_SB2_SH( vec4, vec5, minus5b, plus20b, res1, res1 );
        SRARI_H2_SH( res0, res1, 5 );
        SAT_SH2_SH( res0, res1, 7 );
        dst0 = PCKEV_XORI128_UB( res0, res1 );
        ST_UB( dst0, p_dst );
        p_dst += i_dst_stride;
    }
}

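/* Vertical 6-tap luma half-pel filter for 16-wide blocks.  The halfword
 * constants 0xfb01, 0x1414 and 0x1fb pack the tap pairs (+1, -5),
 * (+20, +20) and (-5, +1) as signed bytes; they are applied with three
 * dot-product accumulations over row pairs interleaved by ILVR/ILVL, and
 * the five rows loaded up front prime the sliding window that the main
 * loop rotates at the end of each iteration. */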
static void avc_luma_vt_16w_msa( uint8_t *p_src, int32_t i_src_stride,
265
uint8_t *p_dst, int32_t i_dst_stride,
266
int32_t i_height )
267
{
268
uint32_t u_loop_cnt, u_h4w;
269
const int16_t i_filt_const0 = 0xfb01;
270
const int16_t i_filt_const1 = 0x1414;
271
const int16_t i_filt_const2 = 0x1fb;
272
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
273
v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
274
v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
275
v16i8 src65_l, src87_l;
276
v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
277
v16u8 res0, res1, res2, res3;
278
v16i8 filt0, filt1, filt2;
279
280
u_h4w = i_height % 4;
281
filt0 = ( v16i8 ) __msa_fill_h( i_filt_const0 );
282
filt1 = ( v16i8 ) __msa_fill_h( i_filt_const1 );
283
filt2 = ( v16i8 ) __msa_fill_h( i_filt_const2 );
284
285
LD_SB5( p_src, i_src_stride, src0, src1, src2, src3, src4 );
286
p_src += ( 5 * i_src_stride );
287
288
XORI_B5_128_SB( src0, src1, src2, src3, src4 );
289
ILVR_B4_SB( src1, src0, src2, src1, src3, src2, src4, src3,
290
src10_r, src21_r, src32_r, src43_r );
291
ILVL_B4_SB( src1, src0, src2, src1, src3, src2, src4, src3,
292
src10_l, src21_l, src32_l, src43_l );
293
294
for( u_loop_cnt = ( i_height >> 2 ); u_loop_cnt--; )
295
{
296
LD_SB4( p_src, i_src_stride, src5, src6, src7, src8 );
297
p_src += ( 4 * i_src_stride );
298
299
XORI_B4_128_SB( src5, src6, src7, src8 );
300
ILVR_B4_SB( src5, src4, src6, src5, src7, src6, src8, src7,
301
src54_r, src65_r, src76_r, src87_r );
302
ILVL_B4_SB( src5, src4, src6, src5, src7, src6, src8, src7,
303
src54_l, src65_l, src76_l, src87_l );
304
out0_r = DPADD_SH3_SH( src10_r, src32_r, src54_r,
305
filt0, filt1, filt2 );
306
out1_r = DPADD_SH3_SH( src21_r, src43_r, src65_r,
307
filt0, filt1, filt2 );
308
out2_r = DPADD_SH3_SH( src32_r, src54_r, src76_r,
309
filt0, filt1, filt2 );
310
out3_r = DPADD_SH3_SH( src43_r, src65_r, src87_r,
311
filt0, filt1, filt2 );
312
out0_l = DPADD_SH3_SH( src10_l, src32_l, src54_l,
313
filt0, filt1, filt2 );
314
out1_l = DPADD_SH3_SH( src21_l, src43_l, src65_l,
315
filt0, filt1, filt2 );
316
out2_l = DPADD_SH3_SH( src32_l, src54_l, src76_l,
317
filt0, filt1, filt2 );
318
out3_l = DPADD_SH3_SH( src43_l, src65_l, src87_l,
319
filt0, filt1, filt2 );
320
SRARI_H4_SH( out0_r, out1_r, out2_r, out3_r, 5 );
321
SAT_SH4_SH( out0_r, out1_r, out2_r, out3_r, 7 );
322
SRARI_H4_SH( out0_l, out1_l, out2_l, out3_l, 5 );
323
SAT_SH4_SH( out0_l, out1_l, out2_l, out3_l, 7 );
324
PCKEV_B4_UB( out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
325
out3_r, res0, res1, res2, res3 );
326
XORI_B4_128_UB( res0, res1, res2, res3 );
327
328
ST_UB4( res0, res1, res2, res3, p_dst, i_dst_stride );
329
p_dst += ( 4 * i_dst_stride );
330
331
src10_r = src54_r;
332
src32_r = src76_r;
333
src21_r = src65_r;
334
src43_r = src87_r;
335
src10_l = src54_l;
336
src32_l = src76_l;
337
src21_l = src65_l;
338
src43_l = src87_l;
339
src4 = src8;
340
}
341
342
for( u_loop_cnt = u_h4w; u_loop_cnt--; )
343
{
344
src5 = LD_SB( p_src );
345
p_src += ( i_src_stride );
346
src5 = ( v16i8 ) __msa_xori_b( ( v16u8 ) src5, 128 );
347
ILVRL_B2_SB( src5, src4, src54_r, src54_l );
348
out0_r = DPADD_SH3_SH( src10_r, src32_r, src54_r,
349
filt0, filt1, filt2 );
350
out0_l = DPADD_SH3_SH( src10_l, src32_l, src54_l,
351
filt0, filt1, filt2 );
352
SRARI_H2_SH( out0_r, out0_l, 5 );
353
SAT_SH2_SH( out0_r, out0_l, 7 );
354
out0_r = ( v8i16 ) __msa_pckev_b( ( v16i8 ) out0_l, ( v16i8 ) out0_r );
355
res0 = __msa_xori_b( ( v16u8 ) out0_r, 128 );
356
ST_UB( res0, p_dst );
357
p_dst += i_dst_stride;
358
359
src10_r = src21_r;
360
src21_r = src32_r;
361
src32_r = src43_r;
362
src43_r = src54_r;
363
364
src10_l = src21_l;
365
src21_l = src32_l;
366
src32_l = src43_l;
367
src43_l = src54_l;
368
369
src4 = src5;
370
}
371
}
372
373
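/* Centre ("hv") luma half-pel filter, 8-wide: each row is first run through
 * the horizontal 6-tap filter into 16-bit intermediates (AVC_HORZ_FILTER_SH),
 * then six intermediate rows are combined vertically per output row
 * (AVC_CALC_DPADD_H_6PIX_2COEFF_SH) - the usual two-stage scheme for the
 * centre half-pel position; the final normalisation happens inside that
 * macro. */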
static void avc_luma_mid_8w_msa( uint8_t *p_src, int32_t i_src_stride,
374
uint8_t *p_dst, int32_t i_dst_stride,
375
int32_t i_height )
376
{
377
uint32_t u_loop_cnt, u_h4w;
378
uint64_t u_out0;
379
v16i8 tmp0;
380
v16i8 src0, src1, src2, src3, src4;
381
v16i8 mask0, mask1, mask2;
382
v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
383
v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
384
v8i16 dst0, dst1, dst2, dst3;
385
v16u8 out0, out1;
386
387
u_h4w = i_height % 4;
388
LD_SB3( &pu_luma_mask_arr[0], 16, mask0, mask1, mask2 );
389
390
LD_SB5( p_src, i_src_stride, src0, src1, src2, src3, src4 );
391
XORI_B5_128_SB( src0, src1, src2, src3, src4 );
392
p_src += ( 5 * i_src_stride );
393
394
hz_out0 = AVC_HORZ_FILTER_SH( src0, mask0, mask1, mask2 );
395
hz_out1 = AVC_HORZ_FILTER_SH( src1, mask0, mask1, mask2 );
396
hz_out2 = AVC_HORZ_FILTER_SH( src2, mask0, mask1, mask2 );
397
hz_out3 = AVC_HORZ_FILTER_SH( src3, mask0, mask1, mask2 );
398
hz_out4 = AVC_HORZ_FILTER_SH( src4, mask0, mask1, mask2 );
399
400
for( u_loop_cnt = ( i_height >> 2 ); u_loop_cnt--; )
401
{
402
LD_SB4( p_src, i_src_stride, src0, src1, src2, src3 );
403
XORI_B4_128_SB( src0, src1, src2, src3 );
404
p_src += ( 4 * i_src_stride );
405
406
hz_out5 = AVC_HORZ_FILTER_SH( src0, mask0, mask1, mask2 );
407
hz_out6 = AVC_HORZ_FILTER_SH( src1, mask0, mask1, mask2 );
408
hz_out7 = AVC_HORZ_FILTER_SH( src2, mask0, mask1, mask2 );
409
hz_out8 = AVC_HORZ_FILTER_SH( src3, mask0, mask1, mask2 );
410
dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out0, hz_out1, hz_out2,
411
hz_out3, hz_out4, hz_out5 );
412
dst1 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out1, hz_out2, hz_out3,
413
hz_out4, hz_out5, hz_out6 );
414
dst2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out2, hz_out3, hz_out4,
415
hz_out5, hz_out6, hz_out7 );
416
dst3 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out3, hz_out4, hz_out5,
417
hz_out6, hz_out7, hz_out8 );
418
out0 = PCKEV_XORI128_UB( dst0, dst1 );
419
out1 = PCKEV_XORI128_UB( dst2, dst3 );
420
ST8x4_UB( out0, out1, p_dst, i_dst_stride );
421
422
p_dst += ( 4 * i_dst_stride );
423
hz_out3 = hz_out7;
424
hz_out1 = hz_out5;
425
hz_out5 = hz_out4;
426
hz_out4 = hz_out8;
427
hz_out2 = hz_out6;
428
hz_out0 = hz_out5;
429
}
430
431
for( u_loop_cnt = u_h4w; u_loop_cnt--; )
432
{
433
src0 = LD_SB( p_src );
434
p_src += i_src_stride;
435
436
src0 = ( v16i8 ) __msa_xori_b( ( v16u8 ) src0, 128 );
437
hz_out5 = AVC_HORZ_FILTER_SH( src0, mask0, mask1, mask2 );
438
439
dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH( hz_out0, hz_out1,
440
hz_out2, hz_out3,
441
hz_out4, hz_out5 );
442
443
tmp0 = __msa_pckev_b( ( v16i8 ) ( dst0 ), ( v16i8 ) ( dst0 ) );
444
tmp0 = ( v16i8 ) __msa_xori_b( ( v16u8 ) tmp0, 128 );
445
u_out0 = __msa_copy_u_d( ( v2i64 ) tmp0, 0 );
446
SD( u_out0, p_dst );
447
p_dst += i_dst_stride;
448
449
hz_out0 = hz_out1;
450
hz_out1 = hz_out2;
451
hz_out2 = hz_out3;
452
hz_out3 = hz_out4;
453
hz_out4 = hz_out5;
454
}
455
}
456
457
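/* 16-wide centre half-pel filter, handled as two independent 8-wide halves. */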
static void avc_luma_mid_16w_msa( uint8_t *p_src, int32_t i_src_stride,
458
uint8_t *p_dst, int32_t i_dst_stride,
459
int32_t i_height )
460
{
461
uint32_t u_multiple8_cnt;
462
463
for( u_multiple8_cnt = 2; u_multiple8_cnt--; )
464
{
465
avc_luma_mid_8w_msa( p_src, i_src_stride, p_dst, i_dst_stride,
466
i_height );
467
p_src += 8;
468
p_dst += 8;
469
}
470
}
471
472
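/* 2x2 bilinear chroma filter on an interleaved (U/V paired, NV12-style)
 * source.  pu_chroma_mask_arr[16] gathers one component (even bytes) and
 * mask + 1 the other; the horizontal weights are applied as a byte dot
 * product, the vertical ones as halfword multiplies, and SRARI by 6 is the
 * /64 rounding that matches the 8x8 weight normalisation of H.264 chroma
 * interpolation.  U and V results are written to separate planes. */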
static void avc_interleaved_chroma_hv_2x2_msa( uint8_t *p_src,
473
int32_t i_src_stride,
474
uint8_t *p_dst_u,
475
uint8_t *p_dst_v,
476
int32_t i_dst_stride,
477
uint32_t u_coef_hor0,
478
uint32_t u_coef_hor1,
479
uint32_t u_coef_ver0,
480
uint32_t u_coef_ver1 )
481
{
482
uint16_t u_out0, u_out1, u_out2, u_out3;
483
v16u8 src0, src1, src2, src3, src4;
484
v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
485
v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
486
v16i8 mask;
487
v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
488
v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
489
v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
490
v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
491
v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );
492
v8i16 res0, res1;
493
494
mask = LD_SB( &pu_chroma_mask_arr[16] );
495
496
LD_UB3( p_src, i_src_stride, src0, src1, src2 );
497
VSHF_B2_UB( src0, src1, src1, src2,
498
( mask + 1 ), ( mask + 1 ), src3, src4 );
499
VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 );
500
DOTP_UB4_UH( src0, src1, src3, src4, coeff_hz_vec, coeff_hz_vec,
501
coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
502
res_hz3 );
503
MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
504
coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
505
res_vt3 );
506
ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt2 );
507
SRARI_H2_UH( res_vt0, res_vt2, 6 );
508
SAT_UH2_UH( res_vt0, res_vt2, 7 );
509
PCKEV_B2_SH( res_vt0, res_vt0, res_vt2, res_vt2, res0, res1 );
510
511
u_out0 = __msa_copy_u_h( res0, 0 );
512
u_out1 = __msa_copy_u_h( res0, 2 );
513
u_out2 = __msa_copy_u_h( res1, 0 );
514
u_out3 = __msa_copy_u_h( res1, 2 );
515
516
SH( u_out0, p_dst_u );
517
p_dst_u += i_dst_stride;
518
SH( u_out1, p_dst_u );
519
520
SH( u_out2, p_dst_v );
521
p_dst_v += i_dst_stride;
522
SH( u_out3, p_dst_v );
523
}
524
525
static void avc_interleaved_chroma_hv_2x4_msa( uint8_t *p_src,
526
int32_t i_src_stride,
527
uint8_t *p_dst_u,
528
uint8_t *p_dst_v,
529
int32_t i_dst_stride,
530
uint32_t u_coef_hor0,
531
uint32_t u_coef_hor1,
532
uint32_t u_coef_ver0,
533
uint32_t u_coef_ver1 )
534
{
535
uint16_t u_out0, u_out1, u_out2, u_out3;
536
v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
537
v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
538
v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
539
v16i8 mask;
540
v8i16 res0, res1;
541
v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
542
v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
543
v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
544
v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
545
v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );
546
547
mask = LD_SB( &pu_chroma_mask_arr[16] );
548
549
LD_UB5( p_src, i_src_stride, src0, src1, src2, src3, src4 );
550
551
VSHF_B2_UB( src0, src1, src1, src2,
552
( mask + 1 ), ( mask + 1 ), src5, src6 );
553
VSHF_B2_UB( src2, src3, src3, src4,
554
( mask + 1 ), ( mask + 1 ), src7, src8 );
555
VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 );
556
VSHF_B2_UB( src2, src3, src3, src4, mask, mask, src2, src3 );
557
DOTP_UB4_UH( src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
558
coeff_hz_vec, coeff_hz_vec, res_hz0,
559
res_hz1, res_hz2, res_hz3 );
560
MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
561
coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
562
res_vt3 );
563
ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 );
564
SRARI_H2_UH( res_vt0, res_vt1, 6 );
565
SAT_UH2_UH( res_vt0, res_vt1, 7 );
566
PCKEV_B2_SH( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 );
567
568
u_out0 = __msa_copy_u_h( res0, 0 );
569
u_out1 = __msa_copy_u_h( res0, 2 );
570
u_out2 = __msa_copy_u_h( res1, 0 );
571
u_out3 = __msa_copy_u_h( res1, 2 );
572
573
SH( u_out0, p_dst_u );
574
p_dst_u += i_dst_stride;
575
SH( u_out1, p_dst_u );
576
p_dst_u += i_dst_stride;
577
SH( u_out2, p_dst_u );
578
p_dst_u += i_dst_stride;
579
SH( u_out3, p_dst_u );
580
581
DOTP_UB4_UH( src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
582
coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
583
res_hz3 );
584
MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
585
coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
586
res_vt3 );
587
ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 );
588
SRARI_H2_UH( res_vt0, res_vt1, 6 );
589
SAT_UH2_UH( res_vt0, res_vt1, 7 );
590
PCKEV_B2_SH( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 );
591
592
u_out0 = __msa_copy_u_h( res0, 0 );
593
u_out1 = __msa_copy_u_h( res0, 2 );
594
u_out2 = __msa_copy_u_h( res1, 0 );
595
u_out3 = __msa_copy_u_h( res1, 2 );
596
597
SH( u_out0, p_dst_v );
598
p_dst_v += i_dst_stride;
599
SH( u_out1, p_dst_v );
600
p_dst_v += i_dst_stride;
601
SH( u_out2, p_dst_v );
602
p_dst_v += i_dst_stride;
603
SH( u_out3, p_dst_v );
604
}
605
606
static void avc_interleaved_chroma_hv_2w_msa( uint8_t *p_src,
607
int32_t i_src_stride,
608
uint8_t *p_dst_u,
609
uint8_t *p_dst_v,
610
int32_t i_dst_stride,
611
uint32_t u_coef_hor0,
612
uint32_t u_coef_hor1,
613
uint32_t u_coef_ver0,
614
uint32_t u_coef_ver1,
615
int32_t i_height )
616
{
617
if( 2 == i_height )
618
{
619
avc_interleaved_chroma_hv_2x2_msa( p_src, i_src_stride,
620
p_dst_u, p_dst_v, i_dst_stride,
621
u_coef_hor0, u_coef_hor1,
622
u_coef_ver0, u_coef_ver1 );
623
}
624
else if( 4 == i_height )
625
{
626
avc_interleaved_chroma_hv_2x4_msa( p_src, i_src_stride,
627
p_dst_u, p_dst_v, i_dst_stride,
628
u_coef_hor0, u_coef_hor1,
629
u_coef_ver0, u_coef_ver1 );
630
}
631
}
632
633
static void avc_interleaved_chroma_hv_4x2_msa( uint8_t *p_src,
634
int32_t i_src_stride,
635
uint8_t *p_dst_u,
636
uint8_t *p_dst_v,
637
int32_t i_dst_stride,
638
uint32_t u_coef_hor0,
639
uint32_t u_coef_hor1,
640
uint32_t u_coef_ver0,
641
uint32_t u_coef_ver1 )
642
{
643
uint32_t u_out0, u_out1, u_out2, u_out3;
644
v16u8 src0, src1, src2, src3, src4;
645
v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
646
v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
647
v16i8 mask;
648
v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
649
v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
650
v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
651
v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
652
v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );
653
v4i32 res0, res1;
654
655
mask = LD_SB( &pu_chroma_mask_arr[16] );
656
657
LD_UB3( p_src, i_src_stride, src0, src1, src2 );
658
VSHF_B2_UB( src0, src1, src1, src2,
659
( mask + 1 ), ( mask + 1 ), src3, src4 );
660
VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 );
661
DOTP_UB4_UH( src0, src1, src3, src4, coeff_hz_vec, coeff_hz_vec,
662
coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
663
res_hz3 );
664
MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
665
coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
666
res_vt3 );
667
ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt2 );
668
SRARI_H2_UH( res_vt0, res_vt2, 6 );
669
SAT_UH2_UH( res_vt0, res_vt2, 7 );
670
PCKEV_B2_SW( res_vt0, res_vt0, res_vt2, res_vt2, res0, res1 );
671
672
u_out0 = __msa_copy_u_w( res0, 0 );
673
u_out1 = __msa_copy_u_w( res0, 1 );
674
u_out2 = __msa_copy_u_w( res1, 0 );
675
u_out3 = __msa_copy_u_w( res1, 1 );
676
SW( u_out0, p_dst_u );
677
p_dst_u += i_dst_stride;
678
SW( u_out1, p_dst_u );
679
SW( u_out2, p_dst_v );
680
p_dst_v += i_dst_stride;
681
SW( u_out3, p_dst_v );
682
}
683
684
static void avc_interleaved_chroma_hv_4x4mul_msa( uint8_t *p_src,
685
int32_t i_src_stride,
686
uint8_t *p_dst_u,
687
uint8_t *p_dst_v,
688
int32_t i_dst_stride,
689
uint32_t u_coef_hor0,
690
uint32_t u_coef_hor1,
691
uint32_t u_coef_ver0,
692
uint32_t u_coef_ver1,
693
int32_t i_height )
694
{
695
uint32_t u_row;
696
v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
697
v8u16 res_hz0, res_hz1, res_hz2, res_hz3;
698
v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
699
v16i8 mask;
700
v4i32 res0, res1;
701
v16i8 coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
702
v16i8 coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
703
v16u8 coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
704
v8u16 coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
705
v8u16 coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );
706
707
mask = LD_SB( &pu_chroma_mask_arr[16] );
708
709
src0 = LD_UB( p_src );
710
p_src += i_src_stride;
711
712
for( u_row = ( i_height >> 2 ); u_row--; )
713
{
714
LD_UB4( p_src, i_src_stride, src1, src2, src3, src4 );
715
p_src += ( 4 * i_src_stride );
716
717
VSHF_B2_UB( src0, src1, src1, src2,
718
( mask + 1 ), ( mask + 1 ), src5, src6 );
719
VSHF_B2_UB( src2, src3, src3, src4,
720
( mask + 1 ), ( mask + 1 ), src7, src8 );
721
VSHF_B2_UB( src0, src1, src1, src2, mask, mask, src0, src1 );
722
VSHF_B2_UB( src2, src3, src3, src4, mask, mask, src2, src3 );
723
DOTP_UB4_UH( src0, src1, src2, src3, coeff_hz_vec, coeff_hz_vec,
724
coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
725
res_hz3 );
726
MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
727
coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
728
res_vt3 );
729
ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 );
730
SRARI_H2_UH( res_vt0, res_vt1, 6 );
731
SAT_UH2_UH( res_vt0, res_vt1, 7 );
732
PCKEV_B2_SW( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 );
733
734
ST4x4_UB( res0, res1, 0, 1, 0, 1, p_dst_u, i_dst_stride );
735
p_dst_u += ( 4 * i_dst_stride );
736
737
DOTP_UB4_UH( src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
738
coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz1, res_hz2,
739
res_hz3 );
740
MUL4( res_hz0, coeff_vt_vec1, res_hz1, coeff_vt_vec0, res_hz2,
741
coeff_vt_vec1, res_hz3, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
742
res_vt3 );
743
ADD2( res_vt0, res_vt1, res_vt2, res_vt3, res_vt0, res_vt1 );
744
SRARI_H2_UH( res_vt0, res_vt1, 6 );
745
SAT_UH2_UH( res_vt0, res_vt1, 7 );
746
PCKEV_B2_SW( res_vt0, res_vt0, res_vt1, res_vt1, res0, res1 );
747
748
ST4x4_UB( res0, res1, 0, 1, 0, 1, p_dst_v, i_dst_stride );
749
p_dst_v += ( 4 * i_dst_stride );
750
src0 = src4;
751
}
752
}
753
754
static void avc_interleaved_chroma_hv_4w_msa( uint8_t *p_src,
755
int32_t i_src_stride,
756
uint8_t *p_dst_u,
757
uint8_t *p_dst_v,
758
int32_t i_dst_stride,
759
uint32_t u_coef_hor0,
760
uint32_t u_coef_hor1,
761
uint32_t u_coef_ver0,
762
uint32_t u_coef_ver1,
763
int32_t i_height )
764
{
765
if( 2 == i_height )
766
{
767
avc_interleaved_chroma_hv_4x2_msa( p_src, i_src_stride,
768
p_dst_u, p_dst_v, i_dst_stride,
769
u_coef_hor0, u_coef_hor1,
770
u_coef_ver0, u_coef_ver1 );
771
}
772
else
773
{
774
avc_interleaved_chroma_hv_4x4mul_msa( p_src, i_src_stride,
775
p_dst_u, p_dst_v, i_dst_stride,
776
u_coef_hor0, u_coef_hor1,
777
u_coef_ver0, u_coef_ver1,
778
i_height );
779
}
780
}
781
782
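/* 8-wide variant of the interleaved chroma H/V filter.  Here the shuffle
 * mask is built inline ({ 0, 2, 2, 4, ... }) to gather one chroma component
 * from the 32 interleaved source bytes loaded per row, with mask + 1
 * selecting the other component; the horizontal dot products of the
 * previous row are carried across iterations in res_hz0/res_hz5. */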
static void avc_interleaved_chroma_hv_8w_msa( uint8_t *p_src,
783
int32_t i_src_stride,
784
uint8_t *p_dst_u,
785
uint8_t *p_dst_v,
786
int32_t i_dst_stride,
787
uint32_t u_coef_hor0,
788
uint32_t u_coef_hor1,
789
uint32_t u_coef_ver0,
790
uint32_t u_coef_ver1,
791
int32_t i_height )
792
{
793
uint32_t u_row;
794
v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
795
v16u8 src10, src11, src12, src13, src14;
796
v8u16 res_hz0, res_hz1, res_hz2, res_hz3, res_hz4, res_hz5;
797
v8u16 res_vt0, res_vt1, res_vt2, res_vt3;
798
v16i8 mask = { 0, 2, 2, 4, 4, 6, 6, 8, 8, 10, 10, 12, 12, 14, 14, 16 };
799
v16i8 coeff_hz_vec0, coeff_hz_vec1;
800
v16i8 tmp0, tmp1;
801
v16u8 coeff_hz_vec;
802
v8u16 coeff_vt_vec0, coeff_vt_vec1;
803
804
coeff_hz_vec0 = __msa_fill_b( u_coef_hor0 );
805
coeff_hz_vec1 = __msa_fill_b( u_coef_hor1 );
806
coeff_hz_vec = ( v16u8 ) __msa_ilvr_b( coeff_hz_vec0, coeff_hz_vec1 );
807
coeff_vt_vec0 = ( v8u16 ) __msa_fill_h( u_coef_ver0 );
808
coeff_vt_vec1 = ( v8u16 ) __msa_fill_h( u_coef_ver1 );
809
810
LD_UB2( p_src, 16, src0, src13 );
811
p_src += i_src_stride;
812
813
VSHF_B2_UB( src0, src13, src0, src13, ( mask + 1 ), mask, src14, src0 );
814
DOTP_UB2_UH( src0, src14, coeff_hz_vec, coeff_hz_vec, res_hz0, res_hz5 );
815
816
for( u_row = ( i_height >> 2 ); u_row--; )
817
{
818
LD_UB4( p_src, i_src_stride, src1, src2, src3, src4 );
819
LD_UB4( p_src + 16, i_src_stride, src5, src6, src7, src8 );
820
p_src += ( 4 * i_src_stride );
821
822
VSHF_B2_UB( src1, src5, src2, src6, mask, mask, src9, src10 );
823
VSHF_B2_UB( src3, src7, src4, src8, mask, mask, src11, src12 );
824
DOTP_UB4_UH( src9, src10, src11, src12, coeff_hz_vec, coeff_hz_vec,
825
coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
826
res_hz4 );
827
MUL4( res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
828
coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
829
res_vt3 );
830
831
res_vt0 += ( res_hz0 * coeff_vt_vec1 );
832
res_vt1 += ( res_hz1 * coeff_vt_vec1 );
833
res_vt2 += ( res_hz2 * coeff_vt_vec1 );
834
res_vt3 += ( res_hz3 * coeff_vt_vec1 );
835
836
SRARI_H4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 6 );
837
SAT_UH4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 7 );
838
PCKEV_B2_SB( res_vt1, res_vt0, res_vt3, res_vt2, tmp0, tmp1 );
839
ST8x4_UB( tmp0, tmp1, p_dst_u, i_dst_stride );
840
p_dst_u += ( 4 * i_dst_stride );
841
res_hz0 = res_hz4;
842
843
VSHF_B2_UB( src1, src5, src2, src6,
844
( mask + 1 ), ( mask + 1 ), src5, src6 );
845
VSHF_B2_UB( src3, src7, src4, src8,
846
( mask + 1 ), ( mask + 1 ), src7, src8 );
847
DOTP_UB4_UH( src5, src6, src7, src8, coeff_hz_vec, coeff_hz_vec,
848
coeff_hz_vec, coeff_hz_vec, res_hz1, res_hz2, res_hz3,
849
res_hz4 );
850
MUL4( res_hz1, coeff_vt_vec0, res_hz2, coeff_vt_vec0, res_hz3,
851
coeff_vt_vec0, res_hz4, coeff_vt_vec0, res_vt0, res_vt1, res_vt2,
852
res_vt3 );
853
854
res_vt0 += ( res_hz5 * coeff_vt_vec1 );
855
res_vt1 += ( res_hz1 * coeff_vt_vec1 );
856
res_vt2 += ( res_hz2 * coeff_vt_vec1 );
857
res_vt3 += ( res_hz3 * coeff_vt_vec1 );
858
859
SRARI_H4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 6 );
860
SAT_UH4_UH( res_vt0, res_vt1, res_vt2, res_vt3, 7 );
861
PCKEV_B2_SB( res_vt1, res_vt0, res_vt3, res_vt2, tmp0, tmp1 );
862
ST8x4_UB( tmp0, tmp1, p_dst_v, i_dst_stride );
863
p_dst_v += ( 4 * i_dst_stride );
864
res_hz5 = res_hz4;
865
}
866
}
867
868
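/* Explicit weighted prediction (single reference), 4x2 block: the offset is
 * pre-shifted by log2_denom and a rounding term added, so one logical shift
 * after the multiply yields
 *   dst = clip( ( ( src * weight + rnd ) >> log2_denom ) + offset )
 * with the clamp done by the MAXI/SAT pair.  The wider avc_wgt_opscale_*
 * helpers below follow the same scheme. */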
static void avc_wgt_opscale_4x2_msa( uint8_t *p_src, int32_t i_src_stride,
869
uint8_t *p_dst, int32_t i_dst_stride,
870
int32_t i_log2_denom, int32_t i_weight,
871
int32_t i_offset_in )
872
{
873
uint32_t u_load0, u_load1, u_out0, u_out1;
874
v16u8 zero = { 0 };
875
v16u8 src0, src1;
876
v4i32 dst0, dst1;
877
v8u16 temp0, temp1, wgt, denom, offset, tp0, tp1;
878
v8i16 vec0, vec1;
879
880
i_offset_in <<= ( i_log2_denom );
881
882
if( i_log2_denom )
883
{
884
i_offset_in += ( 1 << ( i_log2_denom - 1 ) );
885
}
886
887
wgt = ( v8u16 ) __msa_fill_h( i_weight );
888
offset = ( v8u16 ) __msa_fill_h( i_offset_in );
889
denom = ( v8u16 ) __msa_fill_h( i_log2_denom );
890
891
u_load0 = LW( p_src );
892
p_src += i_src_stride;
893
u_load1 = LW( p_src );
894
895
src0 = ( v16u8 ) __msa_fill_w( u_load0 );
896
src1 = ( v16u8 ) __msa_fill_w( u_load1 );
897
898
ILVR_B2_UH( zero, src0, zero, src1, temp0, temp1 );
899
MUL2( wgt, temp0, wgt, temp1, temp0, temp1 );
900
ADDS_SH2_SH( temp0, offset, temp1, offset, vec0, vec1 );
901
MAXI_SH2_SH( vec0, vec1, 0 );
902
903
tp0 = ( v8u16 ) __msa_srl_h( vec0, ( v8i16 ) denom );
904
tp1 = ( v8u16 ) __msa_srl_h( vec1, ( v8i16 ) denom );
905
906
SAT_UH2_UH( tp0, tp1, 7 );
907
PCKEV_B2_SW( tp0, tp0, tp1, tp1, dst0, dst1 );
908
909
u_out0 = __msa_copy_u_w( dst0, 0 );
910
u_out1 = __msa_copy_u_w( dst1, 0 );
911
SW( u_out0, p_dst );
912
p_dst += i_dst_stride;
913
SW( u_out1, p_dst );
914
}
915
916
static void avc_wgt_opscale_4x4multiple_msa( uint8_t *p_src,
917
int32_t i_src_stride,
918
uint8_t *p_dst,
919
int32_t i_dst_stride,
920
int32_t i_height,
921
int32_t i_log2_denom,
922
int32_t i_weight,
923
int32_t i_offset_in )
924
{
925
uint8_t u_cnt;
926
uint32_t u_load0, u_load1, u_load2, u_load3;
927
v16u8 zero = { 0 };
928
v16u8 src0, src1, src2, src3;
929
v8u16 temp0, temp1, temp2, temp3;
930
v8u16 wgt, denom, offset;
931
932
i_offset_in <<= ( i_log2_denom );
933
934
if( i_log2_denom )
935
{
936
i_offset_in += ( 1 << ( i_log2_denom - 1 ) );
937
}
938
939
wgt = ( v8u16 ) __msa_fill_h( i_weight );
940
offset = ( v8u16 ) __msa_fill_h( i_offset_in );
941
denom = ( v8u16 ) __msa_fill_h( i_log2_denom );
942
943
for( u_cnt = i_height / 4; u_cnt--; )
944
{
945
LW4( p_src, i_src_stride, u_load0, u_load1, u_load2, u_load3 );
946
p_src += 4 * i_src_stride;
947
948
src0 = ( v16u8 ) __msa_fill_w( u_load0 );
949
src1 = ( v16u8 ) __msa_fill_w( u_load1 );
950
src2 = ( v16u8 ) __msa_fill_w( u_load2 );
951
src3 = ( v16u8 ) __msa_fill_w( u_load3 );
952
953
ILVR_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3,
954
temp0, temp1, temp2, temp3 );
955
MUL4( wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3,
956
temp0, temp1, temp2, temp3 );
957
ADDS_SH4_UH( temp0, offset, temp1, offset, temp2, offset, temp3, offset,
958
temp0, temp1, temp2, temp3 );
959
MAXI_SH4_UH( temp0, temp1, temp2, temp3, 0 );
960
SRL_H4_UH( temp0, temp1, temp2, temp3, denom );
961
SAT_UH4_UH( temp0, temp1, temp2, temp3, 7 );
962
PCKEV_ST4x4_UB( temp0, temp1, temp2, temp3, p_dst, i_dst_stride );
963
p_dst += ( 4 * i_dst_stride );
964
}
965
}
966
967
static void avc_wgt_opscale_4width_msa( uint8_t *p_src, int32_t i_src_stride,
968
uint8_t *p_dst, int32_t i_dst_stride,
969
int32_t i_height, int32_t i_log2_denom,
970
int32_t i_weight, int32_t i_offset_in )
971
{
972
if( 2 == i_height )
973
{
974
avc_wgt_opscale_4x2_msa( p_src, i_src_stride, p_dst, i_dst_stride,
975
i_log2_denom, i_weight, i_offset_in );
976
}
977
else
978
{
979
avc_wgt_opscale_4x4multiple_msa( p_src, i_src_stride,
980
p_dst, i_dst_stride,
981
i_height, i_log2_denom,
982
i_weight, i_offset_in );
983
}
984
}
985
986
static void avc_wgt_opscale_8width_msa( uint8_t *p_src, int32_t i_src_stride,
987
uint8_t *p_dst, int32_t i_dst_stride,
988
int32_t i_height, int32_t i_log2_denom,
989
int32_t i_weight, int32_t i_offset_in )
990
{
991
uint8_t u_cnt;
992
v16u8 zero = { 0 };
993
v16u8 src0, src1, src2, src3;
994
v8u16 temp0, temp1, temp2, temp3;
995
v8u16 wgt, denom, offset;
996
v16i8 out0, out1;
997
998
i_offset_in <<= ( i_log2_denom );
999
1000
if( i_log2_denom )
1001
{
1002
i_offset_in += ( 1 << ( i_log2_denom - 1 ) );
1003
}
1004
1005
wgt = ( v8u16 ) __msa_fill_h( i_weight );
1006
offset = ( v8u16 ) __msa_fill_h( i_offset_in );
1007
denom = ( v8u16 ) __msa_fill_h( i_log2_denom );
1008
1009
for( u_cnt = i_height / 4; u_cnt--; )
1010
{
1011
LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
1012
p_src += 4 * i_src_stride;
1013
1014
ILVR_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3,
1015
temp0, temp1, temp2, temp3 );
1016
MUL4( wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3,
1017
temp0, temp1, temp2, temp3 );
1018
ADDS_SH4_UH( temp0, offset, temp1, offset, temp2, offset, temp3, offset,
1019
temp0, temp1, temp2, temp3 );
1020
MAXI_SH4_UH( temp0, temp1, temp2, temp3, 0 );
1021
SRL_H4_UH( temp0, temp1, temp2, temp3, denom );
1022
SAT_UH4_UH( temp0, temp1, temp2, temp3, 7 );
1023
PCKEV_B2_SB( temp1, temp0, temp3, temp2, out0, out1 );
1024
ST8x4_UB( out0, out1, p_dst, i_dst_stride );
1025
p_dst += ( 4 * i_dst_stride );
1026
}
1027
}
1028
1029
static void avc_wgt_opscale_16width_msa( uint8_t *p_src, int32_t i_src_stride,
1030
uint8_t *p_dst, int32_t i_dst_stride,
1031
int32_t i_height, int32_t i_log2_denom,
1032
int32_t i_weight, int32_t i_offset_in )
1033
{
1034
uint8_t u_cnt;
1035
v16i8 zero = { 0 };
1036
v16u8 src0, src1, src2, src3, dst0, dst1, dst2, dst3;
1037
v8u16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
1038
v8u16 wgt, denom, offset;
1039
1040
i_offset_in <<= ( i_log2_denom );
1041
1042
if( i_log2_denom )
1043
{
1044
i_offset_in += ( 1 << ( i_log2_denom - 1 ) );
1045
}
1046
1047
wgt = ( v8u16 ) __msa_fill_h( i_weight );
1048
offset = ( v8u16 ) __msa_fill_h( i_offset_in );
1049
denom = ( v8u16 ) __msa_fill_h( i_log2_denom );
1050
1051
for( u_cnt = i_height / 4; u_cnt--; )
1052
{
1053
LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
1054
p_src += 4 * i_src_stride;
1055
1056
ILVR_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3,
1057
temp0, temp2, temp4, temp6 );
1058
ILVL_B4_UH( zero, src0, zero, src1, zero, src2, zero, src3,
1059
temp1, temp3, temp5, temp7 );
1060
MUL4( wgt, temp0, wgt, temp1, wgt, temp2, wgt, temp3,
1061
temp0, temp1, temp2, temp3 );
1062
MUL4( wgt, temp4, wgt, temp5, wgt, temp6, wgt, temp7,
1063
temp4, temp5, temp6, temp7 );
1064
ADDS_SH4_UH( temp0, offset, temp1, offset, temp2, offset, temp3, offset,
1065
temp0, temp1, temp2, temp3 );
1066
ADDS_SH4_UH( temp4, offset, temp5, offset, temp6, offset, temp7, offset,
1067
temp4, temp5, temp6, temp7 );
1068
MAXI_SH4_UH( temp0, temp1, temp2, temp3, 0 );
1069
MAXI_SH4_UH( temp4, temp5, temp6, temp7, 0 );
1070
SRL_H4_UH( temp0, temp1, temp2, temp3, denom );
1071
SRL_H4_UH( temp4, temp5, temp6, temp7, denom );
1072
SAT_UH4_UH( temp0, temp1, temp2, temp3, 7 );
1073
SAT_UH4_UH( temp4, temp5, temp6, temp7, 7 );
1074
PCKEV_B4_UB( temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
1075
dst0, dst1, dst2, dst3 );
1076
1077
ST_UB4( dst0, dst1, dst2, dst3, p_dst, i_dst_stride );
1078
p_dst += 4 * i_dst_stride;
1079
}
1080
}
1081
1082
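/* Bi-directional weighted prediction, "nw" flavour: the two references are
 * scaled by their weights, summed and rounded with a single arithmetic
 * shift by (log2_denom + 1), then clipped to 0..255.  i_offset_in appears
 * to be accepted only for interface symmetry; it is not used in these
 * _nw_ variants. */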
static void avc_biwgt_opscale_4x2_nw_msa( uint8_t *p_src1_in,
1083
int32_t i_src1_stride,
1084
uint8_t *p_src2_in,
1085
int32_t i_src2_stride,
1086
uint8_t *p_dst,
1087
int32_t i_dst_stride,
1088
int32_t i_log2_denom,
1089
int32_t i_src1_weight,
1090
int32_t i_src2_weight,
1091
int32_t i_offset_in )
1092
{
1093
uint32_t u_load0, u_load1, u_out0, u_out1;
1094
v8i16 src1_wgt, src2_wgt;
1095
v16u8 in0, in1, in2, in3;
1096
v8i16 temp0, temp1, temp2, temp3;
1097
v16i8 zero = { 0 };
1098
v8i16 denom = __msa_ldi_h( i_log2_denom + 1 );
1099
1100
src1_wgt = __msa_fill_h( i_src1_weight );
1101
src2_wgt = __msa_fill_h( i_src2_weight );
1102
u_load0 = LW( p_src1_in );
1103
u_load1 = LW( p_src1_in + i_src1_stride );
1104
in0 = ( v16u8 ) __msa_fill_w( u_load0 );
1105
in1 = ( v16u8 ) __msa_fill_w( u_load1 );
1106
u_load0 = LW( p_src2_in );
1107
u_load1 = LW( p_src2_in + i_src2_stride );
1108
in2 = ( v16u8 ) __msa_fill_w( u_load0 );
1109
in3 = ( v16u8 ) __msa_fill_w( u_load1 );
1110
ILVR_B4_SH( zero, in0, zero, in1, zero, in2, zero, in3,
1111
temp0, temp1, temp2, temp3 );
1112
temp0 = ( temp0 * src1_wgt ) + ( temp2 * src2_wgt );
1113
temp1 = ( temp1 * src1_wgt ) + ( temp3 * src2_wgt );
1114
SRAR_H2_SH( temp0, temp1, denom );
1115
CLIP_SH2_0_255( temp0, temp1 );
1116
PCKEV_B2_UB( temp0, temp0, temp1, temp1, in0, in1 );
1117
u_out0 = __msa_copy_u_w( ( v4i32 ) in0, 0 );
1118
u_out1 = __msa_copy_u_w( ( v4i32 ) in1, 0 );
1119
SW( u_out0, p_dst );
1120
p_dst += i_dst_stride;
1121
SW( u_out1, p_dst );
1122
}
1123
1124
static void avc_biwgt_opscale_4x4multiple_nw_msa( uint8_t *p_src1_in,
1125
int32_t i_src1_stride,
1126
uint8_t *p_src2_in,
1127
int32_t i_src2_stride,
1128
uint8_t *p_dst,
1129
int32_t i_dst_stride,
1130
int32_t i_height,
1131
int32_t i_log2_denom,
1132
int32_t i_src1_weight,
1133
int32_t i_src2_weight,
1134
int32_t i_offset_in )
1135
{
1136
uint8_t u_cnt;
1137
uint32_t u_load0, u_load1, u_load2, u_load3;
1138
v8i16 src1_wgt, src2_wgt;
1139
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1140
v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
1141
v16i8 zero = { 0 };
1142
v8i16 denom = __msa_ldi_h( i_log2_denom + 1 );
1143
1144
src1_wgt = __msa_fill_h( i_src1_weight );
1145
src2_wgt = __msa_fill_h( i_src2_weight );
1146
for( u_cnt = i_height / 4; u_cnt--; )
1147
{
1148
LW4( p_src1_in, i_src1_stride, u_load0, u_load1, u_load2, u_load3 );
1149
p_src1_in += ( 4 * i_src1_stride );
1150
src0 = ( v16u8 ) __msa_fill_w( u_load0 );
1151
src1 = ( v16u8 ) __msa_fill_w( u_load1 );
1152
src2 = ( v16u8 ) __msa_fill_w( u_load2 );
1153
src3 = ( v16u8 ) __msa_fill_w( u_load3 );
1154
LW4( p_src2_in, i_src2_stride, u_load0, u_load1, u_load2, u_load3 );
1155
p_src2_in += ( 4 * i_src2_stride );
1156
src4 = ( v16u8 ) __msa_fill_w( u_load0 );
1157
src5 = ( v16u8 ) __msa_fill_w( u_load1 );
1158
src6 = ( v16u8 ) __msa_fill_w( u_load2 );
1159
src7 = ( v16u8 ) __msa_fill_w( u_load3 );
1160
ILVR_B4_SH( zero, src0, zero, src1, zero, src2, zero, src3,
1161
temp0, temp1, temp2, temp3 );
1162
ILVR_B4_SH( zero, src4, zero, src5, zero, src6, zero, src7,
1163
temp4, temp5, temp6, temp7 );
1164
temp0 = ( temp0 * src1_wgt ) + ( temp4 * src2_wgt );
1165
temp1 = ( temp1 * src1_wgt ) + ( temp5 * src2_wgt );
1166
temp2 = ( temp2 * src1_wgt ) + ( temp6 * src2_wgt );
1167
temp3 = ( temp3 * src1_wgt ) + ( temp7 * src2_wgt );
1168
SRAR_H4_SH( temp0, temp1, temp2, temp3, denom );
1169
CLIP_SH4_0_255( temp0, temp1, temp2, temp3 );
1170
PCKEV_ST4x4_UB( temp0, temp1, temp2, temp3, p_dst, i_dst_stride );
1171
p_dst += ( 4 * i_dst_stride );
1172
}
1173
}
1174
1175
static void avc_biwgt_opscale_4width_nw_msa( uint8_t *p_src1_in,
1176
int32_t i_src1_stride,
1177
uint8_t *p_src2_in,
1178
int32_t i_src2_stride,
1179
uint8_t *p_dst,
1180
int32_t i_dst_stride,
1181
int32_t i_height,
1182
int32_t i_log2_denom,
1183
int32_t i_src1_weight,
1184
int32_t i_src2_weight,
1185
int32_t i_offset_in )
1186
{
1187
if( 2 == i_height )
1188
{
1189
avc_biwgt_opscale_4x2_nw_msa( p_src1_in, i_src1_stride,
1190
p_src2_in, i_src2_stride,
1191
p_dst, i_dst_stride,
1192
i_log2_denom, i_src1_weight,
1193
i_src2_weight, i_offset_in );
1194
}
1195
else
1196
{
1197
avc_biwgt_opscale_4x4multiple_nw_msa( p_src1_in, i_src1_stride,
1198
p_src2_in, i_src2_stride,
1199
p_dst, i_dst_stride,
1200
i_height, i_log2_denom,
1201
i_src1_weight, i_src2_weight,
1202
i_offset_in );
1203
}
1204
}
1205
1206
static void avc_biwgt_opscale_8width_nw_msa( uint8_t *p_src1_in,
1207
int32_t i_src1_stride,
1208
uint8_t *p_src2_in,
1209
int32_t i_src2_stride,
1210
uint8_t *p_dst,
1211
int32_t i_dst_stride,
1212
int32_t i_height,
1213
int32_t i_log2_denom,
1214
int32_t i_src1_weight,
1215
int32_t i_src2_weight,
1216
int32_t i_offset_in )
1217
{
1218
uint8_t u_cnt;
1219
v8i16 src1_wgt, src2_wgt;
1220
v16u8 src0, src1, src2, src3;
1221
v16u8 dst0, dst1, dst2, dst3;
1222
v8i16 temp0, temp1, temp2, temp3;
1223
v8i16 res0, res1, res2, res3;
1224
v16i8 zero = { 0 };
1225
v8i16 denom = __msa_ldi_h( i_log2_denom + 1 );
1226
1227
src1_wgt = __msa_fill_h( i_src1_weight );
1228
src2_wgt = __msa_fill_h( i_src2_weight );
1229
1230
for( u_cnt = i_height / 4; u_cnt--; )
1231
{
1232
LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 );
1233
p_src1_in += ( 4 * i_src1_stride );
1234
LD_UB4( p_src2_in, i_src2_stride, dst0, dst1, dst2, dst3 );
1235
p_src2_in += ( 4 * i_src2_stride );
1236
ILVR_B4_SH( zero, src0, zero, src1, zero, src2, zero, src3,
1237
temp0, temp1, temp2, temp3 );
1238
ILVR_B4_SH( zero, dst0, zero, dst1, zero, dst2, zero, dst3,
1239
res0, res1, res2, res3 );
1240
res0 = ( temp0 * src1_wgt ) + ( res0 * src2_wgt );
1241
res1 = ( temp1 * src1_wgt ) + ( res1 * src2_wgt );
1242
res2 = ( temp2 * src1_wgt ) + ( res2 * src2_wgt );
1243
res3 = ( temp3 * src1_wgt ) + ( res3 * src2_wgt );
1244
SRAR_H4_SH( res0, res1, res2, res3, denom );
1245
CLIP_SH4_0_255( res0, res1, res2, res3 );
1246
PCKEV_B4_UB( res0, res0, res1, res1, res2, res2, res3, res3,
1247
dst0, dst1, dst2, dst3 );
1248
ST8x1_UB( dst0, p_dst );
1249
p_dst += i_dst_stride;
1250
ST8x1_UB( dst1, p_dst );
1251
p_dst += i_dst_stride;
1252
ST8x1_UB( dst2, p_dst );
1253
p_dst += i_dst_stride;
1254
ST8x1_UB( dst3, p_dst );
1255
p_dst += i_dst_stride;
1256
}
1257
}
1258
1259
static void avc_biwgt_opscale_16width_nw_msa( uint8_t *p_src1_in,
1260
int32_t i_src1_stride,
1261
uint8_t *p_src2_in,
1262
int32_t i_src2_stride,
1263
uint8_t *p_dst,
1264
int32_t i_dst_stride,
1265
int32_t i_height,
1266
int32_t i_log2_denom,
1267
int32_t i_src1_weight,
1268
int32_t i_src2_weight,
1269
int32_t i_offset_in )
1270
{
1271
uint8_t u_cnt;
1272
v8i16 src1_wgt, src2_wgt;
1273
v16u8 src0, src1, src2, src3;
1274
v16u8 dst0, dst1, dst2, dst3;
1275
v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
1276
v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
1277
v16i8 zero = { 0 };
1278
v8i16 denom = __msa_ldi_h( i_log2_denom + 1 );
1279
1280
src1_wgt = __msa_fill_h( i_src1_weight );
1281
src2_wgt = __msa_fill_h( i_src2_weight );
1282
1283
for( u_cnt = i_height / 4; u_cnt--; )
1284
{
1285
LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 );
1286
p_src1_in += ( 4 * i_src1_stride );
1287
LD_UB4( p_src2_in, i_src2_stride, dst0, dst1, dst2, dst3 );
1288
p_src2_in += ( 4 * i_src2_stride );
1289
ILVRL_B2_SH( zero, src0, temp1, temp0 );
1290
ILVRL_B2_SH( zero, src1, temp3, temp2 );
1291
ILVRL_B2_SH( zero, src2, temp5, temp4 );
1292
ILVRL_B2_SH( zero, src3, temp7, temp6 );
1293
ILVRL_B2_SH( zero, dst0, res1, res0 );
1294
ILVRL_B2_SH( zero, dst1, res3, res2 );
1295
ILVRL_B2_SH( zero, dst2, res5, res4 );
1296
ILVRL_B2_SH( zero, dst3, res7, res6 );
1297
res0 = ( temp0 * src1_wgt ) + ( res0 * src2_wgt );
1298
res1 = ( temp1 * src1_wgt ) + ( res1 * src2_wgt );
1299
res2 = ( temp2 * src1_wgt ) + ( res2 * src2_wgt );
1300
res3 = ( temp3 * src1_wgt ) + ( res3 * src2_wgt );
1301
res4 = ( temp4 * src1_wgt ) + ( res4 * src2_wgt );
1302
res5 = ( temp5 * src1_wgt ) + ( res5 * src2_wgt );
1303
res6 = ( temp6 * src1_wgt ) + ( res6 * src2_wgt );
1304
res7 = ( temp7 * src1_wgt ) + ( res7 * src2_wgt );
1305
SRAR_H4_SH( res0, res1, res2, res3, denom );
1306
SRAR_H4_SH( res4, res5, res6, res7, denom );
1307
CLIP_SH4_0_255( res0, res1, res2, res3 );
1308
CLIP_SH4_0_255( res4, res5, res6, res7 );
1309
PCKEV_B4_UB( res0, res1, res2, res3, res4, res5, res6, res7,
1310
dst0, dst1, dst2, dst3 );
1311
ST_UB4( dst0, dst1, dst2, dst3, p_dst, i_dst_stride );
1312
p_dst += 4 * i_dst_stride;
1313
}
1314
}
1315
1316
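/* Bi-directional weighted prediction with offset: the source pairs are byte
 * interleaved against the packed (weight1, weight2) vector so a single
 * __msa_dpadd_u_h computes w1*p1 + w2*p2 on top of an accumulator that is
 * pre-loaded with the rounding/offset term ((offset + 1) | 1) << log2_denom;
 * a logical shift by (log2_denom + 1) and the MAXI/SAT pair finish the
 * average. */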
static void avc_biwgt_opscale_4x2_msa( uint8_t *p_src1_in,
1317
int32_t i_src1_stride,
1318
uint8_t *p_src2_in,
1319
int32_t i_src2_stride,
1320
uint8_t *p_dst, int32_t i_dst_stride,
1321
int32_t i_log2_denom,
1322
int32_t i_src1_weight,
1323
int32_t i_src2_weight,
1324
int32_t i_offset_in )
1325
{
1326
uint32_t u_load0, u_load1, u_out0, u_out1;
1327
v16u8 src1_wgt, src2_wgt, wgt;
1328
v16i8 in0, in1, in2, in3;
1329
v8u16 temp0, temp1, denom, offset;
1330
1331
i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom;
1332
1333
src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight );
1334
src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight );
1335
offset = ( v8u16 ) __msa_fill_h( i_offset_in );
1336
denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 );
1337
1338
wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt );
1339
1340
u_load0 = LW( p_src1_in );
1341
u_load1 = LW( p_src1_in + i_src1_stride );
1342
in0 = ( v16i8 ) __msa_fill_w( u_load0 );
1343
in1 = ( v16i8 ) __msa_fill_w( u_load1 );
1344
1345
u_load0 = LW( p_src2_in );
1346
u_load1 = LW( p_src2_in + i_src2_stride );
1347
in2 = ( v16i8 ) __msa_fill_w( u_load0 );
1348
in3 = ( v16i8 ) __msa_fill_w( u_load1 );
1349
1350
ILVR_B2_SB( in2, in0, in3, in1, in0, in1 );
1351
1352
temp0 = __msa_dpadd_u_h( offset, wgt, ( v16u8 ) in0 );
1353
temp1 = __msa_dpadd_u_h( offset, wgt, ( v16u8 ) in1 );
1354
temp0 >>= denom;
1355
temp1 >>= denom;
1356
MAXI_SH2_UH( temp0, temp1, 0 );
1357
SAT_UH2_UH( temp0, temp1, 7 );
1358
PCKEV_B2_SB( temp0, temp0, temp1, temp1, in0, in1 );
1359
1360
u_out0 = __msa_copy_u_w( ( v4i32 ) in0, 0 );
1361
u_out1 = __msa_copy_u_w( ( v4i32 ) in1, 0 );
1362
SW( u_out0, p_dst );
1363
p_dst += i_dst_stride;
1364
SW( u_out1, p_dst );
1365
}
1366
1367
static void avc_biwgt_opscale_4x4multiple_msa( uint8_t *p_src1_in,
1368
int32_t i_src1_stride,
1369
uint8_t *p_src2_in,
1370
int32_t i_src2_stride,
1371
uint8_t *p_dst,
1372
int32_t i_dst_stride,
1373
int32_t i_height,
1374
int32_t i_log2_denom,
1375
int32_t i_src1_weight,
1376
int32_t i_src2_weight,
1377
int32_t i_offset_in )
1378
{
1379
uint8_t u_cnt;
1380
uint32_t u_load0, u_load1, u_load2, u_load3;
1381
v16u8 src1_wgt, src2_wgt, wgt;
1382
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1383
v16u8 temp0, temp1, temp2, temp3;
1384
v8u16 res0, res1, res2, res3;
1385
v8u16 denom, offset;
1386
1387
i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom;
1388
1389
src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight );
1390
src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight );
1391
offset = ( v8u16 ) __msa_fill_h( i_offset_in );
1392
denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 );
1393
1394
wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt );
1395
1396
for( u_cnt = i_height / 4; u_cnt--; )
1397
{
1398
LW4( p_src1_in, i_src1_stride, u_load0, u_load1, u_load2, u_load3 );
1399
p_src1_in += ( 4 * i_src1_stride );
1400
1401
src0 = ( v16u8 ) __msa_fill_w( u_load0 );
1402
src1 = ( v16u8 ) __msa_fill_w( u_load1 );
1403
src2 = ( v16u8 ) __msa_fill_w( u_load2 );
1404
src3 = ( v16u8 ) __msa_fill_w( u_load3 );
1405
1406
LW4( p_src2_in, i_src2_stride, u_load0, u_load1, u_load2, u_load3 );
1407
p_src2_in += ( 4 * i_src2_stride );
1408
1409
src4 = ( v16u8 ) __msa_fill_w( u_load0 );
1410
src5 = ( v16u8 ) __msa_fill_w( u_load1 );
1411
src6 = ( v16u8 ) __msa_fill_w( u_load2 );
1412
src7 = ( v16u8 ) __msa_fill_w( u_load3 );
1413
1414
ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
1415
temp0, temp1, temp2, temp3 );
1416
DOTP_UB4_UH( temp0, temp1, temp2, temp3, wgt, wgt, wgt, wgt,
1417
res0, res1, res2, res3 );
1418
ADD4( res0, offset, res1, offset, res2, offset, res3, offset,
1419
res0, res1, res2, res3 );
1420
SRA_4V( res0, res1, res2, res3, denom );
1421
MAXI_SH4_UH( res0, res1, res2, res3, 0 );
1422
SAT_UH4_UH( res0, res1, res2, res3, 7 );
1423
PCKEV_ST4x4_UB( res0, res1, res2, res3, p_dst, i_dst_stride );
1424
p_dst += ( 4 * i_dst_stride );
1425
}
1426
}
1427
1428
static void avc_biwgt_opscale_4width_msa( uint8_t *p_src1_in,
1429
int32_t i_src1_stride,
1430
uint8_t *p_src2_in,
1431
int32_t i_src2_stride,
1432
uint8_t *p_dst,
1433
int32_t i_dst_stride,
1434
int32_t i_height,
1435
int32_t i_log2_denom,
1436
int32_t i_src1_weight,
1437
int32_t i_src2_weight,
1438
int32_t i_offset_in )
1439
{
1440
if( 2 == i_height )
1441
{
1442
avc_biwgt_opscale_4x2_msa( p_src1_in, i_src1_stride,
1443
p_src2_in, i_src2_stride,
1444
p_dst, i_dst_stride,
1445
i_log2_denom, i_src1_weight,
1446
i_src2_weight, i_offset_in );
1447
}
1448
else
1449
{
1450
avc_biwgt_opscale_4x4multiple_msa( p_src1_in, i_src1_stride,
1451
p_src2_in, i_src2_stride,
1452
p_dst, i_dst_stride,
1453
i_height, i_log2_denom,
1454
i_src1_weight,
1455
i_src2_weight, i_offset_in );
1456
}
1457
}
1458
1459
1460
static void avc_biwgt_opscale_8width_msa( uint8_t *p_src1_in,
1461
int32_t i_src1_stride,
1462
uint8_t *p_src2_in,
1463
int32_t i_src2_stride,
1464
uint8_t *p_dst,
1465
int32_t i_dst_stride,
1466
int32_t i_height,
1467
int32_t i_log2_denom,
1468
int32_t i_src1_weight,
1469
int32_t i_src2_weight,
1470
int32_t i_offset_in )
1471
{
1472
uint8_t u_cnt;
1473
v16u8 src1_wgt, src2_wgt, wgt;
1474
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1475
v16u8 temp0, temp1, temp2, temp3;
1476
v8u16 res0, res1, res2, res3;
1477
v8u16 denom, offset;
1478
v16i8 out0, out1;
1479
1480
i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom;
1481
1482
src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight );
1483
src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight );
1484
offset = ( v8u16 ) __msa_fill_h( i_offset_in );
1485
denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 );
1486
1487
wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt );
1488
1489
for( u_cnt = i_height / 4; u_cnt--; )
1490
{
1491
LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 );
1492
p_src1_in += ( 4 * i_src1_stride );
1493
1494
LD_UB4( p_src2_in, i_src2_stride, src4, src5, src6, src7 );
1495
p_src2_in += ( 4 * i_src2_stride );
1496
1497
ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
1498
temp0, temp1, temp2, temp3 );
1499
DOTP_UB4_UH( temp0, temp1, temp2, temp3, wgt, wgt, wgt, wgt,
1500
res0, res1, res2, res3 );
1501
ADD4( res0, offset, res1, offset, res2, offset, res3, offset,
1502
res0, res1, res2, res3 );
1503
SRA_4V( res0, res1, res2, res3, denom );
1504
MAXI_SH4_UH( res0, res1, res2, res3, 0 );
1505
SAT_UH4_UH( res0, res1, res2, res3, 7 );
1506
PCKEV_B2_SB( res1, res0, res3, res2, out0, out1 );
1507
ST8x4_UB( out0, out1, p_dst, i_dst_stride );
1508
p_dst += 4 * i_dst_stride;
1509
}
1510
}
1511
1512
static void avc_biwgt_opscale_16width_msa( uint8_t *p_src1_in,
1513
int32_t i_src1_stride,
1514
uint8_t *p_src2_in,
1515
int32_t i_src2_stride,
1516
uint8_t *p_dst,
1517
int32_t i_dst_stride,
1518
int32_t i_height,
1519
int32_t i_log2_denom,
1520
int32_t i_src1_weight,
1521
int32_t i_src2_weight,
1522
int32_t i_offset_in )
1523
{
1524
uint8_t u_cnt;
1525
v16u8 src1_wgt, src2_wgt, wgt;
1526
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1527
v16u8 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
1528
v8u16 res0, res1, res2, res3, res4, res5, res6, res7;
1529
v8u16 denom, offset;
1530
1531
i_offset_in = ( ( i_offset_in + 1 ) | 1 ) << i_log2_denom;
1532
1533
src1_wgt = ( v16u8 ) __msa_fill_b( i_src1_weight );
1534
src2_wgt = ( v16u8 ) __msa_fill_b( i_src2_weight );
1535
offset = ( v8u16 ) __msa_fill_h( i_offset_in );
1536
denom = ( v8u16 ) __msa_fill_h( i_log2_denom + 1 );
1537
1538
wgt = ( v16u8 ) __msa_ilvev_b( ( v16i8 ) src2_wgt, ( v16i8 ) src1_wgt );
1539
1540
for( u_cnt = i_height / 4; u_cnt--; )
1541
{
1542
LD_UB4( p_src1_in, i_src1_stride, src0, src1, src2, src3 );
1543
p_src1_in += ( 4 * i_src1_stride );
1544
1545
LD_UB4( p_src2_in, i_src2_stride, src4, src5, src6, src7 );
1546
p_src2_in += ( 4 * i_src2_stride );
1547
1548
ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
1549
temp0, temp2, temp4, temp6 );
1550
ILVL_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
1551
temp1, temp3, temp5, temp7 );
1552
DOTP_UB4_UH( temp0, temp1, temp2, temp3, wgt, wgt, wgt, wgt,
1553
res0, res1, res2, res3 );
1554
ADD4( res0, offset, res1, offset, res2, offset, res3, offset,
1555
res0, res1, res2, res3 );
1556
DOTP_UB4_UH( temp4, temp5, temp6, temp7, wgt, wgt, wgt, wgt,
1557
res4, res5, res6, res7 );
1558
ADD4( res4, offset, res5, offset, res6, offset, res7, offset,
1559
res4, res5, res6, res7 );
1560
SRA_4V( res0, res1, res2, res3, denom );
1561
SRA_4V( res4, res5, res6, res7, denom );
1562
MAXI_SH4_UH( res0, res1, res2, res3, 0 );
1563
MAXI_SH4_UH( res4, res5, res6, res7, 0 );
1564
SAT_UH4_UH( res0, res1, res2, res3, 7 );
1565
SAT_UH4_UH( res4, res5, res6, res7, 7 );
1566
PCKEV_B4_UB( res1, res0, res3, res2, res5, res4, res7, res6,
1567
temp0, temp1, temp2, temp3 );
1568
ST_UB4( temp0, temp1, temp2, temp3, p_dst, i_dst_stride );
1569
p_dst += 4 * i_dst_stride;
1570
}
1571
}
1572
1573
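/* Plain block copies: copy_width4/8/16_msa move i_height rows of 4, 8 or 16
 * bytes from p_src to p_dst, unrolled by 2, 4, 8 or 12 rows depending on
 * which multiple i_height is; copy_16multx8mult_msa further below covers
 * 16-pixel-wide columns eight rows at a time. */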
static void copy_width4_msa( uint8_t *p_src, int32_t i_src_stride,
1574
uint8_t *p_dst, int32_t i_dst_stride,
1575
int32_t i_height )
1576
{
1577
int32_t i_cnt;
1578
uint32_t u_src0, u_src1;
1579
1580
for( i_cnt = ( i_height / 2 ); i_cnt--; )
1581
{
1582
u_src0 = LW( p_src );
1583
p_src += i_src_stride;
1584
u_src1 = LW( p_src );
1585
p_src += i_src_stride;
1586
1587
SW( u_src0, p_dst );
1588
p_dst += i_dst_stride;
1589
SW( u_src1, p_dst );
1590
p_dst += i_dst_stride;
1591
}
1592
}
1593
1594
static void copy_width8_msa( uint8_t *p_src, int32_t i_src_stride,
1595
uint8_t *p_dst, int32_t i_dst_stride,
1596
int32_t i_height )
1597
{
1598
int32_t i_cnt;
1599
uint64_t u_out0, u_out1, u_out2, u_out3, u_out4, u_out5, u_out6, u_out7;
1600
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1601
1602
if( 0 == i_height % 12 )
1603
{
1604
for( i_cnt = ( i_height / 12 ); i_cnt--; )
1605
{
1606
LD_UB8( p_src, i_src_stride,
1607
src0, src1, src2, src3, src4, src5, src6, src7 );
1608
p_src += ( 8 * i_src_stride );
1609
1610
u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
1611
u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );
1612
u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 );
1613
u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 );
1614
u_out4 = __msa_copy_u_d( ( v2i64 ) src4, 0 );
1615
u_out5 = __msa_copy_u_d( ( v2i64 ) src5, 0 );
1616
u_out6 = __msa_copy_u_d( ( v2i64 ) src6, 0 );
1617
u_out7 = __msa_copy_u_d( ( v2i64 ) src7, 0 );
1618
1619
SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
1620
p_dst += ( 4 * i_dst_stride );
1621
SD4( u_out4, u_out5, u_out6, u_out7, p_dst, i_dst_stride );
1622
p_dst += ( 4 * i_dst_stride );
1623
1624
LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
1625
p_src += ( 4 * i_src_stride );
1626
1627
u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
1628
u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );
1629
u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 );
1630
u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 );
1631
1632
SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
1633
p_dst += ( 4 * i_dst_stride );
1634
}
1635
}
1636
else if( 0 == i_height % 8 )
1637
{
1638
for( i_cnt = i_height >> 3; i_cnt--; )
1639
{
1640
LD_UB8( p_src, i_src_stride,
1641
src0, src1, src2, src3, src4, src5, src6, src7 );
1642
p_src += ( 8 * i_src_stride );
1643
1644
u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
1645
u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );
1646
u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 );
1647
u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 );
1648
u_out4 = __msa_copy_u_d( ( v2i64 ) src4, 0 );
1649
u_out5 = __msa_copy_u_d( ( v2i64 ) src5, 0 );
1650
u_out6 = __msa_copy_u_d( ( v2i64 ) src6, 0 );
1651
u_out7 = __msa_copy_u_d( ( v2i64 ) src7, 0 );
1652
1653
SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
1654
p_dst += ( 4 * i_dst_stride );
1655
SD4( u_out4, u_out5, u_out6, u_out7, p_dst, i_dst_stride );
1656
p_dst += ( 4 * i_dst_stride );
1657
}
1658
}
1659
else if( 0 == i_height % 4 )
1660
{
1661
for( i_cnt = ( i_height / 4 ); i_cnt--; )
1662
{
1663
LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
1664
p_src += ( 4 * i_src_stride );
1665
u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
1666
u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );
1667
u_out2 = __msa_copy_u_d( ( v2i64 ) src2, 0 );
1668
u_out3 = __msa_copy_u_d( ( v2i64 ) src3, 0 );
1669
1670
SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
1671
p_dst += ( 4 * i_dst_stride );
1672
}
1673
}
1674
else if( 0 == i_height % 2 )
1675
{
1676
for( i_cnt = ( i_height / 2 ); i_cnt--; )
1677
{
1678
LD_UB2( p_src, i_src_stride, src0, src1 );
1679
p_src += ( 2 * i_src_stride );
1680
u_out0 = __msa_copy_u_d( ( v2i64 ) src0, 0 );
1681
u_out1 = __msa_copy_u_d( ( v2i64 ) src1, 0 );
1682
1683
SD( u_out0, p_dst );
1684
p_dst += i_dst_stride;
1685
SD( u_out1, p_dst );
1686
p_dst += i_dst_stride;
1687
}
1688
}
1689
}
1690
1691
1692
static void copy_16multx8mult_msa( uint8_t *p_src, int32_t i_src_stride,
1693
uint8_t *p_dst, int32_t i_dst_stride,
1694
int32_t i_height, int32_t i_width )
1695
{
1696
int32_t i_cnt, i_loop_cnt;
1697
uint8_t *p_src_tmp, *p_dst_tmp;
1698
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1699
1700
for( i_cnt = ( i_width >> 4 ); i_cnt--; )
1701
{
1702
p_src_tmp = p_src;
1703
p_dst_tmp = p_dst;
1704
1705
for( i_loop_cnt = ( i_height >> 3 ); i_loop_cnt--; )
1706
{
1707
LD_UB8( p_src_tmp, i_src_stride,
1708
src0, src1, src2, src3, src4, src5, src6, src7 );
1709
p_src_tmp += ( 8 * i_src_stride );
1710
1711
ST_UB8( src0, src1, src2, src3, src4, src5, src6, src7,
1712
p_dst_tmp, i_dst_stride );
1713
p_dst_tmp += ( 8 * i_dst_stride );
1714
}
1715
1716
p_src += 16;
1717
p_dst += 16;
1718
}
1719
}
1720
1721
static void copy_width16_msa( uint8_t *p_src, int32_t i_src_stride,
1722
uint8_t *p_dst, int32_t i_dst_stride,
1723
int32_t i_height )
1724
{
1725
int32_t i_cnt;
1726
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1727
1728
if( 0 == i_height % 12 )
1729
{
1730
for( i_cnt = ( i_height / 12 ); i_cnt--; )
1731
{
1732
LD_UB8( p_src, i_src_stride,
1733
src0, src1, src2, src3, src4, src5, src6, src7 );
1734
p_src += ( 8 * i_src_stride );
1735
ST_UB8( src0, src1, src2, src3, src4, src5, src6, src7,
1736
p_dst, i_dst_stride );
1737
p_dst += ( 8 * i_dst_stride );
1738
1739
LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
1740
p_src += ( 4 * i_src_stride );
1741
ST_UB4( src0, src1, src2, src3, p_dst, i_dst_stride );
1742
p_dst += ( 4 * i_dst_stride );
1743
}
1744
}
1745
else if( 0 == i_height % 8 )
1746
{
1747
copy_16multx8mult_msa( p_src, i_src_stride,
1748
p_dst, i_dst_stride, i_height, 16 );
1749
}
1750
else if( 0 == i_height % 4 )
1751
{
1752
for( i_cnt = ( i_height >> 2 ); i_cnt--; )
1753
{
1754
LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
1755
p_src += ( 4 * i_src_stride );
1756
1757
ST_UB4( src0, src1, src2, src3, p_dst, i_dst_stride );
1758
p_dst += ( 4 * i_dst_stride );
1759
}
1760
}
1761
}
1762
1763
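/* avg_src_width4/8/16_msa: per-byte rounded average of two source blocks,
 * ( a + b + 1 ) >> 1 via aver_u.b, written out at the requested width. Used
 * by the pixel_avg entry points when i_weight == 32 and by mc_luma/get_ref
 * for quarter-pel positions that blend two half-pel planes. */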
static void avg_src_width4_msa( uint8_t *p_src1, int32_t i_src1_stride,
1764
uint8_t *p_src2, int32_t i_src2_stride,
1765
uint8_t *p_dst, int32_t i_dst_stride,
1766
int32_t i_height )
1767
{
1768
int32_t i_cnt;
1769
uint32_t u_out0, u_out1;
1770
v16u8 src0, src1, src2, src3;
1771
v16u8 dst0, dst1;
1772
1773
for( i_cnt = ( i_height / 2 ); i_cnt--; )
1774
{
1775
LD_UB2( p_src1, i_src1_stride, src0, src1 );
1776
p_src1 += ( 2 * i_src1_stride );
1777
LD_UB2( p_src2, i_src2_stride, src2, src3 );
1778
p_src2 += ( 2 * i_src2_stride );
1779
1780
AVER_UB2_UB( src0, src2, src1, src3, dst0, dst1 );
1781
1782
u_out0 = __msa_copy_u_w( ( v4i32 ) dst0, 0 );
1783
u_out1 = __msa_copy_u_w( ( v4i32 ) dst1, 0 );
1784
SW( u_out0, p_dst );
1785
p_dst += i_dst_stride;
1786
SW( u_out1, p_dst );
1787
p_dst += i_dst_stride;
1788
}
1789
}
1790
1791
static void avg_src_width8_msa( uint8_t *p_src1, int32_t i_src1_stride,
1792
uint8_t *p_src2, int32_t i_src2_stride,
1793
uint8_t *p_dst, int32_t i_dst_stride,
1794
int32_t i_height )
1795
{
1796
int32_t i_cnt;
1797
uint64_t u_out0, u_out1, u_out2, u_out3;
1798
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1799
v16u8 dst0, dst1, dst2, dst3;
1800
1801
for( i_cnt = ( i_height / 4 ); i_cnt--; )
1802
{
1803
LD_UB4( p_src1, i_src1_stride, src0, src1, src2, src3 );
1804
p_src1 += ( 4 * i_src1_stride );
1805
LD_UB4( p_src2, i_src2_stride, src4, src5, src6, src7 );
1806
p_src2 += ( 4 * i_src2_stride );
1807
1808
AVER_UB4_UB( src0, src4, src1, src5, src2, src6, src3, src7,
1809
dst0, dst1, dst2, dst3 );
1810
1811
u_out0 = __msa_copy_u_d( ( v2i64 ) dst0, 0 );
1812
u_out1 = __msa_copy_u_d( ( v2i64 ) dst1, 0 );
1813
u_out2 = __msa_copy_u_d( ( v2i64 ) dst2, 0 );
1814
u_out3 = __msa_copy_u_d( ( v2i64 ) dst3, 0 );
1815
SD4( u_out0, u_out1, u_out2, u_out3, p_dst, i_dst_stride );
1816
p_dst += ( 4 * i_dst_stride );
1817
}
1818
}
1819
1820
static void avg_src_width16_msa( uint8_t *p_src1, int32_t i_src1_stride,
1821
uint8_t *p_src2, int32_t i_src2_stride,
1822
uint8_t *p_dst, int32_t i_dst_stride,
1823
int32_t i_height )
1824
{
1825
int32_t i_cnt;
1826
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1827
v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1828
1829
for( i_cnt = ( i_height / 8 ); i_cnt--; )
1830
{
1831
LD_UB8( p_src1, i_src1_stride,
1832
src0, src1, src2, src3, src4, src5, src6, src7 );
1833
p_src1 += ( 8 * i_src1_stride );
1834
LD_UB8( p_src2, i_src2_stride,
1835
dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7 );
1836
p_src2 += ( 8 * i_src2_stride );
1837
1838
AVER_UB4_UB( src0, dst0, src1, dst1, src2, dst2, src3, dst3,
1839
dst0, dst1, dst2, dst3 );
1840
AVER_UB4_UB( src4, dst4, src5, dst5, src6, dst6, src7, dst7,
1841
dst4, dst5, dst6, dst7 );
1842
1843
ST_UB8( dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7,
1844
p_dst, i_dst_stride );
1845
p_dst += ( 8 * i_dst_stride );
1846
}
1847
}
1848
1849
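/* Zero i_height rows of 16 bytes; x264_memzero_aligned_msa below relies on
 * this with i_stride == 16 to clear a contiguous buffer. */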
static void memset_zero_16width_msa( uint8_t *p_src, int32_t i_stride,
1850
int32_t i_height )
1851
{
1852
int8_t i_cnt;
1853
v16u8 zero = { 0 };
1854
1855
for( i_cnt = ( i_height / 2 ); i_cnt--; )
1856
{
1857
ST_UB( zero, p_src );
1858
p_src += i_stride;
1859
ST_UB( zero, p_src );
1860
p_src += i_stride;
1861
}
1862
}
1863
1864
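/* Interleave two planes into one: each output byte pair is ( p_src0[i],
 * p_src1[i] ), as used when packing separate U and V planes into NV12-style
 * chroma. 16- and 8-pixel vector paths per row, with scalar code for the
 * leftover columns and for the last i_height % 4 rows. */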
static void plane_copy_interleave_msa( uint8_t *p_src0, int32_t i_src0_stride,
1865
uint8_t *p_src1, int32_t i_src1_stride,
1866
uint8_t *p_dst, int32_t i_dst_stride,
1867
int32_t i_width, int32_t i_height )
1868
{
1869
int32_t i_loop_width, i_loop_height, i_w_mul8, i_h4w;
1870
v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1871
v16u8 vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3;
1872
v16u8 vec_ilv_l0, vec_ilv_l1, vec_ilv_l2, vec_ilv_l3;
1873
1874
i_w_mul8 = i_width - i_width % 8;
1875
i_h4w = i_height - i_height % 4;
1876
1877
for( i_loop_height = ( i_h4w >> 2 ); i_loop_height--; )
1878
{
1879
for( i_loop_width = ( i_width >> 4 ); i_loop_width--; )
1880
{
1881
LD_UB4( p_src0, i_src0_stride, src0, src1, src2, src3 );
1882
LD_UB4( p_src1, i_src1_stride, src4, src5, src6, src7 );
1883
ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
1884
vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3 );
1885
ILVL_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
1886
vec_ilv_l0, vec_ilv_l1, vec_ilv_l2, vec_ilv_l3 );
1887
ST_UB4( vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3,
1888
p_dst, i_dst_stride );
1889
ST_UB4( vec_ilv_l0, vec_ilv_l1, vec_ilv_l2, vec_ilv_l3,
1890
( p_dst + 16 ), i_dst_stride );
1891
p_src0 += 16;
1892
p_src1 += 16;
1893
p_dst += 32;
1894
}
1895
1896
for( i_loop_width = ( i_width % 16 ) >> 3; i_loop_width--; )
1897
{
1898
LD_UB4( p_src0, i_src0_stride, src0, src1, src2, src3 );
1899
LD_UB4( p_src1, i_src1_stride, src4, src5, src6, src7 );
1900
ILVR_B4_UB( src4, src0, src5, src1, src6, src2, src7, src3,
1901
vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3 );
1902
ST_UB4( vec_ilv_r0, vec_ilv_r1, vec_ilv_r2, vec_ilv_r3,
1903
p_dst, i_dst_stride );
1904
p_src0 += 8;
1905
p_src1 += 8;
1906
p_dst += 16;
1907
}
1908
1909
for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
1910
{
1911
p_dst[0] = p_src0[0];
1912
p_dst[1] = p_src1[0];
1913
p_dst[i_dst_stride] = p_src0[i_src0_stride];
1914
p_dst[i_dst_stride + 1] = p_src1[i_src1_stride];
1915
p_dst[2 * i_dst_stride] = p_src0[2 * i_src0_stride];
1916
p_dst[2 * i_dst_stride + 1] = p_src1[2 * i_src1_stride];
1917
p_dst[3 * i_dst_stride] = p_src0[3 * i_src0_stride];
1918
p_dst[3 * i_dst_stride + 1] = p_src1[3 * i_src1_stride];
1919
p_src0 += 1;
1920
p_src1 += 1;
1921
p_dst += 2;
1922
}
1923
1924
p_src0 += ( ( 4 * i_src0_stride ) - i_width );
1925
p_src1 += ( ( 4 * i_src1_stride ) - i_width );
1926
p_dst += ( ( 4 * i_dst_stride ) - ( i_width * 2 ) );
1927
}
1928
1929
for( i_loop_height = i_h4w; i_loop_height < i_height; i_loop_height++ )
1930
{
1931
for( i_loop_width = ( i_width >> 4 ); i_loop_width--; )
1932
{
1933
src0 = LD_UB( p_src0 );
1934
src4 = LD_UB( p_src1 );
1935
ILVRL_B2_UB( src4, src0, vec_ilv_r0, vec_ilv_l0 );
1936
ST_UB2( vec_ilv_r0, vec_ilv_l0, p_dst, 16 );
1937
p_src0 += 16;
1938
p_src1 += 16;
1939
p_dst += 32;
1940
}
1941
1942
for( i_loop_width = ( i_width % 16 ) >> 3; i_loop_width--; )
1943
{
1944
src0 = LD_UB( p_src0 );
1945
src4 = LD_UB( p_src1 );
1946
vec_ilv_r0 = ( v16u8 ) __msa_ilvr_b( ( v16i8 ) src4,
1947
( v16i8 ) src0 );
1948
ST_UB( vec_ilv_r0, p_dst );
1949
p_src0 += 8;
1950
p_src1 += 8;
1951
p_dst += 16;
1952
}
1953
1954
for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
1955
{
1956
p_dst[0] = p_src0[0];
1957
p_dst[1] = p_src1[0];
1958
p_src0 += 1;
1959
p_src1 += 1;
1960
p_dst += 2;
1961
}
1962
1963
p_src0 += ( i_src0_stride - i_width );
1964
p_src1 += ( i_src1_stride - i_width );
1965
p_dst += ( i_dst_stride - ( i_width * 2 ) );
1966
}
1967
}
1968
1969
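/* Inverse of plane_copy_interleave: even bytes of the interleaved source go
 * to p_dst0 and odd bytes to p_dst1 ( pckev.b / pckod.b ), eight rows at a
 * time, with scalar fallbacks for narrow tails and for the remaining rows. */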
static void plane_copy_deinterleave_msa( uint8_t *p_src, int32_t i_src_stride,
1970
uint8_t *p_dst0, int32_t dst0_stride,
1971
uint8_t *p_dst1, int32_t dst1_stride,
1972
int32_t i_width, int32_t i_height )
1973
{
1974
int32_t i_loop_width, i_loop_height, i_w_mul4, i_w_mul8, i_h4w;
1975
uint32_t u_res_w0, u_res_w1;
1976
v16u8 in0, in1, in2, in3, in4, in5, in6, in7;
1977
v16u8 vec_pckev0, vec_pckev1, vec_pckev2, vec_pckev3;
1978
v16u8 vec_pckod0, vec_pckod1, vec_pckod2, vec_pckod3;
1979
uint8_t *p_dst;
1980
1981
i_w_mul8 = i_width - i_width % 8;
1982
i_w_mul4 = i_width - i_width % 4;
1983
i_h4w = i_height - i_height % 8;
1984
1985
for( i_loop_height = ( i_h4w >> 3 ); i_loop_height--; )
1986
{
1987
for( i_loop_width = ( i_w_mul8 >> 3 ); i_loop_width--; )
1988
{
1989
LD_UB8( p_src, i_src_stride,
1990
in0, in1, in2, in3, in4, in5, in6, in7 );
1991
p_src += 16;
1992
PCKEV_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6,
1993
vec_pckev0, vec_pckev1, vec_pckev2, vec_pckev3 );
1994
PCKOD_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6,
1995
vec_pckod0, vec_pckod1, vec_pckod2, vec_pckod3 );
1996
ST8x4_UB( vec_pckev0, vec_pckev1, p_dst0, dst0_stride );
1997
p_dst = p_dst0 + 4 * dst0_stride;
1998
ST8x4_UB( vec_pckev2, vec_pckev3, p_dst, dst0_stride );
1999
ST8x4_UB( vec_pckod0, vec_pckod1, p_dst1, dst1_stride );
2000
p_dst = p_dst1 + 4 * dst1_stride;
2001
ST8x4_UB( vec_pckod2, vec_pckod3, p_dst, dst1_stride );
2002
p_dst0 += 8;
2003
p_dst1 += 8;
2004
}
2005
2006
for( i_loop_width = ( ( i_width % 8 ) >> 2 ); i_loop_width--; )
2007
{
2008
LD_UB8( p_src, i_src_stride,
2009
in0, in1, in2, in3, in4, in5, in6, in7 );
2010
p_src += 8;
2011
PCKEV_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6,
2012
vec_pckev0, vec_pckev1, vec_pckev2, vec_pckev3 );
2013
PCKOD_B4_UB( in1, in0, in3, in2, in5, in4, in7, in6,
2014
vec_pckod0, vec_pckod1, vec_pckod2, vec_pckod3 );
2015
ST4x4_UB( vec_pckev0, vec_pckev1, 0, 2, 0, 2, p_dst0, dst0_stride );
2016
p_dst = p_dst0 + 4 * dst0_stride;
2017
ST4x4_UB( vec_pckev2, vec_pckev3, 0, 2, 0, 2, p_dst, dst0_stride );
2018
ST4x4_UB( vec_pckod0, vec_pckod1, 0, 2, 0, 2, p_dst1, dst1_stride );
2019
p_dst = p_dst1 + 4 * dst1_stride;
2020
ST4x4_UB( vec_pckod2, vec_pckod3, 0, 2, 0, 2, p_dst, dst1_stride );
2021
p_dst0 += 4;
2022
p_dst1 += 4;
2023
}
2024
2025
for( i_loop_width = i_w_mul4; i_loop_width < i_width; i_loop_width++ )
2026
{
2027
p_dst0[0] = p_src[0];
2028
p_dst1[0] = p_src[1];
2029
p_dst0[dst0_stride] = p_src[i_src_stride];
2030
p_dst1[dst1_stride] = p_src[i_src_stride + 1];
2031
p_dst0[2 * dst0_stride] = p_src[2 * i_src_stride];
2032
p_dst1[2 * dst1_stride] = p_src[2 * i_src_stride + 1];
2033
p_dst0[3 * dst0_stride] = p_src[3 * i_src_stride];
2034
p_dst1[3 * dst1_stride] = p_src[3 * i_src_stride + 1];
2035
p_dst0[4 * dst0_stride] = p_src[4 * i_src_stride];
2036
p_dst1[4 * dst1_stride] = p_src[4 * i_src_stride + 1];
2037
p_dst0[5 * dst0_stride] = p_src[5 * i_src_stride];
2038
p_dst1[5 * dst1_stride] = p_src[5 * i_src_stride + 1];
2039
p_dst0[6 * dst0_stride] = p_src[6 * i_src_stride];
2040
p_dst1[6 * dst1_stride] = p_src[6 * i_src_stride + 1];
2041
p_dst0[7 * dst0_stride] = p_src[7 * i_src_stride];
2042
p_dst1[7 * dst1_stride] = p_src[7 * i_src_stride + 1];
2043
p_dst0 += 1;
2044
p_dst1 += 1;
2045
p_src += 2;
2046
}
2047
2048
p_src += ( ( 8 * i_src_stride ) - ( i_width << 1 ) );
2049
p_dst0 += ( ( 8 * dst0_stride ) - i_width );
2050
p_dst1 += ( ( 8 * dst1_stride ) - i_width );
2051
}
2052
2053
for( i_loop_height = i_h4w; i_loop_height < i_height; i_loop_height++ )
2054
{
2055
for( i_loop_width = ( i_w_mul8 >> 3 ); i_loop_width--; )
2056
{
2057
in0 = LD_UB( p_src );
2058
p_src += 16;
2059
vec_pckev0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in0,
2060
( v16i8 ) in0 );
2061
vec_pckod0 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) in0,
2062
( v16i8 ) in0 );
2063
ST8x1_UB( vec_pckev0, p_dst0 );
2064
ST8x1_UB( vec_pckod0, p_dst1 );
2065
p_dst0 += 8;
2066
p_dst1 += 8;
2067
}
2068
2069
for( i_loop_width = ( ( i_width % 8 ) >> 2 ); i_loop_width--; )
2070
{
2071
in0 = LD_UB( p_src );
2072
p_src += 8;
2073
vec_pckev0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) in0,
2074
( v16i8 ) in0 );
2075
vec_pckod0 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) in0,
2076
( v16i8 ) in0 );
2077
u_res_w0 = __msa_copy_u_w( ( v4i32 ) vec_pckev0, 0 );
2078
SW( u_res_w0, p_dst0 );
2079
u_res_w1 = __msa_copy_u_w( ( v4i32 ) vec_pckod0, 0 );
2080
SW( u_res_w1, p_dst1 );
2081
p_dst0 += 4;
2082
p_dst1 += 4;
2083
}
2084
2085
for( i_loop_width = i_w_mul4; i_loop_width < i_width; i_loop_width++ )
2086
{
2087
p_dst0[0] = p_src[0];
2088
p_dst1[0] = p_src[1];
2089
p_dst0 += 1;
2090
p_dst1 += 1;
2091
p_src += 2;
2092
}
2093
2094
p_src += ( ( i_src_stride ) - ( i_width << 1 ) );
2095
p_dst0 += ( ( dst0_stride ) - i_width );
2096
p_dst1 += ( ( dst1_stride ) - i_width );
2097
}
2098
}
2099
2100
2101
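/* Deinterleave packed 24-bit RGB: the vshf.b masks select every third byte
 * starting at offsets 0, 1 and 2, so each iteration turns eight 3-byte
 * pixels into 8 bytes for p_dst0, p_dst1 and p_dst2 respectively. */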
static void plane_copy_deinterleave_rgb_msa( uint8_t *p_src,
2102
int32_t i_src_stride,
2103
uint8_t *p_dst0,
2104
int32_t i_dst0_stride,
2105
uint8_t *p_dst1,
2106
int32_t i_dst1_stride,
2107
uint8_t *p_dst2,
2108
int32_t i_dst2_stride,
2109
int32_t i_width,
2110
int32_t i_height )
2111
{
2112
uint8_t *p_src_orig = p_src;
2113
uint8_t *p_dst0_orig = p_dst0;
2114
uint8_t *p_dst1_orig = p_dst1;
2115
uint8_t *p_dst2_orig = p_dst2;
2116
int32_t i_loop_width, i_loop_height, i_w_mul8, i_h_mul4;
2117
v16i8 in0, in1, in2, in3, in4, in5, in6, in7;
2118
v16i8 temp0, temp1, temp2, temp3;
2119
v16i8 mask0 = { 0, 3, 6, 9, 12, 15, 18, 21, 0, 0, 0, 0, 0, 0, 0, 0 };
2120
v16i8 mask1 = { 1, 4, 7, 10, 13, 16, 19, 22, 0, 0, 0, 0, 0, 0, 0, 0 };
2121
v16i8 mask2 = { 2, 5, 8, 11, 14, 17, 20, 23, 0, 0, 0, 0, 0, 0, 0, 0 };
2122
2123
i_w_mul8 = i_width - i_width % 8;
2124
i_h_mul4 = i_height - i_height % 4;
2125
2126
for( i_loop_height = ( i_height >> 2 ); i_loop_height--; )
2127
{
2128
p_src = p_src_orig;
2129
p_dst0 = p_dst0_orig;
2130
p_dst1 = p_dst1_orig;
2131
p_dst2 = p_dst2_orig;
2132
2133
for( i_loop_width = ( i_width >> 3 ); i_loop_width--; )
2134
{
2135
LD_SB4( p_src, i_src_stride, in0, in1, in2, in3 );
2136
LD_SB4( ( p_src + 16 ), i_src_stride, in4, in5, in6, in7 );
2137
2138
VSHF_B2_SB( in0, in4, in1, in5, mask0, mask0, temp0, temp1 );
2139
VSHF_B2_SB( in2, in6, in3, in7, mask0, mask0, temp2, temp3 );
2140
ST8x1_UB( temp0, p_dst0 );
2141
ST8x1_UB( temp1, p_dst0 + i_dst0_stride );
2142
ST8x1_UB( temp2, p_dst0 + 2 * i_dst0_stride );
2143
ST8x1_UB( temp3, p_dst0 + 3 * i_dst0_stride );
2144
2145
VSHF_B2_SB( in0, in4, in1, in5, mask1, mask1, temp0, temp1 );
2146
VSHF_B2_SB( in2, in6, in3, in7, mask1, mask1, temp2, temp3 );
2147
ST8x1_UB( temp0, p_dst1 );
2148
ST8x1_UB( temp1, p_dst1 + i_dst1_stride );
2149
ST8x1_UB( temp2, p_dst1 + 2 * i_dst1_stride );
2150
ST8x1_UB( temp3, p_dst1 + 3 * i_dst1_stride );
2151
2152
VSHF_B2_SB( in0, in4, in1, in5, mask2, mask2, temp0, temp1 );
2153
VSHF_B2_SB( in2, in6, in3, in7, mask2, mask2, temp2, temp3 );
2154
ST8x1_UB( temp0, p_dst2 );
2155
ST8x1_UB( temp1, p_dst2 + i_dst2_stride );
2156
ST8x1_UB( temp2, p_dst2 + 2 * i_dst2_stride );
2157
ST8x1_UB( temp3, p_dst2 + 3 * i_dst2_stride );
2158
2159
p_src += 8 * 3;
2160
p_dst0 += 8;
2161
p_dst1 += 8;
2162
p_dst2 += 8;
2163
}
2164
2165
for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
2166
{
2167
p_dst0_orig[i_loop_width] = p_src_orig[0 + 3 * i_loop_width];
2168
p_dst1_orig[i_loop_width] = p_src_orig[1 + 3 * i_loop_width];
2169
p_dst2_orig[i_loop_width] = p_src_orig[2 + 3 * i_loop_width];
2170
2171
p_dst0_orig[i_loop_width + i_dst0_stride] =
2172
p_src_orig[0 + i_src_stride + 3 * i_loop_width];
2173
p_dst1_orig[i_loop_width + i_dst1_stride] =
2174
p_src_orig[1 + i_src_stride + 3 * i_loop_width];
2175
p_dst2_orig[i_loop_width + i_dst2_stride] =
2176
p_src_orig[2 + i_src_stride + 3 * i_loop_width];
2177
2178
p_dst0_orig[i_loop_width + 2 * i_dst0_stride] =
2179
p_src_orig[0 + 2 * i_src_stride + 3 * i_loop_width];
2180
p_dst1_orig[i_loop_width + 2 * i_dst1_stride] =
2181
p_src_orig[1 + 2 * i_src_stride + 3 * i_loop_width];
2182
p_dst2_orig[i_loop_width + 2 * i_dst2_stride] =
2183
p_src_orig[2 + 2 * i_src_stride + 3 * i_loop_width];
2184
2185
p_dst0_orig[i_loop_width + 3 * i_dst0_stride] =
2186
p_src_orig[0 + 3 * i_src_stride + 3 * i_loop_width];
2187
p_dst1_orig[i_loop_width + 3 * i_dst1_stride] =
2188
p_src_orig[1 + 3 * i_src_stride + 3 * i_loop_width];
2189
p_dst2_orig[i_loop_width + 3 * i_dst2_stride] =
2190
p_src_orig[2 + 3 * i_src_stride + 3 * i_loop_width];
2191
}
2192
2193
p_src_orig += ( 4 * i_src_stride );
2194
p_dst0_orig += ( 4 * i_dst0_stride );
2195
p_dst1_orig += ( 4 * i_dst1_stride );
2196
p_dst2_orig += ( 4 * i_dst2_stride );
2197
}
2198
2199
for( i_loop_height = i_h_mul4; i_loop_height < i_height; i_loop_height++ )
2200
{
2201
p_src = p_src_orig;
2202
p_dst0 = p_dst0_orig;
2203
p_dst1 = p_dst1_orig;
2204
p_dst2 = p_dst2_orig;
2205
2206
for( i_loop_width = ( i_width >> 3 ); i_loop_width--; )
2207
{
2208
in0 = LD_SB( p_src );
2209
in4 = LD_SB( p_src + 16 );
2210
temp0 = __msa_vshf_b( mask0, in4, in0 );
2211
ST8x1_UB( temp0, p_dst0 );
2212
temp0 = __msa_vshf_b( mask1, in4, in0 );
2213
ST8x1_UB( temp0, p_dst1 );
2214
temp0 = __msa_vshf_b( mask2, in4, in0 );
2215
ST8x1_UB( temp0, p_dst2 );
2216
2217
p_src += 8 * 3;
2218
p_dst0 += 8;
2219
p_dst1 += 8;
2220
p_dst2 += 8;
2221
}
2222
2223
for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
2224
{
2225
p_dst0_orig[i_loop_width] = p_src_orig[3 * i_loop_width];
2226
p_dst1_orig[i_loop_width] = p_src_orig[3 * i_loop_width + 1];
2227
p_dst2_orig[i_loop_width] = p_src_orig[3 * i_loop_width + 2];
2228
}
2229
2230
p_src_orig += ( i_src_stride );
2231
p_dst0_orig += ( i_dst0_stride );
2232
p_dst1_orig += ( i_dst1_stride );
2233
p_dst2_orig += ( i_dst2_stride );
2234
}
2235
}
2236
2237
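/* Deinterleave packed 32-bit pixels: halfword pack-even/odd followed by byte
 * pack-even/odd routes byte 0 of each pixel to p_dst0, byte 1 to p_dst1 and
 * byte 2 to p_dst2; the fourth byte ( alpha ) is discarded. */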
static void plane_copy_deinterleave_rgba_msa( uint8_t *p_src,
2238
int32_t i_src_stride,
2239
uint8_t *p_dst0,
2240
int32_t i_dst0_stride,
2241
uint8_t *p_dst1,
2242
int32_t i_dst1_stride,
2243
uint8_t *p_dst2,
2244
int32_t i_dst2_stride,
2245
int32_t i_width,
2246
int32_t i_height )
2247
{
2248
uint8_t *p_src_orig = p_src;
2249
uint8_t *p_dst0_orig = p_dst0;
2250
uint8_t *p_dst1_orig = p_dst1;
2251
uint8_t *p_dst2_orig = p_dst2;
2252
int32_t i_loop_width, i_loop_height, i_w_mul8, i_h_mul4;
2253
v16i8 in0, in1, in2, in3, in4, in5, in6, in7;
2254
v16i8 in8, in9, in10, in11, in12, in13, in14, in15;
2255
v8i16 temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
2256
v8i16 temp8, temp9, temp10, temp11, temp12, temp13, temp14, temp15;
2257
2258
i_w_mul8 = i_width - i_width % 8;
2259
i_h_mul4 = i_height - i_height % 4;
2260
2261
for( i_loop_height = ( i_height >> 2 ); i_loop_height--; )
2262
{
2263
p_src = p_src_orig;
2264
p_dst0 = p_dst0_orig;
2265
p_dst1 = p_dst1_orig;
2266
p_dst2 = p_dst2_orig;
2267
2268
for( i_loop_width = ( i_width >> 4 ); i_loop_width--; )
2269
{
2270
LD_SB4( p_src, i_src_stride, in0, in1, in2, in3 );
2271
LD_SB4( ( p_src + 16 ), i_src_stride, in4, in5, in6, in7 );
2272
LD_SB4( ( p_src + 32 ), i_src_stride, in8, in9, in10, in11 );
2273
LD_SB4( ( p_src + 48 ), i_src_stride, in12, in13, in14, in15 );
2274
2275
PCKEV_H2_SH( in4, in0, in12, in8, temp0, temp1 );
2276
temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 );
2277
temp3 = __msa_pckod_h( ( v8i16 ) in12, ( v8i16 ) in8 );
2278
PCKEV_H2_SH( in5, in1, in13, in9, temp4, temp5 );
2279
temp6 = __msa_pckod_h( ( v8i16 ) in5, ( v8i16 ) in1 );
2280
temp7 = __msa_pckod_h( ( v8i16 ) in13, ( v8i16 ) in9 );
2281
PCKEV_H2_SH( in6, in2, in14, in10, temp8, temp9 );
2282
temp10 = __msa_pckod_h( ( v8i16 ) in6, ( v8i16 ) in2 );
2283
temp11 = __msa_pckod_h( ( v8i16 ) in14, ( v8i16 ) in10 );
2284
PCKEV_H2_SH( in7, in3, in15, in11, temp12, temp13 );
2285
temp14 = __msa_pckod_h( ( v8i16 ) in7, ( v8i16 ) in3 );
2286
temp15 = __msa_pckod_h( ( v8i16 ) in15, ( v8i16 ) in11 );
2287
PCKEV_B2_SB( temp1, temp0, temp3, temp2, in0, in1 );
2288
in2 = __msa_pckod_b( ( v16i8 ) temp1, ( v16i8 ) temp0 );
2289
PCKEV_B2_SB( temp5, temp4, temp7, temp6, in4, in5 );
2290
in6 = __msa_pckod_b( ( v16i8 ) temp5, ( v16i8 ) temp4 );
2291
PCKEV_B2_SB( temp9, temp8, temp11, temp10, in8, in9 );
2292
in10 = __msa_pckod_b( ( v16i8 ) temp9, ( v16i8 ) temp8 );
2293
PCKEV_B2_SB( temp13, temp12, temp15, temp14, in12, in13 );
2294
in14 = __msa_pckod_b( ( v16i8 ) temp13, ( v16i8 ) temp12 );
2295
ST_SB4( in0, in4, in8, in12, p_dst0, i_dst0_stride );
2296
ST_SB4( in1, in5, in9, in13, p_dst2, i_dst2_stride );
2297
ST_SB4( in2, in6, in10, in14, p_dst1, i_dst1_stride );
2298
2299
p_src += 16 * 4;
2300
p_dst0 += 16;
2301
p_dst1 += 16;
2302
p_dst2 += 16;
2303
}
2304
2305
for( i_loop_width = ( ( i_width % 16 ) >> 3 ); i_loop_width--; )
2306
{
2307
LD_SB4( p_src, i_src_stride, in0, in1, in2, in3 );
2308
LD_SB4( p_src + 16, i_src_stride, in4, in5, in6, in7 );
2309
2310
PCKEV_H2_SH( in4, in0, in5, in1, temp0, temp4 );
2311
temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 );
2312
temp6 = __msa_pckod_h( ( v8i16 ) in5, ( v8i16 ) in1 );
2313
2314
PCKEV_H2_SH( in6, in2, in7, in3, temp8, temp12 );
2315
temp10 = __msa_pckod_h( ( v8i16 ) in6, ( v8i16 ) in2 );
2316
temp14 = __msa_pckod_h( ( v8i16 ) in7, ( v8i16 ) in3 );
2317
2318
PCKEV_B2_SB( temp0, temp0, temp2, temp2, in0, in1 );
2319
in2 = __msa_pckod_b( ( v16i8 ) temp0, ( v16i8 ) temp0 );
2320
PCKEV_B2_SB( temp4, temp4, temp6, temp6, in4, in5 );
2321
in6 = __msa_pckod_b( ( v16i8 ) temp4, ( v16i8 ) temp4 );
2322
PCKEV_B2_SB( temp8, temp8, temp10, temp10, in8, in9 );
2323
in10 = __msa_pckod_b( ( v16i8 ) temp8, ( v16i8 ) temp8 );
2324
PCKEV_B2_SB( temp12, temp12, temp14, temp14, in12, in13 );
2325
in14 = __msa_pckod_b( ( v16i8 ) temp12, ( v16i8 ) temp12 );
2326
2327
ST8x1_UB( in0, p_dst0 );
2328
ST8x1_UB( in4, p_dst0 + i_dst0_stride );
2329
ST8x1_UB( in8, p_dst0 + 2 * i_dst0_stride );
2330
ST8x1_UB( in12, p_dst0 + 3 * i_dst0_stride );
2331
2332
ST8x1_UB( in1, p_dst2 );
2333
ST8x1_UB( in5, p_dst2 + i_dst2_stride );
2334
ST8x1_UB( in9, p_dst2 + 2 * i_dst2_stride );
2335
ST8x1_UB( in13, p_dst2 + 3 * i_dst2_stride );
2336
2337
ST8x1_UB( in2, p_dst1 );
2338
ST8x1_UB( in6, p_dst1 + i_dst1_stride );
2339
ST8x1_UB( in10, p_dst1 + 2 * i_dst1_stride );
2340
ST8x1_UB( in14, p_dst1 + 3 * i_dst1_stride );
2341
2342
p_src += 8 * 4;
2343
p_dst0 += 8;
2344
p_dst1 += 8;
2345
p_dst2 += 8;
2346
}
2347
2348
for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
2349
{
2350
p_dst0_orig[i_loop_width] = p_src_orig[4 * i_loop_width];
2351
p_dst1_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 1];
2352
p_dst2_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 2];
2353
2354
p_dst0_orig[i_dst0_stride + i_loop_width] =
2355
p_src_orig[i_src_stride + 4 * i_loop_width];
2356
p_dst1_orig[i_dst1_stride + i_loop_width] =
2357
p_src_orig[i_src_stride + 4 * i_loop_width + 1];
2358
p_dst2_orig[i_dst2_stride + i_loop_width] =
2359
p_src_orig[i_src_stride + 4 * i_loop_width + 2];
2360
2361
p_dst0_orig[2 * i_dst0_stride + i_loop_width] =
2362
p_src_orig[2 * i_src_stride + 4 * i_loop_width];
2363
p_dst1_orig[2 * i_dst1_stride + i_loop_width] =
2364
p_src_orig[2 * i_src_stride + 4 * i_loop_width + 1];
2365
p_dst2_orig[2 * i_dst2_stride + i_loop_width] =
2366
p_src_orig[2 * i_src_stride + 4 * i_loop_width + 2];
2367
2368
p_dst0_orig[3 * i_dst0_stride + i_loop_width] =
2369
p_src_orig[3 * i_src_stride + 4 * i_loop_width];
2370
p_dst1_orig[3 * i_dst1_stride + i_loop_width] =
2371
p_src_orig[3 * i_src_stride + 4 * i_loop_width + 1];
2372
p_dst2_orig[3 * i_dst2_stride + i_loop_width] =
2373
p_src_orig[3 * i_src_stride + 4 * i_loop_width + 2];
2374
}
2375
2376
p_src_orig += ( 4 * i_src_stride );
2377
p_dst0_orig += ( 4 * i_dst0_stride );
2378
p_dst1_orig += ( 4 * i_dst1_stride );
2379
p_dst2_orig += ( 4 * i_dst2_stride );
2380
}
2381
2382
for( i_loop_height = i_h_mul4; i_loop_height < i_height; i_loop_height++ )
2383
{
2384
p_src = p_src_orig;
2385
p_dst0 = p_dst0_orig;
2386
p_dst1 = p_dst1_orig;
2387
p_dst2 = p_dst2_orig;
2388
2389
for( i_loop_width = ( i_width >> 4 ); i_loop_width--; )
2390
{
2391
LD_SB4( p_src, 16, in0, in4, in8, in12 );
2392
2393
PCKEV_H2_SH( in4, in0, in12, in8, temp0, temp1 );
2394
temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 );
2395
temp3 = __msa_pckod_h( ( v8i16 ) in12, ( v8i16 ) in8 );
2396
PCKEV_B2_SB( temp1, temp0, temp3, temp2, in0, in1 );
2397
in2 = __msa_pckod_b( ( v16i8 ) temp1, ( v16i8 ) temp0 );
2398
ST_SB( in0, p_dst0 );
ST_SB( in1, p_dst2 );
ST_SB( in2, p_dst1 );
2404
2405
p_src += 16 * 4;
2406
p_dst0 += 16;
2407
p_dst1 += 16;
2408
p_dst2 += 16;
2409
}
2410
2411
for( i_loop_width = ( ( i_width % 16 ) >> 3 ); i_loop_width--; )
2412
{
2413
in0 = LD_SB( p_src );
2414
in4 = LD_SB( p_src + 16 );
2415
2416
temp0 = __msa_pckev_h( ( v8i16 ) in4, ( v8i16 ) in0 );
2417
temp2 = __msa_pckod_h( ( v8i16 ) in4, ( v8i16 ) in0 );
2418
PCKEV_B2_SB( temp0, temp0, temp2, temp2, in0, in1 );
2419
in2 = __msa_pckod_b( ( v16i8 ) temp0, ( v16i8 ) temp0 );
2420
ST8x1_UB( in0, p_dst0 );
2421
ST8x1_UB( in1, p_dst2 );
2422
ST8x1_UB( in2, p_dst1 );
2423
2424
p_src += 8 * 4;
2425
p_dst0 += 8;
2426
p_dst1 += 8;
2427
p_dst2 += 8;
2428
}
2429
2430
for( i_loop_width = i_w_mul8; i_loop_width < i_width; i_loop_width++ )
2431
{
2432
p_dst0_orig[i_loop_width] = p_src_orig[4 * i_loop_width];
2433
p_dst1_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 1];
2434
p_dst2_orig[i_loop_width] = p_src_orig[4 * i_loop_width + 2];
2435
}
2436
2437
p_src_orig += ( i_src_stride );
2438
p_dst0_orig += ( i_dst0_stride );
2439
p_dst1_orig += ( i_dst1_stride );
2440
p_dst2_orig += ( i_dst2_stride );
2441
}
2442
}
2443
2444
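/* Interleave two chroma blocks ( U rows and V rows ) into one interleaved
 * destination, four rows per iteration plus a per-row tail for
 * i_height % 4. */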
static void store_interleave_chroma_msa( uint8_t *p_src0, int32_t i_src0_stride,
2445
uint8_t *p_src1, int32_t i_src1_stride,
2446
uint8_t *p_dst, int32_t i_dst_stride,
2447
int32_t i_height )
2448
{
2449
int32_t i_loop_height, i_h4w;
2450
v16u8 in0, in1, in2, in3, in4, in5, in6, in7;
2451
v16u8 ilvr_vec0, ilvr_vec1, ilvr_vec2, ilvr_vec3;
2452
2453
i_h4w = i_height % 4;
2454
for( i_loop_height = ( i_height >> 2 ); i_loop_height--; )
2455
{
2456
LD_UB4( p_src0, i_src0_stride, in0, in1, in2, in3 );
2457
p_src0 += ( 4 * i_src0_stride );
2458
LD_UB4( p_src1, i_src1_stride, in4, in5, in6, in7 );
2459
p_src1 += ( 4 * i_src1_stride );
2460
ILVR_B4_UB( in4, in0, in5, in1, in6, in2, in7, in3,
2461
ilvr_vec0, ilvr_vec1, ilvr_vec2, ilvr_vec3 );
2462
ST_UB4( ilvr_vec0, ilvr_vec1, ilvr_vec2, ilvr_vec3,
2463
p_dst, i_dst_stride );
2464
p_dst += ( 4 * i_dst_stride );
2465
}
2466
2467
for( i_loop_height = i_h4w; i_loop_height--; )
2468
{
2469
in0 = LD_UB( p_src0 );
2470
p_src0 += ( i_src0_stride );
2471
in1 = LD_UB( p_src1 );
2472
p_src1 += ( i_src1_stride );
2473
ilvr_vec0 = ( v16u8 ) __msa_ilvr_b( ( v16i8 ) in1, ( v16i8 ) in0 );
2474
ST_UB( ilvr_vec0, p_dst );
2475
p_dst += ( i_dst_stride );
2476
}
2477
}
2478
2479
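/* Build the four half-resolution lookahead planes: every output pixel is a
 * rounded average of a 2x2 source neighbourhood, and p_dst0..p_dst3 appear
 * to hold the four half-pel phases ( integer, horizontal, vertical, centre )
 * of the downscaled frame, matching the generic frame_init_lowres_core. */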
static void frame_init_lowres_core_msa( uint8_t *p_src, int32_t i_src_stride,
2480
uint8_t *p_dst0, int32_t dst0_stride,
2481
uint8_t *p_dst1, int32_t dst1_stride,
2482
uint8_t *p_dst2, int32_t dst2_stride,
2483
uint8_t *p_dst3, int32_t dst3_stride,
2484
int32_t i_width, int32_t i_height )
2485
{
2486
int32_t i_loop_width, i_loop_height, i_w16_mul;
2487
v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2488
v16u8 sld1_vec0, sld1_vec1, sld1_vec2, sld1_vec3, sld1_vec4, sld1_vec5;
2489
v16u8 pckev_vec0, pckev_vec1, pckev_vec2;
2490
v16u8 pckod_vec0, pckod_vec1, pckod_vec2;
2491
v16u8 tmp0, tmp1, tmp2, tmp3;
2492
v16u8 res0, res1;
2493
2494
i_w16_mul = i_width - i_width % 16;
2495
for( i_loop_height = i_height; i_loop_height--; )
2496
{
2497
LD_UB3( p_src, i_src_stride, src0, src1, src2 );
2498
p_src += 16;
2499
for( i_loop_width = 0; i_loop_width < ( i_w16_mul >> 4 ); i_loop_width++ )
2500
{
2501
LD_UB3( p_src, i_src_stride, src3, src4, src5 );
2502
p_src += 16;
2503
LD_UB3( p_src, i_src_stride, src6, src7, src8 );
2504
p_src += 16;
2505
PCKEV_B2_UB( src3, src0, src4, src1, pckev_vec0, pckev_vec1 );
2506
PCKOD_B2_UB( src3, src0, src4, src1, pckod_vec0, pckod_vec1 );
2507
pckev_vec2 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) src5,
2508
( v16i8 ) src2 );
2509
pckod_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) src5,
2510
( v16i8 ) src2 );
2511
AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0,
2512
pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1,
2513
tmp0, tmp1, tmp2, tmp3 );
2514
AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 );
2515
ST_UB( res0, p_dst0 );
2516
ST_UB( res1, p_dst2 );
2517
2518
SLDI_B2_UB( src3, src4, src0, src1, sld1_vec0, sld1_vec1, 1 );
2519
SLDI_B2_UB( src5, src6, src2, src3, sld1_vec2, sld1_vec3, 1 );
2520
SLDI_B2_UB( src7, src8, src4, src5, sld1_vec4, sld1_vec5, 1 );
2521
PCKOD_B2_UB( sld1_vec3, sld1_vec0, sld1_vec4, sld1_vec1,
2522
pckev_vec0, pckev_vec1 )
2523
pckev_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) sld1_vec5,
2524
( v16i8 ) sld1_vec2 );
2525
AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0,
2526
pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1,
2527
tmp0, tmp1, tmp2, tmp3 );
2528
AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 );
2529
ST_UB( res0, p_dst1 );
2530
ST_UB( res1, p_dst3 );
2531
2532
src0 = src6;
2533
src1 = src7;
2534
src2 = src8;
2535
p_dst0 += 16;
2536
p_dst1 += 16;
2537
p_dst2 += 16;
2538
p_dst3 += 16;
2539
}
2540
2541
for( i_loop_width = i_w16_mul; i_loop_width < i_width;
2542
i_loop_width += 8 )
2543
{
2544
LD_UB3( p_src, i_src_stride, src3, src4, src5 );
2545
p_src += 16;
2546
PCKEV_B2_UB( src3, src0, src4, src1, pckev_vec0, pckev_vec1 );
2547
PCKOD_B2_UB( src3, src0, src4, src1, pckod_vec0, pckod_vec1 );
2548
pckev_vec2 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) src5,
2549
( v16i8 ) src2 );
2550
pckod_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) src5,
2551
( v16i8 ) src2 );
2552
AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0,
2553
pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1,
2554
tmp0, tmp1, tmp2, tmp3 );
2555
AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 );
2556
ST8x1_UB( res0, p_dst0 );
2557
ST8x1_UB( res1, p_dst2 );
2558
2559
SLDI_B2_UB( src3, src4, src0, src1, sld1_vec0, sld1_vec1, 1 );
2560
SLDI_B2_UB( src5, src3, src2, src3, sld1_vec2, sld1_vec3, 1 );
2561
SLDI_B2_UB( src4, src5, src4, src5, sld1_vec4, sld1_vec5, 1 );
2562
PCKOD_B2_UB( sld1_vec3, sld1_vec0, sld1_vec4, sld1_vec1,
2563
pckev_vec0, pckev_vec1 )
2564
pckev_vec2 = ( v16u8 ) __msa_pckod_b( ( v16i8 ) sld1_vec5,
2565
( v16i8 ) sld1_vec2 );
2566
AVER_UB4_UB( pckev_vec1, pckev_vec0, pckod_vec1, pckod_vec0,
2567
pckev_vec2, pckev_vec1, pckod_vec2, pckod_vec1,
2568
tmp0, tmp1, tmp2, tmp3 );
2569
AVER_UB2_UB( tmp1, tmp0, tmp3, tmp2, res0, res1 );
2570
ST8x1_UB( res0, p_dst1 );
2571
ST8x1_UB( res1, p_dst3 );
2572
p_dst0 += 8;
2573
p_dst1 += 8;
2574
p_dst2 += 8;
2575
p_dst3 += 8;
2576
}
2577
2578
p_src += ( i_src_stride * 2 - ( ( i_width * 2 ) + 16 ) );
2579
p_dst0 += ( dst0_stride - i_width );
2580
p_dst1 += ( dst1_stride - i_width );
2581
p_dst2 += ( dst2_stride - i_width );
2582
p_dst3 += ( dst3_stride - i_width );
2583
}
2584
}
2585
2586
void x264_mc_copy_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride,
2587
uint8_t *p_src, intptr_t i_src_stride,
2588
int32_t i_height )
2589
{
2590
copy_width16_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height );
2591
}
2592
2593
void x264_mc_copy_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src,
2594
intptr_t i_src_stride, int32_t i_height )
2595
{
2596
copy_width8_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height );
2597
}
2598
2599
void x264_mc_copy_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride, uint8_t *p_src,
2600
intptr_t i_src_stride, int32_t i_height )
2601
{
2602
copy_width4_msa( p_src, i_src_stride, p_dst, i_dst_stride, i_height );
2603
}
2604
2605
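/* pixel_avg entry points. i_weight == 32 is the common unweighted case and
 * maps to a plain rounded average; weights inside [0,63] use bi-weighted
 * prediction with the pair ( i_weight, 64 - i_weight ); out-of-range weights
 * are routed to the separate "_nw" variants defined earlier in this file,
 * which presumably handle the unrestricted weight range. */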
void x264_pixel_avg_16x16_msa( uint8_t *p_pix1, intptr_t pix1_stride,
2606
uint8_t *p_pix2, intptr_t pix2_stride,
2607
uint8_t *p_pix3, intptr_t pix3_stride,
2608
int32_t i_weight )
2609
{
2610
if( 32 == i_weight )
2611
{
2612
avg_src_width16_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
2613
p_pix1, pix1_stride, 16 );
2614
}
2615
else if( i_weight < 0 || i_weight > 63 )
2616
{
2617
avc_biwgt_opscale_16width_nw_msa( p_pix2, pix2_stride,
2618
p_pix3, pix3_stride,
2619
p_pix1, pix1_stride,
2620
16, 5, i_weight,
2621
( 64 - i_weight ), 0 );
2622
}
2623
else
2624
{
2625
avc_biwgt_opscale_16width_msa( p_pix2, pix2_stride,
2626
p_pix3, pix3_stride,
2627
p_pix1, pix1_stride,
2628
16, 5, i_weight,
2629
( 64 - i_weight ), 0 );
2630
}
2631
}
2632
2633
void x264_pixel_avg_16x8_msa( uint8_t *p_pix1, intptr_t pix1_stride,
2634
uint8_t *p_pix2, intptr_t pix2_stride,
2635
uint8_t *p_pix3, intptr_t pix3_stride,
2636
int32_t i_weight )
2637
{
2638
if( 32 == i_weight )
2639
{
2640
avg_src_width16_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
2641
p_pix1, pix1_stride, 8 );
2642
}
2643
else if( i_weight < 0 || i_weight > 63 )
2644
{
2645
avc_biwgt_opscale_16width_nw_msa( p_pix2, pix2_stride,
2646
p_pix3, pix3_stride,
2647
p_pix1, pix1_stride,
2648
8, 5, i_weight,
2649
( 64 - i_weight ), 0 );
2650
}
2651
else
2652
{
2653
avc_biwgt_opscale_16width_msa( p_pix2, pix2_stride,
2654
p_pix3, pix3_stride,
2655
p_pix1, pix1_stride,
2656
8, 5, i_weight,
2657
( 64 - i_weight ), 0 );
2658
}
2659
}
2660
2661
void x264_pixel_avg_8x16_msa( uint8_t *p_pix1, intptr_t pix1_stride,
2662
uint8_t *p_pix2, intptr_t pix2_stride,
2663
uint8_t *p_pix3, intptr_t pix3_stride,
2664
int32_t i_weight )
2665
{
2666
if( 32 == i_weight )
2667
{
2668
avg_src_width8_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
2669
p_pix1, pix1_stride, 16 );
2670
}
2671
else if( i_weight < 0 || i_weight > 63 )
2672
{
2673
avc_biwgt_opscale_8width_nw_msa( p_pix2, pix2_stride,
2674
p_pix3, pix3_stride,
2675
p_pix1, pix1_stride, 16, 5, i_weight,
2676
( 64 - i_weight ), 0 );
2677
}
2678
else
2679
{
2680
avc_biwgt_opscale_8width_msa( p_pix2, pix2_stride,
2681
p_pix3, pix3_stride,
2682
p_pix1, pix1_stride, 16, 5, i_weight,
2683
( 64 - i_weight ), 0 );
2684
}
2685
}
2686
2687
void x264_pixel_avg_8x8_msa( uint8_t *p_pix1, intptr_t pix1_stride,
2688
uint8_t *p_pix2, intptr_t pix2_stride,
2689
uint8_t *p_pix3, intptr_t pix3_stride,
2690
int32_t i_weight )
2691
{
2692
if( 32 == i_weight )
2693
{
2694
avg_src_width8_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
2695
p_pix1, pix1_stride, 8 );
2696
}
2697
else if( i_weight < 0 || i_weight > 63 )
2698
{
2699
avc_biwgt_opscale_8width_nw_msa( p_pix2, pix2_stride,
2700
p_pix3, pix3_stride,
2701
p_pix1, pix1_stride, 8, 5, i_weight,
2702
( 64 - i_weight ), 0 );
2703
}
2704
else
2705
{
2706
avc_biwgt_opscale_8width_msa( p_pix2, pix2_stride,
2707
p_pix3, pix3_stride,
2708
p_pix1, pix1_stride, 8, 5, i_weight,
2709
( 64 - i_weight ), 0 );
2710
}
2711
}
2712
2713
void x264_pixel_avg_8x4_msa( uint8_t *p_pix1, intptr_t pix1_stride,
2714
uint8_t *p_pix2, intptr_t pix2_stride,
2715
uint8_t *p_pix3, intptr_t pix3_stride,
2716
int32_t i_weight )
2717
{
2718
if( 32 == i_weight )
2719
{
2720
avg_src_width8_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
2721
p_pix1, pix1_stride, 4 );
2722
}
2723
else if( i_weight < 0 || i_weight > 63 )
2724
{
2725
avc_biwgt_opscale_8width_nw_msa( p_pix2, pix2_stride,
2726
p_pix3, pix3_stride,
2727
p_pix1, pix1_stride, 4, 5, i_weight,
2728
( 64 - i_weight ), 0 );
2729
}
2730
else
2731
{
2732
avc_biwgt_opscale_8width_msa( p_pix2, pix2_stride,
2733
p_pix3, pix3_stride,
2734
p_pix1, pix1_stride, 4, 5, i_weight,
2735
( 64 - i_weight ), 0 );
2736
}
2737
}
2738
2739
void x264_pixel_avg_4x16_msa( uint8_t *p_pix1, intptr_t pix1_stride,
2740
uint8_t *p_pix2, intptr_t pix2_stride,
2741
uint8_t *p_pix3, intptr_t pix3_stride,
2742
int32_t i_weight )
2743
{
2744
if( 32 == i_weight )
2745
{
2746
avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
2747
p_pix1, pix1_stride, 16 );
2748
}
2749
else if( i_weight < 0 || i_weight > 63 )
2750
{
2751
avc_biwgt_opscale_4width_nw_msa( p_pix2, pix2_stride,
2752
p_pix3, pix3_stride,
2753
p_pix1, pix1_stride, 16, 5, i_weight,
2754
( 64 - i_weight ), 0 );
2755
}
2756
else
2757
{
2758
avc_biwgt_opscale_4width_msa( p_pix2, pix2_stride,
2759
p_pix3, pix3_stride,
2760
p_pix1, pix1_stride, 16, 5, i_weight,
2761
( 64 - i_weight ), 0 );
2762
}
2763
}
2764
2765
void x264_pixel_avg_4x8_msa( uint8_t *p_pix1, intptr_t pix1_stride,
2766
uint8_t *p_pix2, intptr_t pix2_stride,
2767
uint8_t *p_pix3, intptr_t pix3_stride,
2768
int32_t i_weight )
2769
{
2770
if( 32 == i_weight )
2771
{
2772
avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
2773
p_pix1, pix1_stride, 8 );
2774
}
2775
else if( i_weight < 0 || i_weight > 63 )
2776
{
2777
avc_biwgt_opscale_4width_nw_msa( p_pix2, pix2_stride,
2778
p_pix3, pix3_stride,
2779
p_pix1, pix1_stride, 8, 5, i_weight,
2780
( 64 - i_weight ), 0 );
2781
}
2782
else
2783
{
2784
avc_biwgt_opscale_4width_msa( p_pix2, pix2_stride,
2785
p_pix3, pix3_stride,
2786
p_pix1, pix1_stride, 8, 5, i_weight,
2787
( 64 - i_weight ), 0 );
2788
}
2789
}
2790
2791
void x264_pixel_avg_4x4_msa( uint8_t *p_pix1, intptr_t pix1_stride,
2792
uint8_t *p_pix2, intptr_t pix2_stride,
2793
uint8_t *p_pix3, intptr_t pix3_stride,
2794
int32_t i_weight )
2795
{
2796
if( 32 == i_weight )
2797
{
2798
avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
2799
p_pix1, pix1_stride, 4 );
2800
}
2801
else if( i_weight < 0 || i_weight > 63 )
2802
{
2803
avc_biwgt_opscale_4width_nw_msa( p_pix2, pix2_stride,
2804
p_pix3, pix3_stride,
2805
p_pix1, pix1_stride, 4, 5, i_weight,
2806
( 64 - i_weight ), 0 );
2807
}
2808
else
2809
{
2810
avc_biwgt_opscale_4width_msa( p_pix2, pix2_stride,
2811
p_pix3, pix3_stride,
2812
p_pix1, pix1_stride, 4, 5, i_weight,
2813
( 64 - i_weight ), 0 );
2814
}
2815
}
2816
2817
void x264_pixel_avg_4x2_msa( uint8_t *p_pix1, intptr_t pix1_stride,
2818
uint8_t *p_pix2, intptr_t pix2_stride,
2819
uint8_t *p_pix3, intptr_t pix3_stride,
2820
int32_t i_weight )
2821
{
2822
if( 32 == i_weight )
2823
{
2824
avg_src_width4_msa( p_pix2, pix2_stride, p_pix3, pix3_stride,
2825
p_pix1, pix1_stride, 2 );
2826
}
2827
else if( i_weight < 0 || i_weight > 63 )
2828
{
2829
avc_biwgt_opscale_4x2_nw_msa( p_pix2, pix2_stride,
2830
p_pix3, pix3_stride,
2831
p_pix1, pix1_stride, 5, i_weight,
2832
( 64 - i_weight ), 0 );
2833
}
2834
else
2835
{
2836
avc_biwgt_opscale_4x2_msa( p_pix2, pix2_stride,
2837
p_pix3, pix3_stride,
2838
p_pix1, pix1_stride, 5, i_weight,
2839
( 64 - i_weight ), 0 );
2840
}
2841
}
2842
2843
2844
void x264_memzero_aligned_msa( void *p_dst, size_t n )
2845
{
2846
uint32_t u_tot32_mul_lines = n >> 5;
2847
uint32_t u_remaining = n - ( u_tot32_mul_lines << 5 );
2848
2849
memset_zero_16width_msa( p_dst, 16, ( n / 16 ) );
2850
2851
if( u_remaining )
2852
{
2853
memset( p_dst + ( u_tot32_mul_lines << 5 ), 0, u_remaining );
2854
}
2855
}
2856
2857
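/* Explicit weighted prediction wrappers: apply pWeight->i_scale,
 * pWeight->i_denom and pWeight->i_offset to a single source block at widths
 * 4, 8 and 16; width 20 is handled as 16 + 4. */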
void x264_mc_weight_w4_msa( uint8_t *p_dst, intptr_t i_dst_stride,
2858
uint8_t *p_src, intptr_t i_src_stride,
2859
const x264_weight_t *pWeight, int32_t i_height )
2860
{
2861
int32_t i_log2_denom = pWeight->i_denom;
2862
int32_t i_offset = pWeight->i_offset;
2863
int32_t i_weight = pWeight->i_scale;
2864
2865
avc_wgt_opscale_4width_msa( p_src, i_src_stride, p_dst, i_dst_stride,
2866
i_height, i_log2_denom, i_weight, i_offset );
2867
}
2868
2869
void x264_mc_weight_w8_msa( uint8_t *p_dst, intptr_t i_dst_stride,
2870
uint8_t *p_src, intptr_t i_src_stride,
2871
const x264_weight_t *pWeight, int32_t i_height )
2872
{
2873
int32_t i_log2_denom = pWeight->i_denom;
2874
int32_t i_offset = pWeight->i_offset;
2875
int32_t i_weight = pWeight->i_scale;
2876
2877
avc_wgt_opscale_8width_msa( p_src, i_src_stride, p_dst, i_dst_stride,
2878
i_height, i_log2_denom, i_weight, i_offset );
2879
}
2880
2881
void x264_mc_weight_w16_msa( uint8_t *p_dst, intptr_t i_dst_stride,
2882
uint8_t *p_src, intptr_t i_src_stride,
2883
const x264_weight_t *pWeight, int32_t i_height )
2884
{
2885
int32_t i_log2_denom = pWeight->i_denom;
2886
int32_t i_offset = pWeight->i_offset;
2887
int32_t i_weight = pWeight->i_scale;
2888
2889
avc_wgt_opscale_16width_msa( p_src, i_src_stride, p_dst, i_dst_stride,
2890
i_height, i_log2_denom, i_weight, i_offset );
2891
}
2892
2893
void x264_mc_weight_w20_msa( uint8_t *p_dst, intptr_t i_dst_stride,
2894
uint8_t *p_src, intptr_t i_src_stride,
2895
const x264_weight_t *pWeight, int32_t i_height )
2896
{
2897
x264_mc_weight_w16_msa( p_dst, i_dst_stride, p_src, i_src_stride,
2898
pWeight, i_height );
2899
x264_mc_weight_w4_msa( p_dst + 16, i_dst_stride, p_src + 16, i_src_stride,
2900
pWeight, i_height );
2901
}
2902
2903
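/* Luma motion compensation. The quarter-pel index selects one or two
 * half-pel planes from p_src[4] via the x264_hpel_ref0/ref1 tables; when two
 * planes are involved ( i_qpel_idx & 5 ) their rounded average is taken, and
 * any explicit weight is then applied ( directly from the reference when no
 * averaging is needed ).
 *
 * Worked example with a hypothetical motion vector m_vx = 5, m_vy = 2:
 * i_qpel_idx = ( ( 2 & 3 ) << 2 ) + ( 5 & 3 ) = 9 and
 * i_offset = ( 2 >> 2 ) * i_src_stride + ( 5 >> 2 ) = 1; since 9 & 5 is
 * non-zero, two half-pel planes are averaged. */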
void x264_mc_luma_msa( uint8_t *p_dst, intptr_t i_dst_stride,
2904
uint8_t *p_src[4], intptr_t i_src_stride,
2905
int32_t m_vx, int32_t m_vy,
2906
int32_t i_width, int32_t i_height,
2907
const x264_weight_t *pWeight )
2908
{
2909
int32_t i_qpel_idx;
2910
int32_t i_offset;
2911
uint8_t *p_src1;
2912
2913
i_qpel_idx = ( ( m_vy & 3 ) << 2 ) + ( m_vx & 3 );
2914
i_offset = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 );
2915
p_src1 = p_src[x264_hpel_ref0[i_qpel_idx]] + i_offset +
2916
( 3 == ( m_vy & 3 ) ) * i_src_stride;
2917
2918
if( i_qpel_idx & 5 )
2919
{
2920
uint8_t *p_src2 = p_src[x264_hpel_ref1[i_qpel_idx]] +
2921
i_offset + ( 3 == ( m_vx & 3 ) );
2922
2923
if( 16 == i_width )
2924
{
2925
avg_src_width16_msa( p_src1, i_src_stride, p_src2, i_src_stride,
2926
p_dst, i_dst_stride, i_height );
2927
}
2928
else if( 8 == i_width )
2929
{
2930
avg_src_width8_msa( p_src1, i_src_stride, p_src2, i_src_stride,
2931
p_dst, i_dst_stride, i_height );
2932
}
2933
else if( 4 == i_width )
2934
{
2935
avg_src_width4_msa( p_src1, i_src_stride, p_src2, i_src_stride,
2936
p_dst, i_dst_stride, i_height );
2937
}
2938
2939
if( pWeight->weightfn )
2940
{
2941
if( 16 == i_width )
2942
{
2943
x264_mc_weight_w16_msa( p_dst, i_dst_stride,
2944
p_dst, i_dst_stride,
2945
pWeight, i_height );
2946
}
2947
else if( 8 == i_width )
2948
{
2949
x264_mc_weight_w8_msa( p_dst, i_dst_stride, p_dst, i_dst_stride,
2950
pWeight, i_height );
2951
}
2952
else if( 4 == i_width )
2953
{
2954
x264_mc_weight_w4_msa( p_dst, i_dst_stride, p_dst, i_dst_stride,
2955
pWeight, i_height );
2956
}
2957
}
2958
}
2959
else if( pWeight->weightfn )
2960
{
2961
if( 16 == i_width )
2962
{
2963
x264_mc_weight_w16_msa( p_dst, i_dst_stride, p_src1, i_src_stride,
2964
pWeight, i_height );
2965
}
2966
else if( 8 == i_width )
2967
{
2968
x264_mc_weight_w8_msa( p_dst, i_dst_stride, p_src1, i_src_stride,
2969
pWeight, i_height );
2970
}
2971
else if( 4 == i_width )
2972
{
2973
x264_mc_weight_w4_msa( p_dst, i_dst_stride, p_src1, i_src_stride,
2974
pWeight, i_height );
2975
}
2976
}
2977
else
2978
{
2979
if( 16 == i_width )
2980
{
2981
copy_width16_msa( p_src1, i_src_stride, p_dst, i_dst_stride,
2982
i_height );
2983
}
2984
else if( 8 == i_width )
2985
{
2986
copy_width8_msa( p_src1, i_src_stride, p_dst, i_dst_stride,
2987
i_height );
2988
}
2989
else if( 4 == i_width )
2990
{
2991
copy_width4_msa( p_src1, i_src_stride, p_dst, i_dst_stride,
2992
i_height );
2993
}
2994
}
2995
}
2996
2997
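/* Chroma motion compensation on interleaved ( U, V ) chroma with 1/8-pel
 * accuracy: the bilinear filter coefficients come from the fractional MV
 * parts, ( 8 - ( m_vx & 7 ), m_vx & 7 ) horizontally and
 * ( 8 - ( m_vy & 7 ), m_vy & 7 ) vertically, with the integer part folded
 * into the source pointer. */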
void x264_mc_chroma_msa( uint8_t *p_dst_u, uint8_t *p_dst_v,
2998
intptr_t i_dst_stride,
2999
uint8_t *p_src, intptr_t i_src_stride,
3000
int32_t m_vx, int32_t m_vy,
3001
int32_t i_width, int32_t i_height )
3002
{
3003
int32_t i_d8x = m_vx & 0x07;
3004
int32_t i_d8y = m_vy & 0x07;
3005
int32_t i_coeff_horiz1 = ( 8 - i_d8x );
3006
int32_t i_coeff_vert1 = ( 8 - i_d8y );
3007
int32_t i_coeff_horiz0 = i_d8x;
3008
int32_t i_coeff_vert0 = i_d8y;
3009
3010
p_src += ( m_vy >> 3 ) * i_src_stride + ( m_vx >> 3 ) * 2;
3011
3012
if( 2 == i_width )
3013
{
3014
avc_interleaved_chroma_hv_2w_msa( p_src, i_src_stride,
3015
p_dst_u, p_dst_v, i_dst_stride,
3016
i_coeff_horiz0, i_coeff_horiz1,
3017
i_coeff_vert0, i_coeff_vert1,
3018
i_height );
3019
}
3020
else if( 4 == i_width )
3021
{
3022
avc_interleaved_chroma_hv_4w_msa( p_src, i_src_stride,
3023
p_dst_u, p_dst_v, i_dst_stride,
3024
i_coeff_horiz0, i_coeff_horiz1,
3025
i_coeff_vert0, i_coeff_vert1,
3026
i_height );
3027
}
3028
else if( 8 == i_width )
3029
{
3030
avc_interleaved_chroma_hv_8w_msa( p_src, i_src_stride,
3031
p_dst_u, p_dst_v, i_dst_stride,
3032
i_coeff_horiz0, i_coeff_horiz1,
3033
i_coeff_vert0, i_coeff_vert1,
3034
i_height );
3035
}
3036
}
3037
3038
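/* Generate the three half-pel planes 16 columns at a time: the
 * avc_luma_vt / avc_luma_hz / avc_luma_mid helpers appear to apply the 6-tap
 * H.264 interpolation filter vertically, horizontally and in both
 * directions, writing p_dst_v, p_dsth and p_dstc respectively. */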
void x264_hpel_filter_msa( uint8_t *p_dsth, uint8_t *p_dst_v,
3039
uint8_t *p_dstc, uint8_t *p_src,
3040
intptr_t i_stride, int32_t i_width,
3041
int32_t i_height, int16_t *p_buf )
3042
{
3043
for( int32_t i = 0; i < ( i_width / 16 ); i++ )
3044
{
3045
avc_luma_vt_16w_msa( p_src - 2 - ( 2 * i_stride ), i_stride,
3046
p_dst_v - 2, i_stride, i_height );
3047
avc_luma_mid_16w_msa( p_src - 2 - ( 2 * i_stride ), i_stride,
3048
p_dstc, i_stride, i_height );
3049
avc_luma_hz_16w_msa( p_src - 2, i_stride, p_dsth, i_stride, i_height );
3050
3051
p_src += 16;
3052
p_dst_v += 16;
3053
p_dsth += 16;
3054
p_dstc += 16;
3055
}
3056
}
3057
3058
void x264_plane_copy_interleave_msa( uint8_t *p_dst, intptr_t i_dst_stride,
3059
uint8_t *p_src0, intptr_t i_src_stride0,
3060
uint8_t *p_src1, intptr_t i_src_stride1,
3061
int32_t i_width, int32_t i_height )
3062
{
3063
plane_copy_interleave_msa( p_src0, i_src_stride0, p_src1, i_src_stride1,
3064
p_dst, i_dst_stride, i_width, i_height );
3065
}
3066
3067
void x264_plane_copy_deinterleave_msa( uint8_t *p_dst0, intptr_t i_dst_stride0,
3068
uint8_t *p_dst1, intptr_t i_dst_stride1,
3069
uint8_t *p_src, intptr_t i_src_stride,
3070
int32_t i_width, int32_t i_height )
3071
{
3072
plane_copy_deinterleave_msa( p_src, i_src_stride, p_dst0, i_dst_stride0,
3073
p_dst1, i_dst_stride1, i_width, i_height );
3074
}
3075
3076
void x264_plane_copy_deinterleave_rgb_msa( uint8_t *p_dst0,
3077
intptr_t i_dst_stride0,
3078
uint8_t *p_dst1,
3079
intptr_t i_dst_stride1,
3080
uint8_t *p_dst2,
3081
intptr_t i_dst_stride2,
3082
uint8_t *p_src,
3083
intptr_t i_src_stride,
3084
int32_t i_src_width,
3085
int32_t i_width,
3086
int32_t i_height )
3087
{
3088
if( 3 == i_src_width )
3089
{
3090
plane_copy_deinterleave_rgb_msa( p_src, i_src_stride,
3091
p_dst0, i_dst_stride0,
3092
p_dst1, i_dst_stride1,
3093
p_dst2, i_dst_stride2,
3094
i_width, i_height );
3095
}
3096
else if( 4 == i_src_width )
3097
{
3098
plane_copy_deinterleave_rgba_msa( p_src, i_src_stride,
3099
p_dst0, i_dst_stride0,
3100
p_dst1, i_dst_stride1,
3101
p_dst2, i_dst_stride2,
3102
i_width, i_height );
3103
}
3104
}
3105
3106
void x264_store_interleave_chroma_msa( uint8_t *p_dst, intptr_t i_dst_stride,
3107
uint8_t *p_src0, uint8_t *p_src1,
3108
int32_t i_height )
3109
{
3110
store_interleave_chroma_msa( p_src0, FDEC_STRIDE, p_src1, FDEC_STRIDE,
3111
p_dst, i_dst_stride, i_height );
3112
}
3113
3114
void x264_load_deinterleave_chroma_fenc_msa( uint8_t *p_dst, uint8_t *p_src,
3115
intptr_t i_src_stride,
3116
int32_t i_height )
3117
{
3118
plane_copy_deinterleave_msa( p_src, i_src_stride, p_dst, FENC_STRIDE,
3119
( p_dst + ( FENC_STRIDE / 2 ) ), FENC_STRIDE,
3120
8, i_height );
3121
}
3122
3123
void x264_load_deinterleave_chroma_fdec_msa( uint8_t *p_dst, uint8_t *p_src,
3124
intptr_t i_src_stride,
3125
int32_t i_height )
3126
{
3127
plane_copy_deinterleave_msa( p_src, i_src_stride, p_dst, FDEC_STRIDE,
3128
( p_dst + ( FDEC_STRIDE / 2 ) ), FDEC_STRIDE,
3129
8, i_height );
3130
}
3131
3132
void x264_frame_init_lowres_core_msa( uint8_t *p_src, uint8_t *p_dst0,
3133
uint8_t *p_dst1, uint8_t *p_dst2,
3134
uint8_t *p_dst3, intptr_t i_src_stride,
3135
intptr_t i_dst_stride, int32_t i_width,
3136
int32_t i_height )
3137
{
3138
frame_init_lowres_core_msa( p_src, i_src_stride, p_dst0, i_dst_stride,
3139
p_dst1, i_dst_stride, p_dst2, i_dst_stride,
3140
p_dst3, i_dst_stride, i_width, i_height );
3141
}
3142
3143
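/* get_ref: same half-pel plane selection as x264_mc_luma_msa, but it also
 * handles the additional widths 12 and 20, and the rows beyond the last
 * multiple of four are finished by inline per-row vector code in each
 * branch. */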
uint8_t *x264_get_ref_msa( uint8_t *p_dst, intptr_t *p_dst_stride,
3144
uint8_t *p_src[4], intptr_t i_src_stride,
3145
int32_t m_vx, int32_t m_vy,
3146
int32_t i_width, int32_t i_height,
3147
const x264_weight_t *pWeight )
3148
{
3149
int32_t i_qpel_idx, i_cnt, i_h4w;
3150
int32_t i_offset;
3151
uint8_t *p_src1, *src1_org;
3152
3153
i_qpel_idx = ( ( m_vy & 3 ) << 2 ) + ( m_vx & 3 );
3154
i_offset = ( m_vy >> 2 ) * i_src_stride + ( m_vx >> 2 );
3155
p_src1 = p_src[x264_hpel_ref0[i_qpel_idx]] + i_offset +
3156
( 3 == ( m_vy & 3 ) ) * i_src_stride;
3157
3158
i_h4w = i_height - i_height % 4;
3159
3160
if( i_qpel_idx & 5 )
3161
{
3162
uint8_t *p_src2 = p_src[x264_hpel_ref1[i_qpel_idx]] +
3163
i_offset + ( 3 == ( m_vx & 3 ) );
3164
3165
if( 16 == i_width )
3166
{
3167
avg_src_width16_msa( p_src1, i_src_stride,
3168
p_src2, i_src_stride,
3169
p_dst, *p_dst_stride, i_h4w );
3170
for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
3171
{
3172
v16u8 src_vec1, src_vec2;
3173
v16u8 dst_vec0;
3174
3175
src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride );
3176
src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride );
3177
3178
dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 );
3179
3180
ST_UB( dst_vec0, p_dst + i_cnt * ( *p_dst_stride ) );
3181
}
3182
}
3183
else if( 20 == i_width )
3184
{
3185
avg_src_width16_msa( p_src1, i_src_stride, p_src2, i_src_stride,
3186
p_dst, *p_dst_stride, i_h4w );
3187
avg_src_width4_msa( p_src1 + 16, i_src_stride,
3188
p_src2 + 16, i_src_stride,
3189
p_dst + 16, *p_dst_stride, i_h4w );
3190
3191
for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
3192
{
3193
v16u8 src_vec1, src_vec2, src_vec3, src_vec4;
3194
v16u8 dst_vec0, dst_vec1;
3195
uint32_t temp0;
3196
3197
src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride );
3198
src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride );
3199
src_vec3 = LD_UB( p_src1 + i_cnt * i_src_stride + 16 );
3200
src_vec4 = LD_UB( p_src2 + i_cnt * i_src_stride + 16 );
3201
3202
dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 );
3203
dst_vec1 = __msa_aver_u_b( src_vec3, src_vec4 );
3204
3205
temp0 = __msa_copy_u_w( ( v4i32 ) dst_vec1, 0 );
3206
3207
ST_UB( dst_vec0, p_dst + i_cnt * ( *p_dst_stride ) );
3208
SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) + 16 );
3209
}
3210
}
3211
else if( 12 == i_width )
3212
{
3213
avg_src_width8_msa( p_src1, i_src_stride,
3214
p_src2, i_src_stride,
3215
p_dst, *p_dst_stride, i_h4w );
3216
avg_src_width4_msa( p_src1 + 8, i_src_stride,
3217
p_src2 + 8, i_src_stride,
3218
p_dst + 8, *p_dst_stride, i_h4w );
3219
for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
3220
{
3221
uint32_t temp0;
3222
uint64_t dst0;
3223
v16u8 src_vec1, src_vec2;
3224
v16u8 dst_vec0;
3225
3226
src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride );
3227
src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride );
3228
3229
dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 );
3230
3231
dst0 = __msa_copy_u_d( ( v2i64 ) dst_vec0, 0 );
3232
temp0 = __msa_copy_u_w( ( v4i32 ) dst_vec0, 2 );
3233
3234
SD( dst0, p_dst + i_cnt * ( *p_dst_stride ) );
3235
SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) + 8 );
3236
}
3237
}
3238
else if( 8 == i_width )
3239
{
3240
avg_src_width8_msa( p_src1, i_src_stride,
3241
p_src2, i_src_stride,
3242
p_dst, *p_dst_stride, i_h4w );
3243
for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
3244
{
3245
uint64_t dst0;
3246
v16u8 src_vec1, src_vec2;
3247
v16u8 dst_vec0;
3248
3249
src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride );
3250
src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride );
3251
3252
dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 );
3253
3254
dst0 = __msa_copy_u_d( ( v2i64 ) dst_vec0, 0 );
3255
3256
SD( dst0, p_dst + i_cnt * ( *p_dst_stride ) );
3257
}
3258
}
3259
else if( 4 == i_width )
3260
{
3261
avg_src_width4_msa( p_src1, i_src_stride,
3262
p_src2, i_src_stride,
3263
p_dst, *p_dst_stride, i_h4w );
3264
for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
3265
{
3266
uint32_t temp0;
3267
v16u8 src_vec1, src_vec2;
3268
v16u8 dst_vec0;
3269
3270
src_vec1 = LD_UB( p_src1 + i_cnt * i_src_stride );
3271
src_vec2 = LD_UB( p_src2 + i_cnt * i_src_stride );
3272
3273
dst_vec0 = __msa_aver_u_b( src_vec1, src_vec2 );
3274
temp0 = __msa_copy_u_w( ( v4i32 ) dst_vec0, 0 );
3275
3276
SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) );
3277
}
3278
}
3279
3280
        if( pWeight->weightfn )
        {
            int32_t i_log2_denom;
            int32_t i_offset_val;
            int32_t i_weight;

            i_log2_denom = pWeight->i_denom;
            i_offset_val = pWeight->i_offset;
            i_weight = pWeight->i_scale;

            if( 16 == i_width || 12 == i_width )
            {
                x264_mc_weight_w16_msa( p_dst, *p_dst_stride,
                                        p_dst, *p_dst_stride,
                                        pWeight, i_h4w );
                for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
                {
                    v16i8 zero = {0};
                    v16u8 src_vec0;
                    v16i8 tmp0;
                    v8u16 temp_vec0, temp_vec1;
                    v8u16 wgt, offset_val0;
                    v8i16 denom;

                    i_offset_val <<= ( i_log2_denom );

                    if( i_log2_denom )
                    {
                        i_offset_val += ( 1 << ( i_log2_denom - 1 ) );
                    }

                    wgt = ( v8u16 ) __msa_fill_h( i_weight );
                    offset_val0 = ( v8u16 ) __msa_fill_h( i_offset_val );
                    denom = __msa_fill_h( i_log2_denom );

                    src_vec0 = LD_UB( p_dst + i_cnt * ( *p_dst_stride ) );

                    temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero,
                                                        ( v16i8 ) src_vec0 );
                    temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero,
                                                        ( v16i8 ) src_vec0 );

                    temp_vec0 = wgt * temp_vec0;
                    temp_vec1 = wgt * temp_vec1;

                    temp_vec0 =
                        ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                  ( v8i16 ) offset_val0 );
                    temp_vec1 =
                        ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1,
                                                  ( v8i16 ) offset_val0 );

                    temp_vec0 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                    temp_vec1 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 );

                    temp_vec0 =
                        ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                    temp_vec1 =
                        ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom );

                    temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
                    temp_vec1 = __msa_sat_u_h( temp_vec1, 7 );

                    tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1,
                                          ( v16i8 ) temp_vec0 );
                    ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) );
                }
            }
            else if( 20 == i_width )
            {
                x264_mc_weight_w20_msa( p_dst, *p_dst_stride,
                                        p_dst, *p_dst_stride,
                                        pWeight, i_h4w );
                for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
                {
                    uint32_t temp0;
                    v16i8 zero = {0};
                    v16u8 src_vec0;
                    v16i8 tmp0;
                    v8u16 temp_vec0, temp_vec1;
                    v8u16 wgt;
                    v8i16 denom, offset_val0;

                    i_offset_val <<= ( i_log2_denom );

                    if( i_log2_denom )
                    {
                        i_offset_val += ( 1 << ( i_log2_denom - 1 ) );
                    }

                    wgt = ( v8u16 ) __msa_fill_h( i_weight );
                    offset_val0 = __msa_fill_h( i_offset_val );
                    denom = __msa_fill_h( i_log2_denom );

                    src_vec0 = LD_UB( p_dst + i_cnt * ( *p_dst_stride ) );
                    temp0 = LW( p_dst + i_cnt * ( *p_dst_stride ) + 16 );

                    temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero,
                                                        ( v16i8 ) src_vec0 );
                    temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero,
                                                        ( v16i8 ) src_vec0 );

                    temp_vec0 = wgt * temp_vec0;
                    temp_vec1 = wgt * temp_vec1;

                    temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                          offset_val0 );
                    temp_vec1 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1,
                                                          offset_val0 );

                    temp_vec0 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                    temp_vec1 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 );

                    temp_vec0 =
                        ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                    temp_vec1 =
                        ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom );

                    temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
                    temp_vec1 = __msa_sat_u_h( temp_vec1, 7 );

                    tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1,
                                          ( v16i8 ) temp_vec0 );
                    ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) );

                    src_vec0 = ( v16u8 ) __msa_fill_w( temp0 );
                    temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero,
                                                        ( v16i8 ) src_vec0 );
                    temp_vec0 = wgt * temp_vec0;

                    temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                          offset_val0 );
                    temp_vec0 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                    temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0,
                                                       denom );
                    temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );

                    tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
                                          ( v16i8 ) temp_vec0 );
                    temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 );
                    SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) + 16 );
                }
            }
            else if( 8 == i_width )
            {
                x264_mc_weight_w8_msa( p_dst, *p_dst_stride,
                                       p_dst, *p_dst_stride,
                                       pWeight, i_h4w );
                for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
                {
                    uint64_t temp0;
                    v16i8 zero = {0};
                    v16u8 src_vec0;
                    v16i8 tmp0;
                    v8u16 temp_vec0;
                    v8u16 wgt;
                    v8i16 denom, offset_val0;

                    i_offset_val = i_offset_val << i_log2_denom;

                    if( i_log2_denom )
                    {
                        i_offset_val += ( 1 << ( i_log2_denom - 1 ) );
                    }

                    wgt = ( v8u16 ) __msa_fill_h( i_weight );
                    offset_val0 = __msa_fill_h( i_offset_val );
                    denom = __msa_fill_h( i_log2_denom );

                    src_vec0 = LD_UB( p_dst + i_cnt * ( *p_dst_stride ) );

                    temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero,
                                                        ( v16i8 ) src_vec0 );
                    temp_vec0 = wgt * temp_vec0;

                    temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                          offset_val0 );
                    temp_vec0 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                    temp_vec0 =
                        ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                    temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );

                    tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
                                          ( v16i8 ) temp_vec0 );
                    temp0 = __msa_copy_u_d( ( v2i64 ) tmp0, 0 );
                    SD( temp0, p_dst + i_cnt * ( *p_dst_stride ) );
                }
            }
            else if( 4 == i_width )
            {
                x264_mc_weight_w4_msa( p_dst, *p_dst_stride,
                                       p_dst, *p_dst_stride,
                                       pWeight, i_h4w );
                for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
                {
                    uint32_t temp0;
                    v16i8 zero = {0};
                    v16u8 src_vec0;
                    v16i8 tmp0;
                    v8u16 temp_vec0;
                    v8u16 wgt;
                    v8i16 denom, offset_val0;

                    i_offset_val <<= ( i_log2_denom );

                    if( i_log2_denom )
                    {
                        i_offset_val += ( 1 << ( i_log2_denom - 1 ) );
                    }

                    wgt = ( v8u16 ) __msa_fill_h( i_weight );
                    offset_val0 = __msa_fill_h( i_offset_val );
                    denom = __msa_fill_h( i_log2_denom );

                    temp0 = LW( p_dst + i_cnt * ( *p_dst_stride ) );

                    src_vec0 = ( v16u8 ) __msa_fill_w( temp0 );

                    temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero,
                                                        ( v16i8 ) src_vec0 );
                    temp_vec0 = wgt * temp_vec0;

                    temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                          offset_val0 );
                    temp_vec0 =
                        ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                    temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0,
                                                       denom );
                    temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );

                    tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
                                          ( v16i8 ) temp_vec0 );
                    temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 );
                    SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) );
                }
            }
        }

        return p_dst;
    }
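    /* No sub-pel averaging needed: apply the weighting directly from the
     * reference plane (p_src1) into p_dst. */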
    else if( pWeight->weightfn )
    {
        int32_t i_offset_val, i_log2_denom, i_weight;

        i_log2_denom = pWeight->i_denom;
        i_offset_val = pWeight->i_offset;
        i_weight = pWeight->i_scale;

        i_h4w = i_height - i_height % 4;

        src1_org = p_src1;

        if( 16 == i_width || 12 == i_width )
        {
            x264_mc_weight_w16_msa( p_dst, *p_dst_stride, p_src1, i_src_stride,
                                    pWeight, i_h4w );
            p_src1 = src1_org + i_h4w * i_src_stride;

            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                v16i8 zero = {0};
                v16u8 src_vec0;
                v16i8 tmp0;
                v8u16 temp_vec0, temp_vec1;
                v8u16 wgt;
                v8i16 denom, offset_val0;

                i_offset_val <<= ( i_log2_denom );

                if( i_log2_denom )
                {
                    i_offset_val += ( 1 << ( i_log2_denom - 1 ) );
                }

                wgt = ( v8u16 ) __msa_fill_h( i_weight );
                offset_val0 = __msa_fill_h( i_offset_val );
                denom = __msa_fill_h( i_log2_denom );

                src_vec0 = LD_UB( p_src1 );
                p_src1 += i_src_stride;

                temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero, ( v16i8 ) src_vec0 );
                temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 );

                temp_vec0 = wgt * temp_vec0;
                temp_vec1 = wgt * temp_vec1;

                temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                      offset_val0 );
                temp_vec1 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1,
                                                      offset_val0 );

                temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                temp_vec1 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 );

                temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                temp_vec1 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom );

                temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
                temp_vec1 = __msa_sat_u_h( temp_vec1, 7 );

                tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1,
                                      ( v16i8 ) temp_vec0 );
                ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) );
            }
        }
        else if( 20 == i_width )
        {
            x264_mc_weight_w20_msa( p_dst, *p_dst_stride, p_src1, i_src_stride,
                                    pWeight, i_h4w );
            p_src1 = src1_org + i_h4w * i_src_stride;

            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                uint32_t temp0;
                v16i8 zero = {0};
                v16u8 src_vec0;
                v16i8 tmp0;
                v8u16 temp_vec0, temp_vec1;
                v8u16 wgt;
                v8i16 denom, offset_val0;

                i_offset_val <<= ( i_log2_denom );

                if( i_log2_denom )
                {
                    i_offset_val += ( 1 << ( i_log2_denom - 1 ) );
                }

                wgt = ( v8u16 ) __msa_fill_h( i_weight );
                offset_val0 = __msa_fill_h( i_offset_val );
                denom = __msa_fill_h( i_log2_denom );

                src_vec0 = LD_UB( p_src1 );
                temp0 = LW( p_src1 + 16 );
                p_src1 += i_src_stride;

                temp_vec1 = ( v8u16 ) __msa_ilvl_b( zero, ( v16i8 ) src_vec0 );
                temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 );

                temp_vec0 = wgt * temp_vec0;
                temp_vec1 = wgt * temp_vec1;

                temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                      offset_val0 );
                temp_vec1 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec1,
                                                      offset_val0 );

                temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                temp_vec1 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec1, 0 );

                temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                temp_vec1 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec1, denom );

                temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );
                temp_vec1 = __msa_sat_u_h( temp_vec1, 7 );

                tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec1,
                                      ( v16i8 ) temp_vec0 );
                ST_SB( tmp0, p_dst + i_cnt * ( *p_dst_stride ) );

                src_vec0 = ( v16u8 ) __msa_fill_w( temp0 );
                temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 );
                temp_vec0 = wgt * temp_vec0;

                temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                      offset_val0 );
                temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );

                tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
                                      ( v16i8 ) temp_vec0 );
                temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 );
                SW( temp0, p_dst + i_cnt * ( *p_dst_stride ) + 16 );
            }
        }
        else if( 8 == i_width )
        {
            x264_mc_weight_w8_msa( p_dst, *p_dst_stride, p_src1, i_src_stride,
                                   pWeight, i_h4w );
            p_src1 = src1_org + i_h4w * i_src_stride;

            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                uint64_t u_temp0;
                v16i8 zero = {0};
                v16u8 src_vec0;
                v16i8 tmp0;
                v8u16 temp_vec0;
                v8u16 wgt;
                v8i16 denom, offset_val0;

                i_offset_val = i_offset_val << i_log2_denom;

                if( i_log2_denom )
                {
                    i_offset_val += ( 1 << ( i_log2_denom - 1 ) );
                }

                wgt = ( v8u16 ) __msa_fill_h( i_weight );
                offset_val0 = __msa_fill_h( i_offset_val );
                denom = __msa_fill_h( i_log2_denom );

                src_vec0 = LD_UB( p_src1 );
                p_src1 += i_src_stride;

                temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 );
                temp_vec0 = wgt * temp_vec0;

                temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                      offset_val0 );
                temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );

                tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
                                      ( v16i8 ) temp_vec0 );
                u_temp0 = __msa_copy_u_d( ( v2i64 ) tmp0, 0 );
                SD( u_temp0, p_dst + i_cnt * ( *p_dst_stride ) );
            }
        }
        else if( 4 == i_width )
        {
            x264_mc_weight_w4_msa( p_dst, *p_dst_stride, p_src1, i_src_stride,
                                   pWeight, i_h4w );
            p_src1 = src1_org + i_h4w * i_src_stride;

            for( i_cnt = i_h4w; i_cnt < i_height; i_cnt++ )
            {
                uint32_t u_temp0;
                v16i8 zero = {0};
                v16u8 src_vec0;
                v16i8 tmp0;
                v8u16 temp_vec0;
                v8u16 wgt;
                v8i16 denom, offset_val0;

                i_offset_val <<= ( i_log2_denom );

                if( i_log2_denom )
                {
                    i_offset_val += ( 1 << ( i_log2_denom - 1 ) );
                }

                wgt = ( v8u16 ) __msa_fill_h( i_weight );
                offset_val0 = __msa_fill_h( i_offset_val );
                denom = __msa_fill_h( i_log2_denom );

                u_temp0 = LW( p_src1 );
                p_src1 += i_src_stride;

                src_vec0 = ( v16u8 ) __msa_fill_w( u_temp0 );

                temp_vec0 = ( v8u16 ) __msa_ilvr_b( zero, ( v16i8 ) src_vec0 );
                temp_vec0 = wgt * temp_vec0;

                temp_vec0 = ( v8u16 ) __msa_adds_s_h( ( v8i16 ) temp_vec0,
                                                      offset_val0 );
                temp_vec0 = ( v8u16 ) __msa_maxi_s_h( ( v8i16 ) temp_vec0, 0 );
                temp_vec0 = ( v8u16 ) __msa_srl_h( ( v8i16 ) temp_vec0, denom );
                temp_vec0 = __msa_sat_u_h( temp_vec0, 7 );

                tmp0 = __msa_pckev_b( ( v16i8 ) temp_vec0,
                                      ( v16i8 ) temp_vec0 );
                u_temp0 = __msa_copy_u_w( ( v4i32 ) tmp0, 0 );
                SW( u_temp0, p_dst + i_cnt * ( *p_dst_stride ) );
            }
        }

        return p_dst;
    }
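    /* Neither averaging nor weighting is required: hand back the reference
     * pointer and its stride unchanged. */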
    else
    {
        *p_dst_stride = i_src_stride;
        return p_src1;
    }
}

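/* Plug the MSA-optimised implementations into the motion-compensation
 * function table when the CPU advertises MSA support. */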
void x264_mc_init_mips( int32_t cpu, x264_mc_functions_t *pf )
{
    if( cpu & X264_CPU_MSA )
    {
        pf->mc_luma = x264_mc_luma_msa;
        pf->mc_chroma = x264_mc_chroma_msa;
        pf->get_ref = x264_get_ref_msa;

        pf->avg[PIXEL_16x16]= x264_pixel_avg_16x16_msa;
        pf->avg[PIXEL_16x8] = x264_pixel_avg_16x8_msa;
        pf->avg[PIXEL_8x16] = x264_pixel_avg_8x16_msa;
        pf->avg[PIXEL_8x8] = x264_pixel_avg_8x8_msa;
        pf->avg[PIXEL_8x4] = x264_pixel_avg_8x4_msa;
        pf->avg[PIXEL_4x16] = x264_pixel_avg_4x16_msa;
        pf->avg[PIXEL_4x8] = x264_pixel_avg_4x8_msa;
        pf->avg[PIXEL_4x4] = x264_pixel_avg_4x4_msa;
        pf->avg[PIXEL_4x2] = x264_pixel_avg_4x2_msa;

        pf->weight = x264_mc_weight_wtab_msa;
        pf->offsetadd = x264_mc_weight_wtab_msa;
        pf->offsetsub = x264_mc_weight_wtab_msa;

        pf->copy_16x16_unaligned = x264_mc_copy_w16_msa;
        pf->copy[PIXEL_16x16] = x264_mc_copy_w16_msa;
        pf->copy[PIXEL_8x8] = x264_mc_copy_w8_msa;
        pf->copy[PIXEL_4x4] = x264_mc_copy_w4_msa;

        pf->store_interleave_chroma = x264_store_interleave_chroma_msa;
        pf->load_deinterleave_chroma_fenc = x264_load_deinterleave_chroma_fenc_msa;
        pf->load_deinterleave_chroma_fdec = x264_load_deinterleave_chroma_fdec_msa;

        pf->plane_copy_interleave = x264_plane_copy_interleave_msa;
        pf->plane_copy_deinterleave = x264_plane_copy_deinterleave_msa;
        pf->plane_copy_deinterleave_rgb = x264_plane_copy_deinterleave_rgb_msa;

        pf->hpel_filter = x264_hpel_filter_msa;

        pf->memcpy_aligned = memcpy;
        pf->memzero_aligned = x264_memzero_aligned_msa;
        pf->frame_init_lowres_core = x264_frame_init_lowres_core_msa;
    }
}
#endif