/*****************************************************************************
 * pixel-c.c: msa pixel metrics
 *****************************************************************************
 * Copyright (C) 2015-2016 x264 project
 *
 * Authors: Mandar Sahastrabuddhe <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at [email protected].
 *****************************************************************************/

#include "common/common.h"
#include "macros.h"
#include "pixel.h"
#include "predict.h"

#if !HIGH_BIT_DEPTH
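/* Accumulate the sum of squared byte differences between 'src' and 'ref'
 * into the v4i32 accumulator 'var'. CALC_MSE_AVG_B additionally accumulates
 * the signed differences into 'sub' so the caller can recover the total
 * difference of the block. */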
#define CALC_MSE_B( src, ref, var )                                     \
{                                                                       \
    v16u8 src_l0_m, src_l1_m;                                           \
    v8i16 res_l0_m, res_l1_m;                                           \
                                                                        \
    ILVRL_B2_UB( src, ref, src_l0_m, src_l1_m );                        \
    HSUB_UB2_SH( src_l0_m, src_l1_m, res_l0_m, res_l1_m );              \
    DPADD_SH2_SW( res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var );   \
}

#define CALC_MSE_AVG_B( src, ref, var, sub )                            \
{                                                                       \
    v16u8 src_l0_m, src_l1_m;                                           \
    v8i16 res_l0_m, res_l1_m;                                           \
                                                                        \
    ILVRL_B2_UB( src, ref, src_l0_m, src_l1_m );                        \
    HSUB_UB2_SH( src_l0_m, src_l1_m, res_l0_m, res_l1_m );              \
    DPADD_SH2_SW( res_l0_m, res_l1_m, res_l0_m, res_l1_m, var, var );   \
                                                                        \
    sub += res_l0_m + res_l1_m;                                         \
}

#define VARIANCE_WxH( sse, diff, shift )                                \
    ( ( sse ) - ( ( ( uint32_t )( diff ) * ( diff ) ) >> ( shift ) ) )

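/* Generic WxH sum of absolute differences between 'p_src' and 'p_ref':
 * several rows are loaded per iteration and reduced with __msa_asub_u_b
 * plus horizontal adds. i_height must be a multiple of 4. */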
static uint32_t sad_4width_msa( uint8_t *p_src, int32_t i_src_stride,
                                uint8_t *p_ref, int32_t i_ref_stride,
                                int32_t i_height )
{
    int32_t i_ht_cnt;
    uint32_t u_src0, u_src1, u_src2, u_src3, u_ref0, u_ref1, u_ref2, u_ref3;
    v16u8 src = { 0 };
    v16u8 ref = { 0 };
    v16u8 diff;
    v8u16 sad = { 0 };

    for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
    {
        LW4( p_src, i_src_stride, u_src0, u_src1, u_src2, u_src3 );
        p_src += ( 4 * i_src_stride );
        LW4( p_ref, i_ref_stride, u_ref0, u_ref1, u_ref2, u_ref3 );
        p_ref += ( 4 * i_ref_stride );

        INSERT_W4_UB( u_src0, u_src1, u_src2, u_src3, src );
        INSERT_W4_UB( u_ref0, u_ref1, u_ref2, u_ref3, ref );

        diff = __msa_asub_u_b( src, ref );
        sad += __msa_hadd_u_h( diff, diff );
    }

    return ( HADD_UH_U32( sad ) );
}

static uint32_t sad_8width_msa( uint8_t *p_src, int32_t i_src_stride,
                                uint8_t *p_ref, int32_t i_ref_stride,
                                int32_t i_height )
{
    int32_t i_ht_cnt;
    v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
    v8u16 sad = { 0 };

    for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
    {
        LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
        p_src += ( 4 * i_src_stride );
        LD_UB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 );
        p_ref += ( 4 * i_ref_stride );

        PCKEV_D4_UB( src1, src0, src3, src2, ref1, ref0, ref3, ref2,
                     src0, src1, ref0, ref1 );
        sad += SAD_UB2_UH( src0, src1, ref0, ref1 );
    }

    return ( HADD_UH_U32( sad ) );
}

static uint32_t sad_16width_msa( uint8_t *p_src, int32_t i_src_stride,
                                 uint8_t *p_ref, int32_t i_ref_stride,
                                 int32_t i_height )
{
    int32_t i_ht_cnt;
    v16u8 src0, src1, ref0, ref1;
    v8u16 sad = { 0 };

    for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
    {
        LD_UB2( p_src, i_src_stride, src0, src1 );
        p_src += ( 2 * i_src_stride );
        LD_UB2( p_ref, i_ref_stride, ref0, ref1 );
        p_ref += ( 2 * i_ref_stride );
        sad += SAD_UB2_UH( src0, src1, ref0, ref1 );

        LD_UB2( p_src, i_src_stride, src0, src1 );
        p_src += ( 2 * i_src_stride );
        LD_UB2( p_ref, i_ref_stride, ref0, ref1 );
        p_ref += ( 2 * i_ref_stride );
        sad += SAD_UB2_UH( src0, src1, ref0, ref1 );
    }

    return ( HADD_UH_U32( sad ) );
}

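/* Multi-reference SAD: compute the SAD of one source block against three
 * (x3d) or four (x4d) reference blocks in a single pass and store the
 * per-reference results in pu_sad_array. */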
static void sad_4width_x3d_msa( uint8_t *p_src, int32_t i_src_stride,
                                uint8_t *p_ref0, uint8_t *p_ref1,
                                uint8_t *p_ref2, int32_t i_ref_stride,
                                int32_t i_height, uint32_t *pu_sad_array )
{
    int32_t i_ht_cnt;
    v16u8 src = { 0 };
    uint32_t src0, src1, src2, src3, load0, load1, load2, load3;
    v16u8 ref0 = { 0 };
    v16u8 ref1 = { 0 };
    v16u8 ref2 = { 0 };
    v16u8 diff;
    v8u16 sad0 = { 0 };
    v8u16 sad1 = { 0 };
    v8u16 sad2 = { 0 };

    for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
    {
        LW4( p_src, i_src_stride, src0, src1, src2, src3 );
        INSERT_W4_UB( src0, src1, src2, src3, src );
        p_src += ( 4 * i_src_stride );

        LW4( p_ref0, i_ref_stride, load0, load1, load2, load3 );
        INSERT_W4_UB( load0, load1, load2, load3, ref0 );
        p_ref0 += ( 4 * i_ref_stride );

        LW4( p_ref1, i_ref_stride, load0, load1, load2, load3 );
        INSERT_W4_UB( load0, load1, load2, load3, ref1 );
        p_ref1 += ( 4 * i_ref_stride );

        LW4( p_ref2, i_ref_stride, load0, load1, load2, load3 );
        INSERT_W4_UB( load0, load1, load2, load3, ref2 );
        p_ref2 += ( 4 * i_ref_stride );

        diff = __msa_asub_u_b( src, ref0 );
        sad0 += __msa_hadd_u_h( diff, diff );

        diff = __msa_asub_u_b( src, ref1 );
        sad1 += __msa_hadd_u_h( diff, diff );

        diff = __msa_asub_u_b( src, ref2 );
        sad2 += __msa_hadd_u_h( diff, diff );
    }

    pu_sad_array[0] = HADD_UH_U32( sad0 );
    pu_sad_array[1] = HADD_UH_U32( sad1 );
    pu_sad_array[2] = HADD_UH_U32( sad2 );
}

static void sad_8width_x3d_msa( uint8_t *p_src, int32_t i_src_stride,
                                uint8_t *p_ref0, uint8_t *p_ref1,
                                uint8_t *p_ref2, int32_t i_ref_stride,
                                int32_t i_height, uint32_t *pu_sad_array )
{
    int32_t i_ht_cnt;
    v16u8 src0, src1, src2, src3;
    v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
    v8u16 sad0 = { 0 };
    v8u16 sad1 = { 0 };
    v8u16 sad2 = { 0 };

    for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
    {
        LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
        p_src += ( 4 * i_src_stride );
        LD_UB4( p_ref0, i_ref_stride, ref00, ref11, ref22, ref33 );
        p_ref0 += ( 4 * i_ref_stride );

        PCKEV_D4_UB( src1, src0, src3, src2, ref11, ref00, ref33, ref22,
                     src0, src1, ref0, ref1 );
        sad0 += SAD_UB2_UH( src0, src1, ref0, ref1 );

        LD_UB4( p_ref1, i_ref_stride, ref00, ref11, ref22, ref33 );
        p_ref1 += ( 4 * i_ref_stride );

        PCKEV_D2_UB( ref11, ref00, ref33, ref22, ref0, ref1 );
        sad1 += SAD_UB2_UH( src0, src1, ref0, ref1 );

        LD_UB4( p_ref2, i_ref_stride, ref00, ref11, ref22, ref33 );
        p_ref2 += ( 4 * i_ref_stride );

        PCKEV_D2_UB( ref11, ref00, ref33, ref22, ref0, ref1 );
        sad2 += SAD_UB2_UH( src0, src1, ref0, ref1 );
    }

    pu_sad_array[0] = HADD_UH_U32( sad0 );
    pu_sad_array[1] = HADD_UH_U32( sad1 );
    pu_sad_array[2] = HADD_UH_U32( sad2 );
}

static void sad_16width_x3d_msa( uint8_t *p_src, int32_t i_src_stride,
                                 uint8_t *p_ref0, uint8_t *p_ref1,
                                 uint8_t *p_ref2, int32_t i_ref_stride,
                                 int32_t i_height, uint32_t *pu_sad_array )
{
    int32_t i_ht_cnt;
    v16u8 src, ref;
    v16u8 diff;
    v8u16 sad0 = { 0 };
    v8u16 sad1 = { 0 };
    v8u16 sad2 = { 0 };

    for ( i_ht_cnt = ( i_height >> 1 ); i_ht_cnt--; )
    {
        src = LD_UB( p_src );
        p_src += i_src_stride;

        ref = LD_UB( p_ref0 );
        p_ref0 += i_ref_stride;
        diff = __msa_asub_u_b( src, ref );
        sad0 += __msa_hadd_u_h( diff, diff );

        ref = LD_UB( p_ref1 );
        p_ref1 += i_ref_stride;
        diff = __msa_asub_u_b( src, ref );
        sad1 += __msa_hadd_u_h( diff, diff );

        ref = LD_UB( p_ref2 );
        p_ref2 += i_ref_stride;
        diff = __msa_asub_u_b( src, ref );
        sad2 += __msa_hadd_u_h( diff, diff );

        src = LD_UB( p_src );
        p_src += i_src_stride;

        ref = LD_UB( p_ref0 );
        p_ref0 += i_ref_stride;
        diff = __msa_asub_u_b( src, ref );
        sad0 += __msa_hadd_u_h( diff, diff );

        ref = LD_UB( p_ref1 );
        p_ref1 += i_ref_stride;
        diff = __msa_asub_u_b( src, ref );
        sad1 += __msa_hadd_u_h( diff, diff );

        ref = LD_UB( p_ref2 );
        p_ref2 += i_ref_stride;
        diff = __msa_asub_u_b( src, ref );
        sad2 += __msa_hadd_u_h( diff, diff );
    }

    pu_sad_array[0] = HADD_UH_U32( sad0 );
    pu_sad_array[1] = HADD_UH_U32( sad1 );
    pu_sad_array[2] = HADD_UH_U32( sad2 );
}

static void sad_4width_x4d_msa( uint8_t *p_src, int32_t i_src_stride,
                                uint8_t *p_aref[], int32_t i_ref_stride,
                                int32_t i_height, uint32_t *pu_sad_array )
{
    uint8_t *p_ref0, *p_ref1, *p_ref2, *p_ref3;
    int32_t i_ht_cnt;
    uint32_t src0, src1, src2, src3;
    uint32_t ref0, ref1, ref2, ref3;
    v16u8 src = { 0 };
    v16u8 ref = { 0 };
    v16u8 diff;
    v8u16 sad0 = { 0 };
    v8u16 sad1 = { 0 };
    v8u16 sad2 = { 0 };
    v8u16 sad3 = { 0 };

    p_ref0 = p_aref[0];
    p_ref1 = p_aref[1];
    p_ref2 = p_aref[2];
    p_ref3 = p_aref[3];

    for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
    {
        LW4( p_src, i_src_stride, src0, src1, src2, src3 );
        INSERT_W4_UB( src0, src1, src2, src3, src );
        p_src += ( 4 * i_src_stride );

        LW4( p_ref0, i_ref_stride, ref0, ref1, ref2, ref3 );
        INSERT_W4_UB( ref0, ref1, ref2, ref3, ref );
        p_ref0 += ( 4 * i_ref_stride );

        diff = __msa_asub_u_b( src, ref );
        sad0 += __msa_hadd_u_h( diff, diff );

        LW4( p_ref1, i_ref_stride, ref0, ref1, ref2, ref3 );
        INSERT_W4_UB( ref0, ref1, ref2, ref3, ref );
        p_ref1 += ( 4 * i_ref_stride );

        diff = __msa_asub_u_b( src, ref );
        sad1 += __msa_hadd_u_h( diff, diff );

        LW4( p_ref2, i_ref_stride, ref0, ref1, ref2, ref3 );
        INSERT_W4_UB( ref0, ref1, ref2, ref3, ref );
        p_ref2 += ( 4 * i_ref_stride );

        diff = __msa_asub_u_b( src, ref );
        sad2 += __msa_hadd_u_h( diff, diff );

        LW4( p_ref3, i_ref_stride, ref0, ref1, ref2, ref3 );
        INSERT_W4_UB( ref0, ref1, ref2, ref3, ref );
        p_ref3 += ( 4 * i_ref_stride );

        diff = __msa_asub_u_b( src, ref );
        sad3 += __msa_hadd_u_h( diff, diff );
    }

    pu_sad_array[0] = HADD_UH_U32( sad0 );
    pu_sad_array[1] = HADD_UH_U32( sad1 );
    pu_sad_array[2] = HADD_UH_U32( sad2 );
    pu_sad_array[3] = HADD_UH_U32( sad3 );
}

static void sad_8width_x4d_msa( uint8_t *p_src, int32_t i_src_stride,
                                uint8_t *p_aref[], int32_t i_ref_stride,
                                int32_t i_height, uint32_t *pu_sad_array )
{
    int32_t i_ht_cnt;
    uint8_t *p_ref0, *p_ref1, *p_ref2, *p_ref3;
    v16u8 src0, src1, src2, src3;
    v16u8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
    v16u8 ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
    v8u16 sad0 = { 0 };
    v8u16 sad1 = { 0 };
    v8u16 sad2 = { 0 };
    v8u16 sad3 = { 0 };

    p_ref0 = p_aref[0];
    p_ref1 = p_aref[1];
    p_ref2 = p_aref[2];
    p_ref3 = p_aref[3];

    for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
    {
        LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
        p_src += ( 4 * i_src_stride );
        LD_UB4( p_ref0, i_ref_stride, ref0, ref1, ref2, ref3 );
        p_ref0 += ( 4 * i_ref_stride );
        LD_UB4( p_ref1, i_ref_stride, ref4, ref5, ref6, ref7 );
        p_ref1 += ( 4 * i_ref_stride );
        LD_UB4( p_ref2, i_ref_stride, ref8, ref9, ref10, ref11 );
        p_ref2 += ( 4 * i_ref_stride );
        LD_UB4( p_ref3, i_ref_stride, ref12, ref13, ref14, ref15 );
        p_ref3 += ( 4 * i_ref_stride );

        PCKEV_D2_UB( src1, src0, src3, src2, src0, src1 );
        PCKEV_D2_UB( ref1, ref0, ref3, ref2, ref0, ref1 );
        sad0 += SAD_UB2_UH( src0, src1, ref0, ref1 );

        PCKEV_D2_UB( ref5, ref4, ref7, ref6, ref0, ref1 );
        sad1 += SAD_UB2_UH( src0, src1, ref0, ref1 );

        PCKEV_D2_UB( ref9, ref8, ref11, ref10, ref0, ref1 );
        sad2 += SAD_UB2_UH( src0, src1, ref0, ref1 );

        PCKEV_D2_UB( ref13, ref12, ref15, ref14, ref0, ref1 );
        sad3 += SAD_UB2_UH( src0, src1, ref0, ref1 );
    }

    pu_sad_array[0] = HADD_UH_U32( sad0 );
    pu_sad_array[1] = HADD_UH_U32( sad1 );
    pu_sad_array[2] = HADD_UH_U32( sad2 );
    pu_sad_array[3] = HADD_UH_U32( sad3 );
}

static void sad_16width_x4d_msa( uint8_t *p_src, int32_t i_src_stride,
                                 uint8_t *p_aref[], int32_t i_ref_stride,
                                 int32_t i_height, uint32_t *pu_sad_array )
{
    int32_t i_ht_cnt;
    uint8_t *p_ref0, *p_ref1, *p_ref2, *p_ref3;
    v16u8 src, ref0, ref1, ref2, ref3, diff;
    v8u16 sad0 = { 0 };
    v8u16 sad1 = { 0 };
    v8u16 sad2 = { 0 };
    v8u16 sad3 = { 0 };

    p_ref0 = p_aref[0];
    p_ref1 = p_aref[1];
    p_ref2 = p_aref[2];
    p_ref3 = p_aref[3];

    for ( i_ht_cnt = ( i_height >> 1 ); i_ht_cnt--; )
    {
        src = LD_UB( p_src );
        p_src += i_src_stride;
        ref0 = LD_UB( p_ref0 );
        p_ref0 += i_ref_stride;
        ref1 = LD_UB( p_ref1 );
        p_ref1 += i_ref_stride;
        ref2 = LD_UB( p_ref2 );
        p_ref2 += i_ref_stride;
        ref3 = LD_UB( p_ref3 );
        p_ref3 += i_ref_stride;

        diff = __msa_asub_u_b( src, ref0 );
        sad0 += __msa_hadd_u_h( diff, diff );
        diff = __msa_asub_u_b( src, ref1 );
        sad1 += __msa_hadd_u_h( diff, diff );
        diff = __msa_asub_u_b( src, ref2 );
        sad2 += __msa_hadd_u_h( diff, diff );
        diff = __msa_asub_u_b( src, ref3 );
        sad3 += __msa_hadd_u_h( diff, diff );

        src = LD_UB( p_src );
        p_src += i_src_stride;
        ref0 = LD_UB( p_ref0 );
        p_ref0 += i_ref_stride;
        ref1 = LD_UB( p_ref1 );
        p_ref1 += i_ref_stride;
        ref2 = LD_UB( p_ref2 );
        p_ref2 += i_ref_stride;
        ref3 = LD_UB( p_ref3 );
        p_ref3 += i_ref_stride;

        diff = __msa_asub_u_b( src, ref0 );
        sad0 += __msa_hadd_u_h( diff, diff );
        diff = __msa_asub_u_b( src, ref1 );
        sad1 += __msa_hadd_u_h( diff, diff );
        diff = __msa_asub_u_b( src, ref2 );
        sad2 += __msa_hadd_u_h( diff, diff );
        diff = __msa_asub_u_b( src, ref3 );
        sad3 += __msa_hadd_u_h( diff, diff );
    }

    pu_sad_array[0] = HADD_UH_U32( sad0 );
    pu_sad_array[1] = HADD_UH_U32( sad1 );
    pu_sad_array[2] = HADD_UH_U32( sad2 );
    pu_sad_array[3] = HADD_UH_U32( sad3 );
}

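/* Pixel variance helpers: return the pixel sum in the low 32 bits and the
 * sum of squared pixels in the high 32 bits of a single uint64_t. */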
static uint64_t avc_pixel_var16width_msa( uint8_t *p_pix, int32_t i_stride,
                                          uint8_t i_height )
{
    uint32_t u_sum = 0, u_sqr_out = 0, u_cnt;
    v16i8 pix, zero = { 0 };
    v8u16 add, pix_r, pix_l;
    v4u32 sqr = { 0 };

    for ( u_cnt = i_height; u_cnt--; )
    {
        pix = LD_SB( p_pix );
        p_pix += i_stride;
        add = __msa_hadd_u_h( ( v16u8 ) pix, ( v16u8 ) pix );
        u_sum += HADD_UH_U32( add );
        ILVRL_B2_UH( zero, pix, pix_r, pix_l );
        sqr = __msa_dpadd_u_w( sqr, pix_r, pix_r );
        sqr = __msa_dpadd_u_w( sqr, pix_l, pix_l );
    }

    u_sqr_out = HADD_SW_S32( sqr );

    return ( u_sum + ( ( uint64_t ) u_sqr_out << 32 ) );
}

static uint64_t avc_pixel_var8width_msa( uint8_t *p_pix, int32_t i_stride,
                                         uint8_t i_height )
{
    uint32_t u_sum = 0, u_sqr_out = 0, u_cnt;
    v16i8 pix, zero = { 0 };
    v8u16 add, pix_r;
    v4u32 sqr = { 0 };

    for ( u_cnt = i_height; u_cnt--; )
    {
        pix = LD_SB( p_pix );
        p_pix += i_stride;
        pix_r = ( v8u16 ) __msa_ilvr_b( zero, pix );
        add = __msa_hadd_u_h( ( v16u8 ) pix_r, ( v16u8 ) pix_r );
        u_sum += HADD_UH_U32( add );
        sqr = __msa_dpadd_u_w( sqr, pix_r, pix_r );
    }

    u_sqr_out = HADD_SW_S32( sqr );

    return ( u_sum + ( ( uint64_t ) u_sqr_out << 32 ) );
}

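/* 8-wide SSE that also returns the accumulated signed difference through
 * *p_diff; used by the var2 wrappers to compute block variance. */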
static uint32_t sse_diff_8width_msa( uint8_t *p_src, int32_t i_src_stride,
                                     uint8_t *p_ref, int32_t i_ref_stride,
                                     int32_t i_height, int32_t *p_diff )
{
    int32_t i_ht_cnt;
    uint32_t u_sse;
    v16u8 src0, src1, src2, src3;
    v16u8 ref0, ref1, ref2, ref3;
    v8i16 avg = { 0 };
    v4i32 vec, var = { 0 };

    for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
    {
        LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
        p_src += ( 4 * i_src_stride );
        LD_UB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 );
        p_ref += ( 4 * i_ref_stride );

        PCKEV_D4_UB( src1, src0, src3, src2, ref1, ref0, ref3, ref2,
                     src0, src1, ref0, ref1 );
        CALC_MSE_AVG_B( src0, ref0, var, avg );
        CALC_MSE_AVG_B( src1, ref1, var, avg );
    }

    vec = __msa_hadd_s_w( avg, avg );
    *p_diff = HADD_SW_S32( vec );
    u_sse = HADD_SW_S32( var );

    return u_sse;
}

static uint32_t sse_4width_msa( uint8_t *p_src, int32_t i_src_stride,
                                uint8_t *p_ref, int32_t i_ref_stride,
                                int32_t i_height )
{
    int32_t i_ht_cnt;
    uint32_t u_sse;
    uint32_t u_src0, u_src1, u_src2, u_src3;
    uint32_t u_ref0, u_ref1, u_ref2, u_ref3;
    v16u8 src = { 0 };
    v16u8 ref = { 0 };
    v4i32 var = { 0 };

    for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
    {
        LW4( p_src, i_src_stride, u_src0, u_src1, u_src2, u_src3 );
        p_src += ( 4 * i_src_stride );
        LW4( p_ref, i_ref_stride, u_ref0, u_ref1, u_ref2, u_ref3 );
        p_ref += ( 4 * i_ref_stride );

        INSERT_W4_UB( u_src0, u_src1, u_src2, u_src3, src );
        INSERT_W4_UB( u_ref0, u_ref1, u_ref2, u_ref3, ref );
        CALC_MSE_B( src, ref, var );
    }

    u_sse = HADD_SW_S32( var );

    return u_sse;
}

static uint32_t sse_8width_msa( uint8_t *p_src, int32_t i_src_stride,
                                uint8_t *p_ref, int32_t i_ref_stride,
                                int32_t i_height )
{
    int32_t i_ht_cnt;
    uint32_t u_sse;
    v16u8 src0, src1, src2, src3;
    v16u8 ref0, ref1, ref2, ref3;
    v4i32 var = { 0 };

    for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
    {
        LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
        p_src += ( 4 * i_src_stride );
        LD_UB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 );
        p_ref += ( 4 * i_ref_stride );

        PCKEV_D4_UB( src1, src0, src3, src2, ref1, ref0, ref3, ref2,
                     src0, src1, ref0, ref1 );
        CALC_MSE_B( src0, ref0, var );
        CALC_MSE_B( src1, ref1, var );
    }

    u_sse = HADD_SW_S32( var );

    return u_sse;
}

static uint32_t sse_16width_msa( uint8_t *p_src, int32_t i_src_stride,
                                 uint8_t *p_ref, int32_t i_ref_stride,
                                 int32_t i_height )
{
    int32_t i_ht_cnt;
    uint32_t u_sse;
    v16u8 src, ref;
    v4i32 var = { 0 };

    for ( i_ht_cnt = ( i_height >> 2 ); i_ht_cnt--; )
    {
        src = LD_UB( p_src );
        p_src += i_src_stride;
        ref = LD_UB( p_ref );
        p_ref += i_ref_stride;
        CALC_MSE_B( src, ref, var );

        src = LD_UB( p_src );
        p_src += i_src_stride;
        ref = LD_UB( p_ref );
        p_ref += i_ref_stride;
        CALC_MSE_B( src, ref, var );

        src = LD_UB( p_src );
        p_src += i_src_stride;
        ref = LD_UB( p_ref );
        p_ref += i_ref_stride;
        CALC_MSE_B( src, ref, var );

        src = LD_UB( p_src );
        p_src += i_src_stride;
        ref = LD_UB( p_ref );
        p_ref += i_ref_stride;
        CALC_MSE_B( src, ref, var );
    }

    u_sse = HADD_SW_S32( var );

    return u_sse;
}

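/* SSIM 4x4x2 core: for two horizontally adjacent 4x4 blocks, compute the
 * sum of src pixels, sum of ref pixels, sum of squares of both, and the
 * src*ref cross sum, stored per block in pi_sum_array[block][0..3]. */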
static void ssim_4x4x2_core_msa( const uint8_t *p_src, int32_t i_src_stride,
                                 const uint8_t *p_ref, int32_t i_ref_stride,
                                 int32_t pi_sum_array[2][4] )
{
    v16i8 zero = { 0 };
    v16u8 src0, src1, src2, src3, ref0, ref1, ref2, ref3;
    v8u16 temp0, temp1, temp2, temp3;
    v8u16 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v4u32 tmp0;
    v4i32 tmp2, tmp3;

    LD_UB4( p_src, i_src_stride, src0, src1, src2, src3 );
    p_src += ( 4 * i_src_stride );
    LD_UB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 );
    p_ref += ( 4 * i_ref_stride );

    ILVR_D2_UB( src1, src0, src3, src2, src0, src2 );
    ILVR_D2_UB( ref1, ref0, ref3, ref2, ref0, ref2 );
    HADD_UB2_UH( src0, src2, temp0, temp1 );

    temp2 = ( v8u16 ) __msa_ilvev_w( ( v4i32 ) temp1, ( v4i32 ) temp0 );
    temp3 = ( v8u16 ) __msa_ilvod_w( ( v4i32 ) temp1, ( v4i32 ) temp0 );

    pi_sum_array[0][0] = ( int32_t ) HADD_UH_U32( temp2 );
    pi_sum_array[1][0] = ( int32_t ) HADD_UH_U32( temp3 );

    HADD_UB2_UH( ref0, ref2, temp0, temp1 );

    temp2 = ( v8u16 ) __msa_ilvev_w( ( v4i32 ) temp1, ( v4i32 ) temp0 );
    temp3 = ( v8u16 ) __msa_ilvod_w( ( v4i32 ) temp1, ( v4i32 ) temp0 );

    pi_sum_array[0][1] = ( int32_t ) HADD_UH_U32( temp2 );
    pi_sum_array[1][1] = ( int32_t ) HADD_UH_U32( temp3 );

    ILVR_B4_UH( zero, src0, zero, src2, zero, ref0, zero, ref2, vec0, vec2,
                vec4, vec6 );
    ILVL_B4_UH( zero, src0, zero, src2, zero, ref0, zero, ref2, vec1, vec3,
                vec5, vec7 );

    tmp0 = __msa_dotp_u_w( vec0, vec0 );
    tmp0 = __msa_dpadd_u_w( tmp0, vec1, vec1 );
    tmp0 = __msa_dpadd_u_w( tmp0, vec2, vec2 );
    tmp0 = __msa_dpadd_u_w( tmp0, vec3, vec3 );
    tmp0 = __msa_dpadd_u_w( tmp0, vec4, vec4 );
    tmp0 = __msa_dpadd_u_w( tmp0, vec5, vec5 );
    tmp0 = __msa_dpadd_u_w( tmp0, vec6, vec6 );
    tmp0 = __msa_dpadd_u_w( tmp0, vec7, vec7 );

    tmp2 = ( v4i32 ) __msa_ilvev_d( ( v2i64 ) tmp0, ( v2i64 ) tmp0 );
    tmp3 = ( v4i32 ) __msa_ilvod_d( ( v2i64 ) tmp0, ( v2i64 ) tmp0 );
    tmp2 = ( v4i32 ) __msa_hadd_u_d( ( v4u32 ) tmp2, ( v4u32 ) tmp2 );
    tmp3 = ( v4i32 ) __msa_hadd_u_d( ( v4u32 ) tmp3, ( v4u32 ) tmp3 );

    pi_sum_array[0][2] = __msa_copy_u_w( tmp2, 0 );
    pi_sum_array[1][2] = __msa_copy_u_w( tmp3, 0 );

    tmp0 = __msa_dotp_u_w( vec4, vec0 );
    tmp0 = __msa_dpadd_u_w( tmp0, vec5, vec1 );
    tmp0 = __msa_dpadd_u_w( tmp0, vec6, vec2 );
    tmp0 = __msa_dpadd_u_w( tmp0, vec7, vec3 );

    tmp2 = ( v4i32 ) __msa_ilvev_d( ( v2i64 ) tmp0, ( v2i64 ) tmp0 );
    tmp3 = ( v4i32 ) __msa_ilvod_d( ( v2i64 ) tmp0, ( v2i64 ) tmp0 );
    tmp2 = ( v4i32 ) __msa_hadd_u_d( ( v4u32 ) tmp2, ( v4u32 ) tmp2 );
    tmp3 = ( v4i32 ) __msa_hadd_u_d( ( v4u32 ) tmp3, ( v4u32 ) tmp3 );

    pi_sum_array[0][3] = __msa_copy_u_w( tmp2, 0 );
    pi_sum_array[1][3] = __msa_copy_u_w( tmp3, 0 );
}

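/* SATD helpers: Hadamard-transform 4x4 blocks of src-ref differences and
 * accumulate the absolute transform coefficients; the accumulated total is
 * returned halved. */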
static int32_t pixel_satd_4width_msa( uint8_t *p_src, int32_t i_src_stride,
                                      uint8_t *p_ref, int32_t i_ref_stride,
                                      uint8_t i_height )
{
    int32_t cnt;
    uint32_t u_sum = 0;
    v16i8 src0, src1, src2, src3;
    v16i8 ref0, ref1, ref2, ref3;
    v8i16 zero = { 0 };
    v8i16 diff0, diff1, diff2, diff3;
    v8i16 temp0, temp1, temp2, temp3;

    for ( cnt = i_height >> 2; cnt--; )
    {
        LD_SB4( p_src, i_src_stride, src0, src1, src2, src3 );
        p_src += 4 * i_src_stride;
        LD_SB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 );
        p_ref += 4 * i_ref_stride;

        ILVR_B4_SH( src0, ref0, src1, ref1, src2, ref2, src3, ref3,
                    diff0, diff1, diff2, diff3 );
        HSUB_UB4_SH( diff0, diff1, diff2, diff3, diff0, diff1, diff2, diff3 );
        TRANSPOSE4x4_SH_SH( diff0, diff1, diff2, diff3,
                            diff0, diff1, diff2, diff3 );
        BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 );
        BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 );
        TRANSPOSE4x4_SH_SH( diff0, diff1, diff2, diff3,
                            diff0, diff1, diff2, diff3 );
        BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 );
        BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 );

        diff0 = __msa_add_a_h( diff0, zero );
        diff1 = __msa_add_a_h( diff1, zero );
        diff2 = __msa_add_a_h( diff2, zero );
        diff3 = __msa_add_a_h( diff3, zero );
        diff0 = ( diff0 + diff1 + diff2 + diff3 );
        diff0 = ( v8i16 ) __msa_hadd_u_w( ( v8u16 ) diff0, ( v8u16 ) diff0 );
        diff0 = ( v8i16 ) __msa_hadd_u_d( ( v4u32 ) diff0, ( v4u32 ) diff0 );
        u_sum += __msa_copy_u_w( ( v4i32 ) diff0, 0 );
    }

    return ( u_sum >> 1 );
}

static int32_t pixel_satd_8width_msa( uint8_t *p_src, int32_t i_src_stride,
                                      uint8_t *p_ref, int32_t i_ref_stride,
                                      uint8_t i_height )
{
    int32_t cnt;
    uint32_t u_sum = 0;
    v16i8 src0, src1, src2, src3;
    v16i8 ref0, ref1, ref2, ref3;
    v8i16 zero = { 0 };
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 temp0, temp1, temp2, temp3;

    for ( cnt = i_height >> 2; cnt--; )
    {
        LD_SB4( p_src, i_src_stride, src0, src1, src2, src3 );
        p_src += 4 * i_src_stride;
        LD_SB4( p_ref, i_ref_stride, ref0, ref1, ref2, ref3 );
        p_ref += 4 * i_ref_stride;

        ILVR_B4_SH( src0, ref0, src1, ref1, src2, ref2, src3, ref3,
                    diff0, diff1, diff2, diff3 );
        HSUB_UB4_SH( diff0, diff1, diff2, diff3, diff0, diff1, diff2, diff3 );
        TRANSPOSE8X4_SH_SH( diff0, diff1, diff2, diff3,
                            diff0, diff2, diff4, diff6 );

        diff1 = ( v8i16 ) __msa_splati_d( ( v2i64 ) diff0, 1 );
        diff3 = ( v8i16 ) __msa_splati_d( ( v2i64 ) diff2, 1 );
        diff5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) diff4, 1 );
        diff7 = ( v8i16 ) __msa_splati_d( ( v2i64 ) diff6, 1 );

        BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 );
        BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 );
        BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 );
        BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 );
        TRANSPOSE4X8_SH_SH( diff0, diff1, diff2, diff3, diff4, diff5, diff6,
                            diff7, diff0, diff1, diff2, diff3, diff4, diff5,
                            diff6, diff7 );
        BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 );
        BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 );

        diff0 = __msa_add_a_h( diff0, zero );
        diff1 = __msa_add_a_h( diff1, zero );
        diff2 = __msa_add_a_h( diff2, zero );
        diff3 = __msa_add_a_h( diff3, zero );
        diff0 = ( diff0 + diff1 + diff2 + diff3 );
        u_sum += HADD_UH_U32( diff0 );
    }

    return ( u_sum >> 1 );
}

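/* 8x8 sum of absolute Hadamard-transformed differences (SA8D) core; the
 * public wrappers normalize the result with ( sum + 2 ) >> 2. */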
static int32_t sa8d_8x8_msa( uint8_t *p_src, int32_t i_src_stride,
                             uint8_t *p_ref, int32_t i_ref_stride )
{
    uint32_t u_sum = 0;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
    v8i16 zero = { 0 };
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7;
    v8i16 temp0, temp1, temp2, temp3;

    LD_SB8( p_src, i_src_stride, src0, src1, src2, src3, src4, src5, src6, src7 );
    LD_SB8( p_ref, i_ref_stride, ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7 );
    ILVR_B4_SH( src0, ref0, src1, ref1, src2, ref2, src3, ref3, sub0, sub1,
                sub2, sub3 );
    ILVR_B4_SH( src4, ref4, src5, ref5, src6, ref6, src7, ref7, sub4, sub5,
                sub6, sub7 );
    HSUB_UB4_SH( sub0, sub1, sub2, sub3, sub0, sub1, sub2, sub3 );
    HSUB_UB4_SH( sub4, sub5, sub6, sub7, sub4, sub5, sub6, sub7 );
    TRANSPOSE8x8_SH_SH( sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
                        sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7 );
    BUTTERFLY_4( sub0, sub2, sub3, sub1, diff0, diff1, diff4, diff5 );
    BUTTERFLY_4( sub4, sub6, sub7, sub5, diff2, diff3, diff7, diff6 );
    BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 );
    BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 );
    BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 );
    BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 );
    TRANSPOSE8x8_SH_SH( diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7,
                        diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7 );
    BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 );
    BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 );
    BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 );
    BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 );

    temp0 = diff0 + diff4;
    temp1 = diff1 + diff5;
    temp2 = diff2 + diff6;
    temp3 = diff3 + diff7;

    temp0 = __msa_add_a_h( temp0, zero );
    temp1 = __msa_add_a_h( temp1, zero );
    temp2 = __msa_add_a_h( temp2, zero );
    temp3 = __msa_add_a_h( temp3, zero );

    diff0 = temp0 + __msa_asub_s_h( diff0, diff4 );
    diff1 = temp1 + __msa_asub_s_h( diff1, diff5 );
    diff2 = temp2 + __msa_asub_s_h( diff2, diff6 );
    diff3 = temp3 + __msa_asub_s_h( diff3, diff7 );
    diff0 = ( diff0 + diff1 + diff2 + diff3 );

    u_sum = HADD_UH_U32( diff0 );

    return u_sum;
}

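/* 8x8 Hadamard AC energy: returns the 4x4-stage coefficient sum in the low
 * 32 bits and the full 8x8-stage sum in the high 32 bits of one uint64_t,
 * with the DC terms subtracted from both. */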
static uint64_t pixel_hadamard_ac_8x8_msa( uint8_t *p_pix, int32_t i_stride )
{
    int16_t tmp0, tmp1, tmp2, tmp3;
    uint32_t u_sum4 = 0, u_sum8 = 0, u_dc;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 zero = { 0 };
    v8i16 diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7;
    v8i16 sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7;
    v8i16 temp0, temp1, temp2, temp3;

    LD_UB8( p_pix, i_stride, src0, src1, src2, src3, src4, src5, src6, src7 );

    ILVR_B4_SH( zero, src0, zero, src1, zero, src2, zero, src3, diff0, diff1,
                diff2, diff3 );
    ILVR_B4_SH( zero, src4, zero, src5, zero, src6, zero, src7, diff4, diff5,
                diff6, diff7 );
    TRANSPOSE8x8_SH_SH( diff0, diff1, diff2, diff3,
                        diff4, diff5, diff6, diff7,
                        diff0, diff1, diff2, diff3,
                        diff4, diff5, diff6, diff7 );
    BUTTERFLY_4( diff0, diff2, diff3, diff1,
                 temp0, temp2, temp3, temp1 );
    BUTTERFLY_4( temp0, temp1, temp3, temp2,
                 diff0, diff1, diff3, diff2 );
    BUTTERFLY_4( diff4, diff6, diff7, diff5,
                 temp0, temp2, temp3, temp1 );
    BUTTERFLY_4( temp0, temp1, temp3, temp2,
                 diff4, diff5, diff7, diff6 );
    TRANSPOSE8x8_SH_SH( diff0, diff1, diff2, diff3,
                        diff4, diff5, diff6, diff7,
                        diff0, diff1, diff2, diff3,
                        diff4, diff5, diff6, diff7 );
    BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 );
    BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 );
    BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 );
    BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 );

    tmp0 = diff0[0];
    tmp1 = diff0[4];
    tmp2 = diff4[0];
    tmp3 = diff4[4];

    sub0 = __msa_add_a_h( diff0, zero );
    sub1 = __msa_add_a_h( diff1, zero );
    sub2 = __msa_add_a_h( diff2, zero );
    sub3 = __msa_add_a_h( diff3, zero );
    sub4 = __msa_add_a_h( diff4, zero );
    sub5 = __msa_add_a_h( diff5, zero );
    sub6 = __msa_add_a_h( diff6, zero );
    sub7 = __msa_add_a_h( diff7, zero );

    sub0 = ( sub0 + sub1 + sub2 + sub3 );
    sub1 = ( sub4 + sub5 + sub6 + sub7 );
    sub0 += sub1;

    u_sum4 += HADD_UH_U32( sub0 );

    TRANSPOSE8x8_SH_SH( diff0, diff1, diff2, diff3, diff4, diff5, diff6, diff7,
                        sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7 );

    ILVR_D2_SH( sub2, sub0, sub6, sub4, diff0, diff1 );
    ILVR_D2_SH( sub3, sub1, sub7, sub5, diff4, diff6 );

    diff2 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) sub2, ( v2i64 ) sub0 );
    diff3 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) sub6, ( v2i64 ) sub4 );
    diff5 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) sub3, ( v2i64 ) sub1 );
    diff7 = ( v8i16 ) __msa_ilvl_d( ( v2i64 ) sub7, ( v2i64 ) sub5 );

    BUTTERFLY_4( diff0, diff2, diff3, diff1, temp0, temp2, temp3, temp1 );
    BUTTERFLY_4( temp0, temp1, temp3, temp2, diff0, diff1, diff3, diff2 );
    BUTTERFLY_4( diff4, diff6, diff7, diff5, temp0, temp2, temp3, temp1 );
    BUTTERFLY_4( temp0, temp1, temp3, temp2, diff4, diff5, diff7, diff6 );

    sub0 = __msa_add_a_h( diff0, zero );
    sub1 = __msa_add_a_h( diff1, zero );
    sub2 = __msa_add_a_h( diff2, zero );
    sub3 = __msa_add_a_h( diff3, zero );
    sub4 = __msa_add_a_h( diff4, zero );
    sub5 = __msa_add_a_h( diff5, zero );
    sub6 = __msa_add_a_h( diff6, zero );
    sub7 = __msa_add_a_h( diff7, zero );

    sub0 = ( sub0 + sub1 + sub2 + sub3 );
    sub1 = ( sub4 + sub5 + sub6 + sub7 );
    sub0 += sub1;

    u_sum8 += HADD_UH_U32( sub0 );

    u_dc = ( uint16_t ) ( tmp0 + tmp1 + tmp2 + tmp3 );
    u_sum4 = u_sum4 - u_dc;
    u_sum8 = u_sum8 - u_dc;

    return ( ( uint64_t ) u_sum8 << 32 ) + u_sum4;
}

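/* Exported x264 pixel primitives: thin wrappers that bind each supported
 * block size and intra prediction mode to the generic MSA helpers above. */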
int32_t x264_pixel_sad_16x16_msa( uint8_t *p_src, intptr_t i_src_stride,
                                  uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sad_16width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 );
}

int32_t x264_pixel_sad_16x8_msa( uint8_t *p_src, intptr_t i_src_stride,
                                 uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sad_16width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 );
}

int32_t x264_pixel_sad_8x16_msa( uint8_t *p_src, intptr_t i_src_stride,
                                 uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sad_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 );
}

int32_t x264_pixel_sad_8x8_msa( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sad_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 );
}

int32_t x264_pixel_sad_8x4_msa( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sad_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 4 );
}

int32_t x264_pixel_sad_4x16_msa( uint8_t *p_src, intptr_t i_src_stride,
                                 uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sad_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 );
}

int32_t x264_pixel_sad_4x8_msa( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sad_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 );
}

int32_t x264_pixel_sad_4x4_msa( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sad_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 4 );
}

void x264_pixel_sad_x4_16x16_msa( uint8_t *p_src, uint8_t *p_ref0,
                                  uint8_t *p_ref1, uint8_t *p_ref2,
                                  uint8_t *p_ref3, intptr_t i_ref_stride,
                                  int32_t p_sad_array[4] )
{
    uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 };

    sad_16width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 16,
                         ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x4_16x8_msa( uint8_t *p_src, uint8_t *p_ref0,
                                 uint8_t *p_ref1, uint8_t *p_ref2,
                                 uint8_t *p_ref3, intptr_t i_ref_stride,
                                 int32_t p_sad_array[4] )
{
    uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 };

    sad_16width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 8,
                         ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x4_8x16_msa( uint8_t *p_src, uint8_t *p_ref0,
                                 uint8_t *p_ref1, uint8_t *p_ref2,
                                 uint8_t *p_ref3, intptr_t i_ref_stride,
                                 int32_t p_sad_array[4] )
{
    uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 };

    sad_8width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 16,
                        ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x4_8x8_msa( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                uint8_t *p_ref3, intptr_t i_ref_stride,
                                int32_t p_sad_array[4] )
{
    uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 };

    sad_8width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 8,
                        ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x4_8x4_msa( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                uint8_t *p_ref3, intptr_t i_ref_stride,
                                int32_t p_sad_array[4] )
{
    uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 };

    sad_8width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 4,
                        ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x4_4x8_msa( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                uint8_t *p_ref3, intptr_t i_ref_stride,
                                int32_t p_sad_array[4] )
{
    uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 };

    sad_4width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 8,
                        ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x4_4x4_msa( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                uint8_t *p_ref3, intptr_t i_ref_stride,
                                int32_t p_sad_array[4] )
{
    uint8_t *p_aref[4] = { p_ref0, p_ref1, p_ref2, p_ref3 };

    sad_4width_x4d_msa( p_src, FENC_STRIDE, p_aref, i_ref_stride, 4,
                        ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x3_16x16_msa( uint8_t *p_src, uint8_t *p_ref0,
                                  uint8_t *p_ref1, uint8_t *p_ref2,
                                  intptr_t i_ref_stride,
                                  int32_t p_sad_array[3] )
{
    sad_16width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2,
                         i_ref_stride, 16, ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x3_16x8_msa( uint8_t *p_src, uint8_t *p_ref0,
                                 uint8_t *p_ref1, uint8_t *p_ref2,
                                 intptr_t i_ref_stride,
                                 int32_t p_sad_array[3] )
{
    sad_16width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2,
                         i_ref_stride, 8, ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x3_8x16_msa( uint8_t *p_src, uint8_t *p_ref0,
                                 uint8_t *p_ref1, uint8_t *p_ref2,
                                 intptr_t i_ref_stride,
                                 int32_t p_sad_array[3] )
{
    sad_8width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2,
                        i_ref_stride, 16, ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x3_8x8_msa( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                intptr_t i_ref_stride,
                                int32_t p_sad_array[3] )
{
    sad_8width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2,
                        i_ref_stride, 8, ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x3_8x4_msa( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                intptr_t i_ref_stride,
                                int32_t p_sad_array[3] )
{
    sad_8width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2,
                        i_ref_stride, 4, ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x3_4x8_msa( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                intptr_t i_ref_stride,
                                int32_t p_sad_array[3] )
{
    sad_4width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2,
                        i_ref_stride, 8, ( uint32_t * ) p_sad_array );
}

void x264_pixel_sad_x3_4x4_msa( uint8_t *p_src, uint8_t *p_ref0,
                                uint8_t *p_ref1, uint8_t *p_ref2,
                                intptr_t i_ref_stride,
                                int32_t p_sad_array[3] )
{
    sad_4width_x3d_msa( p_src, FENC_STRIDE, p_ref0, p_ref1, p_ref2,
                        i_ref_stride, 4, ( uint32_t * ) p_sad_array );
}

int32_t x264_pixel_ssd_16x16_msa( uint8_t *p_src, intptr_t i_src_stride,
                                  uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sse_16width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 );
}

int32_t x264_pixel_ssd_16x8_msa( uint8_t *p_src, intptr_t i_src_stride,
                                 uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sse_16width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 );
}

int32_t x264_pixel_ssd_8x16_msa( uint8_t *p_src, intptr_t i_src_stride,
                                 uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sse_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 );
}

int32_t x264_pixel_ssd_8x8_msa( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sse_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 );
}

int32_t x264_pixel_ssd_8x4_msa( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sse_8width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 4 );
}

int32_t x264_pixel_ssd_4x16_msa( uint8_t *p_src, intptr_t i_src_stride,
                                 uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sse_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 16 );
}

int32_t x264_pixel_ssd_4x8_msa( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sse_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 8 );
}

int32_t x264_pixel_ssd_4x4_msa( uint8_t *p_src, intptr_t i_src_stride,
                                uint8_t *p_ref, intptr_t i_ref_stride )
{
    return sse_4width_msa( p_src, i_src_stride, p_ref, i_ref_stride, 4 );
}

void x264_intra_sad_x3_4x4_msa( uint8_t *p_enc, uint8_t *p_dec,
                                int32_t p_sad_array[3] )
{
    x264_intra_predict_vert_4x4_msa( p_dec );
    p_sad_array[0] = x264_pixel_sad_4x4_msa( p_dec, FDEC_STRIDE,
                                             p_enc, FENC_STRIDE );

    x264_intra_predict_hor_4x4_msa( p_dec );
    p_sad_array[1] = x264_pixel_sad_4x4_msa( p_dec, FDEC_STRIDE,
                                             p_enc, FENC_STRIDE );

    x264_intra_predict_dc_4x4_msa( p_dec );
    p_sad_array[2] = x264_pixel_sad_4x4_msa( p_dec, FDEC_STRIDE,
                                             p_enc, FENC_STRIDE );
}

void x264_intra_sad_x3_16x16_msa( uint8_t *p_enc, uint8_t *p_dec,
                                  int32_t p_sad_array[3] )
{
    x264_intra_predict_vert_16x16_msa( p_dec );
    p_sad_array[0] = x264_pixel_sad_16x16_msa( p_dec, FDEC_STRIDE,
                                               p_enc, FENC_STRIDE );

    x264_intra_predict_hor_16x16_msa( p_dec );
    p_sad_array[1] = x264_pixel_sad_16x16_msa( p_dec, FDEC_STRIDE,
                                               p_enc, FENC_STRIDE );

    x264_intra_predict_dc_16x16_msa( p_dec );
    p_sad_array[2] = x264_pixel_sad_16x16_msa( p_dec, FDEC_STRIDE,
                                               p_enc, FENC_STRIDE );
}

void x264_intra_sad_x3_8x8_msa( uint8_t *p_enc, uint8_t p_edge[36],
                                int32_t p_sad_array[3] )
{
    ALIGNED_ARRAY_16( uint8_t, pix, [8 * FDEC_STRIDE] );

    x264_intra_predict_v_8x8_msa( pix, p_edge );
    p_sad_array[0] = x264_pixel_sad_8x8_msa( pix, FDEC_STRIDE,
                                             p_enc, FENC_STRIDE );

    x264_intra_predict_h_8x8_msa( pix, p_edge );
    p_sad_array[1] = x264_pixel_sad_8x8_msa( pix, FDEC_STRIDE,
                                             p_enc, FENC_STRIDE );

    x264_intra_predict_dc_8x8_msa( pix, p_edge );
    p_sad_array[2] = x264_pixel_sad_8x8_msa( pix, FDEC_STRIDE,
                                             p_enc, FENC_STRIDE );
}

void x264_intra_sad_x3_8x8c_msa( uint8_t *p_enc, uint8_t *p_dec,
                                 int32_t p_sad_array[3] )
{
    x264_intra_predict_dc_4blk_8x8_msa( p_dec );
    p_sad_array[0] = x264_pixel_sad_8x8_msa( p_dec, FDEC_STRIDE,
                                             p_enc, FENC_STRIDE );

    x264_intra_predict_hor_8x8_msa( p_dec );
    p_sad_array[1] = x264_pixel_sad_8x8_msa( p_dec, FDEC_STRIDE,
                                             p_enc, FENC_STRIDE );

    x264_intra_predict_vert_8x8_msa( p_dec );
    p_sad_array[2] = x264_pixel_sad_8x8_msa( p_dec, FDEC_STRIDE,
                                             p_enc, FENC_STRIDE );
}

void x264_ssim_4x4x2_core_msa( const uint8_t *p_pix1, intptr_t i_stride1,
                               const uint8_t *p_pix2, intptr_t i_stride2,
                               int32_t i_sums[2][4] )
{
    ssim_4x4x2_core_msa( p_pix1, i_stride1, p_pix2, i_stride2, i_sums );
}

uint64_t x264_pixel_hadamard_ac_8x8_msa( uint8_t *p_pix, intptr_t i_stride )
{
    uint64_t u_sum;

    u_sum = pixel_hadamard_ac_8x8_msa( p_pix, i_stride );

    return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 );
}

uint64_t x264_pixel_hadamard_ac_8x16_msa( uint8_t *p_pix, intptr_t i_stride )
{
    uint64_t u_sum;

    u_sum = pixel_hadamard_ac_8x8_msa( p_pix, i_stride );
    u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8 * i_stride, i_stride );

    return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 );
}

uint64_t x264_pixel_hadamard_ac_16x8_msa( uint8_t *p_pix, intptr_t i_stride )
{
    uint64_t u_sum;

    u_sum = pixel_hadamard_ac_8x8_msa( p_pix, i_stride );
    u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8, i_stride );

    return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 );
}

uint64_t x264_pixel_hadamard_ac_16x16_msa( uint8_t *p_pix, intptr_t i_stride )
{
    uint64_t u_sum;

    u_sum = pixel_hadamard_ac_8x8_msa( p_pix, i_stride );
    u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8, i_stride );
    u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8 * i_stride, i_stride );
    u_sum += pixel_hadamard_ac_8x8_msa( p_pix + 8 * i_stride + 8, i_stride );

    return ( ( u_sum >> 34 ) << 32 ) + ( ( uint32_t ) u_sum >> 1 );
}

int32_t x264_pixel_satd_4x4_msa( uint8_t *p_pix1, intptr_t i_stride,
                                 uint8_t *p_pix2, intptr_t i_stride2 )
{
    return pixel_satd_4width_msa( p_pix1, i_stride, p_pix2, i_stride2, 4 );
}

int32_t x264_pixel_satd_4x8_msa( uint8_t *p_pix1, intptr_t i_stride,
                                 uint8_t *p_pix2, intptr_t i_stride2 )
{
    return pixel_satd_4width_msa( p_pix1, i_stride, p_pix2, i_stride2, 8 );
}

int32_t x264_pixel_satd_4x16_msa( uint8_t *p_pix1, intptr_t i_stride,
                                  uint8_t *p_pix2, intptr_t i_stride2 )
{
    return pixel_satd_4width_msa( p_pix1, i_stride, p_pix2, i_stride2, 16 );
}

int32_t x264_pixel_satd_8x4_msa( uint8_t *p_pix1, intptr_t i_stride,
                                 uint8_t *p_pix2, intptr_t i_stride2 )
{
    return pixel_satd_8width_msa( p_pix1, i_stride, p_pix2, i_stride2, 4 );
}

int32_t x264_pixel_satd_8x8_msa( uint8_t *p_pix1, intptr_t i_stride,
                                 uint8_t *p_pix2, intptr_t i_stride2 )
{
    return pixel_satd_8width_msa( p_pix1, i_stride, p_pix2, i_stride2, 8 );
}

int32_t x264_pixel_satd_8x16_msa( uint8_t *p_pix1, intptr_t i_stride,
                                  uint8_t *p_pix2, intptr_t i_stride2 )
{
    return pixel_satd_8width_msa( p_pix1, i_stride, p_pix2, i_stride2, 16 );
}

int32_t x264_pixel_satd_16x8_msa( uint8_t *p_pix1, intptr_t i_stride,
                                  uint8_t *p_pix2, intptr_t i_stride2 )
{
    uint32_t u32Sum = 0;

    u32Sum = pixel_satd_8width_msa( p_pix1, i_stride, p_pix2, i_stride2, 8 );
    u32Sum += pixel_satd_8width_msa( p_pix1 + 8, i_stride,
                                     p_pix2 + 8, i_stride2, 8 );

    return u32Sum;
}

int32_t x264_pixel_satd_16x16_msa( uint8_t *p_pix1, intptr_t i_stride,
                                   uint8_t *p_pix2, intptr_t i_stride2 )
{
    uint32_t u32Sum = 0;

    u32Sum = pixel_satd_8width_msa( p_pix1, i_stride, p_pix2, i_stride2, 16 );
    u32Sum += pixel_satd_8width_msa( p_pix1 + 8, i_stride,
                                     p_pix2 + 8, i_stride2, 16 );

    return u32Sum;
}

int32_t x264_pixel_sa8d_8x8_msa( uint8_t *p_pix1, intptr_t i_stride,
                                 uint8_t *p_pix2, intptr_t i_stride2 )
{
    int32_t i32Sum = sa8d_8x8_msa( p_pix1, i_stride, p_pix2, i_stride2 );

    return ( i32Sum + 2 ) >> 2;
}

int32_t x264_pixel_sa8d_16x16_msa( uint8_t *p_pix1, intptr_t i_stride,
                                   uint8_t *p_pix2, intptr_t i_stride2 )
{
    int32_t i32Sum = sa8d_8x8_msa( p_pix1, i_stride, p_pix2, i_stride2 ) +
                     sa8d_8x8_msa( p_pix1 + 8, i_stride,
                                   p_pix2 + 8, i_stride2 ) +
                     sa8d_8x8_msa( p_pix1 + 8 * i_stride, i_stride,
                                   p_pix2 + 8 * i_stride2, i_stride2 ) +
                     sa8d_8x8_msa( p_pix1 + 8 + 8 * i_stride, i_stride,
                                   p_pix2 + 8 + 8 * i_stride2, i_stride2 );

    return ( i32Sum + 2 ) >> 2;
}

void x264_intra_satd_x3_4x4_msa( uint8_t *p_enc, uint8_t *p_dec,
                                 int32_t p_sad_array[3] )
{
    x264_intra_predict_vert_4x4_msa( p_dec );
    p_sad_array[0] = x264_pixel_satd_4x4_msa( p_dec, FDEC_STRIDE,
                                              p_enc, FENC_STRIDE );

    x264_intra_predict_hor_4x4_msa( p_dec );
    p_sad_array[1] = x264_pixel_satd_4x4_msa( p_dec, FDEC_STRIDE,
                                              p_enc, FENC_STRIDE );

    x264_intra_predict_dc_4x4_msa( p_dec );
    p_sad_array[2] = x264_pixel_satd_4x4_msa( p_dec, FDEC_STRIDE,
                                              p_enc, FENC_STRIDE );
}

void x264_intra_satd_x3_16x16_msa( uint8_t *p_enc, uint8_t *p_dec,
                                   int32_t p_sad_array[3] )
{
    x264_intra_predict_vert_16x16_msa( p_dec );
    p_sad_array[0] = x264_pixel_satd_16x16_msa( p_dec, FDEC_STRIDE,
                                                p_enc, FENC_STRIDE );

    x264_intra_predict_hor_16x16_msa( p_dec );
    p_sad_array[1] = x264_pixel_satd_16x16_msa( p_dec, FDEC_STRIDE,
                                                p_enc, FENC_STRIDE );

    x264_intra_predict_dc_16x16_msa( p_dec );
    p_sad_array[2] = x264_pixel_satd_16x16_msa( p_dec, FDEC_STRIDE,
                                                p_enc, FENC_STRIDE );
}

void x264_intra_sa8d_x3_8x8_msa( uint8_t *p_enc, uint8_t p_edge[36],
                                 int32_t p_sad_array[3] )
{
    ALIGNED_ARRAY_16( uint8_t, pix, [8 * FDEC_STRIDE] );

    x264_intra_predict_v_8x8_msa( pix, p_edge );
    p_sad_array[0] = x264_pixel_sa8d_8x8_msa( pix, FDEC_STRIDE,
                                              p_enc, FENC_STRIDE );

    x264_intra_predict_h_8x8_msa( pix, p_edge );
    p_sad_array[1] = x264_pixel_sa8d_8x8_msa( pix, FDEC_STRIDE,
                                              p_enc, FENC_STRIDE );

    x264_intra_predict_dc_8x8_msa( pix, p_edge );
    p_sad_array[2] = x264_pixel_sa8d_8x8_msa( pix, FDEC_STRIDE,
                                              p_enc, FENC_STRIDE );
}

void x264_intra_satd_x3_8x8c_msa( uint8_t *p_enc, uint8_t *p_dec,
                                  int32_t p_sad_array[3] )
{
    x264_intra_predict_dc_4blk_8x8_msa( p_dec );
    p_sad_array[0] = x264_pixel_satd_8x8_msa( p_dec, FDEC_STRIDE,
                                              p_enc, FENC_STRIDE );

    x264_intra_predict_hor_8x8_msa( p_dec );
    p_sad_array[1] = x264_pixel_satd_8x8_msa( p_dec, FDEC_STRIDE,
                                              p_enc, FENC_STRIDE );

    x264_intra_predict_vert_8x8_msa( p_dec );
    p_sad_array[2] = x264_pixel_satd_8x8_msa( p_dec, FDEC_STRIDE,
                                              p_enc, FENC_STRIDE );
}

uint64_t x264_pixel_var_16x16_msa( uint8_t *p_pix, intptr_t i_stride )
{
    return avc_pixel_var16width_msa( p_pix, i_stride, 16 );
}

uint64_t x264_pixel_var_8x16_msa( uint8_t *p_pix, intptr_t i_stride )
{
    return avc_pixel_var8width_msa( p_pix, i_stride, 16 );
}

uint64_t x264_pixel_var_8x8_msa( uint8_t *p_pix, intptr_t i_stride )
{
    return avc_pixel_var8width_msa( p_pix, i_stride, 8 );
}

int32_t x264_pixel_var2_8x16_msa( uint8_t *p_pix1, intptr_t i_stride1,
                                  uint8_t *p_pix2, intptr_t i_stride2,
                                  int32_t *p_ssd )
{
    int32_t i_var = 0, i_diff = 0, i_sqr = 0;

    i_sqr = sse_diff_8width_msa( p_pix1, i_stride1, p_pix2, i_stride2, 16,
                                 &i_diff );
    i_var = VARIANCE_WxH( i_sqr, i_diff, 7 );
    *p_ssd = i_sqr;

    return i_var;
}

int32_t x264_pixel_var2_8x8_msa( uint8_t *p_pix1, intptr_t i_stride1,
                                 uint8_t *p_pix2, intptr_t i_stride2,
                                 int32_t *p_ssd )
{
    int32_t i_var = 0, i_diff = 0, i_sqr = 0;

    i_sqr = sse_diff_8width_msa( p_pix1, i_stride1,
                                 p_pix2, i_stride2, 8, &i_diff );
    i_var = VARIANCE_WxH( i_sqr, i_diff, 6 );
    *p_ssd = i_sqr;

    return i_var;
}
#endif