/*****************************************************************************
 * deblock-c.c: msa deblocking
 *****************************************************************************
 * Copyright (C) 2015-2016 x264 project
 *
 * Authors: Neha Rana <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at [email protected].
 *****************************************************************************/

#include "common/common.h"
#include "macros.h"

#if !HIGH_BIT_DEPTH
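/* Strong-filter (bS = 4) update of p0/p1/p2 (or, with mirrored arguments,
 * q0/q1/q2):
 *   p0' = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3
 *   p1' = ( p2 + p1 + p0 + q0 + 2 ) >> 2
 *   p2' = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3 */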
#define AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_or_q3_org_in, p0_or_q0_org_in, \
                                  q3_or_p3_org_in, p1_or_q1_org_in, \
                                  p2_or_q2_org_in, q1_or_p1_org_in, \
                                  p0_or_q0_out, p1_or_q1_out, p2_or_q2_out ) \
{ \
    v8i16 threshold; \
    v8i16 const3 = __msa_ldi_h( 3 ); \
 \
    threshold = p0_or_q0_org_in + q3_or_p3_org_in; \
    threshold += p1_or_q1_org_in; \
 \
    p0_or_q0_out = threshold << 1; \
    p0_or_q0_out += p2_or_q2_org_in; \
    p0_or_q0_out += q1_or_p1_org_in; \
    p0_or_q0_out = __msa_srari_h( p0_or_q0_out, 3 ); \
 \
    p1_or_q1_out = p2_or_q2_org_in + threshold; \
    p1_or_q1_out = __msa_srari_h( p1_or_q1_out, 2 ); \
 \
    p2_or_q2_out = p2_or_q2_org_in * const3; \
    p2_or_q2_out += p3_or_q3_org_in; \
    p2_or_q2_out += p3_or_q3_org_in; \
    p2_or_q2_out += threshold; \
    p2_or_q2_out = __msa_srari_h( p2_or_q2_out, 3 ); \
}

/* data[-u32_u_img_width] = ( uint8_t )( ( 2 * p1 + p0 + q1 + 2 ) >> 2 ); */
#define AVC_LPF_P0_OR_Q0( p0_or_q0_org_in, q1_or_p1_org_in, \
                          p1_or_q1_org_in, p0_or_q0_out ) \
{ \
    p0_or_q0_out = p0_or_q0_org_in + q1_or_p1_org_in; \
    p0_or_q0_out += p1_or_q1_org_in; \
    p0_or_q0_out += p1_or_q1_org_in; \
    p0_or_q0_out = __msa_srari_h( p0_or_q0_out, 2 ); \
}

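/* p1' = p1 + clip3( -tc0, tc0,
 *                   ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - 2*p1 ) >> 1 ) */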
#define AVC_LPF_P1_OR_Q1( p0_or_q0_org_in, q0_or_p0_org_in, \
                          p1_or_q1_org_in, p2_or_q2_org_in, \
                          negate_tc_in, tc_in, p1_or_q1_out ) \
{ \
    v8i16 clip3, temp; \
 \
    clip3 = ( v8i16 ) __msa_aver_u_h( ( v8u16 ) p0_or_q0_org_in, \
                                      ( v8u16 ) q0_or_p0_org_in ); \
    temp = p1_or_q1_org_in << 1; \
    clip3 -= temp; \
    clip3 = __msa_ave_s_h( p2_or_q2_org_in, clip3 ); \
    clip3 = CLIP_SH( clip3, negate_tc_in, tc_in ); \
    p1_or_q1_out = p1_or_q1_org_in + clip3; \
}

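/* delta = clip3( -tc, tc, ( ( ( q0 - p0 ) << 2 ) + ( p1 - q1 ) + 4 ) >> 3 )
 * p0' = clip_0_255( p0 + delta ), q0' = clip_0_255( q0 - delta ) */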
#define AVC_LPF_P0Q0( q0_or_p0_org_in, p0_or_q0_org_in, \
                      p1_or_q1_org_in, q1_or_p1_org_in, \
                      negate_threshold_in, threshold_in, \
                      p0_or_q0_out, q0_or_p0_out ) \
{ \
    v8i16 q0_sub_p0, p1_sub_q1, delta; \
 \
    q0_sub_p0 = q0_or_p0_org_in - p0_or_q0_org_in; \
    p1_sub_q1 = p1_or_q1_org_in - q1_or_p1_org_in; \
    q0_sub_p0 <<= 2; \
    p1_sub_q1 += 4; \
    delta = q0_sub_p0 + p1_sub_q1; \
    delta >>= 3; \
 \
    delta = CLIP_SH( delta, negate_threshold_in, threshold_in ); \
 \
    p0_or_q0_out = p0_or_q0_org_in + delta; \
    q0_or_p0_out = q0_or_p0_org_in - delta; \
 \
    CLIP_SH2_0_255( p0_or_q0_out, q0_or_p0_out ); \
}

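/* Intra (bS = 4) luma filtering of a horizontal edge: conditionally rewrites
 * p0..p2 and q0..q2 across a 16-pixel wide edge row at p_data. */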
static void avc_loopfilter_luma_intra_edge_hor_msa( uint8_t *p_data,
                                                    uint8_t u_alpha_in,
                                                    uint8_t u_beta_in,
                                                    uint32_t u_img_width )
{
    v16u8 p2_asub_p0, q2_asub_q0, p0_asub_q0;
    v16u8 alpha, beta;
    v16u8 is_less_than, is_less_than_beta, negate_is_less_than_beta;
    v16u8 p2, p1, p0, q0, q1, q2;
    v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
    v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
    v8i16 p2_r = { 0 };
    v8i16 p1_r = { 0 };
    v8i16 p0_r = { 0 };
    v8i16 q0_r = { 0 };
    v8i16 q1_r = { 0 };
    v8i16 q2_r = { 0 };
    v8i16 p2_l = { 0 };
    v8i16 p1_l = { 0 };
    v8i16 p0_l = { 0 };
    v8i16 q0_l = { 0 };
    v8i16 q1_l = { 0 };
    v8i16 q2_l = { 0 };
    v16u8 tmp_flag;
    v16i8 zero = { 0 };

    alpha = ( v16u8 ) __msa_fill_b( u_alpha_in );
    beta = ( v16u8 ) __msa_fill_b( u_beta_in );

    LD_UB4( p_data - ( u_img_width << 1 ), u_img_width,
            p1_org, p0_org, q0_org, q1_org );

    {
        v16u8 p1_asub_p0, q1_asub_q0, is_less_than_alpha;

        p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org );
        p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org );
        q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org );

        is_less_than_alpha = ( p0_asub_q0 < alpha );
        is_less_than_beta = ( p1_asub_p0 < beta );
        is_less_than = is_less_than_beta & is_less_than_alpha;
        is_less_than_beta = ( q1_asub_q0 < beta );
        is_less_than = is_less_than_beta & is_less_than;
    }

    if( !__msa_test_bz_v( is_less_than ) )
    {
        q2_org = LD_UB( p_data + ( 2 * u_img_width ) );
        p3_org = LD_UB( p_data - ( u_img_width << 2 ) );
        p2_org = LD_UB( p_data - ( 3 * u_img_width ) );

        UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l );
        UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l );
        UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l );

        tmp_flag = alpha >> 2;
        tmp_flag = tmp_flag + 2;
        tmp_flag = ( p0_asub_q0 < tmp_flag );

        p2_asub_p0 = __msa_asub_u_b( p2_org, p0_org );
        is_less_than_beta = ( p2_asub_p0 < beta );
        is_less_than_beta = is_less_than_beta & tmp_flag;
        negate_is_less_than_beta = __msa_xori_b( is_less_than_beta, 0xff );
        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;
        {
            v8u16 is_less_than_beta_l, is_less_than_beta_r;

            q1_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q1_org );

            is_less_than_beta_r =
                ( v8u16 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, zero, 8 );
            if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_r ) )
            {
                v8i16 p3_org_r;

                ILVR_B2_SH( zero, p3_org, zero, p2_org, p3_org_r, p2_r );
                AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_org_r, p0_org_r,
                                          q0_org_r, p1_org_r,
                                          p2_r, q1_org_r, p0_r, p1_r, p2_r );
            }

            q1_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q1_org );

            is_less_than_beta_l =
                ( v8u16 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than_beta, 8 );

            if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_l ) )
            {
                v8i16 p3_org_l;

                ILVL_B2_SH( zero, p3_org, zero, p2_org, p3_org_l, p2_l );
                AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_org_l, p0_org_l,
                                          q0_org_l, p1_org_l,
                                          p2_l, q1_org_l, p0_l, p1_l, p2_l );
            }
        }
        /* combine and store */
        if( !__msa_test_bz_v( is_less_than_beta ) )
        {
            PCKEV_B3_UB( p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2 );

            p0_org = __msa_bmnz_v( p0_org, p0, is_less_than_beta );
            p1_org = __msa_bmnz_v( p1_org, p1, is_less_than_beta );
            p2_org = __msa_bmnz_v( p2_org, p2, is_less_than_beta );

            ST_UB( p1_org, p_data - ( 2 * u_img_width ) );
            ST_UB( p2_org, p_data - ( 3 * u_img_width ) );
        }
        {
            v8u16 negate_is_less_than_beta_r, negate_is_less_than_beta_l;

            negate_is_less_than_beta_r =
                ( v8u16 ) __msa_sldi_b( ( v16i8 ) negate_is_less_than_beta,
                                        zero, 8 );
            if( !__msa_test_bz_v( ( v16u8 ) negate_is_less_than_beta_r ) )
            {
                AVC_LPF_P0_OR_Q0( p0_org_r, q1_org_r, p1_org_r, p0_r );
            }

            negate_is_less_than_beta_l =
                ( v8u16 ) __msa_sldi_b( zero,
                                        ( v16i8 ) negate_is_less_than_beta, 8 );
            if( !__msa_test_bz_v( ( v16u8 ) negate_is_less_than_beta_l ) )
            {
                AVC_LPF_P0_OR_Q0( p0_org_l, q1_org_l, p1_org_l, p0_l );
            }
        }
        if( !__msa_test_bz_v( negate_is_less_than_beta ) )
        {
            p0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) p0_l, ( v16i8 ) p0_r );
            p0_org = __msa_bmnz_v( p0_org, p0, negate_is_less_than_beta );
        }

        ST_UB( p0_org, p_data - u_img_width );

        q3_org = LD_UB( p_data + ( 3 * u_img_width ) );
        q2_asub_q0 = __msa_asub_u_b( q2_org, q0_org );
        is_less_than_beta = ( q2_asub_q0 < beta );
        is_less_than_beta = is_less_than_beta & tmp_flag;
        negate_is_less_than_beta = __msa_xori_b( is_less_than_beta, 0xff );
        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        {
            v8u16 is_less_than_beta_l, is_less_than_beta_r;
            is_less_than_beta_r =
                ( v8u16 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, zero, 8 );
            if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_r ) )
            {
                v8i16 q3_org_r;

                ILVR_B2_SH( zero, q3_org, zero, q2_org, q3_org_r, q2_r );
                AVC_LPF_P0P1P2_OR_Q0Q1Q2( q3_org_r, q0_org_r,
                                          p0_org_r, q1_org_r,
                                          q2_r, p1_org_r, q0_r, q1_r, q2_r );
            }
            is_less_than_beta_l =
                ( v8u16 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than_beta, 8 );
            if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_l ) )
            {
                v8i16 q3_org_l;

                ILVL_B2_SH( zero, q3_org, zero, q2_org, q3_org_l, q2_l );
                AVC_LPF_P0P1P2_OR_Q0Q1Q2( q3_org_l, q0_org_l,
                                          p0_org_l, q1_org_l,
                                          q2_l, p1_org_l, q0_l, q1_l, q2_l );
            }
        }

        if( !__msa_test_bz_v( is_less_than_beta ) )
        {
            PCKEV_B3_UB( q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2 );
            q0_org = __msa_bmnz_v( q0_org, q0, is_less_than_beta );
            q1_org = __msa_bmnz_v( q1_org, q1, is_less_than_beta );
            q2_org = __msa_bmnz_v( q2_org, q2, is_less_than_beta );

            ST_UB( q1_org, p_data + u_img_width );
            ST_UB( q2_org, p_data + 2 * u_img_width );
        }
        {
            v8u16 negate_is_less_than_beta_r, negate_is_less_than_beta_l;
            negate_is_less_than_beta_r =
                ( v8u16 ) __msa_sldi_b( ( v16i8 ) negate_is_less_than_beta,
                                        zero, 8 );
            if( !__msa_test_bz_v( ( v16u8 ) negate_is_less_than_beta_r ) )
            {
                AVC_LPF_P0_OR_Q0( q0_org_r, p1_org_r, q1_org_r, q0_r );
            }

            negate_is_less_than_beta_l =
                ( v8u16 ) __msa_sldi_b( zero,
                                        ( v16i8 ) negate_is_less_than_beta, 8 );
            if( !__msa_test_bz_v( ( v16u8 ) negate_is_less_than_beta_l ) )
            {
                AVC_LPF_P0_OR_Q0( q0_org_l, p1_org_l, q1_org_l, q0_l );
            }
        }
        if( !__msa_test_bz_v( negate_is_less_than_beta ) )
        {
            q0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) q0_l, ( v16i8 ) q0_r );
            q0_org = __msa_bmnz_v( q0_org, q0, negate_is_less_than_beta );
        }

        ST_UB( q0_org, p_data );
    }
}

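/* Intra (bS = 4) luma filtering of a vertical edge: loads a 16x8 block around
 * the edge column, transposes it, filters, and stores the modified columns
 * back with 4x4/2x4 stores. */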
static void avc_loopfilter_luma_intra_edge_ver_msa( uint8_t *p_data,
                                                    uint8_t u_alpha_in,
                                                    uint8_t u_beta_in,
                                                    uint32_t u_img_width )
{
    uint8_t *p_src;
    v16u8 alpha, beta, p0_asub_q0;
    v16u8 is_less_than_alpha, is_less_than;
    v16u8 is_less_than_beta, negate_is_less_than_beta;
    v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
    v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
    v8i16 p2_r = { 0 };
    v8i16 p1_r = { 0 };
    v8i16 p0_r = { 0 };
    v8i16 q0_r = { 0 };
    v8i16 q1_r = { 0 };
    v8i16 q2_r = { 0 };
    v8i16 p2_l = { 0 };
    v8i16 p1_l = { 0 };
    v8i16 p0_l = { 0 };
    v8i16 q0_l = { 0 };
    v8i16 q1_l = { 0 };
    v8i16 q2_l = { 0 };
    v16i8 zero = { 0 };
    v16u8 tmp_flag;

    p_src = p_data - 4;

    {
        v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
        v16u8 row8, row9, row10, row11, row12, row13, row14, row15;

        LD_UB8( p_src, u_img_width,
                row0, row1, row2, row3, row4, row5, row6, row7 );
        LD_UB8( p_src + ( 8 * u_img_width ), u_img_width,
                row8, row9, row10, row11, row12, row13, row14, row15 );

        TRANSPOSE16x8_UB_UB( row0, row1, row2, row3,
                             row4, row5, row6, row7,
                             row8, row9, row10, row11,
                             row12, row13, row14, row15,
                             p3_org, p2_org, p1_org, p0_org,
                             q0_org, q1_org, q2_org, q3_org );
    }

    UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l );
    UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l );
    UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l );
    UNPCK_UB_SH( q1_org, q1_org_r, q1_org_l );

    {
        v16u8 p1_asub_p0, q1_asub_q0;

        p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org );
        p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org );
        q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org );

        alpha = ( v16u8 ) __msa_fill_b( u_alpha_in );
        beta = ( v16u8 ) __msa_fill_b( u_beta_in );

        is_less_than_alpha = ( p0_asub_q0 < alpha );
        is_less_than_beta = ( p1_asub_p0 < beta );
        is_less_than = is_less_than_beta & is_less_than_alpha;
        is_less_than_beta = ( q1_asub_q0 < beta );
        is_less_than = is_less_than_beta & is_less_than;
    }

    if( !__msa_test_bz_v( is_less_than ) )
    {
        tmp_flag = alpha >> 2;
        tmp_flag = tmp_flag + 2;
        tmp_flag = ( p0_asub_q0 < tmp_flag );

        {
            v16u8 p2_asub_p0;

            p2_asub_p0 = __msa_asub_u_b( p2_org, p0_org );
            is_less_than_beta = ( p2_asub_p0 < beta );
        }
        is_less_than_beta = tmp_flag & is_less_than_beta;
        negate_is_less_than_beta = __msa_xori_b( is_less_than_beta, 0xff );
        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        {
            v16u8 is_less_than_beta_r;

            is_less_than_beta_r =
                ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, zero, 8 );
            if( !__msa_test_bz_v( is_less_than_beta_r ) )
            {
                v8i16 p3_org_r;

                ILVR_B2_SH( zero, p3_org, zero, p2_org, p3_org_r, p2_r );
                AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_org_r, p0_org_r,
                                          q0_org_r, p1_org_r,
                                          p2_r, q1_org_r, p0_r, p1_r, p2_r );
            }
        }

        {
            v16u8 is_less_than_beta_l;

            is_less_than_beta_l =
                ( v16u8 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than_beta, 8 );
            if( !__msa_test_bz_v( is_less_than_beta_l ) )
            {
                v8i16 p3_org_l;

                ILVL_B2_SH( zero, p3_org, zero, p2_org, p3_org_l, p2_l );
                AVC_LPF_P0P1P2_OR_Q0Q1Q2( p3_org_l, p0_org_l,
                                          q0_org_l, p1_org_l,
                                          p2_l, q1_org_l, p0_l, p1_l, p2_l );
            }
        }
        if( !__msa_test_bz_v( is_less_than_beta ) )
        {
            v16u8 p0, p2, p1;

            PCKEV_B3_UB( p0_l, p0_r, p1_l, p1_r, p2_l, p2_r, p0, p1, p2 );
            p0_org = __msa_bmnz_v( p0_org, p0, is_less_than_beta );
            p1_org = __msa_bmnz_v( p1_org, p1, is_less_than_beta );
            p2_org = __msa_bmnz_v( p2_org, p2, is_less_than_beta );
        }
        {
            v16u8 negate_is_less_than_beta_r;

            negate_is_less_than_beta_r =
                ( v16u8 ) __msa_sldi_b( ( v16i8 ) negate_is_less_than_beta,
                                        zero, 8 );

            if( !__msa_test_bz_v( negate_is_less_than_beta_r ) )
            {
                AVC_LPF_P0_OR_Q0( p0_org_r, q1_org_r, p1_org_r, p0_r );
            }
        }
        {
            v16u8 negate_is_less_than_beta_l;

            negate_is_less_than_beta_l =
                ( v16u8 ) __msa_sldi_b( zero,
                                        ( v16i8 ) negate_is_less_than_beta, 8 );
            if( !__msa_test_bz_v( negate_is_less_than_beta_l ) )
            {
                AVC_LPF_P0_OR_Q0( p0_org_l, q1_org_l, p1_org_l, p0_l );
            }
        }

        if( !__msa_test_bz_v( negate_is_less_than_beta ) )
        {
            v16u8 p0;

            p0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) p0_l, ( v16i8 ) p0_r );
            p0_org = __msa_bmnz_v( p0_org, p0, negate_is_less_than_beta );
        }

        {
            v16u8 q2_asub_q0;

            q2_asub_q0 = __msa_asub_u_b( q2_org, q0_org );
            is_less_than_beta = ( q2_asub_q0 < beta );
        }

        is_less_than_beta = is_less_than_beta & tmp_flag;
        negate_is_less_than_beta = __msa_xori_b( is_less_than_beta, 0xff );

        is_less_than_beta = is_less_than_beta & is_less_than;
        negate_is_less_than_beta = negate_is_less_than_beta & is_less_than;

        {
            v16u8 is_less_than_beta_r;

            is_less_than_beta_r =
                ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta, zero, 8 );
            if( !__msa_test_bz_v( is_less_than_beta_r ) )
            {
                v8i16 q3_org_r;

                ILVR_B2_SH( zero, q3_org, zero, q2_org, q3_org_r, q2_r );
                AVC_LPF_P0P1P2_OR_Q0Q1Q2( q3_org_r, q0_org_r,
                                          p0_org_r, q1_org_r,
                                          q2_r, p1_org_r, q0_r, q1_r, q2_r );
            }
        }
        {
            v16u8 is_less_than_beta_l;

            is_less_than_beta_l =
                ( v16u8 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than_beta, 8 );
            if( !__msa_test_bz_v( is_less_than_beta_l ) )
            {
                v8i16 q3_org_l;

                ILVL_B2_SH( zero, q3_org, zero, q2_org, q3_org_l, q2_l );
                AVC_LPF_P0P1P2_OR_Q0Q1Q2( q3_org_l, q0_org_l,
                                          p0_org_l, q1_org_l,
                                          q2_l, p1_org_l, q0_l, q1_l, q2_l );
            }
        }
        if( !__msa_test_bz_v( is_less_than_beta ) )
        {
            v16u8 q0, q1, q2;

            PCKEV_B3_UB( q0_l, q0_r, q1_l, q1_r, q2_l, q2_r, q0, q1, q2 );
            q0_org = __msa_bmnz_v( q0_org, q0, is_less_than_beta );
            q1_org = __msa_bmnz_v( q1_org, q1, is_less_than_beta );
            q2_org = __msa_bmnz_v( q2_org, q2, is_less_than_beta );
        }

        {
            v16u8 negate_is_less_than_beta_r;

            negate_is_less_than_beta_r =
                ( v16u8 ) __msa_sldi_b( ( v16i8 ) negate_is_less_than_beta,
                                        zero, 8 );
            if( !__msa_test_bz_v( negate_is_less_than_beta_r ) )
            {
                AVC_LPF_P0_OR_Q0( q0_org_r, p1_org_r, q1_org_r, q0_r );
            }
        }
        {
            v16u8 negate_is_less_than_beta_l;

            negate_is_less_than_beta_l =
                ( v16u8 ) __msa_sldi_b( zero,
                                        ( v16i8 ) negate_is_less_than_beta, 8 );
            if( !__msa_test_bz_v( negate_is_less_than_beta_l ) )
            {
                AVC_LPF_P0_OR_Q0( q0_org_l, p1_org_l, q1_org_l, q0_l );
            }
        }
        if( !__msa_test_bz_v( negate_is_less_than_beta ) )
        {
            v16u8 q0;

            q0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) q0_l, ( v16i8 ) q0_r );
            q0_org = __msa_bmnz_v( q0_org, q0, negate_is_less_than_beta );
        }
    }
    {
        v8i16 tp0, tp1, tp2, tp3, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;

        ILVRL_B2_SH( p1_org, p2_org, tp0, tp2 );
        ILVRL_B2_SH( q0_org, p0_org, tp1, tp3 );
        ILVRL_B2_SH( q2_org, q1_org, tmp2, tmp5 );

        ILVRL_H2_SH( tp1, tp0, tmp3, tmp4 );
        ILVRL_H2_SH( tp3, tp2, tmp6, tmp7 );

        p_src = p_data - 3;
        ST4x4_UB( tmp3, tmp3, 0, 1, 2, 3, p_src, u_img_width );
        ST2x4_UB( tmp2, 0, p_src + 4, u_img_width );
        p_src += 4 * u_img_width;
        ST4x4_UB( tmp4, tmp4, 0, 1, 2, 3, p_src, u_img_width );
        ST2x4_UB( tmp2, 4, p_src + 4, u_img_width );
        p_src += 4 * u_img_width;

        ST4x4_UB( tmp6, tmp6, 0, 1, 2, 3, p_src, u_img_width );
        ST2x4_UB( tmp5, 0, p_src + 4, u_img_width );
        p_src += 4 * u_img_width;
        ST4x4_UB( tmp7, tmp7, 0, 1, 2, 3, p_src, u_img_width );
        ST2x4_UB( tmp5, 4, p_src + 4, u_img_width );
    }
}

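/* Intra (bS = 4) chroma filtering of a horizontal edge on interleaved Cb/Cr
 * data: only p0 and q0 are modified. */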
static void avc_lpf_cbcr_interleaved_intra_edge_hor_msa( uint8_t *p_chroma,
                                                         uint8_t u_alpha_in,
                                                         uint8_t u_beta_in,
                                                         uint32_t u_img_width )
{
    v16u8 alpha, beta, is_less_than;
    v16u8 p0, q0, p1_org, p0_org, q0_org, q1_org;
    v8i16 p0_r = { 0 };
    v8i16 q0_r = { 0 };
    v8i16 p0_l = { 0 };
    v8i16 q0_l = { 0 };

    alpha = ( v16u8 ) __msa_fill_b( u_alpha_in );
    beta = ( v16u8 ) __msa_fill_b( u_beta_in );

    LD_UB4( p_chroma - ( u_img_width << 1 ), u_img_width,
            p1_org, p0_org, q0_org, q1_org );

    {
        v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
        v16u8 is_less_than_alpha, is_less_than_beta;

        p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org );
        p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org );
        q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org );

        is_less_than_alpha = ( p0_asub_q0 < alpha );
        is_less_than_beta = ( p1_asub_p0 < beta );
        is_less_than = is_less_than_beta & is_less_than_alpha;
        is_less_than_beta = ( q1_asub_q0 < beta );
        is_less_than = is_less_than_beta & is_less_than;
    }

    if( !__msa_test_bz_v( is_less_than ) )
    {
        v16i8 zero = { 0 };
        v16u8 is_less_than_r, is_less_than_l;

        is_less_than_r = ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than,
                                                 zero, 8 );
        if( !__msa_test_bz_v( is_less_than_r ) )
        {
            v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;

            ILVR_B4_SH( zero, p1_org, zero, p0_org, zero, q0_org,
                        zero, q1_org, p1_org_r, p0_org_r, q0_org_r,
                        q1_org_r );
            AVC_LPF_P0_OR_Q0( p0_org_r, q1_org_r, p1_org_r, p0_r );
            AVC_LPF_P0_OR_Q0( q0_org_r, p1_org_r, q1_org_r, q0_r );
        }

        is_less_than_l = ( v16u8 ) __msa_sldi_b( zero,
                                                 ( v16i8 ) is_less_than, 8 );
        if( !__msa_test_bz_v( is_less_than_l ) )
        {
            v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;

            ILVL_B4_SH( zero, p1_org, zero, p0_org, zero, q0_org,
                        zero, q1_org, p1_org_l, p0_org_l, q0_org_l,
                        q1_org_l );
            AVC_LPF_P0_OR_Q0( p0_org_l, q1_org_l, p1_org_l, p0_l );
            AVC_LPF_P0_OR_Q0( q0_org_l, p1_org_l, q1_org_l, q0_l );
        }

        PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 );

        p0_org = __msa_bmnz_v( p0_org, p0, is_less_than );
        q0_org = __msa_bmnz_v( q0_org, q0, is_less_than );

        ST_UB( p0_org, ( p_chroma - u_img_width ) );
        ST_UB( q0_org, p_chroma );
    }
}

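/* Intra (bS = 4) chroma filtering of a vertical edge on interleaved Cb/Cr
 * data: de-interleaves via an 8x8 transpose, filters p0/q0, and stores 4
 * bytes per row back at p_chroma - 2. */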
static void avc_lpf_cbcr_interleaved_intra_edge_ver_msa( uint8_t *p_chroma,
                                                         uint8_t u_alpha_in,
                                                         uint8_t u_beta_in,
                                                         uint32_t u_img_width )
{
    v16u8 is_less_than;
    v16u8 p0, q0, p1_org, p0_org, q0_org, q1_org;
    v8i16 p0_r = { 0 };
    v8i16 q0_r = { 0 };
    v8i16 p0_l = { 0 };
    v8i16 q0_l = { 0 };
    v16u8 p1_u_org, p0_u_org, q0_u_org, q1_u_org;
    v16u8 p1_v_org, p0_v_org, q0_v_org, q1_v_org;
    v16i8 tmp0, tmp1, tmp2, tmp3;
    v4i32 vec0, vec1;
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;

    LD_UB8( ( p_chroma - 4 ), u_img_width,
            row0, row1, row2, row3, row4, row5, row6, row7 );

    TRANSPOSE8x8_UB_UB( row0, row1, row2, row3, row4, row5, row6, row7,
                        p1_u_org, p1_v_org, p0_u_org, p0_v_org,
                        q0_u_org, q0_v_org, q1_u_org, q1_v_org );

    ILVR_D4_UB( p1_v_org, p1_u_org, p0_v_org, p0_u_org, q0_v_org, q0_u_org,
                q1_v_org, q1_u_org, p1_org, p0_org, q0_org, q1_org );

    {
        v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
        v16u8 is_less_than_beta, is_less_than_alpha, alpha, beta;

        p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org );
        p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org );
        q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org );

        alpha = ( v16u8 ) __msa_fill_b( u_alpha_in );
        beta = ( v16u8 ) __msa_fill_b( u_beta_in );

        is_less_than_alpha = ( p0_asub_q0 < alpha );
        is_less_than_beta = ( p1_asub_p0 < beta );
        is_less_than = is_less_than_beta & is_less_than_alpha;
        is_less_than_beta = ( q1_asub_q0 < beta );
        is_less_than = is_less_than_beta & is_less_than;
    }

    if( !__msa_test_bz_v( is_less_than ) )
    {
        v16u8 is_less_than_r, is_less_than_l;
        v16i8 zero = { 0 };

        is_less_than_r = ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than,
                                                 zero, 8 );
        if( !__msa_test_bz_v( is_less_than_r ) )
        {
            v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;

            ILVR_B4_SH( zero, p1_org, zero, p0_org, zero, q0_org,
                        zero, q1_org, p1_org_r, p0_org_r, q0_org_r, q1_org_r );
            AVC_LPF_P0_OR_Q0( p0_org_r, q1_org_r, p1_org_r, p0_r );
            AVC_LPF_P0_OR_Q0( q0_org_r, p1_org_r, q1_org_r, q0_r );
        }

        is_less_than_l = ( v16u8 ) __msa_sldi_b( zero,
                                                 ( v16i8 ) is_less_than, 8 );
        if( !__msa_test_bz_v( is_less_than_l ) )
        {
            v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;

            ILVL_B4_SH( zero, p1_org, zero, p0_org, zero, q0_org,
                        zero, q1_org, p1_org_l, p0_org_l, q0_org_l, q1_org_l );
            AVC_LPF_P0_OR_Q0( p0_org_l, q1_org_l, p1_org_l, p0_l );
            AVC_LPF_P0_OR_Q0( q0_org_l, p1_org_l, q1_org_l, q0_l );
        }

        PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 );

        p0_org = __msa_bmnz_v( p0_org, p0, is_less_than );
        q0_org = __msa_bmnz_v( q0_org, q0, is_less_than );

        SLDI_B2_0_UB( p0_org, q0_org, p0_v_org, q0_v_org, 8 );
        ILVR_D2_SB( p0_v_org, p0_org, q0_v_org, q0_org, tmp0, tmp1 );
        ILVRL_B2_SB( tmp1, tmp0, tmp2, tmp3 );
        ILVRL_B2_SW( tmp3, tmp2, vec0, vec1 );

        ST4x8_UB( vec0, vec1, ( p_chroma - 2 ), u_img_width );
    }
}

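/* Inter (bS < 4) luma filtering of a vertical edge: per-edge bS and tc0 values
 * select which rows get filtered; p1/q1 are updated only where |p2-p0| or
 * |q2-q0| is below beta, and p0/q0 are clipped by tc. */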
static void avc_loopfilter_luma_inter_edge_ver_msa( uint8_t *p_data,
                                                    uint8_t u_bs0,
                                                    uint8_t u_bs1,
                                                    uint8_t u_bs2,
                                                    uint8_t u_bs3,
                                                    uint8_t u_tc0,
                                                    uint8_t u_tc1,
                                                    uint8_t u_tc2,
                                                    uint8_t u_tc3,
                                                    uint8_t u_alpha_in,
                                                    uint8_t u_beta_in,
                                                    uint32_t u_img_width )
{
    uint8_t *p_src;
    v16u8 beta, tmp_vec, bs = { 0 };
    v16u8 tc = { 0 };
    v16u8 is_less_than, is_less_than_beta;
    v16u8 p1, p0, q0, q1;
    v8i16 p0_r, q0_r, p1_r = { 0 };
    v8i16 q1_r = { 0 };
    v8i16 p0_l, q0_l, p1_l = { 0 };
    v8i16 q1_l = { 0 };
    v16u8 p3_org, p2_org, p1_org, p0_org, q0_org, q1_org, q2_org, q3_org;
    v8i16 p2_org_r, p1_org_r, p0_org_r, q0_org_r, q1_org_r, q2_org_r;
    v8i16 p2_org_l, p1_org_l, p0_org_l, q0_org_l, q1_org_l, q2_org_l;
    v8i16 tc_r, tc_l;
    v16i8 zero = { 0 };
    v16u8 is_bs_greater_than0;

    tmp_vec = ( v16u8 ) __msa_fill_b( u_bs0 );
    bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 0, ( v4i32 ) tmp_vec );
    tmp_vec = ( v16u8 ) __msa_fill_b( u_bs1 );
    bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 1, ( v4i32 ) tmp_vec );
    tmp_vec = ( v16u8 ) __msa_fill_b( u_bs2 );
    bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 2, ( v4i32 ) tmp_vec );
    tmp_vec = ( v16u8 ) __msa_fill_b( u_bs3 );
    bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 3, ( v4i32 ) tmp_vec );

    if( !__msa_test_bz_v( bs ) )
    {
        tmp_vec = ( v16u8 ) __msa_fill_b( u_tc0 );
        tc = ( v16u8 ) __msa_insve_w( ( v4i32 ) tc, 0, ( v4i32 ) tmp_vec );
        tmp_vec = ( v16u8 ) __msa_fill_b( u_tc1 );
        tc = ( v16u8 ) __msa_insve_w( ( v4i32 ) tc, 1, ( v4i32 ) tmp_vec );
        tmp_vec = ( v16u8 ) __msa_fill_b( u_tc2 );
        tc = ( v16u8 ) __msa_insve_w( ( v4i32 ) tc, 2, ( v4i32 ) tmp_vec );
        tmp_vec = ( v16u8 ) __msa_fill_b( u_tc3 );
        tc = ( v16u8 ) __msa_insve_w( ( v4i32 ) tc, 3, ( v4i32 ) tmp_vec );

        is_bs_greater_than0 = ( zero < bs );

        {
            v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
            v16u8 row8, row9, row10, row11, row12, row13, row14, row15;

            p_src = p_data;
            p_src -= 4;

            LD_UB8( p_src, u_img_width,
                    row0, row1, row2, row3, row4, row5, row6, row7 );
            p_src += ( 8 * u_img_width );
            LD_UB8( p_src, u_img_width,
                    row8, row9, row10, row11, row12, row13, row14, row15 );

            TRANSPOSE16x8_UB_UB( row0, row1, row2, row3, row4, row5, row6, row7,
                                 row8, row9, row10, row11,
                                 row12, row13, row14, row15,
                                 p3_org, p2_org, p1_org, p0_org,
                                 q0_org, q1_org, q2_org, q3_org );
        }
        {
            v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0, alpha;
            v16u8 is_less_than_alpha;

            p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org );
            p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org );
            q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org );

            alpha = ( v16u8 ) __msa_fill_b( u_alpha_in );
            beta = ( v16u8 ) __msa_fill_b( u_beta_in );

            is_less_than_alpha = ( p0_asub_q0 < alpha );
            is_less_than_beta = ( p1_asub_p0 < beta );
            is_less_than = is_less_than_beta & is_less_than_alpha;
            is_less_than_beta = ( q1_asub_q0 < beta );
            is_less_than = is_less_than_beta & is_less_than;
            is_less_than = is_less_than & is_bs_greater_than0;
        }
        if( !__msa_test_bz_v( is_less_than ) )
        {
            v16i8 negate_tc, sign_negate_tc;
            v8i16 negate_tc_r, i16_negatetc_l;

            negate_tc = zero - ( v16i8 ) tc;
            sign_negate_tc = __msa_clti_s_b( negate_tc, 0 );

            ILVRL_B2_SH( sign_negate_tc, negate_tc, negate_tc_r,
                         i16_negatetc_l );

            UNPCK_UB_SH( tc, tc_r, tc_l );
            UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l );
            UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l );
            UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l );

            {
                v16u8 p2_asub_p0;
                v16u8 is_less_than_beta_r, is_less_than_beta_l;

                p2_asub_p0 = __msa_asub_u_b( p2_org, p0_org );
                is_less_than_beta = ( p2_asub_p0 < beta );
                is_less_than_beta = is_less_than_beta & is_less_than;

                is_less_than_beta_r =
                    ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta,
                                            zero, 8 );
                if( !__msa_test_bz_v( is_less_than_beta_r ) )
                {
                    p2_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) p2_org );

                    AVC_LPF_P1_OR_Q1( p0_org_r, q0_org_r, p1_org_r, p2_org_r,
                                      negate_tc_r, tc_r, p1_r );
                }

                is_less_than_beta_l =
                    ( v16u8 ) __msa_sldi_b( zero,
                                            ( v16i8 ) is_less_than_beta, 8 );
                if( !__msa_test_bz_v( is_less_than_beta_l ) )
                {
                    p2_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) p2_org );

                    AVC_LPF_P1_OR_Q1( p0_org_l, q0_org_l, p1_org_l, p2_org_l,
                                      i16_negatetc_l, tc_l, p1_l );
                }
            }

            if( !__msa_test_bz_v( is_less_than_beta ) )
            {
                p1 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) p1_l, ( v16i8 ) p1_r );
                p1_org = __msa_bmnz_v( p1_org, p1, is_less_than_beta );

                is_less_than_beta = __msa_andi_b( is_less_than_beta, 1 );
                tc = tc + is_less_than_beta;
            }

            {
                v16u8 u8_q2asub_q0;
                v16u8 is_less_than_beta_l, is_less_than_beta_r;

                u8_q2asub_q0 = __msa_asub_u_b( q2_org, q0_org );
                is_less_than_beta = ( u8_q2asub_q0 < beta );
                is_less_than_beta = is_less_than_beta & is_less_than;

                q1_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q1_org );

                is_less_than_beta_r =
                    ( v16u8 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta,
                                            zero, 8 );
                if( !__msa_test_bz_v( is_less_than_beta_r ) )
                {
                    q2_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q2_org );
                    AVC_LPF_P1_OR_Q1( p0_org_r, q0_org_r, q1_org_r, q2_org_r,
                                      negate_tc_r, tc_r, q1_r );
                }

                q1_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q1_org );

                is_less_than_beta_l =
                    ( v16u8 ) __msa_sldi_b( zero,
                                            ( v16i8 ) is_less_than_beta, 8 );
                if( !__msa_test_bz_v( is_less_than_beta_l ) )
                {
                    q2_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q2_org );
                    AVC_LPF_P1_OR_Q1( p0_org_l, q0_org_l, q1_org_l, q2_org_l,
                                      i16_negatetc_l, tc_l, q1_l );
                }
            }

            if( !__msa_test_bz_v( is_less_than_beta ) )
            {
                q1 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) q1_l, ( v16i8 ) q1_r );
                q1_org = __msa_bmnz_v( q1_org, q1, is_less_than_beta );

                is_less_than_beta = __msa_andi_b( is_less_than_beta, 1 );
                tc = tc + is_less_than_beta;
            }

            {
                v8i16 threshold_r, negate_thresh_r;
                v8i16 threshold_l, negate_thresh_l;
                v16i8 negate_thresh, sign_negate_thresh;

                negate_thresh = zero - ( v16i8 ) tc;
                sign_negate_thresh = __msa_clti_s_b( negate_thresh, 0 );

                ILVR_B2_SH( zero, tc, sign_negate_thresh, negate_thresh,
                            threshold_r, negate_thresh_r );

                AVC_LPF_P0Q0( q0_org_r, p0_org_r, p1_org_r, q1_org_r,
                              negate_thresh_r, threshold_r, p0_r, q0_r );

                threshold_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) tc );
                negate_thresh_l = ( v8i16 ) __msa_ilvl_b( sign_negate_thresh,
                                                          negate_thresh );

                AVC_LPF_P0Q0( q0_org_l, p0_org_l, p1_org_l, q1_org_l,
                              negate_thresh_l, threshold_l, p0_l, q0_l );
            }

            PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 );

            p0_org = __msa_bmnz_v( p0_org, p0, is_less_than );
            q0_org = __msa_bmnz_v( q0_org, q0, is_less_than );
        }
        {
            v16i8 tp0, tp1, tp2, tp3;
            v8i16 tmp2, tmp5;
            v4i32 tmp3, tmp4, tmp6, tmp7;
            uint32_t u_out0, u_out2;
            uint16_t u_out1, u_out3;

            p_src = p_data - 3;

            ILVRL_B2_SB( p1_org, p2_org, tp0, tp2 );
            ILVRL_B2_SB( q0_org, p0_org, tp1, tp3 );
            ILVRL_B2_SH( q2_org, q1_org, tmp2, tmp5 );

            ILVRL_H2_SW( tp1, tp0, tmp3, tmp4 );
            ILVRL_H2_SW( tp3, tp2, tmp6, tmp7 );

            u_out0 = __msa_copy_u_w( tmp3, 0 );
            u_out1 = __msa_copy_u_h( tmp2, 0 );
            u_out2 = __msa_copy_u_w( tmp3, 1 );
            u_out3 = __msa_copy_u_h( tmp2, 1 );

            SW( u_out0, p_src );
            SH( u_out1, ( p_src + 4 ) );
            p_src += u_img_width;
            SW( u_out2, p_src );
            SH( u_out3, ( p_src + 4 ) );

            u_out0 = __msa_copy_u_w( tmp3, 2 );
            u_out1 = __msa_copy_u_h( tmp2, 2 );
            u_out2 = __msa_copy_u_w( tmp3, 3 );
            u_out3 = __msa_copy_u_h( tmp2, 3 );

            p_src += u_img_width;
            SW( u_out0, p_src );
            SH( u_out1, ( p_src + 4 ) );
            p_src += u_img_width;
            SW( u_out2, p_src );
            SH( u_out3, ( p_src + 4 ) );

            u_out0 = __msa_copy_u_w( tmp4, 0 );
            u_out1 = __msa_copy_u_h( tmp2, 4 );
            u_out2 = __msa_copy_u_w( tmp4, 1 );
            u_out3 = __msa_copy_u_h( tmp2, 5 );

            p_src += u_img_width;
            SW( u_out0, p_src );
            SH( u_out1, ( p_src + 4 ) );
            p_src += u_img_width;
            SW( u_out2, p_src );
            SH( u_out3, ( p_src + 4 ) );

            u_out0 = __msa_copy_u_w( tmp4, 2 );
            u_out1 = __msa_copy_u_h( tmp2, 6 );
            u_out2 = __msa_copy_u_w( tmp4, 3 );
            u_out3 = __msa_copy_u_h( tmp2, 7 );

            p_src += u_img_width;
            SW( u_out0, p_src );
            SH( u_out1, ( p_src + 4 ) );
            p_src += u_img_width;
            SW( u_out2, p_src );
            SH( u_out3, ( p_src + 4 ) );

            u_out0 = __msa_copy_u_w( tmp6, 0 );
            u_out1 = __msa_copy_u_h( tmp5, 0 );
            u_out2 = __msa_copy_u_w( tmp6, 1 );
            u_out3 = __msa_copy_u_h( tmp5, 1 );

            p_src += u_img_width;
            SW( u_out0, p_src );
            SH( u_out1, ( p_src + 4 ) );
            p_src += u_img_width;
            SW( u_out2, p_src );
            SH( u_out3, ( p_src + 4 ) );

            u_out0 = __msa_copy_u_w( tmp6, 2 );
            u_out1 = __msa_copy_u_h( tmp5, 2 );
            u_out2 = __msa_copy_u_w( tmp6, 3 );
            u_out3 = __msa_copy_u_h( tmp5, 3 );

            p_src += u_img_width;
            SW( u_out0, p_src );
            SH( u_out1, ( p_src + 4 ) );
            p_src += u_img_width;
            SW( u_out2, p_src );
            SH( u_out3, ( p_src + 4 ) );

            u_out0 = __msa_copy_u_w( tmp7, 0 );
            u_out1 = __msa_copy_u_h( tmp5, 4 );
            u_out2 = __msa_copy_u_w( tmp7, 1 );
            u_out3 = __msa_copy_u_h( tmp5, 5 );

            p_src += u_img_width;
            SW( u_out0, p_src );
            SH( u_out1, ( p_src + 4 ) );
            p_src += u_img_width;
            SW( u_out2, p_src );
            SH( u_out3, ( p_src + 4 ) );

            u_out0 = __msa_copy_u_w( tmp7, 2 );
            u_out1 = __msa_copy_u_h( tmp5, 6 );
            u_out2 = __msa_copy_u_w( tmp7, 3 );
            u_out3 = __msa_copy_u_h( tmp5, 7 );

            p_src += u_img_width;
            SW( u_out0, p_src );
            SH( u_out1, ( p_src + 4 ) );
            p_src += u_img_width;
            SW( u_out2, p_src );
            SH( u_out3, ( p_src + 4 ) );
        }
    }
}

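/* Inter (bS < 4) luma filtering of a horizontal edge: same filter decisions
 * as the vertical case, but operating directly on rows of pixels. */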
static void avc_loopfilter_luma_inter_edge_hor_msa( uint8_t *p_data,
                                                    uint8_t u_bs0,
                                                    uint8_t u_bs1,
                                                    uint8_t u_bs2,
                                                    uint8_t u_bs3,
                                                    uint8_t u_tc0,
                                                    uint8_t u_tc1,
                                                    uint8_t u_tc2,
                                                    uint8_t u_tc3,
                                                    uint8_t u_alpha_in,
                                                    uint8_t u_beta_in,
                                                    uint32_t u_image_width )
{
    v16u8 p2_asub_p0, u8_q2asub_q0;
    v16u8 alpha, beta, is_less_than, is_less_than_beta;
    v16u8 p1, p0, q0, q1;
    v8i16 p1_r = { 0 };
    v8i16 p0_r, q0_r, q1_r = { 0 };
    v8i16 p1_l = { 0 };
    v8i16 p0_l, q0_l, q1_l = { 0 };
    v16u8 p2_org, p1_org, p0_org, q0_org, q1_org, q2_org;
    v8i16 p2_org_r, p1_org_r, p0_org_r, q0_org_r, q1_org_r, q2_org_r;
    v8i16 p2_org_l, p1_org_l, p0_org_l, q0_org_l, q1_org_l, q2_org_l;
    v16i8 zero = { 0 };
    v16u8 tmp_vec;
    v16u8 bs = { 0 };
    v16i8 tc = { 0 };

    tmp_vec = ( v16u8 ) __msa_fill_b( u_bs0 );
    bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 0, ( v4i32 ) tmp_vec );
    tmp_vec = ( v16u8 ) __msa_fill_b( u_bs1 );
    bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 1, ( v4i32 ) tmp_vec );
    tmp_vec = ( v16u8 ) __msa_fill_b( u_bs2 );
    bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 2, ( v4i32 ) tmp_vec );
    tmp_vec = ( v16u8 ) __msa_fill_b( u_bs3 );
    bs = ( v16u8 ) __msa_insve_w( ( v4i32 ) bs, 3, ( v4i32 ) tmp_vec );

    if( !__msa_test_bz_v( bs ) )
    {
        tmp_vec = ( v16u8 ) __msa_fill_b( u_tc0 );
        tc = ( v16i8 ) __msa_insve_w( ( v4i32 ) tc, 0, ( v4i32 ) tmp_vec );
        tmp_vec = ( v16u8 ) __msa_fill_b( u_tc1 );
        tc = ( v16i8 ) __msa_insve_w( ( v4i32 ) tc, 1, ( v4i32 ) tmp_vec );
        tmp_vec = ( v16u8 ) __msa_fill_b( u_tc2 );
        tc = ( v16i8 ) __msa_insve_w( ( v4i32 ) tc, 2, ( v4i32 ) tmp_vec );
        tmp_vec = ( v16u8 ) __msa_fill_b( u_tc3 );
        tc = ( v16i8 ) __msa_insve_w( ( v4i32 ) tc, 3, ( v4i32 ) tmp_vec );

        alpha = ( v16u8 ) __msa_fill_b( u_alpha_in );
        beta = ( v16u8 ) __msa_fill_b( u_beta_in );

        LD_UB5( p_data - ( 3 * u_image_width ), u_image_width,
                p2_org, p1_org, p0_org, q0_org, q1_org );

        {
            v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
            v16u8 is_less_than_alpha, is_bs_greater_than0;

            is_bs_greater_than0 = ( ( v16u8 ) zero < bs );
            p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org );
            p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org );
            q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org );

            is_less_than_alpha = ( p0_asub_q0 < alpha );
            is_less_than_beta = ( p1_asub_p0 < beta );
            is_less_than = is_less_than_beta & is_less_than_alpha;
            is_less_than_beta = ( q1_asub_q0 < beta );
            is_less_than = is_less_than_beta & is_less_than;
            is_less_than = is_less_than & is_bs_greater_than0;
        }

        if( !__msa_test_bz_v( is_less_than ) )
        {
            v16i8 sign_negate_tc, negate_tc;
            v8i16 negate_tc_r, i16_negatetc_l, tc_l, tc_r;

            q2_org = LD_UB( p_data + ( 2 * u_image_width ) );
            negate_tc = zero - tc;
            sign_negate_tc = __msa_clti_s_b( negate_tc, 0 );

            ILVRL_B2_SH( sign_negate_tc, negate_tc,
                         negate_tc_r, i16_negatetc_l );

            UNPCK_UB_SH( tc, tc_r, tc_l );
            UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l );
            UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l );
            UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l );

            p2_asub_p0 = __msa_asub_u_b( p2_org, p0_org );
            is_less_than_beta = ( p2_asub_p0 < beta );
            is_less_than_beta = is_less_than_beta & is_less_than;
            {
                v8u16 is_less_than_beta_r, is_less_than_beta_l;

                is_less_than_beta_r =
                    ( v8u16 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta,
                                            zero, 8 );
                if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_r ) )
                {
                    p2_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) p2_org );

                    AVC_LPF_P1_OR_Q1( p0_org_r, q0_org_r, p1_org_r, p2_org_r,
                                      negate_tc_r, tc_r, p1_r );
                }

                is_less_than_beta_l =
                    ( v8u16 ) __msa_sldi_b( zero,
                                            ( v16i8 ) is_less_than_beta, 8 );
                if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_l ) )
                {
                    p2_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) p2_org );

                    AVC_LPF_P1_OR_Q1( p0_org_l, q0_org_l, p1_org_l, p2_org_l,
                                      i16_negatetc_l, tc_l, p1_l );
                }
            }
            if( !__msa_test_bz_v( is_less_than_beta ) )
            {
                p1 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) p1_l, ( v16i8 ) p1_r );
                p1_org = __msa_bmnz_v( p1_org, p1, is_less_than_beta );
                ST_UB( p1_org, p_data - ( 2 * u_image_width ) );

                is_less_than_beta = __msa_andi_b( is_less_than_beta, 1 );
                tc = tc + ( v16i8 ) is_less_than_beta;
            }

            u8_q2asub_q0 = __msa_asub_u_b( q2_org, q0_org );
            is_less_than_beta = ( u8_q2asub_q0 < beta );
            is_less_than_beta = is_less_than_beta & is_less_than;

            {
                v8u16 is_less_than_beta_r, is_less_than_beta_l;
                is_less_than_beta_r =
                    ( v8u16 ) __msa_sldi_b( ( v16i8 ) is_less_than_beta,
                                            zero, 8 );

                q1_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q1_org );
                if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_r ) )
                {
                    q2_org_r = ( v8i16 ) __msa_ilvr_b( zero, ( v16i8 ) q2_org );

                    AVC_LPF_P1_OR_Q1( p0_org_r, q0_org_r, q1_org_r, q2_org_r,
                                      negate_tc_r, tc_r, q1_r );
                }
                is_less_than_beta_l =
                    ( v8u16 ) __msa_sldi_b( zero,
                                            ( v16i8 ) is_less_than_beta, 8 );

                q1_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q1_org );
                if( !__msa_test_bz_v( ( v16u8 ) is_less_than_beta_l ) )
                {
                    q2_org_l = ( v8i16 ) __msa_ilvl_b( zero, ( v16i8 ) q2_org );

                    AVC_LPF_P1_OR_Q1( p0_org_l, q0_org_l, q1_org_l, q2_org_l,
                                      i16_negatetc_l, tc_l, q1_l );
                }
            }
            if( !__msa_test_bz_v( is_less_than_beta ) )
            {
                q1 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) q1_l, ( v16i8 ) q1_r );
                q1_org = __msa_bmnz_v( q1_org, q1, is_less_than_beta );
                ST_UB( q1_org, p_data + u_image_width );

                is_less_than_beta = __msa_andi_b( is_less_than_beta, 1 );
                tc = tc + ( v16i8 ) is_less_than_beta;
            }
            {
                v16i8 negate_thresh, sign_negate_thresh;
                v8i16 threshold_r, threshold_l;
                v8i16 negate_thresh_l, negate_thresh_r;

                negate_thresh = zero - tc;
                sign_negate_thresh = __msa_clti_s_b( negate_thresh, 0 );

                ILVR_B2_SH( zero, tc, sign_negate_thresh, negate_thresh,
                            threshold_r, negate_thresh_r );
                AVC_LPF_P0Q0( q0_org_r, p0_org_r, p1_org_r, q1_org_r,
                              negate_thresh_r, threshold_r, p0_r, q0_r );

                threshold_l = ( v8i16 ) __msa_ilvl_b( zero, tc );
                negate_thresh_l = ( v8i16 ) __msa_ilvl_b( sign_negate_thresh,
                                                          negate_thresh );
                AVC_LPF_P0Q0( q0_org_l, p0_org_l, p1_org_l, q1_org_l,
                              negate_thresh_l, threshold_l, p0_l, q0_l );
            }

            PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 );

            p0_org = __msa_bmnz_v( p0_org, p0, is_less_than );
            q0_org = __msa_bmnz_v( q0_org, q0, is_less_than );

            ST_UB( p0_org, ( p_data - u_image_width ) );
            ST_UB( q0_org, p_data );
        }
    }
}

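/* Inter (bS < 4) chroma filtering of a horizontal edge on interleaved Cb/Cr
 * data: p0 and q0 are adjusted by a tc-clipped delta. */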
static void avc_lpf_cbcr_interleaved_inter_edge_hor_msa( uint8_t *p_chroma,
                                                         uint8_t u_bs0,
                                                         uint8_t u_bs1,
                                                         uint8_t u_bs2,
                                                         uint8_t u_bs3,
                                                         uint8_t u_tc0,
                                                         uint8_t u_tc1,
                                                         uint8_t u_tc2,
                                                         uint8_t u_tc3,
                                                         uint8_t u_alpha_in,
                                                         uint8_t u_beta_in,
                                                         uint32_t u_img_width )
{
    v16u8 alpha, beta;
    v4i32 tmp_vec, bs = { 0 };
    v4i32 tc = { 0 };
    v16u8 p0_asub_q0, p1_asub_p0, q1_asub_q0;
    v16u8 is_less_than;
    v8i16 is_less_than_r, is_less_than_l;
    v16u8 is_less_than_beta, is_less_than_alpha, is_bs_greater_than0;
    v16u8 p0, q0;
    v8i16 p0_r = { 0 };
    v8i16 q0_r = { 0 };
    v8i16 p0_l = { 0 };
    v8i16 q0_l = { 0 };
    v16u8 p1_org, p0_org, q0_org, q1_org;
    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
    v16i8 negate_tc, sign_negate_tc;
    v8i16 negate_tc_r, i16_negatetc_l;
    v8i16 tc_r, tc_l;
    v16i8 zero = { 0 };
    v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;

    tmp_vec = ( v4i32 ) __msa_fill_b( u_bs0 );
    bs = __msa_insve_w( bs, 0, tmp_vec );
    tmp_vec = ( v4i32 ) __msa_fill_b( u_bs1 );
    bs = __msa_insve_w( bs, 1, tmp_vec );
    tmp_vec = ( v4i32 ) __msa_fill_b( u_bs2 );
    bs = __msa_insve_w( bs, 2, tmp_vec );
    tmp_vec = ( v4i32 ) __msa_fill_b( u_bs3 );
    bs = __msa_insve_w( bs, 3, tmp_vec );

    if( !__msa_test_bz_v( ( v16u8 ) bs ) )
    {
        tmp_vec = ( v4i32 ) __msa_fill_b( u_tc0 );
        tc = __msa_insve_w( tc, 0, tmp_vec );
        tmp_vec = ( v4i32 ) __msa_fill_b( u_tc1 );
        tc = __msa_insve_w( tc, 1, tmp_vec );
        tmp_vec = ( v4i32 ) __msa_fill_b( u_tc2 );
        tc = __msa_insve_w( tc, 2, tmp_vec );
        tmp_vec = ( v4i32 ) __msa_fill_b( u_tc3 );
        tc = __msa_insve_w( tc, 3, tmp_vec );

        is_bs_greater_than0 = ( v16u8 ) ( zero < ( v16i8 ) bs );

        alpha = ( v16u8 ) __msa_fill_b( u_alpha_in );
        beta = ( v16u8 ) __msa_fill_b( u_beta_in );

        LD_UB4( p_chroma - ( u_img_width << 1 ), u_img_width,
                p1_org, p0_org, q0_org, q1_org );

        p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org );
        p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org );
        q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org );

        is_less_than_alpha = ( p0_asub_q0 < alpha );
        is_less_than_beta = ( p1_asub_p0 < beta );
        is_less_than = is_less_than_beta & is_less_than_alpha;
        is_less_than_beta = ( q1_asub_q0 < beta );
        is_less_than = is_less_than_beta & is_less_than;

        is_less_than = is_less_than & is_bs_greater_than0;

        if( !__msa_test_bz_v( is_less_than ) )
        {
            negate_tc = zero - ( v16i8 ) tc;
            sign_negate_tc = __msa_clti_s_b( negate_tc, 0 );

            ILVRL_B2_SH( sign_negate_tc, negate_tc, negate_tc_r,
                         i16_negatetc_l );

            UNPCK_UB_SH( tc, tc_r, tc_l );
            UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l );
            UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l );
            UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l );
            UNPCK_UB_SH( q1_org, q1_org_r, q1_org_l );

            is_less_than_r =
                ( v8i16 ) __msa_sldi_b( ( v16i8 ) is_less_than, zero, 8 );
            if( !__msa_test_bz_v( ( v16u8 ) is_less_than_r ) )
            {
                AVC_LPF_P0Q0( q0_org_r, p0_org_r, p1_org_r, q1_org_r,
                              negate_tc_r, tc_r, p0_r, q0_r );
            }

            is_less_than_l =
                ( v8i16 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than, 8 );
            if( !__msa_test_bz_v( ( v16u8 ) is_less_than_l ) )
            {
                AVC_LPF_P0Q0( q0_org_l, p0_org_l, p1_org_l, q1_org_l,
                              i16_negatetc_l, tc_l, p0_l, q0_l );
            }

            PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 );

            p0_org = __msa_bmnz_v( p0_org, p0, is_less_than );
            q0_org = __msa_bmnz_v( q0_org, q0, is_less_than );

            ST_UB( p0_org, p_chroma - u_img_width );
            ST_UB( q0_org, p_chroma );
        }
    }
}

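/* Inter chroma filtering of a vertical edge on interleaved Cb/Cr data:
 * rows are transposed, p0/q0 are clipped by tc where 0 < bS < 4, and the
 * two filtered columns are stored back. */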
static void avc_lpf_cbcr_interleaved_inter_edge_ver_msa( uint8_t *p_chroma,
                                                         uint8_t u_bs0,
                                                         uint8_t u_bs1,
                                                         uint8_t u_bs2,
                                                         uint8_t u_bs3,
                                                         uint8_t u_tc0,
                                                         uint8_t u_tc1,
                                                         uint8_t u_tc2,
                                                         uint8_t u_tc3,
                                                         uint8_t u_alpha_in,
                                                         uint8_t u_beta_in,
                                                         uint32_t u_img_width )
{
    v16u8 alpha, beta;
    v16u8 p0, q0, p0_asub_q0, p1_asub_p0, q1_asub_q0;
    v16u8 is_less_than, is_less_than1;
    v8i16 is_less_than_r, is_less_than_l;
    v16u8 is_less_than_beta, is_less_than_alpha;
    v8i16 p0_r = { 0 };
    v8i16 q0_r = { 0 };
    v8i16 p0_l = { 0 };
    v8i16 q0_l = { 0 };
    v16u8 p1_org, p0_org, q0_org, q1_org;
    v8i16 p1_org_r, p0_org_r, q0_org_r, q1_org_r;
    v8i16 p1_org_l, p0_org_l, q0_org_l, q1_org_l;
    v16u8 is_bs_less_than4, is_bs_greater_than0;
    v8i16 tc_r, tc_l, negate_tc_r, i16_negatetc_l;
    v16u8 const4;
    v16i8 zero = { 0 };
    v8i16 tmp_vec, bs = { 0 };
    v8i16 tc = { 0 };
    v16u8 p1_u_org, p0_u_org, q0_u_org, q1_u_org;
    v16u8 p1_v_org, p0_v_org, q0_v_org, q1_v_org;
    v16i8 tmp0, tmp1, tmp2, tmp3;
    v4i32 vec0, vec1;
    v16u8 row0, row1, row2, row3, row4, row5, row6, row7;
    v16i8 negate_tc, sign_negate_tc;

    const4 = ( v16u8 ) __msa_ldi_b( 4 );

    tmp_vec = ( v8i16 ) __msa_fill_b( u_bs0 );
    bs = __msa_insve_h( bs, 0, tmp_vec );
    bs = __msa_insve_h( bs, 4, tmp_vec );

    tmp_vec = ( v8i16 ) __msa_fill_b( u_bs1 );
    bs = __msa_insve_h( bs, 1, tmp_vec );
    bs = __msa_insve_h( bs, 5, tmp_vec );

    tmp_vec = ( v8i16 ) __msa_fill_b( u_bs2 );
    bs = __msa_insve_h( bs, 2, tmp_vec );
    bs = __msa_insve_h( bs, 6, tmp_vec );

    tmp_vec = ( v8i16 ) __msa_fill_b( u_bs3 );
    bs = __msa_insve_h( bs, 3, tmp_vec );
    bs = __msa_insve_h( bs, 7, tmp_vec );

    if( !__msa_test_bz_v( ( v16u8 ) bs ) )
    {
        tmp_vec = ( v8i16 ) __msa_fill_b( u_tc0 );
        tc = __msa_insve_h( tc, 0, tmp_vec );
        tc = __msa_insve_h( tc, 4, tmp_vec );

        tmp_vec = ( v8i16 ) __msa_fill_b( u_tc1 );
        tc = __msa_insve_h( tc, 1, tmp_vec );
        tc = __msa_insve_h( tc, 5, tmp_vec );

        tmp_vec = ( v8i16 ) __msa_fill_b( u_tc2 );
        tc = __msa_insve_h( tc, 2, tmp_vec );
        tc = __msa_insve_h( tc, 6, tmp_vec );

        tmp_vec = ( v8i16 ) __msa_fill_b( u_tc3 );
        tc = __msa_insve_h( tc, 3, tmp_vec );
        tc = __msa_insve_h( tc, 7, tmp_vec );

        is_bs_greater_than0 = ( v16u8 ) ( zero < ( v16i8 ) bs );

        LD_UB8( ( p_chroma - 4 ), u_img_width,
                row0, row1, row2, row3, row4, row5, row6, row7 );

        TRANSPOSE8x8_UB_UB( row0, row1, row2, row3,
                            row4, row5, row6, row7,
                            p1_u_org, p1_v_org, p0_u_org, p0_v_org,
                            q0_u_org, q0_v_org, q1_u_org, q1_v_org );

        ILVR_D4_UB( p1_v_org, p1_u_org, p0_v_org, p0_u_org, q0_v_org, q0_u_org,
                    q1_v_org, q1_u_org, p1_org, p0_org, q0_org, q1_org );

        p0_asub_q0 = __msa_asub_u_b( p0_org, q0_org );
        p1_asub_p0 = __msa_asub_u_b( p1_org, p0_org );
        q1_asub_q0 = __msa_asub_u_b( q1_org, q0_org );

        alpha = ( v16u8 ) __msa_fill_b( u_alpha_in );
        beta = ( v16u8 ) __msa_fill_b( u_beta_in );

        is_less_than_alpha = ( p0_asub_q0 < alpha );
        is_less_than_beta = ( p1_asub_p0 < beta );
        is_less_than = is_less_than_beta & is_less_than_alpha;
        is_less_than_beta = ( q1_asub_q0 < beta );
        is_less_than = is_less_than_beta & is_less_than;
        is_less_than = is_bs_greater_than0 & is_less_than;

        if( !__msa_test_bz_v( is_less_than ) )
        {
            UNPCK_UB_SH( p1_org, p1_org_r, p1_org_l );
            UNPCK_UB_SH( p0_org, p0_org_r, p0_org_l );
            UNPCK_UB_SH( q0_org, q0_org_r, q0_org_l );
            UNPCK_UB_SH( q1_org, q1_org_r, q1_org_l );

            is_bs_less_than4 = ( ( v16u8 ) bs < const4 );

            is_less_than1 = is_less_than & is_bs_less_than4;
            if( !__msa_test_bz_v( ( v16u8 ) is_less_than1 ) )
            {
                negate_tc = zero - ( v16i8 ) tc;
                sign_negate_tc = __msa_clti_s_b( negate_tc, 0 );

                ILVRL_B2_SH( sign_negate_tc, negate_tc, negate_tc_r,
                             i16_negatetc_l );

                UNPCK_UB_SH( tc, tc_r, tc_l );

                is_less_than_r =
                    ( v8i16 ) __msa_sldi_b( ( v16i8 ) is_less_than1, zero, 8 );
                if( !__msa_test_bz_v( ( v16u8 ) is_less_than_r ) )
                {
                    AVC_LPF_P0Q0( q0_org_r, p0_org_r, p1_org_r, q1_org_r,
                                  negate_tc_r, tc_r, p0_r, q0_r );
                }

                is_less_than_l =
                    ( v8i16 ) __msa_sldi_b( zero, ( v16i8 ) is_less_than1, 8 );
                if( !__msa_test_bz_v( ( v16u8 ) is_less_than_l ) )
                {
                    AVC_LPF_P0Q0( q0_org_l, p0_org_l, p1_org_l, q1_org_l,
                                  i16_negatetc_l, tc_l, p0_l, q0_l );
                }

                PCKEV_B2_UB( p0_l, p0_r, q0_l, q0_r, p0, q0 );

                p0_org = __msa_bmnz_v( p0_org, p0, is_less_than1 );
                q0_org = __msa_bmnz_v( q0_org, q0, is_less_than1 );
            }

            SLDI_B2_0_UB( p0_org, q0_org, p0_v_org, q0_v_org, 8 );
            ILVR_D2_SB( p0_v_org, p0_org, q0_v_org, q0_org, tmp0, tmp1 );
            ILVRL_B2_SB( tmp1, tmp0, tmp2, tmp3 );
            ILVRL_B2_SW( tmp3, tmp2, vec0, vec1 );
            ST4x8_UB( vec0, vec1, ( p_chroma - 2 ), u_img_width );
        }
    }
}

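/* Compute boundary strengths for the internal 4x4 edges of a macroblock:
 * bs = 2 where either neighbouring block has non-zero coefficients, else 1
 * where reference indices differ or the MV difference reaches 4 (x) or
 * i_mvy_limit (y), else 0. */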
static void avc_deblock_strength_msa( uint8_t *nnz,
1532
int8_t pi_ref[2][X264_SCAN8_LUMA_SIZE],
1533
int16_t pi_mv[2][X264_SCAN8_LUMA_SIZE][2],
1534
uint8_t pu_bs[2][8][4],
1535
int32_t i_mvy_limit )
1536
{
1537
uint32_t u_tmp;
1538
v16u8 nnz0, nnz1, nnz2, nnz3, nnz4;
1539
v16u8 nnz_mask, ref_mask, mask, one, two, dst = { 0 };
1540
v16i8 ref0, ref1, ref2, ref3, ref4;
1541
v16i8 temp_vec0, temp_vec1, temp_vec4, temp_vec5;
1542
v8i16 mv0, mv1, mv2, mv3, mv4, mv5, mv6, mv7, mv8, mv9, mv_a, mv_b;
1543
v8u16 four, mvy_limit_vec, sub0, sub1;
1544
1545
nnz0 = LD_UB( nnz + 4 );
1546
nnz2 = LD_UB( nnz + 20 );
1547
nnz4 = LD_UB( nnz + 36 );
1548
1549
ref0 = LD_SB( pi_ref[0] + 4 );
1550
ref2 = LD_SB( pi_ref[0] + 20 );
1551
ref4 = LD_SB( pi_ref[0] + 36 );
1552
1553
mv0 = LD_SH( ( pi_mv[0] + 4 )[0] );
1554
mv1 = LD_SH( ( pi_mv[0] + 12 )[0] );
1555
mv2 = LD_SH( ( pi_mv[0] + 20 )[0] );
1556
mv3 = LD_SH( ( pi_mv[0] + 28 )[0] );
1557
mv4 = LD_SH( ( pi_mv[0] + 36 )[0] );
1558
1559
mvy_limit_vec = ( v8u16 ) __msa_fill_h( i_mvy_limit );
1560
four = ( v8u16 ) __msa_fill_h( 4 );
1561
mask = ( v16u8 ) __msa_ldi_b( 0 );
1562
one = ( v16u8 ) __msa_ldi_b( 1 );
1563
two = ( v16u8 ) __msa_ldi_b( 2 );
1564
1565
mv5 = __msa_pckod_h( mv0, mv0 );
1566
mv6 = __msa_pckod_h( mv1, mv1 );
1567
mv_a = __msa_pckev_h( mv0, mv0 );
1568
mv_b = __msa_pckev_h( mv1, mv1 );
1569
nnz1 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz0, 2 );
1570
ref1 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref0, 2 );
1571
nnz_mask = nnz0 | nnz1;
1572
nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask );
1573
two = __msa_bmnz_v( two, mask, nnz_mask );
1574
1575
ref_mask = ( v16u8 ) __msa_ceq_b( ref0, ref1 );
1576
ref_mask = ref_mask ^ 255;
1577
1578
sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a );
1579
sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 );
1580
1581
sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 );
1582
sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 );
1583
1584
ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 );
1585
ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 );
1586
1587
dst = __msa_bmnz_v( dst, one, ref_mask );
1588
dst = __msa_bmnz_v( two, dst, nnz_mask );
1589
1590
u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 );
1591
SW( u_tmp, pu_bs[1][0] );
1592
1593
dst = ( v16u8 ) __msa_ldi_b( 0 );
1594
two = ( v16u8 ) __msa_ldi_b( 2 );
1595
1596
mv5 = __msa_pckod_h( mv1, mv1 );
1597
mv6 = __msa_pckod_h( mv2, mv2 );
1598
mv_a = __msa_pckev_h( mv1, mv1 );
1599
mv_b = __msa_pckev_h( mv2, mv2 );
1600
1601
nnz_mask = nnz2 | nnz1;
1602
nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask );
1603
two = __msa_bmnz_v( two, mask, nnz_mask );
1604
1605
ref_mask = ( v16u8 ) __msa_ceq_b( ref1, ref2 );
1606
ref_mask = ref_mask ^ 255;
1607
1608
sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a );
1609
sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 );
1610
sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 );
1611
sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 );
1612
1613
ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 );
1614
ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 );
1615
1616
dst = __msa_bmnz_v( dst, one, ref_mask );
1617
dst = __msa_bmnz_v( two, dst, nnz_mask );
1618
1619
u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 );
1620
SW( u_tmp, pu_bs[1][1] );
1621
1622
dst = ( v16u8 ) __msa_ldi_b( 0 );
1623
two = ( v16u8 ) __msa_ldi_b( 2 );
1624
1625
mv5 = __msa_pckod_h( mv2, mv2 );
1626
mv6 = __msa_pckod_h( mv3, mv3 );
1627
mv_a = __msa_pckev_h( mv2, mv2 );
1628
mv_b = __msa_pckev_h( mv3, mv3 );
1629
1630
nnz3 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz2, 2 );
1631
ref3 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref2, 2 );
1632
1633
nnz_mask = nnz3 | nnz2;
1634
nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask );
1635
two = __msa_bmnz_v( two, mask, nnz_mask );
1636
1637
ref_mask = ( v16u8 ) __msa_ceq_b( ref2, ref3 );
1638
ref_mask = ref_mask ^ 255;
1639
1640
sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a );
1641
sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 );
1642
1643
sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 );
1644
sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 );
1645
1646
ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 );
1647
ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 );
1648
1649
dst = __msa_bmnz_v( dst, one, ref_mask );
1650
dst = __msa_bmnz_v( two, dst, nnz_mask );
1651
1652
u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 );
1653
SW( u_tmp, pu_bs[1][2] );
1654
1655
dst = ( v16u8 ) __msa_ldi_b( 0 );
1656
two = ( v16u8 ) __msa_ldi_b( 2 );
1657
1658
mv5 = __msa_pckod_h( mv3, mv3 );
1659
mv6 = __msa_pckod_h( mv4, mv4 );
1660
mv_a = __msa_pckev_h( mv3, mv3 );
1661
mv_b = __msa_pckev_h( mv4, mv4 );
1662
1663
nnz_mask = nnz4 | nnz3;
1664
nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask );
1665
two = __msa_bmnz_v( two, mask, nnz_mask );
1666
1667
ref_mask = ( v16u8 ) __msa_ceq_b( ref3, ref4 );
1668
ref_mask = ref_mask ^ 255;
1669
1670
sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a );
1671
sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 );
1672
1673
sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 );
1674
sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 );
1675
1676
ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 );
1677
ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 );
1678
1679
dst = __msa_bmnz_v( dst, one, ref_mask );
1680
dst = __msa_bmnz_v( two, dst, nnz_mask );
1681
1682
u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 );
1683
SW( u_tmp, pu_bs[1][3] );
    nnz0 = LD_UB( nnz + 8 );
    nnz2 = LD_UB( nnz + 24 );

    ref0 = LD_SB( pi_ref[0] + 8 );
    ref2 = LD_SB( pi_ref[0] + 24 );

    mv0 = LD_SH( ( pi_mv[0] + 8 )[0] );
    mv1 = LD_SH( ( pi_mv[0] + 12 )[0] );
    mv2 = LD_SH( ( pi_mv[0] + 16 )[0] );
    mv3 = LD_SH( ( pi_mv[0] + 20 )[0] );
    mv4 = LD_SH( ( pi_mv[0] + 24 )[0] );
    mv7 = LD_SH( ( pi_mv[0] + 28 )[0] );
    mv8 = LD_SH( ( pi_mv[0] + 32 )[0] );
    mv9 = LD_SH( ( pi_mv[0] + 36 )[0] );

    nnz1 = ( v16u8 ) __msa_splati_d( ( v2i64 ) nnz0, 1 );
    nnz3 = ( v16u8 ) __msa_splati_d( ( v2i64 ) nnz2, 1 );

    ILVR_B2_SB( nnz2, nnz0, nnz3, nnz1, temp_vec0, temp_vec1 );

    ILVRL_B2_SB( temp_vec1, temp_vec0, temp_vec5, temp_vec4 );

    nnz0 = ( v16u8 ) __msa_splati_w( ( v4i32 ) temp_vec5, 3 );
    nnz1 = ( v16u8 ) temp_vec4;
    nnz2 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz1, 1 );
    nnz3 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz1, 2 );
    nnz4 = ( v16u8 ) __msa_splati_w( ( v4i32 ) nnz1, 3 );

    ref1 = ( v16i8 ) __msa_splati_d( ( v2i64 ) ref0, 1 );
    ref3 = ( v16i8 ) __msa_splati_d( ( v2i64 ) ref2, 1 );

    ILVR_B2_SB( ref2, ref0, ref3, ref1, temp_vec0, temp_vec1 );

    ILVRL_B2_SB( temp_vec1, temp_vec0, temp_vec5, ref1 );

    ref0 = ( v16i8 ) __msa_splati_w( ( v4i32 ) temp_vec5, 3 );

    ref2 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref1, 1 );
    ref3 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref1, 2 );
    ref4 = ( v16i8 ) __msa_splati_w( ( v4i32 ) ref1, 3 );

    TRANSPOSE8X4_SH_SH( mv0, mv2, mv4, mv8, mv5, mv5, mv5, mv0 );
    TRANSPOSE8X4_SH_SH( mv1, mv3, mv7, mv9, mv1, mv2, mv3, mv4 );

    mvy_limit_vec = ( v8u16 ) __msa_fill_h( i_mvy_limit );
    four = ( v8u16 ) __msa_fill_h( 4 );
    mask = ( v16u8 ) __msa_ldi_b( 0 );
    one = ( v16u8 ) __msa_ldi_b( 1 );
    two = ( v16u8 ) __msa_ldi_b( 2 );
    dst = ( v16u8 ) __msa_ldi_b( 0 );
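
    /* For each 4-sample edge the boundary strength follows the same rule as
     * the scalar fallback below: bS = 2 where either side has non-zero
     * coefficients, bS = 1 where the reference indices differ or a motion
     * vector component differs by >= 4 (x) / i_mvy_limit (y), bS = 0
     * otherwise. */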
    mv5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv0, 1 );
    mv6 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv1, 1 );
    mv_a = mv0;
    mv_b = mv1;

    nnz_mask = nnz0 | nnz1;
    nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask );
    two = __msa_bmnz_v( two, mask, nnz_mask );

    ref_mask = ( v16u8 ) __msa_ceq_b( ref0, ref1 );
    ref_mask = ref_mask ^ 255;

    sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a );
    sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 );

    sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 );
    sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 );

    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 );
    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 );

    dst = __msa_bmnz_v( dst, one, ref_mask );
    dst = __msa_bmnz_v( two, dst, nnz_mask );

    u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 );
    SW( u_tmp, pu_bs[0][0] );

    two = ( v16u8 ) __msa_ldi_b( 2 );
    dst = ( v16u8 ) __msa_ldi_b( 0 );

    mv5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv1, 1 );
    mv6 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv2, 1 );
    mv_a = mv1;
    mv_b = mv2;

    nnz_mask = nnz1 | nnz2;
    nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask );
    two = __msa_bmnz_v( two, mask, nnz_mask );

    ref_mask = ( v16u8 ) __msa_ceq_b( ref1, ref2 );
    ref_mask = ref_mask ^ 255;

    sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a );
    sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 );
    sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 );
    sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 );

    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 );
    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 );

    dst = __msa_bmnz_v( dst, one, ref_mask );
    dst = __msa_bmnz_v( two, dst, nnz_mask );

    u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 );
    SW( u_tmp, pu_bs[0][1] );

    two = ( v16u8 ) __msa_ldi_b( 2 );
    dst = ( v16u8 ) __msa_ldi_b( 0 );

    mv5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv2, 1 );
    mv6 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv3, 1 );
    mv_a = mv2;
    mv_b = mv3;

    nnz_mask = nnz2 | nnz3;
    nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask );
    two = __msa_bmnz_v( two, mask, nnz_mask );

    ref_mask = ( v16u8 ) __msa_ceq_b( ref2, ref3 );
    ref_mask = ref_mask ^ 255;

    sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a );
    sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 );
    sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 );
    sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 );

    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 );
    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 );

    dst = __msa_bmnz_v( dst, one, ref_mask );
    dst = __msa_bmnz_v( two, dst, nnz_mask );

    u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 );
    SW( u_tmp, pu_bs[0][2] );

    two = ( v16u8 ) __msa_ldi_b( 2 );
    dst = ( v16u8 ) __msa_ldi_b( 0 );

    mv5 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv3, 1 );
    mv6 = ( v8i16 ) __msa_splati_d( ( v2i64 ) mv4, 1 );
    mv_a = mv3;
    mv_b = mv4;

    nnz_mask = nnz3 | nnz4;
    nnz_mask = ( v16u8 ) __msa_ceq_b( ( v16i8 ) mask, ( v16i8 ) nnz_mask );
    two = __msa_bmnz_v( two, mask, nnz_mask );

    ref_mask = ( v16u8 ) __msa_ceq_b( ref3, ref4 );
    ref_mask = ref_mask ^ 255;

    sub0 = ( v8u16 ) __msa_asub_s_h( mv_b, mv_a );
    sub1 = ( v8u16 ) __msa_asub_s_h( mv6, mv5 );
    sub0 = ( v8u16 ) __msa_cle_u_h( four, sub0 );
    sub1 = ( v8u16 ) __msa_cle_u_h( mvy_limit_vec, sub1 );

    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub0, ( v16i8 ) sub0 );
    ref_mask |= ( v16u8 ) __msa_pckev_b( ( v16i8 ) sub1, ( v16i8 ) sub1 );

    dst = __msa_bmnz_v( dst, one, ref_mask );
    dst = __msa_bmnz_v( two, dst, nnz_mask );

    u_tmp = __msa_copy_u_w( ( v4i32 ) dst, 0 );
    SW( u_tmp, pu_bs[0][3] );
}
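
/* Thin wrappers that adapt x264's deblocking entry points to the MSA
 * intra loop-filter kernels defined earlier in this file. */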
void x264_deblock_v_luma_intra_msa( uint8_t *p_pix, intptr_t i_stride,
                                    int32_t i_alpha, int32_t i_beta )
{
    avc_loopfilter_luma_intra_edge_hor_msa( p_pix, ( uint8_t ) i_alpha,
                                            ( uint8_t ) i_beta, i_stride );
}

void x264_deblock_h_luma_intra_msa( uint8_t *p_pix, intptr_t i_stride,
                                    int32_t i_alpha, int32_t i_beta )
{
    avc_loopfilter_luma_intra_edge_ver_msa( p_pix, ( uint8_t ) i_alpha,
                                            ( uint8_t ) i_beta, i_stride );
}

void x264_deblock_v_chroma_intra_msa( uint8_t *p_pix, intptr_t i_stride,
                                      int32_t i_alpha, int32_t i_beta )
{
    avc_lpf_cbcr_interleaved_intra_edge_hor_msa( p_pix, ( uint8_t ) i_alpha,
                                                 ( uint8_t ) i_beta, i_stride );
}

void x264_deblock_h_chroma_intra_msa( uint8_t *p_pix, intptr_t i_stride,
                                      int32_t i_alpha, int32_t i_beta )
{
    avc_lpf_cbcr_interleaved_intra_edge_ver_msa( p_pix, ( uint8_t ) i_alpha,
                                                 ( uint8_t ) i_beta, i_stride );
}
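
/* For the inter edges the caller supplies tc0 per 4-sample segment; a
 * negative tc0 means the segment is not filtered, which is passed on to the
 * kernels here as boundary strength 0. */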
void x264_deblock_h_luma_msa( uint8_t *p_pix, intptr_t i_stride,
                              int32_t i_alpha, int32_t i_beta, int8_t *p_tc0 )
{
    uint8_t u_bs0 = 1;
    uint8_t u_bs1 = 1;
    uint8_t u_bs2 = 1;
    uint8_t u_bs3 = 1;

    if( p_tc0[0] < 0 ) u_bs0 = 0;
    if( p_tc0[1] < 0 ) u_bs1 = 0;
    if( p_tc0[2] < 0 ) u_bs2 = 0;
    if( p_tc0[3] < 0 ) u_bs3 = 0;

    avc_loopfilter_luma_inter_edge_ver_msa( p_pix,
                                            u_bs0, u_bs1, u_bs2, u_bs3,
                                            p_tc0[0], p_tc0[1], p_tc0[2],
                                            p_tc0[3], i_alpha, i_beta,
                                            i_stride );
}

void x264_deblock_v_luma_msa( uint8_t *p_pix, intptr_t i_stride,
                              int32_t i_alpha, int32_t i_beta, int8_t *p_tc0 )
{
    uint8_t u_bs0 = 1;
    uint8_t u_bs1 = 1;
    uint8_t u_bs2 = 1;
    uint8_t u_bs3 = 1;

    if( p_tc0[0] < 0 ) u_bs0 = 0;
    if( p_tc0[1] < 0 ) u_bs1 = 0;
    if( p_tc0[2] < 0 ) u_bs2 = 0;
    if( p_tc0[3] < 0 ) u_bs3 = 0;

    avc_loopfilter_luma_inter_edge_hor_msa( p_pix,
                                            u_bs0, u_bs1, u_bs2, u_bs3,
                                            p_tc0[0], p_tc0[1], p_tc0[2],
                                            p_tc0[3], i_alpha, i_beta,
                                            i_stride );
}

void x264_deblock_v_chroma_msa( uint8_t *p_pix, intptr_t i_stride,
                                int32_t i_alpha, int32_t i_beta, int8_t *p_tc0 )
{
    uint8_t u_bs0 = 1;
    uint8_t u_bs1 = 1;
    uint8_t u_bs2 = 1;
    uint8_t u_bs3 = 1;

    if( p_tc0[0] < 0 ) u_bs0 = 0;
    if( p_tc0[1] < 0 ) u_bs1 = 0;
    if( p_tc0[2] < 0 ) u_bs2 = 0;
    if( p_tc0[3] < 0 ) u_bs3 = 0;

    avc_lpf_cbcr_interleaved_inter_edge_hor_msa( p_pix,
                                                 u_bs0, u_bs1, u_bs2, u_bs3,
                                                 p_tc0[0], p_tc0[1], p_tc0[2],
                                                 p_tc0[3], i_alpha, i_beta,
                                                 i_stride );
}

void x264_deblock_h_chroma_msa( uint8_t *p_pix, intptr_t i_stride,
                                int32_t i_alpha, int32_t i_beta, int8_t *p_tc0 )
{
    uint8_t u_bs0 = 1;
    uint8_t u_bs1 = 1;
    uint8_t u_bs2 = 1;
    uint8_t u_bs3 = 1;

    if( p_tc0[0] < 0 ) u_bs0 = 0;
    if( p_tc0[1] < 0 ) u_bs1 = 0;
    if( p_tc0[2] < 0 ) u_bs2 = 0;
    if( p_tc0[3] < 0 ) u_bs3 = 0;

    avc_lpf_cbcr_interleaved_inter_edge_ver_msa( p_pix,
                                                 u_bs0, u_bs1, u_bs2, u_bs3,
                                                 p_tc0[0], p_tc0[1], p_tc0[2],
                                                 p_tc0[3], i_alpha, i_beta,
                                                 i_stride );
}
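
/* Boundary-strength computation. The MSA routine above only covers the
 * non-B-frame (list-0) case, so B-frames take the scalar fallback, which
 * additionally compares the list-1 references and motion vectors. */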
void x264_deblock_strength_msa( uint8_t u_nnz[X264_SCAN8_SIZE],
                                int8_t pi_ref[2][X264_SCAN8_LUMA_SIZE],
                                int16_t pi_mv[2][X264_SCAN8_LUMA_SIZE][2],
                                uint8_t pu_bs[2][8][4], int32_t i_mvy_limit,
                                int32_t i_bframe )
{
    if( i_bframe )
    {
        for( int32_t i_dir = 0; i_dir < 2; i_dir++ )
        {
            int32_t s1 = i_dir ? 1 : 8;
            int32_t s2 = i_dir ? 8 : 1;

            for( int32_t i_edge = 0; i_edge < 4; i_edge++ )
            {
                for( int32_t i = 0, loc = X264_SCAN8_0 + i_edge * s2; i < 4;
                     i++, loc += s1 )
                {
                    int32_t locn = loc - s2;
                    if( u_nnz[loc] || u_nnz[locn] )
                    {
                        pu_bs[i_dir][i_edge][i] = 2;
                    }
                    else if( pi_ref[0][loc] != pi_ref[0][locn] ||
                             abs( pi_mv[0][loc][0] -
                                  pi_mv[0][locn][0] ) >= 4 ||
                             abs( pi_mv[0][loc][1] -
                                  pi_mv[0][locn][1] ) >= i_mvy_limit ||
                             ( i_bframe &&
                               ( pi_ref[1][loc] != pi_ref[1][locn] ||
                                 abs( pi_mv[1][loc][0] -
                                      pi_mv[1][locn][0] ) >= 4 ||
                                 abs( pi_mv[1][loc][1] -
                                      pi_mv[1][locn][1] ) >= i_mvy_limit ) )
                           )
                    {
                        pu_bs[i_dir][i_edge][i] = 1;
                    }
                    else
                    {
                        pu_bs[i_dir][i_edge][i] = 0;
                    }
                }
            }
        }
    }
    else
    {
        avc_deblock_strength_msa( u_nnz, pi_ref, pi_mv, pu_bs, i_mvy_limit );
    }
}
#endif