/*****************************************************************************
 * quant-c.c: msa quantization and level-run
 *****************************************************************************
 * Copyright (C) 2015-2016 x264 project
 *
 * Authors: Rishikesh More <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at [email protected].
 *****************************************************************************/

#include "common/common.h"
#include "macros.h"

#if !HIGH_BIT_DEPTH
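/* Dequantize a 4x4 block of int16_t coefficients in place.  A scalar sketch
 * of what the vector code below computes, with i_mf = i_qp % 6 and
 * q_bits = i_qp / 6 - 4:
 *
 *   if( q_bits >= 0 )
 *       dct[i] = ( dct[i] * pi_dequant_mf[i_mf][i] ) << q_bits;
 *   else
 *       dct[i] = ( dct[i] * pi_dequant_mf[i_mf][i]
 *                  + ( 1 << ( -q_bits - 1 ) ) ) >> ( -q_bits );
 *
 * The q_bits >= 0 path multiplies in 16-bit lanes; the negative path widens
 * to 32-bit lanes so the rounding add and arithmetic shift do not overflow. */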
static void avc_dequant_4x4_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16],
                                 int32_t i_qp )
{
    const int32_t i_mf = i_qp % 6;
    const int32_t q_bits = i_qp / 6 - 4;
    v8i16 dct0, dct1;
    v4i32 dequant_m_f0, dequant_m_f1, dequant_m_f2, dequant_m_f3;

    LD_SH2( p_dct, 8, dct0, dct1 );

    LD_SW2( pi_dequant_mf[i_mf], 4, dequant_m_f0, dequant_m_f1 );
    LD_SW2( pi_dequant_mf[i_mf] + 8, 4, dequant_m_f2, dequant_m_f3 );

    if ( q_bits >= 0 )
    {
        v8i16 dequant_mf_h0, dequant_mf_h1, q_bits_vec;

        q_bits_vec = __msa_fill_h( q_bits );

        PCKEV_H2_SH( dequant_m_f1, dequant_m_f0, dequant_m_f3, dequant_m_f2,
                     dequant_mf_h0, dequant_mf_h1 );

        dct0 *= dequant_mf_h0;
        dct1 *= dequant_mf_h1;
        dct0 <<= q_bits_vec;
        dct1 <<= q_bits_vec;
        ST_SH2( dct0, dct1, p_dct, 8 );
    }
    else
    {
        const int32_t q_bits_add = 1 << ( -q_bits - 1 );
        v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
        v4i32 q_bits_vec, q_bits_vec_add;

        q_bits_vec_add = __msa_fill_w( q_bits_add );
        q_bits_vec = __msa_fill_w( -q_bits );

        UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
        UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );

        dct_signed_w0 *= dequant_m_f0;
        dct_signed_w1 *= dequant_m_f1;
        dct_signed_w2 *= dequant_m_f2;
        dct_signed_w3 *= dequant_m_f3;
        dct_signed_w0 += q_bits_vec_add;
        dct_signed_w1 += q_bits_vec_add;
        dct_signed_w2 += q_bits_vec_add;
        dct_signed_w3 += q_bits_vec_add;

        SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3,
                q_bits_vec );
        PCKEV_H2_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3, dct_signed_w2,
                     dct0, dct1 );
        ST_SH2( dct0, dct1, p_dct, 8 );
    }
}

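/* 8x8 variant of the dequant above: same per-coefficient recurrence, but with
 * q_bits = i_qp / 6 - 6 and a 64-entry dequant table, processed as eight rows
 * of eight 16-bit coefficients. */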
static void avc_dequant_8x8_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][64],
                                 int32_t i_qp )
{
    const int32_t i_mf = i_qp % 6;
    const int32_t q_bits = i_qp / 6 - 6;
    v8i16 dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7;
    v4i32 dequant_m_f0, dequant_m_f1, dequant_m_f2, dequant_m_f3;
    v4i32 dequant_m_f4, dequant_m_f5, dequant_m_f6, dequant_m_f7;
    v4i32 dequant_m_f8, dequant_m_f9, dequant_m_f10, dequant_m_f11;
    v4i32 dequant_m_f12, dequant_m_f13, dequant_m_f14, dequant_m_f15;

    LD_SH8( p_dct, 8, dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7 );

    LD_SW2( pi_dequant_mf[i_mf], 4, dequant_m_f0, dequant_m_f1 );
    LD_SW2( pi_dequant_mf[i_mf] + 8, 4, dequant_m_f2, dequant_m_f3 );
    LD_SW2( pi_dequant_mf[i_mf] + 16, 4, dequant_m_f4, dequant_m_f5 );
    LD_SW2( pi_dequant_mf[i_mf] + 24, 4, dequant_m_f6, dequant_m_f7 );
    LD_SW2( pi_dequant_mf[i_mf] + 32, 4, dequant_m_f8, dequant_m_f9 );
    LD_SW2( pi_dequant_mf[i_mf] + 40, 4, dequant_m_f10, dequant_m_f11 );
    LD_SW2( pi_dequant_mf[i_mf] + 48, 4, dequant_m_f12, dequant_m_f13 );
    LD_SW2( pi_dequant_mf[i_mf] + 56, 4, dequant_m_f14, dequant_m_f15 );

    if ( q_bits >= 0 )
    {
        v8i16 q_bits_vec;
        v8i16 dequant_mf_h0, dequant_mf_h1, dequant_mf_h2, dequant_mf_h3;
        v8i16 dequant_mf_h4, dequant_mf_h5, dequant_mf_h6, dequant_mf_h7;

        q_bits_vec = __msa_fill_h( q_bits );

        PCKEV_H4_SH( dequant_m_f1, dequant_m_f0, dequant_m_f3, dequant_m_f2,
                     dequant_m_f5, dequant_m_f4, dequant_m_f7, dequant_m_f6,
                     dequant_mf_h0, dequant_mf_h1,
                     dequant_mf_h2, dequant_mf_h3 );
        PCKEV_H4_SH( dequant_m_f9, dequant_m_f8, dequant_m_f11, dequant_m_f10,
                     dequant_m_f13, dequant_m_f12, dequant_m_f15, dequant_m_f14,
                     dequant_mf_h4, dequant_mf_h5,
                     dequant_mf_h6, dequant_mf_h7 );

        dct0 *= dequant_mf_h0;
        dct1 *= dequant_mf_h1;
        dct2 *= dequant_mf_h2;
        dct3 *= dequant_mf_h3;
        dct4 *= dequant_mf_h4;
        dct5 *= dequant_mf_h5;
        dct6 *= dequant_mf_h6;
        dct7 *= dequant_mf_h7;

        SLLI_4V( dct0, dct1, dct2, dct3, q_bits_vec );
        SLLI_4V( dct4, dct5, dct6, dct7, q_bits_vec );

        ST_SH8( dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7, p_dct, 8 );
    }
    else
    {
        const int32_t q_bits_add = 1 << ( -q_bits - 1 );
        v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
        v4i32 dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7;
        v4i32 dct_signed_w8, dct_signed_w9, dct_signed_w10, dct_signed_w11;
        v4i32 dct_signed_w12, dct_signed_w13, dct_signed_w14, dct_signed_w15;
        v4i32 q_bits_vec, q_bits_vec_add;

        q_bits_vec_add = __msa_fill_w( q_bits_add );
        q_bits_vec = __msa_fill_w( -q_bits );

        UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
        UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
        UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 );
        UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 );
        UNPCK_SH_SW( dct4, dct_signed_w8, dct_signed_w9 );
        UNPCK_SH_SW( dct5, dct_signed_w10, dct_signed_w11 );
        UNPCK_SH_SW( dct6, dct_signed_w12, dct_signed_w13 );
        UNPCK_SH_SW( dct7, dct_signed_w14, dct_signed_w15 );

        dct_signed_w0 *= dequant_m_f0;
        dct_signed_w1 *= dequant_m_f1;
        dct_signed_w2 *= dequant_m_f2;
        dct_signed_w3 *= dequant_m_f3;
        dct_signed_w4 *= dequant_m_f4;
        dct_signed_w5 *= dequant_m_f5;
        dct_signed_w6 *= dequant_m_f6;
        dct_signed_w7 *= dequant_m_f7;
        dct_signed_w8 *= dequant_m_f8;
        dct_signed_w9 *= dequant_m_f9;
        dct_signed_w10 *= dequant_m_f10;
        dct_signed_w11 *= dequant_m_f11;
        dct_signed_w12 *= dequant_m_f12;
        dct_signed_w13 *= dequant_m_f13;
        dct_signed_w14 *= dequant_m_f14;
        dct_signed_w15 *= dequant_m_f15;

        dct_signed_w0 += q_bits_vec_add;
        dct_signed_w1 += q_bits_vec_add;
        dct_signed_w2 += q_bits_vec_add;
        dct_signed_w3 += q_bits_vec_add;
        dct_signed_w4 += q_bits_vec_add;
        dct_signed_w5 += q_bits_vec_add;
        dct_signed_w6 += q_bits_vec_add;
        dct_signed_w7 += q_bits_vec_add;
        dct_signed_w8 += q_bits_vec_add;
        dct_signed_w9 += q_bits_vec_add;
        dct_signed_w10 += q_bits_vec_add;
        dct_signed_w11 += q_bits_vec_add;
        dct_signed_w12 += q_bits_vec_add;
        dct_signed_w13 += q_bits_vec_add;
        dct_signed_w14 += q_bits_vec_add;
        dct_signed_w15 += q_bits_vec_add;

        SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3,
                q_bits_vec );
        SRA_4V( dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7,
                q_bits_vec );
        SRA_4V( dct_signed_w8, dct_signed_w9, dct_signed_w10, dct_signed_w11,
                q_bits_vec );
        SRA_4V( dct_signed_w12, dct_signed_w13, dct_signed_w14, dct_signed_w15,
                q_bits_vec );
        PCKEV_H4_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3, dct_signed_w2,
                     dct_signed_w5, dct_signed_w4, dct_signed_w7, dct_signed_w6,
                     dct0, dct1, dct2, dct3 );
        PCKEV_H4_SH( dct_signed_w9, dct_signed_w8, dct_signed_w11,
                     dct_signed_w10, dct_signed_w13, dct_signed_w12,
                     dct_signed_w15, dct_signed_w14, dct4, dct5, dct6, dct7 );
        ST_SH8( dct0, dct1, dct2, dct3, dct4, dct5, dct6, dct7, p_dct, 8 );
    }
}

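/* Dequantize the 4x4 DC block: every coefficient uses the single scale
 * pi_dequant_mf[i_qp % 6][0] with q_bits = i_qp / 6 - 6, so the scale (or the
 * rounding constant) is splatted across the vector once. */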
static void avc_dequant_4x4_dc_msa( int16_t *p_dct,
                                    int32_t pi_dequant_mf[6][16],
                                    int32_t i_qp )
{
    const int32_t q_bits = i_qp / 6 - 6;
    int32_t i_dmf = pi_dequant_mf[i_qp % 6][0];
    v8i16 dct0, dct1, dequant_mf_h;

    LD_SH2( p_dct, 8, dct0, dct1 );

    if ( q_bits >= 0 )
    {
        i_dmf <<= q_bits;

        dequant_mf_h = __msa_fill_h( i_dmf );
        dct0 = dct0 * dequant_mf_h;
        dct1 = dct1 * dequant_mf_h;

        ST_SH2( dct0, dct1, p_dct, 8 );
    }
    else
    {
        const int32_t q_bits_add = 1 << ( -q_bits - 1 );
        v4i32 dequant_m_f, q_bits_vec, q_bits_vec_add;
        v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;

        q_bits_vec_add = __msa_fill_w( q_bits_add );
        q_bits_vec = __msa_fill_w( -q_bits );

        dequant_m_f = __msa_fill_w( i_dmf );

        UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
        UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );

        dct_signed_w0 *= dequant_m_f;
        dct_signed_w1 *= dequant_m_f;
        dct_signed_w2 *= dequant_m_f;
        dct_signed_w3 *= dequant_m_f;

        dct_signed_w0 += q_bits_vec_add;
        dct_signed_w1 += q_bits_vec_add;
        dct_signed_w2 += q_bits_vec_add;
        dct_signed_w3 += q_bits_vec_add;

        SRA_4V( dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3,
                q_bits_vec );
        PCKEV_H2_SH( dct_signed_w1, dct_signed_w0, dct_signed_w3, dct_signed_w2,
                     dct0, dct1 );
        ST_SH2( dct0, dct1, p_dct, 8 );
    }
}

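/* Quantize a 4x4 block in place and report whether any coefficient survived.
 * Per coefficient this is, in effect, the usual quant step (scalar sketch):
 *
 *   dct[i] = sign( dct[i] ) * ( ( abs( dct[i] ) + p_bias[i] ) * p_mf[i] >> 16 );
 *
 * __msa_add_a_w adds absolute values, giving abs(coef) + bias (the bias is
 * non-negative); the 32-bit products are shifted right by 16 and repacked,
 * and the sign is restored with a mask of the originally non-positive lanes.
 * The return value is reduced to 0/1. */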
static int32_t avc_quant_4x4_msa( int16_t *p_dct, uint16_t *p_mf,
                                  uint16_t *p_bias )
{
    int32_t non_zero = 0;
    v8i16 dct0, dct1;
    v8i16 zero = { 0 };
    v8i16 dct0_mask, dct1_mask;
    v8i16 dct_h0, dct_h1, mf_h0, mf_h1, bias_h0, bias_h1;
    v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
    v4i32 dct_w0, dct_w1, dct_w2, dct_w3;
    v4i32 mf_vec0, mf_vec1, mf_vec2, mf_vec3;
    v4i32 bias0, bias1, bias2, bias3;

    LD_SH2( p_dct, 8, dct0, dct1 );
    LD_SH2( p_bias, 8, bias_h0, bias_h1 );
    LD_SH2( p_mf, 8, mf_h0, mf_h1 );

    dct0_mask = __msa_clei_s_h( dct0, 0 );
    dct1_mask = __msa_clei_s_h( dct1, 0 );

    UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
    UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
    ILVR_H2_SW( zero, bias_h0, zero, bias_h1, bias0, bias2 );
    ILVL_H2_SW( zero, bias_h0, zero, bias_h1, bias1, bias3 );
    ILVR_H2_SW( zero, mf_h0, zero, mf_h1, mf_vec0, mf_vec2 );
    ILVL_H2_SW( zero, mf_h0, zero, mf_h1, mf_vec1, mf_vec3 );

    dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 );
    dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 );
    dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 );
    dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 );

    dct_w0 *= mf_vec0;
    dct_w1 *= mf_vec1;
    dct_w2 *= mf_vec2;
    dct_w3 *= mf_vec3;

    SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 );
    PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 );

    dct0 = zero - dct_h0;
    dct1 = zero - dct_h1;

    dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0, ( v16u8 ) dct0,
                                   ( v16u8 ) dct0_mask );
    dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1, ( v16u8 ) dct1,
                                   ( v16u8 ) dct1_mask );
    non_zero = HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 ) );
    ST_SH2( dct0, dct1, p_dct, 8 );

    return !!non_zero;
}

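/* 8x8 quantization: the same per-coefficient step as avc_quant_4x4_msa,
 * applied to 64 coefficients in two passes of 32 (the second pass reloads
 * p_dct, p_mf and p_bias at offset +32). The non-zero sums of both passes
 * are accumulated before the final 0/1 reduction. */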
static int32_t avc_quant_8x8_msa( int16_t *p_dct, uint16_t *p_mf,
                                  uint16_t *p_bias )
{
    int32_t non_zero = 0;
    v8i16 dct0, dct1, dct2, dct3;
    v8i16 zero = { 0 };
    v8i16 dct0_mask, dct1_mask, dct2_mask, dct3_mask;
    v8i16 dct_h0, dct_h1, dct_h2, dct_h3, mf_h0, mf_h1, mf_h2, mf_h3;
    v8i16 bias_h0, bias_h1, bias_h2, bias_h3;
    v4i32 dct_w0, dct_w1, dct_w2, dct_w3, dct_w4, dct_w5, dct_w6, dct_w7;
    v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
    v4i32 dct_signed_w4, dct_signed_w5, dct_signed_w6, dct_signed_w7;
    v4i32 mf_vec0, mf_vec1, mf_vec2, mf_vec3;
    v4i32 mf_vec4, mf_vec5, mf_vec6, mf_vec7;
    v4i32 bias0, bias1, bias2, bias3, bias4, bias5, bias6, bias7;

    LD_SH4( p_dct, 8, dct0, dct1, dct2, dct3 );

    dct0_mask = __msa_clei_s_h( dct0, 0 );
    dct1_mask = __msa_clei_s_h( dct1, 0 );
    dct2_mask = __msa_clei_s_h( dct2, 0 );
    dct3_mask = __msa_clei_s_h( dct3, 0 );

    UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
    UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
    UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 );
    UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 );
    LD_SH4( p_bias, 8, bias_h0, bias_h1, bias_h2, bias_h3 );
    ILVR_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3,
                bias0, bias2, bias4, bias6 );
    ILVL_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3,
                bias1, bias3, bias5, bias7 );
    LD_SH4( p_mf, 8, mf_h0, mf_h1, mf_h2, mf_h3 );
    ILVR_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3,
                mf_vec0, mf_vec2, mf_vec4, mf_vec6 );
    ILVL_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3,
                mf_vec1, mf_vec3, mf_vec5, mf_vec7 );

    dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 );
    dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 );
    dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 );
    dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 );
    dct_w4 = __msa_add_a_w( dct_signed_w4, bias4 );
    dct_w5 = __msa_add_a_w( dct_signed_w5, bias5 );
    dct_w6 = __msa_add_a_w( dct_signed_w6, bias6 );
    dct_w7 = __msa_add_a_w( dct_signed_w7, bias7 );

    dct_w0 *= mf_vec0;
    dct_w1 *= mf_vec1;
    dct_w2 *= mf_vec2;
    dct_w3 *= mf_vec3;
    dct_w4 *= mf_vec4;
    dct_w5 *= mf_vec5;
    dct_w6 *= mf_vec6;
    dct_w7 *= mf_vec7;

    SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 );
    SRA_4V( dct_w4, dct_w5, dct_w6, dct_w7, 16 );
    PCKEV_H4_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_w5, dct_w4, dct_w7, dct_w6,
                 dct_h0, dct_h1, dct_h2, dct_h3 );
    SUB4( zero, dct_h0, zero, dct_h1, zero, dct_h2, zero, dct_h3,
          dct0, dct1, dct2, dct3 );

    dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0,
                                   ( v16u8 ) dct0, ( v16u8 ) dct0_mask );
    dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1,
                                   ( v16u8 ) dct1, ( v16u8 ) dct1_mask );
    dct2 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h2,
                                   ( v16u8 ) dct2, ( v16u8 ) dct2_mask );
    dct3 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h3,
                                   ( v16u8 ) dct3, ( v16u8 ) dct3_mask );

    non_zero = HADD_SW_S32( ( v4u32 )( dct_h0 + dct_h1 + dct_h2 + dct_h3 ) );
    ST_SH4( dct0, dct1, dct2, dct3, p_dct, 8 );
    LD_SH4( p_dct + 32, 8, dct0, dct1, dct2, dct3 );

    dct0_mask = __msa_clei_s_h( dct0, 0 );
    dct1_mask = __msa_clei_s_h( dct1, 0 );
    dct2_mask = __msa_clei_s_h( dct2, 0 );
    dct3_mask = __msa_clei_s_h( dct3, 0 );

    UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
    UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );
    UNPCK_SH_SW( dct2, dct_signed_w4, dct_signed_w5 );
    UNPCK_SH_SW( dct3, dct_signed_w6, dct_signed_w7 );
    LD_SH4( p_bias + 32, 8, bias_h0, bias_h1, bias_h2, bias_h3 );
    ILVR_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3,
                bias0, bias2, bias4, bias6 );
    ILVL_H4_SW( zero, bias_h0, zero, bias_h1, zero, bias_h2, zero, bias_h3,
                bias1, bias3, bias5, bias7 );
    LD_SH4( p_mf + 32, 8, mf_h0, mf_h1, mf_h2, mf_h3 );
    ILVR_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3,
                mf_vec0, mf_vec2, mf_vec4, mf_vec6 );
    ILVL_H4_SW( zero, mf_h0, zero, mf_h1, zero, mf_h2, zero, mf_h3,
                mf_vec1, mf_vec3, mf_vec5, mf_vec7 );

    dct_w0 = __msa_add_a_w( dct_signed_w0, bias0 );
    dct_w1 = __msa_add_a_w( dct_signed_w1, bias1 );
    dct_w2 = __msa_add_a_w( dct_signed_w2, bias2 );
    dct_w3 = __msa_add_a_w( dct_signed_w3, bias3 );
    dct_w4 = __msa_add_a_w( dct_signed_w4, bias4 );
    dct_w5 = __msa_add_a_w( dct_signed_w5, bias5 );
    dct_w6 = __msa_add_a_w( dct_signed_w6, bias6 );
    dct_w7 = __msa_add_a_w( dct_signed_w7, bias7 );

    dct_w0 *= mf_vec0;
    dct_w1 *= mf_vec1;
    dct_w2 *= mf_vec2;
    dct_w3 *= mf_vec3;
    dct_w4 *= mf_vec4;
    dct_w5 *= mf_vec5;
    dct_w6 *= mf_vec6;
    dct_w7 *= mf_vec7;

    SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 );
    SRA_4V( dct_w4, dct_w5, dct_w6, dct_w7, 16 );
    PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 );
    PCKEV_H2_SH( dct_w5, dct_w4, dct_w7, dct_w6, dct_h2, dct_h3 );
    SUB4( zero, dct_h0, zero, dct_h1, zero, dct_h2, zero, dct_h3,
          dct0, dct1, dct2, dct3 );

    dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0,
                                   ( v16u8 ) dct0, ( v16u8 ) dct0_mask );
    dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1,
                                   ( v16u8 ) dct1, ( v16u8 ) dct1_mask );
    dct2 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h2,
                                   ( v16u8 ) dct2, ( v16u8 ) dct2_mask );
    dct3 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h3,
                                   ( v16u8 ) dct3, ( v16u8 ) dct3_mask );

    non_zero += HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 + dct_h2 + dct_h3 ) );
    ST_SH4( dct0, dct1, dct2, dct3, p_dct + 32, 8 );

    return !!non_zero;
}

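/* DC quantization: the same per-coefficient step, but with a single scalar
 * multiplier/bias pair (i_mf, i_bias) splatted across all lanes. */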
static int32_t avc_quant_4x4_dc_msa( int16_t *p_dct, int32_t i_mf,
                                     int32_t i_bias )
{
    int32_t non_zero = 0;
    v8i16 dct0, dct1, dct0_mask, dct1_mask;
    v8i16 zero = { 0 };
    v8i16 dct_h0, dct_h1;
    v4i32 dct_signed_w0, dct_signed_w1, dct_signed_w2, dct_signed_w3;
    v4i32 dct_w0, dct_w1, dct_w2, dct_w3;
    v4i32 mf_vec, bias_vec;

    LD_SH2( p_dct, 8, dct0, dct1 );

    dct0_mask = __msa_clei_s_h( dct0, 0 );
    dct1_mask = __msa_clei_s_h( dct1, 0 );

    UNPCK_SH_SW( dct0, dct_signed_w0, dct_signed_w1 );
    UNPCK_SH_SW( dct1, dct_signed_w2, dct_signed_w3 );

    bias_vec = __msa_fill_w( i_bias );
    mf_vec = __msa_fill_w( i_mf );

    dct_w0 = __msa_add_a_w( dct_signed_w0, bias_vec );
    dct_w1 = __msa_add_a_w( dct_signed_w1, bias_vec );
    dct_w2 = __msa_add_a_w( dct_signed_w2, bias_vec );
    dct_w3 = __msa_add_a_w( dct_signed_w3, bias_vec );

    dct_w0 *= mf_vec;
    dct_w1 *= mf_vec;
    dct_w2 *= mf_vec;
    dct_w3 *= mf_vec;

    SRA_4V( dct_w0, dct_w1, dct_w2, dct_w3, 16 );
    PCKEV_H2_SH( dct_w1, dct_w0, dct_w3, dct_w2, dct_h0, dct_h1 );

    dct0 = zero - dct_h0;
    dct1 = zero - dct_h1;
    dct0 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h0,
                                   ( v16u8 ) dct0, ( v16u8 ) dct0_mask );
    dct1 = ( v8i16 ) __msa_bmnz_v( ( v16u8 ) dct_h1,
                                   ( v16u8 ) dct1, ( v16u8 ) dct1_mask );
    non_zero = HADD_SW_S32( ( v4u32 ) ( dct_h0 + dct_h1 ) );

    ST_SH2( dct0, dct1, p_dct, 8 );

    return !!non_zero;
}

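/* Return the index (0..63) of the last non-zero coefficient of a 64-entry
 * block. Each coefficient is compared against zero, the per-lane results are
 * folded (via the power-of-two byte mask and horizontal adds) into a 64-bit
 * bitmask with one bit per coefficient, and the leading-ones count of that
 * mask is the length of the trailing run of zero coefficients, hence the
 * final 63 - u_res. */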
static int32_t avc_coeff_last64_msa( int16_t *p_src )
{
    uint32_t u_res;
    v8i16 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 tmp_h0, tmp_h1, tmp_h2, tmp_h3, tmp_h4, tmp_h5, tmp_h6, tmp_h7;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8u16 vec0, vec1, vec2, vec3;
    v4i32 out0;
    v16u8 mask = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 };

    LD_SH8( p_src, 8, src0, src1, src2, src3, src4, src5, src6, src7 );

    tmp_h0 = __msa_ceqi_h( src0, 0 );
    tmp_h1 = __msa_ceqi_h( src1, 0 );
    tmp_h2 = __msa_ceqi_h( src2, 0 );
    tmp_h3 = __msa_ceqi_h( src3, 0 );
    tmp_h4 = __msa_ceqi_h( src4, 0 );
    tmp_h5 = __msa_ceqi_h( src5, 0 );
    tmp_h6 = __msa_ceqi_h( src6, 0 );
    tmp_h7 = __msa_ceqi_h( src7, 0 );

    PCKEV_B4_UB( tmp_h1, tmp_h0, tmp_h3, tmp_h2, tmp_h5, tmp_h4, tmp_h7, tmp_h6,
                 tmp0, tmp1, tmp2, tmp3 );

    tmp0 = tmp0 & mask;
    tmp1 = tmp1 & mask;
    tmp2 = tmp2 & mask;
    tmp3 = tmp3 & mask;

    HADD_UB4_UH( tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3 );
    PCKEV_B2_UB( vec1, vec0, vec3, vec2, tmp0, tmp1 );
    HADD_UB2_UH( tmp0, tmp1, vec0, vec1 );

    tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) vec1, ( v16i8 ) vec0 );
    vec0 = __msa_hadd_u_h( tmp0, tmp0 );
    tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) vec0, ( v16i8 ) vec0 );
    out0 = ( v4i32 ) __msa_nloc_d( ( v2i64 ) tmp0 );
    u_res = __msa_copy_u_w( out0, 0 );

    return ( 63 - u_res );
}

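/* Same idea as avc_coeff_last64_msa for a 16-entry block: build a 16-bit
 * zero/non-zero mask, count the trailing run of zero coefficients, and
 * return 15 minus that count. */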
static int32_t avc_coeff_last16_msa( int16_t *p_src )
{
    uint32_t u_res;
    v8i16 src0, src1;
    v8u16 tmp_h0;
    v16u8 tmp0;
    v8i16 out0, out1;
    v16i8 res0;
    v16u8 mask = { 1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128 };

    LD_SH2( p_src, 8, src0, src1 );

    out0 = __msa_ceqi_h( src0, 0 );
    out1 = __msa_ceqi_h( src1, 0 );

    tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) out1, ( v16i8 ) out0 );
    tmp0 = tmp0 & mask;
    tmp_h0 = __msa_hadd_u_h( tmp0, tmp0 );
    tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) tmp_h0, ( v16i8 ) tmp_h0 );
    tmp_h0 = __msa_hadd_u_h( tmp0, tmp0 );
    tmp0 = ( v16u8 ) __msa_pckev_b( ( v16i8 ) tmp_h0, ( v16i8 ) tmp_h0 );
    tmp_h0 = __msa_hadd_u_h( tmp0, tmp0 );
    res0 = __msa_pckev_b( ( v16i8 ) tmp_h0, ( v16i8 ) tmp_h0 );
    out0 = __msa_nloc_h( ( v8i16 ) res0 );
    u_res = __msa_copy_u_h( out0, 0 );

    return ( 15 - u_res );
}

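/* Exported wrappers around the static MSA kernels above, built only for the
 * 8-bit-depth configuration guarded by !HIGH_BIT_DEPTH. */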
void x264_dequant_4x4_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16],
                           int32_t i_qp )
{
    avc_dequant_4x4_msa( p_dct, pi_dequant_mf, i_qp );
}

void x264_dequant_8x8_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][64],
                           int32_t i_qp )
{
    avc_dequant_8x8_msa( p_dct, pi_dequant_mf, i_qp );
}

void x264_dequant_4x4_dc_msa( int16_t *p_dct, int32_t pi_dequant_mf[6][16],
                              int32_t i_qp )
{
    avc_dequant_4x4_dc_msa( p_dct, pi_dequant_mf, i_qp );
}

int32_t x264_quant_4x4_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias )
{
    return avc_quant_4x4_msa( p_dct, p_mf, p_bias );
}

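/* Quantize four consecutive 4x4 blocks with the same mf/bias tables and pack
 * the per-block non-zero flags into bits 0..3 of the return value. */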
int32_t x264_quant_4x4x4_msa( int16_t p_dct[4][16],
                              uint16_t pu_mf[16], uint16_t pu_bias[16] )
{
    int32_t i_non_zero, i_non_zero_acc = 0;

    for( int32_t j = 0; j < 4; j++ )
    {
        i_non_zero = x264_quant_4x4_msa( p_dct[j], pu_mf, pu_bias );

        i_non_zero_acc |= ( !!i_non_zero ) << j;
    }

    return i_non_zero_acc;
}

int32_t x264_quant_8x8_msa( int16_t *p_dct, uint16_t *p_mf, uint16_t *p_bias )
{
    return avc_quant_8x8_msa( p_dct, p_mf, p_bias );
}

int32_t x264_quant_4x4_dc_msa( int16_t *p_dct, int32_t i_mf, int32_t i_bias )
{
    return avc_quant_4x4_dc_msa( p_dct, i_mf, i_bias );
}

int32_t x264_coeff_last64_msa( int16_t *p_src )
{
    return avc_coeff_last64_msa( p_src );
}

int32_t x264_coeff_last16_msa( int16_t *p_src )
{
    return avc_coeff_last16_msa( p_src );
}
#endif