/*****************************************************************************
 * quant.c: ppc quantization
 *****************************************************************************
 * Copyright (C) 2007-2016 x264 project
 *
 * Authors: Guillaume Poirier <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at [email protected].
 *****************************************************************************/

#include "common/common.h"
#include "ppccommon.h"
#include "quant.h"

#if !HIGH_BIT_DEPTH
// quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
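// Each invocation processes two vectors (16 coefficients). Per coefficient
// it computes, branchlessly, the same thing as the scalar quant path
// (a sketch of the intent, derived from the code below):
//
//     dct[i] = sign(dct[i]) * (((abs(dct[i]) + bias[i]) * mf[i]) >> 16)
//
// abs() is done as max(-x, x); vec_mule/vec_mulo give the even/odd-lane
// 32-bit products, which are shifted down by 16 and re-packed with
// saturation; the sign is restored via xor with the compare mask plus
// (mask & 1), i.e. a two's-complement negation of the negative lanes only.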
#define QUANT_16_U( idx0, idx1 )                                    \
{                                                                   \
    temp1v = vec_ld((idx0), dct);                                   \
    temp2v = vec_ld((idx1), dct);                                   \
    mfvA = vec_ld((idx0), mf);                                      \
    mfvB = vec_ld((idx1), mf);                                      \
    biasvA = vec_ld((idx0), bias);                                  \
    biasvB = vec_ld((idx1), bias);                                  \
    mskA = vec_cmplt(temp1v, zero_s16v);                            \
    mskB = vec_cmplt(temp2v, zero_s16v);                            \
    coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);\
    coefvB = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp2v), temp2v);\
    coefvA = vec_adds(coefvA, biasvA);                              \
    coefvB = vec_adds(coefvB, biasvB);                              \
    multEvenvA = vec_mule(coefvA, mfvA);                            \
    multOddvA = vec_mulo(coefvA, mfvA);                             \
    multEvenvB = vec_mule(coefvB, mfvB);                            \
    multOddvB = vec_mulo(coefvB, mfvB);                             \
    multEvenvA = vec_sr(multEvenvA, i_qbitsv);                      \
    multOddvA = vec_sr(multOddvA, i_qbitsv);                        \
    multEvenvB = vec_sr(multEvenvB, i_qbitsv);                      \
    multOddvB = vec_sr(multOddvB, i_qbitsv);                        \
    temp1v = (vec_s16_t)vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
    temp2v = (vec_s16_t)vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
    temp1v = vec_xor(temp1v, mskA);                                 \
    temp2v = vec_xor(temp2v, mskB);                                 \
    temp1v = vec_adds(temp1v, vec_and(mskA, one));                  \
    vec_st(temp1v, (idx0), dct);                                    \
    temp2v = vec_adds(temp2v, vec_and(mskB, one));                  \
    nz = vec_or(nz, vec_or(temp1v, temp2v));                        \
    vec_st(temp2v, (idx1), dct);                                    \
}

int x264_quant_4x4_altivec( int16_t dct[16], uint16_t mf[16], uint16_t bias[16] )
{
    LOAD_ZERO;
    vector bool short mskA;
    vec_u32_t i_qbitsv;
    vec_u16_t coefvA;
    vec_u32_t multEvenvA, multOddvA;
    vec_u16_t mfvA;
    vec_u16_t biasvA;
    vec_s16_t one = vec_splat_s16(1);
    vec_s16_t nz = zero_s16v;

    vector bool short mskB;
    vec_u16_t coefvB;
    vec_u32_t multEvenvB, multOddvB;
    vec_u16_t mfvB;
    vec_u16_t biasvB;

    vec_s16_t temp1v, temp2v;

    vec_u32_u qbits_u;
    qbits_u.s[0]=16;
    i_qbitsv = vec_splat(qbits_u.v, 0);

    QUANT_16_U( 0, 16 );
    return vec_any_ne(nz, zero_s16v);
}

// DC quant of a whole 4x4 block, unrolled 2x and "pre-scheduled"
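// Identical flow to QUANT_16_U, except that a single scalar mf/bias pair is
// splatted across the whole vector (all coefficients of a DC block share
// one quantizer), and the bias add is non-saturating (vec_add rather than
// vec_adds).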
#define QUANT_16_U_DC( idx0, idx1 )                                 \
{                                                                   \
    temp1v = vec_ld((idx0), dct);                                   \
    temp2v = vec_ld((idx1), dct);                                   \
    mskA = vec_cmplt(temp1v, zero_s16v);                            \
    mskB = vec_cmplt(temp2v, zero_s16v);                            \
    coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);\
    coefvB = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp2v), temp2v);\
    coefvA = vec_add(coefvA, biasv);                                \
    coefvB = vec_add(coefvB, biasv);                                \
    multEvenvA = vec_mule(coefvA, mfv);                             \
    multOddvA = vec_mulo(coefvA, mfv);                              \
    multEvenvB = vec_mule(coefvB, mfv);                             \
    multOddvB = vec_mulo(coefvB, mfv);                              \
    multEvenvA = vec_sr(multEvenvA, i_qbitsv);                      \
    multOddvA = vec_sr(multOddvA, i_qbitsv);                        \
    multEvenvB = vec_sr(multEvenvB, i_qbitsv);                      \
    multOddvB = vec_sr(multOddvB, i_qbitsv);                        \
    temp1v = (vec_s16_t)vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
    temp2v = (vec_s16_t)vec_packs(vec_mergeh(multEvenvB, multOddvB), vec_mergel(multEvenvB, multOddvB)); \
    temp1v = vec_xor(temp1v, mskA);                                 \
    temp2v = vec_xor(temp2v, mskB);                                 \
    temp1v = vec_add(temp1v, vec_and(mskA, one));                   \
    vec_st(temp1v, (idx0), dct);                                    \
    temp2v = vec_add(temp2v, vec_and(mskB, one));                   \
    nz = vec_or(nz, vec_or(temp1v, temp2v));                        \
    vec_st(temp2v, (idx1), dct);                                    \
}

int x264_quant_4x4_dc_altivec( int16_t dct[16], int mf, int bias )
{
    LOAD_ZERO;
    vector bool short mskA;
    vec_u32_t i_qbitsv;
    vec_u16_t coefvA;
    vec_u32_t multEvenvA, multOddvA;
    vec_s16_t one = vec_splat_s16(1);
    vec_s16_t nz = zero_s16v;

    vector bool short mskB;
    vec_u16_t coefvB;
    vec_u32_t multEvenvB, multOddvB;

    vec_s16_t temp1v, temp2v;

    vec_u16_t mfv;
    vec_u16_t biasv;

    vec_u16_u mf_u;
    mf_u.s[0]=mf;
    mfv = vec_splat( mf_u.v, 0 );

    vec_u32_u qbits_u;
    qbits_u.s[0]=16;
    i_qbitsv = vec_splat(qbits_u.v, 0);

    vec_u16_u bias_u;
    bias_u.s[0]=bias;
    biasv = vec_splat(bias_u.v, 0);

    QUANT_16_U_DC( 0, 16 );
    return vec_any_ne(nz, zero_s16v);
}

// DC quant of a whole 2x2 block
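// Only the first four 16-bit lanes carry real coefficients, so the result
// is blended back over the original vector with vec_sel under a half-vector
// mask; the caller masks the nz accumulator the same way before testing it.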
#define QUANT_4_U_DC( idx0 )                                        \
{                                                                   \
    const vec_u16_t sel = (vec_u16_t)CV(-1,-1,-1,-1,0,0,0,0);       \
    temp1v = vec_ld((idx0), dct);                                   \
    mskA = vec_cmplt(temp1v, zero_s16v);                            \
    coefvA = (vec_u16_t)vec_max(vec_sub(zero_s16v, temp1v), temp1v);\
    coefvA = vec_add(coefvA, biasv);                                \
    multEvenvA = vec_mule(coefvA, mfv);                             \
    multOddvA = vec_mulo(coefvA, mfv);                              \
    multEvenvA = vec_sr(multEvenvA, i_qbitsv);                      \
    multOddvA = vec_sr(multOddvA, i_qbitsv);                        \
    temp2v = (vec_s16_t)vec_packs(vec_mergeh(multEvenvA, multOddvA), vec_mergel(multEvenvA, multOddvA)); \
    temp2v = vec_xor(temp2v, mskA);                                 \
    temp2v = vec_add(temp2v, vec_and(mskA, one));                   \
    temp1v = vec_sel(temp1v, temp2v, sel);                          \
    nz = vec_or(nz, temp1v);                                        \
    vec_st(temp1v, (idx0), dct);                                    \
}

int x264_quant_2x2_dc_altivec( int16_t dct[4], int mf, int bias )
{
    LOAD_ZERO;
    vector bool short mskA;
    vec_u32_t i_qbitsv;
    vec_u16_t coefvA;
    vec_u32_t multEvenvA, multOddvA;
    vec_s16_t one = vec_splat_s16(1);
    vec_s16_t nz = zero_s16v;

    vec_s16_t temp1v, temp2v;

    vec_u16_t mfv;
    vec_u16_t biasv;

    vec_u16_u mf_u;
    mf_u.s[0]=mf;
    mfv = vec_splat( mf_u.v, 0 );

    vec_u32_u qbits_u;
    qbits_u.s[0]=16;
    i_qbitsv = vec_splat(qbits_u.v, 0);

    vec_u16_u bias_u;
    bias_u.s[0]=bias;
    biasv = vec_splat(bias_u.v, 0);

    static const vec_s16_t mask2 = CV(-1, -1, -1, -1, 0, 0, 0, 0);
    QUANT_4_U_DC(0);
    return vec_any_ne(vec_and(nz, mask2), zero_s16v);
}
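
// 8x8 quant: the same 16-coefficient kernel as above, applied in four
// passes; the byte offsets i*32 and i*32+16 step through all 64 int16
// coefficients.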
int x264_quant_8x8_altivec( int16_t dct[64], uint16_t mf[64], uint16_t bias[64] )
{
    LOAD_ZERO;
    vector bool short mskA;
    vec_u32_t i_qbitsv;
    vec_u16_t coefvA;
    vec_u32_t multEvenvA, multOddvA;
    vec_u16_t mfvA;
    vec_u16_t biasvA;
    vec_s16_t one = vec_splat_s16(1);
    vec_s16_t nz = zero_s16v;

    vector bool short mskB;
    vec_u16_t coefvB;
    vec_u32_t multEvenvB, multOddvB;
    vec_u16_t mfvB;
    vec_u16_t biasvB;

    vec_s16_t temp1v, temp2v;

    vec_u32_u qbits_u;
    qbits_u.s[0]=16;
    i_qbitsv = vec_splat(qbits_u.v, 0);

    for( int i = 0; i < 4; i++ )
        QUANT_16_U( i*2*16, i*2*16+16 );
    return vec_any_ne(nz, zero_s16v);
}
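
// Dequant for i_qbits >= 0: per coefficient (as the code below does it),
//
//     dct[i] = (dct[i] * dequant_mf[i_mf][i]) << i_qbits
//
// The 32-bit dequant_mf entries are packed down to 16 bits first, so this
// path relies on the multipliers fitting in int16 (vec_packs saturates).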
#define DEQUANT_SHL()                                               \
{                                                                   \
    dctv = vec_ld(8*y, dct);                                        \
    mf1v = vec_ld(16*y, dequant_mf[i_mf]);                          \
    mf2v = vec_ld(16+16*y, dequant_mf[i_mf]);                       \
    mfv  = vec_packs(mf1v, mf2v);                                   \
                                                                    \
    multEvenvA = vec_mule(dctv, mfv);                               \
    multOddvA = vec_mulo(dctv, mfv);                                \
    dctv = (vec_s16_t)vec_packs(vec_mergeh(multEvenvA, multOddvA),  \
                                vec_mergel(multEvenvA, multOddvA)); \
    dctv = vec_sl(dctv, i_qbitsv);                                  \
    vec_st(dctv, 8*y, dct);                                         \
}
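
// vec_mule/vec_mulo select even/odd lanes by element number, which differs
// between big- and little-endian element orderings; swapping the pair on
// little-endian keeps the (even << 16) + odd reconstruction in DEQUANT_SHR
// correct in memory order.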
#ifdef WORDS_BIGENDIAN
#define VEC_MULE vec_mule
#define VEC_MULO vec_mulo
#else
#define VEC_MULE vec_mulo
#define VEC_MULO vec_mule
#endif
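
// Dequant for i_qbits < 0, with rounding: per coefficient,
//
//     dct[i] = (dct[i] * dequant_mf[i_mf][i] + f) >> (-i_qbits),
//     f = 1 << (-i_qbits-1)
//
// Each coefficient is duplicated (mergeh/mergel) and multiplied against the
// high and low 16-bit halves of the 32-bit multiplier; the halves are then
// recombined as (even << 16) + odd to form the full 32-bit product before
// the rounded arithmetic shift.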
#define DEQUANT_SHR()                                               \
{                                                                   \
    dctv = vec_ld(8*y, dct);                                        \
    dct1v = vec_mergeh(dctv, dctv);                                 \
    dct2v = vec_mergel(dctv, dctv);                                 \
    mf1v = vec_ld(16*y, dequant_mf[i_mf]);                          \
    mf2v = vec_ld(16+16*y, dequant_mf[i_mf]);                       \
                                                                    \
    multEvenvA = VEC_MULE(dct1v, (vec_s16_t)mf1v);                  \
    multOddvA = VEC_MULO(dct1v, (vec_s16_t)mf1v);                   \
    temp1v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA);      \
    temp1v = vec_add(temp1v, fv);                                   \
    temp1v = vec_sra(temp1v, i_qbitsv);                             \
                                                                    \
    multEvenvA = VEC_MULE(dct2v, (vec_s16_t)mf2v);                  \
    multOddvA = VEC_MULO(dct2v, (vec_s16_t)mf2v);                   \
    temp2v = vec_add(vec_sl(multEvenvA, sixteenv), multOddvA);      \
    temp2v = vec_add(temp2v, fv);                                   \
    temp2v = vec_sra(temp2v, i_qbitsv);                             \
                                                                    \
    dctv = (vec_s16_t)vec_packs(temp1v, temp2v);                    \
    vec_st(dctv, y*8, dct);                                         \
}
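
// Dequant driver: the multiplier table is indexed by qp%6 while qp/6
// contributes a power-of-two shift, offset by the block size's
// normalization (-4 for 4x4, -6 for 8x8). A non-negative shift takes the
// plain shift-left path; a negative one takes the rounded shift-right path.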
void x264_dequant_4x4_altivec( int16_t dct[16], int dequant_mf[6][16], int i_qp )
{
    int i_mf = i_qp%6;
    int i_qbits = i_qp/6 - 4;

    vec_s16_t dctv;
    vec_s16_t dct1v, dct2v;
    vec_s32_t mf1v, mf2v;
    vec_s16_t mfv;
    vec_s32_t multEvenvA, multOddvA;
    vec_s32_t temp1v, temp2v;

    if( i_qbits >= 0 )
    {
        vec_u16_t i_qbitsv;
        vec_u16_u qbits_u;
        qbits_u.s[0]=i_qbits;
        i_qbitsv = vec_splat(qbits_u.v, 0);

        for( int y = 0; y < 4; y+=2 )
            DEQUANT_SHL();
    }
    else
    {
        const int f = 1 << (-i_qbits-1);

        vec_s32_t fv;
        vec_u32_u f_u;
        f_u.s[0]=f;
        fv = (vec_s32_t)vec_splat(f_u.v, 0);

        vec_u32_t i_qbitsv;
        vec_u32_u qbits_u;
        qbits_u.s[0]=-i_qbits;
        i_qbitsv = vec_splat(qbits_u.v, 0);

        vec_u32_t sixteenv;
        vec_u32_u sixteen_u;
        sixteen_u.s[0]=16;
        sixteenv = vec_splat(sixteen_u.v, 0);

        for( int y = 0; y < 4; y+=2 )
            DEQUANT_SHR();
    }
}
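
// Same driver as the 4x4 dequant, but with i_qbits = qp/6 - 6 and eight
// vector passes (y = 0,2,...,14, i.e. byte offsets 8*y) to cover all 64
// coefficients.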
void x264_dequant_8x8_altivec( int16_t dct[64], int dequant_mf[6][64], int i_qp )
{
    int i_mf = i_qp%6;
    int i_qbits = i_qp/6 - 6;

    vec_s16_t dctv;
    vec_s16_t dct1v, dct2v;
    vec_s32_t mf1v, mf2v;
    vec_s16_t mfv;
    vec_s32_t multEvenvA, multOddvA;
    vec_s32_t temp1v, temp2v;

    if( i_qbits >= 0 )
    {
        vec_u16_t i_qbitsv;
        vec_u16_u qbits_u;
        qbits_u.s[0]=i_qbits;
        i_qbitsv = vec_splat(qbits_u.v, 0);

        for( int y = 0; y < 16; y+=2 )
            DEQUANT_SHL();
    }
    else
    {
        const int f = 1 << (-i_qbits-1);

        vec_s32_t fv;
        vec_u32_u f_u;
        f_u.s[0]=f;
        fv = (vec_s32_t)vec_splat(f_u.v, 0);

        vec_u32_t i_qbitsv;
        vec_u32_u qbits_u;
        qbits_u.s[0]=-i_qbits;
        i_qbitsv = vec_splat(qbits_u.v, 0);

        vec_u32_t sixteenv;
        vec_u32_u sixteen_u;
        sixteen_u.s[0]=16;
        sixteenv = vec_splat(sixteen_u.v, 0);

        for( int y = 0; y < 16; y+=2 )
            DEQUANT_SHR();
    }
}
#endif // !HIGH_BIT_DEPTH