/*****************************************************************************
 * pixel.c: ppc pixel metrics
 *****************************************************************************
 * Copyright (C) 2003-2016 x264 project
 *
 * Authors: Eric Petit <[email protected]>
 *          Guillaume Poirier <[email protected]>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
 *
 * This program is also available under a commercial proprietary license.
 * For more information, contact us at [email protected].
 *****************************************************************************/

#include "common/common.h"
#include "ppccommon.h"
#include "../predict.h"

#if !HIGH_BIT_DEPTH
/***********************************************************************
 * SAD routines
 **********************************************************************/

#define PIXEL_SAD_ALTIVEC( name, lx, ly, a, b )        \
static int name( uint8_t *pix1, intptr_t i_pix1,       \
                 uint8_t *pix2, intptr_t i_pix2 )      \
{                                                      \
    ALIGNED_16( int sum );                             \
                                                       \
    LOAD_ZERO;                                         \
    PREP_LOAD;                                         \
    vec_u8_t pix1v, pix2v;                             \
    vec_s32_t sumv = zero_s32v;                        \
    for( int y = 0; y < ly; y++ )                      \
    {                                                  \
        VEC_LOAD_G( pix1, pix1v, lx, vec_u8_t );       \
        VEC_LOAD_G( pix2, pix2v, lx, vec_u8_t );       \
        sumv = (vec_s32_t) vec_sum4s(                  \
                   vec_sub( vec_max( pix1v, pix2v ),   \
                            vec_min( pix1v, pix2v ) ), \
                   (vec_u32_t) sumv );                 \
        pix1 += i_pix1;                                \
        pix2 += i_pix2;                                \
    }                                                  \
    sumv = vec_sum##a( sumv, zero_s32v );              \
    sumv = vec_splat( sumv, b );                       \
    vec_ste( sumv, 0, &sum );                          \
    return sum;                                        \
}

PIXEL_SAD_ALTIVEC( pixel_sad_16x16_altivec, 16, 16, s,  3 )
PIXEL_SAD_ALTIVEC( pixel_sad_8x16_altivec,  8,  16, 2s, 1 )
PIXEL_SAD_ALTIVEC( pixel_sad_16x8_altivec,  16, 8,  s,  3 )
PIXEL_SAD_ALTIVEC( pixel_sad_8x8_altivec,   8,  8,  2s, 1 )

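/* For reference, the scalar computation that PIXEL_SAD_ALTIVEC vectorizes is
 * the plain sum of absolute differences below.  On unsigned bytes the vector
 * code expresses |a-b| as max(a,b) - min(a,b) to avoid widening, and
 * vec_sum4s() plus the final vec_sums()/vec_sum2s() do the horizontal
 * accumulation.  This helper is illustrative only (hypothetical name, not
 * used anywhere in this file). */
static inline int pixel_sad_wxh_c_ref( uint8_t *pix1, intptr_t i_pix1,
                                       uint8_t *pix2, intptr_t i_pix2,
                                       int lx, int ly )
{
    int sum = 0;
    for( int y = 0; y < ly; y++ )
    {
        for( int x = 0; x < lx; x++ )
        {
            int d = pix1[x] - pix2[x];
            sum += d < 0 ? -d : d;
        }
        pix1 += i_pix1;
        pix2 += i_pix2;
    }
    return sum;
}
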
/***********************************************************************
 * SATD routines
 **********************************************************************/

/***********************************************************************
 * VEC_HADAMAR
 ***********************************************************************
 * b[0] = a[0] + a[1] + a[2] + a[3]
 * b[1] = a[0] + a[1] - a[2] - a[3]
 * b[2] = a[0] - a[1] - a[2] + a[3]
 * b[3] = a[0] - a[1] + a[2] - a[3]
 **********************************************************************/
#define VEC_HADAMAR(a0,a1,a2,a3,b0,b1,b2,b3) \
    b2 = vec_add( a0, a1 ); \
    b3 = vec_add( a2, a3 ); \
    a0 = vec_sub( a0, a1 ); \
    a2 = vec_sub( a2, a3 ); \
    b0 = vec_add( b2, b3 ); \
    b1 = vec_sub( b2, b3 ); \
    b2 = vec_sub( a0, a2 ); \
    b3 = vec_add( a0, a2 )

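/* A minimal scalar sketch of the same 4-point Hadamard butterfly, for
 * clarity.  The two temporaries mirror the b2/b3 reuse in VEC_HADAMAR; the
 * function name is illustrative and not part of x264. */
static inline void hadamard4_1d_ref( int16_t b[4], const int16_t a[4] )
{
    int16_t s01 = a[0] + a[1], s23 = a[2] + a[3];   /* pairwise sums        */
    int16_t d01 = a[0] - a[1], d23 = a[2] - a[3];   /* pairwise differences */
    b[0] = s01 + s23;   /* a0+a1+a2+a3 */
    b[1] = s01 - s23;   /* a0+a1-a2-a3 */
    b[2] = d01 - d23;   /* a0-a1-a2+a3 */
    b[3] = d01 + d23;   /* a0-a1+a2-a3 */
}
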
/***********************************************************************
 * VEC_ABS
 ***********************************************************************
 * a: s16v
 *
 * a = abs(a)
 *
 * Use vec_sub()/vec_max() instead of vec_abs(): vec_abs() internally
 * generates its zero operand with vec_splat(0), and we already have a
 * zero vector.
 **********************************************************************/
#define VEC_ABS(a) \
    a = vec_max( a, vec_sub( zero_s16v, a ) );

#define VEC_ABSOLUTE(a) (vec_u16_t)vec_max( a, vec_sub( zero_s16v, a ) )

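/* The identity used above: for 16-bit lanes, abs(x) == max(x, 0 - x).
 * Scalar illustration (hypothetical helper, not used in this file): */
static inline int16_t abs16_ref( int16_t x )
{
    int16_t neg = (int16_t)(0 - x);
    return x > neg ? x : neg;   /* same max(x, -x) trick as VEC_ABS */
}
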
/***********************************************************************
 * VEC_ADD_ABS
 ***********************************************************************
 * a: s16v
 * b, c: s32v
 *
 * c[i] = abs(a[2*i]) + abs(a[2*i+1]) + b[i]
 **********************************************************************/
#define VEC_ADD_ABS(a,b,c) \
    VEC_ABS( a );          \
    c = vec_sum4s( a, b )

/***********************************************************************
 * SATD 4x4
 **********************************************************************/
static int pixel_satd_4x4_altivec( uint8_t *pix1, intptr_t i_pix1,
                                   uint8_t *pix2, intptr_t i_pix2 )
{
    ALIGNED_16( int i_satd );

    PREP_DIFF;
    PREP_LOAD_SRC( pix1 );
    vec_s16_t diff0v, diff1v, diff2v, diff3v;
    vec_s16_t temp0v, temp1v, temp2v, temp3v;
    vec_s32_t satdv;

    vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
    vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v, offset2v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v, offset1v );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v, offset2v );

    /* Hadamard, horizontal pass */
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );

    VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
                     diff0v, diff1v, diff2v, diff3v );
    /* Hadamard, vertical pass */
    VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
                 temp0v, temp1v, temp2v, temp3v );

    VEC_ADD_ABS( temp0v, zero_s32v, satdv );
    VEC_ADD_ABS( temp1v, satdv, satdv );
    VEC_ADD_ABS( temp2v, satdv, satdv );
    VEC_ADD_ABS( temp3v, satdv, satdv );

    satdv = vec_sum2s( satdv, zero_s32v );
    satdv = vec_splat( satdv, 1 );
    vec_ste( satdv, 0, &i_satd );

    return i_satd >> 1;
}
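
/* Scalar sketch of what the AltiVec routine above computes (illustrative
 * only, not x264's optimized C path): form the pixel differences, apply the
 * unnormalized 4-point Hadamard along rows and then columns, sum the
 * absolute values of the coefficients, and halve the total so SATD stays on
 * a scale roughly comparable to SAD. */
static inline int satd_4x4_c_ref( uint8_t *pix1, intptr_t i_pix1,
                                  uint8_t *pix2, intptr_t i_pix2 )
{
    int d[4][4], t[4][4], sum = 0;
    for( int y = 0; y < 4; y++, pix1 += i_pix1, pix2 += i_pix2 )
        for( int x = 0; x < 4; x++ )
            d[y][x] = pix1[x] - pix2[x];
    for( int y = 0; y < 4; y++ )        /* horizontal transform */
    {
        int s01 = d[y][0] + d[y][1], s23 = d[y][2] + d[y][3];
        int d01 = d[y][0] - d[y][1], d23 = d[y][2] - d[y][3];
        t[y][0] = s01 + s23; t[y][1] = s01 - s23;
        t[y][2] = d01 - d23; t[y][3] = d01 + d23;
    }
    for( int x = 0; x < 4; x++ )        /* vertical transform + abs sum */
    {
        int s01 = t[0][x] + t[1][x], s23 = t[2][x] + t[3][x];
        int d01 = t[0][x] - t[1][x], d23 = t[2][x] - t[3][x];
        int c0 = s01 + s23, c1 = s01 - s23, c2 = d01 - d23, c3 = d01 + d23;
        sum += (c0 < 0 ? -c0 : c0) + (c1 < 0 ? -c1 : c1)
             + (c2 < 0 ? -c2 : c2) + (c3 < 0 ? -c3 : c3);
    }
    return sum >> 1;
}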
163
164
/***********************************************************************
165
* SATD 4x8
166
**********************************************************************/
167
static int pixel_satd_4x8_altivec( uint8_t *pix1, intptr_t i_pix1,
168
uint8_t *pix2, intptr_t i_pix2 )
169
{
170
ALIGNED_16( int i_satd );
171
172
PREP_DIFF;
173
vec_s16_t diff0v, diff1v, diff2v, diff3v;
174
vec_s16_t temp0v, temp1v, temp2v, temp3v;
175
vec_s32_t satdv;
176
177
PREP_LOAD_SRC( pix1 );
178
vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
179
vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);
180
181
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v, offset1v );
182
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v, offset2v );
183
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v, offset1v );
184
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v, offset2v );
185
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
186
temp0v, temp1v, temp2v, temp3v );
187
VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
188
diff0v, diff1v, diff2v, diff3v );
189
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
190
temp0v, temp1v, temp2v, temp3v );
191
VEC_ADD_ABS( temp0v, zero_s32v, satdv );
192
VEC_ADD_ABS( temp1v, satdv, satdv );
193
VEC_ADD_ABS( temp2v, satdv, satdv );
194
VEC_ADD_ABS( temp3v, satdv, satdv );
195
196
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff0v, offset1v );
197
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff1v, offset2v );
198
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff2v, offset1v );
199
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 4, diff3v, offset2v );
200
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
201
temp0v, temp1v, temp2v, temp3v );
202
VEC_TRANSPOSE_4( temp0v, temp1v, temp2v, temp3v,
203
diff0v, diff1v, diff2v, diff3v );
204
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
205
temp0v, temp1v, temp2v, temp3v );
206
VEC_ADD_ABS( temp0v, satdv, satdv );
207
VEC_ADD_ABS( temp1v, satdv, satdv );
208
VEC_ADD_ABS( temp2v, satdv, satdv );
209
VEC_ADD_ABS( temp3v, satdv, satdv );
210
211
satdv = vec_sum2s( satdv, zero_s32v );
212
satdv = vec_splat( satdv, 1 );
213
vec_ste( satdv, 0, &i_satd );
214
215
return i_satd >> 1;
216
}
217
218
/***********************************************************************
219
* SATD 8x4
220
**********************************************************************/
221
static int pixel_satd_8x4_altivec( uint8_t *pix1, intptr_t i_pix1,
222
uint8_t *pix2, intptr_t i_pix2 )
223
{
224
ALIGNED_16( int i_satd );
225
226
PREP_DIFF;
227
vec_s16_t diff0v, diff1v, diff2v, diff3v,
228
diff4v, diff5v, diff6v, diff7v;
229
vec_s16_t temp0v, temp1v, temp2v, temp3v,
230
temp4v, temp5v, temp6v, temp7v;
231
vec_s32_t satdv;
232
233
234
PREP_LOAD_SRC( pix1 );
235
vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
236
vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);
237
238
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1v );
239
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset2v );
240
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1v );
241
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset2v );
242
243
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
244
temp0v, temp1v, temp2v, temp3v );
245
/* This causes warnings because temp4v...temp7v haven't been set,
   but we don't care */
247
VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
248
temp4v, temp5v, temp6v, temp7v,
249
diff0v, diff1v, diff2v, diff3v,
250
diff4v, diff5v, diff6v, diff7v );
251
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
252
temp0v, temp1v, temp2v, temp3v );
253
VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
254
temp4v, temp5v, temp6v, temp7v );
255
256
VEC_ADD_ABS( temp0v, zero_s32v, satdv );
257
VEC_ADD_ABS( temp1v, satdv, satdv );
258
VEC_ADD_ABS( temp2v, satdv, satdv );
259
VEC_ADD_ABS( temp3v, satdv, satdv );
260
VEC_ADD_ABS( temp4v, satdv, satdv );
261
VEC_ADD_ABS( temp5v, satdv, satdv );
262
VEC_ADD_ABS( temp6v, satdv, satdv );
263
VEC_ADD_ABS( temp7v, satdv, satdv );
264
265
satdv = vec_sum2s( satdv, zero_s32v );
266
satdv = vec_splat( satdv, 1 );
267
vec_ste( satdv, 0, &i_satd );
268
269
return i_satd >> 1;
270
}
271
272
/***********************************************************************
273
* SATD 8x8
274
**********************************************************************/
275
static int pixel_satd_8x8_altivec( uint8_t *pix1, intptr_t i_pix1,
276
uint8_t *pix2, intptr_t i_pix2 )
277
{
278
ALIGNED_16( int i_satd );
279
280
PREP_DIFF;
281
vec_s16_t diff0v, diff1v, diff2v, diff3v,
282
diff4v, diff5v, diff6v, diff7v;
283
vec_s16_t temp0v, temp1v, temp2v, temp3v,
284
temp4v, temp5v, temp6v, temp7v;
285
vec_s32_t satdv;
286
287
vec_u8_t _offset1_1v_ = vec_lvsl(0, pix1);
288
vec_u8_t _offset1_2v_ = vec_lvsl(0, pix1 + i_pix1);
289
vec_u8_t _offset2_1v_ = vec_lvsl(0, pix2);
290
vec_u8_t _offset2_2v_ = vec_lvsl(0, pix2 + i_pix2);
291
292
VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1_1v, offset2_1v );
293
VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset1_2v, offset2_2v );
294
VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1_1v, offset2_1v );
295
VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset1_2v, offset2_2v );
296
VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff4v, offset1_1v, offset2_1v );
297
VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff5v, offset1_2v, offset2_2v );
298
VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff6v, offset1_1v, offset2_1v );
299
VEC_DIFF_H_OFFSET( pix1, i_pix1, pix2, i_pix2, 8, diff7v, offset1_2v, offset2_2v );
300
301
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
302
temp0v, temp1v, temp2v, temp3v );
303
VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
304
temp4v, temp5v, temp6v, temp7v );
305
306
VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
307
temp4v, temp5v, temp6v, temp7v,
308
diff0v, diff1v, diff2v, diff3v,
309
diff4v, diff5v, diff6v, diff7v );
310
311
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
312
temp0v, temp1v, temp2v, temp3v );
313
VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
314
temp4v, temp5v, temp6v, temp7v );
315
316
VEC_ADD_ABS( temp0v, zero_s32v, satdv );
317
VEC_ADD_ABS( temp1v, satdv, satdv );
318
VEC_ADD_ABS( temp2v, satdv, satdv );
319
VEC_ADD_ABS( temp3v, satdv, satdv );
320
VEC_ADD_ABS( temp4v, satdv, satdv );
321
VEC_ADD_ABS( temp5v, satdv, satdv );
322
VEC_ADD_ABS( temp6v, satdv, satdv );
323
VEC_ADD_ABS( temp7v, satdv, satdv );
324
325
satdv = vec_sums( satdv, zero_s32v );
326
satdv = vec_splat( satdv, 3 );
327
vec_ste( satdv, 0, &i_satd );
328
329
return i_satd >> 1;
330
}
331
332
/***********************************************************************
333
* SATD 8x16
334
**********************************************************************/
335
static int pixel_satd_8x16_altivec( uint8_t *pix1, intptr_t i_pix1,
336
uint8_t *pix2, intptr_t i_pix2 )
337
{
338
ALIGNED_16( int i_satd );
339
340
PREP_DIFF;
341
vec_s16_t diff0v, diff1v, diff2v, diff3v,
342
diff4v, diff5v, diff6v, diff7v;
343
vec_s16_t temp0v, temp1v, temp2v, temp3v,
344
temp4v, temp5v, temp6v, temp7v;
345
vec_s32_t satdv;
346
347
PREP_LOAD_SRC( pix1 );
348
vec_u8_t _offset1v_ = vec_lvsl(0, pix2);
349
vec_u8_t _offset2v_ = vec_lvsl(0, pix2 + i_pix2);
350
351
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1v );
352
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset2v );
353
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1v );
354
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset2v );
355
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v, offset1v );
356
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v, offset2v );
357
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v, offset1v );
358
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v, offset2v );
359
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
360
temp0v, temp1v, temp2v, temp3v );
361
VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
362
temp4v, temp5v, temp6v, temp7v );
363
VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
364
temp4v, temp5v, temp6v, temp7v,
365
diff0v, diff1v, diff2v, diff3v,
366
diff4v, diff5v, diff6v, diff7v );
367
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
368
temp0v, temp1v, temp2v, temp3v );
369
VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
370
temp4v, temp5v, temp6v, temp7v );
371
VEC_ADD_ABS( temp0v, zero_s32v, satdv );
372
VEC_ADD_ABS( temp1v, satdv, satdv );
373
VEC_ADD_ABS( temp2v, satdv, satdv );
374
VEC_ADD_ABS( temp3v, satdv, satdv );
375
VEC_ADD_ABS( temp4v, satdv, satdv );
376
VEC_ADD_ABS( temp5v, satdv, satdv );
377
VEC_ADD_ABS( temp6v, satdv, satdv );
378
VEC_ADD_ABS( temp7v, satdv, satdv );
379
380
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, offset1v );
381
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, offset2v );
382
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, offset1v );
383
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, offset2v );
384
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v, offset1v );
385
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v, offset2v );
386
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v, offset1v );
387
VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v, offset2v );
388
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
389
temp0v, temp1v, temp2v, temp3v );
390
VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
391
temp4v, temp5v, temp6v, temp7v );
392
VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
393
temp4v, temp5v, temp6v, temp7v,
394
diff0v, diff1v, diff2v, diff3v,
395
diff4v, diff5v, diff6v, diff7v );
396
VEC_HADAMAR( diff0v, diff1v, diff2v, diff3v,
397
temp0v, temp1v, temp2v, temp3v );
398
VEC_HADAMAR( diff4v, diff5v, diff6v, diff7v,
399
temp4v, temp5v, temp6v, temp7v );
400
VEC_ADD_ABS( temp0v, satdv, satdv );
401
VEC_ADD_ABS( temp1v, satdv, satdv );
402
VEC_ADD_ABS( temp2v, satdv, satdv );
403
VEC_ADD_ABS( temp3v, satdv, satdv );
404
VEC_ADD_ABS( temp4v, satdv, satdv );
405
VEC_ADD_ABS( temp5v, satdv, satdv );
406
VEC_ADD_ABS( temp6v, satdv, satdv );
407
VEC_ADD_ABS( temp7v, satdv, satdv );
408
409
satdv = vec_sums( satdv, zero_s32v );
410
satdv = vec_splat( satdv, 3 );
411
vec_ste( satdv, 0, &i_satd );
412
413
return i_satd >> 1;
414
}
415
416
/***********************************************************************
417
* SATD 16x8
418
**********************************************************************/
419
static int pixel_satd_16x8_altivec( uint8_t *pix1, intptr_t i_pix1,
420
uint8_t *pix2, intptr_t i_pix2 )
421
{
422
ALIGNED_16( int i_satd );
423
424
LOAD_ZERO;
425
PREP_LOAD;
426
PREP_LOAD_SRC( pix2 );
427
vec_s32_t satdv;
428
vec_s16_t pix1v, pix2v;
429
vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
430
diffh4v, diffh5v, diffh6v, diffh7v;
431
vec_s16_t diffl0v, diffl1v, diffl2v, diffl3v,
432
diffl4v, diffl5v, diffl6v, diffl7v;
433
vec_s16_t temp0v, temp1v, temp2v, temp3v,
434
temp4v, temp5v, temp6v, temp7v;
435
436
VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v );
437
VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v );
438
VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v );
439
VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v );
440
VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v );
441
VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v );
442
VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v );
443
VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v );
444
445
VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
446
temp0v, temp1v, temp2v, temp3v );
447
VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
448
temp4v, temp5v, temp6v, temp7v );
449
450
VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
451
temp4v, temp5v, temp6v, temp7v,
452
diffh0v, diffh1v, diffh2v, diffh3v,
453
diffh4v, diffh5v, diffh6v, diffh7v );
454
455
VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
456
temp0v, temp1v, temp2v, temp3v );
457
VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
458
temp4v, temp5v, temp6v, temp7v );
459
460
VEC_ADD_ABS( temp0v, zero_s32v, satdv );
461
VEC_ADD_ABS( temp1v, satdv, satdv );
462
VEC_ADD_ABS( temp2v, satdv, satdv );
463
VEC_ADD_ABS( temp3v, satdv, satdv );
464
VEC_ADD_ABS( temp4v, satdv, satdv );
465
VEC_ADD_ABS( temp5v, satdv, satdv );
466
VEC_ADD_ABS( temp6v, satdv, satdv );
467
VEC_ADD_ABS( temp7v, satdv, satdv );
468
469
VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
470
temp0v, temp1v, temp2v, temp3v );
471
VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
472
temp4v, temp5v, temp6v, temp7v );
473
474
VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
475
temp4v, temp5v, temp6v, temp7v,
476
diffl0v, diffl1v, diffl2v, diffl3v,
477
diffl4v, diffl5v, diffl6v, diffl7v );
478
479
VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
480
temp0v, temp1v, temp2v, temp3v );
481
VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
482
temp4v, temp5v, temp6v, temp7v );
483
484
VEC_ADD_ABS( temp0v, satdv, satdv );
485
VEC_ADD_ABS( temp1v, satdv, satdv );
486
VEC_ADD_ABS( temp2v, satdv, satdv );
487
VEC_ADD_ABS( temp3v, satdv, satdv );
488
VEC_ADD_ABS( temp4v, satdv, satdv );
489
VEC_ADD_ABS( temp5v, satdv, satdv );
490
VEC_ADD_ABS( temp6v, satdv, satdv );
491
VEC_ADD_ABS( temp7v, satdv, satdv );
492
493
satdv = vec_sums( satdv, zero_s32v );
494
satdv = vec_splat( satdv, 3 );
495
vec_ste( satdv, 0, &i_satd );
496
497
return i_satd >> 1;
498
}
499
500
/***********************************************************************
501
* SATD 16x16
502
**********************************************************************/
503
static int pixel_satd_16x16_altivec( uint8_t *pix1, intptr_t i_pix1,
504
uint8_t *pix2, intptr_t i_pix2 )
505
{
506
ALIGNED_16( int i_satd );
507
508
LOAD_ZERO;
509
PREP_LOAD;
510
vec_s32_t satdv;
511
vec_s16_t pix1v, pix2v;
512
vec_s16_t diffh0v, diffh1v, diffh2v, diffh3v,
513
diffh4v, diffh5v, diffh6v, diffh7v;
514
vec_s16_t diffl0v, diffl1v, diffl2v, diffl3v,
515
diffl4v, diffl5v, diffl6v, diffl7v;
516
vec_s16_t temp0v, temp1v, temp2v, temp3v,
517
temp4v, temp5v, temp6v, temp7v;
518
PREP_LOAD_SRC( pix2 );
519
520
521
VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v );
522
VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v );
523
VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v );
524
VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v );
525
VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v );
526
VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v );
527
VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v );
528
VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v );
529
VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
530
temp0v, temp1v, temp2v, temp3v );
531
VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
532
temp4v, temp5v, temp6v, temp7v );
533
VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
534
temp4v, temp5v, temp6v, temp7v,
535
diffh0v, diffh1v, diffh2v, diffh3v,
536
diffh4v, diffh5v, diffh6v, diffh7v );
537
VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
538
temp0v, temp1v, temp2v, temp3v );
539
VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
540
temp4v, temp5v, temp6v, temp7v );
541
VEC_ADD_ABS( temp0v, zero_s32v, satdv );
542
VEC_ADD_ABS( temp1v, satdv, satdv );
543
VEC_ADD_ABS( temp2v, satdv, satdv );
544
VEC_ADD_ABS( temp3v, satdv, satdv );
545
VEC_ADD_ABS( temp4v, satdv, satdv );
546
VEC_ADD_ABS( temp5v, satdv, satdv );
547
VEC_ADD_ABS( temp6v, satdv, satdv );
548
VEC_ADD_ABS( temp7v, satdv, satdv );
549
VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
550
temp0v, temp1v, temp2v, temp3v );
551
VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
552
temp4v, temp5v, temp6v, temp7v );
553
VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
554
temp4v, temp5v, temp6v, temp7v,
555
diffl0v, diffl1v, diffl2v, diffl3v,
556
diffl4v, diffl5v, diffl6v, diffl7v );
557
VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
558
temp0v, temp1v, temp2v, temp3v );
559
VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
560
temp4v, temp5v, temp6v, temp7v );
561
VEC_ADD_ABS( temp0v, satdv, satdv );
562
VEC_ADD_ABS( temp1v, satdv, satdv );
563
VEC_ADD_ABS( temp2v, satdv, satdv );
564
VEC_ADD_ABS( temp3v, satdv, satdv );
565
VEC_ADD_ABS( temp4v, satdv, satdv );
566
VEC_ADD_ABS( temp5v, satdv, satdv );
567
VEC_ADD_ABS( temp6v, satdv, satdv );
568
VEC_ADD_ABS( temp7v, satdv, satdv );
569
570
VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh0v, diffl0v );
571
VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh1v, diffl1v );
572
VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh2v, diffl2v );
573
VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh3v, diffl3v );
574
VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh4v, diffl4v );
575
VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh5v, diffl5v );
576
VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh6v, diffl6v );
577
VEC_DIFF_HL( pix1, i_pix1, pix2, i_pix2, diffh7v, diffl7v );
578
VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
579
temp0v, temp1v, temp2v, temp3v );
580
VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
581
temp4v, temp5v, temp6v, temp7v );
582
VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
583
temp4v, temp5v, temp6v, temp7v,
584
diffh0v, diffh1v, diffh2v, diffh3v,
585
diffh4v, diffh5v, diffh6v, diffh7v );
586
VEC_HADAMAR( diffh0v, diffh1v, diffh2v, diffh3v,
587
temp0v, temp1v, temp2v, temp3v );
588
VEC_HADAMAR( diffh4v, diffh5v, diffh6v, diffh7v,
589
temp4v, temp5v, temp6v, temp7v );
590
VEC_ADD_ABS( temp0v, satdv, satdv );
591
VEC_ADD_ABS( temp1v, satdv, satdv );
592
VEC_ADD_ABS( temp2v, satdv, satdv );
593
VEC_ADD_ABS( temp3v, satdv, satdv );
594
VEC_ADD_ABS( temp4v, satdv, satdv );
595
VEC_ADD_ABS( temp5v, satdv, satdv );
596
VEC_ADD_ABS( temp6v, satdv, satdv );
597
VEC_ADD_ABS( temp7v, satdv, satdv );
598
VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
599
temp0v, temp1v, temp2v, temp3v );
600
VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
601
temp4v, temp5v, temp6v, temp7v );
602
VEC_TRANSPOSE_8( temp0v, temp1v, temp2v, temp3v,
603
temp4v, temp5v, temp6v, temp7v,
604
diffl0v, diffl1v, diffl2v, diffl3v,
605
diffl4v, diffl5v, diffl6v, diffl7v );
606
VEC_HADAMAR( diffl0v, diffl1v, diffl2v, diffl3v,
607
temp0v, temp1v, temp2v, temp3v );
608
VEC_HADAMAR( diffl4v, diffl5v, diffl6v, diffl7v,
609
temp4v, temp5v, temp6v, temp7v );
610
VEC_ADD_ABS( temp0v, satdv, satdv );
611
VEC_ADD_ABS( temp1v, satdv, satdv );
612
VEC_ADD_ABS( temp2v, satdv, satdv );
613
VEC_ADD_ABS( temp3v, satdv, satdv );
614
VEC_ADD_ABS( temp4v, satdv, satdv );
615
VEC_ADD_ABS( temp5v, satdv, satdv );
616
VEC_ADD_ABS( temp6v, satdv, satdv );
617
VEC_ADD_ABS( temp7v, satdv, satdv );
618
619
satdv = vec_sums( satdv, zero_s32v );
620
satdv = vec_splat( satdv, 3 );
621
vec_ste( satdv, 0, &i_satd );
622
623
return i_satd >> 1;
624
}
625
626
627
628
/***********************************************************************
629
* Interleaved SAD routines
630
**********************************************************************/
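
/* The _x3/_x4 routines below score one encode block (fenc, laid out with the
 * fixed FENC_STRIDE) against three or four candidate reference positions in a
 * single pass, so each fenc row is loaded once and reused for every candidate.
 * Scalar sketch of the x4 idea (hypothetical helper, not used here): */
static inline void sad_x4_wxh_c_ref( uint8_t *fenc, uint8_t *pix[4],
                                     intptr_t i_stride, int lx, int ly,
                                     int scores[4] )
{
    for( int i = 0; i < 4; i++ )
    {
        uint8_t *p = pix[i];
        int sum = 0;
        for( int y = 0; y < ly; y++, p += i_stride )
            for( int x = 0; x < lx; x++ )
            {
                int d = fenc[y*FENC_STRIDE + x] - p[x];
                sum += d < 0 ? -d : d;
            }
        scores[i] = sum;
    }
}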
631
632
static void pixel_sad_x4_16x16_altivec( uint8_t *fenc,
633
uint8_t *pix0, uint8_t *pix1,
634
uint8_t *pix2, uint8_t *pix3,
635
intptr_t i_stride, int scores[4] )
636
{
637
ALIGNED_16( int sum0 );
638
ALIGNED_16( int sum1 );
639
ALIGNED_16( int sum2 );
640
ALIGNED_16( int sum3 );
641
642
LOAD_ZERO;
643
vec_u8_t temp_lv, temp_hv;
644
vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
645
//vec_u8_t perm0v, perm1v, perm2v, perm3v;
646
vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB;
647
648
vec_s32_t sum0v, sum1v, sum2v, sum3v;
649
650
sum0v = vec_splat_s32(0);
651
sum1v = vec_splat_s32(0);
652
sum2v = vec_splat_s32(0);
653
sum3v = vec_splat_s32(0);
654
655
perm0vA = vec_lvsl(0, pix0);
656
perm1vA = vec_lvsl(0, pix1);
657
perm2vA = vec_lvsl(0, pix2);
658
perm3vA = vec_lvsl(0, pix3);
659
660
perm0vB = vec_lvsl(0, pix0 + i_stride);
661
perm1vB = vec_lvsl(0, pix1 + i_stride);
662
perm2vB = vec_lvsl(0, pix2 + i_stride);
663
perm3vB = vec_lvsl(0, pix3 + i_stride);
664
665
for( int y = 0; y < 8; y++ )
666
{
667
temp_lv = vec_ld(0, pix0);
668
temp_hv = vec_ld(16, pix0);
669
pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
670
pix0 += i_stride;
671
672
temp_lv = vec_ld(0, pix1);
673
temp_hv = vec_ld(16, pix1);
674
pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
675
pix1 += i_stride;
676
677
fencv = vec_ld(0, fenc);
678
fenc += FENC_STRIDE;
679
680
temp_lv = vec_ld(0, pix2);
681
temp_hv = vec_ld(16, pix2);
682
pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
683
pix2 += i_stride;
684
685
temp_lv = vec_ld(0, pix3);
686
temp_hv = vec_ld(16, pix3);
687
pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
688
pix3 += i_stride;
689
690
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
691
sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
692
sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
693
sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
694
695
temp_lv = vec_ld(0, pix0);
696
temp_hv = vec_ld(16, pix0);
697
pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
698
pix0 += i_stride;
699
700
temp_lv = vec_ld(0, pix1);
701
temp_hv = vec_ld(16, pix1);
702
pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
703
pix1 += i_stride;
704
705
fencv = vec_ld(0, fenc);
706
fenc += FENC_STRIDE;
707
708
temp_lv = vec_ld(0, pix2);
709
temp_hv = vec_ld(16, pix2);
710
pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
711
pix2 += i_stride;
712
713
temp_lv = vec_ld(0, pix3);
714
temp_hv = vec_ld(16, pix3);
715
pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
716
pix3 += i_stride;
717
718
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
719
sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
720
sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
721
sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
722
}
723
724
sum0v = vec_sums( sum0v, zero_s32v );
725
sum1v = vec_sums( sum1v, zero_s32v );
726
sum2v = vec_sums( sum2v, zero_s32v );
727
sum3v = vec_sums( sum3v, zero_s32v );
728
729
sum0v = vec_splat( sum0v, 3 );
730
sum1v = vec_splat( sum1v, 3 );
731
sum2v = vec_splat( sum2v, 3 );
732
sum3v = vec_splat( sum3v, 3 );
733
734
vec_ste( sum0v, 0, &sum0);
735
vec_ste( sum1v, 0, &sum1);
736
vec_ste( sum2v, 0, &sum2);
737
vec_ste( sum3v, 0, &sum3);
738
739
scores[0] = sum0;
740
scores[1] = sum1;
741
scores[2] = sum2;
742
scores[3] = sum3;
743
}
744
745
static void pixel_sad_x3_16x16_altivec( uint8_t *fenc, uint8_t *pix0,
746
uint8_t *pix1, uint8_t *pix2,
747
intptr_t i_stride, int scores[3] )
748
{
749
ALIGNED_16( int sum0 );
750
ALIGNED_16( int sum1 );
751
ALIGNED_16( int sum2 );
752
753
LOAD_ZERO;
754
vec_u8_t temp_lv, temp_hv; // temporary load vectors
755
vec_u8_t fencv, pix0v, pix1v, pix2v;
756
vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB;
757
758
vec_s32_t sum0v, sum1v, sum2v;
759
760
sum0v = vec_splat_s32(0);
761
sum1v = vec_splat_s32(0);
762
sum2v = vec_splat_s32(0);
763
764
perm0vA = vec_lvsl(0, pix0);
765
perm1vA = vec_lvsl(0, pix1);
766
perm2vA = vec_lvsl(0, pix2);
767
768
perm0vB = vec_lvsl(0, pix0 + i_stride);
769
perm1vB = vec_lvsl(0, pix1 + i_stride);
770
perm2vB = vec_lvsl(0, pix2 + i_stride);
771
772
for( int y = 0; y < 8; y++ )
773
{
774
temp_lv = vec_ld(0, pix0);
775
temp_hv = vec_ld(16, pix0);
776
pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
777
pix0 += i_stride;
778
779
temp_lv = vec_ld(0, pix1);
780
temp_hv = vec_ld(16, pix1);
781
pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
782
pix1 += i_stride;
783
784
fencv = vec_ld(0, fenc);
785
fenc += FENC_STRIDE;
786
787
temp_lv = vec_ld(0, pix2);
788
temp_hv = vec_ld(16, pix2);
789
pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
790
pix2 += i_stride;
791
792
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
793
sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
794
sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
795
796
temp_lv = vec_ld(0, pix0);
797
temp_hv = vec_ld(16, pix0);
798
pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
799
pix0 += i_stride;
800
801
802
temp_lv = vec_ld(0, pix1);
803
temp_hv = vec_ld(16, pix1);
804
pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
805
pix1 += i_stride;
806
807
fencv = vec_ld(0, fenc);
808
fenc += FENC_STRIDE;
809
810
temp_lv = vec_ld(0, pix2);
811
temp_hv = vec_ld(16, pix2);
812
pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
813
pix2 += i_stride;
814
815
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
816
sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
817
sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
818
}
819
820
sum0v = vec_sums( sum0v, zero_s32v );
821
sum1v = vec_sums( sum1v, zero_s32v );
822
sum2v = vec_sums( sum2v, zero_s32v );
823
824
sum0v = vec_splat( sum0v, 3 );
825
sum1v = vec_splat( sum1v, 3 );
826
sum2v = vec_splat( sum2v, 3 );
827
828
vec_ste( sum0v, 0, &sum0);
829
vec_ste( sum1v, 0, &sum1);
830
vec_ste( sum2v, 0, &sum2);
831
832
scores[0] = sum0;
833
scores[1] = sum1;
834
scores[2] = sum2;
835
}
836
837
static void pixel_sad_x4_16x8_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2,
838
uint8_t *pix3, intptr_t i_stride, int scores[4] )
839
{
840
ALIGNED_16( int sum0 );
841
ALIGNED_16( int sum1 );
842
ALIGNED_16( int sum2 );
843
ALIGNED_16( int sum3 );
844
845
LOAD_ZERO;
846
vec_u8_t temp_lv, temp_hv;
847
vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
848
vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB;
849
850
vec_s32_t sum0v, sum1v, sum2v, sum3v;
851
852
sum0v = vec_splat_s32(0);
853
sum1v = vec_splat_s32(0);
854
sum2v = vec_splat_s32(0);
855
sum3v = vec_splat_s32(0);
856
857
perm0vA = vec_lvsl(0, pix0);
858
perm1vA = vec_lvsl(0, pix1);
859
perm2vA = vec_lvsl(0, pix2);
860
perm3vA = vec_lvsl(0, pix3);
861
862
perm0vB = vec_lvsl(0, pix0 + i_stride);
863
perm1vB = vec_lvsl(0, pix1 + i_stride);
864
perm2vB = vec_lvsl(0, pix2 + i_stride);
865
perm3vB = vec_lvsl(0, pix3 + i_stride);
866
867
for( int y = 0; y < 4; y++ )
868
{
869
temp_lv = vec_ld(0, pix0);
870
temp_hv = vec_ld(16, pix0);
871
pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
872
pix0 += i_stride;
873
874
temp_lv = vec_ld(0, pix1);
875
temp_hv = vec_ld(16, pix1);
876
pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
877
pix1 += i_stride;
878
879
fencv = vec_ld(0, fenc);
880
fenc += FENC_STRIDE;
881
882
temp_lv = vec_ld(0, pix2);
883
temp_hv = vec_ld(16, pix2);
884
pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
885
pix2 += i_stride;
886
887
temp_lv = vec_ld(0, pix3);
888
temp_hv = vec_ld(16, pix3);
889
pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
890
pix3 += i_stride;
891
892
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
893
sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
894
sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
895
sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
896
897
temp_lv = vec_ld(0, pix0);
898
temp_hv = vec_ld(16, pix0);
899
pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
900
pix0 += i_stride;
901
902
temp_lv = vec_ld(0, pix1);
903
temp_hv = vec_ld(16, pix1);
904
pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
905
pix1 += i_stride;
906
907
fencv = vec_ld(0, fenc);
908
fenc += FENC_STRIDE;
909
910
temp_lv = vec_ld(0, pix2);
911
temp_hv = vec_ld(16, pix2);
912
pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
913
pix2 += i_stride;
914
915
temp_lv = vec_ld(0, pix3);
916
temp_hv = vec_ld(16, pix3);
917
pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
918
pix3 += i_stride;
919
920
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
921
sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
922
sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
923
sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
924
}
925
926
sum0v = vec_sums( sum0v, zero_s32v );
927
sum1v = vec_sums( sum1v, zero_s32v );
928
sum2v = vec_sums( sum2v, zero_s32v );
929
sum3v = vec_sums( sum3v, zero_s32v );
930
931
sum0v = vec_splat( sum0v, 3 );
932
sum1v = vec_splat( sum1v, 3 );
933
sum2v = vec_splat( sum2v, 3 );
934
sum3v = vec_splat( sum3v, 3 );
935
936
vec_ste( sum0v, 0, &sum0);
937
vec_ste( sum1v, 0, &sum1);
938
vec_ste( sum2v, 0, &sum2);
939
vec_ste( sum3v, 0, &sum3);
940
941
scores[0] = sum0;
942
scores[1] = sum1;
943
scores[2] = sum2;
944
scores[3] = sum3;
945
}
946
947
static void pixel_sad_x3_16x8_altivec( uint8_t *fenc, uint8_t *pix0,
948
uint8_t *pix1, uint8_t *pix2,
949
intptr_t i_stride, int scores[3] )
950
{
951
ALIGNED_16( int sum0 );
952
ALIGNED_16( int sum1 );
953
ALIGNED_16( int sum2 );
954
955
LOAD_ZERO;
956
vec_u8_t temp_lv, temp_hv;
957
vec_u8_t fencv, pix0v, pix1v, pix2v;
958
vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB;
959
960
vec_s32_t sum0v, sum1v, sum2v;
961
962
sum0v = vec_splat_s32(0);
963
sum1v = vec_splat_s32(0);
964
sum2v = vec_splat_s32(0);
965
966
perm0vA = vec_lvsl(0, pix0);
967
perm1vA = vec_lvsl(0, pix1);
968
perm2vA = vec_lvsl(0, pix2);
969
970
perm0vB = vec_lvsl(0, pix0 + i_stride);
971
perm1vB = vec_lvsl(0, pix1 + i_stride);
972
perm2vB = vec_lvsl(0, pix2 + i_stride);
973
974
for( int y = 0; y < 4; y++ )
975
{
976
temp_lv = vec_ld(0, pix0);
977
temp_hv = vec_ld(16, pix0);
978
pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
979
pix0 += i_stride;
980
981
temp_lv = vec_ld(0, pix1);
982
temp_hv = vec_ld(16, pix1);
983
pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
984
pix1 += i_stride;
985
986
fencv = vec_ld(0, fenc);
987
fenc += FENC_STRIDE;
988
989
temp_lv = vec_ld(0, pix2);
990
temp_hv = vec_ld(16, pix2);
991
pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
992
pix2 += i_stride;
993
994
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
995
sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
996
sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
997
998
temp_lv = vec_ld(0, pix0);
999
temp_hv = vec_ld(16, pix0);
1000
pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
1001
pix0 += i_stride;
1002
1003
temp_lv = vec_ld(0, pix1);
1004
temp_hv = vec_ld(16, pix1);
1005
pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
1006
pix1 += i_stride;
1007
1008
fencv = vec_ld(0, fenc);
1009
fenc += FENC_STRIDE;
1010
1011
temp_lv = vec_ld(0, pix2);
1012
temp_hv = vec_ld(16, pix2);
1013
pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
1014
pix2 += i_stride;
1015
1016
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
1017
sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
1018
sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
1019
}
1020
1021
sum0v = vec_sums( sum0v, zero_s32v );
1022
sum1v = vec_sums( sum1v, zero_s32v );
1023
sum2v = vec_sums( sum2v, zero_s32v );
1024
1025
sum0v = vec_splat( sum0v, 3 );
1026
sum1v = vec_splat( sum1v, 3 );
1027
sum2v = vec_splat( sum2v, 3 );
1028
1029
vec_ste( sum0v, 0, &sum0);
1030
vec_ste( sum1v, 0, &sum1);
1031
vec_ste( sum2v, 0, &sum2);
1032
1033
scores[0] = sum0;
1034
scores[1] = sum1;
1035
scores[2] = sum2;
1036
}
1037
1038
1039
static void pixel_sad_x4_8x16_altivec( uint8_t *fenc,
1040
uint8_t *pix0, uint8_t *pix1,
1041
uint8_t *pix2, uint8_t *pix3,
1042
intptr_t i_stride, int scores[4] )
1043
{
1044
ALIGNED_16( int sum0 );
1045
ALIGNED_16( int sum1 );
1046
ALIGNED_16( int sum2 );
1047
ALIGNED_16( int sum3 );
1048
1049
LOAD_ZERO;
1050
vec_u8_t temp_lv, temp_hv;
1051
vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
1052
vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB, permEncv;
1053
1054
vec_s32_t sum0v, sum1v, sum2v, sum3v;
1055
1056
sum0v = vec_splat_s32(0);
1057
sum1v = vec_splat_s32(0);
1058
sum2v = vec_splat_s32(0);
1059
sum3v = vec_splat_s32(0);
1060
1061
permEncv = vec_lvsl(0, fenc);
1062
perm0vA = vec_lvsl(0, pix0);
1063
perm1vA = vec_lvsl(0, pix1);
1064
perm2vA = vec_lvsl(0, pix2);
1065
perm3vA = vec_lvsl(0, pix3);
1066
1067
perm0vB = vec_lvsl(0, pix0 + i_stride);
1068
perm1vB = vec_lvsl(0, pix1 + i_stride);
1069
perm2vB = vec_lvsl(0, pix2 + i_stride);
1070
perm3vB = vec_lvsl(0, pix3 + i_stride);
1071
1072
for( int y = 0; y < 8; y++ )
1073
{
1074
temp_lv = vec_ld(0, pix0);
1075
temp_hv = vec_ld(16, pix0);
1076
pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
1077
pix0 += i_stride;
1078
1079
temp_lv = vec_ld(0, pix1);
1080
temp_hv = vec_ld(16, pix1);
1081
pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
1082
pix1 += i_stride;
1083
1084
temp_lv = vec_ld(0, fenc);
1085
fencv = vec_perm(temp_lv, temp_hv, permEncv);
1086
fenc += FENC_STRIDE;
1087
1088
temp_lv = vec_ld(0, pix2);
1089
temp_hv = vec_ld(16, pix2);
1090
pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
1091
pix2 += i_stride;
1092
1093
temp_lv = vec_ld(0, pix3);
1094
temp_hv = vec_ld(16, pix3);
1095
pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
1096
pix3 += i_stride;
1097
1098
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
1099
sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
1100
sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
1101
sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
1102
1103
temp_lv = vec_ld(0, pix0);
1104
temp_hv = vec_ld(16, pix0);
1105
pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
1106
pix0 += i_stride;
1107
1108
temp_lv = vec_ld(0, pix1);
1109
temp_hv = vec_ld(16, pix1);
1110
pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
1111
pix1 += i_stride;
1112
1113
temp_lv = vec_ld(0, fenc);
1114
fencv = vec_perm(temp_lv, temp_hv, permEncv);
1115
fenc += FENC_STRIDE;
1116
1117
temp_lv = vec_ld(0, pix2);
1118
temp_hv = vec_ld(16, pix2);
1119
pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
1120
pix2 += i_stride;
1121
1122
temp_lv = vec_ld(0, pix3);
1123
temp_hv = vec_ld(16, pix3);
1124
pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
1125
pix3 += i_stride;
1126
1127
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
1128
sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
1129
sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
1130
sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
1131
}
1132
1133
sum0v = vec_sum2s( sum0v, zero_s32v );
1134
sum1v = vec_sum2s( sum1v, zero_s32v );
1135
sum2v = vec_sum2s( sum2v, zero_s32v );
1136
sum3v = vec_sum2s( sum3v, zero_s32v );
1137
1138
sum0v = vec_splat( sum0v, 1 );
1139
sum1v = vec_splat( sum1v, 1 );
1140
sum2v = vec_splat( sum2v, 1 );
1141
sum3v = vec_splat( sum3v, 1 );
1142
1143
vec_ste( sum0v, 0, &sum0);
1144
vec_ste( sum1v, 0, &sum1);
1145
vec_ste( sum2v, 0, &sum2);
1146
vec_ste( sum3v, 0, &sum3);
1147
1148
scores[0] = sum0;
1149
scores[1] = sum1;
1150
scores[2] = sum2;
1151
scores[3] = sum3;
1152
}
1153
1154
static void pixel_sad_x3_8x16_altivec( uint8_t *fenc, uint8_t *pix0,
1155
uint8_t *pix1, uint8_t *pix2,
1156
intptr_t i_stride, int scores[3] )
1157
{
1158
ALIGNED_16( int sum0 );
1159
ALIGNED_16( int sum1 );
1160
ALIGNED_16( int sum2 );
1161
1162
LOAD_ZERO;
1163
vec_u8_t temp_lv, temp_hv;
1164
vec_u8_t fencv, pix0v, pix1v, pix2v;
1165
vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB, permEncv;
1166
1167
vec_s32_t sum0v, sum1v, sum2v;
1168
1169
sum0v = vec_splat_s32(0);
1170
sum1v = vec_splat_s32(0);
1171
sum2v = vec_splat_s32(0);
1172
1173
permEncv = vec_lvsl(0, fenc);
1174
perm0vA = vec_lvsl(0, pix0);
1175
perm1vA = vec_lvsl(0, pix1);
1176
perm2vA = vec_lvsl(0, pix2);
1177
1178
perm0vB = vec_lvsl(0, pix0 + i_stride);
1179
perm1vB = vec_lvsl(0, pix1 + i_stride);
1180
perm2vB = vec_lvsl(0, pix2 + i_stride);
1181
1182
for( int y = 0; y < 8; y++ )
1183
{
1184
temp_lv = vec_ld(0, pix0);
1185
temp_hv = vec_ld(16, pix0);
1186
pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
1187
pix0 += i_stride;
1188
1189
temp_lv = vec_ld(0, pix1);
1190
temp_hv = vec_ld(16, pix1);
1191
pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
1192
pix1 += i_stride;
1193
1194
temp_lv = vec_ld(0, fenc);
1195
fencv = vec_perm(temp_lv, temp_hv, permEncv);
1196
fenc += FENC_STRIDE;
1197
1198
temp_lv = vec_ld(0, pix2);
1199
temp_hv = vec_ld(16, pix2);
1200
pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
1201
pix2 += i_stride;
1202
1203
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
1204
sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
1205
sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
1206
1207
temp_lv = vec_ld(0, pix0);
1208
temp_hv = vec_ld(16, pix0);
1209
pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
1210
pix0 += i_stride;
1211
1212
temp_lv = vec_ld(0, pix1);
1213
temp_hv = vec_ld(16, pix1);
1214
pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
1215
pix1 += i_stride;
1216
1217
temp_lv = vec_ld(0, fenc);
1218
fencv = vec_perm(temp_lv, temp_hv, permEncv);
1219
fenc += FENC_STRIDE;
1220
1221
temp_lv = vec_ld(0, pix2);
1222
temp_hv = vec_ld(16, pix2);
1223
pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
1224
pix2 += i_stride;
1225
1226
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
1227
sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
1228
sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
1229
}
1230
1231
sum0v = vec_sum2s( sum0v, zero_s32v );
1232
sum1v = vec_sum2s( sum1v, zero_s32v );
1233
sum2v = vec_sum2s( sum2v, zero_s32v );
1234
1235
sum0v = vec_splat( sum0v, 1 );
1236
sum1v = vec_splat( sum1v, 1 );
1237
sum2v = vec_splat( sum2v, 1 );
1238
1239
vec_ste( sum0v, 0, &sum0);
1240
vec_ste( sum1v, 0, &sum1);
1241
vec_ste( sum2v, 0, &sum2);
1242
1243
scores[0] = sum0;
1244
scores[1] = sum1;
1245
scores[2] = sum2;
1246
}
1247
1248
static void pixel_sad_x4_8x8_altivec( uint8_t *fenc,
1249
uint8_t *pix0, uint8_t *pix1,
1250
uint8_t *pix2, uint8_t *pix3,
1251
intptr_t i_stride, int scores[4] )
1252
{
1253
ALIGNED_16( int sum0 );
1254
ALIGNED_16( int sum1 );
1255
ALIGNED_16( int sum2 );
1256
ALIGNED_16( int sum3 );
1257
1258
LOAD_ZERO;
1259
vec_u8_t temp_lv, temp_hv;
1260
vec_u8_t fencv, pix0v, pix1v, pix2v, pix3v;
1261
vec_u8_t perm0vA, perm1vA, perm2vA, perm3vA, perm0vB, perm1vB, perm2vB, perm3vB, permEncv;
1262
1263
vec_s32_t sum0v, sum1v, sum2v, sum3v;
1264
1265
sum0v = vec_splat_s32(0);
1266
sum1v = vec_splat_s32(0);
1267
sum2v = vec_splat_s32(0);
1268
sum3v = vec_splat_s32(0);
1269
1270
permEncv = vec_lvsl(0, fenc);
1271
perm0vA = vec_lvsl(0, pix0);
1272
perm1vA = vec_lvsl(0, pix1);
1273
perm2vA = vec_lvsl(0, pix2);
1274
perm3vA = vec_lvsl(0, pix3);
1275
1276
perm0vB = vec_lvsl(0, pix0 + i_stride);
1277
perm1vB = vec_lvsl(0, pix1 + i_stride);
1278
perm2vB = vec_lvsl(0, pix2 + i_stride);
1279
perm3vB = vec_lvsl(0, pix3 + i_stride);
1280
1281
for( int y = 0; y < 4; y++ )
1282
{
1283
temp_lv = vec_ld(0, pix0);
1284
temp_hv = vec_ld(16, pix0);
1285
pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
1286
pix0 += i_stride;
1287
1288
temp_lv = vec_ld(0, pix1);
1289
temp_hv = vec_ld(16, pix1);
1290
pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
1291
pix1 += i_stride;
1292
1293
temp_lv = vec_ld(0, fenc);
1294
fencv = vec_perm(temp_lv, temp_hv, permEncv);
1295
fenc += FENC_STRIDE;
1296
1297
temp_lv = vec_ld(0, pix2);
1298
temp_hv = vec_ld(16, pix2);
1299
pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
1300
pix2 += i_stride;
1301
1302
temp_lv = vec_ld(0, pix3);
1303
temp_hv = vec_ld(16, pix3);
1304
pix3v = vec_perm(temp_lv, temp_hv, perm3vA);
1305
pix3 += i_stride;
1306
1307
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
1308
sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
1309
sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
1310
sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
1311
1312
temp_lv = vec_ld(0, pix0);
1313
temp_hv = vec_ld(16, pix0);
1314
pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
1315
pix0 += i_stride;
1316
1317
temp_lv = vec_ld(0, pix1);
1318
temp_hv = vec_ld(16, pix1);
1319
pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
1320
pix1 += i_stride;
1321
1322
temp_lv = vec_ld(0, fenc);
1323
fencv = vec_perm(temp_lv, temp_hv, permEncv);
1324
fenc += FENC_STRIDE;
1325
1326
temp_lv = vec_ld(0, pix2);
1327
temp_hv = vec_ld(16, pix2);
1328
pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
1329
pix2 += i_stride;
1330
1331
temp_lv = vec_ld(0, pix3);
1332
temp_hv = vec_ld(16, pix3);
1333
pix3v = vec_perm(temp_lv, temp_hv, perm3vB);
1334
pix3 += i_stride;
1335
1336
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
1337
sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
1338
sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
1339
sum3v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix3v ), vec_min( fencv, pix3v ) ), (vec_u32_t) sum3v );
1340
}
1341
1342
sum0v = vec_sum2s( sum0v, zero_s32v );
1343
sum1v = vec_sum2s( sum1v, zero_s32v );
1344
sum2v = vec_sum2s( sum2v, zero_s32v );
1345
sum3v = vec_sum2s( sum3v, zero_s32v );
1346
1347
sum0v = vec_splat( sum0v, 1 );
1348
sum1v = vec_splat( sum1v, 1 );
1349
sum2v = vec_splat( sum2v, 1 );
1350
sum3v = vec_splat( sum3v, 1 );
1351
1352
vec_ste( sum0v, 0, &sum0);
1353
vec_ste( sum1v, 0, &sum1);
1354
vec_ste( sum2v, 0, &sum2);
1355
vec_ste( sum3v, 0, &sum3);
1356
1357
scores[0] = sum0;
1358
scores[1] = sum1;
1359
scores[2] = sum2;
1360
scores[3] = sum3;
1361
}
1362
1363
static void pixel_sad_x3_8x8_altivec( uint8_t *fenc, uint8_t *pix0,
1364
uint8_t *pix1, uint8_t *pix2,
1365
intptr_t i_stride, int scores[3] )
1366
{
1367
ALIGNED_16( int sum0 );
1368
ALIGNED_16( int sum1 );
1369
ALIGNED_16( int sum2 );
1370
1371
LOAD_ZERO;
1372
vec_u8_t temp_lv, temp_hv;
1373
vec_u8_t fencv, pix0v, pix1v, pix2v;
1374
vec_u8_t perm0vA, perm1vA, perm2vA, perm0vB, perm1vB, perm2vB, permEncv;
1375
1376
vec_s32_t sum0v, sum1v, sum2v;
1377
1378
sum0v = vec_splat_s32(0);
1379
sum1v = vec_splat_s32(0);
1380
sum2v = vec_splat_s32(0);
1381
1382
permEncv = vec_lvsl(0, fenc);
1383
perm0vA = vec_lvsl(0, pix0);
1384
perm1vA = vec_lvsl(0, pix1);
1385
perm2vA = vec_lvsl(0, pix2);
1386
1387
perm0vB = vec_lvsl(0, pix0 + i_stride);
1388
perm1vB = vec_lvsl(0, pix1 + i_stride);
1389
perm2vB = vec_lvsl(0, pix2 + i_stride);
1390
1391
for( int y = 0; y < 4; y++ )
1392
{
1393
temp_lv = vec_ld(0, pix0);
1394
temp_hv = vec_ld(16, pix0);
1395
pix0v = vec_perm(temp_lv, temp_hv, perm0vA);
1396
pix0 += i_stride;
1397
1398
temp_lv = vec_ld(0, pix1);
1399
temp_hv = vec_ld(16, pix1);
1400
pix1v = vec_perm(temp_lv, temp_hv, perm1vA);
1401
pix1 += i_stride;
1402
1403
temp_lv = vec_ld(0, fenc);
1404
fencv = vec_perm(temp_lv, temp_hv, permEncv);
1405
fenc += FENC_STRIDE;
1406
1407
temp_lv = vec_ld(0, pix2);
1408
temp_hv = vec_ld(16, pix2);
1409
pix2v = vec_perm(temp_lv, temp_hv, perm2vA);
1410
pix2 += i_stride;
1411
1412
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
1413
sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
1414
sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
1415
1416
temp_lv = vec_ld(0, pix0);
1417
temp_hv = vec_ld(16, pix0);
1418
pix0v = vec_perm(temp_lv, temp_hv, perm0vB);
1419
pix0 += i_stride;
1420
1421
temp_lv = vec_ld(0, pix1);
1422
temp_hv = vec_ld(16, pix1);
1423
pix1v = vec_perm(temp_lv, temp_hv, perm1vB);
1424
pix1 += i_stride;
1425
1426
temp_lv = vec_ld(0, fenc);
1427
fencv = vec_perm(temp_lv, temp_hv, permEncv);
1428
fenc += FENC_STRIDE;
1429
1430
temp_lv = vec_ld(0, pix2);
1431
temp_hv = vec_ld(16, pix2);
1432
pix2v = vec_perm(temp_lv, temp_hv, perm2vB);
1433
pix2 += i_stride;
1434
1435
sum0v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix0v ), vec_min( fencv, pix0v ) ), (vec_u32_t) sum0v );
1436
sum1v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix1v ), vec_min( fencv, pix1v ) ), (vec_u32_t) sum1v );
1437
sum2v = (vec_s32_t) vec_sum4s( vec_sub( vec_max( fencv, pix2v ), vec_min( fencv, pix2v ) ), (vec_u32_t) sum2v );
1438
}
1439
1440
sum0v = vec_sum2s( sum0v, zero_s32v );
1441
sum1v = vec_sum2s( sum1v, zero_s32v );
1442
sum2v = vec_sum2s( sum2v, zero_s32v );
1443
1444
sum0v = vec_splat( sum0v, 1 );
1445
sum1v = vec_splat( sum1v, 1 );
1446
sum2v = vec_splat( sum2v, 1 );
1447
1448
vec_ste( sum0v, 0, &sum0);
1449
vec_ste( sum1v, 0, &sum1);
1450
vec_ste( sum2v, 0, &sum2);
1451
1452
scores[0] = sum0;
1453
scores[1] = sum1;
1454
scores[2] = sum2;
1455
}
1456
1457
/***********************************************************************
1458
* SSD routines
1459
**********************************************************************/
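
/* SSD accumulates squared byte differences; the vector code below expresses
 * |a-b| with the same max/min trick as the SAD routines and squares and
 * accumulates with vec_msum().  Scalar sketch (hypothetical helper, not used
 * here): */
static inline int ssd_wxh_c_ref( uint8_t *pix1, intptr_t i_pix1,
                                 uint8_t *pix2, intptr_t i_pix2,
                                 int lx, int ly )
{
    int sum = 0;
    for( int y = 0; y < ly; y++, pix1 += i_pix1, pix2 += i_pix2 )
        for( int x = 0; x < lx; x++ )
        {
            int d = pix1[x] - pix2[x];
            sum += d * d;
        }
    return sum;
}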
1460
1461
static int pixel_ssd_16x16_altivec ( uint8_t *pix1, intptr_t i_stride_pix1,
1462
uint8_t *pix2, intptr_t i_stride_pix2 )
1463
{
1464
ALIGNED_16( int sum );
1465
1466
LOAD_ZERO;
1467
vec_u8_t pix1vA, pix2vA, pix1vB, pix2vB;
1468
vec_u32_t sumv;
1469
vec_u8_t maxA, minA, diffA, maxB, minB, diffB;
1470
vec_u8_t temp_lv, temp_hv;
1471
vec_u8_t permA, permB;
1472
1473
sumv = vec_splat_u32(0);
1474
1475
permA = vec_lvsl(0, pix2);
1476
permB = vec_lvsl(0, pix2 + i_stride_pix2);
1477
1478
temp_lv = vec_ld(0, pix2);
1479
temp_hv = vec_ld(16, pix2);
1480
pix2vA = vec_perm(temp_lv, temp_hv, permA);
1481
pix1vA = vec_ld(0, pix1);
1482
1483
for( int y = 0; y < 7; y++ )
1484
{
1485
pix1 += i_stride_pix1;
1486
pix2 += i_stride_pix2;
1487
1488
maxA = vec_max(pix1vA, pix2vA);
1489
minA = vec_min(pix1vA, pix2vA);
1490
1491
temp_lv = vec_ld(0, pix2);
1492
temp_hv = vec_ld(16, pix2);
1493
pix2vB = vec_perm(temp_lv, temp_hv, permB);
1494
pix1vB = vec_ld(0, pix1);
1495
1496
diffA = vec_sub(maxA, minA);
1497
sumv = vec_msum(diffA, diffA, sumv);
1498
1499
pix1 += i_stride_pix1;
1500
pix2 += i_stride_pix2;
1501
1502
maxB = vec_max(pix1vB, pix2vB);
1503
minB = vec_min(pix1vB, pix2vB);
1504
1505
temp_lv = vec_ld(0, pix2);
1506
temp_hv = vec_ld(16, pix2);
1507
pix2vA = vec_perm(temp_lv, temp_hv, permA);
1508
pix1vA = vec_ld(0, pix1);
1509
1510
diffB = vec_sub(maxB, minB);
1511
sumv = vec_msum(diffB, diffB, sumv);
1512
}
1513
1514
pix1 += i_stride_pix1;
1515
pix2 += i_stride_pix2;
1516
1517
temp_lv = vec_ld(0, pix2);
1518
temp_hv = vec_ld(16, pix2);
1519
pix2vB = vec_perm(temp_lv, temp_hv, permB);
1520
pix1vB = vec_ld(0, pix1);
1521
1522
maxA = vec_max(pix1vA, pix2vA);
1523
minA = vec_min(pix1vA, pix2vA);
1524
1525
maxB = vec_max(pix1vB, pix2vB);
1526
minB = vec_min(pix1vB, pix2vB);
1527
1528
diffA = vec_sub(maxA, minA);
1529
sumv = vec_msum(diffA, diffA, sumv);
1530
1531
diffB = vec_sub(maxB, minB);
1532
sumv = vec_msum(diffB, diffB, sumv);
1533
1534
sumv = (vec_u32_t) vec_sums((vec_s32_t) sumv, zero_s32v);
1535
sumv = vec_splat(sumv, 3);
1536
vec_ste((vec_s32_t) sumv, 0, &sum);
1537
return sum;
1538
}
1539
1540
static int pixel_ssd_8x8_altivec ( uint8_t *pix1, intptr_t i_stride_pix1,
1541
uint8_t *pix2, intptr_t i_stride_pix2 )
1542
{
1543
ALIGNED_16( int sum );
1544
1545
LOAD_ZERO;
1546
vec_u8_t pix1v, pix2v;
1547
vec_u32_t sumv;
1548
vec_u8_t maxv, minv, diffv;
1549
vec_u8_t temp_lv, temp_hv;
1550
vec_u8_t perm1v, perm2v;
1551
1552
const vec_u32_t sel = (vec_u32_t)CV(-1,-1,0,0);
1553
1554
sumv = vec_splat_u32(0);
1555
1556
perm1v = vec_lvsl(0, pix1);
1557
perm2v = vec_lvsl(0, pix2);
1558
1559
for( int y = 0; y < 8; y++ )
1560
{
1561
temp_hv = vec_ld(0, pix1);
1562
temp_lv = vec_ld(7, pix1);
1563
pix1v = vec_perm(temp_hv, temp_lv, perm1v);
1564
1565
temp_hv = vec_ld(0, pix2);
1566
temp_lv = vec_ld(7, pix2);
1567
pix2v = vec_perm(temp_hv, temp_lv, perm2v);
1568
1569
maxv = vec_max(pix1v, pix2v);
1570
minv = vec_min(pix1v, pix2v);
1571
1572
diffv = vec_sub(maxv, minv);
1573
sumv = vec_msum(diffv, diffv, sumv);
1574
1575
pix1 += i_stride_pix1;
1576
pix2 += i_stride_pix2;
1577
}
1578
1579
sumv = vec_sel( zero_u32v, sumv, sel );
1580
1581
sumv = (vec_u32_t) vec_sums((vec_s32_t) sumv, zero_s32v);
1582
sumv = vec_splat(sumv, 3);
1583
vec_ste((vec_s32_t) sumv, 0, &sum);
1584
1585
return sum;
1586
}
1587
1588
1589
/****************************************************************************
1590
* variance
1591
****************************************************************************/
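/* Both pixel_var functions below pack the raw pixel sum into the low 32 bits
 * of the return value and the sum of squared pixels into the high 32 bits;
 * the caller derives the actual variance.  One standard reduction for an 8x8
 * block looks like this (illustrative sketch, variable names hypothetical):
 *
 *     uint64_t r   = x264_pixel_var_8x8_altivec( pix, stride );
 *     uint32_t sum = (uint32_t)r;
 *     uint32_t sqr = (uint32_t)(r >> 32);
 *     uint32_t var = sqr - ((uint64_t)sum * sum >> 6);   // >>6 == /64 pixels
 */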
1592
static uint64_t x264_pixel_var_16x16_altivec( uint8_t *pix, intptr_t i_stride )
1593
{
1594
ALIGNED_16(uint32_t sum_tab[4]);
1595
ALIGNED_16(uint32_t sqr_tab[4]);
1596
1597
LOAD_ZERO;
1598
vec_u32_t sqr_v = zero_u32v;
1599
vec_u32_t sum_v = zero_u32v;
1600
1601
for( int y = 0; y < 16; y++ )
1602
{
1603
vec_u8_t pix0_v = vec_ld(0, pix);
1604
sum_v = vec_sum4s(pix0_v, sum_v);
1605
sqr_v = vec_msum(pix0_v, pix0_v, sqr_v);
1606
1607
pix += i_stride;
1608
}
1609
sum_v = (vec_u32_t)vec_sums( (vec_s32_t)sum_v, zero_s32v );
1610
sqr_v = (vec_u32_t)vec_sums( (vec_s32_t)sqr_v, zero_s32v );
1611
vec_ste(sum_v, 12, sum_tab);
1612
vec_ste(sqr_v, 12, sqr_tab);
1613
1614
uint32_t sum = sum_tab[3];
1615
uint32_t sqr = sqr_tab[3];
1616
return sum + ((uint64_t)sqr<<32);
1617
}
1618
1619
static uint64_t x264_pixel_var_8x8_altivec( uint8_t *pix, intptr_t i_stride )
1620
{
1621
ALIGNED_16(uint32_t sum_tab[4]);
1622
ALIGNED_16(uint32_t sqr_tab[4]);
1623
1624
LOAD_ZERO;
1625
vec_u32_t sqr_v = zero_u32v;
1626
vec_u32_t sum_v = zero_u32v;
1627
1628
static const vec_u8_t perm_tab[] =
1629
{
1630
CV(0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, /* pix=mod16, i_stride=mod16 */
1631
0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17),
1632
CV(0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, /* pix=mod8, i_stride=mod16 */
1633
0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F),
1634
};
1635
vec_u8_t perm = perm_tab[ ((uintptr_t)pix & 8) >> 3 ];
1636
1637
for( int y = 0; y < 4; y++ )
1638
{
1639
vec_u8_t pix0_v = vec_ld(0, pix);
1640
vec_u8_t pix1_v = vec_ld(i_stride, pix);
1641
vec_u8_t pix_v = vec_perm(pix0_v, pix1_v, perm);
1642
sum_v = vec_sum4s(pix_v, sum_v);
1643
sqr_v = vec_msum(pix_v, pix_v, sqr_v);
1644
1645
pix += i_stride<<1;
1646
}
1647
sum_v = (vec_u32_t)vec_sums( (vec_s32_t)sum_v, zero_s32v );
1648
sqr_v = (vec_u32_t)vec_sums( (vec_s32_t)sqr_v, zero_s32v );
1649
vec_ste(sum_v, 12, sum_tab);
1650
vec_ste(sqr_v, 12, sqr_tab);
1651
1652
uint32_t sum = sum_tab[3];
1653
uint32_t sqr = sqr_tab[3];
1654
return sum + ((uint64_t)sqr<<32);
1655
}
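
/* Note on the variance helpers above: they return the block's pixel sum in
 * the low 32 bits and the sum of squared pixels in the high 32 bits.  A
 * caller wanting the actual (block-scaled) variance would, roughly, unpack
 * them like the sketch below (illustrative only; variable names are made up
 * and this is not code from this file):
 *
 *     uint64_t res = x264_pixel_var_8x8_altivec( pix, stride );
 *     uint32_t sum = (uint32_t)res;
 *     uint32_t ssd = (uint32_t)(res >> 32);
 *     uint32_t var = ssd - (uint32_t)((uint64_t)sum * sum >> 6); // 6 = log2(8*8)
 */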


/**********************************************************************
 * SA8D routines: sum of 8x8 Hadamard transformed differences
 **********************************************************************/
/* SA8D_1D unrolled by 8 in Altivec */
#define SA8D_1D_ALTIVEC( sa8d0v, sa8d1v, sa8d2v, sa8d3v,  \
                         sa8d4v, sa8d5v, sa8d6v, sa8d7v ) \
{                                            \
    /* int a0 = SRC(0) + SRC(4) */           \
    vec_s16_t a0v = vec_add(sa8d0v, sa8d4v); \
    /* int a4 = SRC(0) - SRC(4) */           \
    vec_s16_t a4v = vec_sub(sa8d0v, sa8d4v); \
    /* int a1 = SRC(1) + SRC(5) */           \
    vec_s16_t a1v = vec_add(sa8d1v, sa8d5v); \
    /* int a5 = SRC(1) - SRC(5) */           \
    vec_s16_t a5v = vec_sub(sa8d1v, sa8d5v); \
    /* int a2 = SRC(2) + SRC(6) */           \
    vec_s16_t a2v = vec_add(sa8d2v, sa8d6v); \
    /* int a6 = SRC(2) - SRC(6) */           \
    vec_s16_t a6v = vec_sub(sa8d2v, sa8d6v); \
    /* int a3 = SRC(3) + SRC(7) */           \
    vec_s16_t a3v = vec_add(sa8d3v, sa8d7v); \
    /* int a7 = SRC(3) - SRC(7) */           \
    vec_s16_t a7v = vec_sub(sa8d3v, sa8d7v); \
                                             \
    /* int b0 = a0 + a2 */                   \
    vec_s16_t b0v = vec_add(a0v, a2v);       \
    /* int b2 = a0 - a2 */                   \
    vec_s16_t b2v = vec_sub(a0v, a2v);       \
    /* int b1 = a1 + a3 */                   \
    vec_s16_t b1v = vec_add(a1v, a3v);       \
    /* int b3 = a1 - a3 */                   \
    vec_s16_t b3v = vec_sub(a1v, a3v);       \
    /* int b4 = a4 + a6 */                   \
    vec_s16_t b4v = vec_add(a4v, a6v);       \
    /* int b6 = a4 - a6 */                   \
    vec_s16_t b6v = vec_sub(a4v, a6v);       \
    /* int b5 = a5 + a7 */                   \
    vec_s16_t b5v = vec_add(a5v, a7v);       \
    /* int b7 = a5 - a7 */                   \
    vec_s16_t b7v = vec_sub(a5v, a7v);       \
                                             \
    /* DST(0, b0 + b1) */                    \
    sa8d0v = vec_add(b0v, b1v);              \
    /* DST(1, b0 - b1) */                    \
    sa8d1v = vec_sub(b0v, b1v);              \
    /* DST(2, b2 + b3) */                    \
    sa8d2v = vec_add(b2v, b3v);              \
    /* DST(3, b2 - b3) */                    \
    sa8d3v = vec_sub(b2v, b3v);              \
    /* DST(4, b4 + b5) */                    \
    sa8d4v = vec_add(b4v, b5v);              \
    /* DST(5, b4 - b5) */                    \
    sa8d5v = vec_sub(b4v, b5v);              \
    /* DST(6, b6 + b7) */                    \
    sa8d6v = vec_add(b6v, b7v);              \
    /* DST(7, b6 - b7) */                    \
    sa8d7v = vec_sub(b6v, b7v);              \
}
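
/* SA8D_1D_ALTIVEC above is one pass of an 8-point Hadamard butterfly applied
 * to eight vectors at once.  Running it on the eight rows of difference
 * values, transposing, and running it again (as pixel_sa8d_8x8_core_altivec
 * does below) yields the 2-D 8x8 Hadamard transform of the residual, whose
 * summed absolute coefficients give the SA8D score.  Scalar outline
 * (pseudocode sketch only; the helper names here are illustrative):
 *
 *     hadamard8_1d( rows );      // per-row transform
 *     transpose8x8( rows );
 *     hadamard8_1d( rows );      // per-column transform
 *     for each coefficient c: satd += abs(c);
 */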

static int pixel_sa8d_8x8_core_altivec( uint8_t *pix1, intptr_t i_pix1,
                                        uint8_t *pix2, intptr_t i_pix2 )
{
    int32_t i_satd = 0;

    PREP_DIFF;
    PREP_LOAD_SRC( pix1 );
    PREP_LOAD_SRC( pix2 );

    vec_s16_t diff0v, diff1v, diff2v, diff3v, diff4v, diff5v, diff6v, diff7v;

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff0v, pix2 );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff1v, pix2 );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff2v, pix2 );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff3v, pix2 );

    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff4v, pix2 );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff5v, pix2 );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff6v, pix2 );
    VEC_DIFF_H( pix1, i_pix1, pix2, i_pix2, 8, diff7v, pix2 );

    vec_s16_t sa8d0v, sa8d1v, sa8d2v, sa8d3v, sa8d4v, sa8d5v, sa8d6v, sa8d7v;

    SA8D_1D_ALTIVEC(diff0v, diff1v, diff2v, diff3v,
                    diff4v, diff5v, diff6v, diff7v);

    VEC_TRANSPOSE_8(diff0v, diff1v, diff2v, diff3v,
                    diff4v, diff5v, diff6v, diff7v,
                    sa8d0v, sa8d1v, sa8d2v, sa8d3v,
                    sa8d4v, sa8d5v, sa8d6v, sa8d7v );

    SA8D_1D_ALTIVEC(sa8d0v, sa8d1v, sa8d2v, sa8d3v,
                    sa8d4v, sa8d5v, sa8d6v, sa8d7v );

    /* accumulate the absolute value of every element of the resulting block */
    vec_s16_t abs0v = VEC_ABS(sa8d0v);
    vec_s16_t abs1v = VEC_ABS(sa8d1v);
    vec_s16_t sum01v = vec_add(abs0v, abs1v);

    vec_s16_t abs2v = VEC_ABS(sa8d2v);
    vec_s16_t abs3v = VEC_ABS(sa8d3v);
    vec_s16_t sum23v = vec_add(abs2v, abs3v);

    vec_s16_t abs4v = VEC_ABS(sa8d4v);
    vec_s16_t abs5v = VEC_ABS(sa8d5v);
    vec_s16_t sum45v = vec_add(abs4v, abs5v);

    vec_s16_t abs6v = VEC_ABS(sa8d6v);
    vec_s16_t abs7v = VEC_ABS(sa8d7v);
    vec_s16_t sum67v = vec_add(abs6v, abs7v);

    vec_s16_t sum0123v = vec_add(sum01v, sum23v);
    vec_s16_t sum4567v = vec_add(sum45v, sum67v);

    vec_s32_t sumblocv;

    sumblocv = vec_sum4s(sum0123v, (vec_s32_t)zerov );
    sumblocv = vec_sum4s(sum4567v, sumblocv );

    sumblocv = vec_sums(sumblocv, (vec_s32_t)zerov );

    sumblocv = vec_splat(sumblocv, 3);

    vec_ste(sumblocv, 0, &i_satd);

    return i_satd;
}

static int pixel_sa8d_8x8_altivec( uint8_t *pix1, intptr_t i_pix1,
                                   uint8_t *pix2, intptr_t i_pix2 )
{
    int32_t i_satd;
    i_satd = (pixel_sa8d_8x8_core_altivec( pix1, i_pix1, pix2, i_pix2 ) + 2) >> 2;
    return i_satd;
}

static int pixel_sa8d_16x16_altivec( uint8_t *pix1, intptr_t i_pix1,
                                     uint8_t *pix2, intptr_t i_pix2 )
{
    int32_t i_satd;

    i_satd = (pixel_sa8d_8x8_core_altivec( &pix1[0],          i_pix1, &pix2[0],          i_pix2 )
            + pixel_sa8d_8x8_core_altivec( &pix1[8],          i_pix1, &pix2[8],          i_pix2 )
            + pixel_sa8d_8x8_core_altivec( &pix1[8*i_pix1],   i_pix1, &pix2[8*i_pix2],   i_pix2 )
            + pixel_sa8d_8x8_core_altivec( &pix1[8*i_pix1+8], i_pix1, &pix2[8*i_pix2+8], i_pix2 ) + 2) >> 2;
    return i_satd;
}
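
/* The (x + 2) >> 2 in the two wrappers above is a rounded division by 4,
 * scaling the raw Hadamard coefficient sum back down, which appears to follow
 * the same rounding as the C reference.  In the 16x16 wrapper the four 8x8
 * cores are summed before the single rounding step, i.e. roughly:
 *
 *     sa8d_16x16 = ( core(tl) + core(tr) + core(bl) + core(br) + 2 ) >> 2;
 *
 * (Reading of the code above; tl/tr/bl/br just name the four 8x8 quadrants.)
 */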

#define HADAMARD4_ALTIVEC(d0,d1,d2,d3,s0,s1,s2,s3) {\
    vec_s16_t t0 = vec_add(s0, s1); \
    vec_s16_t t1 = vec_sub(s0, s1); \
    vec_s16_t t2 = vec_add(s2, s3); \
    vec_s16_t t3 = vec_sub(s2, s3); \
    d0 = vec_add(t0, t2);           \
    d2 = vec_sub(t0, t2);           \
    d1 = vec_add(t1, t3);           \
    d3 = vec_sub(t1, t3);           \
}

#define VEC_LOAD_HIGH( p, num )                                                \
    vec_u8_t pix8_##num = vec_ld( stride*num, p );                             \
    vec_s16_t pix16_s##num = (vec_s16_t)vec_perm(pix8_##num, zero_u8v, perm);  \
    vec_s16_t pix16_d##num;

static uint64_t pixel_hadamard_ac_altivec( uint8_t *pix, intptr_t stride, const vec_u8_t perm )
{
    ALIGNED_16( int32_t sum4_tab[4] );
    ALIGNED_16( int32_t sum8_tab[4] );
    LOAD_ZERO;

    VEC_LOAD_HIGH( pix, 0 );
    VEC_LOAD_HIGH( pix, 1 );
    VEC_LOAD_HIGH( pix, 2 );
    VEC_LOAD_HIGH( pix, 3 );
    HADAMARD4_ALTIVEC(pix16_d0,pix16_d1,pix16_d2,pix16_d3,
                      pix16_s0,pix16_s1,pix16_s2,pix16_s3);

    VEC_LOAD_HIGH( pix, 4 );
    VEC_LOAD_HIGH( pix, 5 );
    VEC_LOAD_HIGH( pix, 6 );
    VEC_LOAD_HIGH( pix, 7 );
    HADAMARD4_ALTIVEC(pix16_d4,pix16_d5,pix16_d6,pix16_d7,
                      pix16_s4,pix16_s5,pix16_s6,pix16_s7);

    VEC_TRANSPOSE_8(pix16_d0, pix16_d1, pix16_d2, pix16_d3,
                    pix16_d4, pix16_d5, pix16_d6, pix16_d7,
                    pix16_s0, pix16_s1, pix16_s2, pix16_s3,
                    pix16_s4, pix16_s5, pix16_s6, pix16_s7);

    HADAMARD4_ALTIVEC(pix16_d0,pix16_d1,pix16_d2,pix16_d3,
                      pix16_s0,pix16_s1,pix16_s2,pix16_s3);

    HADAMARD4_ALTIVEC(pix16_d4,pix16_d5,pix16_d6,pix16_d7,
                      pix16_s4,pix16_s5,pix16_s6,pix16_s7);

    vec_u16_t addabs01 = vec_add( VEC_ABSOLUTE(pix16_d0), VEC_ABSOLUTE(pix16_d1) );
    vec_u16_t addabs23 = vec_add( VEC_ABSOLUTE(pix16_d2), VEC_ABSOLUTE(pix16_d3) );
    vec_u16_t addabs45 = vec_add( VEC_ABSOLUTE(pix16_d4), VEC_ABSOLUTE(pix16_d5) );
    vec_u16_t addabs67 = vec_add( VEC_ABSOLUTE(pix16_d6), VEC_ABSOLUTE(pix16_d7) );

    vec_u16_t sum4_v = vec_add(vec_add(addabs01, addabs23), vec_add(addabs45, addabs67));
    vec_ste(vec_sums(vec_sum4s((vec_s16_t)sum4_v, zero_s32v), zero_s32v), 12, sum4_tab);

    vec_s16_t tmpi0 = vec_add(pix16_d0, pix16_d4);
    vec_s16_t tmpi4 = vec_sub(pix16_d0, pix16_d4);
    vec_s16_t tmpi1 = vec_add(pix16_d1, pix16_d5);
    vec_s16_t tmpi5 = vec_sub(pix16_d1, pix16_d5);
    vec_s16_t tmpi2 = vec_add(pix16_d2, pix16_d6);
    vec_s16_t tmpi6 = vec_sub(pix16_d2, pix16_d6);
    vec_s16_t tmpi3 = vec_add(pix16_d3, pix16_d7);
    vec_s16_t tmpi7 = vec_sub(pix16_d3, pix16_d7);

    int sum4 = sum4_tab[3];

    VEC_TRANSPOSE_8(tmpi0, tmpi1, tmpi2, tmpi3,
                    tmpi4, tmpi5, tmpi6, tmpi7,
                    pix16_d0, pix16_d1, pix16_d2, pix16_d3,
                    pix16_d4, pix16_d5, pix16_d6, pix16_d7);

    vec_u16_t addsum04 = vec_add( VEC_ABSOLUTE( vec_add(pix16_d0, pix16_d4) ),
                                  VEC_ABSOLUTE( vec_sub(pix16_d0, pix16_d4) ) );
    vec_u16_t addsum15 = vec_add( VEC_ABSOLUTE( vec_add(pix16_d1, pix16_d5) ),
                                  VEC_ABSOLUTE( vec_sub(pix16_d1, pix16_d5) ) );
    vec_u16_t addsum26 = vec_add( VEC_ABSOLUTE( vec_add(pix16_d2, pix16_d6) ),
                                  VEC_ABSOLUTE( vec_sub(pix16_d2, pix16_d6) ) );
    vec_u16_t addsum37 = vec_add( VEC_ABSOLUTE( vec_add(pix16_d3, pix16_d7) ),
                                  VEC_ABSOLUTE( vec_sub(pix16_d3, pix16_d7) ) );

    vec_u16_t sum8_v = vec_add( vec_add(addsum04, addsum15), vec_add(addsum26, addsum37) );
    vec_ste(vec_sums(vec_sum4s((vec_s16_t)sum8_v, zero_s32v), zero_s32v), 12, sum8_tab);

    int sum8 = sum8_tab[3];

    ALIGNED_16( int16_t tmp0_4_tab[8] );
    vec_ste(vec_add(pix16_d0, pix16_d4), 0, tmp0_4_tab);

    sum4 -= tmp0_4_tab[0];
    sum8 -= tmp0_4_tab[0];
    return ((uint64_t)sum8 << 32) + sum4;
}
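
/* pixel_hadamard_ac_altivec above returns two partial scores packed into one
 * 64-bit value: the low half is the summed absolute 4x4 Hadamard coefficients
 * of the 8x8 block (sum4) and the high half extends the transform to the full
 * 8x8 (sum8).  The DC term captured in tmp0_4_tab[0] is subtracted from both,
 * since the "AC" scores by definition exclude the DC contribution.  The
 * per-size wrappers below then apply the final normalisation.  (Summary of
 * the code, kept here as a reading aid; the C reference lives in
 * common/pixel.c.)
 */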


static const vec_u8_t hadamard_permtab[] =
{
    CV(0x10,0x00,0x11,0x01, 0x12,0x02,0x13,0x03,  /* pix = mod16 */
       0x14,0x04,0x15,0x05, 0x16,0x06,0x17,0x07 ),
    CV(0x18,0x08,0x19,0x09, 0x1A,0x0A,0x1B,0x0B,  /* pix = mod8 */
       0x1C,0x0C,0x1D,0x0D, 0x1E,0x0E,0x1F,0x0F )
};
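
/* The two permutation patterns above feed VEC_LOAD_HIGH: indices >= 0x10
 * select bytes from the zero vector, so each pattern interleaves a zero byte
 * with a pixel byte and thereby widens 8 unsigned pixels into 8 signed 16-bit
 * lanes (big-endian).  The first pattern takes bytes 0-7 of the loaded
 * quadword (16-byte-aligned rows), the second takes bytes 8-15 (rows starting
 * at an 8-byte offset).  The wrappers below pick the right pattern from the
 * low bits of the pix pointer.  (Descriptive note added for readability.)
 */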

static uint64_t x264_pixel_hadamard_ac_16x16_altivec( uint8_t *pix, intptr_t stride )
{
    int idx = ((uintptr_t)pix & 8) >> 3;
    vec_u8_t permh = hadamard_permtab[idx];
    vec_u8_t perml = hadamard_permtab[!idx];
    uint64_t sum = pixel_hadamard_ac_altivec( pix, stride, permh );
    sum += pixel_hadamard_ac_altivec( pix+8, stride, perml );
    sum += pixel_hadamard_ac_altivec( pix+8*stride, stride, permh );
    sum += pixel_hadamard_ac_altivec( pix+8*stride+8, stride, perml );
    return ((sum>>34)<<32) + ((uint32_t)sum>>1);
}

static uint64_t x264_pixel_hadamard_ac_16x8_altivec( uint8_t *pix, intptr_t stride )
{
    int idx = ((uintptr_t)pix & 8) >> 3;
    vec_u8_t permh = hadamard_permtab[idx];
    vec_u8_t perml = hadamard_permtab[!idx];
    uint64_t sum = pixel_hadamard_ac_altivec( pix, stride, permh );
    sum += pixel_hadamard_ac_altivec( pix+8, stride, perml );
    return ((sum>>34)<<32) + ((uint32_t)sum>>1);
}

static uint64_t x264_pixel_hadamard_ac_8x16_altivec( uint8_t *pix, intptr_t stride )
{
    vec_u8_t perm = hadamard_permtab[ (((uintptr_t)pix & 8) >> 3) ];
    uint64_t sum = pixel_hadamard_ac_altivec( pix, stride, perm );
    sum += pixel_hadamard_ac_altivec( pix+8*stride, stride, perm );
    return ((sum>>34)<<32) + ((uint32_t)sum>>1);
}

static uint64_t x264_pixel_hadamard_ac_8x8_altivec( uint8_t *pix, intptr_t stride )
{
    vec_u8_t perm = hadamard_permtab[ (((uintptr_t)pix & 8) >> 3) ];
    uint64_t sum = pixel_hadamard_ac_altivec( pix, stride, perm );
    return ((sum>>34)<<32) + ((uint32_t)sum>>1);
}
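
/* The return expression shared by the four wrappers above unpacks and
 * rescales the combined (sum8 << 32) + sum4 accumulator: sum >> 34 is the
 * 8x8 half divided by 4, and (uint32_t)sum >> 1 is the 4x4 half divided by 2,
 * re-packed in the same high/low layout.  Roughly:
 *
 *     uint32_t ac4 = (uint32_t)sum;        // accumulated 4x4 scores
 *     uint32_t ac8 = (uint32_t)(sum>>32);  // accumulated 8x8 scores
 *     return ((uint64_t)(ac8>>2) << 32) + (ac4>>1);
 *
 * (Equivalent rewrite for readability, assuming the accumulated 4x4 sums
 * never carry into bit 32.)
 */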


/****************************************************************************
 * structural similarity metric
 ****************************************************************************/
static void ssim_4x4x2_core_altivec( const uint8_t *pix1, intptr_t stride1,
                                     const uint8_t *pix2, intptr_t stride2,
                                     int sums[2][4] )
{
    ALIGNED_16( int temp[4] );

    vec_u8_t pix1v, pix2v;
    vec_u32_t s1v, s2v, ssv, s12v;
    PREP_LOAD;
    PREP_LOAD_SRC( pix1 );
    PREP_LOAD_SRC( pix2 );
    LOAD_ZERO;

    s1v = s2v = ssv = s12v = zero_u32v;

    for( int y = 0; y < 4; y++ )
    {
        VEC_LOAD( &pix1[y*stride1], pix1v, 16, vec_u8_t, pix1 );
        VEC_LOAD( &pix2[y*stride2], pix2v, 16, vec_u8_t, pix2 );

        s1v  = vec_sum4s( pix1v, s1v );
        s2v  = vec_sum4s( pix2v, s2v );
        ssv  = vec_msum( pix1v, pix1v, ssv );
        ssv  = vec_msum( pix2v, pix2v, ssv );
        s12v = vec_msum( pix1v, pix2v, s12v );
    }

    vec_st( (vec_s32_t)s1v, 0, temp );
    sums[0][0] = temp[0];
    sums[1][0] = temp[1];
    vec_st( (vec_s32_t)s2v, 0, temp );
    sums[0][1] = temp[0];
    sums[1][1] = temp[1];
    vec_st( (vec_s32_t)ssv, 0, temp );
    sums[0][2] = temp[0];
    sums[1][2] = temp[1];
    vec_st( (vec_s32_t)s12v, 0, temp );
    sums[0][3] = temp[0];
    sums[1][3] = temp[1];
}
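
/* ssim_4x4x2_core_altivec above gathers, for two neighbouring 4x4 blocks at
 * once, the four statistics the SSIM formula needs: s1 = sum of pix1,
 * s2 = sum of pix2, ss = sum of both squares, s12 = sum of cross products.
 * The caller combines them into the usual SSIM expression, along the lines of
 *
 *     ssim = (2*s1*s2 + C1) * (2*covar + C2)
 *          / ((s1*s1 + s2*s2 + C1) * (var1 + var2 + C2))
 *
 * where the variances and covariance are derived from ss and s12 and C1/C2
 * are the standard stabilising constants.  (Sketch of the downstream use,
 * not of code in this file; the exact arithmetic lives in the C SSIM helpers
 * in common/pixel.c.)
 */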

#define SATD_X( size ) \
static void pixel_satd_x3_##size##_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2,\
                                            intptr_t i_stride, int scores[3] )\
{\
    scores[0] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix0, i_stride );\
    scores[1] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix1, i_stride );\
    scores[2] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix2, i_stride );\
}\
static void pixel_satd_x4_##size##_altivec( uint8_t *fenc, uint8_t *pix0, uint8_t *pix1, uint8_t *pix2,\
                                            uint8_t *pix3, intptr_t i_stride, int scores[4] )\
{\
    scores[0] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix0, i_stride );\
    scores[1] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix1, i_stride );\
    scores[2] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix2, i_stride );\
    scores[3] = pixel_satd_##size##_altivec( fenc, FENC_STRIDE, pix3, i_stride );\
}
SATD_X( 16x16 )\
SATD_X( 16x8 )\
SATD_X( 8x16 )\
SATD_X( 8x8 )\
SATD_X( 8x4 )\
SATD_X( 4x8 )\
SATD_X( 4x4 )


#define INTRA_MBCMP_8x8( mbcmp )\
void intra_##mbcmp##_x3_8x8_altivec( uint8_t *fenc, uint8_t edge[36], int res[3] )\
{\
    ALIGNED_8( uint8_t pix[8*FDEC_STRIDE] );\
    x264_predict_8x8_v_c( pix, edge );\
    res[0] = pixel_##mbcmp##_8x8_altivec( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
    x264_predict_8x8_h_c( pix, edge );\
    res[1] = pixel_##mbcmp##_8x8_altivec( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
    x264_predict_8x8_dc_c( pix, edge );\
    res[2] = pixel_##mbcmp##_8x8_altivec( pix, FDEC_STRIDE, fenc, FENC_STRIDE );\
}

INTRA_MBCMP_8x8(sad)
INTRA_MBCMP_8x8(sa8d)

#define INTRA_MBCMP( mbcmp, size, pred1, pred2, pred3, chroma )\
void intra_##mbcmp##_x3_##size##x##size##chroma##_altivec( uint8_t *fenc, uint8_t *fdec, int res[3] )\
{\
    x264_predict_##size##x##size##chroma##_##pred1##_c( fdec );\
    res[0] = pixel_##mbcmp##_##size##x##size##_altivec( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
    x264_predict_##size##x##size##chroma##_##pred2##_c( fdec );\
    res[1] = pixel_##mbcmp##_##size##x##size##_altivec( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
    x264_predict_##size##x##size##chroma##_##pred3##_c( fdec );\
    res[2] = pixel_##mbcmp##_##size##x##size##_altivec( fdec, FDEC_STRIDE, fenc, FENC_STRIDE );\
}

INTRA_MBCMP(satd, 4, v, h, dc, )
INTRA_MBCMP(sad, 8, dc, h, v, c )
INTRA_MBCMP(satd, 8, dc, h, v, c )
INTRA_MBCMP(sad, 16, v, h, dc, )
INTRA_MBCMP(satd, 16, v, h, dc, )
#endif // !HIGH_BIT_DEPTH
2046
2047
/****************************************************************************
2048
* x264_pixel_init:
2049
****************************************************************************/
2050
void x264_pixel_altivec_init( x264_pixel_function_t *pixf )
2051
{
2052
#if !HIGH_BIT_DEPTH
2053
pixf->sad[PIXEL_16x16] = pixel_sad_16x16_altivec;
2054
pixf->sad[PIXEL_8x16] = pixel_sad_8x16_altivec;
2055
pixf->sad[PIXEL_16x8] = pixel_sad_16x8_altivec;
2056
pixf->sad[PIXEL_8x8] = pixel_sad_8x8_altivec;
2057
2058
pixf->sad_x3[PIXEL_16x16] = pixel_sad_x3_16x16_altivec;
2059
pixf->sad_x3[PIXEL_8x16] = pixel_sad_x3_8x16_altivec;
2060
pixf->sad_x3[PIXEL_16x8] = pixel_sad_x3_16x8_altivec;
2061
pixf->sad_x3[PIXEL_8x8] = pixel_sad_x3_8x8_altivec;
2062
2063
pixf->sad_x4[PIXEL_16x16] = pixel_sad_x4_16x16_altivec;
2064
pixf->sad_x4[PIXEL_8x16] = pixel_sad_x4_8x16_altivec;
2065
pixf->sad_x4[PIXEL_16x8] = pixel_sad_x4_16x8_altivec;
2066
pixf->sad_x4[PIXEL_8x8] = pixel_sad_x4_8x8_altivec;
2067
2068
pixf->satd[PIXEL_16x16] = pixel_satd_16x16_altivec;
2069
pixf->satd[PIXEL_8x16] = pixel_satd_8x16_altivec;
2070
pixf->satd[PIXEL_16x8] = pixel_satd_16x8_altivec;
2071
pixf->satd[PIXEL_8x8] = pixel_satd_8x8_altivec;
2072
pixf->satd[PIXEL_8x4] = pixel_satd_8x4_altivec;
2073
pixf->satd[PIXEL_4x8] = pixel_satd_4x8_altivec;
2074
pixf->satd[PIXEL_4x4] = pixel_satd_4x4_altivec;
2075
2076
pixf->satd_x3[PIXEL_16x16] = pixel_satd_x3_16x16_altivec;
2077
pixf->satd_x3[PIXEL_8x16] = pixel_satd_x3_8x16_altivec;
2078
pixf->satd_x3[PIXEL_16x8] = pixel_satd_x3_16x8_altivec;
2079
pixf->satd_x3[PIXEL_8x8] = pixel_satd_x3_8x8_altivec;
2080
pixf->satd_x3[PIXEL_8x4] = pixel_satd_x3_8x4_altivec;
2081
pixf->satd_x3[PIXEL_4x8] = pixel_satd_x3_4x8_altivec;
2082
pixf->satd_x3[PIXEL_4x4] = pixel_satd_x3_4x4_altivec;
2083
2084
pixf->satd_x4[PIXEL_16x16] = pixel_satd_x4_16x16_altivec;
2085
pixf->satd_x4[PIXEL_8x16] = pixel_satd_x4_8x16_altivec;
2086
pixf->satd_x4[PIXEL_16x8] = pixel_satd_x4_16x8_altivec;
2087
pixf->satd_x4[PIXEL_8x8] = pixel_satd_x4_8x8_altivec;
2088
pixf->satd_x4[PIXEL_8x4] = pixel_satd_x4_8x4_altivec;
2089
pixf->satd_x4[PIXEL_4x8] = pixel_satd_x4_4x8_altivec;
2090
pixf->satd_x4[PIXEL_4x4] = pixel_satd_x4_4x4_altivec;
2091
2092
pixf->intra_sad_x3_8x8 = intra_sad_x3_8x8_altivec;
2093
pixf->intra_sad_x3_8x8c = intra_sad_x3_8x8c_altivec;
2094
pixf->intra_sad_x3_16x16 = intra_sad_x3_16x16_altivec;
2095
2096
pixf->intra_satd_x3_4x4 = intra_satd_x3_4x4_altivec;
2097
pixf->intra_satd_x3_8x8c = intra_satd_x3_8x8c_altivec;
2098
pixf->intra_satd_x3_16x16 = intra_satd_x3_16x16_altivec;
2099
2100
pixf->ssd[PIXEL_16x16] = pixel_ssd_16x16_altivec;
2101
pixf->ssd[PIXEL_8x8] = pixel_ssd_8x8_altivec;
2102
2103
pixf->sa8d[PIXEL_16x16] = pixel_sa8d_16x16_altivec;
2104
pixf->sa8d[PIXEL_8x8] = pixel_sa8d_8x8_altivec;
2105
2106
pixf->intra_sa8d_x3_8x8 = intra_sa8d_x3_8x8_altivec;
2107
2108
pixf->var[PIXEL_16x16] = x264_pixel_var_16x16_altivec;
2109
pixf->var[PIXEL_8x8] = x264_pixel_var_8x8_altivec;
2110
2111
pixf->hadamard_ac[PIXEL_16x16] = x264_pixel_hadamard_ac_16x16_altivec;
2112
pixf->hadamard_ac[PIXEL_16x8] = x264_pixel_hadamard_ac_16x8_altivec;
2113
pixf->hadamard_ac[PIXEL_8x16] = x264_pixel_hadamard_ac_8x16_altivec;
2114
pixf->hadamard_ac[PIXEL_8x8] = x264_pixel_hadamard_ac_8x8_altivec;
2115
2116
pixf->ssim_4x4x2_core = ssim_4x4x2_core_altivec;
2117
#endif // !HIGH_BIT_DEPTH
2118
}
2119
2120