Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Download
52868 views
1
/*****************************************************************************
2
* mc.c: ppc motion compensation
3
*****************************************************************************
4
* Copyright (C) 2003-2016 x264 project
5
*
6
* Authors: Eric Petit <[email protected]>
7
* Guillaume Poirier <[email protected]>
8
*
9
* This program is free software; you can redistribute it and/or modify
10
* it under the terms of the GNU General Public License as published by
11
* the Free Software Foundation; either version 2 of the License, or
12
* (at your option) any later version.
13
*
14
* This program is distributed in the hope that it will be useful,
15
* but WITHOUT ANY WARRANTY; without even the implied warranty of
16
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17
* GNU General Public License for more details.
18
*
19
* You should have received a copy of the GNU General Public License
20
* along with this program; if not, write to the Free Software
21
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
22
*
23
* This program is also available under a commercial proprietary license.
24
* For more information, contact us at [email protected].
25
*****************************************************************************/
26
27
#include <stdlib.h>
28
#include <stdio.h>
29
#include <string.h>
30
#include <stdint.h>
31
#include <stdarg.h>
32
33
#include "x264.h"
34
#include "common/common.h"
35
#include "common/mc.h"
36
#include "mc.h"
37
#include "ppccommon.h"
38
39
#if !HIGH_BIT_DEPTH
40
typedef void (*pf_mc_t)( uint8_t *src, intptr_t i_src,
41
uint8_t *dst, intptr_t i_dst, int i_height );
42
43
/* 6-tap H.264 half-pel filter [1, -5, 20, 20, -5, 1] applied around *pix;
 * i_pix_next is the distance between adjacent taps (1 for horizontal,
 * the stride for vertical filtering). Result is the unnormalized sum. */
static inline int x264_tapfilter( uint8_t *pix, int i_pix_next )
{
    const int step = i_pix_next;
    int sum = 20 * (pix[0] + pix[step]);        /* centre taps */
    sum -= 5 * (pix[-1*step] + pix[2*step]);    /* inner negative taps */
    sum += pix[-2*step] + pix[3*step];          /* outer taps */
    return sum;
}
49
50
/* Horizontal specialization of the 6-tap filter [1, -5, 20, 20, -5, 1]
 * with a fixed tap distance of one pixel. Result is unnormalized. */
static inline int x264_tapfilter1( uint8_t *pix )
{
    int sum = 20 * (pix[0] + pix[1]);
    sum -= 5 * (pix[-1] + pix[2]);
    sum += pix[-2] + pix[3];
    return sum;
}
55
56
/* Rounded per-pixel average of two 4-pixel-wide blocks:
 * dst = (src1 + src2 + 1) >> 1, row by row. Note both sources advance
 * by the same stride i_src1 (scalar fallback; width 4 is too narrow to
 * benefit from AltiVec). */
static inline void x264_pixel_avg2_w4_altivec( uint8_t *dst, intptr_t i_dst,
                                               uint8_t *src1, intptr_t i_src1,
                                               uint8_t *src2, int i_height )
{
    for( int row = i_height; row > 0; row-- )
    {
        dst[0] = ( src1[0] + src2[0] + 1 ) >> 1;
        dst[1] = ( src1[1] + src2[1] + 1 ) >> 1;
        dst[2] = ( src1[2] + src2[2] + 1 ) >> 1;
        dst[3] = ( src1[3] + src2[3] + 1 ) >> 1;
        dst  += i_dst;
        src1 += i_src1;
        src2 += i_src1;
    }
}
69
70
/* Rounded per-byte average of two 8-pixel-wide blocks using vec_avg
 * ((a+b+1)>>1 per element). Both sources advance by i_src1. The
 * PREP_* / VEC_* helpers come from ppccommon.h and handle unaligned
 * loads and the 8-byte store. */
static inline void x264_pixel_avg2_w8_altivec( uint8_t *dst, intptr_t i_dst,
                                               uint8_t *src1, intptr_t i_src1,
                                               uint8_t *src2, int i_height )
{
    vec_u8_t src1v, src2v;
    PREP_LOAD;
    PREP_STORE8;
    PREP_LOAD_SRC( src1 );
    PREP_LOAD_SRC( src2 );

    for( int y = 0; y < i_height; y++ )
    {
        VEC_LOAD( src1, src1v, 8, vec_u8_t, src1 );
        VEC_LOAD( src2, src2v, 8, vec_u8_t, src2 );
        src1v = vec_avg( src1v, src2v ); /* per-byte rounded average */
        VEC_STORE8( src1v, dst );

        dst += i_dst;
        src1 += i_src1;
        src2 += i_src1;
    }
}
92
93
/* Rounded per-byte average of two 16-pixel-wide blocks. Sources are
 * loaded unaligned via VEC_LOAD; the result is written with vec_st,
 * which truncates the address to a 16-byte boundary, so dst is assumed
 * 16-byte aligned. */
static inline void x264_pixel_avg2_w16_altivec( uint8_t *dst, intptr_t i_dst,
                                                uint8_t *src1, intptr_t i_src1,
                                                uint8_t *src2, int i_height )
{
    vec_u8_t src1v, src2v;
    PREP_LOAD;
    PREP_LOAD_SRC( src1 );
    PREP_LOAD_SRC( src2 );

    for( int y = 0; y < i_height; y++ )
    {
        VEC_LOAD( src1, src1v, 16, vec_u8_t, src1 );
        VEC_LOAD( src2, src2v, 16, vec_u8_t, src2 );
        src1v = vec_avg( src1v, src2v ); /* per-byte (a+b+1)>>1 */
        vec_st(src1v, 0, dst);

        dst += i_dst;
        src1 += i_src1;
        src2 += i_src1;
    }
}
114
115
/* Width-20 average, decomposed into two independent column groups: a
 * vectorized 16-wide pass plus a scalar 4-wide pass for the remaining
 * columns. The two calls touch disjoint columns, so their order does
 * not matter. */
static inline void x264_pixel_avg2_w20_altivec( uint8_t *dst, intptr_t i_dst,
                                                uint8_t *src1, intptr_t i_src1,
                                                uint8_t *src2, int i_height )
{
    x264_pixel_avg2_w4_altivec( dst+16, i_dst, src1+16, i_src1, src2+16, i_height );
    x264_pixel_avg2_w16_altivec( dst, i_dst, src1, i_src1, src2, i_height );
}
122
123
/* mc_copy: plain c */

/* Define a scalar block-copy function `name` that copies an `a`-byte-wide
 * column of `i_height` rows, one memcpy per row. */
#define MC_COPY( name, a )                                     \
static void name( uint8_t *dst, intptr_t i_dst,                \
                  uint8_t *src, intptr_t i_src, int i_height ) \
{                                                              \
    int y;                                                     \
    for( y = 0; y < i_height; y++ )                            \
    {                                                          \
        memcpy( dst, src, a );                                 \
        src += i_src;                                          \
        dst += i_dst;                                          \
    }                                                          \
}
MC_COPY( x264_mc_copy_w4_altivec, 4 )
MC_COPY( x264_mc_copy_w8_altivec, 8 )
139
140
/* Copy a 16-pixel-wide block: unaligned AltiVec load per row, aligned
 * store. vec_st truncates the address to a 16-byte boundary, so dst is
 * assumed 16-byte aligned. */
static void x264_mc_copy_w16_altivec( uint8_t *dst, intptr_t i_dst,
                                      uint8_t *src, intptr_t i_src, int i_height )
{
    vec_u8_t cpyV;
    PREP_LOAD;
    PREP_LOAD_SRC( src );

    for( int y = 0; y < i_height; y++ )
    {
        VEC_LOAD( src, cpyV, 16, vec_u8_t, src );
        vec_st(cpyV, 0, dst);

        src += i_src;
        dst += i_dst;
    }
}
156
157
158
/* Variant of x264_mc_copy_w16_altivec for the case where both src and
 * dst are 16-byte aligned: plain vec_ld/vec_st per row, no permute
 * machinery needed. */
static void x264_mc_copy_w16_aligned_altivec( uint8_t *dst, intptr_t i_dst,
                                              uint8_t *src, intptr_t i_src, int i_height )
{
    for( int y = 0; y < i_height; ++y )
    {
        vec_u8_t cpyV = vec_ld( 0, src );
        vec_st(cpyV, 0, dst);

        src += i_src;
        dst += i_dst;
    }
}
170
171
172
/* Motion-compensate a luma block into dst.
 * src[4] holds the full-pel plane plus the three precomputed half-pel
 * planes; mvx/mvy are in quarter-pel units. Quarter-pel positions are
 * produced by averaging the two nearest half-pel planes (selected via
 * the x264_hpel_ref0/ref1 tables); weighted prediction, if enabled, is
 * then applied in place. */
static void mc_luma_altivec( uint8_t *dst, intptr_t i_dst_stride,
                             uint8_t *src[4], intptr_t i_src_stride,
                             int mvx, int mvy,
                             int i_width, int i_height, const x264_weight_t *weight )
{
    /* Low two bits of each mv component are the quarter-pel fraction. */
    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
    intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
    uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
    if( qpel_idx & 5 ) /* qpel interpolation needed */
    {
        uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);

        switch( i_width )
        {
            case 4:
                x264_pixel_avg2_w4_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
                break;
            case 8:
                x264_pixel_avg2_w8_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
                break;
            case 16:
            default:
                x264_pixel_avg2_w16_altivec( dst, i_dst_stride, src1, i_src_stride, src2, i_height );
        }
        /* Weighting operates in place on the interpolated block. */
        if( weight->weightfn )
            weight->weightfn[i_width>>2]( dst, i_dst_stride, dst, i_dst_stride, weight, i_height );
    }
    else if( weight->weightfn )
        weight->weightfn[i_width>>2]( dst, i_dst_stride, src1, i_src_stride, weight, i_height );
    else
    {
        /* Half/full-pel with no weighting: straight copy from the plane. */
        switch( i_width )
        {
            case 4:
                x264_mc_copy_w4_altivec( dst, i_dst_stride, src1, i_src_stride, i_height );
                break;
            case 8:
                x264_mc_copy_w8_altivec( dst, i_dst_stride, src1, i_src_stride, i_height );
                break;
            case 16:
                x264_mc_copy_w16_altivec( dst, i_dst_stride, src1, i_src_stride, i_height );
                break;
        }
    }
}
217
218
219
220
/* Like mc_luma_altivec, but avoids the copy when possible: if neither
 * quarter-pel interpolation nor weighting is needed, it returns a
 * pointer straight into the reference plane and updates *i_dst_stride
 * to the plane's stride; otherwise it renders into dst and returns dst. */
static uint8_t *get_ref_altivec( uint8_t *dst, intptr_t *i_dst_stride,
                                 uint8_t *src[4], intptr_t i_src_stride,
                                 int mvx, int mvy,
                                 int i_width, int i_height, const x264_weight_t *weight )
{
    int qpel_idx = ((mvy&3)<<2) + (mvx&3);
    intptr_t offset = (mvy>>2)*i_src_stride + (mvx>>2);
    uint8_t *src1 = src[x264_hpel_ref0[qpel_idx]] + offset + ((mvy&3) == 3) * i_src_stride;
    if( qpel_idx & 5 ) /* qpel interpolation needed */
    {
        uint8_t *src2 = src[x264_hpel_ref1[qpel_idx]] + offset + ((mvx&3) == 3);
        /* NOTE: case 12 deliberately falls into the 16-wide kernel
         * (over-reads columns 12-15 of the sources but only the caller's
         * stride region of dst is meaningful); 20 has its own kernel. */
        switch( i_width )
        {
            case 4:
                x264_pixel_avg2_w4_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
                break;
            case 8:
                x264_pixel_avg2_w8_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
                break;
            case 12:
            case 16:
            default:
                x264_pixel_avg2_w16_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
                break;
            case 20:
                x264_pixel_avg2_w20_altivec( dst, *i_dst_stride, src1, i_src_stride, src2, i_height );
                break;
        }
        if( weight->weightfn )
            weight->weightfn[i_width>>2]( dst, *i_dst_stride, dst, *i_dst_stride, weight, i_height );
        return dst;
    }
    else if( weight->weightfn )
    {
        weight->weightfn[i_width>>2]( dst, *i_dst_stride, src1, i_src_stride, weight, i_height );
        return dst;
    }
    else
    {
        /* Zero-copy path: point the caller at the reference plane. */
        *i_dst_stride = i_src_stride;
        return src1;
    }
}
263
264
/* Scalar bilinear 1/8-pel chroma MC for 2-pixel-wide blocks on an
 * interleaved U/V (NV12-style) plane: even byte offsets are U samples,
 * odd offsets are V. mvx/mvy are in 1/8-pel units; the four bilinear
 * weights sum to 64 and the result is rounded with +32 before >>6. */
static void mc_chroma_2xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride,
                           uint8_t *src, intptr_t i_src_stride,
                           int mvx, int mvy, int i_height )
{
    int fx = mvx&0x07;
    int fy = mvy&0x07;

    /* Bilinear weights: top-left, top-right, bottom-left, bottom-right. */
    int w00 = (8-fx)*(8-fy);
    int w01 = fx    *(8-fy);
    int w10 = (8-fx)*fy;
    int w11 = fx    *fy;

    /* Integer part of the mv; *2 because U/V are interleaved. */
    src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2;

    for( int y = 0; y < i_height; y++ )
    {
        uint8_t *below = src + i_src_stride;
        dstu[0] = ( w00*src[0] + w01*src[2] + w10*below[0] + w11*below[2] + 32 ) >> 6;
        dstv[0] = ( w00*src[1] + w01*src[3] + w10*below[1] + w11*below[3] + 32 ) >> 6;
        dstu[1] = ( w00*src[2] + w01*src[4] + w10*below[2] + w11*below[4] + 32 ) >> 6;
        dstv[1] = ( w00*src[3] + w01*src[5] + w10*below[3] + w11*below[5] + 32 ) >> 6;

        src  += i_src_stride;
        dstu += i_dst_stride;
        dstv += i_dst_stride;
    }
}
293
294
/* Endian-neutral vec_sld: on little-endian targets the byte order inside
 * the vector register is reversed, so the same logical left shift is
 * achieved by swapping the operands and shifting by 16-n. */
#ifdef WORDS_BIGENDIAN
#define VSLD(a,b,n) vec_sld(a,b,n)
#else
#define VSLD(a,b,n) vec_sld(b,a,16-n)
#endif
299
300
/* AltiVec bilinear 1/8-pel chroma MC for 4-wide blocks on an interleaved
 * U/V plane. Two rows are produced per loop iteration, and the bottom
 * source row of one step is reused as the top row of the next, so each
 * row is loaded only once. */
static void mc_chroma_altivec_4xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride,
                                   uint8_t *src, intptr_t i_src_stride,
                                   int mvx, int mvy, int i_height )
{
    uint8_t *srcp;
    int d8x = mvx & 0x07;
    int d8y = mvy & 0x07;

    /* The four bilinear weights (they sum to 64). */
    ALIGNED_16( uint16_t coeff[4] );
    coeff[0] = (8-d8x)*(8-d8y);
    coeff[1] = d8x *(8-d8y);
    coeff[2] = (8-d8x)*d8y;
    coeff[3] = d8x *d8y;

    src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2;
    srcp = &src[i_src_stride];

    LOAD_ZERO;
    PREP_LOAD;
    PREP_LOAD_SRC( src );
    vec_u16_t coeff0v, coeff1v, coeff2v, coeff3v;
    vec_u8_t src2v_8, dstuv, dstvv;
    vec_u16_t src0v_16, src1v_16, src2v_16, src3v_16, dstv16;
    vec_u16_t shiftv, k32v;

    /* Byte-pick permutes that deinterleave the packed u16 results:
     * perm0v selects the U bytes, perm1v the V bytes. */
#ifdef WORDS_BIGENDIAN
    static const vec_u8_t perm0v = CV(1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13);
    static const vec_u8_t perm1v = CV(3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15);
#else
    static const vec_u8_t perm0v = CV(0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12);
    static const vec_u8_t perm1v = CV(2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14);
#endif

    /* Splat each coefficient across a full vector. */
    coeff0v = vec_ld( 0, coeff );
    coeff3v = vec_splat( coeff0v, 3 );
    coeff2v = vec_splat( coeff0v, 2 );
    coeff1v = vec_splat( coeff0v, 1 );
    coeff0v = vec_splat( coeff0v, 0 );
    k32v = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) ); /* rounding 32 */
    shiftv = vec_splat_u16( 6 );

    /* Prime the pipeline with the first row; src3 is src2 shifted left
     * by one interleaved sample pair (2 bytes). */
    VEC_LOAD( src, src2v_8, 9, vec_u8_t, src );
    src2v_16 = vec_u8_to_u16( src2v_8 );
    src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) );

    for( int y = 0; y < i_height; y += 2 )
    {
        src0v_16 = src2v_16;
        src1v_16 = src3v_16;
        VEC_LOAD( srcp, src2v_8, 9, vec_u8_t, src );
        src2v_16 = vec_u8_to_u16( src2v_8 );
        src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) );

        /* dst = (c0*tl + c1*tr + c2*bl + c3*br + 32) >> 6 */
        dstv16 = vec_mladd( coeff0v, src0v_16, k32v );
        dstv16 = vec_mladd( coeff1v, src1v_16, dstv16 );
        dstv16 = vec_mladd( coeff2v, src2v_16, dstv16 );
        dstv16 = vec_mladd( coeff3v, src3v_16, dstv16 );

        dstv16 = vec_sr( dstv16, shiftv );

        dstuv = (vec_u8_t)vec_perm( dstv16, dstv16, perm0v );
        dstvv = (vec_u8_t)vec_perm( dstv16, dstv16, perm1v );
        vec_ste( (vec_u32_t)dstuv, 0, (uint32_t*) dstu );
        vec_ste( (vec_u32_t)dstvv, 0, (uint32_t*) dstv );

        srcp += i_src_stride;
        dstu += i_dst_stride;
        dstv += i_dst_stride;

        /* Second row of the pair: identical pipeline, unrolled. */
        src0v_16 = src2v_16;
        src1v_16 = src3v_16;
        VEC_LOAD( srcp, src2v_8, 9, vec_u8_t, src );
        src2v_16 = vec_u8_to_u16( src2v_8 );
        src3v_16 = vec_u8_to_u16( VSLD( src2v_8, src2v_8, 2 ) );

        dstv16 = vec_mladd( coeff0v, src0v_16, k32v );
        dstv16 = vec_mladd( coeff1v, src1v_16, dstv16 );
        dstv16 = vec_mladd( coeff2v, src2v_16, dstv16 );
        dstv16 = vec_mladd( coeff3v, src3v_16, dstv16 );

        dstv16 = vec_sr( dstv16, shiftv );

        dstuv = (vec_u8_t)vec_perm( dstv16, dstv16, perm0v );
        dstvv = (vec_u8_t)vec_perm( dstv16, dstv16, perm1v );
        vec_ste( (vec_u32_t)dstuv, 0, (uint32_t*) dstu );
        vec_ste( (vec_u32_t)dstvv, 0, (uint32_t*) dstv );

        srcp += i_src_stride;
        dstu += i_dst_stride;
        dstv += i_dst_stride;
    }
}
392
393
/* AltiVec bilinear 1/8-pel chroma MC for 8-wide blocks on an interleaved
 * U/V plane. Processes two rows per iteration; each 16-byte row vector
 * covers the 8 U/V pairs and the right-neighbour vector is formed by
 * shifting in 2 extra bytes loaded past the row. The bottom row of one
 * step is reused as the top row of the next. */
static void mc_chroma_altivec_8xh( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride,
                                   uint8_t *src, intptr_t i_src_stride,
                                   int mvx, int mvy, int i_height )
{
    uint8_t *srcp;
    int d8x = mvx & 0x07;
    int d8y = mvy & 0x07;

    /* The four bilinear weights (they sum to 64). */
    ALIGNED_16( uint16_t coeff[4] );
    coeff[0] = (8-d8x)*(8-d8y);
    coeff[1] = d8x *(8-d8y);
    coeff[2] = (8-d8x)*d8y;
    coeff[3] = d8x *d8y;

    src += (mvy >> 3) * i_src_stride + (mvx >> 3)*2;
    srcp = &src[i_src_stride];

    LOAD_ZERO;
    PREP_LOAD;
    PREP_LOAD_SRC( src );
    PREP_STORE8;
    vec_u16_t coeff0v, coeff1v, coeff2v, coeff3v;
    vec_u8_t src0v_8, src1v_8, src2v_8, src3v_8;
    vec_u8_t dstuv, dstvv;
    vec_u16_t src0v_16h, src1v_16h, src2v_16h, src3v_16h, dstv_16h;
    vec_u16_t src0v_16l, src1v_16l, src2v_16l, src3v_16l, dstv_16l;
    vec_u16_t shiftv, k32v;

    /* Splat each coefficient across a full vector. */
    coeff0v = vec_ld( 0, coeff );
    coeff3v = vec_splat( coeff0v, 3 );
    coeff2v = vec_splat( coeff0v, 2 );
    coeff1v = vec_splat( coeff0v, 1 );
    coeff0v = vec_splat( coeff0v, 0 );
    k32v = vec_sl( vec_splat_u16( 1 ), vec_splat_u16( 5 ) ); /* rounding 32 */
    shiftv = vec_splat_u16( 6 );

    /* Deinterleaving permutes over the h/l u16 halves: perm0v gathers
     * the 8 U bytes, perm1v the 8 V bytes. */
#ifdef WORDS_BIGENDIAN
    static const vec_u8_t perm0v = CV(1,5,9,13,17,21,25,29,0,0,0,0,0,0,0,0);
    static const vec_u8_t perm1v = CV(3,7,11,15,19,23,27,31,0,0,0,0,0,0,0,0);
#else
    static const vec_u8_t perm0v = CV(0,4,8,12,16,20,24,28,1,1,1,1,1,1,1,1);
    static const vec_u8_t perm1v = CV(2,6,10,14,18,22,26,30,1,1,1,1,1,1,1,1);
#endif

    /* Prime the pipeline with the first source row. */
    VEC_LOAD( src, src2v_8, 16, vec_u8_t, src );
    VEC_LOAD( src+16, src3v_8, 2, vec_u8_t, src );
    src3v_8 = VSLD( src2v_8, src3v_8, 2 );

    for( int y = 0; y < i_height; y += 2 )
    {
        src0v_8 = src2v_8;
        src1v_8 = src3v_8;
        VEC_LOAD( srcp, src2v_8, 16, vec_u8_t, src );
        VEC_LOAD( srcp+16, src3v_8, 2, vec_u8_t, src );

        src3v_8 = VSLD( src2v_8, src3v_8, 2 );

        /* Widen all four row vectors to u16 high/low halves. */
        src0v_16h = vec_u8_to_u16_h( src0v_8 );
        src0v_16l = vec_u8_to_u16_l( src0v_8 );
        src1v_16h = vec_u8_to_u16_h( src1v_8 );
        src1v_16l = vec_u8_to_u16_l( src1v_8 );
        src2v_16h = vec_u8_to_u16_h( src2v_8 );
        src2v_16l = vec_u8_to_u16_l( src2v_8 );
        src3v_16h = vec_u8_to_u16_h( src3v_8 );
        src3v_16l = vec_u8_to_u16_l( src3v_8 );

        /* dst = (c0*tl + c1*tr + c2*bl + c3*br + 32) >> 6 */
        dstv_16h = vec_mladd( coeff0v, src0v_16h, k32v );
        dstv_16l = vec_mladd( coeff0v, src0v_16l, k32v );
        dstv_16h = vec_mladd( coeff1v, src1v_16h, dstv_16h );
        dstv_16l = vec_mladd( coeff1v, src1v_16l, dstv_16l );
        dstv_16h = vec_mladd( coeff2v, src2v_16h, dstv_16h );
        dstv_16l = vec_mladd( coeff2v, src2v_16l, dstv_16l );
        dstv_16h = vec_mladd( coeff3v, src3v_16h, dstv_16h );
        dstv_16l = vec_mladd( coeff3v, src3v_16l, dstv_16l );

        dstv_16h = vec_sr( dstv_16h, shiftv );
        dstv_16l = vec_sr( dstv_16l, shiftv );

        dstuv = (vec_u8_t)vec_perm( dstv_16h, dstv_16l, perm0v );
        dstvv = (vec_u8_t)vec_perm( dstv_16h, dstv_16l, perm1v );

        VEC_STORE8( dstuv, dstu );
        VEC_STORE8( dstvv, dstv );

        srcp += i_src_stride;
        dstu += i_dst_stride;
        dstv += i_dst_stride;

        /* Second row of the pair: identical pipeline, unrolled. */
        src0v_8 = src2v_8;
        src1v_8 = src3v_8;
        VEC_LOAD( srcp, src2v_8, 16, vec_u8_t, src );
        VEC_LOAD( srcp+16, src3v_8, 2, vec_u8_t, src );

        src3v_8 = VSLD( src2v_8, src3v_8, 2 );

        src0v_16h = vec_u8_to_u16_h( src0v_8 );
        src0v_16l = vec_u8_to_u16_l( src0v_8 );
        src1v_16h = vec_u8_to_u16_h( src1v_8 );
        src1v_16l = vec_u8_to_u16_l( src1v_8 );
        src2v_16h = vec_u8_to_u16_h( src2v_8 );
        src2v_16l = vec_u8_to_u16_l( src2v_8 );
        src3v_16h = vec_u8_to_u16_h( src3v_8 );
        src3v_16l = vec_u8_to_u16_l( src3v_8 );

        dstv_16h = vec_mladd( coeff0v, src0v_16h, k32v );
        dstv_16l = vec_mladd( coeff0v, src0v_16l, k32v );
        dstv_16h = vec_mladd( coeff1v, src1v_16h, dstv_16h );
        dstv_16l = vec_mladd( coeff1v, src1v_16l, dstv_16l );
        dstv_16h = vec_mladd( coeff2v, src2v_16h, dstv_16h );
        dstv_16l = vec_mladd( coeff2v, src2v_16l, dstv_16l );
        dstv_16h = vec_mladd( coeff3v, src3v_16h, dstv_16h );
        dstv_16l = vec_mladd( coeff3v, src3v_16l, dstv_16l );

        dstv_16h = vec_sr( dstv_16h, shiftv );
        dstv_16l = vec_sr( dstv_16l, shiftv );

        dstuv = (vec_u8_t)vec_perm( dstv_16h, dstv_16l, perm0v );
        dstvv = (vec_u8_t)vec_perm( dstv_16h, dstv_16l, perm1v );

        VEC_STORE8( dstuv, dstu );
        VEC_STORE8( dstvv, dstv );

        srcp += i_src_stride;
        dstu += i_dst_stride;
        dstv += i_dst_stride;
    }
}
520
521
/* Chroma MC dispatcher: widths 8 and 4 have AltiVec kernels; anything
 * else (width 2) uses the scalar fallback. */
static void mc_chroma_altivec( uint8_t *dstu, uint8_t *dstv, intptr_t i_dst_stride,
                               uint8_t *src, intptr_t i_src_stride,
                               int mvx, int mvy, int i_width, int i_height )
{
    switch( i_width )
    {
        case 8:
            mc_chroma_altivec_8xh( dstu, dstv, i_dst_stride, src, i_src_stride,
                                   mvx, mvy, i_height );
            break;
        case 4:
            mc_chroma_altivec_4xh( dstu, dstv, i_dst_stride, src, i_src_stride,
                                   mvx, mvy, i_height );
            break;
        default:
            mc_chroma_2xh( dstu, dstv, i_dst_stride, src, i_src_stride,
                           mvx, mvy, i_height );
            break;
    }
}
535
536
/* 6-tap half-pel filter, full-precision form: with a = t1+t6, b = t2+t5,
 * c = t3+t4 (outer, inner, centre tap pairs), leaves a - 5*b + 20*c in
 * t1v. Requires the shift-count vectors twov and fourv in scope. */
#define HPEL_FILTER_1( t1v, t2v, t3v, t4v, t5v, t6v ) \
{                                                     \
    t1v = vec_add( t1v, t6v );                        \
    t2v = vec_add( t2v, t5v );                        \
    t3v = vec_add( t3v, t4v );                        \
                                                      \
    t1v = vec_sub( t1v, t2v );   /* (a-b) */          \
    t2v = vec_sub( t2v, t3v );   /* (b-c) */          \
    t2v = vec_sl(  t2v, twov );  /* (b-c)*4 */        \
    t1v = vec_sub( t1v, t2v );   /* a-5*b+4*c */      \
    t3v = vec_sl(  t3v, fourv ); /* 16*c */           \
    t1v = vec_add( t1v, t3v );   /* a-5*b+20*c */     \
}
549
550
/* 6-tap half-pel filter, pre-scaled form used by the central (diagonal)
 * pass: same tap pairing as HPEL_FILTER_1 but folds in a /16 so the
 * result in t1v is (a - 5*b + 20*c)/16. Requires twov in scope. */
#define HPEL_FILTER_2( t1v, t2v, t3v, t4v, t5v, t6v )             \
{                                                                 \
    t1v = vec_add( t1v, t6v );                                    \
    t2v = vec_add( t2v, t5v );                                    \
    t3v = vec_add( t3v, t4v );                                    \
                                                                  \
    t1v = vec_sub( t1v, t2v );  /* (a-b) */                       \
    t1v = vec_sra( t1v, twov ); /* (a-b)/4 */                     \
    t1v = vec_sub( t1v, t2v );  /* (a-b)/4-b */                   \
    t1v = vec_add( t1v, t3v );  /* (a-b)/4-b+c */                 \
    t1v = vec_sra( t1v, twov ); /* ((a-b)/4-b+c)/4 */             \
    t1v = vec_add( t1v, t3v );  /* ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 */ \
}
563
564
/* One 16-pixel horizontal half-pel step: loads src[x-2..x+18] for row y,
 * builds the six shifted tap vectors with VSLD, filters the high and low
 * halves at s16 precision, rounds ((v+16)>>5), packs with unsigned
 * saturation and stores 16 results to dsth. Uses x, y, i_stride, src,
 * dsth and the temp/shift vectors from the enclosing function. */
#define HPEL_FILTER_HORIZONTAL()                                \
{                                                               \
    VEC_LOAD_G( &src[x- 2+i_stride*y], src1v, 16, vec_u8_t);    \
    VEC_LOAD_G( &src[x+14+i_stride*y], src6v, 16, vec_u8_t);    \
                                                                \
    src2v = VSLD( src1v, src6v, 1 );                            \
    src3v = VSLD( src1v, src6v, 2 );                            \
    src4v = VSLD( src1v, src6v, 3 );                            \
    src5v = VSLD( src1v, src6v, 4 );                            \
    src6v = VSLD( src1v, src6v, 5 );                            \
                                                                \
    temp1v = vec_u8_to_s16_h( src1v );                          \
    temp2v = vec_u8_to_s16_h( src2v );                          \
    temp3v = vec_u8_to_s16_h( src3v );                          \
    temp4v = vec_u8_to_s16_h( src4v );                          \
    temp5v = vec_u8_to_s16_h( src5v );                          \
    temp6v = vec_u8_to_s16_h( src6v );                          \
                                                                \
    HPEL_FILTER_1( temp1v, temp2v, temp3v,                      \
                   temp4v, temp5v, temp6v );                    \
                                                                \
    dest1v = vec_add( temp1v, sixteenv );                       \
    dest1v = vec_sra( dest1v, fivev );                          \
                                                                \
    temp1v = vec_u8_to_s16_l( src1v );                          \
    temp2v = vec_u8_to_s16_l( src2v );                          \
    temp3v = vec_u8_to_s16_l( src3v );                          \
    temp4v = vec_u8_to_s16_l( src4v );                          \
    temp5v = vec_u8_to_s16_l( src5v );                          \
    temp6v = vec_u8_to_s16_l( src6v );                          \
                                                                \
    HPEL_FILTER_1( temp1v, temp2v, temp3v,                      \
                   temp4v, temp5v, temp6v );                    \
                                                                \
    dest2v = vec_add( temp1v, sixteenv );                       \
    dest2v = vec_sra( dest2v, fivev );                          \
                                                                \
    destv = vec_packsu( dest1v, dest2v );                       \
                                                                \
    VEC_STORE16( destv, &dsth[x+i_stride*y], dsth );            \
}
605
606
/* One 16-pixel vertical half-pel step: loads the six rows y-2..y+3 at
 * column x, filters high/low halves at s16 precision, rounds and packs,
 * and stores 16 results to dstv. The low-half filter deliberately uses
 * temp4v..temp9v so that temp1v (high half, full precision) survives for
 * the central pass in the caller. */
#define HPEL_FILTER_VERTICAL()                                    \
{                                                                 \
    VEC_LOAD( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t, src ); \
    VEC_LOAD( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t, src ); \
    VEC_LOAD( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t, src ); \
    VEC_LOAD( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t, src ); \
    VEC_LOAD( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t, src ); \
    VEC_LOAD( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t, src ); \
                                                                  \
    temp1v = vec_u8_to_s16_h( src1v );                            \
    temp2v = vec_u8_to_s16_h( src2v );                            \
    temp3v = vec_u8_to_s16_h( src3v );                            \
    temp4v = vec_u8_to_s16_h( src4v );                            \
    temp5v = vec_u8_to_s16_h( src5v );                            \
    temp6v = vec_u8_to_s16_h( src6v );                            \
                                                                  \
    HPEL_FILTER_1( temp1v, temp2v, temp3v,                        \
                   temp4v, temp5v, temp6v );                      \
                                                                  \
    dest1v = vec_add( temp1v, sixteenv );                         \
    dest1v = vec_sra( dest1v, fivev );                            \
                                                                  \
    temp4v = vec_u8_to_s16_l( src1v );                            \
    temp5v = vec_u8_to_s16_l( src2v );                            \
    temp6v = vec_u8_to_s16_l( src3v );                            \
    temp7v = vec_u8_to_s16_l( src4v );                            \
    temp8v = vec_u8_to_s16_l( src5v );                            \
    temp9v = vec_u8_to_s16_l( src6v );                            \
                                                                  \
    HPEL_FILTER_1( temp4v, temp5v, temp6v,                        \
                   temp7v, temp8v, temp9v );                      \
                                                                  \
    dest2v = vec_add( temp4v, sixteenv );                         \
    dest2v = vec_sra( dest2v, fivev );                            \
                                                                  \
    destv = vec_packsu( dest1v, dest2v );                         \
                                                                  \
    VEC_STORE16( destv, &dstv[x+i_stride*y], dsth );              \
}
645
646
/* One 16-pixel central (diagonal) half-pel step: horizontally filters
 * the vertically-filtered s16 data held in tempav..tempdv (each vector
 * is 8 lanes, so VSLD offsets are in bytes = 2*lanes), rounds with
 * ((v+32)>>6), packs and stores to dstc at x-16 — one vector behind the
 * current column, since the pipeline needs a lookahead vector. */
#define HPEL_FILTER_CENTRAL()                     \
{                                                 \
    temp1v = VSLD( tempav, tempbv, 12 );          \
    temp2v = VSLD( tempav, tempbv, 14 );          \
    temp3v = tempbv;                              \
    temp4v = VSLD( tempbv, tempcv, 2 );           \
    temp5v = VSLD( tempbv, tempcv, 4 );           \
    temp6v = VSLD( tempbv, tempcv, 6 );           \
                                                  \
    HPEL_FILTER_2( temp1v, temp2v, temp3v,        \
                   temp4v, temp5v, temp6v );      \
                                                  \
    dest1v = vec_add( temp1v, thirtytwov );       \
    dest1v = vec_sra( dest1v, sixv );             \
                                                  \
    temp1v = VSLD( tempbv, tempcv, 12 );          \
    temp2v = VSLD( tempbv, tempcv, 14 );          \
    temp3v = tempcv;                              \
    temp4v = VSLD( tempcv, tempdv, 2 );           \
    temp5v = VSLD( tempcv, tempdv, 4 );           \
    temp6v = VSLD( tempcv, tempdv, 6 );           \
                                                  \
    HPEL_FILTER_2( temp1v, temp2v, temp3v,        \
                   temp4v, temp5v, temp6v );      \
                                                  \
    dest2v = vec_add( temp1v, thirtytwov );       \
    dest2v = vec_sra( dest2v, sixv );             \
                                                  \
    destv = vec_packsu( dest1v, dest2v );         \
                                                  \
    VEC_STORE16( destv, &dstc[x-16+i_stride*y], dsth ); \
}
678
679
/* Generate the three half-pel interpolated planes (dsth = horizontal,
 * dstv = vertical, dstc = central/diagonal) from src, 16 pixels at a
 * time. The central plane reuses the full-precision vertical-filter
 * output (temp1v/temp4v) carried across iterations in tempav..tempev,
 * which is why dstc trails one vector behind (see HPEL_FILTER_CENTRAL).
 * The `buf` parameter is unused by this implementation. */
void x264_hpel_filter_altivec( uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, uint8_t *src,
                               intptr_t i_stride, int i_width, int i_height, int16_t *buf )
{
    vec_u8_t destv;
    vec_u8_t src1v, src2v, src3v, src4v, src5v, src6v;
    vec_s16_t dest1v, dest2v;
    vec_s16_t temp1v, temp2v, temp3v, temp4v, temp5v, temp6v, temp7v, temp8v, temp9v;
    vec_s16_t tempav, tempbv, tempcv, tempdv, tempev;

    PREP_LOAD;
    PREP_LOAD_SRC( src);
    PREP_STORE16;
    PREP_STORE16_DST( dsth );
    LOAD_ZERO;

    /* Splat the shift counts and rounding constants used by the filter
     * macros. */
    vec_u16_t twov, fourv, fivev, sixv;
    vec_s16_t sixteenv, thirtytwov;
    vec_u16_u temp_u;

    temp_u.s[0]=2;
    twov = vec_splat( temp_u.v, 0 );
    temp_u.s[0]=4;
    fourv = vec_splat( temp_u.v, 0 );
    temp_u.s[0]=5;
    fivev = vec_splat( temp_u.v, 0 );
    temp_u.s[0]=6;
    sixv = vec_splat( temp_u.v, 0 );
    temp_u.s[0]=16;
    sixteenv = (vec_s16_t)vec_splat( temp_u.v, 0 );
    temp_u.s[0]=32;
    thirtytwov = (vec_s16_t)vec_splat( temp_u.v, 0 );

    for( int y = 0; y < i_height; y++ )
    {
        int x = 0;

        /* horizontal_filter */
        HPEL_FILTER_HORIZONTAL();

        /* vertical_filter */
        HPEL_FILTER_VERTICAL();

        /* central_filter: prime the sliding window; the left edge is
         * padded by replicating the first lane. */
        tempav = tempcv;
        tempbv = tempdv;
        tempcv = vec_splat( temp1v, 0 ); /* first only */
        tempdv = temp1v;
        tempev = temp4v;

        for( x = 16; x < i_width; x+=16 )
        {
            /* horizontal_filter */
            HPEL_FILTER_HORIZONTAL();

            /* vertical_filter */
            HPEL_FILTER_VERTICAL();

            /* central_filter: slide the window one vector to the right. */
            tempav = tempcv;
            tempbv = tempdv;
            tempcv = tempev;
            tempdv = temp1v;
            tempev = temp4v;

            HPEL_FILTER_CENTRAL();
        }

        /* Partial vertical filter for the lookahead needed by the last
         * central step (only the high half of the result is used). */
        VEC_LOAD_PARTIAL( &src[x+i_stride*(y-2)], src1v, 16, vec_u8_t, src );
        VEC_LOAD_PARTIAL( &src[x+i_stride*(y-1)], src2v, 16, vec_u8_t, src );
        VEC_LOAD_PARTIAL( &src[x+i_stride*(y-0)], src3v, 16, vec_u8_t, src );
        VEC_LOAD_PARTIAL( &src[x+i_stride*(y+1)], src4v, 16, vec_u8_t, src );
        VEC_LOAD_PARTIAL( &src[x+i_stride*(y+2)], src5v, 16, vec_u8_t, src );
        VEC_LOAD_PARTIAL( &src[x+i_stride*(y+3)], src6v, 16, vec_u8_t, src );

        temp1v = vec_u8_to_s16_h( src1v );
        temp2v = vec_u8_to_s16_h( src2v );
        temp3v = vec_u8_to_s16_h( src3v );
        temp4v = vec_u8_to_s16_h( src4v );
        temp5v = vec_u8_to_s16_h( src5v );
        temp6v = vec_u8_to_s16_h( src6v );

        HPEL_FILTER_1( temp1v, temp2v, temp3v, temp4v, temp5v, temp6v );

        /* central_filter */
        tempav = tempcv;
        tempbv = tempdv;
        tempcv = tempev;
        tempdv = temp1v;
        /* tempev is not used */

        HPEL_FILTER_CENTRAL();
    }
}
773
774
/* Build the four half-resolution lookahead planes from src0:
 * dst0 = average of each 2x2 block, dsth/dstv/dstc = the same with a
 * half-pel offset in x, y, and both. Each output row consumes two input
 * rows; 32 input pixels yield 16 output pixels per inner iteration, and
 * a trailing `end` pass handles the final partial vector with 4-byte
 * vec_ste stores. Loads use bare vec_ld, so src rows are assumed
 * 16-byte aligned. */
static void frame_init_lowres_core_altivec( uint8_t *src0, uint8_t *dst0, uint8_t *dsth, uint8_t *dstv, uint8_t *dstc,
                                            intptr_t src_stride, intptr_t dst_stride, int width, int height )
{
    int w = width >> 4;
    int end = (width & 15);
    vec_u8_t src0v, src1v, src2v;
    vec_u8_t lv, hv, src1p1v;
    vec_u8_t avg0v, avg1v, avghv, avghp1v, avgleftv, avgrightv;
    /* Picks the even bytes of a pair of vectors (horizontal decimation). */
    static const vec_u8_t inverse_bridge_shuffle = CV(0x00, 0x02, 0x04, 0x06, 0x08, 0x0A, 0x0C, 0x0E, 0x10, 0x12, 0x14, 0x16, 0x18, 0x1A, 0x1C, 0x1E );
#ifndef WORDS_BIGENDIAN
    /* Odd-byte variant; on big-endian the same effect comes from the
     * vec_pack truncation below. */
    static const vec_u8_t inverse_bridge_shuffle_1 = CV(0x01, 0x03, 0x05, 0x07, 0x09, 0x0B, 0x0D, 0x0F, 0x11, 0x13, 0x15, 0x17, 0x19, 0x1B, 0x1D, 0x1F );
#endif

    for( int y = 0; y < height; y++ )
    {
        int x;
        uint8_t *src1 = src0+src_stride;
        uint8_t *src2 = src1+src_stride;

        /* Vertical averages of row pairs (0,1) and (1,2). */
        src0v = vec_ld(0, src0);
        src1v = vec_ld(0, src1);
        src2v = vec_ld(0, src2);

        avg0v = vec_avg(src0v, src1v);
        avg1v = vec_avg(src1v, src2v);

        for( x = 0; x < w; x++ )
        {
            lv = vec_ld(16*(x*2+1), src0);
            src1v = vec_ld(16*(x*2+1), src1);
            avghv = vec_avg(lv, src1v);

            lv = vec_ld(16*(x*2+2), src0);
            src1p1v = vec_ld(16*(x*2+2), src1);
            avghp1v = vec_avg(lv, src1p1v);

            /* Horizontal averages: current vs. one-byte-shifted. */
            avgleftv = vec_avg(VSLD(avg0v, avghv, 1), avg0v);
            avgrightv = vec_avg(VSLD(avghv, avghp1v, 1), avghv);

            vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle), 16*x, dst0);
#ifdef WORDS_BIGENDIAN
            vec_st((vec_u8_t)vec_pack((vec_u16_t)avgleftv,(vec_u16_t)avgrightv), 16*x, dsth);
#else
            vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle_1), 16*x, dsth);
#endif

            avg0v = avghp1v;

            /* Same again for the (1,2) row pair -> dstv/dstc. */
            hv = vec_ld(16*(x*2+1), src2);
            avghv = vec_avg(src1v, hv);

            hv = vec_ld(16*(x*2+2), src2);
            avghp1v = vec_avg(src1p1v, hv);

            avgleftv = vec_avg(VSLD(avg1v, avghv, 1), avg1v);
            avgrightv = vec_avg(VSLD(avghv, avghp1v, 1), avghv);

            vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle), 16*x, dstv);
#ifdef WORDS_BIGENDIAN
            vec_st((vec_u8_t)vec_pack((vec_u16_t)avgleftv,(vec_u16_t)avgrightv), 16*x, dstc);
#else
            vec_st(vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle_1), 16*x, dstc);
#endif

            avg1v = avghp1v;

        }
        if( end )
        {
            /* Final partial vector: 8 output pixels, stored as two
             * 4-byte element stores per plane. */
            lv = vec_ld(16*(x*2+1), src0);
            src1v = vec_ld(16*(x*2+1), src1);
            avghv = vec_avg(lv, src1v);

            lv = vec_ld(16*(x*2+1), src2);
            avghp1v = vec_avg(src1v, lv);

            avgleftv = vec_avg(VSLD(avg0v, avghv, 1), avg0v);
            avgrightv = vec_avg(VSLD(avg1v, avghp1v, 1), avg1v);

            lv = vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle);
#ifdef WORDS_BIGENDIAN
            hv = (vec_u8_t)vec_pack((vec_u16_t)avgleftv,(vec_u16_t)avgrightv);
#else
            hv = vec_perm(avgleftv, avgrightv, inverse_bridge_shuffle_1);
#endif

            vec_ste((vec_u32_t)lv,16*x,(uint32_t*)dst0);
            vec_ste((vec_u32_t)lv,16*x+4,(uint32_t*)dst0);
            vec_ste((vec_u32_t)hv,16*x,(uint32_t*)dsth);
            vec_ste((vec_u32_t)hv,16*x+4,(uint32_t*)dsth);

            lv = vec_sld(lv, lv, 8);
            hv = vec_sld(hv, hv, 8);

            vec_ste((vec_u32_t)lv,16*x,(uint32_t*)dstv);
            vec_ste((vec_u32_t)lv,16*x+4,(uint32_t*)dstv);
            vec_ste((vec_u32_t)hv,16*x,(uint32_t*)dstc);
            vec_ste((vec_u32_t)hv,16*x+4,(uint32_t*)dstc);
        }

        src0 += src_stride*2;
        dst0 += dst_stride;
        dsth += dst_stride;
        dstv += dst_stride;
        dstc += dst_stride;
    }
}
881
882
/* H.264 weighted prediction for a 2-pixel-wide column:
 * dst = clip(((src * scale + round) >> denom) + offset), with the
 * denom==0 case folding the offset directly into the multiply-add. */
static void mc_weight_w2_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src,
                                  const x264_weight_t *weight, int i_height )
{
    LOAD_ZERO;
    PREP_LOAD;
    PREP_LOAD_SRC( src );
    vec_u8_t srcv;
    vec_s16_t weightv;
    vec_s16_t scalev, offsetv, denomv, roundv;
    vec_s16_u loadv;

    int denom = weight->i_denom;

    /* Broadcast scale and offset across all lanes. */
    loadv.s[0] = weight->i_scale;
    scalev = vec_splat( loadv.v, 0 );

    loadv.s[0] = weight->i_offset;
    offsetv = vec_splat( loadv.v, 0 );

    if( denom >= 1 )
    {
        loadv.s[0] = denom;
        denomv = vec_splat( loadv.v, 0 );

        loadv.s[0] = 1<<(denom - 1); /* rounding term for the shift */
        roundv = vec_splat( loadv.v, 0 );

        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
        {
            VEC_LOAD( src, srcv, 2, vec_u8_t, src );
            weightv = vec_u8_to_s16( srcv );

            weightv = vec_mladd( weightv, scalev, roundv );
            weightv = vec_sra( weightv, (vec_u16_t)denomv );
            weightv = vec_add( weightv, offsetv );

            /* Saturate to u8 and store just the first two bytes. */
            srcv = vec_packsu( weightv, zero_s16v );
            vec_ste( vec_splat( (vec_u16_t)srcv, 0 ), 0, (uint16_t*)dst );
        }
    }
    else
    {
        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
        {
            VEC_LOAD( src, srcv, 2, vec_u8_t, src );
            weightv = vec_u8_to_s16( srcv );

            weightv = vec_mladd( weightv, scalev, offsetv );

            srcv = vec_packsu( weightv, zero_s16v );
            vec_ste( vec_splat( (vec_u16_t)srcv, 0 ), 0, (uint16_t*)dst );
        }
    }
}
936
/* H.264 weighted prediction for a 4-pixel-wide column; same arithmetic
 * as mc_weight_w2_altivec but stores a 4-byte element per row. */
static void mc_weight_w4_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src,
                                  const x264_weight_t *weight, int i_height )
{
    LOAD_ZERO;
    PREP_LOAD;
    PREP_LOAD_SRC( src );
    vec_u8_t srcv;
    vec_s16_t weightv;
    vec_s16_t scalev, offsetv, denomv, roundv;
    vec_s16_u loadv;

    int denom = weight->i_denom;

    /* Broadcast scale and offset across all lanes. */
    loadv.s[0] = weight->i_scale;
    scalev = vec_splat( loadv.v, 0 );

    loadv.s[0] = weight->i_offset;
    offsetv = vec_splat( loadv.v, 0 );

    if( denom >= 1 )
    {
        loadv.s[0] = denom;
        denomv = vec_splat( loadv.v, 0 );

        loadv.s[0] = 1<<(denom - 1); /* rounding term for the shift */
        roundv = vec_splat( loadv.v, 0 );

        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
        {
            VEC_LOAD( src, srcv, 4, vec_u8_t, src );
            weightv = vec_u8_to_s16( srcv );

            weightv = vec_mladd( weightv, scalev, roundv );
            weightv = vec_sra( weightv, (vec_u16_t)denomv );
            weightv = vec_add( weightv, offsetv );

            /* Saturate to u8 and store the first four bytes. */
            srcv = vec_packsu( weightv, zero_s16v );
            vec_ste( vec_splat( (vec_u32_t)srcv, 0 ), 0, (uint32_t*)dst );
        }
    }
    else
    {
        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
        {
            VEC_LOAD( src, srcv, 4, vec_u8_t, src );
            weightv = vec_u8_to_s16( srcv );

            weightv = vec_mladd( weightv, scalev, offsetv );

            srcv = vec_packsu( weightv, zero_s16v );
            vec_ste( vec_splat( (vec_u32_t)srcv, 0 ), 0, (uint32_t*)dst );
        }
    }
}
990
/* H.264 weighted prediction for an 8-pixel-wide column; same arithmetic
 * as mc_weight_w2_altivec but stores 8 bytes per row via VEC_STORE8. */
static void mc_weight_w8_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src,
                                  const x264_weight_t *weight, int i_height )
{
    LOAD_ZERO;
    PREP_LOAD;
    PREP_LOAD_SRC( src );
    PREP_STORE8;
    vec_u8_t srcv;
    vec_s16_t weightv;
    vec_s16_t scalev, offsetv, denomv, roundv;
    vec_s16_u loadv;

    int denom = weight->i_denom;

    /* Broadcast scale and offset across all lanes. */
    loadv.s[0] = weight->i_scale;
    scalev = vec_splat( loadv.v, 0 );

    loadv.s[0] = weight->i_offset;
    offsetv = vec_splat( loadv.v, 0 );

    if( denom >= 1 )
    {
        loadv.s[0] = denom;
        denomv = vec_splat( loadv.v, 0 );

        loadv.s[0] = 1<<(denom - 1); /* rounding term for the shift */
        roundv = vec_splat( loadv.v, 0 );

        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
        {
            VEC_LOAD( src, srcv, 8, vec_u8_t, src );
            weightv = vec_u8_to_s16( srcv );

            weightv = vec_mladd( weightv, scalev, roundv );
            weightv = vec_sra( weightv, (vec_u16_t)denomv );
            weightv = vec_add( weightv, offsetv );

            srcv = vec_packsu( weightv, zero_s16v ); /* saturate to u8 */
            VEC_STORE8( srcv, dst );
        }
    }
    else
    {
        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
        {
            VEC_LOAD( src, srcv, 8, vec_u8_t, src );
            weightv = vec_u8_to_s16( srcv );

            weightv = vec_mladd( weightv, scalev, offsetv );

            srcv = vec_packsu( weightv, zero_s16v );
            VEC_STORE8( srcv, dst );
        }
    }
}
1045
/* Explicit weighted prediction for a 16-pixel-wide block:
 * per pixel, dst = clip_u8( ((src * scale + round) >> denom) + offset )
 * when denom >= 1, else dst = clip_u8( src * scale + offset ).
 * Each row is split into high/low halves of 8 pixels so the math can
 * be done in 16-bit lanes, then repacked and stored as one 16-byte
 * vector (vec_st assumes dst is 16-byte aligned — standard for x264
 * 16-wide planes). */
static void mc_weight_w16_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src,
                                   const x264_weight_t *weight, int i_height )
{
    LOAD_ZERO;
    PREP_LOAD;
    PREP_LOAD_SRC( src );
    vec_u8_t srcv;
    vec_s16_t weight_lv, weight_hv;    /* low/high 8 pixels of the row, widened to s16 */
    vec_s16_t scalev, offsetv, denomv, roundv;
    vec_s16_u loadv;                   /* scalar <-> vector transfer union */

    int denom = weight->i_denom;

    /* Broadcast scale and offset into every 16-bit lane. */
    loadv.s[0] = weight->i_scale;
    scalev = vec_splat( loadv.v, 0 );

    loadv.s[0] = weight->i_offset;
    offsetv = vec_splat( loadv.v, 0 );

    if( denom >= 1 )
    {
        /* Rounded-shift path: add 1<<(denom-1) before shifting right. */
        loadv.s[0] = denom;
        denomv = vec_splat( loadv.v, 0 );

        loadv.s[0] = 1<<(denom - 1);
        roundv = vec_splat( loadv.v, 0 );

        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
        {
            VEC_LOAD( src, srcv, 16, vec_u8_t, src );
            weight_hv = vec_u8_to_s16_h( srcv );
            weight_lv = vec_u8_to_s16_l( srcv );

            weight_hv = vec_mladd( weight_hv, scalev, roundv );
            weight_lv = vec_mladd( weight_lv, scalev, roundv );
            weight_hv = vec_sra( weight_hv, (vec_u16_t)denomv );
            weight_lv = vec_sra( weight_lv, (vec_u16_t)denomv );
            weight_hv = vec_add( weight_hv, offsetv );
            weight_lv = vec_add( weight_lv, offsetv );

            /* vec_packsu saturates s16 -> u8, performing the 0..255 clip. */
            srcv = vec_packsu( weight_hv, weight_lv );
            vec_st( srcv, 0, dst );
        }
    }
    else
    {
        /* denom == 0: no shift, so offset can be folded into the multiply-add. */
        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
        {
            VEC_LOAD( src, srcv, 16, vec_u8_t, src );
            weight_hv = vec_u8_to_s16_h( srcv );
            weight_lv = vec_u8_to_s16_l( srcv );

            weight_hv = vec_mladd( weight_hv, scalev, offsetv );
            weight_lv = vec_mladd( weight_lv, scalev, offsetv );

            srcv = vec_packsu( weight_hv, weight_lv );
            vec_st( srcv, 0, dst );
        }
    }
}
1105
/* Explicit weighted prediction for a 20-pixel-wide block (16 + 4):
 * per pixel, dst = clip_u8( ((src * scale + round) >> denom) + offset )
 * when denom >= 1, else dst = clip_u8( src * scale + offset ).
 * Each row needs three aligned vec_ld's (src may be unaligned): the two
 * vec_perm's with the _src_ alignment mask re-align bytes 0..15 into
 * src_1v and bytes 16..19 into the head of src_3v.  The first 16 output
 * pixels go out via an aligned vec_st, the last 4 via a single-word
 * vec_ste at offset 16 — both presume dst is 16-byte aligned, as usual
 * for x264 intermediate buffers (NOTE(review): confirm against callers). */
static void mc_weight_w20_altivec( uint8_t *dst, intptr_t i_dst, uint8_t *src, intptr_t i_src,
                                   const x264_weight_t *weight, int i_height )
{
    LOAD_ZERO;
    PREP_LOAD_SRC( src );
    vec_u8_t src_1v, src_2v, src_3v;
    vec_s16_t weight_lv, weight_hv, weight_3v;  /* pixels 0-7, 8-15, 16-19 widened to s16 */
    vec_s16_t scalev, offsetv, denomv, roundv;
    vec_s16_u loadv;                            /* scalar <-> vector transfer union */

    int denom = weight->i_denom;

    /* Broadcast scale and offset into every 16-bit lane. */
    loadv.s[0] = weight->i_scale;
    scalev = vec_splat( loadv.v, 0 );

    loadv.s[0] = weight->i_offset;
    offsetv = vec_splat( loadv.v, 0 );

    if( denom >= 1 )
    {
        /* Rounded-shift path: add 1<<(denom-1) before shifting right. */
        loadv.s[0] = denom;
        denomv = vec_splat( loadv.v, 0 );

        loadv.s[0] = 1<<(denom - 1);
        roundv = vec_splat( loadv.v, 0 );

        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
        {
            /* vec_ld rounds its address down to 16B; loading at 0, 16 and 19
             * guarantees all 20 source bytes are covered for any alignment. */
            src_1v = vec_ld( 0,  src );
            src_2v = vec_ld( 16, src );
            src_3v = vec_ld( 19, src );
            src_1v = vec_perm( src_1v, src_2v, _src_ );
            src_3v = vec_perm( src_2v, src_3v, _src_ );
            weight_hv = vec_u8_to_s16_h( src_1v );
            weight_lv = vec_u8_to_s16_l( src_1v );
            weight_3v = vec_u8_to_s16_h( src_3v );

            weight_hv = vec_mladd( weight_hv, scalev, roundv );
            weight_lv = vec_mladd( weight_lv, scalev, roundv );
            weight_3v = vec_mladd( weight_3v, scalev, roundv );
            weight_hv = vec_sra( weight_hv, (vec_u16_t)denomv );
            weight_lv = vec_sra( weight_lv, (vec_u16_t)denomv );
            weight_3v = vec_sra( weight_3v, (vec_u16_t)denomv );
            weight_hv = vec_add( weight_hv, offsetv );
            weight_lv = vec_add( weight_lv, offsetv );
            weight_3v = vec_add( weight_3v, offsetv );

            /* vec_packsu saturates s16 -> u8, performing the 0..255 clip. */
            src_1v = vec_packsu( weight_hv, weight_lv );
            src_3v = vec_packsu( weight_3v, zero_s16v );
            vec_st( src_1v, 0, dst );
            vec_ste( (vec_u32_t)src_3v, 16, (uint32_t*)dst );  /* last 4 pixels */
        }
    }
    else
    {
        /* denom == 0: no shift, so offset can be folded into the multiply-add. */
        for( int y = 0; y < i_height; y++, dst += i_dst, src += i_src )
        {
            src_1v = vec_ld( 0,  src );
            src_2v = vec_ld( 16, src );
            src_3v = vec_ld( 19, src );
            src_1v = vec_perm( src_1v, src_2v, _src_ );
            src_3v = vec_perm( src_2v, src_3v, _src_ );
            weight_hv = vec_u8_to_s16_h( src_1v );
            weight_lv = vec_u8_to_s16_l( src_1v );
            weight_3v = vec_u8_to_s16_h( src_3v );

            weight_hv = vec_mladd( weight_hv, scalev, offsetv );
            weight_lv = vec_mladd( weight_lv, scalev, offsetv );
            weight_3v = vec_mladd( weight_3v, scalev, offsetv );

            src_1v = vec_packsu( weight_hv, weight_lv );
            src_3v = vec_packsu( weight_3v, zero_s16v );
            vec_st( src_1v, 0, dst );
            vec_ste( (vec_u32_t)src_3v, 16, (uint32_t*)dst );
        }
    }
}
1182
1183
/* Weighted-prediction kernels indexed by x264's width class.
 * Entries 3 and 4 both use the 16-wide kernel — presumably the 12-wide
 * class is served by the w16 routine (matching the other platforms'
 * tables); verify against the scalar wtab if in doubt. */
static weight_fn_t x264_mc_weight_wtab_altivec[6] =
{
    mc_weight_w2_altivec,
    mc_weight_w4_altivec,
    mc_weight_w8_altivec,
    mc_weight_w16_altivec,
    mc_weight_w16_altivec,
    mc_weight_w20_altivec,
};
1192
1193
#endif // !HIGH_BIT_DEPTH
1194
1195
/* Install the AltiVec motion-compensation kernels into x264's function
 * table.  Only the 8-bit-depth build has AltiVec implementations; at
 * high bit depth the table is left untouched and the C fallbacks stand. */
void x264_mc_altivec_init( x264_mc_functions_t *pf )
{
#if !HIGH_BIT_DEPTH
    /* Weighted prediction (dispatch table covers w2..w20). */
    pf->weight = x264_mc_weight_wtab_altivec;

    /* Plain block copies. */
    pf->copy_16x16_unaligned = x264_mc_copy_w16_altivec;
    pf->copy[PIXEL_16x16]    = x264_mc_copy_w16_aligned_altivec;

    /* Luma and chroma interpolation. */
    pf->mc_luma   = mc_luma_altivec;
    pf->get_ref   = get_ref_altivec;
    pf->mc_chroma = mc_chroma_altivec;

    /* Half-pel filter and lowres-pyramid initialization. */
    pf->hpel_filter            = x264_hpel_filter_altivec;
    pf->frame_init_lowres_core = frame_init_lowres_core_altivec;
#endif // !HIGH_BIT_DEPTH
}
1211
1212