/* Lookahead lowres intra analysis
*
* Each intra analysis function has been implemented twice, once for scalar GPUs
* (NV) and once for vectorized GPUs (AMD pre-Southern Islands). x264 detects
* the GPU type and sets the -DVECTORIZE compile flag accordingly.
*
* All the intra analysis functions were based on their C versions in pixel.c
* and produce the exact same results.
*/
/* force all clamp arguments and return value to int, prevent ambiguous types */
#define clamp_int( X, MIN, MAX ) (int) clamp( (int)(X), (int)(MIN), (int)(MAX) )
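/* satd_8x4_intra_lr() (vectorized) and satd_8x4_lp() (scalar, generated by the
 * SATD_C_8x4_Q macro) compute the SATD of an 8x4 block of source pixels against
 * four predicted rows: residuals are run through 4x4 Hadamard transforms and the
 * summed absolute transform coefficients are returned, halved. */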
#if VECTORIZE
int satd_8x4_intra_lr( const local pixel *data, int data_stride, int8 pr0, int8 pr1, int8 pr2, int8 pr3 )
{
    int8 a_v, d_v;
    int2 tmp00, tmp01, tmp02, tmp03, tmp10, tmp11, tmp12, tmp13;
    int2 tmp20, tmp21, tmp22, tmp23, tmp30, tmp31, tmp32, tmp33;
    d_v = convert_int8( vload8( 0, data ) );
    a_v.s01234567 = (d_v - pr0).s04152637;
    HADAMARD4V( tmp00, tmp01, tmp02, tmp03, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
    data += data_stride;
    d_v = convert_int8( vload8( 0, data ) );
    a_v.s01234567 = (d_v - pr1).s04152637;
    HADAMARD4V( tmp10, tmp11, tmp12, tmp13, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
    data += data_stride;
    d_v = convert_int8( vload8( 0, data ) );
    a_v.s01234567 = (d_v - pr2).s04152637;
    HADAMARD4V( tmp20, tmp21, tmp22, tmp23, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
    data += data_stride;
    d_v = convert_int8( vload8( 0, data ) );
    a_v.s01234567 = (d_v - pr3).s04152637;
    HADAMARD4V( tmp30, tmp31, tmp32, tmp33, a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi );
    uint8 sum_v;
    HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp00, tmp10, tmp20, tmp30 );
    sum_v  = abs( a_v );
    HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp01, tmp11, tmp21, tmp31 );
    sum_v += abs( a_v );
    HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp02, tmp12, tmp22, tmp32 );
    sum_v += abs( a_v );
    HADAMARD4V( a_v.lo.lo, a_v.lo.hi, a_v.hi.lo, a_v.hi.hi, tmp03, tmp13, tmp23, tmp33 );
    sum_v += abs( a_v );
    uint4 sum2 = sum_v.hi + sum_v.lo;
    uint2 sum3 = sum2.hi + sum2.lo;
    return ( sum3.hi + sum3.lo ) >> 1;
}
#else
SATD_C_8x4_Q( satd_8x4_lp, const local, private )
#endif
/****************************************************************************
* 8x8 prediction for intra luma block
****************************************************************************/
#define F1 rhadd
#define F2( a, b, c ) ( a+2*b+c+2 )>>2
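/* F1 is the two-tap rounded average ((a+b+1)>>1, via OpenCL rhadd); F2 is the
 * three-tap lowpass filter used by H.264 intra prediction,
 * e.g. F2( 1, 2, 3 ) = ( 1 + 2*2 + 3 + 2 ) >> 2 = 2. */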
#if VECTORIZE
int x264_predict_8x8_ddl( const local pixel *src, int src_stride, const local pixel *top )
{
    int8 pr0, pr1, pr2, pr3;
    // Upper half of pred[]
    pr0.s0 = ( 2 + top[0] + 2*top[1] + top[2] ) >> 2;
    pr0.s1 = ( 2 + top[1] + 2*top[2] + top[3] ) >> 2;
    pr0.s2 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
    pr0.s3 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
    pr0.s4 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
    pr0.s5 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
    pr0.s6 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
    pr0.s7 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
    pr1.s0 = ( 2 + top[1] + 2*top[2] + top[3] ) >> 2;
    pr1.s1 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
    pr1.s2 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
    pr1.s3 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
    pr1.s4 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
    pr1.s5 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
    pr1.s6 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
    pr1.s7 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
    pr2.s0 = ( 2 + top[2] + 2*top[3] + top[4] ) >> 2;
    pr2.s1 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
    pr2.s2 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
    pr2.s3 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
    pr2.s4 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
    pr2.s5 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
    pr2.s6 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
    pr2.s7 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
    pr3.s0 = ( 2 + top[3] + 2*top[4] + top[5] ) >> 2;
    pr3.s1 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
    pr3.s2 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
    pr3.s3 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
    pr3.s4 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
    pr3.s5 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
    pr3.s6 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
    pr3.s7 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
    // Lower half of pred[]
    pr0.s0 = ( 2 + top[4] + 2*top[5] + top[6] ) >> 2;
    pr0.s1 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
    pr0.s2 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
    pr0.s3 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
    pr0.s4 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
    pr0.s5 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
    pr0.s6 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
    pr0.s7 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
    pr1.s0 = ( 2 + top[5] + 2*top[6] + top[7] ) >> 2;
    pr1.s1 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
    pr1.s2 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
    pr1.s3 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
    pr1.s4 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
    pr1.s5 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
    pr1.s6 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
    pr1.s7 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;
    pr2.s0 = ( 2 + top[6] + 2*top[7] + top[8] ) >> 2;
    pr2.s1 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
    pr2.s2 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
    pr2.s3 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
    pr2.s4 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
    pr2.s5 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
    pr2.s6 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;
    pr2.s7 = ( 2 + top[13] + 2*top[14] + top[15] ) >> 2;
    pr3.s0 = ( 2 + top[7] + 2*top[8] + top[9] ) >> 2;
    pr3.s1 = ( 2 + top[8] + 2*top[9] + top[10] ) >> 2;
    pr3.s2 = ( 2 + top[9] + 2*top[10] + top[11] ) >> 2;
    pr3.s3 = ( 2 + top[10] + 2*top[11] + top[12] ) >> 2;
    pr3.s4 = ( 2 + top[11] + 2*top[12] + top[13] ) >> 2;
    pr3.s5 = ( 2 + top[12] + 2*top[13] + top[14] ) >> 2;
    pr3.s6 = ( 2 + top[13] + 2*top[14] + top[15] ) >> 2;
    pr3.s7 = ( 2 + top[14] + 3*top[15] ) >> 2;
    return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
}
int x264_predict_8x8_ddr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
{
    int8 pr0, pr1, pr2, pr3;
    // Upper half of pred[]
    pr3.s0 = F2( left[1], left[2], left[3] );
    pr2.s0 = pr3.s1 = F2( left[0], left[1], left[2] );
    pr1.s0 = pr2.s1 = pr3.s2 = F2( left[1], left[0], left_top );
    pr0.s0 = pr1.s1 = pr2.s2 = pr3.s3 = F2( left[0], left_top, top[0] );
    pr0.s1 = pr1.s2 = pr2.s3 = pr3.s4 = F2( left_top, top[0], top[1] );
    pr0.s2 = pr1.s3 = pr2.s4 = pr3.s5 = F2( top[0], top[1], top[2] );
    pr0.s3 = pr1.s4 = pr2.s5 = pr3.s6 = F2( top[1], top[2], top[3] );
    pr0.s4 = pr1.s5 = pr2.s6 = pr3.s7 = F2( top[2], top[3], top[4] );
    pr0.s5 = pr1.s6 = pr2.s7 = F2( top[3], top[4], top[5] );
    pr0.s6 = pr1.s7 = F2( top[4], top[5], top[6] );
    pr0.s7 = F2( top[5], top[6], top[7] );
    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
    // Lower half of pred[]
    pr3.s0 = F2( left[5], left[6], left[7] );
    pr2.s0 = pr3.s1 = F2( left[4], left[5], left[6] );
    pr1.s0 = pr2.s1 = pr3.s2 = F2( left[3], left[4], left[5] );
    pr0.s0 = pr1.s1 = pr2.s2 = pr3.s3 = F2( left[2], left[3], left[4] );
    pr0.s1 = pr1.s2 = pr2.s3 = pr3.s4 = F2( left[1], left[2], left[3] );
    pr0.s2 = pr1.s3 = pr2.s4 = pr3.s5 = F2( left[0], left[1], left[2] );
    pr0.s3 = pr1.s4 = pr2.s5 = pr3.s6 = F2( left[1], left[0], left_top );
    pr0.s4 = pr1.s5 = pr2.s6 = pr3.s7 = F2( left[0], left_top, top[0] );
    pr0.s5 = pr1.s6 = pr2.s7 = F2( left_top, top[0], top[1] );
    pr0.s6 = pr1.s7 = F2( top[0], top[1], top[2] );
    pr0.s7 = F2( top[1], top[2], top[3] );
    return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
}
int x264_predict_8x8_vr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
{
    int8 pr0, pr1, pr2, pr3;
    // Upper half of pred[]
    pr2.s0 = F2( left[1], left[0], left_top );
    pr3.s0 = F2( left[2], left[1], left[0] );
    pr1.s0 = pr3.s1 = F2( left[0], left_top, top[0] );
    pr0.s0 = pr2.s1 = F1( left_top, top[0] );
    pr1.s1 = pr3.s2 = F2( left_top, top[0], top[1] );
    pr0.s1 = pr2.s2 = F1( top[0], top[1] );
    pr1.s2 = pr3.s3 = F2( top[0], top[1], top[2] );
    pr0.s2 = pr2.s3 = F1( top[1], top[2] );
    pr1.s3 = pr3.s4 = F2( top[1], top[2], top[3] );
    pr0.s3 = pr2.s4 = F1( top[2], top[3] );
    pr1.s4 = pr3.s5 = F2( top[2], top[3], top[4] );
    pr0.s4 = pr2.s5 = F1( top[3], top[4] );
    pr1.s5 = pr3.s6 = F2( top[3], top[4], top[5] );
    pr0.s5 = pr2.s6 = F1( top[4], top[5] );
    pr1.s6 = pr3.s7 = F2( top[4], top[5], top[6] );
    pr0.s6 = pr2.s7 = F1( top[5], top[6] );
    pr1.s7 = F2( top[5], top[6], top[7] );
    pr0.s7 = F1( top[6], top[7] );
    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
    // Lower half of pred[]
    pr2.s0 = F2( left[5], left[4], left[3] );
    pr3.s0 = F2( left[6], left[5], left[4] );
    pr0.s0 = pr2.s1 = F2( left[3], left[2], left[1] );
    pr1.s0 = pr3.s1 = F2( left[4], left[3], left[2] );
    pr0.s1 = pr2.s2 = F2( left[1], left[0], left_top );
    pr1.s1 = pr3.s2 = F2( left[2], left[1], left[0] );
    pr1.s2 = pr3.s3 = F2( left[0], left_top, top[0] );
    pr0.s2 = pr2.s3 = F1( left_top, top[0] );
    pr1.s3 = pr3.s4 = F2( left_top, top[0], top[1] );
    pr0.s3 = pr2.s4 = F1( top[0], top[1] );
    pr1.s4 = pr3.s5 = F2( top[0], top[1], top[2] );
    pr0.s4 = pr2.s5 = F1( top[1], top[2] );
    pr1.s5 = pr3.s6 = F2( top[1], top[2], top[3] );
    pr0.s5 = pr2.s6 = F1( top[2], top[3] );
    pr1.s6 = pr3.s7 = F2( top[2], top[3], top[4] );
    pr0.s6 = pr2.s7 = F1( top[3], top[4] );
    pr1.s7 = F2( top[3], top[4], top[5] );
    pr0.s7 = F1( top[4], top[5] );
    return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
}
int x264_predict_8x8_hd( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
{
    int8 pr0, pr1, pr2, pr3;
    // Upper half of pred[]
    pr0.s0 = F1( left_top, left[0] );
    pr0.s1 = F2( left[0], left_top, top[0] );
    pr0.s2 = F2( top[1], top[0], left_top );
    pr0.s3 = F2( top[2], top[1], top[0] );
    pr0.s4 = F2( top[3], top[2], top[1] );
    pr0.s5 = F2( top[4], top[3], top[2] );
    pr0.s6 = F2( top[5], top[4], top[3] );
    pr0.s7 = F2( top[6], top[5], top[4] );
    pr1.s0 = F1( left[0], left[1] );
    pr1.s1 = F2( left_top, left[0], left[1] );
    pr1.s2 = F1( left_top, left[0] );
    pr1.s3 = F2( left[0], left_top, top[0] );
    pr1.s4 = F2( top[1], top[0], left_top );
    pr1.s5 = F2( top[2], top[1], top[0] );
    pr1.s6 = F2( top[3], top[2], top[1] );
    pr1.s7 = F2( top[4], top[3], top[2] );
    pr2.s0 = F1( left[1], left[2] );
    pr2.s1 = F2( left[0], left[1], left[2] );
    pr2.s2 = F1( left[0], left[1] );
    pr2.s3 = F2( left_top, left[0], left[1] );
    pr2.s4 = F1( left_top, left[0] );
    pr2.s5 = F2( left[0], left_top, top[0] );
    pr2.s6 = F2( top[1], top[0], left_top );
    pr2.s7 = F2( top[2], top[1], top[0] );
    pr3.s0 = F1( left[2], left[3] );
    pr3.s1 = F2( left[1], left[2], left[3] );
    pr3.s2 = F1( left[1], left[2] );
    pr3.s3 = F2( left[0], left[1], left[2] );
    pr3.s4 = F1( left[0], left[1] );
    pr3.s5 = F2( left_top, left[0], left[1] );
    pr3.s6 = F1( left_top, left[0] );
    pr3.s7 = F2( left[0], left_top, top[0] );
    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
    // Lower half of pred[]
    pr0.s0 = F1( left[3], left[4] );
    pr0.s1 = F2( left[2], left[3], left[4] );
    pr0.s2 = F1( left[2], left[3] );
    pr0.s3 = F2( left[1], left[2], left[3] );
    pr0.s4 = F1( left[1], left[2] );
    pr0.s5 = F2( left[0], left[1], left[2] );
    pr0.s6 = F1( left[0], left[1] );
    pr0.s7 = F2( left_top, left[0], left[1] );
    pr1.s0 = F1( left[4], left[5] );
    pr1.s1 = F2( left[3], left[4], left[5] );
    pr1.s2 = F1( left[3], left[4] );
    pr1.s3 = F2( left[2], left[3], left[4] );
    pr1.s4 = F1( left[2], left[3] );
    pr1.s5 = F2( left[1], left[2], left[3] );
    pr1.s6 = F1( left[1], left[2] );
    pr1.s7 = F2( left[0], left[1], left[2] );
    pr2.s0 = F1( left[5], left[6] );
    pr2.s1 = F2( left[4], left[5], left[6] );
    pr2.s2 = F1( left[4], left[5] );
    pr2.s3 = F2( left[3], left[4], left[5] );
    pr2.s4 = F1( left[3], left[4] );
    pr2.s5 = F2( left[2], left[3], left[4] );
    pr2.s6 = F1( left[2], left[3] );
    pr2.s7 = F2( left[1], left[2], left[3] );
    pr3.s0 = F1( left[6], left[7] );
    pr3.s1 = F2( left[5], left[6], left[7] );
    pr3.s2 = F1( left[5], left[6] );
    pr3.s3 = F2( left[4], left[5], left[6] );
    pr3.s4 = F1( left[4], left[5] );
    pr3.s5 = F2( left[3], left[4], left[5] );
    pr3.s6 = F1( left[3], left[4] );
    pr3.s7 = F2( left[2], left[3], left[4] );
    return satd + satd_8x4_intra_lr( src + (src_stride << 2), src_stride, pr0, pr1, pr2, pr3 );
}
int x264_predict_8x8_vl( const local pixel *src, int src_stride, const local pixel *top )
{
    int8 pr0, pr1, pr2, pr3;
    // Upper half of pred[]
    pr0.s0 = F1( top[0], top[1] );
    pr1.s0 = F2( top[0], top[1], top[2] );
    pr2.s0 = pr0.s1 = F1( top[1], top[2] );
    pr3.s0 = pr1.s1 = F2( top[1], top[2], top[3] );
    pr2.s1 = pr0.s2 = F1( top[2], top[3] );
    pr3.s1 = pr1.s2 = F2( top[2], top[3], top[4] );
    pr2.s2 = pr0.s3 = F1( top[3], top[4] );
    pr3.s2 = pr1.s3 = F2( top[3], top[4], top[5] );
    pr2.s3 = pr0.s4 = F1( top[4], top[5] );
    pr3.s3 = pr1.s4 = F2( top[4], top[5], top[6] );
    pr2.s4 = pr0.s5 = F1( top[5], top[6] );
    pr3.s4 = pr1.s5 = F2( top[5], top[6], top[7] );
    pr2.s5 = pr0.s6 = F1( top[6], top[7] );
    pr3.s5 = pr1.s6 = F2( top[6], top[7], top[8] );
    pr2.s6 = pr0.s7 = F1( top[7], top[8] );
    pr3.s6 = pr1.s7 = F2( top[7], top[8], top[9] );
    pr2.s7 = F1( top[8], top[9] );
    pr3.s7 = F2( top[8], top[9], top[10] );
    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
    // Lower half of pred[]
    pr0.s0 = F1( top[2], top[3] );
    pr1.s0 = F2( top[2], top[3], top[4] );
    pr2.s0 = pr0.s1 = F1( top[3], top[4] );
    pr3.s0 = pr1.s1 = F2( top[3], top[4], top[5] );
    pr2.s1 = pr0.s2 = F1( top[4], top[5] );
    pr3.s1 = pr1.s2 = F2( top[4], top[5], top[6] );
    pr2.s2 = pr0.s3 = F1( top[5], top[6] );
    pr3.s2 = pr1.s3 = F2( top[5], top[6], top[7] );
    pr2.s3 = pr0.s4 = F1( top[6], top[7] );
    pr3.s3 = pr1.s4 = F2( top[6], top[7], top[8] );
    pr2.s4 = pr0.s5 = F1( top[7], top[8] );
    pr3.s4 = pr1.s5 = F2( top[7], top[8], top[9] );
    pr2.s5 = pr0.s6 = F1( top[8], top[9] );
    pr3.s5 = pr1.s6 = F2( top[8], top[9], top[10] );
    pr2.s6 = pr0.s7 = F1( top[9], top[10] );
    pr3.s6 = pr1.s7 = F2( top[9], top[10], top[11] );
    pr2.s7 = F1( top[10], top[11] );
    pr3.s7 = F2( top[10], top[11], top[12] );
    return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
}
int x264_predict_8x8_hu( const local pixel *src, int src_stride, const local pixel *left )
{
    int8 pr0, pr1, pr2, pr3;
    // Upper half of pred[]
    pr0.s0 = F1( left[0], left[1] );
    pr0.s1 = F2( left[0], left[1], left[2] );
    pr0.s2 = F1( left[1], left[2] );
    pr0.s3 = F2( left[1], left[2], left[3] );
    pr0.s4 = F1( left[2], left[3] );
    pr0.s5 = F2( left[2], left[3], left[4] );
    pr0.s6 = F1( left[3], left[4] );
    pr0.s7 = F2( left[3], left[4], left[5] );
    pr1.s0 = F1( left[1], left[2] );
    pr1.s1 = F2( left[1], left[2], left[3] );
    pr1.s2 = F1( left[2], left[3] );
    pr1.s3 = F2( left[2], left[3], left[4] );
    pr1.s4 = F1( left[3], left[4] );
    pr1.s5 = F2( left[3], left[4], left[5] );
    pr1.s6 = F1( left[4], left[5] );
    pr1.s7 = F2( left[4], left[5], left[6] );
    pr2.s0 = F1( left[2], left[3] );
    pr2.s1 = F2( left[2], left[3], left[4] );
    pr2.s2 = F1( left[3], left[4] );
    pr2.s3 = F2( left[3], left[4], left[5] );
    pr2.s4 = F1( left[4], left[5] );
    pr2.s5 = F2( left[4], left[5], left[6] );
    pr2.s6 = F1( left[5], left[6] );
    pr2.s7 = F2( left[5], left[6], left[7] );
    pr3.s0 = F1( left[3], left[4] );
    pr3.s1 = F2( left[3], left[4], left[5] );
    pr3.s2 = F1( left[4], left[5] );
    pr3.s3 = F2( left[4], left[5], left[6] );
    pr3.s4 = F1( left[5], left[6] );
    pr3.s5 = F2( left[5], left[6], left[7] );
    pr3.s6 = F1( left[6], left[7] );
    pr3.s7 = F2( left[6], left[7], left[7] );
    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
    // Lower half of pred[]
    pr0.s0 = F1( left[4], left[5] );
    pr0.s1 = F2( left[4], left[5], left[6] );
    pr0.s2 = F1( left[5], left[6] );
    pr0.s3 = F2( left[5], left[6], left[7] );
    pr0.s4 = F1( left[6], left[7] );
    pr0.s5 = F2( left[6], left[7], left[7] );
    pr0.s6 = left[7];
    pr0.s7 = left[7];
    pr1.s0 = F1( left[5], left[6] );
    pr1.s1 = F2( left[5], left[6], left[7] );
    pr1.s2 = F1( left[6], left[7] );
    pr1.s3 = F2( left[6], left[7], left[7] );
    pr1.s4 = left[7];
    pr1.s5 = left[7];
    pr1.s6 = left[7];
    pr1.s7 = left[7];
    pr2.s0 = F1( left[6], left[7] );
    pr2.s1 = F2( left[6], left[7], left[7] );
    pr2.s2 = left[7];
    pr2.s3 = left[7];
    pr2.s4 = left[7];
    pr2.s5 = left[7];
    pr2.s6 = left[7];
    pr2.s7 = left[7];
    pr3 = (int8)left[7];
    return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
}
int x264_predict_8x8c_h( const local pixel *src, int src_stride )
{
    const local pixel *src_l = src;
    int8 pr0, pr1, pr2, pr3;
    // Upper half of pred[]
    pr0 = (int8)src[-1]; src += src_stride;
    pr1 = (int8)src[-1]; src += src_stride;
    pr2 = (int8)src[-1]; src += src_stride;
    pr3 = (int8)src[-1]; src += src_stride;
    int satd = satd_8x4_intra_lr( src_l, src_stride, pr0, pr1, pr2, pr3 );
    // Lower half of pred[]
    pr0 = (int8)src[-1]; src += src_stride;
    pr1 = (int8)src[-1]; src += src_stride;
    pr2 = (int8)src[-1]; src += src_stride;
    pr3 = (int8)src[-1];
    return satd + satd_8x4_intra_lr( src_l + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
}
int x264_predict_8x8c_v( const local pixel *src, int src_stride )
{
    int8 pred = convert_int8( vload8( 0, &src[-src_stride] ) );
    return satd_8x4_intra_lr( src, src_stride, pred, pred, pred, pred ) +
           satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pred, pred, pred, pred );
}
int x264_predict_8x8c_p( const local pixel *src, int src_stride )
{
    int H = 0, V = 0;
    for( int i = 0; i < 4; i++ )
    {
        H += (i + 1) * (src[4 + i - src_stride] - src[2 - i - src_stride]);
        V += (i + 1) * (src[-1 + (i + 4) * src_stride] - src[-1 + (2 - i) * src_stride]);
    }
    int a = 16 * (src[-1 + 7 * src_stride] + src[7 - src_stride]);
    int b = (17 * H + 16) >> 5;
    int c = (17 * V + 16) >> 5;
    int i00 = a - 3 * b - 3 * c + 16;
    // Upper half of pred[]
    int pix = i00;
    int8 pr0, pr1, pr2, pr3;
    pr0.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
    pix = i00;
    pr1.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
    pix = i00;
    pr2.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
    pix = i00;
    pr3.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
    int satd = satd_8x4_intra_lr( src, src_stride, pr0, pr1, pr2, pr3 );
    // Lower half of pred[]
    pix = i00;
    pr0.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr0.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
    pix = i00;
    pr1.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr1.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
    pix = i00;
    pr2.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr2.s7 = x264_clip_pixel( pix >> 5 ); i00 += c;
    pix = i00;
    pr3.s0 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s1 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s2 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s3 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s4 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s5 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s6 = x264_clip_pixel( pix >> 5 ); pix += b;
    pr3.s7 = x264_clip_pixel( pix >> 5 );
    return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, pr0, pr1, pr2, pr3 );
}
int x264_predict_8x8c_dc( const local pixel *src, int src_stride )
{
    int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
    for( int i = 0; i < 4; i++ )
    {
        s0 += src[i - src_stride];
        s1 += src[i + 4 - src_stride];
        s2 += src[-1 + i * src_stride];
        s3 += src[-1 + (i+4)*src_stride];
    }
    // Upper half of pred[]
    int8 dc0;
    dc0.lo = (int4)( (s0 + s2 + 4) >> 3 );
    dc0.hi = (int4)( (s1 + 2) >> 2 );
    int satd = satd_8x4_intra_lr( src, src_stride, dc0, dc0, dc0, dc0 );
    // Lower half of pred[]
    dc0.lo = (int4)( (s3 + 2) >> 2 );
    dc0.hi = (int4)( (s1 + s3 + 4) >> 3 );
    return satd + satd_8x4_intra_lr( src + ( src_stride << 2 ), src_stride, dc0, dc0, dc0, dc0 );
}
#else /* not vectorized: private is cheap, registers are scarce */
int x264_predict_8x8_ddl( const local pixel *src, int src_stride, const local pixel *top )
{
    private pixel pred[32];
    // Upper half of pred[]
    for( int y = 0; y < 4; y++ )
    {
        for( int x = 0; x < 8; x++ )
        {
            pixel x_plus_y = (pixel) clamp_int( x + y, 0, 13 );
            pred[x + y*8] = ( 2 + top[x_plus_y] + 2*top[x_plus_y + 1] + top[x_plus_y + 2] ) >> 2;
        }
    }
    int satd = satd_8x4_lp( src, src_stride, pred, 8 );
    // Lower half of pred[]
    for( int y = 4; y < 8; y++ )
    {
        for( int x = 0; x < 8; x++ )
        {
            pixel x_plus_y = (pixel) clamp_int( x + y, 0, 13 );
            pred[x + ( y - 4 )*8] = ( 2 + top[x_plus_y] + 2*top[x_plus_y + 1] + top[x_plus_y + 2] ) >> 2;
        }
    }
    pred[31] = ( 2 + top[14] + 3*top[15] ) >> 2;
    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
    return satd;
}
int x264_predict_8x8_ddr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
{
    private pixel pred[32];
#define PRED( x, y ) pred[(x) + (y)*8]
    // Upper half of pred[]
    PRED( 0, 3 ) = F2( left[1], left[2], left[3] );
    PRED( 0, 2 ) = PRED( 1, 3 ) = F2( left[0], left[1], left[2] );
    PRED( 0, 1 ) = PRED( 1, 2 ) = PRED( 2, 3 ) = F2( left[1], left[0], left_top );
    PRED( 0, 0 ) = PRED( 1, 1 ) = PRED( 2, 2 ) = PRED( 3, 3 ) = F2( left[0], left_top, top[0] );
    PRED( 1, 0 ) = PRED( 2, 1 ) = PRED( 3, 2 ) = PRED( 4, 3 ) = F2( left_top, top[0], top[1] );
    PRED( 2, 0 ) = PRED( 3, 1 ) = PRED( 4, 2 ) = PRED( 5, 3 ) = F2( top[0], top[1], top[2] );
    PRED( 3, 0 ) = PRED( 4, 1 ) = PRED( 5, 2 ) = PRED( 6, 3 ) = F2( top[1], top[2], top[3] );
    PRED( 4, 0 ) = PRED( 5, 1 ) = PRED( 6, 2 ) = PRED( 7, 3 ) = F2( top[2], top[3], top[4] );
    PRED( 5, 0 ) = PRED( 6, 1 ) = PRED( 7, 2 ) = F2( top[3], top[4], top[5] );
    PRED( 6, 0 ) = PRED( 7, 1 ) = F2( top[4], top[5], top[6] );
    PRED( 7, 0 ) = F2( top[5], top[6], top[7] );
    int satd = satd_8x4_lp( src, src_stride, pred, 8 );
    // Lower half of pred[]
    PRED( 0, 3 ) = F2( left[5], left[6], left[7] );
    PRED( 0, 2 ) = PRED( 1, 3 ) = F2( left[4], left[5], left[6] );
    PRED( 0, 1 ) = PRED( 1, 2 ) = PRED( 2, 3 ) = F2( left[3], left[4], left[5] );
    PRED( 0, 0 ) = PRED( 1, 1 ) = PRED( 2, 2 ) = PRED( 3, 3 ) = F2( left[2], left[3], left[4] );
    PRED( 1, 0 ) = PRED( 2, 1 ) = PRED( 3, 2 ) = PRED( 4, 3 ) = F2( left[1], left[2], left[3] );
    PRED( 2, 0 ) = PRED( 3, 1 ) = PRED( 4, 2 ) = PRED( 5, 3 ) = F2( left[0], left[1], left[2] );
    PRED( 3, 0 ) = PRED( 4, 1 ) = PRED( 5, 2 ) = PRED( 6, 3 ) = F2( left[1], left[0], left_top );
    PRED( 4, 0 ) = PRED( 5, 1 ) = PRED( 6, 2 ) = PRED( 7, 3 ) = F2( left[0], left_top, top[0] );
    PRED( 5, 0 ) = PRED( 6, 1 ) = PRED( 7, 2 ) = F2( left_top, top[0], top[1] );
    PRED( 6, 0 ) = PRED( 7, 1 ) = F2( top[0], top[1], top[2] );
    PRED( 7, 0 ) = F2( top[1], top[2], top[3] );
    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
    return satd;
#undef PRED
}
int x264_predict_8x8_vr( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
{
    private pixel pred[32];
#define PRED( x, y ) pred[(x) + (y)*8]
    // Upper half of pred[]
    PRED( 0, 2 ) = F2( left[1], left[0], left_top );
    PRED( 0, 3 ) = F2( left[2], left[1], left[0] );
    PRED( 0, 1 ) = PRED( 1, 3 ) = F2( left[0], left_top, top[0] );
    PRED( 0, 0 ) = PRED( 1, 2 ) = F1( left_top, top[0] );
    PRED( 1, 1 ) = PRED( 2, 3 ) = F2( left_top, top[0], top[1] );
    PRED( 1, 0 ) = PRED( 2, 2 ) = F1( top[0], top[1] );
    PRED( 2, 1 ) = PRED( 3, 3 ) = F2( top[0], top[1], top[2] );
    PRED( 2, 0 ) = PRED( 3, 2 ) = F1( top[1], top[2] );
    PRED( 3, 1 ) = PRED( 4, 3 ) = F2( top[1], top[2], top[3] );
    PRED( 3, 0 ) = PRED( 4, 2 ) = F1( top[2], top[3] );
    PRED( 4, 1 ) = PRED( 5, 3 ) = F2( top[2], top[3], top[4] );
    PRED( 4, 0 ) = PRED( 5, 2 ) = F1( top[3], top[4] );
    PRED( 5, 1 ) = PRED( 6, 3 ) = F2( top[3], top[4], top[5] );
    PRED( 5, 0 ) = PRED( 6, 2 ) = F1( top[4], top[5] );
    PRED( 6, 1 ) = PRED( 7, 3 ) = F2( top[4], top[5], top[6] );
    PRED( 6, 0 ) = PRED( 7, 2 ) = F1( top[5], top[6] );
    PRED( 7, 1 ) = F2( top[5], top[6], top[7] );
    PRED( 7, 0 ) = F1( top[6], top[7] );
    int satd = satd_8x4_lp( src, src_stride, pred, 8 );
    // Lower half of pred[]
    PRED( 0, 2 ) = F2( left[5], left[4], left[3] );
    PRED( 0, 3 ) = F2( left[6], left[5], left[4] );
    PRED( 0, 0 ) = PRED( 1, 2 ) = F2( left[3], left[2], left[1] );
    PRED( 0, 1 ) = PRED( 1, 3 ) = F2( left[4], left[3], left[2] );
    PRED( 1, 0 ) = PRED( 2, 2 ) = F2( left[1], left[0], left_top );
    PRED( 1, 1 ) = PRED( 2, 3 ) = F2( left[2], left[1], left[0] );
    PRED( 2, 1 ) = PRED( 3, 3 ) = F2( left[0], left_top, top[0] );
    PRED( 2, 0 ) = PRED( 3, 2 ) = F1( left_top, top[0] );
    PRED( 3, 1 ) = PRED( 4, 3 ) = F2( left_top, top[0], top[1] );
    PRED( 3, 0 ) = PRED( 4, 2 ) = F1( top[0], top[1] );
    PRED( 4, 1 ) = PRED( 5, 3 ) = F2( top[0], top[1], top[2] );
    PRED( 4, 0 ) = PRED( 5, 2 ) = F1( top[1], top[2] );
    PRED( 5, 1 ) = PRED( 6, 3 ) = F2( top[1], top[2], top[3] );
    PRED( 5, 0 ) = PRED( 6, 2 ) = F1( top[2], top[3] );
    PRED( 6, 1 ) = PRED( 7, 3 ) = F2( top[2], top[3], top[4] );
    PRED( 6, 0 ) = PRED( 7, 2 ) = F1( top[3], top[4] );
    PRED( 7, 1 ) = F2( top[3], top[4], top[5] );
    PRED( 7, 0 ) = F1( top[4], top[5] );
    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
    return satd;
#undef PRED
}
inline uint32_t pack16to32( uint32_t a, uint32_t b )
{
    return a + (b << 16);
}
inline uint32_t pack8to16( uint32_t a, uint32_t b )
{
    return a + (b << 8);
}
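/* pack8to16()/pack16to32() pack pairs of 8-bit predicted pixels into progressively
 * wider words so the scalar prediction functions below can emit four predicted
 * pixels at a time with a single vstore4(). */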
int x264_predict_8x8_hd( const local pixel *src, int src_stride, const local pixel *top, const local pixel *left, pixel left_top )
{
    private pixel pred[32];
    int satd;
    int p1 = pack8to16( (F1( left[6], left[7] )), ((left[5] + 2 * left[6] + left[7] + 2) >> 2) );
    int p2 = pack8to16( (F1( left[5], left[6] )), ((left[4] + 2 * left[5] + left[6] + 2) >> 2) );
    int p3 = pack8to16( (F1( left[4], left[5] )), ((left[3] + 2 * left[4] + left[5] + 2) >> 2) );
    int p4 = pack8to16( (F1( left[3], left[4] )), ((left[2] + 2 * left[3] + left[4] + 2) >> 2) );
    int p5 = pack8to16( (F1( left[2], left[3] )), ((left[1] + 2 * left[2] + left[3] + 2) >> 2) );
    int p6 = pack8to16( (F1( left[1], left[2] )), ((left[0] + 2 * left[1] + left[2] + 2) >> 2) );
    int p7 = pack8to16( (F1( left[0], left[1] )), ((left_top + 2 * left[0] + left[1] + 2) >> 2) );
    int p8 = pack8to16( (F1( left_top, left[0] )), ((left[0] + 2 * left_top + top[0] + 2) >> 2) );
    int p9 = pack8to16( (F2( top[1], top[0], left_top )), (F2( top[2], top[1], top[0] )) );
    int p10 = pack8to16( (F2( top[3], top[2], top[1] )), (F2( top[4], top[3], top[2] )) );
    int p11 = pack8to16( (F2( top[5], top[4], top[3] )), (F2( top[6], top[5], top[4] )) );
    // Upper half of pred[]
    vstore4( as_uchar4( pack16to32( p8, p9 ) ), 0, &pred[0 + 0 * 8] );
    vstore4( as_uchar4( pack16to32( p10, p11 ) ), 0, &pred[4 + 0 * 8] );
    vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[0 + 1 * 8] );
    vstore4( as_uchar4( pack16to32( p9, p10 ) ), 0, &pred[4 + 1 * 8] );
    vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[0 + 2 * 8] );
    vstore4( as_uchar4( pack16to32( p8, p9 ) ), 0, &pred[4 + 2 * 8] );
    vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[0 + 3 * 8] );
    vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[4 + 3 * 8] );
    satd = satd_8x4_lp( src, src_stride, pred, 8 );
    // Lower half of pred[]
    vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[0 + 0 * 8] );
    vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[4 + 0 * 8] );
    vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[0 + 1 * 8] );
    vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[4 + 1 * 8] );
    vstore4( as_uchar4( pack16to32( p2, p3 ) ), 0, &pred[0 + 2 * 8] );
    vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[4 + 2 * 8] );
    vstore4( as_uchar4( pack16to32( p1, p2 ) ), 0, &pred[0 + 3 * 8] );
    vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[4 + 3 * 8] );
    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
    return satd;
}
int x264_predict_8x8_vl( const local pixel *src, int src_stride, const local pixel *top )
{
    private pixel pred[32];
    int satd;
#define PRED( x, y ) pred[(x) + (y)*8]
    // Upper half of pred[]
    PRED( 0, 0 ) = F1( top[0], top[1] );
    PRED( 0, 1 ) = F2( top[0], top[1], top[2] );
    PRED( 0, 2 ) = PRED( 1, 0 ) = F1( top[1], top[2] );
    PRED( 0, 3 ) = PRED( 1, 1 ) = F2( top[1], top[2], top[3] );
    PRED( 1, 2 ) = PRED( 2, 0 ) = F1( top[2], top[3] );
    PRED( 1, 3 ) = PRED( 2, 1 ) = F2( top[2], top[3], top[4] );
    PRED( 2, 2 ) = PRED( 3, 0 ) = F1( top[3], top[4] );
    PRED( 2, 3 ) = PRED( 3, 1 ) = F2( top[3], top[4], top[5] );
    PRED( 3, 2 ) = PRED( 4, 0 ) = F1( top[4], top[5] );
    PRED( 3, 3 ) = PRED( 4, 1 ) = F2( top[4], top[5], top[6] );
    PRED( 4, 2 ) = PRED( 5, 0 ) = F1( top[5], top[6] );
    PRED( 4, 3 ) = PRED( 5, 1 ) = F2( top[5], top[6], top[7] );
    PRED( 5, 2 ) = PRED( 6, 0 ) = F1( top[6], top[7] );
    PRED( 5, 3 ) = PRED( 6, 1 ) = F2( top[6], top[7], top[8] );
    PRED( 6, 2 ) = PRED( 7, 0 ) = F1( top[7], top[8] );
    PRED( 6, 3 ) = PRED( 7, 1 ) = F2( top[7], top[8], top[9] );
    PRED( 7, 2 ) = F1( top[8], top[9] );
    PRED( 7, 3 ) = F2( top[8], top[9], top[10] );
    satd = satd_8x4_lp( src, src_stride, pred, 8 );
    // Lower half of pred[]
    PRED( 0, 0 ) = F1( top[2], top[3] );
    PRED( 0, 1 ) = F2( top[2], top[3], top[4] );
    PRED( 0, 2 ) = PRED( 1, 0 ) = F1( top[3], top[4] );
    PRED( 0, 3 ) = PRED( 1, 1 ) = F2( top[3], top[4], top[5] );
    PRED( 1, 2 ) = PRED( 2, 0 ) = F1( top[4], top[5] );
    PRED( 1, 3 ) = PRED( 2, 1 ) = F2( top[4], top[5], top[6] );
    PRED( 2, 2 ) = PRED( 3, 0 ) = F1( top[5], top[6] );
    PRED( 2, 3 ) = PRED( 3, 1 ) = F2( top[5], top[6], top[7] );
    PRED( 3, 2 ) = PRED( 4, 0 ) = F1( top[6], top[7] );
    PRED( 3, 3 ) = PRED( 4, 1 ) = F2( top[6], top[7], top[8] );
    PRED( 4, 2 ) = PRED( 5, 0 ) = F1( top[7], top[8] );
    PRED( 4, 3 ) = PRED( 5, 1 ) = F2( top[7], top[8], top[9] );
    PRED( 5, 2 ) = PRED( 6, 0 ) = F1( top[8], top[9] );
    PRED( 5, 3 ) = PRED( 6, 1 ) = F2( top[8], top[9], top[10] );
    PRED( 6, 2 ) = PRED( 7, 0 ) = F1( top[9], top[10] );
    PRED( 6, 3 ) = PRED( 7, 1 ) = F2( top[9], top[10], top[11] );
    PRED( 7, 2 ) = F1( top[10], top[11] );
    PRED( 7, 3 ) = F2( top[10], top[11], top[12] );
    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
    return satd;
#undef PRED
}
int x264_predict_8x8_hu( const local pixel *src, int src_stride, const local pixel *left )
{
    private pixel pred[32];
    int satd;
    int p1 = pack8to16( (F1( left[0], left[1] )), ((left[0] + 2 * left[1] + left[2] + 2) >> 2) );
    int p2 = pack8to16( (F1( left[1], left[2] )), ((left[1] + 2 * left[2] + left[3] + 2) >> 2) );
    int p3 = pack8to16( (F1( left[2], left[3] )), ((left[2] + 2 * left[3] + left[4] + 2) >> 2) );
    int p4 = pack8to16( (F1( left[3], left[4] )), ((left[3] + 2 * left[4] + left[5] + 2) >> 2) );
    int p5 = pack8to16( (F1( left[4], left[5] )), ((left[4] + 2 * left[5] + left[6] + 2) >> 2) );
    int p6 = pack8to16( (F1( left[5], left[6] )), ((left[5] + 2 * left[6] + left[7] + 2) >> 2) );
    int p7 = pack8to16( (F1( left[6], left[7] )), ((left[6] + 2 * left[7] + left[7] + 2) >> 2) );
    int p8 = pack8to16( left[7], left[7] );
    // Upper half of pred[]
    vstore4( as_uchar4( pack16to32( p1, p2 ) ), 0, &pred[( 0 ) + ( 0 ) * 8] );
    vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[( 4 ) + ( 0 ) * 8] );
    vstore4( as_uchar4( pack16to32( p2, p3 ) ), 0, &pred[( 0 ) + ( 1 ) * 8] );
    vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[( 4 ) + ( 1 ) * 8] );
    vstore4( as_uchar4( pack16to32( p3, p4 ) ), 0, &pred[( 0 ) + ( 2 ) * 8] );
    vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[( 4 ) + ( 2 ) * 8] );
    vstore4( as_uchar4( pack16to32( p4, p5 ) ), 0, &pred[( 0 ) + ( 3 ) * 8] );
    vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[( 4 ) + ( 3 ) * 8] );
    satd = satd_8x4_lp( src, src_stride, pred, 8 );
    // Lower half of pred[]
    vstore4( as_uchar4( pack16to32( p5, p6 ) ), 0, &pred[( 0 ) + ( 0 ) * 8] );
    vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[( 4 ) + ( 0 ) * 8] );
    vstore4( as_uchar4( pack16to32( p6, p7 ) ), 0, &pred[( 0 ) + ( 1 ) * 8] );
    vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 1 ) * 8] );
    vstore4( as_uchar4( pack16to32( p7, p8 ) ), 0, &pred[( 0 ) + ( 2 ) * 8] );
    vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 2 ) * 8] );
    vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 0 ) + ( 3 ) * 8] );
    vstore4( as_uchar4( pack16to32( p8, p8 ) ), 0, &pred[( 4 ) + ( 3 ) * 8] );
    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
    return satd;
}
int x264_predict_8x8c_h( const local pixel *src, int src_stride )
{
    private pixel pred[32];
    const local pixel *src_l = src;
    // Upper half of pred[]
    vstore8( (uchar8)(src[-1]), 0, pred ); src += src_stride;
    vstore8( (uchar8)(src[-1]), 1, pred ); src += src_stride;
    vstore8( (uchar8)(src[-1]), 2, pred ); src += src_stride;
    vstore8( (uchar8)(src[-1]), 3, pred ); src += src_stride;
    int satd = satd_8x4_lp( src_l, src_stride, pred, 8 );
    // Lower half of pred[]
    vstore8( (uchar8)(src[-1]), 0, pred ); src += src_stride;
    vstore8( (uchar8)(src[-1]), 1, pred ); src += src_stride;
    vstore8( (uchar8)(src[-1]), 2, pred ); src += src_stride;
    vstore8( (uchar8)(src[-1]), 3, pred );
    return satd + satd_8x4_lp( src_l + ( src_stride << 2 ), src_stride, pred, 8 );
}
int x264_predict_8x8c_v( const local pixel *src, int src_stride )
{
    private pixel pred[32];
    uchar16 v16;
    v16.lo = vload8( 0, &src[-src_stride] );
    v16.hi = vload8( 0, &src[-src_stride] );
    vstore16( v16, 0, pred );
    vstore16( v16, 1, pred );
    return satd_8x4_lp( src, src_stride, pred, 8 ) +
           satd_8x4_lp( src + (src_stride << 2), src_stride, pred, 8 );
}
int x264_predict_8x8c_p( const local pixel *src, int src_stride )
{
    int H = 0, V = 0;
    private pixel pred[32];
    int satd;
    for( int i = 0; i < 4; i++ )
    {
        H += (i + 1) * (src[4 + i - src_stride] - src[2 - i - src_stride]);
        V += (i + 1) * (src[-1 + (i + 4) * src_stride] - src[-1 + (2 - i) * src_stride]);
    }
    int a = 16 * (src[-1 + 7 * src_stride] + src[7 - src_stride]);
    int b = (17 * H + 16) >> 5;
    int c = (17 * V + 16) >> 5;
    int i00 = a - 3 * b - 3 * c + 16;
    // Upper half of pred[]
    for( int y = 0; y < 4; y++ )
    {
        int pix = i00;
        for( int x = 0; x < 8; x++ )
        {
            pred[x + y*8] = x264_clip_pixel( pix >> 5 );
            pix += b;
        }
        i00 += c;
    }
    satd = satd_8x4_lp( src, src_stride, pred, 8 );
    // Lower half of pred[]
    for( int y = 0; y < 4; y++ )
    {
        int pix = i00;
        for( int x = 0; x < 8; x++ )
        {
            pred[x + y*8] = x264_clip_pixel( pix >> 5 );
            pix += b;
        }
        i00 += c;
    }
    satd += satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
    return satd;
}
int x264_predict_8x8c_dc( const local pixel *src, int src_stride )
{
    private pixel pred[32];
    int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
    for( int i = 0; i < 4; i++ )
    {
        s0 += src[i - src_stride];
        s1 += src[i + 4 - src_stride];
        s2 += src[-1 + i * src_stride];
        s3 += src[-1 + (i+4)*src_stride];
    }
    // Upper half of pred[]
    uchar8 dc0;
    dc0.lo = (uchar4)( (s0 + s2 + 4) >> 3 );
    dc0.hi = (uchar4)( (s1 + 2) >> 2 );
    vstore8( dc0, 0, pred );
    vstore8( dc0, 1, pred );
    vstore8( dc0, 2, pred );
    vstore8( dc0, 3, pred );
    int satd = satd_8x4_lp( src, src_stride, pred, 8 );
    // Lower half of pred[]
    dc0.lo = (uchar4)( (s3 + 2) >> 2 );
    dc0.hi = (uchar4)( (s1 + s3 + 4) >> 3 );
    vstore8( dc0, 0, pred );
    vstore8( dc0, 1, pred );
    vstore8( dc0, 2, pred );
    vstore8( dc0, 3, pred );
    return satd + satd_8x4_lp( src + ( src_stride << 2 ), src_stride, pred, 8 );
}
#endif
/* Find the least cost intra mode for 32 8x8 macroblocks per workgroup
*
* Loads 33 macroblocks plus the pixels directly above them into local memory,
* padding where necessary with edge pixels. It then cooperatively calculates
* smoothed top and left pixels for use in some of the analysis.
*
* Then groups of 32 threads each calculate a single intra mode for each 8x8
* block. Since consecutive threads are calculating the same intra mode there
* is no code-path divergence. 8 intra costs are calculated simultaneously. If
* the "slow" argument is not zero, the final two (least likely) intra modes are
* tested in a second pass. The slow mode is only enabled for presets slow,
* slower, and placebo.
*
* This allows all of the pixel functions to read pixels from local memory, and
* avoids re-fetching edge pixels from global memory. And it allows us to
* calculate all of the intra mode costs simultaneously without branch divergence.
*
* Local dimension: [ 32, 8 ]
* Global dimensions: [ paddedWidth, height ] */
kernel void mb_intra_cost_satd_8x8( read_only image2d_t fenc,
global uint16_t *fenc_intra_cost,
global int *frame_stats,
int lambda,
int mb_width,
int slow )
{
#define CACHE_STRIDE 265
#define BLOCK_OFFSET 266
    local pixel cache[2385];
    local int cost_buf[32];
    local pixel top[32 * 16];
    local pixel left[32 * 8];
    local pixel left_top[32];
    int lx = get_local_id( 0 );
    int ly = get_local_id( 1 );
    int gx = get_global_id( 0 );
    int gy = get_global_id( 1 );
    int gidx = get_group_id( 0 );
    int gidy = get_group_id( 1 );
    int linear_id = ly * get_local_size( 0 ) + lx;
    int satd = COST_MAX;
    int basex = gidx << 8;
    int basey = (gidy << 3) - 1;
    /* Load 33 8x8 macroblocks and the pixels above them into local cache */
    for( int y = 0; y < 9 && linear_id < (33 * 8) >> 2; y++ )
    {
        int x = linear_id << 2;
        uint4 data = read_imageui( fenc, sampler, (int2)(x + basex, y + basey) );
        cache[y * CACHE_STRIDE + 1 + x] = data.s0;
        cache[y * CACHE_STRIDE + 1 + x + 1] = data.s1;
        cache[y * CACHE_STRIDE + 1 + x + 2] = data.s2;
        cache[y * CACHE_STRIDE + 1 + x + 3] = data.s3;
    }
    /* load pixels on left edge */
    if( linear_id < 9 )
        cache[linear_id * CACHE_STRIDE] = read_imageui( fenc, sampler, (int2)( basex - 1, linear_id + basey) ).s0;
    barrier( CLK_LOCAL_MEM_FENCE );
    // Cooperatively build the top edge for the macroblock using lowpass filter
    int j = ly;
    top[lx*16 + j] = ( cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j - 1, -1, 15 )] +
                       2*cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j, 0, 15 )] +
                       cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j + 1, 0, 15 )] + 2 ) >> 2;
    j += 8;
    top[lx*16 + j] = ( cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j - 1, -1, 15 )] +
                       2*cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j, 0, 15 )] +
                       cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE + clamp_int( j + 1, 0, 15 )] + 2 ) >> 2;
    // Cooperatively build the left edge for the macroblock using lowpass filter
    left[lx*8 + ly] = ( cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*(ly - 1)] +
                        2*cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*ly] +
                        cache[BLOCK_OFFSET + 8*lx - 1 + CACHE_STRIDE*clamp((ly + 1), 0, 7 )] + 2 ) >> 2;
    // One left_top per macroblock
    if( 0 == ly )
    {
        left_top[lx] = ( cache[BLOCK_OFFSET + 8*lx - 1] + 2*cache[BLOCK_OFFSET + 8*lx - 1 - CACHE_STRIDE] +
                         cache[BLOCK_OFFSET + 8*lx - CACHE_STRIDE] + 2 ) >> 2;
        cost_buf[lx] = COST_MAX;
    }
    barrier( CLK_LOCAL_MEM_FENCE );
    // each warp/wavefront generates a different prediction type
    switch( ly )
    {
        case 0:
            satd = x264_predict_8x8c_h( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
            break;
        case 1:
            satd = x264_predict_8x8c_v( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
            break;
        case 2:
            satd = x264_predict_8x8c_dc( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
            break;
        case 3:
            satd = x264_predict_8x8c_p( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE );
            break;
        case 4:
            satd = x264_predict_8x8_ddr( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
            break;
        case 5:
            satd = x264_predict_8x8_vr( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
            break;
        case 6:
            satd = x264_predict_8x8_hd( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx], &left[8*lx], left_top[lx] );
            break;
        case 7:
            satd = x264_predict_8x8_hu( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &left[8*lx] );
            break;
        default:
            break;
    }
    atom_min( &cost_buf[lx], satd );
    if( slow )
    {
        // Do the remaining two (least likely) prediction modes
        switch( ly )
        {
            case 0: // DDL
                satd = x264_predict_8x8_ddl( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx] );
                atom_min( &cost_buf[lx], satd );
                break;
            case 1: // VL
                satd = x264_predict_8x8_vl( &cache[BLOCK_OFFSET + 8*lx], CACHE_STRIDE, &top[16*lx] );
                atom_min( &cost_buf[lx], satd );
                break;
            default:
                break;
        }
    }
    barrier( CLK_LOCAL_MEM_FENCE );
    if( (0 == ly) && (gx < mb_width) )
        fenc_intra_cost[gidy * mb_width + gx] = cost_buf[lx] + 5*lambda;
    // initialize the frame_stats[2] buffer for kernel sum_intra_cost().
    if( gx < 2 && gy == 0 )
        frame_stats[gx] = 0;
#undef CACHE_STRIDE
#undef BLOCK_OFFSET
}
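/* A minimal host-side launch sketch for mb_intra_cost_satd_8x8 (illustrative only;
 * the kernel/buffer handles and padded sizes are assumptions, not defined in this
 * file).  Work-group size is [32,8] and the global size follows the comment above:
 *
 *     size_t local_size[2]  = { 32, 8 };
 *     size_t global_size[2] = { padded_width, height };  // padded to multiples of 32 and 8
 *     clSetKernelArg( kernel, 0, sizeof(cl_mem), &fenc_image );
 *     clSetKernelArg( kernel, 1, sizeof(cl_mem), &fenc_intra_cost_buf );
 *     clSetKernelArg( kernel, 2, sizeof(cl_mem), &frame_stats_buf );
 *     clSetKernelArg( kernel, 3, sizeof(int), &lambda );
 *     clSetKernelArg( kernel, 4, sizeof(int), &mb_width );
 *     clSetKernelArg( kernel, 5, sizeof(int), &slow );
 *     clEnqueueNDRangeKernel( queue, kernel, 2, NULL, global_size, local_size, 0, NULL, NULL );
 */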
/*
* parallel sum intra costs
*
* global launch dimensions: [256, mb_height]
*/
kernel void sum_intra_cost( const global uint16_t *fenc_intra_cost,
const global uint16_t *inv_qscale_factor,
global int *fenc_row_satds,
global int *frame_stats,
int mb_width )
{
    int y = get_global_id( 1 );
    int mb_height = get_global_size( 1 );
    int row_satds = 0;
    int cost_est = 0;
    int cost_est_aq = 0;
    for( int x = get_global_id( 0 ); x < mb_width; x += get_global_size( 0 ) )
    {
        int mb_xy = x + y * mb_width;
        int cost = fenc_intra_cost[mb_xy];
        int cost_aq = (cost * inv_qscale_factor[mb_xy] + 128) >> 8;
        int b_frame_score_mb = (x > 0 && x < mb_width - 1 && y > 0 && y < mb_height - 1) || mb_width <= 2 || mb_height <= 2;
        row_satds += cost_aq;
        if( b_frame_score_mb )
        {
            cost_est += cost;
            cost_est_aq += cost_aq;
        }
    }
    local int buffer[256];
    int x = get_global_id( 0 );
    row_satds = parallel_sum( row_satds, x, buffer );
    cost_est = parallel_sum( cost_est, x, buffer );
    cost_est_aq = parallel_sum( cost_est_aq, x, buffer );
    if( get_global_id( 0 ) == 0 )
    {
        fenc_row_satds[y] = row_satds;
        atomic_add( frame_stats + COST_EST, cost_est );
        atomic_add( frame_stats + COST_EST_AQ, cost_est_aq );
    }
}