CoCalc -- subpel.cl

05. Matplotlib / ffmpeg-3.0 / libx264 / common / opencl / subpel.cl
⁵⁸⁷³⁷ views
1
/* OpenCL lowres subpel Refine */
2

3
/* Each thread performs 8x8 SAD.  4 threads per MB, so the 4 DIA HPEL offsets are
 * calculated simultaneously.
 *
 *   fenc        - image holding the block being encoded
 *   fencpos     - position of the block's top-left sample in fenc
 *   fref_planes - reference image; the s0 channel of each sample is treated as
 *                 four packed 8-bit planes, one of which is selected by a byte
 *                 shift
 *   qpos        - quarter-pel position in the reference.  The full-pel part
 *                 (qpos >> 2) addresses the sample, the half-pel bit (bit 1)
 *                 of x and y selects the plane; the quarter-pel bit is ignored
 *
 * Returns the sum of absolute differences over the 8x8 block. */
int sad_8x8_ii_hpel( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos )
{
    int2 frefpos = qpos >> 2;

    /* plane index: bit 0 = x half-pel bit, bit 1 = y half-pel bit */
    int hpel_idx = ((qpos.x & 2) >> 1) + (qpos.y & 2);
    uint mask_shift = 8 * hpel_idx;

    uint4 cost4 = 0;

    for( int y = 0; y < 8; y++ )
    {
        /* Each row is handled as two 4-wide halves: columns 0..3, then 4..7 */
        for( int x = 0; x < 8; x += 4 )
        {
            /* NOTE(review): all four channels of a single fenc read are compared
             * against four horizontally adjacent reference samples; this
             * presumes the fenc image format delivers four pixels per read --
             * confirm against the host-side image setup. */
            uint4 enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) );
            uint4 val4;

            val4.s0 = (read_imageui( fref_planes, sampler, frefpos + (int2)(x + 0, y) ).s0 >> mask_shift) & 0xFF;
            val4.s1 = (read_imageui( fref_planes, sampler, frefpos + (int2)(x + 1, y) ).s0 >> mask_shift) & 0xFF;
            val4.s2 = (read_imageui( fref_planes, sampler, frefpos + (int2)(x + 2, y) ).s0 >> mask_shift) & 0xFF;
            val4.s3 = (read_imageui( fref_planes, sampler, frefpos + (int2)(x + 3, y) ).s0 >> mask_shift) & 0xFF;

            cost4 += abs_diff( enc, val4 );
        }
    }

    return cost4.s0 + cost4.s1 + cost4.s2 + cost4.s3;
}
33

34
/* One thread measures 8x8 SAD cost at a QPEL offset into an HPEL plane.
 *
 * The quarter-pel prediction is formed by averaging (rhadd, i.e. rounding up)
 * two half-pel samples:
 *   A - the sample at qpos with its quarter-pel bit ignored
 *   B - the sample at qpos advanced by 2 quarter-pel units in every dimension
 *       whose quarter-pel bit (bit 0) is set
 * When neither quarter-pel bit is set, A and B are the same sample.
 *
 *   fenc        - image holding the block being encoded (s0 channel is used)
 *   fencpos     - position of the block's top-left sample in fenc
 *   fref_planes - reference image; the s0 channel of each sample is treated as
 *                 four packed 8-bit planes selected by a byte shift
 *   qpos        - quarter-pel position in the reference
 *
 * Returns the sum of absolute differences over the 8x8 block. */
int sad_8x8_ii_qpel( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref_planes, int2 qpos )
{
    /* Sample A: full-pel position and plane index of qpos itself */
    int2 frefApos = qpos >> 2;
    int hpelA = ((qpos.x & 2) >> 1) + (qpos.y & 2);

    /* Sample B: step to the next half-pel position along each odd component */
    int2 qposB = qpos + ((qpos & 1) << 1);
    int2 frefBpos = qposB >> 2;
    int hpelB = ((qposB.x & 2) >> 1) + (qposB.y & 2);

    uint mask_shift0 = 8 * hpelA, mask_shift1 = 8 * hpelB;

    int cost = 0;

    for( int y = 0; y < 8; y++ )
    {
        for( int x = 0; x < 8; x++ )
        {
            uint enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y)).s0;
            uint vA = (read_imageui( fref_planes, sampler, frefApos + (int2)(x, y)).s0 >> mask_shift0) & 0xFF;
            uint vB = (read_imageui( fref_planes, sampler, frefBpos + (int2)(x, y)).s0 >> mask_shift1) & 0xFF;
            cost += abs_diff( enc, rhadd( vA, vB ) );
        }
    }

    return cost;
}
61

62
/* Four threads measure 8x8 SATD cost at a QPEL offset into an HPEL plane
 *
 * Each thread collects 1/4 of the rows of diffs and processes one quarter of
 * the transforms
 *
 *   fenc        - image holding the block being encoded (s0 channel is used)
 *   fencpos     - full-pel position of the block's top-left sample in fenc
 *   fref_planes - reference image; the s0 channel of each sample is treated as
 *                 four packed 8-bit planes selected by a byte shift
 *   qpos        - quarter-pel position in the reference
 *   tmpp        - local scratch shared by the four cooperating threads; it is
 *                 indexed here as a 4x4 array of sum2_t
 *   idx         - this thread's index (0..3); selects the row it reads (idx,
 *                 then idx + 4) and its row/column of the scratch array
 *
 * Returns this thread's partial cost; the caller adds up the four partials.
 *
 * sum_t, sum2_t, BITS_PER_SUM, HADAMARD4 and abs2 are defined outside this
 * section of the file.
 *
 * NOTE(review): tmp[][] is written by one thread and read by the other three
 * with no barrier in between, only a volatile qualifier.  Presumably this
 * relies on the four threads executing in lockstep -- confirm for the target
 * devices before reordering anything here.
 */
int satd_8x8_ii_qpel_coop4( read_only image2d_t fenc,
                            int2 fencpos,
                            read_only image2d_t fref_planes,
                            int2 qpos,
                            local sum2_t *tmpp,
                            int idx )
{
    volatile local sum2_t( *tmp )[4] = (volatile local sum2_t( * )[4])tmpp;
    sum2_t b0, b1, b2, b3;

    // fencpos is full-pel position of original MB
    // qpos is qpel position within reference frame
    int2 frefApos = qpos >> 2;
    int hpelA = ((qpos.x&2)>>1) + (qpos.y&2);

    // second half-pel sample: advance by 2 qpel along each odd component
    int2 qposB = qpos + (int2)(((qpos.x&1)<<1), ((qpos.y&1)<<1));
    int2 frefBpos = qposB >> 2;
    int hpelB = ((qposB.x&2)>>1) + (qposB.y&2);

    uint mask_shift0 = 8 * hpelA, mask_shift1 = 8 * hpelB;

    uint vA, vB;
    uint a0, a1;
    uint enc;
    sum2_t sum = 0;

// OUT = source pixel minus the rounded average of the two half-pel samples,
// for column X of this thread's row (unsigned arithmetic, may wrap)
#define READ_DIFF( OUT, X )\
    enc = read_imageui( fenc, sampler, fencpos + (int2)(X, idx) ).s0;\
    vA = (read_imageui( fref_planes, sampler, frefApos + (int2)(X, idx) ).s0 >> mask_shift0) & 0xFF;\
    vB = (read_imageui( fref_planes, sampler, frefBpos + (int2)(X, idx) ).s0 >> mask_shift1) & 0xFF;\
    OUT = enc - rhadd( vA, vB );

// Packs the diffs of columns a and b into the low and high halves of OUT
#define READ_DIFF_EX( OUT, a, b )\
    {\
        READ_DIFF( a0, a );\
        READ_DIFF( a1, b );\
        OUT = a0 + (a1<<BITS_PER_SUM);\
    }
// Processes one 8x4 band: a is added to the fenc row, b to both reference rows.
// The first HADAMARD4 writes this thread's row of tmp, the second reads this
// thread's column of tmp (filled by all four threads).
#define ROW_8x4_SATD( a, b )\
    {\
        fencpos.y += a;\
        frefApos.y += b;\
        frefBpos.y += b;\
        READ_DIFF_EX( b0, 0, 4 );\
        READ_DIFF_EX( b1, 1, 5 );\
        READ_DIFF_EX( b2, 2, 6 );\
        READ_DIFF_EX( b3, 3, 7 );\
        HADAMARD4( tmp[idx][0], tmp[idx][1], tmp[idx][2], tmp[idx][3], b0, b1, b2, b3 );\
        HADAMARD4( b0, b1, b2, b3, tmp[0][idx], tmp[1][idx], tmp[2][idx], tmp[3][idx] );\
        sum += abs2( b0 ) + abs2( b1 ) + abs2( b2 ) + abs2( b3 );\
    }
    ROW_8x4_SATD( 0, 0 );
    ROW_8x4_SATD( 4, 4 );

#undef READ_DIFF
#undef READ_DIFF_EX
#undef ROW_8x4_SATD
    // fold the two packed halves of sum together, then halve
    return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;
}
126

127
/* Half-pel candidate offsets around the current best MV, expressed in
 * quarter-pel units (+/-2 along y or x).  subpel_refine indexes this table
 * with the thread's 0..3 slot within its macroblock. */
constant int2 hpoffs[4] =
{
    {0, -2}, {-2, 0}, {2, 0}, {0, 2}
};
131

132
/* sub pixel refinement of motion vectors, output MVs and costs are moved from
 * temporary buffers into final per-frame buffer
 *
 * global launch dimensions:  [mb_width * 4, mb_height]
 *
 * With X being the source 16x16 pixels, F is the lowres pixel used by the
 * motion search.  We will now utilize the H V and C pixels (stored in separate
 * planes) to search at half-pel increments.
 *
 * X X X X X X
 *  F H F H F
 * X X X X X X
 *  V C V C V
 * X X X X X X
 *  F H F H F
 * X X X X X X
 *
 * The YX HPEL bits of the motion vector selects the plane we search in.  The
 * four planes are packed in the fref_planes 2D image buffer.  The SAD/SATD
 * helpers in this file extract a plane from the s0 channel of each sample with
 * a shift of 8 * ((y_hpel_bit << 1) | x_hpel_bit), i.e. one byte per plane:
 *   bits 0-7 = F, bits 8-15 = H, bits 16-23 = V, bits 24-31 = C
 *
 * Arguments:
 *   fenc                 - image of the frame being encoded
 *   fref_planes          - packed F/H/V/C reference image
 *   in_mvs               - one full-pel MV per macroblock from the earlier search
 *   in_sad_mv_costs      - cost of each in_mvs entry
 *   cost_local           - local scratch, 4 entries per macroblock in the group
 *   satd_local           - local scratch, 16 entries per macroblock in the group
 *   mvc_local            - local scratch, 4 entries per macroblock in the group
 *   fenc_lowres_mv       - output: refined quarter-pel MV per macroblock
 *   fenc_lowres_mv_costs - output: SATD-based cost of the refined MV
 *   mb_width             - frame width in macroblocks
 *   lambda               - multiplier applied to mv_cost()
 *   b, ref, b_islist1    - select which frame's slot of the output buffers is
 *                          written (see the offset computation below)
 *
 * x264_median_mv, mv_cost, dia_offs, LOWRES_COST_MASK, int16_t and sum2_t are
 * defined outside this section of the file. */
kernel void subpel_refine( read_only image2d_t   fenc,
                           read_only image2d_t   fref_planes,
                           const global short2  *in_mvs,
                           const global int16_t *in_sad_mv_costs,
                           local int16_t        *cost_local,
                           local sum2_t         *satd_local,
                           local short2         *mvc_local,
                           global short2        *fenc_lowres_mv,
                           global int16_t       *fenc_lowres_mv_costs,
                           int                   mb_width,
                           int                   lambda,
                           int                   b,
                           int                   ref,
                           int                   b_islist1 )
{
    /* Four consecutive work-items along dimension 0 share one macroblock */
    int mb_x = get_global_id( 0 ) >> 2;
    if( mb_x >= mb_width )
        return;
    int mb_height = get_global_size( 1 );

    int mb_i = get_global_id( 0 ) & 3;
    int mb_y = get_global_id( 1 );
    int mb_xy = mb_y * mb_width + mb_x;

    /* fenc_lowres_mv and fenc_lowres_mv_costs are large buffers that
     * hold many frames worth of motion vectors.  We must offset into the correct
     * location for this frame's vectors.  The kernel will be passed the correct
     * directional buffer for the direction of the search: list1 or list0
     *
     *   CPU equivalent: fenc->lowres_mvs[0][b - p0 - 1]
     *   GPU equivalent: fenc_lowres_mvs[(b - p0 - 1) * mb_count] */
    fenc_lowres_mv +=       (b_islist1 ? (ref-b-1) : (b-ref-1)) * mb_width * mb_height;
    fenc_lowres_mv_costs += (b_islist1 ? (ref-b-1) : (b-ref-1)) * mb_width * mb_height;

    /* Adjust pointers into local memory buffers for this thread's data */
    int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
    cost_local += mb_in_group * 4;
    satd_local += mb_in_group * 16;
    mvc_local += mb_in_group * 4;

    int i_mvc = 0;

    /* Every one of the four threads stores the same values into the shared
     * candidate slots, here and in the MVC() stores below */
    mvc_local[0] = mvc_local[1] = mvc_local[2] = mvc_local[3] = 0;

    /* Gather candidate MVs from the left, top, top-right and top-left
     * neighbours that lie inside the frame */
#define MVC( DX, DY ) mvc_local[i_mvc++] = in_mvs[mb_width * (mb_y + DY) + (mb_x + DX)];
    if( mb_x > 0 )
        MVC( -1, 0 );
    if( mb_y > 0 )
    {
        MVC( 0, -1 );
        if( mb_x < mb_width - 1 )
            MVC( 1, -1 );
        if( mb_x > 0 )
            MVC( -1, -1 );
    }
#undef MVC
    /* Predictor: the single candidate (or zero) when at most one was found,
     * otherwise the median of the first three */
    int2 mvp = (i_mvc <= 1) ? convert_int2_sat(mvc_local[0]) : x264_median_mv( mvc_local[0], mvc_local[1], mvc_local[2] );

    int bcost =  in_sad_mv_costs[mb_xy];
    int2 coord = (int2)(mb_x, mb_y) << 3;
    int2 bmv = convert_int2_sat( in_mvs[mb_xy] );

    /* Make mvp and bmv QPEL MV */
    mvp <<= 2; bmv <<= 2;

    /* One refinement round: thread mb_i scores candidate bmv + ARR[mb_i] with
     * FUNC and publishes (cost << 2) + mb_i, so the minimum over the four slots
     * carries the winning index in its low two bits.  All four threads take the
     * same minimum and therefore apply the same update to bmv and bcost.
     *
     * NOTE(review): cost_local is int16_t, so the packed value is narrowed to
     * 16 bits on store -- confirm that costs stay in range.
     * NOTE(review): the slots are read straight after the store with no
     * barrier; presumably this relies on the four threads running in
     * lockstep -- confirm for the target devices. */
#define HPEL_QPEL( ARR, FUNC )\
    {\
        int2 trymv = bmv + ARR[mb_i];\
        int2 qpos = (coord << 2) + trymv;\
        int cost = FUNC( fenc, coord, fref_planes, qpos ) + lambda * mv_cost( abs_diff( trymv, mvp ) );\
        cost_local[mb_i] = (cost<<2) + mb_i;\
        cost = min( cost_local[0], min( cost_local[1], min( cost_local[2], cost_local[3] ) ) );\
        if( (cost>>2) < bcost )\
        {\
            bmv += ARR[cost&3];\
            bcost = cost>>2;\
        }\
    }

    HPEL_QPEL( hpoffs, sad_8x8_ii_hpel );
    HPEL_QPEL( dia_offs, sad_8x8_ii_qpel );
    fenc_lowres_mv[mb_xy] = convert_short2_sat( bmv );

    /* remeasure cost of bmv using SATD */
    int2 qpos = (coord << 2) + bmv;
    /* each thread contributes one partial; the four partials are summed */
    cost_local[mb_i] = satd_8x8_ii_qpel_coop4( fenc, coord, fref_planes, qpos, satd_local, mb_i );
    bcost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3];
    bcost += lambda * mv_cost( abs_diff( bmv, mvp ) );

    fenc_lowres_mv_costs[mb_xy] = min( bcost, LOWRES_COST_MASK );
}
243

244
Product

Resources

Company