Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Download
52868 views
1
/* Hierarchical (iterative) OpenCL lowres motion search */
2
3
/* Map a macroblock coordinate at the current scale to the index of its
 * parent macroblock in the half-resolution grid.
 *
 * Edge macroblocks of odd-sized frames have no direct descendant; for the
 * last column/row, step inward by the width/height parity before halving
 * so the nearest valid parent is used instead. */
static inline int find_downscale_mb_xy( int x, int y, int mb_width, int mb_height )
{
    x = (x == mb_width  - 1) ? (x - (mb_width  & 1)) >> 1 : x >> 1;
    y = (y == mb_height - 1) ? (y - (mb_height & 1)) >> 1 : y >> 1;
    return (mb_width >> 1) * y + x;
}
10
11
/* Four threads (idx 0..3) cooperatively compute one 8x8 SAD; each thread
 * measures two rows starting at row 2*idx and publishes its partial sum
 * in costs[idx], a 4-entry slice of local memory.
 *
 * fenc, fencpos: encoded-frame plane and top-left corner of the 8x8 block
 * fref, frefpos: reference plane and the motion-compensated read position
 * idx:           this thread's lane within its 4-thread group (0..3)
 * costs:         local scratch, one int16_t slot per cooperating thread
 *
 * Returns the full 8x8 SAD (sum of all four partial sums).
 *
 * NOTE(review): the final reduction reads all four slots without a
 * barrier, so correctness relies on the four threads running in lockstep
 * on one SIMD unit -- confirm the launch geometry guarantees this. */
int sad_8x8_ii_coop4( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos, int idx, local int16_t *costs )
{
    frefpos.y += idx << 1;
    fencpos.y += idx << 1;
    int cost = 0;
    if( frefpos.x < 0 )
    {
        /* slow path when MV goes past left edge. The GPU clamps reads from
         * (-1, 0) to (0,0), so you get pixels [0, 1, 2, 3] when what you really
         * want are [0, 0, 1, 2] */
        for( int y = 0; y < 2; y++ )
        {
            for( int x = 0; x < 8; x++ )
            {
                pixel enc = read_imageui( fenc, sampler, fencpos + (int2)(x, y) ).s0;
                pixel ref = read_imageui( fref, sampler, frefpos + (int2)(x, y) ).s0;
                cost += abs_diff( enc, ref );
            }
        }
    }
    else
    {
        /* fast path: four pixels per image read.  The accumulator was
         * previously also named 'costs', shadowing the local-memory
         * parameter of the same name; renamed to sum4 for clarity. */
        uint4 enc, ref, sum4 = 0;
        enc = read_imageui( fenc, sampler, fencpos );
        ref = read_imageui( fref, sampler, frefpos );
        sum4 += abs_diff( enc, ref );
        enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 0) );
        ref = read_imageui( fref, sampler, frefpos + (int2)(4, 0) );
        sum4 += abs_diff( enc, ref );
        enc = read_imageui( fenc, sampler, fencpos + (int2)(0, 1) );
        ref = read_imageui( fref, sampler, frefpos + (int2)(0, 1) );
        sum4 += abs_diff( enc, ref );
        enc = read_imageui( fenc, sampler, fencpos + (int2)(4, 1) );
        ref = read_imageui( fref, sampler, frefpos + (int2)(4, 1) );
        sum4 += abs_diff( enc, ref );
        cost = sum4.s0 + sum4.s1 + sum4.s2 + sum4.s3;
    }
    costs[idx] = cost;
    return costs[0] + costs[1] + costs[2] + costs[3];
}
53
54
/* A single thread computes a complete 8x8 SAD between the encoded block
 * at fencpos and the reference block at frefpos. */
int sad_8x8_ii( read_only image2d_t fenc, int2 fencpos, read_only image2d_t fref, int2 frefpos )
{
    if( frefpos.x >= 0 )
    {
        /* fast path: read four pixels per image access */
        uint4 sad4 = 0;
        for( int row = 0; row < 8; row++ )
            for( int col = 0; col < 8; col += 4 )
            {
                uint4 e = read_imageui( fenc, sampler, fencpos + (int2)(col, row) );
                uint4 r = read_imageui( fref, sampler, frefpos + (int2)(col, row) );
                sad4 += abs_diff( e, r );
            }
        return sad4.s0 + sad4.s1 + sad4.s2 + sad4.s3;
    }

    /* slow path when MV goes past left edge: read one pixel at a time so
     * the hardware's coordinate clamping applies per pixel */
    int sad = 0;
    for( int row = 0; row < 8; row++ )
        for( int col = 0; col < 8; col++ )
        {
            uint e = read_imageui( fenc, sampler, fencpos + (int2)(col, row) ).s0;
            uint r = read_imageui( fref, sampler, frefpos + (int2)(col, row) ).s0;
            sad += abs_diff( e, r );
        }
    return sad;
}
87
/*
 * hierarchical motion estimation
 *
 * Each kernel launch is a single iteration
 *
 * MB per work group is determined by lclx / 4 * lcly
 *
 * global launch dimensions: [mb_width * 4, mb_height]
 *
 * Four work-items cooperate on each macroblock (mb_i = 0..3).
 * b_shift_index: candidate MVs come from the next-coarser level and must
 *                be looked up via find_downscale_mb_xy and scaled down.
 * b_reverse_references: alternates per iteration which neighbors supply
 *                MV candidates (down/right vs up/left).
 */
kernel void hierarchical_motion( read_only image2d_t fenc,
                                 read_only image2d_t fref,
                                 const global short2 *in_mvs,
                                 global short2 *out_mvs,
                                 global int16_t *out_mv_costs,
                                 global short2 *mvp_buffer,
                                 local int16_t *cost_local,
                                 local short2 *mvc_local,
                                 int mb_width,
                                 int lambda,
                                 int me_range,
                                 int scale,
                                 int b_shift_index,
                                 int b_first_iteration,
                                 int b_reverse_references )
{
    /* four work-items per MB in dimension 0; mb_i is the lane index */
    int mb_x = get_global_id( 0 ) >> 2;
    if( mb_x >= mb_width )
        return;
    int mb_height = get_global_size( 1 );
    int mb_i = get_global_id( 0 ) & 3;
    int mb_y = get_global_id( 1 );
    int mb_xy = mb_y * mb_width + mb_x;
    const int mb_size = 8;
    int2 coord = (int2)(mb_x, mb_y) * mb_size;

    /* each MB gets a private 4-entry slice of the local scratch buffers */
    const int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
    cost_local += 4 * mb_in_group;

    int i_mvc = 0;
    mvc_local += 4 * mb_in_group;
    mvc_local[mb_i] = 0;
    int2 mvp = 0;

    if( !b_first_iteration )
    {
        /* append the neighbor MV at (mb_x+DX, mb_y+DY) to the candidate
         * list; when b_shift_index, fetch it from the coarser level and
         * divide by the scale factor.  Mutates i_mvc. */
#define MVC( DX, DY )\
    {\
        int px = mb_x + DX;\
        int py = mb_y + DY;\
        mvc_local[i_mvc] = b_shift_index ? in_mvs[find_downscale_mb_xy( px, py, mb_width, mb_height )] : \
                                           in_mvs[mb_width * py + px];\
        mvc_local[i_mvc] >>= (short) scale;\
        i_mvc++;\
    }
        /* Find MVP from median of MVCs */
        if( b_reverse_references )
        {
            /* odd iterations: derive MVP from down and right */
            if( mb_x < mb_width - 1 )
                MVC( 1, 0 );
            if( mb_y < mb_height - 1 )
            {
                MVC( 0, 1 );
                if( mb_x > b_shift_index )
                    MVC( -1, 1 );
                if( mb_x < mb_width - 1 )
                    MVC( 1, 1 );
            }
        }
        else
        {
            /* even iterations: derive MVP from up and left */
            if( mb_x > 0 )
                MVC( -1, 0 );
            if( mb_y > 0 )
            {
                MVC( 0, -1 );
                if( mb_x < mb_width - 1 )
                    MVC( 1, -1 );
                if( mb_x > b_shift_index )
                    MVC( -1, -1 );
            }
        }
#undef MVC
        /* with 0 or 1 candidates the median degenerates to the first entry
         * (slot 0 was zeroed above, so an empty list yields MVP = 0) */
        mvp = (i_mvc <= 1) ? convert_int2_sat(mvc_local[0]) : x264_median_mv( mvc_local[0], mvc_local[1], mvc_local[2] );
    }
    /* current mvp matches the previous mvp and we have not changed scale. We know
     * we're going to arrive at the same MV again, so just copy the previous
     * result to our output. */
    if( !b_shift_index && mvp.x == mvp_buffer[mb_xy].x && mvp.y == mvp_buffer[mb_xy].y )
    {
        out_mvs[mb_xy] = in_mvs[mb_xy];
        return;
    }
    mvp_buffer[mb_xy] = convert_short2_sat(mvp);
    /* allow the MV to overshoot the frame by up to 4 pixels on each side */
    int2 mv_min = -mb_size * (int2)(mb_x, mb_y) - 4;
    int2 mv_max = mb_size * ((int2)(mb_width, mb_height) - (int2)(mb_x, mb_y) - 1) + 4;

    int2 bestmv = clamp(mvp, mv_min, mv_max);
    int2 refcrd = coord + bestmv;

    /* measure cost at bestmv: SAD plus lambda-weighted MV-delta cost,
     * with the delta scaled back to full-resolution quarter-pel units */
    int bcost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) +
                lambda * mv_cost( abs_diff( bestmv, mvp ) << (2 + scale) );

    /* diamond search: the four lanes probe the four diamond offsets in
     * parallel each iteration, for at most me_range iterations */
    do
    {
        /* measure costs at offsets from bestmv */
        refcrd = coord + bestmv + dia_offs[mb_i];
        int2 trymv = bestmv + dia_offs[mb_i];
        int cost = sad_8x8_ii( fenc, coord, fref, refcrd ) +
                   lambda * mv_cost( abs_diff( trymv, mvp ) << (2 + scale) );

        /* pack lane id into the low 2 bits so min() also selects the
         * winning offset; NOTE(review): no barrier between the store and
         * the reads -- assumes the 4 lanes execute in lockstep, confirm */
        cost_local[mb_i] = (cost<<2) | mb_i;
        cost = min( cost_local[0], min( cost_local[1], min( cost_local[2], cost_local[3] ) ) );

        if( (cost >> 2) >= bcost )
            break;

        bestmv += dia_offs[cost&3];
        bcost = cost>>2;

        /* stop if the search reached the padded frame boundary */
        if( bestmv.x >= mv_max.x || bestmv.x <= mv_min.x || bestmv.y >= mv_max.y || bestmv.y <= mv_min.y )
            break;
    }
    while( --me_range > 0 );

    int2 trymv = 0, diff = 0;

    /* evaluate trymv (clamped) if it is more than 1 pixel away from the
     * MVP, i.e. if the diamond search could not have reached it;
     * L is the lambda weight applied to the MV-delta cost */
#define COST_MV_NO_PAD( L )\
    trymv = clamp( trymv, mv_min, mv_max );\
    diff = convert_int2_sat(abs_diff( mvp, trymv ));\
    if( diff.x > 1 || diff.y > 1 ) {\
        int2 refcrd = coord + trymv;\
        int cost = sad_8x8_ii_coop4( fenc, coord, fref, refcrd, mb_i, cost_local ) +\
                   L * mv_cost( abs_diff( trymv, mvp ) << (2 + scale) );\
        if( cost < bcost ) { bcost = cost; bestmv = trymv; } }

    /* always try the zero MV (zero lambda: no MV cost for this probe) */
    COST_MV_NO_PAD( 0 );

    if( !b_first_iteration )
    {
        /* try cost at previous iteration's MV, if MVP was too far away */
        int2 prevmv = b_shift_index ? convert_int2_sat(in_mvs[find_downscale_mb_xy( mb_x, mb_y, mb_width, mb_height )]) : convert_int2_sat(in_mvs[mb_xy]);
        prevmv >>= scale;
        trymv = prevmv;
        COST_MV_NO_PAD( lambda );
    }

    for( int i = 0; i < i_mvc; i++ )
    {
        /* try cost at each candidate MV, if MVP was too far away */
        trymv = convert_int2_sat( mvc_local[i] );
        COST_MV_NO_PAD( lambda );
    }

    /* lane 0 writes the result, scaled back up to this level's units */
    if( mb_i == 0 )
    {
        bestmv <<= scale;
        out_mvs[mb_xy] = convert_short2_sat(bestmv);
        out_mv_costs[mb_xy] = min( bcost, LOWRES_COST_MASK );
    }
}
250
251