/* Mode selection routines: select the least-SATD-cost mode for each lowres
 * macroblock. When measuring B slices, this includes measuring the cost of
 * three bidir modes. */
/* Four threads cooperatively measure 8x8 BIDIR cost with SATD */
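/* idx (0..3) is this thread's rank within the cooperating group: each thread
 * computes one row of each 8x4 half of the block, and the partial SATD it
 * returns is summed with the other three threads' results by the caller. */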
int bidir_satd_8x8_ii_coop4( read_only image2d_t fenc_lowres,
int2 fencpos,
read_only image2d_t fref0_planes,
int2 qpos0,
read_only image2d_t fref1_planes,
int2 qpos1,
int weight,
local sum2_t *tmpp,
int idx )
{
    volatile local sum2_t( *tmp )[4] = (volatile local sum2_t( * )[4])tmpp;
    sum2_t b0, b1, b2, b3;
    sum2_t sum = 0;

    // fencpos is full-pel position of original MB
    // qpos0 is qpel position within reference frame 0
    // qpos1 is qpel position within reference frame 1
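    // Each qpel position is split into two nearby hpel samples, A and B, and
    // rhadd( A, B ) below rounds their average to approximate the qpel pixel;
    // when the position is already hpel-aligned, A and B coincide.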
    int2 fref0Apos = (int2)(qpos0.x>>2, qpos0.y>>2);
    int hpel0A = ((qpos0.x&2)>>1) + (qpos0.y&2);

    int2 qpos0B = (int2)qpos0 + (int2)(((qpos0.x&1)<<1), ((qpos0.y&1)<<1));
    int2 fref0Bpos = (int2)(qpos0B.x>>2, qpos0B.y>>2);
    int hpel0B = ((qpos0B.x&2)>>1) + (qpos0B.y&2);

    int2 fref1Apos = (int2)(qpos1.x>>2, qpos1.y>>2);
    int hpel1A = ((qpos1.x&2)>>1) + (qpos1.y&2);

    int2 qpos1B = (int2)qpos1 + (int2)(((qpos1.x&1)<<1), ((qpos1.y&1)<<1));
    int2 fref1Bpos = (int2)(qpos1B.x>>2, qpos1B.y>>2);
    int hpel1B = ((qpos1B.x&2)>>1) + (qpos1B.y&2);
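    // The fref images evidently pack the four hpel planes (fullpel, H, V,
    // diagonal) one per byte of each 32-bit texel: hpel (0..3) names the
    // plane, and the 8*hpel shift plus 0xFF mask below extract that byte.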
    uint mask_shift0A = 8 * hpel0A, mask_shift0B = 8 * hpel0B;
    uint mask_shift1A = 8 * hpel1A, mask_shift1B = 8 * hpel1B;

    uint vA, vB;
    uint enc, ref0, ref1;
    uint a0, a1;
    const int weight2 = 64 - weight;
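    // The bipred weights always sum to 64, so READ_BIDIR_DIFF blends with
    // (ref0*weight + ref1*(64-weight) + 32) >> 6: a 6-bit fixed-point
    // weighted average with round-to-nearest.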
#define READ_BIDIR_DIFF( OUT, X )\
    enc = read_imageui( fenc_lowres, sampler, fencpos + (int2)(X, idx) ).s0;\
    vA = (read_imageui( fref0_planes, sampler, fref0Apos + (int2)(X, idx) ).s0 >> mask_shift0A) & 0xFF;\
    vB = (read_imageui( fref0_planes, sampler, fref0Bpos + (int2)(X, idx) ).s0 >> mask_shift0B) & 0xFF;\
    ref0 = rhadd( vA, vB );\
    vA = (read_imageui( fref1_planes, sampler, fref1Apos + (int2)(X, idx) ).s0 >> mask_shift1A) & 0xFF;\
    vB = (read_imageui( fref1_planes, sampler, fref1Bpos + (int2)(X, idx) ).s0 >> mask_shift1B) & 0xFF;\
    ref1 = rhadd( vA, vB );\
    OUT = enc - ((ref0 * weight + ref1 * weight2 + (1 << 5)) >> 6);

#define READ_DIFF_EX( OUT, a, b )\
    READ_BIDIR_DIFF( a0, a );\
    READ_BIDIR_DIFF( a1, b );\
    OUT = a0 + (a1<<BITS_PER_SUM);
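// READ_DIFF_EX packs two pixel differences into the low and high BITS_PER_SUM
// halves of one sum2_t, so each HADAMARD4/abs2 below transforms two 4-pixel
// columns at once, the same SWAR trick the CPU SATD routines use.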
#define ROW_8x4_SATD( a, b, c )\
    fencpos.y += a;\
    fref0Apos.y += b;\
    fref0Bpos.y += b;\
    fref1Apos.y += c;\
    fref1Bpos.y += c;\
    READ_DIFF_EX( b0, 0, 4 );\
    READ_DIFF_EX( b1, 1, 5 );\
    READ_DIFF_EX( b2, 2, 6 );\
    READ_DIFF_EX( b3, 3, 7 );\
    HADAMARD4( tmp[idx][0], tmp[idx][1], tmp[idx][2], tmp[idx][3], b0, b1, b2, b3 );\
    HADAMARD4( b0, b1, b2, b3, tmp[0][idx], tmp[1][idx], tmp[2][idx], tmp[3][idx] );\
    sum += abs2( b0 ) + abs2( b1 ) + abs2( b2 ) + abs2( b3 );
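// The two HADAMARD4 passes exchange rows and columns through tmp with no
// barrier in between; this presumes the four cooperating threads execute in
// lockstep on one SIMD unit, which is why tmp is declared volatile.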
    ROW_8x4_SATD( 0, 0, 0 );
    ROW_8x4_SATD( 4, 4, 4 );
#undef READ_BIDIR_DIFF
#undef READ_DIFF_EX
#undef ROW_8x4_SATD
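    // sum still holds two packed column totals: fold the high half onto the
    // low half, then halve, matching the usual SATD normalization.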
    return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;
}
/*
 * mode selection - pick the least-cost partition type for each 8x8 lowres
 * macroblock: intra, list0, or list1. When measuring a B slice, also test
 * three bidir possibilities.
*
* fenc_lowres_mvs[0|1] and fenc_lowres_mv_costs[0|1] are large buffers that
* hold many frames worth of motion vectors. We must offset into the correct
* location for this frame's vectors:
*
* CPU equivalent: fenc->lowres_mvs[0][b - p0 - 1]
* GPU equivalent: fenc_lowres_mvs0[(b - p0 - 1) * mb_count]
*
* global launch dimensions for P slice estimate: [mb_width, mb_height]
* global launch dimensions for B slice estimate: [mb_width * 4, mb_height]
*/
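/*
 * Worked example of the offset math above: with b = 2, p0 = 0, this frame's
 * list0 vectors begin at fenc_lowres_mvs0[(2 - 0 - 1) * mb_count], i.e. one
 * frame's worth (mb_count entries) past the start of the buffer.
 *
 * Host-side launch sketch (illustrative only; the queue and kernel handles
 * are assumed, and a real caller may round dimensions up to the work-group
 * size):
 *
 *   size_t gdim[2] = { (size_t)mb_width * (b < p1 ? 4 : 1), (size_t)mb_height };
 *   clEnqueueNDRangeKernel( queue, mode_select_kernel, 2, NULL, gdim, NULL, 0, NULL, NULL );
 */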
kernel void mode_selection( read_only image2d_t fenc_lowres,
read_only image2d_t fref0_planes,
read_only image2d_t fref1_planes,
const global short2 *fenc_lowres_mvs0,
const global short2 *fenc_lowres_mvs1,
const global short2 *fref1_lowres_mvs0,
const global int16_t *fenc_lowres_mv_costs0,
const global int16_t *fenc_lowres_mv_costs1,
const global uint16_t *fenc_intra_cost,
global uint16_t *lowres_costs,
global int *frame_stats,
local int16_t *cost_local,
local sum2_t *satd_local,
int mb_width,
int bipred_weight,
int dist_scale_factor,
int b,
int p0,
int p1,
int lambda )
{
    int mb_x = get_global_id( 0 );
    int b_bidir = b < p1;
if( b_bidir )
{
        /* When mode_selection is run for B frames, it must also perform BIDIR
         * SATD measurements, so it is launched with four times as many threads
         * in the X direction to spread the work across more of the GPU. The
         * extra X range may include padding threads, which exit here. */
        mb_x >>= 2;
        if( mb_x >= mb_width )
            return;
}
    int mb_y = get_global_id( 1 );
    int mb_height = get_global_size( 1 );
    int mb_count = mb_width * mb_height;
    int mb_xy = mb_x + mb_y * mb_width;

    /* Initialize int frame_stats[4] for next kernel (sum_inter_cost) */
    if( mb_x < 4 && mb_y == 0 )
        frame_stats[mb_x] = 0;

    int bcost = COST_MAX;
    int list_used = 0;
if( !b_bidir )
{
        int icost = fenc_intra_cost[mb_xy];
        COPY2_IF_LT( bcost, icost, list_used, 0 );
}
if( b != p0 )
{
        int mv_cost0 = fenc_lowres_mv_costs0[(b - p0 - 1) * mb_count + mb_xy];
        COPY2_IF_LT( bcost, mv_cost0, list_used, 1 );
}
if( b != p1 )
{
        int mv_cost1 = fenc_lowres_mv_costs1[(p1 - b - 1) * mb_count + mb_xy];
        COPY2_IF_LT( bcost, mv_cost1, list_used, 2 );
}
if( b_bidir )
{
        int2 coord = (int2)(mb_x, mb_y) << 3;
        int mb_i = get_global_id( 0 ) & 3;
        int mb_in_group = get_local_id( 1 ) * (get_local_size( 0 ) >> 2) + (get_local_id( 0 ) >> 2);
        cost_local += mb_in_group * 4;
        satd_local += mb_in_group * 16;
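        /* Each group of four cooperating threads owns four int16_t slots of
         * cost_local (one partial SATD per thread) and sixteen sum2_t slots of
         * satd_local (the shared 4x4 Hadamard scratch); mb_in_group is this
         * group's rank within the work-group. */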
#define TRY_BIDIR( mv0, mv1, penalty )\
{\
    int2 qpos0 = (int2)((coord.x<<2) + mv0.x, (coord.y<<2) + mv0.y);\
    int2 qpos1 = (int2)((coord.x<<2) + mv1.x, (coord.y<<2) + mv1.y);\
    cost_local[mb_i] = bidir_satd_8x8_ii_coop4( fenc_lowres, coord, fref0_planes, qpos0, fref1_planes, qpos1, bipred_weight, satd_local, mb_i );\
    int cost = cost_local[0] + cost_local[1] + cost_local[2] + cost_local[3];\
    COPY2_IF_LT( bcost, penalty * lambda + cost, list_used, 3 );\
}
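        /* TRY_BIDIR converts the fullpel MB position to qpel (<<2), offsets it
         * by each list's qpel MV, then sums the four partial SATDs from
         * cost_local with no barrier, again relying on lockstep execution
         * within the group of four threads. */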
/* temporal prediction */
        short2 dmv0, dmv1;
        short2 mvr = fref1_lowres_mvs0[mb_xy];
        dmv0 = (mvr * (short) dist_scale_factor + (short) 128) >> (short) 8;
        dmv1 = dmv0 - mvr;
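        /* dist_scale_factor is the temporal-direct scale, (tb << 8) / td on
         * the CPU side, so dmv0 scales the colocated MV with rounding and dmv1
         * is its list1 complement. For example, mvr.x = 4 with
         * dist_scale_factor = 128 gives dmv0.x = (4*128 + 128) >> 8 = 2 and
         * dmv1.x = 2 - 4 = -2. */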
        TRY_BIDIR( dmv0, dmv1, 0 );

        if( as_uint( dmv0 ) || as_uint( dmv1 ) )
        {
            /* B-direct prediction */
            dmv0 = 0;
            dmv1 = 0;
            TRY_BIDIR( dmv0, dmv1, 0 );
        }

        /* L0+L1 prediction */
        dmv0 = fenc_lowres_mvs0[(b - p0 - 1) * mb_count + mb_xy];
        dmv1 = fenc_lowres_mvs1[(p1 - b - 1) * mb_count + mb_xy];
        TRY_BIDIR( dmv0, dmv1, 5 );
#undef TRY_BIDIR
}
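    /* Pack the winning cost into the low bits and the winning list (0 intra,
     * 1 list0, 2 list1, 3 bidir) above LOWRES_COST_SHIFT; clamping to
     * LOWRES_COST_MASK keeps the cost from spilling into the list field. */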
    lowres_costs[mb_xy] = min( bcost, LOWRES_COST_MASK ) + (list_used << LOWRES_COST_SHIFT);
}
/*
* parallel sum inter costs
*
* global launch dimensions: [256, mb_height]
*/
kernel void sum_inter_cost( const global uint16_t *fenc_lowres_costs,
const global uint16_t *inv_qscale_factor,
global int *fenc_row_satds,
global int *frame_stats,
int mb_width,
int bframe_bias,
int b,
int p0,
int p1 )
{
    int y = get_global_id( 1 );
    int mb_height = get_global_size( 1 );

    int row_satds = 0;
    int cost_est = 0;
    int cost_est_aq = 0;
    int intra_mbs = 0;
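    /* Each of the 256 threads in a row strides across that row's macroblocks;
     * the per-thread partials are reduced with parallel_sum further below. */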
    for( int x = get_global_id( 0 ); x < mb_width; x += 256 )
{
        int mb_xy = x + y * mb_width;
        int cost = fenc_lowres_costs[mb_xy] & LOWRES_COST_MASK;
        int list = fenc_lowres_costs[mb_xy] >> LOWRES_COST_SHIFT;
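        /* list 0 marks an intra block. Border macroblocks are excluded from
         * the frame-level score (unless the frame is only a couple of MBs
         * wide or tall), mirroring the CPU lookahead's scoring rule. */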
        int b_frame_score_mb = (x > 0 && x < mb_width - 1 && y > 0 && y < mb_height - 1) || mb_width <= 2 || mb_height <= 2;

        if( list == 0 && b_frame_score_mb )
            intra_mbs++;

        int cost_aq = (cost * inv_qscale_factor[mb_xy] + 128) >> 8;

        row_satds += cost_aq;
if( b_frame_score_mb )
{
            cost_est += cost;
            cost_est_aq += cost_aq;
}
}
    local int buffer[256];
    int x = get_global_id( 0 );
    row_satds = parallel_sum( row_satds, x, buffer );
    cost_est = parallel_sum( cost_est, x, buffer );
    cost_est_aq = parallel_sum( cost_est_aq, x, buffer );
    intra_mbs = parallel_sum( intra_mbs, x, buffer );
    if( b != p1 )
        // Use floating point math to avoid 32-bit integer overflow conditions
        cost_est = (int)((float)cost_est * 100.0f / (120.0f + (float)bframe_bias));
if( get_global_id( 0 ) == 0 )
{
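        /* Thread 0 of each row publishes the row SATD and atomically folds
         * this row's totals into the global frame_stats accumulators. */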
        fenc_row_satds[y] = row_satds;
        atomic_add( frame_stats + COST_EST, cost_est );
        atomic_add( frame_stats + COST_EST_AQ, cost_est_aq );
        atomic_add( frame_stats + INTRA_MBS, intra_mbs );
}
}