CoCalc -- downscale.cl

05. Matplotlib / ffmpeg-3.0 / libx264 / common / opencl / downscale.cl
⁵⁸⁷²⁷ views
1
/*
2
 * downscale lowres luma: full-res buffer to down scale image, and to packed hpel image
3
 *
4
 * --
5
 *
6
 * fenc_img is an output image (area of memory referenced through a texture
7
 * cache). A read of any pixel location (x,y) returns four pixel values:
8
 *
9
 * val.s0 = P(x,y)
10
 * val.s1 = P(x+1,y)
11
 * val.s2 = P(x+2,y)
12
 * val.s3 = P(x+3,y)
13
 *
14
 * This is a 4x replication of the lowres pixels, a trade-off between memory
15
 * size and read latency.
16
 *
17
 * --
18
 *
19
 * hpel_planes is an output image that contains the four HPEL planes used for
20
 * subpel refinement. A read of any pixel location (x,y) returns a UInt32 with
21
 * the four planar values C | V | H | F
22
 *
23
 * launch dimensions:  [lowres-width, lowres-height]
24
 */
25
kernel void downscale_hpel( const global pixel *fenc,
26
                            write_only image2d_t fenc_img,
27
                            write_only image2d_t hpel_planes,
28
                            int stride )
29
{
30
    int x = get_global_id( 0 );
31
    int y = get_global_id( 1 );
32
    uint4 values;
33

34
    fenc += y * stride * 2;
35
    const global pixel *src1 = fenc + stride;
36
    const global pixel *src2 = (y == get_global_size( 1 )-1) ? src1 : src1 + stride;
37
    int2 pos = (int2)(x, y);
38
    pixel right, left;
39

40
    right = rhadd( fenc[x*2], src1[x*2] );
41
    left  = rhadd( fenc[x*2+1], src1[x*2+1] );
42
    values.s0 = rhadd( right, left );           // F
43

44
    right = rhadd( fenc[2*x+1], src1[2*x+1] );
45
    left  = rhadd( fenc[2*x+2], src1[2*x+2] );
46
    values.s1 = rhadd( right, left );           // H
47

48
    right = rhadd( src1[2*x], src2[2*x] );
49
    left  = rhadd( src1[2*x+1], src2[2*x+1] );
50
    values.s2 = rhadd( right, left );           // V
51

52
    right = rhadd( src1[2*x+1], src2[2*x+1] );
53
    left  = rhadd( src1[2*x+2], src2[2*x+2] );
54
    values.s3 = rhadd( right, left );           // C
55

56
    uint4 val = (uint4) ((values.s3 & 0xff) << 24) | ((values.s2 & 0xff) << 16) | ((values.s1 & 0xff) << 8) | (values.s0 & 0xff);
57
    write_imageui( hpel_planes, pos, val );
58

59
    x = select( x, x+1, x+1 < get_global_size( 0 ) );
60
    right = rhadd( fenc[x*2], src1[x*2] );
61
    left  = rhadd( fenc[x*2+1], src1[x*2+1] );
62
    values.s1 = rhadd( right, left );
63

64
    x = select( x, x+1, x+1 < get_global_size( 0 ) );
65
    right = rhadd( fenc[x*2], src1[x*2] );
66
    left  = rhadd( fenc[x*2+1], src1[x*2+1] );
67
    values.s2 = rhadd( right, left );
68

69
    x = select( x, x+1, x+1 < get_global_size( 0 ) );
70
    right = rhadd( fenc[x*2], src1[x*2] );
71
    left  = rhadd( fenc[x*2+1], src1[x*2+1] );
72
    values.s3 = rhadd( right, left );
73

74
    write_imageui( fenc_img, pos, values );
75
}
76

77
/*
78
 * downscale lowres hierarchical motion search image, copy from one image to
79
 * another decimated image.  This kernel is called iteratively to generate all
80
 * of the downscales.
81
 *
82
 * launch dimensions:  [lower_res width, lower_res height]
83
 */
84
kernel void downscale1( read_only image2d_t higher_res, write_only image2d_t lower_res )
85
{
86
    int x = get_global_id( 0 );
87
    int y = get_global_id( 1 );
88
    int2 pos = (int2)(x, y);
89
    int gs = get_global_size( 0 );
90
    uint4 top, bot, values;
91
    top = read_imageui( higher_res, sampler, (int2)(x*2, 2*y) );
92
    bot = read_imageui( higher_res, sampler, (int2)(x*2, 2*y+1) );
93
    values.s0 = rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) );
94

95
    /* these select statements appear redundant, and they should be, but tests break when
96
     * they are not here.  I believe this was caused by a driver bug
97
     */
98
    values.s1 = select( values.s0, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 1 < gs) );
99
    top = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y) );
100
    bot = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y+1) );
101
    values.s2 = select( values.s1, rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ), ( x + 2 < gs ) );
102
    values.s3 = select( values.s2, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 3 < gs ) );
103
    write_imageui( lower_res, pos, (uint4)(values) );
104
}
105

106
/*
107
 * Second copy of downscale kernel, no differences. This is a (no perf loss)
108
 * workaround for a scheduling bug in current Tahiti drivers.  This bug has
109
 * theoretically been fixed in the July 2012 driver release from AMD.
110
 */
111
kernel void downscale2( read_only image2d_t higher_res, write_only image2d_t lower_res )
112
{
113
    int x = get_global_id( 0 );
114
    int y = get_global_id( 1 );
115
    int2 pos = (int2)(x, y);
116
    int gs = get_global_size( 0 );
117
    uint4 top, bot, values;
118
    top = read_imageui( higher_res, sampler, (int2)(x*2, 2*y) );
119
    bot = read_imageui( higher_res, sampler, (int2)(x*2, 2*y+1) );
120
    values.s0 = rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) );
121

122
    // see comment in above function copy
123
    values.s1 = select( values.s0, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 1 < gs) );
124
    top = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y) );
125
    bot = read_imageui( higher_res, sampler, (int2)(x*2+4, 2*y+1) );
126
    values.s2 = select( values.s1, rhadd( rhadd( top.s0, bot.s0 ), rhadd( top.s1, bot.s1 ) ), ( x + 2 < gs ) );
127
    values.s3 = select( values.s2, rhadd( rhadd( top.s2, bot.s2 ), rhadd( top.s3, bot.s3 ) ), ( x + 3 < gs ) );
128
    write_imageui( lower_res, pos, (uint4)(values) );
129
}
130

131
/* OpenCL 1.2 finally added a memset command, but we're not targeting 1.2 */
132
kernel void memset_int16( global int16_t *buf, int16_t value )
133
{
134
    buf[get_global_id( 0 )] = value;
135
}
136

137
Product

Resources

Company