/*
 * CoCalc -- x264-cl.h
 * 05. Matplotlib / ffmpeg-3.0 / libx264 / common / opencl / x264-cl.h
 * 58736 views
 */
/* Enable the extended 32-bit integer atomics on local memory. */
#pragma OPENCL EXTENSION cl_khr_local_int32_extended_atomics : enable

/* File-scope image sampler: unnormalized (pixel) coordinates, out-of-range
 * coordinates clamped to the image edge, nearest-texel filtering. */
constant sampler_t sampler = CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP_TO_EDGE | CLK_FILTER_NEAREST;
/* 7.18.1.1  Exact-width integer types */
typedef signed char     int8_t;
typedef unsigned char   uint8_t;
typedef short           int16_t;
typedef unsigned short  uint16_t;
typedef int             int32_t;
typedef unsigned        uint32_t;

/* pixel  : one 8-bit sample.
 * sum_t  : one 16-bit accumulator lane.
 * sum2_t : 32-bit word wide enough to hold two sum_t lanes side by side. */
typedef uint8_t  pixel;
typedef uint16_t sum_t;
typedef uint32_t sum2_t;
/* LOWRES_COST_MASK selects the low LOWRES_COST_SHIFT (14) bits of a value. */
#define LOWRES_COST_MASK ((1<<14)-1)
#define LOWRES_COST_SHIFT 14
#define COST_MAX (1<<28)

/* Largest value of an 8-bit pixel. */
#define PIXEL_MAX 255
/* Width in bits of one sum_t lane (assumes 8-bit bytes). */
#define BITS_PER_SUM (8 * sizeof(sum_t))

/* Constants for offsets into frame statistics buffer */
#define COST_EST    0
#define COST_EST_AQ 1
#define INTRA_MBS   2
/*
 * If y is strictly less than x, assign y to x and b to a.
 *
 * Expands to a bare `if` statement (not an expression): use it as a
 * statement of its own and never follow it with `else`. y is evaluated
 * twice when the assignment happens, so pass side-effect-free arguments.
 */
#define COPY2_IF_LT( x, y, a, b )\
    if((y)<(x))\
    {\
        (x) = (y);\
        (a) = (b);\
    }
/* Four unit-distance, axis-aligned 2-D offsets, in the order
 * (0,-1), (-1,0), (1,0), (0,1). The name suggests the points of a
 * diamond search pattern -- confirm against the kernels that index it. */
constant int2 dia_offs[4] =
{
    {0, -1}, {-1, 0}, {1, 0}, {0, 1},
};
/* Saturate x to the pixel range [0, PIXEL_MAX] and narrow it to pixel. */
inline pixel x264_clip_pixel( int x )
{
    return (pixel) clamp( x, (int) 0, (int) PIXEL_MAX );
}
/*
 * Component-wise median of three short2 vectors, widened to int2.
 * Uses median(a, b, c) = max( min(a, b), min( max(a, b), c ) ),
 * applied independently to the x and y components.
 */
inline int2 x264_median_mv( short2 a, short2 b, short2 c )
{
    short2 lower_ab = min( a, b );
    short2 upper_ab_capped = min( max( a, b ), c );
    return convert_int2( max( lower_ab, upper_ab_capped ) );
}
/*
 * Absolute value of the two BITS_PER_SUM-bit lanes packed in a, computed
 * in one pass.
 *
 * s holds all ones in every lane whose top (sign) bit is set and zero in the
 * others. (a + s) ^ s then negates exactly those lanes: adding all ones is
 * "subtract one" and the XOR is "invert bits", i.e. two's-complement negation.
 * When the low lane is negative, the carry out of the addition propagates
 * into the high lane, undoing the borrow that a negative low lane takes from
 * the high lane when values are packed as lo + (hi << BITS_PER_SUM).
 */
inline sum2_t abs2( sum2_t a )
{
    sum2_t s = ((a >> (BITS_PER_SUM - 1)) & (((sum2_t)1 << BITS_PER_SUM) + 1)) * ((sum_t)-1);
    return (a + s) ^ s;
}
/*
 * 4-point Hadamard butterfly on sum2_t values:
 *   d0 = s0 + s1 + s2 + s3      d1 = s0 - s1 + s2 - s3
 *   d2 = s0 + s1 - s2 - s3      d3 = s0 - s1 - s2 + s3
 *
 * All sources are read into temporaries before any destination is written,
 * so destinations may alias sources. Source arguments are parenthesized so
 * that expression arguments (e.g. `a - b`) keep their meaning. The expansion
 * declares locals t0..t3, so do not pass arguments with those names.
 */
#define HADAMARD4( d0, d1, d2, d3, s0, s1, s2, s3 ) {\
    sum2_t t0 = (s0) + (s1);\
    sum2_t t1 = (s0) - (s1);\
    sum2_t t2 = (s2) + (s3);\
    sum2_t t3 = (s2) - (s3);\
    d0 = t0 + t2;\
    d2 = t0 - t2;\
    d1 = t1 + t3;\
    d3 = t1 - t3;\
}
/*
 * Vector form of the 4-point Hadamard butterfly: same arithmetic as
 * HADAMARD4, but the temporaries are int2, so every operation is applied
 * to both vector components at once.
 *
 * All sources are read before any destination is written. Source arguments
 * are parenthesized so that expression arguments keep their meaning. The
 * expansion declares locals t0..t3, so do not pass arguments with those names.
 */
#define HADAMARD4V( d0, d1, d2, d3, s0, s1, s2, s3 ) {\
    int2 t0 = (s0) + (s1);\
    int2 t1 = (s0) - (s1);\
    int2 t2 = (s2) + (s3);\
    int2 t3 = (s2) - (s3);\
    d0 = t0 + t2;\
    d2 = t0 - t2;\
    d1 = t1 + t3;\
    d3 = t1 - t3;\
}
/*
 * Defines a function `name` that returns the sum of absolute Hadamard-
 * transformed differences between two 8-wide, 4-high pixel blocks.
 *
 *   q1, q2           qualifiers placed before the two `pixel *` parameter
 *                    types (they may differ per pointer)
 *   pix1, i_pix1     first block and its row stride, in pixels
 *   pix2, i_pix2     second block and its row stride, in pixels
 *
 * Two 4x4 transforms run at once: for each row, the difference of column k
 * (k = 0..3) goes in the low sum_t lane of a sum2_t and the difference of
 * column k+4 in the high lane. HADAMARD4 is applied along rows, then along
 * columns; abs2() takes both lanes' magnitudes, and the return statement adds
 * the low-lane total to the high-lane total and halves the result.
 */
#define SATD_C_8x4_Q( name, q1, q2 )\
    int name( q1 pixel *pix1, int i_pix1, q2 pixel *pix2, int i_pix2 )\
    {\
        sum2_t tmp[4][4];\
        sum2_t a0, a1, a2, a3;\
        sum2_t sum = 0;\
        for( int i = 0; i < 4; i++, pix1 += i_pix1, pix2 += i_pix2 )\
        {\
            a0 = (pix1[0] - pix2[0]) + ((sum2_t)(pix1[4] - pix2[4]) << BITS_PER_SUM);\
            a1 = (pix1[1] - pix2[1]) + ((sum2_t)(pix1[5] - pix2[5]) << BITS_PER_SUM);\
            a2 = (pix1[2] - pix2[2]) + ((sum2_t)(pix1[6] - pix2[6]) << BITS_PER_SUM);\
            a3 = (pix1[3] - pix2[3]) + ((sum2_t)(pix1[7] - pix2[7]) << BITS_PER_SUM);\
            HADAMARD4( tmp[i][0], tmp[i][1], tmp[i][2], tmp[i][3], a0, a1, a2, a3 );\
        }\
        for( int i = 0; i < 4; i++ )\
        {\
            HADAMARD4( a0, a1, a2, a3, tmp[0][i], tmp[1][i], tmp[2][i], tmp[3][i] );\
            sum += abs2( a0 ) + abs2( a1 ) + abs2( a2 ) + abs2( a3 );\
        }\
        return (((sum_t)sum) + (sum>>BITS_PER_SUM)) >> 1;\
    }
/*
 * Utility function to perform a parallel sum reduction of an array of integers
 *
 *   value  this work-item's contribution
 *   x      this work-item's slot in `array`; must be unique per work-item and
 *          lie in [0, get_local_size(0))
 *   array  local scratch buffer with at least get_local_size(0) entries
 *
 * Returns the sum of `value` over the work-group, to every work-item.
 *
 * Every work-item of the work-group must call this function, because it
 * executes barriers. The loop count depends only on get_local_size(0), so
 * all work-items reach the same barriers.
 *
 * A barrier follows every reduction step. Skipping it for the last steps is
 * only valid if the remaining work-items happen to execute in lockstep, which
 * OpenCL does not guarantee, and it would leave the final read of array[0]
 * unsynchronized for the other work-items.
 *
 * The split point is rounded up, so work-group sizes that are not a power of
 * two are summed completely. For power-of-two sizes the additions are the
 * same as with plain halving.
 *
 * The scratch buffer is still being read when this function returns. A caller
 * that reuses `array` right away must place its own barrier first.
 */
int parallel_sum( int value, int x, volatile local int *array )
{
    array[x] = value;
    barrier( CLK_LOCAL_MEM_FENCE );

    int count = (int) get_local_size( 0 );

    while( count > 1 )
    {
        /* Upper half [half, count) is folded onto [0, count - half). */
        int half = (count + 1) >> 1;

        if( x + half < count )
            array[x] += array[x + half];

        barrier( CLK_LOCAL_MEM_FENCE );

        count = half;
    }

    return array[0];
}
/*
 * Approximate cost of a two-component difference `mvd` (by its name, a
 * motion-vector difference; the unit of the result is not stated here).
 *
 * Per component v: round( 2 * log2(v + 1) + 0.718 + (v != 0) ).
 * The two component costs are added and the sum is truncated to int.
 */
int mv_cost( uint2 mvd )
{
    float2 mvdf = (float2)(mvd.x, mvd.y) + 1.0f;
    float2 cost = round( log2(mvdf) * 2.0f + 0.718f + (float2)(!!mvd.x, !!mvd.y) );
    return (int) (cost.x + cost.y);
}

/* Page footer: Product / Resources / Company */