CoCalc -- util.h

05. Matplotlib / ffmpeg-3.0 / libx264 / common / x86 / util.h
⁵²⁸⁶⁷ views
1
/*****************************************************************************
2
 * util.h: x86 inline asm
3
 *****************************************************************************
4
 * Copyright (C) 2008-2016 x264 project
5
 *
6
 * Authors: Fiona Glaser <[email protected]>
7
 *          Loren Merritt <[email protected]>
8
 *
9
 * This program is free software; you can redistribute it and/or modify
10
 * it under the terms of the GNU General Public License as published by
11
 * the Free Software Foundation; either version 2 of the License, or
12
 * (at your option) any later version.
13
 *
14
 * This program is distributed in the hope that it will be useful,
15
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17
 * GNU General Public License for more details.
18
 *
19
 * You should have received a copy of the GNU General Public License
20
 * along with this program; if not, write to the Free Software
21
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
22
 *
23
 * This program is also available under a commercial proprietary license.
24
 * For more information, contact us at licensing@x264.com.
25
 *****************************************************************************/
26

27
#ifndef X264_X86_UTIL_H
28
#define X264_X86_UTIL_H
29

30
#ifdef __SSE__
31
#include <xmmintrin.h>
32

33
#undef M128_ZERO
34
#define M128_ZERO ((__m128){0,0,0,0})
35
#define x264_union128_t x264_union128_sse_t
36
typedef union { __m128 i; uint64_t a[2]; uint32_t b[4]; uint16_t c[8]; uint8_t d[16]; } MAY_ALIAS x264_union128_sse_t;
37
#if HAVE_VECTOREXT
38
typedef uint32_t v4si __attribute__((vector_size (16)));
39
#endif
40
#endif // __SSE__
41

42
#if HAVE_X86_INLINE_ASM && HAVE_MMX
43

44
#define x264_median_mv x264_median_mv_mmx2
45
static ALWAYS_INLINE void x264_median_mv_mmx2( int16_t *dst, int16_t *a, int16_t *b, int16_t *c )
46
{
47
    asm(
48
        "movd   %1,    %%mm0 \n"
49
        "movd   %2,    %%mm1 \n"
50
        "movq   %%mm0, %%mm3 \n"
51
        "movd   %3,    %%mm2 \n"
52
        "pmaxsw %%mm1, %%mm0 \n"
53
        "pminsw %%mm3, %%mm1 \n"
54
        "pminsw %%mm2, %%mm0 \n"
55
        "pmaxsw %%mm1, %%mm0 \n"
56
        "movd   %%mm0, %0    \n"
57
        :"=m"(*(x264_union32_t*)dst)
58
        :"m"(M32( a )), "m"(M32( b )), "m"(M32( c ))
59
    );
60
}
61

62
#define x264_predictor_difference x264_predictor_difference_mmx2
63
static ALWAYS_INLINE int x264_predictor_difference_mmx2( int16_t (*mvc)[2], intptr_t i_mvc )
64
{
65
    int sum;
66
    static const uint64_t pw_1 = 0x0001000100010001ULL;
67

68
    asm(
69
        "pxor    %%mm4, %%mm4 \n"
70
        "test    $1, %1       \n"
71
        "jnz 3f               \n"
72
        "movd    -8(%2,%1,4), %%mm0 \n"
73
        "movd    -4(%2,%1,4), %%mm3 \n"
74
        "psubw   %%mm3, %%mm0 \n"
75
        "jmp 2f               \n"
76
        "3:                   \n"
77
        "dec     %1           \n"
78
        "1:                   \n"
79
        "movq    -8(%2,%1,4), %%mm0 \n"
80
        "psubw   -4(%2,%1,4), %%mm0 \n"
81
        "2:                   \n"
82
        "sub     $2,    %1    \n"
83
        "pxor    %%mm2, %%mm2 \n"
84
        "psubw   %%mm0, %%mm2 \n"
85
        "pmaxsw  %%mm2, %%mm0 \n"
86
        "paddusw %%mm0, %%mm4 \n"
87
        "jg 1b                \n"
88
        "pmaddwd %4, %%mm4    \n"
89
        "pshufw $14, %%mm4, %%mm0 \n"
90
        "paddd   %%mm0, %%mm4 \n"
91
        "movd    %%mm4, %0    \n"
92
        :"=r"(sum), "+r"(i_mvc)
93
        :"r"(mvc), "m"(M64( mvc )), "m"(pw_1)
94
    );
95
    return sum;
96
}
97

98
#define x264_cabac_mvd_sum x264_cabac_mvd_sum_mmx2
99
static ALWAYS_INLINE uint16_t x264_cabac_mvd_sum_mmx2(uint8_t *mvdleft, uint8_t *mvdtop)
100
{
101
    static const uint64_t pb_2    = 0x0202020202020202ULL;
102
    static const uint64_t pb_32   = 0x2020202020202020ULL;
103
    static const uint64_t pb_33   = 0x2121212121212121ULL;
104
    int amvd;
105
    asm(
106
        "movd         %1, %%mm0 \n"
107
        "movd         %2, %%mm1 \n"
108
        "paddusb   %%mm1, %%mm0 \n"
109
        "pminub       %5, %%mm0 \n"
110
        "pxor      %%mm2, %%mm2 \n"
111
        "movq      %%mm0, %%mm1 \n"
112
        "pcmpgtb      %3, %%mm0 \n"
113
        "pcmpgtb      %4, %%mm1 \n"
114
        "psubb     %%mm0, %%mm2 \n"
115
        "psubb     %%mm1, %%mm2 \n"
116
        "movd      %%mm2, %0    \n"
117
        :"=r"(amvd)
118
        :"m"(M16( mvdleft )),"m"(M16( mvdtop )),
119
         "m"(pb_2),"m"(pb_32),"m"(pb_33)
120
    );
121
    return amvd;
122
}
123

124
#define x264_predictor_clip x264_predictor_clip_mmx2
125
static int ALWAYS_INLINE x264_predictor_clip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
126
{
127
    static const uint32_t pd_32 = 0x20;
128
    intptr_t tmp = (intptr_t)mv_limit, mvc_max = i_mvc, i = 0;
129

130
    asm(
131
        "movq       (%2), %%mm5 \n"
132
        "movd         %6, %%mm3 \n"
133
        "psllw        $2, %%mm5 \n" // Convert to subpel
134
        "pshufw $0xEE, %%mm5, %%mm6 \n"
135
        "dec         %k3        \n"
136
        "jz 2f                  \n" // if( i_mvc == 1 ) {do the last iteration}
137
        "punpckldq %%mm3, %%mm3 \n"
138
        "punpckldq %%mm5, %%mm5 \n"
139
        "movd         %7, %%mm4 \n"
140
        "lea   (%0,%3,4), %3    \n"
141
        "1:                     \n"
142
        "movq       (%0), %%mm0 \n"
143
        "add          $8, %0    \n"
144
        "movq      %%mm3, %%mm1 \n"
145
        "pxor      %%mm2, %%mm2 \n"
146
        "pcmpeqd   %%mm0, %%mm1 \n" // mv == pmv
147
        "pcmpeqd   %%mm0, %%mm2 \n" // mv == 0
148
        "por       %%mm1, %%mm2 \n" // (mv == pmv || mv == 0) * -1
149
        "pmovmskb  %%mm2, %k2   \n" // (mv == pmv || mv == 0) * 0xf
150
        "pmaxsw    %%mm5, %%mm0 \n"
151
        "pminsw    %%mm6, %%mm0 \n"
152
        "pand      %%mm4, %%mm2 \n" // (mv0 == pmv || mv0 == 0) * 32
153
        "psrlq     %%mm2, %%mm0 \n" // drop mv0 if it's skipped
154
        "movq      %%mm0, (%5,%4,4) \n"
155
        "and         $24, %k2   \n"
156
        "add          $2, %4    \n"
157
        "add          $8, %k2   \n"
158
        "shr          $4, %k2   \n" // (4-val)>>1
159
        "sub          %2, %4    \n" // +1 for each valid motion vector
160
        "cmp          %3, %0    \n"
161
        "jl 1b                  \n"
162
        "jg 3f                  \n" // if( i == i_mvc - 1 ) {do the last iteration}
163

164
        /* Do the last iteration */
165
        "2:                     \n"
166
        "movd       (%0), %%mm0 \n"
167
        "pxor      %%mm2, %%mm2 \n"
168
        "pcmpeqd   %%mm0, %%mm3 \n"
169
        "pcmpeqd   %%mm0, %%mm2 \n"
170
        "por       %%mm3, %%mm2 \n"
171
        "pmovmskb  %%mm2, %k2   \n"
172
        "pmaxsw    %%mm5, %%mm0 \n"
173
        "pminsw    %%mm6, %%mm0 \n"
174
        "movd      %%mm0, (%5,%4,4) \n"
175
        "inc          %4        \n"
176
        "and          $1, %k2   \n"
177
        "sub          %2, %4    \n" // output += !(mv == pmv || mv == 0)
178
        "3:                     \n"
179
        :"+r"(mvc), "=m"(M64( dst )), "+r"(tmp), "+r"(mvc_max), "+r"(i)
180
        :"r"(dst), "g"(pmv), "m"(pd_32), "m"(M64( mvc ))
181
    );
182
    return i;
183
}
184

185
/* Same as the above, except we do (mv + 2) >> 2 on the input. */
186
#define x264_predictor_roundclip x264_predictor_roundclip_mmx2
187
static int ALWAYS_INLINE x264_predictor_roundclip_mmx2( int16_t (*dst)[2], int16_t (*mvc)[2], int i_mvc, int16_t mv_limit[2][2], uint32_t pmv )
188
{
189
    static const uint64_t pw_2 = 0x0002000200020002ULL;
190
    static const uint32_t pd_32 = 0x20;
191
    intptr_t tmp = (intptr_t)mv_limit, mvc_max = i_mvc, i = 0;
192

193
    asm(
194
        "movq       (%2), %%mm5 \n"
195
        "movq         %6, %%mm7 \n"
196
        "movd         %7, %%mm3 \n"
197
        "pshufw $0xEE, %%mm5, %%mm6 \n"
198
        "dec         %k3        \n"
199
        "jz 2f                  \n"
200
        "punpckldq %%mm3, %%mm3 \n"
201
        "punpckldq %%mm5, %%mm5 \n"
202
        "movd         %8, %%mm4 \n"
203
        "lea   (%0,%3,4), %3    \n"
204
        "1:                     \n"
205
        "movq       (%0), %%mm0 \n"
206
        "add          $8, %0    \n"
207
        "paddw     %%mm7, %%mm0 \n"
208
        "psraw        $2, %%mm0 \n"
209
        "movq      %%mm3, %%mm1 \n"
210
        "pxor      %%mm2, %%mm2 \n"
211
        "pcmpeqd   %%mm0, %%mm1 \n"
212
        "pcmpeqd   %%mm0, %%mm2 \n"
213
        "por       %%mm1, %%mm2 \n"
214
        "pmovmskb  %%mm2, %k2   \n"
215
        "pmaxsw    %%mm5, %%mm0 \n"
216
        "pminsw    %%mm6, %%mm0 \n"
217
        "pand      %%mm4, %%mm2 \n"
218
        "psrlq     %%mm2, %%mm0 \n"
219
        "movq      %%mm0, (%5,%4,4) \n"
220
        "and         $24, %k2   \n"
221
        "add          $2, %4    \n"
222
        "add          $8, %k2   \n"
223
        "shr          $4, %k2   \n"
224
        "sub          %2, %4    \n"
225
        "cmp          %3, %0    \n"
226
        "jl 1b                  \n"
227
        "jg 3f                  \n"
228

229
        /* Do the last iteration */
230
        "2:                     \n"
231
        "movd       (%0), %%mm0 \n"
232
        "paddw     %%mm7, %%mm0 \n"
233
        "psraw        $2, %%mm0 \n"
234
        "pxor      %%mm2, %%mm2 \n"
235
        "pcmpeqd   %%mm0, %%mm3 \n"
236
        "pcmpeqd   %%mm0, %%mm2 \n"
237
        "por       %%mm3, %%mm2 \n"
238
        "pmovmskb  %%mm2, %k2   \n"
239
        "pmaxsw    %%mm5, %%mm0 \n"
240
        "pminsw    %%mm6, %%mm0 \n"
241
        "movd      %%mm0, (%5,%4,4) \n"
242
        "inc          %4        \n"
243
        "and          $1, %k2   \n"
244
        "sub          %2, %4    \n"
245
        "3:                     \n"
246
        :"+r"(mvc), "=m"(M64( dst )), "+r"(tmp), "+r"(mvc_max), "+r"(i)
247
        :"r"(dst), "m"(pw_2), "g"(pmv), "m"(pd_32), "m"(M64( mvc ))
248
    );
249
    return i;
250
}
251

252
#endif
253

254
#endif
255

256
Product

Resources

Company