/* CoCalc -- swscale
 * 05. Matplotlib / ffmpeg-3.0 / libswscale / x86 / swscale_template.c
 */
1
/*
 * Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
20

21
#include <stdint.h>
22

23
#include "libavutil/x86/asm.h"
24
#include "libswscale/swscale_internal.h"
25

26
/*
 * Drop any definitions left over from an earlier use of these names before
 * (re)defining the store helpers below.
 */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef MOVNTQ2
#undef PREFETCH

/*
 * Store-instruction selection.
 *
 * When COMPILE_TEMPLATE_MMXEXT is non-zero the helpers expand to "movntq";
 * otherwise they expand to a plain "movq".
 *
 *   REAL_MOVNTQ(a,b)  stringifies both operands into one complete
 *                     instruction line.
 *   MOVNTQ2           is only the mnemonic; the caller appends the operands.
 *   MOVNTQ(a,b)       forwards to REAL_MOVNTQ so that macro arguments are
 *                     expanded before they are stringified.
 */
#if COMPILE_TEMPLATE_MMXEXT
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#define MOVNTQ2 "movntq "
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#define MOVNTQ2 "movq "
#endif
#define MOVNTQ(a,b)  REAL_MOVNTQ(a,b)
40

41
#if !COMPILE_TEMPLATE_MMXEXT
42
static av_always_inline void
43
dither_8to16(const uint8_t *srcDither, int rot)
44
{
45
    if (rot) {
46
        __asm__ volatile("pxor      %%mm0, %%mm0\n\t"
47
                         "movq       (%0), %%mm3\n\t"
48
                         "movq      %%mm3, %%mm4\n\t"
49
                         "psrlq       $24, %%mm3\n\t"
50
                         "psllq       $40, %%mm4\n\t"
51
                         "por       %%mm4, %%mm3\n\t"
52
                         "movq      %%mm3, %%mm4\n\t"
53
                         "punpcklbw %%mm0, %%mm3\n\t"
54
                         "punpckhbw %%mm0, %%mm4\n\t"
55
                         :: "r"(srcDither)
56
                         );
57
    } else {
58
        __asm__ volatile("pxor      %%mm0, %%mm0\n\t"
59
                         "movq       (%0), %%mm3\n\t"
60
                         "movq      %%mm3, %%mm4\n\t"
61
                         "punpcklbw %%mm0, %%mm3\n\t"
62
                         "punpckhbw %%mm0, %%mm4\n\t"
63
                         :: "r"(srcDither)
64
                         );
65
    }
66
}
67
#endif
68

69
static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize,
70
                           const int16_t **src, uint8_t *dest, int dstW,
71
                           const uint8_t *dither, int offset)
72
{
73
    dither_8to16(dither, offset);
74
    filterSize--;
75
    __asm__ volatile(
76
        "movd %0, %%mm1\n\t"
77
        "punpcklwd %%mm1, %%mm1\n\t"
78
        "punpckldq %%mm1, %%mm1\n\t"
79
        "psllw        $3, %%mm1\n\t"
80
        "paddw     %%mm1, %%mm3\n\t"
81
        "paddw     %%mm1, %%mm4\n\t"
82
        "psraw        $4, %%mm3\n\t"
83
        "psraw        $4, %%mm4\n\t"
84
        ::"m"(filterSize)
85
     );
86

87
    __asm__ volatile(\
88
        "movq    %%mm3, %%mm6\n\t"
89
        "movq    %%mm4, %%mm7\n\t"
90
        "movl %3, %%ecx\n\t"
91
        "mov                                 %0, %%"REG_d"  \n\t"\
92
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
93
        ".p2align                             4             \n\t" /* FIXME Unroll? */\
94
        "1:                                                 \n\t"\
95
        "movq                      8(%%"REG_d"), %%mm0      \n\t" /* filterCoeff */\
96
        "movq                (%%"REG_S", %%"REG_c", 2), %%mm2      \n\t" /* srcData */\
97
        "movq               8(%%"REG_S", %%"REG_c", 2), %%mm5      \n\t" /* srcData */\
98
        "add                                $16, %%"REG_d"  \n\t"\
99
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
100
        "test                         %%"REG_S", %%"REG_S"  \n\t"\
101
        "pmulhw                           %%mm0, %%mm2      \n\t"\
102
        "pmulhw                           %%mm0, %%mm5      \n\t"\
103
        "paddw                            %%mm2, %%mm3      \n\t"\
104
        "paddw                            %%mm5, %%mm4      \n\t"\
105
        " jnz                                1b             \n\t"\
106
        "psraw                               $3, %%mm3      \n\t"\
107
        "psraw                               $3, %%mm4      \n\t"\
108
        "packuswb                         %%mm4, %%mm3      \n\t"
109
        MOVNTQ2 "                         %%mm3, (%1, %%"REG_c")\n\t"
110
        "add                          $8, %%"REG_c"         \n\t"\
111
        "cmp                          %2, %%"REG_c"         \n\t"\
112
        "movq    %%mm6, %%mm3\n\t"
113
        "movq    %%mm7, %%mm4\n\t"
114
        "mov                                 %0, %%"REG_d"  \n\t"\
115
        "mov                        (%%"REG_d"), %%"REG_S"  \n\t"\
116
        "jb                                  1b             \n\t"\
117
        :: "g" (filter),
118
           "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
119
        : "%"REG_d, "%"REG_S, "%"REG_c
120
    );
121
}
122

123
/*
 * YSCALEYUV2PACKEDX_UV
 *
 * Opens an `__asm__ volatile(` statement and emits the chroma part of the
 * vertical filter.  The statement is left open: further instruction strings
 * and finally YSCALEYUV2PACKEDX_END must follow.
 *
 * Operands used: %0 (base for the *_OFFSET displacements) and %6 (added to
 * the U row pointer to reach the V samples).
 *
 * REG_a is zeroed and used as the sample index; label "1:" is the head of
 * the per-group loop that the WRITE* macros jump back to.  Label "2:" loops
 * over 16-byte filter entries (row pointer at +0, coefficients at +8) until
 * a NULL row pointer is read.
 *
 * Result: mm3 = accumulated U, mm4 = accumulated V, both seeded from
 * VROUNDER_OFFSET(%0).  Uses mm0, mm2, mm5, REG_d and REG_S as scratch.
 */
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
        "xor                   %%"REG_a", %%"REG_a"     \n\t"\
        ".p2align                      4                \n\t"\
        "nop                                            \n\t"\
        "1:                                             \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
        "movq      "VROUNDER_OFFSET"(%0), %%mm3         \n\t"\
        "movq                      %%mm3, %%mm4         \n\t"\
        ".p2align                      4                \n\t"\
        "2:                                             \n\t"\
        "movq               8(%%"REG_d"), %%mm0         \n\t" /* filterCoeff */\
        "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* UsrcData */\
        "add                          %6, %%"REG_S"     \n\t" \
        "movq     (%%"REG_S", %%"REG_a"), %%mm5         \n\t" /* VsrcData */\
        "add                         $16, %%"REG_d"     \n\t"\
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
        "pmulhw                    %%mm0, %%mm2         \n\t"\
        "pmulhw                    %%mm0, %%mm5         \n\t"\
        "paddw                     %%mm2, %%mm3         \n\t"\
        "paddw                     %%mm5, %%mm4         \n\t"\
        "test                  %%"REG_S", %%"REG_S"     \n\t"\
        " jnz                         2b                \n\t"
147

148
/*
 * YSCALEYUV2PACKEDX_YA(offset, coeff, src1, src2, dst1, dst2)
 *
 * Instruction strings for one vertical filter pass over eight 16-bit samples
 * (two groups of four, at REG_a*2 and REG_a*2 + 8 from the row pointer).
 * Must be used inside an already open asm statement whose operand %0 is the
 * base for the *_OFFSET displacements.
 *
 *   offset      string displacement of the filter entry list relative to %0
 *   coeff       MMX register receiving the coefficients of each entry
 *   src1, src2  MMX scratch registers for the two sample groups
 *   dst1, dst2  MMX accumulators; seeded from VROUNDER_OFFSET(%0)
 *
 * The entry list has the same layout as in YSCALEYUV2PACKEDX_UV (pointer at
 * +0, coefficients at +8, 16-byte stride, NULL pointer terminates).
 * Clobbers REG_d and REG_S and defines a local label "2:".
 */
#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea                "offset"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), "#dst1"       \n\t"\
    "movq                    "#dst1", "#dst2"       \n\t"\
    ".p2align                      4                \n\t"\
    "2:                                             \n\t"\
    "movq               8(%%"REG_d"), "#coeff"      \n\t" /* filterCoeff */\
    "movq  (%%"REG_S", %%"REG_a", 2), "#src1"       \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2"       \n\t" /* Y2srcData */\
    "add                         $16, %%"REG_d"            \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pmulhw                 "#coeff", "#src1"       \n\t"\
    "pmulhw                 "#coeff", "#src2"       \n\t"\
    "paddw                   "#src1", "#dst1"       \n\t"\
    "paddw                   "#src2", "#dst2"       \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    " jnz                         2b                \n\t"
166

167
/*
 * YSCALEYUV2PACKEDX
 *
 * Chroma pass followed by the luma pass of the vertical filter.  Opens the
 * asm statement (via YSCALEYUV2PACKEDX_UV) and leaves mm3 = U, mm4 = V,
 * mm1 = first four luma words, mm7 = next four luma words.
 */
#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7)
170

171
/*
 * YSCALEYUV2PACKEDX_END
 *
 * Closes the asm statement opened by YSCALEYUV2PACKEDX_UV or
 * YSCALEYUV2PACKEDX_ACCURATE_UV by supplying its operand and clobber lists.
 * The expanding function must have `c`, `dummy`, `dest`, `dstW_reg` and
 * `uv_off` in scope.
 *
 * Input operands:
 *   %0        &c->redDither (base address for the *_OFFSET displacements)
 *   %1 - %3   dummy (placeholders that keep the numbering stable)
 *   %4        dest
 *   %5        dstW_reg
 *   %6        uv_off
 * Clobbers: REG_a, REG_d, REG_S.
 */
#define YSCALEYUV2PACKEDX_END                     \
        :: "r" (&c->redDither),                   \
            "m" (dummy), "m" (dummy), "m" (dummy),\
            "r" (dest), "m" (dstW_reg), "m"(uv_off) \
            NAMED_CONSTRAINTS_ADD(bF8,bFC) \
        : "%"REG_a, "%"REG_d, "%"REG_S            \
    );
178

179
/*
 * YSCALEYUV2PACKEDX_ACCURATE_UV
 *
 * Opens an `__asm__ volatile(` statement and emits the chroma part of the
 * vertical filter using 32-bit accumulation.  The statement is left open and
 * must be closed with YSCALEYUV2PACKEDX_END.
 *
 * Each pass of loop "2:" consumes one filter entry of APCK_SIZE bytes that
 * carries two row pointers (at +0 and +APCK_PTR2) and a coefficient group at
 * +APCK_COEF.  Samples of the two rows are interleaved (punpck*wd) and
 * multiplied-and-added with pmaddwd, so two taps are accumulated at once into
 * 32-bit lanes (mm4/mm5 for U, mm6/mm7 for V).  The loop ends when the row
 * pointer of the next entry is NULL.
 *
 * Afterwards the sums are shifted right by 16, packed to words, the value at
 * VROUNDER_OFFSET(%0) is added and the results are stored to U_TEMP(%0) and
 * V_TEMP(%0).
 *
 * Operands used: %0 (base address) and %6 (added to the U row pointer to
 * reach the V samples).  REG_a is zeroed and serves as sample index; label
 * "1:" is the head of the per-group loop.
 */
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a"                       \n\t"\
        ".p2align                      4                \n\t"\
        "nop                                            \n\t"\
        "1:                                             \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d"     \n\t"\
        "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
        "pxor                      %%mm4, %%mm4         \n\t"\
        "pxor                      %%mm5, %%mm5         \n\t"\
        "pxor                      %%mm6, %%mm6         \n\t"\
        "pxor                      %%mm7, %%mm7         \n\t"\
        ".p2align                      4                \n\t"\
        "2:                                             \n\t"\
        "movq     (%%"REG_S", %%"REG_a"), %%mm0         \n\t" /* UsrcData */\
        "add                          %6, %%"REG_S"      \n\t" \
        "movq     (%%"REG_S", %%"REG_a"), %%mm2         \n\t" /* VsrcData */\
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
        "movq     (%%"REG_S", %%"REG_a"), %%mm1         \n\t" /* UsrcData */\
        "movq                      %%mm0, %%mm3         \n\t"\
        "punpcklwd                 %%mm1, %%mm0         \n\t"\
        "punpckhwd                 %%mm1, %%mm3         \n\t"\
        "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1         \n\t" /* filterCoeff */\
        "pmaddwd                   %%mm1, %%mm0         \n\t"\
        "pmaddwd                   %%mm1, %%mm3         \n\t"\
        "paddd                     %%mm0, %%mm4         \n\t"\
        "paddd                     %%mm3, %%mm5         \n\t"\
        "add                          %6, %%"REG_S"      \n\t" \
        "movq     (%%"REG_S", %%"REG_a"), %%mm3         \n\t" /* VsrcData */\
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
        "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
        "test                  %%"REG_S", %%"REG_S"     \n\t"\
        "movq                      %%mm2, %%mm0         \n\t"\
        "punpcklwd                 %%mm3, %%mm2         \n\t"\
        "punpckhwd                 %%mm3, %%mm0         \n\t"\
        "pmaddwd                   %%mm1, %%mm2         \n\t"\
        "pmaddwd                   %%mm1, %%mm0         \n\t"\
        "paddd                     %%mm2, %%mm6         \n\t"\
        "paddd                     %%mm0, %%mm7         \n\t"\
        " jnz                         2b                \n\t"\
        "psrad                       $16, %%mm4         \n\t"\
        "psrad                       $16, %%mm5         \n\t"\
        "psrad                       $16, %%mm6         \n\t"\
        "psrad                       $16, %%mm7         \n\t"\
        "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
        "packssdw                  %%mm5, %%mm4         \n\t"\
        "packssdw                  %%mm7, %%mm6         \n\t"\
        "paddw                     %%mm0, %%mm4         \n\t"\
        "paddw                     %%mm0, %%mm6         \n\t"\
        "movq                      %%mm4, "U_TEMP"(%0)  \n\t"\
        "movq                      %%mm6, "V_TEMP"(%0)  \n\t"
230

231
/*
 * YSCALEYUV2PACKEDX_ACCURATE_YA(offset)
 *
 * Instruction strings for one 32-bit-accumulating vertical filter pass over
 * eight 16-bit samples (two groups of four, at REG_a*2 and REG_a*2 + 8 from
 * the row pointer).  Must be used inside an already open asm statement whose
 * operand %0 is the base for the *_OFFSET / *_TEMP displacements.
 *
 *   offset   string displacement of the filter entry list relative to %0
 *
 * The entry layout and the pmaddwd scheme are the same as in
 * YSCALEYUV2PACKEDX_ACCURATE_UV (two row pointers and one coefficient group
 * per APCK_SIZE-byte entry; NULL next pointer terminates).
 *
 * Result: mm1 = first four filtered words, mm7 = next four, each after
 * >> 16, packing and adding VROUNDER_OFFSET(%0).  As a final step the values
 * stored at U_TEMP(%0) and V_TEMP(%0) are reloaded into mm3 and mm4.
 * All eight MMX registers, REG_d and REG_S are overwritten.
 */
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea                "offset"(%0), %%"REG_d"     \n\t"\
    "mov                 (%%"REG_d"), %%"REG_S"     \n\t"\
    "pxor                      %%mm1, %%mm1         \n\t"\
    "pxor                      %%mm5, %%mm5         \n\t"\
    "pxor                      %%mm7, %%mm7         \n\t"\
    "pxor                      %%mm6, %%mm6         \n\t"\
    ".p2align                      4                \n\t"\
    "2:                                             \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm0         \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2         \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "movq  (%%"REG_S", %%"REG_a", 2), %%mm4         \n\t" /* Y1srcData */\
    "movq                      %%mm0, %%mm3         \n\t"\
    "punpcklwd                 %%mm4, %%mm0         \n\t"\
    "punpckhwd                 %%mm4, %%mm3         \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4         \n\t" /* filterCoeff */\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm3         \n\t"\
    "paddd                     %%mm0, %%mm1         \n\t"\
    "paddd                     %%mm3, %%mm5         \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3         \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S"     \n\t"\
    "add           $"STR(APCK_SIZE)", %%"REG_d"     \n\t"\
    "test                  %%"REG_S", %%"REG_S"     \n\t"\
    "movq                      %%mm2, %%mm0         \n\t"\
    "punpcklwd                 %%mm3, %%mm2         \n\t"\
    "punpckhwd                 %%mm3, %%mm0         \n\t"\
    "pmaddwd                   %%mm4, %%mm2         \n\t"\
    "pmaddwd                   %%mm4, %%mm0         \n\t"\
    "paddd                     %%mm2, %%mm7         \n\t"\
    "paddd                     %%mm0, %%mm6         \n\t"\
    " jnz                         2b                \n\t"\
    "psrad                       $16, %%mm1         \n\t"\
    "psrad                       $16, %%mm5         \n\t"\
    "psrad                       $16, %%mm7         \n\t"\
    "psrad                       $16, %%mm6         \n\t"\
    "movq      "VROUNDER_OFFSET"(%0), %%mm0         \n\t"\
    "packssdw                  %%mm5, %%mm1         \n\t"\
    "packssdw                  %%mm6, %%mm7         \n\t"\
    "paddw                     %%mm0, %%mm1         \n\t"\
    "paddw                     %%mm0, %%mm7         \n\t"\
    "movq               "U_TEMP"(%0), %%mm3         \n\t"\
    "movq               "V_TEMP"(%0), %%mm4         \n\t"
275

276
/*
 * YSCALEYUV2PACKEDX_ACCURATE
 *
 * 32-bit-accumulating counterpart of YSCALEYUV2PACKEDX: chroma pass followed
 * by the luma pass.  Opens the asm statement and leaves mm3 = U, mm4 = V
 * (reloaded from U_TEMP / V_TEMP), mm1 = first four luma words and
 * mm7 = next four luma words.
 */
#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
279

280
/*
 * YSCALEYUV2RGBX
 *
 * Instruction strings converting filtered YUV words to packed 8-bit colour
 * components.  Must be used inside an open asm statement whose operand %0 is
 * the base for the *_OFFSET / *_COEFF displacements.
 *
 * In : mm1 = Y (first four), mm7 = Y (next four), mm3 = U, mm4 = V.
 * Out: mm2 = B, mm4 = G, mm5 = R, each holding eight unsigned-saturated
 *      bytes.  mm0, mm3 and mm6 are used as scratch.
 *
 * Each chroma-derived term is duplicated (punpck?wd reg, reg) so that one
 * chroma word is applied to two neighbouring luma words.
 */
#define YSCALEYUV2RGBX \
    "psubw  "U_OFFSET"(%0), %%mm3       \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"(%0), %%mm4       \n\t" /* (V-128)8*/\
    "movq            %%mm3, %%mm2       \n\t" /* (U-128)8*/\
    "movq            %%mm4, %%mm5       \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3       \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4       \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2       \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5       \n\t"\
    "psubw  "Y_OFFSET"(%0), %%mm1       \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"(%0), %%mm7       \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"(%0), %%mm1       \n\t"\
    "pmulhw  "Y_COEFF"(%0), %%mm7       \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw           %%mm3, %%mm4       \n\t"\
    "movq            %%mm2, %%mm0       \n\t"\
    "movq            %%mm5, %%mm6       \n\t"\
    "movq            %%mm4, %%mm3       \n\t"\
    "punpcklwd       %%mm2, %%mm2       \n\t"\
    "punpcklwd       %%mm5, %%mm5       \n\t"\
    "punpcklwd       %%mm4, %%mm4       \n\t"\
    "paddw           %%mm1, %%mm2       \n\t"\
    "paddw           %%mm1, %%mm5       \n\t"\
    "paddw           %%mm1, %%mm4       \n\t"\
    "punpckhwd       %%mm0, %%mm0       \n\t"\
    "punpckhwd       %%mm6, %%mm6       \n\t"\
    "punpckhwd       %%mm3, %%mm3       \n\t"\
    "paddw           %%mm7, %%mm0       \n\t"\
    "paddw           %%mm7, %%mm6       \n\t"\
    "paddw           %%mm7, %%mm3       \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb        %%mm0, %%mm2       \n\t"\
    "packuswb        %%mm6, %%mm5       \n\t"\
    "packuswb        %%mm3, %%mm4       \n\t"
315

316
/*
 * REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
 *
 * Interleaves four byte planes into eight 4-byte pixels and stores the
 * resulting 32 bytes at dst + index*4 with MOVNTQ.  Then it advances `index`
 * by 8, compares it against `dstw` and jumps back to label "1" while index
 * is below dstw (unsigned).
 *
 *   dst, index     register operands used in the store address
 *   dstw           string naming the operand compared against index
 *   b, g, r, a     MMX registers holding the four byte planes, listed from
 *                  the lowest to the highest byte of each stored pixel
 *                  (b and r are overwritten)
 *   q0, q2, q3, t  MMX scratch registers
 *
 * WRITEBGR32 forwards to REAL_WRITEBGR32 so that macro arguments are
 * expanded before being stringified.
 */
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq       "#b", "#q2"     \n\t" /* B */\
    "movq       "#r", "#t"      \n\t" /* R */\
    "punpcklbw  "#g", "#b"      \n\t" /* GBGBGBGB 0 */\
    "punpcklbw  "#a", "#r"      \n\t" /* ARARARAR 0 */\
    "punpckhbw  "#g", "#q2"     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw  "#a", "#t"      \n\t" /* ARARARAR 2 */\
    "movq       "#b", "#q0"     \n\t" /* GBGBGBGB 0 */\
    "movq      "#q2", "#q3"     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd  "#r", "#q0"     \n\t" /* ARGBARGB 0 */\
    "punpckhwd  "#r", "#b"      \n\t" /* ARGBARGB 1 */\
    "punpcklwd  "#t", "#q2"     \n\t" /* ARGBARGB 2 */\
    "punpckhwd  "#t", "#q3"     \n\t" /* ARGBARGB 3 */\
\
    MOVNTQ(   q0,   (dst, index, 4))\
    MOVNTQ(    b,  8(dst, index, 4))\
    MOVNTQ(   q2, 16(dst, index, 4))\
    MOVNTQ(   q3, 24(dst, index, 4))\
\
    "add      $8, "#index"      \n\t"\
    "cmp  "dstw", "#index"      \n\t"\
    " jb      1b                \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)  REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
339

340
/**
 * Vertical filter + YUV->RGB conversion writing 4 bytes per pixel, built on
 * the 32-bit-accumulating filter macros (YSCALEYUV2PACKEDX_ACCURATE*).
 *
 * The whole body is a single asm statement assembled from the macros above;
 * it reads its filter tables and coefficients relative to &c->redDither.
 * Of the parameters only `c`, `dest` and `dstW` are referenced directly in
 * this function body.
 *
 * When CONFIG_SWSCALE_ALPHA is set and c->alpPixBuf is non-NULL, the alpha
 * plane is filtered as well: B, G and R are parked in U_TEMP, V_TEMP and
 * Y_TEMP while the alpha pass runs (that pass ends by reloading U_TEMP into
 * mm3 and V_TEMP into mm4).  Otherwise the fourth byte of every pixel is set
 * to all ones.
 */
static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                   const int16_t **lumSrc, int lumFilterSize,
                                   const int16_t *chrFilter, const int16_t **chrUSrc,
                                   const int16_t **chrVSrc,
                                   int chrFilterSize, const int16_t **alpSrc,
                                   uint8_t *dest, int dstW, int dstY)
{
    /* Locals referenced by YSCALEYUV2PACKEDX_END. */
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        YSCALEYUV2PACKEDX_ACCURATE
        YSCALEYUV2RGBX
        /* save B (mm2), G (mm4), R (mm5) across the alpha filter pass */
        "movq                      %%mm2, "U_TEMP"(%0)  \n\t"
        "movq                      %%mm4, "V_TEMP"(%0)  \n\t"
        "movq                      %%mm5, "Y_TEMP"(%0)  \n\t"
        YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
        /* B is back in mm3 and G in mm4 (reloaded by the macro); fetch R */
        "movq               "Y_TEMP"(%0), %%mm5         \n\t"
        "psraw                        $3, %%mm1         \n\t"
        "psraw                        $3, %%mm7         \n\t"
        "packuswb                  %%mm7, %%mm1         \n\t"
        WRITEBGR32(%4, "%5", %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX_ACCURATE
        YSCALEYUV2RGBX
        /* mm7 = all ones, used as the fourth byte plane */
        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%4, "%5", %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    }
}
372

373
/**
 * Vertical filter + YUV->RGB conversion writing 4 bytes per pixel, built on
 * the 16-bit pmulhw filter macros (YSCALEYUV2PACKEDX*).
 *
 * The whole body is a single asm statement assembled from the macros above;
 * it reads its filter tables and coefficients relative to &c->redDither.
 * Of the parameters only `c`, `dest` and `dstW` are referenced directly in
 * this function body.
 *
 * Per pixel the bytes are stored in the order mm2 (B), mm4 (G), mm5 (R),
 * then the fourth plane: the filtered alpha when CONFIG_SWSCALE_ALPHA is set
 * and c->alpPixBuf is non-NULL, all ones otherwise.
 */
static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest, int dstW, int dstY)
{
    /* Locals referenced by YSCALEYUV2PACKEDX_END. */
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        /* alpha pass: uses mm0/mm3/mm6 as scratch, result in mm1/mm7 */
        YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
        "psraw                        $3, %%mm1         \n\t"
        "psraw                        $3, %%mm7         \n\t"
        "packuswb                  %%mm7, %%mm1         \n\t"
        WRITEBGR32(%4, "%5", %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        /* mm7 = all ones, used as the fourth byte plane */
        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%4, "%5", %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    }
}
401

402
/**
 * Same pipeline as yuv2rgb32_X, but with the first and third byte planes
 * swapped when the pixels are assembled.
 *
 * Per pixel the bytes are stored in the order mm5 (R), mm4 (G), mm2 (B),
 * then the fourth plane: the filtered alpha when CONFIG_SWSCALE_ALPHA is set
 * and c->alpPixBuf is non-NULL, all ones otherwise.
 *
 * The whole body is a single asm statement assembled from the macros above.
 * Of the parameters only `c`, `dest` and `dstW` are referenced directly in
 * this function body.
 */
static void RENAME(yuv2bgr32_X)(SwsContext *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest, int dstW, int dstY)
{
    /* Locals referenced by YSCALEYUV2PACKEDX_END. */
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        /* alpha pass: uses mm0/mm3/mm6 as scratch, result in mm1/mm7 */
        YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
        "psraw                        $3, %%mm1         \n\t"
        "psraw                        $3, %%mm7         \n\t"
        "packuswb                  %%mm7, %%mm1         \n\t"
        WRITEBGR32(%4, "%5", %%REGa, %%mm5, %%mm4, %%mm2, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        /* mm7 = all ones, used as the fourth byte plane */
        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%4, "%5", %%REGa, %%mm5, %%mm4, %%mm2, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    }
}
430

431
/*
 * REAL_WRITERGB16(dst, dstw, index)
 *
 * Packs eight pixels into 16 bits each and stores the 16 bytes at
 * dst + index*2 with MOVNTQ; then advances `index` by 8 and jumps back to
 * label "1" while index is below `dstw` (unsigned).
 *
 * In : mm2 = B, mm4 = G, mm5 = R (eight bytes each), mm7 = 0.
 * B and R are masked with bF8 and G with bFC.  mm1 and mm3 are scratch.
 *
 * WRITERGB16 forwards to REAL_WRITERGB16 so that macro arguments are
 * expanded before being stringified.
 */
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $3, %%mm3  \n\t"\
    "psllq           $3, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp         "dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITERGB16(dst, dstw, index)  REAL_WRITERGB16(dst, dstw, index)
458

459
/**
 * Vertical filter + YUV->RGB conversion writing 16 bits per pixel through
 * WRITERGB16, built on the 32-bit-accumulating filter macros.
 *
 * The whole body is a single asm statement assembled from the macros above.
 * Of the parameters only `c`, `dest` and `dstW` are referenced directly in
 * this function body.  When DITHER1XBPP is defined, the BLUE/GREEN/RED
 * dither values found relative to &c->redDither are added to the components
 * with unsigned saturation before packing.
 */
static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                    const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrUSrc,
                                    const int16_t **chrVSrc,
                                    int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, int dstW, int dstY)
{
    /* Locals referenced by YSCALEYUV2PACKEDX_END. */
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    /* WRITERGB16 expects mm7 to be zero */
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
    "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif
    WRITERGB16(%4, "%5", %%REGa)
    YSCALEYUV2PACKEDX_END
}
482

483
/**
 * Vertical filter + YUV->RGB conversion writing 16 bits per pixel through
 * WRITERGB16, built on the 16-bit pmulhw filter macros (YSCALEYUV2PACKEDX).
 *
 * The whole body is a single asm statement assembled from the macros above.
 * Of the parameters only `c`, `dest` and `dstW` are referenced directly in
 * this function body.  When DITHER1XBPP is defined, the BLUE/GREEN/RED
 * dither values found relative to &c->redDither are added to the components
 * with unsigned saturation before packing.
 */
static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
                                 const int16_t **lumSrc, int lumFilterSize,
                                 const int16_t *chrFilter, const int16_t **chrUSrc,
                                 const int16_t **chrVSrc,
                                 int chrFilterSize, const int16_t **alpSrc,
                                 uint8_t *dest, int dstW, int dstY)
{
    /* Locals referenced by YSCALEYUV2PACKEDX_END. */
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    /* WRITERGB16 expects mm7 to be zero */
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
#endif
    WRITERGB16(%4, "%5", %%REGa)
    YSCALEYUV2PACKEDX_END
}
506

507
/*
 * REAL_WRITERGB15(dst, dstw, index)
 *
 * Variant of REAL_WRITERGB16 for a 5-bit middle component: all three
 * components are masked with bF8, R is additionally shifted right by one bit
 * and G is shifted left by 2 instead of 3.  Stores 16 bytes at dst + index*2
 * with MOVNTQ, advances `index` by 8 and jumps back to label "1" while index
 * is below `dstw` (unsigned).
 *
 * In : mm2 = B, mm4 = G, mm5 = R (eight bytes each), mm7 = 0.
 * mm1 and mm3 are scratch.
 *
 * WRITERGB15 forwards to REAL_WRITERGB15 so that macro arguments are
 * expanded before being stringified.
 */
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2  \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4  \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5  \n\t" /* R */\
    "psrlq           $3, %%mm2  \n\t"\
    "psrlq           $1, %%mm5  \n\t"\
\
    "movq         %%mm2, %%mm1  \n\t"\
    "movq         %%mm4, %%mm3  \n\t"\
\
    "punpcklbw    %%mm7, %%mm3  \n\t"\
    "punpcklbw    %%mm5, %%mm2  \n\t"\
    "punpckhbw    %%mm7, %%mm4  \n\t"\
    "punpckhbw    %%mm5, %%mm1  \n\t"\
\
    "psllq           $2, %%mm3  \n\t"\
    "psllq           $2, %%mm4  \n\t"\
\
    "por          %%mm3, %%mm2  \n\t"\
    "por          %%mm4, %%mm1  \n\t"\
\
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
\
    "add             $8, "#index"   \n\t"\
    "cmp         "dstw", "#index"   \n\t"\
    " jb             1b             \n\t"
#define WRITERGB15(dst, dstw, index)  REAL_WRITERGB15(dst, dstw, index)
535

536
/**
 * Vertical filter + YUV->RGB conversion writing 16 bits per pixel through
 * WRITERGB15, built on the 32-bit-accumulating filter macros.
 *
 * The whole body is a single asm statement assembled from the macros above.
 * Of the parameters only `c`, `dest` and `dstW` are referenced directly in
 * this function body.  When DITHER1XBPP is defined, the BLUE/GREEN/RED
 * dither values found relative to &c->redDither are added to the components
 * with unsigned saturation before packing.
 */
static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                    const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrUSrc,
                                    const int16_t **chrVSrc,
                                    int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, int dstW, int dstY)
{
    /* Locals referenced by YSCALEYUV2PACKEDX_END. */
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    /* WRITERGB15 expects mm7 to be zero */
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
    "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif
    WRITERGB15(%4, "%5", %%REGa)
    YSCALEYUV2PACKEDX_END
}
559

560
/**
 * Vertical filter + YUV->RGB conversion writing 16 bits per pixel through
 * WRITERGB15, built on the 16-bit pmulhw filter macros (YSCALEYUV2PACKEDX).
 *
 * The whole body is a single asm statement assembled from the macros above.
 * Of the parameters only `c`, `dest` and `dstW` are referenced directly in
 * this function body.  When DITHER1XBPP is defined, the BLUE/GREEN/RED
 * dither values found relative to &c->redDither are added to the components
 * with unsigned saturation before packing.
 */
static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
                                 const int16_t **lumSrc, int lumFilterSize,
                                 const int16_t *chrFilter, const int16_t **chrUSrc,
                                 const int16_t **chrVSrc,
                                 int chrFilterSize, const int16_t **alpSrc,
                                 uint8_t *dest, int dstW, int dstY)
{
    /* Locals referenced by YSCALEYUV2PACKEDX_END. */
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    /* WRITERGB15 expects mm7 to be zero */
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2  \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4  \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5  \n\t"
#endif
    WRITERGB15(%4, "%5", %%REGa)
    YSCALEYUV2PACKEDX_END
}
583

584
/*
 * WRITEBGR24MMX(dst, dstw, index)
 *
 * Packs eight pixels into 3 bytes each using only shifts, unpacks and ORs,
 * and stores the resulting 24 bytes at dst, dst+8 and dst+16 with MOVNTQ.
 * Unlike the 16/32-bit writers the store address is not indexed: `dst`
 * itself is advanced by 24 per group.  `index` is advanced by 8 and the
 * macro jumps back to label "1" while index is below `dstw` (unsigned).
 *
 * In : mm2 = B, mm4 = G, mm5 = R (eight bytes each), mm7 = 0.
 * All eight MMX registers are overwritten.
 */
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq      %%mm2, %%mm1     \n\t" /* B */\
    "movq      %%mm5, %%mm6     \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2     \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5     \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1     \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6     \n\t" /* 0R0R0R0R 2 */\
    "movq      %%mm2, %%mm0     \n\t" /* GBGBGBGB 0 */\
    "movq      %%mm1, %%mm3     \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0     \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2     \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1     \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3     \n\t" /* 0RGB0RGB 3 */\
\
    "movq      %%mm0, %%mm4     \n\t" /* 0RGB0RGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGB0RGB 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGB0RGB 2 */\
    "movq      %%mm3, %%mm7     \n\t" /* 0RGB0RGB 3 */\
\
    "psllq       $40, %%mm0     \n\t" /* RGB00000 0 */\
    "psllq       $40, %%mm2     \n\t" /* RGB00000 1 */\
    "psllq       $40, %%mm1     \n\t" /* RGB00000 2 */\
    "psllq       $40, %%mm3     \n\t" /* RGB00000 3 */\
\
    "punpckhdq %%mm4, %%mm0     \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2     \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1     \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3     \n\t" /* 0RGBRGB0 3 */\
\
    "psrlq        $8, %%mm0     \n\t" /* 00RGBRGB 0 */\
    "movq      %%mm2, %%mm6     \n\t" /* 0RGBRGB0 1 */\
    "psllq       $40, %%mm2     \n\t" /* GB000000 1 */\
    "por       %%mm2, %%mm0     \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
\
    "psrlq       $24, %%mm6     \n\t" /* 0000RGBR 1 */\
    "movq      %%mm1, %%mm5     \n\t" /* 0RGBRGB0 2 */\
    "psllq       $24, %%mm1     \n\t" /* BRGB0000 2 */\
    "por       %%mm1, %%mm6     \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
\
    "psrlq       $40, %%mm5     \n\t" /* 000000RG 2 */\
    "psllq        $8, %%mm3     \n\t" /* RGBRGB00 3 */\
    "por       %%mm3, %%mm5     \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
\
    "add         $24, "#dst"    \n\t"\
\
    "add          $8, "#index"  \n\t"\
    "cmp      "dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
636

/*
 * BGR24 store stage of a conversion loop, MMXEXT (pshufw) flavour.
 *
 * On entry mm2, mm4 and mm5 hold 8 packed B, G and R bytes. Each of the
 * three groups below builds one quadword of interleaved B,G,R bytes with
 * pshufw and the ff_M24A / ff_M24B / ff_M24C masks (see the per-line layout
 * comments) and stores it through MOVNTQ at dst, 8(dst) and 16(dst), i.e.
 * 24 output bytes for 8 pixels. Afterwards dst is advanced by 24 and index
 * by 8, and control jumps back to local label "1" (emitted by the macro
 * that precedes this one in the asm statement) while index is below dstw
 * (jb: unsigned compare).
 *
 * mm0, mm1, mm3, mm6 and mm7 are overwritten and mm4 is shifted right by
 * one byte; the incoming value of mm7 is not used by this variant.
 *
 * dst and index are stringified register names; dstw is a string literal
 * holding the operand to compare against.
 */
#define WRITEBGR24MMXEXT(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2  B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2  G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0  R1 R0 R1 R0 */\
\
    "pand   %%mm0, %%mm1        \n\t" /*    B2        B1       B0 */\
    "pand   %%mm0, %%mm3        \n\t" /*    G2        G1       G0 */\
    "pand   %%mm7, %%mm6        \n\t" /*       R1        R0       */\
\
    "psllq     $8, %%mm3        \n\t" /* G2        G1       G0    */\
    "por    %%mm1, %%mm6        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, (dst))\
\
    "psrlq     $8, %%mm4        \n\t" /* 00 G7 G6 G5  G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4  B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3  G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4  R3 R2 R3 R2 */\
\
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5       B4        B3    */\
    "pand   %%mm7, %%mm3        \n\t" /*       G4        G3       */\
    "pand   %%mm0, %%mm6        \n\t" /*    R4        R3       R2 */\
\
    "por    %%mm1, %%mm3        \n\t" /* B5    G4 B4     G3 B3    */\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
\
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6  B7 B6 B7 B6 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7  G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6  R5 R4 R5 R4 */\
\
    "pand   %%mm7, %%mm1        \n\t" /*       B7        B6       */\
    "pand   %%mm0, %%mm3        \n\t" /*    G7        G6       G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7       R6        R5    */\
\
    "por    %%mm1, %%mm3        \n\t"\
    "por    %%mm3, %%mm6        \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
\
    "add      $24, "#dst"       \n\t"\
\
    "add       $8, "#index"     \n\t"\
    "cmp   "dstw", "#index"     \n\t"\
    " jb       1b               \n\t"

/*
 * Select the BGR24 store stage for this instantiation of the template:
 * the pshufw-based WRITEBGR24MMXEXT when COMPILE_TEMPLATE_MMXEXT is set,
 * WRITEBGR24MMX otherwise. The #undef allows the template to be included
 * more than once with different settings.
 */
#if COMPILE_TEMPLATE_MMXEXT
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMXEXT(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif

#if HAVE_6REGS
/**
 * Vertical multi-tap scale to packed 24-bit BGR, using the
 * YSCALEYUV2PACKEDX_ACCURATE front end.
 *
 * The asm statement is presumably opened inside YSCALEYUV2PACKEDX_ACCURATE
 * (defined outside this section); its operand list, clobber list and
 * closing parenthesis are written out here. Operands: %0 = &c->redDither,
 * %1-%3 = dummy, %4 = dest, %5 = dstW_reg, %6 = uv_off. REG_a, REG_c,
 * REG_d and REG_S are declared clobbered.
 *
 * None of the filter/source arguments is referenced by name in this body.
 * NOTE(review): the scaling macros presumably reach that data through the
 * context addressed by %0 - confirm against their definitions.
 *
 * @param c    scaler context; supplies &c->redDither and c->uv_offx2
 * @param dest output row, 3 bytes are written per pixel
 * @param dstW loop bound, handed to the asm as memory operand %5
 */
static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                   const int16_t **lumSrc, int lumFilterSize,
                                   const int16_t *chrFilter, const int16_t **chrUSrc,
                                   const int16_t **chrVSrc,
                                   int chrFilterSize, const int16_t **alpSrc,
                                   uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* REG_c = dest + 3 * REG_a: byte address of the current output pixel */
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
    "add %4, %%"REG_c"                        \n\t"
    WRITEBGR24(%%REGc, "%5", %%REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg), "m"(uv_off)
       NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
    : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
    );
}

/**
 * Vertical multi-tap scale to packed 24-bit BGR, using the
 * YSCALEYUV2PACKEDX front end.
 *
 * Identical to yuv2bgr24_X_ar above except for the front-end macro:
 * same operands (%0 = &c->redDither, %1-%3 = dummy, %4 = dest,
 * %5 = dstW_reg, %6 = uv_off) and same clobbers (REG_a, REG_c, REG_d,
 * REG_S).
 *
 * @param c    scaler context; supplies &c->redDither and c->uv_offx2
 * @param dest output row, 3 bytes are written per pixel
 * @param dstW loop bound, handed to the asm as memory operand %5
 */
static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor                    %%mm7, %%mm7       \n\t"
    /* REG_c = dest + 3 * REG_a: byte address of the current output pixel */
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"   \n\t" //FIXME optimize
    "add                        %4, %%"REG_c"   \n\t"
    WRITEBGR24(%%REGc, "%5", %%REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest),  "m" (dstW_reg), "m"(uv_off)
       NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
    : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
    );
}
#endif /* HAVE_6REGS */

/*
 * YUYV store stage of a conversion loop.
 *
 * Input: 16-bit words in mm1 and mm7 (first and second group of four luma
 * samples) and in mm3 and mm4 (four samples each of the two chroma planes).
 * The words are packed to unsigned-saturated bytes and interleaved as
 *   mm1[0] mm3[0] mm1[1] mm4[0] mm1[2] mm3[1] ...
 * so that 8 pixels become 16 bytes, stored with MOVNTQ at dst + 2*index
 * and dst + 2*index + 8. index is then advanced by 8 and control jumps
 * back to local label "1" while index is below dstw (jb: unsigned).
 *
 * mm1, mm3, mm4 and mm7 are overwritten.
 *
 * WRITEYUY2 is the entry point; going through it lets the preprocessor
 * expand the arguments before REAL_WRITEYUY2 stringifies them.
 */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb  %%mm3, %%mm3     \n\t"\
    "packuswb  %%mm4, %%mm4     \n\t"\
    "packuswb  %%mm7, %%mm1     \n\t"\
    "punpcklbw %%mm4, %%mm3     \n\t"\
    "movq      %%mm1, %%mm7     \n\t"\
    "punpcklbw %%mm3, %%mm1     \n\t"\
    "punpckhbw %%mm3, %%mm7     \n\t"\
\
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
\
    "add          $8, "#index"  \n\t"\
    "cmp      "dstw", "#index"  \n\t"\
    " jb          1b            \n\t"
#define WRITEYUY2(dst, dstw, index)  REAL_WRITEYUY2(dst, dstw, index)

762
static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
763
                                     const int16_t **lumSrc, int lumFilterSize,
764
                                     const int16_t *chrFilter, const int16_t **chrUSrc,
765
                                     const int16_t **chrVSrc,
766
                                     int chrFilterSize, const int16_t **alpSrc,
767
                                     uint8_t *dest, int dstW, int dstY)
768
{
769
    x86_reg dummy=0;
770
    x86_reg dstW_reg = dstW;
771
    x86_reg uv_off = c->uv_offx2;
772

773
    YSCALEYUV2PACKEDX_ACCURATE
774
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
775
    "psraw $3, %%mm3    \n\t"
776
    "psraw $3, %%mm4    \n\t"
777
    "psraw $3, %%mm1    \n\t"
778
    "psraw $3, %%mm7    \n\t"
779
    WRITEYUY2(%4, "%5", %%REGa)
780
    YSCALEYUV2PACKEDX_END
781
}
782

783
static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
784
                                  const int16_t **lumSrc, int lumFilterSize,
785
                                  const int16_t *chrFilter, const int16_t **chrUSrc,
786
                                  const int16_t **chrVSrc,
787
                                  int chrFilterSize, const int16_t **alpSrc,
788
                                  uint8_t *dest, int dstW, int dstY)
789
{
790
    x86_reg dummy=0;
791
    x86_reg dstW_reg = dstW;
792
    x86_reg uv_off = c->uv_offx2;
793

794
    YSCALEYUV2PACKEDX
795
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
796
    "psraw $3, %%mm3    \n\t"
797
    "psraw $3, %%mm4    \n\t"
798
    "psraw $3, %%mm1    \n\t"
799
    "psraw $3, %%mm7    \n\t"
800
    WRITEYUY2(%4, "%5", %%REGa)
801
    YSCALEYUV2PACKEDX_END
802
}
803

/*
 * Loop head and chroma stage of the two-row (bilinear) YUV->RGB path.
 *
 * Zeroes index, emits the aligned loop label "1", then for the two chroma
 * rows addressed by operands %2 (uvbuf0) and %3 (uvbuf1):
 *   - loads four words at byte offset index and four more at
 *     index + UV_OFF_BYTE(c) (the second chroma plane);
 *   - blends the rows as (uvbuf1 >> 4) + pmulhw(uvbuf0 - uvbuf1, coeff),
 *     coeff being the quadword at CHR_MMX_FILTER_OFFSET+8(c);
 *   - subtracts U_OFFSET(c) / V_OFFSET(c);
 *   - keeps a copy in mm2 / mm5 and multiplies mm3 / mm4 by UG_COEFF(c) /
 *     VG_COEFF(c).
 * index is restored after the UV_OFF_BYTE excursion. mm0 is clobbered.
 *
 * index and c are stringified: pass a register name and the operand that
 * holds the context base address.
 */
#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[index]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[index]*/\
    "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
    "movq     (%2, "#index"), %%mm5     \n\t" /* uvbuf0[index+uv_off]*/\
    "movq     (%3, "#index"), %%mm4     \n\t" /* uvbuf1[index+uv_off]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[index] - uvbuf1[index]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[index+uv_off] - uvbuf1[index+uv_off]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[index] - uvbuf1[index])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[index+uv_off] - uvbuf1[index+uv_off])uvalpha1>>16*/\
    "psraw                $4, %%mm3     \n\t" /* uvbuf1[index] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf1[index+uv_off] >>4*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[index]uvalpha1 + uvbuf1[index](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[index+uv_off]uvalpha1 + uvbuf1[index+uv_off](1-uvalpha1)*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */

/*
 * Two-row blend of 8 consecutive 16-bit samples (used for luma and, by
 * the alpha-capable callers, for alpha).
 *
 * b1 and b2 name the registers/operands holding the two row pointers;
 * samples are read at b + 2*index and b + 2*index + 8. The result is
 *   (row2 >> 4) + pmulhw(row1 - row2, coeff)
 * with coeff read from LUM_MMX_FILTER_OFFSET+8(c). First four results end
 * up in mm1, the next four in mm7; mm0 and mm6 are clobbered.
 *
 * All four parameters are stringified.
 */
#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
    "movq  ("#b1", "#index", 2), %%mm0     \n\t" /*buf0[index]*/\
    "movq  ("#b2", "#index", 2), %%mm1     \n\t" /*buf1[index]*/\
    "movq 8("#b1", "#index", 2), %%mm6     \n\t" /*buf0[index+4]*/\
    "movq 8("#b2", "#index", 2), %%mm7     \n\t" /*buf1[index+4]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[index] - buf1[index]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[index+4] - buf1[index+4]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[index] - buf1[index])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[index+4] - buf1[index+4])yalpha1>>16*/\
    "psraw                $4, %%mm1     \n\t" /* buf1[index] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf1[index+4] >>4*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[index]yalpha1 + buf1[index](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[index+4]yalpha1 + buf1[index+4](1-yalpha1) >>16*/

/*
 * Colour-matrix stage of the YUV->RGB path.
 *
 * Expects mm1 / mm7 = two groups of four luma words, mm2 / mm5 = offset
 * chroma words, mm3 / mm4 = chroma words already multiplied by the green
 * coefficients. It
 *   - multiplies mm2 by UB_COEFF(c) and mm5 by VR_COEFF(c);
 *   - subtracts Y_OFFSET(c) from the luma words and multiplies them by
 *     Y_COEFF(c);
 *   - sums the two green contributions;
 *   - duplicates every chroma term (punpcklwd/punpckhwd with itself) so
 *     each one is added to two neighbouring luma samples;
 *   - packs the results to unsigned-saturated bytes.
 * On exit mm2, mm4 and mm5 hold 8 B, G and R bytes; mm0, mm3 and mm6 are
 * clobbered. c is stringified.
 */
#define REAL_YSCALEYUV2RGB_COEFF(c) \
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm2=B1, mm0=B2, mm4=G1, mm3=G2, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"

/* Expanding wrapper: arguments are macro-expanded here before the REAL_
 * variant stringifies them. */
#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)

/*
 * Complete two-row YUV->RGB loop head: chroma blend, luma blend of the
 * rows in operands %0 and %1, then the colour-matrix stage. A store
 * macro that jumps back to label "1" is expected to follow.
 */
#define YSCALEYUV2RGB(index, c) \
    REAL_YSCALEYUV2RGB_UV(index, c) \
    REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
    REAL_YSCALEYUV2RGB_COEFF(c)

881
/**
882
 * vertical bilinear scale YV12 to RGB
883
 */
884
static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
885
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
886
                                const int16_t *abuf[2], uint8_t *dest,
887
                                int dstW, int yalpha, int uvalpha, int y)
888
{
889
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
890
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
891

892
    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
893
        const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
894
#if ARCH_X86_64
895
        __asm__ volatile(
896
            YSCALEYUV2RGB(%%r8, %5)
897
            YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
898
            "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
899
            "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
900
            "packuswb            %%mm7, %%mm1       \n\t"
901
            WRITEBGR32(%4, DSTW_OFFSET"(%5)", %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
902
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
903
               "a" (&c->redDither),
904
               "r" (abuf0), "r" (abuf1)
905
            : "%r8"
906
        );
907
#else
908
        c->u_temp=(intptr_t)abuf0;
909
        c->v_temp=(intptr_t)abuf1;
910
        __asm__ volatile(
911
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
912
            "mov        %4, %%"REG_b"               \n\t"
913
            "push %%"REG_BP"                        \n\t"
914
            YSCALEYUV2RGB(%%REGBP, %5)
915
            "push                   %0              \n\t"
916
            "push                   %1              \n\t"
917
            "mov          "U_TEMP"(%5), %0          \n\t"
918
            "mov          "V_TEMP"(%5), %1          \n\t"
919
            YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
920
            "psraw                  $3, %%mm1       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
921
            "psraw                  $3, %%mm7       \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
922
            "packuswb            %%mm7, %%mm1       \n\t"
923
            "pop                    %1              \n\t"
924
            "pop                    %0              \n\t"
925
            WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
926
            "pop %%"REG_BP"                         \n\t"
927
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
928
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
929
               "a" (&c->redDither)
930
        );
931
#endif
932
    } else {
933
        __asm__ volatile(
934
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
935
            "mov        %4, %%"REG_b"               \n\t"
936
            "push %%"REG_BP"                        \n\t"
937
            YSCALEYUV2RGB(%%REGBP, %5)
938
            "pcmpeqd %%mm7, %%mm7                   \n\t"
939
            WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
940
            "pop %%"REG_BP"                         \n\t"
941
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
942
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
943
               "a" (&c->redDither)
944
        );
945
    }
946
}
947

948
static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2],
949
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
950
                                const int16_t *abuf[2], uint8_t *dest,
951
                                int dstW, int yalpha, int uvalpha, int y)
952
{
953
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
954
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
955

956
    __asm__ volatile(
957
        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
958
        "mov        %4, %%"REG_b"               \n\t"
959
        "push %%"REG_BP"                        \n\t"
960
        YSCALEYUV2RGB(%%REGBP, %5)
961
        "pxor    %%mm7, %%mm7                   \n\t"
962
        WRITEBGR24(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
963
        "pop %%"REG_BP"                         \n\t"
964
        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
965
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
966
           "a" (&c->redDither)
967
           NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
968
    );
969
}
970

971
static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2],
972
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
973
                                 const int16_t *abuf[2], uint8_t *dest,
974
                                 int dstW, int yalpha, int uvalpha, int y)
975
{
976
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
977
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
978

979
    __asm__ volatile(
980
        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
981
        "mov        %4, %%"REG_b"               \n\t"
982
        "push %%"REG_BP"                        \n\t"
983
        YSCALEYUV2RGB(%%REGBP, %5)
984
        "pxor    %%mm7, %%mm7                   \n\t"
985
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
986
#ifdef DITHER1XBPP
987
        "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
988
        "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
989
        "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
990
#endif
991
        WRITERGB15(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
992
        "pop %%"REG_BP"                         \n\t"
993
        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
994
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
995
           "a" (&c->redDither)
996
           NAMED_CONSTRAINTS_ADD(bF8)
997
    );
998
}
999

1000
static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
1001
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
1002
                                 const int16_t *abuf[2], uint8_t *dest,
1003
                                 int dstW, int yalpha, int uvalpha, int y)
1004
{
1005
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
1006
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
1007

1008
    __asm__ volatile(
1009
        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1010
        "mov        %4, %%"REG_b"               \n\t"
1011
        "push %%"REG_BP"                        \n\t"
1012
        YSCALEYUV2RGB(%%REGBP, %5)
1013
        "pxor    %%mm7, %%mm7                   \n\t"
1014
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
1015
#ifdef DITHER1XBPP
1016
        "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
1017
        "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
1018
        "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
1019
#endif
1020
        WRITERGB16(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
1021
        "pop %%"REG_BP"                         \n\t"
1022
        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1023
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1024
           "a" (&c->redDither)
1025
           NAMED_CONSTRAINTS_ADD(bF8,bFC)
1026
    );
1027
}
1028

/*
 * Loop head for the two-row (bilinear) path with packed YUV output.
 *
 * Before the loop the coefficient quadwords at CHR_MMX_FILTER_OFFSET+8(c)
 * and LUM_MMX_FILTER_OFFSET+8(c) are shifted right arithmetically by 3
 * and written back, i.e. the stored values are modified in place every
 * time this code runs.
 *
 * Inside the loop (label "1") the chroma rows in operands %2 / %3 and the
 * luma rows in operands %0 / %1 are blended as
 *   (row2 >> 7) + pmulhw(row1 - row2, coeff)
 * The second chroma plane is read at byte offset index + UV_OFF_BYTE(c).
 * Results: mm1 / mm7 = two groups of four luma words, mm3 / mm4 = four
 * words of each chroma plane; mm0, mm2, mm5 and mm6 are clobbered. No
 * offset or colour-matrix stage follows.
 *
 * index and c are stringified; YSCALEYUV2PACKED is the expanding wrapper.
 */
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0              \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1              \n\t"\
    "psraw                $3, %%mm0                           \n\t"\
    "psraw                $3, %%mm1                           \n\t"\
    "movq              %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq              %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor            "#index", "#index"                        \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[index]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[index]*/\
    "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
    "movq     (%2, "#index"), %%mm5     \n\t" /* uvbuf0[index+uv_off]*/\
    "movq     (%3, "#index"), %%mm4     \n\t" /* uvbuf1[index+uv_off]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
    "psubw             %%mm3, %%mm2     \n\t" /* uvbuf0[index] - uvbuf1[index]*/\
    "psubw             %%mm4, %%mm5     \n\t" /* uvbuf0[index+uv_off] - uvbuf1[index+uv_off]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0    \n\t"\
    "pmulhw            %%mm0, %%mm2     \n\t" /* (uvbuf0[index] - uvbuf1[index])uvalpha1>>16*/\
    "pmulhw            %%mm0, %%mm5     \n\t" /* (uvbuf0[index+uv_off] - uvbuf1[index+uv_off])uvalpha1>>16*/\
    "psraw                $7, %%mm3     \n\t" /* uvbuf1[index] >>7*/\
    "psraw                $7, %%mm4     \n\t" /* uvbuf1[index+uv_off] >>7*/\
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[index]uvalpha1 + uvbuf1[index](1-uvalpha1)*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[index+uv_off]uvalpha1 + uvbuf1[index+uv_off](1-uvalpha1)*/\
    "movq  (%0, "#index", 2), %%mm0     \n\t" /*buf0[index]*/\
    "movq  (%1, "#index", 2), %%mm1     \n\t" /*buf1[index]*/\
    "movq 8(%0, "#index", 2), %%mm6     \n\t" /*buf0[index+4]*/\
    "movq 8(%1, "#index", 2), %%mm7     \n\t" /*buf1[index+4]*/\
    "psubw             %%mm1, %%mm0     \n\t" /* buf0[index] - buf1[index]*/\
    "psubw             %%mm7, %%mm6     \n\t" /* buf0[index+4] - buf1[index+4]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0  \n\t" /* (buf0[index] - buf1[index])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6  \n\t" /* (buf0[index+4] - buf1[index+4])yalpha1>>16*/\
    "psraw                $7, %%mm1     \n\t" /* buf1[index] >>7*/\
    "psraw                $7, %%mm7     \n\t" /* buf1[index+4] >>7*/\
    "paddw             %%mm0, %%mm1     \n\t" /* buf0[index]yalpha1 + buf1[index](1-yalpha1) >>16*/\
    "paddw             %%mm6, %%mm7     \n\t" /* buf0[index+4]yalpha1 + buf1[index+4](1-yalpha1) >>16*/

#define YSCALEYUV2PACKED(index, c)  REAL_YSCALEYUV2PACKED(index, c)

1069
static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],
1070
                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
1071
                                  const int16_t *abuf[2], uint8_t *dest,
1072
                                  int dstW, int yalpha, int uvalpha, int y)
1073
{
1074
    const int16_t *buf0  = buf[0],  *buf1  = buf[1],
1075
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
1076

1077
    __asm__ volatile(
1078
        "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1079
        "mov %4, %%"REG_b"                        \n\t"
1080
        "push %%"REG_BP"                        \n\t"
1081
        YSCALEYUV2PACKED(%%REGBP, %5)
1082
        WRITEYUY2(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
1083
        "pop %%"REG_BP"                         \n\t"
1084
        "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1085
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1086
           "a" (&c->redDither)
1087
    );
1088
}
1089

/*
 * Single-row YUV->RGB loop head (no vertical interpolation).
 *
 * Zeroes index, emits the aligned loop label "1", then reads
 *   - four chroma words from operand %2 at byte offset index and four
 *     from index + UV_OFF_BYTE(c) (operand %3 is not used);
 *   - eight luma words from operand %0 at 2*index (operand %1 is not
 *     used);
 * shifts every sample right arithmetically by 4 and runs the same
 * offset / coefficient / packing sequence as the two-row path.
 * On exit mm2, mm4 and mm5 hold 8 B, G and R bytes; mm0, mm1, mm3, mm6
 * and mm7 are clobbered.
 *
 * index and c are stringified; YSCALEYUV2RGB1 is the expanding wrapper.
 */
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[index]*/\
    "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
    "movq     (%2, "#index"), %%mm4     \n\t" /* uvbuf0[index+uv_off]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
    "psraw                $4, %%mm3     \n\t" /* uvbuf0[index] >>4*/\
    "psraw                $4, %%mm4     \n\t" /* uvbuf0[index+uv_off] >>4*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[index]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[index+4]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[index] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[index+4] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm2=B1, mm0=B2, mm4=G1, mm3=G2, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"

#define YSCALEYUV2RGB1(index, c)  REAL_YSCALEYUV2RGB1(index, c)

/*
 * Single-luma-row YUV->RGB loop head that does vertical chrominance
 * interpolation.
 *
 * Same as the plain single-row variant except for the chroma input: the
 * words from operands %2 (uvbuf0) and %3 (uvbuf1) are added and the sum
 * is shifted right logically by 5 (see the FIXME about overflow of the
 * 16-bit sum). Luma comes from operand %0 only, shifted right
 * arithmetically by 4.
 * On exit mm2, mm4 and mm5 hold 8 B, G and R bytes; mm0, mm1, mm3, mm6
 * and mm7 are clobbered.
 *
 * index and c are stringified; YSCALEYUV2RGB1b is the expanding wrapper.
 */
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[index]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[index]*/\
    "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
    "movq     (%2, "#index"), %%mm5     \n\t" /* uvbuf0[index+uv_off]*/\
    "movq     (%3, "#index"), %%mm4     \n\t" /* uvbuf1[index+uv_off]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[index] + uvbuf1[index]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[index+uv_off] + uvbuf1[index+uv_off]*/\
    "psrlw                $5, %%mm3     \n\t" /*FIXME might overflow*/\
    "psrlw                $5, %%mm4     \n\t" /*FIXME might overflow*/\
    "psubw  "U_OFFSET"("#c"), %%mm3     \n\t" /* (U-128)8*/\
    "psubw  "V_OFFSET"("#c"), %%mm4     \n\t" /* (V-128)8*/\
    "movq              %%mm3, %%mm2     \n\t" /* (U-128)8*/\
    "movq              %%mm4, %%mm5     \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3     \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4     \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[index]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /*buf0[index+4]*/\
    "psraw                $4, %%mm1     \n\t" /* buf0[index] >>4*/\
    "psraw                $4, %%mm7     \n\t" /* buf0[index+4] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2     \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5     \n\t"\
    "psubw  "Y_OFFSET"("#c"), %%mm1     \n\t" /* 8(Y-16)*/\
    "psubw  "Y_OFFSET"("#c"), %%mm7     \n\t" /* 8(Y-16)*/\
    "pmulhw  "Y_COEFF"("#c"), %%mm1     \n\t"\
    "pmulhw  "Y_COEFF"("#c"), %%mm7     \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw             %%mm3, %%mm4     \n\t"\
    "movq              %%mm2, %%mm0     \n\t"\
    "movq              %%mm5, %%mm6     \n\t"\
    "movq              %%mm4, %%mm3     \n\t"\
    "punpcklwd         %%mm2, %%mm2     \n\t"\
    "punpcklwd         %%mm5, %%mm5     \n\t"\
    "punpcklwd         %%mm4, %%mm4     \n\t"\
    "paddw             %%mm1, %%mm2     \n\t"\
    "paddw             %%mm1, %%mm5     \n\t"\
    "paddw             %%mm1, %%mm4     \n\t"\
    "punpckhwd         %%mm0, %%mm0     \n\t"\
    "punpckhwd         %%mm6, %%mm6     \n\t"\
    "punpckhwd         %%mm3, %%mm3     \n\t"\
    "paddw             %%mm7, %%mm0     \n\t"\
    "paddw             %%mm7, %%mm6     \n\t"\
    "paddw             %%mm7, %%mm3     \n\t"\
    /* mm2=B1, mm0=B2, mm4=G1, mm3=G2, mm5=R1, mm6=R2 */\
    "packuswb          %%mm0, %%mm2     \n\t"\
    "packuswb          %%mm6, %%mm5     \n\t"\
    "packuswb          %%mm3, %%mm4     \n\t"

#define YSCALEYUV2RGB1b(index, c)  REAL_YSCALEYUV2RGB1b(index, c)

/*
 * Single-row alpha fetch: loads eight 16-bit words from operand %1 at
 * 2*index, shifts them right arithmetically by 7 and packs them to
 * unsigned-saturated bytes in mm7. mm1 is clobbered.
 *
 * index is stringified; YSCALEYUV2RGB1_ALPHA is the expanding wrapper.
 */
#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq  (%1, "#index", 2), %%mm7     \n\t" /* abuf0[index  ]     */\
    "movq 8(%1, "#index", 2), %%mm1     \n\t" /* abuf0[index+4]     */\
    "psraw                $7, %%mm7     \n\t" /* abuf0[index  ] >>7 */\
    "psraw                $7, %%mm1     \n\t" /* abuf0[index+4] >>7 */\
    "packuswb          %%mm1, %%mm7     \n\t"
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)

1205
/**
1206
 * YV12 to RGB without scaling or interpolating
1207
 */
1208
static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
1209
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
1210
                                const int16_t *abuf0, uint8_t *dest,
1211
                                int dstW, int uvalpha, int y)
1212
{
1213
    const int16_t *ubuf0 = ubuf[0];
1214
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
1215

1216
    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
1217
        const int16_t *ubuf1 = ubuf[0];
1218
        if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1219
            __asm__ volatile(
1220
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1221
                "mov        %4, %%"REG_b"               \n\t"
1222
                "push %%"REG_BP"                        \n\t"
1223
                YSCALEYUV2RGB1(%%REGBP, %5)
1224
                YSCALEYUV2RGB1_ALPHA(%%REGBP)
1225
                WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1226
                "pop %%"REG_BP"                         \n\t"
1227
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1228
                :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1229
                   "a" (&c->redDither)
1230
            );
1231
        } else {
1232
            __asm__ volatile(
1233
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1234
                "mov        %4, %%"REG_b"               \n\t"
1235
                "push %%"REG_BP"                        \n\t"
1236
                YSCALEYUV2RGB1(%%REGBP, %5)
1237
                "pcmpeqd %%mm7, %%mm7                   \n\t"
1238
                WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1239
                "pop %%"REG_BP"                         \n\t"
1240
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1241
                :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1242
                   "a" (&c->redDither)
1243
            );
1244
        }
1245
    } else {
1246
        const int16_t *ubuf1 = ubuf[1];
1247
        if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
1248
            __asm__ volatile(
1249
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1250
                "mov        %4, %%"REG_b"               \n\t"
1251
                "push %%"REG_BP"                        \n\t"
1252
                YSCALEYUV2RGB1b(%%REGBP, %5)
1253
                YSCALEYUV2RGB1_ALPHA(%%REGBP)
1254
                WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1255
                "pop %%"REG_BP"                         \n\t"
1256
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1257
                :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1258
                   "a" (&c->redDither)
1259
            );
1260
        } else {
1261
            __asm__ volatile(
1262
                "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
1263
                "mov        %4, %%"REG_b"               \n\t"
1264
                "push %%"REG_BP"                        \n\t"
1265
                YSCALEYUV2RGB1b(%%REGBP, %5)
1266
                "pcmpeqd %%mm7, %%mm7                   \n\t"
1267
                WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
1268
                "pop %%"REG_BP"                         \n\t"
1269
                "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
1270
                :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
1271
                   "a" (&c->redDither)
1272
            );
1273
        }
1274
    }
1275
}
1276

1277
/**
 * Convert a single luma line plus chroma into one packed 24-bit output line
 * (layout defined by WRITEBGR24) using MMX inline assembly.
 *
 * Operand map shared by both asm statements:
 *   %0 = buf0, %1 = buf1 (alias of buf0), %2 = ubuf0, %3 = ubuf1,
 *   %4 = dest (memory operand), %5 = &c->redDither.
 * Context fields are addressed as offsets from %5 (ESP_OFFSET, DSTW_OFFSET);
 * those offsets and the YSCALEYUV2RGB1 / YSCALEYUV2RGB1b / WRITEBGR24 macros
 * are defined outside this function.
 *
 * @param c       scaler context; only &c->redDither is handed to the asm
 * @param buf0    luma input line
 * @param ubuf    chroma input lines; ubuf[0] is always used, ubuf[1] only
 *                when uvalpha >= 2048
 * @param vbuf    not referenced by this function
 *                (NOTE(review): the asm macros presumably reach the V data
 *                through an offset from the U pointers -- confirm)
 * @param abuf0   not referenced by this function
 * @param dest    output line
 * @param dstW    not referenced by the C code
 *                (NOTE(review): presumably read by the asm via
 *                DSTW_OFFSET(%5) -- confirm)
 * @param uvalpha selects the chroma path (see the branch below)
 * @param y       not referenced by this function
 */
static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
                                const int16_t *abuf0, uint8_t *dest,
                                int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        /* Single chroma line: both chroma operands point at ubuf[0]. */
        const int16_t *ubuf1 = ubuf[0];
        __asm__ volatile(
            /* Park REG_b in the context so it can carry the dest pointer. */
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            /* REG_BP is handed to the macros below, so preserve it. */
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            "pxor    %%mm7, %%mm7                   \n\t"
            WRITEBGR24(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
            /* Restore the two registers borrowed above. */
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
               NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
        );
    } else {
        /* Two chroma lines: the "1b" macro variant receives ubuf[0] and ubuf[1]. */
        const int16_t *ubuf1 = ubuf[1];
        __asm__ volatile(
            /* Same register save / restore protocol as the branch above. */
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            "pxor    %%mm7, %%mm7                   \n\t"
            WRITEBGR24(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
               NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
        );
    }
}
1317

1318
/**
 * Convert a single luma line plus chroma into one packed output line written
 * by WRITERGB15, using MMX inline assembly.
 *
 * Operand map shared by both asm statements:
 *   %0 = buf0, %1 = buf1 (alias of buf0), %2 = ubuf0, %3 = ubuf1,
 *   %4 = dest (memory operand), %5 = &c->redDither.
 * Context fields (ESP_OFFSET, DSTW_OFFSET and the *_DITHER slots) are
 * addressed as offsets from %5; the offsets and the YSCALEYUV2RGB1 /
 * YSCALEYUV2RGB1b / WRITERGB15 macros are defined outside this function.
 *
 * When DITHER1XBPP is defined, the per-channel dither values stored in the
 * context are added (unsigned saturating byte add) to B, G and R before the
 * pixels are packed.
 *
 * @param c       scaler context; only &c->redDither is handed to the asm
 * @param buf0    luma input line
 * @param ubuf    chroma input lines; ubuf[0] is always used, ubuf[1] only
 *                when uvalpha >= 2048
 * @param vbuf    not referenced by this function
 *                (NOTE(review): the asm macros presumably reach the V data
 *                through an offset from the U pointers -- confirm)
 * @param abuf0   not referenced by this function
 * @param dest    output line
 * @param dstW    not referenced by the C code
 *                (NOTE(review): presumably read by the asm via
 *                DSTW_OFFSET(%5) -- confirm)
 * @param uvalpha selects the chroma path (see the branch below)
 * @param y       not referenced by this function
 */
static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf0, uint8_t *dest,
                                 int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        /* Single chroma line: both chroma operands point at ubuf[0]. */
        const int16_t *ubuf1 = ubuf[0];
        __asm__ volatile(
            /* Park REG_b in the context so it can carry the dest pointer. */
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            /* REG_BP is handed to the macros below, so preserve it. */
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            "pxor    %%mm7, %%mm7                   \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif
            WRITERGB15(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
            /* Restore the two registers borrowed above. */
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
               NAMED_CONSTRAINTS_ADD(bF8)
        );
    } else {
        /* Two chroma lines: the "1b" macro variant receives ubuf[0] and ubuf[1]. */
        const int16_t *ubuf1 = ubuf[1];
        __asm__ volatile(
            /* Same register save / restore protocol as the branch above. */
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            "pxor    %%mm7, %%mm7                   \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif
            WRITERGB15(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
               NAMED_CONSTRAINTS_ADD(bF8)
        );
    }
}
1370

1371
/**
 * Convert a single luma line plus chroma into one packed output line written
 * by WRITERGB16, using MMX inline assembly.
 *
 * Operand map shared by both asm statements:
 *   %0 = buf0, %1 = buf1 (alias of buf0), %2 = ubuf0, %3 = ubuf1,
 *   %4 = dest (memory operand), %5 = &c->redDither.
 * Context fields (ESP_OFFSET, DSTW_OFFSET and the *_DITHER slots) are
 * addressed as offsets from %5; the offsets and the YSCALEYUV2RGB1 /
 * YSCALEYUV2RGB1b / WRITERGB16 macros are defined outside this function.
 *
 * When DITHER1XBPP is defined, the per-channel dither values stored in the
 * context are added (unsigned saturating byte add) to B, G and R before the
 * pixels are packed.
 *
 * @param c       scaler context; only &c->redDither is handed to the asm
 * @param buf0    luma input line
 * @param ubuf    chroma input lines; ubuf[0] is always used, ubuf[1] only
 *                when uvalpha >= 2048
 * @param vbuf    not referenced by this function
 *                (NOTE(review): the asm macros presumably reach the V data
 *                through an offset from the U pointers -- confirm)
 * @param abuf0   not referenced by this function
 * @param dest    output line
 * @param dstW    not referenced by the C code
 *                (NOTE(review): presumably read by the asm via
 *                DSTW_OFFSET(%5) -- confirm)
 * @param uvalpha selects the chroma path (see the branch below)
 * @param y       not referenced by this function
 */
static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf0, uint8_t *dest,
                                 int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        /* Single chroma line: both chroma operands point at ubuf[0]. */
        const int16_t *ubuf1 = ubuf[0];
        __asm__ volatile(
            /* Park REG_b in the context so it can carry the dest pointer. */
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            /* REG_BP is handed to the macros below, so preserve it. */
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            "pxor    %%mm7, %%mm7                   \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif
            WRITERGB16(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
            /* Restore the two registers borrowed above. */
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
               NAMED_CONSTRAINTS_ADD(bF8,bFC)
        );
    } else {
        /* Two chroma lines: the "1b" macro variant receives ubuf[0] and ubuf[1]. */
        const int16_t *ubuf1 = ubuf[1];
        __asm__ volatile(
            /* Same register save / restore protocol as the branch above. */
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            "pxor    %%mm7, %%mm7                   \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2      \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4      \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5      \n\t"
#endif
            WRITERGB16(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
               NAMED_CONSTRAINTS_ADD(bF8,bFC)
        );
    }
}
1423

1424
/*
 * Loop prologue for the single-chroma-line packed writers.
 *
 * Emits asm text that zeroes `index`, opens local label "1:", and loads:
 *   mm3 = four 16-bit words at (%2 + index)                  >> 7 (arithmetic)
 *   mm4 = four 16-bit words at (%2 + index + UV_OFF_BYTE(c)) >> 7 (arithmetic)
 *   mm1 = four 16-bit words at (%0 + 2*index)                >> 7 (arithmetic)
 *   mm7 = four 16-bit words at (%0 + 2*index + 8)            >> 7 (arithmetic)
 * `index` is restored after the temporary UV_OFF_BYTE adjustment, so it is
 * unchanged when the macro text ends. The loop is closed by whatever asm
 * follows (the macro itself never jumps back to "1:").
 *
 * index: register used as the running offset; c: base register for the
 * UV_OFF_BYTE context slot. Operands %0 and %2 come from the enclosing asm
 * statement.
 *
 * YSCALEYUV2PACKED1 is the public spelling: the extra level lets macro
 * arguments expand before REAL_YSCALEYUV2PACKED1 stringifies them.
 */
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor            "#index", "#index"  \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm3     \n\t" /* uvbuf0[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
    "movq     (%2, "#index"), %%mm4     \n\t" /* uvbuf0[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
    "psraw                $7, %%mm3     \n\t" \
    "psraw                $7, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /* next 8 bytes of buf0 */\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t"

#define YSCALEYUV2PACKED1(index, c)  REAL_YSCALEYUV2PACKED1(index, c)
1440

1441
/*
 * Loop prologue for the two-chroma-line packed writers.
 *
 * Emits asm text that zeroes `index`, opens local label "1:", and computes:
 *   mm3 = (words at (%2 + index) + words at (%3 + index)) >> 8 (logical)
 *   mm4 = the same sum taken UV_OFF_BYTE(c) bytes further on, >> 8 (logical)
 *   mm1 = four 16-bit words at (%0 + 2*index)     >> 7 (arithmetic)
 *   mm7 = four 16-bit words at (%0 + 2*index + 8) >> 7 (arithmetic)
 * mm2 and mm5 are used as scratch. `index` is restored after the temporary
 * UV_OFF_BYTE adjustment, so it is unchanged when the macro text ends. The
 * loop is closed by whatever asm follows.
 *
 * index: register used as the running offset; c: base register for the
 * UV_OFF_BYTE context slot. Operands %0, %2 and %3 come from the enclosing
 * asm statement.
 *
 * YSCALEYUV2PACKED1b is the public spelling: the extra level lets macro
 * arguments expand before REAL_YSCALEYUV2PACKED1b stringifies them.
 */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index"             \n\t"\
    ".p2align              4            \n\t"\
    "1:                                 \n\t"\
    "movq     (%2, "#index"), %%mm2     \n\t" /* uvbuf0[eax]*/\
    "movq     (%3, "#index"), %%mm3     \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
    "movq     (%2, "#index"), %%mm5     \n\t" /* uvbuf0[eax+2048]*/\
    "movq     (%3, "#index"), %%mm4     \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index"  \n\t" \
    "paddw             %%mm2, %%mm3     \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw             %%mm5, %%mm4     \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw                $8, %%mm3     \n\t" \
    "psrlw                $8, %%mm4     \n\t" \
    "movq  (%0, "#index", 2), %%mm1     \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7     \n\t" /* next 8 bytes of buf0 */\
    "psraw                $7, %%mm1     \n\t" \
    "psraw                $7, %%mm7     \n\t"
#define YSCALEYUV2PACKED1b(index, c)  REAL_YSCALEYUV2PACKED1b(index, c)
1460

1461
/**
 * Convert a single luma line plus chroma into one packed output line written
 * by WRITEYUY2, using MMX inline assembly. No colour-space conversion macro
 * is involved: the samples loaded by YSCALEYUV2PACKED1 / YSCALEYUV2PACKED1b
 * are handed straight to WRITEYUY2.
 *
 * Operand map shared by both asm statements:
 *   %0 = buf0, %1 = buf1 (alias of buf0), %2 = ubuf0, %3 = ubuf1,
 *   %4 = dest (memory operand), %5 = &c->redDither.
 * Context fields are addressed as offsets from %5 (ESP_OFFSET, DSTW_OFFSET);
 * those offsets and WRITEYUY2 are defined outside this function.
 *
 * @param c       scaler context; only &c->redDither is handed to the asm
 * @param buf0    luma input line
 * @param ubuf    chroma input lines; ubuf[0] is always used, ubuf[1] only
 *                when uvalpha >= 2048
 * @param vbuf    not referenced by this function (the PACKED1 macros reach
 *                the second chroma plane at UV_OFF_BYTE bytes past the
 *                ubuf pointers)
 * @param abuf0   not referenced by this function
 * @param dest    output line
 * @param dstW    not referenced by the C code
 *                (NOTE(review): presumably read by the asm via
 *                DSTW_OFFSET(%5) -- confirm)
 * @param uvalpha selects the chroma path (see the branch below)
 * @param y       not referenced by this function
 */
static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
                                  const int16_t *abuf0, uint8_t *dest,
                                  int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        /* Single chroma line: both chroma operands point at ubuf[0]. */
        const int16_t *ubuf1 = ubuf[0];
        __asm__ volatile(
            /* Park REG_b in the context so it can carry the dest pointer. */
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            /* REG_BP serves as the loop index inside the macros. */
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2PACKED1(%%REGBP, %5)
            WRITEYUY2(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
            /* Restore the two registers borrowed above. */
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        /* Two chroma lines: PACKED1b sums ubuf[0] and ubuf[1] before shifting. */
        const int16_t *ubuf1 = ubuf[1];
        __asm__ volatile(
            /* Same register save / restore protocol as the branch above. */
            "mov %%"REG_b", "ESP_OFFSET"(%5)        \n\t"
            "mov        %4, %%"REG_b"               \n\t"
            "push %%"REG_BP"                        \n\t"
            YSCALEYUV2PACKED1b(%%REGBP, %5)
            WRITEYUY2(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
            "pop %%"REG_BP"                         \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b"        \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
1497
/**
 * Install the MMX / MMXEXT output and horizontal-scaling callbacks of this
 * template instantiation into the scaler context.
 *
 * Vertical / packed output callbacks are installed only when the destination
 * format is not 16-bit, not 9/10-bit, not NV12/NV21, and SWS_BITEXACT is not
 * requested:
 *   - with SWS_ACCURATE_RND, only the "_ar" yuv2packedX variants are set
 *     (and only without SWS_FULL_CHR_H_INT); use_mmx_vfilter stays 0;
 *   - otherwise use_mmx_vfilter is set to 1, yuv2planeX is replaced and the
 *     plain yuv2packedX variants are set (again only without
 *     SWS_FULL_CHR_H_INT);
 *   - in both cases yuv2packed1 / yuv2packed2 are set per destination format
 *     when SWS_FULL_CHR_H_INT is not requested.
 * Formats that have no case in a switch keep whatever callback the context
 * already holds.
 *
 * For 8-bit sources with a destination depth of at most 14 bits, the fast
 * horizontal scalers are set to the MMXEXT versions when this is the MMXEXT
 * build, SWS_FAST_BILINEAR is requested and c->canMMXEXTBeUsed is non-zero;
 * in every other case they are reset to NULL.
 *
 * @param c scaler context to update in place
 */
static av_cold void RENAME(sws_init_swscale)(SwsContext *c)
{
    enum AVPixelFormat dstFormat = c->dstFormat;

    c->use_mmx_vfilter= 0;
    if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != AV_PIX_FMT_NV12
        && dstFormat != AV_PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) {
        if (c->flags & SWS_ACCURATE_RND) {
            if (!(c->flags & SWS_FULL_CHR_H_INT)) {
                switch (dstFormat) {
                case AV_PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X_ar);   break;
#if HAVE_6REGS
                case AV_PIX_FMT_BGR24:   c->yuv2packedX = RENAME(yuv2bgr24_X_ar);   break;
#endif
                case AV_PIX_FMT_RGB555:  c->yuv2packedX = RENAME(yuv2rgb555_X_ar);  break;
                case AV_PIX_FMT_RGB565:  c->yuv2packedX = RENAME(yuv2rgb565_X_ar);  break;
                case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
                default: break;
                }
            }
        } else {
            c->use_mmx_vfilter= 1;
            c->yuv2planeX = RENAME(yuv2yuvX    );
            if (!(c->flags & SWS_FULL_CHR_H_INT)) {
                switch (dstFormat) {
                case AV_PIX_FMT_RGB32:   c->yuv2packedX = RENAME(yuv2rgb32_X);   break;
                case AV_PIX_FMT_BGR32:   c->yuv2packedX = RENAME(yuv2bgr32_X);   break;
#if HAVE_6REGS
                case AV_PIX_FMT_BGR24:   c->yuv2packedX = RENAME(yuv2bgr24_X);   break;
#endif
                case AV_PIX_FMT_RGB555:  c->yuv2packedX = RENAME(yuv2rgb555_X);  break;
                case AV_PIX_FMT_RGB565:  c->yuv2packedX = RENAME(yuv2rgb565_X);  break;
                case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
                default: break;
                }
            }
        }
        /* One- and two-line packed writers are shared by both rounding modes. */
        if (!(c->flags & SWS_FULL_CHR_H_INT)) {
            switch (dstFormat) {
            case AV_PIX_FMT_RGB32:
                c->yuv2packed1 = RENAME(yuv2rgb32_1);
                c->yuv2packed2 = RENAME(yuv2rgb32_2);
                break;
            case AV_PIX_FMT_BGR24:
                c->yuv2packed1 = RENAME(yuv2bgr24_1);
                c->yuv2packed2 = RENAME(yuv2bgr24_2);
                break;
            case AV_PIX_FMT_RGB555:
                c->yuv2packed1 = RENAME(yuv2rgb555_1);
                c->yuv2packed2 = RENAME(yuv2rgb555_2);
                break;
            case AV_PIX_FMT_RGB565:
                c->yuv2packed1 = RENAME(yuv2rgb565_1);
                c->yuv2packed2 = RENAME(yuv2rgb565_2);
                break;
            case AV_PIX_FMT_YUYV422:
                c->yuv2packed1 = RENAME(yuv2yuyv422_1);
                c->yuv2packed2 = RENAME(yuv2yuyv422_2);
                break;
            default:
                break;
            }
        }
    }

    if (c->srcBpc == 8 && c->dstBpc <= 14) {
        // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one).
#if COMPILE_TEMPLATE_MMXEXT
        if ((c->flags & SWS_FAST_BILINEAR) && c->canMMXEXTBeUsed) {
            c->hyscale_fast = ff_hyscale_fast_mmxext;
            c->hcscale_fast = ff_hcscale_fast_mmxext;
        } else {
#endif /* COMPILE_TEMPLATE_MMXEXT */
            /* Non-MMXEXT builds always take this path. */
            c->hyscale_fast = NULL;
            c->hcscale_fast = NULL;
#if COMPILE_TEMPLATE_MMXEXT
        }
#endif /* COMPILE_TEMPLATE_MMXEXT */
    }
}
1577

1578
Product

Resources

Company