/*
 * software RGB to RGB converter
 * pluralize by software PAL8 to RGB converter
 *              software YUV to YUV converter
 *              software YUV to RGB converter
 * Written by Nick Kurshev.
 * palette & YUV & runtime CPU stuff by Michael ([email protected])
 * lot of big-endian byte order fixes by Alex Beregszaszi
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stddef.h>
#include <stdint.h>

#include "libavutil/attributes.h"
#include "libavutil/x86/asm.h"

#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef PAVGB

#if COMPILE_TEMPLATE_AMD3DNOW
#define PREFETCH "prefetch"
#define PAVGB    "pavgusb"
#elif COMPILE_TEMPLATE_MMXEXT
#define PREFETCH "prefetchnta"
#define PAVGB    "pavgb"
#else
#define PREFETCH " # nop"
#endif

#if COMPILE_TEMPLATE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

#if COMPILE_TEMPLATE_MMXEXT
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif
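
/*
 * Note on the abstractions above: on MMXEXT builds MOVNTQ is the
 * non-temporal store movntq, which bypasses the cache and must be
 * ordered with sfence before the written data is relied upon; plain
 * MMX builds fall back to movq and a no-op fence. femms is the 3DNow!
 * counterpart of emms (faster on K6, equivalent on K7).
 */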

#if !COMPILE_TEMPLATE_SSE2

#if !COMPILE_TEMPLATE_AMD3DNOW

static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 23;
    __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "punpckldq 3(%1), %%mm0 \n\t"
            "movd 6(%1), %%mm1 \n\t"
            "punpckldq 9(%1), %%mm1 \n\t"
            "movd 12(%1), %%mm2 \n\t"
            "punpckldq 15(%1), %%mm2 \n\t"
            "movd 18(%1), %%mm3 \n\t"
            "punpckldq 21(%1), %%mm3 \n\t"
            "por %%mm7, %%mm0 \n\t"
            "por %%mm7, %%mm1 \n\t"
            "por %%mm7, %%mm2 \n\t"
            "por %%mm7, %%mm3 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            MOVNTQ" %%mm1, 8(%0) \n\t"
            MOVNTQ" %%mm2, 16(%0) \n\t"
            MOVNTQ" %%mm3, 24(%0)"
            :: "r"(dest), "r"(s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 255;
    }
}

#define STORE_BGR24_MMX \
    "psrlq $8, %%mm2 \n\t" \
    "psrlq $8, %%mm3 \n\t" \
    "psrlq $8, %%mm6 \n\t" \
    "psrlq $8, %%mm7 \n\t" \
    "pand "MANGLE(mask24l)", %%mm0\n\t" \
    "pand "MANGLE(mask24l)", %%mm1\n\t" \
    "pand "MANGLE(mask24l)", %%mm4\n\t" \
    "pand "MANGLE(mask24l)", %%mm5\n\t" \
    "pand "MANGLE(mask24h)", %%mm2\n\t" \
    "pand "MANGLE(mask24h)", %%mm3\n\t" \
    "pand "MANGLE(mask24h)", %%mm6\n\t" \
    "pand "MANGLE(mask24h)", %%mm7\n\t" \
    "por %%mm2, %%mm0 \n\t" \
    "por %%mm3, %%mm1 \n\t" \
    "por %%mm6, %%mm4 \n\t" \
    "por %%mm7, %%mm5 \n\t" \
    \
    "movq %%mm1, %%mm2 \n\t" \
    "movq %%mm4, %%mm3 \n\t" \
    "psllq $48, %%mm2 \n\t" \
    "psllq $32, %%mm3 \n\t" \
    "por %%mm2, %%mm0 \n\t" \
    "psrlq $16, %%mm1 \n\t" \
    "psrlq $32, %%mm4 \n\t" \
    "psllq $16, %%mm5 \n\t" \
    "por %%mm3, %%mm1 \n\t" \
    "por %%mm5, %%mm4 \n\t" \
    \
    MOVNTQ" %%mm0, (%0) \n\t" \
    MOVNTQ" %%mm1, 8(%0) \n\t" \
    MOVNTQ" %%mm4, 16(%0)"


static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 31;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq 8(%1), %%mm1 \n\t"
            "movq 16(%1), %%mm4 \n\t"
            "movq 24(%1), %%mm5 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"
            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            STORE_BGR24_MMX
            :: "r"(dest), "r"(s)
              NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
    }
}

/*
  original by Strepto/Astral
  ported to gcc & bugfixed: A'rpi
  MMXEXT, 3DNOW optimization by Nick Kurshev
  32-bit C version, and and&add trick by Michael Niedermayer
*/
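/*
 * The and&add trick, for one RGB555 pixel x = 0RRRRRGGGGGBBBBB:
 * (x & 0x7FFF) is the whole pixel and (x & 0x7FE0) its R+G field;
 * adding the two doubles R+G, i.e. shifts it left by one bit, so blue
 * stays in bits 0-4 while green lands in bits 6-10 (new LSB 0) and red
 * in bits 11-15 -- exactly the RGB565 layout. The MMX loop below does
 * the same on four pixels at a time with pand/paddw.
 */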
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
    mm_end = end - 15;
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "pand %%mm4, %%mm0 \n\t"
            "pand %%mm4, %%mm2 \n\t"
            "paddw %%mm1, %%mm0 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            MOVNTQ" %%mm2, 8(%0)"
            :: "r"(d), "r"(s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    mm_end = end - 3;
    while (s < mm_end) {
        register unsigned x= *((const uint32_t *)s);
        *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
        d+=4;
        s+=4;
    }
    if (s < end) {
        register unsigned short x= *((const uint16_t *)s);
        *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
    }
}

static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    register const uint8_t* s=src;
    register uint8_t* d=dst;
    register const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s));
    __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
    __asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
    mm_end = end - 15;
    while (s<mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $1, %%mm0 \n\t"
            "psrlq $1, %%mm2 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm3 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            MOVNTQ" %%mm2, 8(%0)"
            :: "r"(d), "r"(s)
        );
        d+=16;
        s+=16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    mm_end = end - 3;
    while (s < mm_end) {
        register uint32_t x= *((const uint32_t*)s);
        *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
        s+=4;
        d+=4;
    }
    if (s < end) {
        register uint16_t x= *((const uint16_t*)s);
        *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
    }
}

static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = end - 15;
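    /* The pmaddwd below multiplies the mask3216br-masked blue and red
     * words by per-component factors from mul3216 and sums each pair,
     * building both fields of the 565 word in one instruction; green is
     * masked with %%mm5, merged by por, and the psrld/pslld pair then
     * aligns the result. */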
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $5, %%mm0 \n\t"
        "pslld $11, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
    );
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
    }
}

static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 4(%1), %%mm3 \n\t"
            "punpckldq 8(%1), %%mm0 \n\t"
            "punpckldq 12(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            :: "r"(d),"r"(s),"m"(blue_16mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
    }
}

static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    mm_end = end - 15;
    __asm__ volatile(
        "movq %3, %%mm5 \n\t"
        "movq %4, %%mm6 \n\t"
        "movq %5, %%mm7 \n\t"
        "jmp 2f \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1) \n\t"
        "movd (%1), %%mm0 \n\t"
        "movd 4(%1), %%mm3 \n\t"
        "punpckldq 8(%1), %%mm0 \n\t"
        "punpckldq 12(%1), %%mm3 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "movq %%mm3, %%mm4 \n\t"
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pmaddwd %%mm7, %%mm0 \n\t"
        "pmaddwd %%mm7, %%mm3 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm5, %%mm4 \n\t"
        "por %%mm1, %%mm0 \n\t"
        "por %%mm4, %%mm3 \n\t"
        "psrld $6, %%mm0 \n\t"
        "pslld $10, %%mm3 \n\t"
        "por %%mm3, %%mm0 \n\t"
        MOVNTQ" %%mm0, (%0) \n\t"
        "add $16, %1 \n\t"
        "add $8, %0 \n\t"
        "2: \n\t"
        "cmp %2, %1 \n\t"
        " jb 1b \n\t"
        : "+r" (d), "+r"(s)
        : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
    );
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
    }
}

static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 4(%1), %%mm3 \n\t"
            "punpckldq 8(%1), %%mm0 \n\t"
            "punpckldq 12(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
        d += 4;
        s += 16;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register int rgb = *(const uint32_t*)s; s += 4;
        *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
    }
}

static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 11;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $8, %%mm2 \n\t"
            "psrlq $8, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}

static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_16mask),"m"(green_16mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $8, %%mm0 \n\t"
            "psllq $8, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $5, %%mm1 \n\t"
            "psrlq $5, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_16mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
    }
}

static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 11;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psrlq $3, %%mm0 \n\t"
            "psrlq $3, %%mm3 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %2, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $9, %%mm2 \n\t"
            "psrlq $9, %%mm5 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int b = *s++;
        const int g = *s++;
        const int r = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}

static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    uint16_t *d = (uint16_t *)dst;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        ::"m"(red_15mask),"m"(green_15mask));
    mm_end = end - 15;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movd (%1), %%mm0 \n\t"
            "movd 3(%1), %%mm3 \n\t"
            "punpckldq 6(%1), %%mm0 \n\t"
            "punpckldq 9(%1), %%mm3 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm3, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "psllq $7, %%mm0 \n\t"
            "psllq $7, %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "psrlq $6, %%mm1 \n\t"
            "psrlq $6, %%mm4 \n\t"
            "pand %%mm6, %%mm1 \n\t"
            "pand %%mm6, %%mm4 \n\t"
            "psrlq $19, %%mm2 \n\t"
            "psrlq $19, %%mm5 \n\t"
            "pand %2, %%mm2 \n\t"
            "pand %2, %%mm5 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "por %%mm5, %%mm3 \n\t"
            "psllq $16, %%mm3 \n\t"
            "por %%mm3, %%mm0 \n\t"
            MOVNTQ" %%mm0, (%0) \n\t"
            ::"r"(d),"r"(s),"m"(blue_15mask):"memory");
        d += 4;
        s += 12;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        const int r = *s++;
        const int g = *s++;
        const int b = *s++;
        *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
    }
}

static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $5, %%mm0 \n\t"
            "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
            "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t"
            "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"

            "movq 8(%1), %%mm0 \n\t"
            "movq 8(%1), %%mm1 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $5, %%mm0 \n\t"
            "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
            "pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t"
            "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            :"=m"(*d)
            :"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
             NAMED_CONSTRAINTS_ADD(mul15_mid,mul15_hi)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"

            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"

            STORE_BGR24_MMX

            :: "r"(d), "m"(*s)
              NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
        *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
    }
}

static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = (uint8_t *)dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 7;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $5, %%mm0 \n\t"
            "psrlq $1, %%mm2 \n\t"
            "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
            "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
            "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"

            "movq %%mm0, %%mm6 \n\t"
            "movq %%mm3, %%mm7 \n\t"

            "movq 8(%1), %%mm0 \n\t"
            "movq 8(%1), %%mm1 \n\t"
            "movq 8(%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $5, %%mm0 \n\t"
            "psrlq $1, %%mm2 \n\t"
            "pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
            "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
            "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
            "movq %%mm0, %%mm3 \n\t"
            "movq %%mm1, %%mm4 \n\t"
            "movq %%mm2, %%mm5 \n\t"
            "punpcklwd %5, %%mm0 \n\t"
            "punpcklwd %5, %%mm1 \n\t"
            "punpcklwd %5, %%mm2 \n\t"
            "punpckhwd %5, %%mm3 \n\t"
            "punpckhwd %5, %%mm4 \n\t"
            "punpckhwd %5, %%mm5 \n\t"
            "psllq $8, %%mm1 \n\t"
            "psllq $16, %%mm2 \n\t"
            "por %%mm1, %%mm0 \n\t"
            "por %%mm2, %%mm0 \n\t"
            "psllq $8, %%mm4 \n\t"
            "psllq $16, %%mm5 \n\t"
            "por %%mm4, %%mm3 \n\t"
            "por %%mm5, %%mm3 \n\t"
            :"=m"(*d)
            :"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
             NAMED_CONSTRAINTS_ADD(mul15_mid,mul16_mid,mul15_hi)
            :"memory");
        /* borrowed 32 to 24 */
        __asm__ volatile(
            "movq %%mm0, %%mm4 \n\t"
            "movq %%mm3, %%mm5 \n\t"
            "movq %%mm6, %%mm0 \n\t"
            "movq %%mm7, %%mm1 \n\t"

            "movq %%mm4, %%mm6 \n\t"
            "movq %%mm5, %%mm7 \n\t"
            "movq %%mm0, %%mm2 \n\t"
            "movq %%mm1, %%mm3 \n\t"

            STORE_BGR24_MMX

            :: "r"(d), "m"(*s)
              NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
            :"memory");
        d += 24;
        s += 8;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
    }
}

/*
 * mm0 = 00 B3 00 B2 00 B1 00 B0
 * mm1 = 00 G3 00 G2 00 G1 00 G0
 * mm2 = 00 R3 00 R2 00 R1 00 R0
 * mm6 = FF FF FF FF FF FF FF FF
 * mm7 = 00 00 00 00 00 00 00 00
 */
#define PACK_RGB32 \
    "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
    "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
    "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
    "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
    "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
    "movq %%mm0, %%mm3 \n\t" \
    "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
    "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
    MOVNTQ" %%mm0, (%0) \n\t" \
    MOVNTQ" %%mm3, 8(%0) \n\t" \

static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t *)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    mm_end = end - 3;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $5, %%mm0 \n\t"
            "pmulhw %5, %%mm0 \n\t"
            "pmulhw %5, %%mm1 \n\t"
            "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
            PACK_RGB32
            ::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r) ,"m"(mul15_mid)
              NAMED_CONSTRAINTS_ADD(mul15_hi)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
        *d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
        *d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
        *d++ = 255;
    }
}

static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size)
{
    const uint16_t *end;
    const uint16_t *mm_end;
    uint8_t *d = dst;
    const uint16_t *s = (const uint16_t*)src;
    end = s + src_size/2;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
    __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
    mm_end = end - 3;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH" 32(%1) \n\t"
            "movq (%1), %%mm0 \n\t"
            "movq (%1), %%mm1 \n\t"
            "movq (%1), %%mm2 \n\t"
            "pand %2, %%mm0 \n\t"
            "pand %3, %%mm1 \n\t"
            "pand %4, %%mm2 \n\t"
            "psllq $5, %%mm0 \n\t"
            "psrlq $1, %%mm2 \n\t"
            "pmulhw %5, %%mm0 \n\t"
            "pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
            "pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
            PACK_RGB32
            ::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid)
              NAMED_CONSTRAINTS_ADD(mul16_mid,mul15_hi)
            :"memory");
        d += 16;
        s += 4;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
        *d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
        *d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
        *d++ = 255;
    }
}

static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size)
{
    x86_reg idx = 15 - src_size;
    const uint8_t *s = src-idx;
    uint8_t *d = dst-idx;
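    /* idx starts at -(src_size-15) and counts up toward zero: biasing s
     * and d by -idx lets the asm loop below use a single "add $16 / js"
     * as both increment and exit test, while the C loop at the end
     * picks up the final partial chunk. */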
    __asm__ volatile(
        "test %0, %0 \n\t"
        "jns 2f \n\t"
        PREFETCH" (%1, %0) \n\t"
        "movq %3, %%mm7 \n\t"
        "pxor %4, %%mm7 \n\t"
        "movq %%mm7, %%mm6 \n\t"
        "pxor %5, %%mm7 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1, %0) \n\t"
        "movq (%1, %0), %%mm0 \n\t"
        "movq 8(%1, %0), %%mm1 \n\t"
#if COMPILE_TEMPLATE_MMXEXT
        "pshufw $177, %%mm0, %%mm3 \n\t"
        "pshufw $177, %%mm1, %%mm5 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm6, %%mm3 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm6, %%mm5 \n\t"
        "por %%mm3, %%mm0 \n\t"
        "por %%mm5, %%mm1 \n\t"
#else
        "movq %%mm0, %%mm2 \n\t"
        "movq %%mm1, %%mm4 \n\t"
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm6, %%mm4 \n\t"
        "movq %%mm2, %%mm3 \n\t"
        "movq %%mm4, %%mm5 \n\t"
        "pslld $16, %%mm2 \n\t"
        "psrld $16, %%mm3 \n\t"
        "pslld $16, %%mm4 \n\t"
        "psrld $16, %%mm5 \n\t"
        "por %%mm2, %%mm0 \n\t"
        "por %%mm4, %%mm1 \n\t"
        "por %%mm3, %%mm0 \n\t"
        "por %%mm5, %%mm1 \n\t"
#endif
        MOVNTQ" %%mm0, (%2, %0) \n\t"
        MOVNTQ" %%mm1, 8(%2, %0) \n\t"
        "add $16, %0 \n\t"
        "js 1b \n\t"
        SFENCE" \n\t"
        EMMS" \n\t"
        "2: \n\t"
        : "+&r"(idx)
        : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
        : "memory");
    for (; idx<15; idx+=4) {
        register unsigned v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
        v &= 0xff00ff;
        *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
    }
}

static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
    unsigned i;
    x86_reg mmx_size= 23 - src_size;
    __asm__ volatile (
        "test %%"REG_a", %%"REG_a" \n\t"
        "jns 2f \n\t"
        "movq "MANGLE(mask24r)", %%mm5 \n\t"
        "movq "MANGLE(mask24g)", %%mm6 \n\t"
        "movq "MANGLE(mask24b)", %%mm7 \n\t"
        ".p2align 4 \n\t"
        "1: \n\t"
        PREFETCH" 32(%1, %%"REG_a") \n\t"
        "movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
        "movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
        "movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
        "psllq $16, %%mm0 \n\t" // 00 BGR BGR
        "pand %%mm5, %%mm0 \n\t"
        "pand %%mm6, %%mm1 \n\t"
        "pand %%mm7, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
        MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
        "movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
        "movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
        "pand %%mm7, %%mm0 \n\t"
        "pand %%mm5, %%mm1 \n\t"
        "pand %%mm6, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        "movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
        MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
        "movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
        "movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
        "pand %%mm6, %%mm0 \n\t"
        "pand %%mm7, %%mm1 \n\t"
        "pand %%mm5, %%mm2 \n\t"
        "por %%mm0, %%mm1 \n\t"
        "por %%mm2, %%mm1 \n\t"
        MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
        "add $24, %%"REG_a" \n\t"
        " js 1b \n\t"
        "2: \n\t"
        : "+a" (mmx_size)
        : "r" (src-mmx_size), "r"(dst-mmx_size)
          NAMED_CONSTRAINTS_ADD(mask24r,mask24g,mask24b)
    );

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");

    if (mmx_size==23) return; //finished, was multiple of 8

    src+= src_size;
    dst+= src_size;
    src_size= 23-mmx_size;
    src-= src_size;
    dst-= src_size;
    for (i=0; i<src_size; i+=3) {
        register uint8_t x;
        x          = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}

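/*
 * vertLumPerChroma is the number of luma lines that share one chroma
 * line: 2 when packing 4:2:0 (yv12) sources, 1 for 4:2:2 (yuv422p).
 */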
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           int width, int height,
                                           int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
    int y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++) {
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"REG_a") \n\t"
            PREFETCH" 32(%3, %%"REG_a") \n\t"
            "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm3, %%mm4 \n\t" // Y(0)
            "movq %%mm5, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)

            MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst  += dstStride;
    }
    __asm__(EMMS" \n\t"
            SFENCE" \n\t"
            :::"memory");
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      int width, int height,
                                      int lumStride, int chromStride, int dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}

static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           int width, int height,
                                           int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
    int y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++) {
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
            PREFETCH" 32(%2, %%"REG_a") \n\t"
            PREFETCH" 32(%3, %%"REG_a") \n\t"
            "movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
            "movq %%mm0, %%mm2 \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)

            "movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
            "movq %%mm0, %%mm4 \n\t" // Y(0)
            "movq %%mm2, %%mm6 \n\t" // Y(8)
            "punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)

            MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst += dstStride;
    }
    __asm__(EMMS" \n\t"
            SFENCE" \n\t"
            :::"memory");
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      int width, int height,
                                      int lumStride, int chromStride, int dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}

/**
 * Width should be a multiple of 16.
 */
static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         int width, int height,
                                         int lumStride, int chromStride, int dstStride)
{
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}

/**
 * Width should be a multiple of 16.
 */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         int width, int height,
                                         int lumStride, int chromStride, int dstStride)
{
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}

/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      int width, int height,
                                      int lumStride, int chromStride, int srcStride)
{
    int y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y+=2) {
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
            "psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)

            MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"

            "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
            "movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
            "movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
            "psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
            "psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"

            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)

            MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
            MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );

        ydst += lumStride;
        src  += srcStride;

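        /* second pass: luma only, from the odd line; %%mm7 still holds
         * the 0x00FF word mask set up by the block above */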
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
            "pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
{
    int x,y;

    dst[0]= src[0];

    // first line
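    /* horizontal doubling with 3:1 / 1:3 taps: the two output samples
     * land a quarter pixel either side of each source-pixel midpoint */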
    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] + src[x+1])>>2;
        dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

    dst+= dstStride;

    for (y=1; y<srcHeight; y++) {
        x86_reg mmxSize= srcWidth&~15;

        if (mmxSize) {
            __asm__ volatile(
                "mov %4, %%"REG_a" \n\t"
                "movq "MANGLE(mmx_ff)", %%mm0 \n\t"
                "movq (%0, %%"REG_a"), %%mm4 \n\t"
                "movq %%mm4, %%mm2 \n\t"
                "psllq $8, %%mm4 \n\t"
                "pand %%mm0, %%mm2 \n\t"
                "por %%mm2, %%mm4 \n\t"
                "movq (%1, %%"REG_a"), %%mm5 \n\t"
                "movq %%mm5, %%mm3 \n\t"
                "psllq $8, %%mm5 \n\t"
                "pand %%mm0, %%mm3 \n\t"
                "por %%mm3, %%mm5 \n\t"
                "1: \n\t"
                "movq (%0, %%"REG_a"), %%mm0 \n\t"
                "movq (%1, %%"REG_a"), %%mm1 \n\t"
                "movq 1(%0, %%"REG_a"), %%mm2 \n\t"
                "movq 1(%1, %%"REG_a"), %%mm3 \n\t"
                PAVGB" %%mm0, %%mm5 \n\t"
                PAVGB" %%mm0, %%mm3 \n\t"
                PAVGB" %%mm0, %%mm5 \n\t"
                PAVGB" %%mm0, %%mm3 \n\t"
                PAVGB" %%mm1, %%mm4 \n\t"
                PAVGB" %%mm1, %%mm2 \n\t"
                PAVGB" %%mm1, %%mm4 \n\t"
                PAVGB" %%mm1, %%mm2 \n\t"
                "movq %%mm5, %%mm7 \n\t"
                "movq %%mm4, %%mm6 \n\t"
                "punpcklbw %%mm3, %%mm5 \n\t"
                "punpckhbw %%mm3, %%mm7 \n\t"
                "punpcklbw %%mm2, %%mm4 \n\t"
                "punpckhbw %%mm2, %%mm6 \n\t"
                MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
                MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
                MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
                MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
                "add $8, %%"REG_a" \n\t"
                "movq -1(%0, %%"REG_a"), %%mm4 \n\t"
                "movq -1(%1, %%"REG_a"), %%mm5 \n\t"
                " js 1b \n\t"
                :: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
                   "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
                   "g" (-mmxSize)
                   NAMED_CONSTRAINTS_ADD(mmx_ff)
                : "%"REG_a
            );
        } else {
            mmxSize = 1;
            dst[0]         = (src[0] * 3 + src[srcStride]) >> 2;
            dst[dstStride] = (src[0]     + 3 * src[srcStride]) >> 2;
        }

        for (x=mmxSize-1; x<srcWidth-1; x++) {
            dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
            dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
        }
        dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
        dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

        dst+=dstStride*2;
        src+=srcStride;
    }

    // last line
    dst[0]= src[0];

    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] + src[x+1])>>2;
        dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");
}
#endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */

#if !COMPILE_TEMPLATE_AMD3DNOW
/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line, others are ignored.
 * FIXME: Write HQ version.
 */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      int width, int height,
                                      int lumStride, int chromStride, int srcStride)
{
    int y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y+=2) {
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
            "movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
            "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
            "pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
            "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
            "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)

            MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"

            "movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
            "movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
            "movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
            "movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
            "pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"

            "movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)

            MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
            MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );

        ydst += lumStride;
        src  += srcStride;

        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
            "movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
            "psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
            "psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
            MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"

            "add $8, %%"REG_a" \n\t"
            "cmp %4, %%"REG_a" \n\t"
            " jb 1b \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

/**
 * Height should be a multiple of 2 and width should be a multiple of 2.
 * (If this is a problem for anyone then tell me, and I will fix it.)
 * Chrominance data is only taken from every second line,
 * others are ignored in the C version.
 * FIXME: Write HQ version.
 */
#if HAVE_7REGS
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                       int width, int height,
                                       int lumStride, int chromStride, int srcStride,
                                       int32_t *rgb2yuv)
{
#define BGR2Y_IDX "16*4+16*32"
#define BGR2U_IDX "16*4+16*33"
#define BGR2V_IDX "16*4+16*34"
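/* byte offsets of the packed Y, U and V coefficient quadwords inside the
 * rgb2yuv table, as used by the "movq "BGR2?_IDX"(%N)" loads below */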
1642
int y;
1643
const x86_reg chromWidth= width>>1;
1644
1645
if (height > 2) {
1646
ff_rgb24toyv12_c(src, ydst, udst, vdst, width, 2, lumStride, chromStride, srcStride, rgb2yuv);
1647
src += 2*srcStride;
1648
ydst += 2*lumStride;
1649
udst += chromStride;
1650
vdst += chromStride;
1651
height -= 2;
1652
}
1653
1654
for (y=0; y<height-2; y+=2) {
1655
int i;
1656
for (i=0; i<2; i++) {
1657
__asm__ volatile(
1658
"mov %2, %%"REG_a" \n\t"
1659
"movq "BGR2Y_IDX"(%3), %%mm6 \n\t"
1660
"movq "MANGLE(ff_w1111)", %%mm5 \n\t"
1661
"pxor %%mm7, %%mm7 \n\t"
1662
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
1663
".p2align 4 \n\t"
1664
"1: \n\t"
1665
PREFETCH" 64(%0, %%"REG_d") \n\t"
1666
"movd (%0, %%"REG_d"), %%mm0 \n\t"
1667
"movd 3(%0, %%"REG_d"), %%mm1 \n\t"
1668
"punpcklbw %%mm7, %%mm0 \n\t"
1669
"punpcklbw %%mm7, %%mm1 \n\t"
1670
"movd 6(%0, %%"REG_d"), %%mm2 \n\t"
1671
"movd 9(%0, %%"REG_d"), %%mm3 \n\t"
1672
"punpcklbw %%mm7, %%mm2 \n\t"
1673
"punpcklbw %%mm7, %%mm3 \n\t"
1674
"pmaddwd %%mm6, %%mm0 \n\t"
1675
"pmaddwd %%mm6, %%mm1 \n\t"
1676
"pmaddwd %%mm6, %%mm2 \n\t"
1677
"pmaddwd %%mm6, %%mm3 \n\t"
1678
"psrad $8, %%mm0 \n\t"
1679
"psrad $8, %%mm1 \n\t"
1680
"psrad $8, %%mm2 \n\t"
1681
"psrad $8, %%mm3 \n\t"
1682
"packssdw %%mm1, %%mm0 \n\t"
1683
"packssdw %%mm3, %%mm2 \n\t"
1684
"pmaddwd %%mm5, %%mm0 \n\t"
1685
"pmaddwd %%mm5, %%mm2 \n\t"
1686
"packssdw %%mm2, %%mm0 \n\t"
1687
"psraw $7, %%mm0 \n\t"
1688
1689
"movd 12(%0, %%"REG_d"), %%mm4 \n\t"
1690
"movd 15(%0, %%"REG_d"), %%mm1 \n\t"
1691
"punpcklbw %%mm7, %%mm4 \n\t"
1692
"punpcklbw %%mm7, %%mm1 \n\t"
1693
"movd 18(%0, %%"REG_d"), %%mm2 \n\t"
1694
"movd 21(%0, %%"REG_d"), %%mm3 \n\t"
1695
"punpcklbw %%mm7, %%mm2 \n\t"
1696
"punpcklbw %%mm7, %%mm3 \n\t"
1697
"pmaddwd %%mm6, %%mm4 \n\t"
1698
"pmaddwd %%mm6, %%mm1 \n\t"
1699
"pmaddwd %%mm6, %%mm2 \n\t"
1700
"pmaddwd %%mm6, %%mm3 \n\t"
1701
"psrad $8, %%mm4 \n\t"
1702
"psrad $8, %%mm1 \n\t"
1703
"psrad $8, %%mm2 \n\t"
1704
"psrad $8, %%mm3 \n\t"
1705
"packssdw %%mm1, %%mm4 \n\t"
1706
"packssdw %%mm3, %%mm2 \n\t"
1707
"pmaddwd %%mm5, %%mm4 \n\t"
1708
"pmaddwd %%mm5, %%mm2 \n\t"
1709
"add $24, %%"REG_d" \n\t"
1710
"packssdw %%mm2, %%mm4 \n\t"
1711
"psraw $7, %%mm4 \n\t"
1712
1713
"packuswb %%mm4, %%mm0 \n\t"
1714
"paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
1715
1716
MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
1717
"add $8, %%"REG_a" \n\t"
1718
" js 1b \n\t"
1719
: : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width), "r"(rgb2yuv)
1720
NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2YOffset)
1721
: "%"REG_a, "%"REG_d
1722
);
1723
ydst += lumStride;
1724
src += srcStride;
1725
}
1726
        src -= srcStride*2;
        __asm__ volatile(
            "mov %4, %%"REG_a" \n\t"
            "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
            "movq "BGR2U_IDX"(%5), %%mm6 \n\t"
            "pxor %%mm7, %%mm7 \n\t"
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
            "add %%"REG_d", %%"REG_d" \n\t"
            ".p2align 4 \n\t"
            "1: \n\t"
            PREFETCH" 64(%0, %%"REG_d") \n\t"
            PREFETCH" 64(%1, %%"REG_d") \n\t"
#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
            "movq (%0, %%"REG_d"), %%mm0 \n\t"
            "movq (%1, %%"REG_d"), %%mm1 \n\t"
            "movq 6(%0, %%"REG_d"), %%mm2 \n\t"
            "movq 6(%1, %%"REG_d"), %%mm3 \n\t"
            PAVGB" %%mm1, %%mm0 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $24, %%mm0 \n\t"
            "psrlq $24, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm0 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
#else
            "movd (%0, %%"REG_d"), %%mm0 \n\t"
            "movd (%1, %%"REG_d"), %%mm1 \n\t"
            "movd 3(%0, %%"REG_d"), %%mm2 \n\t"
            "movd 3(%1, %%"REG_d"), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm0 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "paddw %%mm1, %%mm0 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm2, %%mm0 \n\t"
            "movd 6(%0, %%"REG_d"), %%mm4 \n\t"
            "movd 6(%1, %%"REG_d"), %%mm1 \n\t"
            "movd 9(%0, %%"REG_d"), %%mm2 \n\t"
            "movd 9(%1, %%"REG_d"), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "paddw %%mm1, %%mm4 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm4, %%mm2 \n\t"
            "psrlw $2, %%mm0 \n\t"
            "psrlw $2, %%mm2 \n\t"
#endif
            "movq "BGR2V_IDX"(%5), %%mm1 \n\t"
            "movq "BGR2V_IDX"(%5), %%mm3 \n\t"

            "pmaddwd %%mm0, %%mm1 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "pmaddwd %%mm6, %%mm0 \n\t"
            "pmaddwd %%mm6, %%mm2 \n\t"
            "psrad $8, %%mm0 \n\t"
            "psrad $8, %%mm1 \n\t"
            "psrad $8, %%mm2 \n\t"
            "psrad $8, %%mm3 \n\t"
            "packssdw %%mm2, %%mm0 \n\t"
            "packssdw %%mm3, %%mm1 \n\t"
            "pmaddwd %%mm5, %%mm0 \n\t"
            "pmaddwd %%mm5, %%mm1 \n\t"
            "packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
            "psraw $7, %%mm0 \n\t"

#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
            "movq 12(%0, %%"REG_d"), %%mm4 \n\t"
            "movq 12(%1, %%"REG_d"), %%mm1 \n\t"
            "movq 18(%0, %%"REG_d"), %%mm2 \n\t"
            "movq 18(%1, %%"REG_d"), %%mm3 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "movq %%mm4, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlq $24, %%mm4 \n\t"
            "psrlq $24, %%mm2 \n\t"
            PAVGB" %%mm1, %%mm4 \n\t"
            PAVGB" %%mm3, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
#else
            "movd 12(%0, %%"REG_d"), %%mm4 \n\t"
            "movd 12(%1, %%"REG_d"), %%mm1 \n\t"
            "movd 15(%0, %%"REG_d"), %%mm2 \n\t"
            "movd 15(%1, %%"REG_d"), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "paddw %%mm1, %%mm4 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm2, %%mm4 \n\t"
            "movd 18(%0, %%"REG_d"), %%mm5 \n\t"
            "movd 18(%1, %%"REG_d"), %%mm1 \n\t"
            "movd 21(%0, %%"REG_d"), %%mm2 \n\t"
            "movd 21(%1, %%"REG_d"), %%mm3 \n\t"
            "punpcklbw %%mm7, %%mm5 \n\t"
            "punpcklbw %%mm7, %%mm1 \n\t"
            "punpcklbw %%mm7, %%mm2 \n\t"
            "punpcklbw %%mm7, %%mm3 \n\t"
            "paddw %%mm1, %%mm5 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm5, %%mm2 \n\t"
            "movq "MANGLE(ff_w1111)", %%mm5 \n\t"
            "psrlw $2, %%mm4 \n\t"
            "psrlw $2, %%mm2 \n\t"
#endif
            "movq "BGR2V_IDX"(%5), %%mm1 \n\t"
            "movq "BGR2V_IDX"(%5), %%mm3 \n\t"

            "pmaddwd %%mm4, %%mm1 \n\t"
            "pmaddwd %%mm2, %%mm3 \n\t"
            "pmaddwd %%mm6, %%mm4 \n\t"
            "pmaddwd %%mm6, %%mm2 \n\t"
            "psrad $8, %%mm4 \n\t"
            "psrad $8, %%mm1 \n\t"
            "psrad $8, %%mm2 \n\t"
            "psrad $8, %%mm3 \n\t"
            "packssdw %%mm2, %%mm4 \n\t"
            "packssdw %%mm3, %%mm1 \n\t"
            "pmaddwd %%mm5, %%mm4 \n\t"
            "pmaddwd %%mm5, %%mm1 \n\t"
            "add $24, %%"REG_d" \n\t"
            "packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
            "psraw $7, %%mm4 \n\t"

            "movq %%mm0, %%mm1 \n\t"
            "punpckldq %%mm4, %%mm0 \n\t"
            "punpckhdq %%mm4, %%mm1 \n\t"
            "packsswb %%mm1, %%mm0 \n\t"
            "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
            "movd %%mm0, (%2, %%"REG_a") \n\t"
            "punpckhdq %%mm0, %%mm0 \n\t"
            "movd %%mm0, (%3, %%"REG_a") \n\t"
            "add $4, %%"REG_a" \n\t"
            " js 1b \n\t"
            : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth), "r"(rgb2yuv)
              NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2UVOffset)
            : "%"REG_a, "%"REG_d
        );

        udst += chromStride;
        vdst += chromStride;
        src  += srcStride*2;
    }

    __asm__ volatile(EMMS" \n\t"
                     SFENCE" \n\t"
                     :::"memory");

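    /* Let the plain C implementation finish whatever rows (height - y)
     * the two-row SIMD loop above did not cover. */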
    ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride, rgb2yuv);
}
#endif /* HAVE_7REGS */
#endif /* !COMPILE_TEMPLATE_SSE2 */

#if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX
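/*
 * Interleave the bytes of src1 and src2 into dest, row by row:
 *     dest[2*w+0] = src1[w];
 *     dest[2*w+1] = src2[w];
 * (e.g. merging planar U and V into a packed NV12-style UV plane).
 * An SSE2 path handles the case where all three pointers are 16-byte
 * aligned; otherwise the MMX path is used, and the scalar loop finishes
 * the last width & 15 bytes of each row.
 */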
static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
                                    int width, int height, int src1Stride,
                                    int src2Stride, int dstStride)
{
    int h;

    for (h=0; h < height; h++) {
        int w;

        if (width >= 16) {
#if COMPILE_TEMPLATE_SSE2
            if (!((((intptr_t)src1) | ((intptr_t)src2) | ((intptr_t)dest))&15)) {
                __asm__(
                    "xor %%"REG_a", %%"REG_a" \n\t"
                    "1: \n\t"
                    PREFETCH" 64(%1, %%"REG_a") \n\t"
                    PREFETCH" 64(%2, %%"REG_a") \n\t"
                    "movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
                    "movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
                    "movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
                    "punpcklbw %%xmm2, %%xmm0 \n\t"
                    "punpckhbw %%xmm2, %%xmm1 \n\t"
                    "movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
                    "movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
                    "add $16, %%"REG_a" \n\t"
                    "cmp %3, %%"REG_a" \n\t"
                    " jb 1b \n\t"
                    ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
                    : "memory", XMM_CLOBBERS("xmm0", "xmm1", "xmm2",) "%"REG_a
                );
            } else
#endif
                __asm__(
                    "xor %%"REG_a", %%"REG_a" \n\t"
                    "1: \n\t"
                    PREFETCH" 64(%1, %%"REG_a") \n\t"
                    PREFETCH" 64(%2, %%"REG_a") \n\t"
                    "movq (%1, %%"REG_a"), %%mm0 \n\t"
                    "movq 8(%1, %%"REG_a"), %%mm2 \n\t"
                    "movq %%mm0, %%mm1 \n\t"
                    "movq %%mm2, %%mm3 \n\t"
                    "movq (%2, %%"REG_a"), %%mm4 \n\t"
                    "movq 8(%2, %%"REG_a"), %%mm5 \n\t"
                    "punpcklbw %%mm4, %%mm0 \n\t"
                    "punpckhbw %%mm4, %%mm1 \n\t"
                    "punpcklbw %%mm5, %%mm2 \n\t"
                    "punpckhbw %%mm5, %%mm3 \n\t"
                    MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
                    MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
                    MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
                    MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
                    "add $16, %%"REG_a" \n\t"
                    "cmp %3, %%"REG_a" \n\t"
                    " jb 1b \n\t"
                    ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
                    : "memory", "%"REG_a
                );

        }
        for (w= (width&(~15)); w < width; w++) {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
        dest += dstStride;
        src1 += src1Stride;
        src2 += src2Stride;
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX */

#if !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL
#if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM
void RENAME(ff_nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
                         const uint8_t *unused,
                         const uint8_t *src1,
                         const uint8_t *src2,
                         int w,
                         uint32_t *unused2);
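/*
 * Split the interleaved byte stream src (U0 V0 U1 V1 ...) into the two
 * planes dst1 and dst2, one row at a time, by delegating to the external
 * assembly routine RENAME(ff_nv12ToUV) declared above.
 */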
static void RENAME(deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t *dst2,
                                      int width, int height, int srcStride,
                                      int dst1Stride, int dst2Stride)
{
    int h;

    for (h = 0; h < height; h++) {
        RENAME(ff_nv12ToUV)(dst1, dst2, NULL, src, NULL, width, NULL);
        src  += srcStride;
        dst1 += dst1Stride;
        dst2 += dst2Stride;
    }
    __asm__(
#if !COMPILE_TEMPLATE_SSE2
        EMMS" \n\t"
#endif
        SFENCE" \n\t"
        ::: "memory"
    );
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM */
#endif /* !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL */

#if !COMPILE_TEMPLATE_SSE2
#if !COMPILE_TEMPLATE_AMD3DNOW
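/*
 * Double the size of two chroma planes (e.g. YVU9 -> YV12 chroma):
 * each source byte is repeated horizontally via punpcklbw/punpckhbw and
 * each source row is emitted twice (note the srcStride*(y>>1) indexing).
 */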
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                                       uint8_t *dst1, uint8_t *dst2,
                                       int width, int height,
                                       int srcStride1, int srcStride2,
                                       int dstStride1, int dstStride2)
{
    x86_reg x, y;
    int w,h;
    w=width/2; h=height/2;
    __asm__ volatile(
        PREFETCH" %0 \n\t"
        PREFETCH" %1 \n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
    for (y=0;y<h;y++) {
        const uint8_t* s1=src1+srcStride1*(y>>1);
        uint8_t* d=dst1+dstStride1*y;
        x=0;
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH" 32(%1,%2) \n\t"
                "movq (%1,%2), %%mm0 \n\t"
                "movq 8(%1,%2), %%mm2 \n\t"
                "movq 16(%1,%2), %%mm4 \n\t"
                "movq 24(%1,%2), %%mm6 \n\t"
                "movq %%mm0, %%mm1 \n\t"
                "movq %%mm2, %%mm3 \n\t"
                "movq %%mm4, %%mm5 \n\t"
                "movq %%mm6, %%mm7 \n\t"
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ" %%mm0, (%0,%2,2) \n\t"
                MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
                MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
                MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
                MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
                MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
                MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
                MOVNTQ" %%mm7, 56(%0,%2,2)"
                :: "r"(d), "r"(s1), "r"(x)
                :"memory");
        }
        for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    }
    for (y=0;y<h;y++) {
        const uint8_t* s2=src2+srcStride2*(y>>1);
        uint8_t* d=dst2+dstStride2*y;
        x=0;
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH" 32(%1,%2) \n\t"
                "movq (%1,%2), %%mm0 \n\t"
                "movq 8(%1,%2), %%mm2 \n\t"
                "movq 16(%1,%2), %%mm4 \n\t"
                "movq 24(%1,%2), %%mm6 \n\t"
                "movq %%mm0, %%mm1 \n\t"
                "movq %%mm2, %%mm3 \n\t"
                "movq %%mm4, %%mm5 \n\t"
                "movq %%mm6, %%mm7 \n\t"
                "punpcklbw %%mm0, %%mm0 \n\t"
                "punpckhbw %%mm1, %%mm1 \n\t"
                "punpcklbw %%mm2, %%mm2 \n\t"
                "punpckhbw %%mm3, %%mm3 \n\t"
                "punpcklbw %%mm4, %%mm4 \n\t"
                "punpckhbw %%mm5, %%mm5 \n\t"
                "punpcklbw %%mm6, %%mm6 \n\t"
                "punpckhbw %%mm7, %%mm7 \n\t"
                MOVNTQ" %%mm0, (%0,%2,2) \n\t"
                MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
                MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
                MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
                MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
                MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
                MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
                MOVNTQ" %%mm7, 56(%0,%2,2)"
                :: "r"(d), "r"(s2), "r"(x)
                :"memory");
        }
        for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}

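/*
 * Pack planar YVU9 (one chroma sample per 4x4 luma block) into packed
 * YUY2: each U/V byte is reused for two YUYV pairs per row (the
 * punpcklbw doubling) and for four consecutive rows (srcStride*(y>>2)).
 */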
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                                        uint8_t *dst,
                                        int width, int height,
                                        int srcStride1, int srcStride2,
                                        int srcStride3, int dstStride)
{
    x86_reg x;
    int y,w,h;
    w=width/2; h=height;
    for (y=0;y<h;y++) {
        const uint8_t* yp=src1+srcStride1*y;
        const uint8_t* up=src2+srcStride2*(y>>2);
        const uint8_t* vp=src3+srcStride3*(y>>2);
        uint8_t* d=dst+dstStride*y;
        x=0;
        for (;x<w-7;x+=8) {
            __asm__ volatile(
                PREFETCH" 32(%1, %0) \n\t"
                PREFETCH" 32(%2, %0) \n\t"
                PREFETCH" 32(%3, %0) \n\t"
                "movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
                "movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
                "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
                "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
                "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
                "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */

                "movq %%mm1, %%mm6 \n\t"
                "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
                "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
                "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
                MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"

                "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
                "movq 8(%1, %0, 4), %%mm0 \n\t"
                "movq %%mm0, %%mm3 \n\t"
                "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
                "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
                MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"

                "movq %%mm4, %%mm6 \n\t"
                "movq 16(%1, %0, 4), %%mm0 \n\t"
                "movq %%mm0, %%mm3 \n\t"
                "punpcklbw %%mm5, %%mm4 \n\t"
                "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
                "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
                MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"

                "punpckhbw %%mm5, %%mm6 \n\t"
                "movq 24(%1, %0, 4), %%mm0 \n\t"
                "movq %%mm0, %%mm3 \n\t"
                "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
                "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
                MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
                MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"

                : "+r" (x)
                : "r"(yp), "r" (up), "r"(vp), "r"(d)
                :"memory");
        }
        for (; x<w; x++) {
            const int x2 = x<<2;
            d[8*x+0] = yp[x2];
            d[8*x+1] = up[x];
            d[8*x+2] = yp[x2+1];
            d[8*x+3] = vp[x];
            d[8*x+4] = yp[x2+2];
            d[8*x+5] = up[x];
            d[8*x+6] = yp[x2+3];
            d[8*x+7] = vp[x];
        }
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

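/*
 * Copy every second byte, starting at offset 0, from src to dst -- i.e.
 * dst[i] = src[2*i] -- extracting, for example, the luma plane of YUYV.
 * The pointers are advanced past the end so a single negative counter
 * serves as both index and loop condition; the MMX loop masks with
 * 0x00FF words and packs four source quadwords into two per iteration.
 */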
static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
{
    dst += count;
    src += 2*count;
    count= - count;

    if(count <= -16) {
        count += 15;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t"
            "1: \n\t"
            "movq -30(%1, %0, 2), %%mm0 \n\t"
            "movq -22(%1, %0, 2), %%mm1 \n\t"
            "movq -14(%1, %0, 2), %%mm2 \n\t"
            "movq -6(%1, %0, 2), %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0,-15(%2, %0) \n\t"
            MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
            "add $16, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst)
        );
        count -= 15;
    }
    while(count<0) {
        dst[count]= src[2*count];
        count++;
    }
}

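/*
 * As extract_even, but for the odd bytes: dst[i] = src[2*i+1]
 * (e.g. the luma bytes of UYVY).  The initial src++ shifts the stream
 * so the even-byte machinery picks the odd positions.
 */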
static void RENAME(extract_odd)(const uint8_t *src, uint8_t *dst, x86_reg count)
{
    src ++;
    dst += count;
    src += 2*count;
    count= - count;

    if(count < -16) {
        count += 16;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t"
            "1: \n\t"
            "movq -32(%1, %0, 2), %%mm0 \n\t"
            "movq -24(%1, %0, 2), %%mm1 \n\t"
            "movq -16(%1, %0, 2), %%mm2 \n\t"
            "movq -8(%1, %0, 2), %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            MOVNTQ" %%mm0,-16(%2, %0) \n\t"
            MOVNTQ" %%mm2,- 8(%2, %0) \n\t"
            "add $16, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst)
        );
        count -= 16;
    }
    while(count<0) {
        dst[count]= src[2*count];
        count++;
    }
}

#if !COMPILE_TEMPLATE_AMD3DNOW
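/*
 * De-interleave bytes 0 and 2 of each 4-byte group into two planes:
 *     dst0[i] = src[4*i+0];
 *     dst1[i] = src[4*i+2];
 * e.g. extracting the U and V planes from packed UYVY.
 */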
static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+= count;
    dst1+= count;
    src += 4*count;
    count= - count;
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t"
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

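/*
 * As extract_even2, but averaging two source rows first (PAVGB in the
 * SIMD path, (a+b)>>1 in the scalar tail); this produces the vertically
 * downsampled chroma for the packed -> 4:2:0 conversions below.
 */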
static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 += count;
    dst1 += count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t"
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
            PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
            "pand %%mm7, %%mm0 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm2 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}

#if !COMPILE_TEMPLATE_AMD3DNOW
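/*
 * De-interleave bytes 1 and 3 of each 4-byte group:
 *     dst0[i] = src[4*i+1];
 *     dst1[i] = src[4*i+3];
 * e.g. extracting U and V from packed YUYV.  The asm selects the odd
 * bytes with psrlw $8; the scalar tail relies on the src++ before it.
 */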
static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0+= count;
    dst1+= count;
    src += 4*count;
    count= - count;
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t"
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm1 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "psrlw $8, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
    src++;
    while(count<0) {
        dst0[count]= src[4*count+0];
        dst1[count]= src[4*count+2];
        count++;
    }
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

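/*
 * As extract_odd2, but averaging the rows src0 and src1 first; used for
 * the chroma downsampling in yuyvtoyuv420.
 */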
static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
    dst0 += count;
    dst1 += count;
    src0 += 4*count;
    src1 += 4*count;
    count= - count;
#ifdef PAVGB
    if(count <= -8) {
        count += 7;
        __asm__ volatile(
            "pcmpeqw %%mm7, %%mm7 \n\t"
            "psrlw $8, %%mm7 \n\t"
            "1: \n\t"
            "movq -28(%1, %0, 4), %%mm0 \n\t"
            "movq -20(%1, %0, 4), %%mm1 \n\t"
            "movq -12(%1, %0, 4), %%mm2 \n\t"
            "movq -4(%1, %0, 4), %%mm3 \n\t"
            PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
            PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
            PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
            PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm1 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "psrlw $8, %%mm3 \n\t"
            "packuswb %%mm1, %%mm0 \n\t"
            "packuswb %%mm3, %%mm2 \n\t"
            "movq %%mm0, %%mm1 \n\t"
            "movq %%mm2, %%mm3 \n\t"
            "psrlw $8, %%mm0 \n\t"
            "psrlw $8, %%mm2 \n\t"
            "pand %%mm7, %%mm1 \n\t"
            "pand %%mm7, %%mm3 \n\t"
            "packuswb %%mm2, %%mm0 \n\t"
            "packuswb %%mm3, %%mm1 \n\t"
            MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
            MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
            "add $8, %0 \n\t"
            " js 1b \n\t"
            : "+r"(count)
            : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
        );
        count -= 7;
    }
#endif
    src0++;
    src1++;
    while(count<0) {
        dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
        dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
        count++;
    }
}

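/*
 * Packed YUYV 4:2:2 -> planar 4:2:0 (YV12 layout): luma is extracted on
 * every line, chroma only on odd lines, averaged with the line above.
 */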
static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                 int width, int height,
                                 int lumStride, int chromStride, int srcStride)
{
    int y;
    const int chromWidth = AV_CEIL_RSHIFT(width, 1);

    for (y=0; y<height; y++) {
        RENAME(extract_even)(src, ydst, width);
        if(y&1) {
            RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
            udst+= chromStride;
            vdst+= chromStride;
        }

        src += srcStride;
        ydst+= lumStride;
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}

#if !COMPILE_TEMPLATE_AMD3DNOW
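/*
 * Packed YUYV 4:2:2 -> planar 4:2:2: chroma is kept on every line, so
 * no vertical averaging is needed.
 */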
static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                 int width, int height,
                                 int lumStride, int chromStride, int srcStride)
{
    int y;
    const int chromWidth = AV_CEIL_RSHIFT(width, 1);

    for (y=0; y<height; y++) {
        RENAME(extract_even)(src, ydst, width);
        RENAME(extract_odd2)(src, udst, vdst, chromWidth);

        src += srcStride;
        ydst+= lumStride;
        udst+= chromStride;
        vdst+= chromStride;
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

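/*
 * UYVY variant of yuyvtoyuv420: luma sits at the odd byte positions, so
 * extract_odd and extract_even2avg take over the roles of their YUYV
 * counterparts.
 */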
static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                 int width, int height,
                                 int lumStride, int chromStride, int srcStride)
{
    int y;
    const int chromWidth = AV_CEIL_RSHIFT(width, 1);

    for (y=0; y<height; y++) {
        RENAME(extract_odd)(src, ydst, width);
        if(y&1) {
            RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
            udst+= chromStride;
            vdst+= chromStride;
        }

        src += srcStride;
        ydst+= lumStride;
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}

#if !COMPILE_TEMPLATE_AMD3DNOW
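/*
 * UYVY -> planar 4:2:2: per-line luma via extract_odd, per-line chroma
 * via extract_even2.
 */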
static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
                                 int width, int height,
                                 int lumStride, int chromStride, int srcStride)
{
    int y;
    const int chromWidth = AV_CEIL_RSHIFT(width, 1);

    for (y=0; y<height; y++) {
        RENAME(extract_odd)(src, ydst, width);
        RENAME(extract_even2)(src, udst, vdst, chromWidth);

        src += srcStride;
        ydst+= lumStride;
        udst+= chromStride;
        vdst+= chromStride;
    }
    __asm__(
        EMMS" \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
#endif /* !COMPILE_TEMPLATE_SSE2 */

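/*
 * Install the function pointers for this template instantiation; which
 * symbols get assigned mirrors the COMPILE_TEMPLATE_* guards under which
 * the functions above were compiled.
 */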
static av_cold void RENAME(rgb2rgb_init)(void)
{
#if !COMPILE_TEMPLATE_SSE2
#if !COMPILE_TEMPLATE_AMD3DNOW
    rgb15to16          = RENAME(rgb15to16);
    rgb15tobgr24       = RENAME(rgb15tobgr24);
    rgb15to32          = RENAME(rgb15to32);
    rgb16tobgr24       = RENAME(rgb16tobgr24);
    rgb16to32          = RENAME(rgb16to32);
    rgb16to15          = RENAME(rgb16to15);
    rgb24tobgr16       = RENAME(rgb24tobgr16);
    rgb24tobgr15       = RENAME(rgb24tobgr15);
    rgb24tobgr32       = RENAME(rgb24tobgr32);
    rgb32to16          = RENAME(rgb32to16);
    rgb32to15          = RENAME(rgb32to15);
    rgb32tobgr24       = RENAME(rgb32tobgr24);
    rgb24to15          = RENAME(rgb24to15);
    rgb24to16          = RENAME(rgb24to16);
    rgb24tobgr24       = RENAME(rgb24tobgr24);
    shuffle_bytes_2103 = RENAME(shuffle_bytes_2103);
    rgb32tobgr16       = RENAME(rgb32tobgr16);
    rgb32tobgr15       = RENAME(rgb32tobgr15);
    yv12toyuy2         = RENAME(yv12toyuy2);
    yv12touyvy         = RENAME(yv12touyvy);
    yuv422ptoyuy2      = RENAME(yuv422ptoyuy2);
    yuv422ptouyvy      = RENAME(yuv422ptouyvy);
    yuy2toyv12         = RENAME(yuy2toyv12);
    vu9_to_vu12        = RENAME(vu9_to_vu12);
    yvu9_to_yuy2       = RENAME(yvu9_to_yuy2);
    uyvytoyuv422       = RENAME(uyvytoyuv422);
    yuyvtoyuv422       = RENAME(yuyvtoyuv422);
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
    planar2x           = RENAME(planar2x);
#endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */
#if HAVE_7REGS
    ff_rgb24toyv12     = RENAME(rgb24toyv12);
#endif /* HAVE_7REGS */

    yuyvtoyuv420       = RENAME(yuyvtoyuv420);
    uyvytoyuv420       = RENAME(uyvytoyuv420);
#endif /* !COMPILE_TEMPLATE_SSE2 */

#if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX
    interleaveBytes    = RENAME(interleaveBytes);
#endif /* !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX */
#if !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL
#if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM
    deinterleaveBytes  = RENAME(deinterleaveBytes);
#endif
#endif
}