/*
 * Copyright (C) 2001-2003 Michael Niedermayer <[email protected]>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "../swscale_internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"

#define RET 0xC3 // near return opcode for x86
#define PREFETCH "prefetchnta"

#if HAVE_INLINE_ASM
av_cold int ff_init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode,
                                   int16_t *filter, int32_t *filterPos,
                                   int numSplits)
{
    uint8_t *fragmentA;
    x86_reg imm8OfPShufW1A;
    x86_reg imm8OfPShufW2A;
    x86_reg fragmentLengthA;
    uint8_t *fragmentB;
    x86_reg imm8OfPShufW1B;
    x86_reg imm8OfPShufW2B;
    x86_reg fragmentLengthB;
    int fragmentPos;

    int xpos, i;

    // create an optimized horizontal scaling routine
    /* This scaler is made of runtime-generated MMXEXT code using specially tuned
     * pshufw instructions. For every four output pixels, if four input pixels
     * are enough for the fast bilinear scaling, then a chunk of fragmentB is
     * used. If five input pixels are needed, then a chunk of fragmentA is used.
     */

    // code fragment
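    /* Neither asm block below is executed here: the leading "jmp 9f" skips
     * straight over the fragment body. The blocks exist only so the code
     * between the local labels can be located at run time; the output operands
     * receive the fragment's start address, the offsets of the two pshufw
     * immediate bytes relative to that start (so they can be patched per
     * output group), and the fragment's length in bytes. This first fragment
     * (fragmentA) covers groups that need five input pixels. */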
    __asm__ volatile (
        "jmp 9f \n\t"
        // Begin
        "0: \n\t"
        "movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t"
        "movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t"
        "movd 1(%%"REG_c", %%"REG_S"), %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm1 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "pshufw $0xFF, %%mm1, %%mm1 \n\t"
        "1: \n\t"
        "pshufw $0xFF, %%mm0, %%mm0 \n\t"
        "2: \n\t"
        "psubw %%mm1, %%mm0 \n\t"
        "movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t"
        "pmullw %%mm3, %%mm0 \n\t"
        "psllw $7, %%mm1 \n\t"
        "paddw %%mm1, %%mm0 \n\t"

        "movq %%mm0, (%%"REG_D", %%"REG_a") \n\t"

        "add $8, %%"REG_a" \n\t"
        // End
        "9: \n\t"
        "lea " LOCAL_MANGLE(0b) ", %0 \n\t"
        "lea " LOCAL_MANGLE(1b) ", %1 \n\t"
        "lea " LOCAL_MANGLE(2b) ", %2 \n\t"
        "dec %1 \n\t"
        "dec %2 \n\t"
        "sub %0, %1 \n\t"
        "sub %0, %2 \n\t"
        "lea " LOCAL_MANGLE(9b) ", %3 \n\t"
        "sub %0, %3 \n\t"


        : "=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
          "=r" (fragmentLengthA)
    );
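    /* fragmentB is the variant for groups where four consecutive source bytes
     * cover all four output pixels: it loads a single dword and derives the
     * "next pixel" samples by shuffling the same register instead of doing a
     * second, byte-offset load. */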

    __asm__ volatile (
        "jmp 9f \n\t"
        // Begin
        "0: \n\t"
        "movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t"
        "movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t"
        "punpcklbw %%mm7, %%mm0 \n\t"
        "pshufw $0xFF, %%mm0, %%mm1 \n\t"
        "1: \n\t"
        "pshufw $0xFF, %%mm0, %%mm0 \n\t"
        "2: \n\t"
        "psubw %%mm1, %%mm0 \n\t"
        "movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t"
        "pmullw %%mm3, %%mm0 \n\t"
        "psllw $7, %%mm1 \n\t"
        "paddw %%mm1, %%mm0 \n\t"

        "movq %%mm0, (%%"REG_D", %%"REG_a") \n\t"

        "add $8, %%"REG_a" \n\t"
        // End
        "9: \n\t"
        "lea " LOCAL_MANGLE(0b) ", %0 \n\t"
        "lea " LOCAL_MANGLE(1b) ", %1 \n\t"
        "lea " LOCAL_MANGLE(2b) ", %2 \n\t"
        "dec %1 \n\t"
        "dec %2 \n\t"
        "sub %0, %1 \n\t"
        "sub %0, %2 \n\t"
        "lea " LOCAL_MANGLE(9b) ", %3 \n\t"
        "sub %0, %3 \n\t"


        : "=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
          "=r" (fragmentLengthB)
    );
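    /* Generate the scaling routine: for every group of four output pixels,
     * copy the appropriate fragment into filterCode, patch its two pshufw
     * immediates with the per-pixel source offsets a..d, and store the 7-bit
     * bilinear weights in filter[] and the integer source position in
     * filterPos[] (indexed i/2, so its byte offset advances in step with the
     * 8 bytes of filter[] written per group). A RET opcode is appended after
     * each fragment; the final one terminates the generated routine so it can
     * be invoked with a plain "call". */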

    xpos        = 0; // lumXInc/2 - 0x8000; // difference between pixel centers
    fragmentPos = 0;

    for (i = 0; i < dstW / numSplits; i++) {
        int xx = xpos >> 16;

        if ((i & 3) == 0) {
            int a                  = 0;
            int b                  = ((xpos + xInc) >> 16) - xx;
            int c                  = ((xpos + xInc * 2) >> 16) - xx;
            int d                  = ((xpos + xInc * 3) >> 16) - xx;
            int inc                = (d + 1 < 4);
            uint8_t *fragment      = inc ? fragmentB : fragmentA;
            x86_reg imm8OfPShufW1  = inc ? imm8OfPShufW1B : imm8OfPShufW1A;
            x86_reg imm8OfPShufW2  = inc ? imm8OfPShufW2B : imm8OfPShufW2A;
            x86_reg fragmentLength = inc ? fragmentLengthB : fragmentLengthA;
            int maxShift           = 3 - (d + inc);
            int shift              = 0;

            if (filterCode) {
                filter[i]        = ((xpos & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 1]    = (((xpos + xInc) & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 2]    = (((xpos + xInc * 2) & 0xFFFF) ^ 0xFFFF) >> 9;
                filter[i + 3]    = (((xpos + xInc * 3) & 0xFFFF) ^ 0xFFFF) >> 9;
                filterPos[i / 2] = xx;

                memcpy(filterCode + fragmentPos, fragment, fragmentLength);

                filterCode[fragmentPos + imm8OfPShufW1] = (a + inc) |
                                                          ((b + inc) << 2) |
                                                          ((c + inc) << 4) |
                                                          ((d + inc) << 6);
                filterCode[fragmentPos + imm8OfPShufW2] = a | (b << 2) |
                                                          (c << 4) |
                                                          (d << 6);

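                /* Shift the source reads back by up to 3 bytes, either to
                 * keep the final loads from running past the end of the row
                 * or to make them 4-byte aligned; the pshufw immediates are
                 * bumped by 0x55 * shift (each 2-bit index + shift) to
                 * compensate for the earlier start position. */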
                if (i + 4 - inc >= dstW)
                    shift = maxShift; // avoid overread
                else if ((filterPos[i / 2] & 3) <= maxShift)
                    shift = filterPos[i / 2] & 3; // align

                if (shift && i >= shift) {
                    filterCode[fragmentPos + imm8OfPShufW1] += 0x55 * shift;
                    filterCode[fragmentPos + imm8OfPShufW2] += 0x55 * shift;
                    filterPos[i / 2]                        -= shift;
                }
            }

            fragmentPos += fragmentLength;

            if (filterCode)
                filterCode[fragmentPos] = RET;
        }
        xpos += xInc;
    }
    if (filterCode)
        filterPos[((i / 2) + 1) & (~1)] = xpos >> 16; // needed to jump to the next part

    return fragmentPos + 1;
}
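
/* The function supports a two-pass calling pattern: with filterCode == NULL
 * every store is skipped and only fragmentPos is accumulated, so the return
 * value gives the number of bytes of executable memory the caller must
 * provide; a second call with the allocated buffer then emits the code and
 * fills the filter tables. A rough sketch (buffer allocation is the caller's
 * problem, the names used here are illustrative only):
 *
 *     int size = ff_init_hscaler_mmxext(dstW, xInc, NULL, NULL, NULL, 8);
 *     // obtain `size` bytes of writable + executable memory in `code`
 *     ff_init_hscaler_mmxext(dstW, xInc, code, filter, filterPos, 8);
 */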

void ff_hyscale_fast_mmxext(SwsContext *c, int16_t *dst,
                            int dstWidth, const uint8_t *src,
                            int srcW, int xInc)
{
    int32_t *filterPos = c->hLumFilterPos;
    int16_t *filter = c->hLumFilter;
    void *mmxextFilterCode = c->lumMmxextFilterCode;
    int i;
#if ARCH_X86_64
    uint64_t retsave;
#else
#if defined(PIC)
    uint64_t ebxsave;
#endif
#endif

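    /* The asm below first saves the -8(%rsp) slot on x86-64, because the
     * "call" into the generated code pushes its return address there and
     * would otherwise clobber whatever the compiler keeps in the red zone; on
     * 32-bit PIC builds, ebx (the GOT pointer) is saved by hand instead, since
     * it may not be listed as clobbered. It then sets up the registers the
     * generated fragments expect: REG_c = src, REG_D = dst, REG_d = filter,
     * REG_b = filterPos, REG_a = 0 (output byte offset) and mm7 = 0. */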
    __asm__ volatile(
#if ARCH_X86_64
        "mov -8(%%rsp), %%"REG_a" \n\t"
        "mov %%"REG_a", %5 \n\t" // retsave
#else
#if defined(PIC)
        "mov %%"REG_b", %5 \n\t" // ebxsave
#endif
#endif
        "pxor %%mm7, %%mm7 \n\t"
        "mov %0, %%"REG_c" \n\t"
        "mov %1, %%"REG_D" \n\t"
        "mov %2, %%"REG_d" \n\t"
        "mov %3, %%"REG_b" \n\t"
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        PREFETCH" (%%"REG_c") \n\t"
        PREFETCH" 32(%%"REG_c") \n\t"
        PREFETCH" 64(%%"REG_c") \n\t"

#if ARCH_X86_64
#define CALL_MMXEXT_FILTER_CODE \
        "movl (%%"REG_b"), %%esi \n\t"\
        "call *%4 \n\t"\
        "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
        "add %%"REG_S", %%"REG_c" \n\t"\
        "add %%"REG_a", %%"REG_D" \n\t"\
        "xor %%"REG_a", %%"REG_a" \n\t"\

#else
#define CALL_MMXEXT_FILTER_CODE \
        "movl (%%"REG_b"), %%esi \n\t"\
        "call *%4 \n\t"\
        "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
        "add %%"REG_a", %%"REG_D" \n\t"\
        "xor %%"REG_a", %%"REG_a" \n\t"\

#endif /* ARCH_X86_64 */
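        /* Each CALL_MMXEXT_FILTER_CODE runs the generated routine once: esi is
         * preloaded with the first source offset from filterPos, the call
         * scales one split (dstW/numSplits pixels), and afterwards the source
         * and destination pointers are advanced (using the boundary filterPos
         * entry and the byte count accumulated in REG_a) before REG_a is reset
         * for the next split. */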

        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE

#if ARCH_X86_64
        "mov %5, %%"REG_a" \n\t"
        "mov %%"REG_a", -8(%%rsp) \n\t"
#else
#if defined(PIC)
        "mov %5, %%"REG_b" \n\t"
#endif
#endif
        :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
           "m" (mmxextFilterCode)
#if ARCH_X86_64
        ,"m"(retsave)
#else
#if defined(PIC)
        ,"m" (ebxsave)
#endif
#endif
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if ARCH_X86_64 || !defined(PIC)
        ,"%"REG_b
#endif
    );

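    /* Redo the right-edge output pixels in C: positions at or past the last
     * input pixel would need a sample beyond the row for bilinear
     * interpolation, so the last source pixel is replicated instead, scaled
     * to the 7-bit coefficient range used above. */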
    for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
        dst[i] = src[srcW-1]*128;
}

void ff_hcscale_fast_mmxext(SwsContext *c, int16_t *dst1, int16_t *dst2,
                            int dstWidth, const uint8_t *src1,
                            const uint8_t *src2, int srcW, int xInc)
{
    int32_t *filterPos = c->hChrFilterPos;
    int16_t *filter = c->hChrFilter;
    void *mmxextFilterCode = c->chrMmxextFilterCode;
    int i;
#if ARCH_X86_64
    DECLARE_ALIGNED(8, uint64_t, retsave);
#else
#if defined(PIC)
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
#endif
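    /* Same driver as the luma version, but the generated chroma code is run
     * twice, once per chroma plane, with four splits each; the source and
     * destination pointers are switched to src2/dst2 half-way through. */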
    __asm__ volatile(
#if ARCH_X86_64
        "mov -8(%%rsp), %%"REG_a" \n\t"
        "mov %%"REG_a", %7 \n\t" // retsave
#else
#if defined(PIC)
        "mov %%"REG_b", %7 \n\t" // ebxsave
#endif
#endif
        "pxor %%mm7, %%mm7 \n\t"
        "mov %0, %%"REG_c" \n\t"
        "mov %1, %%"REG_D" \n\t"
        "mov %2, %%"REG_d" \n\t"
        "mov %3, %%"REG_b" \n\t"
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        PREFETCH" (%%"REG_c") \n\t"
        PREFETCH" 32(%%"REG_c") \n\t"
        PREFETCH" 64(%%"REG_c") \n\t"

        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        "mov %5, %%"REG_c" \n\t" // src2
        "mov %6, %%"REG_D" \n\t" // dst2
        PREFETCH" (%%"REG_c") \n\t"
        PREFETCH" 32(%%"REG_c") \n\t"
        PREFETCH" 64(%%"REG_c") \n\t"

        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE

#if ARCH_X86_64
        "mov %7, %%"REG_a" \n\t"
        "mov %%"REG_a", -8(%%rsp) \n\t"
#else
#if defined(PIC)
        "mov %7, %%"REG_b" \n\t"
#endif
#endif
        :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
           "m" (mmxextFilterCode), "m" (src2), "m"(dst2)
#if ARCH_X86_64
        ,"m"(retsave)
#else
#if defined(PIC)
        ,"m" (ebxsave)
#endif
#endif
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if ARCH_X86_64 || !defined(PIC)
        ,"%"REG_b
#endif
    );

    for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
        dst1[i] = src1[srcW-1]*128;
        dst2[i] = src2[srcW-1]*128;
    }
}
#endif //HAVE_INLINE_ASM