;*****************************************************************************
;* pixel-32.asm: x86_32 pixel metrics
;*****************************************************************************
;* Copyright (C) 2003-2016 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Laurent Aimar <fenrir@via.ecp.fr>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

%include "x86inc.asm"
%include "x86util.asm"

cextern pw_ppmmppmm
cextern pw_pmpmpmpm

SECTION .text
INIT_MMX mmx2

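; Load eight rows of four pixel differences (pix1 - pix2, widened to signed
; words) into m0..m7, starting %1 bytes into each row. LOAD_DIFF comes from
; x86util.asm; "none" selects its variant that needs no zero register. m5 is
; spilled around the last row because all eight mm registers are in use.
; r4/r5 are assumed to hold 3*r1 and 3*r3, set up by the caller.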
%macro LOAD_DIFF_4x8P 1 ; dx
    LOAD_DIFF  m0, m7, none, [r0+%1],      [r2+%1]
    LOAD_DIFF  m1, m6, none, [r0+%1+r1],   [r2+%1+r3]
    LOAD_DIFF  m2, m7, none, [r0+%1+r1*2], [r2+%1+r3*2]
    LOAD_DIFF  m3, m6, none, [r0+%1+r4],   [r2+%1+r5]
    lea        r0, [r0+4*r1]
    lea        r2, [r2+4*r3]
    LOAD_DIFF  m4, m7, none, [r0+%1],      [r2+%1]
    LOAD_DIFF  m5, m6, none, [r0+%1+r1],   [r2+%1+r3]
    LOAD_DIFF  m6, m7, none, [r0+%1+r1*2], [r2+%1+r3*2]
    movq       [spill], m5
    LOAD_DIFF  m7, m5, none, [r0+%1+r4],   [r2+%1+r5]
    movq       m5, [spill]
%endmacro

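; Reduce m0..m7 to a single register of packed absolute-value sums in m0.
; m6/m7 are spilled first so they can serve as scratch for ABSW2 (also from
; x86util.asm), which takes two absolute values per invocation.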
%macro SUM4x8_MM 0
    movq  [spill],   m6
    movq  [spill+8], m7
    ABSW2 m0, m1, m0, m1, m6, m7
    ABSW2 m2, m3, m2, m3, m6, m7
    paddw m0, m2
    paddw m1, m3
    movq  m6, [spill]
    movq  m7, [spill+8]
    ABSW2 m4, m5, m4, m5, m2, m3
    ABSW2 m6, m7, m6, m7, m2, m3
    paddw m4, m6
    paddw m5, m7
    paddw m0, m4
    paddw m1, m5
    paddw m0, m1
%endmacro

;-----------------------------------------------------------------------------
; int pixel_sa8d_8x8( uint8_t *, intptr_t, uint8_t *, intptr_t )
;-----------------------------------------------------------------------------
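; sa8d is the sum of absolute values of the 2D 8x8 Hadamard transform of the
; difference block; the remaining normalization is left to the caller. Since
; an mm register holds only four words, the block is processed as two 4x8
; halves: transform vertically (HADAMARD8_V, from x86util.asm), transpose
; 4x4 quadrants through the stack, transform again, then sum absolute
; values; the final pavgw folds the two halves' totals together.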
cglobal pixel_sa8d_8x8_internal
    push   r0
    push   r2
    sub    esp, 0x74
%define args  esp+0x74
%define spill esp+0x60 ; +16
%define trans esp+0    ; +96
    LOAD_DIFF_4x8P 0
    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7

    movq   [spill], m1
    TRANSPOSE4x4W 4, 5, 6, 7, 1
    movq   [trans+0x00], m4
    movq   [trans+0x08], m5
    movq   [trans+0x10], m6
    movq   [trans+0x18], m7
    movq   m1, [spill]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    movq   [trans+0x20], m0
    movq   [trans+0x28], m1
    movq   [trans+0x30], m2
    movq   [trans+0x38], m3

    mov    r0, [args+4]
    mov    r2, [args]
    LOAD_DIFF_4x8P 4
    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7

    movq   [spill], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    movq   [trans+0x40], m0
    movq   [trans+0x48], m1
    movq   [trans+0x50], m2
    movq   [trans+0x58], m3
    movq   m7, [spill]
    TRANSPOSE4x4W 4, 5, 6, 7, 1
    movq   m0, [trans+0x00]
    movq   m1, [trans+0x08]
    movq   m2, [trans+0x10]
    movq   m3, [trans+0x18]

    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
    SUM4x8_MM
    movq   [trans], m0

    movq   m0, [trans+0x20]
    movq   m1, [trans+0x28]
    movq   m2, [trans+0x30]
    movq   m3, [trans+0x38]
    movq   m4, [trans+0x40]
    movq   m5, [trans+0x48]
    movq   m6, [trans+0x50]
    movq   m7, [trans+0x58]

    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7
    SUM4x8_MM

    pavgw  m0, [trans]
    add    esp, 0x7c
    ret
%undef args
%undef spill
%undef trans

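; Horizontally reduce three registers of packed word sums (%1-%3) down to
; scalar values in their low dwords: fold high/low halves with saturating
; adds, widen the low words to dwords, then combine the last pair with the
; operator in %8 (e.g. paddd for a plain sum, pavgw for a rounded average).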
%macro SUM_MM_X3 8 ; 3x sum, 4x tmp, op
    pxor      %7, %7
    pshufw    %4, %1, q1032
    pshufw    %5, %2, q1032
    pshufw    %6, %3, q1032
    paddusw   %1, %4
    paddusw   %2, %5
    paddusw   %3, %6
    punpcklwd %1, %7
    punpcklwd %2, %7
    punpcklwd %3, %7
    pshufw    %4, %1, q1032
    pshufw    %5, %2, q1032
    pshufw    %6, %3, q1032
    %8        %1, %4
    %8        %2, %5
    %8        %3, %6
%endmacro

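; Load eight rows of four fenc pixels (fixed FENC_STRIDE) zero-extended to
; words into m0..m7. Row 7 is loaded first and parked in [spill] so that m7
; can hold the zero constant until the final unpack.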
%macro LOAD_4x8P 1 ; dx
    pxor      m7, m7
    movd      m6, [r0+%1+7*FENC_STRIDE]
    movd      m0, [r0+%1+0*FENC_STRIDE]
    movd      m1, [r0+%1+1*FENC_STRIDE]
    movd      m2, [r0+%1+2*FENC_STRIDE]
    movd      m3, [r0+%1+3*FENC_STRIDE]
    movd      m4, [r0+%1+4*FENC_STRIDE]
    movd      m5, [r0+%1+5*FENC_STRIDE]
    punpcklbw m6, m7
    punpcklbw m0, m7
    punpcklbw m1, m7
    movq      [spill], m6
    punpcklbw m2, m7
    punpcklbw m3, m7
    movd      m6, [r0+%1+6*FENC_STRIDE]
    punpcklbw m4, m7
    punpcklbw m5, m7
    punpcklbw m6, m7
    movq      m7, [spill]
%endmacro

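; One butterfly stage of a horizontal Hadamard transform on two registers:
; pshufw pairs up neighbouring words per %3, a pmullw by the ±1 pattern in
; %4 (pw_ppmmppmm / pw_pmpmpmpm) applies the signs, and paddw produces the
; packed sums and differences.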
%macro HSUMSUB2 4
    pshufw m4, %1, %3
    pshufw m5, %2, %3
    pmullw %1, %4
    pmullw m5, %4
    paddw  %1, m4
    paddw  %2, m5
%endmacro

;-----------------------------------------------------------------------------
; void intra_sa8d_x3_8x8( uint8_t *fenc, uint8_t edge[36], int *res )
;-----------------------------------------------------------------------------
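; Computes the sa8d cost of the V, H and DC 8x8 intra prediction modes in
; one pass. Because the Hadamard transform is linear and each prediction is
; constant along rows or columns, a mode's prediction only affects the first
; row (V), first column (H) or DC coefficient of the transform, so the
; transformed fenc block can be shared: the edge pixels are transformed once
; up front (the HSUMSUB2 stages), scaled by 8, and only the affected
; coefficients are adjusted per mode. [r1+7] is assumed to hold the eight
; left neighbours and [r1+16] the eight top ones, per x264's edge[36]
; layout; results land in res[] as { v, h, dc }.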
cglobal intra_sa8d_x3_8x8, 2,3
    SUB    esp, 0x94
%define edge  esp+0x70 ; +32
%define spill esp+0x60 ; +16
%define trans esp+0    ; +96
%define sum   esp+0    ; +32

    pxor      m7, m7
    movq      m0, [r1+7]
    movq      m2, [r1+16]
    movq      m1, m0
    movq      m3, m2
    punpcklbw m0, m7
    punpckhbw m1, m7
    punpcklbw m2, m7
    punpckhbw m3, m7
    movq      m6, [pw_ppmmppmm]
    HSUMSUB2  m0, m2, q1032, m6
    HSUMSUB2  m1, m3, q1032, m6
    movq      m6, [pw_pmpmpmpm]
    HSUMSUB2  m0, m2, q2301, m6
    HSUMSUB2  m1, m3, q2301, m6
    movq      m4, m0
    movq      m5, m2
    paddw     m0, m1
    paddw     m2, m3
    psubw     m4, m1
    psubw     m3, m5
    movq      [edge+0],  m0
    movq      [edge+8],  m4
    movq      [edge+16], m2
    movq      [edge+24], m3

    LOAD_4x8P 0
    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7

    movq   [spill], m0
    TRANSPOSE4x4W 4, 5, 6, 7, 0
    movq   [trans+0x00], m4
    movq   [trans+0x08], m5
    movq   [trans+0x10], m6
    movq   [trans+0x18], m7
    movq   m0, [spill]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    movq   [trans+0x20], m0
    movq   [trans+0x28], m1
    movq   [trans+0x30], m2
    movq   [trans+0x38], m3

    LOAD_4x8P 4
    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7

    movq   [spill], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    movq   [trans+0x40], m0
    movq   [trans+0x48], m1
    movq   [trans+0x50], m2
    movq   [trans+0x58], m3
    movq   m7, [spill]
    TRANSPOSE4x4W 4, 5, 6, 7, 0
    movq   m0, [trans+0x00]
    movq   m1, [trans+0x08]
    movq   m2, [trans+0x10]
    movq   m3, [trans+0x18]

    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7

    movq   [spill+0], m0
    movq   [spill+8], m1
    ABSW2  m2, m3, m2, m3, m0, m1
    ABSW2  m4, m5, m4, m5, m0, m1
    paddw  m2, m4
    paddw  m3, m5
    ABSW2  m6, m7, m6, m7, m4, m5
    movq   m0, [spill+0]
    movq   m1, [spill+8]
    paddw  m2, m6
    paddw  m3, m7
    paddw  m2, m3
    ABSW   m1, m1, m4
    paddw  m2, m1 ; 7x4 sum
    movq   m7, m0
    movq   m1, [edge+8] ; left bottom
    psllw  m1, 3
    psubw  m7, m1
    ABSW2  m0, m7, m0, m7, m5, m3
    paddw  m0, m2
    paddw  m7, m2
    movq   [sum+0], m0 ; dc
    movq   [sum+8], m7 ; left

    movq   m0, [trans+0x20]
    movq   m1, [trans+0x28]
    movq   m2, [trans+0x30]
    movq   m3, [trans+0x38]
    movq   m4, [trans+0x40]
    movq   m5, [trans+0x48]
    movq   m6, [trans+0x50]
    movq   m7, [trans+0x58]

    HADAMARD8_V 0, 1, 2, 3, 4, 5, 6, 7

    movd   [sum+0x10], m0
    movd   [sum+0x12], m1
    movd   [sum+0x14], m2
    movd   [sum+0x16], m3
    movd   [sum+0x18], m4
    movd   [sum+0x1a], m5
    movd   [sum+0x1c], m6
    movd   [sum+0x1e], m7

    movq   [spill],   m0
    movq   [spill+8], m1
    ABSW2  m2, m3, m2, m3, m0, m1
    ABSW2  m4, m5, m4, m5, m0, m1
    paddw  m2, m4
    paddw  m3, m5
    paddw  m2, m3
    movq   m0, [spill]
    movq   m1, [spill+8]
    ABSW2  m6, m7, m6, m7, m4, m5
    ABSW   m1, m1, m3
    paddw  m2, m7
    paddw  m1, m6
    paddw  m2, m1 ; 7x4 sum
    movq   m1, m0

    movq   m7, [edge+0]
    psllw  m7, 3 ; left top

    mov    r2, [edge+0]
    add    r2, [edge+16]
    lea    r2, [4*r2+32]
    and    r2, 0xffc0
    movd   m6, r2 ; dc

    psubw  m1, m7
    psubw  m0, m6
    ABSW2  m0, m1, m0, m1, m5, m6
    movq   m3, [sum+0] ; dc
    paddw  m0, m2
    paddw  m1, m2
    movq   m2, m0
    paddw  m0, m3
    paddw  m1, [sum+8] ; h
    psrlq  m2, 16
    paddw  m2, m3

    movq   m3, [edge+16] ; top left
    movq   m4, [edge+24] ; top right
    psllw  m3, 3
    psllw  m4, 3
    psubw  m3, [sum+16]
    psubw  m4, [sum+24]
    ABSW2  m3, m4, m3, m4, m5, m6
    paddw  m2, m3
    paddw  m2, m4 ; v

    SUM_MM_X3 m0, m1, m2, m3, m4, m5, m6, pavgw
    mov    r2, r2m
    pxor   m7, m7
    punpckldq m2, m1
    pavgw  m0, m7
    pavgw  m2, m7
    movd   [r2+8], m0 ; dc
    movq   [r2+0], m2 ; v, h
    ADD    esp, 0x94
    RET
%undef edge
%undef spill
%undef trans
%undef sum


;-----------------------------------------------------------------------------
; void pixel_ssim_4x4x2_core( const uint8_t *pix1, intptr_t stride1,
;                             const uint8_t *pix2, intptr_t stride2, int sums[2][4] )
;-----------------------------------------------------------------------------
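; For each of two horizontally adjacent 4x4 blocks, accumulate the four sums
; that the SSIM formula needs and store them as sums[block][4]. A rough C
; equivalent of the loop below (an illustrative sketch, not part of the
; original source):
;
;   for( int z = 0; z < 2; z++ )
;   {
;       int s1 = 0, s2 = 0, ss = 0, s12 = 0;
;       for( int y = 0; y < 4; y++ )
;           for( int x = 0; x < 4; x++ )
;           {
;               int a = pix1[z*4 + x + y*stride1];
;               int b = pix2[z*4 + x + y*stride2];
;               s1  += a;
;               s2  += b;
;               ss  += a*a + b*b;
;               s12 += a*b;
;           }
;       sums[z][0] = s1;  sums[z][1] = s2;
;       sums[z][2] = ss;  sums[z][3] = s12;
;   }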
cglobal pixel_ssim_4x4x2_core, 0,5
    mov       r1, r1m
    mov       r3, r3m
    mov       r4, 4
    pxor      m0, m0
.loop:
    mov       r0, r0m
    mov       r2, r2m
    add       r0, r4
    add       r2, r4
    pxor      m1, m1
    pxor      m2, m2
    pxor      m3, m3
    pxor      m4, m4
%rep 4
    movd      m5, [r0]
    movd      m6, [r2]
    punpcklbw m5, m0
    punpcklbw m6, m0
    paddw     m1, m5
    paddw     m2, m6
    movq      m7, m5
    pmaddwd   m5, m5
    pmaddwd   m7, m6
    pmaddwd   m6, m6
    paddd     m3, m5
    paddd     m4, m7
    paddd     m3, m6
    add       r0, r1
    add       r2, r3
%endrep
    mov       r0, r4m
    lea       r0, [r0+r4*4]
    pshufw    m5, m1, q0032
    pshufw    m6, m2, q0032
    paddusw   m1, m5
    paddusw   m2, m6
    punpcklwd m1, m2
    pshufw    m2, m1, q0032
    pshufw    m5, m3, q0032
    pshufw    m6, m4, q0032
    paddusw   m1, m2
    paddd     m3, m5
    paddd     m4, m6
    punpcklwd m1, m0
    punpckldq m3, m4
    movq      [r0+0], m1
    movq      [r0+8], m3
    sub       r4, 4
    jge .loop
    emms
    RET