Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
Download
52868 views
1
;*****************************************************************************
;* cabac-a.asm: x86 cabac
;*****************************************************************************
;* Copyright (C) 2008-2016 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*          Fiona Glaser <fiona@x264.com>
;*          Holger Lubitz <holger@lubitz.org>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************
%include "x86inc.asm"
29
%include "x86util.asm"
30
31
SECTION_RODATA
32
33
coeff_abs_level1_ctx: db 1, 2, 3, 4, 0, 0, 0, 0
34
coeff_abs_levelgt1_ctx: db 5, 5, 5, 5, 6, 7, 8, 9
35
coeff_abs_level_transition: db 1, 2, 3, 3, 4, 5, 6, 7
36
db 4, 4, 4, 4, 5, 6, 7, 7
37
38
%if ARCH_X86_64
39
%macro COEFF_LAST_TABLE 17
40
%define funccpu1 %1
41
%define funccpu2 %2
42
%define funccpu3 %3
43
%rep 14
44
%ifidn %4, 4
45
dq mangle(x264_coeff_last%4_ %+ funccpu1)
46
%elifidn %4, 64
47
dq mangle(x264_coeff_last%4_ %+ funccpu2)
48
%else
49
dq mangle(x264_coeff_last%4_ %+ funccpu3)
50
%endif
51
%rotate 1
52
%endrep
53
%endmacro
54
55
cextern coeff_last4_mmx2
56
cextern coeff_last4_mmx2_lzcnt
57
cextern coeff_last15_sse2
58
cextern coeff_last15_sse2_lzcnt
59
cextern coeff_last16_sse2
60
cextern coeff_last16_sse2_lzcnt
61
cextern coeff_last64_sse2
62
cextern coeff_last64_sse2_lzcnt
63
cextern coeff_last64_avx2_lzcnt
64
65
%ifdef PIC
66
SECTION .data
67
%endif
68
coeff_last_sse2: COEFF_LAST_TABLE mmx2, sse2, sse2, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
69
coeff_last_sse2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, sse2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
70
coeff_last_avx2_lzcnt: COEFF_LAST_TABLE mmx2_lzcnt, avx2_lzcnt, sse2_lzcnt, 16, 15, 16, 4, 15, 64, 16, 15, 16, 64, 16, 15, 16, 64
71
%endif
72
73
SECTION .text
74
75
cextern cabac_range_lps
76
cextern cabac_transition
77
cextern cabac_renorm_shift
78
cextern cabac_entropy
79
cextern cabac_size_unary
80
cextern cabac_transition_unary
81
cextern significant_coeff_flag_offset
82
cextern significant_coeff_flag_offset_8x8
83
cextern last_coeff_flag_offset
84
cextern last_coeff_flag_offset_8x8
85
cextern coeff_abs_level_m1_offset
86
cextern count_cat_m1
87
cextern cabac_encode_ue_bypass
88
89
%if ARCH_X86_64
90
%define pointer resq
91
%else
92
%define pointer resd
93
%endif
94
95
struc cb
96
.low: resd 1
97
.range: resd 1
98
.queue: resd 1
99
.bytes_outstanding: resd 1
100
.start: pointer 1
101
.p: pointer 1
102
.end: pointer 1
103
align 16, resb 1
104
.bits_encoded: resd 1
105
.state: resb 1024
106
endstruc
107
108
%macro LOAD_GLOBAL 3-5 0 ; dst, base, off1, off2, tmp
109
%ifdef PIC
110
%ifidn %4, 0
111
movzx %1, byte [%2+%3+r7-$$]
112
%else
113
lea %5, [r7+%4]
114
movzx %1, byte [%2+%3+%5-$$]
115
%endif
116
%else
117
movzx %1, byte [%2+%3+%4]
118
%endif
119
%endmacro
120
121
%macro CABAC 1
122
; t3 must be ecx, since it's used for shift.
123
%if WIN64
124
DECLARE_REG_TMP 3,1,2,0,5,6,4,4
125
%elif ARCH_X86_64
126
DECLARE_REG_TMP 0,1,2,3,4,5,6,6
127
%else
128
DECLARE_REG_TMP 0,4,2,1,3,5,6,2
129
%endif
130
131
cglobal cabac_encode_decision_%1, 1,7
132
movifnidn t1d, r1m
133
mov t5d, [r0+cb.range]
134
movzx t6d, byte [r0+cb.state+t1]
135
movifnidn t0, r0 ; WIN64
136
mov t4d, ~1
137
mov t3d, t5d
138
and t4d, t6d
139
shr t5d, 6
140
movifnidn t2d, r2m
141
%if WIN64
142
PUSH r7
143
%endif
144
%ifdef PIC
145
lea r7, [$$]
146
%endif
147
LOAD_GLOBAL t5d, cabac_range_lps-4, t5, t4*2, t4
148
LOAD_GLOBAL t4d, cabac_transition, t2, t6*2, t4
149
and t6d, 1
150
sub t3d, t5d
151
cmp t6d, t2d
152
mov t6d, [t0+cb.low]
153
lea t2, [t6+t3]
154
cmovne t3d, t5d
155
cmovne t6d, t2d
156
mov [t0+cb.state+t1], t4b
157
;cabac_encode_renorm
158
mov t4d, t3d
159
%ifidn %1, bmi2
160
lzcnt t3d, t3d
161
sub t3d, 23
162
shlx t4d, t4d, t3d
163
shlx t6d, t6d, t3d
164
%else
165
shr t3d, 3
166
LOAD_GLOBAL t3d, cabac_renorm_shift, t3
167
shl t4d, t3b
168
shl t6d, t3b
169
%endif
170
%if WIN64
171
POP r7
172
%endif
173
mov [t0+cb.range], t4d
174
add t3d, [t0+cb.queue]
175
jge cabac_putbyte_%1
176
.update_queue_low:
177
mov [t0+cb.low], t6d
178
mov [t0+cb.queue], t3d
179
RET
180
181
cglobal cabac_encode_bypass_%1, 2,3
182
mov t7d, [r0+cb.low]
183
and r1d, [r0+cb.range]
184
lea t7d, [t7*2+r1]
185
movifnidn t0, r0 ; WIN64
186
mov t3d, [r0+cb.queue]
187
inc t3d
188
%if ARCH_X86_64 ; .putbyte compiles to nothing but a jmp
189
jge cabac_putbyte_%1
190
%else
191
jge .putbyte
192
%endif
193
mov [t0+cb.low], t7d
194
mov [t0+cb.queue], t3d
195
RET
196
%if ARCH_X86_64 == 0
197
.putbyte:
198
PROLOGUE 0,7
199
movifnidn t6d, t7d
200
jmp cabac_putbyte_%1
201
%endif
202
203
%ifnidn %1,bmi2
204
cglobal cabac_encode_terminal_%1, 1,3
205
sub dword [r0+cb.range], 2
206
; shortcut: the renormalization shift in terminal
207
; can only be 0 or 1 and is zero over 99% of the time.
208
test dword [r0+cb.range], 0x100
209
je .renorm
210
RET
211
.renorm:
212
shl dword [r0+cb.low], 1
213
shl dword [r0+cb.range], 1
214
inc dword [r0+cb.queue]
215
jge .putbyte
216
RET
217
.putbyte:
218
PROLOGUE 0,7
219
movifnidn t0, r0 ; WIN64
220
mov t3d, [r0+cb.queue]
221
mov t6d, [t0+cb.low]
222
%endif
223
224
cabac_putbyte_%1:
225
; alive: t0=cb t3=queue t6=low
226
%if WIN64
227
DECLARE_REG_TMP 3,6,1,0,2,5,4
228
%endif
229
%ifidn %1, bmi2
230
add t3d, 10
231
shrx t2d, t6d, t3d
232
bzhi t6d, t6d, t3d
233
sub t3d, 18
234
%else
235
mov t1d, -1
236
add t3d, 10
237
mov t2d, t6d
238
shl t1d, t3b
239
shr t2d, t3b ; out
240
not t1d
241
sub t3d, 18
242
and t6d, t1d
243
%endif
244
mov t5d, [t0+cb.bytes_outstanding]
245
cmp t2b, 0xff ; FIXME is a 32bit op faster?
246
jz .postpone
247
mov t1, [t0+cb.p]
248
add [t1-1], t2h
249
dec t2h
250
.loop_outstanding:
251
mov [t1], t2h
252
inc t1
253
dec t5d
254
jge .loop_outstanding
255
mov [t1-1], t2b
256
mov [t0+cb.p], t1
257
.postpone:
258
inc t5d
259
mov [t0+cb.bytes_outstanding], t5d
260
jmp mangle(x264_cabac_encode_decision_%1.update_queue_low)
261
%endmacro
262
263
CABAC asm
264
CABAC bmi2
265
266
; %1 = label name
267
; %2 = node_ctx init?
268
%macro COEFF_ABS_LEVEL_GT1 2
269
%if %2
270
%define ctx 1
271
%else
272
movzx r11d, byte [coeff_abs_level1_ctx+r2 GLOBAL]
273
%define ctx r11
274
%endif
275
movzx r9d, byte [r8+ctx]
276
; if( coeff_abs > 1 )
277
cmp r1d, 1
278
jg .%1_gt1
279
; x264_cabac_encode_decision( cb, ctx_level+ctx, 0 )
280
movzx r10d, byte [cabac_transition+r9*2 GLOBAL]
281
movzx r9d, word [cabac_entropy+r9*2 GLOBAL]
282
lea r0d, [r0+r9+256]
283
mov [r8+ctx], r10b
284
%if %2
285
mov r2d, 1
286
%else
287
movzx r2d, byte [coeff_abs_level_transition+r2 GLOBAL]
288
%endif
289
jmp .%1_end
290
291
.%1_gt1:
292
; x264_cabac_encode_decision( cb, ctx_level+ctx, 1 )
293
movzx r10d, byte [cabac_transition+r9*2+1 GLOBAL]
294
xor r9d, 1
295
movzx r9d, word [cabac_entropy+r9*2 GLOBAL]
296
mov [r8+ctx], r10b
297
add r0d, r9d
298
%if %2
299
%define ctx 5
300
%else
301
movzx r11d, byte [coeff_abs_levelgt1_ctx+r2 GLOBAL]
302
%define ctx r11
303
%endif
304
; if( coeff_abs < 15 )
305
cmp r1d, 15
306
jge .%1_escape
307
shl r1d, 7
308
; x264_cabac_transition_unary[coeff_abs-1][cb->state[ctx_level+ctx]]
309
movzx r9d, byte [r8+ctx]
310
add r9d, r1d
311
movzx r10d, byte [cabac_transition_unary-128+r9 GLOBAL]
312
; x264_cabac_size_unary[coeff_abs-1][cb->state[ctx_level+ctx]]
313
movzx r9d, word [cabac_size_unary-256+r9*2 GLOBAL]
314
mov [r8+ctx], r10b
315
add r0d, r9d
316
jmp .%1_gt1_end
317
318
.%1_escape:
319
; x264_cabac_transition_unary[14][cb->state[ctx_level+ctx]]
320
movzx r9d, byte [r8+ctx]
321
movzx r10d, byte [cabac_transition_unary+128*14+r9 GLOBAL]
322
; x264_cabac_size_unary[14][cb->state[ctx_level+ctx]]
323
movzx r9d, word [cabac_size_unary+256*14+r9*2 GLOBAL]
324
add r0d, r9d
325
mov [r8+ctx], r10b
326
sub r1d, 14
327
%if cpuflag(lzcnt)
328
lzcnt r9d, r1d
329
xor r9d, 0x1f
330
%else
331
bsr r9d, r1d
332
%endif
333
; bs_size_ue_big(coeff_abs-15)<<8
334
shl r9d, 9
335
; (ilog2(coeff_abs-14)+1) << 8
336
lea r0d, [r0+r9+256]
337
.%1_gt1_end:
338
%if %2
339
mov r2d, 4
340
%else
341
movzx r2d, byte [coeff_abs_level_transition+8+r2 GLOBAL]
342
%endif
343
.%1_end:
344
%endmacro
345
346
%macro LOAD_DCTCOEF 1
347
%if HIGH_BIT_DEPTH
348
mov %1, [dct+r6*4]
349
%else
350
movzx %1, word [dct+r6*2]
351
%endif
352
%endmacro
353
354
%macro ABS_DCTCOEFS 2
355
%assign i 0
356
%rep %2/16
357
%if HIGH_BIT_DEPTH
358
ABSD m0, [%1+ 0+i*64], m4
359
ABSD m1, [%1+16+i*64], m5
360
ABSD m2, [%1+32+i*64], m4
361
ABSD m3, [%1+48+i*64], m5
362
mova [rsp+ 0+i*64], m0
363
mova [rsp+16+i*64], m1
364
mova [rsp+32+i*64], m2
365
mova [rsp+48+i*64], m3
366
%else
367
ABSW m0, [%1+ 0+i*32], m2
368
ABSW m1, [%1+16+i*32], m3
369
mova [rsp+ 0+i*32], m0
370
mova [rsp+16+i*32], m1
371
%endif
372
%assign i i+1
373
%endrep
374
%endmacro
375
376
%macro SIG_OFFSET 1
377
%if %1
378
movzx r11d, byte [r4+r6]
379
%endif
380
%endmacro
381
382
%macro LAST_OFFSET 1
383
%if %1
384
movzx r11d, byte [last_coeff_flag_offset_8x8+r6 GLOBAL]
385
%endif
386
%endmacro
387
388
;-----------------------------------------------------------------------------
389
; void x264_cabac_block_residual_rd_internal_sse2 ( dctcoef *l, int b_interlaced,
390
; int ctx_block_cat, x264_cabac_t *cb );
391
;-----------------------------------------------------------------------------
392
393
;%1 = 8x8 mode
394
%macro CABAC_RESIDUAL_RD 2
395
%if %1
396
%define func cabac_block_residual_8x8_rd_internal
397
%define maxcoeffs 64
398
%define dct rsp
399
%else
400
%define func cabac_block_residual_rd_internal
401
%define maxcoeffs 16
402
%define dct r4
403
%endif
404
405
%ifdef PIC
406
cglobal func, 4,13
407
lea r12, [$$]
408
%define GLOBAL +r12-$$
409
%else
410
cglobal func, 4,12
411
%define GLOBAL
412
%endif
413
414
%assign pad gprsize+SIZEOF_DCTCOEF*maxcoeffs-(stack_offset&15)
415
SUB rsp, pad
416
shl r1d, 4 ; MB_INTERLACED*16
417
%if %1
418
lea r4, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL] ; r12 = sig offset 8x8
419
%endif
420
add r1d, r2d
421
movzx r5d, word [significant_coeff_flag_offset+r1*2 GLOBAL] ; r5 = ctx_sig
422
movzx r7d, word [last_coeff_flag_offset+r1*2 GLOBAL] ; r7 = ctx_last
423
movzx r8d, word [coeff_abs_level_m1_offset+r2*2 GLOBAL] ; r8 = ctx_level
424
425
; abs() all the coefficients; copy them to the stack to avoid
426
; changing the originals.
427
; overreading is okay; it's all valid aligned data anyways.
428
%if %1
429
ABS_DCTCOEFS r0, 64
430
%else
431
mov r4, r0 ; r4 = dct
432
mov r6, ~SIZEOF_DCTCOEF
433
and r6, r4 ; handle AC coefficient case
434
ABS_DCTCOEFS r6, 16
435
sub r4, r6 ; calculate our new dct pointer
436
add r4, rsp ; restore AC coefficient offset
437
%endif
438
mov r1, [%2+gprsize*r2 GLOBAL]
439
; for improved OOE performance, run coeff_last on the original coefficients.
440
call r1 ; coeff_last[ctx_block_cat]( dct )
441
; we know on 64-bit that the SSE2 versions of this function only
442
; overwrite r0, r1, and rax (r6). last64 overwrites r2 too, but we
443
; don't need r2 in 8x8 mode.
444
mov r0d, [r3+cb.bits_encoded] ; r0 = cabac.f8_bits_encoded
445
; pre-add some values to simplify addressing
446
add r3, cb.state
447
add r5, r3
448
add r7, r3
449
add r8, r3 ; precalculate cabac state pointers
450
451
; if( last != count_cat_m1[ctx_block_cat] )
452
%if %1
453
cmp r6b, 63
454
%else
455
cmp r6b, [count_cat_m1+r2 GLOBAL]
456
%endif
457
je .skip_last_sigmap
458
459
; in 8x8 mode we have to do a bit of extra calculation for ctx_sig/last,
460
; so we'll use r11 for this.
461
%if %1
462
%define siglast_ctx r11
463
%else
464
%define siglast_ctx r6
465
%endif
466
467
; x264_cabac_encode_decision( cb, ctx_sig + last, 1 )
468
; x264_cabac_encode_decision( cb, ctx_last + last, 1 )
469
SIG_OFFSET %1
470
movzx r1d, byte [r5+siglast_ctx]
471
movzx r9d, byte [cabac_transition+1+r1*2 GLOBAL]
472
xor r1d, 1
473
movzx r1d, word [cabac_entropy+r1*2 GLOBAL]
474
mov [r5+siglast_ctx], r9b
475
add r0d, r1d
476
477
LAST_OFFSET %1
478
movzx r1d, byte [r7+siglast_ctx]
479
movzx r9d, byte [cabac_transition+1+r1*2 GLOBAL]
480
xor r1d, 1
481
movzx r1d, word [cabac_entropy+r1*2 GLOBAL]
482
mov [r7+siglast_ctx], r9b
483
add r0d, r1d
484
.skip_last_sigmap:
485
LOAD_DCTCOEF r1d
486
COEFF_ABS_LEVEL_GT1 last, 1
487
; for( int i = last-1 ; i >= 0; i-- )
488
dec r6d
489
jl .end
490
.coeff_loop:
491
LOAD_DCTCOEF r1d
492
; if( l[i] )
493
SIG_OFFSET %1
494
movzx r9d, byte [r5+siglast_ctx]
495
test r1d, r1d
496
jnz .coeff_nonzero
497
; x264_cabac_encode_decision( cb, ctx_sig + i, 0 )
498
movzx r10d, byte [cabac_transition+r9*2 GLOBAL]
499
movzx r9d, word [cabac_entropy+r9*2 GLOBAL]
500
mov [r5+siglast_ctx], r10b
501
add r0d, r9d
502
dec r6d
503
jge .coeff_loop
504
jmp .end
505
.coeff_nonzero:
506
; x264_cabac_encode_decision( cb, ctx_sig + i, 1 )
507
movzx r10d, byte [cabac_transition+r9*2+1 GLOBAL]
508
xor r9d, 1
509
movzx r9d, word [cabac_entropy+r9*2 GLOBAL]
510
mov [r5+siglast_ctx], r10b
511
add r0d, r9d
512
; x264_cabac_encode_decision( cb, ctx_last + i, 0 );
513
LAST_OFFSET %1
514
movzx r9d, byte [r7+siglast_ctx]
515
movzx r10d, byte [cabac_transition+r9*2 GLOBAL]
516
movzx r9d, word [cabac_entropy+r9*2 GLOBAL]
517
mov [r7+siglast_ctx], r10b
518
add r0d, r9d
519
COEFF_ABS_LEVEL_GT1 coeff, 0
520
dec r6d
521
jge .coeff_loop
522
.end:
523
mov [r3+cb.bits_encoded-cb.state], r0d
524
ADD rsp, pad
525
RET
526
%endmacro
527
528
%if ARCH_X86_64
529
INIT_XMM sse2
530
CABAC_RESIDUAL_RD 0, coeff_last_sse2
531
CABAC_RESIDUAL_RD 1, coeff_last_sse2
532
INIT_XMM sse2,lzcnt
533
CABAC_RESIDUAL_RD 0, coeff_last_sse2_lzcnt
534
CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt
535
INIT_XMM ssse3
536
CABAC_RESIDUAL_RD 0, coeff_last_sse2
537
CABAC_RESIDUAL_RD 1, coeff_last_sse2
538
INIT_XMM ssse3,lzcnt
539
CABAC_RESIDUAL_RD 0, coeff_last_sse2_lzcnt
540
CABAC_RESIDUAL_RD 1, coeff_last_sse2_lzcnt
541
%endif
542
543
;-----------------------------------------------------------------------------
544
; void x264_cabac_block_residual_internal_sse2 ( dctcoef *l, int b_interlaced,
545
; int ctx_block_cat, x264_cabac_t *cb );
546
;-----------------------------------------------------------------------------
547
548
%macro CALL_CABAC 0
549
%if cpuflag(bmi2)
550
call cabac_encode_decision_bmi2
551
%else
552
call cabac_encode_decision_asm
553
%endif
554
%if WIN64 ; move cabac back
555
mov r0, r3
556
%endif
557
%endmacro
558
559
; %1 = 8x8 mode
560
; %2 = dct register
561
; %3 = countcat
562
; %4 = name
563
%macro SIGMAP_LOOP 3-4
564
.sigmap_%4loop:
565
%if HIGH_BIT_DEPTH
566
mov %2, [dct+r10*4]
567
%else
568
movsx %2, word [dct+r10*2]
569
%endif
570
%if %1
571
movzx r1d, byte [sigoff_8x8 + r10]
572
add r1d, sigoffd
573
%else
574
lea r1d, [sigoffd + r10d]
575
%endif
576
test %2, %2
577
jz .sigmap_%4zero ; if( l[i] )
578
inc coeffidxd
579
mov [coeffs+coeffidxq*4], %2 ; coeffs[++coeff_idx] = l[i];
580
mov r2d, 1
581
CALL_CABAC ; x264_cabac_encode_decision( cb, ctx_sig + sig_off, 1 );
582
%if %1
583
movzx r1d, byte [last_coeff_flag_offset_8x8 + r10 GLOBAL]
584
add r1d, lastoffd
585
%else
586
lea r1d, [lastoffd + r10d]
587
%endif
588
cmp r10d, lastm ; if( i == last )
589
je .sigmap_%4last
590
xor r2d, r2d
591
CALL_CABAC ; x264_cabac_encode_decision( cb, ctx_last + last_off, 0 );
592
jmp .sigmap_%4loop_endcheck
593
.sigmap_%4zero:
594
xor r2d, r2d
595
CALL_CABAC ; x264_cabac_encode_decision( cb, ctx_sig + sig_off, 0 );
596
.sigmap_%4loop_endcheck:
597
inc r10d
598
cmp r10d, %3
599
jne .sigmap_%4loop ; if( ++i == count_m1 )
600
%if HIGH_BIT_DEPTH
601
mov %2, [dct+r10*4]
602
%else
603
movsx %2, word [dct+r10*2]
604
%endif
605
inc coeffidxd
606
mov [coeffs+coeffidxq*4], %2 ; coeffs[++coeff_idx] = l[i]
607
jmp .sigmap_%4end
608
.sigmap_%4last: ; x264_cabac_encode_decision( cb, ctx_last + last_off, 1 );
609
mov r2d, 1
610
CALL_CABAC
611
.sigmap_%4end:
612
%if %1==0
613
jmp .level_loop_start
614
%endif
615
%endmacro
616
617
%macro CABAC_RESIDUAL 1
618
cglobal cabac_block_residual_internal, 4,15
619
%ifdef PIC
620
; if we use the same r7 as in cabac_encode_decision, we can cheat and save a register.
621
lea r7, [$$]
622
%define lastm [rsp+4*1]
623
%define GLOBAL +r7-$$
624
%else
625
%define lastm r7d
626
%define GLOBAL
627
%endif
628
%assign pad gprsize+4*2+4*64-(stack_offset&15)
629
SUB rsp, pad
630
shl r1d, 4
631
632
%define sigoffq r8
633
%define sigoffd r8d
634
%define lastoffq r9
635
%define lastoffd r9d
636
%define leveloffq r10
637
%define leveloffd r10d
638
%define leveloffm [rsp+4*0]
639
%define countcatd r11d
640
%define sigoff_8x8 r12
641
%define coeffidxq r13
642
%define coeffidxd r13d
643
%define dct r14
644
%define coeffs rsp+4*2
645
646
lea sigoff_8x8, [significant_coeff_flag_offset_8x8+r1*4 GLOBAL]
647
add r1d, r2d
648
movzx sigoffd, word [significant_coeff_flag_offset+r1*2 GLOBAL]
649
movzx lastoffd, word [last_coeff_flag_offset+r1*2 GLOBAL]
650
movzx leveloffd, word [coeff_abs_level_m1_offset+r2*2 GLOBAL]
651
movzx countcatd, byte [count_cat_m1+r2 GLOBAL]
652
mov coeffidxd, -1
653
mov dct, r0
654
mov leveloffm, leveloffd
655
656
mov r1, [%1+gprsize*r2 GLOBAL]
657
call r1
658
mov lastm, eax
659
; put cabac in r0; needed for cabac_encode_decision
660
mov r0, r3
661
662
xor r10d, r10d
663
cmp countcatd, 63
664
je .sigmap_8x8
665
SIGMAP_LOOP 0, r12d, countcatd,
666
.sigmap_8x8:
667
SIGMAP_LOOP 1, r11d, 63, _8x8
668
.level_loop_start:
669
; we now have r8, r9, r11, r12, and r7/r14(dct) free for the main loop.
670
%define nodectxq r8
671
%define nodectxd r8d
672
mov leveloffd, leveloffm
673
xor nodectxd, nodectxd
674
.level_loop:
675
mov r9d, [coeffs+coeffidxq*4]
676
mov r11d, r9d
677
sar r11d, 31
678
add r9d, r11d
679
movzx r1d, byte [coeff_abs_level1_ctx+nodectxq GLOBAL]
680
xor r9d, r11d
681
add r1d, leveloffd
682
cmp r9d, 1
683
jg .level_gt1
684
xor r2d, r2d
685
CALL_CABAC
686
movzx nodectxd, byte [coeff_abs_level_transition+nodectxq GLOBAL]
687
jmp .level_sign
688
.level_gt1:
689
mov r2d, 1
690
CALL_CABAC
691
movzx r14d, byte [coeff_abs_levelgt1_ctx+nodectxq GLOBAL]
692
add r14d, leveloffd
693
cmp r9d, 15
694
mov r12d, 15
695
cmovl r12d, r9d
696
sub r12d, 2
697
jz .level_eq2
698
.level_gt1_loop:
699
mov r1d, r14d
700
mov r2d, 1
701
CALL_CABAC
702
dec r12d
703
jg .level_gt1_loop
704
cmp r9d, 15
705
jge .level_bypass
706
.level_eq2:
707
mov r1d, r14d
708
xor r2d, r2d
709
CALL_CABAC
710
jmp .level_gt1_end
711
.level_bypass:
712
lea r2d, [r9d-15]
713
xor r1d, r1d
714
push r0
715
; we could avoid this if we implemented it in asm, but I don't feel like that
716
; right now.
717
%if UNIX64
718
push r7
719
push r8
720
%else
721
sub rsp, 32 ; shadow space
722
%endif
723
call cabac_encode_ue_bypass
724
%if UNIX64
725
pop r8
726
pop r7
727
%else
728
add rsp, 32
729
%endif
730
pop r0
731
.level_gt1_end:
732
movzx nodectxd, byte [coeff_abs_level_transition+8+nodectxq GLOBAL]
733
.level_sign:
734
mov r1d, r11d
735
%if cpuflag(bmi2)
736
call cabac_encode_bypass_bmi2
737
%else
738
call cabac_encode_bypass_asm
739
%endif
740
%if WIN64
741
mov r0, r3
742
%endif
743
dec coeffidxd
744
jge .level_loop
745
ADD rsp, pad
746
RET
747
%endmacro
748
749
%if ARCH_X86_64
750
INIT_XMM sse2
751
CABAC_RESIDUAL coeff_last_sse2
752
INIT_XMM sse2,lzcnt
753
CABAC_RESIDUAL coeff_last_sse2_lzcnt
754
INIT_XMM avx2,bmi2
755
CABAC_RESIDUAL coeff_last_avx2_lzcnt
756
%endif
757
758