; Listing captured from CoCalc: trellis-64.asm
; Path: 05. Matplotlib / ffmpeg-3.0 / libx264 / common / x86 / trellis-64.asm
; (page view counter at capture time: 52867 views)
;*****************************************************************************
;* trellis-64.asm: x86_64 trellis quantization
;*****************************************************************************
;* Copyright (C) 2012-2016 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;*
;* This program is free software; you can redistribute it and/or modify
;* it under the terms of the GNU General Public License as published by
;* the Free Software Foundation; either version 2 of the License, or
;* (at your option) any later version.
;*
;* This program is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;* GNU General Public License for more details.
;*
;* You should have received a copy of the GNU General Public License
;* along with this program; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02111, USA.
;*
;* This program is also available under a commercial proprietary license.
;* For more information, contact us at licensing@x264.com.
;*****************************************************************************

; This is a pretty straightforward translation of the C code, except:
; * simd ssd and psy: 2x parallel, handling the 2 candidate values of abs_level.
; * simd trellis_coef0, ZERO_LEVEL_IDX, and the coef0 part of the main loop:
;   4x parallel, handling 4 node_ctxs of the same coef (even if some of those
;   nodes are invalid).
; * Interprocedural register allocation. Eliminates argument-passing overhead
;   to trellis_coef* subroutines. Also reduces codesize.

; Optimizations that I tried, and rejected because they were not faster:
; * Separate loops for node_ctx [4..7] or smaller subsets of [0..3].
;   Costs too much icache compared to the negligible speedup.
; * There are only 21 possible sets of live node_ctxs; we could keep track of
;   exactly which set we're in and feed that (along with abs_level) into a jump
;   table instead of the switch to select a trellis_coef subroutine. This would
;   eliminate all branches about which node_ctxs are live, but costs either a
;   bunch of icache or a bunch of call/ret, and the jump table itself is
;   unpredictable.
; * Separate versions of trellis_coef* depending on whether we're doing the 1st
;   or the 2nd of the two abs_level candidates. This would eliminate some
;   branches about if(score is better).
; * Special case more values of coef. I had a coef2 at some intermediate point
;   in the optimization process, but it didn't end up worthwhile in conjunction
;   with all the other optimizations.
; * Unroll or simd writeback. I don't know why this didn't help.

%include "x86inc.asm"
%include "x86util.asm"

SECTION_RODATA

; Packed constants for the SIMD code below.
; pd_8 / pd_m16: the "+8" and "&~15" terms of the DC rounding step in TRELLIS_LOOP.
pd_8: times 4 dd 8
pd_m16: times 4 dd -16
; pd_0123 / pd_4567: per-lane offsets added to levels_used in ZERO_LEVEL_IDX.
pd_0123: dd 0, 1, 2, 3
pd_4567: dd 4, 5, 6, 7
; sq_1: subtracted from the duplicated |quant_coef| so that only the low lane
; is decremented, giving the two abs_level candidates {q-1, q}.
sq_1: dq 1, 0
; pq_128: rounding term added before the >>8 of the unquantized level.
pq_128: times 2 dq 128
; pq_ffffffff: keeps the low dword of each qword (non-SSSE3, 8-bit path of SQUARE).
pq_ffffffff: times 2 dq 0xffffffff

; Tables that are not defined in this file.
cextern cabac_entropy
cextern cabac_transition
cextern cabac_size_unary
cextern cabac_transition_unary
cextern dct4_weight_tab
cextern dct8_weight_tab
cextern dct4_weight2_tab
cextern dct8_weight2_tab
cextern last_coeff_flag_offset_8x8
cextern significant_coeff_flag_offset_8x8
cextern coeff_flag_offset_chroma_422_dc

SECTION .text

; Score that TRELLIS stores into nodes_cur[0] before the first coefficient.
%define TRELLIS_SCORE_BIAS 1<<60
; 8 nodes occupy 8*SIZEOF_NODE bytes (see the node_* offset macros in TRELLIS).
%define SIZEOF_NODE 16
; Bit costs are multiplied by lambda2 and then shifted right by
; CABAC_SIZE_BITS - LAMBDA_BITS wherever they are added to a score.
%define CABAC_SIZE_BITS 8
%define LAMBDA_BITS 4
;-----------------------------------------------------------------------------
; SQUARE dst, tmp
;   dst, tmp: register numbers (used as m<dst> / m<tmp>).
;   Replaces the low dword of every qword lane of m<dst> with its square,
;   via pmuludq.
;   * SSSE3: takes pabsd first, then multiplies.
;   * HIGH_BIT_DEPTH without SSSE3: takes the absolute value through ABSD into
;     m<tmp> and SWAPs the two register names, so m<tmp> is clobbered.
;   * otherwise: multiplies without abs and keeps only the low 32 bits of
;     each product (NOTE(review): presumably relies on the 8-bit-depth
;     operand being small enough that the square fits in 32 bits - confirm).
;-----------------------------------------------------------------------------
%macro SQUARE 2 ; dst, tmp
    ; could use pmuldq here, to eliminate the abs. but that would involve
    ; templating a sse4 version of all of trellis, for negligible speedup.
%if cpuflag(ssse3)
    pabsd   m%1, m%1
    pmuludq m%1, m%1
%elif HIGH_BIT_DEPTH
    ABSD    m%2, m%1
    SWAP     %1, %2
    pmuludq m%1, m%1
%else
    pmuludq m%1, m%1
    pand    m%1, [pq_ffffffff]
%endif
%endmacro
;-----------------------------------------------------------------------------
; LOAD_DUP dst, src
;   Loads from memory operand src and replicates it into both qword lanes of
;   dst.
;   * SSSE3 build: movddup reads 64 bits and duplicates them.
;   * otherwise: movd reads 32 bits (zero-extended) and punpcklqdq duplicates
;     the low qword.
;   The two variants therefore differ in the upper dword of each lane; the
;   callers in this file only consume the low dword of each lane (pmuludq)
;   or shift the upper half away.
;-----------------------------------------------------------------------------
%macro LOAD_DUP 2 ; dst, src
%if cpuflag(ssse3)
    movddup    %1, %2
%else
    movd       %1, %2
    punpcklqdq %1, %1
%endif
%endmacro
;-----------------------------------------------------------------------------
; int trellis_cabac_4x4_psy(
;     const int *unquant_mf, const uint8_t *zigzag, int lambda2,
;     int last_nnz, dctcoef *orig_coefs, dctcoef *quant_coefs, dctcoef *dct,
;     uint8_t *cabac_state_sig, uint8_t *cabac_state_last,
;     uint64_t level_state0, uint16_t level_state1,
;     int b_ac, dctcoef *fenc_dct, int psy_trellis )
;-----------------------------------------------------------------------------
; TRELLIS name, num_coefs, dc, psy
;   Emits one exported function (via cglobal) for the given block type.
;     name      - function name passed to cglobal
;     num_coefs - 16, 64 or 8 in the instantiations at the end of this file
;     dc        - nonzero for the DC variants (a single unquant_mf entry is
;                 preloaded into m6)
;     psy       - nonzero for the psy variants (psy_trellis is preloaded
;                 into m6)
;   Flow: set up the stack frame and the two node arrays, run TRELLIS_LOOP 0
;   and TRELLIS_LOOP 1, then walk the chosen level chain in .writeback.
;   Return value: eax = 1 after .writeback. When trellis_bnode_0 selects
;   node 0 the function jumps to .return with eax = 0; in the 4x4 non-dc
;   variant that path first zeroes 32 bytes at dct (64 with HIGH_BIT_DEPTH).
;   Stack-argument slot r11m is reused as b_ac / b_interlaced / i_coefsm1
;   depending on the variant.
%macro TRELLIS 4
%define num_coefs %2
%define dc %3
%define psy %4
cglobal %1, 4,15,9
    %assign level_tree_size 64*8*2*4 ; could depend on num_coefs, but nonuniform stack size would prevent accessing args from trellis_coef*
    %assign pad 96 + level_tree_size + 16*SIZEOF_NODE + 16-gprsize-(stack_offset&15)
    SUB  rsp, pad
    DEFINE_ARGS unquant_mf, zigzag, lambda2, ii, orig_coefs, quant_coefs, dct, cabac_state_sig, cabac_state_last
%if WIN64
    %define level_statem rsp+stack_offset+80 ; r9m, except that we need to index into it (and r10m) as an array
%else
    %define level_statem rsp+stack_offset+32
%endif
    %define b_acm r11m ; 4x4 only
    %define b_interlacedm r11m ; 8x8 only
    %define i_coefsm1 r11m ; dc only
    %define fenc_dctm r12m
    %define psy_trellism r13m
%if num_coefs == 64
    ; pre-scale b_interlaced by 64 in place; TRELLIS_LOOP adds i to it to
    ; index significant_coeff_flag_offset_8x8
    shl dword b_interlacedm, 6
    %define dct_weight1_tab dct8_weight_tab
    %define dct_weight2_tab dct8_weight2_tab
%else
    %define dct_weight1_tab dct4_weight_tab
    %define dct_weight2_tab dct4_weight2_tab
%endif

    ; Local frame layout (offsets from rsp after the SUB above).
    %define stack rsp
    %define last_nnzm [stack+0]
    %define zigzagm   [stack+8]
    mov     last_nnzm, iid
    mov     zigzagm,   zigzagq
%if WIN64 == 0
    %define orig_coefsm  [stack+16]
    %define quant_coefsm [stack+24]
    mov     orig_coefsm,  orig_coefsq
    mov     quant_coefsm, quant_coefsq
%endif
    %define unquant_mfm   [stack+32]
    %define levelgt1_ctxm [stack+40]
    %define ssd            stack+48
    %define cost_siglast   stack+80
    %define level_tree     stack+96

    ; trellis_node_t is laid out differently than C.
    ; struct-of-arrays rather than array-of-structs, for simd.
    ; Per array of 8 nodes: 8 qword scores at +0, 8 dword level_idx at +64,
    ; 8 dword cabac_state at +96 (8*SIZEOF_NODE bytes in total).
    %define nodes_curq r7
    %define nodes_prevq r8
    %define node_score(x) x*8
    %define node_level_idx(x) 64+x*4
    %define node_cabac_state(x) 96+x*4
    lea nodes_curq, [level_tree + level_tree_size]
    lea nodes_prevq, [nodes_curq + 8*SIZEOF_NODE]
    mov        r6, TRELLIS_SCORE_BIAS
    mov       [nodes_curq + node_score(0)], r6
    mov dword [nodes_curq + node_level_idx(0)], 0
    ; interleave bytes/words from level_statem+0, +4 and +8 into mm0
    movd      mm0, [level_statem + 0]
    punpcklbw mm0, [level_statem + 4]
    punpcklwd mm0, [level_statem + 8]
    %define level_state_packed mm0 ; version for copying into node.cabac_state
    pcmpeqb    m7, m7 ; TRELLIS_SCORE_MAX
    movq [nodes_curq + node_score(1)], m7
    mova [nodes_curq + node_score(2)], m7

    %define levels_usedq r4
    %define levels_usedd r4d
    mov dword [level_tree], 0
    mov       levels_usedd, 1

    %define abs_levelq r9
    %define abs_leveld r9d
    %define abs_coefq r14
    %define zigzagiq r5
    %define zigzagid r5d

%if num_coefs == 8
    mov dword levelgt1_ctxm, 8
%else
    mov dword levelgt1_ctxm, 9
%endif
%if psy
    LOAD_DUP m6, psy_trellism
    %define psy_trellis m6
%elif dc
    LOAD_DUP   m6, [unquant_mfq]
    paddd      m6, m6
    %define unquant_mf m6
%endif
%ifdef PIC
%if dc == 0
    ; r0 is about to be reused as the PIC base, so spill unquant_mf
    mov unquant_mfm, unquant_mfq
%endif
    ; Keep a single offset register to PICify all global constants.
    ; They're all relative to "beginning of this asm file's .text section",
    ; even tables that aren't in this file.
    ; (Any address in .text would work, this one was just convenient.)
    lea r0, [$$]
    %define GLOBAL +r0-$$
%else
    %define GLOBAL
%endif

    TRELLIS_LOOP 0 ; node_ctx 0..3
    TRELLIS_LOOP 1 ; node_ctx 1..7

.writeback:
    ; int level = bnode->level_idx;
    ; for( int i = b_ac; i <= last_nnz; i++ )
    ;     dct[zigzag[i]] = SIGN(level_tree[level].abs_level, orig_coefs[zigzag[i]]);
    ;     level = level_tree[level].next;
    ; On entry: rax = index of the best node and r10 = dct pointer, both
    ; left by trellis_bnode_*. ii runs from (b_ac - last_nnz) up to 0 and
    ; indexes relative to zigzag + last_nnz.
    mov    iid, last_nnzm
    add zigzagq, iiq
    neg    iiq
%if num_coefs == 16 && dc == 0
    mov    r2d, b_acm
    add    iiq, r2
%endif
    %define dctq r10
    mov    r0d, [nodes_curq + node_level_idx(0) + rax*4]
.writeback_loop:
    movzx   r2, byte [zigzagq + iiq]
    ; Each level_tree entry is one dword: low word = next index,
    ; high word = abs_level. The sign is taken from the value currently
    ; stored at dct[zigzag[i]].
%if cpuflag(ssse3)
    movd    m0, [level_tree + r0*4]
    movzx   r0, word [level_tree + r0*4]
    psrld   m0, 16
    movd    m1, [dctq + r2*SIZEOF_DCTCOEF]
%if HIGH_BIT_DEPTH
    psignd  m0, m1
    movd [dctq + r2*SIZEOF_DCTCOEF], m0
%else
    psignw  m0, m1
    movd   r4d, m0
    mov  [dctq + r2*SIZEOF_DCTCOEF], r4w
%endif
%else
    mov    r5d, [level_tree + r0*4]
%if HIGH_BIT_DEPTH
    mov    r4d, dword [dctq + r2*SIZEOF_DCTCOEF]
%else
    movsx  r4d, word [dctq + r2*SIZEOF_DCTCOEF]
%endif
    movzx  r0d, r5w
    ; r4d = 0 or -1 sign mask; (x ^ mask) - mask conditionally negates
    sar    r4d, 31
    shr    r5d, 16
    xor    r5d, r4d
    sub    r5d, r4d
%if HIGH_BIT_DEPTH
    mov  [dctq + r2*SIZEOF_DCTCOEF], r5d
%else
    mov  [dctq + r2*SIZEOF_DCTCOEF], r5w
%endif
%endif
    inc    iiq
    jle .writeback_loop

    mov eax, 1
.return:
    ADD rsp, pad
    RET

%if num_coefs == 16 && dc == 0
.return_zero:
    ; r10 = dct pointer (loaded by trellis_bnode_0); eax is already 0 here
    pxor       m0, m0
    mova [r10+ 0], m0
    mova [r10+16], m0
%if HIGH_BIT_DEPTH
    mova [r10+32], m0
    mova [r10+48], m0
%endif
    jmp .return
%endif
%endmacro ; TRELLIS
;-----------------------------------------------------------------------------
; TRELLIS_LOOP ctx_hi
;   Per-coefficient loop body, expanded twice inside TRELLIS (ctx_hi = 0 for
;   node_ctx 0..3, ctx_hi = 1 for node_ctx 1..7). ii counts down from its
;   value on entry; the loop ends when it drops below 0 (below b_ac in the
;   4x4 non-dc variant).
;   For each coefficient:
;     * quant_coefs[i] == 0: (ctx_hi = 0 only) subtract the "significant = 0"
;       cost from nodes_cur[0].score, then ZERO_LEVEL_IDX and continue.
;     * otherwise: compute abs_level = |quant_coef| - 1, swap the cur/prev
;       node arrays, fill cost_siglast[] and ssd[], and dispatch on
;       abs_level to trellis_coef0 / trellis_coef1 / trellis_coefn.
;   When abs_level >= 1 both expansions continue at .i_continue1, i.e. the
;   ctx_hi = 0 expansion hands over to the ctx_hi = 1 expansion.
;   After the loop trellis_bnode_<ctx_hi> is called. The ctx_hi = 0
;   expansion then branches on ZF to .return / .return_zero or jumps to
;   .writeback; the ctx_hi = 1 expansion falls through to whatever follows
;   the macro (in TRELLIS that is .writeback).
;   Relies on the defines established by TRELLIS (num_coefs, dc, psy, the
;   stack slot names, GLOBAL, register aliases).
;-----------------------------------------------------------------------------
%macro TRELLIS_LOOP 1 ; ctx_hi
.i_loop%1:
    ; if( !quant_coefs[i] )
    mov   r6, quant_coefsm
%if HIGH_BIT_DEPTH
    mov   abs_leveld, dword [r6 + iiq*SIZEOF_DCTCOEF]
%else
    movsx abs_leveld, word [r6 + iiq*SIZEOF_DCTCOEF]
%endif

    ; int sigindex  = num_coefs == 64 ? significant_coeff_flag_offset_8x8[b_interlaced][i] :
    ;                 num_coefs == 8  ? coeff_flag_offset_chroma_422_dc[i] : i;
    ; r10 ends up holding the state byte cabac_state_sig[sigindex]
    mov    r10, cabac_state_sigm
%if num_coefs == 64
    mov    r6d, b_interlacedm
%ifdef PIC
    add    r6d, iid
    movzx  r6d, byte [significant_coeff_flag_offset_8x8 + r6 GLOBAL]
%else
    movzx  r6d, byte [significant_coeff_flag_offset_8x8 + r6 + iiq]
%endif
    movzx  r10, byte [r10 + r6]
%elif num_coefs == 8
    ; r13 is kept for the cabac_state_last lookup further down
    movzx  r13, byte [coeff_flag_offset_chroma_422_dc + iiq GLOBAL]
    movzx  r10, byte [r10 + r13]
%else
    movzx  r10, byte [r10 + iiq]
%endif

    test  abs_leveld, abs_leveld
    jnz %%.nonzero_quant_coef

%if %1 == 0
    ; int cost_sig0 = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 )
    ;               * (uint64_t)lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
    ; nodes_cur[0].score -= cost_sig0;
    movzx  r10, word [cabac_entropy + r10*2 GLOBAL]
    imul   r10, lambda2q
    shr    r10, CABAC_SIZE_BITS - LAMBDA_BITS
    sub   [nodes_curq + node_score(0)], r10
%endif
    ZERO_LEVEL_IDX %1, cur
    jmp .i_continue%1

%%.nonzero_quant_coef:
    ; int sign_coef = orig_coefs[zigzag[i]];
    ; int abs_coef = abs( sign_coef );
    ; int q = abs( quant_coefs[i] );
    movzx   zigzagid, byte [zigzagq+iiq]
    movd    m0, abs_leveld
    mov     r6, orig_coefsm
%if HIGH_BIT_DEPTH
    LOAD_DUP m1, [r6 + zigzagiq*SIZEOF_DCTCOEF]
%else
    ; load starting 2 bytes early so the 16-bit coef lands in the upper half
    ; of the dword, then sign-extend it with the arithmetic shift
    LOAD_DUP m1, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2]
    psrad    m1, 16     ; sign_coef
%endif
    punpcklqdq m0, m0 ; quant_coef
%if cpuflag(ssse3)
    pabsd   m0, m0
    pabsd   m2, m1 ; abs_coef
%else
    pxor    m8, m8
    pcmpgtd m8, m1 ; sign_mask
    pxor    m0, m8
    pxor    m2, m1, m8
    psubd   m0, m8
    psubd   m2, m8
%endif
    ; low lane becomes q-1, high lane stays q: the two abs_level candidates
    psubd   m0, [sq_1] ; abs_level
    movd  abs_leveld, m0

    xchg  nodes_curq, nodes_prevq

    ; if( i < num_coefs-1 )
    ;     int lastindex = num_coefs == 64 ? last_coeff_flag_offset_8x8[i] : i;
    ;                     num_coefs == 8  ? coeff_flag_offset_chroma_422_dc[i] : i
    ;     cost_siglast[0] = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 0 );
    ;     cost_sig1       = x264_cabac_size_decision_noup2( &cabac_state_sig[sigindex], 1 );
    ;     cost_siglast[1] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 0 ) + cost_sig1;
    ;     cost_siglast[2] = x264_cabac_size_decision_noup2( &cabac_state_last[lastindex], 1 ) + cost_sig1;
%if %1 == 0
%if dc && num_coefs != 8
    cmp    iid, i_coefsm1
%else
    cmp    iid, num_coefs-1
%endif
    je %%.zero_siglast
%endif
    movzx  r11, word [cabac_entropy + r10*2 GLOBAL]
    xor    r10, 1
    movzx  r12, word [cabac_entropy + r10*2 GLOBAL]
    mov   [cost_siglast+0], r11d
    mov    r10, cabac_state_lastm
%if num_coefs == 64
    movzx  r6d, byte [last_coeff_flag_offset_8x8 + iiq GLOBAL]
    movzx  r10, byte [r10 + r6]
%elif num_coefs == 8
    movzx  r10, byte [r10 + r13]
%else
    movzx  r10, byte [r10 + iiq]
%endif
    movzx  r11, word [cabac_entropy + r10*2 GLOBAL]
    add    r11, r12
    mov   [cost_siglast+4], r11d
%if %1 == 0
    xor    r10, 1
    movzx  r10, word [cabac_entropy + r10*2 GLOBAL]
    add    r10, r12
    mov   [cost_siglast+8], r10d
%endif
%%.skip_siglast:

    ; int unquant_abs_level = ((unquant_mf[zigzag[i]] * abs_level + 128) >> 8);
    ; int d = abs_coef - unquant_abs_level;
    ; uint64_t ssd = (int64_t)d*d * coef_weight[i];
%if dc
    pmuludq m0, unquant_mf
%else
%ifdef PIC
    mov    r10, unquant_mfm
    LOAD_DUP m3, [r10 + zigzagiq*4]
%else
    LOAD_DUP m3, [unquant_mfq + zigzagiq*4]
%endif
    pmuludq m0, m3
%endif
    paddd   m0, [pq_128]
    psrld   m0, 8 ; unquant_abs_level
%if psy || dc == 0
    ; keep a copy of unquant_abs_level for the psy and dc-rounding code
    mova    m4, m0
%endif
    psubd   m0, m2
    SQUARE   0, 3
%if dc
    psllq   m0, 8
%else
    LOAD_DUP m5, [dct_weight2_tab + zigzagiq*4 GLOBAL]
    pmuludq m0, m5
%endif

%if psy
    test   iid, iid
    jz %%.dc_rounding
    ; int predicted_coef = fenc_dct[zigzag[i]] - sign_coef
    ; int psy_value = abs(unquant_abs_level + SIGN(predicted_coef, sign_coef));
    ; int psy_weight = dct_weight_tab[zigzag[i]] * h->mb.i_psy_trellis;
    ; ssd1[k] -= psy_weight * psy_value;
    mov     r6, fenc_dctm
%if HIGH_BIT_DEPTH
    LOAD_DUP m3, [r6 + zigzagiq*SIZEOF_DCTCOEF]
%else
    LOAD_DUP m3, [r6 + zigzagiq*SIZEOF_DCTCOEF - 2]
    psrad   m3, 16 ; orig_coef
%endif
%if cpuflag(ssse3)
    psignd  m4, m1 ; SIGN(unquant_abs_level, sign_coef)
%else
    PSIGN d, m4, m8
%endif
    psubd   m3, m1 ; predicted_coef
    paddd   m4, m3
%if cpuflag(ssse3)
    pabsd   m4, m4
%else
    ABSD    m3, m4
    SWAP     4, 3
%endif
    LOAD_DUP m1, [dct_weight1_tab + zigzagiq*4 GLOBAL]
    pmuludq m1, psy_trellis
    pmuludq m4, m1
    psubq   m0, m4
    ; in the ctx_hi expansion the i == 0 branch above only skips the psy
    ; term; in the ctx_lo expansion its target is the dc-rounding block below
%if %1
%%.dc_rounding:
%endif
%endif
%if %1 == 0
    mova [ssd], m0
%endif

%if dc == 0 && %1 == 0
    test   iid, iid
    jnz %%.skip_dc_rounding
%%.dc_rounding:
    ; Optimize rounding for DC coefficients in DC-only luma 4x4/8x8 blocks.
    ; int d = abs_coef - ((unquant_abs_level + (sign_coef>>31) + 8)&~15);
    ; uint64_t ssd = (int64_t)d*d * coef_weight[i];
    psrad   m1, 31 ; sign_coef>>31
    paddd   m4, [pd_8]
    paddd   m4, m1
    pand    m4, [pd_m16] ; (unquant_abs_level + (sign_coef>>31) + 8)&~15
    psubd   m4, m2 ; d
    SQUARE   4, 3
    pmuludq m4, m5
    mova [ssd], m4
%%.skip_dc_rounding:
%endif
    mova [ssd+16], m0

    ; remembered for clocal, which derives the subroutines' stack_offset
    %assign stack_offset_bak stack_offset
    ; The flags of this compare are consumed twice: by the jl directly below
    ; and by the je after the score stores (none of the instructions in
    ; between modifies flags).
    cmp abs_leveld, 1
    jl %%.switch_coef0
%if %1 == 0
    mov    r10, [ssd] ; trellis_coef* args
%endif
    movq   r12, m0
    ; for( int j = 0; j < 8; j++ )
    ;     nodes_cur[j].score = TRELLIS_SCORE_MAX;
%if cpuflag(ssse3)
    mova [nodes_curq + node_score(0)], m7
    mova [nodes_curq + node_score(2)], m7
%else ; avoid store-forwarding stalls on k8/k10
%if %1 == 0
    movq [nodes_curq + node_score(0)], m7
%endif
    movq [nodes_curq + node_score(1)], m7
    movq [nodes_curq + node_score(2)], m7
    movq [nodes_curq + node_score(3)], m7
%endif
    mova [nodes_curq + node_score(4)], m7
    mova [nodes_curq + node_score(6)], m7
    je %%.switch_coef1
%%.switch_coefn:
    call trellis_coefn.entry%1
    call trellis_coefn.entry%1b
    jmp .i_continue1
%%.switch_coef1:
    call trellis_coef1.entry%1
    call trellis_coefn.entry%1b
    jmp .i_continue1
%%.switch_coef0:
    call trellis_coef0_%1
    call trellis_coef1.entry%1b

.i_continue%1:
    dec iid
%if num_coefs == 16 && dc == 0
    cmp iid, b_acm
%endif
    jge .i_loop%1

    ; returns the best node index in eax, ZF set when it is 0, r10 = dct
    call trellis_bnode_%1
%if %1 == 0
%if num_coefs == 16 && dc == 0
    jz .return_zero
%else
    jz .return
%endif
    jmp .writeback

%%.zero_siglast:
    ; clears cost_siglast[0], [1] (one qword store) and [2]
    xor  r6d, r6d
    mov [cost_siglast+0], r6
    mov [cost_siglast+8], r6d
    jmp %%.skip_siglast
%endif
%endmacro ; TRELLIS_LOOP
; just a synonym for %if
; Used as "IF<flag> <instruction>" with a flag that expands to 0 or 1:
; IF0 swallows the rest of the line, IF1 emits it unchanged.
%macro IF0 1+
%endmacro
%macro IF1 1+
    %1
%endmacro
;-----------------------------------------------------------------------------
; ZERO_LEVEL_IDX ctx_hi, prev
;   Appends one level_tree entry per node, each holding that node's previous
;   level_idx in the low word and abs_level = 0 in the high word, and points
;   nodes_cur[j].level_idx at the new entries.
;     ctx_hi - 0: handle nodes 0..3 only; 1: handle nodes 0..7
;     prev   - "cur" or "prev": which node array the old level_idx values
;              are read from (nodes_<prev>q)
;   levels_used is first rounded up to a multiple of 4 so the level_tree
;   stores can be aligned, then advanced by 4 or 8.
;   Clobbers m0, m2 (and m1, m3 when ctx_hi is 1) and flags.
;-----------------------------------------------------------------------------
%macro ZERO_LEVEL_IDX 2 ; ctx_hi, prev
    ; for( int j = 0; j < 8; j++ )
    ;     nodes_cur[j].level_idx = levels_used;
    ;     level_tree[levels_used].next = (trellis_level_t){ .next = nodes_cur[j].level_idx, .abs_level = 0 };
    ;     levels_used++;
    add  levels_usedd, 3
    and  levels_usedd, ~3 ; allow aligned stores
    movd       m0, levels_usedd
    pshufd     m0, m0, 0
    IF%1 mova  m1, m0
         paddd m0, [pd_0123]
    IF%1 paddd m1, [pd_4567]
         mova  m2, [nodes_%2q + node_level_idx(0)]
    IF%1 mova  m3, [nodes_%2q + node_level_idx(4)]
         mova [nodes_curq + node_level_idx(0)], m0
    IF%1 mova [nodes_curq + node_level_idx(4)], m1
         mova [level_tree + (levels_usedq+0)*4], m2
    IF%1 mova [level_tree + (levels_usedq+4)*4], m3
    add  levels_usedd, (1+%1)*4
%endmacro
; Instantiate every variant for SSE2 and again for SSSE3.
; Arguments: function name, num_coefs, dc, psy.
INIT_XMM sse2
TRELLIS trellis_cabac_4x4, 16, 0, 0
TRELLIS trellis_cabac_8x8, 64, 0, 0
TRELLIS trellis_cabac_4x4_psy, 16, 0, 1
TRELLIS trellis_cabac_8x8_psy, 64, 0, 1
TRELLIS trellis_cabac_dc, 16, 1, 0
TRELLIS trellis_cabac_chroma_422_dc, 8, 1, 0
INIT_XMM ssse3
TRELLIS trellis_cabac_4x4, 16, 0, 0
TRELLIS trellis_cabac_8x8, 64, 0, 0
TRELLIS trellis_cabac_4x4_psy, 16, 0, 1
TRELLIS trellis_cabac_8x8_psy, 64, 0, 1
TRELLIS trellis_cabac_dc, 16, 1, 0
TRELLIS trellis_cabac_chroma_422_dc, 8, 1, 0
; Settings for the trellis_* subroutines below. They are entered with call
; from the TRELLIS bodies and share that caller's frame, so frame-relative
; names are redefined one gprsize further up (presumably to step over the
; pushed return address).
%define stack rsp+gprsize
; Working registers of START_COEF / END_COEF / TRELLIS_COEF0.
%define scoreq r14
%define bitsq r13
%define bitsd r13d

INIT_XMM
;-----------------------------------------------------------------------------
; clocal name
;   Opens a 16-byte-aligned subroutine that is reachable both as the mangled
;   global x264_<name> and as the plain label <name>, and sets stack_offset
;   to the value saved by TRELLIS_LOOP (stack_offset_bak) plus gprsize so
;   that the stack-argument macros used inside the subroutine stay valid
;   after the call.
;-----------------------------------------------------------------------------
%macro clocal 1
    ALIGN 16
    global mangle(x264_%1)
    mangle(x264_%1):
    %1:
    %assign stack_offset stack_offset_bak+gprsize
%endmacro
;-----------------------------------------------------------------------------
; TRELLIS_BNODE ctx_hi
;   Defines subroutine trellis_bnode_<ctx_hi>: selects the node of nodes_cur
;   with the lowest score among nodes 0..3 (ctx_hi = 0) or 1..7 (ctx_hi = 1).
;   Each score is combined with its index as score*8 + j, so one unsigned
;   compare/cmova both picks the minimum and carries the index along.
;   Out:   eax = index of the selected node; ZF set iff that index is 0
;          (flags come from the final and); r10 = dct pointer (dctm).
;   Clobbers rax, r10, r11, flags.
;-----------------------------------------------------------------------------
%macro TRELLIS_BNODE 1 ; ctx_hi
clocal trellis_bnode_%1
    ; int j = ctx_hi?1:0;
    ; trellis_node_t *bnode = &nodes_cur[j];
    ; while( ++j < (ctx_hi?8:4) )
    ;     if( nodes_cur[j].score < bnode->score )
    ;         bnode = &nodes_cur[j];
%assign j %1
    mov   rax, [nodes_curq + node_score(j)]
    lea   rax, [rax*8 + j]
%rep 3+3*%1
%assign j j+1
    mov   r11, [nodes_curq + node_score(j)]
    lea   r11, [r11*8 + j]
    cmp   rax, r11
    cmova rax, r11
%endrep
    ; mov leaves the flags alone, so the caller sees the flags of the and
    mov   r10, dctm
    and   eax, 7
    ret
%endmacro ; TRELLIS_BNODE
TRELLIS_BNODE 0
TRELLIS_BNODE 1
;-----------------------------------------------------------------------------
; TRELLIS_COEF0 ctx_hi
;   Defines subroutine trellis_coef0_<ctx_hi>, called from TRELLIS_LOOP when
;   the smaller abs_level candidate is 0.
;   * r11 = (cost_siglast[0] * lambda2 >> (CABAC_SIZE_BITS - LAMBDA_BITS))
;           + the qword at ssd+16. It is left in r11 on return; the
;           trellis_coef1 .entry0b/.entry1b paths subtract it.
;   * ctx_hi = 0 only: nodes_cur[0].score = nodes_prev[0].score + [ssd] - r11.
;   * Copies scores 1..3 (and 4..7 when ctx_hi = 1) and cabac_state 3
;     (and 4..7 when ctx_hi = 1) from nodes_prev to nodes_cur.
;   * Ends with ZERO_LEVEL_IDX reading the old level_idx from nodes_prev.
;   Clobbers r6, r11, scoreq, m0-m3, flags.
;-----------------------------------------------------------------------------
%macro TRELLIS_COEF0 1 ; ctx_hi
clocal trellis_coef0_%1
    ; ssd1 += (uint64_t)cost_sig * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
    mov  r11d, [cost_siglast+0]
    imul  r11, lambda2q
    shr   r11, CABAC_SIZE_BITS - LAMBDA_BITS
    add   r11, [ssd+16]
%if %1 == 0
    ; nodes_cur[0].score = nodes_prev[0].score + ssd - ssd1;
    mov  scoreq, [nodes_prevq + node_score(0)]
    add  scoreq, [ssd]
    sub  scoreq, r11
    mov  [nodes_curq + node_score(0)], scoreq
%endif
    ; memcpy
    mov  scoreq, [nodes_prevq + node_score(1)]
    mov  [nodes_curq + node_score(1)], scoreq
    mova m1, [nodes_prevq + node_score(2)]
    mova [nodes_curq + node_score(2)], m1
%if %1
    mova m1, [nodes_prevq + node_score(4)]
    mova [nodes_curq + node_score(4)], m1
    mova m1, [nodes_prevq + node_score(6)]
    mova [nodes_curq + node_score(6)], m1
%endif
    mov  r6d, [nodes_prevq + node_cabac_state(3)]
    mov  [nodes_curq + node_cabac_state(3)], r6d
%if %1
    mova m1, [nodes_prevq + node_cabac_state(4)]
    mova [nodes_curq + node_cabac_state(4)], m1
%endif
    ZERO_LEVEL_IDX %1, prev
    ret
%endmacro ; TRELLIS_COEF0
TRELLIS_COEF0 0
TRELLIS_COEF0 1
;-----------------------------------------------------------------------------
; START_COEF gt1
;   First half of one node transition; used by COEF1 (gt1 = 0) and COEFN
;   (gt1 = 1). Reads the assembly-time variables j, nextj_if_invalid and
;   coeff_abs_level1_offs set by those macros.
;   * scoreq = nodes_prev[j].score; for j > 0 a score with the sign bit set
;     skips this node by jumping to .ctx<nextj_if_invalid>.
;   * Loads the coeff_abs_level1 state: from the node's own cabac_state for
;     j >= 3 (also fetching the post-transition state into r11 for
;     END_COEF), otherwise from level_statem.
;   * bitsd = cabac_entropy[state ^ gt1] + cost_siglast[2] (j == 0) or
;     cost_siglast[1] (j > 0).
;   * scoreq += r10 (j == 0) or r12 (j > 0), the ssd values prepared by the
;     caller.
;   Clobbers r6, r11 (j >= 3), scoreq, bitsq, flags.
;-----------------------------------------------------------------------------
%macro START_COEF 1 ; gt1
    ; if( (int64_t)nodes_prev[0].score < 0 ) continue;
    mov  scoreq, [nodes_prevq + node_score(j)]
%if j > 0
    test scoreq, scoreq
    js .ctx %+ nextj_if_invalid
%endif

    ; f8_bits += x264_cabac_size_decision2( &n.cabac_state[coeff_abs_level1_ctx[j]], abs_level > 1 );
%if j >= 3
    movzx r6d, byte [nodes_prevq + node_cabac_state(j) + (coeff_abs_level1_offs>>2)] ; >> because node only stores ctx 0 and 4
    movzx r11, byte [cabac_transition + r6*2 + %1 GLOBAL]
%else
    movzx r6d, byte [level_statem + coeff_abs_level1_offs]
%endif
%if %1
    xor   r6d, 1
%endif
    movzx bitsd, word [cabac_entropy + r6*2 GLOBAL]

    ; n.score += ssd;
    ; unsigned f8_bits = cost_siglast[ j ? 1 : 2 ];
%if j == 0
    add  scoreq, r10
    add  bitsd, [cost_siglast+8]
%else
    add  scoreq, r12
    add  bitsd, [cost_siglast+4]
%endif
%endmacro ; START_COEF
;-----------------------------------------------------------------------------
; END_COEF gt1
;   Second half of one node transition; used by COEF1 (gt1 = 0) and COEFN
;   (gt1 = 1). Reads the assembly-time variables j, node_ctx,
;   nextj_if_valid, coeff_abs_level1_offs and coeff_abs_levelgt1_offs.
;   * scoreq += bits * lambda2 >> (CABAC_SIZE_BITS - LAMBDA_BITS).
;   * If scoreq is not below nodes_cur[node_ctx].score (unsigned compare),
;     jumps to .ctx<nextj_if_valid> and leaves the node untouched.
;   * Otherwise stores the new score, sets up the node's cabac_state (a
;     fresh copy of level_state_packed, or the parent's state plus the
;     delayed transition held in r11b / r10b), appends a level_tree entry
;     {next = nodes_prev[j].level_idx, abs_level} and increments
;     levels_used. abs_level is the runtime abs_leveld when gt1 is 1, and
;     the constant 1 otherwise.
;   Clobbers r6, r11 (gt1 = 1), bitsq, scoreq, flags.
;-----------------------------------------------------------------------------
%macro END_COEF 1
    ; n.score += (uint64_t)f8_bits * lambda2 >> ( CABAC_SIZE_BITS - LAMBDA_BITS );
    imul bitsq, lambda2q
    shr  bitsq, CABAC_SIZE_BITS - LAMBDA_BITS
    add  scoreq, bitsq

    ; if( n.score < nodes_cur[node_ctx].score )
    ;     SET_LEVEL( n, abs_level );
    ;     nodes_cur[node_ctx] = n;
    cmp scoreq, [nodes_curq + node_score(node_ctx)]
    jae .ctx %+ nextj_if_valid
    mov [nodes_curq + node_score(node_ctx)], scoreq
%if j == 2 || (j <= 3 && node_ctx == 4)
    ; if this node hasn't previously needed to keep track of abs_level cabac_state, import a pristine copy of the input states
    movd [nodes_curq + node_cabac_state(node_ctx)], level_state_packed
%elif j >= 3
    ; if we have updated before, then copy cabac_state from the parent node
    mov  r6d, [nodes_prevq + node_cabac_state(j)]
    mov [nodes_curq + node_cabac_state(node_ctx)], r6d
%endif
%if j >= 3 ; skip the transition if we're not going to reuse the context
    mov [nodes_curq + node_cabac_state(node_ctx) + (coeff_abs_level1_offs>>2)], r11b ; delayed from x264_cabac_size_decision2
%endif
%if %1 && node_ctx == 7
    ; COEFN redefines coeff_abs_levelgt1_offs as r6 for j == 7, so r6 is
    ; reloaded here before it is used inside the address expression
    mov  r6d, levelgt1_ctxm
    mov [nodes_curq + node_cabac_state(node_ctx) + coeff_abs_levelgt1_offs-6], r10b
%endif
    mov  r6d, [nodes_prevq + node_level_idx(j)]
%if %1
    mov r11d, abs_leveld
    shl r11d, 16
    or   r6d, r11d
%else
    or   r6d, 1<<16
%endif
    mov [level_tree + levels_usedq*4], r6d
    mov [nodes_curq + node_level_idx(node_ctx)], levels_usedd
    inc levels_usedd
%endmacro ; END_COEF
;-----------------------------------------------------------------------------
; COEF1 j, nextj_if_invalid
;   Emits label .ctx<j> followed by the transition out of parent node j for
;   a level of 1 (START_COEF 0 ... END_COEF 0).
;     j                - parent node index
;     nextj_if_invalid - label index to skip to when the parent score has
;                        its sign bit set
;   The destination node is j+1 for j < 3 and j otherwise. When the new
;   score is not an improvement, control goes to .ctx<j+1>.
;   Adds 1 << CABAC_SIZE_BITS to the bit count between the two halves.
;-----------------------------------------------------------------------------
%macro COEF1 2
    %assign j %1
    %assign nextj_if_valid %1+1
    %assign nextj_if_invalid %2
%if j < 4
    %assign coeff_abs_level1_offs j+1
%else
    %assign coeff_abs_level1_offs 0
%endif
%if j < 3
    %assign node_ctx j+1
%else
    %assign node_ctx j
%endif
.ctx %+ j:
    START_COEF 0
    add  bitsd, 1 << CABAC_SIZE_BITS
    END_COEF 0
%endmacro ; COEF1
;-----------------------------------------------------------------------------
; COEFN j, nextj
;   Emits label .ctx<j> followed by the transition out of parent node j for
;   a level greater than 1 (START_COEF 1 ... END_COEF 1).
;     j     - parent node index
;     nextj - label index to continue at, both when the parent is invalid
;             and when the new score is not an improvement
;   The destination node is 4 for j < 4, j+1 for j = 4..6 and 7 for j = 7.
;   Expects r1d = prefix * 128 and r5d = suffix bit cost, as prepared by
;   COEFN_PREFIX / COEFN_SUFFIX.
;   For j == 7 coeff_abs_levelgt1_offs is redefined as register r6, loaded
;   from levelgt1_ctxm at run time; this text define stays in effect for
;   the END_COEF that follows.
;   Clobbers r6, r10, r11, bitsq, scoreq, flags.
;-----------------------------------------------------------------------------
%macro COEFN 2
    %assign j %1
    %assign nextj_if_valid %2
    %assign nextj_if_invalid %2
%if j < 4
    %assign coeff_abs_level1_offs j+1
    %assign coeff_abs_levelgt1_offs 5
%else
    %assign coeff_abs_level1_offs 0
    %assign coeff_abs_levelgt1_offs j+2 ; this is the one used for all block types except 4:2:2 chroma dc
%endif
%if j < 4
    %assign node_ctx 4
%elif j < 7
    %assign node_ctx j+1
%else
    %assign node_ctx 7
%endif
.ctx %+ j:
    START_COEF 1
    ; if( abs_level >= 15 )
    ;     bits += bs_size_ue_big(...)
    add  bitsd, r5d ; bs_size_ue_big from COEFN_SUFFIX
    ; n.cabac_state[levelgt1_ctx]
%if j == 7 ; && compiling support for 4:2:2
    mov    r6d, levelgt1_ctxm
    %define coeff_abs_levelgt1_offs r6
%endif
%if j == 7
    movzx  r10, byte [nodes_prevq + node_cabac_state(j) + coeff_abs_levelgt1_offs-6] ; -6 because node only stores ctx 8 and 9
%else
    movzx  r10, byte [level_statem + coeff_abs_levelgt1_offs]
%endif
    ; f8_bits += cabac_size_unary[abs_level-1][n.cabac_state[levelgt1_ctx[j]]];
    ; r1d holds prefix*128 with prefix >= 1, hence the -128 in the index
    add   r10d, r1d
    movzx  r6d, word [cabac_size_unary + (r10-128)*2 GLOBAL]
    add  bitsd, r6d
%if node_ctx == 7
    movzx  r10, byte [cabac_transition_unary + r10-128 GLOBAL]
%endif
    END_COEF 1
%endmacro ; COEFN
;-----------------------------------------------------------------------------
; trellis_coef1: node transitions for a candidate level of 1.
;   Entry points (called from TRELLIS_LOOP):
;     .entry0  - ctx_lo, parents 0..3; expects r10 / r12 = ssd values
;     .entry0b - ctx_lo, second candidate: reloads r10 = [ssd+8] - r11 and
;                r12 = [ssd+24] - r11 first (r11 as left by trellis_coef0)
;     .entry1  - ctx_hi, parents 1..7; expects r12 = ssd value
;     .entry1b - ctx_hi, second candidate: reloads r12 = [ssd+24] - r11
;   In the ctx_lo chain an invalid parent jumps straight to .ctx4 (return);
;   in the ctx_hi chain it moves on to the next parent.
;   The extra label trellis_coef1_hi opens a new local-label scope so that
;   .ctx1 .. .ctx4 can be defined a second time.
;-----------------------------------------------------------------------------
clocal trellis_coef1
.entry0b: ; ctx_lo, larger of the two abs_level candidates
    mov  r10, [ssd+8]
    sub  r10, r11
    mov  r12, [ssd+24]
    sub  r12, r11
.entry0: ; ctx_lo, smaller of the two abs_level candidates
    COEF1 0, 4
    COEF1 1, 4
    COEF1 2, 4
    COEF1 3, 4
.ctx4:
    rep ret
.entry1b: ; ctx_hi, larger of the two abs_level candidates
    mov  r12, [ssd+24]
    sub  r12, r11
.entry1: ; ctx_hi, smaller of the two abs_level candidates
trellis_coef1_hi:
    COEF1 1, 2
    COEF1 2, 3
    COEF1 3, 4
    COEF1 4, 5
    COEF1 5, 6
    COEF1 6, 7
    COEF1 7, 8
.ctx8:
    rep ret
;-----------------------------------------------------------------------------
; COEFN_PREFIX n
;   Prepares the per-level values shared by the COEFN expansions that follow.
;     n - suffix used to pair the labels with the matching COEFN_SUFFIX n
;   abs_level < 15:  r1d = abs_level * 128, r5d = 0.
;   abs_level >= 15: jumps to .level_suffix<n>, which sets r5d and
;                    r1d = 15 and returns to .skip_level_suffix<n>, giving
;                    r1d = 15 * 128.
;   Overwrites r1 (the zigzag pointer); trellis_coefn reloads it before
;   returning.
;-----------------------------------------------------------------------------
%macro COEFN_PREFIX 1
    ; int prefix = X264_MIN( abs_level - 1, 14 );
    mov  r1d, abs_leveld
    cmp  abs_leveld, 15
    jge .level_suffix%1
    xor  r5d, r5d
.skip_level_suffix%1:
    shl  r1d, 7
%endmacro
;-----------------------------------------------------------------------------
; COEFN_SUFFIX n
;   Out-of-line path of COEFN_PREFIX n for abs_level >= 15.
;   r5d = (2 * bsr(abs_level - 14) + 1) << CABAC_SIZE_BITS
;   r1d = 15, then control returns to .skip_level_suffix<n>.
;-----------------------------------------------------------------------------
%macro COEFN_SUFFIX 1
.level_suffix%1:
    ; bs_size_ue_big( abs_level - 15 ) << CABAC_SIZE_BITS;
    lea  r5d, [abs_levelq-14]
    bsr  r5d, r5d
    shl  r5d, CABAC_SIZE_BITS+1
    add  r5d, 1<<CABAC_SIZE_BITS
    ; int prefix = X264_MIN( abs_level - 1, 14 );
    mov  r1d, 15
    jmp .skip_level_suffix%1
%endmacro
;-----------------------------------------------------------------------------
; trellis_coefn: node transitions for a candidate level greater than 1.
;   Entry points (called from TRELLIS_LOOP):
;     .entry0  - ctx_lo: parents 0..3
;     .entry0b - ctx_lo, second candidate: reloads r10 = [ssd+8] and
;                r12 = [ssd+24] and increments abs_level first
;     .entry1  - ctx_hi: parents 4..7, then continues at .ctx1 with
;                parents 1..3 of the ctx_lo chain
;     .entry1b - ctx_hi, second candidate: reloads r12 = [ssd+24] and
;                increments abs_level first
;   Every path returns through .ctx8, which restores the zigzag pointer
;   that COEFN_PREFIX overwrote in r1.
;   The two COEFN_SUFFIX expansions after the final jmp are reached only
;   through the jge in the matching COEFN_PREFIX.
;-----------------------------------------------------------------------------
clocal trellis_coefn
.entry0b:
    mov  r10, [ssd+8]
    mov  r12, [ssd+24]
    inc  abs_leveld
.entry0:
    ; I could fully separate the ctx_lo and ctx_hi versions of coefn, and then
    ; apply return-on-first-failure to ctx_lo. Or I can use multiple entrypoints
    ; to merge the common portion of ctx_lo and ctx_hi, and thus reduce codesize.
    ; I can't do both, as return-on-first-failure doesn't work for ctx_hi.
    ; The C version has to be fully separate since C doesn't support multiple
    ; entrypoints. But return-on-first-failure isn't very important here (as
    ; opposed to coef1), so I might as well reduce codesize.
    COEFN_PREFIX 0
    COEFN 0, 1
    COEFN 1, 2
    COEFN 2, 3
    COEFN 3, 8
.ctx8:
    mov zigzagq, zigzagm ; unspill since r1 was clobbered
    ret
.entry1b:
    mov  r12, [ssd+24]
    inc  abs_leveld
.entry1:
    COEFN_PREFIX 1
    COEFN 4, 5
    COEFN 5, 6
    COEFN 6, 7
    COEFN 7, 1
    jmp .ctx1
    COEFN_SUFFIX 0
    COEFN_SUFFIX 1
; (web page footer navigation from the capture: Product / Resources / Company)