Path: blob/master/thirdparty/libtheora/x86/mmxfrag.c
10278 views
/********************************************************************1* *2* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *3* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *4* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *5* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *6* *7* THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009,2025 *8* by the Xiph.Org Foundation and contributors *9* https://www.xiph.org/ *10* *11********************************************************************1213function:1415********************************************************************/1617/*MMX acceleration of fragment reconstruction for motion compensation.18Originally written by Rudolf Marek.19Additional optimization by Nils Pipenbrinck.20Note: Loops are unrolled for best performance.21The iteration each instruction belongs to is marked in the comments as #i.*/22#include <stddef.h>23#include "x86int.h"2425#if defined(OC_X86_ASM)2627/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes28between rows.*/29# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \30do{ \31const unsigned char *src; \32unsigned char *dst; \33ptrdiff_t ystride3; \34src=(_src); \35dst=(_dst); \36__asm__ __volatile__( \37/*src+0*ystride*/ \38"movq (%[src]),%%mm0\n\t" \39/*src+1*ystride*/ \40"movq (%[src],%[ystride]),%%mm1\n\t" \41/*ystride3=ystride*3*/ \42"lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \43/*src+2*ystride*/ \44"movq (%[src],%[ystride],2),%%mm2\n\t" \45/*src+3*ystride*/ \46"movq (%[src],%[ystride3]),%%mm3\n\t" \47/*dst+0*ystride*/ \48"movq %%mm0,(%[dst])\n\t" \49/*dst+1*ystride*/ \50"movq %%mm1,(%[dst],%[ystride])\n\t" \51/*Pointer to next 4.*/ \52"lea (%[src],%[ystride],4),%[src]\n\t" \53/*dst+2*ystride*/ \54"movq %%mm2,(%[dst],%[ystride],2)\n\t" \55/*dst+3*ystride*/ \56"movq %%mm3,(%[dst],%[ystride3])\n\t" \57/*Pointer to next 4.*/ \58"lea (%[dst],%[ystride],4),%[dst]\n\t" \59/*src+0*ystride*/ \60"movq (%[src]),%%mm0\n\t" \61/*src+1*ystride*/ \62"movq (%[src],%[ystride]),%%mm1\n\t" \63/*src+2*ystride*/ \64"movq (%[src],%[ystride],2),%%mm2\n\t" \65/*src+3*ystride*/ \66"movq (%[src],%[ystride3]),%%mm3\n\t" \67/*dst+0*ystride*/ \68"movq %%mm0,(%[dst])\n\t" \69/*dst+1*ystride*/ \70"movq %%mm1,(%[dst],%[ystride])\n\t" \71/*dst+2*ystride*/ \72"movq %%mm2,(%[dst],%[ystride],2)\n\t" \73/*dst+3*ystride*/ \74"movq %%mm3,(%[dst],%[ystride3])\n\t" \75:[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \76:[ystride]"r"((ptrdiff_t)(_ystride)) \77:"memory" \78); \79} \80while(0)8182/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes83between rows.*/84void oc_frag_copy_mmx(unsigned char *_dst,85const unsigned char *_src,int _ystride){86OC_FRAG_COPY_MMX(_dst,_src,_ystride);87}8889/*Copies the fragments specified by the lists of fragment indices from one90frame to another.91_dst_frame: The reference frame to copy to.92_src_frame: The reference frame to copy from.93_ystride: The row stride of the reference frames.94_fragis: A pointer to a list of fragment indices.95_nfragis: The number of fragment indices to copy.96_frag_buf_offs: The offsets of fragments in the reference frames.*/97void oc_frag_copy_list_mmx(unsigned char *_dst_frame,98const unsigned char *_src_frame,int _ystride,99const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){100ptrdiff_t fragii;101for(fragii=0;fragii<_nfragis;fragii++){102ptrdiff_t frag_buf_off;103frag_buf_off=_frag_buf_offs[_fragis[fragii]];104OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off,105_src_frame+frag_buf_off,_ystride);106}107}108109110void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,111const ogg_int16_t *_residue){112__asm__ __volatile__(113/*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/114"pcmpeqw %%mm0,%%mm0\n\t"115/*#0 Load low residue.*/116"movq 0*8(%[residue]),%%mm1\n\t"117/*#0 Load high residue.*/118"movq 1*8(%[residue]),%%mm2\n\t"119/*Set mm0 to 0x8000800080008000.*/120"psllw $15,%%mm0\n\t"121/*#1 Load low residue.*/122"movq 2*8(%[residue]),%%mm3\n\t"123/*#1 Load high residue.*/124"movq 3*8(%[residue]),%%mm4\n\t"125/*Set mm0 to 0x0080008000800080.*/126"psrlw $8,%%mm0\n\t"127/*#2 Load low residue.*/128"movq 4*8(%[residue]),%%mm5\n\t"129/*#2 Load high residue.*/130"movq 5*8(%[residue]),%%mm6\n\t"131/*#0 Bias low residue.*/132"paddsw %%mm0,%%mm1\n\t"133/*#0 Bias high residue.*/134"paddsw %%mm0,%%mm2\n\t"135/*#0 Pack to byte.*/136"packuswb %%mm2,%%mm1\n\t"137/*#1 Bias low residue.*/138"paddsw %%mm0,%%mm3\n\t"139/*#1 Bias high residue.*/140"paddsw %%mm0,%%mm4\n\t"141/*#1 Pack to byte.*/142"packuswb %%mm4,%%mm3\n\t"143/*#2 Bias low residue.*/144"paddsw %%mm0,%%mm5\n\t"145/*#2 Bias high residue.*/146"paddsw %%mm0,%%mm6\n\t"147/*#2 Pack to byte.*/148"packuswb %%mm6,%%mm5\n\t"149/*#0 Write row.*/150"movq %%mm1,(%[dst])\n\t"151/*#1 Write row.*/152"movq %%mm3,(%[dst],%[ystride])\n\t"153/*#2 Write row.*/154"movq %%mm5,(%[dst],%[ystride],2)\n\t"155/*#3 Load low residue.*/156"movq 6*8(%[residue]),%%mm1\n\t"157/*#3 Load high residue.*/158"movq 7*8(%[residue]),%%mm2\n\t"159/*#4 Load high residue.*/160"movq 8*8(%[residue]),%%mm3\n\t"161/*#4 Load high residue.*/162"movq 9*8(%[residue]),%%mm4\n\t"163/*#5 Load high residue.*/164"movq 10*8(%[residue]),%%mm5\n\t"165/*#5 Load high residue.*/166"movq 11*8(%[residue]),%%mm6\n\t"167/*#3 Bias low residue.*/168"paddsw %%mm0,%%mm1\n\t"169/*#3 Bias high residue.*/170"paddsw %%mm0,%%mm2\n\t"171/*#3 Pack to byte.*/172"packuswb %%mm2,%%mm1\n\t"173/*#4 Bias low residue.*/174"paddsw %%mm0,%%mm3\n\t"175/*#4 Bias high residue.*/176"paddsw %%mm0,%%mm4\n\t"177/*#4 Pack to byte.*/178"packuswb %%mm4,%%mm3\n\t"179/*#5 Bias low residue.*/180"paddsw %%mm0,%%mm5\n\t"181/*#5 Bias high residue.*/182"paddsw %%mm0,%%mm6\n\t"183/*#5 Pack to byte.*/184"packuswb %%mm6,%%mm5\n\t"185/*#3 Write row.*/186"movq %%mm1,(%[dst],%[ystride3])\n\t"187/*#4 Write row.*/188"movq %%mm3,(%[dst4])\n\t"189/*#5 Write row.*/190"movq %%mm5,(%[dst4],%[ystride])\n\t"191/*#6 Load low residue.*/192"movq 12*8(%[residue]),%%mm1\n\t"193/*#6 Load high residue.*/194"movq 13*8(%[residue]),%%mm2\n\t"195/*#7 Load low residue.*/196"movq 14*8(%[residue]),%%mm3\n\t"197/*#7 Load high residue.*/198"movq 15*8(%[residue]),%%mm4\n\t"199/*#6 Bias low residue.*/200"paddsw %%mm0,%%mm1\n\t"201/*#6 Bias high residue.*/202"paddsw %%mm0,%%mm2\n\t"203/*#6 Pack to byte.*/204"packuswb %%mm2,%%mm1\n\t"205/*#7 Bias low residue.*/206"paddsw %%mm0,%%mm3\n\t"207/*#7 Bias high residue.*/208"paddsw %%mm0,%%mm4\n\t"209/*#7 Pack to byte.*/210"packuswb %%mm4,%%mm3\n\t"211/*#6 Write row.*/212"movq %%mm1,(%[dst4],%[ystride],2)\n\t"213/*#7 Write row.*/214"movq %%mm3,(%[dst4],%[ystride3])\n\t"215:216:[residue]"r"(_residue),217[dst]"r"(_dst),218[dst4]"r"(_dst+(_ystride*4)),219[ystride]"r"((ptrdiff_t)_ystride),220[ystride3]"r"((ptrdiff_t)_ystride*3)221:"memory"222);223}224225void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src,226int _ystride,const ogg_int16_t *_residue){227int i;228/*Zero mm0.*/229__asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);230for(i=4;i-->0;){231__asm__ __volatile__(232/*#0 Load source.*/233"movq (%[src]),%%mm3\n\t"234/*#1 Load source.*/235"movq (%[src],%[ystride]),%%mm7\n\t"236/*#0 Get copy of src.*/237"movq %%mm3,%%mm4\n\t"238/*#0 Expand high source.*/239"punpckhbw %%mm0,%%mm4\n\t"240/*#0 Expand low source.*/241"punpcklbw %%mm0,%%mm3\n\t"242/*#0 Add residue high.*/243"paddsw 8(%[residue]),%%mm4\n\t"244/*#1 Get copy of src.*/245"movq %%mm7,%%mm2\n\t"246/*#0 Add residue low.*/247"paddsw (%[residue]), %%mm3\n\t"248/*#1 Expand high source.*/249"punpckhbw %%mm0,%%mm2\n\t"250/*#0 Pack final row pixels.*/251"packuswb %%mm4,%%mm3\n\t"252/*#1 Expand low source.*/253"punpcklbw %%mm0,%%mm7\n\t"254/*#1 Add residue low.*/255"paddsw 16(%[residue]),%%mm7\n\t"256/*#1 Add residue high.*/257"paddsw 24(%[residue]),%%mm2\n\t"258/*Advance residue.*/259"lea 32(%[residue]),%[residue]\n\t"260/*#1 Pack final row pixels.*/261"packuswb %%mm2,%%mm7\n\t"262/*Advance src.*/263"lea (%[src],%[ystride],2),%[src]\n\t"264/*#0 Write row.*/265"movq %%mm3,(%[dst])\n\t"266/*#1 Write row.*/267"movq %%mm7,(%[dst],%[ystride])\n\t"268/*Advance dst.*/269"lea (%[dst],%[ystride],2),%[dst]\n\t"270:[residue]"+r"(_residue),[dst]"+r"(_dst),[src]"+r"(_src)271:[ystride]"r"((ptrdiff_t)_ystride)272:"memory"273);274}275}276277void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,278const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){279int i;280/*Zero mm7.*/281__asm__ __volatile__("pxor %%mm7,%%mm7\n\t"::);282for(i=4;i-->0;){283__asm__ __volatile__(284/*#0 Load src1.*/285"movq (%[src1]),%%mm0\n\t"286/*#0 Load src2.*/287"movq (%[src2]),%%mm2\n\t"288/*#0 Copy src1.*/289"movq %%mm0,%%mm1\n\t"290/*#0 Copy src2.*/291"movq %%mm2,%%mm3\n\t"292/*#1 Load src1.*/293"movq (%[src1],%[ystride]),%%mm4\n\t"294/*#0 Unpack lower src1.*/295"punpcklbw %%mm7,%%mm0\n\t"296/*#1 Load src2.*/297"movq (%[src2],%[ystride]),%%mm5\n\t"298/*#0 Unpack higher src1.*/299"punpckhbw %%mm7,%%mm1\n\t"300/*#0 Unpack lower src2.*/301"punpcklbw %%mm7,%%mm2\n\t"302/*#0 Unpack higher src2.*/303"punpckhbw %%mm7,%%mm3\n\t"304/*Advance src1 ptr.*/305"lea (%[src1],%[ystride],2),%[src1]\n\t"306/*Advance src2 ptr.*/307"lea (%[src2],%[ystride],2),%[src2]\n\t"308/*#0 Lower src1+src2.*/309"paddsw %%mm2,%%mm0\n\t"310/*#0 Higher src1+src2.*/311"paddsw %%mm3,%%mm1\n\t"312/*#1 Copy src1.*/313"movq %%mm4,%%mm2\n\t"314/*#0 Build lo average.*/315"psraw $1,%%mm0\n\t"316/*#1 Copy src2.*/317"movq %%mm5,%%mm3\n\t"318/*#1 Unpack lower src1.*/319"punpcklbw %%mm7,%%mm4\n\t"320/*#0 Build hi average.*/321"psraw $1,%%mm1\n\t"322/*#1 Unpack higher src1.*/323"punpckhbw %%mm7,%%mm2\n\t"324/*#0 low+=residue.*/325"paddsw (%[residue]),%%mm0\n\t"326/*#1 Unpack lower src2.*/327"punpcklbw %%mm7,%%mm5\n\t"328/*#0 high+=residue.*/329"paddsw 8(%[residue]),%%mm1\n\t"330/*#1 Unpack higher src2.*/331"punpckhbw %%mm7,%%mm3\n\t"332/*#1 Lower src1+src2.*/333"paddsw %%mm4,%%mm5\n\t"334/*#0 Pack and saturate.*/335"packuswb %%mm1,%%mm0\n\t"336/*#1 Higher src1+src2.*/337"paddsw %%mm2,%%mm3\n\t"338/*#0 Write row.*/339"movq %%mm0,(%[dst])\n\t"340/*#1 Build lo average.*/341"psraw $1,%%mm5\n\t"342/*#1 Build hi average.*/343"psraw $1,%%mm3\n\t"344/*#1 low+=residue.*/345"paddsw 16(%[residue]),%%mm5\n\t"346/*#1 high+=residue.*/347"paddsw 24(%[residue]),%%mm3\n\t"348/*#1 Pack and saturate.*/349"packuswb %%mm3,%%mm5\n\t"350/*#1 Write row ptr.*/351"movq %%mm5,(%[dst],%[ystride])\n\t"352/*Advance residue ptr.*/353"add $32,%[residue]\n\t"354/*Advance dest ptr.*/355"lea (%[dst],%[ystride],2),%[dst]\n\t"356:[dst]"+r"(_dst),[residue]"+r"(_residue),357[src1]"+r"(_src1),[src2]"+r"(_src2)358:[ystride]"r"((ptrdiff_t)_ystride)359:"memory"360);361}362}363364void oc_restore_fpu_mmx(void){365__asm__ __volatile__("emms\n\t");366}367#endif368369370