Path: blob/master/thirdparty/libtheora/x86/mmxfdct.c
10278 views
/********************************************************************1* *2* THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *3* USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *4* GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *5* IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *6* *7* THE Theora SOURCE CODE IS COPYRIGHT (C) 1999-2006 *8* by the Xiph.Org Foundation https://www.xiph.org/ *9* *10********************************************************************/11/*MMX fDCT implementation for x86_32*/12/*$Id: fdct_ses2.c 14579 2008-03-12 06:42:40Z xiphmont $*/13#include "x86enc.h"14#include "x86zigzag.h"1516#if defined(OC_X86_ASM)1718# define OC_FDCT_STAGE1_8x4 \19"#OC_FDCT_STAGE1_8x4\n\t" \20/*Stage 1:*/ \21/*mm0=t7'=t0-t7*/ \22"psubw %%mm7,%%mm0\n\t" \23"paddw %%mm7,%%mm7\n\t" \24/*mm1=t6'=t1-t6*/ \25"psubw %%mm6,%%mm1\n\t" \26"paddw %%mm6,%%mm6\n\t" \27/*mm2=t5'=t2-t5*/ \28"psubw %%mm5,%%mm2\n\t" \29"paddw %%mm5,%%mm5\n\t" \30/*mm3=t4'=t3-t4*/ \31"psubw %%mm4,%%mm3\n\t" \32"paddw %%mm4,%%mm4\n\t" \33/*mm7=t0'=t0+t7*/ \34"paddw %%mm0,%%mm7\n\t" \35/*mm6=t1'=t1+t6*/ \36"paddw %%mm1,%%mm6\n\t" \37/*mm5=t2'=t2+t5*/ \38"paddw %%mm2,%%mm5\n\t" \39/*mm4=t3'=t3+t4*/ \40"paddw %%mm3,%%mm4\n\t" \4142# define OC_FDCT8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) \43"#OC_FDCT8x4\n\t" \44/*Stage 2:*/ \45/*mm7=t3''=t0'-t3'*/ \46"psubw %%mm4,%%mm7\n\t" \47"paddw %%mm4,%%mm4\n\t" \48/*mm6=t2''=t1'-t2'*/ \49"psubw %%mm5,%%mm6\n\t" \50"movq %%mm7,"_r6"(%[y])\n\t" \51"paddw %%mm5,%%mm5\n\t" \52/*mm1=t5''=t6'-t5'*/ \53"psubw %%mm2,%%mm1\n\t" \54"movq %%mm6,"_r2"(%[y])\n\t" \55/*mm4=t0''=t0'+t3'*/ \56"paddw %%mm7,%%mm4\n\t" \57"paddw %%mm2,%%mm2\n\t" \58/*mm5=t1''=t1'+t2'*/ \59"movq %%mm4,"_r0"(%[y])\n\t" \60"paddw %%mm6,%%mm5\n\t" \61/*mm2=t6''=t6'+t5'*/ \62"paddw %%mm1,%%mm2\n\t" \63"movq %%mm5,"_r4"(%[y])\n\t" \64/*mm0=t7', mm1=t5'', mm2=t6'', mm3=t4'.*/ \65/*mm4, mm5, mm6, mm7 are free.*/ \66/*Stage 3:*/ \67/*mm6={2}x4, mm7={27146,0xB500>>1}x2*/ \68"mov $0x5A806A0A,%[a]\n\t" \69"pcmpeqb %%mm6,%%mm6\n\t" \70"movd %[a],%%mm7\n\t" \71"psrlw $15,%%mm6\n\t" \72"punpckldq %%mm7,%%mm7\n\t" \73"paddw %%mm6,%%mm6\n\t" \74/*mm0=0, m2={-1}x4 \75mm5:mm4=t5''*27146+0xB500*/ \76"movq %%mm1,%%mm4\n\t" \77"movq %%mm1,%%mm5\n\t" \78"punpcklwd %%mm6,%%mm4\n\t" \79"movq %%mm2,"_r3"(%[y])\n\t" \80"pmaddwd %%mm7,%%mm4\n\t" \81"movq %%mm0,"_r7"(%[y])\n\t" \82"punpckhwd %%mm6,%%mm5\n\t" \83"pxor %%mm0,%%mm0\n\t" \84"pmaddwd %%mm7,%%mm5\n\t" \85"pcmpeqb %%mm2,%%mm2\n\t" \86/*mm2=t6'', mm1=t5''+(t5''!=0) \87mm4=(t5''*27146+0xB500>>16)*/ \88"pcmpeqw %%mm1,%%mm0\n\t" \89"psrad $16,%%mm4\n\t" \90"psubw %%mm2,%%mm0\n\t" \91"movq "_r3"(%[y]),%%mm2\n\t" \92"psrad $16,%%mm5\n\t" \93"paddw %%mm0,%%mm1\n\t" \94"packssdw %%mm5,%%mm4\n\t" \95/*mm4=s=(t5''*27146+0xB500>>16)+t5''+(t5''!=0)>>1*/ \96"paddw %%mm1,%%mm4\n\t" \97"movq "_r7"(%[y]),%%mm0\n\t" \98"psraw $1,%%mm4\n\t" \99"movq %%mm3,%%mm1\n\t" \100/*mm3=t4''=t4'+s*/ \101"paddw %%mm4,%%mm3\n\t" \102/*mm1=t5'''=t4'-s*/ \103"psubw %%mm4,%%mm1\n\t" \104/*mm1=0, mm3={-1}x4 \105mm5:mm4=t6''*27146+0xB500*/ \106"movq %%mm2,%%mm4\n\t" \107"movq %%mm2,%%mm5\n\t" \108"punpcklwd %%mm6,%%mm4\n\t" \109"movq %%mm1,"_r5"(%[y])\n\t" \110"pmaddwd %%mm7,%%mm4\n\t" \111"movq %%mm3,"_r1"(%[y])\n\t" \112"punpckhwd %%mm6,%%mm5\n\t" \113"pxor %%mm1,%%mm1\n\t" \114"pmaddwd %%mm7,%%mm5\n\t" \115"pcmpeqb %%mm3,%%mm3\n\t" \116/*mm2=t6''+(t6''!=0), mm4=(t6''*27146+0xB500>>16)*/ \117"psrad $16,%%mm4\n\t" \118"pcmpeqw %%mm2,%%mm1\n\t" \119"psrad $16,%%mm5\n\t" \120"psubw %%mm3,%%mm1\n\t" \121"packssdw %%mm5,%%mm4\n\t" \122"paddw %%mm1,%%mm2\n\t" \123/*mm1=t1'' \124mm4=s=(t6''*27146+0xB500>>16)+t6''+(t6''!=0)>>1*/ \125"paddw %%mm2,%%mm4\n\t" \126"movq "_r4"(%[y]),%%mm1\n\t" \127"psraw $1,%%mm4\n\t" \128"movq %%mm0,%%mm2\n\t" \129/*mm7={54491-0x7FFF,0x7FFF}x2 \130mm0=t7''=t7'+s*/ \131"paddw %%mm4,%%mm0\n\t" \132/*mm2=t6'''=t7'-s*/ \133"psubw %%mm4,%%mm2\n\t" \134/*Stage 4:*/ \135/*mm0=0, mm2=t0'' \136mm5:mm4=t1''*27146+0xB500*/ \137"movq %%mm1,%%mm4\n\t" \138"movq %%mm1,%%mm5\n\t" \139"punpcklwd %%mm6,%%mm4\n\t" \140"movq %%mm2,"_r3"(%[y])\n\t" \141"pmaddwd %%mm7,%%mm4\n\t" \142"movq "_r0"(%[y]),%%mm2\n\t" \143"punpckhwd %%mm6,%%mm5\n\t" \144"movq %%mm0,"_r7"(%[y])\n\t" \145"pmaddwd %%mm7,%%mm5\n\t" \146"pxor %%mm0,%%mm0\n\t" \147/*mm7={27146,0x4000>>1}x2 \148mm0=s=(t1''*27146+0xB500>>16)+t1''+(t1''!=0)*/ \149"psrad $16,%%mm4\n\t" \150"mov $0x20006A0A,%[a]\n\t" \151"pcmpeqw %%mm1,%%mm0\n\t" \152"movd %[a],%%mm7\n\t" \153"psrad $16,%%mm5\n\t" \154"psubw %%mm3,%%mm0\n\t" \155"packssdw %%mm5,%%mm4\n\t" \156"paddw %%mm1,%%mm0\n\t" \157"punpckldq %%mm7,%%mm7\n\t" \158"paddw %%mm4,%%mm0\n\t" \159/*mm6={0x00000E3D}x2 \160mm1=-(t0''==0), mm5:mm4=t0''*27146+0x4000*/ \161"movq %%mm2,%%mm4\n\t" \162"movq %%mm2,%%mm5\n\t" \163"punpcklwd %%mm6,%%mm4\n\t" \164"mov $0x0E3D,%[a]\n\t" \165"pmaddwd %%mm7,%%mm4\n\t" \166"punpckhwd %%mm6,%%mm5\n\t" \167"movd %[a],%%mm6\n\t" \168"pmaddwd %%mm7,%%mm5\n\t" \169"pxor %%mm1,%%mm1\n\t" \170"punpckldq %%mm6,%%mm6\n\t" \171"pcmpeqw %%mm2,%%mm1\n\t" \172/*mm4=r=(t0''*27146+0x4000>>16)+t0''+(t0''!=0)*/ \173"psrad $16,%%mm4\n\t" \174"psubw %%mm3,%%mm1\n\t" \175"psrad $16,%%mm5\n\t" \176"paddw %%mm1,%%mm2\n\t" \177"packssdw %%mm5,%%mm4\n\t" \178"movq "_r5"(%[y]),%%mm1\n\t" \179"paddw %%mm2,%%mm4\n\t" \180/*mm2=t6'', mm0=_y[0]=u=r+s>>1 \181The naive implementation could cause overflow, so we use \182u=(r&s)+((r^s)>>1).*/ \183"movq "_r3"(%[y]),%%mm2\n\t" \184"movq %%mm0,%%mm7\n\t" \185"pxor %%mm4,%%mm0\n\t" \186"pand %%mm4,%%mm7\n\t" \187"psraw $1,%%mm0\n\t" \188"mov $0x7FFF54DC,%[a]\n\t" \189"paddw %%mm7,%%mm0\n\t" \190"movd %[a],%%mm7\n\t" \191/*mm7={54491-0x7FFF,0x7FFF}x2 \192mm4=_y[4]=v=r-u*/ \193"psubw %%mm0,%%mm4\n\t" \194"punpckldq %%mm7,%%mm7\n\t" \195"movq %%mm4,"_r4"(%[y])\n\t" \196/*mm0=0, mm7={36410}x4 \197mm1=(t5'''!=0), mm5:mm4=54491*t5'''+0x0E3D*/ \198"movq %%mm1,%%mm4\n\t" \199"movq %%mm1,%%mm5\n\t" \200"punpcklwd %%mm1,%%mm4\n\t" \201"mov $0x8E3A8E3A,%[a]\n\t" \202"pmaddwd %%mm7,%%mm4\n\t" \203"movq %%mm0,"_r0"(%[y])\n\t" \204"punpckhwd %%mm1,%%mm5\n\t" \205"pxor %%mm0,%%mm0\n\t" \206"pmaddwd %%mm7,%%mm5\n\t" \207"pcmpeqw %%mm0,%%mm1\n\t" \208"movd %[a],%%mm7\n\t" \209"psubw %%mm3,%%mm1\n\t" \210"punpckldq %%mm7,%%mm7\n\t" \211"paddd %%mm6,%%mm4\n\t" \212"paddd %%mm6,%%mm5\n\t" \213/*mm0=0 \214mm3:mm1=36410*t6'''+((t5'''!=0)<<16)*/ \215"movq %%mm2,%%mm6\n\t" \216"movq %%mm2,%%mm3\n\t" \217"pmulhw %%mm7,%%mm6\n\t" \218"paddw %%mm2,%%mm1\n\t" \219"pmullw %%mm7,%%mm3\n\t" \220"pxor %%mm0,%%mm0\n\t" \221"paddw %%mm1,%%mm6\n\t" \222"movq %%mm3,%%mm1\n\t" \223"punpckhwd %%mm6,%%mm3\n\t" \224"punpcklwd %%mm6,%%mm1\n\t" \225/*mm3={-1}x4, mm6={1}x4 \226mm4=_y[5]=u=(54491*t5'''+36410*t6'''+0x0E3D>>16)+(t5'''!=0)*/ \227"paddd %%mm3,%%mm5\n\t" \228"paddd %%mm1,%%mm4\n\t" \229"psrad $16,%%mm5\n\t" \230"pxor %%mm6,%%mm6\n\t" \231"psrad $16,%%mm4\n\t" \232"pcmpeqb %%mm3,%%mm3\n\t" \233"packssdw %%mm5,%%mm4\n\t" \234"psubw %%mm3,%%mm6\n\t" \235/*mm1=t7'', mm7={26568,0x3400}x2 \236mm2=s=t6'''-(36410*u>>16)*/ \237"movq %%mm4,%%mm1\n\t" \238"mov $0x340067C8,%[a]\n\t" \239"pmulhw %%mm7,%%mm4\n\t" \240"movd %[a],%%mm7\n\t" \241"movq %%mm1,"_r5"(%[y])\n\t" \242"punpckldq %%mm7,%%mm7\n\t" \243"paddw %%mm1,%%mm4\n\t" \244"movq "_r7"(%[y]),%%mm1\n\t" \245"psubw %%mm4,%%mm2\n\t" \246/*mm6={0x00007B1B}x2 \247mm0=(s!=0), mm5:mm4=s*26568+0x3400*/ \248"movq %%mm2,%%mm4\n\t" \249"movq %%mm2,%%mm5\n\t" \250"punpcklwd %%mm6,%%mm4\n\t" \251"pcmpeqw %%mm2,%%mm0\n\t" \252"pmaddwd %%mm7,%%mm4\n\t" \253"mov $0x7B1B,%[a]\n\t" \254"punpckhwd %%mm6,%%mm5\n\t" \255"movd %[a],%%mm6\n\t" \256"pmaddwd %%mm7,%%mm5\n\t" \257"psubw %%mm3,%%mm0\n\t" \258"punpckldq %%mm6,%%mm6\n\t" \259/*mm7={64277-0x7FFF,0x7FFF}x2 \260mm2=_y[3]=v=(s*26568+0x3400>>17)+s+(s!=0)*/ \261"psrad $17,%%mm4\n\t" \262"paddw %%mm0,%%mm2\n\t" \263"psrad $17,%%mm5\n\t" \264"mov $0x7FFF7B16,%[a]\n\t" \265"packssdw %%mm5,%%mm4\n\t" \266"movd %[a],%%mm7\n\t" \267"paddw %%mm4,%%mm2\n\t" \268"punpckldq %%mm7,%%mm7\n\t" \269/*mm0=0, mm7={12785}x4 \270mm1=(t7''!=0), mm2=t4'', mm5:mm4=64277*t7''+0x7B1B*/ \271"movq %%mm1,%%mm4\n\t" \272"movq %%mm1,%%mm5\n\t" \273"movq %%mm2,"_r3"(%[y])\n\t" \274"punpcklwd %%mm1,%%mm4\n\t" \275"movq "_r1"(%[y]),%%mm2\n\t" \276"pmaddwd %%mm7,%%mm4\n\t" \277"mov $0x31F131F1,%[a]\n\t" \278"punpckhwd %%mm1,%%mm5\n\t" \279"pxor %%mm0,%%mm0\n\t" \280"pmaddwd %%mm7,%%mm5\n\t" \281"pcmpeqw %%mm0,%%mm1\n\t" \282"movd %[a],%%mm7\n\t" \283"psubw %%mm3,%%mm1\n\t" \284"punpckldq %%mm7,%%mm7\n\t" \285"paddd %%mm6,%%mm4\n\t" \286"paddd %%mm6,%%mm5\n\t" \287/*mm3:mm1=12785*t4'''+((t7''!=0)<<16)*/ \288"movq %%mm2,%%mm6\n\t" \289"movq %%mm2,%%mm3\n\t" \290"pmulhw %%mm7,%%mm6\n\t" \291"pmullw %%mm7,%%mm3\n\t" \292"paddw %%mm1,%%mm6\n\t" \293"movq %%mm3,%%mm1\n\t" \294"punpckhwd %%mm6,%%mm3\n\t" \295"punpcklwd %%mm6,%%mm1\n\t" \296/*mm3={-1}x4, mm6={1}x4 \297mm4=_y[1]=u=(12785*t4'''+64277*t7''+0x7B1B>>16)+(t7''!=0)*/ \298"paddd %%mm3,%%mm5\n\t" \299"paddd %%mm1,%%mm4\n\t" \300"psrad $16,%%mm5\n\t" \301"pxor %%mm6,%%mm6\n\t" \302"psrad $16,%%mm4\n\t" \303"pcmpeqb %%mm3,%%mm3\n\t" \304"packssdw %%mm5,%%mm4\n\t" \305"psubw %%mm3,%%mm6\n\t" \306/*mm1=t3'', mm7={20539,0x3000}x2 \307mm4=s=(12785*u>>16)-t4''*/ \308"movq %%mm4,"_r1"(%[y])\n\t" \309"pmulhw %%mm7,%%mm4\n\t" \310"mov $0x3000503B,%[a]\n\t" \311"movq "_r6"(%[y]),%%mm1\n\t" \312"movd %[a],%%mm7\n\t" \313"psubw %%mm2,%%mm4\n\t" \314"punpckldq %%mm7,%%mm7\n\t" \315/*mm6={0x00006CB7}x2 \316mm0=(s!=0), mm5:mm4=s*20539+0x3000*/ \317"movq %%mm4,%%mm5\n\t" \318"movq %%mm4,%%mm2\n\t" \319"punpcklwd %%mm6,%%mm4\n\t" \320"pcmpeqw %%mm2,%%mm0\n\t" \321"pmaddwd %%mm7,%%mm4\n\t" \322"mov $0x6CB7,%[a]\n\t" \323"punpckhwd %%mm6,%%mm5\n\t" \324"movd %[a],%%mm6\n\t" \325"pmaddwd %%mm7,%%mm5\n\t" \326"psubw %%mm3,%%mm0\n\t" \327"punpckldq %%mm6,%%mm6\n\t" \328/*mm7={60547-0x7FFF,0x7FFF}x2 \329mm2=_y[7]=v=(s*20539+0x3000>>20)+s+(s!=0)*/ \330"psrad $20,%%mm4\n\t" \331"paddw %%mm0,%%mm2\n\t" \332"psrad $20,%%mm5\n\t" \333"mov $0x7FFF6C84,%[a]\n\t" \334"packssdw %%mm5,%%mm4\n\t" \335"movd %[a],%%mm7\n\t" \336"paddw %%mm4,%%mm2\n\t" \337"punpckldq %%mm7,%%mm7\n\t" \338/*mm0=0, mm7={25080}x4 \339mm2=t2'', mm5:mm4=60547*t3''+0x6CB7*/ \340"movq %%mm1,%%mm4\n\t" \341"movq %%mm1,%%mm5\n\t" \342"movq %%mm2,"_r7"(%[y])\n\t" \343"punpcklwd %%mm1,%%mm4\n\t" \344"movq "_r2"(%[y]),%%mm2\n\t" \345"pmaddwd %%mm7,%%mm4\n\t" \346"mov $0x61F861F8,%[a]\n\t" \347"punpckhwd %%mm1,%%mm5\n\t" \348"pxor %%mm0,%%mm0\n\t" \349"pmaddwd %%mm7,%%mm5\n\t" \350"movd %[a],%%mm7\n\t" \351"pcmpeqw %%mm0,%%mm1\n\t" \352"psubw %%mm3,%%mm1\n\t" \353"punpckldq %%mm7,%%mm7\n\t" \354"paddd %%mm6,%%mm4\n\t" \355"paddd %%mm6,%%mm5\n\t" \356/*mm3:mm1=25080*t2''+((t3''!=0)<<16)*/ \357"movq %%mm2,%%mm6\n\t" \358"movq %%mm2,%%mm3\n\t" \359"pmulhw %%mm7,%%mm6\n\t" \360"pmullw %%mm7,%%mm3\n\t" \361"paddw %%mm1,%%mm6\n\t" \362"movq %%mm3,%%mm1\n\t" \363"punpckhwd %%mm6,%%mm3\n\t" \364"punpcklwd %%mm6,%%mm1\n\t" \365/*mm1={-1}x4 \366mm4=u=(25080*t2''+60547*t3''+0x6CB7>>16)+(t3''!=0)*/ \367"paddd %%mm3,%%mm5\n\t" \368"paddd %%mm1,%%mm4\n\t" \369"psrad $16,%%mm5\n\t" \370"mov $0x28005460,%[a]\n\t" \371"psrad $16,%%mm4\n\t" \372"pcmpeqb %%mm1,%%mm1\n\t" \373"packssdw %%mm5,%%mm4\n\t" \374/*mm5={1}x4, mm6=_y[2]=u, mm7={21600,0x2800}x2 \375mm4=s=(25080*u>>16)-t2''*/ \376"movq %%mm4,%%mm6\n\t" \377"pmulhw %%mm7,%%mm4\n\t" \378"pxor %%mm5,%%mm5\n\t" \379"movd %[a],%%mm7\n\t" \380"psubw %%mm1,%%mm5\n\t" \381"punpckldq %%mm7,%%mm7\n\t" \382"psubw %%mm2,%%mm4\n\t" \383/*mm2=s+(s!=0) \384mm4:mm3=s*21600+0x2800*/ \385"movq %%mm4,%%mm3\n\t" \386"movq %%mm4,%%mm2\n\t" \387"punpckhwd %%mm5,%%mm4\n\t" \388"pcmpeqw %%mm2,%%mm0\n\t" \389"pmaddwd %%mm7,%%mm4\n\t" \390"psubw %%mm1,%%mm0\n\t" \391"punpcklwd %%mm5,%%mm3\n\t" \392"paddw %%mm0,%%mm2\n\t" \393"pmaddwd %%mm7,%%mm3\n\t" \394/*mm0=_y[4], mm1=_y[7], mm4=_y[0], mm5=_y[5] \395mm3=_y[6]=v=(s*21600+0x2800>>18)+s+(s!=0)*/ \396"movq "_r4"(%[y]),%%mm0\n\t" \397"psrad $18,%%mm4\n\t" \398"movq "_r5"(%[y]),%%mm5\n\t" \399"psrad $18,%%mm3\n\t" \400"movq "_r7"(%[y]),%%mm1\n\t" \401"packssdw %%mm4,%%mm3\n\t" \402"movq "_r0"(%[y]),%%mm4\n\t" \403"paddw %%mm2,%%mm3\n\t" \404405/*On input, mm4=_y[0], mm6=_y[2], mm0=_y[4], mm5=_y[5], mm3=_y[6], mm1=_y[7].406On output, {_y[4],mm1,mm2,mm3} contains the transpose of _y[4...7] and407{mm4,mm5,mm6,mm7} contains the transpose of _y[0...3].*/408# define OC_TRANSPOSE8x4(_r0,_r1,_r2,_r3,_r4,_r5,_r6,_r7) \409"#OC_TRANSPOSE8x4\n\t" \410/*First 4x4 transpose:*/ \411/*mm0 = e3 e2 e1 e0 \412mm5 = f3 f2 f1 f0 \413mm3 = g3 g2 g1 g0 \414mm1 = h3 h2 h1 h0*/ \415"movq %%mm0,%%mm2\n\t" \416"punpcklwd %%mm5,%%mm0\n\t" \417"punpckhwd %%mm5,%%mm2\n\t" \418"movq %%mm3,%%mm5\n\t" \419"punpcklwd %%mm1,%%mm3\n\t" \420"punpckhwd %%mm1,%%mm5\n\t" \421/*mm0 = f1 e1 f0 e0 \422mm2 = f3 e3 f2 e2 \423mm3 = h1 g1 h0 g0 \424mm5 = h3 g3 h2 g2*/ \425"movq %%mm0,%%mm1\n\t" \426"punpckldq %%mm3,%%mm0\n\t" \427"movq %%mm0,"_r4"(%[y])\n\t" \428"punpckhdq %%mm3,%%mm1\n\t" \429"movq "_r1"(%[y]),%%mm0\n\t" \430"movq %%mm2,%%mm3\n\t" \431"punpckldq %%mm5,%%mm2\n\t" \432"punpckhdq %%mm5,%%mm3\n\t" \433"movq "_r3"(%[y]),%%mm5\n\t" \434/*_y[4] = h0 g0 f0 e0 \435mm1 = h1 g1 f1 e1 \436mm2 = h2 g2 f2 e2 \437mm3 = h3 g3 f3 e3*/ \438/*Second 4x4 transpose:*/ \439/*mm4 = a3 a2 a1 a0 \440mm0 = b3 b2 b1 b0 \441mm6 = c3 c2 c1 c0 \442mm5 = d3 d2 d1 d0*/ \443"movq %%mm4,%%mm7\n\t" \444"punpcklwd %%mm0,%%mm4\n\t" \445"punpckhwd %%mm0,%%mm7\n\t" \446"movq %%mm6,%%mm0\n\t" \447"punpcklwd %%mm5,%%mm6\n\t" \448"punpckhwd %%mm5,%%mm0\n\t" \449/*mm4 = b1 a1 b0 a0 \450mm7 = b3 a3 b2 a2 \451mm6 = d1 c1 d0 c0 \452mm0 = d3 c3 d2 c2*/ \453"movq %%mm4,%%mm5\n\t" \454"punpckldq %%mm6,%%mm4\n\t" \455"punpckhdq %%mm6,%%mm5\n\t" \456"movq %%mm7,%%mm6\n\t" \457"punpckhdq %%mm0,%%mm7\n\t" \458"punpckldq %%mm0,%%mm6\n\t" \459/*mm4 = d0 c0 b0 a0 \460mm5 = d1 c1 b1 a1 \461mm6 = d2 c2 b2 a2 \462mm7 = d3 c3 b3 a3*/ \463464/*MMX implementation of the fDCT.*/465void oc_enc_fdct8x8_mmxext(ogg_int16_t _y[64],const ogg_int16_t _x[64]){466OC_ALIGN8(ogg_int16_t buf[64]);467ptrdiff_t a;468__asm__ __volatile__(469/*Add two extra bits of working precision to improve accuracy; any more and470we could overflow.*/471/*We also add biases to correct for some systematic error that remains in472the full fDCT->iDCT round trip.*/473"movq 0x00(%[x]),%%mm0\n\t"474"movq 0x10(%[x]),%%mm1\n\t"475"movq 0x20(%[x]),%%mm2\n\t"476"movq 0x30(%[x]),%%mm3\n\t"477"pcmpeqb %%mm4,%%mm4\n\t"478"pxor %%mm7,%%mm7\n\t"479"movq %%mm0,%%mm5\n\t"480"psllw $2,%%mm0\n\t"481"pcmpeqw %%mm7,%%mm5\n\t"482"movq 0x70(%[x]),%%mm7\n\t"483"psllw $2,%%mm1\n\t"484"psubw %%mm4,%%mm5\n\t"485"psllw $2,%%mm2\n\t"486"mov $1,%[a]\n\t"487"pslld $16,%%mm5\n\t"488"movd %[a],%%mm6\n\t"489"psllq $16,%%mm5\n\t"490"mov $0x10001,%[a]\n\t"491"psllw $2,%%mm3\n\t"492"movd %[a],%%mm4\n\t"493"punpckhwd %%mm6,%%mm5\n\t"494"psubw %%mm6,%%mm1\n\t"495"movq 0x60(%[x]),%%mm6\n\t"496"paddw %%mm5,%%mm0\n\t"497"movq 0x50(%[x]),%%mm5\n\t"498"paddw %%mm4,%%mm0\n\t"499"movq 0x40(%[x]),%%mm4\n\t"500/*We inline stage1 of the transform here so we can get better instruction501scheduling with the shifts.*/502/*mm0=t7'=t0-t7*/503"psllw $2,%%mm7\n\t"504"psubw %%mm7,%%mm0\n\t"505"psllw $2,%%mm6\n\t"506"paddw %%mm7,%%mm7\n\t"507/*mm1=t6'=t1-t6*/508"psllw $2,%%mm5\n\t"509"psubw %%mm6,%%mm1\n\t"510"psllw $2,%%mm4\n\t"511"paddw %%mm6,%%mm6\n\t"512/*mm2=t5'=t2-t5*/513"psubw %%mm5,%%mm2\n\t"514"paddw %%mm5,%%mm5\n\t"515/*mm3=t4'=t3-t4*/516"psubw %%mm4,%%mm3\n\t"517"paddw %%mm4,%%mm4\n\t"518/*mm7=t0'=t0+t7*/519"paddw %%mm0,%%mm7\n\t"520/*mm6=t1'=t1+t6*/521"paddw %%mm1,%%mm6\n\t"522/*mm5=t2'=t2+t5*/523"paddw %%mm2,%%mm5\n\t"524/*mm4=t3'=t3+t4*/525"paddw %%mm3,%%mm4\n\t"526OC_FDCT8x4("0x00","0x10","0x20","0x30","0x40","0x50","0x60","0x70")527OC_TRANSPOSE8x4("0x00","0x10","0x20","0x30","0x40","0x50","0x60","0x70")528/*Swap out this 8x4 block for the next one.*/529"movq 0x08(%[x]),%%mm0\n\t"530"movq %%mm7,0x30(%[y])\n\t"531"movq 0x78(%[x]),%%mm7\n\t"532"movq %%mm1,0x50(%[y])\n\t"533"movq 0x18(%[x]),%%mm1\n\t"534"movq %%mm6,0x20(%[y])\n\t"535"movq 0x68(%[x]),%%mm6\n\t"536"movq %%mm2,0x60(%[y])\n\t"537"movq 0x28(%[x]),%%mm2\n\t"538"movq %%mm5,0x10(%[y])\n\t"539"movq 0x58(%[x]),%%mm5\n\t"540"movq %%mm3,0x70(%[y])\n\t"541"movq 0x38(%[x]),%%mm3\n\t"542/*And increase its working precision, too.*/543"psllw $2,%%mm0\n\t"544"movq %%mm4,0x00(%[y])\n\t"545"psllw $2,%%mm7\n\t"546"movq 0x48(%[x]),%%mm4\n\t"547/*We inline stage1 of the transform here so we can get better instruction548scheduling with the shifts.*/549/*mm0=t7'=t0-t7*/550"psubw %%mm7,%%mm0\n\t"551"psllw $2,%%mm1\n\t"552"paddw %%mm7,%%mm7\n\t"553"psllw $2,%%mm6\n\t"554/*mm1=t6'=t1-t6*/555"psubw %%mm6,%%mm1\n\t"556"psllw $2,%%mm2\n\t"557"paddw %%mm6,%%mm6\n\t"558"psllw $2,%%mm5\n\t"559/*mm2=t5'=t2-t5*/560"psubw %%mm5,%%mm2\n\t"561"psllw $2,%%mm3\n\t"562"paddw %%mm5,%%mm5\n\t"563"psllw $2,%%mm4\n\t"564/*mm3=t4'=t3-t4*/565"psubw %%mm4,%%mm3\n\t"566"paddw %%mm4,%%mm4\n\t"567/*mm7=t0'=t0+t7*/568"paddw %%mm0,%%mm7\n\t"569/*mm6=t1'=t1+t6*/570"paddw %%mm1,%%mm6\n\t"571/*mm5=t2'=t2+t5*/572"paddw %%mm2,%%mm5\n\t"573/*mm4=t3'=t3+t4*/574"paddw %%mm3,%%mm4\n\t"575OC_FDCT8x4("0x08","0x18","0x28","0x38","0x48","0x58","0x68","0x78")576OC_TRANSPOSE8x4("0x08","0x18","0x28","0x38","0x48","0x58","0x68","0x78")577/*Here the first 4x4 block of output from the last transpose is the second5784x4 block of input for the next transform.579We have cleverly arranged that it already be in the appropriate place,580so we only have to do half the stores and loads.*/581"movq 0x00(%[y]),%%mm0\n\t"582"movq %%mm1,0x58(%[y])\n\t"583"movq 0x10(%[y]),%%mm1\n\t"584"movq %%mm2,0x68(%[y])\n\t"585"movq 0x20(%[y]),%%mm2\n\t"586"movq %%mm3,0x78(%[y])\n\t"587"movq 0x30(%[y]),%%mm3\n\t"588OC_FDCT_STAGE1_8x4589OC_FDCT8x4("0x00","0x10","0x20","0x30","0x08","0x18","0x28","0x38")590/*mm2={-2}x4*/591"pcmpeqw %%mm2,%%mm2\n\t"592"paddw %%mm2,%%mm2\n\t"593/*Round and store the results (no transpose).*/594"movq 0x10(%[y]),%%mm7\n\t"595"psubw %%mm2,%%mm4\n\t"596"psubw %%mm2,%%mm6\n\t"597"psraw $2,%%mm4\n\t"598"psubw %%mm2,%%mm0\n\t"599"movq %%mm4,"OC_MEM_OFFS(0x00,buf)"\n\t"600"movq 0x30(%[y]),%%mm4\n\t"601"psraw $2,%%mm6\n\t"602"psubw %%mm2,%%mm5\n\t"603"movq %%mm6,"OC_MEM_OFFS(0x20,buf)"\n\t"604"psraw $2,%%mm0\n\t"605"psubw %%mm2,%%mm3\n\t"606"movq %%mm0,"OC_MEM_OFFS(0x40,buf)"\n\t"607"psraw $2,%%mm5\n\t"608"psubw %%mm2,%%mm1\n\t"609"movq %%mm5,"OC_MEM_OFFS(0x50,buf)"\n\t"610"psraw $2,%%mm3\n\t"611"psubw %%mm2,%%mm7\n\t"612"movq %%mm3,"OC_MEM_OFFS(0x60,buf)"\n\t"613"psraw $2,%%mm1\n\t"614"psubw %%mm2,%%mm4\n\t"615"movq %%mm1,"OC_MEM_OFFS(0x70,buf)"\n\t"616"psraw $2,%%mm7\n\t"617"movq %%mm7,"OC_MEM_OFFS(0x10,buf)"\n\t"618"psraw $2,%%mm4\n\t"619"movq %%mm4,"OC_MEM_OFFS(0x30,buf)"\n\t"620/*Load the next block.*/621"movq 0x40(%[y]),%%mm0\n\t"622"movq 0x78(%[y]),%%mm7\n\t"623"movq 0x50(%[y]),%%mm1\n\t"624"movq 0x68(%[y]),%%mm6\n\t"625"movq 0x60(%[y]),%%mm2\n\t"626"movq 0x58(%[y]),%%mm5\n\t"627"movq 0x70(%[y]),%%mm3\n\t"628"movq 0x48(%[y]),%%mm4\n\t"629OC_FDCT_STAGE1_8x4630OC_FDCT8x4("0x40","0x50","0x60","0x70","0x48","0x58","0x68","0x78")631/*mm2={-2}x4*/632"pcmpeqw %%mm2,%%mm2\n\t"633"paddw %%mm2,%%mm2\n\t"634/*Round and store the results (no transpose).*/635"movq 0x50(%[y]),%%mm7\n\t"636"psubw %%mm2,%%mm4\n\t"637"psubw %%mm2,%%mm6\n\t"638"psraw $2,%%mm4\n\t"639"psubw %%mm2,%%mm0\n\t"640"movq %%mm4,"OC_MEM_OFFS(0x08,buf)"\n\t"641"movq 0x70(%[y]),%%mm4\n\t"642"psraw $2,%%mm6\n\t"643"psubw %%mm2,%%mm5\n\t"644"movq %%mm6,"OC_MEM_OFFS(0x28,buf)"\n\t"645"psraw $2,%%mm0\n\t"646"psubw %%mm2,%%mm3\n\t"647"movq %%mm0,"OC_MEM_OFFS(0x48,buf)"\n\t"648"psraw $2,%%mm5\n\t"649"psubw %%mm2,%%mm1\n\t"650"movq %%mm5,"OC_MEM_OFFS(0x58,buf)"\n\t"651"psraw $2,%%mm3\n\t"652"psubw %%mm2,%%mm7\n\t"653"movq %%mm3,"OC_MEM_OFFS(0x68,buf)"\n\t"654"psraw $2,%%mm1\n\t"655"psubw %%mm2,%%mm4\n\t"656"movq %%mm1,"OC_MEM_OFFS(0x78,buf)"\n\t"657"psraw $2,%%mm7\n\t"658"movq %%mm7,"OC_MEM_OFFS(0x18,buf)"\n\t"659"psraw $2,%%mm4\n\t"660"movq %%mm4,"OC_MEM_OFFS(0x38,buf)"\n\t"661/*Final transpose and zig-zag.*/662#define OC_ZZ_LOAD_ROW_LO(_row,_reg) \663"movq "OC_MEM_OFFS(16*_row,buf)","_reg"\n\t" \664665#define OC_ZZ_LOAD_ROW_HI(_row,_reg) \666"movq "OC_MEM_OFFS(16*_row+8,buf)","_reg"\n\t" \667668OC_TRANSPOSE_ZIG_ZAG_MMXEXT669#undef OC_ZZ_LOAD_ROW_LO670#undef OC_ZZ_LOAD_ROW_HI671:[a]"=&r"(a),[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,64))672:[y]"r"(_y),[x]"r"(_x)673:"memory"674);675}676677#endif678679680