diff options
author | wiz <wiz@pkgsrc.org> | 2008-10-09 19:46:55 +0000 |
---|---|---|
committer | wiz <wiz@pkgsrc.org> | 2008-10-09 19:46:55 +0000 |
commit | 8ff5bfd0bd02d0db94124f5d1bbdb4ddabffc131 (patch) | |
tree | 95536edebc0c42203d5722bcdb434d75ca402eff /multimedia | |
parent | dd87b7af6b12231da70b3eecd9020a6a22d3a329 (diff) | |
download | pkgsrc-8ff5bfd0bd02d0db94124f5d1bbdb4ddabffc131.tar.gz |
Update to 1.0rc1:
libtheora 1.0rc1
- Merge x86 assembly for forward DCT from Thusnelda branch.
- Update 32 bit MMX with loop filter fix.
- Check for an uninitialized state before dereferencing in propagating
decode calls.
- Remove all TH_DEBUG statements.
- Rename the bitpacker source files imported from libogg to avoid
confusing simple build systems using both libraries.
- Add VS2008 project files.
- Add explicit casts as a work-around for Solaris's cc ignoring the
signedness of bitfield types.
- Set quantization parameters to default values when an empty buffer is
passed with TH_ENCCTL_SET_QUANT_PARAMS.
- Split encoder and decoder tests depending on configure settings.
- Return lstylex.sty to the distribution.
- Disable inline assembly on gcc versions prior to 3.1.
- Remove extern references for OC_*_QUANT_MIN.
- Make various data tables static const so they can be read-only.
- cpuid assembly fix for MSVC.
- Remove ENCCTL codes from the old encoder API.
- Implement TH_ENCCTL_SET_KEYFRAME_FREQUENCY_FORCE ctl.
Diffstat (limited to 'multimedia')
-rw-r--r-- | multimedia/libtheora/Makefile | 10 | ||||
-rw-r--r-- | multimedia/libtheora/PLIST | 85 | ||||
-rw-r--r-- | multimedia/libtheora/distinfo | 8 | ||||
-rw-r--r-- | multimedia/libtheora/files/dct_decode_mmx.c | 409 |
4 files changed, 499 insertions, 13 deletions
diff --git a/multimedia/libtheora/Makefile b/multimedia/libtheora/Makefile index 39943b8004d..fa185557a34 100644 --- a/multimedia/libtheora/Makefile +++ b/multimedia/libtheora/Makefile @@ -1,7 +1,8 @@ -# $NetBSD: Makefile,v 1.17 2008/04/23 12:26:47 tron Exp $ +# $NetBSD: Makefile,v 1.18 2008/10/09 19:46:55 wiz Exp $ # -DISTNAME= libtheora-1.0beta3 +DISTNAME= libtheora-1.0RC1 +PKGNAME= libtheora-1.0rc1 CATEGORIES= multimedia MASTER_SITES= http://downloads.xiph.org/releases/theora/ EXTRACT_SUFX= .tar.bz2 @@ -24,6 +25,11 @@ CONFIGURE_ENV+= ac_cv_path_SDL_CONFIG=no CONFIGURE_ENV+= ac_cv_prog_HAVE_DOXYGEN=no CONFIGURE_ENV+= ac_cv_prog_HAVE_PDFLATEX=no +PLIST_SUBST+= SUBDIR=${DISTNAME} + +post-extract: + ${CP} ${FILESDIR}/dct_decode_mmx.c ${WRKSRC}/lib/enc/x86_64 + .include "../../mk/bsd.prefs.mk" .if !empty(MACHINE_PLATFORM:MDarwin-[9].*-i386) diff --git a/multimedia/libtheora/PLIST b/multimedia/libtheora/PLIST index 2a616531a33..2471099df6d 100644 --- a/multimedia/libtheora/PLIST +++ b/multimedia/libtheora/PLIST @@ -1,4 +1,4 @@ -@comment $NetBSD: PLIST,v 1.8 2008/04/22 20:27:04 wiz Exp $ +@comment $NetBSD: PLIST,v 1.9 2008/10/09 19:46:55 wiz Exp $ include/theora/codec.h include/theora/theora.h include/theora/theoradec.h @@ -9,10 +9,81 @@ lib/libtheoraenc.la lib/pkgconfig/theora.pc lib/pkgconfig/theoradec.pc lib/pkgconfig/theoraenc.pc -share/doc/${PKGNAME}/color.html -share/doc/${PKGNAME}/doxygen-build.stamp -share/doc/${PKGNAME}/draft-ietf-avt-rtp-theora-00.txt -share/doc/${PKGNAME}/draft-ietf-avt-rtp-theora-00.xml -share/doc/${PKGNAME}/vp3-format.txt -@dirrm share/doc/${PKGNAME} +share/doc/${SUBDIR}/color.html +share/doc/${SUBDIR}/doxygen-build.stamp +share/doc/${SUBDIR}/draft-ietf-avt-rtp-theora-00.txt +share/doc/${SUBDIR}/draft-ietf-avt-rtp-theora-00.xml +share/doc/${SUBDIR}/html/annotated.html +share/doc/${SUBDIR}/html/codec_8h-source.html +share/doc/${SUBDIR}/html/codec_8h.html +share/doc/${SUBDIR}/html/doxygen.css +share/doc/${SUBDIR}/html/doxygen.png 
+share/doc/${SUBDIR}/html/files.html +share/doc/${SUBDIR}/html/functions.html +share/doc/${SUBDIR}/html/functions_vars.html +share/doc/${SUBDIR}/html/globals.html +share/doc/${SUBDIR}/html/globals_defs.html +share/doc/${SUBDIR}/html/globals_enum.html +share/doc/${SUBDIR}/html/globals_eval.html +share/doc/${SUBDIR}/html/globals_func.html +share/doc/${SUBDIR}/html/globals_type.html +share/doc/${SUBDIR}/html/globals_vars.html +share/doc/${SUBDIR}/html/group__basefuncs.html +share/doc/${SUBDIR}/html/group__decfuncs.html +share/doc/${SUBDIR}/html/group__encfuncs.html +share/doc/${SUBDIR}/html/group__oldfuncs.html +share/doc/${SUBDIR}/html/index.html +share/doc/${SUBDIR}/html/modules.html +share/doc/${SUBDIR}/html/structth__comment.html +share/doc/${SUBDIR}/html/structth__huff__code.html +share/doc/${SUBDIR}/html/structth__img__plane.html +share/doc/${SUBDIR}/html/structth__info.html +share/doc/${SUBDIR}/html/structth__quant__info.html +share/doc/${SUBDIR}/html/structth__quant__ranges.html +share/doc/${SUBDIR}/html/structth__stripe__callback.html +share/doc/${SUBDIR}/html/structtheora__comment.html +share/doc/${SUBDIR}/html/structtheora__info.html +share/doc/${SUBDIR}/html/structtheora__state.html +share/doc/${SUBDIR}/html/structyuv__buffer.html +share/doc/${SUBDIR}/html/tab_b.gif +share/doc/${SUBDIR}/html/tab_l.gif +share/doc/${SUBDIR}/html/tab_r.gif +share/doc/${SUBDIR}/html/tabs.css +share/doc/${SUBDIR}/html/theora_8h-source.html +share/doc/${SUBDIR}/html/theora_8h.html +share/doc/${SUBDIR}/html/theoradec_8h-source.html +share/doc/${SUBDIR}/html/theoradec_8h.html +share/doc/${SUBDIR}/html/theoraenc_8h-source.html +share/doc/${SUBDIR}/html/theoraenc_8h.html +share/doc/${SUBDIR}/latex/FreeSans.ttf +share/doc/${SUBDIR}/latex/Makefile +share/doc/${SUBDIR}/latex/annotated.tex +share/doc/${SUBDIR}/latex/codec_8h.tex +share/doc/${SUBDIR}/latex/doxygen.sty +share/doc/${SUBDIR}/latex/files.tex +share/doc/${SUBDIR}/latex/group__basefuncs.tex 
+share/doc/${SUBDIR}/latex/group__decfuncs.tex +share/doc/${SUBDIR}/latex/group__encfuncs.tex +share/doc/${SUBDIR}/latex/group__oldfuncs.tex +share/doc/${SUBDIR}/latex/index.tex +share/doc/${SUBDIR}/latex/modules.tex +share/doc/${SUBDIR}/latex/refman.tex +share/doc/${SUBDIR}/latex/structth__comment.tex +share/doc/${SUBDIR}/latex/structth__huff__code.tex +share/doc/${SUBDIR}/latex/structth__img__plane.tex +share/doc/${SUBDIR}/latex/structth__info.tex +share/doc/${SUBDIR}/latex/structth__quant__info.tex +share/doc/${SUBDIR}/latex/structth__quant__ranges.tex +share/doc/${SUBDIR}/latex/structth__stripe__callback.tex +share/doc/${SUBDIR}/latex/structtheora__comment.tex +share/doc/${SUBDIR}/latex/structtheora__info.tex +share/doc/${SUBDIR}/latex/structtheora__state.tex +share/doc/${SUBDIR}/latex/structyuv__buffer.tex +share/doc/${SUBDIR}/latex/theora_8h.tex +share/doc/${SUBDIR}/latex/theoradec_8h.tex +share/doc/${SUBDIR}/latex/theoraenc_8h.tex +share/doc/${SUBDIR}/vp3-format.txt +@dirrm share/doc/${SUBDIR}/latex +@dirrm share/doc/${SUBDIR}/html +@dirrm share/doc/${SUBDIR} @dirrm include/theora diff --git a/multimedia/libtheora/distinfo b/multimedia/libtheora/distinfo index f510ea75227..9815181771f 100644 --- a/multimedia/libtheora/distinfo +++ b/multimedia/libtheora/distinfo @@ -1,5 +1,5 @@ -$NetBSD: distinfo,v 1.11 2008/04/22 20:27:04 wiz Exp $ +$NetBSD: distinfo,v 1.12 2008/10/09 19:46:55 wiz Exp $ -SHA1 (libtheora-1.0beta3.tar.bz2) = 01f0a5adcdde5bdc5b7e700b5975067af60b456c -RMD160 (libtheora-1.0beta3.tar.bz2) = 5308039be56064c7a54d464b0dc32f31542a34d8 -Size (libtheora-1.0beta3.tar.bz2) = 1531449 bytes +SHA1 (libtheora-1.0RC1.tar.bz2) = b6157bff9d1953497f29bca49e5654958ab75c80 +RMD160 (libtheora-1.0RC1.tar.bz2) = 43e5c55b886fa1842dc077459377d7bbcc1ad338 +Size (libtheora-1.0RC1.tar.bz2) = 1697726 bytes diff --git a/multimedia/libtheora/files/dct_decode_mmx.c b/multimedia/libtheora/files/dct_decode_mmx.c new file mode 100644 index 00000000000..1f54bad87d3 --- /dev/null 
+++ b/multimedia/libtheora/files/dct_decode_mmx.c @@ -0,0 +1,409 @@ +/******************************************************************** + * * + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * + * * + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2008 * + * by the Xiph.Org Foundation http://www.xiph.org/ * + * * + ******************************************************************** + + function: + last mod: $Id: dct_decode_mmx.c,v 1.1 2008/10/09 19:46:55 wiz Exp $ + + ********************************************************************/ + +#include <stdlib.h> + +#include "codec_internal.h" + +#if defined(USE_ASM) +/*Four packed 16-bit words holding 3 (OC_V3) and 4 (OC_V4); the filter asm reads them as memory operands.*/ +static const __attribute__((aligned(8),used)) ogg_int64_t OC_V3= + 0x0003000300030003LL; +static const __attribute__((aligned(8),used)) ogg_int64_t OC_V4= + 0x0004000400040004LL; +/*Filters 8 pixel columns across a horizontal edge. The asm reads the four rows starting two rows above _pix, forms (row0-row3+3*(row2-row1)+4)>>3 per column, limits it using the four 16-bit values at _ll, then adds the result to row 1 and subtracts it from row 2 with unsigned saturation.*/ +static void loop_filter_v(unsigned char *_pix,int _ystride, + const ogg_int16_t *_ll){ + long esi; + _pix-=_ystride*2; + __asm__ __volatile__( + /*mm0=0*/ + "pxor %%mm0,%%mm0\n\t" + /*esi=_ystride*3*/ + "lea (%[ystride],%[ystride],2),%[s]\n\t" + /*mm7=_pix[0...8]*/ + "movq (%[pix]),%%mm7\n\t" + /*mm4=_pix[0...8+_ystride*3]*/ + "movq (%[pix],%[s]),%%mm4\n\t" + /*mm6=_pix[0...8]*/ + "movq %%mm7,%%mm6\n\t" + /*Expand unsigned _pix[0...3] to 16 bits.*/ + "punpcklbw %%mm0,%%mm6\n\t" + "movq %%mm4,%%mm5\n\t" + /*Expand unsigned _pix[4...8] to 16 bits.*/ + "punpckhbw %%mm0,%%mm7\n\t" + /*Expand other arrays too.*/ + "punpcklbw %%mm0,%%mm4\n\t" + "punpckhbw %%mm0,%%mm5\n\t" + /*mm7:mm6=_p[0...8]-_p[0...8+_ystride*3]:*/ + "psubw %%mm4,%%mm6\n\t" + "psubw %%mm5,%%mm7\n\t" + /*mm5=mm4=_pix[0...8+_ystride]*/ + "movq (%[pix],%[ystride]),%%mm4\n\t" + /*mm1=mm3=mm2=_pix[0...8+_ystride*2]*/ + "movq (%[pix],%[ystride],2),%%mm2\n\t" + "movq %%mm4,%%mm5\n\t" + "movq 
%%mm2,%%mm3\n\t" + "movq %%mm2,%%mm1\n\t" + /*Expand these arrays.*/ + "punpckhbw %%mm0,%%mm5\n\t" + "punpcklbw %%mm0,%%mm4\n\t" + "punpckhbw %%mm0,%%mm3\n\t" + "punpcklbw %%mm0,%%mm2\n\t" + /*Preload...*/ + "movq %[OC_V3],%%mm0\n\t" + /*mm3:mm2=_pix[0...8+_ystride*2]-_pix[0...8+_ystride]*/ + "psubw %%mm5,%%mm3\n\t" + "psubw %%mm4,%%mm2\n\t" + /*Scale by 3.*/ + "pmullw %%mm0,%%mm3\n\t" + "pmullw %%mm0,%%mm2\n\t" + /*Preload...*/ + "movq %[OC_V4],%%mm0\n\t" + /*f=mm3:mm2==_pix[0...8]-_pix[0...8+_ystride*3]+ + 3*(_pix[0...8+_ystride*2]-_pix[0...8+_ystride])*/ + "paddw %%mm7,%%mm3\n\t" + "paddw %%mm6,%%mm2\n\t" + /*Add 4.*/ + "paddw %%mm0,%%mm3\n\t" + "paddw %%mm0,%%mm2\n\t" + /*"Divide" by 8.*/ + "psraw $3,%%mm3\n\t" + "psraw $3,%%mm2\n\t" + /*Now compute lflim of mm3:mm2 cf. Section 7.10 of the spec.*/ + /*Free up mm5.*/ + "packuswb %%mm5,%%mm4\n\t" + /*mm0=L L L L*/ + "movq (%[ll]),%%mm0\n\t" + /*if(R_i<-2L||R_i>2L)R_i=0:*/ + "movq %%mm2,%%mm5\n\t" + "pxor %%mm6,%%mm6\n\t" + "movq %%mm0,%%mm7\n\t" + "psubw %%mm0,%%mm6\n\t" + "psllw $1,%%mm7\n\t" + "psllw $1,%%mm6\n\t" + /*mm2==R_3 R_2 R_1 R_0*/ + /*mm5==R_3 R_2 R_1 R_0*/ + /*mm6==-2L -2L -2L -2L*/ + /*mm7==2L 2L 2L 2L*/ + "pcmpgtw %%mm2,%%mm7\n\t" + "pcmpgtw %%mm6,%%mm5\n\t" + "pand %%mm7,%%mm2\n\t" + "movq %%mm0,%%mm7\n\t" + "pand %%mm5,%%mm2\n\t" + "psllw $1,%%mm7\n\t" + "movq %%mm3,%%mm5\n\t" + /*mm3==R_7 R_6 R_5 R_4*/ + /*mm5==R_7 R_6 R_5 R_4*/ + /*mm6==-2L -2L -2L -2L*/ + /*mm7==2L 2L 2L 2L*/ + "pcmpgtw %%mm3,%%mm7\n\t" + "pcmpgtw %%mm6,%%mm5\n\t" + "pand %%mm7,%%mm3\n\t" + "movq %%mm0,%%mm7\n\t" + "pand %%mm5,%%mm3\n\t" + /*if(R_i<-L)R_i'=R_i+2L; + if(R_i>L)R_i'=R_i-2L; + if(R_i<-L||R_i>L)R_i=-R_i':*/ + "psraw $1,%%mm6\n\t" + "movq %%mm2,%%mm5\n\t" + "psllw $1,%%mm7\n\t" + /*mm2==R_3 R_2 R_1 R_0*/ + /*mm5==R_3 R_2 R_1 R_0*/ + /*mm6==-L -L -L -L*/ + /*mm0==L L L L*/ + /*mm5=R_i>L?FF:00*/ + "pcmpgtw %%mm0,%%mm5\n\t" + /*mm6=-L>R_i?FF:00*/ + "pcmpgtw %%mm2,%%mm6\n\t" + /*mm7=R_i>L?2L:0*/ + "pand %%mm5,%%mm7\n\t" 
+ /*mm2=R_i>L?R_i-2L:R_i*/ + "psubw %%mm7,%%mm2\n\t" + "movq %%mm0,%%mm7\n\t" + /*mm5=-L>R_i||R_i>L*/ + "por %%mm6,%%mm5\n\t" + "psllw $1,%%mm7\n\t" + /*mm7=-L>R_i?2L:0*/ + "pand %%mm6,%%mm7\n\t" + "pxor %%mm6,%%mm6\n\t" + /*mm2=-L>R_i?R_i+2L:R_i*/ + "paddw %%mm7,%%mm2\n\t" + "psubw %%mm0,%%mm6\n\t" + /*mm5=-L>R_i||R_i>L?R_i':0*/ + "pand %%mm2,%%mm5\n\t" + "movq %%mm0,%%mm7\n\t" + /*mm2=-L>R_i||R_i>L?0:R_i*/ + "psubw %%mm5,%%mm2\n\t" + "psllw $1,%%mm7\n\t" + /*mm2=-L>R_i||R_i>L?-R_i':R_i*/ + "psubw %%mm5,%%mm2\n\t" + "movq %%mm3,%%mm5\n\t" + /*mm3==R_7 R_6 R_5 R_4*/ + /*mm5==R_7 R_6 R_5 R_4*/ + /*mm6==-L -L -L -L*/ + /*mm0==L L L L*/ + /*mm6=-L>R_i?FF:00*/ + "pcmpgtw %%mm3,%%mm6\n\t" + /*mm5=R_i>L?FF:00*/ + "pcmpgtw %%mm0,%%mm5\n\t" + /*mm7=R_i>L?2L:0*/ + "pand %%mm5,%%mm7\n\t" + /*mm3=R_i>L?R_i-2L:R_i*/ + "psubw %%mm7,%%mm3\n\t" + "psllw $1,%%mm0\n\t" + /*mm5=-L>R_i||R_i>L*/ + "por %%mm6,%%mm5\n\t" + /*mm0=-L>R_i?2L:0*/ + "pand %%mm6,%%mm0\n\t" + /*mm3=-L>R_i?R_i+2L:R_i*/ + "paddw %%mm0,%%mm3\n\t" + /*mm5=-L>R_i||R_i>L?R_i':0*/ + "pand %%mm3,%%mm5\n\t" + /*mm3=-L>R_i||R_i>L?0:R_i*/ + "psubw %%mm5,%%mm3\n\t" + /*mm3=-L>R_i||R_i>L?-R_i':R_i*/ + "psubw %%mm5,%%mm3\n\t" + /*Unfortunately, there's no unsigned byte+signed byte with unsigned + saturation op code, so we have to promote things back to 16 bits.*/ + "pxor %%mm0,%%mm0\n\t" + "movq %%mm4,%%mm5\n\t" + "punpcklbw %%mm0,%%mm4\n\t" + "punpckhbw %%mm0,%%mm5\n\t" + "movq %%mm1,%%mm6\n\t" + "punpcklbw %%mm0,%%mm1\n\t" + "punpckhbw %%mm0,%%mm6\n\t" + /*_pix[0...8+_ystride]+=R_i*/ + "paddw %%mm2,%%mm4\n\t" + "paddw %%mm3,%%mm5\n\t" + /*_pix[0...8+_ystride*2]-=R_i*/ + "psubw %%mm2,%%mm1\n\t" + "psubw %%mm3,%%mm6\n\t" + "packuswb %%mm5,%%mm4\n\t" + "packuswb %%mm6,%%mm1\n\t" + /*Write it back out.*/ + "movq %%mm4,(%[pix],%[ystride])\n\t" + "movq %%mm1,(%[pix],%[ystride],2)\n\t" + :[s]"=&S"(esi) + :[pix]"r"(_pix),[ystride]"r"((long)_ystride),[ll]"r"(_ll), + [OC_V3]"m"(OC_V3),[OC_V4]"m"(OC_V4) + :"memory" + ); +} + +/*This 
code implements the bulk of loop_filter_h(). + Data are striped p0 p1 p2 p3 ... p0 p1 p2 p3 ..., so in order to load all + four p0's to one register we must transpose the values in four mmx regs. + When half is done we repeat this for the rest.*/ +static void loop_filter_h4(unsigned char *_pix,long _ystride, + const ogg_int16_t *_ll){ + long esi; + long edi; + __asm__ __volatile__( + /*x x x x 3 2 1 0*/ + "movd (%[pix]),%%mm0\n\t" + /*esi=_ystride*3*/ + "lea (%[ystride],%[ystride],2),%[s]\n\t" + /*x x x x 7 6 5 4*/ + "movd (%[pix],%[ystride]),%%mm1\n\t" + /*x x x x B A 9 8*/ + "movd (%[pix],%[ystride],2),%%mm2\n\t" + /*x x x x F E D C*/ + "movd (%[pix],%[s]),%%mm3\n\t" + /*mm0=7 3 6 2 5 1 4 0*/ + "punpcklbw %%mm1,%%mm0\n\t" + /*mm2=F B E A D 9 C 8*/ + "punpcklbw %%mm3,%%mm2\n\t" + /*mm1=7 3 6 2 5 1 4 0*/ + "movq %%mm0,%%mm1\n\t" + /*mm0=F B 7 3 E A 6 2*/ + "punpckhwd %%mm2,%%mm0\n\t" + /*mm1=D 9 5 1 C 8 4 0*/ + "punpcklwd %%mm2,%%mm1\n\t" + "pxor %%mm7,%%mm7\n\t" + /*mm5=D 9 5 1 C 8 4 0*/ + "movq %%mm1,%%mm5\n\t" + /*mm1=x C x 8 x 4 x 0==pix[0]*/ + "punpcklbw %%mm7,%%mm1\n\t" + /*mm5=x D x 9 x 5 x 1==pix[1]*/ + "punpckhbw %%mm7,%%mm5\n\t" + /*mm3=F B 7 3 E A 6 2*/ + "movq %%mm0,%%mm3\n\t" + /*mm0=x E x A x 6 x 2==pix[2]*/ + "punpcklbw %%mm7,%%mm0\n\t" + /*mm3=x F x B x 7 x 3==pix[3]*/ + "punpckhbw %%mm7,%%mm3\n\t" + /*mm1=mm1-mm3==pix[0]-pix[3]*/ + "psubw %%mm3,%%mm1\n\t" + /*Save a copy of pix[2] for later.*/ + "movq %%mm0,%%mm4\n\t" + /*mm0=mm0-mm5==pix[2]-pix[1]*/ + "psubw %%mm5,%%mm0\n\t" + /*Scale by 3.*/ + "pmullw %[OC_V3],%%mm0\n\t" + /*f=mm0==_pix[0]-_pix[3]+ 3*(_pix[2]-_pix[1])*/ + "paddw %%mm1,%%mm0\n\t" + /*Add 4.*/ + "paddw %[OC_V4],%%mm0\n\t" + /*"Divide" by 8, producing the residuals R_i.*/ + "psraw $3,%%mm0\n\t" + /*Now compute lflim of mm0 cf. 
Section 7.10 of the spec.*/ + /*mm6=L L L L*/ + "movq (%[ll]),%%mm6\n\t" + /*if(R_i<-2L||R_i>2L)R_i=0:*/ + "movq %%mm0,%%mm1\n\t" + "pxor %%mm2,%%mm2\n\t" + "movq %%mm6,%%mm3\n\t" + "psubw %%mm6,%%mm2\n\t" + "psllw $1,%%mm3\n\t" + "psllw $1,%%mm2\n\t" + /*mm0==R_3 R_2 R_1 R_0*/ + /*mm1==R_3 R_2 R_1 R_0*/ + /*mm2==-2L -2L -2L -2L*/ + /*mm3==2L 2L 2L 2L*/ + "pcmpgtw %%mm0,%%mm3\n\t" + "pcmpgtw %%mm2,%%mm1\n\t" + "pand %%mm3,%%mm0\n\t" + "pand %%mm1,%%mm0\n\t" + /*if(R_i<-L)R_i'=R_i+2L; + if(R_i>L)R_i'=R_i-2L; + if(R_i<-L||R_i>L)R_i=-R_i':*/ + "psraw $1,%%mm2\n\t" + "movq %%mm0,%%mm1\n\t" + "movq %%mm6,%%mm3\n\t" + /*mm0==R_3 R_2 R_1 R_0*/ + /*mm1==R_3 R_2 R_1 R_0*/ + /*mm2==-L -L -L -L*/ + /*mm6==L L L L*/ + /*mm2=-L>R_i?FF:00*/ + "pcmpgtw %%mm0,%%mm2\n\t" + /*mm1=R_i>L?FF:00*/ + "pcmpgtw %%mm6,%%mm1\n\t" + /*mm3=2L 2L 2L 2L*/ + "psllw $1,%%mm3\n\t" + /*mm6=2L 2L 2L 2L*/ + "psllw $1,%%mm6\n\t" + /*mm3=R_i>L?2L:0*/ + "pand %%mm1,%%mm3\n\t" + /*mm6=-L>R_i?2L:0*/ + "pand %%mm2,%%mm6\n\t" + /*mm0=R_i>L?R_i-2L:R_i*/ + "psubw %%mm3,%%mm0\n\t" + /*mm1=-L>R_i||R_i>L*/ + "por %%mm2,%%mm1\n\t" + /*mm0=-L>R_i?R_i+2L:R_i*/ + "paddw %%mm6,%%mm0\n\t" + /*mm1=-L>R_i||R_i>L?R_i':0*/ + "pand %%mm0,%%mm1\n\t" + /*mm0=-L>R_i||R_i>L?0:R_i*/ + "psubw %%mm1,%%mm0\n\t" + /*mm0=-L>R_i||R_i>L?-R_i':R_i*/ + "psubw %%mm1,%%mm0\n\t" + /*_pix[1]+=R_i;*/ + "paddw %%mm0,%%mm5\n\t" + /*_pix[2]-=R_i;*/ + "psubw %%mm0,%%mm4\n\t" + /*mm5=x x x x D 9 5 1*/ + "packuswb %%mm7,%%mm5\n\t" + /*mm4=x x x x E A 6 2*/ + "packuswb %%mm7,%%mm4\n\t" + /*mm5=E D A 9 6 5 2 1*/ + "punpcklbw %%mm4,%%mm5\n\t" + /*edi=6 5 2 1*/ + "movd %%mm5,%%edi\n\t" + "movw %%di,1(%[pix])\n\t" + /*Why is there such a big stall here?*/ + "psrlq $32,%%mm5\n\t" + "shrl $16,%%edi\n\t" + "movw %%di,1(%[pix],%[ystride])\n\t" + /*edi=E D A 9*/ + "movd %%mm5,%%edi\n\t" + "movw %%di,1(%[pix],%[ystride],2)\n\t" + "shrl $16,%%edi\n\t" + "movw %%di,1(%[pix],%[s])\n\t" + :[s]"=&S"(esi),[d]"=&D"(edi), 
[pix]"+r"(_pix),[ystride]"+r"(_ystride),[ll]"+r"(_ll) + :[OC_V3]"m"(OC_V3),[OC_V4]"m"(OC_V4) + :"memory" + ); +} +/*Filters across a vertical edge over 8 rows: steps _pix back 2 bytes, then runs loop_filter_h4() on rows 0-3 and again on rows 4-7.*/ +static void loop_filter_h(unsigned char *_pix,int _ystride, + const ogg_int16_t *_ll){ + _pix-=2; + loop_filter_h4(_pix,_ystride,_ll); + loop_filter_h4(_pix+(_ystride<<2),_ystride,_ll); +} +/*Runs the loop filter over three planes of pbi->LastFrameRecon, walking display_fragments and recon_pixel_index_table in step. For each fragment with a nonzero display_fragments entry it filters the left and top edges (unless the fragment is in the first column or first row of its plane) and the right and bottom edges when a neighbour exists there and its entry is zero. Returns at once when FLimit is 0; issues emms when done.*/ +static void loop_filter_mmx(PB_INSTANCE *pbi, int FLimit){ + int j; + ogg_int16_t __attribute__((aligned(8))) ll[4]; + unsigned char *cp = pbi->display_fragments; + ogg_uint32_t *bp = pbi->recon_pixel_index_table; + + if ( FLimit == 0 ) return; + ll[0]=ll[1]=ll[2]=ll[3]=FLimit; + + for ( j = 0; j < 3 ; j++){ + ogg_uint32_t *bp_begin = bp; + ogg_uint32_t *bp_end; + int stride; + int h; + + switch(j) { + case 0: /* y */ + bp_end = bp + pbi->YPlaneFragments; + h = pbi->HFragments; + stride = pbi->YStride; + break; + default: /* u,v, 4:2:0 specific */ + bp_end = bp + pbi->UVPlaneFragments; + h = pbi->HFragments >> 1; + stride = pbi->UVStride; + break; + } + + while(bp<bp_end){ + ogg_uint32_t *bp_left = bp; + ogg_uint32_t *bp_right = bp + h; + while(bp<bp_right){ + if(cp[0]){ + if(bp>bp_left) + loop_filter_h(&pbi->LastFrameRecon[bp[0]],stride,ll); + if(bp_left>bp_begin) + loop_filter_v(&pbi->LastFrameRecon[bp[0]],stride,ll); + if(bp+1<bp_right && !cp[1]) + loop_filter_h(&pbi->LastFrameRecon[bp[0]]+8,stride,ll); + if(bp+h<bp_end && !cp[h]) + loop_filter_v(&pbi->LastFrameRecon[bp[h]],stride,ll); + } + bp++; + cp++; + } + } + } + + __asm__ __volatile__("emms\n\t"); +} + +/* install our implementation in the function table */ +void dsp_mmx_dct_decode_init(DspFunctions *funcs) +{ + funcs->LoopFilter = loop_filter_mmx; +} + +#endif /* USE_ASM */ |