Libav 0.7.1
/*
 * Format Conversion Utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */

#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/fmtconvert.h"

static void int32_to_float_fmul_scalar_sse(float *dst, const int *src, float mul, int len)
{
    x86_reg i = -4*len;
    __asm__ volatile(
        "movss  %3, %%xmm4          \n"
        "shufps $0, %%xmm4, %%xmm4  \n"
        "1:                         \n"
        "cvtpi2ps   (%2,%0), %%xmm0 \n"
        "cvtpi2ps  8(%2,%0), %%xmm1 \n"
        "cvtpi2ps 16(%2,%0), %%xmm2 \n"
        "cvtpi2ps 24(%2,%0), %%xmm3 \n"
        "movlhps  %%xmm1,    %%xmm0 \n"
        "movlhps  %%xmm3,    %%xmm2 \n"
        "mulps    %%xmm4,    %%xmm0 \n"
        "mulps    %%xmm4,    %%xmm2 \n"
        "movaps   %%xmm0,   (%1,%0) \n"
        "movaps   %%xmm2, 16(%1,%0) \n"
        "add $32, %0                \n"
        "jl 1b                      \n"
        :"+r"(i)
        :"r"(dst+len), "r"(src+len), "m"(mul)
    );
}

static void int32_to_float_fmul_scalar_sse2(float *dst, const int *src, float mul, int len)
{
    x86_reg i = -4*len;
    __asm__ volatile(
        "movss  %3, %%xmm4          \n"
        "shufps $0, %%xmm4, %%xmm4  \n"
        "1:                         \n"
        "cvtdq2ps   (%2,%0), %%xmm0 \n"
        "cvtdq2ps 16(%2,%0), %%xmm1 \n"
        "mulps    %%xmm4,    %%xmm0 \n"
        "mulps    %%xmm4,    %%xmm1 \n"
        "movaps   %%xmm0,   (%1,%0) \n"
        "movaps   %%xmm1, 16(%1,%0) \n"
        "add $32, %0                \n"
        "jl 1b                      \n"
        :"+r"(i)
        :"r"(dst+len), "r"(src+len), "m"(mul)
    );
}

static void float_to_int16_3dnow(int16_t *dst, const float *src, long len){
    x86_reg reglen = len;
    // not bit-exact: pf2id uses different rounding than C and SSE
    __asm__ volatile(
        "add       %0,          %0      \n\t"
        "lea       (%2,%0,2),   %2      \n\t"
        "add       %0,          %1      \n\t"
        "neg       %0                   \n\t"
        "1:                             \n\t"
        "pf2id     (%2,%0,2),   %%mm0   \n\t"
        "pf2id    8(%2,%0,2),   %%mm1   \n\t"
        "pf2id   16(%2,%0,2),   %%mm2   \n\t"
        "pf2id   24(%2,%0,2),   %%mm3   \n\t"
        "packssdw  %%mm1,       %%mm0   \n\t"
        "packssdw  %%mm3,       %%mm2   \n\t"
        "movq      %%mm0,       (%1,%0) \n\t"
        "movq      %%mm2,      8(%1,%0) \n\t"
        "add       $16,         %0      \n\t"
        "js 1b                          \n\t"
        "femms                          \n\t"
        :"+r"(reglen), "+r"(dst), "+r"(src)
    );
}

static void float_to_int16_sse(int16_t *dst, const float *src, long len){
    x86_reg reglen = len;
    __asm__ volatile(
        "add       %0,          %0      \n\t"
        "lea       (%2,%0,2),   %2      \n\t"
        "add       %0,          %1      \n\t"
        "neg       %0                   \n\t"
        "1:                             \n\t"
        "cvtps2pi  (%2,%0,2),   %%mm0   \n\t"
        "cvtps2pi 8(%2,%0,2),   %%mm1   \n\t"
        "cvtps2pi 16(%2,%0,2),  %%mm2   \n\t"
        "cvtps2pi 24(%2,%0,2),  %%mm3   \n\t"
        "packssdw  %%mm1,       %%mm0   \n\t"
        "packssdw  %%mm3,       %%mm2   \n\t"
        "movq      %%mm0,       (%1,%0) \n\t"
        "movq      %%mm2,      8(%1,%0) \n\t"
        "add       $16,         %0      \n\t"
        "js 1b                          \n\t"
        "emms                           \n\t"
        :"+r"(reglen), "+r"(dst), "+r"(src)
    );
}
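/* For reference: the loops above process 8 samples per iteration and the
 * int32_to_float paths store with movaps, so they assume a len that is a
 * multiple of 8 and 16-byte-aligned buffers. A scalar sketch of the
 * semantics, modeled on the portable C versions in libavcodec/fmtconvert.c
 * (illustrative only, hence compiled out): */
#if 0
static void int32_to_float_fmul_scalar_ref(float *dst, const int *src,
                                           float mul, int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src[i] * mul;                   /* convert and scale in one pass */
}

static void float_to_int16_ref(int16_t *dst, const float *src, long len)
{
    long i;
    for (i = 0; i < len; i++)
        dst[i] = av_clip_int16(lrintf(src[i])); /* round, then saturate */
}
#endif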
"cvtps2pi 16(%2,%0,2) , %%mm2 \n\t" 00108 "cvtps2pi 24(%2,%0,2) , %%mm3 \n\t" 00109 "packssdw %%mm1 , %%mm0 \n\t" 00110 "packssdw %%mm3 , %%mm2 \n\t" 00111 "movq %%mm0 , (%1,%0) \n\t" 00112 "movq %%mm2 , 8(%1,%0) \n\t" 00113 "add $16 , %0 \n\t" 00114 " js 1b \n\t" 00115 "emms \n\t" 00116 :"+r"(reglen), "+r"(dst), "+r"(src) 00117 ); 00118 } 00119 00120 static void float_to_int16_sse2(int16_t *dst, const float *src, long len){ 00121 x86_reg reglen = len; 00122 __asm__ volatile( 00123 "add %0 , %0 \n\t" 00124 "lea (%2,%0,2) , %2 \n\t" 00125 "add %0 , %1 \n\t" 00126 "neg %0 \n\t" 00127 "1: \n\t" 00128 "cvtps2dq (%2,%0,2) , %%xmm0 \n\t" 00129 "cvtps2dq 16(%2,%0,2) , %%xmm1 \n\t" 00130 "packssdw %%xmm1 , %%xmm0 \n\t" 00131 "movdqa %%xmm0 , (%1,%0) \n\t" 00132 "add $16 , %0 \n\t" 00133 " js 1b \n\t" 00134 :"+r"(reglen), "+r"(dst), "+r"(src) 00135 ); 00136 } 00137 00138 void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len); 00139 void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len); 00140 void ff_float_to_int16_interleave6_3dn2(int16_t *dst, const float **src, int len); 00141 00142 #if !HAVE_YASM 00143 #define ff_float_to_int16_interleave6_sse(a,b,c) float_to_int16_interleave_misc_sse(a,b,c,6) 00144 #define ff_float_to_int16_interleave6_3dnow(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6) 00145 #define ff_float_to_int16_interleave6_3dn2(a,b,c) float_to_int16_interleave_misc_3dnow(a,b,c,6) 00146 #endif 00147 #define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse 00148 00149 #define FLOAT_TO_INT16_INTERLEAVE(cpu, body) \ 00150 /* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\ 00151 static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\ 00152 DECLARE_ALIGNED(16, int16_t, tmp)[len];\ 00153 int i,j,c;\ 00154 for(c=0; c<channels; c++){\ 00155 float_to_int16_##cpu(tmp, src[c], len);\ 00156 for(i=0, j=c; i<len; i++, j+=channels)\ 00157 dst[j] = tmp[i];\ 00158 }\ 00159 }\ 00160 \ 00161 static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\ 00162 if(channels==1)\ 00163 float_to_int16_##cpu(dst, src[0], len);\ 00164 else if(channels==2){\ 00165 x86_reg reglen = len; \ 00166 const float *src0 = src[0];\ 00167 const float *src1 = src[1];\ 00168 __asm__ volatile(\ 00169 "shl $2, %0 \n"\ 00170 "add %0, %1 \n"\ 00171 "add %0, %2 \n"\ 00172 "add %0, %3 \n"\ 00173 "neg %0 \n"\ 00174 body\ 00175 :"+r"(reglen), "+r"(dst), "+r"(src0), "+r"(src1)\ 00176 );\ 00177 }else if(channels==6){\ 00178 ff_float_to_int16_interleave6_##cpu(dst, src, len);\ 00179 }else\ 00180 float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\ 00181 } 00182 00183 FLOAT_TO_INT16_INTERLEAVE(3dnow, 00184 "1: \n" 00185 "pf2id (%2,%0), %%mm0 \n" 00186 "pf2id 8(%2,%0), %%mm1 \n" 00187 "pf2id (%3,%0), %%mm2 \n" 00188 "pf2id 8(%3,%0), %%mm3 \n" 00189 "packssdw %%mm1, %%mm0 \n" 00190 "packssdw %%mm3, %%mm2 \n" 00191 "movq %%mm0, %%mm1 \n" 00192 "punpcklwd %%mm2, %%mm0 \n" 00193 "punpckhwd %%mm2, %%mm1 \n" 00194 "movq %%mm0, (%1,%0)\n" 00195 "movq %%mm1, 8(%1,%0)\n" 00196 "add $16, %0 \n" 00197 "js 1b \n" 00198 "femms \n" 00199 ) 00200 00201 FLOAT_TO_INT16_INTERLEAVE(sse, 00202 "1: \n" 00203 "cvtps2pi (%2,%0), %%mm0 \n" 00204 "cvtps2pi 8(%2,%0), %%mm1 \n" 00205 "cvtps2pi (%3,%0), %%mm2 \n" 00206 "cvtps2pi 8(%3,%0), %%mm3 \n" 00207 "packssdw %%mm1, %%mm0 \n" 00208 "packssdw %%mm3, %%mm2 \n" 00209 
"movq %%mm0, %%mm1 \n" 00210 "punpcklwd %%mm2, %%mm0 \n" 00211 "punpckhwd %%mm2, %%mm1 \n" 00212 "movq %%mm0, (%1,%0)\n" 00213 "movq %%mm1, 8(%1,%0)\n" 00214 "add $16, %0 \n" 00215 "js 1b \n" 00216 "emms \n" 00217 ) 00218 00219 FLOAT_TO_INT16_INTERLEAVE(sse2, 00220 "1: \n" 00221 "cvtps2dq (%2,%0), %%xmm0 \n" 00222 "cvtps2dq (%3,%0), %%xmm1 \n" 00223 "packssdw %%xmm1, %%xmm0 \n" 00224 "movhlps %%xmm0, %%xmm1 \n" 00225 "punpcklwd %%xmm1, %%xmm0 \n" 00226 "movdqa %%xmm0, (%1,%0) \n" 00227 "add $16, %0 \n" 00228 "js 1b \n" 00229 ) 00230 00231 static void float_to_int16_interleave_3dn2(int16_t *dst, const float **src, long len, int channels){ 00232 if(channels==6) 00233 ff_float_to_int16_interleave6_3dn2(dst, src, len); 00234 else 00235 float_to_int16_interleave_3dnow(dst, src, len, channels); 00236 } 00237 00238 #if HAVE_YASM 00239 void ff_float_interleave2_mmx(float *dst, const float **src, unsigned int len); 00240 void ff_float_interleave2_sse(float *dst, const float **src, unsigned int len); 00241 00242 void ff_float_interleave6_mmx(float *dst, const float **src, unsigned int len); 00243 void ff_float_interleave6_sse(float *dst, const float **src, unsigned int len); 00244 00245 static void float_interleave_mmx(float *dst, const float **src, 00246 unsigned int len, int channels) 00247 { 00248 if (channels == 2) { 00249 ff_float_interleave2_mmx(dst, src, len); 00250 } else if (channels == 6) 00251 ff_float_interleave6_mmx(dst, src, len); 00252 else 00253 ff_float_interleave_c(dst, src, len, channels); 00254 } 00255 00256 static void float_interleave_sse(float *dst, const float **src, 00257 unsigned int len, int channels) 00258 { 00259 if (channels == 2) { 00260 ff_float_interleave2_sse(dst, src, len); 00261 } else if (channels == 6) 00262 ff_float_interleave6_sse(dst, src, len); 00263 else 00264 ff_float_interleave_c(dst, src, len, channels); 00265 } 00266 #endif 00267 00268 void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx) 00269 { 00270 int mm_flags = av_get_cpu_flags(); 00271 00272 if (mm_flags & AV_CPU_FLAG_MMX) { 00273 #if HAVE_YASM 00274 c->float_interleave = float_interleave_mmx; 00275 #endif 00276 00277 if(mm_flags & AV_CPU_FLAG_3DNOW){ 00278 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ 00279 c->float_to_int16 = float_to_int16_3dnow; 00280 c->float_to_int16_interleave = float_to_int16_interleave_3dnow; 00281 } 00282 } 00283 if(mm_flags & AV_CPU_FLAG_3DNOWEXT){ 00284 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ 00285 c->float_to_int16_interleave = float_to_int16_interleave_3dn2; 00286 } 00287 } 00288 if(mm_flags & AV_CPU_FLAG_SSE){ 00289 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse; 00290 c->float_to_int16 = float_to_int16_sse; 00291 c->float_to_int16_interleave = float_to_int16_interleave_sse; 00292 #if HAVE_YASM 00293 c->float_interleave = float_interleave_sse; 00294 #endif 00295 } 00296 if(mm_flags & AV_CPU_FLAG_SSE2){ 00297 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_sse2; 00298 c->float_to_int16 = float_to_int16_sse2; 00299 c->float_to_int16_interleave = float_to_int16_interleave_sse2; 00300 } 00301 } 00302 }