Libav 0.7.1
|
00001 /* 00002 * Copyright (C) 2009 Loren Merritt <lorenm@u.washignton.edu> 00003 * 00004 * This file is part of Libav. 00005 * 00006 * Libav is free software; you can redistribute it and/or 00007 * modify it under the terms of the GNU Lesser General Public 00008 * License as published by the Free Software Foundation; either 00009 * version 2.1 of the License, or (at your option) any later version. 00010 * 00011 * Libav is distributed in the hope that it will be useful, 00012 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00013 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00014 * Lesser General Public License for more details. 00015 * 00016 * You should have received a copy of the GNU Lesser General Public 00017 * License along with Libav; if not, write to the Free Software 00018 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 00019 */ 00020 00021 #include "libavutil/cpu.h" 00022 #include "libavutil/x86_cpu.h" 00023 #include "libavfilter/gradfun.h" 00024 00025 DECLARE_ALIGNED(16, static const uint16_t, pw_7f)[8] = {0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F,0x7F}; 00026 DECLARE_ALIGNED(16, static const uint16_t, pw_ff)[8] = {0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF,0xFF}; 00027 00028 void ff_gradfun_filter_line_mmx2(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers) 00029 { 00030 #if HAVE_MMX 00031 intptr_t x; 00032 if (width & 3) { 00033 x = width & ~3; 00034 ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers); 00035 width = x; 00036 } 00037 x = -width; 00038 __asm__ volatile( 00039 "movd %4, %%mm5 \n" 00040 "pxor %%mm7, %%mm7 \n" 00041 "pshufw $0, %%mm5, %%mm5 \n" 00042 "movq %6, %%mm6 \n" 00043 "movq %5, %%mm4 \n" 00044 "1: \n" 00045 "movd (%2,%0), %%mm0 \n" 00046 "movd (%3,%0), %%mm1 \n" 00047 "punpcklbw %%mm7, %%mm0 \n" 00048 "punpcklwd %%mm1, %%mm1 \n" 00049 "psllw $7, %%mm0 \n" 00050 "pxor %%mm2, %%mm2 \n" 00051 "psubw %%mm0, %%mm1 \n" // delta = dc - pix 00052 "psubw %%mm1, %%mm2 \n" 00053 "pmaxsw %%mm1, %%mm2 \n" 00054 "pmulhuw %%mm5, %%mm2 \n" // m = abs(delta) * thresh >> 16 00055 "psubw %%mm6, %%mm2 \n" 00056 "pminsw %%mm7, %%mm2 \n" // m = -max(0, 127-m) 00057 "pmullw %%mm2, %%mm2 \n" 00058 "paddw %%mm4, %%mm0 \n" // pix += dither 00059 "pmulhw %%mm2, %%mm1 \n" 00060 "psllw $2, %%mm1 \n" // m = m*m*delta >> 14 00061 "paddw %%mm1, %%mm0 \n" // pix += m 00062 "psraw $7, %%mm0 \n" 00063 "packuswb %%mm0, %%mm0 \n" 00064 "movd %%mm0, (%1,%0) \n" // dst = clip(pix>>7) 00065 "add $4, %0 \n" 00066 "jl 1b \n" 00067 "emms \n" 00068 :"+r"(x) 00069 :"r"(dst+width), "r"(src+width), "r"(dc+width/2), 00070 "rm"(thresh), "m"(*dithers), "m"(*pw_7f) 00071 :"memory" 00072 ); 00073 #endif 00074 } 00075 00076 void ff_gradfun_filter_line_ssse3(uint8_t *dst, uint8_t *src, uint16_t *dc, int width, int thresh, const uint16_t *dithers) 00077 { 00078 #if HAVE_SSSE3 00079 intptr_t x; 00080 if (width & 7) { 00081 // could be 10% faster if I somehow eliminated this 00082 x = width & ~7; 00083 ff_gradfun_filter_line_c(dst + x, src + x, dc + x / 2, width - x, thresh, dithers); 00084 width = x; 00085 } 00086 x = -width; 00087 __asm__ volatile( 00088 "movd %4, %%xmm5 \n" 00089 "pxor %%xmm7, %%xmm7 \n" 00090 "pshuflw $0,%%xmm5, %%xmm5 \n" 00091 "movdqa %6, %%xmm6 \n" 00092 "punpcklqdq %%xmm5, %%xmm5 \n" 00093 "movdqa %5, %%xmm4 \n" 00094 "1: \n" 00095 "movq (%2,%0), %%xmm0 \n" 00096 "movq (%3,%0), %%xmm1 \n" 00097 "punpcklbw %%xmm7, %%xmm0 \n" 00098 "punpcklwd %%xmm1, %%xmm1 \n" 00099 "psllw $7, %%xmm0 \n" 00100 "psubw %%xmm0, %%xmm1 \n" // delta = dc - pix 00101 "pabsw %%xmm1, %%xmm2 \n" 00102 "pmulhuw %%xmm5, %%xmm2 \n" // m = abs(delta) * thresh >> 16 00103 "psubw %%xmm6, %%xmm2 \n" 00104 "pminsw %%xmm7, %%xmm2 \n" // m = -max(0, 127-m) 00105 "pmullw %%xmm2, %%xmm2 \n" 00106 "psllw $1, %%xmm2 \n" 00107 "paddw %%xmm4, %%xmm0 \n" // pix += dither 00108 "pmulhrsw %%xmm2, %%xmm1 \n" // m = m*m*delta >> 14 00109 "paddw %%xmm1, %%xmm0 \n" // pix += m 00110 "psraw $7, %%xmm0 \n" 00111 "packuswb %%xmm0, %%xmm0 \n" 00112 "movq %%xmm0, (%1,%0) \n" // dst = clip(pix>>7) 00113 "add $8, %0 \n" 00114 "jl 1b \n" 00115 :"+&r"(x) 00116 :"r"(dst+width), "r"(src+width), "r"(dc+width/2), 00117 "rm"(thresh), "m"(*dithers), "m"(*pw_7f) 00118 :"memory" 00119 ); 00120 #endif // HAVE_SSSE3 00121 } 00122 00123 void ff_gradfun_blur_line_sse2(uint16_t *dc, uint16_t *buf, uint16_t *buf1, uint8_t *src, int src_linesize, int width) 00124 { 00125 #if HAVE_SSE 00126 #define BLURV(load)\ 00127 intptr_t x = -2*width;\ 00128 __asm__ volatile(\ 00129 "movdqa %6, %%xmm7 \n"\ 00130 "1: \n"\ 00131 load" (%4,%0), %%xmm0 \n"\ 00132 load" (%5,%0), %%xmm1 \n"\ 00133 "movdqa %%xmm0, %%xmm2 \n"\ 00134 "movdqa %%xmm1, %%xmm3 \n"\ 00135 "psrlw $8, %%xmm0 \n"\ 00136 "psrlw $8, %%xmm1 \n"\ 00137 "pand %%xmm7, %%xmm2 \n"\ 00138 "pand %%xmm7, %%xmm3 \n"\ 00139 "paddw %%xmm1, %%xmm0 \n"\ 00140 "paddw %%xmm3, %%xmm2 \n"\ 00141 "paddw %%xmm2, %%xmm0 \n"\ 00142 "paddw (%2,%0), %%xmm0 \n"\ 00143 "movdqa (%1,%0), %%xmm1 \n"\ 00144 "movdqa %%xmm0, (%1,%0) \n"\ 00145 "psubw %%xmm1, %%xmm0 \n"\ 00146 "movdqa %%xmm0, (%3,%0) \n"\ 00147 "add $16, %0 \n"\ 00148 "jl 1b \n"\ 00149 :"+&r"(x)\ 00150 :"r"(buf+width),\ 00151 "r"(buf1+width),\ 00152 "r"(dc+width),\ 00153 "r"(src+width*2),\ 00154 "r"(src+width*2+src_linesize),\ 00155 "m"(*pw_ff)\ 00156 :"memory"\ 00157 ); 00158 if (((intptr_t) src | src_linesize) & 15) { 00159 BLURV("movdqu"); 00160 } else { 00161 BLURV("movdqa"); 00162 } 00163 #endif // HAVE_SSE 00164 }