Libav 0.7.1
/*
 * Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at)
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/x86_cpu.h"

#define ALIGN_MASK "$-8"

#undef REAL_PAVGB
#undef PAVGB
#undef PMINUB
#undef PMAXUB

/* Byte-wise rounded average: a single instruction on MMX2 (pavgb) or 3DNow! (pavgusb). */
#if   HAVE_MMX2
#define REAL_PAVGB(a,b) "pavgb " #a ", " #b " \n\t"
#elif HAVE_AMD3DNOW
#define REAL_PAVGB(a,b) "pavgusb " #a ", " #b " \n\t"
#endif
#define PAVGB(a,b)  REAL_PAVGB(a,b)

/* Byte-wise unsigned minimum: pminub on MMX2, otherwise emulated on plain MMX
 * via saturating subtraction (a -= saturate(a - b)), clobbering the temporary t. */
#if   HAVE_MMX2
#define PMINUB(a,b,t) "pminub " #a ", " #b " \n\t"
#elif HAVE_MMX
#define PMINUB(b,a,t) \
    "movq " #a ", " #t " \n\t"\
    "psubusb " #b ", " #t " \n\t"\
    "psubb " #t ", " #a " \n\t"
#endif

/* Byte-wise unsigned maximum: pmaxub on MMX2, otherwise emulated on plain MMX
 * as b = a + saturate(b - a). */
#if   HAVE_MMX2
#define PMAXUB(a,b) "pmaxub " #a ", " #b " \n\t"
#elif HAVE_MMX
#define PMAXUB(a,b) \
    "psubusb " #a ", " #b " \n\t"\
    "paddb " #a ", " #b " \n\t"
#endif

//FIXME? |255-0| = 1 (should not be a problem ...)
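/*
 * The scalar sketch below is not from the original file (it is guarded out
 * with "#if 0" and uses made-up helper names); it only illustrates the
 * byte-wise identities the PMINUB/PMAXUB fallbacks above rely on, and the
 * rounding behaviour assumed of pavgb/pavgusb.
 */
#if 0
#include <assert.h>
#include <stdint.h>

/* psubusb: unsigned saturating subtraction, max(a - b, 0) per byte. */
static uint8_t sat_sub_u8(uint8_t a, uint8_t b)
{
    return a > b ? a - b : 0;
}

/* PMINUB fallback: a - saturate(a - b) == min(a, b). */
static uint8_t min_u8(uint8_t a, uint8_t b)
{
    return a - sat_sub_u8(a, b);
}

/* PMAXUB fallback: a + saturate(b - a) == max(a, b). */
static uint8_t max_u8(uint8_t a, uint8_t b)
{
    return a + sat_sub_u8(b, a);
}

/* pavgb/pavgusb: average with upward rounding, (a + b + 1) / 2 per byte. */
static uint8_t avg_u8(uint8_t a, uint8_t b)
{
    return (uint8_t)(((unsigned)a + b + 1) >> 1);
}

static void check_byte_identities(void)
{
    assert(min_u8(200,  30) ==  30);
    assert(max_u8( 30, 200) == 200);
    assert(avg_u8(  1,   2) ==   2);   /* exact average 1.5 rounds up */
}
#endif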
00060 #if HAVE_MMX 00061 00064 static inline int RENAME(vertClassify)(uint8_t src[], int stride, PPContext *c){ 00065 int numEq= 0, dcOk; 00066 src+= stride*4; // src points to begin of the 8x8 Block 00067 __asm__ volatile( 00068 "movq %0, %%mm7 \n\t" 00069 "movq %1, %%mm6 \n\t" 00070 : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP]) 00071 ); 00072 00073 __asm__ volatile( 00074 "lea (%2, %3), %%"REG_a" \n\t" 00075 // 0 1 2 3 4 5 6 7 8 9 00076 // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2 00077 00078 "movq (%2), %%mm0 \n\t" 00079 "movq (%%"REG_a"), %%mm1 \n\t" 00080 "movq %%mm0, %%mm3 \n\t" 00081 "movq %%mm0, %%mm4 \n\t" 00082 PMAXUB(%%mm1, %%mm4) 00083 PMINUB(%%mm1, %%mm3, %%mm5) 00084 "psubb %%mm1, %%mm0 \n\t" // mm0 = differnece 00085 "paddb %%mm7, %%mm0 \n\t" 00086 "pcmpgtb %%mm6, %%mm0 \n\t" 00087 00088 "movq (%%"REG_a",%3), %%mm2 \n\t" 00089 PMAXUB(%%mm2, %%mm4) 00090 PMINUB(%%mm2, %%mm3, %%mm5) 00091 "psubb %%mm2, %%mm1 \n\t" 00092 "paddb %%mm7, %%mm1 \n\t" 00093 "pcmpgtb %%mm6, %%mm1 \n\t" 00094 "paddb %%mm1, %%mm0 \n\t" 00095 00096 "movq (%%"REG_a", %3, 2), %%mm1 \n\t" 00097 PMAXUB(%%mm1, %%mm4) 00098 PMINUB(%%mm1, %%mm3, %%mm5) 00099 "psubb %%mm1, %%mm2 \n\t" 00100 "paddb %%mm7, %%mm2 \n\t" 00101 "pcmpgtb %%mm6, %%mm2 \n\t" 00102 "paddb %%mm2, %%mm0 \n\t" 00103 00104 "lea (%%"REG_a", %3, 4), %%"REG_a" \n\t" 00105 00106 "movq (%2, %3, 4), %%mm2 \n\t" 00107 PMAXUB(%%mm2, %%mm4) 00108 PMINUB(%%mm2, %%mm3, %%mm5) 00109 "psubb %%mm2, %%mm1 \n\t" 00110 "paddb %%mm7, %%mm1 \n\t" 00111 "pcmpgtb %%mm6, %%mm1 \n\t" 00112 "paddb %%mm1, %%mm0 \n\t" 00113 00114 "movq (%%"REG_a"), %%mm1 \n\t" 00115 PMAXUB(%%mm1, %%mm4) 00116 PMINUB(%%mm1, %%mm3, %%mm5) 00117 "psubb %%mm1, %%mm2 \n\t" 00118 "paddb %%mm7, %%mm2 \n\t" 00119 "pcmpgtb %%mm6, %%mm2 \n\t" 00120 "paddb %%mm2, %%mm0 \n\t" 00121 00122 "movq (%%"REG_a", %3), %%mm2 \n\t" 00123 PMAXUB(%%mm2, %%mm4) 00124 PMINUB(%%mm2, %%mm3, %%mm5) 00125 "psubb %%mm2, %%mm1 \n\t" 00126 "paddb %%mm7, %%mm1 \n\t" 00127 "pcmpgtb %%mm6, %%mm1 \n\t" 00128 "paddb %%mm1, %%mm0 \n\t" 00129 00130 "movq (%%"REG_a", %3, 2), %%mm1 \n\t" 00131 PMAXUB(%%mm1, %%mm4) 00132 PMINUB(%%mm1, %%mm3, %%mm5) 00133 "psubb %%mm1, %%mm2 \n\t" 00134 "paddb %%mm7, %%mm2 \n\t" 00135 "pcmpgtb %%mm6, %%mm2 \n\t" 00136 "paddb %%mm2, %%mm0 \n\t" 00137 "psubusb %%mm3, %%mm4 \n\t" 00138 00139 " \n\t" 00140 #if HAVE_MMX2 00141 "pxor %%mm7, %%mm7 \n\t" 00142 "psadbw %%mm7, %%mm0 \n\t" 00143 #else 00144 "movq %%mm0, %%mm1 \n\t" 00145 "psrlw $8, %%mm0 \n\t" 00146 "paddb %%mm1, %%mm0 \n\t" 00147 "movq %%mm0, %%mm1 \n\t" 00148 "psrlq $16, %%mm0 \n\t" 00149 "paddb %%mm1, %%mm0 \n\t" 00150 "movq %%mm0, %%mm1 \n\t" 00151 "psrlq $32, %%mm0 \n\t" 00152 "paddb %%mm1, %%mm0 \n\t" 00153 #endif 00154 "movq %4, %%mm7 \n\t" // QP,..., QP 00155 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 
2QP 00156 "psubusb %%mm7, %%mm4 \n\t" // Diff <= 2QP -> 0 00157 "packssdw %%mm4, %%mm4 \n\t" 00158 "movd %%mm0, %0 \n\t" 00159 "movd %%mm4, %1 \n\t" 00160 00161 : "=r" (numEq), "=r" (dcOk) 00162 : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb) 00163 : "%"REG_a 00164 ); 00165 00166 numEq= (-numEq) &0xFF; 00167 if(numEq > c->ppMode.flatnessThreshold){ 00168 if(dcOk) return 0; 00169 else return 1; 00170 }else{ 00171 return 2; 00172 } 00173 } 00174 #endif //HAVE_MMX 00175 00180 #if !HAVE_ALTIVEC 00181 static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c) 00182 { 00183 #if HAVE_MMX2 || HAVE_AMD3DNOW 00184 src+= stride*3; 00185 __asm__ volatile( //"movv %0 %1 %2\n\t" 00186 "movq %2, %%mm0 \n\t" // QP,..., QP 00187 "pxor %%mm4, %%mm4 \n\t" 00188 00189 "movq (%0), %%mm6 \n\t" 00190 "movq (%0, %1), %%mm5 \n\t" 00191 "movq %%mm5, %%mm1 \n\t" 00192 "movq %%mm6, %%mm2 \n\t" 00193 "psubusb %%mm6, %%mm5 \n\t" 00194 "psubusb %%mm1, %%mm2 \n\t" 00195 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines 00196 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 00197 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF 00198 00199 "pand %%mm2, %%mm6 \n\t" 00200 "pandn %%mm1, %%mm2 \n\t" 00201 "por %%mm2, %%mm6 \n\t"// First Line to Filter 00202 00203 "movq (%0, %1, 8), %%mm5 \n\t" 00204 "lea (%0, %1, 4), %%"REG_a" \n\t" 00205 "lea (%0, %1, 8), %%"REG_c" \n\t" 00206 "sub %1, %%"REG_c" \n\t" 00207 "add %1, %0 \n\t" // %0 points to line 1 not 0 00208 "movq (%0, %1, 8), %%mm7 \n\t" 00209 "movq %%mm5, %%mm1 \n\t" 00210 "movq %%mm7, %%mm2 \n\t" 00211 "psubusb %%mm7, %%mm5 \n\t" 00212 "psubusb %%mm1, %%mm2 \n\t" 00213 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines 00214 "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 00215 "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF 00216 00217 "pand %%mm2, %%mm7 \n\t" 00218 "pandn %%mm1, %%mm2 \n\t" 00219 "por %%mm2, %%mm7 \n\t" // First Line to Filter 00220 00221 00222 // 1 2 3 4 5 6 7 8 00223 // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1 00224 // 6 4 2 2 1 1 00225 // 6 4 4 2 00226 // 6 8 2 00227 00228 "movq (%0, %1), %%mm0 \n\t" // 1 00229 "movq %%mm0, %%mm1 \n\t" // 1 00230 PAVGB(%%mm6, %%mm0) //1 1 /2 00231 PAVGB(%%mm6, %%mm0) //3 1 /4 00232 00233 "movq (%0, %1, 4), %%mm2 \n\t" // 1 00234 "movq %%mm2, %%mm5 \n\t" // 1 00235 PAVGB((%%REGa), %%mm2) // 11 /2 00236 PAVGB((%0, %1, 2), %%mm2) // 211 /4 00237 "movq %%mm2, %%mm3 \n\t" // 211 /4 00238 "movq (%0), %%mm4 \n\t" // 1 00239 PAVGB(%%mm4, %%mm3) // 4 211 /8 00240 PAVGB(%%mm0, %%mm3) //642211 /16 00241 "movq %%mm3, (%0) \n\t" // X 00242 // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9 00243 "movq %%mm1, %%mm0 \n\t" // 1 00244 PAVGB(%%mm6, %%mm0) //1 1 /2 00245 "movq %%mm4, %%mm3 \n\t" // 1 00246 PAVGB((%0,%1,2), %%mm3) // 1 1 /2 00247 PAVGB((%%REGa,%1,2), %%mm5) // 11 /2 00248 PAVGB((%%REGa), %%mm5) // 211 /4 00249 PAVGB(%%mm5, %%mm3) // 2 2211 /8 00250 PAVGB(%%mm0, %%mm3) //4242211 /16 00251 "movq %%mm3, (%0,%1) \n\t" // X 00252 // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9 00253 PAVGB(%%mm4, %%mm6) //11 /2 00254 "movq (%%"REG_c"), %%mm0 \n\t" // 1 00255 PAVGB((%%REGa, %1, 2), %%mm0) // 11/2 00256 "movq %%mm0, %%mm3 \n\t" // 11/2 00257 PAVGB(%%mm1, %%mm0) // 2 11/4 00258 PAVGB(%%mm6, %%mm0) //222 11/8 00259 PAVGB(%%mm2, %%mm0) //22242211/16 00260 "movq (%0, %1, 2), %%mm2 \n\t" // 1 00261 "movq %%mm0, (%0, %1, 2) \n\t" // X 00262 // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9 00263 "movq (%%"REG_a", %1, 4), %%mm0 \n\t" // 1 00264 PAVGB((%%REGc), %%mm0) // 11 /2 00265 PAVGB(%%mm0, %%mm6) //11 11 /4 
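        /* Each pavgb chain here accumulates one output row of the /16-weighted
         * vertical low-pass (the tap weights are tracked in the comments above);
         * since pavgb rounds up at every halving step, the result may differ
         * slightly from the exact integer sums used by the C fallback below. */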
00266 PAVGB(%%mm1, %%mm4) // 11 /2 00267 PAVGB(%%mm2, %%mm1) // 11 /2 00268 PAVGB(%%mm1, %%mm6) //1122 11 /8 00269 PAVGB(%%mm5, %%mm6) //112242211 /16 00270 "movq (%%"REG_a"), %%mm5 \n\t" // 1 00271 "movq %%mm6, (%%"REG_a") \n\t" // X 00272 // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9 00273 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" // 1 00274 PAVGB(%%mm7, %%mm6) // 11 /2 00275 PAVGB(%%mm4, %%mm6) // 11 11 /4 00276 PAVGB(%%mm3, %%mm6) // 11 2211 /8 00277 PAVGB(%%mm5, %%mm2) // 11 /2 00278 "movq (%0, %1, 4), %%mm4 \n\t" // 1 00279 PAVGB(%%mm4, %%mm2) // 112 /4 00280 PAVGB(%%mm2, %%mm6) // 112242211 /16 00281 "movq %%mm6, (%0, %1, 4) \n\t" // X 00282 // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9 00283 PAVGB(%%mm7, %%mm1) // 11 2 /4 00284 PAVGB(%%mm4, %%mm5) // 11 /2 00285 PAVGB(%%mm5, %%mm0) // 11 11 /4 00286 "movq (%%"REG_a", %1, 2), %%mm6 \n\t" // 1 00287 PAVGB(%%mm6, %%mm1) // 11 4 2 /8 00288 PAVGB(%%mm0, %%mm1) // 11224222 /16 00289 "movq %%mm1, (%%"REG_a", %1, 2) \n\t" // X 00290 // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9 00291 PAVGB((%%REGc), %%mm2) // 112 4 /8 00292 "movq (%%"REG_a", %1, 4), %%mm0 \n\t" // 1 00293 PAVGB(%%mm0, %%mm6) // 1 1 /2 00294 PAVGB(%%mm7, %%mm6) // 1 12 /4 00295 PAVGB(%%mm2, %%mm6) // 1122424 /4 00296 "movq %%mm6, (%%"REG_c") \n\t" // X 00297 // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9 00298 PAVGB(%%mm7, %%mm5) // 11 2 /4 00299 PAVGB(%%mm7, %%mm5) // 11 6 /8 00300 00301 PAVGB(%%mm3, %%mm0) // 112 /4 00302 PAVGB(%%mm0, %%mm5) // 112246 /16 00303 "movq %%mm5, (%%"REG_a", %1, 4) \n\t" // X 00304 "sub %1, %0 \n\t" 00305 00306 : 00307 : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb) 00308 : "%"REG_a, "%"REG_c 00309 ); 00310 #else //HAVE_MMX2 || HAVE_AMD3DNOW 00311 const int l1= stride; 00312 const int l2= stride + l1; 00313 const int l3= stride + l2; 00314 const int l4= stride + l3; 00315 const int l5= stride + l4; 00316 const int l6= stride + l5; 00317 const int l7= stride + l6; 00318 const int l8= stride + l7; 00319 const int l9= stride + l8; 00320 int x; 00321 src+= stride*3; 00322 for(x=0; x<BLOCK_SIZE; x++){ 00323 const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1]; 00324 const int last= FFABS(src[l8] - src[l9]) < c->QP ? 
src[l9] : src[l8]; 00325 00326 int sums[10]; 00327 sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4; 00328 sums[1] = sums[0] - first + src[l4]; 00329 sums[2] = sums[1] - first + src[l5]; 00330 sums[3] = sums[2] - first + src[l6]; 00331 sums[4] = sums[3] - first + src[l7]; 00332 sums[5] = sums[4] - src[l1] + src[l8]; 00333 sums[6] = sums[5] - src[l2] + last; 00334 sums[7] = sums[6] - src[l3] + last; 00335 sums[8] = sums[7] - src[l4] + last; 00336 sums[9] = sums[8] - src[l5] + last; 00337 00338 src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4; 00339 src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4; 00340 src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4; 00341 src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4; 00342 src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4; 00343 src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4; 00344 src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4; 00345 src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4; 00346 00347 src++; 00348 } 00349 #endif //HAVE_MMX2 || HAVE_AMD3DNOW 00350 } 00351 #endif //HAVE_ALTIVEC 00352 00360 static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co) 00361 { 00362 #if HAVE_MMX2 || HAVE_AMD3DNOW 00363 src+= stride*3; 00364 00365 __asm__ volatile( 00366 "pxor %%mm7, %%mm7 \n\t" // 0 00367 "lea (%0, %1), %%"REG_a" \n\t" 00368 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" 00369 // 0 1 2 3 4 5 6 7 8 9 00370 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 00371 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" // line 3 00372 "movq (%0, %1, 4), %%mm1 \n\t" // line 4 00373 "movq %%mm1, %%mm2 \n\t" // line 4 00374 "psubusb %%mm0, %%mm1 \n\t" 00375 "psubusb %%mm2, %%mm0 \n\t" 00376 "por %%mm1, %%mm0 \n\t" // |l2 - l3| 00377 "movq (%%"REG_c"), %%mm3 \n\t" // line 5 00378 "movq (%%"REG_c", %1), %%mm4 \n\t" // line 6 00379 "movq %%mm3, %%mm5 \n\t" // line 5 00380 "psubusb %%mm4, %%mm3 \n\t" 00381 "psubusb %%mm5, %%mm4 \n\t" 00382 "por %%mm4, %%mm3 \n\t" // |l5 - l6| 00383 PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2 00384 "movq %%mm2, %%mm1 \n\t" // line 4 00385 "psubusb %%mm5, %%mm2 \n\t" 00386 "movq %%mm2, %%mm4 \n\t" 00387 "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0 00388 "psubusb %%mm1, %%mm5 \n\t" 00389 "por %%mm5, %%mm4 \n\t" // |l4 - l5| 00390 "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2) 00391 "movq %%mm4, %%mm3 \n\t" // d 00392 "movq %2, %%mm0 \n\t" 00393 "paddusb %%mm0, %%mm0 \n\t" 00394 "psubusb %%mm0, %%mm4 \n\t" 00395 "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0 00396 "psubusb "MANGLE(b01)", %%mm3 \n\t" 00397 "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0 00398 00399 PAVGB(%%mm7, %%mm3) // d/2 00400 "movq %%mm3, %%mm1 \n\t" // d/2 00401 PAVGB(%%mm7, %%mm3) // d/4 00402 PAVGB(%%mm1, %%mm3) // 3*d/8 00403 00404 "movq (%0, %1, 4), %%mm0 \n\t" // line 4 00405 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 00406 "psubusb %%mm3, %%mm0 \n\t" 00407 "pxor %%mm2, %%mm0 \n\t" 00408 "movq %%mm0, (%0, %1, 4) \n\t" // line 4 00409 00410 "movq (%%"REG_c"), %%mm0 \n\t" // line 5 00411 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 00412 "paddusb %%mm3, %%mm0 \n\t" 00413 "pxor %%mm2, %%mm0 \n\t" 00414 "movq %%mm0, (%%"REG_c") \n\t" // line 5 00415 00416 PAVGB(%%mm7, %%mm1) // d/4 00417 00418 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" // line 3 00419 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? 
-l4-1 : l4 00420 "psubusb %%mm1, %%mm0 \n\t" 00421 "pxor %%mm2, %%mm0 \n\t" 00422 "movq %%mm0, (%%"REG_a", %1, 2) \n\t" // line 3 00423 00424 "movq (%%"REG_c", %1), %%mm0 \n\t" // line 6 00425 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 00426 "paddusb %%mm1, %%mm0 \n\t" 00427 "pxor %%mm2, %%mm0 \n\t" 00428 "movq %%mm0, (%%"REG_c", %1) \n\t" // line 6 00429 00430 PAVGB(%%mm7, %%mm1) // d/8 00431 00432 "movq (%%"REG_a", %1), %%mm0 \n\t" // line 2 00433 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2 00434 "psubusb %%mm1, %%mm0 \n\t" 00435 "pxor %%mm2, %%mm0 \n\t" 00436 "movq %%mm0, (%%"REG_a", %1) \n\t" // line 2 00437 00438 "movq (%%"REG_c", %1, 2), %%mm0 \n\t" // line 7 00439 "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7 00440 "paddusb %%mm1, %%mm0 \n\t" 00441 "pxor %%mm2, %%mm0 \n\t" 00442 "movq %%mm0, (%%"REG_c", %1, 2) \n\t" // line 7 00443 00444 : 00445 : "r" (src), "r" ((x86_reg)stride), "m" (co->pQPb) 00446 : "%"REG_a, "%"REG_c 00447 ); 00448 #else //HAVE_MMX2 || HAVE_AMD3DNOW 00449 00450 const int l1= stride; 00451 const int l2= stride + l1; 00452 const int l3= stride + l2; 00453 const int l4= stride + l3; 00454 const int l5= stride + l4; 00455 const int l6= stride + l5; 00456 const int l7= stride + l6; 00457 // const int l8= stride + l7; 00458 // const int l9= stride + l8; 00459 int x; 00460 00461 src+= stride*3; 00462 for(x=0; x<BLOCK_SIZE; x++){ 00463 int a= src[l3] - src[l4]; 00464 int b= src[l4] - src[l5]; 00465 int c= src[l5] - src[l6]; 00466 00467 int d= FFABS(b) - ((FFABS(a) + FFABS(c))>>1); 00468 d= FFMAX(d, 0); 00469 00470 if(d < co->QP*2){ 00471 int v = d * FFSIGN(-b); 00472 00473 src[l2] +=v>>3; 00474 src[l3] +=v>>2; 00475 src[l4] +=(3*v)>>3; 00476 src[l5] -=(3*v)>>3; 00477 src[l6] -=v>>2; 00478 src[l7] -=v>>3; 00479 } 00480 src++; 00481 } 00482 #endif //HAVE_MMX2 || HAVE_AMD3DNOW 00483 } 00484 00485 #if !HAVE_ALTIVEC 00486 static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c) 00487 { 00488 #if HAVE_MMX2 || HAVE_AMD3DNOW 00489 /* 00490 uint8_t tmp[16]; 00491 const int l1= stride; 00492 const int l2= stride + l1; 00493 const int l3= stride + l2; 00494 const int l4= (int)tmp - (int)src - stride*3; 00495 const int l5= (int)tmp - (int)src - stride*3 + 8; 00496 const int l6= stride*3 + l3; 00497 const int l7= stride + l6; 00498 const int l8= stride + l7; 00499 00500 memcpy(tmp, src+stride*7, 8); 00501 memcpy(tmp+8, src+stride*8, 8); 00502 */ 00503 src+= stride*4; 00504 __asm__ volatile( 00505 00506 #if 0 //slightly more accurate and slightly slower 00507 "pxor %%mm7, %%mm7 \n\t" // 0 00508 "lea (%0, %1), %%"REG_a" \n\t" 00509 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" 00510 // 0 1 2 3 4 5 6 7 00511 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 00512 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 00513 00514 00515 "movq (%0, %1, 2), %%mm0 \n\t" // l2 00516 "movq (%0), %%mm1 \n\t" // l0 00517 "movq %%mm0, %%mm2 \n\t" // l2 00518 PAVGB(%%mm7, %%mm0) // ~l2/2 00519 PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4 00520 PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8 00521 00522 "movq (%%"REG_a"), %%mm1 \n\t" // l1 00523 "movq (%%"REG_a", %1, 2), %%mm3 \n\t" // l3 00524 "movq %%mm1, %%mm4 \n\t" // l1 00525 PAVGB(%%mm7, %%mm1) // ~l1/2 00526 PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4 00527 PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8 00528 00529 "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8 00530 "psubusb %%mm1, %%mm0 \n\t" 00531 "psubusb %%mm4, %%mm1 \n\t" 00532 "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8 00533 // mm1= |lenergy|, mm2= l2, 
mm3= l3, mm7=0 00534 00535 "movq (%0, %1, 4), %%mm0 \n\t" // l4 00536 "movq %%mm0, %%mm4 \n\t" // l4 00537 PAVGB(%%mm7, %%mm0) // ~l4/2 00538 PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4 00539 PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8 00540 00541 "movq (%%"REG_c"), %%mm2 \n\t" // l5 00542 "movq %%mm3, %%mm5 \n\t" // l3 00543 PAVGB(%%mm7, %%mm3) // ~l3/2 00544 PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4 00545 PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8 00546 00547 "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8 00548 "psubusb %%mm3, %%mm0 \n\t" 00549 "psubusb %%mm6, %%mm3 \n\t" 00550 "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8 00551 "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5) 00552 // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0 00553 00554 "movq (%%"REG_c", %1), %%mm6 \n\t" // l6 00555 "movq %%mm6, %%mm5 \n\t" // l6 00556 PAVGB(%%mm7, %%mm6) // ~l6/2 00557 PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4 00558 PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8 00559 00560 "movq (%%"REG_c", %1, 2), %%mm5 \n\t" // l7 00561 "movq %%mm2, %%mm4 \n\t" // l5 00562 PAVGB(%%mm7, %%mm2) // ~l5/2 00563 PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4 00564 PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8 00565 00566 "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8 00567 "psubusb %%mm2, %%mm6 \n\t" 00568 "psubusb %%mm4, %%mm2 \n\t" 00569 "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8 00570 // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0 00571 00572 00573 PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8 00574 "movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ? 00575 "paddusb "MANGLE(b01)", %%mm4 \n\t" 00576 "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP 00577 "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8 00578 "pand %%mm4, %%mm3 \n\t" 00579 00580 "movq %%mm3, %%mm1 \n\t" 00581 // "psubusb "MANGLE(b01)", %%mm3 \n\t" 00582 PAVGB(%%mm7, %%mm3) 00583 PAVGB(%%mm7, %%mm3) 00584 "paddusb %%mm1, %%mm3 \n\t" 00585 // "paddusb "MANGLE(b01)", %%mm3 \n\t" 00586 00587 "movq (%%"REG_a", %1, 2), %%mm6 \n\t" //l3 00588 "movq (%0, %1, 4), %%mm5 \n\t" //l4 00589 "movq (%0, %1, 4), %%mm4 \n\t" //l4 00590 "psubusb %%mm6, %%mm5 \n\t" 00591 "psubusb %%mm4, %%mm6 \n\t" 00592 "por %%mm6, %%mm5 \n\t" // |l3-l4| 00593 "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4) 00594 "pxor %%mm6, %%mm0 \n\t" 00595 "pand %%mm0, %%mm3 \n\t" 00596 PMINUB(%%mm5, %%mm3, %%mm0) 00597 00598 "psubusb "MANGLE(b01)", %%mm3 \n\t" 00599 PAVGB(%%mm7, %%mm3) 00600 00601 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" 00602 "movq (%0, %1, 4), %%mm2 \n\t" 00603 "pxor %%mm6, %%mm0 \n\t" 00604 "pxor %%mm6, %%mm2 \n\t" 00605 "psubb %%mm3, %%mm0 \n\t" 00606 "paddb %%mm3, %%mm2 \n\t" 00607 "pxor %%mm6, %%mm0 \n\t" 00608 "pxor %%mm6, %%mm2 \n\t" 00609 "movq %%mm0, (%%"REG_a", %1, 2) \n\t" 00610 "movq %%mm2, (%0, %1, 4) \n\t" 00611 #endif //0 00612 00613 "lea (%0, %1), %%"REG_a" \n\t" 00614 "pcmpeqb %%mm6, %%mm6 \n\t" // -1 00615 // 0 1 2 3 4 5 6 7 00616 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 00617 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 00618 00619 00620 "movq (%%"REG_a", %1, 2), %%mm1 \n\t" // l3 00621 "movq (%0, %1, 4), %%mm0 \n\t" // l4 00622 "pxor %%mm6, %%mm1 \n\t" // -l3-1 00623 PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2 00624 // mm1=-l3-1, mm0=128-q 00625 00626 "movq (%%"REG_a", %1, 4), %%mm2 \n\t" // l5 00627 "movq (%%"REG_a", %1), %%mm3 \n\t" // l2 00628 "pxor %%mm6, %%mm2 \n\t" // -l5-1 00629 "movq %%mm2, %%mm5 \n\t" // -l5-1 00630 "movq "MANGLE(b80)", %%mm4 \n\t" // 128 00631 "lea 
(%%"REG_a", %1, 4), %%"REG_c" \n\t" 00632 PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2 00633 PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128 00634 PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128 00635 PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128 00636 // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1 00637 00638 "movq (%%"REG_a"), %%mm2 \n\t" // l1 00639 "pxor %%mm6, %%mm2 \n\t" // -l1-1 00640 PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2 00641 PAVGB((%0), %%mm1) // (l0-l3+256)/2 00642 "movq "MANGLE(b80)", %%mm3 \n\t" // 128 00643 PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128 00644 PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128 00645 PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128 00646 // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1 00647 00648 PAVGB((%%REGc, %1), %%mm5) // (l6-l5+256)/2 00649 "movq (%%"REG_c", %1, 2), %%mm1 \n\t" // l7 00650 "pxor %%mm6, %%mm1 \n\t" // -l7-1 00651 PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2 00652 "movq "MANGLE(b80)", %%mm2 \n\t" // 128 00653 PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128 00654 PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128 00655 PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128 00656 // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128 00657 00658 "movq "MANGLE(b00)", %%mm1 \n\t" // 0 00659 "movq "MANGLE(b00)", %%mm5 \n\t" // 0 00660 "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16 00661 "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16 00662 PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16| 00663 PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16| 00664 PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16 00665 00666 // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128 00667 00668 "movq "MANGLE(b00)", %%mm7 \n\t" // 0 00669 "movq %2, %%mm2 \n\t" // QP 00670 PAVGB(%%mm6, %%mm2) // 128 + QP/2 00671 "psubb %%mm6, %%mm2 \n\t" 00672 00673 "movq %%mm4, %%mm1 \n\t" 00674 "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy) 00675 "pxor %%mm1, %%mm4 \n\t" 00676 "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16 00677 "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2 00678 "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16 00679 // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16 00680 00681 "movq %%mm4, %%mm3 \n\t" // d 00682 "psubusb "MANGLE(b01)", %%mm4 \n\t" 00683 PAVGB(%%mm7, %%mm4) // d/32 00684 PAVGB(%%mm7, %%mm4) // (d + 32)/64 00685 "paddb %%mm3, %%mm4 \n\t" // 5d/64 00686 "pand %%mm2, %%mm4 \n\t" 00687 00688 "movq "MANGLE(b80)", %%mm5 \n\t" // 128 00689 "psubb %%mm0, %%mm5 \n\t" // q 00690 "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding 00691 "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q) 00692 "pxor %%mm7, %%mm5 \n\t" 00693 00694 PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64) 00695 "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q) 00696 00697 "pand %%mm7, %%mm4 \n\t" 00698 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" 00699 "movq (%0, %1, 4), %%mm2 \n\t" 00700 "pxor %%mm1, %%mm0 \n\t" 00701 "pxor %%mm1, %%mm2 \n\t" 00702 "paddb %%mm4, %%mm0 \n\t" 00703 "psubb %%mm4, %%mm2 \n\t" 00704 "pxor %%mm1, %%mm0 \n\t" 00705 "pxor %%mm1, %%mm2 \n\t" 00706 "movq %%mm0, (%%"REG_a", %1, 2) \n\t" 00707 "movq %%mm2, (%0, %1, 4) \n\t" 00708 00709 : 00710 : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb) 00711 : "%"REG_a, "%"REG_c 00712 ); 00713 00714 /* 00715 { 00716 int x; 00717 src-= stride; 00718 for(x=0; x<BLOCK_SIZE; x++){ 00719 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); 00720 if(FFABS(middleEnergy)< 8*QP){ 00721 const int q=(src[l4] - src[l5])/2; 00722 
const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); 00723 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); 00724 00725 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) ); 00726 d= FFMAX(d, 0); 00727 00728 d= (5*d + 32) >> 6; 00729 d*= FFSIGN(-middleEnergy); 00730 00731 if(q>0){ 00732 d= d<0 ? 0 : d; 00733 d= d>q ? q : d; 00734 }else{ 00735 d= d>0 ? 0 : d; 00736 d= d<q ? q : d; 00737 } 00738 00739 src[l4]-= d; 00740 src[l5]+= d; 00741 } 00742 src++; 00743 } 00744 src-=8; 00745 for(x=0; x<8; x++){ 00746 int y; 00747 for(y=4; y<6; y++){ 00748 int d= src[x+y*stride] - tmp[x+(y-4)*8]; 00749 int ad= FFABS(d); 00750 static int max=0; 00751 static int sum=0; 00752 static int num=0; 00753 static int bias=0; 00754 00755 if(max<ad) max=ad; 00756 sum+= ad>3 ? 1 : 0; 00757 if(ad>3){ 00758 src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255; 00759 } 00760 if(y==4) bias+=d; 00761 num++; 00762 if(num%1000000 == 0){ 00763 av_log(c, AV_LOG_INFO, " %d %d %d %d\n", num, sum, max, bias); 00764 } 00765 } 00766 } 00767 } 00768 */ 00769 #elif HAVE_MMX 00770 src+= stride*4; 00771 __asm__ volatile( 00772 "pxor %%mm7, %%mm7 \n\t" 00773 "lea -40(%%"REG_SP"), %%"REG_c" \n\t" // make space for 4 8-byte vars 00774 "and "ALIGN_MASK", %%"REG_c" \n\t" // align 00775 // 0 1 2 3 4 5 6 7 00776 // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 edx+%1 edx+2%1 00777 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 00778 00779 "movq (%0), %%mm0 \n\t" 00780 "movq %%mm0, %%mm1 \n\t" 00781 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0 00782 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 00783 00784 "movq (%0, %1), %%mm2 \n\t" 00785 "lea (%0, %1, 2), %%"REG_a" \n\t" 00786 "movq %%mm2, %%mm3 \n\t" 00787 "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1 00788 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1 00789 00790 "movq (%%"REG_a"), %%mm4 \n\t" 00791 "movq %%mm4, %%mm5 \n\t" 00792 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2 00793 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2 00794 00795 "paddw %%mm0, %%mm0 \n\t" // 2L0 00796 "paddw %%mm1, %%mm1 \n\t" // 2H0 00797 "psubw %%mm4, %%mm2 \n\t" // L1 - L2 00798 "psubw %%mm5, %%mm3 \n\t" // H1 - H2 00799 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2 00800 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2 00801 00802 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2 00803 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2 00804 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 00805 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 00806 00807 "movq (%%"REG_a", %1), %%mm2 \n\t" 00808 "movq %%mm2, %%mm3 \n\t" 00809 "punpcklbw %%mm7, %%mm2 \n\t" // L3 00810 "punpckhbw %%mm7, %%mm3 \n\t" // H3 00811 00812 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 00813 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 00814 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 00815 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 00816 "movq %%mm0, (%%"REG_c") \n\t" // 2L0 - 5L1 + 5L2 - 2L3 00817 "movq %%mm1, 8(%%"REG_c") \n\t" // 2H0 - 5H1 + 5H2 - 2H3 00818 00819 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" 00820 "movq %%mm0, %%mm1 \n\t" 00821 "punpcklbw %%mm7, %%mm0 \n\t" // L4 00822 "punpckhbw %%mm7, %%mm1 \n\t" // H4 00823 00824 "psubw %%mm0, %%mm2 \n\t" // L3 - L4 00825 "psubw %%mm1, %%mm3 \n\t" // H3 - H4 00826 "movq %%mm2, 16(%%"REG_c") \n\t" // L3 - L4 00827 "movq %%mm3, 24(%%"REG_c") \n\t" // H3 - H4 00828 "paddw %%mm4, %%mm4 \n\t" // 2L2 00829 "paddw %%mm5, %%mm5 \n\t" // 2H2 00830 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 00831 "psubw %%mm3, %%mm5 
\n\t" // 2H2 - H3 + H4 00832 00833 "lea (%%"REG_a", %1), %0 \n\t" 00834 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 00835 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 00836 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 00837 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 00838 //50 opcodes so far 00839 "movq (%0, %1, 2), %%mm2 \n\t" 00840 "movq %%mm2, %%mm3 \n\t" 00841 "punpcklbw %%mm7, %%mm2 \n\t" // L5 00842 "punpckhbw %%mm7, %%mm3 \n\t" // H5 00843 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5 00844 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5 00845 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 00846 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 00847 00848 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" 00849 "punpcklbw %%mm7, %%mm6 \n\t" // L6 00850 "psubw %%mm6, %%mm2 \n\t" // L5 - L6 00851 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" 00852 "punpckhbw %%mm7, %%mm6 \n\t" // H6 00853 "psubw %%mm6, %%mm3 \n\t" // H5 - H6 00854 00855 "paddw %%mm0, %%mm0 \n\t" // 2L4 00856 "paddw %%mm1, %%mm1 \n\t" // 2H4 00857 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6 00858 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6 00859 00860 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6 00861 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6 00862 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 00863 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 00864 00865 "movq (%0, %1, 4), %%mm2 \n\t" 00866 "movq %%mm2, %%mm3 \n\t" 00867 "punpcklbw %%mm7, %%mm2 \n\t" // L7 00868 "punpckhbw %%mm7, %%mm3 \n\t" // H7 00869 00870 "paddw %%mm2, %%mm2 \n\t" // 2L7 00871 "paddw %%mm3, %%mm3 \n\t" // 2H7 00872 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 00873 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 00874 00875 "movq (%%"REG_c"), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 00876 "movq 8(%%"REG_c"), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 00877 00878 #if HAVE_MMX2 00879 "movq %%mm7, %%mm6 \n\t" // 0 00880 "psubw %%mm0, %%mm6 \n\t" 00881 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| 00882 "movq %%mm7, %%mm6 \n\t" // 0 00883 "psubw %%mm1, %%mm6 \n\t" 00884 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| 00885 "movq %%mm7, %%mm6 \n\t" // 0 00886 "psubw %%mm2, %%mm6 \n\t" 00887 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| 00888 "movq %%mm7, %%mm6 \n\t" // 0 00889 "psubw %%mm3, %%mm6 \n\t" 00890 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| 00891 #else 00892 "movq %%mm7, %%mm6 \n\t" // 0 00893 "pcmpgtw %%mm0, %%mm6 \n\t" 00894 "pxor %%mm6, %%mm0 \n\t" 00895 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| 00896 "movq %%mm7, %%mm6 \n\t" // 0 00897 "pcmpgtw %%mm1, %%mm6 \n\t" 00898 "pxor %%mm6, %%mm1 \n\t" 00899 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| 00900 "movq %%mm7, %%mm6 \n\t" // 0 00901 "pcmpgtw %%mm2, %%mm6 \n\t" 00902 "pxor %%mm6, %%mm2 \n\t" 00903 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| 00904 "movq %%mm7, %%mm6 \n\t" // 0 00905 "pcmpgtw %%mm3, %%mm6 \n\t" 00906 "pxor %%mm6, %%mm3 \n\t" 00907 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| 00908 #endif 00909 00910 #if HAVE_MMX2 00911 "pminsw %%mm2, %%mm0 \n\t" 00912 "pminsw %%mm3, %%mm1 \n\t" 00913 #else 00914 "movq %%mm0, %%mm6 \n\t" 00915 "psubusw %%mm2, %%mm6 \n\t" 00916 "psubw %%mm6, %%mm0 \n\t" 00917 "movq %%mm1, %%mm6 \n\t" 00918 "psubusw %%mm3, %%mm6 \n\t" 00919 "psubw %%mm6, %%mm1 \n\t" 00920 #endif 00921 00922 "movd %2, %%mm2 \n\t" // QP 00923 "punpcklbw %%mm7, %%mm2 \n\t" 00924 00925 "movq %%mm7, %%mm6 \n\t" // 0 00926 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5) 00927 "pxor %%mm6, %%mm4 \n\t" 00928 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 
+ 5L4 - 2L5| 00929 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) 00930 "pxor %%mm7, %%mm5 \n\t" 00931 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| 00932 // 100 opcodes 00933 "psllw $3, %%mm2 \n\t" // 8QP 00934 "movq %%mm2, %%mm3 \n\t" // 8QP 00935 "pcmpgtw %%mm4, %%mm2 \n\t" 00936 "pcmpgtw %%mm5, %%mm3 \n\t" 00937 "pand %%mm2, %%mm4 \n\t" 00938 "pand %%mm3, %%mm5 \n\t" 00939 00940 00941 "psubusw %%mm0, %%mm4 \n\t" // hd 00942 "psubusw %%mm1, %%mm5 \n\t" // ld 00943 00944 00945 "movq "MANGLE(w05)", %%mm2 \n\t" // 5 00946 "pmullw %%mm2, %%mm4 \n\t" 00947 "pmullw %%mm2, %%mm5 \n\t" 00948 "movq "MANGLE(w20)", %%mm2 \n\t" // 32 00949 "paddw %%mm2, %%mm4 \n\t" 00950 "paddw %%mm2, %%mm5 \n\t" 00951 "psrlw $6, %%mm4 \n\t" 00952 "psrlw $6, %%mm5 \n\t" 00953 00954 "movq 16(%%"REG_c"), %%mm0 \n\t" // L3 - L4 00955 "movq 24(%%"REG_c"), %%mm1 \n\t" // H3 - H4 00956 00957 "pxor %%mm2, %%mm2 \n\t" 00958 "pxor %%mm3, %%mm3 \n\t" 00959 00960 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4) 00961 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4) 00962 "pxor %%mm2, %%mm0 \n\t" 00963 "pxor %%mm3, %%mm1 \n\t" 00964 "psubw %%mm2, %%mm0 \n\t" // |L3-L4| 00965 "psubw %%mm3, %%mm1 \n\t" // |H3-H4| 00966 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2 00967 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2 00968 00969 "pxor %%mm6, %%mm2 \n\t" 00970 "pxor %%mm7, %%mm3 \n\t" 00971 "pand %%mm2, %%mm4 \n\t" 00972 "pand %%mm3, %%mm5 \n\t" 00973 00974 #if HAVE_MMX2 00975 "pminsw %%mm0, %%mm4 \n\t" 00976 "pminsw %%mm1, %%mm5 \n\t" 00977 #else 00978 "movq %%mm4, %%mm2 \n\t" 00979 "psubusw %%mm0, %%mm2 \n\t" 00980 "psubw %%mm2, %%mm4 \n\t" 00981 "movq %%mm5, %%mm2 \n\t" 00982 "psubusw %%mm1, %%mm2 \n\t" 00983 "psubw %%mm2, %%mm5 \n\t" 00984 #endif 00985 "pxor %%mm6, %%mm4 \n\t" 00986 "pxor %%mm7, %%mm5 \n\t" 00987 "psubw %%mm6, %%mm4 \n\t" 00988 "psubw %%mm7, %%mm5 \n\t" 00989 "packsswb %%mm5, %%mm4 \n\t" 00990 "movq (%0), %%mm0 \n\t" 00991 "paddb %%mm4, %%mm0 \n\t" 00992 "movq %%mm0, (%0) \n\t" 00993 "movq (%0, %1), %%mm0 \n\t" 00994 "psubb %%mm4, %%mm0 \n\t" 00995 "movq %%mm0, (%0, %1) \n\t" 00996 00997 : "+r" (src) 00998 : "r" ((x86_reg)stride), "m" (c->pQPb) 00999 : "%"REG_a, "%"REG_c 01000 ); 01001 #else //HAVE_MMX2 || HAVE_AMD3DNOW 01002 const int l1= stride; 01003 const int l2= stride + l1; 01004 const int l3= stride + l2; 01005 const int l4= stride + l3; 01006 const int l5= stride + l4; 01007 const int l6= stride + l5; 01008 const int l7= stride + l6; 01009 const int l8= stride + l7; 01010 // const int l9= stride + l8; 01011 int x; 01012 src+= stride*3; 01013 for(x=0; x<BLOCK_SIZE; x++){ 01014 const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); 01015 if(FFABS(middleEnergy) < 8*c->QP){ 01016 const int q=(src[l4] - src[l5])/2; 01017 const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); 01018 const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); 01019 01020 int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) ); 01021 d= FFMAX(d, 0); 01022 01023 d= (5*d + 32) >> 6; 01024 d*= FFSIGN(-middleEnergy); 01025 01026 if(q>0){ 01027 d= d<0 ? 0 : d; 01028 d= d>q ? q : d; 01029 }else{ 01030 d= d>0 ? 0 : d; 01031 d= d<q ? 
q : d; 01032 } 01033 01034 src[l4]-= d; 01035 src[l5]+= d; 01036 } 01037 src++; 01038 } 01039 #endif //HAVE_MMX2 || HAVE_AMD3DNOW 01040 } 01041 #endif //HAVE_ALTIVEC 01042 01043 #if !HAVE_ALTIVEC 01044 static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c) 01045 { 01046 #if HAVE_MMX2 || HAVE_AMD3DNOW 01047 __asm__ volatile( 01048 "pxor %%mm6, %%mm6 \n\t" 01049 "pcmpeqb %%mm7, %%mm7 \n\t" 01050 "movq %2, %%mm0 \n\t" 01051 "punpcklbw %%mm6, %%mm0 \n\t" 01052 "psrlw $1, %%mm0 \n\t" 01053 "psubw %%mm7, %%mm0 \n\t" 01054 "packuswb %%mm0, %%mm0 \n\t" 01055 "movq %%mm0, %3 \n\t" 01056 01057 "lea (%0, %1), %%"REG_a" \n\t" 01058 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 01059 01060 // 0 1 2 3 4 5 6 7 8 9 01061 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 01062 01063 #undef REAL_FIND_MIN_MAX 01064 #undef FIND_MIN_MAX 01065 #if HAVE_MMX2 01066 #define REAL_FIND_MIN_MAX(addr)\ 01067 "movq " #addr ", %%mm0 \n\t"\ 01068 "pminub %%mm0, %%mm7 \n\t"\ 01069 "pmaxub %%mm0, %%mm6 \n\t" 01070 #else 01071 #define REAL_FIND_MIN_MAX(addr)\ 01072 "movq " #addr ", %%mm0 \n\t"\ 01073 "movq %%mm7, %%mm1 \n\t"\ 01074 "psubusb %%mm0, %%mm6 \n\t"\ 01075 "paddb %%mm0, %%mm6 \n\t"\ 01076 "psubusb %%mm0, %%mm1 \n\t"\ 01077 "psubb %%mm1, %%mm7 \n\t" 01078 #endif 01079 #define FIND_MIN_MAX(addr) REAL_FIND_MIN_MAX(addr) 01080 01081 FIND_MIN_MAX((%%REGa)) 01082 FIND_MIN_MAX((%%REGa, %1)) 01083 FIND_MIN_MAX((%%REGa, %1, 2)) 01084 FIND_MIN_MAX((%0, %1, 4)) 01085 FIND_MIN_MAX((%%REGd)) 01086 FIND_MIN_MAX((%%REGd, %1)) 01087 FIND_MIN_MAX((%%REGd, %1, 2)) 01088 FIND_MIN_MAX((%0, %1, 8)) 01089 01090 "movq %%mm7, %%mm4 \n\t" 01091 "psrlq $8, %%mm7 \n\t" 01092 #if HAVE_MMX2 01093 "pminub %%mm4, %%mm7 \n\t" // min of pixels 01094 "pshufw $0xF9, %%mm7, %%mm4 \n\t" 01095 "pminub %%mm4, %%mm7 \n\t" // min of pixels 01096 "pshufw $0xFE, %%mm7, %%mm4 \n\t" 01097 "pminub %%mm4, %%mm7 \n\t" 01098 #else 01099 "movq %%mm7, %%mm1 \n\t" 01100 "psubusb %%mm4, %%mm1 \n\t" 01101 "psubb %%mm1, %%mm7 \n\t" 01102 "movq %%mm7, %%mm4 \n\t" 01103 "psrlq $16, %%mm7 \n\t" 01104 "movq %%mm7, %%mm1 \n\t" 01105 "psubusb %%mm4, %%mm1 \n\t" 01106 "psubb %%mm1, %%mm7 \n\t" 01107 "movq %%mm7, %%mm4 \n\t" 01108 "psrlq $32, %%mm7 \n\t" 01109 "movq %%mm7, %%mm1 \n\t" 01110 "psubusb %%mm4, %%mm1 \n\t" 01111 "psubb %%mm1, %%mm7 \n\t" 01112 #endif 01113 01114 01115 "movq %%mm6, %%mm4 \n\t" 01116 "psrlq $8, %%mm6 \n\t" 01117 #if HAVE_MMX2 01118 "pmaxub %%mm4, %%mm6 \n\t" // max of pixels 01119 "pshufw $0xF9, %%mm6, %%mm4 \n\t" 01120 "pmaxub %%mm4, %%mm6 \n\t" 01121 "pshufw $0xFE, %%mm6, %%mm4 \n\t" 01122 "pmaxub %%mm4, %%mm6 \n\t" 01123 #else 01124 "psubusb %%mm4, %%mm6 \n\t" 01125 "paddb %%mm4, %%mm6 \n\t" 01126 "movq %%mm6, %%mm4 \n\t" 01127 "psrlq $16, %%mm6 \n\t" 01128 "psubusb %%mm4, %%mm6 \n\t" 01129 "paddb %%mm4, %%mm6 \n\t" 01130 "movq %%mm6, %%mm4 \n\t" 01131 "psrlq $32, %%mm6 \n\t" 01132 "psubusb %%mm4, %%mm6 \n\t" 01133 "paddb %%mm4, %%mm6 \n\t" 01134 #endif 01135 "movq %%mm6, %%mm0 \n\t" // max 01136 "psubb %%mm7, %%mm6 \n\t" // max - min 01137 "movd %%mm6, %%ecx \n\t" 01138 "cmpb "MANGLE(deringThreshold)", %%cl \n\t" 01139 " jb 1f \n\t" 01140 "lea -24(%%"REG_SP"), %%"REG_c" \n\t" 01141 "and "ALIGN_MASK", %%"REG_c" \n\t" 01142 PAVGB(%%mm0, %%mm7) // a=(max + min)/2 01143 "punpcklbw %%mm7, %%mm7 \n\t" 01144 "punpcklbw %%mm7, %%mm7 \n\t" 01145 "punpcklbw %%mm7, %%mm7 \n\t" 01146 "movq %%mm7, (%%"REG_c") \n\t" 01147 01148 "movq (%0), %%mm0 \n\t" // L10 01149 "movq %%mm0, %%mm1 \n\t" // L10 01150 "movq %%mm0, %%mm2 \n\t" // 
L10 01151 "psllq $8, %%mm1 \n\t" 01152 "psrlq $8, %%mm2 \n\t" 01153 "movd -4(%0), %%mm3 \n\t" 01154 "movd 8(%0), %%mm4 \n\t" 01155 "psrlq $24, %%mm3 \n\t" 01156 "psllq $56, %%mm4 \n\t" 01157 "por %%mm3, %%mm1 \n\t" // L00 01158 "por %%mm4, %%mm2 \n\t" // L20 01159 "movq %%mm1, %%mm3 \n\t" // L00 01160 PAVGB(%%mm2, %%mm1) // (L20 + L00)/2 01161 PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4 01162 "psubusb %%mm7, %%mm0 \n\t" 01163 "psubusb %%mm7, %%mm2 \n\t" 01164 "psubusb %%mm7, %%mm3 \n\t" 01165 "pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1 01166 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 0 : -1 01167 "pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1 01168 "paddb %%mm2, %%mm0 \n\t" 01169 "paddb %%mm3, %%mm0 \n\t" 01170 01171 "movq (%%"REG_a"), %%mm2 \n\t" // L11 01172 "movq %%mm2, %%mm3 \n\t" // L11 01173 "movq %%mm2, %%mm4 \n\t" // L11 01174 "psllq $8, %%mm3 \n\t" 01175 "psrlq $8, %%mm4 \n\t" 01176 "movd -4(%%"REG_a"), %%mm5 \n\t" 01177 "movd 8(%%"REG_a"), %%mm6 \n\t" 01178 "psrlq $24, %%mm5 \n\t" 01179 "psllq $56, %%mm6 \n\t" 01180 "por %%mm5, %%mm3 \n\t" // L01 01181 "por %%mm6, %%mm4 \n\t" // L21 01182 "movq %%mm3, %%mm5 \n\t" // L01 01183 PAVGB(%%mm4, %%mm3) // (L21 + L01)/2 01184 PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4 01185 "psubusb %%mm7, %%mm2 \n\t" 01186 "psubusb %%mm7, %%mm4 \n\t" 01187 "psubusb %%mm7, %%mm5 \n\t" 01188 "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1 01189 "pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1 01190 "pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1 01191 "paddb %%mm4, %%mm2 \n\t" 01192 "paddb %%mm5, %%mm2 \n\t" 01193 // 0, 2, 3, 1 01194 #define REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \ 01195 "movq " #src ", " #sx " \n\t" /* src[0] */\ 01196 "movq " #sx ", " #lx " \n\t" /* src[0] */\ 01197 "movq " #sx ", " #t0 " \n\t" /* src[0] */\ 01198 "psllq $8, " #lx " \n\t"\ 01199 "psrlq $8, " #t0 " \n\t"\ 01200 "movd -4" #src ", " #t1 " \n\t"\ 01201 "psrlq $24, " #t1 " \n\t"\ 01202 "por " #t1 ", " #lx " \n\t" /* src[-1] */\ 01203 "movd 8" #src ", " #t1 " \n\t"\ 01204 "psllq $56, " #t1 " \n\t"\ 01205 "por " #t1 ", " #t0 " \n\t" /* src[+1] */\ 01206 "movq " #lx ", " #t1 " \n\t" /* src[-1] */\ 01207 PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\ 01208 PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\ 01209 PAVGB(lx, pplx) \ 01210 "movq " #lx ", 8(%%"REG_c") \n\t"\ 01211 "movq (%%"REG_c"), " #lx " \n\t"\ 01212 "psubusb " #lx ", " #t1 " \n\t"\ 01213 "psubusb " #lx ", " #t0 " \n\t"\ 01214 "psubusb " #lx ", " #sx " \n\t"\ 01215 "movq "MANGLE(b00)", " #lx " \n\t"\ 01216 "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\ 01217 "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\ 01218 "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 
0 : -1*/\ 01219 "paddb " #t1 ", " #t0 " \n\t"\ 01220 "paddb " #t0 ", " #sx " \n\t"\ 01221 \ 01222 PAVGB(plx, pplx) /* filtered */\ 01223 "movq " #dst ", " #t0 " \n\t" /* dst */\ 01224 "movq " #t0 ", " #t1 " \n\t" /* dst */\ 01225 "psubusb %3, " #t0 " \n\t"\ 01226 "paddusb %3, " #t1 " \n\t"\ 01227 PMAXUB(t0, pplx)\ 01228 PMINUB(t1, pplx, t0)\ 01229 "paddb " #sx ", " #ppsx " \n\t"\ 01230 "paddb " #psx ", " #ppsx " \n\t"\ 01231 "#paddb "MANGLE(b02)", " #ppsx " \n\t"\ 01232 "pand "MANGLE(b08)", " #ppsx " \n\t"\ 01233 "pcmpeqb " #lx ", " #ppsx " \n\t"\ 01234 "pand " #ppsx ", " #pplx " \n\t"\ 01235 "pandn " #dst ", " #ppsx " \n\t"\ 01236 "por " #pplx ", " #ppsx " \n\t"\ 01237 "movq " #ppsx ", " #dst " \n\t"\ 01238 "movq 8(%%"REG_c"), " #lx " \n\t" 01239 01240 #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \ 01241 REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) 01242 /* 01243 0000000 01244 1111111 01245 01246 1111110 01247 1111101 01248 1111100 01249 1111011 01250 1111010 01251 1111001 01252 01253 1111000 01254 1110111 01255 01256 */ 01257 //DERING_CORE(dst ,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1) 01258 DERING_CORE((%%REGa) ,(%%REGa, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) 01259 DERING_CORE((%%REGa, %1) ,(%%REGa, %1, 2),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) 01260 DERING_CORE((%%REGa, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) 01261 DERING_CORE((%0, %1, 4) ,(%%REGd) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) 01262 DERING_CORE((%%REGd) ,(%%REGd, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) 01263 DERING_CORE((%%REGd, %1) ,(%%REGd, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) 01264 DERING_CORE((%%REGd, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) 01265 DERING_CORE((%0, %1, 8) ,(%%REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) 01266 01267 "1: \n\t" 01268 : : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb), "m"(c->pQPb2) 01269 : "%"REG_a, "%"REG_d, "%"REG_c 01270 ); 01271 #else //HAVE_MMX2 || HAVE_AMD3DNOW 01272 int y; 01273 int min=255; 01274 int max=0; 01275 int avg; 01276 uint8_t *p; 01277 int s[10]; 01278 const int QP2= c->QP/2 + 1; 01279 01280 for(y=1; y<9; y++){ 01281 int x; 01282 p= src + stride*y; 01283 for(x=1; x<9; x++){ 01284 p++; 01285 if(*p > max) max= *p; 01286 if(*p < min) min= *p; 01287 } 01288 } 01289 avg= (min + max + 1)>>1; 01290 01291 if(max - min <deringThreshold) return; 01292 01293 for(y=0; y<10; y++){ 01294 int t = 0; 01295 01296 if(src[stride*y + 0] > avg) t+= 1; 01297 if(src[stride*y + 1] > avg) t+= 2; 01298 if(src[stride*y + 2] > avg) t+= 4; 01299 if(src[stride*y + 3] > avg) t+= 8; 01300 if(src[stride*y + 4] > avg) t+= 16; 01301 if(src[stride*y + 5] > avg) t+= 32; 01302 if(src[stride*y + 6] > avg) t+= 64; 01303 if(src[stride*y + 7] > avg) t+= 128; 01304 if(src[stride*y + 8] > avg) t+= 256; 01305 if(src[stride*y + 9] > avg) t+= 512; 01306 01307 t |= (~t)<<16; 01308 t &= (t<<1) & (t>>1); 01309 s[y] = t; 01310 } 01311 01312 for(y=1; y<9; y++){ 01313 int t = s[y-1] & s[y] & s[y+1]; 01314 t|= t>>16; 01315 s[y-1]= t; 01316 } 01317 01318 for(y=1; y<9; y++){ 01319 int x; 01320 int t = s[y-1]; 01321 01322 p= src + stride*y; 01323 for(x=1; x<9; x++){ 01324 p++; 01325 if(t & (1<<x)){ 01326 int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1)) 01327 +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1)) 01328 +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1)); 01329 f= (f + 8)>>4; 01330 01331 #ifdef DEBUG_DERING_THRESHOLD 01332 __asm__ volatile("emms\n\t":); 
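            /* DEBUG_DERING_THRESHOLD only: the emms above clears the MMX state so
             * the x87 float arithmetic in the statistics below is safe. The block
             * compares the 3x3-smoothed value f against the original pixel for
             * low-contrast blocks and occasionally logs the accumulated error,
             * presumably to help tune deringThreshold. */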
01333 { 01334 static long long numPixels=0; 01335 if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++; 01336 // if((max-min)<20 || (max-min)*QP<200) 01337 // if((max-min)*QP < 500) 01338 // if(max-min<QP/2) 01339 if(max-min < 20){ 01340 static int numSkipped=0; 01341 static int errorSum=0; 01342 static int worstQP=0; 01343 static int worstRange=0; 01344 static int worstDiff=0; 01345 int diff= (f - *p); 01346 int absDiff= FFABS(diff); 01347 int error= diff*diff; 01348 01349 if(x==1 || x==8 || y==1 || y==8) continue; 01350 01351 numSkipped++; 01352 if(absDiff > worstDiff){ 01353 worstDiff= absDiff; 01354 worstQP= QP; 01355 worstRange= max-min; 01356 } 01357 errorSum+= error; 01358 01359 if(1024LL*1024LL*1024LL % numSkipped == 0){ 01360 av_log(c, AV_LOG_INFO, "sum:%1.3f, skip:%d, wQP:%d, " 01361 "wRange:%d, wDiff:%d, relSkip:%1.3f\n", 01362 (float)errorSum/numSkipped, numSkipped, worstQP, worstRange, 01363 worstDiff, (float)numSkipped/numPixels); 01364 } 01365 } 01366 } 01367 #endif 01368 if (*p + QP2 < f) *p= *p + QP2; 01369 else if(*p - QP2 > f) *p= *p - QP2; 01370 else *p=f; 01371 } 01372 } 01373 } 01374 #ifdef DEBUG_DERING_THRESHOLD 01375 if(max-min < 20){ 01376 for(y=1; y<9; y++){ 01377 int x; 01378 int t = 0; 01379 p= src + stride*y; 01380 for(x=1; x<9; x++){ 01381 p++; 01382 *p = FFMIN(*p + 20, 255); 01383 } 01384 } 01385 // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255; 01386 } 01387 #endif 01388 #endif //HAVE_MMX2 || HAVE_AMD3DNOW 01389 } 01390 #endif //HAVE_ALTIVEC 01391 01398 static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride) 01399 { 01400 #if HAVE_MMX2 || HAVE_AMD3DNOW 01401 src+= 4*stride; 01402 __asm__ volatile( 01403 "lea (%0, %1), %%"REG_a" \n\t" 01404 "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" 01405 // 0 1 2 3 4 5 6 7 8 9 01406 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 01407 01408 "movq (%0), %%mm0 \n\t" 01409 "movq (%%"REG_a", %1), %%mm1 \n\t" 01410 PAVGB(%%mm1, %%mm0) 01411 "movq %%mm0, (%%"REG_a") \n\t" 01412 "movq (%0, %1, 4), %%mm0 \n\t" 01413 PAVGB(%%mm0, %%mm1) 01414 "movq %%mm1, (%%"REG_a", %1, 2) \n\t" 01415 "movq (%%"REG_c", %1), %%mm1 \n\t" 01416 PAVGB(%%mm1, %%mm0) 01417 "movq %%mm0, (%%"REG_c") \n\t" 01418 "movq (%0, %1, 8), %%mm0 \n\t" 01419 PAVGB(%%mm0, %%mm1) 01420 "movq %%mm1, (%%"REG_c", %1, 2) \n\t" 01421 01422 : : "r" (src), "r" ((x86_reg)stride) 01423 : "%"REG_a, "%"REG_c 01424 ); 01425 #else 01426 int a, b, x; 01427 src+= 4*stride; 01428 01429 for(x=0; x<2; x++){ 01430 a= *(uint32_t*)&src[stride*0]; 01431 b= *(uint32_t*)&src[stride*2]; 01432 *(uint32_t*)&src[stride*1]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 01433 a= *(uint32_t*)&src[stride*4]; 01434 *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 01435 b= *(uint32_t*)&src[stride*6]; 01436 *(uint32_t*)&src[stride*5]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 01437 a= *(uint32_t*)&src[stride*8]; 01438 *(uint32_t*)&src[stride*7]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 01439 src += 4; 01440 } 01441 #endif 01442 } 01443 01451 static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride) 01452 { 01453 #if HAVE_MMX2 || HAVE_AMD3DNOW 01454 src+= stride*3; 01455 __asm__ volatile( 01456 "lea (%0, %1), %%"REG_a" \n\t" 01457 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 01458 "lea (%%"REG_d", %1, 4), %%"REG_c" \n\t" 01459 "add %1, %%"REG_c" \n\t" 01460 "pxor %%mm7, %%mm7 \n\t" 01461 // 0 1 2 3 4 5 6 7 8 9 10 01462 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx 01463 01464 #define REAL_DEINT_CUBIC(a,b,c,d,e)\ 01465 "movq " #a 
", %%mm0 \n\t"\ 01466 "movq " #b ", %%mm1 \n\t"\ 01467 "movq " #d ", %%mm2 \n\t"\ 01468 "movq " #e ", %%mm3 \n\t"\ 01469 PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\ 01470 PAVGB(%%mm3, %%mm0) /* a(a+e) /2 */\ 01471 "movq %%mm0, %%mm2 \n\t"\ 01472 "punpcklbw %%mm7, %%mm0 \n\t"\ 01473 "punpckhbw %%mm7, %%mm2 \n\t"\ 01474 "movq %%mm1, %%mm3 \n\t"\ 01475 "punpcklbw %%mm7, %%mm1 \n\t"\ 01476 "punpckhbw %%mm7, %%mm3 \n\t"\ 01477 "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\ 01478 "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\ 01479 "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\ 01480 "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\ 01481 "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\ 01482 "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\ 01483 "packuswb %%mm3, %%mm1 \n\t"\ 01484 "movq %%mm1, " #c " \n\t" 01485 #define DEINT_CUBIC(a,b,c,d,e) REAL_DEINT_CUBIC(a,b,c,d,e) 01486 01487 DEINT_CUBIC((%0) , (%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd, %1)) 01488 DEINT_CUBIC((%%REGa, %1), (%0, %1, 4) , (%%REGd) , (%%REGd, %1), (%0, %1, 8)) 01489 DEINT_CUBIC((%0, %1, 4) , (%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGc)) 01490 DEINT_CUBIC((%%REGd, %1), (%0, %1, 8) , (%%REGd, %1, 4), (%%REGc) , (%%REGc, %1, 2)) 01491 01492 : : "r" (src), "r" ((x86_reg)stride) 01493 : "%"REG_a, "%"REG_d, "%"REG_c 01494 ); 01495 #else //HAVE_MMX2 || HAVE_AMD3DNOW 01496 int x; 01497 src+= stride*3; 01498 for(x=0; x<8; x++){ 01499 src[stride*3] = CLIP((-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4); 01500 src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4); 01501 src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4); 01502 src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4); 01503 src++; 01504 } 01505 #endif //HAVE_MMX2 || HAVE_AMD3DNOW 01506 } 01507 01515 static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp) 01516 { 01517 #if HAVE_MMX2 || HAVE_AMD3DNOW 01518 src+= stride*4; 01519 __asm__ volatile( 01520 "lea (%0, %1), %%"REG_a" \n\t" 01521 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 01522 "pxor %%mm7, %%mm7 \n\t" 01523 "movq (%2), %%mm0 \n\t" 01524 // 0 1 2 3 4 5 6 7 8 9 10 01525 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx 01526 01527 #define REAL_DEINT_FF(a,b,c,d)\ 01528 "movq " #a ", %%mm1 \n\t"\ 01529 "movq " #b ", %%mm2 \n\t"\ 01530 "movq " #c ", %%mm3 \n\t"\ 01531 "movq " #d ", %%mm4 \n\t"\ 01532 PAVGB(%%mm3, %%mm1) \ 01533 PAVGB(%%mm4, %%mm0) \ 01534 "movq %%mm0, %%mm3 \n\t"\ 01535 "punpcklbw %%mm7, %%mm0 \n\t"\ 01536 "punpckhbw %%mm7, %%mm3 \n\t"\ 01537 "movq %%mm1, %%mm4 \n\t"\ 01538 "punpcklbw %%mm7, %%mm1 \n\t"\ 01539 "punpckhbw %%mm7, %%mm4 \n\t"\ 01540 "psllw $2, %%mm1 \n\t"\ 01541 "psllw $2, %%mm4 \n\t"\ 01542 "psubw %%mm0, %%mm1 \n\t"\ 01543 "psubw %%mm3, %%mm4 \n\t"\ 01544 "movq %%mm2, %%mm5 \n\t"\ 01545 "movq %%mm2, %%mm0 \n\t"\ 01546 "punpcklbw %%mm7, %%mm2 \n\t"\ 01547 "punpckhbw %%mm7, %%mm5 \n\t"\ 01548 "paddw %%mm2, %%mm1 \n\t"\ 01549 "paddw %%mm5, %%mm4 \n\t"\ 01550 "psraw $2, %%mm1 \n\t"\ 01551 "psraw $2, %%mm4 \n\t"\ 01552 "packuswb %%mm4, %%mm1 \n\t"\ 01553 "movq %%mm1, " #b " \n\t"\ 01554 01555 #define DEINT_FF(a,b,c,d) REAL_DEINT_FF(a,b,c,d) 01556 01557 DEINT_FF((%0) , (%%REGa) , (%%REGa, %1), (%%REGa, %1, 2)) 01558 DEINT_FF((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd) ) 01559 DEINT_FF((%0, %1, 4) , (%%REGd) , (%%REGd, %1), (%%REGd, %1, 2)) 01560 DEINT_FF((%%REGd, %1), 
(%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4)) 01561 01562 "movq %%mm0, (%2) \n\t" 01563 : : "r" (src), "r" ((x86_reg)stride), "r"(tmp) 01564 : "%"REG_a, "%"REG_d 01565 ); 01566 #else //HAVE_MMX2 || HAVE_AMD3DNOW 01567 int x; 01568 src+= stride*4; 01569 for(x=0; x<8; x++){ 01570 int t1= tmp[x]; 01571 int t2= src[stride*1]; 01572 01573 src[stride*1]= CLIP((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3); 01574 t1= src[stride*4]; 01575 src[stride*3]= CLIP((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3); 01576 t2= src[stride*6]; 01577 src[stride*5]= CLIP((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3); 01578 t1= src[stride*8]; 01579 src[stride*7]= CLIP((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3); 01580 tmp[x]= t1; 01581 01582 src++; 01583 } 01584 #endif //HAVE_MMX2 || HAVE_AMD3DNOW 01585 } 01586 01594 static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2) 01595 { 01596 #if HAVE_MMX2 || HAVE_AMD3DNOW 01597 src+= stride*4; 01598 __asm__ volatile( 01599 "lea (%0, %1), %%"REG_a" \n\t" 01600 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 01601 "pxor %%mm7, %%mm7 \n\t" 01602 "movq (%2), %%mm0 \n\t" 01603 "movq (%3), %%mm1 \n\t" 01604 // 0 1 2 3 4 5 6 7 8 9 10 01605 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx 01606 01607 #define REAL_DEINT_L5(t1,t2,a,b,c)\ 01608 "movq " #a ", %%mm2 \n\t"\ 01609 "movq " #b ", %%mm3 \n\t"\ 01610 "movq " #c ", %%mm4 \n\t"\ 01611 PAVGB(t2, %%mm3) \ 01612 PAVGB(t1, %%mm4) \ 01613 "movq %%mm2, %%mm5 \n\t"\ 01614 "movq %%mm2, " #t1 " \n\t"\ 01615 "punpcklbw %%mm7, %%mm2 \n\t"\ 01616 "punpckhbw %%mm7, %%mm5 \n\t"\ 01617 "movq %%mm2, %%mm6 \n\t"\ 01618 "paddw %%mm2, %%mm2 \n\t"\ 01619 "paddw %%mm6, %%mm2 \n\t"\ 01620 "movq %%mm5, %%mm6 \n\t"\ 01621 "paddw %%mm5, %%mm5 \n\t"\ 01622 "paddw %%mm6, %%mm5 \n\t"\ 01623 "movq %%mm3, %%mm6 \n\t"\ 01624 "punpcklbw %%mm7, %%mm3 \n\t"\ 01625 "punpckhbw %%mm7, %%mm6 \n\t"\ 01626 "paddw %%mm3, %%mm3 \n\t"\ 01627 "paddw %%mm6, %%mm6 \n\t"\ 01628 "paddw %%mm3, %%mm2 \n\t"\ 01629 "paddw %%mm6, %%mm5 \n\t"\ 01630 "movq %%mm4, %%mm6 \n\t"\ 01631 "punpcklbw %%mm7, %%mm4 \n\t"\ 01632 "punpckhbw %%mm7, %%mm6 \n\t"\ 01633 "psubw %%mm4, %%mm2 \n\t"\ 01634 "psubw %%mm6, %%mm5 \n\t"\ 01635 "psraw $2, %%mm2 \n\t"\ 01636 "psraw $2, %%mm5 \n\t"\ 01637 "packuswb %%mm5, %%mm2 \n\t"\ 01638 "movq %%mm2, " #a " \n\t"\ 01639 01640 #define DEINT_L5(t1,t2,a,b,c) REAL_DEINT_L5(t1,t2,a,b,c) 01641 01642 DEINT_L5(%%mm0, %%mm1, (%0) , (%%REGa) , (%%REGa, %1) ) 01643 DEINT_L5(%%mm1, %%mm0, (%%REGa) , (%%REGa, %1) , (%%REGa, %1, 2)) 01644 DEINT_L5(%%mm0, %%mm1, (%%REGa, %1) , (%%REGa, %1, 2), (%0, %1, 4) ) 01645 DEINT_L5(%%mm1, %%mm0, (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd) ) 01646 DEINT_L5(%%mm0, %%mm1, (%0, %1, 4) , (%%REGd) , (%%REGd, %1) ) 01647 DEINT_L5(%%mm1, %%mm0, (%%REGd) , (%%REGd, %1) , (%%REGd, %1, 2)) 01648 DEINT_L5(%%mm0, %%mm1, (%%REGd, %1) , (%%REGd, %1, 2), (%0, %1, 8) ) 01649 DEINT_L5(%%mm1, %%mm0, (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4)) 01650 01651 "movq %%mm0, (%2) \n\t" 01652 "movq %%mm1, (%3) \n\t" 01653 : : "r" (src), "r" ((x86_reg)stride), "r"(tmp), "r"(tmp2) 01654 : "%"REG_a, "%"REG_d 01655 ); 01656 #else //HAVE_MMX2 || HAVE_AMD3DNOW 01657 int x; 01658 src+= stride*4; 01659 for(x=0; x<8; x++){ 01660 int t1= tmp[x]; 01661 int t2= tmp2[x]; 01662 int t3= src[0]; 01663 01664 src[stride*0]= CLIP((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3); 01665 t1= 
src[stride*1]; 01666 src[stride*1]= CLIP((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3); 01667 t2= src[stride*2]; 01668 src[stride*2]= CLIP((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3); 01669 t3= src[stride*3]; 01670 src[stride*3]= CLIP((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3); 01671 t1= src[stride*4]; 01672 src[stride*4]= CLIP((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3); 01673 t2= src[stride*5]; 01674 src[stride*5]= CLIP((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3); 01675 t3= src[stride*6]; 01676 src[stride*6]= CLIP((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3); 01677 t1= src[stride*7]; 01678 src[stride*7]= CLIP((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3); 01679 01680 tmp[x]= t3; 01681 tmp2[x]= t1; 01682 01683 src++; 01684 } 01685 #endif //HAVE_MMX2 || HAVE_AMD3DNOW 01686 } 01687 01695 static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp) 01696 { 01697 #if HAVE_MMX2 || HAVE_AMD3DNOW 01698 src+= 4*stride; 01699 __asm__ volatile( 01700 "lea (%0, %1), %%"REG_a" \n\t" 01701 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 01702 // 0 1 2 3 4 5 6 7 8 9 01703 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 01704 01705 "movq (%2), %%mm0 \n\t" // L0 01706 "movq (%%"REG_a"), %%mm1 \n\t" // L2 01707 PAVGB(%%mm1, %%mm0) // L0+L2 01708 "movq (%0), %%mm2 \n\t" // L1 01709 PAVGB(%%mm2, %%mm0) 01710 "movq %%mm0, (%0) \n\t" 01711 "movq (%%"REG_a", %1), %%mm0 \n\t" // L3 01712 PAVGB(%%mm0, %%mm2) // L1+L3 01713 PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3 01714 "movq %%mm2, (%%"REG_a") \n\t" 01715 "movq (%%"REG_a", %1, 2), %%mm2 \n\t" // L4 01716 PAVGB(%%mm2, %%mm1) // L2+L4 01717 PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4 01718 "movq %%mm1, (%%"REG_a", %1) \n\t" 01719 "movq (%0, %1, 4), %%mm1 \n\t" // L5 01720 PAVGB(%%mm1, %%mm0) // L3+L5 01721 PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5 01722 "movq %%mm0, (%%"REG_a", %1, 2) \n\t" 01723 "movq (%%"REG_d"), %%mm0 \n\t" // L6 01724 PAVGB(%%mm0, %%mm2) // L4+L6 01725 PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6 01726 "movq %%mm2, (%0, %1, 4) \n\t" 01727 "movq (%%"REG_d", %1), %%mm2 \n\t" // L7 01728 PAVGB(%%mm2, %%mm1) // L5+L7 01729 PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7 01730 "movq %%mm1, (%%"REG_d") \n\t" 01731 "movq (%%"REG_d", %1, 2), %%mm1 \n\t" // L8 01732 PAVGB(%%mm1, %%mm0) // L6+L8 01733 PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8 01734 "movq %%mm0, (%%"REG_d", %1) \n\t" 01735 "movq (%0, %1, 8), %%mm0 \n\t" // L9 01736 PAVGB(%%mm0, %%mm2) // L7+L9 01737 PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9 01738 "movq %%mm2, (%%"REG_d", %1, 2) \n\t" 01739 "movq %%mm1, (%2) \n\t" 01740 01741 : : "r" (src), "r" ((x86_reg)stride), "r" (tmp) 01742 : "%"REG_a, "%"REG_d 01743 ); 01744 #else //HAVE_MMX2 || HAVE_AMD3DNOW 01745 int a, b, c, x; 01746 src+= 4*stride; 01747 01748 for(x=0; x<2; x++){ 01749 a= *(uint32_t*)&tmp[stride*0]; 01750 b= *(uint32_t*)&src[stride*0]; 01751 c= *(uint32_t*)&src[stride*1]; 01752 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1); 01753 *(uint32_t*)&src[stride*0]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 01754 01755 a= *(uint32_t*)&src[stride*2]; 01756 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1); 01757 *(uint32_t*)&src[stride*1]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1); 01758 01759 b= *(uint32_t*)&src[stride*3]; 01760 c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1); 01761 *(uint32_t*)&src[stride*2]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1); 01762 01763 c= *(uint32_t*)&src[stride*4]; 01764 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1); 01765 
*(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 01766 01767 a= *(uint32_t*)&src[stride*5]; 01768 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1); 01769 *(uint32_t*)&src[stride*4]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1); 01770 01771 b= *(uint32_t*)&src[stride*6]; 01772 c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1); 01773 *(uint32_t*)&src[stride*5]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1); 01774 01775 c= *(uint32_t*)&src[stride*7]; 01776 a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1); 01777 *(uint32_t*)&src[stride*6]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); 01778 01779 a= *(uint32_t*)&src[stride*8]; 01780 b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1); 01781 *(uint32_t*)&src[stride*7]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1); 01782 01783 *(uint32_t*)&tmp[stride*0]= c; 01784 src += 4; 01785 tmp += 4; 01786 } 01787 #endif //HAVE_MMX2 || HAVE_AMD3DNOW 01788 } 01789 01796 static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride) 01797 { 01798 #if HAVE_MMX 01799 src+= 4*stride; 01800 #if HAVE_MMX2 01801 __asm__ volatile( 01802 "lea (%0, %1), %%"REG_a" \n\t" 01803 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 01804 // 0 1 2 3 4 5 6 7 8 9 01805 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 01806 01807 "movq (%0), %%mm0 \n\t" // 01808 "movq (%%"REG_a", %1), %%mm2 \n\t" // 01809 "movq (%%"REG_a"), %%mm1 \n\t" // 01810 "movq %%mm0, %%mm3 \n\t" 01811 "pmaxub %%mm1, %%mm0 \n\t" // 01812 "pminub %%mm3, %%mm1 \n\t" // 01813 "pmaxub %%mm2, %%mm1 \n\t" // 01814 "pminub %%mm1, %%mm0 \n\t" 01815 "movq %%mm0, (%%"REG_a") \n\t" 01816 01817 "movq (%0, %1, 4), %%mm0 \n\t" // 01818 "movq (%%"REG_a", %1, 2), %%mm1 \n\t" // 01819 "movq %%mm2, %%mm3 \n\t" 01820 "pmaxub %%mm1, %%mm2 \n\t" // 01821 "pminub %%mm3, %%mm1 \n\t" // 01822 "pmaxub %%mm0, %%mm1 \n\t" // 01823 "pminub %%mm1, %%mm2 \n\t" 01824 "movq %%mm2, (%%"REG_a", %1, 2) \n\t" 01825 01826 "movq (%%"REG_d"), %%mm2 \n\t" // 01827 "movq (%%"REG_d", %1), %%mm1 \n\t" // 01828 "movq %%mm2, %%mm3 \n\t" 01829 "pmaxub %%mm0, %%mm2 \n\t" // 01830 "pminub %%mm3, %%mm0 \n\t" // 01831 "pmaxub %%mm1, %%mm0 \n\t" // 01832 "pminub %%mm0, %%mm2 \n\t" 01833 "movq %%mm2, (%%"REG_d") \n\t" 01834 01835 "movq (%%"REG_d", %1, 2), %%mm2 \n\t" // 01836 "movq (%0, %1, 8), %%mm0 \n\t" // 01837 "movq %%mm2, %%mm3 \n\t" 01838 "pmaxub %%mm0, %%mm2 \n\t" // 01839 "pminub %%mm3, %%mm0 \n\t" // 01840 "pmaxub %%mm1, %%mm0 \n\t" // 01841 "pminub %%mm0, %%mm2 \n\t" 01842 "movq %%mm2, (%%"REG_d", %1, 2) \n\t" 01843 01844 01845 : : "r" (src), "r" ((x86_reg)stride) 01846 : "%"REG_a, "%"REG_d 01847 ); 01848 01849 #else // MMX without MMX2 01850 __asm__ volatile( 01851 "lea (%0, %1), %%"REG_a" \n\t" 01852 "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" 01853 // 0 1 2 3 4 5 6 7 8 9 01854 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 01855 "pxor %%mm7, %%mm7 \n\t" 01856 01857 #define REAL_MEDIAN(a,b,c)\ 01858 "movq " #a ", %%mm0 \n\t"\ 01859 "movq " #b ", %%mm2 \n\t"\ 01860 "movq " #c ", %%mm1 \n\t"\ 01861 "movq %%mm0, %%mm3 \n\t"\ 01862 "movq %%mm1, %%mm4 \n\t"\ 01863 "movq %%mm2, %%mm5 \n\t"\ 01864 "psubusb %%mm1, %%mm3 \n\t"\ 01865 "psubusb %%mm2, %%mm4 \n\t"\ 01866 "psubusb %%mm0, %%mm5 \n\t"\ 01867 "pcmpeqb %%mm7, %%mm3 \n\t"\ 01868 "pcmpeqb %%mm7, %%mm4 \n\t"\ 01869 "pcmpeqb %%mm7, %%mm5 \n\t"\ 01870 "movq %%mm3, %%mm6 \n\t"\ 01871 "pxor %%mm4, %%mm3 \n\t"\ 01872 "pxor %%mm5, %%mm4 \n\t"\ 01873 "pxor %%mm6, %%mm5 \n\t"\ 01874 "por %%mm3, %%mm1 \n\t"\ 01875 "por %%mm4, %%mm2 \n\t"\ 01876 "por %%mm5, %%mm0 \n\t"\ 01877 "pand %%mm2, %%mm0 \n\t"\ 01878 "pand %%mm1, %%mm0 \n\t"\ 01879 "movq %%mm0, " #b 
" \n\t" 01880 #define MEDIAN(a,b,c) REAL_MEDIAN(a,b,c) 01881 01882 MEDIAN((%0) , (%%REGa) , (%%REGa, %1)) 01883 MEDIAN((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4)) 01884 MEDIAN((%0, %1, 4) , (%%REGd) , (%%REGd, %1)) 01885 MEDIAN((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8)) 01886 01887 : : "r" (src), "r" ((x86_reg)stride) 01888 : "%"REG_a, "%"REG_d 01889 ); 01890 #endif //HAVE_MMX2 01891 #else //HAVE_MMX 01892 int x, y; 01893 src+= 4*stride; 01894 // FIXME - there should be a way to do a few columns in parallel like w/mmx 01895 for(x=0; x<8; x++){ 01896 uint8_t *colsrc = src; 01897 for (y=0; y<4; y++){ 01898 int a, b, c, d, e, f; 01899 a = colsrc[0 ]; 01900 b = colsrc[stride ]; 01901 c = colsrc[stride*2]; 01902 d = (a-b)>>31; 01903 e = (b-c)>>31; 01904 f = (c-a)>>31; 01905 colsrc[stride ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f)); 01906 colsrc += stride*2; 01907 } 01908 src++; 01909 } 01910 #endif //HAVE_MMX 01911 } 01912 01913 #if HAVE_MMX 01914 01917 static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride) 01918 { 01919 __asm__( 01920 "lea (%0, %1), %%"REG_a" \n\t" 01921 // 0 1 2 3 4 5 6 7 8 9 01922 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 01923 "movq (%0), %%mm0 \n\t" // 12345678 01924 "movq (%%"REG_a"), %%mm1 \n\t" // abcdefgh 01925 "movq %%mm0, %%mm2 \n\t" // 12345678 01926 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d 01927 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h 01928 01929 "movq (%%"REG_a", %1), %%mm1 \n\t" 01930 "movq (%%"REG_a", %1, 2), %%mm3 \n\t" 01931 "movq %%mm1, %%mm4 \n\t" 01932 "punpcklbw %%mm3, %%mm1 \n\t" 01933 "punpckhbw %%mm3, %%mm4 \n\t" 01934 01935 "movq %%mm0, %%mm3 \n\t" 01936 "punpcklwd %%mm1, %%mm0 \n\t" 01937 "punpckhwd %%mm1, %%mm3 \n\t" 01938 "movq %%mm2, %%mm1 \n\t" 01939 "punpcklwd %%mm4, %%mm2 \n\t" 01940 "punpckhwd %%mm4, %%mm1 \n\t" 01941 01942 "movd %%mm0, 128(%2) \n\t" 01943 "psrlq $32, %%mm0 \n\t" 01944 "movd %%mm0, 144(%2) \n\t" 01945 "movd %%mm3, 160(%2) \n\t" 01946 "psrlq $32, %%mm3 \n\t" 01947 "movd %%mm3, 176(%2) \n\t" 01948 "movd %%mm3, 48(%3) \n\t" 01949 "movd %%mm2, 192(%2) \n\t" 01950 "movd %%mm2, 64(%3) \n\t" 01951 "psrlq $32, %%mm2 \n\t" 01952 "movd %%mm2, 80(%3) \n\t" 01953 "movd %%mm1, 96(%3) \n\t" 01954 "psrlq $32, %%mm1 \n\t" 01955 "movd %%mm1, 112(%3) \n\t" 01956 01957 "lea (%%"REG_a", %1, 4), %%"REG_a" \n\t" 01958 01959 "movq (%0, %1, 4), %%mm0 \n\t" // 12345678 01960 "movq (%%"REG_a"), %%mm1 \n\t" // abcdefgh 01961 "movq %%mm0, %%mm2 \n\t" // 12345678 01962 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d 01963 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h 01964 01965 "movq (%%"REG_a", %1), %%mm1 \n\t" 01966 "movq (%%"REG_a", %1, 2), %%mm3 \n\t" 01967 "movq %%mm1, %%mm4 \n\t" 01968 "punpcklbw %%mm3, %%mm1 \n\t" 01969 "punpckhbw %%mm3, %%mm4 \n\t" 01970 01971 "movq %%mm0, %%mm3 \n\t" 01972 "punpcklwd %%mm1, %%mm0 \n\t" 01973 "punpckhwd %%mm1, %%mm3 \n\t" 01974 "movq %%mm2, %%mm1 \n\t" 01975 "punpcklwd %%mm4, %%mm2 \n\t" 01976 "punpckhwd %%mm4, %%mm1 \n\t" 01977 01978 "movd %%mm0, 132(%2) \n\t" 01979 "psrlq $32, %%mm0 \n\t" 01980 "movd %%mm0, 148(%2) \n\t" 01981 "movd %%mm3, 164(%2) \n\t" 01982 "psrlq $32, %%mm3 \n\t" 01983 "movd %%mm3, 180(%2) \n\t" 01984 "movd %%mm3, 52(%3) \n\t" 01985 "movd %%mm2, 196(%2) \n\t" 01986 "movd %%mm2, 68(%3) \n\t" 01987 "psrlq $32, %%mm2 \n\t" 01988 "movd %%mm2, 84(%3) \n\t" 01989 "movd %%mm1, 100(%3) \n\t" 01990 "psrlq $32, %%mm1 \n\t" 01991 "movd %%mm1, 116(%3) \n\t" 01992 01993 01994 :: "r" (src), "r" ((x86_reg)srcStride), "r" (dst1), "r" (dst2) 01995 : 
"%"REG_a 01996 ); 01997 } 01998 02002 static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src) 02003 { 02004 __asm__( 02005 "lea (%0, %1), %%"REG_a" \n\t" 02006 "lea (%%"REG_a",%1,4), %%"REG_d" \n\t" 02007 // 0 1 2 3 4 5 6 7 8 9 02008 // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 02009 "movq (%2), %%mm0 \n\t" // 12345678 02010 "movq 16(%2), %%mm1 \n\t" // abcdefgh 02011 "movq %%mm0, %%mm2 \n\t" // 12345678 02012 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d 02013 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h 02014 02015 "movq 32(%2), %%mm1 \n\t" 02016 "movq 48(%2), %%mm3 \n\t" 02017 "movq %%mm1, %%mm4 \n\t" 02018 "punpcklbw %%mm3, %%mm1 \n\t" 02019 "punpckhbw %%mm3, %%mm4 \n\t" 02020 02021 "movq %%mm0, %%mm3 \n\t" 02022 "punpcklwd %%mm1, %%mm0 \n\t" 02023 "punpckhwd %%mm1, %%mm3 \n\t" 02024 "movq %%mm2, %%mm1 \n\t" 02025 "punpcklwd %%mm4, %%mm2 \n\t" 02026 "punpckhwd %%mm4, %%mm1 \n\t" 02027 02028 "movd %%mm0, (%0) \n\t" 02029 "psrlq $32, %%mm0 \n\t" 02030 "movd %%mm0, (%%"REG_a") \n\t" 02031 "movd %%mm3, (%%"REG_a", %1) \n\t" 02032 "psrlq $32, %%mm3 \n\t" 02033 "movd %%mm3, (%%"REG_a", %1, 2) \n\t" 02034 "movd %%mm2, (%0, %1, 4) \n\t" 02035 "psrlq $32, %%mm2 \n\t" 02036 "movd %%mm2, (%%"REG_d") \n\t" 02037 "movd %%mm1, (%%"REG_d", %1) \n\t" 02038 "psrlq $32, %%mm1 \n\t" 02039 "movd %%mm1, (%%"REG_d", %1, 2) \n\t" 02040 02041 02042 "movq 64(%2), %%mm0 \n\t" // 12345678 02043 "movq 80(%2), %%mm1 \n\t" // abcdefgh 02044 "movq %%mm0, %%mm2 \n\t" // 12345678 02045 "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d 02046 "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h 02047 02048 "movq 96(%2), %%mm1 \n\t" 02049 "movq 112(%2), %%mm3 \n\t" 02050 "movq %%mm1, %%mm4 \n\t" 02051 "punpcklbw %%mm3, %%mm1 \n\t" 02052 "punpckhbw %%mm3, %%mm4 \n\t" 02053 02054 "movq %%mm0, %%mm3 \n\t" 02055 "punpcklwd %%mm1, %%mm0 \n\t" 02056 "punpckhwd %%mm1, %%mm3 \n\t" 02057 "movq %%mm2, %%mm1 \n\t" 02058 "punpcklwd %%mm4, %%mm2 \n\t" 02059 "punpckhwd %%mm4, %%mm1 \n\t" 02060 02061 "movd %%mm0, 4(%0) \n\t" 02062 "psrlq $32, %%mm0 \n\t" 02063 "movd %%mm0, 4(%%"REG_a") \n\t" 02064 "movd %%mm3, 4(%%"REG_a", %1) \n\t" 02065 "psrlq $32, %%mm3 \n\t" 02066 "movd %%mm3, 4(%%"REG_a", %1, 2) \n\t" 02067 "movd %%mm2, 4(%0, %1, 4) \n\t" 02068 "psrlq $32, %%mm2 \n\t" 02069 "movd %%mm2, 4(%%"REG_d") \n\t" 02070 "movd %%mm1, 4(%%"REG_d", %1) \n\t" 02071 "psrlq $32, %%mm1 \n\t" 02072 "movd %%mm1, 4(%%"REG_d", %1, 2) \n\t" 02073 02074 :: "r" (dst), "r" ((x86_reg)dstStride), "r" (src) 02075 : "%"REG_a, "%"REG_d 02076 ); 02077 } 02078 #endif //HAVE_MMX 02079 //static long test=0; 02080 02081 #if !HAVE_ALTIVEC 02082 static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride, 02083 uint8_t *tempBlurred, uint32_t *tempBlurredPast, int *maxNoise) 02084 { 02085 // to save a register (FIXME do this outside of the loops) 02086 tempBlurredPast[127]= maxNoise[0]; 02087 tempBlurredPast[128]= maxNoise[1]; 02088 tempBlurredPast[129]= maxNoise[2]; 02089 02090 #define FAST_L2_DIFF 02091 //#define L1_DIFF //u should change the thresholds too if u try that one 02092 #if HAVE_MMX2 || HAVE_AMD3DNOW 02093 __asm__ volatile( 02094 "lea (%2, %2, 2), %%"REG_a" \n\t" // 3*stride 02095 "lea (%2, %2, 4), %%"REG_d" \n\t" // 5*stride 02096 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride 02097 // 0 1 2 3 4 5 6 7 8 9 02098 // %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+edx %x+2eax %x+ecx %x+8%2 02099 //FIXME reorder? 
02100 #ifdef L1_DIFF //needs mmx2 02101 "movq (%0), %%mm0 \n\t" // L0 02102 "psadbw (%1), %%mm0 \n\t" // |L0-R0| 02103 "movq (%0, %2), %%mm1 \n\t" // L1 02104 "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1| 02105 "movq (%0, %2, 2), %%mm2 \n\t" // L2 02106 "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2| 02107 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 02108 "psadbw (%1, %%"REG_a"), %%mm3 \n\t" // |L3-R3| 02109 02110 "movq (%0, %2, 4), %%mm4 \n\t" // L4 02111 "paddw %%mm1, %%mm0 \n\t" 02112 "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4| 02113 "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5 02114 "paddw %%mm2, %%mm0 \n\t" 02115 "psadbw (%1, %%"REG_d"), %%mm5 \n\t" // |L5-R5| 02116 "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6 02117 "paddw %%mm3, %%mm0 \n\t" 02118 "psadbw (%1, %%"REG_a", 2), %%mm6 \n\t" // |L6-R6| 02119 "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7 02120 "paddw %%mm4, %%mm0 \n\t" 02121 "psadbw (%1, %%"REG_c"), %%mm7 \n\t" // |L7-R7| 02122 "paddw %%mm5, %%mm6 \n\t" 02123 "paddw %%mm7, %%mm6 \n\t" 02124 "paddw %%mm6, %%mm0 \n\t" 02125 #else //L1_DIFF 02126 #if defined (FAST_L2_DIFF) 02127 "pcmpeqb %%mm7, %%mm7 \n\t" 02128 "movq "MANGLE(b80)", %%mm6 \n\t" 02129 "pxor %%mm0, %%mm0 \n\t" 02130 #define REAL_L2_DIFF_CORE(a, b)\ 02131 "movq " #a ", %%mm5 \n\t"\ 02132 "movq " #b ", %%mm2 \n\t"\ 02133 "pxor %%mm7, %%mm2 \n\t"\ 02134 PAVGB(%%mm2, %%mm5)\ 02135 "paddb %%mm6, %%mm5 \n\t"\ 02136 "movq %%mm5, %%mm2 \n\t"\ 02137 "psllw $8, %%mm5 \n\t"\ 02138 "pmaddwd %%mm5, %%mm5 \n\t"\ 02139 "pmaddwd %%mm2, %%mm2 \n\t"\ 02140 "paddd %%mm2, %%mm5 \n\t"\ 02141 "psrld $14, %%mm5 \n\t"\ 02142 "paddd %%mm5, %%mm0 \n\t" 02143 02144 #else //defined (FAST_L2_DIFF) 02145 "pxor %%mm7, %%mm7 \n\t" 02146 "pxor %%mm0, %%mm0 \n\t" 02147 #define REAL_L2_DIFF_CORE(a, b)\ 02148 "movq " #a ", %%mm5 \n\t"\ 02149 "movq " #b ", %%mm2 \n\t"\ 02150 "movq %%mm5, %%mm1 \n\t"\ 02151 "movq %%mm2, %%mm3 \n\t"\ 02152 "punpcklbw %%mm7, %%mm5 \n\t"\ 02153 "punpckhbw %%mm7, %%mm1 \n\t"\ 02154 "punpcklbw %%mm7, %%mm2 \n\t"\ 02155 "punpckhbw %%mm7, %%mm3 \n\t"\ 02156 "psubw %%mm2, %%mm5 \n\t"\ 02157 "psubw %%mm3, %%mm1 \n\t"\ 02158 "pmaddwd %%mm5, %%mm5 \n\t"\ 02159 "pmaddwd %%mm1, %%mm1 \n\t"\ 02160 "paddd %%mm1, %%mm5 \n\t"\ 02161 "paddd %%mm5, %%mm0 \n\t" 02162 02163 #endif //defined (FAST_L2_DIFF) 02164 02165 #define L2_DIFF_CORE(a, b) REAL_L2_DIFF_CORE(a, b) 02166 02167 L2_DIFF_CORE((%0) , (%1)) 02168 L2_DIFF_CORE((%0, %2) , (%1, %2)) 02169 L2_DIFF_CORE((%0, %2, 2) , (%1, %2, 2)) 02170 L2_DIFF_CORE((%0, %%REGa) , (%1, %%REGa)) 02171 L2_DIFF_CORE((%0, %2, 4) , (%1, %2, 4)) 02172 L2_DIFF_CORE((%0, %%REGd) , (%1, %%REGd)) 02173 L2_DIFF_CORE((%0, %%REGa,2), (%1, %%REGa,2)) 02174 L2_DIFF_CORE((%0, %%REGc) , (%1, %%REGc)) 02175 02176 #endif //L1_DIFF 02177 02178 "movq %%mm0, %%mm4 \n\t" 02179 "psrlq $32, %%mm0 \n\t" 02180 "paddd %%mm0, %%mm4 \n\t" 02181 "movd %%mm4, %%ecx \n\t" 02182 "shll $2, %%ecx \n\t" 02183 "mov %3, %%"REG_d" \n\t" 02184 "addl -4(%%"REG_d"), %%ecx \n\t" 02185 "addl 4(%%"REG_d"), %%ecx \n\t" 02186 "addl -1024(%%"REG_d"), %%ecx \n\t" 02187 "addl $4, %%ecx \n\t" 02188 "addl 1024(%%"REG_d"), %%ecx \n\t" 02189 "shrl $3, %%ecx \n\t" 02190 "movl %%ecx, (%%"REG_d") \n\t" 02191 02192 // "mov %3, %%"REG_c" \n\t" 02193 // "mov %%"REG_c", test \n\t" 02194 // "jmp 4f \n\t" 02195 "cmpl 512(%%"REG_d"), %%ecx \n\t" 02196 " jb 2f \n\t" 02197 "cmpl 516(%%"REG_d"), %%ecx \n\t" 02198 " jb 1f \n\t" 02199 02200 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride 02201 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride 02202 "movq (%0), %%mm0 \n\t" // L0 
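/* Fall-through case: the smoothed difference score was not below
   maxNoise[2], i.e. the block changed too much for blending, so L0..L7 are
   simply copied into tempBlurred to refresh the temporal history and src
   is left untouched -- the same thing the copy-only branch of the plain-C
   fallback further down does. */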
02203 "movq (%0, %2), %%mm1 \n\t" // L1 02204 "movq (%0, %2, 2), %%mm2 \n\t" // L2 02205 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 02206 "movq (%0, %2, 4), %%mm4 \n\t" // L4 02207 "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5 02208 "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6 02209 "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7 02210 "movq %%mm0, (%1) \n\t" // L0 02211 "movq %%mm1, (%1, %2) \n\t" // L1 02212 "movq %%mm2, (%1, %2, 2) \n\t" // L2 02213 "movq %%mm3, (%1, %%"REG_a") \n\t" // L3 02214 "movq %%mm4, (%1, %2, 4) \n\t" // L4 02215 "movq %%mm5, (%1, %%"REG_d") \n\t" // L5 02216 "movq %%mm6, (%1, %%"REG_a", 2) \n\t" // L6 02217 "movq %%mm7, (%1, %%"REG_c") \n\t" // L7 02218 "jmp 4f \n\t" 02219 02220 "1: \n\t" 02221 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride 02222 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride 02223 "movq (%0), %%mm0 \n\t" // L0 02224 PAVGB((%1), %%mm0) // L0 02225 "movq (%0, %2), %%mm1 \n\t" // L1 02226 PAVGB((%1, %2), %%mm1) // L1 02227 "movq (%0, %2, 2), %%mm2 \n\t" // L2 02228 PAVGB((%1, %2, 2), %%mm2) // L2 02229 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 02230 PAVGB((%1, %%REGa), %%mm3) // L3 02231 "movq (%0, %2, 4), %%mm4 \n\t" // L4 02232 PAVGB((%1, %2, 4), %%mm4) // L4 02233 "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5 02234 PAVGB((%1, %%REGd), %%mm5) // L5 02235 "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6 02236 PAVGB((%1, %%REGa, 2), %%mm6) // L6 02237 "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7 02238 PAVGB((%1, %%REGc), %%mm7) // L7 02239 "movq %%mm0, (%1) \n\t" // R0 02240 "movq %%mm1, (%1, %2) \n\t" // R1 02241 "movq %%mm2, (%1, %2, 2) \n\t" // R2 02242 "movq %%mm3, (%1, %%"REG_a") \n\t" // R3 02243 "movq %%mm4, (%1, %2, 4) \n\t" // R4 02244 "movq %%mm5, (%1, %%"REG_d") \n\t" // R5 02245 "movq %%mm6, (%1, %%"REG_a", 2) \n\t" // R6 02246 "movq %%mm7, (%1, %%"REG_c") \n\t" // R7 02247 "movq %%mm0, (%0) \n\t" // L0 02248 "movq %%mm1, (%0, %2) \n\t" // L1 02249 "movq %%mm2, (%0, %2, 2) \n\t" // L2 02250 "movq %%mm3, (%0, %%"REG_a") \n\t" // L3 02251 "movq %%mm4, (%0, %2, 4) \n\t" // L4 02252 "movq %%mm5, (%0, %%"REG_d") \n\t" // L5 02253 "movq %%mm6, (%0, %%"REG_a", 2) \n\t" // L6 02254 "movq %%mm7, (%0, %%"REG_c") \n\t" // L7 02255 "jmp 4f \n\t" 02256 02257 "2: \n\t" 02258 "cmpl 508(%%"REG_d"), %%ecx \n\t" 02259 " jb 3f \n\t" 02260 02261 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride 02262 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride 02263 "movq (%0), %%mm0 \n\t" // L0 02264 "movq (%0, %2), %%mm1 \n\t" // L1 02265 "movq (%0, %2, 2), %%mm2 \n\t" // L2 02266 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 02267 "movq (%1), %%mm4 \n\t" // R0 02268 "movq (%1, %2), %%mm5 \n\t" // R1 02269 "movq (%1, %2, 2), %%mm6 \n\t" // R2 02270 "movq (%1, %%"REG_a"), %%mm7 \n\t" // R3 02271 PAVGB(%%mm4, %%mm0) 02272 PAVGB(%%mm5, %%mm1) 02273 PAVGB(%%mm6, %%mm2) 02274 PAVGB(%%mm7, %%mm3) 02275 PAVGB(%%mm4, %%mm0) 02276 PAVGB(%%mm5, %%mm1) 02277 PAVGB(%%mm6, %%mm2) 02278 PAVGB(%%mm7, %%mm3) 02279 "movq %%mm0, (%1) \n\t" // R0 02280 "movq %%mm1, (%1, %2) \n\t" // R1 02281 "movq %%mm2, (%1, %2, 2) \n\t" // R2 02282 "movq %%mm3, (%1, %%"REG_a") \n\t" // R3 02283 "movq %%mm0, (%0) \n\t" // L0 02284 "movq %%mm1, (%0, %2) \n\t" // L1 02285 "movq %%mm2, (%0, %2, 2) \n\t" // L2 02286 "movq %%mm3, (%0, %%"REG_a") \n\t" // L3 02287 02288 "movq (%0, %2, 4), %%mm0 \n\t" // L4 02289 "movq (%0, %%"REG_d"), %%mm1 \n\t" // L5 02290 "movq (%0, %%"REG_a", 2), %%mm2 \n\t" // L6 02291 "movq (%0, %%"REG_c"), %%mm3 \n\t" // L7 02292 "movq (%1, %2, 4), %%mm4 \n\t" // R4 02293 "movq (%1, %%"REG_d"), 
%%mm5 \n\t" // R5 02294 "movq (%1, %%"REG_a", 2), %%mm6 \n\t" // R6 02295 "movq (%1, %%"REG_c"), %%mm7 \n\t" // R7 02296 PAVGB(%%mm4, %%mm0) 02297 PAVGB(%%mm5, %%mm1) 02298 PAVGB(%%mm6, %%mm2) 02299 PAVGB(%%mm7, %%mm3) 02300 PAVGB(%%mm4, %%mm0) 02301 PAVGB(%%mm5, %%mm1) 02302 PAVGB(%%mm6, %%mm2) 02303 PAVGB(%%mm7, %%mm3) 02304 "movq %%mm0, (%1, %2, 4) \n\t" // R4 02305 "movq %%mm1, (%1, %%"REG_d") \n\t" // R5 02306 "movq %%mm2, (%1, %%"REG_a", 2) \n\t" // R6 02307 "movq %%mm3, (%1, %%"REG_c") \n\t" // R7 02308 "movq %%mm0, (%0, %2, 4) \n\t" // L4 02309 "movq %%mm1, (%0, %%"REG_d") \n\t" // L5 02310 "movq %%mm2, (%0, %%"REG_a", 2) \n\t" // L6 02311 "movq %%mm3, (%0, %%"REG_c") \n\t" // L7 02312 "jmp 4f \n\t" 02313 02314 "3: \n\t" 02315 "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride 02316 "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride 02317 "movq (%0), %%mm0 \n\t" // L0 02318 "movq (%0, %2), %%mm1 \n\t" // L1 02319 "movq (%0, %2, 2), %%mm2 \n\t" // L2 02320 "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 02321 "movq (%1), %%mm4 \n\t" // R0 02322 "movq (%1, %2), %%mm5 \n\t" // R1 02323 "movq (%1, %2, 2), %%mm6 \n\t" // R2 02324 "movq (%1, %%"REG_a"), %%mm7 \n\t" // R3 02325 PAVGB(%%mm4, %%mm0) 02326 PAVGB(%%mm5, %%mm1) 02327 PAVGB(%%mm6, %%mm2) 02328 PAVGB(%%mm7, %%mm3) 02329 PAVGB(%%mm4, %%mm0) 02330 PAVGB(%%mm5, %%mm1) 02331 PAVGB(%%mm6, %%mm2) 02332 PAVGB(%%mm7, %%mm3) 02333 PAVGB(%%mm4, %%mm0) 02334 PAVGB(%%mm5, %%mm1) 02335 PAVGB(%%mm6, %%mm2) 02336 PAVGB(%%mm7, %%mm3) 02337 "movq %%mm0, (%1) \n\t" // R0 02338 "movq %%mm1, (%1, %2) \n\t" // R1 02339 "movq %%mm2, (%1, %2, 2) \n\t" // R2 02340 "movq %%mm3, (%1, %%"REG_a") \n\t" // R3 02341 "movq %%mm0, (%0) \n\t" // L0 02342 "movq %%mm1, (%0, %2) \n\t" // L1 02343 "movq %%mm2, (%0, %2, 2) \n\t" // L2 02344 "movq %%mm3, (%0, %%"REG_a") \n\t" // L3 02345 02346 "movq (%0, %2, 4), %%mm0 \n\t" // L4 02347 "movq (%0, %%"REG_d"), %%mm1 \n\t" // L5 02348 "movq (%0, %%"REG_a", 2), %%mm2 \n\t" // L6 02349 "movq (%0, %%"REG_c"), %%mm3 \n\t" // L7 02350 "movq (%1, %2, 4), %%mm4 \n\t" // R4 02351 "movq (%1, %%"REG_d"), %%mm5 \n\t" // R5 02352 "movq (%1, %%"REG_a", 2), %%mm6 \n\t" // R6 02353 "movq (%1, %%"REG_c"), %%mm7 \n\t" // R7 02354 PAVGB(%%mm4, %%mm0) 02355 PAVGB(%%mm5, %%mm1) 02356 PAVGB(%%mm6, %%mm2) 02357 PAVGB(%%mm7, %%mm3) 02358 PAVGB(%%mm4, %%mm0) 02359 PAVGB(%%mm5, %%mm1) 02360 PAVGB(%%mm6, %%mm2) 02361 PAVGB(%%mm7, %%mm3) 02362 PAVGB(%%mm4, %%mm0) 02363 PAVGB(%%mm5, %%mm1) 02364 PAVGB(%%mm6, %%mm2) 02365 PAVGB(%%mm7, %%mm3) 02366 "movq %%mm0, (%1, %2, 4) \n\t" // R4 02367 "movq %%mm1, (%1, %%"REG_d") \n\t" // R5 02368 "movq %%mm2, (%1, %%"REG_a", 2) \n\t" // R6 02369 "movq %%mm3, (%1, %%"REG_c") \n\t" // R7 02370 "movq %%mm0, (%0, %2, 4) \n\t" // L4 02371 "movq %%mm1, (%0, %%"REG_d") \n\t" // L5 02372 "movq %%mm2, (%0, %%"REG_a", 2) \n\t" // L6 02373 "movq %%mm3, (%0, %%"REG_c") \n\t" // L7 02374 02375 "4: \n\t" 02376 02377 :: "r" (src), "r" (tempBlurred), "r"((x86_reg)stride), "m" (tempBlurredPast) 02378 : "%"REG_a, "%"REG_d, "%"REG_c, "memory" 02379 ); 02380 #else //HAVE_MMX2 || HAVE_AMD3DNOW 02381 { 02382 int y; 02383 int d=0; 02384 // int sysd=0; 02385 int i; 02386 02387 for(y=0; y<8; y++){ 02388 int x; 02389 for(x=0; x<8; x++){ 02390 int ref= tempBlurred[ x + y*stride ]; 02391 int cur= src[ x + y*stride ]; 02392 int d1=ref - cur; 02393 // if(x==0 || x==7) d1+= d1>>1; 02394 // if(y==0 || y==7) d1+= d1>>1; 02395 // d+= FFABS(d1); 02396 d+= d1*d1; 02397 // sysd+= d1; 02398 } 02399 } 02400 i=d; 02401 d= ( 02402 4*d 02403 
+(*(tempBlurredPast-256)) 02404 +(*(tempBlurredPast-1))+ (*(tempBlurredPast+1)) 02405 +(*(tempBlurredPast+256)) 02406 +4)>>3; 02407 *tempBlurredPast=i; 02408 // ((*tempBlurredPast)*3 + d + 2)>>2; 02409 02410 /* 02411 Switch between 02412 1 0 0 0 0 0 0 (0) 02413 64 32 16 8 4 2 1 (1) 02414 64 48 36 27 20 15 11 (33) (approx) 02415 64 56 49 43 37 33 29 (200) (approx) 02416 */ 02417 if(d > maxNoise[1]){ 02418 if(d < maxNoise[2]){ 02419 for(y=0; y<8; y++){ 02420 int x; 02421 for(x=0; x<8; x++){ 02422 int ref= tempBlurred[ x + y*stride ]; 02423 int cur= src[ x + y*stride ]; 02424 tempBlurred[ x + y*stride ]= 02425 src[ x + y*stride ]= 02426 (ref + cur + 1)>>1; 02427 } 02428 } 02429 }else{ 02430 for(y=0; y<8; y++){ 02431 int x; 02432 for(x=0; x<8; x++){ 02433 tempBlurred[ x + y*stride ]= src[ x + y*stride ]; 02434 } 02435 } 02436 } 02437 }else{ 02438 if(d < maxNoise[0]){ 02439 for(y=0; y<8; y++){ 02440 int x; 02441 for(x=0; x<8; x++){ 02442 int ref= tempBlurred[ x + y*stride ]; 02443 int cur= src[ x + y*stride ]; 02444 tempBlurred[ x + y*stride ]= 02445 src[ x + y*stride ]= 02446 (ref*7 + cur + 4)>>3; 02447 } 02448 } 02449 }else{ 02450 for(y=0; y<8; y++){ 02451 int x; 02452 for(x=0; x<8; x++){ 02453 int ref= tempBlurred[ x + y*stride ]; 02454 int cur= src[ x + y*stride ]; 02455 tempBlurred[ x + y*stride ]= 02456 src[ x + y*stride ]= 02457 (ref*3 + cur + 2)>>2; 02458 } 02459 } 02460 } 02461 } 02462 } 02463 #endif //HAVE_MMX2 || HAVE_AMD3DNOW 02464 } 02465 #endif //HAVE_ALTIVEC 02466 02467 #if HAVE_MMX 02468 02471 static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){ 02472 int64_t dc_mask, eq_mask, both_masks; 02473 int64_t sums[10*8*2]; 02474 src+= step*3; // src points to begin of the 8x8 Block 02475 //START_TIMER 02476 __asm__ volatile( 02477 "movq %0, %%mm7 \n\t" 02478 "movq %1, %%mm6 \n\t" 02479 : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP]) 02480 ); 02481 02482 __asm__ volatile( 02483 "lea (%2, %3), %%"REG_a" \n\t" 02484 // 0 1 2 3 4 5 6 7 8 9 02485 // %1 eax eax+%2 eax+2%2 %1+4%2 ecx ecx+%2 ecx+2%2 %1+8%2 ecx+4%2 02486 02487 "movq (%2), %%mm0 \n\t" 02488 "movq (%%"REG_a"), %%mm1 \n\t" 02489 "movq %%mm1, %%mm3 \n\t" 02490 "movq %%mm1, %%mm4 \n\t" 02491 "psubb %%mm1, %%mm0 \n\t" // mm0 = differnece 02492 "paddb %%mm7, %%mm0 \n\t" 02493 "pcmpgtb %%mm6, %%mm0 \n\t" 02494 02495 "movq (%%"REG_a",%3), %%mm2 \n\t" 02496 PMAXUB(%%mm2, %%mm4) 02497 PMINUB(%%mm2, %%mm3, %%mm5) 02498 "psubb %%mm2, %%mm1 \n\t" 02499 "paddb %%mm7, %%mm1 \n\t" 02500 "pcmpgtb %%mm6, %%mm1 \n\t" 02501 "paddb %%mm1, %%mm0 \n\t" 02502 02503 "movq (%%"REG_a", %3, 2), %%mm1 \n\t" 02504 PMAXUB(%%mm1, %%mm4) 02505 PMINUB(%%mm1, %%mm3, %%mm5) 02506 "psubb %%mm1, %%mm2 \n\t" 02507 "paddb %%mm7, %%mm2 \n\t" 02508 "pcmpgtb %%mm6, %%mm2 \n\t" 02509 "paddb %%mm2, %%mm0 \n\t" 02510 02511 "lea (%%"REG_a", %3, 4), %%"REG_a" \n\t" 02512 02513 "movq (%2, %3, 4), %%mm2 \n\t" 02514 PMAXUB(%%mm2, %%mm4) 02515 PMINUB(%%mm2, %%mm3, %%mm5) 02516 "psubb %%mm2, %%mm1 \n\t" 02517 "paddb %%mm7, %%mm1 \n\t" 02518 "pcmpgtb %%mm6, %%mm1 \n\t" 02519 "paddb %%mm1, %%mm0 \n\t" 02520 02521 "movq (%%"REG_a"), %%mm1 \n\t" 02522 PMAXUB(%%mm1, %%mm4) 02523 PMINUB(%%mm1, %%mm3, %%mm5) 02524 "psubb %%mm1, %%mm2 \n\t" 02525 "paddb %%mm7, %%mm2 \n\t" 02526 "pcmpgtb %%mm6, %%mm2 \n\t" 02527 "paddb %%mm2, %%mm0 \n\t" 02528 02529 "movq (%%"REG_a", %3), %%mm2 \n\t" 02530 PMAXUB(%%mm2, %%mm4) 02531 PMINUB(%%mm2, %%mm3, %%mm5) 02532 "psubb %%mm2, %%mm1 \n\t" 02533 "paddb %%mm7, %%mm1 \n\t" 02534 
"pcmpgtb %%mm6, %%mm1 \n\t" 02535 "paddb %%mm1, %%mm0 \n\t" 02536 02537 "movq (%%"REG_a", %3, 2), %%mm1 \n\t" 02538 PMAXUB(%%mm1, %%mm4) 02539 PMINUB(%%mm1, %%mm3, %%mm5) 02540 "psubb %%mm1, %%mm2 \n\t" 02541 "paddb %%mm7, %%mm2 \n\t" 02542 "pcmpgtb %%mm6, %%mm2 \n\t" 02543 "paddb %%mm2, %%mm0 \n\t" 02544 02545 "movq (%2, %3, 8), %%mm2 \n\t" 02546 PMAXUB(%%mm2, %%mm4) 02547 PMINUB(%%mm2, %%mm3, %%mm5) 02548 "psubb %%mm2, %%mm1 \n\t" 02549 "paddb %%mm7, %%mm1 \n\t" 02550 "pcmpgtb %%mm6, %%mm1 \n\t" 02551 "paddb %%mm1, %%mm0 \n\t" 02552 02553 "movq (%%"REG_a", %3, 4), %%mm1 \n\t" 02554 "psubb %%mm1, %%mm2 \n\t" 02555 "paddb %%mm7, %%mm2 \n\t" 02556 "pcmpgtb %%mm6, %%mm2 \n\t" 02557 "paddb %%mm2, %%mm0 \n\t" 02558 "psubusb %%mm3, %%mm4 \n\t" 02559 02560 "pxor %%mm6, %%mm6 \n\t" 02561 "movq %4, %%mm7 \n\t" // QP,..., QP 02562 "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP 02563 "psubusb %%mm4, %%mm7 \n\t" // Diff >=2QP -> 0 02564 "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0 02565 "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0 02566 "movq %%mm7, %1 \n\t" 02567 02568 "movq %5, %%mm7 \n\t" 02569 "punpcklbw %%mm7, %%mm7 \n\t" 02570 "punpcklbw %%mm7, %%mm7 \n\t" 02571 "punpcklbw %%mm7, %%mm7 \n\t" 02572 "psubb %%mm0, %%mm6 \n\t" 02573 "pcmpgtb %%mm7, %%mm6 \n\t" 02574 "movq %%mm6, %0 \n\t" 02575 02576 : "=m" (eq_mask), "=m" (dc_mask) 02577 : "r" (src), "r" ((x86_reg)step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold) 02578 : "%"REG_a 02579 ); 02580 02581 both_masks = dc_mask & eq_mask; 02582 02583 if(both_masks){ 02584 x86_reg offset= -8*step; 02585 int64_t *temp_sums= sums; 02586 02587 __asm__ volatile( 02588 "movq %2, %%mm0 \n\t" // QP,..., QP 02589 "pxor %%mm4, %%mm4 \n\t" 02590 02591 "movq (%0), %%mm6 \n\t" 02592 "movq (%0, %1), %%mm5 \n\t" 02593 "movq %%mm5, %%mm1 \n\t" 02594 "movq %%mm6, %%mm2 \n\t" 02595 "psubusb %%mm6, %%mm5 \n\t" 02596 "psubusb %%mm1, %%mm2 \n\t" 02597 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines 02598 "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0 02599 "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF 02600 02601 "pxor %%mm6, %%mm1 \n\t" 02602 "pand %%mm0, %%mm1 \n\t" 02603 "pxor %%mm1, %%mm6 \n\t" 02604 // 0:QP 6:First 02605 02606 "movq (%0, %1, 8), %%mm5 \n\t" 02607 "add %1, %0 \n\t" // %0 points to line 1 not 0 02608 "movq (%0, %1, 8), %%mm7 \n\t" 02609 "movq %%mm5, %%mm1 \n\t" 02610 "movq %%mm7, %%mm2 \n\t" 02611 "psubusb %%mm7, %%mm5 \n\t" 02612 "psubusb %%mm1, %%mm2 \n\t" 02613 "por %%mm5, %%mm2 \n\t" // ABS Diff of lines 02614 "movq %2, %%mm0 \n\t" // QP,..., QP 02615 "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0 02616 "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF 02617 02618 "pxor %%mm7, %%mm1 \n\t" 02619 "pand %%mm0, %%mm1 \n\t" 02620 "pxor %%mm1, %%mm7 \n\t" 02621 02622 "movq %%mm6, %%mm5 \n\t" 02623 "punpckhbw %%mm4, %%mm6 \n\t" 02624 "punpcklbw %%mm4, %%mm5 \n\t" 02625 // 4:0 5/6:First 7:Last 02626 02627 "movq %%mm5, %%mm0 \n\t" 02628 "movq %%mm6, %%mm1 \n\t" 02629 "psllw $2, %%mm0 \n\t" 02630 "psllw $2, %%mm1 \n\t" 02631 "paddw "MANGLE(w04)", %%mm0 \n\t" 02632 "paddw "MANGLE(w04)", %%mm1 \n\t" 02633 02634 #define NEXT\ 02635 "movq (%0), %%mm2 \n\t"\ 02636 "movq (%0), %%mm3 \n\t"\ 02637 "add %1, %0 \n\t"\ 02638 "punpcklbw %%mm4, %%mm2 \n\t"\ 02639 "punpckhbw %%mm4, %%mm3 \n\t"\ 02640 "paddw %%mm2, %%mm0 \n\t"\ 02641 "paddw %%mm3, %%mm1 \n\t" 02642 02643 #define PREV\ 02644 "movq (%0), %%mm2 \n\t"\ 02645 "movq (%0), %%mm3 \n\t"\ 02646 "add %1, %0 \n\t"\ 02647 "punpcklbw %%mm4, %%mm2 \n\t"\ 02648 "punpckhbw %%mm4, %%mm3 \n\t"\ 02649 "psubw %%mm2, %%mm0 \n\t"\ 
02650 "psubw %%mm3, %%mm1 \n\t" 02651 02652 02653 NEXT //0 02654 NEXT //1 02655 NEXT //2 02656 "movq %%mm0, (%3) \n\t" 02657 "movq %%mm1, 8(%3) \n\t" 02658 02659 NEXT //3 02660 "psubw %%mm5, %%mm0 \n\t" 02661 "psubw %%mm6, %%mm1 \n\t" 02662 "movq %%mm0, 16(%3) \n\t" 02663 "movq %%mm1, 24(%3) \n\t" 02664 02665 NEXT //4 02666 "psubw %%mm5, %%mm0 \n\t" 02667 "psubw %%mm6, %%mm1 \n\t" 02668 "movq %%mm0, 32(%3) \n\t" 02669 "movq %%mm1, 40(%3) \n\t" 02670 02671 NEXT //5 02672 "psubw %%mm5, %%mm0 \n\t" 02673 "psubw %%mm6, %%mm1 \n\t" 02674 "movq %%mm0, 48(%3) \n\t" 02675 "movq %%mm1, 56(%3) \n\t" 02676 02677 NEXT //6 02678 "psubw %%mm5, %%mm0 \n\t" 02679 "psubw %%mm6, %%mm1 \n\t" 02680 "movq %%mm0, 64(%3) \n\t" 02681 "movq %%mm1, 72(%3) \n\t" 02682 02683 "movq %%mm7, %%mm6 \n\t" 02684 "punpckhbw %%mm4, %%mm7 \n\t" 02685 "punpcklbw %%mm4, %%mm6 \n\t" 02686 02687 NEXT //7 02688 "mov %4, %0 \n\t" 02689 "add %1, %0 \n\t" 02690 PREV //0 02691 "movq %%mm0, 80(%3) \n\t" 02692 "movq %%mm1, 88(%3) \n\t" 02693 02694 PREV //1 02695 "paddw %%mm6, %%mm0 \n\t" 02696 "paddw %%mm7, %%mm1 \n\t" 02697 "movq %%mm0, 96(%3) \n\t" 02698 "movq %%mm1, 104(%3) \n\t" 02699 02700 PREV //2 02701 "paddw %%mm6, %%mm0 \n\t" 02702 "paddw %%mm7, %%mm1 \n\t" 02703 "movq %%mm0, 112(%3) \n\t" 02704 "movq %%mm1, 120(%3) \n\t" 02705 02706 PREV //3 02707 "paddw %%mm6, %%mm0 \n\t" 02708 "paddw %%mm7, %%mm1 \n\t" 02709 "movq %%mm0, 128(%3) \n\t" 02710 "movq %%mm1, 136(%3) \n\t" 02711 02712 PREV //4 02713 "paddw %%mm6, %%mm0 \n\t" 02714 "paddw %%mm7, %%mm1 \n\t" 02715 "movq %%mm0, 144(%3) \n\t" 02716 "movq %%mm1, 152(%3) \n\t" 02717 02718 "mov %4, %0 \n\t" //FIXME 02719 02720 : "+&r"(src) 02721 : "r" ((x86_reg)step), "m" (c->pQPb), "r"(sums), "g"(src) 02722 ); 02723 02724 src+= step; // src points to begin of the 8x8 Block 02725 02726 __asm__ volatile( 02727 "movq %4, %%mm6 \n\t" 02728 "pcmpeqb %%mm5, %%mm5 \n\t" 02729 "pxor %%mm6, %%mm5 \n\t" 02730 "pxor %%mm7, %%mm7 \n\t" 02731 02732 "1: \n\t" 02733 "movq (%1), %%mm0 \n\t" 02734 "movq 8(%1), %%mm1 \n\t" 02735 "paddw 32(%1), %%mm0 \n\t" 02736 "paddw 40(%1), %%mm1 \n\t" 02737 "movq (%0, %3), %%mm2 \n\t" 02738 "movq %%mm2, %%mm3 \n\t" 02739 "movq %%mm2, %%mm4 \n\t" 02740 "punpcklbw %%mm7, %%mm2 \n\t" 02741 "punpckhbw %%mm7, %%mm3 \n\t" 02742 "paddw %%mm2, %%mm0 \n\t" 02743 "paddw %%mm3, %%mm1 \n\t" 02744 "paddw %%mm2, %%mm0 \n\t" 02745 "paddw %%mm3, %%mm1 \n\t" 02746 "psrlw $4, %%mm0 \n\t" 02747 "psrlw $4, %%mm1 \n\t" 02748 "packuswb %%mm1, %%mm0 \n\t" 02749 "pand %%mm6, %%mm0 \n\t" 02750 "pand %%mm5, %%mm4 \n\t" 02751 "por %%mm4, %%mm0 \n\t" 02752 "movq %%mm0, (%0, %3) \n\t" 02753 "add $16, %1 \n\t" 02754 "add %2, %0 \n\t" 02755 " js 1b \n\t" 02756 02757 : "+r"(offset), "+r"(temp_sums) 02758 : "r" ((x86_reg)step), "r"(src - offset), "m"(both_masks) 02759 ); 02760 }else 02761 src+= step; // src points to begin of the 8x8 Block 02762 02763 if(eq_mask != -1LL){ 02764 uint8_t *temp_src= src; 02765 __asm__ volatile( 02766 "pxor %%mm7, %%mm7 \n\t" 02767 "lea -40(%%"REG_SP"), %%"REG_c" \n\t" // make space for 4 8-byte vars 02768 "and "ALIGN_MASK", %%"REG_c" \n\t" // align 02769 // 0 1 2 3 4 5 6 7 8 9 02770 // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %1+8%1 ecx+4%1 02771 02772 "movq (%0), %%mm0 \n\t" 02773 "movq %%mm0, %%mm1 \n\t" 02774 "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0 02775 "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 02776 02777 "movq (%0, %1), %%mm2 \n\t" 02778 "lea (%0, %1, 2), %%"REG_a" \n\t" 02779 "movq %%mm2, %%mm3 \n\t" 02780 "punpcklbw %%mm7, %%mm2 
\n\t" // low part of line 1 02781 "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1 02782 02783 "movq (%%"REG_a"), %%mm4 \n\t" 02784 "movq %%mm4, %%mm5 \n\t" 02785 "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2 02786 "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2 02787 02788 "paddw %%mm0, %%mm0 \n\t" // 2L0 02789 "paddw %%mm1, %%mm1 \n\t" // 2H0 02790 "psubw %%mm4, %%mm2 \n\t" // L1 - L2 02791 "psubw %%mm5, %%mm3 \n\t" // H1 - H2 02792 "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2 02793 "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2 02794 02795 "psllw $2, %%mm2 \n\t" // 4L1 - 4L2 02796 "psllw $2, %%mm3 \n\t" // 4H1 - 4H2 02797 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 02798 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 02799 02800 "movq (%%"REG_a", %1), %%mm2 \n\t" 02801 "movq %%mm2, %%mm3 \n\t" 02802 "punpcklbw %%mm7, %%mm2 \n\t" // L3 02803 "punpckhbw %%mm7, %%mm3 \n\t" // H3 02804 02805 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 02806 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 02807 "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 02808 "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 02809 "movq %%mm0, (%%"REG_c") \n\t" // 2L0 - 5L1 + 5L2 - 2L3 02810 "movq %%mm1, 8(%%"REG_c") \n\t" // 2H0 - 5H1 + 5H2 - 2H3 02811 02812 "movq (%%"REG_a", %1, 2), %%mm0 \n\t" 02813 "movq %%mm0, %%mm1 \n\t" 02814 "punpcklbw %%mm7, %%mm0 \n\t" // L4 02815 "punpckhbw %%mm7, %%mm1 \n\t" // H4 02816 02817 "psubw %%mm0, %%mm2 \n\t" // L3 - L4 02818 "psubw %%mm1, %%mm3 \n\t" // H3 - H4 02819 "movq %%mm2, 16(%%"REG_c") \n\t" // L3 - L4 02820 "movq %%mm3, 24(%%"REG_c") \n\t" // H3 - H4 02821 "paddw %%mm4, %%mm4 \n\t" // 2L2 02822 "paddw %%mm5, %%mm5 \n\t" // 2H2 02823 "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 02824 "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 02825 02826 "lea (%%"REG_a", %1), %0 \n\t" 02827 "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 02828 "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 02829 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 02830 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 02831 //50 opcodes so far 02832 "movq (%0, %1, 2), %%mm2 \n\t" 02833 "movq %%mm2, %%mm3 \n\t" 02834 "punpcklbw %%mm7, %%mm2 \n\t" // L5 02835 "punpckhbw %%mm7, %%mm3 \n\t" // H5 02836 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5 02837 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5 02838 "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 02839 "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 02840 02841 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" 02842 "punpcklbw %%mm7, %%mm6 \n\t" // L6 02843 "psubw %%mm6, %%mm2 \n\t" // L5 - L6 02844 "movq (%%"REG_a", %1, 4), %%mm6 \n\t" 02845 "punpckhbw %%mm7, %%mm6 \n\t" // H6 02846 "psubw %%mm6, %%mm3 \n\t" // H5 - H6 02847 02848 "paddw %%mm0, %%mm0 \n\t" // 2L4 02849 "paddw %%mm1, %%mm1 \n\t" // 2H4 02850 "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6 02851 "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6 02852 02853 "psllw $2, %%mm2 \n\t" // 4L5 - 4L6 02854 "psllw $2, %%mm3 \n\t" // 4H5 - 4H6 02855 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 02856 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 02857 02858 "movq (%0, %1, 4), %%mm2 \n\t" 02859 "movq %%mm2, %%mm3 \n\t" 02860 "punpcklbw %%mm7, %%mm2 \n\t" // L7 02861 "punpckhbw %%mm7, %%mm3 \n\t" // H7 02862 02863 "paddw %%mm2, %%mm2 \n\t" // 2L7 02864 "paddw %%mm3, %%mm3 \n\t" // 2H7 02865 "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 02866 "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 02867 02868 "movq (%%"REG_c"), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 02869 "movq 8(%%"REG_c"), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 02870 
02871 #if HAVE_MMX2 02872 "movq %%mm7, %%mm6 \n\t" // 0 02873 "psubw %%mm0, %%mm6 \n\t" 02874 "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| 02875 "movq %%mm7, %%mm6 \n\t" // 0 02876 "psubw %%mm1, %%mm6 \n\t" 02877 "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| 02878 "movq %%mm7, %%mm6 \n\t" // 0 02879 "psubw %%mm2, %%mm6 \n\t" 02880 "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| 02881 "movq %%mm7, %%mm6 \n\t" // 0 02882 "psubw %%mm3, %%mm6 \n\t" 02883 "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| 02884 #else 02885 "movq %%mm7, %%mm6 \n\t" // 0 02886 "pcmpgtw %%mm0, %%mm6 \n\t" 02887 "pxor %%mm6, %%mm0 \n\t" 02888 "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| 02889 "movq %%mm7, %%mm6 \n\t" // 0 02890 "pcmpgtw %%mm1, %%mm6 \n\t" 02891 "pxor %%mm6, %%mm1 \n\t" 02892 "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| 02893 "movq %%mm7, %%mm6 \n\t" // 0 02894 "pcmpgtw %%mm2, %%mm6 \n\t" 02895 "pxor %%mm6, %%mm2 \n\t" 02896 "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| 02897 "movq %%mm7, %%mm6 \n\t" // 0 02898 "pcmpgtw %%mm3, %%mm6 \n\t" 02899 "pxor %%mm6, %%mm3 \n\t" 02900 "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| 02901 #endif 02902 02903 #if HAVE_MMX2 02904 "pminsw %%mm2, %%mm0 \n\t" 02905 "pminsw %%mm3, %%mm1 \n\t" 02906 #else 02907 "movq %%mm0, %%mm6 \n\t" 02908 "psubusw %%mm2, %%mm6 \n\t" 02909 "psubw %%mm6, %%mm0 \n\t" 02910 "movq %%mm1, %%mm6 \n\t" 02911 "psubusw %%mm3, %%mm6 \n\t" 02912 "psubw %%mm6, %%mm1 \n\t" 02913 #endif 02914 02915 "movd %2, %%mm2 \n\t" // QP 02916 "punpcklbw %%mm7, %%mm2 \n\t" 02917 02918 "movq %%mm7, %%mm6 \n\t" // 0 02919 "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5) 02920 "pxor %%mm6, %%mm4 \n\t" 02921 "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5| 02922 "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) 02923 "pxor %%mm7, %%mm5 \n\t" 02924 "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| 02925 // 100 opcodes 02926 "psllw $3, %%mm2 \n\t" // 8QP 02927 "movq %%mm2, %%mm3 \n\t" // 8QP 02928 "pcmpgtw %%mm4, %%mm2 \n\t" 02929 "pcmpgtw %%mm5, %%mm3 \n\t" 02930 "pand %%mm2, %%mm4 \n\t" 02931 "pand %%mm3, %%mm5 \n\t" 02932 02933 02934 "psubusw %%mm0, %%mm4 \n\t" // hd 02935 "psubusw %%mm1, %%mm5 \n\t" // ld 02936 02937 02938 "movq "MANGLE(w05)", %%mm2 \n\t" // 5 02939 "pmullw %%mm2, %%mm4 \n\t" 02940 "pmullw %%mm2, %%mm5 \n\t" 02941 "movq "MANGLE(w20)", %%mm2 \n\t" // 32 02942 "paddw %%mm2, %%mm4 \n\t" 02943 "paddw %%mm2, %%mm5 \n\t" 02944 "psrlw $6, %%mm4 \n\t" 02945 "psrlw $6, %%mm5 \n\t" 02946 02947 "movq 16(%%"REG_c"), %%mm0 \n\t" // L3 - L4 02948 "movq 24(%%"REG_c"), %%mm1 \n\t" // H3 - H4 02949 02950 "pxor %%mm2, %%mm2 \n\t" 02951 "pxor %%mm3, %%mm3 \n\t" 02952 02953 "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4) 02954 "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4) 02955 "pxor %%mm2, %%mm0 \n\t" 02956 "pxor %%mm3, %%mm1 \n\t" 02957 "psubw %%mm2, %%mm0 \n\t" // |L3-L4| 02958 "psubw %%mm3, %%mm1 \n\t" // |H3-H4| 02959 "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2 02960 "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2 02961 02962 "pxor %%mm6, %%mm2 \n\t" 02963 "pxor %%mm7, %%mm3 \n\t" 02964 "pand %%mm2, %%mm4 \n\t" 02965 "pand %%mm3, %%mm5 \n\t" 02966 02967 #if HAVE_MMX2 02968 "pminsw %%mm0, %%mm4 \n\t" 02969 "pminsw %%mm1, %%mm5 \n\t" 02970 #else 02971 "movq %%mm4, %%mm2 \n\t" 02972 "psubusw %%mm0, %%mm2 \n\t" 02973 "psubw %%mm2, %%mm4 \n\t" 02974 "movq %%mm5, %%mm2 \n\t" 02975 "psubusw %%mm1, %%mm2 \n\t" 02976 "psubw %%mm2, %%mm5 \n\t" 02977 #endif 02978 "pxor %%mm6, %%mm4 \n\t" 02979 "pxor %%mm7, %%mm5 
\n\t" 02980 "psubw %%mm6, %%mm4 \n\t" 02981 "psubw %%mm7, %%mm5 \n\t" 02982 "packsswb %%mm5, %%mm4 \n\t" 02983 "movq %3, %%mm1 \n\t" 02984 "pandn %%mm4, %%mm1 \n\t" 02985 "movq (%0), %%mm0 \n\t" 02986 "paddb %%mm1, %%mm0 \n\t" 02987 "movq %%mm0, (%0) \n\t" 02988 "movq (%0, %1), %%mm0 \n\t" 02989 "psubb %%mm1, %%mm0 \n\t" 02990 "movq %%mm0, (%0, %1) \n\t" 02991 02992 : "+r" (temp_src) 02993 : "r" ((x86_reg)step), "m" (c->pQPb), "m"(eq_mask) 02994 : "%"REG_a, "%"REG_c 02995 ); 02996 } 02997 /*if(step==16){ 02998 STOP_TIMER("step16") 02999 }else{ 03000 STOP_TIMER("stepX") 03001 }*/ 03002 } 03003 #endif //HAVE_MMX 03004 03005 static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, 03006 const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c); 03007 03012 #undef REAL_SCALED_CPY 03013 #undef SCALED_CPY 03014 03015 static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t src[], int srcStride, 03016 int levelFix, int64_t *packedOffsetAndScale) 03017 { 03018 #if !HAVE_MMX 03019 int i; 03020 #endif 03021 if(levelFix){ 03022 #if HAVE_MMX 03023 __asm__ volatile( 03024 "movq (%%"REG_a"), %%mm2 \n\t" // packedYOffset 03025 "movq 8(%%"REG_a"), %%mm3 \n\t" // packedYScale 03026 "lea (%2,%4), %%"REG_a" \n\t" 03027 "lea (%3,%5), %%"REG_d" \n\t" 03028 "pxor %%mm4, %%mm4 \n\t" 03029 #if HAVE_MMX2 03030 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \ 03031 "movq " #src1 ", %%mm0 \n\t"\ 03032 "movq " #src1 ", %%mm5 \n\t"\ 03033 "movq " #src2 ", %%mm1 \n\t"\ 03034 "movq " #src2 ", %%mm6 \n\t"\ 03035 "punpcklbw %%mm0, %%mm0 \n\t"\ 03036 "punpckhbw %%mm5, %%mm5 \n\t"\ 03037 "punpcklbw %%mm1, %%mm1 \n\t"\ 03038 "punpckhbw %%mm6, %%mm6 \n\t"\ 03039 "pmulhuw %%mm3, %%mm0 \n\t"\ 03040 "pmulhuw %%mm3, %%mm5 \n\t"\ 03041 "pmulhuw %%mm3, %%mm1 \n\t"\ 03042 "pmulhuw %%mm3, %%mm6 \n\t"\ 03043 "psubw %%mm2, %%mm0 \n\t"\ 03044 "psubw %%mm2, %%mm5 \n\t"\ 03045 "psubw %%mm2, %%mm1 \n\t"\ 03046 "psubw %%mm2, %%mm6 \n\t"\ 03047 "packuswb %%mm5, %%mm0 \n\t"\ 03048 "packuswb %%mm6, %%mm1 \n\t"\ 03049 "movq %%mm0, " #dst1 " \n\t"\ 03050 "movq %%mm1, " #dst2 " \n\t"\ 03051 03052 #else //HAVE_MMX2 03053 #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \ 03054 "movq " #src1 ", %%mm0 \n\t"\ 03055 "movq " #src1 ", %%mm5 \n\t"\ 03056 "punpcklbw %%mm4, %%mm0 \n\t"\ 03057 "punpckhbw %%mm4, %%mm5 \n\t"\ 03058 "psubw %%mm2, %%mm0 \n\t"\ 03059 "psubw %%mm2, %%mm5 \n\t"\ 03060 "movq " #src2 ", %%mm1 \n\t"\ 03061 "psllw $6, %%mm0 \n\t"\ 03062 "psllw $6, %%mm5 \n\t"\ 03063 "pmulhw %%mm3, %%mm0 \n\t"\ 03064 "movq " #src2 ", %%mm6 \n\t"\ 03065 "pmulhw %%mm3, %%mm5 \n\t"\ 03066 "punpcklbw %%mm4, %%mm1 \n\t"\ 03067 "punpckhbw %%mm4, %%mm6 \n\t"\ 03068 "psubw %%mm2, %%mm1 \n\t"\ 03069 "psubw %%mm2, %%mm6 \n\t"\ 03070 "psllw $6, %%mm1 \n\t"\ 03071 "psllw $6, %%mm6 \n\t"\ 03072 "pmulhw %%mm3, %%mm1 \n\t"\ 03073 "pmulhw %%mm3, %%mm6 \n\t"\ 03074 "packuswb %%mm5, %%mm0 \n\t"\ 03075 "packuswb %%mm6, %%mm1 \n\t"\ 03076 "movq %%mm0, " #dst1 " \n\t"\ 03077 "movq %%mm1, " #dst2 " \n\t"\ 03078 03079 #endif //HAVE_MMX2 03080 #define SCALED_CPY(src1, src2, dst1, dst2)\ 03081 REAL_SCALED_CPY(src1, src2, dst1, dst2) 03082 03083 SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5)) 03084 SCALED_CPY((%2, %4, 2), (%%REGa, %4, 2), (%3, %5, 2), (%%REGd, %5, 2)) 03085 SCALED_CPY((%2, %4, 4), (%%REGa, %4, 4), (%3, %5, 4), (%%REGd, %5, 4)) 03086 "lea (%%"REG_a",%4,4), %%"REG_a" \n\t" 03087 "lea (%%"REG_d",%5,4), %%"REG_d" \n\t" 03088 SCALED_CPY((%%REGa, %4), (%%REGa, %4, 2), 
(%%REGd, %5), (%%REGd, %5, 2)) 03089 03090 03091 : "=&a" (packedOffsetAndScale) 03092 : "0" (packedOffsetAndScale), 03093 "r"(src), 03094 "r"(dst), 03095 "r" ((x86_reg)srcStride), 03096 "r" ((x86_reg)dstStride) 03097 : "%"REG_d 03098 ); 03099 #else //HAVE_MMX 03100 for(i=0; i<8; i++) 03101 memcpy( &(dst[dstStride*i]), 03102 &(src[srcStride*i]), BLOCK_SIZE); 03103 #endif //HAVE_MMX 03104 }else{ 03105 #if HAVE_MMX 03106 __asm__ volatile( 03107 "lea (%0,%2), %%"REG_a" \n\t" 03108 "lea (%1,%3), %%"REG_d" \n\t" 03109 03110 #define REAL_SIMPLE_CPY(src1, src2, dst1, dst2) \ 03111 "movq " #src1 ", %%mm0 \n\t"\ 03112 "movq " #src2 ", %%mm1 \n\t"\ 03113 "movq %%mm0, " #dst1 " \n\t"\ 03114 "movq %%mm1, " #dst2 " \n\t"\ 03115 03116 #define SIMPLE_CPY(src1, src2, dst1, dst2)\ 03117 REAL_SIMPLE_CPY(src1, src2, dst1, dst2) 03118 03119 SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3)) 03120 SIMPLE_CPY((%0, %2, 2), (%%REGa, %2, 2), (%1, %3, 2), (%%REGd, %3, 2)) 03121 SIMPLE_CPY((%0, %2, 4), (%%REGa, %2, 4), (%1, %3, 4), (%%REGd, %3, 4)) 03122 "lea (%%"REG_a",%2,4), %%"REG_a" \n\t" 03123 "lea (%%"REG_d",%3,4), %%"REG_d" \n\t" 03124 SIMPLE_CPY((%%REGa, %2), (%%REGa, %2, 2), (%%REGd, %3), (%%REGd, %3, 2)) 03125 03126 : : "r" (src), 03127 "r" (dst), 03128 "r" ((x86_reg)srcStride), 03129 "r" ((x86_reg)dstStride) 03130 : "%"REG_a, "%"REG_d 03131 ); 03132 #else //HAVE_MMX 03133 for(i=0; i<8; i++) 03134 memcpy( &(dst[dstStride*i]), 03135 &(src[srcStride*i]), BLOCK_SIZE); 03136 #endif //HAVE_MMX 03137 } 03138 } 03139 03143 static inline void RENAME(duplicate)(uint8_t src[], int stride) 03144 { 03145 #if HAVE_MMX 03146 __asm__ volatile( 03147 "movq (%0), %%mm0 \n\t" 03148 "add %1, %0 \n\t" 03149 "movq %%mm0, (%0) \n\t" 03150 "movq %%mm0, (%0, %1) \n\t" 03151 "movq %%mm0, (%0, %1, 2) \n\t" 03152 : "+r" (src) 03153 : "r" ((x86_reg)-stride) 03154 ); 03155 #else 03156 int i; 03157 uint8_t *p=src; 03158 for(i=0; i<3; i++){ 03159 p-= stride; 03160 memcpy(p, src, 8); 03161 } 03162 #endif 03163 } 03164 03168 static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, 03169 const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2) 03170 { 03171 DECLARE_ALIGNED(8, PPContext, c)= *c2; //copy to stack for faster access 03172 int x,y; 03173 #ifdef COMPILE_TIME_MODE 03174 const int mode= COMPILE_TIME_MODE; 03175 #else 03176 const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode; 03177 #endif 03178 int black=0, white=255; // blackest black and whitest white in the picture 03179 int QPCorrecture= 256*256; 03180 03181 int copyAhead; 03182 #if HAVE_MMX 03183 int i; 03184 #endif 03185 03186 const int qpHShift= isColor ? 4-c.hChromaSubSample : 4; 03187 const int qpVShift= isColor ? 4-c.vChromaSubSample : 4; 03188 03189 //FIXME remove 03190 uint64_t * const yHistogram= c.yHistogram; 03191 uint8_t * const tempSrc= srcStride > 0 ? c.tempSrc : c.tempSrc - 23*srcStride; 03192 uint8_t * const tempDst= dstStride > 0 ? c.tempDst : c.tempDst - 23*dstStride; 03193 //const int mbWidth= isColor ? 
(width+7)>>3 : (width+15)>>4; 03194 03195 #if HAVE_MMX 03196 for(i=0; i<57; i++){ 03197 int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1; 03198 int threshold= offset*2 + 1; 03199 c.mmxDcOffset[i]= 0x7F - offset; 03200 c.mmxDcThreshold[i]= 0x7F - threshold; 03201 c.mmxDcOffset[i]*= 0x0101010101010101LL; 03202 c.mmxDcThreshold[i]*= 0x0101010101010101LL; 03203 } 03204 #endif 03205 03206 if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16; 03207 else if( (mode & LINEAR_BLEND_DEINT_FILTER) 03208 || (mode & FFMPEG_DEINT_FILTER) 03209 || (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14; 03210 else if( (mode & V_DEBLOCK) 03211 || (mode & LINEAR_IPOL_DEINT_FILTER) 03212 || (mode & MEDIAN_DEINT_FILTER) 03213 || (mode & V_A_DEBLOCK)) copyAhead=13; 03214 else if(mode & V_X1_FILTER) copyAhead=11; 03215 // else if(mode & V_RK1_FILTER) copyAhead=10; 03216 else if(mode & DERING) copyAhead=9; 03217 else copyAhead=8; 03218 03219 copyAhead-= 8; 03220 03221 if(!isColor){ 03222 uint64_t sum= 0; 03223 int i; 03224 uint64_t maxClipped; 03225 uint64_t clipped; 03226 double scale; 03227 03228 c.frameNum++; 03229 // first frame is fscked so we ignore it 03230 if(c.frameNum == 1) yHistogram[0]= width*height/64*15/256; 03231 03232 for(i=0; i<256; i++){ 03233 sum+= yHistogram[i]; 03234 } 03235 03236 /* We always get a completely black picture first. */ 03237 maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold); 03238 03239 clipped= sum; 03240 for(black=255; black>0; black--){ 03241 if(clipped < maxClipped) break; 03242 clipped-= yHistogram[black]; 03243 } 03244 03245 clipped= sum; 03246 for(white=0; white<256; white++){ 03247 if(clipped < maxClipped) break; 03248 clipped-= yHistogram[white]; 03249 } 03250 03251 scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black); 03252 03253 #if HAVE_MMX2 03254 c.packedYScale= (uint16_t)(scale*256.0 + 0.5); 03255 c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF; 03256 #else 03257 c.packedYScale= (uint16_t)(scale*1024.0 + 0.5); 03258 c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF; 03259 #endif 03260 03261 c.packedYOffset|= c.packedYOffset<<32; 03262 c.packedYOffset|= c.packedYOffset<<16; 03263 03264 c.packedYScale|= c.packedYScale<<32; 03265 c.packedYScale|= c.packedYScale<<16; 03266 03267 if(mode & LEVEL_FIX) QPCorrecture= (int)(scale*256*256 + 0.5); 03268 else QPCorrecture= 256*256; 03269 }else{ 03270 c.packedYScale= 0x0100010001000100LL; 03271 c.packedYOffset= 0; 03272 QPCorrecture= 256*256; 03273 } 03274 03275 /* copy & deinterlace first row of blocks */ 03276 y=-BLOCK_SIZE; 03277 { 03278 const uint8_t *srcBlock= &(src[y*srcStride]); 03279 uint8_t *dstBlock= tempDst + dstStride; 03280 03281 // From this point on it is guaranteed that we can read and write 16 lines downward 03282 // finish 1 block before the next otherwise we might have a problem 03283 // with the L1 Cache of the P4 ... 
or only a few blocks at a time or soemthing 03284 for(x=0; x<width; x+=BLOCK_SIZE){ 03285 03286 #if HAVE_MMX2 03287 /* 03288 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); 03289 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); 03290 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); 03291 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); 03292 */ 03293 03294 __asm__( 03295 "mov %4, %%"REG_a" \n\t" 03296 "shr $2, %%"REG_a" \n\t" 03297 "and $6, %%"REG_a" \n\t" 03298 "add %5, %%"REG_a" \n\t" 03299 "mov %%"REG_a", %%"REG_d" \n\t" 03300 "imul %1, %%"REG_a" \n\t" 03301 "imul %3, %%"REG_d" \n\t" 03302 "prefetchnta 32(%%"REG_a", %0) \n\t" 03303 "prefetcht0 32(%%"REG_d", %2) \n\t" 03304 "add %1, %%"REG_a" \n\t" 03305 "add %3, %%"REG_d" \n\t" 03306 "prefetchnta 32(%%"REG_a", %0) \n\t" 03307 "prefetcht0 32(%%"REG_d", %2) \n\t" 03308 :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride), 03309 "g" ((x86_reg)x), "g" ((x86_reg)copyAhead) 03310 : "%"REG_a, "%"REG_d 03311 ); 03312 03313 #elif HAVE_AMD3DNOW 03314 //FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ... 03315 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); 03316 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); 03317 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); 03318 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); 03319 */ 03320 #endif 03321 03322 RENAME(blockCopy)(dstBlock + dstStride*8, dstStride, 03323 srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset); 03324 03325 RENAME(duplicate)(dstBlock + dstStride*8, dstStride); 03326 03327 if(mode & LINEAR_IPOL_DEINT_FILTER) 03328 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); 03329 else if(mode & LINEAR_BLEND_DEINT_FILTER) 03330 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x); 03331 else if(mode & MEDIAN_DEINT_FILTER) 03332 RENAME(deInterlaceMedian)(dstBlock, dstStride); 03333 else if(mode & CUBIC_IPOL_DEINT_FILTER) 03334 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); 03335 else if(mode & FFMPEG_DEINT_FILTER) 03336 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x); 03337 else if(mode & LOWPASS5_DEINT_FILTER) 03338 RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x); 03339 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) 03340 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); 03341 */ 03342 dstBlock+=8; 03343 srcBlock+=8; 03344 } 03345 if(width==FFABS(dstStride)) 03346 linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride); 03347 else{ 03348 int i; 03349 for(i=0; i<copyAhead; i++){ 03350 memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width); 03351 } 03352 } 03353 } 03354 03355 for(y=0; y<height; y+=BLOCK_SIZE){ 03356 //1% speedup if these are here instead of the inner loop 03357 const uint8_t *srcBlock= &(src[y*srcStride]); 03358 uint8_t *dstBlock= &(dst[y*dstStride]); 03359 #if HAVE_MMX 03360 uint8_t *tempBlock1= c.tempBlocks; 03361 uint8_t *tempBlock2= c.tempBlocks + 8; 03362 #endif 03363 const int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride]; 03364 int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*FFABS(QPStride)]; 03365 int QP=0; 03366 /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards 03367 if not than use a temporary buffer */ 03368 if(y+15 >= height){ 03369 int i; 03370 /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with 03371 blockcopy to dst later */ 03372 linecpy(tempSrc + srcStride*copyAhead, 
srcBlock + srcStride*copyAhead, 03373 FFMAX(height-y-copyAhead, 0), srcStride); 03374 03375 /* duplicate last line of src to fill the void upto line (copyAhead+7) */ 03376 for(i=FFMAX(height-y, 8); i<copyAhead+8; i++) 03377 memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), FFABS(srcStride)); 03378 03379 /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/ 03380 linecpy(tempDst, dstBlock - dstStride, FFMIN(height-y+1, copyAhead+1), dstStride); 03381 03382 /* duplicate last line of dst to fill the void upto line (copyAhead) */ 03383 for(i=height-y+1; i<=copyAhead; i++) 03384 memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), FFABS(dstStride)); 03385 03386 dstBlock= tempDst + dstStride; 03387 srcBlock= tempSrc; 03388 } 03389 03390 // From this point on it is guaranteed that we can read and write 16 lines downward 03391 // finish 1 block before the next otherwise we might have a problem 03392 // with the L1 Cache of the P4 ... or only a few blocks at a time or soemthing 03393 for(x=0; x<width; x+=BLOCK_SIZE){ 03394 const int stride= dstStride; 03395 #if HAVE_MMX 03396 uint8_t *tmpXchg; 03397 #endif 03398 if(isColor){ 03399 QP= QPptr[x>>qpHShift]; 03400 c.nonBQP= nonBQPptr[x>>qpHShift]; 03401 }else{ 03402 QP= QPptr[x>>4]; 03403 QP= (QP* QPCorrecture + 256*128)>>16; 03404 c.nonBQP= nonBQPptr[x>>4]; 03405 c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16; 03406 yHistogram[ srcBlock[srcStride*12 + 4] ]++; 03407 } 03408 c.QP= QP; 03409 #if HAVE_MMX 03410 __asm__ volatile( 03411 "movd %1, %%mm7 \n\t" 03412 "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP 03413 "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP 03414 "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP 03415 "movq %%mm7, %0 \n\t" 03416 : "=m" (c.pQPb) 03417 : "r" (QP) 03418 ); 03419 #endif 03420 03421 03422 #if HAVE_MMX2 03423 /* 03424 prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); 03425 prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); 03426 prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); 03427 prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); 03428 */ 03429 03430 __asm__( 03431 "mov %4, %%"REG_a" \n\t" 03432 "shr $2, %%"REG_a" \n\t" 03433 "and $6, %%"REG_a" \n\t" 03434 "add %5, %%"REG_a" \n\t" 03435 "mov %%"REG_a", %%"REG_d" \n\t" 03436 "imul %1, %%"REG_a" \n\t" 03437 "imul %3, %%"REG_d" \n\t" 03438 "prefetchnta 32(%%"REG_a", %0) \n\t" 03439 "prefetcht0 32(%%"REG_d", %2) \n\t" 03440 "add %1, %%"REG_a" \n\t" 03441 "add %3, %%"REG_d" \n\t" 03442 "prefetchnta 32(%%"REG_a", %0) \n\t" 03443 "prefetcht0 32(%%"REG_d", %2) \n\t" 03444 :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride), 03445 "g" ((x86_reg)x), "g" ((x86_reg)copyAhead) 03446 : "%"REG_a, "%"REG_d 03447 ); 03448 03449 #elif HAVE_AMD3DNOW 03450 //FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ... 
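/* The MMX2 branch above prefetches two source lines (prefetchnta,
   non-temporal) and the matching destination lines (prefetcht0) a few rows
   below the block currently being filtered; the 3DNow! prefetch/prefetchw
   version below is left commented out, with the FIXME above asking whether
   it actually helps on such CPUs. */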
03451 /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); 03452 prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); 03453 prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); 03454 prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); 03455 */ 03456 #endif 03457 03458 RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride, 03459 srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset); 03460 03461 if(mode & LINEAR_IPOL_DEINT_FILTER) 03462 RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); 03463 else if(mode & LINEAR_BLEND_DEINT_FILTER) 03464 RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x); 03465 else if(mode & MEDIAN_DEINT_FILTER) 03466 RENAME(deInterlaceMedian)(dstBlock, dstStride); 03467 else if(mode & CUBIC_IPOL_DEINT_FILTER) 03468 RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); 03469 else if(mode & FFMPEG_DEINT_FILTER) 03470 RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x); 03471 else if(mode & LOWPASS5_DEINT_FILTER) 03472 RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x); 03473 /* else if(mode & CUBIC_BLEND_DEINT_FILTER) 03474 RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); 03475 */ 03476 03477 /* only deblock if we have 2 blocks */ 03478 if(y + 8 < height){ 03479 if(mode & V_X1_FILTER) 03480 RENAME(vertX1Filter)(dstBlock, stride, &c); 03481 else if(mode & V_DEBLOCK){ 03482 const int t= RENAME(vertClassify)(dstBlock, stride, &c); 03483 03484 if(t==1) 03485 RENAME(doVertLowPass)(dstBlock, stride, &c); 03486 else if(t==2) 03487 RENAME(doVertDefFilter)(dstBlock, stride, &c); 03488 }else if(mode & V_A_DEBLOCK){ 03489 RENAME(do_a_deblock)(dstBlock, stride, 1, &c); 03490 } 03491 } 03492 03493 #if HAVE_MMX 03494 RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride); 03495 #endif 03496 /* check if we have a previous block to deblock it with dstBlock */ 03497 if(x - 8 >= 0){ 03498 #if HAVE_MMX 03499 if(mode & H_X1_FILTER) 03500 RENAME(vertX1Filter)(tempBlock1, 16, &c); 03501 else if(mode & H_DEBLOCK){ 03502 //START_TIMER 03503 const int t= RENAME(vertClassify)(tempBlock1, 16, &c); 03504 //STOP_TIMER("dc & minmax") 03505 if(t==1) 03506 RENAME(doVertLowPass)(tempBlock1, 16, &c); 03507 else if(t==2) 03508 RENAME(doVertDefFilter)(tempBlock1, 16, &c); 03509 }else if(mode & H_A_DEBLOCK){ 03510 RENAME(do_a_deblock)(tempBlock1, 16, 1, &c); 03511 } 03512 03513 RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16); 03514 03515 #else 03516 if(mode & H_X1_FILTER) 03517 horizX1Filter(dstBlock-4, stride, QP); 03518 else if(mode & H_DEBLOCK){ 03519 #if HAVE_ALTIVEC 03520 DECLARE_ALIGNED(16, unsigned char, tempBlock)[272]; 03521 transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride); 03522 03523 const int t=vertClassify_altivec(tempBlock-48, 16, &c); 03524 if(t==1) { 03525 doVertLowPass_altivec(tempBlock-48, 16, &c); 03526 transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride); 03527 } 03528 else if(t==2) { 03529 doVertDefFilter_altivec(tempBlock-48, 16, &c); 03530 transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride); 03531 } 03532 #else 03533 const int t= RENAME(horizClassify)(dstBlock-4, stride, &c); 03534 03535 if(t==1) 03536 RENAME(doHorizLowPass)(dstBlock-4, stride, &c); 03537 else if(t==2) 03538 RENAME(doHorizDefFilter)(dstBlock-4, stride, &c); 03539 #endif 03540 }else if(mode & H_A_DEBLOCK){ 03541 RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c); 03542 } 03543 #endif //HAVE_MMX 
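/* Horizontal deblocking above reuses the vertical filters: with MMX the
   8x16 neighbourhood is transposed into tempBlock1/tempBlock2 (stride 16)
   by transpose1, classified and filtered there, then written back through
   transpose2; without MMX the horizontal variants (horizX1Filter,
   horizClassify, doHorizLowPass, doHorizDefFilter) work on dstBlock-4 in
   place, except for the AltiVec H_DEBLOCK path, which also goes through a
   transpose. */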
03544 if(mode & DERING){ 03545 //FIXME filter first line 03546 if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c); 03547 } 03548 03549 if(mode & TEMP_NOISE_FILTER) 03550 { 03551 RENAME(tempNoiseReducer)(dstBlock-8, stride, 03552 c.tempBlurred[isColor] + y*dstStride + x, 03553 c.tempBlurredPast[isColor] + (y>>3)*256 + (x>>3), 03554 c.ppMode.maxTmpNoise); 03555 } 03556 } 03557 03558 dstBlock+=8; 03559 srcBlock+=8; 03560 03561 #if HAVE_MMX 03562 tmpXchg= tempBlock1; 03563 tempBlock1= tempBlock2; 03564 tempBlock2 = tmpXchg; 03565 #endif 03566 } 03567 03568 if(mode & DERING){ 03569 if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c); 03570 } 03571 03572 if((mode & TEMP_NOISE_FILTER)){ 03573 RENAME(tempNoiseReducer)(dstBlock-8, dstStride, 03574 c.tempBlurred[isColor] + y*dstStride + x, 03575 c.tempBlurredPast[isColor] + (y>>3)*256 + (x>>3), 03576 c.ppMode.maxTmpNoise); 03577 } 03578 03579 /* did we use a tmp buffer for the last lines*/ 03580 if(y+15 >= height){ 03581 uint8_t *dstBlock= &(dst[y*dstStride]); 03582 if(width==FFABS(dstStride)) 03583 linecpy(dstBlock, tempDst + dstStride, height-y, dstStride); 03584 else{ 03585 int i; 03586 for(i=0; i<height-y; i++){ 03587 memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width); 03588 } 03589 } 03590 } 03591 /* 03592 for(x=0; x<width; x+=32){ 03593 volatile int i; 03594 i+= + dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride] 03595 + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride] 03596 + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride]; 03597 + dstBlock[x +13*dstStride] 03598 + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride]; 03599 }*/ 03600 } 03601 #if HAVE_AMD3DNOW 03602 __asm__ volatile("femms"); 03603 #elif HAVE_MMX 03604 __asm__ volatile("emms"); 03605 #endif 03606 03607 #ifdef DEBUG_BRIGHTNESS 03608 if(!isColor){ 03609 int max=1; 03610 int i; 03611 for(i=0; i<256; i++) 03612 if(yHistogram[i] > max) max=yHistogram[i]; 03613 03614 for(i=1; i<256; i++){ 03615 int x; 03616 int start=yHistogram[i-1]/(max/256+1); 03617 int end=yHistogram[i]/(max/256+1); 03618 int inc= end > start ? 1 : -1; 03619 for(x=start; x!=end+inc; x+=inc) 03620 dst[ i*dstStride + x]+=128; 03621 } 03622 03623 for(i=0; i<100; i+=2){ 03624 dst[ (white)*dstStride + i]+=128; 03625 dst[ (black)*dstStride + i]+=128; 03626 } 03627 } 03628 #endif 03629 03630 *c2= c; //copy local context back 03631 03632 }
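/* Standalone sketch (not part of the original file): a scalar model of the
   packed byte-average trick used throughout this file.  The MMX paths use
   pavgb/pavgusb via PAVGB, and the C fallback of deInterlaceBlendLinear
   does the same on four bytes packed in a uint32_t; that filter effectively
   applies a (1,2,1)/4 vertical blend by averaging the averaged neighbours
   with the current line.  The helper names below (avg4_down, avg4_up) are
   illustrative only. */
#include <stdint.h>
#include <stdio.h>

/* per-byte average of four packed bytes, rounded down:
   (a+b)>>1 == (a&b) + ((a^b)>>1); the 0xFE mask keeps shifted bits from
   crossing byte boundaries */
static uint32_t avg4_down(uint32_t x, uint32_t y)
{
    return (x & y) + (((x ^ y) & 0xFEFEFEFEUL) >> 1);
}

/* per-byte average rounded up, like pavgb:
   (a+b+1)>>1 == (a|b) - ((a^b)>>1) */
static uint32_t avg4_up(uint32_t x, uint32_t y)
{
    return (x | y) - (((x ^ y) & 0xFEFEFEFEUL) >> 1);
}

int main(void)
{
    unsigned a, b;
    for (a = 0; a < 256; a++) {
        for (b = 0; b < 256; b++) {
            /* replicating the byte into all four lanes checks every lane at once */
            uint32_t x = a * 0x01010101UL, y = b * 0x01010101UL;
            if ((avg4_down(x, y) & 0xFF) != ((a + b) >> 1) ||
                (avg4_up(x, y)   & 0xFF) != ((a + b + 1) >> 1)) {
                printf("mismatch at %u,%u\n", a, b);
                return 1;
            }
        }
    }
    printf("packed-average identities hold for all byte pairs\n");
    return 0;
}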