/* Libav 0.7.1 */
/*
 * software RGB to RGB converter
 * pluralize by software PAL8 to RGB converter
 * software YUV to YUV converter
 * software YUV to RGB converter
 * Written by Nick Kurshev.
 * palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
 * lot of big-endian byte order fixes by Alex Beregszaszi
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include <stddef.h>

/* This template is compiled once per CPU flavour (plain MMX, MMX2,
 * 3DNow!); undefine the instruction-name macros so that each inclusion
 * can redefine them for its own instruction set. */
#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef PAVGB

/* Pick the prefetch and packed-average mnemonics for the target CPU;
 * plain MMX has neither, so PREFETCH degrades to a nop and PAVGB stays
 * undefined there. */
#if COMPILE_TEMPLATE_AMD3DNOW
#define PREFETCH "prefetch"
#define PAVGB "pavgusb"
#elif COMPILE_TEMPLATE_MMX2
#define PREFETCH "prefetchnta"
#define PAVGB "pavgb"
#else
#define PREFETCH " # nop"
#endif

#if COMPILE_TEMPLATE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif

/* MOVNTQ (non-temporal store, bypasses the cache) and SFENCE exist only
 * from MMX2 on; fall back to a plain movq / nop otherwise. */
#if COMPILE_TEMPLATE_MMX2
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif

#if !COMPILE_TEMPLATE_SSE2

#if !COMPILE_TEMPLATE_AMD3DNOW

/*
 * Expand packed 24-bit pixels into 32-bit pixels, forcing the fourth
 * (alpha) byte of every output pixel to 0xFF: mask32a is OR-ed in by the
 * MMX loop, and the literal 255 is stored in the scalar tail.
 * Processes 8 pixels (24 source bytes / 32 destination bytes) per MMX
 * iteration; the remaining 0-7 pixels are handled bytewise below.
 * NOTE(review): the three colour bytes are copied in source order --
 * only the alpha byte is added; the bgr in the name refers to the
 * caller-side channel convention, confirm against the call sites.
 */
static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 23;  /* last s for which a full 24-byte load is in range */
    __asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"   32%1          \n\t"
            "movd         %1, %%mm0   \n\t"  /* pixel 0            */
            "punpckldq   3%1, %%mm0   \n\t"  /* pixel 1 above it   */
            "movd        6%1, %%mm1   \n\t"
            "punpckldq   9%1, %%mm1   \n\t"
            "movd       12%1, %%mm2   \n\t"
            "punpckldq  15%1, %%mm2   \n\t"
            "movd       18%1, %%mm3   \n\t"
            "punpckldq  21%1, %%mm3   \n\t"
            "por       %%mm7, %%mm0   \n\t"  /* set alpha bytes    */
            "por       %%mm7, %%mm1   \n\t"
            "por       %%mm7, %%mm2   \n\t"
            "por       %%mm7, %%mm3   \n\t"
            MOVNTQ"    %%mm0,   %0    \n\t"
            MOVNTQ"    %%mm1,  8%0    \n\t"
            MOVNTQ"    %%mm2, 16%0    \n\t"
            MOVNTQ"    %%mm3, 24%0"
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 32;
        s += 24;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* scalar tail: copy the three colour bytes, append opaque alpha */
    while (s < end) {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = 255;
    }
}

/*
 * Store helper shared by the 32->24 style converters below.
 * On entry mm0/mm1/mm4/mm5 hold four quadwords of 32-bit pixels and
 * mm2/mm3/mm6/mm7 hold copies of the same data; the macro strips every
 * fourth byte and writes the packed 24 bytes to %0 / 8%0 / 16%0.
 */
#define STORE_BGR24_MMX \
    "psrlq $8, %%mm2 \n\t" \
    "psrlq $8, %%mm3 \n\t" \
    "psrlq $8, %%mm6 \n\t" \
    "psrlq $8, %%mm7 \n\t" \
    "pand "MANGLE(mask24l)", %%mm0\n\t" \
    "pand "MANGLE(mask24l)", %%mm1\n\t" \
    "pand "MANGLE(mask24l)", %%mm4\n\t" \
    "pand "MANGLE(mask24l)", %%mm5\n\t" \
    "pand "MANGLE(mask24h)", %%mm2\n\t" \
    "pand "MANGLE(mask24h)", %%mm3\n\t" \
    "pand "MANGLE(mask24h)", %%mm6\n\t" \
    "pand "MANGLE(mask24h)", %%mm7\n\t" \
    "por %%mm2, %%mm0 \n\t" \
    "por %%mm3, %%mm1 \n\t" \
    "por %%mm6, %%mm4 \n\t" \
    "por %%mm7, %%mm5 \n\t" \
    \
    "movq %%mm1, %%mm2 \n\t" \
    "movq %%mm4, %%mm3 \n\t" \
    "psllq $48, %%mm2 \n\t" \
    "psllq $32, %%mm3 \n\t" \
    "pand "MANGLE(mask24hh)", %%mm2\n\t" \
    "pand "MANGLE(mask24hhh)", %%mm3\n\t" \
    "por %%mm2, %%mm0 \n\t" \
    "psrlq $16, %%mm1 \n\t" \
    "psrlq $32, %%mm4 \n\t" \
    "psllq $16, %%mm5 \n\t" \
    "por %%mm3, %%mm1 \n\t" \
    "pand "MANGLE(mask24hhhh)", %%mm5\n\t" \
    "por %%mm5, %%mm4 \n\t" \
    \
    MOVNTQ" %%mm0, %0 \n\t" \
    MOVNTQ" %%mm1, 8%0 \n\t" \
    MOVNTQ" %%mm4, 16%0"


/*
 * Drop the fourth byte of every 32-bit pixel: 8 pixels (32 source
 * bytes / 24 destination bytes) per MMX iteration via STORE_BGR24_MMX,
 * scalar tail for the rest.
 */
static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
    uint8_t *dest = dst;
    const uint8_t *s = src;
    const uint8_t *end;
    const uint8_t *mm_end;
    end = s + src_size;
    __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
    mm_end = end - 31;
    while (s < mm_end) {
        __asm__ volatile(
            PREFETCH"   32%1          \n\t"
            "movq         %1, %%mm0   \n\t"
            "movq        8%1, %%mm1   \n\t"
            "movq       16%1, %%mm4   \n\t"
            "movq       24%1, %%mm5   \n\t"
            "movq      %%mm0, %%mm2   \n\t"  /* duplicates for STORE_BGR24_MMX */
            "movq      %%mm1, %%mm3   \n\t"
            "movq      %%mm4, %%mm6   \n\t"
            "movq      %%mm5, %%mm7   \n\t"
            STORE_BGR24_MMX
            :"=m"(*dest)
            :"m"(*s)
            :"memory");
        dest += 24;
        s += 32;
    }
    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* scalar tail: copy three bytes, skip the fourth */
    while (s < end) {
        *dest++ = *s++;
        *dest++ = *s++;
        *dest++ = *s++;
        s++;
    }
}

/*
  original by Strepto/Astral
  ported to gcc & bugfixed: A'rpi
  MMX2, 3DNOW optimization by Nick Kurshev
  32-bit C version, and and&add trick by Michael Niedermayer
00188 */ 00189 static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size) 00190 { 00191 register const uint8_t* s=src; 00192 register uint8_t* d=dst; 00193 register const uint8_t *end; 00194 const uint8_t *mm_end; 00195 end = s + src_size; 00196 __asm__ volatile(PREFETCH" %0"::"m"(*s)); 00197 __asm__ volatile("movq %0, %%mm4"::"m"(mask15s)); 00198 mm_end = end - 15; 00199 while (s<mm_end) { 00200 __asm__ volatile( 00201 PREFETCH" 32%1 \n\t" 00202 "movq %1, %%mm0 \n\t" 00203 "movq 8%1, %%mm2 \n\t" 00204 "movq %%mm0, %%mm1 \n\t" 00205 "movq %%mm2, %%mm3 \n\t" 00206 "pand %%mm4, %%mm0 \n\t" 00207 "pand %%mm4, %%mm2 \n\t" 00208 "paddw %%mm1, %%mm0 \n\t" 00209 "paddw %%mm3, %%mm2 \n\t" 00210 MOVNTQ" %%mm0, %0 \n\t" 00211 MOVNTQ" %%mm2, 8%0" 00212 :"=m"(*d) 00213 :"m"(*s) 00214 ); 00215 d+=16; 00216 s+=16; 00217 } 00218 __asm__ volatile(SFENCE:::"memory"); 00219 __asm__ volatile(EMMS:::"memory"); 00220 mm_end = end - 3; 00221 while (s < mm_end) { 00222 register unsigned x= *((const uint32_t *)s); 00223 *((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0); 00224 d+=4; 00225 s+=4; 00226 } 00227 if (s < end) { 00228 register unsigned short x= *((const uint16_t *)s); 00229 *((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0); 00230 } 00231 } 00232 00233 static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size) 00234 { 00235 register const uint8_t* s=src; 00236 register uint8_t* d=dst; 00237 register const uint8_t *end; 00238 const uint8_t *mm_end; 00239 end = s + src_size; 00240 __asm__ volatile(PREFETCH" %0"::"m"(*s)); 00241 __asm__ volatile("movq %0, %%mm7"::"m"(mask15rg)); 00242 __asm__ volatile("movq %0, %%mm6"::"m"(mask15b)); 00243 mm_end = end - 15; 00244 while (s<mm_end) { 00245 __asm__ volatile( 00246 PREFETCH" 32%1 \n\t" 00247 "movq %1, %%mm0 \n\t" 00248 "movq 8%1, %%mm2 \n\t" 00249 "movq %%mm0, %%mm1 \n\t" 00250 "movq %%mm2, %%mm3 \n\t" 00251 "psrlq $1, %%mm0 \n\t" 00252 "psrlq $1, %%mm2 \n\t" 00253 "pand %%mm7, %%mm0 \n\t" 
00254 "pand %%mm7, %%mm2 \n\t" 00255 "pand %%mm6, %%mm1 \n\t" 00256 "pand %%mm6, %%mm3 \n\t" 00257 "por %%mm1, %%mm0 \n\t" 00258 "por %%mm3, %%mm2 \n\t" 00259 MOVNTQ" %%mm0, %0 \n\t" 00260 MOVNTQ" %%mm2, 8%0" 00261 :"=m"(*d) 00262 :"m"(*s) 00263 ); 00264 d+=16; 00265 s+=16; 00266 } 00267 __asm__ volatile(SFENCE:::"memory"); 00268 __asm__ volatile(EMMS:::"memory"); 00269 mm_end = end - 3; 00270 while (s < mm_end) { 00271 register uint32_t x= *((const uint32_t*)s); 00272 *((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F); 00273 s+=4; 00274 d+=4; 00275 } 00276 if (s < end) { 00277 register uint16_t x= *((const uint16_t*)s); 00278 *((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F); 00279 } 00280 } 00281 00282 static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size) 00283 { 00284 const uint8_t *s = src; 00285 const uint8_t *end; 00286 const uint8_t *mm_end; 00287 uint16_t *d = (uint16_t *)dst; 00288 end = s + src_size; 00289 mm_end = end - 15; 00290 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster) 00291 __asm__ volatile( 00292 "movq %3, %%mm5 \n\t" 00293 "movq %4, %%mm6 \n\t" 00294 "movq %5, %%mm7 \n\t" 00295 "jmp 2f \n\t" 00296 ".p2align 4 \n\t" 00297 "1: \n\t" 00298 PREFETCH" 32(%1) \n\t" 00299 "movd (%1), %%mm0 \n\t" 00300 "movd 4(%1), %%mm3 \n\t" 00301 "punpckldq 8(%1), %%mm0 \n\t" 00302 "punpckldq 12(%1), %%mm3 \n\t" 00303 "movq %%mm0, %%mm1 \n\t" 00304 "movq %%mm3, %%mm4 \n\t" 00305 "pand %%mm6, %%mm0 \n\t" 00306 "pand %%mm6, %%mm3 \n\t" 00307 "pmaddwd %%mm7, %%mm0 \n\t" 00308 "pmaddwd %%mm7, %%mm3 \n\t" 00309 "pand %%mm5, %%mm1 \n\t" 00310 "pand %%mm5, %%mm4 \n\t" 00311 "por %%mm1, %%mm0 \n\t" 00312 "por %%mm4, %%mm3 \n\t" 00313 "psrld $5, %%mm0 \n\t" 00314 "pslld $11, %%mm3 \n\t" 00315 "por %%mm3, %%mm0 \n\t" 00316 MOVNTQ" %%mm0, (%0) \n\t" 00317 "add $16, %1 \n\t" 00318 "add $8, %0 \n\t" 00319 "2: \n\t" 00320 "cmp %2, %1 \n\t" 00321 " jb 1b 
\n\t" 00322 : "+r" (d), "+r"(s) 00323 : "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216) 00324 ); 00325 #else 00326 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); 00327 __asm__ volatile( 00328 "movq %0, %%mm7 \n\t" 00329 "movq %1, %%mm6 \n\t" 00330 ::"m"(red_16mask),"m"(green_16mask)); 00331 while (s < mm_end) { 00332 __asm__ volatile( 00333 PREFETCH" 32%1 \n\t" 00334 "movd %1, %%mm0 \n\t" 00335 "movd 4%1, %%mm3 \n\t" 00336 "punpckldq 8%1, %%mm0 \n\t" 00337 "punpckldq 12%1, %%mm3 \n\t" 00338 "movq %%mm0, %%mm1 \n\t" 00339 "movq %%mm0, %%mm2 \n\t" 00340 "movq %%mm3, %%mm4 \n\t" 00341 "movq %%mm3, %%mm5 \n\t" 00342 "psrlq $3, %%mm0 \n\t" 00343 "psrlq $3, %%mm3 \n\t" 00344 "pand %2, %%mm0 \n\t" 00345 "pand %2, %%mm3 \n\t" 00346 "psrlq $5, %%mm1 \n\t" 00347 "psrlq $5, %%mm4 \n\t" 00348 "pand %%mm6, %%mm1 \n\t" 00349 "pand %%mm6, %%mm4 \n\t" 00350 "psrlq $8, %%mm2 \n\t" 00351 "psrlq $8, %%mm5 \n\t" 00352 "pand %%mm7, %%mm2 \n\t" 00353 "pand %%mm7, %%mm5 \n\t" 00354 "por %%mm1, %%mm0 \n\t" 00355 "por %%mm4, %%mm3 \n\t" 00356 "por %%mm2, %%mm0 \n\t" 00357 "por %%mm5, %%mm3 \n\t" 00358 "psllq $16, %%mm3 \n\t" 00359 "por %%mm3, %%mm0 \n\t" 00360 MOVNTQ" %%mm0, %0 \n\t" 00361 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); 00362 d += 4; 00363 s += 16; 00364 } 00365 #endif 00366 __asm__ volatile(SFENCE:::"memory"); 00367 __asm__ volatile(EMMS:::"memory"); 00368 while (s < end) { 00369 register int rgb = *(const uint32_t*)s; s += 4; 00370 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8); 00371 } 00372 } 00373 00374 static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size) 00375 { 00376 const uint8_t *s = src; 00377 const uint8_t *end; 00378 const uint8_t *mm_end; 00379 uint16_t *d = (uint16_t *)dst; 00380 end = s + src_size; 00381 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); 00382 __asm__ volatile( 00383 "movq %0, %%mm7 \n\t" 00384 "movq %1, %%mm6 \n\t" 00385 ::"m"(red_16mask),"m"(green_16mask)); 00386 
mm_end = end - 15; 00387 while (s < mm_end) { 00388 __asm__ volatile( 00389 PREFETCH" 32%1 \n\t" 00390 "movd %1, %%mm0 \n\t" 00391 "movd 4%1, %%mm3 \n\t" 00392 "punpckldq 8%1, %%mm0 \n\t" 00393 "punpckldq 12%1, %%mm3 \n\t" 00394 "movq %%mm0, %%mm1 \n\t" 00395 "movq %%mm0, %%mm2 \n\t" 00396 "movq %%mm3, %%mm4 \n\t" 00397 "movq %%mm3, %%mm5 \n\t" 00398 "psllq $8, %%mm0 \n\t" 00399 "psllq $8, %%mm3 \n\t" 00400 "pand %%mm7, %%mm0 \n\t" 00401 "pand %%mm7, %%mm3 \n\t" 00402 "psrlq $5, %%mm1 \n\t" 00403 "psrlq $5, %%mm4 \n\t" 00404 "pand %%mm6, %%mm1 \n\t" 00405 "pand %%mm6, %%mm4 \n\t" 00406 "psrlq $19, %%mm2 \n\t" 00407 "psrlq $19, %%mm5 \n\t" 00408 "pand %2, %%mm2 \n\t" 00409 "pand %2, %%mm5 \n\t" 00410 "por %%mm1, %%mm0 \n\t" 00411 "por %%mm4, %%mm3 \n\t" 00412 "por %%mm2, %%mm0 \n\t" 00413 "por %%mm5, %%mm3 \n\t" 00414 "psllq $16, %%mm3 \n\t" 00415 "por %%mm3, %%mm0 \n\t" 00416 MOVNTQ" %%mm0, %0 \n\t" 00417 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); 00418 d += 4; 00419 s += 16; 00420 } 00421 __asm__ volatile(SFENCE:::"memory"); 00422 __asm__ volatile(EMMS:::"memory"); 00423 while (s < end) { 00424 register int rgb = *(const uint32_t*)s; s += 4; 00425 *d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19); 00426 } 00427 } 00428 00429 static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size) 00430 { 00431 const uint8_t *s = src; 00432 const uint8_t *end; 00433 const uint8_t *mm_end; 00434 uint16_t *d = (uint16_t *)dst; 00435 end = s + src_size; 00436 mm_end = end - 15; 00437 #if 1 //is faster only if multiplies are reasonably fast (FIXME figure out on which CPUs this is faster, on Athlon it is slightly faster) 00438 __asm__ volatile( 00439 "movq %3, %%mm5 \n\t" 00440 "movq %4, %%mm6 \n\t" 00441 "movq %5, %%mm7 \n\t" 00442 "jmp 2f \n\t" 00443 ".p2align 4 \n\t" 00444 "1: \n\t" 00445 PREFETCH" 32(%1) \n\t" 00446 "movd (%1), %%mm0 \n\t" 00447 "movd 4(%1), %%mm3 \n\t" 00448 "punpckldq 8(%1), %%mm0 \n\t" 00449 "punpckldq 12(%1), %%mm3 
\n\t" 00450 "movq %%mm0, %%mm1 \n\t" 00451 "movq %%mm3, %%mm4 \n\t" 00452 "pand %%mm6, %%mm0 \n\t" 00453 "pand %%mm6, %%mm3 \n\t" 00454 "pmaddwd %%mm7, %%mm0 \n\t" 00455 "pmaddwd %%mm7, %%mm3 \n\t" 00456 "pand %%mm5, %%mm1 \n\t" 00457 "pand %%mm5, %%mm4 \n\t" 00458 "por %%mm1, %%mm0 \n\t" 00459 "por %%mm4, %%mm3 \n\t" 00460 "psrld $6, %%mm0 \n\t" 00461 "pslld $10, %%mm3 \n\t" 00462 "por %%mm3, %%mm0 \n\t" 00463 MOVNTQ" %%mm0, (%0) \n\t" 00464 "add $16, %1 \n\t" 00465 "add $8, %0 \n\t" 00466 "2: \n\t" 00467 "cmp %2, %1 \n\t" 00468 " jb 1b \n\t" 00469 : "+r" (d), "+r"(s) 00470 : "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215) 00471 ); 00472 #else 00473 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); 00474 __asm__ volatile( 00475 "movq %0, %%mm7 \n\t" 00476 "movq %1, %%mm6 \n\t" 00477 ::"m"(red_15mask),"m"(green_15mask)); 00478 while (s < mm_end) { 00479 __asm__ volatile( 00480 PREFETCH" 32%1 \n\t" 00481 "movd %1, %%mm0 \n\t" 00482 "movd 4%1, %%mm3 \n\t" 00483 "punpckldq 8%1, %%mm0 \n\t" 00484 "punpckldq 12%1, %%mm3 \n\t" 00485 "movq %%mm0, %%mm1 \n\t" 00486 "movq %%mm0, %%mm2 \n\t" 00487 "movq %%mm3, %%mm4 \n\t" 00488 "movq %%mm3, %%mm5 \n\t" 00489 "psrlq $3, %%mm0 \n\t" 00490 "psrlq $3, %%mm3 \n\t" 00491 "pand %2, %%mm0 \n\t" 00492 "pand %2, %%mm3 \n\t" 00493 "psrlq $6, %%mm1 \n\t" 00494 "psrlq $6, %%mm4 \n\t" 00495 "pand %%mm6, %%mm1 \n\t" 00496 "pand %%mm6, %%mm4 \n\t" 00497 "psrlq $9, %%mm2 \n\t" 00498 "psrlq $9, %%mm5 \n\t" 00499 "pand %%mm7, %%mm2 \n\t" 00500 "pand %%mm7, %%mm5 \n\t" 00501 "por %%mm1, %%mm0 \n\t" 00502 "por %%mm4, %%mm3 \n\t" 00503 "por %%mm2, %%mm0 \n\t" 00504 "por %%mm5, %%mm3 \n\t" 00505 "psllq $16, %%mm3 \n\t" 00506 "por %%mm3, %%mm0 \n\t" 00507 MOVNTQ" %%mm0, %0 \n\t" 00508 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); 00509 d += 4; 00510 s += 16; 00511 } 00512 #endif 00513 __asm__ volatile(SFENCE:::"memory"); 00514 __asm__ volatile(EMMS:::"memory"); 00515 while (s < end) { 00516 register int rgb = *(const uint32_t*)s; 
s += 4; 00517 *d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9); 00518 } 00519 } 00520 00521 static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size) 00522 { 00523 const uint8_t *s = src; 00524 const uint8_t *end; 00525 const uint8_t *mm_end; 00526 uint16_t *d = (uint16_t *)dst; 00527 end = s + src_size; 00528 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); 00529 __asm__ volatile( 00530 "movq %0, %%mm7 \n\t" 00531 "movq %1, %%mm6 \n\t" 00532 ::"m"(red_15mask),"m"(green_15mask)); 00533 mm_end = end - 15; 00534 while (s < mm_end) { 00535 __asm__ volatile( 00536 PREFETCH" 32%1 \n\t" 00537 "movd %1, %%mm0 \n\t" 00538 "movd 4%1, %%mm3 \n\t" 00539 "punpckldq 8%1, %%mm0 \n\t" 00540 "punpckldq 12%1, %%mm3 \n\t" 00541 "movq %%mm0, %%mm1 \n\t" 00542 "movq %%mm0, %%mm2 \n\t" 00543 "movq %%mm3, %%mm4 \n\t" 00544 "movq %%mm3, %%mm5 \n\t" 00545 "psllq $7, %%mm0 \n\t" 00546 "psllq $7, %%mm3 \n\t" 00547 "pand %%mm7, %%mm0 \n\t" 00548 "pand %%mm7, %%mm3 \n\t" 00549 "psrlq $6, %%mm1 \n\t" 00550 "psrlq $6, %%mm4 \n\t" 00551 "pand %%mm6, %%mm1 \n\t" 00552 "pand %%mm6, %%mm4 \n\t" 00553 "psrlq $19, %%mm2 \n\t" 00554 "psrlq $19, %%mm5 \n\t" 00555 "pand %2, %%mm2 \n\t" 00556 "pand %2, %%mm5 \n\t" 00557 "por %%mm1, %%mm0 \n\t" 00558 "por %%mm4, %%mm3 \n\t" 00559 "por %%mm2, %%mm0 \n\t" 00560 "por %%mm5, %%mm3 \n\t" 00561 "psllq $16, %%mm3 \n\t" 00562 "por %%mm3, %%mm0 \n\t" 00563 MOVNTQ" %%mm0, %0 \n\t" 00564 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); 00565 d += 4; 00566 s += 16; 00567 } 00568 __asm__ volatile(SFENCE:::"memory"); 00569 __asm__ volatile(EMMS:::"memory"); 00570 while (s < end) { 00571 register int rgb = *(const uint32_t*)s; s += 4; 00572 *d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19); 00573 } 00574 } 00575 00576 static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size) 00577 { 00578 const uint8_t *s = src; 00579 const uint8_t *end; 00580 const uint8_t *mm_end; 00581 
uint16_t *d = (uint16_t *)dst; 00582 end = s + src_size; 00583 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); 00584 __asm__ volatile( 00585 "movq %0, %%mm7 \n\t" 00586 "movq %1, %%mm6 \n\t" 00587 ::"m"(red_16mask),"m"(green_16mask)); 00588 mm_end = end - 11; 00589 while (s < mm_end) { 00590 __asm__ volatile( 00591 PREFETCH" 32%1 \n\t" 00592 "movd %1, %%mm0 \n\t" 00593 "movd 3%1, %%mm3 \n\t" 00594 "punpckldq 6%1, %%mm0 \n\t" 00595 "punpckldq 9%1, %%mm3 \n\t" 00596 "movq %%mm0, %%mm1 \n\t" 00597 "movq %%mm0, %%mm2 \n\t" 00598 "movq %%mm3, %%mm4 \n\t" 00599 "movq %%mm3, %%mm5 \n\t" 00600 "psrlq $3, %%mm0 \n\t" 00601 "psrlq $3, %%mm3 \n\t" 00602 "pand %2, %%mm0 \n\t" 00603 "pand %2, %%mm3 \n\t" 00604 "psrlq $5, %%mm1 \n\t" 00605 "psrlq $5, %%mm4 \n\t" 00606 "pand %%mm6, %%mm1 \n\t" 00607 "pand %%mm6, %%mm4 \n\t" 00608 "psrlq $8, %%mm2 \n\t" 00609 "psrlq $8, %%mm5 \n\t" 00610 "pand %%mm7, %%mm2 \n\t" 00611 "pand %%mm7, %%mm5 \n\t" 00612 "por %%mm1, %%mm0 \n\t" 00613 "por %%mm4, %%mm3 \n\t" 00614 "por %%mm2, %%mm0 \n\t" 00615 "por %%mm5, %%mm3 \n\t" 00616 "psllq $16, %%mm3 \n\t" 00617 "por %%mm3, %%mm0 \n\t" 00618 MOVNTQ" %%mm0, %0 \n\t" 00619 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); 00620 d += 4; 00621 s += 12; 00622 } 00623 __asm__ volatile(SFENCE:::"memory"); 00624 __asm__ volatile(EMMS:::"memory"); 00625 while (s < end) { 00626 const int b = *s++; 00627 const int g = *s++; 00628 const int r = *s++; 00629 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); 00630 } 00631 } 00632 00633 static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size) 00634 { 00635 const uint8_t *s = src; 00636 const uint8_t *end; 00637 const uint8_t *mm_end; 00638 uint16_t *d = (uint16_t *)dst; 00639 end = s + src_size; 00640 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); 00641 __asm__ volatile( 00642 "movq %0, %%mm7 \n\t" 00643 "movq %1, %%mm6 \n\t" 00644 ::"m"(red_16mask),"m"(green_16mask)); 00645 mm_end = end - 15; 00646 while (s < mm_end) { 00647 
__asm__ volatile( 00648 PREFETCH" 32%1 \n\t" 00649 "movd %1, %%mm0 \n\t" 00650 "movd 3%1, %%mm3 \n\t" 00651 "punpckldq 6%1, %%mm0 \n\t" 00652 "punpckldq 9%1, %%mm3 \n\t" 00653 "movq %%mm0, %%mm1 \n\t" 00654 "movq %%mm0, %%mm2 \n\t" 00655 "movq %%mm3, %%mm4 \n\t" 00656 "movq %%mm3, %%mm5 \n\t" 00657 "psllq $8, %%mm0 \n\t" 00658 "psllq $8, %%mm3 \n\t" 00659 "pand %%mm7, %%mm0 \n\t" 00660 "pand %%mm7, %%mm3 \n\t" 00661 "psrlq $5, %%mm1 \n\t" 00662 "psrlq $5, %%mm4 \n\t" 00663 "pand %%mm6, %%mm1 \n\t" 00664 "pand %%mm6, %%mm4 \n\t" 00665 "psrlq $19, %%mm2 \n\t" 00666 "psrlq $19, %%mm5 \n\t" 00667 "pand %2, %%mm2 \n\t" 00668 "pand %2, %%mm5 \n\t" 00669 "por %%mm1, %%mm0 \n\t" 00670 "por %%mm4, %%mm3 \n\t" 00671 "por %%mm2, %%mm0 \n\t" 00672 "por %%mm5, %%mm3 \n\t" 00673 "psllq $16, %%mm3 \n\t" 00674 "por %%mm3, %%mm0 \n\t" 00675 MOVNTQ" %%mm0, %0 \n\t" 00676 :"=m"(*d):"m"(*s),"m"(blue_16mask):"memory"); 00677 d += 4; 00678 s += 12; 00679 } 00680 __asm__ volatile(SFENCE:::"memory"); 00681 __asm__ volatile(EMMS:::"memory"); 00682 while (s < end) { 00683 const int r = *s++; 00684 const int g = *s++; 00685 const int b = *s++; 00686 *d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8); 00687 } 00688 } 00689 00690 static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size) 00691 { 00692 const uint8_t *s = src; 00693 const uint8_t *end; 00694 const uint8_t *mm_end; 00695 uint16_t *d = (uint16_t *)dst; 00696 end = s + src_size; 00697 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); 00698 __asm__ volatile( 00699 "movq %0, %%mm7 \n\t" 00700 "movq %1, %%mm6 \n\t" 00701 ::"m"(red_15mask),"m"(green_15mask)); 00702 mm_end = end - 11; 00703 while (s < mm_end) { 00704 __asm__ volatile( 00705 PREFETCH" 32%1 \n\t" 00706 "movd %1, %%mm0 \n\t" 00707 "movd 3%1, %%mm3 \n\t" 00708 "punpckldq 6%1, %%mm0 \n\t" 00709 "punpckldq 9%1, %%mm3 \n\t" 00710 "movq %%mm0, %%mm1 \n\t" 00711 "movq %%mm0, %%mm2 \n\t" 00712 "movq %%mm3, %%mm4 \n\t" 00713 "movq %%mm3, %%mm5 \n\t" 
00714 "psrlq $3, %%mm0 \n\t" 00715 "psrlq $3, %%mm3 \n\t" 00716 "pand %2, %%mm0 \n\t" 00717 "pand %2, %%mm3 \n\t" 00718 "psrlq $6, %%mm1 \n\t" 00719 "psrlq $6, %%mm4 \n\t" 00720 "pand %%mm6, %%mm1 \n\t" 00721 "pand %%mm6, %%mm4 \n\t" 00722 "psrlq $9, %%mm2 \n\t" 00723 "psrlq $9, %%mm5 \n\t" 00724 "pand %%mm7, %%mm2 \n\t" 00725 "pand %%mm7, %%mm5 \n\t" 00726 "por %%mm1, %%mm0 \n\t" 00727 "por %%mm4, %%mm3 \n\t" 00728 "por %%mm2, %%mm0 \n\t" 00729 "por %%mm5, %%mm3 \n\t" 00730 "psllq $16, %%mm3 \n\t" 00731 "por %%mm3, %%mm0 \n\t" 00732 MOVNTQ" %%mm0, %0 \n\t" 00733 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); 00734 d += 4; 00735 s += 12; 00736 } 00737 __asm__ volatile(SFENCE:::"memory"); 00738 __asm__ volatile(EMMS:::"memory"); 00739 while (s < end) { 00740 const int b = *s++; 00741 const int g = *s++; 00742 const int r = *s++; 00743 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); 00744 } 00745 } 00746 00747 static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size) 00748 { 00749 const uint8_t *s = src; 00750 const uint8_t *end; 00751 const uint8_t *mm_end; 00752 uint16_t *d = (uint16_t *)dst; 00753 end = s + src_size; 00754 __asm__ volatile(PREFETCH" %0"::"m"(*src):"memory"); 00755 __asm__ volatile( 00756 "movq %0, %%mm7 \n\t" 00757 "movq %1, %%mm6 \n\t" 00758 ::"m"(red_15mask),"m"(green_15mask)); 00759 mm_end = end - 15; 00760 while (s < mm_end) { 00761 __asm__ volatile( 00762 PREFETCH" 32%1 \n\t" 00763 "movd %1, %%mm0 \n\t" 00764 "movd 3%1, %%mm3 \n\t" 00765 "punpckldq 6%1, %%mm0 \n\t" 00766 "punpckldq 9%1, %%mm3 \n\t" 00767 "movq %%mm0, %%mm1 \n\t" 00768 "movq %%mm0, %%mm2 \n\t" 00769 "movq %%mm3, %%mm4 \n\t" 00770 "movq %%mm3, %%mm5 \n\t" 00771 "psllq $7, %%mm0 \n\t" 00772 "psllq $7, %%mm3 \n\t" 00773 "pand %%mm7, %%mm0 \n\t" 00774 "pand %%mm7, %%mm3 \n\t" 00775 "psrlq $6, %%mm1 \n\t" 00776 "psrlq $6, %%mm4 \n\t" 00777 "pand %%mm6, %%mm1 \n\t" 00778 "pand %%mm6, %%mm4 \n\t" 00779 "psrlq $19, %%mm2 \n\t" 00780 "psrlq $19, %%mm5 \n\t" 
00781 "pand %2, %%mm2 \n\t" 00782 "pand %2, %%mm5 \n\t" 00783 "por %%mm1, %%mm0 \n\t" 00784 "por %%mm4, %%mm3 \n\t" 00785 "por %%mm2, %%mm0 \n\t" 00786 "por %%mm5, %%mm3 \n\t" 00787 "psllq $16, %%mm3 \n\t" 00788 "por %%mm3, %%mm0 \n\t" 00789 MOVNTQ" %%mm0, %0 \n\t" 00790 :"=m"(*d):"m"(*s),"m"(blue_15mask):"memory"); 00791 d += 4; 00792 s += 12; 00793 } 00794 __asm__ volatile(SFENCE:::"memory"); 00795 __asm__ volatile(EMMS:::"memory"); 00796 while (s < end) { 00797 const int r = *s++; 00798 const int g = *s++; 00799 const int b = *s++; 00800 *d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7); 00801 } 00802 } 00803 00804 /* 00805 I use less accurate approximation here by simply left-shifting the input 00806 value and filling the low order bits with zeroes. This method improves PNG 00807 compression but this scheme cannot reproduce white exactly, since it does 00808 not generate an all-ones maximum value; the net effect is to darken the 00809 image slightly. 00810 00811 The better method should be "left bit replication": 00812 00813 4 3 2 1 0 00814 --------- 00815 1 1 0 1 1 00816 00817 7 6 5 4 3 2 1 0 00818 ---------------- 00819 1 1 0 1 1 1 1 0 00820 |=======| |===| 00821 | leftmost bits repeated to fill open bits 00822 | 00823 original bits 00824 */ 00825 static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size) 00826 { 00827 const uint16_t *end; 00828 const uint16_t *mm_end; 00829 uint8_t *d = dst; 00830 const uint16_t *s = (const uint16_t*)src; 00831 end = s + src_size/2; 00832 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); 00833 mm_end = end - 7; 00834 while (s < mm_end) { 00835 __asm__ volatile( 00836 PREFETCH" 32%1 \n\t" 00837 "movq %1, %%mm0 \n\t" 00838 "movq %1, %%mm1 \n\t" 00839 "movq %1, %%mm2 \n\t" 00840 "pand %2, %%mm0 \n\t" 00841 "pand %3, %%mm1 \n\t" 00842 "pand %4, %%mm2 \n\t" 00843 "psllq $3, %%mm0 \n\t" 00844 "psrlq $2, %%mm1 \n\t" 00845 "psrlq $7, %%mm2 \n\t" 00846 "movq %%mm0, %%mm3 \n\t" 00847 "movq %%mm1, %%mm4 \n\t" 
00848 "movq %%mm2, %%mm5 \n\t" 00849 "punpcklwd %5, %%mm0 \n\t" 00850 "punpcklwd %5, %%mm1 \n\t" 00851 "punpcklwd %5, %%mm2 \n\t" 00852 "punpckhwd %5, %%mm3 \n\t" 00853 "punpckhwd %5, %%mm4 \n\t" 00854 "punpckhwd %5, %%mm5 \n\t" 00855 "psllq $8, %%mm1 \n\t" 00856 "psllq $16, %%mm2 \n\t" 00857 "por %%mm1, %%mm0 \n\t" 00858 "por %%mm2, %%mm0 \n\t" 00859 "psllq $8, %%mm4 \n\t" 00860 "psllq $16, %%mm5 \n\t" 00861 "por %%mm4, %%mm3 \n\t" 00862 "por %%mm5, %%mm3 \n\t" 00863 00864 "movq %%mm0, %%mm6 \n\t" 00865 "movq %%mm3, %%mm7 \n\t" 00866 00867 "movq 8%1, %%mm0 \n\t" 00868 "movq 8%1, %%mm1 \n\t" 00869 "movq 8%1, %%mm2 \n\t" 00870 "pand %2, %%mm0 \n\t" 00871 "pand %3, %%mm1 \n\t" 00872 "pand %4, %%mm2 \n\t" 00873 "psllq $3, %%mm0 \n\t" 00874 "psrlq $2, %%mm1 \n\t" 00875 "psrlq $7, %%mm2 \n\t" 00876 "movq %%mm0, %%mm3 \n\t" 00877 "movq %%mm1, %%mm4 \n\t" 00878 "movq %%mm2, %%mm5 \n\t" 00879 "punpcklwd %5, %%mm0 \n\t" 00880 "punpcklwd %5, %%mm1 \n\t" 00881 "punpcklwd %5, %%mm2 \n\t" 00882 "punpckhwd %5, %%mm3 \n\t" 00883 "punpckhwd %5, %%mm4 \n\t" 00884 "punpckhwd %5, %%mm5 \n\t" 00885 "psllq $8, %%mm1 \n\t" 00886 "psllq $16, %%mm2 \n\t" 00887 "por %%mm1, %%mm0 \n\t" 00888 "por %%mm2, %%mm0 \n\t" 00889 "psllq $8, %%mm4 \n\t" 00890 "psllq $16, %%mm5 \n\t" 00891 "por %%mm4, %%mm3 \n\t" 00892 "por %%mm5, %%mm3 \n\t" 00893 00894 :"=m"(*d) 00895 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null) 00896 :"memory"); 00897 /* borrowed 32 to 24 */ 00898 __asm__ volatile( 00899 "movq %%mm0, %%mm4 \n\t" 00900 "movq %%mm3, %%mm5 \n\t" 00901 "movq %%mm6, %%mm0 \n\t" 00902 "movq %%mm7, %%mm1 \n\t" 00903 00904 "movq %%mm4, %%mm6 \n\t" 00905 "movq %%mm5, %%mm7 \n\t" 00906 "movq %%mm0, %%mm2 \n\t" 00907 "movq %%mm1, %%mm3 \n\t" 00908 00909 STORE_BGR24_MMX 00910 00911 :"=m"(*d) 00912 :"m"(*s) 00913 :"memory"); 00914 d += 24; 00915 s += 8; 00916 } 00917 __asm__ volatile(SFENCE:::"memory"); 00918 __asm__ volatile(EMMS:::"memory"); 00919 while (s < end) { 00920 register uint16_t 
bgr; 00921 bgr = *s++; 00922 *d++ = (bgr&0x1F)<<3; 00923 *d++ = (bgr&0x3E0)>>2; 00924 *d++ = (bgr&0x7C00)>>7; 00925 } 00926 } 00927 00928 static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size) 00929 { 00930 const uint16_t *end; 00931 const uint16_t *mm_end; 00932 uint8_t *d = (uint8_t *)dst; 00933 const uint16_t *s = (const uint16_t *)src; 00934 end = s + src_size/2; 00935 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); 00936 mm_end = end - 7; 00937 while (s < mm_end) { 00938 __asm__ volatile( 00939 PREFETCH" 32%1 \n\t" 00940 "movq %1, %%mm0 \n\t" 00941 "movq %1, %%mm1 \n\t" 00942 "movq %1, %%mm2 \n\t" 00943 "pand %2, %%mm0 \n\t" 00944 "pand %3, %%mm1 \n\t" 00945 "pand %4, %%mm2 \n\t" 00946 "psllq $3, %%mm0 \n\t" 00947 "psrlq $3, %%mm1 \n\t" 00948 "psrlq $8, %%mm2 \n\t" 00949 "movq %%mm0, %%mm3 \n\t" 00950 "movq %%mm1, %%mm4 \n\t" 00951 "movq %%mm2, %%mm5 \n\t" 00952 "punpcklwd %5, %%mm0 \n\t" 00953 "punpcklwd %5, %%mm1 \n\t" 00954 "punpcklwd %5, %%mm2 \n\t" 00955 "punpckhwd %5, %%mm3 \n\t" 00956 "punpckhwd %5, %%mm4 \n\t" 00957 "punpckhwd %5, %%mm5 \n\t" 00958 "psllq $8, %%mm1 \n\t" 00959 "psllq $16, %%mm2 \n\t" 00960 "por %%mm1, %%mm0 \n\t" 00961 "por %%mm2, %%mm0 \n\t" 00962 "psllq $8, %%mm4 \n\t" 00963 "psllq $16, %%mm5 \n\t" 00964 "por %%mm4, %%mm3 \n\t" 00965 "por %%mm5, %%mm3 \n\t" 00966 00967 "movq %%mm0, %%mm6 \n\t" 00968 "movq %%mm3, %%mm7 \n\t" 00969 00970 "movq 8%1, %%mm0 \n\t" 00971 "movq 8%1, %%mm1 \n\t" 00972 "movq 8%1, %%mm2 \n\t" 00973 "pand %2, %%mm0 \n\t" 00974 "pand %3, %%mm1 \n\t" 00975 "pand %4, %%mm2 \n\t" 00976 "psllq $3, %%mm0 \n\t" 00977 "psrlq $3, %%mm1 \n\t" 00978 "psrlq $8, %%mm2 \n\t" 00979 "movq %%mm0, %%mm3 \n\t" 00980 "movq %%mm1, %%mm4 \n\t" 00981 "movq %%mm2, %%mm5 \n\t" 00982 "punpcklwd %5, %%mm0 \n\t" 00983 "punpcklwd %5, %%mm1 \n\t" 00984 "punpcklwd %5, %%mm2 \n\t" 00985 "punpckhwd %5, %%mm3 \n\t" 00986 "punpckhwd %5, %%mm4 \n\t" 00987 "punpckhwd %5, %%mm5 \n\t" 00988 "psllq $8, %%mm1 \n\t" 
00989 "psllq $16, %%mm2 \n\t" 00990 "por %%mm1, %%mm0 \n\t" 00991 "por %%mm2, %%mm0 \n\t" 00992 "psllq $8, %%mm4 \n\t" 00993 "psllq $16, %%mm5 \n\t" 00994 "por %%mm4, %%mm3 \n\t" 00995 "por %%mm5, %%mm3 \n\t" 00996 :"=m"(*d) 00997 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null) 00998 :"memory"); 00999 /* borrowed 32 to 24 */ 01000 __asm__ volatile( 01001 "movq %%mm0, %%mm4 \n\t" 01002 "movq %%mm3, %%mm5 \n\t" 01003 "movq %%mm6, %%mm0 \n\t" 01004 "movq %%mm7, %%mm1 \n\t" 01005 01006 "movq %%mm4, %%mm6 \n\t" 01007 "movq %%mm5, %%mm7 \n\t" 01008 "movq %%mm0, %%mm2 \n\t" 01009 "movq %%mm1, %%mm3 \n\t" 01010 01011 STORE_BGR24_MMX 01012 01013 :"=m"(*d) 01014 :"m"(*s) 01015 :"memory"); 01016 d += 24; 01017 s += 8; 01018 } 01019 __asm__ volatile(SFENCE:::"memory"); 01020 __asm__ volatile(EMMS:::"memory"); 01021 while (s < end) { 01022 register uint16_t bgr; 01023 bgr = *s++; 01024 *d++ = (bgr&0x1F)<<3; 01025 *d++ = (bgr&0x7E0)>>3; 01026 *d++ = (bgr&0xF800)>>8; 01027 } 01028 } 01029 01030 /* 01031 * mm0 = 00 B3 00 B2 00 B1 00 B0 01032 * mm1 = 00 G3 00 G2 00 G1 00 G0 01033 * mm2 = 00 R3 00 R2 00 R1 00 R0 01034 * mm6 = FF FF FF FF FF FF FF FF 01035 * mm7 = 00 00 00 00 00 00 00 00 01036 */ 01037 #define PACK_RGB32 \ 01038 "packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \ 01039 "packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \ 01040 "packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \ 01041 "punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \ 01042 "punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \ 01043 "movq %%mm0, %%mm3 \n\t" \ 01044 "punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \ 01045 "punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \ 01046 MOVNTQ" %%mm0, %0 \n\t" \ 01047 MOVNTQ" %%mm3, 8%0 \n\t" \ 01048 01049 static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size) 01050 { 01051 const uint16_t *end; 01052 const uint16_t *mm_end; 01053 uint8_t *d = dst; 
01054 const uint16_t *s = (const uint16_t *)src; 01055 end = s + src_size/2; 01056 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); 01057 __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory"); 01058 __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory"); 01059 mm_end = end - 3; 01060 while (s < mm_end) { 01061 __asm__ volatile( 01062 PREFETCH" 32%1 \n\t" 01063 "movq %1, %%mm0 \n\t" 01064 "movq %1, %%mm1 \n\t" 01065 "movq %1, %%mm2 \n\t" 01066 "pand %2, %%mm0 \n\t" 01067 "pand %3, %%mm1 \n\t" 01068 "pand %4, %%mm2 \n\t" 01069 "psllq $3, %%mm0 \n\t" 01070 "psrlq $2, %%mm1 \n\t" 01071 "psrlq $7, %%mm2 \n\t" 01072 PACK_RGB32 01073 :"=m"(*d) 01074 :"m"(*s),"m"(mask15b),"m"(mask15g),"m"(mask15r) 01075 :"memory"); 01076 d += 16; 01077 s += 4; 01078 } 01079 __asm__ volatile(SFENCE:::"memory"); 01080 __asm__ volatile(EMMS:::"memory"); 01081 while (s < end) { 01082 register uint16_t bgr; 01083 bgr = *s++; 01084 *d++ = (bgr&0x1F)<<3; 01085 *d++ = (bgr&0x3E0)>>2; 01086 *d++ = (bgr&0x7C00)>>7; 01087 *d++ = 255; 01088 } 01089 } 01090 01091 static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size) 01092 { 01093 const uint16_t *end; 01094 const uint16_t *mm_end; 01095 uint8_t *d = dst; 01096 const uint16_t *s = (const uint16_t*)src; 01097 end = s + src_size/2; 01098 __asm__ volatile(PREFETCH" %0"::"m"(*s):"memory"); 01099 __asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory"); 01100 __asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory"); 01101 mm_end = end - 3; 01102 while (s < mm_end) { 01103 __asm__ volatile( 01104 PREFETCH" 32%1 \n\t" 01105 "movq %1, %%mm0 \n\t" 01106 "movq %1, %%mm1 \n\t" 01107 "movq %1, %%mm2 \n\t" 01108 "pand %2, %%mm0 \n\t" 01109 "pand %3, %%mm1 \n\t" 01110 "pand %4, %%mm2 \n\t" 01111 "psllq $3, %%mm0 \n\t" 01112 "psrlq $3, %%mm1 \n\t" 01113 "psrlq $8, %%mm2 \n\t" 01114 PACK_RGB32 01115 :"=m"(*d) 01116 :"m"(*s),"m"(mask16b),"m"(mask16g),"m"(mask16r) 01117 :"memory"); 01118 d += 16; 01119 s += 4; 01120 } 01121 __asm__ 
volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");
    /* scalar tail of rgb16to32 */
    while (s < end) {
        register uint16_t bgr;
        bgr = *s++;
        *d++ = (bgr&0x1F)<<3;
        *d++ = (bgr&0x7E0)>>3;
        *d++ = (bgr&0xF800)>>8;
        *d++ = 255;
    }
}

/* Swap bytes 0 and 2 of every 32-bit pixel (byte order 2,1,0,3),
 * 16 bytes per loop iteration; a negative running index counts up to 0
 * so the loop ends on the sign flag. */
static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size)
{
    x86_reg idx = 15 - src_size;
    const uint8_t *s = src-idx;
    uint8_t *d = dst-idx;
    __asm__ volatile(
        "test %0, %0                    \n\t"
        "jns 2f                         \n\t" /* fewer than 16 bytes: skip SIMD */
        PREFETCH" (%1, %0)              \n\t"
        "movq %3, %%mm7                 \n\t"
        "pxor %4, %%mm7                 \n\t"
        "movq %%mm7, %%mm6              \n\t"
        "pxor %5, %%mm7                 \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        PREFETCH" 32(%1, %0)            \n\t"
        "movq (%1, %0), %%mm0           \n\t"
        "movq 8(%1, %0), %%mm1          \n\t"
# if COMPILE_TEMPLATE_MMX2
        "pshufw $177, %%mm0, %%mm3      \n\t" /* swap the two words of each dword */
        "pshufw $177, %%mm1, %%mm5      \n\t"
        "pand %%mm7, %%mm0              \n\t"
        "pand %%mm6, %%mm3              \n\t"
        "pand %%mm7, %%mm1              \n\t"
        "pand %%mm6, %%mm5              \n\t"
        "por %%mm3, %%mm0               \n\t"
        "por %%mm5, %%mm1               \n\t"
# else
        "movq %%mm0, %%mm2              \n\t"
        "movq %%mm1, %%mm4              \n\t"
        "pand %%mm7, %%mm0              \n\t"
        "pand %%mm6, %%mm2              \n\t"
        "pand %%mm7, %%mm1              \n\t"
        "pand %%mm6, %%mm4              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "movq %%mm4, %%mm5              \n\t"
        "pslld $16, %%mm2               \n\t"
        "psrld $16, %%mm3               \n\t"
        "pslld $16, %%mm4               \n\t"
        "psrld $16, %%mm5               \n\t"
        "por %%mm2, %%mm0               \n\t"
        "por %%mm4, %%mm1               \n\t"
        "por %%mm3, %%mm0               \n\t"
        "por %%mm5, %%mm1               \n\t"
# endif
        MOVNTQ" %%mm0, (%2, %0)         \n\t"
        MOVNTQ" %%mm1, 8(%2, %0)        \n\t"
        "add $16, %0                    \n\t"
        "js 1b                          \n\t"
        SFENCE"                         \n\t"
        EMMS"                           \n\t"
        "2:                             \n\t"
        : "+&r"(idx)
        : "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
        : "memory");
    /* scalar tail: per-pixel byte swap via 32-bit load/store */
    for (; idx<15; idx+=4) {
        register int v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
        v &= 0xff00ff;
        *(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
    }
}

/* Swap R and B channels of packed 24-bit pixels; 24 bytes (8 pixels)
 * per iteration using the mask24{r,g,b} tables, with a scalar cleanup
 * loop after the SIMD part. */
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
    unsigned i;
    x86_reg mmx_size= 23 - src_size;
    __asm__ volatile (
        "test %%"REG_a", %%"REG_a"          \n\t"
        "jns 2f                             \n\t"
        "movq "MANGLE(mask24r)", %%mm5      \n\t"
        "movq "MANGLE(mask24g)", %%mm6      \n\t"
        "movq "MANGLE(mask24b)", %%mm7      \n\t"
        ".p2align 4                         \n\t"
        "1:                                 \n\t"
        PREFETCH" 32(%1, %%"REG_a")         \n\t"
        "movq (%1, %%"REG_a"), %%mm0        \n\t" // BGR BGR BG
        "movq (%1, %%"REG_a"), %%mm1        \n\t" // BGR BGR BG
        "movq 2(%1, %%"REG_a"), %%mm2       \n\t" // R BGR BGR B
        "psllq $16, %%mm0                   \n\t" // 00 BGR BGR
        "pand %%mm5, %%mm0                  \n\t"
        "pand %%mm6, %%mm1                  \n\t"
        "pand %%mm7, %%mm2                  \n\t"
        "por %%mm0, %%mm1                   \n\t"
        "por %%mm2, %%mm1                   \n\t"
        "movq 6(%1, %%"REG_a"), %%mm0       \n\t" // BGR BGR BG
        MOVNTQ" %%mm1, (%2, %%"REG_a")      \n\t" // RGB RGB RG
        "movq 8(%1, %%"REG_a"), %%mm1       \n\t" // R BGR BGR B
        "movq 10(%1, %%"REG_a"), %%mm2      \n\t" // GR BGR BGR
        "pand %%mm7, %%mm0                  \n\t"
        "pand %%mm5, %%mm1                  \n\t"
        "pand %%mm6, %%mm2                  \n\t"
        "por %%mm0, %%mm1                   \n\t"
        "por %%mm2, %%mm1                   \n\t"
        "movq 14(%1, %%"REG_a"), %%mm0      \n\t" // R BGR BGR B
        MOVNTQ" %%mm1, 8(%2, %%"REG_a")     \n\t" // B RGB RGB R
        "movq 16(%1, %%"REG_a"), %%mm1      \n\t" // GR BGR BGR
        "movq 18(%1, %%"REG_a"), %%mm2      \n\t" // BGR BGR BG
        "pand %%mm6, %%mm0                  \n\t"
        "pand %%mm7, %%mm1                  \n\t"
        "pand %%mm5, %%mm2                  \n\t"
        "por %%mm0, %%mm1                   \n\t"
        "por %%mm2, %%mm1                   \n\t"
        MOVNTQ" %%mm1, 16(%2, %%"REG_a")    \n\t"
        "add $24, %%"REG_a"                 \n\t"
        " js 1b                             \n\t"
        "2:                                 \n\t"
        : "+a" (mmx_size)
        : "r" (src-mmx_size), "r"(dst-mmx_size)
    );

    __asm__ volatile(SFENCE:::"memory");
    __asm__ volatile(EMMS:::"memory");

    if (mmx_size==23) return; //finished, was multiple of 8

    /* scalar tail of rgb24tobgr24: swap R/B of the leftover pixels */
    src+= src_size;
    dst+= src_size;
    src_size= 23-mmx_size;
    src-= src_size;
    dst-= src_size;
    for (i=0; i<src_size; i+=3) {
        register uint8_t x;
        x          = src[i + 2];
        dst[i + 1] = src[i + 1];
        dst[i + 2] = src[i + 0];
        dst[i + 0] = x;
    }
}

/* Interleave planar Y/U/V into packed YUYV; one luma line per pass,
 * chroma lines advanced every vertLumPerChroma luma lines (2 for 4:2:0,
 * 1 for 4:2:2). Processes 8 luma samples per inner iteration. */
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           int width, int height,
                                           int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
    int y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++) {
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a"               \n\t"
            ".p2align 4                             \n\t"
            "1:                                     \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2)          \n\t"
            PREFETCH" 32(%2, %%"REG_a")             \n\t"
            PREFETCH" 32(%3, %%"REG_a")             \n\t"
            "movq (%2, %%"REG_a"), %%mm0            \n\t" // U(0)
            "movq %%mm0, %%mm2                      \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1            \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0                 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2                 \n\t" // UVUV UVUV(8)

            "movq (%1, %%"REG_a",2), %%mm3          \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5         \n\t" // Y(8)
            "movq %%mm3, %%mm4                      \n\t" // Y(0)
            "movq %%mm5, %%mm6                      \n\t" // Y(8)
            "punpcklbw %%mm0, %%mm3                 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm0, %%mm4                 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm2, %%mm5                 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm2, %%mm6                 \n\t" // YUYV YUYV(12)

            MOVNTQ" %%mm3, (%0, %%"REG_a", 4)       \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)      \n\t"
            MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4)     \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)     \n\t"

            "add $8, %%"REG_a"                      \n\t"
            "cmp %4, %%"REG_a"                      \n\t"
            " jb 1b                                 \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
        /* advance chroma only once per vertLumPerChroma luma lines */
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst  += dstStride;
    }
    __asm__(EMMS"   \n\t"
            SFENCE" \n\t"
            :::"memory");
}

/* YV12 (4:2:0 planar) -> packed YUYV. */
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      int width, int height,
                                      int lumStride, int chromStride, int dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}

/* Interleave planar Y/U/V into packed UYVY; same structure as
 * yuvPlanartoyuy2 but with chroma in the even bytes. */
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                           int width, int height,
                                           int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
    int y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y++) {
        //FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a"               \n\t"
            ".p2align 4                             \n\t"
            "1:                                     \n\t"
            PREFETCH" 32(%1, %%"REG_a", 2)          \n\t"
            PREFETCH" 32(%2, %%"REG_a")             \n\t"
            PREFETCH" 32(%3, %%"REG_a")             \n\t"
            "movq (%2, %%"REG_a"), %%mm0            \n\t" // U(0)
            "movq %%mm0, %%mm2                      \n\t" // U(0)
            "movq (%3, %%"REG_a"), %%mm1            \n\t" // V(0)
            "punpcklbw %%mm1, %%mm0                 \n\t" // UVUV UVUV(0)
            "punpckhbw %%mm1, %%mm2                 \n\t" // UVUV UVUV(8)

            "movq (%1, %%"REG_a",2), %%mm3          \n\t" // Y(0)
            "movq 8(%1, %%"REG_a",2), %%mm5         \n\t" // Y(8)
            "movq %%mm0, %%mm4                      \n\t" // Y(0)
            "movq %%mm2, %%mm6                      \n\t" // Y(8)
            "punpcklbw %%mm3, %%mm0                 \n\t" // YUYV YUYV(0)
            "punpckhbw %%mm3, %%mm4                 \n\t" // YUYV YUYV(4)
            "punpcklbw %%mm5, %%mm2                 \n\t" // YUYV YUYV(8)
            "punpckhbw %%mm5, %%mm6                 \n\t" // YUYV YUYV(12)

            MOVNTQ"
 %%mm0, (%0, %%"REG_a", 4)       \n\t"
            MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4)      \n\t"
            MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4)     \n\t"
            MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4)     \n\t"

            "add $8, %%"REG_a"                      \n\t"
            "cmp %4, %%"REG_a"                      \n\t"
            " jb 1b                                 \n\t"
            ::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
            : "%"REG_a
        );
        /* advance chroma only once per vertLumPerChroma luma lines */
        if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
            usrc += chromStride;
            vsrc += chromStride;
        }
        ysrc += lumStride;
        dst  += dstStride;
    }
    __asm__(EMMS"   \n\t"
            SFENCE" \n\t"
            :::"memory");
}

/* YV12 (4:2:0 planar) -> packed UYVY. */
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                      int width, int height,
                                      int lumStride, int chromStride, int dstStride)
{
    //FIXME interpolate chroma
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}

/* YUV422P (4:2:2 planar) -> packed UYVY. */
static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         int width, int height,
                                         int lumStride, int chromStride, int dstStride)
{
    RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}

/* YUV422P (4:2:2 planar) -> packed YUYV. */
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
                                         int width, int height,
                                         int lumStride, int chromStride, int dstStride)
{
    RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}

/* Deinterleave packed YUYV into planar Y/U/V; two source lines per
 * outer iteration, chroma taken from the first line only (4:2:0
 * vertical subsampling by dropping, not averaging). */
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      int width, int height,
                                      int lumStride, int chromStride, int srcStride)
{
    int y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y+=2) {
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a"               \n\t"
            "pcmpeqw %%mm7, %%mm7                   \n\t"
            "psrlw $8, %%mm7                        \n\t" // FF,00,FF,00...
            ".p2align 4                             \n\t"
            "1:                                     \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0         \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1        \n\t" // YUYV YUYV(4)
            "movq %%mm0, %%mm2                      \n\t" // YUYV YUYV(0)
            "movq %%mm1, %%mm3                      \n\t" // YUYV YUYV(4)
            "psrlw $8, %%mm0                        \n\t" // U0V0 U0V0(0)
            "psrlw $8, %%mm1                        \n\t" // U0V0 U0V0(4)
            "pand %%mm7, %%mm2                      \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm3                      \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0                  \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2                  \n\t" // YYYY YYYY(0)

            MOVNTQ" %%mm2, (%1, %%"REG_a", 2)       \n\t"

            "movq 16(%0, %%"REG_a", 4), %%mm1       \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(12)
            "movq %%mm1, %%mm3                      \n\t" // YUYV YUYV(8)
            "movq %%mm2, %%mm4                      \n\t" // YUYV YUYV(12)
            "psrlw $8, %%mm1                        \n\t" // U0V0 U0V0(8)
            "psrlw $8, %%mm2                        \n\t" // U0V0 U0V0(12)
            "pand %%mm7, %%mm3                      \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm4                      \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1                  \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3                  \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)      \n\t"

            "movq %%mm0, %%mm2                      \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3                      \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0                        \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1                        \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2                      \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3                      \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0                  \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2                  \n\t" // UUUU UUUU(0)

            MOVNTQ" %%mm0, (%3, %%"REG_a")          \n\t"
            MOVNTQ" %%mm2, (%2, %%"REG_a")          \n\t"

            "add $8, %%"REG_a"                      \n\t"
            "cmp %4, %%"REG_a"                      \n\t"
            " jb 1b                                 \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );

        ydst += lumStride;
        src  += srcStride;

        /* second source line of the pair: extract luma only */
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a"               \n\t"
            ".p2align 4                             \n\t"
            "1:                                     \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0         \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1        \n\t" // YUYV YUYV(4)
            "movq 16(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%"REG_a", 4), %%mm3       \n\t" // YUYV YUYV(12)
            "pand %%mm7, %%mm0                      \n\t" // Y0Y0 Y0Y0(0)
            "pand %%mm7, %%mm1                      \n\t" // Y0Y0 Y0Y0(4)
            "pand %%mm7, %%mm2                      \n\t" // Y0Y0 Y0Y0(8)
            "pand %%mm7, %%mm3                      \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0                  \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2                  \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm0, (%1, %%"REG_a", 2)       \n\t"
            MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)      \n\t"

            "add $8, %%"REG_a"                      \n\t"
            "cmp %4, %%"REG_a"                      \n\t"
            " jb 1b                                 \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
    __asm__ volatile(EMMS"   \n\t"
                     SFENCE" \n\t"
                     :::"memory");
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

#if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
/* Upscale one plane by 2x in both directions using 3:1 bilinear
 * weights (each output pixel mixes the nearest source pixel at weight 3
 * with its diagonal neighbour at weight 1, done with chained PAVGB). */
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
{
    int x,y;

    dst[0]= src[0];

    // first line
    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

    dst+= dstStride;

    for (y=1; y<srcHeight; y++) {
        const x86_reg mmxSize= srcWidth&~15;
        __asm__ volatile(
            "mov %4, %%"REG_a"                      \n\t"
            "movq "MANGLE(mmx_ff)", %%mm0           \n\t"
            "movq (%0, %%"REG_a"), %%mm4            \n\t"
            "movq %%mm4, %%mm2                      \n\t"
            "psllq $8, %%mm4                        \n\t"
            "pand %%mm0, %%mm2                      \n\t"
            "por %%mm2, %%mm4                       \n\t"
            "movq (%1, %%"REG_a"), %%mm5            \n\t"
            "movq %%mm5, %%mm3                      \n\t"
            "psllq $8, %%mm5                        \n\t"
            "pand %%mm0, %%mm3                      \n\t"
            "por %%mm3, %%mm5                       \n\t"
            "1:                                     \n\t"
            "movq (%0, %%"REG_a"), %%mm0            \n\t"
            "movq (%1, %%"REG_a"), %%mm1            \n\t"
            "movq 1(%0, %%"REG_a"), %%mm2           \n\t"
            "movq 1(%1, %%"REG_a"), %%mm3           \n\t"
            PAVGB" %%mm0, %%mm5                     \n\t" /* two PAVGBs ~= 3:1 weighting */
            PAVGB" %%mm0, %%mm3                     \n\t"
            PAVGB" %%mm0, %%mm5                     \n\t"
            PAVGB" %%mm0, %%mm3                     \n\t"
            PAVGB" %%mm1, %%mm4                     \n\t"
            PAVGB" %%mm1, %%mm2                     \n\t"
            PAVGB" %%mm1, %%mm4                     \n\t"
            PAVGB" %%mm1, %%mm2                     \n\t"
            "movq %%mm5, %%mm7                      \n\t"
            "movq %%mm4, %%mm6                      \n\t"
            "punpcklbw %%mm3, %%mm5                 \n\t"
            "punpckhbw %%mm3, %%mm7                 \n\t"
            "punpcklbw %%mm2, %%mm4                 \n\t"
            "punpckhbw %%mm2, %%mm6                 \n\t"
            MOVNTQ" %%mm5, (%2, %%"REG_a", 2)       \n\t"
            MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2)      \n\t"
            MOVNTQ" %%mm4, (%3, %%"REG_a", 2)       \n\t"
            MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2)      \n\t"
            "add $8, %%"REG_a"                      \n\t"
            "movq -1(%0, %%"REG_a"), %%mm4          \n\t"
            "movq -1(%1, %%"REG_a"), %%mm5          \n\t"
            " js 1b                                 \n\t"
            :: "r" (src + mmxSize  ), "r" (src + srcStride + mmxSize  ),
               "r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
               "g" (-mmxSize)
            : "%"REG_a
        );

        /* scalar tail for the rightmost pixels of the line pair */
        for (x=mmxSize-1; x<srcWidth-1; x++) {
            dst[2*x          +1]= (3*src[x+0] +   src[x+srcStride+1])>>2;
            dst[2*x+dstStride+2]= (  src[x+0] + 3*src[x+srcStride+1])>>2;
            dst[2*x+dstStride+1]= (  src[x+1] + 3*src[x+srcStride  ])>>2;
            dst[2*x          +2]= (3*src[x+1] +   src[x+srcStride  ])>>2;
        }
        dst[srcWidth*2 -1            ]= (3*src[srcWidth-1] +   src[srcWidth-1 + srcStride])>>2;
        dst[srcWidth*2 -1 + dstStride]= (  src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;

        dst+=dstStride*2;
        src+=srcStride;
    }

    // last line
    dst[0]= src[0];

    for (x=0; x<srcWidth-1; x++) {
        dst[2*x+1]= (3*src[x] +   src[x+1])>>2;
        dst[2*x+2]= (  src[x] + 3*src[x+1])>>2;
    }
    dst[2*srcWidth-1]= src[srcWidth-1];

    __asm__ volatile(EMMS"   \n\t"
                     SFENCE" \n\t"
                     :::"memory");
}
#endif /* COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW */

#if !COMPILE_TEMPLATE_AMD3DNOW
/* Deinterleave packed UYVY into planar Y/U/V; structure mirrors
 * yuy2toyv12 but with Y in the odd bytes (pand/psrlw roles swapped). */
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                      int width, int height,
                                      int lumStride, int chromStride, int srcStride)
{
    int y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height; y+=2) {
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a"               \n\t"
            "pcmpeqw %%mm7, %%mm7                   \n\t"
            "psrlw $8, %%mm7                        \n\t" // FF,00,FF,00...
            ".p2align 4                             \n\t"
            "1:                                     \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0         \n\t" // UYVY UYVY(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1        \n\t" // UYVY UYVY(4)
            "movq %%mm0, %%mm2                      \n\t" // UYVY UYVY(0)
            "movq %%mm1, %%mm3                      \n\t" // UYVY UYVY(4)
            "pand %%mm7, %%mm0                      \n\t" // U0V0 U0V0(0)
            "pand %%mm7, %%mm1                      \n\t" // U0V0 U0V0(4)
            "psrlw $8, %%mm2                        \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm3                        \n\t" // Y0Y0 Y0Y0(4)
            "packuswb %%mm1, %%mm0                  \n\t" // UVUV UVUV(0)
            "packuswb %%mm3, %%mm2                  \n\t" // YYYY YYYY(0)

            MOVNTQ" %%mm2, (%1, %%"REG_a", 2)       \n\t"

            "movq 16(%0, %%"REG_a", 4), %%mm1       \n\t" // UYVY UYVY(8)
            "movq 24(%0, %%"REG_a", 4), %%mm2       \n\t" // UYVY UYVY(12)
            "movq %%mm1, %%mm3                      \n\t" // UYVY UYVY(8)
            "movq %%mm2, %%mm4                      \n\t" // UYVY UYVY(12)
            "pand %%mm7, %%mm1                      \n\t" // U0V0 U0V0(8)
            "pand %%mm7, %%mm2                      \n\t" // U0V0 U0V0(12)
            "psrlw $8, %%mm3                        \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm4                        \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm2, %%mm1                  \n\t" // UVUV UVUV(8)
            "packuswb %%mm4, %%mm3                  \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2)      \n\t"

            "movq %%mm0, %%mm2                      \n\t" // UVUV UVUV(0)
            "movq %%mm1, %%mm3                      \n\t" // UVUV UVUV(8)
            "psrlw $8, %%mm0                        \n\t" // V0V0 V0V0(0)
            "psrlw $8, %%mm1                        \n\t" // V0V0 V0V0(8)
            "pand %%mm7, %%mm2                      \n\t" // U0U0 U0U0(0)
            "pand %%mm7, %%mm3                      \n\t" // U0U0 U0U0(8)
            "packuswb %%mm1, %%mm0                  \n\t" // VVVV VVVV(0)
            "packuswb %%mm3, %%mm2                  \n\t" // UUUU UUUU(0)

            MOVNTQ" %%mm0, (%3, %%"REG_a")          \n\t"
            MOVNTQ" %%mm2, (%2, %%"REG_a")          \n\t"

            "add $8, %%"REG_a"                      \n\t"
            "cmp %4, %%"REG_a"                      \n\t"
            " jb 1b                                 \n\t"
            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );

        ydst += lumStride;
        src  += srcStride;

        /* second source line of the pair: extract luma only */
        __asm__ volatile(
            "xor %%"REG_a", %%"REG_a"               \n\t"
            ".p2align 4                             \n\t"
            "1:                                     \n\t"
            PREFETCH" 64(%0, %%"REG_a", 4)          \n\t"
            "movq (%0, %%"REG_a", 4), %%mm0         \n\t" // YUYV YUYV(0)
            "movq 8(%0, %%"REG_a", 4), %%mm1        \n\t" // YUYV YUYV(4)
            "movq 16(%0, %%"REG_a", 4), %%mm2       \n\t" // YUYV YUYV(8)
            "movq 24(%0, %%"REG_a", 4), %%mm3       \n\t" // YUYV YUYV(12)
            "psrlw $8, %%mm0                        \n\t" // Y0Y0 Y0Y0(0)
            "psrlw $8, %%mm1                        \n\t" // Y0Y0 Y0Y0(4)
            "psrlw $8, %%mm2                        \n\t" // Y0Y0 Y0Y0(8)
            "psrlw $8, %%mm3                        \n\t" // Y0Y0 Y0Y0(12)
            "packuswb %%mm1, %%mm0                  \n\t" // YYYY YYYY(0)
            "packuswb %%mm3, %%mm2                  \n\t" // YYYY YYYY(8)

            MOVNTQ" %%mm0, (%1, %%"REG_a", 2)       \n\t"
            MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2)      \n\t"

            "add $8, %%"REG_a"                      \n\t"
            "cmp %4, %%"REG_a"                      \n\t"
            " jb 1b                                 \n\t"

            ::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
            : "memory", "%"REG_a
        );
        udst += chromStride;
        vdst += chromStride;
        ydst += lumStride;
        src  += srcStride;
    }
    __asm__ volatile(EMMS"   \n\t"
                     SFENCE" \n\t"
                     :::"memory");
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

/* Convert packed 24-bit RGB to planar YV12 (4:2:0). Luma is computed
 * per line with pmaddwd against ff_bgr2YCoeff; chroma averages 2x2
 * blocks of two source lines against ff_bgr2UCoeff/ff_bgr2VCoeff.
 * The loop stops 2 lines early; the remainder is finished by the C
 * fallback rgb24toyv12_c at the end. */
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
                                       int width, int height,
                                       int lumStride, int chromStride, int srcStride)
{
    int y;
    const x86_reg chromWidth= width>>1;
    for (y=0; y<height-2; y+=2) {
        int i;
        for (i=0; i<2; i++) {
            __asm__ volatile(
                "mov %2, %%"REG_a"                  \n\t"
                "movq "MANGLE(ff_bgr2YCoeff)", %%mm6 \n\t"
                "movq "MANGLE(ff_w1111)", %%mm5     \n\t"
                "pxor %%mm7, %%mm7                  \n\t"
                "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t" /* REG_d = 3*REG_a (byte offset into 24bpp line) */
                ".p2align 4                         \n\t"
                "1:                                 \n\t"
                PREFETCH" 64(%0, %%"REG_d")         \n\t"
                "movd (%0, %%"REG_d"), %%mm0        \n\t"
                "movd 3(%0, %%"REG_d"), %%mm1       \n\t"
                "punpcklbw %%mm7, %%mm0             \n\t"
                "punpcklbw %%mm7, %%mm1             \n\t"
                "movd 6(%0, %%"REG_d"), %%mm2       \n\t"
                "movd 9(%0, %%"REG_d"), %%mm3       \n\t"
                "punpcklbw %%mm7, %%mm2             \n\t"
                "punpcklbw %%mm7, %%mm3             \n\t"
                "pmaddwd %%mm6, %%mm0               \n\t"
                "pmaddwd %%mm6, %%mm1               \n\t"
                "pmaddwd %%mm6, %%mm2               \n\t"
                "pmaddwd %%mm6, %%mm3               \n\t"
#ifndef FAST_BGR2YV12
                "psrad $8, %%mm0                    \n\t"
                "psrad $8, %%mm1                    \n\t"
                "psrad $8, %%mm2                    \n\t"
                "psrad $8, %%mm3                    \n\t"
#endif
                "packssdw %%mm1, %%mm0              \n\t"
                "packssdw %%mm3, %%mm2              \n\t"
                "pmaddwd %%mm5, %%mm0               \n\t"
                "pmaddwd %%mm5, %%mm2               \n\t"
                "packssdw %%mm2, %%mm0              \n\t"
                "psraw $7, %%mm0                    \n\t"

                "movd 12(%0, %%"REG_d"), %%mm4      \n\t"
                "movd 15(%0, %%"REG_d"), %%mm1      \n\t"
                "punpcklbw %%mm7, %%mm4             \n\t"
                "punpcklbw %%mm7, %%mm1             \n\t"
                "movd 18(%0, %%"REG_d"), %%mm2      \n\t"
                "movd 21(%0, %%"REG_d"), %%mm3      \n\t"
                "punpcklbw %%mm7, %%mm2             \n\t"
                "punpcklbw %%mm7, %%mm3             \n\t"
                "pmaddwd %%mm6, %%mm4               \n\t"
                "pmaddwd %%mm6, %%mm1               \n\t"
                "pmaddwd %%mm6, %%mm2               \n\t"
                "pmaddwd %%mm6, %%mm3               \n\t"
#ifndef FAST_BGR2YV12
                "psrad $8, %%mm4                    \n\t"
                "psrad $8, %%mm1                    \n\t"
                "psrad $8, %%mm2                    \n\t"
                "psrad $8, %%mm3                    \n\t"
#endif
                "packssdw %%mm1, %%mm4              \n\t"
                "packssdw %%mm3, %%mm2              \n\t"
                "pmaddwd %%mm5, %%mm4               \n\t"
                "pmaddwd %%mm5, %%mm2               \n\t"
                "add $24, %%"REG_d"                 \n\t"
                "packssdw %%mm2, %%mm4              \n\t"
                "psraw $7, %%mm4                    \n\t"

                "packuswb %%mm4, %%mm0              \n\t"
                "paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"

                MOVNTQ" %%mm0, (%1, %%"REG_a")      \n\t"
                "add $8, %%"REG_a"                  \n\t"
                " js 1b                             \n\t"
                : : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width)
                : "%"REG_a, "%"REG_d
            );
            ydst += lumStride;
            src  += srcStride;
        }
        src -= srcStride*2; /* rewind: chroma pass reads both lines again */
        __asm__ volatile(
            "mov %4, %%"REG_a"                  \n\t"
            "movq "MANGLE(ff_w1111)", %%mm5     \n\t"
            "movq "MANGLE(ff_bgr2UCoeff)", %%mm6 \n\t"
            "pxor %%mm7, %%mm7                  \n\t"
            "lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
            "add %%"REG_d", %%"REG_d"           \n\t" /* REG_d = 6*REG_a (2 pixels per chroma sample) */
            ".p2align 4                         \n\t"
            "1:                                 \n\t"
            PREFETCH" 64(%0, %%"REG_d")         \n\t"
            PREFETCH" 64(%1, %%"REG_d")         \n\t"
#if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
            "movq (%0, %%"REG_d"), %%mm0        \n\t"
            "movq (%1, %%"REG_d"), %%mm1        \n\t"
            "movq 6(%0, %%"REG_d"), %%mm2       \n\t"
            "movq 6(%1, %%"REG_d"), %%mm3       \n\t"
            PAVGB" %%mm1, %%mm0                 \n\t"
            PAVGB" %%mm3, %%mm2                 \n\t"
            "movq %%mm0, %%mm1                  \n\t"
            "movq %%mm2, %%mm3                  \n\t"
            "psrlq $24, %%mm0                   \n\t"
            "psrlq $24, %%mm2                   \n\t"
            PAVGB" %%mm1, %%mm0                 \n\t"
            PAVGB" %%mm3, %%mm2                 \n\t"
            "punpcklbw %%mm7, %%mm0             \n\t"
            "punpcklbw %%mm7, %%mm2             \n\t"
#else
            "movd (%0, %%"REG_d"), %%mm0        \n\t"
            "movd (%1, %%"REG_d"), %%mm1        \n\t"
            "movd 3(%0, %%"REG_d"), %%mm2       \n\t"
            "movd 3(%1, %%"REG_d"), %%mm3       \n\t"
            "punpcklbw %%mm7, %%mm0             \n\t"
            "punpcklbw %%mm7, %%mm1             \n\t"
            "punpcklbw %%mm7, %%mm2             \n\t"
            "punpcklbw %%mm7, %%mm3             \n\t"
            "paddw %%mm1, %%mm0                 \n\t"
            "paddw %%mm3, %%mm2                 \n\t"
            "paddw %%mm2, %%mm0                 \n\t"
            "movd 6(%0, %%"REG_d"), %%mm4       \n\t"
            "movd 6(%1, %%"REG_d"), %%mm1       \n\t"
            "movd 9(%0, %%"REG_d"), %%mm2       \n\t"
            "movd 9(%1, %%"REG_d"), %%mm3       \n\t"
            "punpcklbw %%mm7, %%mm4             \n\t"
            "punpcklbw %%mm7, %%mm1             \n\t"
            "punpcklbw %%mm7, %%mm2             \n\t"
            "punpcklbw %%mm7, %%mm3             \n\t"
            "paddw %%mm1, %%mm4                 \n\t"
            "paddw %%mm3, %%mm2                 \n\t"
            "paddw %%mm4, %%mm2                 \n\t"
            "psrlw $2, %%mm0                    \n\t"
            "psrlw $2, %%mm2                    \n\t"
#endif
            "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
            "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"

            "pmaddwd %%mm0, %%mm1               \n\t"
            "pmaddwd %%mm2, %%mm3               \n\t"
            "pmaddwd %%mm6, %%mm0               \n\t"
            "pmaddwd %%mm6, %%mm2               \n\t"
#ifndef FAST_BGR2YV12
            "psrad $8, %%mm0                    \n\t"
            "psrad $8, %%mm1                    \n\t"
            "psrad $8, %%mm2                    \n\t"
            "psrad $8, %%mm3                    \n\t"
#endif
            "packssdw %%mm2, %%mm0              \n\t"
            "packssdw %%mm3, %%mm1              \n\t"
            "pmaddwd %%mm5, %%mm0               \n\t"
            "pmaddwd %%mm5, %%mm1               \n\t"
            "packssdw %%mm1, %%mm0              \n\t" // V1 V0 U1 U0
            "psraw $7, %%mm0                    \n\t"

#if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW
            "movq 12(%0, %%"REG_d"), %%mm4      \n\t"
            "movq 12(%1, %%"REG_d"), %%mm1      \n\t"
            "movq 18(%0, %%"REG_d"), %%mm2      \n\t"
            "movq 18(%1, %%"REG_d"), %%mm3      \n\t"
            PAVGB" %%mm1, %%mm4                 \n\t"
            PAVGB" %%mm3, %%mm2                 \n\t"
            "movq %%mm4, %%mm1                  \n\t"
            "movq %%mm2, %%mm3                  \n\t"
            "psrlq $24, %%mm4                   \n\t"
            "psrlq $24, %%mm2                   \n\t"
            PAVGB" %%mm1, %%mm4                 \n\t"
            PAVGB" %%mm3, %%mm2                 \n\t"
            "punpcklbw %%mm7, %%mm4             \n\t"
            "punpcklbw %%mm7, %%mm2             \n\t"
#else
            "movd 12(%0, %%"REG_d"), %%mm4      \n\t"
            "movd 12(%1, %%"REG_d"), %%mm1      \n\t"
            "movd 15(%0, %%"REG_d"), %%mm2      \n\t"
            "movd 15(%1, %%"REG_d"), %%mm3      \n\t"
            "punpcklbw %%mm7, %%mm4             \n\t"
            "punpcklbw %%mm7, %%mm1             \n\t"
            "punpcklbw %%mm7, %%mm2             \n\t"
            "punpcklbw %%mm7, %%mm3             \n\t"
            "paddw %%mm1, %%mm4                 \n\t"
            "paddw %%mm3, %%mm2                 \n\t"
            "paddw %%mm2, %%mm4                 \n\t"
            "movd 18(%0, %%"REG_d"), %%mm5      \n\t"
            "movd 18(%1, %%"REG_d"), %%mm1      \n\t"
            "movd 21(%0, %%"REG_d"), %%mm2      \n\t"
            "movd 21(%1, %%"REG_d"), %%mm3      \n\t"
            "punpcklbw %%mm7, %%mm5             \n\t"
            "punpcklbw %%mm7, %%mm1             \n\t"
            "punpcklbw %%mm7, %%mm2             \n\t"
            "punpcklbw %%mm7, %%mm3             \n\t"
            "paddw %%mm1, %%mm5                 \n\t"
            "paddw %%mm3, %%mm2                 \n\t"
            "paddw %%mm5, %%mm2                 \n\t"
            "movq "MANGLE(ff_w1111)", %%mm5     \n\t" /* mm5 was clobbered above; reload */
            "psrlw $2, %%mm4                    \n\t"
            "psrlw $2, %%mm2                    \n\t"
#endif
            "movq "MANGLE(ff_bgr2VCoeff)", %%mm1 \n\t"
            "movq "MANGLE(ff_bgr2VCoeff)", %%mm3 \n\t"

            "pmaddwd %%mm4, %%mm1               \n\t"
            "pmaddwd %%mm2, %%mm3               \n\t"
            "pmaddwd %%mm6, %%mm4               \n\t"
            "pmaddwd %%mm6, %%mm2               \n\t"
#ifndef FAST_BGR2YV12
            "psrad $8, %%mm4                    \n\t"
            "psrad $8, %%mm1                    \n\t"
            "psrad $8, %%mm2                    \n\t"
            "psrad $8, %%mm3                    \n\t"
#endif
            "packssdw %%mm2, %%mm4              \n\t"
            "packssdw %%mm3, %%mm1              \n\t"
            "pmaddwd %%mm5, %%mm4               \n\t"
            "pmaddwd %%mm5, %%mm1               \n\t"
            "add $24, %%"REG_d"                 \n\t"
            "packssdw %%mm1, %%mm4              \n\t" // V3 V2 U3 U2
            "psraw $7, %%mm4                    \n\t"

            "movq %%mm0, %%mm1                  \n\t"
            "punpckldq %%mm4, %%mm0             \n\t"
            "punpckhdq %%mm4, %%mm1             \n\t"
            "packsswb %%mm1, %%mm0              \n\t"
            "paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
            "movd %%mm0, (%2, %%"REG_a")        \n\t"
            "punpckhdq %%mm0, %%mm0             \n\t"
            "movd %%mm0, (%3, %%"REG_a")        \n\t"
            "add $4, %%"REG_a"                  \n\t"
            " js 1b                             \n\t"
            : : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth)
            : "%"REG_a, "%"REG_d
        );

        udst += chromStride;
        vdst += chromStride;
        src  += srcStride*2;
    }

    __asm__ volatile(EMMS"   \n\t"
                     SFENCE" \n\t"
                     :::"memory");

    /* finish remaining lines in C */
    rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride);
}
#endif /* !COMPILE_TEMPLATE_SSE2 */

#if !COMPILE_TEMPLATE_AMD3DNOW
/* Interleave two byte planes into one (dest[2i]=src1[i], dest[2i+1]=src2[i]),
 * 16 source bytes per iteration (SSE2 or MMX path), scalar tail per row. */
static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
                                    int width, int height, int src1Stride,
                                    int src2Stride, int dstStride)
{
    int h;

    for (h=0; h < height; h++) {
        int w;

#if COMPILE_TEMPLATE_SSE2
        __asm__(
            "xor %%"REG_a", %%"REG_a"               \n\t"
            "1:                                     \n\t"
            PREFETCH" 64(%1, %%"REG_a")             \n\t"
            PREFETCH" 64(%2, %%"REG_a")             \n\t"
            "movdqa (%1, %%"REG_a"), %%xmm0         \n\t"
            "movdqa (%1, %%"REG_a"), %%xmm1         \n\t"
            "movdqa (%2, %%"REG_a"), %%xmm2         \n\t"
            "punpcklbw %%xmm2, %%xmm0               \n\t"
            "punpckhbw %%xmm2, %%xmm1               \n\t"
            "movntdq %%xmm0, (%0, %%"REG_a", 2)     \n\t"
            "movntdq %%xmm1, 16(%0, %%"REG_a", 2)   \n\t"
            "add $16, %%"REG_a"                     \n\t"
            "cmp %3, %%"REG_a"                      \n\t"
            " jb 1b                                 \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
            : "memory", "%"REG_a""
        );
#else
        __asm__(
            "xor %%"REG_a", %%"REG_a"               \n\t"
            "1:                                     \n\t"
            PREFETCH" 64(%1, %%"REG_a")             \n\t"
            PREFETCH" 64(%2, %%"REG_a")             \n\t"
            "movq (%1, %%"REG_a"), %%mm0            \n\t"
            "movq 8(%1, %%"REG_a"), %%mm2           \n\t"
            "movq %%mm0, %%mm1                      \n\t"
            "movq %%mm2, %%mm3                      \n\t"
            "movq (%2, %%"REG_a"), %%mm4            \n\t"
            "movq 8(%2, %%"REG_a"), %%mm5           \n\t"
            "punpcklbw %%mm4, %%mm0                 \n\t"
            "punpckhbw %%mm4, %%mm1                 \n\t"
            "punpcklbw %%mm5, %%mm2                 \n\t"
            "punpckhbw %%mm5, %%mm3                 \n\t"
            MOVNTQ" %%mm0, (%0, %%"REG_a", 2)       \n\t"
            MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2)      \n\t"
            MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2)     \n\t"
            MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2)     \n\t"
            "add $16, %%"REG_a"                     \n\t"
            "cmp %3, %%"REG_a"                      \n\t"
            " jb 1b                                 \n\t"
            ::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
            : "memory", "%"REG_a
        );
#endif
        /* scalar tail of interleaveBytes: last width&15 columns */
        for (w= (width&(~15)); w < width; w++) {
            dest[2*w+0] = src1[w];
            dest[2*w+1] = src2[w];
        }
        dest += dstStride;
        src1 += src1Stride;
        src2 += src2Stride;
    }
    __asm__(
        EMMS"   \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */

#if !COMPILE_TEMPLATE_SSE2
#if !COMPILE_TEMPLATE_AMD3DNOW
/* Upsample two chroma planes 2x in both directions by pixel doubling
 * (each source byte duplicated horizontally via punpck, each source
 * line used for two destination lines via y>>1). */
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
                                       uint8_t *dst1, uint8_t *dst2,
                                       int width, int height,
                                       int srcStride1, int srcStride2,
                                       int dstStride1, int dstStride2)
{
    x86_reg y;
    int x,w,h;
    w=width/2; h=height/2;
    __asm__ volatile(
        PREFETCH" %0    \n\t"
        PREFETCH" %1    \n\t"
        ::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
    for (y=0;y<h;y++) {
        const uint8_t* s1=src1+srcStride1*(y>>1);
        uint8_t* d=dst1+dstStride1*y;
        x=0;
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH" 32%1              \n\t"
                "movq %1, %%mm0             \n\t"
                "movq 8%1, %%mm2            \n\t"
                "movq 16%1, %%mm4           \n\t"
                "movq 24%1, %%mm6           \n\t"
                "movq %%mm0, %%mm1          \n\t"
                "movq %%mm2, %%mm3          \n\t"
                "movq %%mm4, %%mm5          \n\t"
                "movq %%mm6, %%mm7          \n\t"
                "punpcklbw %%mm0, %%mm0     \n\t" /* duplicate each byte */
                "punpckhbw %%mm1, %%mm1     \n\t"
                "punpcklbw %%mm2, %%mm2     \n\t"
                "punpckhbw %%mm3, %%mm3     \n\t"
                "punpcklbw %%mm4, %%mm4     \n\t"
                "punpckhbw %%mm5, %%mm5     \n\t"
                "punpcklbw %%mm6, %%mm6     \n\t"
                "punpckhbw %%mm7, %%mm7     \n\t"
                MOVNTQ" %%mm0, %0           \n\t"
                MOVNTQ" %%mm1, 8%0          \n\t"
                MOVNTQ" %%mm2, 16%0         \n\t"
                MOVNTQ" %%mm3, 24%0         \n\t"
                MOVNTQ" %%mm4, 32%0         \n\t"
                MOVNTQ" %%mm5, 40%0         \n\t"
                MOVNTQ" %%mm6, 48%0         \n\t"
                MOVNTQ" %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s1[x])
                :"memory");
        }
        for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
    }
    /* second plane: identical processing */
    for (y=0;y<h;y++) {
        const uint8_t* s2=src2+srcStride2*(y>>1);
        uint8_t* d=dst2+dstStride2*y;
        x=0;
        for (;x<w-31;x+=32) {
            __asm__ volatile(
                PREFETCH" 32%1              \n\t"
                "movq %1, %%mm0             \n\t"
                "movq 8%1, %%mm2            \n\t"
                "movq 16%1, %%mm4           \n\t"
                "movq 24%1, %%mm6           \n\t"
                "movq %%mm0, %%mm1          \n\t"
                "movq %%mm2, %%mm3          \n\t"
                "movq %%mm4, %%mm5          \n\t"
                "movq %%mm6, %%mm7          \n\t"
                "punpcklbw %%mm0, %%mm0     \n\t"
                "punpckhbw %%mm1, %%mm1     \n\t"
                "punpcklbw %%mm2, %%mm2     \n\t"
                "punpckhbw %%mm3, %%mm3     \n\t"
                "punpcklbw %%mm4, %%mm4     \n\t"
                "punpckhbw %%mm5, %%mm5     \n\t"
                "punpcklbw %%mm6, %%mm6     \n\t"
                "punpckhbw %%mm7, %%mm7     \n\t"
                MOVNTQ" %%mm0, %0           \n\t"
                MOVNTQ" %%mm1, 8%0          \n\t"
                MOVNTQ" %%mm2, 16%0         \n\t"
                MOVNTQ" %%mm3, 24%0         \n\t"
                MOVNTQ" %%mm4, 32%0         \n\t"
                MOVNTQ" %%mm5, 40%0         \n\t"
                MOVNTQ" %%mm6, 48%0         \n\t"
                MOVNTQ" %%mm7, 56%0"
                :"=m"(d[2*x])
                :"m"(s2[x])
                :"memory");
        }
        for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
    }
    __asm__(
        EMMS"   \n\t"
        SFENCE" \n\t"
        ::: "memory"
    );
}

/* Interleave YVU9 (4:1:0 planar: chroma 1/4 size in each direction)
 * into packed YUYV; 8 chroma samples drive 32 luma samples per
 * iteration (chroma lines advanced every 4 luma lines via y>>2). */
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
                                        uint8_t *dst,
                                        int width, int height,
                                        int srcStride1, int srcStride2,
                                        int srcStride3, int dstStride)
{
    x86_reg x;
    int y,w,h;
    w=width/2; h=height;
    for (y=0;y<h;y++) {
        const uint8_t* yp=src1+srcStride1*y;
        const uint8_t* up=src2+srcStride2*(y>>2);
        const uint8_t* vp=src3+srcStride3*(y>>2);
        uint8_t* d=dst+dstStride*y;
        x=0;
        for (;x<w-7;x+=8) {
            __asm__ volatile(
                PREFETCH" 32(%1, %0)            \n\t"
                PREFETCH" 32(%2, %0)            \n\t"
                PREFETCH" 32(%3, %0)            \n\t"
                "movq (%1, %0, 4), %%mm0        \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
                "movq (%2, %0), %%mm1           \n\t" /* U0U1U2U3U4U5U6U7 */
                "movq (%3, %0), %%mm2           \n\t" /* V0V1V2V3V4V5V6V7 */
                "movq %%mm0,
%%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */ 02156 "movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */ 02157 "movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */ 02158 "punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */ 02159 "punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */ 02160 "punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */ 02161 "punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */ 02162 02163 "movq %%mm1, %%mm6 \n\t" 02164 "punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/ 02165 "punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/ 02166 "punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/ 02167 MOVNTQ" %%mm0, (%4, %0, 8) \n\t" 02168 MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t" 02169 02170 "punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/ 02171 "movq 8(%1, %0, 4), %%mm0 \n\t" 02172 "movq %%mm0, %%mm3 \n\t" 02173 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/ 02174 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/ 02175 MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t" 02176 MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t" 02177 02178 "movq %%mm4, %%mm6 \n\t" 02179 "movq 16(%1, %0, 4), %%mm0 \n\t" 02180 "movq %%mm0, %%mm3 \n\t" 02181 "punpcklbw %%mm5, %%mm4 \n\t" 02182 "punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/ 02183 "punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/ 02184 MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t" 02185 MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t" 02186 02187 "punpckhbw %%mm5, %%mm6 \n\t" 02188 "movq 24(%1, %0, 4), %%mm0 \n\t" 02189 "movq %%mm0, %%mm3 \n\t" 02190 "punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/ 02191 "punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/ 02192 MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t" 02193 MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t" 02194 02195 : "+r" (x) 02196 : "r"(yp), "r" (up), "r"(vp), "r"(d) 02197 :"memory"); 02198 } 02199 for (; x<w; x++) { 02200 const int x2 = x<<2; 02201 d[8*x+0] = yp[x2]; 02202 d[8*x+1] = up[x]; 02203 d[8*x+2] = yp[x2+1]; 02204 d[8*x+3] = vp[x]; 02205 d[8*x+4] = yp[x2+2]; 02206 d[8*x+5] = up[x]; 02207 
d[8*x+6] = yp[x2+3]; 02208 d[8*x+7] = vp[x]; 02209 } 02210 } 02211 __asm__( 02212 EMMS" \n\t" 02213 SFENCE" \n\t" 02214 ::: "memory" 02215 ); 02216 } 02217 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */ 02218 02219 static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count) 02220 { 02221 dst += count; 02222 src += 2*count; 02223 count= - count; 02224 02225 if(count <= -16) { 02226 count += 15; 02227 __asm__ volatile( 02228 "pcmpeqw %%mm7, %%mm7 \n\t" 02229 "psrlw $8, %%mm7 \n\t" 02230 "1: \n\t" 02231 "movq -30(%1, %0, 2), %%mm0 \n\t" 02232 "movq -22(%1, %0, 2), %%mm1 \n\t" 02233 "movq -14(%1, %0, 2), %%mm2 \n\t" 02234 "movq -6(%1, %0, 2), %%mm3 \n\t" 02235 "pand %%mm7, %%mm0 \n\t" 02236 "pand %%mm7, %%mm1 \n\t" 02237 "pand %%mm7, %%mm2 \n\t" 02238 "pand %%mm7, %%mm3 \n\t" 02239 "packuswb %%mm1, %%mm0 \n\t" 02240 "packuswb %%mm3, %%mm2 \n\t" 02241 MOVNTQ" %%mm0,-15(%2, %0) \n\t" 02242 MOVNTQ" %%mm2,- 7(%2, %0) \n\t" 02243 "add $16, %0 \n\t" 02244 " js 1b \n\t" 02245 : "+r"(count) 02246 : "r"(src), "r"(dst) 02247 ); 02248 count -= 15; 02249 } 02250 while(count<0) { 02251 dst[count]= src[2*count]; 02252 count++; 02253 } 02254 } 02255 02256 #if !COMPILE_TEMPLATE_AMD3DNOW 02257 static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count) 02258 { 02259 dst0+= count; 02260 dst1+= count; 02261 src += 4*count; 02262 count= - count; 02263 if(count <= -8) { 02264 count += 7; 02265 __asm__ volatile( 02266 "pcmpeqw %%mm7, %%mm7 \n\t" 02267 "psrlw $8, %%mm7 \n\t" 02268 "1: \n\t" 02269 "movq -28(%1, %0, 4), %%mm0 \n\t" 02270 "movq -20(%1, %0, 4), %%mm1 \n\t" 02271 "movq -12(%1, %0, 4), %%mm2 \n\t" 02272 "movq -4(%1, %0, 4), %%mm3 \n\t" 02273 "pand %%mm7, %%mm0 \n\t" 02274 "pand %%mm7, %%mm1 \n\t" 02275 "pand %%mm7, %%mm2 \n\t" 02276 "pand %%mm7, %%mm3 \n\t" 02277 "packuswb %%mm1, %%mm0 \n\t" 02278 "packuswb %%mm3, %%mm2 \n\t" 02279 "movq %%mm0, %%mm1 \n\t" 02280 "movq %%mm2, %%mm3 \n\t" 02281 "psrlw $8, %%mm0 \n\t" 02282 "psrlw 
$8, %%mm2 \n\t" 02283 "pand %%mm7, %%mm1 \n\t" 02284 "pand %%mm7, %%mm3 \n\t" 02285 "packuswb %%mm2, %%mm0 \n\t" 02286 "packuswb %%mm3, %%mm1 \n\t" 02287 MOVNTQ" %%mm0,- 7(%3, %0) \n\t" 02288 MOVNTQ" %%mm1,- 7(%2, %0) \n\t" 02289 "add $8, %0 \n\t" 02290 " js 1b \n\t" 02291 : "+r"(count) 02292 : "r"(src), "r"(dst0), "r"(dst1) 02293 ); 02294 count -= 7; 02295 } 02296 while(count<0) { 02297 dst0[count]= src[4*count+0]; 02298 dst1[count]= src[4*count+2]; 02299 count++; 02300 } 02301 } 02302 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */ 02303 02304 static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count) 02305 { 02306 dst0 += count; 02307 dst1 += count; 02308 src0 += 4*count; 02309 src1 += 4*count; 02310 count= - count; 02311 #ifdef PAVGB 02312 if(count <= -8) { 02313 count += 7; 02314 __asm__ volatile( 02315 "pcmpeqw %%mm7, %%mm7 \n\t" 02316 "psrlw $8, %%mm7 \n\t" 02317 "1: \n\t" 02318 "movq -28(%1, %0, 4), %%mm0 \n\t" 02319 "movq -20(%1, %0, 4), %%mm1 \n\t" 02320 "movq -12(%1, %0, 4), %%mm2 \n\t" 02321 "movq -4(%1, %0, 4), %%mm3 \n\t" 02322 PAVGB" -28(%2, %0, 4), %%mm0 \n\t" 02323 PAVGB" -20(%2, %0, 4), %%mm1 \n\t" 02324 PAVGB" -12(%2, %0, 4), %%mm2 \n\t" 02325 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t" 02326 "pand %%mm7, %%mm0 \n\t" 02327 "pand %%mm7, %%mm1 \n\t" 02328 "pand %%mm7, %%mm2 \n\t" 02329 "pand %%mm7, %%mm3 \n\t" 02330 "packuswb %%mm1, %%mm0 \n\t" 02331 "packuswb %%mm3, %%mm2 \n\t" 02332 "movq %%mm0, %%mm1 \n\t" 02333 "movq %%mm2, %%mm3 \n\t" 02334 "psrlw $8, %%mm0 \n\t" 02335 "psrlw $8, %%mm2 \n\t" 02336 "pand %%mm7, %%mm1 \n\t" 02337 "pand %%mm7, %%mm3 \n\t" 02338 "packuswb %%mm2, %%mm0 \n\t" 02339 "packuswb %%mm3, %%mm1 \n\t" 02340 MOVNTQ" %%mm0,- 7(%4, %0) \n\t" 02341 MOVNTQ" %%mm1,- 7(%3, %0) \n\t" 02342 "add $8, %0 \n\t" 02343 " js 1b \n\t" 02344 : "+r"(count) 02345 : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1) 02346 ); 02347 count -= 7; 02348 } 02349 #endif 02350 while(count<0) { 02351 dst0[count]= 
(src0[4*count+0]+src1[4*count+0])>>1; 02352 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1; 02353 count++; 02354 } 02355 } 02356 02357 #if !COMPILE_TEMPLATE_AMD3DNOW 02358 static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count) 02359 { 02360 dst0+= count; 02361 dst1+= count; 02362 src += 4*count; 02363 count= - count; 02364 if(count <= -8) { 02365 count += 7; 02366 __asm__ volatile( 02367 "pcmpeqw %%mm7, %%mm7 \n\t" 02368 "psrlw $8, %%mm7 \n\t" 02369 "1: \n\t" 02370 "movq -28(%1, %0, 4), %%mm0 \n\t" 02371 "movq -20(%1, %0, 4), %%mm1 \n\t" 02372 "movq -12(%1, %0, 4), %%mm2 \n\t" 02373 "movq -4(%1, %0, 4), %%mm3 \n\t" 02374 "psrlw $8, %%mm0 \n\t" 02375 "psrlw $8, %%mm1 \n\t" 02376 "psrlw $8, %%mm2 \n\t" 02377 "psrlw $8, %%mm3 \n\t" 02378 "packuswb %%mm1, %%mm0 \n\t" 02379 "packuswb %%mm3, %%mm2 \n\t" 02380 "movq %%mm0, %%mm1 \n\t" 02381 "movq %%mm2, %%mm3 \n\t" 02382 "psrlw $8, %%mm0 \n\t" 02383 "psrlw $8, %%mm2 \n\t" 02384 "pand %%mm7, %%mm1 \n\t" 02385 "pand %%mm7, %%mm3 \n\t" 02386 "packuswb %%mm2, %%mm0 \n\t" 02387 "packuswb %%mm3, %%mm1 \n\t" 02388 MOVNTQ" %%mm0,- 7(%3, %0) \n\t" 02389 MOVNTQ" %%mm1,- 7(%2, %0) \n\t" 02390 "add $8, %0 \n\t" 02391 " js 1b \n\t" 02392 : "+r"(count) 02393 : "r"(src), "r"(dst0), "r"(dst1) 02394 ); 02395 count -= 7; 02396 } 02397 src++; 02398 while(count<0) { 02399 dst0[count]= src[4*count+0]; 02400 dst1[count]= src[4*count+2]; 02401 count++; 02402 } 02403 } 02404 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */ 02405 02406 static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count) 02407 { 02408 dst0 += count; 02409 dst1 += count; 02410 src0 += 4*count; 02411 src1 += 4*count; 02412 count= - count; 02413 #ifdef PAVGB 02414 if(count <= -8) { 02415 count += 7; 02416 __asm__ volatile( 02417 "pcmpeqw %%mm7, %%mm7 \n\t" 02418 "psrlw $8, %%mm7 \n\t" 02419 "1: \n\t" 02420 "movq -28(%1, %0, 4), %%mm0 \n\t" 02421 "movq -20(%1, %0, 4), %%mm1 
\n\t" 02422 "movq -12(%1, %0, 4), %%mm2 \n\t" 02423 "movq -4(%1, %0, 4), %%mm3 \n\t" 02424 PAVGB" -28(%2, %0, 4), %%mm0 \n\t" 02425 PAVGB" -20(%2, %0, 4), %%mm1 \n\t" 02426 PAVGB" -12(%2, %0, 4), %%mm2 \n\t" 02427 PAVGB" - 4(%2, %0, 4), %%mm3 \n\t" 02428 "psrlw $8, %%mm0 \n\t" 02429 "psrlw $8, %%mm1 \n\t" 02430 "psrlw $8, %%mm2 \n\t" 02431 "psrlw $8, %%mm3 \n\t" 02432 "packuswb %%mm1, %%mm0 \n\t" 02433 "packuswb %%mm3, %%mm2 \n\t" 02434 "movq %%mm0, %%mm1 \n\t" 02435 "movq %%mm2, %%mm3 \n\t" 02436 "psrlw $8, %%mm0 \n\t" 02437 "psrlw $8, %%mm2 \n\t" 02438 "pand %%mm7, %%mm1 \n\t" 02439 "pand %%mm7, %%mm3 \n\t" 02440 "packuswb %%mm2, %%mm0 \n\t" 02441 "packuswb %%mm3, %%mm1 \n\t" 02442 MOVNTQ" %%mm0,- 7(%4, %0) \n\t" 02443 MOVNTQ" %%mm1,- 7(%3, %0) \n\t" 02444 "add $8, %0 \n\t" 02445 " js 1b \n\t" 02446 : "+r"(count) 02447 : "r"(src0), "r"(src1), "r"(dst0), "r"(dst1) 02448 ); 02449 count -= 7; 02450 } 02451 #endif 02452 src0++; 02453 src1++; 02454 while(count<0) { 02455 dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1; 02456 dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1; 02457 count++; 02458 } 02459 } 02460 02461 static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, 02462 int width, int height, 02463 int lumStride, int chromStride, int srcStride) 02464 { 02465 int y; 02466 const int chromWidth= -((-width)>>1); 02467 02468 for (y=0; y<height; y++) { 02469 RENAME(extract_even)(src, ydst, width); 02470 if(y&1) { 02471 RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth); 02472 udst+= chromStride; 02473 vdst+= chromStride; 02474 } 02475 02476 src += srcStride; 02477 ydst+= lumStride; 02478 } 02479 __asm__( 02480 EMMS" \n\t" 02481 SFENCE" \n\t" 02482 ::: "memory" 02483 ); 02484 } 02485 02486 #if !COMPILE_TEMPLATE_AMD3DNOW 02487 static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, 02488 int width, int height, 02489 int lumStride, int chromStride, int srcStride) 
02490 { 02491 int y; 02492 const int chromWidth= -((-width)>>1); 02493 02494 for (y=0; y<height; y++) { 02495 RENAME(extract_even)(src, ydst, width); 02496 RENAME(extract_odd2)(src, udst, vdst, chromWidth); 02497 02498 src += srcStride; 02499 ydst+= lumStride; 02500 udst+= chromStride; 02501 vdst+= chromStride; 02502 } 02503 __asm__( 02504 EMMS" \n\t" 02505 SFENCE" \n\t" 02506 ::: "memory" 02507 ); 02508 } 02509 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */ 02510 02511 static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, 02512 int width, int height, 02513 int lumStride, int chromStride, int srcStride) 02514 { 02515 int y; 02516 const int chromWidth= -((-width)>>1); 02517 02518 for (y=0; y<height; y++) { 02519 RENAME(extract_even)(src+1, ydst, width); 02520 if(y&1) { 02521 RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth); 02522 udst+= chromStride; 02523 vdst+= chromStride; 02524 } 02525 02526 src += srcStride; 02527 ydst+= lumStride; 02528 } 02529 __asm__( 02530 EMMS" \n\t" 02531 SFENCE" \n\t" 02532 ::: "memory" 02533 ); 02534 } 02535 02536 #if !COMPILE_TEMPLATE_AMD3DNOW 02537 static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src, 02538 int width, int height, 02539 int lumStride, int chromStride, int srcStride) 02540 { 02541 int y; 02542 const int chromWidth= -((-width)>>1); 02543 02544 for (y=0; y<height; y++) { 02545 RENAME(extract_even)(src+1, ydst, width); 02546 RENAME(extract_even2)(src, udst, vdst, chromWidth); 02547 02548 src += srcStride; 02549 ydst+= lumStride; 02550 udst+= chromStride; 02551 vdst+= chromStride; 02552 } 02553 __asm__( 02554 EMMS" \n\t" 02555 SFENCE" \n\t" 02556 ::: "memory" 02557 ); 02558 } 02559 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */ 02560 #endif /* !COMPILE_TEMPLATE_SSE2 */ 02561 02562 static inline void RENAME(rgb2rgb_init)(void) 02563 { 02564 #if !COMPILE_TEMPLATE_SSE2 02565 #if !COMPILE_TEMPLATE_AMD3DNOW 02566 rgb15to16 = 
RENAME(rgb15to16); 02567 rgb15tobgr24 = RENAME(rgb15tobgr24); 02568 rgb15to32 = RENAME(rgb15to32); 02569 rgb16tobgr24 = RENAME(rgb16tobgr24); 02570 rgb16to32 = RENAME(rgb16to32); 02571 rgb16to15 = RENAME(rgb16to15); 02572 rgb24tobgr16 = RENAME(rgb24tobgr16); 02573 rgb24tobgr15 = RENAME(rgb24tobgr15); 02574 rgb24tobgr32 = RENAME(rgb24tobgr32); 02575 rgb32to16 = RENAME(rgb32to16); 02576 rgb32to15 = RENAME(rgb32to15); 02577 rgb32tobgr24 = RENAME(rgb32tobgr24); 02578 rgb24to15 = RENAME(rgb24to15); 02579 rgb24to16 = RENAME(rgb24to16); 02580 rgb24tobgr24 = RENAME(rgb24tobgr24); 02581 shuffle_bytes_2103 = RENAME(shuffle_bytes_2103); 02582 rgb32tobgr16 = RENAME(rgb32tobgr16); 02583 rgb32tobgr15 = RENAME(rgb32tobgr15); 02584 yv12toyuy2 = RENAME(yv12toyuy2); 02585 yv12touyvy = RENAME(yv12touyvy); 02586 yuv422ptoyuy2 = RENAME(yuv422ptoyuy2); 02587 yuv422ptouyvy = RENAME(yuv422ptouyvy); 02588 yuy2toyv12 = RENAME(yuy2toyv12); 02589 vu9_to_vu12 = RENAME(vu9_to_vu12); 02590 yvu9_to_yuy2 = RENAME(yvu9_to_yuy2); 02591 uyvytoyuv422 = RENAME(uyvytoyuv422); 02592 yuyvtoyuv422 = RENAME(yuyvtoyuv422); 02593 #endif /* !COMPILE_TEMPLATE_SSE2 */ 02594 02595 #if COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW 02596 planar2x = RENAME(planar2x); 02597 #endif /* COMPILE_TEMPLATE_MMX2 || COMPILE_TEMPLATE_AMD3DNOW */ 02598 rgb24toyv12 = RENAME(rgb24toyv12); 02599 02600 yuyvtoyuv420 = RENAME(yuyvtoyuv420); 02601 uyvytoyuv420 = RENAME(uyvytoyuv420); 02602 #endif /* COMPILE_TEMPLATE_SSE2 */ 02603 02604 #if !COMPILE_TEMPLATE_AMD3DNOW 02605 interleaveBytes = RENAME(interleaveBytes); 02606 #endif /* !COMPILE_TEMPLATE_AMD3DNOW */ 02607 }