/* Libav 0.7.1 */
00001 /* 00002 * The simplest mpeg encoder (well, it was the simplest!) 00003 * Copyright (c) 2000,2001 Fabrice Bellard 00004 * 00005 * Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru> 00006 * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at> 00007 * 00008 * This file is part of Libav. 00009 * 00010 * Libav is free software; you can redistribute it and/or 00011 * modify it under the terms of the GNU Lesser General Public 00012 * License as published by the Free Software Foundation; either 00013 * version 2.1 of the License, or (at your option) any later version. 00014 * 00015 * Libav is distributed in the hope that it will be useful, 00016 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00017 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00018 * Lesser General Public License for more details. 00019 * 00020 * You should have received a copy of the GNU Lesser General Public 00021 * License along with Libav; if not, write to the Free Software 00022 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 00023 */ 00024 00025 #include "libavutil/cpu.h" 00026 #include "libavutil/x86_cpu.h" 00027 #include "libavcodec/avcodec.h" 00028 #include "libavcodec/dsputil.h" 00029 #include "libavcodec/mpegvideo.h" 00030 #include "dsputil_mmx.h" 00031 00032 extern uint16_t inv_zigzag_direct16[64]; 00033 00034 00035 static void dct_unquantize_h263_intra_mmx(MpegEncContext *s, 00036 DCTELEM *block, int n, int qscale) 00037 { 00038 x86_reg level, qmul, qadd, nCoeffs; 00039 00040 qmul = qscale << 1; 00041 00042 assert(s->block_last_index[n]>=0 || s->h263_aic); 00043 00044 if (!s->h263_aic) { 00045 if (n < 4) 00046 level = block[0] * s->y_dc_scale; 00047 else 00048 level = block[0] * s->c_dc_scale; 00049 qadd = (qscale - 1) | 1; 00050 }else{ 00051 qadd = 0; 00052 level= block[0]; 00053 } 00054 if(s->ac_pred) 00055 nCoeffs=63; 00056 else 00057 nCoeffs= s->inter_scantable.raster_end[ 
s->block_last_index[n] ]; 00058 //printf("%d %d ", qmul, qadd); 00059 __asm__ volatile( 00060 "movd %1, %%mm6 \n\t" //qmul 00061 "packssdw %%mm6, %%mm6 \n\t" 00062 "packssdw %%mm6, %%mm6 \n\t" 00063 "movd %2, %%mm5 \n\t" //qadd 00064 "pxor %%mm7, %%mm7 \n\t" 00065 "packssdw %%mm5, %%mm5 \n\t" 00066 "packssdw %%mm5, %%mm5 \n\t" 00067 "psubw %%mm5, %%mm7 \n\t" 00068 "pxor %%mm4, %%mm4 \n\t" 00069 ".p2align 4 \n\t" 00070 "1: \n\t" 00071 "movq (%0, %3), %%mm0 \n\t" 00072 "movq 8(%0, %3), %%mm1 \n\t" 00073 00074 "pmullw %%mm6, %%mm0 \n\t" 00075 "pmullw %%mm6, %%mm1 \n\t" 00076 00077 "movq (%0, %3), %%mm2 \n\t" 00078 "movq 8(%0, %3), %%mm3 \n\t" 00079 00080 "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 00081 "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 00082 00083 "pxor %%mm2, %%mm0 \n\t" 00084 "pxor %%mm3, %%mm1 \n\t" 00085 00086 "paddw %%mm7, %%mm0 \n\t" 00087 "paddw %%mm7, %%mm1 \n\t" 00088 00089 "pxor %%mm0, %%mm2 \n\t" 00090 "pxor %%mm1, %%mm3 \n\t" 00091 00092 "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 00093 "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? 
-1 : 0 00094 00095 "pandn %%mm2, %%mm0 \n\t" 00096 "pandn %%mm3, %%mm1 \n\t" 00097 00098 "movq %%mm0, (%0, %3) \n\t" 00099 "movq %%mm1, 8(%0, %3) \n\t" 00100 00101 "add $16, %3 \n\t" 00102 "jng 1b \n\t" 00103 ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs)) 00104 : "memory" 00105 ); 00106 block[0]= level; 00107 } 00108 00109 00110 static void dct_unquantize_h263_inter_mmx(MpegEncContext *s, 00111 DCTELEM *block, int n, int qscale) 00112 { 00113 x86_reg qmul, qadd, nCoeffs; 00114 00115 qmul = qscale << 1; 00116 qadd = (qscale - 1) | 1; 00117 00118 assert(s->block_last_index[n]>=0 || s->h263_aic); 00119 00120 nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; 00121 //printf("%d %d ", qmul, qadd); 00122 __asm__ volatile( 00123 "movd %1, %%mm6 \n\t" //qmul 00124 "packssdw %%mm6, %%mm6 \n\t" 00125 "packssdw %%mm6, %%mm6 \n\t" 00126 "movd %2, %%mm5 \n\t" //qadd 00127 "pxor %%mm7, %%mm7 \n\t" 00128 "packssdw %%mm5, %%mm5 \n\t" 00129 "packssdw %%mm5, %%mm5 \n\t" 00130 "psubw %%mm5, %%mm7 \n\t" 00131 "pxor %%mm4, %%mm4 \n\t" 00132 ".p2align 4 \n\t" 00133 "1: \n\t" 00134 "movq (%0, %3), %%mm0 \n\t" 00135 "movq 8(%0, %3), %%mm1 \n\t" 00136 00137 "pmullw %%mm6, %%mm0 \n\t" 00138 "pmullw %%mm6, %%mm1 \n\t" 00139 00140 "movq (%0, %3), %%mm2 \n\t" 00141 "movq 8(%0, %3), %%mm3 \n\t" 00142 00143 "pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 00144 "pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 00145 00146 "pxor %%mm2, %%mm0 \n\t" 00147 "pxor %%mm3, %%mm1 \n\t" 00148 00149 "paddw %%mm7, %%mm0 \n\t" 00150 "paddw %%mm7, %%mm1 \n\t" 00151 00152 "pxor %%mm0, %%mm2 \n\t" 00153 "pxor %%mm1, %%mm3 \n\t" 00154 00155 "pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 00156 "pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? 
-1 : 0 00157 00158 "pandn %%mm2, %%mm0 \n\t" 00159 "pandn %%mm3, %%mm1 \n\t" 00160 00161 "movq %%mm0, (%0, %3) \n\t" 00162 "movq %%mm1, 8(%0, %3) \n\t" 00163 00164 "add $16, %3 \n\t" 00165 "jng 1b \n\t" 00166 ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs)) 00167 : "memory" 00168 ); 00169 } 00170 00171 00172 /* 00173 NK: 00174 Note: looking at PARANOID: 00175 "enable all paranoid tests for rounding, overflows, etc..." 00176 00177 #ifdef PARANOID 00178 if (level < -2048 || level > 2047) 00179 fprintf(stderr, "unquant error %d %d\n", i, level); 00180 #endif 00181 We can suppose that result of two multiplications can't be greater than 0xFFFF 00182 i.e. is 16-bit, so we use here only PMULLW instruction and can avoid 00183 a complex multiplication. 00184 ===================================================== 00185 Full formula for multiplication of 2 integer numbers 00186 which are represent as high:low words: 00187 input: value1 = high1:low1 00188 value2 = high2:low2 00189 output: value3 = value1*value2 00190 value3=high3:low3 (on overflow: modulus 2^32 wrap-around) 00191 this mean that for 0x123456 * 0x123456 correct result is 0x766cb0ce4 00192 but this algorithm will compute only 0x66cb0ce4 00193 this limited by 16-bit size of operands 00194 --------------------------------- 00195 tlow1 = high1*low2 00196 tlow2 = high2*low1 00197 tlow1 = tlow1 + tlow2 00198 high3:low3 = low1*low2 00199 high3 += tlow1 00200 */ 00201 static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s, 00202 DCTELEM *block, int n, int qscale) 00203 { 00204 x86_reg nCoeffs; 00205 const uint16_t *quant_matrix; 00206 int block0; 00207 00208 assert(s->block_last_index[n]>=0); 00209 00210 nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; 00211 00212 if (n < 4) 00213 block0 = block[0] * s->y_dc_scale; 00214 else 00215 block0 = block[0] * s->c_dc_scale; 00216 /* XXX: only mpeg1 */ 00217 quant_matrix = s->intra_matrix; 00218 __asm__ volatile( 00219 "pcmpeqw %%mm7, 
%%mm7 \n\t" 00220 "psrlw $15, %%mm7 \n\t" 00221 "movd %2, %%mm6 \n\t" 00222 "packssdw %%mm6, %%mm6 \n\t" 00223 "packssdw %%mm6, %%mm6 \n\t" 00224 "mov %3, %%"REG_a" \n\t" 00225 ".p2align 4 \n\t" 00226 "1: \n\t" 00227 "movq (%0, %%"REG_a"), %%mm0 \n\t" 00228 "movq 8(%0, %%"REG_a"), %%mm1 \n\t" 00229 "movq (%1, %%"REG_a"), %%mm4 \n\t" 00230 "movq 8(%1, %%"REG_a"), %%mm5 \n\t" 00231 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 00232 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 00233 "pxor %%mm2, %%mm2 \n\t" 00234 "pxor %%mm3, %%mm3 \n\t" 00235 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 00236 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 00237 "pxor %%mm2, %%mm0 \n\t" 00238 "pxor %%mm3, %%mm1 \n\t" 00239 "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) 00240 "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) 00241 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q 00242 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q 00243 "pxor %%mm4, %%mm4 \n\t" 00244 "pxor %%mm5, %%mm5 \n\t" // FIXME slow 00245 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 00246 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? 
-1 : 0 00247 "psraw $3, %%mm0 \n\t" 00248 "psraw $3, %%mm1 \n\t" 00249 "psubw %%mm7, %%mm0 \n\t" 00250 "psubw %%mm7, %%mm1 \n\t" 00251 "por %%mm7, %%mm0 \n\t" 00252 "por %%mm7, %%mm1 \n\t" 00253 "pxor %%mm2, %%mm0 \n\t" 00254 "pxor %%mm3, %%mm1 \n\t" 00255 "psubw %%mm2, %%mm0 \n\t" 00256 "psubw %%mm3, %%mm1 \n\t" 00257 "pandn %%mm0, %%mm4 \n\t" 00258 "pandn %%mm1, %%mm5 \n\t" 00259 "movq %%mm4, (%0, %%"REG_a") \n\t" 00260 "movq %%mm5, 8(%0, %%"REG_a") \n\t" 00261 00262 "add $16, %%"REG_a" \n\t" 00263 "js 1b \n\t" 00264 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) 00265 : "%"REG_a, "memory" 00266 ); 00267 block[0]= block0; 00268 } 00269 00270 static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s, 00271 DCTELEM *block, int n, int qscale) 00272 { 00273 x86_reg nCoeffs; 00274 const uint16_t *quant_matrix; 00275 00276 assert(s->block_last_index[n]>=0); 00277 00278 nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; 00279 00280 quant_matrix = s->inter_matrix; 00281 __asm__ volatile( 00282 "pcmpeqw %%mm7, %%mm7 \n\t" 00283 "psrlw $15, %%mm7 \n\t" 00284 "movd %2, %%mm6 \n\t" 00285 "packssdw %%mm6, %%mm6 \n\t" 00286 "packssdw %%mm6, %%mm6 \n\t" 00287 "mov %3, %%"REG_a" \n\t" 00288 ".p2align 4 \n\t" 00289 "1: \n\t" 00290 "movq (%0, %%"REG_a"), %%mm0 \n\t" 00291 "movq 8(%0, %%"REG_a"), %%mm1 \n\t" 00292 "movq (%1, %%"REG_a"), %%mm4 \n\t" 00293 "movq 8(%1, %%"REG_a"), %%mm5 \n\t" 00294 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 00295 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 00296 "pxor %%mm2, %%mm2 \n\t" 00297 "pxor %%mm3, %%mm3 \n\t" 00298 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 00299 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? 
-1 : 0 00300 "pxor %%mm2, %%mm0 \n\t" 00301 "pxor %%mm3, %%mm1 \n\t" 00302 "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) 00303 "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) 00304 "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 00305 "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 00306 "paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1 00307 "paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1 00308 "pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q 00309 "pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q 00310 "pxor %%mm4, %%mm4 \n\t" 00311 "pxor %%mm5, %%mm5 \n\t" // FIXME slow 00312 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 00313 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 00314 "psraw $4, %%mm0 \n\t" 00315 "psraw $4, %%mm1 \n\t" 00316 "psubw %%mm7, %%mm0 \n\t" 00317 "psubw %%mm7, %%mm1 \n\t" 00318 "por %%mm7, %%mm0 \n\t" 00319 "por %%mm7, %%mm1 \n\t" 00320 "pxor %%mm2, %%mm0 \n\t" 00321 "pxor %%mm3, %%mm1 \n\t" 00322 "psubw %%mm2, %%mm0 \n\t" 00323 "psubw %%mm3, %%mm1 \n\t" 00324 "pandn %%mm0, %%mm4 \n\t" 00325 "pandn %%mm1, %%mm5 \n\t" 00326 "movq %%mm4, (%0, %%"REG_a") \n\t" 00327 "movq %%mm5, 8(%0, %%"REG_a") \n\t" 00328 00329 "add $16, %%"REG_a" \n\t" 00330 "js 1b \n\t" 00331 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) 00332 : "%"REG_a, "memory" 00333 ); 00334 } 00335 00336 static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s, 00337 DCTELEM *block, int n, int qscale) 00338 { 00339 x86_reg nCoeffs; 00340 const uint16_t *quant_matrix; 00341 int block0; 00342 00343 assert(s->block_last_index[n]>=0); 00344 00345 if(s->alternate_scan) nCoeffs= 63; //FIXME 00346 else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; 00347 00348 if (n < 4) 00349 block0 = block[0] * s->y_dc_scale; 00350 else 00351 block0 = block[0] * s->c_dc_scale; 00352 quant_matrix = s->intra_matrix; 00353 __asm__ volatile( 00354 "pcmpeqw %%mm7, %%mm7 \n\t" 00355 "psrlw $15, %%mm7 \n\t" 00356 "movd %2, 
%%mm6 \n\t" 00357 "packssdw %%mm6, %%mm6 \n\t" 00358 "packssdw %%mm6, %%mm6 \n\t" 00359 "mov %3, %%"REG_a" \n\t" 00360 ".p2align 4 \n\t" 00361 "1: \n\t" 00362 "movq (%0, %%"REG_a"), %%mm0 \n\t" 00363 "movq 8(%0, %%"REG_a"), %%mm1 \n\t" 00364 "movq (%1, %%"REG_a"), %%mm4 \n\t" 00365 "movq 8(%1, %%"REG_a"), %%mm5 \n\t" 00366 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 00367 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 00368 "pxor %%mm2, %%mm2 \n\t" 00369 "pxor %%mm3, %%mm3 \n\t" 00370 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 00371 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 00372 "pxor %%mm2, %%mm0 \n\t" 00373 "pxor %%mm3, %%mm1 \n\t" 00374 "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) 00375 "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) 00376 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q 00377 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q 00378 "pxor %%mm4, %%mm4 \n\t" 00379 "pxor %%mm5, %%mm5 \n\t" // FIXME slow 00380 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 00381 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? 
-1 : 0 00382 "psraw $3, %%mm0 \n\t" 00383 "psraw $3, %%mm1 \n\t" 00384 "pxor %%mm2, %%mm0 \n\t" 00385 "pxor %%mm3, %%mm1 \n\t" 00386 "psubw %%mm2, %%mm0 \n\t" 00387 "psubw %%mm3, %%mm1 \n\t" 00388 "pandn %%mm0, %%mm4 \n\t" 00389 "pandn %%mm1, %%mm5 \n\t" 00390 "movq %%mm4, (%0, %%"REG_a") \n\t" 00391 "movq %%mm5, 8(%0, %%"REG_a") \n\t" 00392 00393 "add $16, %%"REG_a" \n\t" 00394 "jng 1b \n\t" 00395 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) 00396 : "%"REG_a, "memory" 00397 ); 00398 block[0]= block0; 00399 //Note, we do not do mismatch control for intra as errors cannot accumulate 00400 } 00401 00402 static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s, 00403 DCTELEM *block, int n, int qscale) 00404 { 00405 x86_reg nCoeffs; 00406 const uint16_t *quant_matrix; 00407 00408 assert(s->block_last_index[n]>=0); 00409 00410 if(s->alternate_scan) nCoeffs= 63; //FIXME 00411 else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; 00412 00413 quant_matrix = s->inter_matrix; 00414 __asm__ volatile( 00415 "pcmpeqw %%mm7, %%mm7 \n\t" 00416 "psrlq $48, %%mm7 \n\t" 00417 "movd %2, %%mm6 \n\t" 00418 "packssdw %%mm6, %%mm6 \n\t" 00419 "packssdw %%mm6, %%mm6 \n\t" 00420 "mov %3, %%"REG_a" \n\t" 00421 ".p2align 4 \n\t" 00422 "1: \n\t" 00423 "movq (%0, %%"REG_a"), %%mm0 \n\t" 00424 "movq 8(%0, %%"REG_a"), %%mm1 \n\t" 00425 "movq (%1, %%"REG_a"), %%mm4 \n\t" 00426 "movq 8(%1, %%"REG_a"), %%mm5 \n\t" 00427 "pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] 00428 "pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] 00429 "pxor %%mm2, %%mm2 \n\t" 00430 "pxor %%mm3, %%mm3 \n\t" 00431 "pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 00432 "pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? 
-1 : 0 00433 "pxor %%mm2, %%mm0 \n\t" 00434 "pxor %%mm3, %%mm1 \n\t" 00435 "psubw %%mm2, %%mm0 \n\t" // abs(block[i]) 00436 "psubw %%mm3, %%mm1 \n\t" // abs(block[i]) 00437 "paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 00438 "paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 00439 "pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q 00440 "pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q 00441 "paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q 00442 "paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q 00443 "pxor %%mm4, %%mm4 \n\t" 00444 "pxor %%mm5, %%mm5 \n\t" // FIXME slow 00445 "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 00446 "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 00447 "psrlw $4, %%mm0 \n\t" 00448 "psrlw $4, %%mm1 \n\t" 00449 "pxor %%mm2, %%mm0 \n\t" 00450 "pxor %%mm3, %%mm1 \n\t" 00451 "psubw %%mm2, %%mm0 \n\t" 00452 "psubw %%mm3, %%mm1 \n\t" 00453 "pandn %%mm0, %%mm4 \n\t" 00454 "pandn %%mm1, %%mm5 \n\t" 00455 "pxor %%mm4, %%mm7 \n\t" 00456 "pxor %%mm5, %%mm7 \n\t" 00457 "movq %%mm4, (%0, %%"REG_a") \n\t" 00458 "movq %%mm5, 8(%0, %%"REG_a") \n\t" 00459 00460 "add $16, %%"REG_a" \n\t" 00461 "jng 1b \n\t" 00462 "movd 124(%0, %3), %%mm0 \n\t" 00463 "movq %%mm7, %%mm6 \n\t" 00464 "psrlq $32, %%mm7 \n\t" 00465 "pxor %%mm6, %%mm7 \n\t" 00466 "movq %%mm7, %%mm6 \n\t" 00467 "psrlq $16, %%mm7 \n\t" 00468 "pxor %%mm6, %%mm7 \n\t" 00469 "pslld $31, %%mm7 \n\t" 00470 "psrlq $15, %%mm7 \n\t" 00471 "pxor %%mm7, %%mm0 \n\t" 00472 "movd %%mm0, 124(%0, %3) \n\t" 00473 00474 ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "r" (-2*nCoeffs) 00475 : "%"REG_a, "memory" 00476 ); 00477 } 00478 00479 static void denoise_dct_mmx(MpegEncContext *s, DCTELEM *block){ 00480 const int intra= s->mb_intra; 00481 int *sum= s->dct_error_sum[intra]; 00482 uint16_t *offset= s->dct_offset[intra]; 00483 00484 s->dct_count[intra]++; 00485 00486 __asm__ volatile( 00487 "pxor %%mm7, %%mm7 \n\t" 00488 "1: \n\t" 00489 "pxor %%mm0, %%mm0 \n\t" 00490 
"pxor %%mm1, %%mm1 \n\t" 00491 "movq (%0), %%mm2 \n\t" 00492 "movq 8(%0), %%mm3 \n\t" 00493 "pcmpgtw %%mm2, %%mm0 \n\t" 00494 "pcmpgtw %%mm3, %%mm1 \n\t" 00495 "pxor %%mm0, %%mm2 \n\t" 00496 "pxor %%mm1, %%mm3 \n\t" 00497 "psubw %%mm0, %%mm2 \n\t" 00498 "psubw %%mm1, %%mm3 \n\t" 00499 "movq %%mm2, %%mm4 \n\t" 00500 "movq %%mm3, %%mm5 \n\t" 00501 "psubusw (%2), %%mm2 \n\t" 00502 "psubusw 8(%2), %%mm3 \n\t" 00503 "pxor %%mm0, %%mm2 \n\t" 00504 "pxor %%mm1, %%mm3 \n\t" 00505 "psubw %%mm0, %%mm2 \n\t" 00506 "psubw %%mm1, %%mm3 \n\t" 00507 "movq %%mm2, (%0) \n\t" 00508 "movq %%mm3, 8(%0) \n\t" 00509 "movq %%mm4, %%mm2 \n\t" 00510 "movq %%mm5, %%mm3 \n\t" 00511 "punpcklwd %%mm7, %%mm4 \n\t" 00512 "punpckhwd %%mm7, %%mm2 \n\t" 00513 "punpcklwd %%mm7, %%mm5 \n\t" 00514 "punpckhwd %%mm7, %%mm3 \n\t" 00515 "paddd (%1), %%mm4 \n\t" 00516 "paddd 8(%1), %%mm2 \n\t" 00517 "paddd 16(%1), %%mm5 \n\t" 00518 "paddd 24(%1), %%mm3 \n\t" 00519 "movq %%mm4, (%1) \n\t" 00520 "movq %%mm2, 8(%1) \n\t" 00521 "movq %%mm5, 16(%1) \n\t" 00522 "movq %%mm3, 24(%1) \n\t" 00523 "add $16, %0 \n\t" 00524 "add $32, %1 \n\t" 00525 "add $16, %2 \n\t" 00526 "cmp %3, %0 \n\t" 00527 " jb 1b \n\t" 00528 : "+r" (block), "+r" (sum), "+r" (offset) 00529 : "r"(block+64) 00530 ); 00531 } 00532 00533 static void denoise_dct_sse2(MpegEncContext *s, DCTELEM *block){ 00534 const int intra= s->mb_intra; 00535 int *sum= s->dct_error_sum[intra]; 00536 uint16_t *offset= s->dct_offset[intra]; 00537 00538 s->dct_count[intra]++; 00539 00540 __asm__ volatile( 00541 "pxor %%xmm7, %%xmm7 \n\t" 00542 "1: \n\t" 00543 "pxor %%xmm0, %%xmm0 \n\t" 00544 "pxor %%xmm1, %%xmm1 \n\t" 00545 "movdqa (%0), %%xmm2 \n\t" 00546 "movdqa 16(%0), %%xmm3 \n\t" 00547 "pcmpgtw %%xmm2, %%xmm0 \n\t" 00548 "pcmpgtw %%xmm3, %%xmm1 \n\t" 00549 "pxor %%xmm0, %%xmm2 \n\t" 00550 "pxor %%xmm1, %%xmm3 \n\t" 00551 "psubw %%xmm0, %%xmm2 \n\t" 00552 "psubw %%xmm1, %%xmm3 \n\t" 00553 "movdqa %%xmm2, %%xmm4 \n\t" 00554 "movdqa %%xmm3, %%xmm5 \n\t" 00555 
"psubusw (%2), %%xmm2 \n\t" 00556 "psubusw 16(%2), %%xmm3 \n\t" 00557 "pxor %%xmm0, %%xmm2 \n\t" 00558 "pxor %%xmm1, %%xmm3 \n\t" 00559 "psubw %%xmm0, %%xmm2 \n\t" 00560 "psubw %%xmm1, %%xmm3 \n\t" 00561 "movdqa %%xmm2, (%0) \n\t" 00562 "movdqa %%xmm3, 16(%0) \n\t" 00563 "movdqa %%xmm4, %%xmm6 \n\t" 00564 "movdqa %%xmm5, %%xmm0 \n\t" 00565 "punpcklwd %%xmm7, %%xmm4 \n\t" 00566 "punpckhwd %%xmm7, %%xmm6 \n\t" 00567 "punpcklwd %%xmm7, %%xmm5 \n\t" 00568 "punpckhwd %%xmm7, %%xmm0 \n\t" 00569 "paddd (%1), %%xmm4 \n\t" 00570 "paddd 16(%1), %%xmm6 \n\t" 00571 "paddd 32(%1), %%xmm5 \n\t" 00572 "paddd 48(%1), %%xmm0 \n\t" 00573 "movdqa %%xmm4, (%1) \n\t" 00574 "movdqa %%xmm6, 16(%1) \n\t" 00575 "movdqa %%xmm5, 32(%1) \n\t" 00576 "movdqa %%xmm0, 48(%1) \n\t" 00577 "add $32, %0 \n\t" 00578 "add $64, %1 \n\t" 00579 "add $32, %2 \n\t" 00580 "cmp %3, %0 \n\t" 00581 " jb 1b \n\t" 00582 : "+r" (block), "+r" (sum), "+r" (offset) 00583 : "r"(block+64) 00584 XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3", 00585 "%xmm4", "%xmm5", "%xmm6", "%xmm7") 00586 ); 00587 } 00588 00589 #if HAVE_SSSE3 00590 #define HAVE_SSSE3_BAK 00591 #endif 00592 #undef HAVE_SSSE3 00593 #define HAVE_SSSE3 0 00594 00595 #undef HAVE_SSE2 00596 #undef HAVE_MMX2 00597 #define HAVE_SSE2 0 00598 #define HAVE_MMX2 0 00599 #define RENAME(a) a ## _MMX 00600 #define RENAMEl(a) a ## _mmx 00601 #include "mpegvideo_mmx_template.c" 00602 00603 #undef HAVE_MMX2 00604 #define HAVE_MMX2 1 00605 #undef RENAME 00606 #undef RENAMEl 00607 #define RENAME(a) a ## _MMX2 00608 #define RENAMEl(a) a ## _mmx2 00609 #include "mpegvideo_mmx_template.c" 00610 00611 #undef HAVE_SSE2 00612 #define HAVE_SSE2 1 00613 #undef RENAME 00614 #undef RENAMEl 00615 #define RENAME(a) a ## _SSE2 00616 #define RENAMEl(a) a ## _sse2 00617 #include "mpegvideo_mmx_template.c" 00618 00619 #ifdef HAVE_SSSE3_BAK 00620 #undef HAVE_SSSE3 00621 #define HAVE_SSSE3 1 00622 #undef RENAME 00623 #undef RENAMEl 00624 #define RENAME(a) a ## _SSSE3 00625 #define 
RENAMEl(a) a ## _sse2 00626 #include "mpegvideo_mmx_template.c" 00627 #endif 00628 00629 void MPV_common_init_mmx(MpegEncContext *s) 00630 { 00631 int mm_flags = av_get_cpu_flags(); 00632 00633 if (mm_flags & AV_CPU_FLAG_MMX) { 00634 const int dct_algo = s->avctx->dct_algo; 00635 00636 s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx; 00637 s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx; 00638 s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx; 00639 s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx; 00640 if(!(s->flags & CODEC_FLAG_BITEXACT)) 00641 s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx; 00642 s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx; 00643 00644 if (mm_flags & AV_CPU_FLAG_SSE2) { 00645 s->denoise_dct= denoise_dct_sse2; 00646 } else { 00647 s->denoise_dct= denoise_dct_mmx; 00648 } 00649 00650 if(dct_algo==FF_DCT_AUTO || dct_algo==FF_DCT_MMX){ 00651 #if HAVE_SSSE3 00652 if(mm_flags & AV_CPU_FLAG_SSSE3){ 00653 s->dct_quantize= dct_quantize_SSSE3; 00654 } else 00655 #endif 00656 if(mm_flags & AV_CPU_FLAG_SSE2){ 00657 s->dct_quantize= dct_quantize_SSE2; 00658 } else if(mm_flags & AV_CPU_FLAG_MMX2){ 00659 s->dct_quantize= dct_quantize_MMX2; 00660 } else { 00661 s->dct_quantize= dct_quantize_MMX; 00662 } 00663 } 00664 } 00665 }