/*
 * Copyright (c) 2002 Brian Foley
 * Copyright (c) 2002 Dieter Shirley
 * Copyright (c) 2003-2004 Romain Dolbeau <romain@dolbeau.org>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif
#include "libavcodec/dsputil.h"
#include "util_altivec.h"
#include "types_altivec.h"
#include "dsputil_altivec.h"

static int sad16_x2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix2iv, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);
    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16] */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix2[0];
        pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

        tv = (vector unsigned char *) &pix2[1];
        pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix2iv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
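
/* SAD of a 16-pixel-wide block against the vertical (y+1/2) half-pel
 * average of pix2, i.e. avg(pix2[x], pix2[x + line_size]), over h rows. */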
static int sad16_y2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector unsigned char *tv;
    vector unsigned char pix1v, pix2v, pix3v, avgv, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    sad = (vector unsigned int)vec_splat_u32(0);

    /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read each
       time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15] */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15] */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        /* Calculate the average vector */
        avgv = vec_avg(pix2v, pix3v);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2v = pix3v;
        pix3 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);
    return s;
}
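
/* SAD of a 16-pixel-wide block against the four-point (x+1/2, y+1/2)
 * average of pix2. The averaging is done in 16-bit arithmetic as
 * (a+b+c+d+2) >> 2 to get correct rounding; see the note inside the loop
 * on why vec_avg() cannot be used here. */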
static int sad16_xy2_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    uint8_t *pix3 = pix2 + line_size;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    const vector unsigned short two = (const vector unsigned short)vec_splat_u16(2);
    vector unsigned char *tv, avgv, t5;
    vector unsigned char pix1v, pix2v, pix3v, pix2iv, pix3iv;
    vector unsigned short pix2lv, pix2hv, pix2ilv, pix2ihv;
    vector unsigned short pix3lv, pix3hv, pix3ilv, pix3ihv;
    vector unsigned short avghv, avglv;
    vector unsigned short t1, t2, t3, t4;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    s = 0;

    /* Due to the fact that pix3 = pix2 + line_size, the pix3 of one
       iteration becomes pix2 in the next iteration. We can use this
       fact to avoid a potentially expensive unaligned read, as well
       as some splitting and vector addition each time around the loop.
       Read unaligned pixels into our vectors. The vectors are as follows:
       pix2v: pix2[0]-pix2[15]      pix2iv: pix2[1]-pix2[16]
       Split the pixel vectors into shorts */
    tv = (vector unsigned char *) &pix2[0];
    pix2v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[0]));

    tv = (vector unsigned char *) &pix2[1];
    pix2iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix2[1]));

    pix2hv  = (vector unsigned short) vec_mergeh(zero, pix2v);
    pix2lv  = (vector unsigned short) vec_mergel(zero, pix2v);
    pix2ihv = (vector unsigned short) vec_mergeh(zero, pix2iv);
    pix2ilv = (vector unsigned short) vec_mergel(zero, pix2iv);
    t1 = vec_add(pix2hv, pix2ihv);
    t2 = vec_add(pix2lv, pix2ilv);

    for (i = 0; i < h; i++) {
        /* Read unaligned pixels into our vectors. The vectors are as follows:
           pix1v: pix1[0]-pix1[15]
           pix3v: pix3[0]-pix3[15]      pix3iv: pix3[1]-pix3[16] */
        tv = (vector unsigned char *) pix1;
        pix1v = vec_perm(tv[0], tv[1], vec_lvsl(0, pix1));

        tv = (vector unsigned char *) &pix3[0];
        pix3v = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[0]));

        tv = (vector unsigned char *) &pix3[1];
        pix3iv = vec_perm(tv[0], tv[1], vec_lvsl(0, &pix3[1]));

        /* Note that AltiVec does have vec_avg, but this works on vector pairs
           and rounds up. We could do avg(avg(a,b), avg(c,d)), but the rounding
           would mean that, for example, avg(3,0,0,1) = 2, when it should be 1.
           Instead, we have to split the pixel vectors into vectors of shorts
           and do the averaging by hand. */

        /* Split the pixel vectors into shorts */
        pix3hv  = (vector unsigned short) vec_mergeh(zero, pix3v);
        pix3lv  = (vector unsigned short) vec_mergel(zero, pix3v);
        pix3ihv = (vector unsigned short) vec_mergeh(zero, pix3iv);
        pix3ilv = (vector unsigned short) vec_mergel(zero, pix3iv);

        /* Do the averaging on them */
        t3 = vec_add(pix3hv, pix3ihv);
        t4 = vec_add(pix3lv, pix3ilv);

        avghv = vec_sr(vec_add(vec_add(t1, t3), two), two);
        avglv = vec_sr(vec_add(vec_add(t2, t4), two), two);

        /* Pack the shorts back into a result */
        avgv = vec_pack(avghv, avglv);

        /* Calculate a sum of abs differences vector */
        t5 = vec_sub(vec_max(pix1v, avgv), vec_min(pix1v, avgv));

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix3 += line_size;
        /* Transfer the calculated values for pix3 into pix2 */
        t1 = t3;
        t2 = t4;
    }
    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

static int sad16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, pix1v_low, pix1v_high, pix2v_low, pix2v_high;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v_high = vec_ld( 0, pix1);
        pix1v_low  = vec_ld(15, pix1);
        perm2 = vec_lvsl(0, pix2);
        pix2v_high = vec_ld( 0, pix2);
        pix2v_low  = vec_ld(15, pix2);
        t1 = vec_perm(pix1v_high, pix1v_low, perm1);
        t2 = vec_perm(pix2v_high, pix2v_low, perm2);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}
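
/* SAD of an 8-pixel-wide block: 16 bytes are loaded per row and the unused
 * right half is cleared with permclear, so it does not affect the sum. */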
static int sad8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sad;
    vector signed int sumdiffs;

    sad = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /* Calculate a sum of abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t5, sad);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

static int pix_norm1_altivec(uint8_t *pix, int line_size)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char *tv;
    vector unsigned char pixv;
    vector unsigned int sv;
    vector signed int sum;

    sv = (vector unsigned int)vec_splat_u32(0);

    s = 0;
    for (i = 0; i < 16; i++) {
        /* Read in the potentially unaligned pixels */
        tv = (vector unsigned char *) pix;
        pixv = vec_perm(tv[0], tv[1], vec_lvsl(0, pix));

        /* Square the values, and add them to our sum */
        sv = vec_msum(pixv, pixv, sv);

        pix += line_size;
    }
    /* Sum up the four partial sums, and put the result into s */
    sum = vec_sums((vector signed int) sv, (vector signed int) zero);
    sum = vec_splat(sum, 3);
    vec_ste(sum, 0, &s);

    return s;
}
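
/* Sum of squared errors for an 8-pixel-wide block: same masking trick as
 * sad8_altivec() above, with the absolute differences squared and
 * accumulated via vec_msum(). */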
static int sse8_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, permclear, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    permclear = (vector unsigned char){255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0};

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2.
           Since we're reading 16 pixels, and actually only want 8,
           mask out the last 8 pixels. The 0s don't change the sum. */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_and(vec_perm(pix1v[0], pix1v[1], perm1), permclear);
        t2 = vec_and(vec_perm(pix2v[0], pix2v[1], perm2), permclear);

        /* Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2. */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}
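
/* Sum of squared errors for a 16-pixel-wide block; abs(a-b)^2 == (a-b)^2,
 * so the unsigned difference can be squared directly. */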
static int sse16_altivec(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int i;
    int s;
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm1, perm2, *pix1v, *pix2v;
    vector unsigned char t1, t2, t3, t4, t5;
    vector unsigned int sum;
    vector signed int sumsqr;

    sum = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < h; i++) {
        /* Read potentially unaligned pixels into t1 and t2 */
        perm1 = vec_lvsl(0, pix1);
        pix1v = (vector unsigned char *) pix1;
        perm2 = vec_lvsl(0, pix2);
        pix2v = (vector unsigned char *) pix2;
        t1 = vec_perm(pix1v[0], pix1v[1], perm1);
        t2 = vec_perm(pix2v[0], pix2v[1], perm2);

        /* Since we want to use unsigned chars, we can take advantage
           of the fact that abs(a-b)^2 = (a-b)^2. */

        /* Calculate abs differences vector */
        t3 = vec_max(t1, t2);
        t4 = vec_min(t1, t2);
        t5 = vec_sub(t3, t4);

        /* Square the values and add them to our sum */
        sum = vec_msum(t5, t5, sum);

        pix1 += line_size;
        pix2 += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumsqr = vec_sums((vector signed int) sum, (vector signed int) zero);
    sumsqr = vec_splat(sumsqr, 3);
    vec_ste(sumsqr, 0, &s);

    return s;
}

static int pix_sum_altivec(uint8_t * pix, int line_size)
{
    const vector unsigned int zero = (const vector unsigned int)vec_splat_u32(0);
    vector unsigned char perm, *pixv;
    vector unsigned char t1;
    vector unsigned int sad;
    vector signed int sumdiffs;

    int i;
    int s;

    sad = (vector unsigned int)vec_splat_u32(0);

    for (i = 0; i < 16; i++) {
        /* Read the potentially unaligned 16 pixels into t1 */
        perm = vec_lvsl(0, pix);
        pixv = (vector unsigned char *) pix;
        t1 = vec_perm(pixv[0], pixv[1], perm);

        /* Add each 4 pixel group together and put 4 results into sad */
        sad = vec_sum4s(t1, sad);

        pix += line_size;
    }

    /* Sum up the four partial sums, and put the result into s */
    sumdiffs = vec_sums((vector signed int) sad, (vector signed int) zero);
    sumdiffs = vec_splat(sumdiffs, 3);
    vec_ste(sumdiffs, 0, &s);

    return s;
}

static void get_pixels_altivec(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector signed short shorts;

    for (i = 0; i < 8; i++) {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, pixels);
        pixv = (vector unsigned char *) pixels;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // Convert the bytes into shorts.
        shorts = (vector signed short)vec_mergeh(zero, bytes);

        // Save the data to the block; we assume the block is 16-byte aligned.
        vec_st(shorts, i*16, (vector signed short*)block);

        pixels += line_size;
    }
}
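
/* block[i] = s1[i] - s2[i] for an 8x8 block, widening the bytes to signed
 * 16-bit coefficients; the loop body is duplicated by hand so each
 * iteration handles two rows. */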
static void diff_pixels_altivec(DCTELEM *restrict block, const uint8_t *s1,
                                const uint8_t *s2, int stride)
{
    int i;
    vector unsigned char perm, bytes, *pixv;
    const vector unsigned char zero = (const vector unsigned char)vec_splat_u8(0);
    vector signed short shorts1, shorts2;

    for (i = 0; i < 4; i++) {
        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // Convert the bytes into shorts.
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels.
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // Convert the bytes into shorts.
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction.
        shorts1 = vec_sub(shorts1, shorts2);

        // Save the data to the block; we assume the block is 16-byte aligned.
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;

        // The code below is a copy of the code above... This is a manual unroll.

        // Read potentially unaligned pixels.
        // We're reading 16 pixels, and actually only want 8,
        // but we simply ignore the extras.
        perm = vec_lvsl(0, s1);
        pixv = (vector unsigned char *) s1;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // Convert the bytes into shorts.
        shorts1 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the same for the second block of pixels.
        perm = vec_lvsl(0, s2);
        pixv = (vector unsigned char *) s2;
        bytes = vec_perm(pixv[0], pixv[1], perm);

        // Convert the bytes into shorts.
        shorts2 = (vector signed short)vec_mergeh(zero, bytes);

        // Do the subtraction.
        shorts1 = vec_sub(shorts1, shorts2);

        // Save the data to the block; we assume the block is 16-byte aligned.
        vec_st(shorts1, 0, (vector signed short*)block);

        s1 += stride;
        s2 += stride;
        block += 8;
    }
}

static void clear_block_altivec(DCTELEM *block)
{
    LOAD_ZERO;
    vec_st(zero_s16v,   0, block);
    vec_st(zero_s16v,  16, block);
    vec_st(zero_s16v,  32, block);
    vec_st(zero_s16v,  48, block);
    vec_st(zero_s16v,  64, block);
    vec_st(zero_s16v,  80, block);
    vec_st(zero_s16v,  96, block);
    vec_st(zero_s16v, 112, block);
}

static void add_bytes_altivec(uint8_t *dst, uint8_t *src, int w)
{
    register int i;
    register vector unsigned char vdst, vsrc;

    /* dst and src are 16-byte aligned (guaranteed) */
    for (i = 0; (i + 15) < w; i += 16) {
        vdst = vec_ld(i, (unsigned char*)dst);
        vsrc = vec_ld(i, (unsigned char*)src);
        vdst = vec_add(vsrc, vdst);
        vec_st(vdst, i, (unsigned char*)dst);
    }
    /* scalar tail if w is not a multiple of 16 */
    for (; (i < w); i++) {
        dst[i] += src[i];
    }
}

/* next one assumes that ((line_size % 16) == 0) */
void put_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    register vector unsigned char pixelsv1, pixelsv2;
    register vector unsigned char pixelsv1B, pixelsv2B;
    register vector unsigned char pixelsv1C, pixelsv2C;
    register vector unsigned char pixelsv1D, pixelsv2D;

    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;
    register int line_size_2 = line_size << 1;
    register int line_size_3 = line_size + line_size_2;
    register int line_size_4 = line_size << 2;

    // Hand-unrolling the loop by 4 gains about 15%;
    // minimum execution time goes from 74 to 60 cycles.
    // It's faster than -funroll-loops, but using
    // -funroll-loops with this is bad - 74 cycles again.
    // All this is on a 7450, tuning for the 7450.
#if 0
    for (i = 0; i < h; i++) {
        pixelsv1 = vec_ld( 0, pixels);
        pixelsv2 = vec_ld(16, pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, block);
        pixels += line_size;
        block  += line_size;
    }
#else
    for (i = 0; i < h; i += 4) {
        pixelsv1  = vec_ld( 0, pixels);
        pixelsv2  = vec_ld(15, pixels);
        pixelsv1B = vec_ld(line_size, pixels);
        pixelsv2B = vec_ld(15 + line_size, pixels);
        pixelsv1C = vec_ld(line_size_2, pixels);
        pixelsv2C = vec_ld(15 + line_size_2, pixels);
        pixelsv1D = vec_ld(line_size_3, pixels);
        pixelsv2D = vec_ld(15 + line_size_3, pixels);
        vec_st(vec_perm(pixelsv1, pixelsv2, perm),
               0, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1B, pixelsv2B, perm),
               line_size, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1C, pixelsv2C, perm),
               line_size_2, (unsigned char*)block);
        vec_st(vec_perm(pixelsv1D, pixelsv2D, perm),
               line_size_3, (unsigned char*)block);
        pixels += line_size_4;
        block  += line_size_4;
    }
#endif
}

/* next one assumes that ((line_size % 16) == 0) */
#define op_avg(a,b)  a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
void avg_pixels16_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    register vector unsigned char perm = vec_lvsl(0, pixels);
    int i;

    for (i = 0; i < h; i++) {
        pixelsv1 = vec_ld( 0, pixels);
        pixelsv2 = vec_ld(16, pixels);
        blockv = vec_ld(0, block);
        pixelsv = vec_perm(pixelsv1, pixelsv2, perm);
        blockv = vec_avg(blockv, pixelsv);
        vec_st(blockv, 0, (unsigned char*)block);
        pixels += line_size;
        block  += line_size;
    }
}

/* next one assumes that ((line_size % 8) == 0) */
static void avg_pixels8_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    register vector unsigned char pixelsv1, pixelsv2, pixelsv, blockv;
    int i;

    for (i = 0; i < h; i++) {
        /* block is 8-byte aligned, so we're either in the
           left half of a 16-byte line (16-byte aligned) or in the right half (not) */
        int rightside = ((unsigned long)block & 0x0000000F);

        blockv = vec_ld(0, block);
        pixelsv1 = vec_ld( 0, pixels);
        pixelsv2 = vec_ld(16, pixels);
        pixelsv = vec_perm(pixelsv1, pixelsv2, vec_lvsl(0, pixels));

        if (rightside) {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(0,1,s0,s1));
        } else {
            pixelsv = vec_perm(blockv, pixelsv, vcprm(s0,s1,2,3));
        }

        blockv = vec_avg(blockv, pixelsv);

        vec_st(blockv, 0, block);

        pixels += line_size;
        block += line_size;
    }
}

/* next one assumes that ((line_size % 8) == 0) */
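/* 2-D (x+1/2, y+1/2) half-pel interpolation with rounding:
 * dst[x] = (p[x] + p[x+1] + p[x+line_size] + p[x+line_size+1] + 2) >> 2.
 * The horizontal pair sum of each source row is kept in pixelssum1 so
 * every row is loaded and widened only once. */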
static void put_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}

/* next one assumes that ((line_size % 8) == 0) */
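/* Same as put_pixels8_xy2_altivec() above, but the no-rounding variant:
 * the bias added before the shift is 1 (vcone) instead of 2 (vctwo). */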
static void put_no_rnd_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short pixelssum1, pixelssum2, temp3;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vcone);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blockv = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blockv = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}

/* next one assumes that ((line_size % 16) == 0) */
static void put_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short temp3, temp4,
        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vctwo);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}

/* next one assumes that ((line_size % 16) == 0) */
static void put_no_rnd_pixels16_xy2_altivec(uint8_t * block, const uint8_t * pixels, int line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsv3, pixelsv4;
    register vector unsigned char blockv, temp1, temp2;
    register vector unsigned short temp3, temp4,
        pixelssum1, pixelssum2, pixelssum3, pixelssum4;
    register const vector unsigned char vczero = (const vector unsigned char)vec_splat_u8(0);
    register const vector unsigned short vcone = (const vector unsigned short)vec_splat_u16(1);
    register const vector unsigned short vctwo = (const vector unsigned short)vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv3 = vec_mergel(vczero, pixelsv1);
    pixelsv4 = vec_mergel(vczero, pixelsv2);
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum3 = vec_add((vector unsigned short)pixelsv3,
                         (vector unsigned short)pixelsv4);
    pixelssum3 = vec_add(pixelssum3, vcone);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vcone);

    for (i = 0; i < h ; i++) {
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv3 = vec_mergel(vczero, pixelsv1);
        pixelsv4 = vec_mergel(vczero, pixelsv2);
        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);

        pixelssum4 = vec_add((vector unsigned short)pixelsv3,
                             (vector unsigned short)pixelsv4);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp4 = vec_add(pixelssum3, pixelssum4);
        temp4 = vec_sra(temp4, vctwo);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);

        pixelssum3 = vec_add(pixelssum4, vcone);
        pixelssum1 = vec_add(pixelssum2, vcone);

        blockv = vec_packsu(temp3, temp4);

        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
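
/* Sum of absolute values of the 2-D 8x8 Hadamard transform of the
 * difference between src and dst (SATD), used as a comparison function
 * for motion estimation and macroblock decision. */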
static int hadamard8_diff8x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h)
{
    int sum;
    register const vector unsigned char vzero =
        (const vector unsigned char)vec_splat_u8(0);
    register vector signed short temp0, temp1, temp2, temp3, temp4,
                                 temp5, temp6, temp7;
    {
        register const vector signed short vprod1 = (const vector signed short)
            { 1,-1, 1,-1, 1,-1, 1,-1 };
        register const vector signed short vprod2 = (const vector signed short)
            { 1, 1,-1,-1, 1, 1,-1,-1 };
        register const vector signed short vprod3 = (const vector signed short)
            { 1, 1, 1, 1,-1,-1,-1,-1 };
        register const vector unsigned char perm1 = (const vector unsigned char)
            {0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
             0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D};
        register const vector unsigned char perm2 = (const vector unsigned char)
            {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
             0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B};
        register const vector unsigned char perm3 = (const vector unsigned char)
            {0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
             0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07};

#define ONEITERBUTTERFLY(i, res) \
    { \
        register vector unsigned char src1, src2, srcO; \
        register vector unsigned char dst1, dst2, dstO; \
        register vector signed short srcV, dstV; \
        register vector signed short but0, but1, but2, op1, op2, op3; \
        src1 = vec_ld(stride * i, src); \
        src2 = vec_ld((stride * i) + 15, src); \
        srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
        dst1 = vec_ld(stride * i, dst); \
        dst2 = vec_ld((stride * i) + 15, dst); \
        dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
        /* promote the unsigned chars to signed shorts */ \
        /* we're in the 8x8 function, we only care for the first 8 */ \
        srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \
                                               (vector signed char)srcO); \
        dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \
                                               (vector signed char)dstO); \
        /* subtractions inside the first butterfly */ \
        but0 = vec_sub(srcV, dstV); \
        op1  = vec_perm(but0, but0, perm1); \
        but1 = vec_mladd(but0, vprod1, op1); \
        op2  = vec_perm(but1, but1, perm2); \
        but2 = vec_mladd(but1, vprod2, op2); \
        op3  = vec_perm(but2, but2, perm3); \
        res  = vec_mladd(but2, vprod3, op3); \
    }
        ONEITERBUTTERFLY(0, temp0);
        ONEITERBUTTERFLY(1, temp1);
        ONEITERBUTTERFLY(2, temp2);
        ONEITERBUTTERFLY(3, temp3);
        ONEITERBUTTERFLY(4, temp4);
        ONEITERBUTTERFLY(5, temp5);
        ONEITERBUTTERFLY(6, temp6);
        ONEITERBUTTERFLY(7, temp7);
    }
#undef ONEITERBUTTERFLY
    {
        register vector signed int vsum;
        register vector signed short line0 = vec_add(temp0, temp1);
        register vector signed short line1 = vec_sub(temp0, temp1);
        register vector signed short line2 = vec_add(temp2, temp3);
        register vector signed short line3 = vec_sub(temp2, temp3);
        register vector signed short line4 = vec_add(temp4, temp5);
        register vector signed short line5 = vec_sub(temp4, temp5);
        register vector signed short line6 = vec_add(temp6, temp7);
        register vector signed short line7 = vec_sub(temp6, temp7);

        register vector signed short line0B = vec_add(line0, line2);
        register vector signed short line2B = vec_sub(line0, line2);
        register vector signed short line1B = vec_add(line1, line3);
        register vector signed short line3B = vec_sub(line1, line3);
        register vector signed short line4B = vec_add(line4, line6);
        register vector signed short line6B = vec_sub(line4, line6);
        register vector signed short line5B = vec_add(line5, line7);
        register vector signed short line7B = vec_sub(line5, line7);

        register vector signed short line0C = vec_add(line0B, line4B);
        register vector signed short line4C = vec_sub(line0B, line4B);
        register vector signed short line1C = vec_add(line1B, line5B);
        register vector signed short line5C = vec_sub(line1B, line5B);
        register vector signed short line2C = vec_add(line2B, line6B);
        register vector signed short line6C = vec_sub(line2B, line6B);
        register vector signed short line3C = vec_add(line3B, line7B);
        register vector signed short line7C = vec_sub(line3B, line7B);
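        /* Sum the absolute values of all 64 transformed difference
           coefficients; vec_sums below folds the four partial sums into
           the final scalar. */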
        vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
        vsum = vec_sum4s(vec_abs(line1C), vsum);
        vsum = vec_sum4s(vec_abs(line2C), vsum);
        vsum = vec_sum4s(vec_abs(line3C), vsum);
        vsum = vec_sum4s(vec_abs(line4C), vsum);
        vsum = vec_sum4s(vec_abs(line5C), vsum);
        vsum = vec_sum4s(vec_abs(line6C), vsum);
        vsum = vec_sum4s(vec_abs(line7C), vsum);
        vsum = vec_sums(vsum, (vector signed int)vzero);
        vsum = vec_splat(vsum, 3);
        vec_ste(vsum, 0, &sum);
    }
    return sum;
}

/*
 * 16x8 works with 16 elements; it allows avoiding replicated loads and gives
 * the compiler more room for scheduling. It is only used from inside
 * hadamard8_diff16_altivec.
 *
 * Unfortunately, gcc-3.3 seems to be a bit dumb, and the compiled code has a
 * LOT of spill code; it seems gcc (unlike xlc) cannot keep everything in
 * registers by itself. The following code includes hand-made register
 * allocation. It is not clean, but on a 7450 the resulting code is much
 * faster (the best case falls from 700+ cycles to 550).
 *
 * xlc doesn't add spill code, but it doesn't know how to schedule for the
 * 7450, and its code isn't much faster than gcc-3.3's on the 7450 (though it
 * uses 25% fewer instructions...)
 *
 * On the 970, the hand-made RA is still a win (around 690 vs. around 780),
 * but xlc goes to around 660 on the regular C code...
 */

static int hadamard8_diff16x8_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h)
{
    int sum;
    register vector signed short
        temp0 __asm__ ("v0"),
        temp1 __asm__ ("v1"),
        temp2 __asm__ ("v2"),
        temp3 __asm__ ("v3"),
        temp4 __asm__ ("v4"),
        temp5 __asm__ ("v5"),
        temp6 __asm__ ("v6"),
        temp7 __asm__ ("v7");
    register vector signed short
        temp0S __asm__ ("v8"),
        temp1S __asm__ ("v9"),
        temp2S __asm__ ("v10"),
        temp3S __asm__ ("v11"),
        temp4S __asm__ ("v12"),
        temp5S __asm__ ("v13"),
        temp6S __asm__ ("v14"),
        temp7S __asm__ ("v15");
    register const vector unsigned char vzero __asm__ ("v31") =
        (const vector unsigned char)vec_splat_u8(0);
    {
        register const vector signed short vprod1 __asm__ ("v16") =
            (const vector signed short){ 1,-1, 1,-1, 1,-1, 1,-1 };
        register const vector signed short vprod2 __asm__ ("v17") =
            (const vector signed short){ 1, 1,-1,-1, 1, 1,-1,-1 };
        register const vector signed short vprod3 __asm__ ("v18") =
            (const vector signed short){ 1, 1, 1, 1,-1,-1,-1,-1 };
        register const vector unsigned char perm1 __asm__ ("v19") =
            (const vector unsigned char)
            {0x02, 0x03, 0x00, 0x01, 0x06, 0x07, 0x04, 0x05,
             0x0A, 0x0B, 0x08, 0x09, 0x0E, 0x0F, 0x0C, 0x0D};
        register const vector unsigned char perm2 __asm__ ("v20") =
            (const vector unsigned char)
            {0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03,
             0x0C, 0x0D, 0x0E, 0x0F, 0x08, 0x09, 0x0A, 0x0B};
        register const vector unsigned char perm3 __asm__ ("v21") =
            (const vector unsigned char)
            {0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F,
             0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07};

#define ONEITERBUTTERFLY(i, res1, res2) \
    { \
        register vector unsigned char src1 __asm__ ("v22"), \
                                      src2 __asm__ ("v23"), \
                                      dst1 __asm__ ("v24"), \
                                      dst2 __asm__ ("v25"), \
                                      srcO __asm__ ("v22"), \
                                      dstO __asm__ ("v23"); \
        \
        register vector signed short srcV  __asm__ ("v24"), \
                                     dstV  __asm__ ("v25"), \
                                     srcW  __asm__ ("v26"), \
                                     dstW  __asm__ ("v27"), \
                                     but0  __asm__ ("v28"), \
                                     but0S __asm__ ("v29"), \
                                     op1   __asm__ ("v30"), \
                                     but1  __asm__ ("v22"), \
                                     op1S  __asm__ ("v23"), \
                                     but1S __asm__ ("v24"), \
                                     op2   __asm__ ("v25"), \
                                     but2  __asm__ ("v26"), \
                                     op2S  __asm__ ("v27"), \
                                     but2S __asm__ ("v28"), \
                                     op3   __asm__ ("v29"), \
                                     op3S  __asm__ ("v30"); \
        \
        src1 = vec_ld(stride * i, src); \
        src2 = vec_ld((stride * i) + 16, src); \
        srcO = vec_perm(src1, src2, vec_lvsl(stride * i, src)); \
        dst1 = vec_ld(stride * i, dst); \
        dst2 = vec_ld((stride * i) + 16, dst); \
        dstO = vec_perm(dst1, dst2, vec_lvsl(stride * i, dst)); \
        /* promote the unsigned chars to signed shorts */ \
        srcV = (vector signed short)vec_mergeh((vector signed char)vzero, \
                                               (vector signed char)srcO); \
        dstV = (vector signed short)vec_mergeh((vector signed char)vzero, \
                                               (vector signed char)dstO); \
        srcW = (vector signed short)vec_mergel((vector signed char)vzero, \
                                               (vector signed char)srcO); \
        dstW = (vector signed short)vec_mergel((vector signed char)vzero, \
                                               (vector signed char)dstO); \
        /* subtractions inside the first butterfly */ \
        but0  = vec_sub(srcV, dstV); \
        but0S = vec_sub(srcW, dstW); \
        op1   = vec_perm(but0, but0, perm1); \
        but1  = vec_mladd(but0, vprod1, op1); \
        op1S  = vec_perm(but0S, but0S, perm1); \
        but1S = vec_mladd(but0S, vprod1, op1S); \
        op2   = vec_perm(but1, but1, perm2); \
        but2  = vec_mladd(but1, vprod2, op2); \
        op2S  = vec_perm(but1S, but1S, perm2); \
        but2S = vec_mladd(but1S, vprod2, op2S); \
        op3   = vec_perm(but2, but2, perm3); \
        res1  = vec_mladd(but2, vprod3, op3); \
        op3S  = vec_perm(but2S, but2S, perm3); \
        res2  = vec_mladd(but2S, vprod3, op3S); \
    }
        ONEITERBUTTERFLY(0, temp0, temp0S);
        ONEITERBUTTERFLY(1, temp1, temp1S);
        ONEITERBUTTERFLY(2, temp2, temp2S);
        ONEITERBUTTERFLY(3, temp3, temp3S);
        ONEITERBUTTERFLY(4, temp4, temp4S);
        ONEITERBUTTERFLY(5, temp5, temp5S);
        ONEITERBUTTERFLY(6, temp6, temp6S);
        ONEITERBUTTERFLY(7, temp7, temp7S);
    }
#undef ONEITERBUTTERFLY
    {
        register vector signed int vsum;
        register vector signed short line0S, line1S, line2S, line3S, line4S,
                                     line5S, line6S, line7S, line0BS, line2BS,
                                     line1BS, line3BS, line4BS, line6BS, line5BS,
                                     line7BS, line0CS, line4CS, line1CS, line5CS,
                                     line2CS, line6CS, line3CS, line7CS;

        register vector signed short line0 = vec_add(temp0, temp1);
        register vector signed short line1 = vec_sub(temp0, temp1);
        register vector signed short line2 = vec_add(temp2, temp3);
        register vector signed short line3 = vec_sub(temp2, temp3);
        register vector signed short line4 = vec_add(temp4, temp5);
        register vector signed short line5 = vec_sub(temp4, temp5);
        register vector signed short line6 = vec_add(temp6, temp7);
        register vector signed short line7 = vec_sub(temp6, temp7);

        register vector signed short line0B = vec_add(line0, line2);
        register vector signed short line2B = vec_sub(line0, line2);
        register vector signed short line1B = vec_add(line1, line3);
        register vector signed short line3B = vec_sub(line1, line3);
        register vector signed short line4B = vec_add(line4, line6);
        register vector signed short line6B = vec_sub(line4, line6);
        register vector signed short line5B = vec_add(line5, line7);
        register vector signed short line7B = vec_sub(line5, line7);

        register vector signed short line0C = vec_add(line0B, line4B);
        register vector signed short line4C = vec_sub(line0B, line4B);
        register vector signed short line1C = vec_add(line1B, line5B);
        register vector signed short line5C = vec_sub(line1B, line5B);
        register vector signed short line2C = vec_add(line2B, line6B);
        register vector signed short line6C = vec_sub(line2B, line6B);
        register vector signed short line3C = vec_add(line3B, line7B);
        register vector signed short line7C = vec_sub(line3B, line7B);

        vsum = vec_sum4s(vec_abs(line0C), vec_splat_s32(0));
        vsum = vec_sum4s(vec_abs(line1C), vsum);
        vsum = vec_sum4s(vec_abs(line2C), vsum);
        vsum = vec_sum4s(vec_abs(line3C), vsum);
        vsum = vec_sum4s(vec_abs(line4C), vsum);
        vsum = vec_sum4s(vec_abs(line5C), vsum);
        vsum = vec_sum4s(vec_abs(line6C), vsum);
        vsum = vec_sum4s(vec_abs(line7C), vsum);

        line0S = vec_add(temp0S, temp1S);
        line1S = vec_sub(temp0S, temp1S);
        line2S = vec_add(temp2S, temp3S);
        line3S = vec_sub(temp2S, temp3S);
        line4S = vec_add(temp4S, temp5S);
        line5S = vec_sub(temp4S, temp5S);
        line6S = vec_add(temp6S, temp7S);
        line7S = vec_sub(temp6S, temp7S);

        line0BS = vec_add(line0S, line2S);
        line2BS = vec_sub(line0S, line2S);
        line1BS = vec_add(line1S, line3S);
        line3BS = vec_sub(line1S, line3S);
        line4BS = vec_add(line4S, line6S);
        line6BS = vec_sub(line4S, line6S);
        line5BS = vec_add(line5S, line7S);
        line7BS = vec_sub(line5S, line7S);

        line0CS = vec_add(line0BS, line4BS);
        line4CS = vec_sub(line0BS, line4BS);
        line1CS = vec_add(line1BS, line5BS);
        line5CS = vec_sub(line1BS, line5BS);
        line2CS = vec_add(line2BS, line6BS);
        line6CS = vec_sub(line2BS, line6BS);
        line3CS = vec_add(line3BS, line7BS);
        line7CS = vec_sub(line3BS, line7BS);

        vsum = vec_sum4s(vec_abs(line0CS), vsum);
        vsum = vec_sum4s(vec_abs(line1CS), vsum);
        vsum = vec_sum4s(vec_abs(line2CS), vsum);
        vsum = vec_sum4s(vec_abs(line3CS), vsum);
        vsum = vec_sum4s(vec_abs(line4CS), vsum);
        vsum = vec_sum4s(vec_abs(line5CS), vsum);
        vsum = vec_sum4s(vec_abs(line6CS), vsum);
        vsum = vec_sum4s(vec_abs(line7CS), vsum);
        vsum = vec_sums(vsum, (vector signed int)vzero);
        vsum = vec_splat(vsum, 3);
        vec_ste(vsum, 0, &sum);
    }
    return sum;
}

static int hadamard8_diff16_altivec(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h)
{
    int score;
    score = hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
    if (h == 16) {
        dst += 8 * stride;
        src += 8 * stride;
        score += hadamard8_diff16x8_altivec(s, dst, src, stride, 8);
    }
    return score;
}
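
/* Vorbis magnitude/angle inverse coupling, four floats per iteration; the
 * sign tests of the scalar version are done branch-free with vec_cmple
 * masks. */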
static void vorbis_inverse_coupling_altivec(float *mag, float *ang,
                                            int blocksize)
{
    int i;
    vector float m, a;
    vector bool int t0, t1;
    const vector unsigned int v_31 = //XXX
        vec_add(vec_add(vec_splat_u32(15), vec_splat_u32(15)), vec_splat_u32(1));
    for (i = 0; i < blocksize; i += 4) {
        m = vec_ld(0, mag+i);
        a = vec_ld(0, ang+i);
        t0 = vec_cmple(m, (vector float)vec_splat_u32(0));
        t1 = vec_cmple(a, (vector float)vec_splat_u32(0));
        a = vec_xor(a, (vector float) vec_sl((vector unsigned int)t0, v_31));
        t0 = (vector bool int)vec_and(a, t1);
        t1 = (vector bool int)vec_andc(a, t1);
        a = vec_sub(m, (vector float)t1);
        m = vec_add(m, (vector float)t0);
        vec_stl(a, 0, ang+i);
        vec_stl(m, 0, mag+i);
    }
}

/* next one assumes that ((line_size % 8) == 0) */
static void avg_pixels8_xy2_altivec(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    register int i;
    register vector unsigned char pixelsv1, pixelsv2, pixelsavg;
    register vector unsigned char blockv, temp1, temp2, blocktemp;
    register vector unsigned short pixelssum1, pixelssum2, temp3;

    register const vector unsigned char vczero = (const vector unsigned char)
        vec_splat_u8(0);
    register const vector unsigned short vctwo = (const vector unsigned short)
        vec_splat_u16(2);

    temp1 = vec_ld(0, pixels);
    temp2 = vec_ld(16, pixels);
    pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(0, pixels));
    if ((((unsigned long)pixels) & 0x0000000F) == 0x0000000F) {
        pixelsv2 = temp2;
    } else {
        pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(1, pixels));
    }
    pixelsv1 = vec_mergeh(vczero, pixelsv1);
    pixelsv2 = vec_mergeh(vczero, pixelsv2);
    pixelssum1 = vec_add((vector unsigned short)pixelsv1,
                         (vector unsigned short)pixelsv2);
    pixelssum1 = vec_add(pixelssum1, vctwo);

    for (i = 0; i < h ; i++) {
        int rightside = ((unsigned long)block & 0x0000000F);
        blockv = vec_ld(0, block);

        temp1 = vec_ld(line_size, pixels);
        temp2 = vec_ld(line_size + 16, pixels);
        pixelsv1 = vec_perm(temp1, temp2, vec_lvsl(line_size, pixels));
        if (((((unsigned long)pixels) + line_size) & 0x0000000F) == 0x0000000F) {
            pixelsv2 = temp2;
        } else {
            pixelsv2 = vec_perm(temp1, temp2, vec_lvsl(line_size + 1, pixels));
        }

        pixelsv1 = vec_mergeh(vczero, pixelsv1);
        pixelsv2 = vec_mergeh(vczero, pixelsv2);
        pixelssum2 = vec_add((vector unsigned short)pixelsv1,
                             (vector unsigned short)pixelsv2);
        temp3 = vec_add(pixelssum1, pixelssum2);
        temp3 = vec_sra(temp3, vctwo);
        pixelssum1 = vec_add(pixelssum2, vctwo);
        pixelsavg = vec_packsu(temp3, (vector unsigned short) vczero);

        if (rightside) {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(0, 1, s0, s1));
        } else {
            blocktemp = vec_perm(blockv, pixelsavg, vcprm(s0, s1, 2, 3));
        }

        blockv = vec_avg(blocktemp, blockv);
        vec_st(blockv, 0, block);

        block += line_size;
        pixels += line_size;
    }
}
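
/* Install the AltiVec implementations into the DSPContext function-pointer
 * table. The 8-bit-only paths are skipped when decoding high-bit-depth
 * H.264. */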
void dsputil_init_altivec(DSPContext* c, AVCodecContext *avctx)
{
    const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;

    c->pix_abs[0][1] = sad16_x2_altivec;
    c->pix_abs[0][2] = sad16_y2_altivec;
    c->pix_abs[0][3] = sad16_xy2_altivec;
    c->pix_abs[0][0] = sad16_altivec;
    c->pix_abs[1][0] = sad8_altivec;
    c->sad[0] = sad16_altivec;
    c->sad[1] = sad8_altivec;
    c->pix_norm1 = pix_norm1_altivec;
    c->sse[1] = sse8_altivec;
    c->sse[0] = sse16_altivec;
    c->pix_sum = pix_sum_altivec;
    c->diff_pixels = diff_pixels_altivec;
    c->get_pixels = get_pixels_altivec;
    if (!high_bit_depth)
        c->clear_block = clear_block_altivec;
    c->add_bytes = add_bytes_altivec;
    if (!high_bit_depth) {
        c->put_pixels_tab[0][0] = put_pixels16_altivec;
        /* the two functions do the same thing, so use the same code */
        c->put_no_rnd_pixels_tab[0][0] = put_pixels16_altivec;
        c->avg_pixels_tab[0][0] = avg_pixels16_altivec;
        c->avg_pixels_tab[1][0] = avg_pixels8_altivec;
        c->avg_pixels_tab[1][3] = avg_pixels8_xy2_altivec;
        c->put_pixels_tab[1][3] = put_pixels8_xy2_altivec;
        c->put_no_rnd_pixels_tab[1][3] = put_no_rnd_pixels8_xy2_altivec;
        c->put_pixels_tab[0][3] = put_pixels16_xy2_altivec;
        c->put_no_rnd_pixels_tab[0][3] = put_no_rnd_pixels16_xy2_altivec;
    }

    c->hadamard8_diff[0] = hadamard8_diff16_altivec;
    c->hadamard8_diff[1] = hadamard8_diff8x8_altivec;
    if (CONFIG_VORBIS_DECODER)
        c->vorbis_inverse_coupling = vorbis_inverse_coupling_altivec;
}