/* Libav 0.7.1 */
00001 /* 00002 * Originally provided by Intel at Application Note AP-922. 00003 * 00004 * Column code adapted from Peter Gubanov. 00005 * Copyright (c) 2000-2001 Peter Gubanov <peter@elecard.net.ru> 00006 * http://www.elecard.com/peter/idct.shtml 00007 * rounding trick copyright (c) 2000 Michel Lespinasse <walken@zoy.org> 00008 * 00009 * MMI port and (c) 2002 by Leon van Stuivenberg 00010 * 00011 * This file is part of Libav. 00012 * 00013 * Libav is free software; you can redistribute it and/or 00014 * modify it under the terms of the GNU Lesser General Public 00015 * License as published by the Free Software Foundation; either 00016 * version 2.1 of the License, or (at your option) any later version. 00017 * 00018 * Libav is distributed in the hope that it will be useful, 00019 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00020 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00021 * Lesser General Public License for more details. 00022 * 00023 * You should have received a copy of the GNU Lesser General Public 00024 * License along with Libav; if not, write to the Free Software 00025 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 00026 */ 00027 00028 #include "libavutil/common.h" 00029 #include "libavcodec/dsputil.h" 00030 #include "mmi.h" 00031 00032 #define BITS_INV_ACC 5 // 4 or 5 for IEEE 00033 #define SHIFT_INV_ROW (16 - BITS_INV_ACC) 00034 #define SHIFT_INV_COL (1 + BITS_INV_ACC) 00035 00036 #define TG1 6518 00037 #define TG2 13573 00038 #define TG3 21895 00039 #define CS4 23170 00040 00041 #define ROUNDER_0 0 00042 #define ROUNDER_1 16 00043 00044 #define TAB_i_04 (32+0) 00045 #define TAB_i_17 (32+64) 00046 #define TAB_i_26 (32+128) 00047 #define TAB_i_35 (32+192) 00048 00049 #define TG_1_16 (32+256+0) 00050 #define TG_2_16 (32+256+16) 00051 #define TG_3_16 (32+256+32) 00052 #define COS_4_16 (32+256+48) 00053 00054 #define CLIPMAX (32+256+64+0) 00055 00056 static short consttable[] 
align16 = { 00057 /* rounder 0*/ // assume SHIFT_INV_ROW == 11 00058 0x3ff, 1, 0x3ff, 1, 0x3ff, 1, 0x3ff, 1, 00059 /* rounder 1*/ 00060 0x3ff, 0, 0x3ff, 0, 0x3ff, 0, 0x3ff, 0, 00061 /* row 0/4*/ 00062 16384, 21407, -16384, -21407, 22725, 19266, -22725, -12873, 00063 8867, 16384, 8867, 16384, 4520, 12873, -4520, 19266, 00064 16384, -8867, 16384, -8867, 12873, -22725, 19266, -22725, 00065 21407, -16384, -21407, 16384, 19266, 4520, -12873, 4520, 00066 /* row 1/7*/ 00067 22725, 29692, -22725, -29692, 31521, 26722, -31521, -17855, 00068 12299, 22725, 12299, 22725, 6270, 17855, -6270, 26722, 00069 22725, -12299, 22725, -12299, 17855, -31521, 26722, -31521, 00070 29692, -22725, -29692, 22725, 26722, 6270, -17855, 6270, 00071 /* row 2/6*/ 00072 21407, 27969, -21407, -27969, 29692, 25172, -29692, -16819, 00073 11585, 21407, 11585, 21407, 5906, 16819, -5906, 25172, 00074 21407, -11585, 21407, -11585, 16819, -29692, 25172, -29692, 00075 27969, -21407, -27969, 21407, 25172, 5906, -16819, 5906, 00076 /*row 3/5*/ 00077 19266, 25172, -19266, -25172, 26722, 22654, -26722, -15137, 00078 10426, 19266, 10426, 19266, 5315, 15137, -5315, 22654, 00079 19266, -10426, 19266, -10426, 15137, -26722, 22654, -26722, 00080 25172, -19266, -25172, 19266, 22654, 5315, -15137, 5315, 00081 /*column constants*/ 00082 TG1, TG1, TG1, TG1, TG1, TG1, TG1, TG1, 00083 TG2, TG2, TG2, TG2, TG2, TG2, TG2, TG2, 00084 TG3, TG3, TG3, TG3, TG3, TG3, TG3, TG3, 00085 CS4, CS4, CS4, CS4, CS4, CS4, CS4, CS4, 00086 /* clamp */ 00087 255, 255, 255, 255, 255, 255, 255, 255 00088 }; 00089 00090 00091 #define DCT_8_INV_ROW1(blk, rowoff, taboff, rnd, outreg) { \ 00092 lq(blk, rowoff, $16); /* r16 = x7 x5 x3 x1 x6 x4 x2 x0 */ \ 00093 /*slot*/ \ 00094 lq($24, 0+taboff, $17); /* r17 = w */ \ 00095 /*delay slot $16*/ \ 00096 lq($24, 16+taboff, $18);/* r18 = w */ \ 00097 prevh($16, $2); /* r2 = x1 x3 x5 x7 x0 x2 x4 x6 */ \ 00098 lq($24, 32+taboff, $19);/* r19 = w */ \ 00099 phmadh($17, $16, $17); /* r17 = b1"b0'a1"a0' */ \ 
00100 lq($24, 48+taboff, $20);/* r20 = w */ \ 00101 phmadh($18, $2, $18); /* r18 = b1'b0"a1'a0" */ \ 00102 phmadh($19, $16, $19); /* r19 = b3"b2'a3"a2' */ \ 00103 phmadh($20, $2, $20); /* r20 = b3'b2"a3'a2" */ \ 00104 paddw($17, $18, $17); /* r17 = (b1)(b0)(a1)(a0) */ \ 00105 paddw($19, $20, $19); /* r19 = (b3)(b2)(a3)(a2) */ \ 00106 pcpyld($19, $17, $18); /* r18 = (a3)(a2)(a1)(a0) */ \ 00107 pcpyud($17, $19, $20); /* r20 = (b3)(b2)(b1)(b0) */ \ 00108 paddw($18, rnd, $18); /* r18 = (a3)(a2)(a1)(a0) */\ 00109 paddw($18, $20, $17); /* r17 = ()()()(a0+b0) */ \ 00110 psubw($18, $20, $20); /* r20 = ()()()(a0-b0) */ \ 00111 psraw($17, SHIFT_INV_ROW, $17); /* r17 = (y3 y2 y1 y0) */ \ 00112 psraw($20, SHIFT_INV_ROW, $20); /* r20 = (y4 y5 y6 y7) */ \ 00113 ppach($20, $17, outreg);/* out = y4 y5 y6 y7 y3 y2 y1 y0 Note order */ \ 00114 \ 00115 prevh(outreg, $2); \ 00116 pcpyud($2, $2, $2); \ 00117 pcpyld($2, outreg, outreg); \ 00118 } 00119 00120 00121 #define DCT_8_INV_COL8() \ 00122 \ 00123 lq($24, TG_3_16, $2); /* r2 = tn3 */ \ 00124 \ 00125 pmulth($11, $2, $17); /* r17 = x3 * tn3 (6420) */ \ 00126 psraw($17, 15, $17); \ 00127 pmfhl_uw($3); /* r3 = 7531 */ \ 00128 psraw($3, 15, $3); \ 00129 pinteh($3, $17, $17); /* r17 = x3 * tn3 */ \ 00130 psubh($17, $13, $17); /* r17 = tm35 */ \ 00131 \ 00132 pmulth($13, $2, $18); /* r18 = x5 * tn3 (6420) */ \ 00133 psraw($18, 15, $18); \ 00134 pmfhl_uw($3); /* r3 = 7531 */ \ 00135 psraw($3, 15, $3); \ 00136 pinteh($3, $18, $18); /* r18 = x5 * tn3 */ \ 00137 paddh($18, $11, $18); /* r18 = tp35 */ \ 00138 \ 00139 lq($24, TG_1_16, $2); /* r2 = tn1 */ \ 00140 \ 00141 pmulth($15, $2, $19); /* r19 = x7 * tn1 (6420) */ \ 00142 psraw($19, 15, $19); \ 00143 pmfhl_uw($3); /* r3 = 7531 */ \ 00144 psraw($3, 15, $3); \ 00145 pinteh($3, $19, $19); /* r19 = x7 * tn1 */ \ 00146 paddh($19, $9, $19); /* r19 = tp17 */ \ 00147 \ 00148 pmulth($9, $2, $20); /* r20 = x1 * tn1 (6420) */ \ 00149 psraw($20, 15, $20); \ 00150 pmfhl_uw($3); /* r3 = 7531 */ \ 00151 
psraw($3, 15, $3); \ 00152 pinteh($3, $20, $20); /* r20 = x1 * tn1 */ \ 00153 psubh($20, $15, $20); /* r20 = tm17 */ \ 00154 \ 00155 psubh($19, $18, $3); /* r3 = t1 */ \ 00156 paddh($20, $17, $16); /* r16 = t2 */ \ 00157 psubh($20, $17, $23); /* r23 = b3 */ \ 00158 paddh($19, $18, $20); /* r20 = b0 */ \ 00159 \ 00160 lq($24, COS_4_16, $2); /* r2 = cs4 */ \ 00161 \ 00162 paddh($3, $16, $21); /* r21 = t1+t2 */ \ 00163 psubh($3, $16, $22); /* r22 = t1-t2 */ \ 00164 \ 00165 pmulth($21, $2, $21); /* r21 = cs4 * (t1+t2) 6420 */ \ 00166 psraw($21, 15, $21); \ 00167 pmfhl_uw($3); /* r3 = 7531 */ \ 00168 psraw($3, 15, $3); \ 00169 pinteh($3, $21, $21); /* r21 = b1 */ \ 00170 \ 00171 pmulth($22, $2, $22); /* r22 = cs4 * (t1-t2) 6420 */ \ 00172 psraw($22, 15, $22); \ 00173 pmfhl_uw($3); /* r3 = 7531 */ \ 00174 psraw($3, 15, $3); \ 00175 pinteh($3, $22, $22); /* r22 = b2 */ \ 00176 \ 00177 lq($24, TG_2_16, $2); /* r2 = tn2 */ \ 00178 \ 00179 pmulth($10, $2, $17); /* r17 = x2 * tn2 (6420) */ \ 00180 psraw($17, 15, $17); \ 00181 pmfhl_uw($3); /* r3 = 7531 */ \ 00182 psraw($3, 15, $3); \ 00183 pinteh($3, $17, $17); /* r17 = x3 * tn3 */ \ 00184 psubh($17, $14, $17); /* r17 = tm26 */ \ 00185 \ 00186 pmulth($14, $2, $18); /* r18 = x6 * tn2 (6420) */ \ 00187 psraw($18, 15, $18); \ 00188 pmfhl_uw($3); /* r3 = 7531 */ \ 00189 psraw($3, 15, $3); \ 00190 pinteh($3, $18, $18); /* r18 = x6 * tn2 */ \ 00191 paddh($18, $10, $18); /* r18 = tp26 */ \ 00192 \ 00193 paddh($8, $12, $2); /* r2 = tp04 */ \ 00194 psubh($8, $12, $3); /* r3 = tm04 */ \ 00195 \ 00196 paddh($2, $18, $16); /* r16 = a0 */ \ 00197 psubh($2, $18, $19); /* r19 = a3 */ \ 00198 psubh($3, $17, $18); /* r18 = a2 */ \ 00199 paddh($3, $17, $17); /* r17 = a1 */ 00200 00201 00202 #define DCT_8_INV_COL8_STORE(blk) \ 00203 \ 00204 paddh($16, $20, $2); /* y0 a0+b0 */ \ 00205 psubh($16, $20, $16); /* y7 a0-b0 */ \ 00206 psrah($2, SHIFT_INV_COL, $2); \ 00207 psrah($16, SHIFT_INV_COL, $16); \ 00208 sq($2, 0, blk); \ 00209 sq($16, 112, 
blk); \ 00210 \ 00211 paddh($17, $21, $3); /* y1 a1+b1 */ \ 00212 psubh($17, $21, $17); /* y6 a1-b1 */ \ 00213 psrah($3, SHIFT_INV_COL, $3); \ 00214 psrah($17, SHIFT_INV_COL, $17); \ 00215 sq($3, 16, blk); \ 00216 sq($17, 96, blk); \ 00217 \ 00218 paddh($18, $22, $2); /* y2 a2+b2 */ \ 00219 psubh($18, $22, $18); /* y5 a2-b2 */ \ 00220 psrah($2, SHIFT_INV_COL, $2); \ 00221 psrah($18, SHIFT_INV_COL, $18); \ 00222 sq($2, 32, blk); \ 00223 sq($18, 80, blk); \ 00224 \ 00225 paddh($19, $23, $3); /* y3 a3+b3 */ \ 00226 psubh($19, $23, $19); /* y4 a3-b3 */ \ 00227 psrah($3, SHIFT_INV_COL, $3); \ 00228 psrah($19, SHIFT_INV_COL, $19); \ 00229 sq($3, 48, blk); \ 00230 sq($19, 64, blk); 00231 00232 00233 00234 #define DCT_8_INV_COL8_PMS() \ 00235 paddh($16, $20, $2); /* y0 a0+b0 */ \ 00236 psubh($16, $20, $20); /* y7 a0-b0 */ \ 00237 psrah($2, SHIFT_INV_COL, $16); \ 00238 psrah($20, SHIFT_INV_COL, $20); \ 00239 \ 00240 paddh($17, $21, $3); /* y1 a1+b1 */ \ 00241 psubh($17, $21, $21); /* y6 a1-b1 */ \ 00242 psrah($3, SHIFT_INV_COL, $17); \ 00243 psrah($21, SHIFT_INV_COL, $21); \ 00244 \ 00245 paddh($18, $22, $2); /* y2 a2+b2 */ \ 00246 psubh($18, $22, $22); /* y5 a2-b2 */ \ 00247 psrah($2, SHIFT_INV_COL, $18); \ 00248 psrah($22, SHIFT_INV_COL, $22); \ 00249 \ 00250 paddh($19, $23, $3); /* y3 a3+b3 */ \ 00251 psubh($19, $23, $23); /* y4 a3-b3 */ \ 00252 psrah($3, SHIFT_INV_COL, $19); \ 00253 psrah($23, SHIFT_INV_COL, $23); 00254 00255 #define PUT(rs) \ 00256 pminh(rs, $11, $2); \ 00257 pmaxh($2, $0, $2); \ 00258 ppacb($0, $2, $2); \ 00259 sd3(2, 0, 4); \ 00260 __asm__ volatile ("add $4, $5, $4"); 00261 00262 #define DCT_8_INV_COL8_PUT() \ 00263 PUT($16); \ 00264 PUT($17); \ 00265 PUT($18); \ 00266 PUT($19); \ 00267 PUT($23); \ 00268 PUT($22); \ 00269 PUT($21); \ 00270 PUT($20); 00271 00272 #define ADD(rs) \ 00273 ld3(4, 0, 2); \ 00274 pextlb($0, $2, $2); \ 00275 paddh($2, rs, $2); \ 00276 pminh($2, $11, $2); \ 00277 pmaxh($2, $0, $2); \ 00278 ppacb($0, $2, $2); \ 00279 sd3(2, 0, 
4); \ 00280 __asm__ volatile ("add $4, $5, $4"); 00281 00282 /*fixme: schedule*/ 00283 #define DCT_8_INV_COL8_ADD() \ 00284 ADD($16); \ 00285 ADD($17); \ 00286 ADD($18); \ 00287 ADD($19); \ 00288 ADD($23); \ 00289 ADD($22); \ 00290 ADD($21); \ 00291 ADD($20); 00292 00293 00294 void ff_mmi_idct(int16_t * block) 00295 { 00296 /* $4 = block */ 00297 __asm__ volatile("la $24, %0"::"m"(consttable[0])); 00298 lq($24, ROUNDER_0, $8); 00299 lq($24, ROUNDER_1, $7); 00300 DCT_8_INV_ROW1($4, 0, TAB_i_04, $8, $8); 00301 DCT_8_INV_ROW1($4, 16, TAB_i_17, $7, $9); 00302 DCT_8_INV_ROW1($4, 32, TAB_i_26, $7, $10); 00303 DCT_8_INV_ROW1($4, 48, TAB_i_35, $7, $11); 00304 DCT_8_INV_ROW1($4, 64, TAB_i_04, $7, $12); 00305 DCT_8_INV_ROW1($4, 80, TAB_i_35, $7, $13); 00306 DCT_8_INV_ROW1($4, 96, TAB_i_26, $7, $14); 00307 DCT_8_INV_ROW1($4, 112, TAB_i_17, $7, $15); 00308 DCT_8_INV_COL8(); 00309 DCT_8_INV_COL8_STORE($4); 00310 00311 //let savedtemp regs be saved 00312 __asm__ volatile(" ":::"$16", "$17", "$18", "$19", "$20", "$21", "$22", "$23"); 00313 } 00314 00315 00316 void ff_mmi_idct_put(uint8_t *dest, int line_size, DCTELEM *block) 00317 { 00318 /* $4 = dest, $5 = line_size, $6 = block */ 00319 __asm__ volatile("la $24, %0"::"m"(consttable[0])); 00320 lq($24, ROUNDER_0, $8); 00321 lq($24, ROUNDER_1, $7); 00322 DCT_8_INV_ROW1($6, 0, TAB_i_04, $8, $8); 00323 DCT_8_INV_ROW1($6, 16, TAB_i_17, $7, $9); 00324 DCT_8_INV_ROW1($6, 32, TAB_i_26, $7, $10); 00325 DCT_8_INV_ROW1($6, 48, TAB_i_35, $7, $11); 00326 DCT_8_INV_ROW1($6, 64, TAB_i_04, $7, $12); 00327 DCT_8_INV_ROW1($6, 80, TAB_i_35, $7, $13); 00328 DCT_8_INV_ROW1($6, 96, TAB_i_26, $7, $14); 00329 DCT_8_INV_ROW1($6, 112, TAB_i_17, $7, $15); 00330 DCT_8_INV_COL8(); 00331 lq($24, CLIPMAX, $11); 00332 DCT_8_INV_COL8_PMS(); 00333 DCT_8_INV_COL8_PUT(); 00334 00335 //let savedtemp regs be saved 00336 __asm__ volatile(" ":::"$16", "$17", "$18", "$19", "$20", "$21", "$22", "$23"); 00337 } 00338 00339 00340 void ff_mmi_idct_add(uint8_t *dest, int 
line_size, DCTELEM *block) 00341 { 00342 /* $4 = dest, $5 = line_size, $6 = block */ 00343 __asm__ volatile("la $24, %0"::"m"(consttable[0])); 00344 lq($24, ROUNDER_0, $8); 00345 lq($24, ROUNDER_1, $7); 00346 DCT_8_INV_ROW1($6, 0, TAB_i_04, $8, $8); 00347 DCT_8_INV_ROW1($6, 16, TAB_i_17, $7, $9); 00348 DCT_8_INV_ROW1($6, 32, TAB_i_26, $7, $10); 00349 DCT_8_INV_ROW1($6, 48, TAB_i_35, $7, $11); 00350 DCT_8_INV_ROW1($6, 64, TAB_i_04, $7, $12); 00351 DCT_8_INV_ROW1($6, 80, TAB_i_35, $7, $13); 00352 DCT_8_INV_ROW1($6, 96, TAB_i_26, $7, $14); 00353 DCT_8_INV_ROW1($6, 112, TAB_i_17, $7, $15); 00354 DCT_8_INV_COL8(); 00355 lq($24, CLIPMAX, $11); 00356 DCT_8_INV_COL8_PMS(); 00357 DCT_8_INV_COL8_ADD(); 00358 00359 //let savedtemp regs be saved 00360 __asm__ volatile(" ":::"$16", "$17", "$18", "$19", "$20", "$21", "$22", "$23"); 00361 } 00362