Libav 0.7.1 — generated source listing of the Windows Media Audio Voice decoder (libavcodec/wmavoice.c)
00001 /* 00002 * Windows Media Audio Voice decoder. 00003 * Copyright (c) 2009 Ronald S. Bultje 00004 * 00005 * This file is part of Libav. 00006 * 00007 * Libav is free software; you can redistribute it and/or 00008 * modify it under the terms of the GNU Lesser General Public 00009 * License as published by the Free Software Foundation; either 00010 * version 2.1 of the License, or (at your option) any later version. 00011 * 00012 * Libav is distributed in the hope that it will be useful, 00013 * but WITHOUT ANY WARRANTY; without even the implied warranty of 00014 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 00015 * Lesser General Public License for more details. 00016 * 00017 * You should have received a copy of the GNU Lesser General Public 00018 * License along with Libav; if not, write to the Free Software 00019 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA 00020 */ 00021 00028 #include <math.h> 00029 #include "avcodec.h" 00030 #include "get_bits.h" 00031 #include "put_bits.h" 00032 #include "wmavoice_data.h" 00033 #include "celp_math.h" 00034 #include "celp_filters.h" 00035 #include "acelp_vectors.h" 00036 #include "acelp_filters.h" 00037 #include "lsp.h" 00038 #include "libavutil/lzo.h" 00039 #include "dct.h" 00040 #include "rdft.h" 00041 #include "sinewin.h" 00042 00043 #define MAX_BLOCKS 8 ///< maximum number of blocks per frame 00044 #define MAX_LSPS 16 ///< maximum filter order 00045 #define MAX_LSPS_ALIGN16 16 ///< same as #MAX_LSPS; needs to be multiple 00046 00047 #define MAX_FRAMES 3 ///< maximum number of frames per superframe 00048 #define MAX_FRAMESIZE 160 ///< maximum number of samples per frame 00049 #define MAX_SIGNAL_HISTORY 416 ///< maximum excitation signal history 00050 #define MAX_SFRAMESIZE (MAX_FRAMESIZE * MAX_FRAMES) 00051 00052 #define SFRAME_CACHE_MAXSIZE 256 ///< maximum cache size for frame data that 00053 00054 #define VLC_NBITS 6 ///< number of bits to read per VLC iteration 
00055 00059 static VLC frame_type_vlc; 00060 00064 enum { 00065 ACB_TYPE_NONE = 0, 00066 ACB_TYPE_ASYMMETRIC = 1, 00067 00068 00069 00070 00071 ACB_TYPE_HAMMING = 2 00072 00073 00074 }; 00075 00079 enum { 00080 FCB_TYPE_SILENCE = 0, 00081 00082 00083 FCB_TYPE_HARDCODED = 1, 00084 00085 FCB_TYPE_AW_PULSES = 2, 00086 00087 FCB_TYPE_EXC_PULSES = 3, 00088 00089 00090 }; 00091 00095 static const struct frame_type_desc { 00096 uint8_t n_blocks; 00097 00098 uint8_t log_n_blocks; 00099 uint8_t acb_type; 00100 uint8_t fcb_type; 00101 uint8_t dbl_pulses; 00102 00103 00104 uint16_t frame_size; 00105 00106 } frame_descs[17] = { 00107 { 1, 0, ACB_TYPE_NONE, FCB_TYPE_SILENCE, 0, 0 }, 00108 { 2, 1, ACB_TYPE_NONE, FCB_TYPE_HARDCODED, 0, 28 }, 00109 { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_AW_PULSES, 0, 46 }, 00110 { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2, 80 }, 00111 { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5, 104 }, 00112 { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 0, 108 }, 00113 { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2, 132 }, 00114 { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5, 168 }, 00115 { 2, 1, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 0, 64 }, 00116 { 2, 1, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 2, 80 }, 00117 { 2, 1, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 5, 104 }, 00118 { 4, 2, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 0, 108 }, 00119 { 4, 2, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 2, 132 }, 00120 { 4, 2, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 5, 168 }, 00121 { 8, 3, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 0, 176 }, 00122 { 8, 3, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 2, 208 }, 00123 { 8, 3, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 5, 256 } 00124 }; 00125 00129 typedef struct { 00134 GetBitContext gb; 00135 00136 00137 00138 int8_t vbm_tree[25]; 00139 00140 int spillover_bitsize; 00141 00142 00143 int history_nsamples; 00144 00145 00146 /* postfilter specific values */ 00147 int do_apf; 00148 00149 int denoise_strength; 00150 00151 int 
denoise_tilt_corr; 00152 00153 int dc_level; 00154 00155 00156 int lsps; 00157 int lsp_q_mode; 00158 int lsp_def_mode; 00159 00160 int frame_lsp_bitsize; 00161 00162 int sframe_lsp_bitsize; 00163 00164 00165 int min_pitch_val; 00166 int max_pitch_val; 00167 int pitch_nbits; 00168 00169 int block_pitch_nbits; 00170 00171 int block_pitch_range; 00172 int block_delta_pitch_nbits; 00173 00174 00175 00176 int block_delta_pitch_hrange; 00177 00178 uint16_t block_conv_table[4]; 00179 00180 00190 int spillover_nbits; 00191 00192 00193 00194 int has_residual_lsps; 00195 00196 00197 00198 00199 int skip_bits_next; 00200 00201 00202 00203 uint8_t sframe_cache[SFRAME_CACHE_MAXSIZE + FF_INPUT_BUFFER_PADDING_SIZE]; 00206 int sframe_cache_size; 00207 00208 00209 00210 00211 PutBitContext pb; 00212 00222 double prev_lsps[MAX_LSPS]; 00223 00224 int last_pitch_val; 00225 int last_acb_type; 00226 int pitch_diff_sh16; 00227 00228 float silence_gain; 00229 00230 int aw_idx_is_ext; 00231 00232 int aw_pulse_range; 00233 00234 00235 00236 00237 00238 int aw_n_pulses[2]; 00239 00240 00241 int aw_first_pulse_off[2]; 00242 00243 int aw_next_pulse_off_cache; 00244 00245 00246 00247 00248 00249 int frame_cntr; 00250 00251 float gain_pred_err[6]; 00252 float excitation_history[MAX_SIGNAL_HISTORY]; 00256 float synth_history[MAX_LSPS]; 00257 00266 RDFTContext rdft, irdft; 00267 00268 DCTContext dct, dst; 00269 00270 float sin[511], cos[511]; 00271 00272 float postfilter_agc; 00273 00274 float dcf_mem[2]; 00275 float zero_exc_pf[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE]; 00278 float denoise_filter_cache[MAX_FRAMESIZE]; 00279 int denoise_filter_cache_size; 00280 DECLARE_ALIGNED(32, float, tilted_lpcs_pf)[0x80]; 00282 DECLARE_ALIGNED(32, float, denoise_coeffs_pf)[0x80]; 00284 DECLARE_ALIGNED(32, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16]; 00287 00290 } WMAVoiceContext; 00291 00301 static av_cold int decode_vbmtree(GetBitContext *gb, int8_t vbm_tree[25]) 00302 { 00303 static const uint8_t 
bits[] = { 00304 2, 2, 2, 4, 4, 4, 00305 6, 6, 6, 8, 8, 8, 00306 10, 10, 10, 12, 12, 12, 00307 14, 14, 14, 14 00308 }; 00309 static const uint16_t codes[] = { 00310 0x0000, 0x0001, 0x0002, // 00/01/10 00311 0x000c, 0x000d, 0x000e, // 11+00/01/10 00312 0x003c, 0x003d, 0x003e, // 1111+00/01/10 00313 0x00fc, 0x00fd, 0x00fe, // 111111+00/01/10 00314 0x03fc, 0x03fd, 0x03fe, // 11111111+00/01/10 00315 0x0ffc, 0x0ffd, 0x0ffe, // 1111111111+00/01/10 00316 0x3ffc, 0x3ffd, 0x3ffe, 0x3fff // 111111111111+xx 00317 }; 00318 int cntr[8], n, res; 00319 00320 memset(vbm_tree, 0xff, sizeof(vbm_tree[0]) * 25); 00321 memset(cntr, 0, sizeof(cntr)); 00322 for (n = 0; n < 17; n++) { 00323 res = get_bits(gb, 3); 00324 if (cntr[res] > 3) // should be >= 3 + (res == 7)) 00325 return -1; 00326 vbm_tree[res * 3 + cntr[res]++] = n; 00327 } 00328 INIT_VLC_STATIC(&frame_type_vlc, VLC_NBITS, sizeof(bits), 00329 bits, 1, 1, codes, 2, 2, 132); 00330 return 0; 00331 } 00332 00336 static av_cold int wmavoice_decode_init(AVCodecContext *ctx) 00337 { 00338 int n, flags, pitch_range, lsp16_flag; 00339 WMAVoiceContext *s = ctx->priv_data; 00340 00349 if (ctx->extradata_size != 46) { 00350 av_log(ctx, AV_LOG_ERROR, 00351 "Invalid extradata size %d (should be 46)\n", 00352 ctx->extradata_size); 00353 return -1; 00354 } 00355 flags = AV_RL32(ctx->extradata + 18); 00356 s->spillover_bitsize = 3 + av_ceil_log2(ctx->block_align); 00357 s->do_apf = flags & 0x1; 00358 if (s->do_apf) { 00359 ff_rdft_init(&s->rdft, 7, DFT_R2C); 00360 ff_rdft_init(&s->irdft, 7, IDFT_C2R); 00361 ff_dct_init(&s->dct, 6, DCT_I); 00362 ff_dct_init(&s->dst, 6, DST_I); 00363 00364 ff_sine_window_init(s->cos, 256); 00365 memcpy(&s->sin[255], s->cos, 256 * sizeof(s->cos[0])); 00366 for (n = 0; n < 255; n++) { 00367 s->sin[n] = -s->sin[510 - n]; 00368 s->cos[510 - n] = s->cos[n]; 00369 } 00370 } 00371 s->denoise_strength = (flags >> 2) & 0xF; 00372 if (s->denoise_strength >= 12) { 00373 av_log(ctx, AV_LOG_ERROR, 00374 "Invalid denoise 
filter strength %d (max=11)\n", 00375 s->denoise_strength); 00376 return -1; 00377 } 00378 s->denoise_tilt_corr = !!(flags & 0x40); 00379 s->dc_level = (flags >> 7) & 0xF; 00380 s->lsp_q_mode = !!(flags & 0x2000); 00381 s->lsp_def_mode = !!(flags & 0x4000); 00382 lsp16_flag = flags & 0x1000; 00383 if (lsp16_flag) { 00384 s->lsps = 16; 00385 s->frame_lsp_bitsize = 34; 00386 s->sframe_lsp_bitsize = 60; 00387 } else { 00388 s->lsps = 10; 00389 s->frame_lsp_bitsize = 24; 00390 s->sframe_lsp_bitsize = 48; 00391 } 00392 for (n = 0; n < s->lsps; n++) 00393 s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0); 00394 00395 init_get_bits(&s->gb, ctx->extradata + 22, (ctx->extradata_size - 22) << 3); 00396 if (decode_vbmtree(&s->gb, s->vbm_tree) < 0) { 00397 av_log(ctx, AV_LOG_ERROR, "Invalid VBM tree; broken extradata?\n"); 00398 return -1; 00399 } 00400 00401 s->min_pitch_val = ((ctx->sample_rate << 8) / 400 + 50) >> 8; 00402 s->max_pitch_val = ((ctx->sample_rate << 8) * 37 / 2000 + 50) >> 8; 00403 pitch_range = s->max_pitch_val - s->min_pitch_val; 00404 if (pitch_range <= 0) { 00405 av_log(ctx, AV_LOG_ERROR, "Invalid pitch range; broken extradata?\n"); 00406 return -1; 00407 } 00408 s->pitch_nbits = av_ceil_log2(pitch_range); 00409 s->last_pitch_val = 40; 00410 s->last_acb_type = ACB_TYPE_NONE; 00411 s->history_nsamples = s->max_pitch_val + 8; 00412 00413 if (s->min_pitch_val < 1 || s->history_nsamples > MAX_SIGNAL_HISTORY) { 00414 int min_sr = ((((1 << 8) - 50) * 400) + 0xFF) >> 8, 00415 max_sr = ((((MAX_SIGNAL_HISTORY - 8) << 8) + 205) * 2000 / 37) >> 8; 00416 00417 av_log(ctx, AV_LOG_ERROR, 00418 "Unsupported samplerate %d (min=%d, max=%d)\n", 00419 ctx->sample_rate, min_sr, max_sr); // 322-22097 Hz 00420 00421 return -1; 00422 } 00423 00424 s->block_conv_table[0] = s->min_pitch_val; 00425 s->block_conv_table[1] = (pitch_range * 25) >> 6; 00426 s->block_conv_table[2] = (pitch_range * 44) >> 6; 00427 s->block_conv_table[3] = s->max_pitch_val - 1; 00428 
s->block_delta_pitch_hrange = (pitch_range >> 3) & ~0xF; 00429 if (s->block_delta_pitch_hrange <= 0) { 00430 av_log(ctx, AV_LOG_ERROR, "Invalid delta pitch hrange; broken extradata?\n"); 00431 return -1; 00432 } 00433 s->block_delta_pitch_nbits = 1 + av_ceil_log2(s->block_delta_pitch_hrange); 00434 s->block_pitch_range = s->block_conv_table[2] + 00435 s->block_conv_table[3] + 1 + 00436 2 * (s->block_conv_table[1] - 2 * s->min_pitch_val); 00437 s->block_pitch_nbits = av_ceil_log2(s->block_pitch_range); 00438 00439 ctx->sample_fmt = AV_SAMPLE_FMT_FLT; 00440 00441 return 0; 00442 } 00443 00465 static void adaptive_gain_control(float *out, const float *in, 00466 const float *speech_synth, 00467 int size, float alpha, float *gain_mem) 00468 { 00469 int i; 00470 float speech_energy = 0.0, postfilter_energy = 0.0, gain_scale_factor; 00471 float mem = *gain_mem; 00472 00473 for (i = 0; i < size; i++) { 00474 speech_energy += fabsf(speech_synth[i]); 00475 postfilter_energy += fabsf(in[i]); 00476 } 00477 gain_scale_factor = (1.0 - alpha) * speech_energy / postfilter_energy; 00478 00479 for (i = 0; i < size; i++) { 00480 mem = alpha * mem + gain_scale_factor; 00481 out[i] = in[i] * mem; 00482 } 00483 00484 *gain_mem = mem; 00485 } 00486 00505 static int kalman_smoothen(WMAVoiceContext *s, int pitch, 00506 const float *in, float *out, int size) 00507 { 00508 int n; 00509 float optimal_gain = 0, dot; 00510 const float *ptr = &in[-FFMAX(s->min_pitch_val, pitch - 3)], 00511 *end = &in[-FFMIN(s->max_pitch_val, pitch + 3)], 00512 *best_hist_ptr; 00513 00514 /* find best fitting point in history */ 00515 do { 00516 dot = ff_dot_productf(in, ptr, size); 00517 if (dot > optimal_gain) { 00518 optimal_gain = dot; 00519 best_hist_ptr = ptr; 00520 } 00521 } while (--ptr >= end); 00522 00523 if (optimal_gain <= 0) 00524 return -1; 00525 dot = ff_dot_productf(best_hist_ptr, best_hist_ptr, size); 00526 if (dot <= 0) // would be 1.0 00527 return -1; 00528 00529 if (optimal_gain <= dot) { 
00530 dot = dot / (dot + 0.6 * optimal_gain); // 0.625-1.000 00531 } else 00532 dot = 0.625; 00533 00534 /* actual smoothing */ 00535 for (n = 0; n < size; n++) 00536 out[n] = best_hist_ptr[n] + dot * (in[n] - best_hist_ptr[n]); 00537 00538 return 0; 00539 } 00540 00551 static float tilt_factor(const float *lpcs, int n_lpcs) 00552 { 00553 float rh0, rh1; 00554 00555 rh0 = 1.0 + ff_dot_productf(lpcs, lpcs, n_lpcs); 00556 rh1 = lpcs[0] + ff_dot_productf(lpcs, &lpcs[1], n_lpcs - 1); 00557 00558 return rh1 / rh0; 00559 } 00560 00564 static void calc_input_response(WMAVoiceContext *s, float *lpcs, 00565 int fcb_type, float *coeffs, int remainder) 00566 { 00567 float last_coeff, min = 15.0, max = -15.0; 00568 float irange, angle_mul, gain_mul, range, sq; 00569 int n, idx; 00570 00571 /* Create frequency power spectrum of speech input (i.e. RDFT of LPCs) */ 00572 s->rdft.rdft_calc(&s->rdft, lpcs); 00573 #define log_range(var, assign) do { \ 00574 float tmp = log10f(assign); var = tmp; \ 00575 max = FFMAX(max, tmp); min = FFMIN(min, tmp); \ 00576 } while (0) 00577 log_range(last_coeff, lpcs[1] * lpcs[1]); 00578 for (n = 1; n < 64; n++) 00579 log_range(lpcs[n], lpcs[n * 2] * lpcs[n * 2] + 00580 lpcs[n * 2 + 1] * lpcs[n * 2 + 1]); 00581 log_range(lpcs[0], lpcs[0] * lpcs[0]); 00582 #undef log_range 00583 range = max - min; 00584 lpcs[64] = last_coeff; 00585 00586 /* Now, use this spectrum to pick out these frequencies with higher 00587 * (relative) power/energy (which we then take to be "not noise"), 00588 * and set up a table (still in lpc[]) of (relative) gains per frequency. 00589 * These frequencies will be maintained, while others ("noise") will be 00590 * decreased in the filter output. */ 00591 irange = 64.0 / range; // so irange*(max-value) is in the range [0, 63] 00592 gain_mul = range * (fcb_type == FCB_TYPE_HARDCODED ? 
(5.0 / 13.0) : 00593 (5.0 / 14.7)); 00594 angle_mul = gain_mul * (8.0 * M_LN10 / M_PI); 00595 for (n = 0; n <= 64; n++) { 00596 float pwr; 00597 00598 idx = FFMAX(0, lrint((max - lpcs[n]) * irange) - 1); 00599 pwr = wmavoice_denoise_power_table[s->denoise_strength][idx]; 00600 lpcs[n] = angle_mul * pwr; 00601 00602 /* 70.57 =~ 1/log10(1.0331663) */ 00603 idx = (pwr * gain_mul - 0.0295) * 70.570526123; 00604 if (idx > 127) { // fallback if index falls outside table range 00605 coeffs[n] = wmavoice_energy_table[127] * 00606 powf(1.0331663, idx - 127); 00607 } else 00608 coeffs[n] = wmavoice_energy_table[FFMAX(0, idx)]; 00609 } 00610 00611 /* calculate the Hilbert transform of the gains, which we do (since this 00612 * is a sinus input) by doing a phase shift (in theory, H(sin())=cos()). 00613 * Hilbert_Transform(RDFT(x)) = Laplace_Transform(x), which calculates the 00614 * "moment" of the LPCs in this filter. */ 00615 s->dct.dct_calc(&s->dct, lpcs); 00616 s->dst.dct_calc(&s->dst, lpcs); 00617 00618 /* Split out the coefficient indexes into phase/magnitude pairs */ 00619 idx = 255 + av_clip(lpcs[64], -255, 255); 00620 coeffs[0] = coeffs[0] * s->cos[idx]; 00621 idx = 255 + av_clip(lpcs[64] - 2 * lpcs[63], -255, 255); 00622 last_coeff = coeffs[64] * s->cos[idx]; 00623 for (n = 63;; n--) { 00624 idx = 255 + av_clip(-lpcs[64] - 2 * lpcs[n - 1], -255, 255); 00625 coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx]; 00626 coeffs[n * 2] = coeffs[n] * s->cos[idx]; 00627 00628 if (!--n) break; 00629 00630 idx = 255 + av_clip( lpcs[64] - 2 * lpcs[n - 1], -255, 255); 00631 coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx]; 00632 coeffs[n * 2] = coeffs[n] * s->cos[idx]; 00633 } 00634 coeffs[1] = last_coeff; 00635 00636 /* move into real domain */ 00637 s->irdft.rdft_calc(&s->irdft, coeffs); 00638 00639 /* tilt correction and normalize scale */ 00640 memset(&coeffs[remainder], 0, sizeof(coeffs[0]) * (128 - remainder)); 00641 if (s->denoise_tilt_corr) { 00642 float tilt_mem = 0; 00643 00644 
coeffs[remainder - 1] = 0; 00645 ff_tilt_compensation(&tilt_mem, 00646 -1.8 * tilt_factor(coeffs, remainder - 1), 00647 coeffs, remainder); 00648 } 00649 sq = (1.0 / 64.0) * sqrtf(1 / ff_dot_productf(coeffs, coeffs, remainder)); 00650 for (n = 0; n < remainder; n++) 00651 coeffs[n] *= sq; 00652 } 00653 00680 static void wiener_denoise(WMAVoiceContext *s, int fcb_type, 00681 float *synth_pf, int size, 00682 const float *lpcs) 00683 { 00684 int remainder, lim, n; 00685 00686 if (fcb_type != FCB_TYPE_SILENCE) { 00687 float *tilted_lpcs = s->tilted_lpcs_pf, 00688 *coeffs = s->denoise_coeffs_pf, tilt_mem = 0; 00689 00690 tilted_lpcs[0] = 1.0; 00691 memcpy(&tilted_lpcs[1], lpcs, sizeof(lpcs[0]) * s->lsps); 00692 memset(&tilted_lpcs[s->lsps + 1], 0, 00693 sizeof(tilted_lpcs[0]) * (128 - s->lsps - 1)); 00694 ff_tilt_compensation(&tilt_mem, 0.7 * tilt_factor(lpcs, s->lsps), 00695 tilted_lpcs, s->lsps + 2); 00696 00697 /* The IRDFT output (127 samples for 7-bit filter) beyond the frame 00698 * size is applied to the next frame. All input beyond this is zero, 00699 * and thus all output beyond this will go towards zero, hence we can 00700 * limit to min(size-1, 127-size) as a performance consideration. */ 00701 remainder = FFMIN(127 - size, size - 1); 00702 calc_input_response(s, tilted_lpcs, fcb_type, coeffs, remainder); 00703 00704 /* apply coefficients (in frequency spectrum domain), i.e. 
complex 00705 * number multiplication */ 00706 memset(&synth_pf[size], 0, sizeof(synth_pf[0]) * (128 - size)); 00707 s->rdft.rdft_calc(&s->rdft, synth_pf); 00708 s->rdft.rdft_calc(&s->rdft, coeffs); 00709 synth_pf[0] *= coeffs[0]; 00710 synth_pf[1] *= coeffs[1]; 00711 for (n = 1; n < 64; n++) { 00712 float v1 = synth_pf[n * 2], v2 = synth_pf[n * 2 + 1]; 00713 synth_pf[n * 2] = v1 * coeffs[n * 2] - v2 * coeffs[n * 2 + 1]; 00714 synth_pf[n * 2 + 1] = v2 * coeffs[n * 2] + v1 * coeffs[n * 2 + 1]; 00715 } 00716 s->irdft.rdft_calc(&s->irdft, synth_pf); 00717 } 00718 00719 /* merge filter output with the history of previous runs */ 00720 if (s->denoise_filter_cache_size) { 00721 lim = FFMIN(s->denoise_filter_cache_size, size); 00722 for (n = 0; n < lim; n++) 00723 synth_pf[n] += s->denoise_filter_cache[n]; 00724 s->denoise_filter_cache_size -= lim; 00725 memmove(s->denoise_filter_cache, &s->denoise_filter_cache[size], 00726 sizeof(s->denoise_filter_cache[0]) * s->denoise_filter_cache_size); 00727 } 00728 00729 /* move remainder of filter output into a cache for future runs */ 00730 if (fcb_type != FCB_TYPE_SILENCE) { 00731 lim = FFMIN(remainder, s->denoise_filter_cache_size); 00732 for (n = 0; n < lim; n++) 00733 s->denoise_filter_cache[n] += synth_pf[size + n]; 00734 if (lim < remainder) { 00735 memcpy(&s->denoise_filter_cache[lim], &synth_pf[size + lim], 00736 sizeof(s->denoise_filter_cache[0]) * (remainder - lim)); 00737 s->denoise_filter_cache_size = remainder; 00738 } 00739 } 00740 } 00741 00762 static void postfilter(WMAVoiceContext *s, const float *synth, 00763 float *samples, int size, 00764 const float *lpcs, float *zero_exc_pf, 00765 int fcb_type, int pitch) 00766 { 00767 float synth_filter_in_buf[MAX_FRAMESIZE / 2], 00768 *synth_pf = &s->synth_filter_out_buf[MAX_LSPS_ALIGN16], 00769 *synth_filter_in = zero_exc_pf; 00770 00771 assert(size <= MAX_FRAMESIZE / 2); 00772 00773 /* generate excitation from input signal */ 00774 
ff_celp_lp_zero_synthesis_filterf(zero_exc_pf, lpcs, synth, size, s->lsps); 00775 00776 if (fcb_type >= FCB_TYPE_AW_PULSES && 00777 !kalman_smoothen(s, pitch, zero_exc_pf, synth_filter_in_buf, size)) 00778 synth_filter_in = synth_filter_in_buf; 00779 00780 /* re-synthesize speech after smoothening, and keep history */ 00781 ff_celp_lp_synthesis_filterf(synth_pf, lpcs, 00782 synth_filter_in, size, s->lsps); 00783 memcpy(&synth_pf[-s->lsps], &synth_pf[size - s->lsps], 00784 sizeof(synth_pf[0]) * s->lsps); 00785 00786 wiener_denoise(s, fcb_type, synth_pf, size, lpcs); 00787 00788 adaptive_gain_control(samples, synth_pf, synth, size, 0.99, 00789 &s->postfilter_agc); 00790 00791 if (s->dc_level > 8) { 00792 /* remove ultra-low frequency DC noise / highpass filter; 00793 * coefficients are identical to those used in SIPR decoding, 00794 * and very closely resemble those used in AMR-NB decoding. */ 00795 ff_acelp_apply_order_2_transfer_function(samples, samples, 00796 (const float[2]) { -1.99997, 1.0 }, 00797 (const float[2]) { -1.9330735188, 0.93589198496 }, 00798 0.93980580475, s->dcf_mem, size); 00799 } 00800 } 00816 static void dequant_lsps(double *lsps, int num, 00817 const uint16_t *values, 00818 const uint16_t *sizes, 00819 int n_stages, const uint8_t *table, 00820 const double *mul_q, 00821 const double *base_q) 00822 { 00823 int n, m; 00824 00825 memset(lsps, 0, num * sizeof(*lsps)); 00826 for (n = 0; n < n_stages; n++) { 00827 const uint8_t *t_off = &table[values[n] * num]; 00828 double base = base_q[n], mul = mul_q[n]; 00829 00830 for (m = 0; m < num; m++) 00831 lsps[m] += base + mul * t_off[m]; 00832 00833 table += sizes[n] * num; 00834 } 00835 } 00836 00848 static void dequant_lsp10i(GetBitContext *gb, double *lsps) 00849 { 00850 static const uint16_t vec_sizes[4] = { 256, 64, 32, 32 }; 00851 static const double mul_lsf[4] = { 00852 5.2187144800e-3, 1.4626986422e-3, 00853 9.6179549166e-4, 1.1325736225e-3 00854 }; 00855 static const double base_lsf[4] = { 
00856 M_PI * -2.15522e-1, M_PI * -6.1646e-2, 00857 M_PI * -3.3486e-2, M_PI * -5.7408e-2 00858 }; 00859 uint16_t v[4]; 00860 00861 v[0] = get_bits(gb, 8); 00862 v[1] = get_bits(gb, 6); 00863 v[2] = get_bits(gb, 5); 00864 v[3] = get_bits(gb, 5); 00865 00866 dequant_lsps(lsps, 10, v, vec_sizes, 4, wmavoice_dq_lsp10i, 00867 mul_lsf, base_lsf); 00868 } 00869 00874 static void dequant_lsp10r(GetBitContext *gb, 00875 double *i_lsps, const double *old, 00876 double *a1, double *a2, int q_mode) 00877 { 00878 static const uint16_t vec_sizes[3] = { 128, 64, 64 }; 00879 static const double mul_lsf[3] = { 00880 2.5807601174e-3, 1.2354460219e-3, 1.1763821673e-3 00881 }; 00882 static const double base_lsf[3] = { 00883 M_PI * -1.07448e-1, M_PI * -5.2706e-2, M_PI * -5.1634e-2 00884 }; 00885 const float (*ipol_tab)[2][10] = q_mode ? 00886 wmavoice_lsp10_intercoeff_b : wmavoice_lsp10_intercoeff_a; 00887 uint16_t interpol, v[3]; 00888 int n; 00889 00890 dequant_lsp10i(gb, i_lsps); 00891 00892 interpol = get_bits(gb, 5); 00893 v[0] = get_bits(gb, 7); 00894 v[1] = get_bits(gb, 6); 00895 v[2] = get_bits(gb, 6); 00896 00897 for (n = 0; n < 10; n++) { 00898 double delta = old[n] - i_lsps[n]; 00899 a1[n] = ipol_tab[interpol][0][n] * delta + i_lsps[n]; 00900 a1[10 + n] = ipol_tab[interpol][1][n] * delta + i_lsps[n]; 00901 } 00902 00903 dequant_lsps(a2, 20, v, vec_sizes, 3, wmavoice_dq_lsp10r, 00904 mul_lsf, base_lsf); 00905 } 00906 00910 static void dequant_lsp16i(GetBitContext *gb, double *lsps) 00911 { 00912 static const uint16_t vec_sizes[5] = { 256, 64, 128, 64, 128 }; 00913 static const double mul_lsf[5] = { 00914 3.3439586280e-3, 6.9908173703e-4, 00915 3.3216608306e-3, 1.0334960326e-3, 00916 3.1899104283e-3 00917 }; 00918 static const double base_lsf[5] = { 00919 M_PI * -1.27576e-1, M_PI * -2.4292e-2, 00920 M_PI * -1.28094e-1, M_PI * -3.2128e-2, 00921 M_PI * -1.29816e-1 00922 }; 00923 uint16_t v[5]; 00924 00925 v[0] = get_bits(gb, 8); 00926 v[1] = get_bits(gb, 6); 00927 v[2] = 
get_bits(gb, 7); 00928 v[3] = get_bits(gb, 6); 00929 v[4] = get_bits(gb, 7); 00930 00931 dequant_lsps( lsps, 5, v, vec_sizes, 2, 00932 wmavoice_dq_lsp16i1, mul_lsf, base_lsf); 00933 dequant_lsps(&lsps[5], 5, &v[2], &vec_sizes[2], 2, 00934 wmavoice_dq_lsp16i2, &mul_lsf[2], &base_lsf[2]); 00935 dequant_lsps(&lsps[10], 6, &v[4], &vec_sizes[4], 1, 00936 wmavoice_dq_lsp16i3, &mul_lsf[4], &base_lsf[4]); 00937 } 00938 00943 static void dequant_lsp16r(GetBitContext *gb, 00944 double *i_lsps, const double *old, 00945 double *a1, double *a2, int q_mode) 00946 { 00947 static const uint16_t vec_sizes[3] = { 128, 128, 128 }; 00948 static const double mul_lsf[3] = { 00949 1.2232979501e-3, 1.4062241527e-3, 1.6114744851e-3 00950 }; 00951 static const double base_lsf[3] = { 00952 M_PI * -5.5830e-2, M_PI * -5.2908e-2, M_PI * -5.4776e-2 00953 }; 00954 const float (*ipol_tab)[2][16] = q_mode ? 00955 wmavoice_lsp16_intercoeff_b : wmavoice_lsp16_intercoeff_a; 00956 uint16_t interpol, v[3]; 00957 int n; 00958 00959 dequant_lsp16i(gb, i_lsps); 00960 00961 interpol = get_bits(gb, 5); 00962 v[0] = get_bits(gb, 7); 00963 v[1] = get_bits(gb, 7); 00964 v[2] = get_bits(gb, 7); 00965 00966 for (n = 0; n < 16; n++) { 00967 double delta = old[n] - i_lsps[n]; 00968 a1[n] = ipol_tab[interpol][0][n] * delta + i_lsps[n]; 00969 a1[16 + n] = ipol_tab[interpol][1][n] * delta + i_lsps[n]; 00970 } 00971 00972 dequant_lsps( a2, 10, v, vec_sizes, 1, 00973 wmavoice_dq_lsp16r1, mul_lsf, base_lsf); 00974 dequant_lsps(&a2[10], 10, &v[1], &vec_sizes[1], 1, 00975 wmavoice_dq_lsp16r2, &mul_lsf[1], &base_lsf[1]); 00976 dequant_lsps(&a2[20], 12, &v[2], &vec_sizes[2], 1, 00977 wmavoice_dq_lsp16r3, &mul_lsf[2], &base_lsf[2]); 00978 } 00979 00993 static void aw_parse_coords(WMAVoiceContext *s, GetBitContext *gb, 00994 const int *pitch) 00995 { 00996 static const int16_t start_offset[94] = { 00997 -11, -9, -7, -5, -3, -1, 1, 3, 5, 7, 9, 11, 00998 13, 15, 18, 17, 19, 20, 21, 22, 23, 24, 25, 26, 00999 27, 28, 29, 30, 31, 
32, 33, 35, 37, 39, 41, 43, 01000 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67, 01001 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 01002 93, 95, 97, 99, 101, 103, 105, 107, 109, 111, 113, 115, 01003 117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139, 01004 141, 143, 145, 147, 149, 151, 153, 155, 157, 159 01005 }; 01006 int bits, offset; 01007 01008 /* position of pulse */ 01009 s->aw_idx_is_ext = 0; 01010 if ((bits = get_bits(gb, 6)) >= 54) { 01011 s->aw_idx_is_ext = 1; 01012 bits += (bits - 54) * 3 + get_bits(gb, 2); 01013 } 01014 01015 /* for a repeated pulse at pulse_off with a pitch_lag of pitch[], count 01016 * the distribution of the pulses in each block contained in this frame. */ 01017 s->aw_pulse_range = FFMIN(pitch[0], pitch[1]) > 32 ? 24 : 16; 01018 for (offset = start_offset[bits]; offset < 0; offset += pitch[0]) ; 01019 s->aw_n_pulses[0] = (pitch[0] - 1 + MAX_FRAMESIZE / 2 - offset) / pitch[0]; 01020 s->aw_first_pulse_off[0] = offset - s->aw_pulse_range / 2; 01021 offset += s->aw_n_pulses[0] * pitch[0]; 01022 s->aw_n_pulses[1] = (pitch[1] - 1 + MAX_FRAMESIZE - offset) / pitch[1]; 01023 s->aw_first_pulse_off[1] = offset - (MAX_FRAMESIZE + s->aw_pulse_range) / 2; 01024 01025 /* if continuing from a position before the block, reset position to 01026 * start of block (when corrected for the range over which it can be 01027 * spread in aw_pulse_set1()). 
*/ 01028 if (start_offset[bits] < MAX_FRAMESIZE / 2) { 01029 while (s->aw_first_pulse_off[1] - pitch[1] + s->aw_pulse_range > 0) 01030 s->aw_first_pulse_off[1] -= pitch[1]; 01031 if (start_offset[bits] < 0) 01032 while (s->aw_first_pulse_off[0] - pitch[0] + s->aw_pulse_range > 0) 01033 s->aw_first_pulse_off[0] -= pitch[0]; 01034 } 01035 } 01036 01044 static void aw_pulse_set2(WMAVoiceContext *s, GetBitContext *gb, 01045 int block_idx, AMRFixed *fcb) 01046 { 01047 uint16_t use_mask_mem[9]; // only 5 are used, rest is padding 01048 uint16_t *use_mask = use_mask_mem + 2; 01049 /* in this function, idx is the index in the 80-bit (+ padding) use_mask 01050 * bit-array. Since use_mask consists of 16-bit values, the lower 4 bits 01051 * of idx are the position of the bit within a particular item in the 01052 * array (0 being the most significant bit, and 15 being the least 01053 * significant bit), and the remainder (>> 4) is the index in the 01054 * use_mask[]-array. This is faster and uses less memory than using a 01055 * 80-byte/80-int array. */ 01056 int pulse_off = s->aw_first_pulse_off[block_idx], 01057 pulse_start, n, idx, range, aidx, start_off = 0; 01058 01059 /* set offset of first pulse to within this block */ 01060 if (s->aw_n_pulses[block_idx] > 0) 01061 while (pulse_off + s->aw_pulse_range < 1) 01062 pulse_off += fcb->pitch_lag; 01063 01064 /* find range per pulse */ 01065 if (s->aw_n_pulses[0] > 0) { 01066 if (block_idx == 0) { 01067 range = 32; 01068 } else /* block_idx = 1 */ { 01069 range = 8; 01070 if (s->aw_n_pulses[block_idx] > 0) 01071 pulse_off = s->aw_next_pulse_off_cache; 01072 } 01073 } else 01074 range = 16; 01075 pulse_start = s->aw_n_pulses[block_idx] > 0 ? pulse_off - range / 2 : 0; 01076 01077 /* aw_pulse_set1() already applies pulses around pulse_off (to be exactly, 01078 * in the range of [pulse_off, pulse_off + s->aw_pulse_range], and thus 01079 * we exclude that range from being pulsed again in this function. 
*/ 01080 memset(&use_mask[-2], 0, 2 * sizeof(use_mask[0])); 01081 memset( use_mask, -1, 5 * sizeof(use_mask[0])); 01082 memset(&use_mask[5], 0, 2 * sizeof(use_mask[0])); 01083 if (s->aw_n_pulses[block_idx] > 0) 01084 for (idx = pulse_off; idx < MAX_FRAMESIZE / 2; idx += fcb->pitch_lag) { 01085 int excl_range = s->aw_pulse_range; // always 16 or 24 01086 uint16_t *use_mask_ptr = &use_mask[idx >> 4]; 01087 int first_sh = 16 - (idx & 15); 01088 *use_mask_ptr++ &= 0xFFFFu << first_sh; 01089 excl_range -= first_sh; 01090 if (excl_range >= 16) { 01091 *use_mask_ptr++ = 0; 01092 *use_mask_ptr &= 0xFFFF >> (excl_range - 16); 01093 } else 01094 *use_mask_ptr &= 0xFFFF >> excl_range; 01095 } 01096 01097 /* find the 'aidx'th offset that is not excluded */ 01098 aidx = get_bits(gb, s->aw_n_pulses[0] > 0 ? 5 - 2 * block_idx : 4); 01099 for (n = 0; n <= aidx; pulse_start++) { 01100 for (idx = pulse_start; idx < 0; idx += fcb->pitch_lag) ; 01101 if (idx >= MAX_FRAMESIZE / 2) { // find from zero 01102 if (use_mask[0]) idx = 0x0F; 01103 else if (use_mask[1]) idx = 0x1F; 01104 else if (use_mask[2]) idx = 0x2F; 01105 else if (use_mask[3]) idx = 0x3F; 01106 else if (use_mask[4]) idx = 0x4F; 01107 else return; 01108 idx -= av_log2_16bit(use_mask[idx >> 4]); 01109 } 01110 if (use_mask[idx >> 4] & (0x8000 >> (idx & 15))) { 01111 use_mask[idx >> 4] &= ~(0x8000 >> (idx & 15)); 01112 n++; 01113 start_off = idx; 01114 } 01115 } 01116 01117 fcb->x[fcb->n] = start_off; 01118 fcb->y[fcb->n] = get_bits1(gb) ? -1.0 : 1.0; 01119 fcb->n++; 01120 01121 /* set offset for next block, relative to start of that block */ 01122 n = (MAX_FRAMESIZE / 2 - start_off) % fcb->pitch_lag; 01123 s->aw_next_pulse_off_cache = n ? 
fcb->pitch_lag - n : 0; 01124 } 01125 01133 static void aw_pulse_set1(WMAVoiceContext *s, GetBitContext *gb, 01134 int block_idx, AMRFixed *fcb) 01135 { 01136 int val = get_bits(gb, 12 - 2 * (s->aw_idx_is_ext && !block_idx)); 01137 float v; 01138 01139 if (s->aw_n_pulses[block_idx] > 0) { 01140 int n, v_mask, i_mask, sh, n_pulses; 01141 01142 if (s->aw_pulse_range == 24) { // 3 pulses, 1:sign + 3:index each 01143 n_pulses = 3; 01144 v_mask = 8; 01145 i_mask = 7; 01146 sh = 4; 01147 } else { // 4 pulses, 1:sign + 2:index each 01148 n_pulses = 4; 01149 v_mask = 4; 01150 i_mask = 3; 01151 sh = 3; 01152 } 01153 01154 for (n = n_pulses - 1; n >= 0; n--, val >>= sh) { 01155 fcb->y[fcb->n] = (val & v_mask) ? -1.0 : 1.0; 01156 fcb->x[fcb->n] = (val & i_mask) * n_pulses + n + 01157 s->aw_first_pulse_off[block_idx]; 01158 while (fcb->x[fcb->n] < 0) 01159 fcb->x[fcb->n] += fcb->pitch_lag; 01160 if (fcb->x[fcb->n] < MAX_FRAMESIZE / 2) 01161 fcb->n++; 01162 } 01163 } else { 01164 int num2 = (val & 0x1FF) >> 1, delta, idx; 01165 01166 if (num2 < 1 * 79) { delta = 1; idx = num2 + 1; } 01167 else if (num2 < 2 * 78) { delta = 3; idx = num2 + 1 - 1 * 77; } 01168 else if (num2 < 3 * 77) { delta = 5; idx = num2 + 1 - 2 * 76; } 01169 else { delta = 7; idx = num2 + 1 - 3 * 75; } 01170 v = (val & 0x200) ? -1.0 : 1.0; 01171 01172 fcb->no_repeat_mask |= 3 << fcb->n; 01173 fcb->x[fcb->n] = idx - delta; 01174 fcb->y[fcb->n] = v; 01175 fcb->x[fcb->n + 1] = idx; 01176 fcb->y[fcb->n + 1] = (val & 1) ? -v : v; 01177 fcb->n += 2; 01178 } 01179 } 01180 01194 static int pRNG(int frame_cntr, int block_num, int block_size) 01195 { 01196 /* array to simplify the calculation of z: 01197 * y = (x % 9) * 5 + 6; 01198 * z = (49995 * x) / y; 01199 * Since y only has 9 values, we can remove the division by using a 01200 * LUT and using FASTDIV-style divisions. 
For each of the 9 values 01201 * of y, we can rewrite z as: 01202 * z = x * (49995 / y) + x * ((49995 % y) / y) 01203 * In this table, each col represents one possible value of y, the 01204 * first number is 49995 / y, and the second is the FASTDIV variant 01205 * of 49995 % y / y. */ 01206 static const unsigned int div_tbl[9][2] = { 01207 { 8332, 3 * 715827883U }, // y = 6 01208 { 4545, 0 * 390451573U }, // y = 11 01209 { 3124, 11 * 268435456U }, // y = 16 01210 { 2380, 15 * 204522253U }, // y = 21 01211 { 1922, 23 * 165191050U }, // y = 26 01212 { 1612, 23 * 138547333U }, // y = 31 01213 { 1388, 27 * 119304648U }, // y = 36 01214 { 1219, 16 * 104755300U }, // y = 41 01215 { 1086, 39 * 93368855U } // y = 46 01216 }; 01217 unsigned int z, y, x = MUL16(block_num, 1877) + frame_cntr; 01218 if (x >= 0xFFFF) x -= 0xFFFF; // max value of x is 8*1877+0xFFFE=0x13AA6, 01219 // so this is effectively a modulo (%) 01220 y = x - 9 * MULH(477218589, x); // x % 9 01221 z = (uint16_t) (x * div_tbl[y][0] + UMULH(x, div_tbl[y][1])); 01222 // z = x * 49995 / (y * 5 + 6) 01223 return z % (1000 - block_size); 01224 } 01225 01230 static void synth_block_hardcoded(WMAVoiceContext *s, GetBitContext *gb, 01231 int block_idx, int size, 01232 const struct frame_type_desc *frame_desc, 01233 float *excitation) 01234 { 01235 float gain; 01236 int n, r_idx; 01237 01238 assert(size <= MAX_FRAMESIZE); 01239 01240 /* Set the offset from which we start reading wmavoice_std_codebook */ 01241 if (frame_desc->fcb_type == FCB_TYPE_SILENCE) { 01242 r_idx = pRNG(s->frame_cntr, block_idx, size); 01243 gain = s->silence_gain; 01244 } else /* FCB_TYPE_HARDCODED */ { 01245 r_idx = get_bits(gb, 8); 01246 gain = wmavoice_gain_universal[get_bits(gb, 6)]; 01247 } 01248 01249 /* Clear gain prediction parameters */ 01250 memset(s->gain_pred_err, 0, sizeof(s->gain_pred_err)); 01251 01252 /* Apply gain to hardcoded codebook and use that as excitation signal */ 01253 for (n = 0; n < size; n++) 01254 excitation[n] = 
wmavoice_std_codebook[r_idx + n] * gain; 01255 } 01256 01261 static void synth_block_fcb_acb(WMAVoiceContext *s, GetBitContext *gb, 01262 int block_idx, int size, 01263 int block_pitch_sh2, 01264 const struct frame_type_desc *frame_desc, 01265 float *excitation) 01266 { 01267 static const float gain_coeff[6] = { 01268 0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458 01269 }; 01270 float pulses[MAX_FRAMESIZE / 2], pred_err, acb_gain, fcb_gain; 01271 int n, idx, gain_weight; 01272 AMRFixed fcb; 01273 01274 assert(size <= MAX_FRAMESIZE / 2); 01275 memset(pulses, 0, sizeof(*pulses) * size); 01276 01277 fcb.pitch_lag = block_pitch_sh2 >> 2; 01278 fcb.pitch_fac = 1.0; 01279 fcb.no_repeat_mask = 0; 01280 fcb.n = 0; 01281 01282 /* For the other frame types, this is where we apply the innovation 01283 * (fixed) codebook pulses of the speech signal. */ 01284 if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) { 01285 aw_pulse_set1(s, gb, block_idx, &fcb); 01286 aw_pulse_set2(s, gb, block_idx, &fcb); 01287 } else /* FCB_TYPE_EXC_PULSES */ { 01288 int offset_nbits = 5 - frame_desc->log_n_blocks; 01289 01290 fcb.no_repeat_mask = -1; 01291 /* similar to ff_decode_10_pulses_35bits(), but with single pulses 01292 * (instead of double) for a subset of pulses */ 01293 for (n = 0; n < 5; n++) { 01294 float sign; 01295 int pos1, pos2; 01296 01297 sign = get_bits1(gb) ? 1.0 : -1.0; 01298 pos1 = get_bits(gb, offset_nbits); 01299 fcb.x[fcb.n] = n + 5 * pos1; 01300 fcb.y[fcb.n++] = sign; 01301 if (n < frame_desc->dbl_pulses) { 01302 pos2 = get_bits(gb, offset_nbits); 01303 fcb.x[fcb.n] = n + 5 * pos2; 01304 fcb.y[fcb.n++] = (pos1 < pos2) ? -sign : sign; 01305 } 01306 } 01307 } 01308 ff_set_fixed_vector(pulses, &fcb, 1.0, size); 01309 01310 /* Calculate gain for adaptive & fixed codebook signal. 01311 * see ff_amr_set_fixed_gain(). 
*/ 01312 idx = get_bits(gb, 7); 01313 fcb_gain = expf(ff_dot_productf(s->gain_pred_err, gain_coeff, 6) - 01314 5.2409161640 + wmavoice_gain_codebook_fcb[idx]); 01315 acb_gain = wmavoice_gain_codebook_acb[idx]; 01316 pred_err = av_clipf(wmavoice_gain_codebook_fcb[idx], 01317 -2.9957322736 /* log(0.05) */, 01318 1.6094379124 /* log(5.0) */); 01319 01320 gain_weight = 8 >> frame_desc->log_n_blocks; 01321 memmove(&s->gain_pred_err[gain_weight], s->gain_pred_err, 01322 sizeof(*s->gain_pred_err) * (6 - gain_weight)); 01323 for (n = 0; n < gain_weight; n++) 01324 s->gain_pred_err[n] = pred_err; 01325 01326 /* Calculation of adaptive codebook */ 01327 if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) { 01328 int len; 01329 for (n = 0; n < size; n += len) { 01330 int next_idx_sh16; 01331 int abs_idx = block_idx * size + n; 01332 int pitch_sh16 = (s->last_pitch_val << 16) + 01333 s->pitch_diff_sh16 * abs_idx; 01334 int pitch = (pitch_sh16 + 0x6FFF) >> 16; 01335 int idx_sh16 = ((pitch << 16) - pitch_sh16) * 8 + 0x58000; 01336 idx = idx_sh16 >> 16; 01337 if (s->pitch_diff_sh16) { 01338 if (s->pitch_diff_sh16 > 0) { 01339 next_idx_sh16 = (idx_sh16) &~ 0xFFFF; 01340 } else 01341 next_idx_sh16 = (idx_sh16 + 0x10000) &~ 0xFFFF; 01342 len = av_clip((idx_sh16 - next_idx_sh16) / s->pitch_diff_sh16 / 8, 01343 1, size - n); 01344 } else 01345 len = size; 01346 01347 ff_acelp_interpolatef(&excitation[n], &excitation[n - pitch], 01348 wmavoice_ipol1_coeffs, 17, 01349 idx, 9, len); 01350 } 01351 } else /* ACB_TYPE_HAMMING */ { 01352 int block_pitch = block_pitch_sh2 >> 2; 01353 idx = block_pitch_sh2 & 3; 01354 if (idx) { 01355 ff_acelp_interpolatef(excitation, &excitation[-block_pitch], 01356 wmavoice_ipol2_coeffs, 4, 01357 idx, 8, size); 01358 } else 01359 av_memcpy_backptr((uint8_t *) excitation, sizeof(float) * block_pitch, 01360 sizeof(float) * size); 01361 } 01362 01363 /* Interpolate ACB/FCB and use as excitation signal */ 01364 ff_weighted_vector_sumf(excitation, excitation, pulses, 
01365 acb_gain, fcb_gain, size); 01366 } 01367 01384 static void synth_block(WMAVoiceContext *s, GetBitContext *gb, 01385 int block_idx, int size, 01386 int block_pitch_sh2, 01387 const double *lsps, const double *prev_lsps, 01388 const struct frame_type_desc *frame_desc, 01389 float *excitation, float *synth) 01390 { 01391 double i_lsps[MAX_LSPS]; 01392 float lpcs[MAX_LSPS]; 01393 float fac; 01394 int n; 01395 01396 if (frame_desc->acb_type == ACB_TYPE_NONE) 01397 synth_block_hardcoded(s, gb, block_idx, size, frame_desc, excitation); 01398 else 01399 synth_block_fcb_acb(s, gb, block_idx, size, block_pitch_sh2, 01400 frame_desc, excitation); 01401 01402 /* convert interpolated LSPs to LPCs */ 01403 fac = (block_idx + 0.5) / frame_desc->n_blocks; 01404 for (n = 0; n < s->lsps; n++) // LSF -> LSP 01405 i_lsps[n] = cos(prev_lsps[n] + fac * (lsps[n] - prev_lsps[n])); 01406 ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1); 01407 01408 /* Speech synthesis */ 01409 ff_celp_lp_synthesis_filterf(synth, lpcs, excitation, size, s->lsps); 01410 } 01411 01427 static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx, 01428 float *samples, 01429 const double *lsps, const double *prev_lsps, 01430 float *excitation, float *synth) 01431 { 01432 WMAVoiceContext *s = ctx->priv_data; 01433 int n, n_blocks_x2, log_n_blocks_x2, cur_pitch_val; 01434 int pitch[MAX_BLOCKS], last_block_pitch; 01435 01436 /* Parse frame type ("frame header"), see frame_descs */ 01437 int bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)], 01438 block_nsamples = MAX_FRAMESIZE / frame_descs[bd_idx].n_blocks; 01439 01440 if (bd_idx < 0) { 01441 av_log(ctx, AV_LOG_ERROR, 01442 "Invalid frame type VLC code, skipping\n"); 01443 return -1; 01444 } 01445 01446 /* Pitch calculation for ACB_TYPE_ASYMMETRIC ("pitch-per-frame") */ 01447 if (frame_descs[bd_idx].acb_type == ACB_TYPE_ASYMMETRIC) { 01448 /* Pitch is provided per frame, which is interpreted as the pitch of 01449 * the last sample 
of the last block of this frame. We can interpolate 01450 * the pitch of other blocks (and even pitch-per-sample) by gradually 01451 * incrementing/decrementing prev_frame_pitch to cur_pitch_val. */ 01452 n_blocks_x2 = frame_descs[bd_idx].n_blocks << 1; 01453 log_n_blocks_x2 = frame_descs[bd_idx].log_n_blocks + 1; 01454 cur_pitch_val = s->min_pitch_val + get_bits(gb, s->pitch_nbits); 01455 cur_pitch_val = FFMIN(cur_pitch_val, s->max_pitch_val - 1); 01456 if (s->last_acb_type == ACB_TYPE_NONE || 01457 20 * abs(cur_pitch_val - s->last_pitch_val) > 01458 (cur_pitch_val + s->last_pitch_val)) 01459 s->last_pitch_val = cur_pitch_val; 01460 01461 /* pitch per block */ 01462 for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) { 01463 int fac = n * 2 + 1; 01464 01465 pitch[n] = (MUL16(fac, cur_pitch_val) + 01466 MUL16((n_blocks_x2 - fac), s->last_pitch_val) + 01467 frame_descs[bd_idx].n_blocks) >> log_n_blocks_x2; 01468 } 01469 01470 /* "pitch-diff-per-sample" for calculation of pitch per sample */ 01471 s->pitch_diff_sh16 = 01472 ((cur_pitch_val - s->last_pitch_val) << 16) / MAX_FRAMESIZE; 01473 } 01474 01475 /* Global gain (if silence) and pitch-adaptive window coordinates */ 01476 switch (frame_descs[bd_idx].fcb_type) { 01477 case FCB_TYPE_SILENCE: 01478 s->silence_gain = wmavoice_gain_silence[get_bits(gb, 8)]; 01479 break; 01480 case FCB_TYPE_AW_PULSES: 01481 aw_parse_coords(s, gb, pitch); 01482 break; 01483 } 01484 01485 for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) { 01486 int bl_pitch_sh2; 01487 01488 /* Pitch calculation for ACB_TYPE_HAMMING ("pitch-per-block") */ 01489 switch (frame_descs[bd_idx].acb_type) { 01490 case ACB_TYPE_HAMMING: { 01491 /* Pitch is given per block. Per-block pitches are encoded as an 01492 * absolute value for the first block, and then delta values 01493 * relative to this value) for all subsequent blocks. 
The scale of 01494 * this pitch value is semi-logaritmic compared to its use in the 01495 * decoder, so we convert it to normal scale also. */ 01496 int block_pitch, 01497 t1 = (s->block_conv_table[1] - s->block_conv_table[0]) << 2, 01498 t2 = (s->block_conv_table[2] - s->block_conv_table[1]) << 1, 01499 t3 = s->block_conv_table[3] - s->block_conv_table[2] + 1; 01500 01501 if (n == 0) { 01502 block_pitch = get_bits(gb, s->block_pitch_nbits); 01503 } else 01504 block_pitch = last_block_pitch - s->block_delta_pitch_hrange + 01505 get_bits(gb, s->block_delta_pitch_nbits); 01506 /* Convert last_ so that any next delta is within _range */ 01507 last_block_pitch = av_clip(block_pitch, 01508 s->block_delta_pitch_hrange, 01509 s->block_pitch_range - 01510 s->block_delta_pitch_hrange); 01511 01512 /* Convert semi-log-style scale back to normal scale */ 01513 if (block_pitch < t1) { 01514 bl_pitch_sh2 = (s->block_conv_table[0] << 2) + block_pitch; 01515 } else { 01516 block_pitch -= t1; 01517 if (block_pitch < t2) { 01518 bl_pitch_sh2 = 01519 (s->block_conv_table[1] << 2) + (block_pitch << 1); 01520 } else { 01521 block_pitch -= t2; 01522 if (block_pitch < t3) { 01523 bl_pitch_sh2 = 01524 (s->block_conv_table[2] + block_pitch) << 2; 01525 } else 01526 bl_pitch_sh2 = s->block_conv_table[3] << 2; 01527 } 01528 } 01529 pitch[n] = bl_pitch_sh2 >> 2; 01530 break; 01531 } 01532 01533 case ACB_TYPE_ASYMMETRIC: { 01534 bl_pitch_sh2 = pitch[n] << 2; 01535 break; 01536 } 01537 01538 default: // ACB_TYPE_NONE has no pitch 01539 bl_pitch_sh2 = 0; 01540 break; 01541 } 01542 01543 synth_block(s, gb, n, block_nsamples, bl_pitch_sh2, 01544 lsps, prev_lsps, &frame_descs[bd_idx], 01545 &excitation[n * block_nsamples], 01546 &synth[n * block_nsamples]); 01547 } 01548 01549 /* Averaging projection filter, if applicable. 
Else, just copy samples 01550 * from synthesis buffer */ 01551 if (s->do_apf) { 01552 double i_lsps[MAX_LSPS]; 01553 float lpcs[MAX_LSPS]; 01554 01555 for (n = 0; n < s->lsps; n++) // LSF -> LSP 01556 i_lsps[n] = cos(0.5 * (prev_lsps[n] + lsps[n])); 01557 ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1); 01558 postfilter(s, synth, samples, 80, lpcs, 01559 &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx], 01560 frame_descs[bd_idx].fcb_type, pitch[0]); 01561 01562 for (n = 0; n < s->lsps; n++) // LSF -> LSP 01563 i_lsps[n] = cos(lsps[n]); 01564 ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1); 01565 postfilter(s, &synth[80], &samples[80], 80, lpcs, 01566 &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx + 80], 01567 frame_descs[bd_idx].fcb_type, pitch[0]); 01568 } else 01569 memcpy(samples, synth, 160 * sizeof(synth[0])); 01570 01571 /* Cache values for next frame */ 01572 s->frame_cntr++; 01573 if (s->frame_cntr >= 0xFFFF) s->frame_cntr -= 0xFFFF; // i.e. modulo (%) 01574 s->last_acb_type = frame_descs[bd_idx].acb_type; 01575 switch (frame_descs[bd_idx].acb_type) { 01576 case ACB_TYPE_NONE: 01577 s->last_pitch_val = 0; 01578 break; 01579 case ACB_TYPE_ASYMMETRIC: 01580 s->last_pitch_val = cur_pitch_val; 01581 break; 01582 case ACB_TYPE_HAMMING: 01583 s->last_pitch_val = pitch[frame_descs[bd_idx].n_blocks - 1]; 01584 break; 01585 } 01586 01587 return 0; 01588 } 01589 01602 static void stabilize_lsps(double *lsps, int num) 01603 { 01604 int n, m, l; 01605 01606 /* set minimum value for first, maximum value for last and minimum 01607 * spacing between LSF values. 01608 * Very similar to ff_set_min_dist_lsf(), but in double. */ 01609 lsps[0] = FFMAX(lsps[0], 0.0015 * M_PI); 01610 for (n = 1; n < num; n++) 01611 lsps[n] = FFMAX(lsps[n], lsps[n - 1] + 0.0125 * M_PI); 01612 lsps[num - 1] = FFMIN(lsps[num - 1], 0.9985 * M_PI); 01613 01614 /* reorder (looks like one-time / non-recursed bubblesort). 
01615 * Very similar to ff_sort_nearly_sorted_floats(), but in double. */ 01616 for (n = 1; n < num; n++) { 01617 if (lsps[n] < lsps[n - 1]) { 01618 for (m = 1; m < num; m++) { 01619 double tmp = lsps[m]; 01620 for (l = m - 1; l >= 0; l--) { 01621 if (lsps[l] <= tmp) break; 01622 lsps[l + 1] = lsps[l]; 01623 } 01624 lsps[l + 1] = tmp; 01625 } 01626 break; 01627 } 01628 } 01629 } 01630 01640 static int check_bits_for_superframe(GetBitContext *orig_gb, 01641 WMAVoiceContext *s) 01642 { 01643 GetBitContext s_gb, *gb = &s_gb; 01644 int n, need_bits, bd_idx; 01645 const struct frame_type_desc *frame_desc; 01646 01647 /* initialize a copy */ 01648 init_get_bits(gb, orig_gb->buffer, orig_gb->size_in_bits); 01649 skip_bits_long(gb, get_bits_count(orig_gb)); 01650 assert(get_bits_left(gb) == get_bits_left(orig_gb)); 01651 01652 /* superframe header */ 01653 if (get_bits_left(gb) < 14) 01654 return 1; 01655 if (!get_bits1(gb)) 01656 return -1; // WMAPro-in-WMAVoice superframe 01657 if (get_bits1(gb)) skip_bits(gb, 12); // number of samples in superframe 01658 if (s->has_residual_lsps) { // residual LSPs (for all frames) 01659 if (get_bits_left(gb) < s->sframe_lsp_bitsize) 01660 return 1; 01661 skip_bits_long(gb, s->sframe_lsp_bitsize); 01662 } 01663 01664 /* frames */ 01665 for (n = 0; n < MAX_FRAMES; n++) { 01666 int aw_idx_is_ext = 0; 01667 01668 if (!s->has_residual_lsps) { // independent LSPs (per-frame) 01669 if (get_bits_left(gb) < s->frame_lsp_bitsize) return 1; 01670 skip_bits_long(gb, s->frame_lsp_bitsize); 01671 } 01672 bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)]; 01673 if (bd_idx < 0) 01674 return -1; // invalid frame type VLC code 01675 frame_desc = &frame_descs[bd_idx]; 01676 if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) { 01677 if (get_bits_left(gb) < s->pitch_nbits) 01678 return 1; 01679 skip_bits_long(gb, s->pitch_nbits); 01680 } 01681 if (frame_desc->fcb_type == FCB_TYPE_SILENCE) { 01682 skip_bits(gb, 8); 01683 } else if 
(frame_desc->fcb_type == FCB_TYPE_AW_PULSES) { 01684 int tmp = get_bits(gb, 6); 01685 if (tmp >= 0x36) { 01686 skip_bits(gb, 2); 01687 aw_idx_is_ext = 1; 01688 } 01689 } 01690 01691 /* blocks */ 01692 if (frame_desc->acb_type == ACB_TYPE_HAMMING) { 01693 need_bits = s->block_pitch_nbits + 01694 (frame_desc->n_blocks - 1) * s->block_delta_pitch_nbits; 01695 } else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) { 01696 need_bits = 2 * !aw_idx_is_ext; 01697 } else 01698 need_bits = 0; 01699 need_bits += frame_desc->frame_size; 01700 if (get_bits_left(gb) < need_bits) 01701 return 1; 01702 skip_bits_long(gb, need_bits); 01703 } 01704 01705 return 0; 01706 } 01707 01728 static int synth_superframe(AVCodecContext *ctx, 01729 float *samples, int *data_size) 01730 { 01731 WMAVoiceContext *s = ctx->priv_data; 01732 GetBitContext *gb = &s->gb, s_gb; 01733 int n, res, n_samples = 480; 01734 double lsps[MAX_FRAMES][MAX_LSPS]; 01735 const double *mean_lsf = s->lsps == 16 ? 01736 wmavoice_mean_lsf16[s->lsp_def_mode] : wmavoice_mean_lsf10[s->lsp_def_mode]; 01737 float excitation[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE + 12]; 01738 float synth[MAX_LSPS + MAX_SFRAMESIZE]; 01739 01740 memcpy(synth, s->synth_history, 01741 s->lsps * sizeof(*synth)); 01742 memcpy(excitation, s->excitation_history, 01743 s->history_nsamples * sizeof(*excitation)); 01744 01745 if (s->sframe_cache_size > 0) { 01746 gb = &s_gb; 01747 init_get_bits(gb, s->sframe_cache, s->sframe_cache_size); 01748 s->sframe_cache_size = 0; 01749 } 01750 01751 if ((res = check_bits_for_superframe(gb, s)) == 1) return 1; 01752 01753 /* First bit is speech/music bit, it differentiates between WMAVoice 01754 * speech samples (the actual codec) and WMAVoice music samples, which 01755 * are really WMAPro-in-WMAVoice-superframes. I've never seen those in 01756 * the wild yet. */ 01757 if (!get_bits1(gb)) { 01758 av_log_missing_feature(ctx, "WMAPro-in-WMAVoice support", 1); 01759 return -1; 01760 } 01761 01762 /* (optional) nr. 
of samples in superframe; always <= 480 and >= 0 */ 01763 if (get_bits1(gb)) { 01764 if ((n_samples = get_bits(gb, 12)) > 480) { 01765 av_log(ctx, AV_LOG_ERROR, 01766 "Superframe encodes >480 samples (%d), not allowed\n", 01767 n_samples); 01768 return -1; 01769 } 01770 } 01771 /* Parse LSPs, if global for the superframe (can also be per-frame). */ 01772 if (s->has_residual_lsps) { 01773 double prev_lsps[MAX_LSPS], a1[MAX_LSPS * 2], a2[MAX_LSPS * 2]; 01774 01775 for (n = 0; n < s->lsps; n++) 01776 prev_lsps[n] = s->prev_lsps[n] - mean_lsf[n]; 01777 01778 if (s->lsps == 10) { 01779 dequant_lsp10r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode); 01780 } else /* s->lsps == 16 */ 01781 dequant_lsp16r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode); 01782 01783 for (n = 0; n < s->lsps; n++) { 01784 lsps[0][n] = mean_lsf[n] + (a1[n] - a2[n * 2]); 01785 lsps[1][n] = mean_lsf[n] + (a1[s->lsps + n] - a2[n * 2 + 1]); 01786 lsps[2][n] += mean_lsf[n]; 01787 } 01788 for (n = 0; n < 3; n++) 01789 stabilize_lsps(lsps[n], s->lsps); 01790 } 01791 01792 /* Parse frames, optionally preceeded by per-frame (independent) LSPs. */ 01793 for (n = 0; n < 3; n++) { 01794 if (!s->has_residual_lsps) { 01795 int m; 01796 01797 if (s->lsps == 10) { 01798 dequant_lsp10i(gb, lsps[n]); 01799 } else /* s->lsps == 16 */ 01800 dequant_lsp16i(gb, lsps[n]); 01801 01802 for (m = 0; m < s->lsps; m++) 01803 lsps[n][m] += mean_lsf[m]; 01804 stabilize_lsps(lsps[n], s->lsps); 01805 } 01806 01807 if ((res = synth_frame(ctx, gb, n, 01808 &samples[n * MAX_FRAMESIZE], 01809 lsps[n], n == 0 ? s->prev_lsps : lsps[n - 1], 01810 &excitation[s->history_nsamples + n * MAX_FRAMESIZE], 01811 &synth[s->lsps + n * MAX_FRAMESIZE]))) 01812 return res; 01813 } 01814 01815 /* Statistics? FIXME - we don't check for length, a slight overrun 01816 * will be caught by internal buffer padding, and anything else 01817 * will be skipped, not read. 
*/ 01818 if (get_bits1(gb)) { 01819 res = get_bits(gb, 4); 01820 skip_bits(gb, 10 * (res + 1)); 01821 } 01822 01823 /* Specify nr. of output samples */ 01824 *data_size = n_samples * sizeof(float); 01825 01826 /* Update history */ 01827 memcpy(s->prev_lsps, lsps[2], 01828 s->lsps * sizeof(*s->prev_lsps)); 01829 memcpy(s->synth_history, &synth[MAX_SFRAMESIZE], 01830 s->lsps * sizeof(*synth)); 01831 memcpy(s->excitation_history, &excitation[MAX_SFRAMESIZE], 01832 s->history_nsamples * sizeof(*excitation)); 01833 if (s->do_apf) 01834 memmove(s->zero_exc_pf, &s->zero_exc_pf[MAX_SFRAMESIZE], 01835 s->history_nsamples * sizeof(*s->zero_exc_pf)); 01836 01837 return 0; 01838 } 01839 01847 static int parse_packet_header(WMAVoiceContext *s) 01848 { 01849 GetBitContext *gb = &s->gb; 01850 unsigned int res; 01851 01852 if (get_bits_left(gb) < 11) 01853 return 1; 01854 skip_bits(gb, 4); // packet sequence number 01855 s->has_residual_lsps = get_bits1(gb); 01856 do { 01857 res = get_bits(gb, 6); // number of superframes per packet 01858 // (minus first one if there is spillover) 01859 if (get_bits_left(gb) < 6 * (res == 0x3F) + s->spillover_bitsize) 01860 return 1; 01861 } while (res == 0x3F); 01862 s->spillover_nbits = get_bits(gb, s->spillover_bitsize); 01863 01864 return 0; 01865 } 01866 01882 static void copy_bits(PutBitContext *pb, 01883 const uint8_t *data, int size, 01884 GetBitContext *gb, int nbits) 01885 { 01886 int rmn_bytes, rmn_bits; 01887 01888 rmn_bits = rmn_bytes = get_bits_left(gb); 01889 if (rmn_bits < nbits) 01890 return; 01891 if (nbits > pb->size_in_bits - put_bits_count(pb)) 01892 return; 01893 rmn_bits &= 7; rmn_bytes >>= 3; 01894 if ((rmn_bits = FFMIN(rmn_bits, nbits)) > 0) 01895 put_bits(pb, rmn_bits, get_bits(gb, rmn_bits)); 01896 ff_copy_bits(pb, data + size - rmn_bytes, 01897 FFMIN(nbits - rmn_bits, rmn_bytes << 3)); 01898 } 01899 01911 static int wmavoice_decode_packet(AVCodecContext *ctx, void *data, 01912 int *data_size, AVPacket *avpkt) 01913 { 
01914 WMAVoiceContext *s = ctx->priv_data; 01915 GetBitContext *gb = &s->gb; 01916 int size, res, pos; 01917 01918 if (*data_size < 480 * sizeof(float)) { 01919 av_log(ctx, AV_LOG_ERROR, 01920 "Output buffer too small (%d given - %zu needed)\n", 01921 *data_size, 480 * sizeof(float)); 01922 return -1; 01923 } 01924 *data_size = 0; 01925 01926 /* Packets are sometimes a multiple of ctx->block_align, with a packet 01927 * header at each ctx->block_align bytes. However, Libav's ASF demuxer 01928 * feeds us ASF packets, which may concatenate multiple "codec" packets 01929 * in a single "muxer" packet, so we artificially emulate that by 01930 * capping the packet size at ctx->block_align. */ 01931 for (size = avpkt->size; size > ctx->block_align; size -= ctx->block_align); 01932 if (!size) 01933 return 0; 01934 init_get_bits(&s->gb, avpkt->data, size << 3); 01935 01936 /* size == ctx->block_align is used to indicate whether we are dealing with 01937 * a new packet or a packet of which we already read the packet header 01938 * previously. */ 01939 if (size == ctx->block_align) { // new packet header 01940 if ((res = parse_packet_header(s)) < 0) 01941 return res; 01942 01943 /* If the packet header specifies a s->spillover_nbits, then we want 01944 * to push out all data of the previous packet (+ spillover) before 01945 * continuing to parse new superframes in the current packet. 
*/ 01946 if (s->spillover_nbits > 0) { 01947 if (s->sframe_cache_size > 0) { 01948 int cnt = get_bits_count(gb); 01949 copy_bits(&s->pb, avpkt->data, size, gb, s->spillover_nbits); 01950 flush_put_bits(&s->pb); 01951 s->sframe_cache_size += s->spillover_nbits; 01952 if ((res = synth_superframe(ctx, data, data_size)) == 0 && 01953 *data_size > 0) { 01954 cnt += s->spillover_nbits; 01955 s->skip_bits_next = cnt & 7; 01956 return cnt >> 3; 01957 } else 01958 skip_bits_long (gb, s->spillover_nbits - cnt + 01959 get_bits_count(gb)); // resync 01960 } else 01961 skip_bits_long(gb, s->spillover_nbits); // resync 01962 } 01963 } else if (s->skip_bits_next) 01964 skip_bits(gb, s->skip_bits_next); 01965 01966 /* Try parsing superframes in current packet */ 01967 s->sframe_cache_size = 0; 01968 s->skip_bits_next = 0; 01969 pos = get_bits_left(gb); 01970 if ((res = synth_superframe(ctx, data, data_size)) < 0) { 01971 return res; 01972 } else if (*data_size > 0) { 01973 int cnt = get_bits_count(gb); 01974 s->skip_bits_next = cnt & 7; 01975 return cnt >> 3; 01976 } else if ((s->sframe_cache_size = pos) > 0) { 01977 /* rewind bit reader to start of last (incomplete) superframe... 
*/ 01978 init_get_bits(gb, avpkt->data, size << 3); 01979 skip_bits_long(gb, (size << 3) - pos); 01980 assert(get_bits_left(gb) == pos); 01981 01982 /* ...and cache it for spillover in next packet */ 01983 init_put_bits(&s->pb, s->sframe_cache, SFRAME_CACHE_MAXSIZE); 01984 copy_bits(&s->pb, avpkt->data, size, gb, s->sframe_cache_size); 01985 // FIXME bad - just copy bytes as whole and add use the 01986 // skip_bits_next field 01987 } 01988 01989 return size; 01990 } 01991 01992 static av_cold int wmavoice_decode_end(AVCodecContext *ctx) 01993 { 01994 WMAVoiceContext *s = ctx->priv_data; 01995 01996 if (s->do_apf) { 01997 ff_rdft_end(&s->rdft); 01998 ff_rdft_end(&s->irdft); 01999 ff_dct_end(&s->dct); 02000 ff_dct_end(&s->dst); 02001 } 02002 02003 return 0; 02004 } 02005 02006 static av_cold void wmavoice_flush(AVCodecContext *ctx) 02007 { 02008 WMAVoiceContext *s = ctx->priv_data; 02009 int n; 02010 02011 s->postfilter_agc = 0; 02012 s->sframe_cache_size = 0; 02013 s->skip_bits_next = 0; 02014 for (n = 0; n < s->lsps; n++) 02015 s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0); 02016 memset(s->excitation_history, 0, 02017 sizeof(*s->excitation_history) * MAX_SIGNAL_HISTORY); 02018 memset(s->synth_history, 0, 02019 sizeof(*s->synth_history) * MAX_LSPS); 02020 memset(s->gain_pred_err, 0, 02021 sizeof(s->gain_pred_err)); 02022 02023 if (s->do_apf) { 02024 memset(&s->synth_filter_out_buf[MAX_LSPS_ALIGN16 - s->lsps], 0, 02025 sizeof(*s->synth_filter_out_buf) * s->lsps); 02026 memset(s->dcf_mem, 0, 02027 sizeof(*s->dcf_mem) * 2); 02028 memset(s->zero_exc_pf, 0, 02029 sizeof(*s->zero_exc_pf) * s->history_nsamples); 02030 memset(s->denoise_filter_cache, 0, sizeof(s->denoise_filter_cache)); 02031 } 02032 } 02033 02034 AVCodec ff_wmavoice_decoder = { 02035 "wmavoice", 02036 AVMEDIA_TYPE_AUDIO, 02037 CODEC_ID_WMAVOICE, 02038 sizeof(WMAVoiceContext), 02039 wmavoice_decode_init, 02040 NULL, 02041 wmavoice_decode_end, 02042 wmavoice_decode_packet, 02043 CODEC_CAP_SUBFRAMES, 
02044 .flush = wmavoice_flush, 02045 .long_name = NULL_IF_CONFIG_SMALL("Windows Media Audio Voice"), 02046 };