1 /* Copyright (C) 2003 Epic Games (written by Jean-Marc Valin)
2 Copyright (C) 2004-2006 Epic Games
5 Preprocessor with denoising based on the algorithm by Ephraim and Malah
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are
11 1. Redistributions of source code must retain the above copyright notice,
12 this list of conditions and the following disclaimer.
14 2. Redistributions in binary form must reproduce the above copyright
15 notice, this list of conditions and the following disclaimer in the
16 documentation and/or other materials provided with the distribution.
18 3. The name of the author may not be used to endorse or promote products
19 derived from this software without specific prior written permission.
21 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
24 DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
25 INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
26 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
27 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28 HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
29 STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
30 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
31 POSSIBILITY OF SUCH DAMAGE.
38 Y. Ephraim and D. Malah, "Speech enhancement using minimum mean-square error
39 short-time spectral amplitude estimator". IEEE Transactions on Acoustics,
40 Speech and Signal Processing, vol. ASSP-32, no. 6, pp. 1109-1121, 1984.
42 Y. Ephraim and D. Malah, "Speech enhancement using minimum mean-square error
43 log-spectral amplitude estimator". IEEE Transactions on Acoustics, Speech and
44 Signal Processing, vol. ASSP-33, no. 2, pp. 443-445, 1985.
46 I. Cohen and B. Berdugo, "Speech enhancement for non-stationary noise environments".
47 Signal Processing, vol. 81, no. 2, pp. 2403-2418, 2001.
49 Stefan Gustafsson, Rainer Martin, Peter Jax, and Peter Vary. "A psychoacoustic
50 approach to combined acoustic echo cancellation and noise reduction". IEEE
51 Transactions on Speech and Audio Processing, 2002.
53 J.-M. Valin, J. Rouat, and F. Michaud, "Microphone array post-filter for separation
54 of simultaneous non-stationary sources". In Proceedings IEEE International
55 Conference on Acoustics, Speech, and Signal Processing, 2004.
63 #include "speex/speex_preprocess.h"
64 #include "speex/speex_echo.h"
67 #include "filterbank.h"
68 #include "math_approx.h"
69 #include "os_support.h"
72 #define M_PI 3.14159263
75 #define LOUDNESS_EXP 5.f
76 #define AMP_SCALE .001f
77 #define AMP_SCALE_1 1000.f
81 #define SPEECH_PROB_START_DEFAULT QCONST16(0.35f,15)
82 #define SPEECH_PROB_CONTINUE_DEFAULT QCONST16(0.20f,15)
83 #define NOISE_SUPPRESS_DEFAULT -15
84 #define ECHO_SUPPRESS_DEFAULT -40
85 #define ECHO_SUPPRESS_ACTIVE_DEFAULT -15
91 #define SQR(x) ((x)*(x))
92 #define SQR16(x) (MULT16_16((x),(x)))
93 #define SQR16_Q15(x) (MULT16_16_Q15((x),(x)))
96 static inline spx_word16_t DIV32_16_Q8(spx_word32_t a, spx_word32_t b)
102 if (b>=QCONST32(1,23))
107 if (b>=QCONST32(1,19))
112 if (b>=QCONST32(1,15))
118 return PDIV32_16(a,b);
122 static inline spx_word16_t DIV32_16_Q15(spx_word32_t a, spx_word32_t b)
124 if (SHR32(a,15) >= b)
128 if (b>=QCONST32(1,23))
133 if (b>=QCONST32(1,19))
138 if (b>=QCONST32(1,15))
144 return DIV32_16(a,b);
147 #define SNR_SCALING 256.f
148 #define SNR_SCALING_1 0.0039062f
151 #define FRAC_SCALING 32767.f
152 #define FRAC_SCALING_1 3.0518e-05
155 #define EXPIN_SCALING 2048.f
156 #define EXPIN_SCALING_1 0.00048828f
157 #define EXPIN_SHIFT 11
158 #define EXPOUT_SCALING_1 1.5259e-05
160 #define NOISE_SHIFT 7
164 #define DIV32_16_Q8(a,b) ((a)/(b))
165 #define DIV32_16_Q15(a,b) ((a)/(b))
166 #define SNR_SCALING 1.f
167 #define SNR_SCALING_1 1.f
169 #define FRAC_SCALING 1.f
170 #define FRAC_SCALING_1 1.f
172 #define NOISE_SHIFT 0
174 #define EXPIN_SCALING 1.f
175 #define EXPIN_SCALING_1 1.f
176 #define EXPOUT_SCALING_1 1.f
180 /** Speex pre-processor state. */
181 struct SpeexPreprocessState_ {
183 int frame_size; /**< Number of samples processed each time */
184 int ps_size; /**< Number of points in the power spectrum */
185 int sampling_rate; /**< Sampling rate of the input/output */
192 int dereverb_enabled;
193 spx_word16_t reverb_decay;
194 spx_word16_t reverb_level;
195 spx_word16_t speech_prob_start;
196 spx_word16_t speech_prob_continue;
199 int echo_suppress_active;
200 SpeexEchoState *echo_state;
202 spx_word16_t speech_prob; /**< Probability last frame was speech */
204 /* DSP-related arrays */
205 spx_word16_t *frame; /**< Processing frame (2*ps_size) */
206 spx_word16_t *ft; /**< Processing frame in freq domain (2*ps_size) */
207 spx_word32_t *ps; /**< Current power spectrum */
208 spx_word16_t *gain2; /**< Adjusted gains */
209 spx_word16_t *gain_floor; /**< Minimum gain allowed */
210 spx_word16_t *window; /**< Analysis/Synthesis window */
211 spx_word32_t *noise; /**< Noise estimate */
212 spx_word32_t *reverb_estimate; /**< Estimate of reverb energy */
213 spx_word32_t *old_ps; /**< Power spectrum for last frame */
214 spx_word16_t *gain; /**< Ephraim Malah gain */
215 spx_word16_t *prior; /**< A-priori SNR */
216 spx_word16_t *post; /**< A-posteriori SNR */
218 spx_word32_t *S; /**< Smoothed power spectrum */
219 spx_word32_t *Smin; /**< See Cohen paper */
220 spx_word32_t *Stmp; /**< See Cohen paper */
221 int *update_prob; /**< Probability of speech presence for noise update */
223 spx_word16_t *zeta; /**< Smoothed a priori SNR */
224 spx_word32_t *echo_noise;
225 spx_word32_t *residual_echo;
228 spx_word16_t *inbuf; /**< Input buffer (overlapped analysis) */
229 spx_word16_t *outbuf; /**< Output buffer (for overlap and add) */
231 /* AGC stuff, only for floating point for now */
235 float loudness_accum;
236 float *loudness_weight; /**< Perceptual loudness curve */
237 float loudness; /**< Loudness estimate */
238 float agc_gain; /**< Current AGC gain */
239 float max_gain; /**< Maximum gain allowed */
240 float max_increase_step; /**< Maximum increase in gain from one frame to another */
241 float max_decrease_step; /**< Maximum decrease in gain from one frame to another */
242 float prev_loudness; /**< Loudness of previous frame */
243 float init_max; /**< Current gain limit during initialisation */
245 int nb_adapt; /**< Number of frames used for adaptation so far */
247 int min_count; /**< Number of frames processed so far */
248 void *fft_lookup; /**< Lookup table for the FFT */
255 static void conj_window(spx_word16_t *w, int len)
262 spx_word16_t x = DIV32_16(MULT16_16(32767,i),len);
264 spx_word16_t x = DIV32_16(MULT16_16(QCONST16(4.f,13),i),len);
267 if (x<QCONST16(1.f,13))
269 } else if (x<QCONST16(2.f,13))
271 x=QCONST16(2.f,13)-x;
273 } else if (x<QCONST16(3.f,13))
275 x=x-QCONST16(2.f,13);
278 x=QCONST16(2.f,13)-x+QCONST16(2.f,13); /* 4 - x */
280 x = MULT16_16_Q14(QCONST16(1.271903f,14), x);
281 tmp = SQR16_Q15(QCONST16(.5f,15)-MULT16_16_P15(QCONST16(.5f,15),spx_cos_norm(SHL32(EXTEND32(x),2))));
283 tmp=SUB16(Q15_ONE,tmp);
284 w[i]=spx_sqrt(SHL32(EXTEND32(tmp),15));
290 /* This function approximates the gain function
291 y = gamma(1.25)^2 * M(-.25;1;-x) / sqrt(x)
292 which multiplied by xi/(1+xi) is the optimal gain
293 in the loudness domain ( sqrt[amplitude] )
294 Input in Q11 format, output in Q15
296 static inline spx_word32_t hypergeom_gain(spx_word32_t xx)
301 static const spx_word16_t table[21] = {
302 6730, 8357, 9868, 11267, 12563, 13770, 14898,
303 15959, 16961, 17911, 18816, 19682, 20512, 21311,
304 22082, 22827, 23549, 24250, 24931, 25594, 26241};
309 return ADD32(EXTEND32(Q15_ONE),EXTEND32(DIV32_16(QCONST32(.1296,23), SHR32(xx,EXPIN_SHIFT-SNR_SHIFT))));
310 frac = SHL32(xx-SHL32(ind,10),5);
311 return SHL32(DIV32_16(PSHR32(MULT16_16(Q15_ONE-frac,table[ind]) + MULT16_16(frac,table[ind+1]),7),(spx_sqrt(SHL32(xx,15)+6711))),7);
314 static inline spx_word16_t qcurve(spx_word16_t x)
317 return DIV32_16(SHL32(EXTEND32(32767),9),ADD16(512,MULT16_16_Q15(QCONST16(.60f,15),DIV32_16(32767,x))));
320 /* Compute the gain floor based on different floors for the background noise and residual echo */
321 static void compute_gain_floor(int noise_suppress, int effective_echo_suppress, spx_word32_t *noise, spx_word32_t *echo, spx_word16_t *gain_floor, int len)
325 if (noise_suppress > effective_echo_suppress)
327 spx_word16_t noise_gain, gain_ratio;
328 noise_gain = EXTRACT16(MIN32(Q15_ONE,SHR32(spx_exp(MULT16_16(QCONST16(0.11513,11),noise_suppress)),1)));
329 gain_ratio = EXTRACT16(MIN32(Q15_ONE,SHR32(spx_exp(MULT16_16(QCONST16(.2302585f,11),effective_echo_suppress-noise_suppress)),1)));
331 /* gain_floor = sqrt [ (noise*noise_floor + echo*echo_floor) / (noise+echo) ] */
333 gain_floor[i] = MULT16_16_Q15(noise_gain,
334 spx_sqrt(SHL32(EXTEND32(DIV32_16_Q15(PSHR32(noise[i],NOISE_SHIFT) + MULT16_32_Q15(gain_ratio,echo[i]),
335 (1+PSHR32(noise[i],NOISE_SHIFT) + echo[i]) )),15)));
337 spx_word16_t echo_gain, gain_ratio;
338 echo_gain = EXTRACT16(MIN32(Q15_ONE,SHR32(spx_exp(MULT16_16(QCONST16(0.11513,11),effective_echo_suppress)),1)));
339 gain_ratio = EXTRACT16(MIN32(Q15_ONE,SHR32(spx_exp(MULT16_16(QCONST16(.2302585f,11),noise_suppress-effective_echo_suppress)),1)));
341 /* gain_floor = sqrt [ (noise*noise_floor + echo*echo_floor) / (noise+echo) ] */
343 gain_floor[i] = MULT16_16_Q15(echo_gain,
344 spx_sqrt(SHL32(EXTEND32(DIV32_16_Q15(MULT16_32_Q15(gain_ratio,PSHR32(noise[i],NOISE_SHIFT)) + echo[i],
345 (1+PSHR32(noise[i],NOISE_SHIFT) + echo[i]) )),15)));
350 /* This function approximates the gain function
351 y = gamma(1.25)^2 * M(-.25;1;-x) / sqrt(x)
352 which multiplied by xi/(1+xi) is the optimal gain
353 in the loudness domain ( sqrt[amplitude] )
355 static inline spx_word32_t hypergeom_gain(spx_word32_t xx)
360 static const float table[21] = {
361 0.82157f, 1.02017f, 1.20461f, 1.37534f, 1.53363f, 1.68092f, 1.81865f,
362 1.94811f, 2.07038f, 2.18638f, 2.29688f, 2.40255f, 2.50391f, 2.60144f,
363 2.69551f, 2.78647f, 2.87458f, 2.96015f, 3.04333f, 3.12431f, 3.20326f};
364 x = EXPIN_SCALING_1*xx;
365 integer = floor(2*x);
370 return FRAC_SCALING*(1+.1296/x);
372 return FRAC_SCALING*((1-frac)*table[ind] + frac*table[ind+1])/sqrt(x+.0001f);
375 static inline spx_word16_t qcurve(spx_word16_t x)
377 return 1.f/(1.f+.15f/(SNR_SCALING_1*x));
380 static void compute_gain_floor(int noise_suppress, int effective_echo_suppress, spx_word32_t *noise, spx_word32_t *echo, spx_word16_t *gain_floor, int len)
386 noise_floor = exp(.2302585f*noise_suppress);
387 echo_floor = exp(.2302585f*effective_echo_suppress);
389 /* Compute the gain floor based on different floors for the background noise and residual echo */
391 gain_floor[i] = FRAC_SCALING*sqrt(noise_floor*PSHR32(noise[i],NOISE_SHIFT) + echo_floor*echo[i])/sqrt(1+PSHR32(noise[i],NOISE_SHIFT) + echo[i]);
395 EXPORT SpeexPreprocessState *speex_preprocess_state_init(int frame_size, int sampling_rate)
400 SpeexPreprocessState *st = (SpeexPreprocessState *)speex_alloc(sizeof(SpeexPreprocessState));
401 st->frame_size = frame_size;
403 /* Round ps_size down to the nearest power of two */
406 st->ps_size = st->frame_size;
409 if (st->ps_size & ~i)
419 if (st->ps_size < 3*st->frame_size/4)
420 st->ps_size = st->ps_size * 3 / 2;
422 st->ps_size = st->frame_size;
426 N3 = 2*N - st->frame_size;
427 N4 = st->frame_size - N3;
429 st->sampling_rate = sampling_rate;
430 st->denoise_enabled = 1;
432 st->dereverb_enabled = 0;
433 st->reverb_decay = 0;
434 st->reverb_level = 0;
435 st->noise_suppress = NOISE_SUPPRESS_DEFAULT;
436 st->echo_suppress = ECHO_SUPPRESS_DEFAULT;
437 st->echo_suppress_active = ECHO_SUPPRESS_ACTIVE_DEFAULT;
439 st->speech_prob_start = SPEECH_PROB_START_DEFAULT;
440 st->speech_prob_continue = SPEECH_PROB_CONTINUE_DEFAULT;
442 st->echo_state = NULL;
444 st->nbands = NB_BANDS;
446 st->bank = filterbank_new(M, sampling_rate, N, 1);
448 st->frame = (spx_word16_t*)speex_alloc(2*N*sizeof(spx_word16_t));
449 st->window = (spx_word16_t*)speex_alloc(2*N*sizeof(spx_word16_t));
450 st->ft = (spx_word16_t*)speex_alloc(2*N*sizeof(spx_word16_t));
452 st->ps = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
453 st->noise = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
454 st->echo_noise = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
455 st->residual_echo = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
456 st->reverb_estimate = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
457 st->old_ps = (spx_word32_t*)speex_alloc((N+M)*sizeof(spx_word32_t));
458 st->prior = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
459 st->post = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
460 st->gain = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
461 st->gain2 = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
462 st->gain_floor = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
463 st->zeta = (spx_word16_t*)speex_alloc((N+M)*sizeof(spx_word16_t));
465 st->S = (spx_word32_t*)speex_alloc(N*sizeof(spx_word32_t));
466 st->Smin = (spx_word32_t*)speex_alloc(N*sizeof(spx_word32_t));
467 st->Stmp = (spx_word32_t*)speex_alloc(N*sizeof(spx_word32_t));
468 st->update_prob = (int*)speex_alloc(N*sizeof(int));
470 st->inbuf = (spx_word16_t*)speex_alloc(N3*sizeof(spx_word16_t));
471 st->outbuf = (spx_word16_t*)speex_alloc(N3*sizeof(spx_word16_t));
473 conj_window(st->window, 2*N3);
474 for (i=2*N3;i<2*st->ps_size;i++)
475 st->window[i]=Q15_ONE;
479 for (i=N3-1;i>=0;i--)
481 st->window[i+N3+N4]=st->window[i+N3];
487 st->noise[i]=QCONST32(1.f,NOISE_SHIFT);
488 st->reverb_estimate[i]=0;
491 st->post[i]=SHL16(1, SNR_SHIFT);
492 st->prior[i]=SHL16(1, SNR_SHIFT);
496 st->update_prob[i] = 1;
504 st->agc_level = 8000;
505 st->loudness_weight = (float*)speex_alloc(N*sizeof(float));
508 float ff=((float)i)*.5*sampling_rate/((float)N);
509 /*st->loudness_weight[i] = .5f*(1.f/(1.f+ff/8000.f))+1.f*exp(-.5f*(ff-3800.f)*(ff-3800.f)/9e5f);*/
510 st->loudness_weight[i] = .35f-.35f*ff/16000.f+.73f*exp(-.5f*(ff-3800)*(ff-3800)/9e5f);
511 if (st->loudness_weight[i]<.01f)
512 st->loudness_weight[i]=.01f;
513 st->loudness_weight[i] *= st->loudness_weight[i];
515 /*st->loudness = pow(AMP_SCALE*st->agc_level,LOUDNESS_EXP);*/
516 st->loudness = 1e-15;
519 st->max_increase_step = exp(0.11513f * 12.*st->frame_size / st->sampling_rate);
520 st->max_decrease_step = exp(-0.11513f * 40.*st->frame_size / st->sampling_rate);
521 st->prev_loudness = 1;
526 st->fft_lookup = spx_fft_init(2*N);
533 EXPORT void speex_preprocess_state_destroy(SpeexPreprocessState *st)
535 speex_free(st->frame);
538 speex_free(st->gain2);
539 speex_free(st->gain_floor);
540 speex_free(st->window);
541 speex_free(st->noise);
542 speex_free(st->reverb_estimate);
543 speex_free(st->old_ps);
544 speex_free(st->gain);
545 speex_free(st->prior);
546 speex_free(st->post);
548 speex_free(st->loudness_weight);
550 speex_free(st->echo_noise);
551 speex_free(st->residual_echo);
554 speex_free(st->Smin);
555 speex_free(st->Stmp);
556 speex_free(st->update_prob);
557 speex_free(st->zeta);
559 speex_free(st->inbuf);
560 speex_free(st->outbuf);
562 spx_fft_destroy(st->fft_lookup);
563 filterbank_destroy(st->bank);
567 /* FIXME: The AGC doesn't work yet with fixed-point*/
569 static void speex_compute_agc(SpeexPreprocessState *st, spx_word16_t Pframe, spx_word16_t *ft)
579 loudness += 2.f*N*st->ps[i]* st->loudness_weight[i];
581 loudness=sqrt(loudness);
582 /*if (loudness < 2*pow(st->loudness, 1.0/LOUDNESS_EXP) &&
583 loudness*2 > pow(st->loudness, 1.0/LOUDNESS_EXP))*/
586 /*rate=2.0f*Pframe*Pframe/(1+st->nb_loudness_adapt);*/
587 rate = .03*Pframe*Pframe;
588 st->loudness = (1-rate)*st->loudness + (rate)*pow(AMP_SCALE*loudness, LOUDNESS_EXP);
589 st->loudness_accum = (1-rate)*st->loudness_accum + rate;
590 if (st->init_max < st->max_gain && st->nb_adapt > 20)
591 st->init_max *= 1.f + .1f*Pframe*Pframe;
593 /*printf ("%f %f %f %f\n", Pframe, loudness, pow(st->loudness, 1.0f/LOUDNESS_EXP), st->loudness2);*/
595 target_gain = AMP_SCALE*st->agc_level*pow(st->loudness/(1e-4+st->loudness_accum), -1.0f/LOUDNESS_EXP);
597 if ((Pframe>.5 && st->nb_adapt > 20) || target_gain < st->agc_gain)
599 if (target_gain > st->max_increase_step*st->agc_gain)
600 target_gain = st->max_increase_step*st->agc_gain;
601 if (target_gain < st->max_decrease_step*st->agc_gain && loudness < 10*st->prev_loudness)
602 target_gain = st->max_decrease_step*st->agc_gain;
603 if (target_gain > st->max_gain)
604 target_gain = st->max_gain;
605 if (target_gain > st->init_max)
606 target_gain = st->init_max;
608 st->agc_gain = target_gain;
610 /*fprintf (stderr, "%f %f %f\n", loudness, (float)AMP_SCALE_1*pow(st->loudness, 1.0f/LOUDNESS_EXP), st->agc_gain);*/
613 ft[i] *= st->agc_gain;
614 st->prev_loudness = loudness;
618 static void preprocess_analysis(SpeexPreprocessState *st, spx_int16_t *x)
622 int N3 = 2*N - st->frame_size;
623 int N4 = st->frame_size - N3;
624 spx_word32_t *ps=st->ps;
626 /* 'Build' input frame */
628 st->frame[i]=st->inbuf[i];
629 for (i=0;i<st->frame_size;i++)
630 st->frame[N3+i]=x[i];
634 st->inbuf[i]=x[N4+i];
638 st->frame[i] = MULT16_16_Q15(st->frame[i], st->window[i]);
642 spx_word16_t max_val=0;
644 max_val = MAX16(max_val, ABS16(st->frame[i]));
645 st->frame_shift = 14-spx_ilog2(EXTEND32(max_val));
647 st->frame[i] = SHL16(st->frame[i], st->frame_shift);
652 spx_fft(st->fft_lookup, st->frame, st->ft);
655 ps[0]=MULT16_16(st->ft[0],st->ft[0]);
657 ps[i]=MULT16_16(st->ft[2*i-1],st->ft[2*i-1]) + MULT16_16(st->ft[2*i],st->ft[2*i]);
659 st->ps[i] = PSHR32(st->ps[i], 2*st->frame_shift);
661 filterbank_compute_bank32(st->bank, ps, ps+N);
664 static void update_noise_prob(SpeexPreprocessState *st)
671 st->S[i] = MULT16_32_Q15(QCONST16(.8f,15),st->S[i]) + MULT16_32_Q15(QCONST16(.05f,15),st->ps[i-1])
672 + MULT16_32_Q15(QCONST16(.1f,15),st->ps[i]) + MULT16_32_Q15(QCONST16(.05f,15),st->ps[i+1]);
673 st->S[0] = MULT16_32_Q15(QCONST16(.8f,15),st->S[0]) + MULT16_32_Q15(QCONST16(.2f,15),st->ps[0]);
674 st->S[N-1] = MULT16_32_Q15(QCONST16(.8f,15),st->S[N-1]) + MULT16_32_Q15(QCONST16(.2f,15),st->ps[N-1]);
679 st->Smin[i] = st->Stmp[i] = 0;
682 if (st->nb_adapt < 100)
684 else if (st->nb_adapt < 1000)
686 else if (st->nb_adapt < 10000)
690 if (st->min_count > min_range)
695 st->Smin[i] = MIN32(st->Stmp[i], st->S[i]);
696 st->Stmp[i] = st->S[i];
701 st->Smin[i] = MIN32(st->Smin[i], st->S[i]);
702 st->Stmp[i] = MIN32(st->Stmp[i], st->S[i]);
707 if (MULT16_32_Q15(QCONST16(.4f,15),st->S[i]) > st->Smin[i])
708 st->update_prob[i] = 1;
710 st->update_prob[i] = 0;
711 /*fprintf (stderr, "%f ", st->S[i]/st->Smin[i]);*/
712 /*fprintf (stderr, "%f ", st->update_prob[i]);*/
717 #define NOISE_OVERCOMPENS 1.
719 void speex_echo_get_residual(SpeexEchoState *st, spx_word32_t *Yout, int len);
721 EXPORT int speex_preprocess(SpeexPreprocessState *st, spx_int16_t *x, spx_int32_t *echo)
723 return speex_preprocess_run(st, x);
726 EXPORT int speex_preprocess_run(SpeexPreprocessState *st, spx_int16_t *x)
731 int N3 = 2*N - st->frame_size;
732 int N4 = st->frame_size - N3;
733 spx_word32_t *ps=st->ps;
736 spx_word16_t beta, beta_1;
737 spx_word16_t effective_echo_suppress;
740 if (st->nb_adapt>20000)
741 st->nb_adapt = 20000;
744 beta = MAX16(QCONST16(.03,15),DIV32_16(Q15_ONE,st->nb_adapt));
745 beta_1 = Q15_ONE-beta;
747 /* Deal with residual echo if provided */
750 speex_echo_get_residual(st->echo_state, st->residual_echo, N);
752 /* If there are NaNs or ridiculous values, it'll show up in the DC and we just reset everything to zero */
753 if (!(st->residual_echo[0] >=0 && st->residual_echo[0]<N*1e9f))
756 st->residual_echo[i] = 0;
760 st->echo_noise[i] = MAX32(MULT16_32_Q15(QCONST16(.6f,15),st->echo_noise[i]), st->residual_echo[i]);
761 filterbank_compute_bank32(st->bank, st->echo_noise, st->echo_noise+N);
764 st->echo_noise[i] = 0;
766 preprocess_analysis(st, x);
768 update_noise_prob(st);
770 /* Noise estimation always updated for the 10 first frames */
771 /*if (st->nb_adapt<10)
774 st->update_prob[i] = 0;
778 /* Update the noise estimate for the frequencies where it can be */
781 if (!st->update_prob[i] || st->ps[i] < PSHR32(st->noise[i], NOISE_SHIFT))
782 st->noise[i] = MAX32(EXTEND32(0),MULT16_32_Q15(beta_1,st->noise[i]) + MULT16_32_Q15(beta,SHL32(st->ps[i],NOISE_SHIFT)));
784 filterbank_compute_bank32(st->bank, st->noise, st->noise+N);
786 /* Special case for first frame */
789 st->old_ps[i] = ps[i];
791 /* Compute a posteriori SNR */
796 /* Total noise estimate including residual echo and reverberation */
797 spx_word32_t tot_noise = ADD32(ADD32(ADD32(EXTEND32(1), PSHR32(st->noise[i],NOISE_SHIFT)) , st->echo_noise[i]) , st->reverb_estimate[i]);
799 /* A posteriori SNR = ps/noise - 1*/
800 st->post[i] = SUB16(DIV32_16_Q8(ps[i],tot_noise), QCONST16(1.f,SNR_SHIFT));
801 st->post[i]=MIN16(st->post[i], QCONST16(100.f,SNR_SHIFT));
803 /* Computing update gamma = .1 + .9*(old/(old+noise))^2 */
804 gamma = QCONST16(.1f,15)+MULT16_16_Q15(QCONST16(.89f,15),SQR16_Q15(DIV32_16_Q15(st->old_ps[i],ADD32(st->old_ps[i],tot_noise))));
806 /* A priori SNR update = gamma*max(0,post) + (1-gamma)*old/noise */
807 st->prior[i] = EXTRACT16(PSHR32(ADD32(MULT16_16(gamma,MAX16(0,st->post[i])), MULT16_16(Q15_ONE-gamma,DIV32_16_Q8(st->old_ps[i],tot_noise))), 15));
808 st->prior[i]=MIN16(st->prior[i], QCONST16(100.f,SNR_SHIFT));
811 /*print_vec(st->post, N+M, "");*/
813 /* Recursive average of the a priori SNR. A bit smoothed for the psd components */
814 st->zeta[0] = PSHR32(ADD32(MULT16_16(QCONST16(.7f,15),st->zeta[0]), MULT16_16(QCONST16(.3f,15),st->prior[0])),15);
816 st->zeta[i] = PSHR32(ADD32(ADD32(ADD32(MULT16_16(QCONST16(.7f,15),st->zeta[i]), MULT16_16(QCONST16(.15f,15),st->prior[i])),
817 MULT16_16(QCONST16(.075f,15),st->prior[i-1])), MULT16_16(QCONST16(.075f,15),st->prior[i+1])),15);
818 for (i=N-1;i<N+M;i++)
819 st->zeta[i] = PSHR32(ADD32(MULT16_16(QCONST16(.7f,15),st->zeta[i]), MULT16_16(QCONST16(.3f,15),st->prior[i])),15);
821 /* Speech probability of presence for the entire frame is based on the average filterbank a priori SNR */
824 Zframe = ADD32(Zframe, EXTEND32(st->zeta[i]));
825 Pframe = QCONST16(.1f,15)+MULT16_16_Q15(QCONST16(.899f,15),qcurve(DIV32_16(Zframe,st->nbands)));
827 effective_echo_suppress = EXTRACT16(PSHR32(ADD32(MULT16_16(SUB16(Q15_ONE,Pframe), st->echo_suppress), MULT16_16(Pframe, st->echo_suppress_active)),15));
829 compute_gain_floor(st->noise_suppress, effective_echo_suppress, st->noise+N, st->echo_noise+N, st->gain_floor+N, M);
831 /* Compute Ephraim & Malah gain speech probability of presence for each critical band (Bark scale)
832 Technically this is actually wrong because the EM gaim assumes a slightly different probability
836 /* See EM and Cohen papers*/
838 /* Gain from hypergeometric function */
840 /* Weiner filter gain */
841 spx_word16_t prior_ratio;
842 /* a priority probability of speech presence based on Bark sub-band alone */
844 /* Speech absence a priori probability (considering sub-band and frame) */
850 prior_ratio = PDIV32_16(SHL32(EXTEND32(st->prior[i]), 15), ADD16(st->prior[i], SHL32(1,SNR_SHIFT)));
851 theta = MULT16_32_P15(prior_ratio, QCONST32(1.f,EXPIN_SHIFT)+SHL32(EXTEND32(st->post[i]),EXPIN_SHIFT-SNR_SHIFT));
853 MM = hypergeom_gain(theta);
854 /* Gain with bound */
855 st->gain[i] = EXTRACT16(MIN32(Q15_ONE, MULT16_32_Q15(prior_ratio, MM)));
856 /* Save old Bark power spectrum */
857 st->old_ps[i] = MULT16_32_P15(QCONST16(.2f,15),st->old_ps[i]) + MULT16_32_P15(MULT16_16_P15(QCONST16(.8f,15),SQR16_Q15(st->gain[i])),ps[i]);
859 P1 = QCONST16(.199f,15)+MULT16_16_Q15(QCONST16(.8f,15),qcurve (st->zeta[i]));
860 q = Q15_ONE-MULT16_16_Q15(Pframe,P1);
862 theta = MIN32(theta, EXTEND32(32767));
863 /*Q8*/tmp = MULT16_16_Q15((SHL32(1,SNR_SHIFT)+st->prior[i]),EXTRACT16(MIN32(Q15ONE,SHR32(spx_exp(-EXTRACT16(theta)),1))));
864 tmp = MIN16(QCONST16(3.,SNR_SHIFT), tmp); /* Prevent overflows in the next line*/
865 /*Q8*/tmp = EXTRACT16(PSHR32(MULT16_16(PDIV32_16(SHL32(EXTEND32(q),8),(Q15_ONE-q)),tmp),8));
866 st->gain2[i]=DIV32_16(SHL32(EXTEND32(32767),SNR_SHIFT), ADD16(256,tmp));
868 st->gain2[i]=1/(1.f + (q/(1.f-q))*(1+st->prior[i])*exp(-theta));
871 /* Convert the EM gains and speech prob to linear frequency */
872 filterbank_compute_psd16(st->bank,st->gain2+N, st->gain2);
873 filterbank_compute_psd16(st->bank,st->gain+N, st->gain);
875 /* Use 1 for linear gain resolution (best) or 0 for Bark gain resolution (faster) */
878 filterbank_compute_psd16(st->bank,st->gain_floor+N, st->gain_floor);
880 /* Compute gain according to the Ephraim-Malah algorithm -- linear frequency */
885 spx_word16_t prior_ratio;
890 /* Wiener filter gain */
891 prior_ratio = PDIV32_16(SHL32(EXTEND32(st->prior[i]), 15), ADD16(st->prior[i], SHL32(1,SNR_SHIFT)));
892 theta = MULT16_32_P15(prior_ratio, QCONST32(1.f,EXPIN_SHIFT)+SHL32(EXTEND32(st->post[i]),EXPIN_SHIFT-SNR_SHIFT));
894 /* Optimal estimator for loudness domain */
895 MM = hypergeom_gain(theta);
896 /* EM gain with bound */
897 g = EXTRACT16(MIN32(Q15_ONE, MULT16_32_Q15(prior_ratio, MM)));
898 /* Interpolated speech probability of presence */
901 /* Constrain the gain to be close to the Bark scale gain */
902 if (MULT16_16_Q15(QCONST16(.333f,15),g) > st->gain[i])
903 g = MULT16_16(3,st->gain[i]);
906 /* Save old power spectrum */
907 st->old_ps[i] = MULT16_32_P15(QCONST16(.2f,15),st->old_ps[i]) + MULT16_32_P15(MULT16_16_P15(QCONST16(.8f,15),SQR16_Q15(st->gain[i])),ps[i]);
909 /* Apply gain floor */
910 if (st->gain[i] < st->gain_floor[i])
911 st->gain[i] = st->gain_floor[i];
913 /* Exponential decay model for reverberation (unused) */
914 /*st->reverb_estimate[i] = st->reverb_decay*st->reverb_estimate[i] + st->reverb_decay*st->reverb_level*st->gain[i]*st->gain[i]*st->ps[i];*/
916 /* Take into account speech probability of presence (loudness domain MMSE estimator) */
917 /* gain2 = [p*sqrt(gain)+(1-p)*sqrt(gain _floor) ]^2 */
918 tmp = MULT16_16_P15(p,spx_sqrt(SHL32(EXTEND32(st->gain[i]),15))) + MULT16_16_P15(SUB16(Q15_ONE,p),spx_sqrt(SHL32(EXTEND32(st->gain_floor[i]),15)));
919 st->gain2[i]=SQR16_Q15(tmp);
921 /* Use this if you want a log-domain MMSE estimator instead */
922 /*st->gain2[i] = pow(st->gain[i], p) * pow(st->gain_floor[i],1.f-p);*/
928 spx_word16_t p = st->gain2[i];
929 st->gain[i] = MAX16(st->gain[i], st->gain_floor[i]);
930 tmp = MULT16_16_P15(p,spx_sqrt(SHL32(EXTEND32(st->gain[i]),15))) + MULT16_16_P15(SUB16(Q15_ONE,p),spx_sqrt(SHL32(EXTEND32(st->gain_floor[i]),15)));
931 st->gain2[i]=SQR16_Q15(tmp);
933 filterbank_compute_psd16(st->bank,st->gain2+N, st->gain2);
936 /* If noise suppression is off, don't apply the gain (but then why call this in the first place!) */
937 if (!st->denoise_enabled)
940 st->gain2[i]=Q15_ONE;
943 /* Apply computed gain */
946 st->ft[2*i-1] = MULT16_16_P15(st->gain2[i],st->ft[2*i-1]);
947 st->ft[2*i] = MULT16_16_P15(st->gain2[i],st->ft[2*i]);
949 st->ft[0] = MULT16_16_P15(st->gain2[0],st->ft[0]);
950 st->ft[2*N-1] = MULT16_16_P15(st->gain2[N-1],st->ft[2*N-1]);
952 /*FIXME: This *will* not work for fixed-point */
955 speex_compute_agc(st, Pframe, st->ft);
958 /* Inverse FFT with 1/N scaling */
959 spx_ifft(st->fft_lookup, st->ft, st->frame);
960 /* Scale back to original (lower) amplitude */
962 st->frame[i] = PSHR16(st->frame[i], st->frame_shift);
964 /*FIXME: This *will* not work for fixed-point */
970 if (fabs(st->frame[i])>max_sample)
971 max_sample = fabs(st->frame[i]);
972 if (max_sample>28000.f)
974 float damp = 28000.f/max_sample;
976 st->frame[i] *= damp;
981 /* Synthesis window (for WOLA) */
983 st->frame[i] = MULT16_16_Q15(st->frame[i], st->window[i]);
985 /* Perform overlap and add */
987 x[i] = st->outbuf[i] + st->frame[i];
989 x[N3+i] = st->frame[N3+i];
993 st->outbuf[i] = st->frame[st->frame_size+i];
995 /* FIXME: This VAD is a kludge */
996 st->speech_prob = Pframe;
999 if (st->speech_prob > st->speech_prob_start || (st->was_speech && st->speech_prob > st->speech_prob_continue))
1013 EXPORT void speex_preprocess_estimate_update(SpeexPreprocessState *st, spx_int16_t *x)
1016 int N = st->ps_size;
1017 int N3 = 2*N - st->frame_size;
1019 spx_word32_t *ps=st->ps;
1024 preprocess_analysis(st, x);
1026 update_noise_prob(st);
1030 if (!st->update_prob[i] || st->ps[i] < PSHR32(st->noise[i],NOISE_SHIFT))
1032 st->noise[i] = MULT16_32_Q15(QCONST16(.95f,15),st->noise[i]) + MULT16_32_Q15(QCONST16(.05f,15),SHL32(st->ps[i],NOISE_SHIFT));
1037 st->outbuf[i] = MULT16_16_Q15(x[st->frame_size-N3+i],st->window[st->frame_size+i]);
1039 /* Save old power spectrum */
1041 st->old_ps[i] = ps[i];
1044 st->reverb_estimate[i] = MULT16_32_Q15(st->reverb_decay, st->reverb_estimate[i]);
1048 EXPORT int speex_preprocess_ctl(SpeexPreprocessState *state, int request, void *ptr)
1051 SpeexPreprocessState *st;
1052 st=(SpeexPreprocessState*)state;
1055 case SPEEX_PREPROCESS_SET_DENOISE:
1056 st->denoise_enabled = (*(spx_int32_t*)ptr);
1058 case SPEEX_PREPROCESS_GET_DENOISE:
1059 (*(spx_int32_t*)ptr) = st->denoise_enabled;
1062 case SPEEX_PREPROCESS_SET_AGC:
1063 st->agc_enabled = (*(spx_int32_t*)ptr);
1065 case SPEEX_PREPROCESS_GET_AGC:
1066 (*(spx_int32_t*)ptr) = st->agc_enabled;
1068 #ifndef DISABLE_FLOAT_API
1069 case SPEEX_PREPROCESS_SET_AGC_LEVEL:
1070 st->agc_level = (*(float*)ptr);
1071 if (st->agc_level<1)
1073 if (st->agc_level>32768)
1074 st->agc_level=32768;
1076 case SPEEX_PREPROCESS_GET_AGC_LEVEL:
1077 (*(float*)ptr) = st->agc_level;
1079 #endif /* #ifndef DISABLE_FLOAT_API */
1080 case SPEEX_PREPROCESS_SET_AGC_INCREMENT:
1081 st->max_increase_step = exp(0.11513f * (*(spx_int32_t*)ptr)*st->frame_size / st->sampling_rate);
1083 case SPEEX_PREPROCESS_GET_AGC_INCREMENT:
1084 (*(spx_int32_t*)ptr) = floor(.5+8.6858*log(st->max_increase_step)*st->sampling_rate/st->frame_size);
1086 case SPEEX_PREPROCESS_SET_AGC_DECREMENT:
1087 st->max_decrease_step = exp(0.11513f * (*(spx_int32_t*)ptr)*st->frame_size / st->sampling_rate);
1089 case SPEEX_PREPROCESS_GET_AGC_DECREMENT:
1090 (*(spx_int32_t*)ptr) = floor(.5+8.6858*log(st->max_decrease_step)*st->sampling_rate/st->frame_size);
1092 case SPEEX_PREPROCESS_SET_AGC_MAX_GAIN:
1093 st->max_gain = exp(0.11513f * (*(spx_int32_t*)ptr));
1095 case SPEEX_PREPROCESS_GET_AGC_MAX_GAIN:
1096 (*(spx_int32_t*)ptr) = floor(.5+8.6858*log(st->max_gain));
1099 case SPEEX_PREPROCESS_SET_VAD:
1100 speex_warning("The VAD has been replaced by a hack pending a complete rewrite");
1101 st->vad_enabled = (*(spx_int32_t*)ptr);
1103 case SPEEX_PREPROCESS_GET_VAD:
1104 (*(spx_int32_t*)ptr) = st->vad_enabled;
1107 case SPEEX_PREPROCESS_SET_DEREVERB:
1108 st->dereverb_enabled = (*(spx_int32_t*)ptr);
1109 for (i=0;i<st->ps_size;i++)
1110 st->reverb_estimate[i]=0;
1112 case SPEEX_PREPROCESS_GET_DEREVERB:
1113 (*(spx_int32_t*)ptr) = st->dereverb_enabled;
1116 case SPEEX_PREPROCESS_SET_DEREVERB_LEVEL:
1117 /* FIXME: Re-enable when de-reverberation is actually enabled again */
1118 /*st->reverb_level = (*(float*)ptr);*/
1120 case SPEEX_PREPROCESS_GET_DEREVERB_LEVEL:
1121 /* FIXME: Re-enable when de-reverberation is actually enabled again */
1122 /*(*(float*)ptr) = st->reverb_level;*/
1125 case SPEEX_PREPROCESS_SET_DEREVERB_DECAY:
1126 /* FIXME: Re-enable when de-reverberation is actually enabled again */
1127 /*st->reverb_decay = (*(float*)ptr);*/
1129 case SPEEX_PREPROCESS_GET_DEREVERB_DECAY:
1130 /* FIXME: Re-enable when de-reverberation is actually enabled again */
1131 /*(*(float*)ptr) = st->reverb_decay;*/
1134 case SPEEX_PREPROCESS_SET_PROB_START:
1135 *(spx_int32_t*)ptr = MIN32(100,MAX32(0, *(spx_int32_t*)ptr));
1136 st->speech_prob_start = DIV32_16(MULT16_16(Q15ONE,*(spx_int32_t*)ptr), 100);
1138 case SPEEX_PREPROCESS_GET_PROB_START:
1139 (*(spx_int32_t*)ptr) = MULT16_16_Q15(st->speech_prob_start, 100);
1142 case SPEEX_PREPROCESS_SET_PROB_CONTINUE:
1143 *(spx_int32_t*)ptr = MIN32(100,MAX32(0, *(spx_int32_t*)ptr));
1144 st->speech_prob_continue = DIV32_16(MULT16_16(Q15ONE,*(spx_int32_t*)ptr), 100);
1146 case SPEEX_PREPROCESS_GET_PROB_CONTINUE:
1147 (*(spx_int32_t*)ptr) = MULT16_16_Q15(st->speech_prob_continue, 100);
1150 case SPEEX_PREPROCESS_SET_NOISE_SUPPRESS:
1151 st->noise_suppress = -ABS(*(spx_int32_t*)ptr);
1153 case SPEEX_PREPROCESS_GET_NOISE_SUPPRESS:
1154 (*(spx_int32_t*)ptr) = st->noise_suppress;
1156 case SPEEX_PREPROCESS_SET_ECHO_SUPPRESS:
1157 st->echo_suppress = -ABS(*(spx_int32_t*)ptr);
1159 case SPEEX_PREPROCESS_GET_ECHO_SUPPRESS:
1160 (*(spx_int32_t*)ptr) = st->echo_suppress;
1162 case SPEEX_PREPROCESS_SET_ECHO_SUPPRESS_ACTIVE:
1163 st->echo_suppress_active = -ABS(*(spx_int32_t*)ptr);
1165 case SPEEX_PREPROCESS_GET_ECHO_SUPPRESS_ACTIVE:
1166 (*(spx_int32_t*)ptr) = st->echo_suppress_active;
1168 case SPEEX_PREPROCESS_SET_ECHO_STATE:
1169 st->echo_state = (SpeexEchoState*)ptr;
1171 case SPEEX_PREPROCESS_GET_ECHO_STATE:
1172 (*(SpeexEchoState**)ptr) = (SpeexEchoState*)st->echo_state;
1175 case SPEEX_PREPROCESS_GET_AGC_LOUDNESS:
1176 (*(spx_int32_t*)ptr) = pow(st->loudness, 1.0/LOUDNESS_EXP);
1178 case SPEEX_PREPROCESS_GET_AGC_GAIN:
1179 (*(spx_int32_t*)ptr) = floor(.5+8.6858*log(st->agc_gain));
1182 case SPEEX_PREPROCESS_GET_PSD_SIZE:
1183 case SPEEX_PREPROCESS_GET_NOISE_PSD_SIZE:
1184 (*(spx_int32_t*)ptr) = st->ps_size;
1186 case SPEEX_PREPROCESS_GET_PSD:
1187 for(i=0;i<st->ps_size;i++)
1188 ((spx_int32_t *)ptr)[i] = (spx_int32_t) st->ps[i];
1190 case SPEEX_PREPROCESS_GET_NOISE_PSD:
1191 for(i=0;i<st->ps_size;i++)
1192 ((spx_int32_t *)ptr)[i] = (spx_int32_t) PSHR32(st->noise[i], NOISE_SHIFT);
1194 case SPEEX_PREPROCESS_GET_PROB:
1195 (*(spx_int32_t*)ptr) = MULT16_16_Q15(st->speech_prob, 100);
1198 case SPEEX_PREPROCESS_SET_AGC_TARGET:
1199 st->agc_level = (*(spx_int32_t*)ptr);
1200 if (st->agc_level<1)
1202 if (st->agc_level>32768)
1203 st->agc_level=32768;
1205 case SPEEX_PREPROCESS_GET_AGC_TARGET:
1206 (*(spx_int32_t*)ptr) = st->agc_level;
1210 speex_warning_int("Unknown speex_preprocess_ctl request: ", request);
1217 long long spx_mips=0;