1 /* Copyright (C) 2003-2008 Jean-Marc Valin
4 Echo canceller based on the MDF algorithm (see below)
6 Redistribution and use in source and binary forms, with or without
7 modification, are permitted provided that the following conditions are
10 1. Redistributions of source code must retain the above copyright notice,
11 this list of conditions and the following disclaimer.
13 2. Redistributions in binary form must reproduce the above copyright
14 notice, this list of conditions and the following disclaimer in the
15 documentation and/or other materials provided with the distribution.
17 3. The name of the author may not be used to endorse or promote products
18 derived from this software without specific prior written permission.
20 THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
21 IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
22 OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
23 DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
24 INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
25 (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
26 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
27 HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
28 STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
29 ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
30 POSSIBILITY OF SUCH DAMAGE.
34 The echo canceller is based on the MDF algorithm described in:
36 J. S. Soo, K. K. Pang Multidelay block frequency adaptive filter,
37 IEEE Trans. Acoust. Speech Signal Process., Vol. ASSP-38, No. 2,
40 We use the Alternatively Updated MDF (AUMDF) variant. Robustness to
41 double-talk is achieved using a variable learning rate as described in:
43 Valin, J.-M., On Adjusting the Learning Rate in Frequency Domain Echo
44 Cancellation With Double-Talk. IEEE Transactions on Audio,
45 Speech and Language Processing, Vol. 15, No. 3, pp. 1030-1034, 2007.
46 http://people.xiph.org/~jm/papers/valin_taslp2006.pdf
48 There is no explicit double-talk detection, but a continuous variation
49 in the learning rate based on residual echo, double-talk and background
52 About the fixed-point version:
53 All the signals are represented with 16-bit words. The filter weights
54 are represented with 32-bit words, but only the top 16 bits are used
55 in most cases. The lower 16 bits are completely unreliable (due to the
56 fact that the update is done only on the top bits), but help in the
57 adaptation -- probably by removing a "threshold effect" due to
58 quantization (rounding going to zero) when the gradient is small.
60 Another kludge that seems to work good: when performing the weight
61 update, we only move half the way toward the "goal" this seems to
62 reduce the effect of quantization noise in the update phase. This
63 can be seen as applying a gradient descent on a "soft constraint"
64 instead of having a hard constraint.
73 #include "speex/speex_echo.h"
75 #include "pseudofloat.h"
76 #include "math_approx.h"
77 #include "os_support.h"
80 #define M_PI 3.14159265358979323846
84 #define WEIGHT_SHIFT 11
85 #define NORMALIZE_SCALEDOWN 5
86 #define NORMALIZE_SCALEUP 3
88 #define WEIGHT_SHIFT 0
92 #define WORD2INT(x) ((x) < -32767 ? -32768 : ((x) > 32766 ? 32767 : (x)))
94 #define WORD2INT(x) ((x) < -32767.5f ? -32768 : ((x) > 32766.5f ? 32767 : floor(.5+(x))))
97 /* If enabled, the AEC will use a foreground filter and a background filter to be more robust to double-talk
98 and difficult signals in general. The cost is an extra FFT and a matrix-vector multiply */
102 static const spx_float_t MIN_LEAK = {20972, -22};
104 /* Constants for the two-path filter */
105 static const spx_float_t VAR1_SMOOTH = {23593, -16};
106 static const spx_float_t VAR2_SMOOTH = {23675, -15};
107 static const spx_float_t VAR1_UPDATE = {16384, -15};
108 static const spx_float_t VAR2_UPDATE = {16384, -16};
109 static const spx_float_t VAR_BACKTRACK = {16384, -12};
110 #define TOP16(x) ((x)>>16)
114 static const spx_float_t MIN_LEAK = .005f;
116 /* Constants for the two-path filter */
117 static const spx_float_t VAR1_SMOOTH = .36f;
118 static const spx_float_t VAR2_SMOOTH = .7225f;
119 static const spx_float_t VAR1_UPDATE = .5f;
120 static const spx_float_t VAR2_UPDATE = .25f;
121 static const spx_float_t VAR_BACKTRACK = 4.f;
126 #define PLAYBACK_DELAY 2
128 void speex_echo_get_residual(SpeexEchoState *st, spx_word32_t *Yout, int len);
131 /** Speex echo cancellation state. */
132 struct SpeexEchoState_ {
133 int frame_size; /**< Number of samples processed each time */
140 int C; /** Number of input channels (microphones) */
141 int K; /** Number of output channels (loudspeakers) */
142 spx_int32_t sampling_rate;
143 spx_word16_t spec_average;
145 spx_word16_t beta_max;
146 spx_word32_t sum_adapt;
147 spx_word16_t leak_estimate;
149 spx_word16_t *e; /* scratch */
150 spx_word16_t *x; /* Far-end input buffer (2N) */
151 spx_word16_t *X; /* Far-end buffer (M+1 frames) in frequency domain */
152 spx_word16_t *input; /* scratch */
153 spx_word16_t *y; /* scratch */
154 spx_word16_t *last_y;
155 spx_word16_t *Y; /* scratch */
157 spx_word32_t *PHI; /* scratch */
158 spx_word32_t *W; /* (Background) filter weights */
160 spx_word16_t *foreground; /* Foreground filter weights */
161 spx_word32_t Davg1; /* 1st recursive average of the residual power difference */
162 spx_word32_t Davg2; /* 2nd recursive average of the residual power difference */
163 spx_float_t Dvar1; /* Estimated variance of 1st estimator */
164 spx_float_t Dvar2; /* Estimated variance of 2nd estimator */
166 spx_word32_t *power; /* Power of the far-end signal */
167 spx_float_t *power_1;/* Inverse power of far-end */
168 spx_word16_t *wtmp; /* scratch */
170 spx_word16_t *wtmp2; /* scratch */
172 spx_word32_t *Rf; /* scratch */
173 spx_word32_t *Yf; /* scratch */
174 spx_word32_t *Xf; /* scratch */
179 spx_word16_t *window;
182 spx_word16_t *memX, *memD, *memE;
183 spx_word16_t preemph;
184 spx_word16_t notch_radius;
185 spx_mem_t *notch_mem;
187 /* NOTE: If you only use speex_echo_cancel() and want to save some memory, remove this */
188 spx_int16_t *play_buf;
190 int play_buf_started;
193 static inline void filter_dc_notch16(const spx_int16_t *in, spx_word16_t radius, spx_word16_t *out, int len, spx_mem_t *mem, int stride)
198 den2 = MULT16_16_Q15(radius,radius) + MULT16_16_Q15(QCONST16(.7,15),MULT16_16_Q15(32767-radius,32767-radius));
200 den2 = radius*radius + .7*(1-radius)*(1-radius);
202 /*printf ("%d %d %d %d %d %d\n", num[0], num[1], num[2], den[0], den[1], den[2]);*/
205 spx_word16_t vin = in[i*stride];
206 spx_word32_t vout = mem[0] + SHL32(EXTEND32(vin),15);
208 mem[0] = mem[1] + SHL32(SHL32(-EXTEND32(vin),15) + MULT16_32_Q15(radius,vout),1);
210 mem[0] = mem[1] + 2*(-vin + radius*vout);
212 mem[1] = SHL32(EXTEND32(vin),15) - MULT16_32_Q15(den2,vout);
213 out[i] = SATURATE32(PSHR32(MULT16_32_Q15(radius,vout),15),32767);
217 /* This inner product is slightly different from the codec version because of fixed-point */
218 static inline spx_word32_t mdf_inner_prod(const spx_word16_t *x, const spx_word16_t *y, int len)
225 part = MAC16_16(part,*x++,*y++);
226 part = MAC16_16(part,*x++,*y++);
227 /* HINT: If you had a 40-bit accumulator, you could shift only at the end */
228 sum = ADD32(sum,SHR32(part,6));
233 /** Compute power spectrum of a half-complex (packed) vector */
234 static inline void power_spectrum(const spx_word16_t *X, spx_word32_t *ps, int N)
237 ps[0]=MULT16_16(X[0],X[0]);
238 for (i=1,j=1;i<N-1;i+=2,j++)
240 ps[j] = MULT16_16(X[i],X[i]) + MULT16_16(X[i+1],X[i+1]);
242 ps[j]=MULT16_16(X[i],X[i]);
245 /** Compute power spectrum of a half-complex (packed) vector and accumulate */
246 static inline void power_spectrum_accum(const spx_word16_t *X, spx_word32_t *ps, int N)
249 ps[0]+=MULT16_16(X[0],X[0]);
250 for (i=1,j=1;i<N-1;i+=2,j++)
252 ps[j] += MULT16_16(X[i],X[i]) + MULT16_16(X[i+1],X[i+1]);
254 ps[j]+=MULT16_16(X[i],X[i]);
257 /** Compute cross-power spectrum of a half-complex (packed) vectors and add to acc */
259 static inline void spectral_mul_accum(const spx_word16_t *X, const spx_word32_t *Y, spx_word16_t *acc, int N, int M)
262 spx_word32_t tmp1=0,tmp2=0;
265 tmp1 = MAC16_16(tmp1, X[j*N],TOP16(Y[j*N]));
267 acc[0] = PSHR32(tmp1,WEIGHT_SHIFT);
273 tmp1 = SUB32(MAC16_16(tmp1, X[j*N+i],TOP16(Y[j*N+i])), MULT16_16(X[j*N+i+1],TOP16(Y[j*N+i+1])));
274 tmp2 = MAC16_16(MAC16_16(tmp2, X[j*N+i+1],TOP16(Y[j*N+i])), X[j*N+i], TOP16(Y[j*N+i+1]));
276 acc[i] = PSHR32(tmp1,WEIGHT_SHIFT);
277 acc[i+1] = PSHR32(tmp2,WEIGHT_SHIFT);
282 tmp1 = MAC16_16(tmp1, X[(j+1)*N-1],TOP16(Y[(j+1)*N-1]));
284 acc[N-1] = PSHR32(tmp1,WEIGHT_SHIFT);
286 static inline void spectral_mul_accum16(const spx_word16_t *X, const spx_word16_t *Y, spx_word16_t *acc, int N, int M)
289 spx_word32_t tmp1=0,tmp2=0;
292 tmp1 = MAC16_16(tmp1, X[j*N],Y[j*N]);
294 acc[0] = PSHR32(tmp1,WEIGHT_SHIFT);
300 tmp1 = SUB32(MAC16_16(tmp1, X[j*N+i],Y[j*N+i]), MULT16_16(X[j*N+i+1],Y[j*N+i+1]));
301 tmp2 = MAC16_16(MAC16_16(tmp2, X[j*N+i+1],Y[j*N+i]), X[j*N+i], Y[j*N+i+1]);
303 acc[i] = PSHR32(tmp1,WEIGHT_SHIFT);
304 acc[i+1] = PSHR32(tmp2,WEIGHT_SHIFT);
309 tmp1 = MAC16_16(tmp1, X[(j+1)*N-1],Y[(j+1)*N-1]);
311 acc[N-1] = PSHR32(tmp1,WEIGHT_SHIFT);
315 static inline void spectral_mul_accum(const spx_word16_t *X, const spx_word32_t *Y, spx_word16_t *acc, int N, int M)
325 acc[i] += (X[i]*Y[i] - X[i+1]*Y[i+1]);
326 acc[i+1] += (X[i+1]*Y[i] + X[i]*Y[i+1]);
333 #define spectral_mul_accum16 spectral_mul_accum
336 /** Compute weighted cross-power spectrum of a half-complex (packed) vector with conjugate */
337 static inline void weighted_spectral_mul_conj(const spx_float_t *w, const spx_float_t p, const spx_word16_t *X, const spx_word16_t *Y, spx_word32_t *prod, int N)
341 W = FLOAT_AMULT(p, w[0]);
342 prod[0] = FLOAT_MUL32(W,MULT16_16(X[0],Y[0]));
343 for (i=1,j=1;i<N-1;i+=2,j++)
345 W = FLOAT_AMULT(p, w[j]);
346 prod[i] = FLOAT_MUL32(W,MAC16_16(MULT16_16(X[i],Y[i]), X[i+1],Y[i+1]));
347 prod[i+1] = FLOAT_MUL32(W,MAC16_16(MULT16_16(-X[i+1],Y[i]), X[i],Y[i+1]));
349 W = FLOAT_AMULT(p, w[j]);
350 prod[i] = FLOAT_MUL32(W,MULT16_16(X[i],Y[i]));
353 static inline void mdf_adjust_prop(const spx_word32_t *W, int N, int M, int P, spx_word16_t *prop)
356 spx_word16_t max_sum = 1;
357 spx_word32_t prop_sum = 1;
360 spx_word32_t tmp = 1;
363 tmp += MULT16_16(EXTRACT16(SHR32(W[p*N*M + i*N+j],18)), EXTRACT16(SHR32(W[p*N*M + i*N+j],18)));
365 /* Just a security in case an overflow were to occur */
366 tmp = MIN32(ABS32(tmp), 536870912);
368 prop[i] = spx_sqrt(tmp);
369 if (prop[i] > max_sum)
374 prop[i] += MULT16_16_Q15(QCONST16(.1f,15),max_sum);
375 prop_sum += EXTEND32(prop[i]);
379 prop[i] = DIV32(MULT16_16(QCONST16(.99f,15), prop[i]),prop_sum);
380 /*printf ("%f ", prop[i]);*/
385 #ifdef DUMP_ECHO_CANCEL_DATA
387 static FILE *rFile=NULL, *pFile=NULL, *oFile=NULL;
389 static void dump_audio(const spx_int16_t *rec, const spx_int16_t *play, const spx_int16_t *out, int len)
391 if (!(rFile && pFile && oFile))
393 speex_fatal("Dump files not open");
395 fwrite(rec, sizeof(spx_int16_t), len, rFile);
396 fwrite(play, sizeof(spx_int16_t), len, pFile);
397 fwrite(out, sizeof(spx_int16_t), len, oFile);
401 /** Creates a new echo canceller state */
402 EXPORT SpeexEchoState *speex_echo_state_init(int frame_size, int filter_length)
404 return speex_echo_state_init_mc(frame_size, filter_length, 1, 1);
407 EXPORT SpeexEchoState *speex_echo_state_init_mc(int frame_size, int filter_length, int nb_mic, int nb_speakers)
410 SpeexEchoState *st = (SpeexEchoState *)speex_alloc(sizeof(SpeexEchoState));
416 #ifdef DUMP_ECHO_CANCEL_DATA
417 if (rFile || pFile || oFile)
418 speex_fatal("Opening dump files twice");
419 rFile = fopen("aec_rec.sw", "wb");
420 pFile = fopen("aec_play.sw", "wb");
421 oFile = fopen("aec_out.sw", "wb");
424 st->frame_size = frame_size;
425 st->window_size = 2*frame_size;
427 M = st->M = (filter_length+st->frame_size-1)/frame_size;
432 /* This is the default sampling rate */
433 st->sampling_rate = 8000;
434 st->spec_average = DIV32_16(SHL32(EXTEND32(st->frame_size), 15), st->sampling_rate);
436 st->beta0 = DIV32_16(SHL32(EXTEND32(st->frame_size), 16), st->sampling_rate);
437 st->beta_max = DIV32_16(SHL32(EXTEND32(st->frame_size), 14), st->sampling_rate);
439 st->beta0 = (2.0f*st->frame_size)/st->sampling_rate;
440 st->beta_max = (.5f*st->frame_size)/st->sampling_rate;
442 st->leak_estimate = 0;
444 st->fft_table = spx_fft_init(N);
446 st->e = (spx_word16_t*)speex_alloc(C*N*sizeof(spx_word16_t));
447 st->x = (spx_word16_t*)speex_alloc(K*N*sizeof(spx_word16_t));
448 st->input = (spx_word16_t*)speex_alloc(C*st->frame_size*sizeof(spx_word16_t));
449 st->y = (spx_word16_t*)speex_alloc(C*N*sizeof(spx_word16_t));
450 st->last_y = (spx_word16_t*)speex_alloc(C*N*sizeof(spx_word16_t));
451 st->Yf = (spx_word32_t*)speex_alloc((st->frame_size+1)*sizeof(spx_word32_t));
452 st->Rf = (spx_word32_t*)speex_alloc((st->frame_size+1)*sizeof(spx_word32_t));
453 st->Xf = (spx_word32_t*)speex_alloc((st->frame_size+1)*sizeof(spx_word32_t));
454 st->Yh = (spx_word32_t*)speex_alloc((st->frame_size+1)*sizeof(spx_word32_t));
455 st->Eh = (spx_word32_t*)speex_alloc((st->frame_size+1)*sizeof(spx_word32_t));
457 st->X = (spx_word16_t*)speex_alloc(K*(M+1)*N*sizeof(spx_word16_t));
458 st->Y = (spx_word16_t*)speex_alloc(C*N*sizeof(spx_word16_t));
459 st->E = (spx_word16_t*)speex_alloc(C*N*sizeof(spx_word16_t));
460 st->W = (spx_word32_t*)speex_alloc(C*K*M*N*sizeof(spx_word32_t));
462 st->foreground = (spx_word16_t*)speex_alloc(M*N*C*K*sizeof(spx_word16_t));
464 st->PHI = (spx_word32_t*)speex_alloc(N*sizeof(spx_word32_t));
465 st->power = (spx_word32_t*)speex_alloc((frame_size+1)*sizeof(spx_word32_t));
466 st->power_1 = (spx_float_t*)speex_alloc((frame_size+1)*sizeof(spx_float_t));
467 st->window = (spx_word16_t*)speex_alloc(N*sizeof(spx_word16_t));
468 st->prop = (spx_word16_t*)speex_alloc(M*sizeof(spx_word16_t));
469 st->wtmp = (spx_word16_t*)speex_alloc(N*sizeof(spx_word16_t));
471 st->wtmp2 = (spx_word16_t*)speex_alloc(N*sizeof(spx_word16_t));
474 st->window[i] = (16383-SHL16(spx_cos(DIV32_16(MULT16_16(25736,i<<1),N)),1));
475 st->window[N-i-1] = st->window[i];
479 st->window[i] = .5-.5*cos(2*M_PI*i/N);
481 for (i=0;i<=st->frame_size;i++)
482 st->power_1[i] = FLOAT_ONE;
483 for (i=0;i<N*M*K*C;i++)
486 spx_word32_t sum = 0;
487 /* Ratio of ~10 between adaptation rate of first and last block */
488 spx_word16_t decay = SHR32(spx_exp(NEG16(DIV32_16(QCONST16(2.4,11),M))),1);
489 st->prop[0] = QCONST16(.7, 15);
490 sum = EXTEND32(st->prop[0]);
493 st->prop[i] = MULT16_16_Q15(st->prop[i-1], decay);
494 sum = ADD32(sum, EXTEND32(st->prop[i]));
498 st->prop[i] = DIV32(MULT16_16(QCONST16(.8f,15), st->prop[i]),sum);
502 st->memX = (spx_word16_t*)speex_alloc(K*sizeof(spx_word16_t));
503 st->memD = (spx_word16_t*)speex_alloc(C*sizeof(spx_word16_t));
504 st->memE = (spx_word16_t*)speex_alloc(C*sizeof(spx_word16_t));
505 st->preemph = QCONST16(.9,15);
506 if (st->sampling_rate<12000)
507 st->notch_radius = QCONST16(.9, 15);
508 else if (st->sampling_rate<24000)
509 st->notch_radius = QCONST16(.982, 15);
511 st->notch_radius = QCONST16(.992, 15);
513 st->notch_mem = (spx_mem_t*)speex_alloc(2*C*sizeof(spx_mem_t));
515 st->Pey = st->Pyy = FLOAT_ONE;
518 st->Davg1 = st->Davg2 = 0;
519 st->Dvar1 = st->Dvar2 = FLOAT_ZERO;
522 st->play_buf = (spx_int16_t*)speex_alloc(K*(PLAYBACK_DELAY+1)*st->frame_size*sizeof(spx_int16_t));
523 st->play_buf_pos = PLAYBACK_DELAY*st->frame_size;
524 st->play_buf_started = 0;
529 /** Resets echo canceller state */
530 EXPORT void speex_echo_state_reset(SpeexEchoState *st)
543 st->foreground[i] = 0;
545 for (i=0;i<N*(M+1);i++)
547 for (i=0;i<=st->frame_size;i++)
550 st->power_1[i] = FLOAT_ONE;
554 for (i=0;i<st->frame_size;i++)
567 st->notch_mem[i] = 0;
569 st->memD[i]=st->memE[i]=0;
576 st->Pey = st->Pyy = FLOAT_ONE;
578 st->Davg1 = st->Davg2 = 0;
579 st->Dvar1 = st->Dvar2 = FLOAT_ZERO;
581 for (i=0;i<3*st->frame_size;i++)
583 st->play_buf_pos = PLAYBACK_DELAY*st->frame_size;
584 st->play_buf_started = 0;
588 /** Destroys an echo canceller state */
589 EXPORT void speex_echo_state_destroy(SpeexEchoState *st)
591 spx_fft_destroy(st->fft_table);
595 speex_free(st->input);
597 speex_free(st->last_y);
609 speex_free(st->foreground);
612 speex_free(st->power);
613 speex_free(st->power_1);
614 speex_free(st->window);
615 speex_free(st->prop);
616 speex_free(st->wtmp);
618 speex_free(st->wtmp2);
620 speex_free(st->memX);
621 speex_free(st->memD);
622 speex_free(st->memE);
623 speex_free(st->notch_mem);
625 speex_free(st->play_buf);
628 #ifdef DUMP_ECHO_CANCEL_DATA
632 rFile = pFile = oFile = NULL;
636 EXPORT void speex_echo_capture(SpeexEchoState *st, const spx_int16_t *rec, spx_int16_t *out)
639 /*speex_warning_int("capture with fill level ", st->play_buf_pos/st->frame_size);*/
640 st->play_buf_started = 1;
641 if (st->play_buf_pos>=st->frame_size)
643 speex_echo_cancellation(st, rec, st->play_buf, out);
644 st->play_buf_pos -= st->frame_size;
645 for (i=0;i<st->play_buf_pos;i++)
646 st->play_buf[i] = st->play_buf[i+st->frame_size];
648 speex_warning("No playback frame available (your application is buggy and/or got xruns)");
649 if (st->play_buf_pos!=0)
651 speex_warning("internal playback buffer corruption?");
652 st->play_buf_pos = 0;
654 for (i=0;i<st->frame_size;i++)
659 EXPORT void speex_echo_playback(SpeexEchoState *st, const spx_int16_t *play)
661 /*speex_warning_int("playback with fill level ", st->play_buf_pos/st->frame_size);*/
662 if (!st->play_buf_started)
664 speex_warning("discarded first playback frame");
667 if (st->play_buf_pos<=PLAYBACK_DELAY*st->frame_size)
670 for (i=0;i<st->frame_size;i++)
671 st->play_buf[st->play_buf_pos+i] = play[i];
672 st->play_buf_pos += st->frame_size;
673 if (st->play_buf_pos <= (PLAYBACK_DELAY-1)*st->frame_size)
675 speex_warning("Auto-filling the buffer (your application is buggy and/or got xruns)");
676 for (i=0;i<st->frame_size;i++)
677 st->play_buf[st->play_buf_pos+i] = play[i];
678 st->play_buf_pos += st->frame_size;
681 speex_warning("Had to discard a playback frame (your application is buggy and/or got xruns)");
685 /** Performs echo cancellation on a frame (deprecated, last arg now ignored) */
686 EXPORT void speex_echo_cancel(SpeexEchoState *st, const spx_int16_t *in, const spx_int16_t *far_end, spx_int16_t *out, spx_int32_t *Yout)
688 speex_echo_cancellation(st, in, far_end, out);
691 /** Performs echo cancellation on a frame */
692 EXPORT void speex_echo_cancellation(SpeexEchoState *st, const spx_int16_t *in, const spx_int16_t *far_end, spx_int16_t *out)
694 int i,j, chan, speak;
696 spx_word32_t Syy,See,Sxx,Sdd, Sff;
699 int update_foreground;
702 spx_word16_t ss, ss_1;
703 spx_float_t Pey = FLOAT_ONE, Pyy=FLOAT_ONE;
704 spx_float_t alpha, alpha_1;
715 ss=DIV32_16(11469,M);
716 ss_1 = SUB16(32767,ss);
722 for (chan = 0; chan < C; chan++)
724 /* Apply a notch filter to make sure DC doesn't end up causing problems */
725 filter_dc_notch16(in+chan, st->notch_radius, st->input+chan*st->frame_size, st->frame_size, st->notch_mem+2*chan, C);
726 /* Copy input data to buffer and apply pre-emphasis */
727 /* Copy input data to buffer */
728 for (i=0;i<st->frame_size;i++)
731 /* FIXME: This core has changed a bit, need to merge properly */
732 tmp32 = SUB32(EXTEND32(st->input[chan*st->frame_size+i]), EXTEND32(MULT16_16_P15(st->preemph, st->memD[chan])));
737 if (st->saturated == 0)
743 if (st->saturated == 0)
747 st->memD[chan] = st->input[chan*st->frame_size+i];
748 st->input[chan*st->frame_size+i] = EXTRACT16(tmp32);
752 for (speak = 0; speak < K; speak++)
754 for (i=0;i<st->frame_size;i++)
757 st->x[speak*N+i] = st->x[speak*N+i+st->frame_size];
758 tmp32 = SUB32(EXTEND32(far_end[i*K+speak]), EXTEND32(MULT16_16_P15(st->preemph, st->memX[speak])));
760 /*FIXME: If saturation occurs here, we need to freeze adaptation for M frames (not just one) */
772 st->x[speak*N+i+st->frame_size] = EXTRACT16(tmp32);
773 st->memX[speak] = far_end[i*K+speak];
777 for (speak = 0; speak < K; speak++)
779 /* Shift memory: this could be optimized eventually*/
783 st->X[(j+1)*N*K+speak*N+i] = st->X[j*N*K+speak*N+i];
785 /* Convert x (echo input) to frequency domain */
786 spx_fft(st->fft_table, st->x+speak*N, &st->X[speak*N]);
790 for (speak = 0; speak < K; speak++)
792 Sxx += mdf_inner_prod(st->x+speak*N+st->frame_size, st->x+speak*N+st->frame_size, st->frame_size);
793 power_spectrum_accum(st->X+speak*N, st->Xf, N);
797 for (chan = 0; chan < C; chan++)
800 /* Compute foreground filter */
801 spectral_mul_accum16(st->X, st->foreground+chan*N*K*M, st->Y+chan*N, N, M*K);
802 spx_ifft(st->fft_table, st->Y+chan*N, st->e+chan*N);
803 for (i=0;i<st->frame_size;i++)
804 st->e[chan*N+i] = SUB16(st->input[chan*st->frame_size+i], st->e[chan*N+i+st->frame_size]);
805 Sff += mdf_inner_prod(st->e+chan*N, st->e+chan*N, st->frame_size);
809 /* Adjust proportional adaption rate */
810 /* FIXME: Adjust that for C, K*/
812 mdf_adjust_prop (st->W, N, M, C*K, st->prop);
813 /* Compute weight gradient */
814 if (st->saturated == 0)
816 for (chan = 0; chan < C; chan++)
818 for (speak = 0; speak < K; speak++)
822 weighted_spectral_mul_conj(st->power_1, FLOAT_SHL(PSEUDOFLOAT(st->prop[j]),-15), &st->X[(j+1)*N*K+speak*N], st->E+chan*N, st->PHI, N);
824 st->W[chan*N*K*M + j*N*K + speak*N + i] += st->PHI[i];
832 /* FIXME: MC conversion required */
833 /* Update weight to prevent circular convolution (MDF / AUMDF) */
834 for (chan = 0; chan < C; chan++)
836 for (speak = 0; speak < K; speak++)
840 /* This is a variant of the Alternatively Updated MDF (AUMDF) */
841 /* Remove the "if" to make this an MDF filter */
842 if (j==0 || st->cancel_count%(M-1) == j-1)
846 st->wtmp2[i] = EXTRACT16(PSHR32(st->W[chan*N*K*M + j*N*K + speak*N + i],NORMALIZE_SCALEDOWN+16));
847 spx_ifft(st->fft_table, st->wtmp2, st->wtmp);
848 for (i=0;i<st->frame_size;i++)
852 for (i=st->frame_size;i<N;i++)
854 st->wtmp[i]=SHL16(st->wtmp[i],NORMALIZE_SCALEUP);
856 spx_fft(st->fft_table, st->wtmp, st->wtmp2);
857 /* The "-1" in the shift is a sort of kludge that trades less efficient update speed for decrease noise */
859 st->W[chan*N*K*M + j*N*K + speak*N + i] -= SHL32(EXTEND32(st->wtmp2[i]),16+NORMALIZE_SCALEDOWN-NORMALIZE_SCALEUP-1);
861 spx_ifft(st->fft_table, &st->W[chan*N*K*M + j*N*K + speak*N], st->wtmp);
862 for (i=st->frame_size;i<N;i++)
866 spx_fft(st->fft_table, st->wtmp, &st->W[chan*N*K*M + j*N*K + speak*N]);
873 /* So we can use power_spectrum_accum */
874 for (i=0;i<=st->frame_size;i++)
875 st->Rf[i] = st->Yf[i] = st->Xf[i] = 0;
880 /* Difference in response, this is used to estimate the variance of our residual power estimate */
881 for (chan = 0; chan < C; chan++)
883 spectral_mul_accum(st->X, st->W+chan*N*K*M, st->Y+chan*N, N, M*K);
884 spx_ifft(st->fft_table, st->Y+chan*N, st->y+chan*N);
885 for (i=0;i<st->frame_size;i++)
886 st->e[chan*N+i] = SUB16(st->e[chan*N+i+st->frame_size], st->y[chan*N+i+st->frame_size]);
887 Dbf += 10+mdf_inner_prod(st->e+chan*N, st->e+chan*N, st->frame_size);
888 for (i=0;i<st->frame_size;i++)
889 st->e[chan*N+i] = SUB16(st->input[chan*st->frame_size+i], st->y[chan*N+i+st->frame_size]);
890 See += mdf_inner_prod(st->e+chan*N, st->e+chan*N, st->frame_size);
899 /* Logic for updating the foreground filter */
901 /* For two time windows, compute the mean of the energy difference, as well as the variance */
902 st->Davg1 = ADD32(MULT16_32_Q15(QCONST16(.6f,15),st->Davg1), MULT16_32_Q15(QCONST16(.4f,15),SUB32(Sff,See)));
903 st->Davg2 = ADD32(MULT16_32_Q15(QCONST16(.85f,15),st->Davg2), MULT16_32_Q15(QCONST16(.15f,15),SUB32(Sff,See)));
904 st->Dvar1 = FLOAT_ADD(FLOAT_MULT(VAR1_SMOOTH, st->Dvar1), FLOAT_MUL32U(MULT16_32_Q15(QCONST16(.4f,15),Sff), MULT16_32_Q15(QCONST16(.4f,15),Dbf)));
905 st->Dvar2 = FLOAT_ADD(FLOAT_MULT(VAR2_SMOOTH, st->Dvar2), FLOAT_MUL32U(MULT16_32_Q15(QCONST16(.15f,15),Sff), MULT16_32_Q15(QCONST16(.15f,15),Dbf)));
907 /* Equivalent float code:
908 st->Davg1 = .6*st->Davg1 + .4*(Sff-See);
909 st->Davg2 = .85*st->Davg2 + .15*(Sff-See);
910 st->Dvar1 = .36*st->Dvar1 + .16*Sff*Dbf;
911 st->Dvar2 = .7225*st->Dvar2 + .0225*Sff*Dbf;
914 update_foreground = 0;
915 /* Check if we have a statistically significant reduction in the residual echo */
916 /* Note that this is *not* Gaussian, so we need to be careful about the longer tail */
917 if (FLOAT_GT(FLOAT_MUL32U(SUB32(Sff,See),ABS32(SUB32(Sff,See))), FLOAT_MUL32U(Sff,Dbf)))
918 update_foreground = 1;
919 else if (FLOAT_GT(FLOAT_MUL32U(st->Davg1, ABS32(st->Davg1)), FLOAT_MULT(VAR1_UPDATE,(st->Dvar1))))
920 update_foreground = 1;
921 else if (FLOAT_GT(FLOAT_MUL32U(st->Davg2, ABS32(st->Davg2)), FLOAT_MULT(VAR2_UPDATE,(st->Dvar2))))
922 update_foreground = 1;
925 if (update_foreground)
927 st->Davg1 = st->Davg2 = 0;
928 st->Dvar1 = st->Dvar2 = FLOAT_ZERO;
929 /* Copy background filter to foreground filter */
930 for (i=0;i<N*M*C*K;i++)
931 st->foreground[i] = EXTRACT16(PSHR32(st->W[i],16));
932 /* Apply a smooth transition so as to not introduce blocking artifacts */
933 for (chan = 0; chan < C; chan++)
934 for (i=0;i<st->frame_size;i++)
935 st->e[chan*N+i+st->frame_size] = MULT16_16_Q15(st->window[i+st->frame_size],st->e[chan*N+i+st->frame_size]) + MULT16_16_Q15(st->window[i],st->y[chan*N+i+st->frame_size]);
937 int reset_background=0;
938 /* Otherwise, check if the background filter is significantly worse */
939 if (FLOAT_GT(FLOAT_MUL32U(NEG32(SUB32(Sff,See)),ABS32(SUB32(Sff,See))), FLOAT_MULT(VAR_BACKTRACK,FLOAT_MUL32U(Sff,Dbf))))
940 reset_background = 1;
941 if (FLOAT_GT(FLOAT_MUL32U(NEG32(st->Davg1), ABS32(st->Davg1)), FLOAT_MULT(VAR_BACKTRACK,st->Dvar1)))
942 reset_background = 1;
943 if (FLOAT_GT(FLOAT_MUL32U(NEG32(st->Davg2), ABS32(st->Davg2)), FLOAT_MULT(VAR_BACKTRACK,st->Dvar2)))
944 reset_background = 1;
945 if (reset_background)
947 /* Copy foreground filter to background filter */
948 for (i=0;i<N*M*C*K;i++)
949 st->W[i] = SHL32(EXTEND32(st->foreground[i]),16);
950 /* We also need to copy the output so as to get correct adaptation */
951 for (chan = 0; chan < C; chan++)
953 for (i=0;i<st->frame_size;i++)
954 st->y[chan*N+i+st->frame_size] = st->e[chan*N+i+st->frame_size];
955 for (i=0;i<st->frame_size;i++)
956 st->e[chan*N+i] = SUB16(st->input[chan*st->frame_size+i], st->y[chan*N+i+st->frame_size]);
959 st->Davg1 = st->Davg2 = 0;
960 st->Dvar1 = st->Dvar2 = FLOAT_ZERO;
966 for (chan = 0; chan < C; chan++)
968 /* Compute error signal (for the output with de-emphasis) */
969 for (i=0;i<st->frame_size;i++)
971 spx_word32_t tmp_out;
973 tmp_out = SUB32(EXTEND32(st->input[chan*st->frame_size+i]), EXTEND32(st->e[chan*N+i+st->frame_size]));
975 tmp_out = SUB32(EXTEND32(st->input[chan*st->frame_size+i]), EXTEND32(st->y[chan*N+i+st->frame_size]));
977 tmp_out = ADD32(tmp_out, EXTEND32(MULT16_16_P15(st->preemph, st->memE[chan])));
978 /* This is an arbitrary test for saturation in the microphone signal */
979 if (in[i*C+chan] <= -32000 || in[i*C+chan] >= 32000)
981 if (st->saturated == 0)
984 out[i*C+chan] = WORD2INT(tmp_out);
985 st->memE[chan] = tmp_out;
988 #ifdef DUMP_ECHO_CANCEL_DATA
989 dump_audio(in, far_end, out, st->frame_size);
992 /* Compute error signal (filter update version) */
993 for (i=0;i<st->frame_size;i++)
995 st->e[chan*N+i+st->frame_size] = st->e[chan*N+i];
999 /* Compute a bunch of correlations */
1000 /* FIXME: bad merge */
1001 Sey += mdf_inner_prod(st->e+chan*N+st->frame_size, st->y+chan*N+st->frame_size, st->frame_size);
1002 Syy += mdf_inner_prod(st->y+chan*N+st->frame_size, st->y+chan*N+st->frame_size, st->frame_size);
1003 Sdd += mdf_inner_prod(st->input+chan*st->frame_size, st->input+chan*st->frame_size, st->frame_size);
1005 /* Convert error to frequency domain */
1006 spx_fft(st->fft_table, st->e+chan*N, st->E+chan*N);
1007 for (i=0;i<st->frame_size;i++)
1008 st->y[i+chan*N] = 0;
1009 spx_fft(st->fft_table, st->y+chan*N, st->Y+chan*N);
1011 /* Compute power spectrum of echo (X), error (E) and filter response (Y) */
1012 power_spectrum_accum(st->E+chan*N, st->Rf, N);
1013 power_spectrum_accum(st->Y+chan*N, st->Yf, N);
1017 /*printf ("%f %f %f %f\n", Sff, See, Syy, Sdd, st->update_cond);*/
1019 /* Do some sanity check */
1020 if (!(Syy>=0 && Sxx>=0 && See >= 0)
1022 || !(Sff < N*1e9 && Syy < N*1e9 && Sxx < N*1e9)
1026 /* Things have gone really bad */
1027 st->screwed_up += 50;
1028 for (i=0;i<st->frame_size*C;i++)
1030 } else if (SHR32(Sff, 2) > ADD32(Sdd, SHR32(MULT16_16(N, 10000),6)))
1032 /* AEC seems to add lots of echo instead of removing it, let's see if it will improve */
1035 /* Everything's fine */
1038 if (st->screwed_up>=50)
1040 speex_warning("The echo canceller started acting funny and got slapped (reset). It swears it will behave now.");
1041 speex_echo_state_reset(st);
1045 /* Add a small noise floor to make sure not to have problems when dividing */
1046 See = MAX32(See, SHR32(MULT16_16(N, 100),6));
1048 for (speak = 0; speak < K; speak++)
1050 Sxx += mdf_inner_prod(st->x+speak*N+st->frame_size, st->x+speak*N+st->frame_size, st->frame_size);
1051 power_spectrum_accum(st->X+speak*N, st->Xf, N);
1055 /* Smooth far end energy estimate over time */
1056 for (j=0;j<=st->frame_size;j++)
1057 st->power[j] = MULT16_32_Q15(ss_1,st->power[j]) + 1 + MULT16_32_Q15(ss,st->Xf[j]);
1059 /* Compute filtered spectra and (cross-)correlations */
1060 for (j=st->frame_size;j>=0;j--)
1063 Eh = PSEUDOFLOAT(st->Rf[j] - st->Eh[j]);
1064 Yh = PSEUDOFLOAT(st->Yf[j] - st->Yh[j]);
1065 Pey = FLOAT_ADD(Pey,FLOAT_MULT(Eh,Yh));
1066 Pyy = FLOAT_ADD(Pyy,FLOAT_MULT(Yh,Yh));
1068 st->Eh[j] = MAC16_32_Q15(MULT16_32_Q15(SUB16(32767,st->spec_average),st->Eh[j]), st->spec_average, st->Rf[j]);
1069 st->Yh[j] = MAC16_32_Q15(MULT16_32_Q15(SUB16(32767,st->spec_average),st->Yh[j]), st->spec_average, st->Yf[j]);
1071 st->Eh[j] = (1-st->spec_average)*st->Eh[j] + st->spec_average*st->Rf[j];
1072 st->Yh[j] = (1-st->spec_average)*st->Yh[j] + st->spec_average*st->Yf[j];
1076 Pyy = FLOAT_SQRT(Pyy);
1077 Pey = FLOAT_DIVU(Pey,Pyy);
1079 /* Compute correlation updatete rate */
1080 tmp32 = MULT16_32_Q15(st->beta0,Syy);
1081 if (tmp32 > MULT16_32_Q15(st->beta_max,See))
1082 tmp32 = MULT16_32_Q15(st->beta_max,See);
1083 alpha = FLOAT_DIV32(tmp32, See);
1084 alpha_1 = FLOAT_SUB(FLOAT_ONE, alpha);
1085 /* Update correlations (recursive average) */
1086 st->Pey = FLOAT_ADD(FLOAT_MULT(alpha_1,st->Pey) , FLOAT_MULT(alpha,Pey));
1087 st->Pyy = FLOAT_ADD(FLOAT_MULT(alpha_1,st->Pyy) , FLOAT_MULT(alpha,Pyy));
1088 if (FLOAT_LT(st->Pyy, FLOAT_ONE))
1089 st->Pyy = FLOAT_ONE;
1090 /* We don't really hope to get better than 33 dB (MIN_LEAK-3dB) attenuation anyway */
1091 if (FLOAT_LT(st->Pey, FLOAT_MULT(MIN_LEAK,st->Pyy)))
1092 st->Pey = FLOAT_MULT(MIN_LEAK,st->Pyy);
1093 if (FLOAT_GT(st->Pey, st->Pyy))
1095 /* leak_estimate is the linear regression result */
1096 st->leak_estimate = FLOAT_EXTRACT16(FLOAT_SHL(FLOAT_DIVU(st->Pey, st->Pyy),14));
1097 /* This looks like a stupid bug, but it's right (because we convert from Q14 to Q15) */
1098 if (st->leak_estimate > 16383)
1099 st->leak_estimate = 32767;
1101 st->leak_estimate = SHL16(st->leak_estimate,1);
1102 /*printf ("%f\n", st->leak_estimate);*/
1104 /* Compute Residual to Error Ratio */
1106 tmp32 = MULT16_32_Q15(st->leak_estimate,Syy);
1107 tmp32 = ADD32(SHR32(Sxx,13), ADD32(tmp32, SHL32(tmp32,1)));
1108 /* Check for y in e (lower bound on RER) */
1110 spx_float_t bound = PSEUDOFLOAT(Sey);
1111 bound = FLOAT_DIVU(FLOAT_MULT(bound, bound), PSEUDOFLOAT(ADD32(1,Syy)));
1112 if (FLOAT_GT(bound, PSEUDOFLOAT(See)))
1114 else if (tmp32 < FLOAT_EXTRACT32(bound))
1115 tmp32 = FLOAT_EXTRACT32(bound);
1117 if (tmp32 > SHR32(See,1))
1118 tmp32 = SHR32(See,1);
1119 RER = FLOAT_EXTRACT16(FLOAT_SHL(FLOAT_DIV32(tmp32,See),15));
1121 RER = (.0001*Sxx + 3.*MULT16_32_Q15(st->leak_estimate,Syy)) / See;
1122 /* Check for y in e (lower bound on RER) */
1123 if (RER < Sey*Sey/(1+See*Syy))
1124 RER = Sey*Sey/(1+See*Syy);
1129 /* We consider that the filter has had minimal adaptation if the following is true*/
1130 if (!st->adapted && st->sum_adapt > SHL32(EXTEND32(M),15) && MULT16_32_Q15(st->leak_estimate,Syy) > MULT16_32_Q15(QCONST16(.03f,15),Syy))
1137 /* Normal learning rate calculation once we're past the minimal adaptation phase */
1138 for (i=0;i<=st->frame_size;i++)
1141 /* Compute frequency-domain adaptation mask */
1142 r = MULT16_32_Q15(st->leak_estimate,SHL32(st->Yf[i],3));
1143 e = SHL32(st->Rf[i],3)+1;
1151 r = MULT16_32_Q15(QCONST16(.7,15),r) + MULT16_32_Q15(QCONST16(.3,15),(spx_word32_t)(MULT16_32_Q15(RER,e)));
1152 /*st->power_1[i] = adapt_rate*r/(e*(1+st->power[i]));*/
1153 st->power_1[i] = FLOAT_SHL(FLOAT_DIV32_FLOAT(r,FLOAT_MUL32U(e,st->power[i]+10)),WEIGHT_SHIFT+16);
1156 /* Temporary adaption rate if filter is not yet adapted enough */
1157 spx_word16_t adapt_rate=0;
1159 if (Sxx > SHR32(MULT16_16(N, 1000),6))
1161 tmp32 = MULT16_32_Q15(QCONST16(.25f, 15), Sxx);
1163 if (tmp32 > SHR32(See,2))
1164 tmp32 = SHR32(See,2);
1166 if (tmp32 > .25*See)
1169 adapt_rate = FLOAT_EXTRACT16(FLOAT_SHL(FLOAT_DIV32(tmp32, See),15));
1171 for (i=0;i<=st->frame_size;i++)
1172 st->power_1[i] = FLOAT_SHL(FLOAT_DIV32(EXTEND32(adapt_rate),ADD32(st->power[i],10)),WEIGHT_SHIFT+1);
1175 /* How much have we adapted so far? */
1176 st->sum_adapt = ADD32(st->sum_adapt,adapt_rate);
1179 /* FIXME: MC conversion required */
1180 for (i=0;i<st->frame_size;i++)
1181 st->last_y[i] = st->last_y[st->frame_size+i];
1184 /* If the filter is adapted, take the filtered echo */
1185 for (i=0;i<st->frame_size;i++)
1186 st->last_y[st->frame_size+i] = in[i]-out[i];
1188 /* If filter isn't adapted yet, all we can do is take the far end signal directly */
1189 /* moved earlier: for (i=0;i<N;i++)
1190 st->last_y[i] = st->x[i];*/
1195 /* Compute spectrum of estimated echo for use in an echo post-filter */
1196 void speex_echo_get_residual(SpeexEchoState *st, spx_word32_t *residual_echo, int len)
1202 N = st->window_size;
1204 /* Apply hanning window (should pre-compute it)*/
1206 st->y[i] = MULT16_16_Q15(st->window[i],st->last_y[i]);
1208 /* Compute power spectrum of the echo */
1209 spx_fft(st->fft_table, st->y, st->Y);
1210 power_spectrum(st->Y, residual_echo, N);
1213 if (st->leak_estimate > 16383)
1216 leak2 = SHL16(st->leak_estimate, 1);
1218 if (st->leak_estimate>.5)
1221 leak2 = 2*st->leak_estimate;
1223 /* Estimate residual echo */
1224 for (i=0;i<=st->frame_size;i++)
1225 residual_echo[i] = (spx_int32_t)MULT16_32_Q15(leak2,residual_echo[i]);
1229 EXPORT int speex_echo_ctl(SpeexEchoState *st, int request, void *ptr)
1234 case SPEEX_ECHO_GET_FRAME_SIZE:
1235 (*(int*)ptr) = st->frame_size;
1237 case SPEEX_ECHO_SET_SAMPLING_RATE:
1238 st->sampling_rate = (*(int*)ptr);
1239 st->spec_average = DIV32_16(SHL32(EXTEND32(st->frame_size), 15), st->sampling_rate);
1241 st->beta0 = DIV32_16(SHL32(EXTEND32(st->frame_size), 16), st->sampling_rate);
1242 st->beta_max = DIV32_16(SHL32(EXTEND32(st->frame_size), 14), st->sampling_rate);
1244 st->beta0 = (2.0f*st->frame_size)/st->sampling_rate;
1245 st->beta_max = (.5f*st->frame_size)/st->sampling_rate;
1247 if (st->sampling_rate<12000)
1248 st->notch_radius = QCONST16(.9, 15);
1249 else if (st->sampling_rate<24000)
1250 st->notch_radius = QCONST16(.982, 15);
1252 st->notch_radius = QCONST16(.992, 15);
1254 case SPEEX_ECHO_GET_SAMPLING_RATE:
1255 (*(int*)ptr) = st->sampling_rate;
1257 case SPEEX_ECHO_GET_IMPULSE_RESPONSE_SIZE:
1258 /*FIXME: Implement this for multiple channels */
1259 *((spx_int32_t *)ptr) = st->M * st->frame_size;
1261 case SPEEX_ECHO_GET_IMPULSE_RESPONSE:
1263 int M = st->M, N = st->window_size, n = st->frame_size, i, j;
1264 spx_int32_t *filt = (spx_int32_t *) ptr;
1267 /*FIXME: Implement this for multiple channels */
1270 st->wtmp2[i] = EXTRACT16(PSHR32(st->W[j*N+i],16+NORMALIZE_SCALEDOWN));
1271 spx_ifft(st->fft_table, st->wtmp2, st->wtmp);
1273 spx_ifft(st->fft_table, &st->W[j*N], st->wtmp);
1276 filt[j*n+i] = PSHR32(MULT16_16(32767,st->wtmp[i]), WEIGHT_SHIFT-NORMALIZE_SCALEDOWN);
1281 speex_warning_int("Unknown speex_echo_ctl request: ", request);