2 ** FFT and FHT routines
3 ** Copyright 1988, 1993; Ron Mayer
4 ** Copyright (c) 1999-2000 Takehiro Tominaga
7 ** Does a hartley transform of "n" points in the array "fz".
9 ** NOTE: This routine uses at least 2 patented algorithms, and may be
10 ** under the restrictions of a bunch of different organizations.
11 ** Although I wrote it completely myself; it is kind of a derivative
12 ** of a routine I once authored and released under the GPL, so it
13 ** may fall under the free software foundation's restrictions;
14 ** it was worked on as a Stanford Univ project, so they claim
15 ** some rights to it; it was further optimized at work here, so
16 ** I think this company claims parts of it. The patents are
17 ** held by R. Bracewell (the FHT algorithm) and O. Buneman (the
18 ** trig generator), both at Stanford Univ.
19 ** If it were up to me, I'd say go do whatever you want with it;
20 ** but it would be polite to give credit to the following people
21 ** if you use this anywhere:
22 ** Euler - probable inventor of the fourier transform.
23 ** Gauss - probable inventor of the FFT.
24 ** Hartley - probable inventor of the hartley transform.
25 ** Buneman - for a really cool trig generator
26 ** Mayer(me) - for authoring this particular version and
27 ** including all the optimizations in one package.
29 ** Ron Mayer; mayer@acuson.com
30 ** and added some optimization by
31 ** Mather - idea of using lookup table
32 ** Takehiro - some dirty hack for speed up
35 /* $Id: fft.c,v 1.38 2009/04/20 21:48:00 robert Exp $ */
47 #include "vector/lame_intrin.h"
51 #define TRI_SIZE (5-1) /* 1024 = 4**5 */
54 static FLOAT window[BLKSIZE], window_s[BLKSIZE_s / 2];
56 static const FLOAT costab[TRI_SIZE * 2] = {
57 9.238795325112867e-01, 3.826834323650898e-01,
58 9.951847266721969e-01, 9.801714032956060e-02,
59 9.996988186962042e-01, 2.454122852291229e-02,
60 9.999811752826011e-01, 6.135884649154475e-03
64 fht(FLOAT * fz, int n)
66 const FLOAT *tri = costab;
71 n <<= 1; /* to get BLKSIZE, because of 3DNow! ASM routine */
76 int i, k1, k2, k3, kx;
107 for (i = 1; i < kx; i++) {
109 c2 = 1 - (2 * s1) * s1;
114 FLOAT a, b, g0, f0, f1, g1, f2, g2, f3, g3;
115 b = s2 * fi[k1] - c2 * gi[k1];
116 a = c2 * fi[k1] + s2 * gi[k1];
121 b = s2 * fi[k3] - c2 * gi[k3];
122 a = c2 * fi[k3] + s2 * gi[k3];
127 b = s1 * f2 - c1 * g3;
128 a = c1 * f2 + s1 * g3;
133 b = c1 * g2 - s1 * f3;
134 a = s1 * g2 + c1 * f3;
143 c1 = c2 * tri[0] - s1 * tri[1];
144 s1 = c2 * tri[1] + s1 * tri[0];
151 static const unsigned char rv_tbl[] = {
152 0x00, 0x80, 0x40, 0xc0, 0x20, 0xa0, 0x60, 0xe0,
153 0x10, 0x90, 0x50, 0xd0, 0x30, 0xb0, 0x70, 0xf0,
154 0x08, 0x88, 0x48, 0xc8, 0x28, 0xa8, 0x68, 0xe8,
155 0x18, 0x98, 0x58, 0xd8, 0x38, 0xb8, 0x78, 0xf8,
156 0x04, 0x84, 0x44, 0xc4, 0x24, 0xa4, 0x64, 0xe4,
157 0x14, 0x94, 0x54, 0xd4, 0x34, 0xb4, 0x74, 0xf4,
158 0x0c, 0x8c, 0x4c, 0xcc, 0x2c, 0xac, 0x6c, 0xec,
159 0x1c, 0x9c, 0x5c, 0xdc, 0x3c, 0xbc, 0x7c, 0xfc,
160 0x02, 0x82, 0x42, 0xc2, 0x22, 0xa2, 0x62, 0xe2,
161 0x12, 0x92, 0x52, 0xd2, 0x32, 0xb2, 0x72, 0xf2,
162 0x0a, 0x8a, 0x4a, 0xca, 0x2a, 0xaa, 0x6a, 0xea,
163 0x1a, 0x9a, 0x5a, 0xda, 0x3a, 0xba, 0x7a, 0xfa,
164 0x06, 0x86, 0x46, 0xc6, 0x26, 0xa6, 0x66, 0xe6,
165 0x16, 0x96, 0x56, 0xd6, 0x36, 0xb6, 0x76, 0xf6,
166 0x0e, 0x8e, 0x4e, 0xce, 0x2e, 0xae, 0x6e, 0xee,
167 0x1e, 0x9e, 0x5e, 0xde, 0x3e, 0xbe, 0x7e, 0xfe
170 #define ch01(index) (buffer[chn][index])
172 #define ml00(f) (window[i ] * f(i))
173 #define ml10(f) (window[i + 0x200] * f(i + 0x200))
174 #define ml20(f) (window[i + 0x100] * f(i + 0x100))
175 #define ml30(f) (window[i + 0x300] * f(i + 0x300))
177 #define ml01(f) (window[i + 0x001] * f(i + 0x001))
178 #define ml11(f) (window[i + 0x201] * f(i + 0x201))
179 #define ml21(f) (window[i + 0x101] * f(i + 0x101))
180 #define ml31(f) (window[i + 0x301] * f(i + 0x301))
182 #define ms00(f) (window_s[i ] * f(i + k))
183 #define ms10(f) (window_s[0x7f - i] * f(i + k + 0x80))
184 #define ms20(f) (window_s[i + 0x40] * f(i + k + 0x40))
185 #define ms30(f) (window_s[0x3f - i] * f(i + k + 0xc0))
187 #define ms01(f) (window_s[i + 0x01] * f(i + k + 0x01))
188 #define ms11(f) (window_s[0x7e - i] * f(i + k + 0x81))
189 #define ms21(f) (window_s[i + 0x41] * f(i + k + 0x41))
190 #define ms31(f) (window_s[0x3e - i] * f(i + k + 0xc1))
194 fft_short(lame_internal_flags const *const gfc,
195 FLOAT x_real[3][BLKSIZE_s], int chn, const sample_t *const buffer[2])
201 for (b = 0; b < 3; b++) {
202 FLOAT *x = &x_real[b][BLKSIZE_s / 2];
203 short const k = (576 / 3) * (b + 1);
204 j = BLKSIZE_s / 8 - 1;
206 FLOAT f0, f1, f2, f3, w;
234 x[BLKSIZE_s / 2 + 0] = f0 + f2;
235 x[BLKSIZE_s / 2 + 2] = f0 - f2;
236 x[BLKSIZE_s / 2 + 1] = f1 + f3;
237 x[BLKSIZE_s / 2 + 3] = f1 - f3;
240 gfc->fft_fht(x, BLKSIZE_s / 2);
241 /* BLKSIZE_s/2 because of 3DNow! ASM routine */
246 fft_long(lame_internal_flags const *const gfc,
247 FLOAT x[BLKSIZE], int chn, const sample_t *const buffer[2])
250 int jj = BLKSIZE / 8 - 1;
254 FLOAT f0, f1, f2, f3, w;
281 x[BLKSIZE / 2 + 0] = f0 + f2;
282 x[BLKSIZE / 2 + 2] = f0 - f2;
283 x[BLKSIZE / 2 + 1] = f1 + f3;
284 x[BLKSIZE / 2 + 3] = f1 - f3;
287 gfc->fft_fht(x, BLKSIZE / 2);
288 /* BLKSIZE/2 because of 3DNow! ASM routine */
292 extern void fht_3DN(FLOAT * fz, int n);
293 extern void fht_SSE(FLOAT * fz, int n);
297 init_fft(lame_internal_flags * const gfc)
301 /* The type of window used here will make no real difference, but */
302 /* in the interest of merging nspsytune stuff - switch to blackman window */
303 for (i = 0; i < BLKSIZE; i++)
304 /* blackman window */
305 window[i] = 0.42 - 0.5 * cos(2 * PI * (i + .5) / BLKSIZE) +
306 0.08 * cos(4 * PI * (i + .5) / BLKSIZE);
308 for (i = 0; i < BLKSIZE_s / 2; i++)
309 window_s[i] = 0.5 * (1.0 - cos(2.0 * PI * (i + 0.5) / BLKSIZE_s));
313 if (gfc->CPU_features.AMD_3DNow) {
314 gfc->fft_fht = fht_3DN;
316 else if (gfc->CPU_features.SSE) {
317 gfc->fft_fht = fht_SSE;
323 #ifdef HAVE_XMMINTRIN_H
325 gfc->fft_fht = fht_SSE2;