SDRAngel  4.11.5
Developer docs for <a href="https://github.com/f4exb/sdrangel">SDRangel<\a>, an Open Source Qt5 / OpenGL 3.0+ SDR and signal analyzer frontend to various hardware.
gfft.h
Go to the documentation of this file.
1 //==============================================================================
2 // g_fft.h:
3 //
4 // FFT library
5 // Copyright (C) 2013
6 // Dave Freese, W1HKJ
7 //
8 // based on public domain code by John Green <green_jt@vsdec.npt.nuwc.navy.mil>
9 // original version is available at
10 // http://hyperarchive.lcs.mit.edu/
11 // /HyperArchive/Archive/dev/src/ffts-for-risc-2-c.hqx
12 //
13 // ported to C++ for fldigi by Dave Freese, W1HKJ
14 //
15 // This file is part of fldigi.
16 //
17 // Fldigi is free software: you can redistribute it and/or modify
18 // it under the terms of the GNU Lesser General Public License as published by
19 // the Free Software Foundation, either version 3 of the License, or
20 // (at your option) any later version.
21 //
22 // Fldigi is distributed in the hope that it will be useful,
23 // but WITHOUT ANY WARRANTY; without even the implied warranty of
24 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
25 // GNU General Public License for more details.
26 //
27 // You should have received a copy of the GNU General Public License
28 // along with fldigi. If not, see <http://www.gnu.org/licenses/>.
29 //==============================================================================
30 
31 #ifndef CGREEN_FFT_H
32 #define CGREEN_FFT_H
33 
34 #include <complex>
35 
36 template <typename FFT_TYPE>
37 class g_fft {
38 
39 #define FFT_RECIPLN2 1.442695040888963407359924681001892137426 // 1.0/log(2)
40 
41 // some useful conversions between a number and its power of 2
42 #define LOG2(a) (FFT_RECIPLN2*log(a)) // floating point logarithm base 2
43 #define POW2(m) ((unsigned int) 1 << (m)) // integer power of 2 for m<32
44 
45 // fft's with M bigger than this bust primary cache
46 #define MCACHE (11 - (sizeof(FFT_TYPE) / 8))
47 
48 // some math constants to 40 decimal places
49 #define FFT_PI 3.141592653589793238462643383279502884197 // pi
50 #define FFT_ROOT2 1.414213562373095048801688724209698078569 // sqrt(2)
51 #define FFT_COSPID8 0.9238795325112867561281831893967882868224 // cos(pi/8)
52 #define FFT_SINPID8 0.3826834323650897717284599840303988667613 // sin(pi/8)
53 
54 private:
55  int FFT_size;
56  int FFT_N;
57  FFT_TYPE *FFT_table_1[32];
58  short int *FFT_table_2[32];
59 
60  FFT_TYPE *Utbl;
61  short *BRLow;
62 
63  void fftInit();
64  int ConvertFFTSize(int);
65 
66 // base fft methods
67  void riffts1(FFT_TYPE *ioptr, int M, FFT_TYPE *Utbl, short *BRLow);
68  void ifrstage(FFT_TYPE *ioptr, int M, FFT_TYPE *inUtbl);
69  void rifft8pt(FFT_TYPE *ioptr, FFT_TYPE scale);
70  void rifft4pt(FFT_TYPE *ioptr, FFT_TYPE scale);
71  void rifft2pt(FFT_TYPE *ioptr, FFT_TYPE scale);
72  void rifft1pt(FFT_TYPE *ioptr, FFT_TYPE scale);
73  void rffts1(FFT_TYPE *ioptr, int M, FFT_TYPE *Utbl, short *BRLow);
74  void frstage(FFT_TYPE *ioptr, int M, FFT_TYPE *inUtbl);
75  void rfft8pt(FFT_TYPE *ioptr);
76  void rfft4pt(FFT_TYPE *ioptr);
77  void rfft2pt(FFT_TYPE *ioptr);
78  void rfft1pt(FFT_TYPE *ioptr);
79  void iffts1(FFT_TYPE *ioptr, int M, FFT_TYPE *Utbl, short *BRLow);
80  void ifftrecurs(FFT_TYPE *ioptr, int M, FFT_TYPE *Utbl, int Ustride,
81  int NDiffU, int StageCnt);
82  void ibfstages(FFT_TYPE *ioptr, int M, FFT_TYPE *inUtbl, int Ustride,
83  int NDiffU, int StageCnt);
84  void ibfR4(FFT_TYPE *ioptr, int M, int NDiffU);
85  void ibfR2(FFT_TYPE *ioptr, int M, int NDiffU);
86  void ifft8pt(FFT_TYPE *ioptr, FFT_TYPE scale);
87  void ifft4pt(FFT_TYPE *ioptr, FFT_TYPE scale);
88  void ifft2pt(FFT_TYPE *ioptr, FFT_TYPE scale);
89  void scbitrevR2(FFT_TYPE *ioptr, int M, short *inBRLow, FFT_TYPE scale);
90  void ffts1(FFT_TYPE *ioptr, int M, FFT_TYPE *Utbl, short *BRLow);
91  void fftrecurs(FFT_TYPE *ioptr, int M,
92  FFT_TYPE *Utbl, int Ustride, int NDiffU,
93  int StageCnt);
94  void bfstages(FFT_TYPE *ioptr, int M,
95  FFT_TYPE *inUtbl, int Ustride,
96  int NDiffU, int StageCnt);
97  void bfR4(FFT_TYPE *ioptr, int M, int NDiffU);
98  void bfR2(FFT_TYPE *ioptr, int M, int NDiffU);
99  void fft8pt(FFT_TYPE *ioptr);
100  void fft4pt(FFT_TYPE *ioptr);
101  void fft2pt(FFT_TYPE *ioptr);
102  void bitrevR2(FFT_TYPE *ioptr, int M, short *inBRLow);
103  void fftBRInit(int M, short *BRLow);
104  void fftCosInit(int M, FFT_TYPE *Utbl);
105 
106 public:
107  g_fft (int M = 8192)
108  {
109  if (M < 16) M = 16;
110  if (M > 268435456) M = 268435456;
111  FFT_size = M;
112  fftInit();
113  }
114 
116  {
117  for (int i = 0; i < 32; i++)
118  {
119  if (FFT_table_1[i] != 0) delete [] FFT_table_1[i];
120  if (FFT_table_2[i] != 0) delete [] FFT_table_2[i];
121  }
122  }
123 
124  void ComplexFFT(std::complex<FFT_TYPE> *buf);
125  void InverseComplexFFT(std::complex<FFT_TYPE> *buf);
126  void RealFFT(std::complex<FFT_TYPE> *buf);
127  void InverseRealFFT(std::complex<FFT_TYPE> *buf);
128  FFT_TYPE GetInverseComplexFFTScale();
129  FFT_TYPE GetInverseRealFFTScale();
130 };
131 
132 //------------------------------------------------------------------------------
133 // Compute Utbl, the cosine table for ffts
134 // of size (pow(2,M)/4 +1)
135 // INPUTS
136 // M = log2 of fft size
137 // OUTPUTS
138 // *Utbl = cosine table
139 //------------------------------------------------------------------------------
140 template <typename FFT_TYPE>
141 void g_fft<FFT_TYPE>::fftCosInit(int M, FFT_TYPE *Utbl)
142 {
143  unsigned int fftN = POW2(M);
144  unsigned int i1;
145 
146  Utbl[0] = FFT_TYPE(1.0);
147  for (i1 = 1; i1 < fftN/4; i1++)
148  Utbl[i1] = (FFT_TYPE)cos((2.0 * FFT_PI * (float)i1) / (float)fftN);
149  Utbl[fftN/4] = FFT_TYPE(0.0);
150 }
151 
152 //------------------------------------------------------------------------------
153 // Compute BRLow, the bit reversed table for ffts
154 // of size pow(2,M/2 -1)
155 // INPUTS
156 // M = log2 of fft size
157 // OUTPUTS
158 // *BRLow = bit reversed counter table
159 //------------------------------------------------------------------------------
160 template <typename FFT_TYPE>
162 {
163  int Mroot_1 = M / 2 - 1;
164  int Nroot_1 = POW2(Mroot_1);
165  int i1;
166  int bitsum;
167  int bitmask;
168  int bit;
169 
170  for (i1 = 0; i1 < Nroot_1; i1++) {
171  bitsum = 0;
172  bitmask = 1;
173  for (bit = 1; bit <= Mroot_1; bitmask <<= 1, bit++)
174  if (i1 & bitmask)
175  bitsum = bitsum + (Nroot_1 >> bit);
176  BRLow[i1] = bitsum;
177  }
178 }
179 
180 //------------------------------------------------------------------------------
181 // parts of ffts1
182 // bit reverse and first radix 2 stage of forward or inverse fft
183 //------------------------------------------------------------------------------
184 template <typename FFT_TYPE>
185 void g_fft<FFT_TYPE>::bitrevR2(FFT_TYPE *ioptr, int M, short *inBRLow)
186 {
187  FFT_TYPE f0r;
188  FFT_TYPE f0i;
189  FFT_TYPE f1r;
190  FFT_TYPE f1i;
191  FFT_TYPE f2r;
192  FFT_TYPE f2i;
193  FFT_TYPE f3r;
194  FFT_TYPE f3i;
195  FFT_TYPE f4r;
196  FFT_TYPE f4i;
197  FFT_TYPE f5r;
198  FFT_TYPE f5i;
199  FFT_TYPE f6r;
200  FFT_TYPE f6i;
201  FFT_TYPE f7r;
202  FFT_TYPE f7i;
203  FFT_TYPE t0r;
204  FFT_TYPE t0i;
205  FFT_TYPE t1r;
206  FFT_TYPE t1i;
207  FFT_TYPE *p0r;
208  FFT_TYPE *p1r;
209  FFT_TYPE *IOP;
210  FFT_TYPE *iolimit;
211  int Colstart;
212  int iCol;
213  unsigned int posA;
214  unsigned int posAi;
215  unsigned int posB;
216  unsigned int posBi;
217 
218  const unsigned int Nrems2 = POW2((M + 3) / 2);
219  const unsigned int Nroot_1_ColInc = POW2(M) - Nrems2;
220  const unsigned int Nroot_1 = POW2(M / 2 - 1) - 1;
221  const unsigned int ColstartShift = (M + 1) / 2 + 1;
222 
223  posA = POW2(M); // 1/2 of POW2(M) complex
224  posAi = posA + 1;
225  posB = posA + 2;
226  posBi = posB + 1;
227 
228  iolimit = ioptr + Nrems2;
229  for (; ioptr < iolimit; ioptr += POW2(M / 2 + 1)) {
230  for (Colstart = Nroot_1; Colstart >= 0; Colstart--) {
231  iCol = Nroot_1;
232  p0r = ioptr + Nroot_1_ColInc + inBRLow[Colstart] * 4;
233  IOP = ioptr + (Colstart << ColstartShift);
234  p1r = IOP + inBRLow[iCol] * 4;
235  f0r = *(p0r);
236  f0i = *(p0r + 1);
237  f1r = *(p0r + posA);
238  f1i = *(p0r + posAi);
239  for (; iCol > Colstart;) {
240  f2r = *(p0r + 2);
241  f2i = *(p0r + (2 + 1));
242  f3r = *(p0r + posB);
243  f3i = *(p0r + posBi);
244  f4r = *(p1r);
245  f4i = *(p1r + 1);
246  f5r = *(p1r + posA);
247  f5i = *(p1r + posAi);
248  f6r = *(p1r + 2);
249  f6i = *(p1r + (2 + 1));
250  f7r = *(p1r + posB);
251  f7i = *(p1r + posBi);
252 
253  t0r = f0r + f1r;
254  t0i = f0i + f1i;
255  f1r = f0r - f1r;
256  f1i = f0i - f1i;
257  t1r = f2r + f3r;
258  t1i = f2i + f3i;
259  f3r = f2r - f3r;
260  f3i = f2i - f3i;
261  f0r = f4r + f5r;
262  f0i = f4i + f5i;
263  f5r = f4r - f5r;
264  f5i = f4i - f5i;
265  f2r = f6r + f7r;
266  f2i = f6i + f7i;
267  f7r = f6r - f7r;
268  f7i = f6i - f7i;
269 
270  *(p1r) = t0r;
271  *(p1r + 1) = t0i;
272  *(p1r + 2) = f1r;
273  *(p1r + (2 + 1)) = f1i;
274  *(p1r + posA) = t1r;
275  *(p1r + posAi) = t1i;
276  *(p1r + posB) = f3r;
277  *(p1r + posBi) = f3i;
278  *(p0r) = f0r;
279  *(p0r + 1) = f0i;
280  *(p0r + 2) = f5r;
281  *(p0r + (2 + 1)) = f5i;
282  *(p0r + posA) = f2r;
283  *(p0r + posAi) = f2i;
284  *(p0r + posB) = f7r;
285  *(p0r + posBi) = f7i;
286 
287  p0r -= Nrems2;
288  f0r = *(p0r);
289  f0i = *(p0r + 1);
290  f1r = *(p0r + posA);
291  f1i = *(p0r + posAi);
292  iCol -= 1;
293  p1r = IOP + inBRLow[iCol] * 4;
294  }
295  f2r = *(p0r + 2);
296  f2i = *(p0r + (2 + 1));
297  f3r = *(p0r + posB);
298  f3i = *(p0r + posBi);
299 
300  t0r = f0r + f1r;
301  t0i = f0i + f1i;
302  f1r = f0r - f1r;
303  f1i = f0i - f1i;
304  t1r = f2r + f3r;
305  t1i = f2i + f3i;
306  f3r = f2r - f3r;
307  f3i = f2i - f3i;
308 
309  *(p0r) = t0r;
310  *(p0r + 1) = t0i;
311  *(p0r + 2) = f1r;
312  *(p0r + (2 + 1)) = f1i;
313  *(p0r + posA) = t1r;
314  *(p0r + posAi) = t1i;
315  *(p0r + posB) = f3r;
316  *(p0r + posBi) = f3i;
317  }
318  }
319 }
320 
321 //------------------------------------------------------------------------------
322 // RADIX 2 fft
323 //------------------------------------------------------------------------------
324 template <typename FFT_TYPE>
325 void g_fft<FFT_TYPE>::fft2pt(FFT_TYPE *ioptr)
326 {
327  FFT_TYPE f0r, f0i, f1r, f1i;
328  FFT_TYPE t0r, t0i;
329 
330 // bit reversed load
331  f0r = ioptr[0];
332  f0i = ioptr[1];
333  f1r = ioptr[2];
334  f1i = ioptr[3];
335 
336 // Butterflys
337 // f0 - - - t0
338 // f1 - 1 - f1
339 
340  t0r = f0r + f1r;
341  t0i = f0i + f1i;
342  f1r = f0r - f1r;
343  f1i = f0i - f1i;
344 
345 // store result
346  ioptr[0] = t0r;
347  ioptr[1] = t0i;
348  ioptr[2] = f1r;
349  ioptr[3] = f1i;
350 }
351 
352 //------------------------------------------------------------------------------
353 // RADIX 4 fft
354 //------------------------------------------------------------------------------
355 template <typename FFT_TYPE>
356 void g_fft<FFT_TYPE>::fft4pt(FFT_TYPE *ioptr)
357 {
358  FFT_TYPE f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
359  FFT_TYPE t0r, t0i, t1r, t1i;
360 
361 // bit reversed load
362  f0r = ioptr[0];
363  f0i = ioptr[1];
364  f1r = ioptr[4];
365  f1i = ioptr[5];
366  f2r = ioptr[2];
367  f2i = ioptr[3];
368  f3r = ioptr[6];
369  f3i = ioptr[7];
370 
371 // Butterflys
372 // f0 - - t0 - - f0
373 // f1 - 1 - f1 - - f1
374 // f2 - - f2 - 1 - f2
375 // f3 - 1 - t1 - -i - f3
376 
377  t0r = f0r + f1r;
378  t0i = f0i + f1i;
379  f1r = f0r - f1r;
380  f1i = f0i - f1i;
381 
382  t1r = f2r - f3r;
383  t1i = f2i - f3i;
384  f2r = f2r + f3r;
385  f2i = f2i + f3i;
386 
387  f0r = t0r + f2r;
388  f0i = t0i + f2i;
389  f2r = t0r - f2r;
390  f2i = t0i - f2i;
391 
392  f3r = f1r - t1i;
393  f3i = f1i + t1r;
394  f1r = f1r + t1i;
395  f1i = f1i - t1r;
396 
397 // store result
398  ioptr[0] = f0r;
399  ioptr[1] = f0i;
400  ioptr[2] = f1r;
401  ioptr[3] = f1i;
402  ioptr[4] = f2r;
403  ioptr[5] = f2i;
404  ioptr[6] = f3r;
405  ioptr[7] = f3i;
406 }
407 
408 //------------------------------------------------------------------------------
409 // RADIX 8 fft
410 //------------------------------------------------------------------------------
411 template <typename FFT_TYPE>
412 void g_fft<FFT_TYPE>::fft8pt(FFT_TYPE *ioptr)
413 {
414  FFT_TYPE w0r = 1.0 / FFT_ROOT2; // cos(pi/4)
415  FFT_TYPE f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
416  FFT_TYPE f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
417  FFT_TYPE t0r, t0i, t1r, t1i;
418  const FFT_TYPE Two = 2.0;
419 
420 // bit reversed load
421  f0r = ioptr[0];
422  f0i = ioptr[1];
423  f1r = ioptr[8];
424  f1i = ioptr[9];
425  f2r = ioptr[4];
426  f2i = ioptr[5];
427  f3r = ioptr[12];
428  f3i = ioptr[13];
429  f4r = ioptr[2];
430  f4i = ioptr[3];
431  f5r = ioptr[10];
432  f5i = ioptr[11];
433  f6r = ioptr[6];
434  f6i = ioptr[7];
435  f7r = ioptr[14];
436  f7i = ioptr[15];
437 
438 // Butterflys
439 // f0 - - t0 - - f0 - - f0
440 // f1 - 1 - f1 - - f1 - - f1
441 // f2 - - f2 - 1 - f2 - - f2
442 // f3 - 1 - t1 - -i - f3 - - f3
443 // f4 - - t0 - - f4 - 1 - t0
444 // f5 - 1 - f5 - - f5 - w3 - f4
445 // f6 - - f6 - 1 - f6 - -i - t1
446 // f7 - 1 - t1 - -i - f7 - iw3- f6
447 
448  t0r = f0r + f1r;
449  t0i = f0i + f1i;
450  f1r = f0r - f1r;
451  f1i = f0i - f1i;
452 
453  t1r = f2r - f3r;
454  t1i = f2i - f3i;
455  f2r = f2r + f3r;
456  f2i = f2i + f3i;
457 
458  f0r = t0r + f2r;
459  f0i = t0i + f2i;
460  f2r = t0r - f2r;
461  f2i = t0i - f2i;
462 
463  f3r = f1r - t1i;
464  f3i = f1i + t1r;
465  f1r = f1r + t1i;
466  f1i = f1i - t1r;
467 
468  t0r = f4r + f5r;
469  t0i = f4i + f5i;
470  f5r = f4r - f5r;
471  f5i = f4i - f5i;
472 
473  t1r = f6r - f7r;
474  t1i = f6i - f7i;
475  f6r = f6r + f7r;
476  f6i = f6i + f7i;
477 
478  f4r = t0r + f6r;
479  f4i = t0i + f6i;
480  f6r = t0r - f6r;
481  f6i = t0i - f6i;
482 
483  f7r = f5r - t1i;
484  f7i = f5i + t1r;
485  f5r = f5r + t1i;
486  f5i = f5i - t1r;
487 
488  t0r = f0r - f4r;
489  t0i = f0i - f4i;
490  f0r = f0r + f4r;
491  f0i = f0i + f4i;
492 
493  t1r = f2r - f6i;
494  t1i = f2i + f6r;
495  f2r = f2r + f6i;
496  f2i = f2i - f6r;
497 
498  f4r = f1r - f5r * w0r - f5i * w0r;
499  f4i = f1i + f5r * w0r - f5i * w0r;
500  f1r = f1r * Two - f4r;
501  f1i = f1i * Two - f4i;
502 
503  f6r = f3r + f7r * w0r - f7i * w0r;
504  f6i = f3i + f7r * w0r + f7i * w0r;
505  f3r = f3r * Two - f6r;
506  f3i = f3i * Two - f6i;
507 
508 // store result
509  ioptr[0] = f0r;
510  ioptr[1] = f0i;
511  ioptr[2] = f1r;
512  ioptr[3] = f1i;
513  ioptr[4] = f2r;
514  ioptr[5] = f2i;
515  ioptr[6] = f3r;
516  ioptr[7] = f3i;
517  ioptr[8] = t0r;
518  ioptr[9] = t0i;
519  ioptr[10] = f4r;
520  ioptr[11] = f4i;
521  ioptr[12] = t1r;
522  ioptr[13] = t1i;
523  ioptr[14] = f6r;
524  ioptr[15] = f6i;
525 }
526 
527 //------------------------------------------------------------------------------
528 // 2nd radix 2 stage
529 //------------------------------------------------------------------------------
530 template <typename FFT_TYPE>
531 void g_fft<FFT_TYPE>::bfR2(FFT_TYPE *ioptr, int M, int NDiffU)
532 {
533  unsigned int pos;
534  unsigned int posi;
535  unsigned int pinc;
536  unsigned int pnext;
537  unsigned int NSameU;
538  unsigned int SameUCnt;
539 
540  FFT_TYPE *pstrt;
541  FFT_TYPE *p0r, *p1r, *p2r, *p3r;
542 
543  FFT_TYPE f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
544  FFT_TYPE f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
545 
546  pinc = NDiffU * 2; // 2 floats per complex
547  pnext = pinc * 4;
548  pos = 2;
549  posi = pos + 1;
550  NSameU = POW2(M) / 4 / NDiffU; // 4 Us at a time
551  pstrt = ioptr;
552  p0r = pstrt;
553  p1r = pstrt + pinc;
554  p2r = p1r + pinc;
555  p3r = p2r + pinc;
556 
557 // Butterflys
558 // f0 - - f4
559 // f1 - 1 - f5
560 // f2 - - f6
561 // f3 - 1 - f7
562 // Butterflys
563 // f0 - - f4
564 // f1 - 1 - f5
565 // f2 - - f6
566 // f3 - 1 - f7
567 
568  for (SameUCnt = NSameU; SameUCnt > 0; SameUCnt--) {
569 
570  f0r = *p0r;
571  f1r = *p1r;
572  f0i = *(p0r + 1);
573  f1i = *(p1r + 1);
574  f2r = *p2r;
575  f3r = *p3r;
576  f2i = *(p2r + 1);
577  f3i = *(p3r + 1);
578 
579  f4r = f0r + f1r;
580  f4i = f0i + f1i;
581  f5r = f0r - f1r;
582  f5i = f0i - f1i;
583 
584  f6r = f2r + f3r;
585  f6i = f2i + f3i;
586  f7r = f2r - f3r;
587  f7i = f2i - f3i;
588 
589  *p0r = f4r;
590  *(p0r + 1) = f4i;
591  *p1r = f5r;
592  *(p1r + 1) = f5i;
593  *p2r = f6r;
594  *(p2r + 1) = f6i;
595  *p3r = f7r;
596  *(p3r + 1) = f7i;
597 
598  f0r = *(p0r + pos);
599  f1i = *(p1r + posi);
600  f0i = *(p0r + posi);
601  f1r = *(p1r + pos);
602  f2r = *(p2r + pos);
603  f3i = *(p3r + posi);
604  f2i = *(p2r + posi);
605  f3r = *(p3r + pos);
606 
607  f4r = f0r + f1i;
608  f4i = f0i - f1r;
609  f5r = f0r - f1i;
610  f5i = f0i + f1r;
611 
612  f6r = f2r + f3i;
613  f6i = f2i - f3r;
614  f7r = f2r - f3i;
615  f7i = f2i + f3r;
616 
617  *(p0r + pos) = f4r;
618  *(p0r + posi) = f4i;
619  *(p1r + pos) = f5r;
620  *(p1r + posi) = f5i;
621  *(p2r + pos) = f6r;
622  *(p2r + posi) = f6i;
623  *(p3r + pos) = f7r;
624  *(p3r + posi) = f7i;
625 
626  p0r += pnext;
627  p1r += pnext;
628  p2r += pnext;
629  p3r += pnext;
630  }
631 }
632 
633 //------------------------------------------------------------------------------
634 // 1 radix 4 stage
635 //------------------------------------------------------------------------------
636 template <typename FFT_TYPE>
637 void g_fft<FFT_TYPE>::bfR4(FFT_TYPE *ioptr, int M, int NDiffU)
638 {
639  unsigned int pos;
640  unsigned int posi;
641  unsigned int pinc;
642  unsigned int pnext;
643  unsigned int pnexti;
644  unsigned int NSameU;
645  unsigned int SameUCnt;
646 
647  FFT_TYPE *pstrt;
648  FFT_TYPE *p0r, *p1r, *p2r, *p3r;
649 
650  FFT_TYPE w1r = 1.0 / FFT_ROOT2; // cos(pi/4)
651  FFT_TYPE f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
652  FFT_TYPE f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
653  FFT_TYPE t1r, t1i;
654  const FFT_TYPE Two = 2.0;
655 
656  pinc = NDiffU * 2; // 2 floats per complex
657  pnext = pinc * 4;
658  pnexti = pnext + 1;
659  pos = 2;
660  posi = pos + 1;
661  NSameU = POW2(M) / 4 / NDiffU; // 4 pts per butterfly
662  pstrt = ioptr;
663  p0r = pstrt;
664  p1r = pstrt + pinc;
665  p2r = p1r + pinc;
666  p3r = p2r + pinc;
667 
668 // Butterflys
669 // f0 - - f0 - - f4
670 // f1 - 1 - f5 - - f5
671 // f2 - - f6 - 1 - f6
672 // f3 - 1 - f3 - -i - f7
673 // Butterflys
674 // f0 - - f4 - - f4
675 // f1 - -i - t1 - - f5
676 // f2 - - f2 - w1 - f6
677 // f3 - -i - f7 - iw1- f7
678 
679  f0r = *p0r;
680  f1r = *p1r;
681  f2r = *p2r;
682  f3r = *p3r;
683  f0i = *(p0r + 1);
684  f1i = *(p1r + 1);
685  f2i = *(p2r + 1);
686  f3i = *(p3r + 1);
687 
688  f5r = f0r - f1r;
689  f5i = f0i - f1i;
690  f0r = f0r + f1r;
691  f0i = f0i + f1i;
692 
693  f6r = f2r + f3r;
694  f6i = f2i + f3i;
695  f3r = f2r - f3r;
696  f3i = f2i - f3i;
697 
698  for (SameUCnt = NSameU - 1; SameUCnt > 0; SameUCnt--) {
699 
700  f7r = f5r - f3i;
701  f7i = f5i + f3r;
702  f5r = f5r + f3i;
703  f5i = f5i - f3r;
704 
705  f4r = f0r + f6r;
706  f4i = f0i + f6i;
707  f6r = f0r - f6r;
708  f6i = f0i - f6i;
709 
710  f2r = *(p2r + pos);
711  f2i = *(p2r + posi);
712  f1r = *(p1r + pos);
713  f1i = *(p1r + posi);
714  f3i = *(p3r + posi);
715  f0r = *(p0r + pos);
716  f3r = *(p3r + pos);
717  f0i = *(p0r + posi);
718 
719  *p3r = f7r;
720  *p0r = f4r;
721  *(p3r + 1) = f7i;
722  *(p0r + 1) = f4i;
723  *p1r = f5r;
724  *p2r = f6r;
725  *(p1r + 1) = f5i;
726  *(p2r + 1) = f6i;
727 
728  f7r = f2r - f3i;
729  f7i = f2i + f3r;
730  f2r = f2r + f3i;
731  f2i = f2i - f3r;
732 
733  f4r = f0r + f1i;
734  f4i = f0i - f1r;
735  t1r = f0r - f1i;
736  t1i = f0i + f1r;
737 
738  f5r = t1r - f7r * w1r + f7i * w1r;
739  f5i = t1i - f7r * w1r - f7i * w1r;
740  f7r = t1r * Two - f5r;
741  f7i = t1i * Two - f5i;
742 
743  f6r = f4r - f2r * w1r - f2i * w1r;
744  f6i = f4i + f2r * w1r - f2i * w1r;
745  f4r = f4r * Two - f6r;
746  f4i = f4i * Two - f6i;
747 
748  f3r = *(p3r + pnext);
749  f0r = *(p0r + pnext);
750  f3i = *(p3r + pnexti);
751  f0i = *(p0r + pnexti);
752  f2r = *(p2r + pnext);
753  f2i = *(p2r + pnexti);
754  f1r = *(p1r + pnext);
755  f1i = *(p1r + pnexti);
756 
757  *(p2r + pos) = f6r;
758  *(p1r + pos) = f5r;
759  *(p2r + posi) = f6i;
760  *(p1r + posi) = f5i;
761  *(p3r + pos) = f7r;
762  *(p0r + pos) = f4r;
763  *(p3r + posi) = f7i;
764  *(p0r + posi) = f4i;
765 
766  f6r = f2r + f3r;
767  f6i = f2i + f3i;
768  f3r = f2r - f3r;
769  f3i = f2i - f3i;
770 
771  f5r = f0r - f1r;
772  f5i = f0i - f1i;
773  f0r = f0r + f1r;
774  f0i = f0i + f1i;
775 
776  p3r += pnext;
777  p0r += pnext;
778  p1r += pnext;
779  p2r += pnext;
780  }
781  f7r = f5r - f3i;
782  f7i = f5i + f3r;
783  f5r = f5r + f3i;
784  f5i = f5i - f3r;
785 
786  f4r = f0r + f6r;
787  f4i = f0i + f6i;
788  f6r = f0r - f6r;
789  f6i = f0i - f6i;
790 
791  f2r = *(p2r + pos);
792  f2i = *(p2r + posi);
793  f1r = *(p1r + pos);
794  f1i = *(p1r + posi);
795  f3i = *(p3r + posi);
796  f0r = *(p0r + pos);
797  f3r = *(p3r + pos);
798  f0i = *(p0r + posi);
799 
800  *p3r = f7r;
801  *p0r = f4r;
802  *(p3r + 1) = f7i;
803  *(p0r + 1) = f4i;
804  *p1r = f5r;
805  *p2r = f6r;
806  *(p1r + 1) = f5i;
807  *(p2r + 1) = f6i;
808 
809  f7r = f2r - f3i;
810  f7i = f2i + f3r;
811  f2r = f2r + f3i;
812  f2i = f2i - f3r;
813 
814  f4r = f0r + f1i;
815  f4i = f0i - f1r;
816  t1r = f0r - f1i;
817  t1i = f0i + f1r;
818 
819  f5r = t1r - f7r * w1r + f7i * w1r;
820  f5i = t1i - f7r * w1r - f7i * w1r;
821  f7r = t1r * Two - f5r;
822  f7i = t1i * Two - f5i;
823 
824  f6r = f4r - f2r * w1r - f2i * w1r;
825  f6i = f4i + f2r * w1r - f2i * w1r;
826  f4r = f4r * Two - f6r;
827  f4i = f4i * Two - f6i;
828 
829  *(p2r + pos) = f6r;
830  *(p1r + pos) = f5r;
831  *(p2r + posi) = f6i;
832  *(p1r + posi) = f5i;
833  *(p3r + pos) = f7r;
834  *(p0r + pos) = f4r;
835  *(p3r + posi) = f7i;
836  *(p0r + posi) = f4i;
837 }
838 
839 //------------------------------------------------------------------------------
840 // RADIX 8 Stages
841 //------------------------------------------------------------------------------
842 template <typename FFT_TYPE>
843 void g_fft<FFT_TYPE>::bfstages(FFT_TYPE *ioptr, int M, FFT_TYPE *inUtbl, int Ustride,
844  int NDiffU, int StageCnt)
845 {
846  unsigned int pos;
847  unsigned int posi;
848  unsigned int pinc;
849  unsigned int pnext;
850  unsigned int NSameU;
851  int Uinc;
852  int Uinc2;
853  int Uinc4;
854  unsigned int DiffUCnt;
855  unsigned int SameUCnt;
856  unsigned int U2toU3;
857 
858  FFT_TYPE *pstrt;
859  FFT_TYPE *p0r, *p1r, *p2r, *p3r;
860  FFT_TYPE *u0r, *u0i, *u1r, *u1i, *u2r, *u2i;
861 
862  FFT_TYPE w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i;
863  FFT_TYPE f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
864  FFT_TYPE f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
865  FFT_TYPE t0r, t0i, t1r, t1i;
866  const FFT_TYPE Two = FFT_TYPE(2.0);
867 
868  pinc = NDiffU * 2; // 2 floats per complex
869  pnext = pinc * 8;
870  pos = pinc * 4;
871  posi = pos + 1;
872  NSameU = POW2(M) / 8 / NDiffU; // 8 pts per butterfly
873  Uinc = (int) NSameU * Ustride;
874  Uinc2 = Uinc * 2;
875  Uinc4 = Uinc * 4;
876  U2toU3 = (POW2(M) / 8) * Ustride;
877  for (; StageCnt > 0; StageCnt--) {
878 
879  u0r = &inUtbl[0];
880  u0i = &inUtbl[POW2(M - 2) * Ustride];
881  u1r = u0r;
882  u1i = u0i;
883  u2r = u0r;
884  u2i = u0i;
885 
886  w0r = *u0r;
887  w0i = *u0i;
888  w1r = *u1r;
889  w1i = *u1i;
890  w2r = *u2r;
891  w2i = *u2i;
892  w3r = *(u2r + U2toU3);
893  w3i = *(u2i - U2toU3);
894 
895  pstrt = ioptr;
896 
897  p0r = pstrt;
898  p1r = pstrt + pinc;
899  p2r = p1r + pinc;
900  p3r = p2r + pinc;
901 
902 // Butterflys
903 // f0 - - t0 - - f0 - - f0
904 // f1 - w0- f1 - - f1 - - f1
905 // f2 - - f2 - w1- f2 - - f4
906 // f3 - w0- t1 - iw1- f3 - - f5
907 //
908 // f4 - - t0 - - f4 - w2- t0
909 // f5 - w0- f5 - - f5 - w3- t1
910 // f6 - - f6 - w1- f6 - iw2- f6
911 // f7 - w0- t1 - iw1- f7 - iw3- f7
912 
913  for (DiffUCnt = NDiffU; DiffUCnt > 0; DiffUCnt--) {
914  f0r = *p0r;
915  f0i = *(p0r + 1);
916  f1r = *p1r;
917  f1i = *(p1r + 1);
918  for (SameUCnt = NSameU - 1; SameUCnt > 0; SameUCnt--) {
919  f2r = *p2r;
920  f2i = *(p2r + 1);
921  f3r = *p3r;
922  f3i = *(p3r + 1);
923 
924  t0r = f0r + f1r * w0r + f1i * w0i;
925  t0i = f0i - f1r * w0i + f1i * w0r;
926  f1r = f0r * Two - t0r;
927  f1i = f0i * Two - t0i;
928 
929  f4r = *(p0r + pos);
930  f4i = *(p0r + posi);
931  f5r = *(p1r + pos);
932  f5i = *(p1r + posi);
933 
934  f6r = *(p2r + pos);
935  f6i = *(p2r + posi);
936  f7r = *(p3r + pos);
937  f7i = *(p3r + posi);
938 
939  t1r = f2r - f3r * w0r - f3i * w0i;
940  t1i = f2i + f3r * w0i - f3i * w0r;
941  f2r = f2r * Two - t1r;
942  f2i = f2i * Two - t1i;
943 
944  f0r = t0r + f2r * w1r + f2i * w1i;
945  f0i = t0i - f2r * w1i + f2i * w1r;
946  f2r = t0r * Two - f0r;
947  f2i = t0i * Two - f0i;
948 
949  f3r = f1r + t1r * w1i - t1i * w1r;
950  f3i = f1i + t1r * w1r + t1i * w1i;
951  f1r = f1r * Two - f3r;
952  f1i = f1i * Two - f3i;
953 
954  t0r = f4r + f5r * w0r + f5i * w0i;
955  t0i = f4i - f5r * w0i + f5i * w0r;
956  f5r = f4r * Two - t0r;
957  f5i = f4i * Two - t0i;
958 
959  t1r = f6r - f7r * w0r - f7i * w0i;
960  t1i = f6i + f7r * w0i - f7i * w0r;
961  f6r = f6r * Two - t1r;
962  f6i = f6i * Two - t1i;
963 
964  f4r = t0r + f6r * w1r + f6i * w1i;
965  f4i = t0i - f6r * w1i + f6i * w1r;
966  f6r = t0r * Two - f4r;
967  f6i = t0i * Two - f4i;
968 
969  f7r = f5r + t1r * w1i - t1i * w1r;
970  f7i = f5i + t1r * w1r + t1i * w1i;
971  f5r = f5r * Two - f7r;
972  f5i = f5i * Two - f7i;
973 
974  t0r = f0r - f4r * w2r - f4i * w2i;
975  t0i = f0i + f4r * w2i - f4i * w2r;
976  f0r = f0r * Two - t0r;
977  f0i = f0i * Two - t0i;
978 
979  t1r = f1r - f5r * w3r - f5i * w3i;
980  t1i = f1i + f5r * w3i - f5i * w3r;
981  f1r = f1r * Two - t1r;
982  f1i = f1i * Two - t1i;
983 
984  *(p0r + pos) = t0r;
985  *(p1r + pos) = t1r;
986  *(p0r + posi) = t0i;
987  *(p1r + posi) = t1i;
988  *p0r = f0r;
989  *p1r = f1r;
990  *(p0r + 1) = f0i;
991  *(p1r + 1) = f1i;
992 
993  p0r += pnext;
994  f0r = *p0r;
995  f0i = *(p0r + 1);
996 
997  p1r += pnext;
998 
999  f1r = *p1r;
1000  f1i = *(p1r + 1);
1001 
1002  f4r = f2r - f6r * w2i + f6i * w2r;
1003  f4i = f2i - f6r * w2r - f6i * w2i;
1004  f6r = f2r * Two - f4r;
1005  f6i = f2i * Two - f4i;
1006 
1007  f5r = f3r - f7r * w3i + f7i * w3r;
1008  f5i = f3i - f7r * w3r - f7i * w3i;
1009  f7r = f3r * Two - f5r;
1010  f7i = f3i * Two - f5i;
1011 
1012  *p2r = f4r;
1013  *p3r = f5r;
1014  *(p2r + 1) = f4i;
1015  *(p3r + 1) = f5i;
1016  *(p2r + pos) = f6r;
1017  *(p3r + pos) = f7r;
1018  *(p2r + posi) = f6i;
1019  *(p3r + posi) = f7i;
1020 
1021  p2r += pnext;
1022  p3r += pnext;
1023  }
1024 
1025  f2r = *p2r;
1026  f2i = *(p2r + 1);
1027  f3r = *p3r;
1028  f3i = *(p3r + 1);
1029 
1030  t0r = f0r + f1r * w0r + f1i * w0i;
1031  t0i = f0i - f1r * w0i + f1i * w0r;
1032  f1r = f0r * Two - t0r;
1033  f1i = f0i * Two - t0i;
1034 
1035  f4r = *(p0r + pos);
1036  f4i = *(p0r + posi);
1037  f5r = *(p1r + pos);
1038  f5i = *(p1r + posi);
1039 
1040  f6r = *(p2r + pos);
1041  f6i = *(p2r + posi);
1042  f7r = *(p3r + pos);
1043  f7i = *(p3r + posi);
1044 
1045  t1r = f2r - f3r * w0r - f3i * w0i;
1046  t1i = f2i + f3r * w0i - f3i * w0r;
1047  f2r = f2r * Two - t1r;
1048  f2i = f2i * Two - t1i;
1049 
1050  f0r = t0r + f2r * w1r + f2i * w1i;
1051  f0i = t0i - f2r * w1i + f2i * w1r;
1052  f2r = t0r * Two - f0r;
1053  f2i = t0i * Two - f0i;
1054 
1055  f3r = f1r + t1r * w1i - t1i * w1r;
1056  f3i = f1i + t1r * w1r + t1i * w1i;
1057  f1r = f1r * Two - f3r;
1058  f1i = f1i * Two - f3i;
1059 
1060  if ((int) DiffUCnt == NDiffU / 2)
1061  Uinc4 = -Uinc4;
1062 
1063  u0r += Uinc4;
1064  u0i -= Uinc4;
1065  u1r += Uinc2;
1066  u1i -= Uinc2;
1067  u2r += Uinc;
1068  u2i -= Uinc;
1069 
1070  pstrt += 2;
1071 
1072  t0r = f4r + f5r * w0r + f5i * w0i;
1073  t0i = f4i - f5r * w0i + f5i * w0r;
1074  f5r = f4r * Two - t0r;
1075  f5i = f4i * Two - t0i;
1076 
1077  t1r = f6r - f7r * w0r - f7i * w0i;
1078  t1i = f6i + f7r * w0i - f7i * w0r;
1079  f6r = f6r * Two - t1r;
1080  f6i = f6i * Two - t1i;
1081 
1082  f4r = t0r + f6r * w1r + f6i * w1i;
1083  f4i = t0i - f6r * w1i + f6i * w1r;
1084  f6r = t0r * Two - f4r;
1085  f6i = t0i * Two - f4i;
1086 
1087  f7r = f5r + t1r * w1i - t1i * w1r;
1088  f7i = f5i + t1r * w1r + t1i * w1i;
1089  f5r = f5r * Two - f7r;
1090  f5i = f5i * Two - f7i;
1091 
1092  w0r = *u0r;
1093  w0i = *u0i;
1094  w1r = *u1r;
1095  w1i = *u1i;
1096 
1097  if ((int) DiffUCnt <= NDiffU / 2)
1098  w0r = -w0r;
1099 
1100  t0r = f0r - f4r * w2r - f4i * w2i;
1101  t0i = f0i + f4r * w2i - f4i * w2r;
1102  f0r = f0r * Two - t0r;
1103  f0i = f0i * Two - t0i;
1104 
1105  f4r = f2r - f6r * w2i + f6i * w2r;
1106  f4i = f2i - f6r * w2r - f6i * w2i;
1107  f6r = f2r * Two - f4r;
1108  f6i = f2i * Two - f4i;
1109 
1110  *(p0r + pos) = t0r;
1111  *p2r = f4r;
1112  *(p0r + posi) = t0i;
1113  *(p2r + 1) = f4i;
1114  w2r = *u2r;
1115  w2i = *u2i;
1116  *p0r = f0r;
1117  *(p2r + pos) = f6r;
1118  *(p0r + 1) = f0i;
1119  *(p2r + posi) = f6i;
1120 
1121  p0r = pstrt;
1122  p2r = pstrt + pinc + pinc;
1123 
1124  t1r = f1r - f5r * w3r - f5i * w3i;
1125  t1i = f1i + f5r * w3i - f5i * w3r;
1126  f1r = f1r * Two - t1r;
1127  f1i = f1i * Two - t1i;
1128 
1129  f5r = f3r - f7r * w3i + f7i * w3r;
1130  f5i = f3i - f7r * w3r - f7i * w3i;
1131  f7r = f3r * Two - f5r;
1132  f7i = f3i * Two - f5i;
1133 
1134  *(p1r + pos) = t1r;
1135  *p3r = f5r;
1136  *(p1r + posi) = t1i;
1137  *(p3r + 1) = f5i;
1138  w3r = *(u2r + U2toU3);
1139  w3i = *(u2i - U2toU3);
1140  *p1r = f1r;
1141  *(p3r + pos) = f7r;
1142  *(p1r + 1) = f1i;
1143  *(p3r + posi) = f7i;
1144 
1145  p1r = pstrt + pinc;
1146  p3r = p2r + pinc;
1147  }
1148  NSameU /= 8;
1149  Uinc /= 8;
1150  Uinc2 /= 8;
1151  Uinc4 = Uinc * 4;
1152  NDiffU *= 8;
1153  pinc *= 8;
1154  pnext *= 8;
1155  pos *= 8;
1156  posi = pos + 1;
1157  }
1158 }
1159 
1160 template <typename FFT_TYPE>
1161 void g_fft<FFT_TYPE>::fftrecurs(FFT_TYPE *ioptr, int M,
1162  FFT_TYPE *Utbl, int Ustride, int NDiffU,
1163  int StageCnt)
1164 {
1165 // recursive bfstages calls to maximize on chip cache efficiency
1166  int i1;
1167 
1168  if (M <= (int) MCACHE) // fits on chip ?
1169  bfstages(ioptr, M, Utbl, Ustride, NDiffU, StageCnt); // RADIX 8 Stages
1170  else {
1171  for (i1 = 0; i1 < 8; i1++) {
1172  fftrecurs(&ioptr[i1 * POW2(M - 3) * 2], M - 3, Utbl, 8 * Ustride,
1173  NDiffU, StageCnt - 1); // RADIX 8 Stages
1174  }
1175  bfstages(ioptr, M, Utbl, Ustride, POW2(M - 3), 1); // RADIX 8 Stage
1176  }
1177 }
1178 
1179 //------------------------------------------------------------------------------
1180 // Compute in-place complex fft on the rows of the input array
1181 // INPUTS
1182 // *ioptr = input data array
1183 // M = log2 of fft size (ex M=10 for 1024 point fft)
1184 // *Utbl = cosine table
1185 // *BRLow = bit reversed counter table
1186 // OUTPUTS
1187 // *ioptr = output data array
1188 //------------------------------------------------------------------------------
1189 template <typename FFT_TYPE>
1190 void g_fft<FFT_TYPE>::ffts1(FFT_TYPE *ioptr, int M, FFT_TYPE *Utbl, short *BRLow)
1191 {
1192  int StageCnt;
1193  int NDiffU;
1194 
1195  switch (M) {
1196  case 0:
1197  break;
1198  case 1:
1199  fft2pt(ioptr); // a 2 pt fft
1200  break;
1201  case 2:
1202  fft4pt(ioptr); // a 4 pt fft
1203  break;
1204  case 3:
1205  fft8pt(ioptr); // an 8 pt fft
1206  break;
1207  default:
1208  bitrevR2(ioptr, M, BRLow); // bit reverse and first radix 2 stage
1209  StageCnt = (M - 1) / 3; // number of radix 8 stages
1210  NDiffU = 2; // one radix 2 stage already complete
1211  if ((M - 1 - (StageCnt * 3)) == 1) {
1212  bfR2(ioptr, M, NDiffU); // 1 radix 2 stage
1213  NDiffU *= 2;
1214  }
1215  if ((M - 1 - (StageCnt * 3)) == 2) {
1216  bfR4(ioptr, M, NDiffU); // 1 radix 4 stage
1217  NDiffU *= 4;
1218  }
1219  if (M <= (int) MCACHE)
1220  bfstages(ioptr, M, Utbl, 1, NDiffU, StageCnt); // RADIX 8 Stages
1221  else
1222  fftrecurs(ioptr, M, Utbl, 1, NDiffU, StageCnt); // RADIX 8 Stages
1223  }
1224 }
1225 
1226 //------------------------------------------------------------------------------
1227 // parts of iffts1
1228 // scaled bit reverse and first radix 2 stage forward or inverse fft
1229 //------------------------------------------------------------------------------
1230 template <typename FFT_TYPE>
1231 void g_fft<FFT_TYPE>::scbitrevR2(FFT_TYPE *ioptr, int M, short *inBRLow, FFT_TYPE scale)
1232 {
1233  FFT_TYPE f0r;
1234  FFT_TYPE f0i;
1235  FFT_TYPE f1r;
1236  FFT_TYPE f1i;
1237  FFT_TYPE f2r;
1238  FFT_TYPE f2i;
1239  FFT_TYPE f3r;
1240  FFT_TYPE f3i;
1241  FFT_TYPE f4r;
1242  FFT_TYPE f4i;
1243  FFT_TYPE f5r;
1244  FFT_TYPE f5i;
1245  FFT_TYPE f6r;
1246  FFT_TYPE f6i;
1247  FFT_TYPE f7r;
1248  FFT_TYPE f7i;
1249  FFT_TYPE t0r;
1250  FFT_TYPE t0i;
1251  FFT_TYPE t1r;
1252  FFT_TYPE t1i;
1253  FFT_TYPE *p0r;
1254  FFT_TYPE *p1r;
1255  FFT_TYPE *IOP;
1256  FFT_TYPE *iolimit;
1257  int Colstart;
1258  int iCol;
1259  unsigned int posA;
1260  unsigned int posAi;
1261  unsigned int posB;
1262  unsigned int posBi;
1263 
1264  const unsigned int Nrems2 = POW2((M + 3) / 2);
1265  const unsigned int Nroot_1_ColInc = POW2(M) - Nrems2;
1266  const unsigned int Nroot_1 = POW2(M / 2 - 1) - 1;
1267  const unsigned int ColstartShift = (M + 1) / 2 + 1;
1268 
1269  posA = POW2(M); // 1/2 of POW2(M) complexes
1270  posAi = posA + 1;
1271  posB = posA + 2;
1272  posBi = posB + 1;
1273 
1274  iolimit = ioptr + Nrems2;
1275  for (; ioptr < iolimit; ioptr += POW2(M / 2 + 1)) {
1276  for (Colstart = Nroot_1; Colstart >= 0; Colstart--) {
1277  iCol = Nroot_1;
1278  p0r = ioptr + Nroot_1_ColInc + inBRLow[Colstart] * 4;
1279  IOP = ioptr + (Colstart << ColstartShift);
1280  p1r = IOP + inBRLow[iCol] * 4;
1281  f0r = *(p0r);
1282  f0i = *(p0r + 1);
1283  f1r = *(p0r + posA);
1284  f1i = *(p0r + posAi);
1285  for (; iCol > Colstart;) {
1286  f2r = *(p0r + 2);
1287  f2i = *(p0r + (2 + 1));
1288  f3r = *(p0r + posB);
1289  f3i = *(p0r + posBi);
1290  f4r = *(p1r);
1291  f4i = *(p1r + 1);
1292  f5r = *(p1r + posA);
1293  f5i = *(p1r + posAi);
1294  f6r = *(p1r + 2);
1295  f6i = *(p1r + (2 + 1));
1296  f7r = *(p1r + posB);
1297  f7i = *(p1r + posBi);
1298 
1299  t0r = f0r + f1r;
1300  t0i = f0i + f1i;
1301  f1r = f0r - f1r;
1302  f1i = f0i - f1i;
1303  t1r = f2r + f3r;
1304  t1i = f2i + f3i;
1305  f3r = f2r - f3r;
1306  f3i = f2i - f3i;
1307  f0r = f4r + f5r;
1308  f0i = f4i + f5i;
1309  f5r = f4r - f5r;
1310  f5i = f4i - f5i;
1311  f2r = f6r + f7r;
1312  f2i = f6i + f7i;
1313  f7r = f6r - f7r;
1314  f7i = f6i - f7i;
1315 
1316  *(p1r) = scale * t0r;
1317  *(p1r + 1) = scale * t0i;
1318  *(p1r + 2) = scale * f1r;
1319  *(p1r + (2 + 1)) = scale * f1i;
1320  *(p1r + posA) = scale * t1r;
1321  *(p1r + posAi) = scale * t1i;
1322  *(p1r + posB) = scale * f3r;
1323  *(p1r + posBi) = scale * f3i;
1324  *(p0r) = scale * f0r;
1325  *(p0r + 1) = scale * f0i;
1326  *(p0r + 2) = scale * f5r;
1327  *(p0r + (2 + 1)) = scale * f5i;
1328  *(p0r + posA) = scale * f2r;
1329  *(p0r + posAi) = scale * f2i;
1330  *(p0r + posB) = scale * f7r;
1331  *(p0r + posBi) = scale * f7i;
1332 
1333  p0r -= Nrems2;
1334  f0r = *(p0r);
1335  f0i = *(p0r + 1);
1336  f1r = *(p0r + posA);
1337  f1i = *(p0r + posAi);
1338  iCol -= 1;
1339  p1r = IOP + inBRLow[iCol] * 4;
1340  }
1341  f2r = *(p0r + 2);
1342  f2i = *(p0r + (2 + 1));
1343  f3r = *(p0r + posB);
1344  f3i = *(p0r + posBi);
1345 
1346  t0r = f0r + f1r;
1347  t0i = f0i + f1i;
1348  f1r = f0r - f1r;
1349  f1i = f0i - f1i;
1350  t1r = f2r + f3r;
1351  t1i = f2i + f3i;
1352  f3r = f2r - f3r;
1353  f3i = f2i - f3i;
1354 
1355  *(p0r) = scale * t0r;
1356  *(p0r + 1) = scale * t0i;
1357  *(p0r + 2) = scale * f1r;
1358  *(p0r + (2 + 1)) = scale * f1i;
1359  *(p0r + posA) = scale * t1r;
1360  *(p0r + posAi) = scale * t1i;
1361  *(p0r + posB) = scale * f3r;
1362  *(p0r + posBi) = scale * f3i;
1363  }
1364  }
1365 }
1366 
1367 //------------------------------------------------------------------------------
1368 // RADIX 2 ifft
1369 //------------------------------------------------------------------------------
1370 template <typename FFT_TYPE>
1371 void g_fft<FFT_TYPE>::ifft2pt(FFT_TYPE *ioptr, FFT_TYPE scale)
1372 {
1373  FFT_TYPE f0r, f0i, f1r, f1i;
1374  FFT_TYPE t0r, t0i;
1375 
1376 // bit reversed load
1377  f0r = ioptr[0];
1378  f0i = ioptr[1];
1379  f1r = ioptr[2];
1380  f1i = ioptr[3];
1381 
1382 // Butterflys
1383 // f0 - - t0
1384 // f1 - 1 - f1
1385 
1386  t0r = f0r + f1r;
1387  t0i = f0i + f1i;
1388  f1r = f0r - f1r;
1389  f1i = f0i - f1i;
1390 
1391 // store result
1392  ioptr[0] = scale * t0r;
1393  ioptr[1] = scale * t0i;
1394  ioptr[2] = scale * f1r;
1395  ioptr[3] = scale * f1i;
1396 }
1397 
1398 //------------------------------------------------------------------------------
1399 // RADIX 4 ifft
1400 //------------------------------------------------------------------------------
1401 template <typename FFT_TYPE>
1402 void g_fft<FFT_TYPE>::ifft4pt(FFT_TYPE *ioptr, FFT_TYPE scale)
1403 {
1404  FFT_TYPE f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
1405  FFT_TYPE t0r, t0i, t1r, t1i;
1406 
1407 // bit reversed load
1408  f0r = ioptr[0];
1409  f0i = ioptr[1];
1410  f1r = ioptr[4];
1411  f1i = ioptr[5];
1412  f2r = ioptr[2];
1413  f2i = ioptr[3];
1414  f3r = ioptr[6];
1415  f3i = ioptr[7];
1416 
1417 // Butterflys
1418 // f0 - - t0 - - f0
1419 // f1 - 1 - f1 - - f1
1420 // f2 - - f2 - 1 - f2
1421 // f3 - 1 - t1 - i - f3
1422 
1423  t0r = f0r + f1r;
1424  t0i = f0i + f1i;
1425  f1r = f0r - f1r;
1426  f1i = f0i - f1i;
1427 
1428  t1r = f2r - f3r;
1429  t1i = f2i - f3i;
1430  f2r = f2r + f3r;
1431  f2i = f2i + f3i;
1432 
1433  f0r = t0r + f2r;
1434  f0i = t0i + f2i;
1435  f2r = t0r - f2r;
1436  f2i = t0i - f2i;
1437 
1438  f3r = f1r + t1i;
1439  f3i = f1i - t1r;
1440  f1r = f1r - t1i;
1441  f1i = f1i + t1r;
1442 
1443 // store result
1444  ioptr[0] = scale * f0r;
1445  ioptr[1] = scale * f0i;
1446  ioptr[2] = scale * f1r;
1447  ioptr[3] = scale * f1i;
1448  ioptr[4] = scale * f2r;
1449  ioptr[5] = scale * f2i;
1450  ioptr[6] = scale * f3r;
1451  ioptr[7] = scale * f3i;
1452 }
1453 
1454 //------------------------------------------------------------------------------
1455 // RADIX 8 ifft
1456 //------------------------------------------------------------------------------
1457 template <typename FFT_TYPE>
1458 void g_fft<FFT_TYPE>::ifft8pt(FFT_TYPE *ioptr, FFT_TYPE scale)
1459 {
1460  FFT_TYPE w0r = 1.0 / FFT_ROOT2; // cos(pi/4)
1461  FFT_TYPE f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
1462  FFT_TYPE f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
1463  FFT_TYPE t0r, t0i, t1r, t1i;
1464  const FFT_TYPE Two = 2.0;
1465 
1466 // bit reversed load
1467  f0r = ioptr[0];
1468  f0i = ioptr[1];
1469  f1r = ioptr[8];
1470  f1i = ioptr[9];
1471  f2r = ioptr[4];
1472  f2i = ioptr[5];
1473  f3r = ioptr[12];
1474  f3i = ioptr[13];
1475  f4r = ioptr[2];
1476  f4i = ioptr[3];
1477  f5r = ioptr[10];
1478  f5i = ioptr[11];
1479  f6r = ioptr[6];
1480  f6i = ioptr[7];
1481  f7r = ioptr[14];
1482  f7i = ioptr[15];
1483 
1484 // Butterflys
1485 // f0 - - t0 - - f0 - - f0
1486 // f1 - 1 - f1 - - f1 - - f1
1487 // f2 - - f2 - 1 - f2 - - f2
1488 // f3 - 1 - t1 - i - f3 - - f3
1489 // f4 - - t0 - - f4 - 1 - t0
1490 // f5 - 1 - f5 - - f5 - w3 - f4
1491 // f6 - - f6 - 1 - f6 - i - t1
1492 // f7 - 1 - t1 - i - f7 - iw3- f6
1493 
1494  t0r = f0r + f1r;
1495  t0i = f0i + f1i;
1496  f1r = f0r - f1r;
1497  f1i = f0i - f1i;
1498 
1499  t1r = f2r - f3r;
1500  t1i = f2i - f3i;
1501  f2r = f2r + f3r;
1502  f2i = f2i + f3i;
1503 
1504  f0r = t0r + f2r;
1505  f0i = t0i + f2i;
1506  f2r = t0r - f2r;
1507  f2i = t0i - f2i;
1508 
1509  f3r = f1r + t1i;
1510  f3i = f1i - t1r;
1511  f1r = f1r - t1i;
1512  f1i = f1i + t1r;
1513 
1514  t0r = f4r + f5r;
1515  t0i = f4i + f5i;
1516  f5r = f4r - f5r;
1517  f5i = f4i - f5i;
1518 
1519  t1r = f6r - f7r;
1520  t1i = f6i - f7i;
1521  f6r = f6r + f7r;
1522  f6i = f6i + f7i;
1523 
1524  f4r = t0r + f6r;
1525  f4i = t0i + f6i;
1526  f6r = t0r - f6r;
1527  f6i = t0i - f6i;
1528 
1529  f7r = f5r + t1i;
1530  f7i = f5i - t1r;
1531  f5r = f5r - t1i;
1532  f5i = f5i + t1r;
1533 
1534  t0r = f0r - f4r;
1535  t0i = f0i - f4i;
1536  f0r = f0r + f4r;
1537  f0i = f0i + f4i;
1538 
1539  t1r = f2r + f6i;
1540  t1i = f2i - f6r;
1541  f2r = f2r - f6i;
1542  f2i = f2i + f6r;
1543 
1544  f4r = f1r - f5r * w0r + f5i * w0r;
1545  f4i = f1i - f5r * w0r - f5i * w0r;
1546  f1r = f1r * Two - f4r;
1547  f1i = f1i * Two - f4i;
1548 
1549  f6r = f3r + f7r * w0r + f7i * w0r;
1550  f6i = f3i - f7r * w0r + f7i * w0r;
1551  f3r = f3r * Two - f6r;
1552  f3i = f3i * Two - f6i;
1553 
1554 // store result
1555  ioptr[0] = scale * f0r;
1556  ioptr[1] = scale * f0i;
1557  ioptr[2] = scale * f1r;
1558  ioptr[3] = scale * f1i;
1559  ioptr[4] = scale * f2r;
1560  ioptr[5] = scale * f2i;
1561  ioptr[6] = scale * f3r;
1562  ioptr[7] = scale * f3i;
1563  ioptr[8] = scale * t0r;
1564  ioptr[9] = scale * t0i;
1565  ioptr[10] = scale * f4r;
1566  ioptr[11] = scale * f4i;
1567  ioptr[12] = scale * t1r;
1568  ioptr[13] = scale * t1i;
1569  ioptr[14] = scale * f6r;
1570  ioptr[15] = scale * f6i;
1571 }
1572 
1573 //------------------------------------------------------------------------------
1574 // 2nd radix 2 stage
1575 //------------------------------------------------------------------------------
1576 template <typename FFT_TYPE>
1577 void g_fft<FFT_TYPE>::ibfR2(FFT_TYPE *ioptr, int M, int NDiffU)
1578 {
1579  unsigned int pos;
1580  unsigned int posi;
1581  unsigned int pinc;
1582  unsigned int pnext;
1583  unsigned int NSameU;
1584  unsigned int SameUCnt;
1585 
1586  FFT_TYPE *pstrt;
1587  FFT_TYPE *p0r, *p1r, *p2r, *p3r;
1588 
1589  FFT_TYPE f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
1590  FFT_TYPE f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
1591 
1592  pinc = NDiffU * 2; // 2 floats per complex
1593  pnext = pinc * 4;
1594  pos = 2;
1595  posi = pos + 1;
1596  NSameU = POW2(M) / 4 / NDiffU; // 4 Us at a time
1597  pstrt = ioptr;
1598  p0r = pstrt;
1599  p1r = pstrt + pinc;
1600  p2r = p1r + pinc;
1601  p3r = p2r + pinc;
1602 
1603 // Butterflys
1604 // f0 - - f4
1605 // f1 - 1 - f5
1606 // f2 - - f6
1607 // f3 - 1 - f7
1608 // Butterflys
1609 // f0 - - f4
1610 // f1 - 1 - f5
1611 // f2 - - f6
1612 // f3 - 1 - f7
1613 
1614  for (SameUCnt = NSameU; SameUCnt > 0; SameUCnt--) {
1615 
1616  f0r = *p0r;
1617  f1r = *p1r;
1618  f0i = *(p0r + 1);
1619  f1i = *(p1r + 1);
1620  f2r = *p2r;
1621  f3r = *p3r;
1622  f2i = *(p2r + 1);
1623  f3i = *(p3r + 1);
1624 
1625  f4r = f0r + f1r;
1626  f4i = f0i + f1i;
1627  f5r = f0r - f1r;
1628  f5i = f0i - f1i;
1629 
1630  f6r = f2r + f3r;
1631  f6i = f2i + f3i;
1632  f7r = f2r - f3r;
1633  f7i = f2i - f3i;
1634 
1635  *p0r = f4r;
1636  *(p0r + 1) = f4i;
1637  *p1r = f5r;
1638  *(p1r + 1) = f5i;
1639  *p2r = f6r;
1640  *(p2r + 1) = f6i;
1641  *p3r = f7r;
1642  *(p3r + 1) = f7i;
1643 
1644  f0r = *(p0r + pos);
1645  f1i = *(p1r + posi);
1646  f0i = *(p0r + posi);
1647  f1r = *(p1r + pos);
1648  f2r = *(p2r + pos);
1649  f3i = *(p3r + posi);
1650  f2i = *(p2r + posi);
1651  f3r = *(p3r + pos);
1652 
1653  f4r = f0r - f1i;
1654  f4i = f0i + f1r;
1655  f5r = f0r + f1i;
1656  f5i = f0i - f1r;
1657 
1658  f6r = f2r - f3i;
1659  f6i = f2i + f3r;
1660  f7r = f2r + f3i;
1661  f7i = f2i - f3r;
1662 
1663  *(p0r + pos) = f4r;
1664  *(p0r + posi) = f4i;
1665  *(p1r + pos) = f5r;
1666  *(p1r + posi) = f5i;
1667  *(p2r + pos) = f6r;
1668  *(p2r + posi) = f6i;
1669  *(p3r + pos) = f7r;
1670  *(p3r + posi) = f7i;
1671 
1672  p0r += pnext;
1673  p1r += pnext;
1674  p2r += pnext;
1675  p3r += pnext;
1676  }
1677 }
1678 
1679 //------------------------------------------------------------------------------
1680 // 1 radix 4 stage
1681 //------------------------------------------------------------------------------
1682 template <typename FFT_TYPE>
1683 void g_fft<FFT_TYPE>::ibfR4(FFT_TYPE *ioptr, int M, int NDiffU)
1684 {
1685  unsigned int pos;
1686  unsigned int posi;
1687  unsigned int pinc;
1688  unsigned int pnext;
1689  unsigned int pnexti;
1690  unsigned int NSameU;
1691  unsigned int SameUCnt;
1692 
1693  FFT_TYPE *pstrt;
1694  FFT_TYPE *p0r, *p1r, *p2r, *p3r;
1695 
1696  FFT_TYPE w1r = 1.0 / FFT_ROOT2; // cos(pi/4)
1697  FFT_TYPE f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
1698  FFT_TYPE f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
1699  FFT_TYPE t1r, t1i;
1700  const FFT_TYPE Two = 2.0;
1701 
1702  pinc = NDiffU * 2; // 2 floats per complex
1703  pnext = pinc * 4;
1704  pnexti = pnext + 1;
1705  pos = 2;
1706  posi = pos + 1;
1707  NSameU = POW2(M) / 4 / NDiffU; // 4 pts per butterfly
1708  pstrt = ioptr;
1709  p0r = pstrt;
1710  p1r = pstrt + pinc;
1711  p2r = p1r + pinc;
1712  p3r = p2r + pinc;
1713 
1714 // Butterflys
1715 // f0 - - f0 - - f4
1716 // f1 - 1 - f5 - - f5
1717 // f2 - - f6 - 1 - f6
1718 // f3 - 1 - f3 - -i - f7
1719 // Butterflys
1720 // f0 - - f4 - - f4
1721 // f1 - -i - t1 - - f5
1722 // f2 - - f2 - w1 - f6
1723 // f3 - -i - f7 - iw1- f7
1724 
1725  f0r = *p0r;
1726  f1r = *p1r;
1727  f2r = *p2r;
1728  f3r = *p3r;
1729  f0i = *(p0r + 1);
1730  f1i = *(p1r + 1);
1731  f2i = *(p2r + 1);
1732  f3i = *(p3r + 1);
1733 
1734  f5r = f0r - f1r;
1735  f5i = f0i - f1i;
1736  f0r = f0r + f1r;
1737  f0i = f0i + f1i;
1738 
1739  f6r = f2r + f3r;
1740  f6i = f2i + f3i;
1741  f3r = f2r - f3r;
1742  f3i = f2i - f3i;
1743 
1744  for (SameUCnt = NSameU - 1; SameUCnt > 0; SameUCnt--) {
1745 
1746  f7r = f5r + f3i;
1747  f7i = f5i - f3r;
1748  f5r = f5r - f3i;
1749  f5i = f5i + f3r;
1750 
1751  f4r = f0r + f6r;
1752  f4i = f0i + f6i;
1753  f6r = f0r - f6r;
1754  f6i = f0i - f6i;
1755 
1756  f2r = *(p2r + pos);
1757  f2i = *(p2r + posi);
1758  f1r = *(p1r + pos);
1759  f1i = *(p1r + posi);
1760  f3i = *(p3r + posi);
1761  f0r = *(p0r + pos);
1762  f3r = *(p3r + pos);
1763  f0i = *(p0r + posi);
1764 
1765  *p3r = f7r;
1766  *p0r = f4r;
1767  *(p3r + 1) = f7i;
1768  *(p0r + 1) = f4i;
1769  *p1r = f5r;
1770  *p2r = f6r;
1771  *(p1r + 1) = f5i;
1772  *(p2r + 1) = f6i;
1773 
1774  f7r = f2r + f3i;
1775  f7i = f2i - f3r;
1776  f2r = f2r - f3i;
1777  f2i = f2i + f3r;
1778 
1779  f4r = f0r - f1i;
1780  f4i = f0i + f1r;
1781  t1r = f0r + f1i;
1782  t1i = f0i - f1r;
1783 
1784  f5r = t1r - f7r * w1r - f7i * w1r;
1785  f5i = t1i + f7r * w1r - f7i * w1r;
1786  f7r = t1r * Two - f5r;
1787  f7i = t1i * Two - f5i;
1788 
1789  f6r = f4r - f2r * w1r + f2i * w1r;
1790  f6i = f4i - f2r * w1r - f2i * w1r;
1791  f4r = f4r * Two - f6r;
1792  f4i = f4i * Two - f6i;
1793 
1794  f3r = *(p3r + pnext);
1795  f0r = *(p0r + pnext);
1796  f3i = *(p3r + pnexti);
1797  f0i = *(p0r + pnexti);
1798  f2r = *(p2r + pnext);
1799  f2i = *(p2r + pnexti);
1800  f1r = *(p1r + pnext);
1801  f1i = *(p1r + pnexti);
1802 
1803  *(p2r + pos) = f6r;
1804  *(p1r + pos) = f5r;
1805  *(p2r + posi) = f6i;
1806  *(p1r + posi) = f5i;
1807  *(p3r + pos) = f7r;
1808  *(p0r + pos) = f4r;
1809  *(p3r + posi) = f7i;
1810  *(p0r + posi) = f4i;
1811 
1812  f6r = f2r + f3r;
1813  f6i = f2i + f3i;
1814  f3r = f2r - f3r;
1815  f3i = f2i - f3i;
1816 
1817  f5r = f0r - f1r;
1818  f5i = f0i - f1i;
1819  f0r = f0r + f1r;
1820  f0i = f0i + f1i;
1821 
1822  p3r += pnext;
1823  p0r += pnext;
1824  p1r += pnext;
1825  p2r += pnext;
1826  }
1827  f7r = f5r + f3i;
1828  f7i = f5i - f3r;
1829  f5r = f5r - f3i;
1830  f5i = f5i + f3r;
1831 
1832  f4r = f0r + f6r;
1833  f4i = f0i + f6i;
1834  f6r = f0r - f6r;
1835  f6i = f0i - f6i;
1836 
1837  f2r = *(p2r + pos);
1838  f2i = *(p2r + posi);
1839  f1r = *(p1r + pos);
1840  f1i = *(p1r + posi);
1841  f3i = *(p3r + posi);
1842  f0r = *(p0r + pos);
1843  f3r = *(p3r + pos);
1844  f0i = *(p0r + posi);
1845 
1846  *p3r = f7r;
1847  *p0r = f4r;
1848  *(p3r + 1) = f7i;
1849  *(p0r + 1) = f4i;
1850  *p1r = f5r;
1851  *p2r = f6r;
1852  *(p1r + 1) = f5i;
1853  *(p2r + 1) = f6i;
1854 
1855  f7r = f2r + f3i;
1856  f7i = f2i - f3r;
1857  f2r = f2r - f3i;
1858  f2i = f2i + f3r;
1859 
1860  f4r = f0r - f1i;
1861  f4i = f0i + f1r;
1862  t1r = f0r + f1i;
1863  t1i = f0i - f1r;
1864 
1865  f5r = t1r - f7r * w1r - f7i * w1r;
1866  f5i = t1i + f7r * w1r - f7i * w1r;
1867  f7r = t1r * Two - f5r;
1868  f7i = t1i * Two - f5i;
1869 
1870  f6r = f4r - f2r * w1r + f2i * w1r;
1871  f6i = f4i - f2r * w1r - f2i * w1r;
1872  f4r = f4r * Two - f6r;
1873  f4i = f4i * Two - f6i;
1874 
1875  *(p2r + pos) = f6r;
1876  *(p1r + pos) = f5r;
1877  *(p2r + posi) = f6i;
1878  *(p1r + posi) = f5i;
1879  *(p3r + pos) = f7r;
1880  *(p0r + pos) = f4r;
1881  *(p3r + posi) = f7i;
1882  *(p0r + posi) = f4i;
1883 }
1884 
1885 //------------------------------------------------------------------------------
1886 // RADIX 8 Stages
1887 //------------------------------------------------------------------------------
1888 template <typename FFT_TYPE>
1889 void g_fft<FFT_TYPE>::ibfstages(FFT_TYPE *ioptr, int M, FFT_TYPE *inUtbl, int Ustride,
1890  int NDiffU, int StageCnt)
1891 {
1892  unsigned int pos;
1893  unsigned int posi;
1894  unsigned int pinc;
1895  unsigned int pnext;
1896  unsigned int NSameU;
1897  int Uinc;
1898  int Uinc2;
1899  int Uinc4;
1900  unsigned int DiffUCnt;
1901  unsigned int SameUCnt;
1902  unsigned int U2toU3;
1903 
1904  FFT_TYPE *pstrt;
1905  FFT_TYPE *p0r, *p1r, *p2r, *p3r;
1906  FFT_TYPE *u0r, *u0i, *u1r, *u1i, *u2r, *u2i;
1907 
1908  FFT_TYPE w0r, w0i, w1r, w1i, w2r, w2i, w3r, w3i;
1909  FFT_TYPE f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
1910  FFT_TYPE f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
1911  FFT_TYPE t0r, t0i, t1r, t1i;
1912  const FFT_TYPE Two = 2.0;
1913 
1914  pinc = NDiffU * 2; // 2 floats per complex
1915  pnext = pinc * 8;
1916  pos = pinc * 4;
1917  posi = pos + 1;
1918  NSameU = POW2(M) / 8 / NDiffU; // 8 pts per butterfly
1919  Uinc = (int) NSameU * Ustride;
1920  Uinc2 = Uinc * 2;
1921  Uinc4 = Uinc * 4;
1922  U2toU3 = (POW2(M) / 8) * Ustride;
1923  for (; StageCnt > 0; StageCnt--) {
1924 
1925  u0r = &inUtbl[0];
1926  u0i = &inUtbl[POW2(M - 2) * Ustride];
1927  u1r = u0r;
1928  u1i = u0i;
1929  u2r = u0r;
1930  u2i = u0i;
1931 
1932  w0r = *u0r;
1933  w0i = *u0i;
1934  w1r = *u1r;
1935  w1i = *u1i;
1936  w2r = *u2r;
1937  w2i = *u2i;
1938  w3r = *(u2r + U2toU3);
1939  w3i = *(u2i - U2toU3);
1940 
1941  pstrt = ioptr;
1942 
1943  p0r = pstrt;
1944  p1r = pstrt + pinc;
1945  p2r = p1r + pinc;
1946  p3r = p2r + pinc;
1947 
1948 // Butterflys
1949 // f0 - - t0 - - f0 - - f0
1950 // f1 - w0- f1 - - f1 - - f1
1951 // f2 - - f2 - w1- f2 - - f4
1952 // f3 - w0- t1 - iw1- f3 - - f5
1953 // f4 - - t0 - - f4 - w2- t0
1954 // f5 - w0- f5 - - f5 - w3- t1
1955 // f6 - - f6 - w1- f6 - iw2- f6
1956 // f7 - w0- t1 - iw1- f7 - iw3- f7
1957 
1958  for (DiffUCnt = NDiffU; DiffUCnt > 0; DiffUCnt--) {
1959  f0r = *p0r;
1960  f0i = *(p0r + 1);
1961  f1r = *p1r;
1962  f1i = *(p1r + 1);
1963  for (SameUCnt = NSameU - 1; SameUCnt > 0; SameUCnt--) {
1964  f2r = *p2r;
1965  f2i = *(p2r + 1);
1966  f3r = *p3r;
1967  f3i = *(p3r + 1);
1968 
1969  t0r = f0r + f1r * w0r - f1i * w0i;
1970  t0i = f0i + f1r * w0i + f1i * w0r;
1971  f1r = f0r * Two - t0r;
1972  f1i = f0i * Two - t0i;
1973 
1974  f4r = *(p0r + pos);
1975  f4i = *(p0r + posi);
1976  f5r = *(p1r + pos);
1977  f5i = *(p1r + posi);
1978 
1979  f6r = *(p2r + pos);
1980  f6i = *(p2r + posi);
1981  f7r = *(p3r + pos);
1982  f7i = *(p3r + posi);
1983 
1984  t1r = f2r - f3r * w0r + f3i * w0i;
1985  t1i = f2i - f3r * w0i - f3i * w0r;
1986  f2r = f2r * Two - t1r;
1987  f2i = f2i * Two - t1i;
1988 
1989  f0r = t0r + f2r * w1r - f2i * w1i;
1990  f0i = t0i + f2r * w1i + f2i * w1r;
1991  f2r = t0r * Two - f0r;
1992  f2i = t0i * Two - f0i;
1993 
1994  f3r = f1r + t1r * w1i + t1i * w1r;
1995  f3i = f1i - t1r * w1r + t1i * w1i;
1996  f1r = f1r * Two - f3r;
1997  f1i = f1i * Two - f3i;
1998 
1999  t0r = f4r + f5r * w0r - f5i * w0i;
2000  t0i = f4i + f5r * w0i + f5i * w0r;
2001  f5r = f4r * Two - t0r;
2002  f5i = f4i * Two - t0i;
2003 
2004  t1r = f6r - f7r * w0r + f7i * w0i;
2005  t1i = f6i - f7r * w0i - f7i * w0r;
2006  f6r = f6r * Two - t1r;
2007  f6i = f6i * Two - t1i;
2008 
2009  f4r = t0r + f6r * w1r - f6i * w1i;
2010  f4i = t0i + f6r * w1i + f6i * w1r;
2011  f6r = t0r * Two - f4r;
2012  f6i = t0i * Two - f4i;
2013 
2014  f7r = f5r + t1r * w1i + t1i * w1r;
2015  f7i = f5i - t1r * w1r + t1i * w1i;
2016  f5r = f5r * Two - f7r;
2017  f5i = f5i * Two - f7i;
2018 
2019  t0r = f0r - f4r * w2r + f4i * w2i;
2020  t0i = f0i - f4r * w2i - f4i * w2r;
2021  f0r = f0r * Two - t0r;
2022  f0i = f0i * Two - t0i;
2023 
2024  t1r = f1r - f5r * w3r + f5i * w3i;
2025  t1i = f1i - f5r * w3i - f5i * w3r;
2026  f1r = f1r * Two - t1r;
2027  f1i = f1i * Two - t1i;
2028 
2029  *(p0r + pos) = t0r;
2030  *(p0r + posi) = t0i;
2031  *p0r = f0r;
2032  *(p0r + 1) = f0i;
2033 
2034  p0r += pnext;
2035  f0r = *p0r;
2036  f0i = *(p0r + 1);
2037 
2038  *(p1r + pos) = t1r;
2039  *(p1r + posi) = t1i;
2040  *p1r = f1r;
2041  *(p1r + 1) = f1i;
2042 
2043  p1r += pnext;
2044 
2045  f1r = *p1r;
2046  f1i = *(p1r + 1);
2047 
2048  f4r = f2r - f6r * w2i - f6i * w2r;
2049  f4i = f2i + f6r * w2r - f6i * w2i;
2050  f6r = f2r * Two - f4r;
2051  f6i = f2i * Two - f4i;
2052 
2053  f5r = f3r - f7r * w3i - f7i * w3r;
2054  f5i = f3i + f7r * w3r - f7i * w3i;
2055  f7r = f3r * Two - f5r;
2056  f7i = f3i * Two - f5i;
2057 
2058  *p2r = f4r;
2059  *(p2r + 1) = f4i;
2060  *(p2r + pos) = f6r;
2061  *(p2r + posi) = f6i;
2062 
2063  p2r += pnext;
2064 
2065  *p3r = f5r;
2066  *(p3r + 1) = f5i;
2067  *(p3r + pos) = f7r;
2068  *(p3r + posi) = f7i;
2069 
2070  p3r += pnext;
2071  }
2072 
2073  f2r = *p2r;
2074  f2i = *(p2r + 1);
2075  f3r = *p3r;
2076  f3i = *(p3r + 1);
2077 
2078  t0r = f0r + f1r * w0r - f1i * w0i;
2079  t0i = f0i + f1r * w0i + f1i * w0r;
2080  f1r = f0r * Two - t0r;
2081  f1i = f0i * Two - t0i;
2082 
2083  f4r = *(p0r + pos);
2084  f4i = *(p0r + posi);
2085  f5r = *(p1r + pos);
2086  f5i = *(p1r + posi);
2087 
2088  f6r = *(p2r + pos);
2089  f6i = *(p2r + posi);
2090  f7r = *(p3r + pos);
2091  f7i = *(p3r + posi);
2092 
2093  t1r = f2r - f3r * w0r + f3i * w0i;
2094  t1i = f2i - f3r * w0i - f3i * w0r;
2095  f2r = f2r * Two - t1r;
2096  f2i = f2i * Two - t1i;
2097 
2098  f0r = t0r + f2r * w1r - f2i * w1i;
2099  f0i = t0i + f2r * w1i + f2i * w1r;
2100  f2r = t0r * Two - f0r;
2101  f2i = t0i * Two - f0i;
2102 
2103  f3r = f1r + t1r * w1i + t1i * w1r;
2104  f3i = f1i - t1r * w1r + t1i * w1i;
2105  f1r = f1r * Two - f3r;
2106  f1i = f1i * Two - f3i;
2107 
2108  if ((int) DiffUCnt == NDiffU / 2)
2109  Uinc4 = -Uinc4;
2110 
2111  u0r += Uinc4;
2112  u0i -= Uinc4;
2113  u1r += Uinc2;
2114  u1i -= Uinc2;
2115  u2r += Uinc;
2116  u2i -= Uinc;
2117 
2118  pstrt += 2;
2119 
2120  t0r = f4r + f5r * w0r - f5i * w0i;
2121  t0i = f4i + f5r * w0i + f5i * w0r;
2122  f5r = f4r * Two - t0r;
2123  f5i = f4i * Two - t0i;
2124 
2125  t1r = f6r - f7r * w0r + f7i * w0i;
2126  t1i = f6i - f7r * w0i - f7i * w0r;
2127  f6r = f6r * Two - t1r;
2128  f6i = f6i * Two - t1i;
2129 
2130  f4r = t0r + f6r * w1r - f6i * w1i;
2131  f4i = t0i + f6r * w1i + f6i * w1r;
2132  f6r = t0r * Two - f4r;
2133  f6i = t0i * Two - f4i;
2134 
2135  f7r = f5r + t1r * w1i + t1i * w1r;
2136  f7i = f5i - t1r * w1r + t1i * w1i;
2137  f5r = f5r * Two - f7r;
2138  f5i = f5i * Two - f7i;
2139 
2140  w0r = *u0r;
2141  w0i = *u0i;
2142  w1r = *u1r;
2143  w1i = *u1i;
2144 
2145  if ((int) DiffUCnt <= NDiffU / 2)
2146  w0r = -w0r;
2147 
2148  t0r = f0r - f4r * w2r + f4i * w2i;
2149  t0i = f0i - f4r * w2i - f4i * w2r;
2150  f0r = f0r * Two - t0r;
2151  f0i = f0i * Two - t0i;
2152 
2153  f4r = f2r - f6r * w2i - f6i * w2r;
2154  f4i = f2i + f6r * w2r - f6i * w2i;
2155  f6r = f2r * Two - f4r;
2156  f6i = f2i * Two - f4i;
2157 
2158  *(p0r + pos) = t0r;
2159  *p2r = f4r;
2160  *(p0r + posi) = t0i;
2161  *(p2r + 1) = f4i;
2162  w2r = *u2r;
2163  w2i = *u2i;
2164  *p0r = f0r;
2165  *(p2r + pos) = f6r;
2166  *(p0r + 1) = f0i;
2167  *(p2r + posi) = f6i;
2168 
2169  p0r = pstrt;
2170  p2r = pstrt + pinc + pinc;
2171 
2172  t1r = f1r - f5r * w3r + f5i * w3i;
2173  t1i = f1i - f5r * w3i - f5i * w3r;
2174  f1r = f1r * Two - t1r;
2175  f1i = f1i * Two - t1i;
2176 
2177  f5r = f3r - f7r * w3i - f7i * w3r;
2178  f5i = f3i + f7r * w3r - f7i * w3i;
2179  f7r = f3r * Two - f5r;
2180  f7i = f3i * Two - f5i;
2181 
2182  *(p1r + pos) = t1r;
2183  *p3r = f5r;
2184  *(p1r + posi) = t1i;
2185  *(p3r + 1) = f5i;
2186  w3r = *(u2r + U2toU3);
2187  w3i = *(u2i - U2toU3);
2188  *p1r = f1r;
2189  *(p3r + pos) = f7r;
2190  *(p1r + 1) = f1i;
2191  *(p3r + posi) = f7i;
2192 
2193  p1r = pstrt + pinc;
2194  p3r = p2r + pinc;
2195  }
2196  NSameU /= 8;
2197  Uinc /= 8;
2198  Uinc2 /= 8;
2199  Uinc4 = Uinc * 4;
2200  NDiffU *= 8;
2201  pinc *= 8;
2202  pnext *= 8;
2203  pos *= 8;
2204  posi = pos + 1;
2205  }
2206 }
2207 
2208 //------------------------------------------------------------------------------
2209 // recursive bfstages calls to maximize on chip cache efficiency
2210 //------------------------------------------------------------------------------
2211 template <typename FFT_TYPE>
2212 void g_fft<FFT_TYPE>::ifftrecurs(FFT_TYPE *ioptr, int M, FFT_TYPE *Utbl, int Ustride,
2213  int NDiffU, int StageCnt)
2214 {
2215  int i1;
2216 
2217  if (M <= (int) MCACHE)
2218  ibfstages(ioptr, M, Utbl, Ustride, NDiffU, StageCnt); // RADIX 8 Stages
2219  else {
2220  for (i1 = 0; i1 < 8; i1++) {
2221  ifftrecurs(&ioptr[i1 * POW2(M - 3) * 2], M - 3, Utbl, 8 * Ustride,
2222  NDiffU, StageCnt - 1); // RADIX 8 Stages
2223  }
2224  ibfstages(ioptr, M, Utbl, Ustride, POW2(M - 3), 1); // RADIX 8 Stage
2225  }
2226 }
2227 
2228 //------------------------------------------------------------------------------
2229 // Compute in-place inverse complex fft on the rows of the input array
2230 // INPUTS
2231 // *ioptr = input data array
2232 // M = log2 of fft size
2233 // *Utbl = cosine table
2234 // *BRLow = bit reversed counter table
2235 // OUTPUTS
2236 // *ioptr = output data array
2237 //------------------------------------------------------------------------------
2238 template <typename FFT_TYPE>
2239 void g_fft<FFT_TYPE>::iffts1(FFT_TYPE *ioptr, int M, FFT_TYPE *Utbl, short *BRLow)
2240 {
2241  int StageCnt;
2242  int NDiffU;
2243  const FFT_TYPE scale = 1.0 / POW2(M);
2244 
2245  switch (M) {
2246  case 0:
2247  break;
2248  case 1:
2249  ifft2pt(ioptr, scale); // a 2 pt fft
2250  break;
2251  case 2:
2252  ifft4pt(ioptr, scale); // a 4 pt fft
2253  break;
2254  case 3:
2255  ifft8pt(ioptr, scale); // an 8 pt fft
2256  break;
2257  default:
2258 // bit reverse and first radix 2 stage
2259  scbitrevR2(ioptr, M, BRLow, scale);
2260  StageCnt = (M - 1) / 3; // number of radix 8 stages
2261  NDiffU = 2; // one radix 2 stage already complete
2262  if ((M - 1 - (StageCnt * 3)) == 1) {
2263  ibfR2(ioptr, M, NDiffU); // 1 radix 2 stage
2264  NDiffU *= 2;
2265  }
2266  if ((M - 1 - (StageCnt * 3)) == 2) {
2267  ibfR4(ioptr, M, NDiffU); // 1 radix 4 stage
2268  NDiffU *= 4;
2269  }
2270  if (M <= (int) MCACHE)
2271  ibfstages(ioptr, M, Utbl, 1, NDiffU, StageCnt); // RADIX 8 Stages
2272  else
2273  ifftrecurs(ioptr, M, Utbl, 1, NDiffU, StageCnt); // RADIX 8 Stages
2274  }
2275 }
2276 
2277 //------------------------------------------------------------------------------
2278 // parts of rffts1
2279 // RADIX 2 rfft
2280 //------------------------------------------------------------------------------
2281 template <typename FFT_TYPE>
2282 void g_fft<FFT_TYPE>::rfft1pt(FFT_TYPE *ioptr)
2283 {
2284  FFT_TYPE f0r, f0i;
2285  FFT_TYPE t0r, t0i;
2286 
2287 // bit reversed load
2288  f0r = ioptr[0];
2289  f0i = ioptr[1];
2290 
2291 // finish rfft
2292  t0r = f0r + f0i;
2293  t0i = f0r - f0i;
2294 
2295 // store result
2296  ioptr[0] = t0r;
2297  ioptr[1] = t0i;
2298 }
2299 
2300 //------------------------------------------------------------------------------
2301 // RADIX 4 rfft
2302 //------------------------------------------------------------------------------
2303 template <typename FFT_TYPE>
2304 void g_fft<FFT_TYPE>::rfft2pt(FFT_TYPE *ioptr)
2305 {
2306  FFT_TYPE f0r, f0i, f1r, f1i;
2307  FFT_TYPE t0r, t0i;
2308 
2309 // bit reversed load
2310  f0r = ioptr[0];
2311  f0i = ioptr[1];
2312  f1r = ioptr[2];
2313  f1i = ioptr[3];
2314 
2315 // Butterflys
2316 // f0 - - t0
2317 // f1 - 1 - f1
2318 
2319  t0r = f0r + f1r;
2320  t0i = f0i + f1i;
2321  f1r = f0r - f1r;
2322  f1i = f1i - f0i;
2323 // finish rfft
2324  f0r = t0r + t0i;
2325  f0i = t0r - t0i;
2326 
2327 // store result
2328  ioptr[0] = f0r;
2329  ioptr[1] = f0i;
2330  ioptr[2] = f1r;
2331  ioptr[3] = f1i;
2332 }
2333 
2334 //------------------------------------------------------------------------------
2335 // RADIX 8 rfft
2336 //------------------------------------------------------------------------------
2337 template <typename FFT_TYPE>
2338 void g_fft<FFT_TYPE>::rfft4pt(FFT_TYPE *ioptr)
2339 {
2340  FFT_TYPE f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
2341  FFT_TYPE t0r, t0i, t1r, t1i;
2342  FFT_TYPE w0r = 1.0 / FFT_ROOT2; // cos(pi/4)
2343  const FFT_TYPE Two = 2.0;
2344  const FFT_TYPE scale = 0.5;
2345 
2346 // bit reversed load
2347  f0r = ioptr[0];
2348  f0i = ioptr[1];
2349  f1r = ioptr[4];
2350  f1i = ioptr[5];
2351  f2r = ioptr[2];
2352  f2i = ioptr[3];
2353  f3r = ioptr[6];
2354  f3i = ioptr[7];
2355 
2356 // Butterflys
2357 // f0 - - t0 - - f0
2358 // f1 - 1 - f1 - - f1
2359 // f2 - - f2 - 1 - f2
2360 // f3 - 1 - t1 - -i - f3
2361 
2362  t0r = f0r + f1r;
2363  t0i = f0i + f1i;
2364  f1r = f0r - f1r;
2365  f1i = f0i - f1i;
2366 
2367  t1r = f2r - f3r;
2368  t1i = f2i - f3i;
2369  f2r = f2r + f3r;
2370  f2i = f2i + f3i;
2371 
2372  f0r = t0r + f2r;
2373  f0i = t0i + f2i;
2374  f2r = t0r - f2r;
2375  f2i = f2i - t0i; // neg for rfft
2376 
2377  f3r = f1r - t1i;
2378  f3i = f1i + t1r;
2379  f1r = f1r + t1i;
2380  f1i = f1i - t1r;
2381 
2382 // finish rfft
2383  t0r = f0r + f0i; // compute Re(x[0])
2384  t0i = f0r - f0i; // compute Re(x[N/2])
2385 
2386  t1r = f1r + f3r;
2387  t1i = f1i - f3i;
2388  f0r = f1i + f3i;
2389  f0i = f3r - f1r;
2390 
2391  f1r = t1r + w0r * f0r + w0r * f0i;
2392  f1i = t1i - w0r * f0r + w0r * f0i;
2393  f3r = Two * t1r - f1r;
2394  f3i = f1i - Two * t1i;
2395 
2396 // store result
2397  ioptr[4] = f2r;
2398  ioptr[5] = f2i;
2399  ioptr[0] = t0r;
2400  ioptr[1] = t0i;
2401 
2402  ioptr[2] = scale * f1r;
2403  ioptr[3] = scale * f1i;
2404  ioptr[6] = scale * f3r;
2405  ioptr[7] = scale * f3i;
2406 }
2407 
2408 //------------------------------------------------------------------------------
2409 // RADIX 16 rfft
2410 //------------------------------------------------------------------------------
2411 template <typename FFT_TYPE>
2412 void g_fft<FFT_TYPE>::rfft8pt(FFT_TYPE *ioptr)
2413 {
2414  FFT_TYPE w0r = 1.0 / FFT_ROOT2; // cos(pi/4)
2415  FFT_TYPE w1r = FFT_COSPID8; // cos(pi/8)
2416  FFT_TYPE w1i = FFT_SINPID8; // sin(pi/8)
2417  FFT_TYPE f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
2418  FFT_TYPE f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
2419  FFT_TYPE t0r, t0i, t1r, t1i;
2420  const FFT_TYPE Two = 2.0;
2421  const FFT_TYPE scale = 0.5;
2422 
2423 // bit reversed load
2424  f0r = ioptr[0];
2425  f0i = ioptr[1];
2426  f1r = ioptr[8];
2427  f1i = ioptr[9];
2428  f2r = ioptr[4];
2429  f2i = ioptr[5];
2430  f3r = ioptr[12];
2431  f3i = ioptr[13];
2432  f4r = ioptr[2];
2433  f4i = ioptr[3];
2434  f5r = ioptr[10];
2435  f5i = ioptr[11];
2436  f6r = ioptr[6];
2437  f6i = ioptr[7];
2438  f7r = ioptr[14];
2439  f7i = ioptr[15];
2440 
2441 // Butterflys
2442 // f0 - - t0 - - f0 - - f0
2443 // f1 - 1 - f1 - - f1 - - f1
2444 // f2 - - f2 - 1 - f2 - - f2
2445 // f3 - 1 - t1 - -i - f3 - - f3
2446 // f4 - - t0 - - f4 - 1 - t0
2447 // f5 - 1 - f5 - - f5 - w3 - f4
2448 // f6 - - f6 - 1 - f6 - -i - t1
2449 // f7 - 1 - t1 - -i - f7 - iw3- f6
2450 
2451  t0r = f0r + f1r;
2452  t0i = f0i + f1i;
2453  f1r = f0r - f1r;
2454  f1i = f0i - f1i;
2455 
2456  t1r = f2r - f3r;
2457  t1i = f2i - f3i;
2458  f2r = f2r + f3r;
2459  f2i = f2i + f3i;
2460 
2461  f0r = t0r + f2r;
2462  f0i = t0i + f2i;
2463  f2r = t0r - f2r;
2464  f2i = t0i - f2i;
2465 
2466  f3r = f1r - t1i;
2467  f3i = f1i + t1r;
2468  f1r = f1r + t1i;
2469  f1i = f1i - t1r;
2470 
2471  t0r = f4r + f5r;
2472  t0i = f4i + f5i;
2473  f5r = f4r - f5r;
2474  f5i = f4i - f5i;
2475 
2476  t1r = f6r - f7r;
2477  t1i = f6i - f7i;
2478  f6r = f6r + f7r;
2479  f6i = f6i + f7i;
2480 
2481  f4r = t0r + f6r;
2482  f4i = t0i + f6i;
2483  f6r = t0r - f6r;
2484  f6i = t0i - f6i;
2485 
2486  f7r = f5r - t1i;
2487  f7i = f5i + t1r;
2488  f5r = f5r + t1i;
2489  f5i = f5i - t1r;
2490 
2491  t0r = f0r - f4r;
2492  t0i = f4i - f0i; // neg for rfft
2493  f0r = f0r + f4r;
2494  f0i = f0i + f4i;
2495 
2496  t1r = f2r - f6i;
2497  t1i = f2i + f6r;
2498  f2r = f2r + f6i;
2499  f2i = f2i - f6r;
2500 
2501  f4r = f1r - f5r * w0r - f5i * w0r;
2502  f4i = f1i + f5r * w0r - f5i * w0r;
2503  f1r = f1r * Two - f4r;
2504  f1i = f1i * Two - f4i;
2505 
2506  f6r = f3r + f7r * w0r - f7i * w0r;
2507  f6i = f3i + f7r * w0r + f7i * w0r;
2508  f3r = f3r * Two - f6r;
2509  f3i = f3i * Two - f6i;
2510 
2511 // finish rfft
2512  f5r = f0r + f0i; // compute Re(x[0])
2513  f5i = f0r - f0i; // compute Re(x[N/2])
2514 
2515  f0r = f2r + t1r;
2516  f0i = f2i - t1i;
2517  f7r = f2i + t1i;
2518  f7i = t1r - f2r;
2519 
2520  f2r = f0r + w0r * f7r + w0r * f7i;
2521  f2i = f0i - w0r * f7r + w0r * f7i;
2522  t1r = Two * f0r - f2r;
2523  t1i = f2i - Two * f0i;
2524 
2525  f0r = f1r + f6r;
2526  f0i = f1i - f6i;
2527  f7r = f1i + f6i;
2528  f7i = f6r - f1r;
2529 
2530  f1r = f0r + w1r * f7r + w1i * f7i;
2531  f1i = f0i - w1i * f7r + w1r * f7i;
2532  f6r = Two * f0r - f1r;
2533  f6i = f1i - Two * f0i;
2534 
2535  f0r = f3r + f4r;
2536  f0i = f3i - f4i;
2537  f7r = f3i + f4i;
2538  f7i = f4r - f3r;
2539 
2540  f3r = f0r + w1i * f7r + w1r * f7i;
2541  f3i = f0i - w1r * f7r + w1i * f7i;
2542  f4r = Two * f0r - f3r;
2543  f4i = f3i - Two * f0i;
2544 
2545 // store result
2546  ioptr[8] = t0r;
2547  ioptr[9] = t0i;
2548  ioptr[0] = f5r;
2549  ioptr[1] = f5i;
2550 
2551  ioptr[4] = scale * f2r;
2552  ioptr[5] = scale * f2i;
2553  ioptr[12] = scale * t1r;
2554  ioptr[13] = scale * t1i;
2555 
2556  ioptr[2] = scale * f1r;
2557  ioptr[3] = scale * f1i;
2558  ioptr[6] = scale * f3r;
2559  ioptr[7] = scale * f3i;
2560  ioptr[10] = scale * f4r;
2561  ioptr[11] = scale * f4i;
2562  ioptr[14] = scale * f6r;
2563  ioptr[15] = scale * f6i;
2564 }
2565 
2566 //------------------------------------------------------------------------------
2567 // Finish RFFT
2568 //------------------------------------------------------------------------------
2569 template <typename FFT_TYPE>
2570 void g_fft<FFT_TYPE>::frstage(FFT_TYPE *ioptr, int M, FFT_TYPE *inUtbl)
2571 {
2572  unsigned int pos;
2573  unsigned int posi;
2574  unsigned int diffUcnt;
2575 
2576  FFT_TYPE *p0r, *p1r;
2577  FFT_TYPE *u0r, *u0i;
2578 
2579  FFT_TYPE w0r, w0i;
2580  FFT_TYPE f0r, f0i, f1r, f1i, f4r, f4i, f5r, f5i;
2581  FFT_TYPE t0r, t0i, t1r, t1i;
2582  const FFT_TYPE Two = 2.0;
2583 
2584  pos = POW2(M - 1);
2585  posi = pos + 1;
2586 
2587  p0r = ioptr;
2588  p1r = ioptr + pos / 2;
2589 
2590  u0r = inUtbl + POW2(M - 3);
2591 
2592  w0r = *u0r, f0r = *(p0r);
2593  f0i = *(p0r + 1);
2594  f4r = *(p0r + pos);
2595  f4i = *(p0r + posi);
2596  f1r = *(p1r);
2597  f1i = *(p1r + 1);
2598  f5r = *(p1r + pos);
2599  f5i = *(p1r + posi);
2600 
2601  t0r = Two * f0r + Two * f0i; // compute Re(x[0])
2602  t0i = Two * f0r - Two * f0i; // compute Re(x[N/2])
2603  t1r = f4r + f4r;
2604  t1i = -f4i - f4i;
2605 
2606  f0r = f1r + f5r;
2607  f0i = f1i - f5i;
2608  f4r = f1i + f5i;
2609  f4i = f5r - f1r;
2610 
2611  f1r = f0r + w0r * f4r + w0r * f4i;
2612  f1i = f0i - w0r * f4r + w0r * f4i;
2613  f5r = Two * f0r - f1r;
2614  f5i = f1i - Two * f0i;
2615 
2616  *(p0r) = t0r;
2617  *(p0r + 1) = t0i;
2618  *(p0r + pos) = t1r;
2619  *(p0r + posi) = t1i;
2620  *(p1r) = f1r;
2621  *(p1r + 1) = f1i;
2622  *(p1r + pos) = f5r;
2623  *(p1r + posi) = f5i;
2624 
2625  u0r = inUtbl + 1;
2626  u0i = inUtbl + (POW2(M - 2) - 1);
2627 
2628  w0r = *u0r, w0i = *u0i;
2629 
2630  p0r = (ioptr + 2);
2631  p1r = (ioptr + (POW2(M - 2) - 1) * 2);
2632 
2633 // Butterflys
2634 // f0 - t0 - - f0
2635 // f5 - t1 - w0 - f5
2636 // f1 - t0 - - f1
2637 // f4 - t1 -iw0 - f4
2638 
2639  for (diffUcnt = POW2(M - 3) - 1; diffUcnt > 0; diffUcnt--) {
2640 
2641  f0r = *(p0r);
2642  f0i = *(p0r + 1);
2643  f5r = *(p1r + pos);
2644  f5i = *(p1r + posi);
2645  f1r = *(p1r);
2646  f1i = *(p1r + 1);
2647  f4r = *(p0r + pos);
2648  f4i = *(p0r + posi);
2649 
2650  t0r = f0r + f5r;
2651  t0i = f0i - f5i;
2652  t1r = f0i + f5i;
2653  t1i = f5r - f0r;
2654 
2655  f0r = t0r + w0r * t1r + w0i * t1i;
2656  f0i = t0i - w0i * t1r + w0r * t1i;
2657  f5r = Two * t0r - f0r;
2658  f5i = f0i - Two * t0i;
2659 
2660  t0r = f1r + f4r;
2661  t0i = f1i - f4i;
2662  t1r = f1i + f4i;
2663  t1i = f4r - f1r;
2664 
2665  f1r = t0r + w0i * t1r + w0r * t1i;
2666  f1i = t0i - w0r * t1r + w0i * t1i;
2667  f4r = Two * t0r - f1r;
2668  f4i = f1i - Two * t0i;
2669 
2670  *(p0r) = f0r;
2671  *(p0r + 1) = f0i;
2672  *(p1r + pos) = f5r;
2673  *(p1r + posi) = f5i;
2674 
2675  w0r = *++u0r;
2676  w0i = *--u0i;
2677 
2678  *(p1r) = f1r;
2679  *(p1r + 1) = f1i;
2680  *(p0r + pos) = f4r;
2681  *(p0r + posi) = f4i;
2682 
2683  p0r += 2;
2684  p1r -= 2;
2685  }
2686 }
2687 
2688 //------------------------------------------------------------------------------
2689 // Compute in-place real fft on the rows of the input array
2690 // The result is the complex spectra of the positive frequencies
2691 // except the location for the first complex number contains the real
2692 // values for DC and Nyquest
2693 // INPUTS
2694 // *ioptr = real input data array
2695 // M = log2 of fft size
2696 // *Utbl = cosine table
2697 // *BRLow = bit reversed counter table
2698 // OUTPUTS
2699 // *ioptr = output data array in the following order
2700 // Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]), Re(x[2]), Im(x[2]),
2701 // ... Re(x[N/2-1]), Im(x[N/2-1]).
2702 //------------------------------------------------------------------------------
2703 template <typename FFT_TYPE>
2704 void g_fft<FFT_TYPE>::rffts1(FFT_TYPE *ioptr, int M, FFT_TYPE *Utbl, short *BRLow)
2705 {
2706  FFT_TYPE scale;
2707  int StageCnt;
2708  int NDiffU;
2709 
2710  M = M - 1;
2711  switch (M) {
2712  case -1:
2713  break;
2714  case 0:
2715  rfft1pt(ioptr); // a 2 pt fft
2716  break;
2717  case 1:
2718  rfft2pt(ioptr); // a 4 pt fft
2719  break;
2720  case 2:
2721  rfft4pt(ioptr); // an 8 pt fft
2722  break;
2723  case 3:
2724  rfft8pt(ioptr); // a 16 pt fft
2725  break;
2726  default:
2727  scale = 0.5;
2728 // bit reverse and first radix 2 stage
2729  scbitrevR2(ioptr, M, BRLow, scale);
2730  StageCnt = (M - 1) / 3; // number of radix 8 stages
2731  NDiffU = 2; // one radix 2 stage already complete
2732  if ((M - 1 - (StageCnt * 3)) == 1) {
2733  bfR2(ioptr, M, NDiffU); // 1 radix 2 stage
2734  NDiffU *= 2;
2735  }
2736  if ((M - 1 - (StageCnt * 3)) == 2) {
2737  bfR4(ioptr, M, NDiffU); // 1 radix 4 stage
2738  NDiffU *= 4;
2739  }
2740  if (M <= (int) MCACHE)
2741  bfstages(ioptr, M, Utbl, 2, NDiffU, StageCnt); // RADIX 8 Stages
2742  else
2743  fftrecurs(ioptr, M, Utbl, 2, NDiffU, StageCnt); // RADIX 8 Stages
2744  frstage(ioptr, M + 1, Utbl);
2745  }
2746 }
2747 
2748 //------------------------------------------------------------------------------
2749 // parts of riffts1
2750 //------------------------------------------------------------------------------
2751 
2752 //------------------------------------------------------------------------------
2753 // RADIX 2 rifft
2754 //------------------------------------------------------------------------------
2755 template <typename FFT_TYPE>
2756 void g_fft<FFT_TYPE>::rifft1pt(FFT_TYPE *ioptr, FFT_TYPE scale)
2757 {
2758  FFT_TYPE f0r, f0i;
2759  FFT_TYPE t0r, t0i;
2760 
2761 // bit reversed load
2762  f0r = ioptr[0];
2763  f0i = ioptr[1];
2764 
2765 // finish rfft
2766  t0r = f0r + f0i;
2767  t0i = f0r - f0i;
2768 
2769 // store result
2770  ioptr[0] = scale * t0r;
2771  ioptr[1] = scale * t0i;
2772 }
2773 
2774 //------------------------------------------------------------------------------
2775 // RADIX 4 rifft
2776 //------------------------------------------------------------------------------
2777 template <typename FFT_TYPE>
2778 void g_fft<FFT_TYPE>::rifft2pt(FFT_TYPE *ioptr, FFT_TYPE scale)
2779 {
2780  FFT_TYPE f0r, f0i, f1r, f1i;
2781  FFT_TYPE t0r, t0i;
2782  const FFT_TYPE Two = FFT_TYPE(2.0);
2783 
2784 // bit reversed load
2785  t0r = ioptr[0];
2786  t0i = ioptr[1];
2787  f1r = Two * ioptr[2];
2788  f1i = Two * ioptr[3];
2789 
2790 // start rifft
2791  f0r = t0r + t0i;
2792  f0i = t0r - t0i;
2793 
2794 // Butterflys
2795 // f0 - - t0
2796 // f1 - 1 - f1
2797 
2798  t0r = f0r + f1r;
2799  t0i = f0i - f1i;
2800  f1r = f0r - f1r;
2801  f1i = f0i + f1i;
2802 
2803 // store result
2804  ioptr[0] = scale * t0r;
2805  ioptr[1] = scale * t0i;
2806  ioptr[2] = scale * f1r;
2807  ioptr[3] = scale * f1i;
2808 }
2809 
2810 //------------------------------------------------------------------------------
2811 // RADIX 8 rifft
2812 //------------------------------------------------------------------------------
2813 template <typename FFT_TYPE>
2814 void g_fft<FFT_TYPE>::rifft4pt(FFT_TYPE *ioptr, FFT_TYPE scale)
2815 {
2816  FFT_TYPE f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
2817  FFT_TYPE t0r, t0i, t1r, t1i;
2818  FFT_TYPE w0r = 1.0 / FFT_ROOT2; // cos(pi/4)
2819  const FFT_TYPE Two = FFT_TYPE(2.0);
2820 
2821 // bit reversed load
2822  t0r = ioptr[0];
2823  t0i = ioptr[1];
2824  f2r = ioptr[2];
2825  f2i = ioptr[3];
2826  f1r = Two * ioptr[4];
2827  f1i = Two * ioptr[5];
2828  f3r = ioptr[6];
2829  f3i = ioptr[7];
2830 
2831 // start rfft
2832  f0r = t0r + t0i; // compute Re(x[0])
2833  f0i = t0r - t0i; // compute Re(x[N/2])
2834 
2835  t1r = f2r + f3r;
2836  t1i = f2i - f3i;
2837  t0r = f2r - f3r;
2838  t0i = f2i + f3i;
2839 
2840  f2r = t1r - w0r * t0r - w0r * t0i;
2841  f2i = t1i + w0r * t0r - w0r * t0i;
2842  f3r = Two * t1r - f2r;
2843  f3i = f2i - Two * t1i;
2844 
2845 // Butterflys
2846 // f0 - - t0 - - f0
2847 // f1 - 1 - f1 - - f1
2848 // f2 - - f2 - 1 - f2
2849 // f3 - 1 - t1 - i - f3
2850 
2851  t0r = f0r + f1r;
2852  t0i = f0i - f1i;
2853  f1r = f0r - f1r;
2854  f1i = f0i + f1i;
2855 
2856  t1r = f2r - f3r;
2857  t1i = f2i - f3i;
2858  f2r = f2r + f3r;
2859  f2i = f2i + f3i;
2860 
2861  f0r = t0r + f2r;
2862  f0i = t0i + f2i;
2863  f2r = t0r - f2r;
2864  f2i = t0i - f2i;
2865 
2866  f3r = f1r + t1i;
2867  f3i = f1i - t1r;
2868  f1r = f1r - t1i;
2869  f1i = f1i + t1r;
2870 
2871 // store result
2872  ioptr[0] = scale * f0r;
2873  ioptr[1] = scale * f0i;
2874  ioptr[2] = scale * f1r;
2875  ioptr[3] = scale * f1i;
2876  ioptr[4] = scale * f2r;
2877  ioptr[5] = scale * f2i;
2878  ioptr[6] = scale * f3r;
2879  ioptr[7] = scale * f3i;
2880 }
2881 
2882 //------------------------------------------------------------------------------
2883 // RADIX 16 rifft
2884 //------------------------------------------------------------------------------
2885 template <typename FFT_TYPE>
2886 void g_fft<FFT_TYPE>::rifft8pt(FFT_TYPE *ioptr, FFT_TYPE scale)
2887 {
2888  FFT_TYPE w0r = (FFT_TYPE) (1.0 / FFT_ROOT2); // cos(pi/4)
2889  FFT_TYPE w1r = FFT_COSPID8; // cos(pi/8)
2890  FFT_TYPE w1i = FFT_SINPID8; // sin(pi/8)
2891  FFT_TYPE f0r, f0i, f1r, f1i, f2r, f2i, f3r, f3i;
2892  FFT_TYPE f4r, f4i, f5r, f5i, f6r, f6i, f7r, f7i;
2893  FFT_TYPE t0r, t0i, t1r, t1i;
2894  const FFT_TYPE Two = FFT_TYPE(2.0);
2895 
2896 // bit reversed load
2897  t0r = ioptr[0];
2898  t0i = ioptr[1];
2899  f4r = ioptr[2];
2900  f4i = ioptr[3];
2901  f2r = ioptr[4];
2902  f2i = ioptr[5];
2903  f6r = ioptr[6];
2904  f6i = ioptr[7];
2905  f1r = Two * ioptr[8];
2906  f1i = Two * ioptr[9];
2907  f5r = ioptr[10];
2908  f5i = ioptr[11];
2909  f3r = ioptr[12];
2910  f3i = ioptr[13];
2911  f7r = ioptr[14];
2912  f7i = ioptr[15];
2913 
2914 // start rfft
2915  f0r = t0r + t0i; // compute Re(x[0])
2916  f0i = t0r - t0i; // compute Re(x[N/2])
2917 
2918  t0r = f2r + f3r;
2919  t0i = f2i - f3i;
2920  t1r = f2r - f3r;
2921  t1i = f2i + f3i;
2922 
2923  f2r = t0r - w0r * t1r - w0r * t1i;
2924  f2i = t0i + w0r * t1r - w0r * t1i;
2925  f3r = Two * t0r - f2r;
2926  f3i = f2i - Two * t0i;
2927 
2928  t0r = f4r + f7r;
2929  t0i = f4i - f7i;
2930  t1r = f4r - f7r;
2931  t1i = f4i + f7i;
2932 
2933  f4r = t0r - w1i * t1r - w1r * t1i;
2934  f4i = t0i + w1r * t1r - w1i * t1i;
2935  f7r = Two * t0r - f4r;
2936  f7i = f4i - Two * t0i;
2937 
2938  t0r = f6r + f5r;
2939  t0i = f6i - f5i;
2940  t1r = f6r - f5r;
2941  t1i = f6i + f5i;
2942 
2943  f6r = t0r - w1r * t1r - w1i * t1i;
2944  f6i = t0i + w1i * t1r - w1r * t1i;
2945  f5r = Two * t0r - f6r;
2946  f5i = f6i - Two * t0i;
2947 
2948 // Butterflys
2949 // f0 - - t0 - - f0 - - f0
2950 // f1* - 1 - f1 - - f1 - - f1
2951 // f2 - - f2 - 1 - f2 - - f2
2952 // f3 - 1 - t1 - i - f3 - - f3
2953 // f4 - - t0 - - f4 - 1 - t0
2954 // f5 - 1 - f5 - - f5 - w3 - f4
2955 // f6 - - f6 - 1 - f6 - i - t1
2956 // f7 - 1 - t1 - i - f7 - iw3- f6
2957 
2958  t0r = f0r + f1r;
2959  t0i = f0i - f1i;
2960  f1r = f0r - f1r;
2961  f1i = f0i + f1i;
2962 
2963  t1r = f2r - f3r;
2964  t1i = f2i - f3i;
2965  f2r = f2r + f3r;
2966  f2i = f2i + f3i;
2967 
2968  f0r = t0r + f2r;
2969  f0i = t0i + f2i;
2970  f2r = t0r - f2r;
2971  f2i = t0i - f2i;
2972 
2973  f3r = f1r + t1i;
2974  f3i = f1i - t1r;
2975  f1r = f1r - t1i;
2976  f1i = f1i + t1r;
2977 
2978  t0r = f4r + f5r;
2979  t0i = f4i + f5i;
2980  f5r = f4r - f5r;
2981  f5i = f4i - f5i;
2982 
2983  t1r = f6r - f7r;
2984  t1i = f6i - f7i;
2985  f6r = f6r + f7r;
2986  f6i = f6i + f7i;
2987 
2988  f4r = t0r + f6r;
2989  f4i = t0i + f6i;
2990  f6r = t0r - f6r;
2991  f6i = t0i - f6i;
2992 
2993  f7r = f5r + t1i;
2994  f7i = f5i - t1r;
2995  f5r = f5r - t1i;
2996  f5i = f5i + t1r;
2997 
2998  t0r = f0r - f4r;
2999  t0i = f0i - f4i;
3000  f0r = f0r + f4r;
3001  f0i = f0i + f4i;
3002 
3003  t1r = f2r + f6i;
3004  t1i = f2i - f6r;
3005  f2r = f2r - f6i;
3006  f2i = f2i + f6r;
3007 
3008  f4r = f1r - f5r * w0r + f5i * w0r;
3009  f4i = f1i - f5r * w0r - f5i * w0r;
3010  f1r = f1r * Two - f4r;
3011  f1i = f1i * Two - f4i;
3012 
3013  f6r = f3r + f7r * w0r + f7i * w0r;
3014  f6i = f3i - f7r * w0r + f7i * w0r;
3015  f3r = f3r * Two - f6r;
3016  f3i = f3i * Two - f6i;
3017 
3018 // store result
3019  ioptr[0] = scale * f0r;
3020  ioptr[1] = scale * f0i;
3021  ioptr[2] = scale * f1r;
3022  ioptr[3] = scale * f1i;
3023  ioptr[4] = scale * f2r;
3024  ioptr[5] = scale * f2i;
3025  ioptr[6] = scale * f3r;
3026  ioptr[7] = scale * f3i;
3027  ioptr[8] = scale * t0r;
3028  ioptr[9] = scale * t0i;
3029  ioptr[10] = scale * f4r;
3030  ioptr[11] = scale * f4i;
3031  ioptr[12] = scale * t1r;
3032  ioptr[13] = scale * t1i;
3033  ioptr[14] = scale * f6r;
3034  ioptr[15] = scale * f6i;
3035 }
3036 
3037 //------------------------------------------------------------------------------
3038 // Start RIFFT
3039 //------------------------------------------------------------------------------
3040 template <typename FFT_TYPE>
3041 void g_fft<FFT_TYPE>::ifrstage(FFT_TYPE *ioptr, int M, FFT_TYPE *inUtbl)
3042 {
3043  unsigned int pos;
3044  unsigned int posi;
3045  unsigned int diffUcnt;
3046 
3047  FFT_TYPE *p0r, *p1r;
3048  FFT_TYPE *u0r, *u0i;
3049 
3050  FFT_TYPE w0r, w0i;
3051  FFT_TYPE f0r, f0i, f1r, f1i, f4r, f4i, f5r, f5i;
3052  FFT_TYPE t0r, t0i, t1r, t1i;
3053  const FFT_TYPE Two = FFT_TYPE(2.0);
3054 
3055  pos = POW2(M - 1);
3056  posi = pos + 1;
3057 
3058  p0r = ioptr;
3059  p1r = ioptr + pos / 2;
3060 
3061  u0r = inUtbl + POW2(M - 3);
3062 
3063  w0r = *u0r, f0r = *(p0r);
3064  f0i = *(p0r + 1);
3065  f4r = *(p0r + pos);
3066  f4i = *(p0r + posi);
3067  f1r = *(p1r);
3068  f1i = *(p1r + 1);
3069  f5r = *(p1r + pos);
3070  f5i = *(p1r + posi);
3071 
3072  t0r = f0r + f0i;
3073  t0i = f0r - f0i;
3074  t1r = f4r + f4r;
3075  t1i = -f4i - f4i;
3076 
3077  f0r = f1r + f5r;
3078  f0i = f1i - f5i;
3079  f4r = f1r - f5r;
3080  f4i = f1i + f5i;
3081 
3082  f1r = f0r - w0r * f4r - w0r * f4i;
3083  f1i = f0i + w0r * f4r - w0r * f4i;
3084  f5r = Two * f0r - f1r;
3085  f5i = f1i - Two * f0i;
3086 
3087  *(p0r) = t0r;
3088  *(p0r + 1) = t0i;
3089  *(p0r + pos) = t1r;
3090  *(p0r + posi) = t1i;
3091  *(p1r) = f1r;
3092  *(p1r + 1) = f1i;
3093  *(p1r + pos) = f5r;
3094  *(p1r + posi) = f5i;
3095 
3096  u0r = inUtbl + 1;
3097  u0i = inUtbl + (POW2(M - 2) - 1);
3098 
3099  w0r = *u0r, w0i = *u0i;
3100 
3101  p0r = (ioptr + 2);
3102  p1r = (ioptr + (POW2(M - 2) - 1) * 2);
3103 
3104 // Butterflys
3105 // f0 - t0 - f0
3106 // f1 - t1 -w0- f1
3107 // f2 - t0 - f2
3108 // f3 - t1 -iw0- f3
3109 
3110  for (diffUcnt = POW2(M - 3) - 1; diffUcnt > 0; diffUcnt--) {
3111 
3112  f0r = *(p0r);
3113  f0i = *(p0r + 1);
3114  f5r = *(p1r + pos);
3115  f5i = *(p1r + posi);
3116  f1r = *(p1r);
3117  f1i = *(p1r + 1);
3118  f4r = *(p0r + pos);
3119  f4i = *(p0r + posi);
3120 
3121  t0r = f0r + f5r;
3122  t0i = f0i - f5i;
3123  t1r = f0r - f5r;
3124  t1i = f0i + f5i;
3125 
3126  f0r = t0r - w0i * t1r - w0r * t1i;
3127  f0i = t0i + w0r * t1r - w0i * t1i;
3128  f5r = Two * t0r - f0r;
3129  f5i = f0i - Two * t0i;
3130 
3131  t0r = f1r + f4r;
3132  t0i = f1i - f4i;
3133  t1r = f1r - f4r;
3134  t1i = f1i + f4i;
3135 
3136  f1r = t0r - w0r * t1r - w0i * t1i;
3137  f1i = t0i + w0i * t1r - w0r * t1i;
3138  f4r = Two * t0r - f1r;
3139  f4i = f1i - Two * t0i;
3140 
3141  *(p0r) = f0r;
3142  *(p0r + 1) = f0i;
3143  *(p1r + pos) = f5r;
3144  *(p1r + posi) = f5i;
3145 
3146  w0r = *++u0r;
3147  w0i = *--u0i;
3148 
3149  *(p1r) = f1r;
3150  *(p1r + 1) = f1i;
3151  *(p0r + pos) = f4r;
3152  *(p0r + posi) = f4i;
3153 
3154  p0r += 2;
3155  p1r -= 2;
3156  }
3157 }
3158 
3159 //------------------------------------------------------------------------------
3160 // Compute in-place real ifft on the rows of the input array
3161 // data order as from rffts1
3162 // INPUTS
3163 // *ioptr = input data array in the following order
3164 // M = log2 of fft size
3165 // Re(x[0]), Re(x[N/2]), Re(x[1]), Im(x[1]),
3166 // Re(x[2]), Im(x[2]), ... Re(x[N/2-1]), Im(x[N/2-1]).
3167 // *Utbl = cosine table
3168 // *BRLow = bit reversed counter table
3169 // OUTPUTS
3170 // *ioptr = real output data array
3171 //------------------------------------------------------------------------------
3172 template <typename FFT_TYPE>
3173 void g_fft<FFT_TYPE>::riffts1(FFT_TYPE *ioptr, int M, FFT_TYPE *Utbl, short *BRLow)
3174 {
3175  FFT_TYPE scale;
3176  int StageCnt;
3177  int NDiffU;
3178 
3179  scale = (FFT_TYPE)(1.0 / (float)((int)POW2(M)));
3180  M = M - 1;
3181  switch (M) {
3182  case -1:
3183  break;
3184  case 0:
3185  rifft1pt(ioptr, scale); // a 2 pt fft
3186  break;
3187  case 1:
3188  rifft2pt(ioptr, scale); // a 4 pt fft
3189  break;
3190  case 2:
3191  rifft4pt(ioptr, scale); // an 8 pt fft
3192  break;
3193  case 3:
3194  rifft8pt(ioptr, scale); // a 16 pt fft
3195  break;
3196  default:
3197  ifrstage(ioptr, M + 1, Utbl);
3198 // bit reverse and first radix 2 stage
3199  scbitrevR2(ioptr, M, BRLow, scale);
3200  StageCnt = (M - 1) / 3; // number of radix 8 stages
3201  NDiffU = 2; // one radix 2 stage already complete
3202  if ((M - 1 - (StageCnt * 3)) == 1) {
3203  ibfR2(ioptr, M, NDiffU); // 1 radix 2 stage
3204  NDiffU *= 2;
3205  }
3206  if ((M - 1 - (StageCnt * 3)) == 2) {
3207  ibfR4(ioptr, M, NDiffU); // 1 radix 4 stage
3208  NDiffU *= 4;
3209  }
3210  if (M <= (int) MCACHE)
3211  ibfstages(ioptr, M, Utbl, 2, NDiffU, StageCnt); // RADIX 8 Stages
3212  else
3213  ifftrecurs(ioptr, M, Utbl, 2, NDiffU, StageCnt); // RADIX 8 Stages
3214  }
3215 }
3216 
3217 //==============================================================================
3218 // End of original C functions
3219 //
3220 // Wrapper methods for simple class access
3221 //==============================================================================
3222 
3223 //------------------------------------------------------------------------------
3224 // malloc and init cosine and bit reversed tables for a given size
3225 // fft, ifft, rfft, rifft
3226 // INPUTS
3227 // M = log2 of fft size (ex M=10 for 1024 point fft)
3228 // OUTPUTS
3229 // private cosine and bit reversed tables
3230 //------------------------------------------------------------------------------
3231 template <typename FFT_TYPE>
3233 {
3234  for (int i = 0; i < 32; i++)
3235  {
3236  FFT_table_1[i] = (FFT_TYPE*)0;
3237  FFT_table_2[i] = (short int*)0;
3238  }
3239 
3241 
3242 // create and initialize cos table
3243  FFT_table_1[FFT_N] = new FFT_TYPE[(POW2(FFT_N) / 4 + 1)];
3245 
3246 // create and initialize bit reverse tables
3247  FFT_table_2[FFT_N/2] = new short[POW2(FFT_N/2 - 1)];
3248  fftBRInit(FFT_N, FFT_table_2[FFT_N/2]);
3249 
3250  if ((FFT_N % 2) == 0) { // FFT_N/2 = (FFT_N-1)/2 if FFT_N is odd. Prevents memory leak
3251  FFT_table_2[(FFT_N - 1) / 2] = new short[POW2((FFT_N - 1) / 2 - 1)];
3252  }
3253  fftBRInit(FFT_N - 1, FFT_table_2[(FFT_N - 1) / 2]);
3254 
3255  Utbl = ((FFT_TYPE**) FFT_table_1)[FFT_N];
3256  BRLow = ((short**) FFT_table_2)[FFT_N / 2];
3257 
3258 }
3259 
3260 //------------------------------------------------------------------------------
3261 // convert from N to LOG2(N)
3262 //------------------------------------------------------------------------------
3263 template <typename FFT_TYPE>
3265 {
3266  if (N <= 0) N = -N;
3267 
3268  switch (N) {
3269  case 0x00000001: return 0; // 1
3270  case 0x00000002: return 1; // 2
3271  case 0x00000004: return 2; // 4
3272  case 0x00000008: return 3; // 8
3273  case 0x00000010: return 4; // 16
3274  case 0x00000020: return 5; // 32
3275  case 0x00000040: return 6; // 64
3276  case 0x00000080: return 7; // 128
3277  case 0x00000100: return 8; // 256
3278  case 0x00000200: return 9; // 512
3279  case 0x00000400: return 10; // 1024
3280  case 0x00000800: return 11; // 2048
3281  case 0x00001000: return 12; // 4096
3282  case 0x00002000: return 13; // 8192
3283  case 0x00004000: return 14; // 16384
3284  case 0x00008000: return 15; // 32768
3285  case 0x00010000: return 16; // 65536
3286  case 0x00020000: return 17; // 131072
3287  case 0x00040000: return 18; // 262144
3288  case 0x00080000: return 19; // 525288
3289  case 0x00100000: return 20; // 1048576
3290  case 0x00200000: return 21; // 2097152
3291  case 0x00400000: return 22; // 4194304
3292  case 0x00800000: return 23; // 8388608
3293  case 0x01000000: return 24; // 16777216
3294  case 0x02000000: return 25; // 33554432
3295  case 0x04000000: return 26; // 67108864
3296  case 0x08000000: return 27; // 134217728
3297  case 0x10000000: return 28; // 268435456
3298  }
3299  return 0;
3300 }
3301 
3302 //------------------------------------------------------------------------------
3303 // Compute in-place complex FFT
3304 // FFTsize: FFT length in samples
3305 // buf: array of FFTsize*2 FFT_TYPE values,
3306 // in interleaved real/imaginary format
3307 //------------------------------------------------------------------------------
3308 template <typename FFT_TYPE>
3309 void g_fft<FFT_TYPE>::ComplexFFT(std::complex<FFT_TYPE> *buf)
3310 {
3311  void *ptr = buf;
3312  FFT_TYPE *nbuf = static_cast<FFT_TYPE *>(ptr);
3313  ffts1(nbuf, FFT_N, Utbl, BRLow);
3314 }
3315 
3316 //------------------------------------------------------------------------------
3317 // Compute in-place inverse complex FFT
3318 // FFTsize: FFT length in samples
3319 // buf: array of FFTsize*2 FFT_TYPE values,
3320 // in interleaved real/imaginary format
3321 // Output should be scaled by the return value of
3322 // GetInverseComplexFFTScale(fft_struct, FFTsize).
3323 //------------------------------------------------------------------------------
3324 template <typename FFT_TYPE>
3325 void g_fft<FFT_TYPE>::InverseComplexFFT(std::complex<FFT_TYPE> *buf)
3326 {
3327  void *ptr = buf;
3328  FFT_TYPE *nbuf = static_cast<FFT_TYPE *>(ptr);
3329  iffts1(nbuf, FFT_N, Utbl, BRLow);
3330 }
3331 
3332 //------------------------------------------------------------------------------
3333 // Compute in-place real FFT
3334 // FFTsize: FFT length in samples
3335 // buf: array of FFTsize FFT_TYPE values; output is in interleaved
3336 // real/imaginary format, except for buf[1] which is the real
3337 // part for the Nyquist frequency
3338 //------------------------------------------------------------------------------
3339 template <typename FFT_TYPE>
3340 void g_fft<FFT_TYPE>::RealFFT(std::complex<FFT_TYPE> *buf)
3341 {
3342  void *ptr = buf;
3343  FFT_TYPE *nbuf = static_cast<FFT_TYPE *>(ptr);
3344  rffts1(nbuf, FFT_N, Utbl, BRLow);
3345 }
3346 
3347 //------------------------------------------------------------------------------
3348 // Compute in-place inverse real FFT
3349 // FFTsize: FFT length in samples
3350 // buf: array of FFTsize FFT_TYPE values; input is expected to be in
3351 // interleaved real/imaginary format, except for buf[1] which
3352 // is the real part for the Nyquist frequency
3353 // Output should be scaled by the return value of
3354 // GetInverseRealFFTScale(fft_struct, FFTsize).
3355 //------------------------------------------------------------------------------
3356 template <typename FFT_TYPE>
3357 void g_fft<FFT_TYPE>::InverseRealFFT(std::complex<FFT_TYPE> *buf)
3358 {
3359  void *ptr = buf;
3360  FFT_TYPE *nbuf = static_cast<FFT_TYPE *>(ptr);
3361  riffts1(nbuf, FFT_N, Utbl, BRLow);
3362 }
3363 
3364 //------------------------------------------------------------------------------
3365 // Returns the amplitude scale that should be applied to the result of
3366 // an inverse complex FFT with a length of 'FFTsize' samples.
3367 //------------------------------------------------------------------------------
3368 template <typename FFT_TYPE>
3370 {
3371  return FFT_TYPE(1.0);
3372 }
3373 
3374 //------------------------------------------------------------------------------
3375 // Returns the amplitude scale that should be applied to the result of
3376 // an inverse real FFT with a length of 'FFTsize' samples.
3377 //------------------------------------------------------------------------------
3378 template <typename FFT_TYPE>
3380 {
3381  return FFT_TYPE(1.0);
3382 }
3383 
3384 #endif
3385 
void ifft8pt(FFT_TYPE *ioptr, FFT_TYPE scale)
Definition: gfft.h:1458
Fixed< IntType, IntBits > cos(Fixed< IntType, IntBits > const &x)
Definition: fixed.h:2271
void rifft8pt(FFT_TYPE *ioptr, FFT_TYPE scale)
Definition: gfft.h:2886
int FFT_N
Definition: gfft.h:56
void frstage(FFT_TYPE *ioptr, int M, FFT_TYPE *inUtbl)
Definition: gfft.h:2570
void ifftrecurs(FFT_TYPE *ioptr, int M, FFT_TYPE *Utbl, int Ustride, int NDiffU, int StageCnt)
Definition: gfft.h:2212
FFT_TYPE GetInverseRealFFTScale()
Definition: gfft.h:3379
void ifft2pt(FFT_TYPE *ioptr, FFT_TYPE scale)
Definition: gfft.h:1371
void InverseComplexFFT(std::complex< FFT_TYPE > *buf)
Definition: gfft.h:3325
void InverseRealFFT(std::complex< FFT_TYPE > *buf)
Definition: gfft.h:3357
#define POW2(m)
Definition: gfft.h:43
void fft2pt(FFT_TYPE *ioptr)
Definition: gfft.h:325
void ibfR4(FFT_TYPE *ioptr, int M, int NDiffU)
Definition: gfft.h:1683
short int * FFT_table_2[32]
Definition: gfft.h:58
void ComplexFFT(std::complex< FFT_TYPE > *buf)
Definition: gfft.h:3309
void RealFFT(std::complex< FFT_TYPE > *buf)
Definition: gfft.h:3340
short * BRLow
Definition: gfft.h:61
void ifft4pt(FFT_TYPE *ioptr, FFT_TYPE scale)
Definition: gfft.h:1402
void iffts1(FFT_TYPE *ioptr, int M, FFT_TYPE *Utbl, short *BRLow)
Definition: gfft.h:2239
void rifft2pt(FFT_TYPE *ioptr, FFT_TYPE scale)
Definition: gfft.h:2778
void bfR4(FFT_TYPE *ioptr, int M, int NDiffU)
Definition: gfft.h:637
void fftrecurs(FFT_TYPE *ioptr, int M, FFT_TYPE *Utbl, int Ustride, int NDiffU, int StageCnt)
Definition: gfft.h:1161
#define FFT_SINPID8
Definition: gfft.h:52
FFT_TYPE GetInverseComplexFFTScale()
Definition: gfft.h:3369
void rfft1pt(FFT_TYPE *ioptr)
Definition: gfft.h:2282
void fftBRInit(int M, short *BRLow)
Definition: gfft.h:161
void ffts1(FFT_TYPE *ioptr, int M, FFT_TYPE *Utbl, short *BRLow)
Definition: gfft.h:1190
void fft8pt(FFT_TYPE *ioptr)
Definition: gfft.h:412
#define FFT_COSPID8
Definition: gfft.h:51
int32_t i
Definition: decimators.h:244
g_fft(int M=8192)
Definition: gfft.h:107
void rfft4pt(FFT_TYPE *ioptr)
Definition: gfft.h:2338
void bfR2(FFT_TYPE *ioptr, int M, int NDiffU)
Definition: gfft.h:531
void fftInit()
Definition: gfft.h:3232
void scbitrevR2(FFT_TYPE *ioptr, int M, short *inBRLow, FFT_TYPE scale)
Definition: gfft.h:1231
void rfft2pt(FFT_TYPE *ioptr)
Definition: gfft.h:2304
void rifft4pt(FFT_TYPE *ioptr, FFT_TYPE scale)
Definition: gfft.h:2814
void rfft8pt(FFT_TYPE *ioptr)
Definition: gfft.h:2412
FFT_TYPE * Utbl
Definition: gfft.h:60
Definition: gfft.h:37
void ifrstage(FFT_TYPE *ioptr, int M, FFT_TYPE *inUtbl)
Definition: gfft.h:3041
#define MCACHE
Definition: gfft.h:46
void ibfstages(FFT_TYPE *ioptr, int M, FFT_TYPE *inUtbl, int Ustride, int NDiffU, int StageCnt)
Definition: gfft.h:1889
#define FFT_ROOT2
Definition: gfft.h:50
~g_fft()
Definition: gfft.h:115
#define FFT_PI
Definition: gfft.h:49
int FFT_size
Definition: gfft.h:55
void ibfR2(FFT_TYPE *ioptr, int M, int NDiffU)
Definition: gfft.h:1577
void fft4pt(FFT_TYPE *ioptr)
Definition: gfft.h:356
FFT_TYPE * FFT_table_1[32]
Definition: gfft.h:57
void bfstages(FFT_TYPE *ioptr, int M, FFT_TYPE *inUtbl, int Ustride, int NDiffU, int StageCnt)
Definition: gfft.h:843
void rifft1pt(FFT_TYPE *ioptr, FFT_TYPE scale)
Definition: gfft.h:2756
uint8_t i1
Definition: decimators.h:245
int ConvertFFTSize(int)
Definition: gfft.h:3264
void rffts1(FFT_TYPE *ioptr, int M, FFT_TYPE *Utbl, short *BRLow)
Definition: gfft.h:2704
void riffts1(FFT_TYPE *ioptr, int M, FFT_TYPE *Utbl, short *BRLow)
Definition: gfft.h:3173
void fftCosInit(int M, FFT_TYPE *Utbl)
Definition: gfft.h:141
void bitrevR2(FFT_TYPE *ioptr, int M, short *inBRLow)
Definition: gfft.h:185