//
// fftbench.c
//
// A simple FFT implementation for use as a micro-controller benchmark. This is an in-place
// Radix-2 Decimation In Time FFT using fixed point arithmetic.
//
// When reimplementing this benchmark in other languages please honour the intention of
// this benchmark by following the algorithm as closely as possible. This version is based off
// of bech_fft.spin which is to be regarded as the "mother" of all versions of this benchmark
// in other languages.
//
// This FFT was developed from the description by Douglas L. Jones at
// http://cnx.org/content/m12016/latest/.
// It is written as a direct implementation of the discussion and diagrams on that page
// with an emphasis on clarity and ease of understanding rather than speed.
//
//
// This file is released under the terms of the MIT license. See below.
//
// Credits:
//
//     A big thank you to Dave Hein for clearing up some issues during a great FFT debate on
//     the Parallax Inc Propeller discussion forum:
//     http://forums.parallax.com/showthread.php?127306-Fourier-for-dummies-under-construction
//
// History:
//
// 2011-02-27    v1.0  Initial version.
//
// 2012-10-04    v1.1  Added support for parallel processing using OpenMP
//                     A crude attempt at parallelization using up to 4 cores max.
//
// 2012-12-05    v1.2  Changed to use "parallel for" OMP construct.
//                     Configured for 4 cores max.
//
// 2020-08-20    v1.2.1  Converted back to C89. Also, use the Catalina multi-threading kernel
//                       and thread_factory (instead of OMP) when compiled using Catalina.
//
// 2026-01-19    v1.3.0  Switched to packed Real, Imag data.
// 2026-01-19    v1.3.1  Used for testing PASM library. 
// 2026-01-19    v1.3.2  Testing IFFT operations
// 2026-01-30    v1.3.3  Speed comparison of different libraries and configurations.

#include <time.h>
#include <inttypes.h> 

#ifdef _OPENMP
// Only include omp if it is available
#include <omp.h>
#else
// Otherwise provide single-threaded stand-ins for the omp functions so the
// file still compiles. There is exactly one thread, and (as in OpenMP, where
// thread numbers start at 0) that thread is number 0.
#define omp_get_max_threads() 1
#define omp_get_thread_num() 0
#endif

/*
 * define how many slices we want when parallel processing (which will also 
 * be the number of worker threads we create when using Catalina multi-tasking):
 *
 * note that we divide the work up into 2^LOG2_SLICES slices, so
 *    0 = 1 slices
 *    1 = 2 slices
 *    2 = 4 slices
 *    3 = 8 slices
 *    4 = 16 slices
 *    5 = 32 slices
 *     etc
 *
 */
#ifndef LOG2_SLICES
#define LOG2_SLICES  0
#endif

#ifdef __CATALINA__

/*
 * include the Catalina thread factory:
 */
#include "thread_factory.h"

/*
 * define the stack size for the factory and each worker:
 */
#define STACK_SIZE   200

/*
 * a lock to use to avoid kernel contention (all kernels, including those
 * in the factory, must use the same lock for this purpose):
 */
static int kernel_lock = -1;

/*
 * define a place to put the parameters for the slice a worker thread should process - we
 * need a place to put the variables slice, slices, firstLevel & lastLevel. The status field
 * indicates the status of the thread - 0 means thread is either not yet started, or that it
 * has finished its allocated slice and is waiting for a new one:
 */
/* Job description for one worker: filled in by fft_bench(), read by slice_worker(). */
struct slice_parameters {
   int slice;       /* index of the slice to process, 0 .. slices-1 */
   int slices;      /* number of slices the buffer is divided into for this pass */
   int firstLevel;  /* first butterfly level to run (passed on to butterflies()) */
   int lastLevel;   /* last butterfly level to run (passed on to butterflies()) */
   int status;      /* 0 = idle or finished, non-zero = a slice has been allocated */
};

/* One entry per worker; every entry starts out zeroed, i.e. idle. */
struct slice_parameters worker[1<<LOG2_SLICES] = { 0, 0, 0, 0, 0};

#endif

/* define PROPELLER for Catalina and GCC on the Propeller (causes the timing
 * to use the CNT counter instead of clock_gettime)
 */
#ifndef __linux__
//#define PROPELLER
#endif

#include <stdio.h>
#ifdef __P2__
#define PROPELLER
//#include <propeller2.h>
#else
#ifdef PROPELLER
//#include <propeller.h>
#ifndef _cnt
#define _cnt() CNT
#endif
#endif
#endif

#define int32_t int
#define int16_t short int

// Specify size of FFT buffer here with length and log base 2 of the length.
// N.B. Changing this will require changing the "twiddle factor" tables.
//     and may also require changing the fixed point format (if going bigger)
#define FFT_SIZE      1024
#define LOG2_FFT_SIZE 10

// cos (X) and sin (Y) parts of the signal to be analysed.
// Result is written back to here.
// Just write input samples to the X (even) entries and zero all the Y (odd) entries.
// Packed data, X is even, Y is odd index
static int32_t bxy[FFT_SIZE*2];
static int32_t rxy[FFT_SIZE*2];

// Set if array bounds exceeded
int rangeError = 0;

static void fillInput();
static void decimate();
void butterflies(int32_t* bxy, int32_t firstLevel, int32_t lastLevel, int32_t slices, int32_t slen);
static void printSpectrum();

#ifdef PROPELLER

#ifndef _clockfreq
#ifdef __P2__
#define _clockfreq() (*(unsigned int *)0x14)
#else
#define _clockfreq() _CLKFREQ
#endif
#endif

// Current value of the system counter expressed in microseconds.
// The counter is divided by the clock frequency in whole MHz (as reported by
// _clockfreq()), so any fractional MHz is lost and a clock below 1 MHz would
// make the divisor zero.
unsigned long time_us()
{
    unsigned long ticksPerMicrosecond = _clockfreq() / 1000000;

    return _cnt() / ticksPerMicrosecond;
}

#else
    
// Read CLOCK_MONOTONIC and return it as a single count of nanoseconds.
uint64_t time_ns( void ) {
  struct timespec stamp;
  uint64_t nanos;

  clock_gettime( CLOCK_MONOTONIC, &stamp );

  // whole seconds first, then add the sub-second remainder
  nanos  = (uint64_t)stamp.tv_sec * UINT64_C(1000000000);
  nanos += (uint64_t)stamp.tv_nsec;
  return nanos;
}

#endif

// Print which OpenMP version this file was compiled against.
//
// _OPENMP expands to the yyyymm release date of the supported specification;
// the switch maps the known dates to version numbers and falls back to
// printing the raw value for anything it does not recognise. When OpenMP is
// not enabled a notice is printed instead (plus a second line when built
// with Catalina, which uses its own multi-threading).
void print_omp_version() {
    #ifdef _OPENMP
    printf("OpenMP version = ");
    switch (_OPENMP) {
        case 202111:
            printf("5.2");
            break;
        case 202011:
            printf("5.1");
            break;
        case 201811:
            printf("5.0");
            break;
        case 201511:
            printf("4.5");
            break;
        case 201307:
            printf("4.0");
            break;
        case 201107:
            printf("3.1");
            break;
        case 200805:
            printf("3.0");
            break;
        case 200505:
            printf("2.5");
            break;
        case 200203:
            printf("2.0");
            break;
        default:
            printf("Unknown. _OPENMP = %d", _OPENMP);
            break;
    }
    printf("\n");
    #else
    printf("OpenMP not available on this system\n");
    #ifdef __CATALINA__
    printf("Using Catalina Multi-threading instead\n");
    #endif
    #endif
}

#ifdef __CATALINA__

/* slice_worker : call the butterflies function on a slice
 *
 * This is our worker thread function - note that the "me" parameter passed is
 * a thread number, which the worker uses to look up their slice parameters
 * in the worker array - this is just an easy way of passing parameters
 * other than a plain int, and also indicating the status of our task.
 *
 * The worker loops forever: it idles until worker[me].status becomes
 * non-zero, runs butterflies() on the slice described by worker[me], then
 * sets the status back to 0 to signal completion. The second parameter is
 * not used. The trailing return is never reached.
 */
int slice_worker(int me, char *unused[]) {
   int s, slen;

   while (1) {
      // wait until we are allocated a slice
      while (worker[me].status == 0) {
         idle();
      }

      // process our allocated slice: s and slen are counted in complex
      // samples, but bxy holds two ints (X, Y) per sample, so the slice
      // begins at int index s*2.
      s = FFT_SIZE * worker[me].slice / worker[me].slices;
      slen = FFT_SIZE / worker[me].slices;
      butterflies(&bxy[s*2], worker[me].firstLevel, worker[me].lastLevel, worker[me].slices, slen);

      // indicate we are done
      worker[me].status = 0;
   }
   return 0;
}

/*
 * next_worker - move to the next worker
 *
 * Assigns to j the index of the following worker, wrapping back to 0 after
 * the last of the (1<<LOG2_SLICES) workers. j must be a plain modifiable
 * variable because the macro assigns to it.
 */
#define next_worker(j) (j = (j + 1)%(1<<LOG2_SLICES))

#endif

#ifdef PROPELLER
struct __using("cog_fft.spin2") fft;
struct __using("inline_fft.spin2") inline_fft;
// Time one transform performed by the "cog_fft.spin2" object and print the result.
//
// flags   - option bits OR-ed into the size word passed to the object
//           (callers pass combinations of fft.REAL, fft.IFFT, fft.INPLACE, fft.NOBR).
// comment - text appended to the run-time line of the report.
//
// The input buffer is refilled on every call. The transform is started with
// fft_nonblocking() and this function then spins on is_done(), counting the
// number of polls, which is printed at the end.
void fft_bench_cog(unsigned int flags , char * comment ) {

    int i,p;   // i: loop index, p: number of is_done() polls
    unsigned long startTime, endTime;
    printf ("fft_bench v1.3.3 for PROPELLER\n");
    // Input some data
    fillInput();

    if (flags & fft.REAL )
    {
       // Copy the X (even-index) values down so they sit contiguously at the
       // start of bxy. Entry 0 is already in place, hence the loop starts at 1.
       for(i=1;i<FFT_SIZE;i++)  bxy[i]=bxy[i*2]; // pack real data
    }
    p=0;
    // Start benchmark timer
    startTime = time_us();
    //           size + config , input array, output array    input array is modified!
#if 0
    // Blocking variant, currently compiled out.
    fft.fft( (FFT_SIZE) | flags   ,     bxy   ,     rxy ); // time domain -> frequency domain
#else
    fft.fft_nonblocking( (FFT_SIZE) | flags   ,     bxy   ,     rxy );
    while( ! fft.is_done() )
    {
        p++;
    }
#endif


    // Disabled experiment: filter in the frequency domain, transform back
    // with IFFT, then transform forward again.
    if( 0 )
    {
    // frequency domain filter and value scaling
    for(i=0;i<FFT_SIZE*2;i++)  rxy[i]=i < 800 ? rxy[i]/FFT_SIZE : 0 ; 
    fft.fft( FFT_SIZE | fft.IFFT , rxy , bxy ); // frequency domain -> time domain 

    fft.fft( FFT_SIZE , bxy , rxy ); // FFT for spectrum analysis 
    } 
    // Stop benchmark timer
    endTime = time_us();

    // Print resulting spectrum: taken from bxy when INPLACE was requested,
    // otherwise from the separate output buffer rxy.
    if( flags & fft.INPLACE ) 
       printSpectrum(bxy);
    else
       printSpectrum(rxy);

    printf ("1024 point bit-reversal and butterfly run time = %u us %s\n", endTime - startTime, comment );
    printf("clock frequency = %u\n", _clockfreq());
    printf("polls = %d\n",p);
}

// Time one in-place transform of bxy performed by the "inline_fft.spin2"
// object and print the resulting spectrum, run time and clock frequency.
void fft_bench_inline() {

    int i;   // only referenced by the disabled code below
    unsigned long startTime, endTime;
    printf ("fft_bench v1.3.3 for PROPELLER\n");
    // Input some data
    fillInput();
    //printSpectrum(bxy);

    // Start benchmark timer
    startTime = time_us();
    inline_fft.fft( FFT_SIZE , bxy ); 
    // The block below is disabled test code (FFT/IFFT round trip), kept for reference.
    /*
    for(i=0;i<FFT_SIZE*2;i++)  rxy[i]=i < 800 ? rxy[i]/FFT_SIZE : 0 ;
 
    //for(i=0;i<FFT_SIZE*2;i+=1)  rxy[i]= 0 ;
    //rxy[2]=1024*1;

    // begin IFFT ' This code used for testing if FFT/IFFT was reversible  
    for(i=1;i<FFT_SIZE*2;i+=2)  rxy[i]=-rxy[i];   // inline function does not yet have IFFT mode
    fft.inline_fft( (FFT_SIZE/1) | (0<<31) , rxy ); //
    for(i=1;i<FFT_SIZE*2;i+=2)  rxy[i]=-rxy[i];
    fft.inline_bitreviq_to_absphs( FFT_SIZE/1 , rxy , bxy ); //
    //printSpectrum(bxy);

    //for(i=0;i<FFT_SIZE*2;i++)  rxy[i]=0;
    //for(i=0;i<FFT_SIZE*2;i++)  bxy[i]=bxy[i]/FFT_SIZE;
    fft.inline_fft( (FFT_SIZE/1) | (0<<31) , bxy ); // 
    fft.inline_bitreviq_to_absphs( FFT_SIZE/1 , bxy , rxy ); // 
    //for(i=0;i<FFT_SIZE*2;i++)  rxy[i]=rxy[i]*FFT_SIZE;
  */
    // Stop benchmark timer
    endTime = time_us();

    // Print resulting spectrum
    printSpectrum(bxy);
    //printSpectrum(rxy);

    printf ("1024 point bit-reversal and butterfly run time = %u us Inline \n", endTime - startTime);
    printf("clock frequency = %u\n", _clockfreq());
}

struct __using("sa_fft7.spin2") ditfft;
int32_t ditparams[6];
// Time one transform performed by the "sa_fft7.spin2" object.
//
// The object is started with a pointer to the ditparams block, the GO bit is
// then set in ditparams[0], and this function busy-waits until the low byte
// of ditparams[0] reads as zero. The spectrum is printed from rxy.
void fft_bench_dit() {

    unsigned long startTime, endTime;
    printf ("fft_bench v1.3.3 for PROPELLER\n");
    // Input some data
    fillInput();
    int32_t cog_num=-1;   // value returned by ditfft.start(); stopped below if >= 0
    int32_t FIXINT=8;
    int32_t N=10;         // NOTE(review): presumably log2 of the FFT size (1024) - confirm against sa_fft7
    // The first assignment is immediately overwritten by the second, so the
    // variant without INP_SHARED is the one actually used.
    ditparams[0]=(FIXINT*0x10100) | ((8|ditfft.INP_SHARED|ditfft.INP_COMPLEX)<<24);
    ditparams[0]=(FIXINT*0x10100) | ((8|ditfft.INP_COMPLEX)<<24);
    ditparams[1]=N | (FIXINT*0x100) ;
    ditparams[2]=(unsigned int ) &bxy[0];   // address of the input buffer
    ditparams[3]=(unsigned int ) &rxy[0];   // address of the output buffer


    cog_num=ditfft.start(&ditparams[0]);
    _waitms(2);
    // Start benchmark timer
    startTime = time_us();
    ditparams[0]|=ditfft.GO ;//| ditfft.DO_SCALED;
    //fft.cog_fft( FFT_SIZE/1 , bxy , rxy ); // real mode uses half size
    //fft.inline_fft( FFT_SIZE/1 , bxy ); // real mode uses half size
    //fft.inline_bitreviq_to_absphs( FFT_SIZE/1 , bxy , rxy ); // real mode uses half size
    // Spin until the low byte of ditparams[0] is zero.
    // NOTE(review): ditparams is not declared volatile; presumably the other
    // cog clears these bits when it finishes - confirm the compiler re-reads it.
    while( ditparams[0]&0xff );

    // Stop benchmark timer
    endTime = time_us();

    if( cog_num>=0 ) _cogstop( cog_num );

    // Print resulting spectrum
    //printSpectrum(bxy);
    printSpectrum(rxy);

    printf ("1024 point bit-reversal and butterfly run time = %u us MT\n", endTime - startTime);
    printf("clock frequency = %u\n", _clockfreq());
}
#endif


// Run and time the reference C FFT: fill the input, bit-reverse it, then run
// the butterfly levels, optionally split into slices processed in parallel.
//
// The buffer is first divided into 2^LOG2_SLICES slices and every slice runs
// levels 0..lastLevel independently. After each pass the number of slices is
// halved and exactly one further level is run on the (now twice as long)
// slices, until a single slice covering the whole buffer has been processed.
//
// With OpenMP the slices of a pass are distributed by "parallel for"; with
// Catalina they are handed to the worker threads through the worker[] array;
// otherwise they run sequentially on the calling thread.
void fft_bench() {

#ifdef __CATALINA__
    int i = 0;
#else
    int s, slen;
#endif

    int firstLevel;
    int lastLevel;
    int slice;
    int slices;

#ifdef PROPELLER
    unsigned long startTime, endTime;
    printf ("fft_bench v1.3.3 for PROPELLER\n");
#else
    uint64_t startTime, endTime;
    printf ("fft_bench v1.3.3\n");
#endif

    print_omp_version();

    // Input some data
    fillInput();

    // Start benchmark timer
#ifdef PROPELLER
    startTime = time_us();
#else
    startTime = time_ns();
#endif

    // Radix-2 Decimation In Time, the bit-reversal step.
    decimate();

    //  Our FFT array will be split into slices. each slice can be handled by it's own thread
    slices = (1<<LOG2_SLICES);
    lastLevel = LOG2_FFT_SIZE - (LOG2_SLICES + 1);

    firstLevel = 0;
    for ( ; slices >= 1; slices = slices / 2) {
        
#ifdef _OPENMP
        // bxy is the only file-scope object named inside the loop body, so
        // it is the one that has to be listed under default(none).
        #pragma omp parallel for default (none) \
            shared (bxy) \
            private (slice, s, slen) \
                firstprivate(slices, firstLevel, lastLevel)
#endif
        for (slice = 0; slice < slices; slice++) {

#ifdef __CATALINA__
            // find a free worker, waiting as necessary
            while (worker[i].status != 0) {
               next_worker(i);
               if (i == 0) {
                  idle();
               }
            }
            // found a free worker, so allocate it a slice to work on
            worker[i].slice = slice;
            worker[i].slices = slices;
            worker[i].firstLevel = firstLevel;
            worker[i].lastLevel = lastLevel;
            worker[i].status = 1;
            next_worker(i);
#else
            // s and slen are counted in complex samples; bxy stores two ints
            // (X, Y) per sample, so the slice begins at int index s*2.
            s = FFT_SIZE * slice / slices;
            slen = FFT_SIZE / slices;
            butterflies(&bxy[s*2], firstLevel, lastLevel, slices, slen);
#endif

        }

#ifdef __CATALINA__
        // wait till all workers complete their allocated slice
        i = 0;
        while (1) {
           if (worker[i].status != 0) {
              idle();
           }
           else {
              next_worker(i);
              if (i == 0) {
                 break;
              }
           }
        }
#endif

        // the next pass runs just the single level following this one
        lastLevel = lastLevel + 1;
        firstLevel = lastLevel;
    }

    // Did we have an array bounds violation?
    if (rangeError) printf ("Error: Array bounds violation\n");

    // Stop benchmark timer
#ifdef PROPELLER
    endTime = time_us();
#else
    endTime = time_ns();
#endif

    // Print resulting spectrum
    printSpectrum(bxy);

#ifdef PROPELLER
    printf ("1024 point bit-reversal and butterfly run time = %u us\n", endTime - startTime);
    printf("clock frequency = %u\n", _clockfreq());
#else
    printf ("1024 point bit-reversal and butterfly run time = %" PRIu64 "us\n", (endTime - startTime) / 1000U);
#endif

}

// Integer square root.
//
// Returns the largest integer r with r*r <= i, or 0 when i is zero or
// negative. The digit-by-digit method tests one result bit per iteration,
// starting from the highest even power of two (2^30). The working values are
// unsigned so that inputs of 2^30 and above do not overflow a signed int.
static int sqrti(int i) {
    unsigned int remainder;
    unsigned int root = 0;
    unsigned int bit = 1u << 30;

    if (i <= 0)
        return 0;
    remainder = (unsigned int)i;

    while (bit) {
        if (remainder >= root + bit) {
            // this bit belongs in the result
            remainder -= root + bit;
            root = (root >> 1) + bit;
        }
        else
            root >>= 1;
        bit >>= 2;
    }
    return (int)root;
}

// Print a table of the non-zero bins of a packed spectrum.
//
// spect holds FFT_SIZE complex values as (X, Y) pairs: X at even indices and
// Y at odd indices. Every bin is scaled down by FFT_SIZE and its magnitude
// computed as the integer square root of X^2 + Y^2. Bins whose magnitude is
// zero are skipped. All four columns are printed as 8-digit hex.
static void printSpectrum( int32_t *spect ) {
    int32_t bin;

    printf("Freq.    Magnitude     I      Q\n");
    for (bin = 0; bin < FFT_SIZE / 1; bin++) {
        int32_t *pair = &spect[bin*2];
        int32_t re = pair[0] / FFT_SIZE;
        int32_t im = pair[1] / FFT_SIZE;
        int32_t mag = sqrti ((re * re) + (im * im));

        if (mag <= 0) {
            continue;
        }
        printf ("%08x %08x %08x %08x\n", bin, mag, re, im);
    }
}

// For testing define 16 samples of an input wave form here.
// The values are one full cycle of a cosine with amplitude 4096.
static int32_t input[] =  {4096, 3784, 2896, 1567, 0, -1567, -2896, -3784, -4096, -3784, -2896, -1567, 0, 1567, 2896, 3784};

// Load the test signal into bxy and zero the result buffer rxy.
//
// For every sample the X (even) entry of bxy is the sum of:
//   - the input[] waveform stepped at 3 and at 5 entries per sample, each / 4,
//   - a term alternating between -4096/8 and +4096/8 (the highest frequency),
//   - a constant 4096/8 (DC level).
// The Y (odd) entry is the input[] waveform stepped at 4 entries per sample, / 4.
static void fillInput() {
    int32_t n, x;

    for (n = 0; n < FFT_SIZE; n++) {
        // Two frequencies of the waveform defined in input
        x  = input[(3*n) % 16] / 4;
        x += input[(5*n) % 16] / 4;

        // The highest frequency
        x += (n & 1) ? (4096 / 8) : (-4096 / 8);

        // A DC level
        x += (4096 / 8);

        bxy[n*2]     = x;
        bxy[(n*2)+1] = (input[(4*n) % 16] / 4);

        // Result buffer starts out empty
        rxy[(n*2)+0] = 0;
        rxy[(n*2)+1] = 0;
    }
}

// Reverse the "length" low order bits of x.
//
// All 32 bits are mirrored by swapping progressively larger groups (single
// bits, pairs, nibbles, bytes, half-words), then the result is shifted down
// so that only the reversed low "length" bits remain. Bits of x above
// "length" are discarded.
//
// length == 0 returns 0, and a length above 32 is treated as 32; both cases
// would otherwise ask for an out-of-range shift count.
static unsigned int bitReverse(unsigned int x,  unsigned int length) {
    if (length == 0)
        return 0;
    if (length > 32)
        length = 32;

    x = (((x & 0xaaaaaaaau) >> 1) | ((x & 0x55555555u) << 1));
    x = (((x & 0xccccccccu) >> 2) | ((x & 0x33333333u) << 2));
    x = (((x & 0xf0f0f0f0u) >> 4) | ((x & 0x0f0f0f0fu) << 4));
    x = (((x & 0xff00ff00u) >> 8) | ((x & 0x00ff00ffu) << 8));
    x = (x >> 16) | (x << 16);
    return (x  >> (32 - length));
}

// Radix-2 decimation in time.
// Exchanges every complex sample in bxy with the sample whose index is the
// bit-reversal (over LOG2_FFT_SIZE bits) of its own index. Each pair is
// swapped only once, when visited from its lower index.
static void decimate() {
    int32_t idx, mirror, part, held;

    for (idx = 0; idx < FFT_SIZE; idx++) {
        mirror = bitReverse (idx, LOG2_FFT_SIZE);
        if (idx >= mirror) {
            continue;
        }

        // part 0 is the X entry of the pair, part 1 the Y entry
        for (part = 0; part < 2; part++) {
            held = bxy[(idx*2)+part];
            bxy[(idx*2)+part] = bxy[(mirror*2)+part];
            bxy[(mirror*2)+part] = held;
        }
    }
}

static int32_t *wx;
static int32_t *wy;

// Apply FFT butterflies to the complex samples in the packed buffer bxy, in time decimated order!
// The result is written back into the same buffer.
//
// bxy        - start of the slice to process; X values at even indices, Y at odd indices
// firstLevel - first decimation level to run
// lastLevel  - last decimation level to run (inclusive)
// slices     - number of slices the whole buffer is currently divided into
// slen       - length of this slice in complex samples, used for the range check
//
// Sets the global rangeError if a butterfly index falls outside 0..slen-1.
// The access still takes place; the flag only records the violation.
void butterflies(int32_t* bxy, int32_t firstLevel, int32_t lastLevel, int32_t slices, int32_t slen) {

    // Butterflies per flight at the first level
    int32_t flightSize = 1 << firstLevel;
    // Step through the twiddle factor table at the first level
    int32_t wDelta = FFT_SIZE / (2 * (1 << firstLevel));
    // Flights within this slice at the first level
    int32_t noFlights = wDelta / slices;
    int32_t level, flight, flightIndex, wIndex, butterfly, b0, b1, a, b, c, d, k1, k2, k3, tx, ty;

    // Loop though the decimation levels
    // lastLevel is logN - 1
    for (level = firstLevel; level <= lastLevel; level++) {

        flightIndex = 0;
        // Loop through each flight on a level.
        for (flight = 0; flight < noFlights; flight++) {
            wIndex = 0;

            // Loop through butterflies within a flight.
            for (butterfly = 0; butterfly < flightSize; butterfly++) {
                b0 = flightIndex + butterfly;
                b1 = b0 + flightSize;

                // Check that we are within our array slice
                if ((b0 < 0) || (b0 >= slen)) rangeError = 1;
                if ((b1 < 0) || (b1 >= slen)) rangeError = 1;

                // At last...the butterfly.
                // Get X[b1]
                a = bxy[(b1*2)+0];
                b = bxy[(b1*2)+1];

                // Get W[wIndex]
                c = wx[wIndex];
                d = wy[wIndex];

                // Somewhat optimized complex multiply (three multiplies instead of four).
                // The >> 12 removes the scaling of the twiddle factor table.
                k1 = (a * (c + d)) >> 12;
                // T = X[b1] * W[wIndex]
                k2 = (d * (a + b)) >> 12;
                k3 = (c * (b - a)) >> 12;

                // tx = a*c - b*d, ty = a*d + b*c
                tx = k1 - k2;
                ty = k1 + k3;

                k1 = bxy[(b0*2)+0];
                k2 = bxy[(b0*2)+1];
                // X[b1] = X[b0] - T
                bxy[(b1*2)+0] = k1 - tx;
                bxy[(b1*2)+1] = k2 - ty;

                // X[b0] = X[b0] + T
                bxy[(b0*2)+0] = k1 + tx;
                bxy[(b0*2)+1] = k2 + ty;

                wIndex += wDelta;
            }
            flightIndex += flightSize << 1;
        }
        // Next level: flights are twice as long, there are half as many of
        // them, and the twiddle table is stepped through half as fast.
        flightSize <<= 1;
        noFlights >>= 1;
        wDelta >>= 1;
    }
}

// Program entry point.
//
// When built with Catalina, first sets up the kernel lock, a thread factory
// and one slice_worker thread per slice. Then runs each benchmark once: on
// the Propeller the sa_fft7, cog_fft (several flag combinations) and
// inline_fft variants, followed on every platform by the reference C
// implementation in fft_bench(). Always returns 0.
int main(int argc, char* argv[]) {

    int n;

#ifdef __CATALINA__
    int i;
    FACTORY *f;
    _thread *w;   // not used

   // assign a lock to avoid context switch contention 
   kernel_lock = _locknew();
   _thread_set_lock(kernel_lock);

   printf("Creating %d Slice Workers\n", 1<<LOG2_SLICES);

   // create a factory with any available cogs
   f = create_factory(ANY_COG, STACK_SIZE, kernel_lock);
   if (f == NULL) {
       t_printf("Cannot create factory\n");
       exit(1);
   }

   // create workers who will work in the factory; the worker's index i is
   // the "me" argument that slice_worker uses to find its worker[] entry
   for (i = 0; i < (1<<LOG2_SLICES); i++) {
      if (create_worker(f, &slice_worker, STACK_SIZE, 100, i, NULL) == NULL) {
          t_printf("Cannot create worker\n");
          exit(1);
      }
   }
#endif

    // Single pass; raise the loop bound to repeat the whole benchmark set.
    for (n = 0; n < 1; n++) {
#ifdef PROPELLER
        fft_bench_dit();
        fft_bench_cog(fft.NOBR|fft.INPLACE,"FFT ONLY, no bit reversal");// INPLACE optional
        fft_bench_cog(fft.IFFT|fft.INPLACE,"IFFT In-Place Algorithm");
        fft_bench_cog(fft.INPLACE,"In-Place Algorithm");
        fft_bench_cog(fft.IFFT,"IFFT");
        fft_bench_cog(0,"Out-of-Place Algorithm");
        fft_bench_cog(fft.REAL,"Real Input");
        fft_bench_inline();
#endif
        fft_bench();
    }
    return(0);
}

// Cosine from 0 to 3π/2 (0 to 270 degrees)
static int32_t cos[768] = {
    4095,  4094,  4094,  4094,  4093,  4093,  4092,  4091,  4090,  4088,  4087,  4085,  4083,  4081,  4079,  4077,
    4075,  4072,  4070,  4067,  4064,  4061,  4057,  4054,  4050,  4046,  4042,  4038,  4034,  4030,  4025,  4021,
    4016,  4011,  4006,  4000,  3995,  3989,  3984,  3978,  3972,  3966,  3959,  3953,  3946,  3939,  3932,  3925,
    3918,  3911,  3903,  3896,  3888,  3880,  3872,  3864,  3855,  3847,  3838,  3829,  3820,  3811,  3802,  3792,
    3783,  3773,  3763,  3753,  3743,  3733,  3723,  3712,  3701,  3691,  3680,  3668,  3657,  3646,  3634,  3623,
    3611,  3599,  3587,  3575,  3563,  3550,  3537,  3525,  3512,  3499,  3486,  3473,  3459,  3446,  3432,  3418,
    3404,  3390,  3376,  3362,  3348,  3333,  3318,  3304,  3289,  3274,  3258,  3243,  3228,  3212,  3197,  3181,
    3165,  3149,  3133,  3117,  3100,  3084,  3067,  3051,  3034,  3017,  3000,  2983,  2965,  2948,  2930,  2913,
    2895,  2877,  2859,  2841,  2823,  2805,  2787,  2768,  2750,  2731,  2712,  2693,  2674,  2655,  2636,  2617,
    2597,  2578,  2558,  2539,  2519,  2499,  2479,  2459,  2439,  2419,  2398,  2378,  2357,  2337,  2316,  2295,
    2275,  2254,  2233,  2211,  2190,  2169,  2148,  2126,  2105,  2083,  2061,  2040,  2018,  1996,  1974,  1952,
    1930,  1908,  1885,  1863,  1841,  1818,  1796,  1773,  1750,  1728,  1705,  1682,  1659,  1636,  1613,  1590,
    1567,  1543,  1520,  1497,  1473,  1450,  1426,  1403,  1379,  1355,  1332,  1308,  1284,  1260,  1236,  1212,
    1188,  1164,  1140,  1116,  1092,  1067,  1043,  1019,   994,   970,   946,   921,   897,   872,   848,   823,
    798,   774,   749,   724,   700,   675,   650,   625,   600,   575,   551,   526,   501,   476,   451,   426,
    401,   376,   351,   326,   301,   276,   251,   226,   200,   175,   150,   125,   100,    75,    50,    25,
    0,   -25,   -50,   -75,  -100,  -125,  -150,  -175,  -200,  -226,  -251,  -276,  -301,  -326,  -351,  -376,
    -401,  -426,  -451,  -476,  -501,  -526,  -551,  -576,  -600,  -625,  -650,  -675,  -700,  -724,  -749,  -774,
    -798,  -823,  -848,  -872,  -897,  -921,  -946,  -970,  -995, -1019, -1043, -1067, -1092, -1116, -1140, -1164,
    -1188, -1212, -1236, -1260, -1284, -1308, -1332, -1355, -1379, -1403, -1426, -1450, -1473, -1497, -1520, -1543,
    -1567, -1590, -1613, -1636, -1659, -1682, -1705, -1728, -1750, -1773, -1796, -1818, -1841, -1863, -1885, -1908,
    -1930, -1952, -1974, -1996, -2018, -2040, -2062, -2083, -2105, -2126, -2148, -2169, -2190, -2212, -2233, -2254,
    -2275, -2295, -2316, -2337, -2357, -2378, -2398, -2419, -2439, -2459, -2479, -2499, -2519, -2539, -2558, -2578,
    -2597, -2617, -2636, -2655, -2674, -2693, -2712, -2731, -2750, -2768, -2787, -2805, -2823, -2841, -2859, -2877,
    -2895, -2913, -2930, -2948, -2965, -2983, -3000, -3017, -3034, -3051, -3067, -3084, -3100, -3117, -3133, -3149,
    -3165, -3181, -3197, -3212, -3228, -3243, -3258, -3274, -3289, -3304, -3318, -3333, -3348, -3362, -3376, -3390,
    -3404, -3418, -3432, -3446, -3459, -3473, -3486, -3499, -3512, -3525, -3537, -3550, -3563, -3575, -3587, -3599,
    -3611, -3623, -3634, -3646, -3657, -3669, -3680, -3691, -3701, -3712, -3723, -3733, -3743, -3753, -3763, -3773,
    -3783, -3792, -3802, -3811, -3820, -3829, -3838, -3847, -3855, -3864, -3872, -3880, -3888, -3896, -3903, -3911,
    -3918, -3925, -3932, -3939, -3946, -3953, -3959, -3966, -3972, -3978, -3984, -3989, -3995, -4000, -4006, -4011,
    -4016, -4021, -4025, -4030, -4034, -4038, -4043, -4046, -4050, -4054, -4057, -4061, -4064, -4067, -4070, -4072,
    -4075, -4077, -4079, -4081, -4083, -4085, -4087, -4088, -4090, -4091, -4092, -4093, -4093, -4094, -4094, -4094,
    -4094, -4094, -4094, -4094, -4093, -4093, -4092, -4091, -4090, -4088, -4087, -4085, -4083, -4081, -4079, -4077,
    -4075, -4072, -4070, -4067, -4064, -4061, -4057, -4054, -4050, -4046, -4042, -4038, -4034, -4030, -4025, -4021,
    -4016, -4011, -4006, -4000, -3995, -3989, -3984, -3978, -3972, -3966, -3959, -3953, -3946, -3939, -3932, -3925,
    -3918, -3911, -3903, -3896, -3888, -3880, -3872, -3863, -3855, -3847, -3838, -3829, -3820, -3811, -3802, -3792,
    -3783, -3773, -3763, -3753, -3743, -3733, -3723, -3712, -3701, -3691, -3680, -3668, -3657, -3646, -3634, -3623,
    -3611, -3599, -3587, -3575, -3562, -3550, -3537, -3525, -3512, -3499, -3486, -3473, -3459, -3446, -3432, -3418,
    -3404, -3390, -3376, -3362, -3347, -3333, -3318, -3304, -3289, -3274, -3258, -3243, -3228, -3212, -3197, -3181,
    -3165, -3149, -3133, -3117, -3100, -3084, -3067, -3050, -3034, -3017, -3000, -2983, -2965, -2948, -2930, -2913,
    -2895, -2877, -2859, -2841, -2823, -2805, -2787, -2768, -2749, -2731, -2712, -2693, -2674, -2655, -2636, -2617,
    -2597, -2578, -2558, -2539, -2519, -2499, -2479, -2459, -2439, -2419, -2398, -2378, -2357, -2337, -2316, -2295,
    -2275, -2254, -2233, -2211, -2190, -2169, -2148, -2126, -2105, -2083, -2061, -2040, -2018, -1996, -1974, -1952,
    -1930, -1908, -1885, -1863, -1841, -1818, -1796, -1773, -1750, -1728, -1705, -1682, -1659, -1636, -1613, -1590,
    -1567, -1543, -1520, -1497, -1473, -1450, -1426, -1403, -1379, -1355, -1332, -1308, -1284, -1260, -1236, -1212,
    -1188, -1164, -1140, -1116, -1092, -1067, -1043, -1019,  -994,  -970,  -946,  -921,  -897,  -872,  -848,  -823,
    -798,   -774,  -749,  -724,  -700,  -675,  -650,  -625,  -600,  -575,  -551,  -526,  -501,  -476,  -451,  -426,
    -401,   -376,  -351,  -326,  -301,  -276,  -251,  -225,  -200,  -175,  -150,  -125,  -100,   -75,   -50,   -25
};

// Half cycle of cos
static int32_t *wx = &cos[0];

// Half cycle of minus sine
static int32_t *wy = &cos[256];

//    This file is distributed under the terms of the The MIT License as follows:
//
//    Copyright (c) 2012 Michael Rychlik
//
//    Permission is hereby granted, free of charge, to any person obtaining a copy
//    of this software and associated documentation files (the "Software"), to deal
//    in the Software without restriction, including without limitation the rights
//    to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
//    copies of the Software, and to permit persons to whom the Software is
//    furnished to do so, subject to the following conditions:
//
//    The above copyright notice and this permission notice shall be included in
//    all copies or substantial portions of the Software.
//
//    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
//    IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
//    FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
//    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
//    LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
//    OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
//    THE SOFTWARE.
