1024-point FFT in 79 longs

Cluso99 · 2019-12-20 06:46

cgracey wrote: »
Cluso99 wrote: »
		sub	a1,b1		'write (a1-b1,a2-b2) to (b1,b2)
		sub	a1,b1
		sub	a2,b2
		sub	a2,b2
		setq	#2-1
		wrlong	a1,ptrb++
This seems to be doing
(a1-2*b1, a2-2*b2)
???
A1 and A2 previously had B1 and B2 added to them. So we need to subtract twice now.

Aha. Makes sense now

cgracey · 2019-12-20 11:29

I cleaned up the spectrograph program and the FFT inside. If you want to run it on your P2 Eval board, connect the HDMI board to P[55:48] and hook a microphone into the A/V board on P[7:0]. Or, just connect a microphone via a .1uF capacitor to P[5].

I made some enhancements that Rayman suggested, like adding a window function for the FFT input sample set. I made a nice cosine-shaped window filter which actually cleaned up the output quite a bit.

Here is the code, followed by a partial screen shot of me talking into it:

'*********************************************************************
'*  Use microphone on A/V board to demonstrate spectrograph on HDMI  *
'*********************************************************************

CON		av_base		= 0		'must be a multiple of 8
		hdmi_base	= 48		'must be a multiple of 8

		mic_pin		= av_base + 5
		hdmi_pins	= hdmi_base + 7<<6

		sample_buff	= $1000				'Microphone samples buffer (1024 words)
		real_imag_buff	= sample_buff	 + 1024*2	'FFT (real,imaginary) buffer (1024*2 longs)
		powr_angl_buff	= real_imag_buff + 1024*2*4	'FFT powers buffer (512*2 longs)
		bitmap		= powr_angl_buff + 512*2*4	'HDMI bitmap (300 KB)


DAT		org

                hubset  ##%1_000001_0000011000_1111_10_00       'config PLL, 20MHz/2*25*1 = 250MHz
                waitx   ##20_000_000 / 200                      'allow crystal+PLL 5ms to stabilize
                hubset  ##%1_000001_0000011000_1111_10_11       'switch to PLL

		setq	##($7FFFF - @end_of_pgm)/4		'clear hub RAM
		wrlong	#0,##@end_of_pgm

		coginit	#2,#@pgm_hdmi		'launch HDMI
		coginit	#1,#@pgm_mic		'launch Microphone
		coginit	#0,#@pgm_spectro	'launch Spectrograph


'*********************************
'*  HDMI 640 x 480 x 8bpp luma8  *
'*********************************

DAT             org

pgm_hdmi        setcmod #$100                  	'enable HDMI mode
                drvl    #hdmi_pins       	'enable HDMI pins
                wrpin   ##%111001<<8,#hdmi_pins 'set 1.5k low drive on HDMI pins

                setxfrq ##$0CCCCCCC+1           'set streamer freq to 1/10th clk (25 MHz)

                rdfast  ##640*480/64,##bitmap   'set rdfast to wrap on 300KB bitmap

' Field loop

.field          mov     .hsync0,.sync_000       'vsync off
                mov     .hsync1,.sync_001

                callpa  #10,#.blank             'top blanks

                mov     .i,#480                 'set visible lines
.line           call    #.hsync                 'do horizontal sync
                xcont   .m_rf,#%000             'do visible line, %xxx selects rgb color
                djnz    .i,#.line               'another line?

		wrbyte	#1,#0			'signal ideal time to update bitmap

                callpa  #33,#.blank             'bottom blanks

                mov     .hsync0,.sync_002       'vsync on
                mov     .hsync1,.sync_003

                callpa  #2,#.blank              'vertical sync blanks

                jmp     #.field                 'loop

' Subroutines

.blank          call    #.hsync                 'blank lines
                xcont   .m_vi,.hsync0
        _ret_   djnz    pa,#.blank

.hsync          xcont   .m_bs,.hsync0           'horizontal sync
                xzero   .m_sn,.hsync1
        _ret_   xcont   .m_bv,.hsync0

' Data

.sync_000       long    %1101010100_1101010100_1101010100_10    '
.sync_001       long    %1101010100_1101010100_0010101011_10    '        hsync
.sync_002       long    %1101010100_1101010100_0101010100_10    'vsync
.sync_003       long    %1101010100_1101010100_1010101011_10    'vsync + hsync

.m_bs           long    $70810000 + hdmi_base<<17 + 16          'before sync
.m_sn           long    $70810000 + hdmi_base<<17 + 96          'sync
.m_bv           long    $70810000 + hdmi_base<<17 + 48          'before visible
.m_vi           long    $70810000 + hdmi_base<<17 + 640         'visible
.m_rf           long    $B0820000 + hdmi_base<<17 + 640         'visible rfbyte luma8

.hsync0         res     1
.hsync1         res     1

.i		res	1


'****************
'*  Microphone  *
'****************

DAT		org

pgm_mic		wrpin	##%100111_0000000_00_11000_0,#mic_pin	'set mic for 100x-mag and 14-bit SINC2 sampling
		wxpin	#13,#mic_pin
		dirh	#mic_pin

.wait		testp	#mic_pin	wc	'wait for sample
	if_nc	jmp	#.wait

		rdpin	.x,#mic_pin		'get sample and convert to signed word (acknowledges pin)
		shl	.x,#2
		bitnot	.x,#15

		setq2	#512-1			'load sample buffer in hub into lut at single-sample offset
		rdlong	0,##sample_buff+2

		rdlut	.y,.lastlut		'install sample in last word of buffer
		setword	.y,.x,#1
		wrlut	.y,.lastlut

		setq2	#512-1			'write buffer back with new sample (scrolls buffer)
		wrlong	0,##sample_buff

		jmp	#.wait			'next sample

' Data

.lastlut	long	$1FF

.x		res	1
.y		res	1


'******************
'*  Spectrograph  *
'******************

DAT		org

pgm_spectro	call	#fft			'do fft on current samples

.wait		rdbyte	i1,#0			'wait for ideal time to update bitmap
		tjz	i1,#.wait		'comment-out this line to ignore refresh and go faster
		wrbyte	#0,#0

		mov	i1,#480			'ready to scroll screen and insert new pixel column
		mov	i2,##bitmap

.line		add	i2,#1			'read in pixel row from bitmap at +1 offset
		setq2	#640/4-1
		rdlong	0,i2

		mov	i3,i1			'get power sample
		shl	i3,#3
		add	i3,powr_angl_ptr
		rdlong	i3,i3

		shr	i3,#10			'scale power sample down and clamp it for luma8 pixel use
		fle	i3,#$FF

		rdlut	i4,#640/4-1		'install pixel at end of pixel row
		setbyte	i4,i3,#3
		wrlut	i4,#640/4-1

		sub	i2,#1			'write pixel row back to bitmap at +0 offset (scrolls pixels)
		setq2	#640/4-1
		wrlong	0,i2

		add	i2,##640		'point to next pixel row
		djnz	i1,#.line		

		jmp	#pgm_spectro
'
'
' Fast Fourier Transform, 1024-point
'
' on entry:	sample_buff must hold 1024 signed word samples
' on exit:	powr_angl_buff holds 512 long (power,angle) pairs
'

' Load 1024 signed-word samples into 1024 signed-long (real,imaginary) coordinate pairs

fft		mov	i1,#0			'reset index
		mov	ptra,sample_ptr		'ready to read from sample_buff
		wrfast	#0,real_imag_ptr	'ready to write to real_imag_buff

.load		mov	i2,i1			'apply 1-minus-cosine window to samples
		shl	i2,#32-(encod 1024)
		qrotate	##$3FFF,i2
		rdword	ax,ptra++		'read sample while waiting for window value
		getqx	i2	
		subr	i2,##$4000
		muls	ax,i2			'scale sample by window value
		sar	ax,#15
		wflong	ax			'store windowed sample as (real=sample,imaginary=0)
		wflong	#0
		incmod	i1,##1024-1		'loop until all samples stored as (real,imaginary) pairs
		tjnz	i1,#.load


' Perform 1024-point FFT

		decod	i1,#encod(1024 >> 1)	'init counters
		mov	i2,#1


.loop1		mov	i3,#0
		mov	i4,i1
		mov	c1,i2
		encod	shift,i1

.loop2		mov	angle,i3
		shr	angle,shift
		rev	angle

		mov	ptra,i3			'point to (ax,ay)
		shl	ptra,#3
		add	ptra,real_imag_ptr

		mov	ptrb,i1			'point to (bx,by)
		shl	ptrb,#3
		add	ptrb,ptra

		mov	c2,i4
		sub	c2,i3


.loop3		setq	#2-1			'read (bx,by)
		rdlong	bx,ptrb

		setq	by			'rotate (bx,by) by angle
		qrotate	bx,angle

		setq	#2-1			'read (ax,ay)
		rdlong	ax,ptra

		getqx	bx			'get rotated (bx,by)
		getqy	by

		add	ax,bx			'(ax,ay) = (ax+bx,ay+by)
		add	ay,by

		shl	bx,#1			'(bx,by) = (ax-bx,ay-by)
		subr	bx,ax
		shl	by,#1
		subr	by,ay

		setq	#2-1			'write (ax,ay)
		wrlong	ax,ptra++

		setq	#2-1			'write (bx,by)
		wrlong	bx,ptrb++

		djnz	c2,#.loop3


		add	i3,i1
		add	i3,i1
		add	i4,i1
		add	i4,i1

		djnz	c1,#.loop2


		shr	i1,#1
		shl	i2,#1

		tjnz	i1,#.loop1


' Convert 1024-point FFT results to 512 (power,angle) pairs

		wrfast	#0,powr_angl_ptr	'ready to store (power,angle) pairs, i1=0

.convert	mov	ptra,i1			'get bit-reverse index for (real,imaginary) pair
		rev	ptra
		shr	ptra,#32 - (encod 1024) - 3	'(-3 for two longs per pair)
		add	ptra,real_imag_ptr
		setq	#2-1			'read (real,imaginary)
		rdlong	ax,ptra

		qvector	ax,ay			'convert (real,imaginary) to (power,angle)

		getqx	ax			'store (power,angle)
		wflong	ax
		getqy	ay
		wflong	ay

		incmod	i1,#512-1		'loop until conversions done
	_ret_	tjnz	i1,#.convert
'
'
' Data
'
sample_ptr	long	sample_buff
real_imag_ptr	long	real_imag_buff
powr_angl_ptr	long	powr_angl_buff

c1		res	1
c2		res	1
i1		res	1
i2		res	1
i3		res	1
i4		res	1
shift		res	1
angle		res	1
ax		res	1	'(ax,ay) must be contiguous
ay		res	1
bx		res	1	'(bx,by) must be contiguous
by		res	1


end_of_pgm

Here is a file containing the code and the updated PNut.exe you will need to compile it with:

https://drive.google.com/file/d/1vWiyXDIwwLDsx0Fsyc7Mdqqu4jC87jdc/view?usp=sharing

cgracey · 2019-12-20 11:38

I've been messing around with FFT stuff all day and I realized that when you are dealing with small contiguous operations, they all lie next to each other in memory, making it easy to load and save contiguous stretches of hub RAM, while performing the smaller math on the entire loaded memory section. I'm going to work on this a little more tomorrow and be done with it for a while. Currently, this code takes 3ms to do the entire FFT procedure. Tomorrow, I should be able to get it working in under 1ms.

Rayman · 2019-12-20 11:47

Looking at mp3 again... noticed that they take samples from past frame and next frame into ends of current frame.
I think that makes sure you don’t miss anything when the window is applied...

For this, I wouldn’t do it.
But for voice recognition, might want to.

evanh · 2019-12-20 12:09

I don't think I've ever seen a spectrum analyser image looking that detailed before. The flashy MP3 ones always appeared useless so it hasn't much interested me.

lonesock · 2019-12-20 18:21

My FFT skillz are rusty, but isn't the smallest loop (1x512) only doing rotations by 90-degrees (or was it 180?). If that's the case it should be easy to special case that pass. If I am way off, then sorry for the distraction!

Jonathan

Rayman · 2019-12-20 19:03

I think it'd also be nice to just show x-y plot of magnitude vs. frequency...

cgracey · 2019-12-20 20:10

Rayman wrote: »

I think it'd also be nice to just show x-y plot of magnitude vs. frequency...

You probably already realize this, but that spectrograph program shows frequency (0Hz..15KHz) on the vertical, time rolling left on the horizontal (60 pixel columns per second), and magnitude as pixel intensity. Seeing the FFT output change over time gives a whole extra dimension to what FFT's are usually used for.

cgracey · 2019-12-20 20:13

evanh wrote: »

I don't think I've ever seen a spectrum analyser image looking that detailed before. The flashy MP3 ones always appeared useless so it hasn't much interested me.

It's going to be really interesting when we can modify DDS signals within feedback loops and watch them like this. It'll be a whole new frontier of possibilities.

cgracey · 2019-12-20 20:58

lonesock wrote: »

My FFT skillz are rusty, but isn't the smallest loop (1x512) only doing rotations by 90-degrees (or was it 180?). If that's the case it should be easy to special case that pass. If I am way off, then sorry for the distraction!

Jonathan

I just looked at the angles and I'm surprised to see that they only range from 0 to just under 180 degrees. Makes sense, though, as the compounding of rotations can create many effective angles greater than 180 degrees.

Here is what I'm seeing:

span	iter's	angles (MSBS, 000/400/200 = 0/90/45 deg)
--------------------------------------------------------
512	1 	000
256	2	000 400
128	4	000 400 200 600
64	8	000 400 200 600 100 500 300 700
32	16	000 400 200 600 ... 180 580 380 780
16	32	000 400 200 600 ... 1C0 5C0 3C0 7C0
8	64	000 400 200 600 ... 1E0 5E0 3E0 7E0
4	128	000 400 200 600 ... 1F0 5F0 3F0 7F0
2	256	000 400 200 600 ... 1F8 5F8 3F8 7F8
1	512	000 400 200 600 ... 1FC 5FC 3FC 7FC

So, the first 512-span iteration doesn't even rotate anything, and just does the adds/subs. The next 256-span iteration only rotates by 90 degrees in its last half. In every inner loop, the rotation can be skipped when the angle is zero.

cgracey · 2019-12-20 22:10

I think I'm going to go back to work on Spin2 now and not worry about optimizing this FFT. That can be done at a later time. What we have is very simple and understandable and not too slow. I think only a 3x speed-up is possible. If 10x was possible, I'd keep working on it, but for now, we've got something pretty good and very compact.

I was really pleased to realize that SETQ+QROTATE takes care of all the hard math in the FFT. The rest is just adds and subtracts. I keep wondering, though, if there's another level of optimization possible, aside from stuffing the CORDIC and unrolling the inner loop, somewhat.

msrobots · 2019-12-20 22:15

You need to write a spin version and compare the results, so back to Spin...

Mike

Publison · 2019-12-26 20:32

The unrelated responses were moved to a new thread:
http://forums.parallax.com/discussion/170982/heater-has-6-3-vac#latest

SaucySoliton · 2020-05-07 04:48

Finally tried this out. Very cool! Here is a version that uses scope mode and therefore can support data rates in the megasamples. It's set at 3.676MSPS right now as that is what is needed to show the entire AM radio band on the screen. It really looks like we could receive AM radio by just connecting a wire to a P2 pin.

I added some syncronization between the acquire and processing. The streamer would otherwise write so fast that samples would change before the an entire 1024 samples is read by the window filter code. It's my first time using cogatn. It was easy. The code also has changes to compile with fastspin.

'*********************************************************************
'*  Scope mode spectrograph on HDMI  *
'*********************************************************************

CON		av_base		= 0		'must be a multiple of 8
		hdmi_base	= 48		'must be a multiple of 8

		mic_pin		= av_base + 0  ' 5 for mic,  chose  0/H for scope mode ADC
		hdmi_pins	= hdmi_base + 7<<6

		sample_buff	= $1000				'Microphone samples buffer (1024 words)
		real_imag_buff	= sample_buff	 + 1024*2	'FFT (real,imaginary) buffer (1024*2 longs)
		powr_angl_buff	= real_imag_buff + 1024*2*4	'FFT powers buffer (512*2 longs)
		bitmap		= powr_angl_buff + 512*2*4	'HDMI bitmap (300 KB)


DAT		org

                hubset  ##%1_000001_0000011000_1111_10_00       'config PLL, 20MHz/2*25*1 = 250MHz
                waitx   ##20_000_000 / 200                      'allow crystal+PLL 5ms to stabilize
                hubset  ##%1_000001_0000011000_1111_10_11       'switch to PLL

		setq	##($7FFFF - @end_of_pgm)/4		'clear hub RAM
		wrlong	#0,##@end_of_pgm

		coginit	#2,#@pgm_hdmi		'launch HDMI
		coginit	#1,#@pgm_scope		'launch Microphone
		coginit	#0,#@pgm_spectro	'launch Spectrograph


'*********************************
'*  HDMI 640 x 480 x 8bpp luma8  *
'*********************************

DAT             org

pgm_hdmi        setcmod #$100                  	'enable HDMI mode
                drvl    #hdmi_pins       	'enable HDMI pins
                wrpin   ##%111001<<8,#hdmi_pins 'set 1.5k low drive on HDMI pins

                setxfrq ##$0CCCCCCC+1           'set streamer freq to 1/10th clk (25 MHz)

                rdfast  ##640*480/64,##bitmap   'set rdfast to wrap on 300KB bitmap

' Field loop

.field          mov     .hsync0,.sync_000       'vsync off
                mov     .hsync1,.sync_001

                callpa  #10,#.blank             'top blanks

                mov     .i,#480                 'set visible lines
.line           call    #.hsync                 'do horizontal sync
                xcont   .m_rf,#%000             'do visible line, %xxx selects rgb color
                djnz    .i,#.line               'another line?

		wrbyte	#1,#0			'signal ideal time to update bitmap

                callpa  #33,#.blank             'bottom blanks

                mov     .hsync0,.sync_002       'vsync on
                mov     .hsync1,.sync_003

                callpa  #2,#.blank              'vertical sync blanks

                jmp     #.field                 'loop

' Subroutines

.blank          call    #.hsync                 'blank lines
                xcont   .m_vi,.hsync0
        _ret_   djnz    pa,#.blank

.hsync          xcont   .m_bs,.hsync0           'horizontal sync
                xzero   .m_sn,.hsync1
        _ret_   xcont   .m_bv,.hsync0

' Data

.sync_000       long    %1101010100_1101010100_1101010100_10    '
.sync_001       long    %1101010100_1101010100_0010101011_10    '        hsync
.sync_002       long    %1101010100_1101010100_0101010100_10    'vsync
.sync_003       long    %1101010100_1101010100_1010101011_10    'vsync + hsync

.m_bs           long    $70810000 + hdmi_base<<17 + 16          'before sync
.m_sn           long    $70810000 + hdmi_base<<17 + 96          'sync
.m_bv           long    $70810000 + hdmi_base<<17 + 48          'before visible
.m_vi           long    $70810000 + hdmi_base<<17 + 640         'visible
.m_rf           long    $B0820000 + hdmi_base<<17 + 640         'visible rfbyte luma8

.hsync0         res     1
.hsync1         res     1

.i		res	1


'****************
'*  Microphone  *
'****************

DAT		org

pgm_mic		wrpin	##%100111_0000000_00_11000_0,#mic_pin	'set mic for 100x-mag and 14-bit SINC2 sampling
		wxpin	#13,#mic_pin
		dirh	#mic_pin

.wait		testp	#mic_pin	wc	'wait for sample
	if_nc	jmp	#.wait

		rdpin	.x,#mic_pin		'get sample and convert to signed word (acknowledges pin)
		shl	.x,#2
		bitnot	.x,#15

		setq2	#512-1			'load sample buffer in hub into lut at single-sample offset
		rdlong	0,##sample_buff+2

		rdlut	.y,.lastlut		'install sample in last word of buffer
		setword	.y,.x,#1
		wrlut	.y,.lastlut

		setq2	#512-1			'write buffer back with new sample (scrolls buffer)
		wrlong	0,##sample_buff

		jmp	#.wait			'next sample

' Data

.lastlut	long	$1FF

.x		res	1
.y		res	1


'****************
'*  Scope ADC   *
'****************

CON
   sp_m_sinc     =  24<<1
   sp_m_sinc_ext =  25<<1
   sp_m_scope    =  %11010<<1

   sp_adc_1x     =  %100_011<<15
   sp_adc_3x     =  %100_100<<15
   sp_adc_10x    =  %100_101<<15
   sp_adc_31x    =  %100_110<<15
   sp_adc_100x   =  %100_111<<15
   sp_adc_gio    =  %100_000<<15
   sp_adc_vio    =  %100_001<<15
   sp_adc_flt    =  %100_010<<15   

   sp_dac_900    =  %101_00<<16
   sp_dac_600    =  %101_01<<16
   sp_dac_123    =  %101_10<<16
   sp_dac_75     =  %101_11<<16

   st_dac_none          =  0<<24
   st_dac_X0_0123       =  1<<24
   st_dac_X0_01         =  2<<24
   st_dac_X0_23         =  3<<24
   st_dac_X0_0          =  4<<24
   st_dac_X0_1          =  5<<24
   st_dac_X0_2          =  6<<24
   st_dac_X0_3          =  7<<24
   st_dac_X0_0d12d3     =  8<<24
   st_dac_X0_0d1        =  9<<24
   st_dac_X0_2d3        = 10<<24
   st_dac_X0X1_0123     = 11<<24
   st_dac_X0X1_01       = 12<<24
   st_dac_X0X1_23       = 13<<24
   st_dac_X0X1_0d12d3   = 14<<24
   st_dac_X0X1X2X3_0123 = 15<<24
   st_dac_all           = 15<<24

   
   
   st_scope_1adc       =  %1111_0000_0000_0010<<16
   st_scope_1adc_8pin  =  %1111_0000_0000_0011<<16
   st_scope_2adc       =  %1111_0000_0000_0100<<16
   st_scope_2adc_16pin =  %1111_0000_0000_0101<<16
   st_scope_4adc       =  %1111_0000_0000_0110<<16
   st_scope_wfast      = 1<<23



DAT		org

pgm_scope	wrpin	##sp_adc_10x | sp_m_scope ,#mic_pin	'
		wxpin	#1,#mic_pin		' select filer taps
		dirh	#mic_pin

                setscp   #(1<<6)|mic_pin

                wrfast   #2*1024/64,##sample_buff

		setxfrq	##1<<31/(68)    ' 250/68 = 3.676MSPS
.repeat
		waitatn
		xcont	##st_scope_2adc | st_scope_wfast | 1024 ,#mic_pin
		'waitx	##250000000/60


		jmp	#.repeat			'next sample

' Data

.lastlut	long	$1FF

.x		res	1
.y		res	1


'******************
'*  Spectrograph  *
'******************

DAT		org

pgm_spectro	call	#fft			'do fft on current samples

.wait		rdbyte	i1,#0			'wait for ideal time to update bitmap
		tjz	i1,#.wait		'comment-out this line to ignore refresh and go faster
		wrbyte	#0,#0

		mov	i1,#480			'ready to scroll screen and insert new pixel column
		mov	i2,##bitmap

.line		add	i2,#1			'read in pixel row from bitmap at +1 offset
		setq2	#640/4-1
		rdlong	0,i2

		mov	i3,i1			'get power sample
		shl	i3,#3
		add	i3,powr_angl_ptr
		rdlong	i3,i3

		shr	i3,#10			'scale power sample down and clamp it for luma8 pixel use
		fle	i3,#$FF

		rdlut	i4,#640/4-1		'install pixel at end of pixel row
		setbyte	i4,i3,#3
		wrlut	i4,#640/4-1

		sub	i2,#1			'write pixel row back to bitmap at +0 offset (scrolls pixels)
		setq2	#640/4-1
		wrlong	0,i2

		add	i2,##640		'point to next pixel row
		djnz	i1,#.line		

		jmp	#pgm_spectro
'
'
' Fast Fourier Transform, 1024-point
'
' on entry:	sample_buff must hold 1024 signed word samples
' on exit:	powr_angl_buff holds 512 long (power,angle) pairs
'

' Load 1024 signed-word samples into 1024 signed-long (real,imaginary) coordinate pairs

fft		mov	i1,#0			'reset index
		mov	ptra,sample_ptr		'ready to read from sample_buff
		wrfast	#0,real_imag_ptr	'ready to write to real_imag_buff

.load		mov	i2,i1			'apply 1-minus-cosine window to samples
		shl	i2,#32-((10))
		qrotate	##$3FFF,i2
		rdword	ax,ptra++		'read sample while waiting for window value

		getbyte ax,ax,#mic_pin&1	' choose channel
		shl	ax,#8			' byte to word
		sub	ax,##128<<8		' unsigned to signed

		getqx	i2	
		subr	i2,##$4000
		muls	ax,i2			'scale sample by window value
		sar	ax,#15
		wflong	ax			'store windowed sample as (real=sample,imaginary=0)
		wflong	#0
		incmod	i1,##1024-1		'loop until all samples stored as (real,imaginary) pairs
		tjnz	i1,#.load
		cogatn	#1<<1	' scope cog

' Perform 1024-point FFT

		decod	i1,#((10))-1	'init counters
		mov	i2,#1


.loop1		mov	i3,#0
		mov	i4,i1
		mov	c1,i2
		encod	shift,i1

.loop2		mov	angle,i3
		shr	angle,shift
		rev	angle

		mov	ptra,i3			'point to (ax,ay)
		shl	ptra,#3
		add	ptra,real_imag_ptr

		mov	ptrb,i1			'point to (bx,by)
		shl	ptrb,#3
		add	ptrb,ptra

		mov	c2,i4
		sub	c2,i3


.loop3		setq	#2-1			'read (bx,by)
		rdlong	bx,ptrb

		setq	by			'rotate (bx,by) by angle
		qrotate	bx,angle

		setq	#2-1			'read (ax,ay)
		rdlong	ax,ptra

		getqx	bx			'get rotated (bx,by)
		getqy	by

		add	ax,bx			'(ax,ay) = (ax+bx,ay+by)
		add	ay,by

		shl	bx,#1			'(bx,by) = (ax-bx,ay-by)
		subr	bx,ax
		shl	by,#1
		subr	by,ay

		setq	#2-1			'write (ax,ay)
		wrlong	ax,ptra++

		setq	#2-1			'write (bx,by)
		wrlong	bx,ptrb++

		djnz	c2,#.loop3


		add	i3,i1
		add	i3,i1
		add	i4,i1
		add	i4,i1

		djnz	c1,#.loop2


		shr	i1,#1
		shl	i2,#1

		tjnz	i1,#.loop1


' Convert 1024-point FFT results to 512 (power,angle) pairs

		wrfast	#0,powr_angl_ptr	'ready to store (power,angle) pairs, i1=0

.convert	mov	ptra,i1			'get bit-reverse index for (real,imaginary) pair
		rev	ptra
		shr	ptra,#32 - ((10)) - 3	'(-3 for two longs per pair)
		add	ptra,real_imag_ptr
		setq	#2-1			'read (real,imaginary)
		rdlong	ax,ptra

		qvector	ax,ay			'convert (real,imaginary) to (power,angle)

		getqx	ax			'store (power,angle)
		wflong	ax
		getqy	ay
		wflong	ay

		incmod	i1,#512-1		'loop until conversions done
	_ret_	tjnz	i1,#.convert
'
'
' Data
'
sample_ptr	long	sample_buff
real_imag_ptr	long	real_imag_buff
powr_angl_ptr	long	powr_angl_buff

c1		res	1
c2		res	1
i1		res	1
i2		res	1
i3		res	1
i4		res	1
shift		res	1
angle		res	1
ax		res	1	'(ax,ay) must be contiguous
ay		res	1
bx		res	1	'(bx,by) must be contiguous
by		res	1


end_of_pgm

Rayman · 2023-10-13 23:07

I'm wondering if this could be used for wake word voice recognition or maybe a few simple commands...

Looks to be pre-Spin2 though, might take some work to get it going in Spin2...

Also not sure why Chip had the ADC at 100X for a real mic. Pretty sure my SparkFun one only needs about 10X or so... Need that for sure for unamplified mic for sure...

Rayman · 2023-10-14 14:41

Just tested this out with SparkFun amplified mic: https://www.sparkfun.com/products/12758

100X gain on ADC is definitely way too much. Dialed that down to 1X (or maybe 10X with LDO board) like this:

pgm_mic         'wrpin   ##%100111_0000000_00_11000_0,#mic_pin   'set mic for 100x-mag and 14-bit SINC2 sampling
                wrpin   ##P_LOCAL_A|P_ADC_1X|P_ADC ,#mic_pin

I'm thinking there is definitely possibility of using this for voice recognition...

Took some pics, first with a board without LDOs. I think you can see the power supply ripple showing up as horizontal lines with this board...
IMG_1918: Saying "Yes" then "No" repeat. 1X gain. "Yes" is the skinny, tall one and "No" is short and wide.
IMG_1919: Wistling three notes repeatedly. 1X gain. This would be another way to interact, probably easier...

Next, switch to a board with LDOs on the mic pins.
IMG_1920: "Yes" and "No" with 10X gain
IMG_1921: "Yes" and No" with 1X gain

Rayman · 2023-10-14 15:16

Here's a slightly modernized version of the code. It doesn't start with an erased screen like the original though...
Next step is to un-hardcode the buffer addresses...

avsa242 · 2023-10-14 15:27

Very cool...I don't know how I ever missed this thread. I ported Heater's P1 spin-based FFT to the P2 several months back to try out things like audio and hopefully eventually RF - looks like @SaucySoliton had a similar idea with this code here. This is great though...so compact. One thing I couldn't figure out with Heater's code was how to change the sampling bandwidth (so for ex., to focus on smaller swath of spectrum) - I think I was only able to accomplish something like it by setting the number of points to something smaller. Not having looked at this in depth yet, is it reasonably easy to change parameters like that?
Good luck with the voice recognition btw, that'd be a great addition.

Christof Eb. · 2023-10-15 11:50

@Rayman said:
Just tested this out with SparkFun amplified mic: https://www.sparkfun.com/products/12758

100X gain on ADC is definitely way too much. Dialed that down to 1X (or maybe 10X with LDO board) like this:
pgm_mic         'wrpin   ##%100111_0000000_00_11000_0,#mic_pin   'set mic for 100x-mag and 14-bit SINC2 sampling
                wrpin   ##P_LOCAL_A|P_ADC_1X|P_ADC ,#mic_pin
I'm thinking there is definitely possibility of using this for voice recognition...

Took some pics, first with a board without LDOs. I think you can see the power supply ripple showing up as horizontal lines with this board...
IMG_1918: Saying "Yes" then "No" repeat. 1X gain. "Yes" is the skinny, tall one and "No" is short and wide.
IMG_1919: Wistling three notes repeatedly. 1X gain. This would be another way to interact, probably easier...

Next, switch to a board with LDOs on the mic pins.
IMG_1920: "Yes" and "No" with 10X gain
IMG_1921: "Yes" and No" with 1X gain

Hi,
For voice recognition after you have decided, which few frequencies are relevant, it will be probably more efficient to use the Goertzel algorithm instead of fft. Wasn't there a thread about it somewhere?

Ariba · 2023-10-15 22:27

@"Christof Eb." said:
...

Hi,
For voice recognition after you have decided, which few frequencies are relevant, it will be probably more efficient to use the Goertzel algorithm instead of fft. Wasn't there a thread about it somewhere?

https://forums.parallax.com/discussion/115725/goertzel-based-speech-recognizer-now-with-source-code/p1

Rayman · 2023-10-19 13:23

Starting to get a handle on this code...
Here's a version with VGA instead of HDMI

There's one thing that Chip added to the HDMI driver that I needed to copy into the VGA driver:
wrbyte #1,#0 'signal ideal time to update bitmap

Seems like would be better to use cogatn instead, but I guess this works.

Rayman · 2023-10-19 13:40

Right now, the buffers start at $1000, which kind of clobbers the spin2 interpreter. Kind of surprising it still works.
But, it's mostly all assembly, Spin2 just needs to start up to cogs...
Original was all assembly and so didn't have spin2 interpeter...

Rayman · 2023-10-19 17:16

Got the buffers change from being constants to addresses of variable arrays.
Should be able to make this into a subobject now.
Mic ADC gain is set to 100x for unamplified electret mic.

One thing I haven't figured out is why I can't insert instructions at the start of the Spectrograph Driver.
Adding even a nop there breaks it...

DAT  'Spectrograph Driver
'******************
'*  Spectrograph  *
'******************
org
               'nop  'RJA: Can't have anything here!  Why not?

pgm_spectro     call    #fft                    'do fft on current samples

evanh · 2023-10-19 21:33

Someone a while back did a test of the ADC performance and found that the noise floor of x100 gain is ten times noisier than the x10 gain. That said, I have no idea how scientific the testing actually was.

evanh · 2023-10-19 21:45

@Rayman said:
One thing I haven't figured out is why I can't insert instructions at the start of the Spectrograph Driver.
Adding even a nop there breaks it...

The COGINIT requires pgm_spectro to be at ORG 0.

Rayman · 2023-10-19 22:14

Right. Thanks @evanh
Obvious now…

Rayman · 2023-10-20 16:18

I've added this brief analysis of the situation to the top of the code here.
Only the lower half of the screen is going to be useful for voice recognition.
So, using the upper half of screen to plot the results of the last FFT operation.
Also, doing a very crude amplification of higher frequencies.

CON ''Notes:
        ''Mic Sample rate is dependent on P2 clock frequency
        ''Code is set for SINC2 sampling at 14 bits, 8192 clocks per sample. --> Sample frequency =  250_000_000/8192 =  30517.58 Hz
        ''FFT on 1024 samples -->  Max. update rate on all fresh samples =     30517.58/1024 =  29.8 Hz
        ''Video driver signals Spectroscope driver to refresh screen during vertical blanking (and then do another FFT and wait), so 60 Hz
        ''   --> Each FFT is on about half old, half new samples

        ''Frequency resolution (steps in frequency between FFT result) =  30517.58/1024 = 29.80 Hz
        ''FFT result is 512 real and 512 imaginary steps, so range is from 0 Hz to 29.80*511=   15228.99 Hz
        ''Wikipedia says telephony uses range from 300 to 3400 Hz.  0..4000 Hz would be lower 134 pixels in this display (about lower 1/4 of display)

SaucySoliton · 2024-01-14 20:39

Here's a first try at an optimized FFT library based on Chip's code. Usable in flexprop. I haven't tested propeller tool. QROTATE is huge boost for FFT. It does sin(), cos(), 4 multiplies and 2 adds in as little as 8 clocks, provided the pipeline can be kept full.

New optimizations:
1. FIFO can read some of the data in the background.
2. Some instructions placed in wait interval between hub writes.
3. SKIPF bypasses cordic instructions when angle is zero.
4. Unrolled loop packs the cordic pipeline with 4 points at a time.
5. Bitreverse reads in order with FIFO and writes out of order.

I've tried not to grow the length of code too much. The unrolled loop processes 4 samples at a time. I have not done any tests to determine if that is the optimal number. The inline ASM version does not use the unrolled loop. From Chip's work it sounds like 16 points would be good.

fft_bench v1.2.1 for PROPELLER
OpenMP not available on this system
Freq.    Magnitude  
       0      200        
      c0      1ff        
     140      1ff        
     200      200        
1024 point bit-reversal and butterfly run time = 2059 us  PASM
clock frequency = 160000000

1024 point butterfly run time = 1901 us PASM, with out of order outputs

1024 point butterfly run time = 3638 us inline ASM, with out of order outputs

Freq.    Magnitude 
       0      1fe        
      c0      1ff        
     140      1ff        
     200      1ff        
1024 point bit-reversal and butterfly run time = 13625 us  flexc
clock frequency = 160000000

The C code in fftbench has real and imaginary data stored in separate arrays. I don't know if any of the compilers could improve performance if they were packed together. It's not quite apples to apples; the PASM bit reverser is an out-of-place algorithm (more memory required). The C bit reverser is in-place. The benchmarks exclude the QVECTOR on the final output.

I started working with inline assembler. There were a lot of limitations there like can't use ptra, can't use fifo, skipf can't jump over instructions if in hubexec, that I moved my development to a dedicated cog for now.

evanh · 2024-01-14 21:42

It suits to make it like a coprocessor anyway. It's a compute intensive process that is fine implemented as a concurrent task. Now we just need 16 cores for multiple FFT channels.

avsa242 · 2024-01-14 21:43

This already seemed impressive at first, then I realized Chip was running his code at 250MHz and you're only running the above at 160. I tried it at 250 (default settings; didn't try any of the different variants) and got 1317usec. Nice

Cheers

1024-point FFT in 79 longs

Comments