I cleaned up the spectrograph program and the FFT inside. If you want to run it on your P2 Eval board, connect the HDMI board to P[55:48] and hook a microphone into the A/V board on P[7:0]. Or, just connect a microphone via a .1uF capacitor to P[5].
I made some enhancements that Rayman suggested, like adding a window function for the FFT input sample set. I made a nice cosine-shaped window filter which actually cleaned up the output quite a bit.
Here is the code, followed by a partial screen shot of me talking into it:
'*********************************************************************
'* Use microphone on A/V board to demonstrate spectrograph on HDMI *
'*********************************************************************
CON av_base = 0 'must be a multiple of 8
hdmi_base = 48 'must be a multiple of 8
mic_pin = av_base + 5
hdmi_pins = hdmi_base + 7<<6
sample_buff = $1000 'Microphone samples buffer (1024 words)
real_imag_buff = sample_buff + 1024*2 'FFT (real,imaginary) buffer (1024*2 longs)
powr_angl_buff = real_imag_buff + 1024*2*4 'FFT powers buffer (512*2 longs)
bitmap = powr_angl_buff + 512*2*4 'HDMI bitmap (300 KB)
DAT org
hubset ##%1_000001_0000011000_1111_10_00 'config PLL, 20MHz/2*25*1 = 250MHz
waitx ##20_000_000 / 200 'allow crystal+PLL 5ms to stabilize
hubset ##%1_000001_0000011000_1111_10_11 'switch to PLL
setq ##($7FFFF - @end_of_pgm)/4 'clear hub RAM
wrlong #0,##@end_of_pgm
coginit #2,#@pgm_hdmi 'launch HDMI
coginit #1,#@pgm_mic 'launch Microphone
coginit #0,#@pgm_spectro 'launch Spectrograph
'*********************************
'* HDMI 640 x 480 x 8bpp luma8 *
'*********************************
DAT org
pgm_hdmi setcmod #$100 'enable HDMI mode
drvl #hdmi_pins 'enable HDMI pins
wrpin ##%111001<<8,#hdmi_pins 'set 1.5k low drive on HDMI pins
setxfrq ##$0CCCCCCC+1 'set streamer freq to 1/10th clk (25 MHz)
rdfast ##640*480/64,##bitmap 'set rdfast to wrap on 300KB bitmap
' Field loop
.field mov .hsync0,.sync_000 'vsync off
mov .hsync1,.sync_001
callpa #10,#.blank 'top blanks
mov .i,#480 'set visible lines
.line call #.hsync 'do horizontal sync
xcont .m_rf,#%000 'do visible line, %xxx selects rgb color
djnz .i,#.line 'another line?
wrbyte #1,#0 'signal ideal time to update bitmap
callpa #33,#.blank 'bottom blanks
mov .hsync0,.sync_002 'vsync on
mov .hsync1,.sync_003
callpa #2,#.blank 'vertical sync blanks
jmp #.field 'loop
' Subroutines
.blank call #.hsync 'blank lines
xcont .m_vi,.hsync0
_ret_ djnz pa,#.blank
.hsync xcont .m_bs,.hsync0 'horizontal sync
xzero .m_sn,.hsync1
_ret_ xcont .m_bv,.hsync0
' Data
.sync_000 long %1101010100_1101010100_1101010100_10 '
.sync_001 long %1101010100_1101010100_0010101011_10 ' hsync
.sync_002 long %1101010100_1101010100_0101010100_10 'vsync
.sync_003 long %1101010100_1101010100_1010101011_10 'vsync + hsync
.m_bs long $70810000 + hdmi_base<<17 + 16 'before sync
.m_sn long $70810000 + hdmi_base<<17 + 96 'sync
.m_bv long $70810000 + hdmi_base<<17 + 48 'before visible
.m_vi long $70810000 + hdmi_base<<17 + 640 'visible
.m_rf long $B0820000 + hdmi_base<<17 + 640 'visible rfbyte luma8
.hsync0 res 1
.hsync1 res 1
.i res 1
'****************
'* Microphone *
'****************
DAT org
pgm_mic wrpin ##%100111_0000000_00_11000_0,#mic_pin 'set mic for 100x-mag and 14-bit SINC2 sampling
wxpin #13,#mic_pin
dirh #mic_pin
.wait testp #mic_pin wc 'wait for sample
if_nc jmp #.wait
rdpin .x,#mic_pin 'get sample and convert to signed word (acknowledges pin)
shl .x,#2
bitnot .x,#15
setq2 #512-1 'load sample buffer in hub into lut at single-sample offset
rdlong 0,##sample_buff+2
rdlut .y,.lastlut 'install sample in last word of buffer
setword .y,.x,#1
wrlut .y,.lastlut
setq2 #512-1 'write buffer back with new sample (scrolls buffer)
wrlong 0,##sample_buff
jmp #.wait 'next sample
' Data
.lastlut long $1FF
.x res 1
.y res 1
'******************
'* Spectrograph *
'******************
DAT org
pgm_spectro call #fft 'do fft on current samples
.wait rdbyte i1,#0 'wait for ideal time to update bitmap
tjz i1,#.wait 'comment-out this line to ignore refresh and go faster
wrbyte #0,#0
mov i1,#480 'ready to scroll screen and insert new pixel column
mov i2,##bitmap
.line add i2,#1 'read in pixel row from bitmap at +1 offset
setq2 #640/4-1
rdlong 0,i2
mov i3,i1 'get power sample
shl i3,#3
add i3,powr_angl_ptr
rdlong i3,i3
shr i3,#10 'scale power sample down and clamp it for luma8 pixel use
fle i3,#$FF
rdlut i4,#640/4-1 'install pixel at end of pixel row
setbyte i4,i3,#3
wrlut i4,#640/4-1
sub i2,#1 'write pixel row back to bitmap at +0 offset (scrolls pixels)
setq2 #640/4-1
wrlong 0,i2
add i2,##640 'point to next pixel row
djnz i1,#.line
jmp #pgm_spectro
'
'
' Fast Fourier Transform, 1024-point
'
' on entry: sample_buff must hold 1024 signed word samples
' on exit: powr_angl_buff holds 512 long (power,angle) pairs
'
' Load 1024 signed-word samples into 1024 signed-long (real,imaginary) coordinate pairs
fft mov i1,#0 'reset index
mov ptra,sample_ptr 'ready to read from sample_buff
wrfast #0,real_imag_ptr 'ready to write to real_imag_buff
.load mov i2,i1 'apply 1-minus-cosine window to samples
shl i2,#32-(encod 1024)
qrotate ##$3FFF,i2
rdword ax,ptra++ 'read sample while waiting for window value
getqx i2
subr i2,##$4000
muls ax,i2 'scale sample by window value
sar ax,#15
wflong ax 'store windowed sample as (real=sample,imaginary=0)
wflong #0
incmod i1,##1024-1 'loop until all samples stored as (real,imaginary) pairs
tjnz i1,#.load
' Perform 1024-point FFT
decod i1,#encod(1024 >> 1) 'init counters
mov i2,#1
.loop1 mov i3,#0
mov i4,i1
mov c1,i2
encod shift,i1
.loop2 mov angle,i3
shr angle,shift
rev angle
mov ptra,i3 'point to (ax,ay)
shl ptra,#3
add ptra,real_imag_ptr
mov ptrb,i1 'point to (bx,by)
shl ptrb,#3
add ptrb,ptra
mov c2,i4
sub c2,i3
.loop3 setq #2-1 'read (bx,by)
rdlong bx,ptrb
setq by 'rotate (bx,by) by angle
qrotate bx,angle
setq #2-1 'read (ax,ay)
rdlong ax,ptra
getqx bx 'get rotated (bx,by)
getqy by
add ax,bx '(ax,ay) = (ax+bx,ay+by)
add ay,by
shl bx,#1 '(bx,by) = (ax-bx,ay-by)
subr bx,ax
shl by,#1
subr by,ay
setq #2-1 'write (ax,ay)
wrlong ax,ptra++
setq #2-1 'write (bx,by)
wrlong bx,ptrb++
djnz c2,#.loop3
add i3,i1
add i3,i1
add i4,i1
add i4,i1
djnz c1,#.loop2
shr i1,#1
shl i2,#1
tjnz i1,#.loop1
' Convert 1024-point FFT results to 512 (power,angle) pairs
wrfast #0,powr_angl_ptr 'ready to store (power,angle) pairs, i1=0
.convert mov ptra,i1 'get bit-reverse index for (real,imaginary) pair
rev ptra
shr ptra,#32 - (encod 1024) - 3 '(-3 for two longs per pair)
add ptra,real_imag_ptr
setq #2-1 'read (real,imaginary)
rdlong ax,ptra
qvector ax,ay 'convert (real,imaginary) to (power,angle)
getqx ax 'store (power,angle)
wflong ax
getqy ay
wflong ay
incmod i1,#512-1 'loop until conversions done
_ret_ tjnz i1,#.convert
'
'
' Data
'
sample_ptr long sample_buff
real_imag_ptr long real_imag_buff
powr_angl_ptr long powr_angl_buff
c1 res 1
c2 res 1
i1 res 1
i2 res 1
i3 res 1
i4 res 1
shift res 1
angle res 1
ax res 1 '(ax,ay) must be contiguous
ay res 1
bx res 1 '(bx,by) must be contiguous
by res 1
end_of_pgm
Here is a file containing the code and the updated PNut.exe you will need to compile it with:
I've been messing around with FFT stuff all day and I realized that when you are dealing with small contiguous operations, they all lie next to each other in memory, making it easy to load and save contiguous stretches of hub RAM, while performing the smaller math on the entire loaded memory section. I'm going to work on this a little more tomorrow and be done with it for a while. Currently, this code takes 3ms to do the entire FFT procedure. Tomorrow, I should be able to get it working in under 1ms.
Looking at mp3 again... noticed that they take samples from past frame and next frame into ends of current frame.
I think that makes sure you don’t miss anything when the window is applied...
For this, I wouldn’t do it.
But for voice recognition, might want to.
I don't think I've ever seen a spectrum analyser image looking that detailed before. The flashy MP3 ones always appeared useless so it hasn't much interested me.
My FFT skillz are rusty, but isn't the smallest loop (1x512) only doing rotations by 90-degrees (or was it 180?). If that's the case it should be easy to special case that pass. If I am way off, then sorry for the distraction!
I think it'd also be nice to just show x-y plot of magnitude vs. frequency...
You probably already realize this, but that spectrograph program shows frequency (0Hz..15KHz) on the vertical, time rolling left on the horizontal (60 pixel columns per second), and magnitude as pixel intensity. Seeing the FFT output change over time gives a whole extra dimension to what FFT's are usually used for.
I don't think I've ever seen a spectrum analyser image looking that detailed before. The flashy MP3 ones always appeared useless so it hasn't much interested me.
It's going to be really interesting when we can modify DDS signals within feedback loops and watch them like this. It'll be a whole new frontier of possibilities.
My FFT skillz are rusty, but isn't the smallest loop (1x512) only doing rotations by 90-degrees (or was it 180?). If that's the case it should be easy to special case that pass. If I am way off, then sorry for the distraction!
Jonathan
I just looked at the angles and I'm surprised to see that they only range from 0 to just under 180 degrees. Makes sense, though, as the compounding of rotations can create many effective angles greater than 180 degrees.
So, the first 512-span iteration doesn't even rotate anything, and just does the adds/subs. The next 256-span iteration only rotates by 90 degrees in its last half. In every inner loop, the rotation can be skipped when the angle is zero.
I think I'm going to go back to work on Spin2 now and not worry about optimizing this FFT. That can be done at a later time. What we have is very simple and understandable and not too slow. I think only a 3x speed-up is possible. If 10x was possible, I'd keep working on it, but for now, we've got something pretty good and very compact.
I was really pleased to realize that SETQ+QROTATE takes care of all the hard math in the FFT. The rest is just adds and subtracts. I keep wondering, though, if there's another level of optimization possible, aside from stuffing the CORDIC and unrolling the inner loop, somewhat.
Finally tried this out. Very cool! Here is a version that uses scope mode and therefore can support data rates in the megasamples. It's set at 3.676MSPS right now as that is what is needed to show the entire AM radio band on the screen. It really looks like we could receive AM radio by just connecting a wire to a P2 pin.
I added some syncronization between the acquire and processing. The streamer would otherwise write so fast that samples would change before the an entire 1024 samples is read by the window filter code. It's my first time using cogatn. It was easy. The code also has changes to compile with fastspin.
'*********************************************************************
'* Scope mode spectrograph on HDMI *
'*********************************************************************
CON av_base = 0 'must be a multiple of 8
hdmi_base = 48 'must be a multiple of 8
mic_pin = av_base + 0 ' 5 for mic, chose 0/H for scope mode ADC
hdmi_pins = hdmi_base + 7<<6
sample_buff = $1000 'Microphone samples buffer (1024 words)
real_imag_buff = sample_buff + 1024*2 'FFT (real,imaginary) buffer (1024*2 longs)
powr_angl_buff = real_imag_buff + 1024*2*4 'FFT powers buffer (512*2 longs)
bitmap = powr_angl_buff + 512*2*4 'HDMI bitmap (300 KB)
DAT org
hubset ##%1_000001_0000011000_1111_10_00 'config PLL, 20MHz/2*25*1 = 250MHz
waitx ##20_000_000 / 200 'allow crystal+PLL 5ms to stabilize
hubset ##%1_000001_0000011000_1111_10_11 'switch to PLL
setq ##($7FFFF - @end_of_pgm)/4 'clear hub RAM
wrlong #0,##@end_of_pgm
coginit #2,#@pgm_hdmi 'launch HDMI
coginit #1,#@pgm_scope 'launch Microphone
coginit #0,#@pgm_spectro 'launch Spectrograph
'*********************************
'* HDMI 640 x 480 x 8bpp luma8 *
'*********************************
DAT org
pgm_hdmi setcmod #$100 'enable HDMI mode
drvl #hdmi_pins 'enable HDMI pins
wrpin ##%111001<<8,#hdmi_pins 'set 1.5k low drive on HDMI pins
setxfrq ##$0CCCCCCC+1 'set streamer freq to 1/10th clk (25 MHz)
rdfast ##640*480/64,##bitmap 'set rdfast to wrap on 300KB bitmap
' Field loop
.field mov .hsync0,.sync_000 'vsync off
mov .hsync1,.sync_001
callpa #10,#.blank 'top blanks
mov .i,#480 'set visible lines
.line call #.hsync 'do horizontal sync
xcont .m_rf,#%000 'do visible line, %xxx selects rgb color
djnz .i,#.line 'another line?
wrbyte #1,#0 'signal ideal time to update bitmap
callpa #33,#.blank 'bottom blanks
mov .hsync0,.sync_002 'vsync on
mov .hsync1,.sync_003
callpa #2,#.blank 'vertical sync blanks
jmp #.field 'loop
' Subroutines
.blank call #.hsync 'blank lines
xcont .m_vi,.hsync0
_ret_ djnz pa,#.blank
.hsync xcont .m_bs,.hsync0 'horizontal sync
xzero .m_sn,.hsync1
_ret_ xcont .m_bv,.hsync0
' Data
.sync_000 long %1101010100_1101010100_1101010100_10 '
.sync_001 long %1101010100_1101010100_0010101011_10 ' hsync
.sync_002 long %1101010100_1101010100_0101010100_10 'vsync
.sync_003 long %1101010100_1101010100_1010101011_10 'vsync + hsync
.m_bs long $70810000 + hdmi_base<<17 + 16 'before sync
.m_sn long $70810000 + hdmi_base<<17 + 96 'sync
.m_bv long $70810000 + hdmi_base<<17 + 48 'before visible
.m_vi long $70810000 + hdmi_base<<17 + 640 'visible
.m_rf long $B0820000 + hdmi_base<<17 + 640 'visible rfbyte luma8
.hsync0 res 1
.hsync1 res 1
.i res 1
'****************
'* Microphone *
'****************
DAT org
pgm_mic wrpin ##%100111_0000000_00_11000_0,#mic_pin 'set mic for 100x-mag and 14-bit SINC2 sampling
wxpin #13,#mic_pin
dirh #mic_pin
.wait testp #mic_pin wc 'wait for sample
if_nc jmp #.wait
rdpin .x,#mic_pin 'get sample and convert to signed word (acknowledges pin)
shl .x,#2
bitnot .x,#15
setq2 #512-1 'load sample buffer in hub into lut at single-sample offset
rdlong 0,##sample_buff+2
rdlut .y,.lastlut 'install sample in last word of buffer
setword .y,.x,#1
wrlut .y,.lastlut
setq2 #512-1 'write buffer back with new sample (scrolls buffer)
wrlong 0,##sample_buff
jmp #.wait 'next sample
' Data
.lastlut long $1FF
.x res 1
.y res 1
'****************
'* Scope ADC *
'****************
CON
sp_m_sinc = 24<<1
sp_m_sinc_ext = 25<<1
sp_m_scope = %11010<<1
sp_adc_1x = %100_011<<15
sp_adc_3x = %100_100<<15
sp_adc_10x = %100_101<<15
sp_adc_31x = %100_110<<15
sp_adc_100x = %100_111<<15
sp_adc_gio = %100_000<<15
sp_adc_vio = %100_001<<15
sp_adc_flt = %100_010<<15
sp_dac_900 = %101_00<<16
sp_dac_600 = %101_01<<16
sp_dac_123 = %101_10<<16
sp_dac_75 = %101_11<<16
st_dac_none = 0<<24
st_dac_X0_0123 = 1<<24
st_dac_X0_01 = 2<<24
st_dac_X0_23 = 3<<24
st_dac_X0_0 = 4<<24
st_dac_X0_1 = 5<<24
st_dac_X0_2 = 6<<24
st_dac_X0_3 = 7<<24
st_dac_X0_0d12d3 = 8<<24
st_dac_X0_0d1 = 9<<24
st_dac_X0_2d3 = 10<<24
st_dac_X0X1_0123 = 11<<24
st_dac_X0X1_01 = 12<<24
st_dac_X0X1_23 = 13<<24
st_dac_X0X1_0d12d3 = 14<<24
st_dac_X0X1X2X3_0123 = 15<<24
st_dac_all = 15<<24
st_scope_1adc = %1111_0000_0000_0010<<16
st_scope_1adc_8pin = %1111_0000_0000_0011<<16
st_scope_2adc = %1111_0000_0000_0100<<16
st_scope_2adc_16pin = %1111_0000_0000_0101<<16
st_scope_4adc = %1111_0000_0000_0110<<16
st_scope_wfast = 1<<23
DAT org
pgm_scope wrpin ##sp_adc_10x | sp_m_scope ,#mic_pin '
wxpin #1,#mic_pin ' select filer taps
dirh #mic_pin
setscp #(1<<6)|mic_pin
wrfast #2*1024/64,##sample_buff
setxfrq ##1<<31/(68) ' 250/68 = 3.676MSPS
.repeat
waitatn
xcont ##st_scope_2adc | st_scope_wfast | 1024 ,#mic_pin
'waitx ##250000000/60
jmp #.repeat 'next sample
' Data
.lastlut long $1FF
.x res 1
.y res 1
'******************
'* Spectrograph *
'******************
DAT org
pgm_spectro call #fft 'do fft on current samples
.wait rdbyte i1,#0 'wait for ideal time to update bitmap
tjz i1,#.wait 'comment-out this line to ignore refresh and go faster
wrbyte #0,#0
mov i1,#480 'ready to scroll screen and insert new pixel column
mov i2,##bitmap
.line add i2,#1 'read in pixel row from bitmap at +1 offset
setq2 #640/4-1
rdlong 0,i2
mov i3,i1 'get power sample
shl i3,#3
add i3,powr_angl_ptr
rdlong i3,i3
shr i3,#10 'scale power sample down and clamp it for luma8 pixel use
fle i3,#$FF
rdlut i4,#640/4-1 'install pixel at end of pixel row
setbyte i4,i3,#3
wrlut i4,#640/4-1
sub i2,#1 'write pixel row back to bitmap at +0 offset (scrolls pixels)
setq2 #640/4-1
wrlong 0,i2
add i2,##640 'point to next pixel row
djnz i1,#.line
jmp #pgm_spectro
'
'
' Fast Fourier Transform, 1024-point
'
' on entry: sample_buff must hold 1024 signed word samples
' on exit: powr_angl_buff holds 512 long (power,angle) pairs
'
' Load 1024 signed-word samples into 1024 signed-long (real,imaginary) coordinate pairs
fft mov i1,#0 'reset index
mov ptra,sample_ptr 'ready to read from sample_buff
wrfast #0,real_imag_ptr 'ready to write to real_imag_buff
.load mov i2,i1 'apply 1-minus-cosine window to samples
shl i2,#32-((10))
qrotate ##$3FFF,i2
rdword ax,ptra++ 'read sample while waiting for window value
getbyte ax,ax,#mic_pin&1 ' choose channel
shl ax,#8 ' byte to word
sub ax,##128<<8 ' unsigned to signed
getqx i2
subr i2,##$4000
muls ax,i2 'scale sample by window value
sar ax,#15
wflong ax 'store windowed sample as (real=sample,imaginary=0)
wflong #0
incmod i1,##1024-1 'loop until all samples stored as (real,imaginary) pairs
tjnz i1,#.load
cogatn #1<<1 ' scope cog
' Perform 1024-point FFT
decod i1,#((10))-1 'init counters
mov i2,#1
.loop1 mov i3,#0
mov i4,i1
mov c1,i2
encod shift,i1
.loop2 mov angle,i3
shr angle,shift
rev angle
mov ptra,i3 'point to (ax,ay)
shl ptra,#3
add ptra,real_imag_ptr
mov ptrb,i1 'point to (bx,by)
shl ptrb,#3
add ptrb,ptra
mov c2,i4
sub c2,i3
.loop3 setq #2-1 'read (bx,by)
rdlong bx,ptrb
setq by 'rotate (bx,by) by angle
qrotate bx,angle
setq #2-1 'read (ax,ay)
rdlong ax,ptra
getqx bx 'get rotated (bx,by)
getqy by
add ax,bx '(ax,ay) = (ax+bx,ay+by)
add ay,by
shl bx,#1 '(bx,by) = (ax-bx,ay-by)
subr bx,ax
shl by,#1
subr by,ay
setq #2-1 'write (ax,ay)
wrlong ax,ptra++
setq #2-1 'write (bx,by)
wrlong bx,ptrb++
djnz c2,#.loop3
add i3,i1
add i3,i1
add i4,i1
add i4,i1
djnz c1,#.loop2
shr i1,#1
shl i2,#1
tjnz i1,#.loop1
' Convert 1024-point FFT results to 512 (power,angle) pairs
wrfast #0,powr_angl_ptr 'ready to store (power,angle) pairs, i1=0
.convert mov ptra,i1 'get bit-reverse index for (real,imaginary) pair
rev ptra
shr ptra,#32 - ((10)) - 3 '(-3 for two longs per pair)
add ptra,real_imag_ptr
setq #2-1 'read (real,imaginary)
rdlong ax,ptra
qvector ax,ay 'convert (real,imaginary) to (power,angle)
getqx ax 'store (power,angle)
wflong ax
getqy ay
wflong ay
incmod i1,#512-1 'loop until conversions done
_ret_ tjnz i1,#.convert
'
'
' Data
'
sample_ptr long sample_buff
real_imag_ptr long real_imag_buff
powr_angl_ptr long powr_angl_buff
c1 res 1
c2 res 1
i1 res 1
i2 res 1
i3 res 1
i4 res 1
shift res 1
angle res 1
ax res 1 '(ax,ay) must be contiguous
ay res 1
bx res 1 '(bx,by) must be contiguous
by res 1
end_of_pgm
I'm wondering if this could be used for wake word voice recognition or maybe a few simple commands...
Looks to be pre-Spin2 though, might take some work to get it going in Spin2...
Also not sure why Chip had the ADC at 100X for a real mic. Pretty sure my SparkFun one only needs about 10X or so... Need that for sure for unamplified mic for sure...
100X gain on ADC is definitely way too much. Dialed that down to 1X (or maybe 10X with LDO board) like this:
pgm_mic 'wrpin ##%100111_0000000_00_11000_0,#mic_pin 'set mic for 100x-mag and 14-bit SINC2 sampling
wrpin ##P_LOCAL_A|P_ADC_1X|P_ADC ,#mic_pin
I'm thinking there is definitely possibility of using this for voice recognition...
Took some pics, first with a board without LDOs. I think you can see the power supply ripple showing up as horizontal lines with this board...
IMG_1918: Saying "Yes" then "No" repeat. 1X gain. "Yes" is the skinny, tall one and "No" is short and wide.
IMG_1919: Wistling three notes repeatedly. 1X gain. This would be another way to interact, probably easier...
Next, switch to a board with LDOs on the mic pins.
IMG_1920: "Yes" and "No" with 10X gain
IMG_1921: "Yes" and No" with 1X gain
Here's a slightly modernized version of the code. It doesn't start with an erased screen like the original though...
Next step is to un-hardcode the buffer addresses...
Very cool...I don't know how I ever missed this thread. I ported Heater's P1 spin-based FFT to the P2 several months back to try out things like audio and hopefully eventually RF - looks like @SaucySoliton had a similar idea with this code here. This is great though...so compact. One thing I couldn't figure out with Heater's code was how to change the sampling bandwidth (so for ex., to focus on smaller swath of spectrum) - I think I was only able to accomplish something like it by setting the number of points to something smaller. Not having looked at this in depth yet, is it reasonably easy to change parameters like that?
Good luck with the voice recognition btw, that'd be a great addition.
100X gain on ADC is definitely way too much. Dialed that down to 1X (or maybe 10X with LDO board) like this:
pgm_mic 'wrpin ##%100111_0000000_00_11000_0,#mic_pin 'set mic for 100x-mag and 14-bit SINC2 sampling
wrpin ##P_LOCAL_A|P_ADC_1X|P_ADC ,#mic_pin
I'm thinking there is definitely possibility of using this for voice recognition...
Took some pics, first with a board without LDOs. I think you can see the power supply ripple showing up as horizontal lines with this board...
IMG_1918: Saying "Yes" then "No" repeat. 1X gain. "Yes" is the skinny, tall one and "No" is short and wide.
IMG_1919: Wistling three notes repeatedly. 1X gain. This would be another way to interact, probably easier...
Next, switch to a board with LDOs on the mic pins.
IMG_1920: "Yes" and "No" with 10X gain
IMG_1921: "Yes" and No" with 1X gain
Hi,
For voice recognition after you have decided, which few frequencies are relevant, it will be probably more efficient to use the Goertzel algorithm instead of fft. Wasn't there a thread about it somewhere?
Hi,
For voice recognition after you have decided, which few frequencies are relevant, it will be probably more efficient to use the Goertzel algorithm instead of fft. Wasn't there a thread about it somewhere?
Right now, the buffers start at $1000, which kind of clobbers the spin2 interpreter. Kind of surprising it still works.
But, it's mostly all assembly, Spin2 just needs to start up to cogs...
Original was all assembly and so didn't have spin2 interpeter...
Got the buffers change from being constants to addresses of variable arrays.
Should be able to make this into a subobject now.
Mic ADC gain is set to 100x for unamplified electret mic.
One thing I haven't figured out is why I can't insert instructions at the start of the Spectrograph Driver.
Adding even a nop there breaks it...
DAT 'Spectrograph Driver
'******************
'* Spectrograph *
'******************
org
'nop 'RJA: Can't have anything here! Why not?
pgm_spectro call #fft 'do fft on current samples
Someone a while back did a test of the ADC performance and found that the noise floor of x100 gain is ten times noisier than the x10 gain. That said, I have no idea how scientific the testing actually was.
@Rayman said:
One thing I haven't figured out is why I can't insert instructions at the start of the Spectrograph Driver.
Adding even a nop there breaks it...
I've added this brief analysis of the situation to the top of the code here.
Only the lower half of the screen is going to be useful for voice recognition.
So, using the upper half of screen to plot the results of the last FFT operation.
Also, doing a very crude amplification of higher frequencies.
CON ''Notes:
''Mic Sample rate is dependent on P2 clock frequency
''Code is set for SINC2 sampling at 14 bits, 8192 clocks per sample. --> Sample frequency = 250_000_000/8192 = 30517.58 Hz
''FFT on 1024 samples --> Max. update rate on all fresh samples = 30517.58/1024 = 29.8 Hz
''Video driver signals Spectroscope driver to refresh screen during vertical blanking (and then do another FFT and wait), so 60 Hz
'' --> Each FFT is on about half old, half new samples
''Frequency resolution (steps in frequency between FFT result) = 30517.58/1024 = 29.80 Hz
''FFT result is 512 real and 512 imaginary steps, so range is from 0 Hz to 29.80*511= 15228.99 Hz
''Wikipedia says telephony uses range from 300 to 3400 Hz. 0..4000 Hz would be lower 134 pixels in this display (about lower 1/4 of display)
Here's a first try at an optimized FFT library based on Chip's code. Usable in flexprop. I haven't tested propeller tool. QROTATE is huge boost for FFT. It does sin(), cos(), 4 multiplies and 2 adds in as little as 8 clocks, provided the pipeline can be kept full.
New optimizations:
1. FIFO can read some of the data in the background.
2. Some instructions placed in wait interval between hub writes.
3. SKIPF bypasses cordic instructions when angle is zero.
4. Unrolled loop packs the cordic pipeline with 4 points at a time.
5. Bitreverse reads in order with FIFO and writes out of order.
I've tried not to grow the length of code too much. The unrolled loop processes 4 samples at a time. I have not done any tests to determine if that is the optimal number. The inline ASM version does not use the unrolled loop. From Chip's work it sounds like 16 points would be good.
fft_bench v1.2.1 for PROPELLER
OpenMP not available on this system
Freq. Magnitude
0 200
c0 1ff
140 1ff
200 200
1024 point bit-reversal and butterfly run time = 2059 us PASM
clock frequency = 160000000
1024 point butterfly run time = 1901 us PASM, with out of order outputs
1024 point butterfly run time = 3638 us inline ASM, with out of order outputs
Freq. Magnitude
0 1fe
c0 1ff
140 1ff
200 1ff
1024 point bit-reversal and butterfly run time = 13625 us flexc
clock frequency = 160000000
The C code in fftbench has real and imaginary data stored in separate arrays. I don't know if any of the compilers could improve performance if they were packed together. It's not quite apples to apples; the PASM bit reverser is an out-of-place algorithm (more memory required). The C bit reverser is in-place. The benchmarks exclude the QVECTOR on the final output.
I started working with inline assembler. There were a lot of limitations there like can't use ptra, can't use fifo, skipf can't jump over instructions if in hubexec, that I moved my development to a dedicated cog for now.
It suits to make it like a coprocessor anyway. It's a compute intensive process that is fine implemented as a concurrent task. Now we just need 16 cores for multiple FFT channels.
This already seemed impressive at first, then I realized Chip was running his code at 250MHz and you're only running the above at 160. I tried it at 250 (default settings; didn't try any of the different variants) and got 1317usec. Nice
Comments
Aha. Makes sense now
I made some enhancements that Rayman suggested, like adding a window function for the FFT input sample set. I made a nice cosine-shaped window filter which actually cleaned up the output quite a bit.
Here is the code, followed by a partial screen shot of me talking into it:
Here is a file containing the code and the updated PNut.exe you will need to compile it with:
https://drive.google.com/file/d/1vWiyXDIwwLDsx0Fsyc7Mdqqu4jC87jdc/view?usp=sharing
I think that makes sure you don’t miss anything when the window is applied...
For this, I wouldn’t do it.
But for voice recognition, might want to.
Jonathan
You probably already realize this, but that spectrograph program shows frequency (0Hz..15KHz) on the vertical, time rolling left on the horizontal (60 pixel columns per second), and magnitude as pixel intensity. Seeing the FFT output change over time gives a whole extra dimension to what FFT's are usually used for.
It's going to be really interesting when we can modify DDS signals within feedback loops and watch them like this. It'll be a whole new frontier of possibilities.
I just looked at the angles and I'm surprised to see that they only range from 0 to just under 180 degrees. Makes sense, though, as the compounding of rotations can create many effective angles greater than 180 degrees.
Here is what I'm seeing:
So, the first 512-span iteration doesn't even rotate anything, and just does the adds/subs. The next 256-span iteration only rotates by 90 degrees in its last half. In every inner loop, the rotation can be skipped when the angle is zero.
I was really pleased to realize that SETQ+QROTATE takes care of all the hard math in the FFT. The rest is just adds and subtracts. I keep wondering, though, if there's another level of optimization possible, aside from stuffing the CORDIC and unrolling the inner loop, somewhat.
Mike
http://forums.parallax.com/discussion/170982/heater-has-6-3-vac#latest
Finally tried this out. Very cool! Here is a version that uses scope mode and therefore can support data rates in the megasamples. It's set at 3.676MSPS right now as that is what is needed to show the entire AM radio band on the screen. It really looks like we could receive AM radio by just connecting a wire to a P2 pin.
I added some syncronization between the acquire and processing. The streamer would otherwise write so fast that samples would change before the an entire 1024 samples is read by the window filter code. It's my first time using cogatn. It was easy. The code also has changes to compile with fastspin.
I'm wondering if this could be used for wake word voice recognition or maybe a few simple commands...
Looks to be pre-Spin2 though, might take some work to get it going in Spin2...
Also not sure why Chip had the ADC at 100X for a real mic. Pretty sure my SparkFun one only needs about 10X or so... Need that for sure for unamplified mic for sure...
Just tested this out with SparkFun amplified mic: https://www.sparkfun.com/products/12758
100X gain on ADC is definitely way too much. Dialed that down to 1X (or maybe 10X with LDO board) like this:
I'm thinking there is definitely possibility of using this for voice recognition...
Took some pics, first with a board without LDOs. I think you can see the power supply ripple showing up as horizontal lines with this board...
IMG_1918: Saying "Yes" then "No" repeat. 1X gain. "Yes" is the skinny, tall one and "No" is short and wide.
IMG_1919: Wistling three notes repeatedly. 1X gain. This would be another way to interact, probably easier...
Next, switch to a board with LDOs on the mic pins.
IMG_1920: "Yes" and "No" with 10X gain
IMG_1921: "Yes" and No" with 1X gain
Here's a slightly modernized version of the code. It doesn't start with an erased screen like the original though...
Next step is to un-hardcode the buffer addresses...
Very cool...I don't know how I ever missed this thread. I ported Heater's P1 spin-based FFT to the P2 several months back to try out things like audio and hopefully eventually RF - looks like @SaucySoliton had a similar idea with this code here. This is great though...so compact. One thing I couldn't figure out with Heater's code was how to change the sampling bandwidth (so for ex., to focus on smaller swath of spectrum) - I think I was only able to accomplish something like it by setting the number of points to something smaller. Not having looked at this in depth yet, is it reasonably easy to change parameters like that?
Good luck with the voice recognition btw, that'd be a great addition.
Hi,
For voice recognition after you have decided, which few frequencies are relevant, it will be probably more efficient to use the Goertzel algorithm instead of fft. Wasn't there a thread about it somewhere?
https://forums.parallax.com/discussion/115725/goertzel-based-speech-recognizer-now-with-source-code/p1
Starting to get a handle on this code...
Here's a version with VGA instead of HDMI
There's one thing that Chip added to the HDMI driver that I needed to copy into the VGA driver:
wrbyte #1,#0 'signal ideal time to update bitmap
Seems like would be better to use cogatn instead, but I guess this works.
Right now, the buffers start at $1000, which kind of clobbers the spin2 interpreter. Kind of surprising it still works.
But, it's mostly all assembly, Spin2 just needs to start up to cogs...
Original was all assembly and so didn't have spin2 interpeter...
Got the buffers change from being constants to addresses of variable arrays.
Should be able to make this into a subobject now.
Mic ADC gain is set to 100x for unamplified electret mic.
One thing I haven't figured out is why I can't insert instructions at the start of the Spectrograph Driver.
Adding even a nop there breaks it...
Someone a while back did a test of the ADC performance and found that the noise floor of x100 gain is ten times noisier than the x10 gain. That said, I have no idea how scientific the testing actually was.
The COGINIT requires
pgm_spectro
to be at ORG 0.Right. Thanks @evanh
Obvious now…
I've added this brief analysis of the situation to the top of the code here.
Only the lower half of the screen is going to be useful for voice recognition.
So, using the upper half of screen to plot the results of the last FFT operation.
Also, doing a very crude amplification of higher frequencies.
Here's a first try at an optimized FFT library based on Chip's code. Usable in flexprop. I haven't tested propeller tool. QROTATE is huge boost for FFT. It does sin(), cos(), 4 multiplies and 2 adds in as little as 8 clocks, provided the pipeline can be kept full.
New optimizations:
1. FIFO can read some of the data in the background.
2. Some instructions placed in wait interval between hub writes.
3. SKIPF bypasses cordic instructions when angle is zero.
4. Unrolled loop packs the cordic pipeline with 4 points at a time.
5. Bitreverse reads in order with FIFO and writes out of order.
I've tried not to grow the length of code too much. The unrolled loop processes 4 samples at a time. I have not done any tests to determine if that is the optimal number. The inline ASM version does not use the unrolled loop. From Chip's work it sounds like 16 points would be good.
The C code in fftbench has real and imaginary data stored in separate arrays. I don't know if any of the compilers could improve performance if they were packed together. It's not quite apples to apples; the PASM bit reverser is an out-of-place algorithm (more memory required). The C bit reverser is in-place. The benchmarks exclude the QVECTOR on the final output.
I started working with inline assembler. There were a lot of limitations there like can't use ptra, can't use fifo, skipf can't jump over instructions if in hubexec, that I moved my development to a dedicated cog for now.
It suits to make it like a coprocessor anyway. It's a compute intensive process that is fine implemented as a concurrent task. Now we just need 16 cores for multiple FFT channels.
This already seemed impressive at first, then I realized Chip was running his code at 250MHz and you're only running the above at 160. I tried it at 250 (default settings; didn't try any of the different variants) and got 1317usec. Nice
Cheers