[EDITED] fast SPI out, 1 bit per instruction

lonesock · 2009-06-16 18:39

Hi, All.

EDITED: GEAR timing was off by a tiny bit...used a scope, updated the counter PHSx initialization values!!

You may remember this thread here:
http://forums.parallax.com/showthread.php?p=811943
I was trying to use both counters to get SPI data at one bit per instruction. kuroneko pointed me to a similar thread of his where he used almost exactly the same technique, and of course did it months before I did [8^)
http://forums.parallax.com/showthread.php?p=784536

Well, I wanted to do the same thing for SPI output, so here is the test framework. It looks a bit goofy in GEAR (the clock pin trace is off by 1 prop-clock relative to the data pin trace, not sure why), but the scope looks nice and clean.

{{
  Jonathan "lonesock" Dummer
  Testing a fast SPI clock out routine

  Use both counters in NCO single-ended mode, where the output
  pin is equal to PHSx's high bit.  Use Counter B to drive the
  clock pin, and Counter A to drive the data line.  B actually
  changes the pin automatically, while you update the Data pin
  using a series of SHL's on PHSA (we set FRQA to 0, so no up-
  dates are happening automatically).

  This is for an SPI interface where the data is latched in on
  the rising edge of the Clock line, so you want your Data pin
  to be stable before the clock pin goes high.  You might have
  to adjust the "movi phsb,#%xxx000000" line to initialize the
  PHSB into the right state for your SPI definition.
}}
CON
  ' Clock source: the slow internal RC oscillator is selected here; swap the
  ' comment marker onto the other line to use the fast RC oscillator instead.
  _clkmode      = RCSlow
  '_clkmode     = RCFast

  ' I/O pin assignments used by the SPI test
  pinClock      = 24            ' clock line       (driven by Counter B)
  pinDataOut    = 25            ' data line        (driven by Counter A)
  pinChipSelect = 26            ' chip-select line (driven through OUTA)

PUB start_test
'' Launch the PASM SPI test framework in a new cog (PAR = 0), then park
'' this Spin cog in an empty loop forever.
  cognew(@fast_SPI_out_test_entry, 0)
  repeat
  
DAT
ORG 0

' One-time setup for the SPI output test. Configures Counter A (data pin) and
' Counter B (clock pin) for NCO single-ended output with FRQx = PHSx = 0, makes
' the data, clock and chip-select pins outputs, and raises chip select.
' Execution then falls through into fast_SPI_out_test.
' Uses: t (scratch). Leaves maskCS = 1 << pinChipSelect for later use.
fast_SPI_out_test_entry
        ' set up Counter A to be the data counter
        mov frqa,#0             ' unnecessary
        mov phsa,#0             ' unnecessary
        mov ctra,#pinDataOut    ' set the data pin
        movi ctra,#%0_00100_000 ' set the mode to NCO, single output pin
        ' set up Counter B to be the clock counter
        mov frqb,#0             ' unnecessary
        mov phsb,#0             ' unnecessary
        mov ctrb,#pinClock      ' set the clock pin
        movi ctrb,#%0_00100_000 ' set the mode to NCO, single output pin
        '     set up my 3 pins as outputs
        mov t,#1                ' temp = 1
        shl t,#pinDataOut       ' temp = 1 << pinDataOut
        mov dira,t              ' DIRA now has the DataOut pin as an output
        mov t,#1                ' temp = 1
        shl t,#pinClock         ' temp = 1 << pinClock
        or dira,t               ' DIRA now has both DataOut and Clock as outputs
        mov maskCS,#1           ' ditto for the ChipSelect pin, but keep the mask for later
        shl maskCS,#pinChipSelect
        or dira,maskCS          ' DIRA now has all 3 pins set to outputs
        mov outa,maskCS         ' set the Chip Select pin high (usually active low)

' Main test loop. Each pass sends the test byte twice - once with the unrolled
' 1-instruction-per-bit sequence and once with the looped 2-instructions-per-bit
' sequence - framing each transfer with chip select low, then pauses and repeats.
' The data pin follows PHSA[31]; the clock pin follows PHSB[31] while FRQB is
' non-zero. The instruction order between starting and stopping the clock is
' timing-critical: one data bit is presented per SHL (unrolled) or per
' SHL+DJNZ pair (looped).
fast_SPI_out_test
        ' what is my data byte?
        mov data,#%10101010     ' randomly selected by myself

        ' here is the super fast unrolled version
        '{
        mov phsa,data           ' start with the raw data byte
        shl phsa,#24            ' get the MSb into position 31
        'rev phsa,#0            ' do this instead of the above line for LSb first
        andn outa,maskCS        ' CS goes low, signifying a start
        movi phsb,#%000000000   ' set up my clock register (initial phase of the clock)
        movi frqb,#%010000000   ' start my clock line ticking!
        shl phsa,#1             ' move next bit into the PHSA[31] slot (bit 2 of 8; bit 1 was primed above)
        shl phsa,#1             ' move next bit into the PHSA[31] slot
        shl phsa,#1             ' move next bit into the PHSA[31] slot
        shl phsa,#1             ' move next bit into the PHSA[31] slot
        shl phsa,#1             ' move next bit into the PHSA[31] slot
        shl phsa,#1             ' move next bit into the PHSA[31] slot
        shl phsa,#1             ' move next bit into the PHSA[31] slot (bit 8 of 8)
        mov frqb,#0             ' stop my clock
        or outa,maskCS          ' CS goes high again
        '}

        ' here is the 2x slower looped version
        '{
        '' NOTE: The 1st one will be primed, so the number
        '' of remaining bits = total-1.  For this 8-bit
        '' test, I have 7 bits remaining to be shifted out.
        mov t,#7                ' number of bits left
        mov phsa,data           ' start with the raw data byte
        shl phsa,#24            ' get the MSb into position 31
        'rev phsa,#0            ' do this instead of the above line for LSb first
        andn outa,maskCS        ' CS goes low, signifying a start
        movi phsb,#%011000000   ' set up my clock register (initial phase for the half-rate clock)
        movi frqb,#%001000000   ' start my clock line ticking! (half the FRQB value of the unrolled version)
:bit_shift_loop
        shl phsa,#1             ' move next bit into the PHSA[31] slot
        djnz t,#:bit_shift_loop ' keep going till we run out of bits
        mov frqb,#0             ' stop my clock
        or outa,maskCS          ' CS goes high again
        '}

        ' wait and repeat
        mov t,#1                ' build the delay: t = 1 ...
        shl t,#9                ' ... << 9 = 512 clocks
        add t,cnt               ' add in the current time
        waitcnt t,#511          ' wait for a little while (511 is the largest value that fits a 9-bit immediate)
        jmp #fast_SPI_out_test  ' start our test over again

' Uninitialized cog registers used by the test code above
data    res                     ' the byte value that gets shifted out
t       res                     ' scratch: pin-mask temp, loop bit counter, waitcnt target
maskCS  res                     ' 1 << pinChipSelect, built once during setup

FIT 496

edit: changed the code to work with what the scope says!!

▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔
lonesock
Piranha are people too.

Post Edited (lonesock) : 6/17/2009 11:51:57 PM GMT

Phil Pilgrim (PhiPi) · 2009-06-16 19:05

'Very clever use of ctra to map to a pin! I think I would use a ror, though, and rev the data for MSB first. That way you can avoid the shl #24.

-Phil

lonesock · 2009-06-16 19:43

Phil Pilgrim (PhiPi) said...
'Very clever use of ctra to map to a pin! I think I would use a ror, though, and rev the data for MSB first. That way you can avoid the shl #24.

-Phil

Thanks! The data I need to send out is in fact MSb first, so either way I need an extra instruction to set it up [8^)

edit: latest code is in the top post

Note that this can be used to send more than 8 bits, up to 32, obviously, but you just lose that many words of Cog RAM as the loop must be unrolled. Alternatively, if memory was more important than speed, you could use a loop at 1/2 the data rate, but almost no code size...you'd just have to play with the starting value for phsb, and frqb would be 1/2 the current value.

Theoretically, for an 80MHz clock, you can now drive SD cards at the specified max data rate (20MHz clock) for both input and output. If you overclock your prop, you might go too fast! I doubt that is a problem, as the SD spec was artificially limited, and I don't think any companies would go out of their way to corrupt data if it goes a tiny bit faster than spec. On the other hand, going with a cheap SD card could spell trouble via simply poor construction.

The next challenge would be to read and write SPI data concurrently. I'm sure you could do it with 2 instructions per bit, but the supreme awesomeness would be one bit (each way) per instruction. I'm dubious, but hey, food for thought [8^)

▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔
lonesock
Piranha are people too.

Post Edited (lonesock) : 6/17/2009 11:52:39 PM GMT

Phil Pilgrim (PhiPi) · 2009-06-16 20:04

For fast bidirectional transfers, you could let the video circuitry handle the output, while your program did the input. Despite what the manual implies, you can also use mode %00010 or %00011 for the ctra video clock source to get the shift clock on a pin. Or you could let the video circuitry produce the shift clock and the serial output data stream.

-Phil

lonesock · 2009-06-16 20:19

Phil Pilgrim (PhiPi) said...
For fast bidirectional transfers, you could let the video circuitry handle the output, while your program did the input. Despite what the manual implies, you can also use mode %00010 or %00011 for the ctra video clock source to get the shift clock on a pin. Or you could let the video circuitry produce the shift clock and the serial output data stream.

-Phil

Great idea! I had not thought of using the video circuitry for output, nor did I realize you could use a PLL mode with a pin output, thanks! I'll look into this as soon as I need to use a part with bidirectional SPI. (I do want to make sure I don't spend more time doing the bit-twiddling to mux both the data and clock into the video register than it would take to just shift it out the regular way.)

▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔
lonesock
Piranha are people too.

Cluso99 · 2009-06-17 05:57

Great project. Yes, we will want the fastest permissable SD card access.

We can dedicate a whole cog for it in our emulations.

▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔
Links to other interesting threads:

· Home of the MultiBladeProps: TriBladeProp, SixBladeProp, website (Multiple propeller pcbs)
· Single Board Computer:·3 Propeller ICs·and a·TriBladeProp board (ZiCog Z80 Emulator)
· Prop Tools under Development or Completed (Index)
· Emulators: Micros eg Altair, and Terminals eg VT100 (Index)
· Search the Propeller forums (via Google)
My cruising website is: ·www.bluemagic.biz·· MultiBladeProp is: www.bluemagic.biz/cluso.htm

lonesock · 2009-06-17 06:46

Cluso99 said...
Great project. Yes, we will want the fastest permissable SD card access.

Thanks! I already ran into a speed limit: an LCD driver chip I'm using has a 20 MHz clock rate max. My prop is running at 100MHz (using a 6.25MHz clock line in from an ENC28J60, which in turn divides its 25MHz crystal by 4). So running the full speed SPI out I over-ran the max allowable data rate. To that end, I made a looped version of the same that runs at 1/2 the speed, and is very scalable (e.g. to a full 32-bits). Here's the relevant portion:

EDITED: put the new code in the sample up top

At the risk of derailing my own thread, if I made a fast FAT32-only & MMC/SD/SDHC object, would it:
A) be of use to anyone besides myself?

B) duplicate the effort of anyone else? (I know many people have mentioned FAT32 support already, but I have no status updates)
C) be a problem if there was no FAT16 or FAT12 support?
D) annoy people with limitations like "only one file open for writing at a time"?

I have some ideas for optimization, and even a cool name: FlashFAT32! (and a "Lion Tamer" hat!)

Anyway...bedtime for me...I appreciate any feedback, which I will collect in the morning [8^)

Jonathan

▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔
lonesock
Piranha are people too.

Post Edited (lonesock) : 6/17/2009 11:53:20 PM GMT

rokicki · 2009-06-17 06:49

Hmm, my big project was going to be an updated fsrw that's faster, uses cog memory for
read-ahead and write-behind, and a bunch of other stuff, but if you're considering fat32
and sdhc, I wasn't planning on that. I'd hate to waste my effort if you're going to be
leapfrogging that anyway.

I was also going to have a nifty DMA mode so you could get real speed even when using
only Spin.

Sapieha · 2009-06-17 06:57

Hi lonesock.

Yours code will be very useful for Ramtrons 2MBit FRAMS.
If You have that driver.
I use that chips

▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔
Nothing is impossible, there are only different degrees of difficulty.
For every stupid question there is at least one intelligent answer.
Don't guess - ask instead.
If you don't ask you won't know.
If your gonna construct something, make it·as simple as·possible yet as versatile as posible.

Sapieha

kuroneko · 2009-06-17 07:03

lonesock said...
C) be a problem if there was no FAT16 or FAT12 support?

There isn't really any need for limiting yourself to a specific format. You can design the FS in a generic way, the only difference being the cluster r/w code (which is plugged in during the mount stage).

Cluso99 · 2009-06-17 08:20

It would be great to have both FAT16 and FAT32 support. FAT16 is standard for 2GB and below while FAT32 for 4GB and above. FAT32 with only 8+3 filenames.

On the TriBlade and RamBlade, DMA is not possible, nor is read ahead or write behind. This is because it shares pins with the ram.

▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔
Links to other interesting threads:

· Home of the MultiBladeProps: TriBladeProp, SixBladeProp, website (Multiple propeller pcbs)
· Single Board Computer:·3 Propeller ICs·and a·TriBladeProp board (ZiCog Z80 Emulator)
· Prop Tools under Development or Completed (Index)
· Emulators: Micros eg Altair, and Terminals eg VT100 (Index)
· Search the Propeller forums (via Google)
My cruising website is: ·www.bluemagic.biz·· MultiBladeProp is: www.bluemagic.biz/cluso.htm

simonl · 2009-06-17 09:20

@rokicki: I'm sure many of us would be delighted to see faster SD access, using cog memory for
read-ahead and write-behind, and DMA too, so PLEASE do continue

@lonesock: likewise; I expect loads of people would "rip yer arm off" to get FAT32 / SD / SDHC support on the Prop' - can we have it yesterday please?! (BTW: Your SPI work looks great).

▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔
Cheers,
Simon

www.norfolkhelicopterclub.com

“Before you criticize someone, you should walk a mile in their shoes. That way when you criticize them, you are a mile away from them and you have their shoes.” - Jack Handey.

Kye · 2009-06-17 14:27

Fat 32 will be impossible in spin because of how math is done. If you want to support FAT32 it WILL BE A DIFFICULT project. If you try to do everything in assembly in 512 longs then it might be possible.

However, then the feature set wouldn't be good...

Also, you can't just pump the spi clock to max with sd cards. They have a CSD register which tells you their maximum limit, and all SD cards can have a different value. Most however should be able to take 5 Mhz.

Plus you need high speed input for the SD card also, not high speed output... Maybe input would be possible with the counters also.

Good luck, if you want to try. Sorry for raining on the happy parade.

▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔
Nyamekye,

Kye · 2009-06-17 14:27

Wait, actually, since FAT32 uses 28-bit addressing it may be possible in spin...

▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔
Nyamekye,

rokicki · 2009-06-17 16:19

It would be really nice to work together, actually, and pool resources rather than duplicating code.

I have looked some more at fat32, and I agree, it would be straightforward to modify fsrw to do fat32
(the cluster r/w is part of it, but another part is the extendable root directory).

At first blush it might be easy to divide work into the block layer and the file system layer (as it
currently is divided). In other words, we can use the 1-bit-per-instruction ideas in a really simple
spisasm routine that plugs into the existing fsrw to make it much faster (and this layer can
implement read-ahead and write-behind), but I was going to move the FAT manipulation into the
cog too, so it may not be so quick and dried.

Lonesock, email me at my username at gmail.com (same username as what I use here) and
let's discuss if we want to work together on this.

lonesock · 2009-06-17 18:32

Hi, All.

Thanks for your responses! Rokicki and I will collaborate on the next rev of fsrw. There will be some more news when we have it (and look for an upcoming forum poll from rokicki).

(note: the next release will be ready on or before the Duke Nukem Forever ship-date [8^)

thanks,
Jonathan

▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔
lonesock
Piranha are people too.

lonesock · 2009-06-18 00:09

Hi, again, All.

Well, I finally plugged a scope in and looked at my waveforms: GEAR != HW+Scope

I changed my unrolled SPI code so that the clock pin transitions to high are dead center of each data bit on the scope. I then looked at the traces in GEAR, and the trace for the clock pin was advanced by one, relative to the data pin. The GEAR output is attached (remember, it looks perfect on the scope..."Don't Panic")

The updated code is in the 1st post, both the embedded code block and the attached file. The code shows the fast unrolled "1 instruction per bit" way, and it also has the looped "2 instructions per bit" way.

In the looped mode, the clock transition to high does not land exactly in the middle of the data bit, but is in fact one clock late. I.e. each data bit is 8 propeller-clocks wide, and the clock pin's transition to high occurs on relative clock 5. I could not figure out a way to get that transition exactly in the middle of the data bit without an extra clock-pin-transition-to-high slipping in.

Anyway, "share and enjoy"
Jonathan

▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔
lonesock
Piranha are people too.

kuroneko · 2009-06-18 00:48

lonesock said...
In the looped mode, the clock transition to high does not land exactly in the middle of the data bit, but is in fact one clock late. I.e. each data bit is 8 propeller-clocks wide, and the clock pin's transition to high occurs on relative clock 5. I could not figure out a way to get that transition exactly in the middle of the data bit without an extra clock-pin-transition-to-high slipping in.

FWIW, the way your code currently works is that the clock transitions happen during S (Sder, frqx is written in R but active during the next S). Let's call that 4n. The bit shifts happen during R, i.e. 4n+3. Which explains the off center transition. I'd suggest a minor instruction re-ordering.

.       movi frqb,#%001000000   ' start my clock line ticking! (now done BEFORE phsb is loaded)
        movi phsb,#%100000000   ' set up my clock register (forces the clock phase as this write lands)
:bit_shift_loop
        shl phsa,#1             ' move next bit into the PHSA[31] slot

Now ctrb starts incrementing (assuming phsb is initially 0) and reaches 4 during R of movi phsb, .... At the same time however we force phsb to 4 (so no harm done here, i.e. we could place a nop there if we could guarantee phsb being 0 when we set frqb). Anyway, the clock transitions are now locked to 4n+3 as well which is what we wanted.

lonesock · 2009-06-18 01:19

@kuroneko: Thanks for the explanation about the clock transitions. The code you shared does place the clock-line transitions exactly in the middle of the data bit. Unfortunately, my major problem is not getting into the loop, but getting out of it. The final djnz falls through instead of jumping, which takes 8 clocks instead of 4. By the time my "mov frqb,#0" line executes, the clock counter has already caused the clock pin to transition high again, yielding an extra (unwanted) bit out.

Maybe I could try something like:
* compute the exact time I should shut off the clock counter
* do something like:

' Proposed (untested) loop exit: loop on the data itself instead of a bit counter,
' then use a precomputed CNT value to decide when to stop the clock.
:bit_shift_loop
        shl phsa,#1 wz              ' shift next bit into PHSA[31]; Z is set once PHSA has become 0
if_nz jmp #:bit_shift_loop          ' keep shifting while any bits remain in PHSA
        waitcnt end_time_stamp,#0   ' hold until the precomputed stop time (end_time_stamp must be set up beforehand)
        mov frqb,#0                 ' stop my clock

This would let me perform both the jump and the fall through in only 4 clocks. The waitcnt would make sure that the clock pin transitioned the requisite number of times, even if I ran out of data bits early. I think getting the ending cnt value would be tricky, but possible. I also run the risk (even if this works) of losing in setup instructions what I gained by going to the looped version in the 1st place.

Of course the alternative is to just let my clock pin transition on clock 5 of every 8-clock-wide data bit, instead of on clock 4 [8^). And since this is going at 1/2 speed, I wouldn't think that the precise timing is that important, but I guess that is device specific.

Any feedback on the "waitcnt" idea, or the usefulness thereof?

▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔
lonesock
Piranha are people too.

kuroneko · 2009-06-18 02:42

Sorry about the messed up exit condition, I was only concentrating on the lock. Those things always come back and bite you :)

lonesock said...
Of course the alternative is to just let my clock pin transition on clock 5 of every 8-clock-wide data bit, instead of on clock 4 [8^). And since this is going at 1/2 speed, I wouldn't think that the precise timing is that important, but I guess that is device specific.

Any feedback on the "waitcnt" idea, or the usefulness thereof?

I don't think the waitcnt idea is going to fly. Actually, I'm sure. Imagine the last bit sent is 1, this means we have one more round-trip for the jmp instruction in order to clear phsa (bit 9 if you like), then we have a nop (jump not taken) and that's already too late (clock transition during nop.R). Even without the extra loop cycle you'd only have 8+3 cycles left to stop the clock (4 of which are consumed by the jump not taken, 6 by the waitcnt ...).

So I'd suggest you ignore that slightly off center transition and just use it. The data hold time should be long enough.

rokicki · 2009-06-18 03:11

Yeah, off-center isn't a problem. Still plenty of setup and hold time.

Cluso99 · 2009-06-18 03:27

If you run out of cog space you could split and load some bits as overlays or run an LMM style code (zero footprint) for the less important parts. I have done both if you need help.

▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔
Links to other interesting threads:

· Home of the MultiBladeProps: TriBladeProp, SixBladeProp, website (Multiple propeller pcbs)
· Single Board Computer:·3 Propeller ICs·and a·TriBladeProp board (ZiCog Z80 Emulator)
· Prop Tools under Development or Completed (Index)
· Emulators: Micros eg Altair, and Terminals eg VT100 (Index)
· Search the Propeller forums (via Google)
My cruising website is: ·www.bluemagic.biz·· MultiBladeProp is: www.bluemagic.biz/cluso.htm

lonesock · 2009-06-27 14:23

Cluso99 said...
If you run out of cog space you could split and load some bits as overlays or run an LMM style code (zero footprint) for the less important parts. I have done both if you need help.

Hi, Cluso, sorry I didn't respond earlier. I have a prototype working, and have plenty of Cog RAM left (though the plan is to keep the PASM under 240 longs, leaving room for 2x 512-byte buffers in the Cog). However, if I need to add any more functionality I'll be resting right up against the limit. I think I understand how to do overlays (doesn't mean I can implement them), and I get the concept of LMM code, but how do you do a zero-footprint LMM?!

▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔
lonesock
Piranha are people too.

pgbpsu · 2012-10-01 09:00

I'm trying to use Jonathan's very clever use of the counters to get ~20Mbit/sec output to an SPI line. I've got his code working, however, since it is write only, I'm using a much slower version of an SPI routine to read/write data. However, I can't get the two functions to work together. I suspect it's got something to do with the use of counters.

I've got 2 routines, one fast pasm spi write only (using lonesock's counter idea), and one pasm read-write. I can call either one repeatedly, however once I call the read/write version the write only (counter version) no longer clocks out data. Can anyone explain what's going on that prevents me from using the r/w version (non-counter) and then using the w_fast(counter) version? I know for sure that I can get into the fast (counter) version again because the XOR statements there, if uncommented, show up on the scope. But I just don't get any data. It seems like my setup of the counters is wrong, but I don't know why.

Thanks,
Peter

CON
  ' System clock: external crystal with the PLL set to 16x
  _CLKMODE    = XTAL1 + PLL16X
  _XINFREQ    = 5_000_000

  ' I/O pin numbers
  TEST_PT     = 5
  WIFI_MISO   = 8
  WIFI_MOSI   = 9
  WIFI_CLK    = 11

  ' Debug serial port: port index and baud rate handed to uarts.AddPort
  DEBUG       = 0
  DEBUG_BAUD  = 38400

  ' Size, in bytes, of the tBuf transfer buffer
  MAX_PAYLOAD = 200



   
CON ' Command codes written to the wifiCommand mailbox (0 means idle / done)

  CMDSetupPins = 1              ' configure the I/O pins and counters
  CMDR_W_BYTE  = 2              ' bit-banged byte-wise read/write
  CMD_W_BYTE   = 3              ' bit-banged byte-wise write only
  CMD_W_FAST   = 4              ' counter-driven fast write only


VAR
  byte tBuf[MAX_PAYLOAD]                        ' transfer buffer handed to the PASM cog via bufferAddress

  ' Mailbox shared with the PASM cog. MAIN passes @wifiCommand as PAR, and the
  ' PASM code reads the command at PAR+0, the byte count at PAR+4 and the buffer
  ' address at PAR+8 - so these three must stay consecutive longs in this order.
  long wifiCommand, bytesForSPI, bufferAddress
  
OBJ
  uarts    : "FullDuplexSerial4portPlus_0v3"       '1 COG for 3 serial ports

PUB MAIN | serialCogId, pasmCogId, rxByte
'' Interactive test harness for the PASM SPI routines.
''
'' Starts the debug serial port and the PASM SPI cog, issues the pin-setup
'' command, then loops forever: waits for "1", "2" or "3" on the debug port
'' and runs the matching transfer (bit-banged read/write, bit-banged write,
'' counter-driven fast write) over a 4-byte test pattern.
''
'' Commands are passed through the wifiCommand / bytesForSPI / bufferAddress
'' mailbox; the PASM cog writes wifiCommand back to 0 when a command is done.

  uarts.Init
  uarts.AddPort(DEBUG,31,30,-1,-1,0,0,DEBUG_BAUD) 'Add DEBUG port
  uarts.Start
  serialCogId    := uarts.getCogID                'Start the ports
  PAUSE_MS(2000)

  LoadTestPattern
  bufferAddress := @tbuf

  pasmCogId := cognew(@SPI_ASM, @wifiCommand) + 1 ' cog ID + 1, or 0 when no cog was available

  ifnot pasmCogId
    ' Without the PASM cog nothing would ever clear wifiCommand, so the
    ' mailbox waits below would hang forever; report the failure and stop.
    uarts.str(DEBUG,string(13,"Unable to start the PASM SPI cog."))
    return

  uarts.str(DEBUG,string(13,"PASM SPI code launched in cog: "))
  uarts.dec(DEBUG,pasmCogId - 1)                  ' undo the +1 so the real cog number is shown

  uarts.str(DEBUG,string(13,"Setting up PASM based SPI routines and I/O."))
  wifiCommand := CMDSetupPins
  repeat while wifiCommand                  ' Wait for it to complete

  repeat
    uarts.str(DEBUG,string(13,13, "Press 1,2, or 3 to send data."))
    repeat
      rxByte := uarts.rxcheck(DEBUG)
    until rxByte == "1" or rxByte == "2" or rxByte == "3"

    ' Reload the pattern on every pass: the read/write routine stores the
    ' received bytes back into the same buffer slots.
    LoadTestPattern

    ' Fill in the transfer parameters first; the command is written last
    ' because a non-zero command is what starts the PASM cog working.
    bufferAddress := @tbuf
    bytesForSPI := 4

    case rxByte
      "1" : uarts.str(DEBUG,string(13, "Using r/w byte"))
          wifiCommand := CMDR_W_BYTE
      "2" : uarts.str(DEBUG,string(13, "Using w byte"))
          wifiCommand := CMD_W_BYTE
      "3" : uarts.str(DEBUG,string(13, "Using wf byte"))
          wifiCommand := CMD_W_FAST
    repeat while wifiCommand                  ' Wait for it to complete

PRI LoadTestPattern
'' Clear tBuf, then store the 4-byte test pattern $00,$01,$02,$03 at its start.

  bytefill(@tbuf, 0, MAX_PAYLOAD)
  tbuf[0]:=$00
  tbuf[1]:=$01
  tbuf[2]:=$02
  tbuf[3]:=$03

PUB PAUSE_MS(mS)
'' Block the calling cog for approximately mS milliseconds.
''
'' The tick offset is clamped to a minimum of 381 clocks, the smallest offset
'' the Propeller Manual recommends for Spin's waitcnt(offset + cnt) form.
'' Without the clamp, mS = 0 (or a negative value) yields a target that has
'' already passed by the time waitcnt runs, so the cog would stall until the
'' 32-bit system counter wraps all the way around.
  waitcnt(((clkfreq / 1000 * mS) #> 381) + cnt)

DAT
        org   0  ' start at beginning
' Entry point of the PASM SPI cog. PAR holds the hub address of the mailbox:
' command at PAR+0, byte count at PAR+4, buffer address at PAR+8. The cog polls
' the command long, runs the matching routine, then writes the command long
' back to zero to signal completion.
SPI_ASM

        mov       commandPtr,     par             ' hub address of the command long (PAR+0)
        mov       byteCountPtr,   par             ' hub address of the byte count ...
        mov       bufferPtr,      par             ' hub address of the buffer-address long ...
        add       byteCountPtr,   #4              ' ... = PAR+4
        add       bufferPtr,      #8              ' ... = PAR+8

'_________________________________________Wait for command_________________________________________
WaitForCommand
          rdlong  command,      commandPtr      wz  ' check for a command; Z=1 if read value is 0
   if_z   jmp     #WaitForCommand                   ' command is zero; read again

'_________________________________________Command list_____________________________________________
' Each entry compares the command code, conditionally calls its routine, then
' conditionally jumps to CommandDone. The second if_z relies on the called
' routine returning with Z still set.
          cmp     command,      #CMDR_W_BYTE    wz  ' Z=1 if Value1 == Value2
   if_z   call    #RW_BYTE                          ' If Z=1 run the read/write byte routine
   if_z   jmp     #CommandDone                      ' After returning jump to CommandDone

          cmp     command,      #CMD_W_BYTE    wz   ' Z=1 if Value1 == Value2
   if_z   call    #W_BYTE                           ' If Z=1 run the write-only byte routine
   if_z   jmp     #CommandDone                      ' After returning jump to CommandDone

          cmp     command,      #CMD_W_FAST    wz   ' Z=1 if Value1 == Value2
   if_z   call    #W_FAST                           ' If Z=1 run the counter-driven fast write routine
   if_z   jmp     #CommandDone                      ' After returning jump to CommandDone

          cmp     command,      #CMDSetupPins   wz  ' Z=1 if Value1 == Value2
   if_z   call    #SETUP_PINS                       ' If Z=1 run the pin/counter setup routine
   if_z   jmp     #CommandDone                      ' After returning jump to CommandDone


   if_z   jmp     #CommandDone                      ' redundant: execution falls into CommandDone either way

' Reached after a recognized command completes and also by fall-through when
' the command code matched none of the entries above.
CommandDone
          wrlong  zero,          commandPtr          ' Set command=0 signaling we're done (zero is defined elsewhere; presumably a long holding 0)
          jmp     #WaitForCommand
                     
{******************************** RW_BYTE ******************************************************
Bit-banged, full-duplex SPI transfer, MSb first.

Reads the byte count from the hub long at byteCountPtr and the buffer address from the hub long
at bufferPtr. For each byte: shifts the buffer byte out on MOSI while shifting a byte in from
MISO, then writes the received byte back into the same buffer slot.

Z=1 when we enter.  Z=1 when finished.  The clock is driven with muxnz/muxz, so the routine
depends on Z=1 on entry; no instruction inside writes Z.

On return the clock bit in OUTA is left set and the MOSI bit in OUTA holds the last bit sent.
NOTE(review): a byte count of 0 is not guarded; djnz would decrement past zero.
***************************************************************************************************}
RW_BYTE
          rdlong  numBytes,     byteCountPtr    ' how many bytes are we supposed to write?
          rdlong  hubAddress,   bufferPtr       ' read the address of the buffer; currently found in HUB mailbox
:byteLoop
          mov     inData,       #0              ' init before filling
          mov     numBits,      #8              ' init number of bits to read/write
          rdbyte  outData,      hubAddress      ' get local copy of data to write
          shl     outData,      #24             ' shift lowest byte into MSB; bit[7]->bit[31]

:bitLoop 
          muxnz   outa,         clkMask         ' lower clock (Z=1, so NZ is false and the bit is cleared)
          shl     outData,      #1        wc    ' set C = outData[31] then shift
          muxc    outa,         mosiMask        ' put value of C onto MOSI
          test    misoMask,     ina       wc    ' C=ina[miso] (assuming misoMask has a single bit set)
          rcl     inData,       #1              ' shift C left into inData
          muxz    outa,         clkMask         ' raise clock (Z=1, so the bit is set)
          djnz    numBits,      #:bitLoop       ' next bit until all 8 are done

          wrbyte  inData,       hubAddress      ' write the data back to the buffer
          add     hubAddress,   #1              ' set pointer to next byte in buffer
          djnz    numBytes,     #:byteLoop      ' Decrement loop count.
RW_BYTE_ret       ret

{******************************** W_BYTE ******************************************************
Bit-banged, write-only SPI transfer, MSb first.

Reads the byte count from the hub long at byteCountPtr and the buffer address from the hub long
at bufferPtr, then shifts each buffer byte out on MOSI.  This is a stripped down version of the
RW_BYTE code and does NOT capture a response; the buffer is left unchanged.

Z=1 when we enter.  Z=1 when finished.  The clock is driven with muxnz/muxz, so the routine
depends on Z=1 on entry; no instruction inside writes Z.

On return the clock bit in OUTA is left set and the MOSI bit in OUTA holds the last bit sent.
NOTE(review): a byte count of 0 is not guarded; djnz would decrement past zero.
***************************************************************************************************}
W_BYTE
          rdlong  numBytes,     byteCountPtr    ' how many bytes are we supposed to write?
          rdlong  hubAddress,   bufferPtr            ' read the address of the buffer; currently found in HUB mailbox
:byteLoop
          mov     numBits,      #8              ' init number of bits to write
          rdbyte  outData,      hubAddress      ' get local copy of data to write
          shl     outData,      #24             ' shift lowest byte into MSB; bit[7]->bit[31]

:bitLoop 
          muxnz   outa,         clkMask         ' lower clock (Z=1, so NZ is false and the bit is cleared)
          shl     outData,      #1        wc    ' set C = outData[31] then shift
          muxc    outa,         mosiMask        ' put value of C onto MOSI
          muxz    outa,         clkMask         ' raise clock (Z=1, so the bit is set)
          djnz    numBits,      #:bitLoop       ' next bit until all 8 are done

          add     hubAddress,   #1              ' set pointer to next byte in buffer
          djnz    numBytes,     #:byteLoop      ' Decrement loop count.
W_BYTE_ret        ret

{******************************** W_FAST ******************************************************
Counter-driven, write-only SPI transfer, MSb first, one data bit per instruction.

Reads the byte count from the hub long at byteCountPtr and the buffer address from the hub long
at bufferPtr, then sends each buffer byte.  Counter A drives MOSI from PHSA[31]; Counter B
drives the clock from PHSB[31] while FRQB is non-zero.  Does NOT capture a response.

Taken from lonesock's forum post:
http://forums.parallax.com/showthread.php?113722-EDITED-fast-SPI-out-1-bit-per-instruction

A pin's output is the OR of this cog's OUTA bit and its counter outputs, so:
  * on entry the clock and MOSI bits in OUTA are cleared - the bit-banged routines leave the
    clock bit set (and possibly the MOSI bit), which would hold the pins high and hide
    everything the counters do;
  * on exit PHSA and PHSB are cleared so neither counter is left holding its pin high, which
    would in turn block the bit-banged routines.

A byte count of 0 returns immediately without sending anything.

Z=1 when we enter.  Z=1 when finished (no instruction in here writes Z).
***************************************************************************************************}
W_FAST
          rdlong  numBytes,     byteCountPtr    ' how many bytes are we supposed to write?
          rdlong  hubAddress,   bufferPtr       ' read the address of the buffer; currently found in HUB mailbox
          tjz     numBytes,     #W_FAST_ret     ' nothing to send; also stops djnz from wrapping past zero
          andn    outa,         clkMask         ' let Counter B alone decide the clock pin state
          andn    outa,         mosiMask        ' let Counter A alone decide the MOSI pin state
:byteLoop
          rdbyte  outData,      hubAddress      ' get local copy of data to write
'         mov     outData,      #%10101010      ' debug only: force a fixed test pattern (the '#' is required for an immediate)
          mov     phsa,         outData         ' start with the raw data byte
          shl     phsa,         #24             ' get the MSb into position 31
          'rev     phsa,         #0             ' do this instead of the above line for LSb first
          ' --- timing-critical from here to "stop my clock": do not insert or reorder ---
          movi    phsb,         #%000000000     ' set up my clock register
          movi    frqb,         #%010000000     ' start my clock line ticking!
'xor   outa,   testMask
          shl     phsa,         #1              ' move next bit into the PHSA[31] slot
          shl     phsa,         #1              ' move next bit into the PHSA[31] slot
          shl     phsa,         #1              ' move next bit into the PHSA[31] slot
'xor   outa,   testMask
          shl     phsa,         #1              ' move next bit into the PHSA[31] slot
          shl     phsa,         #1              ' move next bit into the PHSA[31] slot
          shl     phsa,         #1              ' move next bit into the PHSA[31] slot
'xor   outa,   testMask
          shl     phsa,         #1              ' move next bit into the PHSA[31] slot
          mov     frqb,         #0              ' stop my clock

          add     hubAddress,   #1              ' set pointer to next byte in buffer
          djnz    numBytes,     #:byteLoop      ' Decrement loop count.

          mov     phsa,         #0              ' park MOSI low so OUTA can drive it again
          mov     phsb,         #0              ' park the clock low so OUTA can drive it again
W_FAST_ret        ret

{******************************** SETUP_PINS *********************************************************
Use the temp register and constants from up above to setup the I/O lines for the Red Pine.
Z=1 when we enter.  Z=1 when finished.
***************************************************************************************************}
SETUP_PINS
        ' Counter A drives the data line: NCO single-ended, output pin = WIFI_MOSI
          mov     ctra,         #WIFI_MOSI          ' pin number into the low bits (rest of CTRA cleared)
          movi    ctra,         #%0_00100_000       ' top nine bits select NCO, single output pin
        ' Counter B drives the clock line: NCO single-ended, output pin = WIFI_CLK
          mov     ctrb,         #WIFI_CLK           ' pin number into the low bits (rest of CTRB cleared)
          movi    ctrb,         #%0_00100_000       ' top nine bits select NCO, single output pin

        ' Make MOSI, CLK and the test point outputs.  The first write is a plain mov, so
        ' every other direction bit of this cog is cleared.  Each mask is staged through
        ' temp, which therefore ends up holding the TEST_PT mask.
          mov     temp,         mosiMask            ' 1 << WIFI_MOSI
          mov     dira,         temp
          mov     temp,         clkMask             ' 1 << WIFI_CLK
          or      dira,         temp
          mov     temp,         testMask            ' 1 << TEST_PT
          or      dira,         temp

SETUP_PINS_ret    ret

{******************************** Variables ********************************************************
PASM variables below
***************************************************************************************************}
' --- initialised cog registers ---
command       long      0               ' not referenced by the routines in this section -- presumably the current command; TODO confirm
numBytes      long      0               ' bytes still to transfer; loaded via rdlong from byteCountPtr, counted down with djnz
bufferPtr     long      0               ' hub address of the long that holds the buffer address (W_BYTE/W_FAST rdlong through it)
hubAddress    long      0               ' hub address of the byte currently being transferred; incremented per byte

zero          long      0               ' Zero=0
numBits       long      0               ' bits left in the current byte (bit-banged routines set it to 8)

' Single-bit pin masks; |< is bitwise decode, i.e. 1 << pin number.
misoMask      long      |< WIFI_MISO
mosiMask      long      |< WIFI_MOSI
clkMask       long      |< WIFI_CLK
testMask      long      |< TEST_PT

' --- uninitialised cog registers (res reserves space only, no initial value) ---
temp          res       1               ' scratch register (used by SETUP_PINS)
outData       res       1               ' byte being shifted out
inData        res       1               ' not used in this section -- presumably the byte shifted in by the read/write routine
commandPtr    res       1               ' not used in this section -- presumably a hub mailbox address; TODO confirm
byteCountPtr  res       1               ' hub address of the byte count (W_BYTE/W_FAST rdlong through it)
bufferValPtr  res       1               ' not used in this section; TODO confirm purpose
curBytePtr    res       1               ' not used in this section; TODO confirm purpose

data    res   1                         ' not used in this section
t       res   1                         ' not used in this section

        FIT     496                     ' assembly-time check that everything above fits in 496 cog longs

Ahle2 · 2012-10-01 11:58

Phil Pilgrim (PhiPi) wrote: »

For fast bidirectional transfers, you could let the video circuitry handle the output, while your program did the input. Despite what the manual implies, you can also use mode 010 or 011 for the ctra video clock source to get the shift clock on a pin. Or you could let the video circuitry produce the shift clock and the serial output data stream.

-Phil

I did continuous (not just byte bursts as described here) 20 Mbit SPI out using the video generator and even started a thread about it. Almost no one was interested; maybe because I am not one of the official "video generator/counter gurus" on this forum?!
Anyway, I lost interest because of the lack of feedback.

Even better than 20 Mbit out is possible in bursts. The bottleneck is feeding the video generator fast enough with data from hub RAM.

lonesock · 2012-10-01 12:11

@Ahle2: I read and enjoyed the thread...sorry for the lack of feedback! I think the deal-killer for me, at least at the time, was the difficulty in getting it started with a quick turnaround time or keeping it fed with the properly spaced waitvids if I was doing something else...or am I misremembering? I will dig that up and review it, thanks for the memory jog!

@Peter: Here's what I have in the latest version of the FSRW block code, byte versions only:

' out8: clock one byte out on DI using the two counters, one bit per instruction.
' Counter A presents PHSA[31] on the data pin; counter B generates the clock.
' PHSA is expected to already hold the byte with its first bit in bit 31
' (presumably loaded by the caller -- confirm against the call sites).
' The sequence from "movi frqb" to "mov frqb,#0" is cycle-exact; do not alter it.
out8
        andn outa,maskDI                ' clear DI in outa so it cannot OR over the counter A output
        mov phsb,#0                     ' clock phase = 0 -> clock pin low
        movi frqb,#%01_0000000          ' start the clock (sets the top nine bits of FRQB)
        rol phsa,#1                     ' next bit -> PHSA[31]
        rol phsa,#1                     ' next bit -> PHSA[31]
        rol phsa,#1                     ' next bit -> PHSA[31]
        rol phsa,#1                     ' next bit -> PHSA[31]
        rol phsa,#1                     ' next bit -> PHSA[31]
        rol phsa,#1                     ' next bit -> PHSA[31]
        rol phsa,#1                     ' last bit -> PHSA[31]
        mov frqb,#0                     ' stop the clock
        ' don't shift out the final bit...already sent, but be aware 
        ' of this when sending consecutive bytes (send_cmd, for e.g.) 
        ' (PHSA has been rotated 7 times here, not 8.)
out8_ret
        ret

{
in8
        or outa,maskDI
        mov ctra,readMode
        ' Start my clock
        mov frqa,#1<<7
        mov phsa,#0
        movi phsb,#%11_0000000
        movi frqb,#%01_0000000
        ' keep reading in my value, one bit at a time!  (Kuroneko - "Wh)
        shr frqa,#1
        shr frqa,#1
        shr frqa,#1
        shr frqa,#1
        shr frqa,#1
        shr frqa,#1
        shr frqa,#1
        mov frqb,#0 ' stop the clock
        mov readback,phsa
        mov frqa,#0
        mov ctra,writeMode
in8_ret
        ret
}
' in8: read one byte from DO into readback (MSb first) while counter B supplies
' the clock.  Each bit takes two instructions: "test ... wc" samples the pin into C,
' "rcl" rotates C into bit 0 of readback.  FRQB is half the value used by out8, so
' the clock runs at half that rate to match the two-instruction cadence.
' The instruction order is cycle-exact; do not alter it.
in8
        neg phsa,#1' DI high            (PHSA = all ones, so PHSA[31] = 1)
        mov readback,#0                 ' start from an empty result
        ' set up my clock, and start it
        movi phsb,#%011_000000          ' preload the clock phase (top nine bits of PHSB)
        movi frqb,#%001_000000          ' start the clock (top nine bits of FRQB)
        ' keep reading in my value
        test maskDO,ina wc              ' C = DO pin state (assuming maskDO is a single-bit mask)
        rcl readback,#1                 ' shift the sampled bit in
        test maskDO,ina wc
        rcl readback,#1
        test maskDO,ina wc
        rcl readback,#1
        test maskDO,ina wc
        rcl readback,#1
        test maskDO,ina wc
        rcl readback,#1
        test maskDO,ina wc
        rcl readback,#1
        test maskDO,ina wc
        rcl readback,#1
        test maskDO,ina wc              ' eighth and final sample
        mov frqb,#0 ' stop the clock    (done before the last rcl; mov leaves C untouched)
        rcl readback,#1                 ' shift the final bit in
        mov phsa,#0 'DI low
in8_ret
        ret

The commented-out middle version was the 20 Mbps read, but it didn't seem to work on all hardware & cog/pin combinations...so it's in disuse, and hasn't been tested since the rest of the counter framework has evolved around it. Sorry for not answering your question directly...a bit busy!

thanks,
Jonathan

pgbpsu · 2012-10-01 12:11

Hi Ahle2-

Do you mind posting a link to that thread? If I can't get what I'm working on fixed, I'm open to a different tactic.

Thanks

pgbpsu · 2012-10-01 14:21

Hi Jonathan-

Thanks for posting. I'd looked over that section of fsrw to see if it was different.

The code I posted above (based directly on yours) works, but it basically doesn't get along with the other stuff I'd written and I can't understand why. I really like these write speeds, and I'm only working with one specific device rather than a bunch of different SD card manufacturers. My device is spec'd to 25 MHz for the SPI line, so I'm still considerably below it. However, I do need to read from this device as well as read while writing (true SPI), so I need to get a version working that will do both in and out. My SPI read/write routines aren't nearly as fast as what you've got, but they are fast enough since I don't use read AND write simultaneously that frequently. For the heavy throughput stuff I want to use your code, and for the read/write stuff I can go slowly. But to do this they need to cooperate.

Ahle2, if Jonathan's memory is correct your methods might not be suitable for my application (everything here is treated as a byte) but I'd still like to see them.

Thanks,
Peter

kuroneko · 2012-10-01 16:06

pgbpsu wrote: »

Can anyone explain what's going on that prevents me from using the r/w version (non-counter) and then using the w_fast(counter) version?

[COLOR="red"]muxz    outa,         clkMask         ' raise clock[/COLOR]
          djnz    numBits,      #:bitLoop       ' decrement loop                       WIFI_CLK = 1  exit path
                                                '                                                    |
          wrbyte  inData,       hubAddress      ' write the data back to the buffer    WIFI_CLK = 1  |
          add     hubAddress,   #1              ' set pointer to next byte in buffer   WIFI_CLK = 1  |
          djnz    numBytes,     #:byteLoop      ' Decrement loop count.                WIFI_CLK = 1  |
RW_BYTE_ret       ret                           '                                      WIFI_CLK = 1  |

Does that ring a bell?

pgbpsu · 2012-10-01 17:03

kuroneko-

I appreciate you looking at this. I must say I don't see the problem (which is probably how I got into this situation). In the highlighted code above I'm using the red line to toggle the clock line. That works fine. I'm only changing the state of one pin, outa[clkMask]. As I'm sure you're aware, this works fine. However after running this, when I jump into lonesock's code I don't get any output - clocks (or data) coming out. I think you're suggesting that when I leave my slow read/write routine, I'm leaving outa[clkMask] := 1 and this somehow prevents other output. But since this is the same cog (and NOT the direction register) why wouldn't the faster routine be able to then toggle the counter output pin (which just so happens to be the same as the bit mask used in the code above).

I'm sure you see the problem. Unfortunately, I don't (yet).

Thanks,
Peter

kuroneko · 2012-10-01 17:11

pgbpsu wrote: »

I must say I don't see the problem (which is probably how I got into this situation).

The code is entered with Z = 1, which means - as your comments indicate - muxnz lowers the clock and muxz raises the clock. When you leave the code the clock remains high (raised) which means it stays high unless the relevant bit in outa is reset. The counter output (clock) is in contention^A with outa so won't have any effect.

IOW either the fast code has to unblock outa and raise it again once finished or you change all routines to leave the clock line low once done.

^A if set to output, pin := outa|ctra|ctrb|video

[EDITED] fast SPI out, 1 bit per instruction

Comments