Shop OBEX P1 Docs P2 Docs Learn Events
[EDITED] fast SPI out, 1 bit per instruction — Parallax Forums

[EDITED] fast SPI out, 1 bit per instruction

lonesocklonesock Posts: 917
edited 2012-10-09 06:42 in Propeller 1
Hi, All.

EDITED: GEAR timing was off by a tiny bit...used a scope, updated the counter PHSx initialization values!!

You may remember this thread here:
http://forums.parallax.com/showthread.php?p=811943
I was trying to use both counters to get SPI data at one bit per instruction. kuroneko pointed me to a similar thread of his where he used almost exactly the same technique, and of course did it months before I did [8^)
http://forums.parallax.com/showthread.php?p=784536

Well, I wanted to do the same thing for SPI output, so here is the test framework. It looks a bit goofy in GEAR (the clock pin trace is off by 1 prop-clock relative to the data pin trace, not sure why), but the scope looks nice and clean.

{{
  Jonathan "lonesock" Dummer
  Testing a fast SPI clock out routine

  Use both counters in NCO single-ended mode, where the output
  pin is equal to PHSx's high bit.  Use Counter B to drive the
  clock pin, and Counter A to drive the data line.  B actually
  changes the pin automatically, while you update the Data pin
  using a series of SHL's on PHSA (we set FRQA to 0, so no up-
  dates are happening automatically).

  This is for an SPI interface where the data is latched in on
  the rising edge of the Clock line, so you want your Data pin
  to be stable before the clock pin goes high.  You might have
  to adjust the "movi phsb,#%xxx000000" line to initialize the
  PHSB into the right state for your SPI definition.
}}
CON
  '_clkmode = RCFast
  _clkmode = RCSlow       ' internal slow RC oscillator is selected; the RCFast alternative is commented out above
  pinDataOut = 25         ' SPI data-out pin, assigned to Counter A in the PASM code
  pinClock = 24           ' SPI clock pin, assigned to Counter B in the PASM code
  pinChipSelect = 26      ' chip-select pin, driven directly through OUTA

PUB start_test
  '' Launch the PASM SPI test framework in a new cog, then park this Spin cog forever.
  ' start out our assembly test framework, then we're done!
  cognew( @fast_SPI_out_test_entry, 0 )   ' PAR is passed as 0; the PASM test code never reads it
  repeat
    ' do nothing forever (looking at you, Wally!)
  
DAT
ORG 0

' One-time cog setup.  Falls through into fast_SPI_out_test when done.
'   Counter A (NCO, single-ended) drives pinDataOut from PHSA[31]
'   Counter B (NCO, single-ended) drives pinClock   from PHSB[31]
'   Chip Select is driven through OUTA and idles high
' Uses: t (scratch), maskCS (kept for the transfer code).
fast_SPI_out_test_entry
        ' set up Counter A to be the data counter
        mov frqa,#0             ' unnecessary
        mov phsa,#0             ' unnecessary
        mov ctra,#pinDataOut    ' set the data pin
        movi ctra,#%0_00100_000 ' set the mode to NCO, single output pin
        ' set up Counter B to be the clock counter
        mov frqb,#0             ' unnecessary
        mov phsb,#0             ' unnecessary
        mov ctrb,#pinClock      ' set the clock pin
        movi ctrb,#%0_00100_000 ' set the mode to NCO, single output pin
        ' Preload OUTA with Chip Select high BEFORE any pin becomes an output.
        ' Enabling the output first would drive CS low for one instruction,
        ' which a slave could see as a spurious (empty) select.
        mov maskCS,#1           ' build the ChipSelect mask, and keep it for later
        shl maskCS,#pinChipSelect
        mov outa,maskCS         ' Chip Select latch high (usually active low)
        '     set up my 3 pins as outputs
        mov t,#1                ' temp = 1
        shl t,#pinDataOut       ' temp = 1 << pinDataOut
        mov dira,t              ' DIRA now has the DataOut pin as an output
        mov t,#1                ' temp = 1
        shl t,#pinClock         ' temp = 1 << pinClock
        or dira,t               ' DIRA now has both DataOut and Clock as outputs
        or dira,maskCS          ' DIRA now has all 3 pins set to outputs; CS comes up high

' Main test loop.  Each pass sends the same byte twice: once with the
' unrolled one-instruction-per-bit code, once with the looped
' two-instructions-per-bit code (both sections are live; the '{ and '}
' lines are ordinary comments).  The instruction order is timing-critical:
' the clock is generated by Counter B free-running, so every instruction
' between "start clock" and "stop clock" is part of the waveform.
fast_SPI_out_test
        ' what is my data byte?
        mov data,#%10101010     ' randomly selected by myself

        ' here is the super fast unrolled version
        '{
        mov phsa,data           ' start with the raw data byte
        shl phsa,#24            ' get the MSb into position 31
        'rev phsa,#0            ' do this instead of the above line for LSb first
        andn outa,maskCS        ' CS goes low, signifying a start
        movi phsb,#%000000000   ' set up my clock register (top 9 bits of PHSB cleared)
        movi frqb,#%010000000   ' start my clock line ticking! (bit 30 set: PHSB[31] has a 4-clock period = 1 instruction)
        shl phsa,#1             ' move next bit into the PHSA[31] slot
        shl phsa,#1             ' move next bit into the PHSA[31] slot
        shl phsa,#1             ' move next bit into the PHSA[31] slot
        shl phsa,#1             ' move next bit into the PHSA[31] slot
        shl phsa,#1             ' move next bit into the PHSA[31] slot
        shl phsa,#1             ' move next bit into the PHSA[31] slot
        shl phsa,#1             ' move next bit into the PHSA[31] slot
        mov frqb,#0             ' stop my clock
        or outa,maskCS          ' CS goes high again
        '}

        ' here is the 2x slower looped version
        '{
        '' NOTE: The 1st one will be primed, so the number
        '' of remaining bits = total-1.  For this 8-bit
        '' test, I have 7 bits remaining to be shifted out.
        mov t,#7                ' number of bits left
        mov phsa,data           ' start with the raw data byte
        shl phsa,#24            ' get the MSb into position 31
        'rev phsa,#0            ' do this instead of the above line for LSb first
        andn outa,maskCS        ' CS goes low, signifying a start
        movi phsb,#%011000000   ' set up my clock register (phase preset chosen for the 8-clock bit time)
        movi frqb,#%001000000   ' start my clock line ticking! (bit 29 set: PHSB[31] has an 8-clock period = shl + taken djnz)
:bit_shift_loop
        shl phsa,#1             ' move next bit into the PHSA[31] slot
        djnz t,#:bit_shift_loop ' keep going till we run out of bits
        mov frqb,#0             ' stop my clock
        or outa,maskCS          ' CS goes high again
        '}

        ' wait and repeat
        mov t,#1                ' build the delay: t = 1 ...
        shl t,#9                ' ... << 9 = 512 clocks (512 does not fit a 9-bit immediate, hence the shift)
        add t,cnt               ' add in the current time
        waitcnt t,#511          ' wait for a little while (the 511 is added to t afterwards; t is rebuilt next pass, so it is unused)
        jmp #fast_SPI_out_test  ' start our test over again

data    res                     ' byte to transmit
t       res                     ' scratch: pin-mask temp, bit counter, wait target
maskCS  res                     ' 1 << pinChipSelect

FIT 496                         ' assemble-time check that the code fits in cog RAM




edit: changed the code to work with what the scope says!!

▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔
lonesock
Piranha are people too.

Post Edited (lonesock) : 6/17/2009 11:51:57 PM GMT
«1

Comments

  • Phil Pilgrim (PhiPi)Phil Pilgrim (PhiPi) Posts: 23,514
    edited 2009-06-16 19:05
    'Very clever use of ctra to map to a pin! I think I would use a ror, though, and rev the data for MSB first. That way you can avoid the shl #24.

    -Phil
  • lonesocklonesock Posts: 917
    edited 2009-06-16 19:43
    Phil Pilgrim (PhiPi) said...
    'Very clever use of ctra to map to a pin! I think I would use a ror, though, and rev the data for MSB first. That way you can avoid the shl #24.

    -Phil
    Thanks! The data I need to send out is in fact MSb first, so either way I need an extra instruction to set it up [8^)

    edit: latest code is in the top post

    Note that this can be used to send more than 8 bits, up to 32, obviously, but you just lose that many words of Cog RAM as the loop must be unrolled. Alternatively, if memory was more important than speed, you could use a loop at 1/2 the data rate, but almost no code size...you'd just have to play with the starting value for phsb, and frqb would be 1/2 the current value.

    Theoretically for an 80MHz clock, you can now drive SD cards at the specified max data rate (20MHz clock) for both input and output. If you overclock your prop, you might go too fast! I doubt that is a problem, as the SD spec was artificially limited, and I don't think any companies would go out of their way to corrupt data if it goes a tiny bit faster than spec. On the other hand, going with a cheap SD card could spell trouble via simply poor construction.

    The next challenge would be to read and write SPI data concurrently. I'm sure you could do it with 2 instructions per bit, but the supreme awesomeness would be one bit (each way) per instruction. I'm dubious, but hey, food for thought [8^)

    ▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔
    lonesock
    Piranha are people too.

    Post Edited (lonesock) : 6/17/2009 11:52:39 PM GMT
  • Phil Pilgrim (PhiPi)Phil Pilgrim (PhiPi) Posts: 23,514
    edited 2009-06-16 20:04
    For fast bidirectional transfers, you could let the video circuitry handle the output, while your program did the input. Despite what the manual implies, you can also use mode %00010 or %00011 for the ctra video clock source to get the shift clock on a pin. Or you could let the video circuitry produce the shift clock and the serial output data stream.

    -Phil
  • lonesocklonesock Posts: 917
    edited 2009-06-16 20:19
    Phil Pilgrim (PhiPi) said...
    For fast bidirectional transfers, you could let the video circuitry handle the output, while your program did the input. Despite what the manual implies, you can also use mode %00010 or %00011 for the ctra video clock source to get the shift clock on a pin. Or you could let the video circuitry produce the shift clock and the serial output data stream.

    -Phil
    Great idea! I had not thought of using the video circuitry for output, nor did I realize you could use a PLL mode with a pin output, thanks! I'll look into this as soon as I need to use a part with bidirectional SPI. (I do want to make sure I don't spend more time doing the bit-twiddling to mux both the data and clock into the video register than it would take to just shift it out the regular way.)

    ▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔
    lonesock
    Piranha are people too.
  • Cluso99Cluso99 Posts: 18,069
    edited 2009-06-17 05:57
    Great project. Yes, we will want the fastest permissable SD card access. smile.gif

    We can dedicate a whole cog for it in our emulations.

    ▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔
    Links to other interesting threads:

    · Home of the MultiBladeProps: TriBladeProp, SixBladeProp, website (Multiple propeller pcbs)
    · Single Board Computer:·3 Propeller ICs·and a·TriBladeProp board (ZiCog Z80 Emulator)
    · Prop Tools under Development or Completed (Index)
    · Emulators: Micros eg Altair, and Terminals eg VT100 (Index)
    · Search the Propeller forums (via Google)
    My cruising website is: ·www.bluemagic.biz·· MultiBladeProp is: www.bluemagic.biz/cluso.htm
  • lonesocklonesock Posts: 917
    edited 2009-06-17 06:46
    Cluso99 said...
    Great project. Yes, we will want the fastest permissable SD card access.
    Thanks! I already ran into a speed limit: a LCD driver chip I'm using has a 20 MHz clock rate max. My prop is running at 100MHz (using a 6.25MHz clock line in from an ENC28J60, which in turn divides its 25MHz crystal by 4). So running the full speed SPI out I over-ran the max allowable data rate. To that end, I made a looped version of the same that runs at 1/2 the speed, and is very scalable (e.g. to a full 32-bits). Here's the relevant portion:

    EDITED: put the new code in the sample up top

    At the risk of derailing my own thread, if I made a fast FAT32-only & MMC/SD/SDHC object, would it:
    A) be of use to anyone besides myself?
    B) duplicate the effort of anyone else? (I know many people have mentioned FAT32 support already, but I have no status updates)
    C) be a problem if there was no FAT16 or FAT12 support?
    D) annoy people with limitations like "only one file open for writing at a time"?

    I have some ideas for optimization, and even a cool name: FlashFAT32! (and a "Lion Tamer" hat!)

    Anyway...bedtime for me...I appreciate any feedback, which I will collect in the morning [8^)

    Jonathan

    ▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔
    lonesock
    Piranha are people too.

    Post Edited (lonesock) : 6/17/2009 11:53:20 PM GMT
  • rokickirokicki Posts: 1,000
    edited 2009-06-17 06:49
    Hmm, my big project was going to be an updated fsrw that's faster, uses cog memory for
    read-ahead and write-behind, and a bunch of other stuff, but if you're considering fat32
    and sdhc, I wasn't planning on that. I'd hate to waste my effort if you're going to be
    leapfrogging that anyway.

    I was also going to have a nifty DMA mode so you could get real speed even when using
    only Spin.
  • SapiehaSapieha Posts: 2,964
    edited 2009-06-17 06:57
    Hi lonesock.

    Yours code will be very useful for Ramtrons 2MBit FRAMS.
    If You have that driver.
    I use that chips

    ▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔
    Nothing is impossible, there are only different degrees of difficulty.
    For every stupid question there is at least one intelligent answer.
    Don't guess - ask instead.
    If you don't ask you won't know.
    If your gonna construct something, make it·as simple as·possible yet as versatile as posible.


    Sapieha
  • kuronekokuroneko Posts: 3,623
    edited 2009-06-17 07:03
    lonesock said...
    C) be a problem if there was no FAT16 or FAT12 support?
    There isn't really any need for limiting yourself to a specific format. You can design the FS in a generic way, the only difference being the cluster r/w code (which is plugged in during the mount stage).
  • Cluso99Cluso99 Posts: 18,069
    edited 2009-06-17 08:20
    It would be great to have both FAT16 and FAT32 support. FAT16 is standard for 2GB and below while FAT32 for 4GB and above. FAT32 with only 8+3 filenames.

    On the TriBlade and RamBlade, DMA is not possible, nor is read ahead or write behind. This is because it shares pins with the ram.

    ▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔
    Links to other interesting threads:

    · Home of the MultiBladeProps: TriBladeProp, SixBladeProp, website (Multiple propeller pcbs)
    · Single Board Computer:·3 Propeller ICs·and a·TriBladeProp board (ZiCog Z80 Emulator)
    · Prop Tools under Development or Completed (Index)
    · Emulators: Micros eg Altair, and Terminals eg VT100 (Index)
    · Search the Propeller forums (via Google)
    My cruising website is: ·www.bluemagic.biz·· MultiBladeProp is: www.bluemagic.biz/cluso.htm
  • simonlsimonl Posts: 866
    edited 2009-06-17 09:20
    @rokicki: I'm sure many of us would be delighted to see faster SD access, using cog memory for
    read-ahead and write-behind, and DMA too, so PLEASE do continue smile.gif

    @lonesock: likewise; I expect loads of people would "rip yer arm off" to get FAT32 / SD / SDHC support on the Prop' - can we have it yesterday please?! (BTW: Your SPI work looks great).

    ▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔
    Cheers,
    Simon

    www.norfolkhelicopterclub.com

    “Before you criticize someone, you should walk a mile in their shoes. That way when you criticize them, you are a mile away from them and you have their shoes.” - Jack Handey.
  • KyeKye Posts: 2,200
    edited 2009-06-17 14:27
    Fat 32 will be impossible in spin because of how math is done. If you want to support FAT32 it WILL BE A DIFFICULT project. If you try to do everything in assembly in 512 longs then it might be possible.

    However, then the feature set wouldn't be good...

    Also, you can't just pump the spi clock to max with sd cards. They have a CSD register which tells you their maximum limit, and all SD cards can have a different value. Most however should be able to take 5 Mhz.

    Plus you need high speed input for the SD card also, not high speed output... Maybe input would be possible with the counters also.

    Good luck, if you want to try. Sorry for raining on the happy parade. =)

    ▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔
    Nyamekye,
  • KyeKye Posts: 2,200
    edited 2009-06-17 14:27
    Wait actually since FAT32 its 28 bit addressing it may be possible in spin...

    ▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔
    Nyamekye,
  • rokickirokicki Posts: 1,000
    edited 2009-06-17 16:19
    It would be really nice to work together, actually, and pool resources rather than duplicating code.

    I have looked some more at fat32, and I agree, it would be straightforward to modify fsrw to do fat32
    (the cluster r/w is part of it, but another part is the extendable root directory).

    At first blush it might be easy to divide work into the block layer and the file system layer (as it
    currently is divided). In other words, we can use the 1-bit-per-instruction ideas in a really simple
    spisasm routine that plugs into the existing fsrw to make it much faster (and this layer can
    implement read-ahead and write-behind), but I was going to move the FAT manipulation into the
    cog too, so it may not be so quick and dried.

    Lonesock, email me at my username at gmail.com (same username as what I use here) and
    let's discuss if we want to work together on this.
  • lonesocklonesock Posts: 917
    edited 2009-06-17 18:32
    Hi, All.

    Thanks for your responses! Rokicki and I will collaborate on the next rev of fsrw. There will be some more news when we have it (and look for an upcoming forum poll from rokicki).

    (note: the next release will be ready on or before the Duke Nukem Forever ship-date [8^)

    thanks,
    Jonathan

    ▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔
    lonesock
    Piranha are people too.
  • lonesocklonesock Posts: 917
    edited 2009-06-18 00:09
    Hi, again, All.

    Well, I finally plugged a scope in and looked at my waveforms: GEAR != HW+Scope

    I changed my unrolled SPI code so that the clock pin transitions to high are dead center of each data bit on the scope. I then looked at the traces in GEAR, and the trace for the clock pin was advanced by one, relative to the data pin. The GEAR output is attached (remember, it looks perfect on the scope..."Don't Panic")

    The updated code is in the 1st post, both the embedded code block and the attached file. The code shows the fast unrolled "1 instruction per bit" way, and it also has the looped "2 instructions per bit" way.

    In the looped mode, the clock transition to high does not land exactly in the middle of the data bit, but is in fact one clock late. I.e. each data bit is 8 propeller-clocks wide, and the clock pin's transition to high occurs on relative clock 5. I could not figure out a way to get that transition exactly in the middle of the data bit without an extra clock-pin-transition-to-high slipping in.

    Anyway, "share and enjoy"
    Jonathan

    ▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔
    lonesock
    Piranha are people too.
    317 x 29 - 7K
  • kuronekokuroneko Posts: 3,623
    edited 2009-06-18 00:48
    lonesock said...
    In the looped mode, the clock transition to high does not land exactly in the middle of the data bit, but is in fact one clock late. I.e. each data bit is 8 propeller-clocks wide, and the clock pin's transition to high occurs on relative clock 5. I could not figure out a way to get that transition exactly in the middle of the data bit without an extra clock-pin-transition-to-high slipping in.
    FWIW, the way your code currently works is that the clock transitions happen during S (Sder, frqx is written in R but active during the next S). Let's call that 4n. The bit shifts happen during R, i.e. 4n+3. Which explains the off center transition. I'd suggest a minor instruction re-ordering.

    .       movi frqb,#%001000000   ' start my clock line ticking!
            movi phsb,#%100000000   ' set up my clock register         
    :bit_shift_loop
            shl phsa,#1             ' move next bit into the PHSA[31] slot
    


    Now ctrb starts incrementing (assuming phsb is initially 0) and reaches 4 during R of movi phsb, .... At the same time however we force phsb to 4 (so no harm done here, i.e. we could place a nop there if we could guarantee phsb being 0 when we set frqb). Anyway, the clock transitions are now locked to 4n+3 as well which is what we wanted.
  • lonesocklonesock Posts: 917
    edited 2009-06-18 01:19
    @kuroneko: Thanks for the explanation about the clock transitions. The code you shared does place the clock-line transitions exactly in the middle of the data bit. Unfortunately, my major problem is not getting into the loop, but getting out of it. The final djnz falls through instead of jumping, which takes 8 clocks instead of 4. By the time my "mov frqb,#0" line executes, the clock counter has already caused the clock pin to transition high again, yielding an extra (unwanted) bit out.

    Maybe I could try something like:
    * compute the exact time I should shut off the clock counter
    * do something like:
    ' Sketch only (untested idea): loop on the data itself instead of a bit counter.
    :bit_shift_loop
            shl phsa,#1 wz              ' shift the next bit out; Z is set once PHSA has become zero
    if_nz jmp #:bit_shift_loop          ' keep shifting while any bits remain in PHSA
            waitcnt end_time_stamp,#0   ' hold until the precomputed end time so the clock pin makes all its transitions
            mov frqb,#0                 ' stop the clock
    
    


    This would let me perform both the jump and the fall through in only 4 clocks. The waitcnt would make sure that the clock pin transitioned the requisite number of times, even if I ran out of data bits early. I think getting the ending cnt value would be tricky, but possible. I also run the risk (even if this works) of losing in setup instructions what I gained by going to the looped version in the 1st place.

    Of course the alternative is to just let my clock pin transition on clock 5 of every 8-clock-wide data bit, instead of on clock 4 [8^). And since this is going at 1/2 speed, I wouldn't think that the precise timing is that important, but I guess that is device specific.

    Any feedback on the "waitcnt" idea, or the usefulness thereof?

    ▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔
    lonesock
    Piranha are people too.
  • kuronekokuroneko Posts: 3,623
    edited 2009-06-18 02:42
    Sorry about the messed up exit condition, I was only concentrating on the lock. Those things always come back and bite you :)
    lonesock said...
    Of course the alternative is to just let my clock pin transition on clock 5 of every 8-clock-wide data bit, instead of on clock 4 [8^). And since this is going at 1/2 speed, I wouldn't think that the precise timing is that important, but I guess that is device specific.

    Any feedback on the "waitcnt" idea, or the usefulness thereof?
    I don't think the waitcnt idea is going to fly. Actually, I'm sure. Imagine the last bit sent is 1, this means we have one more round-trip for the jmp instruction in order to clear phsa (bit 9 if you like), then we have a nop (jump not taken) and that's already too late (clock transition during nop.R). Even without the extra loop cycle you'd only have 8+3 cycles left to stop the clock (4 of which are consumed by the jump not taken, 6 by the waitcnt ...).

    So I'd suggest you ignore that slightly off center transition and just use it. The data hold time should be long enough.
  • rokickirokicki Posts: 1,000
    edited 2009-06-18 03:11
    Yeah, off-center isn't a problem. Still plenty of setup and hold time.
  • Cluso99Cluso99 Posts: 18,069
    edited 2009-06-18 03:27
    If you run out of cog space you could split and load some bits as overlays or run an LMM style code (zero footprint) for the less important parts. I have done both if you need help.

    ▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔
    Links to other interesting threads:

    · Home of the MultiBladeProps: TriBladeProp, SixBladeProp, website (Multiple propeller pcbs)
    · Single Board Computer:·3 Propeller ICs·and a·TriBladeProp board (ZiCog Z80 Emulator)
    · Prop Tools under Development or Completed (Index)
    · Emulators: Micros eg Altair, and Terminals eg VT100 (Index)
    · Search the Propeller forums (via Google)
    My cruising website is: ·www.bluemagic.biz·· MultiBladeProp is: www.bluemagic.biz/cluso.htm
  • lonesocklonesock Posts: 917
    edited 2009-06-27 14:23
    Cluso99 said...
    If you run out of cog space you could split and load some bits as overlays or run an LMM style code (zero footprint) for the less important parts. I have done both if you need help.
    Hi, Cluso, sorry I didn't respond earlier. I have a prototype working, and have plenty of Cog RAM left (Though the plan is to keep the PASM under 240 longs, leaving room for 2x 512-byte buffers in the Cog.) However, if I need to add any more functionality I'll be resting right up against the limit. I think I understand how to do overlays (doesn't mean I can implement them wink.gif, and I get the concept of LMM code, but how do you do a zero-footprint LMM?!

    ▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔▔
    lonesock
    Piranha are people too.
  • pgbpsupgbpsu Posts: 460
    edited 2012-10-01 09:00
    I'm trying to use Jonathan's very clever use of the counters to get ~20Mbit/sec output to an SPI line. I've got his code working, however, since it is write only, I'm using a much slower version of an SPI routine to read/write data. However, I can't get the two functions to work together. I suspect it's got something to do with the use of counters.

    I've got 2 routines, one fast pasm spi write only (using lonesock's counter idea), and one pasm read-write. I can call either one repeatedly, however once I call the read/write version the write only (counter version) no longer clocks out data. Can anyone explain what's going on that prevents me from using the r/w version (non-counter) and then using the w_fast(counter) version? I know for sure that I can get into the fast (counter) version again because the XOR statements there, if uncommented, show up on the scope. But I just don't get any data. It seems like my setup of the counters is wrong, but I don't know why.

    Thanks,
    Peter
    CON
      _CLKMODE = XTAL1 + PLL16X         ' external crystal with the PLL multiplying by 16
      _XINFREQ = 5_000_000              ' 5 MHz crystal, i.e. 80 MHz system clock with PLL16X
    
      TEST_PT           =  5            ' NOTE(review): not referenced in the visible code; presumably a scope test pin - confirm
      WIFI_MISO         =  8            ' SPI pin numbers for the WiFi module
      WIFI_MOSI         =  9
      WIFI_CLK          = 11
      
      DEBUG             =     0         ' serial port index passed to the uarts object
      DEBUG_BAUD        = 38400
    
      MAX_PAYLOAD       =   200         ' size in bytes of the transfer buffer tBuf
    
    
    
       
    CON ' Enumerated Command list and Buffer names
    
      ' Command codes written to the mailbox.  Numbering starts at 1 because the
      ' PASM cog treats a mailbox value of 0 as "idle / command finished".
      #1, CMDSetupPins, CMDR_W_BYTE, CMD_W_BYTE, CMD_W_FAST
    
    
    VAR
      byte tBuf[MAX_PAYLOAD]            ' transfer buffer handed to the PASM cog by address
    
      ' Mailbox shared with the PASM cog.  The cog is started with @wifiCommand as PAR
      ' and reads these at PAR+0, PAR+4 and PAR+8, so the order of these three longs
      ' must not change.
      long wifiCommand, bytesForSPI, bufferAddress
      
    OBJ
      uarts    : "FullDuplexSerial4portPlus_0v3"       '1 COG for 3 serial ports
    
    PUB MAIN | serialCogId, pasmCogId, rxByte
      '' Interactive test harness for the PASM SPI routines.
      '' Starts the debug serial port and the SPI cog, issues the pin-setup command,
      '' then loops forever: wait for key "1", "2" or "3" on the debug port, load a
      '' 4-byte test pattern into tBuf, and run the matching SPI command
      '' (read/write, write-only, or fast counter-based write).
    
      uarts.Init
      uarts.AddPort(DEBUG,31,30,-1,-1,0,0,DEBUG_BAUD) 'Add DEBUG port
      uarts.Start
      serialCogId    := uarts.getCogID                'Start the ports
      PAUSE_MS(2000)
    
      ' pasmCogId holds (cog number + 1), so 0 means cognew found no free cog.
      pasmCogId := cognew(@SPI_ASM, @wifiCommand) + 1 'Start the SPI PASM cog
    
      uarts.str(DEBUG,string(13,"PASM SPI code launched in cog: "))
      uarts.dec(DEBUG,pasmCogId - 1)                  ' print the real cog number (-1 if the launch failed)
    
      bytefill(@tbuf, 0, MAX_PAYLOAD)
      tbuf[0]:=$00
      tbuf[1]:=$01
      tbuf[2]:=$02
      tbuf[3]:=$03
      bufferAddress := @tbuf
    
      uarts.str(DEBUG,string(13,"Setting up PASM based SPI routines and I/O."))
      wifiCommand := CMDSetupPins
      repeat while wifiCommand                  ' Wait for it to complete
    
      repeat
        uarts.str(DEBUG,string(13,13, "Press 1,2, or 3 to send data."))
        repeat
          rxByte := uarts.rxcheck(DEBUG)
        until rxByte == "1" or rxByte == "2" or rxByte == "3"
    
        ' Reload the test pattern; RW_BYTE overwrites the buffer with the bytes it reads back.
        bytefill(@tbuf, 0, MAX_PAYLOAD)
        tbuf[0]:=$00
        tbuf[1]:=$01
        tbuf[2]:=$02
        tbuf[3]:=$03
    
        ' The arguments are the same for every command.  They must be in place
        ' before wifiCommand is written, because that write is what starts the PASM side.
        bufferAddress := @tbuf
        bytesForSPI := 4
    
        case rxByte
          "1" :
            uarts.str(DEBUG,string(13, "Using r/w byte"))
            wifiCommand := CMDR_W_BYTE
          "2" :
            uarts.str(DEBUG,string(13, "Using w byte"))
            wifiCommand := CMD_W_BYTE
          "3" :
            uarts.str(DEBUG,string(13, "Using wf byte"))
            wifiCommand := CMD_W_FAST
        repeat while wifiCommand                  ' Wait for it to complete
    
    PUB PAUSE_MS(mS) | ticks
      '' Block the calling cog for approximately mS milliseconds.
      ticks := clkfreq/1000 * mS        ' system clocks per millisecond, times the requested count
      waitcnt(ticks + cnt)
    
    DAT
            org   0  ' start at beginning
    SPI_ASM
            ' PAR holds the hub address of the mailbox.  Derive the three
            ' field pointers from it, each one long (4 bytes) after the last:
            '   commandPtr   = PAR      command code
            '   byteCountPtr = PAR + 4  number of bytes to transfer
            '   bufferPtr    = PAR + 8  hub address of the data buffer
            mov       commandPtr,     par
            mov       byteCountPtr,   commandPtr
            add       byteCountPtr,   #4
            mov       bufferPtr,      byteCountPtr
            add       bufferPtr,      #4
    
    '_________________________________________Wait for command_________________________________________
    ' Poll the mailbox command long until the Spin side writes a non-zero command code.
    WaitForCommand
              rdlong  command,      commandPtr      wz  ' check for a command; Z=1 if read value is 0
       if_z   jmp     #WaitForCommand                   ' command is zero; read again
    
    '_________________________________________Command list_____________________________________________
    ' Each entry compares against one command code.  On a match (Z=1) the handler
    ' is called and, because every handler must return with Z still set, the
    ' following if_z jmp skips the remaining comparisons.  An unrecognised code
    ' matches nothing and simply falls through to CommandDone, where it is
    ' acknowledged like any other command.
              cmp     command,      #CMDR_W_BYTE    wz  ' Z=1 if command is "read/write bytes"
       if_z   call    #RW_BYTE                          ' run the read/write transfer
       if_z   jmp     #CommandDone                      ' handled; skip the other comparisons
    
              cmp     command,      #CMD_W_BYTE    wz   ' Z=1 if command is "write bytes"
       if_z   call    #W_BYTE                           ' run the write-only transfer
       if_z   jmp     #CommandDone                      ' handled; skip the other comparisons
    
              cmp     command,      #CMD_W_FAST    wz   ' Z=1 if command is "fast write"
       if_z   call    #W_FAST                           ' run the counter-based fast write
       if_z   jmp     #CommandDone                      ' handled; skip the other comparisons
    
              cmp     command,      #CMDSetupPins   wz  ' Z=1 if command is "set up pins"
       if_z   call    #SETUP_PINS                       ' run the pin setup
                                                        ' last entry: falls straight into CommandDone
    
    CommandDone
              wrlong  zero,          commandPtr          ' Set command=0 signaling we're done
              jmp     #WaitForCommand
                         
    {******************************** RW_BYTE ******************************************************
    Full-duplex, bit-banged SPI transfer.
    Reads the byte count from the hub long at byteCountPtr and the buffer address from the hub
    long at bufferPtr.  For each byte: shifts it out MSB first on MOSI while sampling MISO, then
    writes the received byte back into the same buffer slot it was sent from.
    A byte count of 0 returns immediately without touching the pins.
    
    On return the clock and MOSI bits in OUTA are cleared.  A pin's output is the OR of OUTA and
    the counter outputs, so a bit left high here would hold the pin high and stop a later
    counter-driven transfer (W_FAST) from toggling it.
    
    Z=1 when we enter (the muxnz/muxz clock toggling depends on it).  Z=1 when finished; no
    instruction here writes Z.  C is clobbered.
    ***************************************************************************************************}
    RW_BYTE
              rdlong  numBytes,     byteCountPtr    ' how many bytes are we supposed to write?
              tjz     numBytes,     #RW_BYTE_ret    ' nothing to send; also keeps djnz from wrapping on a zero count
              rdlong  hubAddress,   bufferPtr       ' read the address of the buffer; currently found in HUB mailbox
    :byteLoop
              mov     inData,       #0              ' init before filling
              mov     numBits,      #8              ' init number of bits to read/write
              rdbyte  outData,      hubAddress      ' get local copy of data to write
              shl     outData,      #24             ' shift lowest byte into MSB; bit[7]->bit[31]
    
    :bitLoop 
              muxnz   outa,         clkMask         ' lower clock (writes !Z = 0 to the clock bit)
              shl     outData,      #1        wc    ' set C = outData[31] then shift          
              muxc    outa,         mosiMask        ' put value of C onto MOSI
              test    misoMask,     ina       wc    ' C=ina[miso]
              rcl     inData,       #1              ' shift C left into inData
              muxz    outa,         clkMask         ' raise clock (writes Z = 1 to the clock bit)
              djnz    numBits,      #:bitLoop       ' decrement loop          
              
              wrbyte  inData,       hubAddress      ' write the data back to the buffer
              add     hubAddress,   #1              ' set pointer to next byte in buffer     
              djnz    numBytes,     #:byteLoop      ' Decrement loop count.
    
              andn    outa,         clkMask         ' release the clock pin (leave it low) so counters can drive it
              andn    outa,         mosiMask        ' release MOSI as well; andn does not change Z
    RW_BYTE_ret       ret
    
    {******************************** W_BYTE ******************************************************
    Write-only, bit-banged SPI transfer (a stripped-down RW_BYTE that does NOT capture a response).
    Reads the byte count from the hub long at byteCountPtr and the buffer address from the hub
    long at bufferPtr, then shifts each byte out MSB first on MOSI.
    A byte count of 0 returns immediately without touching the pins.
    
    On return the clock and MOSI bits in OUTA are cleared.  A pin's output is the OR of OUTA and
    the counter outputs, so a bit left high here would hold the pin high and stop a later
    counter-driven transfer (W_FAST) from toggling it.
    
    Z=1 when we enter (the muxnz/muxz clock toggling depends on it).  Z=1 when finished; no
    instruction here writes Z.  C is clobbered.
    ***************************************************************************************************}
    W_BYTE
              rdlong  numBytes,     byteCountPtr    ' how many bytes are we supposed to write?
              tjz     numBytes,     #W_BYTE_ret     ' nothing to send; also keeps djnz from wrapping on a zero count
              rdlong  hubAddress,   bufferPtr       ' read the address of the buffer; currently found in HUB mailbox
    :byteLoop
              mov     numBits,      #8              ' init number of bits to write
              rdbyte  outData,      hubAddress      ' get local copy of data to write
              shl     outData,      #24             ' shift lowest byte into MSB; bit[7]->bit[31]
    
    :bitLoop 
              muxnz   outa,         clkMask         ' lower clock (writes !Z = 0 to the clock bit)
              shl     outData,      #1        wc    ' set C = outData[31] then shift          
              muxc    outa,         mosiMask        ' put value of C onto MOSI
              muxz    outa,         clkMask         ' raise clock (writes Z = 1 to the clock bit)
              djnz    numBits,      #:bitLoop       ' decrement loop          
              
              add     hubAddress,   #1              ' set pointer to next byte in buffer     
              djnz    numBytes,     #:byteLoop      ' Decrement loop count.
    
              andn    outa,         clkMask         ' release the clock pin (leave it low) so counters can drive it
              andn    outa,         mosiMask        ' release MOSI as well; andn does not change Z
    W_BYTE_ret        ret
    
    {******************************** W_FAST ******************************************************
    Write-only SPI transfer at one bit per instruction, MSB first.  Reads the number of bytes to
    write from the HUB location held in byteCountPtr and the buffer address from the HUB location
    held in bufferPtr, then sends the buffer one byte at a time.  It does NOT capture a response.

    Counter A drives MOSI from PHSA[31] and counter B drives the clock from PHSB[31]; both must
    already be configured (see SETUP_PINS).  FRQA is assumed to be 0 so PHSA only changes when
    this code writes it.

    A pin set to output is the OR of OUTA and the counter outputs, so the clock and MOSI bits in
    OUTA are cleared on entry (the bit-banged routines may have left them high), and PHSA is
    cleared on exit so the last data bit does not hold MOSI high for OUTA-driven routines.
    A byte count of 0 returns immediately without touching the pins.

    Taken from lonesock's forum post:
    http://forums.parallax.com/showthread.php?113722-EDITED-fast-SPI-out-1-bit-per-instruction

    Clobbers numBytes, hubAddress, outData, PHSA, PHSB and FRQB.
    Z=1 when we enter.  Z=1 when finished (no instruction here writes flags).
    ***************************************************************************************************}
    W_FAST
              rdlong  numBytes,     byteCountPtr    ' how many bytes are we supposed to write?
              tjz     numBytes,     #W_FAST_ret     ' count of 0: return now, otherwise DJNZ wraps
                                                    ' and the loop would run ~2^32 times
              rdlong  hubAddress,   bufferPtr       ' read the address of the buffer from the HUB mailbox
              andn    outa,         clkMask         ' unblock the clock pin for counter B
              andn    outa,         mosiMask        ' unblock the data pin for counter A
    :byteLoop
              rdbyte  outData,      hubAddress      ' get local copy of data to write
              'mov     outData,      #%10101010     ' debug: force a known test pattern (note the #)
              mov     phsa,         outData         ' start with the raw data byte
              shl     phsa,         #24             ' get the MSb into position 31
              'rev     phsa,         #0             ' do this instead of the above line for LSb first
              movi    phsb,         #%000000000     ' set up my clock register
              movi    frqb,         #%010000000     ' start my clock line ticking!
    'xor   outa,   testMask
              shl     phsa,         #1              ' move next bit into the PHSA[31] slot
              shl     phsa,         #1              ' move next bit into the PHSA[31] slot
              shl     phsa,         #1              ' move next bit into the PHSA[31] slot
    'xor   outa,   testMask
              shl     phsa,         #1              ' move next bit into the PHSA[31] slot
              shl     phsa,         #1              ' move next bit into the PHSA[31] slot
              shl     phsa,         #1              ' move next bit into the PHSA[31] slot
    'xor   outa,   testMask
              shl     phsa,         #1              ' move next bit into the PHSA[31] slot
              mov     frqb,         #0              ' stop my clock


              add     hubAddress,   #1              ' set pointer to next byte in buffer
              djnz    numBytes,     #:byteLoop      ' next byte

              mov     phsa,         #0              ' clock is stopped: drop MOSI so it is not held high
    W_FAST_ret        ret
    
    {******************************** SETUP_PINS *********************************************************
    Configure both counters and the pin directions for the Red Pine SPI lines.
      Counter B -> WIFI_CLK,  NCO single-ended mode
      Counter A -> WIFI_MOSI, NCO single-ended mode
      DIRA      -> exactly WIFI_MOSI, WIFI_CLK and TEST_PT are outputs (all other bits cleared)
    On return temp holds the TEST_PT bit mask.
    Z=1 when we enter.  Z=1 when finished.
    ***************************************************************************************************}
    SETUP_PINS
            ' Counter B is the clock counter
              mov     ctrb,         #WIFI_CLK           ' pin field = clock pin (mode bits cleared)
              movi    ctrb,         #%0_00100_000       ' mode = NCO, single output pin
            ' Counter A is the data counter
              mov     ctra,         #WIFI_MOSI          ' pin field = data pin (mode bits cleared)
              movi    ctra,         #%0_00100_000       ' mode = NCO, single output pin

            ' Pin directions, built from the pre-computed single-bit masks
              mov     dira,         mosiMask            ' MOSI is the only output so far
              or      dira,         clkMask             ' add the clock pin
              mov     temp,         testMask            ' temp = test-point mask
              or      dira,         temp                ' add the test point

    SETUP_PINS_ret    ret
    
    {******************************** Variables ********************************************************
    PASM variables below
    ***************************************************************************************************}
    ' Cog-resident data.  Initialised LONGs come first, RES (uninitialised) registers last.
    command       long      0                   ' not referenced by the routines shown here
    numBytes      long      0                   ' bytes remaining; loaded from hub, counted down with DJNZ
    bufferPtr     long      0                   ' hub address of the long that holds the buffer address
    hubAddress    long      0                   ' hub address of the byte currently being transferred
    
    zero          long      0               ' Zero=0
    numBits       long      0                   ' bits remaining in the current byte (bit-banged routines)
    
    misoMask      long      |< WIFI_MISO        ' single-bit mask, tested against INA
    mosiMask      long      |< WIFI_MOSI        ' single-bit mask, written to OUTA with MUXC
    clkMask       long      |< WIFI_CLK         ' single-bit mask, written to OUTA with MUXZ/MUXNZ
    testMask      long      |< TEST_PT          ' single-bit mask for the scope test point
    
    temp          res       1                   ' scratch register (used by SETUP_PINS)
    outData       res       1                   ' byte being shifted out, MSB first
    inData        res       1                   ' byte being assembled from MISO
    commandPtr    res       1                   ' not referenced by the routines shown here
    byteCountPtr  res       1                   ' hub address of the byte count; presumably set at cog start - confirm
    bufferValPtr  res       1                   ' not referenced by the routines shown here
    curBytePtr    res       1                   ' not referenced by the routines shown here
    
    data    res   1                             ' not referenced by the routines shown here
    t       res   1                             ' not referenced by the routines shown here
    
            FIT     496                         ' assembler check: everything above must fit in 496 cog longs
    
  • Ahle2Ahle2 Posts: 1,179
    edited 2012-10-01 11:58
    For fast bidirectional transfers, you could let the video circuitry handle the output, while your program did the input. Despite what the manual implies, you can also use mode 010 or 011 for the ctra video clock source to get the shift clock on a pin. Or you could let the video circuitry produce the shift clock and the serial output data stream.

    -Phil
    I did continuous (not just byte bursts as described here) 20 Mbit SPI out using the video generator and even started a thread about it. Almost no one was interested; maybe because I am not one of the official "video generator/counter gurus" on this forum?!
    Anyway, I lost interest because of the lack of feedback.

    Even better than 20 Mbit out is possible in bursts. The bottle neck is feeding the video generator fast enough with data from hub ram.
  • lonesocklonesock Posts: 917
    edited 2012-10-01 12:11
    @Ahle2: I read and enjoyed the thread...sorry for the lack of feedback! I think the deal-killer for me, at least at the time, was the difficulty in getting it started with a quick turnaround time or keeping it fed with the properly spaced waitvids if I was doing something else...or am I misremembering? I will dig that up and review it, thanks for the memory jog!

    @Peter: Here's what I have in the latest version of the FSRW block code, byte versions only:
    ' out8: clock one byte out at one bit per instruction.
    ' Presumably the caller has already loaded PHSA so that the first bit to send
    ' sits in bit 31, and counters A/B drive the data and clock pins - confirm
    ' against the rest of FSRW.
    out8
            andn outa,maskDI            ' clear the DI bit in OUTA so it cannot hold the pin high
            mov phsb,#0                 ' reset the clock counter's phase
            movi frqb,#%01_0000000      ' load FRQB's upper 9 bits: the clock starts running
            rol phsa,#1                 ' seven rotates, one per instruction: each moves the
            rol phsa,#1                 ' next data bit into PHSA[31]
            rol phsa,#1
            rol phsa,#1
            rol phsa,#1
            rol phsa,#1
            rol phsa,#1
            mov frqb,#0                 ' stop the clock
            ' don't shift out the final bit...already sent, but be aware 
            ' of this when sending consecutive bytes (send_cmd, for e.g.) 
    out8_ret
            ret
    
    {
    in8
            or outa,maskDI
            mov ctra,readMode
            ' Start my clock
            mov frqa,#1<<7
            mov phsa,#0
            movi phsb,#%11_0000000
            movi frqb,#%01_0000000
            ' keep reading in my value, one bit at a time!  (Kuroneko - "Wh)
            shr frqa,#1
            shr frqa,#1
            shr frqa,#1
            shr frqa,#1
            shr frqa,#1
            shr frqa,#1
            shr frqa,#1
            mov frqb,#0 ' stop the clock
            mov readback,phsa
            mov frqa,#0
            mov ctra,writeMode
    in8_ret
            ret
    }
    ' in8: read one byte, MSB first, into readback.
    ' Counter B generates the clock while the cog samples the DO pin with one
    ' TEST/RCL pair per bit (two instructions per bit).
    in8
            neg phsa,#1' DI high (PHSA = -1, so bit 31 is set)
            mov readback,#0             ' clear the result before shifting bits in
            ' set up my clock, and start it
            movi phsb,#%011_000000      ' preload the clock counter's phase (upper 9 bits)
            movi frqb,#%001_000000      ' load FRQB's upper 9 bits: the clock starts running
            ' keep reading in my value
            test maskDO,ina wc          ' C = current state of the DO pin
            rcl readback,#1             ' shift C into the low end of readback
            test maskDO,ina wc
            rcl readback,#1
            test maskDO,ina wc
            rcl readback,#1
            test maskDO,ina wc
            rcl readback,#1
            test maskDO,ina wc
            rcl readback,#1
            test maskDO,ina wc
            rcl readback,#1
            test maskDO,ina wc
            rcl readback,#1
            test maskDO,ina wc          ' eighth and last sample
            mov frqb,#0 ' stop the clock (MOV without WC leaves C intact for the RCL below)
            rcl readback,#1
            mov phsa,#0 'DI low
    in8_ret
            ret
    
    The commented-out middle version was the 20 Mbps read, but it didn't seem to work on all hardware & cog/pin combinations...so it's in disuse, and hasn't been tested since the rest of the counter framework has evolved around it. Sorry for not answering your question directly...a bit busy!

    thanks,
    Jonathan
  • pgbpsupgbpsu Posts: 460
    edited 2012-10-01 12:11
    Hi Ahle2-

    Do you mind posting a link to that thread? If I can't get what I'm working on fixed, I'm open to a different tactic.

    Thanks
  • pgbpsupgbpsu Posts: 460
    edited 2012-10-01 14:21
    Hi Jonathan-

    Thanks for posting. I'd looked over that section of fsrw to see if it was different.

    The code I posted above (based directly on yours) works, but it basically doesn't get along with the other stuff I'd written and I can't understand why. I really like these write speeds, and I'm only working with one specific device rather than a bunch of different SD card manufacturers. My device is spec'd to 25 MHz for the SPI line, so I'm still considerably below it. However, I do need to read from this device as well as read while writing (true SPI), so I need to get a version working that will do both in and out. My SPI read/write routines aren't nearly as fast as what you've got, but they are fast enough since I don't use read AND write simultaneously that frequently. For the heavy throughput stuff I want to use your code, and for the read/write stuff I can go slowly. But to do this they need to cooperate.

    Ahle2, if Jonathan's memory is correct your methods might not be suitable for my application (everything here is treated as a byte) but I'd still like to see them.

    Thanks,
    Peter
  • kuronekokuroneko Posts: 3,623
    edited 2012-10-01 16:06
    pgbpsu wrote: »
    Can anyone explain what's going on that prevents me from using the r/w version (non-counter) and then using the w_fast(counter) version?
    [COLOR="red"]muxz    outa,         clkMask         ' raise clock[/COLOR]
              djnz    numBits,      #:bitLoop       ' decrement loop                       WIFI_CLK = 1  exit path
                                                    '                                                    |
              wrbyte  inData,       hubAddress      ' write the data back to the buffer    WIFI_CLK = 1  |
              add     hubAddress,   #1              ' set pointer to next byte in buffer   WIFI_CLK = 1  |
              djnz    numBytes,     #:byteLoop      ' Decrement loop count.                WIFI_CLK = 1  |
    RW_BYTE_ret       ret                           '                                      WIFI_CLK = 1  |
    
    Does that ring a bell?
  • pgbpsupgbpsu Posts: 460
    edited 2012-10-01 17:03
    kuroneko-

    I appreciate you looking at this. I must say I don't see the problem (which is probably how I got into this situation). In the highlighted code above I'm using the red line to toggle the clock line. That works fine. I'm only changing the state of one pin, outa[clkMask]. As I'm sure you're aware, this works fine. However after running this, when I jump into lonesock's code I don't get any output - clocks (or data) coming out. I think you're suggesting that when I leave my slow read/write routine, I'm leaving outa[clkMask] := 1 and this somehow prevents other output. But since this is the same cog (and NOT the direction register) why wouldn't the faster routine be able to then toggle the counter output pin (which just so happens to be the same as the bit mask used in the code above).

    I'm sure you see the problem. Unfortunately, I don't (yet).

    Thanks,
    Peter
  • kuronekokuroneko Posts: 3,623
    edited 2012-10-01 17:11
    pgbpsu wrote: »
    I must say I don't see the problem (which is probably how I got into this situation).
    The code is entered with Z = 1, which means - as your comments indicate - muxnz lowers the clock and muxz raises the clock. When you leave the code the clock remains high (raised), which means it stays high unless the relevant bit in outa is reset. The counter output (clock) is in contention (see note A) with outa, so it won't have any effect.

    IOW either the fast code has to unblock outa and raise it again once finished or you change all routines to leave the clock line low once done.


    Note A: if set to output, pin := outa | ctra | ctrb | video
Sign In or Register to comment.