
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <sys/time.h>


// Build-time configuration: clock and baud settings, pin assignments for the
// SPI slave under test and for the loopback tester pins, plus SPI parameters.
enum {
//    _xinfreq = 20_000_000,
    _xtlfreq = 20_000_000,    // presumably the crystal frequency in Hz used by the toolchain - TODO confirm
    _clkfreq = 8_000_000,    // presumably the requested system clock in Hz - TODO confirm

    DOWNLOAD_BAUD = 230_400,
    DEBUG_BAUD = DOWNLOAD_BAUD,    // debug output shares the download baud

    basepin = 40,    // every pin below is an offset from this pin number

    // Pins handed to the SPI slave (packed into SPINS below)
    MISO = basepin+0,
    MOSI = MISO+1,  // this pairing is required for internal simultaneous management of both smartpins
    CLK = basepin+2,
    CS = basepin+5,
    CMODE = 3,  // CPOL is bit1, CPHA is bit0

    BUFFSIZE = 1024,    // longwords

    // Packed slave configuration word: CS at bit 0, CLK at bit 6, MISO at bit 12,
    // MOSI at bit 18 and the clock mode at bit 30.  Passed to dut.start_SPI_slave().
    SPINS = CS | CLK<<6 | MISO<<12 | MOSI<<18 | CMODE<<30,

    // Tester-side (loopback master) pins driven by main(), cmd4() and cmd5()
    LMISO = basepin+6,    // requires a jumpered loopback to MISO
    LMOSI = basepin+3,    // requires a jumpered loopback to MOSI
    LCLK = basepin+4,    // requires a jumpered loopback to CLK
    LCS = basepin+7,    // requires a jumpered loopback to CS

    SPIFREQ = 1_000_000,    // SPI master clock frequency

};


// Device under test: the SPI slave object pulled in from a Spin2 source file.
// main() starts it through dut.start_SPI_slave().
struct __using("spi_slave.spin2") dut;
//struct __using("spi_slave_truncated.spin2") dut;


// Test buffers, each BUFFSIZE longwords in size.
static uint32_t  sbuff[BUFFSIZE];     // buffer handed to the SPI slave via dut.start_SPI_slave()
static uint32_t  txdata[BUFFSIZE];    // source data streamed out by cmd5() (BlockWrite)
static uint32_t  rxdata[BUFFSIZE];    // destination for data received by cmd4() (BlockRead)



// Issue a CMD4 (BlockRead) transaction from the loopback master pins and store
// the received longwords into rxdata[].
//
//   addr - longword index: selects where in rxdata[] the received data is
//          stored, and is also sent as the address part of the command word
//          (its top nibble is overwritten with the command code 4).
//   len  - transfer length in longwords; converted to a bit/clock count by
//          shifting left by 5.
//
// The smartpins LCLK, LMOSI and LMISO must already be configured (main() does
// this with _pinstart) before calling.
//
// NOTE(review): cmd5() adds 1 to len before converting to bits, this function
// does not - confirm whether that difference is intentional.
static void  cmd4( uint32_t addr, size_t len )    // CMD4  - BlockRead 
{
    // Hub write pointer used by the "wrlong pb, ptrb++" in the receive loop
    ptrb = (uint32_t)&rxdata[addr];

//        wypin( LCLK, 5 );
//        waitus( 10 );

    __asm const {    // don't optimise
		outh	#LCS    // signal end of prior SPI transaction, timing starts here
		setnib	addr, #4, #7    // place CMD4 (BlockRead) in first 4 bits
		rev	addr    // reverse bit order of the command/address word before loading the Tx shifter
		shl	len, #5    // longwords to bits
		dirl	#LCLK
		dirl	#LMOSI
		wypin	addr, #LMOSI    // load CA into Tx shifter
		dirh	#LMOSI    // Tx smartpin aligned for fresh clocks
		dirl	#LMISO
		dirh	#LMISO    // Rx smartpin aligned for fresh clocks
		outl	#LCS    // signal start of SPI transaction
		dirh	#LCLK    // enable clock gen
		wypin	#64, #LCLK    // fire!  timing ends here

		wypin	#0, #LMOSI     // clear Tx smartpin buffer, continuous mode is important here if divider is high
.cawait
		testp	#LCLK   wz    // wait for 64 SPI clocks
	if_nz	jmp	#.cawait
		wypin	len, #LCLK    // clocks for Rx data
		akpin	#LMISO    // ack Rx smartpin to discard Rx idle period

//		mov	addr, ptrb    // diag
//		mov	len, #0    // diag
.dataloop
		testp	#LCLK   wz    // check for completion
		testp	#LMISO   wc    // Rx word in smartpin buffer
	if_c	rdpin	pb, #LMISO    // fetch the received word
	if_c	rev	pb      // 32-bit endian bit-swap, big-endian shifted out from lsb of shifter
	if_c	movbyts	pb, #0b00_01_10_11    // 32-bit endian byte-swap
	if_c	wrlong	pb, ptrb++    // store into rxdata[] and advance

//	if_c	add	len, #1    // diag
	if_nz	jmp	#.dataloop    // keep polling until the clock smartpin reports completion

		outh	#LCS    // signal completion of SPI transaction, slave begins reset sequence
//		wrlong	len, addr    // diag
    }
}



// Issue a CMD5 (BlockWrite) transaction from the loopback master pins,
// streaming longwords out of txdata[].
//
//   addr - longword index: selects where in txdata[] the outgoing data starts,
//          and is also sent as the address part of the command word (its top
//          nibble is overwritten with the command code 5).
//   len  - transfer length in longwords; one is added (the extra 32 clocks
//          appear to cover the command/address word - TODO confirm) and the
//          total is converted to a bit/clock count by shifting left by 5.
//
// The smartpins LCLK and LMOSI must already be configured (main() does this
// with _pinstart) before calling.
static void  cmd5( uint32_t addr, size_t len )    // CMD5  - BlockWrite
{
    // Hub address handed to RDFAST; pb is then reused as the data scratch register
    pb = (uint32_t)&txdata[addr];

//        wypin( LCLK, 5 );
//        waitus( 10 );

    __asm volatile {    // don't optimise
		rdfast	##0x8000_0000, pb    // start the hub FIFO reading from txdata[addr]
		outh	#LCS    // signal end of prior SPI transaction, timing starts here
		setnib	addr, #5, #7    // place CMD5 (BlockWrite) in first 4 bits
		rev	addr    // reverse bit order of the command/address word before loading the Tx shifter
		add	len, #1
		shl	len, #5    // longwords to bits
		dirl	#LCLK
		dirl	#LMOSI
		wypin	addr, #LMOSI    // load CA into Tx shifter
		dirh	#LMOSI    // Tx smartpin aligned for fresh clocks
		outl	#LCS    // signal start of SPI transaction
		dirh	#LCLK    // enable clock gen
		wypin	len, #LCLK    // fire!  timing ends here

		modcz	$f,$f   wc    // C flag set to prefill the smartpin's buffer
.dataloop
	if_c	rflong	pb    // next longword from the FIFO
	if_c	movbyts	pb, #0b00_01_10_11    // 32-bit endian byte-swap
	if_c	rev	pb      // 32-bit endian bit-swap, big-endian shifted out from lsb of shifter
	if_c	wypin	pb, #LMOSI    // load buffer register, continuous mode is important here if divider is high

		testp	#LCLK   wz    // check for completion of all clocks
		testp	#LMOSI   wc    // C set when the Tx smartpin flags, so the next pass loads another word
	if_nz	jmp	#.dataloop

		outh	#LCS    // signal completion of SPI transaction, slave begins reset sequence
    }
}

/*
static void  cmd7( uint32_t wraddr, size_t len )    // CMD7  - BlockReadWrite
{
    pb = (uint32_t)&txdata[addr];
    ptrb = (uint32_t)&rxdata[addr];

//        wypin( LCLK, 5 );
//        waitus( 10 );

    __asm const {    // don't optimise this
		rdfast	##0x8000_0000, pb
		outh	#LCS    // signal end of prior SPI transaction, timing starts here
		setnib	addr, #7, #7    // place CMD7 (BlockReadWrite) in first 4 bits
		rev	addr
		shl	len, #5    // longwords to bits
		dirl	#LMOSI
		wypin	addr, #LMOSI    // load CA into shifter
		dirh	#LMOSI    // ready for clocks
		dirl	#LMISO
		dirh	#LMISO    // ready for clocks
		outl	#LCS    // signal start of SPI transaction
		dirl	#LCLK
		dirh	#LCLK    // enable clock gen
		wypin	len, #LCLK    // fire!  timing ends here

.dataloop
		testp	#LMOSI   wc
	if_c	rflong	pb
	if_c	movbyts	pb, #0b00_01_10_11    // 32-bit endian byte-swap
	if_c	rev	pb      // 32-bit endian bit-swap, big-endian shifted out from lsb of shifter
	if_c	wypin	pb, #LMOSI    // load buffer register, continous mode is important here if divider is high

		testp	#LMISO   wc
	if_c	rdpin	pb, #LMISO    // fetch Rx word from smartpin buffer
	if_c	rev	pb      // 32-bit endian bit-swap, big-endian shifted out from lsb of shifter
	if_c	movbyts	pb, #0b00_01_10_11    // 32-bit endian byte-swap
	if_c	wrlong	pb, ptrb++

		testp	#LCLK   wc
	if_nc	jmp	#.dataloop

		outh	#LCS    // signal completion of SPI transaction, slave begins reset sequence
    }
}
*/


// Fill a byte buffer with an incrementing test pattern.
//
//   addr - destination bytes
//   len  - number of bytes to write
//
// Byte k receives the low 8 bits of (k - 1): 0xFF, 0x00, 0x01, ... wrapping
// every 256 bytes.
static void  filldat( uint8_t addr[], size_t len )
{
    uint8_t  *cursor = addr;
    size_t  remaining = len;
    uint8_t  pattern = 0xFF;    // value for index 0, i.e. (uint8_t)(0 - 1)

    while( remaining-- > 0 )
    {
        *cursor++ = pattern;
        pattern = (uint8_t)(pattern + 1);    // wraps 0xFF -> 0x00
    }
}


// Zero a byte buffer.
//
//   addr - destination bytes
//   len  - number of bytes to clear
static void  cleardat( uint8_t addr[], size_t len )
{
    uint8_t  *cursor = addr;
    size_t  remaining = len;

    while( remaining-- > 0 )
        *cursor++ = 0;
}


// Test harness entry point.
//
// Starts the SPI slave object, configures the loopback master smartpins
// (clock, Tx and Rx), then loops forever alternating between a batch of
// BlockWrite tests (cmd5) and a batch of BlockRead tests (cmd4), printing the
// first six longwords of the relevant buffer after each transaction.
void  main( void )
{
    uint32_t  i, j, addr;

    i = _clockfreq();
    printf( "   clkfreq = %d   clkmode = 0x%x   SPI clock divider = %d\n", i, _clockmode(), i / SPIFREQ );

    // Launch the slave with its packed pin/mode word and its BUFFSIZE-longword buffer
    dut.start_SPI_slave( SPINS, sbuff, BUFFSIZE );
    _waitatn();    // wait for cog to start up


    // Slave cog is ready and waiting.  At this stage the tester program has not setup any of its pins,
    // so we are going to be glitching them from the start.

    _pinh( LCS );    // chip select idles high

    j = i / SPIFREQ;    // SPI clock divider
    j = j < 2 ? 2 : j;    // never go below a divider of 2
    // Clock generator: low word of the parameter is the divider, high word is half of it
    _pinstart( LCLK, P_PULSE | P_OE
            | (CMODE & 2 ? P_INVERT_OUTPUT : 0)    // CPOL
            | ((CMODE==1 || CMODE==2) ? P_INVERT_IN : 0)    // clocking edge
//            | ((CMODE==1 || CMODE==2) ? 0 : P_INVERT_IN)    // clocking edge
            , j | (j / 2)<<16, 0 );

    // Master transmit pin, clocked from LCLK via the relative B-input selector
    _pinstart( LMOSI, P_SYNC_TX | P_OE
            | ((LCLK - LMOSI)&7)<<24    // smartB (clock pin) input select
//            , 1<<5 | 32-1, 0 );    // start-stop buffer mode with 32-bit shifter
            , 0<<5 | 32-1, 0 );    // continuous buffer mode with 32-bit shifter

    // Master receive pin, clocked from LCLK via the relative B-input selector
    _pinstart( LMISO, P_SYNC_RX | P_HIGH_FLOAT | P_LOW_FLOAT
            | ((LCLK - LMISO)&7)<<24    // smartB (clock pin) input select
            , 1<<5 | 32-1, 0 );    // late sample mode with 32-bit shifter

    while( 1 )
    {
        addr = 0;    // longword index used for every transfer and every printout

        // NOTE(review): the byte counts passed to filldat()/cleardat() below are
        // BUFFSIZE/4 = 256 bytes, while each buffer is BUFFSIZE longwords (4096 bytes),
        // so only the first 64 longwords are touched - confirm this is intended.
        printf( " Tx Test Data  " );
        filldat( (uint8_t *)txdata, BUFFSIZE/4 );
        for( i = addr; i < addr + 6; i++ )
            printf( " %08x ", txdata[i] );
        puts("");

        // Write phase: clear the slave's buffer, send a block, show what the slave holds
        for( j = 0; j < 12; j++ )
        {
            printf( "Block Write Test  " );
            cleardat( (uint8_t *)sbuff, BUFFSIZE/4 );
            cmd5( addr, 10 );    // CMD5  - BlockWrite

            for( i = addr; i < addr + 6; i++ )
                printf( " %08x ", sbuff[i] );
            puts("");
        }
        _waitms( 800 );                // pause for debug

        // Read phase: clear the receive buffer, read a block back, show what arrived
        for( j = 0; j < 12; j++ )
        {
            printf( "Block Read Test  " );
            cleardat( (uint8_t *)rxdata, BUFFSIZE/4 );
            cmd4( addr, 10 );    // CMD4  - BlockRead

            for( i = addr; i < addr + 6; i++ )
                printf( " %08x ", rxdata[i] );
            puts("");
        }
        _waitms( 800 );                // pause for debug
    }
}
