Fast, counter-augmented fixed multiply

Phil Pilgrim (PhiPi) · 2014-05-09 14:34

Inspired by this thread, I decided to write a general fixed multiply routine using ctra to do some of the work. Basically, multiplication is just a combination of shifts and adds. In this implementation, the PASM program does the shifts and the counter does the adds in parallel, cutting the program's execution time by nearly half. Because the counter does four adds for every shift, the result has to be corrected at the end.

Here's a demo program that multiplies by 12345:

CON

  _clkmode      = xtal1 + pll16x
  _xinfreq      = 5_000_000

OBJ

  sio   :       "FullDuplexSerial"

VAR

  long cmd, multiplicand, multiplier
  
PUB  start 

  cognew(@multiply, @cmd)
  sio.start(31, 30, 0, 115200)
  multiplier := 12345
  repeat
    multiplicand := get_dec
    sio.dec(multiplicand)
    sio.str(string(" * "))
    sio.dec(multiplier)
    sio.str(string(" == "))
    sio.dec(multiplicand * multiplier)
    cmd~~
    sio.str(string(" == "))
    sio.dec(multiplicand)
    sio.tx(13)

PRI get_dec | char
  repeat
    char := sio.rx
    case char
      "0" .. "9" :
        result := result * 10 + char - "0"
      13 :
        return result    

DAT

              org       0
multiply      mov       ctra,ctra0
              mov       mpptr,par
              add       mpptr,#4

cmd_lp        rdlong    mpcnd,par wz
        if_z  jmp       #cmd_lp

              rdlong    mpcnd,mpptr

'-------[Replace with generated code]--------------------------------------------------------------              

'Multiply by 12345 == $3039 == %0011_0000_0011_1001.

              mov       frqa,#0       'Zero the multiplicand.
              mov       phsa,#0       'Zero the product.
              mov       frqa,mpcnd    'Bit 0.
              shl       frqa,#3       'Bit 3.
              shl       frqa,#1       'Bit 4.
              shl       frqa,#1       'Bit 5.
              shl       frqa,#7       'Bit 12.
              shl       frqa,#1       'Bit 13.
              mov       frqa,#0       'Let Bit 13 finish.
              mov       product,phsa  'Read product.            
              shr       product,#2    'Correct by four counts per instruction. 

'--------------------------------------------------------------------------------------------------
 
              wrlong    product,mpptr
              wrlong    zero,par
              jmp       #cmd_lp

ctra0         long      %00100 << 26
zero          long      0

product
mpcnd         res       1
mpptr         res       1

Going one step further, I decided to write a Spin program that generates the PASM code for any muliplier:

CON

  _clkmode      = xtal1 + pll16x
  _xinfreq      = 5_000_000

OBJ

  sio   :       "FullDuplexSerial"

VAR

  long multiplier
  
PUB  start | shrgt, shlft, zeroes

  sio.start(31, 30, 0, 115200)
  repeat
    multiplier := get_dec
    sio.str(@comment)
    sio.dec(multiplier)
    sio.str(string(13, 13))
    multiplier ><= 32
    shlft := 32 - >| multiplier 
    multiplier <<= shlft
    shrgt := (2 - shlft) #> 0
    shlft := shlft - (2 - shrgt)
    sio.str(@zero_frqa)
    sio.str(@zero_phsa)
    if (shlft)
      sio.str(@shl_mpcnd)
      sio.dec(shlft)
      sio.tx(13)
    sio.str(@mov_frqa)
    repeat while (multiplier)
      if (multiplier &= $7fff_ffff)
        shlft := 32 - >| multiplier
        sio.str(@shl_frqa)
        sio.dec(shlft)
        sio.tx(13)
        multiplier <<= shlft
    sio.str(@zero_frqa)
    sio.str(@mov_product)
    if (shrgt)
      sio.str(@shr_product)
      sio.dec(shrgt)
      sio.tx(13)
    
    
PRI get_dec | char
  repeat
    char := sio.rx
    case char
      "0" .. "9" : result := result * 10 + char - "0"
      13 : return result    

DAT

comment                 byte    13, "'Multiply by ", 0
zero_frqa               byte    "              mov       frqa,#0", 13, 0
zero_phsa               byte    "              mov       phsa,#0",13, 0
shl_mpcnd               byte    "              shl       mpcnd,#", 0
mov_frqa                byte    "              mov       frqa,mpcnd", 13, 0
shl_frqa                byte    "              shl       frqa,#", 0
mov_product             byte    "              mov       product,phsa", 13, 0
shr_product             byte    "              shr       product,#", 0

It produces optimized code that eliminates the final correction if the last two bits of the multiplier are zeroes. If only the last bit is zero, it reduces the amount of the correction by one bit. This is done to provide the greatest amount of headroom without producing an overflow. The user still needs to make sure that the operands are in a range that will not overflow, however.

Anyway, I've only cursorily tested the code. If you find any errors, please post the issue here. Thanks.

-Phil

tonyp12 · 2014-05-11 09:09

Useful if you always multiply with the same number or a few fixed numbers as you could fit a few in a cog, so maybe expand the Spin-generating-code to allow 24 of them?
And if someone really what to save 4 cycles by getting rid of shr , setup CTRB to 20mhz to feed CTRA on the same pin(?) instead of always, but waste 1 pin

Over on msp430 forum I was showed what I learned to create the fastest 8bit x 8bit ->16bit multiplication ever made for the msp430
a jnc take two cycles so instead I do +1 or +2 to PC
For the last years most msp's have a 0 fixed for bit0 of PC so +1 does noting but +2 skips to the next opcode.

msp430 also have counters, but no frqa so always +1 but maybe I can still do some tricks with it.

I was able to change the eight jnc to addc = 8 cycles saved
subtract one for the added inv = now only 20-to-28 cycles !!
It now also auto AND 0xFF the values coming in, without extra cycles.


multiply2   inv.b  R13     ; invert and clears bit 8-F
            add.b  #0,R12  ; clear c and clears bit 8-F 
            swpb   R12
            rrc.w  r12
_bit0       rrc.w  r13
            addc.w #1,PC
            add.w  r12,r13            
_bit1       rrc.w  r13
            addc.w #1,PC
            ....

kuroneko · 2014-05-12 00:16

tonyp12 wrote: »

And if someone really what to save 4 cycles by getting rid of shr , setup CTRB to 20mhz to feed CTRA on the same pin(?) instead of always, but waste 1 pin

FWIW, I wasted one of those pins [thread=125046]about 4 years ago[/thread] ... (why is it wasted when it fulfills a function?) ...

@PhiPi: Wouldn't a LOGIC.always be a more logical choice (movi ...)? Not relevant for your example but if it becomes part of something bigger then you have to keep an eye on it.

Fast, counter-augmented fixed multiply

Comments