Pasm divide by 10 and 100
Hello,
Is this less efficient, more efficient or no difference to any other PASM ways to divide by 10 or 100? 50,000/10 = 6776 clk cycles, 1,500,000/10 = 11,320 clk cycles. 50,000/100 took 1404 clk cycles, and 1,500,000/100 took 1848 clk cycles. I know that when you get closer to 0 or closer to 2 billion (for the number * modifier) the numbers aren't as exact, but I think they are close enough for me so far.
I tried to figure out the long way to do this in a loop, and didn't have any success. I can't attach the project because of the work firewall, but here is the gist of it.
I'd like to be able to divide by any number though, with self modifying code and I'm not sure how to do that.
Is this less efficient, more efficient or no difference to any other PASM ways to divide by 10 or 100? 50,000/10 = 6776 clk cycles, 1,500,000/10 = 11,320 clk cycles. 50,000/100 took 1404 clk cycles, and 1,500,000/100 took 1848 clk cycles. I know that when you get closer to 0 or closer to 2 billion (for the number * modifier) the numbers aren't as exact, but I think they are close enough for me so far.
I tried to figure out the long way to do this in a loop, and didn't have any success. I can't attach the project because of the work firewall, but here is the gist of it.
I'd like to be able to divide by any number though, with self modifying code and I'm not sure how to do that.
CON
_clkmode = xtal1+pll16x
_clkfreq = 80_000_000
OBJ
DEBUG : "FullDuplexSerial" ' for debugging
VAR
LONG retblock, done, pst1, pst2, pst3, pst4, pst5, pst6, pst7, pst8
PUB Main
DEBUG.start(31, 30, 0, 115200)
waitcnt(clkfreq + cnt)
cognew(@math, @retblock)
repeat
debug.dec(pst1)
debug.tx(13)
debug.dec(pst2)
debug.tx(13)
debug.dec(pst3)
debug.tx(13)
debug.dec(pst4)
debug.tx(13)
debug.dec(pst5)
debug.tx(13)
debug.dec(pst6)
debug.tx(13)
debug.dec(pst7)
debug.tx(13)
debug.tx(13)
debug.dec(pst8)
debug.tx(13)
debug.tx(13)
debug.tx(13)
waitcnt(clkfreq + cnt)
DAT
math
add pstptr1, par ' @pst
add pstptr2, par ' @pst2
add pstptr3, par ' @pst3
add pstptr4, par ' @pst4
add pstptr5, par ' @pst5
add pstptr6, par ' @pst6
add pstptr7, par ' @pst7
add pstptr8, par ' @pst8
mov par1, fiftyK
MOV par2, #5
'call #divideByTen
call #divideByHun
jmp #end' /10
divideByTen
mov startCnt, cnt
mov divideByTenLoopCnt, #205
mov dividedByTenSum, #0
wrlong divideByTenLoopCnt, pstptr2
:div add dividedByTenSum, par1 ' multiply by 205, by adding the value 205 times
wrlong dividedByTenSum, pstptr3
wrlong par1, pstptr1
djnz divideByTenLoopCnt, #:div
shr dividedByTenSum, #11 ' divide by 2048 (net divide by /10) by shifting the value right by 13
wrlong dividedByTenSum, pstptr4
mov subSkew, #0
mov numTemp, par1
:rem
cmp numTemp, divideByTenSkew WZ, WC
if_z_or_nc add subSkew, #1
if_z_or_nc sub numTemp, divideByTenSkew
if_z_or_nc wrlong subSkew, pstptr5
if_z_or_nc jmp #:rem
mov endCnt, cnt
mov dividedByTenSumT, dividedByTenSum
sub dividedByTenSumT, subSkew
wrlong dividedByTenSumT, pstptr6
sub endCnt, startCnt
wrlong endCnt, pstptr7
MOV ret_val, dividedByTenSumT ' return product
divideByTen_ret RET
' /100
divideByHun
mov startCnt, cnt
mov divideByHunLoopCnt, #41
mov dividedByHunSum, #0
wrlong divideByHunLoopCnt, pstptr2
:div add dividedByHunSum, par1 ' multiply by 205, by adding the value 205 times
wrlong dividedByHunSum, pstptr3
wrlong par1, pstptr1
djnz divideByHunLoopCnt, #:div
shr dividedByHunSum, #12 ' divide by 2048 (net divide by /10) by shifting the value right by 13
wrlong dividedByHunSum, pstptr4
mov subSkew, #0
mov numTemp, par1
:rem
cmp numTemp, divideByHunSkew WZ, WC
if_z_or_nc add subSkew, #1
if_z_or_nc sub numTemp, divideByHunSkew
if_z_or_nc wrlong subSkew, pstptr5
if_z_or_nc jmp #:rem
mov endCnt, cnt
mov dividedByHunSumT, dividedByHunSum
sub dividedByHunSumT, subSkew
wrlong dividedByHunSumT, pstptr6
sub endCnt, startCnt
wrlong endCnt, pstptr7
MOV ret_val, dividedByHunSumT ' return product
divideByHun_ret RET
/1000
divideByThou
mov startCnt, cnt
mov divideByThouLoopCnt, #4
mov dividedByThouSum, #0
wrlong divideByThouLoopCnt, pstptr2
:div add dividedByThouSum, par1 ' multiply by 205, by adding the value 205 times
wrlong dividedByThouSum, pstptr3
wrlong par1, pstptr1
djnz divideByThouLoopCnt, #:div
shr dividedByThouSum, #12 ' divide by 2048 (net divide by /10) by shifting the value right by 13
wrlong dividedByThouSum, pstptr4
mov subSkew, #0
mov numTemp, par1
:rem
cmp numTemp, divideByThouSkew WZ, WC
if_z_or_nc add subSkew, #1
if_z_or_nc sub numTemp, divideByThouSkew
if_z_or_nc wrlong subSkew, pstptr5
if_z_or_nc jmp #:rem
mov endCnt, cnt
mov dividedByThouSumT, dividedByThouSum
add dividedByThouSumT, subSkew
wrlong dividedByThouSumT, pstptr6
sub endCnt, startCnt
wrlong endCnt, pstptr7
MOV ret_val, dividedByThouSumT ' return product
divideByThou_ret RET
end
'MOV p1, PAR
WRLONG ret_val, pstptr8
'ADD p1, #4
'WRLONG negone, p1
COGID p1
COGSTOP p1
' --------------------------------------------------------------------------------------------------
retblkptr long 0
doneptr long 4
pstptr1 long 8
pstptr2 long 12
pstptr3 long 16
pstptr4 long 20
pstptr5 long 24
pstptr6 long 28
pstptr7 long 32
pstptr8 long 36
par1 LONG 0
par2 LONG 0
ret_val LONG 0
divideByTenLoopCnt long 419431
divideByTenLoopCnt2 long 0
divideByTenLoopCntT long 205
divideByHundredLoopCnt long 41944
divideByTenSkew long 10273
dividedByTenSum long 0
fiveK long 5000
tenK long 10000
elevenK long 11000
fifteenK long 15000
twentyK long 20000
twentyFiveK long 25000
FiftyK long 50000
oneHundredFiftyK long 150000
onePtFiveMil long 1500000
fifteenMil long 15000000
fiftyMil long 50000000
subSkew long 0
numTemp long 0
dividedByTenSumT long 0
startCnt long 0
endCnt long 0
divideByHunLoopCnt long 41
dividedByHunSum long 0
dividedByHunSumT long 0
divideByHunSkew long 102731
divideByThouLoopCnt long 4
dividedByThouSum long 0
dividedByThouSumT long 0
divideByThouSkew long 41943
p1 RES
FIT 496
Comments
Just put the thing you want to divide in the dividend and the thing you want to divide by in the divisor. Then the result will appear in the quotient and the remainder will appear in the dividend.
' ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ' Unsigned Divide ' ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// LNDivide mov LNDivideQuotient, #0 ' Setup to divide. mov LNDivideBuffer, #0 ' mov LNDivideCounter, #32 ' cmp LNDivideDivsor, #0 wz ' Clear if dividing by zero. if_z mov LNDivideDividend, #0 ' if_z jmp #LNDivide_ret ' LNDivideLoopPre shr LNDivideDivsor, #1 wc, wz ' Align divisor MSB and count size. rcr LNDivideBuffer, #1 ' if_nz djnz LNDivideCounter, #LNDivideLoopPre ' LNDivideLoopPost cmpsub LNDivideDividend, LNDivideBuffer wc ' Preform division. rcl LNDivideQuotient, #1 ' shr LNDivideBuffer, #1 ' djnz LNDivideCounter, #LNDivideLoopPost ' LNDivide_ret ret ' Return. Remainder in dividend on exit. ' ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ' Unsigned Multiply ' ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// LNMultiply mov LNMultiplyProduct, #0 ' Clear product. LNMultiplyLoop shr LNMultiplyMultiplicand, #1 wc ' Preform multiplication. if_c add LNMultiplyProduct, LNMultiplyMultiplier ' shl LNMultiplyMultiplier, #1 ' tjnz LNMultiplyMultiplicand, #LNMultiplyLoop ' LNMultiply_ret ret ' Return. ' ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ' Data ' ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ' //////////////////////Run Time Variables///////////////////////////////////////////////////////////////////////////////////// LNDivideBuffer res 1 LNDivideCounter res 1 LNDivideDividend res 1 LNDivideDivsor res 1 LNDivideQuotient res 1 LNMultiplyMultiplicand res 1 LNMultiplyMultiplier res 1 LNMultiplyProduct res 1
I have used a techique like yours for processors that have a single-cycle multiply. You are approximating 1/10 as 204.8/2048. The value of 205 has an error of 0.1%, so you could use this technique to handle numbers that have a limited range. You need to add 1024 before shifting down by 11 bits to round. I didn't notice if your code did that. You also need to implement the multiply by 205 in a more efficient way. 205 is equal to %1100_1101. So you could implement this as follows:
pub x_div_10(x) result := x + x + x result := ((result << 6) + (result << 2) + x + 1024) >> 11
EDIT: Of course, in Spin it would be much more efficient to just do x/10. However, this technique should be fairly efficient in PASM.
http://www.hackersdelight.org/divcMore.pdf
Jonathan
x_div_10 mov temp x add temp,x add temp,x mov result,temp add result,#1024 >> 6 shl result,#6 shl temp,#2 add result,temp add result,x shr result,#11 x_div_10_ret
div10 mov t, a shl t, #1 add a, t mov t, a shl t, #4 add a, t mov t, a shl t, #8 add a, t add a, H3333 shr a, #17 div10_ret ret H3333 long $3333
Thanks for this code, I tested it and was quite happy!
Thanks for your replies, I did not think of the rounding technique, that was brilliant, I tried it in my code and it worked well and makes complete logical sense! So does the shift by X to get to 205 faster, rather than an add routine.
I tried Kye's object and it worked well, do you see any advantage to not using that for a /10, since it appears it can divide just about any number and is not limited to /10 or /100?
Is there a repository where I can look up a spin command and then view the routines the interpreter uses? That would be a huge help to PASM beginners like me.
May I propose a fast and very accurate algorithm for the division by 10?
As you know, division by 10 is the same as to multiply with 0.1.
2^20 is 1_048_576. It is easy to divide with this number in PASM.
104_856 / 1_048_576 is about 0.0999985, some 2ppm accurate to 0.1
However, 104_856 factorisable as 2^3 x 3 x 17 x 257. In other words
104_856 = 2^3 x (2^2 - 1) x (2^4 + 1) x (2^8 + 1)
This means that it is easy to multiply with 104_856 in PASM with left
shifts and additions/subtraction.
The very pseudo algorithm goes as follows:
Rotate argument 1 left
Subtract original argument
Rotate new argument 2 left
Add new argument to result
Rotate result 8 left.
Add previous result once to it.
Rotate final result 17 times right.
(See some similar code in Dave Hein post)
I guess this will take about 12-15 machine cycles and can be used
up to 40_000 without error.
Cheers,
Istvan
Thanks for your post. I have never used a rotate command for math before, I guess that would be RCL/RCR in PASM?
Kye kind of gave me the "keys to the castle" when he gave me this code below, I just call it and then return a value to a variable, I don't know if your way is faster, but this was from the spin interpreter and I've just been using this as I spent days working on an algorithm and I use this just like an object when I call it.
' ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ' Unsigned Divide ' ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// LNDivide mov LNDivideQuotient, #0 ' Setup to divide. mov LNDivideBuffer, #0 ' mov LNDivideCounter, #32 ' cmp LNDivideDivsor, #0 wz ' Clear if dividing by zero. if_z mov LNDivideDividend, #0 ' if_z jmp #LNDivide_ret ' LNDivideLoopPre shr LNDivideDivsor, #1 wc, wz ' Align divisor MSB and count size. rcr LNDivideBuffer, #1 ' if_nz djnz LNDivideCounter, #LNDivideLoopPre ' LNDivideLoopPost cmpsub LNDivideDividend, LNDivideBuffer wc ' Preform division. rcl LNDivideQuotient, #1 ' shr LNDivideBuffer, #1 ' djnz LNDivideCounter, #LNDivideLoopPost ' 'wrlong LNDivideQuotient, pstptr7 'wrlong LNDivideDividend, pstptr8 mov divisionResult, LNDivideQuotient LNDivide_ret ret ' Return. Remainder in dividend on exit. ' ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ' Unsigned Multiply ' ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// LNMultiply mov LNMultiplyProduct, #0 ' Clear product. LNMultiplyLoop shr LNMultiplyMultiplicand, #1 wc ' Preform multiplication. if_c add LNMultiplyProduct, LNMultiplyMultiplier ' shl LNMultiplyMultiplier, #1 ' tjnz LNMultiplyMultiplicand, #LNMultiplyLoop ' mov multiplicationResult, LNMultiplyProduct LNMultiply_ret ret ' Return. ' ///////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// ' Data ' /////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
Kye's code is a fast and optimized "traditional" binary division, that always takes about 104-116 machine cycles.
Its main advantage here, that it works with arbitrary arguments. If speed matters, it is an overkill for just dividing
small numbers with 10.
Dave Hein PASM code takes only 10 machine cycles. 10 times faster than the full division, but the result is
correct only for arguments less than 1027.
My method should be implemented in PASM with less than 20 machine cycles. Slower, but works correctly
up to 32K dividends. The interesting thing about it that the factorization of the multiplier (104_856) contains
only numbers that are powers of two plus/minus one (3, 17, 257). This allows for effective PASM coding.
In my previous answer I should wrote shift instead of rotate. One step shift of the binary numbers to the left
or the to the right is multiplication or division by two, respectively. For example to divide by 8 just shift
3 times to the right:
in PASM SHR dividend, #3 (1 machine cycle)
in SPIN dividend>>3 (Much less machine cycles than of "dividend := dividend / 8")
You can use rotate in binary arithmetics in many cases, usualy combined with carry to handle over or
underflow, for example.
Clarify "machine cycle" - its highly ambiguous to my mind...
Hi Mark,
I see, you even coded it long before I "found" it. It is only 12 machine cycles. 4 clock ticks
needed for the Prop for 1 instruction cycle, which I call 1 machine cycle, maybe because of
easier spelling.
Did you note, that one can expand the range of arguments with correct result by distributing
the 17 left shifts inside/after the multiplications? Maybe one can make a 31-bit divide_by_10
routine from your code.