C Expressive Enough for Idiomatic PASM?

I've been working on translating some C code to PASM. In the process, I've been wondering whether it's even reasonable to compile C to PASM, since PASM has so many idiomatic shortcuts for which C simply has no native semantics.
Here's part of the C code:
val = *indata++;
diff = val - valpred;
if(diff < 0)
{
    delta = 8;
    diff = (-diff);
}
else
{
    delta = 0;
}
vpdiff = (step >> 3);
if ( diff >= step ) {
    delta |= 4;
    diff -= step;
    vpdiff += step;
}
step >>= 1;
if ( diff >= step ) {
    delta |= 2;
    diff -= step;
    vpdiff += step;
}
step >>= 1;
if ( diff >= step ) {
    delta |= 1;
    vpdiff += step;
}
if ( (delta&8) != 0 )
{
    valpred -= vpdiff;
    if ( valpred < -32768 )
        valpred = -32768;
}
else
{
    valpred += vpdiff;
    if ( valpred > 32767 )
        valpred = 32767;
}
index += indexTable[delta];
if ( index < 0 ) index = 0;
else if ( index > 88 ) index = 88;
step = stepsizeTable[index];
outputbuffer = (delta << 4);
Here's what I would consider a "nearly-brain-dead" translation to PASM, albeit with some simple optimizations already in place. It results in 51 PASM instructions:
'val = *indata++;
rdlong val,indata
add indata,#4
'diff = val - valpred;
mov diff,val
sub diff,valpred
'if(diff < 0)
cmps diff,#0 wc
if_nc jmp #:L001
'{
' delta = 8;
' diff = (-diff);
'}
mov delta,#8
neg diff,diff
jmp #:L002
'else
'{
' delta = 0;
'}
:L001 mov delta,#0
'vpdiff = (step >> 3);
:L002 mov vpdiff,step
shr vpdiff,#3
'if ( diff >= step ) {
cmp diff,step wc
if_c jmp #:L003
' delta |= 4;
' diff -= step;
' vpdiff += step;
'}
or delta,#4
sub diff,step
add vpdiff,step
'step >>= 1;
:L003 shr step,#1
'if ( diff >= step ) {
cmp diff,step wc
if_c jmp #:L004
' delta |= 2;
' diff -= step;
' vpdiff += step;
'}
or delta,#2
sub diff,step
add vpdiff,step
'step >>= 1;
:L004 shr step, #1
'if ( diff >= step ) {
cmp diff,step wc
if_c jmp #:L005
' delta |= 1;
' vpdiff += step;
'}
or delta,#1
add vpdiff,step
'if ( (delta&8) != 0 )
:L005 test delta,#8 wz
if_z jmp #:L006
'{
' valpred -= vpdiff;
' if ( valpred < -32768 ) valpred = -32768;
'}
sub valpred,vpdiff
cmps valpred,minword wc
if_c mov valpred,minword
jmp #:L007
'else
'{
' valpred += vpdiff;
' if ( valpred > 32767 ) valpred = 32767;
'}
:L006 add valpred,vpdiff
cmps valpred,maxword wz,wc
if_nz_and_nc mov valpred,maxword
'index += indexTable[delta];
:L007 movs :L008,#indexTable
add :L008,delta
nop
:L008 add index,0-0
'if ( index < 0 ) index = 0;
cmps index,#0 wc
if_c mov index,#0
if_c jmp #:L009
'else if ( index > 88 ) index = 88;
cmp index,#88 wc,wz
if_nz_and_nc mov index,#88
'step = stepsizeTable[index];
:L009 movs :L010,#stepsizeTable
add :L010,index
nop
:L010 mov step,0-0
'outputbuffer = delta;
wrlong delta,outputbuffer
Here's the same code translated into more idiomatic PASM (37 instructions):
'val = *indata++;
rdlong val,indata
add indata,#4
'diff = val - valpred;
mov diff,val
subs diff,valpred wc
'if(diff < 0)
'{
' delta = 8;
' diff = (-diff);
'}
'else
'{
' delta = 0;
'}
if_c mov delta,#8
if_nc mov delta,#0
abs diff,diff
'vpdiff = (step >> 3);
mov vpdiff,step
shr vpdiff,#3
'if ( diff >= step ) {
cmp diff,step wc
' delta |= 4;
' diff -= step;
' vpdiff += step;
'}
if_nc or delta,#4
if_nc sub diff,step
if_nc add vpdiff,step
'step >>= 1;
shr step,#1
'if ( diff >= step ) {
cmp diff,step wc
' delta |= 2;
' diff -= step;
' vpdiff += step;
'}
if_nc or delta,#2
if_nc sub diff,step
if_nc add vpdiff,step
'step >>= 1;
shr step,#1
'if ( diff >= step ) {
cmp diff,step wc
' delta |= 1;
' vpdiff += step;
'}
if_nc or delta,#1
if_nc add vpdiff,step
'if ( (delta&8) != 0 )
'{
' valpred -= vpdiff;
' if ( valpred < -32768 ) valpred = -32768;
'}
'else
'{
' valpred += vpdiff;
' if ( valpred > 32767 ) valpred = 32767;
'}
test delta,#8 wc
sumc valpred,vpdiff
mins valpred,minword
maxs valpred,maxword
'index += indexTable[delta];
movs :L001,#indexTable
add :L001,delta
nop
:L001 add index,0-0
'if ( index < 0 ) index = 0;
mins index,#0
'else if ( index > 88 ) index = 88;
maxs index,#88
'step = stepsizeTable[index];
movs :L002,#stepsizeTable
add :L002,index
nop
:L002 mov step,0-0
'outputbuffer = delta;
wrlong delta,outputbuffer
With further hand finessing (e.g., reordering statements to eliminate the nops), I've been able to reduce the equivalent PASM to 33 instructions.
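For example, the nop after each movs/add pair is only there because a patched instruction must not execute immediately after the instruction that patches it, so any useful, unrelated instruction can fill that slot instead. One possible rearrangement of the two table lookups (a sketch of the idea only, not necessarily my exact 33-instruction version):

            movs    :L001,#indexTable
            add     :L001,delta
            movs    :L002,#stepsizeTable    ' unrelated setup absorbs the first nop slot
:L001       add     index,0-0
            mins    index,#0
            maxs    index,#88
            add     :L002,index
            wrlong  delta,outputbuffer      ' the final store absorbs the second nop slot
:L002       mov     step,0-0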
So my questions are these:
1. Can an optimizing C compiler even hope to produce efficient, idiomatic PASM?
2. If "no" to #1, are we doing users any favors by providing a C-to-PASM compiler if it misleads them into believing that they're getting the best performance possible from the Propeller?
3. If "yes" to #1, will sufficient effort be put into the planned compiler to do so?
Whatever the answers, it will be a good exercise to compare the compiler output to hand-translated PASM in order to uncover missed opportunities for optimization -- assuming a compiler can actually exploit them.
-Phil
Comments
That, of course, is LMM code, so it will have some funny-looking jumps and such in it, but basically you will see what to expect if it were native PASM.
I have never been totally convinced that compiling C to PASM is sensible for the Prop, because:
1. PASM has no convenient stack pointer or stack ops, so you have to synthesize them in code.
2. PASM has no pointers. For in-cog PASM/data you have to synthesize pointers with self-modifying code (see the sketch after this list). Perhaps not so bad for data in HUB using rd/wrlong and friends, but then what about function pointers for in-cog code?
3. The generated code size is too big: all instructions are 32 bits, and we are memory constrained.
4. Generating LMM is a neat solution to much of the above, but it requires a VM to run it and is hence slow.
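To illustrate point 2, here is a minimal sketch of what a synthesized "write through a pointer" to a cog register looks like (the names are just for illustration): the destination field of an instruction has to be patched before it runs, and the patched instruction must not be the very next one to execute.

wr_indirect movd    :store,ptr      ' ptr holds a cog register address (0..495)
            nop                     ' one instruction of separation before the patch takes effect
:store      mov     0-0,value       ' destination field is filled in at run time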
All in all it's hard to beat Spin for compact code and PASM for speed.
The Prop 2 will have more speed and space to give C room to breathe, but as a C engine it's still going to be a bit sad compared to similarly clocked devices.
Still, we might be surprised what optimizations the C compiler gurus can come up with.
Possibly not for the purists, but for the rest of us I would be thinking the answer is yes.
My understanding of some of the Basic versions for the Prop is that they translate Basic instructions mindlessly into PASM blobs. That works; it isn't optimal, but it does run faster than interpreted Basic and is better than not having anything.
Then there are PASM dummies like me who are dangerous enough to write some code that works, but can't work out why adding a NOP in one cog's code would crash the code running in another cog. For people like me, C to PASM is going to be better than nothing.
In practice, if I wanted to use this in earnest, I'd code the C, compile it, and if it ran fast enough for the purpose and fit inside the cog, leave it as it is. An example might be a mouse driver or a keyboard driver where speed is not critical.
And if it does not fit, or it is not fast enough, well, then you would have to get your hands dirty and start pulling the code to bits one line at a time. Even then, having a skeleton of working code that is too slow is still probably a better place to start than having no code at all.
I guess if one starts to see patterns in code that can be optimised, one can add that to a compiler?
In my view, the answer is that if your problem calls for assembly language, then use assembly language. Don't try and use (and abuse) C. C is a "universal assembler" mainly because it is "universal" - not because it makes a good replacement for an assembler. Yes, it can approach the efficiency of assembly language for problems that require the subset of the instruction set that the C code generator implements well - but most C code generators will only make effective use of a small proportion of the available instruction set and addressing modes of a typical CPU. And the more complex the CPU, the smaller the subset. This is all quite normal. Trying to correctly identify candidate cases and then correctly use all the obscure instructions and addressing modes that exist in even a moderately complex CPU is very difficult - and it follows a law of diminishing returns.
Ross.
P.S. If no one has done it by the time I get home, I'll compile Phil's code with Catalina and post the results - I don't claim Catalina has a spectacularly efficient code generator - just one that works reasonably well in most cases.
Actual Catalina code is attached below - 91 longs. Not optimal, perhaps - but it is worth noting that this is LMM PASM (i.e. not cog PASM, which your example is). An LMM PASM compiler cannot assume there will be enough space available in the cog to store all the required data (in general there would not be, since such a code snippet would just be a small part of a much larger program) - so it would generally have to use Hub RAM instead. It is the overhead of the Hub access that is one reason the code is larger.
Sorry, but Catalina does not nicely intersperse the C code as comments, so it's a bit unreadable:
        mov     r11, FP
        sub     r11, #-(-8)
        rdlong  r11, r11
        mov     r10, r11
        adds    r10, #4
        jmp     #LODF
        long    -8
        wrlong  r10, RI
        rdlong  r11, r11
        jmp     #LODF
        long    -4
        wrlong  r11, RI
        mov     r11, FP
        sub     r11, #-(-4)
        rdlong  r11, r11
        mov     r16, r11
        subs    r16, r13
        cmps    r16, #0 wz,wc
if_ae   add     PC,#(@C_func_2-@:Opt_1)
:Opt_1
        mov     r15, #8
        neg     r16, r16
        add     PC,#(@C_func_3-@:Opt_2)
:Opt_2
C_func_2
        mov     r15, #0
C_func_3
        mov     r12, r17
        sar     r12, #3
        cmps    r16, r17 wz,wc
if_b    add     PC,#(@C_func_4-@:Opt_3)
:Opt_3
        or      r15, #4
        subs    r16, r17
        adds    r12, r17
C_func_4
        sar     r17, #1
        cmps    r16, r17 wz,wc
if_b    add     PC,#(@C_func_6-@:Opt_4)
:Opt_4
        or      r15, #2
        subs    r16, r17
        adds    r12, r17
C_func_6
        sar     r17, #1
        cmps    r16, r17 wz,wc
if_b    add     PC,#(@C_func_8-@:Opt_5)
:Opt_5
        or      r15, #1
        adds    r12, r17
C_func_8
        mov     r11, r15
        and     r11, #8
        cmps    r11, #0 wz
if_z    add     PC,#(@C_func_10-@:Opt_6)
:Opt_6
        subs    r13, r12
        jmp     #LODA
        long    @C_func_L000014
        rdlong  r11, RI
        cmps    r13, r11 wz,wc
if_ae   add     PC,#(@C_func_11-@:Opt_7)
:Opt_7
        jmp     #LODA
        long    @C_func_L000014
        rdlong  r13, RI
        add     PC,#(@C_func_11-@:Opt_8)
:Opt_8
C_func_10
        adds    r13, r12
        jmp     #LODA
        long    @C_func_L000019
        rdlong  r11, RI
        cmps    r13, r11 wz,wc
if_be   add     PC,#(@C_func_17-@:Opt_9)
:Opt_9
        jmp     #LODA
        long    @C_func_L000019
        rdlong  r13, RI
C_func_17
C_func_11
        mov     r11, r15
        shl     r11, #2
        mov     r10, FP
        sub     r10, #-(-48)
        adds    r11, r10
        rdlong  r11, r11
        adds    r14, r11
        cmps    r14, #0 wz,wc
if_ae   add     PC,#(@C_func_22-@:Opt_10)
:Opt_10
        mov     r14, #0
        add     PC,#(@C_func_23-@:Opt_11)
:Opt_11
C_func_22
        cmps    r14, #88 wz,wc
if_be   add     PC,#(@C_func_24-@:Opt_12)
:Opt_12
        mov     r14, #88
C_func_24
C_func_23
        mov     r11, r14
        shl     r11, #2
        mov     r10, FP
        sub     r10, #-(-88)
        adds    r11, r10
        rdlong  r17, r11
        mov     r11, r15
        shl     r11, #4
        jmp     #LODF
        long    -92
        wrlong  r11, RI
        ...
C_func_L000019  long    32767
C_func_L000014  long    -32768
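For anyone not familiar with LMM: the add PC,#(...) jumps above only make sense in the context of an LMM kernel - a small cog-resident loop that fetches 32-bit PASM instructions one at a time from hub RAM and executes them in place. In its simplest generic form (a sketch of the idea only, not the actual Catalina kernel) the fetch loop looks like this:

lmm_loop    rdlong  lmm_instr,PC    ' fetch the next instruction from hub RAM
            add     PC,#4           ' advance the software program counter
lmm_instr   nop                     ' the fetched instruction executes here
            jmp     #lmm_loop       ' real kernels unroll this loop for speed

Every hub access costs hub-window cycles, which is one reason LMM code is slower than the same instructions running natively in the cog.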
You should be feeling quite proud of that. Catalina is doing very well.
Here are some code sizes for Phil's snippet (wrapped into a void function) for some compilers I have lying around:

Code sizes for Phil Pilgrim's example C code
--------------------------------------------
Optimization = -O2

Compiler        Instructions   Bytes   Notes
--------        ------------   -----   -----
Catalina              91        364    32 bit instructions
XMOS C                96        192    16 bit instructions
GCC Intel             91         ??    Variable length instructions
GCC ZPU (ZOG)        281        281    8 bit instructions
So Catalina is on a par with x86 GCC instruction-wise (although not all the x86 instructions will be 32 bits, so the x86 code may be smaller).
XMOS C is a clear winner on sheer code size due to its compact instructions.
Poor old Zog is not turning in the hoped-for small code size.
None of this is relevant to Phil's question but at least it shows Catalina is doing well. And perhaps hints at the unsuitability of C for COG PASM code.
I'd post it now if I weren't on a bus thumbing a mobile phone.
I get a slightly different answer for GCC on the x86. My version of the compiler produces 66 instructions with -O2, and that includes a few instructions for calling overhead. The current GCC P1 compiler generates 71 instructions for the same code. If I force all of the variables into registers except for outputbuffer it generates 62 instructions. I expect that we'll see some improvements in the compiler that will reduce the number of instructions. The source file I used is attached below.
Dave
<testsub>
B430        push    {r4-r5}
F2400304    movw    r3, #4
F2C10300    movt    r3, #0x1000
6819        ldr     r1, [r3]
F8512B04    ldr     r2, [r1], #4
6019        str     r1, [r3]
F2400314    movw    r3, #20
F2C10300    movt    r3, #0x1000
681C        ldr     r4, [r3]
1B12        subs    r2, r2, r4
BF46        itte    mi
4252        rsbmi   r2, r2, #0
2308        movmi   r3, #8
2300        movpl   r3, #0
F240010C    movw    r1, #12
F2C10100    movt    r1, #0x1000
6808        ldr     r0, [r1]
EA4F01E0    asr.w   r1, r0, #3
4282        cmp     r2, r0
BFA2        ittt    ge
F0430304    orrge   r3, r3, #4
EBC00202    rsbge   r2, r0, r2
1809        addge   r1, r1, r0
EA4F0060    asr.w   r0, r0, #1
4282        cmp     r2, r0
BFA2        ittt    ge
F0430302    orrge   r3, r3, #2
EBC00202    rsbge   r2, r0, r2
1809        addge   r1, r1, r0
EA4F0060    asr.w   r0, r0, #1
F240050C    movw    r5, #12
F2C10500    movt    r5, #0x1000
6028        str     r0, [r5]
4282        cmp     r2, r0
BFA4        itt     ge
F0430301    orrge   r3, r3, #1
1809        addge   r1, r1, r0
F0130F08    tst     r3, #8
D012        beq     0x000002FC <__text_start__+0x98>
1A61        subs    r1, r4, r1
F2400214    movw    r2, #20
F2C10200    movt    r2, #0x1000
6011        str     r1, [r2]
F5114F00    cmn     r1, #0x8000
DA19        bge     0x0000031C <__text_start__+0xB8>
F2400214    movw    r2, #20
F2C10200    movt    r2, #0x1000
F44F4100    mov.w   r1, #0x8000
F6CF71FF    movt    r1, #0xFFFF
6011        str     r1, [r2]
E00F        b       0x0000031C <__text_start__+0xB8>
1909        adds    r1, r1, r4
F2400214    movw    r2, #20
F2C10200    movt    r2, #0x1000
6011        str     r1, [r2]
F5B14F00    cmp.w   r1, #0x8000
BFA1        itttt   ge
F2400214    movwge  r2, #20
F2C10200    movtge  r2, #0x1000
F64771FF    movwge  r1, #0x7FFF
6011        strge   r1, [r2]
F2400210    movw    r2, #16
F2C10200    movt    r2, #0x1000
6812        ldr     r2, [r2]
F8521023    ldr.w   r1, [r2, r3, lsl #2]
2958        cmp     r1, #0x58
BFA8        it      ge
2158        movge   r1, #0x58
F2400208    movw    r2, #8
F2C10200    movt    r2, #0x1000
EA2171E1    bic.w   r1, r1, r1, asr #31
6812        ldr     r2, [r2]
F8521021    ldr.w   r1, [r2, r1, lsl #2]
F240020C    movw    r2, #12
F2C10200    movt    r2, #0x1000
6011        str     r1, [r2]
F2400200    movw    r2, #0
F2C10200    movt    r2, #0x1000
EA4F1303    lsl.w   r3, r3, #4
6013        str     r3, [r2]
BC30        pop     {r4-r5}
4770        bx      lr
BF00        nop
I make it 83 instructions. I think that they are mostly single-cycle, and can run at up to 120 MHz.
Can you show us the 62-instruction output?
-Phil
Of all the idiomatic optimizations, I thought one of the hardest would be to make the translation from:
if ( index < 0 ) index = 0; else if ( index > 88 ) index = 88;
to
maxs r7, #88
mins r7, #0
Yet your compiler seems to manage that nicely. (The hardest may be recognizing opportunities for sumc, muxc, etc.) There's a lot of working-register thrashing going on, though. Is this a result of GCC having to treat variables as load/store only?
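By "load/store" I mean that a hub-resident variable has to be fetched into a working register before each operation and written back afterwards, whereas a local the compiler can keep in a register needs no such traffic. A small C sketch of the contrast (the names are made up; marking the hub variable volatile forces that traffic even at -O2):

volatile int hub_step;              /* lives in hub RAM: re-read and re-written on every use */

int halve_twice(void)
{
    int s = hub_step;               /* one load, then register-only arithmetic */
    s >>= 1;
    s >>= 1;
    hub_step >>= 1;                 /* load, shift, store ...                  */
    hub_step >>= 1;                 /* ... then load, shift, store again       */
    return s;
}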
-Phil
int addsub(int which, int a, int b)
{
    if (which)
        return a+b;
    return a-b;
}

Generated assembly code:

addsub      cmps    r0, #0  wz,wc
            mov     r0, r1
    IF_NE   add     r0, r2
    IF_E    sub     r0, r2
            jmp     lr
That's an example where sumz would be optimum, but identifying those opportunities will not be easy.
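For comparison, a sumz version might look something like this (just a sketch, assuming the same register usage as above: r0 = which, r1 = a, r2 = b, result in r0):

addsub      cmps    r0,#0  wz       ' Z = 1 when which == 0
            mov     r0,r1
            sumz    r0,r2           ' Z clear: r0 += r2 (a+b);  Z set: r0 -= r2 (a-b)
            jmp     lr              ' return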
-Phil
Phil - it may be a good idea if you modify your code fragment to turn it into a function within a compilable C program (i.e. declaring all variables etc) - then we could all compare more easily since we could just extract the code corresponding to the function.
Dave - a compiler that achieves ~70 longs for cog-based PASM when a very sophisticated user can only manage to squeeze it down to ~30 is pretty darn good!
Ross.
It's even better for non-sophisticated users.
However, the real optimization is not purely at the instruction level but at the algorithm level. For example, a couple of decades ago I achieved a 2:1 optimization over x86 C code (which I'd already 2:1 optimized) with hand-coded assembly (performing a Galois Field multiplication for error correction coding) by coding the routine to use the carry bit effectively - something which wasn't possible in the C code and thus required significantly more complex code. So even on typical microprocessors there are idiomatic features which can't be expressed in C.
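To see why, consider a single GF(2^8) multiply-by-x step (a sketch using the AES polynomial 0x11B purely for illustration, not necessarily the field I was working with): in C the shifted-out bit has to be captured with a mask and a branch, whereas in assembly it lands in the carry flag and can drive a conditional XOR directly.

static unsigned char gf_xtime(unsigned char a)
{
    unsigned char msb = a & 0x80;   /* capture the bit the shift will discard          */
    a <<= 1;                        /* in assembly, the shift itself sets the carry    */
    if (msb)                        /* in assembly, a carry-conditional XOR, no branch */
        a ^= 0x1B;                  /* reduce modulo the field polynomial              */
    return a;
}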
However, in my mind the target platform will have a greater impact on any C compiler for the Propeller. A compiler written for just the base resources (32K shared memory + 496 instructions/registers per cog) will generate significantly different code than one written for the C3 or any other platform with external RAM. The former will likely generate PASM and throw an error if the program is too large to fit in COG RAM, while the latter may use LMM with overlays / library routines.
And for Phil's question regarding the speed of C versus PASM, I guess the real question is whether the resulting code is "fast enough" for the task at hand. Experienced programmers know it's possible to wring more speed out of a program by coding it in assembly. (How much more, unfortunately, cannot be answered before it's attempted.) The question is whether it's required and worth the effort. For code which is I/O bound I suspect compiled C will be "fast enough", hopefully faster than SPIN.
I can read C, but I've never programmed in it, so I'd probably do it wrong.
BTW, my main reason for bringing this up is to raise the question of whether the C compiler will meet users' expectations for code size and execution speed. Rather than beating our heads out of the gate over raw optimality, the better question is, "How does the ratio of compiled C to hand-optimized PASM compare with that of the best C compilers for other micros, e.g. AVR, PIC, XMOS, ARM, etc.?" That's the yardstick by which experienced programmers will judge the Propeller compiler. However, the 496-long cog memory limit may place a greater expectation on optimization for the Propeller than it would for another chip.
-Phil
Ok - if no-one else does so, I'll post my version when I get home tonight.
Ross.
Turns out that my test was the same, except I had all vars outside the function, whereas Dave made some of them local to the function: val, diff, delta, vpdiff and index.
This benchmarking is tricky business.
Yes, it will make a big difference how and where the variables are declared. Attached is the code I suggest we use. It makes no sense, but it should compile under any C compiler. I suppose some compilers might be clever enough to optimize away whole chunks of the test code - but to prevent that we would have to change Phil's original source code.
Ross.
That's no good. I was worried about this already. Using -O2 on GCC for Intel removes the whole function and the call. It has no output so it gets optimized away nicely:)
Turns out that having global vars, as mine and Dave's versions do, prevents that. But I think even then it's not foolproof. One would have to add the "volatile" attribute to the inputs and outputs to be really sure. One needs "volatile" anyway, as we assume that the variables in HUB can be read or written at any time, so we don't want the compiler squirreling them away in registers as it works through the code.
In this case we need to be sure that whatever would be in HUB in the intended PASM is "volatile".
Blast! I had a suspicion that might happen. Perhaps we should all agree to turn off optimization altogether?
But first, try the updated attachment to my previous post - this one makes all the variables volatile parameters to the function, so I think the code within the function can't be optimized away. I also return the function value in the main program, so the function itself cannot be optimized away either.
Other than that, I'm afraid Phil may have to modify his original code.
Ross.
Absolutely not!
Given that we are currently targeting a very small memory space (code in COG, data in COG registers or HUB), I think we have to allow any and all optimizations that squeeze things in.
The way I see it is that all we have to do is distinguish between:
1) Variables in COG: basically local vars on the stack and/or things that get stashed in registers on a normal CPU - stuff that the world outside the function cannot see because it is in COG on the Prop.
2) Global variables: those that are outside the function, used as inputs or outputs. On the Prop they are in HUB.
So is it not as simple as making all variables that should be in HUB on the Prop global and marking them as "volatile"? Volatile because they can be read/written by processes other than the one running the function at any time.
I see that Dave's example has "cogmem" attributes for some variables so that the Prop compiler knows where they are. They should otherwise be "volatile" in other compilers.
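Something along these lines, perhaps - just a rough sketch of the shape I mean, not Ross's or Dave's actual attachment (the buffer size and the elided body are only placeholders):

volatile int  buffer[256];          /* everything that would live in HUB is a volatile global */
volatile int *indata = buffer;
volatile int  indexTable[16];
volatile int  stepsizeTable[89];
volatile int  valpred, step, index, outputbuffer;

void decode_step(void)
{
    int val, diff, delta;           /* locals: fair game for cog registers and optimization */

    val   = *indata++;              /* opening lines of Phil's fragment ...                 */
    diff  = val - valpred;
    delta = (diff < 0) ? 8 : 0;     /* ... remainder of the fragment elided here            */
    outputbuffer = (delta << 4);
}

int main(void)
{
    decode_step();
    return outputbuffer & 0xff;     /* consume the result so nothing is optimized away      */
}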
Well, I guess I agree about the optimization - I just thought I'd throw it out there as an idea, since we have already seen that no optimizer comes close to what a programmer can do by hand anyway.
However, I think we have now gone some way towards answering Phil's original questions. If even such a small fragment of C code takes two or three times the space of what a good programmer can achieve in assembler (acknowledging that the final result is very dependent on where the variables are located), then it doesn't really matter what the exact results are - cog space is so limited on the Propeller that cog-based C is going to struggle if it tries to position itself as an assembler replacement (i.e. for writing drivers etc).
Ross.
  2: int func(volatile int *indata, volatile int *indexTable, volatile int *stepsizeTable, volatile int val, volatile int diff, volatile int valpred, volatile int delta, volatile int step, volatile int vpdiff, volatile int index, volatile int outputbu
  3:
  4:
0x00000184 B5F0      PUSH    {r4-r7,lr}
  5: val = *indata++;
0x00000186 6800      LDR     r0,[r0,#0x00]
0x00000188 9F0A      LDR     r7,[sp,#0x28]
0x0000018A 9E06      LDR     r6,[sp,#0x18]
0x0000018C 9D08      LDR     r5,[sp,#0x20]
  6: diff = val - valpred;
0x0000018E 1B80      SUBS    r0,r0,r6
  7: if(diff < 0)
  8: {
0x00000190 D502      BPL     0x00000198
  9: delta = 8;
0x00000192 2308      MOVS    r3,#0x08
 10: diff = (-diff);
 11: }
 12: else
 13: {
0x00000194 4240      RSBS    r0,r0,#0
0x00000196 E000      B       0x0000019A
 14: delta = 0;
 15: }
 16:
 17: vpdiff = (step >> 3);
 18:
0x00000198 2300      MOVS    r3,#0x00
 19: if ( diff >= step ) {
0x0000019A 42A8      CMP     r0,r5
0x0000019C DB02      BLT     0x000001A4
 20: delta |= 4;
0x0000019E F0430304  ORR     r3,r3,#0x04
 21: diff -= step;
 22: vpdiff += step;
 23: }
0x000001A2 1B40      SUBS    r0,r0,r5
 24: step >>= 1;
0x000001A4 106D      ASRS    r5,r5,#1
 25: if ( diff >= step ) {
0x000001A6 42A8      CMP     r0,r5
0x000001A8 DB02      BLT     0x000001B0
 26: delta |= 2;
0x000001AA F0430302  ORR     r3,r3,#0x02
 27: diff -= step;
 28: vpdiff += step;
 29: }
0x000001AE 1B40      SUBS    r0,r0,r5
 30: step >>= 1;
0x000001B0 106D      ASRS    r5,r5,#1
 31: if ( diff >= step ) {
0x000001B2 42A8      CMP     r0,r5
0x000001B4 DB01      BLT     0x000001BA
 32: delta |= 1;
 33: vpdiff += step;
 34: }
 35:
0x000001B6 F0430301  ORR     r3,r3,#0x01
 36: if ( (delta&8) != 0 )
 37: {
 38: valpred -= vpdiff;
 39: if ( valpred < -32768 )
 40: valpred = -32768;
 41: }
 42:
 43: else
 44: {
 45: valpred += vpdiff;
 46: if ( valpred > 32767 )
 47: valpred = 32767;
 48: }
 49:
0x000001BA EA4F7003  LSL     r0,r3,#28
 50: index += indexTable[delta];
0x000001BE F8510023  LDR     r0,[r1,r3,LSL #2]
0x000001C2 19C0      ADDS    r0,r0,r7
 51: if ( index < 0 ) index = 0;
0x000001C4 D501      BPL     0x000001CA
0x000001C6 2000      MOVS    r0,#0x00
0x000001C8 E002      B       0x000001D0
 52: else if ( index > 88 ) index = 88;
0x000001CA 2858      CMP     r0,#0x58
0x000001CC DD00      BLE     0x000001D0
0x000001CE 2058      MOVS    r0,#0x58
 53: step = stepsizeTable[index];
 54:
0x000001D0 F8520020  LDR     r0,[r2,r0,LSL #2]
 55: outputbuffer = (delta << 4);
 56:
 57: return outputbuffer;
 58:
0x000001D4 0118      LSLS    r0,r3,#4
 59: }
 60:
 61:
0x000001D6 BDF0      POP     {r4-r7,pc}