C Expressive Enough for Idiomatic PASM?

Phil Pilgrim (PhiPi) · 2011-08-15 20:19

I've been working on translating some C code to PASM. In the process, I've been wondering whether it's even reasonable to compile C to PASM, since PASM has so many idiomatic shortcuts for which C simply has no native semantics.

Here's part of the C code:

val = *indata++;
diff = val - valpred;
if(diff < 0)
{
    delta = 8;
    diff = (-diff);
}
else
{
    delta = 0;
}

vpdiff = (step >> 3);

if ( diff >= step ) {
    delta |= 4;
    diff -= step;
    vpdiff += step;
}
step >>= 1;
if ( diff >= step  ) {
    delta |= 2;
    diff -= step;
    vpdiff += step;
}
step >>= 1;
if ( diff >= step ) {
    delta |= 1;
    vpdiff += step;
}

if ( (delta&8) != 0 )
{
    valpred -= vpdiff;
    if ( valpred < -32768 )
        valpred = -32768;
}
else
{
    valpred += vpdiff;
    if ( valpred > 32767 )
        valpred = 32767;
}

index += indexTable[delta];
if ( index < 0 ) index = 0;
else if ( index > 88 ) index = 88;
step = stepsizeTable[index];

outputbuffer = (delta << 4);

Here's what I would consider a "nearly-brain-dead" translation to PASM, but with some simple optimizations already in place, nonetheless. It results in 51 PASM instructions:

'val = *indata++;

              rdlong    val,indata
              add       indata,#4
                        
'diff = val - valpred;

              mov       diff,val
              sub       diff,valpred
              
'if(diff < 0)

              cmp       diff,#0 wc
        if_nc jmp       #:L001
        
'{
'   delta = 8;
'   diff = (-diff);
'}

              mov       delta,#8
              neg       diff,diff
              jmp       #:L002
              
'else
'{
'   delta = 0;
'}

:L001         mov       delta,#0
        
'vpdiff = (step >> 3);

:L002         mov       vpdiff,step
              shr       vpdiff,#3
        
'if ( diff >= step ) {

              cmp       diff,step wc
        if_c  jmp       #:L003
        
'  delta |= 4;
'  diff -= step;
'  vpdiff += step;
'}

              or        delta,#4
              sub       diff,step
              add       vpdiff,step

'step >>= 1;

:L003         shr       step,#1

'if ( diff >= step  ) {

              cmp       diff,step wc
        if_c  jmp       #:L004
        
'  delta |= 2;
'  diff -= step;
'  vpdiff += step;
'}

              or        delta,#2
              sub       diff,step
              add       vpdiff,step
              
'step >>= 1;

:L004         shr       step, #1

'if ( diff >= step ) {

              cmp       diff,step wc
        if_c  jmp       #:L005
        
'  delta |= 1;
'  vpdiff += step;
'}

              or        delta,#1
              add       vpdiff,step

'if ( (delta&8) != 0 )

:L005         test      delta,#8 wz
        if_z  jmp       #:L006
        
'{
'  valpred -= vpdiff;
'  if ( valpred < -32768 ) valpred = -32768;
'}

              sub       valpred,vpdiff
              cmps      valpred,minword wc
        if_c  mov       valpred,minword
              jmp       #:L007
              
'else
'{
'  valpred += vpdiff;
'  if ( valpred > 32767 ) valpred = 32767;
'}

:L006         add       valpred,vpdiff
              cmps      valpred,maxword wz,wc
 if_nz_and_nc mov       valpred,maxword

'index += indexTable[delta];

:L007         movs      :L008,#indexTable
              add       :L008,delta
              nop
:L008         add       index,0-0

'if ( index < 0 ) index = 0;

              cmp       index,#0 wc
        if_c  mov       index,#0
              jmp       #:L009
              
'else if ( index > 88 ) index = 88;

              cmp       index,#88 wc,wz
if_nz_and_nc  mov       index,#88 
              
'step = stepsizeTable[index];

:L009         movs      :L010,#stepsizeTable
              add       :L010,index
              nop
:L010         mov       step,0-0
        
'outputbuffer = delta;

              wrlong    delta,outputbuffer

Here's the same code translated into more idiomatic PASM (37 instructions):

'val = *indata++;

              rdlong    val,indata
              add       indata,#4
              
'diff = val - valpred;

              mov       diff,val
              sub       diff,valpred wc
              
'if(diff < 0)
'{
'  delta = 8;
'  diff = (-diff);
'}
'else
'{
'  delta = 0;
'}

        if_c  mov       delta,#8
        if_nc mov       delta,#0
              abs       diff,diff
        
'vpdiff = (step >> 3);

              mov       vpdiff,step
              shr       vpdiff,#3
        
'if ( diff >= step ) {

              cmp       diff,step wc
        
'  delta |= 4;
'  diff -= step;
'  vpdiff += step;
'}

        if_nc or        delta,#4
        if_nc sub       diff,step
        if_nc add       vdiff,step
        
'step >>= 1;

              shr   step,#1
        
'if ( diff >= step  ) {

              cmp       diff,step wc
              
'  delta |= 2;
'  diff -= step;
'  vpdiff += step;
'}

        if_nc or        delta,#2
        if_nc sub       diff,step
        if_nc add       vpdiff,step
        
'step >>= 1;

              shr       step,#1
              
'if ( diff >= step ) {

              cmp       diff,step wc
              
'  delta |= 1;
'  vpdiff += step;
'}

       if_nc or        delta,#1
       if_nc add       vpdiff,step

'if ( (delta&8) != 0 )
'{
'  valpred -= vpdiff;
'  if ( valpred < -32768 ) valpred = -32768;
'}
'else
'{
'  valpred += vpdiff;
'  if ( valpred > 32767 ) valpred = 32767;
'}

              test      delta,#8 wc
              sumc      valpred,vpdiff
              mins      valpred,minword
              maxs      valpred,maxword

'index += indexTable[delta];

              movs      :L001,#indexTable
              add       :L001,delta
              nop
:L001         add       index,0-0

'if ( index < 0 ) index = 0;

              mins      index,#0
              
'else if ( index > 88 ) index = 88;

              maxs      index,#88
              
'step = stepsizeTable[index];

              movs      :L002,#stepsizeTable
              add       :L002,index
              nop
:L002         mov       step,0-0

'outputbuffer = delta;

              wrlong    delta,outputbuffer

With further hand finessing (e.g reordering statements to eliminate the nops), I've been able to reduce the equivalent PASM to 33 instructions.

So my questions are these:

1. Can an optimizing C compiler even hope to produce efficient, idiomatic PASM?

2. If "no" to #1, are we doing users any favors by providing a C-to-PASM compiler if it misleads them into believing that they're getting the best performance possible from the Propeller?

3. If "yes" to #1, will sufficient effort be put into the planned compiler to do so?

Whatever the answers, it will be a good exercise to compare the compiler output to hand-translated PASM in order to uncover missed opportunities for optimization -- assuming a compiler can actually exploit them.

-Phil

Heater. · 2011-08-15 21:01

I think you should try and compile your example with Catalina and take look at the assembler output it produces. I think I remember Catalina has a command line option to do that.
That of course is LMM code so it will have some funny looking jumps and such in it but basically you will see what to expect if it were native PASM.

I have never been totally convinced that compiling C to PASM is sensible for the Prop. Because:
1. PASM has no convenient stack pointer or stack ops you have to synthesize it in code.
2. PASM has no pointers. For in cog PASM/data you have to synthesize pointers with self modifying code. Perhaps not so bad for data in HUB using rd/wrlong and friends. But then what about function pointers for in cog code?
3. The generated code size is too big, all instructions are 32 bit and we are memory constrained.
4. Generating LMM is a neat solution to much of the above but requires VM to run it and is hence slow.

All on all it's hard to beat Spin for compact code and PASM for speed.

The Prop 2 will have more speed and space to give C room to breath but as a C engine it's still going to be bit sad compared to similarly clocked devices.

Still, we might be surprised what optimizations the C compiler gurus can come up with.

jazzed · 2011-08-15 21:23

Hi Phil, I'll make sure our compiler team has a look at your example.

Dr_Acula · 2011-08-15 21:26

I've been wondering whether it's even reasonable to compile C to PASM

Possibly not for the purists but for the rest of us, I would be thinking the answer is yes.

My understanding of some of the Basic versions for the prop is they translate Basic instructions mindlessly into Pasm blobs. That works, and it isn't optimal, but it does run faster than interpreted Basic, and is better than not having anything.

Then there are pasm dummies like me who are dangerous enough to write some code that works, but can't work out why adding a NOP in one cog's code would crash the code running in another cog. For people like me, C to Pasm is going to be better than nothing.

In practice, if I wanted to use this in earnest, I'd code the C, compile it and if ran fast enough for the purpose and fit inside the cog then leave it as it is. An example might be a mouse driver or a keyboard driver where speed is not critical.

And if it does not fit, or it is not fast enough, well, then you would have to get your hands dirty and start pulling the code to bits one line at a time. Even then, have a skeleton working code that is too slow is still probably a better place to start than having no code at all.

I guess if one starts to see patterns to code that can be optimised one can add that to a compiler?

RossH · 2011-08-16 00:31

Interesting to read this thread in conjunction with the recent comments posted by potatohead and others about inline assembly (here).

In my view, the answer is that if your problem calls for assembly language, then use assembly language. Don't try and use (and abuse) C. C is a "universal assembler" mainly because it is "universal" - not because it makes a good replacement for an assembler. Yes, it can approach the efficiency of assembly language for problems that require the subset of the instruction set that the C code generator implements well - but most C code generators will only make effective use of a small proportion of the available instruction set and addressing modes of a typical CPU. And the more complex the CPU, the smaller the subset. This is all quite normal. Trying to correctly identify candidate cases and then correctly use all the obscure instructions and addressing modes that exist in even a moderately complex CPU is very difficult - and it follows a law of diminishing returns.

Ross.

P.S. If no one has done it by the time I get home, I'll compile Phil's code with Catalina and post the results - I don't claim Catalina has a spectacularly efficient code generator - just one that works reasonably well in most cases.

RossH · 2011-08-16 02:26

Hi Phil,

Actual Catalina code is attached below - 91 longs. Not optimal, perhaps - but It is worth noting that this is LMM PASM (i.e. not cog PASM, which your example is). An LMM PASM compiler cannot assume there will be enough space available in the cog to store all the required data (in general, since such a code snippet would just be a small part of a much larger program there would not be) - so it would generally have to use Hub RAM instead. It is the overhead of the Hub access that is one reason the code is larger.

Sorry, but Catalina does not nicely intersperse the C code as comments, so it's a bit unreadable:

mov r11, FP
 sub r11, #-(-8)
 rdlong r11, r11
 mov r10, r11
 adds r10, #4
 jmp #LODF
 long -8
 wrlong r10, RI
 rdlong r11, r11
 jmp #LODF
 long -4
 wrlong r11, RI
 mov r11, FP
 sub r11, #-(-4)
 rdlong r11, r11
 mov r16, r11
 subs r16, r13
 cmps r16,  #0 wz,wc
 if_ae add PC,#(@C_func_2-@:Opt_1) 
:Opt_1
 mov r15, #8
 neg r16, r16
 add PC,#(@C_func_3-@:Opt_2) 
:Opt_2
C_func_2
 mov r15, #0
C_func_3
 mov r12, r17
 sar r12, #3
 cmps r16, r17 wz,wc
 if_b add PC,#(@C_func_4-@:Opt_3) 
:Opt_3
 or r15, #4
 subs r16, r17
 adds r12, r17
C_func_4
 sar r17, #1
 cmps r16, r17 wz,wc
 if_b add PC,#(@C_func_6-@:Opt_4) 
:Opt_4
 or r15, #2
 subs r16, r17
 adds r12, r17
C_func_6
 sar r17, #1
 cmps r16, r17 wz,wc
 if_b add PC,#(@C_func_8-@:Opt_5) 
:Opt_5
 or r15, #1
 adds r12, r17
C_func_8
 mov r11, r15
 and r11, #8
 cmps r11,  #0 wz
 if_z add PC,#(@C_func_10-@:Opt_6) 
:Opt_6
 subs r13, r12
 jmp #LODA
 long @C_func_L000014
 rdlong  r11, RI
 cmps r13, r11 wz,wc
 if_ae add PC,#(@C_func_11-@:Opt_7) 
:Opt_7
 jmp #LODA
 long @C_func_L000014
 rdlong  r13, RI
 add PC,#(@C_func_11-@:Opt_8) 
:Opt_8
C_func_10
 adds r13, r12
 jmp #LODA
 long @C_func_L000019
 rdlong  r11, RI
 cmps r13, r11 wz,wc
 if_be add PC,#(@C_func_17-@:Opt_9) 
:Opt_9
 jmp #LODA
 long @C_func_L000019
 rdlong  r13, RI
C_func_17
C_func_11
 mov r11, r15
 shl r11, #2
 mov r10, FP
 sub r10, #-(-48)
 adds r11, r10
 rdlong r11, r11
 adds r14, r11
 cmps r14,  #0 wz,wc
 if_ae add PC,#(@C_func_22-@:Opt_10) 
:Opt_10
 mov r14, #0
 add PC,#(@C_func_23-@:Opt_11) 
:Opt_11
C_func_22
 cmps r14,  #88 wz,wc
 if_be add PC,#(@C_func_24-@:Opt_12) 
:Opt_12
 mov r14, #88
C_func_24
C_func_23
 mov r11, r14
 shl r11, #2
 mov r10, FP
 sub r10, #-(-88)
 adds r11, r10
 rdlong r17, r11
 mov r11, r15
 shl r11, #4
 jmp #LODF
 long -92
 wrlong r11, RI
 ...

C_func_L000019
 long 32767
C_func_L000014
 long -32768

Heater. · 2011-08-16 04:42

RossH,

You should be feeling quite proud of that. Catalina is doing very well.

Here are some code sizes for Phils snippet (wrapped into a void function) for some compilers I have lying around:

Code sizes for Phil Pilgrims example C code
-------------------------------------------

Optimization = -O2

Compiler        Instructions    Bytes   Notes
--------        ------------    -----   -----
Catalina        91              364     32 bit instructions
XMOS C          96              192     16 bit instructions
GCC Intel       91              ??      Variable length instructions
GCC ZPU(ZOG)    281             281     8 bit instructions

So Catalina is on a par with x86-gcc instruction wise (although not all the x86 instructions will be 32 bit so the x86 code may be smaller
)
XMOS C is a clear winner on shear code size due to it's compact instructions.

Poor old Zog is not turning in the hoped for small code size.

None of this is relevant to Phil's question but at least it shows Catalina is doing well. And perhaps hints at the unsuitability of C for COG PASM code.

Martin_H · 2011-08-16 05:54

I'd agree with Dr_Acula, that a C compiler that produces reasonable but sub-optimal COG PASM code is still useful. There's a bunch of C code which could be directly reused with a C compiler, even one that produces sub-optimal code. Basically most programmers want to re-use as much code as possible. So using a portable high level language, and dropping down to assembler only when required is attractive.

jazzed · 2011-08-16 08:38

Heater & Ross: I didn't see a C file. Please post what you are using. This can be interpreted as a benchmark (regardless of merits), so the test should start with exactly the same source file.

Heater. · 2011-08-16 09:01

True. I only copy and pasted the contents of the code box in the first post into a file wraped it up as a function, void phipi(){....}, and adde a main() to call it. All the variables are declared.outside the function as int, int* or integer arrays as appropriate.
I'd post it now if I weren't on a bus thumbing a mobile phone.

Dave Hein · 2011-08-16 09:38

Heater,

I get a slightly different answer for GCC on the x86. My version of the compiler produces 66 instructions with -O2. And that includes a few instructions for calling overhead. The current GCC P1 compiler generates 71 instruction for the same code. If I force all of the variables into registers except for outputbuffer it generates 62 instructions. I expect that we'll see some improvements in the compiler that will reduce the number of instructions. The source file I used is attached below.

Dave

Leon · 2011-08-16 11:00

I compiled your code for an ARM Cortex-M3 using gcc, the function generated this code (disassembled):

<testsub>
    B430        push {r4-r5}
    F2400304    movw r3, #4
    F2C10300    movt r3, #0x1000
    6819        ldr r1, [r3]
    F8512B04    ldr r2, [r1], #4
    6019        str r1, [r3]
    F2400314    movw r3, #20
    F2C10300    movt r3, #0x1000
    681C        ldr r4, [r3]
    1B12        subs r2, r2, r4
    BF46        itte mi
    4252        rsbmi r2, r2 ,#0
    2308        movmi r3, #8
    2300        movpl r3, #0
    F240010C    movw r1, #12
    F2C10100    movt r1, #0x1000
    6808        ldr r0, [r1]
    EA4F01E0    asr.w r1, r0, #3
    4282        cmp r2, r0
    BFA2        ittt ge
    F0430304    orrge r3, r3, #4
    EBC00202    rsbge r2, r0, r2
    1809        addge r1, r1, r0
    EA4F0060    asr.w r0, r0, #1
    4282        cmp r2, r0
    BFA2        ittt ge
    F0430302    orrge r3, r3, #2
    EBC00202    rsbge r2, r0, r2
    1809        addge r1, r1, r0
    EA4F0060    asr.w r0, r0, #1
    F240050C    movw r5, #12
    F2C10500    movt r5, #0x1000
    6028        str r0, [r5]
    4282        cmp r2, r0
    BFA4        itt ge
    F0430301    orrge r3, r3, #1
    1809        addge r1, r1, r0
    F0130F08    tst r3, #8
    D012        beq 0x000002FC <__text_start__+0x98>
    1A61        subs r1, r4, r1
    F2400214    movw r2, #20
    F2C10200    movt r2, #0x1000
    6011        str r1, [r2]
    F5114F00    cmn r1, #0x8000
    DA19        bge 0x0000031C <__text_start__+0xB8>
    F2400214    movw r2, #20
    F2C10200    movt r2, #0x1000
    F44F4100    mov.w r1, #0x8000
    F6CF71FF    movt r1, #0xFFFF
    6011        str r1, [r2]
    E00F        b 0x0000031C <__text_start__+0xB8>
    1909        adds r1, r1, r4
    F2400214    movw r2, #20
    F2C10200    movt r2, #0x1000
    6011        str r1, [r2]
    F5B14F00    cmp.w r1, #0x8000
    BFA1        itttt ge
    F2400214    movwge r2, #20
    F2C10200    movtge r2, #0x1000
    F64771FF    movwge r1, #0x7FFF
    6011        strge r1, [r2]
    F2400210    movw r2, #16
    F2C10200    movt r2, #0x1000
    6812        ldr r2, [r2]
    F8521023    ldr.w r1, [r2, r3, lsl #2]
    2958        cmp r1, #0x58
    BFA8        it ge
    2158        movge r1, #0x58
    F2400208    movw r2, #8
    F2C10200    movt r2, #0x1000
    EA2171E1    bic.w r1, r1, r1, asr #31
    6812        ldr r2, [r2]
    F8521021    ldr.w r1, [r2, r1, lsl #2]
    F240020C    movw r2, #12
    F2C10200    movt r2, #0x1000
    6011        str r1, [r2]
    F2400200    movw r2, #0
    F2C10200    movt r2, #0x1000
    EA4F1303    lsl.w r3, r3, #4
    6013        str r3, [r2]
    BC30        pop {r4-r5}
    4770        bx lr
    BF00        nop

I make it 83 instructions. I think that they are mostly single-cycle, and can run at up to 120 MHz.

Heater. · 2011-08-16 11:15

62 instructions for the P1 compiler, in the early stages that is, already sounds pretty good. I'll have to sort out what x86-gcc I used, it's on Debian so it may not be so recent. I'll update the results sheet tomorrow.

Phil Pilgrim (PhiPi) · 2011-08-16 11:38

Dave,

Can you show us the 62-instruction output?

-Phil

Dave Hein · 2011-08-16 13:11

Phil Pilgrim (PhiPi) wrote: »

Can you show us the 62-instruction output?

Here's the assembly output. The two obvious areas for improvement are to use variable registers directly instead of moving them to the rxx registers, and eliminate cmps when the Z and C flags can be determined from earlier operations.

Phil Pilgrim (PhiPi) · 2011-08-16 13:51

Dave,

Of all the idiomatic optimizations, I thought one of the hardest would be to make the translation from:

if ( index < 0 ) index = 0;
else if ( index > 88 ) index = 88;

to

	maxs	r7, #88
	mins	r7, #0

Yet your compiler seems to manage that nicely. (The hardest may be recognizing opportunities for sumc, muxc, etc.) There's a lot of working-register thrashing going on, though. Is this a result of GCC having to treat variables as load/store only?

-Phil

Dave Hein · 2011-08-16 15:08

First of all, let me emphasize that the compiler work is not being done by me. I have just been testing out the compiler and the assembler. Most of the work on the compiler is being done by another member of the team, and one other member of the team is working on the assembler and linker. GCC is capable of doing some nice optimizations. The following snippet of code was posted by the person working on the compiler.

int
addsub(int which, int a, int b)
{
    if (which)
        return a+b;
    return a-b;
}

Generated assembly code

addsub
    cmps    r0, #0 wz,wc
    mov    r0, r1
    IF_NE add    r0, r2
    IF_E  sub    r0, r2
    jmp    lr

Phil Pilgrim (PhiPi) · 2011-08-16 15:17

Dave,

That's an example where sumz would be optimum, but identifying those opportunities will not be easy.

-Phil

RossH · 2011-08-16 17:17

Jazzed - the source code is just what is in Phil's first post. I wrapped it in a function and added the necessary lines to declare the variables (all as ints). Then I extracted the code that corresponded to his lines of C.

Phil - it may be a good idea if you modify your code fragment to turn it into a function within a compilable C program (i.e. declaring all variables etc) - then we could all compare more easily since we could just extract the code corresponding to the function.

Dave - a compiler that achieves ~70 longs for cog-based PASM when a very sophisticated user can only manage to squeeze it down to ~30 is pretty darn good!

Ross.

Kevin Wood · 2011-08-16 17:31

RossH wrote:

Dave - a compiler that achieves ~70 longs for cog-based PASM when a very sophisticated user can only manage to squeeze it down to ~30 is pretty darn good!

It's even better for non-sophisticated users.

ericball · 2011-08-16 17:34

An optimizing C compiler can never be as good as hand-optimized assembly; although with today's out-of-order multi-core / thread, and multiple levels of cache & virtual memory I'm not sure whether it's possible to truly generate optimal code on anything but the smallest routines.

However, the real optimization is not on purely an instruction level but on the algorithm level. For example a couple of decades a ago I achieved a 2:1 optimization over x86 C code (which I'd already 2:1 optimized) with hand-coded assembly (performing a Galois Field multiplication for error correction coding) by coding the routine to use the carry bit effectively - something which wasn't possible in the C code and thus required significantly more complex code. So even on typical microprocessors there are idiomatic features which can't be expressed in C.

However, in my mind the target platform will have a greater impact any C compiler for the Propeller. A compiler written for just the base resources (32K shared memory + 496 instructions/registers per cog) will generate significantly different code than one written for the C3 or any other platform with external RAM. The former will likely generate PASM and throw an error if the program is too large to fit in COG RAM, while the latter may use LMM with overlays / library routines.

And for Phil's question regarding the speed of C versus PASM, I guess the real question is whether the resulting code is "fast enough" for the task at hand. Experienced programmers know it's possible to wring more speed out of a program by coding it in assembly. (How much more unfortunately can not be answered before it's attempted.) The question is whether it's required and worth the effort. For code which is I/O bound I suspect compiled C will be "fast enough", hopefully faster than SPIN.

Phil Pilgrim (PhiPi) · 2011-08-16 18:00

RossH wrote:

Phil - it may be a good idea if you modify your code fragment to turn it into a function within a compilable C program (i.e. declaring all variables etc) - then we could all compare more easily since we could just extract the code corresponding to the function.

I can read C, but I've never programmed in it, so I'd probably do it wrong.

BTW, my main reason for bringing this up is to raise the question whether the C compiler will meet user's expectations for code size and execution speed. Rather than beating our heads out of the gate over raw optimality, the better question is, "How does the ratio of compiled C to hand-optimized PASM compare with that of the best C compilers for other micros, e.g. AVR, PIC, XMOS, ARM, etc.?" That's the yardstick by which experienced programmers will judge the Propeller compiler. However, the 496-long cog memory limit may place a greater expectation on optimization for the Propeller than it would for another chip.

-Phil

RossH · 2011-08-16 18:31

Phil Pilgrim (PhiPi) wrote: »

I can read C, but I've never programmed in it, so I'd probably do it wrong.

-Phil

Ok - if no-one else does so, I'll post my version when I get home tonight.

Ross.

Heater. · 2011-08-17 03:37

With Dave's test5.c I can get x86 gcc (4.4.5) to produce 64 instructions for the function where as for my version it turns in 91.
Turns out that my test was the same except I had all vars outside the function where as Dave has put some as local to the function: val, diff, delta, vpdiff and index.
This benchmarking is tricky business.

RossH · 2011-08-17 03:47

Heater. wrote: »

With Dave's test5.c I can get x86 gcc (4.4.5) to produce 64 instructions for the function where as for my version it turns in 91.
Turns out that my test was the same except I had all vars outside the function where as Dave has put some as local to the function: val, diff, delta, vpdiff and index.
This benchmarking is tricky business.

Yes, it will make a big difference how and where the variables are declared. Attached is the code I suggest we use. It makes no sense, but it should compile under any C compiler. I suppose some compilers might be clever enough to optimize away whole chunks of the test code - but to prevent that we would have to change phil's original source code.

Ross.

Heater. · 2011-08-17 04:01

RossH,
That's no good. I was worried about this already. Using -O2 on GCC for Intel removes the whole function and the call. It has no output so it gets optimized away nicely:)

Turns out that having global vars, as mine and Dave's versions does prevents that. BUT I think even then it's not fool proof. One would have to add the "volatile" attribute to the inputs and outputs to be really sure. One needs "volatile" anyway as we assume that the variables in HUB can be read or written at any time and so we don't want the compiler squirreling them away in registers as it works through the code.

In this case we need to be sure that whatever would be in HUB in the intended PASM is "volatile".

RossH · 2011-08-17 05:05

Heater. wrote: »

RossH,
That's no good. I was worried about this already. Using -O2 on GCC for Intel removes the whole function and the call. It has no output so it gets optimized away nicely:)

Turns out that having global vars, as mine and Dave's versions does prevents that. BUT I think even then it's not fool proof. One would have to add the "volatile" attribute to the inputs and outputs to be really sure. One needs "volatile" anyway as we assume that the variables in HUB can be read or written at any time and so we don't want the compiler squirreling them away in registers as it works through the code.

In this case we need to be sure that whatever would be in COG in the intended PASM is "volatile".

Blast! I had a suspicion that might happen. Perhaps we all agree to turn off optimization altogether?

But first, try the updated attachment to my previous post - this one makes all the variables volatile parameters to the function, so I think the code within the function can't be optimized away. I also return the function value in the main program, so the function itself cannot be optimized away either.

Other than that, I'm afraid Phil may have to modify his original code.

Ross.

Heater. · 2011-08-17 05:22

RossH,

Perhaps we all agree to turn off optimization altogether?

Absolutely not !

Given that we are currently targeting a very small memory space, code in COG data in COG registers or HUB I think we have to allow any and all optimizations that squeeze things in.

The way I see it is that all we have to do is distinguish between:
1) Variables in COG, basically local vars on stack and or things that get stashed in registers in a normal CPU, stuff that the world outside the function cannot see because it is in COG on the Prop.
2) And global variables, those that are outside function, used as inputs or outputs. On the prop they are in HUB.

So is it not as simple as making all variables that should be in HUB on the Prop global and marking them as "volatile". Volatile because they can be read/written my processes other than the one running the function at any time.

I see that Dave's example has "cogmem" attributes for some variables so that the prop compiler knows where they are. They should otherwise be "volatile" in other compilers.

Heater. · 2011-08-17 05:25

Oh yes, I don't think we want to pass parameters and return results from this test function.

RossH · 2011-08-17 06:17

Heater. wrote: »

RossH,

Absolutely not !

Given that we are currently targeting a very small memory space, code in COG data in COG registers or HUB I think we have to allow any and all optimizations that squeeze things in.

Well, I guess I agree about the optimization - I just thought I'd throw it out there as an idea, since we have already seen that no optimizer comes close to what a programmer can do by hand anyway.

However, I think we have now gone some way towards answering Phil's original questions. If even such a small fragment of C code takes two or three times the space of what a good programmer can achieve in assembler (acknowledging that the final result is very dependent on where the variables are located) then it doesn't really matter what the exact results are - cog space is so limited on the Propeller that cog-based C is going to struggle if it tries to positiion itself as an assembler replacement (i.e. for writing drivers etc).

Ross.

Batang · 2011-08-17 08:15

Just for interest I compiled RossH's function for an ARM M3 using a Keil compiler using -02 optimization, disassembly as follows:

     2: int func(volatile int *indata, volatile int *indexTable, volatile int *stepsizeTable, volatile int val, volatile int diff, volatile int valpred, volatile int delta, volatile int step, volatile int vpdiff, volatile int index, volatile int outputbu 
     3:  
     4:  
0x00000184 B5F0      PUSH     {r4-r7,lr}
     5:    val = *indata++; 
0x00000186 6800      LDR      r0,[r0,#0x00]
0x00000188 9F0A      LDR      r7,[sp,#0x28]
0x0000018A 9E06      LDR      r6,[sp,#0x18]
0x0000018C 9D08      LDR      r5,[sp,#0x20]
     6:    diff = val - valpred; 
0x0000018E 1B80      SUBS     r0,r0,r6
     7:    if(diff < 0) 
     8:    { 
0x00000190 D502      BPL      0x00000198
     9:        delta = 8; 
0x00000192 2308      MOVS     r3,#0x08
    10:        diff = (-diff); 
    11:    } 
    12:    else 
    13:    { 
0x00000194 4240      RSBS     r0,r0,#0
0x00000196 E000      B        0x0000019A
    14:        delta = 0; 
    15:    } 
    16:     
    17:    vpdiff = (step >> 3); 
    18:     
0x00000198 2300      MOVS     r3,#0x00
    19:    if ( diff >= step ) { 
0x0000019A 42A8      CMP      r0,r5
0x0000019C DB02      BLT      0x000001A4
    20:        delta |= 4; 
0x0000019E F0430304  ORR      r3,r3,#0x04
    21:        diff -= step; 
    22:        vpdiff += step; 
    23:    } 
0x000001A2 1B40      SUBS     r0,r0,r5
    24:    step >>= 1; 
0x000001A4 106D      ASRS     r5,r5,#1
    25:    if ( diff >= step  ) { 
0x000001A6 42A8      CMP      r0,r5
0x000001A8 DB02      BLT      0x000001B0
    26:        delta |= 2; 
0x000001AA F0430302  ORR      r3,r3,#0x02
    27:        diff -= step; 
    28:        vpdiff += step; 
    29:    } 
0x000001AE 1B40      SUBS     r0,r0,r5
    30:    step >>= 1; 
0x000001B0 106D      ASRS     r5,r5,#1
    31:    if ( diff >= step ) { 
0x000001B2 42A8      CMP      r0,r5
0x000001B4 DB01      BLT      0x000001BA
    32:        delta |= 1; 
    33:        vpdiff += step; 
    34:    } 
    35:     
0x000001B6 F0430301  ORR      r3,r3,#0x01
    36:    if ( (delta&8) != 0 ) 
    37:    { 
    38:        valpred -= vpdiff; 
    39:        if ( valpred < -32768 ) 
    40:            valpred = -32768; 
    41:    }                        
    42:  
    43:    else 
    44:    { 
    45:        valpred += vpdiff; 
    46:        if ( valpred > 32767 ) 
    47:            valpred = 32767; 
    48:    } 
    49:     
0x000001BA EA4F7003  LSL      r0,r3,#28
    50:    index += indexTable[delta]; 
0x000001BE F8510023  LDR      r0,[r1,r3,LSL #2]
0x000001C2 19C0      ADDS     r0,r0,r7
    51:    if ( index < 0 ) index = 0; 
0x000001C4 D501      BPL      0x000001CA
0x000001C6 2000      MOVS     r0,#0x00
0x000001C8 E002      B        0x000001D0
    52:    else if ( index > 88 ) index = 88; 
0x000001CA 2858      CMP      r0,#0x58
0x000001CC DD00      BLE      0x000001D0
0x000001CE 2058      MOVS     r0,#0x58
    53:    step = stepsizeTable[index]; 
    54:                           
0x000001D0 F8520020  LDR      r0,[r2,r0,LSL #2]
    55:    outputbuffer = (delta << 4); 
    56:  
    57:    return outputbuffer;     
    58:  
0x000001D4 0118      LSLS     r0,r3,#4
    59: } 
    60:  
    61:  
0x000001D6 BDF0      POP      {r4-r7,pc}

C Expressive Enough for Idiomatic PASM?

Comments