Shop OBEX P1 Docs P2 Docs Learn Events
C Expressive Enough for Idiomatic PASM? — Parallax Forums

C Expressive Enough for Idiomatic PASM?

Phil Pilgrim (PhiPi)Phil Pilgrim (PhiPi) Posts: 23,514
edited 2011-09-18 09:38 in Propeller 1
I've been working on translating some C code to PASM. In the process, I've been wondering whether it's even reasonable to compile C to PASM, since PASM has so many idiomatic shortcuts for which C simply has no native semantics.

Here's part of the C code:
val = *indata++;
diff = val - valpred;
if(diff < 0)
{
    delta = 8;
    diff = (-diff);
}
else
{
    delta = 0;
}

vpdiff = (step >> 3);

if ( diff >= step ) {
    delta |= 4;
    diff -= step;
    vpdiff += step;
}
step >>= 1;
if ( diff >= step  ) {
    delta |= 2;
    diff -= step;
    vpdiff += step;
}
step >>= 1;
if ( diff >= step ) {
    delta |= 1;
    vpdiff += step;
}

if ( (delta&8) != 0 )
{
    valpred -= vpdiff;
    if ( valpred < -32768 )
        valpred = -32768;
}
else
{
    valpred += vpdiff;
    if ( valpred > 32767 )
        valpred = 32767;
}

index += indexTable[delta];
if ( index < 0 ) index = 0;
else if ( index > 88 ) index = 88;
step = stepsizeTable[index];

outputbuffer = (delta << 4);

Here's what I would consider a "nearly-brain-dead" translation to PASM, but with some simple optimizations already in place, nonetheless. It results in 51 PASM instructions:
'val = *indata++;

              rdlong    val,indata
              add       indata,#4
                        
'diff = val - valpred;

              mov       diff,val
              sub       diff,valpred
              
'if(diff < 0)

              cmp       diff,#0 wc
        if_nc jmp       #:L001
        
'{
'   delta = 8;
'   diff = (-diff);
'}

              mov       delta,#8
              neg       diff,diff
              jmp       #:L002
              
'else
'{
'   delta = 0;
'}

:L001         mov       delta,#0
        
'vpdiff = (step >> 3);

:L002         mov       vpdiff,step
              shr       vpdiff,#3
        
'if ( diff >= step ) {

              cmp       diff,step wc
        if_c  jmp       #:L003
        
'  delta |= 4;
'  diff -= step;
'  vpdiff += step;
'}

              or        delta,#4
              sub       diff,step
              add       vpdiff,step

'step >>= 1;

:L003         shr       step,#1

'if ( diff >= step  ) {

              cmp       diff,step wc
        if_c  jmp       #:L004
        
'  delta |= 2;
'  diff -= step;
'  vpdiff += step;
'}

              or        delta,#2
              sub       diff,step
              add       vpdiff,step
              
'step >>= 1;

:L004         shr       step, #1

'if ( diff >= step ) {

              cmp       diff,step wc
        if_c  jmp       #:L005
        
'  delta |= 1;
'  vpdiff += step;
'}

              or        delta,#1
              add       vpdiff,step

'if ( (delta&8) != 0 )

:L005         test      delta,#8 wz
        if_z  jmp       #:L006
        
'{
'  valpred -= vpdiff;
'  if ( valpred < -32768 ) valpred = -32768;
'}

              sub       valpred,vpdiff
              cmps      valpred,minword wc
        if_c  mov       valpred,minword
              jmp       #:L007
              
'else
'{
'  valpred += vpdiff;
'  if ( valpred > 32767 ) valpred = 32767;
'}

:L006         add       valpred,vpdiff
              cmps      valpred,maxword wz,wc
 if_nz_and_nc mov       valpred,maxword

'index += indexTable[delta];

:L007         movs      :L008,#indexTable
              add       :L008,delta
              nop
:L008         add       index,0-0

'if ( index < 0 ) index = 0;

              cmp       index,#0 wc
        if_c  mov       index,#0
              jmp       #:L009
              
'else if ( index > 88 ) index = 88;

              cmp       index,#88 wc,wz
if_nz_and_nc  mov       index,#88 
              
'step = stepsizeTable[index];

:L009         movs      :L010,#stepsizeTable
              add       :L010,index
              nop
:L010         mov       step,0-0
        
'outputbuffer = delta;

              wrlong    delta,outputbuffer

Here's the same code translated into more idiomatic PASM (37 instructions):
'val = *indata++;

              rdlong    val,indata
              add       indata,#4
              
'diff = val - valpred;

              mov       diff,val
              sub       diff,valpred wc
              
'if(diff < 0)
'{
'  delta = 8;
'  diff = (-diff);
'}
'else
'{
'  delta = 0;
'}

        if_c  mov       delta,#8
        if_nc mov       delta,#0
              abs       diff,diff
        
'vpdiff = (step >> 3);

              mov       vpdiff,step
              shr       vpdiff,#3
        
'if ( diff >= step ) {

              cmp       diff,step wc
        
'  delta |= 4;
'  diff -= step;
'  vpdiff += step;
'}

        if_nc or        delta,#4
        if_nc sub       diff,step
        if_nc add       vdiff,step
        
'step >>= 1;

              shr   step,#1
        
'if ( diff >= step  ) {

              cmp       diff,step wc
              
'  delta |= 2;
'  diff -= step;
'  vpdiff += step;
'}

        if_nc or        delta,#2
        if_nc sub       diff,step
        if_nc add       vpdiff,step
        
'step >>= 1;

              shr       step,#1
              
'if ( diff >= step ) {

              cmp       diff,step wc
              
'  delta |= 1;
'  vpdiff += step;
'}

       if_nc or        delta,#1
       if_nc add       vpdiff,step

'if ( (delta&8) != 0 )
'{
'  valpred -= vpdiff;
'  if ( valpred < -32768 ) valpred = -32768;
'}
'else
'{
'  valpred += vpdiff;
'  if ( valpred > 32767 ) valpred = 32767;
'}

              test      delta,#8 wc
              sumc      valpred,vpdiff
              mins      valpred,minword
              maxs      valpred,maxword

'index += indexTable[delta];

              movs      :L001,#indexTable
              add       :L001,delta
              nop
:L001         add       index,0-0

'if ( index < 0 ) index = 0;

              mins      index,#0
              
'else if ( index > 88 ) index = 88;

              maxs      index,#88
              
'step = stepsizeTable[index];

              movs      :L002,#stepsizeTable
              add       :L002,index
              nop
:L002         mov       step,0-0

'outputbuffer = delta;

              wrlong    delta,outputbuffer

With further hand finessing (e.g reordering statements to eliminate the nops), I've been able to reduce the equivalent PASM to 33 instructions.

So my questions are these:

1. Can an optimizing C compiler even hope to produce efficient, idiomatic PASM?

2. If "no" to #1, are we doing users any favors by providing a C-to-PASM compiler if it misleads them into believing that they're getting the best performance possible from the Propeller?

3. If "yes" to #1, will sufficient effort be put into the planned compiler to do so?

Whatever the answers, it will be a good exercise to compare the compiler output to hand-translated PASM in order to uncover missed opportunities for optimization -- assuming a compiler can actually exploit them.

-Phil
«134

Comments

  • Heater.Heater. Posts: 21,230
    edited 2011-08-15 21:01
    I think you should try and compile your example with Catalina and take look at the assembler output it produces. I think I remember Catalina has a command line option to do that.
    That of course is LMM code so it will have some funny looking jumps and such in it but basically you will see what to expect if it were native PASM.

    I have never been totally convinced that compiling C to PASM is sensible for the Prop. Because:
    1. PASM has no convenient stack pointer or stack ops you have to synthesize it in code.
    2. PASM has no pointers. For in cog PASM/data you have to synthesize pointers with self modifying code. Perhaps not so bad for data in HUB using rd/wrlong and friends. But then what about function pointers for in cog code?
    3. The generated code size is too big, all instructions are 32 bit and we are memory constrained.
    4. Generating LMM is a neat solution to much of the above but requires VM to run it and is hence slow.

    All on all it's hard to beat Spin for compact code and PASM for speed.

    The Prop 2 will have more speed and space to give C room to breath but as a C engine it's still going to be bit sad compared to similarly clocked devices.

    Still, we might be surprised what optimizations the C compiler gurus can come up with.
  • jazzedjazzed Posts: 11,803
    edited 2011-08-15 21:23
    Hi Phil, I'll make sure our compiler team has a look at your example.
  • Dr_AculaDr_Acula Posts: 5,484
    edited 2011-08-15 21:26
    I've been wondering whether it's even reasonable to compile C to PASM

    Possibly not for the purists but for the rest of us, I would be thinking the answer is yes.

    My understanding of some of the Basic versions for the prop is they translate Basic instructions mindlessly into Pasm blobs. That works, and it isn't optimal, but it does run faster than interpreted Basic, and is better than not having anything.

    Then there are pasm dummies like me who are dangerous enough to write some code that works, but can't work out why adding a NOP in one cog's code would crash the code running in another cog. For people like me, C to Pasm is going to be better than nothing.

    In practice, if I wanted to use this in earnest, I'd code the C, compile it and if ran fast enough for the purpose and fit inside the cog then leave it as it is. An example might be a mouse driver or a keyboard driver where speed is not critical.

    And if it does not fit, or it is not fast enough, well, then you would have to get your hands dirty and start pulling the code to bits one line at a time. Even then, have a skeleton working code that is too slow is still probably a better place to start than having no code at all.

    I guess if one starts to see patterns to code that can be optimised one can add that to a compiler?
  • RossHRossH Posts: 5,519
    edited 2011-08-16 00:31
    Interesting to read this thread in conjunction with the recent comments posted by potatohead and others about inline assembly (here).

    In my view, the answer is that if your problem calls for assembly language, then use assembly language. Don't try and use (and abuse) C. C is a "universal assembler" mainly because it is "universal" - not because it makes a good replacement for an assembler. Yes, it can approach the efficiency of assembly language for problems that require the subset of the instruction set that the C code generator implements well - but most C code generators will only make effective use of a small proportion of the available instruction set and addressing modes of a typical CPU. And the more complex the CPU, the smaller the subset. This is all quite normal. Trying to correctly identify candidate cases and then correctly use all the obscure instructions and addressing modes that exist in even a moderately complex CPU is very difficult - and it follows a law of diminishing returns.

    Ross.

    P.S. If no one has done it by the time I get home, I'll compile Phil's code with Catalina and post the results - I don't claim Catalina has a spectacularly efficient code generator - just one that works reasonably well in most cases.
  • RossHRossH Posts: 5,519
    edited 2011-08-16 02:26
    Hi Phil,

    Actual Catalina code is attached below - 91 longs. Not optimal, perhaps - but It is worth noting that this is LMM PASM (i.e. not cog PASM, which your example is). An LMM PASM compiler cannot assume there will be enough space available in the cog to store all the required data (in general, since such a code snippet would just be a small part of a much larger program there would not be) - so it would generally have to use Hub RAM instead. It is the overhead of the Hub access that is one reason the code is larger.

    Sorry, but Catalina does not nicely intersperse the C code as comments, so it's a bit unreadable:
    mov r11, FP
     sub r11, #-(-8)
     rdlong r11, r11
     mov r10, r11
     adds r10, #4
     jmp #LODF
     long -8
     wrlong r10, RI
     rdlong r11, r11
     jmp #LODF
     long -4
     wrlong r11, RI
     mov r11, FP
     sub r11, #-(-4)
     rdlong r11, r11
     mov r16, r11
     subs r16, r13
     cmps r16,  #0 wz,wc
     if_ae add PC,#(@C_func_2-@:Opt_1) 
    :Opt_1
     mov r15, #8
     neg r16, r16
     add PC,#(@C_func_3-@:Opt_2) 
    :Opt_2
    C_func_2
     mov r15, #0
    C_func_3
     mov r12, r17
     sar r12, #3
     cmps r16, r17 wz,wc
     if_b add PC,#(@C_func_4-@:Opt_3) 
    :Opt_3
     or r15, #4
     subs r16, r17
     adds r12, r17
    C_func_4
     sar r17, #1
     cmps r16, r17 wz,wc
     if_b add PC,#(@C_func_6-@:Opt_4) 
    :Opt_4
     or r15, #2
     subs r16, r17
     adds r12, r17
    C_func_6
     sar r17, #1
     cmps r16, r17 wz,wc
     if_b add PC,#(@C_func_8-@:Opt_5) 
    :Opt_5
     or r15, #1
     adds r12, r17
    C_func_8
     mov r11, r15
     and r11, #8
     cmps r11,  #0 wz
     if_z add PC,#(@C_func_10-@:Opt_6) 
    :Opt_6
     subs r13, r12
     jmp #LODA
     long @C_func_L000014
     rdlong  r11, RI
     cmps r13, r11 wz,wc
     if_ae add PC,#(@C_func_11-@:Opt_7) 
    :Opt_7
     jmp #LODA
     long @C_func_L000014
     rdlong  r13, RI
     add PC,#(@C_func_11-@:Opt_8) 
    :Opt_8
    C_func_10
     adds r13, r12
     jmp #LODA
     long @C_func_L000019
     rdlong  r11, RI
     cmps r13, r11 wz,wc
     if_be add PC,#(@C_func_17-@:Opt_9) 
    :Opt_9
     jmp #LODA
     long @C_func_L000019
     rdlong  r13, RI
    C_func_17
    C_func_11
     mov r11, r15
     shl r11, #2
     mov r10, FP
     sub r10, #-(-48)
     adds r11, r10
     rdlong r11, r11
     adds r14, r11
     cmps r14,  #0 wz,wc
     if_ae add PC,#(@C_func_22-@:Opt_10) 
    :Opt_10
     mov r14, #0
     add PC,#(@C_func_23-@:Opt_11) 
    :Opt_11
    C_func_22
     cmps r14,  #88 wz,wc
     if_be add PC,#(@C_func_24-@:Opt_12) 
    :Opt_12
     mov r14, #88
    C_func_24
    C_func_23
     mov r11, r14
     shl r11, #2
     mov r10, FP
     sub r10, #-(-88)
     adds r11, r10
     rdlong r17, r11
     mov r11, r15
     shl r11, #4
     jmp #LODF
     long -92
     wrlong r11, RI
     ...
    
    C_func_L000019
     long 32767
    C_func_L000014
     long -32768
    
    
  • Heater.Heater. Posts: 21,230
    edited 2011-08-16 04:42
    RossH,

    You should be feeling quite proud of that. Catalina is doing very well.

    Here are some code sizes for Phils snippet (wrapped into a void function) for some compilers I have lying around:
    Code sizes for Phil Pilgrims example C code
    -------------------------------------------
    
    Optimization = -O2
    
    Compiler        Instructions    Bytes   Notes
    --------        ------------    -----   -----
    Catalina        91              364     32 bit instructions
    XMOS C          96              192     16 bit instructions
    GCC Intel       91              ??      Variable length instructions
    GCC ZPU(ZOG)    281             281     8 bit instructions
    

    So Catalina is on a par with x86-gcc instruction wise (although not all the x86 instructions will be 32 bit so the x86 code may be smaller
    )
    XMOS C is a clear winner on shear code size due to it's compact instructions.

    Poor old Zog is not turning in the hoped for small code size.

    None of this is relevant to Phil's question but at least it shows Catalina is doing well. And perhaps hints at the unsuitability of C for COG PASM code.
  • Martin_HMartin_H Posts: 4,051
    edited 2011-08-16 05:54
    I'd agree with Dr_Acula, that a C compiler that produces reasonable but sub-optimal COG PASM code is still useful. There's a bunch of C code which could be directly reused with a C compiler, even one that produces sub-optimal code. Basically most programmers want to re-use as much code as possible. So using a portable high level language, and dropping down to assembler only when required is attractive.
  • jazzedjazzed Posts: 11,803
    edited 2011-08-16 08:38
    Heater & Ross: I didn't see a C file. Please post what you are using. This can be interpreted as a benchmark (regardless of merits), so the test should start with exactly the same source file.
  • Heater.Heater. Posts: 21,230
    edited 2011-08-16 09:01
    True. I only copy and pasted the contents of the code box in the first post into a file wraped it up as a function, void phipi(){....}, and adde a main() to call it. All the variables are declared.outside the function as int, int* or integer arrays as appropriate.
    I'd post it now if I weren't on a bus thumbing a mobile phone.
  • Dave HeinDave Hein Posts: 6,347
    edited 2011-08-16 09:38
    Heater,

    I get a slightly different answer for GCC on the x86. My version of the compiler produces 66 instructions with -O2. And that includes a few instructions for calling overhead. The current GCC P1 compiler generates 71 instruction for the same code. If I force all of the variables into registers except for outputbuffer it generates 62 instructions. I expect that we'll see some improvements in the compiler that will reduce the number of instructions. The source file I used is attached below.

    Dave
    c
    c
    1K
  • LeonLeon Posts: 7,620
    edited 2011-08-16 11:00
    I compiled your code for an ARM Cortex-M3 using gcc, the function generated this code (disassembled):
    <testsub>
        B430        push {r4-r5}
        F2400304    movw r3, #4
        F2C10300    movt r3, #0x1000
        6819        ldr r1, [r3]
        F8512B04    ldr r2, [r1], #4
        6019        str r1, [r3]
        F2400314    movw r3, #20
        F2C10300    movt r3, #0x1000
        681C        ldr r4, [r3]
        1B12        subs r2, r2, r4
        BF46        itte mi
        4252        rsbmi r2, r2 ,#0
        2308        movmi r3, #8
        2300        movpl r3, #0
        F240010C    movw r1, #12
        F2C10100    movt r1, #0x1000
        6808        ldr r0, [r1]
        EA4F01E0    asr.w r1, r0, #3
        4282        cmp r2, r0
        BFA2        ittt ge
        F0430304    orrge r3, r3, #4
        EBC00202    rsbge r2, r0, r2
        1809        addge r1, r1, r0
        EA4F0060    asr.w r0, r0, #1
        4282        cmp r2, r0
        BFA2        ittt ge
        F0430302    orrge r3, r3, #2
        EBC00202    rsbge r2, r0, r2
        1809        addge r1, r1, r0
        EA4F0060    asr.w r0, r0, #1
        F240050C    movw r5, #12
        F2C10500    movt r5, #0x1000
        6028        str r0, [r5]
        4282        cmp r2, r0
        BFA4        itt ge
        F0430301    orrge r3, r3, #1
        1809        addge r1, r1, r0
        F0130F08    tst r3, #8
        D012        beq 0x000002FC <__text_start__+0x98>
        1A61        subs r1, r4, r1
        F2400214    movw r2, #20
        F2C10200    movt r2, #0x1000
        6011        str r1, [r2]
        F5114F00    cmn r1, #0x8000
        DA19        bge 0x0000031C <__text_start__+0xB8>
        F2400214    movw r2, #20
        F2C10200    movt r2, #0x1000
        F44F4100    mov.w r1, #0x8000
        F6CF71FF    movt r1, #0xFFFF
        6011        str r1, [r2]
        E00F        b 0x0000031C <__text_start__+0xB8>
        1909        adds r1, r1, r4
        F2400214    movw r2, #20
        F2C10200    movt r2, #0x1000
        6011        str r1, [r2]
        F5B14F00    cmp.w r1, #0x8000
        BFA1        itttt ge
        F2400214    movwge r2, #20
        F2C10200    movtge r2, #0x1000
        F64771FF    movwge r1, #0x7FFF
        6011        strge r1, [r2]
        F2400210    movw r2, #16
        F2C10200    movt r2, #0x1000
        6812        ldr r2, [r2]
        F8521023    ldr.w r1, [r2, r3, lsl #2]
        2958        cmp r1, #0x58
        BFA8        it ge
        2158        movge r1, #0x58
        F2400208    movw r2, #8
        F2C10200    movt r2, #0x1000
        EA2171E1    bic.w r1, r1, r1, asr #31
        6812        ldr r2, [r2]
        F8521021    ldr.w r1, [r2, r1, lsl #2]
        F240020C    movw r2, #12
        F2C10200    movt r2, #0x1000
        6011        str r1, [r2]
        F2400200    movw r2, #0
        F2C10200    movt r2, #0x1000
        EA4F1303    lsl.w r3, r3, #4
        6013        str r3, [r2]
        BC30        pop {r4-r5}
        4770        bx lr
        BF00        nop
    

    I make it 83 instructions. I think that they are mostly single-cycle, and can run at up to 120 MHz.
  • Heater.Heater. Posts: 21,230
    edited 2011-08-16 11:15
    62 instructions for the P1 compiler, in the early stages that is, already sounds pretty good. I'll have to sort out what x86-gcc I used, it's on Debian so it may not be so recent. I'll update the results sheet tomorrow.
  • Phil Pilgrim (PhiPi)Phil Pilgrim (PhiPi) Posts: 23,514
    edited 2011-08-16 11:38
    Dave,

    Can you show us the 62-instruction output?

    -Phil
  • Dave HeinDave Hein Posts: 6,347
    edited 2011-08-16 13:11
    Can you show us the 62-instruction output?
    Here's the assembly output. The two obvious areas for improvement are to use variable registers directly instead of moving them to the rxx registers, and eliminate cmps when the Z and C flags can be determined from earlier operations.
    s
    s
    1K
    t.s 1.1K
  • Phil Pilgrim (PhiPi)Phil Pilgrim (PhiPi) Posts: 23,514
    edited 2011-08-16 13:51
    Dave,

    Of all the idiomatic optimizations, I thought one of the hardest would be to make the translation from:
    if ( index < 0 ) index = 0;
    else if ( index > 88 ) index = 88;
    

    to
    	maxs	r7, #88
    	mins	r7, #0
    

    Yet your compiler seems to manage that nicely. (The hardest may be recognizing opportunities for sumc, muxc, etc.) There's a lot of working-register thrashing going on, though. Is this a result of GCC having to treat variables as load/store only?

    -Phil
  • Dave HeinDave Hein Posts: 6,347
    edited 2011-08-16 15:08
    First of all, let me emphasize that the compiler work is not being done by me. I have just been testing out the compiler and the assembler. Most of the work on the compiler is being done by another member of the team, and one other member of the team is working on the assembler and linker. GCC is capable of doing some nice optimizations. The following snippet of code was posted by the person working on the compiler.
    int
    addsub(int which, int a, int b)
    {
        if (which)
            return a+b;
        return a-b;
    }
    
    Generated assembly code
    addsub
        cmps    r0, #0 wz,wc
        mov    r0, r1
        IF_NE add    r0, r2
        IF_E  sub    r0, r2
        jmp    lr
    
  • Phil Pilgrim (PhiPi)Phil Pilgrim (PhiPi) Posts: 23,514
    edited 2011-08-16 15:17
    Dave,

    That's an example where sumz would be optimum, but identifying those opportunities will not be easy.

    -Phil
  • RossHRossH Posts: 5,519
    edited 2011-08-16 17:17
    Jazzed - the source code is just what is in Phil's first post. I wrapped it in a function and added the necessary lines to declare the variables (all as ints). Then I extracted the code that corresponded to his lines of C.

    Phil - it may be a good idea if you modify your code fragment to turn it into a function within a compilable C program (i.e. declaring all variables etc) - then we could all compare more easily since we could just extract the code corresponding to the function.

    Dave - a compiler that achieves ~70 longs for cog-based PASM when a very sophisticated user can only manage to squeeze it down to ~30 is pretty darn good!

    Ross.
  • Kevin WoodKevin Wood Posts: 1,266
    edited 2011-08-16 17:31
    RossH wrote:
    Dave - a compiler that achieves ~70 longs for cog-based PASM when a very sophisticated user can only manage to squeeze it down to ~30 is pretty darn good!

    It's even better for non-sophisticated users. :)
  • ericballericball Posts: 774
    edited 2011-08-16 17:34
    An optimizing C compiler can never be as good as hand-optimized assembly; although with today's out-of-order multi-core / thread, and multiple levels of cache & virtual memory I'm not sure whether it's possible to truly generate optimal code on anything but the smallest routines.

    However, the real optimization is not on purely an instruction level but on the algorithm level. For example a couple of decades a ago I achieved a 2:1 optimization over x86 C code (which I'd already 2:1 optimized) with hand-coded assembly (performing a Galois Field multiplication for error correction coding) by coding the routine to use the carry bit effectively - something which wasn't possible in the C code and thus required significantly more complex code. So even on typical microprocessors there are idiomatic features which can't be expressed in C.

    However, in my mind the target platform will have a greater impact any C compiler for the Propeller. A compiler written for just the base resources (32K shared memory + 496 instructions/registers per cog) will generate significantly different code than one written for the C3 or any other platform with external RAM. The former will likely generate PASM and throw an error if the program is too large to fit in COG RAM, while the latter may use LMM with overlays / library routines.

    And for Phil's question regarding the speed of C versus PASM, I guess the real question is whether the resulting code is "fast enough" for the task at hand. Experienced programmers know it's possible to wring more speed out of a program by coding it in assembly. (How much more unfortunately can not be answered before it's attempted.) The question is whether it's required and worth the effort. For code which is I/O bound I suspect compiled C will be "fast enough", hopefully faster than SPIN.
  • Phil Pilgrim (PhiPi)Phil Pilgrim (PhiPi) Posts: 23,514
    edited 2011-08-16 18:00
    RossH wrote:
    Phil - it may be a good idea if you modify your code fragment to turn it into a function within a compilable C program (i.e. declaring all variables etc) - then we could all compare more easily since we could just extract the code corresponding to the function.

    I can read C, but I've never programmed in it, so I'd probably do it wrong. :(

    BTW, my main reason for bringing this up is to raise the question whether the C compiler will meet user's expectations for code size and execution speed. Rather than beating our heads out of the gate over raw optimality, the better question is, "How does the ratio of compiled C to hand-optimized PASM compare with that of the best C compilers for other micros, e.g. AVR, PIC, XMOS, ARM, etc.?" That's the yardstick by which experienced programmers will judge the Propeller compiler. However, the 496-long cog memory limit may place a greater expectation on optimization for the Propeller than it would for another chip.

    -Phil
  • RossHRossH Posts: 5,519
    edited 2011-08-16 18:31
    I can read C, but I've never programmed in it, so I'd probably do it wrong. :(

    -Phil

    Ok - if no-one else does so, I'll post my version when I get home tonight.

    Ross.
  • Heater.Heater. Posts: 21,230
    edited 2011-08-17 03:37
    With Dave's test5.c I can get x86 gcc (4.4.5) to produce 64 instructions for the function where as for my version it turns in 91.
    Turns out that my test was the same except I had all vars outside the function where as Dave has put some as local to the function: val, diff, delta, vpdiff and index.
    This benchmarking is tricky business.
  • RossHRossH Posts: 5,519
    edited 2011-08-17 03:47
    Heater. wrote: »
    With Dave's test5.c I can get x86 gcc (4.4.5) to produce 64 instructions for the function where as for my version it turns in 91.
    Turns out that my test was the same except I had all vars outside the function where as Dave has put some as local to the function: val, diff, delta, vpdiff and index.
    This benchmarking is tricky business.

    Yes, it will make a big difference how and where the variables are declared. Attached is the code I suggest we use. It makes no sense, but it should compile under any C compiler. I suppose some compilers might be clever enough to optimize away whole chunks of the test code - but to prevent that we would have to change phil's original source code.

    Ross.
  • Heater.Heater. Posts: 21,230
    edited 2011-08-17 04:01
    RossH,
    That's no good. I was worried about this already. Using -O2 on GCC for Intel removes the whole function and the call. It has no output so it gets optimized away nicely:)

    Turns out that having global vars, as mine and Dave's versions does prevents that. BUT I think even then it's not fool proof. One would have to add the "volatile" attribute to the inputs and outputs to be really sure. One needs "volatile" anyway as we assume that the variables in HUB can be read or written at any time and so we don't want the compiler squirreling them away in registers as it works through the code.

    In this case we need to be sure that whatever would be in HUB in the intended PASM is "volatile".
  • RossHRossH Posts: 5,519
    edited 2011-08-17 05:05
    Heater. wrote: »
    RossH,
    That's no good. I was worried about this already. Using -O2 on GCC for Intel removes the whole function and the call. It has no output so it gets optimized away nicely:)

    Turns out that having global vars, as mine and Dave's versions does prevents that. BUT I think even then it's not fool proof. One would have to add the "volatile" attribute to the inputs and outputs to be really sure. One needs "volatile" anyway as we assume that the variables in HUB can be read or written at any time and so we don't want the compiler squirreling them away in registers as it works through the code.

    In this case we need to be sure that whatever would be in COG in the intended PASM is "volatile".

    Blast! I had a suspicion that might happen. Perhaps we all agree to turn off optimization altogether?

    But first, try the updated attachment to my previous post - this one makes all the variables volatile parameters to the function, so I think the code within the function can't be optimized away. I also return the function value in the main program, so the function itself cannot be optimized away either.

    Other than that, I'm afraid Phil may have to modify his original code.

    Ross.
  • Heater.Heater. Posts: 21,230
    edited 2011-08-17 05:22
    RossH,
    Perhaps we all agree to turn off optimization altogether?
    Absolutely not !

    Given that we are currently targeting a very small memory space, code in COG data in COG registers or HUB I think we have to allow any and all optimizations that squeeze things in.

    The way I see it is that all we have to do is distinguish between:
    1) Variables in COG, basically local vars on stack and or things that get stashed in registers in a normal CPU, stuff that the world outside the function cannot see because it is in COG on the Prop.
    2) And global variables, those that are outside function, used as inputs or outputs. On the prop they are in HUB.

    So is it not as simple as making all variables that should be in HUB on the Prop global and marking them as "volatile". Volatile because they can be read/written my processes other than the one running the function at any time.

    I see that Dave's example has "cogmem" attributes for some variables so that the prop compiler knows where they are. They should otherwise be "volatile" in other compilers.
  • Heater.Heater. Posts: 21,230
    edited 2011-08-17 05:25
    Oh yes, I don't think we want to pass parameters and return results from this test function.
  • RossHRossH Posts: 5,519
    edited 2011-08-17 06:17
    Heater. wrote: »
    RossH,

    Absolutely not !

    Given that we are currently targeting a very small memory space, code in COG data in COG registers or HUB I think we have to allow any and all optimizations that squeeze things in.

    Well, I guess I agree about the optimization - I just thought I'd throw it out there as an idea, since we have already seen that no optimizer comes close to what a programmer can do by hand anyway.

    However, I think we have now gone some way towards answering Phil's original questions. If even such a small fragment of C code takes two or three times the space of what a good programmer can achieve in assembler (acknowledging that the final result is very dependent on where the variables are located) then it doesn't really matter what the exact results are - cog space is so limited on the Propeller that cog-based C is going to struggle if it tries to positiion itself as an assembler replacement (i.e. for writing drivers etc).

    Ross.
  • BatangBatang Posts: 234
    edited 2011-08-17 08:15
    Just for interest I compiled RossH's function for an ARM M3 using a Keil compiler using -02 optimization, disassembly as follows:
         2: int func(volatile int *indata, volatile int *indexTable, volatile int *stepsizeTable, volatile int val, volatile int diff, volatile int valpred, volatile int delta, volatile int step, volatile int vpdiff, volatile int index, volatile int outputbu 
         3:  
         4:  
    0x00000184 B5F0      PUSH     {r4-r7,lr}
         5:    val = *indata++; 
    0x00000186 6800      LDR      r0,[r0,#0x00]
    0x00000188 9F0A      LDR      r7,[sp,#0x28]
    0x0000018A 9E06      LDR      r6,[sp,#0x18]
    0x0000018C 9D08      LDR      r5,[sp,#0x20]
         6:    diff = val - valpred; 
    0x0000018E 1B80      SUBS     r0,r0,r6
         7:    if(diff < 0) 
         8:    { 
    0x00000190 D502      BPL      0x00000198
         9:        delta = 8; 
    0x00000192 2308      MOVS     r3,#0x08
        10:        diff = (-diff); 
        11:    } 
        12:    else 
        13:    { 
    0x00000194 4240      RSBS     r0,r0,#0
    0x00000196 E000      B        0x0000019A
        14:        delta = 0; 
        15:    } 
        16:     
        17:    vpdiff = (step >> 3); 
        18:     
    0x00000198 2300      MOVS     r3,#0x00
        19:    if ( diff >= step ) { 
    0x0000019A 42A8      CMP      r0,r5
    0x0000019C DB02      BLT      0x000001A4
        20:        delta |= 4; 
    0x0000019E F0430304  ORR      r3,r3,#0x04
        21:        diff -= step; 
        22:        vpdiff += step; 
        23:    } 
    0x000001A2 1B40      SUBS     r0,r0,r5
        24:    step >>= 1; 
    0x000001A4 106D      ASRS     r5,r5,#1
        25:    if ( diff >= step  ) { 
    0x000001A6 42A8      CMP      r0,r5
    0x000001A8 DB02      BLT      0x000001B0
        26:        delta |= 2; 
    0x000001AA F0430302  ORR      r3,r3,#0x02
        27:        diff -= step; 
        28:        vpdiff += step; 
        29:    } 
    0x000001AE 1B40      SUBS     r0,r0,r5
        30:    step >>= 1; 
    0x000001B0 106D      ASRS     r5,r5,#1
        31:    if ( diff >= step ) { 
    0x000001B2 42A8      CMP      r0,r5
    0x000001B4 DB01      BLT      0x000001BA
        32:        delta |= 1; 
        33:        vpdiff += step; 
        34:    } 
        35:     
    0x000001B6 F0430301  ORR      r3,r3,#0x01
        36:    if ( (delta&8) != 0 ) 
        37:    { 
        38:        valpred -= vpdiff; 
        39:        if ( valpred < -32768 ) 
        40:            valpred = -32768; 
        41:    }                        
        42:  
        43:    else 
        44:    { 
        45:        valpred += vpdiff; 
        46:        if ( valpred > 32767 ) 
        47:            valpred = 32767; 
        48:    } 
        49:     
    0x000001BA EA4F7003  LSL      r0,r3,#28
        50:    index += indexTable[delta]; 
    0x000001BE F8510023  LDR      r0,[r1,r3,LSL #2]
    0x000001C2 19C0      ADDS     r0,r0,r7
        51:    if ( index < 0 ) index = 0; 
    0x000001C4 D501      BPL      0x000001CA
    0x000001C6 2000      MOVS     r0,#0x00
    0x000001C8 E002      B        0x000001D0
        52:    else if ( index > 88 ) index = 88; 
    0x000001CA 2858      CMP      r0,#0x58
    0x000001CC DD00      BLE      0x000001D0
    0x000001CE 2058      MOVS     r0,#0x58
        53:    step = stepsizeTable[index]; 
        54:                           
    0x000001D0 F8520020  LDR      r0,[r2,r0,LSL #2]
        55:    outputbuffer = (delta << 4); 
        56:  
        57:    return outputbuffer;     
        58:  
    0x000001D4 0118      LSLS     r0,r3,#4
        59: } 
        60:  
        61:  
    0x000001D6 BDF0      POP      {r4-r7,pc}
    
Sign In or Register to comment.