Shop OBEX P1 Docs P2 Docs Learn Events
Deconstructing a video driver — Parallax Forums

Deconstructing a video driver

Dr_AculaDr_Acula Posts: 5,484
edited 2011-08-28 07:22 in Propeller 1
I have this crazy idea that maybe it is possible to pull data off an external ram chip fast enough for a video driver and improve the resolution. Maybe it is possible, maybe not, but I'm intrigued enough with the idea to have a custom board made.

I'm currently deconstructing various video drivers. One of them, by Baggers, is 256x96 with one byte per pixel, which is about the maximum you can fit in hub RAM.
''*****************************
''*  TV Driver v1.0           *
''*  (C) 2004 Parallax, Inc.  *
''*****************************

CON
                                'Colour-carrier timing constants; the l*/s* values feed wtab/ltab in the DAT section

  fntsc         = 3_579_545     'NTSC color frequency
  lntsc         = 3640          'NTSC color cycles per line * 16
  sntsc         = 624           'NTSC color cycles per sync * 16

  fpal          = 4_433_618     'PAL color frequency
  lpal          = 4544          'PAL color cycles per line * 16
  spal          = 848           'PAL color cycles per sync * 16

  paramcount    = 12            'longs in the parameter block at PAR: status long + the 11 longs "tasks" loads

VX_FORCE = 2                    'assembled initial value of _vx; "tasks" overwrites _vx with 1 or 2 on every pass

VAR

  long  cogon, cog              'cogon: flag set by start when a cog was launched, cleared by stop; cog: value returned by cognew

                                                    
PUB start(tvptr) : okay

'' Start TV driver - starts a cog
'' returns false if no cog available
''
''   tvptr = pointer to TV parameters
''
'' Any previously started driver cog is stopped first.
'' cognew returns the id of the new cog (0..7) or -1 when none is free,
'' so success must be tested with "=> 0". Testing "> 0" reports failure
'' for cog 0 and leaves cogon false, so stop could never free that cog.

  stop
  okay := cogon := (cog := cognew(@entry,tvptr)) => 0


PUB stop

'' Stop TV driver - frees a cog
'' Does nothing when no driver cog is recorded as running.

  if cogon
    cogon := false              'clear the running flag before releasing the cog
    cogstop(cog)


DAT

'*******************************
'* Assembly language TV driver *
'*******************************

                        org
'
'
' Entry
'
' Cog registers 0..15 double as the 16-entry palette: the first task section
' copies 16 bytes from the hub address in _tvpalette into them, and the :xloop
' pixel path uses a 4-bit pixel value directly as a register address.
' NOTE(review): execution starts at register 0; an all-zero long has condition
' field %0000, so these presumably run as no-ops before reaching entry2 - confirm.
'
entry                   long    0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0   'palette gets loaded in here

' Latch the interlace bit (tv_mode bit 1) for the line counter in the field loop.
' _mode is a RES register that is only filled by the first task section, so on a
' fresh start it still holds whatever happened to be loaded into cog RAM; read
' tv_mode (long 3 of the parameter block) straight from hub instead.
entry2                  mov     isinterlacedmode,par
                        add     isinterlacedmode,#3*4
                        rdlong  isinterlacedmode,isinterlacedmode
                        and     isinterlacedmode,#%0010

                        mov     taskptr,#tasks          'reset tasks

                        mov     x,#10                   'perform task sections initially
:init                   jmpret  taskret,taskptr
                        djnz    x,#:init
'
'
' Superfield
'
' Start of a superfield: restart the task list so parameters are reloaded during
' the coming black lines, then fall through into "field" with z already set up.
superfield              mov     taskptr,#tasks          'reset tasks

                        test    _mode,#%0001    wc      'if ntsc, set phaseflip
        if_nc           mov     phaseflip,phasemask

                        test    _mode,#%0010    wz      'get interlace into z
'
' Field
'
' One field: black lines (task sections run here), top border, the visible
' picture, bottom border, then vertical sync. Ends by jumping back to "field"
' (second field of an interlaced pair) or to "superfield".
' z is expected on entry from "superfield" or from the "if_z jmp #field" at the end.
field                   mov     x,vinv                  'do invisible back porch lines
:black                  call    #hsync                  'do hsync
                        waitvid burst,sync_high2        'do black
                        jmpret  taskret,taskptr         'call task section (z undisturbed)
                        djnz    x,#:black               'another black line?

                        wrlong  invisible,VSyncFeedBack '1 -> hub long at par+7*4

                        mov     current_line,#0         'first line number for this field is 0 ...
                        test    interlace,#1     wc
        if_z_and_c      mov     current_line,#1         '... or 1 when z and bit 0 of interlace are both set
                        wrlong  current_line,_nextline  'publish it in the hub long at par+8*4

                        mov     x,vb                    'do visible back porch lines
                        call    #blank_lines

                        mov     ptr,_tvdisplist         'pixel fetch starts at the hub address from parameter long 9
                        
                        mov     y,_vt                   'set vertical tiles
:line                   mov     vx,_vx                  'set vertical expand
:vert
                        test    _mode,#%0010    wz      'get interlace into z
        if_z            xor     interlace,#1            'interlace skip?
        if_z            tjz     interlace,#:skip

                        call    #hsync                  'do hsync

                        mov     vscl,hb                 'do visible back porch pixels
                        rdlong  bordercolour,_bordercolour 'fetch border colour from the hub long at par+10*4
                        xor     tile,bordercolour       'tile was set to phaseflip by hsync
                        waitvid tile,#0

                        mov     ptrbackup,ptr           'remember start of this line so it can be repeated (see :skip)

                        mov     vscl,hx                 'set horizontal expand
                        mov     x,_ht                   'set horizontal tiles
                        shl     x,#2                    'loop count = tiles * 4 (one waitvid per iteration)

                        cmp     current_line,#192 wc,wz 'lines 192 and above always take the palette path
        if_ae           jmp     #:xloop

                        test    _mode,#$80 wz           'mode bit 7 clear -> palette path, set -> direct path
        if_z            jmp     #:xloop
                        

                                                        'direct path: each hub long is used as the four colour bytes of one waitvid
:xloop2                 rdlong  pixels,ptr
                        add     ptr,#4
                        xor     pixels,phaseflip
                        waitvid pixels,#%%3210
                        djnz    x,#:xloop2
                        jmp     #:done
                        
                                                        'palette path: each hub word holds four 4-bit palette indices
:xloop                  rdword  pixels,ptr
                        movs    :p3,pixels              'bits 3..0  -> source field of :p3
                        and     :p3,pixeland            'pixeland clears source bits 8..4, leaving a register address 0..15
                        ror     pixels,#4
                        movs    :p4,pixels              'bits 7..4  -> :p4
                        and     :p4,pixeland
                        ror     pixels,#4
                        movs    :p1,pixels              'bits 11..8 -> :p1
                        and     :p1,pixeland
                        ror     pixels,#4
                        movs    :p2,pixels              'bits 15..12 -> :p2
                        and     :p2,pixeland

                        add     ptr,#2

                                                        'assemble the four palette registers into one long, :p1 in the top byte, :p4 in the bottom
:p1                     mov     pixels,0-0
                        rol     pixels,#8
:p2                     or      pixels,0-0                        
                        rol     pixels,#8
:p3                     or      pixels,0-0                        
                        rol     pixels,#8
:p4                     or      pixels,0-0                        
                        
                        xor     pixels,phaseflip
                        waitvid pixels,#%%3210
                        djnz    x,#:xloop
:done
                        mov     vscl,hf                 'do visible front porch pixels
                        mov     tile,phaseflip
'                        rdlong  bordercolour,_bordercolour
                        xor     tile,bordercolour       'reuses the border colour fetched at the start of this line
                        waitvid tile,#0

                        add     current_line,#1         'advance the published line number by 1 ...
                        cmp     isinterlacedmode,#2    wc      'c set when isinterlacedmode < 2 (value latched at entry2)
        if_nc           add     current_line,#1         '... or by 2 when isinterlacedmode is 2
                        wrlong  current_line,_nextline

:skip
                        cmp     vx,#1 wz                'more repeats of this line still to come?
        if_nz           mov     ptr,ptrbackup           'then rewind to the start of the same line
                        test    _mode,#%0010    wz      'get interlace into z
                        
                        djnz    vx,#:vert               'vertical expand?
                        add     line,lineinc    wc      'set next line
        if_nc           jmp     #:line

                        cmp     y,#2 wz                 'when y is 2 ...
        if_z            wrlong  visible,VSyncFeedBack   '2 -> hub long at par+7*4
                        test    _mode,#%0010    wz      'get interlace into z

                        
                        djnz    y,#:line                'another tile line?

'                        wrlong  visible,VSyncFeedBack   '2

        if_z            xor     interlace,#1    wz      'get interlace and field1 into z

                        test    _mode,#%0001    wc      'do visible front porch lines
                        mov     x,vf
        if_nz_and_c     add     x,#1
                        call    #blank_lines

                        mov     t1,par                  'write the low byte of interlace to hub byte par+1
                        add     t1,#1                   '(second byte of the status long)
                        wrbyte  interlace,t1


'        if_nz           wrlong  invisible,par           'unless interlace and field1, set status to invisible

        if_z_eq_c       call    #hsync                  'if required, do short line
        if_z_eq_c       mov     vscl,hrest
        if_z_eq_c       waitvid burst,sync_high2
        if_z_eq_c       xor     phaseflip,phasemask

                        call    #vsync_high             'do high vsync pulses

                        movs    vsync1,#sync_low1       'do low vsync pulses
                        movs    vsync2,#sync_low2
                        call    #vsync_low

                        call    #vsync_high             'do high vsync pulses

        if_nz           mov     vscl,hhalf              'if odd frame, do half line
        if_nz           waitvid burst,sync_high2

        if_z            jmp     #field                  'if interlace and field1, display field2
                        jmp     #superfield             'else, new superfield
'
'
' Blank lines
'
' Emit x border-coloured lines (x must be at least 1; it is counted down to 0).
' The border colour is re-read from hub on every line.
blank_lines             call    #hsync                  'do hsync

                        rdlong  bordercolour,_bordercolour 'hub long at par+10*4
                        xor     tile,bordercolour       'hsync left tile = phaseflip and vscl = hvis
                        waitvid tile,#0

                        djnz    x,#blank_lines
blank_lines_ret         ret
'
'
' Horizontal sync
'
' Emit one horizontal sync period.
' Leaves vscl = hvis and tile = phaseflip for the caller; writes c, leaves z untouched.
hsync                   test    _mode,#%0001    wc      'if pal, toggle phaseflip
        if_c            xor     phaseflip,phasemask

                        mov     vscl,sync_scale1        'do hsync       
                        mov     tile,phaseflip
                        xor     tile,burst
                        waitvid tile,sync_normal

                        mov     vscl,hvis               'setup in case blank line
                        mov     tile,phaseflip

hsync_ret               ret
'
'
' Vertical sync
'
' Two entry points sharing one body and one RET:
'   vsync_high - patches the source fields of vsync1/vsync2 to the "high" patterns first
'   vsync_low  - uses whatever the caller patched into vsync1/vsync2 beforehand
' Both emit vrep pairs of waitvids and clobber x.
vsync_high              movs    vsync1,#sync_high1      'vertical sync
                        movs    vsync2,#sync_high2

vsync_low               mov     x,vrep

vsyncx                  mov     vscl,sync_scale1
vsync1                  waitvid burst,sync_high1        'source field is patched at run time

                        mov     vscl,sync_scale2
vsync2                  waitvid burst,sync_high2        'source field is patched at run time

                        djnz    x,#vsyncx
vsync_low_ret
vsync_high_ret          ret
'
'
' Tasks - performed in sections during invisible back porch lines
'
' Task section 1: reload parameters, palette and pin setup.
' Entered through "jmpret taskret,taskptr"; each "jmpret taskptr,taskret" hands
' control back to the video loop and resumes here on the next black line.
tasks                   mov     t1,par                  'load parameters
                        movd    :par,#_enable           '(skip _status)
                        mov     t2,#paramcount - 1
:load                   add     t1,#4
:par                    rdlong  0,t1
                        add     :par,d0
                        djnz    t2,#:load               '+119

                        mov     VSyncFeedBack,par       'hub address of parameter long 7 (driver writes 1/2 there)
                        add     VSyncFeedBack,#7*4

                        mov     _nextline,par           'replace the loaded value with the hub address of long 8
                        add     _nextline,#8*4

                        mov     _bordercolour,par       'replace the loaded value with the hub address of long 10
                        add     _bordercolour,#10*4

                        mov     _vx,#1                  'vertical expand: 1, or 2 when mode bit 6 is set
                        test    _mode,#$40 wz
              if_nz      mov     _vx,#2

                        mov     t1,_tvpalette           'copy 16 palette bytes from hub into cog registers 0..15
                        movd    :getpal,#0              'patch must be at least one instruction ahead of :getpal:
                        mov     t2,#16                  'if it directly precedes it, the stale destination (16 = entry2
:getpal                 rdbyte  0-0,t1                  'after a previous pass) is used for the first byte
                        add     :getpal,d0
                        add     t1,#1
                        djnz    t2,#:getpal

                        mov     t1,_pins                'set video pins and directions
                        test    t1,#$08         wc
        if_nc           mov     t2,pins0
        if_c            mov     t2,pins1
                        test    t1,#$40         wc
                        shr     t1,#1
                        shl     t1,#3
                        shr     t2,t1
                        movs    vcfg,t2
                        shr     t1,#6
                        movd    vcfg,t1
                        shl     t1,#3
                        and     t2,#$FF
                        shl     t2,t1
        if_nc           mov     dira,t2
        if_nc           mov     dirb,#0
        if_c            mov     dira,#0
        if_c            mov     dirb,t2                 '+18

                        tjz     _enable,#disabled       '+2, disabled?

                        jmpret  taskptr,taskret         '+1=140, break and return later

                        'Task section 2: copy the NTSC or PAL column of wtab/ltab into the
                        'hvis.. and fcolor.. registers, then check CLKFREQ (hub long 0).
                        movs    :rd,#wtab               'load ntsc/pal metrics from word table
                        movd    :wr,#hvis
                        mov     t1,#wtabx - wtab
                        test    _mode,#%0001    wc
:rd                     mov     t2,0
                        add     :rd,#1
        if_nc           shl     t2,#16                  'ntsc: keep the low word; pal: keep the high word
                        shr     t2,#16
:wr                     mov     0,t2
                        add     :wr,d0
                        djnz    t1,#:rd                 '+54

        if_nc           movs    :ltab,#ltab             'load ntsc/pal metrics from long table
        if_c            movs    :ltab,#ltab+1
                        movd    :ltab,#fcolor
                        mov     t1,#(ltabx - ltab) >> 1
:ltab                   mov     0,0
                        add     :ltab,d0s1              'destination +1, source +2 (entries alternate ntsc/pal)
                        djnz    t1,#:ltab               '+17

                        rdlong  t1,#0                   'get CLKFREQ
                        shr     t1,#1                   'if CLKFREQ < 16MHz, cancel _broadcast
                        cmp     t1,m8           wc
        if_c            mov     _broadcast,#0
                        shr     t1,#1                   'if CLKFREQ < color frequency * 4, disable
                        cmp     t1,fcolor       wc
        if_c            jmp     #disabled               '+11

                        jmpret  taskptr,taskret         '+1=83, break and return later

                        'Task section 3: program counter A from fcolor (divide returns the frq value in t2).
                        mov     t1,fcolor               'set ctra pll to fcolor * 16
                        call    #divide                 'if ntsc, set vco to fcolor * 32 (114.5454 MHz)
                        test    _mode,#%0001    wc      'if pal, set vco to fcolor * 16 (70.9379 MHz)
        if_c            movi    ctra,#%00001_111        'select fcolor * 16 output (ntsc=/2, pal=/1)
        if_nc           movi    ctra,#%00001_110
        if_nc           shl     t2,#1
                        mov     frqa,t2                 '+147

                        jmpret  taskptr,taskret         '+1=148, break and return later

                        'Task section 4: program counter B from _broadcast, or switch it off when
                        '_broadcast is 0 (section 2 zeroes it when CLKFREQ is below 16MHz).
                        mov     t1,_broadcast           'set ctrb pll to _broadcast
                        mov     t2,#0                   'if 0, turn off ctrb
                        tjz     t1,#:off                'immediate target: without "#" the jump goes to the address held in register :off
                        min     t1,m8                   'limit from 8MHz to 128MHz
                        max     t1,m128
                        mov     t2,#%00001_100          'adjust _broadcast to be within 4MHz-8MHz
:scale                  shr     t1,#1                   '(vco will be within 64MHz-128MHz)
                        cmp     m8,t1           wc
        if_c            add     t2,#%00000_001
        if_c            jmp     #:scale
:off                    movi    ctrb,t2
                        call    #divide
                        mov     frqb,t2                 '+165

                        jmpret  taskptr,taskret         '+1=166, break and return later

                        'Task section 5: build vcfg, derive the horizontal and vertical metrics,
                        'then idle in :colors until "superfield" resets taskptr.
                        mov     t1,#%10100_000          'set video configuration
                        test    _pins,#$01      wc      '(swap broadcast/baseband output bits?)
        if_c            or      t1,#%01000_000
                        test    _mode,#%1000    wc      '(strip chroma from broadcast?)
        if_nc           or      t1,#%00010_000
                        test    _mode,#%0100    wc      '(strip chroma from baseband?)
        if_nc           or      t1,#%00001_000
                        and     _auralcog,#%111         '(set aural cog)
                        or      t1,_auralcog
                        movi    vcfg,t1                 '+10

                        mov     hx,_hx                  'compute horizontal metrics
                        shl     hx,#10                  'hx = (_hx << 12) | (_hx << 2), later copied to vscl
                        or      hx,_hx
                        shl     hx,#2

                        mov     t1,_ht
                        mov     t2,_hx
                        call    #multiply               't1 = _ht * _hx * 16
                        mov     hf,hvis
                        sub     hf,t1
                        shr     hf,#1           wc
                        mov     hb,_ho
                        addx    hb,hf
                        sub     hf,_ho                  '+52

                        mov     t1,_vt                  'compute vertical metrics
                        mov     t2,_vx
                        call    #multiply
                        test    _mode,#%0010    wc      '(if interlace, halve lines)
        if_c            shr     t1,#1
                        mov     vf,vvis
                        sub     vf,t1
                        shr     vf,#1           wc
                        neg     vb,_vo
                        addx    vb,vf
                        add     vf,_vo                  '+48

                        xor     _mode,#%0010            '+1, flip interlace bit for display

:colors                 jmpret  taskptr,taskret         '+1=112/160, break and return later

                        jmp     #:colors                '+1, keep loading colors
'
' Divide t1/CLKFREQ to get frqa or frqb value into t2
'
' In:  t1 = numerator.  Out: t2 = quotient bits, one per loop pass (33 passes).
' Clobbers t1, m1, m2 and c.
divide                  rdlong  m1,#0                   'get CLKFREQ

                        mov     m2,#32+1
:loop                   cmpsub  t1,m1           wc      'subtract CLKFREQ when it fits; c = it did
                        rcl     t2,#1                   'shift that result bit into t2
                        shl     t1,#1
                        djnz    m2,#:loop

divide_ret              ret                             '+140
'
'
' Multiply t1 * t2 * 16 (t1, t2 = bytes)
'
' In:  t1, t2 = byte values.  Out: t1 = product * 16.
' Shift-and-add over 8 bits; clobbers t2, m1 and c.
multiply                shl     t2,#8+4-1               'pre-shift t2 so the 8 right shifts of t1 leave product * 16

                        mov     m1,#8
:loop                   shr     t1,#1           wc      'next multiplier bit into c
        if_c            add     t1,t2
                        djnz    m1,#:loop

multiply_ret            ret                             '+37
'
'
' Disabled - reset status, nap ~4ms, try again
'
' Reached from the task sections when _enable is 0 or CLKFREQ is too low.
' Shuts the counters and video off, waits, then restarts at entry2.
disabled                mov     ctra,#0                 'reset ctra
                        mov     ctrb,#0                 'reset ctrb
                        mov     vcfg,#0                 'reset video

                        wrlong  outa,par                'set status to disabled
                                                        'NOTE(review): writes whatever outa holds; nothing in this file sets outa - confirm it is 0

                        rdlong  t1,#0                   'get CLKFREQ
                        shr     t1,#8                   'nap for ~4ms
                        min     t1,#3
                        add     t1,cnt
                        waitcnt t1,#0

                        jmp     #entry2                 'reload parameters
'
'
' Initialized data
'

' The first five values are not among the 11 longs "tasks" loads from hub, so
' _ho, _vo and _auralcog keep their assembled values; _vx is rewritten by "tasks"
' from mode bit 6, and _broadcast is only ever cleared (when CLKFREQ < 16MHz).
_vx                     long    VX_FORCE

_ho                     long    0               'ho
_vo                     long    0               'vo
_broadcast              long    50_000_000'_xinfreq<<4  'broadcast
_auralcog               long    0               'auralcog


pixeland                long    $fffffe0f       'ANDed into :p1..:p4 to clear source-field bits 8..4

bordercolour            long    $02             'initial value; re-read from hub before each border is drawn

m8                      long    8_000_000
m128                    long    128_000_000
d0                      long    1 << 9 << 0     'adds 1 to an instruction's destination field
d6                      long    1 << 9 << 6     'not referenced anywhere in this file
d0s1                    long    1 << 9 << 0 + 1 << 1    'destination field +1, source field +2
interlace               long    0
invisible               long    1               'value written to VSyncFeedBack before the visible lines
visible                 long    2               'value written to VSyncFeedBack near the end of the visible lines
phaseflip               long    $00000000
phasemask               long    $F0F0F0F0
line                    long    $00060000       'add line,lineinc carries out on every 16th add
lineinc                 long    $10000000
pins0                   long    %11110000_01110000_00001111_00000111
pins1                   long    %11111111_11110111_01111111_01110111
sync_high1              long    %0101010101010101010101_101010_0101
sync_high2              long    %01010101010101010101010101010101       'used for black
sync_low1               long    %1010101010101010101010101010_0101
sync_low2               long    %01_101010101010101010101010101010
'
'
' NTSC/PAL metrics tables
'                               ntsc                    pal
'                               ----------------------------------------------
' wtab: one long per entry, NTSC value in the low word, PAL value in the high word.
' Task section 2 copies the selected word of each entry into hvis..burst, in this order.
wtab                    word    lntsc - sntsc,          lpal - spal     'hvis
                        word    lntsc / 2 - sntsc,      lpal / 2 - spal 'hrest
                        word    lntsc / 2,              lpal / 2        'hhalf
                        word    243,                    286             'vvis
                        word    10,                     18              'vinv
                        word    6,                      5               'vrep
                        word    $02_8A,                 $02_AA          'burst
wtabx
' ltab: NTSC long followed by PAL long for each entry; copied into fcolor..sync_normal.
ltab                    long    fntsc                                   'fcolor
                        long    fpal
                        long    sntsc >> 4 << 12 + sntsc                'sync_scale1
                        long    spal >> 4 << 12 + spal
                        long    67 << 12 + lntsc / 2 - sntsc            'sync_scale2
                        long    79 << 12 + lpal / 2 - spal
                        long    %0101_00000000_01_10101010101010_0101   'sync_normal
                        long    %010101_00000000_01_101010101010_0101
ltabx
'
'
' Uninitialized data
'
taskptr                 res     1                       'tasks
taskret                 res     1
t1                      res     1
t2                      res     1
m1                      res     1
m2                      res     1

x                       res     1                       'display
y                       res     1
hf                      res     1
hb                      res     1
vf                      res     1
vb                      res     1
hx                      res     1
vx                      res     1
tile                    res     1
pixels                  res     1

hvis                    res     1                       'loaded from word table
hrest                   res     1
hhalf                   res     1
vvis                    res     1
vinv                    res     1
vrep                    res     1
burst                   res     1

fcolor                  res     1                       'loaded from long table
sync_scale1             res     1
sync_scale2             res     1
sync_normal             res     1
'
'
' Parameter buffer
'
' "tasks" fills these 11 registers, in order, from hub longs 1..11 of the
' parameter block at PAR (long 0 is the status long and is skipped).
'
_enable                 res     1       '0/non-0        read-only
_pins                   res     1       '%pppmmmm       read-only
_mode                   res     1       '%ccip          read-only (bit 6 selects _vx, bit 7 selects the pixel path)
_ht                     res     1       '1+             read-only
_vt                     res     1       '1+             read-only
_hx                     res     1       '4+             read-only
_tvsync                 res     1       'long 7 as loaded; not read by the code (the driver writes that hub long via VSyncFeedBack)
_nextline               res     1       'line to fetch - overwritten by tasks with the hub address par+8*4
_tvdisplist             res     1       'long 9: hub address where pixel fetching starts each field
_bordercolour           res     1       'long @border - overwritten by tasks with the hub address par+10*4
_tvpalette              res     1       'long 11: hub address of the 16 palette bytes

ptr                     res     1       'running hub read address for pixel data

current_line            res     1       'line number published through _nextline
isinterlacedmode        res     1       'mode bit 1 as latched at entry2 (0 or 2)

VSyncFeedBack           res     1       'hub address par+7*4

bitmapptr               res     1       'not referenced anywhere in this file

ptrbackup               res     1       'ptr at the start of the current line, for repeating it

                        fit 
''
''___
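''VAR                   'TV parameters - 13 contiguous longs
''
''NOTE: the list and descriptions below are inherited from the stock TV driver.
''      This modified driver loads only longs 1..11, and uses them as:
''      enable, pins, mode, ht, vt, hx, vsync feedback, next line,
''      display pointer, border colour, palette pointer.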
''
'' 0  long  tv_status     '0/1/2 = off/invisible/visible           read-only
'' 1  long  tv_enable     '0/non-0 = off/on                        write-only
'' 2  long  tv_pins       '%pppmmmm = pin group, pin group mode    write-only
'' 3  long  tv_mode       '%ccip = chroma, interlace, ntsc/pal     write-only
'' 4  long  tv_hc         'horizontal count tiles                  write-only
'' 5  long  tv_vc         'vertical count tiles                    write-only
'' 6  long  tv_hx         'horizontal tile expansion               write-only
'' 7  long  tv_vx         'vertical tile expansion                 write-only
'' 8  long  tv_ho         'horizontal offset                       write-only
'' 9  long  tv_vo         'vertical offset                         write-only
''10  long  tv_broadcast  'broadcast frequency (Hz)                write-only
''11  long  tv_auralcog   'aural fm cog                            write-only
''12  long  tv_scanline   '256 bytes (64 longs) for display buffer write-only
''13  long  tv_bordercolour
''14  long  tv_nextline
''15  long  tv_DisplayList_ptr
''16  long  tv_vsync
''
''The preceding VAR section may be copied into your code.
''After setting variables, do start(@tv_status) to start driver.
''
''All parameters are reloaded each superframe, allowing you to make live
''changes. To minimize flicker, correlate changes with tv_status.
''
''Experimentation may be required to optimize some parameters.
''
''Parameter descriptions:
''  _________
''  tv_status
''
''    driver sets this to indicate status:
''      0: driver disabled (tv_enable = 0 or CLKFREQ < requirement)
''      1: currently outputting invisible sync data
''      2: currently outputting visible screen data
''  _________
''  tv_enable
''
''        0: disable (pins will be driven low, reduces power)
''    non-0: enable
''  _______
''  tv_pins
''
''    bits 6..4 select pin group:
''      %000: pins 7..0
''      %001: pins 15..8
''      %010: pins 23..16
''      %011: pins 31..24
''      %100: pins 39..32
''      %101: pins 47..40
''      %110: pins 55..48
''      %111: pins 63..56
''
''    bits 3..0 select pin group mode:
''      %0000: %0000_0111    -                    baseband
''      %0001: %0000_0111    -                    broadcast
''      %0010: %0000_1111    -                    baseband + chroma
''      %0011: %0000_1111    -                    broadcast + aural
''      %0100: %0111_0000    baseband             -
''      %0101: %0111_0000    broadcast            -
''      %0110: %1111_0000    baseband + chroma    -
''      %0111: %1111_0000    broadcast + aural    -
''      %1000: %0111_0111    broadcast            baseband
''      %1001: %0111_0111    baseband             broadcast
''      %1010: %0111_1111    broadcast            baseband + chroma
''      %1011: %0111_1111    baseband             broadcast + aural
''      %1100: %1111_0111    broadcast + aural    baseband
''      %1101: %1111_0111    baseband + chroma    broadcast
''      %1110: %1111_1111    broadcast + aural    baseband + chroma
''      %1111: %1111_1111    baseband + chroma    broadcast + aural
''      -----------------------------------------------------------
''            active pins    top nibble           bottom nibble
''
''      the baseband signal nibble is arranged as:
''        bit 3: chroma signal for s-video (attach via 560-ohm resistor)
''        bits 2..0: baseband video (sum 270/560/1100-ohm resistors to form 75-ohm 1V signal)
''
''      the broadcast signal nibble is arranged as:
''        bit 3: aural subcarrier (sum 560-ohm resistor into network below)
''        bits 2..0: visual carrier (sum 270/560/1100-ohm resistors to form 75-ohm 1V signal)
''  _______
''  tv_mode
''
''    bit 3 controls chroma mixing into broadcast:
''      0: mix chroma into broadcast (color)
''      1: strip chroma from broadcast (black/white)
''
''    bit 2 controls chroma mixing into baseband:
''      0: mix chroma into baseband (composite color)
''      1: strip chroma from baseband (black/white or s-video)
''
''    bit 1 controls interlace:
''      0: progressive scan (243 display lines for NTSC, 286 for PAL)
''           less flicker, good for motion
''      1: interlaced scan (486 display lines for NTSC, 572 for PAL)
''           doubles the vertical display lines, good for text
''
''    bit 0 selects NTSC or PAL format
''      0: NTSC
''           3016 horizontal display ticks
''           243 or 486 (interlaced) vertical display lines
''           CLKFREQ must be at least 14_318_180 (4 * 3_579_545 Hz)*
''      1: PAL
''           3692 horizontal display ticks
''           286 or 572 (interlaced) vertical display lines
''           CLKFREQ must be at least 17_734_472 (4 * 4_433_618 Hz)*
''
''      * driver will disable itself while CLKFREQ is below requirement
''  _____
''  tv_ht
''
''    horizontal number of 16 * 16 pixel tiles - must be at least 1
''    practical limit is 40 for NTSC, 50 for PAL
''  _____
''  tv_vt
''
''    vertical number of 16 * 16 pixel tiles - must be at least 1
''    practical limit is 13 for NTSC, 15 for PAL (26/30 max for interlaced NTSC/PAL)
''  _____
''  tv_hx
''
''    horizontal tile expansion factor - must be at least 3 for NTSC, 4 for PAL
''
''    make sure 16 * tv_ht * tv_hx + ||tv_ho + 32 is less than the horizontal display ticks
''  _____
''  tv_vx
''
''    vertical tile expansion factor - must be at least 1
''
''    make sure 16 * tv_vt * tv_vx + ||tv_vo + 1 is less than the display lines
''  _____
''  tv_ho
''
''    horizontal offset in ticks - pos/neg value (0 for centered image)
''    shifts the display right/left
''  _____
''  tv_vo
''
''    vertical offset in lines - pos/neg value (0 for centered image)
''    shifts the display up/down
''  ____________
''  tv_broadcast
''
''    broadcast frequency expressed in Hz (ie channel 2 is 55_250_000)
''    if 0, modulator is turned off - saves power
''
''    broadcasting requires CLKFREQ to be at least 16_000_000
''    while CLKFREQ is below 16_000_000, modulator will be turned off
''  ___________
''  tv_auralcog
''
''    selects cog to supply aural fm signal - 0..7
''    uses ctra pll output from selected cog
''
''    in NTSC, the offset frequency must be 4.5MHz and the max bandwidth +-25KHz
''    in PAL, the offset frequency and max bandwidth vary by PAL type

in conjunction with some comments made in this thread http://forums.parallaxinc.com/forums/default.aspx?f=15&m=385095 in particular potatohead's comments about half way down, where he says that you can't quite feed a propeller for 640 pixels per line but you might get 512.

First thing - can we compress a picture vertically?

Looking at that code, I modified this little bit
                        mov     _vx,#1
                        test    _mode,#$40 wz
              if_nz      mov     _vx,#2
by adding in mov _vx,#1
and this squished up the picture vertically to half the size. The picture is 96 high and with the black bars at the top and the bottom, it now takes about 1/3 of the screen.

So far, so good, because maybe you could read in 1/3 of a screen from hub ram, and meanwhile another cog is reading in the next 1/3. That ought to improve the resolution three fold.

Next thing - can we compress things horizontally?

This driver is 256 pixels wide. Potatohead says maybe you can do 512.

Looking at the code above, I think there is some legacy code in there: if I put an endless loop in ":xloop", the driver still keeps running, so I think xloop is never used in this driver.

So the core of the video read is xloop2
:xloop2                 rdlong  pixels,ptr
                        add     ptr,#4
                        xor     pixels,phaseflip
                        waitvid pixels,#%%3210
                        djnz    x,#:xloop2
                        jmp     #:done

It reads four pixels from hub, increments the pointer, then displays them.

But I'm a bit stuck about how to make this loop any shorter to squish things horizontally. In other words, make the picture half as wide.
Is waitvid pixels,#%%3210 running as fast as is possible?
Is there some sort of delay hidden in there?


Addit: answering the question myself here after a few hours of research - the key is the VSCL register. Tracing this back through the code, the variable is "hx" which ends up in the main program as "my_hx = 11+((mode&1)*6)"

Tweaking the 11 does decrease the screen size, though at 7 there is a lot of shimmering on the screen, and at 6 the screen is blank.

I guess I know enough here to be dangerous!

Thanks in advance, advice would be most appreciated.
640 x 480 - 52K
«1

Comments

  • Dr_AculaDr_Acula Posts: 5,484
    edited 2011-08-05 07:41
    I have answered some of my questions here after some experiments. I have a little test pattern, with black/white/black/white and a few colors. Push this to the limit and it turns into gray.

    So, in Baggers' driver, make _vx one instead of two, and that makes the pixels half as high.
    Then tweak the following variables
    x_tiles  = 16                  
    y_tiles  = 6                 
    my_hx    = 11+((mode&1)*6)     
    

    First, decrease the y tiles so it doesn't run out of memory. I've gone down to 1.
    Then try my_hx, and I changed the 11 to 7 but that seems to shimmer a lot. So made it 8.
    Then increase the number of x tiles from 16. I went up to 21 before it runs off the screen.

    So that gives an x resolution of 16*21 = 336

    And for the y resolution, measuring with a ruler I think I can get 15 tiles, so that is 240 pixels

    Possibly one could go for higher resolution, but once bwbwbw merges into gray on a screen there probably isn't much point. (bbwwbbww is still individual pixels).

    The ratio of x to y does not really matter because any image can be pre-scaled prior to floyd-steinberg dithering.

    So that gives a total video ram of 336*240 = 80640 bytes.

    Next challenge is whether this can be pulled out of sram fast enough.

    With 30 frames a second, that is 2.4 megabytes per second, or about 413ns per byte. The sram is 55ns so that should be ok. (24 prop pins devoted to the sram chip and doing data in 8k blocks).

    I think the math works. Now time to build a real circuit.
  • RaymanRayman Posts: 14,876
    edited 2011-08-05 08:18
    I have drivers for my Flashpoint working that can show images on TV directly from either Flash or SRAM...

    Actually, I should clarify...

    I use a buffer in HUB RAM and point the modified TV driver to it.
    So the TV driver thinks it's showing the same stuff on every line or few lines.
    But, meanwhile, other cogs are moving the bitmap from external memory into the buffer.
  • Cluso99Cluso99 Posts: 18,069
    edited 2011-08-05 16:09
    Drac: If you use a tile driver or text driver, then keeping that in external SRAM should be simple enough. As for keeping the pixel data in SRAM, I am unsure; it will totally depend on resolution and perhaps overclocking. I was thinking of saying "it's impossible" just to bait the forum into proving me wrong haha!
  • Dr_AculaDr_Acula Posts: 5,484
    edited 2011-08-05 21:00
    I use a buffer in HUB RAM and point the modified TV driver to it.
    So the TV driver thinks it's showing the same stuff on every line or few lines.
    But, meanwhile, other cogs are moving the bitmap from external memory into the buffer.

    That is exactly the idea. Do you have some code that is in the public domain?

    I'm thinking you would have one slightly modified video driver doing the display, exactly as you say, and I think it would only need one other cog to pull data out of the sram.

    Thinking ahead, one idea is for a self contained board that has TV, sram, serial port, audio, and sd card. The fastest code is for movies and so the sd card is on this board. It may be limited by the sd data transfer rate (80-100 kilobytes a second).

    Then for slower things like using this board as a display for a GUI, that can be done via the serial port. So there is a cog dedicated to the serial port. And maybe you have some GUI commands, like 'draw a box'. And because there will be lots of unused ram, you could do fast background store/redraw for tiles to other parts of ram. So maybe another cog for that.

    One thing with pulling data out of ram is that the hub does not need nearly as big a buffer for video ram. Possibly only a few k, possibly only a couple of lines. So there is lots of space left in hub for GUI driver code etc in Spin.

    Anyway, the test for the moment is to see what happens with Floyd-Steinberg dithering when the number of x pixels is increased. Specifically, do the pixels start to blur into each other.

    The answer is that I think they do, but I also have a feeling that things can be pushed further than 336 pixels wide, because I can still see individual pixels.

    So - for testing without external ram, we need a long and thin picture. And I wanted skin tones. This is 86 colors and while some experiments at high resolution with 16 colors look great for scenery, 16 colors do not look so good for skin, as most skin ends up a mixture of yellow and red pixels. So it is going to be an 86 color palette and the 'reverse waitvid' code that has %%3210 in it.

    It should be possible to run the attached code on most propeller boards - just change the sd card pins and the tv (modepins) to suit.

    The photo does not quite do it justice as my camera is making the colors slightly more saturated than they are and also slightly higher contrast.

    Also the colors are not quite correct on this display as they were calibrated on a different TV display. Calibrating colors is a matter of putting the palette driver on a TV, then comparing to the colors on a VGA display where the RGB can be adjusted (ie in paintshop). Once the F-S code knows the true palette it can work out the nearest match in terms of the closest RGB value in a 3D color space.

    Hopefully anyone interested in this will be able to get the code working without too much trouble. The zip contains 3 spin files. There is also the picture file and just copy this to an sd card.

    What I don't really understand are the tiles and this bit of code
    PAL      = %0001
    NTSC     = %0000
    mode     = $c0 + NTSC
    
    x_tiles  = 21                    '16             try 21,3 08+mode, and in the tv driver, _vx=#1 instead of #2
    y_tiles  = 4                    '6
    my_hx    = 08+((mode&1)*6)      '11+((mode&1)*6)
    

    Some combinations of values don't seem to work. Pushing x_tiles beyond 21 seems to cause problems. And for the value in my_hx, 09 does not work and 07 is smaller but there is a lot of shimmering.

    I'm not sure what mode&1 equates to - NTSC is zero and so I'm guessing (mode&1)*6 equates to zero. In which case my_hx is jumping in small steps and maybe it can be tweaked for more x pixels.

    @Rayman, I found your movie player code - I think it is 16*12 tiles = 256*192 and appears to have the full color palette. It is 1/4 of the screen so if it were reading from external ram, that could be expanded to ? 512*384.

    What I'm wondering is whether it is possible to generate a really thin picture, maybe only 32 high, and 512 wide?
    640 x 480 - 64K
    490 x 366 - 28K
  • Cluso99Cluso99 Posts: 18,069
    edited 2011-08-05 22:04
    Drac: I presume you mean to just do a thin wide screen to test out your theories. Sounds good to me.
  • Dr_AculaDr_Acula Posts: 5,484
    edited 2011-08-05 22:22
    Yes, I just want a thin display, even one tile high is ok. I want to test out my theory that increasing the number of pixels in a line will start to blur the pixels together. I'm just starting to see that at 336, and I think it should get better because the physical width on my PC screen is now slightly wider than the width it ends up on the little TV display, and the pixels are almost not visible on the PC display.

    The code above used Bagger's driver, but I've found one by Rayman which I think can go to 512. I've got a whole collection of TV drivers, many of which don't use the waitvid %%3210 code and I think have less colors. But there are a few that have the full color palette and I would like to see which one(s) can be most easily extended to 512 pixels (maybe 32 x 1 tiles).

    I am encouraged by these comments in the video driver
    ''  tv_ht
    ''
    ''    horizontal number of 16 * 16 pixel tiles - must be at least 1
    ''    practical limit is 40 for NTSC, 50 for PAL
    ''  _____
    ''  tv_vt
    ''
    ''    vertical number of 16 * 16 pixel tiles - must be at least 1
    ''    practical limit is 13 for NTSC, 15 for PAL (26/30 max for interlaced NTSC/PAL)
    ''  _____
    ''  tv_hx
    ''
    ''    horizontal tile expansion factor - must be at least 3 for NTSC, 4 for PAL
    ''
    ''    make sure 16 * tv_ht * tv_hx + ||tv_ho + 32 is less than the horizontal display ticks
    

    In particular the comment that the practical limit for NTSC is 40 tiles of 16.

    Something is not quite right I think with the calculations in the code as anything more than 21 gives a blank screen. I suspect that something goes out of range?

    The maths is here
                            mov     hx,_hx                  'compute horizontal metrics
                            shl     hx,#10
                            or      hx,_hx
                            shl     hx,#2
    
                            mov     t1,_ht
                            mov     t2,_hx
                            call    #multiply
                            mov     hf,hvis
                            sub     hf,t1
                            shr     hf,#1           wc
                            mov     hb,_ho
                            addx    hb,hf
                            sub     hf,_ho                  '+52
    
  • potatoheadpotatohead Posts: 10,261
    edited 2011-08-05 23:08
    IMHO, those comments are for the usual 4 color, 16 pixel tiles. 4 pixel tiles have a much tighter waitvid loop, and that reduces the resolution. When those comments were written, JT Cook and others had not yet published the 4 pixel waitvid form. Chip authored that driver, with the intent of it having 4 color graphics tiles. That driver will do 640 pixels on a TV with no mods, because the tiles are 16 pixels, allowing plenty of time for the waitvid loop to complete.

    Can you run a prop at 100Mhz? That's probably what you will need for 512 pixels at the 4 pixel waitvid frame size, which is the next reasonable step above 320. A few pixels packed together have the same problem that a lot of pixels on the line packed together does. That problem is the waitvid loop. The pixel clock rate is the constraint, not just the absolute number of pixels. If one group of pixels can be done, they all can be done. If one cannot, none can. Not really possible to just stuff the pixels into the middle of the scan line, without hard coding the waitvids to read their pixel data, and even then, there isn't a big gain, due to the hub window constraint.

    I looked at my earlier comments. We found that waitvids could fetch data from the busses, without having to use a waitvid instruction. Linus discovered this with his 800x600 VGA driver.

    The technique isn't really all that viable, so those comments are off limits, and had assumed a waitvid loop that is a hack, more than useful.

    FWIW, things will blur together at resolutions over 256. A simple non-phase change driver starts this at 256 pixels, and gets crappy at 320. The Parallax driver core being used here performs very well up to 320 pixels. An interlaced signal, using two scan lines per pixel vertically, will do a very nice 320x200 display, with some blending and artifacting. At 512 pixels, small color details will be lost, leaving different color artifacts, if there are patterns, and mostly will display intensity changes only, color topping out well below that pixel rate. On many displays 640 is a smudged mess. PC capture cards and HDTV displays actually render a lot of it though. See my 80x50 driver for some of that rendering. Note, it's only 2 color per 8 pixels though. 4 colors would be useful, more than that and it would be messy. S-video has some added sharpness in the intensity, though color blending still happens. S-video might be good for what I describe below.

    Probably the timing metrics used for a 4 pixel frame break down, because there simply isn't enough time to execute the video loop at the desired tile size.

    You could use the Parallax driver in 4 color tile mode, with up to 64 palettes of 4 colors to choose from to see color effects at higher resolutions. 8 pixel tiles can be drawn at 512 pixels, and I am sure I've done that at one point. That means 4 unique colors per every 8 pixels. You could choose to not have any vertical tile height for a very interesting balance. All colors on screen, just no more than 4 per 8 pixel block horizontally, no limits vertically.

    I'm mentioning this because the color info breaks down at that resolution anyway. 256 is really the sweet spot. Doubling that means 8 pixels really can't display more color anyway, but intensity changes can still occur. An optimized picture might do very well with those limits. If S-video output were used, it would look pretty nice actually. Some care would have to be taken to keep the tile borders from corrupting the image. The most obvious one would be to limit color to 256 pixels, so the limit doesn't cause tile borders to be seen, but allow intensity changes within the tile, where possible. Such a program would have to break down the image into 256 pixel color, calculate palettes for areas that have similar color, or one color but intensity changes, then break those into the 64 palettes, matching those up with the tiles, however high they are. One scan line would require a lot of tile memory, almost as much as a bitmap, but your zone idea could still work.

    I suspect that zone idea of having multiple COGS render to buffers moving things into and out of the HUB will work, but will have fill rate limits. Sufficient for pictures and a simple GUI though. Probably not good enough for a real time display with sprites and such.

    I may have some time next week to toy with this, if you want to see some modifications to the Parallax driver.

    Re: Tiles and such. Play with the commented graphics demo.spin found in my sig, if you are having trouble with palettes and tiles. I broke it down to a simpler form than presented in the original graphics demo program.
  • Dr_AculaDr_Acula Posts: 5,484
    edited 2011-08-05 23:39
    Wow, thanks potatohead. Heaps to digest there.

    I've just been down a tangent trying to work out the maths of the horizontal resolution and started to find differences in code, eg baggers has
                            mov     hx,_hx                  'compute horizontal metrics
                            shl     hx,#10                 ' 
                            or      hx,_hx
                            shl     hx,#2                  ' 
    

    but in another earlier driver I found this
                            mov     hx,_hx                  'compute horizontal metrics
                            shl     hx,#8
                            or      hx,_hx
                            shl     hx,#4
    

    and also the horizontal stretch value in bagger's code is
    my_hx    = 08+((mode&1)*6)      '11+((mode&1)*6)   
    

    where other code has it set as a constant (eg 10).

    I think I might be starting to see some of the artifacts you talk about at 336 pixels. It would be interesting to see what the artifacts do look like at 512.

    With FS dithering the colors are likely to be very close to each other, eg alternating a light and a slightly darker shade of a color. In an ideal world I'd like to see that turn into the average of those two colors with the pixels not visible. In a not so ideal world it might end up with a different color altogether.

    I wonder if that could be tested on one of your demos - eg a tile that just alternated color $BD and $BC at 512 pixels per line?

    Addit - any chance you (potatohead) could point me more specifically to the driver you mentioned? I have quite a few TV drivers and a few are yours so I'm getting confused between them all.

    There is one you have which is 160x96 which has some very useful comments about how the timing loops are calculated.
    ' I want to explain some important constants up front, so I'll declare them here.
    ' Ordinarily they'd be tucked away after the code.  There is also a lot of skipping
    ' arounf between CON and DAT sections.  I'm declaring them in the order I can best explain them.
    '
    ' I've used some naming conventions.  These prefixes mean:
    ' NTSC_ :   The value is a constant of the NTSC system. They would probably be the same no matter
    '           what resolution or color mode you are using.  You'd probably only change them if you were
    '           converting to PAL.  In which case, good luck to you.
    ' CHOOSE_ : A choice was made for this particular resolution and color depth.  You might change these.
    ' CALC_     These were calculated from one or more of the choices you made.  Choose again and you may
    '           have to change these.
    
    '***************************************************
    '* Color Frequency                                 *
    '***************************************************
    '
    ' The NTSC output is a signal with various elements that need timing relative to each other.  So we need some
    ' sort of clock from which to time all the elements.  The frequency of the color carrier is useful.  This is
    ' 3.579545 Mhz. A cycle of this clock will take 1sec/3.579545MHz = 279.365ns.
    
    CON  NTSC_color_frequency       =     3_579_545
    DAT  NTSC_color_freq            long  NTSC_color_frequency
    
    ' Additionally this period of 279ns is divided up into 16 by a Phased Locked Loop circuit (PLL),
    ' and it is multiples of this period that the Video Scale Hardware Register (VSCL) is programmed.
    ' This period of 17.460313ns is called a "clock".
    
    
    
    '***************************************************
    '* Horizontal sync                                 *
    '***************************************************
    '* A hsync takes 10.9us.  That's 10.9us/17.460313ns = 624 clocks (rounded to an integer).
    CON  NTSC_hsync_clocks          =               624
    DAT  NTSC_hsync_VSCL              long  160368
    DAT  NTSC_control_signal_palette long  $00_00_02_8a
    DAT  NTSC_hsync_pixels          long  %%11_0000_1_2222222_11
    
    
    '***************************************************
    '* Blank lines                                     *
    '***************************************************
    CON  NTSC_active_video_clocks   =     3024
    DAT  NTSC_active_video_VSCL     long  NTSC_active_video_clocks
    
    
    
    '***************************************************
    '* User graphics lines                             *
    '***************************************************
    ' The important lines at last.  To fit 256 pixels in, we're going to make them 10 clocks each.
    ' That's 2560 clocks for the user graphics width.  You could use 11 clocks each or some other near value.
    ' It depends whether you want overscan to the left and right.  
    CON CHOOSE_horizontal_pixels  =  160
    CON CHOOSE_vertical_pixel_height = 1   '0 or 1, depending on vertical resolution
    CON CHOOSE_clocks_per_gfx_pixel = 2560 / CHOOSE_horizontal_pixels
    CON CALC_bytes_per_line = CHOOSE_horizontal_pixels * CHOOSE_vertical_pixel_height
    CON CALC_waitvids = CHOOSE_horizontal_pixels / 4
    
    ' Because we're going to use 256 color mode, we're only going to output 4 pixels per WAITVID.  So that
    ' decides the number of clocks per frame.  4.
    CON CALC_clocks_per_gfx_frame   =  CHOOSE_clocks_per_gfx_pixel*4
    
    
    ' Program the VSCL as before.
    DAT CALC_user_data_VSCL         long  CHOOSE_clocks_per_gfx_pixel << 12 + CALC_clocks_per_gfx_frame
    
    ' So if we're doing 256 pixels, 4 at a time, that's 256/4 = 64 frames.  64 WAITVIDS.
    CON CALC_frames_per_gfx_line    = 80/4
    
    
    ' This is the overscan. 3024 - 2560 Active Display Area Pixels
    
    CON CALC_overscan = 464 
    
    CON CHOOSE_horizontal_offset    = 00
    
    CON CALC_backporch = 208 + CHOOSE_horizontal_offset    'this must be a multiple of the total
                                                           'pixel clock 16 clocks in this case.
    CON CALC_frontporch = (CALC_overscan - CALC_backporch)
    

    At the moment I cannot run that code though, because I do not have the variables correct for the pins.
                            ' VCFG: setup Video Configuration register and 3-bit tv DAC pins to output
                            movs    VCFG, #%0000_0111       ' VCFG'S = pinmask (pin31: 0000_0111 : pin24)
                            movd    VCFG, #3                ' VCFG'D = pingroup (grp. 1 i.e. pins 8-15)
    
                            movi    VCFG, #%0_11_111_000    ' baseband video on bottom nibble, 2-bit color, enable chroma on broadcast & baseband
                                                            ' %0_xx_x_x_x_xxx : Not used
                                                            ' %x_10_x_x_x_xxx : Composite video to top nibble, broadcast to bottom nibble
                                                            ' %x_xx_1_x_x_xxx : 4 color mode
                                                            ' %x_xx_x_1_x_xxx : Enable chroma on broadcast
                                                            ' %x_xx_x_x_1_xxx : Enable chroma on baseband
                                                            ' %x_xx_x_x_x_000 : Broadcast Aural FM bits (don't care)
    

    but this is vastly different to the code I have which starts off with modepins = %010_0000
    and then does a whole lot of calculations. (My TV is on pins 16,17,18) but a simple change of the code above to movd VCFG, #2 does not work. Probably some other variable somewhere. It took a fair bit of trialing to work out the modepins above and while I understand that %010_0000 means %x10_xxxx is binary for group 2 ie pin group 16-23, I still don't understand why 0000 worked. I'd love to get your fractal demo working as I think this looks the easiest code to start experimenting with more pixels.

    I still am also not up to speed with tiles, so I don't understand the comment about problems at the interface between tiles. In fact, as far as I can see, there are no tiles in bagger's code. They seem to be a legacy from earlier tile code, and there are 'tiles' for the purposes of calculating timing values and there is even some legacy code in there for tiles but it never gets used. In the core of the code, bytes are being read straight out of hub ram, four at a time (one long per rdlong)
    :xloop2                 rdlong  pixels,ptr
                            add     ptr,#4
                            xor     pixels,phaseflip
                            waitvid pixels,#%%3210
                            djnz    x,#:xloop2
                            jmp     #:done
    

    I presume this loop is fast enough to run 512 pixels per row?
  • potatoheadpotatohead Posts: 10,261
    edited 2011-08-06 00:28
    "The driver" is just the TV.spin, authored by Chip. "The other driver" is the first VGA driver Linus posted. It's no longer on the forum. I maybe have a copy, as I snagged it that day. Will look. It's kind of broken though. The technique really doesn't work. Need waitvids.

    That loop probably won't do 512 at 80Mhz. Might do it at 100Mhz, can't remember my last tests, though I can do some next week. It's the 4 pixel frame that is the limiting factor. Given the TV sweep frequency, limiting the frame to 4 pixels puts a hard limit on how many pixels one can get. TV.spin uses 16 pixel frames, and that's a lot of time! Easy to do 640 pixels, but only 4 colors. Because there are more pixels, the PLLA per pixel, or pixel clock can be faster, because the waitvid spends more time drawing it.

    I've not looked at the driver you are using right now. I suspect Jim just bypassed the tile code, and set the timings up for a simple bitmap. That only takes a small loop, and yes, it fetches right from the HUB. That's actually the simplest case driver. The only thing simpler is a color bar test, or some other basic thing one can draw with just COG data.

    In that case, the tile parameters still apply, but just don't make tiles. Think waitvid loops and "tiles" that really are just one scan line high. That's how the modified driver maps to the TV.spin parameters. Most modifications to the Parallax TV.Spin driver do that kind of thing to preserve all the nice options Chip built in, like scaling horizontal and vertical, interlace, etc...
  • Dr_AculaDr_Acula Posts: 5,484
    edited 2011-08-06 00:57
    Thanks for all your help here. I'm both learning about video and pushing the boundaries at the same time.

    Attached is your fractal code. What would I need to modify to make it work on pins 16-18 instead of your default of pins 24-26?

    Also, re timing, are you saying that this loop can't pull data out of the hub fast enough?
                            mov     r1, #CALC_waitvids  '80 pixels horizontal resolution is 20 waitvids
    :draw_pixels            rdlong  B, A    'get four pixels from hub
                            waitvid B, #%%3210  'draw them to screen
                            add     A, #4    'point to next pixel group
                            djnz    r1, #:draw_pixels  'line done?
    

    (I think the comment there was from earlier code as the demo says 160 pixels, not 80)

    Thinking simplistically, I have video drivers where there are tiles, and if I add tiles and tweak another variable, I can get more tiles. But there are quite a range of values that don't work and I don't know why they don't work. There are a number of possibilities - calculations going out of range, backporch timing not right, or (maybe) data not being retrieved from hub fast enough.

    Your code attached has such a good explanation that I believe it will allow things to be increased much more incrementally. For example, add 16 more pixels, recalculate the waitvid delay, recalculate the timing at the end of the line, and test it.

    If I could get the pins worked out I think there might be some more experiments that could be useful.

    In particular, one thing I want to explore is the 3D NTSC color space. I do not fully understand NTSC in that it seems to reduce the 3D color of HSL down to two dimensions only. In paintshop pro is a very nice color picker that has Hue, Saturation and Luminance. Hue is the color. Saturation is grey to full color. Luminance is white to full color to black. That describes three dimensions. The propeller color space works in two dimensions for most of the colors (a range of luminances for each hue) but does not have a range of saturations for each hue/luminance combination). It then adds to the palette the grayscale. This ends up excluding a wide range of colors, particularly the colors with very little saturation, ie gray with a hint of blue.

    To build such colors, the Floyd Steinberg dithering will take a hue/luminance value and dither it with a gray value.

    What I would like to test is whether at high pixel counts per line, these dither to the correct "gray with a hint of color" color that the propeller palette cannot display, or whether they dither to an artifact color.
  • potatoheadpotatohead Posts: 10,261
    edited 2011-08-06 01:20
    I'll get the pins sorted in the morning. No worries.

    Yeah, that's simple code for testing, but the signal isn't that good. It's the "non-color phase change" style, so it breaks down at 256 pixels, artifacting quickly.

    That loop is kind of crappy really. It's got the waitvid right next to the HUB operation. I think the other one is better, because there are instructions in the dead time between the hub operation and the waitvid. Though now, I don't know. Need to go back and re-read the adventure Bill and Kurenko went through on his drivers. Maybe that one does go faster. I need to look at it again. Wrote that one a very long time ago, before I really knew how waitvid and the HUB were interacting.

    Thinking on colors and drivers...
  • Dr_AculaDr_Acula Posts: 5,484
    edited 2011-08-06 17:13
    Thanks potatohead - any code examples that can do more than 336 pixels per line with more than 64 colors would be most appreciated!
  • Toby SeckshundToby Seckshund Posts: 2,027
    edited 2011-08-07 03:34
    Dr_A

    Remember that the chroma bandwidth for PAL is at best only 1.5MHz. You are destined to get blurring and artifacts as it was designed into the system. Y/C will only regain some of the detail of the Y bits and leave the C bits alone.

    YUV or RGB will be the only way out of this, but then you are back to a form of slow scan VGA.
  • Dr_AculaDr_Acula Posts: 5,484
    edited 2011-08-07 06:31
    Below is 384x64 - this is the best I've been able to get.

    Look closely and there are some color artifacts starting to appear with purple areas in the skin tones. All things being equal, on a real screen I think the improvements in smoothness of pixels outweighs color artifacts.

    Some experiments:

    Potatohead said there may be problems pulling data out of hub ram fast enough. Well, the best way to see if that is the rate limiting factor rather than something else is to do a test. Take this video loop
    :xloop2                 rdlong  pixels,ptr
                            add     ptr,#4
                            xor     pixels,phaseflip
                            waitvid pixels,#%%3210
                            djnz    x,#:xloop2
                            jmp     #:done
    

    and add NOP instructions till it fails. It fails with two but not one.

    So it is pretty close to the max speed possible.

    Next experiment - remove the xor pixels. This removes some color but the picture is still there, and it does mean that the number of horizontal tiles can go from 21 to 23. So that is 368 pixels wide.

    But I am not sure this xor can be removed in real code. I'm not entirely sure what it does as there is a comment somewhere in the code that says it is there for PAL, and in another place, for NTSC.

    So I am not sure that one can be removed.

    So while I was doing this I noticed that PAL images seem smaller than NTSC images, so I changed over to PAL. Yes, you can fit more tiles in. Now it can go from 21 to 24 tiles. That is 384 pixels.

    That is the photo below.

    However, PAL has a lot of shimmering compared with NTSC, and this seems to be a common issue with every video driver I have seen for the propeller, so I think I would prefer a 336 pixel image not shimmering to a 384 pixel image that does.

    So, looking at all these factors, my feeling is that the optimum is not at 256, nor is it at 384, but it is somewhere around 336 pixels.

    So the next experiment: is it possible to pull data out of an sram as fast as that loop above?

    Dedicate a cog to this. Have an sram with at least 9 address bits directly accessible so that no latching is required for a single line. The /Rd line will be permanently low.

    The loop will be an out for the address, an in for the data, a wrbyte, an increment for the hub counter and an increment for the ram counter.

    I don't know if it will work or not, but it ought to be possible to start off with very tiny pictures, 16x16 pixels and those ought to read easily fast enough, and then gradually increase the rate until it fails.

    Or maybe pull the data out of the external ram directly and never send it through the hub.

    All great fun!
    640 x 480 - 40K
  • BaggersBaggers Posts: 3,019
    edited 2011-08-07 08:27
    Dr_A, the "xor pixels,phaseflip" is for PAL only, so all instances can be removed for NTSC only versions.
    Remember from adding nops to the inner loop that you can only add one nop before it fails, and that loop is outputting a whole long to the video buffer each pass. So you'd have to read in 4 bytes, OR them into a long, and write that into HUB-RAM. You're going to need a bigger buffer, and to start at the HSync and fill the line buffer rather than waiting until you start the loop.
  • BaggersBaggers Posts: 3,019
    edited 2011-08-07 08:29
    or you could just link 4 props together, and you can have 384x256 pixels :D
  • Dr_AculaDr_Acula Posts: 5,484
    edited 2011-08-07 19:00
    Good points there Baggers - yes that particular loop is too fast to add any more code.

    I am not 100% sure how long the waitvid takes, but I'm guessing that with (say) 192 lines, 336 pixels, 30 frames per second it is around 500ns. So this will need to be another cog fetching the data, and it would need to stay ahead of the video cog. 4 reads at 55ns is 220ns, but that assumes that we don't need a NOP in the ram read circuit, and if we do then it will be longer.

    Like you say, it is 4 bytes to make up one long. There could be another solution that uses two ram chips and reads in a 16 bit word instead of a byte. There may be just enough prop pins for this if pins 28-31 are recycled - eg of those 4 pins, use 3 to drive the TV and one is the data receive from another board. That leaves 28 prop pins to drive a ram chip.

    Still some more experiments to do. 256x96 is 24k and 256x192 fits in two props which I think is your propGFX. External ram will need to be faster than this to make it worthwhile.

    BTW, is there a link somewhere to where one could buy the PropGFX?
  • ericballericball Posts: 774
    edited 2011-08-08 10:57
    For TV on the Propeller you first need to understand how composite video works. Composite video is made up of three signals: Y or luma which is the black & white component, and two color difference signals: B-Y and R-Y (scaled and called U & V respectively; see http://www.poynton.com/notes/colour_and_gamma/ColorFAQ.html for more info about them and RGB conversions). The color difference signals are modulated in quadrature at the colorburst frequency (3.579545 MHz for NTSC or 4.43361875 MHz for PAL) then added to the Y signal. (S-Video keeps them separate.)

    This puts a hard limit of the colorburst frequency on the bandwidth of the color portion of the signal, i.e. the absolute maximum pixel rate is twice the colorburst frequency. Any higher will just be picked up by the color demodulator as frequency aliases. Furthermore, the resolution of both signals is further restricted by the color separation filter in the TV with Y signals near the colorburst frequency being decoded as colors. (S-Video doesn't have this restriction.)

    The Propeller video generator uses a 16 tap shifter to generate color phase instead of separate color difference signals. This requires the PLLA frequency to be set to 16 times the colorburst frequency, i.e. 57.272727 MHz for NTSC or 70.9379 MHz for PAL. This results in a total of 3640 PLLA cycles per line for NTSC and 4540 PLLA cycles per line for PAL. However, some of that time is lost to the horizontal sync and blanking leaving 2986 PLLA cycles (NTSC, 480 lines) or 3694 PLLA cycles (PAL, 576 lines) for a 4:3 image. At 12 PLLA per pixel that gives a horizontal resolution of 248 pixels (NTSC) or 307 pixels (PAL).

    Now, if you're looking at pulling that pixel data from external RAM then you need to determine how many bytes you can retrieve per horizontal line which is 63.555usec (NTSC) or 64usec (PAL). (Includes horizontal blanking time but not vertical blanking.)
  • LoopyBytelooseLoopyByteloose Posts: 12,537
    edited 2011-08-08 11:25
    Seems like B/W would be the fastest. External ram seems to be limited to 20Mhz when the Propeller is running at 80Mhz (per Andre LeMothe).
  • Dr_AculaDr_Acula Posts: 5,484
    edited 2011-08-08 18:14
    Thanks ericball - that is a very good link. I like the matrix transformations - easier than some of the code examples I've seen.

    RGB is the easiest color scheme to understand, and it maps to the VGA driver neatly.

    RGB transforms into HSL with a formula that is reversible. I find HSL hard to describe and when I get home I might take a screenshot of the color picker for paintshop because it demonstrates it well, but essentially you have hue, which is easy to explain, and saturation, which is gray to color, and then there is a L value for each H/S combination, and the L value can be thought of in two parts - black to the H/S value (a color), and then that color up to white.

    What I don't quite understand is how this is translated into NTSC. The hue translates to the phase. But I am not quite sure what the amplitude of the waveform is - whether it is the Saturation or the Luminance.

    From a practical perspective, if one were to plot the propeller palette in a 3D RGB space (or a 3D HSL space), there is clustering of the palette around the fairly saturated colors, and no points round the unsaturated ones. In fact, as far as I can see and based on testing with paintshop, I think that the propeller palette has a range of hue values and the saturation is fixed at about 0.7, and there are five Luminance values for each of those colors. And then there are the gray scale colors which have a Saturation value of zero, and a range of luminances. There are also the fully saturated colors with a saturation value of 1 (or more?) but to my eye, pictures are already coming out looking a bit too saturated so I haven't included these in the Floyd-Steinberg palette.

    In addition, due to the limitations of the 16 tap shifter you describe, it appears that yellow comes out a brown color.

    Maybe you can answer this because I couldn't really find an explanation (albeit I think I got close from an e-book, but of course, they only show you a few pages, and just when it was getting interesting, they tell you those pages are deleted and you have to buy the book!), but how does NTSC encode saturation? More specifically, down at the waveform level, you have a wave with a phase (which encodes color) and you would have an amplitude, which encodes something, but how do you combine Saturation and Luminance into a wave without losing information? As far as I can see, you can only code one of those. Unless you do something cunning like superimpose two waves on top of each other? (Please forgive my ignorance, I live in a PAL country (which I also don't understand very well), and really, RGB is the only system I properly understand).

    Anyway, I'd like to push the prop further and I think you can get the wider range of saturation values by dithering color and grayscale pixels, and early experiments do suggest this works but only if the pixels are nice and small. Hence my motivation to get more pixels per line.

    I've got some boards on the way as all of this will only work if it is possible to pull data out of a ram fast enough. Otherwise, the best solution will be Baggers' dual prop system.

    I have another idea that could be worth brainstorming - what if you build the entire waveform inside a ram chip and then output it? Bypass waitvid and create the waveform in code. At the end of the day, a prop has 3 pins going up and down, and so a ram chip could have 3 pins going up and down as well. Program in the color burst, the front porch, picture, backporch etc.

    At first glance the bandwidth of a NTSC signal is 6Mhz, so if you take a digital signal and filter it with a 6Mhz low pass filter you should get a picture with no loss of information. Can a memory chip output at 6Mhz? Well, 55ns is around 20Mhz so the chip should be fast enough. How much memory - well at 30 frames a second that is 6000000/30 = 200,000 samples which should fit in a 512k ram chip.

    Is there a catch? Probably. I can see one obvious problem - how do you get the data into the ram chip in the first place, and to solve that you would need two ram chips - one to fill with data while the other one is playing. Maybe there is another catch?

    eric, on your avatar is the NTSC waveform. I think they are the color bars. Can you explain why the color bars move down as the hue is changed? And also, what does the waveform look like when you zoom right in? Is it a sine wave with a certain amplitude, phase, width (and shape)?
  • ericballericball Posts: 774
    edited 2011-08-08 20:39
    Hue roughly corresponds to the phase of the color signal and saturation to the amplitude. L corresponds to Y. However, HSL isn't the same as YUV as the transforms are very different. All of the information is in the linked page. On the Propeller there are two saturations - low saturation (Y = 2..6) where the color amplitude is equal to the colorburst and high saturation (Y=0 or 7) which is -3 times the colorburst amplitude.

    NTSC Composite video signal is Y(t) + U * cos(Fc * t) + V * sin(Fc * t) + sync(t), while PAL is Y(t) + U * cos(Fc * t) +/- V * sin(Fc * t) + sync(t), with the correct filters and demodulators each component can be extracted.

    Dithering trades off spatial resolution for color resolution. Unfortunately composite video restricts your spatial resolution and the Propeller has limited color resolution.

    Yes, it is possible to output a raw waveform - my NTSC4x1, NTSC4x2 and NTSC240H (sprite) drivers all use a variation of this technique. You encode the picture at 4 times the colorburst frequency then use VGA mode to output it to a DAC (either the standard 3 (+1) pin TV DAC or a custom 8 pin DAC). You don't even need to encode the sync as the driver can generate it. The problem is calculating the pixel values is non-trivial as you need to scale the picture to the correct size, transform RGB to YUV then filter the result to avoid aliasing. It can be done for static images, but I wouldn't want to do it using a Propeller. Also the resulting palette will have a better distributed gamut than the standard Propeller but may not have as many hues or greys (depending on the width of your DAC).

    Yep, my avatar is a time graph of the NTSC colorbars. Spend some time with the RGB to YUV (phase/amplitude) and I think you will be able to answer your own questions. (Hint - from left to right you have sync, burst, white, yellow, then cyan.) Learning about vectorscopes might also provide some insight.
  • Dr_AculaDr_Acula Posts: 5,484
    edited 2011-08-08 21:24
    Thanks ericball - looks like I have some homework to do!

    Re
    You don't even need to encode the sync as the driver can generate it. The problem is calculating the pixel values is non-trivial as you need to scale the picture to the correct size, transform RGB to YUV then filter the result to avoid aliasing.

    Have you got an example of, say how a RGB value is translated?

    I'm already doing some of that for the dithering, eg scaling, a number of transforms and then finding the best colors. I have vb.net and C code for many of the transforms in that link you gave. Adding aliasing filtering should be possible too as I convert the values into an array so we can move a filter over that array. I'm reading this link at the moment http://en.wikipedia.org/wiki/YUV Have you got a link to the code for, say, your NTSC4x driver?
  • ericballericball Posts: 774
    edited 2011-08-09 05:47
    Dr_Acula wrote: »
    have you got an example of, say how a rgb value is translated?

    y',r',g',b' = [0...1]
    y' = 0.299 * r' + 0.587 * g' + 0.114 * b'
    u = 0.492111 * (b'-y')
    v = 0.877283 * (r'-y')
  • Dr_AculaDr_Acula Posts: 5,484
    edited 2011-08-09 07:15
    Ok, how does that then translate into an actual NTSC waveform?

    I have not had any luck finding anything on the internet that explains this next step - I suspect I may be searching the wrong search terms. I presume some sort of sine wave is created, with a certain amplitude and a certain delay corresponding to phase? Maybe I could understand this by studying your code you mentioned above?
  • ericballericball Posts: 774
    edited 2011-08-09 07:59
    Dr_Acula wrote: »
    Ok, how does that then translate into an actual NTSC waveform?
    NTSC Composite video signal is Y'(t) + U(t) * cos(Fc * t) + V(t) * sin(Fc * t) + sync(t)
    PAL is Y'(t) + U(t) * cos(Fc * t) +/- V(t) * sin(Fc * t) + sync(t)
    where Fc is the appropriate colorburst frequency

    There are also some scaling factors, but that's the basic formula.
  • Dr_AculaDr_Acula Posts: 5,484
    edited 2011-08-09 19:19
    Thanks++ for all your help here.

    In that formula, what is t? With the sin and cosine in the formula, that implies a formula that ends up producing a sine wave. It probably is a fairly clean sine wave for a real TV camera, but when you create a wave on the propeller, presumably t is a discrete step rather than a continuous variable. How many discrete t intervals are there for a pixel at, say, 512 pixels per line? Do you end up getting multiple sine waves per pixel or just one? If there are multiple sine waves, at the crossover between pixels do you need to do this at a zero crossing? Is this the 'magic' that is happening inside a waitvid instruction?

    addit - for reference, a good description of waitvid and other video on page 11 http://www.rayslogic.com/propeller/PropellerDatasheet-v1.1.pdf
  • LoopyBytelooseLoopyByteloose Posts: 12,537
    edited 2011-08-10 04:13
    Great thread. Anyone that wants to maximize video should read along.
    Thanks to all.

    Thinking of Cosine as a horizontal element and Sine as a vertical element might help.

    The color is based on the phase shift, not how many cycles per pixel. But there seems to be a rate limit for the measurement of phase shift.
  • ericballericball Posts: 774
    edited 2011-08-10 06:48
    Dr_Acula wrote: »
    Thanks++ for all your help here.

    In that formula, what is t? With the sin and cosine in the formula, that implies a formula that ends up producing a sine wave. It probably is a fairly clean sine wave for a real TV camera, but when you create a wave on the propeller, presumably t is a discrete step rather than a continuous variable. How many discrete t intervals are there for a pixel at, say, 512 pixels per line? Do you end up getting multiple sine waves per pixel or just one? If there are multiple sine waves, at the crossover between pixels do you need to do this at a zero crossing? Is this the 'magic' that is happening inside a waitvid instruction?

    t is time, and Y(t), U(t), and V(t) are those values in raster order.

    Yes, ideally the sin waves are analog but a square wave at the right frequency is just as good because all of the harmonics are beyond the TV's bandwidth.

    A TV line is 63.5555usec (NTSC) / 64usec (PAL) with 70-80% of each line being active. (Various TVs have different amounts of overscan. Even HDTVs which shouldn't have overscan often stretch the picture slightly - even in hi-def! 8-bit computers and consoles often had a large black border to avoid data loss due to overscan.) So if we assume 80% active then 512 active pixels works out to a pixel frequency of about 10MHz. However, it would be difficult / impossible to generate color directly using that frequency. Either you use the Propeller's built-in composite video generator (in which case you'd set VSCL.PixelClocks to 5 (NTSC) or 7 (PAL) ) or you use 4 times the colorburst frequency and output Y+U, Y+V, Y-U, Y-V. Note: 512 active pixels per line (10MHz pixel frequency) is above double the colorburst frequency so you will get color artifacts instead of detail.

    Depending upon the width of the pixel a single pixel may contain more or less than a complete sine wave. When there is a color difference between pixels there is a phase shift, which likely doesn't occur at the zero crossing. But that's okay as the harmonics are, again, high frequency and thus don't have much impact. (However, high contrast edges and abrupt color shifts may result in some color artifacts at the edges - depending upon the TV.)

    WAITVID isn't magic, WAITVID just causes the cog to wait until the frame counter reaches zero. The magic is in the video generator.

    When the frame counter reaches zero the pixel and frame counters are loaded from VSCL and the color and pixel registers are loaded from the source and destination data buses. Each PLLA decrements the frame and pixel counters and increments the phase counter. (I call it a phase shifter, but it's really a counter.) When the pixel counter reaches zero the pixel register is shifted by one or two bits (i.e. one pixel) and the counter is reset to the value previously loaded. One or two LSBs of the pixel register select the appropriate byte from the color register which either drives the output pins directly or is split into luma, color enable and phase values. If color is disabled the luma value drives the pins. If color is enabled then the phase value is added to the phase counter and the third bit determines whether 1 is added or subtracted from the luma value before driving the output pins.
  • idbruceidbruce Posts: 6,197
    edited 2011-08-10 12:39
    @Dr_Acula and ericball

    As Loopy Byteloose so eloquently stated, "Great thread.". Not really my cup of tea, but still very interesting. Dr_Acula I hope you succeed with this. I just wish this thread had updates a little more often. However I realize experimentation and the thought process, along with normal duties takes some time. Good luck.

    Bruce
  • Dr_AculaDr_Acula Posts: 5,484
    edited 2011-08-18 07:30
    Thanks idbruce. Thread has been quiet because this is one of those Thomas Edison projects - 1% inspiration and 99% perspiration. Much like your project I'm sure!

    Oh boy has this been complicated. Attached is code and a photo of the setup.

    So many things to go wrong. eg, you can't attach diagnostic leds to /oe, /wr and /rd on a sram because they pull the lines down and then as you do a handover from spin to pasm by tristating the propeller pins, you lose the memory. I have leds on D0-D7 and A0-A7 and that has been enough for diagnostics.

    It is also really hard to debug as this is running at full pasm speed. Why is the line_counter crashing the entire program? Well, it turns out there was a bug in the video driver adding 1 twice, and also the interlace was adding another 1. How do you find such a bug? a/ take the value and convert to a pixel location and print a white pixel on the screen. The value should be 96 and it was counting up to around 390 and this was the clue that it was around 4x the proper value.

    Maybe I could add a debug serial routine or something, but this stuff runs so fast that even adding in a few lines of debug code alters the behavior of the code being debugged.

    The experiment is fairly simple. Fill a sram with some data (I do this in Spin), then read it out one line at a time.

    So - is the ram fast enough?

    Well, Baggers' graphics driver has been tweaked a bit so it updates the line counter at the right time, ie, as soon as the new frame is started, and then immediately after all the pixels have been displayed and before the boundary and the back porch are displayed.

    As soon as the ramdriver cog detects a new line, it reads out some data.

    Now, I'm not 100% sure what I'm actually seeing with this experiment, but what I see is that every time I recompile and download, I see a different pattern of black lines at random locations on the screen. Sometimes it is 6. Sometimes it is 20. What I think is that there is a phasing issue and sometimes it is catching up and sometimes not.

    But I think it is reading data off a ram chip fast enough.

    And I think that my code can be optimised further.

    Fundamentally, we have a statistic Potatohead mentioned recently in another thread, where he said that with a 256 pixel driver there are 10 instructions per pixel.

    And we have some code from Cluso99's ramblade driver which is doing some clever things preloading the address lines, and exploiting the 9 bit nature of the propeller (which in some ways is not a 32 bit chip) and which works the best when driving a sram with D0-7 on prop pins P0-P7 and address pins P8 upwards.

    Cluso's batch ram load routine is only 5 pasm instructions long
    rdloop                  mov     outx, ina               ' read byte from SRAM \ ignores upper bits
                            wrbyte  outx, hubaddr           ' copy byte to hub    /
                            add     hubaddr, #1             ' inc hub pointer
                            add     outa, #(1 << 8)         ' inc sram address
                            djnz    len, #rdloop            ' loop for xxx bytes
    

    and I think that means it should be fast enough.

    No NOPs needed with a 55ns sram chip either, and I think that once the /rd pin is taken low, reads are faster than, say, alternate read/write/read.

    There are certainly more optimisations to go here. I'm really bad at writing fast code - my initial code for the ram driver was 20 pasm instructions and cluso's is 5. I'm not at all happy with the global structure of the code that tests for new lines in the ram driver object.

    But I am very encouraged by the fact that every time I tweak something, I get more "rainbow" lines and less black lines.

    Back to coding... :)
    1024 x 768 - 144K
Sign In or Register to comment.