I wrote a new VRAM data transfer routine since my old one was using page 1 and I want the stack space for something else. I think this is a good method if you don't mind using up some extra ROM. It is pretty fast, as it has to make a constant six branches per block of data to be copied. I am looking for feedback on whether this is a decent method, or whether I am missing something that could be improved.
Forgive the heavy macro code, but I think it should still be understandable even if you aren't familiar with the macros.
Data format: PPUAddressHI, PPUAddressLO, Length (max 32, bits inverted), data, data, ..
PPUAddressHI : bit7 set means no more data blocks.
PPUAddressHI : bit6 set means column mode.
I am trying to figure out if you can have a block of 32 copies and jump into the appropriate spot, but then the constant part of the offset will be wrong. You could read the buffer backwards, but then it has to be written backwards.
EDIT: Realized that there is no requirement to negate the data count. I was doing that to avoid having to clear the carry for the add in each if-block; instead, I can just add one less than the current constant.
Forgive the heavy macro code, but I think it should still be understandable even if you aren't familiar with the macros.
Data format: PPUAddressHI, PPUAddressLO, Length (max 32, bits inverted), data, data, ..
PPUAddressHI : bit7 set means no more data blocks.
PPUAddressHI : bit6 set means column mode.
Code:
.proc PPUtransferFast
; Copy every queued data block from VramBuffer1 to the PPU, then mark the
; buffer as empty.
;
; Block format: PPUAddressHI, PPUAddressLO, Length, data, data, ..
;   PPUAddressHI bit7 set : no more data blocks (end of buffer).
;   PPUAddressHI bit6 set : column mode (CT_ADDRINC32 is set in PPU_CTRL).
;   Length                : max 32, stored with its bits inverted.
;
; Buffer fixed to VramBuffer1
; can copy about 140 bytes with OAM DMA
; can copy about 185 bytes without OAM DMA
;
; Clobbers A, X, Y and local::dataLength.
; Writes PPU_CTRL, PPU_ADDRESS and PPU_DATA; calling code is expected to set
; PPU_CTRL afterwards if needed.
    locals
        dataLength .byte
    endlocals

    ldy #0 ; y is index into buffer
    lda VramBuffer1
    ; An empty buffer starts with a negative byte. The repeat/until below only
    ; tests for the end marker AFTER a block has been processed, so without
    ; this guard the end marker itself would be treated as a block header.
    bpl transferBlocks
    jmp resetBuffer ; jmp, not a branch: the unrolled copies below are longer than a relative branch can span

transferBlocks:
    repeat
        ; Entry invariant: a = PPUAddressHI of the current block, y = its index.
        ldx #CT_NMI ; assume horizontal, leave NMI active
        ; check for vertical write:
        if rol a == bit7 set ; rol moves bit 6 into bit 7, so N reflects the column-mode flag
            ldx #(CT_NMI | CT_ADDRINC32)
        endif
        stx PPU_CTRL ; calling code expected to set PPU_CTRL after if needed.
        ror a ; restore a
        sta PPU_ADDRESS ; bit6 ignored here.
        iny
        lda VramBuffer1, y ; PPU address low
        sta PPU_ADDRESS

        ; load length byte:
        ; bits for length are pre-inverted for more optimal code
        iny
        lda VramBuffer1,y
        tax
        if !a & #32 ; bit 5 clear in the inverted byte: length is exactly 32
            ; if here, all PPU writes below will be executed
            tax ; a is zero, so set length to 0 -> bottom 5 bits will be read as set (logic is inverted below)
            iny
            mb PPU_DATA := VramBuffer1[ y ] ; do extra copy for total of 32
        endif
        mb local::dataLength := x

        ; Each lsr below shifts the next inverted length bit into carry.
        ; Carry clear means that bit of the real length is set, so the matching
        ; unrolled group of 1/2/4/8/16 bytes is copied. a tracks the buffer
        ; index; carry is known clear inside each group, so "+c" adds exactly
        ; the constant.
        tya ; count index with a
        if lsr local::dataLength == carry clear
            mb x, PPU_DATA := VramBuffer1[ y + 1 ]
            mb y := a +c #1
        endif
        if lsr local::dataLength == carry clear
            mb x, PPU_DATA := VramBuffer1[ y + 1 ]
            mb x, PPU_DATA := VramBuffer1[ y + 2 ]
            mb y := a +c #2
        endif
        if lsr local::dataLength == carry clear
            mb x, PPU_DATA := VramBuffer1[ y + 1 ]
            mb x, PPU_DATA := VramBuffer1[ y + 2 ]
            mb x, PPU_DATA := VramBuffer1[ y + 3 ]
            mb x, PPU_DATA := VramBuffer1[ y + 4 ]
            mb y := a +c #4
        endif
        if lsr local::dataLength == carry clear
            mb x, PPU_DATA := VramBuffer1[ y + 1 ]
            mb x, PPU_DATA := VramBuffer1[ y + 2 ]
            mb x, PPU_DATA := VramBuffer1[ y + 3 ]
            mb x, PPU_DATA := VramBuffer1[ y + 4 ]
            mb x, PPU_DATA := VramBuffer1[ y + 5 ]
            mb x, PPU_DATA := VramBuffer1[ y + 6 ]
            mb x, PPU_DATA := VramBuffer1[ y + 7 ]
            mb x, PPU_DATA := VramBuffer1[ y + 8 ]
            mb y := a +c #8
        endif
        if lsr local::dataLength == carry clear
            mb x, PPU_DATA := VramBuffer1[ y + 1 ]
            mb x, PPU_DATA := VramBuffer1[ y + 2 ]
            mb x, PPU_DATA := VramBuffer1[ y + 3 ]
            mb x, PPU_DATA := VramBuffer1[ y + 4 ]
            mb x, PPU_DATA := VramBuffer1[ y + 5 ]
            mb x, PPU_DATA := VramBuffer1[ y + 6 ]
            mb x, PPU_DATA := VramBuffer1[ y + 7 ]
            mb x, PPU_DATA := VramBuffer1[ y + 8 ]
            mb x, PPU_DATA := VramBuffer1[ y + 9 ]
            mb x, PPU_DATA := VramBuffer1[ y + 10 ]
            mb x, PPU_DATA := VramBuffer1[ y + 11 ]
            mb x, PPU_DATA := VramBuffer1[ y + 12 ]
            mb x, PPU_DATA := VramBuffer1[ y + 13 ]
            mb x, PPU_DATA := VramBuffer1[ y + 14 ]
            mb x, PPU_DATA := VramBuffer1[ y + 15 ]
            mb x, PPU_DATA := VramBuffer1[ y + 16 ]
            mb y := a +c #16
        endif
        iny ; step past the last byte consumed, onto the next block's PPUAddressHI
    until lda VramBuffer1[ y ] == N set

resetBuffer:
    ; clear buffer:
    ldy #0
    sty M::VramBuffer1Offset
    lda #$FF ; first byte is negative: the entry guard makes the next call a no-op
    sta VramBuffer1
    rts
.endproc
; Buffer fixed to VramBuffer1
; can copy about 140 bytes with OAM DMA
; can copy about 185 bytes without OAM DMA
;
; Copies each data block in VramBuffer1 to the PPU, then marks the buffer empty.
; Block format: PPUAddressHI, PPUAddressLO, Length (max 32, bits inverted), data, ..
;   PPUAddressHI bit7 set: no more data blocks. bit6 set: column mode.
; Uses A, X, Y and local::dataLength.
; NOTE(review): repeat/until tests the end marker only after a block has been
; processed, so a buffer whose first byte is negative is still processed once.
; Confirm that callers check for an empty buffer before calling.
locals
dataLength .byte
endlocals
ldy #0 ; y is index into buffer
lda VramBuffer1
repeat
; Entry invariant: a = PPUAddressHI of the current block, y = its index.
ldx #CT_NMI ; assume horizontal, leave NMI active
; check for vertical write:
if rol a == bit7 set ; rol moves bit 6 into bit 7, so N reflects the column-mode flag
ldx #(CT_NMI | CT_ADDRINC32)
endif
stx PPU_CTRL ; calling code expected to set PPU_CTRL after if needed.
ror a ; restore a
sta PPU_ADDRESS ; bit6 ignored here.
iny
lda VramBuffer1, y ; PPU address low
sta PPU_ADDRESS
; load length byte:
; bits for length are pre-inverted for more optimal code
iny
lda VramBuffer1,y
tax
if !a & #32 ; bit 5 clear in the inverted byte: length is exactly 32
; if here, all PPU writes below will be executed
tax ; a is zero, so set length to 0 -> bottom 5 bits will be read as set (logic is inverted below)
iny
mb PPU_DATA := VramBuffer1[ y ] ; do extra copy for total of 32
endif
mb local::dataLength := x
; Each lsr below shifts the next inverted length bit into carry. Carry clear
; means that bit of the real length is set, so the matching unrolled group of
; 1/2/4/8/16 bytes is copied. Carry is known clear inside each group, so "+c"
; adds exactly the constant.
tya ; count index with a
if lsr local::dataLength == carry clear
mb x, PPU_DATA := VramBuffer1[ y + 1 ]
mb y := a +c #1
endif
if lsr local::dataLength == carry clear
mb x, PPU_DATA := VramBuffer1[ y + 1 ]
mb x, PPU_DATA := VramBuffer1[ y + 2 ]
mb y := a +c #2
endif
if lsr local::dataLength == carry clear
mb x, PPU_DATA := VramBuffer1[ y + 1 ]
mb x, PPU_DATA := VramBuffer1[ y + 2 ]
mb x, PPU_DATA := VramBuffer1[ y + 3 ]
mb x, PPU_DATA := VramBuffer1[ y + 4 ]
mb y := a +c #4
endif
if lsr local::dataLength == carry clear
mb x, PPU_DATA := VramBuffer1[ y + 1 ]
mb x, PPU_DATA := VramBuffer1[ y + 2 ]
mb x, PPU_DATA := VramBuffer1[ y + 3 ]
mb x, PPU_DATA := VramBuffer1[ y + 4 ]
mb x, PPU_DATA := VramBuffer1[ y + 5 ]
mb x, PPU_DATA := VramBuffer1[ y + 6 ]
mb x, PPU_DATA := VramBuffer1[ y + 7 ]
mb x, PPU_DATA := VramBuffer1[ y + 8 ]
mb y := a +c #8
endif
if lsr local::dataLength == carry clear
mb x, PPU_DATA := VramBuffer1[ y + 1 ]
mb x, PPU_DATA := VramBuffer1[ y + 2 ]
mb x, PPU_DATA := VramBuffer1[ y + 3 ]
mb x, PPU_DATA := VramBuffer1[ y + 4 ]
mb x, PPU_DATA := VramBuffer1[ y + 5 ]
mb x, PPU_DATA := VramBuffer1[ y + 6 ]
mb x, PPU_DATA := VramBuffer1[ y + 7 ]
mb x, PPU_DATA := VramBuffer1[ y + 8 ]
mb x, PPU_DATA := VramBuffer1[ y + 9 ]
mb x, PPU_DATA := VramBuffer1[ y + 10 ]
mb x, PPU_DATA := VramBuffer1[ y + 11 ]
mb x, PPU_DATA := VramBuffer1[ y + 12 ]
mb x, PPU_DATA := VramBuffer1[ y + 13 ]
mb x, PPU_DATA := VramBuffer1[ y + 14 ]
mb x, PPU_DATA := VramBuffer1[ y + 15 ]
mb x, PPU_DATA := VramBuffer1[ y + 16 ]
mb y := a +c #16
endif
iny ; step past the last byte consumed, onto the next block's PPUAddressHI
until lda VramBuffer1[ y ] == N set
; clear buffer:
ldy #0
sty M::VramBuffer1Offset
lda #$FF ; first byte is negative, marking the buffer as holding no data blocks
sta VramBuffer1
rts
.endproc
I am trying to figure out if you can have a block of 32 copies and jump into the appropriate spot, but then the constant part of the offset will be wrong. You could read the buffer backwards, but then it has to be written backwards.
EDIT: Realized that there is no requirement to negate the data count. I was doing that to avoid having to clear the carry for the add in each if-block; instead, I can just add one less than the current constant.