Dunno if it's the best code, I mostly just wanted to practice some 6502, but here is a Duff's Device solution for loading PPU data. It expects that the PPU address has already been set and that A holds the number of values to be copied from the (fake) stack.
I don't like that the setup is so long; maybe there is room for improvement there.
Code:
; -----------------------------------------------------------------------------
; duff_copy -- stream bytes from the "fake stack" to PPU_DATA (Duff's device)
;
; Target: 6502, ca65 syntax.
;
; In:    A    = number of bytes to copy (0-255)
;        $0F  = fake-stack pointer. Bytes are fetched with PLA, so the first
;               byte written is the one at $0100 + ($0F) + 1.
;        The PPU address must already have been set by the caller.
; Out:   $0F  = fake-stack pointer advanced past the bytes consumed
; Clobb: A, X, flags, zero page $01-$04 and $0E
;
; The count is split as count = 16 * blocks + tail. Execution enters the
; unrolled body `tail` pairs before jump_in, then runs the full 16-pair body
; once per remaining block.
;
; NOTE(review): while S points at the fake stack, any interrupt pushes its
; return frame there -- presumably this only runs where that is acceptable
; (e.g. inside the NMI handler); confirm against the caller.
; -----------------------------------------------------------------------------
.proc duff_copy
tail_bytes  = $01                       ; (count & 15) * 4: code bytes to run before jump_in
blocks_left = $02                       ; count >> 4: full 16-pair passes still to run
entry_vec   = $03                       ; 16-bit target for the indirect jump ($03-$04)
saved_sp    = $0E                       ; hardware S is parked here during the copy
fake_sp     = $0F                       ; fake-stack pointer (persists between calls)

PAIRS       = 16                        ; unrolled PLA/STA pairs per pass
PAIR_SIZE   = 4                         ; PLA (1 byte) + STA absolute (3 bytes)

        ; Swap the hardware stack pointer for the fake one.
        tsx
        stx     saved_sp
        ldx     fake_sp
        txs                             ; PLA now pulls from the fake stack

        ; Split the count into tail (low nibble) and blocks (high nibble).
        tax                             ; keep the full count in X
        and     #$0F
        asl     a
        asl     a                       ; tail * PAIR_SIZE
        sta     tail_bytes
        txa
        lsr     a
        lsr     a
        lsr     a
        lsr     a
        sta     blocks_left

        ; entry_vec = jump_in - tail_bytes, as a full 16-bit subtraction.
        lda     #<jump_in
        sec
        sbc     tail_bytes
        sta     entry_vec
        lda     #>jump_in
        sbc     #0                      ; carry the borrow into the high byte so the
                                        ; body may straddle a page boundary
        sta     entry_vec+1
        jmp     (entry_vec)

copy:
        .repeat PAIRS
        pla
        sta     PPU_DATA
        .endrepeat
jump_in:
        dec     blocks_left             ; goes negative once every block is done
        bpl     copy

        ; The computed entry point is only valid if each pair is PAIR_SIZE bytes.
        .assert jump_in - copy = PAIRS * PAIR_SIZE, error, "duff_copy: each PLA/STA pair must assemble to 4 bytes"

        ; Save the advanced fake-stack pointer and restore the hardware stack.
        tsx
        stx     fake_sp
        ldx     saved_sp
        txs
        rts
.endproc
; NOTE(review): the lines from here to the .endproc below repeat the body of
; `.proc duff_copy` above instruction-for-instruction, but there is no opening
; `.proc` for them, so the closing `.endproc` is unmatched in this view. This
; looks like a paste/extraction duplicate -- confirm and remove one copy.
;
; What the fragment does: A holds the number of bytes to copy. The hardware
; stack pointer is swapped for the fake-stack pointer kept in $F, bytes are
; pulled with PLA and written to PPU_DATA, then both pointers are put back.
; Uses zero page 1-4, $E and $F; changes A and X.
tsx
stx $E; save stack
ldx $F; load fake stack
txs
; split a into high low nybble:
tax
and #$0F
asl
asl
sta 1 ; low half x 4
txa
lsr a
lsr a
lsr a
lsr a
sta 2 ; high half
; Build the entry address: jump_in minus 4 bytes for each leftover byte.
lda #<jump_in
sec
sbc 1 ; subtract low nibble to find entry point
; code must be aligned so that <jump_in is greater than 15 * 4 (which is 60)
; NOTE(review): only the low byte is subtracted; the high byte below is stored
; as-is, so a borrow here is lost if the constraint above is not met.
sta 3 ; 3+4 is indirect jump into loop
lda #>jump_in
sta 4
jmp (3)
; 16 unrolled pull/store pairs. The computed entry above assumes each pair is
; 4 bytes, i.e. that PPU_DATA (defined elsewhere) assembles as an absolute
; address -- presumably $2007; verify.
copy:
pla
sta PPU_DATA
pla
sta PPU_DATA
pla
sta PPU_DATA
pla
sta PPU_DATA
pla
sta PPU_DATA
pla
sta PPU_DATA
pla
sta PPU_DATA
pla
sta PPU_DATA
pla
sta PPU_DATA
pla
sta PPU_DATA
pla
sta PPU_DATA
pla
sta PPU_DATA
pla
sta PPU_DATA
pla
sta PPU_DATA
pla
sta PPU_DATA
pla
sta PPU_DATA
jump_in:
; One full 16-byte pass per unit of the high nibble; the counter going
; negative ends the loop.
dec 2
bpl copy
; Store the advanced fake-stack pointer and restore the hardware stack pointer.
tsx
stx $F
ldx $E
txs
rts
.endproc
I don't like that the setup is so long; maybe there is room for improvement there.