Omnimaga
Calculator Community => Other Calc-Related Projects and Ideas => TI Z80 => Topic started by: Xeda112358 on October 09, 2018, 07:23:57 pm
-
A few months ago I implemented heapsort (https://www.omnimaga.org/ti-z80-calculator-projects/heapsort-vatsort-and-listsort/) and used it to create a sorted array of VAT entries. While it is fast, instead of directly sorting the VAT, it creates an array of pointers to each entry and sorts the pointers. This means that the VAT itself remains unsorted (not necessarily an issue), and a variable amount of RAM needs to be allocated. In the example, if the user had over 383 variables in the named variables table, only the first 383 entries could be sorted (unlikely, but an issue nevertheless).
My goal here was to sort the VAT in-place, using only a fixed amount of overhead memory. After a couple of weeks of brainstorming and testing, I ended up with an adaptive insertion sort implementation and a mergesort implementation.
pTemp = 982Eh ;bottom of named vars VAT
progPtr = 9830h ;top of named vars VAT
OP1 = 8478h
;uses 19 bytes of RAM, 4 bytes of stack space
#define tmp OP1+14
#define head tmp+1
#define head_lag tmp+3
#macro advanceVAT()
#ifdef speed
;17cc saved per iteration
;8 bytes vs a 3 byte call (but no 8-byte subroutine)
ld bc,-6
add hl,bc
sbc a,a ;HL<=FE66, so carry flag is set
sub (hl)
ld c,a
add hl,bc
#else
call advance_VAT ;preserves DE
#endif
#endmacro
sortVAT:
#ifdef nointerrupt
di
#endif
ld hl,(progPtr)
isort_main:
_:
ld (head_lag),hl
ld d,h
ld e,l
advanceVAT()
ld (head),hl
#ifdef speed
;11 bytes, 29cc or 46cc. avg=29.06640625cc
ld a,(pTemp)
cp l
jr nz,$+7
ld a,(pTemp+1)
cp h
ret z
#else
;adds 8 bytes, 55cc
ld bc,(pTemp) ;Need to verify that we haven't reached the end of the progVAT
or a ;
sbc hl,bc ;
ret z ;
add hl,bc ;
#endif
call cmpVAT
ld hl,(head)
jr nc,-_
;if it makes it here, then (head) needs to be inserted into the previous part of the VAT
;We might be able to speed it up a little more if I also grab the next element
; If (head_advance) is bigger than (head), then no need to start the search from the beginning
ld de,tmp
#ifdef speed
ldd
ldd
ldd
ldd
ldd
ldd
ld b,0
ld c,(hl)
lddr
ldd
#else
ld bc,6
lddr
ld c,(hl)
inc c
lddr
#endif
ld hl,(progPtr)
_:
push hl
#ifdef speed
;+5 bytes, -11cc
ld bc,-6
add hl,bc
ld de,tmp-6
call cmpVAT_stepin
#else
ex de,hl
ld hl,tmp
call cmpVAT
#endif
pop hl
jr c,+_
advanceVAT()
jp -_
_:
;HL is where to insert
ld de,(head)
or a
sbc hl,de
ld b,h
ld c,l
ld hl,-6
add hl,de
ld a,l
sub a,(hl)
ld l,a
jr nc,$+4
dec h
or a
inc de
ex de,hl
#ifdef speed
call fastldir
#else
ldir
#endif
;begin at DE, copy tmp. First need size of tmp
ld hl,tmp-6
ld c,(hl)
sbc hl,bc
ld a,c
ldir
#ifdef speed
ldi
ldi
ldi
ldi
ldi
ldi
ldi
add a,7
#else
ld c,7
add a,c
ldir
#endif
ld hl,(head_lag)
ld c,a
ld a,l
sub c
ld l,a
jp nc,isort_main
dec h
jp isort_main
#ifndef speed
advance_VAT:
ld bc,-6
add hl,bc
sbc a,a ;HL<=FE66, so carry flag is set
sub (hl)
ld c,a
add hl,bc
ret
#endif
cmpVAT:
;if @HL>=@DE, return nc
ld bc,-6
add hl,bc
ex de,hl
add hl,bc
cmpVAT_stepin:
ld a,(de)
cp (hl)
jr nc,first_longer
;the second name is longer.
ld c,a
_:
dec hl
dec de
ld a,(de)
cp (hl)
ret nz
dec c
jr nz,-_
scf
ret
first_longer:
;the first name is longer, so load c with the size of the second name
ld c,(hl)
_:
dec hl
dec de
ld a,(de)
cp (hl)
ret nz
dec c
jr nz,-_
ret
#ifdef speed
fastldir:
;copy BC bytes from HL to DE
;copy BC bytes from HL to DE
;breaks even at 26 bytes
; 5% faster than LDIR with 35 bytes
;10% faster than LDIR with 48 bytes
;15% faster than LDIR with 91 bytes
;20% faster than LDIR with 635 bytes
;max is ~ 20.8% faster than LDIR
;Cost: 104+16N+10ceiling(N/16)
push hl
; push af
xor a
sub c
and 15 ;change to n-1
add a,a
ld hl,ldirloop
add a,l
ld l,a
#if (ldirloop>>8)!=(_ldirloop_end>>8)
jr nc,$+3 ;these aren't needed if the ldirloop doesn't cross a 256 byte boundary. Can save 12cc on the above timings and 3 bytes.
inc h ;
#endif
; pop af
ex (sp),hl
ret
ldirloop:
;n=16, (number of LDI instructions, use qty of 4,8,16,32,64)
ldi
ldi
ldi
ldi
ldi
ldi
ldi
ldi
ldi
ldi
ldi
ldi
ldi
ldi
ldi
_ldirloop_end:
ldi
jp pe,ldirloop
ret
#endif
#undefine tmp
#undefine head
#undefine head_lag1
.echo $-sortVAT," bytes"
pTemp = 982Eh ;bottom of named vars VAT
progPtr = 9830h ;top of named vars VAT
OP1 = 8478h
;uses 29 bytes of state.
; 4 bytes stack space
; 10 bytes variables
; 15 bytes swap
#define var_n OP1+15
#define partition_size var_n+2
#define var_k partition_size
#define head0 var_n+4
#define head1 var_n+6
#define tail var_n+8
cmpVAT:
;HL points to one entry
;DE points to the second entry
;return c if first>second
;return nc if first<=second
ld bc,-6
add hl,bc
ld a,(hl)
ex de,hl
add hl,bc
ld b,(hl)
ex de,hl
;A is the size of the first name
;HL points to the byte above the first name
;B is the size of the second name
;DE points to the byte above the second name
ld c,a
jr nc,second_name_longer
_:
dec hl
dec de
ld a,(de)
cp (hl)
ret nz
dec c
djnz -_
inc c
scf
ret
second_name_longer:
dec hl
dec de
ld a,(de)
cp (hl)
ret nz
dec c
jr nz,second_name_longer
ret
nthstr:
;Input:
; HL points to the size byte of the name of the first element
; BC is the number of elements
;Output:
; c flag set if end of VAT reached, else nc
; HL points to the element
;speed:
; worst: 34+(95-7/256)*BC
; best: 34+(95-14/256)*BC
ld de,(pTemp)
_:
or a ;Can do some analysis to omit this code for most runs
sbc hl,de ;could speed this up by almost 37%
add hl,de ;Still, only adds .001 seconds per 100 VAT entries :P
ret c ;35cc
ld a,-6
sub (hl)
add a,l
ld l,a
jr c,$+3
dec h
or a
cpd
jp pe,-_
or a
ret
sortVAT:
ld hl,(progPtr) ;get the number of elements in the VAT
ld de,-6 ;
add hl,de ;
ld bc,0 ;
call nthstr ;
ld (var_n),bc ;
ld hl,1
ld (var_k),hl
sort_main: ;until partition_size exceeds size of VAT
ld hl,(progPtr)
sort_main_0: ;merge multiples of partition_size
ld (head0),hl
ld de,-6
add hl,de
ld bc,(var_k)
call nthstr
jp c,sort_main_0_end
push hl
ld bc,(var_k)
call nthstr
ld de,6
add hl,de
ld (tail),hl
pop hl
add hl,de
ld (head1),hl
ex de,hl ;head1
ld hl,(head0)
;or a
sbc hl,de
ld hl,(tail)
jr z,mergeloop_end
;or a ;head0>=head1, so c flag should be reset already
sbc hl,de
jr z,mergeloop_end-1
ld de,(head1)
mergeloop:
ld hl,(head0)
call cmpVAT
jr nc,+_
#ifdef speed
;saves 42cc
ld hl,(head1)
ld de,var_n-1
ldd
ldd
ldd
ldd
ldd
ldd
ld a,(hl)
ldd
ld c,a
add a,7
ld b,0
#else
ld hl,(head1)
ld de,-6
add hl,de
ld a,(hl)
ld de,var_n-1
ld hl,(head1)
add a,7
ld b,0
ld c,a
#endif
lddr
;HL now points to the bottom
ld de,(head1)
ld (head1),hl
ld hl,(head0)
sbc hl,de ;number of bytes that need to be shifted up by A
ld b,h
ld c,l
;need to shift from DE to (head1)
ld hl,(head1)
ex de,hl
inc de
inc hl
#ifdef speed
call fastldir
#else
ldir
#endif
ld hl,var_n-1
ld de,(head0)
ld c,a
lddr
ex de,hl
jp $+9
_:
ld a,l
sub c
ld l,a
jr nc,$+3
dec h
ld (head0),hl
ld de,(head1)
or a
sbc hl,de
ld hl,(tail)
jr z,mergeloop_end
;or a ;head0>=head1, so c flag is reset already
sbc hl,de
jr nz,mergeloop
add hl,de
mergeloop_end:
ex de,hl
;ld de,(tail)
ld hl,(pTemp)
;or a
sbc hl,de
ex de,hl
jp nz,sort_main_0
sort_main_0_end:
ld hl,(var_k)
add hl,hl
ld (var_k),hl
ld bc,(var_n)
add hl,bc
jp nc,sort_main
ret
#ifdef speed
fastldir:
;copy BC bytes from HL to DE
;copy BC bytes from HL to DE
;breaks even at 26 bytes
; 5% faster than LDIR with 35 bytes
;10% faster than LDIR with 48 bytes
;15% faster than LDIR with 91 bytes
;20% faster than LDIR with 635 bytes
;max is ~ 20.8% faster than LDIR
;Cost: 104+16N+10ceiling(N/16)
push hl
push af
xor a
sub c
and 15 ;change to n-1
add a,a
ld hl,ldirloop
add a,l
ld l,a
#if (ldirloop>>8)!=(_ldirloop_end>>8)
jr nc,$+3 ;these aren't needed if the ldirloop doesn't cross a 256 byte boundary. Can save 12cc on the above timings and 3 bytes.
inc h ;
#endif
pop af
ex (sp),hl
ret
ldirloop:
;n=16, (number of LDI instructions, use qty of 4,8,16,32,64)
ldi
ldi
ldi
ldi
ldi
ldi
ldi
ldi
ldi
ldi
ldi
ldi
ldi
ldi
ldi
_ldirloop_end:
ldi
jp pe,ldirloop
ret
#endif
.echo $-$9D95," bytes"
#undefine var_n
#undefine partition_size
#undefine var_k
#undefine head0
#undefine head1
#undefine tail
I want to clarify that my mergesort method had to be in-place using O(1) space, so it devolved into an O(n2) algorithm.
With my estimates, mergesort would be faster than a non-adaptive insertion sort at 90 elements in the worst case. In practice, at 96 elements it was about 5% faster, probably because it wasn't a worst case scenario.
Either way, my insertion sort is fast compared with other implementations like that in MirageOS or DoorsCS7. It's much faster when the VAT is mostly sorted already, like if it gets sorted and then a user adds a few new programs. In that case, their implementations still seem to be O(n2) whereas mine is closer to O(n).
In both codes, the biggest speed factor was in shifting VAT entries. In the speed optimized versions, I take advantage of LDI over LDIR where I can, and I use my fastldir routine to eke out as much performance as possible, gaining a 10% speed boost in the overall routine in my test.
-
Wow... i spent the last three nights trying to understand how to just display the names of all my programs :P
-
Haha, yeah, VAT manipulation is a pain. I saw your Axe program and it looks pretty low level; have you ever done any assembly work?
-
No. Actually I only just started coding... kinda
Ive been trying for ages with different languages but as the only computer I have sucks and because of schoolwork I just haven't really had the chance to try anything.
so I got a Ti-84 +SE as soon as I heard about it and here I am! Axe is the first language I can use every spare minute I have because its on a portable(ish) device which is great!
-
That is pretty impressive. And that's basically how I got into calc programming. It was super portable and relatively easy to use and didn't have a computer at home.
-
Probably the most advanced code i have made yet was a simple game using Unity where you have a fox that runs around a little room dodging orange balls that dispense at varied speeds from the top left corner, slowly speeding up interval between dispenses. It only lacked menu systems and a high score but I left it because everything my computer had that supported any code (Python, CSharp, Java) broke