Author Topic: Yet Another VAT Sort (Read 5883 times)

Xeda112358 · « **on:** October 09, 2018, 07:23:57 pm »

A few months ago I implemented heapsort and used it to create a sorted array of VAT entries. While it is fast, instead of directly sorting the VAT, it creates an array of pointers to each entry and sorts the pointers. This means that the VAT itself remains unsorted (not necessarily an issue), and a variable amount of RAM needs to be allocated. In the example, if the user had over 383 variables in the named variables table, only the first 383 entries could be sorted (unlikely, but an issue nevertheless).

My goal here was to sort the VAT in-place, using only a fixed amount of overhead memory. After a couple of weeks of brainstorming and testing, I ended up with an adaptive insertion sort implementation and a mergesort implementation.

Code: (Insertion Sort) [Select]

pTemp       = 982Eh ;bottom of named vars VAT
progPtr			= 9830h ;top of named vars VAT
OP1         = 8478h
;uses 19 bytes of RAM, 4 bytes of stack space
#define tmp OP1+14
#define head tmp+1
#define head_lag tmp+3
#macro advanceVAT()
#ifdef speed
;17cc saved per iteration
;8 bytes vs a 3 byte call (but no 8-byte subroutine)
  ld bc,-6
  add hl,bc
  sbc a,a   ;HL<=FE66, so carry flag is set
  sub (hl)
  ld c,a
  add hl,bc
#else
  call advance_VAT    ;preserves DE
#endif
#endmacro


sortVAT:
#ifdef nointerrupt
  di
#endif
  ld hl,(progPtr)
isort_main:
_:
  ld (head_lag),hl
  ld d,h
  ld e,l
  advanceVAT()
  ld (head),hl
#ifdef speed
;11 bytes, 29cc or 46cc. avg=29.06640625cc
  ld a,(pTemp)
  cp l
  jr nz,$+7
  ld a,(pTemp+1)
  cp h
  ret z
#else
;adds 8 bytes, 55cc
  ld bc,(pTemp) ;Need to verify that we haven't reached the end of the progVAT
  or a          ;
  sbc hl,bc     ;
  ret z         ;
  add hl,bc     ;
#endif
  call cmpVAT
  ld hl,(head)
  jr nc,-_
;if it makes it here, then (head) needs to be inserted into the previous part of the VAT
;We might be able to speed it up a little more if I also grab the next element
;  If (head_advance) is bigger than (head), then no need to start the search from the beginning
  ld de,tmp
#ifdef speed
  ldd
  ldd
  ldd
  ldd
  ldd
  ldd
  ld b,0
  ld c,(hl)
  lddr
  ldd
#else
  ld bc,6
  lddr
  ld c,(hl)
  inc c
  lddr
#endif
  ld hl,(progPtr)
_:
  push hl
#ifdef speed
;+5 bytes, -11cc
  ld bc,-6
  add hl,bc
  ld de,tmp-6
  call cmpVAT_stepin
#else
  ex de,hl
  ld hl,tmp
  call cmpVAT
#endif
  pop hl
  jr c,+_
  advanceVAT()
  jp -_
_:
;HL is where to insert
  ld de,(head)
  or a
  sbc hl,de
  ld b,h
  ld c,l
  ld hl,-6
  add hl,de
  ld a,l
  sub a,(hl)
  ld l,a
  jr nc,$+4
  dec h
  or a
  inc de
  ex de,hl
#ifdef speed
  call fastldir
#else
  ldir
#endif
  ;begin at DE, copy tmp. First need size of tmp
  ld hl,tmp-6
  ld c,(hl)
  sbc hl,bc
  ld a,c
  ldir
#ifdef speed
  ldi
  ldi
  ldi
  ldi
  ldi
  ldi
  ldi
  add a,7
#else
  ld c,7
  add a,c
  ldir
#endif
  ld hl,(head_lag)
  ld c,a
  ld a,l
  sub c
  ld l,a
  jp nc,isort_main
  dec h
  jp isort_main
#ifndef speed
advance_VAT:
  ld bc,-6
  add hl,bc
  sbc a,a   ;HL<=FE66, so carry flag is set
  sub (hl)
  ld c,a
  add hl,bc
  ret
#endif
cmpVAT:
;if @HL>=@DE, return nc
  ld bc,-6
  add hl,bc
  ex de,hl
  add hl,bc
cmpVAT_stepin:
  ld a,(de)
  cp (hl)
  jr nc,first_longer
;the second name is longer.
  ld c,a
_:
  dec hl
  dec de
  ld a,(de)
  cp (hl)
  ret nz
  dec c
  jr nz,-_
  scf
  ret
first_longer:
;the first name is longer, so load c with the size of the second name
  ld c,(hl)
_:
  dec hl
  dec de
  ld a,(de)
  cp (hl)
  ret nz
  dec c
  jr nz,-_
  ret
#ifdef speed
fastldir:
;copy BC bytes from HL to DE
;copy BC bytes from HL to DE
;breaks even at 26 bytes
; 5% faster than LDIR with 35 bytes
;10% faster than LDIR with 48 bytes
;15% faster than LDIR with 91 bytes
;20% faster than LDIR with 635 bytes
;max is ~ 20.8% faster than LDIR
;Cost: 104+16N+10ceiling(N/16)
    push hl
;    push af
    xor a
    sub c
    and 15               ;change to n-1
    add a,a
    ld hl,ldirloop
    add a,l
    ld l,a
#if (ldirloop>>8)!=(_ldirloop_end>>8)
    jr nc,$+3  ;these aren't needed if the ldirloop doesn't cross a 256 byte boundary. Can save 12cc on the above timings and 3 bytes.
    inc h       ;
#endif
;    pop af
    ex (sp),hl
    ret
ldirloop:
;n=16, (number of LDI instructions, use qty of 4,8,16,32,64)
    ldi
    ldi
    ldi
    ldi
    ldi
    ldi
    ldi
    ldi
    ldi
    ldi
    ldi
    ldi
    ldi
    ldi
    ldi
_ldirloop_end:
    ldi
    jp pe,ldirloop
    ret
#endif
#undefine tmp
#undefine head
#undefine head_lag1
.echo $-sortVAT," bytes"

Code: (Mergesort) [Select]

pTemp       = 982Eh ;bottom of named vars VAT
progPtr			= 9830h ;top of named vars VAT
OP1         = 8478h
;uses 29 bytes of state.
; 4 bytes stack space
; 10 bytes variables
; 15 bytes swap
#define var_n OP1+15
#define partition_size var_n+2
#define var_k partition_size
#define head0 var_n+4
#define head1 var_n+6
#define tail var_n+8

cmpVAT:
;HL points to one entry
;DE points to the second entry
;return c if first>second
;return nc if first<=second
  ld bc,-6
  add hl,bc
  ld a,(hl)
  ex de,hl
  add hl,bc
  ld b,(hl)
  ex de,hl
  ;A is the size of the first name
  ;HL points to the byte above the first name
  ;B is the size of the second name
  ;DE points to the byte above the second name
  ld c,a
  jr nc,second_name_longer
_:
  dec hl
  dec de
  ld a,(de)
  cp (hl)
  ret nz
  dec c
  djnz -_
  inc c
  scf
  ret
second_name_longer:
  dec hl
  dec de
  ld a,(de)
  cp (hl)
  ret nz
  dec c
  jr nz,second_name_longer
  ret
nthstr:
;Input:
;   HL points to the size byte of the name of the first element
;   BC is the number of elements
;Output:
;   c flag set if end of VAT reached, else nc
;   HL points to the element
;speed:
; worst: 34+(95-7/256)*BC
; best:  34+(95-14/256)*BC
  ld de,(pTemp)
_:
  or a      ;Can do some analysis to omit this code for most runs
  sbc hl,de ;could speed this up by almost 37%
  add hl,de ;Still, only adds .001 seconds per 100 VAT entries :P
  ret c     ;35cc
  ld a,-6
  sub (hl)
  add a,l
  ld l,a
  jr c,$+3
  dec h
  or a
  cpd
  jp pe,-_
  or a
  ret
sortVAT:
  ld hl,(progPtr) ;get the number of elements in the VAT
  ld de,-6        ;
  add hl,de       ;
  ld bc,0         ;
  call nthstr     ;
  ld (var_n),bc   ;
  ld hl,1
  ld (var_k),hl
sort_main:      ;until partition_size exceeds size of VAT
  ld hl,(progPtr)
sort_main_0:    ;merge multiples of partition_size
  ld (head0),hl
  ld de,-6
  add hl,de
  ld bc,(var_k)
  call nthstr
  jp c,sort_main_0_end
  push hl
  ld bc,(var_k)
  call nthstr
  ld de,6
  add hl,de
  ld (tail),hl
  pop hl
  add hl,de
  ld (head1),hl

  ex de,hl    ;head1
  ld hl,(head0)
  ;or a
  sbc hl,de
    ld hl,(tail)
  jr z,mergeloop_end
  ;or a         ;head0>=head1, so c flag should be reset already
  sbc hl,de
  jr z,mergeloop_end-1
  ld de,(head1)
mergeloop:
  ld hl,(head0)
  call cmpVAT
  jr nc,+_
#ifdef speed
;saves 42cc
  ld hl,(head1)
  ld de,var_n-1
  ldd
  ldd
  ldd
  ldd
  ldd
  ldd
  ld a,(hl)
  ldd
  ld c,a
  add a,7
  ld b,0
#else
  ld hl,(head1)
  ld de,-6
  add hl,de
  ld a,(hl)
  ld de,var_n-1
  ld hl,(head1)
  add a,7
  ld b,0
  ld c,a
#endif
  lddr
;HL now points to the bottom
  ld de,(head1)
  ld (head1),hl
  ld hl,(head0)
  sbc hl,de ;number of bytes that need to be shifted up by A
  ld b,h
  ld c,l
;need to shift from DE to (head1)
  ld hl,(head1)
  ex de,hl
  inc de
  inc hl
#ifdef speed
  call fastldir
#else
  ldir
#endif

  ld hl,var_n-1
  ld de,(head0)
  ld c,a
  lddr
  ex de,hl
  jp $+9
_:
  ld a,l
  sub c
  ld l,a
  jr nc,$+3
  dec h

  ld (head0),hl
  ld de,(head1)
  or a
  sbc hl,de
  ld hl,(tail)
  jr z,mergeloop_end
  ;or a         ;head0>=head1, so c flag is reset already
  sbc hl,de
  jr nz,mergeloop
  add hl,de
mergeloop_end:
  ex de,hl
  ;ld de,(tail)
  ld hl,(pTemp)
  ;or a
  sbc hl,de
  ex de,hl
  jp nz,sort_main_0
sort_main_0_end:
  ld hl,(var_k)
  add hl,hl
  ld (var_k),hl  
  ld bc,(var_n)
  add hl,bc
  jp nc,sort_main
  ret
#ifdef speed
fastldir:
;copy BC bytes from HL to DE
;copy BC bytes from HL to DE
;breaks even at 26 bytes
; 5% faster than LDIR with 35 bytes
;10% faster than LDIR with 48 bytes
;15% faster than LDIR with 91 bytes
;20% faster than LDIR with 635 bytes
;max is ~ 20.8% faster than LDIR
;Cost: 104+16N+10ceiling(N/16)
    push hl
    push af
    xor a
    sub c
    and 15               ;change to n-1
    add a,a
    ld hl,ldirloop
    add a,l
    ld l,a
#if (ldirloop>>8)!=(_ldirloop_end>>8)
    jr nc,$+3  ;these aren't needed if the ldirloop doesn't cross a 256 byte boundary. Can save 12cc on the above timings and 3 bytes.
    inc h       ;
#endif
    pop af
    ex (sp),hl
    ret
ldirloop:
;n=16, (number of LDI instructions, use qty of 4,8,16,32,64)
    ldi
    ldi
    ldi
    ldi
    ldi
    ldi
    ldi
    ldi
    ldi
    ldi
    ldi
    ldi
    ldi
    ldi
    ldi
_ldirloop_end:
    ldi
    jp pe,ldirloop
    ret
#endif
.echo $-$9D95," bytes"
#undefine var_n
#undefine partition_size
#undefine var_k
#undefine head0
#undefine head1
#undefine tail

I want to clarify that my mergesort method had to be in-place using O(1) space, so it devolved into an O(n²) algorithm.
With my estimates, mergesort would be faster than a non-adaptive insertion sort at 90 elements in the worst case. In practice, at 96 elements it was about 5% faster, probably because it wasn't a worst case scenario.

Either way, my insertion sort is fast compared with other implementations like that in MirageOS or DoorsCS7. It's much faster when the VAT is mostly sorted already, like if it gets sorted and then a user adds a few new programs. In that case, their implementations still seem to be O(n²) whereas mine is closer to O(n).

In both codes, the biggest speed factor was in shifting VAT entries. In the speed optimized versions, I take advantage of LDI over LDIR where I can, and I use my fastldir routine to eke out as much performance as possible, gaining a 10% speed boost in the overall routine in my test.

ClainBill · « **Reply #1 on:** January 29, 2019, 03:13:29 am »

Wow... i spent the last three nights trying to understand how to just display the names of all my programs

Xeda112358 · « **Reply #2 on:** January 29, 2019, 09:53:54 am »

Haha, yeah, VAT manipulation is a pain. I saw your Axe program and it looks pretty low level; have you ever done any assembly work?

ClainBill · « **Reply #3 on:** February 01, 2019, 10:50:50 am »

No. Actually I only just started coding... kinda
Ive been trying for ages with different languages but as the only computer I have sucks and because of schoolwork I just haven't really had the chance to try anything.
so I got a Ti-84 +SE as soon as I heard about it and here I am! Axe is the first language I can use every spare minute I have because its on a portable(ish) device which is great!

Xeda112358 · « **Reply #4 on:** February 01, 2019, 10:59:01 am »

That is pretty impressive. And that's basically how I got into calc programming. It was super portable and relatively easy to use and didn't have a computer at home.

ClainBill · « **Reply #5 on:** February 02, 2019, 05:21:04 am »

Probably the most advanced code i have made yet was a simple game using Unity where you have a fox that runs around a little room dodging orange balls that dispense at varied speeds from the top left corner, slowly speeding up interval between dispenses. It only lacked menu systems and a high score but I left it because everything my computer had that supported any code (Python, CSharp, Java) broke

Author Topic: Yet Another VAT Sort (Read 5883 times)

Xeda112358

Yet Another VAT Sort

ClainBill

Re: Yet Another VAT Sort

Xeda112358

Re: Yet Another VAT Sort

ClainBill

Re: Yet Another VAT Sort

Xeda112358

Re: Yet Another VAT Sort

ClainBill

Re: Yet Another VAT Sort