0 Members and 1 Guest are viewing this topic.
pTemp = 982Eh ;bottom of named vars VATprogPtr = 9830h ;top of named vars VATOP1 = 8478h;uses 19 bytes of RAM, 4 bytes of stack space#define tmp OP1+14#define head tmp+1#define head_lag tmp+3#macro advanceVAT()#ifdef speed;17cc saved per iteration;8 bytes vs a 3 byte call (but no 8-byte subroutine) ld bc,-6 add hl,bc sbc a,a ;HL<=FE66, so carry flag is set sub (hl) ld c,a add hl,bc#else call advance_VAT ;preserves DE#endif#endmacrosortVAT:#ifdef nointerrupt di#endif ld hl,(progPtr)isort_main:_: ld (head_lag),hl ld d,h ld e,l advanceVAT() ld (head),hl#ifdef speed;11 bytes, 29cc or 46cc. avg=29.06640625cc ld a,(pTemp) cp l jr nz,$+7 ld a,(pTemp+1) cp h ret z#else;adds 8 bytes, 55cc ld bc,(pTemp) ;Need to verify that we haven't reached the end of the progVAT or a ; sbc hl,bc ; ret z ; add hl,bc ;#endif call cmpVAT ld hl,(head) jr nc,-_;if it makes it here, then (head) needs to be inserted into the previous part of the VAT;We might be able to speed it up a little more if I also grab the next element; If (head_advance) is bigger than (head), then no need to start the search from the beginning ld de,tmp#ifdef speed ldd ldd ldd ldd ldd ldd ld b,0 ld c,(hl) lddr ldd#else ld bc,6 lddr ld c,(hl) inc c lddr#endif ld hl,(progPtr)_: push hl#ifdef speed;+5 bytes, -11cc ld bc,-6 add hl,bc ld de,tmp-6 call cmpVAT_stepin#else ex de,hl ld hl,tmp call cmpVAT#endif pop hl jr c,+_ advanceVAT() jp -__:;HL is where to insert ld de,(head) or a sbc hl,de ld b,h ld c,l ld hl,-6 add hl,de ld a,l sub a,(hl) ld l,a jr nc,$+4 dec h or a inc de ex de,hl#ifdef speed call fastldir#else ldir#endif ;begin at DE, copy tmp. First need size of tmp ld hl,tmp-6 ld c,(hl) sbc hl,bc ld a,c ldir#ifdef speed ldi ldi ldi ldi ldi ldi ldi add a,7#else ld c,7 add a,c ldir#endif ld hl,(head_lag) ld c,a ld a,l sub c ld l,a jp nc,isort_main dec h jp isort_main#ifndef speedadvance_VAT: ld bc,-6 add hl,bc sbc a,a ;HL<=FE66, so carry flag is set sub (hl) ld c,a add hl,bc ret#endifcmpVAT:;if @HL>=@DE, return nc ld bc,-6 add hl,bc ex de,hl add hl,bccmpVAT_stepin: ld a,(de) cp (hl) jr nc,first_longer;the second name is longer. ld c,a_: dec hl dec de ld a,(de) cp (hl) ret nz dec c jr nz,-_ scf retfirst_longer:;the first name is longer, so load c with the size of the second name ld c,(hl)_: dec hl dec de ld a,(de) cp (hl) ret nz dec c jr nz,-_ ret#ifdef speedfastldir:;copy BC bytes from HL to DE;copy BC bytes from HL to DE;breaks even at 26 bytes; 5% faster than LDIR with 35 bytes;10% faster than LDIR with 48 bytes;15% faster than LDIR with 91 bytes;20% faster than LDIR with 635 bytes;max is ~ 20.8% faster than LDIR;Cost: 104+16N+10ceiling(N/16) push hl; push af xor a sub c and 15 ;change to n-1 add a,a ld hl,ldirloop add a,l ld l,a#if (ldirloop>>8)!=(_ldirloop_end>>8) jr nc,$+3 ;these aren't needed if the ldirloop doesn't cross a 256 byte boundary. Can save 12cc on the above timings and 3 bytes. inc h ;#endif; pop af ex (sp),hl retldirloop:;n=16, (number of LDI instructions, use qty of 4,8,16,32,64) ldi ldi ldi ldi ldi ldi ldi ldi ldi ldi ldi ldi ldi ldi ldi_ldirloop_end: ldi jp pe,ldirloop ret#endif#undefine tmp#undefine head#undefine head_lag1.echo $-sortVAT," bytes"
pTemp = 982Eh ;bottom of named vars VATprogPtr = 9830h ;top of named vars VATOP1 = 8478h;uses 29 bytes of state.; 4 bytes stack space; 10 bytes variables; 15 bytes swap#define var_n OP1+15#define partition_size var_n+2#define var_k partition_size#define head0 var_n+4#define head1 var_n+6#define tail var_n+8cmpVAT:;HL points to one entry;DE points to the second entry;return c if first>second;return nc if first<=second ld bc,-6 add hl,bc ld a,(hl) ex de,hl add hl,bc ld b,(hl) ex de,hl ;A is the size of the first name ;HL points to the byte above the first name ;B is the size of the second name ;DE points to the byte above the second name ld c,a jr nc,second_name_longer_: dec hl dec de ld a,(de) cp (hl) ret nz dec c djnz -_ inc c scf retsecond_name_longer: dec hl dec de ld a,(de) cp (hl) ret nz dec c jr nz,second_name_longer retnthstr:;Input:; HL points to the size byte of the name of the first element; BC is the number of elements;Output:; c flag set if end of VAT reached, else nc; HL points to the element;speed:; worst: 34+(95-7/256)*BC; best: 34+(95-14/256)*BC ld de,(pTemp)_: or a ;Can do some analysis to omit this code for most runs sbc hl,de ;could speed this up by almost 37% add hl,de ;Still, only adds .001 seconds per 100 VAT entries :P ret c ;35cc ld a,-6 sub (hl) add a,l ld l,a jr c,$+3 dec h or a cpd jp pe,-_ or a retsortVAT: ld hl,(progPtr) ;get the number of elements in the VAT ld de,-6 ; add hl,de ; ld bc,0 ; call nthstr ; ld (var_n),bc ; ld hl,1 ld (var_k),hlsort_main: ;until partition_size exceeds size of VAT ld hl,(progPtr)sort_main_0: ;merge multiples of partition_size ld (head0),hl ld de,-6 add hl,de ld bc,(var_k) call nthstr jp c,sort_main_0_end push hl ld bc,(var_k) call nthstr ld de,6 add hl,de ld (tail),hl pop hl add hl,de ld (head1),hl ex de,hl ;head1 ld hl,(head0) ;or a sbc hl,de ld hl,(tail) jr z,mergeloop_end ;or a ;head0>=head1, so c flag should be reset already sbc hl,de jr z,mergeloop_end-1 ld de,(head1)mergeloop: ld hl,(head0) call cmpVAT jr nc,+_#ifdef speed;saves 42cc ld hl,(head1) ld de,var_n-1 ldd ldd ldd ldd ldd ldd ld a,(hl) ldd ld c,a add a,7 ld b,0#else ld hl,(head1) ld de,-6 add hl,de ld a,(hl) ld de,var_n-1 ld hl,(head1) add a,7 ld b,0 ld c,a#endif lddr;HL now points to the bottom ld de,(head1) ld (head1),hl ld hl,(head0) sbc hl,de ;number of bytes that need to be shifted up by A ld b,h ld c,l;need to shift from DE to (head1) ld hl,(head1) ex de,hl inc de inc hl#ifdef speed call fastldir#else ldir#endif ld hl,var_n-1 ld de,(head0) ld c,a lddr ex de,hl jp $+9_: ld a,l sub c ld l,a jr nc,$+3 dec h ld (head0),hl ld de,(head1) or a sbc hl,de ld hl,(tail) jr z,mergeloop_end ;or a ;head0>=head1, so c flag is reset already sbc hl,de jr nz,mergeloop add hl,demergeloop_end: ex de,hl ;ld de,(tail) ld hl,(pTemp) ;or a sbc hl,de ex de,hl jp nz,sort_main_0sort_main_0_end: ld hl,(var_k) add hl,hl ld (var_k),hl ld bc,(var_n) add hl,bc jp nc,sort_main ret#ifdef speedfastldir:;copy BC bytes from HL to DE;copy BC bytes from HL to DE;breaks even at 26 bytes; 5% faster than LDIR with 35 bytes;10% faster than LDIR with 48 bytes;15% faster than LDIR with 91 bytes;20% faster than LDIR with 635 bytes;max is ~ 20.8% faster than LDIR;Cost: 104+16N+10ceiling(N/16) push hl push af xor a sub c and 15 ;change to n-1 add a,a ld hl,ldirloop add a,l ld l,a#if (ldirloop>>8)!=(_ldirloop_end>>8) jr nc,$+3 ;these aren't needed if the ldirloop doesn't cross a 256 byte boundary. Can save 12cc on the above timings and 3 bytes. inc h ;#endif pop af ex (sp),hl retldirloop:;n=16, (number of LDI instructions, use qty of 4,8,16,32,64) ldi ldi ldi ldi ldi ldi ldi ldi ldi ldi ldi ldi ldi ldi ldi_ldirloop_end: ldi jp pe,ldirloop ret#endif.echo $-$9D95," bytes"#undefine var_n#undefine partition_size#undefine var_k#undefine head0#undefine head1#undefine tail