0 Members and 1 Guest are viewing this topic.
sqrt32:;Input: HLDE;Output: DE is the square root, AHL is the remainder;Destroys: D'E', H'L';Speed: 248+{0,44}+3*sqrt32sub+sqrt32sub_2+sqrt32_iter15;min: 1697cc;max: 2086cc;avg: 1889.75cc;;Python implementation:; remainder = 0; acc = 0; for k in range(16):; acc<<=1; x&=0xFFFFFFFF; x<<=2; y=x>>32; remainder<<=2; remainder+=y; if remainder>=acc*2+1:; remainder-=(acc*2+1); acc+=1; return [acc,remainder]; di exx ld hl,0 ;remainder ld d,h \ ld e,h ;acc exx ld a,h \ call sqrt32sub \ exx ld a,l \ call sqrt32sub \ exx ld a,d \ call sqrt32sub \ exx;Now we have four more iterations;The first two are no problem ld a,e exx call sqrt32sub_2;On the next iteration, HL might temporarily overflow by 1 bit call sqrt32_iter15;On the next iteration, HL is allowed to overflow, DE could overflow with our current routine, but it needs to be shifted right at the end, anywayssqrt32_iter16: add a,a adc hl,hl rla adc hl,hl rla;AHL - (DE+DE+1) sbc hl,de \ sbc a,0 inc e sbc hl,de \ sbc a,0 ret p add hl,de adc a,0 dec e add hl,de adc a,0 retsqrt32sub:;min: 391cc;max: 483cc;avg: 437cc exx call sqrt32sub_2sqrt32sub_2:;min: 185cc;max: 231cc;avg: 208cc call +__:;min: 84cc;max: 107cc;avg: 95.5cc sll e \ rl d ;sla e \ rl d \ inc e add a,a adc hl,hl add a,a adc hl,hl sbc hl,de inc e ret nc dec e add hl,de dec e retsqrt32_iter15:;91+{8,0+{0,23}};min: 91cc;max: 114cc;avg: 100.75cc sll e \ rl d ;sla e \ rl d \ inc e add a,a adc hl,hl add a,a adc hl,hl ;This might overflow! jr c,sqrt32_iter15_br0; sbc hl,de inc e ret nc dec e add hl,de dec e retsqrt32_iter15_br0: or a sbc hl,de inc e ret
sqrt32:;Input: HLDE;speed: 238+{0,1}+{0,44}+sqrtHL+3*sqrt32sub_2+sqrt32_iter15;min: 1260;max: 1506;avg: 1377.75 push de call sqrtHL pop bc add a,a ld e,a jr nc,+_ inc d_: ld a,b call sqrt32sub_2 call sqrt32sub_2;Now we have four more iterations;The first two are no problem ld a,c call sqrt32sub_2;On the next iteration, HL might temporarily overflow by 1 bit call sqrt32_iter15;On the next iteration, HL is allowed to overflow, DE could overflow with our current routine, but it needs to be shifted right at the end, anywayssqrt32_iter16: add a,a adc hl,hl rla adc hl,hl rla;AHL - (DE+DE+1) sbc hl,de \ sbc a,0 inc e or a sbc hl,de \ sbc a,0 ret p add hl,de adc a,0 dec e add hl,de adc a,0 retsqrt32sub_2:;min: 185cc;max: 231cc;avg: 208cc call +__:;min: 84cc;max: 107cc;avg: 95.5cc sll e \ rl d add a,a \ adc hl,hl add a,a \ adc hl,hl sbc hl,de inc e ret nc dec e add hl,de dec e retsqrt32_iter15:;91+{8,0+{0,23}};min: 91cc;max: 114cc;avg: 100.75cc sll e \ rl d ;sla e \ rl d \ inc e add a,a adc hl,hl add a,a adc hl,hl ;This might overflow! jr c,sqrt32_iter15_br0; sbc hl,de inc e ret nc dec e add hl,de dec e retsqrt32_iter15_br0: or a sbc hl,de inc e ret.echo $-sqrt32sqrtHL:;returns A as the sqrt, HL as the remainder, D = 0;min: 376cc;max: 416cc;avg: 393cc ld de,$5040 ld a,h sub e jr nc,+_ add a,e ld d,$10_: sub d jr nc,+_ add a,d .db $01 ;start of ld bc,** which is 10cc to skip the next two bytes._: set 5,d res 4,d srl d set 2,d sub d jr nc,+_ add a,d .db $01 ;start of ld bc,** which is 10cc to skip the next two bytes._: set 3,d res 2,d srl d inc d sub d jr nc,+_ add a,d dec d ;this resets the low bit of D, so `srl d` resets carry. .db $06 ;start of ld b,* which is 7cc to skip the next byte._: inc d srl d ld h,a sbc hl,de ld a,e jr nc,+_ add hl,de_: ccf rra srl d rra ld e,a sbc hl,de jr nc,+_ add hl,de .db $01 ;start of ld bc,** which is 10cc to skip the next two bytes._: or %00100000 xor %00011000 srl d rra ld e,a sbc hl,de jr nc,+_ add hl,de .db $01 ;start of ld bc,** which is 10cc to skip the next two bytes._: or %00001000 xor %00000110 srl d rra ld e,a sbc hl,de jr nc,+_ add hl,de srl d rra ret_: inc a srl d rra ret.echo $-sqrtHL
sqrt32sub_2:;min: 178cc;max: 224cc;avg: 201cc jp return4return4:;min: 84cc;max: 107cc;avg: 95.5cc sll e \ rl d add a,a \ adc hl,hl add a,a \ adc hl,hl sbc hl,de inc e ret nc dec e add hl,de dec e retsqrt32_iter15:;91+{8,0+{0,23}};min: 91cc;max: 114cc;avg: 100.75cc sll e \ rl d ;sla e \ rl d \ inc e add a,a adc hl,hl add a,a adc hl,hl ;This might overflow! jr c,sqrt32_iter15_br0; sbc hl,de inc e ret nc dec e add hl,de dec e retsqrt32_iter15_br0: or a sbc hl,de inc e retsqrtHL:;returns A as the sqrt, HL as the remainder, D = 0;min: 376cc;max: 416cc;avg: 393cc ld de,$5040 ld a,h sub e jr nc,+_ add a,e ld d,$10_: sub d jr nc,+_ add a,d .db $01 ;start of ld bc,** which is 10cc to skip the next two bytes._: set 5,d res 4,d srl d set 2,d sub d jr nc,+_ add a,d .db $01 ;start of ld bc,** which is 10cc to skip the next two bytes._: set 3,d res 2,d srl d inc d sub d jr nc,+_ add a,d dec d ;this resets the low bit of D, so `srl d` resets carry. .db $06 ;start of ld b,* which is 7cc to skip the next byte._: inc d srl d ld h,a sbc hl,de ld a,e jr nc,+_ add hl,de_: ccf rra srl d rra ld e,a sbc hl,de jr nc,+_ add hl,de .db $01 ;start of ld bc,** which is 10cc to skip the next two bytes._: or %00100000 xor %00011000 srl d rra ld e,a sbc hl,de jr nc,+_ add hl,de .db $01 ;start of ld bc,** which is 10cc to skip the next two bytes._: or %00001000 xor %00000110 srl d rra ld e,a sbc hl,de jr nc,+_ add hl,de srl d rra ret_: inc a srl d rra retsqrt32_stack:.dw return0.dw return4 ;subroutine.dw return1.dw return4 ;subroutine.dw return2.dw return4 ;subroutine.dw return3.dw return5sqrt32_stack_end:sqrt32:;Input: HLIX;Output: DE is the sqrt, AHL is the remainder;min: 1203;max: 1455;avg: 1323.75 ld sp,sqrt32_stack jp sqrtHLreturn0: add a,a ld e,a jr nc,+_ inc d_: ld a,ixh jp sqrt32sub_2return1: jp sqrt32sub_2return2:;Now we have four more iterations;The first two are no problem ld a,ixl jp sqrt32sub_2return3:;On the next iteration, HL might temporarily overflow by 1 bit jp sqrt32_iter15return5:;On the next iteration, HL is allowed to overflow, DE could overflow with our current routine, but it needs to be shifted right at the end, anywayssqrt32_iter16: add a,a adc hl,hl rla adc hl,hl rla;AHL - (DE+DE+1) sbc hl,de \ sbc a,0 inc e or a sbc hl,de \ sbc a,0 jp p,+_ add hl,de adc a,0 dec e add hl,de adc a,0_: ;...
sqrt32_iter16: add a,a adc hl,hl rla adc hl,hl rla;AHL - (DE+DE+1) sbc hl,de \ sbc a,0 inc e or a sbc hl,de \ sbc a,0 ret p add hl,de adc a,0 dec e add hl,de adc a,0 ret
sqrt32_iter16: add a,a ld b,a ;either 0x00 or 0x128 adc hl,hl rla adc hl,hl rla;AHL - (DE+DE+1) sbc hl,de \ sbc a,b inc e or a sbc hl,de \ sbc a,b ret p add hl,de adc a,b dec e add hl,de adc a,b ret
sqrtHLIX:;Input: HLIX;Output: DE is the sqrt, AHL is the remainder;speed: 754+{0,1}+6{0,6}+{0,3+{0,18}}+{0,38}+sqrtHL;min: 1130;max: 1266;avg: 1190.5;167 bytes call sqrtHL add a,a ld e,a jr nc,+_ inc d_: ld a,ixh sll e \ rl d add a,a \ adc hl,hl add a,a \ adc hl,hl sbc hl,de jr nc,+_ add hl,de dec e .db $FE ;start of `cp *`_: inc e sll e \ rl d add a,a \ adc hl,hl add a,a \ adc hl,hl sbc hl,de jr nc,+_ add hl,de dec e .db $FE ;start of `cp *`_: inc e sll e \ rl d add a,a \ adc hl,hl add a,a \ adc hl,hl sbc hl,de jr nc,+_ add hl,de dec e .db $FE ;start of `cp *`_: inc e sll e \ rl d add a,a \ adc hl,hl add a,a \ adc hl,hl sbc hl,de jr nc,+_ add hl,de dec e .db $FE ;start of `cp *`_: inc e;Now we have four more iterations;The first two are no problem ld a,ixl sll e \ rl d add a,a \ adc hl,hl add a,a \ adc hl,hl sbc hl,de jr nc,+_ add hl,de dec e .db $FE ;start of `cp *`_: inc e sll e \ rl d add a,a \ adc hl,hl add a,a \ adc hl,hl sbc hl,de jr nc,+_ add hl,de dec e .db $FE ;start of `cp *`_: inc esqrt32_iter15:;On the next iteration, HL might temporarily overflow by 1 bit sll e \ rl d ;sla e \ rl d \ inc e add a,a adc hl,hl add a,a adc hl,hl ;This might overflow! jr c,sqrt32_iter15_br0; sbc hl,de jr nc,+_ add hl,de dec e jr sqrt32_iter16sqrt32_iter15_br0: or a sbc hl,de_: inc e;On the next iteration, HL is allowed to overflow, DE could overflow with our current routine, but it needs to be shifted right at the end, anywayssqrt32_iter16: add a,a ld b,a ;either 0x00 or 0x80 adc hl,hl rla adc hl,hl rla;AHL - (DE+DE+1) sbc hl,de \ sbc a,b inc e or a sbc hl,de \ sbc a,b ret p add hl,de adc a,b dec e add hl,de adc a,b ret