211
ASM / Re: [z80] 32 bit by 16 bits division and 32 bit square root
« on: March 23, 2019, 10:18:40 am »
Here is my version. It's about 50 bytes larger, but averages 1889.75cc. It does use stack and shadow registers, but it could be made faster by totally unrolling (which would be about 300 bytes of code).
EDIT:
Oh jeez, here is an even bigger version that uses less stack space and doesn't use shadow registers or index registers:
EDIT2: Forgot that sqrtHL didn't preserve BC, fixed that. Now it seems the last bit of the remainder might be broken, so I have to fix that
EDIT3: Fixed the bug in the bottom bit I just needed to reset the carry flag before the second subtraction in the final iteration.
EDIT4: In a scenario where you don't have RAM for a stack, we can hardcode it! It even saves 54cc (but adds 20 bytes). 10cc of that 54cc is just due to not having an ending RET. I also switched input to HLIX instead of HLDE.
I reorganized the code so that it would be "obvious" that sqrt32 is an in-line routine, so I put it at the end (in practice, the subroutines would probably be toward the end of mem). The precomputed stack is inserted just before sqrt32.
Code: [Select]
sqrt32:
;Input: HLDE
;Output: DE is the square root, AHL is the remainder
;Destroys: D'E', H'L'
;Speed: 248+{0,44}+3*sqrt32sub+sqrt32sub_2+sqrt32_iter15
;min: 1697cc
;max: 2086cc
;avg: 1889.75cc
;
;Python implementation:
; remainder = 0
; acc = 0
; for k in range(16):
; acc<<=1
; x&=0xFFFFFFFF
; x<<=2
; y=x>>32
; remainder<<=2
; remainder+=y
; if remainder>=acc*2+1:
; remainder-=(acc*2+1)
; acc+=1
; return [acc,remainder]
;
di
exx
ld hl,0 ;remainder
ld d,h \ ld e,h ;acc
exx
ld a,h \ call sqrt32sub \ exx
ld a,l \ call sqrt32sub \ exx
ld a,d \ call sqrt32sub \ exx
;Now we have four more iterations
;The first two are no problem
ld a,e
exx
call sqrt32sub_2
;On the next iteration, HL might temporarily overflow by 1 bit
call sqrt32_iter15
;On the next iteration, HL is allowed to overflow, DE could overflow with our current routine, but it needs to be shifted right at the end, anyways
sqrt32_iter16:
add a,a
adc hl,hl
rla
adc hl,hl
rla
;AHL - (DE+DE+1)
sbc hl,de \ sbc a,0
inc e
sbc hl,de \ sbc a,0
ret p
add hl,de
adc a,0
dec e
add hl,de
adc a,0
ret
sqrt32sub:
;min: 391cc
;max: 483cc
;avg: 437cc
exx
call sqrt32sub_2
sqrt32sub_2:
;min: 185cc
;max: 231cc
;avg: 208cc
call +_
_:
;min: 84cc
;max: 107cc
;avg: 95.5cc
sll e \ rl d ;sla e \ rl d \ inc e
add a,a
adc hl,hl
add a,a
adc hl,hl
sbc hl,de
inc e
ret nc
dec e
add hl,de
dec e
ret
sqrt32_iter15:
;91+{8,0+{0,23}}
;min: 91cc
;max: 114cc
;avg: 100.75cc
sll e \ rl d ;sla e \ rl d \ inc e
add a,a
adc hl,hl
add a,a
adc hl,hl ;This might overflow!
jr c,sqrt32_iter15_br0
;
sbc hl,de
inc e
ret nc
dec e
add hl,de
dec e
ret
sqrt32_iter15_br0:
or a
sbc hl,de
inc e
ret
EDIT:
Oh jeez, here is an even bigger version that uses less stack space and doesn't use shadow registers or index registers:
Code: [Select]
sqrt32:
;Input: HLDE
;speed: 238+{0,1}+{0,44}+sqrtHL+3*sqrt32sub_2+sqrt32_iter15
;min: 1260
;max: 1506
;avg: 1377.75
push de
call sqrtHL
pop bc
add a,a
ld e,a
jr nc,+_
inc d
_:
ld a,b
call sqrt32sub_2
call sqrt32sub_2
;Now we have four more iterations
;The first two are no problem
ld a,c
call sqrt32sub_2
;On the next iteration, HL might temporarily overflow by 1 bit
call sqrt32_iter15
;On the next iteration, HL is allowed to overflow, DE could overflow with our current routine, but it needs to be shifted right at the end, anyways
sqrt32_iter16:
add a,a
adc hl,hl
rla
adc hl,hl
rla
;AHL - (DE+DE+1)
sbc hl,de \ sbc a,0
inc e
or a
sbc hl,de \ sbc a,0
ret p
add hl,de
adc a,0
dec e
add hl,de
adc a,0
ret
sqrt32sub_2:
;min: 185cc
;max: 231cc
;avg: 208cc
call +_
_:
;min: 84cc
;max: 107cc
;avg: 95.5cc
sll e \ rl d
add a,a \ adc hl,hl
add a,a \ adc hl,hl
sbc hl,de
inc e
ret nc
dec e
add hl,de
dec e
ret
sqrt32_iter15:
;91+{8,0+{0,23}}
;min: 91cc
;max: 114cc
;avg: 100.75cc
sll e \ rl d ;sla e \ rl d \ inc e
add a,a
adc hl,hl
add a,a
adc hl,hl ;This might overflow!
jr c,sqrt32_iter15_br0
;
sbc hl,de
inc e
ret nc
dec e
add hl,de
dec e
ret
sqrt32_iter15_br0:
or a
sbc hl,de
inc e
ret
.echo $-sqrt32
sqrtHL:
;returns A as the sqrt, HL as the remainder, D = 0
;min: 376cc
;max: 416cc
;avg: 393cc
ld de,$5040
ld a,h
sub e
jr nc,+_
add a,e
ld d,$10
_:
sub d
jr nc,+_
add a,d
.db $01 ;start of ld bc,** which is 10cc to skip the next two bytes.
_:
set 5,d
res 4,d
srl d
set 2,d
sub d
jr nc,+_
add a,d
.db $01 ;start of ld bc,** which is 10cc to skip the next two bytes.
_:
set 3,d
res 2,d
srl d
inc d
sub d
jr nc,+_
add a,d
dec d ;this resets the low bit of D, so `srl d` resets carry.
.db $06 ;start of ld b,* which is 7cc to skip the next byte.
_:
inc d
srl d
ld h,a
sbc hl,de
ld a,e
jr nc,+_
add hl,de
_:
ccf
rra
srl d
rra
ld e,a
sbc hl,de
jr nc,+_
add hl,de
.db $01 ;start of ld bc,** which is 10cc to skip the next two bytes.
_:
or %00100000
xor %00011000
srl d
rra
ld e,a
sbc hl,de
jr nc,+_
add hl,de
.db $01 ;start of ld bc,** which is 10cc to skip the next two bytes.
_:
or %00001000
xor %00000110
srl d
rra
ld e,a
sbc hl,de
jr nc,+_
add hl,de
srl d
rra
ret
_:
inc a
srl d
rra
ret
.echo $-sqrtHL
It does use the 16-bit square root routine here to take care of the first 16 bits Combined, it is 194 bytes.EDIT2: Forgot that sqrtHL didn't preserve BC, fixed that. Now it seems the last bit of the remainder might be broken, so I have to fix that
EDIT3: Fixed the bug in the bottom bit I just needed to reset the carry flag before the second subtraction in the final iteration.
EDIT4: In a scenario where you don't have RAM for a stack, we can hardcode it! It even saves 54cc (but adds 20 bytes). 10cc of that 54cc is just due to not having an ending RET. I also switched input to HLIX instead of HLDE.
I reorganized the code so that it would be "obvious" that sqrt32 is an in-line routine, so I put it at the end (in practice, the subroutines would probably be toward the end of mem). The precomputed stack is inserted just before sqrt32.
Code: [Select]
sqrt32sub_2:
;min: 178cc
;max: 224cc
;avg: 201cc
jp return4
return4:
;min: 84cc
;max: 107cc
;avg: 95.5cc
sll e \ rl d
add a,a \ adc hl,hl
add a,a \ adc hl,hl
sbc hl,de
inc e
ret nc
dec e
add hl,de
dec e
ret
sqrt32_iter15:
;91+{8,0+{0,23}}
;min: 91cc
;max: 114cc
;avg: 100.75cc
sll e \ rl d ;sla e \ rl d \ inc e
add a,a
adc hl,hl
add a,a
adc hl,hl ;This might overflow!
jr c,sqrt32_iter15_br0
;
sbc hl,de
inc e
ret nc
dec e
add hl,de
dec e
ret
sqrt32_iter15_br0:
or a
sbc hl,de
inc e
ret
sqrtHL:
;returns A as the sqrt, HL as the remainder, D = 0
;min: 376cc
;max: 416cc
;avg: 393cc
ld de,$5040
ld a,h
sub e
jr nc,+_
add a,e
ld d,$10
_:
sub d
jr nc,+_
add a,d
.db $01 ;start of ld bc,** which is 10cc to skip the next two bytes.
_:
set 5,d
res 4,d
srl d
set 2,d
sub d
jr nc,+_
add a,d
.db $01 ;start of ld bc,** which is 10cc to skip the next two bytes.
_:
set 3,d
res 2,d
srl d
inc d
sub d
jr nc,+_
add a,d
dec d ;this resets the low bit of D, so `srl d` resets carry.
.db $06 ;start of ld b,* which is 7cc to skip the next byte.
_:
inc d
srl d
ld h,a
sbc hl,de
ld a,e
jr nc,+_
add hl,de
_:
ccf
rra
srl d
rra
ld e,a
sbc hl,de
jr nc,+_
add hl,de
.db $01 ;start of ld bc,** which is 10cc to skip the next two bytes.
_:
or %00100000
xor %00011000
srl d
rra
ld e,a
sbc hl,de
jr nc,+_
add hl,de
.db $01 ;start of ld bc,** which is 10cc to skip the next two bytes.
_:
or %00001000
xor %00000110
srl d
rra
ld e,a
sbc hl,de
jr nc,+_
add hl,de
srl d
rra
ret
_:
inc a
srl d
rra
ret
sqrt32_stack:
.dw return0
.dw return4 ;subroutine
.dw return1
.dw return4 ;subroutine
.dw return2
.dw return4 ;subroutine
.dw return3
.dw return5
sqrt32_stack_end:
sqrt32:
;Input: HLIX
;Output: DE is the sqrt, AHL is the remainder
;min: 1203
;max: 1455
;avg: 1323.75
ld sp,sqrt32_stack
jp sqrtHL
return0:
add a,a
ld e,a
jr nc,+_
inc d
_:
ld a,ixh
jp sqrt32sub_2
return1:
jp sqrt32sub_2
return2:
;Now we have four more iterations
;The first two are no problem
ld a,ixl
jp sqrt32sub_2
return3:
;On the next iteration, HL might temporarily overflow by 1 bit
jp sqrt32_iter15
return5:
;On the next iteration, HL is allowed to overflow, DE could overflow with our current routine, but it needs to be shifted right at the end, anyways
sqrt32_iter16:
add a,a
adc hl,hl
rla
adc hl,hl
rla
;AHL - (DE+DE+1)
sbc hl,de \ sbc a,0
inc e
or a
sbc hl,de \ sbc a,0
jp p,+_
add hl,de
adc a,0
dec e
add hl,de
adc a,0
_:
;...