1
ASM / Re: eZ80 Optimized Routines
« on: October 27, 2021, 10:36:07 am »
Was wondering how "optimized" the multiply 32 routine was using the karatsuba algorithm vs a straight forward multiply. And the answer is the straight forward multiply is significantly faster.
Following routine was written in several stages.
1. Write straight forward multiple, doing the 16 sub products from least significant to most significant, using AHL as a running sum.
2. For 3 of the additions, it's impossible for a carry to propagate from the 16 bit add. This allowed for the adc a,a to be deleted, resulting in 3 bytes and cycles saved.
3. Since the B and C registers weren't used, decided to use them to cache a few values from the numbers being multiplied. This saves 20 bytes and 32 cycles.
4. Finally, a minor modification was made for the 30 product. Saves 1 byte.
Following routine was written in several stages.
1. Write straight forward multiple, doing the 16 sub products from least significant to most significant, using AHL as a running sum.
2. For 3 of the additions, it's impossible for a carry to propagate from the 16 bit add. This allowed for the adc a,a to be deleted, resulting in 3 bytes and cycles saved.
3. Since the B and C registers weren't used, decided to use them to cache a few values from the numbers being multiplied. This saves 20 bytes and 32 cycles.
4. Finally, a minor modification was made for the 30 product. Saves 1 byte.
Code: [Select]
; (IX+8) = (IX+0)*(IX+4)
;
; Register usage
; AHL = running sum for result.
; Periodically, L is saved as next byte and sum shifted right by 1 byte.
; DE = Next sub product to be added
; B = Cached value of (ix+0) and later (ix+7)
; C = Cached value of (ix+4) and later (ix+3)
; IX = pointer to values being multiplied and result.
;
; 178 bytes
; 275 clock cycles
Mult32:
xor a
; Calc byte 0
ld b,(ix+0)
ld h,b ; Prod 00
ld c,(ix+4)
ld l,c
mlt hl
ld (ix+8),l
ld l,h
ld h,a
; Calc byte 1
ld d,b ; Prod 01
ld e,(ix+5)
mlt de
add hl,de ; No carry possible
ld d,(ix+1) ; Prod 10
ld e,c
mlt de
add hl,de
adc a,a
ld (ix+9),l
ld l,h
ld h,a
xor a
; Calc byte 2
ld d,b ; Prod 02
ld e,(ix+6)
mlt de
add hl,de ; No carry possible
ld d,(ix+2) ; Prod 20
ld e,c
mlt de
add hl,de
adc a,a
ld d,(ix+1) ; Prod 11
ld e,(ix+5)
mlt de
add hl,de
adc a,0
ld (ix+10),l
ld l,h
ld h,a
xor a
; Calc byte 3
ld d,b ; Prod 03 (Last use of cached (ix+0) in b)
ld b,(ix+7) ; Cache (ix+7)
ld e,b
mlt de
add hl,de
adc a,a
ld de,(ix+3) ; Prod 30. Minor optimization that saves a byte
ld c,e ; Cache (ix+3)
mlt de
add hl,de
adc a,0
ld d,(ix+1) ; Prod 12
ld e,(ix+6)
mlt de
add hl,de
adc a,0
ld d,(ix+2) ; Prod 21
ld e,(ix+5)
mlt de
add hl,de
adc a,0
ld (ix+11),l
ld l,h
ld h,a
xor a
; Calc byte 4
ld d,c ; Prod 31
ld e,(ix+5)
mlt de
add hl,de
adc a,a
ld d,(ix+1) ; Prod 13
ld e,b
mlt de
add hl,de
adc a,0
ld d,(ix+2) ; Prod 22
ld e,(ix+6)
mlt de
add hl,de
adc a,0
ld (ix+12),l
ld l,h
ld h,a
xor a
; Calc byte 5
ld d,(ix+2) ; Prod 23
ld e,b
mlt de
add hl,de
adc a,a
ld d,c ; Prod 32
ld e,(ix+6)
mlt de
add hl,de
adc a,0
ld (ix+13),l
ld l,h
ld h,a
; Calc bytes 6 & 7
mlt bc ; Prod 33
add hl,bc ; No carry possible
ld (ix+14),hl
ret