Author Topic: eZ80 Optimized Routines (Read 19152 times)

jcochran · « **Reply #15 on:** October 11, 2021, 12:28:05 pm »

The mult16 routine from Xeda112358 has a bug, and can be made one byte shorter.

To demonstrate bug, multiply 0ffffh by 0ffffh.
Correct result = 0fffe0001
Incorrect result = 0feff0001

New routine.

Code: [Select]

mul16:
;;Expects Z80 mode
;;Inputs: hl,bc
;;Outputs: Upper 16 bits in DE, lower 16 bits in BC
;;53 or 55 t-states
;;29 bytes 
    ld d,c
    ld e,l
    mlt de
    push de     ; Push low product onto stack.
    ld d,h      ; Prepare for high product
    ld e,b
    ld h,e      ; Swap bytes for 2 middle products. Do here when I have copies of high values
    ld b,d      ; instead of later, needing A as temp.
    mlt de      ; Calculate high product
    mlt hl      ; Calculate first middle product
    mlt bc      ; Calculate second middle product
    add hl,bc   ; Sum middle products
    jr nc,$+3
    inc d       ; Inc D if carry.  (The INC DE was a bug).
    pop bc      ; Retrieve low product.
    ld a,b      ; Add in middle products to create final answer.
    add a,l
    ld b,a
    ld a,e
    adc a,h
    ld e,a
    ret nc
    inc d
    ret

Xeda112358 · « **Reply #16 on:** October 11, 2021, 12:32:17 pm »

Oh, fantastic! Good catch!

jcochran · « **Reply #17 on:** October 12, 2021, 04:18:27 pm »

If you don't insist on the current register usage, you can get a bit shorter and faster.

Code: [Select]

; HLDE = HL * DE
; 52 cc, 28 bytes
Mult16:
	ld	b,h
	ld	c,d
	mlt	bc
	push	bc
	ld	b,l
	ld	c,e
	ld	l,c
	ld	e,b
	mlt	bc
	mlt	de
	mlt	hl
	xor	a
	add	hl,de
	ld	e,h
	ld	h,l
	ld	l,a
	adc	a,a
	ld	d,a
	add	hl,bc
	ex	de,hl
	pop	bc
	adc	hl,bc
	ret

Unfortunately, I don't see any way to make it shorter or faster.

jcochran · « **Reply #18 on:** October 27, 2021, 10:36:07 am »

Was wondering how "optimized" the multiply 32 routine was using the karatsuba algorithm vs a straight forward multiply. And the answer is the straight forward multiply is significantly faster.

Following routine was written in several stages.
1. Write straight forward multiple, doing the 16 sub products from least significant to most significant, using AHL as a running sum.
2. For 3 of the additions, it's impossible for a carry to propagate from the 16 bit add. This allowed for the adc a,a to be deleted, resulting in 3 bytes and cycles saved.
3. Since the B and C registers weren't used, decided to use them to cache a few values from the numbers being multiplied. This saves 20 bytes and 32 cycles.
4. Finally, a minor modification was made for the 30 product. Saves 1 byte.

Code: [Select]

; (IX+8) = (IX+0)*(IX+4)
;
; Register usage
; AHL = running sum for result.
;       Periodically, L is saved as next byte and sum shifted right by 1 byte.
; DE = Next sub product to be added
; B  = Cached value of (ix+0) and later (ix+7)
; C  = Cached value of (ix+4) and later (ix+3)
; IX = pointer to values being multiplied and result.
;
; 178 bytes
; 275 clock cycles

Mult32:
	xor	a
; Calc byte 0
	ld	b,(ix+0)
	ld	h,b		; Prod 00
	ld	c,(ix+4)
	ld	l,c
	mlt	hl
	ld	(ix+8),l
	ld	l,h
	ld	h,a

; Calc byte 1
	ld	d,b		; Prod 01
	ld	e,(ix+5)
	mlt	de
	add	hl,de		; No carry possible
	ld	d,(ix+1)	; Prod 10
	ld	e,c
	mlt	de
	add	hl,de
	adc	a,a
	ld	(ix+9),l
	ld	l,h
	ld	h,a
	xor	a

; Calc byte 2
	ld	d,b		; Prod 02
	ld	e,(ix+6)
	mlt	de
	add	hl,de		; No carry possible
	ld	d,(ix+2)	; Prod 20
	ld	e,c
	mlt	de
	add	hl,de
	adc	a,a
	ld	d,(ix+1)	; Prod 11
	ld	e,(ix+5)
	mlt	de
	add	hl,de
	adc	a,0
	ld	(ix+10),l
	ld	l,h
	ld	h,a
	xor	a

; Calc byte 3
	ld	d,b		; Prod 03 (Last use of cached (ix+0) in b)
	ld	b,(ix+7)	; Cache (ix+7)
	ld	e,b
	mlt	de
	add	hl,de
	adc	a,a
	ld	de,(ix+3)	; Prod 30. Minor optimization that saves a byte
	ld	c,e		; Cache (ix+3)
	mlt	de
	add	hl,de
	adc	a,0
	ld	d,(ix+1)	; Prod 12
	ld	e,(ix+6)
	mlt	de
	add	hl,de
	adc	a,0
	ld	d,(ix+2)	; Prod 21
	ld	e,(ix+5)
	mlt	de
	add	hl,de
	adc	a,0
	ld	(ix+11),l
	ld	l,h
	ld	h,a
	xor	a

; Calc byte 4
	ld	d,c		; Prod 31
	ld	e,(ix+5)
	mlt	de
	add	hl,de
	adc	a,a
	ld	d,(ix+1)	; Prod 13
	ld	e,b
	mlt	de
	add	hl,de
	adc	a,0
	ld	d,(ix+2)	; Prod 22
	ld	e,(ix+6)
	mlt	de
	add	hl,de
	adc	a,0
	ld	(ix+12),l
	ld	l,h
	ld	h,a
	xor	a

; Calc byte 5
	ld	d,(ix+2)	; Prod 23
	ld	e,b
	mlt	de
	add	hl,de
	adc	a,a
	ld	d,c		; Prod 32
	ld	e,(ix+6)
	mlt	de
	add	hl,de
	adc	a,0
	ld	(ix+13),l
	ld	l,h
	ld	h,a

; Calc bytes 6 & 7
	mlt	bc		; Prod 33
	add	hl,bc		; No carry possible
	ld	(ix+14),hl
	ret

calc84maniac · « **Reply #19 on:** April 13, 2025, 02:31:45 am »

I found a way to make a faster 16x16->32 by avoiding stack usage, though it's slightly larger:

Code: [Select]

; HLDE = HL * DE
; 50 cc, 30 bytes
HLTimesDE:
	ld	b,h
	ld	c,e
	ld	e,h
	ld	h,d
	mlt	de ; H*D
	ld	a,e
	ld	e,l
	mlt	hl ; L*D
	add	a,h
	ld	h,a
	adc	a,d
	sub	a,h
	ld	d,c
	mlt	bc ; H*E
	add	hl,bc
	ld	b,0
	adc	a,b
	mlt	de ; L*E
	ld	c,d
	add	hl,bc
	adc	a,b
	ld	d,l
	ld	l,h
	ld	h,a
	ret

As a bonus, this can be modified to use the same register inputs/outputs as Xeda's original (or for that matter, any combination of register pair inputs/outputs as long as HL isn't the low 16 bits of the result, which would require one extra byte/cycle to handle):

Code: [Select]

; DEBC = HL * BC
; 50 cc, 30 bytes
HLTimesBC:
	ld	d,h
	ld	e,c
	ld	c,h
	ld	h,b
	mlt	bc ; H*B
	ld	a,c
	ld	c,l
	mlt	hl ; L*B
	add	a,h
	ld	h,a
	adc	a,b
	sub	a,h
	ld	b,e
	mlt	de ; H*C
	add	hl,de
	ld	d,0
	adc	a,d
	mlt	bc ; L*C
	ld	e,b
	add	hl,de
	adc	a,d
	ld	b,l
	ld	e,h
	ld	d,a
	ret

Edit:
For ADL mode, those routines can simply be changed to add a .s suffix on each 16-bit addition for a total of 53cc, 32 bytes, but I was also able to modify the routine to output to an 8-bit and 24-bit register at a cost of only 1 extra cycle and byte on top of that:

Code: [Select]

; AUHL = HL * DE
; 54 cc, 33 bytes
HLTimesDE:
	ld	b,h
	ld	c,e
	ld	e,h
	ld	h,d
	mlt	de ; H*D
	ld	a,e
	ld	e,l
	mlt	hl ; L*D
	add	a,h
	ld	h,a
	adc	a,d
	sub	a,h
	ld	d,c
	mlt	bc ; H*E
	add	hl,bc
	add	hl,hl
	add	hl,hl
	add	hl,hl
	add	hl,hl
	add	hl,hl
	add	hl,hl
	add	hl,hl
	add	hl,hl
	adc	a,l ; 0
	mlt	de ; L*E
	add	hl,de
	adc	a,0
	ret

Author Topic: eZ80 Optimized Routines (Read 19152 times)

jcochran

Re: eZ80 Optimized Routines

Xeda112358

Re: eZ80 Optimized Routines

jcochran

Re: eZ80 Optimized Routines

jcochran

Re: eZ80 Optimized Routines

calc84maniac

Re: eZ80 Optimized Routines