Author Topic: ASM Optimized routines (Read 114514 times)

DJ Omnimaga · « **Reply #30 on:** April 30, 2010, 11:01:50 pm »

I don't understand the code above but I wish you good luck in this ^^

Quigibo · « **Reply #31 on:** May 01, 2010, 03:19:23 am »

Okay, it turns out the condition testing was actually taking up more space than it was saving. My brain hurts anyway, so I give up.

I did manage to improve my existing code by a couple bytes and it makes more sense now, but rounding is apparently still backwards. Feel free to use it if anyone needs it.

Code: [Select]

SignedDivision:
; Signed 16-bit wrapper around the unsigned RegularDivision routine.
; In:   HL, DE = signed operands
; Out:  whatever RegularDivision leaves in HL, negated when exactly one
;       of the inputs was negative
; NOTE(review): RegularDivision is defined elsewhere; presumably it
; divides HL by DE and returns the quotient in HL -- confirm.
; Destroys A and flags, plus whatever RegularDivision destroys.
	ld	a,h
	xor	d			; bit 7 of A = sign of the result
	push	af			; keep it across the unsigned divide

	bit	7,h
	jr	z,SignedDivision_HLDone	; HL already non-negative
	xor	a			; HL = 0 - HL
	sub	l
	ld	l,a
	sbc	a,a			; A = 0 or -1 (borrow from the low byte)
	sub	h
	ld	h,a
SignedDivision_HLDone:

	bit	7,d
	jr	z,SignedDivision_DEDone	; DE already non-negative
	xor	a			; DE = 0 - DE
	sub	e
	ld	e,a
	sbc	a,a
	sub	d
	ld	d,a
SignedDivision_DEDone:

	call	RegularDivision		; unsigned divide of the magnitudes

	pop	af
	add	a,a			; result sign -> carry
	ret	nc			; signs matched: result stays positive

	xor	a			; signs differed: HL = 0 - HL
	sub	l
	ld	l,a
	sbc	a,a
	sub	h
	ld	h,a
	ret

Galandros · « **Reply #32 on:** June 25, 2010, 07:56:39 am »

I still need to write the optimized routine for shifting by an arbitrary number of pixels to the left or right.
I am currently working on graphics effects and display routines.

Until then, here is a cool one that does reverse video: it inverts the colour of a "filled box" in the screen buffer.

Code: [Select]

;input: a=x, e=y, b=height, c=width in bytes
;complement (XOR) a box in the screen buffer whose top-left pixel is x,y
;The buffer is addressed as plotsscreen + 12*y + x/8 (12 bytes per row).
;Each column step touches two bytes (the byte and its right neighbour),
;so width+1 bytes are read and written on every row, even when x is a
;multiple of 8 (the extra byte is then XORed with 0).
;destroys: a, bc, de, hl
cplbox:
	ld	h,0
	ld	d,h
; the multiply by 12 below could be sped up with shifts on l
	ld	l,e			; hl = de = y
	add	hl,de
	add	hl,de			; hl = 3*y
	add	hl,hl
	add	hl,hl			; hl = 12*y
	ld	e,a
	srl	e
	srl	e
	srl	e			; de = x/8 (d is still 0)
	add	hl,de
	ld	de,plotsscreen
	add	hl,de			; hl -> first byte of the box
	and	7			; a = x mod 8, Z set when byte-aligned
	ld	de,$FF00		; d:e = 16-bit mask (ld leaves the flags alone)
	jr	z,cplbox_row
cplbox_shiftmask:
	srl	d			; slide the mask right by (x mod 8) pixels,
	rr	e			; spilling into the neighbouring byte
	dec	a
	jr	nz,cplbox_shiftmask

cplbox_row:
	push	bc			; keep height/width counters
	push	hl			; keep start-of-row pointer
	ld	b,c			; b = width in bytes
cplbox_col:
	ld	a,(hl)
	xor	d			; left part of the mask
	ld	(hl),a
	inc	hl
	ld	a,(hl)
	xor	e			; spill-over part in the next byte
	ld	(hl),a
	djnz	cplbox_col
	pop	hl
	ld	c,12			; b is 0 after djnz, so bc = 12
	add	hl,bc			; down one row
	pop	bc
	djnz	cplbox_row
	ret

I will convert width to pixels instead of bytes. Seems like I will need to use a temporary byte in ram.

calc84maniac · « **Reply #33 on:** December 17, 2010, 10:29:55 pm »

Optimized routine for HL=A-HL (the negate HL optimization can be derived from this by setting A=0 first):

Code: [Select]

  ; HL = A - HL, with A treated as an unsigned 8-bit value.
  ; Destroys A and the flags.
  sub l       ; A = A - L, carry = borrow out of the low byte
  ld l,a      ; low byte of the result
  sbc a,a     ; A = 0 if no borrow, $FF if there was one
  sub h       ; A = -borrow - H
  ld h,a      ; high byte of the result

Also, topic stickied.

Xeda112358 · « **Reply #34 on:** August 11, 2011, 02:49:33 pm »

Can these be optimised:

Code: [Select]

C_Div_D:
;Unsigned 8-bit by 8-bit restoring division, one quotient bit per pass.
;Inputs:
;     C is the numerator
;     D is the denominator
;Outputs:
;     A is the remainder
;     B is 0
;     C is the result of C/D
;     D,E,H,L are not changed
;
     ld b,8                 ; 8 numerator bits to process
     xor a                  ; remainder starts at 0
C_Div_D_Loop:
       sla c                ; next numerator bit -> carry, bit 0 of C cleared
       rla                  ; ...and into the remainder
       cp d
       jr c,C_Div_D_Next    ; remainder < D: quotient bit stays 0
         inc c              ; quotient bit = 1
         sub d
C_Div_D_Next:
       djnz C_Div_D_Loop
     ret

Code: [Select]

DE_Times_A:
;Shift-and-add multiply; the result is kept modulo 65536.
;Inputs:
;     DE and A are factors
;Outputs:
;     A is not changed (it is rotated a full 8 times)
;     B is 0
;     C is not changed
;     DE is not changed
;     HL is the product
;
     ld b,8
     ld hl,0
DE_Times_A_Loop:
       add hl,hl              ; make room for the next partial product
       rlca                   ; top bit of A -> carry (and back into bit 0)
       jr nc,DE_Times_A_Next
         add hl,de            ; bit was set: add the multiplicand
DE_Times_A_Next:
       djnz DE_Times_A_Loop
     ret

Code: [Select]

DE_Times_BC:
;Shift-and-add multiply; the result is kept modulo 65536.
;Inputs:
;     DE and BC are factors
;Outputs:
;     A is 0
;     BC is not changed
;     DE is 0 (all 16 bits have been shifted out)
;     HL is the product
;
       ld hl,0
       ld a,16                    ; one pass per bit of DE
DE_Times_BC_Loop:
         add hl,hl                ; product <<= 1
         ex de,hl
         add hl,hl                ; DE <<= 1, top bit -> carry
         ex de,hl                 ; (ex de,hl leaves the flags alone)
         jr nc,DE_Times_BC_Next
           add hl,bc              ; bit was set: add the multiplicand
DE_Times_BC_Next:
         dec a
         jr nz,DE_Times_BC_Loop
       ret

Code: [Select]

DEHL_Div_C:
;Unsigned 32-bit by 8-bit restoring division.
;Inputs:
;     DEHL is a 32 bit value where DE is the upper 16 bits
;     C is the value to divide DEHL by (zero is not checked)
;Outputs:
;    A is the remainder
;    B is 0
;    C is not changed
;    DEHL is the result of the division
;Notes:
;    * The loop is closed with a label. The loop body before the djnz is
;      11 bytes long, so the former "djnz $-10" re-entered it one byte
;      late (at ex de,hl) and skipped the add hl,hl on every pass after
;      the first.
;    * When C > 128 the remainder can need 9 bits after the shift. The
;      carry out of rla is that ninth bit; if it is set the remainder is
;      certainly >= C, so we subtract without comparing (the 8-bit sub
;      still yields the right value because the true result is < C).
;
     ld b,32
     xor a
DEHL_Div_C_Loop:
       add hl,hl               ; shift the low word, top bit -> carry
       ex de,hl
       adc hl,hl               ; shift the high word, pulling that bit in
       ex de,hl
       rla                     ; top numerator bit -> remainder
       jr c,DEHL_Div_C_Sub     ; 9-bit remainder: must be >= C
       cp c
       jr c,DEHL_Div_C_Next    ; remainder < C: quotient bit stays 0
DEHL_Div_C_Sub:
         sub c
         inc l                 ; bit 0 of L is free after the shift
DEHL_Div_C_Next:
       djnz DEHL_Div_C_Loop
     ret

Code: [Select]

;===============================================================
DEHL_Times_A:
;===============================================================
;Inputs:
;     DEHL is a 32 bit factor
;     A is an 8 bit factor
;Outputs:
;     interrupts disabled (the shadow registers are used)
;     BC is not changed
;     AHLDE is the 40-bit result
;     D'E' is the lower 16 bits of the input
;     H'L' is the lower 16 bits of the output
;     B' is 0
;     C' is not changed
;     A' is not changed
;Notes:
;     A does double duty: multiplier bits leave through bit 7 while
;     bits 32..39 of the product grow upward from bit 0. After i passes
;     the partial product is below 2^(32+i), so the two halves of A
;     never collide.
;     The earlier code used "add a,a" / "inc a" here, which dropped the
;     carries out of HL and merely rotated the multiplier back into A,
;     so the top byte of the result was wrong (e.g. 1*2 gave A=2).
;===============================================================
     di
     push hl
     or a
     sbc hl,hl          ; HL = 0: bits 16..31 of the product
     exx
     pop de             ; DE' = low word of the input
     sbc hl,hl          ; HL' = 0: bits 0..15 of the product (carry is clear)
     ld b,8
mul32Loop:
       add hl,hl        ; product <<= 1, low word...
       exx
       adc hl,hl        ; ...high word...
       exx
       rla              ; ...top byte; next multiplier bit -> carry
       jr nc,mul32Skip
         add hl,de      ; bit was set: add the 32-bit input, low word...
         exx
         adc hl,de      ; ...high word...
         exx
         adc a,0        ; ...and carry into the top byte (exx keeps flags)
mul32Skip:
       djnz mul32Loop
       push hl          ; hand the low word over to the main register set
       exx
       pop de
       ret

Code: [Select]

GCDHL_BC:
;Euclid's algorithm on two 16-bit values, using the external HL_Div_BC.
;NOTE(review): HL_Div_BC is not shown here; the loop below relies on it
;returning the remainder of HL/BC in DE and leaving BC alone -- confirm.
;Inputs:
;     HL is a number
;     BC is a number
;Outputs:
;     BC is the GCD
;     A is 0 and DE is 0 when the routine leaves through the loop;
;     on the early exit (HL = BC) A and DE are left untouched
;Destroys:
;     HL
;Size and speed as quoted by the original author (not re-measured):
;     25 bytes, 30 to 49708 cycles
;     30 cycles if HL=BC, otherwise 24+1552x with x from 1 to 32,
;     plus 20 cycles if BC>HL
;
     or a
     sbc hl,bc              ; compare HL with BC
     ret z                  ; equal: BC already is the GCD
     add hl,bc              ; undo the subtraction; carry set when HL < BC
     jr nc,GCDHL_BC_Loop    ; HL > BC: operands already ordered
       ld a,h               ; otherwise swap HL and BC through A
       ld h,b
       ld b,a
       ld a,l
       ld l,c
       ld c,a
GCDHL_BC_Loop:
     call HL_Div_BC
     ld a,d
     or e
     ret z                  ; remainder 0: the divisor in BC is the GCD
     ld h,b                 ; HL = previous divisor
     ld l,c
     ld b,d                 ; BC = remainder
     ld c,e
     jr GCDHL_BC_Loop

EDIT: 25-March-2015 This has been really in need of updating and optimizing. This version is 226cc to 322cc faster than the original for 2 bytes more.

Code: [Select]

;===============================================================
DE_Div_BC_round:
;===============================================================
;Performs DE/BC (unsigned), rounded to the nearest integer with
;halves rounded up.
;Size:    36 bytes
;Inputs:
;     DE is the numerator
;     BC is the denominator, 1..$7FFF (zero is not checked; larger
;        values would overflow the 16-bit remainder in adc hl,hl)
;Outputs:
;     DE is the rounded quotient
;     BC is divided by 2 (truncated)
;     A is the low byte of the quotient before the rounding step
;     c flag set means the quotient was not incremented,
;            reset means it was rounded up
;Destroys: HL
;Notes:
;     The quotient is built complemented: after each pass carry is set
;     when the trial subtraction failed, and that carry is rotated into
;     the bottom of A:E on the next pass. Three defects of the earlier
;     listing are corrected here:
;       * the branch after sbc hl,bc was "jr c", which skipped the
;         restoring add exactly when it was needed;
;       * the carry of the 16th pass was never rotated in, so the
;         quotient was one bit short;
;       * the rounding and the halving of BC promised by the header
;         were missing.
;===============================================================
    ld a,d                   ; numerator / quotient lives in A:E
    ld hl,0                  ; remainder
    ld d,16                  ; D is the loop counter
DE_Div_BC_round_Loop:
    rl e                     ; previous (complemented) quotient bit in,
    rla                      ; next numerator bit out
    adc hl,hl                ; ...and into the remainder (carry out is 0)
    sbc hl,bc                ; trial subtraction
    jr nc,DE_Div_BC_round_Next
    add hl,bc                ; it did not fit: restore, carry becomes 1
DE_Div_BC_round_Next:
    dec d                    ; dec leaves carry untouched
    jr nz,DE_Div_BC_round_Loop
    rl e                     ; rotate the final quotient bit in
    rla
    cpl                      ; un-complement the quotient into DE
    ld d,a
    ld a,e
    cpl
    ld e,a
    srl b                    ; BC = BC/2, carry = 1 if BC was odd
    rr c
    sbc hl,bc                ; remainder - ceil(BC/2)
    ret c                    ; remainder below half the divisor: done
    inc de                   ; otherwise round up
    ret

Code: [Select]

HL_Div_C:
;Unsigned 16-bit by 8-bit restoring division.
;Inputs:
;     HL is the numerator
;     C is the denominator (zero is not checked)
;Outputs:
;     A is the remainder
;     B is 0
;     C is not changed
;     DE is not changed
;     HL is the quotient
;Notes:
;     When C > 128 the remainder can need 9 bits after the shift
;     (e.g. $FFFF/200). The carry out of rla is that ninth bit; if it
;     is set the remainder is certainly >= C, so we subtract without
;     comparing. The 8-bit sub still gives the right value because the
;     true result is below C. The earlier code ignored that carry and
;     returned wrong results for such inputs.
;
       ld b,16
       xor a
HL_Div_C_Loop:
         add hl,hl              ; next numerator bit -> carry
         rla                    ; ...and into the remainder
         jr c,HL_Div_C_Sub      ; 9-bit remainder: must be >= C
         cp c
         jr c,HL_Div_C_Next     ; remainder < C: quotient bit stays 0
HL_Div_C_Sub:
           sub c
           inc l                ; bit 0 of L is free after the shift
HL_Div_C_Next:
         djnz HL_Div_C_Loop
       ret

Code: [Select]

HLDE_Div_C:
;Unsigned 32-bit by 8-bit restoring division.
;Inputs:
;     HLDE is a 32 bit value where HL is the upper 16 bits
;     C is the value to divide HLDE by (zero is not checked)
;Outputs:
;    A is the remainder
;    B is 0
;    C is not changed
;    HLDE is the result of the division
;Notes:
;    * The loop is closed with a label. The loop body before the djnz is
;      11 bytes long, so the former "djnz $-10" re-entered it one byte
;      late (at add hl,hl) and skipped the first ex de,hl on every pass
;      after the first.
;    * When C > 128 the remainder can need 9 bits after the shift. The
;      carry out of rla is that ninth bit; if it is set the remainder is
;      certainly >= C, so we subtract without comparing.
;
     ld b,32
     xor a
HLDE_Div_C_Loop:
       ex de,hl
       add hl,hl               ; shift the low word (DE), top bit -> carry
       ex de,hl
       adc hl,hl               ; shift the high word, pulling that bit in
       rla                     ; top numerator bit -> remainder
       jr c,HLDE_Div_C_Sub     ; 9-bit remainder: must be >= C
       cp c
       jr c,HLDE_Div_C_Next    ; remainder < C: quotient bit stays 0
HLDE_Div_C_Sub:
         sub c
         inc e                 ; bit 0 of E is free after the shift
HLDE_Div_C_Next:
       djnz HLDE_Div_C_Loop
     ret

EDIT 16 Aug 2019: A less destructive nCr routine that isn't prone to overflow in intermediate calculations can be found here.

Code: [Select]

;===============================================================
nCrHL_DE:
;===============================================================
;Computes "n choose r" with the help of two external routines,
;DE_Times_BC and DE_Div_BC, keeping part of its state in the shadow
;registers (hence interrupts must be off, see Outputs).
;NOTE(review): neither helper's register contract is visible from this
;routine; the comments in the loop only describe the data movement that
;can be read off the code -- confirm the helpers' inputs/outputs before
;relying on them. No di is executed here, so the caller presumably has
;to disable interrupts itself -- confirm.
;Inputs:
;     hl is "n"
;     de is "r"
;Outputs:
;     interrupts off
;     a is 0
;     bc is an intermediate result
;     de is "n"
;     hl is the result
;     a' is not changed
;     bc' is "r"+1
;     de' is the same as bc
;     hl' is "r" or the complement, whichever is smaller
;===============================================================
     or a                     ;reset carry flag
     sbc hl,de                ;hl = n-r, carry set if r > n
     ret c                    ;r should not be bigger than n
;compare n-r with r without changing hl: the add undoes the sbc and
;leaves carry set exactly when n-r < r
     sbc hl,de \ add hl,de
;skip the one-byte ex de,hl unless n-r < r, so that afterwards hl holds
;the larger and de the smaller of r and n-r
     jr nc,$+3
       ex de,hl
                             ;hl is R
     push de
     ld bc,1                 ;A
     exx
     pop de                  ;N
     ld bc,1                 ;C
     ld h,b \ ld l,c         ;D
nCrLoop:
;one pass: multiply, then divide, then advance the counters; values are
;passed between the two register sets via the stack
     push de
     push hl
     call DE_Times_BC
     push hl \ exx \ pop de  ;product (hl) -> de of the other register set
     push hl
     call DE_Div_BC
     pop de
     push hl \ ex de,hl \ exx \ pop hl
     ld b,h \ ld c,l
     pop de \ add hl,de
     pop de \ inc de
     exx
     inc bc
;compare hl with bc non-destructively; carry set when hl < bc
     or a \ sbc hl,bc \ add hl,bc
     exx                     ;exx keeps the flags for the jr below
     jr nc,nCrLoop
     ret

Code: [Select]

RoundHL_Div_C:
;Unsigned 16-bit by 8-bit division, rounded to the nearest integer
;with halves rounded up.
;Inputs:
;     HL is the numerator
;     C is the denominator (zero is not checked)
;Outputs:
;     A is twice the remainder of the unrounded value
;       (only the low 8 bits when the remainder is 128 or more)
;     B is 0
;     C is not changed
;     DE is not changed
;     HL is the rounded quotient
;     c flag set means no rounding was performed
;            reset means the value was rounded
;Notes:
;     For C > 128 the remainder can overflow 8 bits, both when it is
;     shifted inside the loop and when it is doubled for the rounding
;     test. In either place the carry is the ninth bit and means
;     "certainly >= C", so it is tested before the compare. The earlier
;     code ignored both carries and mis-divided / mis-rounded for such
;     inputs.
;
       ld b,16
       xor a
RoundHL_Div_C_Loop:
         add hl,hl                  ; next numerator bit -> carry
         rla                        ; ...and into the remainder
         jr c,RoundHL_Div_C_Sub     ; 9-bit remainder: must be >= C
         cp c
         jr c,RoundHL_Div_C_Next    ; remainder < C: quotient bit stays 0
RoundHL_Div_C_Sub:
           sub c
           inc l                    ; bit 0 of L is free after the shift
RoundHL_Div_C_Next:
         djnz RoundHL_Div_C_Loop
       add a,a                      ; A = 2*remainder
       jr c,RoundHL_Div_C_Up        ; 2*remainder >= 256 > C: round up
       cp c
       ret c                        ; 2*remainder < C: keep the quotient
RoundHL_Div_C_Up:
       inc hl
       or a                         ; report "rounded" with carry reset
       ret

Code: [Select]

;===============================================================
sqrtE:
;===============================================================
;Square root of an 8-bit value, rounded to the nearest integer.
;Input:
;     E is the value to find the square root of
;Outputs:
;     A is E-F^2, where F is the floor of the square root
;       (F = D when rounded down, F = D-1 when rounded up)
;     B is 0
;     D is the rounded result
;     E is not changed (it is rotated a full 8 times)
;     HL is not changed
;     c flag set means the result was rounded down,
;            reset means it was rounded up
;Destroys:
;     C
;Notes:
;     The root rounds up only when E-F^2 > F (i.e. E > F^2+F). The
;     earlier test "cp d / jr c" also rounded up when the remainder
;     equalled F, which returned 1 for E=0 and 2 for E=2.
;     The initial "ld c,a" was dropped: C is always written inside the
;     loop before it is read.
;Size  : 28 bytes
;===============================================================
        xor a               ; remainder = 0
        ld d,a              ; root = 0
        ld b,4              ; four result bits
sqrtELoop:
        rlc d               ; root <<= 1
        ld c,d
        scf
        rl c                ; C = 2*root+1, the trial subtrahend

        rlc e               ; bring the next two bits of E
        rla                 ; into the remainder
        rlc e
        rla

        cp c
        jr c,sqrtESkip      ; does not fit: this root bit stays 0
          inc d             ; root bit = 1
          sub c
sqrtESkip:
        djnz sqrtELoop
        inc d               ; tentatively round up
        cp d                ; carry when remainder < F+1, i.e. remainder <= F
        ret nc              ; remainder > F: keep the rounded-up root
        dec d               ; otherwise back to the floor (dec keeps carry)
        ret

Code: [Select]

;===============================================================
sqrtE:
;===============================================================
;Integer (floor) square root of an 8-bit value, two input bits per pass.
;Input:
;     E is the value to find the square root of
;Outputs:
;     A is E-D^2
;     B is 0
;     D is the result
;     E is not changed (it is rotated a full 8 times)
;     HL is not changed
;Destroys:
;     C=2D+1 if D is even, 2D-1 if D is odd
;Size and speed as quoted by the original author:
;     25 bytes, 332+3x cycles where x is the number of set bits in the
;     result (at most 4, so 332 to 344 cycles)
;===============================================================
        xor a               ; remainder = 0
        ld d,a              ; root = 0
        ld c,a
        ld b,4              ; four result bits
sqrtELoop:
        rlc d               ; root <<= 1
        ld c,d
        scf
        rl c                ; C = 2*root+1, the trial subtrahend

        rlc e               ; bring the next two bits of E
        rla                 ; into the remainder
        rlc e
        rla

        cp c
        jr c,sqrtESkip      ; does not fit: this root bit stays 0
          inc d             ; root bit = 1
          sub c
sqrtESkip:
        djnz sqrtELoop
        ret

It doesn't matter if they are optimised for speed or size, I just want to know what optimisation tricks I still need to establish. I just copied these out of my math routines folder, so some of them have random scratch work with them...

Quigibo · « **Reply #35 on:** August 11, 2011, 04:02:27 pm »

For your DE_Times_BC, this is one byte more in overhead, but much faster:

Code: [Select]

       ld a,c
       ld c,b
       ld hl,0
       ld b,16
Mul_Loop_1:
         add hl,hl
         add a,a
         rl c
         jr nc,$+3
           add hl,de
         djnz Mul_Loop_1
       ret

You could also call it the more unconventional way with CA_TIMES_DE which saves a byte and is still faster.

Code: [Select]

       ld hl,0
       ld b,16
Mul_Loop_1:
         add hl,hl
         add a,a
         rl c
         jr nc,$+3
           add hl,de
         djnz Mul_Loop_1
       ret

Xeda112358 · « **Reply #36 on:** August 11, 2011, 04:20:03 pm »

Quote from: Quigibo on August 11, 2011, 04:02:27 pm

For your DE_Times_BC, this is one byte more in overhead, but much faster:
Code: [Select]
; (quoted code) HL = DE * BC, with the multiplier shifted through C:A
       ld a,c
       ld c,b
       ld hl,0
       ld b,16
Mul_Loop_1:
         add hl,hl
         add a,a
         rl c
         jr nc,$+3
           add hl,de
         djnz Mul_Loop_1
       ret

I like this method for speeding things up! This is exactly the kind of thing I was hoping for. I want to understand how to program better, so the more ideas I can learn, the better off I should be on my quest

Xeda112358 · « **Reply #37 on:** December 01, 2011, 11:07:15 am »

Hmm, here is a signed division routine I wrote... I compared it to the HL_Div_BC routine I wrote.
If both inputs are positive, this is exactly the same speed, but if both are negative, it takes 50 cycles more and if only one is negative, it takes 71 cycles more.

Code: [Select]

;===============================================================
HL_SDiv_BC:
;===============================================================
;Signed HL/BC: divides the magnitudes, then negates the quotient if
;the operand signs differ.
;Speed and size as quoted by the original author:
;         1494 cycles (same as the regular HL_Div_BC)
;         add 25 if HL is negative
;         add 25 if BC is negative
;         add another 46 if only one is negative
;         54 bytes (31 bytes larger than the regular HL_Div_BC)
;Inputs:
;     HL is the numerator
;     BC is the denominator
;Outputs:
;     HL is the quotient
;     DE is the remainder of |HL|/|BC| (its sign is not adjusted)
;     BC is replaced by its absolute value
;     When the signs match: A is 0, z flag is set, c flag is reset.
;     When they differ the routine falls through the final negation
;     and A holds the high byte of the quotient instead.
;Limitation:
;     The numerator is pre-shifted once (its top bit is assumed to be
;     0 after the absolute value) and only 15 passes are run, so
;     HL = -32768 is divided as if it were 0.
;===============================================================
     ld a,h
     xor b
     and 128                 ; A = $80 if the signs differ, else 0
     push af                 ; remember it (and the Z flag)
absHL:
     bit 7,h
     jr z,absBC
     ld a,l                  ; HL = -HL (complement, then +1)
     cpl
     ld l,a
     ld a,h
     cpl
     ld h,a
     inc hl
absBC:
     bit 7,b
     jr z,HL_SDiv_BC_Start
     ld a,c                  ; BC = -BC
     cpl
     ld c,a
     ld a,b
     cpl
     ld b,a
     inc bc
HL_SDiv_BC_Start:
     add hl,hl               ; drop the (zero) sign bit of |HL|
       ld a,15               ; 15 magnitude bits remain
       ld de,0               ; remainder
Div_Loop_1:
         add hl,hl           ; next numerator bit -> carry
         ex de,hl            ; HL = remainder, DE = numerator/quotient
         adc hl,hl           ; pull the bit into the remainder
         or a
         sbc hl,bc           ; trial subtraction
         jr c,HL_SDiv_BC_Restore
           inc e             ; it fits: quotient bit = 1
           jr HL_SDiv_BC_Next
HL_SDiv_BC_Restore:
         add hl,bc           ; it does not: put the remainder back
HL_SDiv_BC_Next:
         ex de,hl
         dec a
         jr nz,Div_Loop_1
       pop af
       ret z                 ; same signs: quotient is already correct
     ld a,l                  ; HL = -HL
     cpl
     ld l,a
     ld a,h
     cpl
     ld h,a
     inc hl
       ret

calc84maniac · « **Reply #38 on:** December 01, 2011, 11:39:56 am »

I took a shot at optimizing it some more, I'm not sure if it works, but I think it should.

Code: [Select]

;===============================================================
HL_SDiv_BC:
;===============================================================
;Signed HL/BC. The divisor is kept negated so that the trial
;subtraction is a plain add hl,bc whose carry directly is the
;quotient bit.
;Speed and size as quoted by the original author:
;         1168 to 1318 cycles depending on how many set bits in the result
;         add 19 if HL is negative
;         add 19 if BC is positive
;         add another 28 if only one is negative
;         54 bytes (31 bytes larger than the regular HL_Div_BC)
;Inputs:
;     HL is the numerator
;     BC is the denominator
;Outputs:
;     HL is the quotient
;     DE is the remainder of |HL|/|BC| (its sign is not adjusted)
;     BC = -abs(BC)
;Limitation:
;     The sign test doubles HL and only 15 passes are run, so
;     HL = -32768 is divided as if it were 0.
;===============================================================
     ld a,h
     xor b                   ; sign flag = sign of the result
     push af
absHL:
     add hl,hl               ; sign of HL -> carry, HL = 2*HL
     jr nc,negabsBC
     xor a                   ; HL = 0 - HL, i.e. 2*|original HL|
     sub l
     ld l,a
     sbc a,a
     sub h
     ld h,a
negabsBC:
     bit 7,b
     jr nz,HL_SDiv_BC_Start  ; BC already negative
     xor a                   ; BC = 0 - BC
     sub c
     ld c,a
     sbc a,a
     sub b
     ld b,a
HL_SDiv_BC_Start:
       ex de,hl              ; DE = pre-shifted numerator, later the quotient
       xor a                 ; A = 0 and carry cleared for the first rl e
       ld h,a
       ld l,a                ; HL = remainder = 0
       ld a,15
Div_Loop_1:
         rl e                ; previous quotient bit in,
         rl d                ; next numerator bit out
         adc hl,hl           ; ...and into the remainder
         add hl,bc           ; remainder - |BC|; carry set when it fits
         jr c,HL_SDiv_BC_Next
          sbc hl,bc          ; it did not fit: restore (carry stays 0)
HL_SDiv_BC_Next:
         dec a               ; dec leaves carry untouched
         jr nz,Div_Loop_1
       ex de,hl              ; HL = quotient so far, DE = remainder
       adc hl,hl             ; rotate the final quotient bit in
       pop af
       ret p                 ; operand signs matched
     xor a                   ; HL = 0 - HL
     sub l
     ld l,a
     sbc a,a
     sub h
     ld h,a
     ret

Edit: Just realized I needed to clear the carry before the loop. My fix renders Xeda's ld hl,0 comment moot though, sorry

Xeda112358 · « **Reply #39 on:** December 01, 2011, 11:42:58 am »

Wow, awesome! But can the ld hl,0 be changed to sbc hl,hl? That would save a byte (but I think it is 5 cycles slower...)

Runer112 · « **Reply #40 on:** December 12, 2011, 03:46:00 pm »

Here's a very optimized way to convert a 16-bit signed number into an 8-bit signed number in a with overflow handling (if hl<-128, a=-128; if hl>127, a=127). Two added bonus to being super small and super fast are that it destroys nothing and that you could easily modify it to make the input a 16-bit register other than hl.

Code: [Select]

Signed16To8:
; Saturating conversion of the signed 16-bit value in HL to a signed
; 8-bit value in A: returns L when HL fits in -128..127, otherwise
; -128 ($80) for negative HL and 127 ($7F) for positive HL.
; Changes only A and the flags; HL is left intact.
	ld	a,l
	add	a,a		; sign bit of L -> carry
	sbc	a,a		; A = $FF if L is negative, $00 otherwise
	sub	h		; zero only if H is the sign extension of L
	ld	a,l		; ld does not touch the flags
	ret	z		; HL fits in 8 bits: return L
	ld	a,h
	add	a,a		; sign bit of HL -> carry
	sbc	a,a		; A = $FF if HL is negative, $00 otherwise
	xor	%01111111	; $FF -> $80 (-128), $00 -> $7F (127)
	ret

Xeda112358 · « **Reply #41 on:** December 12, 2011, 03:50:27 pm »

Yes, this was truly awesome getting to witness live amazingness while I also tried to create the same routine

My attempt was extremely ugly compared to this

Beautiful code, Runer112

Xeda112358 · « **Reply #42 on:** March 12, 2012, 02:50:35 pm »

Wow, I am surprised I haven't posted these in this topic as I have been very proud of them for a long while now. They make a legitimate use of RRD and RLD, so for those questioning the use, check it out:

Code: [Select]

ShiftScreenRight4:
;Shifts the graph screen right 4 pixels
;Walks plotSScreen forwards, 64 rows of 12 bytes. rrd moves the low
;nibble of A into the high nibble of (hl), the old high nibble into the
;low nibble, and the old low nibble into A for the next byte.
;Destroys A, BC, HL.
     ld hl,plotSScreen
     ld c,64                      ; row counter
ShiftScreenRight4_Row:
       xor a                      ; nothing is carried in at the left edge
       ld b,12                    ; bytes per row
ShiftScreenRight4_Byte:
         rrd
         inc hl
         djnz ShiftScreenRight4_Byte
       dec c
       jr nz,ShiftScreenRight4_Row
     ret
ShiftScreenLeft4:
;Shifts the graph screen left 4 pixels
;Walks plotSScreen backwards from its last byte (offset 767), 64 rows
;of 12 bytes. rld moves the low nibble of A into the low nibble of
;(hl), the old low nibble into the high nibble, and the old high nibble
;into A for the next byte.
;Destroys A, BC, HL.
     ld hl,plotSScreen+767
     ld c,64                      ; row counter
ShiftScreenLeft4_Row:
       xor a                      ; nothing is carried in at the right edge
       ld b,12                    ; bytes per row
ShiftScreenLeft4_Byte:
         rld
         dec hl
         djnz ShiftScreenLeft4_Byte
       dec c
       jr nz,ShiftScreenLeft4_Row
     ret

It is the same size as shifting 1 pixel, though 7680 cycles slower. That is still about 1 bazillion times faster than shifting left or right 1 pixel, 4 times. I've been using these for years in my graphics related programs

I hope they prove useful!

EDIT: In this case, 1 bazillion == 88664, apparently. To run the shifting right code once, it is 22166 cycles. The above codes use 29846 cycles.

Xeda112358 · « **Reply #43 on:** May 02, 2012, 08:28:56 pm »

Hmm, not sure why I haven't posted this here, yet, either. This is pretty useful, especially for parsing a list of numbers from some form of user input. Feel free to optimise and report back

Code: [Select]

;===============================================================
ConvRStr:
;===============================================================
;Parses an unsigned decimal number from a string of bytes '0'..'9'
;(30h..39h), stopping at the first byte that is not a digit.
;The value is accumulated modulo 65536.
;Input:
;     DE points to the base 10 number string in RAM.
;Outputs:
;     HL is the 16-bit value of the number
;     DE points to the terminating byte (it is not stepped over)
;     BC is HL/10 (the value before the last digit was appended;
;        BC is untouched if there were no digits at all)
;     c flag reset (nc)
;     z flag reset, except when the terminator is 3Ah, which leaves
;        A = 10 and z set
;Destroys:
;     A (terminating byte minus 30h)
;Size and speed as quoted by the original author:
;     23 bytes, 104n+42+11c cycles for n digits (c at most n-2),
;     at most 595 cycles for any 16-bit decimal value
;===============================================================
     ld hl,0                ; running value
ConvLoop:
     ld a,(de)
     sub 30h                ; '0'..'9' -> 0..9, anything else -> 10..255
     cp 10
     ret nc                 ; not a digit: done
     inc de

     ld b,h                 ; BC = value so far
     ld c,l
     add hl,hl              ; HL = 2*value
     add hl,hl              ; HL = 4*value
     add hl,bc              ; HL = 5*value
     add hl,hl              ; HL = 10*value

     add a,l                ; append the new digit
     ld l,a
     jr nc,ConvLoop
     inc h                  ; carry out of the low byte
     jr ConvLoop

The ones with t-states as '---' are computed along with the previous instruction to make calculations easier. Anyways, to give an idea, at the slowest, this can execute 9803 times per second (assuming you are using call which takes another 17 t-states). This stops reading when a character that is not a decimal number is run into (for example, a comma or newline).

EDIT: By removing that one byte, timing is much more easily computed and slowest time drops from 625 to 595 t-states

This means it can execute an extra 459 times per second. It also makes the c flag have a definite output and as well the z flag

calc84maniac · « **Reply #44 on:** May 02, 2012, 08:52:30 pm »

The ret c on line 23 is redundant, but otherwise a great routine

Author Topic: ASM Optimized routines (Read 114514 times)

DJ Omnimaga

Re: ASM Optimized routines

Quigibo

Re: ASM Optimized routines

Galandros

Re: ASM Optimized routines

calc84maniac

Re: ASM Optimized routines

Xeda112358

Re: ASM Optimized routines

Quigibo

Re: ASM Optimized routines

Xeda112358

Re: ASM Optimized routines

Xeda112358

Re: ASM Optimized routines

calc84maniac

Re: ASM Optimized routines

Xeda112358

Re: ASM Optimized routines

Runer112

Re: ASM Optimized routines

Xeda112358

Re: ASM Optimized routines

Xeda112358

Re: ASM Optimized routines

Xeda112358

Re: ASM Optimized routines

calc84maniac

Re: ASM Optimized routines