Author Topic: Assembly Programmers - Help Axe Optimize! (Read 166226 times)

parserp · « **Reply #255 on:** November 16, 2011, 05:54:13 pm »

Quote from: buttsfredkin on November 04, 2011, 10:24:39 am

Hm...I say this as an Axe programmer, not knowing ASM...how about UPSIDE DOWN TEXT! om nom nom nom

Yeah! it could be something like Fix 11

that would be awesome!

Quigibo · « **Reply #256 on:** November 16, 2011, 08:54:40 pm »

But TI doesn't have a text flag for that

It has to be something that already exists.

ztrumpet · « **Reply #257 on:** November 16, 2011, 09:31:22 pm »

I like the lowercase toggle option. Actually, now that I think about it, it wouldn't be that useful because there's no reliable input command. Hmm...

Xeda112358 · « **Reply #258 on:** November 17, 2011, 12:31:08 pm »

I believe there is a Feature Wishlist thread ...

And to be relevant to the topic, I have not found any *actual* optimisations just from the quick look I made. There might be more, but y'all have done an amazing job so far!

jacobly · « **Reply #259 on:** December 06, 2011, 01:36:32 am »

p_SDiv: same size, saves an average of ~~7 cycles~~ 3.5 cycles
Edit: oops, abs(hl) isn't always positive (what???)

Original

Code: [Select]

p_SDiv:
	.db __SDivEnd-1-$
	ld	a,h
	xor	d
	push	af
	bit	7,h
	jr	z,$+8
	xor	a
	sub	l
	ld	l,a
	sbc	a,a
	sub	h
	ld	h,a
	bit	7,d
	jr	z,$+8
	xor	a
	sub	e
	ld	e,a
	sbc	a,a
	sub	d
	ld	d,a
	call	$3F00+sub_Div
	pop	af
	ret	p
	xor	a
	sub	l
	ld	l,a
	sbc	a,a
	sub	h
	ld	h,a
	ret
__SDivEnd:

Optimized

Code: [Select]

p_SDiv:
	.db __SDivEnd-1-$
	ld	a,h
	xor	d
	push	af
	xor	d ; a = h
	jp	p,$+9
	xor	a
	sub	l
	ld	l,a
	sbc	a,a
	sub	h
	ld	h,a
	; a = high byte of abs(hl)
	; therefore, bit 7 is clear
	; xor	d ; or works too
	; jp	p,$+9
	; Edit: doesn't work if hl = $8000
	; because abs($8000) = $8000
	bit	7,d
	jr	z,$+8
	xor	a
	sub	e
	ld	e,a
	sbc	a,a
	sub	d
	ld	d,a
	call	$3F00+sub_Div
	pop	af
	ret	p
	xor	a
	sub	l
	ld	l,a
	sbc	a,a
	sub	h
	ld	h,a
	ret
__SDivEnd:

Edit: Some more attempts at optimizing.

Optimized (speed)
+1 byte, avg -19 cycles

Code: [Select]

p_SDiv:
	.db __SDivEnd-1-$
	ld	a,h
	xor	d
	push	af
	xor	d ; a = h
	jp	p,$+9
	xor	a
	sub	l
	ld	l,a
	sbc	a,a
	sub	h
	ld	h,a
	ld	a,d
	or	a
	jp	p,$+9
	xor	a
	sub	e
	ld	e,a
	sbc	a,a
	sub	d
	ld	d,a
	ld b,b
	.db 2
	call $3F00+sub_Div
	pop	af
	ret	p
	xor	a
	sub	l
	ld	l,a
	sbc	a,a
	sub	h
	ld	h,a
	ret
__SDivEnd:

p_Div:
	.db __DivEnd-1-$
	ld	a,d
	or	a
	ld	b,16
	; ...

Optimized (size)
-4 bytes, avg +37 cycles

Code: [Select]

p_SDiv:
	.db __SDivEnd-1-$
	ld	a,h
	xor	d
	push	af
	ld	b,2
__SDivRepeat:
	ex	de,hl
	xor	h
	jp	p,__SDivSkip
	xor	a
	sub	l
	ld	l,a
	sbc	a,a
	sub	h
	ld	h,a
__SDivSkip:
	xor	a
	djnz	__SDivRepeat
	call $3F00+sub_Div
	pop	af
	ret	p
	xor	a
	sub	l
	ld	l,a
	sbc	a,a
	sub	h
	ld	h,a
	ret
__SDivEnd:

Optimized (lol)
-8 bytes, avg +110-4 cycles

Code: [Select]

p_SDiv:
	.db __SDivEnd-1-$
	ld	a,h
	xor	d
	ld	b,2
__SDivRepeat:
	push	af
	xor	d
	jp	p,__SDivSkip
	xor	a
	sub	l
	ld	l,a
	sbc	a,a
	sub	h
	ld	h,a
__SDivSkip:
	dec	b
	ret	m
	ex	de,hl
	ld	b,b
	.db	1
	call	z,$3F00+sub_Div
	inc	b
	pop	af
	djnz	__SDivRepeat
	jr	__SDivRepeat+2
__SDivEnd:

p_Div:
	.db __DivEnd-1-$
	ld	a,d
	or	a
	ld	b,16
	; ...

Edit: p_SortD: same size, 2*size-5 cycles faster

Original

Code: [Select]

p_SortD:
	.db __SortDEnd-1-$
	ld	c,l
	ex	de,hl
__SortDLoop2:
	ld	b,c
	push	hl
	jr	__SortDJumpIn
__SortDLoop1:
	inc	hl
	cp	(hl)
	jr	c,__SortDSkip
__SortDJumpIn:
	ld	a,(hl)
	ld	d,h
	ld	e,l
__SortDSkip:
	djnz	__SortDLoop1
	ld	b,(hl)
	ld	(hl),a
	ld	a,b
	ld	(de),a
	pop	hl
	dec	c
	jr	nz,__SortDLoop2
	ret
__SortDEnd:

Optimized

Code: [Select]

p_SortD:
	.db __SortDEnd-1-$
	ld	c,l
	ex	de,hl
__SortDLoop2:
	ld	b,c
	push	hl
	jr	__SortDJumpIn
__SortDLoop1:
	inc	hl
	cp	(hl)
	jr	c,__SortDSkip
__SortDJumpIn:
	ld	a,(hl)
	ld	d,h
	ld	e,l
__SortDSkip:
	djnz	__SortDLoop1
	ldi
	dec	hl
	ld	(hl),a
	pop	hl
	jp	pe,__SortDLoop2
	ret
__SortDEnd:

Edit: p_Reciprocal: same size, saves avg 2 cycles

Original

Code: [Select]

p_Reciprocal:
	.db __ReciprocalEnd-1-$
	xor	a
	bit	7,h
	push	af
	jr	z,$+7
	sub	l
	ld	l,a
	sbc	a,a
	sub	h
	ld	h,a
	ex	de,hl
	ld	bc,$1000
	ld	hl,1
	xor	a
	ld	b,b
	.db 10
	call	$3F00+sub_Div
	pop	af
	ret	z
	sub	l
	ld	l,a
	sbc	a,a
	sub	h
	ld	h,a
	ret
__ReciprocalEnd:

Optimized
avg -2 cycles

Code: [Select]

p_Reciprocal:
	.db __ReciprocalEnd-1-$
	xor	a
	bit	7,h
	push	af
	jr	z,$+8
	sub	l
	ld	l,a
	sbc	a,a
	sub	h
	ld	h,a
	xor	a
	ex	de,hl
	ld	bc,$1000
	ld	hl,1
	ld	b,b
	.db 10
	call	$3F00+sub_Div
	pop	af
	ret	z
	sub	l
	ld	l,a
	sbc	a,a
	sub	h
	ld	h,a
	ret
__ReciprocalEnd:

Optimized Moar
avg -33 cycles

Code: [Select]

p_Reciprocal:
	.db __ReciprocalEnd-1-$
	xor	a
	bit	7,h
	push	af
	jr	z,$+8
	sub	l
	ld	l,a
	sbc	a,a
	sub	h
	ld	h,a
	xor	a
	ex	de,hl
	ld	bc,$1001
	ld	hl,2
	ld	b,b
	.db 16
	call	$3F00+sub_Div
	pop	af
	ret	z
	sub	l
	ld	l,a
	sbc	a,a
	sub	h
	ld	h,a
	ret
__ReciprocalEnd:

Edit: p_Mod: 2 bytes smaller, 96 cycles faster!

Original

Code: [Select]

p_Mod:
	.db __ModEnd-1-$
	ld	a,h
	ld	c,l
	ld	hl,0
	ld	b,16
__ModLoop:
	scf
	rl	c
	rla
	adc	hl,hl
	sbc	hl,de
	jr	nc,__ModSkip
	add	hl,de
	dec	c
__ModSkip:
	djnz	__ModLoop
	ret
__ModEnd:

Optimized

Code: [Select]

p_Mod:
	.db __ModEnd-1-$
	ld	a,h
	ld	c,l
	ld	hl,0
	ld	b,16
__ModLoop:
	sla	c
	rla
	adc	hl,hl
	sbc	hl,de
	jr	nc,__ModSkip
	add	hl,de
__ModSkip:
	djnz	__ModLoop
	ret
__ModEnd:

Happybobjr · « **Reply #260 on:** December 06, 2011, 07:56:54 am »

awesome I needed speed boost in division.
p_SDiv is signed 16 bit division, right?

jacobly · « **Reply #261 on:** December 09, 2011, 05:53:34 am »

Quote from: Quigibo on November 04, 2011, 01:58:14 am

Not an optimization, but I'm posting this here since more assembly people will read it. Since the Bitmap() command is being replaced with something actually useful, that means the "Fix 8" and "Fix 9" will also need to be replaced. Are there any useful flags (particularly for text) that would be useful to Axe programmers that I haven't already covered with the other fix commands? A couple I can think of are an APD toggle or Lowercase toggle.

I agree with adding the lowercase enable flag, especially if p_GetKeyPause is modified slightly. For example:

Code: [Select]

p_GetKeyPause: ; Change to subroutine
	.db __GetKeyPauseEnd-1-$
	B_CALL(_GetKeyRetOff)
	res	7,(iy+40)
	ld	h,0
	ld	l,a
	cp	$fc
	ret	c
	ld	a,($8446)
	ld	h,a ; Edit: something like inc h \ ld l,a might be easier
	; since lowercase letters would be consecutive
	; or ld hl,($8446) \ ld h,a
	ret
__GetKeyPauseEnd-1-$

Quigibo · « **Reply #262 on:** December 10, 2011, 05:06:35 pm »

Thanks for the optimizations

Unfortunately that p_SortD won't work because ldi also increases de. Also, the p_Mod won't work because c's right bit will never be set in either of those cases.

jacobly · « **Reply #263 on:** December 10, 2011, 05:11:59 pm »

For p_SortD, affecting de doesn't matter because if you follow the code path, the next occurrence of de is when it is loaded from hl, so its contents don't matter.
As for p_Mod, ac is the division result, which is never needed. You can also notice that in the original routine, the new bits shifted into ac are never read.

Edit: fixed grammar

Also, some peephole ops I would find useful.

Code: [Select]

	.db 3
	sbc	hl,de
	ld	a,h
	or	l
	.db 2
	sbc	hl,de

Code: [Select]

	.db	8
	ld	de,$0000
	add	hl,de
	sbc	hl,hl
	inc	hl
	dec	hl
	.db	6
	ld	de,$0000
	add	hl,de
	sbc	hl,hl

Quigibo · « **Reply #264 on:** December 10, 2011, 07:20:44 pm »

Oh I forgot to mention that I've optimized division now by having the long division routine call the modulus subroutine to save space... I do see what you mean now about the sorting, so I added that change.

Generally I don't do peephole optimizations unless all the registers are the same, but I'm 99% sure that this particular one should be okay and that nothing else in the Axe protocol currently relies on the "a" register after a zero check, so I'll add that one. That second one seems rare, it would only occur if you added 1 after a signed greater than or equal to zero comparison. So at least until make it faster, I'm only trying to do the most common/significant optimizations because each one I add slows down parsing.

jacobly · « **Reply #265 on:** December 10, 2011, 07:43:58 pm »

I had a sneaking suspicion that you would find some reason to call p_Mod

For the first peephole optimization, since you already had

Code: [Select]

	.db 3
	sbc	hl,hl
	ld	a,h
	or	l
	.db 2
	sbc	hl,hl

I figured that you already checked that the value of a is not used.

The second one was an optimization for 32-bit subtraction (it was supposed to be p_LtLeXX followed by dec hl). However, the following should work because it has no differing side effects and should be more common. (btw, I think Runer may have made a similar suggestion)
.db 2
inc hl
dec hl
.db 0 ;<- dont know if this works

jacobly · « **Reply #266 on:** December 11, 2011, 07:34:47 am »

Lol... I'm already optimizing

p_ArcTan: same size, save 19 or 10 (avg 14.5) cycles

Original

Code: [Select]

p_ArcTan:
	.db __ArcTanEnd-1-$
	ex	de,hl		;de = y
	pop	hl
	ex	(sp),hl		;hl = x
	push	hl
	ld	a,h		;\
	xor	d		; |Get pairity
	rla			;/
	jr	c,__ArcTanSS	;\
	add	hl,de		; |
	add	hl,de		; |
__ArcTanSS:			; |
	or	a		; |hl = x +- y
	sbc	hl,de		;/
	ex	de,hl		;de = x +- y
	ld	b,6		;\
__ArcTan64:			; |
	add	hl,hl		; |hl = 64y
	djnz	__ArcTan64	;/
	call	$3F00+sub_SDiv	;hl = 64y/(x +- y)
	pop	af		;\
	rla			; |Right side, fine
	ret	nc		;/
	sbc	a,a		;\
	sub	h		; |Reverse sign extend
	ld	h,a		;/
	ld	a,l		;\
	add	a,128		; |Add or sub 128
	ld	l,a		;/
	ret
__ArcTanEnd:

Optimized

Code: [Select]

p_ArcTan:
	.db __ArcTanEnd-1-$
	ex	de,hl		;de = y
	pop	hl
	ex	(sp),hl		;hl = x
	push	hl
	ld	a,h		;\
	xor	d		;/ Get parity
	jp	m,__ArcTanSS-p_ArcTan-1
	add	hl,de		;\
	jr	__ArcTanDS	; |
__ArcTanSS:			; |hl = x +- y
	sbc	hl,de		; |
__ArcTanDS:			;/
	ex	de,hl		;de = x +- y
	ld	b,6		;\
__ArcTan64:			; |
	add	hl,hl		; |hl = 64y
	djnz	__ArcTan64	;/
	call	$3F00+sub_SDiv	;hl = 64y/(x +- y)
	pop	af		;\
	rla			; |Right side, fine
	ret	nc		;/
	sbc	a,a		;\
	sub	h		; |Reverse sign extend
	ld	h,a		;/
	ld	a,l		;\
	add	a,128		; |Add or sub 128
	ld	l,a		;/
	ret
__ArcTanEnd:

I'm curious as to why you multiplied by 64 before dividing. It would seem that if the times 64 was after the division, the result would generally be the same, but there would be less of a chance of overflow. It's possible though that it doesn't matter.
Edit: Oh yeah... accuracy. Your way is more accurate, nvm.

p_DrawBmp: saves 3 to 4 bytes, and (8 ± 0 or 4) × (visible height) - 8 cycles

Original

Code: [Select]

p_DrawBmp:
	; ...
__DrawBmpGoodSize:
	ld	b,a			;B = plot_height
	push	bc			;****** BEGIN BUFFER CALCULATIONS ******
	; ...
__DrawBmpLeftLoop:
	inc	c
	dec	c
	jr	z,__DrawBmpSkipMain
	dec	c
	; ...
__DrawBmpOnLeft:			;A = X + 8
	inc	c
	dec	c
	ld	d,(hl)
	inc	hl
	ld	e,c			;E = 0 if z
	jr	z,__DrawBmpSt
	; ...
__DrawBmpStSkip:
	ld	a,e
	pop	de			;D = X
	ld	e,c
	pop	bc
	ld	c,e			;C = bytes
	; ...
__DrawBmpColWall:
	inc	c
	dec	c
	jr	z,__DrawBmpSkipMain
	dec	c
	ld	a,d
	jr	nz,__DrawBmpColLeft
	cp	88
	ld	d,(hl)
	inc	hl
	jr	nc,__DrawBmpSkipMain
	ld	e,c
	jr	__DrawBmpSt
	; ...

Optimized

Code: [Select]

p_DrawBmp:
	; ... c = bytes + 1 is required for the rest of the optimizations
__DrawBmpGoodSize:
	ld	b,a			;B = plot_height
	inc	c			;C = bytes+1
	push	bc			;****** BEGIN BUFFER CALCULATIONS ******
	; ... undo inc c above, affect z flag the same as before, c is still one more than before
__DrawBmpLeftLoop:
	dec	c
	jr	z,__DrawBmpSkipMain
	; ... since c is one more than before, check e = c - 1 for 0, instead of c
__DrawBmpOnLeft:			;A = X + 8
	ld	d,(hl)
	inc	hl
	ld	e,c
	dec	e			;E = 0 and z (if bytes = 0)
	jr	z,__DrawBmpSt
	; ... this stores one more than before to e, but all code paths lead to
	; either pop de, ld e,(hl), or ld e,c before e is ever used.
__DrawBmpStSkip:
	ld	a,e
	pop	de			;D = X
	ld	e,c
	pop	bc
	ld	c,e			;C = bytes+1
	; ... same as above
__DrawBmpColWall:
	dec	c
	jr	z,__DrawBmpSkipMain
	ld	a,d
	jr	nz,__DrawBmpColLeft
	cp	88
	ld	d,(hl)
	inc	hl
	jr	nc,__DrawBmpSkipMain
	; I do not understand the reason for ld e,c, however, c is one more than before,
	; so dec e to have e be the same as before, but I don't know if this is necessary.
	ld	e,c
	dec	e
	jr	__DrawBmpSt
	; ...

Sorry for bumping some of these so soon, but I wanted to change them to work with the new version.

p_88Mul: same size, saves 1 or 6 (avg 3.5) cycles

Original

Code: [Select]

p_88Mul:
	.db __88MulEnd-1-$
	ld	a,h
	xor	d
	push	af
	bit	7,h
	jr	z,$+8
	xor	a
	sub	l
	ld	l,a
	sbc	a,a
	sub	h
	ld	h,a
	bit	7,d
	jr	z,$+8
	xor	a
	sub	e
	ld	e,a
	sbc	a,a
	sub	d
	ld	d,a
	call	$3F00+sub_MulFull
	ld	l,h
	ld	h,a
	pop	af
	xor	h
	ret	p
	xor	a
	sub	l
	ld	l,a
	sbc	a,a
	sub	h
	ld	h,a
	ret
__88MulEnd:

Optimized

Code: [Select]

p_88Mul:
	.db __88MulEnd-1-$
	ld	a,h
	xor	d
	push	af
	xor	d ; a = h
	jp	p,$+9-p_88Mul-1
	xor	a
	sub	l
	ld	l,a
	sbc	a,a
	sub	h
	ld	h,a
	bit	7,d
	jr	z,$+8
	xor	a
	sub	e
	ld	e,a
	sbc	a,a
	sub	d
	ld	d,a
	call	$3F00+sub_MulFull
	ld	l,h
	ld	h,a
	pop	af
	xor	h
	ret	p
	xor	a
	sub	l
	ld	l,a
	sbc	a,a
	sub	h
	ld	h,a
	ret
__88MulEnd:

p_SDiv: same size, saves 1 or 6 (avg 3.5) cycles

Original

Code: [Select]

p_SDiv:
	.db __SDivEnd-1-$
	ld	a,h
	xor	d
	push	af
	bit	7,h
	jr	z,$+8
	xor	a
	sub	l
	ld	l,a
	sbc	a,a
	sub	h
	ld	h,a
	bit	7,d
	jr	z,$+8
	xor	a
	sub	e
	ld	e,a
	sbc	a,a
	sub	d
	ld	d,a
	call	$3F00+sub_Div
	pop	af
	ret	p
	xor	a
	sub	l
	ld	l,a
	sbc	a,a
	sub	h
	ld	h,a
	ret
__SDivEnd:

Optimized

Code: [Select]

p_SDiv:
	.db __SDivEnd-1-$
	ld	a,h
	xor	d
	push	af
	xor	d ; a = h
	jp	p,$+9-p_SDiv-1
	xor	a
	sub	l
	ld	l,a
	sbc	a,a
	sub	h
	ld	h,a
	bit	7,d
	jr	z,$+8
	xor	a
	sub	e
	ld	e,a
	sbc	a,a
	sub	d
	ld	d,a
	call	$3F00+sub_Div
	pop	af
	ret	p
	xor	a
	sub	l
	ld	l,a
	sbc	a,a
	sub	h
	ld	h,a
	ret
__SDivEnd:

p_Reciprocal: same size, saves 31 cycles
Let me know if you want this one explained.

Original

Code: [Select]

p_Reciprocal:
	.db __ReciprocalEnd-1-$
	xor	a
	bit	7,h
	push	af
	jr	z,$+8
	sub	l
	ld	l,a
	sbc	a,a
	sub	h
	ld	h,a
	xor	a
	ex	de,hl
	ld	bc,$1000
	ld	hl,1
	ld b,b \ .db 7 \ call $3F00+sub_Mod
	ld	h,a
	ld	l,c
	pop	af
	ret	z
	sub	l
	ld	l,a
	sbc	a,a
	sub	h
	ld	h,a
	ret
__ReciprocalEnd:

Optimized

Code: [Select]

p_Reciprocal:
	.db __ReciprocalEnd-1-$
	xor	a
	bit	7,h
	push	af
	jr	z,$+8
	sub	l
	ld	l,a
	sbc	a,a
	sub	h
	ld	h,a
	xor	a
	ex	de,hl
	ld	bc,$1001
	ld	hl,2
	ld b,b \ .db 13 \ call $3F00+sub_Mod
	ld	h,a
	ld	l,c
	pop	af
	ret	z
	sub	l
	ld	l,a
	sbc	a,a
	sub	h
	ld	h,a
	ret
__ReciprocalEnd:

Runer112 · « **Reply #267 on:** December 12, 2011, 01:48:13 am »

First, an optimization so simple, it doesn't even deserve a fancy side-by-side comparison. It's funny that nobody (including myself, until now) noticed this optimization, because it's something you wouldn't even think about optimizing. But anyways, getting to the optimization: remove the ld hl,0 at the start of p_Mul! Who would've thought, optimize code by simply deleting a line of it!

Second, I actually thought of the above optimization while toying around with the multiplication routine attempting to optimize it in another manner. Although I know you generally like to optimize for size, I think that with a routine like multiplication that is so heavily used, a routine optimized more for speed would be worth it. Especially if the cost in size is a paltry two bytes! This optimization will get a fancy side-by-side comparison.

Smaller routine: 14 bytes, ~836 cycles

Code: [Select]

p_Mul:
	.db __MulEnd-1-$
	ld	c,h
	ld	a,l
	ld	b,16
__MulNext:
	add	hl,hl
	add	a,a
	rl	c
	jr	nc,__MulSkip
	add	hl,de
__MulSkip:
	djnz	__MulNext
	ret
__MulEnd:

Faster routine: 16 bytes, ~741 cycles

Code: [Select]

p_Mul:
	.db __MulEnd-1-$
	ld	c,l
	ld	a,h
	call	__MulByte
	ld	a,c
__MulByte:
	ld	b,8
__MulNext:
	add	hl,hl
	add	a,a
	jr	nc,__MulSkip
	add	hl,de
__MulSkip:
	djnz	__MulNext
	ret
__MulEnd:

EDIT: The division routine just gave me a great idea. For another paltry two bytes, if you'd like, you can cut the time for multiplying 8-bit numbers in half! Compared to Axe's current routine, you would get a speed increase anywhere from 12% to 120% for a total size increase of one byte! Not bad! Of the three routines I have provided, this routine is definitely my favorite.

Even faster routine: 18 bytes,
~749 cycles for 16-bit inputs (h!=0),
~386 cycles for 8-bit inputs (h=0)

Code: [Select]

p_Mul:
	.db __MulEnd-1-$
	ld	c,l
	xor	a
	ld	l,a
	add	a,h
	call	nz,__MulByte
	ld	a,c
__MulByte:
	ld	b,8
__MulNext:
	add	hl,hl
	add	a,a
	jr	nc,__MulSkip
	add	hl,de
__MulSkip:
	djnz	__MulNext
	ret
__MulEnd:

EDIT 2: Another thought: the same basic optimization I applied in the 16-byte routine (the faster routine before the 8-bit optimization) could be applied to p_MulFull to save a couple hundred cycles in each fixed-point multiplication. High-order multiplication and fixed-point multiplication could no longer share a routine, but it might be worth it considering the two multiplication techniques are (in my experience) not commonly used in the same scenario.

Builderboy · « **Reply #268 on:** December 12, 2011, 02:28:58 am »

*Builderboy votes for using super fast multiplication

*

Quigibo · « **Reply #269 on:** December 12, 2011, 02:58:15 am »

@jacobly
Your p_DrawBmp optimizations don't do the same thing as the original code (which is why you're confused about the use of e). For instance, right after __DrawBmpColWall, my original routine checks if c is zero, then checks if c is one. Yours checks if c is zero, then if its non-zero. The e register, which is the left aligned byte to be shifted, is only loaded with c as an optimization in the case that c is zero because the sprite is clipped on that wall. This is always the same as doing ld e,0.

But thanks for the other optimizations, I have added all of them

@Runer112
Mind == Blown. That's unbelievably cool! Modular arithmetic is quite a strange beast sometimes. Anyway, I like that last suggestion best as well

However, even though removing the zero loading works for the 16 bit multiplication and (I presume) the 32 bit multiplication, I don't think that last optimization will work for 32-bit, unless you can think of another method?

Author Topic: Assembly Programmers - Help Axe Optimize! (Read 166226 times)

parserp

Re: Assembly Programmers - Help Axe Optimize!

Quigibo

Re: Assembly Programmers - Help Axe Optimize!

ztrumpet

Re: Assembly Programmers - Help Axe Optimize!

Xeda112358

Re: Assembly Programmers - Help Axe Optimize!

jacobly

Re: Assembly Programmers - Help Axe Optimize!

Happybobjr

Re: Assembly Programmers - Help Axe Optimize!

jacobly

Re: Assembly Programmers - Help Axe Optimize!

Quigibo

Re: Assembly Programmers - Help Axe Optimize!

jacobly

Re: Assembly Programmers - Help Axe Optimize!

Quigibo

Re: Assembly Programmers - Help Axe Optimize!

jacobly

Re: Assembly Programmers - Help Axe Optimize!

jacobly

Re: Assembly Programmers - Help Axe Optimize!

Runer112

Re: Assembly Programmers - Help Axe Optimize!

Builderboy

Re: Assembly Programmers - Help Axe Optimize!

Quigibo

Re: Assembly Programmers - Help Axe Optimize!