Show Posts

This section allows you to view all posts made by this member. Note that you can only see posts made in areas you currently have access to.

Messages - Xeda112358

Pages: 1 ... 13 14 [15] 16 17 ... 317

211

ASM / Re: [z80] 32 bit by 16 bits division and 32 bit square root

« on: March 23, 2019, 10:18:40 am »

Here is my version. It's about 50 bytes larger, but averages 1889.75cc. It does use stack and shadow registers, but it could be made faster by totally unrolling (which would be about 300 bytes of code).

Code: [Select]

sqrt32:
;32-bit integer square root (Z80). Consumes the input two bits per iteration,
;16 iterations in total, following the Python reference below.
;Input: HLDE
;Output: DE is the square root, AHL is the remainder
;Destroys: D'E', H'L'
;NOTE(review): this entry code executes an odd number of `exx` (9 in total,
;counting the one inside each sqrt32sub call), so it finishes with the register
;banks swapped relative to entry; the outputs are in the bank that is active on
;return. It also executes `di` and no `ei` is visible here, so interrupts
;appear to be left disabled - confirm that callers expect both.
;Speed: 248+{0,44}+3*sqrt32sub+sqrt32sub_2+sqrt32_iter15
;min: 1697cc
;max: 2086cc
;avg: 1889.75cc
;
;Python implementation:
;  (x is the 32-bit input)
;  remainder = 0
;  acc = 0
;  for k in range(16):
;    acc<<=1
;    x&=0xFFFFFFFF
;    x<<=2
;    y=x>>32
;    remainder<<=2
;    remainder+=y
;    if remainder>=acc*2+1:
;      remainder-=(acc*2+1)
;      acc+=1
;  return [acc,remainder]
;
  di
  exx             ;work registers live in the other bank
  ld hl,0         ;remainder
  ld d,h \ ld e,h ;acc
  exx             ;back to the bank holding the input

;Each line: fetch one input byte into A, run 4 iterations on it in the other
;bank (sqrt32sub does its own `exx` on entry), then swap back for the next byte.
  ld a,h \ call sqrt32sub \ exx
  ld a,l \ call sqrt32sub \ exx
  ld a,d \ call sqrt32sub \ exx
;Now we have four more iterations
;The first two are no problem
  ld a,e
  exx             ;stay in the working bank from here on
  call sqrt32sub_2

;On the next iteration, HL might temporarily overflow by 1 bit
  call sqrt32_iter15
;Execution falls through into sqrt32_iter16 for the last iteration.

;On the next iteration, HL is allowed to overflow, DE could overflow with our current routine, but it needs to be shifted right at the end, anyways
sqrt32_iter16:
;Final (16th) iteration of sqrt32, done with an extended accumulator A:HL
;because the shifted remainder no longer fits in 16 bits.
;Input:  A  = last two input bits in bits 7 and 6, low six bits zero
;        HL = remainder so far, DE = 2 * (root so far)
;Output: DE = square root, AHL = remainder
;
;Shift the last two input bits into HL; the bits that fall out of HL are
;collected in the low bits of A by the two `rla`.
  add a,a
  adc hl,hl
  rla
  adc hl,hl
  rla             ;carry is clear here as long as A's low six bits were zero on entry
;AHL - (DE+DE+1)
  sbc hl,de \ sbc a,0
  inc e           ;DE+1; also the final root if the subtraction succeeds
;The first `sbc a,0` sets carry when AHL < DE. Without clearing it, the second
;subtraction takes away one too many and the restored remainder ends up off
;by one (e.g. input 4 gave remainder -1 instead of 0).
  or a
  sbc hl,de \ sbc a,0
  ret p           ;A non-negative: the trial subtraction fit, done
;Subtraction went negative: undo it and drop the low bit of the root.
  add hl,de
  adc a,0
  dec e
  add hl,de
  adc a,0
  ret


sqrt32sub:
;Runs four iterations on the byte in A.
;Switches register bank first (`exx`); the caller swaps back afterwards.
;The `call` below runs sqrt32sub_2 once, and the fall-through runs it again.
;min: 391cc
;max: 483cc
;avg: 437cc
  exx
  call sqrt32sub_2

sqrt32sub_2:
;Runs two iterations: the `call +_` executes the iteration below once, and
;returning from it falls through into the same code a second time.
;min: 185cc
;max: 231cc
;avg: 208cc
  call +_

_:
;One iteration. A supplies the next two input bits (top bits first),
;HL is the remainder, DE is twice the root found so far.
;min: 84cc
;max: 107cc
;avg: 95.5cc

;`sll` is an undocumented Z80 instruction: shift left and set bit 0.
  sll e \ rl d      ;sla e \ rl d \ inc e

;Shift two bits from A into the bottom of HL.
  add a,a
  adc hl,hl
  add a,a
  adc hl,hl

;Trial subtraction. Assumes the last `adc hl,hl` left carry clear (no overflow
;out of HL at this stage - see sqrt32_iter15 for the case where it can).
  sbc hl,de
  inc e             ;`inc` leaves carry untouched, so `ret nc` still tests the sbc
  ret nc            ;it fit: the new root bit is 1
  dec e             ;did not fit: restore HL ...
  add hl,de
  dec e             ;... and clear the new root bit
  ret

sqrt32_iter15:
;15th iteration: same as the generic iteration, except that shifting the
;remainder left may carry out of HL, which is handled separately.
;91+{8,0+{0,23}}
;min: 91cc
;max: 114cc
;avg: 100.75cc

  sll e \ rl d      ;sla e \ rl d \ inc e
  add a,a
  adc hl,hl
  add a,a
  adc hl,hl       ;This might overflow!
  jr c,sqrt32_iter15_br0
;
;No overflow: ordinary trial subtraction with restore on borrow.
  sbc hl,de
  inc e
  ret nc
  dec e
  add hl,de
  dec e
  ret
sqrt32_iter15_br0:
;Overflow: the remainder has a 17th bit set, so the subtraction is taken
;unconditionally. `or a` clears the carry left by `adc hl,hl` so that
;`sbc` subtracts exactly DE.
  or a
  sbc hl,de
  inc e
  ret

EDIT:
Oh jeez, here is an even bigger version that uses less stack space and doesn't use shadow registers or index registers:

Code: [Select]

sqrt32:
;32-bit integer square root without shadow or index registers.
;The first 8 iterations are done by sqrtHL on the upper 16 bits; the other 8
;are done here on the lower 16 bits.
;Input: HLDE
;Output: presumably DE = square root, AHL = remainder, as in the other
;        versions of this routine - TODO confirm
;Destroys: BC (holds the low input word; sqrtHL also clobbers it)
;speed: 238+{0,1}+{0,44}+sqrtHL+3*sqrt32sub_2+sqrt32_iter15
;min: 1260
;max: 1506
;avg: 1377.75

  push de         ;keep the low 16 input bits across sqrtHL
  call sqrtHL     ;A = sqrt of upper word, HL = remainder, D = 0
  pop bc          ;low input word now in BC
;DE = 2*A, using D to hold the bit shifted out of A.
  add a,a
  ld e,a
  jr nc,+_
  inc d
_:

;Four iterations on the byte that was in D.
  ld a,b
  call sqrt32sub_2
  call sqrt32sub_2
;Now we have four more iterations
;The first two are no problem
  ld a,c
  call sqrt32sub_2

;On the next iteration, HL might temporarily overflow by 1 bit
  call sqrt32_iter15
;Execution falls through into sqrt32_iter16 for the last iteration.

;On the next iteration, HL is allowed to overflow, DE could overflow with our current routine, but it needs to be shifted right at the end, anyways
sqrt32_iter16:
;Final iteration, using A:HL as an extended remainder.
;Shift the last two input bits from A into HL; the bits leaving HL are
;gathered in the low bits of A by the two `rla`.
  add a,a
  adc hl,hl
  rla
  adc hl,hl
  rla
;AHL - (DE+DE+1)
  sbc hl,de \ sbc a,0
  inc e
  or a            ;drop any borrow from the first subtraction before the second
  sbc hl,de \ sbc a,0
  ret p           ;A non-negative: subtraction fit
;Went negative: add DE+1 and DE back, leaving the low root bit clear.
  add hl,de
  adc a,0
  dec e
  add hl,de
  adc a,0
  ret

sqrt32sub_2:
;Runs two iterations: `call +_` executes the iteration below once, and
;returning from it falls through into the same code again.
;min: 185cc
;max: 231cc
;avg: 208cc
  call +_

_:
;One iteration. A supplies the next two input bits, HL is the remainder,
;DE is twice the root found so far.
;min: 84cc
;max: 107cc
;avg: 95.5cc

  sll e \ rl d              ;DE = 2*DE+1 (`sll` is undocumented: shift left, set bit 0)
  add a,a \ adc hl,hl       ;shift two bits of A into HL
  add a,a \ adc hl,hl

  sbc hl,de                 ;trial subtraction
  inc e                     ;carry is unaffected by `inc`
  ret nc                    ;it fit: new root bit is 1
  dec e                     ;otherwise restore HL and clear the new bit
  add hl,de
  dec e
  ret

sqrt32_iter15:
;15th iteration: as the generic iteration, but the shifted remainder may
;carry out of HL, which is handled in sqrt32_iter15_br0.
;91+{8,0+{0,23}}
;min: 91cc
;max: 114cc
;avg: 100.75cc

  sll e \ rl d      ;sla e \ rl d \ inc e
  add a,a
  adc hl,hl
  add a,a
  adc hl,hl       ;This might overflow!
  jr c,sqrt32_iter15_br0
;
;No overflow: trial subtraction, restore on borrow.
  sbc hl,de
  inc e
  ret nc
  dec e
  add hl,de
  dec e
  ret
sqrt32_iter15_br0:
;Overflow: subtract unconditionally; `or a` clears the carry first.
  or a
  sbc hl,de
  inc e
  ret
;Print the size in bytes of everything assembled since the sqrt32 label.
.echo $-sqrt32

sqrtHL:
;16-bit integer square root, fully unrolled (8 trial subtractions).
;Input: HL
;returns A as the sqrt, HL as the remainder, D = 0
;Destroys: BC. The `.db $01` / `.db $06` bytes below turn the following
;instruction into the operand of `ld bc,**` / `ld b,*` so that it is skipped.
;min: 376cc
;max: 416cc
;avg: 393cc
;
;--- First four result bits: 8-bit arithmetic on the high byte, in A ---
  ld de,$5040     ;E = first trial value, D = second trial value if the first fits
  ld a,h
  sub e
  jr nc,+_
  add a,e         ;did not fit: restore A
  ld d,$10        ;and use the smaller second trial value
_:
  sub d
  jr nc,+_
  add a,d         ;did not fit: restore A and skip the `set`
  .db $01   ;start of ld bc,** which is 10cc to skip the next two bytes.
_:
  set 5,d
  res 4,d
  srl d

  set 2,d
  sub d
  jr nc,+_
  add a,d
  .db $01   ;start of ld bc,** which is 10cc to skip the next two bytes.
_:
  set 3,d
  res 2,d
  srl d

  inc d
  sub d
  jr nc,+_
  add a,d
  dec d   ;this resets the low bit of D, so `srl d` resets carry.
  .db $06   ;start of ld b,* which is 7cc to skip the next byte.
_:
  inc d
  srl d
  ld h,a          ;partial remainder goes back into H


;--- Last four result bits: 16-bit trial subtractions of DE from HL ---
;A mirrors E while the next trial value is built up.
  sbc hl,de
  ld a,e
  jr nc,+_
  add hl,de       ;did not fit: restore HL
_:
  ccf
  rra
  srl d
  rra
  ld e,a

  sbc hl,de
  jr nc,+_
  add hl,de
  .db $01   ;start of ld bc,** which is 10cc to skip the next two bytes.
_:
  or %00100000
  xor %00011000
  srl d
  rra
  ld e,a


  sbc hl,de
  jr nc,+_
  add hl,de
  .db $01   ;start of ld bc,** which is 10cc to skip the next two bytes.
_:
  or %00001000
  xor %00000110
  srl d
  rra
  ld e,a
  sbc hl,de
  jr nc,+_
;Last subtraction did not fit: restore HL and finish with the low bit clear.
  add hl,de
  srl d
  rra
  ret
_:
;Last subtraction fit: finish with the low bit set.
  inc a
  srl d
  rra
  ret
;Print the size in bytes of sqrtHL.
.echo $-sqrtHL

It does use the 16-bit square root routine here to take care of the first 16 bits

Combined, it is 194 bytes.
EDIT2: Forgot that sqrtHL didn't preserve BC, fixed that. Now it seems the last bit of the remainder might be broken, so I have to fix that

EDIT3: Fixed the bug in the bottom bit

I just needed to reset the carry flag before the second subtraction in the final iteration.

EDIT4: In a scenario where you don't have RAM for a stack, we can hardcode it! It even saves 54cc (but adds 20 bytes). 10cc of that 54cc is just due to not having an ending RET. I also switched input to HLIX instead of HLDE.
I reorganized the code so that it would be "obvious" that sqrt32 is an in-line routine, so I put it at the end (in practice, the subroutines would probably be toward the end of mem). The precomputed stack is inserted just before sqrt32.

Code: [Select]

sqrt32sub_2:
;Stackless variant: runs two iterations. The second one is started by the
;`ret` at the end of the first, which pops `return4` from the precomputed
;stack (sqrt32_stack).
;NOTE(review): `jp return4` jumps to the instruction immediately following it,
;so it looks redundant (10cc, 3 bytes); the timings below include it. Confirm
;before removing.
;min: 178cc
;max: 224cc
;avg: 201cc
  jp return4
return4:
;One iteration. A supplies the next two input bits, HL is the remainder,
;DE is twice the root found so far.
;min: 84cc
;max: 107cc
;avg: 95.5cc

  sll e \ rl d              ;DE = 2*DE+1 (`sll` is undocumented: shift left, set bit 0)
  add a,a \ adc hl,hl       ;shift two bits of A into HL
  add a,a \ adc hl,hl

  sbc hl,de                 ;trial subtraction
  inc e                     ;carry is unaffected by `inc`
  ret nc                    ;it fit: new root bit is 1
  dec e                     ;otherwise restore HL and clear the new bit
  add hl,de
  dec e
  ret

sqrt32_iter15:
;15th iteration: as the generic iteration, but the shifted remainder may
;carry out of HL, which is handled in sqrt32_iter15_br0.
;91+{8,0+{0,23}}
;min: 91cc
;max: 114cc
;avg: 100.75cc

  sll e \ rl d      ;sla e \ rl d \ inc e
  add a,a
  adc hl,hl
  add a,a
  adc hl,hl       ;This might overflow!
  jr c,sqrt32_iter15_br0
;
;No overflow: trial subtraction, restore on borrow.
  sbc hl,de
  inc e
  ret nc
  dec e
  add hl,de
  dec e
  ret
sqrt32_iter15_br0:
;Overflow: subtract unconditionally; `or a` clears the carry first.
  or a
  sbc hl,de
  inc e
  ret

sqrtHL:
;16-bit integer square root, fully unrolled (8 trial subtractions).
;Input: HL
;returns A as the sqrt, HL as the remainder, D = 0
;Destroys: BC. The `.db $01` / `.db $06` bytes below turn the following
;instruction into the operand of `ld bc,**` / `ld b,*` so that it is skipped.
;min: 376cc
;max: 416cc
;avg: 393cc
;
;--- First four result bits: 8-bit arithmetic on the high byte, in A ---
  ld de,$5040     ;E = first trial value, D = second trial value if the first fits
  ld a,h
  sub e
  jr nc,+_
  add a,e         ;did not fit: restore A
  ld d,$10        ;and use the smaller second trial value
_:
  sub d
  jr nc,+_
  add a,d         ;did not fit: restore A and skip the `set`
  .db $01   ;start of ld bc,** which is 10cc to skip the next two bytes.
_:
  set 5,d
  res 4,d
  srl d

  set 2,d
  sub d
  jr nc,+_
  add a,d
  .db $01   ;start of ld bc,** which is 10cc to skip the next two bytes.
_:
  set 3,d
  res 2,d
  srl d

  inc d
  sub d
  jr nc,+_
  add a,d
  dec d   ;this resets the low bit of D, so `srl d` resets carry.
  .db $06   ;start of ld b,* which is 7cc to skip the next byte.
_:
  inc d
  srl d
  ld h,a          ;partial remainder goes back into H


;--- Last four result bits: 16-bit trial subtractions of DE from HL ---
;A mirrors E while the next trial value is built up.
  sbc hl,de
  ld a,e
  jr nc,+_
  add hl,de       ;did not fit: restore HL
_:
  ccf
  rra
  srl d
  rra
  ld e,a

  sbc hl,de
  jr nc,+_
  add hl,de
  .db $01   ;start of ld bc,** which is 10cc to skip the next two bytes.
_:
  or %00100000
  xor %00011000
  srl d
  rra
  ld e,a


  sbc hl,de
  jr nc,+_
  add hl,de
  .db $01   ;start of ld bc,** which is 10cc to skip the next two bytes.
_:
  or %00001000
  xor %00000110
  srl d
  rra
  ld e,a
  sbc hl,de
  jr nc,+_
;Last subtraction did not fit: restore HL and finish with the low bit clear.
  add hl,de
  srl d
  rra
  ret
_:
;Last subtraction fit: finish with the low bit set.
  inc a
  srl d
  rra
  ret
;Precomputed "stack" of return addresses for the stackless sqrt32.
;sqrt32 points SP here; every `ret` in the subroutines then pops the next
;entry, so the table is consumed top to bottom in the order listed.
;The table is only read by `ret`, never written by this routine.
sqrt32_stack:
.dw return0   ;after sqrtHL
.dw return4   ;subroutine
.dw return1   ;after the first pair of iterations
.dw return4   ;subroutine
.dw return2   ;after the second pair of iterations
.dw return4   ;subroutine
.dw return3   ;after the third pair of iterations
.dw return5   ;after sqrt32_iter15
sqrt32_stack_end:




sqrt32:
;Stackless, in-line 32-bit integer square root. Control flow between the
;pieces is driven by the precomputed return addresses in sqrt32_stack, so
;subroutines are entered with `jp` and left with `ret`.
;Overwrites SP; the caller's stack pointer is not saved here.
;Uses the undocumented IXH/IXL half-registers.
;Input: HLIX
;Output: DE is the sqrt, AHL is the remainder
;min: 1203
;max: 1455
;avg: 1323.75
  ld sp,sqrt32_stack
  jp sqrtHL       ;its `ret` pops return0
return0:
;sqrtHL returned A = sqrt of the upper word, D = 0. Form DE = 2*A.
  add a,a
  ld e,a
  jr nc,+_
  inc d
_:

  ld a,ixh
  jp sqrt32sub_2  ;two iterations, then `ret` pops return1
return1:
  jp sqrt32sub_2  ;two more iterations, then `ret` pops return2
return2:
;Now we have four more iterations
;The first two are no problem
  ld a,ixl
  jp sqrt32sub_2  ;then `ret` pops return3
return3:
;On the next iteration, HL might temporarily overflow by 1 bit
  jp sqrt32_iter15  ;its `ret` pops return5
return5:
;Reached via the last entry of sqrt32_stack, after sqrt32_iter15 returns.

;On the next iteration, HL is allowed to overflow, DE could overflow with our current routine, but it needs to be shifted right at the end, anyways
sqrt32_iter16:
;Final iteration, using A:HL as an extended remainder. In-line: instead of
;returning, both paths continue at the anonymous label at the end.
  add a,a
  adc hl,hl
  rla
  adc hl,hl
  rla
;AHL - (DE+DE+1)
  sbc hl,de \ sbc a,0
  inc e
  or a            ;drop any borrow from the first subtraction before the second
  sbc hl,de \ sbc a,0
  jp p,+_         ;A non-negative: subtraction fit
;Went negative: add DE+1 and DE back, leaving the low root bit clear.
  add hl,de
  adc a,0
  dec e
  add hl,de
  adc a,0
_:
  ;...

212

Super Smash Bros. Open / Re: Selecting on the menu not working?

« on: March 22, 2019, 08:40:13 am »

According to Hayleia (the author), only the "smash" button works; all the others redirect back to the main menu. They have the source code here if you want to look at it. In particular, the SMASHC2 file has the main menu code.

213

ASM / Re: [z80] 32 bit by 16 bits division and 32 bit square root

« on: March 20, 2019, 08:58:40 am »

Well, why not just do ex de,hl first and make the first 8 iterations add hl,hl, and then another ex de,hl and the final iterations add hl,de?

In the version I'm working on, I have 3 groups of 4 iterations (shifting 1 byte) and then three iterations and then 1 special case. Something is wrong with it though, so I have to work more on it.

214

ASM / Re: [z80] 32 bit by 16 bits division and 32 bit square root

« on: March 19, 2019, 05:56:10 pm »

The remainder is no more than 2sqrt(x), so in this case at most 0x1FFFE, but on the 15th iteration it is at most 0xFFFE

215

ASM / Re: [z80] 32 bit by 16 bits division and 32 bit square root

« on: March 19, 2019, 04:58:24 pm »

I haven't had much luck with this one, unfortunately. If you are open to using more stack space or external memory, you can take advantage of first working with 8-bit values, then 16-bit, then 32-bit.

Oh! I was just about to post and you posted. That looks like a much cleaner routine!

EDIT: One thing though: In your routine, A is 0 until the last iteration, so you can make the last iteration a special case and speed up the inner loop.

216

General Calculator Help / Re: TI-84 not receiving OS [Resolved]

« on: March 19, 2019, 10:45:15 am »

Holy heck, I'm glad it worked out! A few weeks ago my nspire was taking a loooong time to validate-- I even kept it plugged in over night and it still wasn't validated. I knew validation could take a while, but that seemed excessive, so I unplugged it, went to work and let it die, then when I came back home, I plugged it in turned it on and it validated in like two minutes. OS validation can be annoying; we just need to crack the certificates and disable it entirely.

217

Grammer / Re: Grammer 2-The APP

« on: March 19, 2019, 10:41:10 am »

I added the ▶Nom( command and it is actually implemented as a code block like this:

Code: [Select]

:▶Nom(x,y,z
:<<do stuff>>
:End

What it does is push the list of variables to the stack and, when its End is reached, pop them back off.

My original plan was to make this command useful in subroutines, where when the subroutine's End was reached, it would restore the variables and then exit. However, that was hackish in a future-migraine way, and in implementing it, I realized this routine could be much more useful as an explicit block.

While coding this, I realized I needed to add ▶Nom( to the internal list of block tokens and found a long standing bug in the routine that finds a block's matching End token-- it wasn't discarding the second byte of a two-byte token. This means, for example, that if a two-byte token's second byte was, say, $D4, it would be interpreted as an End token ! Fortunately, this would only happen in hacked variables (not useful in Grammer) and the (potentially useful) character tokens for ~, @, #, $, and &.

Anyway, here is v2.50.4.0

218

Grammer / Re: Grammer 2-The APP

« on: March 18, 2019, 01:27:11 am »

Grammer now has some basic stack support and easier parameter parsing!

To accomplish the stack routines, Grammer keeps track of three values: The stack base, the stack top, and the stack pointer. Grammer can detect overflows in either direction. I decided to make these actual variables that you can interface with via the Pmt_Bgn and Pmt_End tokens (found at the end of the Finance app). When storing to Pmt_Bgn, the stack pointer is automatically reset. Currently, the stack defaults to saveSScreen+256 and ends at saveSScreen+768, so you have 512 bytes to work with. If you want to relocate the stack, you may. For example, you could create a temp var of 10000 bytes and have a giant stack there !

To push, use Param' followed by an arbitrary number of parameters. To pop, use Param° followed by an arbitrary number of parameters that must be variable names. For example, to swap A and B, you could do:

Code: [Select]

Param'A,B
Param°A,B

The unmodified Param token also takes variable names as its input. A while back, I added the ability for subroutines to be called with arguments, kinda. In order to parse those arguments, you had to do something like:

Code: [Select]

]?→A
]?→B
]?→C

But now you can do:

Code: [Select]

ParamA,B,C

I plan to add the ▶Nom( token to save variables and then restore them at the end of a subroutine. It is basically just an automated sequence of pushes and pops.

219

ASM / Re: [z80] 32 bit by 16 bits division and 32 bit square root

« on: March 17, 2019, 10:30:45 pm »

Code: [Select]

[quote author=fghsgh link=topic=18691.msg406921#msg406921 date=1552874767]
My trick: pen & paper & sleep
[/quote]
Hey, that's the trick I use! I should clarify: without motivation, I am lazy.

[quote author=fghsgh link=topic=18691.msg406921#msg406921 date=1552874767]
This is not the first time someone has recommended SPASM to me. It is, however, the first time someone provided me with a Github link and I thought it was only available for Windows until now. It would also mean that I have to transform all my previous programs (or at least include files) to this syntax.
[/quote]

Glad I could help! I think with spasm you can do such things as #define TASM or something like that and it'll allow `equ` instead of `=` and whatnot.

Quote from: fghsgh on March 17, 2019, 10:06:07 pm

I don't think that's too hard in this case: just djnz 8 times

True (except not using djnz since that would use B !), but you would have to call that routine which uses a stack anyways

I imagine you mean a very limited stack as opposed to no stack whatsoever?

Quote

I will try to make the sqrt routine asap, if I don't forget. It'll probably be for next weekend.

Good luck! The pseudocode outlined here has yielded me better results than the algorithm I learned as a kid. It's the same algorithm, just structured better for programming instead of computing by hand.

220

TI Z80 / Re: [BASIC] bigpi- Arbitrary Precision Pi Calculator

« on: March 17, 2019, 09:02:45 pm »

Maybe in 96 years I'll upload a more efficient version

221

Introduce Yourself! / Re: Hello again?

« on: March 17, 2019, 08:52:05 pm »

Oh my gosh here, have some

Welcome to Omni!

222

ASM / Re: [z80] 32 bit by 16 bits division and 32 bit square root

« on: March 17, 2019, 08:47:07 pm »

Quote from: fghsgh on March 17, 2019, 08:02:08 pm

I wonder, if you can make my routine twice as fast and if you've written complete support for floats, why didn't you write this routine in the first place?

It was just because my float routines were special-purpose and writing a general-purpose routine was a daunting idea (which is why I'm grateful for your work). For the float routines, I can always guarantee the top 16 bits are smaller than BC, resulting in a routine that is under 900cc. That said, I should update my above routine with a trick I came up with for those (it reduces the problem so that BC is always less than 32768, correcting later so that there is no overflow to check for).

Quote from: fghsgh on March 17, 2019, 08:02:08 pm

Also, I was having trouble with getting INC/DEC IXL to work for some reason. Haven't had the time to investigate that though. It could be that I just typed in the wrong opcode because I have a bad assembler which doesn't support it apparently and it's too much of a hassle to get a new one on Linux.

I use SPASM-ng and that has been great for me.

Quote from: fghsgh on March 17, 2019, 08:02:08 pm

As for your optimizations, I could follow along until you got those subroutines in. (EDIT: I think I got it now) And that might also be a problem if you have limited stack space or if you can't use the stack at all (in which case the routine would be inline). I tend to put RAM page 02 into bank C and completely fill it up with data, not leaving any place for a stack.

Hmm, I'll keep this in mind if I work more on these.

Quote from: fghsgh on March 17, 2019, 08:02:08 pm

Another, easy way of doing this would be by thinking of the 32 and 16 bit numbers as 4&2-digit base-256 numbers and using a routine for smaller numbers to do the actual division.

I do this in the extended-precision float routines, and it requires two multiplications (one multiply if you don't need the remainder), but could come out faster. I might pursue this, too. In my float routines, because they were special-purpose, this method didn't pay off until the 32/32 divisions and up. However, general-purpose 32/16 might benefit.

223

ASM / Re: [z80] 32 bit by 16 bits division and 32 bit square root

« on: March 17, 2019, 06:58:37 pm »

I am making a double-post because this post is very distinct from the other.
Before I proceed, the routine doesn't take into account that the `adc hl,hl` could overflow on the 17th iteration and beyond if BC>32768. For example, 0x80000000/0x8001 will leave HL as 8000h on the 16th iteration, and then 0x0000 on the 17th.

Before we fix that, notice that you are using two `ex de,hl` each iteration, but here is a more efficient way:

Code: [Select]

div32_16:
;32-bit by 16-bit restoring division, one quotient bit per pass, 32 passes.
;The dividend is taken from HL:IX (HL is moved into DE first), the divisor
;from BC. DE:IX is shifted left through HL, which accumulates the remainder;
;quotient bits are dropped into bit 0 of IXL as they are found.
;On return the quotient is in HL:IX and the remainder in DE.
;A carry out of `adc hl,hl` is not handled here.
;Numbers in the right-hand column are clock cycles.
 ex de,hl                    ; 4   high dividend word -> DE
 ld hl,0                     ; 10  remainder = 0
 ld a,32                     ; 7   pass counter
div32_16_next_bit:
 add ix,ix                   ; 15  shift DE:IX left by one ...
 rl e                        ; 8
 rl d                        ; 8
 adc hl,hl                   ; 15  ... into the remainder
 sbc hl,bc                   ; 15  trial subtraction
 inc ixl                     ; 8   assume quotient bit 1 (carry preserved)
 jr nc,div32_16_bit_done     ; 12/7
  add hl,bc                  ; 11  too big: restore remainder
  dec ixl                    ; 8   and make the quotient bit 0
div32_16_bit_done:
 dec a                       ; 4
 jr nz,div32_16_next_bit     ; 12/7
 ex de,hl                    ; 4   HL = high quotient word, DE = remainder
 ret                         ; 10

This ends up adding 2 bytes and 8cc to the overhead code, but saves 7cc per iteration, for a net savings of 216cc.

But if we replace those `rl e` with `rla`, we'll save 4cc per iteration.

Code: [Select]

div32_16:
;32-bit by 16-bit restoring division, one quotient bit per pass, 32 passes.
;Same scheme as the plain loop, but the byte that would be in E is kept in A
;so it can be shifted with the cheaper `rla`; E is used as the pass counter.
;The dividend is taken from HL:IX (HL is moved into DE first), the divisor
;from BC. On return the quotient is in HL:IX and the remainder in DE.
;A carry out of `adc hl,hl` is not handled here.
;Numbers in the right-hand column are clock cycles.
 ex de,hl   ; 4   high dividend word -> DE
 ld hl,0    ; 10  remainder = 0
 ld a,e     ; 4   A stands in for E during the loop
 ld e,32    ; 7   pass counter
div32_16loop:
 add ix,ix  ; 15  shift D:A:IX left by one ...
 rla        ; 4
 rl d       ; 8
 adc hl,hl  ; 15  ... into the remainder
 sbc hl,bc  ; 15  trial subtraction
 inc ixl    ; 8   assume quotient bit 1 (carry preserved)
 jr nc,cansub  ; 12/7
  add hl,bc ; 11  too big: restore remainder
  dec ixl   ; 8   and make the quotient bit 0
cansub:
 dec e      ; 4
 jr nz,div32_16loop ; 12/7
;Quotient bits 16-23 are in A, and E is 0 (it was the counter). Copy A back
;to E, otherwise that quotient byte is lost in the exchange below.
 ld e,a     ; 4
 ex de,hl   ; 4   HL = high quotient word, DE = remainder
 ret        ; 10

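This is actually a cheap optimization. It adds no extra bytes, but adds 4cc to overhead and saves 4cc per iteration, for a net savings of 124cc! (Note that the quotient byte held in A during the loop has to be copied back with `ld e,a` before the final `ex de,hl`; counting that instruction, the change costs one byte and 8cc of overhead, for a net savings of 120cc.)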

But in a similar vein, we can just split up the code into 4 parts with a common subroutine, reducing the shifting work:

Code: [Select]

div32_16:
;32-bit by 16-bit division, processed one dividend byte at a time.
;Each byte is loaded into A, run through 8 passes of div32_16_sub8 (which
;shifts dividend bits out of A and quotient bits in), then stored back.
;The dividend is taken from HL:IX (HL is moved into DE first), the divisor
;from BC. On return the quotient is in HL:IX and the remainder in DE.
;136+4*div32_16_sub8
;min: 2180cc
;max: 2788cc
;avg: 2484cc
;49 bytes
  ex de,hl   ; 4
  ld hl,0    ; 10

  ld a,d              ; 4
  call div32_16_sub8  ; 17
  ld d,a              ; 4

  ld a,e              ; 4
  call div32_16_sub8  ; 17
  ld e,a              ; 4

  ld a,ixh            ; 8
  call div32_16_sub8  ; 17
  ld ixh,a            ; 8

  ld a,ixl            ; 8
  call div32_16_sub8  ; 17
  ld ixl,a            ; 8

  ex de,hl   ; 4
  ret        ; 10

div32_16_sub8:
;Runs div32_16_sub eight times. Each `call` targets the code right after
;itself, which then runs a second time by fall-through after the return, so
;the three levels give 2*2*2 = 8 executions.
;119+8*div32_16_sub
;min: 511cc
;max: 663cc
;avg: 587cc
  call +_
_:
;17+2(17+2(div32_16_sub)))
  call +_
_:
;17+2(div32_16_sub)
  call div32_16_sub
div32_16_sub:
;One division step: shift the top bit of A into the remainder HL, try to
;subtract the divisor BC, and record the quotient bit in bit 0 of A.
;Does not handle a carry out of `adc hl,hl`.
;min: 49cc
;max: 68cc
;avg: 58.5cc
 add a,a    ; 4
 adc hl,hl  ; 15
 sbc hl,bc  ; 15
 inc a      ; 4   quotient bit = 1; `inc` leaves carry untouched
 ret nc     ;11/5
 add hl,bc  ; 11  did not fit: restore the remainder
 dec a      ; 4   quotient bit = 0
 ret        ; 10

The above routine works when BC<=0x8000, but to extend it, we'll need to take care of the overflow and that will slow it down:

Code: [Select]

div32_16:
;32-bit by 16-bit division, processed one dividend byte at a time.
;Each byte is loaded into A, run through 8 passes of div32_16_sub8, then
;stored back with the quotient bits in it.
;The dividend is taken from HL:IX (HL is moved into DE first), the divisor
;from BC. On return the quotient is in HL:IX and the remainder in DE.
;136+4*div32_16_sub8
;min: 2340cc
;max: 3012cc
;avg: 2676cc
;55 bytes
  ex de,hl   ; 4
  ld hl,0    ; 10

  ld a,d              ; 4
  call div32_16_sub8  ; 17
  ld d,a              ; 4

  ld a,e              ; 4
  call div32_16_sub8  ; 17
  ld e,a              ; 4

  ld a,ixh            ; 8
  call div32_16_sub8  ; 17
  ld ixh,a            ; 8

  ld a,ixl            ; 8
  call div32_16_sub8  ; 17
  ld ixl,a            ; 8

  ex de,hl   ; 4
  ret        ; 10

div32_16_sub8:
;Runs div32_16_sub eight times. Each `call` targets the code right after
;itself, which then runs a second time by fall-through after the return.
;119+8*div32_16_sub
;min: 551cc
;max: 719cc
;avg: 635cc
  call +_
_:
;17+2(17+2(div32_16_sub)))
  call +_
_:
;17+2(div32_16_sub)
  call div32_16_sub
div32_16_sub:
;One division step with overflow handling: shift the top bit of A into the
;remainder HL, subtract the divisor BC if it fits, quotient bit in bit 0 of A.
;54+{0,2+{0,19}}
;min: 54cc
;max: 75cc
;avg: 64.5cc
  add a,a    ; 4
  inc a      ; 4   assume quotient bit 1; `inc` leaves the carry from `add a,a` intact
  adc hl,hl  ; 15
  jr c,+_    ;12/7 remainder overflowed 16 bits: it certainly fits
  sbc hl,bc  ; 15
  ret nc     ;11/5
  add hl,bc  ; 11  did not fit: restore the remainder
  dec a      ; 4   quotient bit = 0
  ret        ; 10
_:
  or a       ; 4   clear the carry so `sbc` subtracts exactly BC
  sbc hl,bc  ; 15
  ret        ; 10

This adds 6 bytes, and 192cc on average, so it's not too bad.

Now we move in with your idea to negate BC upon entry to improve some cases. That allows some other optimizations too, like not needing to reset the carry flag in the last case of div32_16_sub:

Code: [Select]

div32_16:
;32-bit by 16-bit division, one dividend byte at a time, with the divisor
;negated up front so the inner step can use `add hl,bc` for the trial.
;BC is replaced by its two's complement and is not restored here.
;HLIX/BC -> HLIX remainder DE
;158+4*div32_16_sub8
;min: 2298cc
;max: 3034cc
;avg: 2546cc
;59 bytes
  ex de,hl   ; 4

; Negate BC to allow add instead of sbc
  xor a      ; 4
; Need to set HL to 0 anyways, so save 2cc and a byte
  ld h,a     ; 4
  ld l,a     ; 4
  sub c      ; 4   A = -C, carry set if C was non-zero
  ld c,a     ; 4
  sbc a,a    ; 4   A = 0 or $FF according to that borrow
  sub b      ; 4
  ld b,a     ; 4   BC = -BC


  ld a,d              ; 4
  call div32_16_sub8  ; 17
  ld d,a              ; 4

  ld a,e              ; 4
  call div32_16_sub8  ; 17
  ld e,a              ; 4

  ld a,ixh            ; 8
  call div32_16_sub8  ; 17
  ld ixh,a            ; 8

  ld a,ixl            ; 8
  call div32_16_sub8  ; 17
  ld ixl,a            ; 8

  ex de,hl   ; 4
  ret        ; 10

div32_16_sub8:
;Runs div32_16_sub eight times. Each `call` targets the code right after
;itself, which then runs a second time by fall-through after the return, so
;the three levels give 2*2*2 = 8 executions.
;119+8*div32_16_sub
;min: 535cc
;max: 719cc
;avg: 597cc
  call +_
_:
;17+2(17+2(div32_16_sub)))
  call +_
_:
;17+2(div32_16_sub)
  call div32_16_sub
div32_16_sub:
;One division step for a divisor that has already been negated into BC:
;shift the top bit of A into the remainder HL, add BC as the trial
;subtraction, and record the quotient bit in bit 0 of A.
;(The timing line below was missing its leading `;` and would not assemble.)
;52+{4,0+{0,23}}
;min: 52cc
;max: 75cc
;avg: 59.75cc
  add a,a    ; 4
  inc a      ; 4   assume quotient bit 1; `inc` leaves the carry from `add a,a` intact
  adc hl,hl  ; 15
  jr c,+_    ;12/7 remainder overflowed 16 bits: it certainly fits
  add hl,bc  ; 11  trial: HL + (-divisor); carry out means it fit
  ret c      ;11/5
  sbc hl,bc  ; 15  did not fit: undo (carry is clear here, so exactly BC is removed)
  dec a      ; 4   quotient bit = 0
  ret        ; 10
_:
  add hl,bc  ; 11  overflow case: no carry to clear before an `add`
  ret        ; 10

This adds a net of 4 bytes. It adds 22cc to the worst case, saves 42cc in the best case, but manages to bring the average case down by 130cc!

But now let's take advantage of the output flags in the sub routine. It turns out that except for that last case, the output is carry flag reset if the bit shifted in should be 0, and carry set if the bit should be 1.

Code: [Select]

div32_16:
;32-bit by 16-bit division, one dividend byte at a time, with the divisor
;negated up front. In this version div32_16_sub returns each quotient bit in
;the carry flag; it is shifted into A by the next `rla`, so one extra `rla`
;after each group of eight collects the last bit.
;BC is replaced by its two's complement and is not restored here.
;HLIX/BC -> HLIX remainder DE
;174+4*div32_16_sub8
;min: 2186cc
;max: 2794cc
;avg: 2466cc
;61 bytes
  ex de,hl   ; 4

; Negate BC to allow add instead of sbc
  xor a      ; 4
; Need to set HL to 0 anyways, so save 2cc and a byte
  ld h,a     ; 4
  ld l,a     ; 4
  sub c      ; 4   A = -C, carry set if C was non-zero
  ld c,a     ; 4
  sbc a,a    ; 4   A = 0 or $FF according to that borrow
  sub b      ; 4
  ld b,a     ; 4   BC = -BC


  ld a,d              ; 4
  call div32_16_sub8  ; 17
  rla                 ; 4   shift in the eighth quotient bit
  ld d,a              ; 4

  ld a,e              ; 4
  call div32_16_sub8  ; 17
  rla                 ; 4
  ld e,a              ; 4

  ld a,ixh            ; 8
  call div32_16_sub8  ; 17
  rla                 ; 4
  ld ixh,a            ; 8

  ld a,ixl            ; 8
  call div32_16_sub8  ; 17
  rla                 ; 4
  ld ixl,a            ; 8

  ex de,hl   ; 4
  ret        ; 10

div32_16_sub8:
;Runs div32_16_sub eight times. Each `call` targets the code right after
;itself, which then runs a second time by fall-through after the return.
;119+8*div32_16_sub
;min: 503cc
;max: 655cc
;avg: 573cc
  call +_
_:
;17+2(17+2(div32_16_sub)))
  call +_
_:
;17+2(div32_16_sub)
  call div32_16_sub
div32_16_sub:
;One division step for a divisor that has already been negated into BC.
;The quotient bit is returned in the carry flag (set = 1); the `rla` at the
;top shifts the previous step's bit into A while shifting the next dividend
;bit out.
;48+{8,0+{0,19}}
;min: 48cc
;max: 67cc
;avg: 56.75cc
  rla        ; 4
  adc hl,hl  ; 15
  jr c,+_    ;12/7 remainder overflowed 16 bits: it certainly fits
  add hl,bc  ; 11  trial: HL + (-divisor); carry out means it fit
  ret c      ;11/5
  sbc hl,bc  ; 15  did not fit: undo the trial
  ret        ; 10
_:
  add hl,bc  ; 11
  scf        ; 4   force the quotient bit to 1 for the overflow case
  ret        ; 10

This costs two extra bytes, but saves 112cc in the best case, 240cc in the worst case, and 80cc on average !

224

General Calculator Help / Re: TI-84 not receiving OS

« on: March 17, 2019, 04:36:53 pm »

Oof, if flash failed then something might really be wrong with the hardware, but I'm not an expert in that area.

225

ASM / Re: [z80] 32 bit by 16 bits division and 32 bit square root

« on: March 17, 2019, 04:22:38 pm »

Okay, when I get home I might have a chance to optimize this more. And yes, that's a good place for it. I have been working on-and-off on that page for a few months to revamp it. If it is locked due to a pending draft, let me know and I'll move the draft to another page altogether.

Also, this might be useful: https://github.com/Zeda/z80float/tree/master/extended/sqrt
The sqrt32 and sqrt64 are special-purpose routines (the top two bits shouldn't both be 0), but it can give you an idea. Also the division folder might inspire optimizations in this routine. It has a 32/16 routine but it is also special-purpose (top bit of denominator is always set).

Pages: 1 ... 13 14 [15] 16 17 ... 317