Show Posts - Xeda112358

Profile Info
- Summary
- Show Stats
- Show Posts...

Show Posts

This section allows you to view all posts made by this member. Note that you can only see posts made in areas you currently have access to.

Messages - Xeda112358

Pages: 1 ... 9 10 [11] 12 13 ... 317

151

ASM / Re: ASM Optimized routines

« on: August 28, 2019, 10:11:55 am »

Quote from: SpiroH on August 28, 2019, 10:07:05 am

@Zeda: Nice performance aware exercise! (Except for the many push and pop which look a bit dated to me).
I wonder how many are really interested in speeding up the calculations these days.
It seems all they care about is python, java and what-have-you-funky-high-level-language .

Thanks

I wrote these with apps in mind, so I tried to reduce the need for external RAM. I should definitely make versions that take full advantage of SMC, though!

152

ASM / Re: ASM Optimized routines

« on: August 28, 2019, 09:33:29 am »

Here is a masked sprite routine (no clipping)! Interleave the data with the mask, where the MASK is ANDed with the buffer, and the data is ORed on top of that:

;Masked Sprite routine
putsprite_masked:
;Inputs:
;   (A,L) = (x,y)
;   B is height
;   IX points to the sprite data
;       first byte is the data
;       second byte is mask
;       continues, alternating like this.
;
;Outputs:
;   Mask is ANDed to the buffer, then data is ORed on top of that.
;
;Destroys:
;   AF, BC, DE, HL, IX
;
;Notes:
;   To set a pixel...
;     black: mask is any, data is 1
;     white: mask is 0, data is 0
;     clear: mask is 1, data is 0 (keeps the data from the buffer)
;
;This routine is free to use :)
;65 bytes (or 66 bytes if gbuf is not located at 0x**40

  ld e,l
  ld h,0
  ld d,h
  add hl,hl
  add hl,de
  add hl,hl
  add hl,hl
  ld e,a
  and 7
  ld c,a
  xor e  ;essentially gets E with the bottom 3 bits reset
#if (plotSScreen&255) = 64
  inc a
  rra
  rra
  rra
  ld e,a
  ld d,plotSScreen>>8
#else
  rra
  rra
  rra
  ld e,a
  add hl,de
  ld de,plotSScreen
#endif
  add hl,de


putsprite_masked_loop:
  push bc
  xor a
  ld d,(ix)
  ld e,a
  sub c
  ld b,c
  ld c,$FF
  inc ix
  ld a,(ix)
  jr z,putsprite_masked_rotdone
putsprite_masked_rot:
  scf
  rra
  rr c
  srl d
  rr e
  djnz putsprite_masked_rot
putsprite_masked_rotdone:
  and (hl)
  or d
  ld (hl),a
  inc hl
  ld a,(hl)
  and c
  or e
  ld (hl),a
  ld c,11
  add hl,bc
  inc ix
  pop bc
  djnz putsprite_masked_loop
  ret

But if you want even faster and smaller, use a non-traditional mask technique by ORing the mask onto the buffer, then XORing the data on top of it. The format is less intuitive, but it allows for white/black/clear/invert instead of just white/black/clear:

;Masked Sprite routine
putsprite_masked:
;Inputs:
;   (A,L) = (x,y)
;   B is height
;   IX points to the sprite data
;       first byte is the data
;       second byte is mask
;       continues, alternating like this.
;
;Outputs:
;   Mask is ORed to the buffer, then data is XORed on top of that.
;
;Destroys:
;   AF, BC, DE, HL, IX
;
;Notes:
;   To set a pixel...
;     black: mask is 1, data is 0
;     white: mask is 1, data is 1
;     clear: mask is 0, data is 0 (keeps the data from the buffer)
;     invert: mask is 0, data is 1 (inverts the data from the buffer)
;
;This routine is free to use :)
;63 bytes (or 64 bytes if gbuf is not located at 0x**40

  ld e,l
  ld h,0
  ld d,h
  add hl,hl
  add hl,de
  add hl,hl
  add hl,hl
  ld e,a
  and 7
  ld c,a
  xor e  ;essentially gets E with the bottom 3 bits reset
#if (plotSScreen&255) = 64
  inc a
  rra
  rra
  rra
  ld e,a
  ld d,plotSScreen>>8
#else
  rra
  rra
  rra
  ld e,a
  add hl,de
  ld de,plotSScreen
#endif
  add hl,de

putsprite_masked_loop:
  push bc
  xor a
  ld d,(ix)
  ld e,a
  or c
  ld b,c
  ld c,e
  inc ix
  ld a,(ix)
  jr z,putsprite_masked_rotdone
putsprite_masked_rot:
  rra
  rr c
  srl d
  rr e
  djnz putsprite_masked_rot
putsprite_masked_rotdone:
  or (hl)
  xor d
  ld (hl),a
  inc hl
  ld a,(hl)
  or c
  xor e
  ld (hl),a
  ld c,11
  add hl,bc
  inc ix
  pop bc
  djnz putsprite_masked_loop
  ret

I also made some "bigsprite" routines! These do have clipping, too. First, they use some common subroutines for computing masks and performing most of the clipping and shifting:

;133 bytes total

;This is made by Zeda, feel free to use it for whatever.
;Takes inputs for a big sprite and sets up masks and clipping
;requires 4 bytes of temporary RAM, but doesn't use SMC


spritetmp = 8000h     ;relocate this as needed! Just need 4 bytes.
sprite_width  = spritetmp+0
sprite_x      = spritetmp+1
sprite_mask0  = spritetmp+2
sprite_mask1  = spritetmp+3

bigsprite_subroutine:
;Inputs:
;     B is the X-coordinate
;     C is the Y-Coordinate
;     DE points to the sprite
;     H is the height
;     L is the width in bytes
;Outputs:
;   carry flag is set if okay to draw, nc if out-of-bounds.
;   B is height.
;   C is width.
;   HL points to the byte to start drawing at.
;   DE points to where to start sourcing the sprite data
;   (sprite_width) is the width of the sprite in bytes
;   (sprite_x) is the intitial x coordinate to begin drawing at
;   (sprite_mask0) is the left mask
;   (sprite_mask1) is the right mask
;92 bytes

;First check if the sprite is on-screen in the horizontal direction
  ld a,c
  cp 64
  jr c,+_
  add a,h
  ret nc
  ld h,a
  push hl
  xor a
  ld h,a
  sub c
  ex de,hl
  add hl,de
  dec a
  jr nz,$-2
  ex de,hl
  pop hl
  xor a
  ld c,a
_:
;Next check h+c<=64
  ld a,64
  sub c
  cp h
  jr nc,+_
  ld h,a
_:

;Make sure the height is not now 0
  ld a,h
  or a
  ret z

;Save the width and height of the sprite
  push hl               ;height,width
  ld h,b
  ld (sprite_width),hl  ;x,width
  push de               ;sprite pointer

;Set up a pointer to the routine for shifting the  routine for shifting the sprite data
  ld ixh,rshiftHA_7>>8
  ld a,h
  cpl
  and 7
  ld l,a
  add a,a
  add a,l
  add a,rshiftHA_7&255
  ld ixl,a
#if (rshiftHA_7&255)>234
  jr nc,$+4
  inc ixh
#endif

  ld a,b
  and 7
  ld de,spritemask
  add a,e
  ld e,a
#if spritemask&255>248
  jr nc,$+3
  inc d
#endif
  ld a,(de)
  ld (sprite_mask0),a
  cpl
  ld (sprite_mask1),a
;
;
  ld a,c
  add a,a
  sbc a,a
  ld h,a
  ld a,b
  ld b,h
  ld l,c
  add hl,hl
  add hl,bc
  add hl,hl
  add hl,hl
  ld c,a
  add a,a
  sbc a,a
  ld b,a
  ld a,c
  sra c
  sra c
  sra c
  add hl,bc
  ld bc,plotSScreen
  add hl,bc

  pop de
  pop bc
  ;B is height
  ;C is width
  ex de,hl
  scf
  ret

rshiftHA_7:
  rr h \ rra
  rr h \ rra
  rr h \ rra
  rr h \ rra
  rr h \ rra
  rr h \ rra
  rr h \ rra
  ex de,hl
  ld e,a
  ret

spritemask:
  .db $00,$80,$C0,$E0,$F0,$F8,$FC,$FE
call_ix:
  jp (ix)

Then you can draw a big sprite with OR logic:

bigsprite_OR:
;Inputs:
;     B is the X-coordinate
;     C is the Y-Coordinate
;     DE points to the sprite
;     H is the height
;     L is the width in bytes
;68 bytes

;Set up the clipping
  call bigsprite_subroutine
  ret nc

bigsprite_OR_loop:
  push bc   ;height,width
  push de   ;gbuf ptr
  push hl   ;sprite data pointer
  ld a,(sprite_x)
  ld c,a
  add a,8
  ld (sprite_x),a

spriteloop_OR:
  push bc
  push hl
  ld h,(hl)
  xor a
  call call_ix
  ld a,c
  cp 96
  jr nc,+_
  ld a,(hl)
  or d
  ld (hl),a
  ld a,c
_:
  inc hl
  add a,8
  cp 96
  jr nc,+_
  ld a,(sprite_mask1)
  ld a,(hl)
  or e
  ld (hl),a
_:
  ld bc,11
  add hl,bc
  ex de,hl
  pop hl
  ld a,(sprite_width)
  ld c,a
  add hl,bc
  pop bc
  djnz spriteloop_OR


  pop hl
  inc hl
  pop de
  inc de
  pop bc
  dec c
  jr nz,bigsprite_OR_loop
  ret

Or draw with XOR logic:

bigsprite_XOR:
;Inputs:
;     B is the X-coordinate
;     C is the Y-Coordinate
;     DE points to the sprite
;     H is the height
;     L is the width in bytes
;68 bytes

;Set up the clipping
  call bigsprite_subroutine
  ret nc

bigsprite_XOR_loop:
  push bc   ;height,width
  push de   ;gbuf ptr
  push hl   ;sprite data pointer
  ld a,(sprite_x)
  ld c,a
  add a,8
  ld (sprite_x),a

spriteloop_XOR:
  push bc
  push hl
  ld h,(hl)
  xor a
  call call_ix
  ld a,c
  cp 96
  jr nc,+_
  ld a,(hl)
  xor d
  ld (hl),a
  ld a,c
_:
  inc hl
  add a,8
  cp 96
  jr nc,+_
  ld a,(sprite_mask1)
  ld a,(hl)
  xor e
  ld (hl),a
_:
  ld bc,11
  add hl,bc
  ex de,hl
  pop hl
  ld a,(sprite_width)
  ld c,a
  add hl,bc
  pop bc
  djnz spriteloop_XOR


  pop hl
  inc hl
  pop de
  inc de
  pop bc
  dec c
  jr nz,bigsprite_XOR_loop
  ret

Or draw with AND logic:

bigsprite_AND:
;Inputs:
;     B is the X-coordinate
;     C is the Y-Coordinate
;     DE points to the sprite
;     H is the height
;     L is the width in bytes
;69 bytes

;Set up the clipping
  call bigsprite_subroutine
  ret nc

bigsprite_AND_loop:
  push bc   ;height,width
  push de   ;gbuf ptr
  push hl   ;sprite data pointer
  ld a,(sprite_x)
  ld c,a
  add a,8
  ld (sprite_x),a

spriteloop_AND:
  push bc
  push hl
  ld h,(hl)
  scf \ sbc a,a
  call call_ix
  ld a,c
  cp 96
  jr nc,+_
  ld a,(hl)
  and d
  ld (hl),a
  ld a,c
_:
  inc hl
  add a,8
  cp 96
  jr nc,+_
  ld a,(sprite_mask1)
  ld a,(hl)
  and e
  ld (hl),a
_:
  ld bc,11
  add hl,bc
  ex de,hl
  pop hl
  ld a,(sprite_width)
  ld c,a
  add hl,bc
  pop bc
  djnz spriteloop_AND


  pop hl
  inc hl
  pop de
  inc de
  pop bc
  dec c
  jr nz,bigsprite_AND_loop
  ret

Or draw with Erase logic:

bigsprite_Erase:
;Inputs:
;     B is the X-coordinate
;     C is the Y-Coordinate
;     DE points to the sprite
;     H is the height
;     L is the width in bytes
;67 bytes

;Set up the clipping
  call bigsprite_subroutine
  ret nc

bigsprite_Erase_loop:
  push bc   ;height,width
  push de   ;gbuf ptr
  push hl   ;sprite data pointer
  ld a,(sprite_x)
  ld c,a
  add a,8
  ld (sprite_x),a

spriteloop_Erase:
  push bc
  push hl
  ld h,(hl)
  xor a
  call call_ix
  ld a,c
  cp 96
  jr nc,+_
  ld a,d
  cpl
  and (hl)
  ld (hl),a
  ld a,c
_:
  inc hl
  add a,8
  cp 96
  jr nc,+_
  ld a,e
  cpl
  and (hl)
  ld (hl),a
_:
  ld bc,11
  add hl,bc
  ex de,hl
  pop hl
  ld a,(sprite_width)
  ld c,a
  add hl,bc
  pop bc
  djnz spriteloop_Erase

  pop hl
  inc hl
  pop de
  inc de
  pop bc
  dec c
  jr nz,bigsprite_Erase_loop
  ret

Or draw with Overwrite logic:

bigsprite_Overwrite:
;Inputs:
;     B is the X-coordinate
;     C is the Y-Coordinate
;     DE points to the sprite
;     H is the height
;     L is the width in bytes
;71 bytes

;Set up the clipping
  call bigsprite_subroutine
  ret nc

bigsprite_Overwrite_loop:
  push bc   ;height,width
  push de   ;gbuf ptr
  push hl   ;sprite data pointer
  ld a,(sprite_x)
  ld c,a
  add a,8
  ld (sprite_x),a

spriteloop_Overwrite:
  push bc
  push hl
  ld h,(hl)
  xor a
  call call_ix
  ld a,c
  cp 96
  jr nc,+_
  ld a,(sprite_mask0)
  and (hl)
  or d
  ld (hl),a
  ld a,c
_:
  inc hl
  add a,8
  cp 96
  jr nc,+_
  ld a,(sprite_mask1)
  and (hl)
  or e
  ld (hl),a
_:
  ld bc,11
  add hl,bc
  ex de,hl
  pop hl
  ld a,(sprite_width)
  ld c,a
  add hl,bc
  pop bc
  djnz spriteloop_Overwrite

  pop hl
  inc hl
  pop de
  inc de
  pop bc
  dec c
  jr nz,bigsprite_Overwrite_loop
  ret

153

Other Calc-Related Projects and Ideas / Re: Star Trek Multiplayer for the CE nears first demo release

« on: August 27, 2019, 10:30:31 am »

This is sounding quite cool!

154

Grammer / Re: Grammer 2-The APP

« on: August 23, 2019, 08:37:59 pm »

I rewrote the Input routine and ran into some issues that I finally managed to fix. Now, the cursor blinks, and you can change the location and size of the input buffer! Here is a screenshot where I relocate the input buffer to a spot within the source code (!), and limit it to 9 bytes (8 bytes plus a null byte):

The two new "commands" are →Input (Sets the location of the input buffer) and →Input' (Sets the size of the input buffer).

155

TI Z80 / Re: Z80 Optimized Routines Repository

« on: August 19, 2019, 03:41:23 pm »

Okay, thanks! There are 26 routines that I'll need to investigate later when I get out of work. Nine of them I don't know if I'll be able to contact the author, but one of those I plan to make a better implementation of anyways.

EDIT:
How does this sound?

Quote

1. This License does not apply to any file with a separate License header.
2. Permission is granted, free of charge, to use, modify, and/or distribute any part of this software for any purpose.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

Written by Zeda Thomas <[email protected]>, Aug 2019

156

TI Z80 / Re: Z80 Optimized Routines Repository

« on: August 19, 2019, 02:36:11 pm »

That's a good point.
At the moment, all but three of the routines are from myself or the calculator forums in their useful routines threads. The ones from UTI are explicitly free to use.

157

ASM / Re: ASM Optimized routines

« on: August 16, 2019, 11:41:13 pm »

Here are some routines that I've added to the repository:

itoa_8
Converts an 8-bit signed integer to an ASCII string.

;Converts an 8-bit signed integer to a string

itoa_8:
;Input:
;   A is a signed integer
;   HL points to where the null-terminated ASCII string is stored (needs at most 5 bytes)
;Output:
;   The number is converted to a null-terminated string at HL
;Destroys:
;   Up to five bytes at HL
;   All registers preserved.
;on 0 to 9:       252       D=0
;on 10 to 99:     258+20D   D=0 to 9
;on 100 to 127:   277+20D   D=0 to 2
;on -1 to -9:     276       D=0
;on -10 to -99:   282+20D   D=0 to 9
;on -100 to -128: 301+20D   D=0 to 2

;min: 252cc  (+23cc over original)
;max: 462cc  (-49cc over original)
;avg: 343.74609375cc = 87999/256
;54 bytes
  push hl
  push de
  push bc
  push af
  or a
  jp p,itoa_pos
  neg
  ld (hl),$1A  ;start if neg char on TI-OS
  inc hl
itoa_pos:
;A is on [0,128]
;calculate 100s place, plus 1 for a future calculation
  ld b,'0'
  cp 100 \ jr c,$+5 \ sub 100 \ inc b

;calculate 10s place digit, +1 for future calculation
  ld de,$0A2F
  inc e \ sub d \ jr nc,$-2
  ld c,a

;Digits are now in D, C, A
; strip leading zeros!
  ld a,'0'
  cp b \ jr z,$+5 \ ld (hl),b \ inc hl \ .db $FE  ; start of `cp *` to skip the next byte, turns into `cp $BB` which will always return nz and nc
  cp e \ jr z,$+4 \ ld (hl),e \ inc hl
  add a,c
  add a,d
  ld (hl),a
  inc hl
  ld (hl),0

  pop af
  pop bc
  pop de
  pop hl
  ret

fixed88_to_string
Uses the itoa_8 routine to convert an 8.8 fixed-point number to a string.

;This converts a fixed-point number to a string.
;It displays up to 3 digits after the decimal.

fixed88_to_str:
;Inputs:
;   D.E is the fixed-point number
;   HL points to where the string gets output.
;      Needs at most 9 bytes.
;Outputs:
;   HL is preserved
;Destroys:
;   AF,DE,BC

;First check if the input is negative.
;If so, write a negative sign and negate
  push hl
  ld a,d
  or a
  jp p,+_
  ld (hl),$1A      ;negative sign on TI-OS
  inc hl
  xor a
  sub e
  ld e,a
  sbc a,a
  sub d
_:

;Our adjusted number is in A.E
;Now we can print the integer part
  call itoa_8

;Check if we need to print the fractional part
  xor a
  cp e
  jr z,fixed88_to_str_end

;We need to write the fractional part, so seek the end of the string
;Search for the null byte. A is already 0
  cpir

;Write a decimal
  dec hl
  ld (hl),'.'

  ld b,3
_:
;Multiply E by 10, converting overflow to an ASCII digit
  call fixed88_to_str_e_times_10
  inc hl
  ld (hl),a
  djnz -_

;Strip the ending zeros
  ld a,'0'
_:
  cp (hl)
  dec hl
  jr z,-_

;write a null byte
  inc hl
  inc hl
  ld (hl),0

fixed88_to_str_end:
;restore HL
  pop hl
  ret

fixed88_to_str_e_times_10:
  ld a,e
  ld d,0
  add a,a \ rl d
  add a,a \ rl d
  add a,e \ jr nc,$+3 \ inc d
  add a,a
  ld e,a
  ld a,d
  rla
  add a,'0'
  ret

sqrtA
This is a very fast, unrolled routine to compute the square root of A.

sqrtA:
;Input: A
;Output: D is the square root, A is the remainder (input-D^2)
;Destroys: BC
;speed: 161+{0,6}+{0,1}+{0,1}+{0,3}
;min: 161cc
;max: 172cc
;avg: 166.5cc
;45 bytes
  ld d,$40

  sub d
  jr nc,+_
  add a,d
  ld d,0
_:

  set 4,d
  sub d
  jr nc,+_
  add a,d
  .db $01   ;start of ld bc,** which is 10cc to skip the next two bytes.
_:
  set 5,d
  res 4,d
  srl d

  set 2,d
  sub d
  jr nc,+_
  add a,d
  .db $01   ;start of ld bc,** which is 10cc to skip the next two bytes.
_:
  set 3,d
  res 2,d
  srl d

  inc d
  sub d
  jr nc,+_
  add a,d
  dec d
_:
  inc d
  srl d
  ret

sqrtfixed_88
An unrolled, fast 8.8 fixed-point square root routine. Uses the above sqrtA routine.

sqrtfixed_88:
;Input: A.E ==> D.E
;Output: DE is the sqrt, AHL is the remainder
;Speed: 690+6{0,13}+{0,3+{0,18}}+{0,38}+sqrtA
;min: 855cc
;max: 1003cc
;avg: 924.5cc
;152 bytes

  call sqrtA
  ld l,a
  ld a,e
  ld h,0
  ld e,d
  ld d,h

  sla e
  rl d

  sll e \ rl d
  add a,a \ adc hl,hl
  add a,a \ adc hl,hl
  sbc hl,de
  jr nc,+_
  add hl,de
  dec e
  .db $FE     ;start of `cp *`
_:
  inc e

  sll e \ rl d
  add a,a \ adc hl,hl
  add a,a \ adc hl,hl
  sbc hl,de
  jr nc,+_
  add hl,de
  dec e
  .db $FE     ;start of `cp *`
_:
  inc e

  sll e \ rl d
  add a,a \ adc hl,hl
  add a,a \ adc hl,hl
  sbc hl,de
  jr nc,+_
  add hl,de
  dec e
  .db $FE     ;start of `cp *`
_:
  inc e

  sll e \ rl d
  add a,a \ adc hl,hl
  add a,a \ adc hl,hl
  sbc hl,de
  jr nc,+_
  add hl,de
  dec e
  .db $FE     ;start of `cp *`
_:
  inc e

;Now we have four more iterations
;The first two are no problem
  sll e \ rl d
  add hl,hl
  add hl,hl
  sbc hl,de
  jr nc,+_
  add hl,de
  dec e
  .db $FE     ;start of `cp *`
_:
  inc e

  sll e \ rl d
  add hl,hl
  add hl,hl
  sbc hl,de
  jr nc,+_
  add hl,de
  dec e
  .db $FE     ;start of `cp *`
_:
  inc e

sqrtfixed_88_iter11:
;On the next iteration, HL might temporarily overflow by 1 bit
  sll e \ rl d      ;sla e \ rl d \ inc e
  add hl,hl
  add hl,hl
  jr c,sqrtfixed_88_iter11_br0
;
  sbc hl,de
  jr nc,+_
  add hl,de
  dec e
  jr sqrtfixed_88_iter12
sqrtfixed_88_iter11_br0:
  or a
  sbc hl,de
_:
  inc e

;On the next iteration, HL is allowed to overflow, DE could overflow with our current routine, but it needs to be shifted right at the end, anyways
sqrtfixed_88_iter12:
  ld b,a      ;A is 0, so B is 0
  add hl,hl
  add hl,hl
  rla
;AHL - (DE+DE+1)
  sbc hl,de \ sbc a,b
  inc e
  or a
  sbc hl,de \ sbc a,b
  ret p
  add hl,de
  adc a,b
  dec e
  add hl,de
  adc a,b
  ret

ncr_HL_DE
Computes 'HL choose DE' in such a way so that overflow only occurs if the final result overflows 16 bits.

; Requires
;    mul16          ;BC*DE ==> DEHL
;    DEHL_Div_BC    ;DEHL/BC ==> DEHL

ncr_HL_DE:
;"n choose r", defined as n!/(r!(n-r)!)
;Computes "HL choose DE"
;Inputs: HL,DE
;Outputs:
;   HL is the result
;       "HL choose DE"
;   carry flag reset means overflow
;Destroys:
;   A,BC,DE,IX
;Notes:
;   Overflow is returned as 0
;   Overflow happens if HL choose DE exceeds 65535
;   This algorithm is constructed in such a way that intermediate
;   operations won't erroneously trigger overflow.
;66 bytes
  ld bc,1
  or a
  sbc hl,de
  jr c,ncr_oob
  jr z,ncr_exit
  sbc hl,de
  add hl,de
  jr c,$+3
  ex de,hl
  ld a,h
  or l
  push hl
  pop ix
ncr_exit:
  ld h,b
  ld l,c
  scf
  ret z
ncr_loop:
  push bc \ push de
  push hl \ push bc
  ld b,h
  ld c,l
  call mul16          ;BC*DE ==> DEHL
  pop bc
  call DEHL_Div_BC    ;result in DEHL
  ld a,d
  or e
  pop bc
  pop de
  jr nz,ncr_overflow
  add hl,bc
  jr c,ncr_overflow
  pop bc
  inc bc
  ld a,b
  cp ixh
  jr c,ncr_loop
  ld a,ixl
  cp c
  jr nc,ncr_loop
  ret
ncr_overflow:
  pop bc
  xor a
  ld b,a
ncr_oob:
  ld h,b
  ld l,b
  ret

EDIT: Optimized itoa_8 above. Here are some more routines:
uitoa_8
Converts an 8-bit unsigned integer to an ASCII string.

;Converts an 8-bit unsigned integer to a string

uitoa_8:
;Input:
;   A is a signed integer
;   HL points to where the null-terminated ASCII string is stored (needs at most 5 bytes)
;Output:
;   The number is converted to a null-terminated string at HL
;Destroys:
;   Up to four bytes at HL
;   All registers preserved.
;on 0 to 9:     238              D=0
;on 10 to 99:   244+20D          D=0 to 9
;on 100 to 255: 257+2{0,6}+20D   D=0 to 5
;min: 238cc
;max: 424cc
;avg: 317.453125cc = 81268/256 = (238*10 + 334*90+313*156)/256
;52 bytes

  push hl
  push de
  push bc
  push af
;A is on [0,255]
;calculate 100s place, plus 1 for a future calculation
  ld b,'0'
  cp 100 \ jr c,$+5 \ sub 100 \ inc b
  cp 100 \ jr c,$+5 \ sub 100 \ inc b

;calculate 10s place digit, +1 for future calculation
  ld de,$0A2F
  inc e \ sub d \ jr nc,$-2
  ld c,a

;Digits are now in D, C, A
; strip leading zeros!
  ld a,'0'
  cp b \ jr z,$+5 \ ld (hl),b \ inc hl \ .db $FE  ; start of `cp *` to skip the next byte, turns into `cp $BB` which will always return nz and nc
  cp e \ jr z,$+4 \ ld (hl),e \ inc hl
  add a,c
  add a,d
  ld (hl),a
  inc hl
  ld (hl),0

  pop af
  pop bc
  pop de
  pop hl
  ret

itoa_16
Converts a 16-bit signed integer to an ASCII string.

;Converts a 16-bit signed integer to an ASCII string.

itoa_16:
;Input:
;   DE is the number to convert
;   HL points to where to write the ASCII string (up to 7 bytes needed).
;Output:
;   HL points to the null-terminated ASCII string
;      NOTE: This isn't necessarily the same as the input HL.
  push de
  push bc
  push af
  push hl
  bit 7,d
  jr z,+_
  xor a
  sub e
  ld e,a
  sbc a,a
  sub d
  ld d,a
  ld (hl),$1A     ;negative char on TI-OS
  inc hl
_:
  ex de,hl

  ld bc,-10000
  ld a,'0'-1
  inc a \ add hl,bc \ jr c,$-2
  ld (de),a
  inc de

  ld bc,1000
  ld a,'9'+1
  dec a \ add hl,bc \ jr nc,$-2
  ld (de),a
  inc de

  ld bc,-100
  ld a,'0'-1
  inc a \ add hl,bc \ jr c,$-2
  ld (de),a
  inc de

  ld a,l
  ld h,'9'+1
  dec h \ add a,10 \ jr nc,$-3
  add a,'0'
  ex de,hl
  ld (hl),d
  inc hl
  ld (hl),a
  inc hl
  ld (hl),0

;No strip the leading zeros
  pop hl

;If the first char is a negative sign, skip it
  ld a,(hl)
  cp $1A
  push af
  ld a,'0'
  jr nz,$+3
  inc hl
  cp (hl)
  jr z,$-2

;Check if we need to re-write the negative sign
  pop af
  jr nz,+_
  dec hl
  ld (hl),a
_:

  pop af
  pop bc
  pop de
  ret

uitoa_16
Converts a 16-bit unsigned integer to an ASCII string.

;Converts a 16-bit unsigned integer to an ASCII string.

uitoa_16:
;Input:
;   DE is the number to convert
;   HL points to where to write the ASCII string (up to 6 bytes needed).
;Output:
;   HL points to the null-terminated ASCII string
;      NOTE: This isn't necessarily the same as the input HL.
  push de
  push bc
  push af
  ex de,hl

  ld bc,-10000
  ld a,'0'-1
  inc a \ add hl,bc \ jr c,$-2
  ld (de),a
  inc de

  ld bc,1000
  ld a,'9'+1
  dec a \ add hl,bc \ jr nc,$-2
  ld (de),a
  inc de

  ld bc,-100
  ld a,'0'-1
  inc a \ add hl,bc \ jr c,$-2
  ld (de),a
  inc de

  ld a,l
  ld h,'9'+1
  dec h \ add a,10 \ jr nc,$-3
  add a,'0'
  ex de,hl
  ld (hl),d
  inc hl
  ld (hl),a
  inc hl
  ld (hl),0

;No strip the leading zeros
  ld c,-6
  add hl,bc
  ld a,'0'
  inc hl \ cp (hl) \ jr z,$-2
  pop af
  pop bc
  pop de
  ret

158

TI Z80 / Re: Z80 Optimized Routines Repository

« on: August 14, 2019, 06:04:00 pm »

Good news! I've finished Cemetech's thread and it was tedious as heck. I've also added a bunch of my personal stash that I think is in an acceptable state

Currently at about 100 routines.

EDIT: Finished porting from the other sites.

159

TI Z80 / Re: Z80 Optimized Routines Repository

« on: August 14, 2019, 02:46:30 pm »

On this one? The labels are in the right places, but I do notice that sometimes pressing a key will read as the wrong group

EDIT: Also, I'm hoping to put your routines in the repository if you'd like!

160

TI Z80 / Z80 Optimized Routines Repository

« on: August 13, 2019, 02:38:20 pm »

Hi folks! I've noticed that the "Z80 Optimized Routines" threads and their equivalents on various sites aren't very easy to navigate. I am starting a repository on GitHub in the hopes of addressing these three issues:

Organization! "Is this routine documented? What page is it on?
Collaboration! "Is there a better version later in the thread? On what page!? Here is yetanotherversion!"
Cleanliness! "What is this random request doing in the middle of the thread?"

I initialized the repository here.

My plan is to start porting Cemtech's thread, Omnimaga's thread, UnitedTI's thread, Z80 Heaven's routines, and my private routines folder.

If you want to help port documentation, I only ask that you cite the original author if possible, except when the original author doesn't care to be cited. If you want to add your own routines, keep it organized! And please, if you see an optimization, please make it!

A final note: I think it would be great to have an eZ80 and TI-BASIC repository, too, but I don't think I'm up for maintaining that!

161

ASM / Re: Better LCD Delay Routines?

« on: August 07, 2019, 10:32:25 am »

Quote from: thepenguin77 on August 07, 2019, 10:23:34 am

(P.S. This is what I work on now. Also, I tend to go by bcov77 on other platforms if you feel like googling.)

That is so freaking cool.

162

ASM / Re: [8X+] port $24 question

« on: July 31, 2019, 07:46:51 am »

That is really confusing wording. I think your interpretation is most likely:

Quote

Does that mean that the ASIC will allow execution on all pages below $180, in other words all of them ?

163

ASM / Re: Better LCD Delay Routines?

« on: July 29, 2019, 12:37:39 pm »

Oh wow, I hadn't realized that!
EDIT: I saw this on that page:

Quote

NOTE: The contents of this port should NOT be less than 0Ch or the LCD driver will no longer respond.

164

ASM / Re: Better LCD Delay Routines?

« on: July 29, 2019, 06:21:08 am »

@Sue Doenim : your second routine should use "jr c,", not "jr nz,". I usually go with the second method unless I can get $10 in C, then I use the "in a,(c)" method. I also optionally use compiler directives so the user can use undocumented instructions.

For example, in Grammer, I define my LCDDelay routine as:

in a,(16) \ rla \ jr c,$-3

But one of my favorite tricks that many people don't use (and you'll see in many of my projects) is that if I am only doing full-screen LCD updates and I don't need interrupts, then at the beginning of my program I disable interrupts and write 80h to port 16 (or BFh to port 16 if you are doing it the weird way). Then I can skip that entire step in my LCD update routine, since I write column-by-column and that internal LCD counter is automatically reset to the desired initial value by the end of my routine.

It doesn't save much, but it does save space (you almost certainly don't need to worry about an LCD delay between initializing with 80h and the first time you update the LCD), and you save a non-zero number of clock cycles each update, so it really is a "free" optimization.

165

TI Z80 / Shuffle - Shuffle a TI List Really Fast

« on: July 28, 2019, 11:49:54 pm »

Hey there, it's ya gender non-specific diminutive Zeda, here, and today we'll be looking at the Fisher-Yates algorithm and just how freaking efficient it can be for shuffling a list. For reference, it takes one second to shuffle a 999-element list at 6MHz, and if that ain't the way your deity intended it, I don't know what is.

First, how do we shuffle L1 in BASIC?

rand(dim(L1->L2
SortA(L2,L1

This is a super clever algorithm, but slow as heck as the lists get bigger. Plus, it uses an extra list of the same size, wasting precious RAM. So how does the Fisher-Yates algorithm work? You start at the last element. Randomly choose an element up to and including the current element and swap them. Now move down one element and repeat (so now the last element is off limits, then the last two, et cetera). Repeat this until there is one element left.

This is easy to perform in-place, and it performs n-1 swaps, making it significantly faster than the BASIC algorithm above. In fact, let's implement it in BASIC:

dim(L1->N
For(K,N,2,-1
randInt(1,K->A
L1(K->B
L1(A->L1(K
B->L1(A
End

This takes approximately 37.5 seconds to sort a 999 element list. I don't even have the RAM needed to test the regular method, but extrapolating, it would take the "normal" method approximately 73 seconds for 999 elements. So basically, the Fisher-Yates algorithm is actually faster even in TI-BASIC (after about 400 elements, though).

So without further ado, the assembly code!

;Randomizes a TI-list in Ans

_RclAns= 4AD7h
seed1  = $80F8
seed2  = $80FC

seed1_0=seed1
seed1_1=seed1+2
seed2_0=seed2
seed2_1=seed2+2
#define bcall(x) rst 28h \ .dw x


.db $BB,$6D
.org $9D95

; Put it into 15MHz mode if possible!
  in a,(2)
  add a,a
  sbc a,a
  out (20h),a


; Initialize the random seed
  ld hl,seed1
  ld b,7
  ld a,r
_:
  xor (hl)
  ld (hl),a
  inc hl
  djnz -_
  or 99
  or (hl)
  ld (hl),a
  

; Locate Ans, verify that it is a list or complex list
  bcall(_RclAns)
  ex de,hl
  ld c,(hl)
  inc hl
  ld b,(hl)
  inc hl
  ld (list_base),hl
  dec a
  jr z,+_
  sub 12
  ret nz
  dec a
_:

;A is 0 if a real list, -1 if complex
;HL points to the first element
;BC is the number of elements
  and $29     ;make it either NOP or ADD HL,HL
  ld (get_complex_element),a
  sub 29h
  sbc a,a
;FF if real, 00 if complex
  cpl
  and 9
  add a,9
  ld (element_size),a


shuffle_loop:
  push bc

  push bc
  call rand
  pop bc
  ex de,hl
  call mul16
  dec bc
  ;swap elements DE and BC
  
  call get_element
  push hl
  ld d,b
  ld e,c
  call get_element
  pop de

  call swap_elements
  
  pop bc
  dec bc
  ld a,c
  dec a
  jr nz,shuffle_loop
  inc b
  dec b
  jr nz,shuffle_loop
  ret


swap_elements:
;HL and DE point to the elements
element_size = $+2
  ld bc,255
_:
  ld a,(de)
  ldi
  dec hl
  ld (hl),a
  inc hl
  djnz -_
  ret


get_element:
;Input:
;   DE is the element to locate
;Output:
;   HL points to the element
  ld l,e
  ld h,d
  add hl,hl
  add hl,hl
  add hl,hl
  add hl,de
get_complex_element:
  nop
list_base = $+1
  ld de,0
  add hl,de
  ret


rand:
;Tested and passes all CAcert tests
;Uses a very simple 32-bit LCG and 32-bit LFSR
;it has a period of 18,446,744,069,414,584,320
;roughly 18.4 quintillion.
;LFSR taps: 0,2,6,7  = 11000101
;291cc
;Thanks to Runer112 for his help on optimizing the LCG and suggesting to try the much simpler LCG. On their own, the two are terrible, but together they are great.
    ld hl,(seed1)
    ld de,(seed1+2)
    ld b,h
    ld c,l
    add hl,hl \ rl e \ rl d
    add hl,hl \ rl e \ rl d
    inc l
    add hl,bc
    ld (seed1_0),hl
    ld hl,(seed1_1)
    adc hl,de
    ld (seed1_1),hl
    ex de,hl
;;lfsr
    ld hl,(seed2)
    ld bc,(seed2+2)
    add hl,hl \ rl c \ rl b
    ld (seed2_1),bc
    sbc a,a
    and %11000101
    xor l
    ld l,a
    ld (seed2_0),hl
    ex de,hl
    add hl,bc
    ret


mul16:
;BC*DE
  ld hl,0
  ld a,16
mul16_loop:
  add hl,hl
  rl e
  rl d
  jr nc,+_
  add hl,bc
  jr nc,+_
  inc de
_:
  dec a
  jr nz,mul16_loop
  ret

It isn't perfect, but it is pretty good and importantly, it is fast! The biggest problem is in the random number generator, but even that is still pretty good for this application.

Pages: 1 ... 9 10 [11] 12 13 ... 317

Server load over the past 5, 10 and 15 minutes respectively: 0.52783203125, 0.40478515625, 0.4462890625

Page created in 0.101 seconds with 31 queries.