Show Posts - Xeda112358

Profile Info
- Summary
- Show Stats
- Show Posts...

Show Posts

This section allows you to view all posts made by this member. Note that you can only see posts made in areas you currently have access to.

Messages - Xeda112358

Pages: 1 ... 28 29 [30] 31 32 ... 317

436

ASM / Re: ASM Optimized routines

« on: August 03, 2015, 10:54:32 am »

Related to the previous post, I offer several alternatives to the bcalls _SetXXOP1 and _SetXXOP2. These routines are for converting 8-bit integers to TI floats. The advantages with my routines are that they are faster, you don't need to truncate to only the bottom 2 digits, and you can store the output to any location instead of just OP1 or OP2.

So first, if you still want the output to be the same:

setXXOP2:
    ld hl,OP2
    jr setXX
setXXOP1:
    ld hl,OP1
setXX:
;;Inputs: A is the unsigned int
;;        HL is where to write the TI float
;;Destroys:All
;;291cc+38b (or 144cc if A=0)
;;average: b=29/255
;;295.3215686cc
;;59 bytes
    ld c,0
    ld (hl),c \ inc hl
    ld (hl),81h
    inc hl \ ld (hl),c
    ld d,h \ ld e,l
    inc hl \ ld (hl),c
    inc hl \ ld (hl),c
    inc hl \ ld (hl),c
    inc hl \ ld (hl),c
    inc hl \ ld (hl),c
    inc hl \ ld (hl),c

    or a \ ret z    ;If A is zero, exit early. +138cc
    ld l,a          ;\
    ld h,c          ; |
    add hl,hl       ; |Start converting A to BCD
    add hl,hl       ; |
    add hl,hl       ; |
    add hl,hl       ; |
    ld a,h \ daa  \ rl l    ; |Finish converting A to BCD
    adc a,a \ daa \ rl l    ; |Number is in LA
    adc a,a \ daa \ rl l    ; |
    adc a,a \ daa \ rl l    ; |
    adc a,a \ daa           ;/ +124cc
    ex de,hl
    ld (hl),a
    and $F0
    ret nz      ;+29cc
    rld         ;\ Rotate up 1 digit
    dec hl      ; |
    ld (hl),80h ; |
    ret         ; /

And if you want to get all 3 digits:

setXXXOP1:
    ld hl,OP1
setXXX:
;;Inputs: A is the unsigned int
;;        HL is where to write the TI float
;;Destroys:All
;;423cc+13a+63b (or 233cc if A=0)
;;average: a=99/255, b=29/255
;;435.2117647cc average
;;64 bytes
    ld bc,$0700
    ld (hl),c
    inc hl
    ld (hl),83h
    ld d,h
    ld e,l
    inc hl \ ld (hl),c \ djnz $-2
    or a \ ret z    ;If A is zero, exit early. +227cc
    ld l,a          ;\
    ld h,c          ; |
    add hl,hl       ; |Start converting A to BCD
    add hl,hl       ; |
    add hl,hl       ; |
    add hl,hl       ; |
    ld a,h \ daa  \ rl l    ; |Finish converting A to BCD
    adc a,a \ daa \ rl l    ; |Number is in LA
    adc a,a \ daa \ rl l    ; |
    adc a,a \ daa \ rl l    ; |
    adc a,a \ daa \ rl l    ;/ +132cc
    ex de,hl
    jr nz,$+6 \ ld e,a \ xor a \ ld (hl),81h    ;+(21+4/85)cc
    inc hl
    ld (hl),e
    inc hl
    ld (hl),a
    ld a,e
    and $F0
    ret nz      ;+48cc
    rld         ;\ Rotate up 1 digit
    dec hl      ; |
    rld         ; |
    dec hl      ; |
    dec (hl)    ; |Decrement exponent
    ret         ; /+63(29/255)cc

And if you want to do that, but maybe a little faster:

setXXX:
;;Inputs: A is the unsigned int
;;        HL is where to write the TI float
;;Destroys:All
;;334cc+13a+63b (or 144cc if A=0)
;;average: a=99/255, b=29/255
;;346.2117647cc average
;;75 bytes
    ld c,0
    ld (hl),c
    inc hl
    ld (hl),83h
    ld d,h
    ld e,l
    inc hl \ ld (hl),c
    inc hl \ ld (hl),c
    inc hl \ ld (hl),c
    inc hl \ ld (hl),c
    inc hl \ ld (hl),c
    inc hl \ ld (hl),c
    inc hl \ ld (hl),c
    
    or a \ ret z    ;If A is zero, exit early. +227cc
    ld l,a          ;\
    ld h,c          ; |
    add hl,hl       ; |Start converting A to BCD
    add hl,hl       ; |
    add hl,hl       ; |
    add hl,hl       ; |
    ld a,h \ daa  \ rl l    ; |Finish converting A to BCD
    adc a,a \ daa \ rl l    ; |Number is in LA
    adc a,a \ daa \ rl l    ; |
    adc a,a \ daa \ rl l    ; |
    adc a,a \ daa \ rl l    ;/ +132cc
    ex de,hl
    jr nz,$+6 \ ld e,a \ xor a \ ld (hl),81h    ;+(21+4/85)cc
    inc hl
    ld (hl),e
    inc hl
    ld (hl),a
    ld a,e
    and $F0
    ret nz      ;+48cc
    rld         ;\ Rotate up 1 digit
    dec hl      ; |
    rld         ; |
    dec hl      ; |
    dec (hl)    ; |Decrement exponent
    ret         ; /+63(29/255)cc

And if you want to convert signed 8-bit ints:

setXXX_signed:
;;Inputs: A is the signed int
;;        HL is where to write the TI float
    ld c,0
    ld (hl),c
    add a,a
    jr c,$+6
    neg
    ld (hl),80h
    inc hl
    ld (hl),81h
    ld d,h
    ld e,l
    inc hl \ ld (hl),c \ djnz $-2
    or a \ ret z
    ld l,a          ;\
    ld h,c          ; |
    add hl,hl       ; |Start converting A to BCD
    add hl,hl       ; |
    add hl,hl       ; |
    add hl,hl       ; |
    ld a,h \ daa  \ rl l    ; |Finish converting A to BCD
    adc a,a \ daa \ rl l    ; |Number is in cA
    adc a,a \ daa \ rl l    ; |(c is carry)
    adc a,a \ daa           ;/
    ex de,hl
    jr nc,$+15
    ld (hl),82h
    inc hl
    inc hl
    ld (hl),a
    xor a
    rld
    or $10
    dec hl
    ld (hl),a
    ret
    inc hl
    ld (hl),a
    and $F0
    ret nz
    rld         ;\ Rotate up 1 digit
    dec hl      ; |
    ld (hl),80h ; |
    ret         ; /

The slowest routine here has a worst case of 499cc and slowest average case is 436cc.

437

Escheron: Twilight over Ragnoth / Re: "Escheron: Twilight over Ragnoth" — progress updates and discussion

« on: August 02, 2015, 11:11:06 am »

Sorry, I never noticed the mention. I don't have a lot of time, so I am going to make it quick. I may have given you bad pseudocode, I dunno. Here is some tested and working BASIC code:

First the sub program which would be equivalent to the subroutine thing to get the next X,Y. It's BASIC, so I can't use the Y var, so I just use Z:

If A>=0 and X!=R
Then
X+E->X      ;X coord, E is 1 or -1 (or possibly 0, but we don't actually need that)
A-V->A
End
If A<0 and Z!=S
Then
Z+F->Z     ;F is like E
A+U->A
End

Now the actual program. In this case, I just use the X,Z to draw pixels, you would use it to render the map.

;;Input a list as any {X1,Y1,X2,Y2}, just make sure they are in bounds as my code doesn't check
Ans->L1
Ans(1->X
L1(2->Z
L1(3->R    ;x2
L1(4->S    ;y2
R-X->U     ;deltax
S-Z->V     ;deltay
(U>0)-(U<0->E   ;sign(deltax)->xinc
(V>0)-(V<0->F   ;sign(deltay)->yinc
abs(U->A
2Ans->U
2abs(V->V
Pxl-On(X,Z
Repeat getKey
prgmLINESUB
Pxl-On(X,Z
End

I hope this helps ^~^

438

ASM / Re: ASM Optimized routines

« on: August 02, 2015, 09:06:26 am »

So earlier on IRC I posted a challenge that I was thinking about before bed: Convert an 8-bit unsigned integer to BCD as fast as you can.

I managed to get my code down to 143.5cc, then jacobly noticed an optimization on mine, and I noticed a bug that had been carried through from the beginning. After all was done, we got it down to 131cc.

The code given is a subroutine (so adding an ret to the end, +1 byte +10cc). Without further ado:

L_To_Dec:
;;Unrolled
;;Converts the 8-bit register L to binary coded decimal
;;Digits stored in LA (A has the lower 2 digits, L the upper).
;;Inputs: L is the 8-bit unsigned int to convert
;;Output: A has the lower 2 digits (in BCD form), L has the upper
;;Destroys: H,F
;;141cc 
;;27 bytes
    ld h,0
    add hl,hl
    add hl,hl
    add hl,hl
    add hl,hl
    ld a,h \ daa  \ rl l
    adc a,a \ daa \ rl l
    adc a,a \ daa \ rl l
    adc a,a \ daa \ rl l
    adc a,a \ daa \ rl l
    ret

EDIT: Found a transcription error. Had to adjust code, size, and timing by +4 bytes, +16cc

439

The Axe Parser Project / Re: Assembly Programmers - Help Axe Optimize!

« on: July 27, 2015, 02:49:49 pm »

I think I finally have a major optimization after having worked on link routines for the past couple of weeks. I didn't modify the timeout or syncing code, just the core get/send stuff. I've tested it and it is reliable.

For reference, in the even that p_SendByte doesn't have to wait, the new routine is 931cc vs 1647cc. Here are my proposed routines:

p_GetByte: +0 bytes, presumably as much faster as p_SendByte

p_GetByte:
	.db __GetByteEnd-$-1
	di
	ld	bc,$0803		;Bit counter in b, bit mask in c
	ld	hl,-1
	xor	a
	out	(0),a			;Make sure we are reset
	in	a,(0)
	and	c			;Check to see if sender is ready
	dec	a
	ret	nz			;If not, then go back
	inc	a
	out	(0),a			;Relay a confirmation
	ex	(sp),hl			;Wait at until confirmation is read (59 T-states minimum)
	ex	(sp),hl
	ld	a,(de)			;Bit counter in b and bitmask in c
	xor	a			;Store received byte in l
	ld hl,$AA
	out	(0),a			;Reset the ports to receive data

__GetByteLoop:
    in a,(0)
    xor l
    rra
    jr c,__GetByteLoop
    in a,(0)
    rra
    rra             ;bits cycled in are masked with 0x55. Need to invert anyways, so mask at the end with 0xAA
    rr l
    djnz __GetByteLoop
    ret

p_SendByte: -4 bytes, -723cc

p_SendByte:
    .db __SendByteEnd-$-1
	di
	ld	bc,$5503		;Bit counter in b, bit mask in c
	ld	a,%00000010
	out	(0),a			;Indicate we are ready to send
__SendByteTimeout:
	dec	hl
	ld	a,h
	or	l
	jr	z,__SendByteDone
	in	a,(0)			;Loop is 59 T-states maximum
	and	c
	jr	nz,__SendByteTimeout	;Keep looping till we get it
	out (0),a
__SendLoop:
    rrc e
    ccf
    rla
    sla b
    ccf
    rla
    out (0),a
    ex (sp),hl
    ex (sp),hl
    nop
    jr nz,__SendLoop
;need 37cc
    xor a
    ex (sp),hl
    ex (sp),hl
__SendByteDone
    out (0),a
    ret
__SendByteEnd:

EDIT: I looked at the timeout code for p_SendByte, and realized that my code didn't need B to be a counter but instead I was using D as a kind of counter. By using B instead of D, I could cut out the ld d,$55, saving 2 bytes and 7cc.

440

ASM / Re: Link Port Woes

« on: July 27, 2015, 12:34:08 pm »

Thank you a bunch! I just looked at the Tachyon link routines and noticed that the receiving end does in a,(0) \ rra \ jr c,$-3 \ in a,(0) \ rra \ rra
I tried doing that and it worked, but now I understand why. Thank you!

441

ASM / Link Port Woes

« on: July 27, 2015, 12:03:43 pm »

I am trying to write some link port routines and I am having some issues. Every so often, data won't be correctly transmitted and I have deduced that this only occurs when the sent byte has bit 1,3, or 5 set, it sometimes causes the next bit up to be set (so if bit 3 is supposed to be set, it also sets bit 4). This is a chaotic occurence.

What I do know is when the odd bits are read, bit 0 of port 0 (so the tip) reads 1. As well, the sending calc had a low battery, so I am not sure if it is an issue of bringing the lines up. I am pretty sure this is not the problem, though, as I switched out batteries.

I also tried adding code that guaranteed that the lines were reading correctly, otherwise it would rewrite the value to port 0 until it did. This still didn't work.
I tried adding huge delays, it still was just as unreliable.
Interrupts and link assist are disabled.

Below is my code if you want to muddle through it. It is supposed to send a value in Ans to the other calc and the receiving calc receives in Ans (modulo 100).

#define bcall(x) rst 28h \ .dw x
.db $BB,$6D
.org $9D95
    bcall(4AD7h)
    bcall(4AEFh)
sendb:
    di
    ld c,a
    xor a
    out (8),a
    ld e,55h
sendloop:
    rrc c
    rla
    sla e
    ccf
    rla
    out (0),a
    ld b,6          ;this is a delay, and not claculated to be precise. We probably do not even need a delay
    djnz $
    jr nz,sendloop
    ld b,10         ;delay
    djnz $
    xor a
    out (0),a
    ret

#define bcall(x) rst 28h \ .dw x
.db $BB,$6D
.org $9D95
    call getb
    bcall(478Ch)
    bcall(4ABFh)
    ret
getb:
    di
    xor a
    out (8),a
    ld bc,$08AA
readloop:
    in a,(0)
    xor c
    rra
    jr c,readloop
    rra             ;bits cycled in are masked with 0x55 as a sideeffect. Need to invert anyways, so mask at the end with 0xAA
    rr c
    djnz readloop
    ld a,c
    and 1
    rrca
    xor c
    xor $AA
    ret

442

Technology and Development / Re: Serial Link, TI Z80

« on: July 15, 2015, 12:31:38 pm »

Yeah, I should solder them as the connection was shaky before I taped them. I was going to, then I never got to it

I think the issue with the link routines and only being able to send one direction at a time is a hardware issue. I've tried changing the lines using the "receiving" calculator and it doesn't work. I think it's that it can't change it from a 0 to a 1 unless it was the calc that set it to 0 (by sending a 1).

I dunno, it was a pain in the butt, but I did find the program I was working on, which is awesome. It can send and receive bytes or packets of data and allows the user to exit the receiving end by pressing Clear if there is no activity (so no infinite loops!). Currently I can just send a string in Ans from one calc to the beginning of RAM, but that is only because I didn't have access to a computer at the time I made it and couldn't remember the _CreateStrng bcall.

My plan is to make a link protocol where a calc can:

-Send a packet of data to be written to predetermined location in RAM
-Send a file to another calc
-Request from the other calc a file
-Request a list of files (for FileSyst, this would allow the other calc to inspect folders in order to make requests for files).
-Request current key presses

443

Technology and Development / Serial Link, TI Z80

« on: July 15, 2015, 10:29:19 am »

I was not sure where to post this, but I made a calc-to-calc serial cable!

I ended up finding two sets of headphones with the right size tips (not sure what they are called), so I cut them up with an ample length of wire. I also have a bunch of jumper wires that I got for my Raspberry Pi, so after cutting the headphone wires, I exposed the three sets of wire, used an abrasive to strip off the enameling, and spliced each wire with a female tip of a jumper wire. From there, I can connect them with male-male jumpers and it properly transmits data between my calcs!

The reason that I didn't just splice the two sets of wires together directly is because now I can plug in other items. For example, I connected male leads to an electric motor and LED-- it lets me control them through the link port now! I can also hook them up to a breadboard if I wanted to, making it possible to connect more calculators, or I can directly plug them into the GPIO board on my Raspberry Pi. A few months ago I did this and wrote a Python program and a calc program that could communicate with each other (basically just sending a string in Ans to the Pi, and have it displayed in the terminal).

Unfortunately, I lost the Python program due to a crash, but I still have the calc programmed backed up on calc. My plan is to come up with some working routines from scratch since I am still pretty terrible with figuring out the link port. It took me forever to figure out that the receiving calc can't change the lines.

444

Computer Usage and Setup Help / Re: Help with Raspberry pie compute module?

« on: July 05, 2015, 06:29:06 am »

Do you mean the Raspberry Pi, or is a Raspberry Pie something different (other than a delicious dessert)?

445

The Axe Parser Project / Re: Features Wishlist

« on: July 01, 2015, 11:55:29 am »

Okay, since you tackled the parser, I have this feature wish:

Allow a very simple set of variable types.

I imagine you must keep the variable names stored somewhere, so maybe a byte indicating the variable type, too? You wouldn't have to do things like associate correct routines, though. Like just because an var is flagged as 8.8 fixed point, doesn't mean that multiplication of such a var should be automatically the fixed point multiplication routine. The current system for that, where the developer specifies is fine (and I like it that way).

However, here is where it can come in really, really useful. Custom variable types could be allowed and associated with an axiom. So if I wanted to make an axiom for rational numbers, I could have it specify a rational var type of 32-bits (2 16-bit ints) and have specially made math routines for those vars.

This might need to wait for Axe 2.0, though

446

Math and Science / Re: My way to compute Geometric Means (and square roots)

« on: July 01, 2015, 11:34:31 am »

It almost certainly won't run faster (I haven't tried in a while). It's only useful in situations where excessively large numbers are used (the BASIC version).

However, if you are working in a 32-bit environment, there is no risk of overflowing into a 64-bit int via multiplication, so all operations can take place with 32 bits.

The algorithm is best suited for a machine that can perform arbitrary shifts (the Z80 can only shift one place to the left or right) and even better if it has a "clz" instruction. Then it averages roughly 5 shift-and-adds, 1 clz, and 1 subtraction for each 4 bits of accuracy.

The algorithm is best suited for fixed point math in the form of 2.n (so 2 integer bits, n fractional bits). This can be easily achieved for floats where the exponent can be set aside for later, and the mantissa is almost always already in a 1.n form. For integers, you just need to shift left an even number of times until the upper 2 bits are non-zero (so 01, 10,11).

447

Math and Science / Re: My way to compute Geometric Means (and square roots)

« on: June 30, 2015, 03:13:52 am »

Necro-update.

I was going to go to sleep a couple hours ago, but as soon as I turned out the lights, for whatever reason, I thought of a way to tweak this algorithm to be even faster on the Z80. I've spent the last couple of hours refining my algorithm. I'll refrain from a full mathematical proof, because 1] I'm tired, and 2] I haven't made a formal proof yet

The principal is the same: We want some algorithm for making x_n and y_n converge to some "r". We also need to be able to relate x_n and x_n to the original x and y.

One further constraint is to use very, very simple operations (shift-and-add).

If x_n<y_n, and I define
x_n+1=x_n(1+2^-k_n)²
y_n+1=y_n(1-2^-k_n)²
then we have that x_n+1y_n+1=x_ny_n(1-4^-k_n)²

So if we keep track of the factors (1-4^-k_n)², we can just divide them out of the final result. So basically, start with a constant factor of 1. Each iteration, multiply that by your sqrt((1-4^-k_n)²)=(1-4^-k_n). After all of the iterations are done, we have x_n=y_n and xy(c²)=x_ny_n=x_n². So then:
xy(c²)=x_n²
xy=x_n²/c²
sqrt(xy)=x_n/c

The algorithm below appears to require n iterations to compute 4n bits of accuracy.

;think of all values here as 2.m fixed point numbers. 
INPUTS: X,Y are on [1,4)
    c=1
    iterate n times
        a=y-x
        if underflow
            a=-a
            (x,y) = (y,x)       //swap x and y, so x<y
        n=-2+intlg(a)           //intlg(a) basically finds the negative of the index of the first '1' bit in the binary expansion of a. Basically, the "clz" instruction for some processors.
        c-=c>>2n
        x+=x>>n
        x+=x>>n
        y-=y>>n
        y-=y>>n
    return x/c

448

News / Re: Android reaches the Nspire!

« on: June 27, 2015, 06:51:25 pm »

So this means I can use my calc to get on the internet if I have a USB Wifi dongle?

449

ASM / isDivisible Routine

« on: June 23, 2015, 12:30:33 pm »

I don't want to put this in the Optimized Routines thread as I haven't had it long enough for it to be ultra-optimized.

This code is intended as a method of testing if HL is divisible by DE. It's average run time for random 16-bit inputs is 163cc, and where DE<=HL (more likely in practice), it is 229cc. For a code that is doing prime-checking, it will more likely take about 707.5cc (Since DE will be roughly the square root of HL or less).

isDivisible:
;;Inputs: DE,HL
;;Outputs: c flag set if HL is not divisible by DE, else c flag is set.
;;         HL is 0 if true.
;;1087cc worst case (HL=65535, DE=1). 
;;Average for random inputs: 163cc
;;Average for DE<=HL: 229cc.
    ld a,d \ or e \ ccf \ ret z         ;remove this if DE is always guaranteed non-zero
;step 1
    ld a,e \ or l \ rra \ jr c,step2    ;\
    srl d \ rr e \ rr h \ rr l          ; |
    ld a,e \ or l \ rra \ jr nc,$-11    ; |Remove these if DE is always guaranteed odd at input.
step2:                                  ; |
    ld a,e \ rra \ ccf \ ret c          ;/
;steps 3, 4, and 5
    ld a,l
    or a
loop:
    sbc hl,de \ ret c \ ret z
    rr h \ rra \ bit 0,a \ jr z,$-5
    ld l,a
    jp loop

Motivation and Development
I often find myself in a situation where I need to find the factors of a number, but I have no technology around to aid me. This means I need to use... mental arithmetic! I've been doing this for 15 years, so I have refined my mental process quite a bit. It is still a trial division algorithm, but with a very obfuscated "division" technique. We don't need to do 1131/7 to see if it is divisible by 7, we just need to see if 7 divides 1131 and this is what my algorithm does. Interestingly, testing divisibility at the algorithmic level is a little faster than division. Not by much, but it is also non-negligible.
The Algorithm
The core algorithm is designed around checking that (A mod B == 0) is true or false. We also make the assumption that B is odd and by extension, non-zero. The case where B is non-zero and even will be discussed later.

Since B is odd, 2 does not divide B. This means that if A is even:
(A mod B == 0) if and only if (A/2 mod B)==0.
We also know by the definition of divisibility that
(A mod B) == (A+c*B mod B)
where c is any integer. Combining all this, we have an algorithm:

Remove all factors of 2 from A
With A now odd, do A=A-B
- If the result is zero, that means (A mod B == 0)
- If the result underflow (becomes "negative", or on the Z80, sets the carry flag), it means that A was somewhere on [1,B-1], so it is not divisible by B.
Continue back at 1.

Now suppose B is allowed to be non-zero and even. Then B is of the form d*2^k where d is odd. This just means there are some factors of 2 that can be removed from B until it is odd. The only way A is divisible by B, is if it has the same number or more of factors of 2 as B. If we factor out common factors of 2 and find B is still even, then A is not divisible by B. Otherwise we have an odd number and only need to check the new (A mod d) for which we can use the "odd algorithm" above.
So putting it all together:

If B==0, return FALSE.
Remove common factors of 2 from A and B.
If B is even, return FALSE.
Remove all factors of 2 from A.
Subtract B from A (A=A-B).
- If the result is zero, return TRUE.
- If the result is "negative" (setting the carry flag on many processors), return FALSE.
Repeat at 4.

The overhead steps are (1) to (3). The iterated steps are (4) and (5).
Because (5) always produces an even number, when it then performs step 4, it always divides by at least one factor of 2. This means the algorithm takes at most 1+ceil(log2(A))-floor(log2(B) iterations. For example, if A is a 37-bit number and B is a 13-bit number,this takes at most 38-13 = 25 iterations. However, in practice it is usually fewer iterations.
Example Time:
Say I wanted to test if 1337 is divisible by 17.
Since 17 is odd, we can proceed.
1337 is odd, so no factors of 2 to remove.
1337-17 == 1320.
1320/2 == 660
660/2 == 330
330/2 == 165
165-17 == 148
148/2 == 74
74/2 == 37
37-17 == 20
20/2 == 10
10/2 == 5
5-17 == -12
So 1337 is not divisible by 17.

Now test divisibility by 7:
1337 => 1330
=>665
=>658
=>329
=>322
=>161
=>154
=>77
=>70
=>35
=>28
=>14
=>7
=>0

So 1337 is divisible by 7.

The worst-case timing is 66*16+31 = 1087

450

ASM / (P)RNG

« on: April 20, 2015, 09:48:20 am »

EDIT:23-June-2015 Runer took a look at my LFSR code and suggested changing it to a Galois LFSR using the inverted mask and the code is much simpler.

I have a bunch of Pseudo-Random Number Generators (PRNGs), so instead of spamming the Optimized Routines topic, I'll post here until I have it all sorted out.

So this is my most recent work and I think zillions times better than previous work. It has a period of 4,294,901,760 (65536*65535) and takes a tiny 308cc in the worst case. It passes a chaos game test that I wrote, it is very nicely distributed (mostly uniform, but not *exactly* so). As a comparison, I wrote a Combined LCG that had a fairly long period (on a similar scale to this), but it took ~1500cc, and yesterday I realized that it might not touch a dust of numbers on it's range (this depends on the upper bound for prime gaps since it used multiplication, and a modulus [which may have remedied the multiplication]).

So the method I chose was to use a Lehmer RNG which is a special case LCG (Linear Congruential Generator) that had a period of 65536 (75*seed mod 65537 ->seed) and a 16-bit LFSR (Linear Feedback Shift Register) with maximal period, so 65535. I then add the two. The whole process cycles every gcd(65535,65536) iterations. However, since they are consecutive integers, they are coprime, so the cycle is of length 65536*65535.

Now I will give all three PRNGs -- the Lehmer RNG, which is pretty good on its own, the LFSR which is very fast, and the combined PRNG:

lehmer:
;;Input:
;;  (seed) has the seed value of the RNG
;;Output:
;;  (seed) is updated, HL is the result
;;Destroys:
;;  A,DE,BC
;;Timing:
;;  if seed>0     231cc or 232cc, condition dependent
;;  if seed=0     91cc
;;  if smc=1      subtract 6cc
;;Size: 44 bytes
;;Notes:
;;    Uses the Lehmer RNG used by the Sinclair ZX81
;;    75x mod 65537 -> x
;;    0 is never a possibility, but 65536, the maximum value, is. I encode 65536 as 0, so the seed is only 2 bytes.
#IF smc == 0
    ld hl,(seed)
#ELSE
seed = $+1
    ld hl,0
#ENDIF
;multiply by 75
    ld c,l
    ld b,h
    xor a
    adc hl,hl \ jr z,special \ ld d,a \ rla
    add hl,hl \ rla
    add hl,hl \ rla \ add hl,bc \ adc a,d
    add hl,hl \ rla
    add hl,hl \ rla \ add hl,bc \ adc a,d
    add hl,hl \ rla \ add hl,bc
;modulo 65537, see note below on how this works
    ld e,a
    sbc hl,de
    jr nc,$+3
    inc hl
    ld (seed),hl
    ret
special:
;In the case that HL=0, this should be interpreted as 65536 = -1 mod 65537, so return -75 mod 65537 = -74 mod 65536 in HL
    ld hl,-74
    ld (seed),hl
    ret
    
;mod by 2^16 + 1 (a prime)
;current form is A*2^16+HL
;need:
;  (A*2^16+HL) mod (2^16+1)
;add 0 as +1-1
;  (A*(2^16+1-1)+HL) mod (2^16+1)
;distribute
;  (A*(2^16+1)-A+HL) mod (2^16+1)
;A*(2^16+1) mod 2^16+1 = 0, so remove
;  (-A+HL) mod (2^16+1)
;Oh hey, that's easy! :P
;I use this trick everywhere, you should, too.

smc=1

LFSR16:
;;Input: (seed1) is non-zero
;;Output: (seed1) updated, HL is the result
;;Destroys: A
;;13 bytes
;;66cc, add 6cc if not using smc
#IF smc == 0
    ld hl,(seed1)
#ELSE
seed1 = $+1
    ld hl,1
#ENDIF
    add hl,hl
    sbc a,a
    and %00101101
    xor l
    ld l,a
    ld (seed1),hl
    ret

smc = 1

prng16:
;;Input:
;;  (seed1) is a non-zero 16-bit int
;;  (seed2) is a 16-bit int
;;Output:
;;  HL is the pseudo random number
;;  DE is the output of the LFSR
;;  BC is the previous seed of the Lehmer PRNG
;;  (seed1) is the output of the LFSR
;;  (seed2) is the output of the Lehmer PRNG
;;Destroys: A
;;Notes:
;;  This uses a 16-bit Lehmer PRNG, and an LFSR
;;  The period is 4,294,901,760 (65536*65535)
;;  Technically,the Lehmer PRNG here can have values from 1 to 65536. In this application, we store 65536 as 0.
;;Speed:
;;    If smc = 0, add 12cc
;;    293+a-c, a,b,c are {0,1}
;;      probability a= 1 is 38/65536
;;      probability c= 1 is 1/65536
;;    Average: 293+39/65536 = 293.00059509277cc
;;    Best:293cc
;;    Worst:294cc
;;50 bytes
#IF smc == 0
    ld hl,(seed2)
#ELSE
seed2 = $+1
    ld hl,0
#ENDIF
;multiply by 75
    ld c,l
    ld b,h
    xor a
    adc hl,hl \ jr nz,$+3 \ inc a \ rla
    add hl,hl \ rla
    add hl,hl \ rla \ add hl,bc \ adc a,d
    add hl,hl \ rla
    add hl,hl \ rla \ add hl,bc \ adc a,d
    add hl,hl \ rla \ add hl,bc
    ld e,a
    sbc hl,de
    jr nc,$+3
    inc hl
    ld (seed2),hl
    ex de,hl
#IF smc == 0
    ld hl,(seed1)
#ELSE
seed1 = $+1
    ld hl,1
#ENDIF
    add hl,hl
    sbc a,a
    and %00101101
    xor l
    ld l,a
    ld (seed1),hl
    add hl,de
    ret

Pages: 1 ... 28 29 [30] 31 32 ... 317

Server load over the past 5, 10 and 15 minutes respectively: 0.87890625, 0.93701171875, 0.6708984375

Page created in 0.093 seconds with 26 queries.