0 Members and 6 Guests are viewing this topic.
mul16:;;Expects Z80 mode;;Inputs: hl,bc;;Outputs: Upper 16 bits in DE, lower 16 bits in BC;;54 or 56 t-states;;30 bytes ld d,c \ ld e,l \ mlt de \ push de ;11 ld d,h \ ld e,b \ mlt de ;8 ld a,l \ ld l,c \ ld c,a ;3 mlt hl \ mlt bc \ add hl,bc ;13 jr nc,$+3 \ inc de \ pop bc ;6 ld a,b \ add a,l \ ld b,a ;3 ld a,e \ adc a,h \ ld e,a ;3 ret nc \ inc d \ ret ;7 (+2 for carry)
mul32:;;Expects Z80 mode;;Inputs: ix points to the first 32-bit number;; ix+4 points to the next 32-bit number;;Outputs: ix+8 points to the 64-bit result;;Algorithm: Karatsuba;348cc to 375cc;compute z0 ld hl,(ix) ;5 ld bc,(ix+4) ;5 call mul16 ;59+2 ld (ix+8),bc ;5 ld (ix+10),de ;5;compute z2 ld hl,(ix+2) ;5 ld bc,(ix+6) ;5 call mul16 ;59+2 ld (ix+12),bc ;5 ld (ix+14),de ;5;compute z1, most complicated as it requires 17-bits of info for each factor ld hl,(ix+2) ;5 ld bc,(ix) ;5 add hl,bc ;1 rla ;1 ld b,h ;1 ld c,l ;1 ld hl,(ix+6) ;5 ld de,(ix+4) ;5 add hl,de ;1 rla ;1 push hl ;3 push bc ;3 push af ;3 call mul16 ;59+2 pop af ;3 and 3 ;2 ex de,hl ;1 pop de ;3 jr z,$+7 ;label0 ;3|(6+1);bit 0 means add [stack0] to the upper 16 bits;bit 1 means add [stack1] to the upper 16 bits;both mean add 2^32 jp po,$+5 \ or 4 ;-- rra \ jr nc,$+7 ;4+4 rrca \ add hl,bc \ adc a,0 \ rlca ;-- ; srl a \ pop de \ jr nc,$+5 ;8+2 add hl,bc \ adc a,0 ; ld d,b \ ld e,c ;2;z1 = AHLDE-z0-z2 ld bc,(ix+8) \ ex de,hl \ sbc hl,bc ;8 ld bc,(ix+10) \ ex de,hl \ sbc hl,bc \ sbc a,0 ;10 ld bc,(ix+12) \ ex de,hl \ sbc hl,bc ;8 ld bc,(ix+14) \ ex de,hl \ sbc hl,bc \ sbc a,0 ;10 ex de,hl ;1 ld bc,(ix+10) \ add hl,bc \ ld (ix+10),hl \ ex de,hl ;12 ld bc,(ix+12) \ adc hl,bc \ ld (ix+12),hl \ adc a,0 ;13 ret z \ ld hl,(ix+14) \ add a,l \ ld l,a ;7+16 jr nc,$+3 \ inc h \ ld (ix+14),hl \ ret ;--
GCD16:;;Expect Z80 mode;;Inputs: HL,DE;;Output: HL;; BC=0;; DE=0;; A=0;Binary GCD algorithm;About 432cc on average (average of 500 iterations with randomized inputs on [0,65535];78 bytes xor a ld b,a ld c,a sbc hl,bc ex de,hl ret z sbc hl,bc ex de,hl ret z;factor out cofactor powers of two ld a,l \ or e \ rra \ jr nc,$+16 srl h \ rr l srl d \ rr e inc b ld a,l \ or e \ rra \ jr nc,$-12.loop:;factor out powers of 2 from hl ld a,l \ rra \ ld a,h \ jr c,$+10 rra \ rr l \ bit 0,l \ jr z,$-5 \ ld h,a;factor out powers of 2 from de ld a,e \ rra \ ld a,d \ jr c,$+10 rra \ rr e \ bit 0,e \ jr z,$-5 \ ld d,a xor a sbc hl,de jr z,finish jr nc,loop add hl,de or a ex de,hl sbc hl,de jr loop.finish: ex de,hl dec b inc b ret z add hl,hl \ djnz $-1 \ ret
sqrt16:;;Expext Z80 mode;;Inputs: HL;;Output: A;Unrolled, speed optimized.;At most 112 clock cycles;111 bytes. xor a \ ld c,a \ ld d,a \ ld e,l \ ld l,h \ ld h,c add hl,hl \ add hl,hl \ sub h \ jr nc,$+5 \ inc c \ cpl \ ld h,a add hl,hl \ add hl,hl \ rl c \ ld a,c \ rla sub h \ jr nc,$+5 \ inc c \ cpl \ ld h,a add hl,hl \ add hl,hl \ rl c \ ld a,c \ rla sub h \ jr nc,$+5 \ inc c \ cpl \ ld h,a add hl,hl \ add hl,hl \ rl c \ ld a,c \ rla sub h \ jr nc,$+5 \ inc c \ cpl \ ld h,a add hl,hl \ add hl,hl \ rl c \ ld a,c \ rla sub h \ jr nc,$+5 \ inc c \ cpl \ ld h,a add hl,hl \ add hl,hl \ rl c \ ld a,c \ rla sub h \ jr nc,$+5 \ inc c \ cpl \ ld h,a sla c \ ld a,c \ add a,a \ add hl,hl \ add hl,hl jr nc,$+5 \ sub h \ jr $+5 \ sub h \ jr nc,$+5 \ inc c \ cpl \ ld h,a ld e,h \ ld h,l \ ld l,c \ ld a,l add hl,hl \ rl e \ rl d add hl,hl \ rl e \ rl d sbc hl,de \ rla \ ret
Rand24:;;Expects Z80 mode;;Inputs: seed1,seed2;;Outputs:;; AHL is the pseudo-random number;; seed1,seed2 incremented accordingly;;Destroys: BC,DE;;Notes:;; seed1*243+83 mod 65519 -> seed1;; seed2*251+43 mod 65521 -> seed2;; Output seed1*seed2 mod 16777259;;215cc worst case ld hl,(seed1) ld d,h ld h,243 ld e,h mlt hl mlt de ld a,83 add a,l ld l,a ld a,h adc a,e ld h,a ld a,d adc a,0;now AHL mod 65519; Essentially, we can at least subtract A*65519=A(65536-17), so add A*17 to HL ld c,a ld b,17 mlt bc add hl,bc ld de,65519 jr nc,$+5 or a \ sbc hl,de or a \ sbc hl,de jr nc,$+3 add hl,de ld (seed1),hl;seed1 done, now we need to do seed2 ld hl,(seed2) ld d,h ld h,251 ld e,h mlt hl mlt de ld a,43 add a,l ld l,a ld a,h adc a,e ld h,a ld a,d adc a,0;now AHL mod 65521; Essentially, we can at least subtract A*65521=A(65536-15), so add A*15 to HL ld c,a ld b,15 mlt bc add hl,bc ld de,65521 jr nc,$+5 or a \ sbc hl,de or a \ sbc hl,de jr nc,$+3 add hl,de ld (seed2),hl;now seed1 and seed 2 are computed;seed1*seed2 mod 16777259 ld bc,(seed1) call mul16 ; -> DEBC;16777259 = 2^24+43;D2^24+EBC mod 16777259 = EBC+(D*2^24+43D-43D) mod 16777259;D2^24+EBC mod 16777259 = EBC+(16777259D-43D) mod 16777259;D2^24+EBC mod 16777259 = EBC-43D mod 16777259 ld a,e ld e,43 mlt de;ABC-de ld h,b \ ld l,c or a sbc hl,de sbc a,0 ret nc ld bc,43 add hl,bc ret
setBuf:;;Any mode;;Inputs: A is the byte to set each byte in the buffer to;; HL points to the buffer;; BC is the buffer size, greater than 1;;8 bytes;;14+3*BC clock cycles ld (hl),a dec bc ld d,h ld e,l inc de ldir ret
fastMul_Ndiv255:;;Expects Z80 mode;;Description: Multiplies HL by A/255;;Inputs: A,HL;;Outputs: HL is the result;; A is a copy of the upper byte of the result;; DE is the product of the input A,H;; B is a copy of E;; C is a copy of the upper byte of the product of inputs A,L;;32cc;;18 bytes ld d,a ld e,h ld h,d mlt hl mlt de;15;DE; DE; HL; HL ld c,h ld b,e ld a,d add hl,bc \ adc a,0 add hl,de \ adc a,0 ld l,h \ ld h,a ret;17
fastMul_Ndiv255_fix:;;Expects Z80 mode;;Description: Multiplies HL by A/255;;Inputs: A,HL;;Outputs: HL is the result;;52cc;;26 bytes push af ld d,a ld e,l ld l,d mlt hl mlt de;HL; HL; DE; DE ld c,d ld b,l ld a,h add hl,de \ adc a,0 add hl,bc \ adc a,0 ld d,l \ ld l,h \ ld h,a pop af \ add a,e ret nc adc a,d ret nc inc hl ret
fastMul_Ndiv255_round:;;Expects Z80 mode;;Description: Multiplies HL by A/255;;Inputs: A,HL;;Outputs: HL is the result;;37cc;;23 bytes ld d,a ld e,l ld l,d mlt hl mlt de;15;HL; HL; DE; DE ld c,d ld b,l ld a,h add hl,de \ adc a,0 add hl,bc \ adc a,0 ld l,h \ ld h,a sla e \ jr nc,$+3 \ inc hl ret;22
sqrt24:;;Expects ADL mode;;Inputs: HL;;Outputs: DE is the integer square root;; HL is the difference inputHL-DE^2;; c flag reset xor a \ ld b,l \ push bc \ ld b,a \ ld d,a \ ld c,a \ ld l,a \ ld e,a;Iteration 1 add hl,hl \ rl c \ add hl,hl \ rl c sub c \ jr nc,$+6 \ inc e \ inc e \ cpl \ ld c,a;Iteration 2 add hl,hl \ rl c \ add hl,hl \ rl c \ rl e \ ld a,e sub c \ jr nc,$+6 \ inc e \ inc e \ cpl \ ld c,a;Iteration 3 add hl,hl \ rl c \ add hl,hl \ rl c \ rl e \ ld a,e sub c \ jr nc,$+6 \ inc e \ inc e \ cpl \ ld c,a;Iteration 4 add hl,hl \ rl c \ add hl,hl \ rl c \ rl e \ ld a,e sub c \ jr nc,$+6 \ inc e \ inc e \ cpl \ ld c,a;Iteration 5 add hl,hl \ rl c \ add hl,hl \ rl c \ rl e \ ld a,e sub c \ jr nc,$+6 \ inc e \ inc e \ cpl \ ld c,a;Iteration 6 add hl,hl \ rl c \ add hl,hl \ rl c \ rl e \ ld a,e sub c \ jr nc,$+6 \ inc e \ inc e \ cpl \ ld c,a;Iteration 7 add hl,hl \ rl c \ add hl,hl \ rl c \ rl b ex de,hl \ add hl,hl \ push hl \ sbc hl,bc \ jr nc,$+8 ld a,h \ cpl \ ld b,a ld a,l \ cpl \ ld c,a pop hl jr nc,$+4 \ inc hl \ inc hl ex de,hl;Iteration 8 add hl,hl \ ld l,c \ ld h,b \ adc hl,hl \ adc hl,hl ex de,hl \ add hl,hl \ sbc hl,de \ add hl,de \ ex de,hl jr nc,$+6 \ sbc hl,de \ inc de \ inc de;Iteration 9 pop af rla \ adc hl,hl \ rla \ adc hl,hl ex de,hl \ add hl,hl \ sbc hl,de \ add hl,de \ ex de,hl jr nc,$+6 \ sbc hl,de \ inc de \ inc de;Iteration 10 rla \ adc hl,hl \ rla \ adc hl,hl ex de,hl \ add hl,hl \ sbc hl,de \ add hl,de \ ex de,hl jr nc,$+6 \ sbc hl,de \ inc de \ inc de;Iteration 11 rla \ adc hl,hl \ rla \ adc hl,hl ex de,hl \ add hl,hl \ sbc hl,de \ add hl,de \ ex de,hl jr nc,$+6 \ sbc hl,de \ inc de \ inc de;Iteration 11 rla \ adc hl,hl \ rla \ adc hl,hl ex de,hl \ add hl,hl \ sbc hl,de \ add hl,de \ ex de,hl jr nc,$+6 \ sbc hl,de \ inc de \ inc de rr d \ rr e \ ret
div16:;;Inputs: DE is the numerator, BC is the divisor;;Outputs: DE is the result;; A is a copy of E;; HL is the remainder;; BC is not changed;140 bytes;145cc xor a \ sbc hl,hl ld a,d rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc rla \ cpl \ ld d,a ld a,e rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc rla \ adc hl,hl \ sbc hl,bc \ jr nc,$+3 \ add hl,bc rla \ cpl \ ld e,a ret
prng24:;;Expects ADL mode.;;Output: HL;;50cc;;33 bytes;;cycle length: 281,474,959,933,440 (about 2.8 trillion) ld de,(seed1) or a sbc hl,hl add hl,de add hl,hl add hl,hl inc l add hl,de ld (seed1),hl ld hl,(seed2) add hl,hl sbc a,a and %00011011 xor l ld l,a ld (seed2),hl add hl,de ret
That "one in ten million" is when bc = 1? How does pipelining work for an instruction like ldir (or any of the other repeat instructions)?
atan8:;returns H=256*arctan(E/256);48cc if ADL mode;takes 164cc to call this on the TI-84+CE ld c,201 ld b,e mlt bc ;x*201 xor a sub e ld d,a mlt de ;x(256-x) ld l,e ld h,70 ld e,h mlt de ;upper bytes mlt hl ;lower bytes ld a,e add a,h ld l,a ld h,d jr nc,$+3 inc h add hl,bc ret