For 73 bytes (10 bytes more), you can save 110 t-states in the worst case. It involves some unrolling:
SqrtHL:
;Integer square root of a 16-bit value (Z80).
;input: HL
;Output: A = floor(sqrt(HL))
;Destroys: B, C, E, H, L, flags
;73 bytes
;639 t-states worst case
;
;The input is consumed two bits at a time, most significant pair first.
;Between steps A (and C) hold twice the root found so far and H holds the
;running remainder. Each step tests remainder >= 4*root+1; if so, that amount
;is subtracted and the new root bit is set.
xor a
ld b,4          ;4 steps use up the 8 bits of the original H
ld e,l          ;park the low input byte in E until it is needed
ld l,h
ld h,a          ;HL = 00:H_in, so every pair of "add hl,hl" feeds 2 bits into H
sqrt16loop:
add hl,hl
add hl,hl       ;bring the next 2 input bits into the remainder (H)
add a,a         ;A = 4*root
ld c,a
sub h           ;carry set when remainder > 4*root, i.e. remainder >= 4*root+1
jr nc,$+6       ;new bit is 0: skip ahead to "ld a,c"
cpl             ;~(4*root - remainder) = remainder - (4*root+1)
ld h,a          ;store the reduced remainder
inc c
inc c           ;C = 2*(2*root+1): new bit set, still in "doubled" form
ld a,c
djnz sqrt16loop
ld l,e          ;now feed in the low input byte
ld b,2          ;2 more looped steps; the remainder still fits in 8 bits here
sqrt16loop2:
add hl,hl
add hl,hl
add a,a
ld c,a
sub h
jr nc,$+6       ;new bit is 0: skip ahead to "ld a,c"
cpl
ld h,a
inc c
inc c
ld a,c
djnz sqrt16loop2
;Seventh step: the shifted remainder may need 9 bits, so the carry out of the
;second "add hl,hl" is treated as bit 8 of the remainder.
add a,a \ ld c,a
add hl,hl
add hl,hl
jr nc,$+6       ;no bit 8: do the ordinary compare at the second "sub h"
sub h \ jp $+6  ;bit 8 set: remainder certainly exceeds A; subtract and jump to "inc c"
sub h
jr nc,$+6       ;new bit is 0: skip ahead to "ld a,l"
inc c \ inc c
cpl
ld h,a
;b=0
;c is the result with its lowest bit still undecided (bit 0 = 0)
;l has two more bits to rotate into h
ld a,l
ld l,h
ld h,b          ;HL = 00:remainder
add a,a \ adc hl,hl
add a,a \ adc hl,hl ;HL = 4*remainder + last 2 input bits (needs up to 10 bits)
ld a,c          ;A = candidate result
;The trial subtrahend must be 2*C+1. SLL (undocumented; some assemblers spell it
;SL1 or SLI) shifts left and forces bit 0 to 1, giving exactly that in the same
;2 bytes / 8 t-states as SLA. With SLA the test was against 2*C, which made
;inputs such as 0, 8 and 24 come out one too large.
sll c \ rl b    ;BC = 2*C+1; "rl b" leaves carry clear because B was 0
sbc hl,bc       ;carry set when HL < 2*C+1
ret c           ;lowest result bit is 0
inc a           ;lowest result bit is 1
ret
But really, if you are going to unroll that far, you should just unroll the whole thing, going up to 108 bytes and down to 543 t-states in the worst case:
SqrtHL:
;Integer square root of a 16-bit value (Z80), fully unrolled.
;input: HL
;Output: A = floor(sqrt(HL))
;Destroys: B, C, E, H, L, flags
;108 bytes
;543 t-states worst case
;Average is about 509 t-states
;
;The input is consumed two bits at a time, most significant pair first.
;Between steps A (and C) hold twice the root found so far and H holds the
;running remainder. Each step tests remainder >= 4*root+1; if so, that amount
;is subtracted and the new root bit is set.
xor a
ld b,a          ;B = 0, needed as the high byte of BC in the final step
ld e,l          ;park the low input byte in E until it is needed
ld l,h
ld h,a          ;HL = 00:H_in, so every pair of "add hl,hl" feeds 2 bits into H
;Step 1: with root = 0 the test reduces to "are the top two bits non-zero?"
add hl,hl
add hl,hl
cp h            ;A = 0 here: carry set when H > 0
jr nc,$+5       ;top pair is 00: skip "dec h" and "ld a,4", leaving A = 0
dec h           ;remainder = H - 1
ld a,4          ;root = 1, preloaded as 4*root so step 2 needs no "add a,a"
;Step 2 (A already holds 4*root)
add hl,hl
add hl,hl
ld c,a
sub h           ;carry set when remainder > 4*root, i.e. remainder >= 4*root+1
jr nc,$+6       ;new bit is 0: skip ahead to "ld a,c"
cpl             ;~(4*root - remainder) = remainder - (4*root+1)
ld h,a          ;store the reduced remainder
inc c
inc c           ;C = 2*(2*root+1): new bit set, still in "doubled" form
ld a,c
;Step 3
add hl,hl
add hl,hl
add a,a         ;A = 4*root
ld c,a
sub h
jr nc,$+6
cpl
ld h,a
inc c
inc c
ld a,c
;Step 4 (last one fed from the original H)
add hl,hl
add hl,hl
add a,a
ld c,a
sub h
jr nc,$+6
cpl
ld h,a
inc c
inc c
ld a,c
ld l,e          ;now feed in the low input byte
;Step 5
add hl,hl
add hl,hl
add a,a
ld c,a
sub h
jr nc,$+6
cpl
ld h,a
inc c
inc c
ld a,c
;Step 6
add hl,hl
add hl,hl
add a,a
ld c,a
sub h
jr nc,$+6
cpl
ld h,a
inc c
inc c
ld a,c
;Step 7: the shifted remainder may need 9 bits, so the carry out of the
;second "add hl,hl" is treated as bit 8 of the remainder.
add a,a \ ld c,a
add hl,hl
add hl,hl
jr nc,$+6       ;no bit 8: do the ordinary compare at the second "sub h"
sub h \ jp $+6  ;bit 8 set: remainder certainly exceeds A; subtract and jump to "inc c"
sub h
jr nc,$+6       ;new bit is 0: skip ahead to "ld a,l"
inc c \ inc c
cpl
ld h,a
;Step 8: C is the result with its lowest bit still undecided. L has been
;shifted left 6 times since "ld l,e", so only its top 2 bits are still input.
ld a,l
ld l,h          ;L = remainder
add a,a         ;carry = first remaining input bit
ld h,a          ;H bit 7 = second remaining input bit, other bits 0
adc hl,hl       ;shifts carry into L; the bit in H bit 7 becomes the new carry
adc hl,hl       ;HL = 4*remainder + last 2 input bits; carry is clear afterwards
ld a,c          ;A = candidate result (bit 0 = 0)
sll c \ rl b    ;undocumented SLL forces bit 0 to 1: BC = 2*C+1; "rl b" leaves carry clear since B was 0
sbc hl,bc       ;carry set when HL < 2*C+1
ret c           ;lowest result bit is 0
inc a           ;lowest result bit is 1
ret