A way to convert a double (real8) to a string using SIMD, with the FPU as an aid. The FPU is used because we want to obtain 16 significant digits.
In accordance with the x64 software conventions, we assume that the number to be converted is in XMM0.
We will use 64-bit code with 32-bit addressing. Addressing this way lets us take advantage of both dialects.
We will use an undocumented convention for returning several values from a function. It exactly mirrors the x64 software conventions, except that it describes where values are placed on exit from the procedure rather than on entry.
To make the code easier to read, we define two groups of text constants: the first gives names to arguments of assembler instructions, and the second gives names to the sizes of the variables kept on the stack, so that it is easy to see which variable is being written or read:
; --- Aliases for instruction arguments ---------------------------------------
; Immediate for roundps: rounding-control field 11b selects truncation.
ROUND_TOWARD_ZERO equ 11b
; Bit index 63 (3Fh), the sign bit of a 64-bit value; used with btr.
SIGNIFICANT_BIT_RESET equ 3Fh
; --- Aliases for the sizes of the stack variables ----------------------------
; Each alias expands to a type name, which the address expressions in the code
; use as that type's size in bytes (esp - LCW - LStX - ...), so an address
; expression spells out which slots lie between esp and the variable accessed.
; Saved x87 control word (2 bytes).
LCW equ word
; Binary exponent of the input (4 bytes).
LIExp2 equ dword
; Decimal scaling exponent (4 bytes); same size as LIExp2, so expressions that
; differ only in these two names denote the same address.
LIExp10 equ dword
; Not referenced in the visible code.
LSExp10 equ dword
; Second 32-bit integer stored by the FPU (4 bytes).
LIUpPathNam equ dword
; First 32-bit integer stored by the FPU (4 bytes).
LILowPathNam equ dword
; Absolute value of the input as a double (8 bytes).
LNamber equ qword
; Double holding a power of two used as a multiplier (8 bytes).
LMulExp2 equ qword
; Saved x87 register ST(0) (10 bytes).
LStX equ tbyte
; Output string buffer: two XMM-sized halves (32 bytes).
LString equ xmmword * 2
We create a segment of auxiliary data used in the calculation. Note in particular that the data for the SIMD registers is aligned on a paragraph (16-byte) boundary so that it can be used directly as a memory operand:
.data
; Constants for the double-to-string conversion.
; The 16-byte rows below are used directly as 128-bit memory operands of SSE
; instructions, which require 16-byte alignment, so the alignment is made
; explicit instead of relying on the position of the rows inside the segment.
align 16
; Broadcast scale factors for splitting digit groups: multiply by 10^-k,
; truncate, multiply back by 10^+k and subtract.
f10m4 real4 4 dup (1.0e-4)
f10p4 real4 4 dup (1.0e+4)
f10m2 real4 4 dup (1.0e-2)
f10p2 real4 4 dup (1.0e+2)
f10m1 real4 4 dup (1.0e-1)
f10p1 real4 4 dup (1.0e+1)
; Per-lane factors used to split the decimal exponent into digits:
; lane 1 -> e/100, lane 2 -> e/10, lane 3 -> e; lane 0 is forced to zero.
f0001 real4 0.0, 1.0e-2, 1.0e-1, 1.0
; Per-lane factors used to remove the higher digits from lanes 2 and 3.
f0002 real4 0.0, 0.0, 1.0e+1, 1.0e+1
; Sixteen ASCII '0' bytes, added to the digit values to obtain characters.
i30h db 10h dup (30h)
; Scale applied to the fractional remainder to obtain the lower 8 digits.
f10p8 real4 1.0e+8
; Subtracted from / compared against the decimal exponent estimate (7.0).
NoSD real4 7.0
; x87 control word: all exceptions masked, extended precision, RC = 11b
; (round toward zero).
CW0 dw 0F7Fh
; x87 control word: all exceptions masked, extended precision, RC = 00b
; (round to nearest).
CW1 dw 037Fh
; Exponent prefix for a negative decimal exponent. It is loaded into AX and
; stored little-endian, so the low byte is the first character: 65h 'e',
; then 2Dh '-'. This matches the positive prefix 2B65h ('e','+') used in the
; code; the previous value 652Dh produced the characters in the order "-e".
DotM dw 2D65h
; Sample input value.
namber real8 -1.234567890123456e+248
Save the FPU state (the control word and the top register) and load the working control word:
; Save the x87 state that the following code disturbs, in scratch slots below
; the stack pointer.
; NOTE(review): esp is not adjusted anywhere in the visible code, so these
; slots are only safe if nothing else writes below the stack pointer while the
; routine runs - confirm for the target environment.
fstcw word ptr[esp - LCW]               ; save the current control word
fstp tbyte ptr[esp - LCW - LStX]        ; pop ST(0) into an 80-bit slot, freeing one x87 register
fldcw CW0                               ; load the control word stored in CW0
Copy the number into RAX and clear its sign bit:
movd rax, xmm0                          ; rax = 64-bit pattern of the input double
btr rax, SIGNIFICANT_BIT_RESET          ; CF = old bit 63 (sign); bit 63 cleared, so rax holds |x|
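Place '-' in front of the string buffer, record 1 if the number has no sign or 0 if it is negative, and save the absolute value of the number: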
; Record the sign next to the string buffer. mov does not change flags, so CF
; still holds the sign bit extracted by the preceding btr.
mov dword ptr[esp - LString - dword], 2D000000h ; bytes in memory: 00 00 00 '-' ('-' sits just below the buffer)
setnc byte ptr[esp - LString - dword - byte]    ; one byte lower: 1 if the input was non-negative, 0 if negative
mov qword ptr[esp - LCW - LStX - LNamber], rax  ; keep |x| in memory for use as an x87 operand
Extract the binary exponent of the number, remove its bias, and store it where the FPU can load it:
; NOTE(review): no special handling of zero, denormals, infinities or NaN is
; visible here; the exponent field is used as-is.
shr rax, 34h                            ; move the exponent field (bits 62..52) down to bit 0
sub eax, 3FFh                           ; remove the double-precision bias (1023)
mov dword ptr[esp - LCW - LStX - LNamber - LIExp2], eax ; store the binary exponent as int32
On the FPU, convert the exponent of 2 into an exponent of 10 by multiplying it by log10(2):
fldlg2                                  ; ST(0) = log10(2)
fimul dword ptr[esp - LCW - LStX - LNamber - LIExp2] ; ST(0) = exp2 * log10(2), an estimate of the decimal exponent
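Subtract the result from NoSD and store it as an integer; this is the power of ten by which the number will be scaled: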
fsubr NoSD                              ; ST(0) = NoSD - ST(0), i.e. 7.0 minus the decimal exponent estimate
; Store as int32 (rounded per the active control word) and pop. LIExp10 and
; LIExp2 are both dword, so this overwrites the binary exponent slot.
fistp dword ptr[esp - LCW - LStX - LNamber - LIExp10]
On the FPU, convert this exponent of 10 back into an exponent of 2 by multiplying it by log2(10):
fldl2t                                  ; ST(0) = log2(10)
fimul dword ptr[esp - LCW - LStX - LNamber - LIExp10] ; ST(0) = scale * log2(10), so 2^ST(0) = 10^scale
Store its integer part:
fist dword ptr[esp - LCW - LStX - LNamber - LIExp10 - LIExp2] ; store ST(0) as int32 one dword lower; ST(0) is kept
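Note that f2xm1 is slow: about 60 clock cycles on Skylake. In addition, f2xm1 only accepts an argument between -1 and +1, so we subtract the integer part and apply it to the fraction: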
fisub dword ptr[esp - LCW - LStX - LNamber - LIExp10 - LIExp2] ; ST(0) = ST(0) - stored integer part (the fraction)
f2xm1                                   ; ST(0) = 2^ST(0) - 1
Load the decimal scaling exponent into element 0 of the XMM register as a float:
cvtsi2ss xmm0, dword ptr[esp - LCW - LStX - LNamber - LIExp10] ; xmm0[0] = (float) stored decimal scaling exponent
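Load the integer part of the binary exponent into EAX and build from it a double equal to that power of two: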
; Build the bit pattern of the double 2^n from the integer exponent n.
; NOTE(review): only the low 12 bits of the biased value survive the shift, so
; the result is a valid power of two only while n + 3FFh stays within the
; normal exponent range - confirm the range of inputs this must support.
mov eax, dword ptr[esp - LCW - LStX - LNamber - LIExp10 - LIExp2] ; eax = n (integer part stored by fist)
add ax, 3FFh                            ; add the double-precision bias (16-bit add)
shl rax, 34h                            ; move the biased exponent into bits 63..52
; The 8-byte store below starts at the LIExp10-LIExp2 address and therefore
; also overwrites the LIExp10 dword above it.
mov qword ptr[esp - LCW - LStX - LNamber - LMulExp2], rax
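Compute the value that determines the printed decimal exponent; if it is zero, skip building the exponent suffix: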
xor edx, edx                            ; rdx = 0: default is an empty exponent suffix
subss xmm0, NoSD                        ; xmm0[0] = scaling exponent - 7.0
pxor xmm1, xmm1                         ; xmm1 = 0.0 in every lane
comiss xmm1, xmm0                       ; compare 0.0 with xmm0[0]: ZF=1 if equal, CF=1 if 0.0 < xmm0[0]
jz @f                                   ; value is zero: jump forward to the next @@ label
Take the absolute value of the exponent in all four elements:
shufps xmm0, xmm0, 0                    ; broadcast element 0 to all four lanes
subps xmm1, xmm0                        ; xmm1 = xmm1 - xmm0 (the negated value when xmm1 enters as zero)
maxps xmm0, xmm1                        ; per-lane maximum of the value and its negation = |value|
Split the exponent into its hundreds, tens and units digits:
; With e in every lane on entry, leave (0, hundreds, tens, units) in xmm0.
mulps xmm0, xmmword ptr f0001           ; lanes = (0, e*0.01, e*0.1, e)
roundps xmm0, xmm0, ROUND_TOWARD_ZERO   ; truncate: (0, e/100, e/10, e) as whole numbers
pshufd xmm1, xmm0, 10010000b            ; xmm1 = (lane0, lane0, lane1, lane2) = (0, 0, e/100, e/10)
mulps xmm1, xmmword ptr f0002           ; xmm1 = (0, 0, 10*(e/100), 10*(e/10))
subps xmm0, xmm1                        ; subtract the higher digits from lanes 2 and 3
Convert the digits to integers, build a mask of the zero digits, and pack the digits into bytes:
cvtps2dq xmm0, xmm0                     ; lanes to int32
pxor xmm1, xmm1                         ; xmm1 = 0
pcmpeqd xmm1, xmm0                      ; xmm1 lane = all ones where the int32 lane of xmm0 is zero
packusdw xmm0, xmm0                     ; int32 -> unsigned 16-bit, saturating
packuswb xmm0, xmm0                     ; 16-bit -> unsigned 8-bit; low dword = the four lanes as bytes
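Select the exponent prefix in AX: the '-' variant (the DotM constant) when the decimal exponent is negative, otherwise the '+' variant: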
; CF is still the one produced by the earlier comiss: the SSE instructions
; executed since then do not write EFLAGS.
mov eax, 2B65h                          ; AL = 65h 'e', AH = 2Bh '+'
cmovc ax, DotM                          ; if CF=1, take the two characters stored in DotM instead
Assemble the exponent suffix in RDX:
; Assemble the exponent suffix in RDX: prefix in bytes 0-1, then the exponent
; digits without leading zeros, then zero bytes.
; In:  xmm1 = per-lane mask, all ones where the digit lane is zero
;      xmm0 = low dword holds the bytes (0, hundreds, tens, units)
;      ax   = the two prefix characters
; Out: rdx  = suffix characters; ecx and flags are clobbered.
;
; The rotation amount must depend on the position of the FIRST non-zero digit.
; Taking bsr of the zero-lane mask instead picks the LAST zero lane, which is
; wrong whenever a zero follows a non-zero digit (10, 100, 105, ...) and drops
; digits from the output.
movmskps ecx, xmm1                      ; bit i = 1 where digit lane i is zero (bit 0 is always 1)
not ecx                                 ; bit i = 1 where digit lane i is non-zero; upper bits are 1 too
bsf ecx, ecx                            ; ecx = lane of the most significant non-zero digit (1..3)
lea ecx,[ecx * 8 - 16]                  ; rotation in bits: lane 1 -> -8, lane 2 -> 0, lane 3 -> +8
movd edx, xmm0                          ; edx = digit bytes (0, h, t, u); upper half of rdx is zero
add edx, 30303000h                      ; convert the three digit bytes to ASCII, leave byte 0 as 0
shrd rdx, rdx, cl                       ; rotate right by cl (taken modulo 64) so the first digit lands in byte 2
mov dx, ax                              ; overwrite bytes 0-1 with the prefix
Multiply the result of f2xm1 by the integer power of two:
; Target of the preceding "jz @f".
@@: fmul qword ptr[esp - LCW - LStX - LNamber - LMulExp2] ; ST(0) = ST(0) * stored power-of-two double
Add the integer power of two, which gives the complete power of ten:
fadd qword ptr[esp - LCW - LStX - LNamber - LMulExp2] ; (2^f - 1)*2^n + 2^n = 2^(n+f)
Multiply by the absolute value of the number:
fmul qword ptr[esp - LCW - LStX - LNamber] ; ST(0) = ST(0) * |x| (the value saved earlier)
Store the integer part, which holds the upper digits:
; Store ST(0) as int32 and keep it on the stack. The target has the same
; address as the low dword of the LNamber slot, which is no longer needed.
fist dword ptr[esp - LCW - LStX - LILowPathNam - LIUpPathNam]
Subtract the stored integer part and multiply the remainder by 10^8 to obtain the lower digits:
fisub dword ptr[esp - LCW - LStX - LILowPathNam - LIUpPathNam] ; ST(0) = ST(0) - stored integer
fmul f10p8                              ; ST(0) = ST(0) * 1.0e+8
Switch the rounding mode and store the lower digits:
fldcw CW1                               ; load the control word stored in CW1
fistp dword ptr[esp - LCW - LStX - LIUpPathNam] ; store as int32 in the dword just above the previous one, and pop
Restore the FPU state:
fld tbyte ptr[esp - LCW - LStX]         ; push the saved 80-bit value back as ST(0)
fldcw word ptr[esp - LCW]               ; reload the saved control word
Load the two stored integers into the XMM register, starting at element 0, and convert them to float:
movq xmm0, qword ptr[esp - LCW - LStX - LIUpPathNam - LILowPathNam] ; lanes 0,1 = the two stored int32 values; lanes 2,3 = 0
; NOTE(review): single precision has a 24-bit significand, so integers above
; 2^24 are not all exactly representable after this conversion.
cvtdq2ps xmm0, xmm0                     ; int32 lanes -> float lanes
Split each half into two groups of four digits:
; Split each 8-digit half into two 4-digit groups.
; Out: xmm0 = (hi4(first), lo4(first), hi4(second), lo4(second)) as floats,
;      the same lane layout the float-based split produced.
; Clobbers: eax, xmm1, xmm2 (xmm1 and xmm2 were scratch before as well).
;
; The split is done on the integers, which are still in their stack slots,
; instead of on their float conversions: a float has only a 24-bit
; significand, so an 8-digit value above 2^24 is already rounded before it is
; split (99999999 becomes 100000000) and its low digits come out wrong.
; The value of xmm0 on entry is therefore not used.
movq xmm0, qword ptr[esp - LCW - LStX - LIUpPathNam - LILowPathNam] ; (A, B, 0, 0) as int32
pshufd xmm0, xmm0, 11011000b            ; (A, 0, B, 0): pmuludq reads the even lanes
mov eax, 0D1B71759h                     ; ceil(2^45 / 10000): exact reciprocal for any 32-bit dividend
movd xmm1, eax
pshufd xmm1, xmm1, 0                    ; broadcast the reciprocal
pmuludq xmm1, xmm0                      ; 64-bit products A*m and B*m
psrlq xmm1, 45                          ; (A/10000, 0, B/10000, 0)
mov eax, 10000
movd xmm2, eax
pshufd xmm2, xmm2, 0                    ; broadcast 10000
pmuludq xmm2, xmm1                      ; (10000*(A/10000), 0, 10000*(B/10000), 0)
psubd xmm0, xmm2                        ; (A mod 10000, 0, B mod 10000, 0)
pslldq xmm0, 4                          ; move the remainders up one lane: (0, rA, 0, rB)
por xmm0, xmm1                          ; (qA, rA, qB, rB)
cvtdq2ps xmm0, xmm0                     ; values are below 10000, so the conversion is exact
Split each group of four digits into two groups of two digits:
; Per lane: xmm0 = value / 100 (truncated), xmm1 = value mod 100.
movaps xmm1, xmm0                       ; keep a copy of the values
mulps xmm0, xmmword ptr f10m2           ; scale by 1.0e-2
roundps xmm0, xmm0, ROUND_TOWARD_ZERO   ; truncate: upper part of each value
movaps xmm2, xmm0
mulps xmm2, xmmword ptr f10p2           ; upper part * 1.0e+2
subps xmm1, xmm2                        ; remainder: lower part of each value
Split the lower two-digit groups into individual digits:
; Per lane of xmm1: split into tens and units and leave them as two bytes of
; the int32 lane (byte 0 = tens, byte 1 = units).
movaps xmm2, xmm1                       ; copy of the values
mulps xmm1, xmmword ptr f10m1           ; scale by 1.0e-1
roundps xmm1, xmm1, ROUND_TOWARD_ZERO   ; truncate: tens
movaps xmm3, xmm1
mulps xmm3, xmmword ptr f10p1           ; tens * 1.0e+1
subps xmm2, xmm3                        ; remainder: units
cvtps2dq xmm1, xmm1                     ; tens -> int32
cvtps2dq xmm2, xmm2                     ; units -> int32
pslld xmm2, 8                           ; move units to byte 1 of each lane
paddb xmm1, xmm2                        ; combine: byte 0 = tens, byte 1 = units
Split the upper two-digit groups into individual digits:
; Per lane of xmm0: split into tens and units and leave them as two bytes of
; the int32 lane (byte 0 = tens, byte 1 = units).
movaps xmm2, xmm0                       ; copy of the values
mulps xmm0, xmmword ptr f10m1           ; scale by 1.0e-1
roundps xmm0, xmm0, ROUND_TOWARD_ZERO   ; truncate: tens
movaps xmm3, xmm0
mulps xmm3, xmmword ptr f10p1           ; tens * 1.0e+1
subps xmm2, xmm3                        ; remainder: units
cvtps2dq xmm0, xmm0                     ; tens -> int32
cvtps2dq xmm2, xmm2                     ; units -> int32
pslld xmm2, 8                           ; move units to byte 1 of each lane
paddb xmm0, xmm2                        ; combine: byte 0 = tens, byte 1 = units
Merge all the digits into sixteen bytes:
pslld xmm1, 16                          ; move the bytes held in xmm1 up to bytes 2 and 3 of each lane
paddb xmm0, xmm1                        ; each lane now carries four bytes: xmm0's in 0-1, xmm1's in 2-3
Determine the position at which the digits end:
; NOTE(review): pmovmskb fills only bits 0..15 of eax and bts then sets
; bit 16, so bit 16 is always the highest set bit and bsr always yields 16,
; regardless of the digits. If trimming of zero digits was intended, this
; sequence does not do it - confirm the intent.
pxor xmm3, xmm3                         ; xmm3 = 0
pcmpeqb xmm3, xmm0                      ; byte = FFh where the digit byte of xmm0 is zero
pmovmskb eax, xmm3                      ; eax bits 0..15 = that mask
bts eax, 10h                            ; set bit 16
bsr eax, eax                            ; eax = index of the highest set bit
Convert the digits to characters and store them in the string buffer:
paddb xmm0, xmmWord ptr i30h            ; add 30h to every byte: digit values -> ASCII characters
movdqu [esp - LString + byte], xmm0     ; store the 16 characters starting one byte into the string buffer
Append the exponent suffix after the digits:
mov qword ptr[esp - LString + byte + eax], rdx ; write the 8 bytes of rdx at offset eax past the first stored digit
Compute the total length of the string:
; Length of the suffix held in rdx = index of its first zero byte.
movd xmm0, rdx                          ; copy the suffix bytes into xmm0 (upper half zero)
pxor xmm1, xmm1                         ; xmm1 = 0
pcmpeqb xmm1, xmm0                      ; byte = FFh where the suffix byte is zero
pmovmskb edx, xmm1                      ; edx = mask of zero bytes
bsf edx, edx                            ; edx = index of the first zero byte
lea eax,[eax + edx + word + byte]       ; eax = eax + suffix length + 3
Insert '.' after the first digit:
mov dl,[esp - LString + byte]           ; dl = first digit character
mov dh,'.'                              ; dh = decimal point
mov [esp - LString], dx                 ; buffer byte 0 = first digit, byte 1 = '.' (replacing the digit's old copy)
Take the sign into account in the length:
; The dword read here starts at the byte written by setnc and continues into
; the three zero bytes stored after it, so ecx is 1 for a non-negative input
; and 0 for a negative one.
mov ecx, dword ptr[esp - LString - dword - byte]
sub eax, ecx                            ; one character fewer when there is no '-'
Load the first and second 16-byte halves of the string into registers 1 and 2 (XMM1 and XMM2):
; Start one byte before the buffer (at the '-') when ecx = 0, or at the buffer
; itself when ecx = 1.
movdqu xmm1, xmmword ptr[esp - LString + ecx - byte]            ; first 16 bytes of the string
movdqu xmm2, xmmword ptr[esp - LString + ecx - byte + xmmword]  ; next 16 bytes
Return the length in ECX:
mov ecx, eax                            ; ecx = computed length
- .
- .
Why are the FPU and SIMD used together? Because the FPU has an extended-precision mode that makes it possible to extract 16 significant digits.