ARM64与x86_64浮点运算精度比较

浮点数（floating-point number）是属于有理数中某特定子集的数的数字表示，在计算机中用以近似表示任意某个实数。具体来说，这个实数由一个整数或定点数（即尾数）乘以某个基数（计算机中通常是2）的整数次幂得到，这种表示方法类似于基数为10的科学记数法。

大部分计算机采用二进制（b=2）的表示方法。位（bit）是衡量浮点数所需存储空间的单位，通常为32位或64位，分别被叫作单精度和双精度。

浮点数精度标准有：

IEEE 754
16-bit: Half (binary16)
32-bit: Single (binary32), decimal32
64-bit: Double (binary64), decimal64
128-bit: Quadruple (binary128), decimal128
Extended precision formats

计算机内部表示

Type	Sign	Exponent	Significand field	Total bits	Exponent bias	Bits precision	Number of decimal digits
Half(IEEE 754-2008)	1	5	10	16	15	11	~3.3
Single	1	8	23	32	127	24	~7.2
Double	1	11	52	64	1023	53	~15.9
x86 extended precision	1	15	64	80	16383	64	~19.2
Quad	1	15	112	128	16383	113	~34.0

有一些CPU架构提供更大的浮点数，例如Intel的浮点运算单元8087协处理器（以及其被集成进x86处理器中的后代产品）提供80位长的浮点数，用于存储浮点运算的中间结果。还有一些系统提供128位的浮点数（通常用软件实现）。

对比ARM64与x86_64单精度/双精度浮点运算精度，测试代码如下：

#include 	<float.h>
#include 	<stdio.h>
#include 	<stdlib.h>
#include 	<string.h>
#include 	<errno.h>
#include        <limits.h>
#include        <unistd.h>
#include        <fcntl.h>
#include        <errno.h>
#include        <sys/signal.h>
#include        <math.h>

/*
 * Floating-point comparison test.
 *
 * Performs the same subtraction and division on float and double operands,
 * plus an unsigned long long division (integer and converted-to-double),
 * and prints every result with far more digits than the type can hold so
 * that the outputs of two machines can be compared digit by digit.
 *
 * Returns 0 unconditionally.
 */
int main(void)
{
   /* Single-precision operands: the double constants are converted to float
    * on initialization, so most of the written digits are not retained. */
   float  a=120.31234567890123456789765;
   float  b=123.52345678901234567890123;
   float  c=  1.12345678901234567890123;
   float  d=  3.09876543210987654321987;
   float  e,f,g;

   /* Double-precision operands, again written with more digits than the
    * type can represent. */
   double a1=120.31234567890123456789312345678901234567890123456789654;
   double b1=123.52345678901234567890152345678901234567890145678945678;
   double c1=  1.12345678901234567891234567890152345678989014567894234;
   double d1=  3.09876543210987654321234562345623456234562345625678981;
   double e1,f1,g1;

   /* NOTE(review): these literals do not fit in unsigned long long; GCC
    * reports "integer constant is too large for its type" for both lines,
    * so the stored values are not the decimal numbers written here.
    * They are kept unchanged so the published results stay reproducible. */
   unsigned long long l_a=5234567890123456789015234567890123456789014567894567876543219812;
   unsigned long long l_b=7234567890123456789015234567890126456789014567894567876543219812;
   unsigned long long l_c;
   double l_d;

   /* Single-precision arithmetic. */
   e=b-a;
   f=c/d;
   g=d-c;

   /* Double-precision arithmetic on the corresponding operands. */
   e1=b1-a1;
   f1=c1/d1;
   g1=d1-c1;

   /* Integer division (truncating) versus division after converting both
    * operands to double. */
   l_c = l_b/l_a;
   l_d = (double)l_b/(double)l_a;

   /* float arguments are promoted to double when passed to printf, so %f is
    * the correct conversion for both the float and the double sections. */
   printf("\n===========Single Float Test===================\n");
   printf("\nb:%f(123.52345678901234567890123)-a:%f(120.31234567890123456789765) = %3.23f\n", b,a,e);
   printf("c:%f(1.12345678901234567890123)/d:%f(3.09876543210987654321987) = %3.23f\n", c,d,f);
   printf("d:%f(3.09876543210987654321987)-c:%f(1.12345678901234567890123) = %3.23f\n", d,c,g);

   printf("\n************Double Float Test***************\n");
   printf("\nb1:%f(123.52345678901234567890152345678901234567890145678945678)-a1:%f(120.31234567890123456789312345678901234567890123456789654) = %3.52f\n", b1,a1,e1);
   printf("c1:%f(1.12345678901234567891234567890152345678989014567894234)/d1:%f(3.09876543210987654321234562345623456234562345625678981) = %3.52f\n", c1,d1,f1);
   printf("d1:%f(3.09876543210987654321234562345623456234562345625678981)-c1:%f(1.12345678901234567891234567890152345678989014567894234) = %3.52f\n", d1,c1,g1);

   /* This banner repeats the "Double Float Test" title although the section
    * prints the unsigned long long division results; the banner text and the
    * "unsinged" spelling below are kept as-is to match the published output. */
   printf("\n<<<<<<<<<<<<<<<Double Float Test>>>>>>>>>>>>>>>\n");
   printf("\n unsigned long long divide l_b/l_a = %llu \n", l_c);
   printf("unsinged long long divide to double:%3.52f\n", l_d);


   return 0;
}

分别在ARM64与x86_64机器上运行，结果表明，两个输出完全一致。说明ARM64单精度/双精度浮点运算结果与x86_64完全一致。

x86_64机器运行结果：

-> ./float_test

===========Single Float Test===================

b:123.523460(123.52345678901234567890123)-a:120.312347(120.31234567890123456789765) = 3.21111297607421875000000

c:1.123457(1.12345678901234567890123)/d:3.098765(3.09876543210987654321987) = 0.36254981160163879394531

d:3.098765(3.09876543210987654321987)-c:1.123457(1.12345678901234567890123) = 1.97530853748321533203125

************Double Float Test***************

b1:123.523457(123.52345678901234567890152345678901234567890145678945678)-a1:120.312346(120.31234567890123456789312345678901234567890123456789654) = 3.2111111101111049492828897200524806976318359375000000

c1:1.123457(1.12345678901234567891234567890152345678989014567894234)/d1:3.098765(3.09876543210987654321234562345623456234562345625678981) = 0.3625498004369470117502771699946606531739234924316406

d1:3.098765(3.09876543210987654321234562345623456234562345625678981)-c1:1.123457(1.12345678901234567891234567890152345678989014567894234) = 1.9753086430975308473989571211859583854675292968750000

<<<<<<<<<<<<<<<Double Float Test>>>>>>>>>>>>>>>

unsigned long long divide l_b/l_a = 6

unsinged long long divide to double:6.0746380502316048577426954580005258321762084960937500

ARM64机器运行结果：

-bash-4.3# /mnt/install_aarch64/bin/gcc -o float_test float_test.c

float_test.c: In function ‘main’:

float_test.c:27:27: warning: integer constant is too large for its type

unsigned long long l_a=5234567890123456789015234567890123456789014567894567876543219812;

float_test.c:28:27: warning: integer constant is too large for its type

unsigned long long l_b=7234567890123456789015234567890126456789014567894567876543219812;

-bash-4.3# ./float_test

===========Single Float Test===================

b:123.523460(123.52345678901234567890123)-a:120.312347(120.31234567890123456789765) = 3.21111297607421875000000

c:1.123457(1.12345678901234567890123)/d:3.098765(3.09876543210987654321987) = 0.36254981160163879394531

d:3.098765(3.09876543210987654321987)-c:1.123457(1.12345678901234567890123) = 1.97530853748321533203125

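************Double Float Test***************

b1:123.523457(123.52345678901234567890152345678901234567890145678945678)-a1:120.312346(120.31234567890123456789312345678901234567890123456789654) = 3.2111111101111049492828897200524806976318359375000000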

c1:1.123457(1.12345678901234567891234567890152345678989014567894234)/d1:3.098765(3.09876543210987654321234562345623456234562345625678981) = 0.3625498004369470117502771699946606531739234924316406

d1:3.098765(3.09876543210987654321234562345623456234562345625678981)-c1:1.123457(1.12345678901234567891234567890152345678989014567894234) = 1.9753086430975308473989571211859583854675292968750000

<<<<<<<<<<<<<<<Double Float Test>>>>>>>>>>>>>>>

unsigned long long divide l_b/l_a = 6

unsinged long long divide to double:6.0746380502316048577426954580005258321762084960937500

-bash-4.3#

ARM64与x86_64浮点运算精度比较

1 Comment

Recent Posts

Categories

ARM64与x86_64浮点运算精度比较

Share this:

1 Comment

Recent Posts

Categories

Discover more from Intelligent Computing Architecture