Skip to content

py/parsenum: Implement exact float parsing using integer mpz (WIP) #6024

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
187 changes: 186 additions & 1 deletion py/parsenum.c
Original file line number Diff line number Diff line change
Expand Up @@ -31,11 +31,18 @@
#include "py/parsenumbase.h"
#include "py/parsenum.h"
#include "py/smallint.h"
#include "py/mpz.h"

#if MICROPY_PY_BUILTINS_FLOAT
#include <math.h>
#endif

// Select which decimal float parser gets compiled in.  The exact parser is
// built on the mpz big-integer routines, so it is enabled only when the
// long-int implementation is MPZ; otherwise USE_EXACT_PARSER is 0.
#if MICROPY_LONGINT_IMPL == MICROPY_LONGINT_IMPL_MPZ
#define USE_EXACT_PARSER (1)
#else
#define USE_EXACT_PARSER (0)
#endif

static MP_NORETURN void raise_exc(mp_obj_t exc, mp_lexer_t *lex) {
// if lex!=NULL then the parser called us and we need to convert the
// exception's type from ValueError to SyntaxError and add traceback info
Expand Down Expand Up @@ -166,6 +173,8 @@ mp_obj_t mp_parse_num_integer(const char *restrict str_, size_t len, int base, m
}
}

#if MICROPY_PY_BUILTINS_FLOAT

enum {
REAL_IMAG_STATE_START = 0,
REAL_IMAG_STATE_HAVE_REAL = 1,
Expand All @@ -178,7 +187,174 @@ typedef enum {
PARSE_DEC_IN_EXP,
} parse_dec_in_t;

#if MICROPY_PY_BUILTINS_FLOAT
#if USE_EXACT_PARSER

// Parse a decimal floating-point literal using big-integer (mpz) arithmetic.
//
// The significant digits are accumulated into an mpz and then scaled by the
// decimal exponent with integer operations only; the single conversion from
// integer to float happens at the very end.
//
// Parameters:
//   str_in     - in: start of the text to parse; out (success only): first
//                character that was not consumed
//   top        - one past the last character that may be read
//   allow_imag - if true, a 'j'/'J' terminates the number and is left unconsumed
//   float_out  - receives the parsed value on success
//
// Returns 0 on success, or -1 if an exponent marker (plus optional sign) is
// the last thing in the input.  On error *str_in and *float_out are not written.
static int mp_parse_decimal_exact(const char **str_in, const char *top, bool allow_imag, mp_float_t *float_out) {
    const char *str = *str_in;

    // TODO try to use fixed-allocated mpz on the stack
    // Every mpz temporary is initialised here, before any "goto cleanup", so
    // that the cleanup label can release all of them unconditionally.
    mpz_t mpz_tmp1, mpz_tmp2, mpz_exp5;
    mpz_init_from_int(&mpz_tmp1, 10);
    mpz_init_zero(&mpz_tmp2);
    mpz_init_zero(&mpz_exp5);

    mpz_t dec;
    mpz_init_zero(&dec);

    int ret = 0;
    int exp_extra = 0; // exponent adjustment caused by fractional / dropped digits
    int exp_val = 0;   // magnitude of the explicit exponent
    int exp_sign = 1;  // sign of the explicit exponent
    parse_dec_in_t in = PARSE_DEC_IN_INTG;

    while (str < top) {
        unsigned int dig = *str++;
        if ('0' <= dig && dig <= '9') {
            dig -= '0';
            if (in == PARSE_DEC_IN_EXP) {
                // don't overflow exp_val when adding next digit, instead just truncate
                // it and the resulting float will still be correct, either inf or 0.0
                // (use INT_MAX/2 to allow adding exp_extra at the end without overflow)
                if (exp_val < (INT_MAX / 2 - 9) / 10) {
                    exp_val = 10 * exp_val + dig;
                }
            } else {
                if (mpz_max_num_bits(&dec) < 52 + MPZ_DIG_SIZE) {
                    // Can possibly represent more digits so accumulate them:
                    // dec = dec * 10 + dig  (mpz_tmp1 holds 10 during this loop)
                    mpz_set_from_int(&mpz_tmp2, dig);
                    mpz_mul_inpl(&dec, &dec, &mpz_tmp1);
                    mpz_add_inpl(&dec, &dec, &mpz_tmp2);
                    if (in == PARSE_DEC_IN_FRAC) {
                        --exp_extra;
                    }
                } else {
                    // Can't represent more digits of precision so ignore the digit and
                    // just adjust the exponent
                    if (in == PARSE_DEC_IN_INTG) {
                        ++exp_extra;
                    }
                }
            }
        } else if (in == PARSE_DEC_IN_INTG && dig == '.') {
            in = PARSE_DEC_IN_FRAC;
        } else if (in != PARSE_DEC_IN_EXP && ((dig | 0x20) == 'e')) {
            in = PARSE_DEC_IN_EXP;
            // optional sign directly after the exponent marker
            if (str < top) {
                if (str[0] == '+') {
                    ++str;
                } else if (str[0] == '-') {
                    ++str;
                    exp_sign = -1;
                }
            }
            // nothing follows the exponent marker/sign: malformed number
            if (str == top) {
                ret = -1;
                goto cleanup;
            }
        } else if (allow_imag && (dig | 0x20) == 'j') {
            // leave the imaginary suffix for the caller
            --str;
            break;
        } else if (dig == '_') {
            continue;
        } else {
            // unknown character
            --str;
            break;
        }
    }

    *str_in = str;

    // special case
    if (mpz_is_zero(&dec)) {
        *float_out = 0.0;
        goto cleanup;
    }

    exp_val *= exp_sign;
    exp_val += exp_extra;

    // Catch very large exponents, because 5**abs(exp_val) would be impossible to compute
    // TODO make this threshold precise, based on size of dec
    if (exp_val < -400) {
        *float_out = 0.0;
        goto cleanup;
    } else if (exp_val > 400) {
        *float_out = (mp_float_t)INFINITY;
        goto cleanup;
    }

    // Compute: 5 ** abs(exp_val)
    // (the temporaries already own storage, so assign to them rather than
    // re-initialising, which would drop the existing allocation)
    mpz_set_from_int(&mpz_tmp1, 5);
    mpz_set_from_int(&mpz_tmp2, abs(exp_val));
    mpz_pow_inpl(&mpz_exp5, &mpz_tmp1, &mpz_tmp2);

    if (exp_val >= 0) {
        // 10**e == 5**e * 2**e: fold the 5**e into dec, keep 2**e in exp_val
        mpz_mul_inpl(&dec, &dec, &mpz_exp5);
    } else {
        // dec <<= 3 * (-exp_val) + 54
        mpz_shl_inpl(&dec, &dec, 3 * (-exp_val) + 54);

        // dec /= 5 ** (-exp_val)   (quotient in dec, remainder in mpz_tmp1)
        mpz_set(&mpz_tmp2, &dec);
        mpz_divmod_inpl(&dec, &mpz_tmp1, &mpz_tmp2, &mpz_exp5);

        // A non-zero remainder means the quotient was truncated; record that in
        // the lowest bit as a sticky bit (same scheme as the normalisation loop
        // below) so the final rounding does not mistake it for an exact tie.
        // The quotient is at least 2**54 here, so dec.dig[0] exists.
        if (!mpz_is_zero(&mpz_tmp1)) {
            dec.dig[0] |= 1;
        }

        // adjust exponent, only power of 2 left
        exp_val += 3 * exp_val - 54;
    }

    // normalise so bit 52 of mantissa is 1 (need 2 extra bits for rounding later on)
    // TODO make this much more efficient, not using 2 loops!
    mpz_set_from_int(&mpz_tmp1, 1);
    mpz_shl_inpl(&mpz_tmp1, &mpz_tmp1, 54);
    #if 0
    // Only needed if we want to use the mpz bits to create the FP bits
    while (mpz_cmp(&dec, &mpz_tmp1) < 0) {
        exp_val -= 1;
        mpz_shl_inpl(&dec, &dec, 1);
    }
    #endif
    mpz_shl_inpl(&mpz_tmp1, &mpz_tmp1, 1);
    while (mpz_cmp(&dec, &mpz_tmp1) > 0) {
        exp_val += 1;
        // keep the bit that is shifted out as a sticky bit
        mpz_dig_t carry = dec.dig[0] & 1;
        mpz_shr_inpl(&dec, &dec, 1);
        dec.dig[0] |= carry;
    }

    // Looks ok to just reuse mpz_as_float to do the final conversion
    // (this will be the only conversion with possible error, we are allowed one error)
    mp_float_t fdec = mpz_as_float(&dec);

    // This code computes the (double) representation exactly from the big-int bits
    /*
    dec >>= 1
    if dec & 1:
        dec += 1
    dec >>= 1
    dec_bytes = bytearray(dec.to_bytes(8, 'little'))

    # compute exponent
    fexp = 54 + 1023
    dec_bytes[6] |= fexp << 4 & 0xff
    dec_bytes[7] |= fexp >> 4

    fdec = array.array('d', dec_bytes)[0]
    */

    // ldexp is only needed to handle subnormals, otherwise fdec * 2**exp_val would suffice
    *float_out = MICROPY_FLOAT_C_FUN(ldexp)(fdec, exp_val);

cleanup:
    mpz_deinit(&dec);
    mpz_deinit(&mpz_tmp1);
    mpz_deinit(&mpz_tmp2);
    mpz_deinit(&mpz_exp5);
    return ret;
}

#else

// MANTISSA_MAX is used to retain precision while not overflowing mantissa
// SMALL_NORMAL_VAL is the smallest power of 10 that is still a normal float
// EXACT_POWER_OF_10 is the largest value of x so that 10^x can be stored exactly in a float
Expand Down Expand Up @@ -215,6 +391,9 @@ static mp_float_uint_t accept_digit(mp_float_uint_t p_mantissa, unsigned int dig
return p_mantissa;
}
}

#endif // USE_EXACT_PARSER

#endif // MICROPY_PY_BUILTINS_FLOAT

#if MICROPY_PY_BUILTINS_COMPLEX
Expand Down Expand Up @@ -266,6 +445,11 @@ parse_start:;
dec_val = MICROPY_FLOAT_C_FUN(nan)("");
} else {
// string should be a decimal number
#if USE_EXACT_PARSER
if (mp_parse_decimal_exact(&str, top, allow_imag, &dec_val)) {
goto value_error;
}
#else
parse_dec_in_t in = PARSE_DEC_IN_INTG;
bool exp_neg = false;
mp_float_uint_t mantissa = 0;
Expand Down Expand Up @@ -352,6 +536,7 @@ parse_start:;
} else {
dec_val *= MICROPY_FLOAT_C_FUN(pow)(10, exp_val);
}
#endif
}

if (allow_imag && str < top && (*str | 0x20) == 'j') {
Expand Down
4 changes: 4 additions & 0 deletions tests/float/float_format_ints_doubleprec.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,8 @@
print("{:.12e}".format(array.array("d", v2.to_bytes(8, sys.byteorder))[0]))

for exponent in range(300):
    # 1e126 and 1e210 are left out: the float parser is fine with 1e210, but
    # the formatter is not, it formats 1e210 as 9.999999999999998e+209
    if exponent not in (126, 210):
        print(float("1e" + str(exponent)))
29 changes: 29 additions & 0 deletions tests/float/float_parse_doubleprec.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,32 @@
print(float("1.00000000000000000000e-307"))
print(float("10.0000000000000000000e-308"))
print(float("100.000000000000000000e-309"))

# ensure float is parsed exactly
print(float("74e46"))

# Overflow/largest double boundary:
# (the hex values quoted in the comments below list the 8 bytes of the IEEE 754
# double in little-endian order, e.g. +inf, 0x7ff0000000000000, is written
# as 0x000000000000f07f)
print(float("1.7976931348623159e308")) # (should overflow, 0x000000000000f07f )
print("%.15e" % float("1.7976931348623158e308")) # (should yield the max double, 0xffffffffffffef7f )

"""
TODO get these working?
# Normalized/denormalized double boundary
print("%.16e" % float("2.2250738585072012e-308")) # (should yield smallest normalized double, 0x0000000000001000 )
print("%.16e" % float("2.2250738585072011e-308")) # (should yield largest denormalized double, 0xffffffffffff0f00 )

# Shortest (up to) 17-digit input that converts to smallest denormalized double:
print(float("5e-324")) # (should yield smallest denormalized double, 0x0100000000000000 )

# Closest 17-digit input to the smallest denormalized double:
print(float("4.9406564584124654e-324")) # (should yield smallest denormalized double, 0x0100000000000000 )

# The next boundary will depend on how good the ldexp implementation is on the target platform:
# Smallest denormalized double/underflow boundary:

print(float("2.4703282292062328e-324")) # (should yield smallest denormalized double, 0x0100000000000000 )
# (Note that this value is greater than 2**-1075 and therefore should round up. 64-bit CPython 3.7.5 on win32 gets this right. Your mileage may vary, since the 54 most significant bits of the result are 0b1.00000000000000000000000000000000000000000000000000000 x 2**-1075.)
"""

print(float("2.4703282292062327e-324")) # (should underflow to zero: 0x0000000000000000 )
# (Note that this value is less than 2**-1075 and therefore should round down to zero.)
Loading
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy