py/parsenum: Extend mp_parse_num_integer() to parse long long.

projectgus · projectgus · commit c8f23eeb2a35 · 2025-07-17T13:50:10.000+10:00
If big integer support is 'long long' then mp_parse_num_integer() can
parse to it directly instead of failing over from small int. This means
strtoll() is no longer pulled in, and fixes some bugs parsing long long
integers (i.e. can now parse negative values correctly, can now parse
values which aren't NUL-terminated).

The (default) smallint parsing compiled code should stay the same here,
macros and a typedef are used to abstract some parts of it out.

When bigint is long long we parse to 'unsigned long long' first (to avoid
the code size hit of pulling in signed 64-bit math routines) and then
convert to signed at the end.

One tricky case this routine correctly overflows on is
int("9223372036854775808") which is one more than LLONG_MAX in decimal. No
unit test case added for this as it's too hard to detect 64-bit long
integer mode.

This work was funded through GitHub Sponsors.

Signed-off-by: Angus Gratton <angus@redyak.com.au>
diff --git a/py/objint_longlong.c b/py/objint_longlong.c
@@ -39,8 +39,6 @@
 
 #if MICROPY_LONGINT_IMPL == MICROPY_LONGINT_IMPL_LONGLONG
 
-#include <errno.h>
-
 #if MICROPY_PY_SYS_MAXSIZE
 // Export value for sys.maxsize
 const mp_obj_int_t mp_sys_maxsize_obj = {{&mp_type_int}, MP_SSIZE_MAX};
@@ -294,22 +292,12 @@ mp_obj_t mp_obj_new_int_from_ll(long long val) {
 }
 
 mp_obj_t mp_obj_new_int_from_ull(unsigned long long val) {
-    // TODO raise an exception if the unsigned long long won't fit
     if (val >> (sizeof(unsigned long long) * 8 - 1) != 0) {
         raise_long_long_overflow();
     }
     return mp_obj_new_int_from_ll(val);
 }
 
-mp_obj_t mp_obj_new_int_from_str_len(const char **str, size_t len, bool neg, unsigned int base) {
-    // TODO this does not honor the given length of the string, but it all cases it should anyway be null terminated
-    // TODO check overflow
-    char *endptr;
-    mp_obj_t result = mp_obj_new_int_from_ll(strtoll(*str, &endptr, base));
-    *str = endptr;
-    return result;
-}
-
 mp_int_t mp_obj_int_get_truncated(mp_const_obj_t self_in) {
     if (mp_obj_is_small_int(self_in)) {
         return MP_OBJ_SMALL_INT_VALUE(self_in);
diff --git a/py/parsenum.c b/py/parsenum.c
@@ -46,6 +46,27 @@ static MP_NORETURN void raise_exc(mp_obj_t exc, mp_lexer_t *lex) {
     nlr_raise(exc);
 }
 
+#if MICROPY_LONGINT_IMPL != MICROPY_LONGINT_IMPL_LONGLONG
+// For the common small integer parsing case, we parse directly to mp_int_t and
+// check that the value doesn't overflow a smallint (in which case we fail over
+// to bigint parsing if supported)
+typedef mp_int_t parsed_int_t;
+
+#define PARSED_INT_MUL_OVERFLOW mp_small_int_mul_overflow
+#define PARSED_INT_FITS MP_SMALL_INT_FITS
+#else
+// In the special case where bigint support is long long, we save code size by
+// parsing directly to long long and then return either a bigint or smallint
+// from the same result.
+//
+// To avoid pulling in (slow) signed 64-bit math routines we do the initial
+// parsing to an unsigned long long and only convert to signed at the end.
+typedef unsigned long long parsed_int_t;
+
+#define PARSED_INT_MUL_OVERFLOW mp_mul_ull_overflow
+#define PARSED_INT_FITS(I) ((I) <= (unsigned long long)LLONG_MAX)
+#endif
+
 mp_obj_t mp_parse_num_integer(const char *restrict str_, size_t len, int base, mp_lexer_t *lex) {
     const byte *restrict str = (const byte *)str_;
     const byte *restrict top = str + len;
@@ -76,7 +97,7 @@ mp_obj_t mp_parse_num_integer(const char *restrict str_, size_t len, int base, m
     str += mp_parse_num_base((const char *)str, top - str, &base);
 
     // string should be an integer number
-    mp_int_t int_val = 0;
+    parsed_int_t parsed_val = 0;
     const byte *restrict str_val_start = str;
     for (; str < top; str++) {
         // get next digit as a value
@@ -98,25 +119,29 @@ mp_obj_t mp_parse_num_integer(const char *restrict str_, size_t len, int base, m
             break;
         }
 
-        // add next digi and check for overflow
-        if (mp_small_int_mul_overflow(int_val, base, &int_val)) {
+        // add next digit and check for overflow
+        if (PARSED_INT_MUL_OVERFLOW(parsed_val, base, &parsed_val)) {
             goto overflow;
         }
-        int_val += dig;
-        if (!MP_SMALL_INT_FITS(int_val)) {
+        parsed_val += dig;
+        if (!PARSED_INT_FITS(parsed_val)) {
             goto overflow;
         }
     }
 
-    // negate value if needed
+    #if MICROPY_LONGINT_IMPL != MICROPY_LONGINT_IMPL_LONGLONG
+    // The PARSED_INT_FITS check above ensures parsed_val fits in small int representation
+    ret_val = MP_OBJ_NEW_SMALL_INT(neg ? (-parsed_val) : parsed_val);
+have_ret_val:
+    #else
+    // The PARSED_INT_FITS check above ensures parsed_val won't overflow signed long long
+    long long signed_val = parsed_val;
     if (neg) {
-        int_val = -int_val;
+        signed_val = -signed_val;
     }
+    ret_val = mp_obj_new_int_from_ll(signed_val); // Could be large or small int
+    #endif
 
-    // create the small int
-    ret_val = MP_OBJ_NEW_SMALL_INT(int_val);
-
-have_ret_val:
     // check we parsed something
     if (str == str_val_start) {
         goto value_error;
@@ -135,13 +160,17 @@ mp_obj_t mp_parse_num_integer(const char *restrict str_, size_t len, int base, m
     return ret_val;
 
 overflow:
+    #if MICROPY_LONGINT_IMPL != MICROPY_LONGINT_IMPL_LONGLONG
     // reparse using long int
     {
         const char *s2 = (const char *)str_val_start;
         ret_val = mp_obj_new_int_from_str_len(&s2, top - str_val_start, neg, base);
         str = (const byte *)s2;
         goto have_ret_val;
     }
+    #else
+    mp_raise_msg(&mp_type_OverflowError, MP_ERROR_TEXT("result overflows long long storage"));
+    #endif
 
 value_error:
     {