py/objint.c: Code review of int.from_bytes().

IhorNehrutsa · IhorNehrutsa · commit 0489c31e88e9 · 2025-03-14T16:03:03.000+02:00
Support the `signed` parameter: result = int.from_bytes(bytearray(), byteorder='big'|'little', signed=False|True). Add the `length`, `byteorder` and `signed` arguments according to micropython#16311. Signed-off-by: Ihor Nehrutsa <Ihor.Nehrutsa@gmail.com>
diff --git a/py/mpz.c b/py/mpz.c
@@ -850,7 +850,7 @@ size_t mpz_set_from_str(mpz_t *z, const char *str, size_t len, bool neg, unsigne
     return cur - str;
 }
 
-void mpz_set_from_bytes(mpz_t *z, bool big_endian, size_t len, const byte *buf) {
+void mpz_set_from_bytes(mpz_t *z, bool big_endian, bool is_signed, size_t len, const byte *buf) {
     int delta = 1;
     if (big_endian) {
         buf += len - 1;
@@ -862,6 +862,9 @@ void mpz_set_from_bytes(mpz_t *z, bool big_endian, size_t len, const byte *buf)
     mpz_dig_t d = 0;
     int num_bits = 0;
     z->neg = 0;
+    if ((is_signed) && (buf[len - 1] & 0x80)) {
+        z->neg = 1;
+    }
     z->len = 0;
     while (len) {
         while (len && num_bits < DIG_SIZE) {
@@ -879,7 +882,14 @@ void mpz_set_from_bytes(mpz_t *z, bool big_endian, size_t len, const byte *buf)
         #endif
         num_bits -= DIG_SIZE;
     }
-
+    if (z->neg) {
+        // sign extend
+        while (num_bits < DIG_SIZE) {
+            d |= DIG_MSB << num_bits;
+            num_bits += DIG_SIZE;
+        }
+        z->dig[z->len++] = d & DIG_MASK;
+    }
     z->len = mpn_remove_trailing_zeros(z->dig, z->dig + z->len);
 }
 
diff --git a/py/mpz.h b/py/mpz.h
@@ -114,7 +114,7 @@ void mpz_set_from_ll(mpz_t *z, long long i, bool is_signed);
 void mpz_set_from_float(mpz_t *z, mp_float_t src);
 #endif
 size_t mpz_set_from_str(mpz_t *z, const char *str, size_t len, bool neg, unsigned int base);
-void mpz_set_from_bytes(mpz_t *z, bool big_endian, size_t len, const byte *buf);
+void mpz_set_from_bytes(mpz_t *z, bool big_endian, bool is_signed, size_t len, const byte *buf);
 
 static inline bool mpz_is_zero(const mpz_t *z) {
     return z->len == 0;
diff --git a/py/objint.c b/py/objint.c
@@ -387,37 +387,86 @@ mp_obj_t mp_obj_int_binary_op_extra_cases(mp_binary_op_t op, mp_obj_t lhs_in, mp
     return MP_OBJ_NULL; // op not supported
 }
 
-// this is a classmethod
-static mp_obj_t int_from_bytes(size_t n_args, const mp_obj_t *args) {
-    // TODO: Support signed param (assumes signed=False at the moment)
-
-    // get the buffer info
-    mp_buffer_info_t bufinfo;
-    mp_get_buffer_raise(args[1], &bufinfo, MP_BUFFER_READ);
+void *reverce_memcpy(void *dest, const void *src, size_t len) { // copies len bytes from src into dest in reversed byte order; returns dest
+    char *d = (char *)dest + len; // start one past the end, a valid pointer even when len == 0
+    const char *s = src;
+    while (len--) {
+        *--d = *s++; // pre-decrement so d never points before dest
+    }
+    return dest;
+}
 
-    const byte *buf = (const byte *)bufinfo.buf;
-    int delta = 1;
-    bool big_endian = n_args < 3 || args[2] != MP_OBJ_NEW_QSTR(MP_QSTR_little);
-    if (!big_endian) {
-        buf += bufinfo.len - 1;
-        delta = -1;
+mp_obj_t mp_obj_integer_from_bytes_impl(bool big_endian, bool is_signed, size_t len, const byte *buf) { // int from len bytes of buf: built here when it fits a small-int, else via mp_obj_int_from_bytes_impl()
+    if (len > sizeof(mp_int_t)) {
+        #if MICROPY_LONGINT_IMPL != MICROPY_LONGINT_IMPL_NONE
+        // More bytes than an mp_int_t holds, so construct a big-int
+        return mp_obj_int_from_bytes_impl(big_endian, is_signed, len, buf);
+        #else
+        mp_raise_msg(&mp_type_OverflowError, MP_ERROR_TEXT("small-int overflow"));
+        #endif
+    }
+    union {
+        mp_int_t value;
+        mp_uint_t uvalue;
+        byte buf[sizeof(mp_int_t)];
+    } result = {0};
+    // NOTE(review): filling result.buf byte-wise and then reading result.value
+    // assumes a little-endian host (least-significant byte at buf[0]) and
+    // sizeof(mp_int_t) == sizeof(mp_uint_t) -- confirm for big-endian ports.
+
+    if (big_endian) {
+        reverce_memcpy(&result, buf, len);
+    } else { // little-endian
+        memcpy(&result, buf, len);
     }
 
-    mp_uint_t value = 0;
-    size_t len = bufinfo.len;
-    for (; len--; buf += delta) {
+    if ((is_signed) && (len > 0) && (sizeof(result) > len) && (result.buf[len - 1] & 0x80)) {
+        // len > 0 is checked first so an empty input never indexes buf[-1].
+        // The top bit of the most significant input byte is set, so the
+        // value is negative: fill the unused high bytes with 0xFF (sign
+        // extension), matching what to_bytes() produces for wider lengths:
+        //   x = 2:  x.to_bytes(4, 'little', True) -> b'\x02\x00\x00\x00'
+        //   x = -2: x.to_bytes(1, 'little', True) -> b'\xFE'
+        //   x = -2: x.to_bytes(4, 'little', True) -> b'\xFE\xFF\xFF\xFF'
+        memset(result.buf + len, 0xFF, sizeof(result) - len);
+    }
+    if (((!is_signed) && (result.uvalue > MP_SMALL_INT_MAX)) || (is_signed && ((result.value < MP_SMALL_INT_MIN) || (result.value > MP_SMALL_INT_MAX)))) {
+        // Result will overflow a small-int so construct a big-int
         #if MICROPY_LONGINT_IMPL != MICROPY_LONGINT_IMPL_NONE
-        if (value > (MP_SMALL_INT_MAX >> 8)) {
-            // Result will overflow a small-int so construct a big-int
-            return mp_obj_int_from_bytes_impl(big_endian, bufinfo.len, bufinfo.buf);
-        }
+        return mp_obj_int_from_bytes_impl(big_endian, is_signed, len, buf);
+        #else
+        mp_raise_msg(&mp_type_OverflowError, MP_ERROR_TEXT("small-int overflow"));
         #endif
-        value = (value << 8) | *buf;
     }
-    return mp_obj_new_int_from_uint(value);
+    return mp_obj_new_int(result.value);
 }
 
-static MP_DEFINE_CONST_FUN_OBJ_VAR_BETWEEN(int_from_bytes_fun_obj, 2, 4, int_from_bytes);
+// this is a classmethod: pos_args[0] is the class, pos_args[1] is the bytes-like source
+// result = int.from_bytes(buffer, byteorder='big', signed=False, length=0)
+static mp_obj_t int_from_bytes(size_t n_args, const mp_obj_t *pos_args, mp_map_t *kw_args) {
+    enum { ARG_byteorder, ARG_signed, ARG_length }; // byteorder stays first so int.from_bytes(b, "little") keeps working
+    static const mp_arg_t allowed_args[] = {
+        { MP_QSTR_byteorder, MP_ARG_OBJ, { .u_rom_obj = MP_ROM_QSTR(MP_QSTR_big) } },
+        { MP_QSTR_signed,    MP_ARG_BOOL, { .u_bool = false } },
+        { MP_QSTR_length,    MP_ARG_INT, { .u_int = 0 } },
+    };
+    mp_arg_val_t args[MP_ARRAY_SIZE(allowed_args)];
+    mp_arg_parse_all(n_args - 2, pos_args + 2, kw_args, MP_ARRAY_SIZE(allowed_args), allowed_args, args);
+
+    // get the buffer info
+    mp_buffer_info_t bufinfo;
+    mp_get_buffer_raise(pos_args[1], &bufinfo, MP_BUFFER_READ);
+
+    size_t len = args[ARG_length].u_int; // NOTE(review): a negative length presumably wraps to a huge size_t and is clamped below
+    bool big_endian = args[ARG_byteorder].u_obj != MP_OBJ_NEW_QSTR(MP_QSTR_little);
+    bool is_signed = args[ARG_signed].u_bool;
+    // a length of 0 (the default) or one beyond the buffer selects the whole buffer
+    if ((len == 0) || (len > bufinfo.len)) {
+        len = bufinfo.len;
+    }
+    return mp_obj_integer_from_bytes_impl(big_endian, is_signed, len, bufinfo.buf);
+}
+static MP_DEFINE_CONST_FUN_OBJ_KW(int_from_bytes_fun_obj, 2, int_from_bytes);
 static MP_DEFINE_CONST_CLASSMETHOD_OBJ(int_from_bytes_obj, MP_ROM_PTR(&int_from_bytes_fun_obj));
 
 static mp_obj_t int_to_bytes(size_t n_args, const mp_obj_t *args) {
diff --git a/py/objint.h b/py/objint.h
@@ -54,13 +54,15 @@ char *mp_obj_int_formatted(char **buf, size_t *buf_size, size_t *fmt_size, mp_co
 char *mp_obj_int_formatted_impl(char **buf, size_t *buf_size, size_t *fmt_size, mp_const_obj_t self_in,
     int base, const char *prefix, char base_char, char comma);
 mp_int_t mp_obj_int_hash(mp_obj_t self_in);
-mp_obj_t mp_obj_int_from_bytes_impl(bool big_endian, size_t len, const byte *buf);
+mp_obj_t mp_obj_int_from_bytes_impl(bool big_endian, bool is_signed, size_t len, const byte *buf);
+mp_obj_t mp_obj_integer_from_bytes_impl(bool big_endian, bool is_signed, size_t len, const byte *buf);
 // Returns true if 'self_in' fit into 'len' bytes of 'buf' without overflowing, 'buf' is truncated otherwise.
 bool mp_obj_int_to_bytes_impl(mp_obj_t self_in, bool big_endian, size_t len, byte *buf);
 int mp_obj_int_sign(mp_obj_t self_in);
 mp_obj_t mp_obj_int_unary_op(mp_unary_op_t op, mp_obj_t o_in);
 mp_obj_t mp_obj_int_binary_op(mp_binary_op_t op, mp_obj_t lhs_in, mp_obj_t rhs_in);
 mp_obj_t mp_obj_int_binary_op_extra_cases(mp_binary_op_t op, mp_obj_t lhs_in, mp_obj_t rhs_in);
 mp_obj_t mp_obj_int_pow3(mp_obj_t base, mp_obj_t exponent,  mp_obj_t modulus);
+void *reverce_memcpy(void *dest, const void *src, size_t len);
 
 #endif // MICROPY_INCLUDED_PY_OBJINT_H
diff --git a/py/objint_longlong.c b/py/objint_longlong.c
@@ -43,18 +43,32 @@
 const mp_obj_int_t mp_sys_maxsize_obj = {{&mp_type_int}, MP_SSIZE_MAX};
 #endif
 
-mp_obj_t mp_obj_int_from_bytes_impl(bool big_endian, size_t len, const byte *buf) {
-    int delta = 1;
-    if (!big_endian) {
-        buf += len - 1;
-        delta = -1;
+mp_obj_t mp_obj_int_from_bytes_impl(bool big_endian, bool is_signed, size_t len, const byte *buf) { // long-long int from len bytes of buf
+    if (len > sizeof(mp_longint_impl_t)) { // NOTE(review): rejects long inputs even when the extra bytes are only padding
+        mp_raise_msg(&mp_type_OverflowError, MP_ERROR_TEXT("big-int overflow"));
     }
+    union {
+        mp_longint_impl_t value;
+        byte buf[sizeof(mp_longint_impl_t)];
+    } result = {0};
 
-    mp_longint_impl_t value = 0;
-    for (; len--; buf += delta) {
-        value = (value << 8) | *buf;
+    if (big_endian) {
+        reverce_memcpy(&result, buf, len);
+    } else { // little-endian
+        memcpy(&result, buf, len);
     }
-    return mp_obj_new_int_from_ll(value);
+
+    if ((is_signed) && (len > 0) && (sizeof(result) > len) && (result.buf[len - 1] & 0x80)) {
+        // len > 0 is checked first so an empty input never indexes buf[-1].
+        // The top bit of the most significant input byte is set, so the
+        // value is negative: fill the unused high bytes with 0xFF (sign
+        // extension), matching what to_bytes() produces for wider lengths:
+        //   x = -2: x.to_bytes(1, 'little', True) -> b'\xFE'
+        //   x = -2: x.to_bytes(4, 'little', True) -> b'\xFE\xFF\xFF\xFF'
+        // NOTE(review): reading result.value assumes a little-endian host.
+        memset(result.buf + len, 0xFF, sizeof(result) - len);
+    }
+    return mp_obj_new_int_from_ll(result.value);
 }
 
 bool mp_obj_int_to_bytes_impl(mp_obj_t self_in, bool big_endian, size_t len, byte *buf) {
diff --git a/py/objint_mpz.c b/py/objint_mpz.c
@@ -106,9 +106,9 @@ char *mp_obj_int_formatted_impl(char **buf, size_t *buf_size, size_t *fmt_size,
     return str;
 }
 
-mp_obj_t mp_obj_int_from_bytes_impl(bool big_endian, size_t len, const byte *buf) {
+mp_obj_t mp_obj_int_from_bytes_impl(bool big_endian, bool is_signed, size_t len, const byte *buf) { // returns an int object obtained from mp_obj_int_new_mpz() and filled from len bytes of buf
     mp_obj_int_t *o = mp_obj_int_new_mpz();
-    mpz_set_from_bytes(&o->mpz, big_endian, len, buf);
+    mpz_set_from_bytes(&o->mpz, big_endian, is_signed, len, buf); // byte order and signedness are forwarded unchanged; all decoding happens in mpz_set_from_bytes()
     return MP_OBJ_FROM_PTR(o);
 }
 
diff --git a/tests/basics/int_bytes.py b/tests/basics/int_bytes.py
@@ -9,7 +9,7 @@
 
 # check that extra zero bytes don't change the internal int value
 print(int.from_bytes(bytes(20), "little") == 0)
-print(int.from_bytes(b"\x01" + bytes(20), "little") == 1)
+print(int.from_bytes(b"\x01" + bytes(7), "little") == 1)
 
 # big-endian conversion
 print((10).to_bytes(1, "big"))
diff --git a/tests/basics/int_from_bytes.py b/tests/basics/int_from_bytes.py

Original file line number	Diff line number	Diff line change
`@@ -106,9 +106,9 @@ char *mp_obj_int_formatted_impl(char **buf, size_t *buf_size, size_t *fmt_size,`
`106`	`106`	`return str;`
`107`	`107`	`}`
`108`	`108`
`109`		`-mp_obj_t mp_obj_int_from_bytes_impl(bool big_endian, size_t len, const byte *buf) {`
	`109`	`+mp_obj_t mp_obj_int_from_bytes_impl(bool big_endian, bool is_signed, size_t len, const byte *buf) {`
`110`	`110`	`mp_obj_int_t *o = mp_obj_int_new_mpz();`
`111`		`- mpz_set_from_bytes(&o->mpz, big_endian, len, buf);`
	`111`	`+ mpz_set_from_bytes(&o->mpz, big_endian, is_signed, len, buf);`
`112`	`112`	`return MP_OBJ_FROM_PTR(o);`
`113`	`113`	`}`
`114`	`114`