-
-
Notifications
You must be signed in to change notification settings - Fork 32.1k
gh-128762: Include inline values in sys.getsizeof()
#128763
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
inline_values_size = 32 | ||
|
||
linked_list = None | ||
for i in range(28): |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I find that it needs this many iterations to "stabilize". Is there a right way to do this?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I happened to encounter the same issue. What seems to happen is that the size of the inline values starts at 30 and then decreases by 1 for each new instance, until it reaches the minimum possible (i.e. the number of unique keys encountered so far). If more than 30 different keys are encountered, values are no longer inlined.
I wasn't able to find the exact reason why, but it seems to be a simple way of 'right-sizing' the inline values. You can imagine that when creating 10k instances, it all amortizes to the 'right' size to cover the attributes set in the __init__.
Relevant source:
cpython/Include/internal/pycore_dict.h
Lines 314 to 326 in b90ecea
shared_keys_usable_size(PyDictKeysObject *keys) | |
{ | |
// dk_usable will decrease for each instance that is created and each | |
// value that is added. dk_nentries will increase for each value that | |
// is added. We want to always return the right value or larger. | |
// We therefore increase dk_nentries first and we decrease dk_usable | |
// second, and conversely here we read dk_usable first and dk_entries | |
// second (to avoid the case where we read entries before the increment | |
// and read usable after the decrement) | |
Py_ssize_t dk_usable = FT_ATOMIC_LOAD_SSIZE_ACQUIRE(keys->dk_usable); | |
Py_ssize_t dk_nentries = FT_ATOMIC_LOAD_SSIZE_ACQUIRE(keys->dk_nentries); | |
return dk_nentries + dk_usable; | |
} |
#define SHARED_KEYS_MAX_SIZE 30
@@ -1475,6 +1479,16 @@ def test_gc_head_size(self): | |||
# but lists are | |||
self.assertEqual(sys.getsizeof([]), vsize('Pn') + gc_header_size) | |||
|
|||
def test_inline_values(self): |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have overall low confidence that this is the right way to test this, and I'm open to suggestions.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I'm not sure if it's what you're looking for, but here's the code I've been using to inspect the inline values from Python:
# ctypes stand-in for C's Py_ssize_t.
py_ssize_t = ctypes.c_ssize_t  # Almost always the case


class PyObject(ctypes.Structure):
    """ctypes mirror of the two-field object header used by this snippet.

    NOTE(review): this describes the header as ``ob_refcnt`` followed by
    ``ob_type``; builds with a different header layout (e.g. free-threaded
    builds) are presumably not covered -- confirm before relying on it.
    """

    _fields_ = [("ob_refcnt", py_ssize_t), ("ob_type", ctypes.c_void_p)]
class PyDictValues(ctypes.Structure):
    """ctypes mirror of the header of a dict "values" array.

    The four one-byte counters are followed by the value slots themselves.
    In C the slots are an inline array (``PyObject *values[1]``, used as a
    flexible array), not a pointer to a separate array, so the field is
    declared as a one-element array of raw addresses. Slots may be NULL,
    which is why ``c_void_p`` is used rather than ``py_object``.
    """

    _fields_ = [
        ("capacity", ctypes.c_uint8),
        ("size", ctypes.c_uint8),
        ("embedded", ctypes.c_uint8),
        ("valid", ctypes.c_uint8),
        # First slot of the inline array; further slots follow in memory.
        ("values", ctypes.c_void_p * 1),
    ]

    def __repr__(self):
        """Return a summary of the header counters (slots are not shown)."""
        return (
            f"DictValues(capacity={self.capacity}, "
            f"size={self.size}, "
            f"embedded={self.embedded}, "
            f"valid={self.valid})"
        )
class PyDictKeysObject(ctypes.Structure):
    """ctypes mirror of the fixed-size header of a dict keys object.

    Only the header counters are modelled; ``dk_indices`` is a placeholder
    marking where the variable-length index table begins and must not be
    dereferenced as a real pointer.
    """

    _fields_ = [
        ("dk_refcnt", py_ssize_t),
        ("dk_log2_size", ctypes.c_uint8),
        ("dk_log2_index_bytes", ctypes.c_uint8),
        ("dk_kind", ctypes.c_uint8),
        ("dk_version", ctypes.c_uint32),
        ("dk_usable", py_ssize_t),
        ("dk_nentries", py_ssize_t),
        (
            "dk_indices",
            ctypes.POINTER(ctypes.c_void_p),
        ),  # Placeholder for indices
    ]

    def __repr__(self):
        """Return a summary of every header field except the placeholder."""
        return (
            f"DictKeysObject(refcount={self.dk_refcnt}, "
            f"log2_size={self.dk_log2_size}, "
            f"log2_index_bytes={self.dk_log2_index_bytes}, "
            f"kind={self.dk_kind}, "
            f"version={self.dk_version}, "
            f"usable={self.dk_usable}, "
            f"nentries={self.dk_nentries})"
        )
class PyDict(PyObject):
    """ctypes mirror of a dict object: object header plus the dict fields.

    ``ma_keys`` points at the keys object; ``ma_values`` points at a
    separate values array and may be NULL.
    """

    _fields_ = [
        ("ma_used", py_ssize_t),
        ("ma_version_tag", ctypes.c_uint64),
        ("ma_keys", ctypes.POINTER(PyDictKeysObject)),
        ("ma_values", ctypes.POINTER(PyDictValues)),
    ]

    def __repr__(self):
        """Return a summary including the keys header and values header."""
        try:
            values = self.ma_values.contents
        except ValueError:  # NULL pointer
            values = None
        return (
            f"DictStruct(size={self.ma_used}, "
            f"refcount={self.ob_refcnt}, "
            f"version={self.ma_version_tag}, "
            f"keys={self.ma_keys.contents}, "
            f"values={values})"
        )
FLAG_INLINE_VALUES = 1 << 2  # Py_TPFLAGS_INLINE_VALUES
FLAG_MANAGED_DICT = 1 << 4  # Py_TPFLAGS_MANAGED_DICT


def dict_and_inline_values(obj):
    """Return ``(managed_dict, values)`` read straight from *obj*'s memory.

    ``values`` is a ``PyDictValues`` view of the inline values stored right
    after the object, or ``None`` when the type does not use inline values.
    ``managed_dict`` is a ``PyDict`` view of the materialized ``__dict__``,
    or ``None`` when the type has no managed dict or the dict has not been
    created yet.

    This reads raw memory by address; it is a debugging aid for CPython
    only and the returned views are valid only while *obj* is alive.
    """
    ptr = id(obj)
    flags = type(obj).__flags__

    if flags & FLAG_INLINE_VALUES:
        values_offset = type(obj).__basicsize__
        # Layout of a pure-python object as of python 3.13
        # see https://github.com/python/cpython/blob/main/Objects/object_layout.md
        values = PyDictValues.from_address(ptr + values_offset)
    else:
        values = None

    if not flags & FLAG_MANAGED_DICT:
        # No managed-dict slot precedes the object; the bytes there belong
        # to something else, so do not interpret them as a dict pointer.
        return (None, values)

    # The managed dict pointer sits three pointer-sized words before the
    # object (24 bytes on 64-bit builds).
    # NOTE(review): free-threaded builds use a different offset -- confirm.
    managed_dict_offset = 3 * ctypes.sizeof(ctypes.c_void_p)
    try:
        # NOTE: this simple logic only works for "typical" objects
        managed_dict = (
            ctypes.POINTER(PyDict)
            .from_address(ptr - managed_dict_offset)
            .contents
        )
    except ValueError:
        # The instance __dict__ hasn't been "materialized" yet (null pointer)
        managed_dict = None
    return (managed_dict, values)
sys.getsizeof()
sys.getsizeof()
does not include inline values #128762