Skip to content

gh-132983: [DRAFT] Test _zstd code with Python test suite #133185

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 39 commits into from
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
9814e3b
Add _zstd module
emmatyping Apr 26, 2025
fda87c8
Add _zstd to modules
emmatyping Apr 26, 2025
887e564
Fix path for compression.zstd module
emmatyping Apr 26, 2025
cdba656
Ignore _zstd module like _io
emmatyping Apr 26, 2025
6b67e9b
Expand module state macros to improve code quality
emmatyping Apr 26, 2025
a99a5d2
Remove backticks suggested in review
emmatyping Apr 27, 2025
02cd17a
Use critical sections to lock object state
emmatyping Apr 27, 2025
54eca74
Remove compress/decompress and mark module as not reliant on the GIL
emmatyping Apr 27, 2025
f605956
Lift critical section to avoid clang warning
emmatyping Apr 27, 2025
2eadc65
Respond to comments by picnixz
emmatyping Apr 27, 2025
8eac354
Call out pyzstd explicitly in license description
emmatyping Apr 27, 2025
26775be
Use a much more robust implementation...
emmatyping Apr 27, 2025
eae460f
Use PyList_GetItemRef for thread safety purposes
emmatyping Apr 27, 2025
2ab5e4a
Use a macro for the minimum supported version
emmatyping Apr 27, 2025
d5bf1c1
remove const from primitive types
emmatyping Apr 27, 2025
9e92b9f
Use PyMem_New in another spot
emmatyping Apr 27, 2025
47f815a
Simplify error handling in _get_frame_size
emmatyping Apr 27, 2025
6a4f7b8
Another simplification of error handling in get_frame_info
emmatyping Apr 27, 2025
d7b3805
Rename _module_state to mod_state
emmatyping Apr 27, 2025
c225ea6
Rewrite comment explaining the context of the code
emmatyping Apr 28, 2025
6e8c61c
Add link to pyzstd
emmatyping Apr 28, 2025
e52ad06
Add TODO about refactoring dict training code
emmatyping Apr 28, 2025
2a1ad8b
Use PyModule_AddObjectRef over PyModule_AddObject
emmatyping Apr 28, 2025
94473b9
Check result of OutputBufferGrow
emmatyping Apr 28, 2025
e2b2515
Simplify return logic in `add_constant_to_type`
emmatyping Apr 29, 2025
cd2f085
Ignore return value of _zstd_clear()
emmatyping Apr 29, 2025
79e174f
Remove redundant comments
emmatyping Apr 29, 2025
ce6f79c
Remove __reduce__ from ZstdDict
emmatyping Apr 29, 2025
e15dd85
Use PyUnicode_FromFormat instead of a buffer
emmatyping Apr 29, 2025
685a3d1
Don't use C constants/types in error messages
emmatyping Apr 29, 2025
1b9f786
Make error messages easier to understand for Python users
emmatyping Apr 29, 2025
40c653c
Lower minimum required version 1.4.0
emmatyping Apr 30, 2025
428677d
Use casts and make slot function signatures correct
emmatyping Apr 30, 2025
0962bbb
Be consistent with CPython on const usage
emmatyping Apr 30, 2025
85efc18
Make else clauses in line with PEP 7
emmatyping Apr 30, 2025
cadf6e4
Fix over-indented blocks in argument clinic
emmatyping Apr 30, 2025
e45c22a
Merge branch 'main' into 3.14-zstd-c-code
emmatyping Apr 30, 2025
5c1e360
Add Python files
emmatyping Apr 30, 2025
1ad29f2
Fix byteswarning in test
emmatyping Apr 30, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
Add Python files
  • Loading branch information
emmatyping committed Apr 30, 2025
commit 5c1e360edbfc47151d86857c4e4500f1bed7df36
286 changes: 286 additions & 0 deletions Lib/compression/zstd/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,286 @@
"""Python bindings to the Zstandard (zstd) compression library.

The API style is similar to Python's bz2/lzma/zlib modules.
"""

# Names exported by `from compression.zstd import *`, grouped by where each
# name comes from.
__all__ = (
    # Defined in this file
    "compressionLevel_values",
    "get_frame_info",
    "CParameter",
    "DParameter",
    "Strategy",
    "finalize_dict",
    "train_dict",
    "zstd_support_multithread",
    "compress",
    "decompress",
    # Expected to be provided by `from _zstd import *`
    "ZstdCompressor",
    "ZstdDecompressor",
    "ZstdDict",
    "ZstdError",
    "get_frame_size",
    "zstd_version",
    "zstd_version_info",
    # Imported from compression.zstd.zstdfile
    "open",
    "ZstdFile",
)

from collections import namedtuple
from enum import IntEnum
from functools import lru_cache

from compression.zstd.zstdfile import ZstdFile, open
from _zstd import *

import _zstd


# Private aliases for attributes of the _zstd extension module.
# _train_dict and _finalize_dict are called by train_dict() and
# finalize_dict() below; the two stream-size aliases are not referenced
# again in this section of the file.
_ZSTD_CStreamSizes = _zstd._ZSTD_CStreamSizes
_ZSTD_DStreamSizes = _zstd._ZSTD_DStreamSizes
_train_dict = _zstd._train_dict
_finalize_dict = _zstd._finalize_dict


# TODO(emmatyping): these should be dataclasses or some other class, not namedtuples

# compressionLevel_values: a (default, min, max) namedtuple unpacked from
# _zstd._compressionLevel_values.
_nt_values = namedtuple("values", ["default", "min", "max"])
compressionLevel_values = _nt_values(*_zstd._compressionLevel_values)


# Result type returned by get_frame_info().
_nt_frame_info = namedtuple("frame_info", ["decompressed_size", "dictionary_id"])


def get_frame_info(frame_buffer):
    """Return information read from the header of a zstd frame.

    Parameter
    frame_buffer: A bytes-like object. It should start at the beginning of
                  a frame and include at least the frame header (6 to
                  18 bytes).

    The result is a two-item namedtuple: (decompressed_size, dictionary_id)

    decompressed_size is None when the decompressed size is unknown.

    dictionary_id is a 32-bit unsigned integer value. 0 means the dictionary
    ID was not recorded in the frame header: the frame may or may not need a
    dictionary to be decoded, and the ID of such a dictionary is not
    specified.

    More items may be appended to the namedtuple in the future.
    """
    # The extension module returns a plain sequence; wrap it so callers can
    # access the fields by name.
    return _nt_frame_info(*_zstd._get_frame_info(frame_buffer))


def _nbytes(dat):
    """Return the size of *dat* in bytes.

    bytes and bytearray objects are measured with len(); any other object
    is viewed through a memoryview and its nbytes is returned.
    """
    if not isinstance(dat, (bytes, bytearray)):
        # The memoryview is released as soon as the size has been read.
        with memoryview(dat) as view:
            return view.nbytes
    return len(dat)


def train_dict(samples, dict_size):
    """Train a zstd dictionary and return it as a ZstdDict object.

    Parameters
    samples:   An iterable of samples; each sample is a bytes-like object
               that represents a file.
    dict_size: The dictionary's maximum size, in bytes.

    Raises TypeError if dict_size is not an int, and ValueError if the
    samples contain no data at all.
    """
    if not isinstance(dict_size, int):
        raise TypeError('dict_size argument should be an int object.')

    # Walk the iterable once, pairing every sample with its size in bytes,
    # so one-shot iterators are also supported.
    sized_samples = [(sample, _nbytes(sample)) for sample in samples]
    sample_sizes = [size for _, size in sized_samples]

    flat_buffer = b''.join([sample for sample, _ in sized_samples])
    if not flat_buffer:
        raise ValueError("The samples are empty content, can't train dictionary.")

    # _train_dict receives all samples concatenated in one flat buffer, a
    # list with the size of each sample, and the maximum dictionary size.
    return ZstdDict(_train_dict(flat_buffer, sample_sizes, dict_size))


def finalize_dict(zstd_dict, samples, dict_size, level):
    """Finalize a zstd dictionary, return a ZstdDict object.

    Given a custom content as a basis for dictionary, and a set of samples,
    finalize dictionary by adding headers and statistics according to the zstd
    dictionary format.

    You may compose an effective dictionary content by hand, which is used as
    basis dictionary, and use some samples to finalize a dictionary. The basis
    dictionary can be a "raw content" dictionary, see is_raw parameter in
    ZstdDict.__init__ method.

    Parameters
    zstd_dict: A ZstdDict object, basis dictionary.
    samples:   An iterable of samples, a sample is a bytes-like object
               represents a file.
    dict_size: The dictionary's maximum size, in bytes.
    level:     The compression level expected to use in production. The
               statistics for each compression level differ, so tuning the
               dictionary for the compression level can help quite a bit.

    Raises TypeError if zstd_dict is not a ZstdDict or if dict_size or level
    is not an int, and ValueError if the samples contain no data at all.
    """
    # Check arguments' type
    if not isinstance(zstd_dict, ZstdDict):
        raise TypeError('zstd_dict argument should be a ZstdDict object.')
    if not isinstance(dict_size, int):
        raise TypeError('dict_size argument should be an int object.')
    if not isinstance(level, int):
        raise TypeError('level argument should be an int object.')

    # Walk the iterable once, pairing every sample with its size in bytes,
    # so one-shot iterators are also supported.
    sized_samples = [(sample, _nbytes(sample)) for sample in samples]
    sample_sizes = [size for _, size in sized_samples]

    flat_buffer = b''.join([sample for sample, _ in sized_samples])
    if not flat_buffer:
        raise ValueError("The samples are empty content, can't finalize dictionary.")

    # _finalize_dict receives, in order: the content of the existing
    # dictionary, all samples concatenated in one flat buffer, a list with
    # the size of each sample, the maximum dictionary size in bytes, and the
    # compression level expected to be used in production.
    dict_content = _finalize_dict(zstd_dict.dict_content,
                                  flat_buffer, sample_sizes,
                                  dict_size, level)

    # Use the same ZstdDict name as the type check above and as train_dict(),
    # instead of mixing it with the _zstd.ZstdDict spelling.
    return ZstdDict(dict_content)

def compress(data, level=None, options=None, zstd_dict=None):
    """Compress *data* in one call and return the zstd compressed bytes.

    Refer to ZstdCompressor's docstring for a description of the
    optional arguments *level*, *options*, and *zstd_dict*.

    For incremental compression, use a ZstdCompressor instead.
    """
    compressor = ZstdCompressor(level=level, options=options,
                                zstd_dict=zstd_dict)
    # FLUSH_FRAME is passed so that this single call yields the whole result.
    compressed = compressor.compress(data, ZstdCompressor.FLUSH_FRAME)
    return compressed

def decompress(data, zstd_dict=None, options=None):
    """Decompress one or more frames of data and return the joined bytes.

    Refer to ZstdDecompressor's docstring for a description of the
    optional arguments *zstd_dict* and *options*.

    For incremental decompression, use a ZstdDecompressor instead.

    Raises ZstdError if the very first decompression attempt fails, or if
    a decompressor does not reach eof after consuming its input.
    """
    pieces = []
    remaining = data
    while True:
        # A fresh decompressor is created for every pass; whatever it leaves
        # in unused_data is fed to the next one.
        decoder = ZstdDecompressor(options=options, zstd_dict=zstd_dict)
        try:
            piece = decoder.decompress(remaining)
        except ZstdError:
            if not pieces:
                # Failure on the first pass: nothing was decoded, so
                # propagate the error.
                raise
            # Leftover data after already decoded output could not be
            # decompressed as zstd data; it is ignored.
            # NOTE(review): this best-effort handling mirrors lzma.decompress;
            # confirm it is also the desired behaviour for zstd.
            break
        pieces.append(piece)
        if not decoder.eof:
            raise ZstdError("Compressed data ended before the "
                            "end-of-stream marker was reached")
        remaining = decoder.unused_data
        if not remaining:
            break
    return b"".join(pieces)

class _UnsupportedCParameter:
    """Descriptor standing in for a CParameter that is not available.

    The attribute name it is bound to is recorded on assignment in a class
    body; every read of the attribute raises NotImplementedError.
    """

    def __set_name__(self, _, name):
        # Remember the attribute name so the error message can mention it.
        self.name = name

    def __get__(self, *_, **__):
        details = (self.name, zstd_version)
        raise NotImplementedError(
            ("%s CParameter not available, zstd version is %s.") % details
        )


class CParameter(IntEnum):
    """Compression parameters.

    Each member's value is the matching constant exported by the _zstd
    extension module.
    """

    compressionLevel = _zstd._ZSTD_c_compressionLevel
    windowLog = _zstd._ZSTD_c_windowLog
    hashLog = _zstd._ZSTD_c_hashLog
    chainLog = _zstd._ZSTD_c_chainLog
    searchLog = _zstd._ZSTD_c_searchLog
    minMatch = _zstd._ZSTD_c_minMatch
    targetLength = _zstd._ZSTD_c_targetLength
    strategy = _zstd._ZSTD_c_strategy

    # Not an enum member: reading this attribute raises NotImplementedError
    # (see _UnsupportedCParameter).
    targetCBlockSize = _UnsupportedCParameter()

    # Long distance matching (ldm) parameters.
    enableLongDistanceMatching = _zstd._ZSTD_c_enableLongDistanceMatching
    ldmHashLog = _zstd._ZSTD_c_ldmHashLog
    ldmMinMatch = _zstd._ZSTD_c_ldmMinMatch
    ldmBucketSizeLog = _zstd._ZSTD_c_ldmBucketSizeLog
    ldmHashRateLog = _zstd._ZSTD_c_ldmHashRateLog

    # Flags.
    contentSizeFlag = _zstd._ZSTD_c_contentSizeFlag
    checksumFlag = _zstd._ZSTD_c_checksumFlag
    dictIDFlag = _zstd._ZSTD_c_dictIDFlag

    # nbWorkers is also used at module level to compute
    # zstd_support_multithread.
    nbWorkers = _zstd._ZSTD_c_nbWorkers
    jobSize = _zstd._ZSTD_c_jobSize
    overlapLog = _zstd._ZSTD_c_overlapLog

    @lru_cache(maxsize=None)
    def bounds(self):
        """Return lower and upper bounds of a compression parameter, both inclusive."""
        # The first argument of _get_param_bounds selects the parameter
        # family; 1 means compression parameter.
        is_compress = 1
        return _zstd._get_param_bounds(is_compress, self.value)


class DParameter(IntEnum):
    """Decompression parameters.

    Each member's value is the matching constant exported by the _zstd
    extension module.
    """

    windowLogMax = _zstd._ZSTD_d_windowLogMax

    @lru_cache(maxsize=None)
    def bounds(self):
        """Return lower and upper bounds of a decompression parameter, both inclusive."""
        # The first argument of _get_param_bounds selects the parameter
        # family; 0 means decompression parameter.
        is_compress = 0
        return _zstd._get_param_bounds(is_compress, self.value)


class Strategy(IntEnum):
    """Compression strategies, listed from fastest to strongest.

    Note: new strategies might be added in the future; only the order
    (from fast to strong) is guaranteed.
    """

    # Each value is the matching constant exported by the _zstd extension
    # module.
    fast = _zstd._ZSTD_fast
    dfast = _zstd._ZSTD_dfast
    greedy = _zstd._ZSTD_greedy
    lazy = _zstd._ZSTD_lazy
    lazy2 = _zstd._ZSTD_lazy2
    btlazy2 = _zstd._ZSTD_btlazy2
    btopt = _zstd._ZSTD_btopt
    btultra = _zstd._ZSTD_btultra
    btultra2 = _zstd._ZSTD_btultra2


# Set CParameter/DParameter types for validity check
_zstd._set_parameter_types(CParameter, DParameter)

# True unless the bounds reported for CParameter.nbWorkers are exactly (0, 0).
# NOTE(review): presumably (0, 0) means the underlying zstd library was built
# without multithreading support — confirm against _zstd.
zstd_support_multithread = CParameter.nbWorkers.bounds() != (0, 0)
Loading
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy