From f5a3997cc4a5a5a96989bfe01a4df43038db02e1 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Fri, 27 Jun 2025 12:18:48 +0200 Subject: [PATCH 01/27] Build: Speed up the library extraction and build by avoiding to re-extract already unpacked files. Also make the extraction of downloaded content more secure by validating the tar file content. --- buildlibxml.py | 52 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 39 insertions(+), 13 deletions(-) diff --git a/buildlibxml.py b/buildlibxml.py index cc61d65b2..df7691a56 100644 --- a/buildlibxml.py +++ b/buildlibxml.py @@ -371,24 +371,50 @@ def download_library(dest_dir, location, name, version_re, filename, version=Non return dest_filename -def unpack_tarball(tar_filename, dest): +def unpack_tarball(tar_filename, dest) -> str: print('Unpacking %s into %s' % (os.path.basename(tar_filename), dest)) - if sys.version_info[0] < 3 and tar_filename.endswith('.xz'): - # Py 2.7 lacks lzma support - tar_cm = py2_tarxz(tar_filename) - else: - tar_cm = closing(tarfile.open(tar_filename)) + os_path = os.path + abs_dest = os_path.abspath(dest) + + tar_cm = tarfile.open(tar_filename) + + if hasattr(tarfile, 'data_filter'): + tar_cm.extraction_filter = tarfile.data_filter base_dir = None - with tar_cm as tar: + with closing(tar_cm) as tar: + directories = [] for member in tar: - base_name = member.name.split('/')[0] + # Guard against malicious tar file content. + path = os_path.join(dest, member.name) + abs_path = os_path.abspath(path) + if not os_path.commonpath([abs_dest, abs_path]).startswith(abs_dest): + raise RuntimeError('Unexpected path in %s: %s' % (tar_filename, member.name)) + + if member.isdir(): + directories.append(member) + continue + elif not member.isfile(): + raise RuntimeError('Unexpected path in %s: %s' % (tar_filename, member.name)) + + # Find common base directory. + first_dir = member.name.split('/')[0] if base_dir is None: - base_dir = base_name - elif base_dir != base_name: - print('Unexpected path in %s: %s' % (tar_filename, base_name)) - tar.extractall(dest) - return os.path.join(dest, base_dir) + base_dir = first_dir + elif base_dir != first_dir: + print('Unexpected path in %s: %s' % (tar_filename, first_dir)) + continue + + # Extract only new files. + if os_path.exists(abs_path) and os_path.getsize(abs_path) == member.size: + continue + tar.extract(member) + + # Update directory properties/times/etc. + for member in directories: + tar.extract(member) + + return os_path.join(dest, base_dir) def call_subprocess(cmd, **kw): From f8e490c7cd60713176fb17e7b3f035f7ddfbc322 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Fri, 27 Jun 2025 09:09:29 +0200 Subject: [PATCH 02/27] Use bit field for parser config flags instead of a series of several oversized ints. --- src/lxml/parser.pxi | 74 +++++++++++++++++++++++++++------------------ 1 file changed, 44 insertions(+), 30 deletions(-) diff --git a/src/lxml/parser.pxi b/src/lxml/parser.pxi index 93b6ef5ae..fa8e64dd3 100644 --- a/src/lxml/parser.pxi +++ b/src/lxml/parser.pxi @@ -827,6 +827,26 @@ cdef inline int _fixHtmlDictNodeNames(tree.xmlDict* c_dict, return 0 +cdef extern from *: + """ + typedef struct { + unsigned int for_html: 1; + unsigned int remove_comments: 1; + unsigned int remove_pis: 1; + unsigned int strip_cdata: 1; + unsigned int collect_ids: 1; + unsigned int resolve_external_entities: 1; + } __lxml_ParserFlags; + """ + ctypedef struct ParserFlags "__lxml_ParserFlags": + bint for_html + bint remove_comments + bint remove_pis + bint strip_cdata + bint collect_ids + bint resolve_external_entities + + @cython.internal cdef class _BaseParser: cdef ElementClassLookup _class_lookup @@ -834,12 +854,7 @@ cdef class _BaseParser: cdef _ParserContext _parser_context cdef _ParserContext _push_parser_context cdef int _parse_options - cdef bint _for_html - cdef bint _remove_comments - cdef bint _remove_pis - cdef bint _strip_cdata - cdef bint _collect_ids - cdef bint _resolve_external_entities + cdef ParserFlags _flags cdef XMLSchema _schema cdef bytes _filename cdef readonly object target @@ -855,15 +870,17 @@ cdef class _BaseParser: raise TypeError, "This class cannot be instantiated" self._parse_options = parse_options + self._flags = ParserFlags( + for_html=for_html, + remove_comments=remove_comments, + remove_pis=remove_pis, + strip_cdata=strip_cdata, + collect_ids=collect_ids, + resolve_external_entities=resolve_external_entities, + ) + self.target = target - self._for_html = for_html - self._remove_comments = remove_comments - self._remove_pis = remove_pis - self._strip_cdata = strip_cdata - self._collect_ids = collect_ids - self._resolve_external_entities = resolve_external_entities self._schema = schema - self._resolvers = _ResolverRegistry() if encoding is None: @@ -891,7 +908,7 @@ cdef class _BaseParser: cdef xmlparser.xmlParserCtxt* pctxt if self._parser_context is None: self._parser_context = self._createContext(self.target, None) - self._parser_context._collect_ids = self._collect_ids + self._parser_context._collect_ids = self._flags.collect_ids if self._schema is not None: self._parser_context._validator = \ self._schema._newSaxValidator( @@ -906,7 +923,7 @@ cdef class _BaseParser: if self._push_parser_context is None: self._push_parser_context = self._createContext( self.target, self._events_to_collect) - self._push_parser_context._collect_ids = self._collect_ids + self._push_parser_context._collect_ids = self._flags.collect_ids if self._schema is not None: self._push_parser_context._validator = \ self._schema._newSaxValidator( @@ -937,14 +954,14 @@ cdef class _BaseParser: @cython.final cdef int _configureSaxContext(self, xmlparser.xmlParserCtxt* pctxt) except -1: - if self._remove_comments: + if self._flags.remove_comments: pctxt.sax.comment = NULL - if self._remove_pis: + if self._flags.remove_pis: pctxt.sax.processingInstruction = NULL - if self._strip_cdata: + if self._flags.strip_cdata: # hard switch-off for CDATA nodes => makes them plain text pctxt.sax.cdataBlock = NULL - if not self._resolve_external_entities: + if not self._flags.resolve_external_entities: pctxt.sax.getEntity = _getInternalEntityOnly cdef int _registerHtmlErrorHandler(self, xmlparser.xmlParserCtxt* c_ctxt) except -1: @@ -971,7 +988,7 @@ cdef class _BaseParser: Create and initialise a libxml2-level parser context. """ cdef xmlparser.xmlParserCtxt* c_ctxt - if self._for_html: + if self._flags.for_html: c_ctxt = htmlparser.htmlCreateMemoryParserCtxt('dummy', 5) if c_ctxt is not NULL: self._registerHtmlErrorHandler(c_ctxt) @@ -985,7 +1002,7 @@ cdef class _BaseParser: cdef xmlparser.xmlParserCtxt* _newPushParserCtxt(self) except NULL: cdef xmlparser.xmlParserCtxt* c_ctxt cdef char* c_filename = _cstr(self._filename) if self._filename is not None else NULL - if self._for_html: + if self._flags.for_html: c_ctxt = htmlparser.htmlCreatePushParserCtxt( NULL, NULL, NULL, 0, c_filename, tree.XML_CHAR_ENCODING_NONE) if c_ctxt is not NULL: @@ -1033,10 +1050,7 @@ cdef class _BaseParser: cdef _BaseParser parser parser = self.__class__() parser._parse_options = self._parse_options - parser._for_html = self._for_html - parser._remove_comments = self._remove_comments - parser._remove_pis = self._remove_pis - parser._strip_cdata = self._strip_cdata + parser._flags = self._flags parser._filename = self._filename parser._resolvers = self._resolvers parser.target = self.target @@ -1111,7 +1125,7 @@ cdef class _BaseParser: __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) orig_options = pctxt.options with nogil: - if self._for_html: + if self._flags.for_html: result = htmlparser.htmlCtxtReadMemory( pctxt, c_text, buffer_len, c_filename, c_encoding, self._parse_options) @@ -1169,7 +1183,7 @@ cdef class _BaseParser: orig_options = pctxt.options with nogil: - if self._for_html: + if self._flags.for_html: result = htmlparser.htmlCtxtReadMemory( pctxt, c_text, c_len, c_filename, c_encoding, self._parse_options) @@ -1207,7 +1221,7 @@ cdef class _BaseParser: orig_options = pctxt.options with nogil: - if self._for_html: + if self._flags.for_html: result = htmlparser.htmlCtxtReadFile( pctxt, c_filename, c_encoding, self._parse_options) if result is not NULL: @@ -1411,7 +1425,7 @@ cdef class _FeedParser(_BaseParser): if char_data is not NULL: buffer_len = 4 if py_buffer_len > 4 else py_buffer_len orig_loader = _register_document_loader() - if self._for_html: + if self._flags.for_html: error = _htmlCtxtResetPush( pctxt, char_data, buffer_len, c_filename, c_encoding, self._parse_options) @@ -1494,7 +1508,7 @@ cdef class _FeedParser(_BaseParser): pctxt = context._c_ctxt self._feed_parser_running = 0 - if self._for_html: + if self._flags.for_html: htmlparser.htmlParseChunk(pctxt, NULL, 0, 1) else: xmlparser.xmlParseChunk(pctxt, NULL, 0, 1) From 56367fa7ae889e23b27be00bf3ca500fc6a23dc0 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Fri, 27 Jun 2025 12:35:34 +0200 Subject: [PATCH 03/27] Build: Fix output directory of library extraction. --- buildlibxml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/buildlibxml.py b/buildlibxml.py index df7691a56..7973cec63 100644 --- a/buildlibxml.py +++ b/buildlibxml.py @@ -408,7 +408,7 @@ def unpack_tarball(tar_filename, dest) -> str: # Extract only new files. if os_path.exists(abs_path) and os_path.getsize(abs_path) == member.size: continue - tar.extract(member) + tar.extract(member, abs_dest) # Update directory properties/times/etc. for member in directories: From 0b51241810b0060dfd49dba5a92866adccf16a9b Mon Sep 17 00:00:00 2001 From: scoder Date: Fri, 27 Jun 2025 16:55:52 +0200 Subject: [PATCH 04/27] Make the XML names dict local to a parser rather than local to a thread. (GH-466) This makes it easier to control for users since dicts cannot shrink, only grow. --- src/lxml/apihelpers.pxi | 60 ++++++++-------- src/lxml/classlookup.pxi | 17 +++-- src/lxml/debug.pxi | 8 +-- src/lxml/etree.pyx | 29 ++++---- src/lxml/parser.pxi | 146 +++++++++++++++++++++------------------ src/lxml/xslt.pxi | 10 ++- 6 files changed, 142 insertions(+), 128 deletions(-) diff --git a/src/lxml/apihelpers.pxi b/src/lxml/apihelpers.pxi index f683e70db..0744774fb 100644 --- a/src/lxml/apihelpers.pxi +++ b/src/lxml/apihelpers.pxi @@ -79,6 +79,7 @@ cdef bint _isAncestorOrSame(xmlNode* c_ancestor, xmlNode* c_node) noexcept: c_node = c_node.parent return False + cdef _Element _makeElement(tag, xmlDoc* c_doc, _Document doc, _BaseParser parser, text, tail, attrib, nsmap, dict extra_attrs): @@ -96,9 +97,10 @@ cdef _Element _makeElement(tag, xmlDoc* c_doc, _Document doc, If 'c_doc' is also NULL, a new xmlDoc will be created. """ - cdef xmlNode* c_node + cdef bint is_new_doc = doc is None if doc is not None: c_doc = doc._c_doc + ns_utf, name_utf = _getNsTag(tag) if parser is not None and parser._for_html: _htmlTagValidOrRaise(name_utf) @@ -108,34 +110,30 @@ cdef _Element _makeElement(tag, xmlDoc* c_doc, _Document doc, _tagValidOrRaise(name_utf) if c_doc is NULL: c_doc = _newXMLDoc() - c_node = _createElement(c_doc, name_utf) + + if doc is None: + doc = _documentFactory(c_doc, parser) + if is_new_doc: + doc.initDict() + + cdef xmlNode* c_node = _createElement(c_doc, name_utf) if c_node is NULL: - if doc is None and c_doc is not NULL: - tree.xmlFreeDoc(c_doc) raise MemoryError() - try: - if doc is None: - tree.xmlDocSetRootElement(c_doc, c_node) - doc = _documentFactory(c_doc, parser) - if text is not None: - _setNodeText(c_node, text) - if tail is not None: - _setTailText(c_node, tail) - # add namespaces to node if necessary - _setNodeNamespaces(c_node, doc, ns_utf, nsmap) - _initNodeAttributes(c_node, doc, attrib, extra_attrs) - return _elementFactory(doc, c_node) - except: - # free allocated c_node/c_doc unless Python does it for us - if c_node.doc is not c_doc: - # node not yet in document => will not be freed by document - if tail is not None: - _removeText(c_node.next) # tail - tree.xmlFreeNode(c_node) - if doc is None: - # c_doc will not be freed by doc - tree.xmlFreeDoc(c_doc) - raise + if is_new_doc: + tree.xmlDocSetRootElement(c_doc, c_node) + + # add namespaces to node if necessary + _setNodeNamespaces(c_node, doc, ns_utf, nsmap) + + if text is not None: + _setNodeText(c_node, text) + if tail is not None: + _setTailText(c_node, tail) + + _initNodeAttributes(c_node, doc, attrib, extra_attrs) + + return _elementFactory(doc, c_node) + cdef int _initNewElement(_Element element, bint is_html, name_utf, ns_utf, _BaseParser parser, attrib, nsmap, dict extra_attrs) except -1: @@ -153,13 +151,14 @@ cdef int _initNewElement(_Element element, bint is_html, name_utf, ns_utf, else: _tagValidOrRaise(name_utf) c_doc = _newXMLDoc() + + doc = _documentFactory(c_doc, parser) + doc.initDict() + c_node = _createElement(c_doc, name_utf) if c_node is NULL: - if c_doc is not NULL: - tree.xmlFreeDoc(c_doc) raise MemoryError() tree.xmlDocSetRootElement(c_doc, c_node) - doc = _documentFactory(c_doc, parser) # add namespaces to node if necessary _setNodeNamespaces(c_node, doc, ns_utf, nsmap) _initNodeAttributes(c_node, doc, attrib, extra_attrs) @@ -167,6 +166,7 @@ cdef int _initNewElement(_Element element, bint is_html, name_utf, ns_utf, element._init() return 0 + cdef _Element _makeSubElement(_Element parent, tag, text, tail, attrib, nsmap, dict extra_attrs): """Create a new child element and initialize text content, namespaces and diff --git a/src/lxml/classlookup.pxi b/src/lxml/classlookup.pxi index 92d1d47a5..fcd766533 100644 --- a/src/lxml/classlookup.pxi +++ b/src/lxml/classlookup.pxi @@ -108,14 +108,15 @@ cdef class CommentBase(_Comment): """ def __init__(self, text): # copied from Comment() factory - cdef _Document doc - cdef xmlDoc* c_doc if text is None: text = b'' else: text = _utf8(text) + c_doc = _newXMLDoc() doc = _documentFactory(c_doc, None) + doc.initDict() + self._c_node = _createComment(c_doc, _xcstr(text)) if self._c_node is NULL: raise MemoryError() @@ -138,15 +139,16 @@ cdef class PIBase(_ProcessingInstruction): """ def __init__(self, target, text=None): # copied from PI() factory - cdef _Document doc - cdef xmlDoc* c_doc target = _utf8(target) if text is None: text = b'' else: text = _utf8(text) + c_doc = _newXMLDoc() doc = _documentFactory(c_doc, None) + doc.initDict() + self._c_node = _createPI(c_doc, _xcstr(target), _xcstr(text)) if self._c_node is NULL: raise MemoryError() @@ -167,8 +169,6 @@ cdef class EntityBase(_Entity): called after object creation. """ def __init__(self, name): - cdef _Document doc - cdef xmlDoc* c_doc name_utf = _utf8(name) c_name = _xcstr(name_utf) if c_name[0] == c'#': @@ -176,8 +176,11 @@ cdef class EntityBase(_Entity): raise ValueError, f"Invalid character reference: '{name}'" elif not _xmlNameIsValid(c_name): raise ValueError, f"Invalid entity reference: '{name}'" + c_doc = _newXMLDoc() doc = _documentFactory(c_doc, None) + doc.initDict() + self._c_node = _createEntity(c_doc, c_name) if self._c_node is NULL: raise MemoryError() @@ -553,7 +556,7 @@ def set_element_class_lookup(ElementClassLookup lookup = None): This defines the main entry point for looking up element implementations. The standard implementation uses the :class:`ParserBasedElementClassLookup` - to delegate to different lookup schemes for each parser. + to delegate to different lookup schemes for each parser. .. warning:: diff --git a/src/lxml/debug.pxi b/src/lxml/debug.pxi index d728e8419..b6c1c1f31 100644 --- a/src/lxml/debug.pxi +++ b/src/lxml/debug.pxi @@ -24,13 +24,9 @@ cdef class _MemDebug: def dict_size(self): """dict_size(self) - Returns the current size of the global name dictionary used by libxml2 - for the current thread. Each thread has its own dictionary. + Returns the current size of the default parser's name dictionary used by libxml2. """ - c_dict = __GLOBAL_PARSER_CONTEXT._getThreadDict(NULL) - if c_dict is NULL: - raise MemoryError() - return tree.xmlDictSize(c_dict) + return __GLOBAL_PARSER_CONTEXT.getDefaultParser().dict_size memory_debugger = _MemDebug() diff --git a/src/lxml/etree.pyx b/src/lxml/etree.pyx index 562d95ed1..927918f8e 100644 --- a/src/lxml/etree.pyx +++ b/src/lxml/etree.pyx @@ -481,6 +481,9 @@ cdef public class _Document [ type LxmlDocumentType, object LxmlDocument ]: # the document tree.xmlFreeDoc(self._c_doc) + cdef void initDict(self) noexcept: + self._parser.initDocDict(self._c_doc) + @cython.final cdef getroot(self): # return an element proxy for the document root @@ -3214,10 +3217,6 @@ def Comment(text=None): Comment element factory. This factory function creates a special element that will be serialized as an XML comment. """ - cdef _Document doc - cdef xmlNode* c_node - cdef xmlDoc* c_doc - if text is None: text = b'' else: @@ -3227,6 +3226,8 @@ def Comment(text=None): c_doc = _newXMLDoc() doc = _documentFactory(c_doc, None) + doc.initDict() + c_node = _createComment(c_doc, _xcstr(text)) tree.xmlAddChild(c_doc, c_node) return _elementFactory(doc, c_node) @@ -3238,10 +3239,6 @@ def ProcessingInstruction(target, text=None): ProcessingInstruction element factory. This factory function creates a special element that will be serialized as an XML processing instruction. """ - cdef _Document doc - cdef xmlNode* c_node - cdef xmlDoc* c_doc - target = _utf8(target) _tagValidOrRaise(target) if target.lower() == b'xml': @@ -3256,6 +3253,8 @@ def ProcessingInstruction(target, text=None): c_doc = _newXMLDoc() doc = _documentFactory(c_doc, None) + doc.initDict() + c_node = _createPI(c_doc, _xcstr(target), _xcstr(text)) tree.xmlAddChild(c_doc, c_node) return _elementFactory(doc, c_node) @@ -3291,9 +3290,6 @@ def Entity(name): declared in the document. A document that uses entity references requires a DTD to define the entities. """ - cdef _Document doc - cdef xmlNode* c_node - cdef xmlDoc* c_doc name_utf = _utf8(name) c_name = _xcstr(name_utf) if c_name[0] == c'#': @@ -3301,8 +3297,11 @@ def Entity(name): raise ValueError, f"Invalid character reference: '{name}'" elif not _xmlNameIsValid(c_name): raise ValueError, f"Invalid entity reference: '{name}'" + c_doc = _newXMLDoc() doc = _documentFactory(c_doc, None) + doc.initDict() + c_node = _createEntity(c_doc, c_name) tree.xmlAddChild(c_doc, c_node) return _elementFactory(doc, c_node) @@ -3317,6 +3316,7 @@ def SubElement(_Element _parent not None, _tag, """ return _makeSubElement(_parent, _tag, None, None, attrib, nsmap, _extra) + from typing import Generic, TypeVar T = TypeVar("T") @@ -3327,11 +3327,7 @@ class ElementTree(ABC, Generic[T]): ElementTree wrapper class. """ - cdef xmlNode* c_next - cdef xmlNode* c_node - cdef xmlNode* c_node_copy cdef xmlDoc* c_doc - cdef _ElementTree etree cdef _Document doc if element is not None: @@ -3344,15 +3340,18 @@ class ElementTree(ABC, Generic[T]): else: c_doc = _newXMLDoc() doc = _documentFactory(c_doc, parser) + doc.initDict() return _elementTreeFactory(doc, element) + # Register _ElementTree as a virtual subclass of ElementTree ElementTree.register(_ElementTree) # Remove "ABC" and typing helpers from module dict del ABC, Generic, TypeVar, T + def HTML(text, _BaseParser parser=None, *, base_url=None): """HTML(text, parser=None, base_url=None) diff --git a/src/lxml/parser.pxi b/src/lxml/parser.pxi index fa8e64dd3..dc3629bbb 100644 --- a/src/lxml/parser.pxi +++ b/src/lxml/parser.pxi @@ -44,6 +44,53 @@ cdef class ParserError(LxmlError): """ +@cython.final +@cython.internal +cdef class _ParserDictionary: + # The string dictionary of a parser, shared by all of its parsed documents. + + cdef tree.xmlDict* _c_dict + + def __cinit__(self): + self._c_dict = xmlparser.xmlDictCreate() + + def __dealloc__(self): + xmlparser.xmlDictFree(self._c_dict) + self._c_dict = NULL + + cdef tree.xmlDict *getDict(self): + return self._c_dict + + cdef tree.xmlDict *getDictRef(self): + c_dict = self._c_dict + xmlparser.xmlDictReference(c_dict) + return c_dict + + cdef size_t getDictSize(self): + return tree.xmlDictSize(self._c_dict) + + cdef void initDictRef(self, tree.xmlDict** c_dict_ref) noexcept: + c_dict = c_dict_ref[0] + if c_dict is self._c_dict: + return + + c_dict_ref[0] = self.getDictRef() + if c_dict is not NULL: + xmlparser.xmlDictFree(c_dict) + + cdef void initParserDict(self, xmlparser.xmlParserCtxt* pctxt) noexcept: + "Assure we always use the same string dictionary." + self.initDictRef(&pctxt.dict) + pctxt.dictNames = 1 + + #cdef void initXPathParserDict(self, xpath.xmlXPathContext* pctxt) noexcept: + # "Assure we always use the same string dictionary." + # self.initDictRef(&pctxt.dict) + + cdef void initDocDict(self, xmlDoc *c_doc) noexcept: + self.initDictRef(&c_doc.dict) + + @cython.final @cython.internal cdef class _ParserDictionaryContext: @@ -56,17 +103,12 @@ cdef class _ParserDictionaryContext: # __GLOBAL_PARSER_CONTEXT as defined below the class. # - cdef tree.xmlDict* _c_dict cdef _BaseParser _default_parser cdef list _implied_parser_contexts def __cinit__(self): self._implied_parser_contexts = [] - def __dealloc__(self): - if self._c_dict is not NULL: - xmlparser.xmlDictFree(self._c_dict) - cdef int initMainParserContext(self) except -1: """Put the global context into the thread dictionary of the main thread. To be called once and only in the main thread.""" @@ -105,48 +147,6 @@ cdef class _ParserDictionaryContext: context._default_parser = self._default_parser._copy() return context._default_parser - cdef tree.xmlDict* _getThreadDict(self, tree.xmlDict* default): - "Return the thread-local dict or create a new one if necessary." - cdef _ParserDictionaryContext context - context = self._findThreadParserContext() - if context._c_dict is NULL: - # thread dict not yet set up => use default or create a new one - if default is not NULL: - context._c_dict = default - xmlparser.xmlDictReference(default) - return default - if self._c_dict is NULL: - self._c_dict = xmlparser.xmlDictCreate() - if context is not self: - context._c_dict = xmlparser.xmlDictCreateSub(self._c_dict) - return context._c_dict - - cdef int initThreadDictRef(self, tree.xmlDict** c_dict_ref) except -1: - c_dict = c_dict_ref[0] - c_thread_dict = self._getThreadDict(c_dict) - if c_dict is c_thread_dict: - return 0 - if c_dict is not NULL: - xmlparser.xmlDictFree(c_dict) - c_dict_ref[0] = c_thread_dict - xmlparser.xmlDictReference(c_thread_dict) - - cdef int initParserDict(self, xmlparser.xmlParserCtxt* pctxt) except -1: - "Assure we always use the same string dictionary." - self.initThreadDictRef(&pctxt.dict) - pctxt.dictNames = 1 - - cdef int initXPathParserDict(self, xpath.xmlXPathContext* pctxt) except -1: - "Assure we always use the same string dictionary." - self.initThreadDictRef(&pctxt.dict) - - cdef int initDocDict(self, xmlDoc* result) except -1: - "Store dict of last object parsed if no shared dict yet" - # XXX We also free the result dict here if there already was one. - # This case should only occur for new documents with empty dicts, - # otherwise we'd free data that's in use => segfault - self.initThreadDictRef(&result.dict) - cdef _ParserContext findImpliedContext(self): """Return any current implied xml parser context for the current thread. This is used when the resolver functions are called @@ -537,6 +537,7 @@ cdef void _reset_document_loader(xmlparser.xmlExternalEntityLoader old) noexcept @cython.internal cdef class _ParserContext(_ResolverContext): cdef _ErrorLog _error_log + cdef _ParserDictionary _dict cdef _ParserSchemaValidationContext _validator cdef xmlparser.xmlParserCtxt* _c_ctxt cdef xmlparser.xmlExternalEntityLoader _orig_loader @@ -549,6 +550,7 @@ cdef class _ParserContext(_ResolverContext): if config.ENABLE_THREADING: self._lock = python.PyThread_allocate_lock() self._error_log = _ErrorLog() + self._dict = _ParserDictionary() def __dealloc__(self): if config.ENABLE_THREADING and self._lock is not NULL: @@ -578,6 +580,7 @@ cdef class _ParserContext(_ResolverContext): """ self._c_ctxt = c_ctxt c_ctxt._private = self + self._dict.initParserDict(c_ctxt) cdef void _resetParserContext(self) noexcept: if self._c_ctxt is not NULL: @@ -703,11 +706,11 @@ cdef xmlDoc* _handleParseResult(_ParserContext context, # to parse the document. cdef bint well_formed if result is not NULL: - __GLOBAL_PARSER_CONTEXT.initDocDict(result) + context._dict.initDocDict(result) if c_ctxt.myDoc is not NULL: if c_ctxt.myDoc is not result: - __GLOBAL_PARSER_CONTEXT.initDocDict(c_ctxt.myDoc) + context._dict.initDocDict(c_ctxt.myDoc) tree.xmlFreeDoc(c_ctxt.myDoc) c_ctxt.myDoc = NULL @@ -1018,6 +1021,14 @@ cdef class _BaseParser: c_ctxt.sax.startDocument = _initSaxDocument return c_ctxt + @cython.final + cdef void initDocDict(self, tree.xmlDoc *c_doc) noexcept: + self._getParserContext()._dict.initDocDict(c_doc) + + @cython.final + cdef tree.xmlDict* getDict(self) noexcept: + return self._getParserContext()._dict.getDict() + @property def error_log(self): """The error log of the last parser run. @@ -1036,6 +1047,15 @@ cdef class _BaseParser: """The version of the underlying XML parser.""" return "libxml2 %d.%d.%d" % LIBXML_VERSION + @property + def dict_size(self): + cdef size_t size = 0 + if self._parser_context is not None: + size += self._parser_context._dict.getDictSize() + if self._push_parser_context is not None: + size += self._push_parser_context._dict.getDictSize() + return size + def set_element_class_lookup(self, ElementClassLookup lookup = None): """set_element_class_lookup(self, lookup = None) @@ -1122,7 +1142,6 @@ cdef class _BaseParser: context.prepare() try: pctxt = context._c_ctxt - __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) orig_options = pctxt.options with nogil: if self._flags.for_html: @@ -1154,9 +1173,6 @@ cdef class _BaseParser: context = self._getParserContext() context.prepare() try: - pctxt = context._c_ctxt - __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) - if self._default_encoding is None: c_encoding = NULL # libxml2 (at least 2.9.3) does not recognise UTF-32 BOMs @@ -1181,6 +1197,7 @@ cdef class _BaseParser: else: c_encoding = _cstr(self._default_encoding) + pctxt = context._c_ctxt orig_options = pctxt.options with nogil: if self._flags.for_html: @@ -1211,14 +1228,12 @@ cdef class _BaseParser: context = self._getParserContext() context.prepare() try: - pctxt = context._c_ctxt - __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) - if self._default_encoding is None: c_encoding = NULL else: c_encoding = _cstr(self._default_encoding) + pctxt = context._c_ctxt orig_options = pctxt.options with nogil: if self._flags.for_html: @@ -1242,7 +1257,6 @@ cdef class _BaseParser: cdef _ParserContext context cdef _FileReaderContext file_context cdef xmlDoc* result - cdef xmlparser.xmlParserCtxt* pctxt cdef char* c_filename if not filename: filename = None @@ -1250,12 +1264,10 @@ cdef class _BaseParser: context = self._getParserContext() context.prepare() try: - pctxt = context._c_ctxt - __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) file_context = _FileReaderContext( filelike, context, filename, encoding or self._default_encoding) - result = file_context._readDoc(pctxt, self._parse_options) + result = file_context._readDoc(context._c_ctxt, self._parse_options) return context._handleParseResultDoc( self, result, filename) @@ -1324,8 +1336,8 @@ cdef void _initSaxDocument(void* ctxt) noexcept with gil: if c_doc and c_ctxt.dict and not c_doc.dict: # I have no idea why libxml2 disables this - we need it c_ctxt.dictNames = 1 - c_doc.dict = c_ctxt.dict xmlparser.xmlDictReference(c_ctxt.dict) + c_doc.dict = c_ctxt.dict # set up XML ID hash table if c_ctxt._private: @@ -1438,7 +1450,6 @@ cdef class _FeedParser(_BaseParser): char_data += buffer_len if error: raise MemoryError() - __GLOBAL_PARSER_CONTEXT.initParserDict(pctxt) #print pctxt.charset, 'NONE' if c_encoding is NULL else c_encoding @@ -1543,9 +1554,9 @@ cdef (int, int) _parse_data_chunk(xmlparser.xmlParserCtxt* c_ctxt, fixup_error = _fixHtmlDictSubtreeNames( c_ctxt.dict, c_ctxt.myDoc, c_node) if c_ctxt.myDoc.dict and c_ctxt.myDoc.dict is not c_ctxt.dict: + xmlparser.xmlDictReference(c_ctxt.dict) xmlparser.xmlDictFree(c_ctxt.myDoc.dict) c_ctxt.myDoc.dict = c_ctxt.dict - xmlparser.xmlDictReference(c_ctxt.dict) else: orig_loader = _register_document_loader() error = xmlparser.xmlParseChunk(c_ctxt, char_data, buffer_len, 0) @@ -1974,7 +1985,6 @@ cdef xmlDoc* _newXMLDoc() except NULL: raise MemoryError() if result.encoding is NULL: result.encoding = tree.xmlStrdup("UTF-8") - __GLOBAL_PARSER_CONTEXT.initDocDict(result) return result cdef xmlDoc* _newHTMLDoc() except NULL: @@ -1982,7 +1992,6 @@ cdef xmlDoc* _newHTMLDoc() except NULL: result = tree.htmlNewDoc(NULL, NULL) if result is NULL: raise MemoryError() - __GLOBAL_PARSER_CONTEXT.initDocDict(result) return result cdef xmlDoc* _copyDoc(xmlDoc* c_doc, int recursive) except NULL: @@ -1994,7 +2003,8 @@ cdef xmlDoc* _copyDoc(xmlDoc* c_doc, int recursive) except NULL: result = tree.xmlCopyDoc(c_doc, 0) if result is NULL: raise MemoryError() - __GLOBAL_PARSER_CONTEXT.initDocDict(result) + xmlparser.xmlDictReference(c_doc.dict) + result.dict = c_doc.dict return result cdef xmlDoc* _copyDocRoot(xmlDoc* c_doc, xmlNode* c_new_root) except NULL: @@ -2002,7 +2012,9 @@ cdef xmlDoc* _copyDocRoot(xmlDoc* c_doc, xmlNode* c_new_root) except NULL: cdef xmlDoc* result cdef xmlNode* c_node result = tree.xmlCopyDoc(c_doc, 0) # non recursive - __GLOBAL_PARSER_CONTEXT.initDocDict(result) + assert result.dict is NULL + xmlparser.xmlDictReference(c_doc.dict) + result.dict = c_doc.dict with nogil: c_node = tree.xmlDocCopyNode(c_new_root, result, 1) # recursive if c_node is NULL: diff --git a/src/lxml/xslt.pxi b/src/lxml/xslt.pxi index 659d7054c..9ffe27677 100644 --- a/src/lxml/xslt.pxi +++ b/src/lxml/xslt.pxi @@ -62,6 +62,7 @@ cdef _initXSLTResolverContext(_XSLTResolverContext context, context._parser = parser context._c_style_doc = NULL + cdef xmlDoc* _xslt_resolve_from_python(const_xmlChar* c_uri, void* c_context, int parse_options, int* error) with gil: # call the Python document loaders @@ -101,6 +102,7 @@ cdef xmlDoc* _xslt_resolve_from_python(const_xmlChar* c_uri, void* c_context, doc_ref._file, doc_ref._filename, context._parser) elif doc_ref._type == PARSER_DATA_EMPTY: c_return_doc = _newXMLDoc() + context._parser.initDocDict(c_return_doc) if c_return_doc is not NULL and c_return_doc.URL is NULL: c_return_doc.URL = tree.xmlStrdup(c_uri) except: @@ -598,11 +600,12 @@ cdef class XSLT: if resolver_context is not None: resolver_context.clear() - result_doc = _documentFactory(c_result, input_doc._parser) - c_dict = c_result.dict xmlparser.xmlDictReference(c_dict) - __GLOBAL_PARSER_CONTEXT.initThreadDictRef(&c_result.dict) + + result_doc = _documentFactory(c_result, input_doc._parser) + result_doc.initDict() + if c_dict is not c_result.dict or \ self._c_style.doc.dict is not c_result.dict or \ input_doc._c_doc.dict is not c_result.dict: @@ -616,6 +619,7 @@ cdef class XSLT: if input_doc._c_doc.dict is not c_result.dict: fixThreadDictNames(c_result, input_doc._c_doc.dict, c_result.dict) + xmlparser.xmlDictFree(c_dict) return _xsltResultTreeFactory(result_doc, self, profile_doc) From ff9035f510c6a90df1d0fc50390500a54cbf70b5 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Fri, 27 Jun 2025 16:57:04 +0200 Subject: [PATCH 05/27] Increase version number. --- src/lxml/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lxml/__init__.py b/src/lxml/__init__.py index 8443a3498..2673b6542 100644 --- a/src/lxml/__init__.py +++ b/src/lxml/__init__.py @@ -1,6 +1,6 @@ # this is a package -__version__ = "6.0.0" +__version__ = "7.0.0a0" def get_include(): From 4ffa77747513c2592d69f07bde98647dc88a2824 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Sat, 28 Jun 2025 10:00:00 +0200 Subject: [PATCH 06/27] Fix "document is HTML" detection. Prefer document node type over parser config. --- src/lxml/apihelpers.pxi | 13 +++++++++---- src/lxml/etree.pyx | 10 +++++++++- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/src/lxml/apihelpers.pxi b/src/lxml/apihelpers.pxi index 0744774fb..2eac2ac63 100644 --- a/src/lxml/apihelpers.pxi +++ b/src/lxml/apihelpers.pxi @@ -97,12 +97,17 @@ cdef _Element _makeElement(tag, xmlDoc* c_doc, _Document doc, If 'c_doc' is also NULL, a new xmlDoc will be created. """ + cdef bint is_html = False cdef bint is_new_doc = doc is None + if doc is not None: c_doc = doc._c_doc + is_html = doc.ishtml() + elif parser is not None: + is_html = parser._flags.for_html ns_utf, name_utf = _getNsTag(tag) - if parser is not None and parser._for_html: + if is_html: _htmlTagValidOrRaise(name_utf) if c_doc is NULL: c_doc = _newHTMLDoc() @@ -180,7 +185,7 @@ cdef _Element _makeSubElement(_Element parent, tag, text, tail, ns_utf, name_utf = _getNsTag(tag) c_doc = parent._doc._c_doc - if parent._doc._parser is not None and parent._doc._parser._for_html: + if parent._doc.ishtml(): _htmlTagValidOrRaise(name_utf) else: _tagValidOrRaise(name_utf) @@ -315,7 +320,7 @@ cdef _initNodeAttributes(xmlNode* c_node, _Document doc, attrib, dict extra): raise TypeError, f"Invalid attribute dictionary: {python._fqtypename(attrib).decode('utf8')}" if not attrib and not extra: return # nothing to do - is_html = doc._parser._for_html + is_html = doc.ishtml() seen = set() if extra: for name, value in extra.items(): @@ -582,7 +587,7 @@ cdef int _setAttributeValue(_Element element, key, value) except -1: cdef const_xmlChar* c_value cdef xmlNs* c_ns ns, tag = _getNsTag(key) - is_html = element._doc._parser._for_html + is_html = element._doc.ishtml() if not is_html: _attributeValidOrRaise(tag) c_tag = _xcstr(tag) diff --git a/src/lxml/etree.pyx b/src/lxml/etree.pyx index 927918f8e..92c1d4410 100644 --- a/src/lxml/etree.pyx +++ b/src/lxml/etree.pyx @@ -547,6 +547,10 @@ cdef public class _Document [ type LxmlDocumentType, object LxmlDocument ]: else: return (self._c_doc.standalone == 1) + @cython.final + cdef bint ishtml(self): + return self._c_doc.type == tree.XML_HTML_DOCUMENT_NODE + @cython.final cdef bytes buildNewPrefix(self): # get a new unique prefix ("nsX") for this document @@ -765,6 +769,10 @@ cdef class DocInfo: """ return self._doc.isstandalone() + @property + def is_html(self): + return self._doc.ishtml() + property URL: "The source URL of the document (or None if unknown)." def __get__(self): @@ -1133,7 +1141,7 @@ cdef public class _Element [ type LxmlElementType, object LxmlElement ]: _assertValidNode(self) ns, name = _getNsTag(value) parser = self._doc._parser - if parser is not None and parser._for_html: + if self._doc.ishtml(): _htmlTagValidOrRaise(name) else: _tagValidOrRaise(name) From b0660e3a5b7d4127ddce3917ddce7d1e470ea01b Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Fri, 4 Jul 2025 18:31:31 +0200 Subject: [PATCH 07/27] Add an integration test that runs ElementTree's serialiser over lxml's XML tree. --- src/lxml/tests/test_elementtree.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/lxml/tests/test_elementtree.py b/src/lxml/tests/test_elementtree.py index 784dbfc18..bf92ffa16 100644 --- a/src/lxml/tests/test_elementtree.py +++ b/src/lxml/tests/test_elementtree.py @@ -14,6 +14,7 @@ import re import sys import textwrap +import types import unittest from contextlib import contextmanager from functools import wraps, partial @@ -4978,6 +4979,35 @@ def setUpClass(cls): r'This method will be removed.*\.iter\(\).*instead', PendingDeprecationWarning) + def test_elementtree_serialises_lxml_tree(self): + # Parse tree with lxml.etree. + root = etree.XML(""" + + A + + + + """) + + # Sanity checks. + self.assertNotIsInstance(etree.tostring, types.FunctionType) + self.assertIsInstance(self.etree.tostring, types.FunctionType) + + # Serialised with xml.etree.ElementTree.tostring() + xml_tostring = self.etree.tostring(root, encoding='utf8') + self.assertIn(b'', xml_tostring) + self.assertIn(b'', xml_tostring) + + # ET.write() + out = io.BytesIO() + self.etree.ElementTree(root).write(out, encoding='utf8') + xml_write = out.getvalue() + self.assertIn(b'', xml_write) + self.assertIn(b'', xml_write) + + # Both should be identical because they used the same serialiser. + self.assertEqual(xml_tostring, xml_write) + filter_by_version( ElementTreeTestCase, ElementTreeTestCase.required_versions_ET, ET_VERSION) From 0603be38f8642765366db0ff1e8e8c2f069abf56 Mon Sep 17 00:00:00 2001 From: Nick Wellnhofer Date: Sun, 6 Jul 2025 08:06:15 +0200 Subject: [PATCH 08/27] Fix expected messages for XPath namespace errors (GH-467) libxml2 2.15 adds additional detail to some XPath error messages, for example the actual name of undefined prefixes, functions or variables. Use the doctest.ELLIPSIS option to handle such messages. --- doc/extensions.txt | 2 +- doc/xpathxslt.txt | 4 ++-- src/lxml/tests/common_imports.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/doc/extensions.txt b/doc/extensions.txt index 45bcf9795..5d14247eb 100644 --- a/doc/extensions.txt +++ b/doc/extensions.txt @@ -249,7 +249,7 @@ the global mapping of the FunctionNamespace objects: >>> e2('/foo:a') Traceback (most recent call last): ... - lxml.etree.XPathEvalError: Undefined namespace prefix + lxml.etree.XPathEvalError: Undefined namespace prefix... Evaluator-local extensions diff --git a/doc/xpathxslt.txt b/doc/xpathxslt.txt index 3b0b899c4..d2480f03c 100644 --- a/doc/xpathxslt.txt +++ b/doc/xpathxslt.txt @@ -412,7 +412,7 @@ During evaluation, lxml will emit an XPathEvalError on errors: >>> find(root) Traceback (most recent call last): ... - lxml.etree.XPathEvalError: Undefined namespace prefix + lxml.etree.XPathEvalError: Undefined namespace prefix... This works for the ``XPath`` class, however, the other evaluators (including the ``xpath()`` method) are one-shot operations that do parsing and evaluation @@ -429,7 +429,7 @@ in one step. They therefore raise evaluation exceptions in all cases: >>> find = root.xpath("//ns:a") Traceback (most recent call last): ... - lxml.etree.XPathEvalError: Undefined namespace prefix + lxml.etree.XPathEvalError: Undefined namespace prefix... >>> find = root.xpath("\\") Traceback (most recent call last): diff --git a/src/lxml/tests/common_imports.py b/src/lxml/tests/common_imports.py index 4ef6e770e..44916c273 100644 --- a/src/lxml/tests/common_imports.py +++ b/src/lxml/tests/common_imports.py @@ -104,7 +104,7 @@ def BytesIO(*args): def make_doctest(filename): file_path = os.path.join(DOC_DIR, filename) - return doctest.DocFileSuite(file_path, module_relative=False, encoding='utf-8') + return doctest.DocFileSuite(file_path, module_relative=False, encoding='utf-8', optionflags=doctest.ELLIPSIS) class HelperTestCase(unittest.TestCase): From c414c894e1d99c26547b6026ac3b12c30904ba16 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Sun, 6 Jul 2025 09:35:49 +0200 Subject: [PATCH 09/27] Tests: Prevent global config leak on test failures. --- src/lxml/tests/test_xmlschema.py | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/src/lxml/tests/test_xmlschema.py b/src/lxml/tests/test_xmlschema.py index 0e7e03ccc..9341d2a0a 100644 --- a/src/lxml/tests/test_xmlschema.py +++ b/src/lxml/tests/test_xmlschema.py @@ -436,10 +436,13 @@ def test_xmlschema_resolvers_root(self): # test that the default resolver will get called if there's no # specific parser resolver. root_resolver = self.simple_resolver(self.resolver_schema_ext) - etree.get_default_parser().resolvers.add(root_resolver) - schema_doc = etree.parse(self.resolver_schema_int) - schema = etree.XMLSchema(schema_doc) - etree.get_default_parser().resolvers.remove(root_resolver) + default_resolvers = etree.get_default_parser().resolvers + default_resolvers.add(root_resolver) + try: + schema_doc = etree.parse(self.resolver_schema_int) + schema = etree.XMLSchema(schema_doc) + finally: + default_resolvers.remove(root_resolver) def test_xmlschema_resolvers_noroot(self): # test that the default resolver will not get called when a @@ -451,14 +454,16 @@ def resolve(self, url, id, context): return None root_resolver = res_root() - etree.get_default_parser().resolvers.add(root_resolver) - - parser = etree.XMLParser() - parser.resolvers.add(self.simple_resolver(self.resolver_schema_ext)) + default_resolvers = etree.get_default_parser().resolvers + default_resolvers.add(root_resolver) + try: + parser = etree.XMLParser() + parser.resolvers.add(self.simple_resolver(self.resolver_schema_ext)) - schema_doc = etree.parse(self.resolver_schema_int, parser = parser) - schema = etree.XMLSchema(schema_doc) - etree.get_default_parser().resolvers.remove(root_resolver) + schema_doc = etree.parse(self.resolver_schema_int, parser = parser) + schema = etree.XMLSchema(schema_doc) + finally: + default_resolvers.remove(root_resolver) def test_xmlschema_nested_resolvers(self): # test that resolvers work in a nested fashion. From 0306c3fa5b81f68c35817164fa20325d9e300a6e Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 7 Jul 2025 22:35:43 +0200 Subject: [PATCH 10/27] Build: bump pypa/cibuildwheel in the github-actions group (GH.468) Bumps the github-actions group with 1 update: [pypa/cibuildwheel](https://github.com/pypa/cibuildwheel). Updates `pypa/cibuildwheel` from 3.0.0 to 3.0.1 - [Release notes](https://github.com/pypa/cibuildwheel/releases) - [Changelog](https://github.com/pypa/cibuildwheel/blob/main/docs/changelog.md) - [Commits](https://github.com/pypa/cibuildwheel/compare/v3.0.0...v3.0.1) --- updated-dependencies: - dependency-name: pypa/cibuildwheel dependency-version: 3.0.1 dependency-type: direct:production update-type: version-update:semver-patch dependency-group: github-actions ... Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- .github/workflows/wheels.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index e812ea5d4..de4f499b5 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -136,13 +136,13 @@ jobs: platforms: all - name: Build wheels - uses: pypa/cibuildwheel@v3.0.0 + uses: pypa/cibuildwheel@v3.0.1 with: only: ${{ matrix.only }} - name: Build old Linux wheels if: contains(matrix.only, '-manylinux_') && (contains(matrix.only, 'i686') || contains(matrix.only, 'x86_64') || contains(matrix.only, 'aarch64')) - uses: pypa/cibuildwheel@v3.0.0 + uses: pypa/cibuildwheel@v3.0.1 env: CIBW_MANYLINUX_i686_IMAGE: manylinux2014 CIBW_MANYLINUX_X86_64_IMAGE: manylinux2014 @@ -153,7 +153,7 @@ jobs: - name: Build faster Linux wheels # also build wheels with the most recent manylinux images and gcc if: runner.os == 'Linux' && !contains(matrix.only, 'i686') - uses: pypa/cibuildwheel@v3.0.0 + uses: pypa/cibuildwheel@v3.0.1 env: CIBW_MANYLINUX_X86_64_IMAGE: manylinux_2_28 CIBW_MANYLINUX_AARCH64_IMAGE: manylinux_2_28 From 05808e9f9a42004723a49e42f97a20cc7790c2c7 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Tue, 8 Jul 2025 21:12:05 +0200 Subject: [PATCH 11/27] Remove unused import. --- src/lxml/tests/test_threading.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lxml/tests/test_threading.py b/src/lxml/tests/test_threading.py index 3b0e3fb2a..50dbd9f7f 100644 --- a/src/lxml/tests/test_threading.py +++ b/src/lxml/tests/test_threading.py @@ -8,7 +8,7 @@ import unittest import threading -from .common_imports import etree, HelperTestCase, BytesIO, _bytes +from .common_imports import etree, HelperTestCase, BytesIO try: from Queue import Queue From 64951f611e38103ab08301b278e2b101cfd9b404 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Tue, 8 Jul 2025 21:14:07 +0200 Subject: [PATCH 12/27] Avoid a condition where we can simply sum up conditional values. --- src/lxml/etree.pyx | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/lxml/etree.pyx b/src/lxml/etree.pyx index 92c1d4410..f6545d1aa 100644 --- a/src/lxml/etree.pyx +++ b/src/lxml/etree.pyx @@ -2669,8 +2669,7 @@ cdef class _Attrib: cdef xmlAttr* c_attr = self._element._c_node.properties cdef Py_ssize_t c = 0 while c_attr is not NULL: - if c_attr.type == tree.XML_ATTRIBUTE_NODE: - c += 1 + c += (c_attr.type == tree.XML_ATTRIBUTE_NODE) c_attr = c_attr.next return c From 1ace3ab8a368d75fa1cb142fa46812101fc0ba8e Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Tue, 8 Jul 2025 21:28:19 +0200 Subject: [PATCH 13/27] Avoid a condition where we can simply sum up conditional values. --- src/lxml/apihelpers.pxi | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/lxml/apihelpers.pxi b/src/lxml/apihelpers.pxi index 2eac2ac63..6f3cebc31 100644 --- a/src/lxml/apihelpers.pxi +++ b/src/lxml/apihelpers.pxi @@ -629,8 +629,7 @@ cdef list _collectAttributes(xmlNode* c_node, int collecttype): c_attr = c_node.properties count = 0 while c_attr is not NULL: - if c_attr.type == tree.XML_ATTRIBUTE_NODE: - count += 1 + count += (c_attr.type == tree.XML_ATTRIBUTE_NODE) c_attr = c_attr.next if not count: From f33ac2c2f5f9c4c4c1fc47f363be96db308f2fa6 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Tue, 8 Jul 2025 22:27:05 +0200 Subject: [PATCH 14/27] Avoid checking the mapping size multiple times since it might have a non-trivial cost. --- src/lxml/apihelpers.pxi | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/lxml/apihelpers.pxi b/src/lxml/apihelpers.pxi index 6f3cebc31..d52cff30a 100644 --- a/src/lxml/apihelpers.pxi +++ b/src/lxml/apihelpers.pxi @@ -318,14 +318,18 @@ cdef _initNodeAttributes(xmlNode* c_node, _Document doc, attrib, dict extra): cdef xmlNs* c_ns if attrib is not None and not hasattr(attrib, 'items'): raise TypeError, f"Invalid attribute dictionary: {python._fqtypename(attrib).decode('utf8')}" - if not attrib and not extra: + + has_attrib = bool(attrib) + has_extra = bool(extra) + if not has_attrib and not has_extra: return # nothing to do + is_html = doc.ishtml() seen = set() - if extra: + if has_extra: for name, value in extra.items(): _addAttributeToNode(c_node, doc, is_html, name, value, seen) - if attrib: + if has_attrib: for name, value in _iter_attrib(attrib): _addAttributeToNode(c_node, doc, is_html, name, value, seen) From 1cb950e7360ab1dff3f5d20842de4803cc509236 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Thu, 17 Jul 2025 22:11:08 +0200 Subject: [PATCH 15/27] Add safety fix for "lxml.sax._getNsTag" function to prevent crashes on invalid tag names. Closes https://bugs.launchpad.net/lxml/+bug/2116333 --- src/lxml/sax.py | 7 +++---- src/lxml/tests/test_sax.py | 30 ++++++++++++++++++++++++++++-- 2 files changed, 31 insertions(+), 6 deletions(-) diff --git a/src/lxml/sax.py b/src/lxml/sax.py index 12088880e..db77f6f29 100644 --- a/src/lxml/sax.py +++ b/src/lxml/sax.py @@ -1,5 +1,3 @@ -# cython: language_level=2 - """ SAX-based adapter to copy trees from/to the Python standard library. @@ -32,7 +30,7 @@ class SaxError(etree.LxmlError): def _getNsTag(tag): - if tag[0] == '{': + if tag[0] == '{' and '}' in tag: return tuple(tag[1:].split('}', 1)) else: return None, tag @@ -152,10 +150,11 @@ def characters(self, data): try: # if there already is a child element, we must append to its tail last_element = last_element[-1] - last_element.tail = (last_element.tail or '') + data except IndexError: # otherwise: append to the text last_element.text = (last_element.text or '') + data + else: + last_element.tail = (last_element.tail or '') + data ignorableWhitespace = characters diff --git a/src/lxml/tests/test_sax.py b/src/lxml/tests/test_sax.py index e2d03c255..2c8379497 100644 --- a/src/lxml/tests/test_sax.py +++ b/src/lxml/tests/test_sax.py @@ -7,8 +7,8 @@ from xml.dom import pulldom from xml.sax.handler import ContentHandler -from .common_imports import HelperTestCase, make_doctest, BytesIO, _bytes -from lxml import sax +from .common_imports import HelperTestCase, make_doctest, BytesIO +from lxml import etree, sax class ETreeSaxTestCase(HelperTestCase): @@ -121,6 +121,32 @@ def test_sax_to_pulldom_multiple_namespaces(self): self.assertEqual('a', dom.firstChild.prefix) + def test_sax_non_html(self): + # https://bugs.launchpad.net/lxml/+bug/2116333 + events = [] + + from xml.sax.handler import ContentHandler + class MyContentHandler(ContentHandler): + def startElementNS(self, name, qname, attributes): + events.append(("START", name, qname, attributes.items())) + + def characters(self, data): + events.append(("DATA", data)) + + markup = ( + '' + '' + '' + ) + + parser = etree.HTMLParser(recover=True) + tree = etree.fromstring(markup, parser) + + self.assertFalse(events) + sax.saxify(tree, MyContentHandler()) + # The exact list of parsed attributes depends on the libxml2 parser version. + self.assertTrue(events) + def test_element_sax(self): tree = self.parse('') a = tree.getroot() From 5f347bbe22cdbb329decb8ea20cc0ef5188f51c8 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Fri, 18 Jul 2025 10:17:13 +0200 Subject: [PATCH 16/27] CI: Pin all library versions to prevent hammering remote sites with version searches. --- .github/workflows/ci.yml | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8cadf23b4..d1322b974 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -173,9 +173,11 @@ jobs: env: OS_NAME: ${{ matrix.os }} PYTHON_VERSION: ${{ matrix.python-version }} - MACOSX_DEPLOYMENT_TARGET: 11.0 - LIBXML2_VERSION: 2.14.4 - LIBXSLT_VERSION: 1.1.43 + MACOSX_DEPLOYMENT_TARGET: "11.0" + ZLIB_VERSION: "1.3.1" + LIBICONV_VERSION: "1.18" + LIBXML2_VERSION: "2.14.4" + LIBXSLT_VERSION: "1.1.43" COVERAGE: false GCC_VERSION: 9 USE_CCACHE: 1 From 259507c6e2a408148efdf9fa599d1f7910a865fd Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Mon, 21 Jul 2025 18:53:11 +0200 Subject: [PATCH 17/27] Disable the size limitations for the globally shared parser dict. Since we share the dict, users can control the dict size via the lifetime of a parser. --- src/lxml/includes/tree.pxd | 4 ++++ src/lxml/parser.pxi | 3 +++ 2 files changed, 7 insertions(+) diff --git a/src/lxml/includes/tree.pxd b/src/lxml/includes/tree.pxd index 43a52e647..df6b6a16d 100644 --- a/src/lxml/includes/tree.pxd +++ b/src/lxml/includes/tree.pxd @@ -77,6 +77,7 @@ cdef extern from "libxml/hash.h": cdef int xmlHashSize(xmlHashTable* table) nogil cdef void xmlHashFree(xmlHashTable* table, xmlHashDeallocator f) nogil + cdef extern from * nogil: # actually "libxml/dict.h" # libxml/dict.h appears to be broken to include in C ctypedef struct xmlDict @@ -84,6 +85,9 @@ cdef extern from * nogil: # actually "libxml/dict.h" cdef const_xmlChar* xmlDictExists(xmlDict* dict, const_xmlChar* name, int len) cdef int xmlDictOwns(xmlDict* dict, const_xmlChar* name) cdef size_t xmlDictSize(xmlDict* dict) + cdef size_t xmlDictSetLimit(xmlDict* dict, size_t limit) + cdef size_t xmlDictGetUsage(xmlDict* dict) + cdef extern from "libxml/tree.h" nogil: ctypedef struct xmlDoc diff --git a/src/lxml/parser.pxi b/src/lxml/parser.pxi index d199dc0a3..38399bd99 100644 --- a/src/lxml/parser.pxi +++ b/src/lxml/parser.pxi @@ -53,6 +53,8 @@ cdef class _ParserDictionary: def __cinit__(self): self._c_dict = xmlparser.xmlDictCreate() + # Disable size limitations for the globally shared parser dict. + tree.xmlDictSetLimit(self._c_dict, 0) def __dealloc__(self): xmlparser.xmlDictFree(self._c_dict) @@ -1353,6 +1355,7 @@ cdef void _initSaxDocument(void* ctxt) noexcept with gil: # memory errors are not fatal here c_dict = xmlparser.xmlDictCreate() if c_dict: + tree.xmlDictSetLimit(c_dict, 0) c_doc.ids = tree.xmlHashCreateDict(0, c_dict) xmlparser.xmlDictFree(c_dict) else: From 55517efda743d70054c96d22934fd721a20debca Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Mon, 21 Jul 2025 21:44:52 +0200 Subject: [PATCH 18/27] Disable the size limit of the global name dict only for parsers with "huge_tree=True". --- src/lxml/parser.pxi | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/src/lxml/parser.pxi b/src/lxml/parser.pxi index 38399bd99..cf2cb05ff 100644 --- a/src/lxml/parser.pxi +++ b/src/lxml/parser.pxi @@ -53,13 +53,14 @@ cdef class _ParserDictionary: def __cinit__(self): self._c_dict = xmlparser.xmlDictCreate() - # Disable size limitations for the globally shared parser dict. - tree.xmlDictSetLimit(self._c_dict, 0) def __dealloc__(self): xmlparser.xmlDictFree(self._c_dict) self._c_dict = NULL + cdef void disableSizeLimit(self): + tree.xmlDictSetLimit(self._c_dict, 0) + cdef tree.xmlDict *getDict(self): return self._c_dict @@ -919,6 +920,8 @@ cdef class _BaseParser: if self._parser_context is None: self._parser_context = self._createContext(self.target, None) self._parser_context._collect_ids = self._flags.collect_ids + if self._parse_options & tree.XML_PARSE_HUGE: + self._parser_context._dict.disableSizeLimit() if self._schema is not None: self._parser_context._validator = \ self._schema._newSaxValidator( @@ -934,6 +937,8 @@ cdef class _BaseParser: self._push_parser_context = self._createContext( self.target, self._events_to_collect) self._push_parser_context._collect_ids = self._flags.collect_ids + if self._parse_options & tree.XML_PARSE_HUGE: + self._push_parser_context._dict.disableSizeLimit() if self._schema is not None: self._push_parser_context._validator = \ self._schema._newSaxValidator( From 27313e909ab3d75d72687bde57b1505578326bcb Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Mon, 21 Jul 2025 21:56:16 +0200 Subject: [PATCH 19/27] Increase iterparse chunk size to 64 KiB and make it configurable. --- src/lxml/iterparse.pxi | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/lxml/iterparse.pxi b/src/lxml/iterparse.pxi index 42b752499..ad7d70926 100644 --- a/src/lxml/iterparse.pxi +++ b/src/lxml/iterparse.pxi @@ -1,13 +1,12 @@ # iterparse -- event-driven parsing -DEF __ITERPARSE_CHUNK_SIZE = 32768 - cdef class iterparse: """iterparse(self, source, events=("end",), tag=None, \ attribute_defaults=False, dtd_validation=False, \ load_dtd=False, no_network=True, remove_blank_text=False, \ remove_comments=False, remove_pis=False, encoding=None, \ - html=False, recover=None, huge_tree=False, schema=None) + html=False, recover=None, huge_tree=False, schema=None, \ + chunk_size=65536) Incremental parser. @@ -42,7 +41,7 @@ cdef class iterparse: - remove_blank_text: discard blank text nodes - remove_comments: discard comments - remove_pis: discard processing instructions - - strip_cdata: replace CDATA sections by normal text content (default: + - strip_cdata: replace CDATA sections by normal text content (default: True for XML, ignored otherwise) - compact: safe memory for short text content (default: True) - resolve_entities: replace entities by their text value (default: True) @@ -55,6 +54,8 @@ cdef class iterparse: Other keyword arguments: - encoding: override the document encoding - schema: an XMLSchema to validate against + - chunk_size: the number of bytes to read from the 'source' in one chunk + (default: 65536) """ cdef _FeedParser _parser cdef object _tag @@ -63,6 +64,7 @@ cdef class iterparse: cdef object _source cdef object _filename cdef object _error + cdef object _chunk_size cdef bint _close_source_after_read def __init__(self, source, events=("end",), *, tag=None, @@ -71,7 +73,7 @@ cdef class iterparse: compact=True, resolve_entities=True, remove_comments=False, remove_pis=False, strip_cdata=True, encoding=None, html=False, recover=None, huge_tree=False, collect_ids=True, - XMLSchema schema=None): + XMLSchema schema=None, int chunk_size=65536): if not hasattr(source, 'read'): source = _getFSPathOrObject(source) self._filename = source @@ -124,6 +126,7 @@ cdef class iterparse: target=None, # TODO compact=compact) + self._chunk_size = chunk_size self._events = parser.read_events() self._parser = parser @@ -215,7 +218,7 @@ cdef class iterparse: @cython.final cdef bint _read_more_events(self, _SaxParserContext context) except -123: - data = self._source.read(__ITERPARSE_CHUNK_SIZE) + data = self._source.read(self._chunk_size) if not isinstance(data, bytes): self._close_source() raise TypeError("reading file objects must return bytes objects") From bef1d2a6bc847ab1078121b72ad2cce2e658a914 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Mon, 21 Jul 2025 22:15:40 +0200 Subject: [PATCH 20/27] Updata changelog. --- CHANGES.txt | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/CHANGES.txt b/CHANGES.txt index ab0f253ed..5f832e9cd 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -2,6 +2,25 @@ lxml changelog ============== +7.0.0 (2025-??-??) +================== + +Features added +-------------- + +* The shared parser name dict is now local to a parser (as opposed to global), + which allows to control its lifetime and cross-document usage more easily. + It is now also unbounded in size if the ``huge_tree=True`` option is provided. + +* The default chunk size for reading from file-likes in ``iterparse()`` was increased + from 32 KiB to 64 KiB and is now configurable with a new ``chunk_size`` argument. + +Other changes +------------- + +* Some internal adaptations were made for libxml2 2.14.x and 2.15.x. + + 6.0.0 (2025-06-26) ================== From 28d532f6f17943786ad00e02a0fb269ba1293e9d Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Mon, 21 Jul 2025 22:22:09 +0200 Subject: [PATCH 21/27] Fix enum origin name. --- src/lxml/parser.pxi | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/lxml/parser.pxi b/src/lxml/parser.pxi index cf2cb05ff..a30418583 100644 --- a/src/lxml/parser.pxi +++ b/src/lxml/parser.pxi @@ -919,13 +919,16 @@ cdef class _BaseParser: cdef xmlparser.xmlParserCtxt* pctxt if self._parser_context is None: self._parser_context = self._createContext(self.target, None) + self._parser_context._collect_ids = self._flags.collect_ids - if self._parse_options & tree.XML_PARSE_HUGE: + if self._parse_options & xmlparser.XML_PARSE_HUGE: self._parser_context._dict.disableSizeLimit() + if self._schema is not None: self._parser_context._validator = \ self._schema._newSaxValidator( self._parse_options & xmlparser.XML_PARSE_DTDATTR) + pctxt = self._newParserCtxt() _initParserContext(self._parser_context, self._resolvers, pctxt) self._configureSaxContext(pctxt) @@ -936,13 +939,16 @@ cdef class _BaseParser: if self._push_parser_context is None: self._push_parser_context = self._createContext( self.target, self._events_to_collect) + self._push_parser_context._collect_ids = self._flags.collect_ids - if self._parse_options & tree.XML_PARSE_HUGE: + if self._parse_options & xmlparser.XML_PARSE_HUGE: self._push_parser_context._dict.disableSizeLimit() + if self._schema is not None: self._push_parser_context._validator = \ self._schema._newSaxValidator( self._parse_options & xmlparser.XML_PARSE_DTDATTR) + pctxt = self._newPushParserCtxt() _initParserContext( self._push_parser_context, self._resolvers, pctxt) From eeee9af9b257c48bffd6f03a8b7dcea7a5632db0 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Wed, 23 Jul 2025 13:58:21 +0200 Subject: [PATCH 22/27] Add simple tests for passing the "huge_tree" parser option. --- src/lxml/tests/test_etree.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/lxml/tests/test_etree.py b/src/lxml/tests/test_etree.py index 03f387454..81c4a1e72 100644 --- a/src/lxml/tests/test_etree.py +++ b/src/lxml/tests/test_etree.py @@ -722,6 +722,14 @@ def test_parse_parser_type_error(self): parse = self.etree.parse self.assertRaises(TypeError, parse, 'notthere.xml', object()) + def test_parse_huge_tree(self): + fromstring = self.etree.fromstring + XMLParser = self.etree.XMLParser + + xml = b'' + parser = XMLParser(huge_tree=True) + self.assertEqual(2, len(fromstring(xml, parser=parser))) + def test_parse_premature_end(self): fromstring = self.etree.fromstring XMLParser = self.etree.XMLParser @@ -750,6 +758,17 @@ def test_iterparse_getiterator(self): [1,2,1,4], counts) + def test_iterparse_huge_tree(self): + iterparse = self.etree.iterparse + f = BytesIO(b'') + + counts = [] + for _, elem in iterparse(f, huge_tree=True): + counts.append(len(elem)) + self.assertEqual( + [0,1,0,2], + counts) + def test_iterparse_tree_comments(self): # ET removes comments iterparse = self.etree.iterparse From f13a7ff40406f0950de9af97cbcbb854c2e38d53 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Wed, 23 Jul 2025 14:18:47 +0200 Subject: [PATCH 23/27] Move xmlDict function declarations from "xmlparser.pxd" to "tree.pxd". --- src/lxml/includes/tree.pxd | 5 +++++ src/lxml/includes/xmlparser.pxd | 2 ++ src/lxml/parser.pxi | 22 +++++++++++----------- src/lxml/xslt.pxi | 10 +++++----- 4 files changed, 23 insertions(+), 16 deletions(-) diff --git a/src/lxml/includes/tree.pxd b/src/lxml/includes/tree.pxd index df6b6a16d..19ff6da4c 100644 --- a/src/lxml/includes/tree.pxd +++ b/src/lxml/includes/tree.pxd @@ -81,6 +81,11 @@ cdef extern from "libxml/hash.h": cdef extern from * nogil: # actually "libxml/dict.h" # libxml/dict.h appears to be broken to include in C ctypedef struct xmlDict + + cdef xmlDict* xmlDictCreate() + cdef xmlDict* xmlDictCreateSub(xmlDict* subdict) + cdef void xmlDictFree(xmlDict* sub) + cdef int xmlDictReference(xmlDict* dict) cdef const_xmlChar* xmlDictLookup(xmlDict* dict, const_xmlChar* name, int len) cdef const_xmlChar* xmlDictExists(xmlDict* dict, const_xmlChar* name, int len) cdef int xmlDictOwns(xmlDict* dict, const_xmlChar* name) diff --git a/src/lxml/includes/xmlparser.pxd b/src/lxml/includes/xmlparser.pxd index 3a721c1dc..ca905c6ec 100644 --- a/src/lxml/includes/xmlparser.pxd +++ b/src/lxml/includes/xmlparser.pxd @@ -136,10 +136,12 @@ cdef extern from "libxml/parser.h" nogil: cdef bint xmlHasFeature(xmlFeature feature) + # START: Legacy, moved to tree.pxd cdef xmlDict* xmlDictCreate() cdef xmlDict* xmlDictCreateSub(xmlDict* subdict) cdef void xmlDictFree(xmlDict* sub) cdef int xmlDictReference(xmlDict* dict) + # END: Legacy, moved to tree.pxd cdef int XML_COMPLETE_ATTRS # SAX option for adding DTD default attributes cdef int XML_SKIP_IDS # SAX option for not building an XML ID dict diff --git a/src/lxml/parser.pxi b/src/lxml/parser.pxi index a30418583..f4d3ce68e 100644 --- a/src/lxml/parser.pxi +++ b/src/lxml/parser.pxi @@ -52,10 +52,10 @@ cdef class _ParserDictionary: cdef tree.xmlDict* _c_dict def __cinit__(self): - self._c_dict = xmlparser.xmlDictCreate() + self._c_dict = tree.xmlDictCreate() def __dealloc__(self): - xmlparser.xmlDictFree(self._c_dict) + tree.xmlDictFree(self._c_dict) self._c_dict = NULL cdef void disableSizeLimit(self): @@ -66,7 +66,7 @@ cdef class _ParserDictionary: cdef tree.xmlDict *getDictRef(self): c_dict = self._c_dict - xmlparser.xmlDictReference(c_dict) + tree.xmlDictReference(c_dict) return c_dict cdef size_t getDictSize(self): @@ -79,7 +79,7 @@ cdef class _ParserDictionary: c_dict_ref[0] = self.getDictRef() if c_dict is not NULL: - xmlparser.xmlDictFree(c_dict) + tree.xmlDictFree(c_dict) cdef void initParserDict(self, xmlparser.xmlParserCtxt* pctxt) noexcept: "Assure we always use the same string dictionary." @@ -1354,7 +1354,7 @@ cdef void _initSaxDocument(void* ctxt) noexcept with gil: if c_doc and c_ctxt.dict and not c_doc.dict: # I have no idea why libxml2 disables this - we need it c_ctxt.dictNames = 1 - xmlparser.xmlDictReference(c_ctxt.dict) + tree.xmlDictReference(c_ctxt.dict) c_doc.dict = c_ctxt.dict # set up XML ID hash table @@ -1364,11 +1364,11 @@ cdef void _initSaxDocument(void* ctxt) noexcept with gil: # keep the global parser dict from filling up with XML IDs if c_doc and not c_doc.ids: # memory errors are not fatal here - c_dict = xmlparser.xmlDictCreate() + c_dict = tree.xmlDictCreate() if c_dict: tree.xmlDictSetLimit(c_dict, 0) c_doc.ids = tree.xmlHashCreateDict(0, c_dict) - xmlparser.xmlDictFree(c_dict) + tree.xmlDictFree(c_dict) else: c_doc.ids = tree.xmlHashCreate(0) else: @@ -1573,8 +1573,8 @@ cdef (int, int) _parse_data_chunk(xmlparser.xmlParserCtxt* c_ctxt, fixup_error = _fixHtmlDictSubtreeNames( c_ctxt.dict, c_ctxt.myDoc, c_node) if c_ctxt.myDoc.dict and c_ctxt.myDoc.dict is not c_ctxt.dict: - xmlparser.xmlDictReference(c_ctxt.dict) - xmlparser.xmlDictFree(c_ctxt.myDoc.dict) + tree.xmlDictReference(c_ctxt.dict) + tree.xmlDictFree(c_ctxt.myDoc.dict) c_ctxt.myDoc.dict = c_ctxt.dict else: orig_loader = _register_document_loader() @@ -2022,7 +2022,7 @@ cdef xmlDoc* _copyDoc(xmlDoc* c_doc, int recursive) except NULL: result = tree.xmlCopyDoc(c_doc, 0) if result is NULL: raise MemoryError() - xmlparser.xmlDictReference(c_doc.dict) + tree.xmlDictReference(c_doc.dict) result.dict = c_doc.dict return result @@ -2032,7 +2032,7 @@ cdef xmlDoc* _copyDocRoot(xmlDoc* c_doc, xmlNode* c_new_root) except NULL: cdef xmlNode* c_node result = tree.xmlCopyDoc(c_doc, 0) # non recursive assert result.dict is NULL - xmlparser.xmlDictReference(c_doc.dict) + tree.xmlDictReference(c_doc.dict) result.dict = c_doc.dict with nogil: c_node = tree.xmlDocCopyNode(c_new_root, result, 1) # recursive diff --git a/src/lxml/xslt.pxi b/src/lxml/xslt.pxi index 9ffe27677..c050155b6 100644 --- a/src/lxml/xslt.pxi +++ b/src/lxml/xslt.pxi @@ -523,17 +523,17 @@ cdef class XSLT: # non-input tag/attr names will come from the stylesheet # anyway. if transform_ctxt.dict is not NULL: - xmlparser.xmlDictFree(transform_ctxt.dict) + tree.xmlDictFree(transform_ctxt.dict) if kw: # parameter values are stored in the dict # => avoid unnecessarily cluttering the global dict - transform_ctxt.dict = xmlparser.xmlDictCreateSub(self._c_style.doc.dict) + transform_ctxt.dict = tree.xmlDictCreateSub(self._c_style.doc.dict) if transform_ctxt.dict is NULL: xslt.xsltFreeTransformContext(transform_ctxt) raise MemoryError() else: transform_ctxt.dict = self._c_style.doc.dict - xmlparser.xmlDictReference(transform_ctxt.dict) + tree.xmlDictReference(transform_ctxt.dict) xslt.xsltSetCtxtParseOptions( transform_ctxt, input_doc._parser._parse_options) @@ -601,7 +601,7 @@ cdef class XSLT: resolver_context.clear() c_dict = c_result.dict - xmlparser.xmlDictReference(c_dict) + tree.xmlDictReference(c_dict) result_doc = _documentFactory(c_result, input_doc._parser) result_doc.initDict() @@ -620,7 +620,7 @@ cdef class XSLT: fixThreadDictNames(c_result, input_doc._c_doc.dict, c_result.dict) - xmlparser.xmlDictFree(c_dict) + tree.xmlDictFree(c_dict) return _xsltResultTreeFactory(result_doc, self, profile_doc) From 39de2f3fa9dd6e88297f8210c300dcc20f152236 Mon Sep 17 00:00:00 2001 From: George Rawlinson Date: Sat, 26 Jul 2025 18:04:48 +1200 Subject: [PATCH 24/27] Remove Trove License classifier to avoid setuptools deprecation warning (GH-470) When building with a recent version of setuptools, it emits a deprecation warning, e.g.: SetuptoolsDeprecationWarning: License classifiers are deprecated. !! ******************************************************************************** Please consider removing the following classifiers in favor of a SPDX license expression: License :: OSI Approved :: BSD License See https://packaging.python.org/en/latest/guides/writing-pyproject-toml/#license for details. ******************************************************************************** !! The setuptools documentation explains the required changes. https://packaging.python.org/en/latest/guides/writing-pyproject-toml/#license-and-license-files Please note that I did not add "license-files", because this is not recognised in older setuptools versions and will fail to build. --- setup.py | 1 - 1 file changed, 1 deletion(-) diff --git a/setup.py b/setup.py index c63225644..7375f0222 100644 --- a/setup.py +++ b/setup.py @@ -230,7 +230,6 @@ def build_packages(files): versioninfo.dev_status(), 'Intended Audience :: Developers', 'Intended Audience :: Information Technology', - 'License :: OSI Approved :: BSD License', 'Programming Language :: Cython', # NOTE: keep in sync with 'python_requires' list above. 'Programming Language :: Python :: 3', From 9261bb7bf09cdceddfc4050ddd34bc60c9b7c172 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Wed, 23 Jul 2025 20:36:20 +0200 Subject: [PATCH 25/27] Minor code cleanup. --- src/lxml/classlookup.pxi | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/src/lxml/classlookup.pxi b/src/lxml/classlookup.pxi index fcd766533..49cf85ae5 100644 --- a/src/lxml/classlookup.pxi +++ b/src/lxml/classlookup.pxi @@ -449,12 +449,10 @@ cdef object _custom_class_lookup(state, _Document doc, xmlNode* c_node): element_type = "entity" else: element_type = "element" - if c_node.name is NULL: - name = None - else: - name = funicode(c_node.name) + + name = funicodeOrNone(c_node.name) c_str = tree._getNs(c_node) - ns = funicode(c_str) if c_str is not NULL else None + ns = funicodeOrNone(c_str) cls = lookup.lookup(element_type, doc, ns, name) if cls is not None: From 42e90e674905aa63bde276e62250d021a60e73c8 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Sat, 26 Jul 2025 11:14:09 +0200 Subject: [PATCH 26/27] Build: Pin all library versions to prevent hammering remote sites with version searches. --- .github/workflows/wheels.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index de4f499b5..b2713afdf 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -113,6 +113,8 @@ jobs: include: ${{ fromJson(needs.generate-wheels-matrix.outputs.include) }} env: + ZLIB_VERSION: "1.3.1" + LIBICONV_VERSION: "1.18" LIBXML2_VERSION: 2.14.4 LIBXSLT_VERSION: 1.1.43 From 56f0eef494e8a30721c3a7bb2766dee752de16ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Miro=20Hron=C4=8Dok?= Date: Sun, 27 Jul 2025 15:46:25 +0200 Subject: [PATCH 27/27] Do not require wheel for building (GH-469) - current version of setuptools (70.1+) does not need wheel at all - older versions of setuptools would fetch wheel when building wheels (but not sdists) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index a7ef4d7e3..9ebac4a04 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,5 +1,5 @@ [build-system] -requires = ["Cython>=3.1.2", "setuptools", "wheel"] +requires = ["Cython>=3.1.2", "setuptools"] [tool.cibuildwheel] build-verbosity = 1 pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy