From 0fdf179391c5d926dc75a7dec2604cd3021c798f Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Fri, 29 Mar 2024 08:46:55 +0100 Subject: [PATCH 001/321] Build: Increase minimum CPU architecture of Linux wheels from core2 to sandybridge (launched 2011, adds e.g. SSE4.2, AVX, PCLMUL, POPCNT). --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e7f6bb66c..ba413dabd 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ archs = ["x86_64", "aarch64", "i686"] repair-wheel-command = "auditwheel repair --strip -w {dest_dir} {wheel}" [tool.cibuildwheel.linux.environment] -CFLAGS = "-O3 -g1 -pipe -fPIC -flto -march=core2" +CFLAGS = "-O3 -g1 -pipe -fPIC -flto -march=sandybridge" AR = "gcc-ar" NM = "gcc-nm" RANLIB = "gcc-ranlib" From a37fac99f93447f0096e917c7286c1dad2c829b5 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Fri, 29 Mar 2024 08:57:38 +0100 Subject: [PATCH 002/321] Build: Add a build step for additional Linux wheels with manylinux 2.28 and gcc 12. --- .github/workflows/wheels.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 53e08d08d..1a8ea234f 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -143,6 +143,23 @@ jobs: with: only: ${{ matrix.only }} + - name: Build faster Linux wheels + # also build wheels with the most recent manylinux images and gcc + if: runner.os == 'Linux' + uses: pypa/cibuildwheel@v2.16.5 + env: + CIBW_MANYLINUX_X86_64_IMAGE: manylinux_2_28 + CIBW_MANYLINUX_I686_IMAGE: manylinux_2_28 + CIBW_MANYLINUX_PYPY_X86_64_IMAGE: manylinux_2_28 + CIBW_MANYLINUX_AARCH64_IMAGE: manylinux_2_28 + CIBW_MANYLINUX_PYPY_AARCH64_IMAGE: manylinux_2_28 + CIBW_MANYLINUX_PYPY_I686_IMAGE: manylinux_2_28 + CIBW_MUSLLINUX_X86_64_IMAGE: musllinux_1_2 + CIBW_MUSLLINUX_I686_IMAGE: musllinux_1_2 + CIBW_MUSLLINUX_AARCH64_IMAGE: musllinux_1_2 + with: + only: ${{ matrix.only }} + - uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1 with: path: ./wheelhouse/*.whl From 221e19e9fef0c4171202eb0226e21b49aef89958 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Fri, 29 Mar 2024 08:57:57 +0100 Subject: [PATCH 003/321] Build: Fix directory name. --- .github/workflows/wheels.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 1a8ea234f..1d0e5a489 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -181,7 +181,7 @@ jobs: merge-multiple: true - name: List downloaded artifacts - run: ls -la ~/downloads + run: ls -la ./wheel_upload - uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1 with: From c99b253ed45250ed8d17cc6360b4af9a5b364008 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Fri, 29 Mar 2024 09:14:05 +0100 Subject: [PATCH 004/321] Build: Release sdist and wheels together. --- .github/workflows/wheels.yml | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 1d0e5a489..5402842af 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -59,12 +59,6 @@ jobs: run: make html sdist env: { STATIC_DEPS: false; CFLAGS="-Og" } # it's run-once, so build more quickly - - name: Release - uses: softprops/action-gh-release@v2 - if: github.ref_type == 'tag' - with: - files: dist/*.tar.gz - - name: Upload sdist uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1 with: @@ -167,29 +161,31 @@ jobs: upload_release_assets: name: Upload Release Assets - needs: [ build_wheels ] + needs: [ sdist, build_wheels ] runs-on: ubuntu-latest permissions: contents: write steps: - - name: Download bdist files + - name: Download artifacts uses: actions/download-artifact@eaceaf801fd36c7dee90939fad912460b18a1ffe # v4.1.2 with: - path: ./wheel_upload + path: ./release_upload merge-multiple: true - name: List downloaded artifacts - run: ls -la ./wheel_upload + run: ls -la ./release_upload - uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1 with: - path: ./wheel_upload/*.whl + path: ./release_upload/*.whl name: all_wheels - name: Release uses: softprops/action-gh-release@v2 if: github.ref_type == 'tag' with: - files: ./wheel_upload/*.whl + files: | + ./release_upload/*.whl + ./release_upload/*.tar.gz From 1acc1b95d73a41f071cb1516fbf3c620d2a50293 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Fri, 29 Mar 2024 09:18:46 +0100 Subject: [PATCH 005/321] Build: See if a more recent cibuildwheel allows easier manylinux image selection. --- .github/workflows/wheels.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 5402842af..c2591673f 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -133,14 +133,14 @@ jobs: platforms: all - name: Build wheels - uses: pypa/cibuildwheel@v2.16.5 + uses: pypa/cibuildwheel@v2.17.0 with: only: ${{ matrix.only }} - name: Build faster Linux wheels # also build wheels with the most recent manylinux images and gcc if: runner.os == 'Linux' - uses: pypa/cibuildwheel@v2.16.5 + uses: pypa/cibuildwheel@v2.17.0 env: CIBW_MANYLINUX_X86_64_IMAGE: manylinux_2_28 CIBW_MANYLINUX_I686_IMAGE: manylinux_2_28 From f2d880d3d5c652e6d19950881dd08dbe0e1b7024 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Fri, 29 Mar 2024 11:08:28 +0100 Subject: [PATCH 006/321] Build: manylinux_2_28 has no i686 images. --- .github/workflows/wheels.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index c2591673f..eef8abfb1 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -139,15 +139,15 @@ jobs: - name: Build faster Linux wheels # also build wheels with the most recent manylinux images and gcc - if: runner.os == 'Linux' + if: runner.os == 'Linux' && !contains(matrix.only, 'i686') uses: pypa/cibuildwheel@v2.17.0 env: CIBW_MANYLINUX_X86_64_IMAGE: manylinux_2_28 CIBW_MANYLINUX_I686_IMAGE: manylinux_2_28 - CIBW_MANYLINUX_PYPY_X86_64_IMAGE: manylinux_2_28 CIBW_MANYLINUX_AARCH64_IMAGE: manylinux_2_28 - CIBW_MANYLINUX_PYPY_AARCH64_IMAGE: manylinux_2_28 + CIBW_MANYLINUX_PYPY_X86_64_IMAGE: manylinux_2_28 CIBW_MANYLINUX_PYPY_I686_IMAGE: manylinux_2_28 + CIBW_MANYLINUX_PYPY_AARCH64_IMAGE: manylinux_2_28 CIBW_MUSLLINUX_X86_64_IMAGE: musllinux_1_2 CIBW_MUSLLINUX_I686_IMAGE: musllinux_1_2 CIBW_MUSLLINUX_AARCH64_IMAGE: musllinux_1_2 From eee57ec2cebe20c95b01d699d48733e85f2d87af Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Fri, 29 Mar 2024 11:16:30 +0100 Subject: [PATCH 007/321] Build: Add more architectures to wheel build. --- .github/workflows/wheels.yml | 7 ++++--- pyproject.toml | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index eef8abfb1..6a052afac 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -143,14 +143,15 @@ jobs: uses: pypa/cibuildwheel@v2.17.0 env: CIBW_MANYLINUX_X86_64_IMAGE: manylinux_2_28 - CIBW_MANYLINUX_I686_IMAGE: manylinux_2_28 CIBW_MANYLINUX_AARCH64_IMAGE: manylinux_2_28 + CIBW_MANYLINUX_PPC64LE_IMAGE: manylinux_2_28 + CIBW_MANYLINUX_S390X_IMAGE: manylinux_2_28 CIBW_MANYLINUX_PYPY_X86_64_IMAGE: manylinux_2_28 - CIBW_MANYLINUX_PYPY_I686_IMAGE: manylinux_2_28 CIBW_MANYLINUX_PYPY_AARCH64_IMAGE: manylinux_2_28 CIBW_MUSLLINUX_X86_64_IMAGE: musllinux_1_2 - CIBW_MUSLLINUX_I686_IMAGE: musllinux_1_2 CIBW_MUSLLINUX_AARCH64_IMAGE: musllinux_1_2 + CIBW_MUSLLINUX_PPC64LE_IMAGE: manylinux_2_28 + CIBW_MUSLLINUX_S390X_IMAGE: manylinux_2_28 with: only: ${{ matrix.only }} diff --git a/pyproject.toml b/pyproject.toml index ba413dabd..b7a831350 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,11 +4,11 @@ requires = ["Cython>=3.0.9", "setuptools", "wheel"] [tool.cibuildwheel] build-verbosity = 2 environment = {STATIC_DEPS="true", LIBXML2_VERSION = "2.12.6", LIBXSLT_VERSION = "1.1.39"} -skip = ["pp*-manylinux_aarch64", "pp*-manylinux_i686", "*-musllinux_i686"] +skip = ["pp*-manylinux_i686", "*-musllinux_i686"] # test-command = "python {package}/test.py -p -v" [tool.cibuildwheel.linux] -archs = ["x86_64", "aarch64", "i686"] +archs = ["x86_64", "aarch64", "i686", "ppc64le", "s390x"] repair-wheel-command = "auditwheel repair --strip -w {dest_dir} {wheel}" [tool.cibuildwheel.linux.environment] From 89cb64ed50b7b2d34378e32f06ac99b48a20945c Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Fri, 29 Mar 2024 11:59:33 +0100 Subject: [PATCH 008/321] Build: Fix non-x86 CFLAGS. --- pyproject.toml | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index b7a831350..08c579945 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ archs = ["x86_64", "aarch64", "i686", "ppc64le", "s390x"] repair-wheel-command = "auditwheel repair --strip -w {dest_dir} {wheel}" [tool.cibuildwheel.linux.environment] -CFLAGS = "-O3 -g1 -pipe -fPIC -flto -march=sandybridge" +CFLAGS = "-O3 -g1 -pipe -fPIC -flto -mtune=generic" AR = "gcc-ar" NM = "gcc-nm" RANLIB = "gcc-ranlib" @@ -21,9 +21,20 @@ STATIC_DEPS = "true" LIBXML2_VERSION = "2.12.6" LIBXSLT_VERSION = "1.1.39" +[[tool.cibuildwheel.overrides]] +select = "*linux_i686" +inherit.environment = "append" +environment.CFLAGS="-O3 -g1 -pipe -fPIC -flto -march=sandybridge" + +[[tool.cibuildwheel.overrides]] +select = "*linux_x86_64" +inherit.environment = "append" +environment.CFLAGS="-O3 -g1 -pipe -fPIC -flto -march=sandybridge" + [[tool.cibuildwheel.overrides]] select = "*aarch64" -environment = {CFLAGS = "-O3 -g1 -pipe -fPIC -flto -march=armv8-a -mtune=cortex-a72", AR = "gcc-ar", NM = "gcc-nm", RANLIB = "gcc-ranlib", LDFLAGS = "-flto", STATIC_DEPS = "true", LIBXML2_VERSION = "2.12.6", LIBXSLT_VERSION = "1.1.39" } +inherit.environment = "append" +environment.CFLAGS = "-O3 -g1 -pipe -fPIC -flto -march=armv8-a -mtune=cortex-a72" [tool.cibuildwheel.windows] archs = ["AMD64", "x86"] From 89f2e6a01fe752c3385077c2167166c1ffcbdfb5 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Fri, 29 Mar 2024 16:39:47 +0100 Subject: [PATCH 009/321] Build: Remove platforms that fail to build and apparently are not trivial to build. --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 08c579945..55b5555ba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ skip = ["pp*-manylinux_i686", "*-musllinux_i686"] # test-command = "python {package}/test.py -p -v" [tool.cibuildwheel.linux] -archs = ["x86_64", "aarch64", "i686", "ppc64le", "s390x"] +archs = ["x86_64", "aarch64", "i686"] # , "ppc64le", "s390x"] repair-wheel-command = "auditwheel repair --strip -w {dest_dir} {wheel}" [tool.cibuildwheel.linux.environment] From 10bf9b7a609ffa1ca916bf778774d845efb4ea1b Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Fri, 29 Mar 2024 16:42:41 +0100 Subject: [PATCH 010/321] Build(deps): Bump the github-actions group with 2 updates (GH-415) Bumps the github-actions group with 2 updates: [actions/cache](https://github.com/actions/cache) and [actions/download-artifact](https://github.com/actions/download-artifact). Updates `actions/cache` from 4.0.0 to 4.0.2 - [Release notes](https://github.com/actions/cache/releases) - [Changelog](https://github.com/actions/cache/blob/main/RELEASES.md) - [Commits](https://github.com/actions/cache/compare/13aacd865c20de90d75de3b17ebe84f7a17d57d2...0c45773b623bea8c8e75f6c82b208c3cf94ea4f9) Updates `actions/download-artifact` from 4.1.2 to 4.1.4 - [Release notes](https://github.com/actions/download-artifact/releases) - [Commits](https://github.com/actions/download-artifact/compare/eaceaf801fd36c7dee90939fad912460b18a1ffe...c850b930e6ba138125429b7e5c93fc707a7f8427) --- .github/workflows/ci.yml | 4 ++-- .github/workflows/wheels.yml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 78a9801ed..15314a4fc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -174,7 +174,7 @@ jobs: key: ${{ runner.os }}-ccache${{ matrix.extra_hash }}-${{ matrix.python-version }}-${{ hashFiles('.github/workflows/ci.yml', 'tools/ci-run.sh') }} - name: Cache [libs] - uses: actions/cache@13aacd865c20de90d75de3b17ebe84f7a17d57d2 # v4.0.0 + uses: actions/cache@0c45773b623bea8c8e75f6c82b208c3cf94ea4f9 # v4.0.2 if: matrix.env.STATIC_DEPS with: path: | @@ -221,7 +221,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Collect wheels - uses: actions/download-artifact@eaceaf801fd36c7dee90939fad912460b18a1ffe # v4.1.2 + uses: actions/download-artifact@c850b930e6ba138125429b7e5c93fc707a7f8427 # v4.1.4 with: path: ~/downloads merge-multiple: true diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index 6a052afac..a12b47f32 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -118,7 +118,7 @@ jobs: uses: actions/checkout@v4 - name: Cache [libs] - uses: actions/cache@13aacd865c20de90d75de3b17ebe84f7a17d57d2 # v4.0.0 + uses: actions/cache@0c45773b623bea8c8e75f6c82b208c3cf94ea4f9 # v4.0.2 with: path: | libs/*.xz @@ -170,7 +170,7 @@ jobs: steps: - name: Download artifacts - uses: actions/download-artifact@eaceaf801fd36c7dee90939fad912460b18a1ffe # v4.1.2 + uses: actions/download-artifact@c850b930e6ba138125429b7e5c93fc707a7f8427 # v4.1.4 with: path: ./release_upload merge-multiple: true From 06b70c3e047e405a94e82f95bc9ff8ed03d00892 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Fri, 29 Mar 2024 20:55:45 +0100 Subject: [PATCH 011/321] Set master version to 5.2.0a0. --- src/lxml/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lxml/__init__.py b/src/lxml/__init__.py index 4c91c5b72..6c08eacf0 100644 --- a/src/lxml/__init__.py +++ b/src/lxml/__init__.py @@ -1,6 +1,6 @@ # this is a package -__version__ = "5.1.1" +__version__ = "5.2.0a0" def get_include(): From 73778681f14359fe6d16644e69aaca276eba525a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Lum=C3=ADr=20=27Frenzy=27=20Balhar?= Date: Fri, 29 Mar 2024 21:01:38 +0100 Subject: [PATCH 012/321] Use html.clean from external project and provide "html_clean" extra dependency (GH-406) Following the discussion in https://bugs.launchpad.net/lxml/+bug/1958539, "lxml.html.clean" is now extracted into a separate project "lxml_html_clean": * Github: https://github.com/fedora-python/lxml_html_clean * PyPI: https://pypi.org/project/lxml-html-clean/ * Documentation: https://lxml-html-clean.readthedocs.io/en/latest/ The module is available as an "extra" setuptools dependency "lxml[html_clean]", so that: * Projects that use lxml without lxml.html.clean will continue to use it without any difference. Except they won't have potentially vulnerable code installed. * Projects that need lxml.html.clean will need to switch their requirements from lxml to lxml[html_clean]. The new package is added as a test dependency to continue to test the compatibility between the two projects. Closes https://bugs.launchpad.net/lxml/+bug/1958539 --- setup.py | 1 + setupinfo.py | 1 - src/lxml/html/clean.py | 787 +---------------------- src/lxml/html/tests/test_autolink.py | 10 - src/lxml/html/tests/test_autolink.txt | 79 --- src/lxml/html/tests/test_basic.txt | 1 - src/lxml/html/tests/test_clean.py | 313 --------- src/lxml/html/tests/test_clean.txt | 221 ------- src/lxml/html/tests/test_clean_embed.txt | 39 -- tools/ci-run.sh | 2 +- tox.ini | 2 + 11 files changed, 22 insertions(+), 1434 deletions(-) delete mode 100644 src/lxml/html/tests/test_autolink.py delete mode 100644 src/lxml/html/tests/test_autolink.txt delete mode 100644 src/lxml/html/tests/test_clean.py delete mode 100644 src/lxml/html/tests/test_clean.txt delete mode 100644 src/lxml/html/tests/test_clean_embed.txt diff --git a/setup.py b/setup.py index 6d7bd18df..c440c10a3 100644 --- a/setup.py +++ b/setup.py @@ -78,6 +78,7 @@ def static_env_list(name, separator=None): 'cssselect': 'cssselect>=0.7', 'html5': 'html5lib', 'htmlsoup': 'BeautifulSoup4', + 'html_clean': 'lxml_html_clean', } extra_options.update(setupinfo.extra_setup_args()) diff --git a/setupinfo.py b/setupinfo.py index 43e283fcc..97e339909 100644 --- a/setupinfo.py +++ b/setupinfo.py @@ -20,7 +20,6 @@ "lxml.builder", "lxml._elementpath", "lxml.html.diff", - "lxml.html.clean", "lxml.sax", ] HEADER_FILES = ['etree.h', 'etree_api.h'] diff --git a/src/lxml/html/clean.py b/src/lxml/html/clean.py index fdc96ab4c..d4b9e96d8 100644 --- a/src/lxml/html/clean.py +++ b/src/lxml/html/clean.py @@ -1,772 +1,21 @@ # cython: language_level=3str -"""A cleanup tool for HTML. - -Removes unwanted tags and content. See the `Cleaner` class for -details. -""" - -import copy -import re -from urllib.parse import urlsplit, unquote_plus - -from lxml import etree -from lxml.html import defs -from lxml.html import fromstring, XHTML_NAMESPACE -from lxml.html import xhtml_to_html, _transform_result - - -__all__ = ['clean_html', 'clean', 'Cleaner', 'autolink', 'autolink_html', - 'word_break', 'word_break_html'] - -# Look at http://code.sixapart.com/trac/livejournal/browser/trunk/cgi-bin/cleanhtml.pl -# Particularly the CSS cleaning; most of the tag cleaning is integrated now -# I have multiple kinds of schemes searched; but should schemes be -# whitelisted instead? -# max height? -# remove images? Also in CSS? background attribute? -# Some way to whitelist object, iframe, etc (e.g., if you want to -# allow *just* embedded YouTube movies) -# Log what was deleted and why? -# style="behavior: ..." might be bad in IE? -# Should we have something for just ? That's the worst of the -# metas. -# UTF-7 detections? Example: -# +ADw-SCRIPT+AD4-alert('XSS');+ADw-/SCRIPT+AD4- -# you don't always have to have the charset set, if the page has no charset -# and there's UTF7-like code in it. -# Look at these tests: http://htmlpurifier.org/live/smoketests/xssAttacks.php - - -# This is an IE-specific construct you can have in a stylesheet to -# run some Javascript: -_replace_css_javascript = re.compile( - r'expression\s*\(.*?\)', re.S|re.I).sub - -# Do I have to worry about @\nimport? -_replace_css_import = re.compile( - r'@\s*import', re.I).sub - -_looks_like_tag_content = re.compile( - r' safe_image_urls - -_substitute_whitespace = re.compile(r'[\s\x00-\x08\x0B\x0C\x0E-\x19]+').sub - -# FIXME: check against: http://msdn2.microsoft.com/en-us/library/ms537512.aspx -_conditional_comment_re = re.compile( - r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I|re.S) - -_find_styled_elements = etree.XPath( - "descendant-or-self::*[@style]") - -_find_external_links = etree.XPath( - ("descendant-or-self::a [normalize-space(@href) and substring(normalize-space(@href),1,1) != '#'] |" - "descendant-or-self::x:a[normalize-space(@href) and substring(normalize-space(@href),1,1) != '#']"), - namespaces={'x':XHTML_NAMESPACE}) - - -class Cleaner: - """ - Instances cleans the document of each of the possible offending - elements. The cleaning is controlled by attributes; you can - override attributes in a subclass, or set them in the constructor. - - ``scripts``: - Removes any `` -... -... -... -... -... -... -... -... a link -... a control char link -... data -... another link -...

a paragraph

-...
secret EVIL!
-... of EVIL! -... -...
-... Password: -...
-... spam spam SPAM! -... -... Text -... -... -... ''' - ->>> print(re.sub('[\x00-\x07\x0E]', '', doc)) - - - - - - - - - - - a link - a control char link - data - another link -

a paragraph

-
secret EVIL!
- of EVIL! - -
- Password: -
- spam spam SPAM! - - Text - - - - ->>> print(tostring(fromstring(doc)).decode("utf-8")) - - - - - - - - - - - a link - a control char link - data - another link -

a paragraph

-
secret EVIL!
- of EVIL! - -
- Password: -
- spam spam SPAM! - - Text - - - - ->>> print(Cleaner(page_structure=False, comments=False).clean_html(doc)) - - - - - - - a link - a control char link - data - another link -

a paragraph

-
secret EVIL!
- of EVIL! - Password: - spam spam SPAM! - - Text - - - - ->>> print(Cleaner(page_structure=False, safe_attrs_only=False).clean_html(doc)) - - - - - - a link - a control char link - data - another link -

a paragraph

-
secret EVIL!
- of EVIL! - Password: - spam spam SPAM! - - Text - - - - ->>> print(Cleaner(style=True, inline_style=True, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False).clean_html(doc)) - - - - - a link - a control char link - data - another link -

a paragraph

-
secret EVIL!
- of EVIL! - Password: - spam spam SPAM! - Author - Text - - - - ->>> print(Cleaner(style=True, inline_style=False, links=True, add_nofollow=True, page_structure=False, safe_attrs_only=False).clean_html(doc)) - - - - - a link - a control char link - data - another link -

a paragraph

-
secret EVIL!
- of EVIL! - Password: - spam spam SPAM! - Author - Text - - - - ->>> print(Cleaner(links=False, page_structure=False, javascript=True, host_whitelist=['example.com'], whitelist_tags=None).clean_html(doc)) - - - - - - - - - a link - a control char link - data - another link -

a paragraph

-
secret EVIL!
- of EVIL! - Password: - spam spam SPAM! - - Text - - - diff --git a/src/lxml/html/tests/test_clean_embed.txt b/src/lxml/html/tests/test_clean_embed.txt deleted file mode 100644 index 59a40551d..000000000 --- a/src/lxml/html/tests/test_clean_embed.txt +++ /dev/null @@ -1,39 +0,0 @@ -THIS FAILS IN libxml2 2.6.29 AND 2.6.30 !! - - ->>> from lxml.html import fromstring, tostring ->>> from lxml.html.clean import clean, clean_html, Cleaner ->>> from lxml.html import usedoctest - ->>> def tostring(el): # work-around for Py3 'bytes' type -... from lxml.html import tostring -... s = tostring(el) -... if not isinstance(s, str): -... s = s.decode('UTF-8') -... return s - ->>> doc_embed = '''
-... -... -... -... -...
''' ->>> print(tostring(fromstring(doc_embed))) -
- - - - -
->>> print(Cleaner().clean_html(doc_embed)) -
-
->>> print(Cleaner(host_whitelist=['www.youtube.com']).clean_html(doc_embed)) -
- -
->>> print(Cleaner(host_whitelist=['www.youtube.com'], whitelist_tags=None).clean_html(doc_embed)) -
- - -
diff --git a/tools/ci-run.sh b/tools/ci-run.sh index 86c3530d3..e3ff44340 100644 --- a/tools/ci-run.sh +++ b/tools/ci-run.sh @@ -78,7 +78,7 @@ fi if [ -z "${PYTHON_VERSION##2*}" ] || [ -z "${PYTHON_VERSION##pypy-2*}" ]; then python -m pip install -U beautifulsoup4==4.9.3 cssselect==1.1.0 html5lib==1.1 rnc2rng==2.6.5 ${EXTRA_DEPS} || exit 1 else - python -m pip install -U beautifulsoup4 cssselect html5lib rnc2rng ${EXTRA_DEPS} || exit 1 + python -m pip install -U beautifulsoup4 cssselect html5lib lxml_html_clean rnc2rng ${EXTRA_DEPS} || exit 1 fi if [[ "$COVERAGE" == "true" ]]; then python -m pip install "coverage<5" || exit 1 diff --git a/tox.ini b/tox.ini index 9c5a3a28f..1a2d68a09 100644 --- a/tox.ini +++ b/tox.ini @@ -18,3 +18,5 @@ install_command = pip install {opts} {packages} deps = -r{toxinidir}/requirements.txt html5lib + lxml_html_clean + setuptools;python_version >= '3.12' From a970538530cb97476acadcf746357b7ff6ecc303 Mon Sep 17 00:00:00 2001 From: Stefan Behnel Date: Fri, 29 Mar 2024 21:12:59 +0100 Subject: [PATCH 013/321] Remove Cleaner related documentation after removing the code. --- doc/lxmlhtml.txt | 184 ----------------------------------------------- 1 file changed, 184 deletions(-) diff --git a/doc/lxmlhtml.txt b/doc/lxmlhtml.txt index 8f32da6c1..5800c2bba 100644 --- a/doc/lxmlhtml.txt +++ b/doc/lxmlhtml.txt @@ -482,190 +482,6 @@ Example: >>> [a.attrib['href'] for a in result.xpath("//a[@target='_blank']")] ['http://tinyurl.com/2xae8s', 'http://preview.tinyurl.com/2xae8s'] -Cleaning up HTML -================ - -The module ``lxml.html.clean`` provides a ``Cleaner`` class for cleaning up -HTML pages. It supports removing embedded or script content, special tags, -CSS style annotations and much more. - -Note: the HTML Cleaner in ``lxml.html.clean`` is **not** considered -appropriate **for security sensitive environments**. -See e.g. `bleach `_ or -`nh3 `_ for alternatives. - -Note: owing to the increased number of security vulnerabilities that have been -reported concerning the blocklist-based nature of lxml.html.clean, it has been -determined that this specific component of the project will be extracted -and transitioned into a separate project. This strategic decision is aimed -at enhancing the suitability of the lxml library for deployment -in security-sensitive environments, thereby addressing and mitigating potential -risks more effectively. - -Say, you have an overburdened web page from a hideous source which contains -lots of content that upsets browsers and tries to run unnecessary code on the -client side: - -.. sourcecode:: pycon - - >>> html = '''\ - ... - ... - ... - ... - ... - ... - ... - ... - ... a link - ... another link - ...

a paragraph

- ...
secret EVIL!
- ... of EVIL! - ... - ...
- ... Password: - ...
- ... annoying EVIL! - ... spam spam SPAM! - ... - ... - ... ''' - -To remove the all superfluous content from this unparsed document, use the -``clean_html`` function: - -.. sourcecode:: pycon - - >>> from lxml.html.clean import clean_html - >>> print clean_html(html) -
- - a link - another link -

a paragraph

-
secret EVIL!
- of EVIL! - - - Password: - annoying EVIL!spam spam SPAM! -
- -The ``Cleaner`` class supports several keyword arguments to control exactly -which content is removed: - -.. sourcecode:: pycon - - >>> from lxml.html.clean import Cleaner - - >>> cleaner = Cleaner(page_structure=False, links=False) - >>> print cleaner.clean_html(html) - - - - - - - a link - another link -

a paragraph

-
secret EVIL!
- of EVIL! - Password: - annoying EVIL! - spam spam SPAM! - - - - - >>> cleaner = Cleaner(style=True, links=True, add_nofollow=True, - ... page_structure=False, safe_attrs_only=False) - - >>> print cleaner.clean_html(html) - - - - - a link - another link -

a paragraph

-
secret EVIL!
- of EVIL! - Password: - annoying EVIL! - spam spam SPAM! - - - - -You can also whitelist some otherwise dangerous content with -``Cleaner(host_whitelist=['www.youtube.com'])``, which would allow -embedded media from YouTube, while still filtering out embedded media -from other sites. - -See the docstring of ``Cleaner`` for the details of what can be -cleaned. - - -autolink --------- - -In addition to cleaning up malicious HTML, ``lxml.html.clean`` -contains functions to do other things to your HTML. This includes -autolinking:: - - autolink(doc, ...) - - autolink_html(html, ...) - -This finds anything that looks like a link (e.g., -``http://example.com``) in the *text* of an HTML document, and -turns it into an anchor. It avoids making bad links. - -Links in the elements ``