diff --git a/Doc/library/pathlib.rst b/Doc/library/pathlib.rst index ee3330f44f47d0..67ef36890d5739 100644 --- a/Doc/library/pathlib.rst +++ b/Doc/library/pathlib.rst @@ -569,6 +569,13 @@ Pure paths provide the following methods and properties: >>> PurePath('a/b.py').match('/*.py') False + The *pattern* may be another path object; this speeds up matching the same + pattern against multiple files:: + + >>> pattern = PurePath('*.py') + >>> PurePath('a/b.py').match(pattern) + True + As with other methods, case-sensitivity follows platform defaults:: >>> PurePosixPath('b.py').match('*.PY') @@ -581,6 +588,10 @@ Pure paths provide the following methods and properties: .. versionadded:: 3.12 The *case_sensitive* argument. + .. versionchanged:: 3.13 + Support for the recursive wildcard "``**``" was added. In previous + versions, it acted like the non-recursive wildcard "``*``". + .. method:: PurePath.relative_to(other, walk_up=False) diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index 8c81ac76a56b46..44c0915492dcc0 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -90,6 +90,9 @@ Improved Modules pathlib ------- +* Add support for recursive wildcards in :meth:`pathlib.PurePath.match`. + (Contributed by Barney Gale in :gh:`73435`.) + * Add *follow_symlinks* keyword-only argument to :meth:`pathlib.Path.glob` and :meth:`~pathlib.Path.rglob`. (Contributed by Barney Gale in :gh:`77609`.) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index a57b582a211e06..62406473b66e4f 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -54,6 +54,7 @@ def _ignore_error(exception): getattr(exception, 'winerror', None) in _IGNORED_WINERRORS) +@functools.cache def _is_case_sensitive(flavour): return flavour.normcase('Aa') == 'Aa' @@ -61,6 +62,22 @@ def _is_case_sensitive(flavour): # Globbing helpers # + +# fnmatch.translate() returns a regular expression that includes a prefix and +# a suffix, which enable matching newlines and ensure the end of the string is +# matched, respectively. These features are undesirable for our implementation +# of PurePatch.match(), which represents path separators as newlines and joins +# pattern segments together. As a workaround, we define a slice object that +# can remove the prefix and suffix from any translate() result. See the +# _compile_pattern_lines() function for more details. +_FNMATCH_PREFIX, _FNMATCH_SUFFIX = fnmatch.translate('_').split('_') +_FNMATCH_SLICE = slice(len(_FNMATCH_PREFIX), -len(_FNMATCH_SUFFIX)) +_SWAP_SEP_AND_NEWLINE = { + '/': str.maketrans({'/': '\n', '\n': '/'}), + '\\': str.maketrans({'\\': '\n', '\n': '\\'}), +} + + @functools.lru_cache() def _make_selector(pattern_parts, flavour, case_sensitive): pat = pattern_parts[0] @@ -92,6 +109,51 @@ def _compile_pattern(pat, case_sensitive): return re.compile(fnmatch.translate(pat), flags).match +@functools.lru_cache() +def _compile_pattern_lines(pattern_lines, case_sensitive): + """Compile the given pattern lines to an `re.Pattern` object. + + The *pattern_lines* argument is a glob-style pattern (e.g. '**/*.py') with + its path separators and newlines swapped (e.g. '**\n*.py`). By using + newlines to separate path components, and not setting `re.DOTALL`, we + ensure that the `*` wildcard cannot match path separators. + + The returned `re.Pattern` object may have its `match()` method called to + match a complete pattern, or `search()` to match from the right. The + argument supplied to these methods must also have its path separators and + newlines swapped. + """ + + # Match the start of the path, or just after a path separator + parts = ['^'] + for part in pattern_lines.splitlines(keepends=True): + if part == '**\n': + # '**/' component: we use '[\s\S]' rather than '.' so that path + # separators (i.e. newlines) are matched. The trailing '^' ensures + # we terminate after a path separator (i.e. on a new line). + part = r'[\s\S]*^' + elif part == '**': + # '**' component. + part = r'[\s\S]*' + elif '**' in part: + raise ValueError("Invalid pattern: '**' can only be an entire path component") + else: + # Any other component: pass to fnmatch.translate(). We slice off + # the common prefix and suffix added by translate() to ensure that + # re.DOTALL is not set, and the end of the string not matched, + # respectively. With DOTALL not set, '*' wildcards will not match + # path separators, because the '.' characters in the pattern will + # not match newlines. + part = fnmatch.translate(part)[_FNMATCH_SLICE] + parts.append(part) + # Match the end of the path, always. + parts.append(r'\Z') + flags = re.MULTILINE + if not case_sensitive: + flags |= re.IGNORECASE + return re.compile(''.join(parts), flags=flags) + + class _Selector: """A selector matches a specific glob pattern part against the children of a given path.""" @@ -276,6 +338,10 @@ class PurePath: # to implement comparison methods like `__lt__()`. '_parts_normcase_cached', + # The `_lines_cached` slot stores the string path with path separators + # and newlines swapped. This is used to implement `match()`. + '_lines_cached', + # The `_hash` slot stores the hash of the case-normalized string # path. It's set when `__hash__()` is called for the first time. '_hash', @@ -441,6 +507,16 @@ def _parts_normcase(self): self._parts_normcase_cached = self._str_normcase.split(self._flavour.sep) return self._parts_normcase_cached + @property + def _lines(self): + # Path with separators and newlines swapped, for pattern matching. + try: + return self._lines_cached + except AttributeError: + trans = _SWAP_SEP_AND_NEWLINE[self._flavour.sep] + self._lines_cached = str(self).translate(trans) + return self._lines_cached + def __eq__(self, other): if not isinstance(other, PurePath): return NotImplemented @@ -697,23 +773,18 @@ def match(self, path_pattern, *, case_sensitive=None): """ Return True if this path matches the given pattern. """ + if not isinstance(path_pattern, PurePath): + path_pattern = self.with_segments(path_pattern) if case_sensitive is None: case_sensitive = _is_case_sensitive(self._flavour) - pat = self.with_segments(path_pattern) - if not pat.parts: + pattern = _compile_pattern_lines(path_pattern._lines, case_sensitive) + if path_pattern.drive or path_pattern.root: + return pattern.match(self._lines) is not None + elif path_pattern._tail: + return pattern.search(self._lines) is not None + else: raise ValueError("empty pattern") - pat_parts = pat.parts - parts = self.parts - if pat.drive or pat.root: - if len(pat_parts) != len(parts): - return False - elif len(pat_parts) > len(parts): - return False - for part, pat in zip(reversed(parts), reversed(pat_parts)): - match = _compile_pattern(pat, case_sensitive) - if not match(part): - return False - return True + # Subclassing os.PathLike makes isinstance() checks slower, # which in turn makes Path construction slower. Register instead! diff --git a/Lib/test/test_pathlib.py b/Lib/test/test_pathlib.py index 4391d685d3c126..076ace3d930857 100644 --- a/Lib/test/test_pathlib.py +++ b/Lib/test/test_pathlib.py @@ -310,8 +310,30 @@ def test_match_common(self): self.assertFalse(P('/ab.py').match('/a/*.py')) self.assertFalse(P('/a/b/c.py').match('/a/*.py')) # Multi-part glob-style pattern. - self.assertFalse(P('/a/b/c.py').match('/**/*.py')) + self.assertTrue(P('a').match('**')) + self.assertTrue(P('c.py').match('**')) + self.assertTrue(P('a/b/c.py').match('**')) + self.assertTrue(P('/a/b/c.py').match('**')) + self.assertTrue(P('/a/b/c.py').match('/**')) + self.assertTrue(P('/a/b/c.py').match('**/')) + self.assertTrue(P('/a/b/c.py').match('/a/**')) + self.assertTrue(P('/a/b/c.py').match('**/*.py')) + self.assertTrue(P('/a/b/c.py').match('/**/*.py')) self.assertTrue(P('/a/b/c.py').match('/a/**/*.py')) + self.assertTrue(P('/a/b/c.py').match('/a/b/**/*.py')) + self.assertTrue(P('/a/b/c.py').match('/**/**/**/**/*.py')) + self.assertFalse(P('c.py').match('**/a.py')) + self.assertFalse(P('c.py').match('c/**')) + self.assertFalse(P('a/b/c.py').match('**/a')) + self.assertFalse(P('a/b/c.py').match('**/a/b')) + self.assertFalse(P('a/b/c.py').match('**/a/b/c')) + self.assertFalse(P('a/b/c.py').match('**/a/b/c.')) + self.assertFalse(P('a/b/c.py').match('**/a/b/c./**')) + self.assertFalse(P('a/b/c.py').match('**/a/b/c./**')) + self.assertFalse(P('a/b/c.py').match('/a/b/c.py/**')) + self.assertFalse(P('a/b/c.py').match('/**/a/b/c.py')) + self.assertRaises(ValueError, P('a').match, '**a/b/c') + self.assertRaises(ValueError, P('a').match, 'a/b/c**') # Case-sensitive flag self.assertFalse(P('A.py').match('a.PY', case_sensitive=True)) self.assertTrue(P('A.py').match('a.PY', case_sensitive=False)) diff --git a/Misc/NEWS.d/next/Library/2023-02-17-18-56-46.gh-issue-73435.7sTJHk.rst b/Misc/NEWS.d/next/Library/2023-02-17-18-56-46.gh-issue-73435.7sTJHk.rst new file mode 100644 index 00000000000000..d5a2ae07700b34 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-02-17-18-56-46.gh-issue-73435.7sTJHk.rst @@ -0,0 +1 @@ +Add support for recursive wildcards in :meth:`pathlib.PurePath.match`. pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy