From 608e91757ce80af34ae38784fbc17ad0ad3f33e0 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sat, 28 Jan 2023 00:07:54 +0000 Subject: [PATCH 01/28] gh-73435: Implement recursive wildcards in pathlib.PurePath.match() Add a new *recursive* argument to `pathlib.PurePath.match()`, defaulting to `False`. If set to true, `match()` handles the `**` wildcard as in `Path.glob()`, i.e. it matches any number of path segments. We now compile a `re.Pattern` object for the entire pattern. This is made more difficult by `fnmatch` not treating directory separators as special when evaluating wildcards (`*`, `?`, etc), and so we arrange the path parts onto separate *lines* in a string, and ensure we don't set `re.DOTALL`. --- Doc/library/pathlib.rst | 8 +++++++- Lib/fnmatch.py | 7 ++++++- Lib/pathlib.py | 43 +++++++++++++++++++++------------------- Lib/test/test_pathlib.py | 26 ++++++++++++++++++++++++ 4 files changed, 62 insertions(+), 22 deletions(-) diff --git a/Doc/library/pathlib.rst b/Doc/library/pathlib.rst index f222745a2c56bc..0b8cb28544682c 100644 --- a/Doc/library/pathlib.rst +++ b/Doc/library/pathlib.rst @@ -544,11 +544,14 @@ Pure paths provide the following methods and properties: PureWindowsPath('c:/Program Files') -.. method:: PurePath.match(pattern) +.. method:: PurePath.match(pattern, recursive=False) Match this path against the provided glob-style pattern. Return ``True`` if matching is successful, ``False`` otherwise. + If *recursive* is true, the pattern "``**``" will match any number of file + or directory segments. + If *pattern* is relative, the path can be either relative or absolute, and matching is done from the right:: @@ -574,6 +577,9 @@ Pure paths provide the following methods and properties: >>> PureWindowsPath('b.py').match('*.PY') True + .. versionadded:: 3.12 + The *recursive* argument. + .. method:: PurePath.relative_to(other, walk_up=False) diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py index d5e296f7748c1c..88b2d973b14d0f 100644 --- a/Lib/fnmatch.py +++ b/Lib/fnmatch.py @@ -77,6 +77,11 @@ def translate(pat): There is no way to quote meta-characters. """ + res = _translate(pat) + return fr'(?s:{res})\Z' + + +def _translate(pat): STAR = object() res = [] add = res.append @@ -182,4 +187,4 @@ def translate(pat): add(f"(?>.*?{fixed})") assert i == n res = "".join(res) - return fr'(?s:{res})\Z' + return res diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 17659bcd3e2d7f..8ceb0f82aa75d6 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -64,6 +64,25 @@ def _is_wildcard_pattern(pat): # Globbing helpers # +@functools.lru_cache() +def _make_matcher(path_cls, pattern, recursive): + pattern = path_cls(pattern) + if not pattern._parts: + raise ValueError("empty pattern") + result = [r'\A' if pattern._drv or pattern._root else '^'] + for part in pattern._parts_normcase: + if recursive: + if part == '**': + result.append('(.+\n)*') + continue + elif '**' in part: + raise ValueError("Invalid pattern: '**' can only be an entire path component") + part = fnmatch._translate(part) + result.append(f'{part}\n') + result.append(r'\Z') + return re.compile(''.join(result), flags=re.MULTILINE) + + @functools.lru_cache() def _make_selector(pattern_parts, flavour): pat = pattern_parts[0] @@ -639,29 +658,13 @@ def is_reserved(self): name = self._parts[-1].partition('.')[0].partition(':')[0].rstrip(' ') return name.upper() in _WIN_RESERVED_NAMES - def match(self, path_pattern): + def match(self, path_pattern, recursive=False): """ Return True if this path matches the given pattern. """ - path_pattern = self._flavour.normcase(path_pattern) - drv, root, pat_parts = self._parse_parts((path_pattern,)) - if not pat_parts: - raise ValueError("empty pattern") - elif drv and drv != self._flavour.normcase(self._drv): - return False - elif root and root != self._root: - return False - parts = self._parts_normcase - if drv or root: - if len(pat_parts) != len(parts): - return False - pat_parts = pat_parts[1:] - elif len(pat_parts) > len(parts): - return False - for part, pat in zip(reversed(parts), reversed(pat_parts)): - if not fnmatch.fnmatchcase(part, pat): - return False - return True + matcher = _make_matcher(type(self), path_pattern, recursive) + lines = ''.join(f'{part}\n' for part in self._parts_normcase) + return matcher.search(lines) is not None # Can't subclass os.PathLike from PurePath and keep the constructor # optimizations in PurePath._parse_args(). diff --git a/Lib/test/test_pathlib.py b/Lib/test/test_pathlib.py index a596795b44f0fa..7c3e169d3ce0e5 100644 --- a/Lib/test/test_pathlib.py +++ b/Lib/test/test_pathlib.py @@ -319,6 +319,32 @@ def test_match_common(self): # Multi-part glob-style pattern. self.assertFalse(P('/a/b/c.py').match('/**/*.py')) self.assertTrue(P('/a/b/c.py').match('/a/**/*.py')) + # Recursive patterns. + self.assertTrue(P('a').match('**', recursive=True)) + self.assertTrue(P('c.py').match('**', recursive=True)) + self.assertTrue(P('a/b/c.py').match('**', recursive=True)) + self.assertTrue(P('/a/b/c.py').match('**', recursive=True)) + self.assertTrue(P('/a/b/c.py').match('/**', recursive=True)) + self.assertTrue(P('/a/b/c.py').match('**/', recursive=True)) + self.assertTrue(P('/a/b/c.py').match('/a/**', recursive=True)) + self.assertTrue(P('/a/b/c.py').match('**/*.py', recursive=True)) + self.assertTrue(P('/a/b/c.py').match('/**/*.py', recursive=True)) + self.assertTrue(P('/a/b/c.py').match('/a/**/*.py', recursive=True)) + self.assertTrue(P('/a/b/c.py').match('/a/b/**/*.py', recursive=True)) + self.assertTrue(P('/a/b/c.py').match('**/a/b/c.py/**', recursive=True)) + self.assertTrue(P('/a/b/c.py').match('/**/**/**/**/*.py', recursive=True)) + self.assertFalse(P('c.py').match('**/a.py', recursive=True)) + self.assertFalse(P('c.py').match('c/**', recursive=True)) + self.assertFalse(P('a/b/c.py').match('**/a', recursive=True)) + self.assertFalse(P('a/b/c.py').match('**/a/b', recursive=True)) + self.assertFalse(P('a/b/c.py').match('**/a/b/c', recursive=True)) + self.assertFalse(P('a/b/c.py').match('**/a/b/c.', recursive=True)) + self.assertFalse(P('a/b/c.py').match('**/a/b/c./**', recursive=True)) + self.assertFalse(P('a/b/c.py').match('**/a/b/c./**', recursive=True)) + self.assertFalse(P('a/b/c.py').match('/a/b/c.py/**', recursive=True)) + self.assertFalse(P('a/b/c.py').match('/**/a/b/c.py', recursive=True)) + self.assertRaises(ValueError, P('a').match, '**a/b/c', recursive=True) + self.assertRaises(ValueError, P('a').match, 'a/b/c**', recursive=True) def test_ordering_common(self): # Ordering is tuple-alike. From 9a43c7ff656fdec6e0050471297c2bc034a18abe Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 29 Jan 2023 00:15:55 +0000 Subject: [PATCH 02/28] Simplify code slightly --- Lib/pathlib.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 8ceb0f82aa75d6..7b4a9805f2e8c3 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -70,15 +70,14 @@ def _make_matcher(path_cls, pattern, recursive): if not pattern._parts: raise ValueError("empty pattern") result = [r'\A' if pattern._drv or pattern._root else '^'] - for part in pattern._parts_normcase: + for line in pattern._lines_normcase: if recursive: - if part == '**': - result.append('(.+\n)*') + if line == '**\n': + result.append('(.*\n)*') continue - elif '**' in part: + elif '**' in line: raise ValueError("Invalid pattern: '**' can only be an entire path component") - part = fnmatch._translate(part) - result.append(f'{part}\n') + result.append(fnmatch._translate(line)) result.append(r'\Z') return re.compile(''.join(result), flags=re.MULTILINE) @@ -658,13 +657,16 @@ def is_reserved(self): name = self._parts[-1].partition('.')[0].partition(':')[0].rstrip(' ') return name.upper() in _WIN_RESERVED_NAMES + @property + def _lines_normcase(self): + return [f'{part}\n' for part in self._parts_normcase] + def match(self, path_pattern, recursive=False): """ Return True if this path matches the given pattern. """ matcher = _make_matcher(type(self), path_pattern, recursive) - lines = ''.join(f'{part}\n' for part in self._parts_normcase) - return matcher.search(lines) is not None + return matcher.search(''.join(self._lines_normcase)) is not None # Can't subclass os.PathLike from PurePath and keep the constructor # optimizations in PurePath._parse_args(). From a846279d4274449ee15a084b7db84984694526fe Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 15 Feb 2023 17:47:34 +0000 Subject: [PATCH 03/28] Fix support for newlines --- Lib/pathlib.py | 17 +++++++++++++---- Lib/test/test_pathlib.py | 1 - 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 7b4a9805f2e8c3..6fbf293d275393 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -64,16 +64,23 @@ def _is_wildcard_pattern(pat): # Globbing helpers # + +_SWAP_SLASH_AND_NEWLINE = str.maketrans({'/': '\n', '\n': '/'}) + + @functools.lru_cache() def _make_matcher(path_cls, pattern, recursive): pattern = path_cls(pattern) if not pattern._parts: raise ValueError("empty pattern") result = [r'\A' if pattern._drv or pattern._root else '^'] - for line in pattern._lines_normcase: + for line in pattern._lines_normcase.splitlines(keepends=True): if recursive: if line == '**\n': - result.append('(.*\n)*') + result.append(r'[\S\s]*^') + continue + elif line == '**': + result.append(r'[\S\s]*') continue elif '**' in line: raise ValueError("Invalid pattern: '**' can only be an entire path component") @@ -659,14 +666,16 @@ def is_reserved(self): @property def _lines_normcase(self): - return [f'{part}\n' for part in self._parts_normcase] + path = self._flavour.normcase(self.as_posix()) + return path.translate(_SWAP_SLASH_AND_NEWLINE) def match(self, path_pattern, recursive=False): """ Return True if this path matches the given pattern. """ matcher = _make_matcher(type(self), path_pattern, recursive) - return matcher.search(''.join(self._lines_normcase)) is not None + return matcher.search(self._lines_normcase) is not None + # Can't subclass os.PathLike from PurePath and keep the constructor # optimizations in PurePath._parse_args(). diff --git a/Lib/test/test_pathlib.py b/Lib/test/test_pathlib.py index 7c3e169d3ce0e5..fdacdd8f4afc92 100644 --- a/Lib/test/test_pathlib.py +++ b/Lib/test/test_pathlib.py @@ -331,7 +331,6 @@ def test_match_common(self): self.assertTrue(P('/a/b/c.py').match('/**/*.py', recursive=True)) self.assertTrue(P('/a/b/c.py').match('/a/**/*.py', recursive=True)) self.assertTrue(P('/a/b/c.py').match('/a/b/**/*.py', recursive=True)) - self.assertTrue(P('/a/b/c.py').match('**/a/b/c.py/**', recursive=True)) self.assertTrue(P('/a/b/c.py').match('/**/**/**/**/*.py', recursive=True)) self.assertFalse(P('c.py').match('**/a.py', recursive=True)) self.assertFalse(P('c.py').match('c/**', recursive=True)) From bbd8cd603c71f87c948e11a1528a29536ae21827 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 15 Feb 2023 18:37:18 +0000 Subject: [PATCH 04/28] Cache translation of individual components --- Lib/pathlib.py | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 6fbf293d275393..755036bb97c48b 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -68,6 +68,18 @@ def _is_wildcard_pattern(pat): _SWAP_SLASH_AND_NEWLINE = str.maketrans({'/': '\n', '\n': '/'}) +@functools.lru_cache() +def _translate(pattern, recursive): + if recursive: + if pattern == '**\n': + return r'[\S\s]*^' + elif pattern == '**': + return r'[\S\s]*' + elif '**' in pattern: + raise ValueError("Invalid pattern: '**' can only be an entire path component") + return fnmatch._translate(pattern) + + @functools.lru_cache() def _make_matcher(path_cls, pattern, recursive): pattern = path_cls(pattern) @@ -75,16 +87,7 @@ def _make_matcher(path_cls, pattern, recursive): raise ValueError("empty pattern") result = [r'\A' if pattern._drv or pattern._root else '^'] for line in pattern._lines_normcase.splitlines(keepends=True): - if recursive: - if line == '**\n': - result.append(r'[\S\s]*^') - continue - elif line == '**': - result.append(r'[\S\s]*') - continue - elif '**' in line: - raise ValueError("Invalid pattern: '**' can only be an entire path component") - result.append(fnmatch._translate(line)) + result.append(_translate(line, recursive)) result.append(r'\Z') return re.compile(''.join(result), flags=re.MULTILINE) From b5c002e36d7de58bb2a991b7fcfc4a716c4d8154 Mon Sep 17 00:00:00 2001 From: barneygale Date: Wed, 15 Feb 2023 20:12:47 +0000 Subject: [PATCH 05/28] Drop 'recursive' argument, make this the only behaviour. --- Doc/library/pathlib.rst | 10 ++++---- Lib/pathlib.py | 26 ++++++++++----------- Lib/test/test_pathlib.py | 49 +++++++++++++++++++--------------------- 3 files changed, 40 insertions(+), 45 deletions(-) diff --git a/Doc/library/pathlib.rst b/Doc/library/pathlib.rst index 0b8cb28544682c..00788ef327753b 100644 --- a/Doc/library/pathlib.rst +++ b/Doc/library/pathlib.rst @@ -544,14 +544,11 @@ Pure paths provide the following methods and properties: PureWindowsPath('c:/Program Files') -.. method:: PurePath.match(pattern, recursive=False) +.. method:: PurePath.match(pattern) Match this path against the provided glob-style pattern. Return ``True`` if matching is successful, ``False`` otherwise. - If *recursive* is true, the pattern "``**``" will match any number of file - or directory segments. - If *pattern* is relative, the path can be either relative or absolute, and matching is done from the right:: @@ -577,8 +574,9 @@ Pure paths provide the following methods and properties: >>> PureWindowsPath('b.py').match('*.PY') True - .. versionadded:: 3.12 - The *recursive* argument. + .. versionchanged:: 3.12 + Support for the recursive wildcard "``**``" was added. In previous + versions, it acted like the non-recursive wildcard "``*``". .. method:: PurePath.relative_to(other, walk_up=False) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 755036bb97c48b..484a5d874ba138 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -69,25 +69,25 @@ def _is_wildcard_pattern(pat): @functools.lru_cache() -def _translate(pattern, recursive): - if recursive: - if pattern == '**\n': - return r'[\S\s]*^' - elif pattern == '**': - return r'[\S\s]*' - elif '**' in pattern: - raise ValueError("Invalid pattern: '**' can only be an entire path component") - return fnmatch._translate(pattern) +def _translate(pattern): + if pattern == '**\n': + return r'[\S\s]*^' + elif pattern == '**': + return r'[\S\s]*' + elif '**' in pattern: + raise ValueError("Invalid pattern: '**' can only be an entire path component") + else: + return fnmatch._translate(pattern) @functools.lru_cache() -def _make_matcher(path_cls, pattern, recursive): +def _make_matcher(path_cls, pattern): pattern = path_cls(pattern) if not pattern._parts: raise ValueError("empty pattern") result = [r'\A' if pattern._drv or pattern._root else '^'] for line in pattern._lines_normcase.splitlines(keepends=True): - result.append(_translate(line, recursive)) + result.append(_translate(line)) result.append(r'\Z') return re.compile(''.join(result), flags=re.MULTILINE) @@ -672,11 +672,11 @@ def _lines_normcase(self): path = self._flavour.normcase(self.as_posix()) return path.translate(_SWAP_SLASH_AND_NEWLINE) - def match(self, path_pattern, recursive=False): + def match(self, path_pattern): """ Return True if this path matches the given pattern. """ - matcher = _make_matcher(type(self), path_pattern, recursive) + matcher = _make_matcher(type(self), path_pattern) return matcher.search(self._lines_normcase) is not None diff --git a/Lib/test/test_pathlib.py b/Lib/test/test_pathlib.py index fdacdd8f4afc92..1c486be55e5cfc 100644 --- a/Lib/test/test_pathlib.py +++ b/Lib/test/test_pathlib.py @@ -317,33 +317,30 @@ def test_match_common(self): self.assertFalse(P('/ab.py').match('/a/*.py')) self.assertFalse(P('/a/b/c.py').match('/a/*.py')) # Multi-part glob-style pattern. - self.assertFalse(P('/a/b/c.py').match('/**/*.py')) + self.assertTrue(P('a').match('**')) + self.assertTrue(P('c.py').match('**')) + self.assertTrue(P('a/b/c.py').match('**')) + self.assertTrue(P('/a/b/c.py').match('**')) + self.assertTrue(P('/a/b/c.py').match('/**')) + self.assertTrue(P('/a/b/c.py').match('**/')) + self.assertTrue(P('/a/b/c.py').match('/a/**')) + self.assertTrue(P('/a/b/c.py').match('**/*.py')) + self.assertTrue(P('/a/b/c.py').match('/**/*.py')) self.assertTrue(P('/a/b/c.py').match('/a/**/*.py')) - # Recursive patterns. - self.assertTrue(P('a').match('**', recursive=True)) - self.assertTrue(P('c.py').match('**', recursive=True)) - self.assertTrue(P('a/b/c.py').match('**', recursive=True)) - self.assertTrue(P('/a/b/c.py').match('**', recursive=True)) - self.assertTrue(P('/a/b/c.py').match('/**', recursive=True)) - self.assertTrue(P('/a/b/c.py').match('**/', recursive=True)) - self.assertTrue(P('/a/b/c.py').match('/a/**', recursive=True)) - self.assertTrue(P('/a/b/c.py').match('**/*.py', recursive=True)) - self.assertTrue(P('/a/b/c.py').match('/**/*.py', recursive=True)) - self.assertTrue(P('/a/b/c.py').match('/a/**/*.py', recursive=True)) - self.assertTrue(P('/a/b/c.py').match('/a/b/**/*.py', recursive=True)) - self.assertTrue(P('/a/b/c.py').match('/**/**/**/**/*.py', recursive=True)) - self.assertFalse(P('c.py').match('**/a.py', recursive=True)) - self.assertFalse(P('c.py').match('c/**', recursive=True)) - self.assertFalse(P('a/b/c.py').match('**/a', recursive=True)) - self.assertFalse(P('a/b/c.py').match('**/a/b', recursive=True)) - self.assertFalse(P('a/b/c.py').match('**/a/b/c', recursive=True)) - self.assertFalse(P('a/b/c.py').match('**/a/b/c.', recursive=True)) - self.assertFalse(P('a/b/c.py').match('**/a/b/c./**', recursive=True)) - self.assertFalse(P('a/b/c.py').match('**/a/b/c./**', recursive=True)) - self.assertFalse(P('a/b/c.py').match('/a/b/c.py/**', recursive=True)) - self.assertFalse(P('a/b/c.py').match('/**/a/b/c.py', recursive=True)) - self.assertRaises(ValueError, P('a').match, '**a/b/c', recursive=True) - self.assertRaises(ValueError, P('a').match, 'a/b/c**', recursive=True) + self.assertTrue(P('/a/b/c.py').match('/a/b/**/*.py')) + self.assertTrue(P('/a/b/c.py').match('/**/**/**/**/*.py')) + self.assertFalse(P('c.py').match('**/a.py')) + self.assertFalse(P('c.py').match('c/**')) + self.assertFalse(P('a/b/c.py').match('**/a')) + self.assertFalse(P('a/b/c.py').match('**/a/b')) + self.assertFalse(P('a/b/c.py').match('**/a/b/c')) + self.assertFalse(P('a/b/c.py').match('**/a/b/c.')) + self.assertFalse(P('a/b/c.py').match('**/a/b/c./**')) + self.assertFalse(P('a/b/c.py').match('**/a/b/c./**')) + self.assertFalse(P('a/b/c.py').match('/a/b/c.py/**')) + self.assertFalse(P('a/b/c.py').match('/**/a/b/c.py')) + self.assertRaises(ValueError, P('a').match, '**a/b/c') + self.assertRaises(ValueError, P('a').match, 'a/b/c**') def test_ordering_common(self): # Ordering is tuple-alike. From 0afcd54884c315587469c49752ce858036a15b04 Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 16 Feb 2023 21:05:32 +0000 Subject: [PATCH 06/28] Undo modifications to fnmatch.py --- Lib/fnmatch.py | 7 +------ Lib/pathlib.py | 4 +++- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/Lib/fnmatch.py b/Lib/fnmatch.py index 88b2d973b14d0f..d5e296f7748c1c 100644 --- a/Lib/fnmatch.py +++ b/Lib/fnmatch.py @@ -77,11 +77,6 @@ def translate(pat): There is no way to quote meta-characters. """ - res = _translate(pat) - return fr'(?s:{res})\Z' - - -def _translate(pat): STAR = object() res = [] add = res.append @@ -187,4 +182,4 @@ def _translate(pat): add(f"(?>.*?{fixed})") assert i == n res = "".join(res) - return res + return fr'(?s:{res})\Z' diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 484a5d874ba138..a298a73a9f467f 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -66,6 +66,8 @@ def _is_wildcard_pattern(pat): _SWAP_SLASH_AND_NEWLINE = str.maketrans({'/': '\n', '\n': '/'}) +_FNMATCH_PADDING = fnmatch.translate('_').split('_') +_FNMATCH_SLICE = slice(len(_FNMATCH_PADDING[0]), -len(_FNMATCH_PADDING[1])) @functools.lru_cache() @@ -77,7 +79,7 @@ def _translate(pattern): elif '**' in pattern: raise ValueError("Invalid pattern: '**' can only be an entire path component") else: - return fnmatch._translate(pattern) + return fnmatch.translate(pattern)[_FNMATCH_SLICE] @functools.lru_cache() From 7b6f850c99964c12682b275cf65f5b52d7fcfb89 Mon Sep 17 00:00:00 2001 From: barneygale Date: Fri, 17 Feb 2023 16:25:08 +0000 Subject: [PATCH 07/28] Fix Windows support --- Lib/pathlib.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 511e0f0577af0c..f937614611c07c 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -65,7 +65,6 @@ def _is_wildcard_pattern(pat): # -_SWAP_SLASH_AND_NEWLINE = str.maketrans({'/': '\n', '\n': '/'}) _FNMATCH_PADDING = fnmatch.translate('_').split('_') _FNMATCH_SLICE = slice(len(_FNMATCH_PADDING[0]), -len(_FNMATCH_PADDING[1])) @@ -82,6 +81,11 @@ def _translate(pattern): return fnmatch.translate(pattern)[_FNMATCH_SLICE] +@functools.lru_cache() +def _make_matcher_trans(flavour): + return str.maketrans({flavour.sep: '\n', '\n': flavour.sep}) + + @functools.lru_cache() def _make_matcher(path_cls, pattern): pattern = path_cls(pattern) @@ -671,8 +675,8 @@ def is_reserved(self): @property def _lines_normcase(self): - path = self._flavour.normcase(self.as_posix()) - return path.translate(_SWAP_SLASH_AND_NEWLINE) + trans = _make_matcher_trans(self._flavour) + return self._flavour.normcase(str(self)).translate(trans) def match(self, path_pattern): """ From 037488ac370971ce9c8dbfb966a10d4eb1b91656 Mon Sep 17 00:00:00 2001 From: barneygale Date: Fri, 17 Feb 2023 18:38:10 +0000 Subject: [PATCH 08/28] Tidy up code. --- Lib/pathlib.py | 63 ++++++++++++++++++++++++++------------------------ 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index f937614611c07c..ecb39d9a40d0a5 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -65,25 +65,12 @@ def _is_wildcard_pattern(pat): # -_FNMATCH_PADDING = fnmatch.translate('_').split('_') -_FNMATCH_SLICE = slice(len(_FNMATCH_PADDING[0]), -len(_FNMATCH_PADDING[1])) - - -@functools.lru_cache() -def _translate(pattern): - if pattern == '**\n': - return r'[\S\s]*^' - elif pattern == '**': - return r'[\S\s]*' - elif '**' in pattern: - raise ValueError("Invalid pattern: '**' can only be an entire path component") - else: - return fnmatch.translate(pattern)[_FNMATCH_SLICE] - - -@functools.lru_cache() -def _make_matcher_trans(flavour): - return str.maketrans({flavour.sep: '\n', '\n': flavour.sep}) +_FNMATCH_PREFIX, _FNMATCH_SUFFIX = fnmatch.translate('_').split('_') +_FNMATCH_SLICE = slice(len(_FNMATCH_PREFIX), -len(_FNMATCH_SUFFIX)) +_SWAP_SEP_AND_NEWLINE = { + '/': str.maketrans({'/': '\n', '\n': '/'}), + '\\': str.maketrans({'\\': '\n', '\n': '\\'}), +} @functools.lru_cache() @@ -91,11 +78,19 @@ def _make_matcher(path_cls, pattern): pattern = path_cls(pattern) if not pattern._parts: raise ValueError("empty pattern") - result = [r'\A' if pattern._drv or pattern._root else '^'] - for line in pattern._lines_normcase.splitlines(keepends=True): - result.append(_translate(line)) - result.append(r'\Z') - return re.compile(''.join(result), flags=re.MULTILINE) + parts = [r'\A' if pattern._drv or pattern._root else '^'] + for part in pattern._lines_normcase.splitlines(keepends=True): + if part == '**\n': + part = r'[\s\S]*^' + elif part == '**': + part = r'[\s\S]*' + elif '**' in part: + raise ValueError("Invalid pattern: '**' can only be an entire path component") + else: + part = fnmatch.translate(part)[_FNMATCH_SLICE] + parts.append(part) + parts.append(r'\Z') + return re.compile(''.join(parts), flags=re.MULTILINE) @functools.lru_cache() @@ -286,7 +281,8 @@ class PurePath(object): """ __slots__ = ( '_drv', '_root', '_parts', - '_str', '_hash', '_parts_tuple', '_parts_normcase_cached', + '_str', '_hash', '_parts_tuple', + '_parts_normcase_cached', '_lines_normcase_cached', ) _flavour = os.path @@ -415,6 +411,18 @@ def as_uri(self): path = str(self) return prefix + urlquote_from_bytes(os.fsencode(path)) + @property + def _lines_normcase(self): + # Case-normalized path with separators and newlines swapped, for + # pattern matching. + try: + return self._lines_normcase_cached + except AttributeError: + path = self._flavour.normcase(str(self)) + trans = _SWAP_SEP_AND_NEWLINE[self._flavour.sep] + self._lines_normcase_cached = path.translate(trans) + return self._lines_normcase_cached + @property def _parts_normcase(self): # Cached parts with normalized case, for hashing and comparison. @@ -673,11 +681,6 @@ def is_reserved(self): name = self._parts[-1].partition('.')[0].partition(':')[0].rstrip(' ') return name.upper() in _WIN_RESERVED_NAMES - @property - def _lines_normcase(self): - trans = _make_matcher_trans(self._flavour) - return self._flavour.normcase(str(self)).translate(trans) - def match(self, path_pattern): """ Return True if this path matches the given pattern. From 07419501d2dda92ef23899dd542d5c4ec6ad03e3 Mon Sep 17 00:00:00 2001 From: barneygale Date: Fri, 17 Feb 2023 18:56:57 +0000 Subject: [PATCH 09/28] Add news blurb. --- .../next/Library/2023-02-17-18-56-46.gh-issue-73435.7sTJHk.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 Misc/NEWS.d/next/Library/2023-02-17-18-56-46.gh-issue-73435.7sTJHk.rst diff --git a/Misc/NEWS.d/next/Library/2023-02-17-18-56-46.gh-issue-73435.7sTJHk.rst b/Misc/NEWS.d/next/Library/2023-02-17-18-56-46.gh-issue-73435.7sTJHk.rst new file mode 100644 index 00000000000000..d5a2ae07700b34 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2023-02-17-18-56-46.gh-issue-73435.7sTJHk.rst @@ -0,0 +1 @@ +Add support for recursive wildcards in :meth:`pathlib.PurePath.match`. From 314679ff7a1a2f2f189c1c3c8675f612104b5fd7 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 9 Apr 2023 20:12:59 +0100 Subject: [PATCH 10/28] Simplify patch; prepare for use in `glob()` --- Lib/pathlib.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index bd6a5869e11e36..847e0031cdd312 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -73,12 +73,9 @@ def _is_wildcard_pattern(pat): @functools.lru_cache() -def _make_matcher(path_cls, pattern): - pattern = path_cls(pattern) - if not pattern.parts: - raise ValueError("empty pattern") - parts = [r'\A' if pattern.drive or pattern.root else '^'] - for part in pattern._lines_normcase.splitlines(keepends=True): +def _make_matcher(lines): + parts = ['^'] + for part in lines.splitlines(keepends=True): if part == '**\n': part = r'[\s\S]*^' elif part == '**': @@ -717,8 +714,15 @@ def match(self, path_pattern): """ Return True if this path matches the given pattern. """ - matcher = _make_matcher(type(self), path_pattern) - return matcher.search(self._lines_normcase) is not None + pat = type(self)(path_pattern) + if not pat.parts: + raise ValueError("empty pattern") + matcher = _make_matcher(pat._lines_normcase) + if pat.drive or pat.root: + match = matcher.match(self._lines_normcase) + else: + match = matcher.search(self._lines_normcase) + return match is not None # Can't subclass os.PathLike from PurePath and keep the constructor From 90eebcc4ea83a7570e49dc9ad38606a1c57ec3a7 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 9 Apr 2023 20:38:30 +0100 Subject: [PATCH 11/28] Make better use of path object caching. --- Lib/pathlib.py | 72 ++++++++++++++++++++++++++------------------------ 1 file changed, 37 insertions(+), 35 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 847e0031cdd312..f8dd6b39f4df1f 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -72,23 +72,6 @@ def _is_wildcard_pattern(pat): } -@functools.lru_cache() -def _make_matcher(lines): - parts = ['^'] - for part in lines.splitlines(keepends=True): - if part == '**\n': - part = r'[\s\S]*^' - elif part == '**': - part = r'[\s\S]*' - elif '**' in part: - raise ValueError("Invalid pattern: '**' can only be an entire path component") - else: - part = fnmatch.translate(part)[_FNMATCH_SLICE] - parts.append(part) - parts.append(r'\Z') - return re.compile(''.join(parts), flags=re.MULTILINE) - - @functools.lru_cache() def _make_selector(pattern_parts, flavour): pat = pattern_parts[0] @@ -298,17 +281,19 @@ class PurePath(object): # `__hash__()`, and `_parts_normcase` '_str_normcase_cached', - # The `_lines_normcase_cached` slot stores the string path with - # normalized case, and with path separators and newlines swapped. This - # is used to implement `match()`. - '_lines_normcase_cached', - # The `_parts_normcase_cached` slot stores the case-normalized # string path after splitting on path separators. It's set when the # `_parts_normcase` property is accessed for the first time. It's used # to implement comparison methods like `__lt__()`. '_parts_normcase_cached', + # The `_lines_normcase_cached` and `_matcher_cached` slots store the + # string path with path separators and newlines swapped, and an + # `re.Pattern` object derived thereof. These are used to implement + # `match()`. + '_lines_normcase_cached', + '_matcher_cached', + # The `_hash` slot stores the hash of the case-normalized string # path. It's set when `__hash__()` is called for the first time. '_hash', @@ -439,6 +424,15 @@ def _str_normcase(self): self._str_normcase_cached = self._flavour.normcase(str(self)) return self._str_normcase_cached + @property + def _parts_normcase(self): + # Cached parts with normalized case, for comparisons. + try: + return self._parts_normcase_cached + except AttributeError: + self._parts_normcase_cached = self._str_normcase.split(self._flavour.sep) + return self._parts_normcase_cached + @property def _lines_normcase(self): # Case-normalized path with separators and newlines swapped, for @@ -451,13 +445,26 @@ def _lines_normcase(self): return self._lines_normcase_cached @property - def _parts_normcase(self): - # Cached parts with normalized case, for comparisons. + def _matcher(self): try: - return self._parts_normcase_cached + return self._matcher_cached except AttributeError: - self._parts_normcase_cached = self._str_normcase.split(self._flavour.sep) - return self._parts_normcase_cached + if not self.parts: + raise ValueError("empty pattern") + parts = [r'\A' if self.drive or self.root else '^'] + for part in self._lines_normcase.splitlines(keepends=True): + if part == '**\n': + part = r'[\s\S]*^' + elif part == '**': + part = r'[\s\S]*' + elif '**' in part: + raise ValueError("Invalid pattern: '**' can only be an entire path component") + else: + part = fnmatch.translate(part)[_FNMATCH_SLICE] + parts.append(part) + parts.append(r'\Z') + self._matcher_cached = re.compile(''.join(parts), flags=re.MULTILINE) + return self._matcher_cached def __eq__(self, other): if not isinstance(other, PurePath): @@ -714,14 +721,9 @@ def match(self, path_pattern): """ Return True if this path matches the given pattern. """ - pat = type(self)(path_pattern) - if not pat.parts: - raise ValueError("empty pattern") - matcher = _make_matcher(pat._lines_normcase) - if pat.drive or pat.root: - match = matcher.match(self._lines_normcase) - else: - match = matcher.search(self._lines_normcase) + if not isinstance(path_pattern, type(self)): + path_pattern = type(self)(path_pattern) + match = path_pattern._matcher.search(self._lines_normcase) return match is not None From 4b5fffdf961e7d187759ed67a68e97891a9b1d19 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 9 Apr 2023 21:09:43 +0100 Subject: [PATCH 12/28] Add performance tip to docs --- Doc/library/pathlib.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/Doc/library/pathlib.rst b/Doc/library/pathlib.rst index 701470b5ebab2b..47057d99272942 100644 --- a/Doc/library/pathlib.rst +++ b/Doc/library/pathlib.rst @@ -568,6 +568,13 @@ Pure paths provide the following methods and properties: >>> PurePath('a/b.py').match('/*.py') False + The *pattern* may be another path object; this speeds up matching the same + pattern against multiple files:: + + >>> pattern = PurePath('*.py') + >>> PurePath('a/b.py').match(pattern) + True + As with other methods, case-sensitivity follows platform defaults:: >>> PurePosixPath('b.py').match('*.PY') From 5e8bc280ebbc52921ab7f246f502ccd62fb2fc26 Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 20 Apr 2023 19:51:13 +0100 Subject: [PATCH 13/28] Skip re-initialisation of PurePath patterns. --- Lib/pathlib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index f8dd6b39f4df1f..e8300d22de6683 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -721,7 +721,7 @@ def match(self, path_pattern): """ Return True if this path matches the given pattern. """ - if not isinstance(path_pattern, type(self)): + if not isinstance(path_pattern, PurePath) or self._flavour is not path_pattern._flavour: path_pattern = type(self)(path_pattern) match = path_pattern._matcher.search(self._lines_normcase) return match is not None From 722a1ab0d9d52ee888da64a15cb65e94f7d34f06 Mon Sep 17 00:00:00 2001 From: barneygale Date: Tue, 2 May 2023 23:14:32 +0100 Subject: [PATCH 14/28] Use `re.IGNORECASE` rather than `os.path.normcase()` --- Lib/pathlib.py | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index c6dda1dc3989d2..38374479ae352d 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -290,11 +290,11 @@ class PurePath(object): # to implement comparison methods like `__lt__()`. '_parts_normcase_cached', - # The `_lines_normcase_cached` and `_matcher_cached` slots store the + # The `_lines_cached` and `_matcher_cached` slots store the # string path with path separators and newlines swapped, and an # `re.Pattern` object derived thereof. These are used to implement # `match()`. - '_lines_normcase_cached', + '_lines_cached', '_matcher_cached', # The `_hash` slot stores the hash of the case-normalized string @@ -451,15 +451,14 @@ def _parts_normcase(self): return self._parts_normcase_cached @property - def _lines_normcase(self): - # Case-normalized path with separators and newlines swapped, for - # pattern matching. + def _lines(self): + # Path with separators and newlines swapped, for pattern matching. try: - return self._lines_normcase_cached + return self._lines_cached except AttributeError: trans = _SWAP_SEP_AND_NEWLINE[self._flavour.sep] - self._lines_normcase_cached = self._str_normcase.translate(trans) - return self._lines_normcase_cached + self._lines_cached = str(self).translate(trans) + return self._lines_cached @property def _matcher(self): @@ -469,7 +468,7 @@ def _matcher(self): if not self.parts: raise ValueError("empty pattern") parts = [r'\A' if self.drive or self.root else '^'] - for part in self._lines_normcase.splitlines(keepends=True): + for part in self._lines.splitlines(keepends=True): if part == '**\n': part = r'[\s\S]*^' elif part == '**': @@ -480,7 +479,10 @@ def _matcher(self): part = fnmatch.translate(part)[_FNMATCH_SLICE] parts.append(part) parts.append(r'\Z') - self._matcher_cached = re.compile(''.join(parts), flags=re.MULTILINE) + flags = re.MULTILINE + if not _is_case_sensitive(self._flavour): + flags |= re.IGNORECASE + self._matcher_cached = re.compile(''.join(parts), flags=flags) return self._matcher_cached def __eq__(self, other): @@ -740,7 +742,7 @@ def match(self, path_pattern): """ if not isinstance(path_pattern, PurePath) or self._flavour is not path_pattern._flavour: path_pattern = type(self)(path_pattern) - match = path_pattern._matcher.search(self._lines_normcase) + match = path_pattern._matcher.search(self._lines) return match is not None From ccea5e18df9980d8c78d3451fc98349df7826211 Mon Sep 17 00:00:00 2001 From: barneygale Date: Thu, 11 May 2023 19:47:59 +0100 Subject: [PATCH 15/28] Add whats new entry --- Doc/whatsnew/3.12.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Doc/whatsnew/3.12.rst b/Doc/whatsnew/3.12.rst index ec04178238b6b0..b9ff02619f47e8 100644 --- a/Doc/whatsnew/3.12.rst +++ b/Doc/whatsnew/3.12.rst @@ -365,6 +365,9 @@ pathlib * Add :meth:`pathlib.Path.is_junction` as a proxy to :func:`os.path.isjunction`. (Contributed by Charles Machalow in :gh:`99547`.) +* Add support for recursive wildcards in :meth:`pathlib.PurePath.match`. + (Contributed by Barney Gale in :gh:`101398`.) + dis --- From dd04294e1300a364c8d1c0bf7c514ef5256424f4 Mon Sep 17 00:00:00 2001 From: Barney Gale Date: Thu, 11 May 2023 20:44:59 +0100 Subject: [PATCH 16/28] Update Doc/whatsnew/3.12.rst Co-authored-by: Hugo van Kemenade --- Doc/whatsnew/3.12.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/whatsnew/3.12.rst b/Doc/whatsnew/3.12.rst index b9ff02619f47e8..e656c8296394dd 100644 --- a/Doc/whatsnew/3.12.rst +++ b/Doc/whatsnew/3.12.rst @@ -366,7 +366,7 @@ pathlib (Contributed by Charles Machalow in :gh:`99547`.) * Add support for recursive wildcards in :meth:`pathlib.PurePath.match`. - (Contributed by Barney Gale in :gh:`101398`.) + (Contributed by Barney Gale in :gh:`73435`.) dis From b258641ce46dca84a8a01b83a4cc629cc6c85db9 Mon Sep 17 00:00:00 2001 From: Barney Gale Date: Sun, 14 May 2023 20:18:21 +0100 Subject: [PATCH 17/28] Apply suggestions from code review Co-authored-by: Alex Waygood --- Lib/pathlib.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 3cc9ed6cb54379..ca48707c544f4a 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -462,7 +462,7 @@ def _matcher(self): return self._matcher_cached except AttributeError: if not self.parts: - raise ValueError("empty pattern") + raise ValueError("empty pattern") from None parts = [r'\A' if self.drive or self.root else '^'] for part in self._lines.splitlines(keepends=True): if part == '**\n': @@ -470,7 +470,7 @@ def _matcher(self): elif part == '**': part = r'[\s\S]*' elif '**' in part: - raise ValueError("Invalid pattern: '**' can only be an entire path component") + raise ValueError("Invalid pattern: '**' can only be an entire path component") from None else: part = fnmatch.translate(part)[_FNMATCH_SLICE] parts.append(part) From ced899853a5fa2bb0dfbaa80013bd3e1a9863971 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 14 May 2023 21:18:46 +0100 Subject: [PATCH 18/28] Explain _FNMATCH_SLICE --- Lib/pathlib.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index ca48707c544f4a..7bcfe5f3d476f9 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -62,6 +62,12 @@ def _is_case_sensitive(flavour): # +# fnmatch.translate() returns a regular expression that includes a prefix and +# a suffix, which enable matching newlines and ensure the end of the string is +# matched, respectively. These features are undesirable for our implementation +# of PurePatch.match(), which represents path separators as newlines and joins +# pattern segments together. As a workaround, we define a slice object that +# remove the prefix and suffix from any translate() result. _FNMATCH_PREFIX, _FNMATCH_SUFFIX = fnmatch.translate('_').split('_') _FNMATCH_SLICE = slice(len(_FNMATCH_PREFIX), -len(_FNMATCH_SUFFIX)) _SWAP_SEP_AND_NEWLINE = { From a33c7b659ae6f554b4a1e44a411acf872e2d388d Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 14 May 2023 21:20:26 +0100 Subject: [PATCH 19/28] Accidentally a word. --- Lib/pathlib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 7bcfe5f3d476f9..536111dcd9a0e5 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -67,7 +67,7 @@ def _is_case_sensitive(flavour): # matched, respectively. These features are undesirable for our implementation # of PurePatch.match(), which represents path separators as newlines and joins # pattern segments together. As a workaround, we define a slice object that -# remove the prefix and suffix from any translate() result. +# can remove the prefix and suffix from any translate() result. _FNMATCH_PREFIX, _FNMATCH_SUFFIX = fnmatch.translate('_').split('_') _FNMATCH_SLICE = slice(len(_FNMATCH_PREFIX), -len(_FNMATCH_SUFFIX)) _SWAP_SEP_AND_NEWLINE = { From 4b3bddb60e58afc5216323b15b7851c2cdce6702 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 14 May 2023 21:49:51 +0100 Subject: [PATCH 20/28] Cache pattern compilation --- Lib/pathlib.py | 56 ++++++++++++++++++++++---------------------------- 1 file changed, 25 insertions(+), 31 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 536111dcd9a0e5..41e78f6e7a121d 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -76,6 +76,28 @@ def _is_case_sensitive(flavour): } +@functools.lru_cache() +def _make_matcher(pattern): + if not pattern.parts: + raise ValueError("empty pattern") from None + parts = [r'\A' if pattern.drive or pattern.root else '^'] + for part in pattern._lines.splitlines(keepends=True): + if part == '**\n': + part = r'[\s\S]*^' + elif part == '**': + part = r'[\s\S]*' + elif '**' in part: + raise ValueError("Invalid pattern: '**' can only be an entire path component") from None + else: + part = fnmatch.translate(part)[_FNMATCH_SLICE] + parts.append(part) + parts.append(r'\Z') + flags = re.MULTILINE + if not _is_case_sensitive(pattern._flavour): + flags |= re.IGNORECASE + return re.compile(''.join(parts), flags=flags) + + @functools.lru_cache() def _make_selector(pattern_parts, flavour, case_sensitive): pat = pattern_parts[0] @@ -286,12 +308,9 @@ class PurePath(object): # to implement comparison methods like `__lt__()`. '_parts_normcase_cached', - # The `_lines_cached` and `_matcher_cached` slots store the - # string path with path separators and newlines swapped, and an - # `re.Pattern` object derived thereof. These are used to implement - # `match()`. + # The `_lines_cached`slot stores the string path with path separators + # and newlines swapped. This is used to implement `match()`. '_lines_cached', - '_matcher_cached', # The `_hash` slot stores the hash of the case-normalized string # path. It's set when `__hash__()` is called for the first time. @@ -462,31 +481,6 @@ def _lines(self): self._lines_cached = str(self).translate(trans) return self._lines_cached - @property - def _matcher(self): - try: - return self._matcher_cached - except AttributeError: - if not self.parts: - raise ValueError("empty pattern") from None - parts = [r'\A' if self.drive or self.root else '^'] - for part in self._lines.splitlines(keepends=True): - if part == '**\n': - part = r'[\s\S]*^' - elif part == '**': - part = r'[\s\S]*' - elif '**' in part: - raise ValueError("Invalid pattern: '**' can only be an entire path component") from None - else: - part = fnmatch.translate(part)[_FNMATCH_SLICE] - parts.append(part) - parts.append(r'\Z') - flags = re.MULTILINE - if not _is_case_sensitive(self._flavour): - flags |= re.IGNORECASE - self._matcher_cached = re.compile(''.join(parts), flags=flags) - return self._matcher_cached - def __eq__(self, other): if not isinstance(other, PurePath): return NotImplemented @@ -745,7 +739,7 @@ def match(self, path_pattern): """ if not isinstance(path_pattern, PurePath) or self._flavour is not path_pattern._flavour: path_pattern = self.with_segments(path_pattern) - match = path_pattern._matcher.search(self._lines) + match = _make_matcher(path_pattern).search(self._lines) return match is not None From 6ad30dd5bd30309b3ae33734e7f9ec92e2e34366 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 14 May 2023 21:51:59 +0100 Subject: [PATCH 21/28] Remove unneeded `from None` suffix, whoops. --- Lib/pathlib.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 41e78f6e7a121d..f1aaf28f24e049 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -79,7 +79,7 @@ def _is_case_sensitive(flavour): @functools.lru_cache() def _make_matcher(pattern): if not pattern.parts: - raise ValueError("empty pattern") from None + raise ValueError("empty pattern") parts = [r'\A' if pattern.drive or pattern.root else '^'] for part in pattern._lines.splitlines(keepends=True): if part == '**\n': @@ -87,7 +87,7 @@ def _make_matcher(pattern): elif part == '**': part = r'[\s\S]*' elif '**' in part: - raise ValueError("Invalid pattern: '**' can only be an entire path component") from None + raise ValueError("Invalid pattern: '**' can only be an entire path component") else: part = fnmatch.translate(part)[_FNMATCH_SLICE] parts.append(part) From 052890f93c4c4ed309a22f8ecc3d4bfd899f6434 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 14 May 2023 21:55:49 +0100 Subject: [PATCH 22/28] Tiny performance improvement: avoid accessing path.parts --- Lib/pathlib.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index f1aaf28f24e049..25fcf5a2f9e006 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -78,9 +78,12 @@ def _is_case_sensitive(flavour): @functools.lru_cache() def _make_matcher(pattern): - if not pattern.parts: + if pattern.drive or pattern.root: + parts = [r'\A'] + elif pattern._tail: + parts = ['^'] + else: raise ValueError("empty pattern") - parts = [r'\A' if pattern.drive or pattern.root else '^'] for part in pattern._lines.splitlines(keepends=True): if part == '**\n': part = r'[\s\S]*^' From d789b6db75f36069fe0705b4aadcb70c743036f5 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 14 May 2023 22:45:09 +0100 Subject: [PATCH 23/28] Typo fix --- Lib/pathlib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 25fcf5a2f9e006..ec87d3c0da7bef 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -311,7 +311,7 @@ class PurePath(object): # to implement comparison methods like `__lt__()`. '_parts_normcase_cached', - # The `_lines_cached`slot stores the string path with path separators + # The `_lines_cached` slot stores the string path with path separators # and newlines swapped. This is used to implement `match()`. '_lines_cached', From 4fe77c64ecd677318fb4d9d04d6c138515724593 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 14 May 2023 22:58:29 +0100 Subject: [PATCH 24/28] Avoid hashing path object when compiling pattern. --- Lib/pathlib.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index ec87d3c0da7bef..c887af7e7a1b6e 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -77,14 +77,9 @@ def _is_case_sensitive(flavour): @functools.lru_cache() -def _make_matcher(pattern): - if pattern.drive or pattern.root: - parts = [r'\A'] - elif pattern._tail: - parts = ['^'] - else: - raise ValueError("empty pattern") - for part in pattern._lines.splitlines(keepends=True): +def _compile_pattern(pattern_lines, case_sensitive): + parts = ['^'] + for part in pattern_lines.splitlines(keepends=True): if part == '**\n': part = r'[\s\S]*^' elif part == '**': @@ -96,7 +91,7 @@ def _make_matcher(pattern): parts.append(part) parts.append(r'\Z') flags = re.MULTILINE - if not _is_case_sensitive(pattern._flavour): + if not case_sensitive: flags |= re.IGNORECASE return re.compile(''.join(parts), flags=flags) @@ -742,8 +737,14 @@ def match(self, path_pattern): """ if not isinstance(path_pattern, PurePath) or self._flavour is not path_pattern._flavour: path_pattern = self.with_segments(path_pattern) - match = _make_matcher(path_pattern).search(self._lines) - return match is not None + case_sensitive = _is_case_sensitive(self._flavour) + pattern = _compile_pattern(path_pattern._lines, case_sensitive) + if path_pattern.drive or path_pattern.root: + return pattern.match(self._lines) is not None + elif path_pattern._tail: + return pattern.search(self._lines) is not None + else: + raise ValueError("empty pattern") # Can't subclass os.PathLike from PurePath and keep the constructor From 4770c13c2fe7cc1e9157a82b5d5297cd477dbcd2 Mon Sep 17 00:00:00 2001 From: barneygale Date: Sun, 14 May 2023 23:41:09 +0100 Subject: [PATCH 25/28] More performance tweaks --- Lib/pathlib.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index c887af7e7a1b6e..25922e0e7c8322 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -54,6 +54,7 @@ def _ignore_error(exception): getattr(exception, 'winerror', None) in _IGNORED_WINERRORS) +@functools.lru_cache() def _is_case_sensitive(flavour): return flavour.normcase('Aa') == 'Aa' @@ -735,7 +736,7 @@ def match(self, path_pattern): """ Return True if this path matches the given pattern. """ - if not isinstance(path_pattern, PurePath) or self._flavour is not path_pattern._flavour: + if not isinstance(path_pattern, PurePath): path_pattern = self.with_segments(path_pattern) case_sensitive = _is_case_sensitive(self._flavour) pattern = _compile_pattern(path_pattern._lines, case_sensitive) From eb35dbc3e552c9a4f59bc6d661995b9a01f167bf Mon Sep 17 00:00:00 2001 From: barneygale Date: Tue, 23 May 2023 23:33:36 +0100 Subject: [PATCH 26/28] Re-target to 3.13. --- Doc/library/pathlib.rst | 2 +- Doc/whatsnew/3.12.rst | 4 ---- Doc/whatsnew/3.13.rst | 6 ++++++ 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/Doc/library/pathlib.rst b/Doc/library/pathlib.rst index 04c1c61bd0f9fa..cca727233db532 100644 --- a/Doc/library/pathlib.rst +++ b/Doc/library/pathlib.rst @@ -588,7 +588,7 @@ Pure paths provide the following methods and properties: .. versionadded:: 3.12 The *case_sensitive* argument. - .. versionchanged:: 3.12 + .. versionchanged:: 3.13 Support for the recursive wildcard "``**``" was added. In previous versions, it acted like the non-recursive wildcard "``*``". diff --git a/Doc/whatsnew/3.12.rst b/Doc/whatsnew/3.12.rst index 1da452f598a612..5e07a4caeb9ebe 100644 --- a/Doc/whatsnew/3.12.rst +++ b/Doc/whatsnew/3.12.rst @@ -399,10 +399,6 @@ pathlib :meth:`pathlib.Path.rglob` and :meth:`pathlib.PurePath.match` for matching the path's case sensitivity, allowing for more precise control over the matching process. -* Add support for recursive wildcards in :meth:`pathlib.PurePath.match`. - (Contributed by Barney Gale in :gh:`73435`.) - - dis --- diff --git a/Doc/whatsnew/3.13.rst b/Doc/whatsnew/3.13.rst index e0c3c2a3592ec7..ab5e4e509aa670 100644 --- a/Doc/whatsnew/3.13.rst +++ b/Doc/whatsnew/3.13.rst @@ -87,6 +87,12 @@ New Modules Improved Modules ================ +pathlib +------- + +* Add support for recursive wildcards in :meth:`pathlib.PurePath.match`. + (Contributed by Barney Gale in :gh:`73435`.) + Optimizations ============= From 9211297b1da404d67a322d4af573ef46e0ce1b3a Mon Sep 17 00:00:00 2001 From: barneygale Date: Tue, 30 May 2023 18:00:43 +0100 Subject: [PATCH 27/28] Add more comments! --- Lib/pathlib.py | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index 401942287c7bc0..d98c0742c9c7d5 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -68,7 +68,8 @@ def _is_case_sensitive(flavour): # matched, respectively. These features are undesirable for our implementation # of PurePatch.match(), which represents path separators as newlines and joins # pattern segments together. As a workaround, we define a slice object that -# can remove the prefix and suffix from any translate() result. +# can remove the prefix and suffix from any translate() result. See the +# _compile_pattern_lines() function for more details. _FNMATCH_PREFIX, _FNMATCH_SUFFIX = fnmatch.translate('_').split('_') _FNMATCH_SLICE = slice(len(_FNMATCH_PREFIX), -len(_FNMATCH_SUFFIX)) _SWAP_SEP_AND_NEWLINE = { @@ -110,17 +111,42 @@ def _compile_pattern(pat, case_sensitive): @functools.lru_cache() def _compile_pattern_lines(pattern_lines, case_sensitive): + """Compile the given pattern lines to an `re.Pattern` object. + + The *pattern_lines* argument is a glob-style pattern (e.g. '**/*.py') with + its path separators and newlines swapped (e.g. '**\n*.py`). By using + newlines to separate path components, and not setting `re.DOTALL`, we + ensure that the `*` wildcard cannot match path separators. + + The returned `re.Pattern` object may have its `match()` method called to + match a complete pattern, or `search()` to match from the right. The + argument supplied to these methods must also have its path separators and + newlines swapped. + """ + + # Match the start of the path, or just after a path separator parts = ['^'] for part in pattern_lines.splitlines(keepends=True): if part == '**\n': + # '**/' component: we use '[\s\S]' rather than '.' so that path + # separators (i.e. newlines) are matched. The trailing '^' ensures + # we terminate after a path separator (i.e. on a new line). part = r'[\s\S]*^' elif part == '**': + # '**' component. part = r'[\s\S]*' elif '**' in part: raise ValueError("Invalid pattern: '**' can only be an entire path component") else: + # Any other component: pass to fnmatch.translate(). We slice off + # the common prefix and suffix added by translate() to ensure that + # re.DOTALL is not set, and the end of the string not matched, + # respectively. With DOTALL not set, '*' wildcards will not match + # path separators, because the '.' characters in the pattern will + # not match newlines. part = fnmatch.translate(part)[_FNMATCH_SLICE] parts.append(part) + # Match the end of the path, always. parts.append(r'\Z') flags = re.MULTILINE if not case_sensitive: From 73bb3096844f5014abf15189e41249fc113670a8 Mon Sep 17 00:00:00 2001 From: Barney Gale Date: Tue, 30 May 2023 20:49:33 +0100 Subject: [PATCH 28/28] Update Lib/pathlib.py Co-authored-by: Alex Waygood --- Lib/pathlib.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/pathlib.py b/Lib/pathlib.py index d98c0742c9c7d5..62406473b66e4f 100644 --- a/Lib/pathlib.py +++ b/Lib/pathlib.py @@ -54,7 +54,7 @@ def _ignore_error(exception): getattr(exception, 'winerror', None) in _IGNORED_WINERRORS) -@functools.lru_cache() +@functools.cache def _is_case_sensitive(flavour): return flavour.normcase('Aa') == 'Aa' pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy