diff --git a/Lib/test/test_textwrap.py b/Lib/test/test_textwrap.py
index 77366988b57fa7..c298d2f4fb414c 100644
--- a/Lib/test/test_textwrap.py
+++ b/Lib/test/test_textwrap.py
@@ -8,9 +8,11 @@
 # $Id$
 #
 
+import re
 import unittest
 
 from textwrap import TextWrapper, wrap, fill, dedent, indent, shorten
+from textwrap import _cached_regex as cached_regex
 
 
 class BaseTestCase(unittest.TestCase):
@@ -712,6 +714,44 @@ def test_do_not_break_long_words_or_on_hyphens(self):
                     'ng_option_', 'indeed-', 'good-bye"']
         self.check_wrap(self.text2, 10, expected)
 
+
+class TextWrapperCachedRegexTestCase(BaseTestCase):
+    def test_attr_access(self):
+        wrapper = TextWrapper()
+        # these names are not part of the public interface,
+        # but are not prefixed with an underscore.
+        for attr in 'wordsep_re', 'wordsep_simple_re', 'sentence_end_re':
+            self.assertTrue(hasattr(wrapper, attr))
+            self.assertIsInstance(getattr(wrapper, attr), re.Pattern)
+            self.assertIsInstance(getattr(TextWrapper, attr), re.Pattern)
+
+            setattr(wrapper, attr, attr)
+            self.assertEqual(getattr(wrapper, attr), attr)
+            self.assertIsInstance(getattr(TextWrapper, attr), re.Pattern)
+
+    def test_cached_regex(self):
+        class Spam:
+            pat1 = cached_regex('pat1')
+            pat2 = cached_regex('pat2')
+
+        # both patterns are instances of cached_regex
+        self.assertIsInstance(Spam.__dict__['pat1'], cached_regex)
+        self.assertIsInstance(Spam.__dict__['pat2'], cached_regex)
+
+        # the attribute is replaced with a compiled pattern when accessed
+        self.assertEqual(Spam.pat1, re.compile('pat1'))
+        self.assertEqual(Spam.__dict__['pat1'], re.compile('pat1'))
+
+        # including when accessed from an instance
+        spam = Spam()
+        self.assertEqual(spam.__dict__, {})
+        self.assertIsInstance(spam.__class__.__dict__['pat2'], cached_regex)
+        self.assertEqual(spam.pat2, re.compile('pat2'))
+        self.assertEqual(Spam.pat2, re.compile('pat2'))
+        self.assertEqual(spam.__class__.__dict__['pat2'], re.compile('pat2'))
+        self.assertIs(spam.pat2, Spam.pat2)
+
+
 class IndentTestCases(BaseTestCase):
 
     # called before each test method
diff --git a/Lib/textwrap.py b/Lib/textwrap.py
index bac98c99e41df8..729f6aead55bec 100644
--- a/Lib/textwrap.py
+++ b/Lib/textwrap.py
@@ -5,10 +5,26 @@
 # Copyright (C) 2002 Python Software Foundation.
 # Written by Greg Ward
 
-import re
-
 __all__ = ['TextWrapper', 'wrap', 'fill', 'dedent', 'indent', 'shorten']
 
+
+class _cached_regex:
+    def __init__(self, pattern):
+        self.pattern = pattern
+
+    def __set_name__(self, owner, name):
+        self.attr_name = name
+
+    def __get__(self, instance, owner=None):
+        if owner is None:
+            return self
+        import re
+        # replace this descriptor with the compiled pattern
+        pat = re.compile(self.pattern)
+        setattr(owner, self.attr_name, pat)
+        return pat
+
+
 # Hardcode the recognized whitespace characters to the US-ASCII
 # whitespace characters.  The main reason for doing this is that
 # some Unicode spaces (like \u00a0) are non-breaking whitespaces.
@@ -73,41 +89,39 @@ class TextWrapper:
     # (after stripping out empty strings).
     word_punct = r'[\w!"\'&.,?]'
     letter = r'[^\d\W]'
-    whitespace = r'[%s]' % re.escape(_whitespace)
-    nowhitespace = '[^' + whitespace[1:]
-    wordsep_re = re.compile(r'''
+    whitespace = fr'[{_whitespace}]'
+    no_whitespace = f'[^{_whitespace}]'
+    wordsep_re = _cached_regex(fr'''(?x)
         ( # any whitespace
-          %(ws)s+
+          {whitespace}+
         | # em-dash between words
-          (?<=%(wp)s) -{2,} (?=\w)
+          (?<={word_punct}) -{{2,}} (?=\w)
         | # word, possibly hyphenated
-          %(nws)s+? (?:
+          {no_whitespace}+? (?:
             # hyphenated word
-              -(?: (?<=%(lt)s{2}-) | (?<=%(lt)s-%(lt)s-))
-              (?= %(lt)s -? %(lt)s)
+              -(?: (?<={letter}{{2}}-) | (?<={letter}-{letter}-))
+              (?= {letter} -? {letter})
             | # end of word
-              (?=%(ws)s|\Z)
+              (?={whitespace}|\Z)
             | # em-dash
-              (?<=%(wp)s) (?=-{2,}\w)
+              (?<={word_punct}) (?=-{{2,}}\w)
             )
-        )''' % {'wp': word_punct, 'lt': letter,
-                'ws': whitespace, 'nws': nowhitespace},
-        re.VERBOSE)
-    del word_punct, letter, nowhitespace
+        )''')
+    del word_punct, letter, no_whitespace
 
     # This less funky little regex just split on recognized spaces. E.g.
     #   "Hello there -- you goof-ball, use the -b option!"
     # splits into
     #   Hello/ /there/ /--/ /you/ /goof-ball,/ /use/ /the/ /-b/ /option!/
-    wordsep_simple_re = re.compile(r'(%s+)' % whitespace)
+    wordsep_simple_re = _cached_regex(fr'({whitespace}+)')
     del whitespace
 
     # XXX this is not locale- or charset-aware -- string.lowercase
     # is US-ASCII only (and therefore English-only)
-    sentence_end_re = re.compile(r'[a-z]'              # lowercase letter
-                                 r'[\.\!\?]'           # sentence-ending punct.
-                                 r'[\"\']?'            # optional end-of-quote
-                                 r'\Z')                # end of chunk
+    sentence_end_re = _cached_regex(r'[a-z]'           # lowercase letter
+                                    r'[\.\!\?]'        # sentence-ending punct.
+                                    r'[\"\']?'         # optional end-of-quote
+                                    r'\Z')             # end of chunk
 
     def __init__(self,
                  width=70,
@@ -250,7 +264,7 @@ def _wrap_chunks(self, chunks):
         """
         lines = []
         if self.width <= 0:
-            raise ValueError("invalid width %r (must be > 0)" % self.width)
+            raise ValueError(f"invalid width {self.width!r} (must be > 0)")
         if self.max_lines is not None:
             if self.max_lines > 1:
                 indent = self.subsequent_indent
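The patch above makes TextWrapper's class-level regexes lazy: each attribute starts
out as a _cached_regex descriptor holding the raw pattern string, and the first
attribute access compiles it and overwrites the attribute on the owning class with
the re.Pattern object, so importing textwrap no longer pays for re.compile (or even
for importing re) up front. The following is a minimal standalone sketch of that
caching-descriptor idea, not code from the patch; the names LazyRegex, Config and
number_re are illustrative only.

import re


class LazyRegex:
    """Compile a regex on first attribute access, then cache it on the class."""

    def __init__(self, pattern):
        self.pattern = pattern

    def __set_name__(self, owner, name):
        # remember which attribute this descriptor is bound to
        self.attr_name = name

    def __get__(self, instance, owner=None):
        if owner is None:
            return self
        # overwrite the descriptor on the owning class with the compiled
        # pattern, so later lookups bypass __get__ entirely
        pat = re.compile(self.pattern)
        setattr(owner, self.attr_name, pat)
        return pat


class Config:
    number_re = LazyRegex(r'\d+')


if __name__ == '__main__':
    # before the first access, the class dict still holds the descriptor
    assert isinstance(Config.__dict__['number_re'], LazyRegex)
    # the first access compiles the pattern and replaces the class attribute
    assert Config.number_re is Config.__dict__['number_re']
    assert isinstance(Config.number_re, re.Pattern)
    print(Config.number_re.findall('a1 b22 c333'))  # ['1', '22', '333']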
