From da36214fc4de9b073d3e6d9562e588d553509ee1 Mon Sep 17 00:00:00 2001 From: Jessica James Date: Fri, 18 Apr 2025 17:23:30 -0400 Subject: [PATCH 1/9] GH-115512: Optimize peak memory usage and runtime for large emails --- Lib/email/feedparser.py | 466 ++++++++++++++++++++++++------ Lib/email/parser.py | 41 ++- Lib/test/test_email/test_email.py | 204 +++++++++++++ Misc/ACKS | 1 + 4 files changed, 610 insertions(+), 102 deletions(-) diff --git a/Lib/email/feedparser.py b/Lib/email/feedparser.py index b2bc4afc1cc26f..26c226f589c4a0 100644 --- a/Lib/email/feedparser.py +++ b/Lib/email/feedparser.py @@ -22,6 +22,7 @@ __all__ = ['FeedParser', 'BytesFeedParser'] import re +import sys from email import errors from email._policybase import compat32 @@ -52,31 +53,30 @@ class BufferedSubFile(object): simple abstraction -- it parses until EOF closes the current message. """ def __init__(self): - # Text stream of the last partial line pushed into this object. - # See issue 22233 for why this is a text stream and not a list. - self._partial = StringIO(newline='') + self._partial: list[str] = [] + self._dangling_partial: bool = False # A deque of full, pushed lines - self._lines = deque() + self._lines: deque[str] = deque() # The stack of false-EOF checking predicates. self._eofstack = [] # A flag indicating whether the file has been closed or not. - self._closed = False + self._closed: bool = False + self._dump_destination: deque[str]|None = None + self._dump_result: str|None = None - def push_eof_matcher(self, pred): + def push_eof_matcher(self, pred) -> None: self._eofstack.append(pred) def pop_eof_matcher(self): return self._eofstack.pop() - def close(self): + def close(self) -> None: # Don't forget any trailing partial line. - self._partial.seek(0) - self.pushlines(self._partial.readlines()) - self._partial.seek(0) - self._partial.truncate() + if self._partial: + self._flush_partial() self._closed = True - def readline(self): + def readline(self) -> str|object: if not self._lines: if self._closed: return '' @@ -87,51 +87,351 @@ def readline(self): # RFC 2046, section 5.1.2 requires us to recognize outer level # boundaries at any level of inner nesting. Do this, but be sure it's # in the order of most to least nested. - for ateof in reversed(self._eofstack): - if ateof(line): - # We're at the false EOF. But push the last line back first. - self._lines.appendleft(line) - return '' + if self._check_eofstack(line): + # We're at the false EOF. But push the last line back first. + self._lines.appendleft(line) + return '' + return line - def unreadline(self, line): + def _check_eofstack(self, data, start=0, end=sys.maxsize) -> bool: + for ateof in reversed(self._eofstack): + if ateof(data, start, end): + # We're at the false EOF. + return True + + return False + + def unreadline(self, line) -> None: # Let the consumer push a line back into the buffer. assert line is not NeedMoreData self._lines.appendleft(line) - def push(self, data): + def _flush_partial(self) -> None: + line = EMPTYSTRING.join(self._partial) + if not line: + pass + elif self._dump_destination is None: + # We're not dumping data. Just flush the partial to lines, as normal + self._lines.append(line) + elif self._check_eofstack(line): + # We were dumping, but we've now reached the end of the dump. Push our line and stop dumping. 
+ self._dump_destination = None + self._lines.append(line) + else: + # We're still dumping; push to dump + self._dump_destination.append(line) + + self._partial.clear() + self._dangling_partial = False + + def push(self, data) -> None: """Push some new data into this object.""" - self._partial.write(data) - if '\n' not in data and '\r' not in data: + if not data: + return + + # If we're dumping, and we don't have anything that will ever tell us to terminate, simply dump everything + if self._can_dump_data(data): + self._dump_destination.append(data) + return + + self._push_data(data) + + def _can_dump_data(self, data) -> bool: + if self._dump_destination is None: + # We're not dumping data + return False + + # We're dumping; check for easy optimizations + if not self._eofstack: + # There's nothing that will ever tell us to stop dumping. Go ahead and dump the entire `data` object. + # This does absolute wonders for large non-multipart emails. + assert not self._lines + assert not self._dangling_partial + assert not self._partial + return True + + # We can't dump this blob if we have pending partial data + if self._partial: + return False + + all_boundary_matches = True + for pred in self._eofstack: + if not hasattr(pred, 'is_boundary_match'): + all_boundary_matches = False + break + + if all_boundary_matches and '-' not in data: + # We eventually need to stop, but we only care about boundary matches, and there's no boundaries + # here. Dump the entire `data` object. This does wonders for multipart emails with large parts. + assert not self._lines + return True + + # We're still dumping, but there's a potential boundary marker or EOF or similar issue. Force a proper parse. + return False + + def _can_dump_partial(self, line, start: int=0, end: int=sys.maxsize) -> bool: + # Very similar to _can_dump_data above, except we can make some additional assumptions for partials/lines. + # This should only ever be checked when we have a new partial line, in which case we have no partial, + # or when checking the partial itself, in which case it'll always be the first part + assert not self._partial or line is self._partial[0] + + if self._dump_destination is None: + # We're not dumping data + return False + + # We're dumping. There should be absolutely no other pending lines, because those should've been dumped. + assert not self._lines + if not self._eofstack: + # There's nothing that will ever tell us to stop dumping. Dump away. + return True + + all_boundary_matches = True + for pred in self._eofstack: + if not hasattr(pred, 'is_boundary_match'): + all_boundary_matches = False + break + + if all_boundary_matches and not line.startswith("-", start, end): + # We eventually need to stop, but we only care about boundary matches, and there's no boundaries + # here. Dump the entire `data` object. This does wonders for multipart emails with large parts. + return True + + # We're still dumping, but there's a potential boundary marker or EOF or similar issue. Force a proper parse. + return False + + def _is_dump_midline(self): + if not self._dump_destination: + return False + + assert self._dump_destination[-1] # Never push empty strings to _dump_destination + return self._dump_destination[-1][-1] not in ('\n', '\r') + + def _push_data(self, data) -> None: + # Find first newline character in the data + unl_start_index = BufferedSubFile._find_unl(data) + if unl_start_index < 0: # No new complete lines, wait for more. 
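# Worked examples for the universal-newline helpers used just above (they are
# defined as staticmethods further down in BufferedSubFile; the values follow
# directly from that code and assume this patch is applied):
from email.feedparser import BufferedSubFile as _BSF

assert _BSF._find_unl('ab\ncd') == 2           # bare LF
assert _BSF._find_unl('ab\r\ncd') == 2         # CRLF: index of the CR
assert _BSF._find_unl('abcd') == -1            # no newline at all
assert _BSF._find_unl_end('ab\ncd', 2) == 3    # one past the LF
assert _BSF._find_unl_end('ab\r\ncd', 2) == 4  # one past the CRLF pair
assert _BSF._find_unl_end('ab\rX', 2) == 3     # bare CR terminates a line
assert _BSF._find_unl_end('ab\r', 2) == -1     # dangling CR: may be a CRLF split across chunks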
+ # Check to see if we had a previous dangling partial newline + if self._dangling_partial: + # We previously pushed a dangling line expecting a \n to follow, however we received other data instead. + # Therefore, that \r does actually terminate a line. Go ahead and push it. + self._flush_partial() + + # No lines in data to push; wait for more data + if self._is_dump_midline(): + assert not self._partial + self._dump_destination.append(data) + else: + self._partial.append(data) return - # Crack into lines, preserving the linesep characters. - self._partial.seek(0) - parts = self._partial.readlines() - self._partial.seek(0) - self._partial.truncate() - - # If the last element of the list does not end in a newline, then treat - # it as a partial line. We only check for '\n' here because a line - # ending with '\r' might be a line that was split in the middle of a - # '\r\n' sequence (see bugs 1555570 and 1721862). - if not parts[-1].endswith('\n'): - self._partial.write(parts.pop()) - self.pushlines(parts) - - def pushlines(self, lines): + data_start_index = 0 + + # Complete our previous/partial line + if self._partial: + # Check to see if we had any dangling newlines in our partial, and handle if appropriate + if self._dangling_partial: + # We had a previously dangling line; this is either a \n (completion), or some other char (termination) + if data[0] != NL: + # "\r" -- push what we had, as it has been terminated; data_start_index = 0 + self._flush_partial() + else: + # "\r\n" -- append \n and push it; data_start_index = 1 + self._partial.append(NL) + self._flush_partial() + data_start_index = 1 + + # Find the next newline + unl_start_index = BufferedSubFile._find_unl(data, data_start_index) + # Fall through + else: + # Our partial has no dangling newline; complete our partial with the new line and push it + unl_end_index = BufferedSubFile._find_unl_end(data, unl_start_index) + if unl_end_index < 0: + # The newline is incomplete; append data and return + self._partial.append(data) + self._dangling_partial = True + return + + # We have a complete line; append it and flush _partial + self._partial.append(data[data_start_index:unl_end_index]) + self._flush_partial() + data_start_index = unl_end_index + + # Find the next newline + unl_start_index = BufferedSubFile._find_unl(data, data_start_index) + # Fall through + + # _partial is now guaranteed to point to be empty + # data_start_index is an index which points to the start of the next line + # unl_start_index is an index which points to the start of the next newline character, if there is one + self._push_data_no_partial(data, data_start_index, unl_start_index) + + def _push_data_no_partial(self, data, data_start_index: int, unl_start_index: int) -> None: + # _partial is now guaranteed to point to be empty + # data_start_index is an index which points to the start of the next line + # unl_start_index is an index which points to the start of the next newline character, if there is one + + # Process any remaining whole lines in data + if unl_start_index < 0: + # Push right to the partial if there's no lines + if data_start_index < len(data): + assert data_start_index >= 0 + partial_line = data[data_start_index:] + if self._is_dump_midline() or self._can_dump_partial(partial_line): + self._dump_destination.append(partial_line) + else: + self._partial = [partial_line] + if data[-1] == '\r': + self._dangling_partial = True + elif self._dump_destination is None and unl_start_index < len(data) // 2: + # If it looks like we're going to be doing a 
lot of splits/joins, just go ahead and use StringIO, for speed + # If we had some sort of "StringViewIO" to avoid the copy, this would be significantly more efficient + # This code block, and the "else" code block below, functionally do the exact same thing, except this path + # makes no attempt to handle dumping data + sio = StringIO(data, '') + sio.seek(data_start_index) + lines = sio.readlines() + if lines: + if data[-1] != '\n': + self._partial.append(lines.pop()) + if data[-1] == '\r': + self._dangling_partial = True + + self.pushlines(lines) + else: + # If we're not, let's keep it in Python + dump_data_start = None if self._dump_destination is None else data_start_index + while unl_start_index >= 0: + unl_end_index = BufferedSubFile._find_unl_end(data, unl_start_index) + if unl_end_index < 0: + # Incomplete line ending; break to update our partial and return + self._dangling_partial = True + break + + # We have an easy line; push it + if self._dump_destination is not None: + # We have a window into a line. Make sure it's not EOF, and continue as long as it's not + if self._check_eofstack(data, data_start_index, unl_end_index): + # This line is "EOF". This is the end of our dump data! Push the dump data. + self._dump_destination.append(data[dump_data_start:data_start_index]) + + # Also push our line, since we already have it + self._lines.append(data[data_start_index:unl_end_index]) + + # Mark dump complete + self._dump_destination = None + #else: # This line didn't mark the end. Keep going. + else: + # We're not dumping. Just go ahead and push the line + self._lines.append(data[data_start_index:unl_end_index]) + + # Update our iterators + data_start_index = unl_end_index + unl_start_index = BufferedSubFile._find_unl(data, data_start_index) + + # If we're still dumping, push everything that isn't going into the partial to the dump + if self._dump_destination is not None: + # If we're able to safely flush the partial, go ahead and do that too + # We don't care about self._is_dump_midline() here, because data_start_index always represents the + # start of a new line, always + if self._can_dump_partial(data, data_start_index): + self._dump_destination.append(data[dump_data_start:]) + + # We've consumed the partial; flush any partial-related state we may have set + self._dangling_partial = False + return # skip the _partial.append below, because it's already been consumed + else: + self._dump_destination.append(data[dump_data_start:data_start_index]) + + # If we have any partial data leftover, go ahead and set it + if data_start_index < len(data): + self._partial.append(data[data_start_index:]) + + def pushlines(self, lines) -> None: + # This method is not documented on docs.python.org self._lines.extend(lines) def __iter__(self): return self - def __next__(self): + def __next__(self) -> str|object: line = self.readline() if line == '': raise StopIteration return line + def _get_dump(self, start_value:str|None = None): + _dump_destination = deque() + self._dump_destination = _dump_destination + + if start_value: + _dump_destination.append(start_value) + + # Flush our current _lines to _dump_destination + needs_more_data = False + for line in self: + if line is NeedMoreData: + needs_more_data = True + break + _dump_destination.append(line) + + # Pull in more data, if we need more + if needs_more_data: + # Flush our partial, if we can + if self._partial and self._can_dump_partial(self._partial[0]): + assert self._partial[0] # We shouldn't ever push empty strings to _partial + 
_dump_destination.extend(self._partial) + self._partial.clear() + self._dangling_partial = False + + # Pull in more data until we're told to stop + while not self._closed and self._dump_destination is not None: + yield NeedMoreData + + # Flush our final dump string to _dump_result + self._dump_destination = None + self._dump_result = EMPTYSTRING.join(_dump_destination) + + def _pop_dump(self) -> str: + result = self._dump_result + self._dump_result = None + return result + + @staticmethod + def _find_unl(data, start=0) -> int: + # Like str.find(), but for universal newlines + # Originally, this iterated over the string, however just calling find() twice is drastically faster + # This could be sped up by replacing with a similar function in C, so we don't pass over the string twice. + cr_index = data.find('\r', start) + if cr_index < 0: + return data.find(NL, start) + + nl_index = data.find(NL, start, cr_index) + return nl_index if nl_index >= 0 else cr_index + + @staticmethod + def _find_unl_end(data, start) -> int: + # A helper function which returns the 1-past-the-end index of a universal newline + # This could be sped up by replacing with a similar function in C. + #assert data[start] in '\r\n' + + # \n is always end of line + if data.startswith(NL, start): + return start + 1 + + # \r\n is always end of line + if data.startswith(NL, start + 1): + return start + 2 + + # End of string; we can't know if a \n follows, so no universal line end + if start + 1 >= len(data): + return -1 + + # This is a \r followed by some other non-newline character + return start + 1 + class FeedParser: """A feed-style parser of email.""" @@ -161,23 +461,23 @@ def __init__(self, _factory=None, *, policy=compat32): self._old_style_factory = True self._input = BufferedSubFile() self._msgstack = [] - self._parse = self._parsegen().__next__ + self._parse = self._parsegen().__next__ # Interesting trick which replaces yield values with return values self._cur = None self._last = None self._headersonly = False # Non-public interface for supporting Parser's headersonly flag - def _set_headersonly(self): + def _set_headersonly(self) -> None: self._headersonly = True - def feed(self, data): + def feed(self, data) -> None: """Push more data into the parser.""" self._input.push(data) self._call_parse() - def _call_parse(self): + def _call_parse(self) -> None: try: - self._parse() + self._parse() # Return value is always NeedMoreData or None, but discarded here in either case except StopIteration: pass @@ -194,7 +494,7 @@ def close(self): self.policy.handle_defect(root, defect) return root - def _new_message(self): + def _new_message(self) -> None: if self._old_style_factory: msg = self._factory() else: @@ -215,7 +515,7 @@ def _pop_message(self): self._cur = None return retval - def _parsegen(self): + def _parsegen(self): # yields: NeedMoreData # Create a new message and start by parsing headers. self._new_message() headers = [] @@ -242,16 +542,8 @@ def _parsegen(self): # necessary in the older parser, which could raise errors. All # remaining lines in the input are thrown into the message body. 
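# A minimal sketch of the dump protocol that the hunks below rely on
# (illustrative only; it assumes this patch is applied and pokes at private
# helpers purely to show the control flow behind
# `yield from self._input._get_dump()` followed by `self._input._pop_dump()`):
from email.feedparser import BufferedSubFile

bsf = BufferedSubFile()
dump = bsf._get_dump()
for _ in dump:                    # yields NeedMoreData while the stream is still open
    bsf.push('rest of the body\r\nmore text\r\n')   # whole chunk goes straight to the dump
    bsf.close()                   # closing (or a matching EOF predicate) ends the dump
payload = bsf._pop_dump()         # one joined string; no per-line list is built
assert payload == 'rest of the body\r\nmore text\r\n'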
if self._headersonly: - lines = [] - while True: - line = self._input.readline() - if line is NeedMoreData: - yield NeedMoreData - continue - if line == '': - break - lines.append(line) - self._cur.set_payload(EMPTYSTRING.join(lines)) + yield from self._input._get_dump() + self._cur.set_payload(self._input._pop_dump()) return if self._cur.get_content_type() == 'message/delivery-status': # message/delivery-status contains blocks of headers separated by @@ -311,13 +603,8 @@ def _parsegen(self): # defective. defect = errors.NoBoundaryInMultipartDefect() self.policy.handle_defect(self._cur, defect) - lines = [] - for line in self._input: - if line is NeedMoreData: - yield NeedMoreData - continue - lines.append(line) - self._cur.set_payload(EMPTYSTRING.join(lines)) + yield from self._input._get_dump() + self._cur.set_payload(self._input._pop_dump()) return # Make sure a valid content type was specified per RFC 2045:6.4. if (str(self._cur.get('content-transfer-encoding', '8bit')).lower() @@ -329,10 +616,11 @@ def _parsegen(self): # this onto the input stream until we've scanned past the # preamble. separator = '--' + boundary - def boundarymatch(line): - if not line.startswith(separator): + def boundarymatch(line, pos: int = 0, endpos: int = sys.maxsize): + if not line.startswith(separator, pos, endpos): return None - return boundaryendRE.match(line, len(separator)) + return boundaryendRE.match(line, pos + len(separator), endpos) + boundarymatch.is_boundary_match = True capturing_preamble = True preamble = [] linesep = False @@ -424,12 +712,11 @@ def boundarymatch(line): defect = errors.StartBoundaryNotFoundDefect() self.policy.handle_defect(self._cur, defect) self._cur.set_payload(EMPTYSTRING.join(preamble)) - epilogue = [] for line in self._input: if line is NeedMoreData: yield NeedMoreData continue - self._cur.epilogue = EMPTYSTRING.join(epilogue) + self._cur.epilogue = '' return # If we're not processing the preamble, then we might have seen # EOF without seeing that end boundary...that is also a defect. @@ -440,36 +727,29 @@ def boundarymatch(line): # Everything from here to the EOF is epilogue. If the end boundary # ended in a newline, we'll need to make sure the epilogue isn't # None - if linesep: - epilogue = [''] - else: - epilogue = [] - for line in self._input: - if line is NeedMoreData: - yield NeedMoreData - continue - epilogue.append(line) - # Any CRLF at the front of the epilogue is not technically part of - # the epilogue. Also, watch out for an empty string epilogue, - # which means a single newline. - if epilogue: - firstline = epilogue[0] - bolmo = NLCRE_bol.match(firstline) - if bolmo: - epilogue[0] = firstline[len(bolmo.group(0)):] - self._cur.epilogue = EMPTYSTRING.join(epilogue) + first_line = '' + if not linesep: + for line in self._input: + if line is NeedMoreData: + yield NeedMoreData + continue + + first_line = line + if first_line: + bolmo = NLCRE_bol.match(first_line) + if bolmo: + first_line = first_line[len(bolmo.group(0)):] + break + + yield from self._input._get_dump(first_line) + self._cur.epilogue = self._input._pop_dump() return # Otherwise, it's some non-multipart type, so the entire rest of the # file contents becomes the payload. 
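# Why boundarymatch (defined earlier in this hunk) gained pos/endpos and the
# is_boundary_match tag: _check_eofstack(data, start, end) can test a candidate
# line as a window into a larger chunk without slicing it out, and the dump
# fast path can skip boundary checks entirely for chunks containing no '-'.
# A small illustration with a hypothetical boundary value:
separator = '--BOUNDARY'
chunk = 'some body text\n--BOUNDARY\nnext part\n'
assert not chunk.startswith(separator, 0, 14)   # 'some body text' is not a boundary line
assert chunk.startswith(separator, 15, 25)      # the window chunk[15:25] is one
assert '-' in chunk                             # so this chunk cannot take the no-'-' fast path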
- lines = [] - for line in self._input: - if line is NeedMoreData: - yield NeedMoreData - continue - lines.append(line) - self._cur.set_payload(EMPTYSTRING.join(lines)) + yield from self._input._get_dump() + self._cur.set_payload(self._input._pop_dump()) - def _parse_headers(self, lines): + def _parse_headers(self, lines) -> None: # Passed a list of lines that make up the headers for the current msg lastheader = '' lastvalue = [] @@ -533,5 +813,5 @@ def _parse_headers(self, lines): class BytesFeedParser(FeedParser): """Like FeedParser, but feed accepts bytes.""" - def feed(self, data): + def feed(self, data) -> None: super().feed(data.decode('ascii', 'surrogateescape')) diff --git a/Lib/email/parser.py b/Lib/email/parser.py index 039f03cba74fa0..a182daebec0092 100644 --- a/Lib/email/parser.py +++ b/Lib/email/parser.py @@ -12,6 +12,8 @@ from email.feedparser import FeedParser, BytesFeedParser from email._policybase import compat32 +_FEED_CHUNK_SIZE = 8192 + class Parser: def __init__(self, _class=None, *, policy=compat32): @@ -38,6 +40,18 @@ def __init__(self, _class=None, *, policy=compat32): self._class = _class self.policy = policy + def _parse_chunks(self, chunk_generator, headersonly=False): + """Internal method / implementation detail + + Parses chunks from a chunk generator into a FeedParser, returning the result + """ + feedparser = FeedParser(self._class, policy=self.policy) + if headersonly: + feedparser._set_headersonly() + for data in chunk_generator: + feedparser.feed(data) + return feedparser.close() + def parse(self, fp, headersonly=False): """Create a message structure from the data in a file. @@ -46,12 +60,12 @@ def parse(self, fp, headersonly=False): parsing after reading the headers or not. The default is False, meaning it parses the entire contents of the file. """ - feedparser = FeedParser(self._class, policy=self.policy) - if headersonly: - feedparser._set_headersonly() - while data := fp.read(8192): - feedparser.feed(data) - return feedparser.close() + def _fp_get_chunks(): + while data := fp.read(_FEED_CHUNK_SIZE): + yield data + _chunk_generator = _fp_get_chunks() + + return self._parse_chunks(_chunk_generator, headersonly) def parsestr(self, text, headersonly=False): """Create a message structure from a string. @@ -61,7 +75,12 @@ def parsestr(self, text, headersonly=False): not. The default is False, meaning it parses the entire contents of the file. """ - return self.parse(StringIO(text), headersonly=headersonly) + _chunk_generator = ( + text[offset:offset + _FEED_CHUNK_SIZE] + for offset in range(0, len(text), _FEED_CHUNK_SIZE) + ) + + return self._parse_chunks(_chunk_generator, headersonly) class HeaderParser(Parser): @@ -115,8 +134,12 @@ def parsebytes(self, text, headersonly=False): not. The default is False, meaning it parses the entire contents of the file. 
""" - text = text.decode('ASCII', errors='surrogateescape') - return self.parser.parsestr(text, headersonly) + _chunk_generator = ( + text[offset:offset + _FEED_CHUNK_SIZE].decode('ASCII', errors='surrogateescape') + for offset in range(0, len(text), _FEED_CHUNK_SIZE) + ) + + return self.parser._parse_chunks(_chunk_generator, headersonly) class BytesHeaderParser(BytesParser): diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py index 7b14305f997e5d..b02931e97b7f27 100644 --- a/Lib/test/test_email/test_email.py +++ b/Lib/test/test_email/test_email.py @@ -4600,6 +4600,210 @@ def _idempotent(self, msg, data, unixfrom=False): g.flatten(msg, unixfrom=unixfrom, linesep=self.linesep) self.assertEqual(data, b.getvalue()) +class TestFeedParserTrickle(TestEmailBase): + @staticmethod + def _msgobj_trickle(filename, trickle_size=2, force_linetype="\r\n"): + # Trickle data into the feed parser, one character at a time + with openfile(filename, encoding="utf-8") as fp: + file_str = fp.read() + file_str = file_str.replace("\r\n", "\n").replace("\r", "\n").replace("\n", force_linetype) + + feedparser = FeedParser() + for index in range(0, len(file_str), trickle_size): + feedparser.feed(file_str[index:index + trickle_size]) + return feedparser.close() + + def _validate_msg10_msgobj(self, msg, line_end): + if isinstance(line_end, str): + line_end = line_end.encode() + eq = self.assertEqual + # The outer message is a multipart + eq(msg.get_payload(decode=True), None) + # Subpart 1 is 7bit encoded + eq(msg.get_payload(0).get_payload(decode=True), + b'This is a 7bit encoded message.' + line_end) + # Subpart 2 is quopri + eq(msg.get_payload(1).get_payload(decode=True), + b'\xa1This is a Quoted Printable encoded message!' + line_end) + # Subpart 3 is base64 + eq(msg.get_payload(2).get_payload(decode=True), + b'This is a Base64 encoded message.') + # Subpart 4 is base64 with a trailing newline, which + # used to be stripped (issue 7143). + eq(msg.get_payload(3).get_payload(decode=True), + b'This is a Base64 encoded message.\n') + # Subpart 5 has no Content-Transfer-Encoding: header. + eq(msg.get_payload(4).get_payload(decode=True), + b'This has no Content-Transfer-Encoding: header.' + line_end) + + def test_trickle_1chr_crlf(self): + msg = self._msgobj_trickle('msg_10.txt', 1, '\r\n') + self._validate_msg10_msgobj(msg, '\r\n') + + def test_trickle_1chr_cr(self): + msg = self._msgobj_trickle('msg_10.txt', 1, '\r') + self._validate_msg10_msgobj(msg, '\r') + + def test_trickle_1chr_lf(self): + msg = self._msgobj_trickle('msg_10.txt', 1, '\n') + self._validate_msg10_msgobj(msg, '\n') + + def test_trickle_2chr_crlf(self): + # During initial testing, it was realized that an edge case was missed around dangling newlines. + # This helps test that behavior, as it is not otherwise covered by tests. 
+ msg = self._msgobj_trickle('msg_10.txt', 2, '\r\n') + self._validate_msg10_msgobj(msg, '\r\n') + + def test_trickle_2chr_cr(self): + msg = self._msgobj_trickle('msg_10.txt', 2, '\r') + self._validate_msg10_msgobj(msg, '\r') + + def test_trickle_2chr_lf(self): + msg = self._msgobj_trickle('msg_10.txt', 2, '\n') + self._validate_msg10_msgobj(msg, '\n') + + def test_trickle_3chr_crlf(self): + msg = self._msgobj_trickle('msg_10.txt', 3, '\r\n') + self._validate_msg10_msgobj(msg, '\r\n') + + def test_trickle_3chr_cr(self): + msg = self._msgobj_trickle('msg_10.txt', 3, '\r') + self._validate_msg10_msgobj(msg, '\r') + + def test_trickle_3chr_lf(self): + msg = self._msgobj_trickle('msg_10.txt', 3, '\n') + self._validate_msg10_msgobj(msg, '\n') + + +class TestPeakMemoryUsage(unittest.TestCase): + + maxDiff = None + SMALLER_CHUNK_SIZE = 1024 + + def _msg_bytes(self, filename): + with openfile(filename, 'rb') as fp: + data = fp.read() + return data + + def _make_plaintext_msg_bytes(self, min_size): + # Get msg_01 as our baseline + msg_bytes = self._msg_bytes('msg_01.txt') + if len(msg_bytes) < min_size: + # Make it bigger + msg_bytes = msg_bytes * ((min_size // len(msg_bytes)) + 1) + msg_bytes = msg_bytes[:min_size] # Truncate it to min_size + assert len(msg_bytes) >= min_size + + match = re.search(rb'(\r|\n|\r\n){2}', msg_bytes) + self.assertIsNotNone(match) + expected_payload = msg_bytes[match.end():] + + return msg_bytes, expected_payload + + def _measure_message_from_bytes(self, msg_bytes): + import tracemalloc + + # Call email.message_from_bytes, gathering some memory usage stats in the process + tracemalloc.start() + start_time = time.perf_counter() + msgobj = email.message_from_bytes(msg_bytes, policy=email.policy.default) + end_time = time.perf_counter() + after_bytes, after_peak_bytes = tracemalloc.get_traced_memory() + tracemalloc.stop() + + # "How many bytes did we allocate, that were ultimately discarded?" + peak_overhead = after_peak_bytes - after_bytes + + # "How large was that overhead, relative to the size of the message?" 
+ overhead_ratio = peak_overhead / len(msg_bytes) if len(msg_bytes) > 0 else None + + return msgobj, peak_overhead, overhead_ratio, end_time - start_time + + def _base64_encode(self, bytes_to_encode, one_line=True): + base64_str = base64mime.body_encode(bytes_to_encode) + if one_line: + base64_str = "".join(base64_str.splitlines()) + return base64_str + + _multipart_msg_base = textwrap.dedent("""\ + Date: Wed, 14 Nov 2007 12:56:23 GMT + From: foo@bar.invalid + To: foo@bar.invalid + Subject: Content-Transfer-Encoding: base64 and multipart + MIME-Version: 1.0 + Content-Type: multipart/mixed; boundary="BOUNDARY" + + --BOUNDARY + Content-Type: text/plain + + Test message + + --BOUNDARY + Content-Type: application/octet-stream + Content-Transfer-Encoding: base64 + + {} + --BOUNDARY-- + """) + + def _make_junk_bytes(self, bytes_length): + junk_data = bytearray(bytes_length) + for i in range(len(junk_data)): + junk_data[i] = i % 256 + return bytes(junk_data) + + def _make_junk_base64(self, bytes_length, one_line=True): + junk_bytes = self._make_junk_bytes(bytes_length) + return self._base64_encode(junk_bytes, one_line), junk_bytes + + _LARGE_EMAIL_BYTE_SIZE = 1024*1024*10 # 10 MiB + + def test_message_from_bytes_plaintext(self): + # Generate a 10MiB plaintext email + msg_bytes, expected_payload = self._make_plaintext_msg_bytes(self._LARGE_EMAIL_BYTE_SIZE) + + # Parse it, collecting stats + msgobj, peak_overhead, overhead_ratio, time_taken = self._measure_message_from_bytes(msg_bytes) + + # Verify the message payload/content is correct. + self.assertEqual(msgobj.get_payload(decode=True), expected_payload) + self.assertEqual(msgobj.get_content(), expected_payload.decode()) + + # overhead_ratio at time of writing: 1.0102445602416992 + self.assertLess(overhead_ratio, 1.05) + + def test_message_from_bytes_large_attachment_body_encoded(self): + # Generate a 10 MiB attachment + attachment_base64, attachment_bytes = self._make_junk_base64(self._LARGE_EMAIL_BYTE_SIZE, False) + multipart_msg_bytes = self._multipart_msg_base.format(attachment_base64).encode() + + # Parse it, collecting stats + msgobj, peak_overhead, overhead_ratio, time_taken = self._measure_message_from_bytes(multipart_msg_bytes) + + # Verify the message payload/content is correct. + attachment_msg = msgobj.get_payload(1) + self.assertEqual(attachment_msg.get_content(), attachment_bytes) + self.assertEqual(attachment_msg.get_payload(decode=False), attachment_base64) + + # overhead_ratio at time of writing: 1.0088957315722829 - 85.0565% decrease + self.assertLess(overhead_ratio, 1.05) + + def test_message_from_bytes_large_attachment_one_line(self): + # Generate a 10 MiB attachment + attachment_base64, attachment_bytes = self._make_junk_base64(self._LARGE_EMAIL_BYTE_SIZE, True) + multipart_msg_bytes = self._multipart_msg_base.format(attachment_base64).encode() + + # Parse it, collecting stats + msgobj, peak_overhead, overhead_ratio, time_taken = self._measure_message_from_bytes(multipart_msg_bytes) + + # Verify the message payload/content is correct. 
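# Standalone restatement of the measurement used in this test class
# (illustrative; msg_bytes stands for any large input such as the 10 MiB
# samples built above):
import email
import email.policy
import tracemalloc

def parse_overhead_ratio(msg_bytes):
    tracemalloc.start()
    msg = email.message_from_bytes(msg_bytes, policy=email.policy.default)
    current, peak = tracemalloc.get_traced_memory()
    tracemalloc.stop()
    # peak minus retained size == bytes allocated during parsing that were
    # ultimately discarded, expressed relative to the input size
    return msg, (peak - current) / len(msg_bytes)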
+ attachment_msg = msgobj.get_payload(1) + self.assertEqual(attachment_msg.get_content(), attachment_bytes) + self.assertEqual(attachment_msg.get_payload(decode=False), attachment_base64) + + # overhead_ratio at time of writing: 1.0077472351610626 - 89.2775% decrease + self.assertLess(overhead_ratio, 1.05) + class TestBytesGeneratorIdempotentNL(BaseTestBytesGeneratorIdempotent, TestIdempotent): diff --git a/Misc/ACKS b/Misc/ACKS index 25542d01de695c..228459f0350b56 100644 --- a/Misc/ACKS +++ b/Misc/ACKS @@ -875,6 +875,7 @@ Jeffrey C. Jacobs Kevin Jacobs Kjetil Jacobsen Shantanu Jain +Jessica A. James Bertrand Janin Geert Jansen Jack Jansen From c2eb551cccda1661b901d3288b3ec83375b3ddd3 Mon Sep 17 00:00:00 2001 From: Jessica James Date: Fri, 18 Apr 2025 18:11:56 -0400 Subject: [PATCH 2/9] Add NEWS entry per bot --- .../Library/2025-04-18-18-11-15.gh-issue-115512.oE6Jkw.rst | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 Misc/NEWS.d/next/Library/2025-04-18-18-11-15.gh-issue-115512.oE6Jkw.rst diff --git a/Misc/NEWS.d/next/Library/2025-04-18-18-11-15.gh-issue-115512.oE6Jkw.rst b/Misc/NEWS.d/next/Library/2025-04-18-18-11-15.gh-issue-115512.oE6Jkw.rst new file mode 100644 index 00000000000000..ced37ad143576e --- /dev/null +++ b/Misc/NEWS.d/next/Library/2025-04-18-18-11-15.gh-issue-115512.oE6Jkw.rst @@ -0,0 +1,6 @@ +Substantially improved memory usage and performance when parsing email text +in :mod:`email`. Primarily reduces memory usage in +:func:`email.message_from_bytes`, :func:`email.message_from_string`, +:class:`email.parser.Parser`, :class:`email.parser.ByteParser`, +:class:`email.feedparser.FeedParser`, +:class:`email.feedparser.BytesFeedParser` From d6233e90adce57e04800523b3301734b91a4c65a Mon Sep 17 00:00:00 2001 From: Jessica James Date: Fri, 18 Apr 2025 18:21:13 -0400 Subject: [PATCH 3/9] Correct class references --- .../Library/2025-04-18-18-11-15.gh-issue-115512.oE6Jkw.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/Misc/NEWS.d/next/Library/2025-04-18-18-11-15.gh-issue-115512.oE6Jkw.rst b/Misc/NEWS.d/next/Library/2025-04-18-18-11-15.gh-issue-115512.oE6Jkw.rst index ced37ad143576e..80fd2f47385cc3 100644 --- a/Misc/NEWS.d/next/Library/2025-04-18-18-11-15.gh-issue-115512.oE6Jkw.rst +++ b/Misc/NEWS.d/next/Library/2025-04-18-18-11-15.gh-issue-115512.oE6Jkw.rst @@ -1,6 +1,5 @@ Substantially improved memory usage and performance when parsing email text in :mod:`email`. Primarily reduces memory usage in :func:`email.message_from_bytes`, :func:`email.message_from_string`, -:class:`email.parser.Parser`, :class:`email.parser.ByteParser`, -:class:`email.feedparser.FeedParser`, -:class:`email.feedparser.BytesFeedParser` +:class:`email.parser.Parser`, :class:`email.parser.BytesParser`, +:class:`email.parser.FeedParser`, :class:`email.parser.BytesFeedParser` From 530f6d442dae842a08f8a4c5a06428db13533d8b Mon Sep 17 00:00:00 2001 From: Jessica James Date: Fri, 18 Apr 2025 19:31:48 -0400 Subject: [PATCH 4/9] Comment: remove annotations --- Lib/email/feedparser.py | 58 ++++++++++++++++++++--------------------- 1 file changed, 29 insertions(+), 29 deletions(-) diff --git a/Lib/email/feedparser.py b/Lib/email/feedparser.py index 26c226f589c4a0..ef4ebac82746e1 100644 --- a/Lib/email/feedparser.py +++ b/Lib/email/feedparser.py @@ -53,30 +53,30 @@ class BufferedSubFile(object): simple abstraction -- it parses until EOF closes the current message. 
""" def __init__(self): - self._partial: list[str] = [] - self._dangling_partial: bool = False + self._partial = [] + self._dangling_partial = False # A deque of full, pushed lines - self._lines: deque[str] = deque() + self._lines = deque() # The stack of false-EOF checking predicates. self._eofstack = [] # A flag indicating whether the file has been closed or not. - self._closed: bool = False - self._dump_destination: deque[str]|None = None - self._dump_result: str|None = None + self._closed = False + self._dump_destination = None + self._dump_result = None - def push_eof_matcher(self, pred) -> None: + def push_eof_matcher(self, pred): self._eofstack.append(pred) def pop_eof_matcher(self): return self._eofstack.pop() - def close(self) -> None: + def close(self): # Don't forget any trailing partial line. if self._partial: self._flush_partial() self._closed = True - def readline(self) -> str|object: + def readline(self): if not self._lines: if self._closed: return '' @@ -94,7 +94,7 @@ def readline(self) -> str|object: return line - def _check_eofstack(self, data, start=0, end=sys.maxsize) -> bool: + def _check_eofstack(self, data, start=0, end=sys.maxsize): for ateof in reversed(self._eofstack): if ateof(data, start, end): # We're at the false EOF. @@ -102,12 +102,12 @@ def _check_eofstack(self, data, start=0, end=sys.maxsize) -> bool: return False - def unreadline(self, line) -> None: + def unreadline(self, line): # Let the consumer push a line back into the buffer. assert line is not NeedMoreData self._lines.appendleft(line) - def _flush_partial(self) -> None: + def _flush_partial(self): line = EMPTYSTRING.join(self._partial) if not line: pass @@ -125,7 +125,7 @@ def _flush_partial(self) -> None: self._partial.clear() self._dangling_partial = False - def push(self, data) -> None: + def push(self, data): """Push some new data into this object.""" if not data: return @@ -137,7 +137,7 @@ def push(self, data) -> None: self._push_data(data) - def _can_dump_data(self, data) -> bool: + def _can_dump_data(self, data): if self._dump_destination is None: # We're not dumping data return False @@ -170,7 +170,7 @@ def _can_dump_data(self, data) -> bool: # We're still dumping, but there's a potential boundary marker or EOF or similar issue. Force a proper parse. return False - def _can_dump_partial(self, line, start: int=0, end: int=sys.maxsize) -> bool: + def _can_dump_partial(self, line, start=0, end=sys.maxsize): # Very similar to _can_dump_data above, except we can make some additional assumptions for partials/lines. 
# This should only ever be checked when we have a new partial line, in which case we have no partial, # or when checking the partial itself, in which case it'll always be the first part @@ -207,7 +207,7 @@ def _is_dump_midline(self): assert self._dump_destination[-1] # Never push empty strings to _dump_destination return self._dump_destination[-1][-1] not in ('\n', '\r') - def _push_data(self, data) -> None: + def _push_data(self, data): # Find first newline character in the data unl_start_index = BufferedSubFile._find_unl(data) if unl_start_index < 0: @@ -268,7 +268,7 @@ def _push_data(self, data) -> None: # unl_start_index is an index which points to the start of the next newline character, if there is one self._push_data_no_partial(data, data_start_index, unl_start_index) - def _push_data_no_partial(self, data, data_start_index: int, unl_start_index: int) -> None: + def _push_data_no_partial(self, data, data_start_index, unl_start_index): # _partial is now guaranteed to point to be empty # data_start_index is an index which points to the start of the next line # unl_start_index is an index which points to the start of the next newline character, if there is one @@ -349,14 +349,14 @@ def _push_data_no_partial(self, data, data_start_index: int, unl_start_index: in if data_start_index < len(data): self._partial.append(data[data_start_index:]) - def pushlines(self, lines) -> None: + def pushlines(self, lines): # This method is not documented on docs.python.org self._lines.extend(lines) def __iter__(self): return self - def __next__(self) -> str|object: + def __next__(self): line = self.readline() if line == '': raise StopIteration @@ -394,13 +394,13 @@ def _get_dump(self, start_value:str|None = None): self._dump_destination = None self._dump_result = EMPTYSTRING.join(_dump_destination) - def _pop_dump(self) -> str: + def _pop_dump(self): result = self._dump_result self._dump_result = None return result @staticmethod - def _find_unl(data, start=0) -> int: + def _find_unl(data, start=0): # Like str.find(), but for universal newlines # Originally, this iterated over the string, however just calling find() twice is drastically faster # This could be sped up by replacing with a similar function in C, so we don't pass over the string twice. @@ -412,7 +412,7 @@ def _find_unl(data, start=0) -> int: return nl_index if nl_index >= 0 else cr_index @staticmethod - def _find_unl_end(data, start) -> int: + def _find_unl_end(data, start): # A helper function which returns the 1-past-the-end index of a universal newline # This could be sped up by replacing with a similar function in C. 
#assert data[start] in '\r\n' @@ -467,15 +467,15 @@ def __init__(self, _factory=None, *, policy=compat32): self._headersonly = False # Non-public interface for supporting Parser's headersonly flag - def _set_headersonly(self) -> None: + def _set_headersonly(self): self._headersonly = True - def feed(self, data) -> None: + def feed(self, data): """Push more data into the parser.""" self._input.push(data) self._call_parse() - def _call_parse(self) -> None: + def _call_parse(self): try: self._parse() # Return value is always NeedMoreData or None, but discarded here in either case except StopIteration: @@ -494,7 +494,7 @@ def close(self): self.policy.handle_defect(root, defect) return root - def _new_message(self) -> None: + def _new_message(self): if self._old_style_factory: msg = self._factory() else: @@ -616,7 +616,7 @@ def _parsegen(self): # yields: NeedMoreData # this onto the input stream until we've scanned past the # preamble. separator = '--' + boundary - def boundarymatch(line, pos: int = 0, endpos: int = sys.maxsize): + def boundarymatch(line, pos = 0, endpos = sys.maxsize): if not line.startswith(separator, pos, endpos): return None return boundaryendRE.match(line, pos + len(separator), endpos) @@ -749,7 +749,7 @@ def boundarymatch(line, pos: int = 0, endpos: int = sys.maxsize): yield from self._input._get_dump() self._cur.set_payload(self._input._pop_dump()) - def _parse_headers(self, lines) -> None: + def _parse_headers(self, lines): # Passed a list of lines that make up the headers for the current msg lastheader = '' lastvalue = [] @@ -813,5 +813,5 @@ def _parse_headers(self, lines) -> None: class BytesFeedParser(FeedParser): """Like FeedParser, but feed accepts bytes.""" - def feed(self, data) -> None: + def feed(self, data): super().feed(data.decode('ascii', 'surrogateescape')) From 13ebb398c80c93372fbea31c85f5347d1e5a9dba Mon Sep 17 00:00:00 2001 From: Jessica James Date: Sat, 19 Apr 2025 13:21:26 -0400 Subject: [PATCH 5/9] Comment: 80-rule, remove some comments --- Lib/email/feedparser.py | 158 +++++++++++++++++++--------------------- Lib/email/parser.py | 5 +- 2 files changed, 77 insertions(+), 86 deletions(-) diff --git a/Lib/email/feedparser.py b/Lib/email/feedparser.py index ef4ebac82746e1..9926770d53e047 100644 --- a/Lib/email/feedparser.py +++ b/Lib/email/feedparser.py @@ -112,10 +112,10 @@ def _flush_partial(self): if not line: pass elif self._dump_destination is None: - # We're not dumping data. Just flush the partial to lines, as normal + # We're not dumping data. Just flush the partial to lines self._lines.append(line) elif self._check_eofstack(line): - # We were dumping, but we've now reached the end of the dump. Push our line and stop dumping. + # We were dumping, but we've now reached the end of the dump. self._dump_destination = None self._lines.append(line) else: @@ -130,7 +130,6 @@ def push(self, data): if not data: return - # If we're dumping, and we don't have anything that will ever tell us to terminate, simply dump everything if self._can_dump_data(data): self._dump_destination.append(data) return @@ -139,12 +138,11 @@ def push(self, data): def _can_dump_data(self, data): if self._dump_destination is None: - # We're not dumping data return False # We're dumping; check for easy optimizations if not self._eofstack: - # There's nothing that will ever tell us to stop dumping. Go ahead and dump the entire `data` object. + # There's nothing that will ever tell us to stop dumping. # This does absolute wonders for large non-multipart emails. 
assert not self._lines assert not self._dangling_partial @@ -155,56 +153,44 @@ def _can_dump_data(self, data): if self._partial: return False - all_boundary_matches = True for pred in self._eofstack: if not hasattr(pred, 'is_boundary_match'): - all_boundary_matches = False - break - - if all_boundary_matches and '-' not in data: - # We eventually need to stop, but we only care about boundary matches, and there's no boundaries - # here. Dump the entire `data` object. This does wonders for multipart emails with large parts. - assert not self._lines - return True - - # We're still dumping, but there's a potential boundary marker or EOF or similar issue. Force a proper parse. - return False + # We can't blindly dump entire chunks, if we're interested in + # more than just boundaries + return False + + # We only care about boundaries; we can dump as long as there's no + # potential boundaries. + return '-' not in data def _can_dump_partial(self, line, start=0, end=sys.maxsize): - # Very similar to _can_dump_data above, except we can make some additional assumptions for partials/lines. - # This should only ever be checked when we have a new partial line, in which case we have no partial, - # or when checking the partial itself, in which case it'll always be the first part + # Very similar to _can_dump_data above, except we can make some + # additional assumptions for partials/lines. assert not self._partial or line is self._partial[0] if self._dump_destination is None: - # We're not dumping data return False - # We're dumping. There should be absolutely no other pending lines, because those should've been dumped. + # We're dumping. There should be absolutely no other pending lines, + # because those should've been dumped. assert not self._lines if not self._eofstack: - # There's nothing that will ever tell us to stop dumping. Dump away. + # There's nothing that will ever tell us to stop dumping. Dump away return True all_boundary_matches = True for pred in self._eofstack: if not hasattr(pred, 'is_boundary_match'): - all_boundary_matches = False - break - - if all_boundary_matches and not line.startswith("-", start, end): - # We eventually need to stop, but we only care about boundary matches, and there's no boundaries - # here. Dump the entire `data` object. This does wonders for multipart emails with large parts. - return True + return False - # We're still dumping, but there's a potential boundary marker or EOF or similar issue. Force a proper parse. - return False + # We only care about boundaries; we can dump as long as there's no + # potential boundaries. + return not line.startswith("-", start, end) def _is_dump_midline(self): if not self._dump_destination: return False - assert self._dump_destination[-1] # Never push empty strings to _dump_destination return self._dump_destination[-1][-1] not in ('\n', '\r') def _push_data(self, data): @@ -214,8 +200,9 @@ def _push_data(self, data): # No new complete lines, wait for more. # Check to see if we had a previous dangling partial newline if self._dangling_partial: - # We previously pushed a dangling line expecting a \n to follow, however we received other data instead. - # Therefore, that \r does actually terminate a line. Go ahead and push it. + # We previously pushed a dangling line expecting \n to follow, + # however we received other data instead. Therefore, that \r + # does actually terminate a line. Go ahead and push it. 
self._flush_partial() # No lines in data to push; wait for more data @@ -230,24 +217,23 @@ def _push_data(self, data): # Complete our previous/partial line if self._partial: - # Check to see if we had any dangling newlines in our partial, and handle if appropriate if self._dangling_partial: - # We had a previously dangling line; this is either a \n (completion), or some other char (termination) if data[0] != NL: - # "\r" -- push what we had, as it has been terminated; data_start_index = 0 + # "\r" -- push what we had, it's been terminated self._flush_partial() else: - # "\r\n" -- append \n and push it; data_start_index = 1 + # "\r\n" -- append \n to complete it and push self._partial.append(NL) self._flush_partial() data_start_index = 1 # Find the next newline - unl_start_index = BufferedSubFile._find_unl(data, data_start_index) - # Fall through + unl_start_index = BufferedSubFile._find_unl( + data, data_start_index) else: - # Our partial has no dangling newline; complete our partial with the new line and push it - unl_end_index = BufferedSubFile._find_unl_end(data, unl_start_index) + # Complete our partial with the new line and push it + unl_end_index = BufferedSubFile._find_unl_end( + data, unl_start_index) if unl_end_index < 0: # The newline is incomplete; append data and return self._partial.append(data) @@ -260,36 +246,37 @@ def _push_data(self, data): data_start_index = unl_end_index # Find the next newline - unl_start_index = BufferedSubFile._find_unl(data, data_start_index) - # Fall through + unl_start_index = BufferedSubFile._find_unl( + data, data_start_index) # _partial is now guaranteed to point to be empty - # data_start_index is an index which points to the start of the next line - # unl_start_index is an index which points to the start of the next newline character, if there is one + # data_start_index is an index which points to the start of next line + # unl_start_index is the start of the next newline character, or -1 self._push_data_no_partial(data, data_start_index, unl_start_index) def _push_data_no_partial(self, data, data_start_index, unl_start_index): - # _partial is now guaranteed to point to be empty - # data_start_index is an index which points to the start of the next line - # unl_start_index is an index which points to the start of the next newline character, if there is one - # Process any remaining whole lines in data if unl_start_index < 0: # Push right to the partial if there's no lines if data_start_index < len(data): assert data_start_index >= 0 partial_line = data[data_start_index:] - if self._is_dump_midline() or self._can_dump_partial(partial_line): + if self._is_dump_midline() \ + or self._can_dump_partial(partial_line): self._dump_destination.append(partial_line) else: self._partial = [partial_line] if data[-1] == '\r': self._dangling_partial = True - elif self._dump_destination is None and unl_start_index < len(data) // 2: - # If it looks like we're going to be doing a lot of splits/joins, just go ahead and use StringIO, for speed - # If we had some sort of "StringViewIO" to avoid the copy, this would be significantly more efficient - # This code block, and the "else" code block below, functionally do the exact same thing, except this path - # makes no attempt to handle dumping data + elif self._dump_destination is None \ + and unl_start_index < len(data) // 2: + # If it looks like we're going to be doing a lot of splits/joins, + # just go ahead and use StringIO, for speed + # If we had some sort of "StringViewIO" to avoid the copy, this + # would 
be significantly more efficient + # This code block, and the "else" code block below, functionally do + # the exact same thing, except this path makes no attempt to handle + # dumping data sio = StringIO(data, '') sio.seek(data_start_index) lines = sio.readlines() @@ -301,26 +288,28 @@ def _push_data_no_partial(self, data, data_start_index, unl_start_index): self.pushlines(lines) else: - # If we're not, let's keep it in Python - dump_data_start = None if self._dump_destination is None else data_start_index + dump_data_start = None if self._dump_destination is None \ + else data_start_index while unl_start_index >= 0: - unl_end_index = BufferedSubFile._find_unl_end(data, unl_start_index) + unl_end_index = BufferedSubFile._find_unl_end( + data, unl_start_index) if unl_end_index < 0: - # Incomplete line ending; break to update our partial and return + # Incomplete line ending; break to just update our partial self._dangling_partial = True break # We have an easy line; push it if self._dump_destination is not None: - # We have a window into a line. Make sure it's not EOF, and continue as long as it's not - if self._check_eofstack(data, data_start_index, unl_end_index): - # This line is "EOF". This is the end of our dump data! Push the dump data. - self._dump_destination.append(data[dump_data_start:data_start_index]) + # We have a window into a line. Make sure it's not EOF + if self._check_eofstack( + data, data_start_index, unl_end_index): + # This line is "EOF". This is the end of our dump data + self._dump_destination.append( + data[dump_data_start:data_start_index]) # Also push our line, since we already have it - self._lines.append(data[data_start_index:unl_end_index]) - - # Mark dump complete + self._lines.append( + data[data_start_index:unl_end_index]) self._dump_destination = None #else: # This line didn't mark the end. Keep going. 
else: @@ -329,21 +318,23 @@ def _push_data_no_partial(self, data, data_start_index, unl_start_index): # Update our iterators data_start_index = unl_end_index - unl_start_index = BufferedSubFile._find_unl(data, data_start_index) + unl_start_index = BufferedSubFile._find_unl( + data, data_start_index) - # If we're still dumping, push everything that isn't going into the partial to the dump if self._dump_destination is not None: - # If we're able to safely flush the partial, go ahead and do that too - # We don't care about self._is_dump_midline() here, because data_start_index always represents the - # start of a new line, always + # Push everything that isn't going into the partial to the dump + # If we're able to safely flush the partial, do that too + # We don't care about self._is_dump_midline() here, because + # data_start_index always represents the start of a new line if self._can_dump_partial(data, data_start_index): self._dump_destination.append(data[dump_data_start:]) - # We've consumed the partial; flush any partial-related state we may have set + # Flush any partial-related state we may have set self._dangling_partial = False - return # skip the _partial.append below, because it's already been consumed + return # skip the _partial.append below else: - self._dump_destination.append(data[dump_data_start:data_start_index]) + self._dump_destination.append( + data[dump_data_start:data_start_index]) # If we have any partial data leftover, go ahead and set it if data_start_index < len(data): @@ -381,7 +372,6 @@ def _get_dump(self, start_value:str|None = None): if needs_more_data: # Flush our partial, if we can if self._partial and self._can_dump_partial(self._partial[0]): - assert self._partial[0] # We shouldn't ever push empty strings to _partial _dump_destination.extend(self._partial) self._partial.clear() self._dangling_partial = False @@ -402,8 +392,9 @@ def _pop_dump(self): @staticmethod def _find_unl(data, start=0): # Like str.find(), but for universal newlines - # Originally, this iterated over the string, however just calling find() twice is drastically faster - # This could be sped up by replacing with a similar function in C, so we don't pass over the string twice. + # Originally, this iterated over the string, however this is faster + # This could be sped up by replacing with a similar function in C, + # so we don't pass over the string twice. cr_index = data.find('\r', start) if cr_index < 0: return data.find(NL, start) @@ -413,9 +404,8 @@ def _find_unl(data, start=0): @staticmethod def _find_unl_end(data, start): - # A helper function which returns the 1-past-the-end index of a universal newline + # Returns the 1-past-the-end index of a universal newline # This could be sped up by replacing with a similar function in C. 
- #assert data[start] in '\r\n' # \n is always end of line if data.startswith(NL, start): @@ -425,7 +415,7 @@ def _find_unl_end(data, start): if data.startswith(NL, start + 1): return start + 2 - # End of string; we can't know if a \n follows, so no universal line end + # End of data; we can't know if a \n follows, so no universal line end if start + 1 >= len(data): return -1 @@ -461,7 +451,7 @@ def __init__(self, _factory=None, *, policy=compat32): self._old_style_factory = True self._input = BufferedSubFile() self._msgstack = [] - self._parse = self._parsegen().__next__ # Interesting trick which replaces yield values with return values + self._parse = self._parsegen().__next__ self._cur = None self._last = None self._headersonly = False @@ -477,7 +467,7 @@ def feed(self, data): def _call_parse(self): try: - self._parse() # Return value is always NeedMoreData or None, but discarded here in either case + self._parse() except StopIteration: pass diff --git a/Lib/email/parser.py b/Lib/email/parser.py index a182daebec0092..2914fac09894be 100644 --- a/Lib/email/parser.py +++ b/Lib/email/parser.py @@ -43,7 +43,7 @@ def __init__(self, _class=None, *, policy=compat32): def _parse_chunks(self, chunk_generator, headersonly=False): """Internal method / implementation detail - Parses chunks from a chunk generator into a FeedParser, returning the result + Parses chunks from a chunk generator into a FeedParser """ feedparser = FeedParser(self._class, policy=self.policy) if headersonly: @@ -135,7 +135,8 @@ def parsebytes(self, text, headersonly=False): the file. """ _chunk_generator = ( - text[offset:offset + _FEED_CHUNK_SIZE].decode('ASCII', errors='surrogateescape') + text[offset:offset + _FEED_CHUNK_SIZE].decode( + 'ASCII', errors='surrogateescape') for offset in range(0, len(text), _FEED_CHUNK_SIZE) ) From 2f6002eb253197556f5ddb17577210960207049a Mon Sep 17 00:00:00 2001 From: Jessica James Date: Sat, 19 Apr 2025 13:32:01 -0400 Subject: [PATCH 6/9] Comment: Remove TestPeakMemoryUsage --- Lib/test/test_email/test_email.py | 142 +----------------------------- 1 file changed, 2 insertions(+), 140 deletions(-) diff --git a/Lib/test/test_email/test_email.py b/Lib/test/test_email/test_email.py index b02931e97b7f27..abfa3f7ddfaa73 100644 --- a/Lib/test/test_email/test_email.py +++ b/Lib/test/test_email/test_email.py @@ -4606,7 +4606,8 @@ def _msgobj_trickle(filename, trickle_size=2, force_linetype="\r\n"): # Trickle data into the feed parser, one character at a time with openfile(filename, encoding="utf-8") as fp: file_str = fp.read() - file_str = file_str.replace("\r\n", "\n").replace("\r", "\n").replace("\n", force_linetype) + file_str = file_str.replace("\r\n", "\n").replace("\r", "\n") \ + .replace("\n", force_linetype) feedparser = FeedParser() for index in range(0, len(file_str), trickle_size): @@ -4617,22 +4618,15 @@ def _validate_msg10_msgobj(self, msg, line_end): if isinstance(line_end, str): line_end = line_end.encode() eq = self.assertEqual - # The outer message is a multipart eq(msg.get_payload(decode=True), None) - # Subpart 1 is 7bit encoded eq(msg.get_payload(0).get_payload(decode=True), b'This is a 7bit encoded message.' + line_end) - # Subpart 2 is quopri eq(msg.get_payload(1).get_payload(decode=True), b'\xa1This is a Quoted Printable encoded message!' + line_end) - # Subpart 3 is base64 eq(msg.get_payload(2).get_payload(decode=True), b'This is a Base64 encoded message.') - # Subpart 4 is base64 with a trailing newline, which - # used to be stripped (issue 7143). 
eq(msg.get_payload(3).get_payload(decode=True), b'This is a Base64 encoded message.\n') - # Subpart 5 has no Content-Transfer-Encoding: header. eq(msg.get_payload(4).get_payload(decode=True), b'This has no Content-Transfer-Encoding: header.' + line_end) @@ -4649,8 +4643,6 @@ def test_trickle_1chr_lf(self): self._validate_msg10_msgobj(msg, '\n') def test_trickle_2chr_crlf(self): - # During initial testing, it was realized that an edge case was missed around dangling newlines. - # This helps test that behavior, as it is not otherwise covered by tests. msg = self._msgobj_trickle('msg_10.txt', 2, '\r\n') self._validate_msg10_msgobj(msg, '\r\n') @@ -4675,136 +4667,6 @@ def test_trickle_3chr_lf(self): self._validate_msg10_msgobj(msg, '\n') -class TestPeakMemoryUsage(unittest.TestCase): - - maxDiff = None - SMALLER_CHUNK_SIZE = 1024 - - def _msg_bytes(self, filename): - with openfile(filename, 'rb') as fp: - data = fp.read() - return data - - def _make_plaintext_msg_bytes(self, min_size): - # Get msg_01 as our baseline - msg_bytes = self._msg_bytes('msg_01.txt') - if len(msg_bytes) < min_size: - # Make it bigger - msg_bytes = msg_bytes * ((min_size // len(msg_bytes)) + 1) - msg_bytes = msg_bytes[:min_size] # Truncate it to min_size - assert len(msg_bytes) >= min_size - - match = re.search(rb'(\r|\n|\r\n){2}', msg_bytes) - self.assertIsNotNone(match) - expected_payload = msg_bytes[match.end():] - - return msg_bytes, expected_payload - - def _measure_message_from_bytes(self, msg_bytes): - import tracemalloc - - # Call email.message_from_bytes, gathering some memory usage stats in the process - tracemalloc.start() - start_time = time.perf_counter() - msgobj = email.message_from_bytes(msg_bytes, policy=email.policy.default) - end_time = time.perf_counter() - after_bytes, after_peak_bytes = tracemalloc.get_traced_memory() - tracemalloc.stop() - - # "How many bytes did we allocate, that were ultimately discarded?" - peak_overhead = after_peak_bytes - after_bytes - - # "How large was that overhead, relative to the size of the message?" 
- overhead_ratio = peak_overhead / len(msg_bytes) if len(msg_bytes) > 0 else None - - return msgobj, peak_overhead, overhead_ratio, end_time - start_time - - def _base64_encode(self, bytes_to_encode, one_line=True): - base64_str = base64mime.body_encode(bytes_to_encode) - if one_line: - base64_str = "".join(base64_str.splitlines()) - return base64_str - - _multipart_msg_base = textwrap.dedent("""\ - Date: Wed, 14 Nov 2007 12:56:23 GMT - From: foo@bar.invalid - To: foo@bar.invalid - Subject: Content-Transfer-Encoding: base64 and multipart - MIME-Version: 1.0 - Content-Type: multipart/mixed; boundary="BOUNDARY" - - --BOUNDARY - Content-Type: text/plain - - Test message - - --BOUNDARY - Content-Type: application/octet-stream - Content-Transfer-Encoding: base64 - - {} - --BOUNDARY-- - """) - - def _make_junk_bytes(self, bytes_length): - junk_data = bytearray(bytes_length) - for i in range(len(junk_data)): - junk_data[i] = i % 256 - return bytes(junk_data) - - def _make_junk_base64(self, bytes_length, one_line=True): - junk_bytes = self._make_junk_bytes(bytes_length) - return self._base64_encode(junk_bytes, one_line), junk_bytes - - _LARGE_EMAIL_BYTE_SIZE = 1024*1024*10 # 10 MiB - - def test_message_from_bytes_plaintext(self): - # Generate a 10MiB plaintext email - msg_bytes, expected_payload = self._make_plaintext_msg_bytes(self._LARGE_EMAIL_BYTE_SIZE) - - # Parse it, collecting stats - msgobj, peak_overhead, overhead_ratio, time_taken = self._measure_message_from_bytes(msg_bytes) - - # Verify the message payload/content is correct. - self.assertEqual(msgobj.get_payload(decode=True), expected_payload) - self.assertEqual(msgobj.get_content(), expected_payload.decode()) - - # overhead_ratio at time of writing: 1.0102445602416992 - self.assertLess(overhead_ratio, 1.05) - - def test_message_from_bytes_large_attachment_body_encoded(self): - # Generate a 10 MiB attachment - attachment_base64, attachment_bytes = self._make_junk_base64(self._LARGE_EMAIL_BYTE_SIZE, False) - multipart_msg_bytes = self._multipart_msg_base.format(attachment_base64).encode() - - # Parse it, collecting stats - msgobj, peak_overhead, overhead_ratio, time_taken = self._measure_message_from_bytes(multipart_msg_bytes) - - # Verify the message payload/content is correct. - attachment_msg = msgobj.get_payload(1) - self.assertEqual(attachment_msg.get_content(), attachment_bytes) - self.assertEqual(attachment_msg.get_payload(decode=False), attachment_base64) - - # overhead_ratio at time of writing: 1.0088957315722829 - 85.0565% decrease - self.assertLess(overhead_ratio, 1.05) - - def test_message_from_bytes_large_attachment_one_line(self): - # Generate a 10 MiB attachment - attachment_base64, attachment_bytes = self._make_junk_base64(self._LARGE_EMAIL_BYTE_SIZE, True) - multipart_msg_bytes = self._multipart_msg_base.format(attachment_base64).encode() - - # Parse it, collecting stats - msgobj, peak_overhead, overhead_ratio, time_taken = self._measure_message_from_bytes(multipart_msg_bytes) - - # Verify the message payload/content is correct. 
- attachment_msg = msgobj.get_payload(1) - self.assertEqual(attachment_msg.get_content(), attachment_bytes) - self.assertEqual(attachment_msg.get_payload(decode=False), attachment_base64) - - # overhead_ratio at time of writing: 1.0077472351610626 - 89.2775% decrease - self.assertLess(overhead_ratio, 1.05) - - class TestBytesGeneratorIdempotentNL(BaseTestBytesGeneratorIdempotent, TestIdempotent): linesep = '\n' From 4fa675568447e75ff23a02bdd06c2b6cce752d8f Mon Sep 17 00:00:00 2001 From: Jessica James Date: Sat, 19 Apr 2025 13:41:09 -0400 Subject: [PATCH 7/9] Lint: Remove spaces --- Lib/email/feedparser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/email/feedparser.py b/Lib/email/feedparser.py index 9926770d53e047..1f4204a10426e8 100644 --- a/Lib/email/feedparser.py +++ b/Lib/email/feedparser.py @@ -158,7 +158,7 @@ def _can_dump_data(self, data): # We can't blindly dump entire chunks, if we're interested in # more than just boundaries return False - + # We only care about boundaries; we can dump as long as there's no # potential boundaries. return '-' not in data From 4f3622733d274169c9582aa7bf2ce4af942f7cd8 Mon Sep 17 00:00:00 2001 From: Jessica James Date: Sat, 19 Apr 2025 15:47:10 -0400 Subject: [PATCH 8/9] Missed a comment which served as an annotation --- Lib/email/feedparser.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Lib/email/feedparser.py b/Lib/email/feedparser.py index 1f4204a10426e8..af09c560edd948 100644 --- a/Lib/email/feedparser.py +++ b/Lib/email/feedparser.py @@ -505,7 +505,7 @@ def _pop_message(self): self._cur = None return retval - def _parsegen(self): # yields: NeedMoreData + def _parsegen(self): # Create a new message and start by parsing headers. self._new_message() headers = [] From d8fa697120e1a55973e1a5f9edb4131967538a27 Mon Sep 17 00:00:00 2001 From: Jessica James Date: Sun, 20 Jul 2025 17:39:35 -0400 Subject: [PATCH 9/9] Apply stylistic suggestions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Bénédikt Tran <10796600+picnixz@users.noreply.github.com> --- Lib/email/feedparser.py | 38 +++++++++++++------------------------- 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/Lib/email/feedparser.py b/Lib/email/feedparser.py index af09c560edd948..00ad7f42de5ffe 100644 --- a/Lib/email/feedparser.py +++ b/Lib/email/feedparser.py @@ -95,12 +95,11 @@ def readline(self): return line def _check_eofstack(self, data, start=0, end=sys.maxsize): - for ateof in reversed(self._eofstack): - if ateof(data, start, end): - # We're at the false EOF. - return True - - return False + # check if we can find a dummy EOF + return any( + ateof(data, start, end) + for ateof in reversed(self._eofstack) + ) def unreadline(self, line): # Let the consumer push a line back into the buffer. @@ -112,7 +111,7 @@ def _flush_partial(self): if not line: pass elif self._dump_destination is None: - # We're not dumping data. Just flush the partial to lines + # We're not dumping data. Just flush the partial to lines. self._lines.append(line) elif self._check_eofstack(line): # We were dumping, but we've now reached the end of the dump. 
@@ -128,19 +127,17 @@ def _flush_partial(self): def push(self, data): """Push some new data into this object.""" if not data: - return - - if self._can_dump_data(data): + pass + elif self._can_dump_data(data): self._dump_destination.append(data) - return - - self._push_data(data) + else: + self._push_data(data) def _can_dump_data(self, data): if self._dump_destination is None: return False - # We're dumping; check for easy optimizations + # We're dumping; check for easy optimizations. if not self._eofstack: # There's nothing that will ever tell us to stop dumping. # This does absolute wonders for large non-multipart emails.
@@ -178,7 +175,6 @@ def _can_dump_partial(self, line, start=0, end=sys.maxsize): # There's nothing that will ever tell us to stop dumping. Dump away return True - all_boundary_matches = True for pred in self._eofstack: if not hasattr(pred, 'is_boundary_match'): return False
@@ -190,7 +186,6 @@ def _can_dump_partial(self, line, start=0, end=sys.maxsize): def _is_dump_midline(self): if not self._dump_destination: return False - return self._dump_destination[-1][-1] not in ('\n', '\r') def _push_data(self, data):
@@ -214,8 +209,7 @@ def _push_data(self, data): return data_start_index = 0 - - # Complete our previous/partial line + # Complete our previous/partial line. if self._partial: if self._dangling_partial: if data[0] != NL:
@@ -227,7 +221,6 @@ def _push_data(self, data): self._flush_partial() data_start_index = 1 - # Find the next newline unl_start_index = BufferedSubFile._find_unl( data, data_start_index) else:
@@ -285,7 +278,6 @@ def _push_data_no_partial(self, data, data_start_index, unl_start_index): self._partial.append(lines.pop()) if data[-1] == '\r': self._dangling_partial = True - self.pushlines(lines) else: dump_data_start = None if self._dump_destination is None \
@@ -353,7 +345,7 @@ def __next__(self): raise StopIteration return line - def _get_dump(self, start_value:str|None = None): + def _get_dump(self, start_value=None): _dump_destination = deque() self._dump_destination = _dump_destination
@@ -398,7 +390,6 @@ def _find_unl(data, start=0): cr_index = data.find('\r', start) if cr_index < 0: return data.find(NL, start) - nl_index = data.find(NL, start, cr_index) return nl_index if nl_index >= 0 else cr_index
@@ -410,15 +401,12 @@ def _find_unl_end(data, start): # \n is always end of line if data.startswith(NL, start): return start + 1 - # \r\n is always end of line if data.startswith(NL, start + 1): return start + 2 - # End of data; we can't know if a \n follows, so no universal line end if start + 1 >= len(data): return -1 - # This is a \r followed by some other non-newline character return start + 1
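A note on the universal-newline search used above: _find_unl relies on two bounded str.find() calls rather than scanning character by character. The snippet below is a standalone restatement of that idea for illustration only -- find_universal_newline is a hypothetical name, not part of the patch -- showing how a lone '\r', a lone '\n', or a '\r\n' pair are all reported at the same index.

    def find_universal_newline(data: str, start: int = 0) -> int:
        # Index of the first '\r' or '\n' at or after `start`, or -1 if neither occurs.
        cr_index = data.find('\r', start)
        if cr_index < 0:
            # No '\r' anywhere: the answer is simply the first '\n', if any.
            return data.find('\n', start)
        # A '\n' only wins if it appears before the first '\r',
        # so the second search can be bounded by cr_index.
        nl_index = data.find('\n', start, cr_index)
        return nl_index if nl_index >= 0 else cr_index

    assert find_universal_newline('a\r\nb') == 1   # CRLF
    assert find_universal_newline('a\nb') == 1     # bare LF
    assert find_universal_newline('ab') == -1      # no newline at all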
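Similarly, the Lib/email/parser.py hunk has parsebytes() feed its input to the FeedParser in fixed-size decoded chunks instead of one giant string. A rough usage sketch of that pattern follows; the function name and CHUNK_SIZE value are illustrative stand-ins, with CHUNK_SIZE playing the role of the module-private _FEED_CHUNK_SIZE.

    from email import policy
    from email.feedparser import FeedParser

    CHUNK_SIZE = 8192  # illustrative; the patch defines its own _FEED_CHUNK_SIZE

    def parse_bytes_chunked(data: bytes):
        # Feed fixed-size slices so the decoded text never exists as one huge copy.
        parser = FeedParser(policy=policy.default)
        for offset in range(0, len(data), CHUNK_SIZE):
            chunk = data[offset:offset + CHUNK_SIZE]
            # surrogateescape keeps non-ASCII bytes recoverable after decoding
            parser.feed(chunk.decode('ascii', errors='surrogateescape'))
        return parser.close()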
