diff --git a/Lib/test/test_tools/i18n_data/exclude_file.pot b/Lib/test/test_tools/i18n_data/exclude_file.pot new file mode 100644 index 00000000000000..4c94fdde65df51 --- /dev/null +++ b/Lib/test/test_tools/i18n_data/exclude_file.pot @@ -0,0 +1,38 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) YEAR ORGANIZATION +# FIRST AUTHOR , YEAR. +# +msgid "" +msgstr "" +"Project-Id-Version: PACKAGE VERSION\n" +"POT-Creation-Date: 2000-01-01 00:00+0000\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: LANGUAGE \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: pygettext.py 1.5\n" + + +msgid "excluded" +msgstr "" + +msgid "multiline\nexcluded" +msgstr "" + +msgid "singular excluded" +msgid_plural "plural excluded" +msgstr[0] "" +msgstr[1] "" + +msgctxt "context" +msgid "context excluded" +msgstr "" + +msgctxt "context" +msgid "context singular excluded" +msgid_plural "context plural excluded" +msgstr[0] "" +msgstr[1] "" + diff --git a/Lib/test/test_tools/i18n_data/excluded.pot b/Lib/test/test_tools/i18n_data/excluded.pot new file mode 100644 index 00000000000000..c3037b24e5cb87 --- /dev/null +++ b/Lib/test/test_tools/i18n_data/excluded.pot @@ -0,0 +1,21 @@ +# SOME DESCRIPTIVE TITLE. +# Copyright (C) YEAR ORGANIZATION +# FIRST AUTHOR , YEAR. +# +msgid "" +msgstr "" +"Project-Id-Version: PACKAGE VERSION\n" +"POT-Creation-Date: 2000-01-01 00:00+0000\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: LANGUAGE \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" +"Generated-By: pygettext.py 1.5\n" + + +#: excluded.py:4 +msgid "foo" +msgstr "" + diff --git a/Lib/test/test_tools/i18n_data/excluded.py b/Lib/test/test_tools/i18n_data/excluded.py new file mode 100644 index 00000000000000..a9249c6fa540c2 --- /dev/null +++ b/Lib/test/test_tools/i18n_data/excluded.py @@ -0,0 +1,14 @@ +from gettext import gettext as _, ngettext, npgettext, pgettext + + +_('foo') + +_('excluded') + +_('multiline\nexcluded') + +ngettext('singular excluded', 'plural excluded', 2) + +pgettext('context', 'context excluded') + +npgettext('context', 'context singular excluded', 'context plural excluded', 2) diff --git a/Lib/test/test_tools/i18n_data/general.json b/Lib/test/test_tools/i18n_data/general.json new file mode 100644 index 00000000000000..42f8470784ede3 --- /dev/null +++ b/Lib/test/test_tools/i18n_data/general.json @@ -0,0 +1,103 @@ +[ + { + "msgctxt": null, + "msgid": "", + "msgid_plural": null, + "msgstr": "Project-Id-Version: PACKAGE VERSION\nPOT-Creation-Date: 2024-10-26 18:06+0200\nPO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\nLast-Translator: FULL NAME \nLanguage-Team: LANGUAGE \nMIME-Version: 1.0\nContent-Type: text/plain; charset=UTF-8\nContent-Transfer-Encoding: 8bit\n" + }, + { + "msgctxt": null, + "msgid": "foo", + "msgid_plural": null, + "msgstr": "bar" + }, + { + "msgctxt": null, + "msgid": "Escape sequences \" \n \t", + "msgid_plural": null, + "msgstr": "" + }, + { + "msgctxt": null, + "msgid": "Python", + "msgid_plural": null, + "msgstr": "Python" + }, + { + "msgctxt": null, + "msgid": "Python (2)", + "msgid_plural": null, + "msgstr": "Python (2)" + }, + { + "msgctxt": null, + "msgid": "αβ", + "msgid_plural": null, + "msgstr": "αβ" + }, + { + "msgctxt": null, + "msgid": "barbaz", + "msgid_plural": null, + "msgstr": "quxxyz" + }, + { + "msgctxt": null, + "msgid": "xyz", + "msgid_plural": null, + "msgstr": "" + }, + { + "msgctxt": "context", + "msgid": "foo", + "msgid_plural": null, + "msgstr": "bar" + }, + { + "msgctxt": "xyz", + "msgid": "foo", + "msgid_plural": null, + "msgstr": "bar" + }, + { + "msgctxt": null, + "msgid": "One email sent.", + "msgid_plural": "%d emails sent.", + "msgstr": [ + "One email sent.", + "%d emails sent." + ] + }, + { + "msgctxt": null, + "msgid": "One message sent.", + "msgid_plural": "%d messages sent.", + "msgstr": [ + "%d message sent." + ] + }, + { + "msgctxt": "abc", + "msgid": "One email sent.", + "msgid_plural": "%d emails sent.", + "msgstr": [ + "One email sent.", + "%d emails sent." + ] + }, + { + "msgctxt": null, + "msgid": "qux", + "msgid_plural": "quxs", + "msgstr": [ + "abc", + "xyz" + ] + }, + { + "msgctxt": null, + "msgid": "baz", + "msgid_plural": null, + "msgstr": "" + } +] \ No newline at end of file diff --git a/Lib/test/test_tools/i18n_data/general.po b/Lib/test/test_tools/i18n_data/general.po new file mode 100644 index 00000000000000..e86725aa93acbd --- /dev/null +++ b/Lib/test/test_tools/i18n_data/general.po @@ -0,0 +1,85 @@ +msgid "" +msgstr "" +"Project-Id-Version: PACKAGE VERSION\n" +"POT-Creation-Date: 2024-10-26 18:06+0200\n" +"PO-Revision-Date: YEAR-MO-DA HO:MI+ZONE\n" +"Last-Translator: FULL NAME \n" +"Language-Team: LANGUAGE \n" +"MIME-Version: 1.0\n" +"Content-Type: text/plain; charset=UTF-8\n" +"Content-Transfer-Encoding: 8bit\n" + +msgid "foo" +msgstr "bar" + +# Selected escape sequences are allowed +msgid "Escape sequences \" \n \t" +msgstr "" + +# Octal escape sequences are allowed +msgid "\120\171\164\150\157\156" +msgstr "Python" + +# Hex escape sequences are allowed +msgid "\x50\x79\x74\x68\x6f\x6e (2)" +msgstr "Python (2)" + +# non-ascii +msgid "αβ" +msgstr "αβ" + +# Empty lines are ignored +msgid "" +"bar" + +"baz" +msgstr "" +"qux" + +"xyz" + +# Keyword line does not need to contain a string +msgid +"xyz" +msgstr "" + +# comment +# comment + +msgctxt "context" +msgid "foo" +msgstr "bar" + +msgctxt "xyz" +msgid "foo" +msgstr "bar" + +msgid "One email sent." +msgid_plural "%d emails sent." +msgstr[0] "One email sent." +msgstr[1] "%d emails sent." + +# Each message can have a different number of msgstr[N] +msgid "One message sent." +msgid_plural "%d messages sent." +msgstr[0] "%d message sent." + +msgctxt "abc" +msgid "One email sent." +msgid_plural "%d emails sent." +msgstr[0] "One email sent." +msgstr[1] "%d emails sent." + +msgid "qux" +msgid_plural "quxs" +msgstr[0] "a" +"b" +"c" +msgstr[1] "x" +"y" +"z" + +msgid "baz" +msgstr "" + +# trailing comments are allowed diff --git a/Lib/test/test_tools/test_i18n.py b/Lib/test/test_tools/test_i18n.py index 66c33077423229..650af9f733e973 100644 --- a/Lib/test/test_tools/test_i18n.py +++ b/Lib/test/test_tools/test_i18n.py @@ -1,5 +1,7 @@ """Tests to cover the Tools/i18n package""" +import codecs +import json import os import re import sys @@ -7,7 +9,7 @@ from textwrap import dedent from pathlib import Path -from test.support.script_helper import assert_python_ok +from test.support.script_helper import assert_python_failure, assert_python_ok from test.test_tools import imports_under_tool, skip_if_missing, toolsdir from test.support.os_helper import temp_cwd, temp_dir @@ -18,7 +20,7 @@ with imports_under_tool("i18n"): - from pygettext import parse_spec + from pygettext import parse_po, parse_quoted_strings, parse_spec def normalize_POT_file(pot): @@ -516,8 +518,256 @@ def test_parse_keyword_spec(self): parse_spec(spec) self.assertEqual(str(cm.exception), message) + def test_missing_exclude_file(self): + """ + Test that an error is raised if the exclude file (passed via + --exclude-file) does not exist. + """ + _, _, stderr = assert_python_failure(self.script, + '--exclude-file=foo.po') + self.assertIn("Can't read --exclude-file: foo.po", + stderr.decode('utf-8')) + + def test_invalid_exclude_file(self): + """ + Test that an error is raised if the exclude file (passed via + --exclude-file) is not a valid PO file. + """ + with temp_cwd(None): + # Create an invalid PO file + Path('invalid.po').write_text('Invalid PO file', encoding='utf-8') + + _, _, stderr = assert_python_failure(self.script, + '--exclude-file=invalid.po') + self.assertIn("Invalid exclude file (invalid.po):", + stderr.decode('utf-8')) + + +class TestPOParser(unittest.TestCase): + def test_parse_quoted_strings(self): + class DummyState: + filename = 'foo.po' + lineno = 1 + + valid_strings = ( + # no strings + ('', ''), + (' ', ''), + ('\t', ''), + # empty strings + ('""', ''), + ('"" "" ""', ''), + # allowed escape sequences + (r'"\\"', '\\'), + (r'"\""', '"'), + (r'"\t"', '\t'), + (r'"\n"', '\n'), + (r'"\r"', '\r'), + (r'"\f"', '\f'), + (r'"\a"', '\a'), + (r'"\b"', '\b'), + (r'"\v"', '\v'), + # non-empty strings + ('"foo"', 'foo'), + ('"foo" "bar"', 'foobar'), + ('"foo""bar"', 'foobar'), + ('"" "foo" ""', 'foo'), + # newlines and tabs + (r'"foo\nbar"', 'foo\nbar'), + (r'"foo\n" "bar"', 'foo\nbar'), + (r'"foo\tbar"', 'foo\tbar'), + (r'"foo\t" "bar"', 'foo\tbar'), + # escaped quotes + (r'"foo\"bar"', 'foo"bar'), + (r'"foo\"" "bar"', 'foo"bar'), + (r'"foo\\" "bar"', 'foo\\bar'), + # octal escapes + (r'"\120\171\164\150\157\156"', 'Python'), + (r'"\120\171\164" "\150\157\156"', 'Python'), + (r'"\"\120\171\164" "\150\157\156\""', '"Python"'), + # hex escapes + (r'"\x50\x79\x74\x68\x6f\x6e"', 'Python'), + (r'"\x50\x79\x74" "\x68\x6f\x6e"', 'Python'), + (r'"\"\x50\x79\x74" "\x68\x6f\x6e\""', '"Python"'), + ) + for string, expected in valid_strings: + with self.subTest(string=string): + parsed = parse_quoted_strings(DummyState(), string) + self.assertEqual(parsed, expected) + + invalid_strings = ( + "''", + '"', + '"""', + '"" "', + 'foo', + '"" "foo', + '"foo" foo', + '42', + '"" 42 ""', + # disallowed escape sequences + r"\'", + r'"\e"', + r'"\8"', + r'"\9"', + r'"\x"', + r'\u1234', + r'"\N{ROMAN NUMERAL NINE}"' + ) + for string in invalid_strings: + with self.subTest(string=string): + with self.assertRaises(ValueError): + parse_quoted_strings(DummyState(), string) + + def test_semantic_errors(self): + pos = ( + # parse_po + ('msgctxt "foo"', 'Missing msgid after msgctxt'), + ('msgid "foo"', 'Missing msgstr after msgid'), + # parse_comment + ('msgctxt "foo"\n# comment', + 'Comment line not allowed after msgctxt'), + ('msgid "foo"\n# comment', + 'Comment line not allowed after msgid'), + ('msgid "foo"\nmsgid_plural "foos"\n# comment', + 'Comment line not allowed after msgid_plural'), + # parse_msgctxt + ('msgctxt "foo"\nmsgctxt "bar"', + 'msgctxt not allowed after msgctxt'), + ('msgid "foo"\nmsgctxt "bar"', 'msgctxt not allowed after msgid'), + ('msgid "foo"\nmsgid_plural "foos"\nmsgctxt "bar"', + 'msgctxt not allowed after msgid_plural'), + # parse_msgid + ('msgid "foo"\nmsgid "bar"', 'msgid not allowed after msgid'), + ('msgid "foo"\nmsgid_plural "foos"\nmsgid "bar"', + 'msgid not allowed after msgid_plural'), + # parse_msgid_plural + ('msgid_plural "foos"', 'msgid_plural must be preceded by msgid'), + ('# comment\nmsgid_plural "foos"', + 'msgid_plural not allowed after comment'), + ('msgid "foo"\nmsgid_plural "foos"\nmsgid_plural "bars"', + 'msgid_plural not allowed after msgid_plural'), + ('msgctxt "foo"\nmsgid_plural "foos"', + 'msgid_plural not allowed after msgctxt'), + ('msgid "foo"\nmsgstr "bar"\nmsgid_plural "foos"', + 'msgid_plural not allowed after msgstr'), + # parse_msgstr + ('msgstr "foo"', 'msgstr must be preceded by msgid'), + ('# comment\nmsgstr "foo"', 'msgstr not allowed after comment'), + ('msgctxt "foo"\nmsgstr "bar"', + 'msgstr not allowed after msgctxt'), + ('msgid "foo"\nmsgstr "bar"\nmsgstr "baz"', + 'msgstr not allowed after msgstr'), + # parse_line + ('"foo"', 'Syntax error before:'), + ('# comment\n"foo"', 'Syntax error before:'), + ) + for po, message in pos: + with self.subTest(po=po): + with self.assertRaises(ValueError) as cm: + parse_po(po.encode('utf-8'), 'foo.po') + self.assertIn(message, str(cm.exception)) + + def test_msgstr_invalid_indices(self): + pos = ( + (''' +msgid "foo" +msgstr[0] "bar" +''', 'Missing msgid_plural section'), + (''' +msgid "foo" +msgid_plural "foos" +msgstr[0] "bar" +msgstr[42] "bars" +''', "Plural form has incorrect index, found '42' but should be '1'"), + (''' +msgid "foo" +msgid_plural "foos" +msgstr "bar" +''', "Indexed msgstr required after msgid_plural"), + ) + for po, message in pos: + with self.subTest(po=po): + with self.assertRaises(ValueError) as cm: + parse_po(po.encode('utf-8'), 'foo.po') + self.assertIn(message, str(cm.exception)) + + def test_duplicate_entries(self): + po = b''' +msgid "foo" +msgstr "bar" + +msgid "foo" +msgstr "baz" +''' + with self.assertRaisesRegex(ValueError, "Duplicate entry: 'foo'"): + parse_po(po, 'foo.po') + + po = b''' +msgctxt "context" +msgid "foo" +msgstr "bar" + +msgctxt "context" +msgid "foo" +msgstr "baz" +''' + with self.assertRaises(ValueError) as cm: + parse_po(po, 'foo.po') + self.assertIn("Duplicate entry: ('context', 'foo')", str(cm.exception)) + + def test_encoding(self): + po = r''' +msgid "" +msgstr "" +"Content-Type: text/plain; charset=UTF-8\n" + +msgid "αβ" +msgstr "αβ" +''' + expected = [{ + 'msgctxt': None, + 'msgid': '', + 'msgid_plural': None, + 'msgstr': 'Content-Type: text/plain; charset=UTF-8\n', + }, { + 'msgctxt': None, + 'msgid': 'αβ', + 'msgid_plural': None, + 'msgstr': 'αβ', + }] + self.assertEqual(parse_po(po.encode('utf-8'), 'foo.po'), expected) + + def test_missing_encoding(self): + po = ''' +msgid "αβ" +msgstr "αβ" +''' + ab = "αβ".encode('utf-8').decode('latin-1') + expected = [{ + 'msgctxt': None, + 'msgid': ab, + 'msgid_plural': None, + 'msgstr': ab, + }] + self.assertEqual(parse_po(po.encode('utf-8'), 'foo.po'), expected) + + def test_invalid_BOM(self): + po = codecs.BOM_UTF8 + b'msgid "foo"\nmsgstr "bar"' + with self.assertRaises(ValueError) as cm: + parse_po(po, 'foo.po') + self.assertIn("starts with a UTF-8 BOM", str(cm.exception)) + + def test_parse(self): + filename = DATA_DIR / 'general.po' + messages = parse_po(filename.read_bytes(), filename) + expected = json.loads( + (DATA_DIR / 'general.json').read_text(encoding='utf-8')) + self.assertEqual(messages, expected) + def extract_from_snapshots(): + exclude_file = DATA_DIR / 'exclude_file.pot' snapshots = { 'messages.py': (), 'fileloc.py': ('--docstrings',), @@ -526,6 +776,8 @@ def extract_from_snapshots(): 'custom_keywords.py': ('--keyword=foo', '--keyword=nfoo:1,2', '--keyword=pfoo:1c,2', '--keyword=npfoo:1c,2,3', '--keyword=_:1,2'), + # Test excluded msgids with an exclude file + 'excluded.py': (f'--exclude-file={exclude_file}',), # == Test character escaping # Escape ascii and unicode: 'escapes.py': ('--escape', '--add-comments='), @@ -556,9 +808,16 @@ def update_POT_snapshots(): output_file.write_text(output, encoding='utf-8') +def update_PO_snapshots(): + messages = parse_po((DATA_DIR / 'general.po').read_bytes(), 'general.po') + data = json.dumps(messages, indent=4, ensure_ascii=False) + (DATA_DIR / 'general.json').write_text(data, encoding='utf-8') + + if __name__ == '__main__': # To regenerate POT files if len(sys.argv) > 1 and sys.argv[1] == '--snapshot-update': update_POT_snapshots() + update_PO_snapshots() sys.exit(0) unittest.main() diff --git a/Misc/NEWS.d/next/Tools-Demos/2025-03-17-22-04-46.gh-issue-130197.I7AIvI.rst b/Misc/NEWS.d/next/Tools-Demos/2025-03-17-22-04-46.gh-issue-130197.I7AIvI.rst new file mode 100644 index 00000000000000..83cb6b4b450a34 --- /dev/null +++ b/Misc/NEWS.d/next/Tools-Demos/2025-03-17-22-04-46.gh-issue-130197.I7AIvI.rst @@ -0,0 +1 @@ +Fix the :option:`!--exclude-file` option in :program:`pygettext`. diff --git a/Tools/i18n/pygettext.py b/Tools/i18n/pygettext.py index f0ee2ea386f18f..2bece8e796945b 100755 --- a/Tools/i18n/pygettext.py +++ b/Tools/i18n/pygettext.py @@ -125,7 +125,7 @@ Set width of output to columns. -x filename - --exclude-file=filename + --exclude-file=filename.po Specify a file that contains a list of strings that are not be extracted from the input files. Each string to be excluded must appear on a line by itself in the file. @@ -140,15 +140,19 @@ """ import ast +import codecs import getopt import glob import importlib.machinery import importlib.util import os +import re import sys import time import tokenize from dataclasses import dataclass, field +from email.parser import HeaderParser +from enum import StrEnum, auto from io import BytesIO from operator import itemgetter @@ -279,6 +283,272 @@ def getFilesForName(name): return [] +def _key_for(msgid, msgctxt=None): + if msgctxt is None: + return msgid + return (msgctxt, msgid) + + +class POSection(StrEnum): + COMMENT = 'comment' + CTXT = 'msgctxt' + ID = 'msgid' + PLURAL = 'msgid_plural' + STR = 'msgstr' + + +def parse_po(po, filename): + """Parse a PO file.""" + if po.startswith(codecs.BOM_UTF8): + raise ValueError( + f"The file {filename} starts with a UTF-8 BOM which is not " + "allowed in .po files.\nPlease save the file without a BOM " + "and try again.") + + @dataclass + class ParserState: + filename: str + lineno: int = 0 + # Start off assuming Latin-1, so everything decodes without failure, + # until we know the exact encoding + encoding: str = 'latin-1' + # Current section + section: POSection | None = None + # Current message data + msgid: str | None = None + msgid_plural: str | None = None + msgctxt: str | None = None + msgstr: str | list[str] | None = None + # All parsed messages + messages: dict = field(default_factory=dict) + + @property + def is_plural(self): + return self.msgid_plural is not None + + + state = ParserState(filename) + # Parse the PO file + for line in po.splitlines(): + state.lineno += 1 + + # Skip empty lines + if not line.strip(): + continue + + if line.startswith(b'#'): + parse_comment(state) + elif line.startswith(b'msgctxt'): + parse_msgctxt(state, line) + elif line.startswith(b'msgid_plural'): + parse_msgid_plural(state, line) + elif line.startswith(b'msgid'): + parse_msgid(state, line) + elif line.startswith(b'msgstr'): + parse_msgstr(state, line) + else: + # Line containing only a string without a keyword + # This will be appended to the previous section + parse_line(state, line) + + if state.section == POSection.CTXT: + raise ValueError(f'{filename}:{state.lineno}: ' + 'Missing msgid after msgctxt') + if state.section == POSection.ID: + raise ValueError(f'{filename}:{state.lineno}: ' + 'Missing msgstr after msgid') + elif state.section == POSection.STR: + # Add last entry + _add_message(state) + return list(state.messages.values()) + + +def parse_comment(state): + if state.section not in (None, POSection.COMMENT, POSection.STR): + raise ValueError(f'{state.filename}:{state.lineno}: ' + f'Comment line not allowed after {state.section}') + + if state.section == POSection.STR: + # Previous msgstr section is finished so we need to add the message + _add_message(state) + state.section = POSection.COMMENT + + +def parse_msgctxt(state, line): + if state.section not in (None, POSection.COMMENT, POSection.STR): + raise ValueError(f'{state.filename}:{state.lineno}: ' + f'msgctxt not allowed after {state.section}') + + if state.section == POSection.STR: + # Previous msgstr section is finished so we need to add the message + _add_message(state) + line = line.decode(state.encoding).removeprefix('msgctxt') + state.msgctxt = parse_quoted_strings(state, line) + state.section = POSection.CTXT + + +def parse_msgid_plural(state, line): + if state.section is None: + raise ValueError(f'{state.filename}:{state.lineno}: ' + 'msgid_plural must be preceded by msgid') + if state.section != POSection.ID: + raise ValueError(f'{state.filename}:{state.lineno}: ' + f'msgid_plural not allowed after {state.section}') + + line = line.decode(state.encoding).removeprefix('msgid_plural') + state.msgid_plural = parse_quoted_strings(state, line) + state.section = POSection.PLURAL + + +def parse_msgid(state, line): + if state.section not in (None, POSection.COMMENT, + POSection.STR, POSection.CTXT): + raise ValueError(f'{state.filename}:{state.lineno}: ' + f'msgid not allowed after {state.section}') + + if state.section == POSection.STR: + # Previous msgstr section is finished so we need to add the message + _add_message(state) + line = line.decode(state.encoding).removeprefix('msgid') + state.msgid = parse_quoted_strings(state, line) + state.section = POSection.ID + + +def parse_msgstr(state, line): + if state.section is None: + raise ValueError(f'{state.filename}:{state.lineno}: ' + 'msgstr must be preceded by msgid') + if state.section not in (POSection.STR, POSection.ID, POSection.PLURAL): + raise ValueError(f'{state.filename}:{state.lineno}: ' + f'msgstr not allowed after {state.section}') + + line = line.decode(state.encoding) + if match := re.match(r'^msgstr\[(\d+)\]', line): + # This is a plural msgstr, e.g. msgstr[0] + if not state.is_plural: + raise ValueError(f'{state.filename}:{state.lineno}: ' + 'Missing msgid_plural section') + index = int(match.group(1)) + line = line.removeprefix(match.group()) + if state.msgstr is None: + state.msgstr = [] + next_plural_index = len(state.msgstr) + if index != next_plural_index: + raise ValueError(f'{state.filename}:{state.lineno}: ' + 'Plural form has incorrect index, found ' + f"'{index}' but should be '{next_plural_index}'") + state.msgstr.append(parse_quoted_strings(state, line)) + else: + # This is a regular (non-plural) msgstr + if state.is_plural: + raise ValueError(f'{state.filename}:{state.lineno}: ' + 'Indexed msgstr required after msgid_plural') + if state.section == POSection.STR: + raise ValueError(f'{state.filename}:{state.lineno}: ' + 'msgstr not allowed after msgstr') + line = line.removeprefix('msgstr') + state.msgstr = parse_quoted_strings(state, line) + state.section = POSection.STR + + +def parse_line(state, line): + line = parse_quoted_strings(state, line.decode(state.encoding)) + if state.section == POSection.CTXT: + state.msgctxt += line + elif state.section == POSection.PLURAL: + state.msgid_plural += line + elif state.section == POSection.ID: + state.msgid += line + elif state.section == POSection.STR: + if isinstance(state.msgstr, list): + # This belongs to the last msgstr[N] entry + state.msgstr[-1] += line + else: + state.msgstr += line + else: + raise ValueError(f'{state.filename}:{state.lineno}: ' + f'Syntax error before:\n{line}') + + +def parse_quoted_strings(state, line): + """ + Parse a line containing one or more quoted PO strings separated + by whitespace. + + Example: "Hello, " "world!" -> 'Hello, world!' + """ + line = line.strip() + if not line: + return '' + + quoted_string = r'"([^"\\]|\\.)*"' + # One or more quoted strings, possibly separated by whitespace + quoted_strings = fr'^({quoted_string}\s*)+$' + + if not re.match(quoted_strings, line): + raise ValueError(f'{state.filename}:{state.lineno}: ' + f'Syntax error: {line}') + + string = '' + for match in re.finditer(quoted_string, line): + part = match.group() + string += parse_quoted_string(state, part) + return string + + +def parse_quoted_string(state, string): + """Parse a single quoted PO string.""" + # Check if there are any disallowed escape sequences + # The allowed escape sequences are: + # - \n, \r, \t, \\, \", \a, \b, \f, \v + # - Octal escapes: \o, \oo, \ooo + # - Hex escapes: \xh, \xhh, ... + if match := re.search(r'\\[^"\\abfnrtvx0-7]|\\x[^0-9a-fA-F]', string): + escape = match.group() + raise ValueError(f'{state.filename}:{state.lineno}: ' + f"Invalid escape sequence: '{escape}'") + + try: + return ast.literal_eval(string) + except (ValueError, SyntaxError) as e: + raise ValueError(f'{state.filename}:{state.lineno}: ' + f"Invalid syntax: {string}") from e + +def _add_message(state): + key = _key_for(state.msgid, state.msgctxt) + if key in state.messages: + # PO files don't allow duplicate entries + raise ValueError(f"{state.filename}:{state.lineno}: " + f"Duplicate entry: {key!r}") + state.messages[key] = {'msgctxt': state.msgctxt, + 'msgid': state.msgid, + 'msgid_plural': state.msgid_plural, + 'msgstr': state.msgstr} + if state.msgid == "": + # This is the header, see whether there is an encoding declaration + state.encoding = _get_encoding(state.msgstr) + # Reset the message data + state.msgctxt = None + state.msgid = None + state.msgid_plural = None + state.msgstr = None + + +def _get_encoding(msgstr): + """Get the encoding from the header msgstr, if provided.""" + p = HeaderParser() + charset = p.parsestr(msgstr).get_content_charset() + return charset or 'latin-1' + + +def get_msgids_from_exclude_file(filename): + with open(filename, 'rb') as f: + po = f.read() + + messages = parse_po(po, filename) + return {m['msgid'] for m in messages} + + # Key is the function name, value is a dictionary mapping argument positions to the # type of the argument. The type is one of 'msgid', 'msgid_plural', or 'msgctxt'. DEFAULTKEYWORDS = { @@ -533,7 +803,7 @@ def _add_message( if not comments: comments = [] - key = self._key_for(msgid, msgctxt) + key = _key_for(msgid, msgctxt) message = self.messages.get(key) if message: message.add_location( @@ -553,12 +823,6 @@ def _add_message( comments=comments, ) - @staticmethod - def _key_for(msgid, msgctxt=None): - if msgctxt is not None: - return (msgctxt, msgid) - return msgid - def _get_func_name(self, node): match node.func: case ast.Name(id=id): @@ -742,14 +1006,18 @@ class Options: # initialize list of strings to exclude if options.excludefilename: try: - with open(options.excludefilename) as fp: - options.toexclude = fp.readlines() + options.toexclude = get_msgids_from_exclude_file( + options.excludefilename) + except ValueError as e: + print(f'Invalid exclude file ({options.excludefilename}): {e}', + file=sys.stderr) + sys.exit(1) except IOError: print(f"Can't read --exclude-file: {options.excludefilename}", file=sys.stderr) sys.exit(1) else: - options.toexclude = [] + options.toexclude = set() # resolve args to module lists expanded = [] pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy