diff --git a/Lib/test/test_tools/msgfmt_data/fuzzy.mo b/Lib/test/test_tools/msgfmt_data/fuzzy.mo index 4b144831cf5f75..273edbbbd7cbd7 100644 Binary files a/Lib/test/test_tools/msgfmt_data/fuzzy.mo and b/Lib/test/test_tools/msgfmt_data/fuzzy.mo differ diff --git a/Lib/test/test_tools/msgfmt_data/general.mo b/Lib/test/test_tools/msgfmt_data/general.mo index ee905cbb3ec58d..3107f6711bdd10 100644 Binary files a/Lib/test/test_tools/msgfmt_data/general.mo and b/Lib/test/test_tools/msgfmt_data/general.mo differ diff --git a/Lib/test/test_tools/test_msgfmt.py b/Lib/test/test_tools/test_msgfmt.py index 7be606bbff606a..30872f05d6b752 100644 --- a/Lib/test/test_tools/test_msgfmt.py +++ b/Lib/test/test_tools/test_msgfmt.py @@ -21,6 +21,9 @@ with imports_under_tool("i18n"): import msgfmt +with imports_under_tool("i18n"): + from msgfmt import _hashpjw + def compile_messages(po_file, mo_file): assert_python_ok(msgfmt_py, '-o', mo_file, po_file) @@ -44,6 +47,27 @@ def test_compilation(self): self.assertDictEqual(actual._catalog, expected._catalog) + def test_hash_table(self): + # Check _hashpjw generates correct hash values + self.assertEqual(_hashpjw(b"stan"), 502398) + self.assertEqual(_hashpjw(b"foo"), 27999) + + # Check hash table is generated correctly for general.po + with temp_cwd(): + tmp_mo_file = "messages.mo" + compile_messages(data_dir / "general.po", tmp_mo_file) + with open(tmp_mo_file, "rb") as f: + mo_data = f.read() + + header = struct.unpack("=7I", mo_data[:28]) + hash_table_size, hash_table_offset = header[5:7] + + hash_tab = struct.unpack(f"={hash_table_size}I", + mo_data[hash_table_offset : hash_table_offset + (hash_table_size * 4)]) + + self.assertEqual(hash_tab, (1, 3, 0, 8, 9, 7, 2, 0, 4, 5, 0, 6, 0)) + + def test_binary_header(self): with temp_cwd(): tmp_mo_file = 'messages.mo' @@ -66,8 +90,8 @@ def test_binary_header(self): self.assertEqual(num_strings, 9) self.assertEqual(orig_table_offset, 28) self.assertEqual(trans_table_offset, 100) - self.assertEqual(hash_table_size, 0) - self.assertEqual(hash_table_offset, 0) + self.assertEqual(hash_table_size, 13) + self.assertEqual(hash_table_offset, 172) def test_translations(self): with open(data_dir / 'general.mo', 'rb') as f: diff --git a/Misc/NEWS.d/next/Tools-Demos/2025-03-25-18-00-00.gh-issue-131725.qwfh321.rst b/Misc/NEWS.d/next/Tools-Demos/2025-03-25-18-00-00.gh-issue-131725.qwfh321.rst new file mode 100644 index 00000000000000..1e1647061b88c6 --- /dev/null +++ b/Misc/NEWS.d/next/Tools-Demos/2025-03-25-18-00-00.gh-issue-131725.qwfh321.rst @@ -0,0 +1 @@ +:program:`msgfmt` now generates GNU hash tables. diff --git a/Tools/i18n/msgfmt.py b/Tools/i18n/msgfmt.py index cd5f1ed9f3e268..b57a7fa50870ca 100755 --- a/Tools/i18n/msgfmt.py +++ b/Tools/i18n/msgfmt.py @@ -5,8 +5,8 @@ This program converts a textual Uniforum-style message catalog (.po file) into a binary GNU catalog (.mo file). This is essentially the same function as the -GNU msgfmt program, however, it is a simpler implementation. Currently it -does not handle plural forms but it does handle message contexts. +GNU msgfmt program. Currently it does not handle plural forms but it does +handle message contexts. Usage: msgfmt.py [OPTIONS] filename.po @@ -60,21 +60,56 @@ def add(ctxt, id, str, fuzzy): def generate(): "Return the generated output." global MESSAGES + + def hash_insert_entry(string, i): + hash_val = _hashpjw(string) + hash_cursor = hash_val % hash_tab_size + inc = 1 + (hash_val % (hash_tab_size - 2)) + while hash_table[hash_cursor]: + hash_cursor += inc + hash_cursor %= hash_tab_size + hash_table[hash_cursor] = i + 1 + + # From [gettext.git]/gettext-tools/src/write-mo.c: + # Each string has an associate hashing value V, computed by a fixed + # function. To locate the string we use open addressing with double + # hashing. The first index will be V % M, where M is the size of the + # hashing table. If no entry is found, iterating with a second, + # independent hashing function takes place. This second value will + # be 1 + V % (M - 2). + # The approximate number of probes will be + # + # for unsuccessful search: (1 - N / M) ^ -1 + # for successful search: - (N / M) ^ -1 * ln (1 - N / M) + # + # where N is the number of keys. + # + # If we now choose M to be the next prime bigger than 4 / 3 * N, + # we get the values + # 4 and 1.85 resp. + # Because unsuccessful searches are unlikely this is a good value. + # Formulas: [Knuth, The Art of Computer Programming, Volume 3, + # 766 Sorting and Searching, 1973, Addison Wesley] + hash_tab_size = _next_prime((len(MESSAGES) * 4) // 3) + if hash_tab_size <= 2: + hash_tab_size = 3 + hash_table = array.array("I", [0] * hash_tab_size) + # the keys are sorted in the .mo file keys = sorted(MESSAGES.keys()) offsets = [] ids = strs = b'' - for id in keys: + for i, id in enumerate(keys): # For each string, we need size and file offset. Each string is NUL # terminated; the NUL does not count into the size. + hash_insert_entry(id, i) offsets.append((len(ids), len(id), len(strs), len(MESSAGES[id]))) ids += id + b'\0' strs += MESSAGES[id] + b'\0' - output = '' - # The header is 7 32-bit unsigned integers. We don't use hash tables, so - # the keys start right after the index tables. - # translated string. - keystart = 7*4+16*len(keys) + + # The header is 7 32-bit unsigned integers, and we have an index table and + # hash table. + keystart = 7*4+16*len(keys)+hash_tab_size*4 # and the values start after the keys valuestart = keystart + len(ids) koffsets = [] @@ -86,13 +121,15 @@ def generate(): voffsets += [l2, o2+valuestart] offsets = koffsets + voffsets output = struct.pack("Iiiiiii", - 0x950412de, # Magic - 0, # Version - len(keys), # # of entries - 7*4, # start of key index - 7*4+len(keys)*8, # start of value index - 0, 0) # size and offset of hash table + 0x950412de, # Magic + 0, # Version + len(keys), # # of entries + 7*4, # start of key index + 7*4+len(keys)*8, # start of value index + hash_tab_size, # size of hash table + 7 * 4 + 2 * (len(keys) * 8)) # offset of hash table output += array.array("i", offsets).tobytes() + output += hash_table.tobytes() output += ids output += strs return output @@ -258,5 +295,39 @@ def main(): make(filename, outfile) +# Utilities for writing hash table + +# Peter J. Weinberger hash function +# See: https://www.drdobbs.com/database/hashing-rehashed/184409859 +def _hashpjw(strs): + hval = 0 + for s in strs: + if not s: + break + hval <<= 4 + hval += s + g = hval & (0xF << 28) + if g: + hval ^= g >> 24 + hval ^= g + return hval + + +def _next_prime(start): + def is_prime(num): + divn = 3 + sq = divn * divn + while sq < num and num % divn != 0: + divn += 1 + sq += 4 * divn + divn += 1 + + return num % divn != 0 + + candidate = start | 1 + while not is_prime(candidate): + candidate += 2 + return candidate + if __name__ == '__main__': main()
Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.
Alternative Proxies: