python · StanFromIreland · Mar 25, 2025 · Mar 27, 2025 · Mar 27, 2025 · Mar 27, 2025
diff --git a/Lib/test/test_tools/msgfmt_data/fuzzy.mo b/Lib/test/test_tools/msgfmt_data/fuzzy.mo
diff --git a/Lib/test/test_tools/msgfmt_data/general.mo b/Lib/test/test_tools/msgfmt_data/general.mo
diff --git a/Lib/test/test_tools/test_msgfmt.py b/Lib/test/test_tools/test_msgfmt.py
@@ -9,7 +9,7 @@
 
 from test.support.os_helper import temp_cwd
 from test.support.script_helper import assert_python_failure, assert_python_ok
-from test.test_tools import skip_if_missing, toolsdir
+from test.test_tools import imports_under_tool, skip_if_missing, toolsdir
 
 
 skip_if_missing('i18n')
@@ -18,6 +18,9 @@
 script_dir = Path(toolsdir) / 'i18n'
 msgfmt = script_dir / 'msgfmt.py'
 
+with imports_under_tool("i18n"):
+    from msgfmt import _hashpjw
+
 
 def compile_messages(po_file, mo_file):
     assert_python_ok(msgfmt, '-o', mo_file, po_file)
@@ -41,6 +44,27 @@ def test_compilation(self):
 
                     self.assertDictEqual(actual._catalog, expected._catalog)
 
+    def test_hash_table(self):
+        # Known-good _hashpjw values for two short keys.
+        for key, expected in ((b"stan", 502398), (b"foo", 27999)):
+            self.assertEqual(_hashpjw(key), expected)
+
+        # Compile general.po in a scratch directory and read the catalog
+        # back as raw bytes, then check the hash table stored in it.
+        with temp_cwd():
+            mo_path = "messages.mo"
+            compile_messages(data_dir / "general.po", mo_path)
+            with open(mo_path, "rb") as mo_file:
+                raw = mo_file.read()
+
+            # The header is seven 32-bit words; the last two are the hash
+            # table size (in entries) and its byte offset in the file.
+            *_, table_len, table_start = struct.unpack("=7I", raw[:28])
+            table_end = table_start + 4 * table_len
+
+            slots = struct.unpack(f"={table_len}I",
+                                  raw[table_start:table_end])
+
+            self.assertEqual(slots, (1, 3, 0, 8, 9, 7, 2, 0, 4, 5, 0, 6, 0))
+
+
     def test_binary_header(self):
         with temp_cwd():
             tmp_mo_file = 'messages.mo'
@@ -63,8 +87,8 @@ def test_binary_header(self):
         self.assertEqual(num_strings, 9)
         self.assertEqual(orig_table_offset, 28)
         self.assertEqual(trans_table_offset, 100)
-        self.assertEqual(hash_table_size, 0)
-        self.assertEqual(hash_table_offset, 0)
+        self.assertEqual(hash_table_size, 13)
+        self.assertEqual(hash_table_offset, 172)
 
     def test_translations(self):
         with open(data_dir / 'general.mo', 'rb') as f:

diff --git a/Misc/NEWS.d/next/Tools-Demos/2025-03-25-18-00-00.gh-issue-131725.qwfh321.rst b/Misc/NEWS.d/next/Tools-Demos/2025-03-25-18-00-00.gh-issue-131725.qwfh321.rst
@@ -0,0 +1 @@
+:program:`msgfmt` now generates GNU hash tables.
diff --git a/Tools/i18n/msgfmt.py b/Tools/i18n/msgfmt.py
@@ -5,8 +5,8 @@
 
 This program converts a textual Uniforum-style message catalog (.po file) into
 a binary GNU catalog (.mo file).  This is essentially the same function as the
-GNU msgfmt program, however, it is a simpler implementation.  Currently it
-does not handle plural forms but it does handle message contexts.
+GNU msgfmt program.  Currently it does not handle plural forms but it does
+handle message contexts.
 
 Usage: msgfmt.py [OPTIONS] filename.po
 
@@ -60,21 +60,56 @@ def add(ctxt, id, str, fuzzy):
 def generate():
     "Return the generated output."
     global MESSAGES
+
+    def hash_insert_entry(string, i):
+        """Record entry number i + 1 for *string* in hash_table.
+
+        A slot holding 0 is free.  Probing starts at the hash value modulo
+        the table size and advances by a second, hash-derived step until a
+        free slot is found.
+        """
+        key_hash = _hashpjw(string)
+        step = 1 + key_hash % (hash_tab_size - 2)
+        slot = key_hash % hash_tab_size
+        while hash_table[slot] != 0:
+            slot = (slot + step) % hash_tab_size
+        # Indices are stored 1-based so that 0 can mean "empty".
+        hash_table[slot] = i + 1
+
+    # From [gettext.git]/gettext-tools/src/write-mo.c:
+    #  Each string has an associate hashing value V, computed by a fixed
+    #  function.  To locate the string we use open addressing with double
+    #  hashing.  The first index will be V % M, where M is the size of the
+    #  hashing table.  If no entry is found, iterating with a second,
+    #  independent hashing function takes place.  This second value will
+    #  be 1 + V % (M - 2).
+    #  The approximate number of probes will be
+    #
+    #    for unsuccessful search:  (1 - N / M) ^ -1
+    #    for successful search:    - (N / M) ^ -1 * ln (1 - N / M)
+    #
+    #  where N is the number of keys.
+    #
+    #  If we now choose M to be the next prime bigger than 4 / 3 * N,
+    #  we get the values
+    #                      4   and   1.85  resp.
+    #  Because unsuccessful searches are unlikely this is a good value.
+    #  Formulas: [Knuth, The Art of Computer Programming, Volume 3,
+    #                 Sorting and Searching, 1973, Addison Wesley]
+    hash_tab_size = _next_prime((len(MESSAGES) * 4) // 3)
+    if hash_tab_size <= 2:
+        hash_tab_size = 3
+    hash_table = array.array("I", [0] * hash_tab_size)
+
     # the keys are sorted in the .mo file
     keys = sorted(MESSAGES.keys())
     offsets = []
     ids = strs = b''
-    for id in keys:
+    for i, id in enumerate(keys):
         # For each string, we need size and file offset.  Each string is NUL
         # terminated; the NUL does not count into the size.
+        hash_insert_entry(id, i)
         offsets.append((len(ids), len(id), len(strs), len(MESSAGES[id])))
         ids += id + b'\0'
         strs += MESSAGES[id] + b'\0'
-    output = ''
-    # The header is 7 32-bit unsigned integers.  We don't use hash tables, so
-    # the keys start right after the index tables.
-    # translated string.
-    keystart = 7*4+16*len(keys)
+
+    # The header is 7 32-bit unsigned integers, and we have an index table and
+    # hash table.
+    keystart = 7*4+16*len(keys)+hash_tab_size*4
     # and the values start after the keys
     valuestart = keystart + len(ids)
     koffsets = []
@@ -86,13 +121,15 @@ def generate():
         voffsets += [l2, o2+valuestart]
     offsets = koffsets + voffsets
     output = struct.pack("Iiiiiii",
-                         0x950412de,       # Magic
-                         0,                 # Version
-                         len(keys),         # # of entries
-                         7*4,               # start of key index
-                         7*4+len(keys)*8,   # start of value index
-                         0, 0)              # size and offset of hash table
+                         0x950412de,                   # Magic
+                         0,                            # Version
+                         len(keys),                    # # of entries
+                         7*4,                          # start of key index
+                         7*4+len(keys)*8,              # start of value index
+                         hash_tab_size,                # size of hash table
+                         7 * 4 + 2 * (len(keys) * 8))  # offset of hash table
     output += array.array("i", offsets).tobytes()
+    output += hash_table.tobytes()
     output += ids
     output += strs
     return output
@@ -253,5 +290,39 @@ def main():
         make(filename, outfile)
 
 
+# Utilities for writing hash table
+
+# Peter J. Weinberger hash function
+# See: https://www.drdobbs.com/database/hashing-rehashed/184409859
+def _hashpjw(strs):
+    """Return the 32-bit PJW hash of the bytes object *strs*.
+
+    Hashing stops at the first NUL byte, if there is one.  The result is
+    always in the range 0 .. 2**32 - 1, matching implementations that hold
+    the hash in an unsigned 32-bit integer.
+    """
+    hval = 0
+    for s in strs:
+        if not s:
+            break
+        hval <<= 4
+        hval += s
+        # Fold the top nibble of the 32-bit word (bits 28-31) back into
+        # bits 4-7, then clear it.
+        g = hval & (0xF << 28)
+        if g:
+            hval ^= g >> 24
+            hval ^= g
+    # Python integers never wrap, so "(hval << 4) + s" can carry past bit 31
+    # and the fold above does not clear that carry.  Bits above 31 never
+    # influence the low 32 bits, so dropping them once here is sufficient.
+    return hval & 0xFFFFFFFF
+
+
+def _next_prime(start):
+    """Return the first odd number >= *start* that passes is_prime().
+
+    *start* is made odd first, then candidates are tried in steps of two.
+    """
+    def is_prime(num):
+        # Trial division by odd divisors 3, 5, 7, ... while divisor**2 < num.
+        # Note that 1 is accepted and 3 is rejected by this test.
+        divisor = 3
+        while divisor * divisor < num and num % divisor != 0:
+            divisor += 2
+
+        return num % divisor != 0
+
+    candidate = start | 1
+    while True:
+        if is_prime(candidate):
+            return candidate
+        candidate += 2
+
 if __name__ == '__main__':
     main()
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		:program:`msgfmt` now generates GNU hash tables.