Skip to content

gh-131725: Generate GNU hash table in msgfmt.py #131727

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 13 commits into
base: main
Choose a base branch
from
Binary file modified Lib/test/test_tools/msgfmt_data/fuzzy.mo
Binary file not shown.
Binary file modified Lib/test/test_tools/msgfmt_data/general.mo
Binary file not shown.
30 changes: 27 additions & 3 deletions Lib/test/test_tools/test_msgfmt.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

from test.support.os_helper import temp_cwd
from test.support.script_helper import assert_python_failure, assert_python_ok
from test.test_tools import skip_if_missing, toolsdir
from test.test_tools import imports_under_tool, skip_if_missing, toolsdir


skip_if_missing('i18n')
Expand All @@ -18,6 +18,9 @@
script_dir = Path(toolsdir) / 'i18n'
msgfmt = script_dir / 'msgfmt.py'

with imports_under_tool("i18n"):
from msgfmt import _hashpjw


def compile_messages(po_file, mo_file):
assert_python_ok(msgfmt, '-o', mo_file, po_file)
Expand All @@ -41,6 +44,27 @@ def test_compilation(self):

self.assertDictEqual(actual._catalog, expected._catalog)

def test_hash_table(self):
# Check _hashpjw generates correct hash values
self.assertEqual(_hashpjw(b"stan"), 502398)
self.assertEqual(_hashpjw(b"foo"), 27999)

# Check hash table is generated correctly for general.po
with temp_cwd():
tmp_mo_file = "messages.mo"
compile_messages(data_dir / "general.po", tmp_mo_file)
with open(tmp_mo_file, "rb") as f:
mo_data = f.read()

header = struct.unpack("=7I", mo_data[:28])
hash_table_size, hash_table_offset = header[5:7]

hash_tab = struct.unpack(f"={hash_table_size}I",
mo_data[hash_table_offset : hash_table_offset + (hash_table_size * 4)])
Comment on lines +62 to +66
Copy link
Member Author

@StanFromIreland StanFromIreland Mar 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I could just hardcode these sizes since we test them below. Is that preferred?


self.assertEqual(hash_tab, (1, 3, 0, 8, 9, 7, 2, 0, 4, 5, 0, 6, 0))


def test_binary_header(self):
with temp_cwd():
tmp_mo_file = 'messages.mo'
Expand All @@ -63,8 +87,8 @@ def test_binary_header(self):
self.assertEqual(num_strings, 9)
self.assertEqual(orig_table_offset, 28)
self.assertEqual(trans_table_offset, 100)
self.assertEqual(hash_table_size, 0)
self.assertEqual(hash_table_offset, 0)
self.assertEqual(hash_table_size, 13)
self.assertEqual(hash_table_offset, 172)

def test_translations(self):
with open(data_dir / 'general.mo', 'rb') as f:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
:program:`msgfmt` now generates GNU hash tables.
99 changes: 85 additions & 14 deletions Tools/i18n/msgfmt.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@

This program converts a textual Uniforum-style message catalog (.po file) into
a binary GNU catalog (.mo file). This is essentially the same function as the
GNU msgfmt program, however, it is a simpler implementation. Currently it
does not handle plural forms but it does handle message contexts.
GNU msgfmt program. Currently it does not handle plural forms but it does
handle message contexts.

Usage: msgfmt.py [OPTIONS] filename.po

Expand Down Expand Up @@ -60,21 +60,56 @@ def add(ctxt, id, str, fuzzy):
def generate():
"Return the generated output."
global MESSAGES

def hash_insert_entry(string, i):
hash_val = _hashpjw(string)
hash_cursor = hash_val % hash_tab_size
inc = 1 + (hash_val % (hash_tab_size - 2))
while hash_table[hash_cursor]:
hash_cursor += inc
hash_cursor %= hash_tab_size
hash_table[hash_cursor] = i + 1

# From [gettext.git]/gettext-tools/src/write-mo.c:
# Each string has an associate hashing value V, computed by a fixed
# function. To locate the string we use open addressing with double
# hashing. The first index will be V % M, where M is the size of the
# hashing table. If no entry is found, iterating with a second,
# independent hashing function takes place. This second value will
# be 1 + V % (M - 2).
# The approximate number of probes will be
#
# for unsuccessful search: (1 - N / M) ^ -1
# for successful search: - (N / M) ^ -1 * ln (1 - N / M)
#
# where N is the number of keys.
#
# If we now choose M to be the next prime bigger than 4 / 3 * N,
# we get the values
# 4 and 1.85 resp.
# Because unsuccessful searches are unlikely this is a good value.
# Formulas: [Knuth, The Art of Computer Programming, Volume 3,
# 766 Sorting and Searching, 1973, Addison Wesley]
hash_tab_size = _next_prime((len(MESSAGES) * 4) // 3)
if hash_tab_size <= 2:
hash_tab_size = 3
hash_table = array.array("I", [0] * hash_tab_size)

# the keys are sorted in the .mo file
keys = sorted(MESSAGES.keys())
offsets = []
ids = strs = b''
for id in keys:
for i, id in enumerate(keys):
# For each string, we need size and file offset. Each string is NUL
# terminated; the NUL does not count into the size.
hash_insert_entry(id, i)
offsets.append((len(ids), len(id), len(strs), len(MESSAGES[id])))
ids += id + b'\0'
strs += MESSAGES[id] + b'\0'
output = ''
# The header is 7 32-bit unsigned integers. We don't use hash tables, so
# the keys start right after the index tables.
# translated string.
keystart = 7*4+16*len(keys)

# The header is 7 32-bit unsigned integers, and we have an index table and
# hash table.
keystart = 7*4+16*len(keys)+hash_tab_size*4
# and the values start after the keys
valuestart = keystart + len(ids)
koffsets = []
Expand All @@ -86,13 +121,15 @@ def generate():
voffsets += [l2, o2+valuestart]
offsets = koffsets + voffsets
output = struct.pack("Iiiiiii",
0x950412de, # Magic
0, # Version
len(keys), # # of entries
7*4, # start of key index
7*4+len(keys)*8, # start of value index
0, 0) # size and offset of hash table
0x950412de, # Magic
0, # Version
len(keys), # # of entries
7*4, # start of key index
7*4+len(keys)*8, # start of value index
hash_tab_size, # size of hash table
7 * 4 + 2 * (len(keys) * 8)) # offset of hash table
output += array.array("i", offsets).tobytes()
output += hash_table.tobytes()
output += ids
output += strs
return output
Expand Down Expand Up @@ -253,5 +290,39 @@ def main():
make(filename, outfile)


# Utilities for writing hash table

# Peter J. Weinberger hash function
# See: https://www.drdobbs.com/database/hashing-rehashed/184409859
def _hashpjw(strs):
hval = 0
for s in strs:
if not s:
break
hval <<= 4
hval += s
g = hval & (0xF << 28)
if g:
hval ^= g >> 24
hval ^= g
return hval


def _next_prime(start):
def is_prime(num):
divn = 3
sq = divn * divn
while sq < num and num % divn != 0:
divn += 1
sq += 4 * divn
divn += 1

return num % divn != 0

candidate = start | 1
while not is_prime(candidate):
candidate += 2
return candidate

if __name__ == '__main__':
main()
Loading
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy