
Commit 9abfc51

Merge pull request micropython#3 from ciscorn/dict-comp
Small improvements to the dictionary compression
2 parents: 15964a4 + d18d79a

2 files changed: 60 additions (+), 87 deletions (-)


py/makeqstrdata.py (54 additions, 81 deletions)
@@ -12,6 +12,7 @@
 import re
 import sys
 
+from math import log
 import collections
 import gettext
 import os.path
@@ -111,9 +112,10 @@ def __init__(self, words):
 
     def iter_words(self, text):
         s = []
+        words = self.words
         for m in self.pat.finditer(text):
             t = m.group(0)
-            if t in self.words:
+            if t in words:
                 if s:
                     yield (False, "".join(s))
                     s = []
@@ -124,33 +126,35 @@ def iter_words(self, text):
             yield (False, "".join(s))
 
     def iter(self, text):
-        s = []
         for m in self.pat.finditer(text):
             yield m.group(0)
 
 def iter_substrings(s, minlen, maxlen):
-    maxlen = min(len(s), maxlen)
+    len_s = len(s)
+    maxlen = min(len_s, maxlen)
     for n in range(minlen, maxlen + 1):
-        for begin in range(0, len(s) - n + 1):
+        for begin in range(0, len_s - n + 1):
             yield s[begin : begin + n]
 
 def compute_huffman_coding(translations, compression_filename):
     texts = [t[1] for t in translations]
-    all_strings_concat = "".join(texts)
     words = []
+
+    start_unused = 0x80
+    end_unused = 0xff
     max_ord = 0
-    begin_unused = 128
-    end_unused = 256
     for text in texts:
         for c in text:
             ord_c = ord(c)
-            max_ord = max(max_ord, ord_c)
-            if 128 <= ord_c < 256:
+            max_ord = max(ord_c, max_ord)
+            if 0x80 <= ord_c < 0xff:
                 end_unused = min(ord_c, end_unused)
-    max_words = end_unused - begin_unused
-    char_size = 1 if max_ord < 256 else 2
+    max_words = end_unused - 0x80
+
+    values_type = "uint16_t" if max_ord > 255 else "uint8_t"
+    max_words_len = 160 if max_ord > 255 else 255
 
-    sum_word_len = 0
+    sum_len = 0
     while True:
         extractor = TextSplitter(words)
         counter = collections.Counter()
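A note on the hunk above: iter_substrings() yields every substring of its input within the given length bounds; these are the dictionary candidates that get counted across all translations. The 0x80..0xfe scan reserves whichever high byte values the translated strings do not already use, and those spare values become the indices of dictionary words. A toy call, for illustration only (not part of the commit):

    list(iter_substrings("index", minlen=2, maxlen=3))
    # -> ['in', 'nd', 'de', 'ex', 'ind', 'nde', 'dex']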
@@ -162,30 +166,30 @@ def compute_huffman_coding(translations, compression_filename):
 
         scores = sorted(
             (
-                # I don't know why this works good. This could be better.
-                (s, (len(s) - 1) ** ((max(occ - 2, 1) + 0.5) ** 0.8), occ)
+                (s, (len(s) - 1) ** log(max(occ - 2, 1)), occ)
                 for (s, occ) in counter.items()
             ),
             key=lambda x: x[1],
             reverse=True,
         )
 
-        w = None
+        word = None
         for (s, score, occ) in scores:
-            if score < 0:
-                break
-            if len(s) > 1:
-                w = s
+            if occ < 5:
+                continue
+            if score < 5:
                 break
+            word = s
+            break
 
-        if not w:
+        if not word:
             break
-        if len(w) + sum_word_len > 256:
+        if sum_len + len(word) - 2 > max_words_len:
             break
         if len(words) == max_words:
             break
-        words.append(w)
-        sum_word_len += len(w)
+        words.append(word)
+        sum_len += len(word) - 2
 
     extractor = TextSplitter(words)
     counter = collections.Counter()
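A note on the selection loop above: the old scorer shipped with the comment "I don't know why this works good"; the replacement makes the trade-off explicit. Because the list is sorted by score in descending order, each round picks the single best-scoring substring that occurs at least 5 times, and the search stops for good once even the best remaining score drops below 5. A rough feel for the numbers (my own illustration, with made-up substrings):

    from math import log

    def score(s, occ):
        # the heuristic from the hunk above
        return (len(s) - 1) ** log(max(occ - 2, 1))

    score("tion", 10)   # ~9.8: long and frequent enough to keep
    score("th", 10)     # 1.0: too short to pay for a dictionary slot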
@@ -194,26 +198,26 @@ def compute_huffman_coding(translations, compression_filename):
             counter[atom] += 1
     cb = huffman.codebook(counter.items())
 
-    word_start = begin_unused
+    word_start = start_unused
     word_end = word_start + len(words) - 1
     print("// # words", len(words))
     print("// words", words)
 
     values = []
     length_count = {}
     renumbered = 0
-    last_l = None
+    last_length = None
     canonical = {}
     for atom, code in sorted(cb.items(), key=lambda x: (len(x[1]), x[0])):
         values.append(atom)
-        l = len(code)
-        if l not in length_count:
-            length_count[l] = 0
-        length_count[l] += 1
-        if last_l:
-            renumbered <<= (l - last_l)
-        canonical[atom] = '{0:0{width}b}'.format(renumbered, width=l)
-        #print(f"atom={repr(atom)} code={code}", file=sys.stderr)
+        length = len(code)
+        if length not in length_count:
+            length_count[length] = 0
+        length_count[length] += 1
+        if last_length:
+            renumbered <<= (length - last_length)
+        canonical[atom] = '{0:0{width}b}'.format(renumbered, width=length)
+        # print(f"atom={repr(atom)} code={code}", file=sys.stderr)
         if len(atom) > 1:
             o = words.index(atom) + 0x80
             s = "".join(C_ESCAPES.get(ch1, ch1) for ch1 in atom)
@@ -222,34 +226,37 @@ def compute_huffman_coding(translations, compression_filename):
             o = ord(atom)
         print("//", o, s, counter[atom], canonical[atom], renumbered)
         renumbered += 1
-        last_l = l
+        last_length = length
     lengths = bytearray()
     print("// length count", length_count)
 
     for i in range(1, max(length_count) + 2):
         lengths.append(length_count.get(i, 0))
     print("// values", values, "lengths", len(lengths), lengths)
-    maxord = max(ord(u) for u in values if len(u) == 1)
-    values_type = "uint16_t" if maxord > 255 else "uint8_t"
-    ch_size = 1 if maxord > 255 else 2
+
     print("//", values, lengths)
     values = [(atom if len(atom) == 1 else chr(0x80 + words.index(atom))) for atom in values]
     print("//", values, lengths)
-    max_translation_encoded_length = max(len(translation.encode("utf-8")) for original,translation in translations)
+    max_translation_encoded_length = max(
+        len(translation.encode("utf-8")) for (original, translation) in translations)
+
+    wends = list(len(w) - 2 for w in words)
+    for i in range(1, len(wends)):
+        wends[i] += wends[i - 1]
+
     with open(compression_filename, "w") as f:
         f.write("const uint8_t lengths[] = {{ {} }};\n".format(", ".join(map(str, lengths))))
         f.write("const {} values[] = {{ {} }};\n".format(values_type, ", ".join(str(ord(u)) for u in values)))
         f.write("#define compress_max_length_bits ({})\n".format(max_translation_encoded_length.bit_length()))
         f.write("const {} words[] = {{ {} }};\n".format(values_type, ", ".join(str(ord(c)) for w in words for c in w)))
-        f.write("const uint8_t wlen[] = {{ {} }};\n".format(", ".join(str(len(w)) for w in words)))
+        f.write("const uint8_t wends[] = {{ {} }};\n".format(", ".join(str(p) for p in wends)))
         f.write("#define word_start {}\n".format(word_start))
         f.write("#define word_end {}\n".format(word_end))
 
-    extractor = TextSplitter(words)
-    return values, lengths, words, extractor
+    return (values, lengths, words, canonical, extractor)
 
 def decompress(encoding_table, encoded, encoded_length_bits):
-    values, lengths, words, extractor = encoding_table
+    (values, lengths, words, _, _) = encoding_table
     dec = []
     this_byte = 0
     this_bit = 7
306313
def compress(encoding_table, decompressed, encoded_length_bits, len_translation_encoded):
307314
if not isinstance(decompressed, str):
308315
raise TypeError()
309-
values, lengths, words, extractor = encoding_table
316+
(_, _, _, canonical, extractor) = encoding_table
310317

311318
enc = bytearray(len(decompressed) * 3)
312-
#print(decompressed)
313-
#print(lengths)
314319
current_bit = 7
315320
current_byte = 0
316321

317-
code = len_translation_encoded
318-
bits = encoded_length_bits+1
322+
bits = encoded_length_bits + 1
319323
for i in range(bits - 1, 0, -1):
320324
if len_translation_encoded & (1 << (i - 1)):
321325
enc[current_byte] |= 1 << current_bit
322326
if current_bit == 0:
323327
current_bit = 7
324-
#print("packed {0:0{width}b}".format(enc[current_byte], width=8))
325328
current_byte += 1
326329
else:
327330
current_bit -= 1
328331

329-
#print("values = ", values, file=sys.stderr)
330332
for atom in extractor.iter(decompressed):
331-
#print("", file=sys.stderr)
332-
if len(atom) > 1:
333-
c = chr(0x80 + words.index(atom))
334-
else:
335-
c = atom
336-
assert c in values
337-
338-
start = 0
339-
end = lengths[0]
340-
bits = 1
341-
compressed = None
342-
code = 0
343-
while compressed is None:
344-
s = start
345-
e = end
346-
#print("{0:0{width}b}".format(code, width=bits))
347-
# Linear search!
348-
for i in range(s, e):
349-
if values[i] == c:
350-
compressed = code + (i - start)
351-
#print("found {0:0{width}b}".format(compressed, width=bits), file=sys.stderr)
352-
break
353-
code += end - start
354-
code <<= 1
355-
start = end
356-
end += lengths[bits]
357-
bits += 1
358-
#print("next bit", bits)
359-
360-
for i in range(bits - 1, 0, -1):
361-
if compressed & (1 << (i - 1)):
333+
for b in canonical[atom]:
334+
if b == "1":
362335
enc[current_byte] |= 1 << current_bit
363336
if current_bit == 0:
364337
current_bit = 7
365-
#print("packed {0:0{width}b}".format(enc[current_byte], width=8))
366338
current_byte += 1
367339
else:
368340
current_bit -= 1
341+
369342
if current_bit != 7:
370343
current_byte += 1
371344
return enc[:current_byte]

supervisor/shared/translate.c (6 additions, 6 deletions)
@@ -48,17 +48,17 @@ STATIC int put_utf8(char *buf, int u) {
         *buf = u;
         return 1;
     } else if(word_start <= u && u <= word_end) {
-        int n = (u - 0x80);
-        size_t off = 0;
-        for(int i=0; i<n; i++) {
-            off += wlen[i];
+        uint n = (u - word_start);
+        size_t pos = 0;
+        if (n > 0) {
+            pos = wends[n - 1] + (n * 2);
         }
         int ret = 0;
         // note that at present, entries in the words table are
         // guaranteed not to represent words themselves, so this adds
         // at most 1 level of recursive call
-        for(int i=0; i<wlen[n]; i++) {
-            int len = put_utf8(buf, words[off+i]);
+        for(; pos < wends[n] + (n + 1) * 2; pos++) {
+            int len = put_utf8(buf, words[pos]);
             buf += len;
             ret += len;
         }
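A note on the C side: put_utf8() previously summed wlen[0..n-1] just to find where word n starts, an O(n) walk per lookup. With the cumulative table the bounds come straight from two reads: word n occupies words[wends[n-1] + n*2] up to (but not including) words[wends[n] + (n+1)*2], with wends[-1] taken as 0. For the toy dictionary in the earlier note (wends = [2, 2, 5]), word 2 spans positions 2 + 2*2 = 6 through 5 + 3*2 = 11, exactly the five characters of "index". The loop bound doubles as the end offset, so no separate length fetch is needed.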
