@@ -19,10 +19,10 @@ class UnigramTextModel(CountingProbDist):
 
     """This is a discrete probability distribution over words, so you
     can add, sample, or get P[word], just like with CountingProbDist. You can
-    also generate a random text n words long with P.samples(n)"""
+    also generate a random text n words long with P.samples(n)."""
 
     def samples(self, n):
-        "Return a string of n words, random according to the model."
+        """Return a string of n words, random according to the model."""
         return ' '.join(self.sample() for i in range(n))
 
 
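
A minimal usage sketch of the interface this docstring describes, assuming the module is importable as text and that words() tokenizes a string (both defined elsewhere in this file):

from text import UnigramTextModel, words

P = UnigramTextModel(words('day and night the fox ran and ran'))
print(P['and'])      # probability of a word, as with CountingProbDist
print(P.samples(5))  # e.g. 'ran the and fox and'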
@@ -97,12 +97,13 @@ def viterbi_segment(text, P):
     n = len(text)
     words = [''] + list(text)
     best = [1.0] + [0.0] * n
-    # Fill in the vectors best, words via dynamic programming
+    # Fill in the vectors best and words via dynamic programming
     for i in range(n + 1):
         for j in range(0, i):
             w = text[j:i]
-            if P[w] * best[i - len(w)] >= best[i]:
-                best[i] = P[w] * best[i - len(w)]
+            curr_score = P[w] * best[i - len(w)]
+            if curr_score >= best[i]:
+                best[i] = curr_score
                 words[i] = w
     # Now recover the sequence of best words
     sequence = []
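
Here best[i] is the probability of the best segmentation of the first i characters, and words[i] is the last word of that segmentation, so the final loop can walk backwards to recover the whole sequence. A small sketch of how this is used, assuming viterbi_segment returns the recovered sequence along with its probability, as the recovery loop suggests:

from text import UnigramTextModel, viterbi_segment, words

P = UnigramTextModel(words('it is easy to read a text with spaces'))
sequence, prob = viterbi_segment('itiseasy', P)
print(sequence)  # expected: ['it', 'is', 'easy']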
@@ -124,7 +125,7 @@ class IRSystem:
     The constructor s = IRSystem('the a') builds an empty system with two
     stopwords. Next, index several documents with s.index_document(text, url).
     Then ask queries with s.query('query words', n) to retrieve the top n
-    matching documents. Queries are literal words from the document,
+    matching documents. Queries are literal words from the document,
     except that stopwords are ignored, and there is one special syntax:
     The query "learn: man cat", for example, runs "man cat" and indexes it."""
 
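
A sketch of that flow with two invented one-line documents (the urls here are just labels), assuming query returns the top-scoring matches:

from text import IRSystem

s = IRSystem('the a')                             # two stopwords
s.index_document('Dogs\nthe dog bit a man', 'doc1')
s.index_document('News\na man bit the dog back', 'doc2')
print(s.query('man dog', 2))                      # top 2 matching documents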
@@ -137,14 +138,14 @@ def __init__(self, stopwords='the a of'):
         self.documents = []
 
     def index_collection(self, filenames):
-        "Index a whole collection of files."
+        """Index a whole collection of files."""
         prefix = os.path.dirname(__file__)
         for filename in filenames:
             self.index_document(open(filename).read(),
                                 os.path.relpath(filename, prefix))
 
     def index_document(self, text, url):
-        "Index the text of a document."
+        """Index the text of a document."""
         # For now, use first line for title
         title = text[:text.index('\n')].strip()
         docwords = words(text)
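
For completeness, a sketch of indexing from disk; the paths are hypothetical, and index_collection stores each document under a url relative to this module's directory, as the os.path.relpath call above shows:

from text import IRSystem

s = IRSystem('the a of')
s.index_collection(['data/doc1.txt', 'data/doc2.txt'])  # hypothetical files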
@@ -278,7 +279,7 @@ def maketrans(from_, to_):
 
 
 def encode(plaintext, code):
-    """Encodes text, using a code which is a permutation of the alphabet."""
+    """Encode text using a code which is a permutation of the alphabet."""
     trans = maketrans(alphabet + alphabet.upper(), code + code.upper())
 
     return translate(plaintext, trans)
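
As a quick example, any rotation of the alphabet is one such permutation; this sketch assumes alphabet is the lowercase a-z string defined earlier in this file:

from text import encode

code = 'cdefghijklmnopqrstuvwxyzab'  # the alphabet rotated by 2
print(encode('Hello', code))         # -> 'Jgnnq'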
@@ -331,19 +332,18 @@ def all_shifts(text):
 
 class PermutationDecoder:
 
-    """This is a much harder problem than the shift decoder. There are 26!
-    permutations, so we can't try them all. Instead we have to search.
+    """This is a much harder problem than the shift decoder. There are 26!
+    permutations, so we can't try them all. Instead we have to search.
     We want to search well, but there are many things to consider:
     Unigram probabilities (E is the most common letter); Bigram probabilities
     (TH is the most common bigram); word probabilities (I and A are the most
     common one-letter words, etc.); etc.
-    We could represent a search state as a permutation of the 26 letters,
-    and alter the solution through hill climbing. With an initial guess
+    We could represent a search state as a permutation of the 26 letters,
+    and alter the solution through hill climbing. With an initial guess
     based on unigram probabilities, this would probably fare well. However,
     I chose instead to have an incremental representation. A state is
     represented as a letter-to-letter map; for example {'z': 'e'} to
-    represent that 'z' will be translated to 'e'.
-    """
+    represent that 'z' will be translated to 'e'."""
 
     def __init__(self, training_text, ciphertext=None):
         self.Pwords = UnigramTextModel(words(training_text))
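
To make the incremental representation concrete, a small illustration of applying such a letter-to-letter map; apply_map is a hypothetical helper, not part of this class:

def apply_map(charmap, ciphertext):
    # Letters not yet assigned by the search are left unchanged.
    return ''.join(charmap.get(c, c) for c in ciphertext)

print(apply_map({'z': 'e'}, 'thz znd'))  # -> 'the end'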