Skip to content

Commit 4edce2a

Browse files
antmarakis authored and norvig committed
Update text.py (aimacode#492)
1 parent c0c97bf commit 4edce2a

File tree

1 file changed

+15
-15
lines changed

1 file changed

+15
-15
lines changed

text.py

Lines changed: 15 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -19,10 +19,10 @@ class UnigramTextModel(CountingProbDist):
1919

2020
"""This is a discrete probability distribution over words, so you
2121
can add, sample, or get P[word], just like with CountingProbDist. You can
22-
also generate a random text n words long with P.samples(n)"""
22+
also generate a random text n words long with P.samples(n)."""
2323

2424
def samples(self, n):
25-
"Return a string of n words, random according to the model."
25+
"""Return a string of n words, random according to the model."""
2626
return ' '.join(self.sample() for i in range(n))
2727

2828

@@ -97,12 +97,13 @@ def viterbi_segment(text, P):
9797
n = len(text)
9898
words = [''] + list(text)
9999
best = [1.0] + [0.0] * n
100-
# Fill in the vectors best, words via dynamic programming
100+
# Fill in the vectors best words via dynamic programming
101101
for i in range(n+1):
102102
for j in range(0, i):
103103
w = text[j:i]
104-
if P[w] * best[i - len(w)] >= best[i]:
105-
best[i] = P[w] * best[i - len(w)]
104+
curr_score = P[w] * best[i - len(w)]
105+
if curr_score >= best[i]:
106+
best[i] = curr_score
106107
words[i] = w
107108
# Now recover the sequence of best words
108109
sequence = []
@@ -124,7 +125,7 @@ class IRSystem:
124125
The constructor s = IRSystem('the a') builds an empty system with two
125126
stopwords. Next, index several documents with s.index_document(text, url).
126127
Then ask queries with s.query('query words', n) to retrieve the top n
127-
matching documents. Queries are literal words from the document,
128+
matching documents. Queries are literal words from the document,
128129
except that stopwords are ignored, and there is one special syntax:
129130
The query "learn: man cat", for example, runs "man cat" and indexes it."""
130131

@@ -137,14 +138,14 @@ def __init__(self, stopwords='the a of'):
137138
self.documents = []
138139

139140
def index_collection(self, filenames):
140-
"Index a whole collection of files."
141+
"""Index a whole collection of files."""
141142
prefix = os.path.dirname(__file__)
142143
for filename in filenames:
143144
self.index_document(open(filename).read(),
144145
os.path.relpath(filename, prefix))
145146

146147
def index_document(self, text, url):
147-
"Index the text of a document."
148+
"""Index the text of a document."""
148149
# For now, use first line for title
149150
title = text[:text.index('\n')].strip()
150151
docwords = words(text)
@@ -278,7 +279,7 @@ def maketrans(from_, to_):
278279

279280

280281
def encode(plaintext, code):
281-
"""Encodes text, using a code which is a permutation of the alphabet."""
282+
"""Encode text using a code which is a permutation of the alphabet."""
282283
trans = maketrans(alphabet + alphabet.upper(), code + code.upper())
283284

284285
return translate(plaintext, trans)
@@ -331,19 +332,18 @@ def all_shifts(text):
331332

332333
class PermutationDecoder:
333334

334-
"""This is a much harder problem than the shift decoder. There are 26!
335-
permutations, so we can't try them all. Instead we have to search.
335+
"""This is a much harder problem than the shift decoder. There are 26!
336+
permutations, so we can't try them all. Instead we have to search.
336337
We want to search well, but there are many things to consider:
337338
Unigram probabilities (E is the most common letter); Bigram probabilities
338339
(TH is the most common bigram); word probabilities (I and A are the most
339340
common one-letter words, etc.); etc.
340-
We could represent a search state as a permutation of the 26 letters,
341-
and alter the solution through hill climbing. With an initial guess
341+
We could represent a search state as a permutation of the 26 letters,
342+
and alter the solution through hill climbing. With an initial guess
342343
based on unigram probabilities, this would probably fare well. However,
343344
I chose instead to have an incremental representation. A state is
344345
represented as a letter-to-letter map; for example {'z': 'e'} to
345-
represent that 'z' will be translated to 'e'.
346-
"""
346+
represent that 'z' will be translated to 'e'."""
347347

348348
def __init__(self, training_text, ciphertext=None):
349349
self.Pwords = UnigramTextModel(words(training_text))

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy