diff --git a/tests/test_text.py b/tests/test_text.py
index d884e02a2..e0ee71e2c 100644
--- a/tests/test_text.py
+++ b/tests/test_text.py
@@ -99,6 +99,17 @@ def test_shift_decoding():
     assert msg == 'This is a secret message.'
 
 
+def test_permutation_decoder():
+    gutenberg = DataFile("EN-text/gutenberg.txt").read()
+    flatland = DataFile("EN-text/flatland.txt").read()
+
+    pd = PermutationDecoder(canonicalize(gutenberg))
+    assert pd.decode('aba') in ('ece', 'ete', 'tat', 'tit', 'txt')
+
+    pd = PermutationDecoder(canonicalize(flatland))
+    assert pd.decode('aba') in ('ded', 'did', 'ece', 'ele', 'eme', 'ere', 'eve', 'eye', 'iti', 'mom', 'ses', 'tat', 'tit')
+
+
 def test_rot13_encoding():
     code = rot13('Hello, world!')
 
diff --git a/text.py b/text.py
index 991c764d9..37fab1b25 100644
--- a/text.py
+++ b/text.py
@@ -4,7 +4,7 @@
 Then we show a very simple Information Retrieval system, and an example
 working on a tiny sample of Unix manual pages."""
 
-from utils import argmin
+from utils import argmin, argmax, hashabledict
 from learning import CountingProbDist
 import search
 
@@ -60,7 +60,7 @@ def add_sequence(self, words):
         n = self.n
         words = self.add_empty(words, n)
 
-        for i in range(len(words) - n):
+        for i in range(len(words) - n + 1):
             self.add(tuple(words[i:i + n]))
 
     def samples(self, nwords):
@@ -350,40 +350,59 @@ class PermutationDecoder:
     def __init__(self, training_text, ciphertext=None):
         self.Pwords = UnigramTextModel(words(training_text))
         self.P1 = UnigramTextModel(training_text)  # By letter
-        self.P2 = NgramTextModel(2, training_text)  # By letter pair
+        self.P2 = NgramTextModel(2, words(training_text))  # By letter pair
 
     def decode(self, ciphertext):
         """Search for a decoding of the ciphertext."""
-        self.ciphertext = ciphertext
+        self.ciphertext = canonicalize(ciphertext)
+        # reduce domain to speed up search
+        self.chardomain = {c for c in self.ciphertext if c != ' '}
         problem = PermutationDecoderProblem(decoder=self)
-        return search.best_first_tree_search(
+        solution = search.best_first_graph_search(
             problem, lambda node: self.score(node.state))
 
+        code = dict(solution.state)  # copy: never mutate a hashed search state
+        code[' '] = ' '
+        return translate(self.ciphertext, lambda c: code[c])
+
     def score(self, code):
         """Score is product of word scores, unigram scores, and bigram scores.
         This can get very small, so we use logs and exp."""
 
-        # TODO: Implement the permutation_decode function
-        text = permutation_decode(self.ciphertext, code)  # noqa
+        # remake code dictionary to contain translation for all characters
+        full_code = code.copy()
+        full_code.update({x: x for x in self.chardomain if x not in code})
+        full_code[' '] = ' '
+        text = translate(self.ciphertext, lambda c: full_code[c])
 
-        logP = (sum([log(self.Pwords[word]) for word in words(text)]) +
-                sum([log(self.P1[c]) for c in text]) +
-                sum([log(self.P2[b]) for b in bigrams(text)]))
-        return exp(logP)
+        # add small positive value to prevent computing log(0)
+        # TODO: Modify the values to make score more accurate
+        logP = (sum([log(self.Pwords[word] + 1e-20) for word in words(text)]) +
+                sum([log(self.P1[c] + 1e-5) for c in text]) +
+                sum([log(self.P2[b] + 1e-10) for b in bigrams(text)]))
+        return -exp(logP)
 
 
 class PermutationDecoderProblem(search.Problem):
 
     def __init__(self, initial=None, goal=None, decoder=None):
-        self.initial = initial or {}
+        self.initial = initial or hashabledict()
         self.decoder = decoder
 
     def actions(self, state):
-        # Find the best
-        p, plainchar = max([(self.decoder.P1[c], c)
-                            for c in alphabet if c not in state])
-        succs = [extend(state, plainchar, cipherchar)]  # ????  # noqa
+        search_list = [c for c in self.decoder.chardomain if c not in state]
+        target_list = [c for c in alphabet if c not in state.values()]
+        # Find the best character to replace
+        plainchar = argmax(search_list, key=lambda c: self.decoder.P1[c])
+        for cipherchar in target_list:
+            yield (plainchar, cipherchar)
+
+    def result(self, state, action):
+        new_state = hashabledict(state)  # copy to prevent hash issues
+        assert isinstance(new_state, hashabledict)
+        new_state[action[0]] = action[1]
+        return new_state
 
     def goal_test(self, state):
-        """We're done when we get all 26 letters assigned."""
-        return len(state) >= 26
+        """We're done when all letters in search domain are assigned."""
+        return len(state) >= len(self.decoder.chardomain)
diff --git a/utils.py b/utils.py
index ed44f1e9e..86eb701c0 100644
--- a/utils.py
+++ b/utils.py
@@ -568,6 +568,33 @@ def __missing__(self, key):
         return result
 
 
+class hashabledict(dict):
+    """Allows hashing by representing a dictionary as a sorted tuple of (key, value) pairs.
+    The hash changes if the dictionary is mutated, so do not mutate one after hashing it.
+    """
+    def __tuplify__(self):
+        return tuple(sorted(self.items()))
+
+    def __hash__(self):
+        return hash(self.__tuplify__())
+
+    def __lt__(self, odict):
+        assert type(odict) is hashabledict
+        return self.__tuplify__() < odict.__tuplify__()
+
+    def __gt__(self, odict):
+        assert type(odict) is hashabledict
+        return self.__tuplify__() > odict.__tuplify__()
+
+    def __le__(self, odict):
+        assert type(odict) is hashabledict
+        return self.__tuplify__() <= odict.__tuplify__()
+
+    def __ge__(self, odict):
+        assert type(odict) is hashabledict
+        return self.__tuplify__() >= odict.__tuplify__()
+
+
 # ______________________________________________________________________________
 # Queues: Stack, FIFOQueue, PriorityQueue
 

pFad - Phonifier reborn

pFad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy