Skip to content

Commit 21c4978

Browse files
committed
use BK tree to find similar words for ctcLexiconSearch
1 parent 548fd3a commit 21c4978

File tree

4 files changed

+94
-19
lines changed

4 files changed

+94
-19
lines changed

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,8 +65,8 @@ The only path which gives "" still has probability 0.36, therefore "a" is the re
6565
The **Word example** testcase contains a single word from the IAM Handwriting Database \[4\].
6666
It is used to test lexicon search \[3\].
6767
RNN output was generated with the [SimpleHTR](https://github.com/githubharald/SimpleHTR) model.
68-
Lexicon search first computes an approximation with best path decoding, then searches for similar words in a dictionary, and finally scores them by computing the loss and returning the most probable dictionary word.
69-
Best path decoding outputs "aircrapt", lexicon search is able to find similar words like "aircraft", "airplane", ... in the dictionary, calculates a score for each of them and finally returns "aircraft", which is the correct result.
68+
Lexicon search first computes an approximation with best path decoding, then uses a BK tree to find similar words in a dictionary, and finally scores each candidate by computing its loss and returns the most probable dictionary word.
69+
Best path decoding outputs "aircrapt"; lexicon search finds similar words like "aircraft" and "airplane" in the dictionary, calculates a score for each of them, and finally returns "aircraft", which is the correct result.
7070
The figure below shows the input image and the RNN output matrix with 32 time-steps and 80 classes (the last one being the CTC-blank).
7171
Each column sums to 1 and each entry represents the probability of seeing a label at a given time-step.
7272

src/BKTree.py

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,71 @@
1+
import editdistance as ed
2+
3+
4+
class BKTree:
    """Burkhard-Keller tree: finds stored strings within a given edit distance of a query string.

    Every node is a tuple ``(text, children)``. ``children`` is a dict that maps an
    edit distance ``d`` to the child node whose subtree holds texts at distance
    ``d`` from this node's text. Duplicate texts are stored too (under edge 0) and
    are therefore returned once per occurrence by ``query``.
    """

    def __init__(self, txtList):
        """Build the tree by inserting the entries of txtList in iteration order."""
        self.root = None
        for entry in txtList:
            self._insert(self.root, entry)

    def query(self, txt, tolerance):
        """Return a list of all stored texts whose edit distance to txt is <= tolerance."""
        return [] if self.root is None else self._query(self.root, txt, tolerance)

    def _insert(self, node, txt):
        """Insert txt into the subtree rooted at node; a node of None creates the root."""
        if node is None:
            self.root = (txt, {})
            return

        # walk down along the edge labelled with the distance to txt until that edge is free
        current = node
        while True:
            word, children = current
            dist = ed.eval(word, txt)
            successor = children.get(dist)
            if successor is None:
                children[dist] = (txt, {})
                return
            current = successor

    def _query(self, node, txt, tolerance):
        """Collect, in depth-first pre-order, the texts below node within tolerance of txt."""
        matches = []
        pending = [node]
        while pending:
            word, children = pending.pop()
            dist = ed.eval(word, txt)
            if dist <= tolerance:
                matches.append(word)

            # only subtrees whose edge label lies in [dist - tolerance, dist + tolerance]
            # can contain matches (triangle inequality of the edit distance)
            lower = dist - tolerance
            upper = dist + tolerance
            reachable = [child for edge, child in children.items() if lower <= edge <= upper]

            # push in reverse so children are visited in their insertion order
            pending.extend(reversed(reachable))
        return matches
53+
54+
55+
def testBKTree():
    """Check a BK-tree query against a brute-force scan over the corpus words."""
    with open('../data/word/corpus.txt') as corpusFile:
        corpusWords = corpusFile.read().split()

    maxDist = 2
    tree = BKTree(corpusWords)
    queryWord = 'air'

    # result from the tree versus a linear scan over every word; both sorted for comparison
    fromTree = sorted(tree.query(queryWord, maxDist))
    bruteForce = sorted(filter(lambda w: ed.eval(queryWord, w) <= maxDist, corpusWords))

    print(fromTree)
    print(bruteForce)
    assert fromTree == bruteForce
68+
69+
70+
# run the BK-tree self-test only when this module is executed as a script
if __name__ == '__main__':
71+
testBKTree()

src/LexiconSearch.py

Lines changed: 14 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,22 @@
11
import BestPath
22
import Loss
3-
import LanguageModel
4-
import editdistance
53

64

7-
def ctcLexiconSearch(mat, classes, lm):
5+
def ctcLexiconSearch(mat, classes, bkTree, tolerance=4):
    """Decode mat by lexicon search and return the best-scoring dictionary word.

    Computes an approximation with best path decoding, looks up dictionary words
    similar to it, scores each of them and returns the most probable one.
    See Shi, Bai and Yao.

    mat: RNN output matrix, passed through to BestPath.ctcBestPath and Loss.ctcLabelingProb.
    classes: the class labels, passed through to the same two functions.
    bkTree: object with a query(txt, tolerance) method returning candidate words.
    tolerance: maximum distance passed to bkTree.query (defaults to 4, the former fixed value).

    Returns the candidate with the highest labeling probability, or '' if the
    tree yields no candidate within tolerance.
    """
    # use best path decoding to get an approximation
    approx = BestPath.ctcBestPath(mat, classes)

    # get similar words from dictionary within given tolerance
    candidates = bkTree.query(approx, tolerance)

    # if there are no similar words, return empty string
    if not candidates:
        return ''

    # else score every candidate and return the best one; max() keeps the first
    # candidate on ties, like the stable descending sort it replaces
    return max(candidates, key=lambda w: Loss.ctcLabelingProb(mat, w, classes))

src/main.py

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,11 @@
88
import BeamSearch
99
import TokenPassing
1010
import LanguageModel
11+
import BKTree
1112
import Loss
1213
import LexiconSearch
1314

15+
1416
# specify if GPU should be used (via OpenCL)
1517
useGPU = len(sys.argv) > 1 and sys.argv[1] == 'gpu'
1618
if useGPU:
@@ -62,14 +64,16 @@ def testWordExample():
6264
# matrix containing TxC RNN output. C=len(classes)+1 because of blank label.
6365
mat = softmax(loadRNNOutput('../data/word/rnnOutput.csv'))
6466

65-
# language model: used for
66-
lm = LanguageModel.LanguageModel('../data/word/corpus.txt', classes)
67+
# BK tree to find similar words
68+
with open('../data/word/corpus.txt') as f:
69+
words = f.read().split()
70+
bkTree = BKTree.BKTree(words)
6771

6872
# decode RNN output with different decoding algorithms
6973
gt = 'aircraft'
7074
print('TARGET :', '"' + gt + '"')
7175
print('BEST PATH :', '"' + BestPath.ctcBestPath(mat, classes) + '"')
72-
print('LEXICON SEARCH:', '"' + LexiconSearch.ctcLexiconSearch(mat, classes, lm) + '"')
76+
print('LEXICON SEARCH:', '"' + LexiconSearch.ctcLexiconSearch(mat, classes, bkTree) + '"')
7377

7478

7579
def testLineExample():

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy