use BK tree to find similar words for ctcLexiconSearch

githubharald · githubharald · commit 21c497836edc · 2018-11-08T17:56:18.000+01:00
diff --git a/README.md b/README.md
@@ -65,8 +65,8 @@ The only path which gives "" still has probability 0.36, therefore "a" is the re
 The **Word example** testcase contains a single word from the IAM Handwriting Database \[4\]. 
 It is used to test lexicon search \[3\].
 RNN output was generated with the [SimpleHTR](https://github.com/githubharald/SimpleHTR) model.
-Lexicon search first computes an approximation with best path decoding, then searches for similar words in a dictionary, and finally scores them by computing the loss and returning the most probable dictionary word.
-Best path decoding outputs "aircrapt", lexicon search is able to find similar words like "aircraft", "airplane", ... in the dictionary, calculates a score for each of them and finally returns "aircraft", which is the correct result.
+Lexicon search first computes an approximation with best path decoding, then searches for similar words in a dictionary using a BK tree, and finally scores them by computing the loss and returns the most probable dictionary word.
+Best path decoding outputs "aircrapt"; lexicon search then finds similar words like "aircraft" and "airplane" in the dictionary, calculates a score for each of them, and finally returns "aircraft", which is the correct result.
 The figure below shows the input image and the RNN output matrix with 32 time-steps and 80 classes (the last one being the CTC-blank).
 Each column sums to 1 and each entry represents the probability of seeing a label at a given time-step.
 
diff --git a/src/BKTree.py b/src/BKTree.py
@@ -0,0 +1,71 @@
+import editdistance as ed
+
+
+class BKTree:
+	"Burkhard Keller Tree: used to find strings within tolerance (w.r.t. edit distance metric) to given query string"
+
+
+	def __init__(self, txtList):
+		"pass list of texts (words) to insert into tree"
+		
+		# insert list entries
+		self.root = None
+		for txt in txtList:
+			self._insert(self.root, txt)
+
+
+	def query(self, txt, tolerance):
+		"query strings within given tolerance (w.r.t. edit distance metric)"
+		if self.root is None:
+			return []
+		return self._query(self.root, txt, tolerance)
+
+
+	def _insert(self, node, txt):
+		# insert root node
+		if node is None:
+			self.root = (txt, {})
+			return
+
+		# insert all other nodes
+		d = ed.eval(node[0], txt)
+		if d in node[1]:
+			self._insert(node[1][d], txt)
+		else:
+			node[1][d] = (txt, {})
+
+
+	def _query(self, node, txt, tolerance):
+		# distance between query and current node
+		d = ed.eval(node[0], txt)
+
+		# add current node to result if within tolerance
+		res = []
+		if d <= tolerance:
+			res.append(node[0])
+
+		# iterate over children
+		for (edge, child) in node[1].items():
+			if d - tolerance <= edge and edge <= d + tolerance:
+				res += self._query(child, txt, tolerance)
+
+		return res
+
+
+def testBKTree():
+	"test BK tree on words from corpus"
+	with open('../data/word/corpus.txt') as f:
+		words = f.read().split()
+
+	tolerance = 2
+	t = BKTree(words)
+	q = 'air'
+	res1 = sorted(t.query(q, tolerance))
+	res2 = sorted([w for w in words if ed.eval(q, w) <= tolerance])
+	print(res1)
+	print(res2)
+	assert res1 == res2
+
+
+if __name__ == '__main__':
+	testBKTree()
diff --git a/src/LexiconSearch.py b/src/LexiconSearch.py
@@ -1,22 +1,22 @@
 import BestPath
 import Loss
-import LanguageModel
-import editdistance
 
 
-def ctcLexiconSearch(mat, classes, lm):
+def ctcLexiconSearch(mat, classes, bkTree):
 	"compute approximation with best path decoding, search most similar words in dictionary, calculate score for each of them, return best scoring one. See Shi, Bai and Yao."
-	
+
 	# use best path decoding to get an approximation
 	approx = BestPath.ctcBestPath(mat, classes)
-	
-	# search words with minimal edit-distance to the approximation (speed-up possible by using BK-tree data-structure)
-	keepBest = 10
-	dist = [(w, editdistance.eval(approx, w)) for w in lm.getWordList()] # edit-distance of words to the recognized word from best path decoding
-	dist = sorted(dist, key=lambda x: x[1])[:keepBest] # keep 10 best words w.r.t. edit-distance
 
-	# for each word candidate, calculate probability and keep best-scoring word
-	probs = [(entry[0], Loss.ctcLabelingProb(mat, entry[0], classes)) for entry in dist]
-	probs = sorted(probs, key=lambda x: x[1], reverse=True)
-	
-	return probs[0][0]
+	# get similar words from dictionary within given tolerance
+	tolerance = 4
+	words = bkTree.query(approx, tolerance)
+
+	# if there are no similar words, return empty string
+	if not words:
+		return ''
+
+	# else compute probabilities of all similar words and return best scoring one
+	wordProbs = [(w, Loss.ctcLabelingProb(mat, w, classes)) for w in words]
+	wordProbs.sort(key=lambda x: x[1], reverse=True)
+	return wordProbs[0][0]
diff --git a/src/main.py b/src/main.py
@@ -8,9 +8,11 @@
 import BeamSearch
 import TokenPassing
 import LanguageModel
+import BKTree
 import Loss
 import LexiconSearch
 
+
 # specify if GPU should be used (via OpenCL)
 useGPU = len(sys.argv) > 1 and sys.argv[1] == 'gpu'
 if useGPU:
@@ -62,14 +64,16 @@ def testWordExample():
 	# matrix containing TxC RNN output. C=len(classes)+1 because of blank label.
 	mat = softmax(loadRNNOutput('../data/word/rnnOutput.csv'))
 
-	# language model: used for 
-	lm = LanguageModel.LanguageModel('../data/word/corpus.txt', classes)
+	# BK tree to find similar words
+	with open('../data/word/corpus.txt') as f:
+		words = f.read().split()
+	bkTree = BKTree.BKTree(words)
 
 	# decode RNN output with different decoding algorithms
 	gt = 'aircraft'
 	print('TARGET        :', '"' + gt + '"')
 	print('BEST PATH     :', '"' + BestPath.ctcBestPath(mat, classes) + '"')
-	print('LEXICON SEARCH:', '"' + LexiconSearch.ctcLexiconSearch(mat, classes, lm) + '"')
+	print('LEXICON SEARCH:', '"' + LexiconSearch.ctcLexiconSearch(mat, classes, bkTree) + '"')
 
 
 def testLineExample():