
Commit 6b5c3dd

Author: Harald Scheidl (committed)
beam search: reworked log-prob implementation, reworked lang model (unigram and bigram)
1 parent 37a8a1f
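The log-prob rework drops the old `LOG_ZERO` special-casing: a small `log` helper maps 0 to -inf, sums of probabilities become `np.logaddexp`, and products become plain addition. Since `np.logaddexp(-inf, x) == x` and `-inf + x == -inf`, no branch for the zero case is needed anymore. A minimal sketch of the NumPy behaviour this relies on (illustration only, not code from this commit):

    import numpy as np

    def log(x: float) -> float:
        # log(0) evaluates to -inf; suppress NumPy's divide-by-zero warning
        with np.errstate(divide='ignore'):
            return np.log(x)

    zero = log(0)                         # -inf acts as the log-space zero
    print(np.logaddexp(zero, log(0.5)))   # log(0 + 0.5): the -inf term is absorbed
    print(zero + log(0.5))                # -inf: multiplying by a zero probability stays zero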

File tree

3 files changed (+87 / -89 lines)


ctc_decoder/beam_search.py

Lines changed: 54 additions & 66 deletions
@@ -1,62 +1,65 @@
-from typing import Optional
+from collections import defaultdict
+from dataclasses import dataclass
+from typing import Optional, List, Tuple

 import numpy as np

 from ctc_decoder.language_model import LanguageModel

-LOG_ZERO = float("-inf")

+def log(x: float) -> float:
+    with np.errstate(divide='ignore'):
+        return np.log(x)
+
+
+@dataclass
 class BeamEntry:
     """Information about one single beam at specific time-step."""
+    pr_total: float = log(0)  # blank and non-blank
+    pr_non_blank: float = log(0)  # non-blank
+    pr_blank: float = log(0)  # blank
+    pr_text: float = log(1)  # LM score
+    lm_applied: bool = False  # flag if LM was already applied to this beam
+    labeling: tuple = ()  # beam-labeling

-    def __init__(self):
-        self.pr_total = LOG_ZERO  # blank and non-blank
-        self.pr_non_blank = LOG_ZERO  # non-blank
-        self.pr_blank = LOG_ZERO  # blank
-        self.pr_text = 0  # LM score
-        self.lm_applied = False  # flag if LM was already applied to this beam
-        self.labeling = ()  # beam-labeling
-
-    def is_empty(self):
-        return len(self.labeling) == 0

-class BeamState:
+class BeamList:
     """Information about all beams at specific time-step."""

-    def __init__(self):
-        self.entries = {}
+    def __init__(self) -> None:
+        self.entries = defaultdict(BeamEntry)

-    def norm(self):
+    def normalize(self) -> None:
         """Length-normalise LM score."""
         for k in self.entries.keys():
             labeling_len = len(self.entries[k].labeling)
             self.entries[k].pr_text = (1.0 / (labeling_len if labeling_len else 1.0)) * self.entries[k].pr_text

-    def sort(self):
+    def sort_labelings(self) -> List[Tuple[int]]:
         """Return beam-labelings, sorted by probability."""
-        beams = [v for (_, v) in self.entries.items()]
+        beams = self.entries.values()
         sorted_beams = sorted(beams, reverse=True, key=lambda x: x.pr_total + x.pr_text)
         return [x.labeling for x in sorted_beams]


-def apply_lm(parent_beam, child_beam, chars, lm):
+def apply_lm(parent_beam: BeamEntry, child_beam: BeamEntry, chars: str, lm: LanguageModel) -> None:
     """Calculate LM score of child beam by taking score from parent beam and bigram probability of last two chars."""
-    if lm and not child_beam.lm_applied:
-        c1 = chars[parent_beam.labeling[-1] if parent_beam.labeling else chars.index(' ')]  # first char
-        c2 = chars[child_beam.labeling[-1]]  # second char
-        lm_factor = 0.01  # influence of language model
-        bigram_prob = lm_factor * np.log(lm.get_char_bigram(c1, c2))
-        if parent_beam.is_empty():
-            child_beam.pr_text = bigram_prob  # first char in beam
-        else:
-            child_beam.pr_text = parent_beam.pr_text + bigram_prob  # probability of char sequence
-        child_beam.lm_applied = True  # only apply LM once per beam entry
+    if not lm or child_beam.lm_applied:
+        return

+    # take bigram if beam length at least 2
+    if len(child_beam.labeling) > 1:
+        c = chars[child_beam.labeling[-2]]
+        d = chars[child_beam.labeling[-1]]
+        ngram_prob = lm.get_char_bigram(c, d)
+    # otherwise take unigram
+    else:
+        c = chars[child_beam.labeling[-1]]
+        ngram_prob = lm.get_char_unigram(c)

-def add_beam(beam_state, labeling):
-    """Add beam if it does not yet exist."""
-    if labeling not in beam_state.entries:
-        beam_state.entries[labeling] = BeamEntry()
+    lm_factor = 0.01  # influence of language model
+    child_beam.pr_text = parent_beam.pr_text + lm_factor * log(ngram_prob)  # probability of char sequence
+    child_beam.lm_applied = True  # only apply LM once per beam entry


 def beam_search(mat: np.ndarray, chars: str, beam_width: int = 25, lm: Optional[LanguageModel] = None) -> str:
@@ -78,46 +81,38 @@ def beam_search(mat: np.ndarray, chars: str, beam_width: int = 25, lm: Optional[
     max_T, max_C = mat.shape

     # initialise beam state
-    last = BeamState()
+    last = BeamList()
     labeling = ()
     last.entries[labeling] = BeamEntry()
-    last.entries[labeling].pr_blank = LOG_ZERO
-    last.entries[labeling].pr_total = LOG_ZERO
+    last.entries[labeling].pr_blank = log(1)
+    last.entries[labeling].pr_total = log(1)

     # go over all time-steps
     for t in range(max_T):
-        curr = BeamState()
+        curr = BeamList()

         # get beam-labelings of best beams
-        best_labelings = last.sort()[0:beam_width]
+        best_labelings = last.sort_labelings()[:beam_width]

         # go over best beams
         for labeling in best_labelings:

             # probability of paths ending with a non-blank
-            pr_non_blank = LOG_ZERO
+            pr_non_blank = log(0)
             # in case of non-empty beam
             if labeling:
                 # probability of paths with repeated last char at the end
-                if last.entries[labeling].pr_non_blank == LOG_ZERO:
-                    pr_non_blank = np.log(mat[t, labeling[-1]])  # cannot add to -inf
-                else:
-                    pr_non_blank = last.entries[labeling].pr_non_blank + np.log(mat[t, labeling[-1]])
+                pr_non_blank = last.entries[labeling].pr_non_blank + log(mat[t, labeling[-1]])

             # probability of paths ending with a blank
-            if last.entries[labeling].pr_total == LOG_ZERO:
-                pr_blank = np.log(mat[t, blank_idx])  # cannot add to -inf
-            else:
-                pr_blank = last.entries[labeling].pr_total + np.log(mat[t, blank_idx])
-
-            # add beam at current time-step if needed
-            add_beam(curr, labeling)
+            pr_blank = last.entries[labeling].pr_total + log(mat[t, blank_idx])

-            # fill in data
+            # fill in data for current beam
             curr.entries[labeling].labeling = labeling
             curr.entries[labeling].pr_non_blank = np.logaddexp(curr.entries[labeling].pr_non_blank, pr_non_blank)
             curr.entries[labeling].pr_blank = np.logaddexp(curr.entries[labeling].pr_blank, pr_blank)
-            curr.entries[labeling].pr_total = np.logaddexp(curr.entries[labeling].pr_total, np.logaddexp(pr_blank, pr_non_blank))
+            curr.entries[labeling].pr_total = np.logaddexp(curr.entries[labeling].pr_total,
+                                                           np.logaddexp(pr_blank, pr_non_blank))
             curr.entries[labeling].pr_text = last.entries[labeling].pr_text
             curr.entries[labeling].lm_applied = True  # LM already applied at previous time-step for this beam-labeling

@@ -128,21 +123,14 @@ def beam_search(mat: np.ndarray, chars: str, beam_width: int = 25, lm: Optional[

                 # if new labeling contains duplicate char at the end, only consider paths ending with a blank
                 if labeling and labeling[-1] == c:
-                    # if pr_blank is 0 then we cannot extend the beam with a dupe char
-                    # so pr_non_blank should still be 0 (-inf in log-space)
-                    pr_non_blank = last.entries[labeling].pr_blank + np.log(mat[t, c])
+                    pr_non_blank = last.entries[labeling].pr_blank + log(mat[t, c])
                 else:
-                    if last.entries[labeling].pr_total == LOG_ZERO:
-                        pr_non_blank = np.log(mat[t, c])  # cannot add to -inf
-                    else:
-                        pr_non_blank = last.entries[labeling].pr_total + np.log(mat[t, c])
-
-                # add beam at current time-step if needed
-                add_beam(curr, new_labeling)
+                    pr_non_blank = last.entries[labeling].pr_total + log(mat[t, c])

                 # fill in data
                 curr.entries[new_labeling].labeling = new_labeling
-                curr.entries[new_labeling].pr_non_blank = np.logaddexp(curr.entries[new_labeling].pr_non_blank, pr_non_blank)
+                curr.entries[new_labeling].pr_non_blank = np.logaddexp(curr.entries[new_labeling].pr_non_blank,
+                                                                       pr_non_blank)
                 curr.entries[new_labeling].pr_total = np.logaddexp(curr.entries[new_labeling].pr_total, pr_non_blank)

                 # apply LM
@@ -152,11 +140,11 @@ def beam_search(mat: np.ndarray, chars: str, beam_width: int = 25, lm: Optional[
         last = curr

     # normalise LM scores according to beam-labeling-length
-    last.norm()
+    last.normalize()

     # sort by probability
-    best_labeling = last.sort()[0]  # get most probable labeling
+    best_labeling = last.sort_labelings()[0]  # get most probable labeling

     # map label string to char string
-    res = ''.join([chars[l] for l in best_labeling])
+    res = ''.join([chars[label] for label in best_labeling])
     return res
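For context, a minimal usage sketch of the reworked decoder. The package-level import and the convention that the blank is the last column of mat are assumptions based on this repo's README; the toy matrix is made up for illustration:

    import numpy as np
    from ctc_decoder import beam_search

    chars = 'ab'
    # 3 time-steps, 3 classes: 'a', 'b', blank (last column); each row sums to 1
    mat = np.array([[0.4, 0.0, 0.6],
                    [0.4, 0.0, 0.6],
                    [0.0, 1.0, 0.0]])

    # summing over paths gives P('ab') = 0.64 vs P('b') = 0.36
    print(beam_search(mat, chars))  # expected: 'ab'

Note that mat contains exact zeros; with the reworked log-prob handling these simply become -inf scores instead of needing special cases.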

ctc_decoder/language_model.py

Lines changed: 31 additions & 21 deletions
@@ -3,33 +3,43 @@ class LanguageModel:

     def __init__(self, txt: str, chars: str) -> None:
         """Create language model from text corpus."""
-        txt = ' ' + txt + ' '  # ensure first/last characters appear next to whitespace
-        self._init_char_bigrams(txt, chars)

-    def _init_char_bigrams(self, txt: str, chars: str) -> None:
-        """Initialize table of character bigrams."""
-
-        # init bigrams with 0 values
-        self.bigram = {c: {d: 0 for d in chars} for c in chars}
+        # compute unigrams
+        self._unigram = {c: 0 for c in chars}
+        for c in txt:
+            # ignore unknown chars
+            if c not in self._unigram:
+                continue
+            self._unigram[c] += 1

-        # go through text and add each char bigram
+        # compute bigrams
+        self._bigram = {c: {d: 0 for d in chars} for c in chars}
         for i in range(len(txt) - 1):
-            first = txt[i]
-            second = txt[i + 1]
+            c = txt[i]
+            d = txt[i + 1]

             # ignore unknown chars
-            if first not in self.bigram or second not in self.bigram[first]:
+            if c not in self._bigram or d not in self._bigram[c]:
                 continue

-            self.bigram[first][second] += 1
+            self._bigram[c][d] += 1
+
+        # normalize
+        sum_unigram = sum(self._unigram.values())
+        for c in chars:
+            self._unigram[c] /= sum_unigram
+
+        for c in chars:
+            sum_bigram = sum(self._bigram[c].values())
+            if sum_bigram == 0:
+                continue
+            for d in chars:
+                self._bigram[c][d] /= sum_bigram

-    def get_char_bigram(self, first: str, second: str) -> float:
-        """Probability that first character is followed by second one."""
-        first = first if first else ' '  # map start to word beginning
-        second = second if second else ' '  # map end to word end
+    def get_char_unigram(self, c: str) -> float:
+        """Probability of character c."""
+        return self._unigram[c]

-        # number of bigrams starting with given char
-        num_bigrams = sum(self.bigram[first].values())
-        if num_bigrams == 0:
-            return 0
-        return self.bigram[first][second] / num_bigrams
+    def get_char_bigram(self, c: str, d: str) -> float:
+        """Probability that character c is followed by character d."""
+        return self._bigram[c][d]
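A quick sketch of the reworked language model's normalized scores (same assumption about the package-level import; the toy corpus is invented):

    from ctc_decoder import LanguageModel

    lm = LanguageModel('aab ab', chars='ab ')
    print(lm.get_char_unigram('a'))      # 0.5: 'a' occurs 3 times among 6 chars
    print(lm.get_char_bigram('a', 'b'))  # ~0.667: of the 3 bigrams starting with 'a', 2 are 'ab'

Normalizing the counts once in __init__ means get_char_unigram and get_char_bigram are plain table lookups, instead of re-summing counts on every call as the old get_char_bigram did.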

setup.py

Lines changed: 2 additions & 2 deletions
@@ -2,11 +2,11 @@

 setup(
     name='ctc-decoder',
-    version='1.0.0',
+    version='1.0.1',
     description='Connectionist Temporal Classification decoders.',
     author='Harald Scheidl',
     packages=['ctc_decoder'],
     url="https://github.com/githubharald/CTCDecoder",
     install_requires=['editdistance', 'numpy'],
-    python_requires=">=3.6"
+    python_requires='>=3.7'
 )

0 commit comments
