Skip to content

Commit 14c3f77

Browse files
antmarakisnorvig
authored and committed
NLP Module: Probabilistic Grammar (aimacode#599)
* add prob-grammar to notebook * Update nlp.py * add weighted choice * tests for prob grammar + generation * add test for weighted choice * Update nlp.py
1 parent b102884 commit 14c3f77

File tree

5 files changed

+349
-26
lines changed

5 files changed

+349
-26
lines changed

nlp.ipynb

Lines changed: 168 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@
2020
"outputs": [],
2121
"source": [
2222
"import nlp\n",
23-
"from nlp import Page, HITS, Lexicon, Rules, Grammar"
23+
"from nlp import Page, HITS\n",
24+
"from nlp import Lexicon, Rules, Grammar, ProbLexicon, ProbRules, ProbGrammar"
2425
]
2526
},
2627
{
@@ -151,7 +152,9 @@
151152
"source": [
152153
"### Implementation\n",
153154
"\n",
154-
"In the module we have implemented a `Lexicon` and a `Rules` function, which we can combine to create a `Grammar` object.\n",
155+
"In the module we have implementations for both probabilistic and non-probabilistic grammars. Both of these implementations follow the same format. There are functions for the lexicon and the rules, which can be combined to create a grammar object.\n",
156+
"\n",
157+
"#### Non-Probabilistic\n",
155158
"\n",
156159
"Execute the cells below to view the implementations:"
157160
]
@@ -205,9 +208,9 @@
205208
"name": "stdout",
206209
"output_type": "stream",
207210
"text": [
208-
"Lexicon {'Article': ['the', 'a', 'an'], 'Adverb': ['here', 'lightly', 'now'], 'Digit': ['1', '2', '0'], 'Pronoun': ['me', 'you', 'he'], 'Name': ['john', 'mary', 'peter'], 'Adjective': ['good', 'new', 'sad'], 'Conjuction': ['and', 'or', 'but'], 'Preposition': ['to', 'in', 'at'], 'RelPro': ['that', 'who', 'which'], 'Verb': ['is', 'say', 'are'], 'Noun': ['robot', 'sheep', 'fence']}\n",
211+
"Lexicon {'Verb': ['is', 'say', 'are'], 'RelPro': ['that', 'who', 'which'], 'Conjuction': ['and', 'or', 'but'], 'Digit': ['1', '2', '0'], 'Noun': ['robot', 'sheep', 'fence'], 'Pronoun': ['me', 'you', 'he'], 'Preposition': ['to', 'in', 'at'], 'Name': ['john', 'mary', 'peter'], 'Article': ['the', 'a', 'an'], 'Adjective': ['good', 'new', 'sad'], 'Adverb': ['here', 'lightly', 'now']}\n",
209212
"\n",
210-
"Rules: {'Adjs': [['Adjective'], ['Adjective', 'Adjs']], 'PP': [['Preposition', 'NP']], 'RelClause': [['RelPro', 'VP']], 'VP': [['Verb'], ['VP', 'NP'], ['VP', 'Adjective'], ['VP', 'PP'], ['VP', 'Adverb']], 'NP': [['Pronoun'], ['Name'], ['Noun'], ['Article', 'Noun'], ['Article', 'Adjs', 'Noun'], ['Digit'], ['NP', 'PP'], ['NP', 'RelClause']], 'S': [['NP', 'VP'], ['S', 'Conjuction', 'S']]}\n"
213+
"Rules: {'RelClause': [['RelPro', 'VP']], 'S': [['NP', 'VP'], ['S', 'Conjuction', 'S']], 'PP': [['Preposition', 'NP']], 'VP': [['Verb'], ['VP', 'NP'], ['VP', 'Adjective'], ['VP', 'PP'], ['VP', 'Adverb']], 'NP': [['Pronoun'], ['Name'], ['Noun'], ['Article', 'Noun'], ['Article', 'Adjs', 'Noun'], ['Digit'], ['NP', 'PP'], ['NP', 'RelClause']], 'Adjs': [['Adjective'], ['Adjective', 'Adjs']]}\n"
211214
]
212215
}
213216
],
@@ -287,7 +290,7 @@
287290
{
288291
"data": {
289292
"text/plain": [
290-
"'a robot is to a robot sad but robot say you 0 in me in a robot at the sheep at 1 good an fence in sheep in me that are in john new lightly lightly here a new good new robot lightly new in sheep lightly'"
293+
"'the fence are or 1 say in john that is here lightly to peter lightly sad good at you good here me good at john in an fence to fence at robot lightly and a robot who is here sad sheep in fence in fence at he sad here lightly to 0 say and fence is good in a sad sheep in a fence but he say here'"
291294
]
292295
},
293296
"execution_count": 7,
@@ -296,9 +299,167 @@
296299
}
297300
],
298301
"source": [
299-
"from nlp import generate_random\n",
302+
"grammar.generate_random('S')"
303+
]
304+
},
305+
{
306+
"cell_type": "markdown",
307+
"metadata": {},
308+
"source": [
309+
"#### Probabilistic\n",
310+
"\n",
311+
"The probabilistic grammars follow the same approach. They take as input a string, are assembled from a grammar and a lexicon and can generate random sentences (giving the probability of the sentence). The main difference is that in the lexicon we have tuples (terminal, probability) instead of strings and for the rules we have a list of tuples (list of non-terminals, probability) instead of list of lists of non-terminals.\n",
300312
"\n",
301-
"generate_random(grammar)"
313+
"Execute the cells to read the code:"
314+
]
315+
},
316+
{
317+
"cell_type": "code",
318+
"execution_count": 2,
319+
"metadata": {
320+
"collapsed": true
321+
},
322+
"outputs": [],
323+
"source": [
324+
"%psource ProbLexicon"
325+
]
326+
},
327+
{
328+
"cell_type": "code",
329+
"execution_count": 3,
330+
"metadata": {
331+
"collapsed": true
332+
},
333+
"outputs": [],
334+
"source": [
335+
"%psource ProbRules"
336+
]
337+
},
338+
{
339+
"cell_type": "code",
340+
"execution_count": 4,
341+
"metadata": {
342+
"collapsed": true
343+
},
344+
"outputs": [],
345+
"source": [
346+
"%psource ProbGrammar"
347+
]
348+
},
349+
{
350+
"cell_type": "markdown",
351+
"metadata": {},
352+
"source": [
353+
"Let's build a lexicon and rules for the probabilistic grammar:"
354+
]
355+
},
356+
{
357+
"cell_type": "code",
358+
"execution_count": 2,
359+
"metadata": {},
360+
"outputs": [
361+
{
362+
"name": "stdout",
363+
"output_type": "stream",
364+
"text": [
365+
"Lexicon {'Verb': [('is', 0.5), ('say', 0.3), ('are', 0.2)], 'Adjective': [('good', 0.5), ('new', 0.2), ('sad', 0.3)], 'Preposition': [('to', 0.4), ('in', 0.3), ('at', 0.3)], 'Pronoun': [('me', 0.3), ('you', 0.4), ('he', 0.3)], 'Conjuction': [('and', 0.5), ('or', 0.2), ('but', 0.3)], 'Adverb': [('here', 0.6), ('lightly', 0.1), ('now', 0.3)], 'Article': [('the', 0.5), ('a', 0.25), ('an', 0.25)], 'Digit': [('0', 0.35), ('1', 0.35), ('2', 0.3)], 'RelPro': [('that', 0.5), ('who', 0.3), ('which', 0.2)], 'Noun': [('robot', 0.4), ('sheep', 0.4), ('fence', 0.2)], 'Name': [('john', 0.4), ('mary', 0.4), ('peter', 0.2)]}\n",
366+
"\n",
367+
"Rules: {'RelClause': [(['RelPro', 'VP'], 1.0)], 'Adjs': [(['Adjective'], 0.5), (['Adjective', 'Adjs'], 0.5)], 'PP': [(['Preposition', 'NP'], 1.0)], 'NP': [(['Pronoun'], 0.2), (['Name'], 0.05), (['Noun'], 0.2), (['Article', 'Noun'], 0.15), (['Article', 'Adjs', 'Noun'], 0.1), (['Digit'], 0.05), (['NP', 'PP'], 0.15), (['NP', 'RelClause'], 0.1)], 'S': [(['NP', 'VP'], 0.6), (['S', 'Conjuction', 'S'], 0.4)], 'VP': [(['Verb'], 0.3), (['VP', 'NP'], 0.2), (['VP', 'Adjective'], 0.25), (['VP', 'PP'], 0.15), (['VP', 'Adverb'], 0.1)]}\n"
368+
]
369+
}
370+
],
371+
"source": [
372+
"lexicon = ProbLexicon(\n",
373+
" Verb=\"is [0.5] | say [0.3] | are [0.2]\",\n",
374+
" Noun=\"robot [0.4] | sheep [0.4] | fence [0.2]\",\n",
375+
" Adjective=\"good [0.5] | new [0.2] | sad [0.3]\",\n",
376+
" Adverb=\"here [0.6] | lightly [0.1] | now [0.3]\",\n",
377+
" Pronoun=\"me [0.3] | you [0.4] | he [0.3]\",\n",
378+
" RelPro=\"that [0.5] | who [0.3] | which [0.2]\",\n",
379+
" Name=\"john [0.4] | mary [0.4] | peter [0.2]\",\n",
380+
" Article=\"the [0.5] | a [0.25] | an [0.25]\",\n",
381+
" Preposition=\"to [0.4] | in [0.3] | at [0.3]\",\n",
382+
" Conjuction=\"and [0.5] | or [0.2] | but [0.3]\",\n",
383+
" Digit=\"0 [0.35] | 1 [0.35] | 2 [0.3]\"\n",
384+
")\n",
385+
"\n",
386+
"print(\"Lexicon\", lexicon)\n",
387+
"\n",
388+
"rules = ProbRules(\n",
389+
" S=\"NP VP [0.6] | S Conjuction S [0.4]\",\n",
390+
" NP=\"Pronoun [0.2] | Name [0.05] | Noun [0.2] | Article Noun [0.15] \\\n",
391+
" | Article Adjs Noun [0.1] | Digit [0.05] | NP PP [0.15] | NP RelClause [0.1]\",\n",
392+
" VP=\"Verb [0.3] | VP NP [0.2] | VP Adjective [0.25] | VP PP [0.15] | VP Adverb [0.1]\",\n",
393+
" Adjs=\"Adjective [0.5] | Adjective Adjs [0.5]\",\n",
394+
" PP=\"Preposition NP [1]\",\n",
395+
" RelClause=\"RelPro VP [1]\"\n",
396+
")\n",
397+
"\n",
398+
"print(\"\\nRules:\", rules)"
399+
]
400+
},
401+
{
402+
"cell_type": "markdown",
403+
"metadata": {},
404+
"source": [
405+
"Let's use the above to assemble our probabilistic grammar and run some simple queries:"
406+
]
407+
},
408+
{
409+
"cell_type": "code",
410+
"execution_count": 3,
411+
"metadata": {},
412+
"outputs": [
413+
{
414+
"name": "stdout",
415+
"output_type": "stream",
416+
"text": [
417+
"How can we rewrite 'VP'? [(['Verb'], 0.3), (['VP', 'NP'], 0.2), (['VP', 'Adjective'], 0.25), (['VP', 'PP'], 0.15), (['VP', 'Adverb'], 0.1)]\n",
418+
"Is 'the' an article? True\n",
419+
"Is 'here' a noun? False\n"
420+
]
421+
}
422+
],
423+
"source": [
424+
"grammar = ProbGrammar(\"A Simple Probabilistic Grammar\", rules, lexicon)\n",
425+
"\n",
426+
"print(\"How can we rewrite 'VP'?\", grammar.rewrites_for('VP'))\n",
427+
"print(\"Is 'the' an article?\", grammar.isa('the', 'Article'))\n",
428+
"print(\"Is 'here' a noun?\", grammar.isa('here', 'Noun'))"
429+
]
430+
},
431+
{
432+
"cell_type": "markdown",
433+
"metadata": {},
434+
"source": [
435+
"Lastly, we can generate random sentences from this grammar. The function `generate_random` returns a tuple (sentence, probability)."
436+
]
437+
},
438+
{
439+
"cell_type": "code",
440+
"execution_count": 5,
441+
"metadata": {},
442+
"outputs": [
443+
{
444+
"name": "stdout",
445+
"output_type": "stream",
446+
"text": [
447+
"a sheep say at the sad sad robot the good new sheep but john at fence are to me who is to robot the good new fence to robot who is mary in robot to 1 to an sad sad sad robot in fence lightly now at 1 at a new robot here good at john an robot in a fence in john the sheep here 2 to sheep good and you is but sheep is sad a good robot or the fence is robot good lightly at a good robot at 2 now good new or 1 say but he say or peter are in you who is lightly and fence say to john to an robot and sheep say and me is good or a robot is and sheep that say good he new 2 which are sad to an good fence that say 1 good good new lightly are good at he sad here but an sheep who say say sad now lightly sad an sad sad sheep or mary are but a fence at he in 1 say and 2 are\n",
448+
"5.453065905143236e-226\n"
449+
]
450+
}
451+
],
452+
"source": [
453+
"sentence, prob = grammar.generate_random('S')\n",
454+
"print(sentence)\n",
455+
"print(prob)"
456+
]
457+
},
458+
{
459+
"cell_type": "markdown",
460+
"metadata": {},
461+
"source": [
462+
"As with the non-probabilistic grammars, this one mostly overgenerates. You can also see that the probability is very, very low, which means there is an enormous number of sentences that can be generated (in this case infinitely many, since we have recursion; notice how `VP` can produce another `VP`, for example)."
302463
]
303464
},
304465
{

nlp.py

Lines changed: 99 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
# from the third edition until this gets reviewed.)
55

66
from collections import defaultdict
7+
from utils import weighted_choice
78
import urllib.request
89
import re
910

@@ -51,6 +52,104 @@ def isa(self, word, cat):
5152
"""Return True iff word is of category cat"""
5253
return cat in self.categories[word]
5354

55+
def generate_random(self, S='S'):
56+
"""Replace each token in S by a random entry in grammar (recursively)."""
57+
import random
58+
59+
def rewrite(tokens, into):
60+
for token in tokens:
61+
if token in self.rules:
62+
rewrite(random.choice(self.rules[token]), into)
63+
elif token in self.lexicon:
64+
into.append(random.choice(self.lexicon[token]))
65+
else:
66+
into.append(token)
67+
return into
68+
69+
return ' '.join(rewrite(S.split(), []))
70+
71+
    def __repr__(self):
        """Return a short descriptive representation, e.g. '<Grammar English>'."""
        return '<Grammar {}>'.format(self.name)
73+
74+
75+
def ProbRules(**rules):
    """Create a dictionary mapping symbols to alternative sequences,
    each paired with its probability.
    >>> ProbRules(A = "B C [0.3] | D E [0.7]")
    {'A': [(['B', 'C'], 0.3), (['D', 'E'], 0.7)]}
    """
    for lhs in rules:
        parsed = []
        for alternative in rules[lhs].split('|'):
            tokens = alternative.strip().split()
            # The last token is the bracketed probability, e.g. "[0.3]";
            # strip the brackets and convert it to a float.
            parsed.append((tokens[:-1], float(tokens[-1][1:-1])))
        rules[lhs] = parsed

    return rules
90+
91+
92+
def ProbLexicon(**rules):
    """Create a dictionary mapping symbols to alternative words,
    each paired with its probability.
    >>> ProbLexicon(Article = "the [0.5] | a [0.25] | an [0.25]")
    {'Article': [('the', 0.5), ('a', 0.25), ('an', 0.25)]}
    """
    for lhs in rules:
        choices = []
        for choice in rules[lhs].split('|'):
            tokens = choice.strip().split()
            # The last token is the bracketed probability, e.g. "[0.5]";
            # the word itself is the first of the remaining tokens.
            choices.append((tokens[:-1][0], float(tokens[-1][1:-1])))
        rules[lhs] = choices

    return rules
108+
109+
110+
class ProbGrammar:
    """A probabilistic grammar: a set of rules and a lexicon in which every
    alternative carries a probability. The lexicon maps categories to
    (word, probability) tuples and the rules map non-terminals to
    (list-of-symbols, probability) tuples, as built by ProbLexicon/ProbRules."""

    def __init__(self, name, rules, lexicon):
        """A grammar has a set of rules and a lexicon.
        Each rule has a probability."""
        self.name = name
        self.rules = rules
        self.lexicon = lexicon
        # Inverted index: map each word to the (category, probability)
        # pairs under which it may appear, for fast isa() lookups.
        self.categories = defaultdict(list)
        for lhs in lexicon:
            for word, prob in lexicon[lhs]:
                self.categories[word].append((lhs, prob))

    def rewrites_for(self, cat):
        """Return a sequence of possible rhs's that cat can be rewritten as."""
        return self.rules.get(cat, ())

    def isa(self, word, cat):
        """Return True iff word is of category cat."""
        return cat in [c for c, _ in self.categories[word]]

    def generate_random(self, S='S'):
        """Replace each token in S by a random entry in grammar (recursively).
        Returns a tuple of (sentence, probability)."""
        # Fix: removed an unused `import random` — selection is done entirely
        # by `weighted_choice` (imported from utils at module level).

        def rewrite(tokens, into):
            # `into` is a pair [words, probability]: words accumulates the
            # generated terminals, probability is the running product of the
            # probabilities of every alternative chosen along the way.
            for token in tokens:
                if token in self.rules:
                    non_terminal, prob = weighted_choice(self.rules[token])
                    into[1] *= prob
                    rewrite(non_terminal, into)
                elif token in self.lexicon:
                    terminal, prob = weighted_choice(self.lexicon[token])
                    into[0].append(terminal)
                    into[1] *= prob
                else:
                    # Unknown token: emit it literally with probability 1.
                    into[0].append(token)
            return into

        rewritten_as, prob = rewrite(S.split(), [[], 1])
        return (' '.join(rewritten_as), prob)

    def __repr__(self):
        return '<Grammar {}>'.format(self.name)
56155

@@ -96,23 +195,6 @@ def __repr__(self):
96195
N='man'))
97196

98197

99-
def generate_random(grammar=E_, S='S'):
100-
"""Replace each token in S by a random entry in grammar (recursively).
101-
This is useful for testing a grammar, e.g. generate_random(E_)"""
102-
import random
103-
104-
def rewrite(tokens, into):
105-
for token in tokens:
106-
if token in grammar.rules:
107-
rewrite(random.choice(grammar.rules[token]), into)
108-
elif token in grammar.lexicon:
109-
into.append(random.choice(grammar.lexicon[token]))
110-
else:
111-
into.append(token)
112-
return into
113-
114-
return ' '.join(rewrite(S.split(), []))
115-
116198
# ______________________________________________________________________________
117199
# Chart Parsing
118200

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy