Skip to content

Commit 14c3f77

Browse files
antmarakisnorvig
authored and committed
NLP Module: Probabilistic Grammar (aimacode#599)
* add prob-grammar to notebook * Update nlp.py * add weighted choice * tests for prob grammar + generation * add test for weighted choice * Update nlp.py
1 parent b102884 commit 14c3f77

File tree

5 files changed

+349
-26
lines changed

5 files changed

+349
-26
lines changed

nlp.ipynb

Lines changed: 168 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,8 @@
2020
"outputs": [],
2121
"source": [
2222
"import nlp\n",
23-
"from nlp import Page, HITS, Lexicon, Rules, Grammar"
23+
"from nlp import Page, HITS\n",
24+
"from nlp import Lexicon, Rules, Grammar, ProbLexicon, ProbRules, ProbGrammar"
2425
]
2526
},
2627
{
@@ -151,7 +152,9 @@
151152
"source": [
152153
"### Implementation\n",
153154
"\n",
154-
"In the module we have implemented a `Lexicon` and a `Rules` function, which we can combine to create a `Grammar` object.\n",
155+
"In the module we have implementations for both probabilistic and non-probabilistic grammars. Both of these implementations follow the same format. There are functions for the lexicon and the rules, which can be combined to create a grammar object.\n",
156+
"\n",
157+
"#### Non-Probabilistic\n",
155158
"\n",
156159
"Execute the cells below to view the implementations:"
157160
]
@@ -205,9 +208,9 @@
205208
"name": "stdout",
206209
"output_type": "stream",
207210
"text": [
208-
"Lexicon {'Article': ['the', 'a', 'an'], 'Adverb': ['here', 'lightly', 'now'], 'Digit': ['1', '2', '0'], 'Pronoun': ['me', 'you', 'he'], 'Name': ['john', 'mary', 'peter'], 'Adjective': ['good', 'new', 'sad'], 'Conjuction': ['and', 'or', 'but'], 'Preposition': ['to', 'in', 'at'], 'RelPro': ['that', 'who', 'which'], 'Verb': ['is', 'say', 'are'], 'Noun': ['robot', 'sheep', 'fence']}\n",
211+
"Lexicon {'Verb': ['is', 'say', 'are'], 'RelPro': ['that', 'who', 'which'], 'Conjuction': ['and', 'or', 'but'], 'Digit': ['1', '2', '0'], 'Noun': ['robot', 'sheep', 'fence'], 'Pronoun': ['me', 'you', 'he'], 'Preposition': ['to', 'in', 'at'], 'Name': ['john', 'mary', 'peter'], 'Article': ['the', 'a', 'an'], 'Adjective': ['good', 'new', 'sad'], 'Adverb': ['here', 'lightly', 'now']}\n",
209212
"\n",
210-
"Rules: {'Adjs': [['Adjective'], ['Adjective', 'Adjs']], 'PP': [['Preposition', 'NP']], 'RelClause': [['RelPro', 'VP']], 'VP': [['Verb'], ['VP', 'NP'], ['VP', 'Adjective'], ['VP', 'PP'], ['VP', 'Adverb']], 'NP': [['Pronoun'], ['Name'], ['Noun'], ['Article', 'Noun'], ['Article', 'Adjs', 'Noun'], ['Digit'], ['NP', 'PP'], ['NP', 'RelClause']], 'S': [['NP', 'VP'], ['S', 'Conjuction', 'S']]}\n"
213+
"Rules: {'RelClause': [['RelPro', 'VP']], 'S': [['NP', 'VP'], ['S', 'Conjuction', 'S']], 'PP': [['Preposition', 'NP']], 'VP': [['Verb'], ['VP', 'NP'], ['VP', 'Adjective'], ['VP', 'PP'], ['VP', 'Adverb']], 'NP': [['Pronoun'], ['Name'], ['Noun'], ['Article', 'Noun'], ['Article', 'Adjs', 'Noun'], ['Digit'], ['NP', 'PP'], ['NP', 'RelClause']], 'Adjs': [['Adjective'], ['Adjective', 'Adjs']]}\n"
211214
]
212215
}
213216
],
@@ -287,7 +290,7 @@
287290
{
288291
"data": {
289292
"text/plain": [
290-
"'a robot is to a robot sad but robot say you 0 in me in a robot at the sheep at 1 good an fence in sheep in me that are in john new lightly lightly here a new good new robot lightly new in sheep lightly'"
293+
"'the fence are or 1 say in john that is here lightly to peter lightly sad good at you good here me good at john in an fence to fence at robot lightly and a robot who is here sad sheep in fence in fence at he sad here lightly to 0 say and fence is good in a sad sheep in a fence but he say here'"
291294
]
292295
},
293296
"execution_count": 7,
@@ -296,9 +299,167 @@
296299
}
297300
],
298301
"source": [
299-
"from nlp import generate_random\n",
302+
"grammar.generate_random('S')"
303+
]
304+
},
305+
{
306+
"cell_type": "markdown",
307+
"metadata": {},
308+
"source": [
309+
"#### Probabilistic\n",
310+
"\n",
311+
"The probabilistic grammars follow the same approach. They take as input a string, are assembled from a grammar and a lexicon and can generate random sentences (giving the probability of the sentence). The main difference is that in the lexicon we have tuples (terminal, probability) instead of strings and for the rules we have a list of tuples (list of non-terminals, probability) instead of list of lists of non-terminals.\n",
300312
"\n",
301-
"generate_random(grammar)"
313+
"Execute the cells to read the code:"
314+
]
315+
},
316+
{
317+
"cell_type": "code",
318+
"execution_count": 2,
319+
"metadata": {
320+
"collapsed": true
321+
},
322+
"outputs": [],
323+
"source": [
324+
"%psource ProbLexicon"
325+
]
326+
},
327+
{
328+
"cell_type": "code",
329+
"execution_count": 3,
330+
"metadata": {
331+
"collapsed": true
332+
},
333+
"outputs": [],
334+
"source": [
335+
"%psource ProbRules"
336+
]
337+
},
338+
{
339+
"cell_type": "code",
340+
"execution_count": 4,
341+
"metadata": {
342+
"collapsed": true
343+
},
344+
"outputs": [],
345+
"source": [
346+
"%psource ProbGrammar"
347+
]
348+
},
349+
{
350+
"cell_type": "markdown",
351+
"metadata": {},
352+
"source": [
353+
"Let's build a lexicon and rules for the probabilistic grammar:"
354+
]
355+
},
356+
{
357+
"cell_type": "code",
358+
"execution_count": 2,
359+
"metadata": {},
360+
"outputs": [
361+
{
362+
"name": "stdout",
363+
"output_type": "stream",
364+
"text": [
365+
"Lexicon {'Verb': [('is', 0.5), ('say', 0.3), ('are', 0.2)], 'Adjective': [('good', 0.5), ('new', 0.2), ('sad', 0.3)], 'Preposition': [('to', 0.4), ('in', 0.3), ('at', 0.3)], 'Pronoun': [('me', 0.3), ('you', 0.4), ('he', 0.3)], 'Conjuction': [('and', 0.5), ('or', 0.2), ('but', 0.3)], 'Adverb': [('here', 0.6), ('lightly', 0.1), ('now', 0.3)], 'Article': [('the', 0.5), ('a', 0.25), ('an', 0.25)], 'Digit': [('0', 0.35), ('1', 0.35), ('2', 0.3)], 'RelPro': [('that', 0.5), ('who', 0.3), ('which', 0.2)], 'Noun': [('robot', 0.4), ('sheep', 0.4), ('fence', 0.2)], 'Name': [('john', 0.4), ('mary', 0.4), ('peter', 0.2)]}\n",
366+
"\n",
367+
"Rules: {'RelClause': [(['RelPro', 'VP'], 1.0)], 'Adjs': [(['Adjective'], 0.5), (['Adjective', 'Adjs'], 0.5)], 'PP': [(['Preposition', 'NP'], 1.0)], 'NP': [(['Pronoun'], 0.2), (['Name'], 0.05), (['Noun'], 0.2), (['Article', 'Noun'], 0.15), (['Article', 'Adjs', 'Noun'], 0.1), (['Digit'], 0.05), (['NP', 'PP'], 0.15), (['NP', 'RelClause'], 0.1)], 'S': [(['NP', 'VP'], 0.6), (['S', 'Conjuction', 'S'], 0.4)], 'VP': [(['Verb'], 0.3), (['VP', 'NP'], 0.2), (['VP', 'Adjective'], 0.25), (['VP', 'PP'], 0.15), (['VP', 'Adverb'], 0.1)]}\n"
368+
]
369+
}
370+
],
371+
"source": [
372+
"lexicon = ProbLexicon(\n",
373+
" Verb=\"is [0.5] | say [0.3] | are [0.2]\",\n",
374+
" Noun=\"robot [0.4] | sheep [0.4] | fence [0.2]\",\n",
375+
" Adjective=\"good [0.5] | new [0.2] | sad [0.3]\",\n",
376+
" Adverb=\"here [0.6] | lightly [0.1] | now [0.3]\",\n",
377+
" Pronoun=\"me [0.3] | you [0.4] | he [0.3]\",\n",
378+
" RelPro=\"that [0.5] | who [0.3] | which [0.2]\",\n",
379+
" Name=\"john [0.4] | mary [0.4] | peter [0.2]\",\n",
380+
" Article=\"the [0.5] | a [0.25] | an [0.25]\",\n",
381+
" Preposition=\"to [0.4] | in [0.3] | at [0.3]\",\n",
382+
" Conjuction=\"and [0.5] | or [0.2] | but [0.3]\",\n",
383+
" Digit=\"0 [0.35] | 1 [0.35] | 2 [0.3]\"\n",
384+
")\n",
385+
"\n",
386+
"print(\"Lexicon\", lexicon)\n",
387+
"\n",
388+
"rules = ProbRules(\n",
389+
" S=\"NP VP [0.6] | S Conjuction S [0.4]\",\n",
390+
" NP=\"Pronoun [0.2] | Name [0.05] | Noun [0.2] | Article Noun [0.15] \\\n",
391+
" | Article Adjs Noun [0.1] | Digit [0.05] | NP PP [0.15] | NP RelClause [0.1]\",\n",
392+
" VP=\"Verb [0.3] | VP NP [0.2] | VP Adjective [0.25] | VP PP [0.15] | VP Adverb [0.1]\",\n",
393+
" Adjs=\"Adjective [0.5] | Adjective Adjs [0.5]\",\n",
394+
" PP=\"Preposition NP [1]\",\n",
395+
" RelClause=\"RelPro VP [1]\"\n",
396+
")\n",
397+
"\n",
398+
"print(\"\\nRules:\", rules)"
399+
]
400+
},
401+
{
402+
"cell_type": "markdown",
403+
"metadata": {},
404+
"source": [
405+
"Let's use the above to assemble our probabilistic grammar and run some simple queries:"
406+
]
407+
},
408+
{
409+
"cell_type": "code",
410+
"execution_count": 3,
411+
"metadata": {},
412+
"outputs": [
413+
{
414+
"name": "stdout",
415+
"output_type": "stream",
416+
"text": [
417+
"How can we rewrite 'VP'? [(['Verb'], 0.3), (['VP', 'NP'], 0.2), (['VP', 'Adjective'], 0.25), (['VP', 'PP'], 0.15), (['VP', 'Adverb'], 0.1)]\n",
418+
"Is 'the' an article? True\n",
419+
"Is 'here' a noun? False\n"
420+
]
421+
}
422+
],
423+
"source": [
424+
"grammar = ProbGrammar(\"A Simple Probabilistic Grammar\", rules, lexicon)\n",
425+
"\n",
426+
"print(\"How can we rewrite 'VP'?\", grammar.rewrites_for('VP'))\n",
427+
"print(\"Is 'the' an article?\", grammar.isa('the', 'Article'))\n",
428+
"print(\"Is 'here' a noun?\", grammar.isa('here', 'Noun'))"
429+
]
430+
},
431+
{
432+
"cell_type": "markdown",
433+
"metadata": {},
434+
"source": [
435+
"Lastly, we can generate random sentences from this grammar. The function `generate_random` returns a tuple (sentence, probability)."
436+
]
437+
},
438+
{
439+
"cell_type": "code",
440+
"execution_count": 5,
441+
"metadata": {},
442+
"outputs": [
443+
{
444+
"name": "stdout",
445+
"output_type": "stream",
446+
"text": [
447+
"a sheep say at the sad sad robot the good new sheep but john at fence are to me who is to robot the good new fence to robot who is mary in robot to 1 to an sad sad sad robot in fence lightly now at 1 at a new robot here good at john an robot in a fence in john the sheep here 2 to sheep good and you is but sheep is sad a good robot or the fence is robot good lightly at a good robot at 2 now good new or 1 say but he say or peter are in you who is lightly and fence say to john to an robot and sheep say and me is good or a robot is and sheep that say good he new 2 which are sad to an good fence that say 1 good good new lightly are good at he sad here but an sheep who say say sad now lightly sad an sad sad sheep or mary are but a fence at he in 1 say and 2 are\n",
448+
"5.453065905143236e-226\n"
449+
]
450+
}
451+
],
452+
"source": [
453+
"sentence, prob = grammar.generate_random('S')\n",
454+
"print(sentence)\n",
455+
"print(prob)"
456+
]
457+
},
458+
{
459+
"cell_type": "markdown",
460+
"metadata": {},
461+
"source": [
462+
"As with the non-probabilistic grammars, this one mostly overgenerates. You can also see that the probability is very, very low, which means there is an enormous number of sentences that can be generated (in this case infinitely many, since we have recursion; notice how `VP` can produce another `VP`, for example)."
302463
]
303464
},
304465
{

nlp.py

Lines changed: 99 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
# from the third edition until this gets reviewed.)
55

66
from collections import defaultdict
7+
from utils import weighted_choice
78
import urllib.request
89
import re
910

@@ -51,6 +52,104 @@ def isa(self, word, cat):
5152
"""Return True iff word is of category cat"""
5253
return cat in self.categories[word]
5354

55+
def generate_random(self, S='S'):
56+
"""Replace each token in S by a random entry in grammar (recursively)."""
57+
import random
58+
59+
def rewrite(tokens, into):
60+
for token in tokens:
61+
if token in self.rules:
62+
rewrite(random.choice(self.rules[token]), into)
63+
elif token in self.lexicon:
64+
into.append(random.choice(self.lexicon[token]))
65+
else:
66+
into.append(token)
67+
return into
68+
69+
return ' '.join(rewrite(S.split(), []))
70+
71+
    def __repr__(self):
        """Return a short descriptive representation, e.g. '<Grammar English>'."""
        return '<Grammar {}>'.format(self.name)
73+
74+
75+
def ProbRules(**rules):
    """Create a dictionary mapping symbols to alternative sequences,
    each paired with its probability.
    >>> ProbRules(A = "B C [0.3] | D E [0.7]")
    {'A': [(['B', 'C'], 0.3), (['D', 'E'], 0.7)]}
    """
    for lhs in rules:
        parsed = []
        for alternative in rules[lhs].split('|'):
            tokens = alternative.strip().split()
            # The last token is the bracketed probability, e.g. "[0.3]";
            # strip the brackets and convert it to a float.
            parsed.append((tokens[:-1], float(tokens[-1][1:-1])))
        rules[lhs] = parsed

    return rules
90+
91+
92+
def ProbLexicon(**rules):
    """Create a dictionary mapping symbols to alternative words,
    each paired with its probability.
    >>> ProbLexicon(Article = "the [0.5] | a [0.25] | an [0.25]")
    {'Article': [('the', 0.5), ('a', 0.25), ('an', 0.25)]}
    """
    for lhs in rules:
        choices = []
        for choice in rules[lhs].split('|'):
            tokens = choice.strip().split()
            # The last token is the bracketed probability, e.g. "[0.5]";
            # the word itself is the first of the remaining tokens.
            choices.append((tokens[:-1][0], float(tokens[-1][1:-1])))
        rules[lhs] = choices

    return rules
108+
109+
110+
class ProbGrammar:
    """A probabilistic grammar: a set of rules and a lexicon in which every
    alternative carries a probability. The lexicon maps categories to
    (word, probability) tuples and the rules map non-terminals to
    (list-of-symbols, probability) tuples, as built by ProbLexicon/ProbRules."""

    def __init__(self, name, rules, lexicon):
        """A grammar has a set of rules and a lexicon.
        Each rule has a probability."""
        self.name = name
        self.rules = rules
        self.lexicon = lexicon
        # Inverted index: map each word to the (category, probability)
        # pairs under which it may appear, for fast isa() lookups.
        self.categories = defaultdict(list)
        for lhs in lexicon:
            for word, prob in lexicon[lhs]:
                self.categories[word].append((lhs, prob))

    def rewrites_for(self, cat):
        """Return a sequence of possible rhs's that cat can be rewritten as."""
        return self.rules.get(cat, ())

    def isa(self, word, cat):
        """Return True iff word is of category cat."""
        return cat in [c for c, _ in self.categories[word]]

    def generate_random(self, S='S'):
        """Replace each token in S by a random entry in grammar (recursively).
        Returns a tuple of (sentence, probability)."""
        # Fix: removed an unused `import random` — selection is done entirely
        # by `weighted_choice` (imported from utils at module level).

        def rewrite(tokens, into):
            # `into` is a pair [words, probability]: words accumulates the
            # generated terminals, probability is the running product of the
            # probabilities of every alternative chosen along the way.
            for token in tokens:
                if token in self.rules:
                    non_terminal, prob = weighted_choice(self.rules[token])
                    into[1] *= prob
                    rewrite(non_terminal, into)
                elif token in self.lexicon:
                    terminal, prob = weighted_choice(self.lexicon[token])
                    into[0].append(terminal)
                    into[1] *= prob
                else:
                    # Unknown token: emit it literally with probability 1.
                    into[0].append(token)
            return into

        rewritten_as, prob = rewrite(S.split(), [[], 1])
        return (' '.join(rewritten_as), prob)

    def __repr__(self):
        return '<Grammar {}>'.format(self.name)
56155

@@ -96,23 +195,6 @@ def __repr__(self):
96195
N='man'))
97196

98197

99-
def generate_random(grammar=E_, S='S'):
100-
"""Replace each token in S by a random entry in grammar (recursively).
101-
This is useful for testing a grammar, e.g. generate_random(E_)"""
102-
import random
103-
104-
def rewrite(tokens, into):
105-
for token in tokens:
106-
if token in grammar.rules:
107-
rewrite(random.choice(grammar.rules[token]), into)
108-
elif token in grammar.lexicon:
109-
into.append(random.choice(grammar.lexicon[token]))
110-
else:
111-
into.append(token)
112-
return into
113-
114-
return ' '.join(rewrite(S.split(), []))
115-
116198
# ______________________________________________________________________________
117199
# Chart Parsing
118200

0 commit comments

Comments
 (0)
pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy