diff --git a/nlp.ipynb b/nlp.ipynb
index 12e00ba15..4f79afe75 100644
--- a/nlp.ipynb
+++ b/nlp.ipynb
@@ -20,7 +20,8 @@
    "outputs": [],
    "source": [
     "import nlp\n",
-    "from nlp import Page, HITS, Lexicon, Rules, Grammar"
+    "from nlp import Page, HITS\n",
+    "from nlp import Lexicon, Rules, Grammar, ProbLexicon, ProbRules, ProbGrammar"
    ]
   },
   {
@@ -151,7 +152,9 @@
    "source": [
     "### Implementation\n",
     "\n",
-    "In the module we have implemented a `Lexicon` and a `Rules` function, which we can combine to create a `Grammar` object.\n",
+    "In the module we have implementations for both probabilistic and non-probabilistic grammars. Both implementations follow the same format: there are functions for the lexicon and the rules, which can be combined to create a grammar object.\n",
+    "\n",
+    "#### Non-Probabilistic\n",
     "\n",
     "Execute the cells below to view the implementations:"
    ]
   },
   {
@@ -205,9 +208,9 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-     "Lexicon {'Article': ['the', 'a', 'an'], 'Adverb': ['here', 'lightly', 'now'], 'Digit': ['1', '2', '0'], 'Pronoun': ['me', 'you', 'he'], 'Name': ['john', 'mary', 'peter'], 'Adjective': ['good', 'new', 'sad'], 'Conjuction': ['and', 'or', 'but'], 'Preposition': ['to', 'in', 'at'], 'RelPro': ['that', 'who', 'which'], 'Verb': ['is', 'say', 'are'], 'Noun': ['robot', 'sheep', 'fence']}\n",
+     "Lexicon {'Verb': ['is', 'say', 'are'], 'RelPro': ['that', 'who', 'which'], 'Conjuction': ['and', 'or', 'but'], 'Digit': ['1', '2', '0'], 'Noun': ['robot', 'sheep', 'fence'], 'Pronoun': ['me', 'you', 'he'], 'Preposition': ['to', 'in', 'at'], 'Name': ['john', 'mary', 'peter'], 'Article': ['the', 'a', 'an'], 'Adjective': ['good', 'new', 'sad'], 'Adverb': ['here', 'lightly', 'now']}\n",
     "\n",
-     "Rules: {'Adjs': [['Adjective'], ['Adjective', 'Adjs']], 'PP': [['Preposition', 'NP']], 'RelClause': [['RelPro', 'VP']], 'VP': [['Verb'], ['VP', 'NP'], ['VP', 'Adjective'], ['VP', 'PP'], ['VP', 'Adverb']], 'NP': [['Pronoun'], ['Name'], ['Noun'], ['Article', 'Noun'], ['Article', 'Adjs', 'Noun'], ['Digit'], ['NP', 'PP'], ['NP', 'RelClause']], 'S': [['NP', 'VP'], ['S', 'Conjuction', 'S']]}\n"
+     "Rules: {'RelClause': [['RelPro', 'VP']], 'S': [['NP', 'VP'], ['S', 'Conjuction', 'S']], 'PP': [['Preposition', 'NP']], 'VP': [['Verb'], ['VP', 'NP'], ['VP', 'Adjective'], ['VP', 'PP'], ['VP', 'Adverb']], 'NP': [['Pronoun'], ['Name'], ['Noun'], ['Article', 'Noun'], ['Article', 'Adjs', 'Noun'], ['Digit'], ['NP', 'PP'], ['NP', 'RelClause']], 'Adjs': [['Adjective'], ['Adjective', 'Adjs']]}\n"
     ]
    }
   ],
@@ -287,7 +290,7 @@
   {
    "data": {
     "text/plain": [
-     "'a robot is to a robot sad but robot say you 0 in me in a robot at the sheep at 1 good an fence in sheep in me that are in john new lightly lightly here a new good new robot lightly new in sheep lightly'"
+     "'the fence are or 1 say in john that is here lightly to peter lightly sad good at you good here me good at john in an fence to fence at robot lightly and a robot who is here sad sheep in fence in fence at he sad here lightly to 0 say and fence is good in a sad sheep in a fence but he say here'"
     ]
    },
    "execution_count": 7,
@@ -296,9 +299,167 @@
    }
   ],
   "source": [
-    "from nlp import generate_random\n",
+    "grammar.generate_random('S')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Probabilistic\n",
+    "\n",
+    "The probabilistic grammars follow the same approach: the lexicon and the rules are built from strings and assembled into a grammar object, which can then generate random sentences along with each sentence's probability. 
The main difference is that the lexicon now holds (terminal, probability) tuples instead of plain strings, and each rule maps to a list of (non-terminal sequence, probability) tuples instead of a list of non-terminal lists.\n",
     "\n",
-    "generate_random(grammar)"
+    "Execute the cells to read the code:"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "%psource ProbLexicon"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "%psource ProbRules"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "%psource ProbGrammar"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's build a lexicon and rules for the probabilistic grammar:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Lexicon {'Verb': [('is', 0.5), ('say', 0.3), ('are', 0.2)], 'Adjective': [('good', 0.5), ('new', 0.2), ('sad', 0.3)], 'Preposition': [('to', 0.4), ('in', 0.3), ('at', 0.3)], 'Pronoun': [('me', 0.3), ('you', 0.4), ('he', 0.3)], 'Conjuction': [('and', 0.5), ('or', 0.2), ('but', 0.3)], 'Adverb': [('here', 0.6), ('lightly', 0.1), ('now', 0.3)], 'Article': [('the', 0.5), ('a', 0.25), ('an', 0.25)], 'Digit': [('0', 0.35), ('1', 0.35), ('2', 0.3)], 'RelPro': [('that', 0.5), ('who', 0.3), ('which', 0.2)], 'Noun': [('robot', 0.4), ('sheep', 0.4), ('fence', 0.2)], 'Name': [('john', 0.4), ('mary', 0.4), ('peter', 0.2)]}\n",
+      "\n",
+      "Rules: {'RelClause': [(['RelPro', 'VP'], 1.0)], 'Adjs': [(['Adjective'], 0.5), (['Adjective', 'Adjs'], 0.5)], 'PP': [(['Preposition', 'NP'], 1.0)], 'NP': [(['Pronoun'], 0.2), (['Name'], 0.05), (['Noun'], 0.2), (['Article', 'Noun'], 0.15), (['Article', 'Adjs', 'Noun'], 0.1), (['Digit'], 0.05), (['NP', 'PP'], 0.15), (['NP', 'RelClause'], 0.1)], 'S': [(['NP', 'VP'], 0.6), (['S', 'Conjuction', 'S'], 0.4)], 'VP': [(['Verb'], 0.3), (['VP', 'NP'], 0.2), (['VP', 'Adjective'], 0.25), (['VP', 'PP'], 0.15), (['VP', 'Adverb'], 0.1)]}\n"
+     ]
+    }
+   ],
+   "source": [
+    "lexicon = ProbLexicon(\n",
+    "    Verb=\"is [0.5] | say [0.3] | are [0.2]\",\n",
+    "    Noun=\"robot [0.4] | sheep [0.4] | fence [0.2]\",\n",
+    "    Adjective=\"good [0.5] | new [0.2] | sad [0.3]\",\n",
+    "    Adverb=\"here [0.6] | lightly [0.1] | now [0.3]\",\n",
+    "    Pronoun=\"me [0.3] | you [0.4] | he [0.3]\",\n",
+    "    RelPro=\"that [0.5] | who [0.3] | which [0.2]\",\n",
+    "    Name=\"john [0.4] | mary [0.4] | peter [0.2]\",\n",
+    "    Article=\"the [0.5] | a [0.25] | an [0.25]\",\n",
+    "    Preposition=\"to [0.4] | in [0.3] | at [0.3]\",\n",
+    "    Conjuction=\"and [0.5] | or [0.2] | but [0.3]\",\n",
+    "    Digit=\"0 [0.35] | 1 [0.35] | 2 [0.3]\"\n",
+    ")\n",
+    "\n",
+    "print(\"Lexicon\", lexicon)\n",
+    "\n",
+    "rules = ProbRules(\n",
+    "    S=\"NP VP [0.6] | S Conjuction S [0.4]\",\n",
+    "    NP=\"Pronoun [0.2] | Name [0.05] | Noun [0.2] | Article Noun [0.15] \\\n",
+    "    | Article Adjs Noun [0.1] | Digit [0.05] | NP PP [0.15] | NP RelClause [0.1]\",\n",
+    "    VP=\"Verb [0.3] | VP NP [0.2] | VP Adjective [0.25] | VP PP [0.15] | VP Adverb [0.1]\",\n",
+    "    Adjs=\"Adjective [0.5] | Adjective Adjs [0.5]\",\n",
+    "    PP=\"Preposition NP [1]\",\n",
+    "    RelClause=\"RelPro VP [1]\"\n",
+    ")\n",
+    "\n",
+    "print(\"\\nRules:\", rules)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Let's use the above to assemble our probabilistic grammar and run some simple queries:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "How can we rewrite 'VP'? [(['Verb'], 0.3), (['VP', 'NP'], 0.2), (['VP', 'Adjective'], 0.25), (['VP', 'PP'], 0.15), (['VP', 'Adverb'], 0.1)]\n",
+      "Is 'the' an article? True\n",
+      "Is 'here' a noun? False\n"
+     ]
+    }
+   ],
+   "source": [
+    "grammar = ProbGrammar(\"A Simple Probabilistic Grammar\", rules, lexicon)\n",
+    "\n",
+    "print(\"How can we rewrite 'VP'?\", grammar.rewrites_for('VP'))\n",
+    "print(\"Is 'the' an article?\", grammar.isa('the', 'Article'))\n",
+    "print(\"Is 'here' a noun?\", grammar.isa('here', 'Noun'))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Lastly, we can generate random sentences from this grammar. The method `generate_random` returns a tuple (sentence, probability)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "a sheep say at the sad sad robot the good new sheep but john at fence are to me who is to robot the good new fence to robot who is mary in robot to 1 to an sad sad sad robot in fence lightly now at 1 at a new robot here good at john an robot in a fence in john the sheep here 2 to sheep good and you is but sheep is sad a good robot or the fence is robot good lightly at a good robot at 2 now good new or 1 say but he say or peter are in you who is lightly and fence say to john to an robot and sheep say and me is good or a robot is and sheep that say good he new 2 which are sad to an good fence that say 1 good good new lightly are good at he sad here but an sheep who say say sad now lightly sad an sad sad sheep or mary are but a fence at he in 1 say and 2 are\n",
+      "5.453065905143236e-226\n"
+     ]
+    }
+   ],
+   "source": [
+    "sentence, prob = grammar.generate_random('S')\n",
+    "print(sentence)\n",
+    "print(prob)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As with the non-probabilistic grammars, this one mostly overgenerates. You can also see that the probability is extremely low, which reflects how many sentences the grammar can generate (infinitely many, in fact, since the rules are recursive; notice how `VP` can produce another `VP`, for example)."
+   ]
+  },
   {
diff --git a/nlp.py b/nlp.py
index 9e3e87fec..e9eff8e01 100644
--- a/nlp.py
+++ b/nlp.py
@@ -4,6 +4,7 @@
 # from the third edition until this gets reviewed.)
 
 from collections import defaultdict
+from utils import weighted_choice
 import urllib.request
 import re
 
@@ -51,6 +52,104 @@ def isa(self, word, cat):
         """Return True iff word is of category cat"""
         return cat in self.categories[word]
 
+    def generate_random(self, S='S'):
+        """Replace each token in S by a random entry in grammar (recursively)."""
+        import random
+
+        def rewrite(tokens, into):
+            for token in tokens:
+                if token in self.rules:
+                    rewrite(random.choice(self.rules[token]), into)
+                elif token in self.lexicon:
+                    into.append(random.choice(self.lexicon[token]))
+                else:
+                    into.append(token)
+            return into
+
+        return ' '.join(rewrite(S.split(), []))
+
+    def __repr__(self):
+        return '<Grammar {}>'.format(self.name)
+
+
+def ProbRules(**rules):
+    """Create a dictionary mapping symbols to alternative sequences,
+    with probabilities.
+    >>> ProbRules(A = "B C [0.3] | D E [0.7]")
+    {'A': [(['B', 'C'], 0.3), (['D', 'E'], 0.7)]}
+    """
+    for (lhs, rhs) in rules.items():
+        rules[lhs] = []
+        rhs_separate = [alt.strip().split() for alt in rhs.split('|')]
+        for r in rhs_separate:
+            prob = float(r[-1][1:-1])  # remove brackets, convert to float
+            rhs_rule = (r[:-1], prob)
+            rules[lhs].append(rhs_rule)
+
+    return rules
+
+
+def ProbLexicon(**rules):
+    """Create a dictionary mapping symbols to alternative words,
+    with probabilities.
+    >>> ProbLexicon(Article = "the [0.5] | a [0.25] | an [0.25]")
+    {'Article': [('the', 0.5), ('a', 0.25), ('an', 0.25)]}
+    """
+    for (lhs, rhs) in rules.items():
+        rules[lhs] = []
+        rhs_separate = [word.strip().split() for word in rhs.split('|')]
+        for r in rhs_separate:
+            prob = float(r[-1][1:-1])  # remove brackets, convert to float
+            word = r[:-1][0]
+            rhs_rule = (word, prob)
+            rules[lhs].append(rhs_rule)
+
+    return rules
+
+
+class ProbGrammar:
+
+    def __init__(self, name, rules, lexicon):
+        """A grammar has a set of rules and a lexicon.
+        Each rule has a probability."""
+        self.name = name
+        self.rules = rules
+        self.lexicon = lexicon
+        self.categories = defaultdict(list)
+        for lhs in lexicon:
+            for word, prob in lexicon[lhs]:
+                self.categories[word].append((lhs, prob))
+
+    def rewrites_for(self, cat):
+        """Return a sequence of possible rhs's that cat can be rewritten as."""
+        return self.rules.get(cat, ())
+
+    def isa(self, word, cat):
+        """Return True iff word is of category cat"""
+        return cat in [c for c, _ in self.categories[word]]
+
+    def generate_random(self, S='S'):
+        """Replace each token in S by a random entry in grammar (recursively).
+        Returns a tuple of (sentence, probability)."""
+        import random
+
+        def rewrite(tokens, into):
+            # into is a pair: [terminals generated so far, accumulated probability]
+            for token in tokens:
+                if token in self.rules:
+                    non_terminal, prob = weighted_choice(self.rules[token])
+                    into[1] *= prob
+                    rewrite(non_terminal, into)
+                elif token in self.lexicon:
+                    terminal, prob = weighted_choice(self.lexicon[token])
+                    into[0].append(terminal)
+                    into[1] *= prob
+                else:
+                    into[0].append(token)
+            return into
+
+        rewritten_as, prob = rewrite(S.split(), [[], 1])
+        return (' '.join(rewritten_as), prob)
+
     def __repr__(self):
         return '<Grammar {}>'.format(self.name)
 
@@ -96,23 +195,6 @@ def __repr__(self):
             N='man'))
 
 
-def generate_random(grammar=E_, S='S'):
-    """Replace each token in S by a random entry in grammar (recursively).
-    This is useful for testing a grammar, e.g. generate_random(E_)"""
-    import random
-
-    def rewrite(tokens, into):
-        for token in tokens:
-            if token in grammar.rules:
-                rewrite(random.choice(grammar.rules[token]), into)
-            elif token in grammar.lexicon:
-                into.append(random.choice(grammar.lexicon[token]))
-            else:
-                into.append(token)
-        return into
-
-    return ' '.join(rewrite(S.split(), []))
-
 # ______________________________________________________________________________
 # Chart Parsing
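A quick way to see how `ProbGrammar.generate_random` accumulates its probability is a minimal grammar where the arithmetic is easy to follow by hand. This is an illustrative sketch, not part of the diff: the tiny grammar below is made up, while `ProbLexicon`, `ProbRules`, `ProbGrammar`, and `generate_random` are exactly the names introduced above.

```python
# Minimal sketch of the probabilistic API added in nlp.py above;
# the tiny grammar itself is invented for illustration.
from nlp import ProbLexicon, ProbRules, ProbGrammar

lexicon = ProbLexicon(Pronoun="i [0.4] | you [0.3] | he [0.3]",
                      Verb="am [0.5] | are [0.25] | is [0.25]")
rules = ProbRules(S="Pronoun Verb [1]")  # S always rewrites to Pronoun Verb

grammar = ProbGrammar("Tiny", rules, lexicon)
sentence, prob = grammar.generate_random('S')

# The probability is the product of every choice made while rewriting,
# e.g. P("i am") = 1.0 (rule) * 0.4 ('i') * 0.5 ('am') = 0.2
print(sentence, prob)
```

This also explains the notebook's tiny `5.45e-226`: every rewrite multiplies in another factor below 1, so long sentences end up with vanishingly small probabilities.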
diff --git a/tests/test_nlp.py b/tests/test_nlp.py
index 6623162bc..e5ccb1e63 100644
--- a/tests/test_nlp.py
+++ b/tests/test_nlp.py
@@ -4,7 +4,7 @@ from nlp import loadPageHTML, stripRawHTML, findOutlinks, onlyWikipediaURLS
 from nlp import expand_pages, relevant_pages, normalize, ConvergenceDetector, getInlinks
 from nlp import getOutlinks, Page, determineInlinks, HITS
-from nlp import Rules, Lexicon, Grammar
+from nlp import Rules, Lexicon, Grammar, ProbRules, ProbLexicon, ProbGrammar
 
 # Clumsy imports because we want to access certain nlp.py globals explicitly, because
 # they are accessed by functions within nlp.py
@@ -19,7 +19,8 @@ def test_rules():
 
 def test_lexicon():
     check = {'Article': ['the', 'a', 'an'], 'Pronoun': ['i', 'you', 'he']}
-    assert Lexicon(Article="the | a | an", Pronoun="i | you | he") == check
+    lexicon = Lexicon(Article="the | a | an", Pronoun="i | you | he")
+    assert lexicon == check
 
 
 def test_grammar():
@@ -31,6 +32,66 @@ def test_grammar():
     assert grammar.isa('the', 'Article')
 
 
+def test_generation():
+    lexicon = Lexicon(Article="the | a | an",
+                      Pronoun="i | you | he")
+
+    rules = Rules(
+        S="Article | More | Pronoun",
+        More="Article Pronoun | Pronoun Pronoun"
+    )
+
+    grammar = Grammar("Simplegram", rules, lexicon)
+
+    sentence = grammar.generate_random('S')
+    for token in sentence.split():
+        found = False
+        for non_terminal, terminals in grammar.lexicon.items():
+            if token in terminals:
+                found = True
+        assert found
+
+
+def test_prob_rules():
+    check = {'A': [(['B', 'C'], 0.3), (['D', 'E'], 0.7)],
+             'B': [(['E'], 0.1), (['a'], 0.2), (['b', 'c'], 0.7)]}
+    rules = ProbRules(A="B C [0.3] | D E [0.7]", B="E [0.1] | a [0.2] | b c [0.7]")
+    assert rules == check
+
+
+def test_prob_lexicon():
+    check = {'Article': [('the', 0.5), ('a', 0.25), ('an', 0.25)],
+             'Pronoun': [('i', 0.4), ('you', 0.3), ('he', 0.3)]}
+    lexicon = ProbLexicon(Article="the [0.5] | a [0.25] | an [0.25]",
+                          Pronoun="i [0.4] | you [0.3] | he [0.3]")
+    assert lexicon == check
+
+
+def test_prob_grammar():
+    rules = ProbRules(A="B C [0.3] | D E [0.7]", B="E [0.1] | a [0.2] | b c [0.7]")
+    lexicon = ProbLexicon(Article="the [0.5] | a [0.25] | an [0.25]",
+                          Pronoun="i [0.4] | you [0.3] | he [0.3]")
+    grammar = ProbGrammar("Simplegram", rules, lexicon)
+
+    assert grammar.rewrites_for('A') == [(['B', 'C'], 0.3), (['D', 'E'], 0.7)]
+    assert grammar.isa('the', 'Article')
+
+
+def test_prob_generation():
+    lexicon = ProbLexicon(Verb="am [0.5] | are [0.25] | is [0.25]",
+                          Pronoun="i [0.4] | you [0.3] | he [0.3]")
+
+    rules = ProbRules(
+        S="Verb [0.5] | More [0.3] | Pronoun [0.1] | nobody is here [0.1]",
+        More="Pronoun Verb [0.7] | Pronoun Pronoun [0.3]"
+    )
+
+    grammar = ProbGrammar("Simplegram", rules, lexicon)
+
+    # generate_random returns a (sentence, probability) pair
+    result = grammar.generate_random('S')
+    assert len(result) == 2
+
+
 # ______________________________________________________________________________
 # Data Setup
diff --git a/tests/test_utils.py b/tests/test_utils.py
index c0687ad89..a07bc76ef 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -173,6 +173,12 @@ def test_sigmoid_derivative():
     assert sigmoid_derivative(value) == -6
 
 
+def test_weighted_choice():
+    choices = [('a', 0.5), ('b', 0.3), ('c', 0.2)]
+    choice = weighted_choice(choices)
+    assert choice in choices
+
+
 def compare_list(x, y):
     return all([elm_x == y[i] for i, elm_x in enumerate(x)])
 
diff --git a/utils.py b/utils.py
index 74ceb11f8..d2720abe1 100644
--- a/utils.py
+++ b/utils.py
@@ -291,6 +291,19 @@ def isclose(a, b, rel_tol=1e-09, abs_tol=0.0):
     return abs(a - b) <= max(rel_tol * max(abs(a), abs(b)), abs_tol)
 
 
+def weighted_choice(choices):
+    """A weighted version of random.choice"""
+    # NOTE: Should be replaced by random.choices if we port to Python 3.6
+
+    total = sum(w for _, w in choices)
+    r = random.uniform(0, total)
+    upto = 0
+    for c, w in choices:
+        if upto + w >= r:
+            return c, w
+        upto += w
+
+
 # ______________________________________________________________________________
 # Grid Functions
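The new `test_weighted_choice` only asserts that the returned pair is one of the choices. For a stronger, if informal, check that the weights are actually respected, one can inspect empirical frequencies — a sketch, assuming the `utils.py` from this diff is importable:

```python
# Rough empirical check that weighted_choice samples according to its weights.
from collections import Counter
from utils import weighted_choice

choices = [('a', 0.5), ('b', 0.3), ('c', 0.2)]
counts = Counter(weighted_choice(choices)[0] for _ in range(10000))
print(counts)  # expect roughly 5000 'a', 3000 'b', 2000 'c'
```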
