diff --git a/tests/test_text.py b/tests/test_text.py index 0cd3e675c..d58cd497a 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -6,13 +6,55 @@ from utils import isclose, DataFile -def test_unigram_text_model(): +def test_text_models(): flatland = DataFile("EN-text/flatland.txt").read() wordseq = words(flatland) - P = UnigramTextModel(wordseq) + P1 = UnigramTextModel(wordseq) + P2 = NgramTextModel(2, wordseq) + P3 = NgramTextModel(3, wordseq) + + # The most frequent entries in each model + assert P1.top(10) == [(2081, 'the'), (1479, 'of'), (1021, 'and'), + (1008, 'to'), (850, 'a'), (722, 'i'), (640, 'in'), + (478, 'that'), (399, 'is'), (348, 'you')] + + assert P2.top(10) == [(368, ('of', 'the')), (152, ('to', 'the')), + (152, ('in', 'the')), (86, ('of', 'a')), + (80, ('it', 'is')), + (71, ('by', 'the')), (68, ('for', 'the')), + (68, ('and', 'the')), (62, ('on', 'the')), + (60, ('to', 'be'))] + + assert P3.top(10) == [(30, ('a', 'straight', 'line')), + (19, ('of', 'three', 'dimensions')), + (16, ('the', 'sense', 'of')), + (13, ('by', 'the', 'sense')), + (13, ('as', 'well', 'as')), + (12, ('of', 'the', 'circles')), + (12, ('of', 'sight', 'recognition')), + (11, ('the', 'number', 'of')), + (11, ('that', 'i', 'had')), (11, ('so', 'as', 'to'))] - s, p = viterbi_segment('itiseasytoreadwordswithoutspaces', P) + assert isclose(P1['the'], 0.0611, rel_tol=0.001) + + assert isclose(P2['of', 'the'], 0.0108, rel_tol=0.01) + + assert isclose(P3['', '', 'but'], 0.0, rel_tol=0.001) + assert isclose(P3['', '', 'but'], 0.0, rel_tol=0.001) + assert isclose(P3['so', 'as', 'to'], 0.000323, rel_tol=0.001) + + assert P2.cond_prob.get(('went',)) is None + + assert P3.cond_prob['in', 'order'].dictionary == {'to': 6} + + +def test_viterbi_segmentation(): + flatland = DataFile("EN-text/flatland.txt").read() + wordseq = words(flatland) + P = UnigramTextModel(wordseq) + text = "itiseasytoreadwordswithoutspaces" + s, p = viterbi_segment(text,P) assert s == [ 'it', 'is', 'easy', 'to', 'read', 'words', 'without', 'spaces'] @@ -56,48 +98,6 @@ def test_counting_probability_distribution(): assert 1 / 7 <= min(ps) <= max(ps) <= 1 / 5 -def test_ngram_models(): - flatland = DataFile("EN-text/flatland.txt").read() - wordseq = words(flatland) - P1 = UnigramTextModel(wordseq) - P2 = NgramTextModel(2, wordseq) - P3 = NgramTextModel(3, wordseq) - - # The most frequent entries in each model - assert P1.top(10) == [(2081, 'the'), (1479, 'of'), (1021, 'and'), - (1008, 'to'), (850, 'a'), (722, 'i'), (640, 'in'), - (478, 'that'), (399, 'is'), (348, 'you')] - - assert P2.top(10) == [(368, ('of', 'the')), (152, ('to', 'the')), - (152, ('in', 'the')), (86, ('of', 'a')), - (80, ('it', 'is')), - (71, ('by', 'the')), (68, ('for', 'the')), - (68, ('and', 'the')), (62, ('on', 'the')), - (60, ('to', 'be'))] - - assert P3.top(10) == [(30, ('a', 'straight', 'line')), - (19, ('of', 'three', 'dimensions')), - (16, ('the', 'sense', 'of')), - (13, ('by', 'the', 'sense')), - (13, ('as', 'well', 'as')), - (12, ('of', 'the', 'circles')), - (12, ('of', 'sight', 'recognition')), - (11, ('the', 'number', 'of')), - (11, ('that', 'i', 'had')), (11, ('so', 'as', 'to'))] - - assert isclose(P1['the'], 0.0611, rel_tol=0.001) - - assert isclose(P2['of', 'the'], 0.0108, rel_tol=0.01) - - assert isclose(P3['', '', 'but'], 0.0, rel_tol=0.001) - assert isclose(P3['', '', 'but'], 0.0, rel_tol=0.001) - assert isclose(P3['so', 'as', 'to'], 0.000323, rel_tol=0.001) - - assert P2.cond_prob.get(('went',)) is None - - assert P3.cond_prob['in', 'order'].dictionary == {'to': 6} - - def test_ir_system(): from collections import namedtuple Results = namedtuple('IRResults', ['score', 'url'])
Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.
Alternative Proxies: