diff --git a/tests/test_text.py b/tests/test_text.py index 0cd3e675c..d58cd497a 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -6,13 +6,55 @@ from utils import isclose, DataFile -def test_unigram_text_model(): +def test_text_models(): flatland = DataFile("EN-text/flatland.txt").read() wordseq = words(flatland) - P = UnigramTextModel(wordseq) + P1 = UnigramTextModel(wordseq) + P2 = NgramTextModel(2, wordseq) + P3 = NgramTextModel(3, wordseq) + + # The most frequent entries in each model + assert P1.top(10) == [(2081, 'the'), (1479, 'of'), (1021, 'and'), + (1008, 'to'), (850, 'a'), (722, 'i'), (640, 'in'), + (478, 'that'), (399, 'is'), (348, 'you')] + + assert P2.top(10) == [(368, ('of', 'the')), (152, ('to', 'the')), + (152, ('in', 'the')), (86, ('of', 'a')), + (80, ('it', 'is')), + (71, ('by', 'the')), (68, ('for', 'the')), + (68, ('and', 'the')), (62, ('on', 'the')), + (60, ('to', 'be'))] + + assert P3.top(10) == [(30, ('a', 'straight', 'line')), + (19, ('of', 'three', 'dimensions')), + (16, ('the', 'sense', 'of')), + (13, ('by', 'the', 'sense')), + (13, ('as', 'well', 'as')), + (12, ('of', 'the', 'circles')), + (12, ('of', 'sight', 'recognition')), + (11, ('the', 'number', 'of')), + (11, ('that', 'i', 'had')), (11, ('so', 'as', 'to'))] - s, p = viterbi_segment('itiseasytoreadwordswithoutspaces', P) + assert isclose(P1['the'], 0.0611, rel_tol=0.001) + + assert isclose(P2['of', 'the'], 0.0108, rel_tol=0.01) + + assert isclose(P3['', '', 'but'], 0.0, rel_tol=0.001) + assert isclose(P3['', '', 'but'], 0.0, rel_tol=0.001) + assert isclose(P3['so', 'as', 'to'], 0.000323, rel_tol=0.001) + + assert P2.cond_prob.get(('went',)) is None + + assert P3.cond_prob['in', 'order'].dictionary == {'to': 6} + + +def test_viterbi_segmentation(): + flatland = DataFile("EN-text/flatland.txt").read() + wordseq = words(flatland) + P = UnigramTextModel(wordseq) + text = "itiseasytoreadwordswithoutspaces" + s, p = viterbi_segment(text,P) assert s == [ 'it', 'is', 'easy', 'to', 'read', 'words', 'without', 'spaces'] @@ -56,48 +98,6 @@ def test_counting_probability_distribution(): assert 1 / 7 <= min(ps) <= max(ps) <= 1 / 5 -def test_ngram_models(): - flatland = DataFile("EN-text/flatland.txt").read() - wordseq = words(flatland) - P1 = UnigramTextModel(wordseq) - P2 = NgramTextModel(2, wordseq) - P3 = NgramTextModel(3, wordseq) - - # The most frequent entries in each model - assert P1.top(10) == [(2081, 'the'), (1479, 'of'), (1021, 'and'), - (1008, 'to'), (850, 'a'), (722, 'i'), (640, 'in'), - (478, 'that'), (399, 'is'), (348, 'you')] - - assert P2.top(10) == [(368, ('of', 'the')), (152, ('to', 'the')), - (152, ('in', 'the')), (86, ('of', 'a')), - (80, ('it', 'is')), - (71, ('by', 'the')), (68, ('for', 'the')), - (68, ('and', 'the')), (62, ('on', 'the')), - (60, ('to', 'be'))] - - assert P3.top(10) == [(30, ('a', 'straight', 'line')), - (19, ('of', 'three', 'dimensions')), - (16, ('the', 'sense', 'of')), - (13, ('by', 'the', 'sense')), - (13, ('as', 'well', 'as')), - (12, ('of', 'the', 'circles')), - (12, ('of', 'sight', 'recognition')), - (11, ('the', 'number', 'of')), - (11, ('that', 'i', 'had')), (11, ('so', 'as', 'to'))] - - assert isclose(P1['the'], 0.0611, rel_tol=0.001) - - assert isclose(P2['of', 'the'], 0.0108, rel_tol=0.01) - - assert isclose(P3['', '', 'but'], 0.0, rel_tol=0.001) - assert isclose(P3['', '', 'but'], 0.0, rel_tol=0.001) - assert isclose(P3['so', 'as', 'to'], 0.000323, rel_tol=0.001) - - assert P2.cond_prob.get(('went',)) is None - - assert P3.cond_prob['in', 'order'].dictionary == {'to': 6} - - def test_ir_system(): from collections import namedtuple Results = namedtuple('IRResults', ['score', 'url']) pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy