From 69e8b6f53ef6ff0bd17fc13edd613dae48d681e7 Mon Sep 17 00:00:00 2001 From: Antonis Maronikolakis Date: Sat, 11 Mar 2017 13:10:41 +0200 Subject: [PATCH 1/2] Rearranged Tests - test_ngram_models to the top - added test_viterbi-segmentation - removed test_unigram_text_model --- tests/test_text.py | 90 +++++++++++++++++++++++----------------------- 1 file changed, 45 insertions(+), 45 deletions(-) diff --git a/tests/test_text.py b/tests/test_text.py index 0cd3e675c..2391820e3 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -6,13 +6,55 @@ from utils import isclose, DataFile -def test_unigram_text_model(): +def test_ngram_models(): flatland = DataFile("EN-text/flatland.txt").read() wordseq = words(flatland) - P = UnigramTextModel(wordseq) + P1 = UnigramTextModel(wordseq) + P2 = NgramTextModel(2, wordseq) + P3 = NgramTextModel(3, wordseq) + + # The most frequent entries in each model + assert P1.top(10) == [(2081, 'the'), (1479, 'of'), (1021, 'and'), + (1008, 'to'), (850, 'a'), (722, 'i'), (640, 'in'), + (478, 'that'), (399, 'is'), (348, 'you')] + + assert P2.top(10) == [(368, ('of', 'the')), (152, ('to', 'the')), + (152, ('in', 'the')), (86, ('of', 'a')), + (80, ('it', 'is')), + (71, ('by', 'the')), (68, ('for', 'the')), + (68, ('and', 'the')), (62, ('on', 'the')), + (60, ('to', 'be'))] - s, p = viterbi_segment('itiseasytoreadwordswithoutspaces', P) + assert P3.top(10) == [(30, ('a', 'straight', 'line')), + (19, ('of', 'three', 'dimensions')), + (16, ('the', 'sense', 'of')), + (13, ('by', 'the', 'sense')), + (13, ('as', 'well', 'as')), + (12, ('of', 'the', 'circles')), + (12, ('of', 'sight', 'recognition')), + (11, ('the', 'number', 'of')), + (11, ('that', 'i', 'had')), (11, ('so', 'as', 'to'))] + + assert isclose(P1['the'], 0.0611, rel_tol=0.001) + + assert isclose(P2['of', 'the'], 0.0108, rel_tol=0.01) + + assert isclose(P3['', '', 'but'], 0.0, rel_tol=0.001) + assert isclose(P3['', '', 'but'], 0.0, rel_tol=0.001) + assert isclose(P3['so', 'as', 'to'], 0.000323, rel_tol=0.001) + + assert P2.cond_prob.get(('went',)) is None + + assert P3.cond_prob['in', 'order'].dictionary == {'to': 6} + + +def test_viterbi_segmentation(): + flatland = DataFile("EN-text/flatland.txt").read() + wordseq = words(flatland) + P = UnigramTextModel(wordseq) + text = "itiseasytoreadwordswithoutspaces" + s, p = viterbi_segment(text,P) assert s == [ 'it', 'is', 'easy', 'to', 'read', 'words', 'without', 'spaces'] @@ -56,48 +98,6 @@ def test_counting_probability_distribution(): assert 1 / 7 <= min(ps) <= max(ps) <= 1 / 5 -def test_ngram_models(): - flatland = DataFile("EN-text/flatland.txt").read() - wordseq = words(flatland) - P1 = UnigramTextModel(wordseq) - P2 = NgramTextModel(2, wordseq) - P3 = NgramTextModel(3, wordseq) - - # The most frequent entries in each model - assert P1.top(10) == [(2081, 'the'), (1479, 'of'), (1021, 'and'), - (1008, 'to'), (850, 'a'), (722, 'i'), (640, 'in'), - (478, 'that'), (399, 'is'), (348, 'you')] - - assert P2.top(10) == [(368, ('of', 'the')), (152, ('to', 'the')), - (152, ('in', 'the')), (86, ('of', 'a')), - (80, ('it', 'is')), - (71, ('by', 'the')), (68, ('for', 'the')), - (68, ('and', 'the')), (62, ('on', 'the')), - (60, ('to', 'be'))] - - assert P3.top(10) == [(30, ('a', 'straight', 'line')), - (19, ('of', 'three', 'dimensions')), - (16, ('the', 'sense', 'of')), - (13, ('by', 'the', 'sense')), - (13, ('as', 'well', 'as')), - (12, ('of', 'the', 'circles')), - (12, ('of', 'sight', 'recognition')), - (11, ('the', 'number', 'of')), - (11, ('that', 'i', 'had')), (11, ('so', 'as', 'to'))] - - assert isclose(P1['the'], 0.0611, rel_tol=0.001) - - assert isclose(P2['of', 'the'], 0.0108, rel_tol=0.01) - - assert isclose(P3['', '', 'but'], 0.0, rel_tol=0.001) - assert isclose(P3['', '', 'but'], 0.0, rel_tol=0.001) - assert isclose(P3['so', 'as', 'to'], 0.000323, rel_tol=0.001) - - assert P2.cond_prob.get(('went',)) is None - - assert P3.cond_prob['in', 'order'].dictionary == {'to': 6} - - def test_ir_system(): from collections import namedtuple Results = namedtuple('IRResults', ['score', 'url']) From ebf6fb537b9ebe16dd6d322abf018ee512051677 Mon Sep 17 00:00:00 2001 From: Antonis Maronikolakis Date: Sat, 11 Mar 2017 13:39:44 +0200 Subject: [PATCH 2/2] "test_ngram_models" to "test_text_models" --- tests/test_text.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_text.py b/tests/test_text.py index 2391820e3..d58cd497a 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -6,7 +6,7 @@ from utils import isclose, DataFile -def test_ngram_models(): +def test_text_models(): flatland = DataFile("EN-text/flatland.txt").read() wordseq = words(flatland) P1 = UnigramTextModel(wordseq) pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy