@@ -74,6 +74,72 @@ def test_text_models():
74
74
assert len (P3 .dictionary ) == 3
75
75
76
76
77
def test_char_models():
    """Check the n-gram keys and counts that NgramCharModel stores.

    Covers n = 1, 2 and 3, each with a single word and with repeated or
    multiple words, by inspecting the model's ``dictionary`` attribute.
    """
    # Unigrams, single word: one 1-tuple key per character of the word.
    test_string = 'unigram'
    P1 = NgramCharModel(1, words(test_string))

    assert len(P1.dictionary) == len(test_string)
    for char in test_string:
        assert tuple(char) in P1.dictionary

    # Unigrams, several one-letter words: one key per word; the separating
    # spaces are expected not to add entries of their own.
    test_string = 'a b c'
    P1 = NgramCharModel(1, words(test_string))
    letters = test_string.split()

    assert len(P1.dictionary) == len(letters)
    for char in letters:
        assert tuple(char) in P1.dictionary

    # Bigram and trigram cases as (n, text, expected n-gram -> count).
    # The expected keys include tuples that begin with ' ' (n - 1 of them
    # before the first letter), and repeating a word k times is expected to
    # multiply every count by k without adding new keys.
    ngram_cases = [
        (2, 'bigram',
         {(' ', 'b'): 1, ('b', 'i'): 1, ('i', 'g'): 1,
          ('g', 'r'): 1, ('r', 'a'): 1, ('a', 'm'): 1}),
        (2, 'bigram bigram',
         {(' ', 'b'): 2, ('b', 'i'): 2, ('i', 'g'): 2,
          ('g', 'r'): 2, ('r', 'a'): 2, ('a', 'm'): 2}),
        (3, 'trigram',
         {(' ', ' ', 't'): 1, (' ', 't', 'r'): 1, ('t', 'r', 'i'): 1,
          ('r', 'i', 'g'): 1, ('i', 'g', 'r'): 1, ('g', 'r', 'a'): 1,
          ('r', 'a', 'm'): 1}),
        (3, 'trigram trigram trigram',
         {(' ', ' ', 't'): 3, (' ', 't', 'r'): 3, ('t', 'r', 'i'): 3,
          ('r', 'i', 'g'): 3, ('i', 'g', 'r'): 3, ('g', 'r', 'a'): 3,
          ('r', 'a', 'm'): 3}),
    ]

    for n, test_string, expected_ngrams in ngram_cases:
        model = NgramCharModel(n, words(test_string))

        # Same size plus per-key membership and count means the model holds
        # exactly the expected n-grams and nothing else.
        assert len(model.dictionary) == len(expected_ngrams)
        for ngram, count in expected_ngrams.items():
            assert ngram in model.dictionary
            assert model.dictionary[ngram] == count
141
+
142
+
77
143
def test_viterbi_segmentation ():
78
144
flatland = DataFile ("EN-text/flatland.txt" ).read ()
79
145
wordseq = words (flatland )
0 commit comments