Commit 03e6861
Move CountingProbDist to learning.py and use it in NaiveBayesLearner. Make NaiveBayesLearner use target-value frequencies too.
1 parent 8e39013

2 files changed: 71 additions, 87 deletions

learning.py (68 additions, 36 deletions)
@@ -142,6 +142,57 @@ def parse_csv(input, delim=','):
 
 #______________________________________________________________________________
 
+class CountingProbDist:
+    """A probability distribution formed by observing and counting examples.
+    If p is an instance of this class and o is an observed value, then
+    there are 3 main operations:
+    p.add(o) increments the count for observation o by 1.
+    p.sample() returns a random element from the distribution.
+    p[o] returns the probability for o (as in a regular ProbDist)."""
+
+    def __init__(self, observations=[], default=0):
+        """Create a distribution, and optionally add in some observations.
+        By default this is an unsmoothed distribution, but saying default=1,
+        for example, gives you add-one smoothing."""
+        update(self, dictionary={}, n_obs=0.0, default=default, sampler=None)
+        for o in observations:
+            self.add(o)
+
+    def add(self, o):
+        "Add an observation o to the distribution."
+        self.smooth_for(o)
+        self.dictionary[o] += 1
+        self.n_obs += 1
+        self.sampler = None
+
+    def smooth_for(self, o):
+        """Include o among the possible observations, whether or not
+        it's been observed yet."""
+        if o not in self.dictionary:
+            self.dictionary[o] = self.default
+            self.n_obs += self.default
+            self.sampler = None
+
+    def __getitem__(self, item):
+        "Return an estimate of the probability of item."
+        self.smooth_for(item)
+        return self.dictionary[item] / self.n_obs
+
+    # (top() and sample() are not used in this module, but elsewhere.)
+
+    def top(self, n):
+        "Return (count, obs) tuples for the n most frequent observations."
+        return heapq.nlargest(n, [(v, k) for (k, v) in self.dictionary.items()])
+
+    def sample(self):
+        "Return a random sample from the distribution."
+        if self.sampler is None:
+            self.sampler = weighted_sampler(self.dictionary.keys(),
+                                            self.dictionary.values())
+        return self.sampler()
+
+#______________________________________________________________________________
+
 def PluralityLearner(dataset):
     """A very dumb algorithm: always pick the result that was most popular
     in the training data. Makes a baseline for comparison."""
@@ -154,48 +205,29 @@ def predict(example):
 #______________________________________________________________________________
 
 def NaiveBayesLearner(dataset):
-    """Just count the target/attr/val occurrences.
-    Count how many times each value of each input attribute occurs.
-    Store count in _N[targetvalue][attr][val]. Let
-    _N[targetvalue][attr][None] be the sum over all vals."""
-
-    _N = {}
-    ## Initialize to 0
-    for gv in dataset.values[dataset.target]:
-        _N[gv] = {}
-        for attr in dataset.inputs:
-            _N[gv][attr] = {}
-            assert None not in dataset.values[attr]
-            for val in dataset.values[attr]:
-                _N[gv][attr][val] = 0
-            _N[gv][attr][None] = 0
-    ## Go thru examples
+    """Just count how many times each value of each input attribute
+    occurs, conditional on the target value. Count the different
+    target values too."""
+
+    targetvals = dataset.values[dataset.target]
+    target_dist = CountingProbDist(targetvals)
+    attr_dists = dict(((gv, attr), CountingProbDist(dataset.values[attr]))
+                      for gv in targetvals
+                      for attr in dataset.inputs)
     for example in dataset.examples:
-        Ngv = _N[example[dataset.target]]
+        targetval = example[dataset.target]
+        target_dist.add(targetval)
         for attr in dataset.inputs:
-            Ngv[attr][example[attr]] += 1
-            Ngv[attr][None] += 1
+            attr_dists[targetval, attr].add(example[attr])
 
     def predict(example):
         """Predict the target value for example. Consider each possible value,
-        choose the most likely, by looking at each attribute independently."""
-        possible_values = dataset.values[dataset.target]
+        and pick the most likely by looking at each attribute independently."""
        def class_probability(targetval):
-            return product(P(targetval, a, example[a]) for a in dataset.inputs)
-        return argmax(possible_values, class_probability)
-
-    def P(targetval, attr, attrval):
-        """Smooth the raw counts to give a probability estimate.
-        Estimate adds 1 to numerator and len(possible vals) to denominator."""
-        return ((N(targetval, attr, attrval) + 1.0) /
-                (N(targetval, attr, None) + len(dataset.values[attr])))
-
-    def N(targetval, attr, attrval):
-        "Return the count in the training data of this combination."
-        try:
-            return _N[targetval][attr][attrval]
-        except KeyError:
-            return 0
+            return (target_dist[targetval]
+                    * product(attr_dists[targetval, attr][example[attr]]
+                              for attr in dataset.inputs))
+        return argmax(targetvals, class_probability)
 
     return predict
 
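The rewritten learner scores each candidate class by the observed frequency of that class times the product of the per-attribute conditional frequencies, and because each CountingProbDist is seeded with the full list of possible values (each counted once), it reproduces the add-one smoothing of the old P() helper. Below is a minimal, standalone sketch of that flow; MiniCountingProbDist and the tiny weather table are invented for illustration and stand in for the repository's CountingProbDist and DataSet objects.

# Illustrative sketch only: a stripped-down CountingProbDist (no utils.update,
# no sampling) plus a hand-rolled Naive Bayes predict(), mirroring the diff.
# The weather examples and attribute indices are made up for this demo.

class MiniCountingProbDist:
    def __init__(self, observations=(), default=0):
        self.dictionary = {}      # observation -> count
        self.n_obs = 0.0          # total count (float, so [] returns a ratio)
        self.default = default    # pseudo-count given to unseen observations
        for o in observations:    # seeding with every possible value once
            self.add(o)           # is what gives the add-one smoothing below

    def add(self, o):
        self.smooth_for(o)
        self.dictionary[o] += 1
        self.n_obs += 1

    def smooth_for(self, o):
        if o not in self.dictionary:
            self.dictionary[o] = self.default
            self.n_obs += self.default

    def __getitem__(self, o):
        self.smooth_for(o)
        return self.dictionary[o] / self.n_obs

# Toy data: (outlook, windy, play?) -- the last field is the target.
examples = [('sunny', 'no',  'yes'),
            ('sunny', 'yes', 'no'),
            ('rain',  'no',  'yes'),
            ('rain',  'yes', 'no'),
            ('sunny', 'no',  'yes')]
inputs, target = [0, 1], 2
targetvals = ['yes', 'no']
values = {attr: sorted({e[attr] for e in examples}) for attr in inputs}

# Counting pass, shaped like the new NaiveBayesLearner body.
target_dist = MiniCountingProbDist(targetvals)
attr_dists = {(gv, attr): MiniCountingProbDist(values[attr])
              for gv in targetvals for attr in inputs}
for e in examples:
    targetval = e[target]
    target_dist.add(targetval)
    for attr in inputs:
        attr_dists[targetval, attr].add(e[attr])

def predict(example):
    "Pick the class maximizing P(class) * prod_i P(attr_i | class)."
    def class_probability(targetval):
        p = target_dist[targetval]
        for attr in inputs:
            p *= attr_dists[targetval, attr][example[attr]]
        return p
    return max(targetvals, key=class_probability)

print(predict(('sunny', 'no')))   # -> 'yes' on this toy data

The class prior target_dist[targetval] entering the product is the "use target-value frequencies too" part of the commit message; the previous version had no prior term, which amounted to assuming a uniform distribution over target values.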

text.py (3 additions, 51 deletions)
@@ -5,57 +5,9 @@
 working on a tiny sample of Unix manual pages."""
 
 from utils import *
+from learning import CountingProbDist
 from math import log, exp
-import heapq, re, search
-
-class CountingProbDist:
-    """A probability distribution formed by observing and counting examples.
-    If p is an instance of this class and o is an observed value, then
-    there are 3 main operations:
-    p.add(o) increments the count for observation o by 1.
-    p.sample() returns a random element from the distribution.
-    p[o] returns the probability for o (as in a regular ProbDist)."""
-
-    def __init__(self, observations=[], default=0):
-        """Create a distribution, and optionally add in some observations.
-        By default this is an unsmoothed distribution, but saying default=1,
-        for example, gives you add-one smoothing."""
-        update(self, dictionary={}, n_obs=0.0, default=default, sampler=None)
-        for o in observations:
-            self.add(o)
-
-    def add(self, o):
-        "Add an observation o to the distribution."
-        self.smooth_for(o)
-        self.dictionary[o] += 1
-        self.n_obs += 1
-        self.sampler = None
-
-    def smooth_for(self, o):
-        """Include o among the possible observations, whether or not
-        it's been observed yet."""
-        if o not in self.dictionary:
-            self.dictionary[o] = self.default
-            self.n_obs += self.default
-            self.sampler = None
-
-    def __getitem__(self, item):
-        "Return an estimate of the probability of item."
-        self.smooth_for(item)
-        return self.dictionary[item] / self.n_obs
-
-    def top(self, n):
-        "Return (count, obs) tuples for the n most frequent observations."
-        return heapq.nlargest(n, [(v, k) for (k, v) in self.dictionary.items()])
-
-    def sample(self):
-        "Return a random sample from the distribution."
-        if self.sampler is None:
-            self.sampler = weighted_sampler(self.dictionary.keys(),
-                                            self.dictionary.values())
-        return self.sampler()
-
-#______________________________________________________________________________
+import re, search
 
 class UnigramTextModel(CountingProbDist):
     """This is a discrete probability distribution over words, so you
@@ -79,7 +31,7 @@ def __init__(self, n, observation_sequence=[]):
         self.cond_prob = DefaultDict(CountingProbDist())
         self.add_sequence(observation_sequence)
 
-    ## sample, __getitem__ inherited from CountingProbDist
+    ## __getitem__, top, sample inherited from CountingProbDist
     ## Note they deal with tuples, not strings, as inputs
 
     def add(self, ngram):
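With the class now living in learning.py, the text models get counting, probability lookup, top-n, and sampling from one shared implementation. A quick illustrative use of the shared class on a made-up word list follows; it assumes learning.py's module-level imports supply heapq and utils.weighted_sampler, which the moved top() and sample() methods rely on.

# Illustrative only: exercises the operations named in the class docstring.
# Assumes learning.py imports heapq and utils.weighted_sampler for top()/sample().
from learning import CountingProbDist

words = "the quick brown fox jumps over the lazy dog".split()
p = CountingProbDist(words)       # unsmoothed: default=0

print(p['the'])                   # 2/9, the relative frequency of 'the'
print(p.top(2))                   # the two most frequent (count, word) pairs
print(p.sample())                 # a random word, weighted by its count

Since UnigramTextModel subclasses CountingProbDist, the same three operations from the docstring apply to word unigrams directly; the n-gram model wraps them in per-context distributions.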
