1
2
3
4
5
6 """This provides code for a general Naive Bayes learner.
7
8 Naive Bayes is a supervised classification algorithm that uses Bayes
9 rule to compute the fit between a new observation and some previously
10 observed data. The observations are discrete feature vectors, with
11 the Bayes assumption that the features are independent. Although this
12 is hardly ever true, the classifier works well enough in practice.
13
14 Glossary:
15 observation A feature vector of discrete data.
16 class A possible classification for an observation.
17
18
19 Classes:
20 NaiveBayes Holds information for a naive Bayes classifier.
21
22 Functions:
23 train Train a new naive Bayes classifier.
24 calculate Calculate the probabilities of each class, given an observation.
25 classify Classify an observation into a class.
26
27 """
28
29 import numpy
30
31 -def _contents(items):
32 term = 1.0/len(items)
33 counts = {}
34 for item in items:
35 counts[item] = counts.get(item,0) + term
36 return counts
37
class NaiveBayes:
    """Holds information for a NaiveBayes classifier.

    Members:
     - classes         List of the possible classes of data.
     - p_conditional   CLASS x DIM array of dicts of value -> P(value|class,dim)
     - p_prior         List of the prior probabilities for every class.
     - dimensionality  Dimensionality of the data.

    """

    def __init__(self):
        """Initialize an empty (untrained) classifier."""
        # Filled in by train(); empty/None until then.
        self.classes = []
        self.p_conditional = None
        self.p_prior = []
        self.dimensionality = None
53
def calculate(nb, observation, scale=0):
    """calculate(nb, observation[, scale]) -> probability dict

    Calculate log P(class|observation) for each class.  nb is a NaiveBayes
    classifier that has been trained.  observation is a list representing
    the observed data.  scale is whether the probability should be
    scaled by P(observation).  By default, no scaling is done.  The return
    value is a dictionary where the keys are the classes and the values are
    the log probabilities of each class.

    Raises ValueError if the observation's dimensionality does not match
    the classifier's.
    """
    # P(class|observation) = P(observation|class) * P(class) / P(observation)
    # Work in log space to avoid underflow with many dimensions.
    if len(observation) != nb.dimensionality:
        raise ValueError("observation in %d dimension, but classifier in %d" \
                         % (len(observation), nb.dimensionality))

    # log P(observation|class) for every class, assuming the features
    # are independent: sum of per-dimension log probabilities.
    n = len(nb.classes)
    lp_observation_class = numpy.zeros(n)
    for i in range(n):
        # Unseen values get probability 0, clipped to a tiny positive
        # number so the log is defined.
        probs = [None] * len(observation)
        for j in range(len(observation)):
            probs[j] = nb.p_conditional[i][j].get(observation[j], 0)
        lprobs = numpy.log(numpy.clip(probs, 1.e-300, 1.e+300))
        lp_observation_class[i] = sum(lprobs)

    # log P(class) for every class.
    lp_prior = numpy.log(nb.p_prior)

    # Optionally normalize by log P(observation) (total probability).
    lp_observation = 0.0
    if scale:
        # Clip the exponent so exp() neither underflows nor overflows.
        obs = numpy.exp(numpy.clip(lp_prior + lp_observation_class, -700, +700))
        lp_observation = numpy.log(sum(obs))

    # Combine into log P(class|observation), keyed by class.
    lp_class_observation = {}
    for i in range(len(nb.classes)):
        lp_class_observation[nb.classes[i]] = \
            lp_observation_class[i] + lp_prior[i] - lp_observation

    return lp_class_observation
102
def classify(nb, observation):
    """classify(nb, observation) -> class

    Classify an observation into a class: return the class with the
    highest (unscaled) log posterior probability.  Ties go to the class
    that appears first in nb.classes.
    """
    probs = calculate(nb, observation, scale=0)
    # max() returns the first maximal element, preserving the original
    # first-class-wins tie-breaking behavior.
    return max(nb.classes, key=lambda klass: probs[klass])
116
def train(training_set, results, priors=None, typecode=None):
    """train(training_set, results[, priors]) -> NaiveBayes

    Train a naive bayes classifier on a training set.  training_set is a
    list of observations.  results is a list of the class assignments
    for each observation.  Thus, training_set and results must be the same
    length.  priors is an optional dictionary specifying the prior
    probabilities for each type of result.  If not specified, the priors
    will be estimated from the training results.  typecode, if given, is
    passed to numpy.asarray when building the per-class observation arrays.

    Raises ValueError if the training set is empty, if training_set and
    results differ in length, or if the observations differ in
    dimensionality.
    """
    if not len(training_set):
        raise ValueError("No data in the training set.")
    if len(training_set) != len(results):
        raise ValueError("training_set and results should be parallel lists.")

    # Every observation must have the same number of features.
    dimensions = [len(x) for x in training_set]
    if min(dimensions) != max(dimensions):
        raise ValueError("observations have different dimensionality")

    nb = NaiveBayes()
    nb.dimensionality = dimensions[0]

    # Determine P(class) for every class: either the caller-supplied
    # priors, or the relative frequencies in the training results.
    if priors is not None:
        percs = priors
        nb.classes = list(set(results))
    else:
        class_freq = _contents(results)
        # list() is required: dict views have no .sort() on Python 3.
        nb.classes = list(class_freq.keys())
        percs = class_freq
    nb.classes.sort()

    nb.p_prior = numpy.zeros(len(nb.classes))
    for i in range(len(nb.classes)):
        nb.p_prior[i] = percs[nb.classes[i]]

    # Group the observations by class so each class's feature
    # distributions can be estimated independently.
    c2i = {}
    for index, key in enumerate(nb.classes):
        c2i[key] = index
    observations = [[] for c in nb.classes]
    for i in range(len(results)):
        klass, obs = results[i], training_set[i]
        observations[c2i[klass]].append(obs)

    # Convert to arrays so we can take per-dimension column slices below.
    for i in range(len(observations)):
        observations[i] = numpy.asarray(observations[i], typecode)

    # Estimate P(value|class,dim): for each class and each dimension,
    # the relative frequency of each observed value.
    nb.p_conditional = []
    for i in range(len(nb.classes)):
        class_observations = observations[i]
        nb.p_conditional.append([None] * nb.dimensionality)
        for j in range(nb.dimensionality):
            # All the values for this class in this dimension.
            values = class_observations[:, j]
            nb.p_conditional[i][j] = _contents(values)
    return nb
196
if __name__ == "__main__":
    # Small demo: classify cars as stolen ('Yes') or not ('No') from
    # color, type, and origin.
    xcar = [
        ['Red', 'Sports', 'Domestic'],
        ['Red', 'Sports', 'Domestic'],
        ['Red', 'Sports', 'Domestic'],
        ['Yellow', 'Sports', 'Domestic'],
        ['Yellow', 'Sports', 'Imported'],
        ['Yellow', 'SUV', 'Imported'],
        ['Yellow', 'SUV', 'Imported'],
        ['Yellow', 'SUV', 'Domestic'],
        ['Red', 'SUV', 'Imported'],
        ['Red', 'Sports', 'Imported'],
    ]

    ycar = [
        'Yes',
        'No',
        'Yes',
        'No',
        'Yes',
        'No',
        'Yes',
        'No',
        'No',
        'Yes',
    ]

    carmodel = train(xcar, ycar)
    # print statements converted to Python 3 function calls.
    carresult = classify(carmodel, ['Red', 'Sports', 'Domestic'])
    print('Is Yes?', carresult)
    carresult = classify(carmodel, ['Red', 'SUV', 'Domestic'])
    print('Is No?', carresult)
231