1 """
2 This is an implementation of a state-emitting MarkovModel. I am using
3 terminology similar to Manning and Schutze.
4
5
6
7 Functions:
8 train_bw Train a markov model using the Baum-Welch algorithm.
9 train_visible Train a visible markov model using MLE.
10 find_states Find the a state sequence that explains some observations.
11
12 load Load a MarkovModel.
13 save Save a MarkovModel.
14
15 Classes:
16 MarkovModel Holds the description of a markov model
17 """

import numpy

# Seed the random number generator from system entropy; Baum-Welch
# training below starts from random parameters.
numpy.random.seed()

VERY_SMALL_NUMBER = 1E-300
LOG0 = numpy.log(VERY_SMALL_NUMBER)

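# itemindex is used by the functions below to map items to their
# positions.  Its definition is not shown here; this is a minimal
# sketch consistent with how it is called (an assumption, not
# necessarily the original implementation).
def itemindex(values):
    """Return a dict mapping each item in values to its index."""
    return {item: i for i, item in enumerate(values)}
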

class MarkovModel:
    """Holds the description of a Markov model."""

    def __init__(self, states, alphabet,
                 p_initial=None, p_transition=None, p_emission=None):
        self.states = states
        self.alphabet = alphabet
        self.p_initial = p_initial
        self.p_transition = p_transition
        self.p_emission = p_emission


56 """load(handle) -> MarkovModel()"""
57
58 line = _readline_and_check_start(handle, "STATES:")
59 states = line.split()[1:]
60
61
62 line = _readline_and_check_start(handle, "ALPHABET:")
63 alphabet = line.split()[1:]
64
65 mm = MarkovModel(states, alphabet)
66 N, M = len(states), len(alphabet)
67
68
69 mm.p_initial = numpy.zeros(N)
70 line = _readline_and_check_start(handle, "INITIAL:")
71 for i in range(len(states)):
72 line = _readline_and_check_start(handle, " %s:" % states[i])
73 mm.p_initial[i] = float(line.split()[-1])
74
75
76 mm.p_transition = numpy.zeros((N, N))
77 line = _readline_and_check_start(handle, "TRANSITION:")
78 for i in range(len(states)):
79 line = _readline_and_check_start(handle, " %s:" % states[i])
80 mm.p_transition[i,:] = map(float, line.split()[1:])
81
82
83 mm.p_emission = numpy.zeros((N, M))
84 line = _readline_and_check_start(handle, "EMISSION:")
85 for i in range(len(states)):
86 line = _readline_and_check_start(handle, " %s:" % states[i])
87 mm.p_emission[i,:] = map(float, line.split()[1:])
88
89 return mm
90
def save(mm, handle):
    """save(mm, handle)"""
    w = handle.write
    w("STATES: %s\n" % ' '.join(mm.states))
    w("ALPHABET: %s\n" % ' '.join(mm.alphabet))
    w("INITIAL:\n")
    for i in range(len(mm.p_initial)):
        w(" %s: %g\n" % (mm.states[i], mm.p_initial[i]))
    w("TRANSITION:\n")
    for i in range(len(mm.p_transition)):
        x = [str(v) for v in mm.p_transition[i]]
        w(" %s: %s\n" % (mm.states[i], ' '.join(x)))
    w("EMISSION:\n")
    for i in range(len(mm.p_emission)):
        x = [str(v) for v in mm.p_emission[i]]
        w(" %s: %s\n" % (mm.states[i], ' '.join(x)))


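# A minimal save/load round trip through an in-memory handle.  The
# two-state weather model here is illustrative, not part of the
# module:
#
#     from io import StringIO
#     mm = MarkovModel(["Sunny", "Rainy"], ["walk", "shop"],
#                      p_initial=numpy.array([0.6, 0.4]),
#                      p_transition=numpy.array([[0.7, 0.3],
#                                                [0.4, 0.6]]),
#                      p_emission=numpy.array([[0.8, 0.2],
#                                              [0.3, 0.7]]))
#     handle = StringIO()
#     save(mm, handle)
#     handle.seek(0)
#     mm2 = load(handle)
#
# The saved text starts with "STATES: Sunny Rainy" and "ALPHABET: walk
# shop", followed by the INITIAL, TRANSITION, and EMISSION sections in
# exactly the format written by save() above.
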
def train_bw(states, alphabet, training_data,
             pseudo_initial=None, pseudo_transition=None,
             pseudo_emission=None, update_fn=None):
    """train_bw(states, alphabet, training_data[, pseudo_initial]
    [, pseudo_transition][, pseudo_emission][, update_fn]) -> MarkovModel

    Train a MarkovModel using the Baum-Welch algorithm.  states is a
    list of strings that describe the names of each state.  alphabet
    is a list of objects that indicate the allowed outputs.
    training_data is a list of observations.  Each observation is a
    list of objects from the alphabet.

    pseudo_initial, pseudo_transition, and pseudo_emission are
    optional parameters that you can use to assign pseudo-counts to
    the different matrices.  They should be matrices of the
    appropriate size that contain numbers to add to each parameter
    matrix, before normalization.

    update_fn is an optional callback that takes parameters
    (iteration, log_likelihood).  It is called once per iteration.

    """
    N, M = len(states), len(alphabet)
    if not training_data:
        raise ValueError("No training data given.")
    if pseudo_initial is not None:
        pseudo_initial = numpy.asarray(pseudo_initial)
        if pseudo_initial.shape != (N,):
            raise ValueError("pseudo_initial must have shape (len(states),)")
    if pseudo_transition is not None:
        pseudo_transition = numpy.asarray(pseudo_transition)
        if pseudo_transition.shape != (N, N):
            raise ValueError("pseudo_transition must have shape "
                             "(len(states), len(states))")
    if pseudo_emission is not None:
        pseudo_emission = numpy.asarray(pseudo_emission)
        if pseudo_emission.shape != (N, M):
            raise ValueError("pseudo_emission must have shape "
                             "(len(states), len(alphabet))")

    # Convert the training data to indexes into the alphabet.
    training_outputs = []
    indexes = itemindex(alphabet)
    for outputs in training_data:
        training_outputs.append([indexes[x] for x in outputs])

    # Do some sanity checking on the outputs.
    if any(len(seq) == 0 for seq in training_outputs):
        raise ValueError("I got training data with outputs of length 0")

    # Do the training with Baum-Welch.
    x = _baum_welch(N, M, training_outputs,
                    pseudo_initial=pseudo_initial,
                    pseudo_transition=pseudo_transition,
                    pseudo_emission=pseudo_emission,
                    update_fn=update_fn)
    p_initial, p_transition, p_emission = x
    return MarkovModel(states, alphabet, p_initial, p_transition, p_emission)

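# A minimal usage sketch for train_bw.  The two-state coin setup is
# illustrative, not part of the module; training starts from random
# parameters, so results vary between runs.
#
#     def print_progress(iteration, log_likelihood):
#         print("%d: log likelihood = %f" % (iteration, log_likelihood))
#
#     observations = [["H", "T", "T", "H"],
#                     ["T", "T", "T", "H"]]
#     mm = train_bw(["fair", "biased"], ["H", "T"], observations,
#                   update_fn=print_progress)
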
MAX_ITERATIONS = 1000


def _baum_welch(N, M, training_outputs,
                p_initial=None, p_transition=None, p_emission=None,
                pseudo_initial=None, pseudo_transition=None,
                pseudo_emission=None, update_fn=None):
    # Start with random parameters unless the caller provided some.
    if p_initial is None:
        p_initial = _random_norm(N)
    else:
        p_initial = _copy_and_check(p_initial, (N,))
    if p_transition is None:
        p_transition = _random_norm((N, N))
    else:
        p_transition = _copy_and_check(p_transition, (N, N))
    if p_emission is None:
        p_emission = _random_norm((N, M))
    else:
        p_emission = _copy_and_check(p_emission, (N, M))

    # Do all the calculations in log space to avoid underflows.
    lp_initial = numpy.log(p_initial)
    lp_transition = numpy.log(p_transition)
    lp_emission = numpy.log(p_emission)
    if pseudo_initial is not None:
        lpseudo_initial = numpy.log(pseudo_initial)
    else:
        lpseudo_initial = None
    if pseudo_transition is not None:
        lpseudo_transition = numpy.log(pseudo_transition)
    else:
        lpseudo_transition = None
    if pseudo_emission is not None:
        lpseudo_emission = numpy.log(pseudo_emission)
    else:
        lpseudo_emission = None

    # Iterate through each sequence of outputs and update the
    # parameters of the Markov model.  Stop when the log likelihood of
    # the training data stops changing.
    prev_llik = None
    for i in range(MAX_ITERATIONS):
        llik = LOG0
        for outputs in training_outputs:
            x = _baum_welch_one(
                N, M, outputs,
                lp_initial, lp_transition, lp_emission,
                lpseudo_initial, lpseudo_transition, lpseudo_emission)
            llik += x
        if update_fn is not None:
            update_fn(i, llik)
        if prev_llik is not None and numpy.fabs(prev_llik - llik) < 0.1:
            break
        prev_llik = llik
    else:
        # The loop ran to completion without breaking, i.e. without
        # converging.
        raise RuntimeError("HMM did not converge in %d iterations"
                           % MAX_ITERATIONS)

    # Return everything back in normal (non-log) space.
    return [numpy.exp(m) for m in (lp_initial, lp_transition, lp_emission)]

def _baum_welch_one(N, M, outputs,
                    lp_initial, lp_transition, lp_emission,
                    lpseudo_initial, lpseudo_transition, lpseudo_emission):
    # Do one iteration of Baum-Welch over a single sequence of
    # outputs, updating the parameter matrices in place.  Returns the
    # log likelihood of the sequence under the current parameters.
    T = len(outputs)
    fmat = _forward(N, T, lp_initial, lp_transition, lp_emission, outputs)
    bmat = _backward(N, T, lp_transition, lp_emission, outputs)

    # Calculate the log probability of traversing each arc i->j at
    # each time t, normalized over all the arcs at that time.
    lp_arc = numpy.zeros((N, N, T))
    for t in range(T):
        k = outputs[t]
        lp_traverse = numpy.zeros((N, N))
        for i in range(N):
            for j in range(N):
                # P(getting to state i at time t) *
                # P(making the transition i->j) *
                # P(emitting character k from i) *
                # P(generating the rest of the outputs from j)
                lp = fmat[i][t] + \
                    lp_transition[i][j] + \
                    lp_emission[i][k] + \
                    bmat[j][t + 1]
                lp_traverse[i][j] = lp
        # Normalize the probabilities for this time step.
        lp_arc[:, :, t] = lp_traverse - _logsum(lp_traverse)

    # Sum of the log probabilities of the arcs out of state i at time t.
    lp_arcout_t = numpy.zeros((N, T))
    for t in range(T):
        for i in range(N):
            lp_arcout_t[i][t] = _logsum(lp_arc[i, :, t])

    # Sum of the log probabilities of all the arcs out of state i.
    lp_arcout = numpy.zeros(N)
    for i in range(N):
        lp_arcout[i] = _logsum(lp_arcout_t[i, :])

    # Update the initial probabilities.  Assign into the existing
    # array (rather than rebinding the local name) so the caller sees
    # the update.
    lp_initial[:] = lp_arcout_t[:, 0]
    if lpseudo_initial is not None:
        lp_initial[:] = _logvecadd(lp_initial, lpseudo_initial)
        lp_initial[:] = lp_initial - _logsum(lp_initial)

    # Update the transition probabilities: the arcs from i to j,
    # normalized by all the arcs out of i.
    for i in range(N):
        for j in range(N):
            lp_transition[i][j] = _logsum(lp_arc[i, j, :]) - lp_arcout[i]
        if lpseudo_transition is not None:
            lp_transition[i] = _logvecadd(lp_transition[i],
                                          lpseudo_transition[i])
            lp_transition[i] = lp_transition[i] - _logsum(lp_transition[i])

    # Update the emission probabilities: the arcs out of i taken while
    # emitting character k, normalized over all the arcs out of i.
    for i in range(N):
        ksum = numpy.zeros(M) + LOG0    # ksum[k] is the log sum for k
        for t in range(T):
            k = outputs[t]
            for j in range(N):
                ksum[k] = _logadd(ksum[k], lp_arc[i, j, t])
        ksum = ksum - _logsum(ksum)     # normalize
        if lpseudo_emission is not None:
            ksum = _logvecadd(ksum, lpseudo_emission[i])
            ksum = ksum - _logsum(ksum)  # renormalize
        lp_emission[i, :] = ksum

    # The log likelihood of the sequence is the log sum of the last
    # column of the forward matrix.
    return _logsum(fmat[:, T])

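# In conventional HMM notation, _baum_welch_one computes the arc
# posterior
#     P(i->j at time t | outputs) ~ alpha_i(t) * a_ij * b_i(k_t) * beta_j(t+1)
# (normalized over all arcs at each t) and then applies the standard
# Baum-Welch re-estimates, e.g. for the transitions:
#     a_ij = sum_t P(i->j at t) / sum_t sum_j' P(i->j' at t)
# The emission probability b_i(k_t) is taken from the state being
# left, since this is a state-emitting model.
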
def _forward(N, T, lp_initial, lp_transition, lp_emission, outputs):
    # Implement the forward algorithm.  Returns an N x T+1 matrix,
    # where matrix[i][t] is the log probability of being in state i
    # after the first t outputs have been generated.  Since this is a
    # state-emitting model, the emission probability is taken from the
    # state being left.
    matrix = numpy.zeros((N, T + 1))

    # Initialize the first column to the initial probabilities.
    matrix[:, 0] = lp_initial
    for t in range(1, T + 1):
        k = outputs[t - 1]
        for j in range(N):
            # Sum over all states i at time t-1 of: being in i,
            # emitting character k from i, and moving to j.
            lprob = LOG0
            for i in range(N):
                lp = matrix[i][t - 1] + \
                    lp_transition[i][j] + \
                    lp_emission[i][k]
                lprob = _logadd(lprob, lp)
            matrix[j][t] = lprob
    return matrix

def _backward(N, T, lp_transition, lp_emission, outputs):
    # Implement the backward algorithm.  matrix[i][t] is the log
    # probability of generating outputs t..T-1 given that the model is
    # in state i at time t.  The last column is log(1) = 0.
    matrix = numpy.zeros((N, T + 1))
    for t in range(T - 1, -1, -1):
        k = outputs[t]
        for i in range(N):
            # Sum over all states j at time t+1 of: emitting character
            # k from i, moving to j, and generating the rest from j.
            lprob = LOG0
            for j in range(N):
                lp = matrix[j][t + 1] + \
                    lp_transition[i][j] + \
                    lp_emission[i][k]
                lprob = _logadd(lprob, lp)
            matrix[i][t] = lprob
    return matrix

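# A handy sanity check relating the two matrices: the log sum of the
# forward matrix's last column is the total log probability of the
# outputs, and the same value should come (up to floating-point
# error) from combining the initial probabilities with the backward
# matrix's first column:
#
#     _logsum(fmat[:, T]) ~= _logsum(lp_initial + bmat[:, 0])
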
def train_visible(states, alphabet, training_data,
                  pseudo_initial=None, pseudo_transition=None,
                  pseudo_emission=None):
    """train_visible(states, alphabet, training_data[, pseudo_initial]
    [, pseudo_transition][, pseudo_emission]) -> MarkovModel

    Train a visible MarkovModel using maximum likelihood estimates
    for each of the parameters.  states is a list of strings that
    describe the names of each state.  alphabet is a list of objects
    that indicate the allowed outputs.  training_data is a list of
    (outputs, observed states) pairs, where outputs is a list of
    emissions from the alphabet, and observed states is a list of
    states from states.

    pseudo_initial, pseudo_transition, and pseudo_emission are
    optional parameters that you can use to assign pseudo-counts to
    the different matrices.  They should be matrices of the
    appropriate size that contain numbers to add to each parameter
    matrix.

    """
    N, M = len(states), len(alphabet)
    if pseudo_initial is not None:
        pseudo_initial = numpy.asarray(pseudo_initial)
        if pseudo_initial.shape != (N,):
            raise ValueError("pseudo_initial must have shape (len(states),)")
    if pseudo_transition is not None:
        pseudo_transition = numpy.asarray(pseudo_transition)
        if pseudo_transition.shape != (N, N):
            raise ValueError("pseudo_transition must have shape "
                             "(len(states), len(states))")
    if pseudo_emission is not None:
        pseudo_emission = numpy.asarray(pseudo_emission)
        if pseudo_emission.shape != (N, M):
            raise ValueError("pseudo_emission must have shape "
                             "(len(states), len(alphabet))")

    # Convert the training data to indexes into the states and the
    # alphabet.
    training_states, training_outputs = [], []
    states_indexes = itemindex(states)
    outputs_indexes = itemindex(alphabet)
    for toutputs, tstates in training_data:
        if len(tstates) != len(toutputs):
            raise ValueError("states and outputs not aligned")
        training_states.append([states_indexes[x] for x in tstates])
        training_outputs.append([outputs_indexes[x] for x in toutputs])

    x = _mle(N, M, training_outputs, training_states,
             pseudo_initial, pseudo_transition, pseudo_emission)
    p_initial, p_transition, p_emission = x

    return MarkovModel(states, alphabet, p_initial, p_transition, p_emission)

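# A minimal usage sketch for train_visible; the data here are
# illustrative (each training example pairs an output sequence with
# the state sequence that generated it):
#
#     training_data = [
#         (["walk", "shop", "walk"], ["Sunny", "Rainy", "Sunny"]),
#         (["shop", "shop"], ["Rainy", "Rainy"]),
#     ]
#     mm = train_visible(["Sunny", "Rainy"], ["walk", "shop"],
#                        training_data)
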
def _mle(N, M, training_outputs, training_states, pseudo_initial,
         pseudo_transition, pseudo_emission):
    # p_initial is the probability that a sequence starts in each
    # state: count the states that begin a sequence and normalize.
    p_initial = numpy.zeros(N)
    if pseudo_initial is not None:
        p_initial = p_initial + pseudo_initial
    for states in training_states:
        p_initial[states[0]] += 1
    p_initial = _normalize(p_initial)

    # p_transition is the probability of moving from one state to the
    # next: count the transitions from i to j and normalize each row.
    p_transition = numpy.zeros((N, N))
    if pseudo_transition is not None:
        p_transition = p_transition + pseudo_transition
    for states in training_states:
        for n in range(len(states) - 1):
            i, j = states[n], states[n + 1]
            p_transition[i, j] += 1
    for i in range(len(p_transition)):
        p_transition[i, :] = p_transition[i, :] / sum(p_transition[i, :])

    # p_emission is the probability of an output given a state.  Start
    # from ones (Laplace smoothing, so that no row is all zero), add
    # any pseudo-counts on top, then count the observed emissions and
    # normalize each row.
    p_emission = numpy.ones((N, M))
    if pseudo_emission is not None:
        p_emission = p_emission + pseudo_emission
    for outputs, states in zip(training_outputs, training_states):
        for o, s in zip(outputs, states):
            p_emission[s, o] += 1
    for i in range(len(p_emission)):
        p_emission[i, :] = p_emission[i, :] / sum(p_emission[i, :])

    return p_initial, p_transition, p_emission

def _argmaxes(vector):
    # Return a list of the indexes of the maxima of a vector.  (The
    # def line is reconstructed from the calls in _viterbi below.)
    return [numpy.argmax(vector)]


def find_states(markov_model, output):
    """find_states(markov_model, output) -> list of (states, score)"""
    mm = markov_model
    N = len(mm.states)

    # _viterbi does calculations in log space.  Add a tiny amount to
    # each probability so that log(0) is defined.
    x = mm.p_initial + VERY_SMALL_NUMBER
    y = mm.p_transition + VERY_SMALL_NUMBER
    z = mm.p_emission + VERY_SMALL_NUMBER
    lp_initial, lp_transition, lp_emission = map(numpy.log, (x, y, z))

    # Change the output into a list of indexes into the alphabet.
    indexes = itemindex(mm.alphabet)
    output = [indexes[x] for x in output]

    # Run the Viterbi algorithm.
    results = _viterbi(N, lp_initial, lp_transition, lp_emission, output)

    # Convert the state indexes back to state names, and the scores
    # from log space back to probabilities.
    for i in range(len(results)):
        states, score = results[i]
        results[i] = [mm.states[x] for x in states], numpy.exp(score)
    return results

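# A minimal usage sketch (continuing the illustrative weather model
# from the save/load example above):
#
#     for states, score in find_states(mm, ["walk", "shop", "walk"]):
#         print(states, score)
#
# Each result pairs a list of state names with the probability of that
# path; several paths are returned when they tie for the best score.
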
def _viterbi(N, lp_initial, lp_transition, lp_emission, output):
    # The Viterbi algorithm finds the most likely state path(s) for a
    # sequence of outputs.  Returns a list of (state indexes, log
    # score), one entry per equally likely best path.
    T = len(output)

    # backtrace[j][t] holds the indexes of the most likely states at
    # time t-1, given that the model is in state j at time t.
    backtrace = []
    for i in range(N):
        backtrace.append([None] * T)

    # Store the best scores.
    scores = numpy.zeros((N, T))
    scores[:, 0] = lp_initial + lp_emission[:, output[0]]
    for t in range(1, T):
        k = output[t]
        for j in range(N):
            # Find the most likely place it came from.
            i_scores = scores[:, t - 1] + \
                lp_transition[:, j] + \
                lp_emission[j, k]
            indexes = _argmaxes(i_scores)
            scores[j, t] = i_scores[indexes[0]]
            backtrace[j][t] = indexes

    # Do the backtrace.  First find the most likely state(s) at the
    # last time point, then walk the backtrace matrix backwards,
    # branching whenever several predecessors tie for the best score.
    in_process = []    # list of (t, states, score)
    results = []       # return values: (states, score)
    indexes = _argmaxes(scores[:, T - 1])   # pick the top score(s)
    for i in indexes:
        in_process.append((T - 1, [i], scores[i][T - 1]))
    while in_process:
        t, states, score = in_process.pop()
        if t == 0:
            results.append((states, score))
        else:
            indexes = backtrace[states[0]][t]
            for i in indexes:
                in_process.append((t - 1, [i] + states, score))
    return results

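# The private helpers _readline_and_check_start, _normalize, and
# _random_norm are used above but not shown here; these are minimal
# sketches consistent with those call sites (assumptions, not
# necessarily the original definitions).

def _readline_and_check_start(handle, start):
    # Read one line and make sure it begins with the expected tag.
    line = handle.readline()
    if not line.startswith(start):
        raise ValueError("I expected %r but got %r" % (start, line))
    return line


def _normalize(matrix):
    # Normalize a vector, or the rows of a matrix, to sum to 1.0.
    if len(matrix.shape) == 1:
        matrix = matrix / sum(matrix)
    else:
        for i in range(len(matrix)):
            matrix[i, :] = matrix[i, :] / sum(matrix[i, :])
    return matrix


def _random_norm(shape):
    # Return a randomly generated, normalized matrix.
    matrix = numpy.random.random(shape)
    return _normalize(matrix)

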
def _copy_and_check(matrix, desired_shape):
    # Copy the matrix, then check its dimensions and normalization.
    matrix = numpy.array(matrix, copy=True)
    # Check the dimensions.
    if matrix.shape != desired_shape:
        raise ValueError("Incorrect dimension")
    # Make sure it's normalized.
    if len(matrix.shape) == 1:
        if numpy.fabs(sum(matrix) - 1.0) > 0.01:
            raise ValueError("matrix not normalized to 1.0")
    elif len(matrix.shape) == 2:
        for i in range(len(matrix)):
            if numpy.fabs(sum(matrix[i]) - 1.0) > 0.01:
                raise ValueError("matrix %d not normalized to 1.0" % i)
    else:
        raise ValueError("I don't handle matrices > 2 dimensions")
    return matrix


def _logadd(logx, logy):
    # Return log(x + y) given log(x) and log(y), guarding against
    # underflow by factoring out the smaller term:
    #     log(x + y) = m + log(exp(logx - m) + exp(logy - m))
    # where m = min(logx, logy).
    if logy - logx > 100:
        return logy
    elif logx - logy > 100:
        return logx
    minxy = min(logx, logy)
    return minxy + numpy.log(numpy.exp(logx - minxy) + numpy.exp(logy - minxy))

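# _logsum is used throughout the module but its definition is not
# shown here; this is a standard log-sum-exp sketch consistent with
# how it is called on both vectors and matrices (an assumption, not
# necessarily the original definition).  For example, _logsum applied
# to [log(0.5), log(0.25)] gives log(0.75).
def _logsum(matrix):
    if len(matrix.shape) > 1:
        vec = numpy.reshape(matrix, (-1,))
    else:
        vec = matrix
    total = LOG0
    for num in vec:
        total = _logadd(total, num)
    return total
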

def _logvecadd(logvec1, logvec2):
    # Elementwise _logadd of two equal-length vectors of log values.
    assert len(logvec1) == len(logvec2), "vectors aren't the same length"
    sumvec = numpy.zeros(len(logvec1))
    for i in range(len(logvec1)):
        sumvec[i] = _logadd(logvec1[i], logvec2[i])
    return sumvec


# If a compiled C implementation of these functions is available, use
# it in place of the pure-Python versions defined above.
try:
    import cMarkovModel
except ImportError:
    pass
else:
    import sys
    this_module = sys.modules[__name__]
    for name in cMarkovModel.__dict__.keys():
        if not name.startswith("__"):
            this_module.__dict__[name] = cMarkovModel.__dict__[name]