Package Bio :: Package MEME :: Module Parser
[hide private]
[frames] | [no frames]

Source Code for Module Bio.MEME.Parser

  1  # Copyright 2004 by Jason A. Hackney.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  from Bio.Alphabet import IUPAC 
  7  from Bio import File 
  8  from Bio.ParserSupport import * 
  9  from Bio import Seq 
 10  from Bio.MEME import Motif 
 11  import re 
 12   
class MEMERecord:
    """Plain container for the results of one MEME run (OBSOLETE).

    The record only stores what the parser found; apart from a lookup
    helper it has no behaviour of its own.

    This class is OBSOLETE; its functionality is now available through
    Bio.Motif.Parsers.MEME.
    """

    def __init__(self):
        """Create an empty record with every field at its blank default."""
        self.motifs = []
        self.version = ""
        self.datafile = ""
        self.command = ""
        self.alphabet = None
        self.sequence_names = []

    def get_motif_by_name(self, name):
        """Return the first stored motif whose ``name`` equals name.

        Returns None when no stored motif matches.
        """
        for candidate in self.motifs:
            if candidate.name != name:
                continue
            return candidate
        return None
35
class MEMEParser(AbstractParser):
    """Parser turning MEME text output into a MEMERecord (OBSOLETE).

    Methods:
    parse (handle): parses the contents of the file handle passed to it.

    Example:

    f = open("meme.output.txt")
    parser = MEMEParser()
    meme_record = parser.parse(f)
    for motif in meme_record.motifs:
        for instance in motif.instances:
            print instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue

    This class is OBSOLETE; its functionality is now available through
    Bio.Motif.Parsers.MEME.
    """

    def __init__(self):
        """Pair a fresh scanner with a fresh consumer."""
        self._scanner = _MEMEScanner()
        self._consumer = _MEMEConsumer()

    def parse(self, handle):
        """Feed handle through the scanner and return the consumer's record."""
        consumer = self._consumer
        self._scanner.feed(handle, consumer)
        return consumer.data
64 65 66
class _MEMEScanner:
    """Scanner that walks MEME text output line by line (OBSOLETE).

    Methods:
    feed

    This class is OBSOLETE; its functionality is now available through
    Bio.Motif.Parsers.MEME.
    """

    def feed(self, handle, consumer):
        """Scan MEME output from handle and report events to consumer.

        handle should implement the readline method; it is wrapped in a
        File.UndoHandle unless it already is one.  consumer is a Consumer
        object that can receive the salient events.
        """
        if not isinstance(handle, File.UndoHandle):
            handle = File.UndoHandle(handle)
        self._scan_header(handle, consumer)
        self._scan_motifs(handle, consumer)

    def _scan_header(self, uhandle, consumer):
        """Consume everything up to (not including) the first motif block.

        Raises ValueError when no 'MEME version' line can be found.
        """
        skip = consumer.noevent
        try:
            read_and_call_until(uhandle, skip, contains='MEME version')
        except ValueError:
            raise ValueError("Improper input file. File should contain a line starting MEME version.")
        # NOTE(review): these literals were recovered from a listing that
        # collapses runs of whitespace; confirm the spacing of 'MOTIF 1'
        # against real MEME output.
        steps = (
            (read_and_call, consumer._version, {'start': 'MEME version'}),
            (read_and_call_until, skip, {'start': 'TRAINING SET'}),
            (read_and_call, skip, {'start': 'TRAINING SET'}),
            (read_and_call, skip, {'start': '****'}),
            (read_and_call, consumer._datafile, {'start': 'DATAFILE'}),
            (read_and_call, consumer._alphabet, {'start': 'ALPHABET'}),
            (read_and_call, skip, {'start': 'Sequence name'}),
            (read_and_call, skip, {'start': '----'}),
            (read_and_call_until, consumer._sequence_name, {'start': '***'}),
            (read_and_call_until, skip, {'start': 'command:'}),
            (read_and_call, consumer._commandline, {'start': 'command:'}),
            (read_and_call_until, skip, {'start': 'MOTIF 1'}),
        )
        for reader, callback, criteria in steps:
            reader(uhandle, callback, **criteria)

    def _scan_motifs(self, uhandle, consumer):
        """Consume motif blocks until the 'SUMMARY OF MOTIFS' line is next."""
        skip = consumer.noevent
        marker = 'sorted by position p-value'
        per_motif = (
            (read_and_call, consumer._add_motif_with_info, {'start': 'MOTIF'}),
            (read_and_call_until, skip, {'contains': marker}),
            (read_and_call, consumer.motif_name, {'contains': marker}),
            (read_and_call, skip, {'start': '---'}),
            (read_and_call, skip, {'start': 'Sequence name'}),
            (read_and_call, skip, {'start': '---'}),
            (read_and_call_until, consumer.add_instance, {'start': '---'}),
            (read_and_call_until, skip, {'start': 'log-odds matrix'}),
            (read_and_call, skip, {}),
            (read_and_call_until, consumer.add_to_logodds, {'start': '---'}),
            (read_and_call_until, skip, {'start': 'letter-probability matrix'}),
            (read_and_call, skip, {'start': 'letter-probability matrix'}),
            (read_and_call_until, consumer.add_to_pssm, {'start': '---'}),
            (read_and_call_until, skip, {'start': 'Time'}),
            (read_and_call, skip, {'start': 'Time'}),
            (read_and_call, skip, {'blank': 1}),
            (read_and_call, skip, {'start': '***'}),
            (read_and_call_while, skip, {'blank': 1}),
            (read_and_call, skip, {'start': '***'}),
        )
        while True:
            for reader, callback, criteria in per_motif:
                reader(uhandle, callback, **criteria)
            # Peek without consuming: the summary section ends the motif list.
            if safe_peekline(uhandle).startswith("SUMMARY OF MOTIFS"):
                break
133 134 135
class _MEMEConsumer:
    """Consumer that receives events from the MEME scanner (OBSOLETE).

    An instance of this class is what should be handed to the MEME
    scanner; the parsed result accumulates in its ``data`` attribute.

    This class is OBSOLETE; its functionality is now available through
    Bio.Motif.Parsers.MEME.
    """

    def __init__(self):
        """Start with an empty MEMERecord and no motif in progress."""
        self.current_motif = None
        self.sequence_names = []
        self.data = MEMERecord()

    def _version(self, line):
        """Record the third whitespace-separated field as the version."""
        self.data.version = line.strip().split()[2]

    def _datafile(self, line):
        """Record the line, minus its 'DATAFILE= ' label, as the data file."""
        self.data.datafile = line.strip().replace('DATAFILE= ', '')

    def _alphabet(self, line):
        """Choose the alphabet: unambiguous DNA for 'ACGT', else protein."""
        letters = line.strip().replace('ALPHABET= ', '')
        if letters != 'ACGT':
            chosen = IUPAC.protein
        else:
            chosen = IUPAC.unambiguous_dna
        self.data.alphabet = chosen

    def _sequence_name(self, line):
        """Collect the sequence name(s) listed on one training-set row.

        The first field is always taken; on a six-field row the fourth
        field is taken as a second name.
        """
        fields = line.strip().split()
        names = self.data.sequence_names
        names.append(fields[0])
        if len(fields) == 6:
            names.append(fields[3])

    def _commandline(self, line):
        """Record the line, minus its 'command: ' label, as the command."""
        self.data.command = line.strip().replace('command: ', '')

    def _add_motif_with_info(self, line):
        """Open a new motif from a 'MOTIF' line and make it current.

        Fields 4, 7 and 13 of the line are passed to the motif's length,
        number-of-occurrences and e-value setters respectively.
        """
        fields = line.strip().split()
        fresh = Motif.MEMEMotif()
        fresh._length(fields[4])
        fresh._numoccurrences(fields[7])
        fresh._evalue(fields[13])
        fresh._alphabet(self.data.alphabet)
        self.data.motifs.append(fresh)
        self.current_motif = fresh

    def motif_name(self, line):
        """Name the current motif after the first two words of the line."""
        words = line.strip().split()
        self.current_motif._name(' '.join(words[:2]))

    def add_instance(self, line):
        """Add one site row to the current motif.

        When the recorded command contains 'revcomp' the row carries an
        extra strand column, which shifts the remaining columns by one.
        """
        fields = line.strip().split()
        if 'revcomp' not in self.data.command:
            site = Seq.Seq(fields[4], self.data.alphabet)
            self.current_motif.add_instance_from_values(name = fields[0], sequence = site, start = fields[1], pvalue = fields[2])
            return
        site = Seq.Seq(fields[5], self.data.alphabet)
        self.current_motif.add_instance_from_values(name = fields[0], sequence = site, start = fields[2], pvalue = fields[3], strand = fields[1])

    def add_to_pssm(self, line):
        """Append one row of floats to the current motif's pssm."""
        row = tuple(map(float, line.strip().split()))
        self.current_motif.add_to_pssm(row)

    def add_to_logodds(self, line):
        """Append one row of floats to the current motif's log-odds matrix."""
        row = tuple(map(float, line.strip().split()))
        self.current_motif.add_to_logodds(row)

    def noevent(self, line):
        """Ignore the line."""
        pass
224 225 226
class _MASTConsumer:
    """
    Consumer that can receive events from _MASTScanner (OBSOLETE).

    A _MASTConsumer parses lines from a mast text output file.
    The motif match diagrams are parsed using line buffering.
    Each of the buffering functions have a dummy variable that is
    required for testing using the Bio.ParserSupport.TaggingConsumer.
    If this variable isn't there, the TaggingConsumer barfs. In
    the _MASTScanner, None is passed in the place of this variable.

    This class is OBSOLETE; its functionality is now available through
    Bio.Motif.Parsers.MAST.
    """

    # First run of digits inside a diagram element; used as the motif name.
    # Compiled once from a raw string instead of once per element.
    _MOTIF_NUMBER = re.compile(r'\d+')

    def __init__(self):
        """Start with an empty MASTRecord and an empty line buffer."""
        self.data = MASTRecord()
        self._current_seq = ""
        self._line_buffer = []
        self._buffer_size = 0
        self._buffered_seq_start = 0

    def _version(self, line):
        """Record the third whitespace-separated field as the version."""
        ls = line.strip().split()
        self.data._version(ls[2])

    def _database(self, line):
        """Record the database name and pick the alphabet.

        The second field is the database; a third field of
        '(nucleotide)' selects unambiguous DNA, anything else protein.
        """
        ls = line.strip().split()
        self.data._database(ls[1])
        if ls[2] == '(nucleotide)':
            al = IUPAC.unambiguous_dna
        else:
            al = IUPAC.protein
        self.data._alphabet(al)

    def _add_motif(self, line):
        """Create a motif from a row of name, length and consensus fields."""
        ls = line.strip().split()
        m = Motif.MEMEMotif()
        m._alphabet(self.data.alphabet)
        m._length(ls[1])
        m._name(ls[0])
        m._consensus(ls[2])
        self.data._add_motif(m)

    def _add_instances_from_diagram(self, diagram, seqname, markers):
        """Walk a '_'-separated match diagram and register motif instances.

        diagram is the diagram text, seqname the sequence it belongs to and
        markers the characters whose presence flags an element as a motif
        hit.  Any other element must be an integer and is added to the
        running start position.  A hit containing '-' is recorded on the
        '-' strand, otherwise on '+', and advances the position by the
        length of the motif it names.
        """
        start = 0
        for element in diagram.split('_'):
            is_hit = False
            for marker in markers:
                if marker in element:
                    is_hit = True
                    break
            if not is_hit:
                start += int(element)
                continue
            inst = Motif.Instance()
            inst._seqname(seqname)
            inst._start(start)
            mn = self._MOTIF_NUMBER.findall(element)[0]
            if '-' in element:
                inst.strand = '-'
            else:
                inst.strand = '+'
            motif = self.data.get_motif_by_name(mn)
            motif.add_instance(inst)
            start += motif.length

    def _add_match_diagram(self, line):
        """Store and parse the diagram in the second field of line.

        The diagram is attached to the current sequence; elements
        containing '[' or '<' are treated as motif hits.
        """
        ls = line.strip().split()
        self.data._add_diagram_for_sequence(ls[1], self._current_seq)
        self._add_instances_from_diagram(ls[1], self._current_seq, ('[', '<'))

    def _add_sequence_match_with_diagram(self, line):
        """Register a sequence (first field) and its diagram (third field).

        Here elements containing '+' or '-' are treated as motif hits.
        """
        ls = line.strip().split()
        self.data._add_sequence(ls[0])
        self.data._add_diagram_for_sequence(ls[2], ls[0])
        self._add_instances_from_diagram(ls[2], ls[0], ('+', '-'))

    def _add_diagram_from_buffer(self, dummy):
        """Join the buffered lines and parse the diagram they spell out.

        dummy is ignored (see the class docstring).
        """
        line = "".join([l.strip() for l in self._line_buffer])
        ls = line.split()
        self.data._add_diagram_for_sequence(ls[1], self._current_seq)
        self._add_instances_from_diagram(ls[1], self._current_seq, ('[', '<'))

    def _set_current_seq(self, line):
        """Make the stripped line the current sequence name.

        The name is also added to the record's sequence list if new.
        """
        line = line.strip()
        self._current_seq = line
        if line not in self.data.sequences:
            self.data.sequences.append(line)

    def _add_line_to_buffer(self, line):
        """Buffer the stripped line; return -1 (unbuffered) for '*****' rules."""
        line = line.strip()
        if not line.startswith('*****'):
            self._line_buffer.append(line)
        else:
            return -1

    def _split_fused_pvalues(self, pvals):
        """Return pvals with run-together p-values split into single tokens.

        A token with more than one 'e' is taken to be several p-values
        printed without a separating space.  It is broken up by 'e' and
        reassembled into valid float strings; per the original author this
        only works if there are no e-values greater than 1e-5.
        """
        pvals = list(pvals)
        p = 0
        while p < len(pvals):
            if pvals[p].count('e') > 1:
                pvs = []
                # Work from the last exponent backwards.
                spe = pvals[p].split('e')
                spe.reverse()
                dotind = spe[1].find('.')
                if dotind == -1:
                    thispval = spe[1][-1] + 'e' + spe[0]
                else:
                    thispval = spe[1][dotind-1:] + 'e' + spe[0]
                pvs.append(thispval)
                for spi in range(2, len(spe)):
                    dotind = spe[spi].find('.')
                    prevdotind = spe[spi-1].find('.')
                    # Mantissa: from one char before the dot, or the last
                    # char when there is no dot.
                    if dotind != -1:
                        mantissa = spe[spi][dotind-1:]
                    else:
                        mantissa = spe[spi][-1]
                    # Exponent: the previous piece minus the mantissa of
                    # the value that follows it.
                    if prevdotind == -1:
                        exponent = spe[spi-1][:-1]
                    else:
                        exponent = spe[spi-1][0:prevdotind-1]
                    pvs.append(mantissa + 'e' + exponent)
                pvs.reverse()
                pvals[p:p+1] = pvs
            p += 1
        return pvals

    def _parse_buffer(self, dummy):
        """Parses the line buffer to get e-values for each instance of a motif.
        This buffer parser is the most likely point of failure for the
        MASTParser.

        Each instance recorded for the current sequence gets its sequence
        sliced out of the last buffered row and, when the number of parsed
        p-values matches the number of instances, its p-value.  On a
        mismatch a message is written to stderr and no p-values are set.
        dummy is ignored (see the class docstring).
        """
        # Imported locally: the module only sees sys through a wildcard import.
        import sys
        insts = self.data.get_motif_matches_for_sequence(self._current_seq)
        if not insts:
            return
        fullSeq = self._line_buffer[self._buffer_size-1]
        pvals = self._split_fused_pvalues(self._line_buffer[1].split())
        if len(pvals) != len(insts):
            sys.stderr.write("Failure to parse p-values for " + self._current_seq + ": " + self._line_buffer[1] + " to: " + str(pvals) + "\n")
            pvals = []
        for i, inst in enumerate(insts):
            start = inst.start - self._buffered_seq_start + 1
            thisSeq = Seq.Seq(fullSeq[start:start+inst.length], self.data.alphabet)
            inst._sequence(thisSeq)
            if pvals:
                inst._pvalue(float(pvals[i]))

    def _blank_buffer(self, dummy):
        """Empty the line buffer and reset its recorded size."""
        self._line_buffer = []
        self._buffer_size = 0

    def _collapse_buffer(self, dummy):
        """Fold newly buffered rows into the first block of rows.

        On the first call for a block the buffer length becomes the block
        size, and the last row is split into its leading number (stored as
        the buffered sequence start) and its text.  On later calls each
        new row is appended to the row it continues.  The last row is
        appended to the sequence row, padded with 'N' when its leading
        number lies beyond the end of what is already buffered.
        """
        if self._buffer_size == 0:
            if len(self._line_buffer) > 0:
                self._buffer_size = len(self._line_buffer)
                ll = self._line_buffer[self._buffer_size-1].split()
                self._line_buffer[self._buffer_size-1] = ll[1]
                self._buffered_seq_start = int(ll[0])
        else:
            size = self._buffer_size
            for i in range(size, len(self._line_buffer)-1):
                self._line_buffer[i-size] = self._line_buffer[i-size] + self._line_buffer[i].strip()
            ll = self._line_buffer[-1].split()
            # A zero or negative gap adds no padding ("N" * gap is empty).
            gap = int(ll[0]) - (self._buffered_seq_start + len(self._line_buffer[size-1]))
            self._line_buffer[size-1] += "N"*gap + ll[1]
            self._line_buffer = self._line_buffer[0:size]

    def _add_motif_match(self, line):
        """Do nothing; every branch of this callback was left empty."""
        pass

    def noevent(self, line):
        """Ignore the line."""
        pass
460 461 462
class MASTParser(AbstractParser):
    """
    Parser turning MAST text output into a MASTRecord (OBSOLETE).
    HTML output cannot be parsed, yet.

    A MASTParser takes a file handle for a MAST text output file and
    returns a MASTRecord, containing the hits between motifs and
    sequences. The parser does some unusual line buffering to parse out
    match diagrams. Really complex diagrams often lead to an error message
    and p-values not being parsed for a given line.

    Methods:
    parse (handle): parses the data from the file handle passed to it.

    Example:

    f = open("mast_file.txt")
    parser = MASTParser()
    mast_record = parser.parse(f)
    for motif in mast_record.motifs:
        for instance in motif.instances:
            print instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue

    This class is OBSOLETE; its functionality is now available through
    Bio.Motif.Parsers.MAST.
    """

    def __init__(self):
        """Pair a fresh consumer with a fresh scanner."""
        self._consumer = _MASTConsumer()
        self._scanner = _MASTScanner()

    def parse(self, handle):
        """Feed handle through the scanner and return the consumer's record."""
        consumer = self._consumer
        self._scanner.feed(handle, consumer)
        return consumer.data
496 497 498
class _MASTScanner:
    """
    Scanner that walks MAST text output line by line (OBSOLETE).

    This class is OBSOLETE; its functionality is now available through
    Bio.Motif.Parsers.MAST.
    """

    def feed(self, handle, consumer):
        """Scan MAST output from handle and report events to consumer.

        handle is wrapped in a File.UndoHandle unless it already is one.
        The header, the match table and the annotated matches are scanned
        in that order.
        """
        if not isinstance(handle, File.UndoHandle):
            handle = File.UndoHandle(handle)
        self._scan_header(handle, consumer)
        self._scan_matches(handle, consumer)
        self._scan_annotated_matches(handle, consumer)

    def _scan_header(self, uhandle, consumer):
        """Consume everything up to (not including) the 'SECTION II:' line.

        Raises ValueError when no 'MAST version' line can be found.
        """
        skip = consumer.noevent
        try:
            read_and_call_until(uhandle, skip, contains = "MAST version")
        except ValueError:
            raise ValueError("Improper input file. Does not begin with a line with 'MAST version'")
        steps = (
            (read_and_call, consumer._version, {'contains': 'MAST version'}),
            (read_and_call_until, skip, {'start': 'DATABASE AND MOTIFS'}),
            (read_and_call, skip, {'start': 'DATABASE'}),
            (read_and_call, skip, {'start': '****'}),
            (read_and_call, consumer._database, {'contains': 'DATABASE'}),
            (read_and_call_until, skip, {'contains': 'MOTIF WIDTH'}),
            (read_and_call, skip, {'contains': 'MOTIF'}),
            (read_and_call, skip, {'contains': '----'}),
            (read_and_call_until, consumer._add_motif, {'blank': 1}),
            (read_and_call_until, skip, {'start': 'SECTION II:'}),
        )
        for reader, callback, criteria in steps:
            reader(uhandle, callback, **criteria)

    def _scan_matches(self, uhandle, consumer):
        """Skip over the 'SEQUENCE NAME' table and the blank line after it.

        The table rows are read but ignored; the original left the call
        that would hand them to _add_sequence_match_with_diagram
        commented out.
        """
        skip = consumer.noevent
        read_and_call_until(uhandle, skip, start = 'SEQUENCE NAME')
        read_and_call(uhandle, skip, start = 'SEQUENCE NAME')
        read_and_call(uhandle, skip, start = '---')
        read_and_call_until(uhandle, skip, blank = 1)
        read_and_call(uhandle, skip, blank = 1)

    def _scan_annotated_matches(self, uhandle, consumer):
        """Scan the 'SECTION III:' per-sequence annotated matches.

        For each sequence the diagram lines are buffered and parsed, then
        the blocks that follow are buffered and collapsed until a blank
        line (next sequence) or a row of asterisks (end of section).
        """
        skip = consumer.noevent
        read_and_call_until(uhandle, skip, start = 'SECTION III:')
        read_and_call(uhandle, skip, start = 'SECTION III:')
        read_and_call_until(uhandle, skip, start = '****')
        read_and_call(uhandle, skip, start = '****')
        read_and_call_until(uhandle, skip, start = '*****')
        read_and_call(uhandle, skip)
        read_and_call_while(uhandle, skip, blank = 1)
        more_sequences = True
        while more_sequences:
            if consumer._current_seq:
                # Flush whatever the previous sequence left in the buffer.
                if consumer._buffer_size != 0:
                    consumer._parse_buffer(None)
                consumer._blank_buffer(None)
            read_and_call(uhandle, consumer._set_current_seq)
            # NOTE(review): this literal was recovered from a listing that
            # collapses runs of whitespace; confirm how many leading spaces
            # ' DIAGRAM' should have against real MAST output.
            read_and_call_until(uhandle, skip, start = ' DIAGRAM')
            read_and_call_until(uhandle, consumer._add_line_to_buffer, blank = 1)
            consumer._add_diagram_from_buffer(None)
            consumer._blank_buffer(None)
            read_and_call(uhandle, skip, blank = 1)
            while True:
                upcoming = safe_peekline(uhandle)
                if upcoming.startswith('****'):
                    consumer._parse_buffer(None)
                    more_sequences = False
                    break
                read_and_call_until(uhandle, consumer._add_line_to_buffer, blank = 1)
                read_and_call(uhandle, skip, blank = 1)
                consumer._collapse_buffer(None)
                if attempt_read_and_call(uhandle, skip, blank = 1):
                    # A second blank line: on to the next sequence.
                    break
                elif attempt_read_and_call(uhandle, skip, start = '*****'):
                    consumer._parse_buffer(None)
                    consumer._blank_buffer(None)
                    more_sequences = False
                    break
576 577 578
class MASTRecord:
    """The class for holding the results from a MAST run (OBSOLETE).

    A MASTRecord holds data about matches between motifs and sequences.
    The motifs held by the MASTRecord are objects of the class MEMEMotif.

    Methods:
    get_motif_matches_for_sequence(sequence_name): returns all of the
    motif matches within a given sequence. The matches are objects of
    the class MEME.Motif.Instance
    get_motif_matches (motif_name): returns all of the matches for a motif
    in the sequences searched. The matches returned are of class
    MEME.Motif.Instance
    get_motif_by_name (motif_name): returns a MEMEMotif with the given
    name.

    This class is OBSOLETE; its functionality is now available through
    Bio.Motif.Parsers.MAST.
    """

    def __init__(self):
        """Create an empty record."""
        self.sequences = []
        self.version = ""
        self.matches = []
        self.database = ""
        self.diagrams = {}
        self.alphabet = None
        self.motifs = []

    def _version(self, version):
        """Store the MAST version string."""
        self.version = version

    def _alphabet(self, alphabet):
        """Store alphabet if it is one of the accepted IUPAC alphabets.

        Accepted are IUPAC.protein, IUPAC.ambiguous_dna and
        IUPAC.unambiguous_dna.  Returns -1 (leaving the record unchanged)
        for anything else, and None on success.
        """
        if alphabet == IUPAC.protein or alphabet == IUPAC.ambiguous_dna or alphabet == IUPAC.unambiguous_dna:
            self.alphabet = alphabet
        else:
            return -1

    def _database(self, database):
        """Store the name of the searched database."""
        self.database = database

    def get_motif_matches_for_sequence(self, seq):
        """Return every motif instance on sequence seq, ordered by start.

        Instances are gathered across all motifs by comparing their
        ``sequence_name`` with seq.  The sort is stable, so instances
        with equal starts keep their motif order.
        """
        insts = []
        for m in self.motifs:
            for i in m.instances:
                if i.sequence_name == seq:
                    insts.append(i)
        # key= replaces the Python 2-only cmp function; the ordering is identical.
        insts.sort(key=lambda inst: inst.start)
        return insts

    def get_motif_matches(self, motif):
        """Return the instances of the stored motif named like motif.

        The lookup goes through get_motif_by_name(motif.name); if no
        stored motif has that name, the attribute access on None raises
        AttributeError.
        """
        m = self.get_motif_by_name(motif.name)
        return m.instances

    def _add_diagram_for_sequence(self, diagram, seq):
        """Remember diagram as the match diagram of sequence seq."""
        self.diagrams[seq] = diagram

    def _add_match(self, match):
        """Append match to the list of matches."""
        self.matches.append(match)

    def _add_sequence(self, sequence):
        """Append sequence to the list of sequences."""
        # NOTE(review): the listing this file was recovered from shows no
        # body for this method; appending to self.sequences mirrors the
        # sibling _add_match / _add_motif methods. Confirm against upstream.
        self.sequences.append(sequence)

    def _add_motif(self, motif):
        """Append motif to the list of motifs."""
        self.motifs.append(motif)

    def get_motif_by_name(self, name):
        """Return the first motif whose ``name`` equals name, else None."""
        for m in self.motifs:
            if m.name == name:
                return m
        return None
648