Package Bio :: Package Motif :: Package Parsers :: Module MEME
[hide private]
[frames] | no frames]

Source Code for Module Bio.Motif.Parsers.MEME

  1  # Copyright 2008 by Bartek Wilczynski 
  2  # Adapted from  Bio.MEME.Parser by Jason A. Hackney.  All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  from Bio.Alphabet import IUPAC 
  8  from Bio import Seq 
  9  import re 
 10  from math import sqrt 
 11  import sys 
 12  from Bio.Motif import Motif 
 13   
 14   
 15   
16 -def read(handle):
17 """Parses the text output of the MEME program into MEME.Record object. 18 19 Example: 20 21 >>> f = open("meme.output.txt") 22 >>> from Bio.Motif.Parsers import MEME 23 >>> record = MEME.read(f) 24 >>> for motif in record.motifs: 25 ... for instance in motif.instances: 26 ... print instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue 27 28 """ 29 record = MEMERecord() 30 __read_version(record, handle) 31 __read_datafile(record, handle) 32 __read_alphabet(record, handle) 33 __read_sequence_names(record, handle) 34 __read_command(record, handle) 35 for line in handle: 36 if line.startswith('MOTIF 1'): 37 break 38 else: 39 raise ValueError('Unexpected end of stream') 40 while True: 41 motif = __create_motif(line) 42 motif.alphabet = record.alphabet 43 record.motifs.append(motif) 44 __read_motif_name(motif, handle) 45 __read_motif_sequences(motif, handle, 'revcomp' in record.command) 46 __skip_unused_lines(handle) 47 try: 48 line = handle.next() 49 except StopIteration: 50 raise ValueError('Unexpected end of stream: Expected to find new motif, or the summary of motifs') 51 if line.startswith("SUMMARY OF MOTIFS"): 52 break 53 if not line.startswith('MOTIF'): 54 raise ValueError("Line does not start with 'MOTIF':\n%s" % line) 55 return record
56 57
58 -class MEMEMotif (Motif):
59 """A subclass of Motif used in parsing MEME (and MAST) output. 60 61 This sublcass defines functions and data specific to MEME motifs. 62 This includes the evalue for a motif and the PSSM of the motif. 63 64 Methods: 65 add_instance_from_values (name = 'default', pvalue = 1, sequence = 'ATA', start = 0, strand = +): create a new instance of the motif with the specified values. 66 add_to_pssm (position): add a new position to the pssm. The position should be a list of nucleotide/amino acid frequencies 67 add_to_logodds (position): add a new position to the log odds matrix. The position should be a tuple of log odds values for the nucleotide/amino acid at that position. 68 compare_motifs (other_motif): returns the maximum correlation between this motif and other_motif 69 """
70 - def __init__ (self):
71 Motif.__init__(self) 72 self.evalue = 0.0
73
74 - def _numoccurrences (self, number):
75 if type(number) == int: 76 self.num_occurrences = number 77 else: 78 number = int(number) 79 self.num_occurrences = number
80
81 - def get_instance_by_name (self,name):
82 for i in self.instances: 83 if i.sequence_name == name: 84 return i 85 return None
86
87 - def add_instance_from_values (self, name = 'default', pvalue = 1, sequence = 'ATA', start = 0, strand = '+'):
88 inst = MEMEInstance(sequence,self.alphabet) 89 inst._pvalue(pvalue) 90 inst._seqname(name) 91 inst._start(start) 92 inst._strand(strand) 93 if self.length: 94 inst._length(self.length) 95 if self.name: 96 inst._motifname(self.name) 97 self.add_instance(inst)
98
99 - def _evalue (self, evalue):
100 if type(evalue) == float: 101 self.evalue = evalue 102 else: 103 evalue = float(evalue) 104 self.evalue = evalue
105 106
107 -class MEMEInstance(Seq.Seq):
108 """A class describing the instances of a MEME motif, and the data thereof. 109 """
110 - def __init__ (self,*args,**kwds):
111 Seq.Seq.__init__(self,*args,**kwds) 112 self.sequence_name = "" 113 self.start = 0 114 self.pvalue = 1.0 115 self.strand = 0 116 self.length = 0 117 self.motif_name = ""
118 119
120 - def _seqname (self, name):
121 self.sequence_name = name
122
123 - def _motifname (self, name):
124 self.motif_name = name
125
126 - def _start (self,start):
127 start = int(start) 128 self.start = start
129
130 - def _pvalue (self,pval):
131 pval = float(pval) 132 self.pvalue = pval
133
134 - def _score (self, score):
135 score = float(score) 136 self.score = score
137
138 - def _strand (self, strand):
139 self.strand = strand
140
141 - def _length (self, length):
142 self.length = length
143 144
145 -class MEMERecord:
146 """A class for holding the results of a MEME run. 147 148 A MEMERecord is an object that holds the results from running 149 MEME. It implements no methods of its own. 150 151 """
152 - def __init__ (self):
153 """__init__ (self)""" 154 self.motifs = [] 155 self.version = "" 156 self.datafile = "" 157 self.command = "" 158 self.alphabet = None 159 self.sequence_names = []
160
161 - def get_motif_by_name (self, name):
162 for m in self.motifs: 163 if m.name == name: 164 return m
165 166 167 # Everything below is private 168 169
170 -def __read_version(record, handle):
171 for line in handle: 172 if line.startswith('MEME version'): 173 break 174 else: 175 raise ValueError("Improper input file. File should contain a line starting MEME version.") 176 line = line.strip() 177 ls = line.split() 178 record.version = ls[2]
179 180
181 -def __read_datafile(record, handle):
182 for line in handle: 183 if line.startswith('TRAINING SET'): 184 break 185 else: 186 raise ValueError("Unexpected end of stream: 'TRAINING SET' not found.") 187 try: 188 line = handle.next() 189 except StopIteration: 190 raise ValueError("Unexpected end of stream: Expected to find line starting with '****'") 191 if not line.startswith('****'): 192 raise ValueError("Line does not start with '****':\n%s" % line) 193 try: 194 line = handle.next() 195 except StopIteration: 196 raise ValueError("Unexpected end of stream: Expected to find line starting with 'DATAFILE'") 197 if not line.startswith('DATAFILE'): 198 raise ValueError("Line does not start with 'DATAFILE':\n%s" % line) 199 line = line.strip() 200 line = line.replace('DATAFILE= ','') 201 record.datafile = line
202 203
204 -def __read_alphabet(record, handle):
205 try: 206 line = handle.next() 207 except StopIteration: 208 raise ValueError("Unexpected end of stream: Expected to find line starting with 'ALPHABET'") 209 if not line.startswith('ALPHABET'): 210 raise ValueError("Line does not start with 'ALPHABET':\n%s" % line) 211 line = line.strip() 212 line = line.replace('ALPHABET= ','') 213 if line == 'ACGT': 214 al = IUPAC.unambiguous_dna 215 else: 216 al = IUPAC.protein 217 record.alphabet = al
218 219
220 -def __read_sequence_names(record, handle):
221 try: 222 line = handle.next() 223 except StopIteration: 224 raise ValueError("Unexpected end of stream: Expected to find line starting with 'Sequence name'") 225 if not line.startswith('Sequence name'): 226 raise ValueError("Line does not start with 'Sequence name':\n%s" % line) 227 try: 228 line = handle.next() 229 except StopIteration: 230 raise ValueError("Unexpected end of stream: Expected to find line starting with '----'") 231 if not line.startswith('----'): 232 raise ValueError("Line does not start with '----':\n%s" % line) 233 for line in handle: 234 if line.startswith('***'): 235 break 236 line = line.strip() 237 ls = line.split() 238 record.sequence_names.append(ls[0]) 239 if len(ls) == 6: 240 record.sequence_names.append(ls[3]) 241 else: 242 raise ValueError("Unexpected end of stream: Expected to find line starting with '***'")
243 244
245 -def __read_command(record, handle):
246 for line in handle: 247 if line.startswith('command:'): 248 break 249 else: 250 raise ValueError("Unexpected end of stream: Expected to find line starting with 'command'") 251 line = line.strip() 252 line = line.replace('command: ','') 253 record.command = line
254 255
256 -def __create_motif(line):
257 line = line.strip() 258 ls = line.split() 259 motif = MEMEMotif() 260 motif.length = int(ls[4]) 261 motif._numoccurrences(ls[7]) 262 motif._evalue(ls[13]) 263 return motif
264 265
266 -def __read_motif_name(motif, handle):
267 for line in handle: 268 if 'sorted by position p-value' in line: 269 break 270 else: 271 raise ValueError('Unexpected end of stream: Failed to find motif name') 272 line = line.strip() 273 ls = line.split() 274 name = " ".join(ls[0:2]) 275 motif.name=name
276 277
278 -def __read_motif_sequences(motif, handle, rv):
279 try: 280 line = handle.next() 281 except StopIteration: 282 raise ValueError('Unexpected end of stream: Failed to find motif sequences') 283 if not line.startswith('---'): 284 raise ValueError("Line does not start with '---':\n%s" % line) 285 try: 286 line = handle.next() 287 except StopIteration: 288 raise ValueError("Unexpected end of stream: Expected to find line starting with 'Sequence name'") 289 if not line.startswith('Sequence name'): 290 raise ValueError("Line does not start with 'Sequence name':\n%s" % line) 291 try: 292 line = handle.next() 293 except StopIteration: 294 raise ValueError('Unexpected end of stream: Failed to find motif sequences') 295 if not line.startswith('---'): 296 raise ValueError("Line does not start with '---':\n%s" % line) 297 for line in handle: 298 if line.startswith('---'): 299 break 300 line = line.strip() 301 ls = line.split() 302 if rv: 303 #seq = Seq.Seq(ls[5], record.alphabet) 304 motif.add_instance_from_values(name = ls[0], sequence = ls[5], start = ls[2], pvalue = ls[3], strand = ls[1]) 305 else: 306 #seq = Seq.Seq(ls[4], record.alphabet) 307 motif.add_instance_from_values(name = ls[0], sequence = ls[4], start = ls[1], pvalue = ls[2]) 308 else: 309 raise ValueError('Unexpected end of stream')
310 311
312 -def __skip_unused_lines(handle):
313 for line in handle: 314 if line.startswith('log-odds matrix'): 315 break 316 else: 317 raise ValueError("Unexpected end of stream: Expected to find line starting with 'log-odds matrix'") 318 for line in handle: 319 if line.startswith('---'): 320 break 321 else: 322 raise ValueError("Unexpected end of stream: Expected to find line starting with '---'") 323 for line in handle: 324 if line.startswith('letter-probability matrix'): 325 break 326 else: 327 raise ValueError("Unexpected end of stream: Expected to find line starting with 'letter-probability matrix'") 328 for line in handle: 329 if line.startswith('---'): 330 break 331 else: 332 raise ValueError("Unexpected end of stream: Expected to find line starting with '---'") 333 for line in handle: 334 if line.startswith('Time'): 335 break 336 else: 337 raise ValueError("Unexpected end of stream: Expected to find line starting with 'Time'") 338 try: 339 line = handle.next() 340 except StopIteration: 341 raise ValueError('Unexpected end of stream: Expected to find blank line') 342 if line.strip(): 343 raise ValueError("Expected blank line, but got:\n%s" % line) 344 try: 345 line = handle.next() 346 except StopIteration: 347 raise ValueError("Unexpected end of stream: Expected to find line starting with '***'") 348 if not line.startswith('***'): 349 raise ValueError("Line does not start with '***':\n%s" % line) 350 for line in handle: 351 if line.strip(): 352 break 353 else: 354 raise ValueError("Unexpected end of stream: Expected to find line starting with '***'") 355 if not line.startswith('***'): 356 raise ValueError("Line does not start with '***':\n%s" % line)
357 358 359 # Everything below is obsolete. 360 361 362 from Bio import File 363 from Bio.ParserSupport import * 364 365
366 -class MEMEParser (AbstractParser):
367 """A parser for the text output of the MEME program (OBSOLETE). 368 Parses the output into an object of the MEMERecord class. 369 370 Methods: 371 parse (handle): parses the contents of the file handle passed to it. 372 373 Example: 374 375 >>>f = open("meme.output.txt") 376 >>>parser = MEMEParser() 377 >>>meme_record = parser.parse(f) 378 >>>for motif in meme_record.motifs: 379 ... for instance in motif.instances: 380 ... print instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue 381 382 This class is OBSOLETE; please use the read() function in this module 383 instead. 384 """
385 - def __init__ (self):
386 """__init__ (self)""" 387 self._scanner = _MEMEScanner() 388 self._consumer = _MEMEConsumer()
389
390 - def parse (self, handle):
391 """parse (self, handle)""" 392 self._scanner.feed(handle, self._consumer) 393 return self._consumer.data
394 395 396
397 -class _MEMEScanner:
398 """Scanner for MEME output (OBSOLETE). 399 400 Methods: 401 feed 402 403 This class is OBSOLETE; please use the read() function in this module 404 instead. 405 """ 406
407 - def feed (self, handle, consumer):
408 """ 409 Feeds in MEME output for scanning. handle should 410 implement the readline method. consumer is 411 a Consumer object that can receive the salient events. 412 """ 413 if isinstance(handle, File.UndoHandle): 414 uhandle = handle 415 else: 416 uhandle = File.UndoHandle(handle) 417 418 self._scan_header(uhandle, consumer) 419 self._scan_motifs (uhandle, consumer)
420
421 - def _scan_header(self, uhandle, consumer):
422 try: 423 read_and_call_until(uhandle, consumer.noevent, contains = 'MEME version') 424 except ValueError: 425 raise ValueError("Improper input file. File should contain a line starting MEME version.") 426 read_and_call(uhandle, consumer._version, start = 'MEME version') 427 read_and_call_until(uhandle, consumer.noevent, start = 'TRAINING SET') 428 read_and_call(uhandle, consumer.noevent, start = 'TRAINING SET') 429 read_and_call(uhandle, consumer.noevent, start = '****') 430 read_and_call(uhandle, consumer._datafile, start = 'DATAFILE') 431 read_and_call(uhandle, consumer._alphabet, start = 'ALPHABET') 432 read_and_call(uhandle, consumer.noevent, start = 'Sequence name') 433 read_and_call(uhandle, consumer.noevent, start = '----') 434 read_and_call_until(uhandle, consumer._sequence_name, start = '***') 435 read_and_call_until(uhandle, consumer.noevent, start = 'command:') 436 read_and_call(uhandle, consumer._commandline, start = 'command:') 437 read_and_call_until(uhandle, consumer.noevent, start = 'MOTIF 1')
438
439 - def _scan_motifs(self, uhandle, consumer):
440 while 1: 441 read_and_call(uhandle, consumer._add_motif_with_info, start = 'MOTIF') 442 read_and_call_until(uhandle, consumer.noevent, contains = 'sorted by position p-value') 443 read_and_call(uhandle, consumer.motif_name, contains = 'sorted by position p-value') 444 read_and_call(uhandle, consumer.noevent, start = '---') 445 read_and_call(uhandle, consumer.noevent, start = 'Sequence name') 446 read_and_call(uhandle, consumer.noevent, start = '---') 447 read_and_call_until(uhandle, consumer.add_instance, start = '---') 448 read_and_call_until(uhandle, consumer.noevent, start = 'log-odds matrix') 449 read_and_call(uhandle, consumer.noevent) 450 read_and_call_until(uhandle, consumer.add_to_logodds, start = '---') 451 read_and_call_until(uhandle, consumer.noevent, start = 'letter-probability matrix') 452 read_and_call(uhandle, consumer.noevent, start = 'letter-probability matrix') 453 read_and_call_until(uhandle, consumer.add_to_pssm, start = '---') 454 read_and_call_until(uhandle, consumer.noevent, start = 'Time') 455 read_and_call(uhandle, consumer.noevent, start = 'Time') 456 read_and_call(uhandle, consumer.noevent, blank = 1) 457 read_and_call(uhandle, consumer.noevent, start = '***') 458 read_and_call_while(uhandle, consumer.noevent, blank = 1) 459 read_and_call(uhandle, consumer.noevent, start = '***') 460 line = safe_peekline(uhandle) 461 if line.startswith("SUMMARY OF MOTIFS"): 462 break
463 464 465
466 -class _MEMEConsumer:
467 """ 468 Consumer that can receive events from MEME Scanner (OBSOLETE). 469 470 This is the Consumer object that should be passed to the 471 MEME Scanner. 472 473 This class is OBSOLETE; please use the read() function in this module 474 instead. 475 """ 476
477 - def __init__ (self):
478 self.current_motif = None 479 self.sequence_names = [] 480 self.data = MEMERecord()
481
482 - def _version (self, line):
483 line = line.strip() 484 ls = line.split() 485 self.data.version = ls[2]
486
487 - def _datafile (self, line):
488 line = line.strip() 489 line = line.replace('DATAFILE= ','') 490 self.data.datafile = line
491
492 - def _alphabet (self, line):
493 line = line.strip() 494 line = line.replace('ALPHABET= ','') 495 if line == 'ACGT': 496 al = IUPAC.unambiguous_dna 497 else: 498 al = IUPAC.protein 499 self.data.alphabet = al
500
501 - def _sequence_name (self, line):
502 line = line.strip() 503 ls = line.split() 504 self.data.sequence_names.append(ls[0]) 505 if len(ls) == 6: 506 self.data.sequence_names.append(ls[3])
507
508 - def _commandline (self, line):
509 line = line.strip() 510 line = line.replace('command: ','') 511 self.data.command = line
512
513 - def _add_motif_with_info (self, line):
514 line = line.strip() 515 ls = line.split() 516 motif = MEMEMotif() 517 #motif.length=ls[4] 518 motif._numoccurrences(ls[7]) 519 motif._evalue(ls[13]) 520 motif.alphabet=self.data.alphabet 521 self.data.motifs.append(motif) 522 self.current_motif = motif
523
524 - def motif_name (self, line):
525 line = line.strip() 526 ls = line.split() 527 name = " ".join(ls[0:2]) 528 self.current_motif.name=name
529
530 - def add_instance (self, line):
531 line = line.strip() 532 ls = line.split() 533 if self.data.command.find('revcomp') != -1: 534 #seq = Seq.Seq(ls[5], self.data.alphabet) 535 self.current_motif.add_instance_from_values(name = ls[0], sequence = ls[5], start = ls[2], pvalue = ls[3], strand = ls[1]) 536 else: 537 #seq = Seq.Seq(ls[4], self.data.alphabet) 538 self.current_motif.add_instance_from_values(name = ls[0], sequence = ls[4], start = ls[1], pvalue = ls[2])
539
540 - def add_to_pssm (self, line):
541 pass
542
543 - def add_to_logodds (self, line):
544 pass
545
546 - def noevent (self,line):
547 pass
548 549 550
551 -class _MASTConsumer:
552 """ 553 Consumer that can receive events from _MASTScanner (OBSOLETE). 554 555 A _MASTConsumer parses lines from a mast text output file. 556 The motif match diagrams are parsed using line buffering. 557 Each of the buffering functions have a dummy variable that is 558 required for testing using the Bio.ParserSupport.TaggingConsumer. 559 If this variable isn't there, the TaggingConsumer barfs. In 560 the _MASTScanner, None is passed in the place of this variable. 561 562 This class is OBSOLETE; please use the read() function in the module 563 Bio.Motif.Parsers.MAST instead. 564 """
565 - def __init__ (self):
566 self.data = MASTRecord() 567 self._current_seq = "" 568 self._line_buffer = [] 569 self._buffer_size = 0 570 self._buffered_seq_start = 0
571
572 - def _version (self, line):
573 line = line.strip() 574 ls = line.split() 575 self.data._version(ls[2])
576
577 - def _database (self, line):
578 line = line.strip() 579 ls = line.split() 580 self.data._database(ls[1]) 581 al = "" 582 if ls[2] == '(nucleotide)': 583 al = IUPAC.unambiguous_dna 584 self.data._alphabet(al) 585 else: 586 al = IUPAC.protein 587 self.data._alphabet(al)
588
589 - def _add_motif (self, line):
590 line = line.strip() 591 ls = line.split() 592 m = MEMEMotif() 593 m.alphabet=self.data.alphabet 594 m.length=ls[1] 595 name = ls[0] 596 m.name=name 597 m.add_instance(ls[2]) 598 self.data._add_motif(m)
599
600 - def _add_match_diagram (self, line):
601 line = line.strip() 602 ls = line.split() 603 self.data._add_diagram_for_sequence(ls[1], self._current_seq) 604 ds = ls[1].split('_') 605 i = 0 606 start = 0 607 for i in range(0,len(ds)): 608 if ds[i].find('[') != -1 or ds[i].find('<') != -1: 609 inst = MEMEInstance() 610 inst._seqname (self._current_seq) 611 inst._start (start) 612 r = re.compile('\d+') 613 mn = r.findall(ds[i])[0] 614 if ds[i].find('-') != -1: 615 inst.strand = '-' 616 else: 617 inst.strand = '+' 618 motif = self.data.get_motif_by_name(mn) 619 motif.add_instance(inst) 620 start += motif.length 621 else: 622 start += int(ds[i])
623
624 - def _add_sequence_match_with_diagram (self, line):
625 line = line.strip() 626 ls = line.split() 627 self.data._add_sequence(ls[0]) 628 self.data._add_diagram_for_sequence(ls[2],ls[0]) 629 ds = ls[2].split('_') 630 i = 0 631 start = 0 632 for i in range(0,len(ds)): 633 if ds[i].find('+') != -1 or ds[i].find('-') != -1: 634 inst = MEMEInstance() 635 inst._seqname (ls[0]) 636 inst._start (start) 637 r = re.compile('\d+') 638 mn = r.findall(ds[i])[0] 639 if ds[i].find('-') != -1: 640 inst.strand = '-' 641 else: 642 inst.strand = '+' 643 motif = self.data.get_motif_by_name(mn) 644 motif.add_instance(inst) 645 start += motif.length 646 else: 647 start += int(ds[i])
648
649 - def _add_diagram_from_buffer (self, dummy):
650 line = "" 651 for l in self._line_buffer: 652 line += l.strip() 653 ls = line.split() 654 self.data._add_diagram_for_sequence(ls[1], self._current_seq) 655 ds = ls[1].split('_') 656 i = 0 657 start = 0 658 for i in range(0,len(ds)): 659 if ds[i].find('[') != -1 or ds[i].find('<') != -1: 660 inst = MEMEInstance() 661 inst._seqname (self._current_seq) 662 inst._start (start) 663 r = re.compile('\d+') 664 mn = r.findall(ds[i])[0] 665 if ds[i].find('-') != -1: 666 inst.strand = '-' 667 else: 668 inst.strand = '+' 669 motif = self.data.get_motif_by_name(mn) 670 motif.add_instance(inst) 671 start += motif.length 672 else: 673 start += int(ds[i])
674
675 - def _set_current_seq (self, line):
676 line = line.strip() 677 self._current_seq = line 678 if not self.data.sequences.count(line): 679 self.data.sequences.append(line)
680
681 - def _add_line_to_buffer (self, line):
682 line = line.strip() 683 if not line.startswith('*****'): 684 self._line_buffer.append(line) 685 else: 686 return -1
687
688 - def _parse_buffer (self, dummy):
689 """Parses the line buffer to get e-values for each instance of a motif. 690 This buffer parser is the most likely point of failure for the 691 MASTParser. 692 """ 693 insts = self.data.get_motif_matches_for_sequence(self._current_seq) 694 if len(insts) > 0: 695 696 fullSeq = self._line_buffer[self._buffer_size-1] 697 pvals = self._line_buffer[1].split() 698 p = 0 699 lpval = len(pvals) 700 while p < lpval: 701 if pvals[p].count('e') > 1: 702 #Break blocks up by e and parse into valid floats. This only 703 #works if there are no e-values greater than 1e-5. 704 pvs = [] 705 spe = pvals[p].split('e') 706 spe.reverse() 707 dotind = spe[1].find('.') 708 if dotind == -1: 709 thispval = spe[1][-1] + 'e' + spe[0] 710 else: 711 thispval = spe[1][dotind-1:] + 'e' + spe[0] 712 pvs.append(thispval) 713 for spi in range(2,len(spe)): 714 dotind = spe[spi].find('.') 715 prevdotind = spe[spi-1].find('.') 716 if dotind != -1: 717 if prevdotind == -1: 718 thispval = spe[spi][dotind-1:] + 'e' + spe[spi-1][:-1] 719 else: 720 thispval = spe[spi][dotind-1:] + 'e' + spe[spi-1][0:prevdotind-1] 721 else: 722 if prevdotind == -1: 723 thispval = spe[spi][-1] + 'e' + spe[spi-1][:-1] 724 else: 725 thispval = spe[spi][-1] + 'e' + spe[spi-1][0:prevdotind-1] 726 pvs.append(thispval) 727 pvs.reverse() 728 if p > 0: 729 pvals = pvals[0:p] + pvs + pvals[p+1:] 730 else: 731 pvals = pvs + pvals[p+1:] 732 lpval = len(pvals) 733 p += 1 734 i = 0 735 if len(pvals) != len(insts): 736 sys.stderr.write("Failure to parse p-values for " + self._current_seq + ": " + self._line_buffer[1] + " to: " + str(pvals) + "\n") 737 pvals = [] 738 # else: 739 # sys.stderr.write('These are just fine' + self._current_seq + ': ' + self._line_buffer[1] + " to: " + str(pvals) + "\n") 740 for i in range(0,len(insts)): 741 inst = insts[i] 742 start = inst.start - self._buffered_seq_start + 1 743 thisSeq = fullSeq[start:start+inst.length] 744 thisSeq = Seq.Seq(thisSeq, self.data.alphabet) 745 inst._sequence(thisSeq) 746 if pvals: 747 inst._pvalue(float(pvals[i]))
748
749 - def _blank_buffer (self, dummy):
750 self._line_buffer = [] 751 self._buffer_size = 0
752
753 - def _collapse_buffer(self, dummy):
754 if self._buffer_size == 0: 755 if len(self._line_buffer) > 0: 756 self._buffer_size = len(self._line_buffer) 757 ll = self._line_buffer[self._buffer_size-1].split() 758 self._line_buffer[self._buffer_size-1] = ll[1] 759 self._buffered_seq_start = int(ll[0]) 760 else: 761 i = 0 762 for i in range(self._buffer_size, len(self._line_buffer)-1): 763 self._line_buffer[i-self._buffer_size] = self._line_buffer[i-self._buffer_size] + self._line_buffer[i].strip() 764 ll = self._line_buffer[len(self._line_buffer)-1].split() 765 if int(ll[0]) == self._buffered_seq_start + len(self._line_buffer[self._buffer_size-1]): 766 self._line_buffer[self._buffer_size-1] += ll[1] 767 else: 768 differ = int(ll[0]) - (self._buffered_seq_start + len(self._line_buffer[self._buffer_size-1])) 769 self._line_buffer[self._buffer_size-1] += "N"*differ 770 self._line_buffer[self._buffer_size-1] += ll[1] 771 self._line_buffer = self._line_buffer[0:self._buffer_size]
772
773 - def _add_motif_match (self, line):
774 line = line.strip() 775 if line.find('[') != -1 or line.find('<') != -1: 776 pass 777 elif line.find('e') != -1: 778 pass 779 elif line.find('+') != -1: 780 pass
781
782 - def noevent (self, line):
783 pass
784 785
786 -class MASTParser(AbstractParser):
787 """ 788 Parser for MAST text output (OBSOLETE). 789 HTML output cannot be parsed, yet. Returns a MASTRecord 790 791 A MASTParser takes a file handle for a MAST text output file and 792 returns a MASTRecord, containing the hits between motifs and 793 sequences. The parser does some unusual line buffering to parse out 794 match diagrams. Really complex diagrams often lead to an error message 795 and p-values not being parsed for a given line. 796 797 Methods: 798 parse (handle): parses the data from the file handle passed to it. 799 800 Example: 801 802 >>>f = open("mast_file.txt") 803 >>>parser = MASTParser() 804 >>>mast_record = parser.parse(f) 805 >>>for motif in mast_record.motifs: 806 >>> for instance in motif.instances: 807 >>> print instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue 808 809 This class is OBSOLETE; please use the read() function in the module 810 Bio.Motif.Parsers.MAST instead. 811 """
812 - def __init__ (self):
813 self._consumer = _MASTConsumer() 814 self._scanner = _MASTScanner()
815
816 - def parse (self, handle):
817 self._scanner.feed(handle, self._consumer) 818 return self._consumer.data
819 820 821
822 -class _MASTScanner:
823 """ 824 Scanner for MAST text output (OBSOLETE). 825 826 This class is OBSOLETE; please use the read() function in the module 827 Bio.Motif.Parsers.MAST instead. 828 """
829 - def feed (self, handle, consumer):
830 if isinstance(handle, File.UndoHandle): 831 uhandle = handle 832 else: 833 uhandle = File.UndoHandle(handle) 834 835 self._scan_header(uhandle, consumer) 836 self._scan_matches(uhandle, consumer) 837 self._scan_annotated_matches(uhandle, consumer)
838
839 - def _scan_header (self, uhandle, consumer):
840 try: 841 read_and_call_until(uhandle, consumer.noevent, contains = "MAST version") 842 except ValueError: 843 raise ValueError("Improper input file. Does not begin with a line with 'MAST version'") 844 read_and_call(uhandle, consumer._version, contains = 'MAST version') 845 read_and_call_until(uhandle, consumer.noevent, start = 'DATABASE AND MOTIFS') 846 read_and_call(uhandle, consumer.noevent, start = 'DATABASE') 847 read_and_call(uhandle, consumer.noevent, start = '****') 848 read_and_call(uhandle, consumer._database, contains = 'DATABASE') 849 read_and_call_until(uhandle, consumer.noevent, contains = 'MOTIF WIDTH') 850 read_and_call(uhandle, consumer.noevent, contains = 'MOTIF') 851 read_and_call(uhandle, consumer.noevent, contains = '----') 852 read_and_call_until(uhandle, consumer._add_motif, blank = 1) 853 read_and_call_until(uhandle, consumer.noevent, start = 'SECTION II:')
854
855 - def _scan_matches (self, uhandle, consumer):
856 read_and_call_until(uhandle, consumer.noevent, start = 'SEQUENCE NAME') 857 read_and_call(uhandle, consumer.noevent, start = 'SEQUENCE NAME') 858 read_and_call(uhandle, consumer.noevent, start = '---') 859 # read_and_call_until(uhandle, consumer._add_sequence_match_with_diagram, blank = 1) 860 read_and_call_until(uhandle, consumer.noevent, blank = 1) 861 read_and_call(uhandle, consumer.noevent, blank = 1)
862
863 - def _scan_annotated_matches (self, uhandle, consumer):
864 read_and_call_until(uhandle, consumer.noevent, start = 'SECTION III:') 865 read_and_call(uhandle, consumer.noevent, start = 'SECTION III:') 866 read_and_call_until(uhandle, consumer.noevent, start = '****') 867 read_and_call(uhandle, consumer.noevent, start = '****') 868 read_and_call_until(uhandle, consumer.noevent, start = '*****') 869 read_and_call(uhandle, consumer.noevent) 870 read_and_call_while(uhandle, consumer.noevent, blank = 1) 871 readMatches = 1 872 while readMatches == 1: 873 if consumer._current_seq: 874 if consumer._buffer_size != 0: 875 consumer._parse_buffer(None) 876 consumer._blank_buffer(None) 877 read_and_call(uhandle, consumer._set_current_seq) 878 read_and_call_until(uhandle, consumer.noevent, start = ' DIAGRAM') 879 read_and_call_until(uhandle, consumer._add_line_to_buffer, blank = 1) 880 consumer._add_diagram_from_buffer(None) 881 consumer._blank_buffer(None) 882 read_and_call(uhandle, consumer.noevent, blank = 1) 883 while 1: 884 line = safe_peekline(uhandle) 885 if line.startswith('****'): 886 consumer._parse_buffer(None) 887 readMatches = 0 888 break 889 read_and_call_until(uhandle, consumer._add_line_to_buffer, blank = 1) 890 read_and_call(uhandle, consumer.noevent, blank = 1) 891 consumer._collapse_buffer(None) 892 if attempt_read_and_call(uhandle, consumer.noevent, blank = 1): 893 break 894 elif attempt_read_and_call(uhandle, consumer.noevent, start = '*****'): 895 consumer._parse_buffer(None) 896 consumer._blank_buffer(None) 897 readMatches = 0 898 break
899 900 901
902 -class MASTRecord:
903 """The class for holding the results from a MAST run (OBSOLETE). 904 905 A MASTRecord holds data about matches between motifs and sequences. 906 The motifs held by the MASTRecord are objects of the class MEMEMotif. 907 908 Methods: 909 get_motif_matches_for_sequence(sequence_name): returns all of the 910 motif matches within a given sequence. The matches are objects of 911 the class MEMEInstance 912 get_motif_matches (motif_name): returns all of the matches for a motif 913 in the sequences searched. The matches returned are of class 914 MEMEInstance 915 get_motif_by_name (motif_name): returns a MEMEMotif with the given 916 name. 917 918 This class is OBSOLETE; please use the read() function in the module 919 Bio.Motif.Parsers.MAST instead. 920 """
921 - def __init__ (self):
922 self.sequences = [] 923 self.version = "" 924 self.matches = [] 925 self.database = "" 926 self.diagrams = {} 927 self.alphabet = None 928 self.motifs = []
929
930 - def _version (self, version):
931 self.version = version
932
933 - def _alphabet (self, alphabet):
934 if alphabet == IUPAC.protein or alphabet == IUPAC.ambiguous_dna or alphabet == IUPAC.unambiguous_dna: 935 self.alphabet = alphabet 936 else: 937 return -1
938
939 - def _database(self, database):
940 self.database = database
941
942 - def get_motif_matches_for_sequence (self, seq):
943 insts = [] 944 for m in self.motifs: 945 for i in m.instances: 946 if i.sequence_name == seq: 947 insts.append(i) 948 insts.sort(lambda x,y: cmp(x.start, y.start)) 949 return insts
950
951 - def get_motif_matches (self, motif):
952 m = self.get_motif_by_name (motif.name) 953 return m.instances
954
955 - def _add_diagram_for_sequence (self, diagram, seq):
956 self.diagrams[seq] = diagram
957
958 - def _add_match (self, match):
959 self.matches.append(match)
960
961 - def _add_sequence (self, sequence):
963
964 - def _add_motif (self, motif):
965 self.motifs.append(motif)
966
967 - def get_motif_by_name (self, name):
968 for m in self.motifs: 969 if m.name == name: 970 return m
971