1
2
3
4
5
6
7 from Bio.Alphabet import IUPAC
8 from Bio import Seq
9 import re
10 from math import sqrt
11 import sys
12 from Bio.Motif import Motif
13
14
15
17 """Parses the text output of the MEME program into MEME.Record object.
18
19 Example:
20
21 >>> f = open("meme.output.txt")
22 >>> from Bio.Motif.Parsers import MEME
23 >>> record = MEME.read(f)
24 >>> for motif in record.motifs:
25 ... for instance in motif.instances:
26 ... print instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue
27
28 """
29 record = MEMERecord()
30 __read_version(record, handle)
31 __read_datafile(record, handle)
32 __read_alphabet(record, handle)
33 __read_sequence_names(record, handle)
34 __read_command(record, handle)
35 for line in handle:
36 if line.startswith('MOTIF 1'):
37 break
38 else:
39 raise ValueError('Unexpected end of stream')
40 while True:
41 motif = __create_motif(line)
42 motif.alphabet = record.alphabet
43 record.motifs.append(motif)
44 __read_motif_name(motif, handle)
45 __read_motif_sequences(motif, handle, 'revcomp' in record.command)
46 __skip_unused_lines(handle)
47 try:
48 line = handle.next()
49 except StopIteration:
50 raise ValueError('Unexpected end of stream: Expected to find new motif, or the summary of motifs')
51 if line.startswith("SUMMARY OF MOTIFS"):
52 break
53 if not line.startswith('MOTIF'):
54 raise ValueError("Line does not start with 'MOTIF':\n%s" % line)
55 return record
56
57
59 """A subclass of Motif used in parsing MEME (and MAST) output.
60
61 This sublcass defines functions and data specific to MEME motifs.
62 This includes the evalue for a motif and the PSSM of the motif.
63
64 Methods:
65 add_instance_from_values (name = 'default', pvalue = 1, sequence = 'ATA', start = 0, strand = +): create a new instance of the motif with the specified values.
66 add_to_pssm (position): add a new position to the pssm. The position should be a list of nucleotide/amino acid frequencies
67 add_to_logodds (position): add a new position to the log odds matrix. The position should be a tuple of log odds values for the nucleotide/amino acid at that position.
68 compare_motifs (other_motif): returns the maximum correlation between this motif and other_motif
69 """
73
75 if type(number) == int:
76 self.num_occurrences = number
77 else:
78 number = int(number)
79 self.num_occurrences = number
80
82 for i in self.instances:
83 if i.sequence_name == name:
84 return i
85 return None
86
98
100 if type(evalue) == float:
101 self.evalue = evalue
102 else:
103 evalue = float(evalue)
104 self.evalue = evalue
105
106
108 """A class describing the instances of a MEME motif, and the data thereof.
109 """
118
119
121 self.sequence_name = name
122
125
129
131 pval = float(pval)
132 self.pvalue = pval
133
137
140
143
144
146 """A class for holding the results of a MEME run.
147
148 A MEMERecord is an object that holds the results from running
149 MEME. It implements no methods of its own.
150
151 """
153 """__init__ (self)"""
154 self.motifs = []
155 self.version = ""
156 self.datafile = ""
157 self.command = ""
158 self.alphabet = None
159 self.sequence_names = []
160
162 for m in self.motifs:
163 if m.name == name:
164 return m
165
166
167
168
169
171 for line in handle:
172 if line.startswith('MEME version'):
173 break
174 else:
175 raise ValueError("Improper input file. File should contain a line starting MEME version.")
176 line = line.strip()
177 ls = line.split()
178 record.version = ls[2]
179
180
182 for line in handle:
183 if line.startswith('TRAINING SET'):
184 break
185 else:
186 raise ValueError("Unexpected end of stream: 'TRAINING SET' not found.")
187 try:
188 line = handle.next()
189 except StopIteration:
190 raise ValueError("Unexpected end of stream: Expected to find line starting with '****'")
191 if not line.startswith('****'):
192 raise ValueError("Line does not start with '****':\n%s" % line)
193 try:
194 line = handle.next()
195 except StopIteration:
196 raise ValueError("Unexpected end of stream: Expected to find line starting with 'DATAFILE'")
197 if not line.startswith('DATAFILE'):
198 raise ValueError("Line does not start with 'DATAFILE':\n%s" % line)
199 line = line.strip()
200 line = line.replace('DATAFILE= ','')
201 record.datafile = line
202
203
205 try:
206 line = handle.next()
207 except StopIteration:
208 raise ValueError("Unexpected end of stream: Expected to find line starting with 'ALPHABET'")
209 if not line.startswith('ALPHABET'):
210 raise ValueError("Line does not start with 'ALPHABET':\n%s" % line)
211 line = line.strip()
212 line = line.replace('ALPHABET= ','')
213 if line == 'ACGT':
214 al = IUPAC.unambiguous_dna
215 else:
216 al = IUPAC.protein
217 record.alphabet = al
218
219
221 try:
222 line = handle.next()
223 except StopIteration:
224 raise ValueError("Unexpected end of stream: Expected to find line starting with 'Sequence name'")
225 if not line.startswith('Sequence name'):
226 raise ValueError("Line does not start with 'Sequence name':\n%s" % line)
227 try:
228 line = handle.next()
229 except StopIteration:
230 raise ValueError("Unexpected end of stream: Expected to find line starting with '----'")
231 if not line.startswith('----'):
232 raise ValueError("Line does not start with '----':\n%s" % line)
233 for line in handle:
234 if line.startswith('***'):
235 break
236 line = line.strip()
237 ls = line.split()
238 record.sequence_names.append(ls[0])
239 if len(ls) == 6:
240 record.sequence_names.append(ls[3])
241 else:
242 raise ValueError("Unexpected end of stream: Expected to find line starting with '***'")
243
244
246 for line in handle:
247 if line.startswith('command:'):
248 break
249 else:
250 raise ValueError("Unexpected end of stream: Expected to find line starting with 'command'")
251 line = line.strip()
252 line = line.replace('command: ','')
253 record.command = line
254
255
264
265
267 for line in handle:
268 if 'sorted by position p-value' in line:
269 break
270 else:
271 raise ValueError('Unexpected end of stream: Failed to find motif name')
272 line = line.strip()
273 ls = line.split()
274 name = " ".join(ls[0:2])
275 motif.name=name
276
277
279 try:
280 line = handle.next()
281 except StopIteration:
282 raise ValueError('Unexpected end of stream: Failed to find motif sequences')
283 if not line.startswith('---'):
284 raise ValueError("Line does not start with '---':\n%s" % line)
285 try:
286 line = handle.next()
287 except StopIteration:
288 raise ValueError("Unexpected end of stream: Expected to find line starting with 'Sequence name'")
289 if not line.startswith('Sequence name'):
290 raise ValueError("Line does not start with 'Sequence name':\n%s" % line)
291 try:
292 line = handle.next()
293 except StopIteration:
294 raise ValueError('Unexpected end of stream: Failed to find motif sequences')
295 if not line.startswith('---'):
296 raise ValueError("Line does not start with '---':\n%s" % line)
297 for line in handle:
298 if line.startswith('---'):
299 break
300 line = line.strip()
301 ls = line.split()
302 if rv:
303
304 motif.add_instance_from_values(name = ls[0], sequence = ls[5], start = ls[2], pvalue = ls[3], strand = ls[1])
305 else:
306
307 motif.add_instance_from_values(name = ls[0], sequence = ls[4], start = ls[1], pvalue = ls[2])
308 else:
309 raise ValueError('Unexpected end of stream')
310
311
313 for line in handle:
314 if line.startswith('log-odds matrix'):
315 break
316 else:
317 raise ValueError("Unexpected end of stream: Expected to find line starting with 'log-odds matrix'")
318 for line in handle:
319 if line.startswith('---'):
320 break
321 else:
322 raise ValueError("Unexpected end of stream: Expected to find line starting with '---'")
323 for line in handle:
324 if line.startswith('letter-probability matrix'):
325 break
326 else:
327 raise ValueError("Unexpected end of stream: Expected to find line starting with 'letter-probability matrix'")
328 for line in handle:
329 if line.startswith('---'):
330 break
331 else:
332 raise ValueError("Unexpected end of stream: Expected to find line starting with '---'")
333 for line in handle:
334 if line.startswith('Time'):
335 break
336 else:
337 raise ValueError("Unexpected end of stream: Expected to find line starting with 'Time'")
338 try:
339 line = handle.next()
340 except StopIteration:
341 raise ValueError('Unexpected end of stream: Expected to find blank line')
342 if line.strip():
343 raise ValueError("Expected blank line, but got:\n%s" % line)
344 try:
345 line = handle.next()
346 except StopIteration:
347 raise ValueError("Unexpected end of stream: Expected to find line starting with '***'")
348 if not line.startswith('***'):
349 raise ValueError("Line does not start with '***':\n%s" % line)
350 for line in handle:
351 if line.strip():
352 break
353 else:
354 raise ValueError("Unexpected end of stream: Expected to find line starting with '***'")
355 if not line.startswith('***'):
356 raise ValueError("Line does not start with '***':\n%s" % line)
357
358
359
360
361
362 from Bio import File
363 from Bio.ParserSupport import *
364
365
367 """A parser for the text output of the MEME program (OBSOLETE).
368 Parses the output into an object of the MEMERecord class.
369
370 Methods:
371 parse (handle): parses the contents of the file handle passed to it.
372
373 Example:
374
375 >>>f = open("meme.output.txt")
376 >>>parser = MEMEParser()
377 >>>meme_record = parser.parse(f)
378 >>>for motif in meme_record.motifs:
379 ... for instance in motif.instances:
380 ... print instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue
381
382 This class is OBSOLETE; please use the read() function in this module
383 instead.
384 """
389
390 - def parse (self, handle):
391 """parse (self, handle)"""
392 self._scanner.feed(handle, self._consumer)
393 return self._consumer.data
394
395
396
398 """Scanner for MEME output (OBSOLETE).
399
400 Methods:
401 feed
402
403 This class is OBSOLETE; please use the read() function in this module
404 instead.
405 """
406
407 - def feed (self, handle, consumer):
408 """
409 Feeds in MEME output for scanning. handle should
410 implement the readline method. consumer is
411 a Consumer object that can receive the salient events.
412 """
413 if isinstance(handle, File.UndoHandle):
414 uhandle = handle
415 else:
416 uhandle = File.UndoHandle(handle)
417
418 self._scan_header(uhandle, consumer)
419 self._scan_motifs (uhandle, consumer)
420
422 try:
423 read_and_call_until(uhandle, consumer.noevent, contains = 'MEME version')
424 except ValueError:
425 raise ValueError("Improper input file. File should contain a line starting MEME version.")
426 read_and_call(uhandle, consumer._version, start = 'MEME version')
427 read_and_call_until(uhandle, consumer.noevent, start = 'TRAINING SET')
428 read_and_call(uhandle, consumer.noevent, start = 'TRAINING SET')
429 read_and_call(uhandle, consumer.noevent, start = '****')
430 read_and_call(uhandle, consumer._datafile, start = 'DATAFILE')
431 read_and_call(uhandle, consumer._alphabet, start = 'ALPHABET')
432 read_and_call(uhandle, consumer.noevent, start = 'Sequence name')
433 read_and_call(uhandle, consumer.noevent, start = '----')
434 read_and_call_until(uhandle, consumer._sequence_name, start = '***')
435 read_and_call_until(uhandle, consumer.noevent, start = 'command:')
436 read_and_call(uhandle, consumer._commandline, start = 'command:')
437 read_and_call_until(uhandle, consumer.noevent, start = 'MOTIF 1')
438
440 while 1:
441 read_and_call(uhandle, consumer._add_motif_with_info, start = 'MOTIF')
442 read_and_call_until(uhandle, consumer.noevent, contains = 'sorted by position p-value')
443 read_and_call(uhandle, consumer.motif_name, contains = 'sorted by position p-value')
444 read_and_call(uhandle, consumer.noevent, start = '---')
445 read_and_call(uhandle, consumer.noevent, start = 'Sequence name')
446 read_and_call(uhandle, consumer.noevent, start = '---')
447 read_and_call_until(uhandle, consumer.add_instance, start = '---')
448 read_and_call_until(uhandle, consumer.noevent, start = 'log-odds matrix')
449 read_and_call(uhandle, consumer.noevent)
450 read_and_call_until(uhandle, consumer.add_to_logodds, start = '---')
451 read_and_call_until(uhandle, consumer.noevent, start = 'letter-probability matrix')
452 read_and_call(uhandle, consumer.noevent, start = 'letter-probability matrix')
453 read_and_call_until(uhandle, consumer.add_to_pssm, start = '---')
454 read_and_call_until(uhandle, consumer.noevent, start = 'Time')
455 read_and_call(uhandle, consumer.noevent, start = 'Time')
456 read_and_call(uhandle, consumer.noevent, blank = 1)
457 read_and_call(uhandle, consumer.noevent, start = '***')
458 read_and_call_while(uhandle, consumer.noevent, blank = 1)
459 read_and_call(uhandle, consumer.noevent, start = '***')
460 line = safe_peekline(uhandle)
461 if line.startswith("SUMMARY OF MOTIFS"):
462 break
463
464
465
467 """
468 Consumer that can receive events from MEME Scanner (OBSOLETE).
469
470 This is the Consumer object that should be passed to the
471 MEME Scanner.
472
473 This class is OBSOLETE; please use the read() function in this module
474 instead.
475 """
476
478 self.current_motif = None
479 self.sequence_names = []
480 self.data = MEMERecord()
481
486
488 line = line.strip()
489 line = line.replace('DATAFILE= ','')
490 self.data.datafile = line
491
500
502 line = line.strip()
503 ls = line.split()
504 self.data.sequence_names.append(ls[0])
505 if len(ls) == 6:
506 self.data.sequence_names.append(ls[3])
507
509 line = line.strip()
510 line = line.replace('command: ','')
511 self.data.command = line
512
523
529
539
542
545
548
549
550
552 """
553 Consumer that can receive events from _MASTScanner (OBSOLETE).
554
555 A _MASTConsumer parses lines from a mast text output file.
556 The motif match diagrams are parsed using line buffering.
557 Each of the buffering functions have a dummy variable that is
558 required for testing using the Bio.ParserSupport.TaggingConsumer.
559 If this variable isn't there, the TaggingConsumer barfs. In
560 the _MASTScanner, None is passed in the place of this variable.
561
562 This class is OBSOLETE; please use the read() function in the module
563 Bio.Motif.Parsers.MAST instead.
564 """
566 self.data = MASTRecord()
567 self._current_seq = ""
568 self._line_buffer = []
569 self._buffer_size = 0
570 self._buffered_seq_start = 0
571
576
588
599
623
648
674
680
682 line = line.strip()
683 if not line.startswith('*****'):
684 self._line_buffer.append(line)
685 else:
686 return -1
687
689 """Parses the line buffer to get e-values for each instance of a motif.
690 This buffer parser is the most likely point of failure for the
691 MASTParser.
692 """
693 insts = self.data.get_motif_matches_for_sequence(self._current_seq)
694 if len(insts) > 0:
695
696 fullSeq = self._line_buffer[self._buffer_size-1]
697 pvals = self._line_buffer[1].split()
698 p = 0
699 lpval = len(pvals)
700 while p < lpval:
701 if pvals[p].count('e') > 1:
702
703
704 pvs = []
705 spe = pvals[p].split('e')
706 spe.reverse()
707 dotind = spe[1].find('.')
708 if dotind == -1:
709 thispval = spe[1][-1] + 'e' + spe[0]
710 else:
711 thispval = spe[1][dotind-1:] + 'e' + spe[0]
712 pvs.append(thispval)
713 for spi in range(2,len(spe)):
714 dotind = spe[spi].find('.')
715 prevdotind = spe[spi-1].find('.')
716 if dotind != -1:
717 if prevdotind == -1:
718 thispval = spe[spi][dotind-1:] + 'e' + spe[spi-1][:-1]
719 else:
720 thispval = spe[spi][dotind-1:] + 'e' + spe[spi-1][0:prevdotind-1]
721 else:
722 if prevdotind == -1:
723 thispval = spe[spi][-1] + 'e' + spe[spi-1][:-1]
724 else:
725 thispval = spe[spi][-1] + 'e' + spe[spi-1][0:prevdotind-1]
726 pvs.append(thispval)
727 pvs.reverse()
728 if p > 0:
729 pvals = pvals[0:p] + pvs + pvals[p+1:]
730 else:
731 pvals = pvs + pvals[p+1:]
732 lpval = len(pvals)
733 p += 1
734 i = 0
735 if len(pvals) != len(insts):
736 sys.stderr.write("Failure to parse p-values for " + self._current_seq + ": " + self._line_buffer[1] + " to: " + str(pvals) + "\n")
737 pvals = []
738
739
740 for i in range(0,len(insts)):
741 inst = insts[i]
742 start = inst.start - self._buffered_seq_start + 1
743 thisSeq = fullSeq[start:start+inst.length]
744 thisSeq = Seq.Seq(thisSeq, self.data.alphabet)
745 inst._sequence(thisSeq)
746 if pvals:
747 inst._pvalue(float(pvals[i]))
748
750 self._line_buffer = []
751 self._buffer_size = 0
752
754 if self._buffer_size == 0:
755 if len(self._line_buffer) > 0:
756 self._buffer_size = len(self._line_buffer)
757 ll = self._line_buffer[self._buffer_size-1].split()
758 self._line_buffer[self._buffer_size-1] = ll[1]
759 self._buffered_seq_start = int(ll[0])
760 else:
761 i = 0
762 for i in range(self._buffer_size, len(self._line_buffer)-1):
763 self._line_buffer[i-self._buffer_size] = self._line_buffer[i-self._buffer_size] + self._line_buffer[i].strip()
764 ll = self._line_buffer[len(self._line_buffer)-1].split()
765 if int(ll[0]) == self._buffered_seq_start + len(self._line_buffer[self._buffer_size-1]):
766 self._line_buffer[self._buffer_size-1] += ll[1]
767 else:
768 differ = int(ll[0]) - (self._buffered_seq_start + len(self._line_buffer[self._buffer_size-1]))
769 self._line_buffer[self._buffer_size-1] += "N"*differ
770 self._line_buffer[self._buffer_size-1] += ll[1]
771 self._line_buffer = self._line_buffer[0:self._buffer_size]
772
774 line = line.strip()
775 if line.find('[') != -1 or line.find('<') != -1:
776 pass
777 elif line.find('e') != -1:
778 pass
779 elif line.find('+') != -1:
780 pass
781
784
785
787 """
788 Parser for MAST text output (OBSOLETE).
789 HTML output cannot be parsed, yet. Returns a MASTRecord
790
791 A MASTParser takes a file handle for a MAST text output file and
792 returns a MASTRecord, containing the hits between motifs and
793 sequences. The parser does some unusual line buffering to parse out
794 match diagrams. Really complex diagrams often lead to an error message
795 and p-values not being parsed for a given line.
796
797 Methods:
798 parse (handle): parses the data from the file handle passed to it.
799
800 Example:
801
802 >>>f = open("mast_file.txt")
803 >>>parser = MASTParser()
804 >>>mast_record = parser.parse(f)
805 >>>for motif in mast_record.motifs:
806 >>> for instance in motif.instances:
807 >>> print instance.motif_name, instance.sequence_name, instance.strand, instance.pvalue
808
809 This class is OBSOLETE; please use the read() function in the module
810 Bio.Motif.Parsers.MAST instead.
811 """
815
816 - def parse (self, handle):
817 self._scanner.feed(handle, self._consumer)
818 return self._consumer.data
819
820
821
823 """
824 Scanner for MAST text output (OBSOLETE).
825
826 This class is OBSOLETE; please use the read() function in the module
827 Bio.Motif.Parsers.MAST instead.
828 """
829 - def feed (self, handle, consumer):
838
840 try:
841 read_and_call_until(uhandle, consumer.noevent, contains = "MAST version")
842 except ValueError:
843 raise ValueError("Improper input file. Does not begin with a line with 'MAST version'")
844 read_and_call(uhandle, consumer._version, contains = 'MAST version')
845 read_and_call_until(uhandle, consumer.noevent, start = 'DATABASE AND MOTIFS')
846 read_and_call(uhandle, consumer.noevent, start = 'DATABASE')
847 read_and_call(uhandle, consumer.noevent, start = '****')
848 read_and_call(uhandle, consumer._database, contains = 'DATABASE')
849 read_and_call_until(uhandle, consumer.noevent, contains = 'MOTIF WIDTH')
850 read_and_call(uhandle, consumer.noevent, contains = 'MOTIF')
851 read_and_call(uhandle, consumer.noevent, contains = '----')
852 read_and_call_until(uhandle, consumer._add_motif, blank = 1)
853 read_and_call_until(uhandle, consumer.noevent, start = 'SECTION II:')
854
856 read_and_call_until(uhandle, consumer.noevent, start = 'SEQUENCE NAME')
857 read_and_call(uhandle, consumer.noevent, start = 'SEQUENCE NAME')
858 read_and_call(uhandle, consumer.noevent, start = '---')
859
860 read_and_call_until(uhandle, consumer.noevent, blank = 1)
861 read_and_call(uhandle, consumer.noevent, blank = 1)
862
864 read_and_call_until(uhandle, consumer.noevent, start = 'SECTION III:')
865 read_and_call(uhandle, consumer.noevent, start = 'SECTION III:')
866 read_and_call_until(uhandle, consumer.noevent, start = '****')
867 read_and_call(uhandle, consumer.noevent, start = '****')
868 read_and_call_until(uhandle, consumer.noevent, start = '*****')
869 read_and_call(uhandle, consumer.noevent)
870 read_and_call_while(uhandle, consumer.noevent, blank = 1)
871 readMatches = 1
872 while readMatches == 1:
873 if consumer._current_seq:
874 if consumer._buffer_size != 0:
875 consumer._parse_buffer(None)
876 consumer._blank_buffer(None)
877 read_and_call(uhandle, consumer._set_current_seq)
878 read_and_call_until(uhandle, consumer.noevent, start = ' DIAGRAM')
879 read_and_call_until(uhandle, consumer._add_line_to_buffer, blank = 1)
880 consumer._add_diagram_from_buffer(None)
881 consumer._blank_buffer(None)
882 read_and_call(uhandle, consumer.noevent, blank = 1)
883 while 1:
884 line = safe_peekline(uhandle)
885 if line.startswith('****'):
886 consumer._parse_buffer(None)
887 readMatches = 0
888 break
889 read_and_call_until(uhandle, consumer._add_line_to_buffer, blank = 1)
890 read_and_call(uhandle, consumer.noevent, blank = 1)
891 consumer._collapse_buffer(None)
892 if attempt_read_and_call(uhandle, consumer.noevent, blank = 1):
893 break
894 elif attempt_read_and_call(uhandle, consumer.noevent, start = '*****'):
895 consumer._parse_buffer(None)
896 consumer._blank_buffer(None)
897 readMatches = 0
898 break
899
900
901
903 """The class for holding the results from a MAST run (OBSOLETE).
904
905 A MASTRecord holds data about matches between motifs and sequences.
906 The motifs held by the MASTRecord are objects of the class MEMEMotif.
907
908 Methods:
909 get_motif_matches_for_sequence(sequence_name): returns all of the
910 motif matches within a given sequence. The matches are objects of
911 the class MEMEInstance
912 get_motif_matches (motif_name): returns all of the matches for a motif
913 in the sequences searched. The matches returned are of class
914 MEMEInstance
915 get_motif_by_name (motif_name): returns a MEMEMotif with the given
916 name.
917
918 This class is OBSOLETE; please use the read() function in the module
919 Bio.Motif.Parsers.MAST instead.
920 """
929
932
938
941
943 insts = []
944 for m in self.motifs:
945 for i in m.instances:
946 if i.sequence_name == seq:
947 insts.append(i)
948 insts.sort(lambda x,y: cmp(x.start, y.start))
949 return insts
950
954
956 self.diagrams[seq] = diagram
957
960
963
966
968 for m in self.motifs:
969 if m.name == name:
970 return m
971