1
2
3
4
5
6 """
7 Bio.AlignIO support for "fasta-m10" output from Bill Pearson's FASTA tools.
8
9 You are expected to use this module via the Bio.AlignIO functions (or the
10 Bio.SeqIO functions if you want to work directly with the gapped sequences).
11
12 This module contains a parser for the pairwise alignments produced by Bill
13 Pearson's FASTA tools, for use from the Bio.AlignIO interface where it is
14 refered to as the "fasta-m10" file format (as we only support the machine
15 readable output format selected with the -m 10 command line option).
16
17 This module does NOT cover the generic "fasta" file format originally
18 developed as an input format to the FASTA tools. The Bio.AlignIO and
19 Bio.SeqIO both use the Bio.SeqIO.FastaIO module to deal with these files,
20 which can also be used to store a multiple sequence alignments.
21 """
22
23 from Bio.Seq import Seq
24 from Bio.SeqRecord import SeqRecord
25 from Bio.Align import MultipleSeqAlignment
26 from Interfaces import AlignmentIterator
27 from Bio.Alphabet import single_letter_alphabet, generic_dna, generic_protein
28 from Bio.Alphabet import Gapped
29
30
32 """Alignment iterator for the FASTA tool's pairwise alignment output.
33
34 This is for reading the pairwise alignments output by Bill Pearson's
35 FASTA program when called with the -m 10 command line option for machine
36 readable output. For more details about the FASTA tools, see the website
37 http://fasta.bioch.virginia.edu/ and the paper:
38
39 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448
40
41 This class is intended to be used via the Bio.AlignIO.parse() function
42 by specifying the format as "fasta-m10" as shown in the following code:
43
44 from Bio import AlignIO
45 handle = ...
46 for a in AlignIO.parse(handle, "fasta-m10"):
47 assert len(a) == 2, "Should be pairwise!"
48 print "Alignment length %i" % a.get_alignment_length()
49 for record in a:
50 print record.seq, record.name, record.id
51
52 Note that this is not a full blown parser for all the information
53 in the FASTA output - for example, most of the header and all of the
54 footer is ignored. Also, the alignments are not batched according to
55 the input queries.
56
57 Also note that there can be up to about 30 letters of flanking region
58 included in the raw FASTA output as contextual information. This is NOT
59 part of the alignment itself, and is not included in the resulting
60 MultipleSeqAlignment objects returned.
61 """
62
64 """Reads from the handle to construct and return the next alignment.
65
66 This returns the pairwise alignment of query and match/library
67 sequences as an MultipleSeqAlignment object containing two rows.
68 """
69 handle = self.handle
70
71 try:
72
73
74 line = self._header
75 del self._header
76 except AttributeError:
77 line = handle.readline()
78 if not line:
79 return None
80
81 if line.startswith("#"):
82
83 line = self._skip_file_header(line)
84 while ">>>" in line and not line.startswith(">>>"):
85
86 self._query_descr = ""
87 self._query_header_annotation = {}
88
89 line = self._parse_query_header(line)
90
91 if not line:
92
93 return None
94 if ">>><<<" in line:
95
96 return None
97
98
99
100 assert line[0:2] == ">>" and not line[2] == ">", line
101
102 query_seq_parts, match_seq_parts = [], []
103 query_annotation, match_annotation = {}, {}
104 match_descr = ""
105 alignment_annotation = {}
106
107
108
109 """
110 >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
111 ; fa_frame: f
112 ; fa_initn: 52
113 ; fa_init1: 52
114 ; fa_opt: 70
115 ; fa_z-score: 105.5
116 ; fa_bits: 27.5
117 ; fa_expect: 0.082
118 ; sw_score: 70
119 ; sw_ident: 0.279
120 ; sw_sim: 0.651
121 ; sw_overlap: 43
122 """
123 if (not line[0:2] == ">>") or line[0:3] == ">>>":
124 raise ValueError("Expected target line starting '>>'")
125 match_descr = line[2:].strip()
126
127 line = handle.readline()
128 line = self._parse_tag_section(line, alignment_annotation)
129 assert not line[0:2] == "; "
130
131
132 """
133 >gi|10955265| ..
134 ; sq_len: 346
135 ; sq_offset: 1
136 ; sq_type: p
137 ; al_start: 197
138 ; al_stop: 238
139 ; al_display_start: 167
140 DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
141 QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
142 GEYFTENKPKYIIREIHQET
143 """
144 if not (line[0] == ">" and line.strip().endswith("..")):
145 raise ValueError("Expected line starting '>' and ending '..'")
146 assert self._query_descr.startswith(line[1:].split(None,1)[0])
147
148
149 line = handle.readline()
150 line = self._parse_tag_section(line, query_annotation)
151 assert not line[0:2] == "; "
152
153
154 while not line[0] == ">":
155 query_seq_parts.append(line.strip())
156 line = handle.readline()
157
158
159 """
160 >gi|152973545|ref|YP_001338596.1| ..
161 ; sq_len: 242
162 ; sq_type: p
163 ; al_start: 52
164 ; al_stop: 94
165 ; al_display_start: 22
166 IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
167 RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
168 QDFAFTRKMRREARQVEQSW
169 """
170
171 if not (line[0] == ">" and line.strip().endswith("..")):
172 raise ValueError("Expected line starting '>' and ending '..', got '%s'" % repr(line))
173 assert match_descr.startswith(line[1:].split(None,1)[0])
174
175
176 line = handle.readline()
177 line = self._parse_tag_section(line, match_annotation)
178 assert not line[0:2] == "; "
179
180
181
182 """
183 ; al_cons:
184 .::. : :. ---. :: :. . : ..-:::-: :.: ..:...:
185 etc
186 """
187 while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line):
188 match_seq_parts.append(line.strip())
189 line = handle.readline()
190 if line[0:2] == "; ":
191 assert line.strip() == "; al_cons:"
192 align_consensus_parts = []
193 line = handle.readline()
194 while not (line[0:2] == "; " or line[0] == ">" or ">>>" in line):
195 align_consensus_parts.append(line.strip())
196 line = handle.readline()
197
198 align_consensus = "".join(align_consensus_parts)
199 del align_consensus_parts
200 assert not line[0:2] == "; "
201 else:
202 align_consensus = None
203 assert (line[0] == ">" or ">>>" in line)
204 self._header = line
205
206
207
208 query_seq = "".join(query_seq_parts)
209 match_seq = "".join(match_seq_parts)
210 del query_seq_parts, match_seq_parts
211
212
213
214
215 query_align_seq = self._extract_alignment_region(query_seq, query_annotation)
216 match_align_seq = self._extract_alignment_region(match_seq, match_annotation)
217
218
219
220
221
222 if len(query_align_seq) != len(match_align_seq):
223 raise ValueError("Problem parsing the alignment sequence coordinates, "
224 "following should be the same length but are not:\n"
225 "%s - len %i\n%s - len %i" % (query_align_seq,
226 len(query_align_seq),
227 match_align_seq,
228 len(match_align_seq)))
229 if "sw_overlap" in alignment_annotation:
230 if int(alignment_annotation["sw_overlap"]) != len(query_align_seq):
231 raise ValueError("Specified sw_overlap = %s does not match expected value %i" \
232 % (alignment_annotation["sw_overlap"],
233 len(query_align_seq)))
234
235
236 alphabet = self.alphabet
237 alignment = MultipleSeqAlignment(alphabet)
238
239
240
241 alignment._annotations = {}
242
243
244 for key, value in self._query_header_annotation.iteritems():
245 alignment._annotations[key] = value
246 for key, value in alignment_annotation.iteritems():
247 alignment._annotations[key] = value
248
249
250
251 record = SeqRecord(Seq(query_align_seq, alphabet),
252 id = self._query_descr.split(None,1)[0].strip(","),
253 name = "query",
254 description = self._query_descr,
255 annotations = {"original_length" : int(query_annotation["sq_len"])})
256
257 record._al_start = int(query_annotation["al_start"])
258 record._al_stop = int(query_annotation["al_stop"])
259 alignment.append(record)
260
261
262
263
264 if alphabet == single_letter_alphabet and "sq_type" in query_annotation:
265 if query_annotation["sq_type"] == "D":
266 record.seq.alphabet = generic_dna
267 elif query_annotation["sq_type"] == "p":
268 record.seq.alphabet = generic_protein
269 if "-" in query_align_seq:
270 if not hasattr(record.seq.alphabet,"gap_char"):
271 record.seq.alphabet = Gapped(record.seq.alphabet, "-")
272
273
274
275 record = SeqRecord(Seq(match_align_seq, alphabet),
276 id = match_descr.split(None,1)[0].strip(","),
277 name = "match",
278 description = match_descr,
279 annotations = {"original_length" : int(match_annotation["sq_len"])})
280
281 record._al_start = int(match_annotation["al_start"])
282 record._al_stop = int(match_annotation["al_stop"])
283 alignment.append(record)
284
285
286 if alphabet == single_letter_alphabet and "sq_type" in match_annotation:
287 if match_annotation["sq_type"] == "D":
288 record.seq.alphabet = generic_dna
289 elif match_annotation["sq_type"] == "p":
290 record.seq.alphabet = generic_protein
291 if "-" in match_align_seq:
292 if not hasattr(record.seq.alphabet,"gap_char"):
293 record.seq.alphabet = Gapped(record.seq.alphabet, "-")
294
295 return alignment
296
298 """Helper function for the main parsing code.
299
300 Skips over the file header region.
301 """
302
303 """
304 # /home/xxx/Downloads/FASTA/fasta-35.3.6/fasta35 -Q -H -E 1 -m 10 -X "-5 -5" NC_002127.faa NC_009649.faa
305 FASTA searches a protein or DNA sequence data bank
306 version 35.03 Feb. 18, 2008
307 Please cite:
308 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448
309
310 Query: NC_002127.faa
311 """
312
313
314
315 while ">>>" not in line:
316 line = self.handle.readline()
317 return line
318
320 """Helper function for the main parsing code.
321
322 Skips over the free format query header, extracting the tagged parameters.
323
324 If there are no hits for the current query, it is skipped entirely."""
325
326 """
327 2>>>gi|10955264|ref|NP_052605.1| hypothetical protein pOSAK1_02 [Escherichia coli O157:H7 s 126 aa - 126 aa
328 Library: NC_009649.faa 45119 residues in 180 sequences
329
330 45119 residues in 180 sequences
331 Statistics: (shuffled [500]) Expectation_n fit: rho(ln(x))= 5.0398+/-0.00968; mu= 2.8364+/- 0.508
332 mean_var=44.7937+/-10.479, 0's: 0 Z-trim: 0 B-trim: 0 in 0/32
333 Lambda= 0.191631
334 Algorithm: FASTA (3.5 Sept 2006) [optimized]
335 Parameters: BL50 matrix (15:-5) ktup: 2
336 join: 36, opt: 24, open/ext: -10/-2, width: 16
337 Scan time: 0.040
338
339 The best scores are: opt bits E(180)
340 gi|152973462|ref|YP_001338513.1| hypothetical prot ( 101) 58 23.3 0.22
341 gi|152973501|ref|YP_001338552.1| hypothetical prot ( 245) 55 22.5 0.93
342 """
343
344
345 """
346 2>>>gi|152973838|ref|YP_001338875.1| hypothetical protein KPN_pKPN7p10263 [Klebsiella pneumoniae subsp. pneumonia 76 aa - 76 aa
347 vs NC_002127.faa library
348
349 579 residues in 3 sequences
350 Altschul/Gish params: n0: 76 Lambda: 0.158 K: 0.019 H: 0.100
351
352 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2
353 join: 36, opt: 24, open/ext: -10/-2, width: 16
354 Scan time: 0.000
355 !! No library sequences with E() < 0.5
356 """
357
358 self._query_header_annotation = {}
359 self._query_descr = ""
360
361 assert ">>>" in line and not line[0:3] == ">>>"
362
363
364 line = self.handle.readline()
365
366 while not line[0:3] == ">>>":
367
368 line = self.handle.readline()
369 if not line:
370 raise ValueError("Premature end of file!")
371 if ">>><<<" in line:
372
373
374 return line
375
376
377 """
378 >>>gi|10955264|ref|NP_052605.1|, 126 aa vs NC_009649.faa library
379 ; pg_name: /home/pjcock/Downloads/FASTA/fasta-35.3.6/fasta35
380 ; pg_ver: 35.03
381 ; pg_argv: /home/pjcock/Downloads/FASTA/fasta-35.3.6/fasta35 -Q -H -E 1 -m 10 -X -5 -5 NC_002127.faa NC_009649.faa
382 ; pg_name: FASTA
383 ; pg_ver: 3.5 Sept 2006
384 ; pg_matrix: BL50 (15:-5)
385 ; pg_open-ext: -10 -2
386 ; pg_ktup: 2
387 ; pg_optcut: 24
388 ; pg_cgap: 36
389 ; mp_extrap: 60000 500
390 ; mp_stats: (shuffled [500]) Expectation_n fit: rho(ln(x))= 5.0398+/-0.00968; mu= 2.8364+/- 0.508 mean_var=44.7937+/-10.479, 0's: 0 Z-trim: 0 B-trim: 0 in 0/32 Lambda= 0.191631
391 ; mp_KS: -0.0000 (N=1066338402) at 20
392 ; mp_Algorithm: FASTA (3.5 Sept 2006) [optimized]
393 ; mp_Parameters: BL50 matrix (15:-5) ktup: 2 join: 36, opt: 24, open/ext: -10/-2, width: 16
394 """
395
396 assert line[0:3] == ">>>", line
397 self._query_descr = line[3:].strip()
398
399
400 line = self.handle.readline()
401 line = self._parse_tag_section(line, self._query_header_annotation)
402 assert not line[0:2] == "; ", line
403 assert line[0:2] == ">>" or ">>>" in line, line
404 return line
405
406
408 """Helper function for the main parsing code.
409
410 To get the actual pairwise alignment sequences, we must first
411 translate the un-gapped sequence based coordinates into positions
412 in the gapped sequence (which may have a flanking region shown
413 using leading - characters). To date, I have never seen any
414 trailing flanking region shown in the m10 file, but the
415 following code should also cope with that.
416
417 Note that this code seems to work fine even when the "sq_offset"
418 entries are prsent as a result of using the -X command line option.
419 """
420 align_stripped = alignment_seq_with_flanking.strip("-")
421 display_start = int(annotation['al_display_start'])
422 if int(annotation['al_start']) <= int(annotation['al_stop']):
423 start = int(annotation['al_start']) \
424 - display_start
425 end = int(annotation['al_stop']) \
426 - display_start \
427 + align_stripped.count("-") + 1
428 else:
429
430 start = display_start \
431 - int(annotation['al_start'])
432 end = display_start \
433 - int(annotation['al_stop']) \
434 + align_stripped.count("-") + 1
435 assert 0 <= start and start < end and end <= len(align_stripped), \
436 "Problem with sequence start/stop,\n%s[%i:%i]\n%s" \
437 % (alignment_seq_with_flanking, start, end, annotation)
438 return align_stripped[start:end]
439
440
442 """Helper function for the main parsing code.
443
444 line - supply line just read from the handle containing the start of
445 the tagged section.
446 dictionary - where to record the tagged values
447
448 Returns a string, the first line following the tagged section."""
449 if not line[0:2] == "; ":
450 raise ValueError("Expected line starting '; '")
451 while line[0:2] == "; ":
452 tag, value = line[2:].strip().split(": ",1)
453
454
455
456
457
458
459 dictionary[tag] = value
460 line = self.handle.readline()
461 return line
462
463 if __name__ == "__main__":
464 print "Running a quick self-test"
465
466
467 simple_example = \
468 """# /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa
469 FASTA searches a protein or DNA sequence data bank
470 version 34.26 January 12, 2007
471 Please cite:
472 W.R. Pearson & D.J. Lipman PNAS (1988) 85:2444-2448
473
474 Query library NC_002127.faa vs NC_009649.faa library
475 searching NC_009649.faa library
476
477 1>>>gi|10955263|ref|NP_052604.1| plasmid mobilization [Escherichia coli O157:H7 s 107 aa - 107 aa
478 vs NC_009649.faa library
479
480 45119 residues in 180 sequences
481 Expectation_n fit: rho(ln(x))= 6.9146+/-0.0249; mu= -5.7948+/- 1.273
482 mean_var=53.6859+/-13.609, 0's: 0 Z-trim: 1 B-trim: 9 in 1/25
483 Lambda= 0.175043
484
485 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2
486 join: 36, opt: 24, open/ext: -10/-2, width: 16
487 Scan time: 0.000
488 The best scores are: opt bits E(180)
489 gi|152973457|ref|YP_001338508.1| ATPase with chape ( 931) 71 24.9 0.58
490 gi|152973588|ref|YP_001338639.1| F pilus assembly ( 459) 63 23.1 0.99
491
492 >>>gi|10955263|ref|NP_052604.1|, 107 aa vs NC_009649.faa library
493 ; pg_name: /opt/fasta/fasta34
494 ; pg_ver: 34.26
495 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa
496 ; pg_name: FASTA
497 ; pg_ver: 3.5 Sept 2006
498 ; pg_matrix: BL50 (15:-5)
499 ; pg_open-ext: -10 -2
500 ; pg_ktup: 2
501 ; pg_optcut: 24
502 ; pg_cgap: 36
503 ; mp_extrap: 60000 180
504 ; mp_stats: Expectation_n fit: rho(ln(x))= 6.9146+/-0.0249; mu= -5.7948+/- 1.273 mean_var=53.6859+/-13.609, 0's: 0 Z-trim: 1 B-trim: 9 in 1/25 Lambda= 0.175043
505 ; mp_KS: -0.0000 (N=0) at 8159228
506 >>gi|152973457|ref|YP_001338508.1| ATPase with chaperone activity, ATP-binding subunit [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
507 ; fa_frame: f
508 ; fa_initn: 65
509 ; fa_init1: 43
510 ; fa_opt: 71
511 ; fa_z-score: 90.3
512 ; fa_bits: 24.9
513 ; fa_expect: 0.58
514 ; sw_score: 71
515 ; sw_ident: 0.250
516 ; sw_sim: 0.574
517 ; sw_overlap: 108
518 >gi|10955263| ..
519 ; sq_len: 107
520 ; sq_offset: 1
521 ; sq_type: p
522 ; al_start: 5
523 ; al_stop: 103
524 ; al_display_start: 1
525 --------------------------MTKRSGSNT-RRRAISRPVRLTAE
526 ED---QEIRKRAAECGKTVSGFLRAAALGKKVNSLTDDRVLKEVM-----
527 RLGALQKKLFIDGKRVGDREYAEVLIAITEYHRALLSRLMAD
528 >gi|152973457|ref|YP_001338508.1| ..
529 ; sq_len: 931
530 ; sq_type: p
531 ; al_start: 96
532 ; al_stop: 195
533 ; al_display_start: 66
534 SDFFRIGDDATPVAADTDDVVDASFGEPAAAGSGAPRRRGSGLASRISEQ
535 SEALLQEAAKHAAEFGRS------EVDTEHLLLALADSDVVKTILGQFKI
536 KVDDLKRQIESEAKR-GDKPF-EGEIGVSPRVKDALSRAFVASNELGHSY
537 VGPEHFLIGLAEEGEGLAANLLRRYGLTPQ
538 >>gi|152973588|ref|YP_001338639.1| F pilus assembly protein [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
539 ; fa_frame: f
540 ; fa_initn: 33
541 ; fa_init1: 33
542 ; fa_opt: 63
543 ; fa_z-score: 86.1
544 ; fa_bits: 23.1
545 ; fa_expect: 0.99
546 ; sw_score: 63
547 ; sw_ident: 0.266
548 ; sw_sim: 0.656
549 ; sw_overlap: 64
550 >gi|10955263| ..
551 ; sq_len: 107
552 ; sq_offset: 1
553 ; sq_type: p
554 ; al_start: 32
555 ; al_stop: 94
556 ; al_display_start: 2
557 TKRSGSNTRRRAISRPVRLTAEEDQEIRKRAAECGKTVSGFLRAAALGKK
558 VNSLTDDRVLKEV-MRLGALQKKLFIDGKRVGDREYAEVLIAITEYHRAL
559 LSRLMAD
560 >gi|152973588|ref|YP_001338639.1| ..
561 ; sq_len: 459
562 ; sq_type: p
563 ; al_start: 191
564 ; al_stop: 248
565 ; al_display_start: 161
566 VGGLFPRTQVAQQKVCQDIAGESNIFSDWAASRQGCTVGG--KMDSVQDK
567 ASDKDKERVMKNINIMWNALSKNRLFDG----NKELKEFIMTLTGTLIFG
568 ENSEITPLPARTTDQDLIRAMMEGGTAKIYHCNDSDKCLKVVADATVTIT
569 SNKALKSQISALLSSIQNKAVADEKLTDQE
570 2>>>gi|10955264|ref|NP_052605.1| hypothetical protein pOSAK1_02 [Escherichia coli O157:H7 s 126 aa - 126 aa
571 vs NC_009649.faa library
572
573 45119 residues in 180 sequences
574 Expectation_n fit: rho(ln(x))= 7.1374+/-0.0246; mu= -7.6540+/- 1.313
575 mean_var=51.1189+/-13.171, 0's: 0 Z-trim: 1 B-trim: 8 in 1/25
576 Lambda= 0.179384
577
578 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2
579 join: 36, opt: 24, open/ext: -10/-2, width: 16
580 Scan time: 0.000
581 The best scores are: opt bits E(180)
582 gi|152973462|ref|YP_001338513.1| hypothetical prot ( 101) 58 22.9 0.29
583
584 >>>gi|10955264|ref|NP_052605.1|, 126 aa vs NC_009649.faa library
585 ; pg_name: /opt/fasta/fasta34
586 ; pg_ver: 34.26
587 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa
588 ; pg_name: FASTA
589 ; pg_ver: 3.5 Sept 2006
590 ; pg_matrix: BL50 (15:-5)
591 ; pg_open-ext: -10 -2
592 ; pg_ktup: 2
593 ; pg_optcut: 24
594 ; pg_cgap: 36
595 ; mp_extrap: 60000 180
596 ; mp_stats: Expectation_n fit: rho(ln(x))= 7.1374+/-0.0246; mu= -7.6540+/- 1.313 mean_var=51.1189+/-13.171, 0's: 0 Z-trim: 1 B-trim: 8 in 1/25 Lambda= 0.179384
597 ; mp_KS: -0.0000 (N=0) at 8159228
598 >>gi|152973462|ref|YP_001338513.1| hypothetical protein KPN_pKPN3p05904 [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
599 ; fa_frame: f
600 ; fa_initn: 50
601 ; fa_init1: 50
602 ; fa_opt: 58
603 ; fa_z-score: 95.8
604 ; fa_bits: 22.9
605 ; fa_expect: 0.29
606 ; sw_score: 58
607 ; sw_ident: 0.289
608 ; sw_sim: 0.632
609 ; sw_overlap: 38
610 >gi|10955264| ..
611 ; sq_len: 126
612 ; sq_offset: 1
613 ; sq_type: p
614 ; al_start: 1
615 ; al_stop: 38
616 ; al_display_start: 1
617 ------------------------------MKKDKKYQIEAIKNKDKTLF
618 IVYATDIYSPSEFFSKIESDLKKKKSKGDVFFDLIIPNGGKKDRYVYTSF
619 NGEKFSSYTLNKVTKTDEYN
620 >gi|152973462|ref|YP_001338513.1| ..
621 ; sq_len: 101
622 ; sq_type: p
623 ; al_start: 44
624 ; al_stop: 81
625 ; al_display_start: 14
626 DALLGEIQRLRKQVHQLQLERDILTKANELIKKDLGVSFLKLKNREKTLI
627 VDALKKKYPVAELLSVLQLARSCYFYQNVCTISMRKYA
628 3>>>gi|10955265|ref|NP_052606.1| hypothetical protein pOSAK1_03 [Escherichia coli O157:H7 s 346 aa - 346 aa
629 vs NC_009649.faa library
630
631 45119 residues in 180 sequences
632 Expectation_n fit: rho(ln(x))= 6.0276+/-0.0276; mu= 3.0670+/- 1.461
633 mean_var=37.1634+/- 8.980, 0's: 0 Z-trim: 1 B-trim: 14 in 1/25
634 Lambda= 0.210386
635
636 FASTA (3.5 Sept 2006) function [optimized, BL50 matrix (15:-5)] ktup: 2
637 join: 37, opt: 25, open/ext: -10/-2, width: 16
638 Scan time: 0.020
639 The best scores are: opt bits E(180)
640 gi|152973545|ref|YP_001338596.1| putative plasmid ( 242) 70 27.5 0.082
641
642 >>>gi|10955265|ref|NP_052606.1|, 346 aa vs NC_009649.faa library
643 ; pg_name: /opt/fasta/fasta34
644 ; pg_ver: 34.26
645 ; pg_argv: /opt/fasta/fasta34 -Q -H -E 1 -m 10 NC_002127.faa NC_009649.faa
646 ; pg_name: FASTA
647 ; pg_ver: 3.5 Sept 2006
648 ; pg_matrix: BL50 (15:-5)
649 ; pg_open-ext: -10 -2
650 ; pg_ktup: 2
651 ; pg_optcut: 25
652 ; pg_cgap: 37
653 ; mp_extrap: 60000 180
654 ; mp_stats: Expectation_n fit: rho(ln(x))= 6.0276+/-0.0276; mu= 3.0670+/- 1.461 mean_var=37.1634+/- 8.980, 0's: 0 Z-trim: 1 B-trim: 14 in 1/25 Lambda= 0.210386
655 ; mp_KS: -0.0000 (N=0) at 8159228
656 >>gi|152973545|ref|YP_001338596.1| putative plasmid SOS inhibition protein A [Klebsiella pneumoniae subsp. pneumoniae MGH 78578]
657 ; fa_frame: f
658 ; fa_initn: 52
659 ; fa_init1: 52
660 ; fa_opt: 70
661 ; fa_z-score: 105.5
662 ; fa_bits: 27.5
663 ; fa_expect: 0.082
664 ; sw_score: 70
665 ; sw_ident: 0.279
666 ; sw_sim: 0.651
667 ; sw_overlap: 43
668 >gi|10955265| ..
669 ; sq_len: 346
670 ; sq_offset: 1
671 ; sq_type: p
672 ; al_start: 197
673 ; al_stop: 238
674 ; al_display_start: 167
675 DFMCSILNMKEIVEQKNKEFNVDIKKETIESELHSKLPKSIDKIHEDIKK
676 QLSC-SLIMKKIDVEMEDYSTYCFSALRAIEGFIYQILNDVCNPSSSKNL
677 GEYFTENKPKYIIREIHQET
678 >gi|152973545|ref|YP_001338596.1| ..
679 ; sq_len: 242
680 ; sq_type: p
681 ; al_start: 52
682 ; al_stop: 94
683 ; al_display_start: 22
684 IMTVEEARQRGARLPSMPHVRTFLRLLTGCSRINSDVARRIPGIHRDPKD
685 RLSSLKQVEEALDMLISSHGEYCPLPLTMDVQAENFPEVLHTRTVRRLKR
686 QDFAFTRKMRREARQVEQSW
687 >>><<<
688
689
690 579 residues in 3 query sequences
691 45119 residues in 180 library sequences
692 Scomplib [34.26]
693 start: Tue May 20 16:38:45 2008 done: Tue May 20 16:38:45 2008
694 Total Scan time: 0.020 Total Display time: 0.010
695
696 Function used was FASTA [version 34.26 January 12, 2007]
697
698 """
699
700
701 from StringIO import StringIO
702
703 alignments = list(FastaM10Iterator(StringIO(simple_example)))
704 assert len(alignments) == 4, len(alignments)
705 assert len(alignments[0]) == 2
706 for a in alignments:
707 print "Alignment %i sequences of length %i" \
708 % (len(a), a.get_alignment_length())
709 for r in a:
710 print "%s %s %i" % (r.seq, r.id, r.annotations["original_length"])
711
712 print "Done"
713
714 import os
715 path = "../../Tests/Fasta/"
716 files = [f for f in os.listdir(path) if os.path.splitext(f)[-1] == ".m10"]
717 files.sort()
718 for filename in files:
719 if os.path.splitext(filename)[-1] == ".m10":
720 print
721 print filename
722 print "="*len(filename)
723 for i,a in enumerate(FastaM10Iterator(open(os.path.join(path,filename)))):
724 print "#%i, %s" % (i+1,a)
725 for r in a:
726 if "-" in r.seq:
727 assert r.seq.alphabet.gap_char == "-"
728 else:
729 assert not hasattr(r.seq.alphabet, "gap_char")
730