1
2
3
4
5
6 """
7 Bio.AlignIO support for the "clustal" output from CLUSTAL W and other tools.
8
9 You are expected to use this module via the Bio.AlignIO functions (or the
10 Bio.SeqIO functions if you want to work directly with the gapped sequences).
11 """
12
13 from Bio.Align import MultipleSeqAlignment
14 from Interfaces import AlignmentIterator, SequentialAlignmentWriter
15
17 """Clustalw alignment writer."""
19 """Use this to write (another) single alignment to an open file."""
20
21 if len(alignment) == 0:
22 raise ValueError("Must have at least one sequence")
23 if alignment.get_alignment_length() == 0:
24
25 raise ValueError("Non-empty sequences are required")
26
27
28 try:
29 version = str(alignment._version)
30 except AttributeError:
31 version = ""
32 if not version:
33 version = '1.81'
34 if version.startswith("2."):
35
36 output = "CLUSTAL %s multiple sequence alignment\n\n\n" % version
37 else:
38
39 output = "CLUSTAL X (%s) multiple sequence alignment\n\n\n" % version
40
41 cur_char = 0
42 max_length = len(alignment[0])
43
44 if max_length <= 0:
45 raise ValueError("Non-empty sequences are required")
46
47
48 while cur_char != max_length:
49
50
51 if (cur_char + 50) > max_length:
52 show_num = max_length - cur_char
53 else:
54 show_num = 50
55
56
57
58
59 for record in alignment:
60
61
62
63 line = record.id[0:30].replace(" ","_").ljust(36)
64 line += record.seq.data[cur_char:(cur_char + show_num)]
65 output += line + "\n"
66
67
68
69 if hasattr(alignment, "_star_info") and alignment._star_info != '':
70 output += (" " * 36) + \
71 alignment._star_info[cur_char:(cur_char + show_num)] + "\n"
72
73 output += "\n"
74 cur_char += show_num
75
76
77 self.handle.write(output + "\n")
78
80 """Clustalw alignment iterator."""
81
83 handle = self.handle
84 try:
85
86
87 line = self._header
88 del self._header
89 except AttributeError:
90 line = handle.readline()
91 if not line:
92 return None
93
94
95 known_headers = ['CLUSTAL', 'PROBCONS', 'MUSCLE']
96 if line.strip().split()[0] not in known_headers:
97 raise ValueError("%s is not a known CLUSTAL header: %s" % \
98 (line.strip().split()[0],
99 ", ".join(known_headers)))
100
101
102 version = None
103 for word in line.split():
104 if word[0]=='(' and word[-1]==')':
105 word = word[1:-1]
106 if word[0] in '0123456789':
107 version = word
108 break
109
110
111 line = handle.readline()
112 while line.strip() == "":
113 line = handle.readline()
114
115
116
117
118 ids = []
119 seqs = []
120 consensus = ""
121 seq_cols = None
122
123
124 while True:
125 if line[0] != " " and line.strip() != "":
126
127 fields = line.rstrip().split()
128
129
130
131 if len(fields) < 2 or len(fields) > 3:
132 raise ValueError("Could not parse line:\n%s" % line)
133
134 ids.append(fields[0])
135 seqs.append(fields[1])
136
137
138 if seq_cols is None:
139 start = len(fields[0]) + line[len(fields[0]):].find(fields[1])
140 end = start + len(fields[1])
141 seq_cols = slice(start, end)
142 del start, end
143 assert fields[1] == line[seq_cols]
144
145 if len(fields) == 3:
146
147 try:
148 letters = int(fields[2])
149 except ValueError:
150 raise ValueError("Could not parse line, bad sequence number:\n%s" % line)
151 if len(fields[1].replace("-","")) != letters:
152 raise ValueError("Could not parse line, invalid sequence number:\n%s" % line)
153 elif line[0] == " ":
154
155 assert len(ids) == len(seqs)
156 assert len(ids) > 0
157 assert seq_cols is not None
158 consensus = line[seq_cols]
159 assert not line[:seq_cols.start].strip()
160 assert not line[seq_cols.stop:].strip()
161
162 line = handle.readline()
163 assert line.strip() == ""
164 break
165 else:
166
167 break
168 line = handle.readline()
169 if not line : break
170
171 assert line.strip() == ""
172 assert seq_cols is not None
173
174
175 for s in seqs:
176 assert len(s) == len(seqs[0])
177 if consensus:
178 assert len(consensus) == len(seqs[0])
179
180
181 done = False
182 while not done:
183
184
185
186 while (not line) or line.strip() == "":
187 line = handle.readline()
188 if not line : break
189 if not line : break
190
191 if line.split(None,1)[0] in known_headers:
192
193 done = True
194 self._header = line
195 break
196
197 for i in range(len(ids)):
198 assert line[0] != " ", "Unexpected line:\n%s" % repr(line)
199 fields = line.rstrip().split()
200
201
202
203 if len(fields) < 2 or len(fields) > 3:
204 raise ValueError("Could not parse line:\n%s" % repr(line))
205
206 if fields[0] != ids[i]:
207 raise ValueError("Identifiers out of order? Got '%s' but expected '%s'" \
208 % (fields[0], ids[i]))
209
210 if fields[1] != line[seq_cols]:
211 start = len(fields[0]) + line[len(fields[0]):].find(fields[1])
212 assert start == seq_cols.start, 'Old location %s -> %i:XX' % (seq_cols, start)
213 end = start + len(fields[1])
214 seq_cols = slice(start, end)
215 del start, end
216
217
218 seqs[i] += fields[1]
219 assert len(seqs[i]) == len(seqs[0])
220
221 if len(fields) == 3:
222
223 try:
224 letters = int(fields[2])
225 except ValueError:
226 raise ValueError("Could not parse line, bad sequence number:\n%s" % line)
227 if len(seqs[i].replace("-","")) != letters:
228 raise ValueError("Could not parse line, invalid sequence number:\n%s" % line)
229
230
231 line = handle.readline()
232
233 if consensus:
234 assert line[0] == " "
235 assert seq_cols is not None
236 consensus += line[seq_cols]
237 assert len(consensus) == len(seqs[0])
238 assert not line[:seq_cols.start].strip()
239 assert not line[seq_cols.stop:].strip()
240
241 line = handle.readline()
242
243
244 assert len(ids) == len(seqs)
245 if len(seqs) == 0 or len(seqs[0]) == 0:
246 return None
247
248 if self.records_per_alignment is not None \
249 and self.records_per_alignment != len(ids):
250 raise ValueError("Found %i records in this alignment, told to expect %i" \
251 % (len(ids), self.records_per_alignment))
252
253 alignment = MultipleSeqAlignment(self.alphabet)
254 alignment_length = len(seqs[0])
255 for i in range(len(ids)):
256 if len(seqs[i]) != alignment_length:
257 raise ValueError("Error parsing alignment - sequences of different length?")
258 alignment.add_sequence(ids[i], seqs[i])
259
260
261 if version:
262 alignment._version = version
263 if consensus:
264 assert len(consensus) == alignment_length, \
265 "Alignment length is %i, consensus length is %i, '%s'" \
266 % (alignment_length, len(consensus), consensus)
267 alignment._star_info = consensus
268 return alignment
269
270 if __name__ == "__main__":
271 print "Running a quick self-test"
272
273
274
275 aln_example1 = \
276 """CLUSTAL W (1.81) multiple sequence alignment
277
278
279 gi|4959044|gb|AAD34209.1|AF069 MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN 50
280 gi|671626|emb|CAA85685.1| ---------MSPQTETKASVGFKAGVKEYKLTYYTPEYETKDTDILAAFR 41
281 * *: :: :. :* : :. : . :* :: .
282
283 gi|4959044|gb|AAD34209.1|AF069 LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW 100
284 gi|671626|emb|CAA85685.1| VTPQPG-----------------VPPEEAGAAVAAESSTGT--------- 65
285 : ** **:... *.*** ..
286
287 gi|4959044|gb|AAD34209.1|AF069 LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT 150
288 gi|671626|emb|CAA85685.1| WTTVWTDGLTSLDRYKG-----RCYHIEPVPG------------------ 92
289 .:* * *: .* :* : :* .*
290
291 gi|4959044|gb|AAD34209.1|AF069 SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE 200
292 gi|671626|emb|CAA85685.1| -EKDQCICYVAYPLDLFEEGSVTNMFTSIVGNVFGFKALRALRLEDLRIP 141
293 *::. . .:: :*..* :* .* .. . : . :
294
295 gi|4959044|gb|AAD34209.1|AF069 VPTTRAQRRA 210
296 gi|671626|emb|CAA85685.1| VAYVKTFQGP 151
297 *. .:: : .
298
299 """
300
301
302
303
304 aln_example2 = \
305 """CLUSTAL X (1.83) multiple sequence alignment
306
307
308 V_Harveyi_PATH --MKNWIKVAVAAIA--LSAA------------------TVQAATEVKVG
309 B_subtilis_YXEM MKMKKWTVLVVAALLAVLSACG------------NGNSSSKEDDNVLHVG
310 B_subtilis_GlnH_homo_YCKK MKKALLALFMVVSIAALAACGAGNDNQSKDNAKDGDLWASIKKKGVLTVG
311 YA80_HAEIN MKKLLFTTALLTGAIAFSTF-----------SHAGEIADRVEKTKTLLVG
312 FLIY_ECOLI MKLAHLGRQALMGVMAVALVAG---MSVKSFADEG-LLNKVKERGTLLVG
313 E_coli_GlnH --MKSVLKVSLAALTLAFAVS------------------SHAADKKLVVA
314 Deinococcus_radiodurans -MKKSLLSLKLSGLLVPSVLALS--------LSACSSPSSTLNQGTLKIA
315 HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG
316 HISJ_E_COLI MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG
317 : . : :.
318
319 V_Harveyi_PATH MSGRYFPFTFVKQ--DKLQGFEVDMWDEIGKRNDYKIEYVTANFSGLFGL
320 B_subtilis_YXEM ATGQSYPFAYKEN--GKLTGFDVEVMEAVAKKIDMKLDWKLLEFSGLMGE
321 B_subtilis_GlnH_homo_YCKK TEGTYEPFTYHDKDTDKLTGYDVEVITEVAKRLGLKVDFKETQWGSMFAG
322 YA80_HAEIN TEGTYAPFTFHDK-SGKLTGFDVEVIRKVAEKLGLKVEFKETQWDAMYAG
323 FLIY_ECOLI LEGTYPPFSFQGD-DGKLTGFEVEFAQQLAKHLGVEASLKPTKWDGMLAS
324 E_coli_GlnH TDTAFVPFEFKQG--DKYVGFDVDLWAAIAKELKLDYELKPMDFSGIIPA
325 Deinococcus_radiodurans MEGTYPPFTSKNE-QGELVGFDVDIAKAVAQKLNLKPEFVLTEWSGILAG
326 HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS
327 HISJ_E_COLI TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS
328 ** .: *::::. : :. . ..:
329
330 V_Harveyi_PATH LETGRIDTISNQITMTDARKAKYLFADPYVVDG-AQI
331 B_subtilis_YXEM LQTGKLDTISNQVAVTDERKETYNFTKPYAYAG-TQI
332 B_subtilis_GlnH_homo_YCKK LNSKRFDVVANQVG-KTDREDKYDFSDKYTTSR-AVV
333 YA80_HAEIN LNAKRFDVIANQTNPSPERLKKYSFTTPYNYSG-GVI
334 FLIY_ECOLI LDSKRIDVVINQVTISDERKKKYDFSTPYTISGIQAL
335 E_coli_GlnH LQTKNVDLALAGITITDERKKAIDFSDGYYKSG-LLV
336 Deinococcus_radiodurans LQANKYDVIVNQVGITPERQNSIGFSQPYAYSRPEII
337 HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV
338 HISJ_E_COLI LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV
339 *.: . * . * *: :
340
341 """
342
343 from StringIO import StringIO
344
345 alignments = list(ClustalIterator(StringIO(aln_example1)))
346 assert 1 == len(alignments)
347 assert alignments[0]._version == "1.81"
348 alignment = alignments[0]
349 assert 2 == len(alignment)
350 assert alignment[0].id == "gi|4959044|gb|AAD34209.1|AF069"
351 assert alignment[1].id == "gi|671626|emb|CAA85685.1|"
352 assert alignment[0].seq.tostring() == \
353 "MENSDSNDKGSDQSAAQRRSQMDRLDREEAFYQFVNNLSEEDYRLMRDNN" + \
354 "LLGTPGESTEEELLRRLQQIKEGPPPQSPDENRAGESSDDVTNSDSIIDW" + \
355 "LNSVRQTGNTTRSRQRGNQSWRAVSRTNPNSGDFRFSLEINVNRNNGSQT" + \
356 "SENESEPSTRRLSVENMESSSQRQMENSASESASARPSRAERNSTEAVTE" + \
357 "VPTTRAQRRA"
358
359 alignments = list(ClustalIterator(StringIO(aln_example2)))
360 assert 1 == len(alignments)
361 assert alignments[0]._version == "1.83"
362 alignment = alignments[0]
363 assert 9 == len(alignment)
364 assert alignment[-1].id == "HISJ_E_COLI"
365 assert alignment[-1].seq.tostring() == \
366 "MKKLVLSLSLVLAFSSATAAF-------------------AAIPQNIRIG" + \
367 "TDPTYAPFESKNS-QGELVGFDIDLAKELCKRINTQCTFVENPLDALIPS" + \
368 "LKAKKIDAIMSSLSITEKRQQEIAFTDKLYAADSRLV"
369
370 for alignment in ClustalIterator(StringIO(aln_example2 + aln_example1)):
371 print "Alignment with %i records of length %i" \
372 % (len(alignment),
373 alignment.get_alignment_length())
374
375 print "Checking empty file..."
376 assert 0 == len(list(ClustalIterator(StringIO(""))))
377
378 print "Checking write/read..."
379 alignments = list(ClustalIterator(StringIO(aln_example1))) \
380 + list(ClustalIterator(StringIO(aln_example2)))*2
381 handle = StringIO()
382 ClustalWriter(handle).write_file(alignments)
383 handle.seek(0)
384 for i,a in enumerate(ClustalIterator(handle)):
385 assert a.get_alignment_length() == alignments[i].get_alignment_length()
386 handle.seek(0)
387
388 print "Testing write/read when there is only one sequence..."
389 alignment = alignment[0:1]
390 handle = StringIO()
391 ClustalWriter(handle).write_file([alignment])
392 handle.seek(0)
393 for i,a in enumerate(ClustalIterator(handle)):
394 assert a.get_alignment_length() == alignment.get_alignment_length()
395 assert len(a) == 1
396
397 aln_example3 = \
398 """CLUSTAL 2.0.9 multiple sequence alignment
399
400
401 Test1seq ------------------------------------------------------------
402 AT3G20900.1-SEQ ATGAACAAAGTAGCGAGGAAGAACAAAACATCAGGTGAACAAAAAAAAAACTCAATCCAC
403 AT3G20900.1-CDS ------------------------------------------------------------
404
405
406 Test1seq -----AGTTACAATAACTGACGAAGCTAAGTAGGCTACTAATTAACGTCATCAACCTAAT
407 AT3G20900.1-SEQ ATCAAAGTTACAATAACTGACGAAGCTAAGTAGGCTAGAAATTAAAGTCATCAACCTAAT
408 AT3G20900.1-CDS ------------------------------------------------------------
409
410
411 Test1seq ACATAGCACTTAGAAAAAAGTGAAGTAAGAAAATATAAAATAATAAAAGGGTGGGTTATC
412 AT3G20900.1-SEQ ACATAGCACTTAGAAAAAAGTGAAGCAAGAAAATATAAAATAATAAAAGGGTGGGTTATC
413 AT3G20900.1-CDS ------------------------------------------------------------
414
415
416 Test1seq AATTGATAGTGTAAATCATCGTATTCCGGTGATATACCCTACCACAAAAACTCAAACCGA
417 AT3G20900.1-SEQ AATTGATAGTGTAAATCATAGTTGATTTTTGATATACCCTACCACAAAAACTCAAACCGA
418 AT3G20900.1-CDS ------------------------------------------------------------
419
420
421 Test1seq CTTGATTCAAATCATCTCAATAAATTAGCGCCAAAATAATGAAAAAAATAATAACAAACA
422 AT3G20900.1-SEQ CTTGATTCAAATCATCTCAAAAAACAAGCGCCAAAATAATGAAAAAAATAATAACAAAAA
423 AT3G20900.1-CDS ------------------------------------------------------------
424
425
426 Test1seq AAAACAAACCAAAATAAGAAAAAACATTACGCAAAACATAATAATTTACTCTTCGTTATT
427 AT3G20900.1-SEQ CAAACAAACCAAAATAAGAAAAAACATTACGCAAAACATAATAATTTACTCTTCGTTATT
428 AT3G20900.1-CDS ------------------------------------------------------------
429
430
431 Test1seq GTATTAACAAATCAAAGAGCTGAATTTTGATCACCTGCTAATACTACTTTCTGTATTGAT
432 AT3G20900.1-SEQ GTATTAACAAATCAAAGAGATGAATTTTGATCACCTGCTAATACTACTTTCTGTATTGAT
433 AT3G20900.1-CDS ------------------------------------------------------------
434
435
436 Test1seq CCTATATCAACGTAAACAAAGATACTAATAATTAACTAAAAGTACGTTCATCGATCGTGT
437 AT3G20900.1-SEQ CCTATATCAAAAAAAAAAAAGATACTAATAATTAACTAAAAGTACGTTCATCGATCGTGT
438 AT3G20900.1-CDS ------------------------------------------------------ATGAAC
439 *
440
441 Test1seq TCGTTGACGAAGAAGAGCTCTATCTCCGGCGGAGCAAAGAAAACGATCTGTCTCCGTCGT
442 AT3G20900.1-SEQ GCGTTGACGAAGAAGAGCTCTATCTCCGGCGGAGCAAAGAAAACGATCTGTCTCCGTCGT
443 AT3G20900.1-CDS AAAGTAGCGAGGAAGAACAAAACATC------AGCAAAGAAAACGATCTGTCTCCGTCGT
444 * *** ***** * * ** ****************************
445
446 Test1seq AACACACGGTCGCTAGAGAAACTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC
447 AT3G20900.1-SEQ AACACACAGTTTTTCGAGACCCTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC
448 AT3G20900.1-CDS AACACACAGTTTTTCGAGACCCTTTGCTTCTTCGGCGCCGGTGGACACGTCAGCATCTCC
449 ******* ** * **** ***************************************
450
451 Test1seq GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCGTGGTGACGTCAGCACCGCT
452 AT3G20900.1-SEQ GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCCTGGTGACGTCAGCACCGCT
453 AT3G20900.1-CDS GGTATCCTAGACTTCTTGGCTTTCGGGGTACAACAACCGCCTGGTGACGTCAGCACCGCT
454 **************************************** *******************
455
456 Test1seq GCTGGGGATGGAGAGGGAACAGAGTT-
457 AT3G20900.1-SEQ GCTGGGGATGGAGAGGGAACAGAGTAG
458 AT3G20900.1-CDS GCTGGGGATGGAGAGGGAACAGAGTAG
459 *************************
460 """
461 alignments = list(ClustalIterator(StringIO(aln_example3)))
462 assert 1 == len(alignments)
463 assert alignments[0]._version == "2.0.9"
464
465 print "The End"
466