1
2
3
4
5
6 """Code for dealing with sequence alignments.
7 """
8
9 from Bio.Seq import Seq
10 from Bio.SeqRecord import SeqRecord
11 from Bio import Alphabet
12
13
14 from Bio.Align.Generic import Alignment as _Alignment
16 """"Represents a classical multiple sequence alignment (MSA).
17
18 By this we mean a collection of sequences (usually shown as rows) which
19 are all the same length (usually with gap characters for insertions of
20 padding). The data can then be regarded as a matrix of letters, with well
21 defined columns.
22
23 You would typically create an MSA by loading an alignment file with the
24 AlignIO module:
25
26 >>> from Bio import AlignIO
27 >>> align = AlignIO.read("Clustalw/opuntia.aln", "clustal")
28 >>> print align
29 SingleLetterAlphabet() alignment with 7 rows and 156 columns
30 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273285|gb|AF191659.1|AF191
31 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273284|gb|AF191658.1|AF191
32 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273287|gb|AF191661.1|AF191
33 TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273286|gb|AF191660.1|AF191
34 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273290|gb|AF191664.1|AF191
35 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF191
36 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273291|gb|AF191665.1|AF191
37
38 In some respects you can treat these objects as lists of SeqRecord objects,
39 each representing a row of the alignment. Iterating over an alignment gives
40 the SeqRecord object for each row:
41
42 >>> len(align)
43 7
44 >>> for record in align:
45 ... print record.id, len(record)
46 gi|6273285|gb|AF191659.1|AF191 156
47 gi|6273284|gb|AF191658.1|AF191 156
48 gi|6273287|gb|AF191661.1|AF191 156
49 gi|6273286|gb|AF191660.1|AF191 156
50 gi|6273290|gb|AF191664.1|AF191 156
51 gi|6273289|gb|AF191663.1|AF191 156
52 gi|6273291|gb|AF191665.1|AF191 156
53
54 You can also access individual rows as SeqRecord objects via their index:
55
56 >>> print align[0].id
57 gi|6273285|gb|AF191659.1|AF191
58 >>> print align[-1].id
59 gi|6273291|gb|AF191665.1|AF191
60
61 And extract columns as strings:
62
63 >>> print align[:,1]
64 AAAAAAA
65
66 Or, take just the first ten columns as a sub-alignment:
67
68 >>> print align[:,:10]
69 SingleLetterAlphabet() alignment with 7 rows and 10 columns
70 TATACATTAA gi|6273285|gb|AF191659.1|AF191
71 TATACATTAA gi|6273284|gb|AF191658.1|AF191
72 TATACATTAA gi|6273287|gb|AF191661.1|AF191
73 TATACATAAA gi|6273286|gb|AF191660.1|AF191
74 TATACATTAA gi|6273290|gb|AF191664.1|AF191
75 TATACATTAA gi|6273289|gb|AF191663.1|AF191
76 TATACATTAA gi|6273291|gb|AF191665.1|AF191
77
78 Combining this alignment slicing with alignment addition allows you to
79 remove a section of the alignment. For example, taking just the first
80 and last ten columns:
81
82 >>> print align[:,:10] + align[:,-10:]
83 SingleLetterAlphabet() alignment with 7 rows and 20 columns
84 TATACATTAAGTGTACCAGA gi|6273285|gb|AF191659.1|AF191
85 TATACATTAAGTGTACCAGA gi|6273284|gb|AF191658.1|AF191
86 TATACATTAAGTGTACCAGA gi|6273287|gb|AF191661.1|AF191
87 TATACATAAAGTGTACCAGA gi|6273286|gb|AF191660.1|AF191
88 TATACATTAAGTGTACCAGA gi|6273290|gb|AF191664.1|AF191
89 TATACATTAAGTATACCAGA gi|6273289|gb|AF191663.1|AF191
90 TATACATTAAGTGTACCAGA gi|6273291|gb|AF191665.1|AF191
91
92 Note - This object is intended to replace the existing Alignment object
93 defined in module Bio.Align.Generic but is not fully backwards compatible
94 with it.
95
96 Note - This object does NOT attempt to model the kind of alignments used
97 in next generation sequencing with multiple sequencing reads which are
98 much shorter than the alignment, and where there is usually a consensus or
99 reference sequence with special status.
100 """
101
102 - def __init__(self, records, alphabet=None):
103 """Initialize a new MultipleSeqAlignment object.
104
105 Arguments:
106 records - A list (or iterator) of SeqRecord objects, whose sequences
107 are all the same length. This may be an be an empty list.
108 alphabet - The alphabet for the whole alignment, typically a gapped
109 alphabet, which should be a super-set of the individual
110 record alphabets. If omitted, a consensus alphabet is used.
111
112 You would normally load a MSA from a file using Bio.AlignIO, but you
113 can do this from a list of SeqRecord objects too:
114
115 >>> from Bio.Alphabet import generic_dna
116 >>> from Bio.Seq import Seq
117 >>> from Bio.SeqRecord import SeqRecord
118 >>> a = SeqRecord(Seq("AAAACGT", generic_dna), id="Alpha")
119 >>> b = SeqRecord(Seq("AAA-CGT", generic_dna), id="Beta")
120 >>> c = SeqRecord(Seq("AAAAGGT", generic_dna), id="Gamma")
121 >>> align = MultipleSeqAlignment([a, b, c])
122 >>> print align
123 DNAAlphabet() alignment with 3 rows and 7 columns
124 AAAACGT Alpha
125 AAA-CGT Beta
126 AAAAGGT Gamma
127
128 NOTE - The older Bio.Align.Generic.Alignment class only accepted a
129 single argument, an alphabet. This is still supported via a backwards
130 compatible "hack" so as not to disrupt existing scripts and users, but
131 this will in future be deprecated.
132 """
133 if isinstance(records, Alphabet.Alphabet) \
134 or isinstance(records, Alphabet.AlphabetEncoder):
135 if alphabet is None:
136
137 alphabet = records
138 records = []
139 else :
140 raise ValueError("Invalid records argument")
141 if alphabet is not None :
142 if not (isinstance(alphabet, Alphabet.Alphabet) \
143 or isinstance(alphabet, Alphabet.AlphabetEncoder)):
144 raise ValueError("Invalid alphabet argument")
145 self._alphabet = alphabet
146 else :
147
148 self._alphabet = Alphabet.single_letter_alphabet
149
150 self._records = []
151 if records:
152 self.extend(records)
153 if alphabet is None:
154
155 self._alphabet = Alphabet._consensus_alphabet(rec.seq.alphabet for \
156 rec in self._records \
157 if rec.seq is not None)
158
160 """Add more SeqRecord objects to the alignment as rows.
161
162 They must all have the same length as the original alignment, and have
163 alphabets compatible with the alignment's alphabet. For example,
164
165 >>> from Bio.Alphabet import generic_dna
166 >>> from Bio.Seq import Seq
167 >>> from Bio.SeqRecord import SeqRecord
168 >>> from Bio.Align import MultipleSeqAlignment
169 >>> a = SeqRecord(Seq("AAAACGT", generic_dna), id="Alpha")
170 >>> b = SeqRecord(Seq("AAA-CGT", generic_dna), id="Beta")
171 >>> c = SeqRecord(Seq("AAAAGGT", generic_dna), id="Gamma")
172 >>> d = SeqRecord(Seq("AAAACGT", generic_dna), id="Delta")
173 >>> e = SeqRecord(Seq("AAA-GGT", generic_dna), id="Epsilon")
174
175 First we create a small alignment (three rows):
176
177 >>> align = MultipleSeqAlignment([a, b, c])
178 >>> print align
179 DNAAlphabet() alignment with 3 rows and 7 columns
180 AAAACGT Alpha
181 AAA-CGT Beta
182 AAAAGGT Gamma
183
184 Now we can extend this alignment with another two rows:
185
186 >>> align.extend([d, e])
187 >>> print align
188 DNAAlphabet() alignment with 5 rows and 7 columns
189 AAAACGT Alpha
190 AAA-CGT Beta
191 AAAAGGT Gamma
192 AAAACGT Delta
193 AAA-GGT Epsilon
194
195 Because the alignment object allows iteration over the rows as
196 SeqRecords, you can use the extend method with a second alignment
197 (provided its sequences have the same length as the original alignment).
198 """
199 for rec in records:
200 self.append(rec)
201
203 """Add one more SeqRecord object to the alignment as a new row.
204
205 This must have the same length as the original alignment (unless this is
206 the first record), and have an alphabet compatible with the alignment's
207 alphabet.
208
209 >>> from Bio import AlignIO
210 >>> align = AlignIO.read("Clustalw/opuntia.aln", "clustal")
211 >>> print align
212 SingleLetterAlphabet() alignment with 7 rows and 156 columns
213 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273285|gb|AF191659.1|AF191
214 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273284|gb|AF191658.1|AF191
215 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273287|gb|AF191661.1|AF191
216 TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273286|gb|AF191660.1|AF191
217 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273290|gb|AF191664.1|AF191
218 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF191
219 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273291|gb|AF191665.1|AF191
220 >>> len(align)
221 7
222
223 We'll now construct a dummy record to append as an example:
224
225 >>> from Bio.Seq import Seq
226 >>> from Bio.SeqRecord import SeqRecord
227 >>> dummy = SeqRecord(Seq("N"*156), id="dummy")
228
229 Now append this to the alignment,
230
231 >>> align.append(dummy)
232 >>> print align
233 SingleLetterAlphabet() alignment with 8 rows and 156 columns
234 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273285|gb|AF191659.1|AF191
235 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273284|gb|AF191658.1|AF191
236 TATACATTAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273287|gb|AF191661.1|AF191
237 TATACATAAAAGAAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273286|gb|AF191660.1|AF191
238 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273290|gb|AF191664.1|AF191
239 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273289|gb|AF191663.1|AF191
240 TATACATTAAAGGAGGGGGATGCGGATAAATGGAAAGGCGAAAG...AGA gi|6273291|gb|AF191665.1|AF191
241 NNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNN...NNN dummy
242 >>> len(align)
243 8
244
245 """
246 if not isinstance(record, SeqRecord):
247 raise TypeError("New sequence is not a SeqRecord object")
248 if self._records and len(record) != self.get_alignment_length():
249
250
251
252 raise ValueError("Sequences must all be the same length")
253
254
255 if not Alphabet._check_type_compatible([self._alphabet, record.seq.alphabet]):
256 raise ValueError("New sequence's alphabet is incompatible")
257 self._records.append(record)
258
260 """Combines to alignments with the same number of rows by adding them.
261
262 If you have two multiple sequence alignments (MSAs), there are two ways to think
263 about adding them - by row or by column. Using the extend method adds by row.
264 Using the addition operator adds by column. For example,
265
266 >>> from Bio.Alphabet import generic_dna
267 >>> from Bio.Seq import Seq
268 >>> from Bio.SeqRecord import SeqRecord
269 >>> from Bio.Align import MultipleSeqAlignment
270 >>> a1 = SeqRecord(Seq("AAAAC", generic_dna), id="Alpha")
271 >>> b1 = SeqRecord(Seq("AAA-C", generic_dna), id="Beta")
272 >>> c1 = SeqRecord(Seq("AAAAG", generic_dna), id="Gamma")
273 >>> a2 = SeqRecord(Seq("GT", generic_dna), id="Alpha")
274 >>> b2 = SeqRecord(Seq("GT", generic_dna), id="Beta")
275 >>> c2 = SeqRecord(Seq("GT", generic_dna), id="Gamma")
276 >>> left = MultipleSeqAlignment([a1, b1, c1])
277 >>> right = MultipleSeqAlignment([a2, b2, c2])
278
279 Now, let's look at these two alignments:
280
281 >>> print left
282 DNAAlphabet() alignment with 3 rows and 5 columns
283 AAAAC Alpha
284 AAA-C Beta
285 AAAAG Gamma
286 >>> print right
287 DNAAlphabet() alignment with 3 rows and 2 columns
288 GT Alpha
289 GT Beta
290 GT Gamma
291
292 And add them:
293
294 >>> print left + right
295 DNAAlphabet() alignment with 3 rows and 7 columns
296 AAAACGT Alpha
297 AAA-CGT Beta
298 AAAAGGT Gamma
299
300 For this to work, both alignments must have the same number of records (here
301 they both have 3 rows):
302
303 >>> len(left)
304 3
305 >>> len(right)
306 3
307
308 The individual rows are SeqRecord objects, and these can be added together. Refer
309 to the SeqRecord documentation for details of how the annotation is handled. This
310 example is a special case in that both original alignments shared the same names,
311 meaning when the rows are added they also get the same name.
312 """
313 if not isinstance(other, MultipleSeqAlignment):
314 raise NotImplementedError
315 if len(self) != len(other):
316 raise ValueError("When adding two alignments they must have the same length"
317 " (i.e. same number or rows)")
318 alpha = Alphabet._consensus_alphabet([self._alphabet, other._alphabet])
319 merged = (left+right for left,right in zip(self, other))
320 return MultipleSeqAlignment(merged, alpha)
321
323 """Access part of the alignment.
324
325 Depending on the indices, you can get a SeqRecord object
326 (representing a single row), a Seq object (for a single columns),
327 a string (for a single characters) or another alignment
328 (representing some part or all of the alignment).
329
330 align[r,c] gives a single character as a string
331 align[r] gives a row as a SeqRecord
332 align[r,:] gives a row as a SeqRecord
333 align[:,c] gives a column as a Seq (using the alignment's alphabet)
334
335 align[:] and align[:,:] give a copy of the alignment
336
337 Anything else gives a sub alignment, e.g.
338 align[0:2] or align[0:2,:] uses only row 0 and 1
339 align[:,1:3] uses only columns 1 and 2
340 align[0:2,1:3] uses only rows 0 & 1 and only cols 1 & 2
341
342 We'll use the following example alignment here for illustration:
343
344 >>> from Bio.Alphabet import generic_dna
345 >>> from Bio.Seq import Seq
346 >>> from Bio.SeqRecord import SeqRecord
347 >>> from Bio.Align import MultipleSeqAlignment
348 >>> a = SeqRecord(Seq("AAAACGT", generic_dna), id="Alpha")
349 >>> b = SeqRecord(Seq("AAA-CGT", generic_dna), id="Beta")
350 >>> c = SeqRecord(Seq("AAAAGGT", generic_dna), id="Gamma")
351 >>> d = SeqRecord(Seq("AAAACGT", generic_dna), id="Delta")
352 >>> e = SeqRecord(Seq("AAA-GGT", generic_dna), id="Epsilon")
353 >>> align = MultipleSeqAlignment([a, b, c, d, e], generic_dna)
354
355 You can access a row of the alignment as a SeqRecord using an integer
356 index (think of the alignment as a list of SeqRecord objects here):
357
358 >>> first_record = align[0]
359 >>> print first_record.id, first_record.seq
360 Alpha AAAACGT
361 >>> last_record = align[-1]
362 >>> print last_record.id, last_record.seq
363 Epsilon AAA-GGT
364
365 You can also access use python's slice notation to create a sub-alignment
366 containing only some of the SeqRecord objects:
367
368 >>> sub_alignment = align[2:5]
369 >>> print sub_alignment
370 DNAAlphabet() alignment with 3 rows and 7 columns
371 AAAAGGT Gamma
372 AAAACGT Delta
373 AAA-GGT Epsilon
374
375 This includes support for a step, i.e. align[start:end:step], which
376 can be used to select every second sequence:
377
378 >>> sub_alignment = align[::2]
379 >>> print sub_alignment
380 DNAAlphabet() alignment with 3 rows and 7 columns
381 AAAACGT Alpha
382 AAAAGGT Gamma
383 AAA-GGT Epsilon
384
385 Or to get a copy of the alignment with the rows in reverse order:
386
387 >>> rev_alignment = align[::-1]
388 >>> print rev_alignment
389 DNAAlphabet() alignment with 5 rows and 7 columns
390 AAA-GGT Epsilon
391 AAAACGT Delta
392 AAAAGGT Gamma
393 AAA-CGT Beta
394 AAAACGT Alpha
395
396 You can also use two indices to specify both rows and columns. Using simple
397 integers gives you the entry as a single character string. e.g.
398
399 >>> align[3,4]
400 'C'
401
402 This is equivalent to:
403
404 >>> align[3][4]
405 'C'
406
407 or:
408
409 >>> align[3].seq[4]
410 'C'
411
412 To get a single column (as a string) use this syntax:
413
414 >>> align[:,4]
415 'CCGCG'
416
417 Or, to get part of a column,
418
419 >>> align[1:3,4]
420 'CG'
421
422 However, in general you get a sub-alignment,
423
424 >>> print align[1:5,3:6]
425 DNAAlphabet() alignment with 4 rows and 3 columns
426 -CG Beta
427 AGG Gamma
428 ACG Delta
429 -GG Epsilon
430
431 This should all seem familiar to anyone who has used the NumPy
432 array or matrix objects.
433 """
434 if isinstance(index, int):
435
436
437 return self._records[index]
438 elif isinstance(index, slice):
439
440 return MultipleSeqAlignment(self._records[index], self._alphabet)
441 elif len(index)!=2:
442 raise TypeError("Invalid index type.")
443
444
445 row_index, col_index = index
446 if isinstance(row_index, int):
447
448 return self._records[row_index][col_index]
449 elif isinstance(col_index, int):
450
451 return "".join(rec[col_index] for rec in self._records[row_index])
452 else:
453
454 return MultipleSeqAlignment((rec[col_index] for rec in self._records[row_index]),
455 self._alphabet)
456
458 """Sort the rows (SeqRecord objects) of the alignment in place.
459
460 This sorts the rows alphabetically using the SeqRecord object id.
461 Currently no advanced sort options are available, although this may
462 be added in a future release of Biopython.
463
464 This is useful if you want to add two alignments which use the same
465 record identifiers, but in a different order. For example,
466
467 >>> from Bio.Alphabet import generic_dna
468 >>> from Bio.Seq import Seq
469 >>> from Bio.SeqRecord import SeqRecord
470 >>> from Bio.Align import MultipleSeqAlignment
471 >>> align1 = MultipleSeqAlignment([
472 ... SeqRecord(Seq("ACGT", generic_dna), id="Human"),
473 ... SeqRecord(Seq("ACGG", generic_dna), id="Mouse"),
474 ... SeqRecord(Seq("ACGC", generic_dna), id="Chicken"),
475 ... ])
476 >>> align2 = MultipleSeqAlignment([
477 ... SeqRecord(Seq("CGGT", generic_dna), id="Mouse"),
478 ... SeqRecord(Seq("CGTT", generic_dna), id="Human"),
479 ... SeqRecord(Seq("CGCT", generic_dna), id="Chicken"),
480 ... ])
481
482 If you simple try and add these without sorting, you get this:
483
484 >>> print align1 + align2
485 DNAAlphabet() alignment with 3 rows and 8 columns
486 ACGTCGGT <unknown id>
487 ACGGCGTT <unknown id>
488 ACGCCGCT Chicken
489
490 Consult the SeqRecord documentation which explains why you get a
491 default value when annotation like the identifier doesn't match up.
492 However, if we sort the alignments first, then add them we get the
493 desired result:
494
495 >>> align1.sort()
496 >>> align2.sort()
497 >>> print align1 + align2
498 DNAAlphabet() alignment with 3 rows and 8 columns
499 ACGCCGCT Chicken
500 ACGTCGTT Human
501 ACGGCGGT Mouse
502
503 """
504 self._records.sort(cmp = lambda x, y : cmp(x.id, y.id))
505
507 """Returns a string containing a given column (OBSOLETE).
508
509 This is a method provided for backwards compatibility with the old
510 Bio.Align.Generic.Alignment object. You are encouraged to use the
511 slice notation instead.
512 """
513 return _Alignment.get_column(self, col)
514
515 - def add_sequence(self, descriptor, sequence, start = None, end = None,
516 weight = 1.0):
517 """Add a sequence to the alignment (OBSOLETE).
518
519 The start, end, and weight arguments are not supported! This method
520 only provides limited backwards compatibility with the old
521 Bio.Align.Generic.Alignment object. You are encouraged to use the
522 append method with a SeqRecord instead.
523 """
524
525
526 if start is not None or end is not None or weight != 1.0:
527 raise ValueError("The add_Sequence method is obsolete, and only "
528 "provides limited backwards compatibily. The"
529 "start, end and weight arguments are not "
530 "supported.")
531 self.append(SeqRecord(Seq(sequence, self._alphabet),
532 id = descriptor, description = descriptor))
533
534
536 """Run the Bio.Align module's doctests.
537
538 This will try and locate the unit tests directory, and run the doctests
539 from there in order that the relative paths used in the examples work.
540 """
541 import doctest
542 import os
543 if os.path.isdir(os.path.join("..", "..", "Tests", "Clustalw")):
544 print "Runing doctests..."
545 cur_dir = os.path.abspath(os.curdir)
546 os.chdir(os.path.join("..", "..", "Tests"))
547 doctest.testmod()
548 os.chdir(cur_dir)
549 del cur_dir
550 print "Done"
551 elif os.path.isdir(os.path.join("Tests", "Clustalw")):
552 print "Runing doctests..."
553 cur_dir = os.path.abspath(os.curdir)
554 os.chdir(os.path.join("Tests"))
555 doctest.testmod()
556 os.chdir(cur_dir)
557 del cur_dir
558 print "Done"
559
560 if __name__ == "__main__":
561
562 _test()
563