1
2
3
4
5
6
7
8 """Provides objects to represent biological sequences with alphabets.
9
10 See also U{http://biopython.org/wiki/Seq} and the chapter in our tutorial:
11 - U{http://biopython.org/DIST/docs/tutorial/Tutorial.html}
12 - U{http://biopython.org/DIST/docs/tutorial/Tutorial.pdf}
13 """
14 __docformat__ ="epytext en"
15
16 import string
17 import array
18 import sys
19
20 import Alphabet
21 from Alphabet import IUPAC
22 from Bio.SeqRecord import SeqRecord
23 from Data.IUPACData import ambiguous_dna_complement, ambiguous_rna_complement
24 from Bio.Data import CodonTable
27 """Makes a python string translation table (PRIVATE).
28
29 Arguments:
30 - complement_mapping - a dictionary such as ambiguous_dna_complement
31 and ambiguous_rna_complement from Data.IUPACData.
32
33 Returns a translation table (a string of length 256) for use with the
34 python string's translate method to use in a (reverse) complement.
35
36 Compatible with lower case and upper case sequences.
37
38 For internal use only.
39 """
40 before = ''.join(complement_mapping.keys())
41 after = ''.join(complement_mapping.values())
42 before = before + before.lower()
43 after = after + after.lower()
44 return string.maketrans(before, after)
45
46 _dna_complement_table = _maketrans(ambiguous_dna_complement)
47 _rna_complement_table = _maketrans(ambiguous_rna_complement)
48
49 -class Seq(object):
50 """A read-only sequence object (essentially a string with an alphabet).
51
52 Like normal python strings, our basic sequence object is immutable.
53 This prevents you from doing my_seq[5] = "A" for example, but does allow
54 Seq objects to be used as dictionary keys.
55
56 The Seq object provides a number of string like methods (such as count,
57 find, split and strip), which are alphabet aware where appropriate.
58
59 The Seq object also provides some biological methods, such as complement,
60 reverse_complement, transcribe, back_transcribe and translate (which are
61 not applicable to sequences with a protein alphabet).
62 """
64 """Create a Seq object.
65
66 Arguments:
67 - seq - Sequence, required (string)
68 - alphabet - Optional argument, an Alphabet object from Bio.Alphabet
69
70 You will typically use Bio.SeqIO to read in sequences from files as
71 SeqRecord objects, whose sequence will be exposed as a Seq object via
72 the seq property.
73
74 However, will often want to create your own Seq objects directly:
75
76 >>> from Bio.Seq import Seq
77 >>> from Bio.Alphabet import IUPAC
78 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF",
79 ... IUPAC.protein)
80 >>> my_seq
81 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein())
82 >>> print my_seq
83 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF
84 """
85
86 assert (type(data) == type("") or
87 type(data) == type(u""))
88 self._data = data
89 self.alphabet = alphabet
90
91
92
93 @property
95 """Sequence as a string (OBSOLETE/DEPRECATED).
96
97 This is a read only property provided for backwards compatility with
98 older versions of Biopython (as is the tostring() method). We now
99 encourage you to use str(my_seq) instead of my_seq.data or the method
100 my_seq.tostring().
101
102 In recent releases of Biopython it was possible to change a Seq object
103 by updating its data property, but this triggered a deprecation warning.
104 Now the data property is read only, since Seq objects are meant to be
105 immutable:
106
107 >>> from Bio.Seq import Seq
108 >>> from Bio.Alphabet import generic_dna
109 >>> my_seq = Seq("ACGT", generic_dna)
110 >>> str(my_seq) == my_seq.tostring() == my_seq.data == "ACGT"
111 True
112 >>> my_seq.data = "AAAA"
113 Traceback (most recent call last):
114 ...
115 AttributeError: can't set attribute
116 """
117 return str(self)
118
120 """Returns a (truncated) representation of the sequence for debugging."""
121 if len(self) > 60:
122
123
124
125 return "%s('%s...%s', %s)" % (self.__class__.__name__,
126 str(self)[:54], str(self)[-3:],
127 repr(self.alphabet))
128 else:
129 return "%s(%s, %s)" % (self.__class__.__name__,
130 repr(self.data),
131 repr(self.alphabet))
133 """Returns the full sequence as a python string, use str(my_seq).
134
135 Note that Biopython 1.44 and earlier would give a truncated
136 version of repr(my_seq) for str(my_seq). If you are writing code
137 which need to be backwards compatible with old Biopython, you
138 should continue to use my_seq.tostring() rather than str(my_seq).
139 """
140 return self._data
141
143 """Compare the sequence to another sequence or a string (README).
144
145 Historically comparing Seq objects has done Python object comparison.
146 After considerable discussion (keeping in mind constraints of the
147 Python language, hashes and dictionary support) a future release of
148 Biopython will change this to use simple string comparison. The plan is
149 that comparing incompatible alphabets (e.g. DNA to RNA) will trigger a
150 warning.
151
152 This version of Biopython still does Python object comparison, but with
153 a warning about this future change. During this transition period,
154 please just do explicit comparisons:
155
156 >>> seq1 = Seq("ACGT")
157 >>> seq2 = Seq("ACGT")
158 >>> id(seq1) == id(seq2)
159 False
160 >>> str(seq1) == str(seq2)
161 True
162
163 Note - This method indirectly supports ==, < , etc.
164 """
165 if hasattr(other, "alphabet"):
166
167 import warnings
168 warnings.warn("In future comparing Seq objects will use string "
169 "comparison (not object comparison). Incompatible "
170 "alphabets will trigger a warning (not an exception). "
171 "In the interim please use id(seq1)==id(seq2) or "
172 "str(seq1)==str(seq2) to make your code explicit "
173 "and to avoid this warning.", FutureWarning)
174 return cmp(id(self), id(other))
175
177 """Returns the length of the sequence, use len(my_seq)."""
178 return len(self._data)
179
181 """Returns a subsequence of single letter, use my_seq[index]."""
182
183
184
185 if isinstance(index, int):
186
187 return self._data[index]
188 else:
189
190 return Seq(self._data[index], self.alphabet)
191
193 """Add another sequence or string to this sequence.
194
195 If adding a string to a Seq, the alphabet is preserved:
196
197 >>> from Bio.Seq import Seq
198 >>> from Bio.Alphabet import generic_protein
199 >>> Seq("MELKI", generic_protein) + "LV"
200 Seq('MELKILV', ProteinAlphabet())
201
202 When adding two Seq (like) objects, the alphabets are important.
203 Consider this example:
204
205 >>> from Bio.Seq import Seq
206 >>> from Bio.Alphabet.IUPAC import unambiguous_dna, ambiguous_dna
207 >>> unamb_dna_seq = Seq("ACGT", unambiguous_dna)
208 >>> ambig_dna_seq = Seq("ACRGT", ambiguous_dna)
209 >>> unamb_dna_seq
210 Seq('ACGT', IUPACUnambiguousDNA())
211 >>> ambig_dna_seq
212 Seq('ACRGT', IUPACAmbiguousDNA())
213
214 If we add the ambiguous and unambiguous IUPAC DNA alphabets, we get
215 the more general ambiguous IUPAC DNA alphabet:
216
217 >>> unamb_dna_seq + ambig_dna_seq
218 Seq('ACGTACRGT', IUPACAmbiguousDNA())
219
220 However, if the default generic alphabet is included, the result is
221 a generic alphabet:
222
223 >>> Seq("") + ambig_dna_seq
224 Seq('ACRGT', Alphabet())
225
226 You can't add RNA and DNA sequences:
227
228 >>> from Bio.Alphabet import generic_dna, generic_rna
229 >>> Seq("ACGT", generic_dna) + Seq("ACGU", generic_rna)
230 Traceback (most recent call last):
231 ...
232 TypeError: Incompatable alphabets DNAAlphabet() and RNAAlphabet()
233
234 You can't add nucleotide and protein sequences:
235
236 >>> from Bio.Alphabet import generic_dna, generic_protein
237 >>> Seq("ACGT", generic_dna) + Seq("MELKI", generic_protein)
238 Traceback (most recent call last):
239 ...
240 TypeError: Incompatable alphabets DNAAlphabet() and ProteinAlphabet()
241 """
242 if hasattr(other, "alphabet"):
243
244 if not Alphabet._check_type_compatible([self.alphabet,
245 other.alphabet]):
246 raise TypeError("Incompatable alphabets %s and %s" \
247 % (repr(self.alphabet), repr(other.alphabet)))
248
249 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
250 return self.__class__(str(self) + str(other), a)
251 elif isinstance(other, basestring):
252
253 return self.__class__(str(self) + other, self.alphabet)
254 elif isinstance(other, SeqRecord):
255
256 return NotImplemented
257 else :
258 raise TypeError
259
261 """Adding a sequence on the left.
262
263 If adding a string to a Seq, the alphabet is preserved:
264
265 >>> from Bio.Seq import Seq
266 >>> from Bio.Alphabet import generic_protein
267 >>> "LV" + Seq("MELKI", generic_protein)
268 Seq('LVMELKI', ProteinAlphabet())
269
270 Adding two Seq (like) objects is handled via the __add__ method.
271 """
272 if hasattr(other, "alphabet"):
273
274 if not Alphabet._check_type_compatible([self.alphabet,
275 other.alphabet]):
276 raise TypeError("Incompatable alphabets %s and %s" \
277 % (repr(self.alphabet), repr(other.alphabet)))
278
279 a = Alphabet._consensus_alphabet([self.alphabet, other.alphabet])
280 return self.__class__(str(other) + str(self), a)
281 elif isinstance(other, basestring):
282
283 return self.__class__(other + str(self), self.alphabet)
284 else:
285 raise TypeError
286
288 """Returns the full sequence as a python string (OBSOLETE).
289
290 Although not formally deprecated, you are now encouraged to use
291 str(my_seq) instead of my_seq.tostring()."""
292 return str(self)
293
295 """Returns the full sequence as a MutableSeq object.
296
297 >>> from Bio.Seq import Seq
298 >>> from Bio.Alphabet import IUPAC
299 >>> my_seq = Seq("MKQHKAMIVALIVICITAVVAAL",
300 ... IUPAC.protein)
301 >>> my_seq
302 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
303 >>> my_seq.tomutable()
304 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
305
306 Note that the alphabet is preserved.
307 """
308 return MutableSeq(str(self), self.alphabet)
309
311 """string/Seq/MutableSeq to string, checking alphabet (PRIVATE).
312
313 For a string argument, returns the string.
314
315 For a Seq or MutableSeq, it checks the alphabet is compatible
316 (raising an exception if it isn't), and then returns a string.
317 """
318 try:
319 other_alpha = other_sequence.alphabet
320 except AttributeError:
321
322 return other_sequence
323
324
325 if not Alphabet._check_type_compatible([self.alphabet, other_alpha]):
326 raise TypeError("Incompatable alphabets %s and %s" \
327 % (repr(self.alphabet), repr(other_alpha)))
328
329 return str(other_sequence)
330
331 - def count(self, sub, start=0, end=sys.maxint):
332 """Non-overlapping count method, like that of a python string.
333
334 This behaves like the python string method of the same name,
335 which does a non-overlapping count!
336
337 Returns an integer, the number of occurrences of substring
338 argument sub in the (sub)sequence given by [start:end].
339 Optional arguments start and end are interpreted as in slice
340 notation.
341
342 Arguments:
343 - sub - a string or another Seq object to look for
344 - start - optional integer, slice start
345 - end - optional integer, slice end
346
347 e.g.
348
349 >>> from Bio.Seq import Seq
350 >>> my_seq = Seq("AAAATGA")
351 >>> print my_seq.count("A")
352 5
353 >>> print my_seq.count("ATG")
354 1
355 >>> print my_seq.count(Seq("AT"))
356 1
357 >>> print my_seq.count("AT", 2, -1)
358 1
359
360 HOWEVER, please note because python strings and Seq objects (and
361 MutableSeq objects) do a non-overlapping search, this may not give
362 the answer you expect:
363
364 >>> "AAAA".count("AA")
365 2
366 >>> print Seq("AAAA").count("AA")
367 2
368
369 A non-overlapping search would give the answer as three!
370 """
371
372 sub_str = self._get_seq_str_and_check_alphabet(sub)
373 return str(self).count(sub_str, start, end)
374
376 """Implements the 'in' keyword, like a python string.
377
378 e.g.
379
380 >>> from Bio.Seq import Seq
381 >>> from Bio.Alphabet import generic_dna, generic_rna, generic_protein
382 >>> my_dna = Seq("ATATGAAATTTGAAAA", generic_dna)
383 >>> "AAA" in my_dna
384 True
385 >>> Seq("AAA") in my_dna
386 True
387 >>> Seq("AAA", generic_dna) in my_dna
388 True
389
390 Like other Seq methods, this will raise a type error if another Seq
391 (or Seq like) object with an incompatible alphabet is used:
392
393 >>> Seq("AAA", generic_rna) in my_dna
394 Traceback (most recent call last):
395 ...
396 TypeError: Incompatable alphabets DNAAlphabet() and RNAAlphabet()
397 >>> Seq("AAA", generic_protein) in my_dna
398 Traceback (most recent call last):
399 ...
400 TypeError: Incompatable alphabets DNAAlphabet() and ProteinAlphabet()
401 """
402
403 sub_str = self._get_seq_str_and_check_alphabet(char)
404 return sub_str in str(self)
405
406 - def find(self, sub, start=0, end=sys.maxint):
407 """Find method, like that of a python string.
408
409 This behaves like the python string method of the same name.
410
411 Returns an integer, the index of the first occurrence of substring
412 argument sub in the (sub)sequence given by [start:end].
413
414 Arguments:
415 - sub - a string or another Seq object to look for
416 - start - optional integer, slice start
417 - end - optional integer, slice end
418
419 Returns -1 if the subsequence is NOT found.
420
421 e.g. Locating the first typical start codon, AUG, in an RNA sequence:
422
423 >>> from Bio.Seq import Seq
424 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
425 >>> my_rna.find("AUG")
426 3
427 """
428
429 sub_str = self._get_seq_str_and_check_alphabet(sub)
430 return str(self).find(sub_str, start, end)
431
432 - def rfind(self, sub, start=0, end=sys.maxint):
433 """Find from right method, like that of a python string.
434
435 This behaves like the python string method of the same name.
436
437 Returns an integer, the index of the last (right most) occurrence of
438 substring argument sub in the (sub)sequence given by [start:end].
439
440 Arguments:
441 - sub - a string or another Seq object to look for
442 - start - optional integer, slice start
443 - end - optional integer, slice end
444
445 Returns -1 if the subsequence is NOT found.
446
447 e.g. Locating the last typical start codon, AUG, in an RNA sequence:
448
449 >>> from Bio.Seq import Seq
450 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
451 >>> my_rna.rfind("AUG")
452 15
453 """
454
455 sub_str = self._get_seq_str_and_check_alphabet(sub)
456 return str(self).rfind(sub_str, start, end)
457
458 - def startswith(self, prefix, start=0, end=sys.maxint):
459 """Does the Seq start with the given prefix? Returns True/False.
460
461 This behaves like the python string method of the same name.
462
463 Return True if the sequence starts with the specified prefix
464 (a string or another Seq object), False otherwise.
465 With optional start, test sequence beginning at that position.
466 With optional end, stop comparing sequence at that position.
467 prefix can also be a tuple of strings to try. e.g.
468
469 >>> from Bio.Seq import Seq
470 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
471 >>> my_rna.startswith("GUC")
472 True
473 >>> my_rna.startswith("AUG")
474 False
475 >>> my_rna.startswith("AUG", 3)
476 True
477 >>> my_rna.startswith(("UCC","UCA","UCG"),1)
478 True
479 """
480
481 if isinstance(prefix, tuple):
482
483
484
485 prefix_strings = [self._get_seq_str_and_check_alphabet(p) \
486 for p in prefix]
487 for prefix_str in prefix_strings:
488 if str(self).startswith(prefix_str, start, end):
489 return True
490 return False
491 else:
492 prefix_str = self._get_seq_str_and_check_alphabet(prefix)
493 return str(self).startswith(prefix_str, start, end)
494
495 - def endswith(self, suffix, start=0, end=sys.maxint):
496 """Does the Seq end with the given suffix? Returns True/False.
497
498 This behaves like the python string method of the same name.
499
500 Return True if the sequence ends with the specified suffix
501 (a string or another Seq object), False otherwise.
502 With optional start, test sequence beginning at that position.
503 With optional end, stop comparing sequence at that position.
504 suffix can also be a tuple of strings to try. e.g.
505
506 >>> from Bio.Seq import Seq
507 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
508 >>> my_rna.endswith("UUG")
509 True
510 >>> my_rna.endswith("AUG")
511 False
512 >>> my_rna.endswith("AUG", 0, 18)
513 True
514 >>> my_rna.endswith(("UCC","UCA","UUG"))
515 True
516 """
517
518 if isinstance(suffix, tuple):
519
520
521
522 suffix_strings = [self._get_seq_str_and_check_alphabet(p) \
523 for p in suffix]
524 for suffix_str in suffix_strings:
525 if str(self).endswith(suffix_str, start, end):
526 return True
527 return False
528 else:
529 suffix_str = self._get_seq_str_and_check_alphabet(suffix)
530 return str(self).endswith(suffix_str, start, end)
531
532
533 - def split(self, sep=None, maxsplit=-1):
534 """Split method, like that of a python string.
535
536 This behaves like the python string method of the same name.
537
538 Return a list of the 'words' in the string (as Seq objects),
539 using sep as the delimiter string. If maxsplit is given, at
540 most maxsplit splits are done. If maxsplit is ommited, all
541 splits are made.
542
543 Following the python string method, sep will by default be any
544 white space (tabs, spaces, newlines) but this is unlikely to
545 apply to biological sequences.
546
547 e.g.
548
549 >>> from Bio.Seq import Seq
550 >>> my_rna = Seq("GUCAUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAGUUG")
551 >>> my_aa = my_rna.translate()
552 >>> my_aa
553 Seq('VMAIVMGR*KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))
554 >>> my_aa.split("*")
555 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
556 >>> my_aa.split("*",1)
557 [Seq('VMAIVMGR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('KGAR*L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
558
559 See also the rsplit method:
560
561 >>> my_aa.rsplit("*",1)
562 [Seq('VMAIVMGR*KGAR', HasStopCodon(ExtendedIUPACProtein(), '*')), Seq('L', HasStopCodon(ExtendedIUPACProtein(), '*'))]
563 """
564
565 sep_str = self._get_seq_str_and_check_alphabet(sep)
566
567
568 return [Seq(part, self.alphabet) \
569 for part in str(self).split(sep_str, maxsplit)]
570
571 - def rsplit(self, sep=None, maxsplit=-1):
572 """Right split method, like that of a python string.
573
574 This behaves like the python string method of the same name.
575
576 Return a list of the 'words' in the string (as Seq objects),
577 using sep as the delimiter string. If maxsplit is given, at
578 most maxsplit splits are done COUNTING FROM THE RIGHT.
579 If maxsplit is ommited, all splits are made.
580
581 Following the python string method, sep will by default be any
582 white space (tabs, spaces, newlines) but this is unlikely to
583 apply to biological sequences.
584
585 e.g. print my_seq.rsplit("*",1)
586
587 See also the split method.
588 """
589
590 sep_str = self._get_seq_str_and_check_alphabet(sep)
591 return [Seq(part, self.alphabet) \
592 for part in str(self).rsplit(sep_str, maxsplit)]
593
594 - def strip(self, chars=None):
595 """Returns a new Seq object with leading and trailing ends stripped.
596
597 This behaves like the python string method of the same name.
598
599 Optional argument chars defines which characters to remove. If
600 ommitted or None (default) then as for the python string method,
601 this defaults to removing any white space.
602
603 e.g. print my_seq.strip("-")
604
605 See also the lstrip and rstrip methods.
606 """
607
608 strip_str = self._get_seq_str_and_check_alphabet(chars)
609 return Seq(str(self).strip(strip_str), self.alphabet)
610
611 - def lstrip(self, chars=None):
612 """Returns a new Seq object with leading (left) end stripped.
613
614 This behaves like the python string method of the same name.
615
616 Optional argument chars defines which characters to remove. If
617 ommitted or None (default) then as for the python string method,
618 this defaults to removing any white space.
619
620 e.g. print my_seq.lstrip("-")
621
622 See also the strip and rstrip methods.
623 """
624
625 strip_str = self._get_seq_str_and_check_alphabet(chars)
626 return Seq(str(self).lstrip(strip_str), self.alphabet)
627
628 - def rstrip(self, chars=None):
629 """Returns a new Seq object with trailing (right) end stripped.
630
631 This behaves like the python string method of the same name.
632
633 Optional argument chars defines which characters to remove. If
634 ommitted or None (default) then as for the python string method,
635 this defaults to removing any white space.
636
637 e.g. Removing a nucleotide sequence's polyadenylation (poly-A tail):
638
639 >>> from Bio.Alphabet import IUPAC
640 >>> from Bio.Seq import Seq
641 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAGAAAAAA", IUPAC.unambiguous_dna)
642 >>> my_seq
643 Seq('CGGTACGCTTATGTCACGTAGAAAAAA', IUPACUnambiguousDNA())
644 >>> my_seq.rstrip("A")
645 Seq('CGGTACGCTTATGTCACGTAG', IUPACUnambiguousDNA())
646
647 See also the strip and lstrip methods.
648 """
649
650 strip_str = self._get_seq_str_and_check_alphabet(chars)
651 return Seq(str(self).rstrip(strip_str), self.alphabet)
652
654 """Returns an upper case copy of the sequence.
655
656 >>> from Bio.Alphabet import HasStopCodon, generic_protein
657 >>> from Bio.Seq import Seq
658 >>> my_seq = Seq("VHLTPeeK*", HasStopCodon(generic_protein))
659 >>> my_seq
660 Seq('VHLTPeeK*', HasStopCodon(ProteinAlphabet(), '*'))
661 >>> my_seq.lower()
662 Seq('vhltpeek*', HasStopCodon(ProteinAlphabet(), '*'))
663 >>> my_seq.upper()
664 Seq('VHLTPEEK*', HasStopCodon(ProteinAlphabet(), '*'))
665
666 This will adjust the alphabet if required. See also the lower method.
667 """
668 return Seq(str(self).upper(), self.alphabet._upper())
669
671 """Returns a lower case copy of the sequence.
672
673 This will adjust the alphabet if required. Note that the IUPAC alphabets
674 are upper case only, and thus a generic alphabet must be substituted.
675
676 >>> from Bio.Alphabet import Gapped, generic_dna
677 >>> from Bio.Alphabet import IUPAC
678 >>> from Bio.Seq import Seq
679 >>> my_seq = Seq("CGGTACGCTTATGTCACGTAG*AAAAAA", Gapped(IUPAC.unambiguous_dna, "*"))
680 >>> my_seq
681 Seq('CGGTACGCTTATGTCACGTAG*AAAAAA', Gapped(IUPACUnambiguousDNA(), '*'))
682 >>> my_seq.lower()
683 Seq('cggtacgcttatgtcacgtag*aaaaaa', Gapped(DNAAlphabet(), '*'))
684
685 See also the upper method.
686 """
687 return Seq(str(self).lower(), self.alphabet._lower())
688
690 """Returns the complement sequence. New Seq object.
691
692 >>> from Bio.Seq import Seq
693 >>> from Bio.Alphabet import IUPAC
694 >>> my_dna = Seq("CCCCCGATAG", IUPAC.unambiguous_dna)
695 >>> my_dna
696 Seq('CCCCCGATAG', IUPACUnambiguousDNA())
697 >>> my_dna.complement()
698 Seq('GGGGGCTATC', IUPACUnambiguousDNA())
699
700 You can of course used mixed case sequences,
701
702 >>> from Bio.Seq import Seq
703 >>> from Bio.Alphabet import generic_dna
704 >>> my_dna = Seq("CCCCCgatA-GD", generic_dna)
705 >>> my_dna
706 Seq('CCCCCgatA-GD', DNAAlphabet())
707 >>> my_dna.complement()
708 Seq('GGGGGctaT-CH', DNAAlphabet())
709
710 Note in the above example, ambiguous character D denotes
711 G, A or T so its complement is H (for C, T or A).
712
713 Trying to complement a protein sequence raises an exception.
714
715 >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
716 >>> my_protein.complement()
717 Traceback (most recent call last):
718 ...
719 ValueError: Proteins do not have complements!
720 """
721 base = Alphabet._get_base_alphabet(self.alphabet)
722 if isinstance(base, Alphabet.ProteinAlphabet):
723 raise ValueError("Proteins do not have complements!")
724 if isinstance(base, Alphabet.DNAAlphabet):
725 ttable = _dna_complement_table
726 elif isinstance(base, Alphabet.RNAAlphabet):
727 ttable = _rna_complement_table
728 elif ('U' in self._data or 'u' in self._data) \
729 and ('T' in self._data or 't' in self._data):
730
731 raise ValueError("Mixed RNA/DNA found")
732 elif 'U' in self._data or 'u' in self._data:
733 ttable = _rna_complement_table
734 else:
735 ttable = _dna_complement_table
736
737
738 return Seq(str(self).translate(ttable), self.alphabet)
739
741 """Returns the reverse complement sequence. New Seq object.
742
743 >>> from Bio.Seq import Seq
744 >>> from Bio.Alphabet import IUPAC
745 >>> my_dna = Seq("CCCCCGATAGNR", IUPAC.ambiguous_dna)
746 >>> my_dna
747 Seq('CCCCCGATAGNR', IUPACAmbiguousDNA())
748 >>> my_dna.reverse_complement()
749 Seq('YNCTATCGGGGG', IUPACAmbiguousDNA())
750
751 Note in the above example, since R = G or A, its complement
752 is Y (which denotes C or T).
753
754 You can of course used mixed case sequences,
755
756 >>> from Bio.Seq import Seq
757 >>> from Bio.Alphabet import generic_dna
758 >>> my_dna = Seq("CCCCCgatA-G", generic_dna)
759 >>> my_dna
760 Seq('CCCCCgatA-G', DNAAlphabet())
761 >>> my_dna.reverse_complement()
762 Seq('C-TatcGGGGG', DNAAlphabet())
763
764 Trying to complement a protein sequence raises an exception:
765
766 >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
767 >>> my_protein.reverse_complement()
768 Traceback (most recent call last):
769 ...
770 ValueError: Proteins do not have complements!
771 """
772
773 return self.complement()[::-1]
774
776 """Returns the RNA sequence from a DNA sequence. New Seq object.
777
778 >>> from Bio.Seq import Seq
779 >>> from Bio.Alphabet import IUPAC
780 >>> coding_dna = Seq("ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG",
781 ... IUPAC.unambiguous_dna)
782 >>> coding_dna
783 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA())
784 >>> coding_dna.transcribe()
785 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA())
786
787 Trying to transcribe a protein or RNA sequence raises an exception:
788
789 >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
790 >>> my_protein.transcribe()
791 Traceback (most recent call last):
792 ...
793 ValueError: Proteins cannot be transcribed!
794 """
795 base = Alphabet._get_base_alphabet(self.alphabet)
796 if isinstance(base, Alphabet.ProteinAlphabet):
797 raise ValueError("Proteins cannot be transcribed!")
798 if isinstance(base, Alphabet.RNAAlphabet):
799 raise ValueError("RNA cannot be transcribed!")
800
801 if self.alphabet==IUPAC.unambiguous_dna:
802 alphabet = IUPAC.unambiguous_rna
803 elif self.alphabet==IUPAC.ambiguous_dna:
804 alphabet = IUPAC.ambiguous_rna
805 else:
806 alphabet = Alphabet.generic_rna
807 return Seq(str(self).replace('T','U').replace('t','u'), alphabet)
808
810 """Returns the DNA sequence from an RNA sequence. New Seq object.
811
812 >>> from Bio.Seq import Seq
813 >>> from Bio.Alphabet import IUPAC
814 >>> messenger_rna = Seq("AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG",
815 ... IUPAC.unambiguous_rna)
816 >>> messenger_rna
817 Seq('AUGGCCAUUGUAAUGGGCCGCUGAAAGGGUGCCCGAUAG', IUPACUnambiguousRNA())
818 >>> messenger_rna.back_transcribe()
819 Seq('ATGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG', IUPACUnambiguousDNA())
820
821 Trying to back-transcribe a protein or DNA sequence raises an
822 exception:
823
824 >>> my_protein = Seq("MAIVMGR", IUPAC.protein)
825 >>> my_protein.back_transcribe()
826 Traceback (most recent call last):
827 ...
828 ValueError: Proteins cannot be back transcribed!
829 """
830 base = Alphabet._get_base_alphabet(self.alphabet)
831 if isinstance(base, Alphabet.ProteinAlphabet):
832 raise ValueError("Proteins cannot be back transcribed!")
833 if isinstance(base, Alphabet.DNAAlphabet):
834 raise ValueError("DNA cannot be back transcribed!")
835
836 if self.alphabet==IUPAC.unambiguous_rna:
837 alphabet = IUPAC.unambiguous_dna
838 elif self.alphabet==IUPAC.ambiguous_rna:
839 alphabet = IUPAC.ambiguous_dna
840 else:
841 alphabet = Alphabet.generic_dna
842 return Seq(str(self).replace("U", "T").replace("u", "t"), alphabet)
843
844 - def translate(self, table="Standard", stop_symbol="*", to_stop=False,
845 cds=False):
846 """Turns a nucleotide sequence into a protein sequence. New Seq object.
847
848 This method will translate DNA or RNA sequences, and those with a
849 nucleotide or generic alphabet. Trying to translate a protein
850 sequence raises an exception.
851
852 Arguments:
853 - table - Which codon table to use? This can be either a name
854 (string) or an NCBI identifier (integer). This defaults
855 to the "Standard" table.
856 - stop_symbol - Single character string, what to use for terminators.
857 This defaults to the asterisk, "*".
858 - to_stop - Boolean, defaults to False meaning do a full translation
859 continuing on past any stop codons (translated as the
860 specified stop_symbol). If True, translation is
861 terminated at the first in frame stop codon (and the
862 stop_symbol is not appended to the returned protein
863 sequence).
864 - cds - Boolean, indicates this is a complete CDS. If True,
865 this checks the sequence starts with a valid alternative start
866 codon (which will be translated as methionine, M), that the
867 sequence length is a multiple of three, and that there is a
868 single in frame stop codon at the end (this will be excluded
869 from the protein sequence, regardless of the to_stop option).
870 If these tests fail, an exception is raised.
871
872 e.g. Using the standard table:
873
874 >>> coding_dna = Seq("GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG")
875 >>> coding_dna.translate()
876 Seq('VAIVMGR*KGAR*', HasStopCodon(ExtendedIUPACProtein(), '*'))
877 >>> coding_dna.translate(stop_symbol="@")
878 Seq('VAIVMGR@KGAR@', HasStopCodon(ExtendedIUPACProtein(), '@'))
879 >>> coding_dna.translate(to_stop=True)
880 Seq('VAIVMGR', ExtendedIUPACProtein())
881
882 Now using NCBI table 2, where TGA is not a stop codon:
883
884 >>> coding_dna.translate(table=2)
885 Seq('VAIVMGRWKGAR*', HasStopCodon(ExtendedIUPACProtein(), '*'))
886 >>> coding_dna.translate(table=2, to_stop=True)
887 Seq('VAIVMGRWKGAR', ExtendedIUPACProtein())
888
889 In fact, GTG is an alternative start codon under NCBI table 2, meaning
890 this sequence could be a complete CDS:
891
892 >>> coding_dna.translate(table=2, cds=True)
893 Seq('MAIVMGRWKGAR', ExtendedIUPACProtein())
894
895 It isn't a valid CDS under NCBI table 1, due to both the start codon and
896 also the in frame stop codons:
897
898 >>> coding_dna.translate(table=1, cds=True)
899 Traceback (most recent call last):
900 ...
901 TranslationError: First codon 'GTG' is not a start codon
902
903 If the sequence has no in-frame stop codon, then the to_stop argument
904 has no effect:
905
906 >>> coding_dna2 = Seq("TTGGCCATTGTAATGGGCCGC")
907 >>> coding_dna2.translate()
908 Seq('LAIVMGR', ExtendedIUPACProtein())
909 >>> coding_dna2.translate(to_stop=True)
910 Seq('LAIVMGR', ExtendedIUPACProtein())
911
912 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
913 or a stop codon. These are translated as "X". Any invalid codon
914 (e.g. "TA?" or "T-A") will throw a TranslationError.
915
916 NOTE - Does NOT support gapped sequences.
917
918 NOTE - This does NOT behave like the python string's translate
919 method. For that use str(my_seq).translate(...) instead.
920 """
921 try:
922 table_id = int(table)
923 except ValueError:
924 table_id = None
925 if isinstance(table, str) and len(table)==256:
926 raise ValueError("The Seq object translate method DOES NOT take " \
927 + "a 256 character string mapping table like " \
928 + "the python string object's translate method. " \
929 + "Use str(my_seq).translate(...) instead.")
930 if isinstance(Alphabet._get_base_alphabet(self.alphabet),
931 Alphabet.ProteinAlphabet):
932 raise ValueError("Proteins cannot be translated!")
933 if self.alphabet==IUPAC.unambiguous_dna:
934
935 if table_id is None:
936 codon_table = CodonTable.unambiguous_dna_by_name[table]
937 else:
938 codon_table = CodonTable.unambiguous_dna_by_id[table_id]
939 elif self.alphabet==IUPAC.unambiguous_rna:
940
941 if table_id is None:
942 codon_table = CodonTable.unambiguous_rna_by_name[table]
943 else:
944 codon_table = CodonTable.unambiguous_rna_by_id[table_id]
945 else:
946
947
948
949 if table_id is None:
950 codon_table = CodonTable.ambiguous_generic_by_name[table]
951 else:
952 codon_table = CodonTable.ambiguous_generic_by_id[table_id]
953 protein = _translate_str(str(self), codon_table, \
954 stop_symbol, to_stop, cds)
955 if stop_symbol in protein:
956 alphabet = Alphabet.HasStopCodon(codon_table.protein_alphabet,
957 stop_symbol = stop_symbol)
958 else:
959 alphabet = codon_table.protein_alphabet
960 return Seq(protein, alphabet)
961
962 - def ungap(self, gap=None):
963 """Return a copy of the sequence without the gap character(s).
964
965 The gap character can be specified in two ways - either as an explicit
966 argument, or via the sequence's alphabet. For example:
967
968 >>> from Bio.Seq import Seq
969 >>> from Bio.Alphabet import generic_dna
970 >>> my_dna = Seq("-ATA--TGAAAT-TTGAAAA", generic_dna)
971 >>> my_dna
972 Seq('-ATA--TGAAAT-TTGAAAA', DNAAlphabet())
973 >>> my_dna.ungap("-")
974 Seq('ATATGAAATTTGAAAA', DNAAlphabet())
975
976 If the gap character is not given as an argument, it will be taken from
977 the sequence's alphabet (if defined). Notice that the returned sequence's
978 alphabet is adjusted since it no longer requires a gapped alphabet:
979
980 >>> from Bio.Seq import Seq
981 >>> from Bio.Alphabet import IUPAC, Gapped, HasStopCodon
982 >>> my_pro = Seq("MVVLE=AD*", HasStopCodon(Gapped(IUPAC.protein, "=")))
983 >>> my_pro
984 Seq('MVVLE=AD*', HasStopCodon(Gapped(IUPACProtein(), '='), '*'))
985 >>> my_pro.ungap()
986 Seq('MVVLEAD*', HasStopCodon(IUPACProtein(), '*'))
987
988 Or, with a simpler gapped DNA example:
989
990 >>> from Bio.Seq import Seq
991 >>> from Bio.Alphabet import IUPAC, Gapped
992 >>> my_seq = Seq("CGGGTAG=AAAAAA", Gapped(IUPAC.unambiguous_dna, "="))
993 >>> my_seq
994 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '='))
995 >>> my_seq.ungap()
996 Seq('CGGGTAGAAAAAA', IUPACUnambiguousDNA())
997
998 As long as it is consistent with the alphabet, although it is redundant,
999 you can still supply the gap character as an argument to this method:
1000
1001 >>> my_seq
1002 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '='))
1003 >>> my_seq.ungap("=")
1004 Seq('CGGGTAGAAAAAA', IUPACUnambiguousDNA())
1005
1006 However, if the gap character given as the argument disagrees with that
1007 declared in the alphabet, an exception is raised:
1008
1009 >>> my_seq
1010 Seq('CGGGTAG=AAAAAA', Gapped(IUPACUnambiguousDNA(), '='))
1011 >>> my_seq.ungap("-")
1012 Traceback (most recent call last):
1013 ...
1014 ValueError: Gap '-' does not match '=' from alphabet
1015
1016 Finally, if a gap character is not supplied, and the alphabet does not
1017 define one, an exception is raised:
1018
1019 >>> from Bio.Seq import Seq
1020 >>> from Bio.Alphabet import generic_dna
1021 >>> my_dna = Seq("ATA--TGAAAT-TTGAAAA", generic_dna)
1022 >>> my_dna
1023 Seq('ATA--TGAAAT-TTGAAAA', DNAAlphabet())
1024 >>> my_dna.ungap()
1025 Traceback (most recent call last):
1026 ...
1027 ValueError: Gap character not given and not defined in alphabet
1028
1029 """
1030 if hasattr(self.alphabet, "gap_char"):
1031 if not gap:
1032 gap = self.alphabet.gap_char
1033 elif gap != self.alphabet.gap_char:
1034 raise ValueError("Gap %s does not match %s from alphabet" \
1035 % (repr(gap), repr(self.alphabet.gap_char)))
1036 alpha = Alphabet._ungap(self.alphabet)
1037 elif not gap:
1038 raise ValueError("Gap character not given and not defined in alphabet")
1039 else:
1040 alpha = self.alphabet
1041 if len(gap)!=1 or not isinstance(gap, str):
1042 raise ValueError("Unexpected gap character, %s" % repr(gap))
1043 return Seq(str(self).replace(gap, ""), alpha)
1044
1046 """A read-only sequence object of known length but unknown contents.
1047
1048 If you have an unknown sequence, you can represent this with a normal
1049 Seq object, for example:
1050
1051 >>> my_seq = Seq("N"*5)
1052 >>> my_seq
1053 Seq('NNNNN', Alphabet())
1054 >>> len(my_seq)
1055 5
1056 >>> print my_seq
1057 NNNNN
1058
1059 However, this is rather wasteful of memory (especially for large
1060 sequences), which is where this class is most usefull:
1061
1062 >>> unk_five = UnknownSeq(5)
1063 >>> unk_five
1064 UnknownSeq(5, alphabet = Alphabet(), character = '?')
1065 >>> len(unk_five)
1066 5
1067 >>> print(unk_five)
1068 ?????
1069
1070 You can add unknown sequence together, provided their alphabets and
1071 characters are compatible, and get another memory saving UnknownSeq:
1072
1073 >>> unk_four = UnknownSeq(4)
1074 >>> unk_four
1075 UnknownSeq(4, alphabet = Alphabet(), character = '?')
1076 >>> unk_four + unk_five
1077 UnknownSeq(9, alphabet = Alphabet(), character = '?')
1078
1079 If the alphabet or characters don't match up, the addition gives an
1080 ordinary Seq object:
1081
1082 >>> unk_nnnn = UnknownSeq(4, character = "N")
1083 >>> unk_nnnn
1084 UnknownSeq(4, alphabet = Alphabet(), character = 'N')
1085 >>> unk_nnnn + unk_four
1086 Seq('NNNN????', Alphabet())
1087
1088 Combining with a real Seq gives a new Seq object:
1089
1090 >>> known_seq = Seq("ACGT")
1091 >>> unk_four + known_seq
1092 Seq('????ACGT', Alphabet())
1093 >>> known_seq + unk_four
1094 Seq('ACGT????', Alphabet())
1095 """
1097 """Create a new UnknownSeq object.
1098
1099 If character is ommited, it is determed from the alphabet, "N" for
1100 nucleotides, "X" for proteins, and "?" otherwise.
1101 """
1102 self._length = int(length)
1103 if self._length < 0:
1104
1105 raise ValueError("Length must not be negative.")
1106 self.alphabet = alphabet
1107 if character:
1108 if len(character) != 1:
1109 raise ValueError("character argument should be a single letter string.")
1110 self._character = character
1111 else:
1112 base = Alphabet._get_base_alphabet(alphabet)
1113
1114
1115 if isinstance(base, Alphabet.NucleotideAlphabet):
1116 self._character = "N"
1117 elif isinstance(base, Alphabet.ProteinAlphabet):
1118 self._character = "X"
1119 else:
1120 self._character = "?"
1121
1123 """Returns the stated length of the unknown sequence."""
1124 return self._length
1125
1127 """Returns the unknown sequence as full string of the given length."""
1128 return self._character * self._length
1129
1131 return "UnknownSeq(%i, alphabet = %s, character = %s)" \
1132 % (self._length, repr(self.alphabet), repr(self._character))
1133
1135 """Add another sequence or string to this sequence.
1136
1137 Adding two UnknownSeq objects returns another UnknownSeq object
1138 provided the character is the same and the alphabets are compatible.
1139
1140 >>> from Bio.Seq import UnknownSeq
1141 >>> from Bio.Alphabet import generic_protein
1142 >>> UnknownSeq(10, generic_protein) + UnknownSeq(5, generic_protein)
1143 UnknownSeq(15, alphabet = ProteinAlphabet(), character = 'X')
1144
1145 If the characters differ, an UnknownSeq object cannot be used, so a
1146 Seq object is returned:
1147
1148 >>> from Bio.Seq import UnknownSeq
1149 >>> from Bio.Alphabet import generic_protein
1150 >>> UnknownSeq(10, generic_protein) + UnknownSeq(5, generic_protein,
1151 ... character="x")
1152 Seq('XXXXXXXXXXxxxxx', ProteinAlphabet())
1153
1154 If adding a string to an UnknownSeq, a new Seq is returned with the
1155 same alphabet:
1156
1157 >>> from Bio.Seq import UnknownSeq
1158 >>> from Bio.Alphabet import generic_protein
1159 >>> UnknownSeq(5, generic_protein) + "LV"
1160 Seq('XXXXXLV', ProteinAlphabet())
1161 """
1162 if isinstance(other, UnknownSeq) \
1163 and other._character == self._character:
1164
1165 return UnknownSeq(len(self)+len(other),
1166 self.alphabet, self._character)
1167
1168 return Seq(str(self), self.alphabet) + other
1169
1174
1183
1184 - def count(self, sub, start=0, end=sys.maxint):
1185 """Non-overlapping count method, like that of a python string.
1186
1187 This behaves like the python string (and Seq object) method of the
1188 same name, which does a non-overlapping count!
1189
1190 Returns an integer, the number of occurrences of substring
1191 argument sub in the (sub)sequence given by [start:end].
1192 Optional arguments start and end are interpreted as in slice
1193 notation.
1194
1195 Arguments:
1196 - sub - a string or another Seq object to look for
1197 - start - optional integer, slice start
1198 - end - optional integer, slice end
1199
1200 >>> "NNNN".count("N")
1201 4
1202 >>> Seq("NNNN").count("N")
1203 4
1204 >>> UnknownSeq(4, character="N").count("N")
1205 4
1206 >>> UnknownSeq(4, character="N").count("A")
1207 0
1208 >>> UnknownSeq(4, character="N").count("AA")
1209 0
1210
1211 HOWEVER, please note because that python strings and Seq objects (and
1212 MutableSeq objects) do a non-overlapping search, this may not give
1213 the answer you expect:
1214
1215 >>> UnknownSeq(4, character="N").count("NN")
1216 2
1217 >>> UnknownSeq(4, character="N").count("NNN")
1218 1
1219 """
1220 sub_str = self._get_seq_str_and_check_alphabet(sub)
1221 if len(sub_str) == 1:
1222 if str(sub_str) == self._character:
1223 if start==0 and end >= self._length:
1224 return self._length
1225 else:
1226
1227 return str(self).count(sub_str, start, end)
1228 else:
1229 return 0
1230 else:
1231 if set(sub_str) == set(self._character):
1232 if start==0 and end >= self._length:
1233 return self._length // len(sub_str)
1234 else:
1235
1236 return str(self).count(sub_str, start, end)
1237 else:
1238 return 0
1239
1241 """The complement of an unknown nucleotide equals itself.
1242
1243 >>> my_nuc = UnknownSeq(8)
1244 >>> my_nuc
1245 UnknownSeq(8, alphabet = Alphabet(), character = '?')
1246 >>> print my_nuc
1247 ????????
1248 >>> my_nuc.complement()
1249 UnknownSeq(8, alphabet = Alphabet(), character = '?')
1250 >>> print my_nuc.complement()
1251 ????????
1252 """
1253 if isinstance(Alphabet._get_base_alphabet(self.alphabet),
1254 Alphabet.ProteinAlphabet):
1255 raise ValueError("Proteins do not have complements!")
1256 return self
1257
1259 """The reverse complement of an unknown nucleotide equals itself.
1260
1261 >>> my_nuc = UnknownSeq(10)
1262 >>> my_nuc
1263 UnknownSeq(10, alphabet = Alphabet(), character = '?')
1264 >>> print my_nuc
1265 ??????????
1266 >>> my_nuc.reverse_complement()
1267 UnknownSeq(10, alphabet = Alphabet(), character = '?')
1268 >>> print my_nuc.reverse_complement()
1269 ??????????
1270 """
1271 if isinstance(Alphabet._get_base_alphabet(self.alphabet),
1272 Alphabet.ProteinAlphabet):
1273 raise ValueError("Proteins do not have complements!")
1274 return self
1275
1277 """Returns unknown RNA sequence from an unknown DNA sequence.
1278
1279 >>> my_dna = UnknownSeq(10, character="N")
1280 >>> my_dna
1281 UnknownSeq(10, alphabet = Alphabet(), character = 'N')
1282 >>> print my_dna
1283 NNNNNNNNNN
1284 >>> my_rna = my_dna.transcribe()
1285 >>> my_rna
1286 UnknownSeq(10, alphabet = RNAAlphabet(), character = 'N')
1287 >>> print my_rna
1288 NNNNNNNNNN
1289 """
1290
1291 s = Seq(self._character, self.alphabet).transcribe()
1292 return UnknownSeq(self._length, s.alphabet, self._character)
1293
1295 """Returns unknown DNA sequence from an unknown RNA sequence.
1296
1297 >>> my_rna = UnknownSeq(20, character="N")
1298 >>> my_rna
1299 UnknownSeq(20, alphabet = Alphabet(), character = 'N')
1300 >>> print my_rna
1301 NNNNNNNNNNNNNNNNNNNN
1302 >>> my_dna = my_rna.back_transcribe()
1303 >>> my_dna
1304 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N')
1305 >>> print my_dna
1306 NNNNNNNNNNNNNNNNNNNN
1307 """
1308
1309 s = Seq(self._character, self.alphabet).back_transcribe()
1310 return UnknownSeq(self._length, s.alphabet, self._character)
1311
1313 """Returns an upper case copy of the sequence.
1314
1315 >>> from Bio.Alphabet import generic_dna
1316 >>> from Bio.Seq import UnknownSeq
1317 >>> my_seq = UnknownSeq(20, generic_dna, character="n")
1318 >>> my_seq
1319 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'n')
1320 >>> print my_seq
1321 nnnnnnnnnnnnnnnnnnnn
1322 >>> my_seq.upper()
1323 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N')
1324 >>> print my_seq.upper()
1325 NNNNNNNNNNNNNNNNNNNN
1326
1327 This will adjust the alphabet if required. See also the lower method.
1328 """
1329 return UnknownSeq(self._length, self.alphabet._upper(), self._character.upper())
1330
1332 """Returns a lower case copy of the sequence.
1333
1334 This will adjust the alphabet if required:
1335
1336 >>> from Bio.Alphabet import IUPAC
1337 >>> from Bio.Seq import UnknownSeq
1338 >>> my_seq = UnknownSeq(20, IUPAC.extended_protein)
1339 >>> my_seq
1340 UnknownSeq(20, alphabet = ExtendedIUPACProtein(), character = 'X')
1341 >>> print my_seq
1342 XXXXXXXXXXXXXXXXXXXX
1343 >>> my_seq.lower()
1344 UnknownSeq(20, alphabet = ProteinAlphabet(), character = 'x')
1345 >>> print my_seq.lower()
1346 xxxxxxxxxxxxxxxxxxxx
1347
1348 See also the upper method.
1349 """
1350 return UnknownSeq(self._length, self.alphabet._lower(), self._character.lower())
1351
1353 """Translate an unknown nucleotide sequence into an unknown protein.
1354
1355 e.g.
1356
1357 >>> my_seq = UnknownSeq(11, character="N")
1358 >>> print my_seq
1359 NNNNNNNNNNN
1360 >>> my_protein = my_seq.translate()
1361 >>> my_protein
1362 UnknownSeq(3, alphabet = ProteinAlphabet(), character = 'X')
1363 >>> print my_protein
1364 XXX
1365
1366 In comparison, using a normal Seq object:
1367
1368 >>> my_seq = Seq("NNNNNNNNNNN")
1369 >>> print my_seq
1370 NNNNNNNNNNN
1371 >>> my_protein = my_seq.translate()
1372 >>> my_protein
1373 Seq('XXX', ExtendedIUPACProtein())
1374 >>> print my_protein
1375 XXX
1376
1377 """
1378 if isinstance(Alphabet._get_base_alphabet(self.alphabet),
1379 Alphabet.ProteinAlphabet):
1380 raise ValueError("Proteins cannot be translated!")
1381 return UnknownSeq(self._length//3, Alphabet.generic_protein, "X")
1382
1383 - def ungap(self, gap=None):
1384 """Return a copy of the sequence without the gap character(s).
1385
1386 The gap character can be specified in two ways - either as an explicit
1387 argument, or via the sequence's alphabet. For example:
1388
1389 >>> from Bio.Seq import UnknownSeq
1390 >>> from Bio.Alphabet import Gapped, generic_dna
1391 >>> my_dna = UnknownSeq(20, Gapped(generic_dna,"-"))
1392 >>> my_dna
1393 UnknownSeq(20, alphabet = Gapped(DNAAlphabet(), '-'), character = 'N')
1394 >>> my_dna.ungap()
1395 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N')
1396 >>> my_dna.ungap("-")
1397 UnknownSeq(20, alphabet = DNAAlphabet(), character = 'N')
1398
1399 If the UnknownSeq is using the gap character, then an empty Seq is
1400 returned:
1401
1402 >>> my_gap = UnknownSeq(20, Gapped(generic_dna,"-"), character="-")
1403 >>> my_gap
1404 UnknownSeq(20, alphabet = Gapped(DNAAlphabet(), '-'), character = '-')
1405 >>> my_gap.ungap()
1406 Seq('', DNAAlphabet())
1407 >>> my_gap.ungap("-")
1408 Seq('', DNAAlphabet())
1409
1410 Notice that the returned sequence's alphabet is adjusted to remove any
1411 explicit gap character declaration.
1412 """
1413
1414 s = Seq(self._character, self.alphabet).ungap()
1415 if s :
1416 return UnknownSeq(self._length, s.alphabet, self._character)
1417 else :
1418 return Seq("", s.alphabet)
1419
1421 """An editable sequence object (with an alphabet).
1422
1423 Unlike normal python strings and our basic sequence object (the Seq class)
1424 which are immuatable, the MutableSeq lets you edit the sequence in place.
1425 However, this means you cannot use a MutableSeq object as a dictionary key.
1426
1427 >>> from Bio.Seq import MutableSeq
1428 >>> from Bio.Alphabet import generic_dna
1429 >>> my_seq = MutableSeq("ACTCGTCGTCG", generic_dna)
1430 >>> my_seq
1431 MutableSeq('ACTCGTCGTCG', DNAAlphabet())
1432 >>> my_seq[5]
1433 'T'
1434 >>> my_seq[5] = "A"
1435 >>> my_seq
1436 MutableSeq('ACTCGACGTCG', DNAAlphabet())
1437 >>> my_seq[5]
1438 'A'
1439 >>> my_seq[5:8] = "NNN"
1440 >>> my_seq
1441 MutableSeq('ACTCGNNNTCG', DNAAlphabet())
1442 >>> len(my_seq)
1443 11
1444
1445 Note that the MutableSeq object does not support as many string-like
1446 or biological methods as the Seq object.
1447 """
1454
1456 """Returns a (truncated) representation of the sequence for debugging."""
1457 if len(self) > 60:
1458
1459
1460
1461 return "%s('%s...%s', %s)" % (self.__class__.__name__,
1462 str(self[:54]), str(self[-3:]),
1463 repr(self.alphabet))
1464 else:
1465 return "%s('%s', %s)" % (self.__class__.__name__,
1466 str(self),
1467 repr(self.alphabet))
1468
1470 """Returns the full sequence as a python string.
1471
1472 Note that Biopython 1.44 and earlier would give a truncated
1473 version of repr(my_seq) for str(my_seq). If you are writing code
1474 which needs to be backwards compatible with old Biopython, you
1475 should continue to use my_seq.tostring() rather than str(my_seq).
1476 """
1477
1478 return "".join(self.data)
1479
1481 """Compare the sequence to another sequence or a string (README).
1482
1483 Currently if compared to another sequence the alphabets must be
1484 compatible. Comparing DNA to RNA, or Nucleotide to Protein will raise
1485 an exception. Otherwise only the sequence itself is compared, not the
1486 precise alphabet.
1487
1488 A future release of Biopython will change this (and the Seq object etc)
1489 to use simple string comparison. The plan is that comparing sequences
1490 with incompatible alphabets (e.g. DNA to RNA) will trigger a warning
1491 but not an exception.
1492
1493 During this transition period, please just do explicit comparisons:
1494
1495 >>> seq1 = MutableSeq("ACGT")
1496 >>> seq2 = MutableSeq("ACGT")
1497 >>> id(seq1) == id(seq2)
1498 False
1499 >>> str(seq1) == str(seq2)
1500 True
1501
1502 This method indirectly supports ==, < , etc.
1503 """
1504 if hasattr(other, "alphabet"):
1505
1506 import warnings
1507 warnings.warn("In future comparing incompatible alphabets will "
1508 "only trigger a warning (not an exception). In "
1509 "the interim please use id(seq1)==id(seq2) or "
1510 "str(seq1)==str(seq2) to make your code explicit "
1511 "and to avoid this warning.", FutureWarning)
1512 if not Alphabet._check_type_compatible([self.alphabet,
1513 other.alphabet]):
1514 raise TypeError("Incompatable alphabets %s and %s" \
1515 % (repr(self.alphabet), repr(other.alphabet)))
1516
1517 if isinstance(other, MutableSeq):
1518
1519
1520 return cmp(self.data, other.data)
1521 else:
1522 return cmp(str(self), str(other))
1523 elif isinstance(other, basestring):
1524 return cmp(str(self), other)
1525 else:
1526 raise TypeError
1527
1529
1540
1556
1558
1559
1560
1561
1562
1563 del self.data[index]
1564
1588
1609
1612
1615
1616 - def pop(self, i = (-1)):
1617 c = self.data[i]
1618 del self.data[i]
1619 return c
1620
1622 for i in range(len(self.data)):
1623 if self.data[i] == item:
1624 del self.data[i]
1625 return
1626 raise ValueError("MutableSeq.remove(x): x not in list")
1627
1628 - def count(self, sub, start=0, end=sys.maxint):
1629 """Non-overlapping count method, like that of a python string.
1630
1631 This behaves like the python string method of the same name,
1632 which does a non-overlapping count!
1633
1634 Returns an integer, the number of occurrences of substring
1635 argument sub in the (sub)sequence given by [start:end].
1636 Optional arguments start and end are interpreted as in slice
1637 notation.
1638
1639 Arguments:
1640 - sub - a string or another Seq object to look for
1641 - start - optional integer, slice start
1642 - end - optional integer, slice end
1643
1644 e.g.
1645
1646 >>> from Bio.Seq import MutableSeq
1647 >>> my_mseq = MutableSeq("AAAATGA")
1648 >>> print my_mseq.count("A")
1649 5
1650 >>> print my_mseq.count("ATG")
1651 1
1652 >>> print my_mseq.count(Seq("AT"))
1653 1
1654 >>> print my_mseq.count("AT", 2, -1)
1655 1
1656
1657 HOWEVER, please note because that python strings, Seq objects and
1658 MutableSeq objects do a non-overlapping search, this may not give
1659 the answer you expect:
1660
1661 >>> "AAAA".count("AA")
1662 2
1663 >>> print MutableSeq("AAAA").count("AA")
1664 2
1665
1666 A non-overlapping search would give the answer as three!
1667 """
1668 try:
1669
1670 search = sub.tostring()
1671 except AttributeError:
1672 search = sub
1673
1674 if not isinstance(search, basestring):
1675 raise TypeError("expected a string, Seq or MutableSeq")
1676
1677 if len(search) == 1:
1678
1679 count = 0
1680 for c in self.data[start:end]:
1681 if c == search: count += 1
1682 return count
1683 else:
1684
1685 return self.tostring().count(search, start, end)
1686
1688 for i in range(len(self.data)):
1689 if self.data[i] == item:
1690 return i
1691 raise ValueError("MutableSeq.index(x): x not in list")
1692
1694 """Modify the mutable sequence to reverse itself.
1695
1696 No return value.
1697 """
1698 self.data.reverse()
1699
1725
1727 """Modify the mutable sequence to take on its reverse complement.
1728
1729 Trying to reverse complement a protein sequence raises an exception.
1730
1731 No return value.
1732 """
1733 self.complement()
1734 self.data.reverse()
1735
1736
1737
1738
1746
1748 """Returns the full sequence as a python string.
1749
1750 Although not formally deprecated, you are now encouraged to use
1751 str(my_seq) instead of my_seq.tostring().
1752
1753 Because str(my_seq) will give you the full sequence as a python string,
1754 there is often no need to make an explicit conversion. For example,
1755
1756 print "ID={%s}, sequence={%s}" % (my_name, my_seq)
1757
1758 On Biopython 1.44 or older you would have to have done this:
1759
1760 print "ID={%s}, sequence={%s}" % (my_name, my_seq.tostring())
1761 """
1762 return "".join(self.data)
1763
1765 """Returns the full sequence as a new immutable Seq object.
1766
1767 >>> from Bio.Seq import Seq
1768 >>> from Bio.Alphabet import IUPAC
1769 >>> my_mseq = MutableSeq("MKQHKAMIVALIVICITAVVAAL", \
1770 IUPAC.protein)
1771 >>> my_mseq
1772 MutableSeq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
1773 >>> my_mseq.toseq()
1774 Seq('MKQHKAMIVALIVICITAVVAAL', IUPACProtein())
1775
1776 Note that the alphabet is preserved.
1777 """
1778 return Seq("".join(self.data), self.alphabet)
1779
1785 """Transcribes a DNA sequence into RNA.
1786
1787 If given a string, returns a new string object.
1788
1789 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet.
1790
1791 Trying to transcribe a protein or RNA sequence raises an exception.
1792
1793 e.g.
1794
1795 >>> transcribe("ACTGN")
1796 'ACUGN'
1797 """
1798 if isinstance(dna, Seq):
1799 return dna.transcribe()
1800 elif isinstance(dna, MutableSeq):
1801 return dna.toseq().transcribe()
1802 else:
1803 return dna.replace('T','U').replace('t','u')
1804
1806 """Back-transcribes an RNA sequence into DNA.
1807
1808 If given a string, returns a new string object.
1809
1810 Given a Seq or MutableSeq, returns a new Seq object with an RNA alphabet.
1811
1812 Trying to transcribe a protein or DNA sequence raises an exception.
1813
1814 e.g.
1815
1816 >>> back_transcribe("ACUGN")
1817 'ACTGN'
1818 """
1819 if isinstance(rna, Seq):
1820 return rna.back_transcribe()
1821 elif isinstance(rna, MutableSeq):
1822 return rna.toseq().back_transcribe()
1823 else:
1824 return rna.replace('U','T').replace('u','t')
1825
1826 -def _translate_str(sequence, table, stop_symbol="*", to_stop=False,
1827 cds=False, pos_stop="X"):
1828 """Helper function to translate a nucleotide string (PRIVATE).
1829
1830 Arguments:
1831 - sequence - a string
1832 - table - a CodonTable object (NOT a table name or id number)
1833 - stop_symbol - a single character string, what to use for terminators.
1834 - to_stop - boolean, should translation terminate at the first
1835 in frame stop codon? If there is no in-frame stop codon
1836 then translation continues to the end.
1837 - pos_stop - a single character string for a possible stop codon
1838 (e.g. TAN or NNN)
1839 - cds - Boolean, indicates this is a complete CDS. If True, this
1840 checks the sequence starts with a valid alternative start
1841 codon (which will be translated as methionine, M), that the
1842 sequence length is a multiple of three, and that there is a
1843 single in frame stop codon at the end (this will be excluded
1844 from the protein sequence, regardless of the to_stop option).
1845 If these tests fail, an exception is raised.
1846
1847 Returns a string.
1848
1849 e.g.
1850
1851 >>> from Bio.Data import CodonTable
1852 >>> table = CodonTable.ambiguous_dna_by_id[1]
1853 >>> _translate_str("AAA", table)
1854 'K'
1855 >>> _translate_str("TAR", table)
1856 '*'
1857 >>> _translate_str("TAN", table)
1858 'X'
1859 >>> _translate_str("TAN", table, pos_stop="@")
1860 '@'
1861 >>> _translate_str("TA?", table)
1862 Traceback (most recent call last):
1863 ...
1864 TranslationError: Codon 'TA?' is invalid
1865 >>> _translate_str("ATGCCCTAG", table, cds=True)
1866 'MP'
1867 >>> _translate_str("AAACCCTAG", table, cds=True)
1868 Traceback (most recent call last):
1869 ...
1870 TranslationError: First codon 'AAA' is not a start codon
1871 >>> _translate_str("ATGCCCTAGCCCTAG", table, cds=True)
1872 Traceback (most recent call last):
1873 ...
1874 TranslationError: Extra in frame stop codon found.
1875 """
1876 sequence = sequence.upper()
1877 amino_acids = []
1878 forward_table = table.forward_table
1879 stop_codons = table.stop_codons
1880 if table.nucleotide_alphabet.letters is not None:
1881 valid_letters = set(table.nucleotide_alphabet.letters.upper())
1882 else:
1883
1884 valid_letters = set(IUPAC.ambiguous_dna.letters.upper() + \
1885 IUPAC.ambiguous_rna.letters.upper())
1886 if cds:
1887 if str(sequence[:3]).upper() not in table.start_codons:
1888 raise CodonTable.TranslationError(\
1889 "First codon '%s' is not a start codon" % sequence[:3])
1890 if len(sequence) % 3 != 0:
1891 raise CodonTable.TranslationError(\
1892 "Sequence length %i is not a multiple of three" % len(sequence))
1893 if str(sequence[-3:]).upper() not in stop_codons:
1894 raise CodonTable.TranslationError(\
1895 "Final codon '%s' is not a stop codon" % sequence[-3:])
1896
1897 sequence = sequence[3:-3]
1898 amino_acids = ["M"]
1899 n = len(sequence)
1900 for i in xrange(0,n-n%3,3):
1901 codon = sequence[i:i+3]
1902 try:
1903 amino_acids.append(forward_table[codon])
1904 except (KeyError, CodonTable.TranslationError):
1905
1906 if codon in table.stop_codons:
1907 if cds:
1908 raise CodonTable.TranslationError(\
1909 "Extra in frame stop codon found.")
1910 if to_stop : break
1911 amino_acids.append(stop_symbol)
1912 elif valid_letters.issuperset(set(codon)):
1913
1914 amino_acids.append(pos_stop)
1915 else:
1916 raise CodonTable.TranslationError(\
1917 "Codon '%s' is invalid" % codon)
1918 return "".join(amino_acids)
1919
1920 -def translate(sequence, table="Standard", stop_symbol="*", to_stop=False,
1921 cds=False):
1922 """Translate a nucleotide sequence into amino acids.
1923
1924 If given a string, returns a new string object. Given a Seq or
1925 MutableSeq, returns a Seq object with a protein alphabet.
1926
1927 Arguments:
1928 - table - Which codon table to use? This can be either a name
1929 (string) or an NCBI identifier (integer). Defaults
1930 to the "Standard" table.
1931 - stop_symbol - Single character string, what to use for any
1932 terminators, defaults to the asterisk, "*".
1933 - to_stop - Boolean, defaults to False meaning do a full
1934 translation continuing on past any stop codons
1935 (translated as the specified stop_symbol). If
1936 True, translation is terminated at the first in
1937 frame stop codon (and the stop_symbol is not
1938 appended to the returned protein sequence).
1939 - cds - Boolean, indicates this is a complete CDS. If True, this
1940 checks the sequence starts with a valid alternative start
1941 codon (which will be translated as methionine, M), that the
1942 sequence length is a multiple of three, and that there is a
1943 single in frame stop codon at the end (this will be excluded
1944 from the protein sequence, regardless of the to_stop option).
1945 If these tests fail, an exception is raised.
1946
1947 A simple string example using the default (standard) genetic code:
1948
1949 >>> coding_dna = "GTGGCCATTGTAATGGGCCGCTGAAAGGGTGCCCGATAG"
1950 >>> translate(coding_dna)
1951 'VAIVMGR*KGAR*'
1952 >>> translate(coding_dna, stop_symbol="@")
1953 'VAIVMGR@KGAR@'
1954 >>> translate(coding_dna, to_stop=True)
1955 'VAIVMGR'
1956
1957 Now using NCBI table 2, where TGA is not a stop codon:
1958
1959 >>> translate(coding_dna, table=2)
1960 'VAIVMGRWKGAR*'
1961 >>> translate(coding_dna, table=2, to_stop=True)
1962 'VAIVMGRWKGAR'
1963
1964 In fact this example uses an alternative start codon valid under NCBI table 2,
1965 GTG, which means this example is a complete valid CDS which when translated
1966 should really start with methionine (not valine):
1967
1968 >>> translate(coding_dna, table=2, cds=True)
1969 'MAIVMGRWKGAR'
1970
1971 Note that if the sequence has no in-frame stop codon, then the to_stop
1972 argument has no effect:
1973
1974 >>> coding_dna2 = "GTGGCCATTGTAATGGGCCGC"
1975 >>> translate(coding_dna2)
1976 'VAIVMGR'
1977 >>> translate(coding_dna2, to_stop=True)
1978 'VAIVMGR'
1979
1980 NOTE - Ambiguous codons like "TAN" or "NNN" could be an amino acid
1981 or a stop codon. These are translated as "X". Any invalid codon
1982 (e.g. "TA?" or "T-A") will throw a TranslationError.
1983
1984 NOTE - Does NOT support gapped sequences.
1985
1986 It will however translate either DNA or RNA.
1987 """
1988 if isinstance(sequence, Seq):
1989 return sequence.translate(table, stop_symbol, to_stop, cds)
1990 elif isinstance(sequence, MutableSeq):
1991
1992 return sequence.toseq().translate(table, stop_symbol, to_stop, cds)
1993 else:
1994
1995 try:
1996 codon_table = CodonTable.ambiguous_generic_by_id[int(table)]
1997 except ValueError:
1998 codon_table = CodonTable.ambiguous_generic_by_name[table]
1999 return _translate_str(sequence, codon_table, stop_symbol, to_stop, cds)
2000
2034
2036 """Run the Bio.Seq module's doctests."""
2037 print "Runing doctests..."
2038 import doctest
2039 doctest.testmod()
2040 print "Done"
2041
2042 if __name__ == "__main__":
2043 _test()
2044