Package Bio :: Package Alphabet
[hide private]
[frames | no frames]

Source Code for Package Bio.Alphabet

# Copyright 2000-2002 by Andrew Dalke.
# Revisions copyright 2007-2008 by Peter Cock.
# All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""Alphabets used in Seq objects etc to declare sequence type and letters.

This is used by sequences which contain a finite number of similar words.
"""

class Alphabet:
    """Base class of the alphabet hierarchy.

    Class attributes:
     - size    - fixed word size, or None when words have no fixed size.
     - letters - the allowed letters, or None when there is no fixed set.
    """

    size = None     # default to no fixed size for words
    letters = None  # default to no fixed alphabet
                    # In general, a list-like object. However,
                    # assuming letters are single characters, use a
                    # string. This is expected for use with Seq like
                    # objects.

    def __repr__(self):
        """Return the class name followed by an empty argument list."""
        return self.__class__.__name__ + "()"

    def contains(self, other):
        """Does this alphabet 'contain' the other (OBSOLETE?).

        Returns a boolean.  This relies on the Alphabet subclassing
        hierarchy only, and does not check the letters property.
        This isn't ideal, and doesn't seem to work as intended
        with the AlphabetEncoder classes.
        """
        return isinstance(other, self.__class__)

    def _case_less(self):
        """Return a case-less variant of the current alphabet (PRIVATE).

        The result is the module-level generic instance matching the most
        specific known type of this alphabet.  Subclasses are tested before
        their parents, so DNA and RNA are checked before the plain
        nucleotide alphabet.
        """
        #TODO - remove this method by dealing with things in subclasses?
        if isinstance(self, ProteinAlphabet):
            return generic_protein
        elif isinstance(self, DNAAlphabet):
            return generic_dna
        elif isinstance(self, RNAAlphabet):
            # This branch must test RNAAlphabet: testing NucleotideAlphabet
            # here would swallow every nucleotide alphabet and make the
            # generic_nucleotide branch below unreachable.
            return generic_rna
        elif isinstance(self, NucleotideAlphabet):
            return generic_nucleotide
        elif isinstance(self, SingleLetterAlphabet):
            return single_letter_alphabet
        else:
            return generic_alphabet

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE).

        Returns self when there are no letters or they are already upper
        case; otherwise falls back to the generic alphabet from _case_less().
        """
        # NOTE: letters must support .upper(), i.e. be a string, when set.
        if not self.letters or self.letters == self.letters.upper():
            #Easy case, no letters or already upper case!
            return self
        #TODO - Raise NotImplementedError and handle via subclass?
        return self._case_less()

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE).

        Returns self when there are no letters or they are already lower
        case; otherwise falls back to the generic alphabet from _case_less().
        """
        # NOTE: letters must support .lower(), i.e. be a string, when set.
        if not self.letters or self.letters == self.letters.lower():
            #Easy case, no letters or already lower case!
            return self
        #TODO - Raise NotImplementedError and handle via subclass?
        return self._case_less()


generic_alphabet = Alphabet()


class SingleLetterAlphabet(Alphabet):
    """Alphabet whose words are exactly one character long (size = 1)."""

    size = 1
    letters = None   # string of all letters in the alphabet


single_letter_alphabet = SingleLetterAlphabet()

########### Protein


class ProteinAlphabet(SingleLetterAlphabet):
    """Single-letter alphabet type used to mark protein sequences."""


generic_protein = ProteinAlphabet()

########### DNA


class NucleotideAlphabet(SingleLetterAlphabet):
    """Single-letter alphabet type shared by the DNA and RNA alphabets."""


generic_nucleotide = NucleotideAlphabet()


class DNAAlphabet(NucleotideAlphabet):
    """Nucleotide alphabet type used to mark DNA sequences."""


generic_dna = DNAAlphabet()


########### RNA


class RNAAlphabet(NucleotideAlphabet):
    """Nucleotide alphabet type used to mark RNA sequences."""


generic_rna = RNAAlphabet()



########### Other per-sequence encodings

class SecondaryStructure(SingleLetterAlphabet):
    """Single-letter alphabet restricted to the letters H, S, T and C.

    NOTE(review): the letters presumably stand for secondary structure
    states; their meaning is not stated in this file.
    """

    letters = "HSTC"
107
class ThreeLetterProtein(Alphabet):
    """Alphabet of three-character amino acid codes.

    Unlike the single-letter alphabets, letters is a list of strings
    (one entry per code) and size is 3.
    """

    size = 3
    letters = [
        "Ala", "Asx", "Cys", "Asp", "Glu", "Phe",
        "Gly", "His", "Ile", "Lys", "Leu", "Met",
        "Asn", "Pro", "Gln", "Arg", "Ser", "Thr",
        "Sec", "Val", "Trp", "Xaa", "Tyr", "Glx",
    ]

###### Non per-sequence modifications

# (These are Decorator classes)

class AlphabetEncoder:
    """Decorator wrapping an alphabet and adding extra letters to it.

    Attributes:
     - alphabet    - the wrapped alphabet object.
     - new_letters - the letters added by this encoder.
     - letters     - the wrapped alphabet's letters followed by new_letters,
                     or None when the wrapped alphabet has no letters.

    Any other non-special attribute is looked up on the wrapped alphabet.
    """

    def __init__(self, alphabet, new_letters):
        """Wrap alphabet, extending its letters (if any) with new_letters."""
        self.alphabet = alphabet
        self.new_letters = new_letters
        if alphabet.letters is not None:
            self.letters = alphabet.letters + new_letters
        else:
            self.letters = None

    def __getattr__(self, key):
        """Delegate unknown attributes to the wrapped alphabet.

        Raises AttributeError for special (double underscore) names, and
        for 'alphabet' itself when the instance has not been initialised.
        """
        if key[:2] == "__" and key[-2:] == "__":
            raise AttributeError(key)
        if key == "alphabet":
            # __getattr__ only runs after normal lookup failed, so getting
            # here means self.alphabet does not exist.  Delegating would
            # re-enter this method for 'alphabet' and recurse forever.
            raise AttributeError(key)
        return getattr(self.alphabet, key)

    def __repr__(self):
        """Return 'ClassName(wrapped_alphabet, new_letters)'."""
        return "%s(%r, %r)" % (self.__class__.__name__, self.alphabet,
                               self.new_letters)

    def contains(self, other):
        """Does this alphabet 'contain' the other (OBSOLETE?).

        This isn't implemented for the base AlphabetEncoder,
        which will always return False.
        """
        return False

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE)."""
        return AlphabetEncoder(self.alphabet._upper(), self.new_letters.upper())

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE)."""
        return AlphabetEncoder(self.alphabet._lower(), self.new_letters.lower())
151 152
class Gapped(AlphabetEncoder):
    """AlphabetEncoder that adds a gap character to the wrapped alphabet.

    The gap character is passed to AlphabetEncoder as the new letters and
    is also stored as the gap_char attribute.
    """

    def __init__(self, alphabet, gap_char="-"):
        """Wrap alphabet, adding gap_char (default "-") as a new letter."""
        AlphabetEncoder.__init__(self, alphabet, gap_char)
        self.gap_char = gap_char

    def contains(self, other):
        """Does this alphabet 'contain' the other (OBSOLETE?).

        Returns a boolean.  This relies on the Alphabet subclassing
        hierarchy, and attempts to check the gap character.  This fails
        if the other alphabet does not have a gap character!
        """
        same_gap = other.gap_char == self.gap_char
        return same_gap and self.alphabet.contains(other.alphabet)

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE)."""
        upper_base = self.alphabet._upper()
        return Gapped(upper_base, self.gap_char.upper())

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE)."""
        lower_base = self.alphabet._lower()
        return Gapped(lower_base, self.gap_char.lower())
175 176
class HasStopCodon(AlphabetEncoder):
    """AlphabetEncoder that adds a stop symbol to the wrapped alphabet.

    The stop symbol is passed to AlphabetEncoder as the new letters and
    is also stored as the stop_symbol attribute.
    """

    def __init__(self, alphabet, stop_symbol="*"):
        """Wrap alphabet, adding stop_symbol (default "*") as a new letter."""
        AlphabetEncoder.__init__(self, alphabet, stop_symbol)
        self.stop_symbol = stop_symbol

    def __cmp__(self, other):
        """Order by wrapped alphabet first, then by stop symbol.

        NOTE: __cmp__ and the cmp() builtin are Python 2 constructs.
        """
        base_order = cmp(self.alphabet, other.alphabet)
        if base_order != 0:
            return base_order
        # Wrapped alphabets compare equal; the stop symbol breaks the tie.
        return cmp(self.stop_symbol, other.stop_symbol)

    def contains(self, other):
        """Does this alphabet 'contain' the other (OBSOLETE?).

        Returns a boolean.  This relies on the Alphabet subclassing
        hierarchy, and attempts to check the stop symbol.  This fails
        if the other alphabet does not have a stop symbol!
        """
        same_stop = other.stop_symbol == self.stop_symbol
        return same_stop and self.alphabet.contains(other.alphabet)

    def _upper(self):
        """Return an upper case variant of the current alphabet (PRIVATE)."""
        upper_base = self.alphabet._upper()
        return HasStopCodon(upper_base, self.stop_symbol.upper())

    def _lower(self):
        """Return a lower case variant of the current alphabet (PRIVATE)."""
        lower_base = self.alphabet._lower()
        return HasStopCodon(lower_base, self.stop_symbol.lower())
205 206
def _get_base_alphabet(alphabet):
    """Returns the non-gapped non-stop-codon Alphabet object (PRIVATE).

    Strips every AlphabetEncoder layer by following its .alphabet
    attribute, then asserts that what remains is an Alphabet instance.
    """
    innermost = alphabet
    # Peel off encoder layers one at a time until none is left.
    while isinstance(innermost, AlphabetEncoder):
        innermost = innermost.alphabet
    assert isinstance(innermost, Alphabet), \
        "Invalid alphabet found, %s" % repr(innermost)
    return innermost
215
def _ungap(alphabet):
    """Returns the alphabet without any gap encoder (PRIVATE).

    Alphabets with no gap_char attribute are returned unchanged.  A Gapped
    alphabet yields the alphabet it wraps.  Other encoders are rebuilt
    around the ungapped version of the alphabet they wrap.

    Raises NotImplementedError for an object that exposes gap_char but is
    not an AlphabetEncoder.
    """
    #TODO - Handle via method of the objects?
    if not hasattr(alphabet, "gap_char"):
        return alphabet
    elif isinstance(alphabet, Gapped):
        return alphabet.alphabet
    elif isinstance(alphabet, HasStopCodon):
        return HasStopCodon(_ungap(alphabet.alphabet),
                            stop_symbol=alphabet.stop_symbol)
    elif isinstance(alphabet, AlphabetEncoder):
        # AlphabetEncoder.__init__ takes `new_letters`, not `letters`.  Only
        # the encoder's own additions are passed; the constructor re-derives
        # the full letters from the ungapped inner alphabet.
        return AlphabetEncoder(_ungap(alphabet.alphabet),
                               new_letters=alphabet.new_letters)
    else:
        raise NotImplementedError
229
def _consensus_base_alphabet(alphabets):
    """Returns a common but often generic base alphabet object (PRIVATE).

    This throws away any AlphabetEncoder information, e.g. Gapped alphabets.

    Note that DNA+RNA -> Nucleotide, and Nucleotide+Protein-> generic single
    letter.  These DO NOT raise an exception!
    """
    consensus = None
    for candidate in alphabets:
        base = _get_base_alphabet(candidate)
        if consensus is None:
            # First alphabet seen becomes the starting point.
            consensus = base
            continue
        if consensus == base:
            continue
        if isinstance(base, consensus.__class__):
            # The consensus is already the more general of the two.
            continue
        if isinstance(consensus, base.__class__):
            # The new alphabet is more general; adopt it.
            consensus = base
            continue
        if isinstance(base, NucleotideAlphabet) \
                and isinstance(consensus, NucleotideAlphabet):
            #e.g. Give a mix of RNA and DNA alphabets
            consensus = generic_nucleotide
            continue
        if isinstance(base, SingleLetterAlphabet) \
                and isinstance(consensus, SingleLetterAlphabet):
            #This is a pretty big mis-match!
            consensus = single_letter_alphabet
            continue
        #We have a major mis-match... take the easy way out!
        return generic_alphabet
    if consensus is None:
        #Given NO alphabets!
        return generic_alphabet
    return consensus
263
def _consensus_alphabet(alphabets):
    """Returns a common but often generic alphabet object (PRIVATE).

    Note that DNA+RNA -> Nucleotide, and Nucleotide+Protein-> generic single
    letter.  These DO NOT raise an exception!

    This is aware of Gapped and HasStopCodon and new letters added by
    other AlphabetEncoders.  This WILL raise an exception if more than
    one gap character or stop symbol is present.
    """
    base = _consensus_base_alphabet(alphabets)
    gap = None
    stop = None
    new_letters = ""
    for candidate in alphabets:
        #Gaps...
        if hasattr(candidate, "gap_char"):
            if gap is None:
                gap = candidate.gap_char
            elif gap != candidate.gap_char:
                raise ValueError("More than one gap character present")
        #Stops...
        if hasattr(candidate, "stop_symbol"):
            if stop is None:
                stop = candidate.stop_symbol
            elif stop != candidate.stop_symbol:
                raise ValueError("More than one stop symbol present")
        #New letters...
        if hasattr(candidate, "new_letters"):
            for letter in candidate.new_letters:
                # Filtered against the gap/stop seen so far, not the final
                # values, so the result depends on the order of alphabets.
                if letter in new_letters:
                    continue
                if letter == gap or letter == stop:
                    continue
                new_letters += letter

    # Re-apply the encoders around the consensus base alphabet.
    result = base
    if new_letters:
        result = AlphabetEncoder(result, new_letters)
    if gap:
        result = Gapped(result, gap_char=gap)
    if stop:
        result = HasStopCodon(result, stop_symbol=stop)
    return result
311
def _check_type_compatible(alphabets):
    """Returns True except for DNA+RNA or Nucleotide+Protein (PRIVATE).

    This relies on the Alphabet subclassing hierarchy.  It does not
    check things like gap characters or stop symbols.
    """
    seen_dna = False
    seen_rna = False
    seen_nucleotide = False
    seen_protein = False
    for candidate in alphabets:
        base = _get_base_alphabet(candidate)
        # DNA and RNA are tested before the plain nucleotide type because
        # both are subclasses of NucleotideAlphabet.
        if isinstance(base, DNAAlphabet):
            seen_dna = True
            seen_nucleotide = True
            if seen_rna or seen_protein:
                return False
        elif isinstance(base, RNAAlphabet):
            seen_rna = True
            seen_nucleotide = True
            if seen_dna or seen_protein:
                return False
        elif isinstance(base, NucleotideAlphabet):
            seen_nucleotide = True
            if seen_protein:
                return False
        elif isinstance(base, ProteinAlphabet):
            seen_protein = True
            if seen_nucleotide:
                return False
    return True
335