Package Bio :: Package Align :: Module Generic
[hide private]
[frames] | no frames]

Source Code for Module Bio.Align.Generic

  1  # Copyright 2000-2004 Brad Chapman. 
  2  # Copyright 2001 Iddo Friedberg. 
  3  # Copyright 2007-2010 by Peter Cock. 
  4  # All rights reserved. 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8  """ 
  9  Contains classes to deal with generic sequence alignment stuff not 
 10  specific to a particular program or format. 
 11   
 12  classes: 
 13  o Alignment 
 14  """ 
 15   
 16  # biopython 
 17  from Bio.Seq import Seq 
 18  from Bio.SeqRecord import SeqRecord 
 19  from Bio import Alphabet 
 20   
21 -class Alignment:
22 """Represent a set of alignments (OBSOLETE?). 23 24 This is a base class to represent alignments, which can be subclassed 25 to deal with an alignment in a specific format. 26 27 With the introduction of the MultipleSeqAlignment class in Bio.Align, 28 this base class is effectively obsolete and will likely be deprecated and 29 later removed in future releases of Biopython. 30 """
31 - def __init__(self, alphabet):
32 """Initialize a new Alignment object. 33 34 Arguments: 35 o alphabet - The alphabet to use for the sequence objects that are 36 created. This alphabet must be a gapped type. 37 38 e.g. 39 >>> from Bio.Alphabet import IUPAC, Gapped 40 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 41 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 42 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 43 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 44 >>> print align 45 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 46 ACTGCTAGCTAG Alpha 47 ACT-CTAGCTAG Beta 48 ACTGCTAGATAG Gamma 49 """ 50 if not (isinstance(alphabet, Alphabet.Alphabet) \ 51 or isinstance(alphabet, Alphabet.AlphabetEncoder)): 52 raise ValueError("Invalid alphabet argument") 53 self._alphabet = alphabet 54 # hold everything at a list of SeqRecord objects 55 self._records = []
56
57 - def _str_line(self, record):
58 """Returns a truncated string representation of a SeqRecord (PRIVATE). 59 60 This is a PRIVATE function used by the __str__ method. 61 """ 62 if len(record.seq) <= 50: 63 return "%s %s" % (record.seq, record.id) 64 else: 65 return "%s...%s %s" \ 66 % (record.seq[:44], record.seq[-3:], record.id)
67
68 - def __str__(self):
69 """Returns a multi-line string summary of the alignment. 70 71 This output is intended to be readable, but large alignments are 72 shown truncated. A maximum of 20 rows (sequences) and 50 columns 73 are shown, with the record identifiers. This should fit nicely on a 74 single screen. e.g. 75 76 >>> from Bio.Alphabet import IUPAC, Gapped 77 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 78 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 79 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 80 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 81 >>> print align 82 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 83 ACTGCTAGCTAG Alpha 84 ACT-CTAGCTAG Beta 85 ACTGCTAGATAG Gamma 86 87 See also the alignment's format method. 88 """ 89 rows = len(self._records) 90 lines = ["%s alignment with %i rows and %i columns" \ 91 % (str(self._alphabet), rows, self.get_alignment_length())] 92 if rows <= 20: 93 lines.extend([self._str_line(rec) for rec in self._records]) 94 else: 95 lines.extend([self._str_line(rec) for rec in self._records[:18]]) 96 lines.append("...") 97 lines.append(self._str_line(self._records[-1])) 98 return "\n".join(lines)
99
100 - def __repr__(self):
101 """Returns a representation of the object for debugging. 102 103 The representation cannot be used with eval() to recreate the object, 104 which is usually possible with simple python ojects. For example: 105 106 <Bio.Align.Generic.Alignment instance (2 records of length 14, 107 SingleLetterAlphabet()) at a3c184c> 108 109 The hex string is the memory address of the object, see help(id). 110 This provides a simple way to visually distinguish alignments of 111 the same size. 112 """ 113 #A doctest for __repr__ would be nice, but __class__ comes out differently 114 #if run via the __main__ trick. 115 return "<%s instance (%i records of length %i, %s) at %x>" % \ 116 (self.__class__, len(self._records), 117 self.get_alignment_length(), repr(self._alphabet), id(self))
118 #This version is useful for doing eval(repr(alignment)), 119 #but it can be VERY long: 120 #return "%s(%s, %s)" \ 121 # % (self.__class__, repr(self._records), repr(self._alphabet)) 122
123 - def format(self, format):
124 """Returns the alignment as a string in the specified file format. 125 126 The format should be a lower case string supported as an output 127 format by Bio.AlignIO (such as "fasta", "clustal", "phylip", 128 "stockholm", etc), which is used to turn the alignment into a 129 string. 130 131 e.g. 132 >>> from Bio.Alphabet import IUPAC, Gapped 133 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 134 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 135 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 136 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 137 >>> print align.format("fasta") 138 >Alpha 139 ACTGCTAGCTAG 140 >Beta 141 ACT-CTAGCTAG 142 >Gamma 143 ACTGCTAGATAG 144 <BLANKLINE> 145 >>> print align.format("phylip") 146 3 12 147 Alpha ACTGCTAGCT AG 148 Beta ACT-CTAGCT AG 149 Gamma ACTGCTAGAT AG 150 <BLANKLINE> 151 152 For Python 2.6, 3.0 or later see also the built in format() function. 153 """ 154 #See also the __format__ added for Python 2.6 / 3.0, PEP 3101 155 #See also the SeqRecord class and its format() method using Bio.SeqIO 156 return self.__format__(format)
157 158
159 - def __format__(self, format_spec):
160 """Returns the alignment as a string in the specified file format. 161 162 This method supports the python format() function added in 163 Python 2.6/3.0. The format_spec should be a lower case 164 string supported by Bio.AlignIO as an output file format. 165 See also the alignment's format() method.""" 166 if format_spec: 167 from StringIO import StringIO 168 from Bio import AlignIO 169 handle = StringIO() 170 AlignIO.write([self], handle, format_spec) 171 return handle.getvalue() 172 else: 173 #Follow python convention and default to using __str__ 174 return str(self)
175
176 - def get_all_seqs(self):
177 """Return all of the sequences involved in the alignment (DEPRECATED). 178 179 The return value is a list of SeqRecord objects. 180 181 This method is deprecated, as the Alignment object itself now offers 182 much of the functionality of a list of SeqRecord objects (e.g. 183 iteration or slicing to create a sub-alignment). Instead use the 184 Python builtin function list, i.e. my_list = list(my_align) 185 """ 186 import warnings 187 warnings.warn("This method is deprecated, since the alignment object" 188 "now acts more like a list. Instead of calling " 189 "align.get_all_seqs() you can use list(align)", 190 DeprecationWarning) 191 return self._records
192
193 - def __iter__(self):
194 """Iterate over alignment rows as SeqRecord objects. 195 196 e.g. 197 >>> from Bio.Alphabet import IUPAC, Gapped 198 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 199 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 200 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 201 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 202 >>> for record in align: 203 ... print record.id 204 ... print record.seq 205 Alpha 206 ACTGCTAGCTAG 207 Beta 208 ACT-CTAGCTAG 209 Gamma 210 ACTGCTAGATAG 211 """ 212 return iter(self._records)
213
214 - def get_seq_by_num(self, number):
215 """Retrieve a sequence by row number (OBSOLETE). 216 217 Returns: 218 o A Seq object for the requested sequence. 219 220 Raises: 221 o IndexError - If the specified number is out of range. 222 223 NOTE: This is a legacy method. In new code where you need to access 224 the rows of the alignment (i.e. the sequences) consider iterating 225 over them or accessing them as SeqRecord objects. e.g. 226 227 for record in alignment: 228 print record.id 229 print record.seq 230 first_record = alignment[0] 231 last_record = alignment[-1] 232 """ 233 return self._records[number].seq
234
235 - def __len__(self):
236 """Returns the number of sequences in the alignment. 237 238 Use len(alignment) to get the number of sequences (i.e. the number of 239 rows), and alignment.get_alignment_length() to get the length of the 240 longest sequence (i.e. the number of columns). 241 242 This is easy to remember if you think of the alignment as being like a 243 list of SeqRecord objects. 244 """ 245 return len(self._records)
246
247 - def get_alignment_length(self):
248 """Return the maximum length of the alignment. 249 250 All objects in the alignment should (hopefully) have the same 251 length. This function will go through and find this length 252 by finding the maximum length of sequences in the alignment. 253 254 >>> from Bio.Alphabet import IUPAC, Gapped 255 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 256 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 257 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 258 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 259 >>> align.get_alignment_length() 260 12 261 262 If you want to know the number of sequences in the alignment, 263 use len(align) instead: 264 265 >>> len(align) 266 3 267 268 """ 269 max_length = 0 270 271 for record in self._records: 272 if len(record.seq) > max_length: 273 max_length = len(record.seq) 274 275 return max_length
276
277 - def add_sequence(self, descriptor, sequence, start = None, end = None, 278 weight = 1.0):
279 """Add a sequence to the alignment. 280 281 This doesn't do any kind of alignment, it just adds in the sequence 282 object, which is assumed to be prealigned with the existing 283 sequences. 284 285 Arguments: 286 o descriptor - The descriptive id of the sequence being added. 287 This will be used as the resulting SeqRecord's 288 .id property (and, for historical compatibility, 289 also the .description property) 290 o sequence - A string with sequence info. 291 o start - You can explicitly set the start point of the sequence. 292 This is useful (at least) for BLAST alignments, which can just 293 be partial alignments of sequences. 294 o end - Specify the end of the sequence, which is important 295 for the same reason as the start. 296 o weight - The weight to place on the sequence in the alignment. 297 By default, all sequences have the same weight. (0.0 => no weight, 298 1.0 => highest weight) 299 """ 300 new_seq = Seq(sequence, self._alphabet) 301 302 #We are now effectively using the SeqRecord's .id as 303 #the primary identifier (e.g. in Bio.SeqIO) so we should 304 #populate it with the descriptor. 305 #For backwards compatibility, also store this in the 306 #SeqRecord's description property. 307 new_record = SeqRecord(new_seq, 308 id = descriptor, 309 description = descriptor) 310 311 # hack! We really need to work out how to deal with annotations 312 # and features in biopython. Right now, I'll just use the 313 # generic annotations dictionary we've got to store the start 314 # and end, but we should think up something better. I don't know 315 # if I'm really a big fan of the LocatableSeq thing they've got 316 # in BioPerl, but I'm not positive what the best thing to do on 317 # this is... 318 if start: 319 new_record.annotations['start'] = start 320 if end: 321 new_record.annotations['end'] = end 322 323 # another hack to add weight information to the sequence 324 new_record.annotations['weight'] = weight 325 326 self._records.append(new_record)
327
328 - def get_column(self,col):
329 """Returns a string containing a given column. 330 331 e.g. 332 >>> from Bio.Alphabet import IUPAC, Gapped 333 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 334 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 335 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 336 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 337 >>> align.get_column(0) 338 'AAA' 339 >>> align.get_column(3) 340 'G-G' 341 """ 342 #TODO - Support negative indices? 343 col_str = '' 344 assert col >= 0 and col <= self.get_alignment_length() 345 for rec in self._records: 346 col_str += rec.seq[col] 347 return col_str
348
349 - def __getitem__(self, index):
350 """Access part of the alignment. 351 352 We'll use the following example alignment here for illustration: 353 354 >>> from Bio.Alphabet import IUPAC, Gapped 355 >>> align = Alignment(Gapped(IUPAC.unambiguous_dna, "-")) 356 >>> align.add_sequence("Alpha", "ACTGCTAGCTAG") 357 >>> align.add_sequence("Beta", "ACT-CTAGCTAG") 358 >>> align.add_sequence("Gamma", "ACTGCTAGATAG") 359 >>> align.add_sequence("Delta", "ACTGCTTGCTAG") 360 >>> align.add_sequence("Epsilon","ACTGCTTGATAG") 361 362 You can access a row of the alignment as a SeqRecord using an integer 363 index (think of the alignment as a list of SeqRecord objects here): 364 365 >>> first_record = align[0] 366 >>> print first_record.id, first_record.seq 367 Alpha ACTGCTAGCTAG 368 >>> last_record = align[-1] 369 >>> print last_record.id, last_record.seq 370 Epsilon ACTGCTTGATAG 371 372 You can also access use python's slice notation to create a sub-alignment 373 containing only some of the SeqRecord objects: 374 375 >>> sub_alignment = align[2:5] 376 >>> print sub_alignment 377 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 378 ACTGCTAGATAG Gamma 379 ACTGCTTGCTAG Delta 380 ACTGCTTGATAG Epsilon 381 382 This includes support for a step, i.e. align[start:end:step], which 383 can be used to select every second sequence: 384 385 >>> sub_alignment = align[::2] 386 >>> print sub_alignment 387 Gapped(IUPACUnambiguousDNA(), '-') alignment with 3 rows and 12 columns 388 ACTGCTAGCTAG Alpha 389 ACTGCTAGATAG Gamma 390 ACTGCTTGATAG Epsilon 391 392 Or to get a copy of the alignment with the rows in reverse order: 393 394 >>> rev_alignment = align[::-1] 395 >>> print rev_alignment 396 Gapped(IUPACUnambiguousDNA(), '-') alignment with 5 rows and 12 columns 397 ACTGCTTGATAG Epsilon 398 ACTGCTTGCTAG Delta 399 ACTGCTAGATAG Gamma 400 ACT-CTAGCTAG Beta 401 ACTGCTAGCTAG Alpha 402 403 Right now, these are the ONLY indexing operations supported. The use of 404 a second column based index is under discussion for a future update. 405 """ 406 if isinstance(index, int): 407 #e.g. result = align[x] 408 #Return a SeqRecord 409 return self._records[index] 410 elif isinstance(index, slice): 411 #e.g. sub_aling = align[i:j:k] 412 #Return a new Alignment using only the specified records. 413 #TODO - See Bug 2554 for changing the __init__ method 414 #to allow us to do this more cleanly. 415 sub_align = Alignment(self._alphabet) 416 sub_align._records = self._records[index] 417 return sub_align 418 elif len(index)==2: 419 raise TypeError("Row and Column indexing is not currently supported,"\ 420 +"but may be in future.") 421 else: 422 raise TypeError("Invalid index type.")
423
424 -def _test():
425 """Run the Bio.Align.Generic module's doctests.""" 426 print "Running doctests..." 427 import doctest 428 doctest.testmod() 429 print "Done"
430 431 if __name__ == "__main__": 432 _test() 433