Package Bio :: Package Clustalw
[hide private]
[frames] | no frames]

Source Code for Package Bio.Clustalw

  1  # This code is part of the Biopython distribution and governed by its 
  2  # license.  Please see the LICENSE file that should have been included 
  3  # as part of this package. 
  4  """Code for calling ClustalW and parsing its output (OBSOLETE). 
  5   
  6  This module has been superseded by the Bio.AlignIO framework for 
  7  alignment parsing, and the ClustalW command line wrapper in 
  8  Bio.Align.Applications for calling the tool. These are both described 
  9  in the current version of the Biopython Tutorial and Cookbook. 
 10  This means Bio.Clustalw is now obsolete and is likely to be deprecated 
 11  and eventually removed in future releases of Biopython. 
 12   
 13  A set of classes to interact with the multiple alignment command 
 14  line program clustalw.  
 15   
 16  Clustalw is the command line version of the graphical Clustalx  
 17  aligment program. 
 18   
 19  This requires clustalw available from: 
 20   
 21  ftp://ftp-igbmc.u-strasbg.fr/pub/ClustalW/. 
 22   
 23  functions: 
 24  o read 
 25  o parse_file 
 26  o do_alignment 
 27   
 28  classes: 
 29  o ClustalAlignment 
 30  o MultipleAlignCL""" 
 31   
 32  # standard library 
 33  import os 
 34  import sys 
 35  import subprocess 
 36   
 37  # biopython 
 38  from Bio import Alphabet 
 39  from Bio.Alphabet import IUPAC 
 40  from Bio.Align.Generic import Alignment 
 41  from Bio.Application import _escape_filename 
 42   
43 -def parse_file(file_name, alphabet = IUPAC.unambiguous_dna, debug_level = 0):
44 """Parse the given file into a clustal aligment object (OBSOLETE). 45 46 Arguments: 47 o file_name - The name of the file to parse. 48 o alphabet - The type of alphabet to use for the alignment sequences. 49 This should correspond to the type of information contained in the file. 50 Defaults to be unambiguous_dna sequence. 51 52 There is a deprecated optional argument debug_level which has no effect. 53 54 This function is obsolete, and any new code should call Bio.AlignIO 55 instead. For example using Bio.Clustalw, you might have: 56 57 >>> from Bio import Clustalw 58 >>> from Bio import Alphabet 59 >>> filename = "Clustalw/protein.aln" 60 >>> alpha = Alphabet.Gapped(Alphabet.generic_protein) 61 >>> align = Clustalw.parse_file(filename, alphabet=alpha) 62 >>> print align.get_alignment_length() 63 411 64 >>> clustalw_string = str(align) 65 66 This becomes: 67 68 >>> from Bio import AlignIO 69 >>> from Bio import Alphabet 70 >>> filename = "Clustalw/protein.aln" 71 >>> alpha = Alphabet.Gapped(Alphabet.generic_protein) 72 >>> align = AlignIO.read(open(filename), "clustal", alphabet=alpha) 73 >>> print align.get_alignment_length() 74 411 75 >>> assert clustalw_string == align.format("clustal") 76 """ 77 78 # Avoid code duplication by calling Bio.AlignIO to do this for us. 79 handle = open(file_name, 'r') 80 from Bio import AlignIO 81 generic_alignment = AlignIO.read(handle, "clustal") 82 handle.close() 83 84 #Force this generic alignment into a ClustalAlignment... nasty hack 85 if isinstance(alphabet, Alphabet.Gapped): 86 alpha = alphabet 87 else: 88 alpha = Alphabet.Gapped(alphabet) 89 clustal_alignment = ClustalAlignment(alpha) 90 clustal_alignment._records = generic_alignment._records 91 for record in clustal_alignment._records: 92 record.seq.alphabet = alpha 93 94 try: 95 clustal_alignment._version = generic_alignment._version 96 except AttributeError: 97 #Missing the version, could be a 3rd party tool's output 98 pass 99 100 try : 101 clustal_alignment._star_info = generic_alignment._star_info 102 except AttributeError: 103 #Missing the consensus, again, this is not always present 104 pass 105 106 return clustal_alignment
107
108 -def do_alignment(command_line, alphabet=None):
109 """Perform an alignment with the given command line (OBSOLETE). 110 111 Arguments: 112 o command_line - A command line object that can give out 113 the command line we will input into clustalw. 114 o alphabet - the alphabet to use in the created alignment. If not 115 specified IUPAC.unambiguous_dna and IUPAC.protein will be used for 116 dna and protein alignment respectively. 117 118 Returns: 119 o A clustal alignment object corresponding to the created alignment. 120 If the alignment type was not a clustal object, None is returned. 121 122 This function (and the associated command line object) are now obsolete. 123 Please use the Bio.Align.Applications.ClustalwCommandline wrapper with 124 the Python subprocess module (and Bio.AlignIO for parsing) as described 125 in the tutorial. 126 """ 127 #We don't need to supply any piped input, but we setup the 128 #standard input pipe anyway as a work around for a python 129 #bug if this is called from a Windows GUI program. For 130 #details, see http://bugs.python.org/issue1124861 131 child_process = subprocess.Popen(str(command_line), 132 stdin=subprocess.PIPE, 133 stdout=subprocess.PIPE, 134 stderr=subprocess.PIPE, 135 shell=(sys.platform!="win32") 136 ) 137 #Use .communicate as can get deadlocks with .wait(), see Bug 2804 138 child_process.communicate() #ignore the stdout and strerr data 139 value = child_process.returncode 140 141 # check the return value for errors, as on 1.81 the return value 142 # from Clustalw is actually helpful for figuring out errors 143 # TODO - Update this for new error codes using in clustalw 2 144 145 # 1 => bad command line option 146 if value == 1: 147 raise ValueError("Bad command line option in the command: %s" 148 % str(command_line)) 149 # 2 => can't open sequence file 150 elif value == 2: 151 raise IOError("Cannot open sequence file %s" 152 % command_line.sequence_file) 153 # 3 => wrong format in sequence file 154 elif value == 3: 155 raise IOError("Sequence file %s has an invalid format." 156 % command_line.sequence_file) 157 # 4 => sequence file only has one sequence 158 elif value == 4: 159 raise IOError("Sequence file %s has only one sequence present." 160 % command_line.sequence_file) 161 162 # if an output file was specified, we need to grab it 163 if command_line.output_file: 164 out_file = command_line.output_file 165 else: 166 out_file = os.path.splitext(command_line.sequence_file)[0] + '.aln' 167 168 # if we can't deal with the format, just return None 169 if command_line.output_type and command_line.output_type != 'CLUSTAL': 170 return None 171 # otherwise parse it into a ClustalAlignment object 172 else: 173 if not alphabet: 174 alphabet = (IUPAC.unambiguous_dna, IUPAC.protein)[ 175 command_line.type == 'PROTEIN'] 176 177 # check if the outfile exists before parsing 178 if not(os.path.exists(out_file)): 179 raise IOError("Output .aln file %s not produced, commandline: %s" 180 % (out_file, command_line)) 181 182 return parse_file(out_file, alphabet)
183 184
185 -class ClustalAlignment(Alignment):
186 """Work with the clustal aligment format (OBSOLETE). 187 188 This format is the default output from clustal -- these files normally 189 have an extension of .aln. 190 191 This obsolete alignment object is a subclass of the more general alignment 192 object used in Bio.AlignIO. The old practical difference is here str(align) 193 would give the alignment as a string in clustal format, whereas in general 194 you must do align.format("clustal"), which supports other formats too. 195 """ 196 # the default version to use if one isn't set 197 DEFAULT_VERSION = '1.81' 198
199 - def __init__(self, alphabet = Alphabet.Gapped(IUPAC.ambiguous_dna)):
200 Alignment.__init__(self, alphabet) 201 # represent all of those stars in the aln output format 202 self._star_info = '' 203 self._version = ''
204
205 - def __str__(self):
206 """Print out the alignment so it looks pretty. 207 208 The output produced from this should also be formatted in valid 209 clustal format. 210 """ 211 return self.format("clustal")
212
213 -class MultipleAlignCL:
214 """Represent a clustalw multiple alignment command line (OBSOLETE). 215 216 This command line wrapper is considerd obsolete. Please use the replacement 217 Bio.Align.Applications.ClustalwCommandline wrapper instead, which uses the 218 standardised Bio.Application style interface. This is described in the 219 tutorial, with examples using ClustalW. 220 """ 221 # set the valid options for different parameters 222 OUTPUT_TYPES = ['GCG', 'GDE', 'PHYLIP', 'PIR', 'NEXUS', 'FASTA'] 223 OUTPUT_ORDER = ['INPUT', 'ALIGNED'] 224 OUTPUT_CASE = ['LOWER', 'UPPER'] 225 OUTPUT_SEQNOS = ['OFF', 'ON'] 226 RESIDUE_TYPES = ['PROTEIN', 'DNA'] 227 PROTEIN_MATRIX = ['BLOSUM', 'PAM', 'GONNET', 'ID'] 228 DNA_MATRIX = ['IUB', 'CLUSTALW'] 229
230 - def __init__(self, sequence_file, command = 'clustalw'):
231 """Initialize some general parameters that can be set as attributes. 232 233 Arguments: 234 o sequence_file - The file to read the sequences for alignment from. 235 o command - The command used to run clustalw. This defaults to 236 just 'clustalw' (ie. assumes you have it on your path somewhere). 237 238 General attributes that can be set: 239 o is_quick - if set as 1, will use a fast algorithm to create 240 the alignment guide tree. 241 o allow_negative - allow negative values in the alignment matrix. 242 243 Multiple alignment attributes that can be set as attributes: 244 o gap_open_pen - Gap opening penalty 245 o gap_ext_pen - Gap extension penalty 246 o is_no_end_pen - A flag as to whether or not there should be a gap 247 separation penalty for the ends. 248 o gap_sep_range - The gap separation penalty range. 249 o is_no_pgap - A flag to turn off residue specific gaps 250 o is_no_hgap - A flag to turn off hydrophilic gaps 251 o h_gap_residues - A list of residues to count a hydrophilic 252 o max_div - A percent identity to use for delay (? - I don't undertand 253 this!) 254 o trans_weight - The weight to use for transitions 255 """ 256 self.sequence_file = sequence_file 257 self.command = command 258 259 self.is_quick = None 260 self.allow_negative = None 261 262 self.gap_open_pen = None 263 self.gap_ext_pen = None 264 self.is_no_end_pen = None 265 self.gap_sep_range = None 266 self.is_no_pgap = None 267 self.is_no_hgap = None 268 self.h_gap_residues = [] 269 self.max_div = None 270 self.trans_weight = None 271 272 # other attributes that should be set via various functions 273 # 1. output parameters 274 self.output_file = None 275 self.output_type = None 276 self.output_order = None 277 self.change_case = None 278 self.add_seqnos = None 279 280 # 2. a guide tree to use 281 self.guide_tree = None 282 self.new_tree = None 283 284 # 3. matrices 285 self.protein_matrix = None 286 self.dna_matrix = None 287 288 # 4. type of residues 289 self.type = None
290
291 - def __str__(self):
292 """Write out the command line as a string.""" 293 294 #On Linux with clustalw 1.83, you can do: 295 #clustalw input.faa 296 #clustalw /full/path/input.faa 297 #clustalw -INFILE=input.faa 298 #clustalw -INFILE=/full/path/input.faa 299 # 300 #Note these fail (using DOS style slashes): 301 # 302 #clustalw /INFILE=input.faa 303 #clustalw /INFILE=/full/path/input.faa 304 # 305 #On Windows XP with clustalw.exe 1.83, these work at 306 #the command prompt: 307 # 308 #clustalw.exe input.faa 309 #clustalw.exe /INFILE=input.faa 310 #clustalw.exe /INFILE="input.faa" 311 #clustalw.exe /INFILE="with space.faa" 312 #clustalw.exe /INFILE=C:\full\path\input.faa 313 #clustalw.exe /INFILE="C:\full path\with spaces.faa" 314 # 315 #Sadly these fail: 316 #clustalw.exe "input.faa" 317 #clustalw.exe "with space.faa" 318 #clustalw.exe C:\full\path\input.faa 319 #clustalw.exe "C:\full path\with spaces.faa" 320 # 321 #Testing today (using a different binary of clustalw.exe 1.83), 322 #using -INFILE as follows seems to work. However I had once noted: 323 #These also fail but a minus/dash does seem to 324 #work with other options (!): 325 #clustalw.exe -INFILE=input.faa 326 #clustalw.exe -INFILE=C:\full\path\input.faa 327 # 328 #Also these fail: 329 #clustalw.exe "/INFILE=input.faa" 330 #clustalw.exe "/INFILE=C:\full\path\input.faa" 331 # 332 #Thanks to Emanuel Hey for flagging this on the mailing list. 333 # 334 #In addition, both self.command and self.sequence_file 335 #may contain spaces, so should be quoted. But clustalw 336 #is fussy. 337 cline = _escape_filename(self.command) 338 cline += ' -INFILE=%s' % _escape_filename(self.sequence_file) 339 340 # general options 341 if self.type: 342 cline += " -TYPE=%s" % self.type 343 if self.is_quick == 1: 344 #Some versions of clustalw are case sensitive, 345 #and require -quicktree rather than -QUICKTREE 346 cline += " -quicktree" 347 if self.allow_negative == 1: 348 cline += " -NEGATIVE" 349 350 # output options 351 if self.output_file: 352 cline += " -OUTFILE=%s" % _escape_filename(self.output_file) 353 if self.output_type: 354 cline += " -OUTPUT=%s" % self.output_type 355 if self.output_order: 356 cline += " -OUTORDER=%s" % self.output_order 357 if self.change_case: 358 cline += " -CASE=%s" % self.change_case 359 if self.add_seqnos: 360 cline += " -SEQNOS=%s" % self.add_seqnos 361 if self.new_tree: 362 # clustal does not work if -align is written -ALIGN 363 cline += " -NEWTREE=%s -align" % _escape_filename(self.new_tree) 364 365 # multiple alignment options 366 if self.guide_tree: 367 cline += " -USETREE=%s" % _escape_filename(self.guide_tree) 368 if self.protein_matrix: 369 cline += " -MATRIX=%s" % self.protein_matrix 370 if self.dna_matrix: 371 cline += " -DNAMATRIX=%s" % self.dna_matrix 372 if self.gap_open_pen: 373 cline += " -GAPOPEN=%s" % self.gap_open_pen 374 if self.gap_ext_pen: 375 cline += " -GAPEXT=%s" % self.gap_ext_pen 376 if self.is_no_end_pen == 1: 377 cline += " -ENDGAPS" 378 if self.gap_sep_range: 379 cline += " -GAPDIST=%s" % self.gap_sep_range 380 if self.is_no_pgap == 1: 381 cline += " -NOPGAP" 382 if self.is_no_hgap == 1: 383 cline += " -NOHGAP" 384 if len(self.h_gap_residues) != 0: 385 # stick the list of residues together as one big list o' residues 386 residue_list = '' 387 for residue in self.h_gap_residues: 388 residue_list = residue_list + residue 389 cline += " -HGAPRESIDUES=%s" % residue_list 390 if self.max_div: 391 cline += " -MAXDIV=%s" % self.max_div 392 if self.trans_weight: 393 cline += " -TRANSWEIGHT=%s" % self.trans_weight 394 395 return cline
396
397 - def set_output(self, output_file, output_type = None, output_order = None, 398 change_case = None, add_seqnos = None):
399 """Set the output parameters for the command line. 400 """ 401 self.output_file = output_file 402 403 if output_type: 404 output_type = output_type.upper() 405 if output_type not in self.OUTPUT_TYPES: 406 raise ValueError("Invalid output type %s. Valid choices are %s" 407 % (output_type, self.OUTPUT_TYPES)) 408 else: 409 self.output_type = output_type 410 411 if output_order: 412 output_order = output_order.upper() 413 if output_order not in self.OUTPUT_ORDER: 414 raise ValueError("Invalid output order %s. Valid choices are %s" 415 % (output_order, self.OUTPUT_ORDER)) 416 else: 417 self.output_order = output_order 418 419 if change_case: 420 change_case = change_case.upper() 421 if output_type != "GDE": 422 raise ValueError("Change case only valid for GDE output.") 423 elif change_case not in self.CHANGE_CASE: 424 raise ValueError("Invalid change case %s. Valid choices are %s" 425 % (change_case, self.CHANGE_CASE)) 426 else: 427 self.change_case = change_case 428 429 if add_seqnos: 430 add_seqnos = add_seqnos.upper() 431 if output_type: 432 raise ValueError("Add SeqNos only valid for CLUSTAL output.") 433 elif add_seqnos not in self.OUTPUT_SEQNOS: 434 raise ValueError("Invalid seqnos option %s. Valid choices: %s" 435 % (add_seqnos, self.OUTPUT_SEQNOS)) 436 else: 437 self.add_seqnos = add_seqnos
438
439 - def set_guide_tree(self, tree_file):
440 """Provide a file to use as the guide tree for alignment. 441 442 Raises: 443 o IOError - If the tree_file doesn't exist.""" 444 if not(os.path.exists(tree_file)): 445 raise IOError("Could not find the guide tree file %s." % 446 tree_file) 447 else: 448 self.guide_tree = tree_file
449
450 - def set_new_guide_tree(self, tree_file):
451 """Set the name of the guide tree file generated in the alignment. 452 """ 453 self.new_tree = tree_file
454
455 - def set_protein_matrix(self, protein_matrix):
456 """Set the type of protein matrix to use. 457 458 Protein matrix can be either one of the defined types (blosum, pam, 459 gonnet or id) or a file with your own defined matrix. 460 """ 461 if protein_matrix.upper() in self.PROTEIN_MATRIX: 462 self.protein_matrix = protein_matrix.upper() 463 elif os.path.exists(protein_matrix): 464 self.protein_matrix = protein_matrix 465 else: 466 raise ValueError("Invalid matrix %s. Options are %s or a file." % 467 (protein_matrix.upper(), self.PROTEIN_MATRIX))
468
469 - def set_dna_matrix(self, dna_matrix):
470 """Set the type of DNA matrix to use. 471 472 The dna_matrix can either be one of the defined types (iub or clustalw) 473 or a file with the matrix to use.""" 474 if dna_matrix.upper() in self.DNA_MATRIX: 475 self.dna_matrix = dna_matrix.upper() 476 elif os.path.exists(dna_matrix): 477 self.dna_matrix = dna_matrix 478 else: 479 raise ValueError("Invalid matrix %s. Options are %s or a file." % 480 (dna_matrix, self.DNA_MATRIX))
481
482 - def set_type(self, residue_type):
483 """Set the type of residues within the file. 484 485 Clustal tries to guess whether the info is protein or DNA based on 486 the number of GATCs, but this can be wrong if you have a messed up 487 protein or DNA you are working with, so this allows you to set it 488 explicitly. 489 """ 490 residue_type = residue_type.upper() 491 if residue_type in self.RESIDUE_TYPES: 492 self.type = residue_type 493 else: 494 raise ValueError("Invalid residue type %s. Valid choices are %s" 495 % (residue_type, self.RESIDUE_TYPES))
496
497 -def _test():
498 """Run the Bio.Clustalw module's doctests (PRIVATE). 499 500 This will try and locate the unit tests directory, and run the doctests 501 from there in order that the relative paths used in the examples work. 502 """ 503 import doctest 504 import os 505 if os.path.isdir(os.path.join("..","..","Tests")): 506 print "Runing doctests..." 507 cur_dir = os.path.abspath(os.curdir) 508 os.chdir(os.path.join("..","..","Tests")) 509 doctest.testmod() 510 os.chdir(cur_dir) 511 del cur_dir 512 print "Done"
513 514 if __name__ == "__main__": 515 _test() 516