1
2
3
4 """Code for calling ClustalW and parsing its output (OBSOLETE).
5
6 This module has been superseded by the Bio.AlignIO framework for
7 alignment parsing, and the ClustalW command line wrapper in
8 Bio.Align.Applications for calling the tool. These are both described
9 in the current version of the Biopython Tutorial and Cookbook.
10 This means Bio.Clustalw is now obsolete and is likely to be deprecated
11 and eventually removed in future releases of Biopython.
12
13 A set of classes to interact with the multiple alignment command
14 line program clustalw.
15
16 Clustalw is the command line version of the graphical Clustalx
17 aligment program.
18
19 This requires clustalw available from:
20
21 ftp://ftp-igbmc.u-strasbg.fr/pub/ClustalW/.
22
23 functions:
24 o read
25 o parse_file
26 o do_alignment
27
28 classes:
29 o ClustalAlignment
30 o MultipleAlignCL"""
31
32
33 import os
34 import sys
35 import subprocess
36
37
38 from Bio import Alphabet
39 from Bio.Alphabet import IUPAC
40 from Bio.Align.Generic import Alignment
41 from Bio.Application import _escape_filename
42
44 """Parse the given file into a clustal aligment object (OBSOLETE).
45
46 Arguments:
47 o file_name - The name of the file to parse.
48 o alphabet - The type of alphabet to use for the alignment sequences.
49 This should correspond to the type of information contained in the file.
50 Defaults to be unambiguous_dna sequence.
51
52 There is a deprecated optional argument debug_level which has no effect.
53
54 This function is obsolete, and any new code should call Bio.AlignIO
55 instead. For example using Bio.Clustalw, you might have:
56
57 >>> from Bio import Clustalw
58 >>> from Bio import Alphabet
59 >>> filename = "Clustalw/protein.aln"
60 >>> alpha = Alphabet.Gapped(Alphabet.generic_protein)
61 >>> align = Clustalw.parse_file(filename, alphabet=alpha)
62 >>> print align.get_alignment_length()
63 411
64 >>> clustalw_string = str(align)
65
66 This becomes:
67
68 >>> from Bio import AlignIO
69 >>> from Bio import Alphabet
70 >>> filename = "Clustalw/protein.aln"
71 >>> alpha = Alphabet.Gapped(Alphabet.generic_protein)
72 >>> align = AlignIO.read(open(filename), "clustal", alphabet=alpha)
73 >>> print align.get_alignment_length()
74 411
75 >>> assert clustalw_string == align.format("clustal")
76 """
77
78
79 handle = open(file_name, 'r')
80 from Bio import AlignIO
81 generic_alignment = AlignIO.read(handle, "clustal")
82 handle.close()
83
84
85 if isinstance(alphabet, Alphabet.Gapped):
86 alpha = alphabet
87 else:
88 alpha = Alphabet.Gapped(alphabet)
89 clustal_alignment = ClustalAlignment(alpha)
90 clustal_alignment._records = generic_alignment._records
91 for record in clustal_alignment._records:
92 record.seq.alphabet = alpha
93
94 try:
95 clustal_alignment._version = generic_alignment._version
96 except AttributeError:
97
98 pass
99
100 try :
101 clustal_alignment._star_info = generic_alignment._star_info
102 except AttributeError:
103
104 pass
105
106 return clustal_alignment
107
109 """Perform an alignment with the given command line (OBSOLETE).
110
111 Arguments:
112 o command_line - A command line object that can give out
113 the command line we will input into clustalw.
114 o alphabet - the alphabet to use in the created alignment. If not
115 specified IUPAC.unambiguous_dna and IUPAC.protein will be used for
116 dna and protein alignment respectively.
117
118 Returns:
119 o A clustal alignment object corresponding to the created alignment.
120 If the alignment type was not a clustal object, None is returned.
121
122 This function (and the associated command line object) are now obsolete.
123 Please use the Bio.Align.Applications.ClustalwCommandline wrapper with
124 the Python subprocess module (and Bio.AlignIO for parsing) as described
125 in the tutorial.
126 """
127
128
129
130
131 child_process = subprocess.Popen(str(command_line),
132 stdin=subprocess.PIPE,
133 stdout=subprocess.PIPE,
134 stderr=subprocess.PIPE,
135 shell=(sys.platform!="win32")
136 )
137
138 child_process.communicate()
139 value = child_process.returncode
140
141
142
143
144
145
146 if value == 1:
147 raise ValueError("Bad command line option in the command: %s"
148 % str(command_line))
149
150 elif value == 2:
151 raise IOError("Cannot open sequence file %s"
152 % command_line.sequence_file)
153
154 elif value == 3:
155 raise IOError("Sequence file %s has an invalid format."
156 % command_line.sequence_file)
157
158 elif value == 4:
159 raise IOError("Sequence file %s has only one sequence present."
160 % command_line.sequence_file)
161
162
163 if command_line.output_file:
164 out_file = command_line.output_file
165 else:
166 out_file = os.path.splitext(command_line.sequence_file)[0] + '.aln'
167
168
169 if command_line.output_type and command_line.output_type != 'CLUSTAL':
170 return None
171
172 else:
173 if not alphabet:
174 alphabet = (IUPAC.unambiguous_dna, IUPAC.protein)[
175 command_line.type == 'PROTEIN']
176
177
178 if not(os.path.exists(out_file)):
179 raise IOError("Output .aln file %s not produced, commandline: %s"
180 % (out_file, command_line))
181
182 return parse_file(out_file, alphabet)
183
184
186 """Work with the clustal aligment format (OBSOLETE).
187
188 This format is the default output from clustal -- these files normally
189 have an extension of .aln.
190
191 This obsolete alignment object is a subclass of the more general alignment
192 object used in Bio.AlignIO. The old practical difference is here str(align)
193 would give the alignment as a string in clustal format, whereas in general
194 you must do align.format("clustal"), which supports other formats too.
195 """
196
197 DEFAULT_VERSION = '1.81'
198
204
206 """Print out the alignment so it looks pretty.
207
208 The output produced from this should also be formatted in valid
209 clustal format.
210 """
211 return self.format("clustal")
212
214 """Represent a clustalw multiple alignment command line (OBSOLETE).
215
216 This command line wrapper is considerd obsolete. Please use the replacement
217 Bio.Align.Applications.ClustalwCommandline wrapper instead, which uses the
218 standardised Bio.Application style interface. This is described in the
219 tutorial, with examples using ClustalW.
220 """
221
222 OUTPUT_TYPES = ['GCG', 'GDE', 'PHYLIP', 'PIR', 'NEXUS', 'FASTA']
223 OUTPUT_ORDER = ['INPUT', 'ALIGNED']
224 OUTPUT_CASE = ['LOWER', 'UPPER']
225 OUTPUT_SEQNOS = ['OFF', 'ON']
226 RESIDUE_TYPES = ['PROTEIN', 'DNA']
227 PROTEIN_MATRIX = ['BLOSUM', 'PAM', 'GONNET', 'ID']
228 DNA_MATRIX = ['IUB', 'CLUSTALW']
229
230 - def __init__(self, sequence_file, command = 'clustalw'):
231 """Initialize some general parameters that can be set as attributes.
232
233 Arguments:
234 o sequence_file - The file to read the sequences for alignment from.
235 o command - The command used to run clustalw. This defaults to
236 just 'clustalw' (ie. assumes you have it on your path somewhere).
237
238 General attributes that can be set:
239 o is_quick - if set as 1, will use a fast algorithm to create
240 the alignment guide tree.
241 o allow_negative - allow negative values in the alignment matrix.
242
243 Multiple alignment attributes that can be set as attributes:
244 o gap_open_pen - Gap opening penalty
245 o gap_ext_pen - Gap extension penalty
246 o is_no_end_pen - A flag as to whether or not there should be a gap
247 separation penalty for the ends.
248 o gap_sep_range - The gap separation penalty range.
249 o is_no_pgap - A flag to turn off residue specific gaps
250 o is_no_hgap - A flag to turn off hydrophilic gaps
251 o h_gap_residues - A list of residues to count a hydrophilic
252 o max_div - A percent identity to use for delay (? - I don't undertand
253 this!)
254 o trans_weight - The weight to use for transitions
255 """
256 self.sequence_file = sequence_file
257 self.command = command
258
259 self.is_quick = None
260 self.allow_negative = None
261
262 self.gap_open_pen = None
263 self.gap_ext_pen = None
264 self.is_no_end_pen = None
265 self.gap_sep_range = None
266 self.is_no_pgap = None
267 self.is_no_hgap = None
268 self.h_gap_residues = []
269 self.max_div = None
270 self.trans_weight = None
271
272
273
274 self.output_file = None
275 self.output_type = None
276 self.output_order = None
277 self.change_case = None
278 self.add_seqnos = None
279
280
281 self.guide_tree = None
282 self.new_tree = None
283
284
285 self.protein_matrix = None
286 self.dna_matrix = None
287
288
289 self.type = None
290
292 """Write out the command line as a string."""
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337 cline = _escape_filename(self.command)
338 cline += ' -INFILE=%s' % _escape_filename(self.sequence_file)
339
340
341 if self.type:
342 cline += " -TYPE=%s" % self.type
343 if self.is_quick == 1:
344
345
346 cline += " -quicktree"
347 if self.allow_negative == 1:
348 cline += " -NEGATIVE"
349
350
351 if self.output_file:
352 cline += " -OUTFILE=%s" % _escape_filename(self.output_file)
353 if self.output_type:
354 cline += " -OUTPUT=%s" % self.output_type
355 if self.output_order:
356 cline += " -OUTORDER=%s" % self.output_order
357 if self.change_case:
358 cline += " -CASE=%s" % self.change_case
359 if self.add_seqnos:
360 cline += " -SEQNOS=%s" % self.add_seqnos
361 if self.new_tree:
362
363 cline += " -NEWTREE=%s -align" % _escape_filename(self.new_tree)
364
365
366 if self.guide_tree:
367 cline += " -USETREE=%s" % _escape_filename(self.guide_tree)
368 if self.protein_matrix:
369 cline += " -MATRIX=%s" % self.protein_matrix
370 if self.dna_matrix:
371 cline += " -DNAMATRIX=%s" % self.dna_matrix
372 if self.gap_open_pen:
373 cline += " -GAPOPEN=%s" % self.gap_open_pen
374 if self.gap_ext_pen:
375 cline += " -GAPEXT=%s" % self.gap_ext_pen
376 if self.is_no_end_pen == 1:
377 cline += " -ENDGAPS"
378 if self.gap_sep_range:
379 cline += " -GAPDIST=%s" % self.gap_sep_range
380 if self.is_no_pgap == 1:
381 cline += " -NOPGAP"
382 if self.is_no_hgap == 1:
383 cline += " -NOHGAP"
384 if len(self.h_gap_residues) != 0:
385
386 residue_list = ''
387 for residue in self.h_gap_residues:
388 residue_list = residue_list + residue
389 cline += " -HGAPRESIDUES=%s" % residue_list
390 if self.max_div:
391 cline += " -MAXDIV=%s" % self.max_div
392 if self.trans_weight:
393 cline += " -TRANSWEIGHT=%s" % self.trans_weight
394
395 return cline
396
397 - def set_output(self, output_file, output_type = None, output_order = None,
398 change_case = None, add_seqnos = None):
399 """Set the output parameters for the command line.
400 """
401 self.output_file = output_file
402
403 if output_type:
404 output_type = output_type.upper()
405 if output_type not in self.OUTPUT_TYPES:
406 raise ValueError("Invalid output type %s. Valid choices are %s"
407 % (output_type, self.OUTPUT_TYPES))
408 else:
409 self.output_type = output_type
410
411 if output_order:
412 output_order = output_order.upper()
413 if output_order not in self.OUTPUT_ORDER:
414 raise ValueError("Invalid output order %s. Valid choices are %s"
415 % (output_order, self.OUTPUT_ORDER))
416 else:
417 self.output_order = output_order
418
419 if change_case:
420 change_case = change_case.upper()
421 if output_type != "GDE":
422 raise ValueError("Change case only valid for GDE output.")
423 elif change_case not in self.CHANGE_CASE:
424 raise ValueError("Invalid change case %s. Valid choices are %s"
425 % (change_case, self.CHANGE_CASE))
426 else:
427 self.change_case = change_case
428
429 if add_seqnos:
430 add_seqnos = add_seqnos.upper()
431 if output_type:
432 raise ValueError("Add SeqNos only valid for CLUSTAL output.")
433 elif add_seqnos not in self.OUTPUT_SEQNOS:
434 raise ValueError("Invalid seqnos option %s. Valid choices: %s"
435 % (add_seqnos, self.OUTPUT_SEQNOS))
436 else:
437 self.add_seqnos = add_seqnos
438
440 """Provide a file to use as the guide tree for alignment.
441
442 Raises:
443 o IOError - If the tree_file doesn't exist."""
444 if not(os.path.exists(tree_file)):
445 raise IOError("Could not find the guide tree file %s." %
446 tree_file)
447 else:
448 self.guide_tree = tree_file
449
451 """Set the name of the guide tree file generated in the alignment.
452 """
453 self.new_tree = tree_file
454
456 """Set the type of protein matrix to use.
457
458 Protein matrix can be either one of the defined types (blosum, pam,
459 gonnet or id) or a file with your own defined matrix.
460 """
461 if protein_matrix.upper() in self.PROTEIN_MATRIX:
462 self.protein_matrix = protein_matrix.upper()
463 elif os.path.exists(protein_matrix):
464 self.protein_matrix = protein_matrix
465 else:
466 raise ValueError("Invalid matrix %s. Options are %s or a file." %
467 (protein_matrix.upper(), self.PROTEIN_MATRIX))
468
470 """Set the type of DNA matrix to use.
471
472 The dna_matrix can either be one of the defined types (iub or clustalw)
473 or a file with the matrix to use."""
474 if dna_matrix.upper() in self.DNA_MATRIX:
475 self.dna_matrix = dna_matrix.upper()
476 elif os.path.exists(dna_matrix):
477 self.dna_matrix = dna_matrix
478 else:
479 raise ValueError("Invalid matrix %s. Options are %s or a file." %
480 (dna_matrix, self.DNA_MATRIX))
481
483 """Set the type of residues within the file.
484
485 Clustal tries to guess whether the info is protein or DNA based on
486 the number of GATCs, but this can be wrong if you have a messed up
487 protein or DNA you are working with, so this allows you to set it
488 explicitly.
489 """
490 residue_type = residue_type.upper()
491 if residue_type in self.RESIDUE_TYPES:
492 self.type = residue_type
493 else:
494 raise ValueError("Invalid residue type %s. Valid choices are %s"
495 % (residue_type, self.RESIDUE_TYPES))
496
498 """Run the Bio.Clustalw module's doctests (PRIVATE).
499
500 This will try and locate the unit tests directory, and run the doctests
501 from there in order that the relative paths used in the examples work.
502 """
503 import doctest
504 import os
505 if os.path.isdir(os.path.join("..","..","Tests")):
506 print "Runing doctests..."
507 cur_dir = os.path.abspath(os.curdir)
508 os.chdir(os.path.join("..","..","Tests"))
509 doctest.testmod()
510 os.chdir(cur_dir)
511 del cur_dir
512 print "Done"
513
514 if __name__ == "__main__":
515 _test()
516