Package Bio :: Package AlignIO :: Module NexusIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.NexusIO

  1  # Copyright 2008-2010 by Peter Cock.  All rights reserved. 
  2  # 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6  """ 
  7  Bio.AlignIO support for the "nexus" file format. 
  8   
  9  You are expected to use this module via the Bio.AlignIO functions (or the 
 10  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 11   
 12  See also the Bio.Nexus module (which this code calls internally), 
 13  as this offers more than just accessing the alignment or its 
 14  sequences as SeqRecord objects. 
 15  """ 
 16   
 17  from Bio.Nexus import Nexus 
 18  from Bio.Align import MultipleSeqAlignment 
 19  from Bio.SeqRecord import SeqRecord 
 20  from Interfaces import AlignmentWriter 
 21  from Bio import Alphabet 
 22   
 23  #You can get a couple of example files here: 
 24  #http://www.molecularevolution.org/resources/fileformats/ 
 25       
 26  #This is a generator function! 
27 -def NexusIterator(handle, seq_count=None):
28 """Returns SeqRecord objects from a Nexus file. 29 30 Thus uses the Bio.Nexus module to do the hard work. 31 32 You are expected to call this function via Bio.SeqIO or Bio.AlignIO 33 (and not use it directly). 34 35 NOTE - We only expect ONE alignment matrix per Nexus file, 36 meaning this iterator will only yield one MultipleSeqAlignment. 37 """ 38 n = Nexus.Nexus(handle) 39 if not n.matrix: 40 #No alignment found 41 raise StopIteration 42 alignment = MultipleSeqAlignment(n.alphabet) 43 44 #Bio.Nexus deals with duplicated names by adding a '.copy' suffix. 45 #The original names and the modified names are kept in these two lists: 46 assert len(n.unaltered_taxlabels) == len(n.taxlabels) 47 48 if seq_count and seq_count != len(n.unaltered_taxlabels): 49 raise ValueError("Found %i sequences, but seq_count=%i" \ 50 % (len(n.unaltered_taxlabels), seq_count)) 51 52 for old_name, new_name in zip (n.unaltered_taxlabels, n.taxlabels): 53 assert new_name.startswith(old_name) 54 seq = n.matrix[new_name] #already a Seq object with the alphabet set 55 #ToDo - Can we extract any annotation too? 56 alignment.append(SeqRecord(seq, id=new_name, name=old_name, 57 description="")) 58 #All done 59 yield alignment
60
61 -class NexusWriter(AlignmentWriter):
62 """Nexus alignment writer. 63 64 Note that Nexus files are only expected to hold ONE alignment 65 matrix. 66 67 You are expected to call this class via the Bio.AlignIO.write() or 68 Bio.SeqIO.write() functions. 69 """
70 - def write_file(self, alignments):
71 """Use this to write an entire file containing the given alignments. 72 73 alignments - A list or iterator returning MultipleSeqAlignment objects. 74 This should hold ONE and only one alignment. 75 """ 76 align_iter = iter(alignments) #Could have been a list 77 try: 78 first_alignment = align_iter.next() 79 except StopIteration: 80 first_alignment = None 81 if first_alignment is None: 82 #Nothing to write! 83 return 0 84 85 #Check there is only one alignment... 86 try: 87 second_alignment = align_iter.next() 88 except StopIteration: 89 second_alignment = None 90 if second_alignment is not None: 91 raise ValueError("We can only write one Alignment to a Nexus file.") 92 93 #Good. Actually write the single alignment, 94 self.write_alignment(first_alignment) 95 return 1 #we only support writing one alignment!
96
97 - def write_alignment(self, alignment):
98 #Creates an empty Nexus object, adds the sequences, 99 #and then gets Nexus to prepare the output. 100 if len(alignment) == 0: 101 raise ValueError("Must have at least one sequence") 102 if alignment.get_alignment_length() == 0: 103 raise ValueError("Non-empty sequences are required") 104 minimal_record = "#NEXUS\nbegin data; dimensions ntax=0 nchar=0; " \ 105 + "format datatype=%s; end;" \ 106 % self._classify_alphabet_for_nexus(alignment._alphabet) 107 n = Nexus.Nexus(minimal_record) 108 n.alphabet = alignment._alphabet 109 for record in alignment: 110 n.add_sequence(record.id, record.seq.tostring()) 111 n.write_nexus_data(self.handle)
112
113 - def _classify_alphabet_for_nexus(self, alphabet):
114 """Returns 'protein', 'dna', 'rna' based on the alphabet (PRIVATE). 115 116 Raises an exception if this is not possible.""" 117 #Get the base alphabet (underneath any Gapped or StopCodon encoding) 118 a = Alphabet._get_base_alphabet(alphabet) 119 120 if not isinstance(a, Alphabet.Alphabet): 121 raise TypeError("Invalid alphabet") 122 elif isinstance(a, Alphabet.ProteinAlphabet): 123 return "protein" 124 elif isinstance(a, Alphabet.DNAAlphabet): 125 return "dna" 126 elif isinstance(a, Alphabet.RNAAlphabet): 127 return "rna" 128 else: 129 #Must be something like NucleotideAlphabet or 130 #just the generic Alphabet (default for fasta files) 131 raise ValueError("Need a DNA, RNA or Protein alphabet")
132 133 if __name__ == "__main__": 134 from StringIO import StringIO 135 print "Quick self test" 136 print 137 print "Repeated names without a TAXA block" 138 handle = StringIO("""#NEXUS 139 [TITLE: NoName] 140 141 begin data; 142 dimensions ntax=4 nchar=50; 143 format interleave datatype=protein gap=- symbols="FSTNKEYVQMCLAWPHDRIG"; 144 145 matrix 146 CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---- 147 ALEU_HORVU MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG 148 CATH_HUMAN ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK---- 149 CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---X 150 ; 151 end; 152 """) 153 for a in NexusIterator(handle): 154 print a 155 for r in a: 156 print repr(r.seq), r.name, r.id 157 print "Done" 158 159 print 160 print "Repeated names with a TAXA block" 161 handle = StringIO("""#NEXUS 162 [TITLE: NoName] 163 164 begin taxa 165 CYS1_DICDI 166 ALEU_HORVU 167 CATH_HUMAN 168 CYS1_DICDI; 169 end; 170 171 begin data; 172 dimensions ntax=4 nchar=50; 173 format interleave datatype=protein gap=- symbols="FSTNKEYVQMCLAWPHDRIG"; 174 175 matrix 176 CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---- 177 ALEU_HORVU MAHARVLLLA LAVLATAAVA VASSSSFADS NPIRPVTDRA ASTLESAVLG 178 CATH_HUMAN ------MWAT LPLLCAGAWL LGV------- -PVCGAAELS VNSLEK---- 179 CYS1_DICDI -----MKVIL LFVLAVFTVF VSS------- --------RG IPPEEQ---X 180 ; 181 end; 182 """) 183 for a in NexusIterator(handle): 184 print a 185 for r in a: 186 print repr(r.seq), r.name, r.id 187 print "Done" 188 print 189 print "Reading an empty file" 190 assert 0 == len(list(NexusIterator(StringIO()))) 191 print "Done" 192 print 193 print "Writing..." 194 195 handle = StringIO() 196 NexusWriter(handle).write_file([a]) 197 handle.seek(0) 198 print handle.read() 199 200 handle = StringIO() 201 try: 202 NexusWriter(handle).write_file([a,a]) 203 assert False, "Should have rejected more than one alignment!" 204 except ValueError: 205 pass 206