Package Bio :: Package AlignIO :: Module PhylipIO
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignIO.PhylipIO

  1  # Copyright 2006-2010 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """ 
  6  AlignIO support for the "phylip" format used in Joe Felsenstein's PHYLIP tools. 
  7   
  8  You are expected to use this module via the Bio.AlignIO functions (or the 
  9  Bio.SeqIO functions if you want to work directly with the gapped sequences). 
 10   
 11  Note 
 12  ==== 
 13  In TREE_PUZZLE (Schmidt et al. 2003) and PHYML (Guindon and Gascuel 2003) 
 14  a dot/period (".") in a sequence is interpreted as meaning the same 
 15  character as in the first sequence.  The PHYLIP 3.6 documentation says: 
 16   
 17     "a period was also previously allowed but it is no longer allowed, 
 18     because it sometimes is used in different senses in other programs" 
 19   
 20  At the time of writing, we do nothing special with a dot/period. 
 21  """ 
 22      
 23  from Bio.Alphabet import single_letter_alphabet 
 24  from Bio.Align import MultipleSeqAlignment 
 25  from Interfaces import AlignmentIterator, SequentialAlignmentWriter 
 26   
27 -class PhylipWriter(SequentialAlignmentWriter):
28 """Phylip alignment writer."""
29 - def write_alignment(self, alignment):
30 """Use this to write (another) single alignment to an open file. 31 32 This code will write interlaced alignments (when the sequences are 33 longer than 50 characters). 34 35 Note that record identifiers are strictly truncated at 10 characters. 36 37 For more information on the file format, please see: 38 http://evolution.genetics.washington.edu/phylip/doc/sequence.html 39 http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles 40 """ 41 truncate=10 42 handle = self.handle 43 44 if len(alignment)==0: 45 raise ValueError("Must have at least one sequence") 46 length_of_seqs = alignment.get_alignment_length() 47 for record in alignment: 48 if length_of_seqs != len(record.seq): 49 raise ValueError("Sequences must all be the same length") 50 if length_of_seqs <= 0: 51 raise ValueError("Non-empty sequences are required") 52 53 if len(alignment) > len(set([r.id[:truncate] for r in alignment])): 54 raise ValueError("Repeated identifier, possibly due to truncation") 55 56 57 # From experimentation, the use of tabs is not understood by the 58 # EMBOSS suite. The nature of the expected white space is not 59 # defined in the PHYLIP documentation, simply "These are in free 60 # format, separated by blanks". We'll use spaces to keep EMBOSS 61 # happy. 62 handle.write(" %i %s\n" % (len(alignment), length_of_seqs)) 63 block=0 64 while True: 65 for record in alignment: 66 if block==0: 67 #Write name (truncated/padded to 10 characters) 68 """ 69 Quoting the PHYLIP version 3.6 documentation: 70 71 The name should be ten characters in length, filled out to 72 the full ten characters by blanks if shorter. Any printable 73 ASCII/ISO character is allowed in the name, except for 74 parentheses ("(" and ")"), square brackets ("[" and "]"), 75 colon (":"), semicolon (";") and comma (","). If you forget 76 to extend the names to ten characters in length by blanks, 77 the program [i.e. PHYLIP] will get out of synchronization 78 with the contents of the data file, and an error message will 79 result. 80 81 Note that Tab characters count as only one character in the 82 species names. Their inclusion can cause trouble. 83 """ 84 name = record.id.strip() 85 #Either remove the banned characters, or map them to something 86 #else like an underscore "_" or pipe "|" character... 87 for char in "[](),": 88 name = name.replace(char,"") 89 for char in ":;": 90 name = name.replace(char,"|") 91 92 #Now truncate and right pad to expected length. 93 handle.write(name[:truncate].ljust(truncate)) 94 else: 95 #write 10 space indent 96 handle.write(" "*truncate) 97 #Write five chunks of ten letters per line... 98 for chunk in range(0,5): 99 i = block*50 + chunk*10 100 seq_segment = record.seq.tostring()[i:i+10] 101 #TODO - Force any gaps to be '-' character? Look at the alphabet... 102 #TODO - How to cope with '?' or '.' in the sequence? 103 handle.write(" %s" % seq_segment) 104 if i+10 > length_of_seqs : break 105 handle.write("\n") 106 block=block+1 107 if block*50 > length_of_seqs : break 108 handle.write("\n")
109
110 -class PhylipIterator(AlignmentIterator):
111 """Reads a Phylip alignment file returning a MultipleSeqAlignment iterator. 112 113 Record identifiers are limited to at most 10 characters. 114 115 It only copes with interlaced phylip files! Sequential files won't work 116 where the sequences are split over multiple lines. 117 118 For more information on the file format, please see: 119 http://evolution.genetics.washington.edu/phylip/doc/sequence.html 120 http://evolution.genetics.washington.edu/phylip/doc/main.html#inputfiles 121 """ 122
123 - def _is_header(self, line):
124 line = line.strip() 125 parts = filter(None, line.split()) 126 if len(parts)!=2: 127 return False # First line should have two integers 128 try: 129 number_of_seqs = int(parts[0]) 130 length_of_seqs = int(parts[1]) 131 return True 132 except ValueError: 133 return False # First line should have two integers
134
135 - def next(self):
136 handle = self.handle 137 138 try: 139 #Header we saved from when we were parsing 140 #the previous alignment. 141 line = self._header 142 del self._header 143 except AttributeError: 144 line = handle.readline() 145 146 if not line: return 147 line = line.strip() 148 parts = filter(None, line.split()) 149 if len(parts)!=2: 150 raise ValueError("First line should have two integers") 151 try: 152 number_of_seqs = int(parts[0]) 153 length_of_seqs = int(parts[1]) 154 except ValueError: 155 raise ValueError("First line should have two integers") 156 157 assert self._is_header(line) 158 159 if self.records_per_alignment is not None \ 160 and self.records_per_alignment != number_of_seqs: 161 raise ValueError("Found %i records in this alignment, told to expect %i" \ 162 % (number_of_seqs, self.records_per_alignment)) 163 164 ids = [] 165 seqs = [] 166 167 #Expects STRICT truncation/padding to 10 characters 168 #Does not require any white space between name and seq. 169 for i in range(0,number_of_seqs): 170 line = handle.readline().rstrip() 171 ids.append(line[:10].strip()) #first ten characters 172 seqs.append([line[10:].strip().replace(" ","")]) 173 174 #Look for further blocks 175 line="" 176 while True: 177 #Skip any blank lines between blocks... 178 while ""==line.strip(): 179 line = handle.readline() 180 if not line : break #end of file 181 if not line : break #end of file 182 183 if self._is_header(line): 184 #Looks like the start of a concatenated alignment 185 self._header = line 186 break 187 188 #print "New block..." 189 for i in range(0,number_of_seqs): 190 seqs[i].append(line.strip().replace(" ","")) 191 line = handle.readline() 192 if (not line) and i+1 < number_of_seqs: 193 raise ValueError("End of file mid-block") 194 if not line : break #end of file 195 196 alignment = MultipleSeqAlignment(self.alphabet) 197 for i in range(0,number_of_seqs): 198 seq = "".join(seqs[i]) 199 if len(seq)!=length_of_seqs: 200 raise ValueError("Sequence %i length %i, expected length %i" \ 201 % (i+1, len(seq), length_of_seqs)) 202 alignment.add_sequence(ids[i], seq) 203 204 record = alignment[-1] 205 assert ids[i] == record.id or ids[i] == record.description 206 record.id = ids[i] 207 record.name = ids[i] 208 record.description = ids[i] 209 return alignment
210 211 if __name__=="__main__": 212 print "Running short mini-test" 213 214 phylip_text=""" 8 286 215 V_Harveyi_ --MKNWIKVA VAAIA--LSA A--------- ---------T VQAATEVKVG 216 B_subtilis MKMKKWTVLV VAALLAVLSA CG-------- ----NGNSSS KEDDNVLHVG 217 B_subtilis MKKALLALFM VVSIAALAAC GAGNDNQSKD NAKDGDLWAS IKKKGVLTVG 218 YA80_HAEIN MKKLLFTTAL LTGAIAFSTF ---------- -SHAGEIADR VEKTKTLLVG 219 FLIY_ECOLI MKLAHLGRQA LMGVMAVALV AG---MSVKS FADEG-LLNK VKERGTLLVG 220 E_coli_Gln --MKSVLKVS LAALTLAFAV S--------- ---------S HAADKKLVVA 221 Deinococcu -MKKSLLSLK LSGLLVPSVL ALS------- -LSACSSPSS TLNQGTLKIA 222 HISJ_E_COL MKKLVLSLSL VLAFSSATAA F--------- ---------- AAIPQNIRIG 223 224 MSGRYFPFTF VKQ--DKLQG FEVDMWDEIG KRNDYKIEYV TANFSGLFGL 225 ATGQSYPFAY KEN--GKLTG FDVEVMEAVA KKIDMKLDWK LLEFSGLMGE 226 TEGTYEPFTY HDKDTDKLTG YDVEVITEVA KRLGLKVDFK ETQWGSMFAG 227 TEGTYAPFTF HDK-SGKLTG FDVEVIRKVA EKLGLKVEFK ETQWDAMYAG 228 LEGTYPPFSF QGD-DGKLTG FEVEFAQQLA KHLGVEASLK PTKWDGMLAS 229 TDTAFVPFEF KQG--DKYVG FDVDLWAAIA KELKLDYELK PMDFSGIIPA 230 MEGTYPPFTS KNE-QGELVG FDVDIAKAVA QKLNLKPEFV LTEWSGILAG 231 TDPTYAPFES KNS-QGELVG FDIDLAKELC KRINTQCTFV ENPLDALIPS 232 233 LETGRIDTIS NQITMTDARK AKYLFADPYV VDG-AQITVR KGNDSIQGVE 234 LQTGKLDTIS NQVAVTDERK ETYNFTKPYA YAG-TQIVVK KDNTDIKSVD 235 LNSKRFDVVA NQVG-KTDRE DKYDFSDKYT TSR-AVVVTK KDNNDIKSEA 236 LNAKRFDVIA NQTNPSPERL KKYSFTTPYN YSG-GVIVTK SSDNSIKSFE 237 LDSKRIDVVI NQVTISDERK KKYDFSTPYT ISGIQALVKK GNEGTIKTAD 238 LQTKNVDLAL AGITITDERK KAIDFSDGYY KSG-LLVMVK ANNNDVKSVK 239 LQANKYDVIV NQVGITPERQ NSIGFSQPYA YSRPEIIVAK NNTFNPQSLA 240 LKAKKIDAIM SSLSITEKRQ QEIAFTDKLY AADSRLVVAK NSDIQP-TVE 241 242 DLAGKTVAVN LGSNFEQLLR DYDKDGKINI KTYDT--GIE HDVALGRADA 243 DLKGKTVAAV LGSNHAKNLE SKDPDKKINI KTYETQEGTL KDVAYGRVDA 244 DVKGKTSAQS LTSNYNKLAT N----AGAKV EGVEGMAQAL QMIQQARVDM 245 DLKGRKSAQS ATSNWGKDAK A----AGAQI LVVDGLAQSL ELIKQGRAEA 246 DLKGKKVGVG LGTNYEEWLR QNV--QGVDV RTYDDDPTKY QDLRVGRIDA 247 DLDGKVVAVK SGTGSVDYAK AN--IKTKDL RQFPNIDNAY MELGTNRADA 248 DLKGKRVGST LGSNYEKQLI DTG---DIKI VTYPGAPEIL ADLVAGRIDA 249 SLKGKRVGVL QGTTQETFGN EHWAPKGIEI VSYQGQDNIY SDLTAGRIDA 250 251 FIMDRLSALE -LIKKT-GLP LQLAGEPFET I-----QNAW PFVDNEKGRK 252 YVNSRTVLIA -QIKKT-GLP LKLAGDPIVY E-----QVAF PFAKDDAHDK 253 TYNDKLAVLN -YLKTSGNKN VKIAFETGEP Q-----STYF TFRKGS--GE 254 TINDKLAVLD -YFKQHPNSG LKIAYDRGDK T-----PTAF AFLQGE--DA 255 ILVDRLAALD -LVKKT-NDT LAVTGEAFSR Q-----ESGV ALRKGN--ED 256 VLHDTPNILY -FIKTAGNGQ FKAVGDSLEA Q-----QYGI AFPKGS--DE 257 AYNDRLVVNY -IINDQ-KLP VRGAGQIGDA A-----PVGI ALKKGN--SA 258 AFQDEVAASE GFLKQPVGKD YKFGGPSVKD EKLFGVGTGM GLRKED--NE 259 260 LQAEVNKALA EMRADGTVEK ISVKWFGADI TK---- 261 LRKKVNKALD ELRKDGTLKK LSEKYFNEDI TVEQKH 262 VVDQVNKALK EMKEDGTLSK ISKKWFGEDV SK---- 263 LITKFNQVLE ALRQDGTLKQ ISIEWFGYDI TQ---- 264 LLKAVNDAIA EMQKDGTLQA LSEKWFGADV TK---- 265 LRDKVNGALK TLRENGTYNE IYKKWFGTEP K----- 266 LKDQIDKALT EMRSDGTFEK ISQKWFGQDV GQP--- 267 LREALNKAFA EMRADGTYEK LAKKYFDFDV YGG--- 268 """ 269 270 from cStringIO import StringIO 271 handle = StringIO(phylip_text) 272 count=0 273 for alignment in PhylipIterator(handle): 274 for record in alignment: 275 count=count+1 276 print record.id 277 #print record.seq.tostring() 278 assert count == 8 279 280 expected="""mkklvlslsl vlafssataa faaipqniri gtdptyapfe sknsqgelvg 281 fdidlakelc krintqctfv enpldalips lkakkidaim sslsitekrq qeiaftdkly 282 aadsrlvvak nsdiqptves lkgkrvgvlq gttqetfgne hwapkgieiv syqgqdniys 283 dltagridaafqdevaaseg flkqpvgkdy kfggpsvkde klfgvgtgmg lrkednelre 284 alnkafaemradgtyeklak kyfdfdvygg""".replace(" ","").replace("\n","").upper() 285 assert record.seq.tostring().replace("-","") == expected 286 287 #From here: 288 #http://atgc.lirmm.fr/phyml/usersguide.html 289 phylip_text2="""5 60 290 Tax1 CCATCTCACGGTCGGTACGATACACCTGCTTTTGGCAG 291 Tax2 CCATCTCACGGTCAGTAAGATACACCTGCTTTTGGCGG 292 Tax3 CCATCTCCCGCTCAGTAAGATACCCCTGCTGTTGGCGG 293 Tax4 TCATCTCATGGTCAATAAGATACTCCTGCTTTTGGCGG 294 Tax5 CCATCTCACGGTCGGTAAGATACACCTGCTTTTGGCGG 295 296 GAAATGGTCAATATTACAAGGT 297 GAAATGGTCAACATTAAAAGAT 298 GAAATCGTCAATATTAAAAGGT 299 GAAATGGTCAATCTTAAAAGGT 300 GAAATGGTCAATATTAAAAGGT""" 301 302 phylip_text3="""5 60 303 Tax1 CCATCTCACGGTCGGTACGATACACCTGCTTTTGGCAGGAAATGGTCAATATTACAAGGT 304 Tax2 CCATCTCACGGTCAGTAAGATACACCTGCTTTTGGCGGGAAATGGTCAACATTAAAAGAT 305 Tax3 CCATCTCCCGCTCAGTAAGATACCCCTGCTGTTGGCGGGAAATCGTCAATATTAAAAGGT 306 Tax4 TCATCTCATGGTCAATAAGATACTCCTGCTTTTGGCGGGAAATGGTCAATCTTAAAAGGT 307 Tax5 CCATCTCACGGTCGGTAAGATACACCTGCTTTTGGCGGGAAATGGTCAATATTAAAAGGT""" 308 309 handle = StringIO(phylip_text2) 310 list2 = list(PhylipIterator(handle)) 311 handle.close() 312 assert len(list2)==1 313 assert len(list2[0])==5 314 315 handle = StringIO(phylip_text3) 316 list3 = list(PhylipIterator(handle)) 317 handle.close() 318 assert len(list3)==1 319 assert len(list3[0])==5 320 321 for i in range(0,5): 322 list2[0][i].id == list3[0][i].id 323 list2[0][i].seq.tostring() == list3[0][i].seq.tostring() 324 325 #From here: 326 #http://evolution.genetics.washington.edu/phylip/doc/sequence.html 327 #Note the lack of any white space between names 2 and 3 and their seqs. 328 phylip_text4=""" 5 42 329 Turkey AAGCTNGGGC ATTTCAGGGT 330 Salmo gairAAGCCTTGGC AGTGCAGGGT 331 H. SapiensACCGGTTGGC CGTTCAGGGT 332 Chimp AAACCCTTGC CGTTACGCTT 333 Gorilla AAACCCTTGC CGGTACGCTT 334 335 GAGCCCGGGC AATACAGGGT AT 336 GAGCCGTGGC CGGGCACGGT AT 337 ACAGGTTGGC CGTTCAGGGT AA 338 AAACCGAGGC CGGGACACTC AT 339 AAACCATTGC CGGTACGCTT AA""" 340 341 #From here: 342 #http://evolution.genetics.washington.edu/phylip/doc/sequence.html 343 phylip_text5=""" 5 42 344 Turkey AAGCTNGGGC ATTTCAGGGT 345 GAGCCCGGGC AATACAGGGT AT 346 Salmo gairAAGCCTTGGC AGTGCAGGGT 347 GAGCCGTGGC CGGGCACGGT AT 348 H. SapiensACCGGTTGGC CGTTCAGGGT 349 ACAGGTTGGC CGTTCAGGGT AA 350 Chimp AAACCCTTGC CGTTACGCTT 351 AAACCGAGGC CGGGACACTC AT 352 Gorilla AAACCCTTGC CGGTACGCTT 353 AAACCATTGC CGGTACGCTT AA""" 354 355 phylip_text5a=""" 5 42 356 Turkey AAGCTNGGGC ATTTCAGGGT GAGCCCGGGC AATACAGGGT AT 357 Salmo gairAAGCCTTGGC AGTGCAGGGT GAGCCGTGGC CGGGCACGGT AT 358 H. SapiensACCGGTTGGC CGTTCAGGGT ACAGGTTGGC CGTTCAGGGT AA 359 Chimp AAACCCTTGC CGTTACGCTT AAACCGAGGC CGGGACACTC AT 360 Gorilla AAACCCTTGC CGGTACGCTT AAACCATTGC CGGTACGCTT AA""" 361 362 handle = StringIO(phylip_text4) 363 list4 = list(PhylipIterator(handle)) 364 handle.close() 365 assert len(list4)==1 366 assert len(list4[0])==5 367 368 handle = StringIO(phylip_text5) 369 try: 370 list5 = list(PhylipIterator(handle)) 371 assert len(list5)==1 372 assert len(list5[0])==5 373 print "That should have failed..." 374 except ValueError: 375 print "Evil multiline non-interlaced example failed as expected" 376 handle.close() 377 378 handle = StringIO(phylip_text5a) 379 list5 = list(PhylipIterator(handle)) 380 handle.close() 381 assert len(list5)==1 382 assert len(list4[0])==5 383 384 print "Concatenation" 385 handle = StringIO(phylip_text4 + "\n" + phylip_text4) 386 assert len(list(PhylipIterator(handle))) == 2 387 388 handle = StringIO(phylip_text3 + "\n" + phylip_text4 + "\n\n\n" + phylip_text) 389 assert len(list(PhylipIterator(handle))) == 3 390 391 print "OK" 392 393 print "Checking write/read" 394 handle = StringIO() 395 PhylipWriter(handle).write_file(list5) 396 handle.seek(0) 397 list6 = list(PhylipIterator(handle)) 398 assert len(list5) == len(list6) 399 for a1,a2 in zip(list5, list6): 400 assert len(a1) == len(a2) 401 for r1, r2 in zip(a1, a2): 402 assert r1.id == r2.id 403 assert r1.seq.tostring() == r2.seq.tostring() 404 print "Done" 405