Package Bio :: Package SeqIO :: Module _convert
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO._convert

  1  # Copyright 2009 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Optimised sequence conversion code (PRIVATE). 
  6   
  7  You are not expected to access this module, or any of its code, directly. This 
  8  is all handled internally by the Bio.SeqIO.convert(...) function which is the 
  9  public interface for this. 
 10   
 11  The idea here is rather while doing this will work: 
 12   
 13  from Bio import SeqIO 
 14  records = SeqIO.parse(in_handle, in_format) 
 15  count = SeqIO.write(records, out_handle, out_format) 
 16   
 17  it is shorter to write: 
 18   
 19  from Bio import SeqIO 
 20  count = SeqIO.convert(in_handle, in_format, out_handle, out_format) 
 21   
 22  Also, the convert function can take a number of special case optimisations. This 
 23  means that using Bio.SeqIO.convert() may be faster, as well as more convenient. 
 24  All these file format specific optimisations are handled by this (private) module. 
 25  """ 
 26   
 27  from Bio import SeqIO 
 28  #NOTE - Lots of lazy imports further on... 
 29   
30 -def _genbank_convert_fasta(in_handle, out_handle, alphabet=None):
31 """Fast GenBank to FASTA (PRIVATE).""" 32 #We don't need to parse the features... 33 from Bio.GenBank.Scanner import GenBankScanner 34 records = GenBankScanner().parse_records(in_handle, do_features=False) 35 #For FASTA output we can ignore the alphabet too 36 return SeqIO.write(records, out_handle, "fasta")
37
38 -def _embl_convert_fasta(in_handle, out_handle, alphabet=None):
39 """Fast EMBL to FASTA (PRIVATE).""" 40 #We don't need to parse the features... 41 from Bio.GenBank.Scanner import EmblScanner 42 records = EmblScanner().parse_records(in_handle, do_features=False) 43 #For FASTA output we can ignore the alphabet too 44 return SeqIO.write(records, out_handle, "fasta")
45
46 -def _fastq_generic(in_handle, out_handle, mapping):
47 """FASTQ helper function where can't have data loss by truncation (PRIVATE).""" 48 from Bio.SeqIO.QualityIO import FastqGeneralIterator 49 #For real speed, don't even make SeqRecord and Seq objects! 50 count = 0 51 null = chr(0) 52 for title, seq, old_qual in FastqGeneralIterator(in_handle): 53 count += 1 54 #map the qual... 55 qual = old_qual.translate(mapping) 56 if null in qual: 57 raise ValueError("Invalid character in quality string") 58 out_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) 59 return count
60
61 -def _fastq_generic2(in_handle, out_handle, mapping, truncate_char, truncate_msg):
62 """FASTQ helper function where there could be data loss by truncation (PRIVATE).""" 63 from Bio.SeqIO.QualityIO import FastqGeneralIterator 64 #For real speed, don't even make SeqRecord and Seq objects! 65 count = 0 66 null = chr(0) 67 for title, seq, old_qual in FastqGeneralIterator(in_handle): 68 count += 1 69 #map the qual... 70 qual = old_qual.translate(mapping) 71 if null in qual: 72 raise ValueError("Invalid character in quality string") 73 if truncate_char in qual: 74 qual = qual.replace(truncate_char, chr(126)) 75 import warnings 76 warnings.warn(truncate_msg) 77 out_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual)) 78 return count
79
80 -def _fastq_sanger_convert_fastq_sanger(in_handle, out_handle, alphabet=None):
81 """Fast Sanger FASTQ to Sanger FASTQ conversion (PRIVATE). 82 83 Useful for removing line wrapping and the redundant second identifier 84 on the plus lines. Will check also check the quality string is valid. 85 86 Avoids creating SeqRecord and Seq objects in order to speed up this 87 conversion. 88 """ 89 #Map unexpected chars to null 90 mapping = "".join([chr(0) for ascii in range(0, 33)] \ 91 +[chr(ascii) for ascii in range(33, 127)] \ 92 +[chr(0) for ascii in range(127, 256)]) 93 assert len(mapping)==256 94 return _fastq_generic(in_handle, out_handle, mapping)
95
96 -def _fastq_solexa_convert_fastq_solexa(in_handle, out_handle, alphabet=None):
97 """Fast Solexa FASTQ to Solexa FASTQ conversion (PRIVATE). 98 99 Useful for removing line wrapping and the redundant second identifier 100 on the plus lines. Will check also check the quality string is valid. 101 Avoids creating SeqRecord and Seq objects in order to speed up this 102 conversion. 103 """ 104 #Map unexpected chars to null 105 mapping = "".join([chr(0) for ascii in range(0, 59)] \ 106 +[chr(ascii) for ascii in range(59, 127)] \ 107 +[chr(0) for ascii in range(127, 256)]) 108 assert len(mapping)==256 109 return _fastq_generic(in_handle, out_handle, mapping)
110
111 -def _fastq_illumina_convert_fastq_illumina(in_handle, out_handle, alphabet=None):
112 """Fast Illumina 1.3+ FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE). 113 114 Useful for removing line wrapping and the redundant second identifier 115 on the plus lines. Will check also check the quality string is valid. 116 Avoids creating SeqRecord and Seq objects in order to speed up this 117 conversion. 118 """ 119 #Map unexpected chars to null 120 mapping = "".join([chr(0) for ascii in range(0, 64)] \ 121 +[chr(ascii) for ascii in range(64,127)] \ 122 +[chr(0) for ascii in range(127,256)]) 123 assert len(mapping)==256 124 return _fastq_generic(in_handle, out_handle, mapping)
125
126 -def _fastq_illumina_convert_fastq_sanger(in_handle, out_handle, alphabet=None):
127 """Fast Illumina 1.3+ FASTQ to Sanger FASTQ conversion (PRIVATE). 128 129 Avoids creating SeqRecord and Seq objects in order to speed up this 130 conversion. 131 """ 132 #Map unexpected chars to null 133 mapping = "".join([chr(0) for ascii in range(0, 64)] \ 134 +[chr(33+q) for q in range(0, 62+1)] \ 135 +[chr(0) for ascii in range(127, 256)]) 136 assert len(mapping)==256 137 return _fastq_generic(in_handle, out_handle, mapping)
138
139 -def _fastq_sanger_convert_fastq_illumina(in_handle, out_handle, alphabet=None):
140 """Fast Sanger FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE). 141 142 Avoids creating SeqRecord and Seq objects in order to speed up this 143 conversion. Will issue a warning if the scores had to be truncated at 62 144 (maximum possible in the Illumina 1.3+ FASTQ format) 145 """ 146 #Map unexpected chars to null 147 trunc_char = chr(1) 148 mapping = "".join([chr(0) for ascii in range(0, 33)] \ 149 +[chr(64+q) for q in range(0, 62+1) ] \ 150 +[trunc_char for ascii in range(96,127)] \ 151 +[chr(0) for ascii in range(127, 256)]) 152 assert len(mapping)==256 153 return _fastq_generic2(in_handle, out_handle, mapping, trunc_char, 154 "Data loss - max PHRED quality 62 in Illumina 1.3+ FASTQ")
155
156 -def _fastq_solexa_convert_fastq_sanger(in_handle, out_handle, alphabet=None):
157 """Fast Solexa FASTQ to Sanger FASTQ conversion (PRIVATE). 158 159 Avoids creating SeqRecord and Seq objects in order to speed up this 160 conversion. 161 """ 162 #Map unexpected chars to null 163 from Bio.SeqIO.QualityIO import phred_quality_from_solexa 164 mapping = "".join([chr(0) for ascii in range(0, 59)] \ 165 +[chr(33+int(round(phred_quality_from_solexa(q)))) \ 166 for q in range(-5, 62+1)]\ 167 +[chr(0) for ascii in range(127, 256)]) 168 assert len(mapping)==256 169 return _fastq_generic(in_handle, out_handle, mapping)
170
171 -def _fastq_sanger_convert_fastq_solexa(in_handle, out_handle, alphabet=None):
172 """Fast Sanger FASTQ to Solexa FASTQ conversion (PRIVATE). 173 174 Avoids creating SeqRecord and Seq objects in order to speed up this 175 conversion. Will issue a warning if the scores had to be truncated at 62 176 (maximum possible in the Solexa FASTQ format) 177 """ 178 #Map unexpected chars to null 179 from Bio.SeqIO.QualityIO import solexa_quality_from_phred 180 trunc_char = chr(1) 181 mapping = "".join([chr(0) for ascii in range(0, 33)] \ 182 +[chr(64+int(round(solexa_quality_from_phred(q)))) \ 183 for q in range(0, 62+1)] \ 184 +[trunc_char for ascii in range(96, 127)] \ 185 +[chr(0) for ascii in range(127, 256)]) 186 assert len(mapping)==256 187 return _fastq_generic2(in_handle, out_handle, mapping, trunc_char, 188 "Data loss - max Solexa quality 62 in Solexa FASTQ")
189 190
191 -def _fastq_solexa_convert_fastq_illumina(in_handle, out_handle, alphabet=None):
192 """Fast Solexa FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE). 193 194 Avoids creating SeqRecord and Seq objects in order to speed up this 195 conversion. 196 """ 197 #Map unexpected chars to null 198 from Bio.SeqIO.QualityIO import phred_quality_from_solexa 199 mapping = "".join([chr(0) for ascii in range(0, 59)] \ 200 +[chr(64+int(round(phred_quality_from_solexa(q)))) \ 201 for q in range(-5, 62+1)]\ 202 +[chr(0) for ascii in range(127, 256)]) 203 assert len(mapping)==256 204 return _fastq_generic(in_handle, out_handle, mapping)
205
206 -def _fastq_illumina_convert_fastq_solexa(in_handle, out_handle, alphabet=None):
207 """Fast Illumina 1.3+ FASTQ to Solexa FASTQ conversion (PRIVATE). 208 209 Avoids creating SeqRecord and Seq objects in order to speed up this 210 conversion. 211 """ 212 #Map unexpected chars to null 213 from Bio.SeqIO.QualityIO import solexa_quality_from_phred 214 trunc_char = chr(1) 215 mapping = "".join([chr(0) for ascii in range(0, 64)] \ 216 +[chr(64+int(round(solexa_quality_from_phred(q)))) \ 217 for q in range(0, 62+1)] \ 218 +[chr(0) for ascii in range(127, 256)]) 219 assert len(mapping)==256 220 return _fastq_generic(in_handle, out_handle, mapping)
221 222
223 -def _fastq_convert_fasta(in_handle, out_handle, alphabet=None):
224 """Fast FASTQ to FASTA conversion (PRIVATE). 225 226 Avoids dealing with the FASTQ quality encoding, and creating SeqRecord and 227 Seq objects in order to speed up this conversion. 228 229 NOTE - This does NOT check the characters used in the FASTQ quality string 230 are valid! 231 """ 232 from Bio.SeqIO.QualityIO import FastqGeneralIterator 233 #For real speed, don't even make SeqRecord and Seq objects! 234 count = 0 235 for title, seq, qual in FastqGeneralIterator(in_handle): 236 count += 1 237 out_handle.write(">%s\n" % title) 238 #Do line wrapping 239 for i in range(0, len(seq), 60): 240 out_handle.write(seq[i:i+60] + "\n") 241 return count
242
243 -def _fastq_convert_tab(in_handle, out_handle, alphabet=None):
244 """Fast FASTQ to simple tabbed conversion (PRIVATE). 245 246 Avoids dealing with the FASTQ quality encoding, and creating SeqRecord and 247 Seq objects in order to speed up this conversion. 248 249 NOTE - This does NOT check the characters used in the FASTQ quality string 250 are valid! 251 """ 252 from Bio.SeqIO.QualityIO import FastqGeneralIterator 253 #For real speed, don't even make SeqRecord and Seq objects! 254 count = 0 255 for title, seq, qual in FastqGeneralIterator(in_handle): 256 count += 1 257 out_handle.write("%s\t%s\n" % (title.split(None, 1)[0], seq)) 258 return count
259 260 #TODO? - Handling aliases explicitly would let us shorten this list: 261 _converter = { 262 ("genbank", "fasta") : _genbank_convert_fasta, 263 ("gb", "fasta") : _genbank_convert_fasta, 264 ("embl", "fasta") : _embl_convert_fasta, 265 ("fastq", "fasta") : _fastq_convert_fasta, 266 ("fastq-sanger", "fasta") : _fastq_convert_fasta, 267 ("fastq-solexa", "fasta") : _fastq_convert_fasta, 268 ("fastq-illumina", "fasta") : _fastq_convert_fasta, 269 ("fastq", "tab") : _fastq_convert_tab, 270 ("fastq-sanger", "tab") : _fastq_convert_tab, 271 ("fastq-solexa", "tab") : _fastq_convert_tab, 272 ("fastq-illumina", "tab") : _fastq_convert_tab, 273 ("fastq", "fastq") : _fastq_sanger_convert_fastq_sanger, 274 ("fastq-sanger", "fastq") : _fastq_sanger_convert_fastq_sanger, 275 ("fastq-solexa", "fastq") : _fastq_solexa_convert_fastq_sanger, 276 ("fastq-illumina", "fastq") : _fastq_illumina_convert_fastq_sanger, 277 ("fastq", "fastq-sanger") : _fastq_sanger_convert_fastq_sanger, 278 ("fastq-sanger", "fastq-sanger") : _fastq_sanger_convert_fastq_sanger, 279 ("fastq-solexa", "fastq-sanger") : _fastq_solexa_convert_fastq_sanger, 280 ("fastq-illumina", "fastq-sanger") : _fastq_illumina_convert_fastq_sanger, 281 ("fastq", "fastq-solexa") : _fastq_sanger_convert_fastq_solexa, 282 ("fastq-sanger", "fastq-solexa") : _fastq_sanger_convert_fastq_solexa, 283 ("fastq-solexa", "fastq-solexa") : _fastq_solexa_convert_fastq_solexa, 284 ("fastq-illumina", "fastq-solexa") : _fastq_illumina_convert_fastq_solexa, 285 ("fastq", "fastq-illumina") : _fastq_sanger_convert_fastq_illumina, 286 ("fastq-sanger", "fastq-illumina") : _fastq_sanger_convert_fastq_illumina, 287 ("fastq-solexa", "fastq-illumina") : _fastq_solexa_convert_fastq_illumina, 288 ("fastq-illumina", "fastq-illumina") : _fastq_illumina_convert_fastq_illumina, 289 } 290
291 -def _handle_convert(in_handle, in_format, out_handle, out_format, alphabet=None):
292 """SeqIO conversion function (PRIVATE).""" 293 try: 294 f = _converter[(in_format, out_format)] 295 except KeyError: 296 f = None 297 if f: 298 return f(in_handle, out_handle, alphabet) 299 else: 300 records = SeqIO.parse(in_handle, in_format, alphabet) 301 return SeqIO.write(records, out_handle, out_format)
302