1
2
3
4
5 """Optimised sequence conversion code (PRIVATE).
6
7 You are not expected to access this module, or any of its code, directly. This
8 is all handled internally by the Bio.SeqIO.convert(...) function which is the
9 public interface for this.
10
11 The idea here is rather while doing this will work:
12
13 from Bio import SeqIO
14 records = SeqIO.parse(in_handle, in_format)
15 count = SeqIO.write(records, out_handle, out_format)
16
17 it is shorter to write:
18
19 from Bio import SeqIO
20 count = SeqIO.convert(in_handle, in_format, out_handle, out_format)
21
22 Also, the convert function can take a number of special case optimisations. This
23 means that using Bio.SeqIO.convert() may be faster, as well as more convenient.
24 All these file format specific optimisations are handled by this (private) module.
25 """
26
27 from Bio import SeqIO
28
29
37
45
60
61 -def _fastq_generic2(in_handle, out_handle, mapping, truncate_char, truncate_msg):
62 """FASTQ helper function where there could be data loss by truncation (PRIVATE)."""
63 from Bio.SeqIO.QualityIO import FastqGeneralIterator
64
65 count = 0
66 null = chr(0)
67 for title, seq, old_qual in FastqGeneralIterator(in_handle):
68 count += 1
69
70 qual = old_qual.translate(mapping)
71 if null in qual:
72 raise ValueError("Invalid character in quality string")
73 if truncate_char in qual:
74 qual = qual.replace(truncate_char, chr(126))
75 import warnings
76 warnings.warn(truncate_msg)
77 out_handle.write("@%s\n%s\n+\n%s\n" % (title, seq, qual))
78 return count
79
81 """Fast Sanger FASTQ to Sanger FASTQ conversion (PRIVATE).
82
83 Useful for removing line wrapping and the redundant second identifier
84 on the plus lines. Will check also check the quality string is valid.
85
86 Avoids creating SeqRecord and Seq objects in order to speed up this
87 conversion.
88 """
89
90 mapping = "".join([chr(0) for ascii in range(0, 33)] \
91 +[chr(ascii) for ascii in range(33, 127)] \
92 +[chr(0) for ascii in range(127, 256)])
93 assert len(mapping)==256
94 return _fastq_generic(in_handle, out_handle, mapping)
95
97 """Fast Solexa FASTQ to Solexa FASTQ conversion (PRIVATE).
98
99 Useful for removing line wrapping and the redundant second identifier
100 on the plus lines. Will check also check the quality string is valid.
101 Avoids creating SeqRecord and Seq objects in order to speed up this
102 conversion.
103 """
104
105 mapping = "".join([chr(0) for ascii in range(0, 59)] \
106 +[chr(ascii) for ascii in range(59, 127)] \
107 +[chr(0) for ascii in range(127, 256)])
108 assert len(mapping)==256
109 return _fastq_generic(in_handle, out_handle, mapping)
110
112 """Fast Illumina 1.3+ FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE).
113
114 Useful for removing line wrapping and the redundant second identifier
115 on the plus lines. Will check also check the quality string is valid.
116 Avoids creating SeqRecord and Seq objects in order to speed up this
117 conversion.
118 """
119
120 mapping = "".join([chr(0) for ascii in range(0, 64)] \
121 +[chr(ascii) for ascii in range(64,127)] \
122 +[chr(0) for ascii in range(127,256)])
123 assert len(mapping)==256
124 return _fastq_generic(in_handle, out_handle, mapping)
125
127 """Fast Illumina 1.3+ FASTQ to Sanger FASTQ conversion (PRIVATE).
128
129 Avoids creating SeqRecord and Seq objects in order to speed up this
130 conversion.
131 """
132
133 mapping = "".join([chr(0) for ascii in range(0, 64)] \
134 +[chr(33+q) for q in range(0, 62+1)] \
135 +[chr(0) for ascii in range(127, 256)])
136 assert len(mapping)==256
137 return _fastq_generic(in_handle, out_handle, mapping)
138
140 """Fast Sanger FASTQ to Illumina 1.3+ FASTQ conversion (PRIVATE).
141
142 Avoids creating SeqRecord and Seq objects in order to speed up this
143 conversion. Will issue a warning if the scores had to be truncated at 62
144 (maximum possible in the Illumina 1.3+ FASTQ format)
145 """
146
147 trunc_char = chr(1)
148 mapping = "".join([chr(0) for ascii in range(0, 33)] \
149 +[chr(64+q) for q in range(0, 62+1) ] \
150 +[trunc_char for ascii in range(96,127)] \
151 +[chr(0) for ascii in range(127, 256)])
152 assert len(mapping)==256
153 return _fastq_generic2(in_handle, out_handle, mapping, trunc_char,
154 "Data loss - max PHRED quality 62 in Illumina 1.3+ FASTQ")
155
170
172 """Fast Sanger FASTQ to Solexa FASTQ conversion (PRIVATE).
173
174 Avoids creating SeqRecord and Seq objects in order to speed up this
175 conversion. Will issue a warning if the scores had to be truncated at 62
176 (maximum possible in the Solexa FASTQ format)
177 """
178
179 from Bio.SeqIO.QualityIO import solexa_quality_from_phred
180 trunc_char = chr(1)
181 mapping = "".join([chr(0) for ascii in range(0, 33)] \
182 +[chr(64+int(round(solexa_quality_from_phred(q)))) \
183 for q in range(0, 62+1)] \
184 +[trunc_char for ascii in range(96, 127)] \
185 +[chr(0) for ascii in range(127, 256)])
186 assert len(mapping)==256
187 return _fastq_generic2(in_handle, out_handle, mapping, trunc_char,
188 "Data loss - max Solexa quality 62 in Solexa FASTQ")
189
190
205
221
222
224 """Fast FASTQ to FASTA conversion (PRIVATE).
225
226 Avoids dealing with the FASTQ quality encoding, and creating SeqRecord and
227 Seq objects in order to speed up this conversion.
228
229 NOTE - This does NOT check the characters used in the FASTQ quality string
230 are valid!
231 """
232 from Bio.SeqIO.QualityIO import FastqGeneralIterator
233
234 count = 0
235 for title, seq, qual in FastqGeneralIterator(in_handle):
236 count += 1
237 out_handle.write(">%s\n" % title)
238
239 for i in range(0, len(seq), 60):
240 out_handle.write(seq[i:i+60] + "\n")
241 return count
242
244 """Fast FASTQ to simple tabbed conversion (PRIVATE).
245
246 Avoids dealing with the FASTQ quality encoding, and creating SeqRecord and
247 Seq objects in order to speed up this conversion.
248
249 NOTE - This does NOT check the characters used in the FASTQ quality string
250 are valid!
251 """
252 from Bio.SeqIO.QualityIO import FastqGeneralIterator
253
254 count = 0
255 for title, seq, qual in FastqGeneralIterator(in_handle):
256 count += 1
257 out_handle.write("%s\t%s\n" % (title.split(None, 1)[0], seq))
258 return count
259
260
261 _converter = {
262 ("genbank", "fasta") : _genbank_convert_fasta,
263 ("gb", "fasta") : _genbank_convert_fasta,
264 ("embl", "fasta") : _embl_convert_fasta,
265 ("fastq", "fasta") : _fastq_convert_fasta,
266 ("fastq-sanger", "fasta") : _fastq_convert_fasta,
267 ("fastq-solexa", "fasta") : _fastq_convert_fasta,
268 ("fastq-illumina", "fasta") : _fastq_convert_fasta,
269 ("fastq", "tab") : _fastq_convert_tab,
270 ("fastq-sanger", "tab") : _fastq_convert_tab,
271 ("fastq-solexa", "tab") : _fastq_convert_tab,
272 ("fastq-illumina", "tab") : _fastq_convert_tab,
273 ("fastq", "fastq") : _fastq_sanger_convert_fastq_sanger,
274 ("fastq-sanger", "fastq") : _fastq_sanger_convert_fastq_sanger,
275 ("fastq-solexa", "fastq") : _fastq_solexa_convert_fastq_sanger,
276 ("fastq-illumina", "fastq") : _fastq_illumina_convert_fastq_sanger,
277 ("fastq", "fastq-sanger") : _fastq_sanger_convert_fastq_sanger,
278 ("fastq-sanger", "fastq-sanger") : _fastq_sanger_convert_fastq_sanger,
279 ("fastq-solexa", "fastq-sanger") : _fastq_solexa_convert_fastq_sanger,
280 ("fastq-illumina", "fastq-sanger") : _fastq_illumina_convert_fastq_sanger,
281 ("fastq", "fastq-solexa") : _fastq_sanger_convert_fastq_solexa,
282 ("fastq-sanger", "fastq-solexa") : _fastq_sanger_convert_fastq_solexa,
283 ("fastq-solexa", "fastq-solexa") : _fastq_solexa_convert_fastq_solexa,
284 ("fastq-illumina", "fastq-solexa") : _fastq_illumina_convert_fastq_solexa,
285 ("fastq", "fastq-illumina") : _fastq_sanger_convert_fastq_illumina,
286 ("fastq-sanger", "fastq-illumina") : _fastq_sanger_convert_fastq_illumina,
287 ("fastq-solexa", "fastq-illumina") : _fastq_solexa_convert_fastq_illumina,
288 ("fastq-illumina", "fastq-illumina") : _fastq_illumina_convert_fastq_illumina,
289 }
290
291 -def _handle_convert(in_handle, in_format, out_handle, out_format, alphabet=None):
292 """SeqIO conversion function (PRIVATE)."""
293 try:
294 f = _converter[(in_format, out_format)]
295 except KeyError:
296 f = None
297 if f:
298 return f(in_handle, out_handle, alphabet)
299 else:
300 records = SeqIO.parse(in_handle, in_format, alphabet)
301 return SeqIO.write(records, out_handle, out_format)
302