1
2
3
4
5
6
7 """Bio.SeqIO support for the binary Standard Flowgram Format (SFF) file format.
8
9 SFF was designed by 454 Life Sciences (Roche), the Whitehead Institute for
10 Biomedical Research and the Wellcome Trust Sanger Institute. You are expected
11 to use this module via the Bio.SeqIO functions under the format name "sff".
12
13 For example, to iterate over the records in an SFF file,
14
15 >>> from Bio import SeqIO
16 >>> for record in SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff"):
17 ... print record.id, len(record), record.seq[:20]+"..."
18 E3MFGYR02JWQ7T 265 tcagGGTCTACATGTTGGTT...
19 E3MFGYR02JA6IL 271 tcagTTTTTTTTGGAAAGGA...
20 E3MFGYR02JHD4H 310 tcagAAAGACAAGTGGTATC...
21 E3MFGYR02GFKUC 299 tcagCGGCCGGGCCTCTCAT...
22 E3MFGYR02FTGED 281 tcagTGGTAATGGGGGGAAA...
23 E3MFGYR02FR9G7 261 tcagCTCCGTAAGAAGGTGC...
24 E3MFGYR02GAZMS 278 tcagAAAGAAGTAAGGTAAA...
25 E3MFGYR02HHZ8O 221 tcagACTTTCTTCTTTACCG...
26 E3MFGYR02GPGB1 269 tcagAAGCAGTGGTATCAAC...
27 E3MFGYR02F7Z7G 219 tcagAATCATCCACTTTTTA...
28
29 Each SeqRecord object will contain all the annotation from the SFF file,
30 including the PHRED quality scores.
31
32 >>> print record.id, len(record)
33 E3MFGYR02F7Z7G 219
34 >>> print record.seq[:10], "..."
35 tcagAATCAT ...
36 >>> print record.letter_annotations["phred_quality"][:10], "..."
37 [22, 21, 23, 28, 26, 15, 12, 21, 28, 21] ...
38
39 Notice that the sequence is given in mixed case, the central upper case region
40 corresponds to the trimmed sequence. This matches the output of the Roche
41 tools (and the 3rd party tool sff_extract) for SFF to FASTA.
42
43 >>> print record.annotations["clip_qual_left"]
44 4
45 >>> print record.annotations["clip_qual_right"]
46 134
47 >>> print record.seq[:4]
48 tcag
49 >>> print record.seq[4:20], "...", record.seq[120:134]
50 AATCATCCACTTTTTA ... CAAAACACAAACAG
51 >>> print record.seq[134:]
52 atcttatcaacaaaactcaaagttcctaactgagacacgcaacaggggataagacaaggcacacaggggataggnnnnnnnnnnn
53
54 The annotations dictionary also contains any adapter clip positions
55 (usually zero), and information about the flows. e.g.
56
57 >>> print record.annotations["flow_key"]
58 TCAG
59 >>> print record.annotations["flow_values"][:10], "..."
60 (83, 1, 128, 7, 4, 84, 6, 106, 3, 172) ...
61 >>> print len(record.annotations["flow_values"])
62 400
63 >>> print record.annotations["flow_index"][:10], "..."
64 (1, 2, 3, 2, 2, 0, 3, 2, 3, 3) ...
65 >>> print len(record.annotations["flow_index"])
66 219
67
68 As a convenience method, you can read the file with SeqIO format name "sff-trim"
69 instead of "sff" to get just the trimmed sequences (without any annotation
70 except for the PHRED quality scores):
71
72 >>> from Bio import SeqIO
73 >>> for record in SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff-trim"):
74 ... print record.id, len(record), record.seq[:20]+"..."
75 E3MFGYR02JWQ7T 260 GGTCTACATGTTGGTTAACC...
76 E3MFGYR02JA6IL 265 TTTTTTTTGGAAAGGAAAAC...
77 E3MFGYR02JHD4H 292 AAAGACAAGTGGTATCAACG...
78 E3MFGYR02GFKUC 295 CGGCCGGGCCTCTCATCGGT...
79 E3MFGYR02FTGED 277 TGGTAATGGGGGGAAATTTA...
80 E3MFGYR02FR9G7 256 CTCCGTAAGAAGGTGCTGCC...
81 E3MFGYR02GAZMS 271 AAAGAAGTAAGGTAAATAAC...
82 E3MFGYR02HHZ8O 150 ACTTTCTTCTTTACCGTAAC...
83 E3MFGYR02GPGB1 221 AAGCAGTGGTATCAACGCAG...
84 E3MFGYR02F7Z7G 130 AATCATCCACTTTTTAACGT...
85
86 Looking at the final record in more detail, note how this differs to the
87 example above:
88
89 >>> print record.id, len(record)
90 E3MFGYR02F7Z7G 130
91 >>> print record.seq[:10], "..."
92 AATCATCCAC ...
93 >>> print record.letter_annotations["phred_quality"][:10], "..."
94 [26, 15, 12, 21, 28, 21, 36, 28, 27, 27] ...
95 >>> print record.annotations
96 {}
97
98 You might use the Bio.SeqIO.convert() function to convert the (trimmed) SFF
99 reads into a FASTQ file (or a FASTA file and a QUAL file), e.g.
100
101 >>> from Bio import SeqIO
102 >>> from StringIO import StringIO
103 >>> out_handle = StringIO()
104 >>> count = SeqIO.convert("Roche/E3MFGYR02_random_10_reads.sff", "sff",
105 ... out_handle, "fastq")
106 >>> print "Converted %i records" % count
107 Converted 10 records
108
109 The output FASTQ file would start like this:
110
111 >>> print "%s..." % out_handle.getvalue()[:50]
112 @E3MFGYR02JWQ7T
113 tcagGGTCTACATGTTGGTTAACCCGTACTGATT...
114
115 Bio.SeqIO.index() provides memory efficient random access to the reads in an
116 SFF file by name. SFF files can include an index within the file, which can
117 be read in making this very fast. If the index is missing (or in a format not
118 yet supported in Biopython) the file is indexed by scanning all the reads -
119 which is a little slower. For example,
120
121 >>> from Bio import SeqIO
122 >>> reads = SeqIO.index("Roche/E3MFGYR02_random_10_reads.sff", "sff")
123 >>> record = reads["E3MFGYR02JHD4H"]
124 >>> print record.id, len(record), record.seq[:20]+"..."
125 E3MFGYR02JHD4H 310 tcagAAAGACAAGTGGTATC...
126
127 Or, using the trimmed reads:
128
129 >>> from Bio import SeqIO
130 >>> reads = SeqIO.index("Roche/E3MFGYR02_random_10_reads.sff", "sff-trim")
131 >>> record = reads["E3MFGYR02JHD4H"]
132 >>> print record.id, len(record), record.seq[:20]+"..."
133 E3MFGYR02JHD4H 292 AAAGACAAGTGGTATCAACG...
134
135 You can also use the Bio.SeqIO.write() function with the "sff" format. Note
136 that this requires all the flow information etc, and thus is probably only
137 useful for SeqRecord objects originally from reading another SFF file (and
138 not the trimmed SeqRecord objects from parsing an SFF file as "sff-trim").
139
140 As an example, let's pretend this example SFF file represents some DNA which
141 was pre-amplified with a PCR primers AAAGANNNNN. The following script would
142 produce a sub-file containing all those reads whose post-quality clipping
143 region (i.e. the sequence after trimming) starts with AAAGA exactly (the non-
144 degenerate bit of this pretend primer):
145
146 >>> from Bio import SeqIO
147 >>> records = (record for record in \
148 SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff","sff") \
149 if record.seq[record.annotations["clip_qual_left"]:].startswith("AAAGA"))
150 >>> count = SeqIO.write(records, "temp_filtered.sff", "sff")
151 >>> print "Selected %i records" % count
152 Selected 2 records
153
154 Of course, for an assembly you would probably want to remove these primers.
155 If you want FASTA or FASTQ output, you could just slice the SeqRecord. However,
156 if you want SFF output we have to preserve all the flow information - the trick
157 is just to adjust the left clip position!
158
159 >>> from Bio import SeqIO
160 >>> def filter_and_trim(records, primer):
161 ... for record in records:
162 ... if record.seq[record.annotations["clip_qual_left"]:].startswith(primer):
163 ... record.annotations["clip_qual_left"] += len(primer)
164 ... yield record
165 >>> records = SeqIO.parse("Roche/E3MFGYR02_random_10_reads.sff", "sff")
166 >>> count = SeqIO.write(filter_and_trim(records,"AAAGA"),
167 ... "temp_filtered.sff", "sff")
168 >>> print "Selected %i records" % count
169 Selected 2 records
170
171 We can check the results, note the lower case clipped region now includes the "AAAGA"
172 sequence:
173
174 >>> for record in SeqIO.parse("temp_filtered.sff", "sff"):
175 ... print record.id, len(record), record.seq[:20]+"..."
176 E3MFGYR02JHD4H 310 tcagaaagaCAAGTGGTATC...
177 E3MFGYR02GAZMS 278 tcagaaagaAGTAAGGTAAA...
178 >>> for record in SeqIO.parse("temp_filtered.sff", "sff-trim"):
179 ... print record.id, len(record), record.seq[:20]+"..."
180 E3MFGYR02JHD4H 287 CAAGTGGTATCAACGCAGAG...
181 E3MFGYR02GAZMS 266 AGTAAGGTAAATAACAAACG...
182 >>> import os
183 >>> os.remove("temp_filtered.sff")
184
185 For a description of the file format, please see the Roche manuals and:
186 http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=formats&m=doc&s=formats
187
188 """
189 from Interfaces import SequenceWriter
190 from Bio import Alphabet
191 from Bio.Seq import Seq
192 from Bio.SeqRecord import SeqRecord
193 import struct
194 import sys
195
197 """Read in an SFF file header (PRIVATE).
198
199 Assumes the handle is at the start of the file, will read forwards
200 though the header and leave the handle pointing at the first record.
201 Returns a tuple of values from the header.
202
203 >>> handle = open("Roche/greek.sff", "rb")
204 >>> header_length, index_offset, index_length, number_of_reads, \
205 flows_per_read, flow_chars, key_sequence = _sff_file_header(handle)
206 >>> header_length
207 840
208 >>> print index_offset, index_length
209 65040 256
210 >>> print number_of_reads
211 24
212 >>> print flows_per_read
213 800
214 >>> key_sequence
215 'TCAG'
216
217 """
218 if hasattr(handle,"mode") and "U" in handle.mode.upper():
219 raise ValueError("SFF files must NOT be opened in universal new "
220 "lines mode. Binary mode is recommended (although "
221 "on Unix the default mode is also fine).")
222 elif hasattr(handle,"mode") and "B" not in handle.mode.upper() \
223 and sys.platform == "win32":
224 raise ValueError("SFF files must be opened in binary mode on Windows")
225
226
227
228
229
230
231
232
233
234
235
236
237 fmt = '>4s4BQIIHHHB'
238 assert 31 == struct.calcsize(fmt)
239 data = handle.read(31)
240 if not data:
241 raise ValueError("Empty file.")
242 elif len(data) < 13:
243 raise ValueError("File too small to hold a valid SFF header.")
244 magic_number, ver0, ver1, ver2, ver3, index_offset, index_length, \
245 number_of_reads, header_length, key_length, number_of_flows_per_read, \
246 flowgram_format = struct.unpack(fmt, data)
247 if magic_number in [".hsh", ".srt", ".mft"]:
248
249 raise ValueError("Handle seems to be at SFF index block, not start")
250 if magic_number != ".sff":
251 raise ValueError("SFF file did not start '.sff', but '%s'" \
252 % magic_number)
253 if (ver0, ver1, ver2, ver3) != (0, 0, 0, 1):
254 raise ValueError("Unsupported SFF version in header, %i.%i.%i.%i" \
255 % (ver0, ver1, ver2, ver3))
256 if flowgram_format != 1:
257 raise ValueError("Flowgram format code %i not supported" \
258 % flowgram_format)
259 if (index_offset!=0) ^ (index_length!=0):
260 raise ValueError("Index offset %i but index length %i" \
261 % (index_offset, index_length))
262 flow_chars = handle.read(number_of_flows_per_read)
263 key_sequence = handle.read(key_length)
264
265
266
267
268 assert header_length % 8 == 0
269 padding = header_length - number_of_flows_per_read - key_length - 31
270 assert 0 <= padding < 8, padding
271 if handle.read(padding).count('\0') != padding:
272 raise ValueError("Post header %i byte padding region contained data" \
273 % padding)
274 return header_length, index_offset, index_length, \
275 number_of_reads, number_of_flows_per_read, \
276 flow_chars, key_sequence
277
278
280 """Generates an index by scanning though all the reads in an SFF file (PRIVATE).
281
282 This is a slow but generic approach if we can't parse the provided index (if
283 present).
284
285 Will use the handle seek/tell functions.
286 """
287 handle.seek(0)
288 header_length, index_offset, index_length, number_of_reads, \
289 number_of_flows_per_read, flow_chars, key_sequence \
290 = _sff_file_header(handle)
291
292 read_header_fmt = '>2HI4H'
293 read_header_size = struct.calcsize(read_header_fmt)
294
295 read_flow_fmt = ">%iH" % number_of_flows_per_read
296 read_flow_size = struct.calcsize(read_flow_fmt)
297 assert 1 == struct.calcsize(">B")
298 assert 1 == struct.calcsize(">s")
299 assert 1 == struct.calcsize(">c")
300 assert read_header_size % 8 == 0
301 for read in range(number_of_reads):
302 record_offset = handle.tell()
303 if record_offset == index_offset:
304
305 offset = index_offset + index_length
306 if offset % 8:
307 offset += 8 - (offset % 8)
308 assert offset % 8 == 0
309 handle.seek(offset)
310 record_offset = offset
311
312
313 data = handle.read(read_header_size)
314 read_header_length, name_length, seq_len, clip_qual_left, \
315 clip_qual_right, clip_adapter_left, clip_adapter_right \
316 = struct.unpack(read_header_fmt, data)
317 if read_header_length < 10 or read_header_length % 8 != 0:
318 raise ValueError("Malformed read header, says length is %i:\n%s" \
319 % (read_header_length, repr(data)))
320
321 name = handle.read(name_length)
322 padding = read_header_length - read_header_size - name_length
323 if handle.read(padding).count('\0') != padding:
324 raise ValueError("Post name %i byte padding region contained data" \
325 % padding)
326 assert record_offset + read_header_length == handle.tell()
327
328 size = read_flow_size + 3*seq_len
329 handle.seek(size, 1)
330
331 padding = size % 8
332 if padding:
333 padding = 8 - padding
334 if handle.read(padding).count('\0') != padding:
335 raise ValueError("Post quality %i byte padding region contained data" \
336 % padding)
337
338 yield name, record_offset
339 if handle.tell() % 8 != 0:
340 raise ValueError("After scanning reads, did not end on a multiple of 8")
341
343 """Locate any existing Roche style XML meta data and read index (PRIVATE).
344
345 Makes a number of hard coded assumptions based on reverse engineered SFF
346 files from Roche 454 machines.
347
348 Returns a tuple of read count, SFF "index" offset and size, XML offset and
349 size, and the actual read index offset and size.
350
351 Raises a ValueError for unsupported or non-Roche index blocks.
352 """
353 handle.seek(0)
354 header_length, index_offset, index_length, number_of_reads, \
355 number_of_flows_per_read, flow_chars, key_sequence \
356 = _sff_file_header(handle)
357 assert handle.tell() == header_length
358 if not index_offset or not index_offset:
359 raise ValueError("No index present in this SFF file")
360
361 handle.seek(index_offset)
362 fmt = ">4s4B"
363 fmt_size = struct.calcsize(fmt)
364 data = handle.read(fmt_size)
365 if not data:
366 raise ValueError("Premature end of file? Expected index of size %i at offest %i, found nothing" \
367 % (index_length, index_offset))
368 if len(data) < fmt_size:
369 raise ValueError("Premature end of file? Expected index of size %i at offest %i, found %s" \
370 % (index_length, index_offset, repr(data)))
371 magic_number, ver0, ver1, ver2, ver3 = struct.unpack(fmt, data)
372 if magic_number == ".mft":
373
374
375
376 if (ver0, ver1, ver2, ver3) != (49, 46, 48, 48):
377
378 raise ValueError("Unsupported version in .mft index header, %i.%i.%i.%i" \
379 % (ver0, ver1, ver2, ver3))
380 fmt2 = ">LL"
381 fmt2_size = struct.calcsize(fmt2)
382 xml_size, data_size = struct.unpack(fmt2, handle.read(fmt2_size))
383 if index_length != fmt_size + fmt2_size + xml_size + data_size:
384 raise ValueError("Problem understanding .mft index header, %i != %i + %i + %i + %i" \
385 % (index_length, fmt_size, fmt2_size, xml_size, data_size))
386 return number_of_reads, header_length, \
387 index_offset, index_length, \
388 index_offset + fmt_size + fmt2_size, xml_size, \
389 index_offset + fmt_size + fmt2_size + xml_size, data_size
390 elif magic_number == ".srt":
391
392
393
394 if (ver0, ver1, ver2, ver3) != (49, 46, 48, 48):
395
396 raise ValueError("Unsupported version in .srt index header, %i.%i.%i.%i" \
397 % (ver0, ver1, ver2, ver3))
398 data = handle.read(4)
399 if data != "\0\0\0\0":
400 raise ValueError("Did not find expected null four bytes in .srt index")
401 return number_of_reads, header_length, \
402 index_offset, index_length, \
403 0, 0, \
404 index_offset + fmt_size + 4, index_length - fmt_size - 4
405 elif magic_number == ".hsh":
406 raise ValueError("Hash table style indexes (.hsh) in SFF files are "
407 "not (yet) supported")
408 else:
409 raise ValueError("Unknown magic number %s in SFF index header:\n%s" \
410 % (repr(magic_number), repr(data)))
411
413 """Reads any existing Roche style XML manifest data in the SFF "index" (PRIVATE).
414
415 Will use the handle seek/tell functions. Returns a string.
416 """
417 number_of_reads, header_length, index_offset, index_length, xml_offset, \
418 xml_size, read_index_offset, read_index_size = _sff_find_roche_index(handle)
419 if not xml_offset or not xml_size:
420 raise ValueError("No XML manifest found")
421 handle.seek(xml_offset)
422 return handle.read(xml_size)
423
424
425
427 """Reads any existing Roche style read index provided in the SFF file (PRIVATE).
428
429 Will use the handle seek/tell functions.
430
431 This works on ".srt1.00" and ".mft1.00" style Roche SFF index blocks.
432
433 Roche SFF indices use base 255 not 256, meaning we see bytes in range the range
434 0 to 254 only. This appears to be so that byte 0xFF (character 255) can be used
435 as a marker character to separate entries (required if the read name lengths
436 vary).
437
438 Note that since only four bytes are used for the read offset, this is limited to
439 255^4 bytes (nearly 4GB). If you try to use the Roche sfffile tool to combined
440 SFF files beyound this limit, they issue a warning and ommit the index (and
441 manifest).
442 """
443 number_of_reads, header_length, index_offset, index_length, xml_offset, \
444 xml_size, read_index_offset, read_index_size = _sff_find_roche_index(handle)
445
446 handle.seek(read_index_offset)
447 start = chr(0)
448 end = chr(255)
449 fmt = ">5B"
450 for read in range(number_of_reads):
451
452 data = handle.read(6)
453 while data[-1] != end:
454 more = handle.read(1)
455 if not more:
456 raise ValueError("Premature end of file!")
457 data += more
458 name = data[:-6]
459 off4, off3, off2, off1, off0 = struct.unpack(fmt, data[-6:-1])
460 offset = off0 + 255*off1 + 65025*off2 + 16581375*off3
461 if off4:
462
463
464
465 raise ValueError("Expected a null terminator to the read name.")
466 yield name, offset
467 if handle.tell() != read_index_offset + read_index_size:
468 raise ValueError("Problem with index length? %i vs %i" \
469 % (handle.tell(), read_index_offset + read_index_size))
470
471 -def _sff_read_seq_record(handle, number_of_flows_per_read, flow_chars,
472 key_sequence, alphabet, trim=False):
473 """Parse the next read in the file, return data as a SeqRecord (PRIVATE)."""
474
475
476
477
478
479
480
481
482
483
484 read_header_fmt = '>2HI4H'
485 read_header_size = struct.calcsize(read_header_fmt)
486 read_flow_fmt = ">%iH" % number_of_flows_per_read
487 read_flow_size = struct.calcsize(read_flow_fmt)
488
489 read_header_length, name_length, seq_len, clip_qual_left, \
490 clip_qual_right, clip_adapter_left, clip_adapter_right \
491 = struct.unpack(read_header_fmt, handle.read(read_header_size))
492 if clip_qual_left:
493 clip_qual_left -= 1
494 if clip_adapter_left:
495 clip_adapter_left -= 1
496 if read_header_length < 10 or read_header_length % 8 != 0:
497 raise ValueError("Malformed read header, says length is %i" \
498 % read_header_length)
499
500 name = handle.read(name_length)
501 padding = read_header_length - read_header_size - name_length
502 if handle.read(padding).count('\0') != padding:
503 raise ValueError("Post name %i byte padding region contained data" \
504 % padding)
505
506
507 flow_values = handle.read(read_flow_size)
508 temp_fmt = ">%iB" % seq_len
509 flow_index = handle.read(seq_len)
510 seq = handle.read(seq_len)
511 quals = list(struct.unpack(temp_fmt, handle.read(seq_len)))
512
513 padding = (read_flow_size + seq_len*3)%8
514 if padding:
515 padding = 8 - padding
516 if handle.read(padding).count('\0') != padding:
517 raise ValueError("Post quality %i byte padding region contained data" \
518 % padding)
519
520 if trim:
521 seq = seq[clip_qual_left:clip_qual_right].upper()
522 quals = quals[clip_qual_left:clip_qual_right]
523
524 annotations = {}
525 else:
526
527 seq = seq[:clip_qual_left].lower() + \
528 seq[clip_qual_left:clip_qual_right].upper() + \
529 seq[clip_qual_right:].lower()
530 annotations = {"flow_values":struct.unpack(read_flow_fmt, flow_values),
531 "flow_index":struct.unpack(temp_fmt, flow_index),
532 "flow_chars":flow_chars,
533 "flow_key":key_sequence,
534 "clip_qual_left":clip_qual_left,
535 "clip_qual_right":clip_qual_right,
536 "clip_adapter_left":clip_adapter_left,
537 "clip_adapter_right":clip_adapter_right}
538 record = SeqRecord(Seq(seq, alphabet),
539 id=name,
540 name=name,
541 description="",
542 annotations=annotations)
543
544
545 dict.__setitem__(record._per_letter_annotations,
546 "phred_quality", quals)
547
548
549 return record
550
551
553 """Iterate over Standard Flowgram Format (SFF) reads (as SeqRecord objects).
554
555 handle - input file, an SFF file, e.g. from Roche 454 sequencing.
556 This must NOT be opened in universal read lines mode!
557 alphabet - optional alphabet, defaults to generic DNA.
558 trim - should the sequences be trimmed?
559
560 The resulting SeqRecord objects should match those from a paired
561 FASTA and QUAL file converted from the SFF file using the Roche
562 454 tool ssfinfo. i.e. The sequence will be mixed case, with the
563 trim regions shown in lower case.
564
565 This function is used internally via the Bio.SeqIO functions:
566
567 >>> from Bio import SeqIO
568 >>> handle = open("Roche/E3MFGYR02_random_10_reads.sff", "rb")
569 >>> for record in SeqIO.parse(handle, "sff"):
570 ... print record.id, len(record)
571 E3MFGYR02JWQ7T 265
572 E3MFGYR02JA6IL 271
573 E3MFGYR02JHD4H 310
574 E3MFGYR02GFKUC 299
575 E3MFGYR02FTGED 281
576 E3MFGYR02FR9G7 261
577 E3MFGYR02GAZMS 278
578 E3MFGYR02HHZ8O 221
579 E3MFGYR02GPGB1 269
580 E3MFGYR02F7Z7G 219
581 >>> handle.close()
582
583 You can also call it directly:
584
585 >>> handle = open("Roche/E3MFGYR02_random_10_reads.sff", "rb")
586 >>> for record in SffIterator(handle):
587 ... print record.id, len(record)
588 E3MFGYR02JWQ7T 265
589 E3MFGYR02JA6IL 271
590 E3MFGYR02JHD4H 310
591 E3MFGYR02GFKUC 299
592 E3MFGYR02FTGED 281
593 E3MFGYR02FR9G7 261
594 E3MFGYR02GAZMS 278
595 E3MFGYR02HHZ8O 221
596 E3MFGYR02GPGB1 269
597 E3MFGYR02F7Z7G 219
598 >>> handle.close()
599
600 Or, with the trim option:
601
602 >>> handle = open("Roche/E3MFGYR02_random_10_reads.sff", "rb")
603 >>> for record in SffIterator(handle, trim=True):
604 ... print record.id, len(record)
605 E3MFGYR02JWQ7T 260
606 E3MFGYR02JA6IL 265
607 E3MFGYR02JHD4H 292
608 E3MFGYR02GFKUC 295
609 E3MFGYR02FTGED 277
610 E3MFGYR02FR9G7 256
611 E3MFGYR02GAZMS 271
612 E3MFGYR02HHZ8O 150
613 E3MFGYR02GPGB1 221
614 E3MFGYR02F7Z7G 130
615 >>> handle.close()
616
617 """
618 if isinstance(Alphabet._get_base_alphabet(alphabet),
619 Alphabet.ProteinAlphabet):
620 raise ValueError("Invalid alphabet, SFF files do not hold proteins.")
621 if isinstance(Alphabet._get_base_alphabet(alphabet),
622 Alphabet.RNAAlphabet):
623 raise ValueError("Invalid alphabet, SFF files do not hold RNA.")
624 header_length, index_offset, index_length, number_of_reads, \
625 number_of_flows_per_read, flow_chars, key_sequence \
626 = _sff_file_header(handle)
627
628
629
630
631
632
633
634
635
636
637 read_header_fmt = '>2HI4H'
638 read_header_size = struct.calcsize(read_header_fmt)
639 read_flow_fmt = ">%iH" % number_of_flows_per_read
640 read_flow_size = struct.calcsize(read_flow_fmt)
641 assert 1 == struct.calcsize(">B")
642 assert 1 == struct.calcsize(">s")
643 assert 1 == struct.calcsize(">c")
644 assert read_header_size % 8 == 0
645
646
647
648 for read in range(number_of_reads):
649 if index_offset and handle.tell() == index_offset:
650
651
652 offset = index_offset + index_length
653 if offset % 8:
654 offset += 8 - (offset % 8)
655 assert offset % 8 == 0
656 handle.seek(offset)
657
658
659 index_offset = 0
660 yield _sff_read_seq_record(handle,
661 number_of_flows_per_read,
662 flow_chars,
663 key_sequence,
664 alphabet,
665 trim)
666
667
668 if index_offset and handle.tell() == index_offset:
669 offset = index_offset + index_length
670 if offset % 8:
671 offset += 8 - (offset % 8)
672 assert offset % 8 == 0
673 handle.seek(offset)
674
675 if handle.read(1):
676 raise ValueError("Additional data at end of SFF file")
677
678
679
681 """Iterate over SFF reads (as SeqRecord objects) with trimming (PRIVATE)."""
682 return SffIterator(handle, alphabet, trim=True)
683
684
686 """SFF file writer."""
687
688 - def __init__(self, handle, index=True, xml=None):
689 """Creates the writer object.
690
691 handle - Output handle, ideally in binary write mode.
692 index - Boolean argument, should we try and write an index?
693 xml - Optional string argument, xml to be recorded in the index block.
694 """
695 if hasattr(handle,"mode") and "U" in handle.mode.upper():
696 raise ValueError("SFF files must NOT be opened in universal new "
697 "lines mode. Binary mode is recommended (although "
698 "on Unix the default mode is also fine).")
699 elif hasattr(handle,"mode") and "B" not in handle.mode.upper() \
700 and sys.platform == "win32":
701 raise ValueError(\
702 "SFF files must be opened in binary mode on Windows")
703 self.handle = handle
704 self._xml = xml
705 if index:
706 self._index = []
707 else:
708 self._index = None
709
711 """Use this to write an entire file containing the given records."""
712 try:
713 self._number_of_reads = len(records)
714 except TypeError:
715 self._number_of_reads = 0
716 if not hasattr(self.handle, "seek") \
717 or not hasattr(self.handle, "tell"):
718 raise ValueError("A handle with a seek/tell methods is "
719 "required in order to record the total "
720 "record count in the file header (once it "
721 "is known at the end).")
722 if self._index is not None and \
723 not (hasattr(self.handle, "seek") and hasattr(self.handle, "tell")):
724 import warnings
725 warnings.warn("A handle with a seek/tell methods is required in "
726 "order to record an SFF index.")
727 self._index = None
728 self._index_start = 0
729 self._index_length = 0
730 if not hasattr(records, "next"):
731 records = iter(records)
732
733
734 try:
735 record = records.next()
736 except StopIteration:
737 record = None
738 if record is None:
739
740
741
742 raise ValueError("Need at least one record for SFF output")
743 try:
744 self._key_sequence = record.annotations["flow_key"]
745 self._flow_chars = record.annotations["flow_chars"]
746 self._number_of_flows_per_read = len(self._flow_chars)
747 except KeyError:
748 raise ValueError("Missing SFF flow information")
749 self.write_header()
750 self.write_record(record)
751 count = 1
752 for record in records:
753 self.write_record(record)
754 count += 1
755 if self._number_of_reads == 0:
756
757 offset = self.handle.tell()
758 self.handle.seek(0)
759 self._number_of_reads = count
760 self.write_header()
761 self.handle.seek(offset)
762 else:
763 assert count == self._number_of_reads
764 if self._index is not None:
765 self._write_index()
766 return count
767
769 assert len(self._index)==self._number_of_reads
770 handle = self.handle
771 self._index.sort()
772 self._index_start = handle.tell()
773
774 if isinstance(self._xml, str):
775 xml = self._xml
776 else:
777 from Bio import __version__
778 xml = "<!-- This file was output with Biopython %s -->\n" % __version__
779 xml += "<!-- This XML and index block attempts to mimic Roche SFF files -->\n"
780 xml += "<!-- This file may be a combination of multiple SFF files etc -->\n"
781 xml_len = len(xml)
782
783 fmt = ">I4BLL"
784 fmt_size = struct.calcsize(fmt)
785 handle.write("\0"*fmt_size + xml)
786 fmt2 = ">6B"
787 assert 6 == struct.calcsize(fmt2)
788 self._index.sort()
789 index_len = 0
790 for name, offset in self._index:
791
792
793
794 off3 = offset
795 off0 = off3 % 255
796 off3 -= off0
797 off1 = off3 % 65025
798 off3 -= off1
799 off2 = off3 % 16581375
800 off3 -= off2
801 assert offset == off0 + off1 + off2 + off3, \
802 "%i -> %i %i %i %i" % (offset, off0, off1, off2, off3)
803 off3, off2, off1, off0 = off3//16581375, off2//65025, \
804 off1//255, off0
805 assert off0 < 255 and off1 < 255 and off2 < 255 and off3 < 255, \
806 "%i -> %i %i %i %i" % (offset, off0, off1, off2, off3)
807 handle.write(name + struct.pack(fmt2, 0, \
808 off3, off2, off1, off0, 255))
809 index_len += len(name) + 6
810
811 self._index_length = fmt_size + xml_len + index_len
812
813
814
815 if self._index_length % 8:
816 padding = 8 - (self._index_length%8)
817 handle.write("\0"*padding)
818 else:
819 padding = 0
820 offset = handle.tell()
821 assert offset == self._index_start + self._index_length + padding, \
822 "%i vs %i + %i + %i" % (offset, self._index_start, \
823 self._index_length, padding)
824
825 handle.seek(self._index_start)
826 handle.write(struct.pack(fmt, 778921588,
827 49,46,48,48,
828 xml_len, index_len) + xml)
829
830 handle.seek(0)
831 self.write_header()
832 handle.seek(offset)
833
835
836 key_length = len(self._key_sequence)
837
838
839
840
841
842
843
844
845
846
847
848
849 fmt = '>I4BQIIHHHB%is%is' % (self._number_of_flows_per_read, key_length)
850
851
852
853
854 if struct.calcsize(fmt) % 8 == 0:
855 padding = 0
856 else:
857 padding = 8 - (struct.calcsize(fmt) % 8)
858 header_length = struct.calcsize(fmt) + padding
859 assert header_length % 8 == 0
860 header = struct.pack(fmt, 779314790,
861 0, 0, 0, 1,
862 self._index_start, self._index_length,
863 self._number_of_reads,
864 header_length, key_length,
865 self._number_of_flows_per_read,
866 1,
867 self._flow_chars, self._key_sequence)
868 self.handle.write(header + "\0"*padding)
869
871 """Write a single additional record to the output file.
872
873 This assumes the header has been done.
874 """
875
876 name = record.id
877 name_len = len(name)
878 seq = str(record.seq).upper()
879 seq_len = len(seq)
880
881 try:
882 quals = record.letter_annotations["phred_quality"]
883 except KeyError:
884 raise ValueError("Missing PHRED qualities information")
885
886 try:
887 flow_values = record.annotations["flow_values"]
888 flow_index = record.annotations["flow_index"]
889 if self._key_sequence != record.annotations["flow_key"] \
890 or self._flow_chars != record.annotations["flow_chars"]:
891 raise ValueError("Records have inconsistent SFF flow data")
892 except KeyError:
893 raise ValueError("Missing SFF flow information")
894 except AttributeError:
895 raise ValueError("Header not written yet?")
896
897 try:
898 clip_qual_left = record.annotations["clip_qual_left"]
899 if clip_qual_left:
900 clip_qual_left += 1
901 clip_qual_right = record.annotations["clip_qual_right"]
902 clip_adapter_left = record.annotations["clip_adapter_left"]
903 if clip_adapter_left:
904 clip_adapter_left += 1
905 clip_adapter_right = record.annotations["clip_adapter_right"]
906 except KeyError:
907 raise ValueError("Missing SFF clipping information")
908
909
910 if self._index is not None:
911 offset = self.handle.tell()
912
913
914
915 if offset > 4244897280L:
916 import warnings
917 warnings.warn("Read %s has file offset %i, which is too large "
918 "to store in the Roche SFF index structure. No index "
919 "block will be recorded." % (name, offset))
920
921 self._index = None
922 else:
923 self._index.append((name, self.handle.tell()))
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939 read_header_fmt = '>2HI4H%is' % name_len
940 if struct.calcsize(read_header_fmt) % 8 == 0:
941 padding = 0
942 else:
943 padding = 8 - (struct.calcsize(read_header_fmt) % 8)
944 read_header_length = struct.calcsize(read_header_fmt) + padding
945 assert read_header_length % 8 == 0
946 data = struct.pack(read_header_fmt,
947 read_header_length,
948 name_len, seq_len,
949 clip_qual_left, clip_qual_right,
950 clip_adapter_left, clip_adapter_right,
951 name) + "\0"*padding
952 assert len(data) == read_header_length
953
954
955 read_flow_fmt = ">%iH" % self._number_of_flows_per_read
956 read_flow_size = struct.calcsize(read_flow_fmt)
957 temp_fmt = ">%iB" % seq_len
958 data += struct.pack(read_flow_fmt, *flow_values) \
959 + struct.pack(temp_fmt, *flow_index) \
960 + seq \
961 + struct.pack(temp_fmt, *quals)
962
963 padding = (read_flow_size + seq_len*3)%8
964 if padding:
965 padding = 8 - padding
966 self.handle.write(data + "\0"*padding)
967
968
969 if __name__ == "__main__":
970 print "Running quick self test"
971 filename = "../../Tests/Roche/E3MFGYR02_random_10_reads.sff"
972 metadata = _sff_read_roche_index_xml(open(filename, "rb"))
973 index1 = sorted(_sff_read_roche_index(open(filename, "rb")))
974 index2 = sorted(_sff_do_slow_index(open(filename, "rb")))
975 assert index1 == index2
976 assert len(index1) == len(list(SffIterator(open(filename, "rb"))))
977 from StringIO import StringIO
978 assert len(index1) == len(list(SffIterator(StringIO(open(filename,"rb").read()))))
979
980 if sys.platform != "win32":
981 assert len(index1) == len(list(SffIterator(open(filename, "r"))))
982 index2 = sorted(_sff_read_roche_index(open(filename)))
983 assert index1 == index2
984 index2 = sorted(_sff_do_slow_index(open(filename)))
985 assert index1 == index2
986 assert len(index1) == len(list(SffIterator(open(filename))))
987 assert len(index1) == len(list(SffIterator(StringIO(open(filename,"r").read()))))
988 assert len(index1) == len(list(SffIterator(StringIO(open(filename).read()))))
989
990 sff = list(SffIterator(open(filename, "rb")))
991
992 sff2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "rb")))
993 assert len(sff) == len(sff2)
994 for old, new in zip(sff, sff2):
995 assert old.id == new.id
996 assert str(old.seq) == str(new.seq)
997
998 sff2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_at_start.sff", "rb")))
999 assert len(sff) == len(sff2)
1000 for old, new in zip(sff, sff2):
1001 assert old.id == new.id
1002 assert str(old.seq) == str(new.seq)
1003
1004 sff2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_in_middle.sff", "rb")))
1005 assert len(sff) == len(sff2)
1006 for old, new in zip(sff, sff2):
1007 assert old.id == new.id
1008 assert str(old.seq) == str(new.seq)
1009
1010 sff2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_index_at_start.sff", "rb")))
1011 assert len(sff) == len(sff2)
1012 for old, new in zip(sff, sff2):
1013 assert old.id == new.id
1014 assert str(old.seq) == str(new.seq)
1015
1016 sff2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_index_in_middle.sff", "rb")))
1017 assert len(sff) == len(sff2)
1018 for old, new in zip(sff, sff2):
1019 assert old.id == new.id
1020 assert str(old.seq) == str(new.seq)
1021
1022 sff_trim = list(SffIterator(open(filename, "rb"), trim=True))
1023
1024 print _sff_read_roche_index_xml(open(filename, "rb"))
1025
1026 from Bio import SeqIO
1027 filename = "../../Tests/Roche/E3MFGYR02_random_10_reads_no_trim.fasta"
1028 fasta_no_trim = list(SeqIO.parse(open(filename,"rU"), "fasta"))
1029 filename = "../../Tests/Roche/E3MFGYR02_random_10_reads_no_trim.qual"
1030 qual_no_trim = list(SeqIO.parse(open(filename,"rU"), "qual"))
1031
1032 filename = "../../Tests/Roche/E3MFGYR02_random_10_reads.fasta"
1033 fasta_trim = list(SeqIO.parse(open(filename,"rU"), "fasta"))
1034 filename = "../../Tests/Roche/E3MFGYR02_random_10_reads.qual"
1035 qual_trim = list(SeqIO.parse(open(filename,"rU"), "qual"))
1036
1037 for s, sT, f, q, fT, qT in zip(sff, sff_trim, fasta_no_trim,
1038 qual_no_trim, fasta_trim, qual_trim):
1039
1040 print s.id
1041
1042
1043
1044 assert s.id == f.id == q.id
1045 assert str(s.seq) == str(f.seq)
1046 assert s.letter_annotations["phred_quality"] == q.letter_annotations["phred_quality"]
1047
1048 assert s.id == sT.id == fT.id == qT.id
1049 assert str(sT.seq) == str(fT.seq)
1050 assert sT.letter_annotations["phred_quality"] == qT.letter_annotations["phred_quality"]
1051
1052
1053 print "Writing with a list of SeqRecords..."
1054 handle = StringIO()
1055 w = SffWriter(handle, xml=metadata)
1056 w.write_file(sff)
1057 data = handle.getvalue()
1058 print "And again with an iterator..."
1059 handle = StringIO()
1060 w = SffWriter(handle, xml=metadata)
1061 w.write_file(iter(sff))
1062 assert data == handle.getvalue()
1063
1064 filename = "../../Tests/Roche/E3MFGYR02_random_10_reads.sff"
1065 original = open(filename,"rb").read()
1066 assert len(data) == len(original)
1067 assert data == original
1068 del data
1069 handle.close()
1070
1071 print "-"*50
1072 filename = "../../Tests/Roche/greek.sff"
1073 for record in SffIterator(open(filename,"rb")):
1074 print record.id
1075 index1 = sorted(_sff_read_roche_index(open(filename, "rb")))
1076 index2 = sorted(_sff_do_slow_index(open(filename, "rb")))
1077 assert index1 == index2
1078 try:
1079 print _sff_read_roche_index_xml(open(filename, "rb"))
1080 assert False, "Should fail!"
1081 except ValueError:
1082 pass
1083
1084
1085 handle = open(filename, "rb")
1086 for record in SffIterator(handle):
1087 pass
1088 try:
1089 for record in SffIterator(handle):
1090 print record.id
1091 assert False, "Should have failed"
1092 except ValueError, err:
1093 print "Checking what happens on re-reading a handle:"
1094 print err
1095
1096
1097 """
1098 #Ugly code to make test files...
1099 index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0"
1100 padding = len(index)%8
1101 if padding:
1102 padding = 8 - padding
1103 index += chr(0)*padding
1104 assert len(index)%8 == 0
1105
1106 #Ugly bit of code to make a fake index at start
1107 records = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_random_10_reads.sff", "rb")))
1108 out_handle = open("../../Tests/Roche/E3MFGYR02_alt_index_at_start.sff", "w")
1109 index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0"
1110 padding = len(index)%8
1111 if padding:
1112 padding = 8 - padding
1113 index += chr(0)*padding
1114 w = SffWriter(out_handle, index=False, xml=None)
1115 #Fake the header...
1116 w._number_of_reads = len(records)
1117 w._index_start = 0
1118 w._index_length = 0
1119 w._key_sequence = records[0].annotations["flow_key"]
1120 w._flow_chars = records[0].annotations["flow_chars"]
1121 w._number_of_flows_per_read = len(w._flow_chars)
1122 w.write_header()
1123 w._index_start = out_handle.tell()
1124 w._index_length = len(index)
1125 out_handle.seek(0)
1126 w.write_header() #this time with index info
1127 w.handle.write(index)
1128 for record in records:
1129 w.write_record(record)
1130 out_handle.close()
1131 records2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_at_start.sff", "rb")))
1132 for old, new in zip(records, records2):
1133 assert str(old.seq)==str(new.seq)
1134 i = list(_sff_do_slow_index(open("../../Tests/Roche/E3MFGYR02_alt_index_at_start.sff", "rb")))
1135
1136 #Ugly bit of code to make a fake index in middle
1137 records = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_random_10_reads.sff", "rb")))
1138 out_handle = open("../../Tests/Roche/E3MFGYR02_alt_index_in_middle.sff", "w")
1139 index = ".diy1.00This is a fake index block (DIY = Do It Yourself), which is allowed under the SFF standard.\0"
1140 padding = len(index)%8
1141 if padding:
1142 padding = 8 - padding
1143 index += chr(0)*padding
1144 w = SffWriter(out_handle, index=False, xml=None)
1145 #Fake the header...
1146 w._number_of_reads = len(records)
1147 w._index_start = 0
1148 w._index_length = 0
1149 w._key_sequence = records[0].annotations["flow_key"]
1150 w._flow_chars = records[0].annotations["flow_chars"]
1151 w._number_of_flows_per_read = len(w._flow_chars)
1152 w.write_header()
1153 for record in records[:5]:
1154 w.write_record(record)
1155 w._index_start = out_handle.tell()
1156 w._index_length = len(index)
1157 w.handle.write(index)
1158 for record in records[5:]:
1159 w.write_record(record)
1160 out_handle.seek(0)
1161 w.write_header() #this time with index info
1162 out_handle.close()
1163 records2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_in_middle.sff", "rb")))
1164 for old, new in zip(records, records2):
1165 assert str(old.seq)==str(new.seq)
1166 j = list(_sff_do_slow_index(open("../../Tests/Roche/E3MFGYR02_alt_index_in_middle.sff", "rb")))
1167
1168 #Ugly bit of code to make a fake index at end
1169 records = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_random_10_reads.sff", "rb")))
1170 out_handle = open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "w")
1171 w = SffWriter(out_handle, index=False, xml=None)
1172 #Fake the header...
1173 w._number_of_reads = len(records)
1174 w._index_start = 0
1175 w._index_length = 0
1176 w._key_sequence = records[0].annotations["flow_key"]
1177 w._flow_chars = records[0].annotations["flow_chars"]
1178 w._number_of_flows_per_read = len(w._flow_chars)
1179 w.write_header()
1180 for record in records:
1181 w.write_record(record)
1182 w._index_start = out_handle.tell()
1183 w._index_length = len(index)
1184 out_handle.write(index)
1185 out_handle.seek(0)
1186 w.write_header() #this time with index info
1187 out_handle.close()
1188 records2 = list(SffIterator(open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "rb")))
1189 for old, new in zip(records, records2):
1190 assert str(old.seq)==str(new.seq)
1191 try:
1192 print _sff_read_roche_index_xml(open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "rb"))
1193 assert False, "Should fail!"
1194 except ValueError:
1195 pass
1196 k = list(_sff_do_slow_index(open("../../Tests/Roche/E3MFGYR02_alt_index_at_end.sff", "rb")))
1197 """
1198
1199 print "Done"
1200