1
2
3
4
5 """Dictionary like indexing of sequence files (PRIVATE).
6
7 You are not expected to access this module, or any of its code, directly. This
8 is all handled internally by the Bio.SeqIO.index(...) function which is the
9 public interface for this functionality.
10
11 The basic idea is that we scan over a sequence file, looking for new record
12 markers. We then try and extract the string that Bio.SeqIO.parse/read would
13 use as the record id, ideally without actually parsing the full record. We
14 then use a subclassed Python dictionary to record the file offset for the
15 record start against the record id.
16
17 Note that this means full parsing is on demand, so any invalid or problem
18 record may not trigger an exception until it is accessed. This is by design.
19
20 This means our dictionary like objects have in memory ALL the keys (all the
21 record identifiers), which shouldn't be a problem even with second generation
22 sequencing. If this is an issue later on, storing the keys and offsets in a
23 temp lookup file might be one idea (e.g. using SQLite or an OBDA style index).
24 """
25
26 import re
27 from Bio import SeqIO
28 from Bio import Alphabet
29
31 """Read only dictionary interface to a sequential sequence file.
32
33 Keeps the keys in memory, reads the file to access entries as
34 SeqRecord objects using Bio.SeqIO for parsing them. This approach
35 is memory limited, but will work even with millions of sequences.
36
37 Note - as with the Bio.SeqIO.to_dict() function, duplicate keys
38 (record identifiers by default) are not allowed. If this happens,
39 a ValueError exception is raised.
40
41 By default the SeqRecord's id string is used as the dictionary
42 key. This can be changed by suppling an optional key_function,
43 a callback function which will be given the record id and must
44 return the desired key. For example, this allows you to parse
45 NCBI style FASTA identifiers, and extract the GI number to use
46 as the dictionary key.
47
48 Note that this dictionary is essentially read only. You cannot
49 add or change values, pop values, nor clear the dictionary.
50 """
51 - def __init__(self, filename, alphabet, key_function, mode="rU"):
58
59
61 return "SeqIO.index('%s', '%s', alphabet=%s, key_function=%s)" \
62 % (self._handle.name, self._format,
63 repr(self._alphabet), self._key_function)
64
66 if self:
67 return "{%s : SeqRecord(...), ...}" % repr(self.keys()[0])
68 else:
69 return "{}"
70
72 """Used by subclasses to record file offsets for identifiers (PRIVATE).
73
74 This will apply the key_function (if given) to map the record id
75 string to the desired key.
76
77 This will raise a ValueError if a key (record id string) occurs
78 more than once.
79 """
80 if self._key_function:
81 key = self._key_function(identifier)
82 else:
83 key = identifier
84 if key in self:
85 raise ValueError("Duplicate key '%s'" % key)
86 else:
87 dict.__setitem__(self, key, seek_position)
88
90 """Would be a list of the SeqRecord objects, but not implemented.
91
92 In general you can be indexing very very large files, with millions
93 of sequences. Loading all these into memory at once as SeqRecord
94 objects would (probably) use up all the RAM. Therefore we simply
95 don't support this dictionary method.
96 """
97 raise NotImplementedError("Due to memory concerns, when indexing a "
98 "sequence file you cannot access all the "
99 "records at once.")
100
102 """Would be a list of the (key, SeqRecord) tuples, but not implemented.
103
104 In general you can be indexing very very large files, with millions
105 of sequences. Loading all these into memory at once as SeqRecord
106 objects would (probably) use up all the RAM. Therefore we simply
107 don't support this dictionary method.
108 """
109 raise NotImplementedError("Due to memory concerns, when indexing a "
110 "sequence file you cannot access all the "
111 "records at once.")
112
117
119 """x.__getitem__(y) <==> x[y]"""
120
121 handle = self._handle
122 handle.seek(dict.__getitem__(self, key))
123 record = SeqIO.parse(handle, self._format, self._alphabet).next()
124 if self._key_function:
125 assert self._key_function(record.id) == key, \
126 "Requested key %s, found record.id %s which has key %s" \
127 % (repr(key), repr(record.id),
128 repr(self._key_function(record.id)))
129 else:
130 assert record.id == key, \
131 "Requested key %s, found record.id %s" \
132 % (repr(key), repr(record.id))
133 return record
134
135 - def get(self, k, d=None):
136 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None."""
137 try:
138 return self.__getitem__(k)
139 except KeyError:
140 return d
141
143 """Similar to the get method, but returns the record as a raw string.
144
145 If the key is not found, a KeyError exception is raised.
146
147 NOTE - This functionality is not supported for every file format.
148 """
149
150 raise NotImplementedError("Not available for this file format.")
151
153 """Would allow setting or replacing records, but not implemented."""
154 raise NotImplementedError("An indexed a sequence file is read only.")
155
157 """Would allow adding more values, but not implemented."""
158 raise NotImplementedError("An indexed a sequence file is read only.")
159
160
161 - def pop(self, key, default=None):
162 """Would remove specified record, but not implemented."""
163 raise NotImplementedError("An indexed a sequence file is read only.")
164
166 """Would remove and return a SeqRecord, but not implemented."""
167 raise NotImplementedError("An indexed a sequence file is read only.")
168
169
171 """Would clear dictionary, but not implemented."""
172 raise NotImplementedError("An indexed a sequence file is read only.")
173
175 """A dictionary method which we don't implement."""
176 raise NotImplementedError("An indexed a sequence file doesn't "
177 "support this.")
178
180 """A dictionary method which we don't implement."""
181 raise NotImplementedError("An indexed a sequence file doesn't "
182 "support this.")
183
184
185
186
187
188
189
190
191
192
193
194
195 -class SffDict(_IndexedSeqFileDict) :
196 """Indexed dictionary like access to a Standard Flowgram Format (SFF) file."""
197 - def __init__(self, filename, alphabet, key_function) :
198 if alphabet is None:
199 alphabet = Alphabet.generic_dna
200
201
202 _IndexedSeqFileDict.__init__(self, filename, alphabet, key_function, "rb")
203 handle = self._handle
204 header_length, index_offset, index_length, number_of_reads, \
205 self._flows_per_read, self._flow_chars, self._key_sequence \
206 = SeqIO.SffIO._sff_file_header(handle)
207 if index_offset and index_length:
208
209 try :
210 for name, offset in SeqIO.SffIO._sff_read_roche_index(handle) :
211 self._record_key(name, offset)
212 assert len(self) == number_of_reads, \
213 "Indexed %i records, expected %i" \
214 % (len(self), number_of_reads)
215 return
216 except ValueError, err :
217 import warnings
218 warnings.warn("Could not parse the SFF index: %s" % err)
219 dict.clear(self)
220 handle.seek(0)
221 else :
222
223 import warnings
224 warnings.warn("No SFF index, doing it the slow way")
225
226 for name, offset in SeqIO.SffIO._sff_do_slow_index(handle) :
227
228 self._record_key(name, offset)
229 assert len(self) == number_of_reads, \
230 "Indexed %i records, expected %i" % (len(self), number_of_reads)
231
242
255
256
257
258
259
261 """Subclass for easy cases (PRIVATE)."""
262 - def __init__(self, filename, alphabet, key_function, format, marker):
278
293
295 """Indexed dictionary like access to a FASTA file."""
296 - def __init__(self, filename, alphabet, key_function):
299
301 """Indexed dictionary like access to a QUAL file."""
302 - def __init__(self, filename, alphabet, key_function):
305
306 -class PirDict(_SequentialSeqFileDict):
307 """Indexed dictionary like access to a PIR/NBRF file."""
308 - def __init__(self, filename, alphabet, key_function):
311
312 -class PhdDict(_SequentialSeqFileDict):
313 """Indexed dictionary like access to a PHD (PHRED) file."""
314 - def __init__(self, filename, alphabet, key_function):
317
318 -class AceDict(_SequentialSeqFileDict):
319 """Indexed dictionary like access to an ACE file."""
320 - def __init__(self, filename, alphabet, key_function):
323
324
325
326
327
328
330 """Indexed dictionary like access to a GenBank file."""
331 - def __init__(self, filename, alphabet, key_function):
367
369 """Indexed dictionary like access to an EMBL file."""
370 - def __init__(self, filename, alphabet, key_function):
410
412 """Indexed dictionary like access to a SwissProt file."""
413 - def __init__(self, filename, alphabet, key_function):
430
432 """Indexed dictionary like access to a IntelliGenetics file."""
433 - def __init__(self, filename, alphabet, key_function):
453
454 -class TabDict(_IndexedSeqFileDict):
455 """Indexed dictionary like access to a simple tabbed file."""
456 - def __init__(self, filename, alphabet, key_function):
474
480
481
482
483
484
486 """Subclass for easy cases (PRIVATE).
487
488 With FASTQ the records all start with a "@" line, but so too can some
489 quality lines. Note this will cope with line-wrapped FASTQ files.
490 """
491 - def __init__(self, filename, alphabet, key_function, fastq_format):
492 _IndexedSeqFileDict.__init__(self, filename, alphabet, key_function)
493 self._format = fastq_format
494 handle = self._handle
495 pos = handle.tell()
496 line = handle.readline()
497 if not line:
498
499 return
500 if line[0] != "@":
501 raise ValueError("Problem with FASTQ @ line:\n%s" % repr(line))
502 while line:
503
504
505 self._record_key(line[1:].rstrip().split(None, 1)[0], pos)
506
507 seq_len = 0
508 while line:
509 line = handle.readline()
510 if line.startswith("+") : break
511 seq_len += len(line.strip())
512 if not line:
513 raise ValueError("Premature end of file in seq section")
514
515
516 qual_len = 0
517 while line:
518 if seq_len == qual_len:
519
520 pos = handle.tell()
521 line = handle.readline()
522 if line and line[0] != "@":
523 ValueError("Problem with line %s" % repr(line))
524 break
525 else:
526 line = handle.readline()
527 qual_len += len(line.strip())
528 if seq_len != qual_len:
529 raise ValueError("Problem with quality section")
530
531
533 """Similar to the get method, but returns the record as a raw string."""
534
535 handle = self._handle
536 handle.seek(dict.__getitem__(self, key))
537 line = handle.readline()
538 data = line
539 if line[0] != "@":
540 raise ValueError("Problem with FASTQ @ line:\n%s" % repr(line))
541 identifier = line[1:].rstrip().split(None, 1)[0]
542 if self._key_function:
543 identifier = self._key_function(identifier)
544 if key != identifier:
545 raise ValueError("Key did not match")
546
547 seq_len = 0
548 while line:
549 line = handle.readline()
550 data += line
551 if line.startswith("+") : break
552 seq_len += len(line.strip())
553 if not line:
554 raise ValueError("Premature end of file in seq section")
555 assert line[0]=="+"
556
557 qual_len = 0
558 while line:
559 if seq_len == qual_len:
560
561 pos = handle.tell()
562 line = handle.readline()
563 if line and line[0] != "@":
564 ValueError("Problem with line %s" % repr(line))
565 break
566 else:
567 line = handle.readline()
568 data += line
569 qual_len += len(line.strip())
570 if seq_len != qual_len:
571 raise ValueError("Problem with quality section")
572 return data
573
575 """Indexed dictionary like access to a standard Sanger FASTQ file."""
576 - def __init__(self, filename, alphabet, key_function):
579
581 """Indexed dictionary like access to a Solexa (or early Illumina) FASTQ file."""
582 - def __init__(self, filename, alphabet, key_function):
585
587 """Indexed dictionary like access to a Illumina 1.3+ FASTQ file."""
588 - def __init__(self, filename, alphabet, key_function):
591
592
593
594 _FormatToIndexedDict = {"ace" : AceDict,
595 "embl" : EmblDict,
596 "fasta" : FastaDict,
597 "fastq" : FastqSangerDict,
598 "fastq-sanger" : FastqSangerDict,
599 "fastq-solexa" : FastqSolexaDict,
600 "fastq-illumina" : FastqIlluminaDict,
601 "genbank" : GenBankDict,
602 "gb" : GenBankDict,
603 "ig" : IntelliGeneticsDict,
604 "phd" : PhdDict,
605 "pir" : PirDict,
606 "sff" : SffDict,
607 "sff-trim" : SffTrimmedDict,
608 "swiss" : SwissDict,
609 "tab" : TabDict,
610 "qual" : QualDict
611 }
612