Package Bio :: Package SeqIO :: Module _index
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqIO._index

  1  # Copyright 2009-2010 by Peter Cock.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """Dictionary like indexing of sequence files (PRIVATE). 
  6   
  7  You are not expected to access this module, or any of its code, directly. This 
  8  is all handled internally by the Bio.SeqIO.index(...) function which is the 
  9  public interface for this functionality. 
 10   
 11  The basic idea is that we scan over a sequence file, looking for new record 
 12  markers. We then try and extract the string that Bio.SeqIO.parse/read would 
 13  use as the record id, ideally without actually parsing the full record. We 
 14  then use a subclassed Python dictionary to record the file offset for the 
 15  record start against the record id. 
 16   
 17  Note that this means full parsing is on demand, so any invalid or problem 
 18  record may not trigger an exception until it is accessed. This is by design. 
 19   
 20  This means our dictionary like objects have in memory ALL the keys (all the 
 21  record identifiers), which shouldn't be a problem even with second generation 
 22  sequencing. If this is an issue later on, storing the keys and offsets in a 
 23  temp lookup file might be one idea (e.g. using SQLite or an OBDA style index). 
 24  """ 
 25   
 26  import re 
 27  from Bio import SeqIO 
 28  from Bio import Alphabet 
 29   
30 -class _IndexedSeqFileDict(dict):
31 """Read only dictionary interface to a sequential sequence file. 32 33 Keeps the keys in memory, reads the file to access entries as 34 SeqRecord objects using Bio.SeqIO for parsing them. This approach 35 is memory limited, but will work even with millions of sequences. 36 37 Note - as with the Bio.SeqIO.to_dict() function, duplicate keys 38 (record identifiers by default) are not allowed. If this happens, 39 a ValueError exception is raised. 40 41 By default the SeqRecord's id string is used as the dictionary 42 key. This can be changed by suppling an optional key_function, 43 a callback function which will be given the record id and must 44 return the desired key. For example, this allows you to parse 45 NCBI style FASTA identifiers, and extract the GI number to use 46 as the dictionary key. 47 48 Note that this dictionary is essentially read only. You cannot 49 add or change values, pop values, nor clear the dictionary. 50 """
51 - def __init__(self, filename, alphabet, key_function, mode="rU"):
52 #Use key_function=None for default value 53 dict.__init__(self) #init as empty dict! 54 self._handle = open(filename, mode) 55 self._alphabet = alphabet 56 self._format = "" 57 self._key_function = key_function
58 #Now scan it in a subclassed method, and set the format! 59
60 - def __repr__(self):
61 return "SeqIO.index('%s', '%s', alphabet=%s, key_function=%s)" \ 62 % (self._handle.name, self._format, 63 repr(self._alphabet), self._key_function)
64
65 - def __str__(self):
66 if self: 67 return "{%s : SeqRecord(...), ...}" % repr(self.keys()[0]) 68 else: 69 return "{}"
70
71 - def _record_key(self, identifier, seek_position):
72 """Used by subclasses to record file offsets for identifiers (PRIVATE). 73 74 This will apply the key_function (if given) to map the record id 75 string to the desired key. 76 77 This will raise a ValueError if a key (record id string) occurs 78 more than once. 79 """ 80 if self._key_function: 81 key = self._key_function(identifier) 82 else: 83 key = identifier 84 if key in self: 85 raise ValueError("Duplicate key '%s'" % key) 86 else: 87 dict.__setitem__(self, key, seek_position)
88
89 - def values(self):
90 """Would be a list of the SeqRecord objects, but not implemented. 91 92 In general you can be indexing very very large files, with millions 93 of sequences. Loading all these into memory at once as SeqRecord 94 objects would (probably) use up all the RAM. Therefore we simply 95 don't support this dictionary method. 96 """ 97 raise NotImplementedError("Due to memory concerns, when indexing a " 98 "sequence file you cannot access all the " 99 "records at once.")
100
101 - def items(self):
102 """Would be a list of the (key, SeqRecord) tuples, but not implemented. 103 104 In general you can be indexing very very large files, with millions 105 of sequences. Loading all these into memory at once as SeqRecord 106 objects would (probably) use up all the RAM. Therefore we simply 107 don't support this dictionary method. 108 """ 109 raise NotImplementedError("Due to memory concerns, when indexing a " 110 "sequence file you cannot access all the " 111 "records at once.")
112
113 - def iteritems(self):
114 """Iterate over the (key, SeqRecord) items.""" 115 for key in self.__iter__(): 116 yield key, self.__getitem__(key)
117
118 - def __getitem__(self, key):
119 """x.__getitem__(y) <==> x[y]""" 120 #For non-trivial file formats this must be over-ridden in the subclass 121 handle = self._handle 122 handle.seek(dict.__getitem__(self, key)) 123 record = SeqIO.parse(handle, self._format, self._alphabet).next() 124 if self._key_function: 125 assert self._key_function(record.id) == key, \ 126 "Requested key %s, found record.id %s which has key %s" \ 127 % (repr(key), repr(record.id), 128 repr(self._key_function(record.id))) 129 else: 130 assert record.id == key, \ 131 "Requested key %s, found record.id %s" \ 132 % (repr(key), repr(record.id)) 133 return record
134
135 - def get(self, k, d=None):
136 """D.get(k[,d]) -> D[k] if k in D, else d. d defaults to None.""" 137 try: 138 return self.__getitem__(k) 139 except KeyError: 140 return d
141
142 - def get_raw(self, key):
143 """Similar to the get method, but returns the record as a raw string. 144 145 If the key is not found, a KeyError exception is raised. 146 147 NOTE - This functionality is not supported for every file format. 148 """ 149 #Should be done by each sub-class (if possible) 150 raise NotImplementedError("Not available for this file format.")
151
152 - def __setitem__(self, key, value):
153 """Would allow setting or replacing records, but not implemented.""" 154 raise NotImplementedError("An indexed a sequence file is read only.")
155
156 - def update(self, **kwargs):
157 """Would allow adding more values, but not implemented.""" 158 raise NotImplementedError("An indexed a sequence file is read only.")
159 160
161 - def pop(self, key, default=None):
162 """Would remove specified record, but not implemented.""" 163 raise NotImplementedError("An indexed a sequence file is read only.")
164
165 - def popitem(self):
166 """Would remove and return a SeqRecord, but not implemented.""" 167 raise NotImplementedError("An indexed a sequence file is read only.")
168 169
170 - def clear(self):
171 """Would clear dictionary, but not implemented.""" 172 raise NotImplementedError("An indexed a sequence file is read only.")
173
174 - def fromkeys(self, keys, value=None):
175 """A dictionary method which we don't implement.""" 176 raise NotImplementedError("An indexed a sequence file doesn't " 177 "support this.")
178
179 - def copy(self):
180 """A dictionary method which we don't implement.""" 181 raise NotImplementedError("An indexed a sequence file doesn't " 182 "support this.")
183 184 185 186 #################### 187 # Special indexers # 188 #################### 189 190 # Anything where the records cannot be read simply by parsing from 191 # the record start. For example, anything requiring information from 192 # a file header - e.g. SFF files where we would need to know the 193 # number of flows. 194
195 -class SffDict(_IndexedSeqFileDict) :
196 """Indexed dictionary like access to a Standard Flowgram Format (SFF) file."""
197 - def __init__(self, filename, alphabet, key_function) :
198 if alphabet is None: 199 alphabet = Alphabet.generic_dna 200 #On Unix, using mode="r" or "rb" works, "rU" does not. 201 #On Windows, only using mode="rb" works, "r" and "rU" fail. 202 _IndexedSeqFileDict.__init__(self, filename, alphabet, key_function, "rb") 203 handle = self._handle 204 header_length, index_offset, index_length, number_of_reads, \ 205 self._flows_per_read, self._flow_chars, self._key_sequence \ 206 = SeqIO.SffIO._sff_file_header(handle) 207 if index_offset and index_length: 208 #There is an index provided, try this the fast way: 209 try : 210 for name, offset in SeqIO.SffIO._sff_read_roche_index(handle) : 211 self._record_key(name, offset) 212 assert len(self) == number_of_reads, \ 213 "Indexed %i records, expected %i" \ 214 % (len(self), number_of_reads) 215 return 216 except ValueError, err : 217 import warnings 218 warnings.warn("Could not parse the SFF index: %s" % err) 219 dict.clear(self) #reset in case partially populated 220 handle.seek(0) 221 else : 222 #TODO - Remove this debug warning? 223 import warnings 224 warnings.warn("No SFF index, doing it the slow way") 225 #Fall back on the slow way! 226 for name, offset in SeqIO.SffIO._sff_do_slow_index(handle) : 227 #print "%s -> %i" % (name, offset) 228 self._record_key(name, offset) 229 assert len(self) == number_of_reads, \ 230 "Indexed %i records, expected %i" % (len(self), number_of_reads)
231
232 - def __getitem__(self, key) :
233 handle = self._handle 234 handle.seek(dict.__getitem__(self, key)) 235 record = SeqIO.SffIO._sff_read_seq_record(handle, 236 self._flows_per_read, 237 self._flow_chars, 238 self._key_sequence, 239 self._alphabet) 240 assert record.id == key 241 return record
242
243 -class SffTrimmedDict(SffDict) :
244 - def __getitem__(self, key) :
245 handle = self._handle 246 handle.seek(dict.__getitem__(self, key)) 247 record = SeqIO.SffIO._sff_read_seq_record(handle, 248 self._flows_per_read, 249 self._flow_chars, 250 self._key_sequence, 251 self._alphabet, 252 trim=True) 253 assert record.id == key 254 return record
255 256 ################### 257 # Simple indexers # 258 ################### 259
260 -class _SequentialSeqFileDict(_IndexedSeqFileDict):
261 """Subclass for easy cases (PRIVATE)."""
262 - def __init__(self, filename, alphabet, key_function, format, marker):
263 _IndexedSeqFileDict.__init__(self, filename, alphabet, key_function) 264 self._format = format 265 handle = self._handle 266 marker_re = re.compile("^%s" % marker) 267 marker_offset = len(marker) 268 self._marker_re = marker_re #saved for the get_raw method 269 while True: 270 offset = handle.tell() 271 line = handle.readline() 272 if not line : break #End of file 273 if marker_re.match(line): 274 #Here we can assume the record.id is the first word after the 275 #marker. This is generally fine... but not for GenBank, EMBL, Swiss 276 self._record_key(line[marker_offset:].strip().split(None, 1)[0], \ 277 offset)
278
279 - def get_raw(self, key):
280 """Similar to the get method, but returns the record as a raw string.""" 281 #For non-trivial file formats this must be over-ridden in the subclass 282 handle = self._handle 283 marker_re = self._marker_re 284 handle.seek(dict.__getitem__(self, key)) 285 data = handle.readline() 286 while True: 287 line = handle.readline() 288 if not line or marker_re.match(line): 289 #End of file, or start of next record => end of this record 290 break 291 data += line 292 return data
293
294 -class FastaDict(_SequentialSeqFileDict):
295 """Indexed dictionary like access to a FASTA file."""
296 - def __init__(self, filename, alphabet, key_function):
297 _SequentialSeqFileDict.__init__(self, filename, alphabet, key_function, 298 "fasta", ">")
299
300 -class QualDict(_SequentialSeqFileDict):
301 """Indexed dictionary like access to a QUAL file."""
302 - def __init__(self, filename, alphabet, key_function):
303 _SequentialSeqFileDict.__init__(self, filename, alphabet, key_function, 304 "qual", ">")
305
306 -class PirDict(_SequentialSeqFileDict):
307 """Indexed dictionary like access to a PIR/NBRF file."""
308 - def __init__(self, filename, alphabet, key_function):
309 _SequentialSeqFileDict.__init__(self, filename, alphabet, key_function, 310 "pir", ">..;")
311
312 -class PhdDict(_SequentialSeqFileDict):
313 """Indexed dictionary like access to a PHD (PHRED) file."""
314 - def __init__(self, filename, alphabet, key_function):
315 _SequentialSeqFileDict.__init__(self, filename, alphabet, key_function, 316 "phd", "BEGIN_SEQUENCE")
317
318 -class AceDict(_SequentialSeqFileDict):
319 """Indexed dictionary like access to an ACE file."""
320 - def __init__(self, filename, alphabet, key_function):
321 _SequentialSeqFileDict.__init__(self, filename, alphabet, key_function, 322 "ace", "CO ")
323 324 325 ####################################### 326 # Fiddly indexers: GenBank, EMBL, ... # 327 ####################################### 328
329 -class GenBankDict(_SequentialSeqFileDict):
330 """Indexed dictionary like access to a GenBank file."""
331 - def __init__(self, filename, alphabet, key_function):
332 _IndexedSeqFileDict.__init__(self, filename, alphabet, key_function) 333 self._format = "genbank" 334 handle = self._handle 335 marker_re = re.compile("^LOCUS ") 336 self._marker_re = marker_re #saved for the get_raw method 337 while True: 338 offset = handle.tell() 339 line = handle.readline() 340 if not line : break #End of file 341 if marker_re.match(line): 342 #We cannot assume the record.id is the first word after LOCUS, 343 #normally the first entry on the VERSION or ACCESSION line is used. 344 key = None 345 done = False 346 while not done: 347 line = handle.readline() 348 if line.startswith("ACCESSION "): 349 key = line.rstrip().split()[1] 350 elif line.startswith("VERSION "): 351 version_id = line.rstrip().split()[1] 352 if version_id.count(".")==1 and version_id.split(".")[1].isdigit(): 353 #This should mimics the GenBank parser... 354 key = version_id 355 done = True 356 break 357 elif line.startswith("FEATURES ") \ 358 or line.startswith("ORIGIN ") \ 359 or line.startswith("//") \ 360 or marker_re.match(line) \ 361 or not line: 362 done = True 363 break 364 if not key: 365 raise ValueError("Did not find ACCESSION/VERSION lines") 366 self._record_key(key, offset)
367
368 -class EmblDict(_SequentialSeqFileDict):
369 """Indexed dictionary like access to an EMBL file."""
370 - def __init__(self, filename, alphabet, key_function):
371 _IndexedSeqFileDict.__init__(self, filename, alphabet, key_function) 372 self._format = "embl" 373 handle = self._handle 374 marker_re = re.compile("^ID ") 375 self._marker_re = marker_re #saved for the get_raw method 376 while True: 377 offset = handle.tell() 378 line = handle.readline() 379 if not line : break #End of file 380 if marker_re.match(line): 381 #We cannot assume the record.id is the first word after ID, 382 #normally the SV line is used. 383 if line[2:].count(";") == 6: 384 #Looks like the semi colon separated style introduced in 2006 385 parts = line[3:].rstrip().split(";") 386 if parts[1].strip().startswith("SV "): 387 #The SV bit gives the version 388 key = "%s.%s" \ 389 % (parts[0].strip(), parts[1].strip().split()[1]) 390 else: 391 key = parts[0].strip() 392 elif line[2:].count(";") == 3: 393 #Looks like the pre 2006 style, take first word only 394 key = line[3:].strip().split(None,1)[0] 395 else: 396 raise ValueError('Did not recognise the ID line layout:\n' + line) 397 while True: 398 line = handle.readline() 399 if line.startswith("SV "): 400 key = line.rstrip().split()[1] 401 break 402 elif line.startswith("FH ") \ 403 or line.startswith("FT ") \ 404 or line.startswith("SQ ") \ 405 or line.startswith("//") \ 406 or marker_re.match(line) \ 407 or not line: 408 break 409 self._record_key(key, offset)
410
411 -class SwissDict(_SequentialSeqFileDict):
412 """Indexed dictionary like access to a SwissProt file."""
413 - def __init__(self, filename, alphabet, key_function):
414 _IndexedSeqFileDict.__init__(self, filename, alphabet, key_function) 415 self._format = "swiss" 416 handle = self._handle 417 marker_re = re.compile("^ID ") 418 self._marker_re = marker_re #saved for the get_raw method 419 while True: 420 offset = handle.tell() 421 line = handle.readline() 422 if not line : break #End of file 423 if marker_re.match(line): 424 #We cannot assume the record.id is the first word after ID, 425 #normally the following AC line is used. 426 line = handle.readline() 427 assert line.startswith("AC ") 428 key = line[3:].strip().split(";")[0].strip() 429 self._record_key(key, offset)
430
431 -class IntelliGeneticsDict(_SequentialSeqFileDict):
432 """Indexed dictionary like access to a IntelliGenetics file."""
433 - def __init__(self, filename, alphabet, key_function):
434 _IndexedSeqFileDict.__init__(self, filename, alphabet, key_function) 435 self._format = "ig" 436 handle = self._handle 437 marker_re = re.compile("^;") 438 self._marker_re = marker_re #saved for the get_raw method 439 while True: 440 offset = handle.tell() 441 line = handle.readline() 442 if not line : break #End of file 443 if marker_re.match(line): 444 #Now look for the first line which doesn't start ";" 445 while True: 446 line = handle.readline() 447 if not line: 448 raise ValueError("Premature end of file?") 449 if line[0] != ";" and line.strip(): 450 key = line.split()[0] 451 self._record_key(key, offset) 452 break
453
454 -class TabDict(_IndexedSeqFileDict):
455 """Indexed dictionary like access to a simple tabbed file."""
456 - def __init__(self, filename, alphabet, key_function):
457 _IndexedSeqFileDict.__init__(self, filename, alphabet, key_function) 458 self._format = "tab" 459 handle = self._handle 460 while True: 461 offset = handle.tell() 462 line = handle.readline() 463 if not line : break #End of file 464 try: 465 key = line.split("\t")[0] 466 except ValueError, err: 467 if not line.strip(): 468 #Ignore blank lines 469 continue 470 else: 471 raise err 472 else: 473 self._record_key(key, offset)
474
475 - def get_raw(self, key):
476 """Like the get method, but returns the record as a raw string.""" 477 handle = self._handle 478 handle.seek(dict.__getitem__(self, key)) 479 return handle.readline()
480 481 ########################## 482 # Now the FASTQ indexers # 483 ########################## 484
485 -class _FastqSeqFileDict(_IndexedSeqFileDict):
486 """Subclass for easy cases (PRIVATE). 487 488 With FASTQ the records all start with a "@" line, but so too can some 489 quality lines. Note this will cope with line-wrapped FASTQ files. 490 """
491 - def __init__(self, filename, alphabet, key_function, fastq_format):
492 _IndexedSeqFileDict.__init__(self, filename, alphabet, key_function) 493 self._format = fastq_format 494 handle = self._handle 495 pos = handle.tell() 496 line = handle.readline() 497 if not line: 498 #Empty file! 499 return 500 if line[0] != "@": 501 raise ValueError("Problem with FASTQ @ line:\n%s" % repr(line)) 502 while line: 503 #assert line[0]=="@" 504 #This record seems OK (so far) 505 self._record_key(line[1:].rstrip().split(None, 1)[0], pos) 506 #Find the seq line(s) 507 seq_len = 0 508 while line: 509 line = handle.readline() 510 if line.startswith("+") : break 511 seq_len += len(line.strip()) 512 if not line: 513 raise ValueError("Premature end of file in seq section") 514 #assert line[0]=="+" 515 #Find the qual line(s) 516 qual_len = 0 517 while line: 518 if seq_len == qual_len: 519 #Should be end of record... 520 pos = handle.tell() 521 line = handle.readline() 522 if line and line[0] != "@": 523 ValueError("Problem with line %s" % repr(line)) 524 break 525 else: 526 line = handle.readline() 527 qual_len += len(line.strip()) 528 if seq_len != qual_len: 529 raise ValueError("Problem with quality section")
530 #print "EOF" 531
532 - def get_raw(self, key):
533 """Similar to the get method, but returns the record as a raw string.""" 534 #TODO - Refactor this and the __init__ method to reduce code duplication? 535 handle = self._handle 536 handle.seek(dict.__getitem__(self, key)) 537 line = handle.readline() 538 data = line 539 if line[0] != "@": 540 raise ValueError("Problem with FASTQ @ line:\n%s" % repr(line)) 541 identifier = line[1:].rstrip().split(None, 1)[0] 542 if self._key_function: 543 identifier = self._key_function(identifier) 544 if key != identifier: 545 raise ValueError("Key did not match") 546 #Find the seq line(s) 547 seq_len = 0 548 while line: 549 line = handle.readline() 550 data += line 551 if line.startswith("+") : break 552 seq_len += len(line.strip()) 553 if not line: 554 raise ValueError("Premature end of file in seq section") 555 assert line[0]=="+" 556 #Find the qual line(s) 557 qual_len = 0 558 while line: 559 if seq_len == qual_len: 560 #Should be end of record... 561 pos = handle.tell() 562 line = handle.readline() 563 if line and line[0] != "@": 564 ValueError("Problem with line %s" % repr(line)) 565 break 566 else: 567 line = handle.readline() 568 data += line 569 qual_len += len(line.strip()) 570 if seq_len != qual_len: 571 raise ValueError("Problem with quality section") 572 return data
573
574 -class FastqSangerDict(_FastqSeqFileDict):
575 """Indexed dictionary like access to a standard Sanger FASTQ file."""
576 - def __init__(self, filename, alphabet, key_function):
577 _FastqSeqFileDict.__init__(self, filename, alphabet, key_function, 578 "fastq-sanger")
579
580 -class FastqSolexaDict(_FastqSeqFileDict):
581 """Indexed dictionary like access to a Solexa (or early Illumina) FASTQ file."""
582 - def __init__(self, filename, alphabet, key_function):
583 _FastqSeqFileDict.__init__(self, filename, alphabet, key_function, 584 "fastq-solexa")
585
586 -class FastqIlluminaDict(_FastqSeqFileDict):
587 """Indexed dictionary like access to a Illumina 1.3+ FASTQ file."""
588 - def __init__(self, filename, alphabet, key_function):
589 _FastqSeqFileDict.__init__(self, filename, alphabet, key_function, 590 "fastq-illumina")
591 592 ############################################################################### 593 594 _FormatToIndexedDict = {"ace" : AceDict, 595 "embl" : EmblDict, 596 "fasta" : FastaDict, 597 "fastq" : FastqSangerDict, 598 "fastq-sanger" : FastqSangerDict, #alias of the above 599 "fastq-solexa" : FastqSolexaDict, 600 "fastq-illumina" : FastqIlluminaDict, 601 "genbank" : GenBankDict, 602 "gb" : GenBankDict, #alias of the above 603 "ig" : IntelliGeneticsDict, 604 "phd" : PhdDict, 605 "pir" : PirDict, 606 "sff" : SffDict, 607 "sff-trim" : SffTrimmedDict, 608 "swiss" : SwissDict, 609 "tab" : TabDict, 610 "qual" : QualDict 611 } 612