Package Bio :: Package GenBank :: Module Record
[hide private]
[frames] | no frames]

Source Code for Module Bio.GenBank.Record

  1  """Hold GenBank data in a straightforward format. 
  2   
  3  classes: 
  4  o Record - All of the information in a GenBank record. 
  5  o Reference - hold reference data for a record. 
  6  o Feature - Hold the information in a Feature Table. 
  7  o Qualifier - Qualifiers on a Feature. 
  8  17-MAR-2009: added support for WGS and WGS_SCAFLD lines.  Ying Huang & Iddo Friedberg 
  9  """ 
 10  # local stuff 
 11  import Bio.GenBank 
 12   
def _wrapped_genbank(information, indent, wrap_space=1, split_char=" "):
    """Write a line of GenBank info that can wrap over multiple lines.

    This takes a line of information which can potentially wrap over
    multiple lines, and breaks it up with newlines and indentation so it
    fits properly into a GenBank record.

    Arguments:

    o information - The string holding the information we want
    wrapped in GenBank method.

    o indent - The indentation on the lines we are writing. The first
    output line is NOT indented (the caller has already written the
    keyword column); every continuation line is.

    o wrap_space - Whether or not to wrap only on split_char. When false
    the information is cut into fixed-width chunks instead.

    o split_char - A specific character to split the lines on. By default
    spaces are used. A non-space split_char is re-appended at the end of
    every wrapped line.

    Returns the wrapped text, always terminated by a newline. Empty
    information, or information made only of split characters, yields
    ".\\n".

    Raises ValueError if wrap_space is false and indent leaves no room
    for any text on a line.
    """
    info_length = Record.GB_LINE_LENGTH - indent

    if not information:
        # GenBank files use "." for missing data
        return ".\n"

    if wrap_space:
        info_parts = information.split(split_char)
    else:
        if info_length <= 0:
            # a zero or negative chunk width can never consume the input
            raise ValueError("indent %r leaves no room for text on a %i "
                             "column line"
                             % (indent, Record.GB_LINE_LENGTH))
        info_parts = [information[pos:pos + info_length]
                      for pos in range(0, len(information), info_length)]

    # first get the information string split up by line
    output_parts = []
    cur_part = ""
    for info_part in info_parts:
        # the "+ 1" accounts for the separator joining the two pieces
        if len(cur_part) + 1 + len(info_part) > info_length:
            if cur_part:
                if split_char != " ":
                    cur_part += split_char
                output_parts.append(cur_part)
            cur_part = info_part
        elif cur_part == "":
            cur_part = info_part
        else:
            cur_part += split_char + info_part

    # add the last bit of information to the output
    if cur_part:
        output_parts.append(cur_part)

    if not output_parts:
        # nothing but split characters: treat like missing data instead
        # of failing on output_parts[0]
        return ".\n"

    # now format the information string for return; only continuation
    # lines carry the indent
    return ("\n" + " " * indent).join(output_parts) + "\n"
74
75 -def _indent_genbank(information, indent):
76 """Write out information with the specified indent. 77 78 Unlike _wrapped_genbank, this function makes no attempt to wrap 79 lines -- it assumes that the information already has newlines in the 80 appropriate places, and will add the specified indent to the start of 81 each line. 82 """ 83 # split the info into lines based on line breaks 84 info_parts = information.split("\n") 85 86 # the first line will have no indent 87 output_info = info_parts[0] + "\n" 88 for info_part in info_parts[1:]: 89 output_info += " " * indent + info_part + "\n" 90 91 return output_info
92
class Record:
    """Hold GenBank information in a format similar to the original record.

    The Record class is meant to make data easy to get to when you are
    just interested in looking at GenBank data.

    Attributes:
    o locus - The name specified after the LOCUS keyword in the GenBank
    record. This may be the accession number, or a clone id or something
    else.
    o size - The size of the record.
    o residue_type - The type of residues making up the sequence in this
    record. Normally something like RNA, DNA or PROTEIN, but may be as
    esoteric as 'ss-RNA circular'.
    o data_file_division - The division this record is stored under in
    GenBank (ie. PLN -> plants; PRI -> humans, primates; BCT -> bacteria...)
    o date - The date of submission of the record, in a form like
    '28-JUL-1998'
    o definition - The text written on the DEFINITION line.
    o accession - list of all accession numbers for the sequence.
    o nid - Nucleotide identifier number.
    o pid - Protein identifier number
    o version - The accession number + version (ie. AB01234.2)
    o projects - The genome sequencing project numbers
    (will be replaced by the dblink cross-references in 2009).
    o dblinks - The genome sequencing project number(s) and other links.
    (will replace the project information in 2009).
    o db_source - Information about the database the record came from
    o gi - The NCBI gi identifier for the record.
    o keywords - A list of keywords related to the record.
    o segment - If the record is one of a series, this is info about which
    segment this record is (something like '1 of 6').
    o source - The source of material where the sequence came from.
    o organism - The genus and species of the organism (ie. 'Homo sapiens')
    o taxonomy - A listing of the taxonomic classification of the organism,
    starting general and getting more specific.
    o references - A list of Reference objects.
    o comment - Text with any kind of comment about the record.
    o features - A listing of Features making up the feature table.
    o base_counts - A string with the counts of bases for the sequence.
    o origin - A string specifying info about the origin of the sequence.
    o sequence - A string with the sequence itself.
    o contig - A string of location information for a CONTIG in a RefSeq
    file
    o primary - A list, empty by default (presumably the PRIMARY table of
    the record -- confirm against the parser).
    o wgs - Information written on the WGS line.
    o wgs_scafld - Information written on the WGS_SCAFLD line(s).
    """
    # constants for outputting GenBank information: the overall line
    # width and the column widths of the keyword areas
    GB_LINE_LENGTH = 79
    GB_BASE_INDENT = 12
    GB_FEATURE_INDENT = 21
    GB_INTERNAL_INDENT = 2
    GB_OTHER_INTERNAL_INDENT = 3
    GB_FEATURE_INTERNAL_INDENT = 5
    GB_SEQUENCE_INDENT = 9

    # %-format templates derived from the widths above; each one pads a
    # keyword so the text after it starts in the right column
    BASE_FORMAT = "%%-%ds" % GB_BASE_INDENT
    INTERNAL_FORMAT = "%s%%-%ds" % (" " * GB_INTERNAL_INDENT,
                                    GB_BASE_INDENT - GB_INTERNAL_INDENT)
    OTHER_INTERNAL_FORMAT = "%s%%-%ds" % (
        " " * GB_OTHER_INTERNAL_INDENT,
        GB_BASE_INDENT - GB_OTHER_INTERNAL_INDENT)

    BASE_FEATURE_FORMAT = "%%-%ds" % GB_FEATURE_INDENT
    INTERNAL_FEATURE_FORMAT = "%s%%-%ds" % (
        " " * GB_FEATURE_INTERNAL_INDENT,
        GB_FEATURE_INDENT - GB_FEATURE_INTERNAL_INDENT)
    # right-aligned field holding the position number of a sequence line
    SEQUENCE_FORMAT = "%%%ds" % GB_SEQUENCE_INDENT
156 - def __init__(self):
157 self.locus = '' 158 self.size = '' 159 self.residue_type = '' 160 self.data_file_division = '' 161 self.date = '' 162 self.definition = '' 163 self.accession = [] 164 self.nid = '' 165 self.pid = '' 166 self.version = '' 167 self.projects = [] 168 self.dblinks = [] 169 self.db_source = '' 170 self.gi = '' 171 self.keywords = [] 172 self.segment = '' 173 self.source = '' 174 self.organism = '' 175 self.taxonomy = [] 176 self.references = [] 177 self.comment = '' 178 self.features = [] 179 self.base_counts = '' 180 self.origin = '' 181 self.sequence = '' 182 self.contig = '' 183 self.primary=[] 184 self.wgs = '' 185 self.wgs_scafld = []
186
187 - def __str__(self):
188 """Provide a GenBank formatted output option for a Record. 189 190 The objective of this is to provide an easy way to read in a GenBank 191 record, modify it somehow, and then output it in 'GenBank format.' 192 We are striving to make this work so that a parsed Record that is 193 output using this function will look exactly like the original 194 record. 195 196 Much of the output is based on format description info at: 197 198 ftp://ncbi.nlm.nih.gov/genbank/gbrel.txt 199 """ 200 output = self._locus_line() 201 output += self._definition_line() 202 output += self._accession_line() 203 output += self._version_line() 204 output += self._project_line() 205 output += self._dblink_line() 206 output += self._nid_line() 207 output += self._pid_line() 208 output += self._keywords_line() 209 output += self._db_source_line() 210 output += self._segment_line() 211 output += self._source_line() 212 output += self._organism_line() 213 for reference in self.references: 214 output += str(reference) 215 output += self._comment_line() 216 output += self._features_line() 217 for feature in self.features: 218 output += str(feature) 219 output += self._base_count_line() 220 output += self._origin_line() 221 output += self._sequence_line() 222 output += self._wgs_line() 223 output += self._wgs_scafld_line() 224 output += self._contig_line() 225 output += "//" 226 return output
227
228 - def _locus_line(self):
229 """Provide the output string for the LOCUS line. 230 """ 231 output = "LOCUS" 232 output += " " * 7 # 6-12 spaces 233 output += "%-9s" % self.locus 234 output += " " # 22 space 235 output += "%7s" % self.size 236 if self.residue_type.find("PROTEIN") >= 0: 237 output += " aa" 238 else: 239 output += " bp " 240 241 # treat circular types differently, since they'll have long residue 242 # types 243 if self.residue_type.find("circular") >= 0: 244 output += "%17s" % self.residue_type 245 # second case: ss-DNA types of records 246 elif self.residue_type.find("-") >= 0: 247 output += "%7s" % self.residue_type 248 output += " " * 10 # spaces for circular 249 else: 250 output += " " * 3 # spaces for stuff like ss- 251 output += "%-4s" % self.residue_type 252 output += " " * 10 # spaces for circular 253 254 output += " " * 2 255 output += "%3s" % self.data_file_division 256 output += " " * 7 # spaces for 56-63 257 output += "%11s" % self.date 258 output += "\n" 259 return output
260
261 - def _definition_line(self):
262 """Provide output for the DEFINITION line. 263 """ 264 output = Record.BASE_FORMAT % "DEFINITION" 265 output += _wrapped_genbank(self.definition, Record.GB_BASE_INDENT) 266 return output
267
268 - def _accession_line(self):
269 """Output for the ACCESSION line. 270 """ 271 if self.accession: 272 output = Record.BASE_FORMAT % "ACCESSION" 273 274 acc_info = "" 275 for accession in self.accession: 276 acc_info += "%s " % accession 277 # strip off an extra space at the end 278 acc_info = acc_info.rstrip() 279 output += _wrapped_genbank(acc_info, Record.GB_BASE_INDENT) 280 else: 281 output = "" 282 283 return output
284
285 - def _version_line(self):
286 """Output for the VERSION line. 287 """ 288 if self.version: 289 output = Record.BASE_FORMAT % "VERSION" 290 output += self.version 291 output += " GI:" 292 output += "%s\n" % self.gi 293 else: 294 output = "" 295 return output
296
297 - def _project_line(self):
298 output = "" 299 if len(self.projects) > 0: 300 output = Record.BASE_FORMAT % "PROJECT" 301 output += "%s\n" % " ".join(self.projects) 302 return output
303 311
312 - def _nid_line(self):
313 """Output for the NID line. Use of NID is obsolete in GenBank files. 314 """ 315 if self.nid: 316 output = Record.BASE_FORMAT % "NID" 317 output += "%s\n" % self.nid 318 else: 319 output = "" 320 return output
321
322 - def _pid_line(self):
323 """Output for PID line. Presumedly, PID usage is also obsolete. 324 """ 325 if self.pid: 326 output = Record.BASE_FORMAT % "PID" 327 output += "%s\n" % self.pid 328 else: 329 output = "" 330 return output
331
332 - def _keywords_line(self):
333 """Output for the KEYWORDS line. 334 """ 335 output = "" 336 if len(self.keywords) >= 0: 337 output += Record.BASE_FORMAT % "KEYWORDS" 338 keyword_info = "" 339 for keyword in self.keywords: 340 keyword_info += "%s; " % keyword 341 # replace the ; at the end with a period 342 keyword_info = keyword_info[:-2] 343 keyword_info += "." 344 345 output += _wrapped_genbank(keyword_info, 346 Record.GB_BASE_INDENT) 347 348 return output
349
350 - def _db_source_line(self):
351 """Output for DBSOURCE line. 352 """ 353 if self.db_source: 354 output = Record.BASE_FORMAT % "DBSOURCE" 355 output += "%s\n" % self.db_source 356 else: 357 output = "" 358 return output
359
360 - def _segment_line(self):
361 """Output for the SEGMENT line. 362 """ 363 output = "" 364 if self.segment: 365 output += Record.BASE_FORMAT % "SEGMENT" 366 output += _wrapped_genbank(self.segment, Record.GB_BASE_INDENT) 367 return output
368
369 - def _source_line(self):
370 """Output for SOURCE line on where the sample came from. 371 """ 372 output = Record.BASE_FORMAT % "SOURCE" 373 output += _wrapped_genbank(self.source, Record.GB_BASE_INDENT) 374 return output
375
376 - def _organism_line(self):
377 """Output for ORGANISM line with taxonomy info. 378 """ 379 output = Record.INTERNAL_FORMAT % "ORGANISM" 380 # Now that species names can be too long, this line can wrap (Bug 2591) 381 output += _wrapped_genbank(self.organism, Record.GB_BASE_INDENT) 382 output += " " * Record.GB_BASE_INDENT 383 taxonomy_info = "" 384 for tax in self.taxonomy: 385 taxonomy_info += "%s; " % tax 386 # replace the ; at the end with a period 387 taxonomy_info = taxonomy_info[:-2] 388 taxonomy_info += "." 389 output += _wrapped_genbank(taxonomy_info, Record.GB_BASE_INDENT) 390 391 return output
392
393 - def _comment_line(self):
394 """Output for the COMMENT lines. 395 """ 396 output = "" 397 if self.comment: 398 output += Record.BASE_FORMAT % "COMMENT" 399 output += _indent_genbank(self.comment, 400 Record.GB_BASE_INDENT) 401 return output
402
403 - def _features_line(self):
404 """Output for the FEATURES line. 405 """ 406 output = "" 407 if len(self.features) > 0: 408 output += Record.BASE_FEATURE_FORMAT % "FEATURES" 409 output += "Location/Qualifiers\n" 410 return output
411
412 - def _base_count_line(self):
413 """Output for the BASE COUNT line with base information. 414 """ 415 output = "" 416 if self.base_counts: 417 output += Record.BASE_FORMAT % "BASE COUNT " 418 # split up the base counts into their individual parts 419 count_parts = self.base_counts.split(" ") 420 while '' in count_parts: 421 count_parts.remove('') 422 # deal with the standard case, with a normal origin line 423 # like: 474 a 356 c 428 g 364 t 424 if len(count_parts) % 2 == 0: 425 while len(count_parts) > 0: 426 count_info = count_parts.pop(0) 427 count_type = count_parts.pop(0) 428 429 output += "%7s %s" % (count_info, count_type) 430 # deal with ugly ORIGIN lines like: 431 # 1311257 a2224835 c2190093 g1309889 t 432 # by just outputting the raw information 433 else: 434 output += self.base_counts 435 output += "\n" 436 return output
437
438 - def _origin_line(self):
439 """Output for the ORIGIN line 440 """ 441 output = "" 442 # only output the ORIGIN line if we have a sequence 443 if self.sequence: 444 output += Record.BASE_FORMAT % "ORIGIN" 445 if self.origin: 446 output += _wrapped_genbank(self.origin, 447 Record.GB_BASE_INDENT) 448 else: 449 output += "\n" 450 return output
451
452 - def _sequence_line(self):
453 """Output for all of the sequence. 454 """ 455 output = "" 456 if self.sequence: 457 cur_seq_pos = 0 458 while cur_seq_pos < len(self.sequence): 459 output += Record.SEQUENCE_FORMAT % str(cur_seq_pos + 1) 460 461 for section in range(6): 462 start_pos = cur_seq_pos + section * 10 463 end_pos = start_pos + 10 464 seq_section = self.sequence[start_pos:end_pos] 465 output += " %s" % seq_section.lower() 466 467 # stop looping if we are out of sequence 468 if end_pos > len(self.sequence): 469 break 470 471 output += "\n" 472 cur_seq_pos += 60 473 return output
474
475 - def _wgs_line(self):
476 output = "" 477 if self.wgs: 478 output += Record.BASE_FORMAT % "WGS" 479 output += self.wgs 480 return output
481
482 - def _wgs_scafld_line(self):
483 output = "" 484 if self.wgs_scafld: 485 output += Record.BASE_FORMAT % "WGS_SCAFLD" 486 output += self.wgs_scafld 487 return output
488
489 - def _contig_line(self):
490 """Output for CONTIG location information from RefSeq. 491 """ 492 output = "" 493 if self.contig: 494 output += Record.BASE_FORMAT % "CONTIG" 495 output += _wrapped_genbank(self.contig, 496 Record.GB_BASE_INDENT, split_char = ',') 497 return output
498 499
class Reference:
    """Hold information from a GenBank reference.

    Attributes:
    o number - The number of the reference in the listing of references.
    o bases - The bases in the sequence the reference refers to.
    o authors - String with all of the authors.
    o consrtm - Consortium the authors belong to.
    o title - The title of the reference.
    o journal - Information about the journal where the reference appeared.
    o medline_id - The medline id for the reference.
    o pubmed_id - The pubmed_id for the reference.
    o remark - Free-form remarks about the reference.
    """
    def __init__(self):
        """Create a reference with every field set to ''."""
        self.number = self.bases = ''
        self.authors = self.consrtm = ''
        self.title = self.journal = ''
        self.medline_id = self.pubmed_id = ''
        self.remark = ''

    def __str__(self):
        """Return the REFERENCE block, one entry after the other."""
        return "".join([
            self._reference_line(),
            self._authors_line(),
            self._consrtm_line(),
            self._title_line(),
            self._journal_line(),
            self._medline_line(),
            self._pubmed_line(),
            self._remark_line(),
        ])

    def _wrapped_entry(self, keyword, text):
        """Return a wrapped sub-entry of the reference, or "" without text."""
        if not text:
            return ""
        return (Record.INTERNAL_FORMAT % keyword
                + _wrapped_genbank(text, Record.GB_BASE_INDENT))

    def _reference_line(self):
        """Output for REFERENCE lines.

        The keyword is always written. The number follows when set, and
        the bases follow the number when both are set.
        """
        if not self.number:
            citation = ""
        elif self.bases:
            # the number is padded to three columns ahead of the bases
            citation = "%-3s%s" % (self.number, self.bases)
        else:
            citation = "%s" % self.number
        return Record.BASE_FORMAT % "REFERENCE" + citation + "\n"

    def _authors_line(self):
        """Output for AUTHORS information.
        """
        return self._wrapped_entry("AUTHORS", self.authors)

    def _consrtm_line(self):
        """Output for CONSRTM information.
        """
        return self._wrapped_entry("CONSRTM", self.consrtm)

    def _title_line(self):
        """Output for TITLE information.
        """
        return self._wrapped_entry("TITLE", self.title)

    def _journal_line(self):
        """Output for JOURNAL information.
        """
        return self._wrapped_entry("JOURNAL", self.journal)

    def _medline_line(self):
        """Output for MEDLINE information.
        """
        if not self.medline_id:
            return ""
        return Record.INTERNAL_FORMAT % "MEDLINE" + self.medline_id + "\n"

    def _pubmed_line(self):
        """Output for PUBMED information.
        """
        if not self.pubmed_id:
            return ""
        # PUBMED uses the deeper of the two internal indents
        return Record.OTHER_INTERNAL_FORMAT % "PUBMED" + self.pubmed_id + "\n"

    def _remark_line(self):
        """Output for REMARK information.
        """
        return self._wrapped_entry("REMARK", self.remark)
613
class Feature:
    """Hold information about a Feature in the Feature Table of GenBank record.

    Attributes:
    o key - The key name of the feature (ie. source)
    o location - The string specifying the location of the feature.
    o qualifiers - A listing of Qualifier objects in the feature.
    """
    def __init__(self):
        """Create a feature with no key, no location and no qualifiers."""
        self.key, self.location = '', ''
        self.qualifiers = []

    def __str__(self):
        """Return the feature key and location followed by its qualifiers.

        The location is wrapped at commas. Each qualifier starts on its own
        line at the feature indent, written as its key directly followed
        by its value.
        """
        pieces = [
            Record.INTERNAL_FEATURE_FORMAT % self.key,
            _wrapped_genbank(self.location, Record.GB_FEATURE_INDENT,
                             split_char=','),
        ]
        for qualifier in self.qualifiers:
            pieces.append(" " * Record.GB_FEATURE_INDENT)

            # determine whether we can wrap on spaces: not when the key
            # contains one of the consumer's remove_space_keys
            space_wrap = 1
            if any(no_space_key in qualifier.key for no_space_key in
                   Bio.GenBank._BaseGenBankConsumer.remove_space_keys):
                space_wrap = 0

            pieces.append(_wrapped_genbank(qualifier.key + qualifier.value,
                                           Record.GB_FEATURE_INDENT,
                                           space_wrap))
        return "".join(pieces)
644
class Qualifier:
    """Hold information about a qualifier in a GenBank feature.

    Attributes:
    o key - The key name of the qualifier (ie. /organism=)
    o value - The value of the qualifier ("Dictyostelium discoideum").
    """
    def __init__(self):
        """Create a qualifier whose key and value are both ''."""
        self.key = self.value = ''
655