1 """Hold GenBank data in a straightforward format.
2
3 classes:
4 o Record - All of the information in a GenBank record.
5 o Reference - hold reference data for a record.
6 o Feature - Hold the information in a Feature Table.
7 o Qualifier - Qualifiers on a Feature.
8 17-MAR-2009: added support for WGS and WGS_SCAFLD lines. Ying Huang & Iddo Friedberg
9 """
10
11 import Bio.GenBank
12
14 """Write a line of GenBank info that can wrap over multiple lines.
15
16 This takes a line of information which can potentially wrap over
17 multiple lines, and breaks it up with newlines and
18 indentation so it fits properly into a GenBank record.
19
20 Arguments:
21
22 o information - The string holding the information we want
23 wrapped in GenBank format.
24
25 o indent - The indentation on the lines we are writing.
26
27 o wrap_space - Whether or not to wrap only on spaces in the
28 information.
29
30 o split_char - A specific character to split the lines on. By default
31 spaces are used.
32 """
33 info_length = Record.GB_LINE_LENGTH - indent
34
35 if not information:
36
37 return ".\n"
38
39 if wrap_space:
40 info_parts = information.split(split_char)
41 else:
42 cur_pos = 0
43 info_parts = []
44 while cur_pos < len(information):
45 info_parts.append(information[cur_pos: cur_pos + info_length])
46 cur_pos += info_length
47
48
49 output_parts = []
50 cur_part = ""
51 for info_part in info_parts:
52 if len(cur_part) + 1 + len(info_part) > info_length:
53 if cur_part:
54 if split_char != " ":
55 cur_part += split_char
56 output_parts.append(cur_part)
57 cur_part = info_part
58 else:
59 if cur_part == "":
60 cur_part = info_part
61 else:
62 cur_part += split_char + info_part
63
64
65 if cur_part:
66 output_parts.append(cur_part)
67
68
69 output_info = output_parts[0] + "\n"
70 for output_part in output_parts[1:]:
71 output_info += " " * indent + output_part + "\n"
72
73 return output_info
74
76 """Write out information with the specified indent.
77
78 Unlike _wrapped_genbank, this function makes no attempt to wrap
79 lines -- it assumes that the information already has newlines in the
80 appropriate places, and will add the specified indent to the start of
81 each line.
82 """
83
84 info_parts = information.split("\n")
85
86
87 output_info = info_parts[0] + "\n"
88 for info_part in info_parts[1:]:
89 output_info += " " * indent + info_part + "\n"
90
91 return output_info
92
94 """Hold GenBank information in a format similar to the original record.
95
96 The Record class is meant to make data easy to get to when you are
97 just interested in looking at GenBank data.
98
99 Attributes:
100 o locus - The name specified after the LOCUS keyword in the GenBank
101 record. This may be the accession number, or a clone id or something else.
102 o size - The size of the record.
103 o residue_type - The type of residues making up the sequence in this
104 record. Normally something like RNA, DNA or PROTEIN, but may be as
105 esoteric as 'ss-RNA circular'.
106 o data_file_division - The division this record is stored under in
107 GenBank (ie. PLN -> plants; PRI -> humans, primates; BCT -> bacteria...)
108 o date - The date of submission of the record, in a form like '28-JUL-1998'
109 o accession - list of all accession numbers for the sequence.
110 o nid - Nucleotide identifier number.
111 o pid - Protein identifier number.
112 o version - The accession number + version (ie. AB01234.2)
113 o db_source - Information about the database the record came from
114 o gi - The NCBI gi identifier for the record.
115 o keywords - A list of keywords related to the record.
116 o segment - If the record is one of a series, this is info about which
117 segment this record is (something like '1 of 6').
118 o source - The source of material where the sequence came from.
119 o organism - The genus and species of the organism (ie. 'Homo sapiens')
120 o taxonomy - A listing of the taxonomic classification of the organism,
121 starting general and getting more specific.
122 o references - A list of Reference objects.
123 o comment - Text with any kind of comment about the record.
124 o features - A listing of Features making up the feature table.
125 o base_counts - A string with the counts of bases for the sequence.
126 o origin - A string specifying info about the origin of the sequence.
127 o sequence - A string with the sequence itself.
128 o contig - A string of location information for a CONTIG in a RefSeq file
129 o projects - The genome sequencing project numbers
130 (will be replaced by the dblink cross-references in 2009).
131 o dblinks - The genome sequencing project number(s) and other links.
132 (will replace the project information in 2009).
133 """
134
135 GB_LINE_LENGTH = 79
136 GB_BASE_INDENT = 12
137 GB_FEATURE_INDENT = 21
138 GB_INTERNAL_INDENT = 2
139 GB_OTHER_INTERNAL_INDENT = 3
140 GB_FEATURE_INTERNAL_INDENT = 5
141 GB_SEQUENCE_INDENT = 9
142
143 BASE_FORMAT = "%-" + str(GB_BASE_INDENT) + "s"
144 INTERNAL_FORMAT = " " * GB_INTERNAL_INDENT + "%-" + \
145 str(GB_BASE_INDENT - GB_INTERNAL_INDENT) + "s"
146 OTHER_INTERNAL_FORMAT = " " * GB_OTHER_INTERNAL_INDENT + "%-" + \
147 str(GB_BASE_INDENT - GB_OTHER_INTERNAL_INDENT) + \
148 "s"
149
150 BASE_FEATURE_FORMAT = "%-" + str(GB_FEATURE_INDENT) + "s"
151 INTERNAL_FEATURE_FORMAT = " " * GB_FEATURE_INTERNAL_INDENT + "%-" + \
152 str(GB_FEATURE_INDENT -
153 GB_FEATURE_INTERNAL_INDENT) + "s"
154 SEQUENCE_FORMAT = "%" + str(GB_SEQUENCE_INDENT) + "s"
155
186
227
229 """Provide the output string for the LOCUS line.
230 """
231 output = "LOCUS"
232 output += " " * 7
233 output += "%-9s" % self.locus
234 output += " "
235 output += "%7s" % self.size
236 if self.residue_type.find("PROTEIN") >= 0:
237 output += " aa"
238 else:
239 output += " bp "
240
241
242
243 if self.residue_type.find("circular") >= 0:
244 output += "%17s" % self.residue_type
245
246 elif self.residue_type.find("-") >= 0:
247 output += "%7s" % self.residue_type
248 output += " " * 10
249 else:
250 output += " " * 3
251 output += "%-4s" % self.residue_type
252 output += " " * 10
253
254 output += " " * 2
255 output += "%3s" % self.data_file_division
256 output += " " * 7
257 output += "%11s" % self.date
258 output += "\n"
259 return output
260
267
284
286 """Output for the VERSION line.
287 """
288 if self.version:
289 output = Record.BASE_FORMAT % "VERSION"
290 output += self.version
291 output += " GI:"
292 output += "%s\n" % self.gi
293 else:
294 output = ""
295 return output
296
298 output = ""
299 if len(self.projects) > 0:
300 output = Record.BASE_FORMAT % "PROJECT"
301 output += "%s\n" % " ".join(self.projects)
302 return output
303
311
313 """Output for the NID line. Use of NID is obsolete in GenBank files.
314 """
315 if self.nid:
316 output = Record.BASE_FORMAT % "NID"
317 output += "%s\n" % self.nid
318 else:
319 output = ""
320 return output
321
323 """Output for PID line. Presumably, PID usage is also obsolete.
324 """
325 if self.pid:
326 output = Record.BASE_FORMAT % "PID"
327 output += "%s\n" % self.pid
328 else:
329 output = ""
330 return output
331
349
351 """Output for DBSOURCE line.
352 """
353 if self.db_source:
354 output = Record.BASE_FORMAT % "DBSOURCE"
355 output += "%s\n" % self.db_source
356 else:
357 output = ""
358 return output
359
368
375
392
402
404 """Output for the FEATURES line.
405 """
406 output = ""
407 if len(self.features) > 0:
408 output += Record.BASE_FEATURE_FORMAT % "FEATURES"
409 output += "Location/Qualifiers\n"
410 return output
411
413 """Output for the BASE COUNT line with base information.
414 """
415 output = ""
416 if self.base_counts:
417 output += Record.BASE_FORMAT % "BASE COUNT "
418
419 count_parts = self.base_counts.split(" ")
420 while '' in count_parts:
421 count_parts.remove('')
422
423
424 if len(count_parts) % 2 == 0:
425 while len(count_parts) > 0:
426 count_info = count_parts.pop(0)
427 count_type = count_parts.pop(0)
428
429 output += "%7s %s" % (count_info, count_type)
430
431
432
433 else:
434 output += self.base_counts
435 output += "\n"
436 return output
437
451
453 """Output for all of the sequence.
454 """
455 output = ""
456 if self.sequence:
457 cur_seq_pos = 0
458 while cur_seq_pos < len(self.sequence):
459 output += Record.SEQUENCE_FORMAT % str(cur_seq_pos + 1)
460
461 for section in range(6):
462 start_pos = cur_seq_pos + section * 10
463 end_pos = start_pos + 10
464 seq_section = self.sequence[start_pos:end_pos]
465 output += " %s" % seq_section.lower()
466
467
468 if end_pos > len(self.sequence):
469 break
470
471 output += "\n"
472 cur_seq_pos += 60
473 return output
474
481
483 output = ""
484 if self.wgs_scafld:
485 output += Record.BASE_FORMAT % "WGS_SCAFLD"
486 output += self.wgs_scafld
487 return output
488
498
499
501 """Hold information from a GenBank reference.
502
503 Attributes:
504 o number - The number of the reference in the listing of references.
505 o bases - The bases in the sequence the reference refers to.
506 o authors - String with all of the authors.
507 o consrtm - Consortium the authors belong to.
508 o title - The title of the reference.
509 o journal - Information about the journal where the reference appeared.
510 o medline_id - The medline id for the reference.
511 o pubmed_id - The pubmed_id for the reference.
512 o remark - Free-form remarks about the reference.
513 """
524
536
538 """Output for REFERENCE lines.
539 """
540 output = Record.BASE_FORMAT % "REFERENCE"
541 if self.number:
542 if self.bases:
543 output += "%-3s" % self.number
544 output += "%s" % self.bases
545 else:
546 output += "%s" % self.number
547
548 output += "\n"
549 return output
550
559
568
577
586
595
604
613
615 """Hold information about a Feature in the Feature Table of GenBank record.
616
617 Attributes:
618 o key - The key name of the feature (ie. source)
619 o location - The string specifying the location of the feature.
620 o qualifiers - A listing of Qualifier objects in the feature.
621 """
623 self.key = ''
624 self.location = ''
625 self.qualifiers = []
626
644
646 """Hold information about a qualifier in a GenBank feature.
647
648 Attributes:
649 o key - The key name of the qualifier (ie. /organism=)
650 o value - The value of the qualifier ("Dictyostelium discoideum").
651 """
655