1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27 import warnings
28 import os
29 from Bio.Seq import Seq
30 from Bio.SeqRecord import SeqRecord
31 from Bio.Alphabet import generic_alphabet, generic_protein
32
34 """Basic functions for breaking up a GenBank/EMBL file into sub sections.
35
36 The International Nucleotide Sequence Database Collaboration (INSDC) is
37 a joint effort between the DDBJ, EMBL, and GenBank. These organisations all
38 use the same "Feature Table" layout in their plain text flat file formats.
39
40 However, the header and sequence sections of an EMBL file are very
41 different in layout to those produced by GenBank/DDBJ."""
42
43
44 RECORD_START = "XXX"
45 HEADER_WIDTH = 3
46 FEATURE_START_MARKERS = ["XXX***FEATURES***XXX"]
47 FEATURE_END_MARKERS = ["XXX***END FEATURES***XXX"]
48 FEATURE_QUALIFIER_INDENT = 0
49 FEATURE_QUALIFIER_SPACER = ""
50 SEQUENCE_HEADERS=["XXX"]
51
59
63
65 """Read in lines until the ID/LOCUS line is found, which is returned.
66
67 Any preamble (such as the header used by the NCBI on *.seq.gz archives)
68 will be ignored."""
69 while True:
70 if self.line:
71 line = self.line
72 self.line = ""
73 else:
74 line = self.handle.readline()
75 if not line:
76 if self.debug : print "End of file"
77 return None
78 if line[:self.HEADER_WIDTH]==self.RECORD_START:
79 if self.debug > 1: print "Found the start of a record:\n" + line
80 break
81 line = line.rstrip()
82 if line == "//":
83 if self.debug > 1: print "Skipping // marking end of last record"
84 elif line == "":
85 if self.debug > 1: print "Skipping blank line before record"
86 else:
87
88 if self.debug > 1:
89 print "Skipping header line before record:\n" + line
90 self.line = line
91 return line
92
94 """Return list of strings making up the header
95
96 New line characters are removed.
97
98 Assumes you have just read in the ID/LOCUS line.
99 """
100 assert self.line[:self.HEADER_WIDTH]==self.RECORD_START, \
101 "Not at start of record"
102
103 header_lines = []
104 while True:
105 line = self.handle.readline()
106 if not line:
107 raise ValueError("Premature end of line during sequence data")
108 line = line.rstrip()
109 if line in self.FEATURE_START_MARKERS:
110 if self.debug : print "Found header table"
111 break
112
113
114
115 if line[:self.HEADER_WIDTH].rstrip() in self.SEQUENCE_HEADERS:
116 if self.debug : print "Found start of sequence"
117 break
118 if line == "//":
119 raise ValueError("Premature end of sequence data marker '//' found")
120 header_lines.append(line)
121 self.line = line
122 return header_lines
123
187
189 """Expects a feature as a list of strings, returns a tuple (key, location, qualifiers)
190
191 For example given this GenBank feature:
192
193 CDS complement(join(490883..490885,1..879))
194 /locus_tag="NEQ001"
195 /note="conserved hypothetical [Methanococcus jannaschii];
196 COG1583:Uncharacterized ACR; IPR001472:Bipartite nuclear
197 localization signal; IPR002743: Protein of unknown
198 function DUF57"
199 /codon_start=1
200 /transl_table=11
201 /product="hypothetical protein"
202 /protein_id="NP_963295.1"
203 /db_xref="GI:41614797"
204 /db_xref="GeneID:2732620"
205 /translation="MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK
206 EKYFNFTLIPKKDIIENKRYYLIISSPDKRFIEVLHNKIKDLDIITIGLAQFQLRKTK
207 KFDPKLRFPWVTITPIVLREGKIVILKGDKYYKVFVKRLEELKKYNLIKKKEPILEEP
208 IEISLNQIKDGWKIIDVKDRYYDFRNKSFSAFSNWLRDLKEQSLRKYNNFCGKNFYFE
209 EAIFEGFTFYKTVSIRIRINRGEAVYIGTLWKELNVYRKLDKEEREFYKFLYDCGLGS
210 LNSMGFGFVNTKKNSAR"
211
212 Then should give input key="CDS" and the rest of the data as a list of strings
213 lines=["complement(join(490883..490885,1..879))", ..., "LNSMGFGFVNTKKNSAR"]
214 where the leading spaces and trailing newlines have been removed.
215
216 Returns tuple containing: (key as string, location string, qualifiers as list)
217 as follows for this example:
218
219 key = "CDS", string
220 location = "complement(join(490883..490885,1..879))", string
221 qualifiers = list of string tuples:
222
223 [('locus_tag', '"NEQ001"'),
224 ('note', '"conserved hypothetical [Methanococcus jannaschii];\nCOG1583:..."'),
225 ('codon_start', '1'),
226 ('transl_table', '11'),
227 ('product', '"hypothetical protein"'),
228 ('protein_id', '"NP_963295.1"'),
229 ('db_xref', '"GI:41614797"'),
230 ('db_xref', '"GeneID:2732620"'),
231 ('translation', '"MRLLLELKALNSIDKKQLSNYLIQGFIYNILKNTEYSWLHNWKK\nEKYFNFT..."')]
232
233 In the above example, the "note" and "translation" were edited for compactness,
234 and they would contain multiple new line characters (displayed above as \n)
235
236 If a qualifier is quoted (in this case, everything except codon_start and
237 transl_table) then the quotes are NOT removed.
238
239 Note that no whitespace is removed.
240 """
241
242 iterator = iter(filter(None, lines))
243 try:
244 line = iterator.next()
245
246 feature_location = line.strip()
247 while feature_location[-1:]==",":
248
249 feature_location += iterator.next().strip()
250
251 qualifiers=[]
252
253 for line in iterator:
254 if line[0]=="/":
255
256 i = line.find("=")
257 key = line[1:i]
258 value = line[i+1:]
259 if i==-1:
260
261 key = line[1:]
262 qualifiers.append((key,None))
263 elif value[0]=='"':
264
265 if value[-1]!='"' or value!='"':
266
267 while value[-1] != '"':
268 value += "\n" + iterator.next()
269 else:
270
271 assert value == '"'
272 if self.debug : print "Quoted line %s:%s" % (key, value)
273
274 qualifiers.append((key,value))
275 else:
276
277
278 qualifiers.append((key,value))
279 else:
280
281 assert len(qualifiers) > 0
282 assert key==qualifiers[-1][0]
283
284 qualifiers[-1] = (key, qualifiers[-1][1] + "\n" + line)
285 return (feature_key, feature_location, qualifiers)
286 except StopIteration:
287
288 raise ValueError("Problem with '%s' feature:\n%s" \
289 % (feature_key, "\n".join(lines)))
290
311
313 """Handle the LOCUS/ID line, passing data to the consumer
314
315 This should be implemented by the EMBL / GenBank specific subclass
316
317 Used by the parse_records() and parse() methods.
318 """
319 pass
320
322 """Handle the header lines (list of strings), passing data to the consumer
323
324 This should be implemented by the EMBL / GenBank specific subclass
325
326 Used by the parse_records() and parse() methods.
327 """
328 pass
329
330
344
346 """Handle any lines between features and sequence (list of strings), passing data to the consumer
347
348 This should be implemented by the EMBL / GenBank specific subclass
349
350 Used by the parse_records() and parse() methods.
351 """
352 pass
353
def feed(self, handle, consumer, do_features=True):
    """Parse a single record from handle, reporting each part to consumer.

    This method is intended for use with the "old" code in Bio.GenBank

    Arguments:
    handle - A handle with the information to parse.
    consumer - The consumer that should be informed of events.
    do_features - Boolean, should the features be parsed?
                  Skipping the features can be much faster.

    Return values:
    True  - A record was found and passed to the consumer.
    False - No record start was found; consumer.data is set to None.
    """
    self.set_handle(handle)

    # Guard clause: with no record start there is nothing to feed.
    if not self.find_start():
        consumer.data = None
        return False

    # Capture the current line before parse_header() reads any further.
    first_line = self.line
    self._feed_first_line(consumer, first_line)

    header_lines = self.parse_header()
    self._feed_header_lines(consumer, header_lines)

    # parse_features() is always called so the scanner moves past the
    # feature table; the result is only reported when features are wanted.
    feature_tuples = self.parse_features(skip=not do_features)
    if do_features:
        self._feed_feature_table(consumer, feature_tuples)

    # The footer yields the lines between features and sequence, then the
    # sequence itself.
    footer = self.parse_footer()
    misc_lines, sequence_string = footer
    self._feed_misc_lines(consumer, misc_lines)
    consumer.sequence(sequence_string)

    # Tell the consumer the record is finished.
    consumer.record_end("//")

    # Sanity check: the scanner should now be on the "//" terminator line.
    assert self.line == "//"

    return True
403
404 - def parse(self, handle, do_features=True):
419
420
422 """Returns a SeqRecord object iterator
423
424 Each record (from the ID/LOCUS line to the // line) becomes a SeqRecord
425
426 The SeqRecord objects include SeqFeatures if do_features=True
427
428 This method is intended for use in Bio.SeqIO
429 """
430
431 while True:
432 record = self.parse(handle, do_features)
433 if record is None : break
434 assert record.id is not None
435 assert record.name != "<unknown name>"
436 assert record.description != "<unknown description>"
437 yield record
438
442 """Returns SeqRecord object iterator
443
444 Each CDS feature becomes a SeqRecord.
445
446 alphabet - Used for any sequence found in a translation field.
447 tags2id - Tuple of three strings, the qualifier names whose values
448 are used for the record id, name and description.
449
450 This method is intended for use in Bio.SeqIO
451 """
452 self.set_handle(handle)
453 while self.find_start():
454
455 self.parse_header()
456 feature_tuples = self.parse_features()
457
458 while True:
459 line = self.handle.readline()
460 if not line : break
461 if line[:2]=="//" : break
462 self.line = line.rstrip()
463
464
465 for key, location_string, qualifiers in feature_tuples:
466 if key=="CDS":
467
468
469
470
471
472 record = SeqRecord(seq=None)
473 annotations = record.annotations
474
475
476
477
478 annotations['raw_location'] = location_string.replace(' ','')
479
480 for (qualifier_name, qualifier_data) in qualifiers:
481 if qualifier_data is not None \
482 and qualifier_data[0]=='"' and qualifier_data[-1]=='"':
483
484 qualifier_data = qualifier_data[1:-1]
485
486 if qualifier_name == "translation":
487 assert record.seq is None, "Multiple translations!"
488 record.seq = Seq(qualifier_data.replace("\n",""), alphabet)
489 elif qualifier_name == "db_xref":
490
491 record.dbxrefs.append(qualifier_data)
492 else:
493 if qualifier_data is not None:
494 qualifier_data = qualifier_data.replace("\n"," ").replace(" "," ")
495 try:
496 annotations[qualifier_name] += " " + qualifier_data
497 except KeyError:
498
499 annotations[qualifier_name]= qualifier_data
500
501
502
503 try:
504 record.id = annotations[tags2id[0]]
505 except KeyError:
506 pass
507 try:
508 record.name = annotations[tags2id[1]]
509 except KeyError:
510 pass
511 try:
512 record.description = annotations[tags2id[2]]
513 except KeyError:
514 pass
515
516 yield record
517
519 """For extracting chunks of information in EMBL files"""
520
521 RECORD_START = "ID "
522 HEADER_WIDTH = 5
523 FEATURE_START_MARKERS = ["FH Key Location/Qualifiers","FH"]
524 FEATURE_END_MARKERS = ["XX"]
525 FEATURE_QUALIFIER_INDENT = 21
526 FEATURE_QUALIFIER_SPACER = "FT" + " " * (FEATURE_QUALIFIER_INDENT-2)
527 SEQUENCE_HEADERS=["SQ", "CO"]
528
562
573
575
576
577
578 assert line[:self.HEADER_WIDTH].rstrip() == "ID"
579 fields = [line[self.HEADER_WIDTH:].split(None,1)[0]]
580 fields.extend(line[self.HEADER_WIDTH:].split(None,1)[1].split(";"))
581 fields = [entry.strip() for entry in fields]
582 """
583 The tokens represent:
584 0. Primary accession number
585 (space sep)
586 1. ??? (e.g. standard)
587 (semi-colon)
588 2. Topology and/or Molecule type (e.g. 'circular DNA' or 'DNA')
589 3. Taxonomic division (e.g. 'PRO')
590 4. Sequence length (e.g. '4639675 BP.')
591 """
592 consumer.locus(fields[0])
593 consumer.residue_type(fields[2])
594 consumer.data_file_division(fields[3])
595 self._feed_seq_length(consumer, fields[4])
596
598
599
600
601 assert line[:self.HEADER_WIDTH].rstrip() == "ID"
602 fields = [data.strip() for data in line[self.HEADER_WIDTH:].strip().split(";")]
603 assert len(fields) == 7
604 """
605 The tokens represent:
606 0. Primary accession number
607 1. Sequence version number
608 2. Topology: 'circular' or 'linear'
609 3. Molecule type (e.g. 'genomic DNA')
610 4. Data class (e.g. 'STD')
611 5. Taxonomic division (e.g. 'PRO')
612 6. Sequence length (e.g. '4639675 BP.')
613 """
614
615 consumer.locus(fields[0])
616
617
618
619 consumer.accession(fields[0])
620
621
622
623 version_parts = fields[1].split()
624 if len(version_parts)==2 \
625 and version_parts[0]=="SV" \
626 and version_parts[1].isdigit():
627 consumer.version_suffix(version_parts[1])
628
629
630 consumer.residue_type(" ".join(fields[2:4]))
631
632
633
634 consumer.data_file_division(fields[5])
635
636 self._feed_seq_length(consumer, fields[6])
637
639 length_parts = text.split()
640 assert len(length_parts) == 2
641 assert length_parts[1].upper() in ["BP", "BP."]
642 consumer.size(length_parts[0])
643
645 EMBL_INDENT = self.HEADER_WIDTH
646 EMBL_SPACER = " " * EMBL_INDENT
647 consumer_dict = {
648 'AC' : 'accession',
649 'SV' : 'version',
650 'DE' : 'definition',
651
652
653
654
655 'RG' : 'consrtm',
656
657
658 'RL' : 'journal',
659 'OS' : 'organism',
660 'OC' : 'taxonomy',
661
662 'CC' : 'comment',
663
664 }
665
666
667 lines = filter(None,lines)
668 line_iter = iter(lines)
669 try:
670 while True:
671 try:
672 line = line_iter.next()
673 except StopIteration:
674 break
675 if not line : break
676 line_type = line[:EMBL_INDENT].strip()
677 data = line[EMBL_INDENT:].strip()
678
679 if line_type == 'XX':
680 pass
681 elif line_type == 'RN':
682
683
684 if data[0] == "[" and data[-1] == "]" : data = data[1:-1]
685 consumer.reference_num(data)
686 elif line_type == 'RP':
687
688
689
690 parts = [bases.replace("-"," to ").strip() for bases in data.split(",")]
691 consumer.reference_bases("(bases %s)" % "; ".join(parts))
692 elif line_type == 'RT':
693
694
695 if data.startswith('"'):
696 data = data[1:]
697 if data.endswith('";'):
698 data = data[:-2]
699 consumer.title(data)
700 elif line_type == 'RX':
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716 key, value = data.split(";",1)
717 if value.endswith(".") : value = value[:-1]
718 value = value.strip()
719 if key == "PUBMED":
720 consumer.pubmed_id(value)
721
722 elif line_type == 'CC':
723
724 consumer.comment([data])
725 elif line_type == 'DR':
726
727
728
729
730
731
732
733 parts = data.rstrip(".").split(";")
734
735
736 consumer.dblink("%s:%s" % (parts[0].strip(),
737 parts[1].strip()))
738 elif line_type == 'RA':
739
740 consumer.authors(data.rstrip(";"))
741 elif line_type == 'PR':
742
743
744
745 consumer.project(data.rstrip(";"))
746 elif line_type in consumer_dict:
747
748 getattr(consumer, consumer_dict[line_type])(data)
749 else:
750 if self.debug:
751 print "Ignoring EMBL header line:\n%s" % line
752 except StopIteration:
753 raise ValueError("Problem with header")
754
756
757 lines.append("")
758 line_iter = iter(lines)
759 try:
760 for line in line_iter:
761 if line.startswith("CO "):
762 line = line[5:].strip()
763 contig_location = line
764 while True:
765 line = line_iter.next()
766 if not line:
767 break
768 elif line.startswith("CO "):
769
770 contig_location += line[5:].strip()
771 else:
772 raise ValueError('Expected CO (contig) continuation line, got:\n' + line)
773 consumer.contig_location(contig_location)
774 return
775 except StopIteration:
776 raise ValueError("Problem in misc lines before sequence")
777
779 """For extracting chunks of information in GenBank files"""
780
781 RECORD_START = "LOCUS "
782 HEADER_WIDTH = 12
783 FEATURE_START_MARKERS = ["FEATURES Location/Qualifiers","FEATURES"]
784 FEATURE_END_MARKERS = []
785 FEATURE_QUALIFIER_INDENT = 21
786 FEATURE_QUALIFIER_SPACER = " " * FEATURE_QUALIFIER_INDENT
787 SEQUENCE_HEADERS=["CONTIG", "ORIGIN", "BASE COUNT", "WGS"]
788
832
834
835
836
837 GENBANK_INDENT = self.HEADER_WIDTH
838 GENBANK_SPACER = " "*GENBANK_INDENT
839 assert line[0:GENBANK_INDENT] == 'LOCUS ', \
840 'LOCUS line does not start correctly:\n' + line
841
842
843
844 if line[29:33] in [' bp ', ' aa ',' rc ']:
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863 assert line[29:33] in [' bp ', ' aa ',' rc '] , \
864 'LOCUS line does not contain size units at expected position:\n' + line
865 assert line[41:42] == ' ', \
866 'LOCUS line does not contain space at position 42:\n' + line
867 assert line[42:51].strip() in ['','linear','circular'], \
868 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
869 assert line[51:52] == ' ', \
870 'LOCUS line does not contain space at position 52:\n' + line
871 assert line[55:62] == ' ', \
872 'LOCUS line does not contain spaces from position 56 to 62:\n' + line
873 if line[62:73].strip():
874 assert line[64:65] == '-', \
875 'LOCUS line does not contain - at position 65 in date:\n' + line
876 assert line[68:69] == '-', \
877 'LOCUS line does not contain - at position 69 in date:\n' + line
878
879 name_and_length_str = line[GENBANK_INDENT:29]
880 while name_and_length_str.find(' ')!=-1:
881 name_and_length_str = name_and_length_str.replace(' ',' ')
882 name_and_length = name_and_length_str.split(' ')
883 assert len(name_and_length)<=2, \
884 'Cannot parse the name and length in the LOCUS line:\n' + line
885 assert len(name_and_length)!=1, \
886 'Name and length collide in the LOCUS line:\n' + line
887
888
889
890 consumer.locus(name_and_length[0])
891 consumer.size(name_and_length[1])
892
893
894 if line[33:51].strip() == "" and line[29:33] == ' aa ':
895
896
897
898
899 consumer.residue_type("PROTEIN")
900 else:
901 consumer.residue_type(line[33:51].strip())
902
903 consumer.data_file_division(line[52:55])
904 if line[62:73].strip():
905 consumer.date(line[62:73])
906 elif line[40:44] in [' bp ', ' aa ',' rc ']:
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926 assert line[40:44] in [' bp ', ' aa ',' rc '] , \
927 'LOCUS line does not contain size units at expected position:\n' + line
928 assert line[44:47] in [' ', 'ss-', 'ds-', 'ms-'], \
929 'LOCUS line does not have valid strand type (Single stranded, ...):\n' + line
930 assert line[47:54].strip() == "" \
931 or line[47:54].strip().find('DNA') != -1 \
932 or line[47:54].strip().find('RNA') != -1, \
933 'LOCUS line does not contain valid sequence type (DNA, RNA, ...):\n' + line
934 assert line[54:55] == ' ', \
935 'LOCUS line does not contain space at position 55:\n' + line
936 assert line[55:63].strip() in ['','linear','circular'], \
937 'LOCUS line does not contain valid entry (linear, circular, ...):\n' + line
938 assert line[63:64] == ' ', \
939 'LOCUS line does not contain space at position 64:\n' + line
940 assert line[67:68] == ' ', \
941 'LOCUS line does not contain space at position 68:\n' + line
942 if line[68:79].strip():
943 assert line[70:71] == '-', \
944 'LOCUS line does not contain - at position 71 in date:\n' + line
945 assert line[74:75] == '-', \
946 'LOCUS line does not contain - at position 75 in date:\n' + line
947
948 name_and_length_str = line[GENBANK_INDENT:40]
949 while name_and_length_str.find(' ')!=-1:
950 name_and_length_str = name_and_length_str.replace(' ',' ')
951 name_and_length = name_and_length_str.split(' ')
952 assert len(name_and_length)<=2, \
953 'Cannot parse the name and length in the LOCUS line:\n' + line
954 assert len(name_and_length)!=1, \
955 'Name and length collide in the LOCUS line:\n' + line
956
957
958
959 consumer.locus(name_and_length[0])
960 consumer.size(name_and_length[1])
961
962 if line[44:54].strip() == "" and line[40:44] == ' aa ':
963
964
965
966
967 consumer.residue_type(("PROTEIN " + line[54:63]).strip())
968 else:
969 consumer.residue_type(line[44:63].strip())
970
971 consumer.data_file_division(line[64:67])
972 if line[68:79].strip():
973 consumer.date(line[68:79])
974 elif line[GENBANK_INDENT:].strip().count(" ")==0 :
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990 if line[GENBANK_INDENT:].strip() != "":
991 consumer.locus(line[GENBANK_INDENT:].strip())
992 else:
993
994
995 warnings.warn("Minimal LOCUS line found - is this correct?\n" + line)
996 elif len(line.split())>=4 and line.split()[3] in ["aa","bp"]:
997
998
999 consumer.locus(line.split()[1])
1000 consumer.size(line.split()[2])
1001 warnings.warn("Malformed LOCUS line found - is this correct?\n" + line)
1002 else:
1003 raise ValueError('Did not recognise the LOCUS line layout:\n' + line)
1004
1005
1007
1008
1009
1010
1011 GENBANK_INDENT = self.HEADER_WIDTH
1012 GENBANK_SPACER = " "*GENBANK_INDENT
1013 consumer_dict = {
1014 'DEFINITION' : 'definition',
1015 'ACCESSION' : 'accession',
1016 'NID' : 'nid',
1017 'PID' : 'pid',
1018 'DBSOURCE' : 'db_source',
1019 'KEYWORDS' : 'keywords',
1020 'SEGMENT' : 'segment',
1021 'SOURCE' : 'source',
1022 'AUTHORS' : 'authors',
1023 'CONSRTM' : 'consrtm',
1024 'PROJECT' : 'project',
1025 'DBLINK' : 'dblink',
1026 'TITLE' : 'title',
1027 'JOURNAL' : 'journal',
1028 'MEDLINE' : 'medline_id',
1029 'PUBMED' : 'pubmed_id',
1030 'REMARK' : 'remark'}
1031
1032
1033
1034
1035
1036
1037 lines = filter(None,lines)
1038 lines.append("")
1039 line_iter = iter(lines)
1040 try:
1041 line = line_iter.next()
1042 while True:
1043 if not line : break
1044 line_type = line[:GENBANK_INDENT].strip()
1045 data = line[GENBANK_INDENT:].strip()
1046
1047 if line_type == 'VERSION':
1048
1049
1050
1051 while data.find(' ')!=-1:
1052 data = data.replace(' ',' ')
1053 if data.find(' GI:')==-1:
1054 consumer.version(data)
1055 else:
1056 if self.debug : print "Version [" + data.split(' GI:')[0] + "], gi [" + data.split(' GI:')[1] + "]"
1057 consumer.version(data.split(' GI:')[0])
1058 consumer.gi(data.split(' GI:')[1])
1059
1060 line = line_iter.next()
1061 elif line_type == 'REFERENCE':
1062 if self.debug >1 : print "Found reference [" + data + "]"
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073 data = data.strip()
1074
1075
1076 while True:
1077 line = line_iter.next()
1078 if line[:GENBANK_INDENT] == GENBANK_SPACER:
1079
1080 data += " " + line[GENBANK_INDENT:]
1081 if self.debug >1 : print "Extended reference text [" + data + "]"
1082 else:
1083
1084 break
1085
1086
1087
1088 while data.find(' ')!=-1:
1089 data = data.replace(' ',' ')
1090 if data.find(' ')==-1:
1091 if self.debug >2 : print 'Reference number \"' + data + '\"'
1092 consumer.reference_num(data)
1093 else:
1094 if self.debug >2 : print 'Reference number \"' + data[:data.find(' ')] + '\", \"' + data[data.find(' ')+1:] + '\"'
1095 consumer.reference_num(data[:data.find(' ')])
1096 consumer.reference_bases(data[data.find(' ')+1:])
1097 elif line_type == 'ORGANISM':
1098
1099
1100
1101
1102
1103
1104
1105
1106 organism_data = data
1107 lineage_data = ""
1108 while True:
1109 line = line_iter.next()
1110 if line[0:GENBANK_INDENT] == GENBANK_SPACER:
1111 if lineage_data or ";" in line:
1112 lineage_data += " " + line[GENBANK_INDENT:]
1113 else:
1114 organism_data += " " + line[GENBANK_INDENT:].strip()
1115 else:
1116
1117 break
1118 consumer.organism(organism_data)
1119 if lineage_data.strip() == "" and self.debug > 1:
1120 print "Taxonomy line(s) missing or blank"
1121 consumer.taxonomy(lineage_data.strip())
1122 del organism_data, lineage_data
1123 elif line_type == 'COMMENT':
1124 if self.debug > 1 : print "Found comment"
1125
1126
1127 comment_list=[]
1128 comment_list.append(data)
1129 while True:
1130 line = line_iter.next()
1131 if line[0:GENBANK_INDENT] == GENBANK_SPACER:
1132 data = line[GENBANK_INDENT:]
1133 comment_list.append(data)
1134 if self.debug > 2 : print "Comment continuation [" + data + "]"
1135 else:
1136
1137 break
1138 consumer.comment(comment_list)
1139 del comment_list
1140 elif line_type in consumer_dict:
1141
1142
1143 while True:
1144 line = line_iter.next()
1145 if line[0:GENBANK_INDENT] == GENBANK_SPACER:
1146 data += ' ' + line[GENBANK_INDENT:]
1147 else:
1148
1149 getattr(consumer, consumer_dict[line_type])(data)
1150
1151 break
1152 else:
1153 if self.debug:
1154 print "Ignoring GenBank header line:\n" % line
1155
1156 line = line_iter.next()
1157 except StopIteration:
1158 raise ValueError("Problem in header")
1159
1200
1201 if __name__ == "__main__":
1202 from StringIO import StringIO
1203
1204 gbk_example = \
1205 """LOCUS SCU49845 5028 bp DNA PLN 21-JUN-1999
1206 DEFINITION Saccharomyces cerevisiae TCP1-beta gene, partial cds, and Axl2p
1207 (AXL2) and Rev7p (REV7) genes, complete cds.
1208 ACCESSION U49845
1209 VERSION U49845.1 GI:1293613
1210 KEYWORDS .
1211 SOURCE Saccharomyces cerevisiae (baker's yeast)
1212 ORGANISM Saccharomyces cerevisiae
1213 Eukaryota; Fungi; Ascomycota; Saccharomycotina; Saccharomycetes;
1214 Saccharomycetales; Saccharomycetaceae; Saccharomyces.
1215 REFERENCE 1 (bases 1 to 5028)
1216 AUTHORS Torpey,L.E., Gibbs,P.E., Nelson,J. and Lawrence,C.W.
1217 TITLE Cloning and sequence of REV7, a gene whose function is required for
1218 DNA damage-induced mutagenesis in Saccharomyces cerevisiae
1219 JOURNAL Yeast 10 (11), 1503-1509 (1994)
1220 PUBMED 7871890
1221 REFERENCE 2 (bases 1 to 5028)
1222 AUTHORS Roemer,T., Madden,K., Chang,J. and Snyder,M.
1223 TITLE Selection of axial growth sites in yeast requires Axl2p, a novel
1224 plasma membrane glycoprotein
1225 JOURNAL Genes Dev. 10 (7), 777-793 (1996)
1226 PUBMED 8846915
1227 REFERENCE 3 (bases 1 to 5028)
1228 AUTHORS Roemer,T.
1229 TITLE Direct Submission
1230 JOURNAL Submitted (22-FEB-1996) Terry Roemer, Biology, Yale University, New
1231 Haven, CT, USA
1232 FEATURES Location/Qualifiers
1233 source 1..5028
1234 /organism="Saccharomyces cerevisiae"
1235 /db_xref="taxon:4932"
1236 /chromosome="IX"
1237 /map="9"
1238 CDS <1..206
1239 /codon_start=3
1240 /product="TCP1-beta"
1241 /protein_id="AAA98665.1"
1242 /db_xref="GI:1293614"
1243 /translation="SSIYNGISTSGLDLNNGTIADMRQLGIVESYKLKRAVVSSASEA
1244 AEVLLRVDNIIRARPRTANRQHM"
1245 gene 687..3158
1246 /gene="AXL2"
1247 CDS 687..3158
1248 /gene="AXL2"
1249 /note="plasma membrane glycoprotein"
1250 /codon_start=1
1251 /function="required for axial budding pattern of S.
1252 cerevisiae"
1253 /product="Axl2p"
1254 /protein_id="AAA98666.1"
1255 /db_xref="GI:1293615"
1256 /translation="MTQLQISLLLTATISLLHLVVATPYEAYPIGKQYPPVARVNESF
1257 TFQISNDTYKSSVDKTAQITYNCFDLPSWLSFDSSSRTFSGEPSSDLLSDANTTLYFN
1258 VILEGTDSADSTSLNNTYQFVVTNRPSISLSSDFNLLALLKNYGYTNGKNALKLDPNE
1259 VFNVTFDRSMFTNEESIVSYYGRSQLYNAPLPNWLFFDSGELKFTGTAPVINSAIAPE
1260 TSYSFVIIATDIEGFSAVEVEFELVIGAHQLTTSIQNSLIINVTDTGNVSYDLPLNYV
1261 YLDDDPISSDKLGSINLLDAPDWVALDNATISGSVPDELLGKNSNPANFSVSIYDTYG
1262 DVIYFNFEVVSTTDLFAISSLPNINATRGEWFSYYFLPSQFTDYVNTNVSLEFTNSSQ
1263 DHDWVKFQSSNLTLAGEVPKNFDKLSLGLKANQGSQSQELYFNIIGMDSKITHSNHSA
1264 NATSTRSSHHSTSTSSYTSSTYTAKISSTSAAATSSAPAALPAANKTSSHNKKAVAIA
1265 CGVAIPLGVILVALICFLIFWRRRRENPDDENLPHAISGPDLNNPANKPNQENATPLN
1266 NPFDDDASSYDDTSIARRLAALNTLKLDNHSATESDISSVDEKRDSLSGMNTYNDQFQ
1267 SQSKEELLAKPPVQPPESPFFDPQNRSSSVYMDSEPAVNKSWRYTGNLSPVSDIVRDS
1268 YGSQKTVDTEKLFDLEAPEKEKRTSRDVTMSSLDPWNSNISPSPVRKSVTPSPYNVTK
1269 HRNRHLQNIQDSQSGKNGITPTTMSTSSSDDFVPVKDGENFCWVHSMEPDRRPSKKRL
1270 VDFSNKSNVNVGQVKDIHGRIPEML"
1271 gene complement(3300..4037)
1272 /gene="REV7"
1273 CDS complement(3300..4037)
1274 /gene="REV7"
1275 /codon_start=1
1276 /product="Rev7p"
1277 /protein_id="AAA98667.1"
1278 /db_xref="GI:1293616"
1279 /translation="MNRWVEKWLRVYLKCYINLILFYRNVYPPQSFDYTTYQSFNLPQ
1280 FVPINRHPALIDYIEELILDVLSKLTHVYRFSICIINKKNDLCIEKYVLDFSELQHVD
1281 KDDQIITETEVFDEFRSSLNSLIMHLEKLPKVNDDTITFEAVINAIELELGHKLDRNR
1282 RVDSLEEKAEIERDSNWVKCQEDENLPDNNGFQPPKIKLTSLVGSDVGPLIIHQFSEK
1283 LISGDDKILNGVYSQYEEGESIFGSLF"
1284 ORIGIN
1285 1 gatcctccat atacaacggt atctccacct caggtttaga tctcaacaac ggaaccattg
1286 61 ccgacatgag acagttaggt atcgtcgaga gttacaagct aaaacgagca gtagtcagct
1287 121 ctgcatctga agccgctgaa gttctactaa gggtggataa catcatccgt gcaagaccaa
1288 181 gaaccgccaa tagacaacat atgtaacata tttaggatat acctcgaaaa taataaaccg
1289 241 ccacactgtc attattataa ttagaaacag aacgcaaaaa ttatccacta tataattcaa
1290 301 agacgcgaaa aaaaaagaac aacgcgtcat agaacttttg gcaattcgcg tcacaaataa
1291 361 attttggcaa cttatgtttc ctcttcgagc agtactcgag ccctgtctca agaatgtaat
1292 421 aatacccatc gtaggtatgg ttaaagatag catctccaca acctcaaagc tccttgccga
1293 481 gagtcgccct cctttgtcga gtaattttca cttttcatat gagaacttat tttcttattc
1294 541 tttactctca catcctgtag tgattgacac tgcaacagcc accatcacta gaagaacaga
1295 601 acaattactt aatagaaaaa ttatatcttc ctcgaaacga tttcctgctt ccaacatcta
1296 661 cgtatatcaa gaagcattca cttaccatga cacagcttca gatttcatta ttgctgacag
1297 721 ctactatatc actactccat ctagtagtgg ccacgcccta tgaggcatat cctatcggaa
1298 781 aacaataccc cccagtggca agagtcaatg aatcgtttac atttcaaatt tccaatgata
1299 841 cctataaatc gtctgtagac aagacagctc aaataacata caattgcttc gacttaccga
1300 901 gctggctttc gtttgactct agttctagaa cgttctcagg tgaaccttct tctgacttac
1301 961 tatctgatgc gaacaccacg ttgtatttca atgtaatact cgagggtacg gactctgccg
1302 1021 acagcacgtc tttgaacaat acataccaat ttgttgttac aaaccgtcca tccatctcgc
1303 1081 tatcgtcaga tttcaatcta ttggcgttgt taaaaaacta tggttatact aacggcaaaa
1304 1141 acgctctgaa actagatcct aatgaagtct tcaacgtgac ttttgaccgt tcaatgttca
1305 1201 ctaacgaaga atccattgtg tcgtattacg gacgttctca gttgtataat gcgccgttac
1306 1261 ccaattggct gttcttcgat tctggcgagt tgaagtttac tgggacggca ccggtgataa
1307 1321 actcggcgat tgctccagaa acaagctaca gttttgtcat catcgctaca gacattgaag
1308 1381 gattttctgc cgttgaggta gaattcgaat tagtcatcgg ggctcaccag ttaactacct
1309 1441 ctattcaaaa tagtttgata atcaacgtta ctgacacagg taacgtttca tatgacttac
1310 1501 ctctaaacta tgtttatctc gatgacgatc ctatttcttc tgataaattg ggttctataa
1311 1561 acttattgga tgctccagac tgggtggcat tagataatgc taccatttcc gggtctgtcc
1312 1621 cagatgaatt actcggtaag aactccaatc ctgccaattt ttctgtgtcc atttatgata
1313 1681 cttatggtga tgtgatttat ttcaacttcg aagttgtctc cacaacggat ttgtttgcca
1314 1741 ttagttctct tcccaatatt aacgctacaa ggggtgaatg gttctcctac tattttttgc
1315 1801 cttctcagtt tacagactac gtgaatacaa acgtttcatt agagtttact aattcaagcc
1316 1861 aagaccatga ctgggtgaaa ttccaatcat ctaatttaac attagctgga gaagtgccca
1317 1921 agaatttcga caagctttca ttaggtttga aagcgaacca aggttcacaa tctcaagagc
1318 1981 tatattttaa catcattggc atggattcaa agataactca ctcaaaccac agtgcgaatg
1319 2041 caacgtccac aagaagttct caccactcca cctcaacaag ttcttacaca tcttctactt
1320 2101 acactgcaaa aatttcttct acctccgctg ctgctacttc ttctgctcca gcagcgctgc
1321 2161 cagcagccaa taaaacttca tctcacaata aaaaagcagt agcaattgcg tgcggtgttg
1322 2221 ctatcccatt aggcgttatc ctagtagctc tcatttgctt cctaatattc tggagacgca
1323 2281 gaagggaaaa tccagacgat gaaaacttac cgcatgctat tagtggacct gatttgaata
1324 2341 atcctgcaaa taaaccaaat caagaaaacg ctacaccttt gaacaacccc tttgatgatg
1325 2401 atgcttcctc gtacgatgat acttcaatag caagaagatt ggctgctttg aacactttga
1326 2461 aattggataa ccactctgcc actgaatctg atatttccag cgtggatgaa aagagagatt
1327 2521 ctctatcagg tatgaataca tacaatgatc agttccaatc ccaaagtaaa gaagaattat
1328 2581 tagcaaaacc cccagtacag cctccagaga gcccgttctt tgacccacag aataggtctt
1329 2641 cttctgtgta tatggatagt gaaccagcag taaataaatc ctggcgatat actggcaacc
1330 2701 tgtcaccagt ctctgatatt gtcagagaca gttacggatc acaaaaaact gttgatacag
1331 2761 aaaaactttt cgatttagaa gcaccagaga aggaaaaacg tacgtcaagg gatgtcacta
1332 2821 tgtcttcact ggacccttgg aacagcaata ttagcccttc tcccgtaaga aaatcagtaa
1333 2881 caccatcacc atataacgta acgaagcatc gtaaccgcca cttacaaaat attcaagact
1334 2941 ctcaaagcgg taaaaacgga atcactccca caacaatgtc aacttcatct tctgacgatt
1335 3001 ttgttccggt taaagatggt gaaaattttt gctgggtcca tagcatggaa ccagacagaa
1336 3061 gaccaagtaa gaaaaggtta gtagattttt caaataagag taatgtcaat gttggtcaag
1337 3121 ttaaggacat tcacggacgc atcccagaaa tgctgtgatt atacgcaacg atattttgct
1338 3181 taattttatt ttcctgtttt attttttatt agtggtttac agatacccta tattttattt
1339 3241 agtttttata cttagagaca tttaatttta attccattct tcaaatttca tttttgcact
1340 3301 taaaacaaag atccaaaaat gctctcgccc tcttcatatt gagaatacac tccattcaaa
1341 3361 attttgtcgt caccgctgat taatttttca ctaaactgat gaataatcaa aggccccacg
1342 3421 tcagaaccga ctaaagaagt gagttttatt ttaggaggtt gaaaaccatt attgtctggt
1343 3481 aaattttcat cttcttgaca tttaacccag tttgaatccc tttcaatttc tgctttttcc
1344 3541 tccaaactat cgaccctcct gtttctgtcc aacttatgtc ctagttccaa ttcgatcgca
1345 3601 ttaataactg cttcaaatgt tattgtgtca tcgttgactt taggtaattt ctccaaatgc
1346 3661 ataatcaaac tatttaagga agatcggaat tcgtcgaaca cttcagtttc cgtaatgatc
1347 3721 tgatcgtctt tatccacatg ttgtaattca ctaaaatcta aaacgtattt ttcaatgcat
1348 3781 aaatcgttct ttttattaat aatgcagatg gaaaatctgt aaacgtgcgt taatttagaa
1349 3841 agaacatcca gtataagttc ttctatatag tcaattaaag caggatgcct attaatggga
1350 3901 acgaactgcg gcaagttgaa tgactggtaa gtagtgtagt cgaatgactg aggtgggtat
1351 3961 acatttctat aaaataaaat caaattaatg tagcatttta agtataccct cagccacttc
1352 4021 tctacccatc tattcataaa gctgacgcaa cgattactat tttttttttc ttcttggatc
1353 4081 tcagtcgtcg caaaaacgta taccttcttt ttccgacctt ttttttagct ttctggaaaa
1354 4141 gtttatatta gttaaacagg gtctagtctt agtgtgaaag ctagtggttt cgattgactg
1355 4201 atattaagaa agtggaaatt aaattagtag tgtagacgta tatgcatatg tatttctcgc
1356 4261 ctgtttatgt ttctacgtac ttttgattta tagcaagggg aaaagaaata catactattt
1357 4321 tttggtaaag gtgaaagcat aatgtaaaag ctagaataaa atggacgaaa taaagagagg
1358 4381 cttagttcat cttttttcca aaaagcaccc aatgataata actaaaatga aaaggatttg
1359 4441 ccatctgtca gcaacatcag ttgtgtgagc aataataaaa tcatcacctc cgttgccttt
1360 4501 agcgcgtttg tcgtttgtat cttccgtaat tttagtctta tcaatgggaa tcataaattt
1361 4561 tccaatgaat tagcaatttc gtccaattct ttttgagctt cttcatattt gctttggaat
1362 4621 tcttcgcact tcttttccca ttcatctctt tcttcttcca aagcaacgat ccttctaccc
1363 4681 atttgctcag agttcaaatc ggcctctttc agtttatcca ttgcttcctt cagtttggct
1364 4741 tcactgtctt ctagctgttg ttctagatcc tggtttttct tggtgtagtt ctcattatta
1365 4801 gatctcaagt tattggagtc ttcagccaat tgctttgtat cagacaattg actctctaac
1366 4861 ttctccactt cactgtcgag ttgctcgttt ttagcggaca aagatttaat ctcgttttct
1367 4921 ttttcagtgt tagattgctc taattctttg agctgttctc tcagctcctc atatttttct
1368 4981 tgccatgact cagattctaa ttttaagcta ttcaatttct ctttgatc
1369 //"""
1370
1371
1372
# gbk_example2: sample GenBank flat-file text for one protein record
# (LOCUS AAD51968, 143 aa) terminated by "//". The demo code further down
# feeds it to GenBankScanner.parse_cds_features and
# GenBankScanner.parse_records, both on its own and appended to gbk_example.
# NOTE(review): flat-file layouts are presumably column-sensitive for the
# scanner -- do not reflow or re-indent the text inside this literal.
1373 gbk_example2 = \
1374 """LOCUS AAD51968 143 aa linear BCT 21-AUG-2001
1375 DEFINITION transcriptional regulator RovA [Yersinia enterocolitica].
1376 ACCESSION AAD51968
1377 VERSION AAD51968.1 GI:5805369
1378 DBSOURCE locus AF171097 accession AF171097.1
1379 KEYWORDS .
1380 SOURCE Yersinia enterocolitica
1381 ORGANISM Yersinia enterocolitica
1382 Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacteriales;
1383 Enterobacteriaceae; Yersinia.
1384 REFERENCE 1 (residues 1 to 143)
1385 AUTHORS Revell,P.A. and Miller,V.L.
1386 TITLE A chromosomally encoded regulator is required for expression of the
1387 Yersinia enterocolitica inv gene and for virulence
1388 JOURNAL Mol. Microbiol. 35 (3), 677-685 (2000)
1389 MEDLINE 20138369
1390 PUBMED 10672189
1391 REFERENCE 2 (residues 1 to 143)
1392 AUTHORS Revell,P.A. and Miller,V.L.
1393 TITLE Direct Submission
1394 JOURNAL Submitted (22-JUL-1999) Molecular Microbiology, Washington
1395 University School of Medicine, Campus Box 8230, 660 South Euclid,
1396 St. Louis, MO 63110, USA
1397 COMMENT Method: conceptual translation.
1398 FEATURES Location/Qualifiers
1399 source 1..143
1400 /organism="Yersinia enterocolitica"
1401 /mol_type="unassigned DNA"
1402 /strain="JB580v"
1403 /serotype="O:8"
1404 /db_xref="taxon:630"
1405 Protein 1..143
1406 /product="transcriptional regulator RovA"
1407 /name="regulates inv expression"
1408 CDS 1..143
1409 /gene="rovA"
1410 /coded_by="AF171097.1:380..811"
1411 /note="regulator of virulence"
1412 /transl_table=11
1413 ORIGIN
1414 1 mestlgsdla rlvrvwrali dhrlkplelt qthwvtlhni nrlppeqsqi qlakaigieq
1415 61 pslvrtldql eekglitrht candrrakri klteqsspii eqvdgvicst rkeilggisp
1416 121 deiellsgli dklerniiql qsk
1417 //
1418 """
1419
# embl_example: sample EMBL flat-file text for one record (ID X56734, an
# 1859 BP mRNA entry) terminated by "//". The demo code further down feeds it
# to EmblScanner.parse_cds_features and EmblScanner.parse_records.
# NOTE(review): flat-file layouts are presumably column-sensitive for the
# scanner -- do not reflow or re-indent the text inside this literal.
1420 embl_example="""ID X56734; SV 1; linear; mRNA; STD; PLN; 1859 BP.
1421 XX
1422 AC X56734; S46826;
1423 XX
1424 DT 12-SEP-1991 (Rel. 29, Created)
1425 DT 25-NOV-2005 (Rel. 85, Last updated, Version 11)
1426 XX
1427 DE Trifolium repens mRNA for non-cyanogenic beta-glucosidase
1428 XX
1429 KW beta-glucosidase.
1430 XX
1431 OS Trifolium repens (white clover)
1432 OC Eukaryota; Viridiplantae; Streptophyta; Embryophyta; Tracheophyta;
1433 OC Spermatophyta; Magnoliophyta; eudicotyledons; core eudicotyledons; rosids;
1434 OC eurosids I; Fabales; Fabaceae; Papilionoideae; Trifolieae; Trifolium.
1435 XX
1436 RN [5]
1437 RP 1-1859
1438 RX PUBMED; 1907511.
1439 RA Oxtoby E., Dunn M.A., Pancoro A., Hughes M.A.;
1440 RT "Nucleotide and derived amino acid sequence of the cyanogenic
1441 RT beta-glucosidase (linamarase) from white clover (Trifolium repens L.)";
1442 RL Plant Mol. Biol. 17(2):209-219(1991).
1443 XX
1444 RN [6]
1445 RP 1-1859
1446 RA Hughes M.A.;
1447 RT ;
1448 RL Submitted (19-NOV-1990) to the EMBL/GenBank/DDBJ databases.
1449 RL Hughes M.A., University of Newcastle Upon Tyne, Medical School, Newcastle
1450 RL Upon Tyne, NE2 4HH, UK
1451 XX
1452 FH Key Location/Qualifiers
1453 FH
1454 FT source 1..1859
1455 FT /organism="Trifolium repens"
1456 FT /mol_type="mRNA"
1457 FT /clone_lib="lambda gt10"
1458 FT /clone="TRE361"
1459 FT /tissue_type="leaves"
1460 FT /db_xref="taxon:3899"
1461 FT CDS 14..1495
1462 FT /product="beta-glucosidase"
1463 FT /EC_number="3.2.1.21"
1464 FT /note="non-cyanogenic"
1465 FT /db_xref="GOA:P26204"
1466 FT /db_xref="InterPro:IPR001360"
1467 FT /db_xref="InterPro:IPR013781"
1468 FT /db_xref="UniProtKB/Swiss-Prot:P26204"
1469 FT /protein_id="CAA40058.1"
1470 FT /translation="MDFIVAIFALFVISSFTITSTNAVEASTLLDIGNLSRSSFPRGFI
1471 FT FGAGSSAYQFEGAVNEGGRGPSIWDTFTHKYPEKIRDGSNADITVDQYHRYKEDVGIMK
1472 FT DQNMDSYRFSISWPRILPKGKLSGGINHEGIKYYNNLINELLANGIQPFVTLFHWDLPQ
1473 FT VLEDEYGGFLNSGVINDFRDYTDLCFKEFGDRVRYWSTLNEPWVFSNSGYALGTNAPGR
1474 FT CSASNVAKPGDSGTGPYIVTHNQILAHAEAVHVYKTKYQAYQKGKIGITLVSNWLMPLD
1475 FT DNSIPDIKAAERSLDFQFGLFMEQLTTGDYSKSMRRIVKNRLPKFSKFESSLVNGSFDF
1476 FT IGINYYSSSYISNAPSHGNAKPSYSTNPMTNISFEKHGIPLGPRAASIWIYVYPYMFIQ
1477 FT EDFEIFCYILKINITILQFSITENGMNEFNDATLPVEEALLNTYRIDYYYRHLYYIRSA
1478 FT IRAGSNVKGFYAWSFLDCNEWFAGFTVRFGLNFVD"
1479 FT mRNA 1..1859
1480 FT /experiment="experimental evidence, no additional details
1481 FT recorded"
1482 XX
1483 SQ Sequence 1859 BP; 609 A; 314 C; 355 G; 581 T; 0 other;
1484 aaacaaacca aatatggatt ttattgtagc catatttgct ctgtttgtta ttagctcatt 60
1485 cacaattact tccacaaatg cagttgaagc ttctactctt cttgacatag gtaacctgag 120
1486 tcggagcagt tttcctcgtg gcttcatctt tggtgctgga tcttcagcat accaatttga 180
1487 aggtgcagta aacgaaggcg gtagaggacc aagtatttgg gataccttca cccataaata 240
1488 tccagaaaaa ataagggatg gaagcaatgc agacatcacg gttgaccaat atcaccgcta 300
1489 caaggaagat gttgggatta tgaaggatca aaatatggat tcgtatagat tctcaatctc 360
1490 ttggccaaga atactcccaa agggaaagtt gagcggaggc ataaatcacg aaggaatcaa 420
1491 atattacaac aaccttatca acgaactatt ggctaacggt atacaaccat ttgtaactct 480
1492 ttttcattgg gatcttcccc aagtcttaga agatgagtat ggtggtttct taaactccgg 540
1493 tgtaataaat gattttcgag actatacgga tctttgcttc aaggaatttg gagatagagt 600
1494 gaggtattgg agtactctaa atgagccatg ggtgtttagc aattctggat atgcactagg 660
1495 aacaaatgca ccaggtcgat gttcggcctc caacgtggcc aagcctggtg attctggaac 720
1496 aggaccttat atagttacac acaatcaaat tcttgctcat gcagaagctg tacatgtgta 780
1497 taagactaaa taccaggcat atcaaaaggg aaagataggc ataacgttgg tatctaactg 840
1498 gttaatgcca cttgatgata atagcatacc agatataaag gctgccgaga gatcacttga 900
1499 cttccaattt ggattgttta tggaacaatt aacaacagga gattattcta agagcatgcg 960
1500 gcgtatagtt aaaaaccgat tacctaagtt ctcaaaattc gaatcaagcc tagtgaatgg 1020
1501 ttcatttgat tttattggta taaactatta ctcttctagt tatattagca atgccccttc 1080
1502 acatggcaat gccaaaccca gttactcaac aaatcctatg accaatattt catttgaaaa 1140
1503 acatgggata cccttaggtc caagggctgc ttcaatttgg atatatgttt atccatatat 1200
1504 gtttatccaa gaggacttcg agatcttttg ttacatatta aaaataaata taacaatcct 1260
1505 gcaattttca atcactgaaa atggtatgaa tgaattcaac gatgcaacac ttccagtaga 1320
1506 agaagctctt ttgaatactt acagaattga ttactattac cgtcacttat actacattcg 1380
1507 ttctgcaatc agggctggct caaatgtgaa gggtttttac gcatggtcat ttttggactg 1440
1508 taatgaatgg tttgcaggct ttactgttcg ttttggatta aactttgtag attagaaaga 1500
1509 tggattaaaa aggtacccta agctttctgc ccaatggtac aagaactttc tcaaaagaaa 1560
1510 ctagctagta ttattaaaag aactttgtag tagattacag tacatcgttt gaagttgagt 1620
1511 tggtgcacct aattaaataa aagaggttac tcttaacata tttttaggcc attcgttgtg 1680
1512 aagttgttag gctgttattt ctattatact atgttgtagt aataagtgca ttgttgtacc 1740
1513 agaagctatg atcataacta taggttgatc cttcatgtat cagtttgatg ttgagaatac 1800
1514 tttgaattaa aagtcttttt ttattttttt aaaaaaaaaa aaaaaaaaaa aaaaaaaaa 1859
1515 //
1516 """
1517
# Demo driver: run the GenBank and EMBL scanners over the example records
# defined above and print what they yield.  Every print(...) call takes a
# single argument, so it emits the same text under the Python 2 print
# statement used throughout this file.

# Qualifier keys handed to parse_cds_features via tags2id in two of the runs.
cds_id_tags = ('gene','locus_tag','product')

print("GenBank CDS Iteration")
print("=====================")

# First run relies on the scanner's default tags2id.
g = GenBankScanner()
for record in g.parse_cds_features(StringIO(gbk_example)):
    print(record)

# Then the second record alone, and both records joined by a newline,
# each with the explicit tags2id tuple and a fresh scanner.
for cds_text in (gbk_example2, gbk_example + "\n" + gbk_example2):
    g = GenBankScanner()
    for record in g.parse_cds_features(StringIO(cds_text),
                                       tags2id=cds_id_tags):
        print(record)

print("")
print("GenBank Iteration")
print("=================")
# Each example is parsed twice: without features first, then with them.
for record_text in (gbk_example, gbk_example2):
    for with_features in (False, True):
        g = GenBankScanner()
        for record in g.parse_records(StringIO(record_text),
                                      do_features=with_features):
            print("%s %s %s" % (record.id, record.name, record.description))
            print(record.seq)

print("")
print("EMBL CDS Iteration")
print("==================")

e = EmblScanner()
for record in e.parse_cds_features(StringIO(embl_example)):
    print(record)

print("")
print("EMBL Iteration")
print("==============")
e = EmblScanner()
for record in e.parse_records(StringIO(embl_example), do_features=True):
    print("%s %s %s" % (record.id, record.name, record.description))
    print(record.seq)
1573