1
2
3
4
5
6
7
8 """Bio.SeqIO support for the "swiss" (aka SwissProt/UniProt) file format.
9
10 You are expected to use this module via the Bio.SeqIO functions.
11 See also the Bio.SwissProt module which offers more than just accessing
12 the sequences as SeqRecord objects."""
13
14 from Bio import Seq
15 from Bio import SeqRecord
16 from Bio import Alphabet
17 from Bio import SeqFeature
18 from Bio import SwissProt
19
20
22 """Breaks up a Swiss-Prot/UniProt file into SeqRecord objects.
23
24 Every section from the ID line to the terminating // becomes
25 a single SeqRecord with associated annotation and features.
26
27 This parser is for the flat file "swiss" format as used by:
28 * Swiss-Prot aka SwissProt
29 * TrEMBL
30 * UniProtKB aka UniProt Knowledgebase
31
32 It does NOT read their new XML file format.
33 http://www.expasy.org/sprot/
34
35 For consistency with BioPerl and EMBOSS we call this the "swiss"
36 format.
37 """
38 swiss_records = SwissProt.parse(handle)
39 for swiss_record in swiss_records:
40
41 seq = Seq.Seq(swiss_record.sequence, Alphabet.generic_protein)
42 record = SeqRecord.SeqRecord(seq,
43 id=swiss_record.accessions[0],
44 name=swiss_record.entry_name,
45 description=swiss_record.description,
46 )
47 record.description = swiss_record.description
48 for cross_reference in swiss_record.cross_references:
49 if len(cross_reference) < 2:
50 continue
51 database, accession = cross_reference[:2]
52 dbxref = "%s:%s" % (database, accession)
53 if not dbxref in record.dbxrefs:
54 record.dbxrefs.append(dbxref)
55 annotations = record.annotations
56 annotations['accessions'] = swiss_record.accessions
57 annotations['date'] = swiss_record.created[0]
58 annotations['date_last_sequence_update'] = swiss_record.sequence_update[0]
59 if swiss_record.annotation_update:
60 annotations['date_last_annotation_update'] = swiss_record.annotation_update[0]
61 if swiss_record.gene_name:
62 annotations['gene_name'] = swiss_record.gene_name
63 annotations['organism'] = swiss_record.organism.rstrip(".")
64 annotations['taxonomy'] = swiss_record.organism_classification
65 annotations['ncbi_taxid'] = swiss_record.taxonomy_id
66 if swiss_record.host_organism:
67 annotations['organism_host'] = [word.rstrip(".") \
68 for word \
69 in swiss_record.host_organism]
70 if swiss_record.comments:
71 annotations['comment'] = "\n".join(swiss_record.comments)
72 if swiss_record.references:
73 annotations['references'] = []
74 for reference in swiss_record.references:
75 feature = SeqFeature.Reference()
76 feature.comment = " ".join(["%s=%s;" % (key, value) \
77 for key, value \
78 in reference.comments])
79 for key, value in reference.references:
80 if key == 'PubMed':
81 feature.pubmed_id = value
82 elif key == 'MEDLINE':
83 feature.medline_id = value
84 elif key == 'DOI':
85 pass
86 elif key == 'AGRICOLA':
87 pass
88 else:
89 raise ValueError(\
90 "Unknown key %s found in references" % key)
91 feature.authors = reference.authors
92 feature.title = reference.title
93 feature.journal = reference.location
94 annotations['references'].append(feature)
95 if swiss_record.keywords:
96 record.annotations['keywords'] = swiss_record.keywords
97 yield record
98
99 if __name__ == "__main__":
100 print "Quick self test..."
101
102 example_filename = "../../Tests/SwissProt/sp008"
103
104 import os
105 if not os.path.isfile(example_filename):
106 print "Missing test file %s" % example_filename
107 else:
108
109 handle = open(example_filename)
110 records = SwissIterator(handle)
111 for record in records:
112 print record.name
113 print record.id
114 print record.annotations['keywords']
115 print repr(record.annotations['organism'])
116 print record.seq.tostring()[:20] + "..."
117 handle.close()
118