1
2
3
4
5
6
7
8
9
10 """Implementations of Biopython-like Seq objects on top of BioSQL.
11
12 This allows retrival of items stored in a BioSQL database using
13 a biopython-like SeqRecord and Seq interface.
14
15 Note: Currently we do not support recording per-letter-annotations
16 (like quality scores) in BioSQL.
17 """
18
19 from Bio import Alphabet
20 from Bio.Seq import Seq, UnknownSeq
21 from Bio.SeqRecord import SeqRecord, _RestrictedDict
22 from Bio import SeqFeature
23
25 - def __init__(self, primary_id, adaptor, alphabet, start, length):
26 """Create a new DBSeq object referring to a BioSQL entry.
27
28 You wouldn't normally create a DBSeq object yourself, this is done
29 for you when retreiving a DBSeqRecord object from the database.
30 """
31 self.primary_id = primary_id
32 self.adaptor = adaptor
33 self.alphabet = alphabet
34 self._length = length
35 self.start = start
36
39
41
42
43
44 if isinstance(index, int):
45
46 i = index
47 if i < 0:
48 if -i > self._length:
49 raise IndexError(i)
50 i = i + self._length
51 elif i >= self._length:
52 raise IndexError(i)
53 return self.adaptor.get_subseq_as_string(self.primary_id,
54 self.start + i,
55 self.start + i + 1)
56 if not isinstance(index, slice):
57 raise ValueError("Unexpected index type")
58
59
60
61 if index.start is None:
62 i=0
63 else:
64 i = index.start
65 if i < 0:
66
67 if -i > self._length:
68 raise IndexError(i)
69 i = i + self._length
70 elif i >= self._length:
71
72 i = self._length
73
74 if index.stop is None:
75 j = self._length
76 else:
77 j = index.stop
78 if j < 0:
79
80 if -j > self._length:
81 raise IndexError(j)
82 j = j + self._length
83 elif j >= self._length:
84 j = self._length
85
86 if i >= j:
87
88 return Seq("", self.alphabet)
89 elif index.step is None or index.step == 1:
90
91 return self.__class__(self.primary_id, self.adaptor, self.alphabet,
92 self.start + i, j - i)
93 else:
94
95 full = self.adaptor.get_subseq_as_string(self.primary_id,
96 self.start + i,
97 self.start + j)
98 return Seq(full[::index.step], self.alphabet)
99
101 """Returns the full sequence as a python string.
102
103 Although not formally deprecated, you are now encouraged to use
104 str(my_seq) instead of my_seq.tostring()."""
105 return self.adaptor.get_subseq_as_string(self.primary_id,
106 self.start,
107 self.start + self._length)
113
114 data = property(tostring, doc="Sequence as string (DEPRECATED)")
115
117 """Returns the full sequence as a Seq object."""
118
119 return Seq(str(self), self.alphabet)
120
124
128
129
131
132
133
134
135
136
137 seqs = adaptor.execute_and_fetchall(
138 "SELECT alphabet, length, length(seq) FROM biosequence" \
139 " WHERE bioentry_id = %s", (primary_id,))
140 if not seqs : return
141 assert len(seqs) == 1
142 moltype, given_length, length = seqs[0]
143
144 try:
145 length = int(length)
146 given_length = int(length)
147 assert length == given_length
148 have_seq = True
149 except TypeError:
150 assert length is None
151 seqs = adaptor.execute_and_fetchall(
152 "SELECT alphabet, length, seq FROM biosequence" \
153 " WHERE bioentry_id = %s", (primary_id,))
154 assert len(seqs) == 1
155 moltype, given_length, seq = seqs[0]
156 assert seq is None or seq==""
157 length = int(given_length)
158 have_seq = False
159 del seq
160 del given_length
161
162 moltype = moltype.lower()
163
164
165 if moltype == "dna":
166 alphabet = Alphabet.generic_dna
167 elif moltype == "rna":
168 alphabet = Alphabet.generic_rna
169 elif moltype == "protein":
170 alphabet = Alphabet.generic_protein
171 elif moltype == "unknown":
172
173
174 alphabet = Alphabet.single_letter_alphabet
175 else:
176 raise AssertionError("Unknown moltype: %s" % moltype)
177
178 if have_seq:
179 return DBSeq(primary_id, adaptor, alphabet, 0, int(length))
180 else:
181 return UnknownSeq(length, alphabet)
182
184 """Retrieve the database cross references for the sequence."""
185 _dbxrefs = []
186 dbxrefs = adaptor.execute_and_fetchall(
187 "SELECT dbname, accession, version" \
188 " FROM bioentry_dbxref join dbxref using (dbxref_id)" \
189 " WHERE bioentry_id = %s" \
190 " ORDER BY rank", (primary_id,))
191 for dbname, accession, version in dbxrefs:
192 if version and version != "0":
193 v = "%s.%s" % (accession, version)
194 else:
195 v = accession
196 _dbxrefs.append("%s:%s" % (dbname, v))
197 return _dbxrefs
198
200 sql = "SELECT seqfeature_id, type.name, rank" \
201 " FROM seqfeature join term type on (type_term_id = type.term_id)" \
202 " WHERE bioentry_id = %s" \
203 " ORDER BY rank"
204 results = adaptor.execute_and_fetchall(sql, (primary_id,))
205 seq_feature_list = []
206 for seqfeature_id, seqfeature_type, seqfeature_rank in results:
207
208 qvs = adaptor.execute_and_fetchall(
209 "SELECT name, value" \
210 " FROM seqfeature_qualifier_value join term using (term_id)" \
211 " WHERE seqfeature_id = %s" \
212 " ORDER BY rank", (seqfeature_id,))
213 qualifiers = {}
214 for qv_name, qv_value in qvs:
215 qualifiers.setdefault(qv_name, []).append(qv_value)
216
217 qvs = adaptor.execute_and_fetchall(
218 "SELECT dbxref.dbname, dbxref.accession" \
219 " FROM dbxref join seqfeature_dbxref using (dbxref_id)" \
220 " WHERE seqfeature_dbxref.seqfeature_id = %s" \
221 " ORDER BY rank", (seqfeature_id,))
222 for qv_name, qv_value in qvs:
223 value = "%s:%s" % (qv_name, qv_value)
224 qualifiers.setdefault("db_xref", []).append(value)
225
226 results = adaptor.execute_and_fetchall(
227 "SELECT location_id, start_pos, end_pos, strand" \
228 " FROM location" \
229 " WHERE seqfeature_id = %s" \
230 " ORDER BY rank", (seqfeature_id,))
231 locations = []
232
233
234
235
236
237
238 for location_id, start, end, strand in results:
239 if start:
240 start -= 1
241 if strand == 0:
242 strand = None
243 if strand not in (+1, -1, None):
244 raise ValueError("Invalid strand %s found in database for " \
245 "seqfeature_id %s" % (strand, seqfeature_id))
246 if end < start:
247 import warnings
248 warnings.warn("Inverted location start/end (%i and %i) for " \
249 "seqfeature_id %s" % (start, end, seqfeature_id))
250 locations.append( (location_id, start, end, strand) )
251
252 remote_results = adaptor.execute_and_fetchall(
253 "SELECT location_id, dbname, accession, version" \
254 " FROM location join dbxref using (dbxref_id)" \
255 " WHERE seqfeature_id = %s", (seqfeature_id,))
256 lookup = {}
257 for location_id, dbname, accession, version in remote_results:
258 if version and version != "0":
259 v = "%s.%s" % (accession, version)
260 else:
261 v = accession
262
263
264 if dbname == "":
265 dbname = None
266 lookup[location_id] = (dbname, v)
267
268 feature = SeqFeature.SeqFeature(type = seqfeature_type)
269 feature._seqfeature_id = seqfeature_id
270 feature.qualifiers = qualifiers
271 if len(locations) == 0:
272 pass
273 elif len(locations) == 1:
274 location_id, start, end, strand = locations[0]
275
276
277 feature.location_operator = \
278 _retrieve_location_qualifier_value(adaptor, location_id)
279 dbname, version = lookup.get(location_id, (None, None))
280 feature.location = SeqFeature.FeatureLocation(start, end)
281 feature.strand = strand
282 feature.ref_db = dbname
283 feature.ref = version
284 else:
285 assert feature.sub_features == []
286 for location in locations:
287 location_id, start, end, strand = location
288 dbname, version = lookup.get(location_id, (None, None))
289 subfeature = SeqFeature.SeqFeature()
290 subfeature.type = seqfeature_type
291 subfeature.location_operator = \
292 _retrieve_location_qualifier_value(adaptor, location_id)
293
294
295
296 if not subfeature.location_operator:
297 subfeature.location_operator="join"
298 subfeature.location = SeqFeature.FeatureLocation(start, end)
299 subfeature.strand = strand
300 subfeature.ref_db = dbname
301 subfeature.ref = version
302 feature.sub_features.append(subfeature)
303
304
305 feature.location_operator = \
306 feature.sub_features[0].location_operator
307
308
309 start = locations[0][1]
310 end = locations[-1][2]
311 feature.location = SeqFeature.FeatureLocation(start, end)
312
313
314
315 strands = set(sf.strand for sf in feature.sub_features)
316 if len(strands)==1:
317 feature.strand = feature.sub_features[0].strand
318 else:
319 feature.strand = None
320
321 seq_feature_list.append(feature)
322
323 return seq_feature_list
324
326 value = adaptor.execute_and_fetch_col0(
327 "SELECT value FROM location_qualifier_value" \
328 " WHERE location_id = %s", (location_id,))
329 try:
330 return value[0]
331 except IndexError:
332 return ""
333
350
352 if isinstance(text, unicode):
353 return str(text)
354 else :
355 return text
356
358 qvs = adaptor.execute_and_fetchall(
359 "SELECT name, value" \
360 " FROM bioentry_qualifier_value JOIN term USING (term_id)" \
361 " WHERE bioentry_id = %s" \
362 " ORDER BY rank", (primary_id,))
363 qualifiers = {}
364 for name, value in qvs:
365 if name == "keyword": name = "keywords"
366 elif name == "date_changed": name = "dates"
367 elif name == "secondary_accession": name = "accessions"
368 qualifiers.setdefault(name, []).append(value)
369 return qualifiers
370
372
373
374 refs = adaptor.execute_and_fetchall(
375 "SELECT start_pos, end_pos, " \
376 " location, title, authors," \
377 " dbname, accession" \
378 " FROM bioentry_reference" \
379 " JOIN reference USING (reference_id)" \
380 " LEFT JOIN dbxref USING (dbxref_id)" \
381 " WHERE bioentry_id = %s" \
382 " ORDER BY rank", (primary_id,))
383 references = []
384 for start, end, location, title, authors, dbname, accession in refs:
385 reference = SeqFeature.Reference()
386
387 if (start is not None) or (end is not None):
388 if start is not None: start -= 1
389 reference.location = [SeqFeature.FeatureLocation(start, end)]
390
391 if authors : reference.authors = authors
392 if title : reference.title = title
393 reference.journal = location
394 if dbname == 'PUBMED':
395 reference.pubmed_id = accession
396 elif dbname == 'MEDLINE':
397 reference.medline_id = accession
398 references.append(reference)
399 if references:
400 return {'references': references}
401 else:
402 return {}
403
405 a = {}
406 common_names = adaptor.execute_and_fetch_col0(
407 "SELECT name FROM taxon_name WHERE taxon_id = %s" \
408 " AND name_class = 'genbank common name'", (taxon_id,))
409 if common_names:
410 a['source'] = common_names[0]
411 scientific_names = adaptor.execute_and_fetch_col0(
412 "SELECT name FROM taxon_name WHERE taxon_id = %s" \
413 " AND name_class = 'scientific name'", (taxon_id,))
414 if scientific_names:
415 a['organism'] = scientific_names[0]
416 ncbi_taxids = adaptor.execute_and_fetch_col0(
417 "SELECT ncbi_taxon_id FROM taxon WHERE taxon_id = %s", (taxon_id,))
418 if ncbi_taxids and ncbi_taxids[0] and ncbi_taxids[0] != "0":
419 a['ncbi_taxid'] = ncbi_taxids[0]
420
421
422
423
424
425
426
427
428
429
430 taxonomy = []
431 while taxon_id:
432 name, rank, parent_taxon_id = adaptor.execute_one(
433 "SELECT taxon_name.name, taxon.node_rank, taxon.parent_taxon_id" \
434 " FROM taxon, taxon_name" \
435 " WHERE taxon.taxon_id=taxon_name.taxon_id" \
436 " AND taxon_name.name_class='scientific name'" \
437 " AND taxon.taxon_id = %s", (taxon_id,))
438 if taxon_id == parent_taxon_id:
439
440
441
442 break
443 if rank != "no rank":
444
445
446
447 taxonomy.insert(0, name)
448 taxon_id = parent_taxon_id
449
450 if taxonomy:
451 a['taxonomy'] = taxonomy
452 return a
453
465
467 """BioSQL equivalent of the biopython SeqRecord object.
468 """
469
470 - def __init__(self, adaptor, primary_id):
471 self._adaptor = adaptor
472 self._primary_id = primary_id
473
474 (self._biodatabase_id, self._taxon_id, self.name,
475 accession, version, self._identifier,
476 self._division, self.description) = self._adaptor.execute_one(
477 "SELECT biodatabase_id, taxon_id, name, accession, version," \
478 " identifier, division, description" \
479 " FROM bioentry" \
480 " WHERE bioentry_id = %s", (self._primary_id,))
481 if version and version != "0":
482 self.id = "%s.%s" % (accession, version)
483 else:
484 self.id = accession
485
486
487
488 try:
489 length = len(self.seq)
490 except:
491
492 length = 0
493 self._per_letter_annotations = _RestrictedDict(length=length)
494
496 if not hasattr(self, "_seq"):
497 self._seq = _retrieve_seq(self._adaptor, self._primary_id)
498 return self._seq
501 seq = property(__get_seq, __set_seq, __del_seq, "Seq object")
502
504 if not hasattr(self,"_dbxrefs"):
505 self._dbxrefs = _retrieve_dbxrefs(self._adaptor, self._primary_id)
506 return self._dbxrefs
509 dbxrefs = property(__get_dbxrefs, __set_dbxrefs, __del_dbxrefs,
510 "Database cross references")
511
513 if not hasattr(self, "_features"):
514 self._features = _retrieve_features(self._adaptor,
515 self._primary_id)
516 return self._features
519 features = property(__get_features, __set_features, __del_features,
520 "Features")
521
523 if not hasattr(self, "_annotations"):
524 self._annotations = _retrieve_annotations(self._adaptor,
525 self._primary_id,
526 self._taxon_id)
527 if self._identifier:
528 self._annotations["gi"] = self._identifier
529 if self._division:
530 self._annotations["data_file_division"] = self._division
531 return self._annotations
534 annotations = property(__get_annotations, __set_annotations,
535 __del_annotations, "Annotations")
536