Package Bio :: Module SeqRecord
[hide private]
[frames] | no frames]

Source Code for Module Bio.SeqRecord

  1  # Copyright 2000-2002 Andrew Dalke. 
  2  # Copyright 2002-2004 Brad Chapman. 
  3  # Copyright 2006-2009 by Peter Cock. 
  4  # All rights reserved. 
  5  # This code is part of the Biopython distribution and governed by its 
  6  # license.  Please see the LICENSE file that should have been included 
  7  # as part of this package. 
  8  """Represent a Sequence Record, a sequence with annotation.""" 
  9  __docformat__ = "epytext en" #Simple markup to show doctests nicely 
 10   
 11  # NEEDS TO BE SYNCH WITH THE REST OF BIOPYTHON AND BIOPERL 
 12  # In particular, the SeqRecord and BioSQL.BioSeq.DBSeqRecord classes 
 13  # need to be in sync (this is the BioSQL "Database SeqRecord", see 
 14  # also BioSQL.BioSeq.DBSeq which is the "Database Seq" class) 
 15   
16 -class _RestrictedDict(dict):
17 """Dict which only allows sequences of given length as values (PRIVATE). 18 19 This simple subclass of the Python dictionary is used in the SeqRecord 20 object for holding per-letter-annotations. This class is intended to 21 prevent simple errors by only allowing python sequences (e.g. lists, 22 strings and tuples) to be stored, and only if their length matches that 23 expected (the length of the SeqRecord's seq object). It cannot however 24 prevent the entries being edited in situ (for example appending entries 25 to a list). 26 """
27 - def __init__(self, length):
28 """Create an EMPTY restricted dictionary.""" 29 dict.__init__(self) 30 self._length = int(length)
31 - def __setitem__(self, key, value):
32 if not hasattr(value,"__len__") or not hasattr(value,"__getitem__") \ 33 or len(value) != self._length: 34 raise TypeError("We only allow python sequences (lists, tuples or " 35 "strings) of length %i." % self._length) 36 dict.__setitem__(self, key, value)
37 - def update(self, new_dict):
38 #Force this to go via our strict __setitem__ method 39 for (key, value) in new_dict.iteritems(): 40 self[key] = value
41
42 -class SeqRecord(object):
43 """A SeqRecord object holds a sequence and information about it. 44 45 Main attributes: 46 - id - Identifier such as a locus tag (string) 47 - seq - The sequence itself (Seq object or similar) 48 49 Additional attributes: 50 - name - Sequence name, e.g. gene name (string) 51 - description - Additional text (string) 52 - dbxrefs - List of database cross references (list of strings) 53 - features - Any (sub)features defined (list of SeqFeature objects) 54 - annotations - Further information about the whole sequence (dictionary) 55 Most entries are strings, or lists of strings. 56 - letter_annotations - Per letter/symbol annotation (restricted 57 dictionary). This holds Python sequences (lists, strings 58 or tuples) whose length matches that of the sequence. 59 A typical use would be to hold a list of integers 60 representing sequencing quality scores, or a string 61 representing the secondary structure. 62 63 You will typically use Bio.SeqIO to read in sequences from files as 64 SeqRecord objects. However, you may want to create your own SeqRecord 65 objects directly (see the __init__ method for further details): 66 67 >>> from Bio.Seq import Seq 68 >>> from Bio.SeqRecord import SeqRecord 69 >>> from Bio.Alphabet import IUPAC 70 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 71 ... IUPAC.protein), 72 ... id="YP_025292.1", name="HokC", 73 ... description="toxic membrane protein") 74 >>> print record 75 ID: YP_025292.1 76 Name: HokC 77 Description: toxic membrane protein 78 Number of features: 0 79 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 80 81 If you want to save SeqRecord objects to a sequence file, use Bio.SeqIO 82 for this. For the special case where you want the SeqRecord turned into 83 a string in a particular file format there is a format method which uses 84 Bio.SeqIO internally: 85 86 >>> print record.format("fasta") 87 >YP_025292.1 toxic membrane protein 88 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 89 <BLANKLINE> 90 91 You can also do things like slicing a SeqRecord, checking its length, etc 92 93 >>> len(record) 94 44 95 >>> edited = record[:10] + record[11:] 96 >>> print edited.seq 97 MKQHKAMIVAIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 98 >>> print record.seq 99 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 100 101 """
102 - def __init__(self, seq, id = "<unknown id>", name = "<unknown name>", 103 description = "<unknown description>", dbxrefs = None, 104 features = None, annotations = None, 105 letter_annotations = None):
106 """Create a SeqRecord. 107 108 Arguments: 109 - seq - Sequence, required (Seq, MutableSeq or UnknownSeq) 110 - id - Sequence identifier, recommended (string) 111 - name - Sequence name, optional (string) 112 - description - Sequence description, optional (string) 113 - dbxrefs - Database cross references, optional (list of strings) 114 - features - Any (sub)features, optional (list of SeqFeature objects) 115 - annotations - Dictionary of annotations for the whole sequence 116 - letter_annotations - Dictionary of per-letter-annotations, values 117 should be strings, list or tuples of the same 118 length as the full sequence. 119 120 You will typically use Bio.SeqIO to read in sequences from files as 121 SeqRecord objects. However, you may want to create your own SeqRecord 122 objects directly. 123 124 Note that while an id is optional, we strongly recommend you supply a 125 unique id string for each record. This is especially important 126 if you wish to write your sequences to a file. 127 128 If you don't have the actual sequence, but you do know its length, 129 then using the UnknownSeq object from Bio.Seq is appropriate. 130 131 You can create a 'blank' SeqRecord object, and then populate the 132 attributes later. 133 """ 134 if id is not None and not isinstance(id, basestring): 135 #Lots of existing code uses id=None... this may be a bad idea. 136 raise TypeError("id argument should be a string") 137 if not isinstance(name, basestring): 138 raise TypeError("name argument should be a string") 139 if not isinstance(description, basestring): 140 raise TypeError("description argument should be a string") 141 self._seq = seq 142 self.id = id 143 self.name = name 144 self.description = description 145 146 # database cross references (for the whole sequence) 147 if dbxrefs is None: 148 dbxrefs = [] 149 elif not isinstance(dbxrefs, list): 150 raise TypeError("dbxrefs argument should be a list (of strings)") 151 self.dbxrefs = dbxrefs 152 153 # annotations about the whole sequence 154 if annotations is None: 155 annotations = {} 156 elif not isinstance(annotations, dict): 157 raise TypeError("annotations argument should be a dict") 158 self.annotations = annotations 159 160 if letter_annotations is None: 161 # annotations about each letter in the sequence 162 if seq is None: 163 #Should we allow this and use a normal unrestricted dict? 164 self._per_letter_annotations = _RestrictedDict(length=0) 165 else: 166 try: 167 self._per_letter_annotations = \ 168 _RestrictedDict(length=len(seq)) 169 except: 170 raise TypeError("seq argument should be a Seq object or similar") 171 else: 172 #This will be handled via the property set function, which will 173 #turn this into a _RestrictedDict and thus ensure all the values 174 #in the dict are the right length 175 self.letter_annotations = letter_annotations 176 177 # annotations about parts of the sequence 178 if features is None: 179 features = [] 180 elif not isinstance(features, list): 181 raise TypeError("features argument should be a list (of SeqFeature objects)") 182 self.features = features
183 184 #TODO - Just make this a read only property?
185 - def _set_per_letter_annotations(self, value):
186 if not isinstance(value, dict): 187 raise TypeError("The per-letter-annotations should be a " 188 "(restricted) dictionary.") 189 #Turn this into a restricted-dictionary (and check the entries) 190 try: 191 self._per_letter_annotations = _RestrictedDict(length=len(self.seq)) 192 except AttributeError: 193 #e.g. seq is None 194 self._per_letter_annotations = _RestrictedDict(length=0) 195 self._per_letter_annotations.update(value)
196 letter_annotations = property( \ 197 fget=lambda self : self._per_letter_annotations, 198 fset=_set_per_letter_annotations, 199 doc="""Dictionary of per-letter-annotation for the sequence. 200 201 For example, this can hold quality scores used in FASTQ or QUAL files. 202 Consider this example using Bio.SeqIO to read in an example Solexa 203 variant FASTQ file as a SeqRecord: 204 205 >>> from Bio import SeqIO 206 >>> handle = open("Quality/solexa_faked.fastq", "rU") 207 >>> record = SeqIO.read(handle, "fastq-solexa") 208 >>> handle.close() 209 >>> print record.id, record.seq 210 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 211 >>> print record.letter_annotations.keys() 212 ['solexa_quality'] 213 >>> print record.letter_annotations["solexa_quality"] 214 [40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5] 215 216 The letter_annotations get sliced automatically if you slice the 217 parent SeqRecord, for example taking the last ten bases: 218 219 >>> sub_record = record[-10:] 220 >>> print sub_record.id, sub_record.seq 221 slxa_0001_1_0001_01 ACGTNNNNNN 222 >>> print sub_record.letter_annotations["solexa_quality"] 223 [4, 3, 2, 1, 0, -1, -2, -3, -4, -5] 224 225 Any python sequence (i.e. list, tuple or string) can be recorded in 226 the SeqRecord's letter_annotations dictionary as long as the length 227 matches that of the SeqRecord's sequence. e.g. 228 229 >>> len(sub_record.letter_annotations) 230 1 231 >>> sub_record.letter_annotations["dummy"] = "abcdefghij" 232 >>> len(sub_record.letter_annotations) 233 2 234 235 You can delete entries from the letter_annotations dictionary as usual: 236 237 >>> del sub_record.letter_annotations["solexa_quality"] 238 >>> sub_record.letter_annotations 239 {'dummy': 'abcdefghij'} 240 241 You can completely clear the dictionary easily as follows: 242 243 >>> sub_record.letter_annotations = {} 244 >>> sub_record.letter_annotations 245 {} 246 """) 247
248 - def _set_seq(self, value):
249 #TODO - Add a deprecation warning that the seq should be write only? 250 if self._per_letter_annotations: 251 #TODO - Make this a warning? Silently empty the dictionary? 252 raise ValueError("You must empty the letter annotations first!") 253 self._seq = value 254 try: 255 self._per_letter_annotations = _RestrictedDict(length=len(self.seq)) 256 except AttributeError: 257 #e.g. seq is None 258 self._per_letter_annotations = _RestrictedDict(length=0)
259 260 seq = property(fget=lambda self : self._seq, 261 fset=_set_seq, 262 doc="The sequence itself, as a Seq or MutableSeq object.") 263
264 - def __getitem__(self, index):
265 """Returns a sub-sequence or an individual letter. 266 267 Slicing, e.g. my_record[5:10], returns a new SeqRecord for 268 that sub-sequence with approriate annotation preserved. The 269 name, id and description are kept. 270 271 Any per-letter-annotations are sliced to match the requested 272 sub-sequence. Unless a stride is used, all those features 273 which fall fully within the subsequence are included (with 274 their locations adjusted accordingly). 275 276 However, the annotations dictionary and the dbxrefs list are 277 not used for the new SeqRecord, as in general they may not 278 apply to the subsequence. If you want to preserve them, you 279 must explictly copy them to the new SeqRecord yourself. 280 281 Using an integer index, e.g. my_record[5] is shorthand for 282 extracting that letter from the sequence, my_record.seq[5]. 283 284 For example, consider this short protein and its secondary 285 structure as encoded by the PDB (e.g. H for alpha helices), 286 plus a simple feature for its histidine self phosphorylation 287 site: 288 289 >>> from Bio.Seq import Seq 290 >>> from Bio.SeqRecord import SeqRecord 291 >>> from Bio.SeqFeature import SeqFeature, FeatureLocation 292 >>> from Bio.Alphabet import IUPAC 293 >>> rec = SeqRecord(Seq("MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLAT" 294 ... "EMMSEQDGYLAESINKDIEECNAIIEQFIDYLR", 295 ... IUPAC.protein), 296 ... id="1JOY", name="EnvZ", 297 ... description="Homodimeric domain of EnvZ from E. coli") 298 >>> rec.letter_annotations["secondary_structure"] = \ 299 " S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT " 300 >>> rec.features.append(SeqFeature(FeatureLocation(20,21), 301 ... type = "Site")) 302 303 Now let's have a quick look at the full record, 304 305 >>> print rec 306 ID: 1JOY 307 Name: EnvZ 308 Description: Homodimeric domain of EnvZ from E. coli 309 Number of features: 1 310 Per letter annotation for: secondary_structure 311 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein()) 312 >>> print rec.letter_annotations["secondary_structure"] 313 S SSSSSSHHHHHTTTHHHHHHHHHHHHHHHHHHHHHHTHHHHHHHHHHHHHHHHHHHHHTT 314 >>> print rec.features[0].location 315 [20:21] 316 317 Now let's take a sub sequence, here chosen as the first (fractured) 318 alpha helix which includes the histidine phosphorylation site: 319 320 >>> sub = rec[11:41] 321 >>> print sub 322 ID: 1JOY 323 Name: EnvZ 324 Description: Homodimeric domain of EnvZ from E. coli 325 Number of features: 1 326 Per letter annotation for: secondary_structure 327 Seq('RTLLMAGVSHDLRTPLTRIRLATEMMSEQD', IUPACProtein()) 328 >>> print sub.letter_annotations["secondary_structure"] 329 HHHHHTTTHHHHHHHHHHHHHHHHHHHHHH 330 >>> print sub.features[0].location 331 [9:10] 332 333 You can also of course omit the start or end values, for 334 example to get the first ten letters only: 335 336 >>> print rec[:10] 337 ID: 1JOY 338 Name: EnvZ 339 Description: Homodimeric domain of EnvZ from E. coli 340 Number of features: 0 341 Per letter annotation for: secondary_structure 342 Seq('MAAGVKQLAD', IUPACProtein()) 343 344 Or for the last ten letters: 345 346 >>> print rec[-10:] 347 ID: 1JOY 348 Name: EnvZ 349 Description: Homodimeric domain of EnvZ from E. coli 350 Number of features: 0 351 Per letter annotation for: secondary_structure 352 Seq('IIEQFIDYLR', IUPACProtein()) 353 354 If you omit both, then you get a copy of the original record (although 355 lacking the annotations and dbxrefs): 356 357 >>> print rec[:] 358 ID: 1JOY 359 Name: EnvZ 360 Description: Homodimeric domain of EnvZ from E. coli 361 Number of features: 1 362 Per letter annotation for: secondary_structure 363 Seq('MAAGVKQLADDRTLLMAGVSHDLRTPLTRIRLATEMMSEQDGYLAESINKDIEE...YLR', IUPACProtein()) 364 365 Finally, indexing with a simple integer is shorthand for pulling out 366 that letter from the sequence directly: 367 368 >>> rec[5] 369 'K' 370 >>> rec.seq[5] 371 'K' 372 """ 373 if isinstance(index, int): 374 #NOTE - The sequence level annotation like the id, name, etc 375 #do not really apply to a single character. However, should 376 #we try and expose any per-letter-annotation here? If so how? 377 return self.seq[index] 378 elif isinstance(index, slice): 379 if self.seq is None: 380 raise ValueError("If the sequence is None, we cannot slice it.") 381 parent_length = len(self) 382 answer = self.__class__(self.seq[index], 383 id=self.id, 384 name=self.name, 385 description=self.description) 386 #TODO - The desription may no longer apply. 387 #It would be safer to change it to something 388 #generic like "edited" or the default value. 389 390 #Don't copy the annotation dict and dbxefs list, 391 #they may not apply to a subsequence. 392 #answer.annotations = dict(self.annotations.iteritems()) 393 #answer.dbxrefs = self.dbxrefs[:] 394 #TODO - Review this in light of adding SeqRecord objects? 395 396 #TODO - Cope with strides by generating ambiguous locations? 397 if index.step is None or index.step == 1: 398 #Select relevant features, add them with shifted locations 399 if index.start is None: 400 start = 0 401 else: 402 start = index.start 403 if index.stop is None: 404 stop = -1 405 else: 406 stop = index.stop 407 if (start < 0 or stop < 0) and parent_length == 0: 408 raise ValueError, \ 409 "Cannot support negative indices without the sequence length" 410 if start < 0: 411 start = parent_length + start 412 if stop < 0: 413 stop = parent_length + stop + 1 414 #assert str(self.seq)[index] == str(self.seq)[start:stop] 415 for f in self.features: 416 if f.ref or f.ref_db: 417 #TODO - Implement this (with lots of tests)? 418 import warnings 419 warnings.warn("When slicing SeqRecord objects, any " 420 "SeqFeature referencing other sequences (e.g. " 421 "from segmented GenBank records) is ignored.") 422 continue 423 if start <= f.location.nofuzzy_start \ 424 and f.location.nofuzzy_end <= stop: 425 answer.features.append(f._shift(-start)) 426 427 #Slice all the values to match the sliced sequence 428 #(this should also work with strides, even negative strides): 429 for key, value in self.letter_annotations.iteritems(): 430 answer._per_letter_annotations[key] = value[index] 431 432 return answer 433 raise ValueError, "Invalid index"
434
435 - def __iter__(self):
436 """Iterate over the letters in the sequence. 437 438 For example, using Bio.SeqIO to read in a protein FASTA file: 439 440 >>> from Bio import SeqIO 441 >>> record = SeqIO.read(open("Fasta/loveliesbleeding.pro"),"fasta") 442 >>> for amino in record: 443 ... print amino 444 ... if amino == "L" : break 445 X 446 A 447 G 448 L 449 >>> print record.seq[3] 450 L 451 452 This is just a shortcut for iterating over the sequence directly: 453 454 >>> for amino in record.seq: 455 ... print amino 456 ... if amino == "L" : break 457 X 458 A 459 G 460 L 461 >>> print record.seq[3] 462 L 463 464 Note that this does not facilitate iteration together with any 465 per-letter-annotation. However, you can achieve that using the 466 python zip function on the record (or its sequence) and the relevant 467 per-letter-annotation: 468 469 >>> from Bio import SeqIO 470 >>> rec = SeqIO.read(open("Quality/solexa_faked.fastq", "rU"), 471 ... "fastq-solexa") 472 >>> print rec.id, rec.seq 473 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 474 >>> print rec.letter_annotations.keys() 475 ['solexa_quality'] 476 >>> for nuc, qual in zip(rec,rec.letter_annotations["solexa_quality"]): 477 ... if qual > 35: 478 ... print nuc, qual 479 A 40 480 C 39 481 G 38 482 T 37 483 A 36 484 485 You may agree that using zip(rec.seq, ...) is more explicit than using 486 zip(rec, ...) as shown above. 487 """ 488 return iter(self.seq)
489
490 - def __contains__(self, char):
491 """Implements the 'in' keyword, searches the sequence. 492 493 e.g. 494 495 >>> from Bio import SeqIO 496 >>> record = SeqIO.read(open("Fasta/sweetpea.nu"), "fasta") 497 >>> "GAATTC" in record 498 False 499 >>> "AAA" in record 500 True 501 502 This essentially acts as a proxy for using "in" on the sequence: 503 504 >>> "GAATTC" in record.seq 505 False 506 >>> "AAA" in record.seq 507 True 508 509 Note that you can also use Seq objects as the query, 510 511 >>> from Bio.Seq import Seq 512 >>> from Bio.Alphabet import generic_dna 513 >>> Seq("AAA") in record 514 True 515 >>> Seq("AAA", generic_dna) in record 516 True 517 518 See also the Seq object's __contains__ method. 519 """ 520 return char in self.seq
521 522
523 - def __str__(self):
524 """A human readable summary of the record and its annotation (string). 525 526 The python built in function str works by calling the object's ___str__ 527 method. e.g. 528 529 >>> from Bio.Seq import Seq 530 >>> from Bio.SeqRecord import SeqRecord 531 >>> from Bio.Alphabet import IUPAC 532 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 533 ... IUPAC.protein), 534 ... id="YP_025292.1", name="HokC", 535 ... description="toxic membrane protein, small") 536 >>> print str(record) 537 ID: YP_025292.1 538 Name: HokC 539 Description: toxic membrane protein, small 540 Number of features: 0 541 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 542 543 In this example you don't actually need to call str explicity, as the 544 print command does this automatically: 545 546 >>> print record 547 ID: YP_025292.1 548 Name: HokC 549 Description: toxic membrane protein, small 550 Number of features: 0 551 Seq('MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF', IUPACProtein()) 552 553 Note that long sequences are shown truncated. 554 """ 555 lines = [] 556 if self.id : lines.append("ID: %s" % self.id) 557 if self.name : lines.append("Name: %s" % self.name) 558 if self.description : lines.append("Description: %s" % self.description) 559 if self.dbxrefs : lines.append("Database cross-references: " \ 560 + ", ".join(self.dbxrefs)) 561 lines.append("Number of features: %i" % len(self.features)) 562 for a in self.annotations: 563 lines.append("/%s=%s" % (a, str(self.annotations[a]))) 564 if self.letter_annotations: 565 lines.append("Per letter annotation for: " \ 566 + ", ".join(self.letter_annotations.keys())) 567 #Don't want to include the entire sequence, 568 #and showing the alphabet is useful: 569 lines.append(repr(self.seq)) 570 return "\n".join(lines)
571
572 - def __repr__(self):
573 """A concise summary of the record for debugging (string). 574 575 The python built in function repr works by calling the object's ___repr__ 576 method. e.g. 577 578 >>> from Bio.Seq import Seq 579 >>> from Bio.SeqRecord import SeqRecord 580 >>> from Bio.Alphabet import generic_protein 581 >>> rec = SeqRecord(Seq("MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKAT" 582 ... +"GEMKEQTEWHRVVLFGKLAEVASEYLRKGSQVYIEGQLRTRKWTDQ" 583 ... +"SGQDRYTTEVVVNVGGTMQMLGGRQGGGAPAGGNIGGGQPQGGWGQ" 584 ... +"PQQPQGGNQFSGGAQSRPQQSAPAAPSNEPPMDFDDDIPF", 585 ... generic_protein), 586 ... id="NP_418483.1", name="b4059", 587 ... description="ssDNA-binding protein", 588 ... dbxrefs=["ASAP:13298", "GI:16131885", "GeneID:948570"]) 589 >>> print repr(rec) 590 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570']) 591 592 At the python prompt you can also use this shorthand: 593 594 >>> rec 595 SeqRecord(seq=Seq('MASRGVNKVILVGNLGQDPEVRYMPNGGAVANITLATSESWRDKATGEMKEQTE...IPF', ProteinAlphabet()), id='NP_418483.1', name='b4059', description='ssDNA-binding protein', dbxrefs=['ASAP:13298', 'GI:16131885', 'GeneID:948570']) 596 597 Note that long sequences are shown truncated. Also note that any 598 annotations, letter_annotations and features are not shown (as they 599 would lead to a very long string). 600 """ 601 return self.__class__.__name__ \ 602 + "(seq=%s, id=%s, name=%s, description=%s, dbxrefs=%s)" \ 603 % tuple(map(repr, (self.seq, self.id, self.name, 604 self.description, self.dbxrefs)))
605
606 - def format(self, format):
607 r"""Returns the record as a string in the specified file format. 608 609 The format should be a lower case string supported as an output 610 format by Bio.SeqIO, which is used to turn the SeqRecord into a 611 string. e.g. 612 613 >>> from Bio.Seq import Seq 614 >>> from Bio.SeqRecord import SeqRecord 615 >>> from Bio.Alphabet import IUPAC 616 >>> record = SeqRecord(Seq("MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF", 617 ... IUPAC.protein), 618 ... id="YP_025292.1", name="HokC", 619 ... description="toxic membrane protein") 620 >>> record.format("fasta") 621 '>YP_025292.1 toxic membrane protein\nMKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF\n' 622 >>> print record.format("fasta") 623 >YP_025292.1 toxic membrane protein 624 MKQHKAMIVALIVICITAVVAALVTRKDLCEVHIRTGQTEVAVF 625 <BLANKLINE> 626 627 The python print command automatically appends a new line, meaning 628 in this example a blank line is shown. If you look at the string 629 representation you can see there is a trailing new line (shown as 630 slash n) which is important when writing to a file or if 631 concatenating mutliple sequence strings together. 632 633 Note that this method will NOT work on every possible file format 634 supported by Bio.SeqIO (e.g. some are for multiple sequences only). 635 """ 636 #See also the __format__ added for Python 2.6 / 3.0, PEP 3101 637 #See also the Bio.Align.Generic.Alignment class and its format() 638 return self.__format__(format)
639
640 - def __format__(self, format_spec):
641 """Returns the record as a string in the specified file format. 642 643 This method supports the python format() function added in 644 Python 2.6/3.0. The format_spec should be a lower case string 645 supported by Bio.SeqIO as an output file format. See also the 646 SeqRecord's format() method. 647 """ 648 if format_spec: 649 from StringIO import StringIO 650 from Bio import SeqIO 651 handle = StringIO() 652 SeqIO.write([self], handle, format_spec) 653 return handle.getvalue() 654 else: 655 #Follow python convention and default to using __str__ 656 return str(self)
657
658 - def __len__(self):
659 """Returns the length of the sequence. 660 661 For example, using Bio.SeqIO to read in a FASTA nucleotide file: 662 663 >>> from Bio import SeqIO 664 >>> record = SeqIO.read(open("Fasta/sweetpea.nu"),"fasta") 665 >>> len(record) 666 309 667 >>> len(record.seq) 668 309 669 """ 670 return len(self.seq)
671
672 - def __nonzero__(self):
673 """Returns True regardless of the length of the sequence. 674 675 This behaviour is for backwards compatibility, since until the 676 __len__ method was added, a SeqRecord always evaluated as True. 677 678 Note that in comparison, a Seq object will evaluate to False if it 679 has a zero length sequence. 680 681 WARNING: The SeqRecord may in future evaluate to False when its 682 sequence is of zero length (in order to better match the Seq 683 object behaviour)! 684 """ 685 return True
686
687 - def __add__(self, other):
688 """Add another sequence or string to this sequence. 689 690 The other sequence can be a SeqRecord object, a Seq object (or 691 similar, e.g. a MutableSeq) or a plain Python string. If you add 692 a plain string or a Seq (like) object, the new SeqRecord will simply 693 have this appended to the existing data. However, any per letter 694 annotation will be lost: 695 696 >>> from Bio import SeqIO 697 >>> handle = open("Quality/solexa_faked.fastq", "rU") 698 >>> record = SeqIO.read(handle, "fastq-solexa") 699 >>> handle.close() 700 >>> print record.id, record.seq 701 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 702 >>> print record.letter_annotations.keys() 703 ['solexa_quality'] 704 705 >>> new = record + "ACT" 706 >>> print new.id, new.seq 707 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNNACT 708 >>> print new.letter_annotations.keys() 709 [] 710 711 The new record will attempt to combine the annotation, but for any 712 ambiguities (e.g. different names) it defaults to omitting that 713 annotation. 714 715 >>> from Bio import SeqIO 716 >>> handle = open("GenBank/pBAD30.gb") 717 >>> plasmid = SeqIO.read(handle, "gb") 718 >>> handle.close() 719 >>> print plasmid.id, len(plasmid) 720 pBAD30 4923 721 722 Now let's cut the plasmid into two pieces, and join them back up the 723 other way round (i.e. shift the starting point on this plasmid, have 724 a look at the annotated features in the original file to see why this 725 particular split point might make sense): 726 727 >>> left = plasmid[:3765] 728 >>> right = plasmid[3765:] 729 >>> new = right + left 730 >>> print new.id, len(new) 731 pBAD30 4923 732 >>> str(new.seq) == str(right.seq + left.seq) 733 True 734 >>> len(new.features) == len(left.features) + len(right.features) 735 True 736 737 When we add the left and right SeqRecord objects, their annotation 738 is all consistent, so it is all conserved in the new SeqRecord: 739 740 >>> new.id == left.id == right.id == plasmid.id 741 True 742 >>> new.name == left.name == right.name == plasmid.name 743 True 744 >>> new.description == plasmid.description 745 True 746 >>> new.annotations == left.annotations == right.annotations 747 True 748 >>> new.letter_annotations == plasmid.letter_annotations 749 True 750 >>> new.dbxrefs == left.dbxrefs == right.dbxrefs 751 True 752 753 However, we should point out that when we sliced the SeqRecord, 754 any annotations dictionary or dbxrefs list entries were lost. 755 You can explicitly copy them like this: 756 757 >>> new.annotations = plasmid.annotations.copy() 758 >>> new.dbxrefs = plasmid.dbxrefs[:] 759 """ 760 if not isinstance(other, SeqRecord): 761 #Assume it is a string or a Seq. 762 #Note can't transfer any per-letter-annotations 763 return SeqRecord(self.seq + other, 764 id = self.id, name = self.name, 765 description = self.description, 766 features = self.features[:], 767 annotations = self.annotations.copy(), 768 dbxrefs = self.dbxrefs[:]) 769 #Adding two SeqRecord objects... must merge annotation. 770 answer = SeqRecord(self.seq + other.seq, 771 features = self.features[:], 772 dbxrefs = self.dbxrefs[:]) 773 #Will take all the features and all the db cross refs, 774 l = len(self) 775 for f in other.features: 776 answer.features.append(f._shift(l)) 777 del l 778 for ref in other.dbxrefs: 779 if ref not in answer.dbxrefs: 780 answer.append(ref) 781 #Take common id/name/description/annotation 782 if self.id == other.id: 783 answer.id = self.id 784 if self.name == other.name: 785 answer.name = self.name 786 if self.description == other.description: 787 answer.description = self.description 788 for k,v in self.annotations.iteritems(): 789 if k in other.annotations and other.annotations[k] == v: 790 answer.annotations[k] = v 791 #Can append matching per-letter-annotation 792 for k,v in self.letter_annotations.iteritems(): 793 if k in other.letter_annotations: 794 answer.letter_annotations[k] = v + other.letter_annotations[k] 795 return answer
796
797 - def __radd__(self, other):
798 """Add another sequence or string to this sequence (from the left). 799 800 This method handles adding a Seq object (or similar, e.g. MutableSeq) 801 or a plain Python string (on the left) to a SeqRecord (on the right). 802 See the __add__ method for more details, but for example: 803 804 >>> from Bio import SeqIO 805 >>> handle = open("Quality/solexa_faked.fastq", "rU") 806 >>> record = SeqIO.read(handle, "fastq-solexa") 807 >>> handle.close() 808 >>> print record.id, record.seq 809 slxa_0001_1_0001_01 ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 810 >>> print record.letter_annotations.keys() 811 ['solexa_quality'] 812 813 >>> new = "ACT" + record 814 >>> print new.id, new.seq 815 slxa_0001_1_0001_01 ACTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTNNNNNN 816 >>> print new.letter_annotations.keys() 817 [] 818 """ 819 if isinstance(other, SeqRecord): 820 raise RuntimeError("This should have happened via the __add__ of " 821 "the other SeqRecord being added!") 822 #Assume it is a string or a Seq. 823 #Note can't transfer any per-letter-annotations 824 offset = len(other) 825 return SeqRecord(other + self.seq, 826 id = self.id, name = self.name, 827 description = self.description, 828 features = [f._shift(offset) for f in self.features], 829 annotations = self.annotations.copy(), 830 dbxrefs = self.dbxrefs[:])
831
832 -def _test():
833 """Run the Bio.SeqRecord module's doctests (PRIVATE). 834 835 This will try and locate the unit tests directory, and run the doctests 836 from there in order that the relative paths used in the examples work. 837 """ 838 import doctest 839 import os 840 if os.path.isdir(os.path.join("..","Tests")): 841 print "Runing doctests..." 842 cur_dir = os.path.abspath(os.curdir) 843 os.chdir(os.path.join("..","Tests")) 844 doctest.testmod() 845 os.chdir(cur_dir) 846 del cur_dir 847 print "Done" 848 elif os.path.isdir(os.path.join("Tests")) : 849 print "Runing doctests..." 850 cur_dir = os.path.abspath(os.curdir) 851 os.chdir(os.path.join("Tests")) 852 doctest.testmod() 853 os.chdir(cur_dir) 854 del cur_dir 855 print "Done"
856 857 if __name__ == "__main__": 858 _test() 859