1
2
3
4
5
6 """Classes corresponding to phyloXML elements.
7
8 See U{ http://phyloxml.org/ } for the official specification.
9
10 See also Han and Zmasek (2009) doi:10.1186/1471-2105-10-356
11 """
12 __docformat__ = "epytext en"
13
14 import re
15 import warnings
16
17 from Bio import Alphabet
18 from Bio.Align import MultipleSeqAlignment
19 from Bio.Seq import Seq
20 from Bio.SeqFeature import SeqFeature, FeatureLocation
21 from Bio.SeqRecord import SeqRecord
22
23 import BaseTree
24 import _sugar
28 """Warning for non-compliance with the phyloXML specification."""
29 pass
30
33 """Check a string using testfunc, and warn if there's no match."""
34 if text is not None and not testfunc(text):
35 warnings.warn("String %s doesn't match the given regexp" % text,
36 PhyloXMLWarning, stacklevel=2)
37
42 """Base class for all PhyloXML objects."""
43
46 """Root node of the PhyloXML document.
47
48 Contains an arbitrary number of Phylogeny elements, possibly followed by
49 elements from other namespaces.
50
51 @param attributes: (XML namespace definitions)
52 @param phylogenies: list of phylogenetic trees
53 @param other: list of arbitrary non-phyloXML elements, if any
54 """
def __init__(self, attributes, phylogenies=None, other=None):
    """Set up the document root from its attributes and contents.

    A falsy 'phylogenies' or 'other' argument (including the default, None)
    is replaced by a new empty list.
    """
    if not phylogenies:
        phylogenies = []
    if not other:
        other = []
    self.attributes = attributes
    self.phylogenies = phylogenies
    self.other = other
59
61 """Get a phylogeny by index or name."""
62 if isinstance(index, int) or isinstance(index, slice):
63 return self.phylogenies[index]
64 if not isinstance(index, basestring):
65 raise KeyError, "can't use %s as an index" % type(index)
66 for tree in self.phylogenies:
67 if tree.name == index:
68 return tree
69 else:
70 raise KeyError, "no phylogeny found with name " + repr(index)
71
73 """Iterate through the phylogenetic trees in this object."""
74 return iter(self.phylogenies)
75
77 """Number of phylogenetic trees in this object."""
78 return len(self.phylogenies)
79
81 return '%s([%s])' % (self.__class__.__name__,
82 ',\n'.join(map(str, self.phylogenies)))
83
84
85 -class Other(PhyloElement):
86 """Container for non-phyloXML elements in the tree.
87
88 Usually, an Other object will have either a 'value' or a non-empty list
89 of 'children', but not both. This is not enforced here, though.
90
91 @param tag: local tag for the XML node
92 @param namespace: XML namespace for the node -- should not be the default
93 phyloXML namespace.
94 @param attributes: string attributes on the XML node
95 @param value: text contained directly within this XML node
96 @param children: list of child nodes, if any (also Other instances)
97 """
def __init__(self, tag, namespace=None, attributes=None, value=None,
        children=None):
    """Store the tag, namespace, attributes, text and child nodes.

    A falsy 'attributes' argument (including the default, None) becomes a
    new empty dict and a falsy 'children' argument becomes a new empty
    list, so both containers can be used without a None check.
    """
    self.tag = tag
    self.namespace = namespace
    # Normalized like 'children' below; previously this stayed None.
    self.attributes = attributes or {}
    self.value = value
    self.children = children or []
105
107 """Iterate through the children of this object (if any)."""
108 return iter(self.children)
109
110
111 -class Phylogeny(PhyloElement, BaseTree.Tree):
112 """A phylogenetic tree.
113
114 @param root: the root node/clade of this tree
115 @param rooted: True if this tree is rooted
116 @param rerootable: True if this tree is rerootable
117 @param branch_length_unit: unit for branch_length values on clades
118 @type type: str
119
120 @param name: string identifier for this tree, not required to be unique
121 @param id: unique identifier for this tree (type Id)
122 @param description: plain-text description
123 @param date: date for the root node of this tree (type Date)
124 @param confidences: list of Confidence objects for this tree
125 @param clade_relations: list of CladeRelation objects
126 @param sequence_relations: list of SequenceRelation objects
127 @param properties: list of Property objects
128 @param other: list of non-phyloXML elements (type Other)
129 """
def __init__(self, root=None, rooted=True,
        rerootable=None, branch_length_unit=None, type=None,
        name=None, id=None, description=None, date=None,
        confidences=None, clade_relations=None, sequence_relations=None,
        properties=None, other=None):
    """Populate the tree from keyword arguments.

    'rooted' must be a bool (checked with an assertion before anything is
    stored). Each list-valued argument that is falsy (including the
    default, None) is replaced by a new empty list.
    """
    assert isinstance(rooted, bool)

    # Single-valued attributes are stored exactly as given.
    self.root = root
    self.rooted = rooted
    self.rerootable = rerootable
    self.branch_length_unit = branch_length_unit
    self.type = type
    self.name = name
    self.id = id
    self.description = description
    self.date = date

    # Collection attributes always end up as lists.
    list_valued = (
        ('confidences', confidences),
        ('clade_relations', clade_relations),
        ('sequence_relations', sequence_relations),
        ('properties', properties),
        ('other', other),
    )
    for attr_name, items in list_valued:
        setattr(self, attr_name, items or [])
153
154 @classmethod
163
164 @classmethod
167
169 """Create a new PhyloXML object containing just this phylogeny."""
170 return Phyloxml(kwargs, phylogenies=[self])
171
173 """Construct an alignment from the aligned sequences in this tree."""
174 def is_aligned_seq(elem):
175 if isinstance(elem, Sequence) and elem.mol_seq.is_aligned:
176 return True
177 return False
178 seqs = self._filter_search(is_aligned_seq, 'preorder', True)
179 try:
180 first_seq = seqs.next()
181 except StopIteration:
182
183 return MultipleSeqAlignment([])
184 msa = MultipleSeqAlignment([first_seq.to_seqrecord()],
185 first_seq.get_alphabet())
186 msa.extend(seq.to_seqrecord() for seq in seqs)
187 return msa
188
189
191 """Equivalent to self.confidences[0] if there is only 1 value.
192
193 See also: Clade.confidence, Clade.taxonomy
194 """
195 if len(self.confidences) == 0:
196 return None
197 if len(self.confidences) > 1:
198 raise AttributeError("more than 1 confidence value available; "
199 "use Phylogeny.confidences")
200 return self.confidences[0]
201
203 if isinstance(value, float) or isinstance(value, int):
204 value = Confidence(value)
205 elif not isinstance(value, Confidence):
206 raise ValueError("value must be a number or Confidence instance")
207 if len(self.confidences) == 0:
208 self.confidences.append(value)
209 elif len(self.confidences) == 1:
210 self.confidences[0] = value
211 else:
212 raise ValueError("multiple confidence values already exist; "
213 "use Phylogeny.confidences instead")
214
215 confidence = property(_get_confidence, _set_confidence)
216
217
218 -class Clade(PhyloElement, BaseTree.Clade):
219 """Describes a branch of the current phylogenetic tree.
220
221 Used recursively, describes the topology of a phylogenetic tree.
222
223 Both 'color' and 'width' elements should be interpreted by client code as
224 applying to the whole clade, including all descendants, unless overwritten
225 in sub-clades. This module doesn't automatically assign these attributes to
226 sub-clades to achieve this cascade -- and neither should you.
227
228 @param branch_length: parent branch length of this clade
229 @param id_source: link other elements to a clade (on the xml-level)
230
231 @param name: short string label for this clade
232 @param confidences: list of Confidence objects, used to indicate the
233 support for a clade/parent branch.
234 @param width: branch width for this clade (including branch from parent)
235 @param color: color used for graphical display of this clade
236 @param node_id: unique identifier for the root node of this clade
237 @param taxonomies: list of Taxonomy objects
238 @param sequences: list of Sequence objects
239 @param events: describe such events as gene-duplications at the root
240 node/parent branch of this clade
241 @param binary_characters: a BinaryCharacters object
242 @param distributions: list of Distribution objects
243 @param date: a date for the root node of this clade (type Date)
244 @param references: list of Reference objects
245 @param properties: list of Property objects
246 @param clades: list of sub-clades (type Clade)
247 @param other: list of non-phyloXML objects
248 """
249 - def __init__(self,
250
251 branch_length=None, id_source=None,
252
253 name=None, width=None, color=None, node_id=None, events=None,
254 binary_characters=None, date=None,
255
256 confidences=None, taxonomies=None, sequences=None,
257 distributions=None, references=None, properties=None, clades=None,
258 other=None,
259 ):
277
278 @classmethod
286
288 """Create a new phylogeny containing just this clade."""
289 phy = Phylogeny(root=self, date=self.date)
290 phy.__dict__.update(kwargs)
291 return phy
292
293
295 if len(self.confidences) == 0:
296 return None
297 if len(self.confidences) > 1:
298 raise AttributeError("more than 1 confidence value available; "
299 "use Clade.confidences")
300 return self.confidences[0]
301
303 if isinstance(value, float) or isinstance(value, int):
304 value = Confidence(value)
305 elif not isinstance(value, Confidence):
306 raise ValueError("value must be a number or Confidence instance")
307 if len(self.confidences) == 0:
308 self.confidences.append(value)
309 elif len(self.confidences) == 1:
310 self.confidences[0] = value
311 else:
312 raise ValueError("multiple confidence values already exist; "
313 "use Phylogeny.confidences instead")
314
315 confidence = property(_get_confidence, _set_confidence)
316
318 if len(self.taxonomies) == 0:
319 return None
320 if len(self.taxonomies) > 1:
321 raise AttributeError("more than 1 taxonomy value available; "
322 "use Clade.taxonomies")
323 return self.taxonomies[0]
324
326 if not isinstance(value, Taxonomy):
327 raise ValueError("assigned value must be a Taxonomy instance")
328 if len(self.taxonomies) == 0:
329 self.taxonomies.append(value)
330 elif len(self.taxonomies) == 1:
331 self.taxonomies[0] = value
332 else:
333 raise ValueError("multiple taxonomy values already exist; "
334 "use Phylogeny.taxonomies instead")
335
336 taxonomy = property(_get_taxonomy, _set_taxonomy)
337
338
341
343 if arg is None or isinstance(arg, BranchColor):
344 self._color = arg
345 elif isinstance(arg, basestring):
346 if arg in BranchColor.color_names:
347
348 self._color = BranchColor.from_name(arg)
349 elif arg.startswith('#') and len(arg) == 7:
350
351 self._color = BranchColor.from_hex(arg)
352 else:
353 raise ValueError("invalid color string %s" % arg)
354 elif hasattr(arg, '__iter__') and len(arg) == 3:
355
356 self._color = BranchColor(*arg)
357 else:
358 raise ValueError("invalid color value %s" % arg)
359
360 color = property(_get_color, _set_color, doc="Branch color.")
361
366 """Captures the local part in a sequence identifier.
367
368 Example: In 'UniProtKB:P17304', the Accession instance attribute 'value' is
369 'P17304' and the 'source' attribute is 'UniProtKB'.
370 """
374
376 """Show the class name and an identifying attribute."""
377 return '%s:%s' % (self.source, self.value)
378
381 """The annotation of a molecular sequence.
382
383 It is recommended to annotate by using the optional 'ref' attribute (some
384 examples of acceptable values for the ref attribute: 'GO:0008270',
385 'KEGG:Tetrachloroethene degradation', 'EC:1.1.1.1').
386
387 @type ref: str
388 @param source: plain-text source for this annotation
389 @param evidence: describe evidence as free text (e.g. 'experimental')
390 @type type: str
391
392 @param desc: free text description
393 @param confidence: state the type and value of support (type Confidence)
394 @param properties: list of typed and referenced annotations from external
395 resources
396 @type uri: Uri
397 """
398 re_ref = re.compile(r'[a-zA-Z0-9_]+:[a-zA-Z0-9_\.\-\s]+')
399
400 - def __init__(self,
401
402 ref=None, source=None, evidence=None, type=None,
403
404 desc=None, confidence=None, uri=None,
405
406 properties=None):
416
419 """The names and/or counts of binary characters present, gained, and lost
420 at the root of a clade.
421 """
def __init__(self,
        type=None, gained_count=None, lost_count=None, present_count=None,
        absent_count=None,
        gained=None, lost=None, present=None, absent=None):
    """Store the character type, the four counts and the four name lists.

    The counts are kept exactly as given. Each of 'gained', 'lost',
    'present' and 'absent' that is falsy (including the default, None) is
    replaced by a new empty list.
    """
    self.type = type

    # Counts: stored unchanged.
    self.gained_count = gained_count
    self.lost_count = lost_count
    self.present_count = present_count
    self.absent_count = absent_count

    # Character lists: always lists after construction.
    character_lists = (
        ('gained', gained),
        ('lost', lost),
        ('present', present),
        ('absent', absent),
    )
    for attr_name, names in character_lists:
        setattr(self, attr_name, names or [])
437
440 """Indicates the color of a clade when rendered graphically.
441
442 The color should be interpreted by client code (e.g. visualization
443 programs) as applying to the whole clade, unless overwritten by the
444 color(s) of sub-clades.
445
446 Color values must be integers from 0 to 255.
447 """
448
449 color_names = {
450 'red': (255, 0, 0),
451 'r': (255, 0, 0),
452 'yellow': (255, 255, 0),
453 'y': (255, 255, 0),
454 'green': ( 0, 128, 0),
455 'g': ( 0, 128, 0),
456 'cyan': ( 0, 255, 255),
457 'c': ( 0, 255, 255),
458 'blue': ( 0, 0, 255),
459 'b': ( 0, 0, 255),
460 'magenta': (255, 0, 255),
461 'm': (255, 0, 255),
462 'black': ( 0, 0, 0),
463 'k': ( 0, 0, 0),
464 'white': (255, 255, 255),
465 'w': (255, 255, 255),
466
467
468 'maroon': (128, 0, 0),
469 'olive': (128, 128, 0),
470 'lime': ( 0, 255, 0),
471 'aqua': ( 0, 255, 255),
472 'teal': ( 0, 128, 128),
473 'navy': ( 0, 0, 128),
474 'fuchsia': (255, 0, 255),
475 'purple': (128, 0, 128),
476 'silver': (192, 192, 192),
477 'gray': (128, 128, 128),
478
479 'grey': (128, 128, 128),
480 'pink': (255, 192, 203),
481 'salmon': (250, 128, 114),
482 'orange': (255, 165, 0),
483 'gold': (255, 215, 0),
484 'tan': (210, 180, 140),
485 'brown': (165, 42, 42),
486 }
487
496
497 @classmethod
499 """Construct a BranchColor object from a hexadecimal string.
500
501 The string format is the same style used in HTML and CSS, such as
502 '#FF8000' for an RGB value of (255, 128, 0).
503 """
504 assert (isinstance(hexstr, basestring) and
505 hexstr.startswith('#') and
506 len(hexstr) == 7
507 ), "need a 24-bit hexadecimal string, e.g. #000000"
508 def unpack(cc):
509 return int('0x'+cc, base=16)
510 RGB = hexstr[1:3], hexstr[3:5], hexstr[5:]
511 return cls(*map(unpack, RGB))
512
513 @classmethod
515 """Construct a BranchColor object by the color's name."""
516 return cls(*cls.color_names[colorname])
517
519 """Return a 24-bit hexadecimal RGB representation of this color.
520
521 The returned string is suitable for use in HTML/CSS, as a color
522 parameter in matplotlib, and perhaps other situations.
523
524 Example:
525
526 >>> bc = BranchColor(12, 200, 100)
527 >>> bc.to_hex()
528 '#0cc864'
529 """
530 return '#' + hex(
531 self.red * (16**4)
532 + self.green * (16**2)
533 + self.blue)[2:].zfill(6)
534
536 """Return a tuple of RGB values (0 to 255) representing this color.
537
538 Example:
539
540 >>> bc = BranchColor(255, 165, 0)
541 >>> bc.to_rgb()
542 (255, 165, 0)
543 """
544 return (self.red, self.green, self.blue)
545
547 """Preserve the standard RGB order when representing this object."""
548 return ('%s(red=%d, green=%d, blue=%d)'
549 % (self.__class__.__name__, self.red, self.green, self.blue)
550 ).encode('utf-8')
551
553 """Show the color's RGB values."""
554 return "(%d, %d, %d)" % (self.red, self.green, self.blue)
555
558 """Expresses a typed relationship between two clades.
559
560 For example, this could be used to describe multiple parents of a clade.
561
562 @type id_ref_0: str
563 @type id_ref_1: str
564 @type distance: str
565 @type type: str
566
567 @type confidence: Confidence
568 """
569 - def __init__(self, type, id_ref_0, id_ref_1,
570 distance=None, confidence=None):
576
579 """A general purpose confidence element.
580
581 For example, this can be used to express the bootstrap support value of a
582 clade (in which case the 'type' attribute is 'bootstrap').
583
584 @type value: float
585 @type type: str
586 """
587 - def __init__(self, value, type='unknown'):
590
592 return float(self.value)
593
595 return int(self.value)
596
597
598 -class Date(PhyloElement):
599 """A date associated with a clade/node.
600
601 Its value can be numerical by using the 'value' element and/or free text
602 with the 'desc' element (e.g. 'Silurian'). If a numerical value is used, it
603 is recommended to employ the 'unit' attribute.
604
605 @param unit: type of numerical value (e.g. 'mya' for 'million years ago')
606
607 @type value: float
608 @param desc: plain-text description of the date
609 @param minimum: lower bound on the date value
610 @param maximum: upper bound on the date value
611 """
612 - def __init__(self, value=None, unit=None, desc=None,
613 minimum=None, maximum=None):
619
621 """Show the class name and the human-readable date."""
622 if self.unit and self.value is not None:
623 return '%s %s' % (self.value, self.unit)
624 if self.desc is not None:
625 return self.desc
626 return self.__class__.__name__
627
630 """Geographic distribution of the items of a clade (species, sequences).
631
632 Intended for phylogeographic applications.
633
634 The location can be described either by free text in the 'desc' element
635 and/or by the coordinates of one or more 'Points' (similar to the 'Point'
636 element in Google's KML format) or by 'Polygons'.
637 """
def __init__(self, desc=None, points=None, polygons=None):
    """Store the free-text description and the coordinate collections.

    A falsy 'points' or 'polygons' argument (including the default, None)
    is replaced by a new empty list.
    """
    if not points:
        points = []
    if not polygons:
        polygons = []
    self.desc = desc
    self.points = points
    self.polygons = polygons
642
643
class DomainArchitecture(PhyloElement):
    """Domain architecture of a protein.

    @param length: total length of the protein sequence (type int)
    @param domains: list of ProteinDomain objects
    """
    def __init__(self, length=None, domains=None):
        """Store the sequence length and the list of domains.

        A falsy 'domains' argument (including the default, None) becomes a
        new empty list, matching how the other list-valued attributes in
        this module are initialized.
        """
        self.length = length
        # Must be iterable: Sequence.to_seqrecord loops over this attribute
        # whenever a DomainArchitecture is attached, and None would raise
        # TypeError there.
        self.domains = domains or []
653
654
655 -class Events(PhyloElement):
656 """Events at the root node of a clade (e.g. one gene duplication).
657
658 All attributes are set to None by default, but this object can also be
659 treated as a dictionary, in which case None values are treated as missing
660 keys and deleting a key resets that attribute's value back to None.
661 """
662 ok_type = set(('transfer', 'fusion', 'speciation_or_duplication', 'other',
663 'mixed', 'unassigned'))
664
665 - def __init__(self, type=None, duplications=None, speciations=None,
666 losses=None, confidence=None):
673
675 return ((k, v) for k, v in self.__dict__.iteritems() if v is not None)
676
678 return (k for k, v in self.__dict__.iteritems() if v is not None)
679
681 return (v for v in self.__dict__.itervalues() if v is not None)
682
685
688
691
694
696 if not hasattr(self, key):
697 raise KeyError(key)
698 val = getattr(self, key)
699 if val is None:
700 raise KeyError("%s has not been set in this object" % repr(key))
701 return val
702
704 setattr(self, key, val)
705
707 setattr(self, key, None)
708
711
713 return (hasattr(self, key) and getattr(self, key) is not None)
714
715
716 -class Id(PhyloElement):
717 """A general-purpose identifier element.
718
719 Allows indicating the provider (or authority) of an identifier, e.g. NCBI,
720 along with the value itself.
721 """
def __init__(self, value, provider=None):
    """Store the identifier value and, optionally, who issued it."""
    self.value, self.provider = value, provider
725
727 if self.provider is not None:
728 return '%s:%s' % (self.provider, self.value)
729 return self.value
730
731
732 -class MolSeq(PhyloElement):
733 """Store a molecular sequence.
734
735 @param value: the sequence, as a string
736 @param is_aligned: True if mol_seq is aligned (usu. meaning gaps are
737 introduced and all aligned seqs are the same length)
738 """
739 re_value = re.compile(r'[a-zA-Z\.\-\?\*_]+')
740
741 - def __init__(self, value, is_aligned=None):
745
748
749
class Point(PhyloElement):
    """A geographic point: coordinates plus an optional altitude.

    Instances are used by the 'Distribution' element.

    @param geodetic_datum: the geodetic datum (also called 'map datum') the
        coordinates refer to, e.g. 'WGS84' as used by Google's KML.
        (required)
    @param lat: latitude
    @param long: longitude
    @param alt: altitude
    @param alt_unit: unit for the altitude (e.g. 'meter')
    """
    def __init__(self, geodetic_datum, lat, long, alt=None, alt_unit=None):
        """Store every argument unchanged on the instance."""
        # Horizontal position and its reference datum.
        self.geodetic_datum = geodetic_datum
        self.lat, self.long = lat, long
        # Optional vertical position.
        self.alt, self.alt_unit = alt, alt_unit
768
771 """A polygon defined by a list of 'Points' (used by element 'Distribution').
772
773 @param points: list of 3 or more points representing vertices.
774 """
776 self.points = points or []
777
779 return '%s([%s])' % (self.__class__.__name__,
780 ',\n'.join(map(str, self.points)))
781
784 """A typed and referenced property from an external resource.
785
786 Can be attached to 'Phylogeny', 'Clade', and 'Annotation' objects.
787
788 @param ref: reference to an external resource, e.g. "NOAA:depth"
789
790 @param unit: the unit of the property, e.g. "METRIC:m" (optional)
791
792 @param datatype: indicates the type of a property and is limited to
793 xsd-datatypes (e.g. 'xsd:string', 'xsd:boolean', 'xsd:integer',
794 'xsd:decimal', 'xsd:float', 'xsd:double', 'xsd:date', 'xsd:anyURI').
795
796 @param applies_to: indicates the item to which a property applies to (e.g.
797 'node' for the parent node of a clade, 'parent_branch' for the parent
798 branch of a clade, or just 'clade').
799
800 @param id_ref: allows attaching a property specifically to one element
801 (on the xml-level). (optional)
802
803 @type value: str
804 """
805 re_ref = re.compile(r'[a-zA-Z0-9_]+:[a-zA-Z0-9_\.\-\s]+')
806 ok_applies_to = set(('phylogeny', 'clade', 'node', 'annotation',
807 'parent_branch', 'other'))
808 ok_datatype = set(('xsd:string', 'xsd:boolean', 'xsd:decimal', 'xsd:float',
809 'xsd:double', 'xsd:duration', 'xsd:dateTime', 'xsd:time', 'xsd:date',
810 'xsd:gYearMonth', 'xsd:gYear', 'xsd:gMonthDay', 'xsd:gDay',
811 'xsd:gMonth', 'xsd:hexBinary', 'xsd:base64Binary', 'xsd:anyURI',
812 'xsd:normalizedString', 'xsd:token', 'xsd:integer',
813 'xsd:nonPositiveInteger', 'xsd:negativeInteger', 'xsd:long', 'xsd:int',
814 'xsd:short', 'xsd:byte', 'xsd:nonNegativeInteger', 'xsd:unsignedLong',
815 'xsd:unsignedInt', 'xsd:unsignedShort', 'xsd:unsignedByte',
816 'xsd:positiveInteger'))
817
818 - def __init__(self, value, ref, applies_to, datatype,
819 unit=None, id_ref=None):
830
831
class ProteinDomain(PhyloElement):
    """Represents an individual domain in a domain architecture.

    The locations use 0-based indexing, as most Python objects including
    SeqFeature do, rather than the usual biological convention starting at 1.
    This means the start and end attributes can be used directly as slice
    indexes on Seq objects.

    @param start: start of the domain on the sequence, using 0-based indexing
    @type start: non-negative integer
    @param end: end of the domain on the sequence
    @type end: non-negative integer
    @param confidence: can be used to store e.g. E-values. (type float)
    @param id: unique identifier/name
    """

    def __init__(self, value, start, end, confidence=None, id=None):
        """Store the domain value, its location, and optional metadata."""
        self.value = value
        self.start = start
        self.end = end
        self.confidence = confidence
        self.id = id

    @classmethod
    def from_seqfeature(cls, feat):
        """Create a domain from a SeqFeature object.

        The feature's 'id' becomes the domain's 'value', the location's
        nofuzzy_start/nofuzzy_end become 'start'/'end', and the 'confidence'
        qualifier (None if absent) becomes 'confidence'. The domain's own
        'id' attribute is left at its default.
        """
        # Build through cls so that subclasses get instances of themselves
        # rather than of the base ProteinDomain class.
        return cls(feat.id,
                   feat.location.nofuzzy_start,
                   feat.location.nofuzzy_end,
                   confidence=feat.qualifiers.get('confidence'))

    def to_seqfeature(self):
        """Create a SeqFeature spanning this domain.

        The domain's 'value' is used as the feature id. The 'confidence'
        attribute, when present on the instance, is copied into the
        feature's qualifiers under the key 'confidence'.
        """
        feat = SeqFeature(location=FeatureLocation(self.start, self.end),
                          id=self.value)
        # __init__ always sets 'confidence' (possibly to None), so this only
        # skips instances from which the attribute has been removed.
        if hasattr(self, 'confidence'):
            feat.qualifiers['confidence'] = self.confidence
        return feat
868
871 """Literature reference for a clade.
872
873 It is recommended to use the 'doi' attribute instead of the free text
874 'desc' element whenever possible.
875 """
876 re_doi = re.compile(r'[a-zA-Z0-9_\.]+/[a-zA-Z0-9_\.]+')
877
878 - def __init__(self, doi=None, desc=None):
882
885 """A molecular sequence (Protein, DNA, RNA) associated with a node.
886
887 One intended use for 'id_ref' is to link a sequence to a taxonomy (via the
888 taxonomy's 'id_source') in case of multiple sequences and taxonomies per
889 node.
890
891 @param type: type of sequence ('dna', 'rna', or 'protein').
892 @type id_ref: str
893 @type id_source: str
894
895 @param symbol: short symbol of the sequence, e.g. 'ACTM' (max. 10 chars)
896 @type accession: Accession
897 @param name: full name of the sequence, e.g. 'muscle Actin'
898 @param location: location of a sequence on a genome/chromosome.
899 @type mol_seq: MolSeq
900 @type uri: Uri
901 @param annotations: list of Annotation objects
902 @param domain_architecture: protein domains on this sequence (type
903 DomainArchitecture)
904 @param other: list of non-phyloXML elements (type Other)
905 """
906 alphabets = {'dna': Alphabet.generic_dna,
907 'rna': Alphabet.generic_rna,
908 'protein': Alphabet.generic_protein}
909 re_symbol = re.compile(r'\S{1,10}')
910
911 - def __init__(self,
912
913 type=None, id_ref=None, id_source=None,
914
915 symbol=None, accession=None, name=None, location=None,
916 mol_seq=None, uri=None, domain_architecture=None,
917
918 annotations=None, other=None,
919 ):
934
935 @classmethod
937 """Create a new PhyloXML Sequence from a SeqRecord object."""
938 if is_aligned == None:
939 is_aligned = isinstance(record.seq.alphabet, Alphabet.Gapped)
940 params = {
941 'accession': Accession(record.id, ''),
942 'symbol': record.name,
943 'name': record.description,
944 'mol_seq': MolSeq(str(record.seq), is_aligned),
945 }
946 if isinstance(record.seq.alphabet, Alphabet.DNAAlphabet):
947 params['type'] = 'dna'
948 elif isinstance(record.seq.alphabet, Alphabet.RNAAlphabet):
949 params['type'] = 'rna'
950 elif isinstance(record.seq.alphabet, Alphabet.ProteinAlphabet):
951 params['type'] = 'protein'
952
953
954 for key in ('id_ref', 'id_source', 'location'):
955 if key in record.annotations:
956 params[key] = record.annotations[key]
957 if isinstance(record.annotations.get('uri'), dict):
958 params['uri'] = Uri(**record.annotations['uri'])
959
960 if record.annotations.get('annotations'):
961 params['annotations'] = []
962 for annot in record.annotations['annotations']:
963 ann_args = {}
964 for key in ('ref', 'source', 'evidence', 'type', 'desc'):
965 if key in annot:
966 ann_args[key] = annot[key]
967 if isinstance(annot.get('confidence'), list):
968 ann_args['confidence'] = Confidence(
969 *annot['confidence'])
970 if isinstance(annot.get('properties'), list):
971 ann_args['properties'] = [Property(**prop)
972 for prop in annot['properties']
973 if isinstance(prop, dict)]
974 params['annotations'].append(Annotation(**ann_args))
975
976
977 if record.features:
978 params['domain_architecture'] = DomainArchitecture(
979 length=len(record.seq),
980 domains=[ProteinDomain.from_seqfeature(feat)
981 for feat in record.features])
982
983 return Sequence(**params)
984
986 """Create a SeqRecord object from this Sequence instance.
987
988 The seqrecord.annotations dictionary is packed like so::
989
990 { # Sequence attributes with no SeqRecord equivalent:
991 'id_ref': self.id_ref,
992 'id_source': self.id_source,
993 'location': self.location,
994 'uri': { 'value': self.uri.value,
995 'desc': self.uri.desc,
996 'type': self.uri.type },
997 # Sequence.annotations attribute (list of Annotations)
998 'annotations': [{ 'ref': ann.ref,
999 'source': ann.source,
1000 'evidence': ann.evidence,
1001 'type': ann.type,
1002 'confidence': [ ann.confidence.value,
1003 ann.confidence.type ],
1004 'properties': [{ 'value': prop.value,
1005 'ref': prop.ref,
1006 'applies_to': prop.applies_to,
1007 'datatype': prop.datatype,
1008 'unit': prop.unit,
1009 'id_ref': prop.id_ref }
1010 for prop in ann.properties],
1011 } for ann in self.annotations],
1012 }
1013 """
1014 def clean_dict(dct):
1015 """Remove None-valued items from a dictionary."""
1016 return dict((key, val) for key, val in dct.iteritems()
1017 if val is not None)
1018
1019 seqrec = SeqRecord(Seq(self.mol_seq.value, self.get_alphabet()),
1020 **clean_dict({
1021 'id': str(self.accession),
1022 'name': self.symbol,
1023 'description': self.name,
1024
1025 }))
1026 if self.domain_architecture:
1027 seqrec.features = [dom.to_seqfeature()
1028 for dom in self.domain_architecture.domains]
1029
1030 seqrec.annotations = clean_dict({
1031 'id_ref': self.id_ref,
1032 'id_source': self.id_source,
1033 'location': self.location,
1034 'uri': self.uri and clean_dict({
1035 'value': self.uri.value,
1036 'desc': self.uri.desc,
1037 'type': self.uri.type,
1038 }),
1039 'annotations': self.annotations and [
1040 clean_dict({
1041 'ref': ann.ref,
1042 'source': ann.source,
1043 'evidence': ann.evidence,
1044 'type': ann.type,
1045 'confidence': ann.confidence and [
1046 ann.confidence.value,
1047 ann.confidence.type],
1048 'properties': [clean_dict({
1049 'value': prop.value,
1050 'ref': prop.ref,
1051 'applies_to': prop.applies_to,
1052 'datatype': prop.datatype,
1053 'unit': prop.unit,
1054 'id_ref': prop.id_ref })
1055 for prop in ann.properties],
1056 }) for ann in self.annotations],
1057 })
1058 return seqrec
1059
1065
1068 """Express a typed relationship between two sequences.
1069
1070 For example, this could be used to describe an orthology (in which case
1071 attribute 'type' is 'orthology').
1072
1073 @param id_ref_0: first sequence reference identifier
1074 @param id_ref_1: second sequence reference identifier
1075 @param distance: distance between the two sequences (type float)
1076 @param type: describe the type of relationship
1077
1078 @type confidence: Confidence
1079 """
1080 ok_type = set(('orthology', 'one_to_one_orthology', 'super_orthology',
1081 'paralogy', 'ultra_paralogy', 'xenology', 'unknown', 'other'))
1082
1083 - def __init__(self, type, id_ref_0, id_ref_1,
1084 distance=None, confidence=None):
1091
1094 """Describe taxonomic information for a clade.
1095
1096 @param id_source: link other elements to a taxonomy (on the XML level)
1097
1098 @param id: unique identifier of a taxon, e.g. Id('6500',
1099 provider='ncbi_taxonomy') for the California sea hare
1100 @param code: store UniProt/Swiss-Prot style organism codes, e.g. 'APLCA'
1101 for the California sea hare 'Aplysia californica' (restricted string)
1102 @param scientific_name: the standard scientific name for this organism,
1103 e.g. 'Aplysia californica' for the California sea hare
1104 @param authority: keep the authority, such as 'J. G. Cooper, 1863',
1105 associated with the 'scientific_name'
1106 @param common_names: list of common names for this organism
1107 @param synonyms: ???
1108 @param rank: taxonomic rank (restricted string)
1109 @type uri: Uri
1110 @param other: list of non-phyloXML elements (type Other)
1111 """
1112 re_code = re.compile(r'[a-zA-Z0-9_]{2,10}')
1113 ok_rank = set(('domain', 'kingdom', 'subkingdom', 'branch', 'infrakingdom',
1114 'superphylum', 'phylum', 'subphylum', 'infraphylum', 'microphylum',
1115 'superdivision', 'division', 'subdivision', 'infradivision',
1116 'superclass', 'class', 'subclass', 'infraclass', 'superlegion',
1117 'legion', 'sublegion', 'infralegion', 'supercohort', 'cohort',
1118 'subcohort', 'infracohort', 'superorder', 'order', 'suborder',
1119 'superfamily', 'family', 'subfamily', 'supertribe', 'tribe', 'subtribe',
1120 'infratribe', 'genus', 'subgenus', 'superspecies', 'species',
1121 'subspecies', 'variety', 'subvariety', 'form', 'subform', 'cultivar',
1122 'unknown', 'other'))
1123
1124 - def __init__(self,
1125
1126 id_source=None,
1127
1128 id=None, code=None, scientific_name=None, authority=None,
1129 rank=None, uri=None,
1130
1131 common_names=None, synonyms=None, other=None,
1132 ):
1145
1147 """Show the class name and an identifying attribute."""
1148 if self.code is not None:
1149 return self.code
1150 if self.scientific_name is not None:
1151 return self.scientific_name
1152 if self.rank is not None:
1153 return self.rank
1154 if self.id is not None:
1155 return str(self.id)
1156 return self.__class__.__name__
1157
1158
1159 -class Uri(PhyloElement):
1160 """A uniform resource identifier.
1161
1162 In general, this is expected to be an URL (for example, to link to an image
1163 on a website, in which case the 'type' attribute might be 'image' and 'desc'
1164 might be 'image of a California sea hare').
1165 """
1166 - def __init__(self, value, desc=None, type=None):
1170
1172 if self.value:
1173 return self.value
1174 return repr(self)
1175