1
2
3
4
5
6 """Base classes for Bio.Phylo objects.
7
8 All object representations for phylogenetic trees should derive from these base
9 classes in order to use the common methods defined on them.
10 """
11 __docformat__ = "epytext en"
12
13 import collections
14 import itertools
15 import random
16 import re
17
18 import _sugar
23 """Traverse a tree in breadth-first (level) order."""
24 Q = collections.deque([root])
25 while Q:
26 v = Q.popleft()
27 yield v
28 Q.extend(get_children(v))
29
31 """Traverse a tree in depth-first pre-order (parent before children)."""
32 def dfs(elem):
33 yield elem
34 for v in get_children(elem):
35 for u in dfs(v):
36 yield u
37 for elem in dfs(root):
38 yield elem
39
40 -def _postorder_traverse(root, get_children):
41 """Traverse a tree in depth-first post-order (children before parent)."""
42 def dfs(elem):
43 for v in get_children(elem):
44 for u in dfs(v):
45 yield u
46 yield elem
47 for elem in dfs(root):
48 yield elem
49
51 """Get a flat list of elem's attributes, sorted for consistency."""
52 singles = []
53 lists = []
54
55 for child in sorted(elem.__dict__.itervalues()):
56 if child is None:
57 continue
58 if isinstance(child, list):
59 lists.extend(child)
60 else:
61 singles.append(child)
62 return (x for x in singles + lists
63 if isinstance(x, TreeElement))
64
68 """Match a node to the target object by identity."""
69 def match(node):
70 return (node is target)
71 return match
72
74 """Match a node if it's an instance of the given class."""
75 def match(node):
76 return isinstance(node, target_cls)
77 return match
78
80 """Match a node by specified attribute values.
81
82 'terminal' is a special case: True restricts the search to external (leaf)
83 nodes, False restricts to internal nodes, and None allows all tree elements
84 to be searched, including phyloXML annotations.
85
86 Otherwise, for a tree element to match the specification (i.e. for the
87 function produced by _attribute_matcher to return True when given a tree
88 element), it must have each of the attributes specified by the keys and
89 match each of the corresponding values -- think 'and', not 'or', for
90 multiple keys.
91 """
92 def match(node):
93 if 'terminal' in kwargs:
94
95 kwa_copy = kwargs.copy()
96 pattern = kwa_copy.pop('terminal')
97 if (pattern is not None and
98 (not hasattr(node, 'is_terminal') or
99 node.is_terminal() != pattern)):
100 return False
101 else:
102 kwa_copy = kwargs
103 for key, pattern in kwa_copy.iteritems():
104
105 if not hasattr(node, key):
106 return False
107 target = getattr(node, key)
108 if isinstance(pattern, basestring):
109 return (isinstance(target, basestring) and
110 re.match(pattern+'$', target))
111 if isinstance(pattern, bool):
112 return (pattern == bool(target))
113 if isinstance(pattern, int):
114 return (pattern == target)
115 if pattern is None:
116 return (target is None)
117 raise TypeError('invalid query type: %s' % type(pattern))
118 return True
119 return match
120
122 """Safer attribute lookup -- returns False instead of raising an error."""
123 def match(node):
124 try:
125 return matcher_func(node)
126 except (LookupError, AttributeError, ValueError):
127 return False
128 return match
129
131 """Retrieve a matcher function by passing an arbitrary object.
132
133 i.e. passing a TreeElement such as a Node or Tree instance returns an
134 identity matcher, passing a type such as the PhyloXML.Taxonomy class returns
135 a class matcher, and passing a dictionary returns an attribute matcher.
136
137 The resulting 'match' function returns True when given an object matching
138 the specification (identity, type or attribute values), otherwise False.
139 This is useful for writing functions that search the tree, and probably
140 shouldn't be used directly by the end user.
141 """
142 if isinstance(obj, TreeElement):
143 return _identity_matcher(obj)
144 if isinstance(obj, type):
145 return _class_matcher(obj)
146 if isinstance(obj, dict):
147 return _attribute_matcher(obj)
148 if callable(obj):
149 return _function_matcher(obj)
150 raise ValueError("%s (type %s) is not a valid type for comparison."
151 % (obj, type(obj)))
152
154 if not target:
155 if not kwargs:
156 if require_spec:
157 raise ValueError("you must specify a target object or keyword "
158 "arguments.")
159 return lambda x: True
160 return _attribute_matcher(kwargs)
161 match_obj = _object_matcher(target)
162 if not kwargs:
163 return match_obj
164 match_kwargs = _attribute_matcher(kwargs)
165 return (lambda x: match_obj(x) and match_kwargs(x))
166
171 """Base class for all Bio.Phylo classes."""
172
174 """Show this object's constructor with its primitive arguments."""
175 def pair_as_kwarg_string(key, val):
176 if isinstance(val, basestring):
177 return "%s='%s'" % (key, _sugar.trim_str(unicode(val)))
178 return "%s=%s" % (key, val)
179 s = '%s(%s)' % (self.__class__.__name__,
180 ', '.join(pair_as_kwarg_string(key, val)
181 for key, val in self.__dict__.iteritems()
182 if val is not None and
183 type(val) in (str, int, float, bool, unicode)
184 ))
185 return s.encode('utf-8')
186
187 __str__ = __repr__
188
191 """Methods for Tree- and Clade-based classes.
192
193 This lets Tree and Clade support the same traversal and searching
194 operations without requiring Clade to inherit from Tree, so Clade isn't
195 required to have all of Tree's attributes -- just 'root' (a Clade
196 instance) and 'is_terminal()'.
197 """
198
199
201 """Perform a BFS or DFS traversal through all elements in this tree.
202
203 @return: generator of all elements for which 'filter_func' is True.
204 """
205 order_opts = {'preorder': _preorder_traverse,
206 'postorder': _postorder_traverse,
207 'level': _level_traverse}
208 try:
209 order_func = order_opts[order]
210 except KeyError:
211 raise ValueError("Invalid order '%s'; must be one of: %s"
212 % (order, tuple(order_opts.keys())))
213 if follow_attrs:
214 get_children = _sorted_attrs
215 root = self
216 else:
217 get_children = lambda elem: elem.clades
218 root = self.root
219 return itertools.ifilter(filter_func, order_func(root, get_children))
220
222 """Return the first element found by find_elements(), or None.
223
224 This is also useful for checking whether any matching element exists in
225 the tree.
226 """
227 hits = self.find_elements(*args, **kwargs)
228 try:
229 return hits.next()
230 except StopIteration:
231 return None
232
233 - def find_elements(self, target=None, terminal=None, order='preorder',
234 **kwargs):
235 """Find all tree elements matching the given attributes.
236
237 The arbitrary keyword arguments indicate the attribute name of the
238 sub-element and the value to match: string, integer or boolean. Strings
239 are evaluated as regular expression matches; integers are compared
240 directly for equality, and booleans evaluate the attribute's truth value
241 (True or False) before comparing. To handle nonzero floats, search with
242 a boolean argument, then filter the result manually.
243
244 If no keyword arguments are given, then just the class type is used for
245 matching.
246
247 The result is an iterable through all matching objects, by depth-first
248 search. (Not necessarily the same order as the elements appear in the
249 source file!)
250
251 Example:
252
253 >>> from Bio.Phylo.IO import PhyloXMIO
254 >>> phx = PhyloXMLIO.read('phyloxml_examples.xml')
255 >>> matches = phx.phylogenies[5].find_elements(code='OCTVU')
256 >>> matches.next()
257 Taxonomy(code='OCTVU', scientific_name='Octopus vulgaris')
258
259 @param target:
260 Specifies the characteristics to search for. (The default,
261 TreeElement, matches any standard Bio.Phylo type.)
262 @type target: TreeElement instance, type, dict, or callable
263
264 @param terminal:
265 A boolean value to select for or against terminal nodes (a.k.a. leaf
266 nodes). True searches for only terminal nodes, False excludes
267 terminal nodes, and the default, None, searches both terminal and
268 non-terminal nodes, as well as any tree elements lacking the
269 'is_terminal' method.
270 @type terminal: bool
271
272 @param order:
273 Tree traversal order: 'preorder' (default) is depth-first search,
274 'postorder' is DFS with child nodes preceding parents, and 'level'
275 is breadth-first search.
276 @type order: string ('preorder'|'postorder'|'level')
277 """
278 if terminal is not None:
279 kwargs['terminal'] = terminal
280 is_matching_elem = _combine_matchers(target, kwargs, False)
281 return self._filter_search(is_matching_elem, order, True)
282
283 - def find_clades(self, target=None, terminal=None, order='preorder',
284 **kwargs):
285 """Find each clade containing a matching element.
286
287 That is, find each element as with find_elements(), but return the
288 corresponding clade object.
289 """
290 def match_attrs(elem):
291 orig_clades = elem.__dict__.pop('clades')
292 found = elem.find_any(target, **kwargs)
293 elem.clades = orig_clades
294 return (found is not None)
295 if terminal is None:
296 is_matching_elem = match_attrs
297 else:
298 def is_matching_elem(elem):
299 return ((elem.is_terminal() == terminal) and
300 match_attrs(elem))
301 return self._filter_search(is_matching_elem, order, False)
302
303 - def get_path(self, target=None, **kwargs):
304 """List the clades directly between the root and the given target.
305
306 Returns a list of all clade objects along this path, ending with
307 the given target, but excluding the root clade.
308 """
309
310 path = []
311 match = _combine_matchers(target, kwargs, True)
312 def check_in_path(v):
313 if match(v):
314 path.append(v)
315 return True
316 elif v.is_terminal():
317 return False
318 for child in v:
319 if check_in_path(child):
320 path.append(v)
321 return True
322 return False
323 if not check_in_path(self.root):
324 return None
325 return path[-2::-1]
326
328 """Get a list of all of this tree's nonterminal (internal) nodes."""
329 return list(self.find_clades(terminal=False, order=order))
330
332 """Get a list of all of this tree's terminal (leaf) nodes."""
333 return list(self.find_clades(terminal=True, order=order))
334
335 - def trace(self, start, finish):
336 """List of all clade object between two targets in this tree.
337
338 Excluding start, including finish.
339 """
340 mrca = self.common_ancestor(start, finish)
341 fromstart = mrca.get_path(start)[-2::-1]
342 to = mrca.get_path(finish)
343 return fromstart + [mrca] + to
344
345
346
348 """Most recent common ancestor (clade) of all the given targets.
349
350 Edge cases:
351
352 - If no target is given, returns self.root
353 - If 1 target is given, returns the target
354 - If any target is not found in this tree, raises a ValueError
355 """
356 paths = [self.get_path(t) for t in targets]
357
358 for p, t in zip(paths, targets):
359 if p is None:
360 raise ValueError("target %s is not in this tree" % repr(t))
361 mrca = self.root
362 for level in itertools.izip(*paths):
363 ref = level[0]
364 for other in level[1:]:
365 if ref is not other:
366 break
367 else:
368 mrca = ref
369 if ref is not mrca:
370 break
371 return mrca
372
376
377 - def depths(self, unit_branch_lengths=False):
378 """Create a mapping of tree clades to depths (by branch length).
379
380 @return: dict of {clade: depth}
381 """
382 if unit_branch_lengths:
383 depth_of = lambda c: 1
384 else:
385 depth_of = lambda c: c.branch_length or 0
386 depths = {}
387 def update_depths(node, curr_depth):
388 depths[node] = curr_depth
389 for child in node.clades:
390 new_depth = curr_depth + depth_of(child)
391 update_depths(child, new_depth)
392 update_depths(self.root, 0)
393 return depths
394
395 - def distance(self, target1, target2=None):
396 """Calculate the sum of the branch lengths between two targets.
397
398 If only one target is specified, the other is the root of this tree.
399 """
400 if target2 is None:
401 return sum(n.branch_length for n in self.get_path(target1)
402 if n.branch_length is not None)
403 mrca = self.common_ancestor(target1, target2)
404 return mrca.distance(target1) + mrca.distance(target2)
405
419
421 """MRCA of terminals if they comprise a complete subclade, or False.
422
423 @return: common ancestor if terminals are monophyletic, otherwise False.
424 """
425 target_set = set(terminals)
426 current = self.root
427 while True:
428 if set(current.get_terminals()) == target_set:
429 return current
430
431 for subclade in current.clades:
432 if set(subclade.get_terminals()).issuperset(target_set):
433 current = subclade
434 break
435 else:
436 return False
437
439 """True if target is a descendent of this tree.
440
441 Not required to be a direct descendent.
442 """
443 return (self.get_path(target, **kwargs) is not None)
444
446 """True if all direct descendents are terminal."""
447 if self.root.is_terminal():
448 return False
449 for clade in self.root.clades:
450 if not clade.is_terminal():
451 return False
452 return True
453
458
459
460
461 - def collapse(self, target=None, **kwargs):
462 """Deletes target from the tree, relinking its children to its parent.
463
464 @return: the parent clade.
465 """
466 path = self.get_path(target, **kwargs)
467 if not path:
468 raise ValueError("couldn't collapse %s in this tree"
469 % (target or kwargs))
470 if len(path) == 1:
471 parent = self.root
472 else:
473 parent = path[-2]
474 popped = parent.clades.pop(parent.clades.index(path[-1]))
475 extra_length = popped.branch_length or 0
476 for child in popped:
477 child.branch_length += extra_length
478 parent.clades.extend(popped.clades)
479 return parent
480
482 """Collapse all the descendents of this tree, leaving only terminals.
483
484 To collapse only certain elements, use the collapse method directly in a
485 loop with find_clades:
486
487 >>> for clade in tree.find_clades(branch_length=True, order='level'):
488 >>> if (clade.branch_length < .5 and
489 >>> not clade.is_terminal() and
490 >>> clade is not self.root):
491 >>> tree.collapse(clade)
492
493 Note that level-order traversal helps avoid strange side-effects when
494 modifying the tree while iterating over its clades.
495 """
496 internals = self.find_clades(terminal=False, order='level')
497
498 internals.next()
499 for clade in internals:
500 self.collapse(clade)
501
503 """Sort clades in-place according to the number of terminal nodes.
504
505 Deepest clades are last by default. Use reverse=True to sort clades
506 deepest-to-shallowest.
507 """
508 self.root.clades.sort(key=lambda c: c.count_terminals(),
509 reverse=reverse)
510 for subclade in self.root.clades:
511 subclade.ladderize(reverse=reverse)
512
513 - def prune(self, target=None, **kwargs):
514 """Prunes a terminal clade from the tree.
515
516 If taxon is from a bifurcation, the connecting node will be collapsed
517 and its branch length added to remaining terminal node. This might be no
518 longer a meaningful value.
519
520 @return: parent clade of the pruned target
521 """
522 if 'terminal' in kwargs and kwargs['terminal']:
523 raise ValueError("target must be terminal")
524 path = self.get_path(target, terminal=True, **kwargs)
525 if not path:
526 raise ValueError("can't find a matching target below this root")
527 if len(path) == 1:
528 parent = self.root
529 else:
530 parent = path[-2]
531 parent.clades.remove(path[-1])
532 if len(parent) == 1:
533
534 if parent == self.root:
535
536
537 newroot = parent.clades[0]
538 newroot.branch_length = None
539 parent = self.root = newroot
540 else:
541
542 child = parent.clades[0]
543 if child.branch_length is not None:
544 child.branch_length += (parent.branch_length or 0.0)
545 if len(path) < 3:
546 grandparent = self.root
547 else:
548 grandparent = path[-3]
549
550 index = grandparent.clades.index(parent)
551 grandparent.clades.pop(index)
552 grandparent.clades.insert(index, child)
553 parent = grandparent
554 return parent
555
556 - def split(self, n=2, branch_length=1.0):
557 """Speciation: generate n (default 2) new descendants.
558
559 New clades have the given branch_length and the same name as this
560 clade's root plus an integer suffix (counting from 0).
561 """
562 subtree_cls = type(self.root)
563 base_name = self.root.name or ''
564 for i in range(n):
565 clade = subtree_cls(name=base_name+str(i),
566 branch_length=branch_length)
567 self.root.clades.append(clade)
568
569
570 -class Tree(TreeElement, TreeMixin):
571 """A phylogenetic tree, containing global info for the phylogeny.
572
573 The structure and node-specific data is accessible through the 'root'
574 subtree attached to the Tree instance.
575
576 @param root:
577 The starting node of the tree. If the tree is rooted, this will usually
578 be the root node.
579 @type root: Clade
580
581 @param rooted:
582 Whether or not the tree is rooted. By default, a tree is assumed to be
583 rooted.
584 @type rooted: bool
585
586 @param id: The identifier of the tree, if there is one.
587 @type id: str
588
589 @param name: The name of the tree, in essence a label.
590 @type name: str
591 """
592 - def __init__(self, root=None, rooted=True, id=None, name=None):
597
598 @classmethod
600 """Create a new Tree object given a subtree.
601
602 Keyword arguments are the usual Tree constructor parameters.
603 """
604 return cls(subtree, **kwargs)
605
606 @classmethod
607 - def randomized(cls, taxa, branch_length=1.0, branch_stdev=None):
608 """Create a randomized bifurcating tree given a list of taxa.
609
610 @param taxa: Either an integer specifying the number of taxa to create
611 (automatically named taxon#), or an iterable of taxon names, as
612 strings.
613
614 @return: a tree of the same type as this class.
615 """
616 if isinstance(taxa, int):
617 taxa = ['taxon%s' % (i+1) for i in xrange(taxa)]
618 elif hasattr(taxa, '__iter__'):
619 taxa = list(taxa)
620 else:
621 raise TypeError("taxa argument must be integer (# taxa) or "
622 "iterable of taxon names.")
623 rtree = cls()
624 terminals = [rtree.root]
625 while len(terminals) < len(taxa):
626 newsplit = random.choice(terminals)
627 newterms = newsplit.split(branch_length=branch_length)
628 if branch_stdev:
629
630 for nt in newterms:
631 nt.branch_length = max(0,
632 random.gauss(branch_length, branch_stdev))
633 terminals.remove(newsplit)
634 terminals.extend(newterms)
635
636 random.shuffle(taxa)
637 for node, name in zip(terminals, taxa):
638 node.name = name
639 return rtree
640
641 @property
643 """The first subtree in this tree (not itself)."""
644 return self.root
645
646
647
649 """True if the root of this tree is terminal."""
650 return (not self.root.clades)
651
652
653
670
677
678
679
681 """String representation of the entire tree.
682
683 Serializes each sub-clade recursively using repr() to create a summary
684 of the object structure.
685 """
686 TAB = ' '
687 textlines = []
688 def print_tree(obj, indent):
689 """Recursively serialize sub-elements.
690
691 This closes over textlines and modifies it in-place.
692 """
693 textlines.append(TAB*indent + repr(obj))
694 indent += 1
695 for attr in obj.__dict__:
696 child = getattr(obj, attr)
697 if isinstance(child, TreeElement):
698 print_tree(child, indent)
699 elif isinstance(child, list):
700 for elem in child:
701 if isinstance(elem, TreeElement):
702 print_tree(elem, indent)
703 print_tree(self, 0)
704 return '\n'.join(textlines)
705
706
707 -class Clade(TreeElement, TreeMixin):
708 """A recursively defined subtree.
709
710 @param branch_length:
711 The length of the branch leading to the root node of this subtree.
712 @type branch_length: str
713
714 @param name: The clade's name (a label).
715 @type name: str
716
717 @param clades: Sub-trees rooted directly under this tree's root.
718 @type clades: list
719 """
720 - def __init__(self, branch_length=None, name=None, clades=None):
724
725 @property
727 """Allow TreeMixin methods to traverse subtrees properly."""
728 return self
729
731 """True if this is a terminal (leaf) node."""
732 return (not self.clades)
733
734
735
737 """Get subtrees by index (integer or slice)."""
738 if isinstance(index, int) or isinstance(index, slice):
739 return self.clades[index]
740 ref = self
741 for idx in index:
742 ref = ref[idx]
743 return ref
744
746 """Iterate through this tree's direct subtrees (clades)."""
747 return iter(self.clades)
748
750 """Number of subtrees directy under the root."""
751 return len(self.clades)
752
754 """Boolean value of an instance of this class.
755
756 NB: If this method is not defined, but __len__ is, then the object is
757 considered true if the result of __len__() is nonzero. We want Clade
758 instances to always be considered true.
759 """
760 return True
761
766