Package Bio :: Package AlignAce :: Module Motif
[hide private]
[frames] | no frames]

Source Code for Module Bio.AlignAce.Motif

  1  # Copyright 2003 by Bartek Wilczynski.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5  """ 
  6  Implementation of sequence motifs. 
  7   
  8  Changes: 
  9  9.2007 (BW) : added the to_faste() and .weblogo() methods allowing to use the Berkeley weblogo server at http://weblogo.berkeley.edu/ 
 10  """ 
 11   
 12  from Bio.SubsMat import FreqTable 
 13   
14 -class Motif(object):
15 """ 16 A class representing sequence motifs. 17 """
18 - def __init__(self):
19 self.instances = [] 20 self.score = 0.0 21 self.mask = [] 22 self._pwm_is_current = 0 23 self._pwm = [] 24 self.alphabet=None 25 self.length=None
26
27 - def _check_length(self, len):
28 if self.length==None: 29 self.length = len 30 elif self.length != len: 31 raise ValueError("You can't change the length of the motif")
32
33 - def _check_alphabet(self,alphabet):
34 if self.alphabet==None: 35 self.alphabet=alphabet 36 elif self.alphabet != alphabet: 37 raise ValueError("Wrong Alphabet")
38
39 - def add_instance(self,instance):
40 """ 41 adds new instance to the motif 42 """ 43 self._check_alphabet(instance.alphabet) 44 self._check_length(len(instance)) 45 self.instances.append(instance) 46 self._pwm_is_current = False
47
48 - def set_mask(self,mask):
49 """ 50 sets the mask for the motif 51 52 The mask should be a string containing asterisks in the position of significant columns and spaces in other columns 53 """ 54 self._check_length(len(mask)) 55 self.mask=[] 56 for char in mask: 57 if char=="*": 58 self.mask.append(1) 59 elif char==" ": 60 self.mask.append(0) 61 else: 62 raise ValueError("Mask should contain only '*' or ' ' and not a '%s'"%char)
63
64 - def pwm(self):
65 """ 66 returns the PWM computed for the set of instances 67 """ 68 69 if self._pwm_is_current: 70 return self._pwm 71 #we need to compute new pwm 72 self._pwm = [] 73 for i in xrange(len(self.mask)): 74 dict = {} 75 #filling the dict with 0's 76 for letter in self.alphabet.letters: 77 dict[letter]=0 78 #counting the occurences of letters in instances 79 for seq in self.instances: 80 dict[seq[i]]=dict[seq[i]]+1 81 self._pwm.append(FreqTable.FreqTable(dict,FreqTable.COUNT,self.alphabet)) 82 self._pwm_is_current=True 83 return self._pwm
84
85 - def search_instances(self,sequence):
86 """ 87 a generator function, returning found positions of instances of the motif in a given sequence 88 """ 89 for pos in xrange(0,len(sequence)-self.length+1): 90 for instance in self.instances: 91 if instance.tostring()==sequence[pos:pos+self.length].tostring(): 92 yield(pos,instance) 93 break # no other instance will fit (we don't want to return multiple hits)
94
95 - def score_hit(self,sequence,position,normalized=1,masked=0):
96 """ 97 give the pwm score for a given position 98 """ 99 score = 0.0 100 for pos in xrange(self.length): 101 if not masked or self.mask[pos]: 102 score += self.pwm()[pos][sequence[position+pos]] 103 if normalized: 104 if not masked: 105 score/=self.length 106 else: 107 score/=len(filter(lambda x: x, self.mask)) 108 return score
109
110 - def search_pwm(self,sequence,threshold=0.0,normalized=1,masked=1):
111 """ 112 a generator function, returning found hits in a given sequence with the pwm score higher than the threshold 113 """ 114 115 for pos in xrange(0,len(sequence)-self.length+1): 116 score = self.score_hit(sequence,pos,normalized,masked) 117 if score > threshold: 118 yield (pos,score)
119 120
121 - def sim(self, motif, masked = 0):
122 """ 123 return the similarity score for the given motif against self. 124 125 We use the Pearson's correlation of the respective probabilities. 126 If the motifs have different length or mask raise the ValueError. 127 """ 128 129 from math import sqrt 130 131 if self.alphabet != motif.alphabet: 132 raise ValueError("Wrong alphabet") 133 134 if self.length != motif.length: 135 raise ValueError("Wrong length") 136 137 if masked and self.mask!=motif.mask: 138 raise ValueError("Wrong mask") 139 140 sxx = 0 # \sum x^2 141 sxy = 0 # \sum x \cdot y 142 sx = 0 # \sum x 143 sy = 0 # \sum y 144 syy = 0 # \sum x^2 145 146 for pos in xrange(self.length): 147 if not masked or self.mask: 148 for l in self.alphabet.letters: 149 xi = self.pwm()[pos][l] 150 yi = motif.pwm()[pos][l] 151 sx = sx + xi 152 sy = sy + yi 153 sxx = sxx + xi * xi 154 syy = syy + yi * yi 155 sxy = sxy + xi * yi 156 157 if masked: 158 norm = len(filter(lambda x: x,self.mask)) 159 else: 160 norm = self.length 161 162 norm *= len(self.alphabet.letters) 163 s1 = (sxy - sx*sy*1.0/norm) 164 s2 = (sxx - sx*sx*1.0/norm)*(syy- sy*sy*1.0/norm) 165 166 return s1/sqrt(s2)
167
168 - def read(self,stream):
169 """ 170 reads the motif from the stream 171 172 the self.alphabet variable must be set before 173 """ 174 from Bio.Seq import Seq 175 while 1: 176 ln = stream.readline() 177 if "*" in ln: 178 self.set_mask(ln.strip("\n\c")) 179 break 180 self.add_instance(Seq(ln.strip(),self.alphabet))
181
182 - def __str__(self):
183 """ 184 string representation of motif 185 """ 186 str = "" 187 for inst in self.instances: 188 str = str + inst.tostring() + "\n" 189 190 for i in xrange(self.length): 191 if self.mask[i]: 192 str = str + "*" 193 else: 194 str = str + " " 195 str = str + "\n" 196 197 return str
198
199 - def write(self,stream):
200 """ 201 writes the motif to the stream 202 """ 203 204 stream.write(self.__str__())
205 206 207
208 - def to_fasta(self):
209 """ 210 FASTA representation of motif 211 """ 212 str = "" 213 for i,inst in enumerate(self.instances): 214 str = str + "> instance %d\n"%i + inst.tostring() + "\n" 215 216 return str
217
218 - def weblogo(self,fname,format="PNG",**kwds):
219 """ 220 uses the Berkeley weblogo service to download and save a weblogo of itself 221 222 requires an internet connection. 223 The parameters from **kwds are passed directly to the weblogo server. 224 """ 225 import urllib 226 import urllib2 227 #import Image 228 al= self.to_fasta() 229 230 url = 'http://weblogo.berkeley.edu/logo.cgi' 231 values = {'sequence' : al, 232 'format' : format, 233 'logowidth' : '18', 234 'logoheight' : '5', 235 'logounits' : 'cm', 236 'kind' : 'AUTO', 237 'firstnum' : "1", 238 'command' : 'Create Logo', 239 'smallsamplecorrection' : "on", 240 'symbolsperline' : 32, 241 'res' : '96', 242 'res_units' : 'ppi', 243 'antialias' : 'on', 244 'title' : '', 245 'barbits' : '', 246 'xaxis': 'on', 247 'xaxis_label' : '', 248 'yaxis': 'on', 249 'yaxis_label' : '', 250 'showends' : 'on', 251 'shrink' : '0.5', 252 'fineprint' : 'on', 253 'ticbits' : '1', 254 'colorscheme' : 'DEFAULT', 255 'color1' : 'green', 256 'color2' : 'blue', 257 'color3' : 'red', 258 'color4' : 'black', 259 'color5' : 'purple', 260 'color6' : 'orange', 261 'color1' : 'black', 262 } 263 for k,v in kwds.items(): 264 values[k]=str(v) 265 266 data = urllib.urlencode(values) 267 req = urllib2.Request(url, data) 268 response = urllib2.urlopen(req) 269 f=open(fname,"w") 270 im=response.read() 271 272 f.write(im) 273 f.close()
274