
Source Code for Module Bio.Prosite.Pattern

# Copyright 2000 by Andrew Dalke.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

# The Prosite patterns are defined at http://www.expasy.ch/txt/prosuser.txt
#
# The PA (PAttern) lines contain the definition of a PROSITE pattern.  The
# patterns are described using the following conventions:
#
#    -  The standard IUPAC one-letter codes for the amino acids are used.
#    -  The symbol `x' is used for a position where any amino acid is accepted.
#    -  Ambiguities are indicated by listing the acceptable amino acids for a
#       given position, between square brackets `[ ]'.  For example: [ALT]
#       stands for Ala or Leu or Thr.
#    -  Ambiguities are also indicated by listing, between a pair of curly
#       brackets `{ }', the amino acids that are not accepted at a given
#       position.  For example: {AM} stands for any amino acid except Ala and
#       Met.
#    -  Each element in a pattern is separated from its neighbor by a `-'.
#    -  Repetition of an element of the pattern can be indicated by following
#       that element with a numerical value or a numerical range between
#       parentheses.  Examples: x(3) corresponds to x-x-x; x(2,4) corresponds
#       to x-x or x-x-x or x-x-x-x.
#    -  When a pattern is restricted to either the N- or C-terminal of a
#       sequence, that pattern either starts with a `<' symbol or respectively
#       ends with a `>' symbol.
#    -  A period ends the pattern.
#
# That boils down to doing these conversions:
#
#   []  ->  []
#   {}  ->  [^ ]
#   -   ->  (removed)
#   ()  ->  {}
#   <   ->  ^
#   >   ->  $
#   x   ->  .
#   .   ->  (removed)
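
# For example (an illustrative conversion, worked out by hand from the rules
# above; it is not taken from the PROSITE documentation):
#
#   Prosite pattern:      <A-x-[ST](2)-{V}.
#   regular expression:   ^A.[ST]{2}[^V]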

# Note:
#  [G>] is a valid Prosite pattern, equivalent to "([G]|$)"

# I assume then that
#  [>G] is equivalent to "(^|[G])"
# It is conceivable that [G>]-G-G is valid, meaning a "G" at the end
# of the sequence or followed by two more Gs.  I did not implement
# this.  I haven't gotten an answer to my query on either of these two
# non-documented possibilities.
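
# An illustrative consequence (my reading of the code below, not a documented
# behaviour): the "[G>]" and "[<...]" forms are what the special cases in
# prosite_to_re() and prosite_to_grouped_re() handle, e.g.
#
#   prosite_to_re("C-[G>].")    ->  'C(?:[G]|$)'
#   prosite_to_re("[<A]-x-G.")  ->  '(?:^|[A]).G'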

import string, re
from Bio import Seq, Alphabet


# Syntactic conversion to two types of regular expressions

_prosite_trans = string.maketrans("abcdefghijklmnopqrstuvwxyzX}()<>",
                                  "ABCDEFGHIJKLMNOPQRSTUVW.YZ.]{}^$")

# This does not verify that the pattern is correct - invalid patterns
# can be converted!
def prosite_to_re(pattern):
    """convert a valid Prosite pattern into an re string"""
    flg = (pattern[:2] == "[<")
    s = pattern.replace("{", "[^")
    s = s.translate(_prosite_trans, "-.")
    # special case "[<" and ">]", if they exist
    if flg:
        i = s.index("]")
        s = "(?:^|[" + s[2:i] + "])" + s[i+1:]
    if s[-2:] == "$]":
        i = s.rindex("[")
        s = s[:i] + "(?:" + s[i:-2] + "]|$)"
    elif s[-3:] == "$]$":
        i = s.rindex("[")
        s = s[:i] + "(?:" + s[i:-3] + "]|$)$"
    return s
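
# A minimal illustration of prosite_to_re; the expected output below was
# worked out by hand from the translation table above, so treat it as a
# sketch rather than a tested doctest:
#
#   >>> prosite_to_re("[AC]-x-V-x(4)-{ED}.")
#   '[AC].V.{4}[^ED]'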


# This does not verify the pattern is correct - invalid patterns can
# be converted!
def prosite_to_grouped_re(pattern):
    """convert a valid Prosite pattern into an re with groups for each term"""
    flg = (pattern[:2] == "[<")
    s = pattern.replace("{", "[^")
    # Don't delete the "-" characters: use them to place the ()s
    s = s.translate(_prosite_trans, ".")

    # Get the [< and >] terms correct
    if flg:
        i = s.index("]")
        s = "(?:^|[" + s[2:i] + "])" + s[i+1:]
    if s[-2:] == "$]":
        i = s.rindex("[")
        s = s[:i] + "(?:" + s[i:-2] + "]|$)"
    if s[-3:] == "$]$":
        i = s.rindex("[")
        s = s[:i] + "(?:" + s[i:-3] + "]|$)$"

    # Watch out for unescaped < and > terms
    if s[:1] == "^":
        s = "^(" + s[1:]
    else:
        s = "(" + s
    if s[-1:] == "$":
        s = s[:-1] + ")$"
    else:
        s = s + ")"

    return s.replace("-", ")(")
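
# Illustrative example for comparison with prosite_to_re -- each "-"-separated
# term gets its own group (expected output worked out by hand, not a tested
# doctest):
#
#   >>> prosite_to_grouped_re("[AC]-x-V-x(4)-{ED}.")
#   '([AC])(.)(V)(.{4})([^ED])'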


# Both the Prosite pattern and match result act like sequences.
class PrositeAlphabet(Alphabet.Alphabet):
    pass
prosite_alphabet = PrositeAlphabet()

def compile(pattern):
    if not verify_pattern(pattern):
        raise TypeError("not a legal prosite pattern")
    return Prosite(pattern = pattern)
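
# Typical use -- an illustrative sketch; the expected values were worked out
# by hand from the conversion functions above:
#
#   >>> p = compile("R-G-D.")
#   >>> p.re.pattern
#   'RGD'
#   >>> p.grouped_re.pattern
#   '(R)(G)(D)'
#   >>> compile("R-G-D")    # no trailing "." -- raises TypeError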

class Prosite:
    alphabet = prosite_alphabet

    # Don't like having two different types of input - not very pythonic
    # However, it is faster since I can assume the input has already been
    # verified (if it's a pattern).
    def __init__(self, pattern = None, data = None):
        assert (pattern is None and data is not None) ^ \
               (pattern is not None and data is None), \
               "one and only one of pattern and data can have a value"
        if pattern is not None:
            self.pattern = pattern
        if data is not None:
            self.data = data

    def __repr__(self):
        return "Prosite(%s)" % repr(str(self))
    def __str__(self):
        return '-'.join(map(str, self.data)) + "."
    def __len__(self): return len(self.data)
    def __getitem__(self, i): return self.data[i]
    def __getslice__(self, i, j):
        i = max(i, 0); j = max(j, 0)
        return Prosite(data = self.data[i:j])
    def __getattr__(self, name):
        # Lazy creation of these elements / cache results
        if name == "re":
            self.re = re.compile(prosite_to_re(self.pattern))
            return self.re
        elif name == "grouped_re":
            self.grouped_re = re.compile(prosite_to_grouped_re(self.pattern))
            return self.grouped_re
        elif name == "data":
            self.data = find_terms(self.pattern)
            return self.data
        elif name == "pattern":
            self.pattern = str(self)
            return self.pattern
        raise AttributeError(name)

    def tostring(self):
        return str(self)

    def search(self, seq, pos=0, endpos=None):
        if endpos is not None:
            m = self.grouped_re.search(buffer(seq.data), pos, endpos)
        else:
            m = self.grouped_re.search(buffer(seq.data), pos)
        if m is None:
            return None
        return PrositeMatch(self, seq, m)
    def match(self, seq, pos=0, endpos=None):
        if endpos is not None:
            m = self.grouped_re.match(buffer(seq.data), pos, endpos)
        else:
            m = self.grouped_re.match(buffer(seq.data), pos)
        if m is None:
            return None
        return PrositeMatch(self, seq, m)

    # I was thinking about adding sub, subn, findall, etc., but either
    # you just want the string (in which case, use the ".re") or
    # you could be changing to a different alphabet (eg, T->U).


# Elements of a Prosite pattern
class PrositeTerm:
    def __init__(self, letters, ignore, is_begin, is_end,
                 min_count, max_count, can_begin, can_end):
        self.letters = letters
        self.ignore = ignore
        self.is_begin = is_begin
        self.is_end = is_end
        self.min_count = min_count
        self.max_count = max_count
        self.can_begin = can_begin
        self.can_end = can_end
    def copy(self):
        return PrositeTerm(self.letters, self.ignore, self.is_begin,
                           self.is_end, self.min_count, self.max_count,
                           self.can_begin, self.can_end)
    def __str__(self):
        # Convert the term back into Prosite form
        s = self.base_str()

        if self.min_count == self.max_count:
            if self.min_count == 1:
                pass
            else:
                s = s + "(%d)" % self.min_count
        else:
            s = s + "(%d,%d)" % (self.min_count, self.max_count)
        if self.is_end:
            s = s + ">"
        return s

    def base_str(self):
        # Convert the term back into Prosite form, without the repeat
        # count fields.

        if self.is_begin:
            s = "<"
        else:
            s = ""
        if self.ignore:
            s = s + "{" + self.letters + "}"
        elif len(self.letters) == 1 and \
             (not self.can_begin and not self.can_end):
            s = s + self.letters
        else:
            s = s + "["
            if self.can_begin:
                s = s + "<"
            s = s + self.letters
            if self.can_end:
                s = s + ">"
            s = s + "]"
        return s
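
    # For illustration (example values of my own choosing): a term built as
    # PrositeTerm("AP", 0, 0, 0, 2, 3, 0, 0) prints back as "[AP](2,3)", and
    # PrositeTerm("V", 1, 0, 1, 1, 1, 0, 0) prints as "{V}>".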

# Results of a Prosite match.  Wrapper to the re.MatchObj, but returns
# Seq objects instead of strings.  And lookee - it implements the Seq
# interface too!
class PrositeMatch:
    def __init__(self, prosite, seq, match):
        self.prosite = prosite
        self.seq = seq
        self.match = match
        self.pos = match.pos
        self.endpos = match.endpos

        # for Seq.Seq initialization
        self.data = match.group(0)
        self.alphabet = seq.alphabet

    def __repr__(self):
        # XXX this isn't the right way
        return "<PrositeMatch instance at %x>" % id(self)
    def __str__(self):
        return str(self.data)
    def __len__(self): return len(self.data)
    def __getitem__(self, i): return self.data[i]
    def __getslice__(self, i, j):
        i = max(i, 0); j = max(j, 0)
        return Seq.Seq(self.data[i:j], self.alphabet)

    def mapping(self):
        """return a list of numbers mapping to items of the original pattern

        For example, if the Prosite pattern is "[AP](2)-D." matched against
        "PAD", then the mapping is [0, 0, 1], meaning the first character
        of the match ("P") comes from term 0 of the pattern ("[AP](2)"), as
        does the second letter ("A").  The third letter ("D") maps to term 1
        of the pattern.
        """

        vals = []
        i = 0
        start = self.start(0)
        try:
            while 1:
                end = self.match.end(i+1)
                while start < end:
                    vals.append(i)
                    start = start + 1
                i = i + 1
        except IndexError:
            pass
        return vals

    def mapped_pattern(self):
        """returns the specific Prosite pattern used to find this sequence

        >>> p = compile("[AP](2,3)-D.")
        >>> m = p.search(Seq.Seq("PAD"))
        >>> mapping = m.mapping()
        >>> mapped = m.mapped_pattern()
        >>> print str(m[1]), str(p[mapping[1]]), str(mapped[1])
        A [AP](2,3) [AP]
        >>> print str(mapped)
        [AP]-[AP]-D.
        >>>

        Note that the original term includes the count, while the
        mapped pattern does the expansion.

        """
        return pattern_mapping(self.prosite, self.mapping())

    def start(self, g=0):
        return self.match.start(g)
    def end(self, g=0):
        return self.match.end(g)
    def span(self, g):
        return self.match.span(g)
    def groups(self, default=None):
        result = []
        alphabet = self.alphabet
        for g in self.match.groups(default):
            result.append( Seq.Seq(g, alphabet) )
        return tuple(result)
    def group(self, *groups):
        result = self.match.group(*groups)
        if not isinstance(result, tuple):
            # zero or one group was requested, so a single string came back
            return Seq.Seq(result, self.alphabet)
        retval = []
        for x in result:
            retval.append(Seq.Seq(x, self.alphabet))
        return tuple(retval)
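
    # A sketch of group access (expected values worked out by hand); the
    # groups come back wrapped as Seq objects, so use .data for the raw
    # strings:
    #
    #   >>> m = compile("[AC]-x-V.").search(Seq.Seq("MACV"))
    #   >>> [g.data for g in m.groups()]
    #   ['A', 'C', 'V']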

def pattern_mapping(prosite, mapping):
    data = []
    for i in mapping:
        x = prosite[i].copy()
        x.min_count = x.max_count = 1
        data.append(x)
    return Prosite(data=data)

prosite_term_re = re.compile(r"""
(?:
 ([ABCDEFGHIKLMNPQRSTVWXYZx])|              # a character OR
 \[(<?)([ABCDEFGHIKLMNPQRSTVWXYZ]+)(>?)\]|  # something in []s OR
 \{([ABCDEFGHIKLMNPQRSTVWXYZ]+)\}           # something in {}s
)(?:\((\d+)(,\d+)?\))?  # optional count of the form "(i,j)", ",j" optional
$
""", re.VERBOSE)

# This does not verify the pattern is correct - invalid patterns can
# be converted!
def find_terms(pattern):
    if pattern[-1:] != ".":
        raise TypeError("not a prosite pattern - needs a final '.'")
    pattern = pattern[:-1]
    terms = pattern.split("-")
    result = []
    i = 0
    for term in terms:
        can_begin = can_end = 0
        # Starts with a "<"?
        if term[:1] == "<":
            term = term[1:]
            is_begin = 1
        else:
            is_begin = 0

        # Ends with a ">"?
        if term[-1:] == ">":
            term = term[:-1]
            is_end = 1
        else:
            is_end = 0

        match = prosite_term_re.match(term)
        if match is None:
            raise TypeError("not a Prosite term (%s)" % repr(term))
        if match.group(1) is not None:
            # Single letter
            ignore = 0
            letters = match.group(1)
        elif match.group(3) is not None:
            # Letters inside of "[]"s
            ignore = 0
            letters = match.group(3)
            if match.group(2):
                can_begin = 1
                if i != 0:
                    raise TypeError("[<] only allowed for first term (%s)" \
                                    % repr(term))

            if match.group(4):
                can_end = 1
                if i != len(terms) - 1:
                    raise TypeError("[>] only allowed for last term (%s)" \
                                    % repr(term))

        elif match.group(5) is not None:
            # Letters inside of "{}"s
            ignore = 1
            letters = match.group(5)
        else:
            raise TypeError("not a prosite term (%s)" % repr(term))

        if match.group(6) is not None:
            # there is a minimum number
            min_count = int(match.group(6))
        else:
            # no min, so it's 1
            min_count = 1
        if match.group(7) is not None:
            # there is a maximum number
            max_count = int(match.group(7)[1:])
        else:
            # no max specified, so use the same as the min
            max_count = min_count

        result.append(PrositeTerm(letters, ignore, is_begin,
                                  is_end, min_count, max_count,
                                  can_begin, can_end))

        i = i + 1
    return result
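
# An illustrative run of find_terms (expected output worked out by hand,
# so treat it as a sketch rather than a tested doctest):
#
#   >>> terms = find_terms("<A-[ST](2)-{V}.")
#   >>> map(str, terms)
#   ['<A', '[ST](2)', '{V}']
#   >>> terms[1].min_count, terms[1].max_count
#   (2, 2)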


prosite_re = re.compile(r"""
^<?                       # starts with an optional "<"
(
 [ABCDEFGHIKLMNPQRSTVWXYZx]|             # a character OR
 (\[<?[ABCDEFGHIKLMNPQRSTVWXYZ]+>?\])|   # something in []s OR
 \{[ABCDEFGHIKLMNPQRSTVWXYZ]+\}          # something in {}s
)(\(\d+(,\d+)?\))?        # optional count of the form "(i,j)" (",j" is optional)
(-                        # new terms separated by a '-'
 (
  [ABCDEFGHIKLMNPQRSTVWXYZx]|            # a character OR
  \[[ABCDEFGHIKLMNPQRSTVWXYZ]+>?\]|      # something in []s OR
  \{[ABCDEFGHIKLMNPQRSTVWXYZ]+\}         # something in {}s
 )(\(\d+(,\d+)?\))?       # optional count
)*                        # repeat until done
>?                        # pattern ends with an optional ">"
\.$                       # description ends with a required "."
""", re.VERBOSE)

# This verifies the pattern is correct.
def verify_pattern(pattern):
    """returns 1 if the Prosite pattern is syntactically correct, else 0"""
    x = prosite_re.match(pattern)
    if x is None:
        return 0
    # check there's only one [< at the beginning, or >] at the end
    if pattern.find("[<", 1) != -1:
        return 0
    if pattern.find(">]", 0, len(pattern)-2) != -1:
        return 0
    return 1
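
# For example (illustrative; outputs follow from the regular expression above):
#
#   >>> verify_pattern("[AC]-x-V-x(4)-{ED}.")
#   1
#   >>> verify_pattern("[AC]-x-V-x(4)-{ED}")    # missing the final "."
#   0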

def _verify_test(infile):
    """verify the patterns from a Prosite file handle"""
    pattern = ""
    while 1:
        line = infile.readline()
        if not line:
            break
        if line[:2] != "PA":
            continue

        pattern = pattern + line[5:-1]
        if line[-2] == ".":
            try:
                print "*" * 60
                print pattern
                p = compile(pattern)
                print prosite_to_re(pattern)
                print repr(p.re)
                print prosite_to_grouped_re(pattern)
                print repr(p.grouped_re)
                terms = str(p)
                if terms != pattern:
                    print "DIFFER", terms, pattern
            except TypeError, msg:
                print "PROBLEM", pattern, msg
            pattern = ""

# Commented out by jchang 4/13/00.
# Specific to Andrew's test environment.
#if __name__ == "__main__":
#    import os
#    infile = os.popen("bzcat /home/dalke/ftps/prosite/prosite.dat.bz2 | grep ^PA")
#    _verify_test(infile)