Package Bio :: Package InterPro
[hide private]
[frames | no frames]

Source Code for Package Bio.InterPro

  1  # Copyright 2001 by Katharine Lindner.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ 
  7  This module provides code to work with HTML files from InterPro,
  8  and code to access resources at InterPro over the WWW.
  9  http://www.ebi.ac.uk/interpro/ 
 10   
 11   
 12  Classes: 
 13  Record             Holds InterPro sequence data.
 14  InterProParser     Parses InterPro sequence data into a Record object.
 15
 16  Functions:
 17  get_interpro_entry Opens an InterPro entry over the WWW and returns its handle.
 18   
 19  """ 
 20   
 21  from Bio import File 
 22  import sgmllib 
 23  from Bio.SeqFeature import Reference 
 24   
class Record(dict):
    """Hold the fields of one InterPro entry, keyed by field name.

    This is a plain ``dict`` subclass; the only addition is a
    human-readable ``str()`` rendering.
    """

    def __str__(self):
        """Return a multi-line text summary, one section per key.

        Keys are emitted in sorted order.  Formatting depends on the key:

        'References'   heading, one line per reference (via ``str``),
                       then a blank line.
        'Examples'     heading, one line per example.
        'Abstract'     heading, then the first 80 characters followed
                       by '...'.
        other lists    heading, one line per item.
        anything else  a single ``key: value`` line.
        """
        parts = []
        # sorted() does not rely on keys() returning a sortable list.
        for key in sorted(self):
            val = self[key]
            if key == 'References':
                parts.append('\n%s\n' % key)
                for reference in val:
                    parts.append('%s\n' % str(reference))
                parts.append('\n')
            elif key == 'Examples':
                parts.append('\n%s\n' % key)
                for example in val:
                    parts.append('%s\n' % example)
            elif key == 'Abstract':
                parts.append('\n%s\n' % key)
                # Only a preview of the abstract is shown.
                parts.append('%s...\n' % val[:80])
            elif isinstance(val, list):
                # isinstance also accepts list subclasses.
                parts.append('\n%s\n' % key)
                for item in val:
                    parts.append('%s\n' % item)
            else:
                parts.append('%s: %s\n' % (key, val))
        return ''.join(parts)
53
class InterProParser(sgmllib.SGMLParser):
    """Parse an InterPro HTML entry page into a Record object.

    The parser is a small state machine driven by the sgmllib tag
    callbacks.  Character data is accumulated in ``self.text`` and
    consumed with ``_flush_text()`` whenever a tag marks the end of a
    field.

    _state            'title' until the first </h2>, 'chugging_along'
                      between table cells, 'waiting_tag' / 'waiting_inf'
                      inside a <td class="tag"> / <td class="inf"> cell,
                      and 'examples' / 'references' inside the lists
                      belonging to those two fields.
    _key_waiting      text of the most recent "tag" cell, i.e. the Record
                      key that the following "inf" cell provides.
    _reference_state  which attribute of the current reference is expected
                      next: 'pubmed_id', 'authors', 'title', 'journal',
                      'medline_id'.
    """

    def reset(self):
        """Reset the SGML parser and start a fresh, empty Record."""
        sgmllib.SGMLParser.reset(self)
        self.text = ''
        self.inter_pro_dict = Record()
        self.inter_pro_dict['Database'] = ''
        self.inter_pro_dict['Accession'] = ''
        self.inter_pro_dict['Name'] = ''
        self.inter_pro_dict['Dates'] = ''
        self.inter_pro_dict['Type'] = ''
        self.inter_pro_dict['Parent'] = ''
        self.inter_pro_dict['Process'] = ''
        self.inter_pro_dict['Function'] = ''
        self.inter_pro_dict['Component'] = ''
        self.inter_pro_dict['Signatures'] = []
        self.inter_pro_dict['Abstract'] = ''
        self.inter_pro_dict['Examples'] = []
        self.inter_pro_dict['References'] = []
        self.inter_pro_dict['Database links'] = []
        self._state = 'title'
        self._reference_state = ''
        self._key_waiting = ''
        # '' is the "no reference started yet" placeholder.
        self._current_reference = ''
        self._references = []

    def parse(self, handle):
        """Parse the page readable from handle and return the Record."""
        self.reset()
        self.feed(handle)
        return self.inter_pro_dict

    def feed(self, handle):
        """Read InterPro HTML from handle and run it through the parser.

        handle is a file-like object providing readline(); it is wrapped
        in a File.UndoHandle unless it already is one.

        Every line is stripped and the lines are joined with single
        spaces, so the SGML parser sees the page as one long line.
        Reading stops at end of file or at the first line that ends with
        '</HTML>'; that closing line itself is not passed to the parser.
        """
        if isinstance(handle, File.UndoHandle):
            uhandle = handle
        else:
            uhandle = File.UndoHandle(handle)
        lines = []
        while True:
            line = uhandle.readline()
            if not line:
                break
            line = line.strip()
            if line.endswith('</HTML>'):
                break
            lines.append(line)
        # Each line is preceded by one space; joining once avoids
        # rebuilding the whole string for every line read.
        text = ''.join([' ' + line for line in lines])
        sgmllib.SGMLParser.feed(self, text)

    def handle_data(self, newtext):
        """Append a stripped chunk of character data to the text buffer."""
        newtext = newtext.strip()
        self.text = self.text + newtext

    def start_table(self, attrs):
        """Accept <table>; no information is taken from its attributes."""
        pass

    def start_h2(self, attrs):
        """Accept <h2>; the work is done when the heading closes."""
        pass

    def end_h2(self):
        """Leave the initial 'title' state once a heading has closed."""
        self._state = 'chugging_along'

    def start_td(self, attrs):
        """Enter a key cell (class="tag") or a value cell (class="inf")."""
        dictionary = dict(attrs)
        if self._state == 'chugging_along':
            if 'class' in dictionary:
                if dictionary['class'] == 'tag':
                    self._state = 'waiting_tag'
                    self._flush_text()
                elif dictionary['class'] == 'inf':
                    self._state = 'waiting_inf'
                    self._flush_text()

    def end_td(self):
        """Finish a cell: remember a key, or store the value for it."""
        if self._state == 'waiting_tag':
            self._key_waiting = self._flush_text()
            self._state = 'chugging_along'
        elif self._state == 'waiting_inf':
            key = self._key_waiting
            if key in self.inter_pro_dict:
                val = self._flush_text()
                # 'Signatures' and 'Database links' are collected item by
                # item in do_br, so the remaining text is not stored here.
                if key not in ('Signatures', 'Database links'):
                    self.inter_pro_dict[key] = val
            self._key_waiting = ''
            self._state = 'chugging_along'

    def start_ul(self, attrs):
        """Start collecting examples if the pending key is 'Examples'."""
        if self._key_waiting == 'Examples':
            self._state = 'examples'
            self._flush_text()

    def end_ul(self):
        """Close any unordered list and return to the idle state."""
        self._key_waiting = ''
        self._state = 'chugging_along'

    def start_ol(self, attrs):
        """Start collecting references if the pending key is 'References'."""
        if self._key_waiting == 'References':
            self._state = 'references'
            self._reference_state = 'pubmed_id'
            self._flush_text()
            self._references = []

    def end_ol(self):
        """Store the collected references in the Record."""
        if self._state == 'references':
            self._store_current_reference()
            # Forget the stored reference so that it cannot be stored a
            # second time should another reference list follow.
            self._current_reference = ''
            self.inter_pro_dict['References'] = self._references
            self._state = 'chugging_along'

    def start_li(self, attrs):
        """Begin a new reference, storing the previous one if any."""
        if self._state == 'references':
            self._reference_state = 'pubmed_id'
            self._flush_text()
            self._store_current_reference()
            self._current_reference = Reference()

    def end_li(self):
        """Store the text of a finished example list item."""
        if self._state == 'examples':
            text = self._flush_text()
            self.inter_pro_dict['Examples'].append(text)

    def start_a(self, attrs):
        """Pick up the PubMed id or the journal of the current reference.

        The PubMed id is the value of the anchor's ``name`` attribute.
        The journal is the text accumulated before the anchor that holds
        the Medline id.
        """
        dictionary = dict(attrs)
        if self._state == 'references':
            if self._reference_state == 'pubmed_id':
                if 'name' in dictionary:
                    self._current_reference.pubmed_id = dictionary['name']
                    self._reference_state = 'authors'
            elif self._reference_state == 'journal':
                self._current_reference.journal = self._flush_text()
                self._reference_state = 'medline_id'

    def end_a(self):
        """Extract the Medline id from the text of the closing anchor.

        The text is split on ':'; the second field, minus its last
        character, becomes the id.  Without a ':' the id is None.
        """
        if self._state == 'references':
            if self._reference_state == 'medline_id':
                text = self._flush_text()
                cols = text.split(':')
                try:
                    medline_id = cols[1]
                except IndexError:
                    medline_id = None
                else:
                    medline_id = medline_id[:-1]
                self._current_reference.medline_id = medline_id

    def do_br(self, attrs):
        """Use <br> as the separator between authors or list items."""
        if self._state == 'references':
            if self._reference_state == 'authors':
                self._current_reference.authors = self._flush_text()
                self._reference_state = 'title'
        elif self._key_waiting == 'Signatures':
            self.inter_pro_dict['Signatures'].append(self._flush_text())
        elif self._key_waiting == 'Database links':
            self.inter_pro_dict['Database links'].append(self._flush_text())

    def start_i(self, attrs):
        """Accept <i>; the work is done when the element closes."""
        pass

    def end_i(self):
        """Take the text up to </i> as the title of the reference."""
        if self._state == 'references':
            if self._reference_state == 'title':
                text = self._flush_text()
                self._current_reference.title = text
                self._reference_state = 'journal'

    def handle_starttag(self, tag, method, attrs):
        """Dispatch a start tag, first adjusting the open-element stack.

        Inside the reference list, <li> tags and the <a> tag expected in
        the 'pubmed_id' state are popped from ``self.stack`` before their
        handler runs.
        NOTE(review): presumably this keeps sgmllib from treating these
        tags as open elements -- confirm against a sample page.
        """
        if self._state == 'references':
            if tag == 'li':
                self.stack.pop()
            elif tag == 'a':
                if self._reference_state == 'pubmed_id':
                    self.stack.pop()
        method(attrs)

    def _store_current_reference(self):
        """Append the reference in progress, unless none was started."""
        if self._current_reference != '':
            self._references.append(self._current_reference)

    def _flush_text(self):
        """Return the stripped contents of the text buffer and empty it."""
        text = self.text.strip()
        self.text = ''
        return text
249
def get_interpro_entry(id):
    """Open the InterPro entry page for accession ``id``.

    Returns the handle produced by urllib.urlopen for the entry's URL.
    No check is made that the entry actually exists.
    """
    import urllib

    entry_url = "http://www.ebi.ac.uk/interpro/IEntry?ac=" + id
    # XXX need to check to see if the entry exists!
    return urllib.urlopen(entry_url)
if __name__ == '__main__':
    # Ad-hoc smoke test: parse a saved InterPro page from the current
    # directory and print the resulting record.
    import Bio.File
    handle = open('IPR001064.htm')
    try:
        undo_handle = Bio.File.UndoHandle(handle)
        interpro_parser = InterProParser()
        # Pass the UndoHandle that was built for the parser rather than
        # the raw file object.
        record = interpro_parser.parse(undo_handle)
    finally:
        # Close the file even if parsing fails.
        handle.close()
    print(str(record))