1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20 import string
21 import operator
22 import urllib
23 import sgmllib
24 import UserDict
25 import Bio.File
26
27
28
30
# Body of the parser's reset step.  NOTE(review): the `def` line is missing
# from this listing; parse() invokes this as self.reset() -- confirm.
# Re-initialises the sgmllib base parser, then all per-record state.
32 sgmllib.SGMLParser.reset( self )
# Character data accumulated since the buffer was last cleared.
33 self.text = ''
# Result mapping; this is the object parse() hands back to the caller.
34 self.queue = UserDict.UserDict()
# Markers of the enclosing tags, and the marker of the innermost open tag.
35 self.open_tag_stack = []
36 self.open_tag = 'open_html'
# Values are filed as self.queue[ master_key ][ key_waiting ] elsewhere in
# this class; both start out empty.
37 self.key_waiting = ''
38 self.master_key = ''
# Name of the page section currently being read; starts as 'general_info'.
39 self.context = 'general_info'
40
def parse( self, handle ):
    """Parse one record from ``handle`` and return the collected mapping.

    Resets the parser state, passes ``handle`` to ``self.feed`` and then
    tidies ``self.queue``:

    * every entry whose value equals an empty mapping is removed;
    * before removal, an empty entry whose key starts with
      'UniGene Cluster' is re-filed under the key 'UniGene Cluster',
      with the rest of the original key (index 16 onwards) as its value.

    NOTE(review): the original listing lost its indentation.  The nesting
    used here (delete every empty entry, re-file only the cluster title)
    is the reading that leaves non-empty entries untouched -- confirm
    against the upstream file.

    handle -- whatever self.feed() accepts; it is passed through as is.

    Returns self.queue after the clean-up.
    """
    self.reset()
    self.feed( handle )
    # Entries are added and deleted inside the loop, so walk a snapshot of
    # the keys; iterating a live keys() view while mutating would fail.
    for key in list( self.queue.keys() ):
        if( self.queue[ key ] == {} ):
            if( key[ :15 ] == 'UniGene Cluster' ):
                self.queue[ 'UniGene Cluster' ] = key[ 16: ]
            del self.queue[ key ]
    return self.queue
50
51
52
53
54 - def feed( self, handle ):
68
69
70
# Character-data step.  NOTE(review): the `def` line is missing from this
# listing; `newtext` is presumably the handler's argument -- confirm.
# Strips surrounding whitespace from the new chunk and appends it to the
# text accumulated so far.
72 newtext = string.strip( newtext )
73 self.text = self.text + newtext
74
# Tag-handler fragment.  NOTE(review): the `def` line is missing and the
# indentation was lost, so the handler's name and the nesting are inferred.
# In the 'seq_info' context, when the innermost open tag is not 'open_b',
# the accumulated text is discarded.
76 if( self.context == 'seq_info' ):
77 if( self.open_tag != 'open_b' ):
78 self.text = ''
79
80
81
# Tag-handler fragment.  NOTE(review): the `def` line is missing and the
# indentation was lost, so the handler's name and the nesting are inferred.
# In the 'seq_info' context, when the innermost open tag is not 'open_b':
# if no key is pending, the accumulated text becomes key_waiting; the text
# buffer is then cleared (whether the clear sits inside the innermost `if`
# cannot be told from this listing).
83 if( self.context == 'seq_info' ):
84 if( self.open_tag != 'open_b' ):
85 if( self.key_waiting == '' ):
86 self.key_waiting = self.text
87 self.text = ''
88
90
# Tag-open fragment.  NOTE(review): the `def` line is missing; the 'open_b'
# marker suggests this is the start handler for a bold tag -- confirm.
# Saves the current marker on the stack and makes 'open_b' the innermost
# open tag; the accumulated text is dropped when no key is pending.
91 self.open_tag_stack.append( self.open_tag )
92 self.open_tag = 'open_b'
93 if( self.key_waiting == '' ):
94 self.text = ''
95
# Tag-close fragment.  NOTE(review): the `def` line is missing from this
# listing, so the handler's name is inferred.
# Text that starts with 'UniGene Cluster' is stored directly in the queue
# under that key (value: the text from index 16 on) and the buffer is
# cleared; otherwise, when no key is pending, self.extract_key() is called.
97 if( self.text[ :15 ] == 'UniGene Cluster' ):
98 self.queue[ 'UniGene Cluster' ] = self.text[ 16: ]
99 self.text = ''
100 elif( self.key_waiting == '' ):
101 self.extract_key()
102
# Key-extraction step.  NOTE(review): the `def` line is missing; another
# fragment of this class calls self.extract_key(), presumably this body.
# Indentation was lost, so the nesting of the branches below is not shown
# by the listing and must be confirmed against the upstream file.
#
# 1. Normalise the accumulated text: strip it, collapse runs of whitespace,
#    and keep at most the first two words as `key`; clear the text buffer.
# 2. Restore the previous open-tag marker from the stack, falling back to
#    'open_html' when the pop fails (bare except: any error is swallowed).
# 3. Depending on the restored marker and on self.context, update
#    key_waiting, master_key, context, and/or create a fresh UserDict in
#    self.queue under `key`.
104 text = string.strip( self.text )
105 key = string.join( string.split( text ) )
106 words = string.split( key )
107 key = string.join( words[ :2 ] )
108 self.text = ''
109
110 try:
111 self.open_tag = self.open_tag_stack.pop()
112 except:
113 self.open_tag = 'open_html'
# Marker restored above is a table cell: the key names a value to come.
114 if( self.open_tag == 'open_table_data' ):
115 if( self.context == 'general_info' ):
116 if( self.key_waiting == '' ):
117 self.key_waiting = key
118 self.text = ''
119 elif( self.context == 'seq_info' ):
# The literal 'Key to Symbols' switches the context to 'legend'.
120 if( text == 'Key to Symbols' ):
121 self.context = 'legend'
122 self.master_key = key
# A key containing 'SEQUENCE' switches the context to 'seq_info'.
123 elif( self.context == 'general_info' ):
124 self.master_key = key
125 if( string.find( key, 'SEQUENCE' ) != -1 ):
126 self.context = 'seq_info'
127 self.queue[ key ] = UserDict.UserDict()
128 elif( self.context == 'seq_info' ):
129 self.queue[ key ] = UserDict.UserDict()
130 self.master_key = key
131
132
133
# Tag-open fragment.  NOTE(review): the `def` line is missing; the marker
# string suggests the start handler for a table tag -- confirm.
# Saves the current marker on the stack and makes 'open_table' current.
135 self.open_tag_stack.append( self.open_tag )
136 self.open_tag = 'open_table'
137
# Tag-close fragment.  NOTE(review): the `def` line is missing from this
# listing, so the handler's name is inferred.
# Restores the previous open-tag marker, falling back to 'open_html' when
# the pop fails (bare except), and forgets any pending key.
139 try:
140 self.open_tag = self.open_tag_stack.pop()
141 except:
142 self.open_tag = 'open_html'
143 self.key_waiting = ''
144
# Tag-open fragment.  NOTE(review): the `def` line is missing; the marker
# string suggests the start handler for a table-row tag -- confirm.
# Saves the current marker, makes 'open_table_row' current and starts the
# row with an empty text buffer.
146 self.open_tag_stack.append( self.open_tag )
147 self.open_tag = 'open_table_row'
148 self.text = ''
149
# Tag-close fragment that files the accumulated text as a value.
# NOTE(review): the `def` line is missing and the indentation was lost, so
# the handler's name and the exact nesting below are inferred -- confirm.
#
# 1. Restore the previous open-tag marker ('open_html' when the pop fails).
# 2. If any text was accumulated: clear the buffer, drop one leading ':',
#    and collapse runs of whitespace.
# 3. In the 'general_info' or 'seq_info' context, store the text at
#    self.queue[ master_key ][ key_waiting ]: appended when a list is
#    already there, paired with the existing value in a new list when a
#    single value is there, or stored alone when the lookup fails.
#    The bare except treats any error from the lookup as "no value yet".
# 4. Forget the pending key.
151 try:
152 self.open_tag = self.open_tag_stack.pop()
153 except:
154 self.open_tag = 'open_html'
155 text = self.text
156 if text:
157 self.text = ''
158 if( text[ 0 ] == ':' ):
159 text = text[ 1: ]
160 text = string.join( string.split( text ) )
161 if( ( self.context == 'general_info' ) or \
162 ( self.context == 'seq_info' ) ):
163 try:
164 contents = self.queue[ self.master_key ][ self.key_waiting ]
165 if( type( contents ) == type( [] ) ):
166 contents.append( text )
167 else:
168 self.queue[ self.master_key ][ self.key_waiting ] = \
169 [ contents , text ]
170 except:
171 self.queue[ self.master_key ][ self.key_waiting ] = text
172
173
174 self.key_waiting = ''
175
176
177
# Tag-open fragment.  NOTE(review): the `def` line is missing; the marker
# string suggests the start handler for a table-cell tag -- confirm.
# Saves the current marker on the stack and makes 'open_table_data' current.
179 self.open_tag_stack.append( self.open_tag )
180 self.open_tag = 'open_table_data'
181
# Tag-close fragment.  NOTE(review): the `def` line is missing and the
# indentation was lost, so the handler's name and nesting are inferred.
# Restores the previous open-tag marker ('open_html' when the pop fails,
# via a bare except).  In the 'seq_info' context a single space is appended
# to the accumulated text, which keeps adjacent chunks from running together.
183 try:
184 self.open_tag = self.open_tag_stack.pop()
185 except:
186 self.open_tag = 'open_html'
187 if( self.context == 'seq_info' ):
188 self.text = self.text + ' '
189
# Recursive pretty-printer body.  NOTE(review): the `def` line is missing
# from this listing; the recursive calls below show it is reached as
# self.print_item( item, level ) -- confirm the full signature.
# Builds an indent that grows with `level`, then prints by type:
#   * string   -- printed after the indent, unless it is empty;
#   * list     -- each element is printed one level deeper;
#   * UserDict -- each key is printed as 'key is <key>', followed by its
#                 value one level deeper;
#   * anything else is printed as is, without the indent.
191 indent = ' '
192 for j in range( 0, level ):
193 indent = indent + ' '
194 if( type( item ) == type( '' ) ):
195 if( item != '' ):
196 print '%s%s' % ( indent, item )
197 elif( type( item ) == type([])):
198 for subitem in item:
199 self.print_item( subitem, level + 1 )
200 elif( isinstance( item, UserDict.UserDict ) ):
201 for subitem in item.keys():
202 print '%skey is %s' % ( indent, subitem )
203 self.print_item( item[ subitem ], level + 1 )
204 else:
205 print item
206
211
212
213
if( __name__ == '__main__' ):
    # Demo driver: parse the sample page 'Hs13225.htm' from the current
    # directory and print everything the parser collected.
    handle = open( 'Hs13225.htm')
    try:
        # NOTE(review): undo_handle is created but the raw handle is what
        # gets parsed.  Kept as in the original so the input seen by
        # parse() does not change -- confirm which one was intended.
        undo_handle = Bio.File.UndoHandle( handle )
        unigene_parser = UniGeneParser()
        unigene_parser.parse( handle )
        unigene_parser.print_tags()
    finally:
        # Always release the file, even when parsing or printing fails.
        handle.close()
220