Package Bio :: Package Entrez
[hide private]
[frames] | no frames]

Source Code for Package Bio.Entrez

  1  # Copyright 1999-2000 by Jeffrey Chang.  All rights reserved. 
  2  # Copyright 2008 by Michiel de Hoon.  All rights reserved. 
  3  # This code is part of the Biopython distribution and governed by its 
  4  # license.  Please see the LICENSE file that should have been included 
  5  # as part of this package. 
  6   
  7  """Provides code to access NCBI over the WWW. 
  8   
  9  The main Entrez web page is available at: 
 10  http://www.ncbi.nlm.nih.gov/Entrez/ 
 11   
 12  A list of the Entrez utilities is available at: 
 13  http://www.ncbi.nlm.nih.gov/entrez/utils/utils_index.html 
 14   
 15  Variables: 
 16  email        Set the Entrez email parameter (default is not set). 
 17  tool         Set the Entrez tool parameter (default is "biopython"). 
 18   
 19  Functions: 
 20  efetch       Retrieves records in the requested format from a list of one or 
 21               more primary IDs or from the user's environment 
 22  epost        Posts a file containing a list of primary IDs for future use in 
 23               the user's environment to use with subsequent search strategies 
 24  esearch      Searches and retrieves primary IDs (for use in EFetch, ELink, 
 25               and ESummary) and term translations and optionally retains 
 26               results for future use in the user's environment. 
 27  elink        Checks for the existence of an external or Related Articles link 
 28               from a list of one or more primary IDs.  Retrieves primary IDs 
 29               and relevancy scores for links to Entrez databases or Related 
 30               Articles;  creates a hyperlink to the primary LinkOut provider 
 31               for a specific ID and database, or lists LinkOut URLs 
 32               and Attributes for multiple IDs. 
 33  einfo        Provides field index term counts, last update, and available 
 34               links for each database. 
 35  esummary     Retrieves document summaries from a list of primary IDs or from 
 36               the user's environment. 
 37  egquery      Provides Entrez database counts in XML for a single search 
 38               using Global Query. 
 39  espell       Retrieves spelling suggestions. 
 40   
 41  read         Parses the XML results returned by any of the above functions. 
 42               Typical usage is: 
 43               >>> handle = Entrez.einfo() # or esearch, efetch, ... 
 44               >>> record = Entrez.read(handle) 
 45               where record is now a Python dictionary or list. 
 46   
 47  _open        Internally used function. 
 48   
 49  """ 
 50  import urllib, time, warnings 
 51  import os.path 
 52  from Bio import File 
 53   
 54   
 55  email = None  # Sent by _open() as the "email" request parameter unless the caller supplies one; while None, _open() emits a UserWarning instead.
 56  tool = "biopython"  # Sent by _open() as the "tool" request parameter unless the caller supplies one.
 57   
 58   
 59  # XXX retmode? 
def epost(db, **keywds):
    """Post a list of identifiers to Entrez for later use.

    EPost uploads a file containing a list of UIs so that they are kept in
    the user's environment and can be referred to by subsequent search
    strategies.

    The available parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/epost_help.html

    The database name and any extra keyword arguments are handed to _open,
    which is asked to submit them with HTTP POST.

    Return a handle to the results.

    Raises an IOError exception if there's a network error.
    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi'
    # 'db' is a named parameter, so it can never also appear in keywds.
    request = dict(db=db, **keywds)
    return _open(url, request, post=True)
77
def efetch(db, **keywds):
    """Fetches Entrez results which are returned as a handle.

    EFetch retrieves records in the requested format from a list of one or
    more UIs or from user's environment.

    See the online documentation for an explanation of the parameters:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/efetch_help.html

    The unofficial return type "genbank" (matched case-insensitively, as is
    the "rettype" keyword itself) is no longer accepted by NCBI.  It is
    replaced by "gp" when db is "protein" and by "gb" otherwise, and a
    DeprecationWarning is issued.

    Return a handle to the results.

    Raises an IOError exception if there's a network error.

    Short example:

        from Bio import Entrez
        handle = Entrez.efetch(db="nucleotide", id="57240072", rettype="gb")
        print(handle.read())
    """
    # Iterate over a snapshot because matching entries are reassigned below.
    for key, value in list(keywds.items()):
        if key.lower() != "rettype":
            continue
        try:
            is_genbank = value.lower() == "genbank"
        except AttributeError:
            # rettype=None (dropped later by _open) or another non-string
            # value such as a list: nothing to translate, and it must not
            # crash here.
            is_genbank = False
        if not is_genbank:
            continue
        warnings.warn('As of Easter 2009, Entrez EFetch no longer '
                      'supports the unofficial return type "genbank", '
                      'use "gb" or "gp" instead.', DeprecationWarning)
        if db.lower() == "protein":
            keywds[key] = "gp"  # GenPept
        else:
            keywds[key] = "gb"  # GenBank
    cgi = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    variables = {'db': db}
    variables.update(keywds)
    return _open(cgi, variables)
110
def esearch(db, term, **keywds):
    """Run an Entrez search and return a handle to the results.

    ESearch searches and retrieves primary IDs (for use in EFetch, ELink
    and ESummary) and term translations, and optionally retains results
    for future use in the user's environment.

    The available parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/esearch_help.html

    Return a handle to the results which are always in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

        from Bio import Entrez
        handle = Entrez.esearch(db="nucleotide", retmax=10, term="Opuntia")
        record = Entrez.read(handle)
        print(record["Count"])
        print(record["IdList"])
    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    # 'db' and 'term' are named parameters, so keywds cannot repeat them.
    request = dict(db=db, term=term, **keywds)
    return _open(url, request)
138 159
def einfo(**keywds):
    """Return a summary of the Entrez databases as a results handle.

    EInfo provides field names, index term counts, last update, and
    available links for each Entrez database.

    The available parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/einfo_help.html

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

        from Bio import Entrez
        record = Entrez.read(Entrez.einfo())
        print(record['DbList'])
    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi'
    # Hand _open its own copy of the keyword arguments.
    return _open(url, dict(keywds))
183
def esummary(**keywds):
    """Retrieve document summaries as a results handle.

    ESummary retrieves document summaries from a list of primary IDs or
    from the user's environment.

    The available parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/esummary_help.html

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.
    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    # Hand _open its own copy of the keyword arguments.
    return _open(url, dict(keywds))
201
def egquery(**keywds):
    """Return Entrez database counts for a global search.

    EGQuery provides Entrez database counts in XML for a single search
    using Global Query.

    The available parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/egquery_help.html

    Return a handle to the results in XML format.

    Raises an IOError exception if there's a network error.
    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/egquery.fcgi'
    # Hand _open its own copy of the keyword arguments.
    return _open(url, dict(keywds))
219
def espell(**keywds):
    """Retrieve spelling suggestions, returned in a results handle.

    ESpell retrieves spelling suggestions, if available.

    The available parameters are explained in the online documentation:
    http://www.ncbi.nlm.nih.gov/entrez/query/static/espell_help.html

    Return a handle to the results, by default in XML format.

    Raises an IOError exception if there's a network error.

    Short example:

        from Bio import Entrez
        record = Entrez.read(Entrez.espell(term="biopythooon"))
        print(record["Query"])
        print(record["CorrectedQuery"])
    """
    url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/espell.fcgi'
    # Hand _open its own copy of the keyword arguments.
    return _open(url, dict(keywds))
243
def read(handle):
    """Parse an XML file from the NCBI Entrez Utilities into python objects.

    The XML produced by NCBI's Entrez Utilities is turned into a multilevel
    data structure of Python lists and dictionaries.  Most XML files
    returned by the Entrez Utilities can be handled, provided the matching
    DTD is available; the DTDs are looked up in the "DTDs" directory of
    this package, where Biopython ships those for the most commonly used
    utilities.

    Although the result looks like plain Python lists, dictionaries,
    strings, and so on, each object is actually an instance of a class
    derived from the base type.  That way the attributes (if any) of each
    element are available in the dictionary my_element.attributes, and the
    tag name in my_element.tag.
    """
    from Parser import DataHandler
    dtd_directory = os.path.join(__path__[0], "DTDs")
    return DataHandler(dtd_directory).read(handle)
264
def parse(handle):
    """Parse Entrez XML from a handle via DataHandler.parse.

    A DataHandler is created with the "DTDs" directory of this package and
    its parse method is applied to the handle; whatever that method returns
    is passed back unchanged.

    NOTE(review): presumably this is the counterpart of read() for XML
    holding several records, yielding them one by one -- confirm against
    Parser.DataHandler.parse.
    """
    from Parser import DataHandler
    dtd_directory = os.path.join(__path__[0], "DTDs")
    return DataHandler(dtd_directory).parse(handle)
271
def _open(cgi, params=None, post=False):
    """Helper function to build the URL and open a handle to it (PRIVATE).

    Open a handle to Entrez.

    Arguments:
    cgi     URL of the cgi script to access.
    params  Dictionary with the options to pass to it (optional).  Entries
            whose value is None are left out of the request.  The
            dictionary itself is never modified.
    post    If true the options are sent with HTTP POST, otherwise they
            are appended to the URL as a query string (HTTP GET).

    The module level "tool" and "email" settings are added to the request
    unless params already contains them; a UserWarning is issued when no
    email address is available.

    Returns the response wrapped in a File.UndoHandle.  Its first seven
    lines are inspected for known error pages and then pushed back, so the
    caller still sees the complete response.

    Raises an IOError if one of the known error messages is found.

    This function also enforces the "up to three queries per second rule"
    to avoid abusing the NCBI servers.
    """
    # NCBI requirement: At most three queries per second.
    # Equivalently, at least a third of second between queries
    delay = 0.333333334
    current = time.time()
    wait = _open.previous + delay - current
    if wait > 0:
        time.sleep(wait)
        _open.previous = current + wait
    else:
        _open.previous = current

    # Work on a private copy without the None values.  This keeps the
    # caller's dictionary (and the default argument) untouched and avoids
    # deleting entries from a dictionary while iterating over it.
    if params is None:
        params = {}
    params = dict((key, value) for key, value in params.items()
                  if value is not None)

    # Tell Entrez that we are using Biopython (or whatever the user has
    # specified explicitly in the parameters or by changing the default)
    if "tool" not in params:
        params["tool"] = tool
    # Tell Entrez who we are
    if "email" not in params:
        if email is not None:
            params["email"] = email
        else:
            warnings.warn("""
Email address is not specified.

To make use of NCBI's E-utilities, NCBI strongly recommends you to specify
your email address with each request. From June 1, 2010, this will be
mandatory. As an example, if your email address is A.N.Other@example.com, you
can specify it as follows:
   from Bio import Entrez
   Entrez.email = 'A.N.Other@example.com'
In case of excessive usage of the E-utilities, NCBI will attempt to contact
a user at the email address provided before blocking access to the
E-utilities.""", UserWarning)

    # Open a handle to Entrez.
    options = urllib.urlencode(params, doseq=True)
    if post:
        # HTTP POST
        handle = urllib.urlopen(cgi, data=options)
    else:
        # HTTP GET
        handle = urllib.urlopen(cgi + "?" + options)

    # Wrap the handle inside an UndoHandle.
    uhandle = File.UndoHandle(handle)

    # Check for errors in the first 7 lines.
    # This is kind of ugly.
    lines = [uhandle.readline() for _ in range(7)]
    # Push the lines back last-first so they are read again in order.
    for line in reversed(lines):
        uhandle.saveline(line)
    data = ''.join(lines)

    if "500 Proxy Error" in data:
        # Sometimes Entrez returns a Proxy Error instead of results
        raise IOError("500 Proxy Error (NCBI busy?)")
    elif "502 Proxy Error" in data:
        raise IOError("502 Proxy Error (NCBI busy?)")
    elif "WWW Error 500 Diagnostic" in data:
        raise IOError("WWW Error 500 Diagnostic (NCBI busy?)")
    elif "<title>Service unavailable!</title>" in data:
        # Probably later in the file it will say "Error 503"
        raise IOError("Service unavailable!")
    elif "<title>Bad Gateway!</title>" in data:
        # Probably later in the file it will say:
        #   "The proxy server received an invalid
        #    response from an upstream server."
        raise IOError("Bad Gateway!")
    elif "<title>414 Request-URI Too Large</title>" in data \
            or "<h1>Request-URI Too Large</h1>" in data:
        raise IOError("Requested URL too long (try using EPost?)")
    elif data.startswith("Error:"):
        # e.g. 'Error: Your session has expired. Please repeat your search.\n'
        raise IOError(data.strip())
    elif data.startswith("The resource is temporarily unavailable"):
        # This can occur with an invalid query_key
        # Perhaps this should be a ValueError?
        raise IOError("The resource is temporarily unavailable")
    elif data.startswith("download dataset is empty"):
        # This can occur when omit the identifier, or the WebEnv and query_key
        # Perhaps this should be a ValueError?
        raise IOError("download dataset is empty")
    elif data.startswith("ERROR"):
        # XXX Possible bug here, because I don't know whether this really
        # occurs on the first line.  I need to check this!
        raise IOError("ERROR, possibly because id not available?")
    # Should I check for 404?  timeout?  etc?
    return uhandle


# Time of the previous request, used by _open for rate limiting.
_open.previous = 0