Package Bio :: Package PDB :: Module PDBList'
[hide private]
[frames] | no frames]

Source Code for Module Bio.PDB.PDBList'

  1  #!/usr/bin/env python 
  2  # 
  3  # PDBList.py 
  4  # 
  5  # A tool for tracking changes in the PDB Protein Structure Database. 
  6  # 
  7  # Version 2.0 
  8  # 
  9  # (c) 2003 Kristian Rother 
 10  # This work was supported by the German Ministry of Education 
 11  # and Research (BMBF). Project http://www.bcbio.de 
 12  #  
 13  # Contact the author 
 14  #    homepage : http://www.rubor.de/bioinf 
 15  #    email    : krother@genesilico.pl 
 16  # 
 17  # 
 18  # This Code is released under the conditions of the Biopython license. 
 19  # It may be distributed freely with respect to the original author. 
 20  # Any maintainer of the BioPython code may change this notice 
 21  # when appropriate. 
 22  # 
 23  # Last modified on Fri, Oct 24th 2006, Warszawa 
 24  # 
 25  # Removed the 'write' option from the retrieve_pdb_file method: it was not used. 
 26  # Also added a 'dir' option (the pdb file is put in this directory if given), 
 27  # and an 'exist' option (test if the file is already there). This method 
 28  # now returns the name of the downloaded uncompressed file. 
 29  # 
 30  # -Thomas, 1/06/04 
 31  # 
 32  # 
 33  # Including bugfixes from Sunjoong Lee (9/2006) 
 34  # 
 35   
 36  __doc__="Access the PDB over the internet (for example to download structures)." 
 37   
import os
import re
import shlex
import shutil
import subprocess
import urllib
 40   
class PDBList(object):
    """Quick access to the structure lists on the PDB server or its mirrors.

    The structure lists contain four-letter PDB codes, indicating that
    structures are new, have been modified or are obsolete. The lists are
    released on a weekly basis.

    It also provides a function to retrieve PDB files from the server.
    To use it properly, prepare a directory /pdb or the like,
    where PDB files are stored.

    If you want to use this module from inside a proxy, add
    the proxy variable to your environment, e.g. in Unix
    export HTTP_PROXY='http://realproxy.charite.de:888'
    (This can also be added to ~/.bashrc)
    """

    PDB_REF = """
    The Protein Data Bank: a computer-based archival file for macromolecular structures.
    F.C.Bernstein, T.F.Koetzle, G.J.B.Williams, E.F.Meyer Jr, M.D.Brice, J.R.Rodgers, O.Kennard, T.Shimanouchi, M.Tasumi
    J. Mol. Biol. 112 pp. 535-542 (1977)
    http://www.pdb.org/.
    """

    alternative_download_url = "http://www.rcsb.org/pdb/files/"
    # just append PDB code to this, and then it works.
    # (above URL verified with a XXXX.pdb appended on 2 Sept 2008)

    def __init__(self, server='ftp://ftp.wwpdb.org', pdb=None,
                 obsolete_pdb=None):
        """Initialize the class with the default server or a custom one.

        @param server: base URL of the remote PDB server
        @type server: string

        @param pdb: root of the local PDB file tree (default: the working
            directory at the time the instance is created)
        @type pdb: string

        @param obsolete_pdb: root of the local tree for obsolete files
            (default: an 'obsolete' directory below pdb, created if missing)
        @type obsolete_pdb: string
        """
        # remote pdb server
        self.pdb_server = server

        # local pdb file tree; resolved here rather than in the signature so
        # that the default follows the caller's current directory instead of
        # the directory the module happened to be imported from.
        if pdb is None:
            pdb = os.getcwd()
        self.local_pdb = pdb

        # local file tree for obsolete pdb files
        if obsolete_pdb:
            self.obsolete_pdb = obsolete_pdb
        else:
            self.obsolete_pdb = os.path.join(self.local_pdb, 'obsolete')
            if not os.path.exists(self.obsolete_pdb):
                os.makedirs(self.obsolete_pdb)

        # variables for command-line options
        self.overwrite = 0
        self.flat_tree = 0

    # ------------------------------------------------------------------
    # private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _urlopen(url):
        """Open url for reading on both Python 2 and Python 3."""
        try:
            from urllib.request import urlopen  # Python 3
        except ImportError:
            from urllib import urlopen  # Python 2
        return urlopen(url)

    @staticmethod
    def _to_text(data):
        """Return data as a native string (network reads are bytes on Python 3)."""
        if isinstance(data, str):
            return data
        return data.decode("utf-8", "replace")

    def _fetch_lines(self, url):
        """Read url completely, return its lines as text and close the handle."""
        handle = self._urlopen(url)
        try:
            raw_lines = handle.readlines()
        finally:
            handle.close()
        return [self._to_text(line) for line in raw_lines]

    def _download(self, url, filename):
        """Stream the contents of url into the local file filename."""
        handle = self._urlopen(url)
        try:
            with open(filename, 'wb') as out:
                shutil.copyfileobj(handle, out)
        except Exception:
            # do not leave a truncated file behind
            if os.path.exists(filename):
                os.remove(filename)
            raise
        finally:
            handle.close()

    @staticmethod
    def _checked_code(pdb, source):
        """Return pdb if it is four characters long, else raise ValueError."""
        if len(pdb) != 4:
            raise ValueError("Unexpected PDB code %r in %s" % (pdb, source))
        return pdb

    @staticmethod
    def _latest_status_dir(listing):
        """Return the largest all-digit name found in the last column of listing."""
        columns = [line.split() for line in listing]
        dated = [fields[-1] for fields in columns
                 if fields and fields[-1].isdigit()]
        if not dated:
            raise ValueError("No dated status directory in the server listing")
        return max(dated, key=int)

    @staticmethod
    def _write_code_list(listfile, entries):
        """Write entries to listfile, one PDB code per line."""
        with open(listfile, 'w') as handle:
            handle.writelines([code + '\n' for code in entries])

    def _target_dir(self, code, obsolete, pdir):
        """Return the local directory in which the file for code is stored."""
        if pdir is not None:
            # Put in specified directory
            return pdir
        if obsolete:
            root = self.obsolete_pdb
        else:
            root = self.local_pdb
        if self.flat_tree:
            return root
        # Put in PDB style directory tree
        return os.path.join(root, code[1:3])

    # ------------------------------------------------------------------
    # public API
    # ------------------------------------------------------------------

    def get_status_list(self, url):
        """Retrieves a list of pdb codes in the weekly pdb status file
        from the given URL. Used by get_recent_changes.

        Typical contents of the list files parsed by this method is now
        very simply one PDB name per line. Blank lines are skipped.

        @raise ValueError: if a non-blank line is not a four-character code
        """
        answer = []
        for line in self._fetch_lines(url):
            pdb = line.strip()
            if not pdb:
                continue
            answer.append(self._checked_code(pdb, url))
        return answer

    def get_recent_changes(self):
        """Returns three lists of the newest weekly files (added,mod,obsolete).

        Reads the directory listing of changed entries from the PDB server
        and returns a list of three lists of PDB codes: the new, modified
        and obsolete entries from the most recent status directory. The
        directory with the largest numerical name is used.

        Contents of the data/status dir (20031013 would be used);
        drwxrwxr-x   2 1002     sysadmin     512 Oct  6 18:28 20031006
        drwxrwxr-x   2 1002     sysadmin     512 Oct 14 02:14 20031013
        -rw-r--r--   1 1002     sysadmin    1327 Mar 12  2001 README

        @raise ValueError: if the listing contains no all-digit directory name
        """
        status_url = self.pdb_server + '/pub/pdb/data/status/'
        recent = self._latest_status_dir(self._fetch_lines(status_url))

        path = status_url + '%s/' % (recent)
        # retrieve the lists
        added = self.get_status_list(path + 'added.pdb')
        modified = self.get_status_list(path + 'modified.pdb')
        obsolete = self.get_status_list(path + 'obsolete.pdb')
        return [added, modified, obsolete]

    def get_all_entries(self):
        """Retrieves a big file containing all the
        PDB entries and some annotation to them.
        Returns a list of PDB codes in the index file.
        """
        print("retrieving index file. Takes about 5 MB.")
        lines = self._fetch_lines(
            self.pdb_server + '/pub/pdb/derived_data/index/entries.idx')
        # skip the two header lines, then extract four-letter-codes
        return [line[:4] for line in lines[2:] if len(line) > 4]

    def get_all_obsolete(self):
        """Returns a list of all obsolete entries ever in the PDB.

        Returns a list of all obsolete pdb codes that have ever been
        in the PDB.

        Gets and parses the file from the PDB server in the format
        (the first pdb_code column is the one used). The file looks
        like this:

         LIST OF OBSOLETE COORDINATE ENTRIES AND SUCCESSORS
        OBSLTE    31-JUL-94 116L     216L
        ...
        OBSLTE    29-JAN-96 1HFT     2HFT
        OBSLTE    21-SEP-06 1HFV     2J5X
        OBSLTE    21-NOV-03 1HG6
        OBSLTE    18-JUL-84 1HHB     2HHB 3HHB
        OBSLTE    08-NOV-96 1HID     2HID
        OBSLTE    01-APR-97 1HIU     2HIU
        OBSLTE    14-JAN-04 1HKE     1UUZ
        ...

        @raise ValueError: if an OBSLTE record cannot be parsed
        """
        url = self.pdb_server + '/pub/pdb/data/status/obsolete.dat'
        obsolete = []
        for line in self._fetch_lines(url):
            if not line.startswith("OBSLTE "):
                continue
            fields = line.split()
            if len(fields) < 3:
                raise ValueError("Malformed OBSLTE record %r in %s"
                                 % (line, url))
            # explicit check (not an assert) to catch mis-reading the data
            obsolete.append(self._checked_code(fields[2], url))
        return obsolete

    def retrieve_pdb_file(self, pdb_code, obsolete=0, compression='.gz',
                          uncompress="gunzip", pdir=None):
        """Retrieves a PDB structure file from the PDB server and
        stores it in a local file tree.
        The name of the uncompressed local file is returned.
        If obsolete is 1, the file will be by default saved in a special file tree.
        The compression should be '.Z' or '.gz'. 'uncompress' is
        the command called to uncompress the files; it is split into
        arguments and run without a shell, with the downloaded file name
        appended as the last argument.

        If the uncompressed file already exists and self.overwrite is not
        set, nothing is downloaded.

        @param pdir: put the file in this directory (default: create a PDB-style directory tree)
        @type pdir: string

        @return: filename
        @rtype: string
        """
        # get the structure
        code = pdb_code.lower()
        archive_name = "pdb%s.ent%s" % (code, compression)
        if obsolete:
            subtree = 'obsolete'
        else:
            subtree = 'divided'
        url = (self.pdb_server +
               '/pub/pdb/data/structures/%s/pdb/%s/%s'
               % (subtree, code[1:3], archive_name))

        # in which dir to put the pdb file?
        path = self._target_dir(code, obsolete, pdir)
        if not os.path.exists(path):
            os.makedirs(path)

        filename = os.path.join(path, archive_name)
        # the final uncompressed file
        final_file = os.path.join(path, "pdb%s.ent" % code)

        # check whether the file exists
        if not self.overwrite:
            if os.path.exists(final_file):
                print("file exists, not retrieved %s" % final_file)
                return final_file

        # Retrieve the file
        print('retrieving %s' % url)
        self._download(url, filename)

        # Only reachable with an existing final file when overwriting was
        # requested: remove the stale copy so the uncompress command does
        # not refuse (or prompt) because its output already exists.
        if final_file != filename and os.path.exists(final_file):
            os.remove(final_file)

        # uncompress the file; an argument list (no shell) keeps paths with
        # spaces or shell metacharacters intact.
        command = shlex.split(uncompress) + [filename]
        try:
            status = subprocess.call(command)
        except OSError as err:
            print("could not run %s: %s" % (uncompress, err))
        else:
            if status != 0:
                print("%s exited with status %s for %s"
                      % (uncompress, status, filename))

        return final_file

    def update_pdb(self):
        """
        I guess this is the 'most wanted' function from this module.
        It gets the weekly lists of new and modified pdb entries and
        automatically downloads the according PDB files.
        Obsolete entries found in the local tree are moved to the
        obsolete tree.
        You can call this module as a weekly cronjob.
        """
        assert os.path.isdir(self.local_pdb), \
            "%s is not a directory" % self.local_pdb
        assert os.path.isdir(self.obsolete_pdb), \
            "%s is not a directory" % self.obsolete_pdb

        new, modified, obsolete = self.get_recent_changes()

        for pdb_code in new + modified:
            try:
                self.retrieve_pdb_file(pdb_code)
            except Exception:
                # deliberate best effort: one failed entry must not stop
                # the weekly update of all the others.
                print('error %s\n' % pdb_code)
                # you can insert here some more log notes that
                # something has gone wrong.

        # move the obsolete files to a special folder
        for pdb_code in obsolete:
            # local files are always stored under the lower-case code
            # (see retrieve_pdb_file), so look them up the same way.
            code = pdb_code.lower()
            ent_name = 'pdb%s.ent' % code
            if self.flat_tree:
                old_file = os.path.join(self.local_pdb, ent_name)
                new_dir = self.obsolete_pdb
            else:
                old_file = os.path.join(self.local_pdb, code[1:3], ent_name)
                new_dir = os.path.join(self.obsolete_pdb, code[1:3])
            new_file = os.path.join(new_dir, ent_name)
            if os.path.isfile(old_file):
                if not os.path.isdir(new_dir):
                    os.makedirs(new_dir)
                try:
                    shutil.move(old_file, new_file)
                except (IOError, OSError, shutil.Error):
                    print("Could not move %s to obsolete folder" % old_file)
            elif os.path.isfile(new_file):
                print("Obsolete file %s already moved" % old_file)
            else:
                print("Obsolete file %s is missing" % old_file)

    def download_entire_pdb(self, listfile=None):
        """Retrieves all PDB entries not present in the local PDB copy.
        Writes a list file containing all PDB codes (optional, if listfile is given).
        """
        entries = self.get_all_entries()
        for pdb_code in entries:
            self.retrieve_pdb_file(pdb_code)

        # write the list
        if listfile:
            self._write_code_list(listfile, entries)

    def download_obsolete_entries(self, listfile=None):
        """Retrieves all obsolete PDB entries not present in the local obsolete PDB copy.
        Writes a list file containing all PDB codes (optional, if listfile is given).
        """
        entries = self.get_all_obsolete()
        for pdb_code in entries:
            self.retrieve_pdb_file(pdb_code, obsolete=1)

        # write the list
        if listfile:
            self._write_code_list(listfile, entries)

    #
    # this is actually easter egg code not used by any of the methods
    # maybe someone will find it useful.
    #
    def get_seqres_file(self, savefile='pdb_seqres.txt'):
        """Retrieves a (big) file containing all the sequences
        of PDB entries and writes it to a file.

        The data is streamed to savefile instead of being held in memory.
        """
        print("retrieving sequence file. Takes about 15 MB.")
        self._download(
            self.pdb_server + '/pub/pdb/derived_data/pdb_seqres.txt',
            savefile)
if __name__ == '__main__':
    # Command-line front end: the first argument selects the action, the
    # second the local PDB directory, any further arguments are options.

    import sys

    doc = """PDBList.py
    (c) Kristian Rother 2003, Contributed to BioPython

    Usage:
    PDBList.py update <pdb_path> [options]   - write weekly PDB updates to
                                               local pdb tree.
    PDBList.py all    <pdb_path> [options]   - write all PDB entries to
                                               local pdb tree.
    PDBList.py obsol  <pdb_path> [options]   - write all obsolete PDB
                                               entries to local pdb tree.
    PDBList.py <PDB-ID> <pdb_path> [options] - retrieve single structure

    Options:
       -d   A single directory will be used as <pdb_path>, not a tree.
       -o   Overwrite existing structure files.
    """
    print(doc)

    if len(sys.argv) > 2:
        pdb_path = sys.argv[2]
        pl = PDBList(pdb=pdb_path)
        for option in sys.argv[3:]:
            if option == '-d':
                pl.flat_tree = 1
            elif option == '-o':
                pl.overwrite = 1
    else:
        # no directory given: work in the current directory, without a tree
        pdb_path = os.getcwd()
        pl = PDBList()
        pl.flat_tree = 1

    if len(sys.argv) > 1:
        command = sys.argv[1]
        if command == 'update':
            # update PDB
            print("updating local PDB at " + pdb_path)
            pl.update_pdb()

        elif command == 'all':
            # get the entire PDB
            pl.download_entire_pdb()

        elif command == 'obsol':
            # get all obsolete entries. The method's only argument is an
            # optional list file, so the PDB directory must not be passed
            # here; the target tree was already set on the PDBList instance.
            pl.download_obsolete_entries()

        elif re.search(r'^\d...$', command):
            # get single PDB entry (a digit followed by three characters)
            pl.retrieve_pdb_file(command, pdir=pdb_path)