1
2
3
4
5
6
7
8 """ASTRAL RAF (Rapid Access Format) Sequence Maps.
9
10 The ASTRAL RAF Sequence Maps record the relationship between the PDB SEQRES
11 records (representing the sequence of the molecule used in an experiment) to
12 the ATOM records (representing the atoms experimentally observed).
13
14 This data is derived from the Protein Data Bank CIF files. Known errors in the
15 CIF files are corrected manually, with the original PDB file serving as the
16 final arbiter in case of discrepancies.
17
18 Residues are referenced by residue ID. This consists of a the PDB residue
19 sequence number (upto 4 digits) and an optional PDB insertion code (an
20 ascii alphabetic character, a-z, A-Z). e.g. "1", "10A", "1010b", "-1"
21
22 See "ASTRAL RAF Sequence Maps":http://astral.stanford.edu/raf.html
23
24 to_one_letter_code -- A mapping from the 3-letter amino acid codes found
25 in PDB files to 1-letter codes. The 3-letter codes
26 include chemically modified residues.
27 """
28
29 from copy import copy
30 from types import *
31
32 from Residues import Residues
33
34
35
36 to_one_letter_code= {
37 'ALA':'A', 'VAL':'V', 'PHE':'F', 'PRO':'P', 'MET':'M',
38 'ILE':'I', 'LEU':'L', 'ASP':'D', 'GLU':'E', 'LYS':'K',
39 'ARG':'R', 'SER':'S', 'THR':'T', 'TYR':'Y', 'HIS':'H',
40 'CYS':'C', 'ASN':'N', 'GLN':'Q', 'TRP':'W', 'GLY':'G',
41 '2AS':'D', '3AH':'H', '5HP':'E', 'ACL':'R', 'AIB':'A',
42 'ALM':'A', 'ALO':'T', 'ALY':'K', 'ARM':'R', 'ASA':'D',
43 'ASB':'D', 'ASK':'D', 'ASL':'D', 'ASQ':'D', 'AYA':'A',
44 'BCS':'C', 'BHD':'D', 'BMT':'T', 'BNN':'A', 'BUC':'C',
45 'BUG':'L', 'C5C':'C', 'C6C':'C', 'CCS':'C', 'CEA':'C',
46 'CHG':'A', 'CLE':'L', 'CME':'C', 'CSD':'A', 'CSO':'C',
47 'CSP':'C', 'CSS':'C', 'CSW':'C', 'CXM':'M', 'CY1':'C',
48 'CY3':'C', 'CYG':'C', 'CYM':'C', 'CYQ':'C', 'DAH':'F',
49 'DAL':'A', 'DAR':'R', 'DAS':'D', 'DCY':'C', 'DGL':'E',
50 'DGN':'Q', 'DHA':'A', 'DHI':'H', 'DIL':'I', 'DIV':'V',
51 'DLE':'L', 'DLY':'K', 'DNP':'A', 'DPN':'F', 'DPR':'P',
52 'DSN':'S', 'DSP':'D', 'DTH':'T', 'DTR':'W', 'DTY':'Y',
53 'DVA':'V', 'EFC':'C', 'FLA':'A', 'FME':'M', 'GGL':'E',
54 'GLZ':'G', 'GMA':'E', 'GSC':'G', 'HAC':'A', 'HAR':'R',
55 'HIC':'H', 'HIP':'H', 'HMR':'R', 'HPQ':'F', 'HTR':'W',
56 'HYP':'P', 'IIL':'I', 'IYR':'Y', 'KCX':'K', 'LLP':'K',
57 'LLY':'K', 'LTR':'W', 'LYM':'K', 'LYZ':'K', 'MAA':'A',
58 'MEN':'N', 'MHS':'H', 'MIS':'S', 'MLE':'L', 'MPQ':'G',
59 'MSA':'G', 'MSE':'M', 'MVA':'V', 'NEM':'H', 'NEP':'H',
60 'NLE':'L', 'NLN':'L', 'NLP':'L', 'NMC':'G', 'OAS':'S',
61 'OCS':'C', 'OMT':'M', 'PAQ':'Y', 'PCA':'E', 'PEC':'C',
62 'PHI':'F', 'PHL':'F', 'PR3':'C', 'PRR':'A', 'PTR':'Y',
63 'SAC':'S', 'SAR':'G', 'SCH':'C', 'SCS':'C', 'SCY':'C',
64 'SEL':'S', 'SEP':'S', 'SET':'S', 'SHC':'C', 'SHR':'K',
65 'SOC':'C', 'STY':'Y', 'SVA':'S', 'TIH':'A', 'TPL':'W',
66 'TPO':'T', 'TPQ':'A', 'TRG':'K', 'TRO':'W', 'TYB':'Y',
67 'TYQ':'Y', 'TYS':'Y', 'TYY':'Y', 'AGM':'R', 'GL3':'G',
68 'SMC':'C', 'ASX':'B', 'CGU':'E', 'CSX':'C', 'GLX':'Z',
69 'PYX':'C',
70 'UNK':'X'
71 }
72
73
75 """Convert RAF one-letter amino acid codes into IUPAC standard codes.
76
77 Letters are uppercased, and "." ("Unknown") is converted to "X".
78 """
79 if one_letter_code == '.':
80 return 'X'
81 else:
82 return one_letter_code.upper()
83
85 """An RAF file index.
86
87 The RAF file itself is about 50 MB. This index provides rapid, random
88 access of RAF records without having to load the entire file into memory.
89
90 The index key is a concatenation of the PDB ID and chain ID. e.g
91 "2drcA", "155c_". RAF uses an underscore to indicate blank
92 chain IDs.
93 """
94
96 """
97 Arguments:
98
99 filename -- The file to index
100 """
101 dict.__init__(self)
102 self.filename = filename
103
104 f = open(self.filename, "rU")
105 try:
106 position = 0
107 while True:
108 line = f.readline()
109 if not line: break
110 key = line[0:5]
111 if key != None:
112 self[key]=position
113 position = f.tell()
114 finally:
115 f.close()
116
118 """ Return an item from the indexed file. """
119 position = dict.__getitem__(self,key)
120
121 f = open(self.filename, "rU")
122 try:
123 f.seek(position)
124 line = f.readline()
125 record = SeqMap(line)
126 finally:
127 f.close()
128 return record
129
130
132 """Get the sequence map for a collection of residues.
133
134 residues -- A Residues instance, or a string that can be converted into
135 a Residues instance.
136 """
137 if type(residues) == StringType:
138 residues = Residues(residues)
139
140 pdbid = residues.pdbid
141 frags = residues.fragments
142 if not frags: frags =(('_','',''),)
143
144 seqMap = None
145 for frag in frags:
146 chainid = frag[0]
147 if chainid=='' or chainid=='-' or chainid==' ' or chainid=='_':
148 chainid = '_'
149 id = pdbid + chainid
150
151
152 sm = self[id]
153
154
155 start = 0
156 end = len(sm.res)
157 if frag[1] : start = int(sm.index(frag[1], chainid))
158 if frag[2] : end = int(sm.index(frag[2], chainid)+1)
159
160 sm = sm[start:end]
161
162 if seqMap == None:
163 seqMap = sm
164 else:
165 seqMap += sm
166
167 return seqMap
168
169
170
172 """An ASTRAL RAF (Rapid Access Format) Sequence Map.
173
174 This is a list like object; You can find the location of particular residues
175 with index(), slice this SeqMap into fragments, and glue fragments back
176 together with extend().
177
178 pdbid -- The PDB 4 character ID
179
180 pdb_datestamp -- From the PDB file
181
182 version -- The RAF format version. e.g. 0.01
183
184 flags -- RAF flags. (See release notes for more information.)
185
186 res -- A list of Res objects, one for each residue in this sequence map
187 """
188
190 self.pdbid = ''
191 self.pdb_datestamp = ''
192 self.version = ''
193 self.flags = ''
194 self.res = []
195 if line:
196 self._process(line)
197
198
200 """Parses a RAF record into a SeqMap object.
201 """
202 header_len = 38
203
204 line = line.rstrip()
205
206 if len(line)<header_len:
207 raise ValueError("Incomplete header: "+line)
208
209 self.pdbid = line[0:4]
210 chainid = line[4:5]
211
212 self.version = line[6:10]
213
214
215 if(self.version != "0.01" and self.version !="0.02"):
216 raise ValueError("Incompatible RAF version: "+self.version)
217
218 self.pdb_datestamp = line[14:20]
219 self.flags = line[21:27]
220
221 for i in range(header_len, len(line), 7):
222 f = line[i : i+7]
223 if len(f)!=7:
224 raise ValueError("Corrupt Field: ("+f+")")
225 r = Res()
226 r.chainid = chainid
227 r.resid = f[0:5].strip()
228 r.atom = normalize_letters(f[5:6])
229 r.seqres = normalize_letters(f[6:7])
230
231 self.res.append(r)
232
233
234 - def index(self, resid, chainid="_"):
235 for i in range(0, len(self.res)):
236 if self.res[i].resid == resid and self.res[i].chainid == chainid:
237 return i
238 raise KeyError("No such residue "+chainid+resid)
239
241 s = copy(self)
242 s.res = s.res[i:j]
243 return s
244
246 """Append another Res object onto the list of residue mappings."""
247 self.res.append(res)
248
250 """Append another SeqMap onto the end of self.
251
252 Both SeqMaps must have the same PDB ID, PDB datestamp and
253 RAF version. The RAF flags are erased if they are inconsistent. This
254 may happen when fragments are taken from different chains.
255 """
256 if not isinstance(other, SeqMap):
257 raise TypeError("Can only extend a SeqMap with a SeqMap.")
258 if self.pdbid != other.pdbid:
259 raise TypeError("Cannot add fragments from different proteins")
260 if self.version != other.version:
261 raise TypeError("Incompatible rafs")
262 if self.pdb_datestamp != other.pdb_datestamp:
263 raise TypeError("Different pdb dates!")
264 if self.flags != other.flags:
265 self.flags = ''
266 self.res += other.res
267
271
276
277 - def getAtoms(self, pdb_handle, out_handle):
278 """Extract all relevant ATOM and HETATOM records from a PDB file.
279
280 The PDB file is scanned for ATOM and HETATOM records. If the
281 chain ID, residue ID (seqNum and iCode), and residue type match
282 a residue in this sequence map, then the record is echoed to the
283 output handle.
284
285 This is typically used to find the coordinates of a domain, or other
286 residue subset.
287
288 pdb_handle -- A handle to the relevant PDB file.
289
290 out_handle -- All output is written to this file like object.
291 """
292
293
294
295 resSet = {}
296 for r in self.res:
297 if r.atom=='X' :
298 continue
299 chainid = r.chainid
300 if chainid == '_':
301 chainid = ' '
302 resid = r.resid
303 resSet[(chainid,resid)] = r
304
305 resFound = {}
306 for line in pdb_handle.xreadlines():
307 if line.startswith("ATOM ") or line.startswith("HETATM"):
308 chainid = line[21:22]
309 resid = line[22:27].strip()
310 key = (chainid, resid)
311 if key in resSet:
312 res = resSet[key]
313 atom_aa = res.atom
314 resName = line[17:20]
315 if resName in to_one_letter_code:
316 if to_one_letter_code[resName] == atom_aa:
317 out_handle.write(line)
318 resFound[key] = res
319
320 if len(resSet) != len(resFound):
321
322
323
324
325 raise RuntimeError('I could not find at least one ATOM or HETATM' \
326 +' record for each and every residue in this sequence map.')
327
328
329
331 """ A single residue mapping from a RAF record.
332
333 chainid -- A single character chain ID.
334
335 resid -- The residue ID.
336
337 atom -- amino acid one-letter code from ATOM records.
338
339 seqres -- amino acid one-letter code from SEQRES records.
340 """
342 self.chainid = ''
343 self.resid = ''
344 self.atom = ''
345 self.seqres = ''
346
347
349 """Iterates over a RAF file, returning a SeqMap object for each line
350 in the file.
351
352 Arguments:
353
354 handle -- file-like object.
355 """
356 for line in handle:
357 yield SeqMap(line)
358