Package Bio :: Package SeqUtils
[hide private]
[frames] | no frames]

Source Code for Package Bio.SeqUtils

  1  #!/usr/bin/env python 
  2  # Created: Wed May 29 08:07:18 2002 
  3  # thomas@cbs.dtu.dk, Cecilia.Alsmark@ebc.uu.se 
  4  # Copyright 2001 by Thomas Sicheritz-Ponten and Cecilia Alsmark. 
  5  # All rights reserved. 
  6  # This code is part of the Biopython distribution and governed by its 
  7  # license.  Please see the LICENSE file that should have been included 
  8  # as part of this package. 
  9   
 10  """Miscellaneous functions for dealing with sequences.""" 
 11   
 12  import re, time 
 13  from Bio import SeqIO 
 14  from Bio.Seq import Seq 
 15  from Bio import Alphabet 
 16  from Bio.Alphabet import IUPAC 
 17  from Bio.Data import IUPACData, CodonTable 
 18   
 19   
 20  ###################################### 
 21  # DNA 
 22  ###################### 
 23  # {{{  
 24   
25 -def reverse(seq):
26 """Reverse the sequence. Works on string sequences (DEPRECATED). 27 28 This function is now deprecated, instead use the string's built in slice 29 method with a step of minus one: 30 31 >>> "ACGGT"[::-1] 32 'TGGCA' 33 """ 34 import warnings 35 warnings.warn("Bio.SeqUtils.reverse() is deprecated, use the string's " 36 "slice method with a step of minus one instead.", 37 DeprecationWarning) 38 r = list(seq) 39 r.reverse() 40 return ''.join(r)
41
42 -def GC(seq):
43 """Calculates G+C content, returns the percentage (float between 0 and 100). 44 45 Copes mixed case seuqneces, and with the ambiguous nucleotide S (G or C) 46 when counting the G and C content. The percentage is calculated against 47 the full length, e.g.: 48 49 >>> from Bio.SeqUtils import GC 50 >>> GC("ACTGN") 51 40.0 52 53 Note that this will return zero for an empty sequence. 54 """ 55 try: 56 gc = sum(map(seq.count,['G','C','g','c','S','s'])) 57 return gc*100.0/len(seq) 58 except ZeroDivisionError: 59 return 0.0
60 61
62 -def GC123(seq):
63 """Calculates total G+C content plus first, second and third positions. 64 65 Returns a tuple of four floats (percentages between 0 and 100) for the 66 entire sequence, and the three codon positions. e.g. 67 68 >>> from Bio.SeqUtils import GC123 69 >>> GC123("ACTGTN") 70 (40.0, 50.0, 50.0, 0.0) 71 72 Copes with mixed case sequences, but does NOT deal with ambiguous 73 nucleotides. 74 """ 75 d= {} 76 for nt in ['A','T','G','C']: 77 d[nt] = [0,0,0] 78 79 for i in range(0,len(seq),3): 80 codon = seq[i:i+3] 81 if len(codon) <3: codon += ' ' 82 for pos in range(0,3): 83 for nt in ['A','T','G','C']: 84 if codon[pos] == nt or codon[pos] == nt.lower(): 85 d[nt][pos] += 1 86 gc = {} 87 gcall = 0 88 nall = 0 89 for i in range(0,3): 90 try: 91 n = d['G'][i] + d['C'][i] +d['T'][i] + d['A'][i] 92 gc[i] = (d['G'][i] + d['C'][i])*100.0/n 93 except: 94 gc[i] = 0 95 96 gcall = gcall + d['G'][i] + d['C'][i] 97 nall = nall + n 98 99 gcall = 100.0*gcall/nall 100 return gcall, gc[0], gc[1], gc[2]
101
102 -def GC_skew(seq, window = 100):
103 """Calculates GC skew (G-C)/(G+C) for multuple windows along the sequence. 104 105 Returns a list of ratios (floats), controlled by the length of the sequence 106 and the size of the window. 107 108 Does NOT look at any ambiguous nucleotides. 109 """ 110 # 8/19/03: Iddo: added lowercase 111 values = [] 112 for i in range(0, len(seq), window): 113 s = seq[i: i + window] 114 g = s.count('G') + s.count('g') 115 c = s.count('C') + s.count('c') 116 skew = (g-c)/float(g+c) 117 values.append(skew) 118 return values
119 120 from math import pi, sin, cos, log
121 -def xGC_skew(seq, window = 1000, zoom = 100, 122 r = 300, px = 100, py = 100):
123 """Calculates and plots normal and accumulated GC skew (GRAPHICS !!!).""" 124 from Tkinter import Scrollbar, Canvas, BOTTOM, BOTH, ALL, \ 125 VERTICAL, HORIZONTAL, RIGHT, LEFT, X, Y 126 yscroll = Scrollbar(orient = VERTICAL) 127 xscroll = Scrollbar(orient = HORIZONTAL) 128 canvas = Canvas(yscrollcommand = yscroll.set, 129 xscrollcommand = xscroll.set, background = 'white') 130 win = canvas.winfo_toplevel() 131 win.geometry('700x700') 132 133 yscroll.config(command = canvas.yview) 134 xscroll.config(command = canvas.xview) 135 yscroll.pack(side = RIGHT, fill = Y) 136 xscroll.pack(side = BOTTOM, fill = X) 137 canvas.pack(fill=BOTH, side = LEFT, expand = 1) 138 canvas.update() 139 140 X0, Y0 = r + px, r + py 141 x1, x2, y1, y2 = X0 - r, X0 + r, Y0 -r, Y0 + r 142 143 ty = Y0 144 canvas.create_text(X0, ty, text = '%s...%s (%d nt)' % (seq[:7], seq[-7:], len(seq))) 145 ty +=20 146 canvas.create_text(X0, ty, text = 'GC %3.2f%%' % (GC(seq))) 147 ty +=20 148 canvas.create_text(X0, ty, text = 'GC Skew', fill = 'blue') 149 ty +=20 150 canvas.create_text(X0, ty, text = 'Accumulated GC Skew', fill = 'magenta') 151 ty +=20 152 canvas.create_oval(x1,y1, x2, y2) 153 154 acc = 0 155 start = 0 156 for gc in GC_skew(seq, window): 157 r1 = r 158 acc+=gc 159 # GC skew 160 alpha = pi - (2*pi*start)/len(seq) 161 r2 = r1 - gc*zoom 162 x1 = X0 + r1 * sin(alpha) 163 y1 = Y0 + r1 * cos(alpha) 164 x2 = X0 + r2 * sin(alpha) 165 y2 = Y0 + r2 * cos(alpha) 166 canvas.create_line(x1,y1,x2,y2, fill = 'blue') 167 # accumulated GC skew 168 r1 = r - 50 169 r2 = r1 - acc 170 x1 = X0 + r1 * sin(alpha) 171 y1 = Y0 + r1 * cos(alpha) 172 x2 = X0 + r2 * sin(alpha) 173 y2 = Y0 + r2 * cos(alpha) 174 canvas.create_line(x1,y1,x2,y2, fill = 'magenta') 175 176 canvas.update() 177 start += window 178 179 canvas.configure(scrollregion = canvas.bbox(ALL))
180
181 -def molecular_weight(seq):
182 """Calculate the molecular weight of a DNA sequence.""" 183 if type(seq) == type(''): seq = Seq(seq, IUPAC.unambiguous_dna) 184 weight_table = IUPACData.unambiguous_dna_weights 185 return sum(weight_table[x] for x in seq)
186
187 -def nt_search(seq, subseq):
188 """Search for a DNA subseq in sequence. 189 190 use ambiguous values (like N = A or T or C or G, R = A or G etc.) 191 searches only on forward strand 192 """ 193 pattern = '' 194 for nt in subseq: 195 value = IUPACData.ambiguous_dna_values[nt] 196 if len(value) == 1: 197 pattern += value 198 else: 199 pattern += '[%s]' % value 200 201 pos = -1 202 result = [pattern] 203 l = len(seq) 204 while True: 205 pos+=1 206 s = seq[pos:] 207 m = re.search(pattern, s) 208 if not m: break 209 pos += int(m.start(0)) 210 result.append(pos) 211 return result
212 213 # }}} 214 215 ###################################### 216 # Protein 217 ###################### 218 # {{{ 219 220 # temporary hack for exception free translation of "dirty" DNA 221 # should be moved to ??? 222
223 -class ProteinX(Alphabet.ProteinAlphabet):
224 """Variant of the extended IUPAC extended protein alphabet (DEPRECATED).""" 225 letters = IUPACData.extended_protein_letters + "X"
226 227 #Can't add a deprecation warning to the class due to the following line: 228 proteinX = ProteinX() 229
230 -class MissingTable:
231 - def __init__(self, table):
232 self._table = table 233 import warnings 234 warnings.warn("Function Bio.SeqUtils.makeTableX() and related classes ProteinX " 235 "and MissingTable are deprecated.", DeprecationWarning)
236 - def get(self, codon, stop_symbol):
237 try: 238 return self._table.get(codon, stop_symbol) 239 except CodonTable.TranslationError: 240 return 'X'
241
242 -def makeTableX(table):
243 assert table.protein_alphabet == IUPAC.extended_protein 244 return CodonTable.CodonTable(table.nucleotide_alphabet, proteinX, 245 MissingTable(table.forward_table), 246 table.back_table, table.start_codons, 247 table.stop_codons)
248 249 # end of hacks 250
251 -def seq3(seq):
252 """Turn a one letter code protein sequence into one with three letter codes. 253 254 The single input argument 'seq' should be a protein sequence using single 255 letter codes, either as a python string or as a Seq or MutableSeq object. 256 257 This function returns the amino acid sequence as a string using the three 258 letter amino acid codes. Output follows the IUPAC standard (including 259 ambiguous characters B for "Asx", J for "Xle" and X for "Xaa", and also U 260 for "Sel" and O for "Pyl") plus "Ter" for a terminator given as an asterisk. Any unknown 261 character (including possible gap characters), is changed into 'Xaa'. 262 263 e.g. 264 >>> from Bio.SeqUtils import seq3 265 >>> seq3("MAIVMGRWKGAR*") 266 'MetAlaIleValMetGlyArgTrpLysGlyAlaArgTer' 267 268 This function was inspired by BioPerl's seq3. 269 """ 270 threecode = {'A':'Ala', 'B':'Asx', 'C':'Cys', 'D':'Asp', 271 'E':'Glu', 'F':'Phe', 'G':'Gly', 'H':'His', 272 'I':'Ile', 'K':'Lys', 'L':'Leu', 'M':'Met', 273 'N':'Asn', 'P':'Pro', 'Q':'Gln', 'R':'Arg', 274 'S':'Ser', 'T':'Thr', 'V':'Val', 'W':'Trp', 275 'Y':'Tyr', 'Z':'Glx', 'X':'Xaa', '*':'Ter', 276 'U':'Sel', 'O':'Pyl', 'J':'Xle', 277 } 278 #We use a default of 'Xaa' for undefined letters 279 #Note this will map '-' to 'Xaa' which may be undesirable! 280 return ''.join([threecode.get(aa,'Xaa') for aa in seq])
281 282 283 # }}} 284 285 ###################################### 286 # Mixed ??? 287 ###################### 288 # {{{ 289
290 -def GC_Frame(seq, genetic_code = 1):
291 """Just an alias for six_frame_translations (OBSOLETE). 292 293 Use six_frame_translation directly, as this function may be deprecated 294 in a future release.""" 295 return six_frame_translations(seq, genetic_code)
296
297 -def six_frame_translations(seq, genetic_code = 1):
298 """Formatted string showing the 6 frame translations and GC content. 299 300 nice looking 6 frame translation with GC content - code from xbbtools 301 similar to DNA Striders six-frame translation 302 303 e.g. 304 from Bio.SeqUtils import six_frame_translations 305 print six_frame_translations("AUGGCCAUUGUAAUGGGCCGCUGA") 306 """ 307 from Bio.Seq import reverse_complement, translate 308 anti = reverse_complement(seq) 309 comp = anti[::-1] 310 length = len(seq) 311 frames = {} 312 for i in range(0,3): 313 frames[i+1] = translate(seq[i:], genetic_code) 314 frames[-(i+1)] = reverse(translate(anti[i:], genetic_code)) 315 316 # create header 317 if length > 20: 318 short = '%s ... %s' % (seq[:10], seq[-10:]) 319 else: 320 short = seq 321 #TODO? Remove the date as this would spoil any unit test... 322 date = time.strftime('%y %b %d, %X', time.localtime(time.time())) 323 header = 'GC_Frame: %s, ' % date 324 for nt in ['a','t','g','c']: 325 header += '%s:%d ' % (nt, seq.count(nt.upper())) 326 327 header += '\nSequence: %s, %d nt, %0.2f %%GC\n\n\n' % (short.lower(),length, GC(seq)) 328 res = header 329 330 for i in range(0,length,60): 331 subseq = seq[i:i+60] 332 csubseq = comp[i:i+60] 333 p = i/3 334 res = res + '%d/%d\n' % (i+1, i/3+1) 335 res = res + ' ' + ' '.join(map(None,frames[3][p:p+20])) + '\n' 336 res = res + ' ' + ' '.join(map(None,frames[2][p:p+20])) + '\n' 337 res = res + ' '.join(map(None,frames[1][p:p+20])) + '\n' 338 # seq 339 res = res + subseq.lower() + '%5d %%\n' % int(GC(subseq)) 340 res = res + csubseq.lower() + '\n' 341 # - frames 342 res = res + ' '.join(map(None,frames[-2][p:p+20])) +' \n' 343 res = res + ' ' + ' '.join(map(None,frames[-1][p:p+20])) + '\n' 344 res = res + ' ' + ' '.join(map(None,frames[-3][p:p+20])) + '\n\n' 345 return res
346 347 # }}} 348 349 ###################################### 350 # FASTA file utilities 351 ###################### 352 # {{{ 353
354 -def fasta_uniqids(file):
355 """Checks and changes the name/ID's to be unique identifiers by adding numbers (OBSOLETE). 356 357 file - a FASTA format filename to read in. 358 359 No return value, the output is written to screen. 360 """ 361 dict = {} 362 txt = open(file).read() 363 entries = [] 364 for entry in txt.split('>')[1:]: 365 name, seq= entry.split('\n',1) 366 name = name.split()[0].split(',')[0] 367 368 if name in dict: 369 n = 1 370 while 1: 371 n = n + 1 372 _name = name + str(n) 373 if _name not in dict: 374 name = _name 375 break 376 377 dict[name] = seq 378 379 for name, seq in dict.items(): 380 print '>%s\n%s' % (name, seq)
381
382 -def quick_FASTA_reader(file):
383 """Simple FASTA reader, returning a list of string tuples. 384 385 The single argument 'file' should be the filename of a FASTA format file. 386 This function will open and read in the entire file, constructing a list 387 of all the records, each held as a tuple of strings (the sequence name or 388 title, and its sequence). 389 390 This function was originally intended for use on large files, where its 391 low overhead makes it very fast. However, because it returns the data as 392 a single in memory list, this can require a lot of RAM on large files. 393 394 You are generally encouraged to use Bio.SeqIO.parse(handle, "fasta") which 395 allows you to iterate over the records one by one (avoiding having all the 396 records in memory at once). Using Bio.SeqIO also makes it easy to switch 397 between different input file formats. However, please note that rather 398 than simple strings, Bio.SeqIO uses SeqRecord objects for each record. 399 """ 400 #Want to split on "\n>" not just ">" in case there are any extra ">" 401 #in the name/description. So, in order to make sure we also split on 402 #the first entry, prepend a "\n" to the start of the file. 403 handle = open(file) 404 txt = "\n" + handle.read() 405 handle.close() 406 entries = [] 407 for entry in txt.split('\n>')[1:]: 408 name,seq= entry.split('\n',1) 409 seq = seq.replace('\n','').replace(' ','').upper() 410 entries.append((name, seq)) 411 return entries
412
413 -def apply_on_multi_fasta(file, function, *args):
414 """Apply a function on each sequence in a multiple FASTA file (OBSOLETE). 415 416 file - filename of a FASTA format file 417 function - the function you wish to invoke on each record 418 *args - any extra arguments you want passed to the function 419 420 This function will iterate over each record in a FASTA file as SeqRecord 421 objects, calling your function with the record (and supplied args) as 422 arguments. 423 424 This function returns a list. For those records where your function 425 returns a value, this is taken as a sequence and used to construct a 426 FASTA format string. If your function never has a return value, this 427 means apply_on_multi_fasta will return an empty list. 428 """ 429 try: 430 f = globals()[function] 431 except: 432 raise NotImplementedError("%s not implemented" % function) 433 434 handle = open(file, 'r') 435 records = SeqIO.parse(handle, "fasta") 436 results = [] 437 for record in records: 438 arguments = [record.sequence] 439 for arg in args: arguments.append(arg) 440 result = f(*arguments) 441 if result: 442 results.append('>%s\n%s' % (record.name, result)) 443 handle.close() 444 return results
445
446 -def quicker_apply_on_multi_fasta(file, function, *args):
447 """Apply a function on each sequence in a multiple FASTA file (OBSOLETE). 448 449 file - filename of a FASTA format file 450 function - the function you wish to invoke on each record 451 *args - any extra arguments you want passed to the function 452 453 This function will use quick_FASTA_reader to load every record in the 454 FASTA file into memory as a list of tuples. For each record, it will 455 call your supplied function with the record as a tuple of the name and 456 sequence as strings (plus any supplied args). 457 458 This function returns a list. For those records where your function 459 returns a value, this is taken as a sequence and used to construct a 460 FASTA format string. If your function never has a return value, this 461 means quicker_apply_on_multi_fasta will return an empty list. 462 """ 463 try: 464 f = globals()[function] 465 except: 466 raise NotImplementedError("%s not implemented" % function) 467 468 entries = quick_FASTA_reader(file) 469 results = [] 470 for name, seq in entries: 471 arguments = [seq] 472 for arg in args: arguments.append(arg) 473 result = f(*arguments) 474 if result: 475 results.append('>%s\n%s' % (name, result)) 476 file.close() 477 return results
478 479 # }}} 480 481 ###################################### 482 # Main 483 ##################### 484 # {{{ 485 486 if __name__ == '__main__': 487 import sys, getopt 488 # crude command line options to use most functions directly on a FASTA file 489 options = {'apply_on_multi_fasta':0, 490 'quick':0, 491 'uniq_ids':0, 492 } 493 494 optlist, args = getopt.getopt(sys.argv[1:], '', ['describe', 'apply_on_multi_fasta=', 495 'help', 'quick', 'uniq_ids', 'search=']) 496 for arg in optlist: 497 if arg[0] in ['-h', '--help']: 498 pass 499 elif arg[0] in ['--describe']: 500 # get all new functions from this file 501 mol_funcs = [x[0] for x in locals().items() if type(x[1]) == type(GC)] 502 mol_funcs.sort() 503 print 'available functions:' 504 for f in mol_funcs: print '\t--%s' % f 505 print '\n\ne.g.\n./sequtils.py --apply_on_multi_fasta GC test.fas' 506 507 sys.exit(0) 508 elif arg[0] in ['--apply_on_multi_fasta']: 509 options['apply_on_multi_fasta'] = arg[1] 510 elif arg[0] in ['--search']: 511 options['search'] = arg[1] 512 else: 513 key = re.search('-*(.+)', arg[0]).group(1) 514 options[key] = 1 515 516 517 if options.get('apply_on_multi_fasta'): 518 file = args[0] 519 function = options['apply_on_multi_fasta'] 520 arguments = [] 521 if options.get('search'): 522 arguments = options['search'] 523 if function == 'xGC_skew': 524 arguments = 1000 525 if options.get('quick'): 526 results = quicker_apply_on_multi_fasta(file, function, arguments) 527 else: 528 results = apply_on_multi_fasta(file, function, arguments) 529 for result in results: print result 530 531 elif options.get('uniq_ids'): 532 file = args[0] 533 fasta_uniqids(file) 534 535 # }}} 536