Package Bio :: Package Align :: Module AlignInfo
[hide private]
[frames] | no frames]

Source Code for Module Bio.Align.AlignInfo

  1  """Extract information from alignment objects. 
  2   
  3  In order to try and avoid huge alignment objects with tons of functions, 
  4  functions which return summary type information about alignments should 
  5  be put into classes in this module. 
  6   
  7  classes: 
  8  o SummaryInfo 
  9  o PSSM 
 10  """ 
 11   
 12  # standard library 
 13  import math 
 14  import sys 
 15   
 16  # biopython modules 
 17  from Bio import Alphabet 
 18  from Bio.Alphabet import IUPAC 
 19  from Bio.Seq import Seq 
 20  from Bio.SubsMat import FreqTable 
 21   
 22  # Expected random distributions for 20-letter protein, and 
 23  # for 4-letter nucleotide alphabets 
 24  Protein20Random = 0.05 
 25  Nucleotide4Random = 0.25 
26 -class SummaryInfo:
27 """Calculate summary info about the alignment. 28 29 This class should be used to caclculate information summarizing the 30 results of an alignment. This may either be straight consensus info 31 or more complicated things. 32 """
33 - def __init__(self, alignment):
34 """Initialize with the alignment to calculate information on. 35 ic_vector attribute. A dictionary. Keys: column numbers. Values: 36 """ 37 self.alignment = alignment 38 self.ic_vector = {}
39
40 - def dumb_consensus(self, threshold = .7, ambiguous = "X", 41 consensus_alpha = None, require_multiple = 0):
42 """Output a fast consensus sequence of the alignment. 43 44 This doesn't do anything fancy at all. It will just go through the 45 sequence residue by residue and count up the number of each type 46 of residue (ie. A or G or T or C for DNA) in all sequences in the 47 alignment. If the percentage of the most common residue type is 48 greater then the passed threshold, then we will add that residue type, 49 otherwise an ambiguous character will be added. 50 51 This could be made a lot fancier (ie. to take a substitution matrix 52 into account), but it just meant for a quick and dirty consensus. 53 54 Arguments: 55 o threshold - The threshold value that is required to add a particular 56 atom. 57 o ambiguous - The ambiguous character to be added when the threshold is 58 not reached. 59 o consensus_alpha - The alphabet to return for the consensus sequence. 60 If this is None, then we will try to guess the alphabet. 61 o require_multiple - If set as 1, this will require that more than 62 1 sequence be part of an alignment to put it in the consensus (ie. 63 not just 1 sequence and gaps). 64 """ 65 # Iddo Friedberg, 1-JUL-2004: changed ambiguous default to "X" 66 consensus = '' 67 68 # find the length of the consensus we are creating 69 con_len = self.alignment.get_alignment_length() 70 71 # go through each seq item 72 for n in range(con_len): 73 # keep track of the counts of the different atoms we get 74 atom_dict = {} 75 num_atoms = 0 76 77 for record in self.alignment._records: 78 # make sure we haven't run past the end of any sequences 79 # if they are of different lengths 80 if n < len(record.seq): 81 if record.seq[n] != '-' and record.seq[n] != '.': 82 if record.seq[n] not in atom_dict.keys(): 83 atom_dict[record.seq[n]] = 1 84 else: 85 atom_dict[record.seq[n]] += 1 86 87 num_atoms = num_atoms + 1 88 89 max_atoms = [] 90 max_size = 0 91 92 for atom in atom_dict.keys(): 93 if atom_dict[atom] > max_size: 94 max_atoms = [atom] 95 max_size = atom_dict[atom] 96 elif atom_dict[atom] == max_size: 97 max_atoms.append(atom) 98 99 if require_multiple and num_atoms == 1: 100 consensus += ambiguous 101 elif (len(max_atoms) == 1) and ((float(max_size)/float(num_atoms)) 102 >= threshold): 103 consensus += max_atoms[0] 104 else: 105 consensus += ambiguous 106 107 # we need to guess a consensus alphabet if one isn't specified 108 if consensus_alpha is None: 109 consensus_alpha = self._guess_consensus_alphabet(ambiguous) 110 111 return Seq(consensus, consensus_alpha)
112
113 - def gap_consensus(self, threshold = .7, ambiguous = "X", 114 consensus_alpha = None, require_multiple = 0):
115 """Same as dumb_consensus(), but allows gap on the output. 116 117 Things to do: Let the user define that with only one gap, the result 118 character in consensus is gap. Let the user select gap character, now 119 it takes the same is input. 120 """ 121 # Iddo Friedberg, 1-JUL-2004: changed ambiguous default to "X" 122 consensus = '' 123 124 # find the length of the consensus we are creating 125 con_len = self.alignment.get_alignment_length() 126 127 # go through each seq item 128 for n in range(con_len): 129 # keep track of the counts of the different atoms we get 130 atom_dict = {} 131 num_atoms = 0 132 133 for record in self.alignment._records: 134 # make sure we haven't run past the end of any sequences 135 # if they are of different lengths 136 if n < len(record.seq): 137 if record.seq[n] not in atom_dict.keys(): 138 atom_dict[record.seq[n]] = 1 139 else: 140 atom_dict[record.seq[n]] += 1 141 142 num_atoms += 1 143 144 max_atoms = [] 145 max_size = 0 146 147 for atom in atom_dict.keys(): 148 if atom_dict[atom] > max_size: 149 max_atoms = [atom] 150 max_size = atom_dict[atom] 151 elif atom_dict[atom] == max_size: 152 max_atoms.append(atom) 153 154 if require_multiple and num_atoms == 1: 155 consensus += ambiguous 156 elif (len(max_atoms) == 1) and ((float(max_size)/float(num_atoms)) 157 >= threshold): 158 consensus += max_atoms[0] 159 else: 160 consensus += ambiguous 161 162 # we need to guess a consensus alphabet if one isn't specified 163 if consensus_alpha is None: 164 #TODO - Should we make this into a Gapped alphabet? 165 consensus_alpha = self._guess_consensus_alphabet(ambiguous) 166 167 return Seq(consensus, consensus_alpha)
168
169 - def _guess_consensus_alphabet(self, ambiguous):
170 """Pick an (ungapped) alphabet for an alignment consesus sequence. 171 172 This just looks at the sequences we have, checks their type, and 173 returns as appropriate type which seems to make sense with the 174 sequences we've got. 175 """ 176 #Start with the (un-gapped version of) the alignment alphabet 177 a = Alphabet._get_base_alphabet(self.alignment._alphabet) 178 179 #Now check its compatible with all the rest of the sequences 180 for record in self.alignment: 181 #Get the (un-gapped version of) the sequence's alphabet 182 alt = Alphabet._get_base_alphabet(record.seq.alphabet) 183 if not isinstance(alt, a.__class__): 184 raise ValueError \ 185 ("Alignment contains a sequence with an incompatible alphabet.") 186 187 #Check the ambiguous character we are going to use in the consensus 188 #is in the alphabet's list of valid letters (if defined). 189 if hasattr(a, "letters") and a.letters is not None \ 190 and ambiguous not in a.letters: 191 #We'll need to pick a more generic alphabet... 192 if isinstance(a, IUPAC.IUPACUnambiguousDNA): 193 if ambiguous in IUPAC.IUPACUnambiguousDNA().letters: 194 a = IUPAC.IUPACUnambiguousDNA() 195 else: 196 a = Alphabet.generic_dna 197 elif isinstance(a, IUPAC.IUPACUnambiguousRNA): 198 if ambiguous in IUPAC.IUPACUnambiguousRNA().letters: 199 a = IUPAC.IUPACUnambiguousRNA() 200 else: 201 a = Alphabet.generic_rna 202 elif isinstance(a, IUPAC.IUPACProtein): 203 if ambiguous in IUPAC.ExtendedIUPACProtein().letters: 204 a = IUPAC.ExtendedIUPACProtein() 205 else: 206 a = Alphabet.generic_protein 207 else: 208 a = Alphabet.single_letter_alphabet 209 return a
210
211 - def replacement_dictionary(self, skip_chars = []):
212 """Generate a replacement dictionary to plug into a substitution matrix 213 214 This should look at an alignment, and be able to generate the number 215 of substitutions of different residues for each other in the 216 aligned object. 217 218 Will then return a dictionary with this information: 219 {('A', 'C') : 10, ('C', 'A') : 12, ('G', 'C') : 15 ....} 220 221 This also treats weighted sequences. The following example shows how 222 we calculate the replacement dictionary. Given the following 223 multiple sequence alignments: 224 225 GTATC 0.5 226 AT--C 0.8 227 CTGTC 1.0 228 229 For the first column we have: 230 ('A', 'G') : 0.5 * 0.8 = 0.4 231 ('C', 'G') : 0.5 * 1.0 = 0.5 232 ('A', 'C') : 0.8 * 1.0 = 0.8 233 234 We then continue this for all of the columns in the alignment, summing 235 the information for each substitution in each column, until we end 236 up with the replacement dictionary. 237 238 Arguments: 239 o skip_chars - A list of characters to skip when creating the dictionary. 240 For instance, you might have Xs (screened stuff) or Ns, and not want 241 to include the ambiguity characters in the dictionary. 242 """ 243 # get a starting dictionary based on the alphabet of the alignment 244 rep_dict, skip_items = self._get_base_replacements(skip_chars) 245 246 # iterate through each record 247 for rec_num1 in range(len(self.alignment._records)): 248 # iterate through each record from one beyond the current record 249 # to the end of the list of records 250 for rec_num2 in range(rec_num1 + 1, len(self.alignment._records)): 251 # for each pair of records, compare the sequences and add 252 # the pertinent info to the dictionary 253 rep_dict = self._pair_replacement( 254 self.alignment._records[rec_num1].seq, 255 self.alignment._records[rec_num2].seq, 256 self.alignment._records[rec_num1].annotations.get('weight',1.0), 257 self.alignment._records[rec_num2].annotations.get('weight',1.0), 258 rep_dict, skip_items) 259 260 return rep_dict
261
262 - def _pair_replacement(self, seq1, seq2, weight1, weight2, 263 start_dict, ignore_chars):
264 """Compare two sequences and generate info on the replacements seen. 265 266 Arguments: 267 o seq1, seq2 - The two sequences to compare. 268 o weight1, weight2 - The relative weights of seq1 and seq2. 269 o start_dict - The dictionary containing the starting replacement 270 info that we will modify. 271 o ignore_chars - A list of characters to ignore when calculating 272 replacements (ie. '-'). 273 274 Returns: 275 o A replacment dictionary which is modified from initial_dict with 276 the information from the sequence comparison. 277 """ 278 # loop through each residue in the sequences 279 for residue_num in range(len(seq1)): 280 residue1 = seq1[residue_num] 281 try: 282 residue2 = seq2[residue_num] 283 # if seq2 is shorter, then we just stop looking at replacements 284 # and return the information 285 except IndexError: 286 return start_dict 287 288 # if the two residues are characters we want to count 289 if (residue1 not in ignore_chars) and (residue2 not in ignore_chars): 290 try: 291 # add info about the replacement to the dictionary, 292 # modified by the sequence weights 293 start_dict[(residue1, residue2)] += weight1 * weight2 294 295 # if we get a key error, then we've got a problem with alphabets 296 except KeyError: 297 raise ValueError("Residues %s, %s not found in alphabet %s" 298 % (residue1, residue2, 299 self.alignment._alphabet)) 300 301 return start_dict
302 303
304 - def _get_all_letters(self):
305 """Returns a string containing the expected letters in the alignment.""" 306 all_letters = self.alignment._alphabet.letters 307 if all_letters is None \ 308 or (isinstance(self.alignment._alphabet, Alphabet.Gapped) \ 309 and all_letters == self.alignment._alphabet.gap_char): 310 #We are dealing with a generic alphabet class where the 311 #letters are not defined! We must build a list of the 312 #letters used... 313 set_letters = set() 314 for record in self.alignment: 315 #Note the built in set does not have a union_update 316 #which was provided by the sets module's Set 317 set_letters = set_letters.union(record.seq) 318 list_letters = list(set_letters) 319 list_letters.sort() 320 all_letters = "".join(list_letters) 321 return all_letters
322
323 - def _get_base_replacements(self, skip_items = []):
324 """Get a zeroed dictonary of all possible letter combinations. 325 326 This looks at the type of alphabet and gets the letters for it. 327 It then creates a dictionary with all possible combinations of these 328 letters as keys (ie. ('A', 'G')) and sets the values as zero. 329 330 Returns: 331 o The base dictionary created 332 o A list of alphabet items to skip when filling the dictionary.Right 333 now the only thing I can imagine in this list is gap characters, but 334 maybe X's or something else might be useful later. This will also 335 include any characters that are specified to be skipped. 336 """ 337 base_dictionary = {} 338 all_letters = self._get_all_letters() 339 340 # if we have a gapped alphabet we need to find the gap character 341 # and drop it out 342 if isinstance(self.alignment._alphabet, Alphabet.Gapped): 343 skip_items.append(self.alignment._alphabet.gap_char) 344 all_letters = all_letters.replace(self.alignment._alphabet.gap_char,'') 345 346 # now create the dictionary 347 for first_letter in all_letters: 348 for second_letter in all_letters: 349 if (first_letter not in skip_items and 350 second_letter not in skip_items): 351 base_dictionary[(first_letter, second_letter)] = 0 352 353 return base_dictionary, skip_items
354 355
356 - def pos_specific_score_matrix(self, axis_seq = None, 357 chars_to_ignore = []):
358 """Create a position specific score matrix object for the alignment. 359 360 This creates a position specific score matrix (pssm) which is an 361 alternative method to look at a consensus sequence. 362 363 Arguments: 364 o chars_to_ignore - A listing of all characters not to include in 365 the pssm. If the alignment alphabet declares a gap character, 366 then it will be excluded automatically. 367 o axis_seq - An optional argument specifying the sequence to 368 put on the axis of the PSSM. This should be a Seq object. If nothing 369 is specified, the consensus sequence, calculated with default 370 parameters, will be used. 371 372 Returns: 373 o A PSSM (position specific score matrix) object. 374 """ 375 # determine all of the letters we have to deal with 376 all_letters = self._get_all_letters() 377 assert all_letters 378 379 if not isinstance(chars_to_ignore, list): 380 raise TypeError("chars_to_ignore should be a list.") 381 382 # if we have a gap char, add it to stuff to ignore 383 if isinstance(self.alignment._alphabet, Alphabet.Gapped): 384 chars_to_ignore.append(self.alignment._alphabet.gap_char) 385 386 for char in chars_to_ignore: 387 all_letters = all_letters.replace(char, '') 388 389 if axis_seq: 390 left_seq = axis_seq 391 assert len(axis_seq) == self.alignment.get_alignment_length() 392 else: 393 left_seq = self.dumb_consensus() 394 395 pssm_info = [] 396 # now start looping through all of the sequences and getting info 397 for residue_num in range(len(left_seq)): 398 score_dict = self._get_base_letters(all_letters) 399 for record in self.alignment._records: 400 try: 401 this_residue = record.seq[residue_num] 402 # if we hit an index error we've run out of sequence and 403 # should not add new residues 404 except IndexError: 405 this_residue = None 406 407 if this_residue and this_residue not in chars_to_ignore: 408 weight = record.annotations.get('weight', 1.0) 409 try: 410 score_dict[this_residue] += weight 411 # if we get a KeyError then we have an alphabet problem 412 except KeyError: 413 raise ValueError("Residue %s not found in alphabet %s" 414 % (this_residue, 415 self.alignment._alphabet)) 416 417 pssm_info.append((left_seq[residue_num], 418 score_dict)) 419 420 421 return PSSM(pssm_info)
422
423 - def _get_base_letters(self, letters):
424 """Create a zeroed dictionary with all of the specified letters. 425 """ 426 base_info = {} 427 for letter in letters: 428 base_info[letter] = 0 429 430 return base_info
431
432 - def information_content(self, start = 0, 433 end = None, 434 e_freq_table = None, log_base = 2, 435 chars_to_ignore = []):
436 """Calculate the information content for each residue along an alignment. 437 438 Arguments: 439 o start, end - The starting an ending points to calculate the 440 information content. These points should be relative to the first 441 sequence in the alignment, starting at zero (ie. even if the 'real' 442 first position in the seq is 203 in the initial sequence, for 443 the info content, we need to use zero). This defaults to the entire 444 length of the first sequence. 445 o e_freq_table - A FreqTable object specifying the expected frequencies 446 for each letter in the alphabet we are using (e.g. {'G' : 0.4, 447 'C' : 0.4, 'T' : 0.1, 'A' : 0.1}). Gap characters should not be 448 included, since these should not have expected frequencies. 449 o log_base - The base of the logathrim to use in calculating the 450 information content. This defaults to 2 so the info is in bits. 451 o chars_to_ignore - A listing of characterw which should be ignored 452 in calculating the info content. 453 454 Returns: 455 o A number representing the info content for the specified region. 456 457 Please see the Biopython manual for more information on how information 458 content is calculated. 459 """ 460 # if no end was specified, then we default to the end of the sequence 461 if end is None: 462 end = len(self.alignment._records[0].seq) 463 464 if start < 0 or end > len(self.alignment._records[0].seq): 465 raise ValueError \ 466 ("Start (%s) and end (%s) are not in the range %s to %s" 467 % (start, end, 0, len(self.alignment._records[0].seq))) 468 # determine random expected frequencies, if necessary 469 random_expected = None 470 if not e_freq_table: 471 #TODO - What about ambiguous alphabets? 472 base_alpha = Alphabet._get_base_alphabet(self.alignment._alphabet) 473 if isinstance(base_alpha, Alphabet.ProteinAlphabet): 474 random_expected = Protein20Random 475 elif isinstance(base_alpha, Alphabet.NucleotideAlphabet): 476 random_expected = Nucleotide4Random 477 else: 478 errstr = "Error in alphabet: not Nucleotide or Protein, " 479 errstr += "supply expected frequencies" 480 raise ValueError(errstr) 481 del base_alpha 482 elif not isinstance(e_freq_table, FreqTable.FreqTable): 483 raise ValueError("e_freq_table should be a FreqTable object") 484 485 486 # determine all of the letters we have to deal with 487 all_letters = self._get_all_letters() 488 for char in chars_to_ignore: 489 all_letters = all_letters.replace(char, '') 490 491 info_content = {} 492 for residue_num in range(start, end): 493 freq_dict = self._get_letter_freqs(residue_num, 494 self.alignment._records, 495 all_letters, chars_to_ignore) 496 # print freq_dict, 497 column_score = self._get_column_info_content(freq_dict, 498 e_freq_table, 499 log_base, 500 random_expected) 501 502 info_content[residue_num] = column_score 503 # sum up the score 504 total_info = 0 505 for column_info in info_content.values(): 506 total_info += column_info 507 # fill in the ic_vector member: holds IC for each column 508 for i in info_content.keys(): 509 self.ic_vector[i] = info_content[i] 510 return total_info
511
512 - def _get_letter_freqs(self, residue_num, all_records, letters, to_ignore):
513 """Determine the frequency of specific letters in the alignment. 514 515 Arguments: 516 o residue_num - The number of the column we are getting frequencies 517 from. 518 o all_records - All of the SeqRecords in the alignment. 519 o letters - The letters we are interested in getting the frequency 520 for. 521 o to_ignore - Letters we are specifically supposed to ignore. 522 523 This will calculate the frequencies of each of the specified letters 524 in the alignment at the given frequency, and return this as a 525 dictionary where the keys are the letters and the values are the 526 frequencies. 527 """ 528 freq_info = self._get_base_letters(letters) 529 530 total_count = 0 531 # collect the count info into the dictionary for all the records 532 for record in all_records: 533 try: 534 if record.seq[residue_num] not in to_ignore: 535 weight = record.annotations.get('weight',1.0) 536 freq_info[record.seq[residue_num]] += weight 537 total_count += weight 538 # getting a key error means we've got a problem with the alphabet 539 except KeyError: 540 raise ValueError("Residue %s not found in alphabet %s" 541 % (record.seq[residue_num], 542 self.alignment._alphabet)) 543 544 if total_count == 0: 545 # This column must be entirely ignored characters 546 for letter in freq_info.keys(): 547 assert freq_info[letter] == 0 548 #TODO - Map this to NA or NaN? 549 else: 550 # now convert the counts into frequencies 551 for letter in freq_info.keys(): 552 freq_info[letter] = freq_info[letter] / total_count 553 554 return freq_info
555
556 - def _get_column_info_content(self, obs_freq, e_freq_table, log_base, 557 random_expected):
558 """Calculate the information content for a column. 559 560 Arguments: 561 o obs_freq - The frequencies observed for each letter in the column. 562 o e_freq_table - An optional argument specifying the expected 563 frequencies for each letter. This is a SubsMat.FreqTable instance. 564 o log_base - The base of the logathrim to use in calculating the 565 info content. 566 """ 567 try: 568 gap_char = self.alignment._alphabet.gap_char 569 except AttributeError: 570 #The alphabet doesn't declare a gap - there could be none 571 #in the sequence... or just a vague alphabet. 572 gap_char = "-" #Safe? 573 574 if e_freq_table: 575 if not isinstance(e_freq_table, FreqTable.FreqTable): 576 raise ValueError("e_freq_table should be a FreqTable object") 577 # check the expected freq information to make sure it is good 578 for key in obs_freq.keys(): 579 if (key != gap_char and key not in e_freq_table): 580 raise ValueError("Expected frequency letters %s " 581 "do not match observed %s" \ 582 % (e_freq_table.keys(), 583 obs_freq.keys() - [gap_char])) 584 585 total_info = 0.0 586 587 for letter in obs_freq.keys(): 588 inner_log = 0.0 589 # if we have expected frequencies, modify the log value by them 590 # gap characters do not have expected frequencies, so they 591 # should just be the observed frequency. 592 if letter != gap_char: 593 if e_freq_table: 594 inner_log = obs_freq[letter] / e_freq_table[letter] 595 else: 596 inner_log = obs_freq[letter] / random_expected 597 # if the observed frequency is zero, we don't add any info to the 598 # total information content 599 if inner_log > 0: 600 letter_info = (obs_freq[letter] * 601 math.log(inner_log) / math.log(log_base)) 602 total_info += letter_info 603 return total_info
604
605 - def get_column(self,col):
606 return self.alignment.get_column(col)
607
608 -class PSSM:
609 """Represent a position specific score matrix. 610 611 This class is meant to make it easy to access the info within a PSSM 612 and also make it easy to print out the information in a nice table. 613 614 Let's say you had an alignment like this: 615 GTATC 616 AT--C 617 CTGTC 618 619 The position specific score matrix (when printed) looks like: 620 621 G A T C 622 G 1 1 0 1 623 T 0 0 3 0 624 A 1 1 0 0 625 T 0 0 2 0 626 C 0 0 0 3 627 628 You can access a single element of the PSSM using the following: 629 630 your_pssm[sequence_number][residue_count_name] 631 632 For instance, to get the 'T' residue for the second element in the 633 above alignment you would need to do: 634 635 your_pssm[1]['T'] 636 """
637 - def __init__(self, pssm):
638 """Initialize with pssm data to represent. 639 640 The pssm passed should be a list with the following structure: 641 642 list[0] - The letter of the residue being represented (for instance, 643 from the example above, the first few list[0]s would be GTAT... 644 list[1] - A dictionary with the letter substitutions and counts. 645 """ 646 self.pssm = pssm
647
648 - def __getitem__(self, pos):
649 return self.pssm[pos][1]
650
651 - def __str__(self):
652 out = " " 653 all_residues = self.pssm[0][1].keys() 654 all_residues.sort() 655 656 # first print out the top header 657 for res in all_residues: 658 out += " %s" % res 659 out += "\n" 660 661 # for each item, write out the substitutions 662 for item in self.pssm: 663 out += "%s " % item[0] 664 for res in all_residues: 665 out += " %.1f" % item[1][res] 666 667 out += "\n" 668 return out
669
670 - def get_residue(self, pos):
671 """Return the residue letter at the specified position. 672 """ 673 return self.pssm[pos][0]
674 675 688 689 if __name__ == "__main__": 690 print "Quick test" 691 from Bio import AlignIO 692 from Bio.Align.Generic import Alignment 693 694 filename = "../../Tests/GFF/multi.fna" 695 format = "fasta" 696 expected = FreqTable.FreqTable({"A":0.25,"G":0.25,"T":0.25,"C":0.25}, 697 FreqTable.FREQ, 698 IUPAC.unambiguous_dna) 699 700 alignment = AlignIO.read(open(filename), format) 701 for record in alignment: 702 print record.seq.tostring() 703 print "="*alignment.get_alignment_length() 704 705 summary = SummaryInfo(alignment) 706 consensus = summary.dumb_consensus(ambiguous="N") 707 print consensus 708 consensus = summary.gap_consensus(ambiguous="N") 709 print consensus 710 print 711 print summary.pos_specific_score_matrix(chars_to_ignore=['-'], 712 axis_seq=consensus) 713 print 714 #Have a generic alphabet, without a declared gap char, so must tell 715 #provide the frequencies and chars to ignore explicitly. 716 print summary.information_content(e_freq_table=expected, 717 chars_to_ignore=['-']) 718 print 719 print "Trying a protein sequence with gaps and stops" 720 721 alpha = Alphabet.HasStopCodon(Alphabet.Gapped(Alphabet.generic_protein, "-"), "*") 722 a = Alignment(alpha) 723 a.add_sequence("ID001", "MHQAIFIYQIGYP*LKSGYIQSIRSPEYDNW-") 724 a.add_sequence("ID002", "MH--IFIYQIGYAYLKSGYIQSIRSPEY-NW*") 725 a.add_sequence("ID003", "MHQAIFIYQIGYPYLKSGYIQSIRSPEYDNW*") 726 print a 727 print "="*a.get_alignment_length() 728 729 s = SummaryInfo(a) 730 c = s.dumb_consensus(ambiguous="X") 731 print c 732 c = s.gap_consensus(ambiguous="X") 733 print c 734 print 735 print s.pos_specific_score_matrix(chars_to_ignore=['-', '*'], axis_seq=c) 736 737 print s.information_content(chars_to_ignore=['-', '*']) 738 739 740 print "Done" 741