Package Bio :: Module pairwise2
[hide private]
[frames] | no frames]

Source Code for Module Bio.pairwise2

  1  # Copyright 2002 by Jeffrey Chang.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """This package implements pairwise sequence alignment using a dynamic 
  7  programming algorithm. 
  8   
  9  This provides functions to get global and local alignments between two 
 10  sequences.  A global alignment finds the best concordance between all 
 11  characters in two sequences.  A local alignment finds just the 
 12  subsequences that align the best. 
 13   
 14  When doing alignments, you can specify the match score and gap 
 15  penalties.  The match score indicates the compatibility between an 
 16  alignment of two characters in the sequences.  Highly compatible 
 17  characters should be given positive scores, and incompatible ones 
 18  should be given negative scores or 0.  The gap penalties should be 
 19  negative. 
 20   
 21  The names of the alignment functions in this module follow the 
 22  convention 
 23  <alignment type>XX 
 24  where <alignment type> is either "global" or "local" and XX is a 2 
 25  character code indicating the parameters it takes.  The first 
 26  character indicates the parameters for matches (and mismatches), and 
 27  the second indicates the parameters for gap penalties. 
 28   
 29  The match parameters are 
 30  CODE  DESCRIPTION 
 31  x     No parameters.  Identical characters have score of 1, otherwise 0. 
 32  m     A match score is the score of identical chars, otherwise mismatch score. 
 33  d     A dictionary returns the score of any pair of characters. 
 34  c     A callback function returns scores. 
 35   
 36  The gap penalty parameters are 
 37  CODE  DESCRIPTION 
 38  x     No gap penalties. 
 39  s     Same open and extend gap penalties for both sequences. 
 40  d     The sequences have different open and extend gap penalties. 
 41  c     A callback function returns the gap penalties. 
 42   
 43  All the different alignment functions are contained in an object 
 44  "align".  For example: 
 45   
 46      >>> from Bio import pairwise2 
 47      >>> alignments = pairwise2.align.globalxx("ACCGT", "ACG") 
 48   
 49  will return a list of the alignments between the two strings.  The 
 50  parameters of the alignment function depends on the function called. 
 51  Some examples: 
 52   
 53  >>> pairwise2.align.globalxx("ACCGT", "ACG") 
 54      # Find the best global alignment between the two sequences. 
 55      # Identical characters are given 1 point.  No points are deducted 
 56      # for mismatches or gaps. 
 57       
 58  >>> pairwise2.align.localxx("ACCGT", "ACG") 
 59      # Same thing as before, but with a local alignment. 
 60       
 61  >>> pairwise2.align.globalmx("ACCGT", "ACG", 2, -1) 
 62      # Do a global alignment.  Identical characters are given 2 points, 
 63      # 1 point is deducted for each non-identical character. 
 64   
 65  >>> pairwise2.align.globalms("ACCGT", "ACG", 2, -1, -.5, -.1) 
 66      # Same as above, except now 0.5 points are deducted when opening a 
 67      # gap, and 0.1 points are deducted when extending it. 
 68   
 69   
 70  To see a description of the parameters for a function, please look at 
 71  the docstring for the function. 
 72   
 73  >>> print newalign.align.localds.__doc__ 
 74  localds(sequenceA, sequenceB, match_dict, open, extend) -> alignments 
 75   
 76  """ 
 77  # The alignment functions take some undocumented keyword parameters: 
 78  # - penalize_extend_when_opening: boolean 
 79  #   Whether to count an extension penalty when opening a gap.  If 
 80  #   false, a gap of 1 is only penalize an "open" penalty, otherwise it 
 81  #   is penalized "open+extend". 
 82  # - penalize_end_gaps: boolean 
 83  #   Whether to count the gaps at the ends of an alignment.  By 
 84  #   default, they are counted for global alignments but not for local 
 85  #   ones. 
 86  # - gap_char: string 
 87  #   Which character to use as a gap character in the alignment 
 88  #   returned.  By default, uses '-'. 
 89  # - force_generic: boolean 
 90  #   Always use the generic, non-cached, dynamic programming function. 
 91  #   For debugging. 
 92  # - score_only: boolean 
 93  #   Only get the best score, don't recover any alignments.  The return 
 94  #   value of the function is the score. 
 95  # - one_alignment_only: boolean 
 96  #   Only recover one alignment. 
 97   
 98  from types import * 
 99   
100  MAX_ALIGNMENTS = 1000   # maximum alignments recovered in traceback 
101   
102 -class align:
103 """This class provides functions that do alignments.""" 104
105 - class alignment_function:
106 """This class is callable impersonates an alignment function. 107 The constructor takes the name of the function. This class 108 will decode the name of the function to figure out how to 109 interpret the parameters. 110 111 """ 112 # match code -> tuple of (parameters, docstring) 113 match2args = { 114 'x' : ([], ''), 115 'm' : (['match', 'mismatch'], 116 """match is the score to given to identical characters. mismatch is 117 the score given to non-identical ones."""), 118 'd' : (['match_dict'], 119 """match_dict is a dictionary where the keys are tuples of pairs of 120 characters and the values are the scores, e.g. ("A", "C") : 2.5."""), 121 'c' : (['match_fn'], 122 """match_fn is a callback function that takes two characters and 123 returns the score between them."""), 124 } 125 # penalty code -> tuple of (parameters, docstring) 126 penalty2args = { 127 'x' : ([], ''), 128 's' : (['open', 'extend'], 129 """open and extend are the gap penalties when a gap is opened and 130 extended. They should be negative."""), 131 'd' : (['openA', 'extendA', 'openB', 'extendB'], 132 """openA and extendA are the gap penalties for sequenceA, and openB 133 and extendB for sequeneB. The penalties should be negative."""), 134 'c' : (['gap_A_fn', 'gap_B_fn'], 135 """gap_A_fn and gap_B_fn are callback functions that takes 1) the 136 index where the gap is opened, and 2) the length of the gap. They 137 should return a gap penalty."""), 138 } 139
140 - def __init__(self, name):
141 # Check to make sure the name of the function is 142 # reasonable. 143 if name.startswith("global"): 144 if len(name) != 8: 145 raise AttributeError("function should be globalXX") 146 elif name.startswith("local"): 147 if len(name) != 7: 148 raise AttributeError("function should be localXX") 149 else: 150 raise AttributeError(name) 151 align_type, match_type, penalty_type = \ 152 name[:-2], name[-2], name[-1] 153 try: 154 match_args, match_doc = self.match2args[match_type] 155 except KeyError, x: 156 raise AttributeError("unknown match type %r" % match_type) 157 try: 158 penalty_args, penalty_doc = self.penalty2args[penalty_type] 159 except KeyError, x: 160 raise AttributeError("unknown penalty type %r" % penalty_type) 161 162 # Now get the names of the parameters to this function. 163 param_names = ['sequenceA', 'sequenceB'] 164 param_names.extend(match_args) 165 param_names.extend(penalty_args) 166 self.function_name = name 167 self.align_type = align_type 168 self.param_names = param_names 169 170 self.__name__ = self.function_name 171 # Set the doc string. 172 doc = "%s(%s) -> alignments\n" % ( 173 self.__name__, ', '.join(self.param_names)) 174 if match_doc: 175 doc += "\n%s\n" % match_doc 176 if penalty_doc: 177 doc += "\n%s\n" % penalty_doc 178 doc += ( 179 """\nalignments is a list of tuples (seqA, seqB, score, begin, end). 180 seqA and seqB are strings showing the alignment between the 181 sequences. score is the score of the alignment. begin and end 182 are indexes into seqA and seqB that indicate the where the 183 alignment occurs. 184 """) 185 self.__doc__ = doc
186
187 - def decode(self, *args, **keywds):
188 # Decode the arguments for the _align function. keywds 189 # will get passed to it, so translate the arguments to 190 # this function into forms appropriate for _align. 191 keywds = keywds.copy() 192 if len(args) != len(self.param_names): 193 raise TypeError("%s takes exactly %d argument (%d given)" \ 194 % (self.function_name, len(self.param_names), len(args))) 195 i = 0 196 while i < len(self.param_names): 197 if self.param_names[i] in [ 198 'sequenceA', 'sequenceB', 199 'gap_A_fn', 'gap_B_fn', 'match_fn']: 200 keywds[self.param_names[i]] = args[i] 201 i += 1 202 elif self.param_names[i] == 'match': 203 assert self.param_names[i+1] == 'mismatch' 204 match, mismatch = args[i], args[i+1] 205 keywds['match_fn'] = identity_match(match, mismatch) 206 i += 2 207 elif self.param_names[i] == 'match_dict': 208 keywds['match_fn'] = dictionary_match(args[i]) 209 i += 1 210 elif self.param_names[i] == 'open': 211 assert self.param_names[i+1] == 'extend' 212 open, extend = args[i], args[i+1] 213 pe = keywds.get('penalize_extend_when_opening', 0) 214 keywds['gap_A_fn'] = affine_penalty(open, extend, pe) 215 keywds['gap_B_fn'] = affine_penalty(open, extend, pe) 216 i += 2 217 elif self.param_names[i] == 'openA': 218 assert self.param_names[i+3] == 'extendB' 219 openA, extendA, openB, extendB = args[i:i+4] 220 pe = keywds.get('penalize_extend_when_opening', 0) 221 keywds['gap_A_fn'] = affine_penalty(openA, extendA, pe) 222 keywds['gap_B_fn'] = affine_penalty(openB, extendB, pe) 223 i += 4 224 else: 225 raise ValueError("unknown parameter %r" \ 226 % self.param_names[i]) 227 228 # Here are the default parameters for _align. Assign 229 # these to keywds, unless already specified. 230 pe = keywds.get('penalize_extend_when_opening', 0) 231 default_params = [ 232 ('match_fn', identity_match(1, 0)), 233 ('gap_A_fn', affine_penalty(0, 0, pe)), 234 ('gap_B_fn', affine_penalty(0, 0, pe)), 235 ('penalize_extend_when_opening', 0), 236 ('penalize_end_gaps', self.align_type == 'global'), 237 ('align_globally', self.align_type == 'global'), 238 ('gap_char', '-'), 239 ('force_generic', 0), 240 ('score_only', 0), 241 ('one_alignment_only', 0) 242 ] 243 for name, default in default_params: 244 keywds[name] = keywds.get(name, default) 245 return keywds
246
247 - def __call__(self, *args, **keywds):
248 keywds = self.decode(*args, **keywds) 249 return _align(**keywds)
250
251 - def __getattr__(self, attr):
252 return self.alignment_function(attr)
253 align = align() 254 255
256 -def _align(sequenceA, sequenceB, match_fn, gap_A_fn, gap_B_fn, 257 penalize_extend_when_opening, penalize_end_gaps, 258 align_globally, gap_char, force_generic, score_only, 259 one_alignment_only):
260 if not sequenceA or not sequenceB: 261 return [] 262 263 if (not force_generic) and \ 264 type(gap_A_fn) is InstanceType and \ 265 gap_A_fn.__class__ is affine_penalty and \ 266 type(gap_B_fn) is InstanceType and \ 267 gap_B_fn.__class__ is affine_penalty: 268 open_A, extend_A = gap_A_fn.open, gap_A_fn.extend 269 open_B, extend_B = gap_B_fn.open, gap_B_fn.extend 270 x = _make_score_matrix_fast( 271 sequenceA, sequenceB, match_fn, open_A, extend_A, open_B, extend_B, 272 penalize_extend_when_opening, penalize_end_gaps, align_globally, 273 score_only) 274 else: 275 x = _make_score_matrix_generic( 276 sequenceA, sequenceB, match_fn, gap_A_fn, gap_B_fn, 277 penalize_extend_when_opening, penalize_end_gaps, align_globally, 278 score_only) 279 score_matrix, trace_matrix = x 280 281 #print "SCORE"; print_matrix(score_matrix) 282 #print "TRACEBACK"; print_matrix(trace_matrix) 283 284 # Look for the proper starting point. Get a list of all possible 285 # starting points. 286 starts = _find_start( 287 score_matrix, sequenceA, sequenceB, 288 gap_A_fn, gap_B_fn, penalize_end_gaps, align_globally) 289 # Find the highest score. 290 best_score = max([x[0] for x in starts]) 291 292 # If they only want the score, then return it. 293 if score_only: 294 return best_score 295 296 tolerance = 0 # XXX do anything with this? 297 # Now find all the positions within some tolerance of the best 298 # score. 299 i = 0 300 while i < len(starts): 301 score, pos = starts[i] 302 if rint(abs(score-best_score)) > rint(tolerance): 303 del starts[i] 304 else: 305 i += 1 306 307 # Recover the alignments and return them. 308 x = _recover_alignments( 309 sequenceA, sequenceB, starts, score_matrix, trace_matrix, 310 align_globally, penalize_end_gaps, gap_char, one_alignment_only) 311 return x
312
313 -def _make_score_matrix_generic( 314 sequenceA, sequenceB, match_fn, gap_A_fn, gap_B_fn, 315 penalize_extend_when_opening, penalize_end_gaps, align_globally, 316 score_only):
317 # This is an implementation of the Needleman-Wunsch dynamic 318 # programming algorithm for aligning sequences. 319 320 # Create the score and traceback matrices. These should be in the 321 # shape: 322 # sequenceA (down) x sequenceB (across) 323 lenA, lenB = len(sequenceA), len(sequenceB) 324 score_matrix, trace_matrix = [], [] 325 for i in range(lenA): 326 score_matrix.append([None] * lenB) 327 trace_matrix.append([[None]] * lenB) 328 329 # The top and left borders of the matrices are special cases 330 # because there are no previously aligned characters. To simplify 331 # the main loop, handle these separately. 332 for i in range(lenA): 333 # Align the first residue in sequenceB to the ith residue in 334 # sequence A. This is like opening up i gaps at the beginning 335 # of sequence B. 336 score = match_fn(sequenceA[i], sequenceB[0]) 337 if penalize_end_gaps: 338 score += gap_B_fn(0, i) 339 score_matrix[i][0] = score 340 for i in range(1, lenB): 341 score = match_fn(sequenceA[0], sequenceB[i]) 342 if penalize_end_gaps: 343 score += gap_A_fn(0, i) 344 score_matrix[0][i] = score 345 346 # Fill in the score matrix. Each position in the matrix 347 # represents an alignment between a character from sequenceA to 348 # one in sequence B. As I iterate through the matrix, find the 349 # alignment by choose the best of: 350 # 1) extending a previous alignment without gaps 351 # 2) adding a gap in sequenceA 352 # 3) adding a gap in sequenceB 353 for row in range(1, lenA): 354 for col in range(1, lenB): 355 # First, calculate the score that would occur by extending 356 # the alignment without gaps. 357 best_score = score_matrix[row-1][col-1] 358 best_score_rint = rint(best_score) 359 best_indexes = [(row-1, col-1)] 360 361 # Try to find a better score by opening gaps in sequenceA. 362 # Do this by checking alignments from each column in the 363 # previous row. Each column represents a different 364 # character to align from, and thus a different length 365 # gap. 366 for i in range(0, col-1): 367 score = score_matrix[row-1][i] + gap_A_fn(i, col-1-i) 368 score_rint = rint(score) 369 if score_rint == best_score_rint: 370 best_score, best_score_rint = score, score_rint 371 best_indexes.append((row-1, i)) 372 elif score_rint > best_score_rint: 373 best_score, best_score_rint = score, score_rint 374 best_indexes = [(row-1, i)] 375 376 # Try to find a better score by opening gaps in sequenceB. 377 for i in range(0, row-1): 378 score = score_matrix[i][col-1] + gap_B_fn(i, row-1-i) 379 score_rint = rint(score) 380 if score_rint == best_score_rint: 381 best_score, best_score_rint = score, score_rint 382 best_indexes.append((i, col-1)) 383 elif score_rint > best_score_rint: 384 best_score, best_score_rint = score, score_rint 385 best_indexes = [(i, col-1)] 386 387 score_matrix[row][col] = best_score + \ 388 match_fn(sequenceA[row], sequenceB[col]) 389 if not align_globally and score_matrix[row][col] < 0: 390 score_matrix[row][col] = 0 391 trace_matrix[row][col] = best_indexes 392 return score_matrix, trace_matrix
393
394 -def _make_score_matrix_fast( 395 sequenceA, sequenceB, match_fn, open_A, extend_A, open_B, extend_B, 396 penalize_extend_when_opening, penalize_end_gaps, 397 align_globally, score_only):
398 first_A_gap = calc_affine_penalty(1, open_A, extend_A, 399 penalize_extend_when_opening) 400 first_B_gap = calc_affine_penalty(1, open_B, extend_B, 401 penalize_extend_when_opening) 402 403 # Create the score and traceback matrices. These should be in the 404 # shape: 405 # sequenceA (down) x sequenceB (across) 406 lenA, lenB = len(sequenceA), len(sequenceB) 407 score_matrix, trace_matrix = [], [] 408 for i in range(lenA): 409 score_matrix.append([None] * lenB) 410 trace_matrix.append([[None]] * lenB) 411 412 # The top and left borders of the matrices are special cases 413 # because there are no previously aligned characters. To simplify 414 # the main loop, handle these separately. 415 for i in range(lenA): 416 # Align the first residue in sequenceB to the ith residue in 417 # sequence A. This is like opening up i gaps at the beginning 418 # of sequence B. 419 score = match_fn(sequenceA[i], sequenceB[0]) 420 if penalize_end_gaps: 421 score += calc_affine_penalty( 422 i, open_B, extend_B, penalize_extend_when_opening) 423 score_matrix[i][0] = score 424 for i in range(1, lenB): 425 score = match_fn(sequenceA[0], sequenceB[i]) 426 if penalize_end_gaps: 427 score += calc_affine_penalty( 428 i, open_A, extend_A, penalize_extend_when_opening) 429 score_matrix[0][i] = score 430 431 # In the generic algorithm, at each row and column in the score 432 # matrix, we had to scan all previous rows and columns to see 433 # whether opening a gap might yield a higher score. Here, since 434 # we know the penalties are affine, we can cache just the best 435 # score in the previous rows and columns. Instead of scanning 436 # through all the previous rows and cols, we can just look at the 437 # cache for the best one. Whenever the row or col increments, the 438 # best cached score just decreases by extending the gap longer. 439 440 # The best score and indexes for each row (goes down all columns). 441 # I don't need to store the last row because it's the end of the 442 # sequence. 443 row_cache_score, row_cache_index = [None]*(lenA-1), [None]*(lenA-1) 444 # The best score and indexes for each column (goes across rows). 445 col_cache_score, col_cache_index = [None]*(lenB-1), [None]*(lenB-1) 446 447 for i in range(lenA-1): 448 # Initialize each row to be the alignment of sequenceA[i] to 449 # sequenceB[0], plus opening a gap in sequenceA. 450 row_cache_score[i] = score_matrix[i][0] + first_A_gap 451 row_cache_index[i] = [(i, 0)] 452 for i in range(lenB-1): 453 col_cache_score[i] = score_matrix[0][i] + first_B_gap 454 col_cache_index[i] = [(0, i)] 455 456 # Fill in the score_matrix. 457 for row in range(1, lenA): 458 for col in range(1, lenB): 459 # Calculate the score that would occur by extending the 460 # alignment without gaps. 461 nogap_score = score_matrix[row-1][col-1] 462 463 # Check the score that would occur if there were a gap in 464 # sequence A. 465 if col > 1: 466 row_score = row_cache_score[row-1] 467 else: 468 row_score = nogap_score - 1 # Make sure it's not the best. 469 # Check the score that would occur if there were a gap in 470 # sequence B. 471 if row > 1: 472 col_score = col_cache_score[col-1] 473 else: 474 col_score = nogap_score - 1 475 476 best_score = max(nogap_score, row_score, col_score) 477 best_score_rint = rint(best_score) 478 best_index = [] 479 if best_score_rint == rint(nogap_score): 480 best_index.append((row-1, col-1)) 481 if best_score_rint == rint(row_score): 482 best_index.extend(row_cache_index[row-1]) 483 if best_score_rint == rint(col_score): 484 best_index.extend(col_cache_index[col-1]) 485 486 # Set the score and traceback matrices. 487 score = best_score + match_fn(sequenceA[row], sequenceB[col]) 488 if not align_globally and score < 0: 489 score_matrix[row][col] = 0 490 else: 491 score_matrix[row][col] = score 492 trace_matrix[row][col] = best_index 493 494 # Update the cached column scores. The best score for 495 # this can come from either extending the gap in the 496 # previous cached score, or opening a new gap from the 497 # most previously seen character. Compare the two scores 498 # and keep the best one. 499 open_score = score_matrix[row-1][col-1] + first_B_gap 500 extend_score = col_cache_score[col-1] + extend_B 501 open_score_rint, extend_score_rint = \ 502 rint(open_score), rint(extend_score) 503 if open_score_rint > extend_score_rint: 504 col_cache_score[col-1] = open_score 505 col_cache_index[col-1] = [(row-1, col-1)] 506 elif extend_score_rint > open_score_rint: 507 col_cache_score[col-1] = extend_score 508 else: 509 col_cache_score[col-1] = open_score 510 if (row-1, col-1) not in col_cache_index[col-1]: 511 col_cache_index[col-1] = col_cache_index[col-1] + \ 512 [(row-1, col-1)] 513 514 # Update the cached row scores. 515 open_score = score_matrix[row-1][col-1] + first_A_gap 516 extend_score = row_cache_score[row-1] + extend_A 517 open_score_rint, extend_score_rint = \ 518 rint(open_score), rint(extend_score) 519 if open_score_rint > extend_score_rint: 520 row_cache_score[row-1] = open_score 521 row_cache_index[row-1] = [(row-1, col-1)] 522 elif extend_score_rint > open_score_rint: 523 row_cache_score[row-1] = extend_score 524 else: 525 row_cache_score[row-1] = open_score 526 if (row-1, col-1) not in row_cache_index[row-1]: 527 row_cache_index[row-1] = row_cache_index[row-1] + \ 528 [(row-1, col-1)] 529 530 return score_matrix, trace_matrix
531
532 -def _recover_alignments(sequenceA, sequenceB, starts, 533 score_matrix, trace_matrix, align_globally, 534 penalize_end_gaps, gap_char, one_alignment_only):
535 # Recover the alignments by following the traceback matrix. This 536 # is a recursive procedure, but it's implemented here iteratively 537 # with a stack. 538 lenA, lenB = len(sequenceA), len(sequenceB) 539 tracebacks = [] # list of (seq1, seq2, score, begin, end) 540 in_process = [] # list of ([same as tracebacks], prev_pos, next_pos) 541 542 # sequenceA and sequenceB may be sequences, including strings, 543 # lists, or list-like objects. In order to preserve the type of 544 # the object, we need to use slices on the sequences instead of 545 # indexes. For example, sequenceA[row] may return a type that's 546 # not compatible with sequenceA, e.g. if sequenceA is a list and 547 # sequenceA[row] is a string. Thus, avoid using indexes and use 548 # slices, e.g. sequenceA[row:row+1]. Assume that client-defined 549 # sequence classes preserve these semantics. 550 551 # Initialize the in_process stack 552 for score, (row, col) in starts: 553 if align_globally: 554 begin, end = None, None 555 else: 556 begin, end = None, -max(lenA-row, lenB-col)+1 557 if not end: 558 end = None 559 # Initialize the in_process list with empty sequences of the 560 # same type as sequenceA. To do this, take empty slices of 561 # the sequences. 562 in_process.append( 563 (sequenceA[0:0], sequenceB[0:0], score, begin, end, 564 (lenA, lenB), (row, col))) 565 if one_alignment_only: 566 break 567 while in_process and len(tracebacks) < MAX_ALIGNMENTS: 568 seqA, seqB, score, begin, end, prev_pos, next_pos = in_process.pop() 569 prevA, prevB = prev_pos 570 if next_pos is None: 571 prevlen = len(seqA) 572 # add the rest of the sequences 573 seqA = sequenceA[:prevA] + seqA 574 seqB = sequenceB[:prevB] + seqB 575 # add the rest of the gaps 576 seqA, seqB = _lpad_until_equal(seqA, seqB, gap_char) 577 578 # Now make sure begin is set. 579 if begin is None: 580 if align_globally: 581 begin = 0 582 else: 583 begin = len(seqA) - prevlen 584 tracebacks.append((seqA, seqB, score, begin, end)) 585 else: 586 nextA, nextB = next_pos 587 nseqA, nseqB = prevA-nextA, prevB-nextB 588 maxseq = max(nseqA, nseqB) 589 ngapA, ngapB = maxseq-nseqA, maxseq-nseqB 590 seqA = sequenceA[nextA:nextA+nseqA] + gap_char*ngapA + seqA 591 seqB = sequenceB[nextB:nextB+nseqB] + gap_char*ngapB + seqB 592 prev_pos = next_pos 593 # local alignment stops early if score falls < 0 594 if not align_globally and score_matrix[nextA][nextB] <= 0: 595 begin = max(prevA, prevB) 596 in_process.append( 597 (seqA, seqB, score, begin, end, prev_pos, None)) 598 else: 599 for next_pos in trace_matrix[nextA][nextB]: 600 in_process.append( 601 (seqA, seqB, score, begin, end, prev_pos, next_pos)) 602 if one_alignment_only: 603 break 604 605 return _clean_alignments(tracebacks)
606
607 -def _find_start(score_matrix, sequenceA, sequenceB, gap_A_fn, gap_B_fn, 608 penalize_end_gaps, align_globally):
609 # Return a list of (score, (row, col)) indicating every possible 610 # place to start the tracebacks. 611 if align_globally: 612 if penalize_end_gaps: 613 starts = _find_global_start( 614 sequenceA, sequenceB, score_matrix, gap_A_fn, gap_B_fn, 1) 615 else: 616 starts = _find_global_start( 617 sequenceA, sequenceB, score_matrix, None, None, 0) 618 else: 619 starts = _find_local_start(score_matrix) 620 return starts
621
622 -def _find_global_start(sequenceA, sequenceB, 623 score_matrix, gap_A_fn, gap_B_fn, penalize_end_gaps):
624 # The whole sequence should be aligned, so return the positions at 625 # the end of either one of the sequences. 626 nrows, ncols = len(score_matrix), len(score_matrix[0]) 627 positions = [] 628 # Search all rows in the last column. 629 for row in range(nrows): 630 # Find the score, penalizing end gaps if necessary. 631 score = score_matrix[row][ncols-1] 632 if penalize_end_gaps: 633 score += gap_B_fn(ncols, nrows-row-1) 634 positions.append((score, (row, ncols-1))) 635 # Search all columns in the last row. 636 for col in range(ncols-1): 637 score = score_matrix[nrows-1][col] 638 if penalize_end_gaps: 639 score += gap_A_fn(nrows, ncols-col-1) 640 positions.append((score, (nrows-1, col))) 641 return positions
642
643 -def _find_local_start(score_matrix):
644 # Return every position in the matrix. 645 positions = [] 646 nrows, ncols = len(score_matrix), len(score_matrix[0]) 647 for row in range(nrows): 648 for col in range(ncols): 649 score = score_matrix[row][col] 650 positions.append((score, (row, col))) 651 return positions
652
653 -def _clean_alignments(alignments):
654 # Take a list of alignments and return a cleaned version. Remove 655 # duplicates, make sure begin and end are set correctly, remove 656 # empty alignments. 657 unique_alignments = [] 658 for align in alignments : 659 if align not in unique_alignments : 660 unique_alignments.append(align) 661 i = 0 662 while i < len(unique_alignments): 663 seqA, seqB, score, begin, end = unique_alignments[i] 664 # Make sure end is set reasonably. 665 if end is None: # global alignment 666 end = len(seqA) 667 elif end < 0: 668 end = end + len(seqA) 669 # If there's no alignment here, get rid of it. 670 if begin >= end: 671 del unique_alignments[i] 672 continue 673 unique_alignments[i] = seqA, seqB, score, begin, end 674 i += 1 675 return unique_alignments
676
677 -def _pad_until_equal(s1, s2, char):
678 # Add char to the end of s1 or s2 until they are equal length. 679 ls1, ls2 = len(s1), len(s2) 680 if ls1 < ls2: 681 s1 = _pad(s1, char, ls2-ls1) 682 elif ls2 < ls1: 683 s2 = _pad(s2, char, ls1-ls2) 684 return s1, s2
685
686 -def _lpad_until_equal(s1, s2, char):
687 # Add char to the beginning of s1 or s2 until they are equal 688 # length. 689 ls1, ls2 = len(s1), len(s2) 690 if ls1 < ls2: 691 s1 = _lpad(s1, char, ls2-ls1) 692 elif ls2 < ls1: 693 s2 = _lpad(s2, char, ls1-ls2) 694 return s1, s2
695
696 -def _pad(s, char, n):
697 # Append n chars to the end of s. 698 return s + char*n
699
700 -def _lpad(s, char, n):
701 # Prepend n chars to the beginning of s. 702 return char*n + s
703 704 _PRECISION = 1000
705 -def rint(x, precision=_PRECISION):
706 return int(x * precision + 0.5)
707
708 -class identity_match:
709 """identity_match([match][, mismatch]) -> match_fn 710 711 Create a match function for use in an alignment. match and 712 mismatch are the scores to give when two residues are equal or 713 unequal. By default, match is 1 and mismatch is 0. 714 715 """
716 - def __init__(self, match=1, mismatch=0):
717 self.match = match 718 self.mismatch = mismatch
719 - def __call__(self, charA, charB):
720 if charA == charB: 721 return self.match 722 return self.mismatch
723
724 -class dictionary_match:
725 """dictionary_match(score_dict[, symmetric]) -> match_fn 726 727 Create a match function for use in an alignment. score_dict is a 728 dictionary where the keys are tuples (residue 1, residue 2) and 729 the values are the match scores between those residues. symmetric 730 is a flag that indicates whether the scores are symmetric. If 731 true, then if (res 1, res 2) doesn't exist, I will use the score 732 at (res 2, res 1). 733 734 """
735 - def __init__(self, score_dict, symmetric=1):
736 self.score_dict = score_dict 737 self.symmetric = symmetric
738 - def __call__(self, charA, charB):
739 if self.symmetric and (charA, charB) not in self.score_dict: 740 # If the score dictionary is symmetric, then look up the 741 # score both ways. 742 charB, charA = charA, charB 743 return self.score_dict[(charA, charB)]
744
745 -class affine_penalty:
746 """affine_penalty(open, extend[, penalize_extend_when_opening]) -> gap_fn 747 748 Create a gap function for use in an alignment. 749 750 """
751 - def __init__(self, open, extend, penalize_extend_when_opening=0):
752 if open > 0 or extend > 0: 753 raise ValueError("Gap penalties should be non-positive.") 754 self.open, self.extend = open, extend 755 self.penalize_extend_when_opening = penalize_extend_when_opening
756 - def __call__(self, index, length):
757 return calc_affine_penalty( 758 length, self.open, self.extend, self.penalize_extend_when_opening)
759
760 -def calc_affine_penalty(length, open, extend, penalize_extend_when_opening):
761 if length <= 0: 762 return 0 763 penalty = open + extend * length 764 if not penalize_extend_when_opening: 765 penalty -= extend 766 return penalty
767 785
786 -def format_alignment(align1, align2, score, begin, end):
787 """format_alignment(align1, align2, score, begin, end) -> string 788 789 Format the alignment prettily into a string. 790 791 """ 792 s = [] 793 s.append("%s\n" % align1) 794 s.append("%s%s\n" % (" "*begin, "|"*(end-begin))) 795 s.append("%s\n" % align2) 796 s.append(" Score=%g\n" % score) 797 return ''.join(s)
798 799 800 # Try and load C implementations of functions. If I can't, 801 # then just ignore and use the pure python implementations. 802 try: 803 import cpairwise2 804 except ImportError: 805 pass 806 else: 807 import sys 808 this_module = sys.modules[__name__] 809 for name in cpairwise2.__dict__.keys(): 810 if not name.startswith("__"): 811 this_module.__dict__[name] = cpairwise2.__dict__[name] 812