1
2
3
4
5
6
7
8 """Functions to calculate assorted sequence checksums."""
9
10
11
12
13 from binascii import crc32 as _crc32
14
def crc32(seq):
    """Return the CRC-32 checksum (int) of a sequence.

    ``seq`` may be a plain string, a bytes object, or any object that
    exposes a ``tostring()`` method returning its string form (for example
    a Seq object).

    Text is encoded to bytes before being hashed because
    ``binascii.crc32`` only accepts bytes-like input on Python 3.  For the
    ASCII alphabet used by sequences the encoded bytes are identical to the
    original characters, so the checksum value itself is unchanged.
    """
    try:
        # Seq-like objects hand over their string form via tostring().
        data = seq.tostring()
    except AttributeError:
        # Plain strings / bytes have no tostring(); use them as they are.
        data = seq
    if not isinstance(data, bytes):
        data = data.encode("utf-8")
    return _crc32(data)
23
25 _table_h = []
26 for i in range(256):
27 l = i
28 part_h = 0
29 for j in range(8):
30 rflag = l & 1
31 l >>= 1
32 if part_h & 1: l |= (1L << 31)
33 part_h >>= 1L
34 if rflag: part_h ^= 0xd8000000L
35 _table_h.append(part_h)
36 return _table_h
37
38
39 _table_h = _init_table_h()
40
def crc64(s):
    """Return the CRC-64 checksum of a sequence as a string.

    ``s`` is iterated one character at a time, so it may be a plain string
    or a Seq-like object that yields single characters.

    The 64-bit register is kept as two 32-bit halves (``crch`` high,
    ``crcl`` low).  The result is the text ``"CRC-"`` followed by the high
    and low halves, each formatted as eight upper-case hexadecimal digits.
    """
    crcl = 0
    crch = 0
    for c in s:
        # Low byte of the high half moves into the top byte of the low half
        # as the whole register is shifted right by eight bits.
        shr = (crch & 0xFF) << 24
        temp1h = crch >> 8
        temp1l = (crcl >> 8) | shr
        # Table index: low byte of the register combined with the character.
        idx = (crcl ^ ord(c)) & 0xFF
        crch = temp1h ^ _table_h[idx]
        crcl = temp1l

    return "CRC-%08X%08X" % (crch, crcl)
54
55
def gcg(seq):
    """Return the GCG checksum (int) for a sequence (string or Seq object).

    Given a nucleotide or amino-acid sequence (or any string), returns the
    GCG checksum, an integer in the range 0-9999.  This is the checksum
    used by the GCG program.

    Every character is converted to upper case, multiplied by a weight that
    cycles 1, 2, ..., 57, 1, 2, ... along the sequence, and the products
    are summed modulo 10000.

    Objects exposing a ``tostring()`` method (such as Seq objects) are
    converted with it first; anything else is iterated directly.

    Based on BioPerl GCG_checksum. Adapted by Sebastian Bassi
    with the help of John Lenton, Pablo Ziliani, and Gabriel Genellina.
    """
    try:
        seq = seq.tostring()
    except AttributeError:
        # Plain strings have no tostring(); iterate them as they are.
        pass
    # position % 57 + 1 reproduces a counter that runs 1..57 and restarts.
    checksum = sum(
        (position % 57 + 1) * ord(char.upper())
        for position, char in enumerate(seq)
    )
    return checksum % 10000
73
def seguid(seq):
    """Return the SEGUID (string) for a sequence (string or Seq object).

    Given a nucleotide or amino-acid sequence (or any string), returns the
    SEGUID string (A SEquence Globally Unique IDentifier): the SHA-1 digest
    of the upper-cased sequence, base64 encoded, with newlines and trailing
    ``=`` padding removed.

    Objects exposing a ``tostring()`` method (such as Seq objects) are
    converted with it first.  Text is encoded to bytes before hashing, and
    the result is always returned as a native ``str``.

    For more information about SEGUID, see:
    http://bioinformatics.anl.gov/seguid/
    DOI: 10.1002/pmic.200600032
    """
    import base64
    import hashlib

    try:
        seq = seq.tostring()
    except AttributeError:
        # Plain strings have no tostring(); use them as they are.
        pass
    data = seq.upper()
    if not isinstance(data, bytes):
        # hashlib only accepts bytes-like input on Python 3.
        data = data.encode("utf-8")

    encoded = base64.b64encode(hashlib.sha1(data).digest())
    if not isinstance(encoded, str):
        # b64encode returns bytes on Python 3; callers expect text.
        encoded = encoded.decode("ascii")
    return encoded.replace("\n", "").rstrip("=")
107
if __name__ == "__main__":
    # print is written in call form so the self test parses on both
    # Python 2 and Python 3.
    print("Quick self test")

    str_light_chain_one = (
        "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK"
        "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY"
        "YCSSYAGSSTLVFGGGTKLTVL"
    )

    str_light_chain_two = (
        "QSALTQPASVSGSPGQSITISCTGTSSDVGSYNLVSWYQQHPGK"
        "APKLMIYEGSKRPSGVSNRFSGSKSGNTASLTISGLQAEDEADY"
        "YCCSYAGSSTWVFGGGTKLTVL"
    )

    # The two chains differ, yet the self test expects the same CRC-64 for
    # both (a checksum collision) ...
    assert crc64(str_light_chain_one) == crc64(str_light_chain_two)
    assert 'CRC-44CAAD88706CC153' == crc64(str_light_chain_one)

    # ... while their SEGUID values are expected to be distinct.
    assert 'BpBeDdcNUYNsdk46JoJdw7Pd3BI' == seguid(str_light_chain_one)
    assert 'X5XEaayob1nZLOc7eVT9qyczarY' == seguid(str_light_chain_two)

    print("Done")
126