Package Bio :: Package PopGen :: Package GenePop :: Module FileParser
[hide private]
[frames] | no frames]

Source Code for Module Bio.PopGen.GenePop.FileParser

  1  # Copyright 2010 by Tiago Antao.  All rights reserved. 
  2  # This code is part of the Biopython distribution and governed by its 
  3  # license.  Please see the LICENSE file that should have been included 
  4  # as part of this package. 
  5   
  6  """ 
  7  This class provides code to parse BIG GenePop files. 
  8   
  9  The difference between this class and the standard Bio.PopGen.GenePop.Record 
 10  class is that this one does not read the whole file to memory. 
 11  It provides an iterator interface, slower but consuming much less memory. 
 12  Should be used with big files (Thousands of markers and individuals). 
 13   
 14  See http://wbiomed.curtin.edu.au/genepop/ , the format is documented 
 15  here: http://wbiomed.curtin.edu.au/genepop/help_input.html . 
 16   
 17  Classes: 
 18  FileRecord           Holds GenePop data. 
 19   
 20  Functions: 
 21   
 22   
 23  """ 
 24  from copy import deepcopy 
 25  from Bio.PopGen.GenePop import get_indiv 
 26   
def read(fname):
    """Open a GenePop file for incremental parsing.

    fname is the name of a file that contains a GenePop record.

    Returns a FileRecord built from that file name.
    """
    return FileRecord(fname)
34 35
class FileRecord:
    """Holds information from a GenePop record, reading the file lazily.

    Only the header (comment line and loci names) is read when the object
    is created; individuals are then read one at a time from the open file.

    Members:
    comment_line    Comment line (first line of the file).

    loci_list       List of loci names.

    fname           Name of the file being parsed.

    current_pop     Index of the population being read (0 is the first).

    current_ind     Number of individuals already read in the current
                    population.

    Note that no marker length is stored on the record: __str__ always
    writes alleles with 3 digits.

    Functions:
    get_individual  Returns the next individual of the current population.

    skip_population Skips the current population.

    skip_population skips the individuals of the current population, returns
    True if there are more populations.

    get_individual returns an individual of the current population, True
    if the current population ended and another one follows, or False at
    the end of the file.
    Each individual is a pair composed by individual
    name and a list of alleles (2 per marker or 1 for haploid data).
    Examples
            ('Ind1', [(1,2),    (3,3), (200,201)]
            ('Ind2', [(2,None), (3,3), (None,None)]
            ('Other1', [(1,1),  (4,3), (200,200)]
    """

    def __init__(self, fname):
        """Stores the file name and reads the header of the file.

        fname - name of a file that contains a GenePop record.

        Raises ValueError (from start_read) if no population marker is found.
        """
        self.comment_line = ""
        self.loci_list = []
        self.fname = fname
        self.start_read()

    def __str__(self):
        """Returns (reconstructs) a GenePop textual representation.

        This might take a lot of memory.
        Marker length will be 3.

        The file is re-read from the start; the reading position that was
        current before the call is restored before returning.
        """
        marker_len = 3
        rep = [self.comment_line + '\n']
        rep.append('\n'.join(self.loci_list) + '\n')
        #Remember where we were, as the whole file is going to be re-read
        current_pop = self.current_pop
        current_ind = self.current_ind
        self._handle.close()
        self._handle = open(self.fname)
        self.skip_header()
        rep.append('Pop\n')
        while True:
            res = self.get_individual()
            if res is True:
                #End of a population, another one follows
                rep.append('Pop\n')
            elif res is False:
                #End of file
                break
            else:
                name, markers = res
                rep.append(name)
                rep.append(',')
                for marker in markers:
                    rep.append(' ')
                    for al in marker:
                        if al is None:
                            #Missing data is written as zero
                            al = '0'
                        rep.append(str(al).rjust(marker_len, '0'))
                rep.append('\n')
        self.seek_position(current_pop, current_ind)
        return "".join(rep)

    def start_read(self):
        """Starts parsing a file containing a GenePop file.

        Opens the file, reads the comment line and the loci names, and
        leaves the handle positioned just after the first population marker.

        Raises ValueError if the file has no population marker (this
        includes empty or truncated files); the handle is closed first.
        """
        self._handle = open(self.fname)
        #next() with a default keeps a truncated file from leaking
        #StopIteration; the missing POP marker is reported below instead
        self.comment_line = next(self._handle, "").rstrip()
        #We can now have one loci per line or all loci in a single line
        #separated by either space or comma+space...
        #We will remove all commas on loci... that should not be a problem
        sample_loci_line = next(self._handle, "").rstrip().replace(',', '')
        self.loci_list.extend(sample_loci_line.split(' '))
        for line in self._handle:
            line = line.rstrip()
            if line.upper() == 'POP':
                break
            self.loci_list.append(line)
        else:
            #Do not keep the file open if the record is unusable
            self._handle.close()
            raise ValueError('No population data found, file probably not GenePop related')
        self.current_pop = 0
        self.current_ind = 0

    def skip_header(self):
        """Skips the Header. To be done after a re-open.

        Resets the population and individual counters and consumes lines
        up to and including the first population marker.
        """
        self.current_pop = 0
        self.current_ind = 0
        for line in self._handle:
            if line.rstrip().upper() == "POP":
                return

    def seek_position(self, pop, indiv):
        """Seeks a certain position in the file.

        The file is re-opened and read from the start up to the position.

        pop   - pop position (0 is first)
        indiv - individual in pop
        """
        self._handle.close()
        self._handle = open(self.fname)
        self.skip_header()
        for _ in range(pop):
            self.skip_population()
        for _ in range(indiv):
            self.get_individual()

    def skip_population(self):
        """Skips the current population.

        Returns True if there is another pop (the position is then at its
        first individual), False if the end of the file was reached.
        """
        for line in self._handle:
            if line.rstrip().upper() == 'POP':
                self.current_pop += 1
                self.current_ind = 0
                return True
        return False

    def get_individual(self):
        """Gets the next individual.

        Returns individual information if there are more individuals
        in the current population.
        Returns True if there are no more individuals in the current
        population, but there are more populations. Next read will
        be of the following pop.
        Returns False if at end of file.
        """
        line = next(self._handle, None)
        if line is None:
            return False
        line = line.rstrip()
        if line.upper() == 'POP':
            self.current_pop += 1
            self.current_ind = 0
            return True
        self.current_ind += 1
        #get_indiv returns three values, the third one is not used here
        indiv_name, allele_list, ignore = get_indiv(line)
        return (indiv_name, allele_list)

    def remove_population(self, pos):
        """Removes a population (by position).

        Not implemented: this is a placeholder that does nothing.

        pos - position
        """
        pass

    def remove_locus_by_position(self, pos, fw):
        """Removes a locus by position.

        Not implemented: this is a placeholder that does nothing.

        pos - position
        fw  - A file handle (write enabled) to write the new record
        """
        pass

    def remove_locus_by_name(self, name, fw):
        """Removes a locus by name.

        Delegates to remove_locus_by_position for the first locus with
        the given name. Nothing is done (and nothing is reported) if no
        locus has that name.

        name - name
        fw   - A file handle (write enabled) to write the new record
        """
        for i, locus in enumerate(self.loci_list):
            if locus == name:
                self.remove_locus_by_position(i, fw)
                return