1
2
3
4
5
6
7
8
9
10
11
12
13 """Parse Unigene flat file format files such as the Hs.data file.
14
15 Here is an overview of the flat file format that this parser deals with:
16 Line types/qualifiers:
17
18 ID UniGene cluster ID
19 TITLE Title for the cluster
20 GENE Gene symbol
21 CYTOBAND Cytological band
22 EXPRESS Tissues of origin for ESTs in cluster
23 RESTR_EXPR Single tissue or development stage contributes
24 more than half the total EST frequency for this gene.
25 GNM_TERMINUS genomic confirmation of presence of a 3' terminus;
26 T if a non-templated polyA tail is found among
27 a cluster's sequences; else
28 I if templated As are found in genomic sequence or
29 S if a canonical polyA signal is found on
30 the genomic sequence
31 GENE_ID Entrez gene identifier associated with at least one
32 sequence in this cluster;
33 to be used instead of LocusLink.
34 LOCUSLINK LocusLink identifier associated with at least one
35 sequence in this cluster;
36 deprecated in favor of GENE_ID
37 HOMOL Homology;
38 CHROMOSOME Chromosome. For plants, CHROMOSOME refers to mapping
39 on the arabidopsis genome.
40 STS STS
41 ACC= GenBank/EMBL/DDBJ accession number of STS
42 [optional field]
43 UNISTS= identifier in NCBI's UNISTS database
44 TXMAP Transcript map interval
45 MARKER= Marker found on at least one sequence in this
46 cluster
47 RHPANEL= Radiation Hybrid panel used to place marker
48 PROTSIM Protein Similarity data for the sequence with
49 highest-scoring protein similarity in this cluster
50 ORG= Organism
51 PROTGI= Sequence GI of protein
52 PROTID= Sequence ID of protein
53 PCT= Percent alignment
54 ALN= length of aligned region (aa)
55 SCOUNT Number of sequences in the cluster
56 SEQUENCE Sequence
57 ACC= GenBank/EMBL/DDBJ accession number of sequence
58 NID= Unique nucleotide sequence identifier (gi)
59 PID= Unique protein sequence identifier (used for
60 non-ESTs)
61 CLONE= Clone identifier (used for ESTs only)
62 END= End (5'/3') of clone insert read (used for
63 ESTs only)
64 LID= Library ID; see Hs.lib.info for library name
65 and tissue
66 MGC= 5' CDS-completeness indicator; if present, the
67 clone associated with this sequence is believed
68 CDS-complete. A value greater than 511 is the gi
69 of the CDS-complete mRNA matched by the EST,
70 otherwise the value is an indicator of the
71 reliability of the test indicating CDS
72 completeness; higher values indicate more
73 reliable CDS-completeness predictions.
74 SEQTYPE= Description of the nucleotide sequence.
75 Possible values are mRNA, EST and HTC.
76 TRACE= The Trace ID of the EST sequence, as provided by
77 NCBI Trace Archive
78 """
79
80
82 """Store the information for one SEQUENCE line from a Unigene file
83
84 Initialize with the text part of the SEQUENCE line, or nothing.
85
86 Attributes and descriptions (access as LOWER CASE)
87 ACC= GenBank/EMBL/DDBJ accession number of sequence
88 NID= Unique nucleotide sequence identifier (gi)
89 PID= Unique protein sequence identifier (used for non-ESTs)
90 CLONE= Clone identifier (used for ESTs only)
91 END= End (5'/3') of clone insert read (used for ESTs only)
92 LID= Library ID; see Hs.lib.info for library name and tissue
93 MGC= 5' CDS-completeness indicator; if present,
94 the clone associated with this sequence
95 is believed CDS-complete. A value greater than 511
96 is the gi of the CDS-complete mRNA matched by the EST,
97 otherwise the value is an indicator of the reliability
98 of the test indicating CDS completeness;
99 higher values indicate more reliable CDS-completeness
100 predictions.
101 SEQTYPE= Description of the nucleotide sequence. Possible values
102 are mRNA, EST and HTC.
103 TRACE= The Trace ID of the EST sequence, as provided by NCBI
104 Trace Archive
105 """
106
108 self.acc = ''
109 self.nid = ''
110 self.lid = ''
111 self.pid = ''
112 self.clone = ''
113 self.image = ''
114 self.is_image = False
115 self.end = ''
116 self.mgc = ''
117 self.seqtype = ''
118 self.trace = ''
119 if not text==None:
120 self.text=text
121 self._init_from_text(text)
122
def _init_from_text(self, text):
    """Populate attributes from the text part of a SEQUENCE line.

    ``text`` is split on '; ' into KEY=VALUE fields.  Each value is
    stored on the instance under the lower-cased key name.  When the
    CLONE value starts with 'IMAGE', ``is_image`` is set to True and
    ``image`` receives the value with its first six characters removed.

    Raises ValueError if a field contains no '='.
    """
    for field in text.split('; '):
        # Split on the first '=' only, so a value that itself contains
        # '=' is kept whole instead of breaking the two-name unpacking.
        key, val = field.split('=', 1)
        if key == 'CLONE' and val[:5] == 'IMAGE':
            self.is_image = True
            # Skip the 'IMAGE' prefix plus the one character after it.
            self.image = val[6:]
        setattr(self, key.lower(), val)
132
135
136
138 """Store the information for one PROTSIM line from a Unigene file
139
140 Initialize with the text part of the PROTSIM line, or nothing.
141
142 Attributes and descriptions (access as LOWER CASE)
143 ORG= Organism
144 PROTGI= Sequence GI of protein
145 PROTID= Sequence ID of protein
146 PCT= Percent alignment
147 ALN= length of aligned region (aa)
148 """
149
151 self.org = ''
152 self.protgi = ''
153 self.protid = ''
154 self.pct = ''
155 self.aln = ''
156 if not text==None:
157 self.text=text
158 self._init_from_text(text)
159
def _init_from_text(self, text):
    """Populate attributes from the text part of a PROTSIM line.

    ``text`` is split on '; ' into KEY=VALUE fields, and each value is
    stored on the instance under the lower-cased key name.

    Raises ValueError if a field contains no '='.
    """
    for field in text.split('; '):
        # Split on the first '=' only, so a value that itself contains
        # '=' is kept whole instead of breaking the two-name unpacking.
        key, val = field.split('=', 1)
        setattr(self, key.lower(), val)
166
169
170
172 """Store the information for one STS line from a Unigene file
173
174 Initialize with the text part of the STS line, or nothing.
175
176 Attributes and descriptions (access as LOWER CASE)
177
178 ACC= GenBank/EMBL/DDBJ accession number of STS [optional field]
179 UNISTS= identifier in NCBI's UNISTS database
180 """
181
188
def _init_from_text(self, text):
    """Populate attributes from the text part of an STS line.

    ``text`` is split on single spaces into KEY=VALUE fields, and each
    value is stored on the instance under the lower-cased key name.

    Raises ValueError if a field contains no '='.
    """
    for field in text.split(' '):
        # Split on the first '=' only, so a value that itself contains
        # '=' is kept whole instead of breaking the two-name unpacking.
        key, val = field.split('=', 1)
        setattr(self, key.lower(), val)
195
198
199
201 """Store a Unigene record
202
203 Here is what is stored:
204
205 self.ID = '' # ID line
206 self.species = '' # Hs, Bt, etc.
207 self.title = '' # TITLE line
208 self.symbol = '' # GENE line
209 self.cytoband = '' # CYTOBAND line
210 self.express = [] # EXPRESS line, parsed on ';'
211 # Will be an array of strings
212 self.restr_expr = '' # RESTR_EXPR line
213 self.gnm_terminus = '' # GNM_TERMINUS line
214 self.gene_id = '' # GENE_ID line
215 self.locuslink = '' # LOCUSLINK line
216 self.homol = '' # HOMOL line
217 self.chromosome = '' # CHROMOSOME line
218 self.protsim = [] # PROTSIM entries, array of Protsims
219 # Type ProtsimLine
220 self.sequence = [] # SEQUENCE entries, array of Sequence entries
221 # Type SequenceLine
222 self.sts = [] # STS entries, array of STS entries
223 # Type STSLine
224 self.txmap = [] # TXMAP entries, array of TXMap entries
225 """
226
228 self.ID = ''
229 self.species = ''
230 self.title = ''
231 self.symbol = ''
232 self.cytoband = ''
233 self.express = []
234 self.restr_expr = ''
235 self.gnm_terminus = ''
236 self.gene_id = ''
237 self.locuslink = ''
238 self.homol = ''
239 self.chromosome = ''
240 self.protsim = []
241 self.sequence = []
242 self.sts = []
243 self.txmap = []
244
246 return "<%s> %s %s\n%s" % (self.__class__.__name__,
247 self.ID, self.symbol, self.title)
248
255
256
266
267
268
269
270
323
324
325
326
327
328 from Bio.ParserSupport import *
329 import re
330
331
332
333
334 UG_INDENT=12
335
337 """Store the information for one SEQUENCE line from a Unigene file
338 (OBSOLETE).
339
340 Initialize with the text part of the SEQUENCE line, or nothing.
341
342 Attributes and descriptions (access as LOWER CASE)
343 ACC= GenBank/EMBL/DDBJ accession number of sequence
344 NID= Unique nucleotide sequence identifier (gi)
345 PID= Unique protein sequence identifier (used for non-ESTs)
346 CLONE= Clone identifier (used for ESTs only)
347 END= End (5'/3') of clone insert read (used for ESTs only)
348 LID= Library ID; see Hs.lib.info for library name and tissue
349 MGC= 5' CDS-completeness indicator; if present,
350 the clone associated with this sequence
351 is believed CDS-complete. A value greater than 511
352 is the gi of the CDS-complete mRNA matched by the EST,
353 otherwise the value is an indicator of the reliability
354 of the test indicating CDS completeness;
355 higher values indicate more reliable CDS-completeness predictions.
356 SEQTYPE= Description of the nucleotide sequence. Possible values are
357 mRNA, EST and HTC.
358 TRACE= The Trace ID of the EST sequence, as provided by NCBI Trace Archive
359 PERIPHERAL= Indicator that the sequence is a suboptimal
360 representative of the gene represented by this cluster.
361 Peripheral sequences are those that are in a cluster
362 which represents a spliced gene without sharing a
363 splice junction with any other sequence. In many
364 cases, they are unspliced transcripts originating
365 from the gene.
366
367 This class is OBSOLETE; please use the read() function in this module
368 instead.
369 """
370
372 self.acc = ''
373 self.nid = ''
374 self.lid = ''
375 self.pid = ''
376 self.clone = ''
377 self.image = ''
378 self.is_image = False
379 self.end = ''
380 self.mgc = ''
381 self.seqtype = ''
382 self.Trace = ''
383 self.peripheral = ''
384 if not text==None:
385 self.text=text
386 return self._init_from_text(text)
387
def _init_from_text(self, text):
    """Populate attributes from the text part of a SEQUENCE line.

    ``text`` is split on '; ' into fields.  Each field must start with
    KEY=VALUE, where KEY is a run of word characters and VALUE is the
    run of non-whitespace characters after the '='.  Each value is
    stored on the instance under the lower-cased key name.  When the
    CLONE value starts with 'IMAGE', ``is_image`` is set to True and
    ``image`` receives the value with its first six characters removed.

    Raises ValueError if a field does not start with KEY=VALUE.
    """
    for part in text.split('; '):
        # Raw string: '\w' and '\S' are regex classes, not str escapes.
        found = re.match(r'(\w+)=(\S+)', part)
        if found is None:
            # Fail with a clear message rather than an AttributeError
            # from calling .groups() on None.
            raise ValueError("malformed field in SEQUENCE line: %r" % part)
        key, val = found.groups()
        if key == 'CLONE' and val[:5] == 'IMAGE':
            self.is_image = True
            # Skip the 'IMAGE' prefix plus the one character after it.
            self.image = val[6:]
        setattr(self, key.lower(), val)
397
400
401
403 """Store the information for one PROTSIM line from a Unigene file
404 (OBSOLETE).
405
406 Initialize with the text part of the PROTSIM line, or nothing.
407
408 Attributes and descriptions (access as LOWER CASE)
409 ORG= Organism
410 PROTGI= Sequence GI of protein
411 PROTID= Sequence ID of protein
412 PCT= Percent alignment
413 ALN= length of aligned region (aa)
414
415 This class is OBSOLETE; please use the read() function in this module
416 instead.
417 """
418
420 self.org = ''
421 self.protgi = ''
422 self.protid = ''
423 self.pct = ''
424 self.aln = ''
425 if not text==None:
426 self.text=text
427 return self._init_from_text(text)
428
def _init_from_text(self, text):
    """Populate attributes from the text part of a PROTSIM line.

    ``text`` is split on '; ' into fields.  Each field must start with
    KEY=VALUE, where KEY is a run of word characters and VALUE is the
    run of non-whitespace characters after the '='.  Each value is
    stored on the instance under the lower-cased key name.

    Raises ValueError if a field does not start with KEY=VALUE.
    """
    for part in text.split('; '):
        # Raw string: '\w' and '\S' are regex classes, not str escapes.
        found = re.match(r'(\w+)=(\S+)', part)
        if found is None:
            # Fail with a clear message rather than an AttributeError
            # from calling .groups() on None.
            raise ValueError("malformed field in PROTSIM line: %r" % part)
        key, val = found.groups()
        setattr(self, key.lower(), val)
435
438
439
441 """Store the information for one STS line from a Unigene file
442 (OBSOLETE).
443
444 Initialize with the text part of the STS line, or nothing.
445
446 Attributes and descriptions (access as LOWER CASE)
447
448 NAME= Name of STS
449 ACC= GenBank/EMBL/DDBJ accession number of STS [optional field]
450 DSEG= GDB Dsegment number [optional field]
451 UNISTS= identifier in NCBI's UNISTS database
452
453 This class is OBSOLETE; please use the read() function in this module
454 instead.
455 """
456
465
def _init_from_text(self, text):
    """Populate attributes from the text part of an STS line.

    ``text`` is split on single spaces into fields.  Each field must
    start with KEY=VALUE, where KEY is a run of word characters and
    VALUE is the run of non-whitespace characters after the '='.  Each
    value is stored on the instance under the lower-cased key name.

    Raises ValueError if a field does not start with KEY=VALUE.
    """
    for part in text.split(' '):
        # Raw string: '\w' and '\S' are regex classes, not str escapes.
        found = re.match(r'(\w+)=(\S+)', part)
        if found is None:
            # Fail with a clear message rather than an AttributeError
            # from calling .groups() on None.
            raise ValueError("malformed field in STS line: %r" % part)
        key, val = found.groups()
        setattr(self, key.lower(), val)
472
475
476
478 """Store a Unigene record (OBSOLETE).
479
480 Here is what is stored:
481
482 self.ID = '' # ID line
483 self.species = '' # Hs, Bt, etc.
484 self.title = '' # TITLE line
485 self.symbol = '' # GENE line
486 self.cytoband = '' # CYTOBAND line
487 self.express = [] # EXPRESS line, parsed on ';'
488 # Will be an array of strings
489 self.restr_expr = '' # RESTR_EXPR line
490 self.gnm_terminus = '' # GNM_TERMINUS line
491 self.gene_id = '' # GENE_ID line
492 self.chromosome = '' # CHROMOSOME
493 self.protsim = [] # PROTSIM entries, array of Protsims
494 # Type UnigeneProtsimRecord
495 self.sequence = [] # SEQUENCE entries, array of Sequence entries
496 # Type UnigeneSequenceRecord
497 self.sts = [] # STS entries, array of STS entries
498 # Type UnigeneSTSRecord
499 self.txmap = [] # TXMAP entries, array of TXMap entries
500
501 This class is OBSOLETE; please use the read() function in this module
502 instead.
503 """
504
506 self.ID = ''
507 self.species = ''
508 self.title = ''
509 self.symbol = ''
510 self.cytoband = ''
511 self.express = []
512 self.restr_expr = ''
513 self.gnm_terminus = ''
514 self.gene_id = ''
515 self.chromosome = ''
516 self.protsim = []
517 self.sequence = []
518 self.sts = []
519 self.txmap = []
520
522 return "<%s> %s %s\n%s" % (self.__class__.__name__,
523 self.ID, self.symbol, self.title)
524
525
527 """This class is OBSOLETE; please use the read() function in this module
528 instead."""
529
537 - def GENE(self,line):
555 - def STS(self,line):
558
559
def _get_single_entry(self, line):
    """Return the value portion of a single-value line.

    The first UG_INDENT characters of ``line`` are dropped and the
    remainder is returned unchanged.
    """
    value = line[UG_INDENT:]
    return value
564
def _get_array_entry(self, line, split_on):
    """Return the values of a multi-value line as a list of strings.

    The first UG_INDENT characters of ``line`` are dropped and the
    remainder is split on the ``split_on`` separator.
    """
    payload = line[UG_INDENT:]
    return payload.split(split_on)
569
570
572 """Scans a Unigene Flat File Format file (OBSOLETE).
573
574 This class is OBSOLETE; please use the read() function in this module
575 instead.
576 """
577
def feed(self, handle, consumer):
    """Feed events from parsing a Unigene file to a consumer.

    handle is a file-like object (anything iterable line by line), and
    consumer is a consumer object that will receive events as the file
    is scanned.

    consumer.start_record() is called first.  Each line is stripped of
    trailing whitespace; the text before its first space is the tag, and
    the consumer attribute of that name, if callable, is called with the
    stripped line.  A line equal to '//' triggers consumer.end_record()
    and stops the scan.  A tag that names no consumer attribute is
    reported on standard output and the line is skipped.
    """
    consumer.start_record()
    for raw_line in handle:
        line = raw_line.rstrip()
        if line == '//':
            consumer.end_record()
            break
        # Take the tag from the stripped line so a tag-only line does
        # not carry its newline into the attribute lookup.
        tag = line.split(' ')[0]
        try:
            handler = getattr(consumer, tag)
        except AttributeError:
            # Single-argument form prints the same text whether print
            # is a statement (Python 2) or a function (Python 3).
            print('no method called %s' % tag)
        else:
            if callable(handler):
                handler(line)
600
601
603 """This class is OBSOLETE; please use the read() function in this module
604 instead."""
605
609
610 - def parse(self, handle):
617
619 """This class is OBSOLETE; please use the parse() function in this module
620 instead."""
621
622 - def __init__(self, handle, parser=None):
624
626 self._parser = RecordParser()
627 lines = []
628 while True:
629 line = self._uhandle.readline()
630 if not line: break
631 if line[:2] == '//':
632 break
633 lines.append(line)
634 if not lines:
635 return None
636 lines.append('//')
637 data = ''.join(lines)
638 if self._parser is not None:
639 return self._parser.parse(File.StringHandle(data))
640 return data
641
643 return iter(self.next, None)
644