![](../images/_nbs.gif) |
|
![](../images/_nbs.gif) |
![](../images/_nbs.gif) |
![](../images/_nb_interface/Linked.png) ![](../images/_nbs.gif) | #!/usr/bin/python #
""" Extracts FASTA records from multi-mapview file. Eric Kofoid, 1/24/2011 (start) 5/25/2012 (most recent)
############################################################ Introduction ============
Based on code from a previous bash script, which worked well but was horribly slow!
This program extracts records from large multi-mapview files, based on ID, reference location or file line number. In all cases, single records or records within ranges can be found.
Incoming data is in a mapview multisequence files, whose tab-separated records have the general structure shown here:
SOLEXA3_0001:8:8:7287:5069#0/1 cbid 2 - 0 64 84 84 077 TagCAtTGCaGCTCCTTAGCAATCTCGTCGACGGGACGATTACTGCCGAGAATATGGTTATCAAAAGAAAACAGGAT =6:=A4@@B0<?><C@BB@@@B@@DCCDCCDCCCCCCCCCCCCBCBCCCCCCCCCCCCCCCBBBBCCCCCCCCCCCC
where important fields are (1)ID, (2)reference, (3)position in reference, (15)sequence, and, (16)quality scores. These files are made presorted on the reference position and must be kept in this sort order.
Output is formatted as a multi-fasta file.
############################################################ Usage ==================
$ getfv -i1 <ID 1> [-i2 <ID 2>] [-p <period>] [-v] <infile> > <outfile> $ getfv -p1 <pos. 1> [-p2 <pos. 2>] [-p <period>] [-v] <infile> > <outfile> $ getfv -l1 <line 1> [-l2 <line 2>] [-p <period>] [-v] <infile> > <outfile> $ getfv -f <ID file> [-p <period>] [-v] <infile> > <outfile>
############################################################ Supported options =================
Record ID range from id1 to id2. Useful only for files sorted on position:
-i1 <first record ID "id1">
-i2 <last record ID "id2">
Reference sequence range from position 1 to position 2. Useful only for files sorted on position:
-p1 <first position "pos1">
-p2 <last position "pos2">
Find records from line lin1 to lin2 of multi-mapview file. Mainly useful when records are sorted by position:
-l1 <first line of block "lin1">
-l2 <last line of block "lin2">
Use IDs found in ID file, one per line. Useful only when IDs all have identical prefices and records have been sorted on ID:
-i <ID file "idf">
If a single ID, position or line is need, then use the corresponding option with suffix "1", and leave the "2" option blank.
Odometer period -- if set, every "per" records inspected will print the record number, if verbose output flag is set. If the option is give without a value, a default value of 10000 will be used.
-p <period "per">
Verbose output flag -- if set, odometer will be activated, and it and a list of parameter values will be printed to standard output.
-v <flag, "True" if present, "False" if not>
Preserve mapview format flag -- if set, output will be saved in the original mapview format instead of being converted to multi-fasta.
-V <flag, "True" if present, "False" if not>
########################################################### Algorithm =========
1. get next target 2. read next line from input file 3. search for target in appropriate field 4. return to 1 until target found or eof 5. if target found, convert line to fasta format & write to stdout 6. return to 1 until target exceeds maximum
########################################################### Caveat ======
This function does too much for a single tool. It should be split into the following:
1. getvpos: get position range from view file 2. getvlin: get line range from view file 3. getvids: get ID range from view file 4. getvmat: get ID matches from view file 5. v2fa: convert .view to .fasta ########################################################### History ======
5/25/2012 Added "-V" flag
"""
from time import time import os import sys import plac
############################## ### function dmsg ### ############################## def dmsg(msg): """ Print debug message to stderr """ sys.stderr.write(msg)
############################## ### function odometer ### ############################## def odometer(v,per): """ Simple count-up odometer to keep user happy. """ global a a=a+1 if v and (a % per)==0: # trigger odometer output cycle=str(a)+'\r' sys.stderr.write(cycle)
############################## ### function getinf ### ############################## def getinf(inf): """ Open an input file """ try: infile=open(inf,'r') return infile except: ex1,ex2,ex3=sys.exc_info() msg="Something went wrong opening the input data file!\nException ="+str(ex2)+"\n\n" sys.stderr.write(msg) sys.exit(1)
############################## ### function getvid ### ############################## def getvid(id1,id2,inf,per=100000,v=False): """ Get an ID range from a .view file. """ # Define a few constants: ID=0 REF=1 POS=2 A4=3 A5=4 A6=5 A7=6 A8=7 A9=8 A10=9 A11=10 A12=11 A13=12 A14=13 SEQ=14 QUAL=15 infile=getinf(inf) aLine=True # will be used to check eof inrange=False atedge=False while aLine and not inrange: odometer(v,per) aLine=infile.readline() if aLine: splitLine=aLine.split('\t') id=splitLine[ID] inrange=id1 in id else: # eof inrange=False # just to drop through
atedge=False while aLine and inrange and not atedge: odometer(v,per) pos=int(splitLine[POS]) seq=splitLine[SEQ] if V: # preserve the mapview format print aLine, else: print ">",id,"\n",seq aLine=infile.readline() if aLine: splitLine=aLine.split('\t') id=splitLine[ID] atedge=(id2 in id) or (id1 == id2) else: # eof atedge=True # just to drop through
pastrange=(id1 == id2) while aLine and not pastrange: odometer(v,per) pos=int(splitLine[POS]) seq=splitLine[SEQ] if V: # preserve the mapview format print aLine, else: print ">",id,"\n",seq aLine=infile.readline() if aLine: splitLine=aLine.split('\t') id=splitLine[ID] pastrange=id2 not in id else: # eof pastrange=True # just to drop through @plac.annotations( id1=("first id in range",'option','i1',str,None,'id1'), id2=("last id in range",'option','i2',str,None,'id2'), pos1=("first reference position in range",'option','p1',int,None,'pos1'), pos2=("last reference position in range",'option','p2',int,None,'pos2'), lin1=("first line position in range",'option','l1',int,None,'lin1'), lin2=("last line position in range",'option','l2',int,None,'lin2'), per=("odometer period",'option','p',int,None,'period'), idf=("id file",'option','f',str,None,'idfile'), inf=("mapview input file",'positional',None,str,None,'infile'), v=("verbose output flag",'flag'), V=("mapview format flag",'flag') )
def main( id1, id2, pos1, pos2, lin1, lin2, per, idf, inf, v, V ): """ Extracts FASTA records from multi-mapview file.
The view file must be in its default state, sorted by position, for all options except -i1, -i2 & -f, which require IDs with identical prefices (maq mangles long IDs, which must first be shortened to a minimal prefix) and presorting on the ID field. The -V option allows selection of records from any file of single-line records beginning with a space delimited ID field. Fastq and view formats are examples. """
startTime=time() if v: msg="\nStart time: "+str(startTime)+"\n" sys.stderr.write(msg)
# Define a few constants: ID=0 REF=1 POS=2 A4=3 A5=4 A6=5 A7=6 A8=7 A9=8 A10=9 A11=10 A12=11 A13=12 A14=13 SEQ=14 QUAL=15
# Check that options are of only one type: mode="" modeOK=True if (not mode and (id1 or id2)): mode="id" if (mode and (pos1 or pos2)): modeOK=False elif (pos1 or pos2): mode="pos" if (mode and (lin1 or lin2)): modeOK=False elif (lin1 or lin2): mode="lin" if (mode and idf): modeOK=False elif (idf): mode="idf" if (not modeOK): print "options can only be of one set type, either id, position, line or file"; exit(1); # Set empty <option>2 to <option>1, and sort if necessary: if not id2: id2=id1 elif not id1: id1=id2 if not pos2: pos2=pos1 elif not pos1: pos1=pos2 if not lin2: lin2=lin1 elif not lin1: lin1=lin2 if pos2<pos1: dum=pos2;pos2=pos1;pos1=dum; if lin2<lin1: dum=lin2;lin2=lin1;lin1=dum;
# Variables global to program: global path path=os.getcwd() # path to working directory (for all i/o) global a a=0 # index of iterations
# Defaults, etc: if per == None: per = 100000
if v: sys.stderr.write("\nParameters:") msg="\ninf: "+str(inf) sys.stderr.write(msg) msg="\nid1: "+str(id1)+"\nid2: "+str(id2)+"\npos1: "+str(pos1)+"\npos2: "+str(pos2)+ \ "\nlin1: "+str(lin1)+"\nlin2: "+str(lin2)+"\nper: "+str(per)+"\nidf: "+str(idf) \ +"\n\n" sys.stderr.write(msg)
# Open infiles: infile=getinf(inf) ####################################### # Finally ready to do the actual work: #######################################
##################### ### Find ID Range ### ##################### a=0 if (mode == "id"): # hunting for an ID range
getvid(id1,id2,inf,per,v)
############################ ### Find Position Range ### ############################ elif (mode == "pos"): # hunting for a position range:
aLine=True # will be used to check eof inrange=False atedge=False while aLine and not inrange: odometer(v,per) aLine=infile.readline() if aLine: splitLine=aLine.split('\t') pos=int(splitLine[POS]) inrange=(pos1<pos) else: # eof inrange=True # just to drop through
atedge=False while aLine and inrange and not atedge: odometer(v,per) id=splitLine[ID] seq=splitLine[SEQ] if V: # preserve the mapview format print aLine, else: print ">",id,"\n",seq aLine=infile.readline() if aLine: splitLine=aLine.split('\t') pos=int(splitLine[POS]) atedge=(pos2<pos) else: # eof atedge=True # just to drop through
####################### ### Find Line Range ### ####################### elif (mode == "lin"): # hunting for a line range
aLine=True # will be used to check eof while aLine and a<lin1: odometer(v,per) aLine=infile.readline()
while aLine and a<=lin2: odometer(v,per) splitLine=aLine.split('\t') id=splitLine[ID] pos=int(splitLine[POS]) seq=splitLine[SEQ] if V: # preserve the mapview format print aLine, else: print ">",id,"\n",seq aLine=infile.readline()
############################## ### Find Matches from File ### ############################## elif (mode == "idf"):
try: idfile=open(idf,'r') except: ex1,ex2,ex3=sys.exc_info() msg="Something went wrong while opening the input ID file!\nException ="+str(ex2)+"\n\n" sys.stderr.write(msg) sys.exit(1) aLine=infile.readline() iLine=True # check eof of ID input file lineOK=False # does the line have a good ID? while (iLine and aLine): # get the next ID iLine=idfile.readline() if iLine: # necessary to catch eof iLine=iLine.split("\n")[0] lineOK=(aLine.find(iLine) != -1) else: aLine=False # bale out as no more ID's while aLine and not lineOK: # throw away lines when no match odometer(v,per) aLine=infile.readline() lineOK=(aLine.find(iLine) != -1) while aLine and lineOK: # keep all lines with matches if V: # preserve the mapview format print aLine, else: splitLine=aLine.split('\t') id=splitLine[ID] pos=int(splitLine[POS]) seq=splitLine[SEQ] print ">",id,"\n",seq aLine=infile.readline() lineOK=(aLine.find(iLine) != -1) idfile.close() else: sys.stderr.write("The mode was improperly set; check your parameters!\n\n")
infile.close()
if v: sys.stderr.write("\nEnd getfv.") msg="\nTime: "+str(time()) sys.stderr.write(msg) msg="\nElapsed time: "+str(time()-startTime)+"\n\n" sys.stderr.write(msg)
if __name__ == '__main__': plac.call(main)
|
![](../images/_nbs.gif) |
![](../images/_nbs.gif) |
|
![](../images/_nb_interface/pagecurlrt.gif)
![](../images/_nbs.gif)
![](../images/_nbs.gif) |
![](../images/_nb_tabs/6tab.png) |
![](../images/_nbs.gif) |