6


getfv
#!/usr/bin/python
#

"""
Extracts FASTA records from multi-mapview file.
Eric Kofoid, 1/24/2011 (start)
             5/25/2012 (most recent)

############################################################
Introduction
============

Based on code from a previous bash script, which worked well but was
horribly slow!

This program extracts records from large multi-mapview files, based on
ID, reference location or file line number. In all cases, single
records or records within ranges can be found.

Incoming data is in a mapview multisequence files, whose tab-separated
records have the general structure shown here:

SOLEXA3_0001:8:8:7287:5069#0/1    cbid    2    -    0    64    84    84    077    TagCAtTGCaGCTCCTTAGCAATCTCGTCGACGGGACGATTACTGCCGAGAATATGGTTATCAAAAGAAAACAGGAT    =6:=A4@@B0<?><C@BB@@@B@@DCCDCCDCCCCCCCCCCCCBCBCCCCCCCCCCCCCCCBBBBCCCCCCCCCCCC

where important fields are (1)ID, (2)reference, (3)position in
reference, (15)sequence, and, (16)quality scores. These files are
made presorted on the reference position and must be kept in this
sort order.

Output is formatted as a multi-fasta file.

############################################################
Usage
==================

$ getfv -i1 <ID 1> [-i2 <ID 2>] [-p <period>] [-v] <infile> > <outfile>
$ getfv -p1 <pos. 1> [-p2 <pos. 2>] [-p <period>] [-v] <infile> > <outfile>
$ getfv -l1 <line 1> [-l2 <line 2>] [-p <period>] [-v] <infile> > <outfile>
$ getfv -f <ID file> [-p <period>] [-v] <infile> > <outfile>

############################################################
Supported options
=================

Record ID range from id1 to id2.
Useful only for files sorted on position:

-i1 <first record ID "id1">

-i2 <last record ID "id2">

Reference sequence range from position 1 to position 2.
Useful only for files sorted on position:

-p1 <first position "pos1">

-p2 <last position "pos2">

Find records from line lin1 to lin2 of multi-mapview file.
Mainly useful when records are sorted by position:

-l1 <first line of block "lin1">

-l2 <last line of block "lin2">

Use IDs found in ID file, one per line. Useful only when
IDs all have identical prefices and records have been sorted
on ID:

-i <ID file "idf">  

If a single ID, position or line is need, then use the
corresponding option with suffix "1", and leave the "2"
option blank.

Odometer period -- if set, every "per" records inspected
will print the record number, if verbose output flag is set.
If the option is give without a value, a default value of
10000 will be used.

-p <period "per">

Verbose output flag -- if set, odometer will be activated,
and it and a list of parameter values will be printed to
standard output.

-v <flag, "True" if present, "False" if not>

Preserve mapview format flag -- if set, output will be saved
in the original mapview format instead of being converted to
multi-fasta.

-V <flag, "True" if present, "False" if not>

###########################################################
Algorithm
=========

1. get next target
2. read next line from input file
3. search for target in appropriate field
4. return to 1 until target found or eof
5. if target found, convert line to fasta format & write to stdout
6. return to 1 until target exceeds maximum

###########################################################
Caveat
======

This function does too much for a single tool.
It should be split into the following:

1. getvpos: get position range from view file
2. getvlin: get line range from view file
3. getvids: get ID range from view file
4. getvmat: get ID matches from view file
5. v2fa:    convert .view to .fasta

###########################################################
History
======

5/25/2012 Added "-V" flag

"""

from time import time
import os
import sys
import plac

##############################
###     function dmsg      ###
##############################
def dmsg(msg):
    """
    Print debug message to stderr
    """
    sys.stderr.write(msg)

##############################
###   function odometer    ###
##############################
def odometer(v,per):
    """
    Simple count-up odometer to keep user happy.
    """
    global a
    a=a+1
    if v and  (a % per)==0: # trigger odometer output
        cycle=str(a)+'\r'
        sys.stderr.write(cycle)

##############################
###    function getinf     ###
##############################
def getinf(inf):
    """
    Open an input file
    """
    try:
        infile=open(inf,'r')
        return infile
    except:
        ex1,ex2,ex3=sys.exc_info()
        msg="Something went wrong opening the input data file!\nException ="+str(ex2)+"\n\n"
        sys.stderr.write(msg)
        sys.exit(1)
    

##############################
###    function getvid     ###
##############################
def getvid(id1,id2,inf,per=100000,v=False):
    """
    Get an ID range from a .view file.
    """
    
    # Define a few constants:
    ID=0
    REF=1
    POS=2
    A4=3
    A5=4
    A6=5
    A7=6
    A8=7
    A9=8
    A10=9
    A11=10
    A12=11
    A13=12
    A14=13
    SEQ=14
    QUAL=15
    
    infile=getinf(inf)
    aLine=True # will be used to check eof
    inrange=False
    atedge=False
    while aLine and not inrange:
        odometer(v,per)
        aLine=infile.readline()
        if aLine:
            splitLine=aLine.split('\t')
            id=splitLine[ID]
            inrange=id1 in id
        else: # eof
            inrange=False # just to drop through

    atedge=False
    while aLine and inrange and not atedge:
        odometer(v,per)
        pos=int(splitLine[POS])
        seq=splitLine[SEQ]
        if V: # preserve the mapview format
            print aLine,
        else:
            print ">",id,"\n",seq
        aLine=infile.readline()
        if aLine:
            splitLine=aLine.split('\t')
            id=splitLine[ID]
            atedge=(id2 in id) or (id1 == id2)
        else: # eof
            atedge=True # just to drop through

    pastrange=(id1 == id2)
    while aLine and not pastrange:
        odometer(v,per)
        pos=int(splitLine[POS])
        seq=splitLine[SEQ]
        if V: # preserve the mapview format
            print aLine,
        else:
            print ">",id,"\n",seq
        aLine=infile.readline()
        if aLine:
            splitLine=aLine.split('\t')
            id=splitLine[ID]
            pastrange=id2 not in id
        else: # eof
            pastrange=True # just to drop through
            
            
@plac.annotations(
    id1=("first id in range",'option','i1',str,None,'id1'),
    id2=("last id in range",'option','i2',str,None,'id2'),
    pos1=("first reference position in range",'option','p1',int,None,'pos1'),
    pos2=("last reference position in range",'option','p2',int,None,'pos2'),
    lin1=("first line position in range",'option','l1',int,None,'lin1'),
    lin2=("last line position in range",'option','l2',int,None,'lin2'),
    per=("odometer period",'option','p',int,None,'period'),
    idf=("id file",'option','f',str,None,'idfile'),
    inf=("mapview input file",'positional',None,str,None,'infile'),
    v=("verbose output flag",'flag'),
    V=("mapview format flag",'flag')
    )

def main(
    id1,
    id2,
    pos1,
    pos2,
    lin1,
    lin2,
    per,
    idf,
    inf,
    v,
    V
    ):
    """
    Extracts FASTA records from multi-mapview file.

    The view file must be in its default state, sorted
    by position, for all options except -i1, -i2 & -f,
    which require IDs with identical prefices (maq
    mangles long IDs, which must first be shortened to
    a minimal prefix) and presorting on the ID field.
    
    The -V option allows selection of records from any
    file of single-line records beginning with a space
    delimited ID field. Fastq and view formats are
    examples.
    """

    startTime=time()
    if v:
        msg="\nStart time: "+str(startTime)+"\n"
        sys.stderr.write(msg)

    # Define a few constants:
    ID=0
    REF=1
    POS=2
    A4=3
    A5=4
    A6=5
    A7=6
    A8=7
    A9=8
    A10=9
    A11=10
    A12=11
    A13=12
    A14=13
    SEQ=14
    QUAL=15

    # Check that options are of only one type:
    mode=""
    modeOK=True
    if (not mode and (id1 or id2)):
        mode="id"
        
    if (mode and (pos1 or pos2)):
        modeOK=False
    elif (pos1 or pos2):
        mode="pos"
        
    if (mode and (lin1 or lin2)):
        modeOK=False
    elif (lin1 or lin2):
        mode="lin"
        
    if (mode and idf):
        modeOK=False
    elif (idf):
        mode="idf"
    
    if (not modeOK):
        print "options can only be of one set type, either id, position, line or file";
        exit(1);        
    
    # Set empty <option>2 to <option>1, and sort if necessary:
    if not id2: id2=id1
    elif not id1: id1=id2
    if not pos2: pos2=pos1
    elif not pos1: pos1=pos2
    if not lin2: lin2=lin1
    elif not lin1: lin1=lin2
    if pos2<pos1: dum=pos2;pos2=pos1;pos1=dum;
    if lin2<lin1: dum=lin2;lin2=lin1;lin1=dum;

    # Variables global to program:
    global path
    path=os.getcwd() # path to working directory (for all i/o)
    global a
    a=0 # index of iterations

    # Defaults, etc:
    if per == None: per = 100000

    if v:
        sys.stderr.write("\nParameters:")
        msg="\ninf: "+str(inf)
        sys.stderr.write(msg)
        msg="\nid1: "+str(id1)+"\nid2: "+str(id2)+"\npos1: "+str(pos1)+"\npos2: "+str(pos2)+ \
              "\nlin1: "+str(lin1)+"\nlin2: "+str(lin2)+"\nper: "+str(per)+"\nidf: "+str(idf) \
              +"\n\n"
        sys.stderr.write(msg)

    # Open infiles:
    
    infile=getinf(inf)
    
    #######################################
    # Finally ready to do the actual work:
    #######################################

    #####################
    ### Find ID Range ###
    #####################
    a=0
    if (mode == "id"): # hunting for an ID range

        getvid(id1,id2,inf,per,v)

    ############################
    ### Find Position Range  ###
    ############################
    elif (mode == "pos"): # hunting for a position range:

        aLine=True # will be used to check eof
        inrange=False
        atedge=False
        while aLine and not inrange:
            odometer(v,per)
            aLine=infile.readline()
            if aLine:
                splitLine=aLine.split('\t')
                pos=int(splitLine[POS])
                inrange=(pos1<pos)
            else: # eof
                inrange=True # just to drop through

        atedge=False
        while aLine and inrange and not atedge:
            odometer(v,per)
            id=splitLine[ID]
            seq=splitLine[SEQ]
            if V: # preserve the mapview format
                print aLine,
            else:
                print ">",id,"\n",seq
            aLine=infile.readline()
            if aLine:
                splitLine=aLine.split('\t')
                pos=int(splitLine[POS])
                atedge=(pos2<pos)
            else: # eof
                atedge=True # just to drop through            

    #######################
    ### Find Line Range ###
    #######################
    elif (mode == "lin"): # hunting for a line range

        aLine=True # will be used to check eof
        while aLine and a<lin1:
            odometer(v,per)
            aLine=infile.readline()

        while aLine and a<=lin2:
            odometer(v,per)
            splitLine=aLine.split('\t')
            id=splitLine[ID]
            pos=int(splitLine[POS])
            seq=splitLine[SEQ]
            if V: # preserve the mapview format
                print aLine,
            else:
                print ">",id,"\n",seq
            aLine=infile.readline()

    ##############################
    ### Find Matches from File ###
    ##############################
    elif (mode == "idf"):

        try:
            idfile=open(idf,'r')
        except:
            ex1,ex2,ex3=sys.exc_info()
            msg="Something went wrong while opening the input ID file!\nException ="+str(ex2)+"\n\n"
            sys.stderr.write(msg)
            sys.exit(1)
        
        aLine=infile.readline()
        iLine=True # check eof of ID input file
        lineOK=False # does the line have a good ID?
        
        while (iLine and aLine): # get the next ID
            iLine=idfile.readline()
            if iLine: # necessary to catch eof
                iLine=iLine.split("\n")[0]
                lineOK=(aLine.find(iLine) != -1)
            else:
                aLine=False # bale out as no more ID's
            
            while aLine and not lineOK: # throw away lines when no match
                odometer(v,per)
                aLine=infile.readline()
                lineOK=(aLine.find(iLine) != -1)
                
            while aLine and lineOK: # keep all lines with matches
                if V: # preserve the mapview format
                    print aLine,
                else:   
                    splitLine=aLine.split('\t')
                    id=splitLine[ID]
                    pos=int(splitLine[POS])
                    seq=splitLine[SEQ]
                    print ">",id,"\n",seq
                aLine=infile.readline()
                lineOK=(aLine.find(iLine) != -1)
                
        idfile.close()
                
    else:
        sys.stderr.write("The mode was improperly set; check your parameters!\n\n")

    infile.close()

    if v:
        sys.stderr.write("\nEnd getfv.")
        msg="\nTime: "+str(time())
        sys.stderr.write(msg)
        msg="\nElapsed time: "+str(time()-startTime)+"\n\n"
        sys.stderr.write(msg)

if __name__ == '__main__':
    plac.call(main)