#!/usr/bin/env python3 """ rebasecnv.py - Create subsets of REBASE based on types of cutting sites. Synopsis: rebasecnv.py --infile infile --outfile outfile [-outfmt name|number ] [--prototypes] [--commercial] [--ends 5|3|b] [--symmetric a|s|b] @modified: January 5, 2017 @author: Brian Fristensky @contact: frist@cc.umanitoba.ca """ import os import sys #optparse is deprecated in favor of argparse as of Python 2.7. However, # since 2.7 is not always present on many systems, at this writing, # it is safer to stick with optparse for now. It should be easy # to change later, since the syntax is very similar between argparse and optparse. from optparse import OptionParser blib = os.environ.get("BIRCHPYLIB") sys.path.append(blib) from birchlib import Birchmod PROGRAM = "rebasecnv.py : " USAGE = "\n\tUSAGE: rebasecnv.py --infile infile --outfile outfile [-outfmt name|number ] [--prototypes] [--commercial] [--ends 5|3|b] [--symmetry a|s]" BM = Birchmod(PROGRAM, USAGE) DEBUG = True class Parameters: """ Wrapper class for command line parameters """ def __init__(self): """ Initializes arguments: IFN="" OFN="" OUTFMT="bairoch" PROTOTYPES=False COMMERCIAL=False ENDS=["5","3","b"] SYMMETRY=["a","s"] PID = str(os.getpid()) Then calls read_args() to fill in their values from command line """ self.IFN = "" self.OFN = "" self.OUTFMT="bairoch" self.PROTOTYPES=False self.COMMERCIAL=False self.ENDS=["5","3","b"] self.SYMMETRY=["a","s"] self.PID = str(os.getpid()) self.read_args() def read_args(self): """ Read command line arguments into a Parameter object """ parser = OptionParser() parser.add_option("--infile", dest="infile", action="store", type="string", default="", help="input file in Bairoch format") parser.add_option("--outfile", dest="outfile", action="store", type="string", default="", help="output file") parser.add_option("--outfmt", dest="outfmt", action="store", type="string", default="bairoch", help="name or number of output format") parser.add_option("--prototypes", dest="prototypes", action="store_true", default=False, help="Write prototype enzymes only to output") parser.add_option("--commercial", dest="commercial", action="store_true", default=False, help="Write commercially available enzymes only to output") parser.add_option("--ends", dest="ends", action="store", type="string", default="53b", help="Type of ends") parser.add_option("--symmetry", dest="symmetry", action="store", type="string", default="as", help="asymmetric or symmetric sites") (options, args) = parser.parse_args() self.IFN = options.infile self.OFN = options.outfile self.OUTFMT = options.outfmt self.PROTOTYPES = options.prototypes self.COMMERCIAL = options.commercial self.ENDS=list(options.ends) #break up string into a list of characters self.SYMMETRY=list(options.symmetry) #break up string into a list of characters if DEBUG : print('------------ Parameters from command line ------' + '\n') print(' IFN: ' + self.IFN) print(' OFN: ' + self.OFN) print(' OUTFMT: ' + self.OUTFMT) print(' PROTOTYPES: ' + str(self.PROTOTYPES)) print(' COMMERCIAL: ' + str(self.COMMERCIAL)) print(' ENDS: ' + str(self.ENDS)) print(' SYMMETRY: ' + str(self.SYMMETRY)) class Rsite : def __init__(self) : """ Restriction site is a tuple with a recognition sequence and a cutting site. Asymmetric enzymes will have two Rsites, symmetric enzymes will have one. """ self.Rseq = "" self.Rcut = "" class Enzyme : def __init__(self): """ Initialize enzyme """ self.LINES = [] self.ID = "" self.PROTOTYPE = False self.COMMERCIAL = False self.SITE = [] # List of Rsites. Asymmetric enzymes will have two Rsites, # Symmetric enzymes will have one. self.ENDS="" # 5|3|b self.SYMMETRIC=True def readenz(self,INFILE,LINE,OUTFILE) : """ Read the next enzyme from the file. """ while (LINE != "") and (not LINE.startswith('ID')) : LINE=INFILE.readline() while not LINE.startswith('//') : self.LINES.append(LINE) LINE = INFILE.readline() def getID(self,ELINE) : """ Get the ID from ELINE. """ return ELINE[5:].strip() def getCOMMERCIAL(self,ELINE) : """ Get COMMERCIAL from ELINE. """ COM=ELINE[5:].strip() if COM == "." : return False else : return True def getPROTO(self,ELINE) : """ Get PROTOTYPE from ELINE. """ PID = ELINE[5:].strip() if PID == self.ID : return True def getRSlist(self,ELINE) : """ Get a list of Restriction sites from ELINE. Symmetric cutters will have a single site eg. RS GAATTC, 1; A asymmetric will have two eg. RS GAYNNNNNVTC, 24; GABNNNNNRTC, -8; """ RSlist = [] RStr = ELINE[5:].strip() # remove terminal semicolon if RStr.endswith(';') : RStr = RStr[:-1] # parse the rest. seq and cutting site TOKENS1 = RStr.split(';') for T in TOKENS1 : S = Rsite () TOKENS2 = T.split(',') S.Rseq = TOKENS2[0].strip() S.Rcut = TOKENS2[1].strip() RSlist.append(S) return RSlist def isSymmetric(self) : """ Test recognition sequence for symmetry. REBASE gives us a single recognition sequence for symmetric sequences, and two sequences for assymetric sites. """ if len(self.SITE) == 1 : return True else : return False def EndType(self) : """ Determine the type of fragment ends. Returns 5, 3 or b. """ FragEnds = "" if self.SYMMETRIC : Plen = len(self.SITE[0].Rseq) #CutsAfter = Plen + int(self.SITE[0].Rcut) CutsAfter = int(self.SITE[0].Rcut) if CutsAfter < (Plen/2) : FragEnds = "5" elif CutsAfter > (Plen/2) : FragEnds = "3" else: FragEnds = "b" else : Cut = int(self.SITE[0].Rcut) CutOpp = int(self.SITE[1].Rcut) if Cut < CutOpp : FragEnds = "5" elif Cut > CutOpp : FragEnds = "3" else : FragEnds = "b" return FragEnds def writeenz(self,OUTFILE) : """ Write enzyme to output file. """ OUTFILE.write('\n') for LINE in self.LINES : OUTFILE.write(LINE) OUTFILE.write('//' + '\n') #======================== MAIN PROCEDURE ========================== def main(): """ Called when not in documentation mode. """ P = Parameters () INFILE=open(P.IFN,'r') OUTFILE=open(P.OFN,'w') #Copy header comment lines from infile to outfile LINE = INFILE.readline() while LINE.startswith('CC') : OUTFILE.write(LINE) LINE = INFILE.readline() # Read an enzyme at a time. For each enzyme, find out whether it # meets criteria for prototypes, commercial, symmetry and ends. # If so, print the enzyme to the output. E = Enzyme () while LINE != "" : E.readenz(INFILE,LINE,OUTFILE) #OUTFILE.write(LINE) # Get criteria information from the enzyme OKAY = True for ELINE in E.LINES : FIELD=ELINE[0:2] if FIELD == 'ID' : # id E.ID = E.getID(ELINE) elif FIELD == 'PT' : # prototype E.PROTOTYPE = E.getPROTO(ELINE) elif FIELD == 'RS' : #restriction site E.SITE = E.getRSlist(ELINE) elif FIELD == 'CR' : #commercial availability E.COMMERCIAL = E.getCOMMERCIAL(ELINE) # Ignore all enzymes that have unknown recognition sequences or cutting sites for S in E.SITE : if S.Rseq == "?" or S.Rcut == "?" : OKAY = False if OKAY : E.SYMMETRIC = E.isSymmetric() E.ENDS = E.EndType() # OKAY is False if any of the criteria from enzyme E match fail to match # criteria from command line parameters P if P.PROTOTYPES and not E.PROTOTYPE : OKAY = False if P.COMMERCIAL and not E.COMMERCIAL : OKAY = False if P.SYMMETRY == ["s"] and not E.SYMMETRIC : OKAY = False if P.SYMMETRY == ["a"] and E.SYMMETRIC : OKAY = False if not E.ENDS in P.ENDS : OKAY = False # If enzyme meets all criteria, print it to output if OKAY : if DEBUG: print(E.ID) for S in E.SITE : print(S.Rseq + ' ' + S.Rcut) E.writeenz(OUTFILE) # Get the next enzyme LINE = INFILE.readline() E.__init__() INFILE.close() OUTFILE.close() BM.exit_success() if (BM.documentor() or "-test" in sys.argv): pass else: main()