#!/usr/bin/env python3
"""
guesspairs.py - Given a list of sequencing read files, make a guess as to
which pairs of files should be grouped together as left and right read
files. Output is a .tsv file. Pairs of files are written as two fields on a
line. Unpaired files are written as output lines with a single field.

Synopsis:

    guesspairs.py --infile FILE --ltag STRING --rtag STRING
                  [--extension STRING] --outfile FILE

    --infile     file containing one filename per line
    --ltag       part of the filename that is only found in left read files
    --rtag       part of the filename that is only found in right read files
    --extension  If a file extension is specified, only files with that file
                 extension will be included in the output. Files with other
                 extensions (eg. .html) will be ignored at input. STRING can
                 be a comma-separated list of file extensions.
    --outfile    output in TAB-separated (.tsv) format. Paired end files are
                 together on an output line, separated by TAB. Unpaired
                 files are each on a separate line.

How pairs are guessed:

    Two names are considered mates when one contains the left tag, the other
    contains the right tag, and the text preceding the last occurrence of
    the tag is identical in both names.

EXAMPLE:

    Given the input file names.in

        illumina_control_L1.fq.gz
        illumina_control_R2.fq.gz
        illumina_treatment_L1.fq.gz
        illumina_treatment_R2.fq.gz
        iontorrent_control1.fq.gz
        iontorrent_control2.fq.gz

    Command:

        guesspairs.py --infile names.in --ltag L1 --rtag R2 \\
                      --extension .fq.gz --outfile names.grouped

    will create a file called names.grouped ([TAB] marks a TAB character):

        illumina_control_L1.fq.gz[TAB]illumina_control_R2.fq.gz
        illumina_treatment_L1.fq.gz[TAB]illumina_treatment_R2.fq.gz
        iontorrent_control1.fq.gz
        iontorrent_control2.fq.gz

    It may still be necessary to edit this file to get a namefile that can
    be used for genome or transcriptome assembly.

@modified: April 24, 2018
@author: Brian Fristensky
@contact: Brian.Fristensky@umanitoba.ca
"""

# optparse is deprecated in favor of argparse as of Python 2.7. However,
# since 2.7 is not always present on many systems, at this writing, it is
# safer to stick with optparse for now. It should be easy to change later,
# since the syntax is very similar between argparse and optparse.
from optparse import OptionParser
import os
import re
import sys

#blib = os.environ.get("BIRCHPYLIB")
#sys.path.append(blib)
#from birchlib import Birchmod

PROGRAM = "guesspairs.py : "
USAGE = "\n\tUSAGE: --infile --ltag --rtag [--extension ] --outfile "

DEBUG = False
if DEBUG:
    print('guesspairs: Debugging mode on')

#BM = Birchmod(PROGRAM, USAGE)


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class Parameters:
    """
    Wrapper class for command line parameters.

    Attributes:
        IFN        name of the input file (one filename per line)
        LTAG       string found only in left read filenames
        RTAG       string found only in right read filenames
        EXTENSION  list of accepted file extensions; empty list = accept all
        OFN        name of the .tsv output file
    """

    def __init__(self, argv=None):
        """
        Set every parameter to its default, then parse the command line.

        argv - optional list of argument strings to parse instead of
               sys.argv[1:]. The default (None) reads the real command line.
        """
        self.IFN = ""
        self.LTAG = ""
        self.RTAG = ""
        self.EXTENSION = []
        self.OFN = ""
        self.read_args(argv)

        if DEBUG:
            print('------------ Parameters from command line ------')
            print(' IFN: ' + self.IFN)
            print(' LTAG: ' + self.LTAG)
            print(' RTAG: ' + self.RTAG)
            print(' EXTENSION: ' + str(self.EXTENSION))
            print(' OFN: ' + self.OFN)
            print()

    def read_args(self, argv=None):
        """
        Read command line arguments into this Parameters object.

        argv - optional list of argument strings; None means sys.argv[1:]
               (the optparse default).
        """
        parser = OptionParser()
        parser.add_option("--infile", dest="ifn", action="store", default="",
                          help="input file, one filename per line")
        parser.add_option("--ltag", dest="ltag", action="store", default="",
                          help="string found in left read filenames")
        parser.add_option("--rtag", dest="rtag", action="store", default="",
                          help="string found in right read filenames")
        parser.add_option("--extension", dest="extension", action="store", default="",
                          help="common file extension for all read files")
        parser.add_option("--outfile", dest="ofn", action="store", default="",
                          help="output file in .tsv format")

        (options, args) = parser.parse_args(argv)
        self.IFN = options.ifn
        self.LTAG = options.ltag
        self.RTAG = options.rtag
        # "".split(",") yields [""], and a trailing comma yields an empty
        # entry; an empty extension matches every name, so drop blanks to
        # keep "no filter" explicit as an empty list.
        self.EXTENSION = [ext.strip() for ext in options.extension.split(",")
                          if ext.strip()]
        self.OFN = options.ofn


# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
class SeqFiles:
    """
    Read, write and process lists of sequencing read filenames.

    Attributes:
        RawReads       names read from the input file, not yet classified
        PairedReads    list of [left name, right name] pairs
        UnpairedReads  names for which no mate was found
    """

    def __init__(self):
        """
        Start with empty lists of raw, paired and unpaired read names.
        """
        self.RawReads = []
        self.PairedReads = []
        self.UnpairedReads = []

    def ReadNames(self, FN, EXT):
        """
        Read input file with one name per line, and add these to RawReads.

        FN  - name of the input file
        EXT - list of file extensions; when it contains at least one
              non-empty extension, only names ending with one of them are
              kept. Blank lines are always skipped.
        """
        # Empty strings would match every name, so they do not count as a
        # filter. str.endswith accepts a tuple of alternatives.
        suffixes = tuple(ext for ext in EXT if ext)
        with open(FN, "r") as infile:
            for line in infile:
                name = line.strip()
                if not name:
                    continue
                # Ignore names that do not end in an accepted file extension
                if suffixes and not name.endswith(suffixes):
                    continue
                self.RawReads.append(name)
        if DEBUG:
            print('ReadNames: ' + str(self.RawReads))

    def MovePair(self, L, R):
        """
        Move a read pair from self.RawReads to self.PairedReads.

        L - index in RawReads of the left read name
        R - index in RawReads of the right read name
        """
        LeftName = self.RawReads[L]
        RightName = self.RawReads[R]
        # Remove by value: removing by index would shift the second index.
        self.RawReads.remove(LeftName)
        self.RawReads.remove(RightName)
        self.PairedReads.append([LeftName, RightName])

    def MoveSingle(self, I):
        """
        Move the read at index I from self.RawReads to self.UnpairedReads.
        """
        self.UnpairedReads.append(self.RawReads.pop(I))

    def FindPairs(self, LTAG, RTAG):
        """
        Iterate through RawReads and move filenames either to PairedReads or
        UnpairedReads.

        LTAG - string found only in left read filenames
        RTAG - string found only in right read filenames

        A name is matched to a mate when the text before the LAST occurrence
        of its tag equals the text before the last occurrence of the
        opposite tag in the mate. An empty tag never matches any name.
        On return RawReads is empty.
        """
        self.RawReads.sort()  # pre-sorting puts mates close to each other

        def TagPosition(name, tag):
            """Index of the last occurrence of tag in name, or -1."""
            # rfind("") returns len(name), which would make every name look
            # tagged; treat an empty tag as "not present".
            if not tag:
                return -1
            return name.rfind(tag)

        def SeekMate(UniquePart, TAG):
            """
            Return the index (>= 1) of the first remaining name that
            contains TAG and whose text before TAG equals UniquePart, or -1
            if there is none. Index 0 is the name currently being examined.
            """
            for J in range(1, len(self.RawReads)):
                SecondName = self.RawReads[J]
                TagIndex = TagPosition(SecondName, TAG)
                # A candidate without the tag must be skipped explicitly;
                # slicing with -1 would compare all but its last character.
                if TagIndex > -1 and SecondName[:TagIndex] == UniquePart:
                    return J
            return -1

        # The exit condition occurs when all names have been moved from
        # self.RawReads to either self.PairedReads or self.UnpairedReads
        while self.RawReads:
            I = 0
            FirstName = self.RawReads[I]
            LFOUND = TagPosition(FirstName, LTAG)
            RFOUND = TagPosition(FirstName, RTAG)
            if DEBUG:
                print("LFOUND: " + str(LFOUND) + " RFOUND: " + str(RFOUND))

            if LFOUND > -1:
                # First name is a left read: look for its right mate.
                if DEBUG:
                    print(FirstName[:LFOUND])
                J = SeekMate(FirstName[:LFOUND], RTAG)
                if DEBUG:
                    print("J: " + str(J))
                if J > -1:
                    self.MovePair(I, J)
                else:
                    self.MoveSingle(I)
            elif RFOUND > -1:
                # First name is a right read: look for its left mate.
                if DEBUG:
                    print(FirstName[:RFOUND])
                J = SeekMate(FirstName[:RFOUND], LTAG)
                if J > -1:
                    self.MovePair(J, I)
                else:
                    self.MoveSingle(I)
            else:
                self.MoveSingle(I)

    def WriteOutput(self, OFN):
        """
        Write the results to OFN in TAB-separated format: one line per pair
        (left name, TAB, right name), followed by one line per unpaired
        name.
        """
        TAB = "\t"
        NL = "\n"
        with open(OFN, "w") as outfile:
            for pair in self.PairedReads:
                outfile.write(pair[0] + TAB + pair[1] + NL)
            for single in self.UnpairedReads:
                outfile.write(single + NL)


#======================== MAIN PROCEDURE ==========================
def main():
    """
    Called when not in documentation mode.

    Reads the command line, groups the names listed in the input file and
    writes the grouped names to the output file.
    """
    # Read parameters from command line
    P = Parameters()

    # Fail with a readable message rather than a traceback from open("").
    if not P.IFN or not P.OFN:
        sys.exit(PROGRAM + "both --infile and --outfile are required" + USAGE)

    SF = SeqFiles()
    SF.ReadNames(P.IFN, P.EXTENSION)
    SF.FindPairs(P.LTAG, P.RTAG)
    SF.WriteOutput(P.OFN)


if __name__ == "__main__":
    main()

#else:
    #used to generate documentation
#    import doctest
#    doctest.testmod()

#if (BM.documentor() or "-test" in sys.argv):
#    pass
#else:
#    main()