/**
 * Project : CASAVA
 * Module : $RCSfile: CasavaOptions.cpp,v $
 * @author : Lukasz Szajkowski
 * Copyright : Copyright (c) Illumina 2008, 2009. All rights reserved.
 *
 ** This software is covered by the "Illumina Genome Analyzer Software
 ** License Agreement" and the "Illumina Source Code License Agreement",
 ** and certain third party copyright/licenses, and any user of this
 ** source file is bound by the terms therein (see accompanying files
 ** Illumina_Genome_Analyzer_Software_License_Agreement.pdf and
 ** Illumina_Source_Code_License_Agreement.pdf and third party
 ** copyright/license notices).
 *
 */

#include "applications/CasavaOptions.hh"
#include "common/Const.hh"

// NOTE(review): the names of the six system headers were missing from the
// text under review. The list below is reconstructed from the symbols this
// file uses (getopt_long, atoi/atof/atol, std::cerr, std::ostringstream,
// std::string, boost::lexical_cast) -- confirm against version control.
#include <getopt.h>

#include <cstdlib>
#include <iostream>
#include <sstream>
#include <string>

#include <boost/lexical_cast.hpp>

namespace ca {
namespace applications {

using namespace std;
using namespace casava::common;

namespace {

/**
 * Converts an option argument to the type of its destination and stores it.
 *
 * The conversion type is deduced from the destination so that the parsed
 * type and the stored type always agree.
 *
 * @param target destination that receives the converted value
 * @param text   textual option argument
 * @throws boost::bad_lexical_cast if text cannot be converted to T
 */
template <typename T>
void assignParsed(T &target, const char *text)
{
    target = boost::lexical_cast<T>(text);
}

} // anonymous namespace

/**
 * Builds an option set holding the built-in default for every tunable.
 */
CasavaOptions::
CasavaOptions() :
    Options(),
    _version("@CASAVA_VERSION_FULL@"),
    _verbose(0),
    isQphred(true),
    isFilterUnanchored(false),
    _srasThreshold(10),
    _prasThreshold(6),
    _spReadThreshold(20.0),
    _numLowInsertSizeSds(5.0),
    _numHighInsertSizeSds(3.0),
    // SVfinder vals
    _minGroupSize(5),
    _maxDistance(10.0),
    _clusterMaxInterBreakPtDist(100),
    _doAdjMerge(false),
    _clusterMaxAdjMergeInterBreakPtDist(10),
    alignScoreThresh(120.),
    is_set_alignScoreThresh(false),
    sdFlankWeight(1),
    is_set_sdFlankWeight(false),
    _reportMode(false),
    _variantOnlyMode(false),
    _indelOnlyMode(false),
    _snpOnlyMode(false),
    _purityFilter(false),
    // AlignCandIndelRead versions :-
    _alignCandIndelReadsMatchScore(5),
    _alignCandIndelReadsMismatchScore(4),
    _alignCandIndelReadsOpenScore(12),
    _alignCandIndelReadsExtendScore(1),
    // AlignContig versions :-
    _alignContigMatchScore(5),
    _alignContigMismatchScore(4),
    _alignContigOpenScore(6),
    _alignContigExtendScore(1),
    // Other AlignContig params :-
    _minScore(30),
    _minContext(0)
{
}

/**
 * Writes the command line help to standard error.
 *
 * @param argv    program arguments; only argv[0] is read, for the usage line
 * @param message optional error text, written first with an "ERROR: " prefix
 *                when it is not empty
 * @return NO_RUN, always
 */
int CasavaOptions::usage(int, char * const argv[], const std::string &message)
{
    if (!message.empty())
    {
        std::cerr << "ERROR: " << message << std::endl;
    }

    // The help is assembled first and written in one statement so that it
    // carries a single "INFO: " prefix.
    ostringstream os;
    os << "Usage: " << argv[0] << " [Options]" << endl
       << "\t-a, --applicationType=TYPE - type of analysis [AlignContig, CheckOrder, IndelFinder, SmallAssembler, ExportExample, Export2Fastq]" << endl
       << "APPLICATION TYPE IndelFinder" << endl
       << "\t-r, --refSequence=PATH - Path to reference genome fasta file (*.fa)" << endl
       << "\t-b, --bamRegion=STRING - Specifies the binned region (chrom:begin-end)" << endl
       << "\t-e, --inReadsPath=PATH - Path to reads file (*.bam)" << endl
       << "\t--spReadThreshold=NUMBER - Spanning read score threshold" << endl
       << "\t--prasThreshold=NUMBER - Paired read alignment score threshold" << endl
       << "\t--srasThreshold=NUMBER - Single read alignment score threshold" << endl
       << "\t--numLowInsertSizeSds=NUMBER - Num SDs below the median at which insert size is considered anomalous" << endl
       << "\t (default: " << _numLowInsertSizeSds << ")" << endl
       << "\t--numHighInsertSizeSds=NUMBER - Num SDs above the median at which insert size is considered anomalous" << endl
       << "\t (default: " << _numHighInsertSizeSds << ")" << endl
       << "\t--filterUnanchored - remove unanchored read pairs from GROUPER input\n"
       << "APPLICATION TYPE AlignCandIndelReads" << endl
       << "\t-e, --inReadsPath=PATH - Path to input file" << endl
       << "\t-o, --outputFilePath=PATH - Path to output file" << endl
       << "\t--sampleStatsPath=PATH - Location of sample stats path, if not provided it will use the range provided in the input (export) file" << endl
       << "\t--alignScoreThresh=NUMBER - Local alignment score threshold to keep a local alignment" << endl
       << "\t (default: " << alignScoreThresh << ")\n"
       << "\t--sdFlankWeight=NUMBER - Number of standard deviations to use if a stats file is provided to extend reference sequence fragment for local alignment" << endl
       << "\t (default: " << sdFlankWeight << ")\n"
       << "APPLICATION TYPE ClusterFinder\n"
       << "\t-e, --inReadsPath=PATH - Path to 'shadow' reads file\n"
       << "\t-o, --outputFilePath=PATH - Path to output file\n"
       << "\t-s, --summaryFilePath=PATH - Path to input stats file\n"
       << "\t--maxDistance=NUMBER - Max distance between group members\n"
       << "\t--clusterMaxInterBreakPtDist=NUM - Max distance between adjacent anom read pair midpoints for same cluster\n"
       << "\t--minGroupSize=NUMBER - Minimal size of a shadow read cluster\n"
       << "\t--spReadThreshold=NUMBER - Spanning read score threshold\n"
       << "\t--numLowInsertSizeSds=NUMBER - Num SDs below the median at which insert size is considered anomalous" << endl
       << "\t (default: " << _numLowInsertSizeSds << ")" << endl
       << "\t--numHighInsertSizeSds=NUMBER - Num SDs above the median at which insert size is considered anomalous" << endl
       << "\t (default: " << _numHighInsertSizeSds << ")" << endl
       << "APPLICATION TYPE ClusterMerger" << endl
       << "\t-u, --rawClustersFilePath=PATH - Path to raw clusters file" << endl
       << "\t-o, --outputFilePath=PATH - Path to output file" << endl
       // << "\t--doAdjMerge - Merge compatible, adjacent clusters"<< endl
       // << "\t--clusterMaxAdjMergeInterBreakPtDist=NUMBER - Max distance between candidate cluster breakpoints" << endl
       << "\t--numLowInsertSizeSds=NUMBER - Num SDs below the median at which insert size is considered anomalous" << endl
       << "\t (default: " << _numLowInsertSizeSds << ")" << endl
       << "\t--numHighInsertSizeSds=NUMBER - Num SDs above the median at which insert size is considered anomalous" << endl
       << "\t (default: " << _numHighInsertSizeSds << ")" << endl
       << "APPLICATION TYPE SmallAssembler" << endl
       << "\t-e, --inReadsPath=PATH - Path to input file" << endl
       << "\t-o, --outputFilePath=PATH - Path to output file" << endl
       << "\t-p, --outputReadsFilePath=PATH - Path to output reads file" << endl
       << "\t--word-length=UINT - initial word length to try for assembly (default " << asmOptions.wordLength << ")\n"
       << "\t--max-word-length=UINT - maximum word length to try for assembly (default " << asmOptions.maxWordLength << ")\n"
       << "\t--min-contig-length=UINT - min contig length to print (default " << asmOptions.minContigLength << ")\n"
       << "\t--min-coverage=UINT - min coverage to extend contig (default " << asmOptions.minCoverage << ")\n"
       << "\t--max-error=FLOAT - max error rate to extend contig (default " << asmOptions.maxError << ")\n"
       << "\t--min-seed-reads=UINT - min number of seeding reads in contig required (default " << asmOptions.minSeedReads << ")\n"
       << "APPLICATION TYPE AlignContig" << endl
       << "\t-r, --refSequence=PATH - Path to reference genome fasta file (*.fa)" << endl
       << "\t-c, --contigFilePath=PATH - Path to assembled contigs fasta file (*.fa)" << endl
       << "\t-o, --outputFilePath=PATH - Path to output file (contigs with alignments)" << endl
       << "\t--alignedContigsPath=PATH - Path to aligned indel contigs\n"
       << "\t--report - show one variant per line (else show full alignment)" << endl
       << "\t--variant - don't show contigs that exactly match reference" << endl
       << "\t--indel-only - only show contigs containing indels" << endl
       << "\t--snp-only - only show contigs containing SNPs (no indels)" << endl
       << "\t--score-match=x - set match score in alignment to x (default " << _alignContigMatchScore << ")" << endl
       << "\t--score-mismatch=x - set mismatch score in alignment to x (default " << _alignContigMismatchScore << ")" << endl
       << "\t--score-gap-open=x - set alignment gap opening score to x (default " << _alignContigOpenScore << ")" << endl
       << "\t--score-gap-extend=x - set alignment gap extension score to x (default " << _alignContigExtendScore << ")" << endl
       << "\t--min-score=x - only show contigs with alignment score at least x (default " << _minScore << ")" << endl
       << "\t--min-context=x - demand at least x exact matching bases either side of variant (default " << _minContext << ")" << endl
       << "APPLICATION TYPE Export2Fastq" << endl
       << "\t-o, --outputFilePath=PATH - Path to output" << endl
       // The long spelling of -2 is "output2FilePath" in the option table of
       // getOptions(); "--outputFilePath" selects -o instead.
       << "\t-2, --output2FilePath=PATH - Path to second output (read2 file in case of paired reads data)" << endl
       << "\t-e, --inReadsPath=PATH - Path to a input" << endl
       << "\t--purityFilter=VALUE - Remove all read with filter=N YES|NO (default NO)" << endl
       << "\t-i, --indexPath=PATH - Path to a fai (fasta index) input file" << endl
       << "\t--summary PATH - output a summary file containing the details of the conversion" << endl
       << "OPTIONAL" << endl
       << "\t--qlogodds - interpret qualities as logodds (i.e. solexa) scores (default: phred)" << endl
       << "\t-h, --help - prints this help and exits" << endl
       << "\t-v, --version - prints version information" << endl
       << "EXAMPLES" << endl
       << "Run export format usage example" << endl
       << "./CASAVA -a ExportExample" << endl
       << endl;

    std::cerr << "INFO: " << os.str() << std::endl;
    return NO_RUN;
}

// register options here
/**
 * Parses the command line with getopt_long and stores each recognised
 * option in the corresponding member.
 *
 * @param argc argument count as received by main
 * @param argv argument vector as received by main
 * @return RUN when parsing reached the end of the command line; NO_RUN when
 *         -v/--version or -h/--help was handled (parsing stops there)
 * @throws UnknownOptionException   for anything getopt_long does not
 *                                  recognise or reports as an error
 * @throws InvalidArgumentException when a value converted with
 *                                  boost::lexical_cast is malformed
 */
int CasavaOptions::getOptions(int argc, char * const argv[])
        throw (UnknownOptionException, MissingOptionException, InvalidArgumentException)
{
    const char *shortOptionList = "p:c:b:r:e:u:a:vhs:o:2:i:";
    const struct option longOptionList[] =
    {
        // AlignContig
        { "alignedContigsPath", required_argument, 0, 36 },
        { "report", no_argument, 0, 1 },
        { "variant", no_argument, 0, 2 },
        { "indel-only", no_argument, 0, 3 },
        { "snp-only", no_argument, 0, 4 },
        { "score-match", required_argument, 0, 5 },
        { "score-mismatch", required_argument, 0, 6 },
        { "score-gap-open", required_argument, 0, 7 },
        { "score-gap-extend", required_argument, 0, 8 },
        { "min-score", required_argument, 0, 9 },
        { "min-context", required_argument, 0, 10 },
        // small assembler options
        // NOTE(review): the registered name carries a trailing 's' while the
        // help text documents "--outputReadsFilePath". The documented
        // spelling is still accepted because getopt_long matches unique
        // prefixes of long option names; kept as is so that neither spelling
        // stops working.
        { "outputReadsFilePaths", required_argument, 0, 'p' },
        { "word-length", required_argument, 0, 11 },
        { "max-word-length", required_argument, 0, 12 },
        { "min-contig-length", required_argument, 0, 13 },
        { "min-coverage", required_argument, 0, 14 },
        { "max-error", required_argument, 0, 15 },
        { "min-seed-reads", required_argument, 0, 16 },
        // indel finder options
        { "spReadThreshold", required_argument, 0, 27 },
        { "prasThreshold", required_argument, 0, 28 },
        { "srasThreshold", required_argument, 0, 29 },
        { "purityFilter", required_argument, 0, 31 },
        { "numLowInsertSizeSds", required_argument, 0, 41 },
        { "numHighInsertSizeSds", required_argument, 0, 42 },
        { "bamRegion", required_argument, 0, 'b' },
        { "filterUnanchored", no_argument, 0, 43 },
        // cluster finder options:
        { "maxDistance", required_argument, 0, 32 },
        { "clusterMaxInterBreakPtDist", required_argument, 0, 38 },
        { "minGroupSize", required_argument, 0, 30 },
        // cluster merger options:
        { "rawClustersFilePath", required_argument, 0, 'u' },
        { "doAdjMerge", no_argument, 0, 39 },
        { "clusterMaxAdjMergeInterBreakPtDist", required_argument, 0, 40 },
        // align candidate indel reads
        { "alignScoreThresh", required_argument, 0, 33 },
        { "sdFlankWeight", required_argument, 0, 34 },
        { "sampleStatsPath", required_argument, 0, 35 },
        { "refSequence", required_argument, 0, 'r' },
        { "contigFilePath", required_argument, 0, 'c' },
        { "inReadsPath", required_argument, 0, 'e' },
        { "applicationType", required_argument, 0, 'a' },
        { "version", no_argument, 0, 'v' },
        { "help", no_argument, 0, 'h' },
        { "qlogodds", no_argument, 0, 37 },
        { "summaryFilePath", required_argument, 0, 's' },
        { "outputFilePath", required_argument, 0, 'o' },
        { "output2FilePath", required_argument, 0, '2' },
        { "indexPath", required_argument, 0, 'i' },
        { 0, 0, 0, 0 }
    };

    // Initialised so that the error path below always indexes a valid entry.
    int longOptionIndex = 0;
    int option;

    try
    {
        // Note: "case 'h'" returns immediately
        while (-1 != (option = getopt_long(argc, argv, shortOptionList, longOptionList, &longOptionIndex)))
        {
            switch (option)
            {
            case 36: alignedContigsPath = optarg; break;
            case 1: _reportMode = true; break;
            case 2: _variantOnlyMode = true; break;
            case 3: _indelOnlyMode = true; break;
            case 4: _snpOnlyMode = true; break;
            case 5: _alignContigMatchScore = atoi(optarg); break;
            case 6: _alignContigMismatchScore = atoi(optarg); break;
            case 7: _alignContigOpenScore = atoi(optarg); break;
            case 8: _alignContigExtendScore = atoi(optarg); break;
            case 9: _minScore = atoi(optarg); break;
            case 10: _minContext = atoi(optarg); break;
            case 11: assignParsed(asmOptions.wordLength, optarg); break;
            case 12: assignParsed(asmOptions.maxWordLength, optarg); break;
            case 13: assignParsed(asmOptions.minContigLength, optarg); break;
            case 14: assignParsed(asmOptions.minCoverage, optarg); break;
            case 15: assignParsed(asmOptions.maxError, optarg); break;
            case 16: assignParsed(asmOptions.minSeedReads, optarg); break;
            case 'b': _bamRegion = optarg; break;
            case 'r': _refSequencePath = optarg; break;
            case 'c': _contigFilePath = optarg; break;
            case 'e': _inputReadsFilePath = optarg; break;
            case 'u': _rawClustersFilePath = optarg; break;
            case 'p': _outputReadsFilePath = optarg; break;
            case 27: _spReadThreshold = atof(optarg); break;
            case 28: _prasThreshold = atoi(optarg); break;
            case 29: _srasThreshold = atoi(optarg); break;
            case 30: _minGroupSize = atoi(optarg); break;
            case 31: _purityFilter = Const::get().isYES(optarg); break;
            case 32: _maxDistance = atof(optarg); break;
            case 33:
                alignScoreThresh = atof(optarg);
                is_set_alignScoreThresh = true;
                break;
            case 34:
                sdFlankWeight = atoi(optarg);
                is_set_sdFlankWeight = true;
                break;
            case 35: sampleStatsPath = optarg; break;
            case 37: isQphred = false; break;
            case 38: _clusterMaxInterBreakPtDist = atol(optarg); break;
            case 39: _doAdjMerge = true; break;
            case 40: _clusterMaxAdjMergeInterBreakPtDist = atoi(optarg); break;
            case 41: assignParsed(_numLowInsertSizeSds, optarg); break;
            case 42: assignParsed(_numHighInsertSizeSds, optarg); break;
            case 43: isFilterUnanchored = true; break;
            case 'a': _applicationType = optarg; break;
            case 'v':
                print_version();
                return NO_RUN;
            case 'h':
                return usage(argc, argv);
            case 's': _summaryFilePath = optarg; break;
            case 'o': _outputFilePath = optarg; break;
            case '2': _output2FilePath = optarg; break;
            case 'i': _indexPath = optarg; break;
            case '?':
            case ':':
            default:
                throw UnknownOptionException("");
            }
        }
    }
    catch (const boost::bad_lexical_cast &)
    {
        // bad_lexical_cast is not listed in this function's exception
        // specification, so it is reported as InvalidArgumentException,
        // which is. Every option converted with lexical_cast is long-only,
        // so longOptionIndex identifies the option being processed.
        throw InvalidArgumentException(string("Invalid value '") + (optarg ? optarg : "")
                                       + "' for option --" + longOptionList[longOptionIndex].name);
    }
    return RUN;
}

/**
 * Checks that the requested application type is one of the known ones.
 *
 * No per-application validation is performed beyond that membership check.
 *
 * @throws InvalidArgumentException when the application type matches none of
 *                                  the names compared below
 */
void CasavaOptions::processOptions() throw (InvalidArgumentException)
{
    const std::string requested = applicationType();
    const bool isKnown =
        requested.compare(Const::get().ExportExample()) == 0
        || requested.compare(Const::get().Export2Fastq()) == 0
        || requested.compare(Const::get().Sam2Export()) == 0
        || requested.compare(Const::get().IndelFinder()) == 0
        || requested.compare(Const::get().ClusterFinder()) == 0
        || requested.compare(Const::get().ClusterMerger()) == 0
        || requested.compare(Const::get().CheckOrder()) == 0
        || requested.compare(Const::get().AlignContig()) == 0
        || requested.compare(Const::get().SmallAssembler()) == 0
        || requested.compare(Const::get().AlignCandIndelReads()) == 0;

    if (!isKnown)
    {
        throw InvalidArgumentException(string("Unknown applicationType ") + requested);
    }
}

/**
 * Placeholder: currently does nothing with its arguments.
 */
void CasavaOptions::getParameters(int /*argc*/, char * const /*argv*/[]) throw (InvalidArgumentException)
{
    //TODO: Implement
}

/// Returns the version string stored at construction.
std::string CasavaOptions::version()
{
    return _version;
}

/// Writes the version string to standard error with an "INFO: " prefix.
void CasavaOptions::print_version()
{
    std::cerr << "INFO: " << version() << std::endl;
}

// getters

/// Returns the value given with -a/--applicationType.
std::string CasavaOptions::applicationType()
{
    return _applicationType;
}

/// Returns the value given with -e/--inReadsPath.
std::string CasavaOptions::inputReadsFilePath()
{
    return _inputReadsFilePath;
}

/// Returns the value given with -p.
std::string CasavaOptions::outputReadsFilePath()
{
    return _outputReadsFilePath;
}

/// Returns the value given with -s/--summaryFilePath.
std::string CasavaOptions::summaryFilePath()
{
    return _summaryFilePath;
}

/// Returns the value given with -o/--outputFilePath.
std::string CasavaOptions::outputFilePath()
{
    return _outputFilePath;
}

/// Returns the value given with -2/--output2FilePath.
std::string CasavaOptions::output2FilePath()
{
    return _output2FilePath;
}

/// Returns the value given with -i/--indexPath.
std::string CasavaOptions::indexPath()
{
    return _indexPath;
}

} // namespace applications
} // namespace ca