/** ** Copyright (c) 2007-2009 Illumina, Inc. ** ** This software is covered by the "Illumina Genome Analyzer Software ** License Agreement" and the "Illumina Source Code License Agreement", ** and certain third party copyright/licenses, and any user of this ** source file is bound by the terms therein (see accompanying files ** Illumina_Genome_Analyzer_Software_License_Agreement.pdf and ** Illumina_Source_Code_License_Agreement.pdf and third party ** copyright/license notices). ** ** This file is part of the Consensus Assessment of Sequence And VAriation ** (CASAVA) software package. ** ** \file DemultiplexOptions.cpp ** ** \brief Command line options for the tile demultiplexer. ** ** \author Mauricio Varea **/ #include #include #include #include #include #include "config.h" #include "common/Compression.hh" #include "demultiplex/BarcodeTranslationTable.hh" #include "demultiplex/DemultiplexBclsOptions.hh" #include "common/Exceptions.hh" namespace cc = casava::common; casava::demultiplex::DemultiplexBclsOptions::DemultiplexBclsOptions() : runNumber(0) , componentMaxMismatchesString("0") // default to exact match , lane(0) // no default , filterPerRead(false) , needSeparateControls(false) , fastqClusterCount(0) , createMissingFolders(false) , createDemuxIndex(false) , ignoreMissingBcl(false) , ignoreMissingCtrl(false) , withFailedReads(false) { std::string supportedCompressions(boost::join(cc::CompressionFactory::getCompressionList(), ", ")); // short namespaces namespace po = boost::program_options; namespace fs = boost::filesystem; namedOptions_.add_options() ("instrument-name,I" , po::value(&instrumentName), "Instrument name (used in various output files such as qseq and fastq)") ("run-number,R" , po::value(&runNumber), "Run number (used in various output files such as qseq and fastq)") ("flow-cell-id,F" , po::value(&flowCellId)->implicit_value(""), "Optional flowcell identifier (used in various output files such as qseq and fastq)") ("mismatches,m" , po::value(&componentMaxMismatchesString)->default_value(componentMaxMismatchesString), "Number of mismatches allowed in a barcode 0|1|2 [0|1|2]..." " One entry per barcode component. Last entry is propagated to" " all components that don't have one specified.") ("lane,l" , po::value< unsigned int >(&lane), "Lane number (1..8)") ("tile,t" , po::value< std::vector >(&tiles), "Tile numbers") ("barcode-cycles,c" , po::value(&barcodeCyclesString), "List of cycle numbers to use as barcode") ("sample,s " , po::value< std::vector >(&samples), "Bar code. One entry for each sample-dir") ("barcode,b" , po::value< std::vector >(&barcodes), "Bar code. One entry for each sample-dir") ("sample-dir,d" , po::value< std::vector >(&demuxDirNames), "Folder name created under output-dir. One entry for each barcode") ("read-cycles,r" , po::value< std::vector >(&readCyclesStrings), "Lists of cycle numbers to demultiplex for a read. Lists must appear in the order of reads") ("adapter,s " , po::value< std::vector >(&readAdapterSequences), "Adapter sequences to mask. Each entry corresponds to a read-cycles entry") ("input-read-number,n" , po::value< std::vector >(&inputReadNumbers), "Base calls read numbers. Must appear in the same order as read-cycles") ("output-read-number,n" , po::value< std::vector >(&outputReadNumbers), "Ouutput read numbers. Must appear in the same order as read-cycles") ("basecalls-dir,bc" , po::value< fs::path >(&basecallsDirectory), "BaseCalls directory path to the input data") ("intensities-dir,i" , po::value< fs::path >(&intensitiesDirectory), "Path to the Intensities directory") ("positions-dir,p" , po::value< fs::path >(&inputPositionsDir), "Positions files location. If specified assumes single location. otherwise split-by-lane Intensities/L00? structure is assumed") ("positions-format,P" , po::value< std::string >(&inputPositionsFileType)->default_value(".clocs"), "Positions file format\n(supported formats are: {'_pos.txt','.locs','.clocs'})") ("filter-dir,f" , po::value< fs::path >(&inputFilterDir), "Filter files location. If specified assumes single location. otherwise split-by-lane BaseCalls/L00? structure is assumed") ("filter-per-read" , po::value< bool >(&filterPerRead)->zero_tokens(), "Whether the filter files span across the entire run (default), or they are on a read-by-read basis") ("need-separate-controls", po::value< bool >(&needSeparateControls)->zero_tokens(), "Whether the controls are in a separate file, or included in the filter files (default)") ("output-dir,o" , po::value< fs::path >(&outputDirectory)->default_value(fs::path(".") / "Demultiplexed"), "Root folder path for demultiplexed data") ("output-format,O" , po::value(&outputFormat)->default_value("bcl"), "bcl or fastq") ("output-summary-path" , po::value< fs::path >(&outputSummaryFilePath)->default_value(fs::path("")), "Path for summary xml file") ("unknown-barcode" , po::value< std::string >(&unknownBarcode)->default_value("unknown"), "Barcode string to use for files that contain data that does not map to a barcode. " "This is one of the names passed in as --barcode parameters") ("fastq-cluster-count" , po::value(&fastqClusterCount), "Maximum number of clusters per signle output fastq file") ("with-demux-index" , "Produce list of original cycle numbers in each demultiplexed directory") ("create-missing-dirs" , "Create missing output folders instead of generating an errror") ("ignore-missing-bcl", po::value(&ignoreMissingBcl)->default_value(false)->zero_tokens(), "interpret missing *.bcl files as no call") ("ignore-missing-control",po::value(&ignoreMissingCtrl)->default_value(false)->zero_tokens(), "interpret missing control files as not-set control bits") ("compression" , po::value< std::string >(&compression)->default_value("none"), ("Supported compressions: " + supportedCompressions).c_str()) ("gz-level" , po::value< int >(&gzParams.level), "Compression level. Must be equal to zlib::default_compression or a value in the " "range 0-9. The value 0 yields no compression, while 9 yields the best compression " "ratio. Affects compression only.") // ("gz-method" , "Compression method. Must equal zlib::deflated. Affects compression only.") ("gz-window-bits" , po::value< int >(&gzParams.window_bits)->default_value(15), "The base two logarithm of the window size. Must be in the range 8-15") ("gz-mem-level" , po::value< int >(&gzParams.mem_level)->default_value(8), "Specifies the amount of memory to be used. Must be in the range 1-9") ("no-eamss" , "Do not mask the quality values using the EAMSS algorithm") ("with-failed-reads" , "Include reads that do not pass the filer (failed reads).") // ("gz-strategy" , po::value< std::string >(&gzStrategy)->default_value("zlib::default_strategy"), // "Must be zlib::default_strategy, zlib::filtered or zlib::huffman_only.") ; } const std::vector parseIntegers(const std::string &delimitedIntegersString) { std::vector ret; boost::tokenizer > tknzr(delimitedIntegersString, boost::char_separator(" \t,")); std::transform(tknzr.begin(), tknzr.end(), std::back_inserter(ret), static_cast(&boost::lexical_cast)); return ret; } unsigned barcodeComponents(const std::string &barcode) { return 1 + std::count(barcode.begin(), barcode.end(), // gcc 4.1 fails to link the barcodeComponentSeparator_ unless stack temporary is created char(casava::demultiplex::BarcodeTranslationTable::barcodeComponentSeparator_)); } unsigned barcodeLength(const std::string &barcode) { return barcode.length() - std::count(barcode.begin(), barcode.end(), // gcc 4.1 fails to link the barcodeComponentSeparator_ unless stack temporary is created char(casava::demultiplex::BarcodeTranslationTable::barcodeComponentSeparator_)); } const std::string barcodeGeometry(std::string barcode) { std::replace_if(barcode.begin(), barcode.end(), boost::bind(&boost::cref, _1) != casava::demultiplex::BarcodeTranslationTable::barcodeComponentSeparator_, 'X'); return barcode; } void casava::demultiplex::DemultiplexBclsOptions::postProcess(boost::program_options::variables_map &vm) { namespace po = boost::program_options; namespace fs = boost::filesystem; // do not process exceptions if "--help" was given if (vm.count("help")) return; createMissingFolders = !!vm.count("create-missing-dirs"); createDemuxIndex = !!vm.count("with-demux-index"); withEamss = !vm.count("no-eamss"); withFailedReads = !!vm.count("with-failed-reads"); static const char *const validOutputFormats[] = {"bcl", "fastq"}; static const char *const *validOutputFormatsEnd(validOutputFormats + sizeof(validOutputFormats) / sizeof(validOutputFormats[0])); if (validOutputFormatsEnd == std::find(validOutputFormats, validOutputFormatsEnd, outputFormat)){ BOOST_THROW_EXCEPTION(cc::InvalidOptionException("\n *** output format invalid: valid formats are: bcl, fastq, fastq-gz ***\n")); } static const char *const validPosFileTypes[] = {"_pos.txt",".locs",".clocs"}; static const char *const *validPosFileTypesEnd(validPosFileTypes + sizeof(validPosFileTypes) / sizeof(validPosFileTypes[0])); if (validPosFileTypesEnd == std::find(validPosFileTypes, validPosFileTypesEnd, inputPositionsFileType)){ BOOST_THROW_EXCEPTION(cc::InvalidOptionException("\n *** input positions-format invalid: valid formats are: _pos.txt, .locs, .clocs ***\n")); } if ("bcl" != outputFormat && instrumentName.empty() ) { BOOST_THROW_EXCEPTION(cc::InvalidOptionException("\n *** instrument name not valid: please provide instrument name ***\n")); } if ("bcl" != outputFormat && !runNumber) { BOOST_THROW_EXCEPTION(cc::InvalidOptionException("\n *** run number not valid: please provide non-zero run number ***\n")); } if ("fastq" == outputFormat && !fastqClusterCount){ fastqClusterCount = UINT_MAX; std::cerr << "Setting --fastq-cluster-count to " << fastqClusterCount << "\n"; } std::transform(readCyclesStrings.begin(), readCyclesStrings.end(), std::back_inserter(readCycles), parseIntegers); if ( readCycles.empty() ) { BOOST_THROW_EXCEPTION(cc::InvalidOptionException("\n *** --read-cycles not valid: please provide list of cycle numbers to demultiplex ***\n")); } if (readAdapterSequences.size() != readCyclesStrings.size()) { BOOST_THROW_EXCEPTION(cc::InvalidOptionException("\n *** adapter not valid: please provide --adapter sequence to for each --read-cycles argument***\n")); } if (inputReadNumbers.size() != readCycles.size()) { BOOST_THROW_EXCEPTION(cc::InvalidOptionException("\n *** input-read-number: please provide input-read-number for each read-cycles entry ***\n")); } if (outputReadNumbers.size() != readCycles.size()) { BOOST_THROW_EXCEPTION(cc::InvalidOptionException("\n *** output-read-number: please provide output-read-number for each read-cycles entry ***\n")); } if (!barcodeCyclesString.empty()) { barcodeCycles = parseIntegers(barcodeCyclesString); if ( barcodeCycles.empty() ) { BOOST_THROW_EXCEPTION(cc::InvalidOptionException("\n *** barcode cycles not valid: please provide list of cycle numbers that represent barcode ***\n")); } if ( barcodes.empty() ) { BOOST_THROW_EXCEPTION(cc::InvalidOptionException("\n *** sample-barcode not valid: please provide list of valid barcodes ***\n")); } std::vector realBarcodes(barcodes); realBarcodes.erase(std::remove(realBarcodes.begin(), realBarcodes.end(), unknownBarcode), realBarcodes.end()); std::vector::const_iterator invalidBarcodeIterator( std::find_if(realBarcodes.begin(), realBarcodes.end(), boost::bind(&barcodeLength, _1) != barcodeCycles.size())); if (realBarcodes.end() != invalidBarcodeIterator) { BOOST_THROW_EXCEPTION(cc::InvalidOptionException( "\n *** sample-barcode not valid: length of barcode must match the number of barcode-cycles ***\n" "Incorrect barcode: " + *invalidBarcodeIterator)); } if ( demuxDirNames.size() != barcodes.size() ) { BOOST_THROW_EXCEPTION(cc::InvalidOptionException("\n *** sample-dir not valid: number of directories must match the number of barcodes ***\n")); } invalidBarcodeIterator = std::find_if(realBarcodes.begin(), realBarcodes.end(), boost::bind(&barcodeGeometry, _1) != barcodeGeometry(realBarcodes.front())); if (realBarcodes.end() != invalidBarcodeIterator) { BOOST_THROW_EXCEPTION(cc::InvalidOptionException( "\n *** sample-barcode not valid: barcode geometry must match across all barcodes ***\n" "Incorrect barcode: " + *invalidBarcodeIterator)); } componentMaxMismatches = parseIntegers(componentMaxMismatchesString); if ( componentMaxMismatches.empty() ) { BOOST_THROW_EXCEPTION(cc::InvalidOptionException("\n *** mismatch not valid: there must be at least one entry in it ***\n")); } if ( componentMaxMismatches.end() != std::find_if(componentMaxMismatches.begin(), componentMaxMismatches.end(), boost::bind(&boost::cref, _1) > 2u)) { BOOST_THROW_EXCEPTION(cc::InvalidOptionException("\n *** the number of allowed mismatches must be 2 or less ***\n")); } componentMaxMismatches.insert(componentMaxMismatches.end(), barcodeComponents(realBarcodes.front()) - componentMaxMismatches.size(), componentMaxMismatches.back()); } else { if ( demuxDirNames.size() != 1 || barcodes.size() != 1 ) { BOOST_THROW_EXCEPTION(cc::InvalidOptionException("\n *** only one sample-dir and one sample-barcode is allowed when --barcode-cycles is not given ***\n")); } } if ( 1 > lane || lane > 8 ) { BOOST_THROW_EXCEPTION(cc::InvalidOptionException("\n *** lane not valid: please provide an integer in the range '1 <= n <= 8' ***\n")); } if ( tiles.empty() ) { BOOST_THROW_EXCEPTION(cc::InvalidOptionException("\n *** one or more tile numbers must be provided ***\n")); } if ( basecallsDirectory.empty() ) { BOOST_THROW_EXCEPTION(cc::InvalidOptionException("\n *** the basecalls directory can't be empty ***\n")); } else if (! fs::is_directory(basecallsDirectory) ) { BOOST_THROW_EXCEPTION(cc::InvalidOptionException("\n *** the basecalls directory must exist ***\n")); } if ( intensitiesDirectory.empty() ) { BOOST_THROW_EXCEPTION(cc::InvalidOptionException("\n *** the intensities directory can't be empty ***\n")); } else if (! fs::is_directory(intensitiesDirectory) ) { BOOST_THROW_EXCEPTION(cc::InvalidOptionException("\n *** the intensities directory must exist ***\n")); } if ( outputDirectory.empty() ) { BOOST_THROW_EXCEPTION(cc::InvalidOptionException("\n *** the output directory can't be empty ***\n")); } else if (! fs::is_directory(outputDirectory) ) { BOOST_THROW_EXCEPTION(cc::InvalidOptionException("\n *** the output directory must exist ***\n")); } }