#include #include #include "ProgramOptionsGenerator.hpp" #include "SalmonDefaults.hpp" namespace salmon { po::options_description ProgramOptionsGenerator::getBasicOptions(SalmonOpts& sopt) { using std::string; using std::vector; po::options_description basic("\n" "basic options"); basic.add_options()("version,v", "print version string") ("help,h", "produce help message") ("output,o", po::value()->required(), "Output quantification directory.") ("seqBias", po::bool_switch(&(sopt.biasCorrect))->default_value(salmon::defaults::seqBiasCorrect), "Perform sequence-specific bias correction.") ("gcBias", po::bool_switch(&(sopt.gcBiasCorrect))->default_value(salmon::defaults::gcBiasCorrect), "[beta for single-end reads] Perform fragment GC bias correction.") ("posBias", po::bool_switch(&(sopt.posBiasCorrect))->default_value(salmon::defaults::posBiasCorrect), "Perform positional bias correction.") ("threads,p", po::value(&(sopt.numThreads))->default_value(sopt.numThreads), "The number of threads to use concurrently.") ("incompatPrior", po::value(&(sopt.incompatPrior))->default_value(salmon::defaults::incompatPrior), "This option " "sets the prior probability that an alignment that disagrees with the " "specified " "library type (--libType) results from the true fragment origin. " "Setting this to 0 " "specifies that alignments that disagree with the library type should be " "\"impossible\", " "while setting it to 1 says that alignments that disagree with the " "library type are no " "less likely than those that do") ("geneMap,g", po::value(), "File containing a mapping of transcripts to genes. If this file is " "provided " "salmon will output both quant.sf and quant.genes.sf files, where the " "latter " "contains aggregated gene-level abundance estimates. The transcript to " "gene mapping " "should be provided as either a GTF file, or a in a simple tab-delimited " "format " "where each line contains the name of a transcript and the gene to which " "it belongs " "separated by a tab. The extension of the file is used to determine how " "the file " "should be parsed. Files ending in \'.gtf\', \'.gff\' or \'.gff3\' are " "assumed to " "be in GTF " "format; files with any other extension are assumed to be in the simple " "format. In GTF / GFF format, the \"transcript_id\" is assumed to " "contain the " "transcript identifier and the \"gene_id\" is assumed to contain the " "corresponding " "gene identifier.") ("auxTargetFile", po::value(&(sopt.auxTargetFile))->default_value(salmon::defaults::auxTargetFile), "A file containing a list of \"auxiliary\" targets. These are valid targets (i.e., not decoys) to " "which fragments are allowed to map and be assigned, and which will be quantified, but for which " "auxiliary models like sequence-specific and fragment-GC bias correction should not be applied." ) ("meta", po::bool_switch(&(sopt.meta))->default_value(salmon::defaults::metaMode), "If you're using Salmon on a metagenomic dataset, consider setting this " "flag to disable parts of the abundance estimation model that make less " "sense for metagenomic data."); // use rich eq classes by default sopt.noRichEqClasses = salmon::defaults::noRichEqClasses; // mapping cache has been deprecated sopt.disableMappingCache = salmon::defaults::disableMappingCache; return basic; } po::options_description ProgramOptionsGenerator::getMappingSpecificOptions(SalmonOpts& sopt) { using std::string; using std::vector; po::options_description mapspec("\n" "options specific to mapping mode"); mapspec.add_options() ("discardOrphansQuasi", po::bool_switch(&(sopt.discardOrphansQuasi))->default_value(salmon::defaults::discardOrphansQuasi), "[selective-alignment mode only] : Discard orphan mappings in selective-alignment " "mode. If this flag is passed " "then only paired mappings will be considered toward quantification " "estimates. The default behavior is " "to consider orphan mappings if no valid paired mappings exist. This " "flag is independent of the option to " "write the orphaned mappings to file (--writeOrphanLinks).") /* ("noSA", po::bool_switch(&(sopt.disableSA))->default_value(salmon::defaults::disableSA), "[Not currently supported] : Disable selective-alignment in favor of basic quasi-mapping. " "If this flag is passed, selective-alignment and alignment scoring of reads will be disabled." )*/ ("validateMappings", po::bool_switch(&(sopt.validateMappings))->default_value(salmon::defaults::validateMappings), "[*deprecated* (no effect; selective-alignment is the default)]") ("consensusSlack", po::value(&(sopt.consensusSlack))->default_value(salmon::defaults::consensusSlack), "[selective-alignment mode only] : The amount of slack allowed in the selective-alignment " "filtering mechanism. If this is set to a fraction, X, greater than 0 (and in [0,1)), then " "uniMEM chains with scores below (100 * X)% of the best chain score for a read, and read pairs " "with a sum of chain scores below (100 * X)% of the best chain score for a read pair " "will be discounted as a mapping candidates. The default value of this option is 0.35." ) ("preMergeChainSubThresh", po::value(&(sopt.pre_merge_chain_sub_thresh))->default_value(salmon::defaults::pre_merge_chain_sub_thresh), "[selective-alignment mode only] : The threshold of sub-optimal chains, compared to the best chain on a given " "target, that will be retained and passed to the next phase of mapping. Specifically, if the best chain " "for a read (or read-end in paired-end mode) to target t has score X_t, then all chains for this read with " "score >= X_t * preMergeChainSubThresh will be retained and passed to subsequent mapping phases. This value " "must be in the range [0, 1]." ) ("postMergeChainSubThresh", po::value(&(sopt.post_merge_chain_sub_thresh))->default_value(salmon::defaults::post_merge_chain_sub_thresh), "[selective-alignment mode only] : The threshold of sub-optimal chain pairs, compared to the best chain pair " "on a given target, that will be retained and passed to the next phase of mapping. This is different than " "preMergeChainSubThresh, because this is applied to pairs of chains (from the ends of paired-end reads) after " "merging (i.e. after checking concordancy constraints etc.). Specifically, if the best chain pair " "to target t has score X_t, then all chain pairs for this read pair with score " ">= X_t * postMergeChainSubThresh will be retained and passed to subsequent mapping phases. This value " "must be in the range [0, 1]. Note: This option is only meaningful for paired-end libraries, and is ignored " "for single-end libraries." ) ("orphanChainSubThresh", po::value(&(sopt.orphan_chain_sub_thresh))->default_value(salmon::defaults::orphan_chain_sub_thresh), "[selective-alignment mode only] : This threshold sets a global sub-optimality threshold for chains " "corresponding to orphan mappings. That is, if the merging procedure results in no concordant mappings " "then only orphan mappings with a chain score >= orphanChainSubThresh * bestChainScore will be " "retained and passed to subsequent mapping phases. This value must be in the range [0, 1]. Note: This " "option is only meaningful for paired-end libraries, and is ignored for single-end libraries." ) ("scoreExp", po::value(&sopt.scoreExp)->default_value(salmon::defaults::scoreExp), "[selective-alignment mode only] : The factor by which sub-optimal alignment scores are " "downweighted to produce a probability. If the best alignment score for the current read is S, and the score " "for a particular alignment is w, then the probability will be computed porportional to exp( - scoreExp * (S-w) )." ) ("minScoreFraction", po::value(&sopt.minScoreFraction), "[selective-alignment mode only] : The fraction of the optimal possible alignment score that a " "mapping must achieve in order to be considered \"valid\" --- should be in (0,1].\n" "Salmon Default 0.65 and Alevin Default 0.87" ) ("mismatchSeedSkip", po::value(&sopt.mismatchSeedSkip)->default_value(salmon::defaults::mismatchSeedSkip), "[selective-alignment mode only] : After a k-mer hit is extended to a uni-MEM, the uni-MEM extension " "can terminate for one of 3 reasons; the end of the read, the end of the unitig, or a mismatch. If the " "extension ends because of a mismatch, this is likely the result of a sequencing error. To avoid looking " "up many k-mers that will likely fail to be located in the index, the search procedure skips by a factor " "of mismatchSeedSkip until it either (1) finds another match or (2) is k-bases past the mismatch position. " "This value controls that skip length. A smaller value can increase sensitivity, while a larger value " "can speed up seeding." ) ("disableChainingHeuristic", po::bool_switch(&(sopt.disableChainingHeuristic))->default_value(salmon::defaults::disableChainingHeuristic), "[selective-alignment mode only] : By default, the heuristic of (Li 2018) is implemented, which terminates " "the chaining DP once a given number of valid backpointers are found. This speeds up the seed (MEM) " "chaining step, but may result in sub-optimal chains in complex situations (e.g. sequences with many repeats and " "overlapping repeats). Passing this flag will disable the chaining heuristic, and perform the full chaining " "dynamic program, guaranteeing the optimal chain is found in this step." ) ("decoyThreshold", po::value(&sopt.decoyThreshold)->default_value(salmon::defaults::decoyThreshold), "[selective-alignment mode only] : For an alignemnt to an annotated transcript to be considered invalid, it must have an alignment " "score < (decoyThreshold * bestDecoyScore). A value of 1.0 means that any alignment strictly worse than " "the best decoy alignment will be discarded. A smaller value will allow reads to be allocated to transcripts " "even if they strictly align better to the decoy sequence." ) ("ma", po::value(&sopt.matchScore)->default_value(salmon::defaults::matchScore), "[selective-alignment mode only] : The value given to a match between read and reference nucleotides " "in an alignment." ) ("mp", po::value(&sopt.mismatchPenalty)->default_value(salmon::defaults::mismatchPenalty), "[selective-alignment mode only] : The value given to a mis-match between read and reference nucleotides " "in an alignment." ) ("go", po::value(&sopt.gapOpenPenalty)->default_value(salmon::defaults::gapOpenPenalty), "[selective-alignment mode only] : The value given to a gap opening in an alignment." ) ("ge", po::value(&sopt.gapExtendPenalty)->default_value(salmon::defaults::gapExtendPenalty), "[selective-alignment mode only] : The value given to a gap extension in an alignment." ) ("bandwidth", po::value(&sopt.dpBandwidth)->default_value(salmon::defaults::dpBandwidth), "[selective-alignment mode only] : The value used for the bandwidth passed to ksw2. A smaller " "bandwidth can make the alignment verification run more quickly, but could possibly miss valid alignments." ) ("allowDovetail", po::bool_switch(&(sopt.allowDovetail))->default_value(salmon::defaults::allowDovetail), "[selective-alignment mode only] : allow dovetailing mappings." ) ("recoverOrphans", po::bool_switch(&(sopt.recoverOrphans))->default_value(salmon::defaults::recoverOrphans), "[selective-alignment mode only] : Attempt to recover the mates of orphaned reads. This uses edlib for " "orphan recovery, and so introduces some computational overhead, but it can improve sensitivity." ) ("mimicBT2", // horrible flag name, think of something better po::bool_switch(&(sopt.mimicBT2))->default_value(salmon::defaults::mimicBT2), "[selective-alignment mode only] : Set flags to mimic parameters similar to " "Bowtie2 with --no-discordant and --no-mixed flags. This increases disallows dovetailing reads, and " "discards orphans. Note, this does not impose the very strict parameters assumed by RSEM+Bowtie2, " "like gapless alignments. For that behavior, use the --mimiStrictBT2 flag below." ) ("mimicStrictBT2", // horrible flag name, think of something better po::bool_switch(&(sopt.mimicStrictBT2))->default_value(salmon::defaults::mimicStrictBT2), "[selective-alignment mode only] : Set flags to mimic the very strict parameters used by " "RSEM+Bowtie2. This increases --minScoreFraction to 0.8, disallows dovetailing reads, " "discards orphans, and disallows gaps in alignments." ) ("softclip", po::bool_switch(&(sopt.softclip))->default_value(salmon::defaults::softclip), "[selective-alignment mode only (experimental)] : Allos soft-clipping of reads during selective-alignment. If this " "option is provided, then regions at the beginning or end of the read can be withheld from alignment " "without any effect on the resulting score (i.e. neither adding nor removing from the score). This " "will drastically reduce the penalty if there are mismatches at the beginning or end of the read " "due to e.g. low-quality bases or adapters. NOTE: Even with soft-clipping enabled, the read must still " "achieve a score of at least minScoreFraction * maximum achievable score, where the maximum achievable " "score is computed based on the full (un-clipped) read length." ) ("softclipOverhangs", po::bool_switch(&(sopt.softclipOverhangs))->default_value(salmon::defaults::softclipOverhangs), "[selective-alignment mode only] : Allow soft-clipping of reads that overhang the beginning or ends " "of the transcript. In this case, the overhaning section of the read will simply be unaligned, and " "will not contribute or detract from the alignment score. The default policy is to force an end-to-end " "alignment of the entire read, so that overhanings will result in some deletion of nucleotides from the " "read." ) ("fullLengthAlignment", po::bool_switch(&(sopt.fullLengthAlignment))->default_value(salmon::defaults::fullLengthAlignment), "[selective-alignment mode only] : Perform selective alignment over the full length of the read, beginning " "from the (approximate) initial mapping location and using extension alignment. This is in contrast with the " "default behavior which is to only perform alignment between the MEMs in the optimal chain (and before the " "first and after the last MEM if applicable). The default strategy forces the MEMs to belong to the alignment, " "but has the benefit that it can discover indels prior to the first hit shared between the read and reference. Except in " "very rare circumstances, the default mode should be more accurate." ) ("hardFilter", po::bool_switch(&(sopt.hardFilter))->default_value(salmon::defaults::hardFilter), "[selective-alignemnt mode only] : Instead of weighting mappings by their alignment score, " "this flag will discard any mappings with sub-optimal alignment score. The default option of soft-filtering " "(i.e. weighting mappings by their alignment score) usually yields slightly more accurate abundance estimates " "but this flag may be desirable if you want more accurate 'naive' equivalence classes, rather " "than range factorized equivalence classes." ) ("minAlnProb", po::value(&(sopt.minAlnProb))->default_value(salmon::defaults::minAlnProb), "[selective-alignment mode only] : Any mapping whose alignment probability (as computed by " "P(aln) = exp(-scoreExp * difference from best mapping score) is less than minAlnProb will " "not be considered as a valid alignment for this read. The goal of this flag is to remove " "very low probability alignments that are unlikely to have any non-trivial effect on the " "final quantifications. Filtering such alignments reduces the number of variables that need " "to be considered and can result in slightly faster inference and 'cleaner' equivalence classes." ) ("writeMappings,z", po::value(&sopt.qmFileName) ->default_value(salmon::defaults::quasiMappingDefaultFile) ->implicit_value(salmon::defaults::quasiMappingImplicitFile), "If this option is provided, then the selective-alignment " "results will be written out in SAM-compatible " "format. By default, output will be directed to " "stdout, but an alternative file name can be " "provided instead.") ("writeQualities", po::bool_switch(&(sopt.writeQualities))->default_value(salmon::defaults::writeQualities), "This flag only has meaning if mappings are being written (with --writeMappings/-z). " "If this flag is provided, then the output SAM file will contain quality strings as well as " "read sequences. Note that this can greatly increase the size of the output file." ) ("hitFilterPolicy", po::value(&sopt.hitFilterPolicyStr)->default_value(salmon::defaults::hitFilterPolicyStr), "[selective-alignment mode only] : Determines the policy by which hits are filtered in selective alignment. Filtering hits after " "chaining (the default) is more sensitive, but more computationally intensive, because it performs " "the chaining dynamic program for all hits. Filtering before chaining is faster, but some true hits " "may be missed. The options are BEFORE, AFTER, BOTH and NONE." ); sopt.disableSA = salmon::defaults::disableSA; sopt.fasterMapping = salmon::defaults::fasterMapping; return mapspec; } po::options_description ProgramOptionsGenerator::getMappingInputOptions(SalmonOpts& sopt) { using std::string; using std::vector; po::options_description mapin("\n" "mapping input options"); mapin.add_options() ("libType,l", po::value()->required(), "Format string describing the library type") ("index,i", po::value()->required(), "salmon index") ("unmatedReads,r", po::value>(&(sopt.unmatedReadFiles))->multitoken(), "List of files containing unmated reads of (e.g. single-end reads)") ("mates1,1", po::value>(&(sopt.mate1ReadFiles))->multitoken(), "File containing the #1 mates") ("mates2,2", po::value>(&(sopt.mate2ReadFiles))->multitoken(), "File containing the #2 mates"); return mapin; } po::options_description ProgramOptionsGenerator::getAlignmentInputOptions(SalmonOpts& sopt) { using std::string; using std::vector; po::options_description alignin("\n" "alignment input options"); alignin.add_options() ("discardOrphans", po::bool_switch(&(sopt.discardOrphansAln))->default_value(salmon::defaults::discardOrphansAln), "[alignment-based mode only] : Discard orphan alignments in the input " ". If this flag is passed, then only paired alignments will be " "considered toward quantification estimates. The default behavior is " "to consider orphan alignments if no valid paired mappings exist.") ("libType,l", po::value()->required(), "Format string describing the library type") ("alignments,a", po::value>()->multitoken(), "input alignment (BAM) file(s).") ("eqclasses,e", po::value(), "input salmon weighted equivalence class file.") ("targets,t", po::value(), "FASTA format file containing target transcripts.") ("ont", po::bool_switch(&(sopt.oxfordNanoporeModel))->default_value(false)->notifier([&sopt](const bool& v) { if(v) sopt.noLengthCorrection = true; }), "use alignment model for Oxford Nanopore long reads"); return alignin; } po::options_description ProgramOptionsGenerator::getAlevinDevsOptions() { po::options_description alevindevs("\n" "alevin-developer Options"); alevindevs.add_options() ( "indropV2", po::bool_switch()->default_value(alevin::defaults::isInDrop), "Use inDropV2 Single Cell protocol for the library. Must specify w1 too.") ( "w1", po::value(), "Must be used in conjunction with inDrop;") ( "dumpBarcodeEq", po::bool_switch()->default_value(alevin::defaults::dumpBarcodeEq), "Dump JointEqClas with umi-barcode count.") ( "iupac,u",po::value(), "iupac code for cell-level barcodes.") ( "vbemPrior", po::value(), "a mtx file containing VBEM priors") ( "vbemNorm",po::value()->default_value(alevin::defaults::vbemNorm), "Variational Bayesian global normalization factor. Usually the total number of deduplicated UMIs.") ( "trimRight",po::value()->default_value(alevin::defaults::trimRight), "The number of bases to trim off the 5' (right) end of the read seequence.") ( "naiveEqclass", po::bool_switch()->default_value(alevin::defaults::naiveEqclass), "Run naive per equivalence class deduplication, generating only total number of UMIs") ( "noWhitelist", po::bool_switch()->default_value(alevin::defaults::noWhitelist), "Stops the pipeline after UMI deduplication and quantification; do not perform intelligent whitelisting.") ( "noDedup", po::bool_switch()->default_value(alevin::defaults::noDedup), "Stops the pipeline after CB sequence correction and selective-alignment of reads."); return alevindevs; } po::options_description ProgramOptionsGenerator::getAlevinBasicOptions(SalmonOpts& sopt) { po::options_description alevinspec("\n" "alevin-specific Options"); alevinspec.add_options() ("version,v", "print version string") ("help,h", "produce help message") ("output,o", po::value()->required(), "Output quantification directory.") ("rad,justAlign,j", po::bool_switch()->default_value(alevin::defaults::just_align), "just selectively align the data and write the results to a RAD file. Do not perform " "the rest of the quantification procedure.") ("sketch,sketchMode", po::bool_switch()->default_value(alevin::defaults::sketch_mode), "perform sketching rather than selective alignment and write the results to a RAD file. " "Requires the `--rad` flag. Do not perform the rest of the quantification procedure." ) ("threads,p", po::value(&(sopt.numThreads))->default_value(sopt.numThreads), "The number of threads to use concurrently.") ( "tgMap", po::value(), "transcript to gene map tsv file") ( "hash", po::value(), "Secondary input point for Alevin " "using Big freaking Hash (bfh.txt) file. Works Only with --chromium") ( "dropseq", po::bool_switch()->default_value(alevin::defaults::isDropseq), "Use DropSeq Single Cell protocol for the library") ( "chromiumV3", po::bool_switch()->default_value(alevin::defaults::isChromiumV3), "Use 10x chromium v3 Single Cell protocol for the library.") ( "chromium", po::bool_switch()->default_value(alevin::defaults::isChromium), "Use 10x chromium v2 Single Cell protocol for the library.") ( "gemcode", po::bool_switch()->default_value(alevin::defaults::isGemcode), "Use 10x gemcode v1 Single Cell protocol for the library.") ( "citeseq", po::bool_switch()->default_value(alevin::defaults::isCITESeq), "Use CITESeq Single Cell protocol for the library, 16 CB, 12 UMI and features.") ( "celseq", po::bool_switch()->default_value(alevin::defaults::isCELSeq), "Use CEL-Seq Single Cell protocol for the library.") ( "celseq2", po::bool_switch()->default_value(alevin::defaults::isCELSeq2), "Use CEL-Seq2 Single Cell protocol for the library.") ( "splitseqV1", po::bool_switch()->default_value(alevin::defaults::isSplitSeqV1), "Use Split-SeqV1 Single Cell protocol for the library.") ( "splitseqV2", po::bool_switch()->default_value(alevin::defaults::isSplitSeqV2), "Use Split-SeqV2 Single Cell protocol for the library.") ( "quartzseq2", po::bool_switch()->default_value(alevin::defaults::isQuartzSeq2), "Use Quartz-Seq2 v3.2 Single Cell protocol for the library assumes 15 length barcode and 8 length UMI.") ( "sciseq3", po::bool_switch()->default_value(alevin::defaults::isSciSeq3), "Use sci-RNA-seq3 protocol for the library.") ( "whitelist", po::value(), "File containing white-list barcodes") ( "featureStart", po::value(), "This flag should be used with citeseq and specifies the starting index of the feature barcode on Read2.") ( "featureLength", po::value(), "This flag should be used with citeseq and specifies the length of the feature barcode.") ( "noQuant", po::bool_switch()->default_value(alevin::defaults::noQuant), "Don't run downstream barcode-salmon model.") ( "numCellBootstraps",po::value()->default_value(alevin::defaults::numBootstraps), "Generate mean and variance for cell x gene matrix quantification" " estimates.") ( "numCellGibbsSamples",po::value()->default_value(alevin::defaults::numGibbsSamples), "Generate mean and variance for cell x gene matrix quantification by running gibbs chain" " estimates.") ( "forceCells",po::value()->default_value(alevin::defaults::forceCells), "Explicitly specify the number of cells.") ( "expectCells",po::value()->default_value(alevin::defaults::expectCells), "define a close upper bound on expected number of cells") ( "mrna", po::value(), "path to a file containing mito-RNA gene, one per line") ( "rrna", po::value(), "path to a file containing ribosomal RNA, one per line") ( "keepCBFraction", po::value()->default_value(alevin::defaults::keepCBFraction), "fraction of CB to keep, value must be in range (0,1], use 1 to quantify all CB." ) ("read-geometry", po::value(), "format string describing the geometry of the read" ) ("bc-geometry", po::value(), "format string describing the geometry of the cell barcode" ) ("umi-geometry", po::value(), "format string describing the genometry of the umi" ) ( "end",po::value(), "Cell-Barcodes end (5 or 3) location in the read sequence from where barcode has to" " be extracted. (end, umiLength, barcodeLength)" " should all be provided if using this option") ( "umiLength",po::value(), "umi length Parameter for unknown protocol. (end, umiLength, barcodeLength)" " should all be provided if using this option") ( "barcodeLength",po::value(), "barcode length Parameter for unknown protocol. (end, umiLength, barcodeLength)" " should all be provided if using this option") ( "noem",po::bool_switch()->default_value(alevin::defaults::noEM), "do not run em") ( "freqThreshold", po::value()->default_value(alevin::defaults::freqThreshold), "threshold for the frequency of the barcodes") ( "umiEditDistance", po::value()->default_value(alevin::defaults::umiEditDistance), "Maximum allowble edit distance to collapse UMIs, Expect delay in running time if != 1") ( "dumpfq", po::bool_switch()->default_value(alevin::defaults::dumpFQ), "Dump barcode modified fastq file for downstream analysis by" " using coin toss for multi-mapping.") ( "dumpBfh", po::bool_switch()->default_value(alevin::defaults::dumpBFH), "dump the big hash with all the barcodes and the UMI sequence.") ( "dumpArborescences", po::bool_switch()->default_value(alevin::defaults::dumpArborescences), "dump the gene-v-cell matrix for the total number of fragments used in the UMI deduplicaiton.") ( "dumpUmiGraph", po::bool_switch()->default_value(alevin::defaults::dumpUmiGraph), "dump the per cell level Umi Graph.") ( "dumpCellEq", po::bool_switch()->default_value(alevin::defaults::dumpCellEq), "dump the per cell level deduplicated equivalence classes.") ( "dumpFeatures", po::bool_switch()->default_value(alevin::defaults::dumpFeatures), "Dump features for whitelist and downstream analysis.") ( "dumpMtx", po::bool_switch()->default_value(alevin::defaults::dumpMtx), "Dump cell v transcripts count matrix in sparse mtx format.") ( "lowRegionMinNumBarcodes", po::value()->default_value(alevin::defaults::lowRegionMinNumBarcodes), "Minimum Number of CB to use for learning Low confidence region (Default: 200).") ( "maxNumBarcodes", po::value()->default_value(alevin::defaults::maxNumBarcodes), "Maximum allowable limit to process the cell barcodes. (Default: 100000)"); return alevinspec; } po::options_description ProgramOptionsGenerator::getAlignmentSpecificOptions(SalmonOpts& sopt) { using std::string; using std::vector; sopt.useMassBanking = salmon::defaults::useMassBanking; sopt.noRichEqClasses = salmon::defaults::noRichEqClasses; auto onErrorModel = [&sopt](bool noErrorModel) -> void { sopt.useErrorModel = !noErrorModel; }; po::options_description alnspec("\n" "alignment-specific options"); alnspec.add_options() ("noErrorModel", po::bool_switch()->default_value(salmon::defaults::noErrorModel)->notifier(onErrorModel), "Turn off the alignment error model, which takes into " "account the the observed frequency of different types of mismatches / " "indels when computing the likelihood of a given alignment. Turning this off can " "speed up alignment-based salmon, but can harm quantification accuracy.") ("numErrorBins", po::value(&(sopt.numErrorBins))->default_value(salmon::defaults::numErrorBins), "The number of bins into which to divide " "each read when learning and applying the error model. For example, a " "value of 10 would mean that " "effectively, a separate error model is leared and applied to each 10th " "of the read, while a value of " "3 would mean that a separate error model is applied to the read " "beginning (first third), middle (second third) " "and end (final third).") ( "sampleOut,s", po::bool_switch(&(sopt.sampleOutput))->default_value(salmon::defaults::sampleOutput), "Write a \"postSample.bam\" file in the output directory " "that will sample the input alignments according to the estimated " "transcript abundances. If you're " "going to perform downstream analysis of the alignments with tools " "which don't, themselves, take " "fragment assignment ambiguity into account, you should use this " "output.") ( "sampleUnaligned,u", po::bool_switch(&(sopt.sampleUnaligned))->default_value(salmon::defaults::sampleUnaligned), "In addition to sampling the aligned reads, also write " "the un-aligned reads to \"postSample.bam\".") ("gencode", po::bool_switch(&(sopt.gencodeRef))->default_value(salmon::defaults::gencodeRef), "This flag will expect the input transcript fasta to be " "in GENCODE format, and will split the transcript name at the first " "\'|\' character. These reduced names will be used in " "the output and when looking for these transcripts in a gene to " "transcript GTF.") ("scoreExp", po::value(&sopt.scoreExp)->default_value(salmon::defaults::scoreExp), "The factor by which sub-optimal alignment scores are " "downweighted to produce a probability. If the best alignment score for the current read is S, and the score " "for a particular alignment is w, then the probability will be computed porportional to exp( - scoreExp * (S-w) ). " "NOTE: This flag only has an effect if you are parsing alignments produced by salmon itself (i.e. pufferfish or " "RapMap in selective-alignment mode)." ) ( "mappingCacheMemoryLimit", po::value(&(sopt.mappingCacheMemoryLimit)) ->default_value(salmon::defaults::mappingCacheMemoryLimit), "If the file contained fewer than this " "many mapped reads, then just keep the data in memory for subsequent " "rounds of inference. Obviously, this value should " "not be too large if you wish to keep a low memory usage, but setting it " "large enough to accommodate all of the mapped " "read can substantially speed up inference on \"small\" files that " "contain only a few million reads."); return alnspec; } po::options_description ProgramOptionsGenerator::getAdvancedOptions(int32_t& numBiasSamples, SalmonOpts& sopt) { using std::string; using std::vector; po::options_description advanced("\n" "advanced options"); advanced.add_options() ("alternativeInitMode", po::bool_switch(&(sopt.alternativeInitMode))->default_value(salmon::defaults::alternativeInitMode), "[Experimental]: Use an alternative strategy (rather than simple " "interpolation between) the " "online and uniform abundance estimates to initialize the EM / VBEM " "algorithm.") ("auxDir", po::value(&(sopt.auxDir))->default_value(salmon::defaults::auxDir), "The sub-directory of the quantification directory where auxiliary " "information " "e.g. bootstraps, bias parameters, etc. will be written.") ("skipQuant", po::bool_switch(&(sopt.skipQuant))->default_value(salmon::defaults::skipQuant), "Skip performing the actual transcript quantification (including any Gibbs sampling or bootstrapping)." ) ("dumpEq", po::bool_switch(&(sopt.dumpEq))->default_value(salmon::defaults::dumpEq), "Dump the simple equivalence class counts " "that were computed during mapping or alignment.") ("dumpEqWeights,d", po::bool_switch(&(sopt.dumpEqWeights))->default_value(salmon::defaults::dumpEqWeights), "Dump conditional probabilities associated with transcripts when " "equivalence class information is being dumped to file. Note, this will " "dump the factorization that is actually used by salmon's offline phase " "for inference. If you are using range-factorized equivalence classes (the default) " "then the same transcript set may appear multiple times with different associated " "conditional probabilities.") ("minAssignedFrags", po::value(&(sopt.minRequiredFrags))->default_value(salmon::defaults::minAssignedFrags), "The minimum number of fragments that must be assigned to the " "transcriptome for " "quantification to proceed.") ("reduceGCMemory", po::bool_switch(&(sopt.reduceGCMemory))->default_value(salmon::defaults::reduceGCMemory), "If this option is selected, a more memory efficient (but slightly " "slower) representation is " "used to compute fragment GC content. Enabling this will reduce " "memory usage, but can also reduce " "speed. However, the results themselves will remain the same.") ("biasSpeedSamp", po::value(&(sopt.pdfSampFactor))->default_value(salmon::defaults::biasSpeedSamp), "The value at which the fragment length PMF is down-sampled " "when evaluating sequence-specific & GC fragment bias. Larger " "values speed up effective " "length correction, but may decrease the fidelity of bias modeling " "results.") ("fldMax", po::value(&(sopt.fragLenDistMax))->default_value(salmon::defaults::maxFragLength), "The maximum fragment length to consider when building the empirical " "distribution") ("fldMean", po::value(&(sopt.fragLenDistPriorMean))->default_value(salmon::defaults::fragLenPriorMean), "The mean used in the fragment length distribution prior") ("fldSD", po::value(&(sopt.fragLenDistPriorSD))->default_value(salmon::defaults::fragLenPriorSD), "The standard deviation used in the fragment length distribution " "prior") ("forgettingFactor,f", po::value(&(sopt.forgettingFactor))->default_value(salmon::defaults::ffactor), "The forgetting factor used " "in the online learning schedule. A smaller value results in " "quicker learning, but higher variance " "and may be unstable. A larger value results in slower learning but " "may be more stable. Value should " "be in the interval (0.5, 1.0].") ("initUniform", po::bool_switch(&(sopt.initUniform))->default_value(salmon::defaults::initUniform), "initialize the offline inference with uniform parameters, rather " "than seeding with online parameters.") ("maxOccsPerHit", po::value(&(sopt.maxOccsPerHit))->default_value(salmon::defaults::maxOccsPerHit), "When collecting \"hits\" (MEMs), hits having more than maxOccsPerHit occurrences won't be considered.") ("maxReadOcc,w", po::value(&(sopt.maxReadOccs))->default_value(salmon::defaults::maxReadOccs), "Reads \"mapping\" to more than this many places won't be " "considered.") ("maxRecoverReadOcc", po::value(&(sopt.maxRecoverReadOccs))->default_value(salmon::defaults::maxRecoverReadOccs), "Relevant for alevin with \'--sketch\' mode only: if a read has valid seed matches, but no read has matches " "leading to fewer than \"maxReadOcc\" mappings, then try to recover mappings for this read as long as there are " "fewer than \"maxRecoverReadOcc\" mappings." ) ("noLengthCorrection", po::bool_switch(&(sopt.noLengthCorrection))->default_value(salmon::defaults::noLengthCorrection), "[experimental] : Entirely disables length correction when " "estimating " "the abundance of transcripts. This option can be used with " "protocols " "where one expects that fragments derive from their underlying " "targets " "without regard to that target's length (e.g. QuantSeq)") ( "noEffectiveLengthCorrection", po::bool_switch(&(sopt.noEffectiveLengthCorrection)) ->default_value(salmon::defaults::noEffectiveLengthCorrection), "Disables " "effective length correction when computing the " "probability that a fragment was generated " "from a transcript. If this flag is passed in, the " "fragment length distribution is not taken " "into account when computing this probability.") ("noSingleFragProb", po::bool_switch(&(sopt.noSingleFragProb))->default_value(salmon::defaults::noSingleFragProb), "Disables the estimation of an associated fragment length probability for single-end reads or for " "orphaned mappings in paired-end libraries. The default behavior is to consider the probability of " "all possible fragment lengths associated with the retained mapping. Enabling this flag (i.e. turning this " "default behavior off) will simply not attempt to estimate a fragment length probability in such cases.") ("noFragLengthDist", po::bool_switch(&(sopt.noFragLengthDist))->default_value(salmon::defaults::noFragLengthDist), "[experimental] : " "Don't consider concordance with the learned fragment length " "distribution when trying to determine " "the probability that a fragment has originated from a specified " "location. Normally, Fragments with " "unlikely lengths will be assigned a smaller relative probability " "than those with more likely " "lengths. When this flag is passed in, the observed fragment length " "has no effect on that fragment's " "a priori probability.") ("noBiasLengthThreshold", po::bool_switch(&(sopt.noBiasLengthThreshold))->default_value(salmon::defaults::noBiasLengthThreshold), "[experimental] : " "If this option is enabled, then no (lower) threshold will be set on " "how short bias correction can make effective lengths. This can " "increase the precision " "of bias correction, but harm robustness. The default correction " "applies a threshold.") ("numBiasSamples", po::value(&numBiasSamples)->default_value(salmon::defaults::numBiasSamples), "Number of fragment mappings to use when learning the " "sequence-specific bias model.") ("numAuxModelSamples", po::value(&(sopt.numBurninFrags))->default_value(salmon::defaults::numBurninFrags), "The first are used to train the " "auxiliary model parameters (e.g. fragment length distribution, " "bias, etc.). After ther first observations " "the auxiliary model parameters will be assumed to have converged " "and will be fixed.") ("numPreAuxModelSamples", po::value(&(sopt.numPreBurninFrags)) ->default_value(salmon::defaults::numPreBurninFrags), "The first will have their " "assignment likelihoods and contributions to the transcript " "abundances computed without applying any auxiliary models. The " "purpose " "of ignoring the auxiliary models for the first " " observations is to avoid applying these " "models before their " "parameters have been learned sufficiently well.") ("useEM", po::bool_switch(&(sopt.useEM))->default_value(salmon::defaults::useEM), "Use the traditional EM algorithm for optimization in the batch passes.") ("useVBOpt", po::bool_switch(&(sopt.useVBOpt))->default_value(salmon::defaults::useVBOpt), "Use the Variational Bayesian EM [default]") ("rangeFactorizationBins", po::value(&(sopt.rangeFactorizationBins))->default_value(salmon::defaults::rangeFactorizationBins), "Factorizes the likelihood used in quantification by adopting a new " "notion of equivalence classes based on " "the conditional probabilities with which fragments are generated " "from different transcripts. This is a more " "fine-grained factorization than the normal rich equivalence " "classes. The default value (4) corresponds to " "the default used in Zakeri et al. 2017 (doi: 10.1093/bioinformatics/btx262), " "and larger values imply a more fine-grained factorization. If range factorization " "is enabled, a common value to select for this parameter is 4. A value of " "0 signifies the use of basic rich equivalence classes.") ("numGibbsSamples", po::value(&(sopt.numGibbsSamples))->default_value(salmon::defaults::numGibbsSamples), "Number of Gibbs sampling rounds to " "perform.") ("noGammaDraw", po::bool_switch(&(sopt.noGammaDraw))->default_value(salmon::defaults::noGammaDraw), "This switch will disable drawing transcript fractions from a Gamma distribution during Gibbs sampling. In this case " "the sampler does not account for shot-noise, but only assignment ambiguity") ("numBootstraps", po::value(&(sopt.numBootstraps))->default_value(salmon::defaults::numBootstraps), "Number of bootstrap samples to generate. Note: " "This is mutually exclusive with Gibbs sampling.") ("bootstrapReproject", po::bool_switch(&(sopt.bootstrapReproject))->default_value(salmon::defaults::noGammaDraw), "This switch will learn the parameter distribution from the bootstrapped counts for each sample, but " "will reproject those parameters onto the original equivalence class counts.") ("thinningFactor", po::value(&(sopt.thinningFactor))->default_value(salmon::defaults::thinningFactor), "Number of steps to discard for every sample kept from the Gibbs " "chain. " "The larger this number, the less chance that subsequent samples are " "auto-correlated, but the slower sampling becomes.") ("quiet,q", po::bool_switch(&(sopt.quiet))->default_value(salmon::defaults::quiet), "Be quiet while doing quantification (don't write informative " "output to the console unless something goes wrong).") ("perTranscriptPrior", po::bool_switch(&(sopt.perTranscriptPrior))->default_value(salmon::defaults::perTranscriptPrior), "The " "prior (either the default or the argument provided via --vbPrior) " "will " "be interpreted as a transcript-level prior (i.e. each transcript " "will " "be given a prior read count of this value)") ("perNucleotidePrior", po::bool_switch(&(sopt.perNucleotidePrior))->default_value(salmon::defaults::perNucleotidePrior), "The " "prior (either the default or the argument provided via --vbPrior) " "will " "be interpreted as a nucleotide-level prior (i.e. each nucleotide " "will " "be given a prior read count of this value)") ("sigDigits", po::value(&(sopt.sigDigits))->default_value(salmon::defaults::sigDigits), "The number of significant digits to write when outputting the EffectiveLength and NumReads columns") ("vbPrior", po::value(&(sopt.vbPrior))->default_value(salmon::defaults::vbPrior), "The prior that will be used in the VBEM algorithm. This is " "interpreted " "as a per-transcript prior, unless the --perNucleotidePrior flag " "is also given. If the --perNucleotidePrior flag is given, this is used as a nucleotide-level " "prior. If the default is used, it will be divided by 1000 before being used as a nucleotide-level " "prior, i.e. the default per-nucleotide prior will be 1e-5.") ("writeOrphanLinks", po::bool_switch(&(sopt.writeOrphanLinks))->default_value(salmon::defaults::writeOrphanLinks), "Write the transcripts that are linked by orphaned reads.") ("writeUnmappedNames", po::bool_switch(&(sopt.writeUnmappedNames))->default_value(salmon::defaults::writeUnmappedNames), "Write the names of un-mapped reads to the file unmapped_names.txt " "in the auxiliary directory."); return advanced; } po::options_description ProgramOptionsGenerator::getHiddenOptions(SalmonOpts& sopt) { using std::string; using std::vector; po::options_description hidden("\nhidden options"); hidden.add_options() ("numGCBins", po::value(&(sopt.numFragGCBins))->default_value(salmon::defaults::numFragGCBins), "Number of bins to use when modeling fragment GC bias") ("conditionalGCBins", po::value(&(sopt.numConditionalGCBins))->default_value(salmon::defaults::numConditionalGCBins), "Number of different fragment GC models to learn based on read start/end " "context") ("numRequiredObs,n", po::value(&(sopt.numRequiredFragments))->default_value(salmon::defaults::numRequiredFrags), "[Deprecated]: The minimum number of observations (mapped reads) " "that must be observed before " "the inference procedure will terminate.") ("maxHashResizeThreads", po::value(&(sopt.maxHashResizeThreads))->default_value(salmon::defaults::maxHashResizeThreads), "Maximum number of threads to allow cuckoo hash map to use when / if it resizes"); return hidden; } po::options_description ProgramOptionsGenerator::getTestingOptions(SalmonOpts& sopt) { using std::string; using std::vector; po::options_description testing("\n" "testing options"); testing.add_options() ("noRichEqClasses", po::bool_switch(&(sopt.noRichEqClasses))->default_value(salmon::defaults::noRichEqClasses), "[TESTING OPTION]: Disable \"rich\" equivalent classes. If this flag is " "passed, then " "all information about the relative weights for each transcript in the " "label of an equivalence class will be ignored, and only the relative " "abundance and effective length of each transcript will be considered.") ("noFragLenFactor", po::bool_switch(&(sopt.noFragLenFactor))->default_value(salmon::defaults::noFragLengthFactor), "[TESTING OPTION]: Disable the factor in the likelihood that takes into " "account the " "goodness-of-fit of an alignment with the empirical fragment length " "distribution") ("disableAlignmentCache", po::bool_switch(&(sopt.disableAlignmentCache))->default_value(salmon::defaults::disableAlignmentCache), "[TESTING OPTION]: Turn of the alignment cache. This will hurt performance but " "can help debug any issues that might result from caching") ("rankEqClasses", po::bool_switch(&(sopt.rankEqClasses))->default_value(salmon::defaults::rankEqClasses), "[TESTING OPTION]: Keep separate equivalence classes for each distinct " "ordering of transcripts in the label.") ("noExtrapolateCounts", po::bool_switch(&(sopt.dontExtrapolateCounts))->default_value(salmon::defaults::dontExtrapolateCounts), "[TESTING OPTION]: When generating posterior counts for Gibbs sampling, " "use the directly re-allocated counts in each iteration, rather than " "extrapolating " "from transcript fractions."); return testing; } po::options_description ProgramOptionsGenerator::getDeprecatedOptions(SalmonOpts& /*sopt*/) { using std::string; using std::vector; po::options_description deprecated( "\ndeprecated options about which to inform the user"); return deprecated; } }