/* * ContextBase.h * * Created on: Feb 11, 2013 * Author: nek3d */ #ifndef CONTEXTBASE_H_ #define CONTEXTBASE_H_ // The Context class handles the settings for an operation, // such as merge, intersect, jaccard, etc. // // Settings include the input and output parameters, such as input // files, file types (if explicitly provided), genome files, // run options, output format, etc. #include #include "version.h" #include "BedtoolsTypes.h" #include "FileRecordTypeChecker.h" #include "FileRecordMergeMgr.h" #include "NewGenomeFile.h" #include "api/BamReader.h" #include "api/BamAux.h" #include "KeyListOps.h" class ContextBase { public: ContextBase(); virtual ~ContextBase(); typedef FileRecordTypeChecker::FILE_TYPE ContextFileType; typedef FileRecordTypeChecker::RECORD_TYPE ContextRecordType; typedef enum {UNSPECIFIED_PROGRAM, INTERSECT, WINDOW, CLOSEST, COVERAGE, MAP, GENOMECOV, MERGE, CLUSTER, COMPLEMENT, SUBTRACT, SLOP, FLANK, SORT, RANDOM, SAMPLE, SHUFFLE, ANNOTATE, MULTIINTER, UNIONBEDG, PAIRTOBED, PAIRTOPAIR,BAMTOBED, BEDTOBAM, BEDTOFASTQ, BEDPETOBAM, BED12TOBED6, GETFASTA, MASKFASTA, NUC, MULTICOV, TAG, JACCARD, OVERLAP, IGV, LINKS,MAKEWINDOWS, GROUPBY, EXPAND, SPACING, FISHER, GROUP_BY} PROGRAM_TYPE; PROGRAM_TYPE getProgram() const { return _program; } FileRecordMgr *getFile(int fileIdx) { return _files[fileIdx]; } void setProgram(PROGRAM_TYPE program) { _program = program; } void addInputFile(const string &inputFile) { _fileNames.push_back(inputFile); } int getNumInputFiles() const { return _fileNames.size(); } const string &getInputFileName(int fileNum) const { return _fileNames[fileNum]; } ContextFileType getInputFileType(int fileNum) const { return _files[fileNum]->getFileType(); } ContextRecordType getInputRecordType(int fileNum) const { return _files[fileNum]->getRecordType(); } virtual bool determineOutputType(); const string &getHeader(int fileIdx) { return _files[fileIdx]->getHeader(); } const BamTools::RefVector &getBamReferences(int fileIdx) { return _files[fileIdx]->getBamReferences(); } int getBamHeaderAndRefIdx(); //return idx of 1st query that is BAM. If none, first DB that is BAM. bool getUseMergedIntervals() const { return _useMergedIntervals; } void setUseMergedIntervals(bool val) { _useMergedIntervals = val; } FileRecordMergeMgr::WANTED_STRAND_TYPE getDesiredStrand() const { return _desiredStrand; } void openGenomeFile(const string &genomeFilename); void openGenomeFile(const BamTools::RefVector &refVector); bool hasGenomeFile() const { return _genomeFile != NULL; } NewGenomeFile *getGenomeFile() const { return _genomeFile; } void setOutputFileType(ContextFileType fileType) { _outputFileType = fileType; } ContextFileType getOutputFileType() const { return _outputFileType; } virtual bool testCmdArgs(int argc, char **argv); virtual bool errorEncountered(); //isValidState checks that parameters to context are in an acceptable state. // If not, the error msg string will be set with a reason why it failed. virtual bool isValidState(); bool getShowHelp() const { return _showHelp; } void setShowHelp(bool val) { _showHelp = val; } const string &getErrorMsg() const { return _errorMsg; } void setErrorMessage(const string &errorMsg) { _errorMsg = errorMsg; } //split handling. bool getObeySplits() const {return _obeySplits; } void setObeySplits(bool val) { _obeySplits = val; } //Decide whether output format is BED or BAM. //Default is BAM if any input files are BAM. bool getExplicitBedOutput() const { return _explicitBedOutput; } void setExplicitBedOutput(bool val) { _explicitBedOutput = val; } bool getUncompressedBam() const { return _uncompressedBam; } void setUncompressedBam(bool val) { _uncompressedBam = val; } bool getUseBufferedOutput() const { return _useBufferedOutput; } void setUseBufferedOutput(bool val) { _useBufferedOutput = val; } virtual bool getSortedInput() const {return _sortedInput; } virtual void setSortedInput(bool val) { _sortedInput = val; } virtual bool getSortOutput() const {return _sortOutput; } virtual void setSortOutput(bool val) { _sortOutput = val; } virtual bool getUseDBnameTags() const { return _reportDBnameTags; } virtual void setUseDBnameTags(bool val) { _reportDBnameTags = val; } virtual bool getUseDBfileNames() const { return _reportDBfileNames; } virtual void setUseDBfileNames(bool val) { _reportDBfileNames = val; } virtual bool getPrintHeader() const {return _printHeader; } virtual void setPrintHeader(bool val) { _printHeader = val; } virtual bool getPrintable() const { return _printable; } virtual void setPrintable(bool val) { _printable = val; } virtual bool getUseFullBamTags() const { return _useFullBamTags; } virtual void setUseFullBamTags(bool val) { _useFullBamTags = val; } virtual bool getNameCheckDisabled() const { return _nameCheckDisabled; } virtual void setNameCheckDisabled(bool val) { _nameCheckDisabled = val; } // METHODS FOR PROGRAMS WITH USER_SPECIFIED NUMBER // OF OUTPUT RECORDS. virtual int getNumOutputRecords() const { return _numOutputRecords; } virtual void setNumOutputRecords(int val) { _numOutputRecords = val; } //SEED OPS FOR APPS WITH RANDOMNESS //When a seed has been specified on the command line, srand //will have been already been called during command line parsing. //For reference, srand is the C function that initializes the //psuedo-random number generator. If a seed has not been //specifified, the user may call the virtual bool hasConstantSeed() const { return _hasConstantSeed; } virtual int getConstantSeed() const { return _seed; } virtual int getUnspecifiedSeed(); //method used by other classes to distinguish ContextBase from //ContextIntersect and other derived classes that use intersection //methods. virtual bool hasIntersectMethods() const { return false; } // determine whether column operations like those used in map // are available. void setColumnOpsMethods(bool val); virtual bool hasColumnOpsMethods() const { return _hasColumnOpsMethods; } const string &getColumnOpsVal(RecordKeyVector &keyList) const; //methods applicable only to column operations. int getReportPrecision() const { return _reportPrecision; } void testNameConventions(const Record *); BlockMgr *getSplitBlockInfo() { return _splitBlockInfo; } string getErrorMessages() const { return _errorMsg; } protected: PROGRAM_TYPE _program; vector _fileNames; vector _files; bool _allFilesOpened; map _programNames; string _origProgramName; NewGenomeFile *_genomeFile; ContextFileType _outputFileType; bool _outputTypeDetermined; map _headers; map _references; string _errorMsg; vector _argsProcessed; //used for processing cmdLine args. int _skipFirstArgs; bool _showHelp; bool _obeySplits; bool _uncompressedBam; bool _useBufferedOutput; int _ioBufSize; bool _anyHit; bool _noHit; bool _writeA; bool _writeB; bool _leftJoin; bool _writeCount; bool _writeOverlap; bool _writeAllOverlap; bool _haveFractionA; bool _haveFractionB; float _overlapFractionA; float _overlapFractionB; bool _reciprocalFraction; bool _eitherFraction; bool _sameStrand; bool _diffStrand; bool _sortedInput; bool _sortOutput; bool _reportDBnameTags; bool _reportDBfileNames; bool _printHeader; bool _printable; bool _explicitBedOutput; bool _runToQueryEnd; int _queryFileIdx; vector _dbFileIdxs; vector _dbNameTags; map _fileIdsToDbIdxs; int _bamHeaderAndRefIdx; int _maxNumDatabaseFields; bool _useFullBamTags; int _numOutputRecords; bool _hasConstantSeed; int _seed; bool _forwardOnly; bool _reverseOnly; bool _nameCheckDisabled; //Members for column operations bool _hasColumnOpsMethods; KeyListOps *_keyListOps; string _nullStr; //placeholder return value when col ops aren't valid. //Members for merged records FileRecordMergeMgr::WANTED_STRAND_TYPE _desiredStrand; int _maxDistance; bool _useMergedIntervals; int _reportPrecision; //used in fields reported from numeric ops from map and merge. void markUsed(int i) { _argsProcessed[i] = true; } bool isUsed(int i) const { return _argsProcessed[i]; } virtual bool parseCmdArgs(int argc, char **argv, int skipFirstArgs); bool cmdArgsValid(); virtual bool openFiles(); virtual FileRecordMgr *getNewFRM(const string &filename, int fileIdx); //set cmd line params and counter, i, as members so code //is more readable (as opposed to passing all 3 everywhere). int _argc; char **_argv; int _i; //track whether each file has the letters chr in their chrom names. //this is needed for enforcing consistent naming conventions across //multiple files, and auto-detecting errors when they're not followed. typedef enum { YES, NO, UNTESTED } testType; typedef map conventionType; conventionType _fileHasChrInChromNames; // as above, but check whether first digit to appear in // a chrom name is a zero. conventionType _fileHasLeadingZeroInChromNames; static const int MIN_ALLOWED_BUF_SIZE = 8; BlockMgr *_splitBlockInfo; testType _allFilesHaveChrInChromNames; testType _allFileHaveLeadingZeroInChromNames; bool _noEnforceCoordSort; bool _inheader; virtual bool handle_bed(); virtual bool handle_fbam(); virtual bool handle_g(); virtual bool handle_h(); virtual bool handle_header(); virtual bool handle_i(); virtual bool handle_n(); virtual bool handle_nobuf(); virtual bool handle_iobuf(); virtual bool handle_seed(); virtual bool handle_split(); virtual bool handle_sorted(); virtual bool handle_ubam(); virtual bool handle_c(); virtual bool handle_o(); virtual bool handle_null(); virtual bool handle_delim(); virtual bool handle_sortout(); virtual bool handle_nonamecheck(); bool handle_prec(); bool parseIoBufSize(string bufStr); testType fileHasChrInChromNames(int fileIdx); testType fileHasLeadingZeroInChromNames(int fileIdx); void setNoEnforceCoordSort(bool val) { _noEnforceCoordSort = val; } //Warning messages. bool _nameConventionWarningTripped; string _nameConventionWarningMsg; void nameConventionWarning(const Record *record, const string &filename, const string &message); //give warning but continue. void warn(const Record *, const string str1, const string str2 = "", const string str3 = ""); // Give error and exit. void die(const Record *, const string str1, const string str2 = "", const string str3 = ""); //private error handler void setErrorMsg(string &msg, bool onlyWarn, const Record * record, string str1, const string str2, const string str3); bool strandedToolSupported(); }; #endif /* CONTEXTBASE_H_ */