/* * ContextBase.cpp * * Created on: Feb 12, 2013 * Author: nek3d */ #include "ContextBase.h" #include #include #include ContextBase::ContextBase() : _program(UNSPECIFIED_PROGRAM), _allFilesOpened(false), _genomeFile(NULL), _outputFileType(FileRecordTypeChecker::UNKNOWN_FILE_TYPE), _outputTypeDetermined(false), _skipFirstArgs(0), _showHelp(false), _obeySplits(false), _uncompressedBam(false), _useBufferedOutput(true), _ioBufSize(0), _anyHit(false), _noHit(false), _writeA(false), _writeB(false), _leftJoin(false), _writeCount(false), _writeOverlap(false), _writeAllOverlap(false), _haveFractionA(false), _haveFractionB(false), _overlapFractionA(0.0), _overlapFractionB(0.0), _reciprocalFraction(false), _eitherFraction(false), _sameStrand(false), _diffStrand(false), _sortedInput(false), _sortOutput(false), _reportDBnameTags(false), _reportDBfileNames(false), _printHeader(false), _printable(true), _explicitBedOutput(false), _queryFileIdx(-1), _bamHeaderAndRefIdx(-1), _maxNumDatabaseFields(0), _useFullBamTags(false), _numOutputRecords(0), _hasConstantSeed(false), _seed(0), _forwardOnly(false), _reverseOnly(false), _nameCheckDisabled(false), _hasColumnOpsMethods(false), _keyListOps(NULL), _desiredStrand(FileRecordMergeMgr::ANY_STRAND), _maxDistance(0), _useMergedIntervals(false), _reportPrecision(-1), _splitBlockInfo(NULL), _allFilesHaveChrInChromNames(UNTESTED), _allFileHaveLeadingZeroInChromNames(UNTESTED), _noEnforceCoordSort(false), _inheader(false), _nameConventionWarningTripped(false) { _programNames["intersect"] = INTERSECT; _programNames["sample"] = SAMPLE; _programNames["map"] = MAP; _programNames["merge"] = MERGE; _programNames["closest"] = CLOSEST; _programNames["subtract"] = SUBTRACT; _programNames["jaccard"] = JACCARD; _programNames["spacing"] = SPACING; _programNames["fisher"] = FISHER; _programNames["sample"] = SAMPLE; _programNames["coverage"] = COVERAGE; _programNames["complement"] = COMPLEMENT; _programNames["groupby"] = GROUP_BY; if (hasColumnOpsMethods()) { _keyListOps = new KeyListOps(); } } ContextBase::~ContextBase() { delete _genomeFile; _genomeFile = NULL; delete _splitBlockInfo; _splitBlockInfo = NULL; if (_nameConventionWarningTripped) { cerr << _nameConventionWarningMsg << endl; } //close all files and delete FRM objects. for (int i=0; i < (int)_files.size(); i++) { _files[i]->close(); delete _files[i]; _files[i] = NULL; } if (hasColumnOpsMethods()) { delete _keyListOps; _keyListOps = NULL; } } bool ContextBase::errorEncountered() { // just a subcommand was given with no options. if (_argc == 1) { return true; } return !_errorMsg.empty() || getShowHelp(); } bool ContextBase::determineOutputType() { if (_outputTypeDetermined) { return true; } //test whether output should be BED or BAM. //If the user explicitly requested BED, then it's BED. if (getExplicitBedOutput()) { setOutputFileType(FileRecordTypeChecker::SINGLE_LINE_DELIM_TEXT_FILE_TYPE); _outputTypeDetermined = true; return true; } //Otherwise, if the input is BAM, then the output is BAM int fileIdx = hasIntersectMethods() ? _queryFileIdx : 0; if (_files[fileIdx]->getFileType() == FileRecordTypeChecker::BAM_FILE_TYPE) { setOutputFileType(FileRecordTypeChecker::BAM_FILE_TYPE); return true; } //Okay, it's bed. setOutputFileType(FileRecordTypeChecker::SINGLE_LINE_DELIM_TEXT_FILE_TYPE); _outputTypeDetermined = true; return true; } void ContextBase::openGenomeFile(const string &genomeFilename) { _genomeFile = new NewGenomeFile(genomeFilename.c_str()); } void ContextBase::openGenomeFile(const BamTools::RefVector &refVector) { _genomeFile = new NewGenomeFile(refVector); } bool ContextBase::testCmdArgs(int argc, char **argv) { _argc = argc; _argv = argv; _skipFirstArgs = 1; _origProgramName = argv[0]; setProgram(_programNames[_origProgramName]); _argsProcessed.resize(_argc - _skipFirstArgs, false); if (!parseCmdArgs(argc, argv, 1) || getShowHelp() || !isValidState()) { return false; } return true; } bool ContextBase::parseCmdArgs(int argc, char **argv, int skipFirstArgs) { for (_i=_skipFirstArgs; _i < argc; _i++) { if (isUsed(_i - _skipFirstArgs)) { continue; } if (strcmp(_argv[_i], "-i") == 0) { if (!handle_i()) return false; } else if (strcmp(_argv[_i], "-g") == 0) { if (!handle_g()) return false; } else if ((strcmp(_argv[_i], "-h") == 0) || (strcmp(_argv[_i], "--help") == 0)) { if (!handle_h()) return false; } else if (strcmp(_argv[_i], "-split") == 0) { if (!handle_split()) return false; } else if (strcmp(_argv[_i], "-bed") == 0) { if (!handle_bed()) return false; } else if (strcmp(_argv[_i], "-ubam") == 0) { if (!handle_ubam()) return false; } else if (strcmp(_argv[_i], "-fbam") == 0) { if (!handle_fbam()) return false; } else if(strcmp(_argv[_i], "-sorted") == 0) { if (!handle_sorted()) return false; } else if (strcmp(_argv[_i], "-nobuf") == 0) { if (!handle_nobuf()) return false; } else if (strcmp(_argv[_i], "-iobuf") == 0) { if (!handle_iobuf()) return false; } else if (strcmp(_argv[_i], "-prec") == 0) { if (!handle_prec()) return false; } else if (strcmp(_argv[_i], "-header") == 0) { if (!handle_header()) return false; } else if (strcmp(_argv[_i], "-n") == 0) { if (!handle_n()) return false; } else if (strcmp(_argv[_i], "-seed") == 0) { if (!handle_seed()) return false; } else if ((strcmp(_argv[_i], "-o") == 0) || (strcmp(_argv[_i], "-ops") == 0)) { if (!handle_o()) return false; } else if ((strcmp(_argv[_i], "-c") == 0) || (strcmp(_argv[_i], "-opCols") == 0)) { if (!handle_c()) return false; } else if (strcmp(_argv[_i], "-null") == 0) { if (!handle_null()) return false; } else if (strcmp(_argv[_i], "-delim") == 0) { if (!handle_delim()) return false; } else if (strcmp(_argv[_i], "-sortout") == 0) { if (!handle_sortout()) return false; } else if (strcmp(_argv[_i], "-nonamecheck") == 0) { if (!handle_nonamecheck()) return false; } } return true; } bool ContextBase::isValidState() { if (!openFiles()) { return false; } if (!cmdArgsValid()) { return false; } if (!determineOutputType()) { return false; } if (_program != GROUP_BY && _files[0]->getRecordType() == FileRecordTypeChecker::NO_POS_PLUS_RECORD_TYPE) { _errorMsg = "ERROR: file "; _errorMsg.append(_files[0]->getFileName()); _errorMsg.append(" has non positional records, which are only valid for \n"); _errorMsg.append(" the groupBy tool. Perhaps you are using a header"); _errorMsg.append(" line(s) that starts with \n"); _errorMsg.append(" something other than \"#\", \"chrom\", or \"chr\" (any case)?"); return false; } if (getObeySplits()) { _splitBlockInfo = new BlockMgr(_overlapFractionA, _reciprocalFraction); } if (hasColumnOpsMethods()) { if (hasIntersectMethods()) { for (int i=0; i < (int)_dbFileIdxs.size(); i++) { FileRecordMgr *dbFile = getFile(_dbFileIdxs[i]); _keyListOps->setDBfileType(dbFile->getFileType()); if (!_keyListOps->isValidColumnOps(dbFile)) { return false; } } } else { FileRecordMgr *dbFile = getFile(0); _keyListOps->setDBfileType(dbFile->getFileType()); if (!_keyListOps->isValidColumnOps(dbFile)) { return false; } } //if user specified a precision, pass it to //keyList ops if (_reportPrecision != -1) { _keyListOps->setPrecision(_reportPrecision); } } return true; } bool ContextBase::cmdArgsValid() { bool retval = true; for (_i = _skipFirstArgs; _i < _argc; _i++) { if (!isUsed(_i - _skipFirstArgs)) { _errorMsg += "\n***** ERROR: Unrecognized parameter: "; _errorMsg += _argv[_i]; _errorMsg += " *****"; retval = false; } } return retval; } bool ContextBase::openFiles() { //Make a vector of FileRecordMgr objects by going through the vector //of filenames and opening each one. if (_allFilesOpened) { return true; } if (_fileNames.size() == 0) { //No input was specified. Error and exit. _errorMsg += "\n***** ERROR: No input file given. Exiting. *****"; return false; } _files.resize(_fileNames.size()); for (int i = 0; i < (int)_fileNames.size(); i++) { FileRecordMgr *frm = getNewFRM(_fileNames[i], i); if (hasGenomeFile()) { frm->setGenomeFile(_genomeFile); } //If we're going to do column operations, and an input file // is BAM, we'll need the full flags. if (hasColumnOpsMethods()) { setUseFullBamTags(true); } frm->setFullBamFlags(_useFullBamTags); frm->setIsSorted(_sortedInput); frm->setIoBufSize(_ioBufSize); frm->setIsGroupBy(_program == GROUP_BY); if (!frm->open(_inheader)) { return false; } if (_noEnforceCoordSort) { frm->setNoEnforceCoordSort(true); } _files[i] = frm; } _allFilesOpened = true; return true; } int ContextBase::getBamHeaderAndRefIdx() { if (_bamHeaderAndRefIdx != -1) { //already found which BAM file to use for the header return _bamHeaderAndRefIdx; } if (hasIntersectMethods()) { if (_files[_queryFileIdx]->getFileType() == FileRecordTypeChecker::BAM_FILE_TYPE) { _bamHeaderAndRefIdx = _queryFileIdx; } else { _bamHeaderAndRefIdx = _dbFileIdxs[0]; } return _bamHeaderAndRefIdx; } if (_files[0]->getFileType() == FileRecordTypeChecker::BAM_FILE_TYPE) { _bamHeaderAndRefIdx = 0; return _bamHeaderAndRefIdx; } return _bamHeaderAndRefIdx; } int ContextBase::getUnspecifiedSeed() { // thanks to Rob Long for the tip. _seed = (unsigned)time(0)+(unsigned)getpid(); srand(_seed); return _seed; } bool ContextBase::handle_bed() { setExplicitBedOutput(true); markUsed(_i - _skipFirstArgs); return true; } bool ContextBase::handle_fbam() { setUseFullBamTags(true); markUsed(_i - _skipFirstArgs); return true; } bool ContextBase::handle_g() { if (_argc <= _i+1) { _errorMsg = "\n***** ERROR: -g option given, but no genome file specified. *****"; return false; } openGenomeFile(_argv[_i+1]); markUsed(_i - _skipFirstArgs); _i++; markUsed(_i - _skipFirstArgs); return true; } bool ContextBase::handle_h() { setShowHelp(true); markUsed(_i - _skipFirstArgs); return true; } bool ContextBase::handle_header() { setPrintHeader(true); markUsed(_i - _skipFirstArgs); return true; } bool ContextBase::handle_i() { if (_argc <= _i+1) { _errorMsg = "\n***** ERROR: -i option given, but no input file specified. *****"; return false; } addInputFile(_argv[_i+1]); markUsed(_i - _skipFirstArgs); _i++; markUsed(_i - _skipFirstArgs); return true; } bool ContextBase::handle_n() { if (_argc <= _i+1) { _errorMsg = "\n***** ERROR: -n option given, but no number of output records specified. *****"; return false; } setNumOutputRecords(atoi(_argv[_i + 1])); markUsed(_i - _skipFirstArgs); _i++; markUsed(_i - _skipFirstArgs); return true; } bool ContextBase::handle_nobuf() { setUseBufferedOutput(false); markUsed(_i - _skipFirstArgs); return true; } bool ContextBase::handle_iobuf() { if (_argc <= _i+1) { _errorMsg = "\n***** ERROR: -iobuf option given, but size of input buffer not specified. *****"; return false; } if (!parseIoBufSize(_argv[_i + 1])) return false; markUsed(_i - _skipFirstArgs); _i++; markUsed(_i - _skipFirstArgs); return true; } bool ContextBase::handle_seed() { if (_argc <= _i+1) { _errorMsg = "\n***** ERROR: -seed option given, but no seed specified. *****"; return false; } _hasConstantSeed = true; _seed = atoi(_argv[_i+1]); srand(_seed); markUsed(_i - _skipFirstArgs); _i++; markUsed(_i - _skipFirstArgs); return true; } bool ContextBase::handle_split() { setObeySplits(true); markUsed(_i - _skipFirstArgs); return true; } bool ContextBase::handle_sorted() { setSortedInput(true); markUsed(_i - _skipFirstArgs); return true; } bool ContextBase::handle_ubam() { setUncompressedBam(true); markUsed(_i - _skipFirstArgs); return true; } // Methods specific to column operations. // for col ops, -c is the string of columns upon which to operate bool ContextBase::handle_c() { if (!hasColumnOpsMethods()) { return false; } if ((_i+1) < _argc) { _keyListOps->setColumns(_argv[_i + 1]); markUsed(_i - _skipFirstArgs); _i++; markUsed(_i - _skipFirstArgs); return true; } return false; } // for col ops, -o is the string of operations to apply to the columns (-c) bool ContextBase::handle_o() { if (!hasColumnOpsMethods()) { return false; } if ((_i+1) < _argc) { _keyListOps->setOperations(_argv[_i + 1]); markUsed(_i - _skipFirstArgs); _i++; markUsed(_i - _skipFirstArgs); return true; } return false; } bool ContextBase::handle_prec() { if (!hasColumnOpsMethods()) { return false; } if ((_i+1) < _argc) { int prec = atoi(_argv[_i + 1]); if (prec < 1) { _errorMsg += "\n***** ERROR: -prec must be followed by a positive integer. Exiting. *****"; return false; } _reportPrecision = prec; markUsed(_i - _skipFirstArgs); _i++; markUsed(_i - _skipFirstArgs); return true; } _errorMsg += "\n***** ERROR: -prec must be followed by a positive integer. Exiting. *****"; return false; } // for col ops, -null is a NULL value assigned // when no overlaps are detected. bool ContextBase::handle_null() { if (!hasColumnOpsMethods()) { return false; } if ((_i+1) < _argc) { _keyListOps->setNullValue(_argv[_i + 1]); markUsed(_i - _skipFirstArgs); _i++; markUsed(_i - _skipFirstArgs); return true; } return false; } //for col ops, delimStr will appear between each item in //a collapsed but delimited list. bool ContextBase::handle_delim() { if (!hasColumnOpsMethods()) { _errorMsg = "\n***** ERROR: Can't set delimiter for tools without column operations. Exiting. *****"; return false; } if ((_i+1) < _argc) { _keyListOps->setDelimStr(_argv[_i + 1]); markUsed(_i - _skipFirstArgs); _i++; markUsed(_i - _skipFirstArgs); } return true; } bool ContextBase::handle_sortout() { setSortOutput(true); markUsed(_i - _skipFirstArgs); return true; } bool ContextBase::handle_nonamecheck() { setNameCheckDisabled(true); markUsed(_i - _skipFirstArgs); return true; } void ContextBase::setColumnOpsMethods(bool val) { if (val && !_hasColumnOpsMethods) { //was off, but we're turning it on. _keyListOps = new KeyListOps(); } _hasColumnOpsMethods = val; } const string &ContextBase::getColumnOpsVal(RecordKeyVector &keyList) const { if (!hasColumnOpsMethods()) { return _nullStr; } return _keyListOps->getOpVals(keyList); } FileRecordMgr *ContextBase::getNewFRM(const string &filename, int fileIdx) { if (_useMergedIntervals) { FileRecordMergeMgr *frm = new FileRecordMergeMgr(filename); frm->setStrandType(_desiredStrand); frm->setMaxDistance(_maxDistance); frm->setFileIdx(fileIdx); return frm; } else { FileRecordMgr *frm = new FileRecordMgr(filename); frm->setFileIdx(fileIdx); return frm; } } bool ContextBase::parseIoBufSize(string bufStr) { char lastChar = bufStr[bufStr.size()-1]; int multiplier = 1; if (!isdigit(lastChar)) { switch (lastChar) { case 'K': multiplier = 1 << 10; break; case 'M': multiplier = 1 << 20; break; case 'G': multiplier = 1 << 30; break; default: _errorMsg = "\n***** ERROR: Unrecognized memory buffer size suffix \'"; _errorMsg += lastChar; _errorMsg += "\' given. *****"; return false; break; } //lop off suffix character bufStr.resize(bufStr.size()-1); } if (!isNumeric(bufStr)) { _errorMsg = "\n***** ERROR: argument passed to -iobuf is not numeric. *****"; return false; } _ioBufSize = str2chrPos(bufStr) * multiplier; if (_ioBufSize < MIN_ALLOWED_BUF_SIZE) { _errorMsg = "\n***** ERROR: specified buffer size is too small. *****"; return false; } return true; } void ContextBase::testNameConventions(const Record *record) { //Do nothing if using the -nonamecheck option, //warning already given, or record is unmapped BAM record if (getNameCheckDisabled() || _nameConventionWarningTripped || record->isUnmapped()) return; int fileIdx = record->getFileIdx(); // // First test whether chr in chrom names match // bool hasChr = record->hasChrInChromName(); testType testChrVal = fileHasChrInChromNames(fileIdx); if (testChrVal == UNTESTED) { _fileHasChrInChromNames[fileIdx] = hasChr ? YES : NO; } if ((_allFilesHaveChrInChromNames == YES && !hasChr) || (_allFilesHaveChrInChromNames == NO && hasChr)) { nameConventionWarning(record, _fileNames[fileIdx], " has inconsistent naming convention for record:\n"); } if (_allFilesHaveChrInChromNames == UNTESTED) { _allFilesHaveChrInChromNames = hasChr ? YES : NO; } // // Now test whether leading zero in chrom names match // bool zeroVal = record->hasLeadingZeroInChromName(hasChr); testChrVal = fileHasLeadingZeroInChromNames(fileIdx); if (testChrVal == UNTESTED) { _fileHasLeadingZeroInChromNames[fileIdx] = zeroVal ? YES : NO; } if ((_allFileHaveLeadingZeroInChromNames == YES && !zeroVal) || (_allFileHaveLeadingZeroInChromNames == NO && zeroVal)) { nameConventionWarning(record, _fileNames[fileIdx], " has a record where naming convention (leading zero) is inconsistent with other files:\n"); } if (_allFileHaveLeadingZeroInChromNames == UNTESTED) { _allFileHaveLeadingZeroInChromNames = zeroVal ? YES : NO; } } ContextBase::testType ContextBase::fileHasChrInChromNames(int fileIdx) { conventionType::iterator iter = _fileHasChrInChromNames.find(fileIdx); if (iter == _fileHasChrInChromNames.end()) { return UNTESTED; } return iter->second; } ContextBase::testType ContextBase::fileHasLeadingZeroInChromNames(int fileIdx) { conventionType::iterator iter = _fileHasLeadingZeroInChromNames.find(fileIdx); if (iter == _fileHasLeadingZeroInChromNames.end()) { return UNTESTED; } return iter->second; } void ContextBase::warn(const Record *record, const string str1, const string str2, const string str3) { string msg; setErrorMsg(msg, true, record, str1, str2, str3); cerr << msg << endl; } void ContextBase::die(const Record *record, const string str1, const string str2, const string str3) { string msg; setErrorMsg(msg, false, record, str1, str2, str3); cerr << msg << endl; exit(1); } void ContextBase::setErrorMsg(string &msg, bool onlyWarn, const Record * record, string str1, const string str2, const string str3) { if (onlyWarn) { msg = "\n***** WARNING: "; } else { msg = "\n***** ERROR: "; } msg.append(str1); msg.append(str2); msg.append(str3); msg.append(" Exiting...\n"); if (record != NULL) { record->print(msg); } } void ContextBase::nameConventionWarning(const Record *record, const string &filename, const string &message) { _nameConventionWarningMsg = "***** WARNING: File "; _nameConventionWarningMsg.append(filename); _nameConventionWarningMsg.append(message); record->print(_nameConventionWarningMsg); _nameConventionWarningMsg.append("\n"); _nameConventionWarningTripped = true; cerr << _nameConventionWarningMsg << endl; } bool ContextBase::strandedToolSupported() { //Test that all files have strands. Should be called if any tool // invoked with sameStrand / diffStrand option. for (int i=0; i < getNumInputFiles(); i++) { // make sure file has strand. if (!getFile(i)->recordsHaveStrand()) { _errorMsg = "\n***** ERROR: stranded "; _errorMsg += _origProgramName; _errorMsg += " requested, but input file "; _errorMsg += getInputFileName(i); _errorMsg += " does not have strands. *****"; return false; } //make sure file is not VCF. if (getFile(i)->getFileType() == FileRecordTypeChecker::VCF_FILE_TYPE) { _errorMsg = "\n***** ERROR: stranded "; _errorMsg += _origProgramName; _errorMsg += " not supported for VCF file "; _errorMsg += getInputFileName(i); _errorMsg += ". *****"; return false; } } return true; }