/** ** Copyright (c) 2010 Illumina, Inc. ** ** This software is covered by the "Illumina Genome Analyzer Software ** License Agreement" and the "Illumina Source Code License Agreement", ** and certain third party copyright/licenses, and any user of this ** source file is bound by the terms therein (see accompanying files ** Illumina_Genome_Analyzer_Software_License_Agreement.pdf and ** Illumina_Source_Code_License_Agreement.pdf and third party ** copyright/license notices). ** ** This file is part of the Consensus Assessment of Sequence And VAriation ** (CASAVA) software package. ** ** \file PairStats.cpp ** ** \brief Encapsulation of pair statistics XML file. ** ** Encapsulation of pair statistics XML file. ** ** \author Richard Shaw **/ /*****************************************************************************/ #include #include #include #include #include #include #include "common/Exceptions.hh" #include "common/PairStats.hh" #include "io/ReadsIdx.hh" /*****************************************************************************/ namespace casava { namespace common { /*****************************************************************************/ // LaneBarcodePairStats /*****************************************************************************/ LaneBarcodePairStats::LaneBarcodePairStats() : numLowSds_(0) , numHighSds_(0) , goodPairs_(0) , read1Len_(0) , read2Len_(0) , fragmentLenMedian_(0) , fragmentLenLowSd_(0) , fragmentLenHighSd_(0) , fragmentLenSizeMin_(0) , fragmentLenSizeMax_(0) { } /*****************************************************************************/ LaneBarcodePairStats::LaneBarcodePairStats(const std::string &goodPairs, const std::string &read1Len, const std::string &read2Len, const RelOrient &relOrient, const std::string &fragmentLenMedian, const std::string &fragmentLenLowSd, const std::string &fragmentLenHighSd) : numLowSds_(0) , numHighSds_(0) , goodPairs_(boost::lexical_cast(goodPairs)) , read1Len_(boost::lexical_cast(read1Len)) , read2Len_(boost::lexical_cast(read2Len)) , relOrient_(relOrient) , fragmentLenMedian_(boost::lexical_cast(fragmentLenMedian)) , fragmentLenLowSd_(boost::lexical_cast(fragmentLenLowSd)) , fragmentLenHighSd_(boost::lexical_cast(fragmentLenHighSd)) , fragmentLenSizeMin_(0) , fragmentLenSizeMax_(0) { setNumSds(5, 3); // default SVfinder values } /** * * @param readNum 1-offset read number * * ***************************************************************************/ unsigned int LaneBarcodePairStats::getReadLen(unsigned int readNum) const { switch (readNum) { case 1: return read1Len_; case 2: return read2Len_; default: BOOST_THROW_EXCEPTION(CasavaException( EINVAL, "readNum " + boost::lexical_cast(readNum) + " out of range")); } } /*****************************************************************************/ void LaneBarcodePairStats::updateDerived() { fragmentLenSizeMin_ = fragmentLenMedian_ - (numLowSds_ * fragmentLenLowSd_); fragmentLenSizeMax_ = fragmentLenMedian_ + (numHighSds_ * fragmentLenHighSd_); } void LaneBarcodePairStats::setNumSds(float numLowSds, float numHighSds) { numLowSds_ = numLowSds; numHighSds_ = numHighSds; updateDerived(); } bool LaneBarcodePairStats::isCompatible(const LaneBarcodePairStats& that) { return (relOrient_ == that.relOrient_ && read1Len_ == that.read1Len_ && read2Len_ == that.read2Len_ && fabs(fragmentLenMedian_ - that.fragmentLenMedian_) < std::max(fragmentLenHighSd_, that.fragmentLenHighSd_) * 6.0); } LaneBarcodePairStats & LaneBarcodePairStats::assign(LaneBarcodePairStats that) { using std::swap; swap(*this, that); return *this; } LaneBarcodePairStats & LaneBarcodePairStats::operator |= (const LaneBarcodePairStats& that) { if (this == &that || !that.fragmentLenMedian_) { return *this; } if (!fragmentLenMedian_) { return assign(that); } if (!isCompatible(that)) { BOOST_THROW_EXCEPTION(CasavaException(EINVAL, (boost::format("ERROR: Invalid attempt to merge incompatible pair stats: %s:%d+-%d with %s:%d+-%d") % relOrient_.string() % fragmentLenMedian_ % fragmentLenHighSd_ % that.relOrient_.string() % that.fragmentLenMedian_ % that.fragmentLenHighSd_ ).str())); } fragmentLenHighSd_ = std::max(fragmentLenHighSd_, that.fragmentLenHighSd_); fragmentLenLowSd_ = std::min(fragmentLenLowSd_, that.fragmentLenLowSd_); // rounding is required or else the fractional part will cause exceptions in the indel finder code. fragmentLenMedian_ = round((fragmentLenMedian_ * goodPairs_ + that.fragmentLenMedian_ * that.goodPairs_) / (goodPairs_ + that.goodPairs_)); goodPairs_ += that.goodPairs_; updateDerived(); return *this; } /*****************************************************************************/ void LaneBarcodePairStats::getBounds(RelOrient& relOrient, double& minSize, double& maxSize) const { relOrient = relOrient_; minSize = fragmentLenSizeMin_; maxSize = fragmentLenSizeMax_; } /*****************************************************************************/ void LaneBarcodePairStats::getSds(double& lowSd, double& highSd) const { lowSd = fragmentLenLowSd_; highSd = fragmentLenHighSd_; } /*****************************************************************************/ /*****************************************************************************/ // PairStats /*****************************************************************************/ namespace cio=casava::io; namespace fs=boost::filesystem; inline std::string makeKey(const std::string &instrument, const int run, const int lane, const std::string &barcode) { return instrument + ':' + boost::lexical_cast(run) + ':' + boost::lexical_cast(lane) + ':' + barcode; } PairStats::PairStats(const boost::filesystem::path& projDirPath) { fs::path readsIdxPath(projDirPath / "stats" / "Reads.idx"); std::ifstream is(readsIdxPath.string().c_str()); std::string line; std::vector lineColumns; int lineNumber(0); while(std::getline(is, line)) { if (!line.empty() && '#' != line[0]) { lineColumns.clear(); boost::algorithm::split(lineColumns, line, boost::algorithm::is_any_of("\t")); if (lineColumns.size() != cio::readsIdxFieldsCount) { BOOST_THROW_EXCEPTION(IoException(EINVAL, "ERROR: incorrect number of columns in " + readsIdxPath.string() + ":" + boost::lexical_cast(lineNumber) + " expected: " + boost::lexical_cast(cio::readsIdxFieldsCount) + " actual: " + boost::lexical_cast(lineColumns.size()))); } // Remove any padding leading zeros by converting to int. const int runNum(boost::lexical_cast (lineColumns[cio::eReadsIdxRunID])); const int laneNum(boost::lexical_cast (lineColumns[cio::eReadsIdxLaneNumber])); const std::string laneBarcodekey(makeKey(lineColumns[cio::eReadsIdxInstrumentName], runNum, laneNum, lineColumns[cio::eReadsIdxBarcode])); if (!lineColumns[cio::eReadsIdxMedianFragmentLength].empty()) { myLaneBarcodePairStatsMap[laneBarcodekey] |= LaneBarcodePairStats(lineColumns[cio::eReadsIdxGoodPairs], lineColumns[cio::eReadsIdxRead1Length], lineColumns[cio::eReadsIdxRead2Length], lineColumns[cio::eReadsIdxNominalOrientation], lineColumns[cio::eReadsIdxMedianFragmentLength], lineColumns[cio::eReadsIdxLowSDFragmentLength], lineColumns[cio::eReadsIdxHighSDFragmentLength]); } } ++lineNumber; } } /*****************************************************************************/ unsigned int PairStats::getReadLen(const std::string &instrument, const int run, const int lane, const std::string &barcode, unsigned int readNum) const { return getLaneBarcodePairStats(instrument, run, lane, barcode, "getReadLen")->second.getReadLen(readNum); } /*****************************************************************************/ void PairStats::setNumSds(float numLowSds, float numHighSds) { for (LaneBarcodePairStatsMapIter lanePairStatsMapIter(myLaneBarcodePairStatsMap.begin()); lanePairStatsMapIter != myLaneBarcodePairStatsMap.end(); ++lanePairStatsMapIter) { lanePairStatsMapIter->second.setNumSds(numLowSds, numHighSds); } } /*****************************************************************************/ void PairStats::getBounds(const std::string &instrument, const int run, const int lane, const std::string &barcode, RelOrient& relOrient, double& minSize, double& maxSize) const { getLaneBarcodePairStats(instrument, run, lane, barcode, "getBounds")->second.getBounds(relOrient, minSize, maxSize); } /*****************************************************************************/ void PairStats::getSds(const std::string &instrument, const int run, const int lane, const std::string &barcode, double& lowSd, double& highSd) const { getLaneBarcodePairStats(instrument, run, lane, barcode, "getSds")->second.getSds(lowSd, highSd); } /*****************************************************************************/ double PairStats::mostLikelyInsertSize(const std::string &instrument, const int run, const int lane, const std::string &barcode) const { return getLaneBarcodePairStats(instrument, run, lane, barcode, "mostLikelyInsertSize")->second.mostLikelyInsertSize(); } /*****************************************************************************/ PairStats::LaneBarcodePairStatsMap::const_iterator PairStats::getLaneBarcodePairStats(const std::string &instrument, const int run, const int lane, const std::string &barcode, const std::string &) const { LaneBarcodePairStatsMap::const_iterator lanePairStatsMapCIter(myLaneBarcodePairStatsMap.find(makeKey(instrument, run, lane, barcode))); return lanePairStatsMapCIter; } /*****************************************************************************/ } // namespace common } // namespace casava /*****************************************************************************/