/** * Project : CASAVA * Module : $RCSfile: AnomReadVec.cpp,v $ * @author : Richard Shaw * Copyright : Copyright (c) Illumina 2010. All rights reserved. * ** This software is covered by the "Illumina Genome Analyzer Software ** License Agreement" and the "Illumina Source Code License Agreement", ** and certain third party copyright/licenses, and any user of this ** source file is bound by the terms therein (see accompanying files ** Illumina_Genome_Analyzer_Software_License_Agreement.pdf and ** Illumina_Source_Code_License_Agreement.pdf and third party ** copyright/license notices). * */ /*****************************************************************************/ #include #include #include #include #include "common/StreamUtil.h" #include "common/Exceptions.hh" #include "common/MathCompatibility.hh" #include "common/SequenceUtils.hh" #include "variance/AnomReadVec.hh" #include "variance/StatWidget.hh" /*****************************************************************************/ namespace ca { namespace variance_detection { /*****************************************************************************/ using boost::assign::list_of; /*****************************************************************************/ AnomReadVec::AnomReadVec(unsigned int groupNum) : myAnomReadType(AnomalousRead::ShadowOrSemiAligned), myPos(-1), myGroupNum(groupNum), myLeftBreakPos(myPos), myRightBreakPos(myPos) { ; } /*****************************************************************************/ AnomReadVec::AnomReadVec(AnomalousRead::Type anomReadType, unsigned int groupNum) : myAnomReadType(anomReadType), myPos(-1), myGroupNum(groupNum), myLeftBreakPos(myPos), myRightBreakPos(myPos) { ; } /*****************************************************************************/ void AnomReadVec::updateBreakPoints(PairStats& pairStats) { // FIXME : subclass - but all types are read using same AnomReadVec. if (myAnomReadType != AnomalousRead::DeletionPair) { // For insertions: there is only one breakpoint and myPos is already // the best estimate. // For duplications : a special case of insertions - one breakpoint. // For inversions: two different clusters of reads for the two // breakpoints return; } double meanDelSize(0); for (AnomReadVecCIter anomReadPtrCIter(begin()); anomReadPtrCIter != end(); ++anomReadPtrCIter) { Singleton* anomReadPtr(*anomReadPtrCIter); Alignment& alignment(anomReadPtr->alignment_); if (pairStats.isValid(alignment.getMachineName(), alignment.getRunNumber(), alignment.getLaneNumber(), alignment.getIndex())) { const double mostLikelyInsertSize(pairStats.mostLikelyInsertSize(alignment.getMachineName(), alignment.getRunNumber(), alignment.getLaneNumber(), alignment.getIndex())); // Get the actual pair values for this read. const unsigned int readNum(alignment.getReadNumber()); unsigned int readLen(pairStats.getReadLen(alignment.getMachineName(), alignment.getRunNumber(), alignment.getLaneNumber(), alignment.getIndex(), readNum)); const unsigned int partnerReadNum(3 - readNum); unsigned int partnerReadLen(pairStats.getReadLen(alignment.getMachineName(), alignment.getRunNumber(), alignment.getLaneNumber(), alignment.getIndex(), partnerReadNum)); bool isChimeric(false); long leftPos(-1); unsigned int insertSize(0); RelOrient relOrient; SequenceUtils::getReadPairInfo(alignment, readLen, partnerReadLen, isChimeric, leftPos, insertSize, relOrient); meanDelSize += (static_cast(insertSize) - mostLikelyInsertSize); } } meanDelSize /= size(); myLeftBreakPos = myPos - static_cast(round(meanDelSize)); myRightBreakPos = myPos + static_cast(round(meanDelSize)); } /*****************************************************************************/ unsigned int AnomReadVec::numCommonReads(const AnomReadVec& anomReadVec) { ensureReadIdSetPopulated(); unsigned int numCommon(0); for (AnomReadVecCIter anomReadPtrCIter(anomReadVec.begin()); anomReadPtrCIter != anomReadVec.end(); ++anomReadPtrCIter) { Singleton* anomReadPtr(*anomReadPtrCIter); const std::string readId(SequenceUtils::getReadId(anomReadPtr-> alignment_)); if (myReadIdSet.find(readId) != myReadIdSet.end()) { ++numCommon; } } return numCommon; } /*****************************************************************************/ void AnomReadVec::absorb(AnomReadVec& anomReadVec) { ensureReadIdSetPopulated(); for (AnomReadVecIter anomReadPtrIter(anomReadVec.begin()); anomReadPtrIter != anomReadVec.end(); ++anomReadPtrIter) { Singleton* anomReadPtr(*anomReadPtrIter); const std::string readId(SequenceUtils::getReadId(anomReadPtr-> alignment_)); if (myReadIdSet.find(readId) == myReadIdSet.end()) { // Was in the absorbee but not the absorber. // Ownership of the pointer is transferred to this AnomReadVec. push_back(anomReadPtr); } else { // Duplicate, so delete it in the other AnomReadVec. delete(*anomReadPtrIter); } *anomReadPtrIter = NULL; // just to be safe } // Empty the other anomReadVec. anomReadVec.erase(anomReadVec.begin(), anomReadVec.end()); // Acquire its type(s) myExtraAnomReadTypeVec.push_back(anomReadVec.myAnomReadType); myExtraAnomReadTypeVec.insert(myExtraAnomReadTypeVec.end(), anomReadVec.myExtraAnomReadTypeVec.begin(), anomReadVec.myExtraAnomReadTypeVec.end()); // Update our read ID set. ensureReadIdSetPopulated(true); } /*****************************************************************************/ void AnomReadVec::ensureReadIdSetPopulated(bool forceUpdate) { if (!(myReadIdSet.empty() || forceUpdate)) { return; } for (AnomReadVecCIter anomReadPtrCIter(begin()); anomReadPtrCIter != end(); ++anomReadPtrCIter) { Singleton* anomReadPtr(*anomReadPtrCIter); myReadIdSet.insert(SequenceUtils::getReadId(anomReadPtr->alignment_)); } } /*****************************************************************************/ std::istream& operator>>(std::istream& istrm, AnomReadVec& anomReadVec) { const static StrVec startTokenList1 = list_of("Found")("cluster")("of")("size"); const static StrVec startTokenList2 = list_of("at"); const static StrVec startTokenList3 = list_of("type"); std::string lineStr; bool inCluster(false); // For checking unsigned int clusterSize(0); while (1) { getline(istrm, lineStr); if (istrm.eof()) break; istringstream lineStrm(lineStr); std::string tokenStr; lineStrm >> tokenStr; if (!inCluster) { // Detect start of cluster or skip lines outside of clusters. if (tokenStr == "CLUSTER_START") { inCluster = true; if (!eat_speced_strs(lineStrm, startTokenList1)) { BOOST_THROW_EXCEPTION(CasavaException(EINVAL, "Unexpected format " + lineStr)); } lineStrm >> clusterSize; if (!eat_speced_strs(lineStrm, startTokenList2)) { BOOST_THROW_EXCEPTION(CasavaException(EINVAL, "Unexpected format " + lineStr)); } // Simply use the position read in. // Do not regenerate it from the original reads. lineStrm >> anomReadVec.myPos; anomReadVec.myLeftBreakPos = anomReadVec.myPos; anomReadVec.myRightBreakPos = anomReadVec.myPos; if (lineStrm.eof()) { anomReadVec.myAnomReadType = AnomalousRead::ShadowOrSemiAligned; } else { if (!eat_speced_strs(lineStrm, startTokenList3)) { BOOST_THROW_EXCEPTION(CasavaException(EINVAL, "Unexpected " "format " + lineStr)); } lineStrm >> anomReadVec.myAnomReadType; } } continue; } // In a cluster if (tokenStr == "CLUSTER_END") { // Could check rest of line. if (static_cast(anomReadVec.size()) != clusterSize) { BOOST_THROW_EXCEPTION(CasavaException(EINVAL, "Number of reads found " "in cluster did not " "match count")); } inCluster = false; break; } // Should be a read. Rewind the line stream. lineStrm.str(lineStr); int minPos(0), maxPos(0); Alignment alignment; lineStrm >> minPos; lineStrm >> maxPos; lineStrm.ignore(); // eat the tab lineStrm >> alignment; // There must be no further reverse complementing of R-mapped reads // (with reversing of quality) at this stage (ClusterMerger) // -> use IdentityAnomRead. anomReadVec.push_back(new IdentityAnomRead(alignment, minPos, maxPos)); } if (inCluster) { BOOST_THROW_EXCEPTION(CasavaException(EINVAL, "Incomplete cluster found")); } return istrm; } /*****************************************************************************/ std::ostream& operator<<(std::ostream& ostrm, const AnomReadVec& anomReadVec) { // Suppress output of empty clusters (as when cluster has been absorbed). if (anomReadVec.empty()) { return ostrm; } ostrm << "CLUSTER_START Found cluster of size " << anomReadVec.size() << " at " << anomReadVec.myPos << " type " << anomReadVec.myAnomReadType; for (AnomReadVec::AnomReadTypeVecCIter anomReadTypeCIter(anomReadVec.myExtraAnomReadTypeVec.begin()); anomReadTypeCIter != anomReadVec.myExtraAnomReadTypeVec.end(); ++anomReadTypeCIter) { ostrm << " " << *anomReadTypeCIter; } ostrm << endl; const Singleton* pAnomRead; StatWidget minStats; StatWidget maxStats; for (AnomReadVec::const_iterator anomReadCIter(anomReadVec.begin()); anomReadCIter != anomReadVec.end(); ++anomReadCIter) { pAnomRead = *anomReadCIter; ostrm << pAnomRead->minPos_ << "\t" << pAnomRead->maxPos_ << "\t" << pAnomRead->alignment_; minStats.update(pAnomRead->minPos_); maxStats.update(pAnomRead->maxPos_); } ostrm << "CLUSTER_END group=" << static_cast(anomReadVec.myGroupNum) << "|reads=" << static_cast(minStats.getNumSamples()) << "|start=" << static_cast(minStats.getMean()) << "|end=" << static_cast(maxStats.getMean()) << endl; return ostrm; } /*****************************************************************************/ } // namespace variance_detection } // namespace ca /*****************************************************************************/