#include "variance/ClusterFinderImpl.hh" /** * Project : CASAVA * Module : $RCSfile: ClusterFinderImpl.cpp,v $ * @author : Tony Cox * Copyright : Copyright (c) Illumina 2008, 2009. All rights reserved. * ** This software is covered by the "Illumina Genome Analyzer Software ** License Agreement" and the "Illumina Source Code License Agreement", ** and certain third party copyright/licenses, and any user of this ** source file is bound by the terms therein (see accompanying files ** Illumina_Genome_Analyzer_Software_License_Agreement.pdf and ** Illumina_Source_Code_License_Agreement.pdf and third party ** copyright/license notices). * */ #define NEAREST_NEIGHBOUR_PEAK_FINDER 1 //#define DEBUG_CLF 1 #include "variance/AnomalousRead.hh" using namespace casava::common; namespace ca { namespace variance_detection { ClusterFinderImpl::ClusterFinderImpl ( double maxDist, unsigned int minGroupSize, double spanningReadThreshold, const std::string& inputFileName, const std::string& statsFileName, const std::string& outputFileName, const bool isQphred) : _maxDist(maxDist), _minGroupSize(minGroupSize), _spanningReadThreshold(spanningReadThreshold), _inputFileName(inputFileName), _statsFileName(statsFileName), _outputFileName(outputFileName), _isQphred(isQphred), _maxDistAligned(0.0), _maxDistShadow(20.0), _maxDistInterStrand(12.0) { if (_inputFileName.empty()) { cerr << "No value given for input file" << endl; // TBD exception? } if (_statsFileName.empty()) { cerr << "No value given for stats file" << endl; // TBD exception? } if (_outputFileName.empty()) { cerr << "No value given for output file" << endl; // TBD exception? } // Tony's default values: //_minPairedReadAlignmentScore=6; //_minSingleReadAlignmentScore=10; //_numStandardDevs=3; // need to verify if this threshold is applicable to different data sets //_spanningReadThreshold=25; //_numGroups=0; //_numSpanReads=0; //_numSingletons=0; //_numGoodSingletons=0; } // ~c'tor void ClusterFinderImpl::importSampleStats ( map& sampleInfo ) { #ifdef DEBUG_CLF cerr << "importSampleStats" << endl; #endif std::string line, sampleName, field; double mean, sd, se; ifstream statsFile( _statsFileName.c_str() ); if (!statsFile) { cerr << "Problem opening sample file " << _statsFileName << endl; } // ~if while (1) { getline( statsFile, line ); if (statsFile.eof()) break; // cerr << "Line : " << line << endl; stringstream ss(line); ss >>sampleName; sampleName+=" "; ss >>field; sampleName+=field; sampleName+=" "; ss >>field; sampleName+=field; ss >> mean; ss >> sd; ss >> se; cerr << sampleName << " = " << mean << " " << sd << " " << se << endl; sampleInfo[sampleName]=OffsetInfo(); sampleInfo[sampleName].mean_=mean; sampleInfo[sampleName].sd_=sd; } // ~while statsFile.close(); #ifdef DEBUG_CLF cerr << "importSampleStats done" << endl; #endif } // ~ClusterFinderImpl::importSampleStats /*****************************************************************************/ unsigned int ClusterFinderImpl::addIfAnomalousPairRead(AnomalousReadMgr& anomalousReadMgr, const PairStats& pairStats, const Alignment& alignment) { // FIXME : some duplication of IndelFinderImpl::addIfAnomalousPairRead. // Get the expected pair values for reads in the same lane as this read. RelOrient expectedRelOrient; double minSize(0.0); double maxSize(0.0); pairStats.getBounds(alignment.getMachineName(), alignment.getRunNumber(), alignment.getLaneNumber(), alignment.getIndex(), expectedRelOrient, minSize, maxSize); // Get the actual pair values for this read. const unsigned int readNum(alignment.getReadNumber()); unsigned int readLen(pairStats.getReadLen(alignment.getMachineName(), alignment.getRunNumber(), alignment.getLaneNumber(), alignment.getIndex(), readNum)); // FIXME : Assumes only possible read numbers are 1 and 2. const unsigned int partnerReadNum(3 - readNum); unsigned int partnerReadLen(pairStats.getReadLen(alignment.getMachineName(), alignment.getRunNumber(), alignment.getLaneNumber(), alignment.getIndex(), partnerReadNum)); bool isChimeric(false); long leftPos(-1); unsigned int insertSize(0); RelOrient relOrient; if (!SequenceUtils::getReadPairInfo(alignment, readLen, partnerReadLen, isChimeric, leftPos, insertSize, relOrient)) { // Skip unless both reads are mapped (and not to the same position). return false; } // Allow reads to be added to multiple anomalous pair types. typedef std::vector AnomReadTypeVec; typedef AnomReadTypeVec::const_iterator AnomReadTypeVecCIter; AnomReadTypeVec anomReadTypeVec; if (isChimeric) { anomReadTypeVec.push_back(AnomalousRead::ChimericPair); } else { if (!(relOrient == expectedRelOrient)) { if ((relOrient == RelOrient("Fm")) || (relOrient == RelOrient("Fp"))) { anomReadTypeVec.push_back(AnomalousRead::InversionPair); } else { anomReadTypeVec.push_back(AnomalousRead::DuplicationPair); } } else { // expected orientation if (insertSize < minSize) { anomReadTypeVec.push_back(AnomalousRead::InsertionPair); } else if (insertSize > maxSize) { anomReadTypeVec.push_back(AnomalousRead::DeletionPair); } } } for (AnomReadTypeVecCIter anomReadTypeVecCIter(anomReadTypeVec.begin()); anomReadTypeVecCIter != anomReadTypeVec.end(); ++anomReadTypeVecCIter) { AnomalousRead::Type anomReadType(*anomReadTypeVecCIter); anomalousReadMgr.addRead(anomReadType, ((anomReadType == AnomalousRead::ChimericPair) ? dynamic_cast (new ChimericPairAnomRead(alignment, pairStats, readLen)) : dynamic_cast (new SingleChromPairAnomRead(alignment, leftPos, insertSize)))); } return static_cast(anomReadTypeVec.size()); } /*****************************************************************************/ // Return true if an aligned read should be passed to the clustering process // It's AlignCandReads responsibility to ensure shadow reads don't get // erroneously assigned a low-confidence alignment but instead stay as // shadows. ClusterFinderImpl's task is to // - always include gapped alignments // - never include reads containing Ns (we don't likes them) // - don't include reads with high-confidence alignments (which are // likely from shadows caused by repeats and not indels) bool ClusterFinderImpl::wantAlignment( const Alignment& alignment ) { if (strchr(alignment.getData().c_str(),'N')!=NULL) { // if it contains "N"s we don't want it return false; } // ~if else if (alignment.getMatchDescriptor()[0] == '-') { // if it's a shadow we want it return true; } // ~else iif else { // if it's a gapped alignment we want it if (strpbrk(alignment.getMatchDescriptor().c_str(),"^$")!=NULL) { return true; // ~ include } //~if else { double adsc (SequenceUtils::getSemiAlignedReadMetric(alignment,_isQphred)); if (adsc<_spanningReadThreshold) return false; // if it's a *good* alignment we *don't* want it else return true; } // ~else } // ~else } // ~wantAlignment void ClusterFinderImpl::importReads ( AnomalousReadMgr& anomalousReadMgr, map& sampleInfo, PairStats& pairStats) { #ifdef DEBUG_CLF cerr << "importReads" << endl; #endif Alignment alignment; int minOff,maxOff; // const char* pAlign; std::string s, sampleName; // Now extract the singletons variable from anomalousReadMgr. std::vector& singletons(anomalousReadMgr. getVec(AnomalousRead::ShadowOrSemiAligned)); int numAligned(0); int numAnomPairReads(0), numSemiAligned(0), numAlignedRejected(0); int numShadows(0),numShadowsRejected(0); int numRead(0); ifstream inputFile( _inputFileName.c_str() ); if (!inputFile) { cerr << "Problem opening input file " << _inputFileName << endl; } // ~if std::string line; while (1) { // getline( inputFile, line ); inputFile >> minOff; if (inputFile.eof()) break; inputFile >> maxOff; inputFile.ignore(); inputFile >> alignment; numRead++; #ifdef DEBUG_CLF // if ((numRead%5000)==0) cerr << numRead << " " << sampleInfo.size() << endl; if (numRead>=217085) cerr << numRead << " " << alignment << endl; #endif // cout << "minOff: " << minOff << " maxOff: " << maxOff << " " << alignment; if (alignment.getMatchDescriptor().c_str()[0] != '-') { numAligned++; if (pairStats.isValid(alignment.getMachineName(), alignment.getRunNumber(), alignment.getLaneNumber(), alignment.getIndex())) { if (addIfAnomalousPairRead(anomalousReadMgr, pairStats, alignment) > 0) { ++numAnomPairReads; } } singletons.push_back(new SingletonSemiAligned(alignment)); singletons.back()->minPos_=minOff; singletons.back()->maxPos_=maxOff; // All alignments here have passed purity filtering hence // always have setPassed==true. We used setPassed as an // internal flag to denote whether or not we want to cluster // the alignment. We set it back to true before we output the // alignment again. if (!wantAlignment(alignment)) { singletons.back()->alignment_.setPassed(false); numAlignedRejected++; } else { ++numSemiAligned; } } // ~if else { numShadows++; sampleName=SequenceUtils::getSampleName(alignment); #ifdef DEBUG_CLF if (sampleInfo.find(sampleName)==sampleInfo.end()) { cerr << "could not find sample info for sample name " << sampleName << " found at line " << numRead << " in file " << _inputFileName << ", entry = " << alignment << endl; } // ~if #endif singletons.push_back( new SingletonNotMatched (alignment, minOff, maxOff, sampleInfo[sampleName].sd_ )); if (!wantAlignment(alignment)) { singletons.back()->alignment_.setPassed(false); numShadowsRejected++; } } // ~else } // ~while cerr << "Read " << numAligned << " aligned reads, of which " << endl << " " << numAnomPairReads << " were anomalous pair reads" << endl << " " << numSemiAligned << " were semi-aligned reads" << endl << " " << numAlignedRejected << " were excluded from clustering" << endl; cerr << "Read " << numShadows << " shadow reads, of which " << numShadowsRejected << " were excluded from clustering" << endl; inputFile.close(); #ifdef DEBUG_CLF cerr << "importReads done" << endl; #endif } // ~ClusterFinderImpl::importReads void ClusterFinderImpl::printGroup( vector& group, int& numGroups, ostream& outputFile ) { double gStart(0.0), gEnd(0.0); if (group.size()>=_minGroupSize) { outputFile << "CLUSTER_START Found cluster of size " << group.size() << endl; } // ~if for (vector::iterator i(group.begin()); i!=group.end();++i) { outputFile << (*i)->minPos_ << "\t" << (*i)->maxPos_ << "\t" << (*i)->alignment_; gStart+=(*i)->minPos_; gEnd+=(*i)->maxPos_; } // ~for gStart/=group.size(); gEnd/=group.size(); if (group.size()>=_minGroupSize) { outputFile << "CLUSTER_END group=" << ++numGroups << "|reads=" << group.size() << "|start=" << (int)gStart << "|end=" << (int)gEnd << endl; } // ~if } // ~void ClusterFinderImpl::printGroup( vector& group ) int numLeadingMatches( Alignment& alignment ) { int numLeadingMatches(0); int exactRun(0); int firstMismatch(-1); int firstIndel(-1); int numIndels(0); for (unsigned int i(0);i(alignment.getMatchDescriptor()[i])-48); } // if else { // found a nondigit character, return leading values numLeadingMatches+=exactRun; exactRun=0; if (isalpha(alignment.getMatchDescriptor()[i])) { if (firstMismatch==-1) firstMismatch=numLeadingMatches; numLeadingMatches++; } else if (alignment.getMatchDescriptor()[i]=='^') { if (firstIndel==-1) firstIndel=numLeadingMatches; numIndels++; } } // ~else } // ~for // if exactly one indel is found, return position of that // else return position of first mismatch if (numIndels==1) return firstIndel; else return firstMismatch; } // ~numLeadingMatches #ifdef UNDER_CONSTRUCTION int numLeadingMatches( Alignment& alignment ) { int numLeadingMatches(0); for (unsigned int i(0);i(alignment.getMatchDescriptor()[i])-48); } // if else { // found a nondigit character, return leading values return numLeadingMatches; } // ~else } // ~for // only get here if descriptor is all digits, i.e. an exact match return -1; } // ~numLeadingMatches #endif void ClusterFinderImpl::clusterAlignedReads ( vector& matched, vector& matchedClusters ) { int lastPos(-500000); for (vector::iterator i(matched.begin()); i!=matched.end();++i) { if (i->pos_-lastPos>_maxDistAligned) { #ifdef DEBUG_CLF if (!matchedClusters.empty()) cerr << "MEAN=" << std::fixed << matchedClusters.back().positionStats_.getMean() << endl; cerr << "CLUSTER " << std::fixed << i->pos_ << " " << lastPos << endl; #endif matchedClusters. push_back( AnomReadGroup(AnomalousRead::ShadowOrSemiAligned) ); } // ~if #ifdef DEBUG_CLF cerr << i->pos_ << " " << i->pRead_->alignment_; #endif if (!matchedClusters.empty()) { matchedClusters.back().insert(i->pRead_); matchedClusters.back().positionStats_.update(i->pos_); } lastPos=i->pos_; } // ~for } // ~clusterAlignedReads void ClusterFinderImpl::clusterShadowsForward ( vector& shadowForward, vector& matchedClustersForward ) { // vector::iterator j(shadowForward.begin()); // double upperLimit; int numAdded(0); map< Singleton*, AnomReadGroup* > active; if (matchedClustersForward.empty()) { if (!shadowForward.empty()) { // There are shadow reads but no semi-aligned clusters. // Even if the shadows could be clustered & assembled, // the contig is unlikely to align reliably -> discard. shadowForward.clear(); return; } } vector::iterator i(matchedClustersForward.begin()); vector::iterator j(shadowForward.begin()); while (shadowForward.end() != j) { //cerr << "j pos= " << j->pos_ << endl; //cerr << "i position stats= " << i->positionStats_.getMean() << endl; if (j->pos_<=i->positionStats_.getMean()) { if (active.find(j->pRead_)!=active.end()) { if (active[j->pRead_]!=NULL) { active[j->pRead_]->insert(j->pRead_); numAdded++; } // ~if active.erase(j->pRead_); } // ~if else { active[j->pRead_]=NULL; } // ~else if (++j==shadowForward.end()) break; } // ~if else { for ( map< Singleton*, AnomReadGroup* >::iterator k(active.begin()); k!=active.end();++k) { k->second=&*i; } // ~for if (++i==matchedClustersForward.end()) break; } // ~else } // ~while #ifdef XXX vector::iterator j(shadowForward.begin()); double upperLimit; int numAdded(0); for (vector::iterator i(matchedClustersForward.begin()); i!=matchedClustersForward.end();++i) { while ( (j!=shadowForward.end()) &&(j->pos_positionStats_.getMean()-_maxDistShadow)) ++j; // &&(j->pos_positionStats_.getMean())) ++j; if (j==shadowForward.end()) break; upperLimit=i->positionStats_.getMean()+_maxDistShadow; while (j->pos_<=upperLimit) { i->insert(j->pRead_); // i->positionStats_.update(j->pos_); // TBD upperLimit=j->pos_+_maxDistShadow; numAdded++; if (++j==shadowForward.end()) break; } } #endif cerr << "Added " << numAdded << " shadow reads to forward strand clusters" << endl; return; } // ~addShadowsForward void ClusterFinderImpl::clusterShadowsReverse ( vector& shadowReverse, vector& matchedClustersReverse ) { int numAdded(0); map< Singleton*, AnomReadGroup* > active; if (matchedClustersReverse.empty()) { if (!shadowReverse.empty()) { // There are shadow reads but no semi-aligned clusters. // Even if the shadows could be clustered & assembled, // the contig is unlikely to align reliably -> discard. shadowReverse.clear(); return; } } vector::reverse_iterator i(matchedClustersReverse.rbegin()); vector::reverse_iterator j(shadowReverse.rbegin()); while (shadowReverse.rend() != j) { if (j->pos_>=i->positionStats_.getMean()) { if (active.find(j->pRead_)!=active.end()) { if (active[j->pRead_]!=NULL) { active[j->pRead_]->insert(j->pRead_); numAdded++; } // ~if active.erase(j->pRead_); } // ~if else { active[j->pRead_]=NULL; } // ~else if (++j==shadowReverse.rend()) break; } // ~if else { for ( map< Singleton*, AnomReadGroup* >::iterator k(active.begin()); k!=active.end();++k) { k->second=&*i; } // ~for if (++i==matchedClustersReverse.rend()) break; } // ~else } // ~while #ifdef XXX vector::reverse_iterator j(shadowReverse.rbegin()); double upperLimit; int numAdded(0); for (vector::reverse_iterator i(matchedClustersReverse.rbegin()); i!=matchedClustersReverse.rend();++i) { while ( (j!=shadowReverse.rend()) &&(j->pos_>i->positionStats_.getMean()+_maxDistShadow)) ++j; if (j==shadowReverse.rend()) break; upperLimit=i->positionStats_.getMean()-_maxDistShadow; while (j->pos_>=upperLimit) { i->insert(j->pRead_); // i->positionStats_.update(j->pos_); // TBD upperLimit=j->pos_-_maxDistShadow; numAdded++; if (++j==shadowReverse.rend()) break; } } #endif cerr << "Added " << numAdded << " shadow reads to reverse strand clusters" << endl; return; } // ~addShadowsReverse void ClusterFinderImpl::clusterStrands ( vector& matchedClustersForward, vector& matchedClustersReverse ) { // double lastPos(-500000); // TBD do this with insert for (vector::iterator i(matchedClustersReverse.begin()); i!=matchedClustersReverse.end();++i) { matchedClustersForward.push_back(*i); } // ~for sort (matchedClustersForward.begin(), matchedClustersForward.end(), SortAnomReadGroup()); if (matchedClustersForward.size()<2) return; vector::iterator i(matchedClustersForward.begin()), prev(matchedClustersForward.begin()); i++; for (;i!=matchedClustersForward.end();++i) { #ifdef DEBUG_CLF cerr << i->size() << " " << i->positionStats_.getMean() << endl; if (!i->empty()) cerr << (*i->begin())->alignment_; #endif if ((prev->size()!=0) &&(i->size()!=0) &&((*prev->begin())->alignment_.getMatch().getStrand() == Match::Forward) &&((*i->begin())->alignment_.getMatch().getStrand() == Match::Reverse) &&(i->positionStats_.getMean()-prev->positionStats_.getMean() <=_maxDistInterStrand)) { #ifdef DEBUG_CLF cerr << "DOING_MERGE" << endl; #endif for (AnomReadGroup::iterator j(i->begin());j!=i->end();j++) { prev->insert(*j); } i->clear(); } //~if prev=i; } // ~for } // ~clusterStrands int ClusterFinderImpl::getMostProbableBreakPoint(Alignment& alignment) { int breakPointPos = 0; const char* qualityString = alignment.getQuality().c_str(); int lengthOfAlignment = strlen (qualityString); const char* type = alignment.getMatchDescriptor().c_str(); bool noGaps = true; for (unsigned j=0;j& clustersSoFar) { cerr << "For " << anomReadType << " :-" << std::endl; AnomalousReadMgr::SingletonEventVec sortedAnomReads; anomalousReadMgr.getSortedAnomReads(anomReadType, sortedAnomReads); cerr << " Found " << sortedAnomReads.size() << " anom reads." << std::endl; AnomalousReadMgr::AnomReadGroupVec clusters; anomalousReadMgr.clusterAnomPairReads(anomReadType, sortedAnomReads, clusters); cerr << " Made " << clusters.size() << " clusters." << std::endl; // Append to clustersSoFar. clustersSoFar.insert(clustersSoFar.end(), clusters.begin(), clusters.end()); } /*****************************************************************************/ void ClusterFinderImpl::addAnomPairClusters(AnomalousReadMgr& anomalousReadMgr, vector& clustersSoFar) { AnomalousReadMgr::TypeVec anomPairTypeVec; anomalousReadMgr.getAnomPairTypeVec(anomPairTypeVec); for (AnomalousReadMgr::TypeVecCIter typeCIter(anomPairTypeVec.begin()); typeCIter != anomPairTypeVec.end(); ++typeCIter) { addAnomPairTypeClusters(anomalousReadMgr, *typeCIter, clustersSoFar); } // Final sort. sort(clustersSoFar.begin(), clustersSoFar.end(), SortAnomReadGroup()); } /*****************************************************************************/ void ClusterFinderImpl::operator()( AnomalousReadMgr& anomalousReadMgr ) { // const int maxDistAligned(1); // const int maxDistShadow(20); // const int maxDistInterStrand(7); ofstream outputFile(_outputFileName.c_str()); if (!outputFile) { cerr << "Failed to open output file " << _outputFileName << " for writing" << endl; return; // TBD what to do here - throw exception?? } // ~if if (anomalousReadMgr.totalNumReadsAdded() == 0) { cerr << "No reads to cluster, nothing to do" << endl; outputFile.close(); return; } // Now extract the singletons variable from anomalousReadMgr. std::vector& singletons(anomalousReadMgr. getVec(AnomalousRead::ShadowOrSemiAligned)); if (singletons.size() == 0) { cerr << "No shadow/semi-aligned reads to cluster." << endl; } // std::vector eventStarts; // std::vector eventEnds; vector matchedForward, matchedReverse, shadowForward, shadowReverse; vector matchedForwardClusters, matchedReverseClusters; int numLeading; unsigned int numAlignedRejected(0),numClusters(0); //double posToSort; for ( vector::iterator i(singletons.begin()); i!=singletons.end();++i) { if (!(*i)->alignment_.getPassed()) { (*i)->alignment_.setPassed(true); outputFile << (*i)->minPos_ << "\t" << (*i)->maxPos_ << "\t" << (*i)->alignment_; continue; } // ~if if (dynamic_cast(*i)!=NULL) { //numLeading=numLeadingMatches( (*i)->alignment_ ); numLeading = getMostProbableBreakPoint( (*i)->alignment_ ); //cerr << "In CluterFinderImpl, after numLeading = " << numLeading << "\n"; // if (numLeading>=1) // { if ((*i)->alignment_.getMatch().getStrand() == Match::Forward) { matchedForward.push_back ( SingletonEvent( (*i)->minPos_+numLeading, *i )); } // ~if else { matchedReverse.push_back ( SingletonEvent( (*i)->maxPos_-numLeading, *i )); } // ~else //} // else // { numAlignedRejected++; // } } // ~ if SemiAligned else { assert (dynamic_cast(*i)!=NULL); //posToSort=((*i)->minPos_+(*i)->maxPos_)/2.0; if ((*i)->alignment_.getMatch().getStrand() == Match::Forward) { // cout << (*i)->minPos_ << " " << (*i)->maxPos_ << " " << (*i)->alignment_; // cout << (*i)->minPos_ << " " << (*i)->alignment_.getMatch().getPosition() << endl; //cerr << "shadow read:" << (*i)->alignment_ << endl; shadowForward.push_back( SingletonEvent( (*i)->minPos_, *i)); shadowForward.push_back( SingletonEvent( (*i)->alignment_.getMatch().getPosition(), *i)); // shadowForward.push_back( SingletonEvent( posToSort, *i)); } // ~if else { // cout << (*i)->minPos_ << " " << (*i)->maxPos_ << " " << (*i)->alignment_; // cout << (*i)->maxPos_ << " " << (*i)->alignment_.getMatch().getPosition() << endl; shadowReverse.push_back( SingletonEvent( (*i)->maxPos_, *i)); shadowReverse.push_back( SingletonEvent( (*i)->alignment_.getMatch().getPosition(), *i)); // shadowReverse.push_back( SingletonEvent( posToSort, *i)); } // ~else } // ~else NotMatched } // assert(1==0); cerr << "For " << AnomalousRead::ShadowOrSemiAligned << " :-" << endl; cerr << numAlignedRejected << " further aligned reads were rejected" << endl; cerr << matchedForward.size() << " forward strand aligned reads" << endl; cerr << shadowForward.size() << " forward strand shadow + singleton reads" << endl; cerr << matchedReverse.size() << " reverse strand aligned reads" << endl; cerr << shadowReverse.size() << " reverse strand shadow + singleton reads" << endl; sort(matchedForward.begin(), matchedForward.end(), Sorter()); sort(matchedReverse.begin(), matchedReverse.end(), Sorter()); sort(shadowForward.begin(), shadowForward.end(), Sorter()); sort(shadowReverse.begin(), shadowReverse.end(), Sorter()); clusterAlignedReads( matchedForward, matchedForwardClusters ); clusterAlignedReads( matchedReverse, matchedReverseClusters ); clusterShadowsForward( shadowForward, matchedForwardClusters ); clusterShadowsReverse( shadowReverse, matchedReverseClusters ); cerr << " Made " << matchedForwardClusters.size() << " forward clusters." << std::endl << " Made " << matchedReverseClusters.size() << " reverse clusters." << std::endl; clusterStrands( matchedForwardClusters, matchedReverseClusters ); //for (vector::iterator i(matchedForwardClusters.begin()); // i!=matchedForwardClusters.end();++i) { //i->print( _minGroupSize, numClusters, outputFile ); //int breakpointPos = static_cast(i->positionStats_.getMean()); //cout << "breakpoint pos " << breakpointPos << "\t"; //int eventType = i->identifyType(); //cerr << i->anomReadType_ << endl; //EventChecker* eventChecker = new EventChecker::EventChecker(*i); //int eventType = eventChecker->getEventType(); //cout << "event type " << eventType << "\n"; //} cerr << " Merged strands -> " << matchedForwardClusters.size() << " clusters." << std::endl; addAnomPairClusters(anomalousReadMgr, matchedForwardClusters); for (vector::iterator i(matchedForwardClusters.begin()); i!=matchedForwardClusters.end();++i) i->print( _minGroupSize, numClusters, outputFile ); // for (vector::iterator i(matchedReverseClusters.begin()); // i!=matchedReverseClusters.end();++i) // i->print( _minGroupSize, outputFile ); cerr << "found " << numClusters << " clusters in all" << endl; outputFile.close(); } // ~void ClusterFinderImpl::operator() #ifdef PROTOTYPING_NEW_VERSION void ClusterFinderImpl::operator()( const AnomalousReadMgr& anomalousReadMgr ) { // Now extract the singletons variable from anomalousReadMgr. std::vector& singletons(anomalousReadMgr. getVec(AnomalousRead::ShadowOrSemiAligned)); ofstream outputFile(_outputFileName.c_str()); if (!outputFile) { cerr << "Failed to open stats file " << _outputFileName << " for writing" << endl; return; // TBD what to do here - throw exception?? } // ~if if (singletons.size()==0) { cerr << "No reads to cluster, nothing to do" << endl; outputFile.close(); outputFile.close(); return; } // ~if std::vector eventStarts; std::vector eventEnds; for ( vector::iterator i(singletons.begin()); i!=singletons.end();++i) { (*i)->populateBounds( eventStarts, eventEnds); } // ~for assert(!eventEnds.empty()); assert(!eventStarts.empty()); assert(eventEnds.size()==eventStarts.size()); #ifdef NEAREST_NEIGHBOUR_PEAK_FINDER vector::iterator pStart(eventStarts.begin()),pEnd(eventEnds.begin()); for (;pStart!=eventStarts.end();++pStart,++pEnd) { pStart->pos_+=pEnd->pos_; pStart->pos_/=2.0; } #endif sort(eventStarts.begin(), eventStarts.end(), Sorter()); sort(eventEnds.begin(), eventEnds.end(), Sorter()); #ifdef NEAREST_NEIGHBOUR_PEAK_FINDER double lastPos=-5000000; Singleton* lastRead(NULL); // const double maxDist(_options.maxDistance()); // const unsigned int minGroupSize(_options.minGroupSize()); vector group; int numGroups(0); for (pStart=eventStarts.begin();pStart!=eventStarts.end();++pStart) { if (!pStart->pRead_->alignment_.getPassed()) { pStart->pRead_->alignment_.setPassed(true); outputFile << pStart->pRead_->minPos_ << "\t" << pStart->pRead_->maxPos_ << "\t" << pStart->pRead_->alignment_; continue; } // ~if if ((pStart->pos_-lastPos)<=_maxDist) { if (group.empty()) group.push_back(lastRead); group.push_back(pStart->pRead_); } // ~if else { printGroup( group, numGroups, outputFile ); if (group.size()>0) { group.clear(); } // ~if else if (lastRead!=NULL) { outputFile << lastRead->minPos_ << "\t" << lastRead->maxPos_ << "\t" << lastRead->alignment_; } } // ~else lastPos=pStart->pos_; lastRead=pStart->pRead_; } // ~for printGroup( group, numGroups, outputFile ); if (group.size()>0) { group.clear(); } // ~if else if (lastRead!=NULL) { outputFile << lastRead->minPos_ << "\t" << lastRead->maxPos_ << "\t" << lastRead->alignment_; } // ~else if #endif #ifdef GAUSSIAN_PEAK_FINDER vector::iterator pStart(eventStarts.begin()),pEnd(eventEnds.begin()); bool isInPeak(false); IndelMetricStore metric; float threshold(0.05); int numGroups(0); // const unsigned int _minGroupSize(_options.minGroupSize()); AnomReadGroup activeGroup, shadowGroup; for (int i(eventStarts.begin()->pos_); i!=eventEnds.back().pos_;++i) { while ((pStart!=eventStarts.end())&&(i>=pStart->pos_)) { pStart->pRead_->populateMetric( metric ); activeGroup.insert(pStart->pRead_); if (isInPeak==true) shadowGroup.insert(pStart->pRead_); pStart++; } // ~while if (metric.find(i)==metric.end()) metric[i]=0; printf ("# %d %f zz\n", i, metric[i]); if ((metric[i]>=threshold)&&(isInPeak==false)) { shadowGroup=activeGroup; isInPeak=true; } // ~if else if ((metric[i]minGroupSize) // { ++numGroups; shadowGroup.print(numGroups); // } isInPeak=false; } // ~else if while ((pEnd!=eventEnds.end())&&(i>=pEnd->pos_)) { activeGroup.erase(pEnd->pRead_); pEnd++; } // ~while // TBD flush metric occasionally // TBD check for active group at end } // ~for #endif // GAUSSIAN_PEAK_FINDER outputFile.close(); } // ~void ClusterFinderImpl::operator() #endif // PROTOTYPING_NEW_VERSION } } // end namespace casava{ namespace { applications