#include "variance/IndelFinderImpl.hh" /** * Project : CASAVA * Module : $RCSfile: IndelFinderImpl.cpp,v $ * @author : Tony Cox * Copyright : Copyright (c) Illumina 2008, 2009. All rights reserved. * ** This software is covered by the "Illumina Genome Analyzer Software ** License Agreement" and the "Illumina Source Code License Agreement", ** and certain third party copyright/licenses, and any user of this ** source file is bound by the terms therein (see accompanying files ** Illumina_Genome_Analyzer_Software_License_Agreement.pdf and ** Illumina_Source_Code_License_Agreement.pdf and third party ** copyright/license notices). * */ using namespace casava::common; namespace ca { namespace variance_detection { IndelFinderImpl::IndelFinderImpl ( int minPairAlScore, int minSingleAlScore, double spanReadThreshold, const std::string& outputFileName, const std::string& statsFileName, const bool isQphred) : _minPairedReadAlignmentScore(minPairAlScore), _minSingleReadAlignmentScore(minSingleAlScore), _spanningReadThreshold(spanReadThreshold), _numStandardDevs(3), // FIXME separate object to store counts _numSpanReads(0), _numSingletons(0), _numGoodSingletons(0), _numGoodShadows(0), _numInsCandPairReads(0), _numInsCandPairReadsNotSemiAligned(0), _numDelCandPairReads(0), _numDelCandPairReadsNotSemiAligned(0), _numDuplCandPairReads(0), _numDuplCandPairReadsNotSemiAligned(0), _numInvCandPairReads(0), _numInvCandPairReadsNotSemiAligned(0), _numChimericPairReads(0), _numChimericPairReadsNotSemiAligned(0), // end FIXME _outputFileName(outputFileName), _statsFileName(statsFileName), _isQphred(isQphred) { if (_statsFileName.empty()) { cerr << "No value given for stats file" << endl; // TBD exception? } if (_outputFileName.empty()) { cerr << "No value given for output file" << endl; // TBD exception? } // Tony's default values: //_minPairedReadAlignmentScore=6; //_minSingleReadAlignmentScore=10; //_numStandardDevs=3; // need to verify if this threshold is applicable to different data sets //_spanningReadThreshold=25; //_numGroups=0; //_numSpanReads=0; //_numSingletons=0; //_numGoodSingletons=0; } IndelFinderImpl::~IndelFinderImpl() { } void IndelFinderImpl::findGoodSingletons ( BamAlignmentReader& alignmentReader, const PairStats& pairStats, map< string, SampleStats>& sampleStats, vector& singletons ) // vector& eventStarts, // vector& eventEnds ) { // int minPos = -1, maxPos = -1; // Match::Strand strand = Match::None; bool wantShadow = false; bool isRejectedShadowAnchor = false; string sampleName; Alignment alignment, alignmentPrevious; while (alignmentReader.GetNextRead(alignment)) { sampleName=SequenceUtils::getSampleName(alignment); if(! alignment.getPassed()) { //assert(! wantShadow); continue; } if(! alignment.getPassed()) { //assert(! wantShadow); continue; } if (wantShadow) { // a shadow read should be in this line after the corresponding // singleton in the sorted.txt file wantShadow=false; // Next lines commented out for time being - will try to align // repeat shadows in alignCandidate reads rather than ignoring // them from the outset // std::string ctg = alignment.getMatch().getContig(); // if (ctg.find(':') != std::string::npos) // { // // this is a shadow read aligning to several repetitive regions, discard it. // continue; // } if ((SequenceUtils::totalBaseQualityForRead (alignment.getQuality().c_str()) >sampleStats[sampleName]. qualThreshold_[alignment.getReadNumber()-1])) // &&(strcmp(((std::string)alignment.getMatch().getChromosome()).c_str(),"NM")==0)) { singletons.push_back (new SingletonNotMatched(alignment, alignmentPrevious, sampleStats )); ++_numGoodShadows; // singletons.back()->populateBounds( eventStarts, eventEnds); #ifdef DEBUG_IDF_IMPL cout << "# Shadow found at " << minPos << " " << maxPos << endl; #endif } // ~if } // ~if else { // check for singletons, i.e. reads without aligning read partner // if (alignment.getPartnerMatch().getStrand() == Match::NM) if ((alignment.getPartnerMatch().getValidity() == true) &&(alignment.getPartnerMatch().getStrand() == Match::NM)) { if(isRejectedShadowAnchor) { isRejectedShadowAnchor=false; continue; } if (alignment.getSingleReadScore() > sampleStats[sampleName].scoreThreshold_ [alignment.getReadNumber()-1]) { #ifdef DEBUG_IDF_IMPL cout << "Good shadow found. Score : " << alignment.getSingleReadScore() << endl; #endif wantShadow=true; alignmentPrevious=alignment; } else { isRejectedShadowAnchor=true; } } // ~if (alignment.getPartnerMatch().getStrand() == Match::NM) // check if this read qualifies as semi-aligning (indel border-spanning read) else { isRejectedShadowAnchor=false; bool isSemiAligned(false); if (strpbrk(alignment.getMatchDescriptor().c_str(),"^$")!=NULL) { isSemiAligned=true; } else if (alignment.getMatchDescriptor()[0] != '-') { double adsc (SequenceUtils::getSemiAlignedReadMetric(alignment, _isQphred)); #ifdef MOVED_TO_SEQUENCE_UTILS_HH // exact (maximum possible) alignment score double esc = SequenceUtils::get().getExactScoreForRead (alignment.getQuality().c_str()); // actual alignment score double alsc = SequenceUtils::get().getAlignmentScore (alignment.getQuality().c_str(), alignment.getMatchDescriptor().c_str(), esc,alignment.getSequence()); // for semi-aligning reads, this ratio should be high double adsc = (esc - alsc)/log(10); #endif // need to verify if this threshold is applicable to // different data sets isSemiAligned=(adsc > _spanningReadThreshold); } // ~else if if (isSemiAligned) { singletons.push_back(new SingletonSemiAligned(alignment)); // singletons.back()->populateBounds(eventStarts, // eventEnds ); ++_numSpanReads; } if (pairStats.isValid(alignment.getMachineName(), alignment.getRunNumber(), alignment.getLaneNumber(), alignment.getIndex())) { // Call this even if isSemiAligned to get full statistics // - will add only if not isSemiAligned. addIfAnomalousPairRead(singletons, pairStats, alignment, isSemiAligned); } } // ~else if alignment.getPassed } // ~else } // ~while cout << "Num good shadow reads : " << _numGoodShadows << std::endl; cout << "Num semi-aligned reads : " << _numSpanReads << std::endl; cout << "Num insertion cand reads : " << _numInsCandPairReads << std::endl; cout << " not semi-aligned : " << _numInsCandPairReadsNotSemiAligned << std::endl; cout << "Num deletion cand reads : " << _numDelCandPairReads << std::endl; cout << " not semi-aligned : " << _numDelCandPairReadsNotSemiAligned << std::endl; cout << "Num duplication cand reads: " << _numDuplCandPairReads << std::endl; cout << " not semi-aligned : " << _numDuplCandPairReadsNotSemiAligned << std::endl; cout << "Num inversion cand reads : " << _numInvCandPairReads << std::endl; cout << " not semi-aligned : " << _numInvCandPairReadsNotSemiAligned << std::endl; cout << "Num chimeric pair reads : " << _numChimericPairReads << std::endl; cout << " not semi-aligned : " << _numChimericPairReadsNotSemiAligned << std::endl; } // ~findGoodSingletons /*****************************************************************************/ void IndelFinderImpl::addIfAnomalousPairRead(std::vector& singletons, const PairStats& pairStats, const Alignment& alignment, bool alreadyAdded) { // Get the expected pair values for reads in the same lane as this read. RelOrient expectedRelOrient; double minSize(0.0); double maxSize(0.0); pairStats.getBounds(alignment.getMachineName(), alignment.getRunNumber(), alignment.getLaneNumber(), alignment.getIndex(), expectedRelOrient, minSize, maxSize); // Get the actual pair values for this read. const unsigned int readNum(alignment.getReadNumber()); unsigned int readLen(pairStats.getReadLen(alignment.getMachineName(), alignment.getRunNumber(), alignment.getLaneNumber(), alignment.getIndex(), readNum)); // FIXME : Assumes only possible read numbers are 1 and 2. const unsigned int partnerReadNum(3 - readNum); unsigned int partnerReadLen(pairStats.getReadLen(alignment.getMachineName(), alignment.getRunNumber(), alignment.getLaneNumber(), alignment.getIndex(), partnerReadNum)); bool isChimeric(false); long leftPos(-1); unsigned int insertSize(0); RelOrient relOrient; if (!SequenceUtils::getReadPairInfo(alignment, readLen, partnerReadLen, isChimeric, leftPos, insertSize, relOrient)) { // Skip unless both reads are mapped (and not to the same position). return; } if (isChimeric) { ++_numChimericPairReads; if (!alreadyAdded) { ++_numChimericPairReadsNotSemiAligned; singletons.push_back(new ChimericPairAnomRead(alignment, pairStats, readLen)); } } else { bool isAnomalous(false); if (!(relOrient == expectedRelOrient)) { isAnomalous = true; // Assumes both reads mapping to the same strand is never // a valid prep but indicates inversion. if ((relOrient == RelOrient("Fm")) || (relOrient == RelOrient("Fp"))) { ++_numInvCandPairReads; if (!alreadyAdded) { ++_numInvCandPairReadsNotSemiAligned; } } else { ++_numDuplCandPairReads; if (!alreadyAdded) { ++_numDuplCandPairReadsNotSemiAligned; } } } else if (insertSize < minSize) { isAnomalous = true; ++_numInsCandPairReads; if (!alreadyAdded) { ++_numInsCandPairReadsNotSemiAligned; } } else if (insertSize > maxSize) { isAnomalous = true; ++_numDelCandPairReads; if (!alreadyAdded) { ++_numDelCandPairReadsNotSemiAligned; } } if (isAnomalous && !alreadyAdded) { singletons.push_back(new SingleChromPairAnomRead(alignment, leftPos, insertSize)); } } } /*****************************************************************************/ void IndelFinderImpl::meanBaseQualityGoodPairs( BamAlignmentReader& alignmentReader, map& sampleStats) { int numEntries(0); std::string sampleName; Alignment alignment; while (alignmentReader.GetNextRead(alignment)) { numEntries++; sampleName=SequenceUtils::getSampleName(alignment); assert(SequenceUtils::isValidQualString(alignment.getQuality().c_str(),_isQphred)); if (alignment.getPairedReadScore()> _minPairedReadAlignmentScore) { sampleStats[sampleName].qualStats_[alignment.getReadNumber()-1].update(SequenceUtils::totalBaseQualityForRead(alignment.getQuality().c_str())); sampleStats[sampleName].scoreStats_[alignment.getReadNumber()-1].update(alignment.getSingleReadScore()); // OK just to work out insert for F strand reads // this gets called for both R1 and R1 so all pairs get considered if (alignment.getMatch().getStrand() == Match::Forward) { // ensure wrap arounds in circular genomes don't mess up insert stats if (alignment.getPartnerMatch().getPosition()>0) { sampleStats[sampleName].offsetStats_.update(alignment.getPartnerMatch().getPosition()); if ((sampleStats[sampleName].offsetStats_.getNumSamples() %1000000)==0) { printf("# "); sampleStats[sampleName].offsetStats_.print(); } // ~if } // ~if } // ~if } // ~if // if (alignment.getMatchDescriptor().c_str()[0] != '-' && alignment.getPassed() == true) { // don't do alignment score stats for non-present ('-') or gapped alignments if ( (strpbrk(alignment.getMatchDescriptor().c_str(),"-^$")==NULL) && (alignment.getPassed() == true) ) { double adsc (SequenceUtils::getSemiAlignedReadMetric(alignment,_isQphred)); #if 0 double esc = SequenceUtils::get().getExactScoreForRead(alignment.getQuality().c_str()); double alsc = SequenceUtils::get().getAlignmentScore(alignment.getQuality().c_str(), alignment.getMatchDescriptor().c_str(), esc,alignment.getSequence()); double adsc = (esc - alsc)/log(10); #endif if (isnan(adsc)) { cout << "NAN error. " << endl; cout << "Checking : " << alignment.getSequence() << " "; cout << alignment.getQuality() << endl; cout << adsc << endl; cout << alignment.getData() << " " << alignment.getQuality().c_str() << " " << alignment.getMatchDescriptor() << " " << (alignment.getPassed() ? "Y" : "N") << endl; cout << "---------------------------------------" << endl; } else { sampleStats[sampleName].spanReadStats_[alignment.getReadNumber()-1].update(adsc); } } } // ~while // printf("# reads=%d\n", numEntries); for (map::iterator i(sampleStats.begin()); i !=sampleStats.end(); ++i) { printf("# sample=%s\n", i->first.c_str()); // i->second.print(stderr); i->second.setThresholds(_numStandardDevs); i->second.printOffsetStats(); } // ~for } // ~meanBaseQualityGoodPairs void IndelFinderImpl::generateStatsFile ( const std::map& sampleStats ) { ofstream statsFile(_statsFileName.c_str()); if (!statsFile) { cerr << "Failed to open stats file " << _statsFileName << " for writing" << endl; return; // TBD what to do here - throw exception?? } // ~if typedef boost::tokenizer > tokenizer; boost::char_separator sep(" "); for (map::const_iterator i(sampleStats.begin()); i!=sampleStats.end(); ++i) { if (i->first.empty()) continue; // getSampleName returns machine/run/lane space separated // need them tab separated for stats file tokenizer tok(i->first, sep); for (tokenizer::iterator j(tok.begin());j!=tok.end();++j) statsFile << *j << "\t"; statsFile // << i->first << "\t" << i->second.offsetStats_.getMean() << "\t" << i->second.offsetStats_.getStandardDeviation() << "\t" << i->second.offsetStats_.getStandardError() << endl; } // ~for statsFile.close(); } // ~generateStatsFile void IndelFinderImpl::generateOutputFile ( vector& singletons ) { ofstream outputFile(_outputFileName.c_str()); if (!outputFile) { cerr << "Failed to open output file " << _outputFileName << " for writing" << endl; return; // TBD what to do here - throw exception?? } // ~if for ( vector::iterator i(singletons.begin()); i!=singletons.end();++i) { outputFile << (*i)->minPos_ <<"\t" << (*i)->maxPos_ << "\t" << (*i)->alignment_; } // ~for outputFile.close(); } // ~generateOutputFile } } // end namespace casava{ namespace { applications