/** ** Copyright (c) 2007-2009 Illumina, Inc. ** ** This software is covered by the "Illumina Genome Analyzer Software ** License Agreement" and the "Illumina Source Code License Agreement", ** and certain third party copyright/licenses, and any user of this ** source file is bound by the terms therein (see accompanying files ** Illumina_Genome_Analyzer_Software_License_Agreement.pdf and ** Illumina_Source_Code_License_Agreement.pdf and third party ** copyright/license notices). ** ** This file is part of the Consensus Assessment of Sequence And VAriation ** (CASAVA) software package. ** ** \file DiversityCalculator.cpp ** ** \brief Calculates the diversity in a population of matched fragments. ** ** DiversityCalculator is the top-level component used to estimate ** the diversity in the population of fragment sizes obtained during ** alignment analysis. ** ** \author Come Raczy **/ #include #include #include #include #include #include #include "alignment/DiversityCalculator.hh" #include "common/Alignment.hh" #include "common/Exceptions.hh" namespace casava { namespace alignment { const int pixelSize = 10; // a scaling factor of 10 is used for the coordinates in the qseq files const int opticalDuplicatesWindow = 10 * pixelSize; // use a window of 10 pixels to detect optical duplicates const int squaredOpticalDuplicatesWindow = opticalDuplicatesWindow * opticalDuplicatesWindow; namespace fs = boost::filesystem; namespace bi = boost::iostreams; Fragment::Fragment(const cc::Alignment &read1, const cc::Alignment &read2) : position_(computePosition(read1, read2)) , length_(computeLength(read1, read2, position_)) , type_(computeType(read1, read2, position_, length_)) , tile_(read1.getTileNumber()) , x_(read1.getX()) , y_(read1.getY()) { } Fragment::Fragment(const Fragment &fragment) : position_(fragment.position_) , length_(fragment.length_) , type_(fragment.type_) , tile_(fragment.tile_) , x_(fragment.x_) , y_(fragment.y_) { } Fragment &Fragment::operator=(const Fragment &rhs) { if (&rhs != this) { position_ = rhs.position_; length_ = rhs.length_; type_ = rhs.type_; tile_ = rhs.tile_; x_ = rhs.x_; y_ = rhs.y_; } return *this; } long Fragment::computePosition(const cc::Alignment &read1, const cc::Alignment &read2) const { return (read2.getPosition() > read1.getPosition() ? read1 : read2).getPosition(); } unsigned int Fragment::computeLength(const cc::Alignment &read1, const cc::Alignment &read2, long position) const { const cc::Alignment &second = (read2.getPosition() > read1.getPosition() ? read2 : read1); const long length = second.getPosition() + second.getLength() - position; if (0 > length) { return 0; } if (length > std::numeric_limits::max()) { return std::numeric_limits::max(); } return static_cast(length); } Fragment::FragmentType Fragment::computeType(const cc::Alignment &read1, const cc::Alignment &read2, long , unsigned int ) const { if (read1.getMatch().getPosition() == read2.getMatch().getPosition()) { return EQUAL; } if(read1.getMatch().getStrand() == read2.getMatch().getStrand()) { return FFRR; } const cc::Alignment &first = (read2.getMatch().getPosition() > read1.getMatch().getPosition() ? read1 : read2); if(cc::Match::Forward == first.getMatch().getStrand()) { return IN; } if(cc::Match::Reverse == first.getMatch().getStrand()) { return OUT; } return UNKNOWN; } struct FragmentSorter { bool operator()(const Fragment &lhs, const Fragment &rhs) { return (lhs.getPosition() < rhs.getPosition()) || ((lhs.getPosition() == rhs.getPosition()) && (lhs.getLength() < rhs.getLength())) || ((lhs.getPosition() == rhs.getPosition()) && (lhs.getLength() == rhs.getLength()) && (lhs.getTile() < rhs.getTile())) || ((lhs.getPosition() == rhs.getPosition()) && (lhs.getLength() == rhs.getLength()) && (lhs.getTile() == rhs.getTile()) && (lhs.getX() < rhs.getX())); } }; DiversityCalculator::DiversityCalculator(const unsigned int maximumFragmentSize, const fs::path &inputDirectory, const fs::path &outputFile, const unsigned int processLane) : maximumFragmentSize_(maximumFragmentSize) , inputDirectory_(inputDirectory) , outputFile_(outputFile) , processLane_(processLane) { } inline double ratio(const double total, const double unique, const double diversity) { const double lhs = unique / diversity; const double rhs = 1 - exp(-total/diversity); return lhs / rhs; } double landerWaterman(const double total, const double unique) { assert(0 <= total); assert(0 <= unique); assert(unique <= total); double lowerBound = 0.1; double upperBound = 1e12; if (total == unique) { return upperBound; } if (0. == unique) { return 0.; } double diversity = (upperBound + lowerBound) / 2.; double r = ratio(total, unique, diversity); while ( (fabs(1. - r) > 0.0000001) && (fabs(upperBound - lowerBound) > 0.1) ) { if(r > 1.0) { lowerBound = diversity; } else { upperBound = diversity; } diversity = (upperBound + lowerBound) / 2.; r = ratio(total, unique, diversity); } return diversity; } void generateReport(std::vector laneDiversityCalculatorList, const fs::path &outputFile) { std::ofstream os(outputFile.string().c_str()); if (!os) { BOOST_THROW_EXCEPTION(cc::IoException(errno, "Couldn't open output file " + outputFile.string())); } os << "Lander-Waterman estimation of diversity" << std::endl; os << "Lane\tRead-Pairs-Used\t Unique-Pairs\t Diversity-Estimate" << std::endl; for(std::vector::const_iterator lane = laneDiversityCalculatorList.begin(); laneDiversityCalculatorList.end() != lane; ++lane) { const unsigned int unique = lane->getUniqueCount(); const unsigned int total = unique + lane->getDuplicateCount(); os << std::setw(4) << lane->getLaneId() << '\t' << std::setw(15) << total << '\t' << std::setw(15) << unique << '\t' << std::setw(20) << static_cast(landerWaterman(total, unique)) << std::endl; } os << "\n\n\nNumber of Aligned Paired Reads" << std::endl; os << "Lane\t\tNumber of Reads" << std::endl; for(std::vector::const_iterator lane = laneDiversityCalculatorList.begin(); laneDiversityCalculatorList.end() != lane; ++lane) { os << std::setw(4) << lane->getLaneId() << '\t' << '\t' << std::setw(15) << lane->getAlignedPairCount() << std::endl; } size_t maximumSize = 0; for(std::vector::const_iterator lane = laneDiversityCalculatorList.begin(); laneDiversityCalculatorList.end() != lane; ++lane) { if (lane->getMaximumSize() > maximumSize) { maximumSize = lane->getMaximumSize(); } } os << "\n\n\nFragment Sizes -- All Pairs\n" << std::endl; os << std::setw(12) << "Size"; for(std::vector::const_iterator lane = laneDiversityCalculatorList.begin(); laneDiversityCalculatorList.end() != lane; ++lane) { os << std::setw(11) << "Lane " << lane->getLaneId(); } os << std::endl; for (size_t i = 0; maximumSize > i; ++i) { os << std::setw(12) << i*10; for(std::vector::const_iterator lane = laneDiversityCalculatorList.begin(); laneDiversityCalculatorList.end() != lane; ++lane) { if (0 == lane->getSizeCount(i*10)) { os << std::setw(12) << '0'; } else { os << std::setw(12) << lane->getSizeCount(i*10); } } os << std::endl; } } void DiversityCalculator::run() { std::cout << "processLane: " << processLane_ << "\n"; std::vector laneDiversityCalculatorList; if (processLane_ != 0) // a lane was specified on the command line { const std::string exportFile1 = (boost::format("s_%d_1_export.txt") % processLane_).str(); const std::string exportFile2 = (boost::format("s_%d_2_export.txt") % processLane_).str(); const std::string compExportFile1 = exportFile1 + ".gz"; const std::string compExportFile2 = exportFile2 + ".gz"; if ((fs::exists(inputDirectory_ / exportFile1) || fs::exists(inputDirectory_ / compExportFile1)) && (fs::exists(inputDirectory_ / exportFile2) || fs::exists(inputDirectory_ / compExportFile2))) { laneDiversityCalculatorList.push_back(LaneDiversityCalculator()); laneDiversityCalculatorList.back().run(maximumFragmentSize_, inputDirectory_, processLane_); } generateReport(laneDiversityCalculatorList, outputFile_); } else // default option: if --lane=0 then do all lanes { for (unsigned int laneId = 1; 8>= laneId; ++laneId) { const std::string exportFile1 = (boost::format("s_%d_1_export.txt") % laneId).str(); const std::string exportFile2 = (boost::format("s_%d_2_export.txt") % laneId).str(); const std::string compExportFile1 = exportFile1 + ".gz"; const std::string compExportFile2 = exportFile2 + ".gz"; if ((fs::exists(inputDirectory_ / exportFile1) || fs::exists(inputDirectory_ / compExportFile1)) && (fs::exists(inputDirectory_ / exportFile2) || fs::exists(inputDirectory_ / compExportFile2))) { laneDiversityCalculatorList.push_back(LaneDiversityCalculator()); laneDiversityCalculatorList.back().run(maximumFragmentSize_, inputDirectory_, laneId); } } generateReport(laneDiversityCalculatorList, outputFile_); } } void LaneDiversityCalculator::run(unsigned int maximumFragmentSize, const fs::path &inputDirectory, unsigned int laneId) { assert(laneId > 0); laneId_ = laneId; std::cout << "Processing Lane " << laneId << std::endl; // open export files bi::filtering_istream exportFilterStream1, exportFilterStream2; std::ifstream exportStream1, exportStream2; openExportFile(inputDirectory, laneId, 1, exportFilterStream1, exportStream1); openExportFile(inputDirectory, laneId, 2, exportFilterStream2, exportStream2); FragmentMap fragmentMap = getFragmentMap(exportFilterStream1, exportFilterStream2); conventionalPairCount_ = 0; outwardPairCount_ = 0; anomalousPairCount_ = 0; duplicateCount_ = 0; doubleDetection_ = 0; uniqueCount_ = 0; equalCount_ = 0; unknownCount_ = 0; sizeCount_.resize(0); sizeCount_.resize(maximumFragmentSize / 10, 0); for (FragmentMap::iterator fragmentIterator = fragmentMap.begin();fragmentMap.end() != fragmentIterator; ++fragmentIterator) { std::vector &fragmentList = fragmentIterator->second; std::cout << "Chromosome " << fragmentIterator->first << ": " << fragmentList.size() << std::endl; std::sort(fragmentList.begin(), fragmentList.end(), FragmentSorter()); countFragments(fragmentList); } std::cout << "\nLane " << laneId << ":" << '\n' << " Aligned fragments : " << alignedPairCount_ << '\n' << " Used fragments : " << uniqueCount_ + duplicateCount_ << '\n' << " Unique fragments : " << uniqueCount_ << '\n' << " Duplicate fragments: " << duplicateCount_ << '\n' << " Double detections : " << doubleDetection_ << '\n' << " Conventional pairs : " << conventionalPairCount_ << '\n' << " Outward pairs : " << outwardPairCount_ << '\n' << " Anomalous pairs : " << anomalousPairCount_ << '\n' << " Equal pair ends : " << equalCount_ << '\n' << std::endl; } void LaneDiversityCalculator::openExportFile(const fs::path& inputDirectory, const unsigned int laneId, const unsigned int readNum, bi::filtering_istream &exportFilterStream, std::ifstream &exportStream) { const std::string exportFilename = (boost::format("s_%u_%u_export.txt") % laneId % readNum).str(); const std::string compExportFilename = exportFilename + ".gz"; fs::path exportFilePath = inputDirectory / exportFilename; fs::path compExportFilePath = inputDirectory / compExportFilename; // check if a compressed export file exists instead of a normal export file bool isCompressed = false; if(!fs::exists(exportFilePath) && fs::exists(compExportFilePath)) { exportFilePath = compExportFilePath; isCompressed = true; } // open the export file exportStream.open(exportFilePath.string().c_str()); if(exportStream.fail()) { BOOST_THROW_EXCEPTION(casava::common::IoException(errno, "Couldn't open " + exportFilePath.string())); } // check if the current file is compressed if(!isCompressed) { isCompressed = (exportStream.get() == 037) && (exportStream.get() == 0213); if(exportStream.fail()) { BOOST_THROW_EXCEPTION(cc::IoException(errno, "Unable to read the magic number from file " + exportFilePath.string())); } exportStream.seekg(0, std::ios::beg); } // add the gzip decompressor if(isCompressed) { exportFilterStream.push(bi::gzip_decompressor()); } exportFilterStream.push(exportStream); } void LaneDiversityCalculator::countFragments(const std::vector &fragmentList) { if (fragmentList.empty()) { return; } std::vector::const_iterator current = fragmentList.begin(); addFragment(*current); ++uniqueCount_; std::vector::const_iterator previous = current; while(fragmentList.end() != ++current) { if ( (previous->getPosition() != current->getPosition()) || (previous->getLength() != current->getLength()) ) { addFragment(*current); ++uniqueCount_; } else { // check that the duplicate is not a double detection of a cluster const int dx = previous->getX() - current->getX(); const int dy = previous->getY() - current->getY(); if ( (previous->getTile() != current->getTile()) || (dx * dx + dy * dy > squaredOpticalDuplicatesWindow) ) { ++duplicateCount_; } else { ++doubleDetection_; } } previous = current; } } void LaneDiversityCalculator::addFragment(const Fragment &fragment) { switch (fragment.getType()) { case Fragment::IN: ++conventionalPairCount_; break; case Fragment::OUT: ++outwardPairCount_; break; case Fragment::FFRR: ++anomalousPairCount_; break; case Fragment::EQUAL: ++equalCount_; break; default: ++unknownCount_; break; } const size_t i = fragment.getLength() / 10; if (sizeCount_.size() > i) { ++sizeCount_[i]; } } LaneDiversityCalculator::FragmentMap LaneDiversityCalculator::getFragmentMap(bi::filtering_istream &exportStream1, bi::filtering_istream &exportStream2) { int failed1 = 0; int failed2 = 0; int success = 0; int chimeraCount = 0; int contigMismatch = 0; alignedPairCount_ = 0; FragmentMap fragmentMap; cc::Alignment alignment1, alignment2; unsigned int count = 0; while (exportStream1 >> alignment1 && exportStream2 >> alignment2) { if (0 == (++count % 200000)) { std::cout << "*" << std::flush; } // discart all those that do not pass filter if ( ! (alignment1.getPassed() && alignment2.getPassed()) ) { continue; } // TODO: CHECK THIS if ( (alignment1.getStrand() != cc::Match::Forward && alignment1.getStrand() != cc::Match::Reverse) || (alignment2.getStrand() != cc::Match::Forward && alignment2.getStrand() != cc::Match::Reverse) ) { continue; } ++alignedPairCount_; // discard chimeras if (alignment1.getChromosome() != alignment2.getChromosome()) { ++chimeraCount; continue; } // discard fragments aligning across different contigs if (alignment1.getContig() != alignment2.getContig()) { ++contigMismatch; continue; } // discard anomalous reads with low alignment scores if ( (0 == alignment1.getPairedReadScore() && 0 == alignment2.getPairedReadScore()) && (10 >= alignment1.getSingleReadScore() && 10 >= alignment2.getSingleReadScore()) ) { ++failed1; continue; } // discard reads with low alignment score if (0 < alignment1.getPairedReadScore() && 0 < alignment2.getPairedReadScore() && 5 >= alignment1.getPairedReadScore() && 5 >= alignment2.getPairedReadScore()) { ++failed2; continue; } ++success; fragmentMap[alignment1.getChromosome() + alignment1.getContig()].push_back(Fragment(alignment1, alignment2)); } std::cout << std::endl; if (!exportStream1.eof() || !(exportStream2 >> alignment2).eof()) { BOOST_THROW_EXCEPTION(casava::common::IoException(errno, "Couldn't read both export files until the end")); } std::cout << "Aligned pair count : " << alignedPairCount_ << std::endl; std::cout << "Chimera count : " << chimeraCount << std::endl; std::cout << "Contig mismatch count : " << contigMismatch << std::endl; std::cout << "Failed single read score: " << failed1 << std::endl; std::cout << "Failed paired read score: " << failed2 << std::endl; std::cout << "Good fragments : " << success << std::endl; return fragmentMap; } } // namespace alignment } // namespace casava