/** * Project : CASAVA * Module : $RCSfile: AlignCandIndelReads.cpp,v $ * @author : Bret D. Barnes * Copyright : Copyright (c) Illumina 2008, 2009. All rights reserved. * ** This software is covered by the "Illumina Genome Analyzer Software ** License Agreement" and the "Illumina Source Code License Agreement", ** and certain third party copyright/licenses, and any user of this ** source file is bound by the terms therein (see accompanying files ** Illumina_Genome_Analyzer_Software_License_Agreement.pdf and ** Illumina_Source_Code_License_Agreement.pdf and third party ** copyright/license notices). * */ #include "blt_util/seq_util.hh" #include "applications/AlignCandIndelReads.hh" #include "variance/export.h" #include "variance/tagAligner.h" #include "variance/indelReadsStats.h" #include "variance/AlignCandIndelReadsImpl.hh" // Next #define prints out all shadows as export lines preceded by comments //#define LIST_SHADOWS 1 namespace ca { namespace applications { using namespace casava::common; AlignCandIndelReads::~AlignCandIndelReads() {} int AlignCandIndelReads::run() { // // Parameter declarations // // Intialize start time int startTime = time(0); // Set verbosity level // _options.verbose() ??? bool verbose = true; // Get match, mismatch, gapExtend, gapOpen // local alignment stats // // match = _options.alignCandIndelReadsMatchScore() // misMatch = _options.alignCandIndelReadsMismatchScore() // gapOpen = _options.alignCandIndelReadsOpenScore() // gapExtend = _options.alignCandIndelReadsExtendScore() // const unsigned sdFlankWeight(_options.sdFlankWeight); if ((not _options.is_set_sdFlankWeight) and verbose) { cerr << "# No sdFlankWeight provided, setting to " << sdFlankWeight << endl; } // Load reference genome string genomeSeq; get_ref_seq(_options.refSequencePath().c_str(),genomeSeq); standardize_ref_seq(_options.refSequencePath().c_str(),genomeSeq); Export::forceOnlyACTGN(genomeSeq); const int genomeLength((int)genomeSeq.length()); // Load sample stats file // _options.sampleStatsPath().c_str() bool statsFilePresent = false; IndelReadsStats sampleInsStats; if (not _options.sampleStatsPath.empty()) { statsFilePresent = true; sampleInsStats.load(_options.sampleStatsPath); } else if (verbose) { cerr << "# No stats file provided, using range provided in export fiie." << endl; } // Open output file // _options.outputFilePath().c_str() ofstream outStream; outStream.open(_options.outputFilePath().c_str(), ios_base::out); // Process sorted file of candidate indel reads // _options.inputReadsFilePath().c_str() ifstream readStream; readStream.open(_options.inputReadsFilePath().c_str(), ios_base::in); // Get align score thresh or set to default value const double alignScoreThresh(_options.alignScoreThresh); if (not _options.is_set_alignScoreThresh) { cerr << "# No alignment score threshold provided!" << endl; } if (verbose) { cerr << "# Set alignment score threshold to: " << alignScoreThresh << endl; } cerr << "# Using sdFlankWeight " << sdFlankWeight << endl; cerr << "# Using alignScoreThresh " << alignScoreThresh << endl; cerr << "# Using matchScore " << _options.alignCandIndelReadsMatchScore() << endl; cerr << "# Using mismatchScore " << _options.alignCandIndelReadsMismatchScore() << endl; cerr << "# Using extendScore " << _options.alignCandIndelReadsExtendScore() << endl; cerr << "# Using openScore " << _options.alignCandIndelReadsOpenScore() << endl; // Intialize AlignCandIndelReadsImpl AlignCandIndelReadsImpl candReadAligner(_options.alignCandIndelReadsMatchScore(), _options.alignCandIndelReadsMismatchScore(), _options.alignCandIndelReadsExtendScore(), _options.alignCandIndelReadsOpenScore(), alignScoreThresh); // // Process reads data // unsigned int lineCount = 0; if (verbose) { cerr << "# Processing export data!" << endl; } while(! readStream.eof()) {// && lineCount < 1000) { // Read line string line; getline(readStream, line); //cerr << "LINE\t" << lineCount << "\t" << line << endl; // Skip blank and comment(#) lines if (line.length() == 0) { continue; } if (line[0] == '#') { continue; } // Load export line into export record object Export record(line, 1); // Get reference region to align tag too string refSeq; int refPos; int refLen; // Well extract the ref seq to align to using the information // in the stats file if the stats file is present // Otherwise we expect the range start and end data to be present // in the export record. if (statsFilePresent) { // Get mean and sd insert size int mean = (int)sampleInsStats.getMeanInsSize(record.getMachineName(), record.getRunNum(), record.getLane()); int sd = (int)sampleInsStats.getSdInsSize(record.getMachineName(), record.getRunNum(), record.getLane()); // Get reference region to align tag too refLen = (sdFlankWeight * sd) + (record.getRead().length() * 2); if (record.isShadow()) { if (record.getMatchStrand() == 'R') { refPos = record.getMatchStart() + mean - (refLen / 2); } else { refPos = record.getMatchStart() - mean - (refLen / 2); } } else { refPos = record.getMatchStart() - (refLen / 2); } } else { refPos = record.getRangeStart(); refLen = record.getRangeEnd() - record.getRangeStart() + 1; } #ifdef OLD_BOUND_CHECKING_CODE // Ensure that we are not trying to substr off the end of reference if (refPos >= (int)genomeSeq.length()) { cerr << "[ERROR]: Request for a substr out of bounds, pos: " << refPos << ", genome len: " << genomeSeq.length() << endl; exit(1); } while(refPos + refLen >= (int)genomeSeq.length()) { refLen -= 5; if (refLen < 0) { cerr << "[ERROR]: Don't now how we got here. Please contact bbarnes@illumina.com." << refPos << ", " << refLen << ", " << genomeSeq.length() << endl; exit(1); } } #endif // Ensure we're not substring off the front of the chromosome, // i.e. refPos < 1. NOTE: This should be allowed for circular // genomes, but we won't handel that for now... if (refPos < 1) { refPos = 1; refLen -= 1-refPos; } if (refPos+refLen>genomeLength) { // check request substring doesn't go off the end of the genome // if whole of requested substring is off the edge then refLen becomes // negative. This doesn't matter as it then fails the next check, so // doesn't get aligned and the record is output without changes. refLen-=refPos+refLen-genomeLength; } if (refLen>=(int)record.getRead().length()) { // Minus 1 to get into zero based coordinates refSeq = genomeSeq.substr(refPos - 1, refLen); //cerr << "RANGE\t" << refPos << "\t" << refPos + refLen << endl; //cerr << "LINE\t" << record.toString() << endl; // Perform local alignment and update export read info candReadAligner(record, refSeq, refPos); } // Write updated export record outStream << record.toString() << endl; lineCount++; } // Close file handles readStream.close(); outStream.close(); // Write run summary stats if (verbose) { double runTime = (time(0) - startTime) / 3600.0; cerr << "# Processed " << lineCount << " total records." << endl; cerr << "# Total run time: " << runTime << endl << endl; cerr << candReadAligner.toString() << endl; } return 0; } // ~int IndelFinder::run() } } // end namespace casava{ namespace { applications // End of file