/** ** Copyright (c) 2007-2010 Illumina, Inc. ** ** This software is covered by the "Illumina Genome Analyzer Software ** License Agreement" and the "Illumina Source Code License Agreement", ** and certain third party copyright/licenses, and any user of this ** source file is bound by the terms therein (see accompanying files ** Illumina_Genome_Analyzer_Software_License_Agreement.pdf and ** Illumina_Source_Code_License_Agreement.pdf and third party ** copyright/license notices). ** ** This file is part of the Consensus Assessment of Sequence And VAriation ** (CASAVA) software package. ** ** @file ElandExtendedReader.cpp ** ** @brief This class is responsible for parsing ELAND extended files. ** ** @author Michael Stromberg **/ #include "common/ElandExtendedReader.h" #include "common/StringUtilities.hh" using namespace std; namespace ck = casava::kagu; namespace casava { namespace common { // regex that captures information from a VMF file const boost::regex ElandExtendedReader::mPositionsRegex("^(-?\\d+)(F|R)(\\S+)$"); // constructor ElandExtendedReader::ElandExtendedReader(void) : mUseContigNames(false) , mUseReferenceNames(false) , mProvideReadName(false) {} // destructor ElandExtendedReader::~ElandExtendedReader(void) {} // populates the supplied casava read with the ELAND extended read name void ElandExtendedReader::ExtractReadName(CasavaRead& cr, const char* pBegin, const char* pEnd) { bool foundError = false; const uint32_t BUFFER_SIZE = (uint32_t)(pEnd - pBegin); // initialize pointers char* pUnderscore = NULL; char* pColon1 = NULL; char* pColon2 = NULL; char* pColon3 = NULL; char* pColon4 = NULL; char* pHash = NULL; char* pSlash = NULL; // HWI-BRUNOP16X_0001:8:1:3302:1060#0/1 pUnderscore = (char*)memchr(pBegin, '_', BUFFER_SIZE); if(!pUnderscore) foundError = true; if(!foundError) pColon1 = (char*)memchr(pUnderscore + 1, ':', BUFFER_SIZE - (pUnderscore - pBegin)); if(!pColon1) foundError = true; if(!foundError) pColon2 = (char*)memchr(pColon1 + 1, ':', BUFFER_SIZE - (pColon1 - pBegin)); if(!pColon2) foundError = true; if(!foundError) pColon3 = (char*)memchr(pColon2 + 1, ':', BUFFER_SIZE - (pColon2 - pBegin)); if(!pColon3) foundError = true; if(!foundError) pColon4 = (char*)memchr(pColon3 + 1, ':', BUFFER_SIZE - (pColon3 - pBegin)); if(!pColon4) foundError = true; if(!foundError) pHash = (char*)memchr(pColon4 + 1, '#', BUFFER_SIZE - (pColon4 - pBegin)); if(!pHash) foundError = true; if(!foundError) pSlash = (char*)memchr(pHash + 1, '/', BUFFER_SIZE - (pHash - pBegin)); if(!pSlash) foundError = true; if(foundError) { string readName; StringUtilities::CopyString(readName, pBegin, pEnd); BOOST_THROW_EXCEPTION(CasavaException(EINVAL, (boost::format("Read name extraction failed: [%s]") % readName).str())); } StringUtilities::CopyString(cr.Machine, pBegin, pUnderscore); StringUtilities::CopyString(cr.RunNumber, pUnderscore + 1, pColon1); StringUtilities::CopyString(cr.Lane, pColon1 + 1, pColon2); StringUtilities::CopyString(cr.Tile, pColon2 + 1, pColon3); StringUtilities::CopyString(cr.XCoord, pColon3 + 1, pColon4); StringUtilities::CopyString(cr.YCoord, pColon4 + 1, pHash); StringUtilities::CopyString(cr.Index, pHash + 1, pSlash); StringUtilities::CopyString(cr.ReadNumber, pSlash + 1, pEnd); } // returns true if there is another read available bool ElandExtendedReader::GetNextRead(CasavaRead& cr) { // return false if our file stream is closed if(!mIsOpen) return false; // get the next line from the alignment and base quality files string line; if(!GetNextLine(line)) return false; // extract the VMF fields // // using memchr & CopyString instead of boost::regex sped up kagu 12 % const uint32_t BUFFER_SIZE = (uint32_t)line.size(); const char* pBuffer = line.data(); const char* pEnd = pBuffer + BUFFER_SIZE; bool foundError = false; // initialize pointers char* pTab1 = NULL; char* pTab2 = NULL; char* pTab3 = NULL; pTab1 = (char*)memchr(pBuffer, '\t', BUFFER_SIZE); if(!pTab1) foundError = true; if(!foundError) pTab2 = (char*)memchr(pTab1 + 1, '\t', BUFFER_SIZE - (pTab1 - pBuffer)); if(!pTab2) foundError = true; if(!foundError) pTab3 = (char*)memchr(pTab2 + 1, '\t', BUFFER_SIZE - (pTab2 - pBuffer)); if(!pTab3) foundError = true; if(foundError) { BOOST_THROW_EXCEPTION(CasavaException(EINVAL, (boost::format("Tab-delimited splitting could not applied to the following line: [%s]") % line).str())); } if(mProvideReadName) ExtractReadName(cr, pBuffer + 1, pTab1); StringUtilities::CopyString(cr.Bases, pTab1 + 1, pTab2); StringUtilities::CopyString(cr.Status, pTab2 + 1, pTab3); StringUtilities::CopyString(cr.Positions, pTab3 + 1, pEnd); cr.IsNm = false; cr.IsQc = false; cr.IsTmm = false; cr.MStatus = MS_Unknown; // determine if the read is actually aligned const bool hasTooManyMatches = (cr.Positions[0] == '-' ? true : false); const string::size_type colonPos = cr.Status.find(':'); const bool isAligned = (!hasTooManyMatches && (colonPos != string::npos) ? true : false); // extract the seed errors if(colonPos != string::npos) { const string::size_type secondColon = cr.Status.find(':', colonPos + 1); if(secondColon == string::npos) { BOOST_THROW_EXCEPTION(CasavaException(EINVAL, (boost::format("Unable to find the second colon in the neighborhood string: [%s]") % cr.Status).str())); } // extract the three strings cr.SeedErrors[0] = atoi(cr.Status.substr(0, colonPos).c_str()); cr.SeedErrors[1] = atoi(cr.Status.substr(colonPos + 1, secondColon - colonPos - 1).c_str()); cr.SeedErrors[2] = atoi(cr.Status.substr(secondColon + 1).c_str()); if(hasTooManyMatches) { cr.IsTmm = true; cr.MStatus = MS_Repeat; } } else { const char statusChar = cr.Status[0]; if(statusChar == 'N') { cr.IsNm = true; cr.MStatus = MS_NM; } else if(statusChar == 'Q') { cr.IsQc = true; cr.MStatus = MS_QC; } cr.SeedErrors[0] = 0; cr.SeedErrors[1] = 0; cr.SeedErrors[2] = 0; } // extract the positions if(isAligned) { // split and assign the comma delimited positions vector posVector; StringUtilities::Split(cr.Positions, ',', posVector); if(posVector.size() == 1) { cr.MStatus = MS_SingleAlignmentFound; } else { cr.MStatus = MS_ManyAlignmentsFound; } cr.Alignments.resize(posVector.size()); CasavaAlignments::iterator caIter = cr.Alignments.begin(); string currentReferenceName; string currentContigName; boost::smatch positionsResults; vector::iterator sIter; for(sIter = posVector.begin(); sIter != posVector.end(); ++sIter, ++caIter) { // find the colon delimiter (if present) string::size_type colonPos = sIter->find(':'); if(colonPos != string::npos) { // find the forward slash contig name delimiter currentReferenceName = sIter->substr(0, colonPos); currentContigName.clear(); string::size_type slashPos = currentReferenceName.find('/'); if(slashPos != string::npos) { if(mUseContigNames) { currentReferenceName = currentReferenceName.substr(slashPos + 1); } else if(mUseReferenceNames) { currentReferenceName = currentReferenceName.substr(0, slashPos); } else { currentContigName = currentReferenceName.substr(slashPos + 1); currentReferenceName = currentReferenceName.substr(0, slashPos); } } *sIter = sIter->substr(colonPos + 1); } if(!boost::regex_search(*sIter, positionsResults, mPositionsRegex)) { BOOST_THROW_EXCEPTION(CasavaException(EINVAL, (boost::format("Regular expression (mPositionsRegex) could not applied to the following line: [%s]") % *sIter).str())); } caIter->ReferenceName = currentReferenceName; caIter->ContigName = currentContigName; caIter->IsReverseStrand = (positionsResults[2].str()[0] == 'R' ? true : false); caIter->MatchDescriptor = positionsResults[3].str(); caIter->ReferencePosition = atoi(positionsResults[1].str().c_str()); } } else cr.Alignments.clear(); return true; } // set to true if base qualities should be parsed, false otherwise void ElandExtendedReader::ProvideReadName(bool b) { mProvideReadName = b; } // sets the desired reference renaming strategy (contig, reference, or both) void ElandExtendedReader::SetReferenceRenamingStrategy(ck::ReferenceRenamingStrategy_t strategy) { if(strategy == ck::USE_CONTIG_NAME) mUseContigNames = true; else if(strategy == ck::USE_REFERENCE_NAME) mUseReferenceNames = true; } } }