/** ** Copyright (c) 2007-2010 Illumina, Inc. ** ** This software is covered by the "Illumina Genome Analyzer Software ** License Agreement" and the "Illumina Source Code License Agreement", ** and certain third party copyright/licenses, and any user of this ** source file is bound by the terms therein (see accompanying files ** Illumina_Genome_Analyzer_Software_License_Agreement.pdf and ** Illumina_Source_Code_License_Agreement.pdf and third party ** copyright/license notices). ** ** This file is part of the Consensus Assessment of Sequence And VAriation ** (CASAVA) software package. ** ** @file StringUtilities.cpp ** ** @brief Contains helper string functions that are used in kagu. ** ** @author Michael Stromberg **/ #include #include #include "common/Exceptions.hh" #include "common/StringUtilities.hh" using namespace std; namespace casava { namespace common { // returns true if we were able to extract the value from the key value pair bool StringUtilities::ExtractKeyValuePair(const std::string& s, const uint32_t offset, const std::string& key, std::string& val) { // find the start of the value const char* pBuffer = s.c_str(); const char* pEnd = pBuffer + s.size(); const char* pKey = key.c_str(); // start at the offset pBuffer += offset; if(pBuffer >= pEnd) return false; // look for our key const char* pStart = strstr(pBuffer, pKey); if(!pStart) return false; pStart += key.size(); // find the first occurrence of whitespace const char* pStop = pStart + 1; while(pStop != pEnd) { if(isspace(*pStop)) break; ++pStop; } // copy our value StringUtilities::CopyString(val, pStart, pStop); return true; } // returns the number of lines in the specified filename // // N.B.: a number of different ways were attempted, but this one was the fastest: // All tests were performed on a FASTA file containing human genome NCBI36. // using C fgetc: 173.673 seconds // using C++ getline: 59.599 seconds // using C read with 16 MB buffer: 8.813 seconds // using C++ ifstream::read with 16 MB buffer: 5.723 seconds // using C fread with 16 MB buffer: 4.214 seconds // using gzread with 1 MB buffer: ?.??? seconds uint32_t StringUtilities::GetNumLines(const string& filename) { gzFile in = gzopen(filename.c_str(), "rb"); if(!in) { BOOST_THROW_EXCEPTION(IoException(errno, (boost::format("Unable to open the file (%s) to count the number of lines") % filename).str())); } string buffer; buffer.resize(WC_BUFFER_SIZE + 1); char* pBuffer = (char*)buffer.data(); uint32_t numLines = 0; int bytes_read; while((bytes_read = gzread(in, pBuffer, WC_BUFFER_SIZE)) > 0) { if(bytes_read == -1) { BOOST_THROW_EXCEPTION(IoException(EINVAL, (boost::format("Unable to read data from %s") % filename).str())); } char *p = pBuffer; while((p = (char*)memchr(p, '\n', (pBuffer + bytes_read) - p))) { ++p; ++numLines; } } gzclose(in); return numLines; } // returns the read name given the supplied CASAVA read data structure string StringUtilities::GetReadName(const CasavaRead& cr) { ostringstream sb; sb << cr.Machine << ':' << cr.RunNumber << ':' << cr.FlowcellID << ':' << cr.Lane << ':' << cr.Tile << ':' << cr.XCoord << ':' << cr.YCoord; return sb.str(); } // retrieves the splice length from the supplied read name (RNA) int32_t StringUtilities::GetSpliceLength(const string& readName) { const uint32_t BUFFER_SIZE = (uint32_t)readName.size(); const char* pBuffer = readName.data(); // initialize pointers char* pUnderscore1 = NULL; char* pUnderscore2 = NULL; bool foundError = false; pUnderscore1 = (char*)memchr(pBuffer, '_', BUFFER_SIZE); if(!pUnderscore1) foundError = true; if(!foundError) pUnderscore2 = (char*)memchr(pUnderscore1 + 1, '_', BUFFER_SIZE - (pUnderscore1 - pBuffer)); if(!pUnderscore2) foundError = true; if(foundError) { BOOST_THROW_EXCEPTION(CasavaException(EINVAL, (boost::format("Could not extract the splice length from the following read name: [%s]") % readName).str())); } string spliceLenString; CopyString(spliceLenString, pUnderscore1 + 1, pUnderscore2); return atoi(spliceLenString.c_str()); } // splits the supplied delimited string into a vector // // boost split - 8000000 iterations = 33.422 seconds // mosaik split - 8000000 iterations = 15.593 seconds // split - 8000000 iterations = 7.397 seconds void StringUtilities::Split(const string& s, const char delimiter, vector& v) { const uint32_t sLen = (uint32_t)s.size(); const char* pStart = s.data(); const char* pOld = pStart; const char* pEnd = pStart + sLen - 1; const char* pCurrent = NULL; // count the columns uint32_t numColumns = 0; while((pCurrent = (char*)memchr(pOld, delimiter, (pStart + sLen) - pOld))) { ++numColumns; if(pCurrent == pEnd) break; pOld = pCurrent + 1; } uint32_t numRemaining = (uint32_t)(pEnd - pOld + 1); if(numRemaining > 0) ++numColumns; // resize the vector v.resize(numColumns); vector::iterator sIter = v.begin(); // assign the vector elements pOld = pStart; while((pCurrent = (char*)memchr(pOld, delimiter, (pStart + sLen) - pOld))) { CopyString(*sIter, pOld, pCurrent); ++sIter; if(pCurrent == pEnd) break; pOld = pCurrent + 1; } numRemaining = (uint32_t)(pEnd - pOld + 1); if(numRemaining > 0) CopyString(*sIter, pOld, pEnd + 1); } } }