/** ** Copyright (c) 2007-2010 Illumina, Inc. ** ** This software is covered by the "Illumina Genome Analyzer Software ** License Agreement" and the "Illumina Source Code License Agreement", ** and certain third party copyright/licenses, and any user of this ** source file is bound by the terms therein (see accompanying files ** Illumina_Genome_Analyzer_Software_License_Agreement.pdf and ** Illumina_Source_Code_License_Agreement.pdf and third party ** copyright/license notices). ** ** This file is part of the Consensus Assessment of Sequence And VAriation ** (CASAVA) software package. ** ** @file FastqConverter ** ** @brief ** ** @author Michael Stromberg **/ #include "common/Exceptions.hh" #include "common/FileConversion.hh" #include "common/StringUtilities.hh" using namespace std; namespace bi = boost::iostreams; namespace casava { namespace common { // regular expressions const boost::regex FileConversion::mExternalHeaderRegex("^(\\S+)"); const boost::regex FileConversion::mCasava18HeaderRegex("^([^:]*):([^:]*):([^:]*):([^:]+):([^:]+):([^:]+):(\\S+)\\s+([^:]+):([^:]+):([^:]+):(\\S*)"); const boost::regex FileConversion::mCasava17HeaderRegex("^([^_]*)_([^:]*):([^:]+):([^:]+):([^:]+):([^/]+)/(\\d)"); const boost::regex FileConversion::mCasava17FcHeaderRegex("^([^_]*)_([^_]*)_([^:]*):([^:]+):([^:]+):([^:]+):([^/]+)/(\\d)"); const boost::regex FileConversion::mCasava17IdxHeaderRegex("^([^_]*)_([^:]*):([^:]+):([^:]+):([^:]+):([^#]+)#([^/]+)/(\\d)"); const boost::regex FileConversion::mCasava17FcIdxHeaderRegex("^([^_]*)_([^_]*)_([^:]*):([^:]+):([^:]+):([^:]+):([^#]+)#([^/]+)/(\\d)"); // constructor FileConversion::FileConversion(const std::string& barcodeSequence, const std::string& flowcellID, const std::string& runID, const std::string& readNum, const bool isCompressedOutput) : mIsInputOpen(false) , mIsOutputOpen(false) , mBarcodeSequence(barcodeSequence) , mControlID("0") , mFlowcellID(flowcellID) , mReadNum(readNum) , mRunID(runID) , mIsCompressedOutput(isCompressedOutput) {} // destructor FileConversion::~FileConversion(void) { CloseInput(); CloseOutput(); } // determines what the input file format might be SeqFormat FileConversion::CheckInputFormat(const string& inputFilename) { SeqFormat fmt = SeqFormat_UNKNOWN; // open the file ifstream inStream(inputFilename.c_str(), ios::binary); if(inStream.fail()) { BOOST_THROW_EXCEPTION(CasavaException(EINVAL, (boost::format("Unable to open the input file (%s) for reading.") % inputFilename).str())); } // decide if we should use the gzip decompressor const bool isCompressed = (inStream.get() == 037) && (inStream.get() == 0213); if(inStream.fail()) { BOOST_THROW_EXCEPTION(CasavaException(EINVAL, (boost::format("Unable to read the magic number from file (%s).") % inputFilename).str())); } inStream.seekg(0, ios::beg); bi::filtering_istream ifs; if(isCompressed) ifs.push(bi::gzip_decompressor()); ifs.push(inStream); // read in the first three lines vector lines; vector colSizes; const uint32_t NUM_LINES_TO_READ = 3; string line; vector columns; for(uint32_t i = 0; i < NUM_LINES_TO_READ; i++) { getline(ifs, line); if(!line.empty()) lines.push_back(line); if(ifs.fail() || ifs.eof()) break; StringUtilities::Split(line, '\t', columns); colSizes.push_back((uint32_t)columns.size()); } const uint32_t numLines = (uint32_t)lines.size(); // if the file starts with a greater than symbol, it's probably FASTA if((numLines >= 2) && (lines[0][0] == '>') && (IsNucleotide(lines[1][0]))) fmt = SeqFormat_FASTA; // if the file has an at symbol and a plus symbol, it's probably FASTQ if((numLines >= 3) && (lines[0][0] == '@') && (lines[2][0] == '+')) fmt = SeqFormat_FASTQ; // check for a consistent number of columns vector::const_iterator csCIter; bool isQseq = true; bool isExport = true; for(csCIter = colSizes.begin(); csCIter != colSizes.end(); ++csCIter) { if(isQseq && (*csCIter != 11)) isQseq = false; if(isExport && (*csCIter != 22)) isExport = false; } // set the QSEQ and export formats if(isQseq) fmt = SeqFormat_QSEQ; if(isExport) fmt = SeqFormat_EXPORT; // close the file inStream.close(); return fmt; } // closes the input streams void FileConversion::CloseInput(void) { if(mIsInputOpen) { mIsInputOpen = false; mInStream.close(); } } // closes the output streams void FileConversion::CloseOutput(void) { if(mIsOutputOpen) { mIsOutputOpen = false; //mOutStream.close(); // causes the program to hang if uncommented } } // converts a EXPORT file to a FASTQ file void FileConversion::ExportToFastq(const std::string& inputFilename, const std::string& outputFilename) { // open our files OpenInput(inputFilename); OpenOutput(outputFilename); // convert the data from each file vector exportColumns; string line; string header; string bases; string qualities; while(true) { // retrieve the next line getline(mInFilterStream, line); if(mInFilterStream.eof()) break; // split the tab delimited columns StringUtilities::Split(line, '\t', exportColumns); // sanity check if(exportColumns.size() != 22) { BOOST_THROW_EXCEPTION(CasavaException(EINVAL, (boost::format("Expected 22 columns in the export entry, but found %u columns.") % exportColumns.size()).str())); } const bool isFiltered = (exportColumns[21] == "N"); // adjust the barcode sequence if(exportColumns[6][0] == '0') exportColumns[6].clear(); // adjust the bases transform(exportColumns[8].begin(), exportColumns[8].end(), exportColumns[8].begin(), RemoveDots); // adjust the base qualities transform(exportColumns[9].begin(), exportColumns[9].end(), exportColumns[9].begin(), Phred64ToPhred33); // write the FASTQ entry // @EAS139:136:FC706VJ:2:5:1000:1285 1:Y:18:ATCACG mOutFilterStream << '@' << exportColumns[0] << ':' << exportColumns[1] << ':' << mFlowcellID << ':' << exportColumns[2] << ':' << exportColumns[3] << ':' << exportColumns[4] << ':' << exportColumns[5] << ' ' << exportColumns[7] << ':' << (isFiltered ? 'Y' : 'N') << ':' << mControlID << ':' << exportColumns[6] << '\n' << exportColumns[8] << "\n+\n" << exportColumns[9] << '\n'; } // close our files CloseInput(); CloseOutput(); } // extracts the metadata from the FASTA or FASTQ headers void FileConversion::ExtractHeaderData(HeaderData_t& data, const string& s, FastqFormat& headerStyle) const { boost::smatch headerResults; // figure out which header style is appropriate if(headerStyle == FastqFormat_UNKNOWN) { if(boost::regex_search(s, headerResults, mCasava18HeaderRegex)) headerStyle = FastqFormat_CASAVA18; if((headerStyle == FastqFormat_UNKNOWN) && boost::regex_search(s, headerResults, mCasava17FcIdxHeaderRegex)) headerStyle = FastqFormat_CASAVA17_FC_INDEX; if((headerStyle == FastqFormat_UNKNOWN) && boost::regex_search(s, headerResults, mCasava17FcHeaderRegex)) headerStyle = FastqFormat_CASAVA17_FC; if((headerStyle == FastqFormat_UNKNOWN) && boost::regex_search(s, headerResults, mCasava17IdxHeaderRegex)) headerStyle = FastqFormat_CASAVA17_INDEX; if((headerStyle == FastqFormat_UNKNOWN) && boost::regex_search(s, headerResults, mCasava17HeaderRegex)) headerStyle = FastqFormat_CASAVA17; if(headerStyle == FastqFormat_UNKNOWN) headerStyle = FastqFormat_EXTERNAL; } // handle the various header styles string machineName; string lastTwoChars; string::size_type machineNameLen; switch(headerStyle) { case FastqFormat_CASAVA18: // extract our header fields if(!boost::regex_search(s, headerResults, mCasava18HeaderRegex)) { BOOST_THROW_EXCEPTION(CasavaException(EINVAL, (boost::format("The CASAVA 1.8 regular expression failed on the following FASTQ header: [%s]") % s).str())); } // canonical read name (EAS139:136:FC706VJ:2:5:996:13539) data.Machine = headerResults[1].str(); data.RunNumber = headerResults[2].str(); data.FlowcellID = headerResults[3].str(); data.Lane = headerResults[4].str(); data.Tile = headerResults[5].str(); data.XCoord = headerResults[6].str(); data.YCoord = headerResults[7].str(); // metadata (1:Y:22:ATCACG) data.ReadNumber = headerResults[8].str(); data.IsFiltered = headerResults[9].str(); data.ControlID = headerResults[10].str(); data.BarcodeSequence = headerResults[11].str(); break; case FastqFormat_EXTERNAL: // extract our header fields if(!boost::regex_search(s, headerResults, mExternalHeaderRegex)) { BOOST_THROW_EXCEPTION(CasavaException(EINVAL, (boost::format("The external failed on the following FASTQ header: [%s]") % s).str())); } // remove the trailing read number markers machineName = headerResults[1].str(); machineNameLen = machineName.size(); if(machineNameLen >= 2) { lastTwoChars = machineName.substr(machineNameLen - 2); if((lastTwoChars == "/1") || (lastTwoChars == "/2")) machineName.resize(machineNameLen - 2); } // replace colons in the machine name with underlines replace(machineName.begin(), machineName.end(), ':', '_'); // canonical read name (EAS139:136:FC706VJ:2:5:996:13539) data.Machine = machineName; data.RunNumber = mRunID; data.FlowcellID = mFlowcellID; data.Lane = "0"; data.Tile = "0"; data.XCoord = "0"; data.YCoord = "0"; // metadata (1:Y:22:ATCACG) data.ReadNumber = mReadNum; data.IsFiltered = "N"; data.ControlID = mControlID; data.BarcodeSequence = mBarcodeSequence; break; case FastqFormat_CASAVA17: // extract our header fields if(!boost::regex_search(s, headerResults, mCasava17HeaderRegex)) { BOOST_THROW_EXCEPTION(CasavaException(EINVAL, (boost::format("The CASAVA 1.7 regular expression failed on the following FASTQ header: [%s]") % s).str())); } // canonical read name (EAS139:136:FC706VJ:2:5:996:13539) data.Machine = headerResults[1].str(); data.RunNumber = headerResults[2].str(); data.FlowcellID = mFlowcellID; data.Lane = headerResults[3].str(); data.Tile = headerResults[4].str(); data.XCoord = headerResults[5].str(); data.YCoord = headerResults[6].str(); // metadata (1:Y:22:ATCACG) data.ReadNumber = headerResults[7].str(); data.IsFiltered = "N"; data.ControlID = mControlID; data.BarcodeSequence = mBarcodeSequence; break; case FastqFormat_CASAVA17_FC: // extract our header fields if(!boost::regex_search(s, headerResults, mCasava17FcHeaderRegex)) { BOOST_THROW_EXCEPTION(CasavaException(EINVAL, (boost::format("The CASAVA 1.7 regular expression failed on the following FASTQ header: [%s]") % s).str())); } // canonical read name (EAS139:136:FC706VJ:2:5:996:13539) data.Machine = headerResults[1].str(); data.RunNumber = headerResults[2].str(); data.FlowcellID = headerResults[3].str(); data.Lane = headerResults[4].str(); data.Tile = headerResults[5].str(); data.XCoord = headerResults[6].str(); data.YCoord = headerResults[7].str(); // metadata (1:Y:22:ATCACG) data.ReadNumber = headerResults[8].str(); data.IsFiltered = "N"; data.ControlID = mControlID; data.BarcodeSequence = mBarcodeSequence; break; case FastqFormat_CASAVA17_INDEX: // extract our header fields if(!boost::regex_search(s, headerResults, mCasava17IdxHeaderRegex)) { BOOST_THROW_EXCEPTION(CasavaException(EINVAL, (boost::format("The CASAVA 1.7 regular expression failed on the following FASTQ header: [%s]") % s).str())); } // canonical read name (EAS139:136:FC706VJ:2:5:996:13539) data.Machine = headerResults[1].str(); data.RunNumber = headerResults[2].str(); data.FlowcellID = mFlowcellID; data.Lane = headerResults[3].str(); data.Tile = headerResults[4].str(); data.XCoord = headerResults[5].str(); data.YCoord = headerResults[6].str(); // metadata (1:Y:22:ATCACG) data.ReadNumber = headerResults[8].str(); data.IsFiltered = "N"; data.ControlID = mControlID; data.BarcodeSequence = (headerResults[7].str()[0] == '0' ? "" : headerResults[7].str()); break; case FastqFormat_CASAVA17_FC_INDEX: // extract our header fields if(!boost::regex_search(s, headerResults, mCasava17FcIdxHeaderRegex)) { BOOST_THROW_EXCEPTION(CasavaException(EINVAL, (boost::format("The CASAVA 1.7 regular expression failed on the following FASTQ header: [%s]") % s).str())); } // canonical read name (EAS139:136:FC706VJ:2:5:996:13539) data.Machine = headerResults[1].str(); data.RunNumber = headerResults[2].str(); data.FlowcellID = headerResults[3].str(); data.Lane = headerResults[4].str(); data.Tile = headerResults[5].str(); data.XCoord = headerResults[6].str(); data.YCoord = headerResults[7].str(); // metadata (1:Y:22:ATCACG) data.ReadNumber = headerResults[9].str(); data.IsFiltered = "N"; data.ControlID = mControlID; data.BarcodeSequence = (headerResults[8].str()[0] == '0' ? "" : headerResults[8].str()); break; default: BOOST_THROW_EXCEPTION(CasavaException(EINVAL, (boost::format("Unknown FASTQ header style encountered: [%d]") % headerStyle).str())); break; } } // converts a FASTA file to a FASTQ file void FileConversion::FastaToFastq(const std::string& inputFilename, const std::string& outputFilename, const char bq) { // open our files OpenInput(inputFilename); OpenOutput(outputFilename); // convert the data from each file string header; string bases; string qualities; string::size_type numQualities = 0; HeaderData_t data; FastqFormat headerStyle = FastqFormat_EXTERNAL; while(true) { // retrieve the next FASTA entry getline(mInFilterStream, header); if(mInFilterStream.eof() || mInFilterStream.fail()) break; // sanity check if(header[0] != '>') { BOOST_THROW_EXCEPTION(CasavaException(EINVAL, (boost::format("A '>' character was expected in the FASTA header (%s).") % header).str())); } bool foundError = false; getline(mInFilterStream, bases); if(mInFilterStream.eof() || mInFilterStream.fail()) foundError = true; if(foundError) { BOOST_THROW_EXCEPTION(CasavaException(EINVAL, (boost::format("An truncated FASTA entry was detected (%s).") % header).str())); } // extract the header data ExtractHeaderData(data, header.substr(1), headerStyle); // reset the qualities if(numQualities != bases.size()) { numQualities = bases.size(); qualities = string(numQualities, bq + FASTQ_BQ_OFFSET); } // write the FASTQ entry // @EAS139:136:FC706VJ:2:5:1000:1285 1:Y:18:ATCACG mOutFilterStream << '@' << data.Machine << ':' << data.RunNumber << ':' << data.FlowcellID << ':' << data.Lane << ':' << data.Tile << ':' << data.XCoord << ':' << data.YCoord<< ' ' << data.ReadNumber << ':' << data.IsFiltered << ':' << data.ControlID << ':' << data.BarcodeSequence << '\n' << bases << "\n+\n" << qualities << '\n'; } // close our files CloseInput(); CloseOutput(); } // converts a FASTQ file to a FASTQ file void FileConversion::FastqToFastq(const std::string& inputFilename, const std::string& outputFilename) { // open our files OpenInput(inputFilename); OpenOutput(outputFilename); // convert the data from each file string header1, header2; string bases; string qualities; // keep track of header style HeaderData_t data; FastqFormat fastqHeaderStyle = FastqFormat_UNKNOWN; while(true) { // retrieve the next FASTQ entry getline(mInFilterStream, header1); if(mInFilterStream.eof() || mInFilterStream.fail()) break; // sanity check if(header1[0] != '@') { BOOST_THROW_EXCEPTION(CasavaException(EINVAL, (boost::format("An '@' character was expected in the FASTQ header (%s).") % header1).str())); } bool foundError = false; getline(mInFilterStream, bases); if(mInFilterStream.eof() || mInFilterStream.fail()) foundError = true; if(!foundError) getline(mInFilterStream, header2); if(mInFilterStream.eof() || mInFilterStream.fail()) foundError = true; if(!foundError) getline(mInFilterStream, qualities); if(mInFilterStream.eof() || mInFilterStream.fail()) foundError = true; if(foundError) { BOOST_THROW_EXCEPTION(CasavaException(EINVAL, (boost::format("An truncated FASTQ entry was detected (%s).") % header1).str())); } // sanity check if(header2[0] != '+') { BOOST_THROW_EXCEPTION(CasavaException(EINVAL, (boost::format("An '+' character was expected in the FASTQ header (%s).") % header2).str())); } // extract the header data ExtractHeaderData(data, header1.substr(1), fastqHeaderStyle); // adjust the base qualities if((fastqHeaderStyle == FastqFormat_CASAVA17) || (fastqHeaderStyle == FastqFormat_CASAVA17_FC) || (fastqHeaderStyle == FastqFormat_CASAVA17_INDEX) || (fastqHeaderStyle == FastqFormat_CASAVA17_FC_INDEX)) { transform(qualities.begin(), qualities.end(), qualities.begin(), Phred64ToPhred33); } // write the FASTQ entry // @EAS139:136:FC706VJ:2:5:1000:1285 1:Y:18:ATCACG mOutFilterStream << '@' << data.Machine << ':' << data.RunNumber << ':' << data.FlowcellID << ':' << data.Lane << ':' << data.Tile << ':' << data.XCoord << ':' << data.YCoord<< ' ' << data.ReadNumber << ':' << data.IsFiltered << ':' << data.ControlID << ':' << data.BarcodeSequence << '\n' << bases << "\n+\n" << qualities << '\n'; } // close our files CloseInput(); CloseOutput(); } // returns the number of columns in the specified string uint32_t FileConversion::GetNumColumns(const string& s, const char delimiter) { const uint32_t sLen = (uint32_t)s.size(); const char* pStart = s.data(); const char* pOld = pStart; const char* pEnd = pStart + sLen - 1; const char* pCurrent = NULL; // count the columns uint32_t numColumns = 0; while((pCurrent = (char*)memchr(pOld, delimiter, (pStart + sLen) - pOld))) { ++numColumns; if(pCurrent == pEnd) break; pOld = pCurrent + 1; } uint32_t numRemaining = (uint32_t)(pEnd - pOld + 1); if(numRemaining > 0) ++numColumns; return numColumns; } // opens the input streams void FileConversion::OpenInput(const std::string& inputFilename) { // sanity check if(mIsInputOpen) { BOOST_THROW_EXCEPTION(CasavaException(EINVAL, (boost::format("An attempt was made to open a file (%s) that was already open.") % inputFilename).str())); } // open the output file mInStream.open(inputFilename.c_str(), ios::binary); if(mInStream.fail()) { BOOST_THROW_EXCEPTION(CasavaException(EINVAL, (boost::format("Unable to open the input file (%s) for reading.") % inputFilename).str())); } // decide if we should use the gzip decompressor const bool isCompressed = (mInStream.get() == 037) && (mInStream.get() == 0213); mInStream.seekg(0, ios::beg); // create a gzip decompressor if(isCompressed) mInFilterStream.push(bi::gzip_decompressor()); mInFilterStream.push(mInStream); mIsInputOpen = true; } // opens the output streams void FileConversion::OpenOutput(const std::string& outputFilename) { // sanity check if(mIsOutputOpen) { BOOST_THROW_EXCEPTION(CasavaException(EINVAL, (boost::format("An attempt was made to open a file (%s) that was already open.") % outputFilename).str())); } // open the output file mOutStream.open(outputFilename.c_str(), ios::binary); if(mOutStream.fail()) { BOOST_THROW_EXCEPTION(CasavaException(EINVAL, (boost::format("Unable to open the output file (%s) for writing.") % outputFilename).str())); } // create a fast gzip compressor if(mIsCompressedOutput){ mOutFilterStream.push(bi::gzip_compressor(bi::gzip::best_speed)); } mOutFilterStream.push(mOutStream); mIsOutputOpen = true; } // converts a QSEQ file to a FASTQ file void FileConversion::QseqToFastq(const string& inputFilename, const string& outputFilename) { // open our files OpenInput(inputFilename); OpenOutput(outputFilename); // convert the data from each file vector qseqColumns; string line; string header; string bases; string qualities; while(true) { // retrieve the next line getline(mInFilterStream, line); if(mInFilterStream.eof()) break; // split the tab delimited columns StringUtilities::Split(line, '\t', qseqColumns); // sanity check if(qseqColumns.size() != 11) { BOOST_THROW_EXCEPTION(CasavaException(EINVAL, (boost::format("Expected 11 columns in the qseq entry, but found %u columns.") % qseqColumns.size()).str())); } const bool isFiltered = (qseqColumns[10] == "0"); // adjust the barcode sequence if(qseqColumns[6][0] == '0') qseqColumns[6].clear(); // adjust the bases transform(qseqColumns[8].begin(), qseqColumns[8].end(), qseqColumns[8].begin(), RemoveDots); // adjust the base qualities transform(qseqColumns[9].begin(), qseqColumns[9].end(), qseqColumns[9].begin(), Phred64ToPhred33); // write the FASTQ entry // @EAS139:136:FC706VJ:2:5:1000:1285 1:Y:18:ATCACG mOutFilterStream << '@' << qseqColumns[0] << ':' << qseqColumns[1] << ':' << mFlowcellID << ':' << qseqColumns[2] << ':' << qseqColumns[3] << ':' << qseqColumns[4] << ':' << qseqColumns[5] << ' ' << qseqColumns[7] << ':' << (isFiltered ? 'Y' : 'N') << ':' << mControlID << ':' << qseqColumns[6] << '\n' << qseqColumns[8] << "\n+\n" << qseqColumns[9] << '\n'; } // close our files CloseInput(); CloseOutput(); } } }