/** ** Copyright (c) 2007-2009 Illumina, Inc. ** ** This software is covered by the "Illumina Genome Analyzer Software ** License Agreement" and the "Illumina Source Code License Agreement", ** and certain third party copyright/licenses, and any user of this ** source file is bound by the terms therein (see accompanying files ** Illumina_Genome_Analyzer_Software_License_Agreement.pdf and ** Illumina_Source_Code_License_Agreement.pdf and third party ** copyright/license notices). ** ** This file is part of the Consensus Assessment of Sequence And VAriation ** (CASAVA) software package. ** ** \file AdapterTrimmingRecordConverter.hh ** ** \brief Masks the adapter sequences. ** ** \author Roman Petrovski **/ #ifndef CASAVA_DEMULTIPLEX_ADAPTER_TRIMMING_RECORD_CONVERTER_HH #define CASAVA_DEMULTIPLEX_ADAPTER_TRIMMING_RECORD_CONVERTER_HH #include #include "io/FastqWriter.hh" #include "demultiplex/CompositeBclReader.hh" #include "Bcl2FastqRecordConverter.hh" namespace casava { namespace demultiplex { /** * @brief Wrapper for Bcl2FastqRecordConveter that masks adapter sequences found in the read with Ns. */ struct AdapterTrimmingRecordConverter { AdapterTrimmingRecordConverter( const std::string &adapterSequence, const unsigned barcodeCycles, Bcl2FastqRecordConverter &bcl2FastqConverter) : adapterSequence_(adapterSequence), barcodeCycles_(barcodeCycles), bcl2FastqConverter_(bcl2FastqConverter), basesTrimmed_(0) {} const cio::FastqWriter::RecordType &operator()(const CompositeBclReader::RecordType &bclRecord) { bclRecord_ = bclRecord; maskAdapter(); return bcl2FastqConverter_(bclRecord_); } private: const std::string adapterSequence_; // Number of cycles at the end of the read sequence that belong to the barcode. const unsigned barcodeCycles_; Bcl2FastqRecordConverter &bcl2FastqConverter_; CompositeBclReader::RecordType bclRecord_; unsigned long basesTrimmed_; void maskAdapter() { // readLength includes barcode cycles. Stop processing at the last non-barcode cycle const size_t readLength = bclRecord_.cluster_.first.length() - barcodeCycles_; const size_t adapterLength = adapterSequence_.length(); for (size_t readOffset = 0; readOffset < readLength; ++readOffset) { unsigned mismatchCount = 0; unsigned matchCount = 0; const size_t checkCycles = std::min(adapterLength, readLength - readOffset); for (size_t adapterOffset = 0; adapterOffset < checkCycles; ++adapterOffset) { if (bclRecord_.cluster_.first.at(readOffset + adapterOffset) == adapterSequence_.at(adapterOffset)) { ++matchCount; } else { ++mismatchCount; } if (mismatchCount > 1 && mismatchCount > matchCount) { // If at any point there are 2 or more mismatches and the number of mismatches is // greater than the number of matches, stop scanning and start a new scan at // the next initial position in the read sequence break; } } if (matchCount > mismatchCount * 2) { //If the match count is greater than 2 times the mismatch count, //mask all bases from the start position onward std::fill(bclRecord_.cluster_.first.begin() + readOffset, bclRecord_.cluster_.first.begin() + readLength, 'N'); basesTrimmed_ += readLength - readOffset; break; } } } }; } // namespce demultiplex } // namespace casava #endif // CASAVA_DEMULTIPLEX_ADAPTER_TRIMMING_RECORD_CONVERTER_HH