/** ** Copyright (c) 2007-2010 Illumina, Inc. ** ** This software is covered by the "Illumina Genome Analyzer Software ** License Agreement" and the "Illumina Source Code License Agreement", ** and certain third party copyright/licenses, and any user of this ** source file is bound by the terms therein (see accompanying files ** Illumina_Genome_Analyzer_Software_License_Agreement.pdf and ** Illumina_Source_Code_License_Agreement.pdf and third party ** copyright/license notices). ** ** This file is part of the Consensus Assessment of Sequence And VAriation ** (CASAVA) software package. ** **/ #include #include "smallRna/TrimmerErrors.h" #include "smallRna/Trimmer.h" #include "smallRna/TrimmerProcess.h" #include "smallRna/TrimmerInput.h" #include "smallRna/TrimmerReport.h" CTrimmerProcess::CTrimmerProcess(CTrimmerInput & trimmerInput) : m_FpIn(trimmerInput.GetInFilePtr()) , m_FpOut(trimmerInput.GetOutFilePtr()) , m_FpReport(trimmerInput.GetReportFilePtr()) , m_Threshold(trimmerInput.GetThreshold()) , m_nPadLen(trimmerInput.GetNpad()) , m_nExtraBases(trimmerInput.GetNextraBases()) { // printf("Process=%d, %d\n", m_nPadLen, trimmerInput.GetNpad()); } CTrimmerProcess::~CTrimmerProcess() { // Don't close m_FpIn and m_FpOut here // They will be closed in CTrimmerInput class deallocateHolder(); } void CTrimmerProcess::Run(CTrimmer & trimmer) { if (m_FpIn) runFromFastqFile(trimmer); else runFromConsole(trimmer); } void CTrimmerProcess::runFromFile(CTrimmer & trimmer) { int bufLength; CTrimmerResult result; const char * matcher; int matcherLength; std::map::iterator iterTrimResult; fseek(m_FpIn, 0, SEEK_SET); deallocateHolder(); //printf("Run=%d\n", m_nPadLen); while (fgets(m_Buffer, MAX_BUF_SIZE, m_FpIn)) { m_Buffer[MAX_BUF_SIZE - 1] = 0; bufLength = (int) strlen(m_Buffer); while (m_Buffer[--bufLength] == '\n') m_Buffer[bufLength] = 0; // parse(':', 2, 6); for GERALD type files parse(); if (strlen(m_ReadString) == 0) continue; iterTrimResult = m_ResultHolder.find(m_ReadString); if (iterTrimResult == m_ResultHolder.end()) { result.m_Count = 1; result.m_Status = trimmer.Run(m_ReadString, result.m_SuffixPosition, result.m_Score); if (result.m_Status == OKAY) { matcher = trimmer.GetMatch(); matcherLength = (int) strlen(matcher); result.m_Matcher = new char [matcherLength + 1]; if (result.m_Matcher == 0) result.m_Status = MEMORY_ALLOCATION_FAILURE; else { strcpy(result.m_Matcher, matcher); result.m_Matcher[matcherLength] = 0; } } m_ResultHolder[m_ReadString] = result; iterTrimResult = m_ResultHolder.find(m_ReadString); } else (iterTrimResult->second).m_Count++; if (m_FpOut) (iterTrimResult->second).Print(m_FpOut, m_ReadString, m_Threshold, m_PreString); else (iterTrimResult->second).Print(m_ReadString, m_Threshold, m_PreString); } } void CTrimmerProcess::runFromFastqFile(CTrimmer & trimmer) { CTrimmerResult result; const char * matcher; int matcherLength; std::map::iterator iterTrimResult; TrimmerReport report( m_FpReport ); fseek(m_FpIn, 0, SEEK_SET); deallocateHolder(); //printf("RunFastq=%d, %d\n", m_nPadLen, m_Threshold); char name1[MAX_BUF_SIZE]; char name2[MAX_BUF_SIZE]; char quality[MAX_BUF_SIZE]; while (fgets(name1, MAX_BUF_SIZE, m_FpIn)) { if (!fgets (m_ReadString, MAX_BUF_SIZE, m_FpIn) || !fgets (name2, MAX_BUF_SIZE, m_FpIn) || !fgets (quality, MAX_BUF_SIZE, m_FpIn) ) { break; } //ensure null termination name1[MAX_BUF_SIZE - 1] = 0; m_ReadString[MAX_BUF_SIZE - 1] = 0; name2[MAX_BUF_SIZE - 1] = 0; quality[MAX_BUF_SIZE - 1] = 0; // strip newlines name1[strlen(name1)-1] = 0; m_ReadString[strlen(m_ReadString)-1] = 0; name2[strlen(name2)-1] = 0; quality[strlen(quality)-1] = 0; if ( m_nPadLen == 0 ) { m_nPadLen = strlen(m_ReadString); } if (strlen(m_ReadString) == 0) { // SC - This case is not handled well. The correct behavior here would be to output this // record to the fastq file. In theory, the current implementation can cause problems with paired end reads. // Use case that causes problem: Suppose I have two different adaptors in my sample, so I // need to trim twice. First time through, I use -pad=-1 (no N padding). This could potentially // leave me with a 0 length sequences in one read and not in the other read (it shouldn't, but it could!). // Then the second time through the trimmer we will end up with a record missing in one PE file and not in // the other. Big problem for downstream processing. if ( report.Flag() ) report.Add( 0, 0 ); continue; } iterTrimResult = m_ResultHolder.find(m_ReadString); if (iterTrimResult == m_ResultHolder.end()) { result.m_Count = 1; result.m_Status = trimmer.Run(m_ReadString, result.m_SuffixPosition, result.m_Score); if (result.m_Status == OKAY) { matcher = trimmer.GetMatch(); matcherLength = (int) strlen(matcher); result.m_Matcher = new char [matcherLength + 1]; if (result.m_Matcher == 0) result.m_Status = MEMORY_ALLOCATION_FAILURE; else { strcpy(result.m_Matcher, matcher); result.m_Matcher[matcherLength] = 0; } } m_ResultHolder[m_ReadString] = result; iterTrimResult = m_ResultHolder.find(m_ReadString); } else (iterTrimResult->second).m_Count++; //if (m_FpOut) (iterTrimResult->second).Print(m_FpOut, m_ReadString, m_Threshold, m_PreString); //else (iterTrimResult->second).PrintFastq( m_ReadString, m_Threshold, name1, name2, quality, m_nPadLen); (iterTrimResult->second).PrintFastq( m_ReadString, m_Threshold, name1, name2, quality, m_nPadLen, m_nExtraBases, report ); } report.Print(); } void CTrimmerProcess::runFromConsole(CTrimmer & trimmer) { int bufLength; CTrimmerResult result; const char * matcher; int matcherLength; printf("Enter the read string : "); fflush(stdin); while (fgets(m_ReadString, MAX_READ_SIZE, stdin)) { m_ReadString[MAX_READ_SIZE - 1] = 0; bufLength = (int) strlen(m_ReadString); while (m_ReadString[--bufLength] == '\n') m_ReadString[bufLength] = 0; if (++bufLength == 0) continue; result.m_Count = 1; result.m_Status = trimmer.Run(m_ReadString, result.m_SuffixPosition, result.m_Score); if (result.m_Status == OKAY) { matcher = trimmer.GetMatch(); matcherLength = (int) strlen(matcher); result.m_Matcher = new char [matcherLength + 1]; if (result.m_Matcher == 0) result.m_Status = MEMORY_ALLOCATION_FAILURE; else { strcpy(result.m_Matcher, matcher); result.m_Matcher[matcherLength] = 0; } } result.Print(m_ReadString, m_Threshold); if (m_FpOut) result.Print(m_FpOut, m_ReadString, m_Threshold); delete [] result.m_Matcher; printf("Enter the read string : "); fflush(stdin); } } void CTrimmerProcess::deallocateHolder() { for (std::map::iterator iterTrimResult = m_ResultHolder.begin(); iterTrimResult != m_ResultHolder.end(); iterTrimResult++) { delete [] (iterTrimResult->second).m_Matcher; } m_ResultHolder.clear(); } void CTrimmerProcess::parse() { m_ReadString[0] = 0; m_PreString[0] = 0; char preStringStrArray[4][MAX_READ_SIZE]; sscanf(m_Buffer, "%s%s%s%s%s", preStringStrArray[0], preStringStrArray[1], preStringStrArray[2], preStringStrArray[3], m_ReadString); sprintf(m_PreString, "%s_%s_%s_%s", preStringStrArray[0], preStringStrArray[1], preStringStrArray[2], preStringStrArray[3]); } void CTrimmerProcess::parse(const char delimiter, const int startColumn, // Columns are indexed from 1 const int endColumn) { m_ReadString[0] = 0; m_PreString[0] = 0; int nDelimiterCount = 0; int index = 0; int startIndex = -1; int endIndex = -1; for (index = 0; m_Buffer[index] != 0; index++) { if ((m_Buffer[index] == delimiter) && (++nDelimiterCount == (startColumn - 1))) { startIndex = ++index; break; } } for (; m_Buffer[index] != 0; index++) { if ((m_Buffer[index] == delimiter) && (++nDelimiterCount == (endColumn - 1))) { endIndex = index++; break; } } memcpy(m_PreString, m_Buffer + startIndex, (endIndex - startIndex + 1)); m_PreString[endIndex - startIndex] = 0; startIndex = endIndex + 1; endIndex = -1; for (; m_Buffer[index] != 0; index++) { if (m_Buffer[index] == delimiter) { endIndex = index++; break; } } if (endIndex == -1) endIndex = index; memcpy(m_ReadString, m_Buffer + startIndex, (endIndex - startIndex + 1)); m_ReadString[endIndex - startIndex] = 0; for (index = 0; m_PreString[index] != 0; index++) { if (m_PreString[index] == delimiter) m_PreString[index] = '_'; } }