/** ** Copyright (c) 2007-2010 Illumina, Inc. ** ** This software is covered by the "Illumina Genome Analyzer Software ** License Agreement" and the "Illumina Source Code License Agreement", ** and certain third party copyright/licenses, and any user of this ** source file is bound by the terms therein (see accompanying files ** Illumina_Genome_Analyzer_Software_License_Agreement.pdf and ** Illumina_Source_Code_License_Agreement.pdf and third party ** copyright/license notices). ** ** This file is part of the Consensus Assessment of Sequence And VAriation ** (CASAVA) software package. ** **/ #include #include #include #include "smallRna/TrimmerInput.h" //#include //using namespace std; CTrimmerInput::CTrimmerInput() { createDefaultAdaptorString(); createDefaultSimilarityMatrix(); m_InputFilename[0] = 0; m_OutputFilename[0] = 0; m_ReportFilename[0] = 0; m_SimMatFilename[0] = 0; m_Threshold = INT_MIN; m_FpIn = 0; m_FpOut = 0; m_FpReport = 0; m_padLength = 0; m_nExtraBases = 0; } CTrimmerInput::CTrimmerInput(int argc, char * argv[], int & status) { status = OKAY; createDefaultAdaptorString(); createDefaultSimilarityMatrix(); m_InputFilename[0] = 0; m_OutputFilename[0] = 0; m_ReportFilename[0] = 0; m_SimMatFilename[0] = 0; m_Threshold = INT_MIN; m_FpIn = 0; m_FpOut = 0; m_FpReport = 0; m_padLength = 0; m_nExtraBases = 0; int i, j; float fThreshold; FILE * fpSimMat = 0; /* float fPad; case 'p': sscanf(argv[i], "-pad=%f", &fPad); m_padLength = (int) ((fPad < 0) ? (fPad - 0.5f) : (fPad + 0.5f)); break; */ int okay = 1; for (i = 1; i < argc; i++) { if (argv[i][0] == '-') switch (argv[i][1]) { case 'h': CTrimmerInput::PrintHelp(argv[0]); exit(0); case 'e': okay = (1 == sscanf(argv[i], "-extra=%d", &m_nExtraBases)); break; case 'a': okay = (1 == sscanf(argv[i], "-adaptor=%s", adaptorString)); break; case 'i': okay = (1 == sscanf(argv[i], "-input=%s", m_InputFilename)); break; case 'o': okay = (1 == sscanf(argv[i], "-output=%s", m_OutputFilename)); break; case 'r': okay = (1 == sscanf(argv[i], "-report=%s", m_ReportFilename)); break; case 's': okay = (1 == sscanf(argv[i], "-simmat=%s", m_SimMatFilename)); break; case 'p': okay = (1 == sscanf(argv[i], "-pad=%d", &m_padLength)); break; case 't': okay = (1 == sscanf(argv[i], "-threshold=%f", &fThreshold)); m_Threshold = (int) ((fThreshold < 0) ? (fThreshold - 0.5f) : (fThreshold + 0.5f)); break; default: okay = 0; break; }} if ( !okay ) { printf( "**Error in input to %s.\n", argv[0]); CTrimmerInput::PrintHelp(argv[0]); exit(0); exit(1) ; } if ((m_InputFilename[0] != 0) ) { if ( strcmp( m_InputFilename, "-") == 0 ) { m_FpIn = stdin; } else { if ((m_FpIn = fopen(m_InputFilename, "r")) == 0 ) { printf("Input file: %s not found\n", m_InputFilename); status = FILE_OPEN_FAILURE; return; } } } if ((m_OutputFilename[0] != 0) && ((m_FpOut = fopen(m_OutputFilename, "w")) == 0)) { printf("Output file: %s not found\n", m_OutputFilename); status = FILE_OPEN_FAILURE; return; } if ((m_ReportFilename[0] != 0) && ((m_FpReport = fopen(m_ReportFilename, "w")) == 0)) { printf("Output file: %s not found\n", m_ReportFilename); status = FILE_OPEN_FAILURE; return; } if ((m_SimMatFilename[0] != 0) && ((fpSimMat = fopen(m_SimMatFilename, "r")) == 0)) { printf("Score matrix file: %s not found\n", m_SimMatFilename); status = FILE_OPEN_FAILURE; return; } if (fpSimMat != 0) { for (i = 0; i < NSYMBOLS; i++) { for (j = 0; j < NSYMBOLS; j++) { if (fscanf(fpSimMat, "%d", &simmatrix[i][j]) <= 0) { printf("Error reading matrix file: %s\n", m_SimMatFilename); printf("Should be a %d x %d matrix\n", NSYMBOLS, NSYMBOLS); status = FILE_READ_FAILURE; fclose(fpSimMat); return; }}}} if (fpSimMat) fclose(fpSimMat); fpSimMat = 0; } CTrimmerInput::~CTrimmerInput() { if (m_FpOut) fclose(m_FpOut); m_FpOut = 0; if (m_FpIn) fclose(m_FpIn); m_FpIn = 0; if (m_FpReport) fclose(m_FpReport); m_FpReport = 0; } void CTrimmerInput::PrintHelp(const char * command) { printf("Usage: %s [options]\n" "Options:\n" " -adaptor= set adaptor string\n" " -extra= remove N additional bases at 5' end of adaptor (for DNA prep with A overhangs, use -extra=1)\n" " -input= set the path to the input Fastq file. Use - to read fastq from stdin\n" " -output= set the path to the output file\n" " -simmat= set the path to the similarity matrix text file\n" " -threshold= set lowest acceptable score threshold\n" " -pad= sets length to N-pad sequences to. Default is read length of first sequence. Use -1 to crop instead of N pad.\n" " -report= specifies name of output report file (for histogram of lengths of trimmed sequences)\n" , command); } void CTrimmerInput::WriteBanner() { int i, j; if ((m_FpIn && !m_FpOut) || !m_FpIn) { printf("> Illumina Adaptor Trimming. V1.0."); printf(" Copyright (c) 2008-2012, Illumina Inc. All rights reserved.\n"); printf("> Input Parameters:\n"); printf("> Adaptor = %s\n", adaptorString); printf("> Threshold = %d \n", m_Threshold); printf("> Similarity Matrix =\n"); for (i = 0; i < NSYMBOLS; i++) { printf("> "); for (j = 0; j < NSYMBOLS; j++) { printf("%10d\t", simmatrix[i][j]); } printf("\n"); } printf(">\n"); } if (m_FpOut) { fprintf(m_FpOut, "> Illumina Adaptor Trimming. V1.0."); fprintf(m_FpOut, " Copyright (c) 2008-2012, Illumina Inc. All rights reserved.\n"); fprintf(m_FpOut, "> Input Parameters:\n"); fprintf(m_FpOut, "> Adaptor = %s\n", adaptorString); fprintf(m_FpOut, "> Threshold = %d \n", m_Threshold); fprintf(m_FpOut, "> Similarity Matrix =\n"); for (i = 0; i < NSYMBOLS; i++) { fprintf(m_FpOut, "> "); for (j = 0; j < NSYMBOLS; j++) { fprintf(m_FpOut, "%10d\t", simmatrix[i][j]); } fprintf(m_FpOut, "\n"); } fprintf(m_FpOut, ">\n"); } } void CTrimmerInput::createDefaultAdaptorString() { // strcpy(adaptorString, "TCGTATGCCGTCTTCTGCTTG"); Old adaptor default sequence 2008 strcpy(adaptorString, "ATCTCGTATGCCGTCTTCTGCTTG"); } void CTrimmerInput::createDefaultSimilarityMatrix() { /*********************************************************************************** * Enumeration of symbols: L, A, C, G, T, N * 0 = L (L is for lambda, the null character) * 1 = A * 2 = C * 3 = G * 4 = T * 5 = N ***********************************************************************************/ int isoMatchScore = 500; // A:A, C:C, G:G, T:T int isoMisMatchScore = -530; // A:C, G:T, & symmetry int nonIsoMisMatchScore = -560; // A:G, A:T, C:G, C:T, & symmetry int wildcardMatchScore = -47; // A:N, C:N, G:N, T:N, & symmetry, & N:N int indelMatchScore = -2000; // L:A, L:C, L:G, L:T, L:N, & symmetry // NOTE: L:L is 0 int i, j; simmatrix[0][0] = 0; // L:L is 0 for (i = 1; i <= 5; i++) { simmatrix[0][i] = simmatrix[i][0] = indelMatchScore; // L:A, L:C, L:G, L:T, L:N, & symmetry } for (i = 1; i <= 4; i++) { simmatrix[i][5] = simmatrix[5][i] = wildcardMatchScore; // A:N, C:N, G:N, T:N, & symmetry } simmatrix[5][5] = wildcardMatchScore; // N:N for (i = 1; i <= 4; i++) { for (j = 1; j <= 4; j++) { simmatrix[i][j] = nonIsoMisMatchScore; // Set all to nonIsoMisMatchScore } } simmatrix[1][2] = simmatrix[2][1] = isoMisMatchScore; // A:C & symmetry simmatrix[3][4] = simmatrix[4][3] = isoMisMatchScore; // G:T & symmetry for (i = 1; i <= 4; i++) { simmatrix[i][i] = isoMatchScore; // A:A, C:C, G:G, T:T } }