/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ #include "snpGenome.h" #include using namespace std; #include #include "soLine.h" #include "mbtValOrderedVectorOfRWCString.h" #include "filename.h" #include "assert.h" #include "rwctokenizer.h" #include "bIsNumericMaybeWithWhitespace.h" #include "soAddCommas.h" #include "complement_so.h" #include "consedParameters.h" #include "min.h" #include "rwtvalvector.h" const int nAmbiguityTableSize = 256; char cAmbiguityTable[nAmbiguityTableSize][nAmbiguityTableSize]; static void setAmbiguityTable() { for( int nRef = 0; nRef < nAmbiguityTableSize; ++nRef ) { for( int nAlleleBase = 0; nAlleleBase < nAmbiguityTableSize; ++nAlleleBase ) { char cAnswer = '?'; if ( isalpha( nRef ) && isalpha( nAlleleBase ) ) { char cLower1 = tolower( nRef ); char cLower2 = tolower( nAlleleBase ); if ( cLower1 == cLower2 ) { cAnswer = toupper( cLower1 ); } else { if ( cLower1 > cLower2 ) { char cTemp = cLower1; cLower1 = cLower2; cLower2 = cTemp; } if ( cLower1 == 'a' && cLower2 == 'c' ) cAnswer = 'm'; else if ( cLower1 == 'a' && cLower2 == 'g' ) cAnswer = 'r'; else if ( cLower1 == 'a' && cLower2 == 't' ) cAnswer = 'w'; else if ( cLower1 == 'c' && cLower2 == 'g' ) cAnswer = 's'; else if ( cLower1 == 'c' && cLower2 == 't' ) cAnswer = 'y'; else if ( cLower1 == 'g' && cLower2 == 't' ) cAnswer = 'k'; else { cAnswer = '?'; } if ( tolower( nRef ) < tolower( nAlleleBase ) ) cAnswer = toupper( cAnswer ); } } // if ( isalpha( nRef ) && isalpha( nAlleleBase ) ) { cAmbiguityTable[nRef][nAlleleBase] = cAnswer; } } } const int nInsertionTableSize = 256; char cInsertionTable[nInsertionTableSize]; static void setInsertionTable() { for( int n = 0; n < nInsertionTableSize; ++n ) { cInsertionTable[n] = 0; } cInsertionTable['a'] = cInsertionTable['A'] = 'e'; cInsertionTable['c'] = cInsertionTable['C'] = 'f'; cInsertionTable['g'] = cInsertionTable['G'] = 'i'; cInsertionTable['t'] = cInsertionTable['T'] = 'j'; } const int nDeletionTableSize = 256; char cDeletionTable[nDeletionTableSize]; static void setDeletionTable() { for( int n = 0; n < nDeletionTableSize; ++n ) { cDeletionTable[n] = 0; } cDeletionTable['a'] = cDeletionTable['A'] = 'E'; cDeletionTable['c'] = cDeletionTable['C'] = 'F'; cDeletionTable['g'] = cDeletionTable['G'] = 'I'; cDeletionTable['t'] = cDeletionTable['T'] = 'J'; } const int ucINSERTION_BASE = 1; const int ucNOT_INSERTION_BASE = 2; const int nIfInsertionTableSize = 256; unsigned char ucIfInsertionTable[ nIfInsertionTableSize ]; static void setIfInsertionTable() { for( int n = 0; n < nIfInsertionTableSize; ++n ) { ucIfInsertionTable[n] = ucNOT_INSERTION_BASE; } ucIfInsertionTable['e'] = ucINSERTION_BASE; ucIfInsertionTable['f'] = ucINSERTION_BASE; ucIfInsertionTable['i'] = ucINSERTION_BASE; ucIfInsertionTable['j'] = ucINSERTION_BASE; } const int nIsModifiedTableSize = 256; static bool bIsModifiedTable[ nIsModifiedTableSize ]; static void setIsModifiedTable() { for( int n = 0; n < nIsModifiedTableSize; ++n ) { bIsModifiedTable[n] = true; } bIsModifiedTable['a'] = false; bIsModifiedTable['A'] = false; bIsModifiedTable['c'] = false; bIsModifiedTable['C'] = false; bIsModifiedTable['g'] = false; bIsModifiedTable['G'] = false; bIsModifiedTable['t'] = false; bIsModifiedTable['T'] = false; bIsModifiedTable['n'] = false; } const int nIsDeletionTableSize = 256; static bool bIsDeletion[nDeletionTableSize]; static void setIsDeletionTable() { for( int n = 0; n < nIsDeletionTableSize; ++n ) { bIsDeletion[n] = false; } bIsDeletion['E'] = true; bIsDeletion['F'] = true; bIsDeletion['I'] = true; bIsDeletion['J'] = true; } const int nMaxBasesOnLine = 50; #define FPUTC_TO_SNP( cBASE_TO_WRITE ) \ if ( nBasesOnLine >= nMaxBasesOnLine ) { \ putc( '\n', pSnpChr ); \ nBasesOnLine = 0; \ } \ fputc( cBASE_TO_WRITE, pSnpChr ); \ ++nBasesOnLine; \ ++n1BasesInFile; #define FPUTC_TO_JUST_SNP( soCHR_NAME, n1CHR_POS, cAMBIGUITY_CODE, cREF_BASE, soRSName ) \ fprintf( pFilteredSnps, "%s %d %c %c %s\n", \ soCHR_NAME.data(), \ n1CHR_POS, \ cAMBIGUITY_CODE, \ cREF_BASE, \ soRSName.data() ); const int nAmbiguityCodeToNumberTableSize = 256; int nAmbiguityCodeToNumber[ nAmbiguityCodeToNumberTableSize ]; const int nAmbiguityCodeFromNumberTableSize = 16; char cAmbiguityCodeFromNumber[ nAmbiguityCodeFromNumberTableSize ]; static void setAmbiguityCodeTables() { const int nA = 1; const int nC = 2; const int nG = 4; const int nT = 8; int n; for( n = 0; n < nAmbiguityCodeToNumberTableSize; ++n ) { nAmbiguityCodeToNumber[n] = 0; } for( n = 0; n < nAmbiguityCodeFromNumberTableSize; ++n ) { cAmbiguityCodeFromNumber[n] = 0; } nAmbiguityCodeToNumber['A'] = nA; nAmbiguityCodeToNumber['C'] = nC; nAmbiguityCodeToNumber['G'] = nG; nAmbiguityCodeToNumber['T'] = nT; nAmbiguityCodeToNumber['R'] = nA + nG; nAmbiguityCodeToNumber['Y'] = nC + nT; nAmbiguityCodeToNumber['W'] = nA + nT; nAmbiguityCodeToNumber['S'] = nC + nG; nAmbiguityCodeToNumber['M'] = nA + nC; nAmbiguityCodeToNumber['K'] = nG + nT; nAmbiguityCodeToNumber['B'] = nC + nG + nT; nAmbiguityCodeToNumber['D'] = nA + nG + nT; nAmbiguityCodeToNumber['H'] = nA + nC + nT; nAmbiguityCodeToNumber['V'] = nA + nC + nG; nAmbiguityCodeToNumber['N'] = nA + nC + nG + nT; for( n = 0; n < nAmbiguityCodeToNumberTableSize; ++n ) { int nNumber = nAmbiguityCodeToNumber[n]; assert( nNumber < nAmbiguityCodeFromNumberTableSize ); cAmbiguityCodeFromNumber[ nNumber ] = n; } } static char cCombineBases( const char cOldBase, const char cNewBase, const char cRefBase, bool& bBothAreSubstitutions ) { bBothAreSubstitutions = false; if ( !bIsModifiedTable[ cOldBase ] ) return cNewBase; if ( bIsDeletion[ cOldBase ] ) return cNewBase; assert( ucIfInsertionTable[ cOldBase ] == ucNOT_INSERTION_BASE ); // if reached here, cOldBase is a substitution bBothAreSubstitutions = true; if ( toupper( cOldBase ) == toupper( cNewBase ) ) return cNewBase; int nUnionOfAlleles = nAmbiguityCodeToNumber[ toupper( cOldBase ) ] | nAmbiguityCodeToNumber[ toupper( cNewBase ) ]; char cCombinedBase = cAmbiguityCodeFromNumber[ nUnionOfAlleles ]; if ( ! ( cRefBase == 'A' || ( cCombinedBase == 'B' && cRefBase == 'C' ) ) ) { if ( cCombinedBase != 'N' ) { cCombinedBase = tolower( cCombinedBase ); } } return cCombinedBase; } void snpGenome :: doIt() { setAmbiguityTable(); setAmbiguityCodeTables(); setInsertionTable(); setDeletionTable(); setIfInsertionTable(); setIsModifiedTable(); setIsDeletionTable(); // if there are any chromosomes remaining that do not have any snps, // we will copy those chromosomes over without modification mbtValOrderedVectorOfRWCString aProcessedChromosomes( (size_t) 30 ); FILE* pSnps = fopen( filSnps_.data(), "r" ); if ( !pSnps ) { THROW_FILE_ERROR( filSnps_ ); } FileName filFilteredSnps = "filteredSnps.txt"; FILE* pFilteredSnps = fopen( filFilteredSnps.data(), "w" ); if ( !pFilteredSnps ) { THROW_FILE_ERROR( filFilteredSnps ); } FILE* pGenomeFOF = fopen( filGenome_.data(), "r" ); if ( !pGenomeFOF ) { THROW_FILE_ERROR( filGenome_ ); } FILE* pExcludedValidations = fopen( filExcludedValidations_.data(), "r" ); if ( !pExcludedValidations ) { THROW_FILE_ERROR( filExcludedValidations_ ); } aExcludedValidations_.resize( 100 ); int nLine = 0; while( fgets( soLine.data(), nMaxLineSize, pExcludedValidations ) ) { soLine.nCurrentLength_ = strlen( soLine.data() ); ++nLine; soLine.stripTrailingWhitespaceFast(); aExcludedValidations_.insert( soLine ); } fclose( pExcludedValidations ); aGenomeFOF_.resize( 30 ); nLine = 0; while( fgets( soLine.data(), nMaxLineSize, pGenomeFOF ) ) { soLine.nCurrentLength_ = strlen( soLine.data() ); ++nLine; if ( soLine.bStartsWith("#" ) ) continue; soLine.stripTrailingWhitespaceFast(); aGenomeFOF_.insert( soLine ); } fclose( pGenomeFOF ); // these will be opened on the first iteration, and when the // chromosome changes FILE* pSnpChr = 0; FILE* pChr = 0; RWCString soLastChromosome = ""; RWCString soChromosome; RWCString soLastUnwantedChromosome = ""; RWCString so0ChrStart; int n1ChrSnpStart; RWCString so1ChrEnd; int n1ChrSnpEnd; RWCString soSnpName; RWCString soStrand; RWCString soRefNCBI; RWCString soRefUCSC; RWCString soAlleleBases; RWCString soMolType; RWCString soClass; RWCString soValid; RWCString soHeterozygosity; RWCString soStdErrorOfHeterozygosity; RWCString soFunctionalCategory; RWCString soLocType; RWCString soWeight; FileName filSnpChromosomePath; FileName filRegularChromosomePath; char cRefBase; int nBasesOnLine = 0; int n1BasesInFile = 0; int n1ChrPosLastSnp; int nOverlapping = 0; int nFiltered = 0; const int nMaxChromosomeSize = 300000000; // 300,000,000; RWCString soChromosomeBases( (size_t) nMaxChromosomeSize ); RWCString soSnpChromosomeBases; RWTValVector aRSNameIndexes( nMaxChromosomeSize ); aRSNameIndexes.soName_ = "aRSNameIndexes"; // RWTValOrderedVector aRSNames( 1500000, "aRSNames" ); RWTValOrderedVector aRSNames( 15000000, "aRSNames" ); const int nMaxSnp130LineSize = 10000; RWCString soSnp130Line( (size_t) nMaxSnp130LineSize ); RWCString soSnp130LineOld( (size_t) nMaxSnp130LineSize ); int nNumberOfSnpRecordsRead = 0; int nNumberPassingValidationFilter = 0; int nNumberPassingWeightFilter = 0; int nPassingMostFilters = 0; int nPassingOverlapFilter = 0; int nPassingWantedChromosomeFilter = 0; int nNumberOfDeletionsMoved = 0; int nNumberOfFilteredSubstitutions = 0; int nNumberOfFilteredSnpInsertions = 0; int nNumberOfFilteredSnpDeletions = 0; int nNumberOfFilteredSnpEntries = 0; int nEntriesIgnoredDueToAlleleNotContainingReference = 0; int nSingleButNotExact = 0; int nSingleIgnoredBecauseAlleleIsStrange = 0; int nDeletionAlleleBasesNotMatchingUCSC = 0; int nPassingMostFiltersSingle = 0; int nPassingMostFiltersDeletion = 0; int nDeletionAllelesBothNotEmpty = 0; int nDeletionAlleleBasesNotACGT = 0; int nDeletions = 0; int nSingleExact = 0; int nSnpLinesOfWantedChromosomes = 0; int nMultipleSnpsSameLocation = 0; int nMultipleSubsSnpsSameLocation = 0; nLine = 0; while( 1 ) { // loop is exited by if ( soChromosome == "LAST" ) break if ( fgets( soSnp130Line.data(), nMaxSnp130LineSize, pSnps ) == NULL ) { soChromosome = "LAST"; fclose( pSnps ); } else { soSnp130Line.nCurrentLength_ = strlen( soSnp130Line.data() ); ++nLine; ++nNumberOfSnpRecordsRead; /* looks like: 585 chr1 10259 10260 rs72477211 0 + C C A/G genomic single unknown 0 0 unknown exact 1 585 chr1 10433 10433 rs56289060 0 + - - -/C genomic insertion unknown 0 0 unknown between 1 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 */ RWCTokenizer tok( soSnp130Line ); tok('\t'); // indexing field for ucsc database soChromosome = tok('\t'); so0ChrStart = tok('\t'); so1ChrEnd = tok('\t'); soSnpName = tok('\t'); tok('\t'); // not used soStrand = tok('\t'); soRefNCBI = tok('\t'); soRefUCSC = tok('\t'); soAlleleBases = tok('\t'); soMolType = tok('\t'); // genomic or cDNA or unknown soClass = tok('\t'); // single, indel, het, microsatellite, insertion, deletion, etc. soValid = tok('\t'); // unknown, by-cluster, by-frequency, by-submitter, by-2hit-2allele, by-hapmap, by-1000genomes if ( bIsThisValidationFieldAnExcludedOne( soValid ) ) continue; ++nNumberPassingValidationFilter; soHeterozygosity = tok('\t'); soStdErrorOfHeterozygosity = tok('\t'); soFunctionalCategory = tok('\t'); // coding-synon, coding-nonsynon, intron, etc soLocType = tok('\t' ); // range, exact, between, rangeInsertion, rangeSubstitution, rangeDeletion // notice that weight ends (in all cases I examined) with a // carriage return soWeight = tok(); // quality of the alignment if ( pCP->bSnpGenomeFilterByWeight_ ) { if ( soWeight != "1" ) continue; } ++nNumberPassingWeightFilter; int n0ChrStart; assert( bIsNumericMaybeWithWhitespace( so0ChrStart, n0ChrStart ) ); n1ChrSnpStart = n0ChrStart + 1; assert( bIsNumericMaybeWithWhitespace( so1ChrEnd, n1ChrSnpEnd ) ); assert( soStrand == "-" || soStrand == "+" ); } // if ( fgets( soSnp130Line.data(), nMaxSnp130LineSize, pSnps ) == NULL ) { if ( soChromosome != soLastChromosome ) { if ( soLastChromosome != "" ) { // debugging fprintf( stderr, "printing %s of length %d\n", soLastChromosome.data(), soSnpChromosomeBases.length() ); // end debugging const int nMaxBasesPerLine = 50; for( int n1 = 1; n1 < soSnpChromosomeBases.length(); n1 += nMaxBasesPerLine ) { int nBasesThisLine = MIN( nMaxBasesPerLine, soSnpChromosomeBases.length() - n1 ); fprintf( pSnpChr, "%.*s\n", nBasesThisLine, soSnpChromosomeBases.data() + n1 ); } fclose( pSnpChr ); // added June 4, 2010 (DG) so filteredSnps.txt is created // from same array that the fasta files (above) are for( int n2 = 1; n2 < soSnpChromosomeBases.length(); ++n2 ) { if ( bIsModifiedTable[ soSnpChromosomeBases[n2] ] ) { int nIndex = aRSNameIndexes[n2]; RWCString soRSName; if ( nIndex == -1 ) { cerr << "warning: base is " << soSnpChromosomeBases[n2] << " but index is " << nIndex << " at " << n2 << " " << soLastChromosome << endl; soRSName = ""; } else { soRSName = aRSNames[ aRSNameIndexes[n2] ]; } FPUTC_TO_JUST_SNP( soLastChromosome, n2, soSnpChromosomeBases[n2], toupper( soChromosomeBases[n2] ), soRSName ); } } writeConversionTable( filSnpChromosomePath ); fprintf( stderr, "finished with %s\n", soLastChromosome.data() ); soLastChromosome = ""; // indicates that soLastChromosome // is closed, so that we don't try to close it again. This // would otherwise happen if it is followed by a chromosome // we don't want, so continue and come around the loop again } if ( soChromosome == "LAST" ) { break; // exits loop } // handle case of snps from chromosomes we don't want, such as the // "random" ones: if ( soChromosome == soLastUnwantedChromosome ) { continue; } bool bWantThisChromosome = false; getPathsFromChromosome( soChromosome, filRegularChromosomePath, filSnpChromosomePath, bWantThisChromosome ); if ( !bWantThisChromosome ) { fprintf( stderr, "skipping %s\n", soChromosome.data() ); soLastUnwantedChromosome = soChromosome; continue; } fprintf( stderr, "starting to process %s\n", soChromosome.data() ); pChr = fopen( filRegularChromosomePath.data(), "r" ); if ( !pChr ) { THROW_FILE_ERROR( filRegularChromosomePath ); } pSnpChr = fopen( filSnpChromosomePath.data(), "w" ); if ( !pSnpChr ) { THROW_FILE_ERROR( filSnpChromosomePath ); } // get over the header line. That will be terminated with the // first carriage return. char cChar; do { cChar = fgetc( pChr ); fputc( cChar, pSnpChr ); } while( cChar != '\n' ); const int nMaxChrLineSize = 5000; RWCString soChrLine( (size_t) ( nMaxChrLineSize + 1 ) ); soChromosomeBases = " "; // this makes the index 1-based while( fgets( soChrLine.data(), nMaxChrLineSize, pChr ) != NULL ) { soChrLine.nCurrentLength_ = strlen( soChrLine.data() ); soChrLine.stripTrailingWhitespaceFast(); soChrLine.toUpper(); soChromosomeBases += soChrLine; } fclose( pChr ); fprintf( stderr, "finished reading %s\n", filRegularChromosomePath.data() ); soSnpChromosomeBases = soChromosomeBases; // convert any N's to n for( int n1 = 1; n1 < soSnpChromosomeBases.length(); ++n1 ) { if ( soSnpChromosomeBases[ n1 ] == 'N' ) soSnpChromosomeBases[ n1 ] = 'n'; } // this will allow aRSNameIndexes to be referenced by the same // index as soChromosomeBases aRSNameIndexes.reshapeIfNotLargeEnough( soChromosomeBases.length() ); aRSNameIndexes.setAllValues( -1 ); aRSNames.clear(); nBasesOnLine = 0; n1BasesInFile = 0; n1ChrPosLastSnp = 0; aProcessedChromosomes.insert( soChromosome ); soLastChromosome = soChromosome; } // if ( soChromosome != soLastChromosome ) { ++nSnpLinesOfWantedChromosomes; bool bWeWantThis = false; if ( soClass == "single" && soLocType == "exact" ) { ++nSingleExact; soAlleleBases.removeSingleCharacterAllOccurrences( '/' ); // I don't know if this test is valid for categories other // than single/exact. We'll need to look at deletion and // insertion to see if it is also valid there. soAlleleBases.toUpper(); if ( soAlleleBases.bContainsOnlyTheseCharacters( "ACGTacgt" ) ) { if ( soStrand == "-" ) { soAlleleBases = soComplementSO( soAlleleBases ); } // checking that allele includes the reference base cRefBase = toupper( soChromosomeBases[ n1ChrSnpStart ] ); assert( soRefNCBI.length() == 1 ); assert( soRefUCSC.length() == 1 ); if ( cRefBase != toupper( soRefUCSC[0]) ) { fprintf( stderr, "%d ref %c doesn't match ucsc: %s ncbi: %s %s", n1ChrSnpStart, cRefBase, soRefUCSC.data(), soRefNCBI.data(), soSnp130Line.data() ); RWCString soError = "ref sequence at 1-based chr pos " + RWCString( (long) n1ChrSnpStart ) + " is " + RWCString( cRefBase ) + " doesn't match ucsc " + soRefUCSC + " ncbi = " + soRefNCBI; // I'm not going to throw an error for these cases: // 60805574 ref M doesn't match N how about M 1048 chr3 60805573 60805574 rs62249331 0 + M N A/C genomic single unknown 00intron exact 1 // 60805803 ref R doesn't match N how about R 1048 chr3 60805802 60805803 rs62249332 0 + R N A/G genomic single unknown 00intron exact 1 // 60805804 ref R doesn't match N how about R 1048 chr3 60805803 60805804 rs62249333 0 + R N A/G genomic single unknown 00intron exact 1 if ( cRefBase == 'A' || cRefBase == 'C' || cRefBase == 'G' || cRefBase == 'T' ) { THROW_ERROR2( soError ); } } // if ( cRefBase != toupper( soRefUCSC[0]) ) { bool bReferenceFound = false; for( int n = 0; n < soAlleleBases.length(); ++n ) { if ( soAlleleBases[n] == cRefBase ) { bReferenceFound = true; break; } } if ( bReferenceFound ) { bWeWantThis = true; } else { ++nEntriesIgnoredDueToAlleleNotContainingReference; } } // if ( soAlleleBases.bContainsOnlyTheseCharacters( "ACGTacgt" ) ) { else { ++nSingleIgnoredBecauseAlleleIsStrange; } } // if ( soClass == "single" && soLocType == "exact" ) { // removed 5/11/10 when changed to memory array soSnpChromosomeBases // else if ( soClass == "insertion" && soLocType == "between" ) { // if ( !pCP->bSnpGenomeUseInsertionPolymorphisms_ ) continue; // assert( soAlleleBases.bStartsWithAndRemove( "-/" ) ); // soAlleleBases.toUpper(); // if ( soAlleleBases.bContainsOnlyTheseCharacters( "ACGT" ) ) { // if ( soStrand == "-" ) { // soAlleleBases = soComplementSO( soAlleleBases ); // } // // checks show that, using the UCSC coordinates, // // insertions are between n1ChrSnpEnd and // // n1ChrSnpEnd + 1 // n1ChrPosLastBaseToCopy = n1ChrSnpEnd; // bWeWantThis = true; // } // } else if ( soClass == "deletion" ) { ++nDeletions; assert( soLocType == "exact" || soLocType == "range" ); if ( soAlleleBases.bStartsWithAndRemove( "-/" ) ) { // now soAlleleBases is just the bases if ( soAlleleBases.bContainsOnlyTheseCharacters( "ACGT" ) ) { if ( soStrand == "-" ) { soAlleleBases = soComplementSO( soAlleleBases ); } if ( soAlleleBases == soRefUCSC ) { bWeWantThis = true; // at this point, look left in reference sequence // to see if this is in a mononucleotide repeat // and change the snp to the furthest left within // the repeat int nLengthOfDeletion = n1ChrSnpEnd - n1ChrSnpStart + 1; assert( soRefUCSC == soChromosomeBases( n1ChrSnpStart, nLengthOfDeletion ) ); /* algorithm for sliding the deleted segment rightward: slide it 1 base at a time and see if, when removing it, the remaining sequence is not distinguishable from the original. If this is true, use that as the starting location and do this again. The way we see if the remaining sequence is indistinguishable is to see if: a---X ^n1ChrOfLeftMostBaseOfDeletion ^n1ChrOfRightMostBaseOfDeletion ^n1ChrOfRightMostBaseOfDeletion + 1 so if a = X, then move it to right by 1 base: a---X */ int n1ChrOfLeftMostBaseOfDeletion = n1ChrSnpStart; bool bContinueTryingToMoveDeletion = true; while( bContinueTryingToMoveDeletion ) { int n1ChrOfRightMostBaseOfDeletion = n1ChrOfLeftMostBaseOfDeletion + nLengthOfDeletion - 1; // this is a half-hearted handling of deletions // right at the end of the chromosome. In // some cases they might be able to be moved further // right. if ( n1ChrOfRightMostBaseOfDeletion == soChromosomeBases.length() - 1 ) break; if ( soChromosomeBases[ n1ChrOfLeftMostBaseOfDeletion ] == soChromosomeBases[ n1ChrOfRightMostBaseOfDeletion + 1 ] ) { ++n1ChrOfLeftMostBaseOfDeletion; } else { bContinueTryingToMoveDeletion = false; } } // while( bContinueTryingToMoveDeletion ) { // did we move the deletion? if ( n1ChrOfLeftMostBaseOfDeletion != n1ChrSnpStart ) { n1ChrSnpStart = n1ChrOfLeftMostBaseOfDeletion; n1ChrSnpEnd = n1ChrSnpStart + nLengthOfDeletion - 1; ++nNumberOfDeletionsMoved; // note: this deletion might be eliminated // due to overlapping another one, especially // if there are multiple entries that all // move to the same location } } else { ++nDeletionAlleleBasesNotMatchingUCSC; } } else { ++nDeletionAlleleBasesNotACGT; } } else { ++nDeletionAllelesBothNotEmpty; } } // else if ( soClass == "deletion" ) { else if ( soClass == "single" && soLocType != "exact" ) { ++nSingleButNotExact; } else if ( soClass == "microsatellite" ) { } if ( bWeWantThis ) { ++nPassingMostFilters; if ( soClass == "single" ) { ++nPassingMostFiltersSingle; } else if ( soClass == "deletion" ) { ++nPassingMostFiltersDeletion; } } if ( bWeWantThis ) { ++nFiltered; } if ( !bWeWantThis ) continue; // get next snp ++nPassingOverlapFilter; // copy all bases until get to nChrStart // if the current base is just before the snp base, we don't // want this loop executed even once. Thus the last loop should // be when the current base is 2 bases before. if ( n1ChrSnpEnd > ( soChromosomeBases.length() - 1 ) ) { // error condition--there is a snp position waiting, but // there are not that many bases RWCString soError = "snp line: " + soSnp130Line + " says that in " + soChromosome + " pos " + so0ChrStart + " there is a snp but chromosome file " + filRegularChromosomePath + " does not have that many bases"; THROW_ERROR2( soError ); } ++nPassingWantedChromosomeFilter; if ( soClass == "single" ) { // this check is to be sure that we haven't already handled // this particular base. This can occur when there are multiple // snp lines for the same base. // if ( n1ChrPosLastSnp < n1ChrSnpStart ) { // get snp base if ( n1ChrSnpStart > ( soChromosomeBases.length() - 1 ) ) { RWCString soError = "looking for base n1ChrSnpStart = " + RWCString( (long) n1ChrSnpStart ) + " but the regular chromosome file " + filRegularChromosomePath + " ended prematurely"; THROW_ERROR2( soError ); } cRefBase = soChromosomeBases[ n1ChrSnpStart ]; assert( soRefNCBI.length() == 1 ); assert( soRefUCSC.length() == 1 ); if ( cRefBase != toupper( soRefUCSC[0]) ) { fprintf( stderr, "%d ref %c doesn't match %s how about %s %s", n1ChrSnpStart, cRefBase, soRefUCSC.data(), soRefNCBI.data(), soSnp130Line.data() ); RWCString soError = "ref sequence at 1-based chr pos " + RWCString( (long) n1ChrSnpStart ) + " is " + RWCString( cRefBase ) + " doesn't match ucsc " + soRefUCSC + " or ncbi " + soRefNCBI; THROW_ERROR2( soError ); } cRefBase = toupper( cRefBase ); bool bReferenceFound = false; char cBaseOtherThanReference = 0; for( int n = 0; n < soAlleleBases.length(); ++n ) { if ( soAlleleBases[n] != cRefBase ) { cBaseOtherThanReference = soAlleleBases[n]; } else { bReferenceFound = true; } } // this was checked above assert( bReferenceFound ); char cBaseToWrite; if ( soAlleleBases.length() == 1 ) { // how could this happen? If the "observed" field just // contains the reference. There may be other ways. cBaseToWrite = cRefBase; } else if ( soAlleleBases.length() == 2 ) { cBaseToWrite = cAmbiguityTable[cRefBase][cBaseOtherThanReference ]; } else { soAlleleBases.sortLetters(); if ( soAlleleBases == "ACG" ) { cBaseToWrite = 'V'; } else if( soAlleleBases == "ACT" ) { cBaseToWrite = 'H'; } else if( soAlleleBases == "AGT" ) { cBaseToWrite = 'D'; } else if( soAlleleBases == "CGT" ) { cBaseToWrite = 'B'; } else if( soAlleleBases == "ACGT" ) { cBaseToWrite = 'N'; } else { RWCString soError = "observed field " + soAlleleBases + " contained something other than ACGT: " + soSnp130Line; THROW_ERROR2( soError ); } // if cRefBase is an A, then leave upper case // if cRefBase is C, G, or T: // make lowercase in case of VHDN // if cBaseToWrite is B, then leave uppercase if // cRefBase is C. If it is not C, make it lowercase // Problem with N--it must be left N even if the // reference base is A. if ( ! ( cRefBase == 'A' || ( cBaseToWrite == 'B' && cRefBase == 'C' ) ) ) { if ( cBaseToWrite != 'N' ) { cBaseToWrite = tolower( cBaseToWrite ); } } } ++nNumberOfFilteredSnpEntries; ++nNumberOfFilteredSubstitutions; // changed 5/10/10 // note that even if it has already been changed, it // will be changed again because a substitution overrides // a deletion RWCString soSnpNameToWrite = soSnpName; if ( bIsModifiedTable[ soSnpChromosomeBases[ n1ChrSnpStart ] ] ) { if ( ( soSnpChromosomeBases[ n1ChrSnpStart ] != cBaseToWrite ) ) { ++nMultipleSnpsSameLocation; if ( !bIsDeletion[ cBaseToWrite ] && !bIsDeletion[ soSnpChromosomeBases[ n1ChrSnpStart ] ] ) { ++nMultipleSubsSnpsSameLocation; } } bool bBothAreSubstitutions; cBaseToWrite = cCombineBases( soSnpChromosomeBases[ n1ChrSnpStart ], cBaseToWrite, cRefBase, bBothAreSubstitutions ); if ( bBothAreSubstitutions ) { RWCString soExistingSnp = aRSNames[ aRSNameIndexes[ n1ChrSnpStart ] ]; soSnpNameToWrite += " "; soSnpNameToWrite += soExistingSnp; } } soSnpChromosomeBases[ n1ChrSnpStart ] = cBaseToWrite; aRSNames.append( soSnpNameToWrite ); aRSNameIndexes[ n1ChrSnpStart ] = aRSNames.length() - 1; // removed June 4, 2010 // FPUTC_TO_JUST_SNP( n1ChrSnpStart, cBaseToWrite, cRefBase ); n1ChrPosLastSnp = n1ChrSnpStart; // } // if ( n1ChrPosRead < n1ChrSnpStart ) { } // if ( soClass == "single" ) { // not handling insertions 5/10/10 // else if ( soClass == "insertion" ) { // if ( n1ChrPosRead != n1ChrPosLastBaseToCopy ) { // fprintf( stderr, "n1ChrPosRead = %s n1ChrPosLastBaseToCopy = %s line %s old line %s\n", // soAddCommas( n1ChrPosRead ).data(), // soAddCommas( n1ChrPosLastBaseToCopy ).data(), // soSnp130Line.data(), // soSnp130LineOld.data() ); // } // assert( n1ChrPosRead == n1ChrPosLastBaseToCopy ); // // for( int n = 0; n < soAlleleBases.length(); ++n ) { // // char cBaseToWrite = cInsertionTable[ soAlleleBases[n] ]; // // FPUTC_TO_SNP( cBaseToWrite ); // // } // ++nNumberOfFilteredSnpEntries; // ++nNumberOfFilteredSnpInsertions; // } else if ( soClass == "deletion" ) { int nLengthOfDeletion = n1ChrSnpEnd - n1ChrSnpStart + 1; assert( nLengthOfDeletion == soRefUCSC.length() ); if ( n1ChrSnpEnd > ( soChromosomeBases.length() - 1 ) ) { THROW_ERROR2( "premature end of file " + filRegularChromosomePath + " while looking for deletion " + soSnp130Line ); } RWCString soReadReference = soChromosomeBases( n1ChrSnpStart, nLengthOfDeletion ); // soReadReference.toUpper(); assert( soReadReference.length() == soRefUCSC.length() ); RWCString soCirculatedRefUCSC = soRefUCSC; if ( soReadReference != soRefUCSC ) { bool bOK = false; if ( soReadReference.length() > 1 ) { // see if it matches circulating for( int n = 0; n < ( soCirculatedRefUCSC.length() - 1 ); ++n ) { soCirculatedRefUCSC = soCirculatedRefUCSC.cGetLastChar() + soCirculatedRefUCSC(0, soCirculatedRefUCSC.length() - 1 ); if ( soCirculatedRefUCSC == soReadReference ) { bOK = true; break; } } } if ( !bOK ) { RWCString soError = "RefUCSC = " + soRefUCSC + " but part of reference (after moving deletion tags) = " + soReadReference + " which are not equal even after permuting with line" + soSnp130Line; THROW_ERROR2( soError ); } } // assert( soReadReference == soRefUCSC ); RWCString soRSNameToWrite = soSnpName; aRSNames.append( soRSNameToWrite ); int nIndexOfRSName = aRSNames.length() - 1; for( int n = 0; n < nLengthOfDeletion; ++n ) { char cBaseToWrite = cDeletionTable[ soReadReference[n] ]; if ( !bIsModifiedTable[ soSnpChromosomeBases[ n1ChrSnpStart + n ] ] ) { soSnpChromosomeBases[ n1ChrSnpStart + n ] = cBaseToWrite; aRSNameIndexes[ n1ChrSnpStart + n ] = nIndexOfRSName; // removed June 4, 2010 // FPUTC_TO_JUST_SNP( n1ChrSnpStart + n, // cBaseToWrite, soCirculatedRefUCSC[n] ); } } n1ChrPosLastSnp = n1ChrSnpStart + nLengthOfDeletion - 1; ++nNumberOfFilteredSnpEntries; ++nNumberOfFilteredSnpDeletions; } // else if ( soClass == "deletion" ) { soSnp130LineOld = soSnp130Line; } // while( 1 ) { // loop is exited by if ( soChromosome == "LAST" ) break fprintf( stderr, "nNumberOfSnpRecordsRead = %12s\n", soAddCommas( nNumberOfSnpRecordsRead).data() ); fprintf( stderr, "nNumberPassingValidationFilter = %12s\n", soAddCommas( nNumberPassingValidationFilter).data() ); fprintf( stderr, "nNumberPassingWeightFilter = %12s\n", soAddCommas( nNumberPassingWeightFilter).data() ); fprintf( stderr, "nSnpLinesOfWantedChromosomes: %12s\n", soAddCommas( nSnpLinesOfWantedChromosomes ).data() ); fprintf( stderr, "nSingleExact: %12s\n", soAddCommas( nSingleExact ).data() ); fprintf( stderr, "nDeletions: %12s\n", soAddCommas( nDeletions ).data() ); fprintf( stderr, "nPassingMostFilters = %12s\n", soAddCommas( nPassingMostFilters).data() ); fprintf( stderr, "nPassingOverlapFilter = %12s\n", soAddCommas( nPassingOverlapFilter).data() ); fprintf( stderr, "nPassingWantedChromosomeFilter = %12s\n", soAddCommas( nPassingWantedChromosomeFilter ).data() ); // fprintf( stderr, "filtered = %12s\n", soAddCommas( nFiltered ).data() ); // fprintf( stderr, "overlapping = %12s\n", soAddCommas( nOverlapping ).data() ); fprintf( stderr, "nEntriesIgnoredDueToAlleleNotContainingReference = %d\n", nEntriesIgnoredDueToAlleleNotContainingReference ); fprintf( stderr, "final filtered = %12s\n", soAddCommas( nNumberOfFilteredSnpEntries ).data() ); fprintf( stderr, "filtered deletions: %12s\n", soAddCommas( nNumberOfFilteredSnpDeletions ).data() ); fprintf( stderr, "filtered insertions: %12s\n", soAddCommas( nNumberOfFilteredSnpInsertions ).data() ); fprintf( stderr, "filtered substitutions: %12s\n", soAddCommas( nNumberOfFilteredSubstitutions ).data() ); fprintf( stderr, "deletions moved (perhaps redund. %12s\n", soAddCommas( nNumberOfDeletionsMoved ).data() ); fprintf( stderr, "single but not exact: %12s\n", soAddCommas( nSingleButNotExact ).data() ); fprintf( stderr, "nSingleIgnoredBecauseAlleleIsStrange: %12s\n", soAddCommas( nSingleIgnoredBecauseAlleleIsStrange ).data() ); fprintf( stderr, "nDeletionAlleleBasesNotMatchingUCSC: %12s\n", soAddCommas( nDeletionAlleleBasesNotMatchingUCSC ).data() ); fprintf( stderr, "nPassingMostFiltersSingle: %12s\n", soAddCommas( nPassingMostFiltersSingle ).data() ); fprintf( stderr, "nPassingMostFiltersDeletion: %12s\n", soAddCommas( nPassingMostFiltersDeletion ).data() ); fprintf( stderr, "nDeletionAllelesBothNotEmpty: %12s\n", soAddCommas( nDeletionAllelesBothNotEmpty ).data() ); fprintf( stderr, "nDeletionAlleleBasesNotACGT: %12s\n", soAddCommas( nDeletionAlleleBasesNotACGT ).data() ); fprintf( stderr, "nSingleExact: %12s\n", soAddCommas( nSingleExact ).data() ); fprintf( stderr, "nMultipleSnpsSameLocation: %12s\n", soAddCommas( nMultipleSnpsSameLocation ).data() ); fprintf( stderr, "nMultipleSubsSnpsSameLocation: %12s\n", soAddCommas( nMultipleSubsSnpsSameLocation ).data() ); // should check if there are any chromosomes that are left // unprocessed int n; for( n = 0; n < aGenomeFOF_.length(); ++n ) { RWCTokenizer tok( aGenomeFOF_[n] ); RWCString soChromosome = tok(); assert( !soChromosome.bIsNull() ); if ( aProcessedChromosomes.bContains( soChromosome ) ) continue; // if reached here, the chromosome has not been processed. Just // copy it to the new location fprintf( stderr, "no snps for %s so now processing it\n", soChromosome.data() ); FileName filRegularChromosomePath = (FileName) tok(); FileName filSnpChromosomePath = (FileName) tok(); assert( !filSnpChromosomePath.bIsNull() ); fprintf( stderr, "starting to process %s\n", soChromosome.data() ); pChr = fopen( filRegularChromosomePath.data(), "r" ); if ( !pChr ) { THROW_FILE_ERROR( filRegularChromosomePath ); } pSnpChr = fopen( filSnpChromosomePath.data(), "w" ); if ( !pSnpChr ) { THROW_FILE_ERROR( filSnpChromosomePath ); } // get over the header line. That will be terminated with the // first carriage return. char cChar; do { cChar = fgetc( pChr ); fputc( cChar, pSnpChr ); } while( cChar != '\n' ); nBasesOnLine = 0; n1BasesInFile = 0; char cBase; while( ( cBase = fgetc( pChr ) ) != EOF ) { if ( isspace( cBase ) ) continue; cBase = toupper( cBase ); if ( cBase == 'N' ) cBase = 'n'; FPUTC_TO_SNP( cBase ); } // final carriage return in the file fputc( '\n', pSnpChr ); fclose( pSnpChr ); fclose( pChr ); FileName filSnpChromosomeTable = filSnpChromosomePath; assert( filSnpChromosomeTable.bEndsWithAndRemove( ".fa" ) ); filSnpChromosomeTable += ".table"; // this file will contain one line, since there will be no insertions FILE* pSnpChrTable = fopen( filSnpChromosomeTable.data(), "w" ); if ( !pSnpChrTable ) { THROW_FILE_ERROR( filSnpChromosomeTable ); } fprintf( pSnpChrTable, "%d 0\n", n1BasesInFile ); fclose( pSnpChrTable ); fprintf( stderr, "finished with %s\n", soLastChromosome.data() ); } // for( n = 0; n < aGenomeFOF_.length(); ++n ) { fprintf( stderr, "for output chromosomes, see:\n" ); for( n = 0; n < aGenomeFOF_.length(); ++n ) { fprintf( stderr, "%s\n", aGenomeFOF_[n].data() ); } fclose( pFilteredSnps ); } void snpGenome :: getPathsFromChromosome( const RWCString& soChromosome, FileName& filRegularChromosomePath, FileName& filSnpChromosomePath, bool& bWantThisChromosome ) { for( int nChr = 0; nChr < aGenomeFOF_.length(); ++nChr ) { RWCString soLine = aGenomeFOF_[ nChr ]; RWCTokenizer tok( soLine ); if ( soChromosome == tok() ) { // found chromosome filRegularChromosomePath = (FileName) tok(); filSnpChromosomePath = (FileName) tok(); assert( !filSnpChromosomePath.bIsNull() ); bWantThisChromosome = true; return; } } bWantThisChromosome = false; return; // RWCString soError = "could not find " + soChromosome + " in aGenomeFOF_"; // THROW_ERROR2( soError ); } bool snpGenome :: bIsThisValidationFieldAnExcludedOne( const RWCString& soValid ) { bool bExclude = aExcludedValidations_.bContains( soValid ); return bExclude; } void snpGenome :: writeConversionTable( const FileName& filSnpChromosomePath ) { FileName filSnpChromosomeTable = filSnpChromosomePath; assert( filSnpChromosomeTable.bEndsWithAndRemove( ".fa" ) ); filSnpChromosomeTable += ".table"; FILE* pSnpChrTable = fopen( filSnpChromosomeTable.data(), "w" ); if ( !pSnpChrTable ) { THROW_FILE_ERROR( filSnpChromosomeTable ); } FILE* pSnpChrReadOnly = fopen( filSnpChromosomePath.data(), "r" ); if ( !pSnpChrReadOnly ) { THROW_FILE_ERROR( filSnpChromosomePath ); } // throw away the header line assert( fgets( soLine.data(), nMaxLineSize, pSnpChrReadOnly ) ); assert( soLine[0] == '>' ); int nNotInsertedBases = 0; int nInsertedBases = 0; unsigned char ucLastState = ucNOT_INSERTION_BASE; while( fgets( soLine.data(), nMaxLineSize, pSnpChrReadOnly ) ) { soLine.nCurrentLength_ = strlen( soLine.data() ); // get rid of carriage return soLine.stripTrailingWhitespaceFast(); for( int n = 0; n < soLine.length(); ++n ) { if ( ucIfInsertionTable[ soLine[n] ] == ucINSERTION_BASE ) { if ( ucLastState == ucINSERTION_BASE ) { ++nInsertedBases; } else { // just started a stretch of inserted bases nInsertedBases = 1; ucLastState = ucINSERTION_BASE; } } else { if ( ucLastState == ucINSERTION_BASE ) { // just ended a stretch of inserted bases fprintf( pSnpChrTable, "%d %d\n", nNotInsertedBases, nInsertedBases ); nNotInsertedBases = 1; ucLastState = ucNOT_INSERTION_BASE; } else { ++nNotInsertedBases; } } } } // just finished chromosome if ( ucLastState == ucINSERTION_BASE ) { fprintf( pSnpChrTable, "%d %d\n", nNotInsertedBases, nInsertedBases ); } else { // case in which the chromosome ends with non-inserted bases. This // is a special case in which last line is # 0 where 0 is the // number of inserted bases fprintf( pSnpChrTable, "%d 0\n", nNotInsertedBases ); } fclose( pSnpChrTable ); fclose( pSnpChrReadOnly ); }