/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ #include "geneClassifications.h" #include "cGetProtein.h" #include #include "soLine.h" #include "rwctokenizer.h" #include "max.h" #include "bIntersect.h" #include "cComplementBase.h" #include "soGetDateTime.h" #include "soAddCommas.h" #include #include "assert.h" const int cUTR = 'u'; const int cINTERGENIC = 'n'; const int cINTRON = 'i'; const int cSPLICE_SITE = 'p'; const int cSYN_CODING = 's'; const int cNONSYN_CODING = 'o'; const int cFROM_STOP_CODON = 'f'; const int cTO_STOP_CODON = 't'; #define szTranslateGeneType( CODE ) ( \ ( CODE == cUTR ) ? "utr" : (( CODE == cINTERGENIC ) ? "intergenic" : (( CODE == cINTRON ) ? "intron" : ( CODE == cSPLICE_SITE ) ? "splice" : ((( CODE == cSYN_CODING ) ? "syn" : (( CODE == cNONSYN_CODING ) ? "nonsyn" : (( CODE == cFROM_STOP_CODON ) ? "from_stop" : (( CODE == cTO_STOP_CODON ) ? "to_stop" : "???" )))))))) void geneClassifications :: doIt() { FileName filOutputFile = filGenomicLocations_.filReplaceExtension( "class" ); pOutputFile_ = fopen( filOutputFile.data(), "w" ); if ( !pOutputFile_ ) { THROW_FILE_ERROR( filOutputFile ); } readKnownGene(); readChromosomesFOF(); readPhyloFOF(); FILE* pGenomicLocations = fopen( filGenomicLocations_.data(), "r" ); if ( !pGenomicLocations ) { THROW_FILE_ERROR( filGenomicLocations_ ); } RWCString soChromosome( (size_t) 200 ); // sscanf is used on this RWCString soLastChromosome; RWCString soSnpLine( (size_t) nMaxLineSize ); int n1ChrPos; RWCString soAlternativeAlleles( (size_t) 200 ); // sscanf is used on this while( 1 ) { if ( fgets( soSnpLine.data(), nMaxLineSize, pGenomicLocations ) == NULL ) { soChromosome = "LAST"; } else { soSnpLine.nCurrentLength_ = strlen( soSnpLine.data() ); soSnpLine.stripTrailingWhitespaceFast(); if ( soSnpLine.bIsNull() ) continue; parseCurrentLine( soSnpLine, soChromosome, n1ChrPos, soAlternativeAlleles ); } if ( soChromosome != soLastChromosome ) { // finishLastChromosome( ); if ( soChromosome == "LAST" ) break; startNextChromosome( soChromosome ); soLastChromosome = soChromosome; } processCurrentLine( soChromosome, n1ChrPos, soAlternativeAlleles ); } fclose( pOutputFile_ ); cerr << "see " << filOutputFile << endl; } void geneClassifications :: readChromosomesFOF() { FILE* pChromosomesFOF = fopen( filChromosomesFOF_.data(), "r" ); if ( !pChromosomesFOF ) { THROW_FILE_ERROR( filChromosomesFOF_ ); } while( fgets( soLine.data(), nMaxLineSize, pChromosomesFOF ) != NULL ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); aChromosomeFiles_.append( soLine ); } fclose( pChromosomesFOF ); } void geneClassifications :: readPhyloFOF() { if ( filPhyloFOF_.bIsNull() ) return; FILE* pPhyloFOF = fopen( filPhyloFOF_.data(), "r" ); if ( !pPhyloFOF ) { THROW_FILE_ERROR( filPhyloFOF_ ); } while( fgets( soLine.data(), nMaxLineSize, pPhyloFOF ) != NULL ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); aPhyloFiles_.append( soLine ); } fclose( pPhyloFOF ); } void geneClassifications :: parseCurrentLine( const RWCString& soLine, RWCString& soChromosome, int& n1ChrPos, RWCString& soAlternativeAlleles ) { // assuming line looks like: // chr1 38383 soChromosome = " "; // just so there is enough room sscanf( soLine.data(), "%s %d %s\n", soChromosome.data(), &n1ChrPos, soAlternativeAlleles.data() ); soChromosome.nCurrentLength_ = strlen( soChromosome.data() ); soAlternativeAlleles.nCurrentLength_ = strlen( soAlternativeAlleles.data() ); } void geneClassifications :: startNextChromosome( const RWCString& soChromosome ) { RWCString soPattern = soChromosome + ".fa"; RWCString soFoundChromosomeFile; int nChr; for( nChr = 0; nChr < aChromosomeFiles_.length(); ++nChr ) { RWCString soChromosomeFile = aChromosomeFiles_[ nChr ]; if ( soChromosomeFile.bContains( soPattern ) ) { soFoundChromosomeFile = soChromosomeFile; break; } } cerr << "reading " << soFoundChromosomeFile << endl; cerr.flush(); FILE* pChromosome = fopen( soFoundChromosomeFile.data(), "r" ); if ( !pChromosome ) { THROW_FILE_ERROR( soFoundChromosomeFile ); } assert( fgets( soLine.data(), nMaxLineSize, pChromosome ) != NULL ); assert( soLine[0] == '>' ); soChromosomeBases_ = " "; // to make index 1-based while( fgets( soLine.data(), nMaxLineSize, pChromosome ) != NULL ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); soChromosomeBases_ += soLine; } fclose( pChromosome ); cerr << "done" << endl; cerr.flush(); if ( !filPhyloFOF_.bIsNull() ) { aConservationScores_.increaseButNotDecreaseMaxLength( soChromosomeBases_.length() + 1 ); aConservationScores_.nCurrentLength_ = soChromosomeBases_.length() + 1; for( int n = 1; n <= soChromosomeBases_.length(); ++n ) { aConservationScores_[n] = -666.0; } // find Phylo file: RWCString soPhyloPattern = soChromosome + "."; FileName filFoundPhyloFile; for( nChr = 0; nChr < aPhyloFiles_.length(); ++nChr ) { if ( aPhyloFiles_[nChr].bContains( soPhyloPattern ) ) { filFoundPhyloFile = aPhyloFiles_[ nChr ]; break; } } cerr << "reading " << filFoundPhyloFile << endl; cerr.flush(); FILE* pPhylo = fopen( filFoundPhyloFile.data(), "r" ); if ( !pPhylo ) { THROW_FILE_ERROR( filFoundPhyloFile ); } char szChromosome[100]; float fConsScore; int nNextChrPos; int nChrPos; while( fgets( soLine.data(), nMaxLineSize, pPhylo ) != NULL ) { // soLine.nCurrentLength_ = strlen( soLine.data() ); if ( sscanf( soLine.data(), "%f", &fConsScore ) == 1 ) { ++nChrPos; aConservationScores_[ nChrPos ] = fConsScore; } else if ( sscanf( soLine.data(), "fixedStep chrom=%s start=%d step=1", szChromosome, &nNextChrPos ) ) { assert( soChromosome == RWCString( szChromosome ) ); nChrPos = nNextChrPos - 1; } else { soLine.nCurrentLength_ = strlen( soLine.data() ); RWCString soError = "do not understand conservation line " + soLine; THROW_ERROR( soError ); } } fclose( pPhylo ); cerr << "done" << endl; cerr.flush(); } } static int nCompareLocations( const RWCString& soChromosome, const int n1ChrPos, const RWCString& soKnownGeneLine ) { // looks like: // uc001aaa.2 chr1 + 1115 4121 // 0 1 2 3 4 RWCTokenizer tok( soKnownGeneLine ); tok(); RWCString soKnownGeneChromosome = tok(); tok(); // strand RWCString so0Start = tok(); // 0-based if ( soChromosome < soKnownGeneChromosome ) return -1; else if ( soChromosome > soKnownGeneChromosome ) return 1; else { int n1KnownGeneChrPos = atoi( so0Start.data() ) + 1; if ( n1ChrPos < n1KnownGeneChrPos ) return -1; else if ( n1ChrPos > n1KnownGeneChrPos ) return 1; else return 0; } } void geneClassifications :: readKnownGene() { FILE* pKnownGene = fopen( filKnownGene_.data(), "r" ); if ( !pKnownGene ){ THROW_FILE_ERROR( filKnownGene_ ); } // this is in ctor aKnownGene_.resize( 70000 ); const int nMaxKnownGeneLineSize = 1000000; RWCString soKnownGeneLine( (size_t) nMaxKnownGeneLineSize + 2 ); int nMaxLengthSoFar = 0; while( fgets( soKnownGeneLine.data(), nMaxKnownGeneLineSize, pKnownGene ) != NULL ) { soKnownGeneLine.nCurrentLength_ = strlen( soKnownGeneLine.data() ); if ( soKnownGeneLine.cGetLastChar() != '\n' ) { THROW_ERROR2( "line " + soKnownGeneLine + " didn't end with a cr" ); } soKnownGeneLine.removeFinalCR(); if ( nMaxLengthSoFar < soKnownGeneLine.length() ) { nMaxLengthSoFar = soKnownGeneLine.length(); } aKnownGene_.insert( soKnownGeneLine ); } printf( "longest line %d\n", nMaxLengthSoFar ); fclose( pKnownGene ); cerr << "done" << endl; cerr.flush(); // check that knownGene is sorted for( int nKnownGene = 1; nKnownGene < aKnownGene_.length(); ++nKnownGene ) { RWCString soKnownGene1 = aKnownGene_[nKnownGene - 1]; RWCString soKnownGene2 = aKnownGene_[nKnownGene]; RWCTokenizer tok1( soKnownGene1 ); RWCTokenizer tok2( soKnownGene2 ); // looks like: // uc001aaa.2 chr1 + 1115 4121 // 0 1 2 3 4 tok1('\t'); tok2('\t'); RWCString soChromosome1 = tok1('\t'); RWCString soChromosome2 = tok2('\t'); // printf( "dddd %s\n", soChromosome1.data() ); // printf( "eeee %s\n", soChromosome2.data() ); if ( soChromosome1 > soChromosome2 ) { printf( "known genes at index %d has %s then %s\n", nKnownGene, soChromosome1.data(), soChromosome2.data() ); } tok1('\t'); tok2('\t'); RWCString so0Start1 = tok1('\t'); RWCString so0Start2 = tok2('\t'); int n1ChrPos1 = atoi( so0Start1.data() ) + 1; if ( nCompareLocations( soChromosome1, n1ChrPos1, soKnownGene2 ) > 0 ) { printf( "genes out of order: %s %s %s %s\n", soChromosome1.data(), soChromosome2.data(), so0Start1.data(), so0Start2.data() ); } } // for( int nKnownGene = 1; } void geneClassifications :: processCurrentLine( const RWCString& soChromosome, const int n1ChrPos, const RWCString& soAlternativeAlleles ) { char cNormalAA = '?'; char cSnpAA = '?'; char cGeneType = cFindGeneTypeAllGenes( soChromosome, n1ChrPos, soAlternativeAlleles, cNormalAA, cSnpAA ); if ( filPhyloFOF_.bIsNull() ) { fprintf( pOutputFile_, "%s %d %s %s %c %c\n", soChromosome.data(), n1ChrPos, soAlternativeAlleles.data(), szTranslateGeneType( cGeneType ), cNormalAA, cSnpAA ); } else { fprintf( pOutputFile_, "%s %d %s %s %c %c %.3f\n", soChromosome.data(), n1ChrPos, soAlternativeAlleles.data(), szTranslateGeneType( cGeneType ), cNormalAA, cSnpAA, aConservationScores_[ n1ChrPos ] ); } } char geneClassifications :: cFindGeneTypeAllGenes( const RWCString& soChromosome, const int n1ChrPos, const RWCString& soAlternativeAlleles, char& cNormalAA, char& cSnpAA ) { char cAlternativeAllele; // just so the compiler doesn't complain, make a copy RWCString soAlternativeAlleles2 = soAlternativeAlleles; if ( soAlternativeAlleles2.bStartsWith( "del" ) ) cAlternativeAllele = 'a'; else if ( soAlternativeAlleles2.bStartsWith( "ins" ) ) cAlternativeAllele = 'a'; else { cAlternativeAllele = soAlternativeAlleles2[0]; } int nIndex = nFindBeginningOfGenesPossiblyOverlappingPos( soChromosome, n1ChrPos ); if ( nIndex == RW_NPOS ) { return cINTERGENIC; } char cGeneType = cINTERGENIC; for( ; nIndex < aKnownGene_.length(); ++nIndex ) { if ( nCompareLocations( soChromosome, n1ChrPos, aKnownGene_[ nIndex ] ) < 0 ) { // this gene is to the right of n1ChrPos so we need look at // no more genes break; } char cTempNormalAA; char cTempSnpAA; char cGeneTypeThisGene = cFindGeneType( soChromosome, n1ChrPos, cAlternativeAllele, aKnownGene_[ nIndex], cTempNormalAA, cTempSnpAA ); cGeneType = cCombineGeneTypes( cGeneType, cGeneTypeThisGene ); // this check will determine if the current gene is the one // that dominates if ( cGeneType == cGeneTypeThisGene ) { cNormalAA = cTempNormalAA; cSnpAA = cTempSnpAA; } } return cGeneType; } int geneClassifications :: nFindBeginningOfGenesPossiblyOverlappingPos( const RWCString& soChromosome, const int n1ChrPos ) { // largest gene is 2304634 // ------ --- -------- // x position to find genes overlapping // x position after subtracting nLargestGene const int nLargestGene = 2400000; int n1ChrPosToLookFor = MAX( 1, n1ChrPos - nLargestGene ); int nIndex = nFindIndexOfMatchOrSuccessor( soChromosome, n1ChrPosToLookFor ); if ( nIndex != RW_NPOS ) { if ( nCompareLocations( soChromosome, n1ChrPosToLookFor, aKnownGene_[ nIndex ] ) == 0 ) { // try backing up as far as can go and still have the // 2 locations equal while( nIndex > 0 ) { --nIndex; if ( nCompareLocations( soChromosome, n1ChrPosToLookFor, aKnownGene_[ nIndex ] ) != 0 ) { ++nIndex; break; } } // reach here either because nIndex == 0 // or because backed up until the locations were different // and then ++nIndex so that the locations at the same } } return nIndex; } int geneClassifications :: nFindIndexOfMatchOrSuccessor( const RWCString& soChromosome, const int n1ChrPosToLookFor ) { // this routine doesn't worry about whether the gene overlaps the // position or not--it is just simply concerned with finding the // first gene that starts at or after n1ChrPosToLookFor if ( aKnownGene_.isEmpty() ) return RW_NPOS; // these labels don't yet apply int nMaxIndex = aKnownGene_.length() - 1 ; // consider case in with last gene is too far left if ( nCompareLocations( soChromosome, n1ChrPosToLookFor, aKnownGene_[ nMaxIndex ] ) > 0 ) return RW_NPOS; // if reached here, // . ----> (last gene) // or // . (n1ChrPosToLookFor // -----> (last gene) if ( nCompareLocations( soChromosome, n1ChrPosToLookFor, aKnownGene_[0] ) <= 0 ) return 0; // if reached here, the gene at 0 must be to the left of the position: // -----> // . // so the gene at 0 is too far to the left: int nTooSmallIndex = 0; while( true ) { if ( nMaxIndex - nTooSmallIndex <= 1 ) return( nMaxIndex ); else { int nTestIndex = ( nTooSmallIndex + nMaxIndex ) / 2; if ( nCompareLocations( soChromosome, n1ChrPosToLookFor, aKnownGene_[ nTestIndex ] ) > 0 ) nTooSmallIndex = nTestIndex; else nMaxIndex = nTestIndex; } } } char geneClassifications :: cFindGeneType( const RWCString& soChromosome, const int n1ChrPos, const char cAlternativeAllele, const RWCString& soKnownGeneLine, char& cNormalAA, char& cSnpAA ) { // `name` varchar(255) NOT NULL default '', // `chrom` varchar(255) NOT NULL default '', // `strand` char(1) NOT NULL default '', // `txStart` int(10) unsigned NOT NULL default '0', // `txEnd` int(10) unsigned NOT NULL default '0', // `cdsStart` int(10) unsigned NOT NULL default '0', // `cdsEnd` int(10) unsigned NOT NULL default '0', // `exonCount` int(10) unsigned NOT NULL default '0', // `exonStarts` longblob NOT NULL, // `exonEnds` longblob NOT NULL, // `proteinID` varchar(40) NOT NULL default '', // `alignID` varchar(255) NOT NULL default '', cNormalAA = '?'; cSnpAA = '?'; char cGeneType = cINTERGENIC; RWCTokenizer tok( soKnownGeneLine ); tok(); // name RWCString soKnownGeneChromosome = tok(); if ( soChromosome != soKnownGeneChromosome ) return cINTERGENIC; RWCString soStrand = tok(); RWCString so0Start = tok(); RWCString so1End = tok(); RWCString so0CodingStart = tok(); RWCString so1CodingEnd = tok(); RWCString soNumberOfExons = tok(); RWCString soExonStarts = tok(); RWCString soExonEnds = tok(); assert( !soExonEnds.bIsNull() ); // check whether it is within the gene int n1GeneStart = atoi( so0Start.data() ) + 1; int n1GeneEnd = atoi( so1End.data() ); if ( ( n1ChrPos < n1GeneStart ) || ( n1GeneEnd < n1ChrPos ) ) { return cINTERGENIC; } // now we know it is within an exon or intron // check whether it is in an exon or intron int nNumberOfExons = atoi( soNumberOfExons.data() ); RWCTokenizer tokExonStarts( soExonStarts ); RWCTokenizer tokExonEnds( soExonEnds ); bool bIsInSpliceSite = false; bool bIsInExon = false; for( int nExon = 0; nExon < nNumberOfExons; ++nExon ) { RWCString so0ExonStart = tokExonStarts(',' ); RWCString so1ExonEnd = tokExonEnds(',' ); assert( !so1ExonEnd.bIsNull() ); int n1ExonStart = atoi( so0ExonStart.data() ) + 1; int n1ExonEnd = atoi( so1ExonEnd.data() ); // first exon intron last exon // ---------ssiiiiiiiiiiiss----------- // ^^ ^^ look these places in introns // ^n1ExonEnd // ^ n1ExonEnd + 2 // ^n1ExonStart -2 // ^n1ExonStart if ( nExon != 0 ) { if ( ( ( n1ExonStart - 2 ) <= n1ChrPos ) && ( n1ChrPos < n1ExonStart ) ) { bIsInSpliceSite = true; break; } } if ( nExon != ( nNumberOfExons - 1 ) ) { if ( ( n1ExonEnd < n1ChrPos ) && ( n1ChrPos <= ( n1ExonEnd + 2 ) ) ) { bIsInSpliceSite = true; break; } } if ( n1ChrPos < n1ExonStart ) { break; } if ( ( n1ExonStart <= n1ChrPos ) && ( n1ChrPos <= n1ExonEnd ) ) { bIsInExon = true; break; } } if ( bIsInSpliceSite ) return cSPLICE_SITE; if ( !bIsInExon ) return cINTRON; putExonsIntoAnArray( soExonStarts, soExonEnds, nNumberOfExons ); n1ChrPosCodingRegionLeft_ = atoi( so0CodingStart.data() ) + 1; n1ChrPosCodingRegionRight_ = atoi( so1CodingEnd.data() ); if ( ( n1ChrPos < n1ChrPosCodingRegionLeft_ ) || ( n1ChrPosCodingRegionRight_ < n1ChrPos ) ) return cUTR; // in coding region. We need to find whether it is a synonomous // or nonsynonomous snp. // get the codons int n1CodingPositionOfSnp = 0; bool bFoundCodingPositionOfSnp = false; if ( soStrand == "+" ) bTopNotBottomStrand_ = true; else if ( soStrand == "-" ) bTopNotBottomStrand_ = false; else assert( false ); if ( bTopNotBottomStrand_ ) { for( int nExon = 0; nExon < aExonStarts_.length(); ++nExon ) { int n1ExonStart = aExonStarts_[ nExon ]; int n1ExonEnd = aExonEnds_[ nExon ]; int n1ChrPosCodingRegionExonStart; int n1ChrPosCodingRegionExonEnd; if ( !bIntersect( n1ExonStart, n1ExonEnd, n1ChrPosCodingRegionLeft_, n1ChrPosCodingRegionRight_, n1ChrPosCodingRegionExonStart, n1ChrPosCodingRegionExonEnd ) ) { // this exon is totally within a utr continue; } if ( n1ChrPosCodingRegionExonEnd < n1ChrPos ) { n1CodingPositionOfSnp += ( n1ChrPosCodingRegionExonEnd - n1ChrPosCodingRegionExonStart + 1 ); } else { assert( n1ChrPosCodingRegionExonStart <= n1ChrPos ); assert( n1ChrPos <= n1ChrPosCodingRegionExonEnd ); n1CodingPositionOfSnp += ( n1ChrPos - n1ChrPosCodingRegionExonStart + 1 ); bFoundCodingPositionOfSnp = true; break; } } } else { for( int nExon = aExonStarts_.length() - 1; nExon >= 0; --nExon ) { int n1ExonStart = aExonStarts_[ nExon ]; int n1ExonEnd = aExonEnds_[ nExon ]; int n1ChrPosCodingRegionExonStart; int n1ChrPosCodingRegionExonEnd; if ( !bIntersect( n1ExonStart, n1ExonEnd, n1ChrPosCodingRegionLeft_, n1ChrPosCodingRegionRight_, n1ChrPosCodingRegionExonStart, n1ChrPosCodingRegionExonEnd ) ) { // this exon is total within a utr continue; } // -------- -------- ----- ------- coding region parts of exons // n1ChrPos is somewhere here because we've // already established that it is in an exon // and within the coding region boundaries if ( n1ChrPos < n1ChrPosCodingRegionExonStart ) { // the snp is not within this exon n1CodingPositionOfSnp += ( n1ChrPosCodingRegionExonEnd - n1ChrPosCodingRegionExonStart + 1 ); } else { assert( n1ChrPosCodingRegionExonStart <= n1ChrPos ); assert( n1ChrPos <= n1ChrPosCodingRegionExonEnd ); n1CodingPositionOfSnp += ( n1ChrPosCodingRegionExonEnd - n1ChrPos + 1 ); bFoundCodingPositionOfSnp = true; break; } } } assert( bFoundCodingPositionOfSnp ); // now need to find the 2 other bases in the codon. Do this // by first finding the coordinates in terms of coding region coordinates // of the 2 other bases and convert these to chromosome positions int nCodonPosition = n1CodingPositionOfSnp % 3; // nCodonPosition is thus 1, 2, or 0, with 0 being the 3rd codon // position int n1CodingPosition[2]; if ( nCodonPosition == 1 ) { n1CodingPosition[0] = n1CodingPositionOfSnp + 1; n1CodingPosition[1] = n1CodingPositionOfSnp + 2; } else if ( nCodonPosition == 2 ) { n1CodingPosition[0] = n1CodingPositionOfSnp - 1; n1CodingPosition[1] = n1CodingPositionOfSnp + 1; } else if ( nCodonPosition == 0 ) { n1CodingPosition[0] = n1CodingPositionOfSnp - 2; n1CodingPosition[1] = n1CodingPositionOfSnp - 1; } else { assert( false ); } int n1ChrPosOfOtherPositions[2]; for( int n = 0; n < 2; ++n ) { n1ChrPosOfOtherPositions[n] = nConvertCodingPositionToChrPos( n1CodingPosition[n] ); } RWCString soNormalCodon = " "; RWCString soSnpCodon = " "; char cRefAlleleBase = soChromosomeBases_[n1ChrPos]; if ( nCodonPosition == 1 ) { soNormalCodon[0] = cRefAlleleBase; soNormalCodon[1] = soChromosomeBases_[ n1ChrPosOfOtherPositions[0] ]; soNormalCodon[2] = soChromosomeBases_[ n1ChrPosOfOtherPositions[1] ]; soSnpCodon = soNormalCodon; soSnpCodon[0] = cAlternativeAllele; } else if ( nCodonPosition == 2 ) { soNormalCodon[0] = soChromosomeBases_[ n1ChrPosOfOtherPositions[0] ]; soNormalCodon[1] = cRefAlleleBase; soNormalCodon[2] = soChromosomeBases_[ n1ChrPosOfOtherPositions[1] ]; soSnpCodon = soNormalCodon; soSnpCodon[1] = cAlternativeAllele; } else if ( nCodonPosition == 0 ) { soNormalCodon[0] = soChromosomeBases_[ n1ChrPosOfOtherPositions[0] ]; soNormalCodon[1] = soChromosomeBases_[ n1ChrPosOfOtherPositions[1] ]; soNormalCodon[2] = cRefAlleleBase; soSnpCodon = soNormalCodon; soSnpCodon[2] = cAlternativeAllele; } else assert( false ); if ( soStrand == "-" ) { // I believe that we do not want to reverse-complement these codons, // but rather to just complement them. The reason is that // they are already in bottom-strand orientation by virtue // of the way the codons are assembled above. For example, // n1CodingPosition[0] < n1CodingPosition[1] always, but // for bottom strand genes, nConvertCodingPositionToChrPos // will reverse this ordering. So the codon will be assembled // from right to left. That is the way we want bottom strand // codons. So we just need to complement them, but not reverse them. for( int n = 0; n < 3; ++n ) { soNormalCodon[n] = cComplementBase( soNormalCodon[n] ); soSnpCodon[n] = cComplementBase( soSnpCodon[n] ); } } cNormalAA = cGetProtein( soNormalCodon[0], soNormalCodon[1], soNormalCodon[2] ); cSnpAA = cGetProtein( soSnpCodon[0], soSnpCodon[1], soSnpCodon[2] ); if ( cNormalAA == cSnpAA ) { return cSYN_CODING; } else { if ( cNormalAA == '*' ) { return cFROM_STOP_CODON; } else if ( cSnpAA == '*' ) { return cTO_STOP_CODON; } else { return cNONSYN_CODING; } } } void geneClassifications :: putExonsIntoAnArray( const RWCString& soExonStarts, const RWCString& soExonEnds, const int nNumberOfExons ) { aExonStarts_.clear(); aExonEnds_.clear(); RWCTokenizer tokExonStarts( soExonStarts ); RWCTokenizer tokExonEnds( soExonEnds ); for( int nExon = 0; nExon < nNumberOfExons; ++nExon ) { RWCString so0ExonStart = tokExonStarts(',' ); RWCString so1ExonEnd = tokExonEnds(',' ); assert( !so1ExonEnd.bIsNull() ); int n1ExonStart = atoi( so0ExonStart.data() ) + 1; int n1ExonEnd = atoi( so1ExonEnd.data() ); aExonStarts_.append( n1ExonStart ); aExonEnds_.append( n1ExonEnd ); } assert( aExonStarts_.length() == nNumberOfExons ); } int geneClassifications :: nConvertCodingPositionToChrPos( const int n1CodingPos ) { bool bFoundCodingPos = false; int nCodingBasesSoFar = 0; if ( bTopNotBottomStrand_ ) { for( int nExon = 0; nExon < aExonStarts_.length(); ++nExon ) { int n1ExonStart = aExonStarts_[ nExon ]; int n1ExonEnd = aExonEnds_[ nExon ]; int n1ChrPosCodingRegionExonStart; int n1ChrPosCodingRegionExonEnd; if ( !bIntersect( n1ExonStart, n1ExonEnd, n1ChrPosCodingRegionLeft_, n1ChrPosCodingRegionRight_, n1ChrPosCodingRegionExonStart, n1ChrPosCodingRegionExonEnd ) ) { // this exon is totally within a utr continue; } int nCodingBasesIncludingThisExon = nCodingBasesSoFar + ( n1ChrPosCodingRegionExonEnd - n1ChrPosCodingRegionExonStart + 1 ); if ( nCodingBasesIncludingThisExon < n1CodingPos ) { nCodingBasesSoFar = nCodingBasesIncludingThisExon; } else { assert( nCodingBasesSoFar < n1CodingPos ); int nAdditionalBasesThisExon = n1CodingPos - nCodingBasesSoFar; return n1ChrPosCodingRegionExonStart + nAdditionalBasesThisExon - 1; } } // should never get here assert( false ); } else { for( int nExon = aExonStarts_.length() - 1; nExon >= 0; --nExon ) { int n1ExonStart = aExonStarts_[ nExon ]; int n1ExonEnd = aExonEnds_[ nExon ]; int n1ChrPosCodingRegionExonStart; int n1ChrPosCodingRegionExonEnd; if ( !bIntersect( n1ExonStart, n1ExonEnd, n1ChrPosCodingRegionLeft_, n1ChrPosCodingRegionRight_, n1ChrPosCodingRegionExonStart, n1ChrPosCodingRegionExonEnd ) ) { // this exon is totally within a utr continue; } int nCodingBasesIncludingThisExon = nCodingBasesSoFar + ( n1ChrPosCodingRegionExonEnd - n1ChrPosCodingRegionExonStart + 1 ); if ( nCodingBasesIncludingThisExon < n1CodingPos ) { nCodingBasesSoFar = nCodingBasesIncludingThisExon; } else { assert( nCodingBasesSoFar < n1CodingPos ); int nAdditionalBasesThisExon = n1CodingPos - nCodingBasesSoFar; return n1ChrPosCodingRegionExonEnd - nAdditionalBasesThisExon + 1; } } // should never get here assert( false ); } } char geneClassifications :: cCombineGeneTypes( char cGeneType1, char cGeneType2 ) { if ( ( cGeneType1 == cTO_STOP_CODON ) || ( cGeneType2 == cTO_STOP_CODON ) ) return cTO_STOP_CODON; if ( ( cGeneType1 == cFROM_STOP_CODON ) || ( cGeneType2 == cFROM_STOP_CODON ) ) return cFROM_STOP_CODON; // notice that a splice site is even higher priority // than a nonsynonomous coding substitution if ( ( cGeneType1 == cSPLICE_SITE ) || ( cGeneType2 == cSPLICE_SITE ) ) return cSPLICE_SITE; if ( ( cGeneType1 == cNONSYN_CODING ) || ( cGeneType2 == cNONSYN_CODING ) ) return cNONSYN_CODING; if ( ( cGeneType1 == cSYN_CODING ) || ( cGeneType2 == cSYN_CODING ) ) return cSYN_CODING; if ( ( cGeneType1 == cUTR ) || ( cGeneType2 == cUTR ) ) return cUTR; if ( ( cGeneType1 == cINTRON ) || ( cGeneType2 == cINTRON ) ) return cINTRON; return cINTERGENIC; }