/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ #include "diffChromosomes.h" #include #include #include "mbtValOrderedVectorOfRWCString.h" #include "mbt_exception.h" #include "soLine.h" #include "soAddCommas.h" #include "min.h" #include "max.h" #include "mbtPtrOrderedVector.h" class deletion { public: int n1ChrPosStart_; int nLength_; int nIndexOfAssociatedDeletion_; bool bAssociated_; public: deletion( const int n1ChrPosStart, const int n1ChrPosEnd ) : n1ChrPosStart_( n1ChrPosStart ), nLength_( n1ChrPosEnd - n1ChrPosStart + 1 ), bAssociated_( false ) {} bool operator<( const deletion& myDeletion ) const { return( n1ChrPosStart_ < myDeletion.n1ChrPosStart_ ); } bool operator==( const deletion& myDeletion ) const { return( n1ChrPosStart_ == myDeletion.n1ChrPosStart_ ); } }; const int nDeletionTableSize = 256; bool bIsDeletion[nDeletionTableSize]; static void setDeletionTable() { for( int n = 0; n < nDeletionTableSize; ++n ) { bIsDeletion[n] = false; } bIsDeletion['E'] = true; bIsDeletion['F'] = true; bIsDeletion['I'] = true; bIsDeletion['J'] = true; } void diffChromosomes :: doIt() { setDeletionTable(); mbtValOrderedVectorOfRWCString aChromosomes( (size_t) 25 ); aChromosomes.soName_ = "aChromosomes"; for( int n = 1; n <= 22; ++n ) { RWCString soChromosome = "snp_chr" + RWCString( (long) n ) + ".fa"; aChromosomes.append( soChromosome ); } aChromosomes.append( "snp_chrM.fa" ); aChromosomes.append( "snp_chrX.fa" ); aChromosomes.append( "snp_chrY.fa" ); RWCString soChromosomeBasesHere( (size_t) 250000000 ); RWCString soChromosomeBasesThere( (size_t) 250000000 ); RWCString soChromosomeBasesOrig( (size_t) 250000000 ); for( int nChr = 0; nChr < aChromosomes.length(); ++nChr ) { RWCString soChromosome = aChromosomes[ nChr ]; fprintf( stderr, "working on %s\n", soChromosome.data() ); FILE* pChrHere = fopen( soChromosome.data(), "r" ); if ( !pChrHere ) { THROW_FILE_ERROR( soChromosome ); } FileName filChrThere = filOtherDirectory_ + "/" + soChromosome; FILE* pChrThere= fopen( filChrThere.data(), "r" ); if ( !pChrThere ) { THROW_FILE_ERROR( filChrThere ); } FileName filChrOrig = soChromosome; filChrOrig.bStartsWithAndRemove( "snp_" ); filChrOrig = "../../" + filChrOrig; FILE* pChrOrig = fopen( filChrOrig.data(), "r" ); if ( !pChrOrig ) { THROW_FILE_ERROR( filChrOrig ); } // throw away headers fgets( soLine.data(), nMaxLineSize, pChrHere ); fgets( soLine.data(), nMaxLineSize, pChrThere ); fgets( soLine.data(), nMaxLineSize, pChrOrig ); // initialize strings. Make 1-based soChromosomeBasesHere = " "; soChromosomeBasesThere = " "; soChromosomeBasesOrig = " "; while( fgets( soLine.data(), nMaxLineSize, pChrHere ) ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); soChromosomeBasesHere += soLine; } while( fgets( soLine.data(), nMaxLineSize, pChrThere ) ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); soChromosomeBasesThere += soLine; } while( fgets( soLine.data(), nMaxLineSize, pChrOrig ) ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); soChromosomeBasesOrig += soLine; } if ( ( soChromosomeBasesHere.length() != soChromosomeBasesThere.length() ) || ( soChromosomeBasesHere.length() != soChromosomeBasesOrig.length() ) ) { printf( "warning: length difference. here: %s there: %s orig: %s\n", soAddCommas( soChromosomeBasesHere.length() ).data(), soAddCommas( soChromosomeBasesThere.length() ).data(), soAddCommas( soChromosomeBasesOrig.length() ).data() ); } int nMinLength = MIN( soChromosomeBasesHere.length(), soChromosomeBasesThere.length() ); mbtPtrOrderedVector aDeletionsHere( (size_t) 1000000 ); mbtPtrOrderedVector aDeletionsThere( (size_t) 1000000 ); bool bInDeletionHere = false; bool bInDeletionThere = false; int nDeletionHereStart; int nDeletionThereStart; // add sentinels soChromosomeBasesHere.append( 'N' ); soChromosomeBasesThere.append('N' ); // use <= to advance to sentinels int n1ChrPos; for( n1ChrPos = 1; n1ChrPos <= nMinLength; ++n1ChrPos ) { if ( soChromosomeBasesHere[ n1ChrPos ] != soChromosomeBasesThere[ n1ChrPos ] ) { if ( !bInDeletionHere && bIsDeletion[ soChromosomeBasesHere[ n1ChrPos ] ] ) { nDeletionHereStart = n1ChrPos; bInDeletionHere = true; } else if ( bInDeletionHere && !bIsDeletion[ soChromosomeBasesHere[ n1ChrPos ] ] ) { aDeletionsHere.insert( new deletion( nDeletionHereStart, n1ChrPos - 1 ) ); bInDeletionHere = false; } if ( !bInDeletionThere && bIsDeletion[ soChromosomeBasesThere[ n1ChrPos ] ] ) { nDeletionThereStart = n1ChrPos; bInDeletionThere = true; } else if ( bInDeletionThere && !bIsDeletion[ soChromosomeBasesThere[ n1ChrPos ] ] ) { aDeletionsThere.insert( new deletion( nDeletionThereStart, n1ChrPos - 1 ) ); bInDeletionThere = false; } } else { if ( bInDeletionHere ) { bInDeletionHere = false; aDeletionsHere.insert( new deletion( nDeletionHereStart, n1ChrPos - 1 ) ); } if ( bInDeletionThere ) { bInDeletionThere = false; aDeletionsThere.insert( new deletion( nDeletionThereStart, n1ChrPos - 1 ) ); } } } aDeletionsHere.resort(); aDeletionsThere.resort(); const int nMaxRegion = 50; for( int nDeletion = 0; nDeletion < aDeletionsHere.length(); ++nDeletion ) { deletion* pDeletionHere = aDeletionsHere[ nDeletion ]; // find the corresponding deletion "there" int nIndex = aDeletionsThere.nFindIndexOfMatchOrPredecessor( pDeletionHere ); if ( nIndex == RW_NPOS ) continue; deletion* pDeletionThere = aDeletionsThere[ nIndex ]; if ( pDeletionHere->nLength_ != pDeletionThere->nLength_ ) continue; if ( pDeletionHere->n1ChrPosStart_ - pDeletionThere->n1ChrPosStart_ > nMaxRegion ) continue; if ( pDeletionThere->bAssociated_ ) continue; // ------------DDDDDDD // DDDDDDD------------ // ^nHereStart // ^nThereStart // or // ----DDDDDDDDDDDD // DDDDDDDDDDDD---- // ^nHereStart // ^nThereStart int n1ChrRegionLeft = pDeletionThere->n1ChrPosStart_; int nRegionLength = pDeletionHere->nLength_ + pDeletionHere->n1ChrPosStart_ - pDeletionThere->n1ChrPosStart_; RWCString soRegionHere = soChromosomeBasesHere( n1ChrRegionLeft, nRegionLength ); RWCString soRegionThere = soChromosomeBasesThere( n1ChrRegionLeft, nRegionLength ); for( int n = nRegionLength - 1; n >= 0; --n ) { if ( bIsDeletion[ soRegionHere[n] ] ) soRegionHere.remove( n, 1 ); if ( bIsDeletion[ soRegionThere[n] ] ) soRegionThere.remove( n, 1 ); } if ( soRegionHere == soRegionThere ) { // found an associated pair pDeletionHere->nIndexOfAssociatedDeletion_ = nIndex; pDeletionThere->nIndexOfAssociatedDeletion_ = nDeletion; pDeletionHere->bAssociated_ = true; pDeletionThere->bAssociated_ = true; } } RWCString soOddLooking = "\033[35m"; RWCString soNormalLooking = "\033[m\033(B"; int nHere = 0; int nThere = 0; for( n1ChrPos = 1; n1ChrPos <= nMinLength; ++n1ChrPos ) { if ( soChromosomeBasesHere[ n1ChrPos ] != soChromosomeBasesThere[ n1ChrPos ] ) { // for now ignore cases in which either base is a deletion: // if ( bIsDeletion[ soChromosomeBasesHere[ n1ChrPos ] ] || // bIsDeletion[ soChromosomeBasesThere[ n1ChrPos ] ] ) { // continue; // } // end of ignoring deletions bool bContains = false; for( ; nHere < aDeletionsHere.length(); ++nHere ) { deletion* pHereDeletion = aDeletionsHere[ nHere ]; if ( !pHereDeletion->bAssociated_ ) continue; if ( pHereDeletion->n1ChrPosStart_ <= n1ChrPos && n1ChrPos <= ( pHereDeletion->n1ChrPosStart_ + pHereDeletion->nLength_ - 1 ) ) { bContains = true; break; } if ( ( pHereDeletion->n1ChrPosStart_ + pHereDeletion->nLength_ - 1 ) > n1ChrPos ) { break; } } // ignore this mismatch at this location if ( bContains ) continue; for( ; nThere < aDeletionsThere.length(); ++nThere ) { deletion* pThereDeletion = aDeletionsThere[ nThere ]; if ( !pThereDeletion->bAssociated_ ) continue; if ( pThereDeletion->n1ChrPosStart_ <= n1ChrPos && n1ChrPos <= ( pThereDeletion->n1ChrPosStart_ + pThereDeletion->nLength_ - 1 ) ) { bContains = true; break; } if ( ( pThereDeletion->n1ChrPosStart_ + pThereDeletion->nLength_ - 1 ) > n1ChrPos ) { break; } } if ( bContains ) continue; // so this is where there is some difference that is significant // let's print it out with some context--say 5 bases then a space const int nContextBefore =25; const int nContextAfter = 25; int nBackedUp = n1ChrPos - nContextBefore; nBackedUp = MAX( 1, nBackedUp ); int nEndOfContext = n1ChrPos + nContextAfter; nEndOfContext = MIN( nEndOfContext, nMinLength ); printf( "\ndiscrepancy at %s %s\n", soChromosome.data(), soAddCommas( n1ChrPos ).data() ); int n1; for( n1 = nBackedUp; n1 <= nEndOfContext; ++n1 ) { if ( soChromosomeBasesHere[n1] == soChromosomeBasesThere[n1] ) { printf( "%c", soChromosomeBasesHere[n1] ); } else { printf( "%s%c%s", soOddLooking.data(), soChromosomeBasesHere[n1], soNormalLooking.data() ); } } printf( "\n" ); for( n1 = nBackedUp; n1 <= nEndOfContext; ++n1 ) { if ( soChromosomeBasesHere[n1] == soChromosomeBasesThere[n1] ) { printf( "%c", soChromosomeBasesThere[n1] ); } else { printf( "%s%c%s", soOddLooking.data(), soChromosomeBasesThere[n1], soNormalLooking.data() ); } } printf( "\n" ); printf( "%.*s\n", nEndOfContext - nBackedUp + 1, soChromosomeBasesOrig.data() + nBackedUp ); n1ChrPos = nEndOfContext; } // if ( soChromosomeBasesHere[ n1ChrPos ] != soChromosomeBasesThere[ n1ChrPos ] ) { } // for( int n1ChrPos = 1; n1ChrPos <= nMinLength; ++n1ChrPos ) { fclose( pChrThere ); fclose( pChrHere ); fflush( stdout ); } // for( int nChr = 0; nChr < aChromosomes.length(); ++nChr ) { }