/***************************************************************************** # Copyright (C) 1994-2008 by David Gordon. # All rights reserved. # # This software is part of a beta-test version of the Consed/Autofinish # package. It should not be redistributed or # used for any commercial purpose, including commercially funded # sequencing, without written permission from the author and the # University of Washington. # # This software is provided ``AS IS'' and any express or implied # warranties, including, but not limited to, the implied warranties of # merchantability and fitness for a particular purpose, are disclaimed. # In no event shall the authors or the University of Washington be # liable for any direct, indirect, incidental, special, exemplary, or # consequential damages (including, but not limited to, procurement of # substitute goods or services; loss of use, data, or profits; or # business interruption) however caused and on any theory of liability, # whether in contract, strict liability, or tort (including negligence # or otherwise) arising in any way out of the use of this software, even # if advised of the possibility of such damage. # # Building Consed from source is error prone and not simple which is # why I provide executables. Due to time limitations I cannot # provide any assistance in building Consed. Even if you do not # modify the source, you may introduce errors due to using a # different version of the compiler, a different version of motif, # different versions of other libraries than I used, etc. For this # reason, if you discover Consed bugs, I can only offer help with # those bugs if you first reproduce those bugs with an executable # provided by me--not an executable you have built. # # Modifying Consed is also difficult. Although Consed is modular, # some modules are used by many other modules. Thus making a change # in one place can have unforeseen effects on many other features. # It may takes months for you to notice these other side-effects # which may not seen connected at all. It is not feasable for me to # provide help with modifying Consed sources because of the # potentially huge amount of time involved. # #*****************************************************************************/ #ifndef MISC_PROGRAM #include "miscProgram.h" void miscProgram() {} #else #include "miscProgram.h" #include "soLine.h" #include "bIntervalsIntersect.h" #include "rwcstring.h" #include "rwctokenizer.h" #include "mbtPtrOrderedVector.h" #include "filename.h" #include #include "mbt_exception.h" #include "bIsNumericMaybeWithWhitespace.h" #include "min.h" #include "max.h" #include "bIntersect.h" #include "rwtvalorderedvector.h" #include #include "mbtValOrderedVectorOfRWCString.h" #include "complement_so.h" #include "setNumberFromBaseTable.h" #include "nNumberFromBase.h" #include #include "soAddCommas.h" #include "rwtvalvector.h" #include #include "cComplementBase.h" class region { public: RWCString soChromosome_; int nStart_; int nEnd_; bool bOKToUse_; bool bMultipleOverlaps_; public: region( const RWCString& soChromosome, const int nStart, const int nEnd ) : soChromosome_( soChromosome ), nStart_( nStart ), nEnd_( nEnd ), bOKToUse_( true ), bMultipleOverlaps_( false ) {} bool operator==( const region& myRegion ) { return( ( soChromosome_ == myRegion.soChromosome_ ) && ( nStart_ == myRegion.nStart_ ) && ( nEnd_ == myRegion.nEnd_ ) ); } bool operator<( const region& myRegion ) const { if ( soChromosome_ != myRegion.soChromosome_ ) { return( soChromosome_ < myRegion.soChromosome_ ); } else { return( nStart_ < myRegion.nStart_ ); } } int nGetLength() { return( nEnd_ - nStart_ + 1 ); } }; static bool bRegionsOverlap( region* pRegion1, region* pRegion2 ) { if ( pRegion1->soChromosome_ != pRegion2->soChromosome_ ) return false; return ( bIntervalsIntersect( pRegion1->nStart_, pRegion1->nEnd_, pRegion2->nStart_, pRegion2->nEnd_ ) ); } static bool bRegionsLessThan( region* pRegion1, region* pRegion2 ) { if ( pRegion1->soChromosome_ < pRegion2->soChromosome_ ) return true; else if ( pRegion1->soChromosome_ > pRegion2->soChromosome_) return false; else { if ( pRegion1->nEnd_ < pRegion2->nStart_ ) return true; else return false; } } static bool bRegionContainedWithinRegion( region* pRegion1, region* pRegion2 ) { if ( pRegion1->soChromosome_ != pRegion2->soChromosome_ ) return false; if ( ( pRegion2->nStart_ <= pRegion1->nStart_ ) && ( pRegion1->nEnd_ <= pRegion2->nEnd_ ) ) { return true; } else return false; } static void run_100812() { // used to compare Ian and Tristan's coordinates // FileName filHumanGenomeFOF = "/codon/gordon/genomes/human_genome_hg18/human_genome.fof"; // FILE* pHumanGenomeFOF = fopen( filHumanGenomeFOF.data(), "r" ); // if ( !pHumanGenomeFOF ) { // THROW_FILE_ERROR( filHumanGenomeFOF ); // } // while( fgets( soLine.data(), nMaxLineSize, pHumanGenomeFOF ) != NULL ) { // FileName filChromosome( soLine.data() ); // // how big is this chromosome? // FILE* pChr = fopen( filChromosome.data(), "r" ); // if ( !pChr ) { // THROW_FILE_ERROR( filChromosome ); // } // int nChrLength = 0; // while( fgets( soLine.data(), nMaxLineSize, pHumanGenomeFOF ) != NULL ) { // soLine.nCurrentLength_ = strlen( soLine.data() ); // soLine.stripTrailingWhitespaceFast(); // if ( soLine[0] == '>' ) continue; // nChrLength += soLine.length(); // } // fclose( filChromosome.data() ); // printf( "%s has %d bases\n", filChromosome.data(), nChrLength ); // mbtValVectorOfBool aIan( nChrLength, 1, "aIan" ); // mbtValVectorOfBool aTristan( nChrLength, 1, "aTristan" ); // apply( filChromosome, aIanRegions, aIan ); // apply( filChromosome, aTristanRegions, aTristan ); // mbtValVector aJustIan( nChrLength, 1, 0, "aJustIan" ); // mbtValVector aJustTristan( nChrLength, 1, 0, "aJustTristan" ); // for( int nChrPos = 1; nChrPos <= nChrLength; ++nChrPos ) { // if ( aIan[nChrPos] != aTristan[nChrPos] ) { // if ( aIan[ nChrPos ] ) { // aJustIan[ nChrPos ] = 1; // } // else { // aJustTristan[ nChrPos ] = 1; // } // } // } // } // used to compare Ian and Tristan's coordinates FileName filTristan = "/codon/gordon/misc/geneClassifications/exomeSegments/exome_coords.txt"; FileName filIan = "/codon/gordon/misc/geneClassifications/otherExomeSegments/nimblegen_solution_ccds_2008.HG18.list"; FILE* pIan = fopen( filIan.data(), "r" ); if ( !pIan ) { THROW_FILE_ERROR( filIan ); } FILE* pTristan = fopen( filTristan.data(), "r" ); if ( !pTristan ) { THROW_FILE_ERROR( filTristan ); } mbtPtrOrderedVector aTristanRegion( 160000 ); mbtPtrOrderedVector aIanRegion( 160000 ); while( fgets( soLine.data(), nMaxLineSize, pTristan ) != NULL ) { soLine.nCurrentLength_ = strlen( soLine.data() ); // looks like: // chr1 58953 59871 RWCTokenizer tok( soLine ); RWCString soChromosome = tok(); RWCString soStart = tok(); RWCString soEnd = tok(); if ( soEnd.bIsNull() ) { THROW_ERROR2( "LINE " + soLine + " should be of form chr start end" ); } int nStart; int nEnd; assert( bIsNumericMaybeWithWhitespace( soStart, nStart ) ); assert( bIsNumericMaybeWithWhitespace( soEnd, nEnd ) ); region* pRegion = new region( soChromosome, nStart, nEnd ); aTristanRegion.append( pRegion ); } fclose( pTristan ); RWCString soChromosome( (size_t) 1000); int nStart; int nEnd; int nTokens; int nLinesRead = 1; while( fgets( soLine.data(), nMaxLineSize, pIan ) != NULL ) { soLine.nCurrentLength_ = strlen( soLine.data() ); int nColon = soLine.index( ":" ); assert( nColon != RW_NPOS ); int nDash = soLine.index( "-", nColon + 1 ); assert( nDash != RW_NPOS ); RWCString soChromosome = soLine( 0, nColon ); nTokens = sscanf( soLine.soGetRestOfString( nColon + 1 ).data(), "%d-%d", &nStart, &nEnd ); if ( nTokens != 2 ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soChromosome.nCurrentLength_ = strlen( soChromosome.data() ); THROW_ERROR2( "data error with Ian, nToken = " + RWCString( (long) nTokens ) + " converted string is " + soChromosome + " remaining string is " + soLine + " line " + RWCString( (long) nLinesRead ) ); } ++nLinesRead; region* pRegion = new region( soChromosome, nStart, nEnd ); aIanRegion.append( pRegion ); } fclose( pIan ); aTristanRegion.resort(); aIanRegion.resort(); printf( "original tristan regions: %d\n", aTristanRegion.length() ); printf( "original ian regions: %d\n", aIanRegion.length() ); // do any ian regions overlap? int nIntersectingIanRegions = 0; int nIan; for( nIan = 1; nIan < aIanRegion.length(); ++nIan ) { region* pIan1 = aIanRegion[nIan-1]; region* pIan2 = aIanRegion[nIan]; if ( bRegionsOverlap( pIan1, pIan2 ) ) { ++nIntersectingIanRegions; } } printf( "ian regions overlapping: %d\n", nIntersectingIanRegions ); // how many tristan regions are there that don't overlap any ian regions? int nTristanRegionsWithoutIan = 0; int nTristan; for( nTristan = 0; nTristan < aTristanRegion.length(); ++nTristan ) { region* pTristan = aTristanRegion[ nTristan ]; bool bFoundOverlapWithIan = false; int nIan = aIanRegion.nFindIndexOfMatchOrPredecessor( pTristan ); if ( nIan == RW_NPOS ) nIan = 0; while( ( nIan < aIanRegion.length() ) ) { region* pIan = aIanRegion[ nIan ]; if ( bRegionsOverlap( pTristan, pIan ) ) { bFoundOverlapWithIan = true; break; } if ( bRegionsLessThan( pTristan, pIan ) ) { break; } ++nIan; } if ( !bFoundOverlapWithIan ) { ++nTristanRegionsWithoutIan; } } printf( "tristan regions without ian: %d\n", nTristanRegionsWithoutIan ); // how many ian regions are there without any tristan? int nIanRegionsWithoutTristan = 0; for( nIan = 0; nIan < aIanRegion.length(); ++nIan ) { region* pIan = aIanRegion[ nIan ]; bool bFoundOverlapWithTristan = false; int nTristan = aTristanRegion.nFindIndexOfMatchOrPredecessor( pIan ); if ( nTristan == RW_NPOS ) nTristan = 0; while( ( nTristan < aTristanRegion.length() ) ) { region* pTristan = aTristanRegion[ nTristan ]; if ( bRegionsOverlap( pIan, pTristan ) ) { bFoundOverlapWithTristan = true; break; } if ( bRegionsLessThan( pIan, pTristan ) ) { break; } ++nTristan; } if ( !bFoundOverlapWithTristan ) { printf( "ian region without overlap with tristan: %s %d %d\n", pIan->soChromosome_.data(), pIan->nStart_, pIan->nEnd_ ); ++nIanRegionsWithoutTristan; } } printf( "ian regions without tristan: %d\n", nIanRegionsWithoutTristan ); // go through and eliminate ian regions that overlap int nIntersectingRegions = 0; for( nTristan = 1; nTristan < aTristanRegion.length(); ++nTristan ) { region* pTristan1 = aTristanRegion[nTristan-1]; region* pTristan2 = aTristanRegion[nTristan]; if ( ( pTristan1->soChromosome_ == pTristan2->soChromosome_ ) && ( bIntervalsIntersect( pTristan1->nStart_, pTristan1->nEnd_, pTristan2->nStart_, pTristan2->nEnd_ ) ) ) { // how much do they overlap by? int nOverlap = pTristan1->nEnd_ - pTristan2->nStart_ + 1; printf( "overlap by %d\n", nOverlap ); pTristan1->bOKToUse_ = false; pTristan2->bOKToUse_ = false; ++nIntersectingRegions; // find corresponding region in aIan region* pDummy = new region( pTristan1->soChromosome_, MIN( pTristan1->nStart_, pTristan2->nStart_ ), MAX( pTristan1->nEnd_, pTristan2->nEnd_ ) ); int nIndex = aIanRegion.nFindIndexOfMatchOrPredecessor( pDummy ); if ( nIndex == RW_NPOS ) nIndex = 0; while( nIndex < aIanRegion.length() ) { region* pIanRegion = aIanRegion[ nIndex ]; if ( bRegionsOverlap( pDummy, pIanRegion ) ) { pIanRegion->bOKToUse_ = false; } else { if ( pDummy->soChromosome_ < pIanRegion->soChromosome_ ) { break; } else if ( pDummy->soChromosome_ == pIanRegion->soChromosome_ ) { if ( pDummy->nEnd_ < pIanRegion->nStart_ ) break; } } ++nIndex; } } // if ( ( pTristan1->soChromosome_ == pTristan2->soChromosome_ ) && } printf( "tristan regions that overlap previous region: %d\n", nIntersectingRegions ); mbtPtrOrderedVector aTristanRegionTemp = aTristanRegion; mbtPtrOrderedVector aIanRegionTemp = aIanRegion; aTristanRegion.clear(); aIanRegion.clear(); int n; for( n = 0; n < aTristanRegionTemp.length(); ++n ) { region* pRegion = aTristanRegionTemp[ n ]; if ( pRegion->bOKToUse_ ) { aTristanRegion.append( pRegion ); } } for( n = 0; n < aIanRegionTemp.length(); ++n ) { region* pRegion = aIanRegionTemp[ n ]; if ( pRegion->bOKToUse_ ) { aIanRegion.append( pRegion ); } } printf( "now tristan: %d\n", aTristanRegion.length() ); printf( "now ian: %d\n", aIanRegion.length() ); aTristanRegionTemp.clear(); aIanRegionTemp.clear(); // look for regions in Ian that have many points than in Tristan int nRegionsWithLotsOfExtraBases = 0; for( nIan = 0; nIan < aIanRegion.length(); ++nIan ) { region* pIan = aIanRegion[ nIan ]; int nTristan = aTristanRegion.nFindIndexOfMatchOrPredecessor( pIan ); if ( nTristan == RW_NPOS ) nTristan = 0; while( nTristan < aTristanRegion.length() ) { region* pTristan = aTristanRegion[ nTristan ]; if ( bRegionsLessThan( pIan, pTristan ) ) break; if ( bRegionsOverlap( pTristan, pIan ) ) { // how much do they overlap and how much do they // not overlap? int nIntersectStart; int nIntersectEnd; assert( bIntersect( pTristan->nStart_, pTristan->nEnd_, pIan->nStart_, pIan->nEnd_, nIntersectStart, nIntersectEnd ) ); int nIntersectingSize = nIntersectEnd - nIntersectStart + 1; int nNonIntersectingSizeAtBeginning = MAX( pTristan->nStart_, pIan->nStart_ ) - MIN( pTristan->nStart_, pIan->nStart_ ); int nNonIntersectingSizeAtEnd = MAX( pTristan->nEnd_, pIan->nEnd_ ) - MIN( pTristan->nEnd_, pIan->nEnd_ ); assert( pIan->nStart_ < pTristan->nStart_ ); assert( pTristan->nEnd_ < pIan->nEnd_ ); printf( "intersecting size: %d noninter left: %d noninter right %d", nIntersectingSize, nNonIntersectingSizeAtBeginning, nNonIntersectingSizeAtEnd ); if ( ( nNonIntersectingSizeAtBeginning + nNonIntersectingSizeAtEnd ) > 3 ) { pTristan->bOKToUse_ = false; pIan->bOKToUse_ = false; ++nRegionsWithLotsOfExtraBases; printf( " big " ); } printf( "\n" ); } // if ( bRegionsOverlap( pTristan, pIan ) ) { ++nTristan; } // while( nTristan < aTristanRegion.length() ) { } // for( int nIan = 0; nIan < aIanRegion.length(); ++nIan ) { printf( "regions with lots of extra bases: %d\n", nRegionsWithLotsOfExtraBases ); // again eliminate not used aTristanRegionTemp = aTristanRegion; aIanRegionTemp = aIanRegion; aTristanRegion.clear(); aIanRegion.clear(); for( n = 0; n < aTristanRegionTemp.length(); ++n ) { region* pRegion = aTristanRegionTemp[ n ]; if ( pRegion->bOKToUse_ ) { aTristanRegion.append( pRegion ); } } for( n = 0; n < aIanRegionTemp.length(); ++n ) { region* pRegion = aIanRegionTemp[ n ]; if ( pRegion->bOKToUse_ ) { aIanRegion.append( pRegion ); } } aTristanRegionTemp.clear(); aIanRegionTemp.clear(); printf( "tristan regions: %d\n", aTristanRegion.length() ); printf( "ian regions: %d\n", aIanRegion.length() ); fflush( stdout ); // for( int nIan = 0; nIan < aIanRegion.length(); ++nIan ) { // region* pIan = aIanRegion[ nIan ]; // int nTristan = aTristanRegion.nFindIndexOfMatchOrPredecessor( pIan ); // if ( nTristan == RW_NPOS ) { // nTristan = aTristanRegion.nFindIndexOfMatchOrSuccessor( pIan ); // if ( nTristan == RW_NPOS ) continue; // } // for( int nLoop = 0; nLoop < 2; ++nLoop ) { // if ( nLoop == 1 ) { // ++nTristan; // if ( nTristan >= aTristanRegion.length() ) break; // } // region* pTristan = aTristanRegion[ nTristan ]; // if ( bRegionsOverlap( pTristan, pIan ) ) { // // how much do they overlap and how much do they // // not overlap? // int nIntersectStart; // int nIntersectEnd; // assert( bIntersect( pTristan->nStart_, // pTristan->nEnd_, // pIan->nStart_, // pIan->nEnd_, // nIntersectStart, } static void run_100813() { // used to compare Ian and Tristan's coordinates FileName filTristan = "/codon/gordon/misc/geneClassifications/exomeSegments/exome_coords.txt"; FileName filIan = "/codon/gordon/misc/geneClassifications/otherExomeSegments/nimblegen_solution_ccds_2008.HG18.list"; FILE* pIan = fopen( filIan.data(), "r" ); if ( !pIan ) { THROW_FILE_ERROR( filIan ); } FILE* pTristan = fopen( filTristan.data(), "r" ); if ( !pTristan ) { THROW_FILE_ERROR( filTristan ); } mbtPtrOrderedVector aTristanRegion( 160000 ); mbtPtrOrderedVector aIanRegion( 160000 ); RWTValOrderedVector aHistogramSizesTristan( 10000 ); RWTValOrderedVector aHistogramSizesIan( 10000 ); int nLongestTristanRegion = 0; while( fgets( soLine.data(), nMaxLineSize, pTristan ) != NULL ) { soLine.nCurrentLength_ = strlen( soLine.data() ); // looks like: // chr1 58953 59871 RWCTokenizer tok( soLine ); RWCString soChromosome = tok(); RWCString soStart = tok(); RWCString soEnd = tok(); if ( soEnd.bIsNull() ) { THROW_ERROR2( "LINE " + soLine + " should be of form chr start end" ); } int nStart; int nEnd; assert( bIsNumericMaybeWithWhitespace( soStart, nStart ) ); assert( bIsNumericMaybeWithWhitespace( soEnd, nEnd ) ); // note: converting Tristan regions to 1-based region* pRegion = new region( soChromosome, nStart + 1, nEnd ); aTristanRegion.append( pRegion ); if ( pRegion->nGetLength() > nLongestTristanRegion ) nLongestTristanRegion = pRegion->nGetLength(); while( pRegion->nGetLength() >= aHistogramSizesTristan.length() ) { aHistogramSizesTristan.append( 0 ); } ++aHistogramSizesTristan[ pRegion->nGetLength() ]; } fclose( pTristan ); printf( "histogram of sizes of tristan regions:\n" ); int nSize; for( nSize = 0; nSize < aHistogramSizesTristan.length(); ++nSize ) { if ( aHistogramSizesTristan[ nSize ] != 0 ) { printf( "Tristan histogram %d : %d\n", nSize, aHistogramSizesTristan[ nSize ] ); } } RWCString soChromosome( (size_t) 1000); int nStart; int nEnd; int nTokens; int nLinesRead = 1; int nLongestIanRegion = 0; while( fgets( soLine.data(), nMaxLineSize, pIan ) != NULL ) { soLine.nCurrentLength_ = strlen( soLine.data() ); int nColon = soLine.index( ":" ); assert( nColon != RW_NPOS ); int nDash = soLine.index( "-", nColon + 1 ); assert( nDash != RW_NPOS ); RWCString soChromosome = soLine( 0, nColon ); nTokens = sscanf( soLine.soGetRestOfString( nColon + 1 ).data(), "%d-%d", &nStart, &nEnd ); if ( nTokens != 2 ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soChromosome.nCurrentLength_ = strlen( soChromosome.data() ); THROW_ERROR2( "data error with Ian, nToken = " + RWCString( (long) nTokens ) + " converted string is " + soChromosome + " remaining string is " + soLine + " line " + RWCString( (long) nLinesRead ) ); } ++nLinesRead; region* pRegion = new region( soChromosome, nStart, nEnd ); aIanRegion.append( pRegion ); if ( pRegion->nGetLength() > nLongestIanRegion ) nLongestIanRegion = pRegion->nGetLength(); } fclose( pIan ); aTristanRegion.resort(); aIanRegion.resort(); printf( "original tristan regions: %d\n", aTristanRegion.length() ); printf( "original ian regions: %d\n", aIanRegion.length() ); // do any ian regions overlap? int nIntersectingIanRegions = 0; int nIan; for( nIan = 1; nIan < aIanRegion.length(); ++nIan ) { region* pIan1 = aIanRegion[nIan-1]; region* pIan2 = aIanRegion[nIan]; if ( bRegionsOverlap( pIan1, pIan2 ) ) { ++nIntersectingIanRegions; } } printf( "ian regions overlapping: %d\n", nIntersectingIanRegions ); // how many tristan regions are there that don't overlap any ian regions? int nTristanRegionsWithoutIan = 0; int nTristan; for( nTristan = 0; nTristan < aTristanRegion.length(); ++nTristan ) { region* pTristan = aTristanRegion[ nTristan ]; bool bFoundOverlapWithIan = false; int nIan = aIanRegion.nFindIndexOfMatchOrPredecessor( pTristan ); if ( nIan == RW_NPOS ) nIan = 0; while( ( nIan < aIanRegion.length() ) ) { region* pIan = aIanRegion[ nIan ]; if ( bRegionsOverlap( pTristan, pIan ) ) { bFoundOverlapWithIan = true; break; } if ( bRegionsLessThan( pTristan, pIan ) ) { break; } ++nIan; } if ( !bFoundOverlapWithIan ) { ++nTristanRegionsWithoutIan; } } printf( "tristan regions without ian: %d\n", nTristanRegionsWithoutIan ); // how many ian regions are there without any tristan? int nIanRegionsWithoutTristan = 0; for( nIan = 0; nIan < aIanRegion.length(); ++nIan ) { region* pIan = aIanRegion[ nIan ]; region myRegion = *pIan; myRegion.nStart_ -= nLongestTristanRegion; bool bFoundOverlapWithTristan = false; int nTristan = aTristanRegion.nFindIndexOfMatchOrPredecessor( &myRegion ); if ( nTristan == RW_NPOS ) nTristan = 0; while( ( nTristan < aTristanRegion.length() ) ) { region* pTristan = aTristanRegion[ nTristan ]; if ( bRegionsOverlap( pIan, pTristan ) ) { bFoundOverlapWithTristan = true; break; } if ( bRegionsLessThan( pIan, pTristan ) ) { break; } ++nTristan; } if ( !bFoundOverlapWithTristan ) { printf( "ian region without overlap with tristan: %s %d %d\n", pIan->soChromosome_.data(), pIan->nStart_, pIan->nEnd_ ); ++nIanRegionsWithoutTristan; } } printf( "ian regions without tristan: %d\n", nIanRegionsWithoutTristan ); // ian regions overlapping multiple Tristan regions RWTValOrderedVector aHistogramIan; for( nIan = 0; nIan < aIanRegion.length(); ++nIan ) { region* pIan = aIanRegion[ nIan ]; region myRegion = *pIan; myRegion.nStart_ -= nLongestTristanRegion; int nTristan = aTristanRegion.nFindIndexOfMatchOrPredecessor( &myRegion ); RWTPtrOrderedVector aTristanRegionsWithOverlap; if ( nTristan == RW_NPOS ) nTristan = 0; int nNumberOfOverlapsWithThisIanRegion = 0; while( ( nTristan < aTristanRegion.length() ) ) { region* pTristan = aTristanRegion[ nTristan ]; if ( bRegionsOverlap( pIan, pTristan ) ) { ++nNumberOfOverlapsWithThisIanRegion; aTristanRegionsWithOverlap.append( pTristan ); } if ( bRegionsLessThan( pIan, pTristan ) ) { break; } ++nTristan; } while( nNumberOfOverlapsWithThisIanRegion >= aHistogramIan.length() ) { aHistogramIan.append( 0 ); } ++aHistogramIan[ nNumberOfOverlapsWithThisIanRegion ]; if ( nNumberOfOverlapsWithThisIanRegion > 1 ) { pIan->bMultipleOverlaps_ = true; for( int n = 0; n < aTristanRegionsWithOverlap.length(); ++n ) { region* pTristan = aTristanRegionsWithOverlap[n]; pTristan->bMultipleOverlaps_ = true; } } } printf( "histogram\n" ); printf( "# of overlaps # of ian regions with this # of overlaps\n" ); int nNumberOfOverlaps; for( nNumberOfOverlaps = 0; nNumberOfOverlaps < aHistogramIan.length(); ++nNumberOfOverlaps ) { printf( "%d %d\n", nNumberOfOverlaps, aHistogramIan[ nNumberOfOverlaps ] ); } // Tristan regions overlapping multiple Ian regions RWTValOrderedVector aHistogramTristan; for( nTristan = 0; nTristan < aTristanRegion.length(); ++nTristan ) { region* pTristan = aTristanRegion[ nTristan ]; region myRegion = *pTristan; myRegion.nStart_ -= nLongestIanRegion; int nIan = aIanRegion.nFindIndexOfMatchOrPredecessor( &myRegion ); if ( nIan == RW_NPOS ) nIan = 0; int nNumberOfOverlapsWithThisTristanRegion = 0; while( ( nIan < aIanRegion.length() ) ) { region* pIan = aIanRegion[ nIan ]; if ( bRegionsOverlap( pTristan, pIan ) ) { ++nNumberOfOverlapsWithThisTristanRegion; } if ( bRegionsLessThan( pTristan, pIan ) ) { break; } ++nIan; } while( nNumberOfOverlapsWithThisTristanRegion >= aHistogramTristan.length() ) { aHistogramTristan.append( 0 ); } ++aHistogramTristan[ nNumberOfOverlapsWithThisTristanRegion ]; } printf( "histogram\n" ); printf( "# of overlaps # of Tristan regions with this # of overlaps\n" ); for( nNumberOfOverlaps = 0; nNumberOfOverlaps < aHistogramTristan.length(); ++nNumberOfOverlaps ) { printf( "%d %d\n", nNumberOfOverlaps, aHistogramTristan[ nNumberOfOverlaps ] ); } // eliminate regions that have multiple overlaps mbtPtrOrderedVector aTristanRegionTemp = aTristanRegion; mbtPtrOrderedVector aIanRegionTemp = aIanRegion; aTristanRegion.clear(); aIanRegion.clear(); int n; for( n = 0; n < aTristanRegionTemp.length(); ++n ) { region* pRegion = aTristanRegionTemp[ n ]; if ( !pRegion->bMultipleOverlaps_ ) { aTristanRegion.append( pRegion ); } } for( n = 0; n < aIanRegionTemp.length(); ++n ) { region* pRegion = aIanRegionTemp[ n ]; if ( !pRegion->bMultipleOverlaps_ ) { aIanRegion.append( pRegion ); } } printf( "after eliminating Tristan and Ian regions in which any Ian region overlaps multiple Tristan regions\n" ); printf( "now tristan: %d\n", aTristanRegion.length() ); printf( "now ian: %d\n", aIanRegion.length() ); // go through and eliminate ian regions that overlap int nIntersectingRegions = 0; for( nTristan = 1; nTristan < aTristanRegion.length(); ++nTristan ) { region* pTristan1 = aTristanRegion[nTristan-1]; region* pTristan2 = aTristanRegion[nTristan]; if ( ( pTristan1->soChromosome_ == pTristan2->soChromosome_ ) && ( bIntervalsIntersect( pTristan1->nStart_, pTristan1->nEnd_, pTristan2->nStart_, pTristan2->nEnd_ ) ) ) { // how much do they overlap by? int nOverlap = pTristan1->nEnd_ - pTristan2->nStart_ + 1; printf( "overlap by %d\n", nOverlap ); pTristan1->bOKToUse_ = false; pTristan2->bOKToUse_ = false; ++nIntersectingRegions; // find corresponding region in aIan region regionToSearch( pTristan1->soChromosome_, MIN( pTristan1->nStart_, pTristan2->nStart_ ) - nLongestIanRegion, MAX( pTristan1->nEnd_, pTristan2->nEnd_ ) ); region* pCombinedTristan = new region( pTristan1->soChromosome_, MIN( pTristan1->nStart_, pTristan2->nStart_ ), MAX( pTristan1->nEnd_, pTristan2->nEnd_ ) ); int nIndex = aIanRegion.nFindIndexOfMatchOrPredecessor( ®ionToSearch ); if ( nIndex == RW_NPOS ) nIndex = 0; while( nIndex < aIanRegion.length() ) { region* pIanRegion = aIanRegion[ nIndex ]; if ( bRegionsOverlap( pCombinedTristan, pIanRegion ) ) { pIanRegion->bOKToUse_ = false; } else { if ( bRegionsLessThan( pCombinedTristan, pIanRegion ) ) break; } ++nIndex; } } // if ( ( pTristan1->soChromosome_ == pTristan2->soChromosome_ ) && } printf( "tristan regions that overlap previous region: %d\n", nIntersectingRegions ); aTristanRegionTemp = aTristanRegion; aIanRegionTemp = aIanRegion; aTristanRegion.clear(); aIanRegion.clear(); for( n = 0; n < aTristanRegionTemp.length(); ++n ) { region* pRegion = aTristanRegionTemp[ n ]; if ( pRegion->bOKToUse_ ) { aTristanRegion.append( pRegion ); } } for( n = 0; n < aIanRegionTemp.length(); ++n ) { region* pRegion = aIanRegionTemp[ n ]; if ( pRegion->bOKToUse_ ) { aIanRegion.append( pRegion ); } } printf( "now tristan: %d\n", aTristanRegion.length() ); printf( "now ian: %d\n", aIanRegion.length() ); aTristanRegionTemp.clear(); aIanRegionTemp.clear(); // look for regions in Ian that have many points than in Tristan int nIanWithinTristan = 0; int nTristanWithinIan = 0; int nNotEither = 0; int nRegionsWithLotsOfExtraBases = 0; for( nIan = 0; nIan < aIanRegion.length(); ++nIan ) { region* pIan = aIanRegion[ nIan ]; region regionToSearch = *pIan; regionToSearch.nStart_ -= nLongestTristanRegion; int nTristan = aTristanRegion.nFindIndexOfMatchOrPredecessor( ®ionToSearch ); if ( nTristan == RW_NPOS ) nTristan = 0; while( nTristan < aTristanRegion.length() ) { region* pTristan = aTristanRegion[ nTristan ]; if ( bRegionsLessThan( pIan, pTristan ) ) break; if ( bRegionsOverlap( pTristan, pIan ) ) { // how much do they overlap and how much do they // not overlap? int nIntersectStart; int nIntersectEnd; assert( bIntersect( pTristan->nStart_, pTristan->nEnd_, pIan->nStart_, pIan->nEnd_, nIntersectStart, nIntersectEnd ) ); int nIntersectingSize = nIntersectEnd - nIntersectStart + 1; int nNonIntersectingSizeAtBeginning = MAX( pTristan->nStart_, pIan->nStart_ ) - MIN( pTristan->nStart_, pIan->nStart_ ); int nNonIntersectingSizeAtEnd = MAX( pTristan->nEnd_, pIan->nEnd_ ) - MIN( pTristan->nEnd_, pIan->nEnd_ ); assert( pIan->nStart_ < pTristan->nStart_ ); assert( pTristan->nEnd_ < pIan->nEnd_ ); printf( "intersecting size: %d noninter left: %d noninter right %d bp ", nIntersectingSize, nNonIntersectingSizeAtBeginning, nNonIntersectingSizeAtEnd ); if ( bRegionContainedWithinRegion( pTristan, pIan ) ) { ++nTristanWithinIan; } else if ( bRegionContainedWithinRegion( pIan, pTristan ) ) { ++nIanWithinTristan; } else { ++nNotEither; } if ( ( nNonIntersectingSizeAtBeginning + nNonIntersectingSizeAtEnd ) > 4 ) { pTristan->bOKToUse_ = false; pIan->bOKToUse_ = false; ++nRegionsWithLotsOfExtraBases; printf( " big " ); } printf( "\n" ); } // if ( bRegionsOverlap( pTristan, pIan ) ) { ++nTristan; } // while( nTristan < aTristanRegion.length() ) { } // for( int nIan = 0; nIan < aIanRegion.length(); ++nIan ) { printf( "regions with lots of extra bases: %d\n", nRegionsWithLotsOfExtraBases ); printf( "tristan within ian: %d\n", nTristanWithinIan ); printf( "ian within tristan: %d\n", nIanWithinTristan ); printf( "not either : %d\n", nNotEither ); // again eliminate not used aTristanRegionTemp = aTristanRegion; aIanRegionTemp = aIanRegion; aTristanRegion.clear(); aIanRegion.clear(); for( n = 0; n < aTristanRegionTemp.length(); ++n ) { region* pRegion = aTristanRegionTemp[ n ]; if ( pRegion->bOKToUse_ ) { aTristanRegion.append( pRegion ); } } for( n = 0; n < aIanRegionTemp.length(); ++n ) { region* pRegion = aIanRegionTemp[ n ]; if ( pRegion->bOKToUse_ ) { aIanRegion.append( pRegion ); } } aTristanRegionTemp.clear(); aIanRegionTemp.clear(); printf( "tristan regions: %d\n", aTristanRegion.length() ); printf( "ian regions: %d\n", aIanRegion.length() ); fflush( stdout ); } static RWCString soChromosomeBases; static mbtValOrderedVectorOfRWCString aHumanGenomeFOF( (size_t) 50 ); static void loadChromosome( const RWCString& soChromosome ) { RWCString soPattern = soChromosome + ".fa"; // find the corresponding file FileName filChromosome; for( int n = 0; n < aHumanGenomeFOF.length(); ++n ) { FileName filChromosomeTemp = aHumanGenomeFOF[n]; if ( filChromosomeTemp.bContains( soPattern ) ) { filChromosome = filChromosomeTemp; break; } } assert( !filChromosome.bIsNull() ); FILE* pChromosome = fopen( filChromosome.data(), "r" ); if ( !pChromosome ) { THROW_FILE_ERROR( filChromosome ); } soChromosomeBases.increaseButNotDecreaseMaxLength( 260000000 ); soChromosomeBases = " "; // make 1-based // throw away header assert( fgets( soLine.data(), nMaxLineSize, pChromosome ) != NULL ); assert( soLine.sz_[0] == '>' ); while( fgets( soLine.data(), nMaxLineSize, pChromosome ) != NULL ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); soChromosomeBases += soLine; } fclose( pChromosome ); } // this is for examining motifs within the exome static void run_100814() { setNumberFromBaseTable(); // looks like: // chr10:82995-84056 const FileName filExomeRegions = "/codon/gordon/misc/geneClassifications/otherExomeSegments/nimblegen_solution_ccds_2008.HG18.list"; const FileName filHumanGenomeFOF = "/codon/gordon/genomes/human_genome_hg18/human_genome.fof"; const FileName filScoreMatrix = "/codon/gordon/misc/geneClassifications/exomeMotifs/run_100819/scoreMatrix.txt"; const int nScoreMatrixColumns = 9; const float fBinSize = 1.0; FILE* pExome = fopen( filExomeRegions.data(), "r" ); if ( !pExome ) { THROW_FILE_ERROR( filExomeRegions ); } mbtPtrOrderedVector aExomeRegion( 160000 ); RWCString soChromosome( (size_t) 1000); int nStart; int nEnd; int nTokens; int nLinesRead = 1; int nLongestExomeRegion = 0; while( fgets( soLine.data(), nMaxLineSize, pExome ) != NULL ) { soLine.nCurrentLength_ = strlen( soLine.data() ); int nColon = soLine.index( ":" ); assert( nColon != RW_NPOS ); int nDash = soLine.index( "-", nColon + 1 ); assert( nDash != RW_NPOS ); RWCString soChromosome = soLine( 0, nColon ); nTokens = sscanf( soLine.soGetRestOfString( nColon + 1 ).data(), "%d-%d", &nStart, &nEnd ); if ( nTokens != 2 ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soChromosome.nCurrentLength_ = strlen( soChromosome.data() ); THROW_ERROR2( "data error with Exome, nToken = " + RWCString( (long) nTokens ) + " converted string is " + soChromosome + " remaining string is " + soLine + " line " + RWCString( (long) nLinesRead ) ); } ++nLinesRead; region* pRegion = new region( soChromosome, nStart, nEnd ); aExomeRegion.append( pRegion ); if ( pRegion->nGetLength() > nLongestExomeRegion ) nLongestExomeRegion = pRegion->nGetLength(); } fclose( pExome ); aExomeRegion.resort(); FILE* pHumanGenomeFOF = fopen( filHumanGenomeFOF.data(), "r" ); if ( !pHumanGenomeFOF ) { THROW_FILE_ERROR( filHumanGenomeFOF ); } while( fgets( soLine.data(), nMaxLineSize, pHumanGenomeFOF ) != NULL ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); aHumanGenomeFOF.append( soLine ); } fclose( pHumanGenomeFOF ); float fScoreMatrix[nScoreMatrixColumns][4]; int nCountMatrix[nScoreMatrixColumns][4]; for( int nCol = 0; nCol < nScoreMatrixColumns; ++nCol ) { for( int nRow = 0; nRow < 4; ++nRow ) { nCountMatrix[nCol][nRow] = 0; } } FILE* pScoreMatrix = fopen( filScoreMatrix.data(), "r" ); if ( !pScoreMatrix ) { THROW_FILE_ERROR( filScoreMatrix ); } int nRow = -1; // ready for ++ while( fgets( soLine.data(), nMaxLineSize, pScoreMatrix ) != NULL ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); if ( soLine.bIsNull() ) continue; ++nRow; RWCTokenizer tok( soLine ); printf( "parsing score matrix line %s\n", soLine.data() ); for( int n = 0; n < nScoreMatrixColumns; ++n ) { RWCString soNumber = tok(); if ( n == 0 && (soNumber == "A" || soNumber == "C" || soNumber == "G" || soNumber == "T" ) ) { soNumber = tok(); } printf( "%s ", soNumber.data() ); fflush( stdout ); assert( !soNumber.bIsNull() ); float f; errno = 0; char* szEndPtr; #ifndef SOLARIS f = strtof( soNumber.data(), &szEndPtr ); assert( errno == 0 ); if ( *szEndPtr != 0 ) { RWCString soError = "floating point number " + soNumber + " didn't not convert ok, left over was " + RWCString( szEndPtr ); THROW_ERROR( soError ); } #else f = atof( soNumber.data() ); #endif fScoreMatrix[ n ][ nRow ] = f; } printf( "\n" ); } // 0 to 3 for the 4 nucleotides assert( nRow == 3 ); RWCString soExome( (size_t) 100000000 ); RWCString soCurrentChromosome; RWTValVector aExomeBaseCounts( 4, 0, "aExomeBaseCounts" ); for( int nRegion = 0; nRegion < aExomeRegion.length(); ++nRegion ) { region* pRegion = aExomeRegion[ nRegion ]; if ( pRegion->soChromosome_ != soCurrentChromosome ) { // load another chromosome cerr << "reading " << pRegion->soChromosome_ << endl; loadChromosome( pRegion->soChromosome_ ); cerr << "done" << endl; soCurrentChromosome = pRegion->soChromosome_; } soExome += soChromosomeBases( pRegion->nStart_, pRegion->nEnd_ - pRegion->nStart_ + 1 ); for( int n = pRegion->nStart_; n <= pRegion->nEnd_; ++n ) { ++aExomeBaseCounts[ nNumberFromBase[ soChromosomeBases[n] ] ]; } soExome += "X"; } cerr << "done reading exome bases" << endl; // and now reverse complement this exome // there will be two X's in the middle of the string and // no X on the end soExome += soComplementSO( soExome ); cerr << "done complementing exome bases" << endl; // calculate lowest possible score so we can figure out the // offset float fTotalLeast = 0.0; for( int nColumn = 0; nColumn < nScoreMatrixColumns; ++nColumn ) { float fLeast = fScoreMatrix[nColumn][0]; for( int nBase = 1; nBase < 4; ++nBase ) { fLeast = MIN( fLeast, fScoreMatrix[nColumn][nBase] ); } fTotalLeast += fLeast; } int nHistogramOffset = -( floor( fTotalLeast / fBinSize ) ); cerr << "histogram offset = " << nHistogramOffset << endl; RWTValOrderedVector aHistogram; aHistogram.soName_ = "histogram"; // debugging int nTempCount = 0; // end debugging int nLastExomePos = soExome.length() - nScoreMatrixColumns; for( int nExomePos = 0; nExomePos <= nLastExomePos; ++nExomePos ) { // check if this has an X float fTotalScore = 0.0; bool bFoundX = false; int n; for( n = nScoreMatrixColumns - 1; n >= 0; --n ) { if ( soExome[ nExomePos + n ] == 'X' ) { bFoundX = true; break; } fTotalScore += fScoreMatrix[n][nNumberFromBase[ soExome[ nExomePos + n ] ] ]; } if ( bFoundX ) continue; // debugging if ( fTotalScore >= 20.0 ) { RWCString soPartOfExome = soExome( nExomePos, nScoreMatrixColumns ); cerr << soPartOfExome << " with score " << fTotalScore << endl; } if ( ( -20 <= fTotalScore) && (fTotalScore < -15 ) ) { ++nTempCount; } for( n = nScoreMatrixColumns - 1; n >= 0; --n ) { ++nCountMatrix[n][nNumberFromBase[ soExome[ nExomePos + n ] ] ]; } // end debugging int nBin = floor( fTotalScore / fBinSize ) + nHistogramOffset; assert( nBin >= 0 ); while( nBin >= aHistogram.length() ) { aHistogram.append( 0 ); } assert( nBin < aHistogram.length() ); ++aHistogram[ nBin ]; } // debugging cerr << "-20 <= x < -15: " << nTempCount << endl; // end debugging cerr << "done calculating histogram" << endl; cerr.flush(); // print matrix printf( "score matrix:\n" ); int nBase; for( nBase = 0; nBase < 4; ++nBase ) { printf( "base: %d ", nBase ); for( int n = 0; n < nScoreMatrixColumns; ++n ) { printf( " %4.1f", fScoreMatrix[n][nBase] ); } printf( "\n" ); } printf( "count matrix:\n" ); for( nBase = 0; nBase < 4; ++nBase ) { printf( "base: %d ", nBase ); for( int n = 0; n < nScoreMatrixColumns; ++n ) { printf( " %10s", soAddCommas( nCountMatrix[n][nBase] ).data() ); } printf( "\n" ); } int nExomeSize = 0; printf( "base counts in exome:\n" ); for( nBase = 0; nBase < 4; ++nBase ) { printf( "base %d : %d\n", nBase, aExomeBaseCounts[nBase] ); nExomeSize += aExomeBaseCounts[nBase]; } float fGCPerCent = ( aExomeBaseCounts[1] + aExomeBaseCounts[2] ) * 100.0 / ( aExomeBaseCounts[0] + aExomeBaseCounts[1] + aExomeBaseCounts[2] + aExomeBaseCounts[3] ); printf( "gc : %.1f %%\n", fGCPerCent ); // debugging cerr << "aHistogram.length() = " << aHistogram.length() << endl; // end debugging // print histogram int nCumulativeCounts = 0; for( int nBin = aHistogram.length() - 1; nBin >= 0; --nBin ) { int nBin2 = nBin - nHistogramOffset; float fUpperBoundOfBin = ( nBin2 + 1 ) * fBinSize; int nCountsThisBin = aHistogram[ nBin ]; nCumulativeCounts += nCountsThisBin; int nMeanDistanceBetweenMotifs = 2*nExomeSize / nCumulativeCounts; // skipping negative values if ( fUpperBoundOfBin < 0 ) continue; printf( "%6.1f <= x < %6.1f : %10s %15s %10s\n", nBin2 * fBinSize, ( nBin2 + 1 ) * fBinSize, soAddCommas( nCountsThisBin ).data(), soAddCommas( nCumulativeCounts ).data(), soAddCommas( nMeanDistanceBetweenMotifs ).data() ); } printf( "score counts in bin cum counts mean dist between motifs\n" ); // so that stdout buffer is flushed //exit(0); fflush( stdout ); } const int nNumberOfPutativeExons = 160000; static RWTValOrderedVector aExomeToHumanGenome0E( (size_t) 2*nNumberOfPutativeExons ); static RWTValOrderedVector aExomeToHumanGenomeChrName( (size_t) 2*nNumberOfPutativeExons ); static RWTValOrderedVector aExomeToHumanGenome1ChrPos( (size_t) 2*nNumberOfPutativeExons ); static void convertExomePosToChromosome( const int nExomePos, RWCString& soChromosome, int& n1ChrPos ) { for( int nExome = 0; nExome < aExomeToHumanGenome0E.length() - 1; ++nExome ) { if ( ( aExomeToHumanGenome0E[ nExome ] <= nExomePos ) && ( nExomePos < aExomeToHumanGenome0E[ nExome + 1 ] ) ) { soChromosome = aExomeToHumanGenomeChrName[ nExome ]; if ( soChromosome.bEndsWith( ".comp" ) ) { n1ChrPos = aExomeToHumanGenome1ChrPos[ nExome ] - ( nExomePos - aExomeToHumanGenome0E[ nExome ] ); } else { n1ChrPos = nExomePos - aExomeToHumanGenome0E[ nExome ] + aExomeToHumanGenome1ChrPos[ nExome ]; } return; } } // if reached here, something is wrong RWCString soError = "trying to find chr pos for exome pos " + RWCString( (long) nExomePos ) + " but the largest is " + RWCString( (long) aExomeToHumanGenome0E.last() ) + " " + aExomeToHumanGenomeChrName.last() + " " + RWCString( (long) aExomeToHumanGenome1ChrPos.last() ); THROW_ERROR( soError ); } // this is for examining motifs within the exome static void run_100820() { cerr << "in run_100820" << endl; cerr.flush(); setNumberFromBaseTable(); // looks like: // chr10:82995-84056 const FileName filExomeRegions = "/codon/gordon/misc/geneClassifications/otherExomeSegments/nimblegen_solution_ccds_2008.HG18.list"; const FileName filHumanGenomeFOF = "/codon/gordon/genomes/human_genome_hg18/human_genome.fof"; const FileName filScoreMatrix = "scoreMatrix.txt"; const int nScoreMatrixColumns = 11; const int nSizeOfMotif = nScoreMatrixColumns; const float fScoreThreshold = 11.0; const float fBinSize = 1.0; FILE* pExome = fopen( filExomeRegions.data(), "r" ); if ( !pExome ) { THROW_FILE_ERROR( filExomeRegions ); } mbtPtrOrderedVector aExomeRegion( 160000 ); RWCString soChromosome( (size_t) 1000); int nStart; int nEnd; int nTokens; int nLinesRead = 1; int nLongestExomeRegion = 0; while( fgets( soLine.data(), nMaxLineSize, pExome ) != NULL ) { soLine.nCurrentLength_ = strlen( soLine.data() ); int nColon = soLine.index( ":" ); assert( nColon != RW_NPOS ); int nDash = soLine.index( "-", nColon + 1 ); assert( nDash != RW_NPOS ); RWCString soChromosome = soLine( 0, nColon ); nTokens = sscanf( soLine.soGetRestOfString( nColon + 1 ).data(), "%d-%d", &nStart, &nEnd ); if ( nTokens != 2 ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soChromosome.nCurrentLength_ = strlen( soChromosome.data() ); THROW_ERROR2( "data error with Exome, nToken = " + RWCString( (long) nTokens ) + " converted string is " + soChromosome + " remaining string is " + soLine + " line " + RWCString( (long) nLinesRead ) ); } ++nLinesRead; region* pRegion = new region( soChromosome, nStart, nEnd ); aExomeRegion.append( pRegion ); if ( pRegion->nGetLength() > nLongestExomeRegion ) nLongestExomeRegion = pRegion->nGetLength(); } fclose( pExome ); aExomeRegion.resort(); FILE* pHumanGenomeFOF = fopen( filHumanGenomeFOF.data(), "r" ); if ( !pHumanGenomeFOF ) { THROW_FILE_ERROR( filHumanGenomeFOF ); } while( fgets( soLine.data(), nMaxLineSize, pHumanGenomeFOF ) != NULL ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); aHumanGenomeFOF.append( soLine ); } fclose( pHumanGenomeFOF ); float fScoreMatrix[nScoreMatrixColumns][4]; int nCountMatrix[nScoreMatrixColumns][4]; for( int nCol = 0; nCol < nScoreMatrixColumns; ++nCol ) { for( int nRow = 0; nRow < 4; ++nRow ) { nCountMatrix[nCol][nRow] = 0; } } FILE* pScoreMatrix = fopen( filScoreMatrix.data(), "r" ); if ( !pScoreMatrix ) { THROW_FILE_ERROR( filScoreMatrix ); } int nRow = -1; // ready for ++ while( fgets( soLine.data(), nMaxLineSize, pScoreMatrix ) != NULL ) { soLine.nCurrentLength_ = strlen( soLine.data() ); soLine.stripTrailingWhitespaceFast(); if ( soLine.bIsNull() ) continue; ++nRow; RWCTokenizer tok( soLine ); printf( "parsing score matrix line %s\n", soLine.data() ); for( int n = 0; n < nScoreMatrixColumns; ++n ) { RWCString soNumber = tok(); if ( n == 0 && (soNumber == "A" || soNumber == "C" || soNumber == "G" || soNumber == "T" ) ) { soNumber = tok(); } printf( "%s ", soNumber.data() ); fflush( stdout ); assert( !soNumber.bIsNull() ); float f; errno = 0; char* szEndPtr; #ifndef SOLARIS f = strtof( soNumber.data(), &szEndPtr ); assert( errno == 0 ); if ( *szEndPtr != 0 ) { RWCString soError = "floating point number " + soNumber + " didn't not convert ok, left over was " + RWCString( szEndPtr ); THROW_ERROR( soError ); } #else f = atof( soNumber.data() ); #endif fScoreMatrix[ n ][ nRow ] = f; } printf( "\n" ); } // 0 to 3 for the 4 nucleotides assert( nRow == 3 ); RWCString soExome( (size_t) 100000000 ); RWCString soCurrentChromosome; RWTValVector aExomeBaseCounts( 4, 0, "aExomeBaseCounts" ); aExomeToHumanGenome0E.soName_ = "aExomeToHumanGenome0E"; aExomeToHumanGenomeChrName.soName_ = "aExomeToHumanGenomeChrName"; aExomeToHumanGenome1ChrPos.soName_ = "aExomeToHumanGenome1ChrPos"; int nRegion; for( nRegion = 0; nRegion < aExomeRegion.length(); ++nRegion ) { region* pRegion = aExomeRegion[ nRegion ]; if ( pRegion->soChromosome_ != soCurrentChromosome ) { // load another chromosome cerr << "reading " << pRegion->soChromosome_ << endl; loadChromosome( pRegion->soChromosome_ ); cerr << "done" << endl; soCurrentChromosome = pRegion->soChromosome_; } aExomeToHumanGenome0E.append( soExome.length() ); aExomeToHumanGenomeChrName.append( pRegion->soChromosome_ ); aExomeToHumanGenome1ChrPos.append( pRegion->nStart_ ); soExome += soChromosomeBases( pRegion->nStart_, pRegion->nEnd_ - pRegion->nStart_ + 1 ); for( int n = pRegion->nStart_; n <= pRegion->nEnd_; ++n ) { ++aExomeBaseCounts[ nNumberFromBase[ soChromosomeBases[n] ] ]; } soExome += "X"; } cerr << "done reading exome bases" << endl; // and now reverse complement this exome // there will be two X's in the middle of the string and // no X on the end int nExomeSizeBeforeAddingComplement = soExome.length(); soExome += soComplementSO( soExome ); cerr << "done complementing exome bases" << endl; // need to add all of the complemented exons // there are 2 X's in a row at the // boundary. nExomeSizeBeforeAddingComplement points to the 2nd X, // and the +1 points to the first base in the complemented exome int n0ExomeStartPos = nExomeSizeBeforeAddingComplement + 1; int nMatch = 0; int nMismatch = 0; for( nRegion = aExomeRegion.length() - 1; nRegion >= 0; --nRegion ) { region* pRegion = aExomeRegion[ nRegion ]; assert( soExome[ n0ExomeStartPos - 1 ] == 'X' ); aExomeToHumanGenome0E.append( n0ExomeStartPos ); aExomeToHumanGenomeChrName.append( pRegion->soChromosome_ + ".comp" ); aExomeToHumanGenome1ChrPos.append( pRegion->nEnd_ ); // sanity check--checks chr1 and chrY if ( ( pRegion->soChromosome_ == "chr1" ) && ( pRegion->soChromosome_ != soCurrentChromosome ) ) { loadChromosome( pRegion->soChromosome_ ); soCurrentChromosome = pRegion->soChromosome_; } if ( pRegion->soChromosome_ == soCurrentChromosome ) { if ( soExome[ n0ExomeStartPos ] == cComplementBase( soChromosomeBases[ pRegion->nEnd_ ] ) ) { ++nMatch; } else { ++nMismatch; cerr << soExome[ n0ExomeStartPos ] << " " << cComplementBase( soChromosomeBases[ pRegion->nEnd_ ] ) << endl; } } // end sanity check int nLength = pRegion->nEnd_ - pRegion->nStart_ + 1; n0ExomeStartPos += nLength; n0ExomeStartPos += 1; // for the "X" } assert( nMismatch == 0 ); cerr << "match: " << nMatch << " mismatch: " << nMismatch << endl; // add a sentinel aExomeToHumanGenome0E.append( soExome.length() ); aExomeToHumanGenomeChrName.append( "SENTINEL" ); aExomeToHumanGenome1ChrPos.append( 1 ); // calculate lowest possible score so we can figure out the // offset float fTotalLeast = 0.0; for( int nColumn = 0; nColumn < nScoreMatrixColumns; ++nColumn ) { float fLeast = fScoreMatrix[nColumn][0]; for( int nBase = 1; nBase < 4; ++nBase ) { fLeast = MIN( fLeast, fScoreMatrix[nColumn][nBase] ); } fTotalLeast += fLeast; } int nHistogramOffset = -( floor( fTotalLeast / fBinSize ) ); cerr << "histogram offset = " << nHistogramOffset << endl; RWTValOrderedVector aHistogram; aHistogram.soName_ = "histogram"; FILE* pLocations = fopen( "locations.txt", "w" ); if ( !pLocations ) { THROW_FILE_ERROR( "locations.txt" ); } int nLastExomePos = soExome.length() - nScoreMatrixColumns; for( int nExomePos = 0; nExomePos <= nLastExomePos; ++nExomePos ) { // check if this has an X float fTotalScore = 0.0; bool bFoundX = false; int n; for( n = nScoreMatrixColumns - 1; n >= 0; --n ) { if ( soExome[ nExomePos + n ] == 'X' ) { bFoundX = true; break; } fTotalScore += fScoreMatrix[n][nNumberFromBase[ soExome[ nExomePos + n ] ] ]; } if ( bFoundX ) continue; if ( fTotalScore >= fScoreThreshold ) { RWCString soPartOfExome = soExome( nExomePos, nScoreMatrixColumns ); RWCString soChromosome; int n1ChrPos; convertExomePosToChromosome( nExomePos, soChromosome, n1ChrPos ); printf( "exome position %d score %.1f %s %s %d\n", nExomePos, fTotalScore, soPartOfExome.data(), soChromosome.data(), n1ChrPos ); RWCString soChromosomeWithoutComp= soChromosome; int n1ChrPosAtEndOfMotif = n1ChrPos; if ( soChromosomeWithoutComp.bEndsWithAndRemove( ".comp" ) ) { n1ChrPosAtEndOfMotif -= ( nSizeOfMotif - 1); } else { n1ChrPosAtEndOfMotif += ( nSizeOfMotif - 1 ); } fprintf( pLocations, "%s %d\n", soChromosomeWithoutComp.data(), n1ChrPosAtEndOfMotif ); } int nBin = floor( fTotalScore / fBinSize ) + nHistogramOffset; assert( nBin >= 0 ); while( nBin >= aHistogram.length() ) { aHistogram.append( 0 ); } assert( nBin < aHistogram.length() ); ++aHistogram[ nBin ]; } fclose( pLocations ); // print histogram int nCumulativeCounts = 0; for( int nBin = aHistogram.length() - 1; nBin >= 0; --nBin ) { int nBin2 = nBin - nHistogramOffset; float fUpperBoundOfBin = ( nBin2 + 1 ) * fBinSize; int nCountsThisBin = aHistogram[ nBin ]; nCumulativeCounts += nCountsThisBin; //int nMeanDistanceBetweenMotifs = 2*nExomeSize / nCumulativeCounts; // skipping negative values if ( fUpperBoundOfBin < 0 ) continue; printf( "%6.1f <= x < %6.1f : %10s %15s\n", nBin2 * fBinSize, ( nBin2 + 1 ) * fBinSize, soAddCommas( nCountsThisBin ).data(), soAddCommas( nCumulativeCounts ).data() ); // soAddCommas( nMeanDistanceBetweenMotifs ).data() ); } printf( "score counts in bin cum counts mean dist between motifs\n" ); // so that stdout buffer is flushed fflush( stdout ); } void miscProgram() { cerr << "in miscProgram" << endl; cerr.flush(); // run_100812(); // run_100813(); //run_100814(); run_100820(); } #endif