#!/usr/bin/perl -w # selectRegions.perl # # Usage: # First run alignSelectRegions2Refs.perl # which gives cross_match alignments listed in (e.g.) my_alignments.fof # # Then run: # selectRegions.perl regions.txt my_alignments.fof my_new_ace.ace # in which regions.txt looks like: # BEGIN_SEQ_FASTA # ref1 ref1.fa # ref2 ref2.fa # ref3 ref2.fa # END_SEQ_FASTA # ref1 1 100 # ref1 901 1000 # ref2 1 100 # ref2 901 1000 # ref3 5 200 # there are 3 levels of sequences: fasta files (such as ref1.fa and # ref2.fa above, sequences (starting with ">") within the fasta files # such as ref1, ref2, and ref3 above, and regions within these # sequences as indicated by the 5 above # my_alignments.fof lists the cross_match output files except that # they are each preceded by a block like this: # BEGIN_PHDBALLS # ../phdball_dir/phd.ball.1 # END_PHDBALLS # which indicates which phd balls were aligned against the reference sequences # my_new_ace.ace says what the ace file should be called (the # extension may not start at 1 if there is already a .1 $SIG{__WARN__} = dieWhenGetWarning; sub dieWhenGetWarning { my $szErrorMessage = shift; die "$szErrorMessage"; } $szProgramName = "selectRegions.perl"; $szUsage = "$szProgramName (regions of references) (fof of alignments) (new ace file without .1 .2 ...)"; $szVersion = "090629"; if ( $#ARGV >= 0 ) { if ( $ARGV[0] eq "-V" || $ARGV[0] eq "-v" ) { print "selectRegions.perl version $szVersion\n"; exit( 1 ); } } $nQualityOfReferenceBases = 50; if ( $#ARGV != 2 ) { die "$szUsage"; } $szRegionsFile = $ARGV[0]; $szAlignmentsFOF = $ARGV[1]; $szAceFileRoot = $ARGV[2]; defined( $szConsedHome = $ENV{'CONSED_HOME'} ) || ( $szConsedHome = "/usr/local/genome" ); $szConsed = $szConsedHome . "/bin/consed"; if ( $szAceFileRoot =~ /\.ace\.\d+$/ ) { $szAceFileRoot =~ s/\.\d+$//; } elsif( $szAceFileRoot =~ /\.ace$/ ) { # good } else { $szAceFileRoot .= ".ace"; } # file looks like this # BEGIN_SEQ_FASTA # seq34679 mm-contigs.fa # seq4 mm-contigs.fa # END_SEQ_FASTA # seq34679 1 203 # seq4 1 2765 # where the sequences are seq34679 and seq4 # these sequences are the keys of the associ array %aFastaFileForSequences # For example: $aFastaFileForSequences{ "seq34679" } = "mm-contigs.fa # then there are the associative arrays %aRegionsStart and %aRegionsEnd # which give, for each sequence, a linear array of start and end positions, # respectively # In the case above, $aRegionsEnd{ "seq34679" } is an array with # just one element, 203 open( filRegions, "$szRegionsFile" ) || die "couldn't open $szRegionsFile"; $bFoundBeginSeqFasta = 0; while( ) { # ignore whitespace if (/^\s*$/ ) { next; } if ( /^BEGIN_SEQ_FASTA/ ) { $bFoundBeginSeqFasta = 1; last; } } if ( !$bFoundBeginSeqFasta ) { die "couldn't find BEGIN_SEQ_FASTA in file $szRegionsFile"; } %aFastaFilesForSequences = (); $bFoundEndSeqFasta = 0; while( ) { # ignore whitespace if (/^\s*$/ ) { next; } if ( /^END_SEQ_FASTA/ ) { $bFoundEndSeqFasta = 1; last; } ( $szSequenceName, $szFastaFile ) = split; if ( exists( $aFastaFilesForSequences{ $szSequenceName } ) ) { die "the header indicates that sequence $szSequenceName is both in $szFastaFile and " . $aFastaFilesForSequences{ $szSequenceName }; } $aFastaFilesForSequences{ $szSequenceName } = $szFastaFile; } %aRegionsStart = (); %aRegionsEnd = (); $nNumberOfRegions = 0; while( ) { # ignore whitespace if (/^\s*$/ ) { next; } # looks like: # chrX 151,073,054 151,383,976 # or # chrX 151073054 151383976 @aWords = split; if ( $#aWords != 2 ) { die "regions should be of form chrX 151,073,054 151,383,976 ( 3 tokens) but instead there were " . ( $#aWords + 1 ); } $szSequenceName = $aWords[0]; $n1StartInSequence = $aWords[1]; $n1EndInSequence = $aWords[2]; # get rid of commas $n1StartInSequence =~ s/,//g; $n1EndInSequence =~ s/,//g; if ( $n1StartInSequence !~ /^\d+$/ ) { die "start position is not numeric in $_"; } if ( $n1EndInSequence !~ /^\d+$/ ) { die "end position $n1EndInSequence is not numeric in $_"; } if ( !exists( $aFastaFilesForSequences{ $szSequenceName } ) ) { die "Fatal Error: sequence $szSequenceName as in region $_ does not have a corresponding fasta file in the BEGIN_SEQ_FASTA section above\n"; } if ( !exists $aRegionsStart{ $szSequenceName } ) { $aRegionsStart{ $szSequenceName } = [()]; $aRegionsEnd{ $szSequenceName } = [()]; } $pStartArray = $aRegionsStart{ $szSequenceName }; $pEndArray = $aRegionsEnd{ $szSequenceName }; push( @$pStartArray, $n1StartInSequence ); push( @$pEndArray, $n1EndInSequence ); ++$nNumberOfRegions; } close( filRegions ); # should sort each of the regions--well, not really necessary the # way we are doing it now foreach $szSequenceName ( keys %aRegionsStart ) { $pStartArray = $aRegionsStart{ $szSequenceName }; $pEndArray = $aRegionsEnd{ $szSequenceName }; # sort by start position. Just doing @$pStartArray = sort { ... } @$pStartArray will not work because $pEndArray must be sorted in the same way that # $pStartArray is. So we need to have an array of indices (@aArrayOfIndices) # that is sorted the way $pStartArray needs to be sorted, and then # we use this array to sort both arrays at the same time. $nArraySize = $#$pStartArray + 1; @aArrayOfIndices = (); for( $n = 0; $n < $nArraySize; ++$n ) { push( @aArrayOfIndices, $n ); } @aArrayOfIndices = sort { $$pStartArray[$a] <=> $$pStartArray[$b] } @aArrayOfIndices; # now modify the arrays based on this new order @aTempStart = @$pStartArray; @aTempEnd = @$pEndArray; for( $n = 0; $n < $nArraySize; ++$n ) { $$pStartArray[$n] = $aTempStart[ $aArrayOfIndices[ $n ] ]; $$pEndArray[$n] = $aTempEnd[ $aArrayOfIndices[ $n ] ]; } # check that the list is sorted the way we intended (in ascending order) if ( $#$pStartArray != $#$pEndArray ) { die "arrays are not equal sequence $szSequenceName"; } if ( ( $#$pStartArray + 1 ) != $nArraySize ) { die "arrays are sizes " . ( $#$pStartArray + 1 ) . " but was length " . $nArraySize; } $bSorted = 1; $bIntersecting = 0; for( $n = 1; $n < $nArraySize; ++$n ) { if ( $$pStartArray[ $n - 1 ] > $$pStartArray[ $n ] ) { printf "out of order %d and %d\n", $$pStartArray[ $n - 1 ], $$pStartArray[ $n ]; $bSorted = 0; } if ( &bIntersects( $$pStartArray[ $n - 1 ], $$pEndArray[ $n - 1 ], $$pStartArray[ $n ], $$pEndArray[$n] ) ) { printf "regions intersect in sequence $szSequenceName regions (%d, %d) and (%d, %d)\n", $$pStartArray[ $n - 1 ], $$pEndArray[ $n - 1 ], $$pStartArray[ $n ], $$pEndArray[$n]; $bIntersecting = 1; } } if ( !$bSorted ) { die "array for sequence $szSequenceName didn't sort correctly"; } if ( $bIntersecting ) { die "array for sequence $szSequenceName has intersecting regions"; } } # make a list of the fasta files that has only 1 copy of each fasta file # (values %aFastaFilesForSequences has potentially many copies of each fasta file) %aListOfFastaFiles = (); foreach $szFastaFile ( values %aFastaFilesForSequences) { if ( !exists( $aListOfFastaFiles{ $szFastaFile } ) ) { $aListOfFastaFiles{ $szFastaFile } = 1; } } # make a bit associated array indicating whether we found the desired sequence %aIsSequenceFound = (); foreach $szSequenceName ( keys %aRegionsStart ) { $aIsSequenceFound{ $szSequenceName } = 0; } # create phd ball and ace file with just the reference sequences $szPhdBallDir = "../phdball_dir"; # this is now the full path $szRefPhdBallRelativePath = &szFindNextPhdBallRelativePath(); open( filRefPhdBall, ">$szRefPhdBallRelativePath" ) || die "couldn't open $szRefPhdBallRelativePath for write"; $szRefAceFile = &szFindNextAceFile( $szAceFileRoot ); open( filRefAceFile, ">$szRefAceFile" ) || die "couldn't open $szRefAceFile for write"; $szDateTimeWithDayOfWeek = &szGetDateTimeWithDayOfWeek(); $szDateForCTandWRItems = &szGetDateForCTandWRItems(); print filRefPhdBall "# created by $szProgramName with $nNumberOfRegions regions reference copy $szDateTimeWithDayOfWeek\n"; print filRefAceFile "AS $nNumberOfRegions $nNumberOfRegions\n\n"; foreach $szFastaFile ( sort keys %aListOfFastaFiles ) { open( filReference, "$szFastaFile" ) || die "couldn't open $szFastaFile"; lookForNextSequenceHeader: $bFoundHeaderLine = 0; while( ) { if ( /^>/ ) { $szHeaderLine = $_; $bFoundHeaderLine = 1; last; } } if ( !$bFoundHeaderLine ) { # this means we've reached the end of the file goto close_fasta_file; } foundNextSequenceHeader: # we need to determine which, if any, of the # regions are contained within this sequence $szSequenceName = ( split(' ', $szHeaderLine ) )[0]; $szSequenceName =~ s/^>//; if ( !exists( $aRegionsStart{ $szSequenceName } ) ) { print "no regions in sequence $szSequenceName\n"; goto lookForNextSequenceHeader; } # status print "found desired sequence $szSequenceName in $szFastaFile\n"; if ( !exists( $aFastaFilesForSequences{ $szSequenceName } ) ) { print "found $szSequenceName in $szFastaFile but this sequence is not desired\n"; goto lookForNextSequenceHeader; } if ( $aFastaFilesForSequences{ $szSequenceName } ne $szFastaFile ) { print "$szSequenceName is desired but not the one in $szFastaFile\n"; goto lookForNextSequenceHeader; } # if reached here, we want at least one region in this sequence $aIsSequenceFound{ $szSequenceName } = 1; # save the next sequence header in $szHeaderLine # This will be used by the goto foundNextSequenceHeader; # way below $szHeaderLine = ""; # if it is not filled in when exit loop, that means # we hit the end of the file $szSequenceBases = ""; while( ) { if ( /^>/ ) { $szHeaderLine = $_; last; } chomp; $szSequenceBases .= $_; } # print status print "just read in $szSequenceName of length " . length( $szSequenceBases ) . "\n"; # now we have the sequence. Make the phdball and the ace file $pStartArray = $aRegionsStart{ $szSequenceName }; $pEndArray = $aRegionsEnd{ $szSequenceName }; for( $nRegion = 0; $nRegion <= $#$pStartArray; ++$nRegion ) { $nStartPos = $$pStartArray[ $nRegion ]; $nEndPos = $$pEndArray[ $nRegion ]; $szFakeReadName = $szSequenceName . "_" . &addCommas( $nStartPos ); print filRefPhdBall "BEGIN_SEQUENCE $szFakeReadName 1\n"; print filRefPhdBall "BEGIN_COMMENT\n"; print filRefPhdBall "TIME: $szDateTimeWithDayOfWeek\n"; print filRefPhdBall "CHEM: fasta\n"; print filRefPhdBall "END_COMMENT\n"; print filRefPhdBall "BEGIN_DNA\n"; # now get bases and print them if ( $nEndPos < $nStartPos ) { die "Fatal Error: for $szSequenceName, start = $nStartPos end = $nEndPos are not in order\n"; } if ( $nStartPos < 1 ) { die "Fatal Error: for $szSequenceName, start = $nStartPos is less than 1 "; } if ( $nEndPos > length( $szSequenceBases ) ) { die "Fatal Error: for $szSequenceName, end = $nEndPos is beyond the end of the sequence which is at " . ( length( $szSequenceBases ) ); } $nBasesThisRead = $nEndPos - $nStartPos + 1; $szRegionBases = lc substr( $szSequenceBases, $nStartPos - 1, $nBasesThisRead ); if ( $nBasesThisRead != length( $szRegionBases ) ) { die "perl doesn't work"; } for( $n0Pos = 0; $n0Pos < $nBasesThisRead; ++$n0Pos ) { print filRefPhdBall substr( $szRegionBases, $n0Pos, 1 ) . " $nQualityOfReferenceBases\n"; } print filRefPhdBall "END_DNA\n"; print filRefPhdBall "WR{\n"; print filRefPhdBall "referenceSequence $szProgramName $szDateForCTandWRItems\n"; print filRefPhdBall "$szSequenceName " . &addCommas( $nStartPos ) . " " . &addCommas( $nEndPos ) . " $szFastaFile\n"; print filRefPhdBall "}\n"; print filRefPhdBall "END_SEQUENCE\n\n"; # print out status so user knows something is happening ++$nRegionsSoFar; if ( $nRegionsSoFar < 20 || ( $nRegionsSoFar % 1000 == 0 ) || ( $nBasesThisRead > 1000000 ) ) { print "$nRegionsSoFar regions so far out of a total of $nNumberOfRegions. Finished with phd now working on ace file\n"; } $szContigName = $szFakeReadName; # the 1 1 below is for 1 read in contig, 1 base segment in contig print filRefAceFile "CO $szContigName $nBasesThisRead 1 1 U\n"; # print the bases for this contig $nMaxNumberOfBasesOnLine = 50; for( $n0Base = 0; $n0Base < $nBasesThisRead; $n0Base += $nMaxNumberOfBasesOnLine ) { $nCharsToPrint = $nMaxNumberOfBasesOnLine; if ( $n0Base + $nCharsToPrint > $nBasesThisRead ) { $nCharsToPrint = $nBasesThisRead - $n0Base; } print filRefAceFile substr( $szRegionBases, $n0Base, $nCharsToPrint ), "\n"; } print filRefAceFile "\n\n"; # now write the qualities for the bases print filRefAceFile "BQ\n"; $nMaxNumberOfQualitiesOnLine = 25; $szMaxNumberOfQualities = ""; for( $n = 0; $n < $nMaxNumberOfQualitiesOnLine; ++$n ) { $szMaxNumberOfQualities .= " $nQualityOfReferenceBases"; } $szMaxNumberOfQualities .= "\n"; for( $n0Base = 0; $n0Base < $nBasesThisRead; $n0Base += $nMaxNumberOfQualitiesOnLine ) { if ( $n0Base + $nMaxNumberOfQualitiesOnLine <= $nBasesThisRead ) { print filRefAceFile "$szMaxNumberOfQualities"; } else { $nQualitiesToPrint = $nBasesThisRead - $n0Base; for( $n = 0; $n < $nQualitiesToPrint; ++$n ) { print filRefAceFile " $nQualityOfReferenceBases"; } print filRefAceFile "\n"; } } print filRefAceFile "\n\n"; if ( $nBasesThisRead > 1_000_000 ) { print "finished writing BQ consensus qualities. Now writing read bases...\n"; } print filRefAceFile "AF $szFakeReadName U 1\n"; print filRefAceFile "BS 1 $nBasesThisRead $szFakeReadName\n\n"; # This 0 0 below is for the # of whole read info items and the # of # read tags print filRefAceFile "RD $szFakeReadName $nBasesThisRead 0 0\n"; for( $n0Base = 0; $n0Base < $nBasesThisRead; $n0Base += $nMaxNumberOfBasesOnLine ) { $nCharsToPrint = $nMaxNumberOfBasesOnLine; if ( $n0Base + $nCharsToPrint > $nBasesThisRead ) { $nCharsToPrint = $nBasesThisRead - $n0Base; } print filRefAceFile substr( $szRegionBases, $n0Base, $nCharsToPrint ), "\n"; } print filRefAceFile "\n\n"; # This says nothing is clipped off print filRefAceFile "QA 1 $nBasesThisRead 1 $nBasesThisRead\n"; print filRefAceFile "DS VERSION: 1 TIME: $szDateTimeWithDayOfWeek CHEM: fasta\n"; # all CT tags must go at end of ace file, but start collecting # info for them now # add tag for user-defined positions $szCTLine = "$szContigName startNumberingConsensus selectRegions.perl 1 1 $szDateForCTandWRItems\n"; $szCTLine .= "$nStartPos\n"; push( @aCTLines, $szCTLine ); } # for( $nRegion = 0; $nRegion <= $#$pStartArray; ++$nRegion ) { # there might be other sequences in this same fasta file that we want! # fixed bug in which multiple sequences in same fasta file didn't work # June 2009 reported by Yaron. if ( length( $szHeaderLine ) != 0 ) { goto foundNextSequenceHeader; } close_fasta_file: close( filReference ); } # foreach $szFastaFile ( sort keys %aListOfFastaFiles ) { # add all CT lines print "\n"; foreach $szCTLine ( @aCTLines ) { print filRefAceFile "CT{\n"; print filRefAceFile $szCTLine; print filRefAceFile "}\n\n"; } print filRefAceFile "WA{\n"; print filRefAceFile "phdBall $szProgramName $szDateForCTandWRItems\n"; # note that we are putting a relative path here so the entire project # can be copied somewhere else and will still work print filRefAceFile "$szRefPhdBallRelativePath\n"; print filRefAceFile "}\n"; close( filRefAceFile ); close( filRefPhdBall ); # check if found all of the sequences $bProblem = 0; foreach $szSequenceName (keys %aIsSequenceFound ) { if ( $aIsSequenceFound{ $szSequenceName } == 0 ) { print "fatal error: $szSequenceName is not found in fasta file " . $aFastaFilesForSequences{ $szSequenceName } . "\n"; $bProblem = 1; } } if ( $bProblem ) { die "terminating\n"; } $szCommand = "$szConsed -ace $szRefAceFile -selectRegions $szRegionsFile -alignments $szAlignmentsFOF"; print "about to run: $szCommand\n"; !system( $szCommand ) || die "couldn't execute $szCommand"; ##################### subroutines #################### sub addCommas { ( $nOriginal ) = @_; ( $nWithCommas = $nOriginal ) =~ s/(?<=\d)(?=(\d\d\d)+$)/,/g; return $nWithCommas; } sub min { my ( $a, $b ) = @_; if ( $a < $b ) { return $a; } else { return $b; } } sub max { my ( $a, $b ) = @_; if ( $a < $b ) { return $b; } else { return $a; } } sub szFindNextPhdBallRelativePath() { $nHighestVersion = -666; opendir( dirPhdBall, "$szPhdBallDir" ) || die "couldn't open directory $szPhdBallDir"; while( defined( $szDir = readdir( dirPhdBall ) ) ) { next if ( $szDir eq "." ); next if ( $szDir eq ".." ); if ( $szDir =~ /^phd\.ball\.(\d+)$/ ) { $nVersion = $1; if ( $nVersion > $nHighestVersion ) { $nHighestVersion = $nVersion; } } } closedir( dirPhdBall ); if ( $nHighestVersion == -666 ) { return "$szPhdBallDir/phd.ball.1"; } else { return "$szPhdBallDir/phd.ball." . ( $nHighestVersion + 1 ); } } sub szFindNextAceFile() { my ( $szAceFileWithoutVersion ) = @_; my $szAceFile; my $nVersion = 0; my $bAceFileAlreadyExists = 1; while( $bAceFileAlreadyExists ) { ++$nVersion; $szAceFile = $szAceFileWithoutVersion . "." . $nVersion; if ( ! -e "$szAceFile" ) { $bAceFileAlreadyExists = 0; } } return $szAceFile; } sub szGetDateTimeWithDayOfWeek() { my $szTemp3 = `date`; my $szTemp4 = substr($szTemp3, 0, length( "Fri Dec 1 14:22:38 " )); my $szTemp5 = substr($szTemp3, length( "Fri Dec 1 14:22:38 PST "), length("1995")); # remove timezone (for some historical reason) my $szDate = $szTemp4 . $szTemp5; return $szDate; } sub szGetDateForCTandWRItems { my $szDate; ($nSecond, $nMinute, $nHour, $nDayInMonth, $nMonth, $nYear, $wday, $yday, $isdst ) = localtime; undef $isdst; undef $wday; undef $yday; if ( $nYear >= 100 ) { $nYear = $nYear % 100; } $szDate = sprintf( "%02d%02d%02d:%02d%02d%02d", $nYear, $nMonth + 1, $nDayInMonth, $nHour, $nMinute, $nSecond ); return( $szDate ); } sub bIntersects { my ( $nALeft, $nARight, $nBLeft, $nBRight ) = @_; if ( ( $nALeft <= $nBRight ) && ( $nBLeft <= $nARight ) ) { return 1; } else { return 0; } }