#!/usr/bin/perl -w # add454Reads.perl # part of Consed package # use File::Basename; $szVersion = "100407"; $szUsage = "add454Reads.perl (ace file) (sff fof) (masked fasta file) [fof of reads to add (optional)]"; # Note: the (ace file) and the (masked fasta file) must correspond # precisely--the contigs of the ace file must have the same names and # same base sequence as the masked fasta file sequences # You might need to edit these for where your executables are: defined( $szConsedHome = $ENV{'CONSED_HOME'} ) || ( $szConsedHome = "/usr/local/genome" ); $szCrossMatch = $szConsedHome . "/bin/cross_match"; $szConsed = $szConsedHome . "/bin/consed"; if ( $#ARGV >= 0 ) { if ( $ARGV[0] eq "-V" || $ARGV[0] eq "-v" ) { print "$szVersion\n"; exit( 1 ); } } if ( $#ARGV != 2 && $#ARGV != 3 ) { die "$szUsage"; } $szSffDir = "../sff_dir"; $szPhdBallDir = "../phdball_dir"; # check that we are in edit_dir of a project if ( ! -e "$szSffDir" ) { die "we need to be in edit_dir of a project that has a $szSffDir"; } $nStartTime = time(); $nStartTime2 = $nStartTime; $szAceFile = $ARGV[0]; $szSffFof = $ARGV[1]; $szMaskedFastaFile = $ARGV[2]; $szReadsFof = ""; if ( $#ARGV == 3 ) { $szReadsFof = $ARGV[3]; } &compareAceAndMaskedFastaFile( $szMaskedFastaFile, $szAceFile ); $szUniqueChars = &szGetDateTime; $szAlignmentFilesFOF = "alignmentFiles" . $szUniqueChars . ".fof"; open( filAlignmentFilesFOF, ">$szAlignmentFilesFOF" ) || die "couldn't open $szAlignmentFilesFOF for output"; # I want this file updated immediately with each write: $filOld = select( filAlignmentFilesFOF ); $| = 1; select( $filOld ); open( filSffFof, "$szSffFof" ) || die "couldn't open $szSffFof"; while( defined( $szSffFile = ) ) { chomp( $szSffFile ); if ( !-e "$szSffFile" ) { print "warning: sff file $szSffFile doesn't exist\n"; next; } $szPhdBall = &szFindNextPhdBallFullPath(); print "extracting reads from sff file $szSffFile...\n"; $szCommand = "$szConsed -sff2PhdBall $szSffFile -phdBall $szPhdBall"; if ( $szReadsFof ne "" ) { $szCommand .= " -fof $szReadsFof"; } print "executing: $szCommand\n"; !system( $szCommand ) || die "couldn't execute $szCommand"; print "done running $szCommand\n"; # case in which there are no reads in the phdball if ( -z "$szPhdBall" ) { next; } $szSffFileBasename = basename( $szSffFile ); $szFastaOfReads = "reads" . $szUniqueChars . "." . $szSffFileBasename . ".fa"; $szCommand = "$szConsed -phdBall2Fasta $szPhdBall -fasta $szFastaOfReads"; !system($szCommand) || die "couldn't execute $szCommand"; $szAlignmentFile = "alignments" . $szUniqueChars . "." . $szSffFileBasename . ".cross"; # write header into cross_match output file open( filAlignmentFile, ">$szAlignmentFile" ) || die "couldn't open $szAlignmentFile for output"; print filAlignmentFile "BEGIN_PHDBALLS\n"; print filAlignmentFile "$szPhdBall\n"; print filAlignmentFile "END_PHDBALLS\n"; close( filAlignmentFile ); $nStartTime = time(); # do not use -gap1_only for 454 reads $szCrossMatchCommand = "time " . $szCrossMatch . " " . $szFastaOfReads . " " . $szMaskedFastaFile . " -discrep_lists -tags -masklevel 0 -minscore 25 -repeat_screen 2 >>$szAlignmentFile 2>/dev/null"; # if you want to be able to handle larger indels, such as 2 or 3, use the # following: # " -discrep_lists -tags -masklevel 0 -bandwidth 4 -minscore 25 -repeat_screen 2 >>$szAlignmentFile 2>/dev/null"; print "now running $szCrossMatchCommand\n"; !system( $szCrossMatchCommand ) || die "couldn't execute $szCrossMatchCommand"; # put the alignments file in a fof for consed print filAlignmentFilesFOF "$szAlignmentFile\n"; # clean up $szLogFile = $szFastaOfReads . ".log"; unlink( "$szLogFile" ); unlink( "$szFastaOfReads" ); # associated qual file $szQualOfReads = $szFastaOfReads . ".qual"; unlink( "$szQualOfReads" ); # save for cleaning up push( @aAlignmentFiles, $szAlignmentFile ); } close( filAlignmentFilesFOF ); $nCrossMatchTime = time() - $nStartTime; printf "%.1f minutes to until done with alignments\n", $nCrossMatchTime / 60.0; $nStartTime = time(); print "now using alignments to add reads to ace file\n"; $szCommand = $szConsed . " -ace " . $szAceFile . " -addReads " . $szAlignmentFilesFOF . " -chem 454"; print "executing: $szCommand\n"; !system( $szCommand ) || die "couldn't execute $szCommand"; foreach $szAlignmentFile ( @aAlignmentFiles ) { unlink( $szAlignmentFile ); } unlink( "$szAlignmentFilesFOF" ); printf "%.1f minutes cross_match and fasta time\n", $nCrossMatchTime / 60.0; $nConsedTime = time() - $nStartTime; printf "%.1f minutes consed time\n", $nConsedTime / 60.0; $nTotalTime = time() - $nStartTime2; printf "%.1f minutes total time\n", $nTotalTime / 60.0; ######################## subroutines ######################### sub compareAceAndMaskedFastaFile { my ( $szMaskedFastaFile, $szAceFile ) = @_; open( filFasta, "$szMaskedFastaFile" ) || die "couldn't open $szMaskedFastaFile for reading"; open( filAce, "$szAceFile" ) || die "couldn't open $szAceFile for reading"; defined( $szFastaFileHeaderLine = ) || die "premature end of file $szMaskedFastaFile"; big_loop: while( $szFastaFileHeaderLine ne "" ) { if ($szFastaFileHeaderLine !~ /^>/ ) { die "unrecognized out of place line $szFastaFileHeaderLine in $szMaskedFastaFile"; } @aWords = split(' ', $szFastaFileHeaderLine ); ( $szFastaFileSequenceName = $aWords[0] ) =~ s/^>//; # now load the fasta sequence $szFastaSeq = ""; $szFastaFileHeaderLine = ""; while( ) { if ( ! /^>/ ) { chomp; $szFastaSeq .= $_; } else { $szFastaFileHeaderLine = $_; last; } } # now find the corresponding read in the ace file $bFoundSameSequenceInAceFile = 0; while ( !$bFoundSameSequenceInAceFile && defined( $_ = ) ) { if ( /^CO / ) { @aWords = split; $szAceFileSequenceName = $aWords[1]; if ( $szAceFileSequenceName eq $szFastaFileSequenceName ) { $bFoundSameSequenceInAceFile = 1; } else { ( $szAceFileSequenceName2 = $szAceFileSequenceName ) =~ s/-Contig//; if ( $szAceFileSequenceName2 eq $szFastaFileSequenceName ) { $bFoundSameSequenceInAceFile = 1; } } } } if ( !$bFoundSameSequenceInAceFile ) { die "sorry--the ace file doesn't contain $szFastaFileSequenceName which is in $szMaskedFastaFile"; } $szAceSeq = ""; while( ) { chomp; if ( /^\s*$/ ) { last; } s/\*//g; $szAceSeq .= $_; } if ( lc( $szFastaSeq ) ne lc( $szAceSeq ) ) { if ( length( $szFastaSeq ) != length( $szAceSeq ) ) { die "fasta length is " . length( $szFastaSeq ) . " while ace length is " . length( $szAceSeq ); } die "fasta file sequence does not match ace file sequence for $szFastaFileSequenceName"; } } # while( $szFastaFileHeaderLine ne "" ) { close( filFasta ); close( filAce ); } sub szGetDateTime { my $szDateTime; my ($nSecond, $nMinute, $nHour, $nDayInMonth, $nMonth, $nYear, $wday, $yday, $isdst ) = localtime; undef $isdst; undef $wday; undef $yday; if ( $nYear >= 100 ) { $nYear = $nYear % 100; } $szDateTime = sprintf( "%02d%02d%02d_%02d%02d%02d", $nYear, $nMonth + 1, $nDayInMonth, $nHour, $nMinute, $nSecond ); return( $szDateTime ); } sub min { my ( $a, $b ) = @_; if ( $a < $b ) { return $a; } else { return $b; } } sub max { my ( $a, $b ) = @_; if ( $a < $b ) { return $b; } else { return $a; } } sub szFindNextPhdBallFullPath() { $nHighestVersion = -666; opendir( dirPhdBall, "$szPhdBallDir" ) || die "couldn't open directory $szPhdBallDir"; while( defined( $szDir = readdir( dirPhdBall ) ) ) { next if ( $szDir eq "." ); next if ( $szDir eq ".." ); if ( $szDir =~ /^phd\.ball\.(\d+)$/ ) { $nVersion = $1; if ( $nVersion > $nHighestVersion ) { $nHighestVersion = $nVersion; } } } closedir( dirPhdBall ); if ( $nHighestVersion == -666 ) { return "$szPhdBallDir/phd.ball.1"; } else { my $szPhdBall = "phd.ball." . ( $nHighestVersion + 1 ); return "$szPhdBallDir/$szPhdBall"; } }