#!/usr/bin/perl -w $szUsage = "addSolexaReads.perl (ace file) (solexa files and prefixes) (original fasta file)"; # Inputs: # ace file: This could be the ace file created with fasta2Ace.perl. If it is an ace file that already contains other reads, then it must precisely match the fasta file which is the 3rd argument. To make it precisely match, you could use consed and, from the Main Consed Window, point to "File", hold down the left mouse button and release on "Write All Contigs to Fasta File". # "solexa files and prefixes" is a file containing paths relative to # solexa_dir (usually they # will just be fastq files in solexa_dir) # dir and "read prefix" is the prefix to add to each read within the # solexa file # Output: an ace file ready for consed defined( $szConsedHome = $ENV{'CONSED_HOME'} ) || ( $szConsedHome = "/usr/local/genome" ); # You might need to edit these for where your executables are: $szCrossMatch = $szConsedHome . "/bin/cross_match"; $szConsed = $szConsedHome . "/bin/consed"; $szVersion = "080723"; if ( $#ARGV >= 0 ) { if ( $ARGV[0] eq "-V" || $ARGV[0] eq "-v" ) { print "$szVersion\n"; exit( 1 ); } } # $szSolexaDir = "../solexa_dir/"; $szPhdBallDir = "../phdball_dir"; # check that we are in edit_dir of a project if ( ! -e "../solexa_dir" ) { die "we need to be in edit_dir of a project that has a ../solexa_dir"; } if ( $#ARGV != 2 ) { die "$szUsage"; } $nStartTime = time(); $nStartTime2 = $nStartTime; $szAceFile = $ARGV[0]; $szSolexaFilesFOF = $ARGV[1]; $szMaskedFastaFile = $ARGV[2]; &compareAceAndMaskedFastaFile( $szMaskedFastaFile, $szAceFile ); $szUniqueChars = &szGetDateTime; print "converting reads from solexa files to phd balls...\n"; $nStartTime = time(); # we need to break these reads into packages of around a million reads # each for efficient processing of cross_match $szPhdBallFOF = "phdball" . $szUniqueChars . ".fof"; $szCommand = "$szConsed -solexa2PhdBall $szSolexaFilesFOF -phdBallFOF $szPhdBallFOF"; print "executing $szCommand\n"; !system( $szCommand ) || die "couldn't execute $szCommand"; $nPhdBallTime = time() - $nStartTime; printf "took %.1f minutes to make phdballs\n", $nPhdBallTime / 60.0; $nStartTime = time(); @aPhdBalls = (); open( filPhdBallFOF, "$szPhdBallFOF" ) || die "couldn't open $szPhdBallFOF"; while( ) { chomp; push( @aPhdBalls, $_ ); } close( filPhdBallFOF ); print "created " . ( $#aPhdBalls + 1 ) . " phd balls--see $szPhdBallFOF\n"; # put the alignments file in a fof for consed $szAlignmentFileFOF = "alignmentFiles" . $szUniqueChars . ".fof"; open( filAlignmentFileFOF, ">$szAlignmentFileFOF" ) || die "couldn't open $szAlignmentFileFOF for output"; for( $nPhdBall = 0; $nPhdBall <= $#aPhdBalls; ++$nPhdBall ) { $szPhdBall = $aPhdBalls[ $nPhdBall ]; $szFastaOfReads = "reads" . $szUniqueChars . ".fa." . $nPhdBall; $szCommand = "$szConsed -phdBall2Fasta $szPhdBall -fasta $szFastaOfReads"; print "now executing $szCommand\n"; !system( $szCommand ) || die "couldn't execute $szCommand"; print "done creating fasta file $szFastaOfReads\n"; # now run cross_match $szAlignmentFile = "alignmentFile." . $szUniqueChars . ".cross." . $nPhdBall; # write header into cross_match output file open( filAlignmentFile, ">$szAlignmentFile" ) || die "couldn't open $szAlignmentFile for output"; print filAlignmentFile "BEGIN_PHDBALLS\n"; print filAlignmentFile "$szPhdBall\n"; print filAlignmentFile "END_PHDBALLS\n"; close( filAlignmentFile ); $szCrossMatchCommand = "time " . $szCrossMatch . " " . $szFastaOfReads . " " . $szMaskedFastaFile . " -discrep_lists -tags -masklevel 0 -minscore 25 -gap1_only -repeat_screen 2 >>$szAlignmentFile 2>/dev/null"; print "now running $szCrossMatchCommand\n"; !system( $szCrossMatchCommand ) || die "couldn't execute $szCrossMatchCommand"; # put the alignment file in a fof for consed print filAlignmentFileFOF "$szAlignmentFile\n"; # clean up $szLogFile = $szFastaOfReads . ".log"; unlink( "$szLogFile" ); unlink( "$szFastaOfReads" ); $szQualFile = $szFastaOfReads . ".qual"; unlink( "$szQualFile" ); } # for( $nPhdBall = 0... close( filAlignmentFileFOF ); $nCrossMatchTime = time() - $nStartTime; printf "%.1f minutes until done with alignments\n", $nCrossMatchTime / 60.0; $nStartTime = time(); print "now using alignments to add reads to ace file\n"; $szCommand = $szConsed . " -ace " . $szAceFile . " -addReads " . $szAlignmentFileFOF . " -chem solexa"; print "executing: $szCommand\n"; !system( $szCommand ) || die "couldn't execute $szCommand"; $nConsedTime = time() - $nStartTime; printf "%.1f minutes cross_match and -phdBall2Fasta time\n", $nCrossMatchTime / 60.0; printf "%.1f minutes consed time\n", $nConsedTime / 60.0; $nTotalTime = time() - $nStartTime2; printf "%.1f minutes total time\n", $nTotalTime / 60.0; # clean up unlink( "$szPhdBallFOF" ); for( $nPhdBall = 0; $nPhdBall <= $#aPhdBalls; ++$nPhdBall ) { $szAlignmentFile = "alignmentFile." . $szUniqueChars . ".cross." . $nPhdBall; unlink( "$szAlignmentFile" ); } unlink( "$szAlignmentFileFOF" ); ######################## subroutines ######################### sub szGetDateTime { my $szDateTime; my ($nSecond, $nMinute, $nHour, $nDayInMonth, $nMonth, $nYear, $wday, $yday, $isdst ) = localtime; undef $isdst; undef $wday; undef $yday; if ( $nYear >= 100 ) { $nYear = $nYear % 100; } $szDateTime = sprintf( "%02d%02d%02d_%02d%02d%02d", $nYear, $nMonth + 1, $nDayInMonth, $nHour, $nMinute, $nSecond ); return( $szDateTime ); } sub compareAceAndMaskedFastaFile { my ( $szMaskedFastaFile, $szAceFile ) = @_; open( filFasta, "$szMaskedFastaFile" ) || die "couldn't open $szMaskedFastaFile for reading"; open( filAce, "$szAceFile" ) || die "couldn't open $szAceFile for reading"; defined( $szFastaFileHeaderLine = ) || die "premature end of file $szMaskedFastaFile"; big_loop: while( $szFastaFileHeaderLine ne "" ) { if ($szFastaFileHeaderLine !~ /^>/ ) { die "unrecognized out of place line $szFastaFileHeaderLine in $szMaskedFastaFile"; } @aWords = split(' ', $szFastaFileHeaderLine ); ( $szFastaFileSequenceName = $aWords[0] ) =~ s/^>//; # now load the fasta sequence $szFastaSeq = ""; $szFastaFileHeaderLine = ""; while( ) { if ( ! /^>/ ) { chomp; $szFastaSeq .= $_; } else { $szFastaFileHeaderLine = $_; last; } } # now find the corresponding read in the ace file $bFoundSameSequenceInAceFile = 0; while ( !$bFoundSameSequenceInAceFile && defined( $_ = ) ) { if ( /^CO / ) { @aWords = split; $szAceFileSequenceName = $aWords[1]; if ( $szAceFileSequenceName eq $szFastaFileSequenceName ) { $bFoundSameSequenceInAceFile = 1; } else { ( $szAceFileSequenceName2 = $szAceFileSequenceName ) =~ s/-Contig//; if ( $szAceFileSequenceName2 eq $szFastaFileSequenceName ) { $bFoundSameSequenceInAceFile = 1; } } } } if ( !$bFoundSameSequenceInAceFile ) { die "sorry--the ace file doesn't contain $szFastaFileSequenceName which is in $szMaskedFastaFile"; } $szAceSeq = ""; while( ) { chomp; if ( /^\s*$/ ) { last; } s/\*//g; $szAceSeq .= $_; } if ( lc( $szFastaSeq ) ne lc( $szAceSeq ) ) { if ( length( $szFastaSeq ) != length( $szAceSeq ) ) { die "fasta length is " . length( $szFastaSeq ) . " while ace length is " . length( $szAceSeq ); } die "fasta file sequence does not match ace file sequence for $szFastaFileSequenceName"; } } close( filFasta ); close( filAce ); } sub min { my ( $a, $b ) = @_; if ( $a < $b ) { return $a; } else { return $b; } } sub max { my ( $a, $b ) = @_; if ( $a < $b ) { return $b; } else { return $a; } } sub szFindNextPhdBallFullPath() { $nHighestVersion = -666; opendir( dirPhdBall, "$szPhdBallDir" ) || die "couldn't open directory $szPhdBallDir"; while( defined( $szDir = readdir( dirPhdBall ) ) ) { next if ( $szDir eq "." ); next if ( $szDir eq ".." ); if ( $szDir =~ /^phd\.ball\.(\d+)$/ ) { $nVersion = $1; if ( $nVersion > $nHighestVersion ) { $nHighestVersion = $nVersion; } } } closedir( dirPhdBall ); if ( $nHighestVersion == -666 ) { return "$szPhdBallDir/phd.ball.1"; } else { my $szPhdBall = "phd.ball." . ( $nHighestVersion + 1 ); return "$szPhdBallDir/$szPhdBall"; } }