#!/usr/bin/perl -w # Input: a file where each line looks like this: # (solexa seq file) (read prefix) # "solexa seq file" is a path relative to solexa_dir (usually it # will just be in solexa_dir or will be a symbolic link within solexa # dir and # "read prefix" is the prefix to add to each read within the solexa file # Output: a list of alignment files defined( $szConsedHome = $ENV{'CONSED_HOME'} ) || ( $szConsedHome = "/usr/local/genome" ); # You might need to edit these for where your executables are: $szCrossMatch = $szConsedHome . "/bin/cross_match"; $szConsed = $szConsedHome . "/bin/consed"; $szVersion = "081126"; if ( $#ARGV >= 0 ) { if ( $ARGV[0] eq "-V" || $ARGV[0] eq "-v" ) { print "$szVersion\n"; exit( 1 ); } } # $szSolexaDir = "../solexa_dir/"; $szPhdBallDir = "../phdball_dir"; $szUsage = "alignSolexaReadsToRefs.perl (file of solexa fastq filenames) (file of names of reference fasta files) (file to put names of alignment files)"; # check that we are in edit_dir of a project if ( ! -e "../solexa_dir" ) { die "we need to be in edit_dir of a project that has a ../solexa_dir"; } if ( $#ARGV != 2 ) { die "$szUsage"; } $nStartTime = time(); $nStartTime2 = $nStartTime; $szSolexaFilesFOF = $ARGV[0]; $szReferenceSequenceFOF = $ARGV[1]; $szAlignmentFileFOF = $ARGV[2]; $szUniqueChars = &szGetDateTime; # get a string of the list of fasta files (subjects to cross_match) open( filReferenceSequenceFOF, "$szReferenceSequenceFOF" ) || die "couldn't open $szReferenceSequenceFOF"; $szReferenceSequenceFiles = ""; while( ) { chomp; # check that the file can be opened open( filMaskedFastaFile, "$_" ) || die "couldn't open $_"; close( filMaskedFastaFile ); if ( $szReferenceSequenceFiles eq "" ) { $szReferenceSequenceFiles = $_; } else { $szReferenceSequenceFiles .= " $_"; } } close( filReferenceSequenceFOF ); # open fof of alignment files open( filAlignmentFileFOF, ">$szAlignmentFileFOF" ) || die "couldn't open $szAlignmentFileFOF for write"; print "converting reads from solexa files to phd ball...\n"; $nStartTime = time(); $szPhdBallFOF = "phdball." . $szUniqueChars . ".fof"; $szCommand = "$szConsed -solexa2PhdBall $szSolexaFilesFOF -phdBallFOF $szPhdBallFOF"; print "executing $szCommand\n"; !system( $szCommand ) || die "couldn't execute $szCommand"; $nPhdBallTime = time() - $nStartTime; printf "took %.1f minutes to make phdballs\n", $nPhdBallTime / 60.0; $nStartTime = time(); @aPhdBalls = (); open( filPhdBallFOF, "$szPhdBallFOF" ) || die "couldn't open $szPhdBallFOF"; while( ) { chomp; push( @aPhdBalls, $_ ); } close filPhdBallFOF; print "created " . ( $#aPhdBalls + 1 ) . " phd balls--see $szPhdBallFOF\n"; for( $nPhdBall = 0; $nPhdBall <= $#aPhdBalls; ++$nPhdBall ) { $szPhdBall = $aPhdBalls[ $nPhdBall ]; $szFastaOfReads = "reads" . $szUniqueChars . ".fa." . $nPhdBall; $szCommand = "$szConsed -phdBall2Fasta $szPhdBall -fasta $szFastaOfReads"; print "now executing $szCommand\n"; !system( $szCommand ) || die "couldn't execute $szCommand"; print "done creating fasta file $szFastaOfReads\n"; # now run cross_match $szAlignmentFile = "alignmentFile." . $szUniqueChars . ".cross." . $nPhdBall; # write header into cross_match output file open( filAlignmentFile, ">$szAlignmentFile" ) || die "couldn't open $szAlignmentFile for output"; print filAlignmentFile "BEGIN_PHDBALLS\n"; print filAlignmentFile "$szPhdBall\n"; print filAlignmentFile "END_PHDBALLS\n"; close( filAlignmentFile ); $szCrossMatchCommand = "time " . $szCrossMatch . " " . $szFastaOfReads . " " . $szReferenceSequenceFiles . " -discrep_lists -tags -masklevel 0 -minscore 25 -gap1_only -repeat_screen 2 >>$szAlignmentFile 2>/dev/null"; print "now running $szCrossMatchCommand\n"; !system( $szCrossMatchCommand ) || die "couldn't execute $szCrossMatchCommand"; # put the alignments file in a fof for consed print filAlignmentFileFOF "$szAlignmentFile\n"; # clean up $szLogFile = $szFastaOfReads . ".log"; unlink( "$szLogFile" ); unlink( "$szFastaOfReads" ); $szQualFile = $szFastaOfReads . ".qual"; unlink( "$szQualFile" ); } # for( $nPhdBall = 0 ... # clean up unlink( "$szPhdBallFOF" ); $nCrossMatchTime = time() - $nStartTime; printf "%.1f minutes until done with alignments\n", $nCrossMatchTime / 60.0; $nTotalTime = time() - $nStartTime2; printf "%.1f minutes total time\n", $nTotalTime / 60.0; print "see $szAlignmentFileFOF\n"; ######################## subroutines ######################### sub szGetDateTime { my $szDateTime; my ($nSecond, $nMinute, $nHour, $nDayInMonth, $nMonth, $nYear, $wday, $yday, $isdst ) = localtime; undef $isdst; undef $wday; undef $yday; if ( $nYear >= 100 ) { $nYear = $nYear % 100; } $szDateTime = sprintf( "%02d%02d%02d_%02d%02d%02d", $nYear, $nMonth + 1, $nDayInMonth, $nHour, $nMinute, $nSecond ); return( $szDateTime ); } sub min { my ( $a, $b ) = @_; if ( $a < $b ) { return $a; } else { return $b; } } sub max { my ( $a, $b ) = @_; if ( $a < $b ) { return $b; } else { return $a; } } sub szFindNextPhdBall() { $nHighestVersion = -666; opendir( dirPhdBall, "$szPhdBallDir" ) || die "couldn't open directory $szPhdBallDir"; while( defined( $szDir = readdir( dirPhdBall ) ) ) { next if ( $szDir eq "." ); next if ( $szDir eq ".." ); if ( $szDir =~ /^phd\.ball\.(\d+)$/ ) { $nVersion = $1; if ( $nVersion > $nHighestVersion ) { $nHighestVersion = $nVersion; } } } closedir( dirPhdBall ); if ( $nHighestVersion == -666 ) { return "phd.ball.1"; } else { my $szPhdBall = "phd.ball." . ( $nHighestVersion + 1 ); return "$szPhdBall"; } }