#!/usr/bin/perl -w # # manyFasta2Ace.perl # # PURPOSE: create an ace file and phd.ball file (everything consed needs) from # a fasta file. This then is generally the starting point for adding new # reads using the original fasta file of reference sequences. # # INPUT: fasta file # # NOTE: you must run this in edit_dir use Getopt::Long; $szRevision = "081205"; $szUsage = "Usage: fasta2Ace.perl (-phdball (phdball name, optional) (-append, optional) (-quality (quality), optional)"; $nQuality = 20; $| = 1; # flush stdout $bVersion = 0; $szPhdBall = "phd.ball.1"; $nOK = GetOptions( "phdball=s" => \$szPhdBall, "v" => \$bVersion, "acefile=s" => \$szAceFileName, "quality=s" => \$nQuality ); if ( $nOK != 1 ) { die "$szUsage"; } if ( $bVersion ) { print "$szRevision\n"; exit( 0 ); } if ( $#ARGV != 0 ) { die "$szUsage"; } $szFastaFile = $ARGV[0]; # where is phd ball? if ( -e "./.consedrc" ) { $szConsedRC = ".consedrc"; } elsif ( -e "~/.consedrc" ) { $szConsedRC = "~/.consedrc"; } elsif ( exists( $ENV{ "CONSED_PARAMETERS" } ) ) { $szConsedRC = $ENV{ "CONSED_PARAMETERS" }; } $szPhdBallDir = "../phdball_dir"; if ( defined( $szConsedRC ) ) { open( filConsedRC, "$szConsedRC" ) || die "could not open $szConsedRC"; while( ) { if (/^consed.phdBallDirectory:/ ) { $szPhdBallDir = (split)[1]; last; } } close( filConsedRC ); } $szPhdBallFullPath = $szPhdBallDir . "/" . $szPhdBall; if ( -e "$szPhdBallFullPath" ) { die "$szPhdBallFullPath already exists--if you want to overwrite it, delete it."; } open( filFasta, "$szFastaFile" ) || die "couldn't open file $szFastaFile\n"; # strip off the pathname to use for the assembly name $szBase1 = `basename $szFastaFile`; chop( $szBase1 ); # strip off extension $nFindDot = rindex( $szBase1, "." ); if ($nFindDot == -1 ) { $szAssemblyName = $szBase1; } else { $szAssemblyName = substr( $szBase1, 0, $nFindDot ); } if ( !defined( $szAceFileName ) ) { $szAceFileName = $szAssemblyName . ".ace"; } if ( -e "$szAceFileName" ) { die "$szAceFileName already exists. If you want to overwrite it, delete it."; } open( filAce, ">$szAceFileName" ) || die "couldn't open $szAceFileName"; # this was above but caused a problem when the user had the phdball and # acefile already existing. He would delete one, and then the other would # be created. He would delete the other, and the first would be created. open( filPhdBall, ">$szPhdBallFullPath" ) || die "couldn't open $szPhdBallFullPath for write"; # read fasta file twice. First time to determine how many sequences there are $nStartTime = time(); $nLastTime = $nStartTime; $nLongestRead = 0; $nThisReadLength = 0; while( ) { if ( /^>/) { ++$nSequences; if ( $nThisReadLength > $nLongestRead ) { $nLongestRead = $nThisReadLength; } $nThisReadLength = 0; # just to let user know something is happening $nTempTime = time(); if ( $nTempTime - $nLastTime > 10 ) { print "$nSequences sequences so far\n"; $nLastTime = $nTempTime; } } else { $nThisReadLength += (length() - 1 ); } } if ( $nThisReadLength > $nLongestRead ) { $nLongestRead = $nThisReadLength; } print "$nSequences sequences. Longest is " . &addCommas( $nLongestRead ) . "\n"; # preallocate memory long enough for longest read $szBasesThisRead = ' ' x $nLongestRead; $szBasesThisRead = ""; print filAce "AS $nSequences $nSequences\n"; print filAce "\n"; seek filFasta, 0, 0; $szDate = localtime(); #$szTemp = `date`; #$szTemp4 = substr($szTemp3, 0, length( "Fri Dec 1 14:22:38 " )); #$szTemp5 = substr($szTemp3, length( "Fri Dec 1 14:22:38 PST "), length("1995")); # remove timezone (for some historical reason) #$szDate = $szTemp4 . $szTemp5; $szDateForWRItems = &szGetDateForWRItems; for( $nSequence = 0; $nSequence < $nSequences; ++$nSequence ) { # look for header if it is the first time through if ( $nSequence == 0 ) { $bFoundHeader = 0; while( ) { if ( /^>/ ) { $bFoundHeader = 1; $szHeader = $_; last; } } if ( $bFoundHeader == 0 ) { die "couldn't find header for sequence $nSequence"; } } @aWords = split(' ', $szHeader ); # there may be other information on the header line ( $szReadName = $aWords[0] ) =~ s/^>//; print( filPhdBall "BEGIN_SEQUENCE $szReadName 1\n"); print( filPhdBall "\n" ); print( filPhdBall "BEGIN_COMMENT\n" ); print( filPhdBall "\n" ); print( filPhdBall "CALL_METHOD: fasta2Ace.perl\n" ); print( filPhdBall "TIME: $szDate\n" ); print( filPhdBall "CHEM: fasta\n"); print( filPhdBall "END_COMMENT\n" ); print( filPhdBall "\n" ); print( filPhdBall "BEGIN_DNA\n" ); # now get bases and print them $szBasesThisRead = ""; while( defined( $szLine = ) ) { if ($szLine =~ /^>/) { $szHeader = $szLine; last; } chomp( $szLine ); ( $szBasesLower = $szLine ) =~ tr/A-Z/a-z/; $szBasesLower =~ tr/\.\-/nn/; $szBasesThisRead .= $szBasesLower; for( $nBase = 0; $nBase < length( $szBasesLower ); ++$nBase ) { print filPhdBall substr( $szBasesLower, $nBase, 1 ) . " $nQuality 0\n"; } } print( filPhdBall "END_DNA\n" ); print( filPhdBall "\n" ); print( filPhdBall "WR{\n" ); print( filPhdBall "referenceSequence fasta2Ace.perl $szDateForWRItems\n" ); print( filPhdBall "}\n\n" ); print( filPhdBall "END_SEQUENCE\n" ); print( filPhdBall "\n\n\n" ); $nBasesThisRead = length( $szBasesThisRead ); if ( $nSequence < 20 || ( $nSequence % 1000 == 0 ) || $nBasesThisRead > 1000000 ) { print "read $szReadName " . ( $nSequence + 1 ) . " (max: $nSequences) finished with phdball now working on ace file.\n"; } # now work on ace file # $szContigName = $szReadName . "-Contig"; The reason this had to # change is that cross_match uses the fasta file (without -Contig) is # run against the new reads and the output is used to determine which # contig the reads align against. Consed reads these alignments. # Thus the contigs must not have the -Contig $szContigName = $szReadName; # the 1 1 below is for 1 read in contig, 1 base segment in contig print filAce "CO $szContigName $nBasesThisRead 1 1 U\n"; # print out the bases for this contig $nMaxNumberOfBasesOnLine = 50; for( $nBase = 0; $nBase < $nBasesThisRead; $nBase += $nMaxNumberOfBasesOnLine ) { # length - 1 is index of last base # $nBase + $nMaxNumberOfBasesOnLine - 1 is index of last base if we # were to write $nMaxNumberOfBasesOnLine # so drop the -1 from each side to check if there is a problem if ( length( $szBasesThisRead ) < ( $nBase + $nMaxNumberOfBasesOnLine ) ) { $nNumberOfBasesOnLine = length( $szBasesThisRead ) - $nBase; } else { $nNumberOfBasesOnLine = $nMaxNumberOfBasesOnLine; } $szBasesOnLine = substr( $szBasesThisRead, $nBase, $nNumberOfBasesOnLine ); print filAce $szBasesOnLine,"\n"; } print filAce "\n\n"; # now write the qualities for the bases print filAce "BQ\n"; $nMaxNumberOfQualitiesOnLine = 25; $nQualitiesOnLine = 0; # note: saving the line in a buffer and writing line-by-line # does not speed up. Below is actually a little faster: for( $nBase = 0; $nBase < $nBasesThisRead; ++$nBase ) { if ( $nQualitiesOnLine >= $nMaxNumberOfQualitiesOnLine ) { print filAce "\n"; $nQualitiesOnLine = 0; } print filAce " $nQuality"; ++$nQualitiesOnLine; } print filAce "\n\n"; if ( $nBasesThisRead > 1000000 ) { print "finished writing BQ consensus qualities. Now writing read bases...\n"; } print filAce "AF $szReadName U 1\n"; print filAce "BS 1 $nBasesThisRead $szReadName\n\n"; # This 0 0 below is for the # of whole read info items and the # of read tags print filAce "RD $szReadName $nBasesThisRead 0 0\n"; for( $nBase = 0; $nBase < $nBasesThisRead; $nBase += $nMaxNumberOfBasesOnLine ) { if ( length( $szBasesThisRead ) < ( $nBase + $nMaxNumberOfBasesOnLine ) ) { $nNumberOfBasesOnLine = length( $szBasesThisRead ) - $nBase; } else { $nNumberOfBasesOnLine = $nMaxNumberOfBasesOnLine; } $szBasesOnLine = substr( $szBasesThisRead, $nBase, $nNumberOfBasesOnLine ); print filAce $szBasesOnLine,"\n"; } print filAce "\n\n"; # This says basically that nothing is clipped off print filAce "QA 1 $nBasesThisRead 1 $nBasesThisRead\n"; print filAce "DS VERSION: 1 TIME: $szDate CHEM: fasta\n"; } # for( $nSequence = 0; $nSequence <= $nSequences; ++$nSequence ) { print filAce "WA{\n"; print filAce "phdBall fasta2Ace.perl $szDateForWRItems\n"; # note that we are putting a relative path here so that the entire # project can be copied somewhere else and will still work print filAce "$szPhdBallFullPath\n"; print filAce "}\n"; close filAce; close filPhdBall; close filFasta; print "done creating $szAceFileName and $szPhdBallFullPath\n"; ##################### subroutines #################### sub addCommas { ( $nOriginal ) = @_; ( $nWithCommas = $nOriginal ) =~ s/(?<=\d)(?=(\d\d\d)+$)/,/g; return $nWithCommas; } sub szGetDateForWRItems { my $szDate; ($nSecond, $nMinute, $nHour, $nDayInMonth, $nMonth, $nYear, $wday, $yday, $isdst ) = localtime; undef $isdst; undef $wday; undef $yday; if ( $nYear >= 100 ) { $nYear = $nYear % 100; } $szDate = sprintf( "%02d%02d%02d:%02d%02d%02d", $nYear, $nMonth + 1, $nDayInMonth, $nHour, $nMinute, $nSecond ); return( $szDate ); }