#!/usr/bin/perl -w # # lib2Phd.perl # # PURPOSE: allows phrap to utilize a sequences that may not come from # reads (such as a consensus from an overlapping cosmid or a genbank # sequence). This is NOT to be used for normal sequence assembly. # For that, you should use the phredPhrap script which will give much # more accurate results. This program is the equivalent of # fasta2Phd.perl except that it allows more than one sequence in # the fasta file # # INPUT: fasta "library" file containing the sequence(s) # # OUTPUT: for each sequence in the fasta file (above), a dummy phd # file with the bases, quality 20, and peak position 0 # Note: you might want to play around the with quality score, depending # on how certain you are of the base calls. You might want them to be # as low as 5. # # Original code from David Gordon at UWGC. # Modified by Gerry Bouffard (bouffard@nhgri.nih.gov) 20 Nov 1998 to # accept fasta library files as input. The output file is named # according to the first, non-space block of text on each fasta defline. $nQuality = 20; $szUsage = "Usage: lib2Phd.perl "; if ($#ARGV != 0) { die "$szUsage\n"; } $szLibFile = $ARGV[0]; open( filLib, "$szLibFile" ) || die "couldn't open file $szLibFile\n"; # strip off the pathname $szBase1 = `basename $szLibFile`; chop( $szBase1 ); #| # strip off extension #| $nFindDot = index( $szBase1, "." ); #| #| if ($nFindDot == -1 ) { #| $szFastaSequenceName = $szBase1; #| } #| else { #| $szFastaSequenceName = substr( $szBase1, 0, $nFindDot ); #| } $szFastaSequenceName = ""; while () { # Check for fasta defline, otherwise expect sequence data if (/^\s*>\s*(\S*)/) { # fasta type defline $tempSequenceName = $1; #If this is not the first fasta file, close out the previous one. unless ($szFastaSequenceName eq "") { print( filPHD "END_DNA\n" ); print( filPHD "\n" ); print( filPHD "END_SEQUENCE\n" ); close( filPHD ); } $szFastaSequenceName = $tempSequenceName; $szPHDFileName = $szFastaSequenceName . ".phd.1"; #test for pre-existing phd file if (-e $szPHDFileName) { print STDERR "Warning: overwriting previous $szPHDFileName!\n"; } else { print STDERR "... creating file $szPHDFileName\n"; } open( filPHD, ">$szPHDFileName" ) || die "couldn't open $szPHDFileName\n"; print( filPHD "BEGIN_SEQUENCE $szFastaSequenceName\n"); print( filPHD "\n" ); print( filPHD "BEGIN_COMMENT\n" ); print( filPHD "\n" ); print( filPHD "CHROMAT_FILE: none\n"); print( filPHD "ABI_THUMBPRINT: none\n"); print( filPHD "PHRED_VERSION: not called by phred\n"); print( filPHD "CALL_METHOD: fasta2Phd.perl\n" ); print( filPHD "QUALITY_LEVELS: 2\n" ); $szTemp3 = `date`; $szTemp4 = substr($szTemp3, 0, length( "Fri Dec 1 14:22:38 " )); $szTemp5 = substr($szTemp3, length( "Fri Dec 1 14:22:38 PST "), length("1995")); $szDate = $szTemp4 . $szTemp5; print( filPHD "TIME: $szDate\n" ); print( filPHD "END_COMMENT\n" ); print( filPHD "\n" ); print( filPHD "BEGIN_DNA\n" ); } # end of fasta type defline else { # line of sequence data (we hope) if ($szFastaSequenceName eq "") { print STDERR "Ignoring text preceeding first fasta defline.\n"; next; # no file to write to } else { # convert the sequence data to single line quality scores chomp; # removing any trailing blanks or returns $szBigLine = $_; $szBigLine =~ tr/A-Z/a-z/; # switch to lower case, if necessary if ($szBigLine =~ /[^acgtrymkswbdhvnx]/) { print STDOUT "Warning: Unexpected characters found in ". "$szFastaSequenceName sequence.\n"; } for( $n = 0; $n < length( $szBigLine ); ++$n ) { $cBase = substr( $szBigLine, $n, 1 ); # base, quality, peak position print( filPHD "$cBase $nQuality 0\n" ); } } } # end of line of sequence data } # end of while loop #---------------------------------------------------------- #Close out the last file, if there is one. unless ($szFastaSequenceName eq "") { print( filPHD "END_DNA\n" ); print( filPHD "\n" ); print( filPHD "END_SEQUENCE\n" ); close( filPHD ); } close( filLib ); die "\nlib2Phd program completed successfully.\n";