#!/usr/bin/perl -w # PURPOSE: runs consed -autoPCRAmplify in order to pick pcr primers # for a large group of regions # INPUT: a fasta file of sequences. The head of each sequence indicates # the region to be amplified: # >(name of region) l1 r1 l2 r2 (biggest or smallest) # where the left primer is picked in the region (l1,r1) and the # right primer is picked in the region (l2,r2) # "biggest" means the primers should be chosen as far apart as possible. # "smallest" means the primers should be chosen as close together as # possible. # for example, this could be the input file: # >AP000527.C22.6.mRNA.primerRegion 1 70 81 150 smallest # ACAGGGCCCCTCGCGGGCCCTGACGCAGGATGGAGTTGAGGTGGGGGCAGCGCTGGACCCCAGGGCCCCT # NNNNNNNNNN # TGCCGCAGTCTTGGATGATGGGTTCCTAGAAGCTCTCAACATCTCTTCTTAATTGGAGAAAGTGTTAAGC # >AC004019.C22.4.mRNA.primerRegion 1 70 81 150 smallest # AGCTGTGAGCTGTGCAATCATGTAACTAACTTTGTTTAAGTATTGTTTAGTCTTTCTGGTCTCC # AGATGANNNNNNNNNNTCAGACATTCCACAGCTACCTAGAGGACATCATCAACTACCGCTGGGA # GCTCGAAGAAGGGAAGCCCAAC # The 1 70 of the first sequence means that the 1st primer should be chosen # from ACAGGGCCCCTCGCGGGCCCTGACGCAGGATGGAGTTGAGGTGGGGGCAGCGCTGGACCCCAGGGCCCCT # and the "smallest" means that this primer should be chosen as far # right within these bases as possible so the product will be as small as # possible. # The 81 150 of the first sequence means that the 2nd primer should be chosen # from the bottom strand of # TGCCGCAGTCTTGGATGATGGGTTCCTAGAAGCTCTCAACATCTCTTCTTAATTGGAGAAAGTGTTAAGC # and the "smallest" means that the primer should be chosen as far as # possible to the left within this sequence so that the product is as # small as possible. # OUTPUT: # primers_unsorted.txt gives a list of primer pairs. The # primers are both given in the strand that you should order them. # That is, the left primers are given in top strand orientation, and # the right primers are given in bottom strand orientation. defined( $szConsedHome = $ENV{'CONSED_HOME'} ) || ( $szConsedHome = "/usr/local/genome" ); # These programs will be called from this one. You might need to edit # the paths. $szPhd2Ace = "phd2Ace.perl"; $szFasta2Phd = "fasta2Phd.perl"; $szConsed = "consed"; $szCrossMatch = $szConsedHome . "/bin/cross_match"; $szUsage = "Usage: amplifyTranscripts.perl "; if ($#ARGV != 0 && $#ARGV != 1 ) { die "$szUsage"; } $szFileOfTranscripts = $ARGV[0]; $bSplicedLeader = 0; if ( $#ARGV == 1 ) { if ( $ARGV[1] eq "SL" ) { $bSplicedLeader = 1; } else { die "$szUsage"; } } $szFileOfPrimers = "primers_unsorted.txt"; if ( -e $szFileOfPrimers ) { die "$szFileOfPrimers must not already exist"; } $szFileOfFailures = "failures.txt"; $SIG{__WARN__} = dieWhenGetWarning; sub dieWhenGetWarning { my $szErrorMessage = shift; die "$szErrorMessage"; } # keep track of this filename for subsequent scripts open( filNameOfFileOfTranscripts, ">transcripts_filename.txt" ) || die "couldn't open transcripts_filename.txt for output"; print filNameOfFileOfTranscripts "$szFileOfTranscripts\n"; close( filNameOfFileOfTranscripts ); $szRootForDirectories = `/bin/pwd`; chomp( $szRootForDirectories ); @aTranscriptNames = (); open( filTranscripts, "$szFileOfTranscripts" ) || die "could not open $szFileOfTranscripts"; chdir( $szRootForDirectories ) || die "couldn't chdir to $szRootForDirectories"; # write a .consedrc file and setenv CONSED_PARAMETERS to it so that $szConsedRCFile = $szRootForDirectories . "/.consedrc"; open( filConsedRC, ">$szConsedRCFile" ) || die "couldn't open .consedrc in $szRootForDirectories"; print filConsedRC "consed.autoPCRAmplifyTooManySeriousFalseMatches: 1500\n"; print filConsedRC "consed.primersMinQuality: 20\n"; #print filConsedRC "consed.primersMaxMeltingTempForPCR: 70\n"; #print filConsedRC "consed.primersMinMeltingTempForPCR: 68\n"; print filConsedRC "consed.primersMaxMeltingTempForPCR: 66\n"; print filConsedRC "consed.primersMinMeltingTempForPCR: 64\n"; print filConsedRC "consed.primersMinimumLengthOfAPrimerForPCR: 18\n"; print filConsedRC "consed.primersMaximumLengthOfAPrimerForPCR: 40\n"; #print filConsedRC "consed.primersMaximumLengthOfAPrimerForPCR: 50\n"; print filConsedRC "consed.autoPCRAmplifyFalseProductsOKIfLargerThanThis: 8000\n"; print filConsedRC "consed.primersMaxLengthOfMononucleotideRepeat: 6\n"; if ( $bSplicedLeader ) { print filConsedRC "consed.autoPCRAmplifyMakePrimerOutOfFirstRegion: true\n"; # print filConsedRC "consed.primersMaxMeltingTempDifferenceForPCR: 100.0\n"; } close( filConsedRC ); $ENV{"CONSED_PARAMETERS"} = $szConsedRCFile; $szLineUsage = ">(transcript name) #1 #2 #3 #4 (biggest/smallest)"; $szSavedLine = ; $bEndOfFile = 0; while( !$bEndOfFile ) { ($szTranscriptName, $nLeftPos1, $nRightPos1, $nLeftPos2, $nRightPos2, $szBiggestOrSmallest ) = split(' ', $szSavedLine ); if ( !defined( $szTranscriptName ) || !defined( $nLeftPos1 ) || !defined( $nRightPos1 ) || !defined( $nLeftPos2 ) || !defined( $nRightPos2 ) || !defined( $szBiggestOrSmallest ) ) { die "line $_ did not have form $szLineUsage"; } # chop off > from the name $szTranscriptName =~ s/^>//; print "\n\n\n------------------------------------------------\n"; print "transcript $szTranscriptName\n"; print "---------------------------------------------------\n\n\n"; # added Feb 2003 push( @aTranscriptNames, $szTranscriptName ); # if ( !defined( $szSequence = ) ) { # die "premature end of file while looking for sequence for transcript $szTranscriptName"; # } $szSequence = ""; $bEndOfFile = 1; # will be changed if not while( defined( $szLine = ) ) { if ( $szLine =~ /^>/ ) { $szSavedLine = $szLine; $bEndOfFile = 0; last; } else { chomp( $szLine ); $szSequence .= $szLine; $szSavedLine = ""; } } if ( -e $szTranscriptName ) { # this transcript has already been processed so skip it next; } mkdir( $szTranscriptName, 0777 ) || die "could not create directory $szTranscriptName"; chdir( $szTranscriptName ) || die "couldn't change directories to $szTranscriptName"; mkdir( "phd_dir", 0777 ) || die "could not create directory $szTranscriptName/phd_dir"; mkdir( "edit_dir", 0777 ) || die "could not create directory $szTranscriptName/edit_dir"; chdir( "edit_dir" ) || die "could not change directories to $szTranscriptName/edit_dir"; # make a fasta file with this single sequence $szFastaFile = $szTranscriptName . ".fasta"; open( filFasta, ">$szFastaFile" ) || die "could not open $szFastaFile for writing"; print( filFasta ">$szTranscriptName\n" ); for( $nChar = 0; $nChar < length( $szSequence ); $nChar += 50 ) { if ( $nChar + 50 < length( $szSequence ) ) { $szLine = substr( $szSequence, $nChar, 50 ); print( filFasta "$szLine\n" ); } else { $szLine = substr( $szSequence, $nChar ); print( filFasta "$szLine\n" ); } } close( filFasta ); !system( "$szFasta2Phd $szFastaFile" ) || die "something wrong running fasta2Phd.perl $szFastaFile"; # move the phd file to where it belongs $szPhdFile = $szTranscriptName . ".phd.1"; rename( $szPhdFile, "../phd_dir/$szPhdFile" ) || die "could not rename $szPhdFile to ../phd_dir/$szPhdFile"; !system( "$szPhd2Ace $szPhdFile" ) || die "something wrong running $szPhd2Ace $szPhdFile"; # We have created the assembly. Now create the autoPCRAmplify.txt file. # Note that phd2Ace.perl adds "-Contig" to the end of the read name. $szContig = $szTranscriptName . "-Contig"; open( filAutoPCRAmplify, ">autoPCRAmplify.txt" ) || die "could not open autoPCRAmplify.txt file for $szTranscriptName"; $szLine = "$szTranscriptName $szContig $nLeftPos1-$nRightPos1 -> $szContig $nLeftPos2-$nRightPos2 <- $szBiggestOrSmallest"; print( filAutoPCRAmplify "$szLine\n" ); close( filAutoPCRAmplify ); # run Consed to get the primers $szAceFile = $szTranscriptName . ".ace"; !system( "$szConsed -ace $szAceFile -autoPCRAmplify autoPCRAmplify.txt" ) || die "something wrong running Consed for $szTranscriptName"; chdir( ".." ) || die "could not change directories for $szTranscriptName"; chdir( ".." ) || die "could not change directories again for $szTranscriptName"; } # location of old checkAndPrintPrimerPairs.perl # back to the beginning of the pcr.transcripts file seek( filTranscripts, 0, 0 ); @aListOfTranscriptHeaders = (); while( ) { if ( /^>/ ) { push( @aListOfTranscriptHeaders, $_ ); } } close( filTranscripts ); # at this point, there are a zillion trans.xxxxx files within this directory # We have the names stored in @aTranscriptNames open( filFileOfPrimers, ">$szFileOfPrimers" ) || die "couldn't open $szFileOfPrimers"; open( filFailures, ">$szFileOfFailures" ) || die "couldn't open $szFileOfFailures"; for( $nTranscript = 0; $nTranscript <= $#aTranscriptNames; ++$nTranscript ) { $szTranscriptName = $aTranscriptNames[ $nTranscript ]; #foreach $szTranscriptName (@aTranscriptNames ) { print "working on transcript $szTranscriptName (" . ($nTranscript + 1) . " out of " . ( $#aTranscriptNames + 1 ) . "...\n"; $szAutoPCRAmplifyFile = $szTranscriptName . "/edit_dir/autoPCRAmplify.fof"; # used if there is a failure $szEditDirPath = $szTranscriptName . "/edit_dir"; # read autoPCRAmplify.fof to get the name of the Consed .out file open( filAutoPCRAmplifyFOF, "$szAutoPCRAmplifyFile" ) || die "could not open autoPCRAmplify.fof file $szAutoPCRAmplifyFile"; if (!defined( $szConsedOutputFile = ) ) { die "empty file $szAutoPCRAmplifyFile"; } chomp( $szConsedOutputFile ); close( filAutoPCRAmplifyFOF ); # open the Consed .out file and then let the subroutine read # and process it. ( $szPathname = $szAutoPCRAmplifyFile ) =~ s/autoPCRAmplify\.fof$//; chdir( $szPathname ) || die "could not change directory to $szPathname"; open( filConsedOutput, "$szConsedOutputFile" ) || die "could not open $szConsedOutputFile"; &processOneOutputFile; close( filConsedOutput ); chdir( $szRootForDirectories ) || die "could not chdir to $szRootForDirectories"; } close( filFileOfPrimers ); print "see files $szFileOfPrimers for primers and $szFileOfFailures for failures (if any)\n"; exit( 0 ); ################################################### # subroutines ################################################### sub processOneOutputFile { @aFailureOutput = (); $nFoundOne = 0; $nAcceptablePrimersRegion1 = -666; $nAcceptablePrimersRegion2 = -666; while() { if ( /^PRIMER_PAIR/ ) { $nFoundOne = 1; # looks like this #PRIMER_PAIR{ #id: trans.2466659.comp product_size: 3311 #ggtttgaatgctgcaccctt temp: 62 2466684-2466703 in CHROMOSOME_I-Contig #cagtttgagtcattttatcaggatgtaa temp: 61 2469967-2469994 in CHROMOSOME_I-Contig #} if ( !defined( $_ = ) ) { die "premature end of file 1st line after PRIMER_PAIR"; } @aWords = split; if ( $aWords[0] ne "id:" || $aWords[2] ne "product_size:" || $#aWords != 3 ) { die "do not recognize line: $_"; } $szRegionID = $aWords[1]; $nProductSize = $aWords[3]; print "\n\n\n"; print "---------------------------------------------------------\n"; print "working on $szRegionID\n"; print "---------------------------------------------------------\n\n\n\n"; # a name like trans.2466659 $szPrimerBaseName = $szRegionID; $szPrimerBaseName =~ s/^trans\.//; $szPrimerBaseName =~ s/\.comp$//; $szForwardPrimerName = $szPrimerBaseName . "f"; $szReversePrimerName = $szPrimerBaseName . "r"; if ( !defined( $_ = ) ) { die "premature end of file 2nd line after PRIMER_PAIR"; } @aWords = split; if ( $aWords[1] ne "temp:" || $aWords[4] ne "in" ) { die "do not recognize line: $_"; } $szForwardPrimerSequence = $aWords[0]; $szForwardPrimerSequence =~ tr/a-z/A-Z/; $nForwardPrimerMeltingTemp = $aWords[2]; if ( !defined( $_ = ) ) { die "premature end of file 3rd line after PRIMER_PAIR"; } @aWords = split; if ( $aWords[1] ne "temp:" || $aWords[4] ne "in" ) { die "do not recognize line: $_"; } $szReversePrimerSequence = $aWords[0]; $szReversePrimerSequence =~ tr/a-z/A-Z/; $nReversePrimerMeltingTemp = $aWords[2]; # now do a cross match against the transcript and see how large # the product is. $szPrimersFasta = $szRegionID . ".primers.fasta"; $szTranscriptFasta = $szRegionID . ".fasta"; $szCrossMatchOutput = $szRegionID . ".primers.cross"; if ( -e $szCrossMatchOutput ) { unlink( $szCrossMatchOutput ); } if ( -e $szPrimersFasta ) { unlink( $szPrimersFasta ); } if ( ! -e $szTranscriptFasta ) { die "couldn't find $szTranscriptFasta"; } open( filPrimersFasta, ">$szPrimersFasta" ) || die "couldn't open $szPrimersFasta for output"; print filPrimersFasta ">$szForwardPrimerName\n"; print filPrimersFasta "$szForwardPrimerSequence\n"; print filPrimersFasta ">$szReversePrimerName\n"; print filPrimersFasta "$szReversePrimerSequence\n"; close( filPrimersFasta ); # print "just in case there is something strange here, this is what will be printed out:\n"; # print "PRIMER_PAIR {\n"; # print "Region: $szRegionID Product size: $nProductSize\n"; # print "$szForwardPrimerName: $szForwardPrimerSequence temp: $nForwardPrimerMeltingTemp\n"; # print "$szReversePrimerName: $szReversePrimerSequence temp: $nReversePrimerMeltingTemp\n"; # print "}\n\n\n"; @aFailureOutput = (); push( @aFailureOutput, "PRIMER_PAIR {\n" ); push( @aFailureOutput, "Region: $szRegionID Product size: $nProductSize\n" ); push( @aFailureOutput, "$szForwardPrimerName: $szForwardPrimerSequence temp: $nForwardPrimerMeltingTemp\n" ); push (@aFailureOutput, "$szReversePrimerName: $szReversePrimerSequence temp: $nReversePrimerMeltingTemp\n" ); push (@aFailureOutput, "}\n\n\n" ); # $szCommand = "$szCrossMatch $szPrimersFasta $szTranscriptFasta -tags -minscore 10 -masklevel 101 -minmatch 10 >$szCrossMatchOutput 2>/dev/null"; # unfortunately, some primers are low complexity sequences so -raw -word_raw is needed $szCommand = "$szCrossMatch $szPrimersFasta $szTranscriptFasta -tags -minscore 10 -masklevel 101 -minmatch 10 -raw -word_raw >$szCrossMatchOutput 2>/dev/null"; !system( $szCommand ) || die "$szCommand failed"; ( $nProblem, $nProductSize2 ) = &parseCrossMatchOutput( $szCrossMatchOutput ); if ( $nProblem != 0 ) { print filFailures @aFailureOutput; print filFailures "\n\n"; } elsif ( $nProductSize2 != $nProductSize ) { print filFailures "for transcript $szRegionID Consed calculated product size $nProductSize but crossmatch calculated $nProductSize2\n\n\n"; } print filFileOfPrimers "PRIMER_PAIR {\n"; print filFileOfPrimers "Region: $szRegionID Product size: $nProductSize\n"; print filFileOfPrimers "$szForwardPrimerName: $szForwardPrimerSequence temp: $nForwardPrimerMeltingTemp\n"; print filFileOfPrimers "$szReversePrimerName: $szReversePrimerSequence temp: $nReversePrimerMeltingTemp\n"; print filFileOfPrimers "}\n\n\n"; } # if ( /^PRIMER_PAIR/ ) { elsif( /acceptable primers for region 1/ ) { $nAcceptablePrimersRegion1 = ( split )[0]; } elsif( /acceptable primers for region 2/ ) { $nAcceptablePrimersRegion2 = ( split )[0]; } } # while() { if ( !$nFoundOne ) { print filFailures "\n\nthere was no PRIMER_PAIR in $szConsedOutputFile\n"; print filFailures "LOOK AT $szEditDirPath/$szConsedOutputFile\n"; # $szRegionID is not set if !$nFoundOne # print filFailures "$szRegionID $nAcceptablePrimersRegion1 acceptable primers region1 and $nAcceptablePrimersRegion2 region2\n"; print filFailures "$nAcceptablePrimersRegion1 acceptable primers region1 and $nAcceptablePrimersRegion2 region2\n"; # let's see what the size of the regions were $bFoundHeader = 0; foreach $szTranscriptHeaderLine (@aListOfTranscriptHeaders ) { @aHeaderWords = split( ' ', $szTranscriptHeaderLine ); $szTranscriptName1 = $aHeaderWords[0]; $szTranscriptName1 =~ s/^>//; if ( $szTranscriptName1 eq $szTranscriptName ) { $bFoundHeader = 1; last; } } die "could not find header for transcript $szTranscriptName" unless ( $bFoundHeader == 1 ); # example of header: # >trans.687551 599 1172 1862 2120 smallest $nTopStrandPrimerLeftmost = $aHeaderWords[1]; $nTopStrandPrimerRightmost = $aHeaderWords[2]; $nBottomStrandPrimerLeftmost = $aHeaderWords[3]; $nBottomStrandPrimerRightmost = $aHeaderWords[4]; printf( filFailures "could it be that the regions are too small?: %d %d\n\n", $nTopStrandPrimerRightmost - $nTopStrandPrimerLeftmost + 1, $nBottomStrandPrimerRightmost - $nBottomStrandPrimerLeftmost + 1 ); } close( filConsedOutput ); } sub parseCrossMatchOutput { ($szCrossMatchOutput ) = @_; my @aPrimerNames = (); my @aPrimersComplemented = (); my @aPrimerLeftPositions = (); my @aPrimerRightPositions = (); open( filCrossMatchOutput, "$szCrossMatchOutput" ) || die "could not open $szCrossMatchOutput file"; while( ) { if ( /^ALIGNMENT/ ) { # The lines look like this: #ALIGNMENT 22 0.00 0.00 0.00 8223518f 1 22 (0) trans.8223518 288 309 (2620) #ALIGNMENT 22 0.00 0.00 0.00 8223518r 1 23 (0) C trans.8223518 (2444) 485 463 @aWords = split; if ( $aWords[8] ne "(0)" ) { # case in which we have a partial match somewhere that does not # extend to the 3' end of the primer next; } if ( $aWords[2] ne "0.00" || $aWords[3] ne "0.00" || $aWords[4] ne "0.00" ) { print filFailures "the primer did not match the transcript precisely in $szCrossMatchOutput\n"; } push( @aPrimerNames, $aWords[5] ); if ( $aWords[9] eq "C" ) { push( @aPrimersComplemented, "C" ); push( @aPrimerLeftPositions, $aWords[13] ); push( @aPrimerRightPositions, $aWords[12] ); } else { push( @aPrimersComplemented, "U" ); push( @aPrimerLeftPositions, $aWords[10] ); push( @aPrimerRightPositions, $aWords[11] ); } } # if ( /^ALIGNMENT/ ) {... } # while( ) { close( filCrossMatchOutput ); if ( $#aPrimerNames != 1 ) { $nAlignmentLines = $#aPrimerNames + 1; print filFailures "there should be 2 ALIGNMENT lines in $szCrossMatchOutput but instead there were $nAlignmentLines\n"; print filFailures "le " . $szEditDirPath . "/" . $szCrossMatchOutput . "\n"; return( ( 1, 0 ) ); } if ( $aPrimersComplemented[0] eq $aPrimersComplemented[1] ) { print filFailures "the primers in $szCrossMatchOutput appear to not be in the correct orientation\n"; return( ( 1, 0 ) ); } if ( $aPrimersComplemented[0] eq "C" ) { $nBottomStrandPrimer = 0; $nTopStrandPrimer = 1; } else { $nBottomStrandPrimer = 1; $nTopStrandPrimer = 0; } $nProductSize = $aPrimerRightPositions[ $nBottomStrandPrimer ] - $aPrimerLeftPositions[ $nTopStrandPrimer ] + 1; if ( $nProductSize < 0 ) { print @aPrimersComplemented, "\n"; print @aPrimerLeftPositions, "\n"; print @aPrimerRightPositions, "\n"; print @aPrimerNames, "\n"; print filFailures "the primers in $szCrossMatchOutput must be in incorrect orientation since the product size is $nProductSize between $aPrimerRightPositions[ $nBottomStrandPrimer ] and $aPrimerLeftPositions[ $nTopStrandPrimer ]\n"; return( ( 1, 0 ) ); } # check that the bottom strand (rightmost) primer is the reverse (r) one so # that we can tell our technician to always sequence the reverse $szBottomStrandPrimerName = $aPrimerNames[ $nBottomStrandPrimer ]; if ( $szBottomStrandPrimerName !~ /r$/ ) { print filFailures "the bottom strand primer is $szBottomStrandPrimerName which is not the reverse primer\n"; return( ( 1, 0 ) ); } # now check that the primers are within the regions that Phil specified # in the transcript file $bFoundHeader = 0; foreach $szTranscriptHeaderLine (@aListOfTranscriptHeaders ) { @aHeaderWords = split( ' ', $szTranscriptHeaderLine ); $szTranscriptName1 = $aHeaderWords[0]; $szTranscriptName1 =~ s/^>//; if ( $szTranscriptName1 eq $szTranscriptName ) { $bFoundHeader = 1; last; } } die "could not find header for transcript $szTranscriptName" unless ( $bFoundHeader == 1 ); # example of header: # >trans.687551 599 1172 1862 2120 smallest $nTopStrandPrimerLeftmost = $aHeaderWords[1]; $nTopStrandPrimerRightmost = $aHeaderWords[2]; $nBottomStrandPrimerLeftmost = $aHeaderWords[3]; $nBottomStrandPrimerRightmost = $aHeaderWords[4]; if ( ! ( ( $nTopStrandPrimerLeftmost <= $aPrimerLeftPositions[ $nTopStrandPrimer ] ) && ( $aPrimerRightPositions[ $nTopStrandPrimer ] <= $nTopStrandPrimerRightmost ) ) ) { print filFailures "\n\ntop strand primer for transcript $szTranscriptName was not within the specified positions"; return( ( 1, 0 ) ); } if ( ! ( ( $nBottomStrandPrimerLeftmost <= $aPrimerLeftPositions[ $nBottomStrandPrimer ] ) && ( $aPrimerRightPositions[ $nBottomStrandPrimer ] <= $nBottomStrandPrimerRightmost ) )) { print filFailures "\n\nbottom strand primer for transcript $szTranscriptName was not within the specified positions\n"; return( (1, 0 ) ); } return( ( 0, $nProductSize ) ); }