#!/usr/bin/perl ##---------------------------------------------------------------------------## ## File: ## @(#) RepeatMasker ## Author: ## Arian Smit ## Robert Hubley ## Description: ## Takes one or more DNA sequence files, in fasta format, and returns ## masked sequence file(s) (repetitive DNA is masked) for database ## searches and a file with detailed annotation of repeat locations.The ## sequence data are screened against a library of repetitive sequences ## using the program cross_match (Phil Green, unpublished) or ## WUBlast ( Gish et al ). ## ## NOTE: See RepeatMaskerConfig.pm for necessary installation ## customization. ## #****************************************************************************** #* Copyright (C) University of Washington 1996-1999 Developed by Arian Smit, #* Philip Green and Colin Wilson of the University of Washington Department of #* Genomics. #* #* Copyright (C) Arian Smit 2000-2001 #* #* Copyright (C) Institute for Systems Biology 2002-2004 Developed by #* Arian Smit and Robert Hubley. #* #* This work is licensed under the Open Source License v2.1. To view a copy #* of this license, visit http://www.opensource.org/licenses/osl-2.1.php or #* see the license.txt file contained in this distribution. #* ############################################################################### # ChangeLog: # # $Log: RepeatMasker,v $ # Revision 1.250 2011/04/26 22:41:44 rhubley # Cleanup before a distribution # # ############################################################################### # # To Do: # # =head1 NAME RepeatMasker - Mask repetitive DNA =head1 SYNOPSIS RepeatMasker [-options] =head1 DESCRIPTION The options are: =over 4 =item -h(elp) Detailed help =back Default settings are for masking all type of repeats in a primate sequence. =over 4 =item -w(ublast) **deprecated** Use WU-blast/AB-blast, rather than cross_match as engine. Note use the new -engine wublast option. =item -de(cypher) **deprecated** Use DeCypher, rather than cross_match as engine. Note use the new -engine decypher option. =item -e(ngine) [crossmatch|wublast|abblast|ncbi|decypher] Use an alternate search engine to the default. =item -pa(rallel) [number] The number of processors to use in parallel (only works for batch files or sequences over 50 kb) =item -s Slow search; 0-5% more sensitive, 2-3 times slower than default =item -q Quick search; 5-10% less sensitive, 2-5 times faster than default =item -qq Rush job; about 10% less sensitive, 4->10 times faster than default (quick searches are fine under most circumstances) repeat options =item -nolow /-low Does not mask low_complexity DNA or simple repeats =item -noint /-int Only masks low complex/simple repeats (no interspersed repeats) =item -norna Does not mask small RNA (pseudo) genes =item -alu Only masks Alus (and 7SLRNA, SVA and LTR5)(only for primate DNA) =item -div [number] Masks only those repeats < x percent diverged from consensus seq =item -lib [filename] Allows use of a custom library (e.g. from another species) =item -cutoff [number] Sets cutoff score for masking repeats when using -lib (default 225) =item -species Specify the species or clade of the input sequence. The species name must be a valid NCBI Taxonomy Database species name and be contained in the RepeatMasker repeat database. Some examples are: -species human -species mouse -species rattus -species "ciona savignyi" -species arabidopsis Other commonly used species: mammal, carnivore, rodentia, rat, cow, pig, cat, dog, chicken, fugu, danio, "ciona intestinalis" drosophila, anopheles, elegans, diatoaea, artiodactyl, arabidopsis, rice, wheat, and maize =back Contamination options =over 4 =item -is_only Only clips E coli insertion elements out of fasta and .qual files =item -is_clip Clips IS elements before analysis (default: IS only reported) =item -no_is Skips bacterial insertion element check =item -rodspec Only checks for rodent specific repeats (no repeatmasker run) =item -primspec Only checks for primate specific repeats (no repeatmasker run) =back Running options =over 4 =item -gc [number] Use matrices calculated for 'number' percentage background GC level =item -gccalc RepeatMasker calculates the GC content even for batch files/small seqs =item -frag [number] Maximum sequence length masked without fragmenting (default 60000, 300000 for DeCypher) =item -maxsize [nr] Maximum length for which IS- or repeat clipped sequences can be produced (default 4000000). Memory requirements go up with higher maxsize. =item -nocut Skips the steps in which repeats are excised =item -noisy Prints search engine progress report to screen (defaults to .stderr file) =item -nopost Do not postprocess the results of the run ( i.e. call ProcessRepeats ). NOTE: This options should only be used when ProcessRepeats will be run manually on the results. =back output options =over 4 =item -dir [directory name] Writes output to this directory (default is query file directory, "-dir ." will write to current directory). =item -a(lignments) Writes alignments in .align output file =item -inv Alignments are presented in the orientation of the repeat (with option -a) =item -lcambig Outputs ambiguous DNA transposon fragments using a lower case name. All other repeats are listed in upper case. Ambiguous fragments match multiple repeat elements and can only be called based on flanking repeat information. =item -small Returns complete .masked sequence in lower case =item -xsmall Returns repetitive regions in lowercase (rest capitals) rather than masked =item -x Returns repetitive regions masked with Xs rather than Ns =item -poly Reports simple repeats that may be polymorphic (in file.poly) =item -source Includes for each annotation the HSP "evidence". Currently this option is only available with the "-html" output format listed below. =item -html Creates an additional output file in xhtml format. =item -ace Creates an additional output file in ACeDB format =item -gff Creates an additional Gene Feature Finding format output =item -u Creates an additional annotation file not processed by ProcessRepeats =item -xm Creates an additional output file in cross_match format (for parsing) =item -fixed Creates an (old style) annotation file with fixed width columns =item -no_id Leaves out final column with unique ID for each element (was default) =item -e(xcln) Calculates repeat densities (in .tbl) excluding runs of >=20 N/Xs in the query =back =head1 SEE ALSO =over 4 Crossmatch, ProcessRepeats =back =head1 COPYRIGHT Copyright 2007-2010 Arian Smit, Institute for Systems Biology =head1 AUTHORS Arian Smit Robert Hubley =cut # # Module Dependence # use strict; use FindBin; use lib $FindBin::RealBin; use Getopt::Long; use POSIX qw(:sys_wait_h); use Storable qw(nstore retrieve); use File::Copy; use File::Spec; use File::Path; use Data::Dumper; use Cwd; # RepeatMasker Libraries use RepbaseEMBL; use RepeatMaskerConfig; use SearchResult; use SearchResultCollection; use Taxonomy; use CrossmatchSearchEngine; use WUBlastSearchEngine; use NCBIBlastSearchEngine; use DeCypherSearchEngine; use SimpleBatcher; use FastaDB; # A bug in 5.8 produces way too many warnings if ( $] && $] >= 5.008003 ) { use warnings; } # Debugging flag my $DEBUG = 0; # # Version # # This is a neat trick. CVS allows you to tag # files in a repository ( i.e. cvs tag "2003/12/03" ). # If you check out that release into a new # directory with "cvs co -r "2003/12/03" it will # place this string into the $Name: open-3-3-0 $ space below # automatically. This will help us keep track # of which release we are using. If we simply # check out the code as "cvs co RepeatMasker" the # $Name: open-3-3-0 $ macro will be blank and thus we use # this as the development version. # my $CVSTag = '$Name: open-3-3-0 $'; my $version; if ( $CVSTag =~ /\$\s*Name:\s*open-(\S+)\s*\$/ ) { $version = $1; $version =~ s/-/./g; $version = "open-$version"; } else { $version = 'development-$Id: RepeatMasker,v 1.250 2011/04/26 22:41:44 rhubley Exp $'; } print "RepeatMasker version $version\n"; if ( $ARGV[ 0 ] && $ARGV[ 0 ] eq '-v' ) { exit( 0 ); } # # Option processing # e.g. # -t: Single letter binary option # -t=s: String parameters # -t=i: Number paramters # my @opts = qw( a|alignments ace alu cut cutoff=s decypher|d dir=s div=s excln fixed frag=s gc=s gccalc gff help int|noint inv is_clip is_only lib=s low|l|nolow lcambig maxsize=i no_id no_is nocut noisy norna parallel=i poly primspec q qq rodspec s small species|sp=s u wublast|w ncbi x xm xsmall nopost engine|e=s source html ); # # Get the supplied command line options, and set flags # my %options = (); unless ( &GetOptions( \%options, @opts ) ) { exec "pod2text $0"; exit( 0 ); } # Print the internal POD documentation if something is missing if ( $#ARGV == -1 && !$options{'help'} ) { print "No query sequence file indicated\n\n"; # This is a nifty trick so we don't have to have # a duplicate "USAGE()" subroutine. Instead we # just recycle our POD docs. See PERL POD for more # details. exec "pod2text $0"; die; } # Print out the big help file if requested my $PAGER = $ENV{PAGER}; $PAGER = "more" if !defined $PAGER; system( "$PAGER $RepeatMaskerConfig::REPEATMASKER_DIR/repeatmasker.help\n" ), exit( 0 ) if $options{'help'}; # # Verify that the libraries exist # die "The RepeatMasker installation directory " . "(\$RepeatMaskerConfig::REPEATMASKER_DIR) is " . "incorrectly set in the RepeatMaskerConfig.pm file. Please open the " . "RepeatMaskerConfig.pm file and edit the " . "\$RepeatMaskerConfig::REPEATMASKER_DIR line.\n" unless ( -d "$RepeatMaskerConfig::REPEATMASKER_DIR/Libraries" && -d "$RepeatMaskerConfig::REPEATMASKER_DIR/Matrices" ); die "RepeatMasker could not find the repeat library at:\n" . " $RepeatMaskerConfig::REPEATMASKER_DIR/Libraries/RepeatMaskerLib.embl\n" . "This file is necessary to run RepeatMasker. A minimal version of this\n" . "file is distributed with RepeatMasker. For an extensive library of\n" . "interspersed repeats please obtain the complete file at:\n" . "http://www.girinst.org and install before using RepeatMasker.\n\n" unless ( -s "$RepeatMaskerConfig::REPEATMASKER_DIR/Libraries/RepeatMaskerLib.embl" ); my $dbversion; my $repbaseVersion; my $rmLibraryVersion; my $minLibraryVersion = 20040311.0; open INVERS, "<$RepeatMaskerConfig::REPEATMASKER_DIR/Libraries/RepeatMaskerLib.embl" || die "Error! Could not open RepeatMasker library database file!\n"; while ( ) { if ( /CC\s+RELEASE\s+(\d+)(-min)*;/ ) { die "The RepeatMasker.lib file is out of date. This version of " . "RepeatMasker requires library version $minLibraryVersion or " . "higher. Your version is $1.\n" unless ( $1 >= $minLibraryVersion ); $repbaseVersion = $1 . $2; $rmLibraryVersion = $1 . $2; $dbversion = "RepBase Update $repbaseVersion, RM database version $rmLibraryVersion"; die "\nMissing full RepeatMasker repeat library!\n\n" . "This installation of RepeatMasker is using the minimal repeat\n" . "database ( RepeatMaskerLib.embl ) which is distributed with the\n" . "program. This minimal database can only be used along with a\n" . "custom repeat library ( specified with the -lib option ). To\n" . "obtain the full repeat database please visit:\n" . " http://www.girinst.org.\n\n" if ( $2 eq "-min" && !defined $options{'lib'} ); last; } } close INVERS; # TODO: Implement and remove this. if ( $options{'cut'} ) { die "\nERROR: The -cut option to RepeatMasker is not supported in this\n" . "release. It will be rolled into a new annotation utility in\n" . "the near future. If you need this functionality sooner please\n" . "send an email to Robert Hubley ( rhubley\@systemsbiology.org ).\n" . "Thanks for your patience.\n\n"; } # # Get the date # my $date = localtime( time() ); # Windows does not support the use of ":" in a filename. $date =~ s/[ ,\t,\n:]//g; # # Setup the search engine # my $searchEngine; my $engine = $RepeatMaskerConfig::DEFAULT_SEARCH_ENGINE; if ( ( defined $options{'engine'} && ( $options{'engine'} eq "wublast" || $options{'engine'} eq "abblast" ) ) || defined $options{'wublast'} ) { # User wants wublast/abblast $engine = "wublast"; } elsif ( defined $options{'engine'} && $options{'engine'} eq "crossmatch" ) { # User wants crossmatch $engine = "crossmatch"; } elsif ( ( defined $options{'engine'} && $options{'engine'} eq "decypher" ) || defined $options{'decypher'} ) { # User wants decypher $engine = "decypher"; } elsif ( ( defined $options{'engine'} && $options{'engine'} eq "ncbi" ) || defined $options{'ncbi'} ) { # User wants ncbi $engine = "ncbi"; } elsif ( defined $options{'engine'} ) { die "I have never heard of the search engine $options{'engine'}. Please\n" . "use ncbi/abblast/wublast/decypher or crossmatch.\n"; } if ( $engine eq "wublast" ) { print "Search Engine: ABBlast/WUBlast\n"; $searchEngine = WUBlastSearchEngine->new( pathToEngine => $RepeatMaskerConfig::WUBLASTP_PRGM ); if ( not defined $searchEngine ) { die "Cannot execute $RepeatMaskerConfig::WUBLASTP_PRGM (check " . " the RepeatMaskerConfig.pm file in the RepeatMasker directory )\n"; } # TODO CHECK TO SEE IF WE NEED THIS $options{'wublast'} = 1; } elsif ( $engine eq "crossmatch" ) { print "Search Engine: Crossmatch\n"; $searchEngine = CrossmatchSearchEngine->new( pathToEngine => $RepeatMaskerConfig::CROSSMATCH_PRGM ); if ( not defined $searchEngine ) { die "Cannot execute $RepeatMaskerConfig::CROSSMATCH_PRGM (check " . " the RepeatMaskerConfig.pm file in the RepeatMasker directory )\n"; } } elsif ( $engine eq "decypher" ) { print "Search Engine: Decypher\n"; # Set up search engine for DeCypher $searchEngine = DeCypherSearchEngine->new( pathToEngine => $RepeatMaskerConfig::DECYPHER ); if ( not defined $searchEngine ) { die "Cannot execute $RepeatMaskerConfig::DECYPHER ( check " . " the RepeatMaskerConfig.pm file in RepeatMasker directory )\n"; } } elsif ( $engine eq "ncbi" ) { print "Search Engine: NCBI/RMBLAST\n"; $searchEngine = NCBIBlastSearchEngine->new( pathToEngine => $RepeatMaskerConfig::RMBLASTN_PRGM ); if ( not defined $searchEngine ) { die "Cannot execute rmblastn"; } $engine = "ncbi"; } else { die "Search engine ( $engine ) is unknown to RepeatMasker. Please check " . "the RepeatMaskerConfig.pm or rerun the configure script!.\n"; } ## ## RepeatMasker Batching Parameters ## # Most computers can handle the memory/cpu requirements of # running 60kb chunks. Currently our batching mechanism will # not breakup anything smaller than this. my $fragmentSize = 60000; # Decypher is even more capable $fragmentSize = 300000 if ( $engine eq "decypher" ); # # Selection of a batch overlap length can have large impacts on the program. # The overlap boundaries are places where edge effects produce partial # overlapping annotations. Also matrix differences in flanking batches # can cause the same repeat to have different divergence, score and alignment # length characteristics. # my $overlapLen = 2000; # # User supplied fragment length # if ( defined $options{'frag'} ) { if ( $options{'frag'} < ( 2 * $overlapLen ) ) { warn "RepeatMasker: You may not use a fragment size (-frag " . "$options{'frag'} ) which is less than 2 times the overlap " . "length (overlapLen = $overlapLen). Defaulting to $fragmentSize\n"; } else { $fragmentSize = $options{'frag'}; } } # # Parse filenames # foreach my $file ( @ARGV ) { if ( $file =~ /\s/ ) { die "RepeatMasker can not handle filenames with spaces " . "like the file \"$file\"\n"; } elsif ( $file =~ /([\`\!\$\^\&\*\{\}\[\]\|\\\;\"\'\<\>\?])/ ) { die "RepeatMasker can not handle filenames with the special " . "character \"$1\" as in the file \"$file\"\n"; } } # # If $file is across a system boundary writing temporary files # to the file's directory takes a lot of time, so these are written # to a temporary directory: my ( $tempdir, $runnumber ) = &createtempdir( \%options, $date, $ARGV[ 0 ] ); # Add tempdir to the end of the library search path push @RepeatMaskerConfig::LIBPATH, $tempdir; # # Species & Library Setup # my $tax = Taxonomy->new( taxonomyDataFile => "$RepeatMaskerConfig::REPEATMASKER_DIR/taxonomy.dat" ); my $generalLibDir; my $speciesLibDir; my $customLibDir; ( $options{"species"}, $generalLibDir, $speciesLibDir, $customLibDir ) = &initLibraries( \%options, "$RepeatMaskerConfig::REPEATMASKER_LIB_DIR", $tax, $tempdir, \@RepeatMaskerConfig::LIBPATH, $searchEngine, $rmLibraryVersion ); # Read in a frozen hash of element id's which are refineable. This # hash is created by initLibraries the first time a library is initialized. my $refineableHashRef; if ( !$options{"lib"} ) { $refineableHashRef = retrieve( "$generalLibDir/refineableHash.dat" ); } my $conspec; $conspec = 'primate' if $options{'primspec'}; $conspec = 'rodentia' if $options{'rodspec'}; my ( $sinecutlib, $shortcutlib, $cutlib, $shortlib, $longlib, $retrolib ) = ( "$tempdir/sinecutlib", "$tempdir/shortcutlib", "$tempdir/cutlib", "$tempdir/shortlib", "$tempdir/longlib", "$tempdir/retrolib" ); # # Main loop # FILECYCLE: foreach my $file ( @ARGV ) { unless ( -r $file ) { print "cannot read file $file in " . cwd() . "\n"; next; } unless ( -s $file ) { print "File $file appears to be empty.\n"; next; } my $compressed = ""; if ( $file =~ /\.gz$/ ) { unless ( `gunzip $file 2>&1` ) { # Name $file only changes if gunzip did not complain # (file may end with .gz but not be zipped $file =~ s/\.gz$//; $compressed = "zipped"; } } elsif ( $file =~ /\.Z$/ ) { unless ( `uncompress $file 2>&1` ) { $file =~ s/\.Z$//; $compressed = "Zed"; } } # With one file $#ARGV == 0 print "\nanalyzing file $file\n" if ( $#ARGV >= 0 ); # Don't mess with original query file and remember original # location, e.g. for quality file. my $fileori = $file; my ( $originaldir, $fileend ) = ( File::Spec->splitpath( $file ) )[ 1, 2 ]; $originaldir = "." if ( $originaldir eq "" ); my $file = "$tempdir\/$fileend"; ## Look for quality files and read in my $qualFile = "$fileori" . ".qual"; # foo.seq files have foo.qual quality files; $qualFile =~ s/\.seq.qual$/\.qual/; # Check if files will be overwritten &SaveOldFiles( $fileori, $fileend, $originaldir, $date, \%options ); ## Create a batcher object. Upon construction the ## object will survey the fasta file, check for syntax ## errors, and create a byte index for all parseable sequences ## in the file. Copy the file so we don't mess with the ## original. system( " cp $fileori $file " ); my $db = FastaDB->new( fileName => $file, openMode => SeqDBI::ReadWrite, maxIDLength => 50 ); my $batcher = SimpleBatcher->new( $db, $fragmentSize, $overlapLen ); ## How many sequences are in the fasta file? my $totseqcnt = $db->getSeqCount(); ## Don't process this file unless we have some ## unambiguous (ACGT) sequence. my $sublength = 0; unless ( $sublength = $db->getSubtLength() ) { &SkipFile( $file ); &cleanUp( \%options, $runnumber, $tempdir, $fileori, $fileend, $file, $originaldir, $compressed ); next FILECYCLE; } my $totGClevel = 100 * $db->getGCLength() / $sublength; $totGClevel = sprintf "%4.2f", $totGClevel; my $maskfile = "$file.masked"; my $fullmaskfile = "$file.masked.all"; my $fullcatfile = "$file.cat.all"; my $fullRefinedFile = "$file.ref"; my $fullcutfile = "$file.cut.all"; my $fullLogFile = "$file.log"; my $fullErrFile = "$file.stderr"; my $numX = 1; # The number of X's to take the place of a # repeat base. my $totseqlen = $db->getSeqLength(); my $batchCount = $batcher->getBatchCount(); my $numberChildren = $options{'parallel'} ? $options{'parallel'} : 1; $numberChildren = $batchCount if ( $batchCount < $numberChildren ); my $badForkCount = 0; # A failsafe for in case the fork goes badly my $badForkMax = 20; # The number of bad forks ( in a row ) before exit my $retryLimit = 2; # Attempt to run a batch 2 times before failing it my %children = (); # A hash of children PIDs with their batch nums my $child_id = 0; # The PID of returned by fork my %batchStatus = (); # A hash which holds the retry count for # each batch number. # Initialize the batchStatus hash for ( my $k = 1 ; $k <= $batchCount ; $k++ ) { $batchStatus{$k} = { 'retry' => 0, 'childPID' => -1 }; } # # Job processing loop # Process all batches stored in the batchStatus hash. If # for some reason a job fails the entry in $batchStatus is # incremented. We continue to re-run this batch until the # $retryLimit is reached. # JOBLOOP: while ( keys( %batchStatus ) ) { # # First check the status of currently running # proceses. # if ( keys( %children ) ) { # Wait for at least one to exit; print "Waiting for a child to die\n" if ( $DEBUG ); my $childPID = wait(); my $retVal = ( $? >> 8 ); print "Child died. Organize the funeral for PID=$childPID" . " RetVal=$retVal\n" if ( $DEBUG ); # Check if we returned with a vaid PID if ( $childPID > 0 ) { ## Child process is gone # Find out what batch it was working on my $batchNum = $children{$childPID}; # Delete it from the children list delete $children{$childPID}; my $batchFile = $file . "_batch-" . $batchNum; # Check it's status ## TODO:...check that the masked file is the same ## size of the original. if ( $retVal == 0 && -e "$batchFile.cat" && -s "$batchFile.masked" ) { ## Child completed ok. ## Append annotations, log, and stderrs system( "cat $batchFile.cat >> $fullcatfile" ); system( "cat $batchFile.ref >> $fullRefinedFile" ); system( "cat $batchFile.masked.log >> $fullLogFile" ) if ( -e "$batchFile.masked.log" ); system( "cat $batchFile.masked.stderr >> $fullErrFile" ) if ( -e "$batchFile.masked.stderr" ); # Now remove them unlink( "$batchFile.cat", "$batchFile.masked", "$batchFile.masked.log", "$batchFile.masked.stderr" ) unless ( $DEBUG ); unlink( "$batchFile.ref" ) unless ( $DEBUG ); print "Child for batch=$batchNum completed ok....\n" if ( $DEBUG ); # Remove the batch entry in batchStatus delete $batchStatus{$batchNum}; } else { ## Process failed. # Check how many times we have run it if ( $batchStatus{$batchNum}->{'retry'} < $retryLimit ) { ## Under the retry limit...rerun it! print "Child for batch=$batchNum failed ($retVal) " . "retry#" . $batchStatus{$batchNum}->{'retry'} . "\n" if ( $DEBUG ); $batchStatus{$batchNum}->{'retry'}++; $batchStatus{$batchNum}->{'childPID'} = -1; print "WARNING: Retrying batch ( $batchNum )...\n"; } else { ## Too many retries. ## We are out of here! print "FATAL ERROR: RepeatMasker giving up. One or more\n" . "batches failed! Unfortunately this type of error\n" . "cannot be recovered from. Please submit this output\n" . "to the feedback page at the repeatmasker website: \n" . " www.repeatmasker.org\n" . "Further details about this problem may be found in\n" . "the directory: $tempdir\n"; exit( -1 ); } } } else { # Child is still running } } # End if ( keys( %children ... # Gather a list of batches to work on my @batchNums = grep { ( $batchStatus{$_}->{'childPID'} < 0 ) } sort( { $a <=> $b } keys( %batchStatus ) ); # Decide how many jobs to start my $numberToStart = 0; if ( @batchNums > ( $numberChildren - keys( %children ) ) ) { # Simply the number requested - the number running $numberToStart = ( $numberChildren - keys( %children ) ); } else { # Simply the remaining batches $numberToStart = @batchNums; } # # Loop through and fork to our hearts # content. # for ( my $k = 0 ; $k < $numberToStart ; $k++ ) { FORK: if ( $child_id = fork ) { # Our children are our future print "Parent produced child $child_id\n" if ( $DEBUG ); $children{$child_id} = $batchNums[ $k ]; $batchStatus{ $batchNums[ $k ] }->{'childPID'} = $child_id; } elsif ( $child_id == 0 ) { my $batchSeqFile = $file . "_batch-" . $batchNums[ $k ]; my $batchCatFile = $batchSeqFile . ".cat"; my $batchRefinedFile = $batchSeqFile . ".ref"; my $batchMaskFile = $batchSeqFile . ".masked"; ## Get batch parameters my $seq_cnt = $batcher->getBatchSeqCount( $batchNums[ $k ] ); my $seqlen = $batcher->getBatchSeqLength( $batchNums[ $k ] ); my $frac_GC = $batcher->getBatchAverageGC( $batchNums[ $k ] ); print "Batch average GC = $frac_GC\n" if ( $DEBUG ); ## Create the batch file $batcher->writeBatchFile( $batchNums[ $k ], $batchMaskFile ); my $GC_frac = 0; if ( $options{'gc'} ) { # user decides GC background $GC_frac = $options{'gc'}; } elsif ( ( $seq_cnt > 1 || $seqlen <= 2000 ) && !$options{'gccalc'} ) { # More than one sequence *or* too short to get # acccurate measure of background then take average matrices # NOTE: Option -gccalc overules this behaviour $GC_frac = 43; } else { # program calculates GC background from sequence $GC_frac = $frac_GC; } my $GC = &choosematrices( $GC_frac ); print "GC choosen= $GC\n" if ( $DEBUG ); # Check for E coli insertion elements unless ( $options{'no_is'} || $conspec ) { #$options{'wublast'} && $seqlen > 500000) { # with wu_blast clipping in large queries goes wrong, but # can't remember why; perhaps only in large single seqs my $sequenceArrayRef; my $seqWithNameHashRef; &locateISElements( \%options, $batcher, $batchNums[ $k ], $batchMaskFile, $qualFile, $RepeatMaskerConfig::REPEATMASKER_DIR, $searchEngine, $file, $generalLibDir ); } elsif ( !$options{'no_is'} && !$conspec ) { print "When using wu_blast on sequences over 500 kb the IS " . "checking is skipped. You can check for IS elements " . "with -is_only using cross_match or using smaller " . "files\n\n"; } my $cutAlus = 0; my $cutSome = 0; unless ( $options{'is_only'} ) { my $batchSeqDB = FastaDB->new( fileName => "$batchSeqFile.masked", openMode => SeqDBI::ReadWrite, maxIDLength => 50 ); ( $cutAlus, $cutSome ) = &locateRepeats( \%options, $RepeatMaskerConfig::REPEATMASKER_DIR, $GC, $GC_frac, $batchSeqFile, $batchMaskFile, $sinecutlib, $cutlib, $shortcutlib, $shortlib, $longlib, $retrolib, $generalLibDir, $speciesLibDir, $batchCount, $conspec, $searchEngine, $numX, $batchSeqDB, "batch $batchNums[$k] of $batchCount", $tax, $customLibDir, $tempdir, $batchNums[ $k ], $refineableHashRef ); ## The rule of thumb is that if we didn't return ## results we at least have an empty *.cat file. if ( !-e "$batchSeqFile.cat" ) { system( "touch $batchSeqFile.cat" ); } if ( !-e "$batchSeqFile.ref" ) { system( "touch $batchSeqFile.ref" ); } if ( $cutAlus || $cutSome ) { &resizeMaskedSeq( [ "$batchSeqFile.temp.cut2", "$batchSeqFile.temp.cut1", "$batchSeqFile.temp.alu1", "$batchSeqFile.temp.alu0", "$batchSeqFile.temp.simple1" ], $numX, $batchSeqDB ); } unlink( <$batchSeqFile.tmp.* $batchSeqFile.temp.*> ); } else { # Create an emtpy cat file. system( "touch $batchSeqFile.cat" ); } if ( $batcher->isBatchFragmented( $batchNums[ $k ] ) ) { # The batch file included fragments so # the annotation positions need updating # before we can integrate them into the # the final results. &adjustFragmentPositions( $batcher, $batchCatFile ); if ( -f "$batchSeqFile.ref" ) { &adjustFragmentPositions( $batcher, $batchRefinedFile ); } } # Exit child process with clean status exit( 0 ); } # elsif ( $child_id == 0 )... elsif ( $! =~ /No more process/ ) { # Supposedly recoverable fork error $badForkCount++; sleep 5; redo FORK; } else { # Weird fork error print "\nWARNING: Cannot fork...for unknown reasons!\n"; $badForkCount++; last; } } # End for ( my $k = 0 ; $k < $numberToStart... # Give up if it looks bad for us ## TODO: Consider die'ng here...since there ## is nothing more we can do to recover. last if ( $badForkCount == $badForkMax ); # # Just so we don't loop endlessly. Lets # hang around here until at least one # process quits. NOTE: This may be dangerous # consider this more.... # print "Parent waiting for child to finish...\n" if ( $DEBUG ); } # End of JOBLOOP ## TODO: This is where we used to make the cut file. This ## option should be rolled into a new utility which ## annotates/cuts based on the annotation database. # # Report batch overlap boundaries for anyone who cares # my $overlapBoundariesHashRef = $batcher->getOverlapBoundaries(); if ( keys( %{$overlapBoundariesHashRef} ) > 0 ) { system( "echo \"## Batch Overlap Boundaries\" >> $fullcatfile" ); foreach my $fragSeqName ( keys( %{$overlapBoundariesHashRef} ) ) { my $overlapList = join( ", ", @{ $overlapBoundariesHashRef->{$fragSeqName} } ); system( "echo \"## $fragSeqName $overlapList\" >> $fullcatfile" ); } } # Rename final cat file if ( -s "$fullcatfile" ) { rename $fullcatfile, "$file.cat"; rename $fullcutfile, "$file.cut" if -s $fullcutfile; } ## Looks for a small set of sequences which are known ## only to be in rodents or primates. if ( $conspec ) { my $matchspec = ""; my $specspecrepfound = ""; my %didspec = (); open( IN, "$file.cat" ); while ( ) { if ( /$\d+$/ ) { unless ( $didspec{$_} ) { # in overlap regions of fragments ++$specspecrepfound; $matchspec .= $_; } $didspec{$_} = 1; } } close IN; if ( $specspecrepfound ) { my $outfile = "$fileori.$conspec.spec"; if ( $options{'dir'} ) { $outfile = $options{'dir'} . "\/$fileend.$conspec.spec"; } unless ( open CONTAMOUT, ">$outfile" ) { $outfile = cwd() . "\/$fileend.$conspec.spec"; open CONTAMOUT, ">$outfile"; } print CONTAMOUT "The contigs or reads containing the " . "following matches are either completely " . "or partially of $conspec origin. The " . "annotation is not necessarily optimal " . "because the query was only compared to a " . "subset of the $conspec database.\n" . "\n$matchspec\n"; close CONTAMOUT; print "There appear to be at least $specspecrepfound " . "$conspec specific interspersed repeats in the query " . "sequence. See the file \"$outfile\"\n\n"; } else { print "No repeats of definite $conspec origin were matched\n"; } &systemint( "rm $tempdir\/$fileend*" ); next; # isn't this redundant? } # # Setup options for processRepeats # my $speed = "undef"; if ( $options{'q'} ) { $speed = "quick"; } elsif ( $options{'qq'} ) { $speed = "rushjob"; } elsif ( $options{'s'} ) { $speed = "sensitive"; } else { $speed = "default"; } my $engine = $searchEngine->getPathToEngine(); my @path = split( /[\\\/]/, $engine ); my $engineInfo = "run with " . pop( @path ) . " version " . $searchEngine->getVersion(); open( CATOUT, ">>$file.cat" ); print CATOUT "## Total Sequences: " . $db->getSeqCount() . "\n"; print CATOUT "## Total Length: " . $db->getSeqLength() . "\n"; print CATOUT "## Total NonMask ( excluding >20bp runs of N/X bases ): " . $db->getXNLength() . "\n"; print CATOUT "## Total NonSub ( excluding all non ACGT bases ):" . $db->getSubtLength() . "\n"; print CATOUT "RepeatMasker version $version , $speed mode\n"; print CATOUT "$engineInfo\n$dbversion\n"; close CATOUT; undef $db; # TODO: Check on .seqending stuff in SEQDBI &systemint( "cat $fileori.alert >> $file.cat" ) if -s "$fileori.alert" && !$options{'is_clip'}; unless ( exists $options{'nopost'} ) { # # Alter source file for masking if we want to go with # the clipped IS sequence file. All annotations will # be based on the $file.withoutIS rather than the original # file. # my $maskSourceFile = $file; $maskSourceFile = "$file.withoutIS" if ( $options{'is_clip'} && -s "$file.withoutIS" ); my $prOptions = ""; $prOptions .= "-ace " if ( $options{'ace'} ); $prOptions .= "-a " if ( $options{'a'} ); $prOptions .= "-lib " . $options{'lib'} . " " if ( $options{'lib'} && !$options{'species'} ); $prOptions .= "-fixed " if ( $options{'fixed'} ); $prOptions .= "-gff " if ( $options{'gff'} ); $prOptions .= "-no_id " if ( $options{'no_id'} ); $prOptions .= "-poly " if ( $options{'poly'} ); $prOptions .= "-u " if ( $options{'u'} ); $prOptions .= "-lcambig " if ( $options{'lcambig'} ); $prOptions .= "-source " if ( $options{'source'} || $options{'a'} ); $prOptions .= "-html " if ( $options{'html'} ); $prOptions .= "-xm " if ( $options{'xm'} ); $prOptions .= "-excln " if ( $options{'excln'} ); $prOptions .= "-noint " if ( $options{'int'} ); $prOptions .= "-species \"" . $options{'species'} . "\" " if ( defined $options{'species'} && $options{'species'} ne "" ); $prOptions .= "-orifile $fileori "; $prOptions .= "-maskSource $maskSourceFile "; $prOptions .= "-x " if ( $options{'x'} ); $prOptions .= "-xsmall " if ( $options{'xsmall'} ); $prOptions .= "$file.cat"; $prOptions .= " -refcat $file.ref"; # # Run processrepeats # &systemint( "$RepeatMaskerConfig::REPEATMASKER_DIR/ProcessRepeats " . "$prOptions" ) unless $options{'is_only'}; if ( $options{'is_clip'} && -s "$file.withoutIS.masked" ) { rename( "$maskSourceFile.masked", "$file.masked" ); } } &cleanUp( \%options, $runnumber, $tempdir, $fileori, $fileend, $file, $originaldir, $compressed ) unless ( $DEBUG ); } # END FILECYCLE &systemint( "rm -R $tempdir" ) unless ( $DEBUG ); # We are soooo done exit( 0 ); ######################## S U B R O U T I N E S ############################ ##-------------------------------------------------------------------------## ## Use: my &adjustFragmentPositions( $batcher, $catfile ); ## ## Returns ## ## Eample cross_match hit line. All positions are 1-based ## ## Foward Strand: ## ## Column Description Eg ## ============================================================== ## 0 Smith-Waterman Score 2334 ## 1 Percent Divergence 8.44 ## 2 Percent Deletions 0.00 ## 3 Percent Insertions 3.25 ## 4 Query Sequence Human ## 5 Query Begin Pos 127 ## 6 Query End Pos 737 ## 7 Query Left - The bp left after query end (8222) ## 8 Subject Sequence AluSx#SINE/Alu ## 9 Subject Begin 1 ## 10 Subject End 298 ## 11 Subject Left (14) ## 12 ID - A unique id for this run of crossmatch 2 ## ## Reverse Strand: ## ## Column Description Eg ## ============================================================== ## 0 Smith-Waterman Score 2334 ## ... same as above.. ## 8 Strand Designator C ## 9 Subject Sequence AluSx#SINE/Alu ## 10 Subject Left (14) ## 11 Subject End 298 ## 12 Subject Begin 1 ## 13 ID 3 ## ## Globals Used: None ## ##-------------------------------------------------------------------------##� sub adjustFragmentPositions { my $batcher = shift; my $catfile = shift; my @fields = (); my $unfraggedfile = ""; my $querysnext = 0; my $origSeqID = ""; my $nameLengthAdjustment = 0; my $posLengthAdjustment = 0; my $adjustment = 0; my $batchSeqID = undef; my $seqDBObj = $batcher->getSeqDBObj(); open IN1, "<$catfile"; ( $unfraggedfile = $catfile ) =~ s/\.cat$//; $unfraggedfile .= '.tempcat'; open OUT1, ">$unfraggedfile"; my $deleteAlign = 0; while ( ) { # pound thingy is relic if ( /\s$\d+$\s/ && !/^\#/ ) { @fields = split; $batchSeqID = $fields[ 4 ]; $origSeqID = $batcher->getSeqIDFromBatchSeqID( $batchSeqID ); $deleteAlign = 0; $querysnext = 1; $adjustment = $batcher->translateBatchSeqPositionToFastaSeq( $batchSeqID, $fields[ 5 ] ) - $fields[ 5 ]; $fields[ 5 ] += $adjustment; $fields[ 6 ] += $adjustment; $fields[ 7 ] = $seqDBObj->getSeqLength( $origSeqID ) - $fields[ 6 ]; $fields[ 7 ] = "(" . "$fields[7]" . ")"; $fields[ 4 ] = $origSeqID; ## Truncate repeats at the middle of the overlap. my ( $beginValidPos, $endValidPos ) = $batcher->getSeqIDValidRange( $batchSeqID ); ## RULES FOR JUST DELETING ELEMENTS OUTSIDE OUR BOUNDARY if ( $fields[ 5 ] <= $beginValidPos && $fields[ 6 ] <= $beginValidPos ) { if ( $DEBUG ) { print STDERR " DELETING ANNOT!\n"; print STDERR join( ' ', @fields ), "\n"; print STDERR "beginValidPos = $beginValidPos " . "endValidPos = $endValidPos\n"; } $deleteAlign = 1; next; } if ( $endValidPos > 0 && $fields[ 6 ] > $endValidPos && $fields[ 5 ] > $endValidPos ) { if ( $DEBUG ) { print STDERR " DELETING ANNOT2!\n"; print STDERR join( ' ', @fields ), "\n"; print STDERR "beginValidPos = $beginValidPos " . "endValidPos = $endValidPos\n"; } $deleteAlign = 1; next; } print OUT1 join( ' ', @fields ), "\n"; } elsif ( /^C?\s*(\S+)(\s+)(\d+)\s+[A-Z\-]+\s+(\d+)/ ) { if ( $querysnext && !$deleteAlign ) { my $queryname = substr( $origSeqID, 0, 13 ); my $nameLengthAdjustment = length( $queryname ) - length( $1 ); my $originalspace = length $2; my $begin = $3; my $end = $4; my $posLengthAdjustment = -length( $begin ); $begin += $adjustment; $posLengthAdjustment += length( $begin ); $end += $adjustment; #print "before :$_\n"; my $spacing = " " x ( $originalspace - $posLengthAdjustment - $nameLengthAdjustment ); #print "original=$originalspace pos=$posLengthAdjustment " . # "name=$nameLengthAdjustment\n"; s/^(C?\s*)\S+\s+\d+(\s+[A-Z\-]+\s+)\d+/$1$queryname$spacing$begin$2$end/; #print "It's now:$_\n"; print OUT1; $querysnext = ""; } elsif ( !$deleteAlign ) { $querysnext = 1; print OUT1; } } elsif ( !$deleteAlign ) { print OUT1; } } close IN1; close OUT1; rename $unfraggedfile, $catfile; } ##-------------------------------------------------------------------------## ## Use: my &locateISElements( \%options, $batcher, ## $batchNum, $file, $qualFile, ## $RepeatMaskerConfig::REPEATMASKER_DIR, $searchEngine, ## $outFilesPrefix, $generalLibDir ); ## ## ## Returns ## ## Only needs to support two atomic operations. Report and ## clip. ## ## Flow: ## 1. Searches $file for IS elements and saves output ## to $file.iscat (Note...removes .masked from name if ## it exists). ## ## 2. If a quality file is handed to us it reads the *entire* ## quality file into memory ( qualData local datastructure ). ## ## 3. Parse search results and locate IS elements. ## ## 4. Clip IS elements from $file if is_clip or is_only options ## used. ## ## 5. Clip qual values from qualData local datastructure if ## is_clip or is_only options used. ## ## 6. Saves a copy of the sequence/qual at this stage ## if is_clip or is_only options are used. The ## saved copies are named: ## $file.withoutIS, $file.qual.withoutIS. ## ## NOTE: A basic understanding of Dutch and French is necessary ## to decypher this subroutine. ## ## Globals Used: none ##-------------------------------------------------------------------------##� sub locateISElements { my %options = %{ shift() }; my $batcher = shift; my $batchNum = shift; my $file = shift; my $qualFile = shift; my $DIRECTORY = shift; my $searchEngine = shift; my $outFilesPrefix = shift; my $generalLibDir = shift; print "\nChecking for E. coli insertion elements\n"; my @ISlines = (); my @ISFailed = (); my $ISclipornot = 0; my $qualprob = 0; my %clipfailed = (); my $seqDB = FastaDB->new( fileName => $file, openMode => SeqDBI::ReadWrite, maxIDLength => 50 ); my $filePrefix = $file; $filePrefix =~ s/\.masked//; my $minscore = 17; my $minmatch = 15; my $lib = "$generalLibDir/is.lib"; my $matrix = "identity.matrix"; my ( $gap_initValue, $ins_gap_extValue, $del_gap_extValue ) = ( "", "", "" ); my $bandwidth = ""; my $masklevel = ""; my $maskfile = $file; my $raw = ""; my $wordraw = ""; my $outfile = "$filePrefix" . ".iscat"; my $searchResults; ## Perform the search ( $minmatch, $bandwidth, $searchResults ) = &search( \%options, $DIRECTORY, $outfile, $maskfile, $lib, $minmatch, $bandwidth, $matrix, $gap_initValue, $ins_gap_extValue, $del_gap_extValue, $minscore, $masklevel, $searchEngine, $wordraw, $raw ); # # Read in quality file data if available # # This should probably be generalized and made # into an object for dealing with quality files. # For now we will leave well enough alone. # my %qualData = (); my %qualHdr = (); my @qualNames = (); my $qualname = ""; if ( -s $qualFile ) { if ( -r $qualFile ) { open( INQUAL, $qualFile ); while ( ) { chomp; if ( /^>/ ) { $qualname = $_; $qualname =~ s/^>(\S*).*/$1/; push @qualNames, $qualname; $qualHdr{$qualname} = $_; } else { $qualData{$qualname} .= $_; } } close INQUAL; } else { $qualprob = 1; } } else { $qualprob = 1; } # # Open the search results file. # my $lastname = ""; my $lastbegin = ""; my $lastend = ""; my $lastscore = 0; my $lastis = ""; my $lastor = ""; my $naam = ""; my $begin = 0; my $end = 0; my $or = ""; my $is = ""; my $beginis = 0; my $leftis = 0; my $deleted = 0; my $remaining = 0; # # Loop through search results # for ( my $i = 0 ; $i < $searchResults->size() ; $i++ ) { my $result = $searchResults->get( $i ); $naam = $result->getQueryName(); $begin = $result->getQueryStart(); $end = $result->getQueryEnd(); $remaining = $result->getQueryRemaining(); if ( $result->getOrientation() ne "C" ) { $or = 'pos'; } else { $or = 'neg'; } $is = $result->getSubjName(); $leftis = $result->getSubjRemaining(); $beginis = $result->getSubjStart(); $deleted = 0 unless $lastname && $naam eq $lastname; ## $deleted keeps track of nr of bp in IS elements already ## deleted from the same query sequence; $leftis =~ tr/()//d; ## remove classification $is =~ s/\#\w+//; ## Allows to clip an IS element with a gap in the middle ## (may be a sequencing gap) ## $lastis is set if last IS was incomplete but did either ## start at pos 1 and had + orientation or end at last bp ## and had - orientation if ( $lastis ) { ## Following allows to clip an IS element with a gap ## in the middle (may be a sequencing gap) if ( $is eq $lastis && $naam eq $lastname && $or eq $lastor && ( $or eq 'pos' && $leftis == 0 || $or eq 'neg' && $beginis == 1 ) && $begin - $lastend <= 2 ) { # rather conservatively $begin = $lastbegin; $beginis = 1 if $or eq 'pos' && $leftis == 0; $leftis = 0 if $or eq 'neg' && $beginis == 1; } else { push @ISFailed, ( $i - 1 ) if $lastscore > 25; $lastis = $lastbegin = $lastend = ""; $lastor = $lastscore = ""; } } ## complete element if ( $beginis == 1 && $leftis == 0 ) { my $length = $end + 1 - $begin; ## This section needs occasional updating (new ## elements, new information on flexibility in ## duplication lengths) my @dupLengths = (); if ( $is eq 'IS1' ) { @dupLengths = ( 9, 8, 10 ); } elsif ( $is eq 'IS2' ) { @dupLengths = ( 5 ); } elsif ( $is eq 'IS3' ) { @dupLengths = ( 3 ); } elsif ( $is eq 'IS5' ) { @dupLengths = ( 4 ); } elsif ( $is eq 'IS10' ) { @dupLengths = ( 9 ); } elsif ( $is eq 'IS30' ) { @dupLengths = ( 2 ); } elsif ( $is eq 'IS150' ) { @dupLengths = ( 3, 4 ); } elsif ( $is eq 'IS186' ) { @dupLengths = ( 10, 11 ); } elsif ( $is eq 'Tn1000' ) { @dupLengths = ( 5 ); } else { @dupLengths = (); } ## Get the left and right flanking site duplications my ( $links, $rechts ) = qw(links rechts); my $dupLength = 0; ## Try all known flanking site lengths while ( @dupLengths && $links ne $rechts ) { $dupLength = shift( @dupLengths ); next if ( $dupLength > ( $begin - 1 ) || $dupLength > $remaining ); $links = $seqDB->getSubstr( $naam, $begin - $dupLength - 1 - $deleted, $dupLength ); $rechts = $seqDB->getSubstr( $naam, $end - $deleted, $dupLength ); } ## If flanking site duplications are identical (these ## are cloning artifacts; there was no time for ## substitutions to take place to make the sites different if ( $links ne "" && $links eq $rechts ) { ## Alters $seqWithName by deleting a segment ## including the IS element and one flanking site, ## thereby reconstructing the original sequence if ( $options{'is_only'} || $options{'is_clip'} ) { $seqDB->setSubstr( $naam, $begin - 1 - $deleted, $length + $dupLength, "" ); ## Similarly removes qual values in $qualData if ( $qualname ) { my @qual = split( / /, $qualData{$naam} ); splice @qual, $begin - 1 - $deleted, $length + $dupLength; $qualData{$naam} = join " ", @qual; } $deleted += $length + $dupLength; } push @ISlines, $i; } else { push @ISFailed, $i; } $lastis = $lastbegin = $lastend = $lastscore = ""; } elsif ( $beginis == 1 && $or eq 'pos' || $leftis == 0 && $or eq 'neg' ) { ## save info to see if rest of the IS element follows $lastis = $is; $lastbegin = $begin; $lastend = $end; $lastor = $or; $lastscore = $result->getScore( $i ); } else { $lastis = $lastbegin = $lastend = ""; # only report if match scores > 25; otherwise may be # freak false positive push @ISFailed, ( $i ) if ( $result->getScore( $i ) > 25 ); } $lastname = $naam; } if ( $lastis ) { #often with IS_only push @ISFailed, ( $searchResults->size() - 1 ) if ( $lastscore > 25 ); } ## adjust position for broken up very large sequences if ( @ISlines && $batcher->isBatchFragmented( $batchNum ) ) { my $seqDBObj = $batcher->getSeqDBObj(); foreach my $resultIndex ( @ISlines ) { my $ISResult = $searchResults->get( $resultIndex ); my $qryName = $ISResult->getQueryName(); my $qryBegin = $ISResult->getQueryStart(); my $qryEnd = $ISResult->getQueryEnd(); my $qryLeft = $ISResult->getQueryRemaining(); my $adjustment = $batcher->translateBatchSeqPositionToFastaSeq( $qryName, $qryBegin ) - $qryBegin; $ISResult->setQueryStart( $qryBegin + $adjustment ); $ISResult->setQueryEnd( $qryEnd + $adjustment ); $ISResult->setQueryRemaining( $seqDBObj->getSeqLength( $batcher->getSeqIDFromBatchSeqID( $qryName ) ) - $qryEnd ); } } if ( @ISlines ) { my $qualthere = ""; if ( $qualprob ) { $qualthere = "\nNo corresponding Phred file ($qualFile) " . "could be read in this directory,\n so no " . "modified quality file has been made\n"; } else { $qualthere = "\nThe quality file $outFilesPrefix.qualwithoutIS " . "corresponds to this clipped fasta sequence file\n"; } if ( $options{'is_only'} ) { $ISclipornot = "These elements and one flanking duplicated " . "site have been clipped out\nof the sequence " . "in the file $outFilesPrefix.withoutIS\n$qualthere\n"; } elsif ( $options{'is_clip'} ) { $ISclipornot = "These elements and one flanking duplicated site " . "have been clipped out\nbefore the repeatmasker " . "run, so that coordinates do not correspond\n" . "everywhere with the original sequence. A clipped " . "version of the\nsequence(s) is in the file " . "$outFilesPrefix.withoutIS\n$qualthere\n"; } else { $ISclipornot = "These elements can be clipped out with the " . "options is_clip or is_only. The latter does " . "not run the 'normal' RepeatMasker routine and " . "positions in the current\n.out file will not " . "correspond with the -is_only reconstructed " . "sequence.\n"; } my $ISreport = "One or more E. coli IS elements were found:\n"; foreach my $resultIndex ( @ISlines ) { my $ISResult = $searchResults->get( $resultIndex ); $ISreport .= " " . $ISResult->getSubjName() . " in " . $ISResult->getQueryName() . ": " . $ISResult->getQueryStart() . " - " . $ISResult->getQueryEnd() . "\n"; } $ISreport .= "\n$ISclipornot\n"; if ( @ISFailed ) { $ISreport .= "The following E coli IS elements could not be " . "confidently clipped out:\n"; foreach my $resultIndex ( @ISFailed ) { my $ISResult = $searchResults->get( $resultIndex ); $ISreport .= " " . $ISResult->getSubjName() . " in " . $ISResult->getQueryName() . ": " . $ISResult->getQueryStart() . " - " . $ISResult->getQueryEnd() . "\n"; } } print "$ISreport\n"; if ( $options{'is_only'} || $options{'is_clip'} ) { system( "cat $file >> $outFilesPrefix.withoutIS" ); if ( $qualname ) { my $qualout = "$outFilesPrefix.qual.withoutIS"; $qualout =~ s/.seq.qual.withoutIS/.qual.withoutIS/; open( QUALOUT, ">$qualout" ); } ## $nom French too! foreach my $nom ( @qualNames ) { # fixed with \n sept 03 print QUALOUT "$qualHdr{$nom}\n$qualData{$nom}\n"; } close QUALOUT if $qualname; } open( ALERTOUT, ">>$outFilesPrefix.alert" ); print ALERTOUT "$ISreport"; close ALERTOUT; } #elsif ( $clipfailed{$file} ) { elsif ( @ISFailed ) { my $ISreport = "The following E coli IS elements could not be " . "confidently clipped out:\n"; foreach my $resultIndex ( @ISFailed ) { my $ISResult = $searchResults->get( $resultIndex ); $ISreport .= " " . $ISResult->getSubjName() . " in " . $ISResult->getQueryName() . ": " . $ISResult->getQueryStart() . " - " . $ISResult->getQueryEnd() . "\n"; } open( ALERTOUT, ">>$outFilesPrefix.alert" ); print ALERTOUT "$ISreport"; close ALERTOUT; print "$ISreport"; } #unlink $iscat; } ##-------------------------------------------------------------------------## ## Use: my ( $minmatch, $bandwidth, $resultsCollection ) = ## &search( \%options, $DIRECTORY, $outfile, $maskfile, ## $lib, $minmatch, $bandwidth, $matrix, ## $gap_initValue, $ins_gap_extValue, ## $del_gap_extValue, $minscore, $masklevel, ## $searchEngine, $wordraw, ## $raw ); ## ## Runs the search engine using the settings specified in the ## subroutine locateRepeats, traps error results and aborts ## on search engine failure. ## ## Input ## ## \%options : The RepeatMasker option hashtable ## ## Returns ## ## $minmatch : The new minmatch ( as it may have changed ) ## $bandwidth : The new bandwidth ( dito ) ## $resultsCollection : ## ## ## Globals Used: None ## Globals Modified: None ## ##-------------------------------------------------------------------------## sub search { my %options = %{ shift() }; # The RepeatMasker option hashtable my $DIRECTORY = shift; my $outfile = shift; my $maskfile = shift; my $lib = shift; my $minmatch = shift; my $bandwidth = shift; my $matrix = shift; my $gap_initValue = shift; my $ins_gap_extValue = shift; my $del_gap_extValue = shift; my $minscore = shift; my $masklevel = shift; my $searchEngine = shift; my $wordraw = shift; my $raw = shift; my $alignments = ""; my $poly = ""; my $quiet = ""; print "RepeatMasker::search( \%options, $DIRECTORY, $outfile, $maskfile,\n" . " $lib, $minmatch, $bandwidth, $matrix\n" . " $gap_initValue, $ins_gap_extValue, $del_gap_extValue\n" . " $minscore, $masklevel, \$searchEngine, $wordraw\n" . " $raw );\n" if ( $DEBUG ); # Set options for crossmatch $alignments = "-alignments" if $options{'a'}; $poly = "-poly" if $options{'poly'}; $quiet = "2>> $outfile.stderr" unless $options{'noisy'}; if ( $options{'div'} && $options{'div'} <= 20 ) { ++$minmatch; ++$minmatch if $options{'div'} <= 10; } my $matrixPrefix = "crossmatch"; if ( $searchEngine->isa( "WUBlastSearchEngine" ) ) { $matrixPrefix = "wublast/aa"; } elsif ( $searchEngine->isa( "NCBIBlastSearchEngine" ) ) { $matrixPrefix = "ncbi/nt"; } elsif ( $searchEngine->isa( "DeCypherSearchEngine" ) ) { $matrixPrefix = "terablast"; } if ( $searchEngine->isa( "WUBlastSearchEngine" ) && $options{'s'} && $lib !~ /simple|at\.lib/ ) { $minmatch -= 1; $bandwidth += 15; } my $cycle = 1; $searchEngine->setQuery( $maskfile ); $searchEngine->setSubject( $lib ); $searchEngine->setMatrix( "$DIRECTORY/Matrices/$matrixPrefix/$matrix" ); if ( $wordraw ne "" ) { $searchEngine->setWordRaw( 1 ); } else { $searchEngine->setWordRaw( 0 ); } if ( $raw ne "" ) { $searchEngine->setScoreMode( SearchEngineI::basicScoreMode ); } else { $searchEngine->setScoreMode( SearchEngineI::complexityAdjustedScoreMode ); } $searchEngine->setGenerateAlignments( 1 ) if ( $alignments ); $searchEngine->setGapInit( $gap_initValue ); $searchEngine->setInsGapExt( $ins_gap_extValue ); $searchEngine->setDelGapExt( $del_gap_extValue ); $searchEngine->setMinScore( $minscore ); my ( $maskLevelNum ) = ( $masklevel =~ /-masklevel\s+(\d+)\s*/ ); $searchEngine->setMaskLevel( $maskLevelNum ); my $cmd = ""; my $status = 0; my $retry = 0; my $resultCollection; while ( $cycle ) { ( $status, $resultCollection ) = $searchEngine->search( minMatch => $minmatch, bandwidth => $bandwidth ); if ( $status ) { if ( $searchEngine->isa( "DeCypherSearchEngine" ) ) { if ( $retry++ < 4 ) { print STDERR "DeCypher communications failure, retrying\n"; } else { print STDERR "DeCypher failure, contact TimeLogic support\n"; exit( -1 ); } } elsif ( $bandwidth > 14 ) { $bandwidth = 14; warn "WARNING: Comparison failed. Retrying with smaller bandwidth\n"; } elsif ( $bandwidth == 4 ) { # second or third simple repeats check # Extreme measures for very long simple satellites $bandwidth = 1; warn "WARNING: Comparison failed. Retrying with smaller bandwidth\n"; } elsif ( $minmatch < 10 ) { $minmatch++; warn "WARNING: Comparison failed. Retrying with larger minmatch " . "($minmatch)\n"; } else { print "WARNING: The search engine returned an error (" . ( $? >> 8 ) . ")\n" . "A search phase could not complete on this batch.\n" . "The batch file will be re-run and if possible the\n" . "program will resume.\n"; exit( -1 ); } } else { $cycle = 0; } } #$resultCollection->write( $outfile ); #undef $searchEngine; return ( $minmatch, $bandwidth, $resultCollection ); } # sub search ##-------------------------------------------------------------------------## ## Use: my ( $cutAlus, $cutSome ) = &locateRepeats ( ## \%options, $RepeatMaskerConfig::REPEATMASKER_DIR, $GC, $GC_frac, ## $file, $maskfile, $sinecutlib, $cutlib, ## $shortcutlib, $shortlib, $longlib, ## $retrolib, $generalLibDir, ## $speciesLibDir, $fragCnt, ## $conspec, $searchEngine, ## $numX, $seqDB, [$batchIdentifierText], ## $tax, $customLibDir, $tmpDir, $batchNum, $refineableHashRef ); ## ## Returns ## ## Phases of a search: ## - Search for contamination species if $conspec defined ## - Search for simple repeats ( simple.lib ) ## ## Species Specific: ## - Search user supplied library ## ## Mammal Specific: ## - Search sinecutlib if it exists ( full length abundant SINEs ) ## - Search shortcutlib if it exists ( full-length IRs ) ## - Search cutlib if it exists ## ## Homo: ## - Search sinecutlib again ## ## - Search shortlib if it exists ## - Search longlib if it exists ## - Search mirs.lib ## - Search mir.lib ## - Search retro.lib ***if it exists*** ## - Search l1.lib ## ## - Search simple.lib (again?) ## - Search at.lib ## ## ## Globals Used: None ## Globals Modified: None ##-------------------------------------------------------------------------##� sub locateRepeats { my %options = %{ shift() }; # The RepeatMasker option hashtable my $DIRECTORY = shift; my $GC = shift; my $GC_frac = shift; my $file = shift; my $maskfile = shift; my $sinecutlib = shift; my $cutlib = shift; my $shortcutlib = shift; my $shortlib = shift; my $longlib = shift; my $retrolib = shift; my $generalLibDir = shift; my $speciesLibDir = shift; my $fragCnt = shift; my $conspec = shift; my $searchEngine = shift; my $numX = shift; my $seqDB = shift; my $batchIdentifierText = shift; my $tax = shift; my $customLibDir = shift; my $tempdir = shift; my $batchNum = shift; my $refineableHashRef = shift; my $repBoundRef; my $cutAlus = 0; my $cutSome = 0; my $stage = 0; my $fullymasked = 0; my $resultsCollection; my %refinementHash = (); # IN DEVELOPMENT my ( $minscore, $minmatch, $lib, $matrix, $gap_initValue, $ins_gap_extValue, $del_gap_extValue, $bandwidth, $masklevel, $raw, $wordraw, $outfile ) = (); MASKREPEATS: { my $step = 0; # # Data structures which help in recording the locations # of various levels of repeat excision. # my @BegRef = (); my @EndRef = (); my %excisedRepRef = (); ## DEBUG #print "Saving a copy of the file /tmp/saveme-$batchNum-orig\n"; #system("cp $maskfile /tmp/saveme-$batchNum-orig"); ## ## Contamination check ## - check for human or rodent specific repeats if ( $conspec ) { ## want to add rat - mouse distinction print "Identifying $conspec specific repeats"; if ( $batchIdentifierText ne "" ) { print " in " . $batchIdentifierText . "\n"; } else { print "\n"; } $minscore = 250; # loses sensitivity, but avoids all trouble $minscore = 300 if $options{'rodspec'}; $minmatch = &setminmatch( \%options, 8, 9, 10, 11 ); $lib = "$generalLibDir/humspec.lib" if ( $options{'primspec'} ); $lib = "$generalLibDir/rodspec.lib" if ( $options{'rodspec'} ); $matrix = "14p" . "$GC" . ".matrix"; ( $gap_initValue, $ins_gap_extValue, $del_gap_extValue ) = ( -30, -6, -6 ); $bandwidth = "10"; $masklevel = "-masklevel 90"; $raw = $wordraw = ""; $outfile = "$file.tmp.concheck"; ( $minmatch, $bandwidth, $resultsCollection ) = &search( \%options, $DIRECTORY, $outfile, $maskfile, $lib, $minmatch, $bandwidth, $matrix, $gap_initValue, $ins_gap_extValue, $del_gap_extValue, $minscore, $masklevel, $searchEngine, $wordraw, $raw ); &Choose( \%options, "contam", $fragCnt, $lib, $resultsCollection, $tax ); if ( $resultsCollection->size() ) { &addToRefinementCollection( $resultsCollection, \%refinementHash, $seqDB, $batchNum, $stage++, $refineableHashRef ); &fragmentAnnotations( $resultsCollection, \%excisedRepRef, $numX ); if ( $options{'a'} ) { if ( $options{'inv'} ) { $resultsCollection->write( $outfile, SearchResult::AlignWithSubjSeq ); } else { $resultsCollection->write( $outfile, SearchResult::AlignWithQuerySeq ); } } else { $resultsCollection->write( $outfile, SearchResult::NoAlign ); } } # resultsCollection->size() last MASKREPEATS; } # if ( $conspec ) ## ## Simple Repeats ## - delete almost perfect simple repeats unless ( $options{'nocut'} || $options{'low'} || $options{'alu'} ) { print "identifying simple repeats"; if ( $batchIdentifierText ne "" ) { print " in " . $batchIdentifierText . "\n"; } else { print "\n"; } $minscore = 180; # Change threshold for Decypher $minscore = 200 if ( $searchEngine->isa( "DeCypherSearchEngine" ) ); $minmatch = &setminmatch( \%options, 8, 9, 10, 11 ); $lib = "$generalLibDir/simple.lib"; $matrix = "simple1.matrix"; ( $gap_initValue, $ins_gap_extValue, $del_gap_extValue ) = ( -40, -15, -15 ); $bandwidth = 1; $masklevel = "-masklevel 1"; $raw = "-raw"; $wordraw = "-word_raw"; $outfile = "$file.tmp.simple1"; ( $minmatch, $bandwidth, $resultsCollection ) = &search( \%options, $DIRECTORY, $outfile, $maskfile, $lib, $minmatch, $bandwidth, $matrix, $gap_initValue, $ins_gap_extValue, $del_gap_extValue, $minscore, $masklevel, $searchEngine, $wordraw, $raw ); &Choose( \%options, "simple", $fragCnt, $lib, $resultsCollection, $tax ); if ( $resultsCollection->size() ) { &addToRefinementCollection( $resultsCollection, \%refinementHash, $seqDB, $batchNum, $stage++, $refineableHashRef ); &exciseRepeats( $resultsCollection, $outfile, $step, 0, \@BegRef, \@EndRef, \%excisedRepRef, $numX, $seqDB ); if ( $outfile =~ /alu0$/ ) { $cutAlus = 1; } else { $cutSome = 1; } $repBoundRef = &extractRepeatBoundaries( $resultsCollection ); &fragmentAnnotations( $resultsCollection, \%excisedRepRef, $numX ); &updateExcisedList( $repBoundRef, \%excisedRepRef ); if ( $options{'a'} ) { if ( $options{'inv'} ) { $resultsCollection->write( $outfile, SearchResult::AlignWithSubjSeq ); } else { $resultsCollection->write( $outfile, SearchResult::AlignWithQuerySeq ); } } else { $resultsCollection->write( $outfile, SearchResult::NoAlign ); } ++$step; } } # Simple repeats ## ## High complexity repeat searches ## unless ( $options{'int'} ) { #unless only low complexity DNA is to be masked ## ## Single species specific library ## if ( defined $options{'lib'} || -s "$speciesLibDir/specieslib" || -s "$speciesLibDir/specieslib.bsq" || -s "$speciesLibDir/specieslib.xps" ) { if ( defined $options{'lib'} ) { my ( $custLibVol, $custLibDir, $custLibFile ) = File::Spec->splitpath( $options{'lib'} ); print "identifying matches to " . $custLibFile . " sequences"; $lib = "$customLibDir/$custLibFile"; } else { print "identifying matches to " . $options{'species'} . " sequences"; $lib = "$speciesLibDir/specieslib"; } # opt_lib created with all non-mammalian opt_species options # all species but mammals are currently treated in a naive fashion if ( $batchIdentifierText ne "" ) { print " in " . $batchIdentifierText . "\n"; } else { print "\n"; } $minscore = 225; $minscore = $options{'cutoff'} if $options{'cutoff'}; $minmatch = &setminmatch( \%options, 8, 9, 11, 13 ); $matrix = "20p" . "$GC" . ".matrix"; ( $gap_initValue, $ins_gap_extValue, $del_gap_extValue ) = ( -30, -6, -5 ); $bandwidth = "14"; $masklevel = "-masklevel 90"; $raw = $wordraw = ""; $outfile = "$file.tmp.custom"; ( $minmatch, $bandwidth, $resultsCollection ) = &search( \%options, $DIRECTORY, $outfile, $maskfile, $lib, $minmatch, $bandwidth, $matrix, $gap_initValue, $ins_gap_extValue, $del_gap_extValue, $minscore, $masklevel, $searchEngine, $wordraw, $raw ); &Choose( \%options, "masking", $fragCnt, $lib, $resultsCollection, $tax ); &addToRefinementCollection( $resultsCollection, \%refinementHash, $seqDB, $batchNum, $stage++, $refineableHashRef ); $fullymasked = &maskRepeats( $resultsCollection, \%options, $seqDB ); &adjustAnnotationCoordinates( $resultsCollection, 5, \@BegRef, \@EndRef, $numX ); &fragmentAnnotations( $resultsCollection, \%excisedRepRef, $numX ); if ( $options{'a'} ) { if ( $options{'inv'} ) { $resultsCollection->write( $outfile, SearchResult::AlignWithSubjSeq ); } else { $resultsCollection->write( $outfile, SearchResult::AlignWithQuerySeq ); } } else { $resultsCollection->write( $outfile, SearchResult::NoAlign ); } last MASKREPEATS if $fullymasked; } # User supplied lib ## ## RepeatMasker specific libraries ## else { # one of the provided mammalian databases is used ## ## Human ## if ( $tax->isA( $options{'species'}, "primates" ) ) { ###### excise young alus ##### print "identifying full-length ALUs"; if ( $batchIdentifierText ne "" ) { print " in " . $batchIdentifierText . "\n"; } else { print "\n"; } if ( $options{'nocut'} ) { $minscore = 1500; } else { $minscore = 1200; } $minmatch = &setminmatch( \%options, 7, 8, 10, 12 ); $lib = "$speciesLibDir/sinecutlib"; $matrix = "14p" . "$GC" . ".matrix"; ( $gap_initValue, $ins_gap_extValue, $del_gap_extValue ) = ( -35, -7, -6 ); $bandwidth = "20"; if ( $options{'nocut'} ) { $masklevel = "-masklevel 80"; } else { $masklevel = "-masklevel 1"; } $raw = $wordraw = ""; $outfile = "$file.tmp.alu0"; ( $minmatch, $bandwidth, $resultsCollection ) = &search( \%options, $DIRECTORY, $outfile, $maskfile, $lib, $minmatch, $bandwidth, $matrix, $gap_initValue, $ins_gap_extValue, $del_gap_extValue, $minscore, $masklevel, $searchEngine, $wordraw, $raw ); if ( $options{'nocut'} ) { # if no cutting is done (only screened for Alus > 1500) &Choose( \%options, "alumask", $fragCnt, $lib, $resultsCollection, $tax ); if ( $resultsCollection->size() ) { &addToRefinementCollection( $resultsCollection, \%refinementHash, $seqDB, $batchNum, $stage++, $refineableHashRef ); $fullymasked = &maskRepeats( $resultsCollection, \%options, $seqDB ); &fragmentAnnotations( $resultsCollection, \%excisedRepRef, $numX ); if ( $options{'a'} ) { if ( $options{'inv'} ) { $resultsCollection->write( $outfile, SearchResult::AlignWithSubjSeq ); } else { $resultsCollection->write( $outfile, SearchResult::AlignWithQuerySeq ); } } else { $resultsCollection->write( $outfile, SearchResult::NoAlign ); } } } # $options{'nocut'} else { &Choose( \%options, "alu", $fragCnt, $lib, $resultsCollection, $tax ); if ( $resultsCollection->size() ) { &addToRefinementCollection( $resultsCollection, \%refinementHash, $seqDB, $batchNum, $stage++, $refineableHashRef ); &exciseRepeats( $resultsCollection, $outfile, $step, 1, \@BegRef, \@EndRef, \%excisedRepRef, $numX, $seqDB ); if ( $outfile =~ /alu0$/ ) { $cutAlus = 1; } else { $cutSome = 1; } $repBoundRef = &extractRepeatBoundaries( $resultsCollection ); &fragmentAnnotations( $resultsCollection, \%excisedRepRef, $numX ); &updateExcisedList( $repBoundRef, \%excisedRepRef ); if ( $options{'a'} ) { if ( $options{'inv'} ) { $resultsCollection->write( $outfile, SearchResult::AlignWithSubjSeq ); } else { $resultsCollection->write( $outfile, SearchResult::AlignWithQuerySeq ); } } else { $resultsCollection->write( $outfile, SearchResult::NoAlign ); } ++$step; } # if ( $resultsCollection->size()... if ( $cutAlus ) { # Any following full-length Alus only were # exposed after excising previous Alus. $minscore = 1200; $minmatch = &setminmatch( \%options, 7, 8, 10, 12 ); $lib = "$speciesLibDir/sinecutlib"; $matrix = "14p" . "$GC" . ".matrix"; ( $gap_initValue, $ins_gap_extValue, $del_gap_extValue ) = ( -35, -7, -6 ); $bandwidth = "20"; $masklevel = "-masklevel 1"; $raw = $wordraw = ""; $outfile = "$file.tmp.alu1"; ( $minmatch, $bandwidth, $resultsCollection ) = &search( \%options, $DIRECTORY, $outfile, $maskfile, $lib, $minmatch, $bandwidth, $matrix, $gap_initValue, $ins_gap_extValue, $del_gap_extValue, $minscore, $masklevel, $searchEngine, $wordraw, $raw ); &Choose( \%options, "alu", $fragCnt, $lib, $resultsCollection, $tax ); if ( $resultsCollection->size() ) { &addToRefinementCollection( $resultsCollection, \%refinementHash, $seqDB, $batchNum, $stage++, $refineableHashRef ); &exciseRepeats( $resultsCollection, $outfile, $step, 2, \@BegRef, \@EndRef, \%excisedRepRef, $numX, $seqDB ); if ( $outfile =~ /alu0$/ ) { $cutAlus = 1; } else { $cutSome = 1; } $repBoundRef = &extractRepeatBoundaries( $resultsCollection ); &fragmentAnnotations( $resultsCollection, \%excisedRepRef, $numX ); &updateExcisedList( $repBoundRef, \%excisedRepRef ); if ( $options{'a'} ) { if ( $options{'inv'} ) { $resultsCollection->write( $outfile, SearchResult::AlignWithSubjSeq ); } else { $resultsCollection->write( $outfile, SearchResult::AlignWithQuerySeq ); } } else { $resultsCollection->write( $outfile, SearchResult::NoAlign ); } ++$step; } # if ( $resultsCollection->size().. } # if ( $cutAlus ) } # else $options{'nocut'} if ( $options{'alu'} ) { ###### mask remaining Alus ##### # local values $minscore = 225; $minmatch = &setminmatch( \%options, 7, 8, 8, 9 ); $lib = "$speciesLibDir/sinecutlib"; $matrix = "20p" . "$GC" . ".matrix"; ( $gap_initValue, $ins_gap_extValue, $del_gap_extValue ) = ( -30, -6, -5 ); $bandwidth = "14"; $masklevel = "-masklevel 80"; $raw = $wordraw = ""; $outfile = "$file.tmp.alu4"; ( $minmatch, $bandwidth, $resultsCollection ) = &search( \%options, $DIRECTORY, $outfile, $maskfile, $lib, $minmatch, $bandwidth, $matrix, $gap_initValue, $ins_gap_extValue, $del_gap_extValue, $minscore, $masklevel, $searchEngine, $wordraw, $raw ); &Choose( \%options, "alumask", $fragCnt, $lib, $resultsCollection, $tax ); if ( $resultsCollection->size() ) { &addToRefinementCollection( $resultsCollection, \%refinementHash, $seqDB, $batchNum, $stage++, $refineableHashRef ); $fullymasked = &maskRepeats( $resultsCollection, \%options, $seqDB ); &adjustAnnotationCoordinates( $resultsCollection, 5, \@BegRef, \@EndRef, $numX ); &fragmentAnnotations( $resultsCollection, \%excisedRepRef, $numX ); if ( $options{'a'} ) { if ( $options{'inv'} ) { $resultsCollection->write( $outfile, SearchResult::AlignWithSubjSeq ); } else { $resultsCollection->write( $outfile, SearchResult::AlignWithQuerySeq ); } } else { $resultsCollection->write( $outfile, SearchResult::NoAlign ); } } last MASKREPEATS; } # if ( $options{'alu'} ) ## ## Rodents ## } elsif ( -s "$speciesLibDir/sinecutlib" || -s "$speciesLibDir/sinecutlib.bsq" || -s "$speciesLibDir/sinecutlib.xps" ) { print "identifying young abundant SINEs"; if ( $batchIdentifierText ne "" ) { print " in " . $batchIdentifierText . "\n"; } else { print "\n"; } $minscore = 225; $minmatch = &setminmatch( \%options, 6, 7, 8, 10 ); $lib = "$speciesLibDir/sinecutlib"; $matrix = "18p" . "$GC" . ".matrix"; ( $gap_initValue, $ins_gap_extValue, $del_gap_extValue ) = ( -30, -6, -5 ); $bandwidth = "14"; $masklevel = "-masklevel 1"; $raw = $wordraw = ""; $outfile = "$file.tmp.alu1"; ( $minmatch, $bandwidth, $resultsCollection ) = &search( \%options, $DIRECTORY, $outfile, $maskfile, $lib, $minmatch, $bandwidth, $matrix, $gap_initValue, $ins_gap_extValue, $del_gap_extValue, $minscore, $masklevel, $searchEngine, $wordraw, $raw ); &Choose( \%options, "cut1", $fragCnt, $lib, $resultsCollection, $tax ); if ( $resultsCollection->size() ) { &addToRefinementCollection( $resultsCollection, \%refinementHash, $seqDB, $batchNum, $stage++, $refineableHashRef ); &exciseRepeats( $resultsCollection, $outfile, $step, 1, \@BegRef, \@EndRef, \%excisedRepRef, $numX, $seqDB ); if ( $outfile =~ /alu0$/ ) { $cutAlus = 1; } else { $cutSome = 1; } $repBoundRef = &extractRepeatBoundaries( $resultsCollection ); &fragmentAnnotations( $resultsCollection, \%excisedRepRef, $numX ); &updateExcisedList( $repBoundRef, \%excisedRepRef ); if ( $options{'a'} ) { if ( $options{'inv'} ) { $resultsCollection->write( $outfile, SearchResult::AlignWithSubjSeq ); } else { $resultsCollection->write( $outfile, SearchResult::AlignWithQuerySeq ); } } else { $resultsCollection->write( $outfile, SearchResult::NoAlign ); } ++$step; } } if ( -s "$speciesLibDir/shortcutlib" || -s "$speciesLibDir/shortcutlib.bsq" || -s "$speciesLibDir/shortcutlib.xps" ) { ###### excise all short full-length elements ###### print "identifying full-length interspersed repeats"; if ( $batchIdentifierText ne "" ) { print " in " . $batchIdentifierText . "\n"; } else { print "\n"; } $minscore = 300; $minmatch = &setminmatch( \%options, 9, 10, 11, 13 ); $lib = "$speciesLibDir/shortcutlib"; $matrix = "18p" . "$GC" . ".matrix"; ( $gap_initValue, $ins_gap_extValue, $del_gap_extValue ) = ( -33, -5, -4 ); # with such big wordlength, try to span larger gaps $bandwidth = "40"; # high masklevel is a problem sometimes: used to be 50 $masklevel = "-masklevel 1"; $raw = $wordraw = ""; $outfile = "$file.tmp.cut1"; ( $minmatch, $bandwidth, $resultsCollection ) = &search( \%options, $DIRECTORY, $outfile, $maskfile, $lib, $minmatch, $bandwidth, $matrix, $gap_initValue, $ins_gap_extValue, $del_gap_extValue, $minscore, $masklevel, $searchEngine, $wordraw, $raw ); &Choose( \%options, "cut1", $fragCnt, $lib, $resultsCollection, $tax ); if ( $resultsCollection->size() ) { &addToRefinementCollection( $resultsCollection, \%refinementHash, $seqDB, $batchNum, $stage++, $refineableHashRef ); &exciseRepeats( $resultsCollection, $outfile, $step, 3, \@BegRef, \@EndRef, \%excisedRepRef, $numX, $seqDB ); if ( $outfile =~ /alu0$/ ) { $cutAlus = 1; } else { $cutSome = 1; } $repBoundRef = &extractRepeatBoundaries( $resultsCollection ); &fragmentAnnotations( $resultsCollection, \%excisedRepRef, $numX ); &updateExcisedList( $repBoundRef, \%excisedRepRef ); if ( $options{'a'} ) { if ( $options{'inv'} ) { $resultsCollection->write( $outfile, SearchResult::AlignWithSubjSeq ); } else { $resultsCollection->write( $outfile, SearchResult::AlignWithQuerySeq ); } } else { $resultsCollection->write( $outfile, SearchResult::NoAlign ); } ++$step; } # if ( $resultsCollection->Size().. if ( -s "$speciesLibDir/cutlib" || -s "$speciesLibDir/cutlib.bsq" || -s "$speciesLibDir/cutlib.xps" ) { # cut out complete 3' ends of young L1 # elements (same parameters) $lib = "$speciesLibDir/cutlib"; $outfile = "$file.tmp.cut2"; $masklevel = "-masklevel 90"; ( $minmatch, $bandwidth, $resultsCollection ) = &search( \%options, $DIRECTORY, $outfile, $maskfile, $lib, $minmatch, $bandwidth, $matrix, $gap_initValue, $ins_gap_extValue, $del_gap_extValue, $minscore, $masklevel, $searchEngine, $wordraw, $raw ); &Choose( \%options, "cut2", $fragCnt, $lib, $resultsCollection, $tax ); if ( $resultsCollection->size() ) { &addToRefinementCollection( $resultsCollection, \%refinementHash, $seqDB, $batchNum, $stage++, $refineableHashRef ); &exciseRepeats( $resultsCollection, $outfile, $step, 4, \@BegRef, \@EndRef, \%excisedRepRef, $numX, $seqDB ); if ( $outfile =~ /alu0$/ ) { $cutAlus = 1; } else { $cutSome = 1; } $repBoundRef = &extractRepeatBoundaries( $resultsCollection ); &fragmentAnnotations( $resultsCollection, \%excisedRepRef, $numX ); &updateExcisedList( $repBoundRef, \%excisedRepRef ); if ( $options{'a'} ) { if ( $options{'inv'} ) { $resultsCollection->write( $outfile, SearchResult::AlignWithSubjSeq ); } else { $resultsCollection->write( $outfile, SearchResult::AlignWithQuerySeq ); } } else { $resultsCollection->write( $outfile, SearchResult::NoAlign ); } ++$step; } # if ( $resultsCollection->size().. } # if ( -s $cutlib ) } # if ( $shortcublib if ( $options{'cut'} && !$fragCnt ) { my $cutfile = $file . ".cut"; copy( $maskfile, $cutfile ); } ## ## primate? ## if ( $tax->isA( $options{'species'}, "primates" ) ) { ###### mask more alus ##### print "identifying remaining ALUs"; if ( $batchIdentifierText ne "" ) { print " in " . $batchIdentifierText . "\n"; } else { print "\n"; } $minscore = 800; $minmatch = &setminmatch( \%options, 7, 8, 10, 12 ); $lib = "$speciesLibDir/sinecutlib"; $matrix = "14p" . "$GC" . ".matrix"; ( $gap_initValue, $ins_gap_extValue, $del_gap_extValue ) = ( -35, -7, -6 ); $bandwidth = "20"; $masklevel = "-masklevel 1"; $raw = $wordraw = ""; $outfile = "$file.tmp.alu2"; ( $minmatch, $bandwidth, $resultsCollection ) = &search( \%options, $DIRECTORY, $outfile, $maskfile, $lib, $minmatch, $bandwidth, $matrix, $gap_initValue, $ins_gap_extValue, $del_gap_extValue, $minscore, $masklevel, $searchEngine, $wordraw, $raw ); &Choose( \%options, "alumask", $fragCnt, $lib, $resultsCollection, $tax ); if ( $resultsCollection->size() ) { &addToRefinementCollection( $resultsCollection, \%refinementHash, $seqDB, $batchNum, $stage++, $refineableHashRef ); $fullymasked = &maskRepeats( $resultsCollection, \%options, $seqDB ); &adjustAnnotationCoordinates( $resultsCollection, 5, \@BegRef, \@EndRef, $numX ); &fragmentAnnotations( $resultsCollection, \%excisedRepRef, $numX ); if ( $options{'a'} ) { if ( $options{'inv'} ) { $resultsCollection->write( $outfile, SearchResult::AlignWithSubjSeq ); } else { $resultsCollection->write( $outfile, SearchResult::AlignWithQuerySeq ); } } else { $resultsCollection->write( $outfile, SearchResult::NoAlign ); } } last MASKREPEATS if $fullymasked; ###### mask even more alus ##### # local values $minscore = 400; $minmatch = &setminmatch( \%options, 7, 8, 9, 11 ); $lib = "$speciesLibDir/sinecutlib"; $matrix = "18p" . "$GC" . ".matrix"; ( $gap_initValue, $ins_gap_extValue, $del_gap_extValue ) = ( -30, -6, -5 ); $bandwidth = "14"; $masklevel = "-masklevel 10"; $raw = $wordraw = ""; $outfile = "$file.tmp.alu3"; ( $minmatch, $bandwidth, $resultsCollection ) = &search( \%options, $DIRECTORY, $outfile, $maskfile, $lib, $minmatch, $bandwidth, $matrix, $gap_initValue, $ins_gap_extValue, $del_gap_extValue, $minscore, $masklevel, $searchEngine, $wordraw, $raw ); &Choose( \%options, "alumask", $fragCnt, $lib, $resultsCollection, $tax ); if ( $resultsCollection->size() ) { &addToRefinementCollection( $resultsCollection, \%refinementHash, $seqDB, $batchNum, $stage++, $refineableHashRef ); $fullymasked = &maskRepeats( $resultsCollection, \%options, $seqDB ); &adjustAnnotationCoordinates( $resultsCollection, 5, \@BegRef, \@EndRef, $numX ); &fragmentAnnotations( $resultsCollection, \%excisedRepRef, $numX ); if ( $options{'a'} ) { if ( $options{'inv'} ) { $resultsCollection->write( $outfile, SearchResult::AlignWithSubjSeq ); } else { $resultsCollection->write( $outfile, SearchResult::AlignWithQuerySeq ); } } else { $resultsCollection->write( $outfile, SearchResult::NoAlign ); } } last MASKREPEATS if $fullymasked; } ###### mask the remaining other short repeats and satellites##### if ( -s "$speciesLibDir/shortlib" || -s "$speciesLibDir/shortlib.bsq" || -s "$speciesLibDir/shortlib.xps" ) { print "identifying most interspersed repeats"; if ( $batchIdentifierText ne "" ) { print " in " . $batchIdentifierText . "\n"; } else { print "\n"; } ## DEBUG # print "Saving a copy of the file /tmp/saveme-$batchNum\n"; # system("cp $maskfile /tmp/saveme-$batchNum"); $lib = "$speciesLibDir/shortlib"; if ( $tax->isA( $options{'species'}, "rodentia" ) ) { $minscore = 210; $minmatch = &setminmatch( \%options, 7, 8, 9, 10 ); # high divergence level the rule $matrix = "25p" . "$GC" . ".matrix"; ( $gap_initValue, $ins_gap_extValue, $del_gap_extValue ) = ( -27, -5, -5 ); } else { $minscore = 225; $minmatch = &setminmatch( \%options, 7, 8, 10, 12 ); $matrix = "20p" . "$GC" . ".matrix"; ( $gap_initValue, $ins_gap_extValue, $del_gap_extValue ) = ( -30, -6, -5 ); } $bandwidth = "14"; $masklevel = "-masklevel 90"; $raw = $wordraw = ""; $outfile = "$file.tmp.sines"; ( $minmatch, $bandwidth, $resultsCollection ) = &search( \%options, $DIRECTORY, $outfile, $maskfile, $lib, $minmatch, $bandwidth, $matrix, $gap_initValue, $ins_gap_extValue, $del_gap_extValue, $minscore, $masklevel, $searchEngine, $wordraw, $raw ); &Choose( \%options, "sines", $fragCnt, $lib, $resultsCollection, $tax ); if ( $resultsCollection->size() ) { &addToRefinementCollection( $resultsCollection, \%refinementHash, $seqDB, $batchNum, $stage++, $refineableHashRef ); $fullymasked = &maskRepeats( $resultsCollection, \%options, $seqDB ); &adjustAnnotationCoordinates( $resultsCollection, 5, \@BegRef, \@EndRef, $numX ); &fragmentAnnotations( $resultsCollection, \%excisedRepRef, $numX ); if ( $options{'a'} ) { if ( $options{'inv'} ) { $resultsCollection->write( $outfile, SearchResult::AlignWithSubjSeq ); } else { $resultsCollection->write( $outfile, SearchResult::AlignWithQuerySeq ); } } else { $resultsCollection->write( $outfile, SearchResult::NoAlign ); } } last MASKREPEATS if $fullymasked; } # if ( -s $shortlib ) if ( -s "$speciesLibDir/longlib" || -s "$speciesLibDir/longlib.bsq" || -s "$speciesLibDir/longlib.xps" ) { # currently long and short together for # non-primate/non-rodent mammals ##### mask longer rep seqs ##### print "identifying long interspersed repeats"; if ( $batchIdentifierText ne "" ) { print " in " . $batchIdentifierText . "\n"; } else { print "\n"; } $minscore = 225; $minmatch = &setminmatch( \%options, 7, 8, 10, 12 ); $lib = "$speciesLibDir/longlib"; $matrix = "20p" . "$GC" . ".matrix"; ( $gap_initValue, $ins_gap_extValue, $del_gap_extValue ) = ( -30, -6, -5 ); $bandwidth = "14"; ## I could run bodies and UTRs separately, ensuring ## better coverage at overlapping regions ## and allowing to set masklevel to 80 or so for UTRs, ## so that oddment extensions are avoided $masklevel = "-masklevel 90"; $raw = $wordraw = ""; $outfile = "$file.tmp.reps"; ( $minmatch, $bandwidth, $resultsCollection ) = &search( \%options, $DIRECTORY, $outfile, $maskfile, $lib, $minmatch, $bandwidth, $matrix, $gap_initValue, $ins_gap_extValue, $del_gap_extValue, $minscore, $masklevel, $searchEngine, $wordraw, $raw ); &Choose( \%options, "longlib", $fragCnt, $lib, $resultsCollection, $tax ); if ( $resultsCollection->size() ) { &addToRefinementCollection( $resultsCollection, \%refinementHash, $seqDB, $batchNum, $stage++, $refineableHashRef ); $fullymasked = &maskRepeats( $resultsCollection, \%options, $seqDB ); &adjustAnnotationCoordinates( $resultsCollection, 5, \@BegRef, \@EndRef, $numX ); &fragmentAnnotations( $resultsCollection, \%excisedRepRef, $numX ); if ( $options{'a'} ) { if ( $options{'inv'} ) { $resultsCollection->write( $outfile, SearchResult::AlignWithSubjSeq ); } else { $resultsCollection->write( $outfile, SearchResult::AlignWithQuerySeq ); } } else { $resultsCollection->write( $outfile, SearchResult::NoAlign ); } } last MASKREPEATS if $fullymasked; } if ( -s "$speciesLibDir/mirslib" || -s "$speciesLibDir/mirslib.bsq" || -s "$speciesLibDir/mirslib.xps" ) { ##### mask MIRs ##### print "identifying ancient repeats"; if ( $batchIdentifierText ne "" ) { print " in " . $batchIdentifierText . "\n"; } else { print "\n"; } $lib = "$speciesLibDir/mirslib"; $matrix = "25p" . "$GC" . ".matrix"; $minscore = 180; # Change threshold for DeCypher $minscore = 170 if ( $searchEngine->isa( "DeCypherSearchEngine" ) ); $minmatch = &setminmatch( \%options, 6, 6, 8, 9 ); ( $gap_initValue, $ins_gap_extValue, $del_gap_extValue ) = ( -27, -6, -5 ); if ( $options{'s'} ) { $bandwidth = "25"; } else { $bandwidth = "14"; } $masklevel = "-masklevel 90"; $raw = $wordraw = ""; $outfile = "$file.tmp.mirs"; ( $minmatch, $bandwidth, $resultsCollection ) = &search( \%options, $DIRECTORY, $outfile, $maskfile, $lib, $minmatch, $bandwidth, $matrix, $gap_initValue, $ins_gap_extValue, $del_gap_extValue, $minscore, $masklevel, $searchEngine, $wordraw, $raw ); &Choose( \%options, "mirs", $fragCnt, $lib, $resultsCollection, $tax ); if ( $resultsCollection->size() ) { &addToRefinementCollection( $resultsCollection, \%refinementHash, $seqDB, $batchNum, $stage++, $refineableHashRef ); $fullymasked = &maskRepeats( $resultsCollection, \%options, $seqDB ); &adjustAnnotationCoordinates( $resultsCollection, 5, \@BegRef, \@EndRef, $numX ); &fragmentAnnotations( $resultsCollection, \%excisedRepRef, $numX ); if ( $options{'a'} ) { if ( $options{'inv'} ) { $resultsCollection->write( $outfile, SearchResult::AlignWithSubjSeq ); } else { $resultsCollection->write( $outfile, SearchResult::AlignWithQuerySeq ); } } else { $resultsCollection->write( $outfile, SearchResult::NoAlign ); } } last MASKREPEATS if $fullymasked; } # if ( -s "$speciesLibDir/mirslib.... if ( -s "$speciesLibDir/mirlib" || -s "$speciesLibDir/mirlib.bsq" || -s "$speciesLibDir/mirlib.xps" ) { ##### mask MIRs ##### # local values $lib = "$speciesLibDir/mirlib"; $matrix = "25p" . "$GC" . ".matrix"; $minscore = 250; # Change threshold for DeCypher $minscore = 250 if ( $searchEngine->isa( "DeCypherSearchEngine" ) ); $minmatch = &setminmatch( \%options, 6, 6, 8, 9 ); ( $gap_initValue, $ins_gap_extValue, $del_gap_extValue ) = ( -27, -6, -5 ); if ( $options{'s'} ) { $bandwidth = "25"; } else { $bandwidth = "14"; } $masklevel = "-masklevel 90"; $raw = "-raw"; $wordraw = ""; $outfile = "$file.tmp.mir"; ( $minmatch, $bandwidth, $resultsCollection ) = &search( \%options, $DIRECTORY, $outfile, $maskfile, $lib, $minmatch, $bandwidth, $matrix, $gap_initValue, $ins_gap_extValue, $del_gap_extValue, $minscore, $masklevel, $searchEngine, $wordraw, $raw ); if ( $options{'div'} ) { &Choose( \%options, "masking", $fragCnt, $lib, $resultsCollection, $tax ); } &addToRefinementCollection( $resultsCollection, \%refinementHash, $seqDB, $batchNum, $stage++, $refineableHashRef ); $fullymasked = &maskRepeats( $resultsCollection, \%options, $seqDB ); &adjustAnnotationCoordinates( $resultsCollection, 5, \@BegRef, \@EndRef, $numX ); &fragmentAnnotations( $resultsCollection, \%excisedRepRef, $numX ); ## TODO: Consider adding a routine here to clip element past the overlap boundry if ( $options{'a'} ) { if ( $options{'inv'} ) { $resultsCollection->write( $outfile, SearchResult::AlignWithSubjSeq ); } else { $resultsCollection->write( $outfile, SearchResult::AlignWithQuerySeq ); } } else { $resultsCollection->write( $outfile, SearchResult::NoAlign ); } last MASKREPEATS if $fullymasked; } # if ( -s "$speciesLibDir/mirlib... if ( -s "$speciesLibDir/retrolib" || -s "$speciesLibDir/retrolib.bsq" || -s "$speciesLibDir/retrolib.xps" ) { ##### mask retroviral internal sequences ##### print "identifying retrovirus-like sequences"; if ( $batchIdentifierText ne "" ) { print " in " . $batchIdentifierText . "\n"; } else { print "\n"; } $lib = "$speciesLibDir/retrolib"; $matrix = "20p" . "$GC" . ".matrix"; $minscore = 250; $minmatch = &setminmatch( \%options, 9, 10, 11, 13 ); ( $gap_initValue, $ins_gap_extValue, $del_gap_extValue ) = ( -30, -6, -5 ); $bandwidth = "14"; $masklevel = "-masklevel 90"; $raw = $wordraw = ""; $outfile = "$file.tmp.retro"; ( $minmatch, $bandwidth, $resultsCollection ) = &search( \%options, $DIRECTORY, $outfile, $maskfile, $lib, $minmatch, $bandwidth, $matrix, $gap_initValue, $ins_gap_extValue, $del_gap_extValue, $minscore, $masklevel, $searchEngine, $wordraw, $raw ); &Choose( \%options, "masking", $fragCnt, $lib, $resultsCollection, $tax ); &addToRefinementCollection( $resultsCollection, \%refinementHash, $seqDB, $batchNum, $stage++, $refineableHashRef ); $fullymasked = &maskRepeats( $resultsCollection, \%options, $seqDB ); &adjustAnnotationCoordinates( $resultsCollection, 5, \@BegRef, \@EndRef, $numX ); &fragmentAnnotations( $resultsCollection, \%excisedRepRef, $numX ); if ( $options{'a'} ) { if ( $options{'inv'} ) { $resultsCollection->write( $outfile, SearchResult::AlignWithSubjSeq ); } else { $resultsCollection->write( $outfile, SearchResult::AlignWithQuerySeq ); } } else { $resultsCollection->write( $outfile, SearchResult::NoAlign ); } last MASKREPEATS if $fullymasked; } if ( $tax->isA( $options{'species'}, "eutheria" ) ) { # these LINEs are not scanned in marsupials; perhaps will # find LINEs to put in later ##### mask undetected LINE1 bodies ##### print "identifying tough LINE1s"; if ( $batchIdentifierText ne "" ) { print " in " . $batchIdentifierText . "\n"; } else { print "\n"; } $lib = "$generalLibDir/l1.lib"; $matrix = "25p" . "$GC" . ".matrix"; # changed in 20020713 version as the extreme matrices # gave false positives with -raw $matrix = "25p49g.matrix" if $GC_frac > 49; $minscore = 300; $minmatch = &setminmatch( \%options, 7, 8, 9, 11 ); ( $gap_initValue, $ins_gap_extValue, $del_gap_extValue ) = ( -27, -6, -5 ); $bandwidth = "14"; $masklevel = "-masklevel 90"; $raw = "-raw"; $wordraw = ""; $outfile = "$file.tmp.l1"; ( $minmatch, $bandwidth, $resultsCollection ) = &search( \%options, $DIRECTORY, $outfile, $maskfile, $lib, $minmatch, $bandwidth, $matrix, $gap_initValue, $ins_gap_extValue, $del_gap_extValue, $minscore, $masklevel, $searchEngine, $wordraw, $raw ); &Choose( \%options, "l1", $fragCnt, $lib, $resultsCollection, $tax ); if ( $resultsCollection->size() ) { &addToRefinementCollection( $resultsCollection, \%refinementHash, $seqDB, $batchNum, $stage++, $refineableHashRef ); $fullymasked = &maskRepeats( $resultsCollection, \%options, $seqDB ); &adjustAnnotationCoordinates( $resultsCollection, 5, \@BegRef, \@EndRef, $numX ); &fragmentAnnotations( $resultsCollection, \%excisedRepRef, $numX ); if ( $options{'a'} ) { if ( $options{'inv'} ) { $resultsCollection->write( $outfile, SearchResult::AlignWithSubjSeq ); } else { $resultsCollection->write( $outfile, SearchResult::AlignWithQuerySeq ); } } else { $resultsCollection->write( $outfile, SearchResult::NoAlign ); } } # if ( $resultCollections->size().. } # if eutheria } # if ( -s $shortlib last MASKREPEATS if $fullymasked; } # unless only low complexity sequences are masked unless ( $options{'low'} ) { ##### mask the simple repeats ##### print "identifying more simple repeats"; if ( $batchIdentifierText ne "" ) { print " in " . $batchIdentifierText . "\n"; } else { print "\n"; } $lib = "$generalLibDir/simple.lib"; $matrix = "simple1.matrix"; $minscore = 180; # Change threshold for DeCypher $minscore = 200 if ( $searchEngine->isa( "DeCypherSearchEngine" ) ); $minmatch = &setminmatch( \%options, 7, 8, 9, 10 ); ( $gap_initValue, $ins_gap_extValue, $del_gap_extValue ) = ( -40, -15, -15 ); $bandwidth = 4; $masklevel = "-masklevel 25"; $raw = "-raw"; $wordraw = "-word_raw"; $outfile = "$file.tmp.simple2"; ( $minmatch, $bandwidth, $resultsCollection ) = &search( \%options, $DIRECTORY, $outfile, $maskfile, $lib, $minmatch, $bandwidth, $matrix, $gap_initValue, $ins_gap_extValue, $del_gap_extValue, $minscore, $masklevel, $searchEngine, $wordraw, $raw ); if ( $options{'div'} ) { &Choose( \%options, "masking", $fragCnt, $lib, $resultsCollection, $tax ); } &addToRefinementCollection( $resultsCollection, \%refinementHash, $seqDB, $batchNum, $stage++, $refineableHashRef ); $fullymasked = &maskRepeats( $resultsCollection, \%options, $seqDB ); &adjustAnnotationCoordinates( $resultsCollection, 5, \@BegRef, \@EndRef, $numX ); &fragmentAnnotations( $resultsCollection, \%excisedRepRef, $numX ); if ( $options{'a'} ) { if ( $options{'inv'} ) { $resultsCollection->write( $outfile, SearchResult::AlignWithSubjSeq ); } else { $resultsCollection->write( $outfile, SearchResult::AlignWithQuerySeq ); } } else { $resultsCollection->write( $outfile, SearchResult::NoAlign ); } ##### mask simple repeats flanking those previously masked ##### # local values $lib = "$generalLibDir/simple.lib"; $matrix = "simple.matrix"; $minscore = 200; $minmatch = &setminmatch( \%options, 6, 7, 8, 9 ); ( $gap_initValue, $ins_gap_extValue, $del_gap_extValue ) = ( -35, -10, -10 ); $bandwidth = 4; $masklevel = "-masklevel 75"; $raw = "-raw"; $wordraw = "-word_raw"; $outfile = "$file.tmp.simple3"; ( $minmatch, $bandwidth, $resultsCollection ) = &search( \%options, $DIRECTORY, $outfile, $maskfile, $lib, $minmatch, $bandwidth, $matrix, $gap_initValue, $ins_gap_extValue, $del_gap_extValue, $minscore, $masklevel, $searchEngine, $wordraw, $raw ); if ( $options{'div'} ) { &Choose( \%options, "masking", $fragCnt, $lib, $resultsCollection, $tax ); } &addToRefinementCollection( $resultsCollection, \%refinementHash, $seqDB, $batchNum, $stage++, $refineableHashRef ); $fullymasked = &maskRepeats( $resultsCollection, \%options, $seqDB ); &adjustAnnotationCoordinates( $resultsCollection, 5, \@BegRef, \@EndRef, $numX ); &fragmentAnnotations( $resultsCollection, \%excisedRepRef, $numX ); if ( $options{'a'} ) { if ( $options{'inv'} ) { $resultsCollection->write( $outfile, SearchResult::AlignWithSubjSeq ); } else { $resultsCollection->write( $outfile, SearchResult::AlignWithQuerySeq ); } } else { $resultsCollection->write( $outfile, SearchResult::NoAlign ); } ##### mask polyAT or polyGC regions ##### print "identifying low complexity regions"; if ( $batchIdentifierText ne "" ) { print " in " . $batchIdentifierText . "\n"; } else { print "\n"; } $lib = "$generalLibDir/at.lib"; $matrix = "at.matrix"; $minmatch = 5; $minscore = 21; ( $gap_initValue, $ins_gap_extValue, $del_gap_extValue ) = ( -10, -3, -3 ); $bandwidth = "2"; $masklevel = "-masklevel 95"; $raw = "-raw"; $wordraw = "-word_raw"; $outfile = "$file.tmp.polyNs"; ( $minmatch, $bandwidth, $resultsCollection ) = &search( \%options, $DIRECTORY, $outfile, $maskfile, $lib, $minmatch, $bandwidth, $matrix, $gap_initValue, $ins_gap_extValue, $del_gap_extValue, $minscore, $masklevel, $searchEngine, $wordraw, $raw ); if ( $options{'div'} ) { &Choose( \%options, "masking", $fragCnt, $lib, $resultsCollection, $tax ); } &addToRefinementCollection( $resultsCollection, \%refinementHash, $seqDB, $batchNum, $stage++, $refineableHashRef ); $fullymasked = &maskRepeats( $resultsCollection, \%options, $seqDB ); &adjustAnnotationCoordinates( $resultsCollection, 5, \@BegRef, \@EndRef, $numX ); &fragmentAnnotations( $resultsCollection, \%excisedRepRef, $numX ); if ( $options{'a'} ) { if ( $options{'inv'} ) { $resultsCollection->write( $outfile, SearchResult::AlignWithSubjSeq ); } else { $resultsCollection->write( $outfile, SearchResult::AlignWithQuerySeq ); } } else { $resultsCollection->write( $outfile, SearchResult::NoAlign ); } } # unless low complexity masking is skipped # # Refine elements if necessary # if ( keys( %refinementHash ) && -s "$speciesLibDir/refinelib" ) { # The design goal is to have each repeat class have it's own refinement # library. For now we just search against the same library. foreach my $refinementClass ( keys( %refinementHash ) ) { print "refining $refinementClass elements"; if ( $batchIdentifierText ne "" ) { print " in " . $batchIdentifierText . "\n"; } else { print "\n"; } open OUT, ">$file.unRefinedEles.fa"; my $index = 0; foreach my $ele ( @{ $refinementHash{$refinementClass} } ) { print OUT ">" . $ele->{'seed'}->getQueryName() . ":" . $ele->{'seed'}->getQueryStart() . "-" . $ele->{'seed'}->getQueryEnd() . "-" . $ele->{'seed'}->getOverlap . " " . $ele->{'seed'}->getSubjName() . "\n"; $index++; my $seq = $ele->{'seq'}; if ( $seq eq "" ) { $seq = $ele->{'seed'}->getQueryString(); $seq =~ s/-//g; } $seq =~ s/(\S{50})/$1\n/g; $seq .= "\n" unless ( $seq =~ /.*\n+$/s ); print OUT $seq; } close OUT; # At some point we may make each repeat class have it's own refinment lib $lib = "$speciesLibDir/refinelib"; $minscore = 180; $bandwidth = 40; $masklevel = "-masklevel 101"; $raw = "-raw"; $wordraw = "-word_raw"; $minmatch = &setminmatch( \%options, 7, 8, 9, 11 ); $matrix = "18p" . "$GC" . ".matrix"; ( $gap_initValue, $ins_gap_extValue, $del_gap_extValue ) = ( -30, -6, -5 ); $maskfile = "$file.unRefinedEles.fa"; $outfile = "$file.tmp.ref"; ( $minmatch, $bandwidth, $resultsCollection ) = &search( \%options, $DIRECTORY, $outfile, $maskfile, $lib, $minmatch, $bandwidth, $matrix, $gap_initValue, $ins_gap_extValue, $del_gap_extValue, $minscore, $masklevel, $searchEngine, $wordraw, $raw ); ## Adjust positions for ( my $i = 0 ; $i < $resultsCollection->size() ; $i++ ) { my $current = $resultsCollection->get( $i ); my $seqID = $current->getQueryName(); if ( $seqID =~ /(.*):(\d+)-(\d+)-(b\d+s\d+i\d+)$/ ) { my $origID = $1; my $start = $2; my $end = $3; my $parentID = $4; $current->setQueryName( $origID ); $current->setQueryStart( $current->getQueryStart() + $start - 1 ); # Why was this changed? #$current->setQueryEnd( $end - $current->getQueryRemaining() ); $current->setQueryEnd( $current->getQueryEnd() + $start - 1 ); $current->setQueryRemaining( 0 ); $current->setOverlap( "[$parentID]" ); } } #$resultsCollection->write( $outfile, SearchResult::NoAlign ); $resultsCollection->write( $outfile, SearchResult::AlignWithQuerySeq ); if ( <$file.tmp.ref> ) { systemint( "cat $outfile >> $file.ref" ); } unlink( $outfile ); unlink( $maskfile ); unlink( "$maskfile.log" ); #print "DONE.\n"; } } # End IN DEVELOPMENT } # end of MASKREPEAT block if ( <$file.tmp.*> ) { systemint( "cat $file.tmp.* > $file.cat" ); } return ( $cutAlus, $cutSome ); } ##-------------------------------------------------------------------------## ## Use: my $minMatch = &setminmatch( \%options, $1, $2, $3, $4 ); ## ## ## Returns ## ## Globals Used: None ## Globals Modified: None ## ##-------------------------------------------------------------------------##� sub setminmatch { my %options = %{ shift() }; if ( $options{'qq'} ) { return $_[ 3 ]; } elsif ( $options{'q'} ) { return $_[ 2 ]; } elsif ( $options{'s'} ) { return $_[ 0 ]; } else { return $_[ 1 ]; } } ##-------------------------------------------------------------------------## ## Use: my $matrix = &choosematrices( $GC_frac ); ## ## ## Returns ## ## Globals Used: None ##-------------------------------------------------------------------------##� sub choosematrices { my $GC_frac = shift; my $matrix = ""; if ( $GC_frac < 36 ) { $matrix = "35g"; } elsif ( $GC_frac <= 38 ) { $matrix = "37g"; } elsif ( $GC_frac <= 40 ) { $matrix = "39g"; } elsif ( $GC_frac < 42 ) { $matrix = "41g"; } elsif ( $GC_frac <= 44 ) { $matrix = "43g"; } elsif ( $GC_frac <= 46 ) { $matrix = "45g"; } elsif ( $GC_frac <= 48 ) { $matrix = "47g"; } elsif ( $GC_frac < 50 ) { $matrix = "49g"; } elsif ( $GC_frac <= 52 ) { $matrix = "51g"; } else { $matrix = "53g"; } return ( $matrix ); } ##-------------------------------------------------------------------------## ## Use: my $fullyMasked = &maskRepeats( $searchResults, \%optionsRef, $seqDB ); ## ## Returns ## ## Appears to replace hits in the out file as X's in the ## sequence file. ## ## Returns true (1) if the sequence was fully masked ## false (0) otherwise. ## ## Globals Used: None ## Globals Set: None ## ##-------------------------------------------------------------------------##� sub maskRepeats { my $searchResults = shift; my $optionsRef = shift; my $seqDB = shift; my $totalMasked = 0; my $fullyMasked = 0; my $name = ""; my $seq = ""; my $XedSeq = ""; my %seqwithname = (); my ( $segBeginRef, $segEndRef ) = &Get_segments( $searchResults ); if ( %$segBeginRef ) { my @seqIDs = $seqDB->getIDs(); foreach my $seqID ( @seqIDs ) { my $beginRef = $segBeginRef->{$seqID}; my $endRef = $segEndRef->{$seqID}; if ( $beginRef ) { for ( my $i = $#$beginRef ; $i > -1 ; $i-- ) { my $len = $$endRef[ $i ] - $$beginRef[ $i ] + 1; $seqDB->setSubstr( $seqID, $$beginRef[ $i ] - 1, $len, 'X' x $len ); # Test if sequence is now fully masked $totalMasked++ if ( $seqDB->getSubtLength( $seqID ) == 0 ); } # for ( my $i.. } # if ( $beginRef.. } # foreach my $seqID.. } # if ( defined %$segBegRef.. $fullyMasked = 1 if ( $fullyMasked == $seqDB->getSeqCount() ); return $fullyMasked; } ##-------------------------------------------------------------------------## ## Use: resizeMaskedSeq( \@infoFiles, $numX, $seqDB ); ## ## ## Returns ## ## Reinserts the excised sequences in the masked file (as polyX) ## ## Globals Used: None ##-------------------------------------------------------------------------##� sub resizeMaskedSeq { my $infoFilesRef = shift; my $numX = shift; my $seqDB = shift; my ( $name, $seq ); foreach my $infofile ( @{$infoFilesRef} ) { if ( -e $infofile ) { # TODO....think about this method of parsing. my $resultsCollection = CrossmatchSearchEngine::parseOutput( searchOutput => $infofile ); my ( $segBeginRef, $segEndRef ) = &Get_segments( $resultsCollection ); undef $resultsCollection; if ( %$segBeginRef ) { my @seqIDs = $seqDB->getIDs(); foreach my $seqID ( @seqIDs ) { my $beginRef = $segBeginRef->{$seqID}; my $endRef = $segEndRef->{$seqID}; if ( $beginRef ) { for ( my $i = 0 ; $i <= $#$beginRef ; $i++ ) { my $len = $$endRef[ $i ] - $$beginRef[ $i ] + 1; # TODO: Check $numX...use here...that is how Arian did it! $seqDB->setSubstr( $seqID, $$beginRef[ $i ] - 1, $numX, 'X' x $len ); } # for ( my $i.. } # if ( $beginRef.. } # foreach my $seqID.. } # if ( defined... } } } ##-------------------------------------------------------------------------## ## Use: my addToRefinementCollection( $resultsCollection ); ## ## IN DEVELOPMENT. This routine decides if a result should be ## be refined further ( by another search against a refinement library ). ## ## $refinementHash = { 'SINE/Alu' => [ { 'seed' => SearchResult, ## 'seq' => '' }, { .. } ] } ## ## Returns ## ## Globals Used: None ## ##-------------------------------------------------------------------------## sub addToRefinementCollection { my $resultsCollection = shift; my $refinementHash = shift; my $seqDB = shift; my $batchNum = shift; my $stage = shift; my $refineableHashRef = shift; ## Go through elements and add to refinement hash with the ## class/subtype as the key if the element is enabled for further ## refinement. for ( my $i = 0 ; $i < $resultsCollection->size() ; $i++ ) { my $current = $resultsCollection->get( $i ); if ( &isRefineable( $current, $refineableHashRef ) ) { # Get element fields my $name = $current->getSubjName(); my $seqID = $current->getQueryName(); my $qStart = $current->getQueryStart(); my $qEnd = $current->getQueryEnd(); # Break up the name in to a classification Type/Subtype my ( $classification ) = ( $name =~ /\#(.*)/ ); # Add to class/subclass refinement collection my $refColl = $refinementHash->{$classification}; if ( !defined $refColl ) { $refColl = []; $refinementHash->{$classification} = $refColl; } my $querySeq = ""; if ( $current->getQueryString() eq "" ) { # Search engine did not return the alignment so we must # grab it ourselves from the DB. # TODO: Double check the length of querySeq $querySeq = $seqDB->getSubstr( $seqID, $qStart - 1, $qEnd - $qStart + 1 ); } # Generate a unique id ( to this session ) for the seed $current->setOverlap( "b" . $batchNum . "s" . $stage . "i" . $i ); # If 'seq' is blank the sequence can be obtained from the seed result. push @$refColl, { 'seed' => $current, 'seq' => $querySeq }; } } } ##-------------------------------------------------------------------------## ## Use: my isRefineable( $searchResult ); ## ## IN DEVELOPMENT. This routine returns the refineable status ## given an element. A refineable element can ## be searched against a highly detailed library ## to return a more refined set of alignments. ## ## Returns ## ## 1 = true, 0 = false. ## ## Globals Used: None ## ##-------------------------------------------------------------------------## sub isRefineable { my $result = shift; my $refineableHashRef = shift; my $name = $result->getSubjName(); my ( $id, $classification ) = ( $name =~ /(.*)\#(.*)/ ); if ( defined $refineableHashRef->{$id} ) { return 1; } return 0; } ##-------------------------------------------------------------------------## ## Use: my Choose( \%options, $what, $fragCnt, $lib, ## $resultsCollection, $tax ); ## ## ## Returns ## ## Globals Used: None ## ##-------------------------------------------------------------------------##� sub Choose { my %options = %{ shift() }; my $repeatClass = shift; my $fragCnt = shift; my $lib = shift; my $searchResults = shift; my $tax = shift; $DEBUG = 0; print "Choose( \%options, $repeatClass, $fragCnt," . " $lib, \$searchResults ); Called\n" if ( $DEBUG ); my ( $printed_one, $lastoneisfineorf2, $lastoneisfine3utr, $lastonecut, $delayprint, $lastlineName, $lastlinebegin, $lastlineend, $lastlineleft ); my $undelPrevious = undef; my $previous; my $current; my $next; my @deleteList = (); # DEBUG output if ( $DEBUG ) { print "Original Annotations:\n"; for ( my $i = 0 ; $i < $searchResults->size() ; $i++ ) { print "#$i: " . $searchResults->get( $i ) ->toStringFormatted( SearchResult::NoAlign ); } print "\n"; } # # Modify search results and flag annotations for deletion # for ( my $i = 0 ; $i < $searchResults->size() ; $i++ ) { $current = $searchResults->get( $i ); my $name = $current->getSubjName(); # Look for "#buffer" annotations and remove them. Buffer # sequences are sequence (fragments) thrown into a library # to protect a sequence from being matched. This is often # used when there is some overlap between libraries. Matches # to #buffer sequences are more likely elements which will # be checked at a later stage. if ( $name =~ /\#buffer/ ) { # like MT2B in rodcutsines.lib print "Deleting annotation ($i): It's a #buffer sequence.\n" if ( $DEBUG ); push @deleteList, $i; next; } # Grab the current line of data into variables for easy reference my $score = $current->getScore(); my $diverge = 100 * ( $current->getPctDiverge() / ( 100 - $current->getPctInsert() ) ); my $gaps = $current->getPctDelete() + $current->getPctInsert(); my $queryleftover = $current->getQueryRemaining(); my $reverse = 0; if ( $current->getOrientation() eq "C" ) { $reverse = 1; } my $begin = $current->getSubjStart(); my $end = $current->getSubjEnd(); my $left = $current->getSubjRemaining(); # # Simple filter on divergence level # if ( $options{'div'} && $diverge > $options{'div'} ) { print "Deleting annotation ($i): Divergence too high.\n" if ( $DEBUG ); push @deleteList, $i; next; } # Only consider previous elements if they are # contained in the same sequence. Also do not # consider previous elements if they were marked # for deletion. my $havePreviousElement = 0; if ( $i > 0 ) { $havePreviousElement = 1; $previous = $searchResults->get( $i - 1 ); if ( $current->getQueryName() ne $previous->getQueryName() || ( @deleteList && $deleteList[ $#deleteList ] == $i - 1 ) ) { $havePreviousElement = 0; $previous = undef; } } # Similarly for next elements # except elements can't have been deleted yet my $haveNextElement = 0; if ( $i < $searchResults->size() - 1 ) { $next = $searchResults->get( $i + 1 ); $haveNextElement = 1 if $current->getQueryName() eq $next->getQueryName(); } # # Class contamination # if ( $repeatClass eq 'contam' ) { # The following elements in the contamination library # occasionally match (albeit with a low score) sequences # which have better matches to other repeats ( searched # later on ). Using a score threshold is a crude way # of identifying these incorrect annotations. if ( $options{'primspec'} ) { if ( $name =~ /Alu/ && $score < 1250 || $name =~ /MER30/ && $score < 540 || $name =~ /THE1/ && $score < 400 && $begin > 230 ) { print "Deleting annotation ($i): Filter #1 - Early misidentification.\n" if ( $DEBUG ); push @deleteList, $i; next; } } elsif ( $options{'rodspec'} ) { if ( $name =~ /L1MM_5000/ && ( $score < 800 || $diverge > 17 ) || $name =~ /^Lx[69]/ && $end < 365 && $score < 1000 || $score < 525 && $name eq 'B1_MM#SINE/Alu' || $score < 480 && $name eq 'B4#SINE/B4' ) { print "Deleting annotation ($i): Filter #2 - Early misidentification.\n" if ( $DEBUG ); push @deleteList, $i; next; } } } # class contam # # Class cut2 lib # elsif ( $repeatClass eq 'cut2' ) { # cut2.lib # cutting out youngish LINE1 3' ends; a tricky business # but can have a large (positive) effect only the younger # subset of elements in the repeat library are cut; the # other consensus seqs are only there to correctly classify # matches # this (high) limit prevents cutting out ancient elements # misassigned as young elements if ( $diverge + $gaps < 25 ) { if ( $name =~ /_3end\#/ && $left < 20 ) { # If we overlap with the previous element by more than 20bp if ( $havePreviousElement && $previous->getQueryEnd() > $current->getQueryStart() + 20 ) { # Delete this element if we are contained by the previous element # or if the previous element was cut. # not necessary to look at next element: # longer extending element is always presented first if ( $previous->getQueryEnd() >= $current->getQueryEnd() || $lastonecut ) { $lastoneisfine3utr = ""; $lastonecut = 0; print "Deleting annotation ($i): Filter #3 - Contained by another annot.\n" if ( $DEBUG ); push @deleteList, $i; } ## Overlap of body and 3' end # If the previous element was a good body (orf2) and this is # in the correct orientation (forward). elsif ( $lastoneisfineorf2 && !$reverse ) { print "Modifying annotation ($i): Filter #4 - Extending line 3' end\n" . " to include body.\n" if ( $DEBUG ); if ( $previous->getScore() > $score ) { # body scored better so use it's score and pctSub $current->setScore( $previous->getScore() ); $current->setPctDiverge( $previous->getPctDiverge() ); } # Include previous element boundries $current->setQueryStart( $previous->getQueryStart() ); my $tmpName = $name; $tmpName =~ s/\#/extended\#/; $current->setSubjName( $tmpName ); # used in ProcessRepeats $current->setSubjBegin( $lastlinebegin ); if ( $lastoneisfineorf2 eq 'L1P_orf2' ) { $current->setSubjEnd( $end + 3144 ); } else { # currently only rodent as alternative; need to generalize $current->setSubjEnd( $end + 4384 ); } # Remove the body annotation and keep the 3' extended annotation. $lastonecut = 1; $lastoneisfine3utr = ""; print "Deleting annotation ($i-1): Filter #4 - Body assimilated\n" . " by 3' end annot.\n" if ( $DEBUG ); push @deleteList, $i - 1; } else { # Some freakish combination remove it $lastonecut = 0; $lastlinebegin = ""; $lastoneisfineorf2 = ""; print "Deleting annotation ($i-1): Filter #5 - Unknown 3' " . "overlap combination.\n" if ( $DEBUG ); push @deleteList, $i; } } # If we overlap with the previous element by more than 20bp... # Overlap <= 20 and several other factors. # L1 ORFs and body consensus sequences are built with a 150bp # overlap. If the alignment starts at > 150bp then it truely # matches the 3'end part. elsif ( ( $begin > 150 || !$reverse && $name =~ /L1PA\d\#|L1Hs|L1Rn|L1Md/ ) # L1PA, L1Hs, L1Rn...are young elements. It's safe to assume # that no extensions will be found with a shorter wordlength. # Therefore clip just the 3' consensus portion out as an # independent 3' end. && ( !$fragCnt || $current->getQueryStart() > 50 && !$reverse || $queryleftover > 50 && $reverse ) # Handle the case where this end could be extended but its # to close to a boundry ( of a fragmented sequence ) to tell. ) { # at the edge of query fragments (< 50 bp left); extension may be found # in overlapping fragment if ( $havePreviousElement && $previous->getQueryEnd() >= $current->getQueryEnd() ) { #accidental overlap < 20 bp if ( !$reverse || $lastonecut ) { print "Modifying annotation ($i): Filter #5a - 3' end " . "start adjustment\n" . " (accidental overlap)\n" if ( $DEBUG ); $current->setQueryStart( $previous->getQueryEnd() + 1 ); } } $lastonecut = 1; $lastoneisfine3utr = ""; } else { # Ok....this is strange. If this is set....and for some # reason a body is not quickly located...this annotation # will never be clipped. $lastoneisfine3utr = 1 if $reverse; $lastonecut = 0; # Strange that we don't keep it if its a singleton print "Deleting annotation ($i): Filter #6 - " . "Assimilation or singleton\n" if ( $DEBUG ); push @deleteList, $i; } } # if ($name =~ /_3end\#/... elsif ( $name =~ /_orf2/ && $begin > 100 && $left < 20 # only the younger L1 body consensi are suffixed '_orf2'; # they are all 5' shortened in part because no 5' ends # (which overlap with the full consensi) are included # in this comparison only matches labeled 'extended' # are clipped, wich are treated separately in ProcessRepeats && ( !$fragCnt || $current->getQueryStart() > 50 ) ) { # forward: may extend into previous frag; reverse: lack of # complete 3' UTR guaranteed if ( $havePreviousElement && $lastoneisfine3utr && $previous->getQueryEnd() > $current->getQueryStart() && $reverse && ( !$fragCnt || $queryleftover > 50 ) ) { print "Modifying annotation ($i): Filter #7 - Reverse " . "body can be combined with 3'.\n" if ( $DEBUG ); if ( $previous->getScore() > $score ) { $current->setScore( $previous->getScore() ); $current->setPctDiverge( $previous->getPctDiverge() ); } $current->setQueryStart( $previous->getQueryStart() ); $current->setSubjName( ( $lastlineName =~ s/\#/extended\#/ ) ); $current->setSubjLeft( $lastlineleft ); if ( $tax->isA( $options{'species'}, "primates" ) ) { $current->setSubjEnd( $lastlineend + 3144 ); } else { #right now only alternative is rodents; $current->setSubjEnd( $lastlineend + 4384 ); } $lastonecut = 1; $lastoneisfineorf2 = ""; } else { $lastonecut = 0; $lastoneisfineorf2 = $name if !$reverse; $lastlinebegin = $begin; print "Deleting annotation ($i): Filter #7 - Not extended (yet).\n" if ( $DEBUG ); push @deleteList, $i; } $lastoneisfine3utr = ""; } else { # I'm not cutting out 3' UTRs on opposite strand that begin < 150 # and do not overlap with ORF2 $lastoneisfine3utr = $lastoneisfineorf2 = ""; $lastonecut = 0; print "Deleting annotation ($i): Filter #8 - A bodyless 3' UTR.\n" if ( $DEBUG ); push @deleteList, $i; } } else { # if <75% identical $lastonecut = 0; print "Deleting annotation ($i): Filter #9 - < 75% identical.\n" if ( $DEBUG ); push @deleteList, $i; } } # Class cut2.lib # # Class mirs # elsif ( $repeatClass eq 'mirs' ) { # The MIR and MIR3 SINEs share 51 and 78 bp (inexactly) with the L2 # and L3 LINEs, respectively. Occasionally, a fragment is matched to # both a SINE and a LINE, confusing ProcessRepeats and often extending # the match (and the masking) too far. if ( $name =~ /^L2a|^L2b|^L3|^MIR/i && $havePreviousElement && $previous->getQueryEnd() > $current->getQueryStart() + 33 ) { my $classnow = $name; my $classthen = $lastlineName; $classnow =~ s/.+\#//; $classthen =~ s/.+\#//; if ( $current->getQueryEnd() - $previous->getQueryEnd() <= 30 && ( $previous->getScore() >= 1.5 * $score || $classnow ne $classthen && ( $previous->getScore() >= 1.05 * $score || $current->getQueryEnd() - $previous->getQueryEnd() < $current->getQueryStart() - $previous->getQueryStart() && $previous->getScore() >= 0.83 * $score ) ) ) { print "Deleting annotation ($i): Filter #9a - " . "Preceeding L2/L3/Mir better.\n" if ( $DEBUG ); push @deleteList, $i; } elsif ( $current->getQueryStart() - $previous->getQueryStart() <= 30 && ( $score >= 1.5 * $previous->getScore() || $classnow ne $classthen && ( $score >= 1.05 * $previous->getScore() || $current->getQueryEnd() - $previous->getQueryEnd() > $current->getQueryStart() - $previous->getQueryStart() && $score >= 0.83 * $previous->getScore() ) ) ) { print "Deleting annotation ($i-1): Filter #9a - " . "Following L2/L3/Mir better.\n" if ( $DEBUG ); push @deleteList, $i - 1; } } else { if ( $name =~ /MER91/ && $score < 200 ) { print "Deleting annotation ($i): Filter #9b - A poor MER91.\n" if ( $DEBUG ); push @deleteList, $i; } } } #class mirs # # Class sines # elsif ( $repeatClass eq 'sines' ) { if ( $name =~ /^MLT2B/ ) { # Occasionally the match to Ricksha is broken up by a gap # while spanned in one piece by a (much more diverged) match to MLT2 if ( ( $havePreviousElement && $previous->getSubjName() =~ /^Ricksha/ && $current->getQueryStart() < $previous->getQueryEnd() - 50 && $score < $previous->getScore() ) || ( $haveNextElement && $next->getSubjName() =~ /^Ricksha/ && $current->getQueryEnd() > $next->getQueryStart() + 50 && $score < $next->getScore() ) ) { print "Deleting annotation ($i): Filter \#9c - MLT2 really is Ricksha.\n" if ( $DEBUG ); push @deleteList, $i; } } elsif ( $name eq 'MER3' && $current->getSubjStart() > 30 && $current->getSubjEnd() < 125 ) { # MER3 and MER33 share TIRs, but MER3 has 47 bp extraTIReal seqeunce at the 5' end # this sometimes leads to MER3 and MER33 alignments overlapping because # the alignments can be (falsely) extended a few bases further to the MER3 consensus if ( ( $havePreviousElement && $previous->getSubjName() eq 'MER33' && $current->getOrientation() eq 'C' && $current->getQueryEnd() - $previous->getQueryEnd() < 20 ) || ( $haveNextElement && $next->getSubjName() eq 'MER33' && $current->getOrientation() eq '+' && $next->getQueryStart() - $current->getQueryStart() < 20 ) ) { print "Deleting annotation ($i): Filter \#9d - MER3 really is MER33.\n" if ( $DEBUG ); push @deleteList, $i; } } # This was handled before the divergence filter. if ( $tax->isA( $options{'species'}, "rodentia" ) ) { # Similar to above MER3/MER33 case, B4 contains a B1 and alignments # can continue a few meaningless bases beyond a B1 if ( $havePreviousElement && $name =~ /^B1/ && $previous->getSubjName() =~ /^B4/ && $current->getQueryStart() < $previous->getQueryEnd() - 50 && $score > 1.5 * $previous->getScore() && $previous->getSubjStart() >= 80 ) { # deleted requirement of score > 226; this was in original code # as the B4 element would otherwise not be considered anyway print "Deleting annotation ($i-1): Filter \#10a.\n" if ( $DEBUG ); push @deleteList, $i - 1; } elsif ( $haveNextElement && $name =~ /^B4/ && $next->getSubjName() =~ /^B1/ && $next->getQueryStart() < $current->getQueryEnd() - 50 && $next->getScore() > 1.5 * $score && $begin >= 80 ) { print "Deleting annotation ($i): Filter \#10b.\n" if ( $DEBUG ); push @deleteList, $i; } elsif ( $name =~ /SINE/ && $begin <= 100 ) { # check for false positives ID?? # instead of setting $score < 225" in parameters, here we # set the cutoff for all none-SINEs allows SINEs with score # 200-225 to get masked push( @deleteList, $i ) if $name eq "RSINE1" && $begin > 50 && $score < 225; } elsif ( $name =~ /^ORR1.*-int/ && $current->getOverlap() eq '*' ) { # location of partially masked B1 in consensus; # only skipped if there is a better match overlapping it # this may occasionally leave a genuine small bit of # ORR1 internal unmasked if it overlaps with something # else than a B1 # (MaLR internals are screened in the shortlib session) if ( $name =~ /^ORR1A1-int/ && $begin > 530 && $end < 680 || $name =~ /^ORR1A3-int/ && $begin >= 595 && $end <= 910 || $name =~ /^ORR1B1-int/ && $begin >= 585 && $end <= 860 ) { print "Deleting annotation ($i): Filter #11.\n" if ( $DEBUG ); push @deleteList, $i; } } elsif ( $name eq "RatSatRep1" && $begin > 1150 && $end < 1750 && $diverge > 16 ) { # this fragment matches a retroviral internal sequence (screened only in the next step) print "Deleting annotation ($i): Filter #12.\n" if ( $DEBUG ); push @deleteList, $i; } elsif ( $score < 225 ) { print "Deleting annotation ($i): Filter #13.\n" if ( $DEBUG ); push @deleteList, $i; } elsif ( $score <= 350 ) { # bits of low complexity in consensi that lead to matches in reversed DNA if ( $name =~ /^RLTR21/ && $begin > 1090 && $end < 1365 || $name =~ /^RMER12\#/ && $begin > 750 && $end < 1025 || $score < 310 && ( $name =~ /^RMER17/ && ( $name =~ /^RMER17A\#/ && $begin > 205 && $end < 460 || $name =~ /^RMER17A2\#/ && $begin > 290 && $end < 495 || $name =~ /^RMER17B\#/ && $begin > 840 && $end < 940 || # that's an unexpected one $name =~ /^RMER17B\#/ && $begin > 250 && $end < 445 || $name =~ /^RMER17C\#/ && $begin > 80 && $end < 250 || $name =~ /^RMER17D\#/ && $begin > 245 && $end <= 500 || $name =~ /^RMER17D2\#/ && $begin > 255 && $end <= 525 ) || $name =~ /^RNLTR15A/ && ( $name =~ /^RNLTR15A\#/ && $begin > 1440 && $end < 1550 || $name =~ /^RNLTR15A2\#/ && $begin > 1310 && $end < 1420 ) ) ) { print "Deleting annotation ($i): Filter #14.\n" if ( $DEBUG ); push @deleteList, $i; } } } elsif ( $tax->isA( $options{'species'}, "primates" ) ) { if ( $name =~ /^LTR66/ && $begin > 380 && $end < 470 || $name =~ /^MER45R/ && $score < 275 && $begin > 310 && $end < 400 ) { print "Deleting annotation ($i): Filter #15.\n" if ( $DEBUG ); push @deleteList, $i; } } } # class sines # # Class longlib # elsif ( $repeatClass eq 'longlib' ) { if ( $name =~ /L1MC3_3end/ && $begin > 1310 && $end < 1400 || $name =~ /L1MC4_3end/ && $begin > 1325 && $end < 1475 || $name =~ /^Lx9/ && $begin > 1860 && $end < 1940 || $score <= 300 && ( $name =~ /^L1_Mur2\#/ && $begin > 825 && $end < 932 || $name =~ /L1MCa_5end/ && $begin > 1880 && $end < 1980 || $name =~ /^L1_Mur3\#/ && $begin > 835 && $end < 905 ) ) { print "Deleting annotation ($i): Filter #16.\n" if ( $DEBUG ); push @deleteList, $i; } } # class longlib # # Class l1 # elsif ( $repeatClass eq 'l1' && $score < 425 ) { # run with -raw if ( $name =~ /^HAL1\#/ && $begin > 900 && $end < 1350 && $score < 375 || $name =~ /^HAL1b\#/ && $begin > 695 && $end < 910 && $score < 350 || $name =~ /L1M4_orf2/ && $begin > 700 && $end < 1340 && $score < 375 || $name =~ /L1MC3_3end/ && $begin > 1275 && $end < 1430 && $score < 350 || $name =~ /L1MC4_3end/ && $begin > 1260 && $end < 1490 && $score < 400 || $name =~ /L1MC4_5end/ && $begin > 850 && $end < 1340 && $score < 415 || $name =~ /L1MCa_5end/ && $begin > 1855 && $end < 1995 && $score < 350 || $name =~ /L1M4b_5end/ && ( $begin > 2800 && $end < 3350 || $begin > 2180 && $end < 2300 ) && $score < 400 || $name =~ /L1MDa_5end/ && $begin > 2420 && $end < 2505 && $score < 350 || $name =~ /L1ME4a_3end/ && $begin > 475 && $end < 725 && $score < 350 || $name =~ /L1MEb_5end/ && $begin > 620 && $end < 800 && $score < 350 || $name =~ /L1MEc_5end/ && $begin > 2100 && $end < 2500 && $score < 360 ) { print "Deleting annotation ($i): Filter #17.\n" if ( $DEBUG ); push @deleteList, $i; } } # class l1 # # Class simple # elsif ( $repeatClass eq "simple" ) { if ( $name !~ /AAAAA|$A$n/ && $name !~ /Satellite/ ) { # avoid cutting out polyA tails # decision to cut out simple repeat dependent on the # complexity of the unit my ( $length, $complex ) = &CalcComplexity( $name ); my $cutoff = 16 - 3 * ( $length - $complex ); if ( $havePreviousElement && $previous->getQueryEnd() >= $current->getQueryStart() ) { # cutting out overlapping simple repeats leads to problems # as some nucleotides are cut out twice (some flanking DNA # will be deleted instead) # masklevel is set to 1; masklevel 0 would avoid any # overlaps, but behaves odd my $newQueryStart = $previous->getQueryEnd() + 1; # this apparently breaks SearchResultCollection->write.....somehow # TODO: Why doesn't he just update $field[5]????? # I used to print $_, so that changes to @fields had no impact; # does changing @fields change $i (hey, not a very informative # name there ;-)) $current->setQueryStart( $newQueryStart ); } unless ( $name !~ /$[GA]*$/ && $diverge + $gaps / 1.5 <= $cutoff || $diverge + $gaps < ( $score - 200 ) / 50 || $diverge + $gaps <= 5 ) { print "Deleting annotation ($i): Filter #18.\n" if ( $DEBUG ); push @deleteList, $i; } } else { print "Deleting annotation ($i): Filter #19.\n" if ( $DEBUG ); push @deleteList, $i; } } # class simple # # Class alu # elsif ( $repeatClass eq "alu" ) { if ( $havePreviousElement && $previous->getQueryEnd() >= $current->getQueryStart() ) { #we're stuck with a masklevel 1; 0 behaves odd my $newfield5 = $previous->getQueryEnd() + 1; # Again...see above....but why? #$current->setQueryName( quotemeta $current->getQueryName() ); $current->setQueryStart( $newfield5 ); #s/(.*$fields[4]\s+)$fields[5](\s$fields[6].*)/$1$newfield5$2/; } unless ( $name =~ /^Alu/ && ( $begin < 6 && $end > 280 || $score >= 1450 ) || $begin < 6 && $left < 5 ) { print "Deleting annotation ($i): Filter #20.\n" if ( $DEBUG ); push @deleteList, $i; } } # class alu # # Class cut1 # elsif ( $repeatClass eq "cut1" ) { # for non-LINEs criteria are only that 5 bp or less are missing from ends; # no need to require a maximum divergence, since element is complete (can't get better) if ( $begin < 6 && $left < 5 && $name !~ /LINE/ || $name =~ /SINE/ && $begin < 6 && $left < 20 ) { ## used to be restricted to Alu and SINE/B..; if ( $havePreviousElement && $previous->getQueryEnd() >= $current->getQueryStart() ) { $current->setQueryStart( $previous->getQueryEnd() + 1 ); # since both last and current are full length, no overlaps #of more than a few bases occur } if ( ( $name =~ /RNA$/ || $name =~ /^7SL/ ) && $options{'norna'} || $name =~ /^BC[12]\S+SINE/ ) { #fixed feb 03; was RNA && print "Deleting annotation ($i): Filter #21. Avoid cutting out RNAs" . "when the user has shut this off. Also if this is a BC[12] SINE " . "we don't wish to cut this out in the cut1 phase???\n" if ( $DEBUG ); push @deleteList, $i; } } else { print "Deleting annotation ($i): Filter #22: Non-complete elements " . "could overlap with full length elements. We want to avoid " . "shortening full length elements when the evidence is weak.\n" if ( $DEBUG ); push @deleteList, $i; } } #class cut1 # # Class alumask # elsif ( $repeatClass eq "alumask" ) { # masking but not excising alu.lib if ( $name =~ /^SVA/ && ( $begin >= 855 && $end < 1270 || $begin >= 1263 && $end < 1372 ) ) { print "Deleting annotation ($i): Filter #23.\n" if ( $DEBUG ); push @deleteList, $i; } } # class alumask # # Catchall class # else { # anything that give false positives in the other libraries if ( $score < 300 ) { if ( $name =~ /PRIMA4-int/ && $begin > 475 && $end < 575 || $name eq "LTR39-int" && $begin > 3900 && $end < 3990 || $name eq "LTR38-int" && $begin > 1350 && $end < 1450 ) { print "Deleting annotation ($i): Filter #24.\n" if ( $DEBUG ); push @deleteList, $i; } } } if ( $name =~ /RNA$/ && $options{'norna'} ) { print "Deleting annotation ($i): Filter #25.\n" if ( $DEBUG ); push @deleteList, $i; } # # Contained elements surviving to this point can cause problems in # the excision routine. Ie. # Excising repeat ( step = 4 ): 53276 - 53394 # Excising repeat ( step = 4 ): 53262 - 53394 # FastaDB::substr - Error index out of bounds! # at ./RepeatMasker line 4589 # Delete only if in a cut class, has not already been deleted, # is in the same sequence, and overlaps at all with the previous # undeleted element. # if ( $repeatClass =~ /simple|alu|cut1|cut2/ && ( !@deleteList || $deleteList[ $#deleteList ] != $i ) && $undelPrevious && $current->getQueryName() eq $undelPrevious->getQueryName() && $undelPrevious->getQueryEnd() >= $current->getQueryEnd() ) { print "Deleting annotation ($i): Filter #26 - Contained by filter\n" if ( $DEBUG ); push @deleteList, $i; } $lastlineName = $name; $lastlineleft = $left; $lastlineend = $end; $undelPrevious = $current if ( !@deleteList || $deleteList[ $#deleteList ] != $i ); } # for ( my $i = 0 ; $i < $searchResults->size()... # Return a boolean indicating if there were hits left # after this process completed if ( @deleteList ) { # Remove duplicates - not sure if the above generates duplicates but # to be on the safe side my %seen = (); my @uniqDeleteList = grep { !$seen{$_}++ } @deleteList; print "Doing actual removals:\n" if ( $DEBUG ); foreach my $index ( sort { $b <=> $a } @uniqDeleteList ) { print "Removing element $index\n" if ( $DEBUG ); $searchResults->remove( $index ); } } # DEBUG output if ( $DEBUG ) { print "Final Annotations:\n"; for ( my $i = 0 ; $i < $searchResults->size() ; $i++ ) { print "#$i: " . $searchResults->get( $i ) ->toStringFormatted( SearchResult::NoAlign ); } print "\n"; } $DEBUG = 0; } # sub Choose ##-------------------------------------------------------------------------## ## Use: my ## ## ## Returns ## ## Globals Used: None ##-------------------------------------------------------------------------##� sub CalcComplexity { my %num = ( "A", 0, "C", 0, "G", 0, "T", 0 ); my %log = ( "A", 0, "C", 0, "G", 0, "T", 0 ); my $name = shift; $name =~ s/^\((\w+).*/$1/; my $length = length $name; my @bases = split( //, $name ); foreach my $base ( @bases ) { ++$num{$base}; } $log{A} = $num{A} * log( $num{A} ) if $num{A}; $log{C} = $num{C} * log( $num{C} ) if $num{C}; $log{G} = $num{G} * log( $num{G} ) if $num{G}; $log{T} = $num{T} * log( $num{T} ) if $num{T}; my $complex = ( $log{A} + $log{C} + $log{G} + $log{T} - $length * log( $length ) ) / - log( 4 ); return ( $length, $complex ); } ##-------------------------------------------------------------------------## ## Use: my (\%begin, \%end) = Get_segments( $resultsCollection ); ## ## ## Returns ## ## Looks like this was intended to read the search engine ## output and look for hit lines. ## ## From these lines it creates two data structures. One ## holds a list (in ascending order) of the start positions ## and one holds the end positions of all hits in the output. ## ## %begin = { name1 => [ pos1, pos2, pos3, ... ], ## name2 => [ pos1, pos2, ... ] }; ## %end = { name1 => [ pos1, pos2, pos3, ... ], ## name2 => [ pos1, pos2, ... ] }; ## ## revamped to use a SearchResultCollection. ## ## NO Globals Used ##-------------------------------------------------------------------------##� sub Get_segments { my $resultsCollection = shift; my %begin = (); # associative arrays which assign lists (begin and ends) my %end = (); # to each sequence name: Technically, this assigns a list # reference to each sequence name. my ( $lastend, $lastname ) = (); for ( my $i = 0 ; $i < $resultsCollection->size() ; $i++ ) { my $result = $resultsCollection->get( $i ); my $name = $result->getQueryName(); my $begin = $result->getQueryStart(); my $end = $result->getQueryEnd(); if ( $lastend && $name eq $lastname && $lastend >= $begin ) { # otherwise get negative lengths $begin = $lastend + 1 if $lastend + 1 < $end; } push @{ $begin{$name} }, $begin; push @{ $end{$name} }, $end; $lastname = $name; $lastend = $end; } return ( \%begin, \%end ); } ##-------------------------------------------------------------------------## ## Use: my ## ## ## Returns ## ## Globals Used: None ## Globals Modified: None ## ##-------------------------------------------------------------------------##� sub XoutSeq { my ( $XedSeq, $beginRef, $endRef ) = @_; if ( $beginRef ) { my $i; for ( $i = $#$beginRef ; $i > -1 ; $i-- ) { my $len = $$endRef[ $i ] - $$beginRef[ $i ] + 1; substr( $XedSeq, $$beginRef[ $i ] - 1, $len ) = 'X' x $len; } } return $XedSeq; } ##-------------------------------------------------------------------------## ## Use: my ## ## ## Returns ## ## Globals Used: None ##-------------------------------------------------------------------------##� sub CompressSeq { my ( $cprSeq, $beginRef, $endRef, $numX ) = @_; if ( $beginRef ) { my $i; for ( $i = $#$beginRef ; $i > -1 ; $i-- ) { my $len = $$endRef[ $i ] - $$beginRef[ $i ] + 1; substr( $cprSeq, $$beginRef[ $i ] - 1, $len ) = 'x' x $numX; } } return $cprSeq; } ##-------------------------------------------------------------------------## ## Use: my $fullSeq = DecompressSeq( $fullSeq, $beginRef, $endRef, $numX ); ## ## ## Returns ## Returns $fullSeq with "X"s inserted at positions ## specified by the beginRef and endRef lists. ## ## Globals Used: None ##-------------------------------------------------------------------------##� sub DecompressSeq { my ( $fullSeq, $beginRef, $endRef, $numX ) = @_; if ( $beginRef ) { my $i; for ( $i = 0 ; $i <= $#$beginRef ; $i++ ) { my $len = $$endRef[ $i ] - $$beginRef[ $i ] + 1; substr( $fullSeq, $$beginRef[ $i ] - 1, $numX ) = 'X' x $len; } } return $fullSeq; } ##-------------------------------------------------------------------------## ## Use: my ( $lengthOfMaskedSeq, $maskableLength, $GCAfter ) = &replaceXs ( ## %options, $seqDB, $annotationFile, ## $outputFile ); ## TODO: What a kludge! Think about this routine in a very deep way ## and determine where best to put it! ## In the next release lets move this below the ProcessRepeats ## section of the code. ## ## Returns ## ## In general we would like a way to annotate the original ## fasta file by either changing the case of the repeat ## sequences, masking them with a specific character "x,X,N" ## etc.., or removing them completely. This is a routine ## which only handles the first two cases and in a very dumb ## inefficient way. I would recommend making this a utility ## function (seperate from repeatmasker) which reads an ## annotation database and manipulates the sequence database ## accordingly. ## ## Old documentation: ## ## In the BLAST programs the Xs in the masked files are deleted. ## It is therefore necessary to replace these Xs with Ns ## to retain the correct sequence position in the BLAST output. ## ## accounting (e.g. lengthOfMaskedSeq) is currently incorrect for ## queries with Xs??? ## ## TODO: Expain these better ## The number of bases masked and the maskableLength is also returned. ## ## Globals Used: None ## Globals Modified: None ## ## TODO: Check seqDB interface and make sure that setsubstr method ## updates the GC stats! ## ##-------------------------------------------------------------------------##� sub replaceXs { my %options = %{ shift() }; my $seqDB = shift; my $annotationFile = shift; my $outputFile = shift; print "RepeatMasker::replaceXs\n" if ( $DEBUG ); my %annots = (); # # Open up a search results object # my $searchResults = CrossmatchSearchEngine::parseOutput( searchOutput => $annotationFile ); for ( my $i = 0 ; $i < $searchResults->size() ; $i++ ) { my $result = $searchResults->get( $i ); push @{ $annots{ $result->getQueryName() } }, { 'begin' => $result->getQueryStart(), 'end' => $result->getQueryEnd() }; } undef $searchResults; my @seqIDs = $seqDB->getIDs(); my $seqlen = 0; my $nInSeq = 0; my $GClength = 0; my $maskableLength = 0; my $lengthOfMaskedSeq = 0; my $workseq = ""; open OUTFILE, ">$outputFile"; foreach my $seqID ( @seqIDs ) { my $seq = $seqDB->getSequence( $seqID ); $workseq = $seq; $maskableLength += length $workseq; $maskableLength -= ( $workseq =~ tr/N{20,}// ); foreach my $posRec ( @{ $annots{$seqID} } ) { my $beginPos = $posRec->{'begin'}; my $endPos = $posRec->{'end'}; my $repLen = $endPos - $beginPos + 1; substr( $workseq, $beginPos - 1, $repLen ) = "0" x ( $repLen ); if ( $options{'xsmall'} ) { #print "lowercase from $seqID: $beginPos-$endPos\n"; substr( $seq, $beginPos - 1, $repLen ) = lc( substr( $seq, $beginPos - 1, $repLen ) ); } elsif ( $options{'x'} ) { #print "X'ng from $seqID: $beginPos-$endPos\n"; substr( $seq, $beginPos - 1, $repLen ) = "X" x ( $repLen ); } else { #print "N'ng from $seqID: $beginPos-$endPos\n"; substr( $seq, $beginPos - 1, $repLen ) = "N" x ( $repLen ); } } print OUTFILE ">" . $seqID; my $desc = $seqDB->getDescription( $seqID ); if ( $desc ne "" ) { print OUTFILE " " . $desc; } print OUTFILE "\n"; $seq =~ s/(\S{50})/$1\n/g; $seq .= "\n" unless ( $seq =~ /.*\n+$/s ); print OUTFILE $seq; $lengthOfMaskedSeq += ( $workseq =~ tr/0// ); $seq =~ tr/NRYMK//d; # ignore masked and ambiguous bases $seqlen += length $seq; $GClength += ( $seq =~ tr/GCS// ); # For now it is too slow to modify the file inplace #$seqDB->setSubstr( $seqID, 0, length($seq), $seq ); } close OUTFILE; my $GCafter = 0; $GCafter = 100 * $GClength / $seqlen if $seqlen; $GCafter = sprintf "%4.2f", $GCafter; return ( $lengthOfMaskedSeq, $maskableLength, $GCafter ); } ##-------------------------------------------------------------------------## ## Use: my &exciseRepeats( $resultsCollection, $step, ## $nr, $cutLevelBegRef, ## $cutLevelEndRef, $excisedRepRef, ## $numX, $seqDB ); ## ## ## Returns ## ## Go through a search engine output file and adjust hit positions ## based on what has already been excised. Set the cut_alu and ## cut_some flags. Lastly...actually cut the sequence out of the ## maskfile. ## ## Returns the number of alus actually cut from the maskfile and ## the number of non-alu elements cut from the maskfile. ## ## Globals Used: None ## Globals Set: None ## ##-------------------------------------------------------------------------##� sub exciseRepeats { my $resultsCollection = shift; my $outfile = shift; my $step = shift; my $nr = shift; my $cutLevelBegRef = shift; my $cutLevelEndRef = shift; # TODO: get rid of this my $excisedRepRef = shift; my $numX = shift; my $seqDB = shift; print "RepeatMasker::exciseRepeats( \$resultsCollection, $outfile, " . "$step, $nr, \$cutLevelBegRef, \$cutLevelEndRef, $numX, \$seqDB );\n" if ( $DEBUG ); my ( $name, $seq ); ## $step is to order begin and end references in array; ## $nr is to indicate to processrepeats if a repeat may have been ## excised before another repeat was detected if ( $resultsCollection->size() ) { my ( $beginRef, $endRef ) = &Get_segments( $resultsCollection ); &adjustAnnotationCoordinates( $resultsCollection, $nr, $cutLevelBegRef, $cutLevelEndRef, $numX ); $cutLevelBegRef->[ $step ] = $beginRef; $cutLevelEndRef->[ $step ] = $endRef; my @seqIDs = $seqDB->getIDs(); foreach my $seqID ( @seqIDs ) { my $beginRef = $cutLevelBegRef->[ $step ]->{$seqID}; my $endRef = $cutLevelEndRef->[ $step ]->{$seqID}; if ( $beginRef ) { for ( my $i = $#$beginRef ; $i > -1 ; $i-- ) { print "Excising repeat ( step = $step ): " . ( $$beginRef[ $i ] - 1 ) . " - " . ( $$endRef[ $i ] ) . "\n" if ( $DEBUG ); if ( $i < $#$beginRef && $$endRef[ $i ] > $$beginRef[ $i + 1 ] ) { warn "Contained by relationship in excision!! " . ( $$beginRef[ $i ] - 1 ) . " - " . ( $$endRef[ $i ] ) . " and " . ( $$beginRef[ $i + 1 ] - 1 ) . " - " . ( $$endRef[ $i + 1 ] ) . "\n"; } my $len = $$endRef[ $i ] - $$beginRef[ $i ] + 1; $seqDB->setSubstr( $seqID, $$beginRef[ $i ] - 1, $len, 'x' x $numX ); } # for ( my $i.. } # if ( $beginRef.. } # foreach my $seqID.. } print "RepeatMasker::exciseRepeats - leaving \n" if ( $DEBUG ); } # End sub exciseRepeats() ##-------------------------------------------------------------------------## ## Use: my $bRef = &extractRepeatBoundaries( $resultCollection ); ## ## $resultCollection : A SearchResultCollection. ## ## Returns ## $bRef : A reference to the repeat boundaries ## datastructure. This is simply: ## [ { 'begin'=>$beginPos, ## 'end' => $endPos }, ## 'queryID' => }, ## { 'begin'=>$beginPos, ## ... ## ] ## ## This is used primarily by the updateExcisedList routine as ## a temporary holder for these pre-fragmented ranges. ##-------------------------------------------------------------------------## sub extractRepeatBoundaries { my $resultsCollection = shift; my @repeatBoundaries = (); for ( my $j = 0 ; $j < $resultsCollection->size() ; $j++ ) { my $result = $resultsCollection->get( $j ); push @repeatBoundaries, { 'queryID' => $result->getQueryName(), 'begin' => $result->getQueryStart(), 'end' => $result->getQueryEnd() }; } print "RepeatMasker::extractRepeatBoundaries: Returning..\n" . Dumper( \@repeatBoundaries ) . "\n" if ( $DEBUG ); return ( \@repeatBoundaries ); } # end extractRepeatBoundaries(); ##-------------------------------------------------------------------------## ## Use: &updateExcisedList( $resultCollection, $excisedRepRef ); ## ## $resultCollection : A SearchResultCollection. ## $excisedRepRef : A reference to the excised repeat ## datastructure. This is simply: ## { $queryID => [ { 'begin'=>$beginPos, ## 'end' => $endPos }, ## { 'begin'=>$beginPos, ## 'end' => $endPos }, ## ... ## ] ## ... ## } ## ## Returns ## ## Appends all start/end positions from elements in a collection to a ## sorted array. ## ##-------------------------------------------------------------------------## sub updateExcisedList { my $repeatBoundariesRef = shift; my $excisedRepRef = shift; ## Add new elements to the excised list for ( my $j = 0 ; $j <= $#{$repeatBoundariesRef} ; $j++ ) { my $resultBegin = $repeatBoundariesRef->[ $j ]->{'begin'}; my $resultEnd = $repeatBoundariesRef->[ $j ]->{'end'}; my $queryID = $repeatBoundariesRef->[ $j ]->{'queryID'}; print "RepeatMasker::updateExcisedList: considering result: " . " $queryID: $resultBegin - $resultEnd\n" if ( $DEBUG ); print "RepeatMasker::updateExcisedList: $queryID list before: " . Dumper( $excisedRepRef->{$queryID} ) . "\n" if ( $DEBUG ); my $excisedList = $excisedRepRef->{$queryID}; my $inserted = 0; for ( my $i = $#{$excisedList} ; $i >= 0 ; $i-- ) { next if ( $excisedList->[ $i ]->{'end'} > $resultEnd ); if ( $excisedList->[ $i ]->{'begin'} >= $resultBegin ) { # Contained excision - remove print "RepeatMasker::updateExcisedList: removing internal excision: " . " $excisedList->[$i]->{'begin'} - $excisedList->[$i]->{'end'}\n" if ( $DEBUG ); splice( @{$excisedList}, $i, 1 ); next; } if ( $excisedList->[ $i ]->{'end'} < $resultBegin ) { # Splice in print "RepeatMasker::updateExcisedList: splicing...\n" if ( $DEBUG ); splice( @{$excisedList}, $i + 1, 0, ( { 'begin' => $resultBegin, 'end' => $resultEnd } ) ); $inserted = 1; last; } } if ( !$inserted ) { print "RepeatMasker::updateExcisedList: pushing...\n" if ( $DEBUG ); unshift @{ $excisedRepRef->{$queryID} }, { 'begin' => $resultBegin, 'end' => $resultEnd }; } print "RepeatMasker::updateExcisedList: $queryID list after: " . Dumper( $excisedRepRef->{$queryID} ) . "\n" if ( $DEBUG ); } } # sub updateExcisedList ##-------------------------------------------------------------------------## ## Use: &fragmentAnnotations( $resultsCollection, $excisedRepRef, ## $numX ); ## ## ## Returns ## ## Determine which annotations contain previously excised elements and ## fragment them. ## ##-------------------------------------------------------------------------## sub fragmentAnnotations { my $resultsCollection = shift; my $excisedRepRef = shift; my $numX = shift; my $DEBUG = 0; my $newSearchResults = SearchResultCollection->new(); my %deleteHash = (); for ( my $j = 0 ; $j < $resultsCollection->size() ; $j++ ) { my $result = $resultsCollection->get( $j ); my $resultBegin = $result->getQueryStart(); my $resultEnd = $result->getQueryEnd(); my $queryID = $result->getQueryName(); if ( $DEBUG ) { print "Considering fragmenting:\n"; print "" . $result->toStringFormatted( SearchResult::AlignWithQuerySeq ) . "\n"; print "" . $result->toString . "\n"; } if ( $excisedRepRef->{$queryID} ) { # Loop over previous cut out elements for this $seqID my $internalBegin = -1; my $internalEnd = -1; my @subSegmentList = (); my $newResultBegin = $resultBegin; print "Element boundaries: $resultBegin - $resultEnd\n" if ( $DEBUG ); foreach my $hit ( @{ $excisedRepRef->{$queryID} } ) { last if ( $hit->{'begin'} > $resultEnd ); next if ( $hit->{'begin'} < $resultBegin ); next if ( $hit->{'begin'} == $resultBegin && $hit->{'end'} == $resultEnd ); # Deal with this one $internalBegin = $hit->{'begin'}; $internalEnd = $hit->{'end'}; print "Found internal: $internalBegin - $internalEnd\n" if ( $DEBUG ); # Change: We shouldn't have skipped these zero-length gaps. The # subroutine createSubElements needed this info in order # to skip including the "x" in one of the alignment fragments. #if ( ( $internalBegin - $newResultBegin ) > 0 ) { print "Pushing $newResultBegin - " . ( $internalBegin - 1 ) . "\n" if ( $DEBUG ); push @subSegmentList, { 'begin' => $newResultBegin, 'end' => $internalBegin - 1 }; $newResultBegin = $hit->{'end'} + 1; } if ( $internalBegin > $result->getQueryStart() && $internalEnd < $resultEnd ) { print "last Annotation from " . ( $internalEnd + 1 ) . " to $resultEnd\n" if ( $DEBUG ); push @subSegmentList, { 'begin' => $internalEnd + 1, 'end' => $resultEnd }; } if ( $#subSegmentList >= 0 ) { my $resultSubCollection = &createSubElements( $result, \@subSegmentList, $numX ); $deleteHash{$j} = 1; # Signal that this element has been fragmented $newSearchResults->addAll( $resultSubCollection ); if ( $DEBUG ) { print "Fragmenting Element:\n" . $result->toStringFormatted( SearchResult::AlignWithQuerySeq ) . "\n"; for ( my $i = 0 ; $i < $resultSubCollection->size() ; $i++ ) { print "New Segment:\n"; print "" . $resultSubCollection->get( $i ) ->toStringFormatted( SearchResult::AlignWithQuerySeq ) . "\n"; } } } } } # for ### Here is a good place to debug the fragmentation process ### Adding an "if ( 0 ) {" around the following block of code ### will disable the actual modification of the result collection. # Delete all fragment parents foreach my $index ( sort { $b <=> $a } keys( %deleteHash ) ) { $resultsCollection->remove( $index ); } # Add parents to the results collection $resultsCollection->addAll( $newSearchResults ); #TODO: Check this # Sort the results collection $resultsCollection->sort( sub ($$) { $_[ 0 ]->getQueryStart() <=> $_[ 1 ]->getQueryStart(); } ); ### ### End the debug modification by placing a closing ### bracket here. ### $DEBUG = 0; } ##-------------------------------------------------------------------------## ## Use: my $fragResults = &createSubElements( $parentElement, ## $subSegmentList, ## $numX ); ## ## Results ## ## Given a SearchResult and a list of start/end positions fragment ## the element and return a SearchResultCollection containing the ## new fragments. ## ##-------------------------------------------------------------------------## sub createSubElements { my $parentElement = shift; my $subSegmentList = shift; my $numX = shift; my $DEBUG = 0; my $parentQueryStart = $parentElement->getQueryStart(); my $parentQueryEnd = $parentElement->getQueryEnd(); my $parentQueryLength = $parentQueryEnd - $parentQueryStart + 1; my $parentQuerySeq = $parentElement->getQueryString() || ""; my $parentSubjSeq = $parentElement->getSubjString() || ""; my $parentSubjStart = $parentElement->getSubjStart(); my $parentSubjEnd = $parentElement->getSubjEnd(); my $parentSubjLen = abs( $parentSubjEnd - $parentSubjStart ) + 1; my $newResultColl = SearchResultCollection->new(); my $numSegments = $#{$subSegmentList} + 1; my $segSubjStart = $parentSubjStart; my $segSubjEnd = $parentSubjEnd; my $realQueryLength = 0; for ( my $j = 0 ; $j < $numSegments ; $j++ ) { $realQueryLength += $subSegmentList->[ $j ]->{'end'} - $subSegmentList->[ $j ]->{'begin'} + 1; } for ( my $j = 0 ; $j < $numSegments ; $j++ ) { my $segQueryStart = $subSegmentList->[ $j ]->{'begin'}; my $segQueryEnd = $subSegmentList->[ $j ]->{'end'}; my $segQueryLen = $segQueryEnd - $segQueryStart + 1; print "RepeatMasker::createSubElements: segQueryStart = $segQueryStart," . " segQueryEnd = $segQueryEnd, segQueryLen = $segQueryLen, " . " segSubjStart = $segSubjStart segSubjEnd = $segSubjEnd\n " if ( $DEBUG ); # Do not produce very small alignemnts # Note: This can create small gaps ( usually in low quality # regions ) that must be taken into account in ProcessRepeats. if ( $segQueryLen < 5 ) { print "RepeatMasker::createSubElements: Annotation is to " . "small to report ( len = $segQueryLen, numX = $numX, " . "len(parentQuerySeq) = " . length( $parentQuerySeq ) . " )\n" if ( $DEBUG ); my $adjLen = 0; if ( $segQueryLen < 1 ) { # No bases...just move down by numX $adjLen = $numX; } else { my $bSeen = 0; my $skipTo = 0; while ( $bSeen < $segQueryLen && $skipTo < length( $parentQuerySeq ) ) { $bSeen++ if ( substr( $parentQuerySeq, $skipTo++, 1 ) ne "-" ); } print "RepeatMasker::createSubElements: skipTo=$skipTo, bSeen=$bSeen\n" if ( $DEBUG ); $adjLen = $skipTo + $numX; } if ( $adjLen <= length( $parentQuerySeq ) ) { # Adjust subject start my $sbjBases = substr( $parentSubjSeq, 0, $adjLen ); $sbjBases =~ s/-//g; $segSubjStart += length( $sbjBases ); $parentQuerySeq = substr( $parentQuerySeq, $adjLen ); $parentSubjSeq = substr( $parentSubjSeq, $adjLen ); } next; } my $newSegment = $parentElement->clone(); $newSegment->setQueryStart( $segQueryStart ); $newSegment->setQueryEnd( $segQueryEnd ); $newSegment->setQueryRemaining( $parentQueryEnd - $segQueryEnd + $parentElement->getQueryRemaining() ); # If alignment info if ( $parentQuerySeq ne "" ) { # Count through query seq until we reach the breakpoint ( discount "-" ) my $seqCount = 0; my $i = 0; while ( $seqCount <= $segQueryLen ) { $seqCount++ unless ( substr( $parentQuerySeq, $i++, 1 ) eq '-' ); } # Don't include the X my $newQuerySeq = substr( $parentQuerySeq, 0, $i - 1 ); my $newSubjSeq = substr( $parentSubjSeq, 0, $i - 1 ); $newSegment->setQueryString( $newQuerySeq ); $newSegment->setSubjString( $newSubjSeq ); # Set parent to remaining sequence if any if ( ( $i + $numX ) <= length( $parentQuerySeq ) ) { $parentQuerySeq = substr( $parentQuerySeq, $i - 1 + $numX ); $parentSubjSeq = substr( $parentSubjSeq, $i - 1 + $numX ); } $newSubjSeq =~ s/-//g; #print "segQueryStart = $segQueryStart - $segQueryEnd\n"; #print "newSubjSeq = $newSubjSeq\n"; if ( $parentElement->getOrientation() eq "C" ) { $newSegment->setSubjEnd( $segSubjEnd ); $newSegment->setSubjRemaining( $parentSubjEnd - $segSubjEnd + $parentElement->getSubjRemaining() ); if ( $newSubjSeq eq "" ) { $newSegment->setSubjStart( $segSubjEnd ); } else { $segSubjEnd -= length( $newSubjSeq ); $newSegment->setSubjStart( $segSubjEnd + 1 ); } } else { $newSegment->setSubjStart( $segSubjStart ); if ( $newSubjSeq eq "" ) { $segSubjEnd = $segSubjStart; } else { $segSubjEnd = $segSubjStart + length( $newSubjSeq ) - 1; $segSubjStart += length( $newSubjSeq ); } $newSegment->setSubjEnd( $segSubjEnd ); $newSegment->setSubjRemaining( $parentSubjEnd - $segSubjEnd + $parentElement->getSubjRemaining() ); } } else { # We have no alignment information. We cannot be sure # how many subject characters match this subsegment of # the alignment. Realignment can be too costly so we # will guestimate the number. This is not really proper # and we should make a note of this in the output. Currently # ProcessRepeats is doing this! my $segSubjLength = 0; if ( $j == ( $numSegments - 1 ) ) { if ( $parentElement->getOrientation() eq "C" ) { $segSubjLength = $segSubjEnd - $parentSubjStart; } else { $segSubjLength = $parentSubjEnd - $segSubjStart; } } else { my $percQuerySegLength = ( $segQueryEnd - $segQueryStart + 1 ) / $realQueryLength; $segSubjLength = int( $parentSubjLen * $percQuerySegLength ); } if ( $parentElement->getOrientation() eq "C" ) { $newSegment->setSubjEnd( $segSubjEnd ); $newSegment->setSubjRemaining( $parentSubjEnd - $segSubjEnd + $parentElement->getSubjRemaining() ); $segSubjEnd -= $segSubjLength; $newSegment->setSubjStart( $segSubjEnd ); $segSubjEnd--; } else { $newSegment->setSubjStart( $segSubjStart ); $segSubjEnd = $segSubjStart + $segSubjLength; $newSegment->setSubjEnd( $segSubjEnd ); $segSubjStart = $segSubjEnd + 1; $newSegment->setSubjRemaining( $parentSubjEnd - $segSubjEnd + $parentElement->getSubjRemaining() ); } } $newResultColl->add( $newSegment ); } return $newResultColl; } ##-------------------------------------------------------------------------## ## Use: my adjustAnnotationCoordinates ## ## ## Returns ## ## Adjusts the positions of the matches to sequences with excised ## repeats to those in the original sequences file adjusts the ## divergence level (second column) to: ## mismatches /(matches+mismatches) ## from the odd cross_match definition: ## mismatches /(length match in query). ## The latter definition makes repeats with inserts look less ## diverged than they really are. Also adds information to the ## .cat file about GC richness, fragment size, number of N strings ## in query ## ## Globals Used: None! ##-------------------------------------------------------------------------##� sub adjustAnnotationCoordinates { my $searchResults = shift; my $cutnumber = shift; my $cutLevelBegRef = shift; my $cutLevelEndRef = shift; my $numX = shift; print "RepeatMasker::adjustAnnotationCoordinates( \$searchResults, $cutnumber, " . "\$cutLevelBegRef, \$cutLevelEndRef, $numX );\n" if ( $DEBUG ); my $querysnext = ""; my $fullname; for ( my $j = 0 ; $j < $searchResults->size() ; $j++ ) { my $result = $searchResults->get( $j ); my $percDiv = $result->getPctDiverge(); my $percIns = $result->getPctInsert(); my $adjDiv = 100 * ( $percDiv / ( 100 - $percIns ) ); $result->setPctDiverge( sprintf( "%4.2f", $adjDiv ) ); $fullname = $result->getQueryName(); $result->setQueryStart( &adjustCoordinateForExcisions( $fullname, $result->getQueryStart(), $cutLevelBegRef, $cutLevelEndRef, $numX ) ); $result->setQueryEnd( &adjustCoordinateForExcisions( $fullname, $result->getQueryEnd(), $cutLevelBegRef, $cutLevelEndRef, $numX ) ); # Make sure we start with zero otherwise we would loose # a column. $cutnumber = 0 unless $cutnumber; $result->setId( $cutnumber ); } print "RepeatMasker::adjustAnnotationCoordinates - leaving.\n" if ( $DEBUG ); } ##-------------------------------------------------------------------------## ## Use &adjustCoordinateForExcisions ( $fullname, $position ## \@cutLevelBegRef, \@cutLevelEndRef ## $numX ## ## Globals used: None ##-------------------------------------------------------------------------## sub adjustCoordinateForExcisions { my $fullname = shift; my $position = shift; my @cutLevelBegRef = @{ shift() }; my @cutLevelEndRef = @{ shift() }; my $numX = shift; for ( my $i = $#cutLevelBegRef ; $i >= 0 ; --$i ) { next unless $cutLevelBegRef[ $i ]; $position = &Expand( $position, $cutLevelBegRef[ $i ]->{$fullname}, $cutLevelEndRef[ $i ]->{$fullname}, $numX ); } return $position; } ##-------------------------------------------------------------------------## ## Use: my &Expand ## ## ## Returns ## ## Globals Used: None ##-------------------------------------------------------------------------##� sub Expand { my ( $n, $beginRef, $endRef, $numX ) = @_; my $cum = 0; my $j; if ( $beginRef ) { foreach $j ( 0 .. $#$beginRef ) { return ( $n + $cum ) if ( $n < ( $beginRef->[ $j ] - $cum ) ); # $n maybe == $beginRef->[$j] - $cum when beginalign has been # moved to flank element deleted in same round and in reality # a previously cut element flanks the latter $cum += $endRef->[ $j ] - $beginRef->[ $j ] + 1 - $numX; } } return $n + $cum; } ##-------------------------------------------------------------------------## ## Use: my SaveOldFiles( $fijl, $fileend, $originaldir, $date, \%options ); ## ## ## Returns ## ## Globals Used: None ##-------------------------------------------------------------------------##� sub SaveOldFiles { my $fijl = shift; my $fileend = shift; my $originaldir = shift; my $date = shift; my %options = %{ shift() }; $fijl = $options{'dir'} . "\/$fileend" if $options{'dir'}; if ( $options{'is_only'} ) { rename( "$fijl.alert", "$fijl.alert.pre$date" ) && print "\nOld file $fijl.alert renamed to $fijl.alert.pre$date\n\n" if -s "$fijl.alert"; unlink "$fijl.withoutIS" if -s "$fijl.withoutIS"; } elsif ( $options{'primspec'} ) { rename( "$fijl.primate.spec", "$fijl.primate.spec.pre$date" ) && print "\nOld file $fijl.primate.spec renamed to $fijl.primate.spec.pre$date\n\n" if -s "$fijl.primate.spec"; } elsif ( $options{'rodspec'} ) { rename( "$fijl.rodent.spec", "$fijl.rodent.spec.pre$date" ) && print "\nOld file $fijl.rodent.spec renamed to $fijl.rodent.spec.pre$date\n\n" if -s "$fijl.rodent.spec"; } else { my $savedir = "$originaldir\/$fileend.pre$date.RMoutput"; $savedir = $options{'dir'} . "\/$fileend.pre$date.RMoutput" if $options{'dir'}; mkdir $savedir, 0777; rename( "$fijl.cat", "$savedir\/$fileend.cat" ) if -s "$fijl.cat"; rename( "$fijl.stderr", "$savedir\/$fileend.stderr" ) if -s "$fijl.stderr"; rename( "$fijl.out", "$savedir\/$fileend.out" ) if -s "$fijl.out"; rename( "$fijl.masked", "$savedir\/$fileend.masked" ) if -s "$fijl.masked"; rename( "$fijl.tbl", "$savedir\/$fileend.tbl" ) if -s "$fijl.tbl"; rename( "$fijl.cut", "$savedir\/$fileend.cut" ) if -s "$fijl.cut" && $options{'cut'}; rename( "$fijl.align", "$savedir\/$fileend.align" ) if -s "$fijl.align" && $options{'a'}; rename( "$fijl.alert", "$savedir\/$fileend.alert" ) if -s "$fijl.alert" && !$options{'no_is'}; rename( "$fijl.withoutIS", "$savedir\/$fileend.withoutIS" ) if -s "$fijl.withoutIS" && $options{'is_clip'}; rmdir $savedir || print " Some previous RepeatMasker output files were moved to the directory $savedir in order not to overwrite them.\n\n"; } } ##-------------------------------------------------------------------------## ## Use: my &SkipFile( $file ); ## ## ## Returns ## ## Globals Used: None ##-------------------------------------------------------------------------##� sub SkipFile { my $file = shift; copy( $file, "$file.masked" ); ( my $tempfile = $file ) =~ s/.+\///; print "RepeatMasker quit because the file $tempfile only contains ambiguous bases, if any. To accomodate automated processes the file has been copied to $tempfile.masked and this message has been printed to $tempfile.out\n\n"; open( OUT, ">$file.out" ); print OUT "RepeatMasker quit because the file $tempfile only contains ambiguous bases, if any.\n"; close OUT; } ##-------------------------------------------------------------------------## ## Use: my ( $tempdir, $runnumber ) = &createtempdir( \%options, $date, ## $file ); ## ## ## Returns ## ## TODO: Clean this up!!! It uses globals and makes assumptions ## about how files are passed to repeatmasker etc.. ## ## Globals Used: ARGV[0] ##-------------------------------------------------------------------------##� sub createtempdir { my %options = %{ shift() }; my $date = shift; my $file = shift; my $curdir = cwd(); # To make cygwin happy - Contributed by Mike Seivers of TimeLogic $curdir =~ s/ /\\ /go; my ( $querydir, $fileendname ) = ( File::Spec->splitpath( $ARGV[ 0 ] ) )[ 1, 2 ]; $querydir = "." if ( $querydir eq "" ); # Used to avoid including existing files in output even # if $options{'dir'} chosen, preferred to write the temporary # files to a temporary subdirectory of the home directory, as # $options{'dir'} may be across system boundaries: my $runnumber = "$$" . ".$date"; my $tempdir = "$curdir\/RM_$runnumber"; unless ( -r "$tempdir\/$fileendname" || mkdir $tempdir, 0777 ) { if ( $options{'dir'} ) { $tempdir = $options{'dir'} . "\/RM_$runnumber"; die "Can't write to " . $options{'dir'} . "\n" unless mkdir $tempdir, 0777; } else { # no writing to current directory $tempdir = "$querydir\/RM_$runnumber"; if ( mkdir $tempdir, 0777 ) { my $temptestfile = "$file" . "_$runnumber"; copy( $file, "$temptestfile" ) || die "Can not create a $curdir subdirectory nor write " . "full output to $querydir.\n Change operating " . "directory or use the option " . $options{'dir'} . " to indicate where files should be written.\n"; unlink $temptestfile; } else { die "There is no writing access to the current directory " . "($curdir) nor to the directory containing the query " . "sequence.\nConsider using \"-dir\" or changing " . "current directory."; } } } return ( $tempdir, $runnumber ); } ##-------------------------------------------------------------------------## ## Use: my ## ## ## Returns ## ## Globals Used: None ##-------------------------------------------------------------------------##� ### Interrupt handler used by systemint() ### sub handler { my ( $sig ) = @_; print "\nAborting with a SIG$sig\n"; exit( -1 ); } ##-------------------------------------------------------------------------## ## Use: my ## ## ## Returns ## ## Globals Used: None ##-------------------------------------------------------------------------##� ### systemint -- Interruptible system call routine. ### sub systemint { my ( $cmd ) = @_; my ( $pid ); my ( $flag ) = 0; local $SIG{INT} = sub { &handler( @_ ) if ( $flag ) }; #^C local $SIG{QUIT} = sub { &handler( @_ ) if ( $flag ) }; #^\ local $SIG{TERM} = sub { &handler( @_ ) if ( $flag ) }; #kill command or system crash local $SIG{HUP} = sub { &handler( @_ ) if ( $flag ) }; # local $SIG{CHLD} = 'IGNORE'; FORK: { if ( $pid = fork ) { $flag = 1; waitpid( $pid, 0 ); #Waits for child to finish... my ( $status ) = $?; if ( WIFSTOPPED( $status ) ) { my ( $signal ) = WSTOPSIG( $status ); print "\nforksys: Program terminated by a signal $signal.\n"; print "The executing command was: $cmd\n"; return 1; } if ( WIFEXITED( $status ) ) { my ( $temp ) = WEXITSTATUS( $status ); return $temp; } if ( WIFSIGNALED( $status ) ) { my ( $signal ) = WTERMSIG( $status ); print "\nforksys: Program terminated by a signal $signal.\n"; print "The executing command was: $cmd\n"; return 1; } } elsif ( defined $pid ) { exec( "$cmd" ) or die "Exec $cmd failed\n"; } elsif ( $! =~ /No more process/o ) { print "$!\n"; sleep 5; redo FORK; } else { die "Can't fork...\n"; } } } ##-------------------------------------------------------------------------## ## Use: my &cleanUp( \%options, $runnumber, $tempdir, $fileori, ## $fileend, $file, $originaldir, $compressed ); ## ## Returns ## ## Globals Used: None ##-------------------------------------------------------------------------##� sub cleanUp { my %options = %{ shift() }; my $runnumber = shift; my $tempdir = shift; my $fileori = shift; my $fileend = shift; my $file = shift; my $originaldir = shift; my $compressed = shift; unlink "$tempdir\/$fileend"; # eq $file, but unlinking $file seems scary # copying it to originaldirectory would # change the date, priviliges, etc. unlink "$file.masked.log"; # pretty darn useless little file opendir TEMP, "$tempdir"; my @outputfiles = readdir TEMP; closedir TEMP; # default is writing output to query directory my $targetdir = $originaldir; $targetdir = $options{'dir'} if $options{'dir'}; if ( open TEMP2, ">$targetdir\/temp.$runnumber" ) { unlink "$targetdir\/temp.$runnumber"; close TEMP2; foreach my $outputfile ( @outputfiles ) { next unless $outputfile =~ /^$fileend/; #rename doesn't cross system boundaries copy( "$tempdir\/$outputfile", "$targetdir\/$outputfile" ) || die "Can't write all output files to $targetdir " . "(over quota?)n. Files can be found in $tempdir " . "(and perhaps a few in $targetdir). Run " . "\"ProcessRepeats\" on the .cat file or redo " . "analysis.\n\n"; unlink "$tempdir\/$outputfile"; } } else { print "\nOutput files can not be written to $targetdir. " . "They can be found in the directory $tempdir instead. \n" . "Consider using the -dir option.\n\n"; } if ( $compressed ) { &systemint( "gzip $fileori" ) if $compressed eq 'zipped'; &systemint( "compress $fileori" ) if $compressed eq 'Zed'; } } ##-------------------------------------------------------------------------## ## Use: my $libSize = createLib( \%options, $db, $libName, $specPattern, ## $stageNum, $tax); ## ## \%options : RepeatMasker options hash ## $db : A FastaDB or RepbaseEMBL object open to ## the repeat datbase. ## $libName : The name of the library to create. ## $specPattern : The name of the species to include seqs for. ## $stageNum: The name of the old RM database. Used to ## screen the repeats ( will generalize in the ## future ). ## $tax : The Taxonomy.pm object. ## ## Returns ## ## Creates a library by filtering the RepeatMasker.lib file ## given specific filtering parameters ( specPattern and stageNum). ## If wublast is being used it also creates the binary versions ## of the fasta library. Returns the number of sequences stored ## in the library. Removes the library file if there are no ## matching sequences. ## ## Globals Used: None ##-------------------------------------------------------------------------##� sub createLib { my $options = %{ shift() }; my $db = shift; my $libName = shift; my $specPattern = shift; my $stageNum = shift; my $tax = shift; my $searchEngine = shift; print "RepeatMasker::createLib( \$db, " . "$libName, $specPattern, $stageNum, \$tax );\n" if ( $DEBUG ); my @ids; my @descs; my $seqCount = 0; if ( $db->isa( "FastaDB" ) ) { @ids = $db->getIDs(); $seqCount = $#ids + 1; @descs = $db->getDescriptors(); } elsif ( $db->isa( "RepbaseEMBL" ) ) { $seqCount = $db->getRecordCount(); } else { print "RepeatMasker:createLib: Cannot determine what type " . "of database we were handed!\n"; } my $outFile = $libName; $outFile = "$libName-wublast" if ( $searchEngine->isa( "WUBlastSearchEngine" ) ); open OUT, ">$outFile" or die "RepeatMasker::createLib(): Could " . "not open library file $outFile!\n"; # The number of sequences stored in this library my $librarySize = 0; # For each sequence in the master library for ( my $i = 0 ; $i < $seqCount ; $i++ ) { my $match = 0; my @buffers = (); my $seq = ""; my $id = ""; my $type = ""; my $desc = ""; if ( $db->isa( "FastaDB" ) ) { $seq = $db->getSequence( $ids[ $i ] ); $id = $ids[ $i ]; $desc = $descs[ $i ]; while ( $descs[ $i ] =~ /@([\w\.]+)/ig ) { my $name = $1; $name =~ s/_/ /g; if ( $tax->isA( $name, $specPattern ) > 0 || $tax->isA( $specPattern, $name ) > 0 ) { # Eventually we will place all new species specific repeats # into state 80. This will make this special case not # necessary. if ( ( not defined $stageNum ) || ( $stageNum eq "" ) || ( $stageNum == 80 && $descs[ $i ] =~ /\[S:\]/ ) || $descs[ $i ] =~ /\[S:(\s*\d+,\s*)*$stageNum(\s*,\s*\d+\s*)*\]/ ) { $match = 1; } last; } } } else { my $record = $db->getRecord( $i ); $seq = $record->getSequence(); $id = $record->getId(); $type = "#" . $record->getRMType(); if ( $record->getRMSubType() ne "" ) { $type .= "/" . $record->getRMSubType(); } $desc = $record->getDescription(); foreach my $name ( $record->getRMSpeciesArray() ) { $name =~ s/_/ /g; if ( $tax->isA( $name, $specPattern ) > 0 || $tax->isA( $specPattern, $name ) > 0 ) { if ( $stageNum == 80 ) { # For the specieslib it is sufficient to be in # the clade or in the ancestral species. No # need to breakout into seperate search stages # yet. $match = 1; } else { # Full length sequence non-buffered my @stages = $record->getRMSearchStagesArray(); foreach my $stage ( @stages ) { if ( $stage eq $stageNum ) { $match = 1; } } # Buffered Sequence @stages = $record->getRMBufferStagesArray(); foreach my $stage ( @stages ) { if ( $stage =~ /(\d+)\[(\d+)\-(\d+)\]/ ) { if ( $1 == $stageNum ) { push @buffers, "$2-$3"; } } elsif ( $stage =~ /(\d+)/ ) { if ( $1 == $stageNum ) { push @buffers, "full"; } } else { print "RepeatMasker::createLib: Warning buffer stage $stage " . "understood!\n"; } } } last if ( $match == 1 ); } } } if ( $match > 0 || @buffers ) { $librarySize++; my $origSeq = $seq; if ( $match > 0 ) { if ( $searchEngine->isa( "WUBlastSearchEngine" ) || $searchEngine->isa( "DeCypherSearchEngine" ) ) { my $rseq = uc( $seq ); $rseq =~ tr/ACGTRYWSKMNXBDHV/TGCAYRSWMKNXVHDB/; $rseq = reverse $rseq; die "Repeat consensus ( $id ) contains the " . "word \"anti\" in it's name. This will cause " . "incorrect orientation calls in the output when " . "running with wublast/decypher." if ( $id =~ /anti/ ); print OUT ">$id" . $type; print OUT " (anti)\n"; $rseq =~ s/(\S{50})/$1\n/g; $rseq .= "\n" unless ( $rseq =~ /.*\n+$/s ); print OUT $rseq; } print OUT ">" . $id . "$type\n"; $seq =~ s/(\S{50})/$1\n/g; $seq .= "\n" unless ( $seq =~ /.*\n+$/s ); print OUT $seq; } if ( @buffers ) { foreach my $buffer ( @buffers ) { $seq = $origSeq; if ( $buffer eq "full" ) { $type = "#buffer"; } elsif ( $buffer =~ /(\d+)-(\d+)/ ) { $seq = substr( $seq, $1 - 1, $2 - $1 + 1 ); $type = "_$1" . "_$2#buffer"; } if ( $searchEngine->isa( "WUBlastSearchEngine" ) || $searchEngine->isa( "DeCypherSearchEngine" ) ) { my $rseq = uc( $seq ); $rseq =~ tr/ACGTRYWSKMNXBDHV/TGCAYRSWMKNXVHDB/; $rseq = reverse $rseq; die "Repeat consensus ( $id ) contains the " . "word \"anti\" in it's name. This will cause " . "incorrect orientation calls in the output when " . "running with wublast/decypher." if ( $id =~ /anti/ ); print OUT ">" . $id . "$type"; print OUT " (anti)\n"; $rseq =~ s/(\S{50})/$1\n/g; $rseq .= "\n" unless ( $rseq =~ /.*\n+$/s ); print OUT $rseq; } print OUT ">" . $id . "$type\n"; $seq =~ s/(\S{50})/$1\n/g; $seq .= "\n" unless ( $seq =~ /.*\n+$/s ); print OUT $seq; } } } } close OUT; if ( $librarySize == 0 ) { unlink( $outFile ); } else { if ( $searchEngine->isa( "WUBlastSearchEngine" ) ) { system( "$RepeatMaskerConfig::SETDB_PRGM $outFile > setdb.log 2>&1" ) == 0 or die "RepeatMasker::createLib(): Error invoking setdb on file " . "$outFile. We tried using the setdb program ( " . $RepeatMaskerConfig::SETDB_PRGM . " ).\n"; unlink( $outFile ) unless ( $DEBUG ); move( "$outFile.ahd", "$libName.ahd" ); move( "$outFile.atb", "$libName.atb" ); move( "$outFile.bsq", "$libName.bsq" ); } elsif ( $searchEngine->isa( "DeCypherSearchEngine" ) ) { $outFile =~ s/\/cygdrive\/(\w)/$1:/; # handle cygwin drive $outFile =~ /.*\/(.*)$/; # TODO: NO Globals! This should be a parameter system "$RepeatMaskerConfig::DEMAKE -source \"$outFile\" -targ $1"; } elsif ( $searchEngine->isa( "NCBIBlastSearchEngine" ) ) { system( "$RepeatMaskerConfig::RMBLASTDB_PRGM -dbtype nucl " . "-in $outFile > rmblastdb.log 2>&1" ) == 0 or die "RepeatMasker::createLib(): Error invoking " . $RepeatMaskerConfig::RMBLASTDB_PRGM . " on file $outFile.\n"; } } return ( $librarySize ); } ##-------------------------------------------------------------------------## ## Use: my &processCustomLib( \%options, $wublastSetDB, $tempdir ); ## ## \%options : RepeatMasker options hash ## $wublastSetDB : The full path to the wublast "setdb" program. ## $tempdir : The temporary directory for this run ## $searchEngine : The searchEngine being used ## ## Returns ## ## Processes a custom library ( in FASTA format ) supplied by the user. ## This involves checking that the repeat names supplied by the user's ## library conform to the RepeatMasker nomenclature. Secondarily this ## will create the frozen databases for WUBlast. ## ## Globals Used: None ##-------------------------------------------------------------------------##� sub processCustomLib { my $options = %{ shift() }; my $tempdir = shift; my $searchEngine = shift; print "RepeatMasker::processCustomLib()\n" if ( $DEBUG ); my ( $custLibVol, $custLibDir, $custLibFile ) = File::Spec->splitpath( $options{'lib'} ); $custLibDir = "." if ( $custLibDir eq "" ); # TODO: Check name syntax if ( $searchEngine->isa( "WUBlastSearchEngine" ) || $searchEngine->isa( "DeCypherSearchEngine" ) ) { my $libDB = FastaDB->new( fileName => $options{'lib'}, openMode => SeqDBI::ReadOnly, maxIDLength => 50 ); open OUT, ">$tempdir/$custLibFile.anti" or die "RepeatMasker::processCustomLib(): Could " . "not create wublast compatable library file " . "$custLibDir/$custLibFile.anti!\n"; foreach my $seqID ( $libDB->getIDs() ) { my $seq = $libDB->getSequence( $seqID ); my $rseq = uc( $seq ); my $desc = $libDB->getDescription( $seqID ); die "Repeat consensus ( $seqID ) contains the " . "word \"anti\" in it's name. This will cause " . "incorrect orientation calls in the output when " . "running with wublast." if ( $seqID =~ /anti/ ); print OUT ">$seqID\n"; $seq =~ s/(.{50})/$1\n/g; print OUT "$seq\n"; $rseq =~ tr/ACGTRYWSKMNXBDHV/TGCAYRSWMKNXVHDB/; $rseq = reverse $rseq; print OUT ">$seqID (anti)\n"; $rseq =~ s/(.{50})/$1\n/g; print OUT "$rseq\n"; } close OUT; if ( $searchEngine->isa( "WUBlastSearchEngine" ) ) { my $currdir = cwd(); chdir( $tempdir ) or die "RepeatMasker::processCustomLib(): " . "Cannot change directory to $tempdir"; system( "$RepeatMaskerConfig::SETDB_PRGM -o $custLibFile " . "$tempdir/$custLibFile.anti " . " > setdb.log 2>&1" ) == 0 or die "RepeatMasker::processCustomLib(): Error invoking setdb " . "on file $tempdir/$custLibFile.anti. We tried using " . "the setdb program ( $RepeatMaskerConfig::SETDB_PRGM ).\n"; chdir( $currdir ) or die "RepeatMasker::processCustomLib(): " . "Cannot change directory to $currdir"; } elsif ( $searchEngine->isa( "DeCypherSearchEngine" ) ) { # TODO Have Mike check this my $outFile = "$tempdir/$custLibFile.anti"; $outFile =~ s/\/cygdrive\/(\w)/$1:/; # handle cygwin drive $outFile =~ /.*\/(.*)$/; # TODO: NO Globals! This should be a parameter system "$RepeatMaskerConfig::DEMAKE -source \"$outFile\" -targ $1"; } undef $libDB; } elsif ( $searchEngine->isa( "NCBIBlastSearchEngine" ) ) { system( "$RepeatMaskerConfig::RMBLASTDB_PRGM -out $tempdir/$custLibFile " . "-dbtype nucl -in $options{'lib'} > " . "$tempdir/makeblastdb.log 2>&1" ); } } ##-------------------------------------------------------------------------## ## my ( $refineableHashRef ) = builRefineableHash( $EMBLDBRef ); ## ## Returns ## A hash: $refineableHashRef = { 'AluJo' => 1, ## 'AluSc' => 1, .. } ## which contains all the id's of repeats which can be ## refined. ## ## Globals Used: None ##-------------------------------------------------------------------------##� sub buildRefineableHash { my $db = shift; my %refineableHash = (); if ( $db->isa( "RepbaseEMBL" ) ) { my $seqCount = $db->getRecordCount(); # For each sequence in the master library for ( my $i = 0 ; $i < $seqCount ; $i++ ) { my $record = $db->getRecord( $i ); my $refineable = $record->getRMRefineable(); if ( $refineable ) { $refineableHash{ $record->getId() } = 1; } } } return ( \%refineableHash ); } ##-------------------------------------------------------------------------## ## my ( $species, $generalCacheDir, $speciesCacheDir, $customLibDir ) = ## initLibraries( ## \%options, $rmLibDir, $tax, ## $tempdir, \@libraryPath, ## $searchEngine ); ## Returns ## ## Globals Used: None ##-------------------------------------------------------------------------##� sub initLibraries { my %options = %{ shift() }; my $rmLibDir = shift; my $tax = shift; my $tempdir = shift; my @libraryPath = @{ shift() }; my $searchEngine = shift; my $dbversion = shift; if ( defined $options{'species'} && defined $options{'lib'} ) { die "You can choose only one species option (including -species\n" . "and -lib) at the time.\n"; } my $species; # The "real" species my $speciesCacheDir = ""; # The directory where we will find the # cache files for species spec. libraries my $generalCacheDir = ""; # The directory where we will find the # cache files for general libraries my $customCacheDir = "$tempdir"; # The directory where we will find the # "-lib" custom library. print "RepeatMasker::initLibraries: Using dbversion $dbversion\n" if ( $DEBUG ); # If the user specified a library and it is already frozen we # should check to see if the names are right??? # NOTE: check not needed here for decypher if ( $options{'lib'} && !$searchEngine->isa( "DeCypherSearchEngine" ) ) { if ( -s $options{'lib'} ) { if ( $searchEngine->isa( "WUBlastSearchEngine" ) ) { if ( -s "$options{'lib'}.bsq" || -s "$options{'lib'}.xps" ) { my ( $custLibVol, $custLibDir, $custLibFile ) = File::Spec->splitpath( $options{'lib'} ); $customCacheDir = $custLibDir || "."; warn "NOTE: Compressed versions of your custom library were\n" . "found in the $custLibDir directory. The program will\n" . "use these by default. If these databases do not contain\n" . "reverse complemented copies of your sequences the reverse\n" . "strand hits will not be returned!"; } else { &processCustomLib( \%options, $tempdir, $searchEngine ); } } elsif ( $searchEngine->isa( "NCBIBlastSearchEngine" ) ) { if ( -s "$options{'lib'}.nsq" || -s "$options{'lib'}.nhr" ) { my ( $custLibVol, $custLibDir, $custLibFile ) = File::Spec->splitpath( $options{'lib'} ); $customCacheDir = $custLibDir || "."; } else { &processCustomLib( \%options, $tempdir, $searchEngine ); } } else { my ( $custLibVol, $custLibDir, $custLibFile ) = File::Spec->splitpath( $options{'lib'} ); $customCacheDir = $custLibDir || "."; } } else { die "RepeatMasker::setspecies: Could not find user specified library " . $options{'lib'} . ".\n"; } } else { # Default species if non given. $species = "homo"; $species = $options{'species'} if ( exists $options{'species'} ); # lookup species name in Taxonomy database if ( ( $species = $tax->isSpecies( $species ) ) eq "" ) { my $dieMsg = "RepeatMasker::initLibraries: Species \"" . $options{'species'} . "\" is not known to RepeatMasker.\nHere are some similar " . "sounding (using Soundex) valid species names:\n"; foreach my $alts ( $tax->getSimilarSoundingSpecies( $options{'species'} ) ) { $dieMsg .= " -species \"$alts\"\n"; } die $dieMsg; } } # Check library search path for cached versions # of libraries. NOTE: cached versions are # stored as follows: # # @libraryPath/$dbversion/general/foo.lib # @libraryPath/$dbversion/$species/foolib # # The first directory in the search path containing # the desired library type is used. # my $speciesWord = $species || ""; $speciesWord =~ s/\s+/_/g; foreach my $path ( @libraryPath ) { if ( -d "$path/$dbversion" ) { if ( -d "$path/$dbversion/$speciesWord" ) { if ( ( $searchEngine->isa( "WUBlastSearchEngine" ) && ( my @pathFiles = glob( "$path/$dbversion/$speciesWord/*.ahd" ) ) ) || ( $searchEngine->isa( "NCBIBlastSearchEngine" ) && ( my @pathFiles = glob( "$path/$dbversion/$speciesWord/*.nhr" ) ) ) || ( $searchEngine->isa( "CrossmatchSearchEngine" ) && ( my @pathFiles = glob( "$path/$dbversion/$speciesWord/*lib" ) ) ) ) { $speciesCacheDir = "$path/$dbversion/$speciesWord"; } } if ( -d "$path/$dbversion/general" ) { if ( ( $searchEngine->isa( "WUBlastSearchEngine" ) && ( my @pathFiles = glob( "$path/$dbversion/general/*.ahd" ) ) && ( -s "$path/$dbversion/general/refineableHash.dat" ) ) || ( $searchEngine->isa( "NCBIBlastSearchEngine" ) && ( my @pathFiles = glob( "$path/$dbversion/general/*.nhr" ) ) && ( -s "$path/$dbversion/general/refineableHash.dat" ) ) || ( $searchEngine->isa( "CrossmatchSearchEngine" ) && -s "$path/$dbversion/general/at.lib" && -s "$path/$dbversion/general/refineableHash.dat" ) ) { $generalCacheDir = "$path/$dbversion/general"; } } } } print "Master RepeatMasker Database: $rmLibDir/RepeatMaskerLib.embl "; if ( $dbversion =~ /-min/ ) { print "( Simple/Low_complexity Only: $dbversion )\n"; } else { print "( Complete Database: $dbversion )\n"; } if ( $options{'lib'} ) { print "Custom Repeat Library: $options{'lib'}\n"; } print "\n\n"; # # If we could not find either the general library cache # or the species library cache we need to build them. # if ( $generalCacheDir eq "" || $speciesCacheDir eq "" ) { ## Need to build some library....must open the database ( NOTE This is slow ) my $db; $db = RepbaseEMBL->new( fileName => "$rmLibDir/RepeatMaskerLib.embl" ); # # Determine the highest level writable directory # my $writableCacheDir = ""; foreach my $path ( @libraryPath ) { if ( -d $path ) { if ( open( TEST, ">$path/rmwritetest.deleteme" ) ) { close TEST; unlink "$path/rmwritetest.deleteme"; $writableCacheDir = $path; last; } } elsif ( mkdir "$path", 0777 ) { $writableCacheDir = $path; last; } } if ( $generalCacheDir eq "" ) { # Need to build libraries: at.lib, simple.lib # l1.lib, mirs.lib, mir.lib, is.lib print STDERR "Building general libraries in: " . "$writableCacheDir/$dbversion/general\n"; # Make cache dir if ( !-d "$writableCacheDir/$dbversion/general" ) { eval { mkpath( "$writableCacheDir/$dbversion/general" ) }; if ( $@ ) { die "RepeatMasker::setspecies: Can't creat dir path " . "$writableCacheDir/$dbversion/general! $@\n"; } } # Build the cached refineable elements hash. These are the IDs of sequences which # can be refined by searching against the refinelib. my $refineableHashRef = buildRefineableHash( $db ); nstore $refineableHashRef, "$writableCacheDir/$dbversion/general/refineableHash.dat"; createLib( \%options, $db, "$writableCacheDir/$dbversion/general/at.lib", "root", "30", $tax, $searchEngine ); createLib( \%options, $db, "$writableCacheDir/$dbversion/general/simple.lib", "root", "25", $tax, $searchEngine ); createLib( \%options, $db, "$writableCacheDir/$dbversion/general/l1.lib", "root", "75", $tax, $searchEngine ); createLib( \%options, $db, "$writableCacheDir/$dbversion/general/is.lib", "root", "10", $tax, $searchEngine ); createLib( \%options, $db, "$writableCacheDir/$dbversion/general/rodspec.lib", "root", "15", $tax, $searchEngine ); createLib( \%options, $db, "$writableCacheDir/$dbversion/general/humspec.lib", "root", "20", $tax, $searchEngine ); $generalCacheDir = "$writableCacheDir/$dbversion/general"; } else { print STDERR "Using general libraries in:\n $generalCacheDir\n" if ( $DEBUG ); } if ( !$options{'lib'} ) { if ( $speciesCacheDir eq "" ) { # Need to build species specific libraries print STDERR "Building species libraries in: " . "$writableCacheDir/$dbversion/$speciesWord\n"; # Make cache dir if ( !-d "$writableCacheDir/$dbversion/$speciesWord" ) { eval { mkpath( "$writableCacheDir/$dbversion/$speciesWord" ) }; if ( $@ ) { die "RepeatMasker::setspecies: Can't create dir path " . "$writableCacheDir/$dbversion/$speciesWord! $@\n"; } } if ( $tax->isA( $species, "mammalia" ) ) { # if ( $tax->isA( $species, "primates" ) # || $tax->isA( $species, "rodentia" ) ) # { # Now also used in monotremes and in future probably elsewhere # alu.lib, rodcutsines.lib => sinecutlib createLib( \%options, $db, "$writableCacheDir/$dbversion/$speciesWord/sinecutlib", $species, "35", $tax, $searchEngine ); # } # cut1.lib, rodcut.lib => shortcutlib createLib( \%options, $db, "$writableCacheDir/$dbversion/$speciesWord/shortcutlib", $species, "40", $tax, $searchEngine ); # cut2.lib, rodcut2.lib, cetartiocut.lib => cutlib createLib( \%options, $db, "$writableCacheDir/$dbversion/$speciesWord/cutlib", $species, "45", $tax, $searchEngine ); # humsines.lib, rod1.lib, cetartio1.lib => shortlib createLib( \%options, $db, "$writableCacheDir/$dbversion/$speciesWord/shortlib", $species, "50", $tax, $searchEngine ); # humlines.lib, rod2.lib => longlib createLib( \%options, $db, "$writableCacheDir/$dbversion/$speciesWord/longlib", $species, "55", $tax, $searchEngine ); # mirs.lib => mirslib createLib( \%options, $db, "$writableCacheDir/$dbversion/$speciesWord/mirslib", $species, "60", $tax, $searchEngine ); # mir.lib => mirlib createLib( \%options, $db, "$writableCacheDir/$dbversion/$speciesWord/mirlib", $species, "65", $tax, $searchEngine ); # retrovirus.lib => retrolib createLib( \%options, $db, "$writableCacheDir/$dbversion/$speciesWord/retrolib", $species, "70", $tax, $searchEngine ); # refinelib createLib( \%options, $db, "$writableCacheDir/$dbversion/$speciesWord/refinelib", $species, "85", $tax, $searchEngine ); } else { # Need to separate into a species.lib createLib( \%options, $db, "$writableCacheDir/$dbversion/$speciesWord/specieslib", $species, "80", $tax, $searchEngine ); } $speciesCacheDir = "$writableCacheDir/$dbversion/$speciesWord"; } else { print STDERR "Using species libraries in:\n $speciesCacheDir\n" if ( $DEBUG ); } undef $db; } } return ( $species, $generalCacheDir, $speciesCacheDir, $customCacheDir ); } 1;