#!/usr/bin/perl ##---------------------------------------------------------------------------## ## File: ## @(#) ProcessRepeats ## Authors: ## Arian Smit ## Robert Hubley ## original ExtractObjects module by Chris Abadjian ## and Todd Smith. ## Description: ## Takes RepeatMasker output and produces an annotation table. ## ## #****************************************************************************** #* #* Copyright (C) Institute for Systems Biology 2002-2008 Developed by #* Arian Smit and Robert Hubley. #* #* Copyright (C) Arian Smit 2000-2001 #* #* Copyright (C) University of Washington 1996-1999 Developed by Arian Smit, #* Philip Green and Colin Wilson of the University of Washington Department of #* Genomics. #* #* This work is licensed under the Open Source License v2.1. To view a copy #* of this license, visit http://www.opensource.org/licenses/osl-2.1.php or #* see the license.txt file contained in this distribution. #* ############################################################################### # ChangeLog # # $Log: ProcessRepeats,v $ # Revision 1.211 2011/04/26 22:41:44 rhubley # Cleanup before a distribution # # ############################################################################### # # =head1 NAME ProcessRepeats - Post process results from RepeatMasker and produce an annotation file. =head1 SYNOPSIS ProcessRepeats [-options] =head1 DESCRIPTION The options are: =over 4 =item -h(elp) Detailed help =item -species Post process RepeatMasker results run on sequence from this species. Default is human. =item -lib Skips most processing, does not produce a .tbl file unless the custome library is in the ">name#class" format. =item -nolow Does not display simple repeats or low_complexity DNA in the annotation. =item -noint Skips steps specific to interspersed repeats, saving lots of time. =item -lcambig Outputs ambiguous DNA transposon fragments using a lower case name. All other repeats are listed in upper case. Ambiguous fragments match multiple repeat elements and can only be called based on flanking repeat information. =item -u Creates an untouched annotation file besides the manipulated file. =item -xm Creates an additional output file in cross_match format (for parsing). =item -ace Creates an additional output file in ACeDB format. =item -gff Creates an additional Gene Feature Finding format. =item -poly Creates an output file listing only potentially polymorphic simple repeats. =item -no_id Leaves out final column with unique number for each element (was default). =item -excln Calculates repeat densities excluding long stretches of Ns in the query. =item -orf2 Results in sometimes negative coordinates for L1 elements; all L1 subfamilies are aligned over the ORF2 region, sometimes improving interpretation of data. =item -a Shows the alignments in a .align output file. =item -maskSource Instructs ProcessRepeats to mask the sequence file using the annotation. =item -x Mask repeats with a lower case 'x'. =item -xsmall Mask repeats by making the sequence lowercase. =back =head1 SEE ALSO =over 4 RepeatMasker, Crossmatch, Blast =back =head1 COPYRIGHT Copyright 2008 Arian Smit, Institute for Systems Biology =head1 AUTHORS Arian Smit Robert Hubley =cut # # Module Dependence # use strict; use FindBin; use lib $FindBin::RealBin; use SeqDBI; use RepeatAnnotationData; use FileHandle; use FastaDB; use SearchResult; use SearchEngineI; use SearchResultCollection; use ArrayList; use CrossmatchSearchEngine; use Getopt::Long; use Taxonomy; use Data::Dumper; # Global variables my $DEBUG = 0; my $DIRECTORY = "$FindBin::Bin"; # # Refinement Hash: # Contains re-alignments for *.cat alignments which were refined. # structure is as follows: # $refinementHash{ "b#s#i#" }->{ ConsensusID } = ( annot1ref, annot2ref, ...) # my %refinementHash = (); # Contains refined annotations ordered by catID my %chainSeq1Length; # A bug in 5.8 produces way too many warnings if ( $] && $] >= 5.008003 ) { use warnings; } # # Option processing # e.g. # -t: Single letter binary option # -t=s: String parameters # -t=i: Number paramters # my @opts = qw (a ace debug excln gff lib=s noint nolow orf2 orifile=s no_id poly species=s u xm mammal mus rat rod|rodent cow pig artiodactyl cat dog carnivore chicken fugu danio drosophila elegans arabidopsis rice wheat maize primate maskSource=s xsmall x lcambig source html refcat=s ); # # Get the supplied command line options, and set flags # my %options = (); unless ( &GetOptions( \%options, @opts ) ) { exec "pod2text $0"; exit( 0 ); } ## Print the internal POD documentation if something is missing if ( $#ARGV == -1 && !$options{'help'} ) { print "No cat file indicated\n\n"; # This is a nifty trick so we don't have to have # a duplicate "USAGE()" subroutine. Instead we # just recycle our POD docs. See PERL POD for more # details. exec "pod2text $0"; die; } ## ## Species Options and Taxonomy Processing ## ## NOTE: Tax needs to be global for the moment my $tax; if ( $options{'species'} ) { # Need to set opt_species, opt_mammal, opt_mus $tax = Taxonomy->new( taxonomyDataFile => "$DIRECTORY/taxonomy.dat" ); if ( $tax->isA( $options{'species'}, "primates" ) ) { $options{'primate'} = 1; } elsif ( $tax->isA( $options{'species'}, "rodentia" ) ) { $options{'mus'} = 1; } elsif ( $tax->isA( $options{'species'}, "mammalia" ) ) { $options{'mammal'} = 1; } if ( $tax->isA( $options{'species'}, "metatheria" ) ) { $options{'marsupial'} = 1; } } else { $options{'species'} = "homo"; $options{'primate'} = 1; } # warning for debug mode print "Note that in debug mode the IDs are not adjusted to go up stepwise\n\n" if $options{'debug'}; # # Check library type used # # TODO: We need different levels of meta data compliance. Ie. # Pound formatting isn't enough anymore. Also we can glean # compliance like pound formatting from the cat file. No # need to open up the lib ( yet ). my $poundformat = ""; if ( $options{'lib'} ) { open( IN, $options{'lib'} ); while ( ) { if ( /^>\S+\#\S+/ ) { $poundformat = 1; # Assuming that this represents my classification formatting; no sure # thing could make it more restrictive by also requiring a backslash last; } } } else { $poundformat = 1; } # # Loop over input files # foreach my $file ( @ARGV ) { open( INCAT, $file ) || die "Can\'t open file $file\n"; # INITIALIZE CAT FILE DATA # Read the cat file and calculate: # - The length of the original sequence ( minus 1/2 of the overlap ) # - The number of N bases # - The number of sequences # - The length of each sequence # - The fraction GC the version the masked length and the dbversion. # Also...grab batch overlap boundaries if information is present. my %batchOverlapBoundaries = (); my $numSearchedSeqs = 0; my $lenSearchedSeqs = 0; my $lenSearchedSeqsExcludingXNRuns = 0; my $lenSearchedSeqsExcludingAmbig = 0; my $versionmode = ""; my $engine = ""; my $dbversion = ""; while ( ) { if ( /^##\s*(\S+)\s+([\d\s,]+)/ ) { $batchOverlapBoundaries{$1} = [ sort( split( /,/, $2 ) ) ]; } elsif ( /^##\s*Total\s+Sequences:\s*(\d+)/i ) { $numSearchedSeqs = $1; } elsif ( /^##\s*Total\s+Length:\s*(\d+)/i ) { $lenSearchedSeqs = $1; } elsif ( /^##\s*Total\s+NonMask.*:\s*(\d+)/i ) { $lenSearchedSeqsExcludingXNRuns = $1; } elsif ( /^##\s*Total\s+NonSub.*:\s*(\d+)/i ) { $lenSearchedSeqsExcludingAmbig = $1; } elsif ( /^RepeatMasker|run|RepBase/ ) { my @bit = split; $versionmode = $_ if $bit[ 0 ] eq 'RepeatMasker'; $engine = $1 if /^(run with.*)/; if ( /^(RepBase.*)/ ) { $dbversion = $1; last; } } } close INCAT; # # Parse the cat file into an object # my $sortedAnnotationsList = undef; if ( $options{'a'} ) { $sortedAnnotationsList = &parseCATFile( file => $file ); } else { $sortedAnnotationsList = &parseCATFile( file => $file, noAlignData => 1 ); } # # Read in the refined annotations # my $refinedAnnots = undef; if ( -f $options{'refcat'} ) { $refinedAnnots = &parseCATFile( file => $options{'refcat'} ); my $refinedIter = $refinedAnnots->getIterator(); while ( $refinedIter->hasNext() ) { my $currentAnnot = $refinedIter->next(); # Consider moving this to CATID my $catID = $currentAnnot->getLastField(); $catID =~ s/[\[\]]//g; push @{ $refinementHash{$catID}->{ $currentAnnot->getHitName() } }, ( $currentAnnot ); } } # Create some filename constants my $catfile = $file; $file =~ s/\.(temp)?cat$//; my $filename = $file; $filename =~ s/(.*\/)//; unless ( $sortedAnnotationsList->size() > 0 ) { my $filenaam = $file; $filenaam = $options{'orifile'} if $options{'orifile'}; open( OUT, ">$file.out" ); print STDERR "\n\n\nNo repetitive sequences were detected in $filenaam\n"; print OUT "There were no repetitive sequences detected in $filenaam\n"; close( OUT ); next; } print STDERR "processing output: "; open( OUTRAW, ">$file.ori.out" ) if ( $options{'u'} || !$poundformat ); # # Initialize data structures and ID each annotation # # Makes global %seq1Lengths # NOTE: Only needed because we are not updating LeftOver # when we modify Seq1Beg/End # $sortedAnnotationsList->sort( \&byNameBeginEndrevSWrev ); my $cycleAnnotIter = $sortedAnnotationsList->getIterator(); my $i = -1; my %seq1Lengths = (); while ( $cycleAnnotIter->hasNext() ) { $i++; my $currentAnnot = $cycleAnnotIter->next(); # Do this early so we capture all information before it is changed. # NOTE: I may make derived elements links to the cat file for # memory brevity. if ( $options{'source'} ) { $currentAnnot->addDerivedFromAnnot( $currentAnnot ); $currentAnnot->setAlignData( undef ); } my $HitName = $currentAnnot->getHitName(); $HitName =~ s/_short_$//; $currentAnnot->setHitName( $HitName ); if ( !defined $seq1Lengths{ $currentAnnot->getSeq1Name() } || $seq1Lengths{ $currentAnnot->getSeq1Name() } < ( $currentAnnot->getSeq1End() + $currentAnnot->getLeftOver() ) ) { $seq1Lengths{ $currentAnnot->getSeq1Name() } = $currentAnnot->getSeq1End() + $currentAnnot->getLeftOver(); } } print STDERR "\ncycle 1 "; #printHitArrayList( $sortedAnnotationsList ); my ( $chainBegRef, $chainEndRef ) = &cycleReJoin( $sortedAnnotationsList ); my %chainBeg = %{$chainBegRef}; my %chainEnd = %{$chainEndRef}; ########################## C Y C L E 2 ############################### # Purpose: Remove Edge Effect Annotations # Join Poly-A Tails # Reconstruct Satellite Annotations ########################################################################## print STDERR "\ncycle 2 "; # Sort by name, begin position, and end position descending $sortedAnnotationsList->sort( \&byNameBeginEndrevSWrev ); # Create an ArrayListIterator for this cycle $cycleAnnotIter = $sortedAnnotationsList->getIterator(); my %colWidths = &getMaxColWidths( $sortedAnnotationsList ); my $DEBUG = 0; $i = -1; CYCLE2: while ( $cycleAnnotIter->hasNext() ) { $i++; print STDERR "." if ( $i + 1 ) % 1000 == 0; # NOTE: An iterator's index is considered # to be in between elements of a datastructure. # To obtain the correct index for the # current element in this pass we should # get the index *before* we move the iterator. my $currentIndex = $cycleAnnotIter->getIndex(); my $currentAnnot = $cycleAnnotIter->next(); if ( $DEBUG ) { print STDERR "CYCLE2: Considering\n"; $currentAnnot->print(); } if ( $currentAnnot->getRightLinkedHit() ) { my $proxIter = $cycleAnnotIter->getIterator(); while ( $proxIter->hasNext() ) { my $nextAnnot = $proxIter->next(); # Quit once we reach our partner last if ( $nextAnnot == $currentAnnot->getRightLinkedHit() ); # Break Recursion Violations if ( $currentAnnot->containsElement( $nextAnnot ) && $nextAnnot->containsElement( $currentAnnot->getRightLinkedHit() ) ) { if ( $DEBUG ) { print STDERR "This violates recursion:\nFirst:\n"; $currentAnnot->printLeftRightLinks(); print STDERR "Second:\n"; $nextAnnot->printLeftRightLinks(); } # Break lower scoring link my $oldRight = $nextAnnot->getRightLinkedHit(); $nextAnnot->setRightLinkedHit( undef ); $oldRight->setLeftLinkedHit( undef ); } } } ## ## Edge Effect Removal ## ## The current method of processing overlaps, while better ## than before still produces some edge effects. This ## section attempts to remove these before we start any ## serious analysis. ## ## Overlaps are handled in RepeatMasker thus: ## ## Middle ## <------ | ## -----> ## Batch#1 | ----x----> ## ...-----------------------------| ## | ## | ## |-----------------------------... ## | BATCH#2 ## <-x- | ## ----> ## | ----------------> ## ## ## RepeatMasker deletes all annotations which are ## contained in the region left/right ( closest to ## the edge ) of the overlap midpoint. Shown here ## with an "x" in the diagram annotations. ## If an annotation spans the midpoint it is kept ## in the cat file. This leaves several types of ## edge effects in annoation: ## ## Perfect or Near Perfect duplicates. Perfect if ## the same matrix was used and the annotation is ## completely contained in the overlap. Near perfect ## otherwise. ## ## Middle ## | ## -----> ## | ## -----> ## ## ## Cut Level Ambiguities. A full length young repeat ## partially contained in the overlap will be excised ## at a lower cutlevel in one batch and masked at ## a higher cutlevel in the other batch. See below ## for some examples. ## ## # Remove exact duplicates. Near duplicates get resolved elsewhere # ( FuseOverlappingSeqs etc ). We do this now especially to handle # duplicated poly-a tails that we are about to join. my $prevAnnot = $sortedAnnotationsList->get( $currentIndex - 1 ) if ( $currentIndex > 0 ); if ( $prevAnnot && $currentAnnot->getSW() == $prevAnnot->getSW() && $currentAnnot->getPctSubst() == $prevAnnot->getPctSubst() && $currentAnnot->getSeq1Name() eq $prevAnnot->getSeq1Name() && $currentAnnot->getSeq1Beg() == $prevAnnot->getSeq1Beg() && $currentAnnot->getSeq1End() == $prevAnnot->getSeq1End() ) { # Lower Scoring Overlap > %90 Covered by Higher Scoring # Get rid of lower scoring one # same cutlevel if ( $DEBUG ) { print STDERR "REMOVING EXACT DUPLICATE:\n"; $currentAnnot->print(); print STDERR " because of:\n"; $prevAnnot->print(); } #$prevAnnot->addDerivedFromAnnot( $currentAnnot ) # if ( $options{'source'} ); # Fix any previous joins to this element $currentAnnot->removeFromJoins(); $cycleAnnotIter->remove(); next CYCLE2; } # # If a large >1000bp ( 1/2 current overlap distance ) # young repeat starts outside the overlap and spans # the middle of the overlap you will get edge effect # duplications. I.e. # # Case #1 Middle # batch1 | # ---------------------------| # -----------------> ( excised cutlevels 0-4 ) # ----X----> ( masked cutlevel 5 ) # |------------------------------ # | batch2 # | # or | # batch1 ----X--> ( masked cutlevel 5 ) # ----------------------------| # |------------------------------- batch2 # -----------------> ( excised cutlevel 0-4 ) # | # # Because excision is only performed on full length # elements (lines can be 5' truncated) this can be # detected by checking for elements which contained by # a cutlevel 5 annotation *and* are less than cutlevel 5. # # Case #2 Middle # | # <---X--- ( cut level 5..just outside the other ) # ----------------------------| # |------------------------------- # | <---------- (excised cut level 0-4 ) # | # | # Case #3 | # (c.l. 5) --> | # -------> ( c.l. 3 ) # -----------------------------| # |---------------------------- # X> | # --------> # # In this case you have a cut level 3 spaning a middle # which is one bp longer in one batch. This creates # a 1bp overlap with a something at a higher cut level. # # So our general rule ends up: Remove all elements # in cutlevels 4/5 which overlap ( by > 10bp ) or are # contained by another element at a lessor cut level. # These cases should never occur outside the overlap # region anyway. # if ( $currentAnnot->getLastField >= 4 ) { my $proxIter = $cycleAnnotIter->getIterator(); $proxIter->previous(); while ( $proxIter->hasPrevious() ) { my $prevAnnot = $proxIter->previous(); last unless ( $currentAnnot->getSeq1Name eq $prevAnnot->getSeq1Name() && $prevAnnot->getSeq1End() >= $currentAnnot->getSeq1Beg() ); if ( $prevAnnot->getLastField() < $currentAnnot->getLastField && $prevAnnot->getSeq1End() - $currentAnnot->getSeq1Beg() >= 10 ) { if ( $DEBUG ) { print STDERR "DELETING MASKED INSIDE CUT:\n"; $currentAnnot->print(); print STDERR " because of previous:\n"; $prevAnnot->print(); } #$prevAnnot->addDerivedFromAnnot( $currentAnnot ) # if ( $options{'source'} ); # Fix any previous joins to this element $currentAnnot->removeFromJoins(); $cycleAnnotIter->remove(); next CYCLE2; } } $proxIter = $cycleAnnotIter->getIterator(); while ( $proxIter->hasNext() ) { my $nextAnnot = $proxIter->next(); last unless ( $currentAnnot->getSeq1Name eq $nextAnnot->getSeq1Name() && $nextAnnot->getSeq1Beg() <= $currentAnnot->getSeq1End() ); if ( $nextAnnot->getLastField() < $currentAnnot->getLastField && $currentAnnot->getSeq1End() - $nextAnnot->getSeq1Beg() >= 10 ) { if ( $DEBUG ) { print STDERR "DELETING MASKED INSIDE CUT:\n"; $currentAnnot->print(); print STDERR " because of next:\n"; $nextAnnot->print(); } #$nextAnnot->addDerivedFromAnnot( $currentAnnot ) # if ( $options{'source'} ); # Fix any previous joins to this element $currentAnnot->removeFromJoins(); $cycleAnnotIter->remove(); next CYCLE2; } } } # # Masklevel Violations From Clipping Boundaries # # RepeatMasker fragments alignments which span # cut out elements. This fragmentation process may # convert a pair of alignments like: # # --------------^-------------> SW=1000 # -----------^-----------------> SW=1500 # # ( where "^" marks the site of a clipped out element ) # into something like: # # SW = 1000 SW = 1000 # --------------> -------------> # -----------> -----------------> # SW = 1500 SW = 1500 # # This little block of code resolves this masklevel # rule-breaker ( lower scoring alignment contained by # higher scoring one ) by elminating the lower scoring # subfragments. # # i.e Delete element if flanking elements include # it and has a better or equal score. # # SW = 1000 # --------------> # -----------> -----------------> # SW = 1500 SW = 1500 # # This is still a bit artificial. # $prevAnnot = $sortedAnnotationsList->get( $currentIndex - 1 ) if ( $currentIndex > 0 ); my $proxIter = $cycleAnnotIter->getIterator(); my ( $prevHitName, $prevClassName ) = split( /\#/, $prevAnnot->getSeq2Name() ) if ( $prevAnnot ); # # Delete iff: # ----current-----^ SW <= past # -----past-------^ # -..----past----------^ # # or # ^---current----- SW <= past # ^----past------- # ^------past----------...-- # if ( $prevAnnot && ( ( $currentAnnot->getSeq1End() == $prevAnnot->getSeq1End() && $currentAnnot->getSW() <= $prevAnnot->getSW() && $currentAnnot->getSeq1Name() eq $prevAnnot->getSeq1Name() && $currentAnnot->getClassName() eq $prevClassName ) || ( $currentAnnot->getSeq1Beg() == $prevAnnot->getSeq1Beg() && $currentAnnot->getSeq1End() <= $prevAnnot->getSeq1End() && $currentAnnot->getSW() <= $prevAnnot->getSW() && $currentAnnot->getSeq1Name() eq $prevAnnot->getSeq1Name() && $currentAnnot->getClassName() eq $prevClassName ) ) ) { if ( $DEBUG ) { print STDERR "Deleting clipping boundary fragment " . "( masklevel violation ):\n"; $prevAnnot->print(); $currentAnnot->print(); } #$prevAnnot->addDerivedFromAnnot( $currentAnnot ) # if ( $options{'source'} ); # Fix any previous joins to this element $currentAnnot->removeFromJoins(); $cycleAnnotIter->remove(); next CYCLE2; } my $Seq2BeginPrint = $currentAnnot->getSeq2Beg(); my $LeftUnalignedPrint = "(" . $currentAnnot->get3PrimeUnaligned() . ")"; my $LeftOverPrint = "(" . $currentAnnot->getLeftOver() . ")"; if ( $currentAnnot->getRevComp() eq "C" ) { $Seq2BeginPrint = "(" . $currentAnnot->get3PrimeUnaligned() . ")"; $LeftUnalignedPrint = $currentAnnot->getSeq2Beg(); } # # Supposedly creates an untouched annotation file. # What it really does is create an annotation file # which has been modified to remove exact duplicates # and batch overlap artifacts only. # if ( $options{'u'} || !$poundformat ) { my $prevAnnot = $sortedAnnotationsList->get( $currentIndex - 1 ) if ( $currentIndex > 0 ); my $nextAnnot = $sortedAnnotationsList->get( $currentIndex + 1 ) if ( $currentIndex < $sortedAnnotationsList->size() - 1 ); my $Overlapped = ""; if ( $prevAnnot && $currentAnnot->getSeq1Beg() <= $prevAnnot->getSeq1End() && $currentAnnot->getSW() < $prevAnnot->getSW() || $nextAnnot && $currentAnnot->getSeq1End() >= $nextAnnot->getSeq1Beg() && $currentAnnot->getSW() < $nextAnnot->getSW() ) { $Overlapped = "*"; } $currentAnnot->setClassName( "" ) unless ( $currentAnnot->getClassName() ); # # sequence names get truncated to 20 letters. Too # cumbersome to change. However, names like # /mnt/user/users/FlipvanTiel/mystuff/sequence1 better be # clipped from the end. Thus: $currentAnnot->setSeq1Name( substr( $currentAnnot->getSeq1Name(), -20 ) ) if ( length $currentAnnot->getSeq1Name() > 20 && $currentAnnot->getSeq1Name() =~ /^\// ); printf OUTRAW "%6d %4s %4s %4s %20s %9s %9s %8s %1s " . "%20s %15s %7s %7s %7s %3s\n", $currentAnnot->getSW(), $currentAnnot->getPctSubst, $currentAnnot->getPctDelete, $currentAnnot->getPctInsert, $currentAnnot->getSeq1Name(), $currentAnnot->getSeq1Beg(), $currentAnnot->getSeq1End(), "(" . $currentAnnot->getLeftOver() . ")", $currentAnnot->getRevComp(), $currentAnnot->getHitName(), $currentAnnot->getClassName(), $Seq2BeginPrint, $currentAnnot->getSeq2End(), $LeftUnalignedPrint; $Overlapped; } # if ( $options{'u'} || !$poundformat ) # # If a user supplied non-classified library was used # if ( !$poundformat ) { if ( $options{'ace'} ) { if ( $currentAnnot->getRevComp() eq "C" ) { print OUTACE "Motif_homol \"" . $currentAnnot->getHitName() . "\" \"RepeatMasker\" " . $currentAnnot->getPctSubst() . " " . $currentAnnot->getSeq1Beg() . " " . $currentAnnot->getSeq1End() . " - " . $currentAnnot->getSeq2End() . " " . $currentAnnot->getSeq2Beg() . "\n"; } else { print OUTACE "Motif_homol \"" . $currentAnnot->getHitName() . "\" \"RepeatMasker\" " . $currentAnnot->getPctSubst() . " " . $currentAnnot->getSeq1Beg() . " " . $currentAnnot->getSeq1End() . " + " . $currentAnnot->getSeq2Beg() . " " . $currentAnnot->getSeq2End() . "\n"; } } if ( $options{'xm'} ) { my $tempclassname = ""; $tempclassname = "\#" . $currentAnnot->getClassName() if ( $currentAnnot->getClassName() ); print OUTXM $currentAnnot->getSW() . " " . $currentAnnot->getPctSubst() . " " . $currentAnnot->getPctDelete() . " " . $currentAnnot->getPctInsert() . " " . $currentAnnot->getSeq1Name() . " " . $currentAnnot->getSeq1Beg() . " " . $currentAnnot->getSeq1End() . " " . $LeftOverPrint . " " . $currentAnnot->getRevComp() . " " . $currentAnnot->getHitName() . $tempclassname . " " . $Seq2BeginPrint . " " . $currentAnnot->getSeq2End() . " " . $LeftUnalignedPrint . "\n"; } if ( $options{'gff'} ) { my $source; if ( $currentAnnot->getHitName() =~ /Alu/ ) { $source = 'RepeatMasker_SINE'; } else { # $source = 'RepeatMasker'; } print OUTGFF "" . $currentAnnot->getSeq1Name() . "\t$source\tsimilarity\t" . $currentAnnot->getSeq1Beg() . "\t" . $currentAnnot->getSeq1End() . "\t" . $currentAnnot->getPctSubst() . "\t" . ( $currentAnnot->getRevComp() eq 'C' ? '-' : '+' ) . "\t.\t" . "Target \"Motif:" . $currentAnnot->getHitName() . "\" " . $currentAnnot->getSeq2Beg() . " " . $currentAnnot->getSeq2End() . "\n"; } } # if ( !$poundformat ) else { # # Satellite Consensi # Searching for satellites with consensi is a hack. # Search engines such as crossmatch will often return # hits to a single repeating pattern as: # | | | | | # abcdefghabcdefghabcdefghabcdefghabcdefgh # ---------> # --------> # --------> # By creating a consensus for the repeat pattern and # another for the same pattern shifted by 1/2 of the # cycle. You can nicely overlapping hits. # These shifted consensi are denoted by the use of a # trailing "_" in the name. # # Here is the regular expression for the syntax: # # [A-Z_]+[a-z]?_?#Satellite # # So here are some examples: # # ALR_#Satellite # ALRa#Satellite # ALRa_#Satellite # CENSAT_MC#Satellite # # Since these trailing "_" variants and the "[a-z]" # variants are equivalent we only need to restore # the original name by stripping these characters # HitName. # # NOTE: The lowercase varieties are not in repbase. # if ( $currentAnnot->getClassName =~ /Satellite/ ) { my $HitName = $currentAnnot->getHitName(); $HitName =~ s/_$//; $HitName =~ s/[a-z]$//; $currentAnnot->setHitName( $HitName ); } if ( $currentAnnot->getClassName =~ /DNA/ ) { # Find all ambiguous dna transposon fragments and generate # equivalency lists &preProcessDNATransp( \%chainBeg, \%chainEnd, $currentAnnot, \%RepeatAnnotationData::repeatDB ); } } } # END CYCLE 2 close( OUTRAW ) if ( $options{'u'} || !$poundformat ); # # If we do not have a pound formatted database we are done! # if ( !$poundformat ) { &generateOutput( \%options, \%seq1Lengths, $file, $filename, $dbversion, $numSearchedSeqs, $lenSearchedSeqs, $lenSearchedSeqsExcludingXNRuns, $versionmode, $engine, $sortedAnnotationsList, $poundformat ); } else { # Continue with the rest of the cycles! ########################## C Y C L E 3 ################################ # # This cycle is currently handling the de-fragmentation # of DNA Transposons using a new method. It also joins simple # repeats together. # # Creates global %conPosCorrection ######################################################################## #printHitArrayList( $sortedAnnotationsList ); # Sort by name, begin position, and end position descending print STDERR "\ncycle 3 "; $sortedAnnotationsList->sort( \&byNameBeginEndrevSWrev ); $i = -1; $cycleAnnotIter = $sortedAnnotationsList->getIterator(); $DEBUG = 0; my %conPosCorrection = (); CYCLE3: while ( $cycleAnnotIter->hasNext() ) { $i++; print STDERR "." if ( $i + 1 ) % 1000 == 0; # NOTE: An iterator's index is considered # to be in between elements of a datastructure. # To obtain the correct index for the # current element in this pass we should # get the index *before* we move the iterator. my $currentIndex = $cycleAnnotIter->getIndex(); my $currentAnnot = $cycleAnnotIter->next(); if ( $currentAnnot->getStatus() eq "DELETED" ) { # NOTE: Annotation already added to derived list. # Fix any previous joins to this element $currentAnnot->removeFromJoins(); $cycleAnnotIter->remove(); next CYCLE3; } if ( $DEBUG ) { print STDERR "Considering:\n"; $currentAnnot->print(); } if ( $currentAnnot->getClassName() =~ /DNA/ && $currentAnnot->getStatus() ne "JOINED" ) { #if ( $DEBUG ) { #print STDERR "DNA Transposon Equivalent:\n"; #print STDERR Dumper( $EquivHash ) . "\n"; #} # Look into our future my $proxIter = $cycleAnnotIter->getIterator(); my @dnaTransposonCluster = (); my $elementDistance = 0; my $ignoreUntil = undef; while ( $proxIter->hasNext() ) { my $nextAnnot = $proxIter->next(); my ( $nextHitName, $nextClassName ) = split( /\#/, $nextAnnot->getSeq2Name() ); if ( $DEBUG ) { print STDERR " -vs-: "; $nextAnnot->print(); } $elementDistance++ if ( $nextClassName =~ /DNA/ ); # # Reasons we wouldn't consider this element in our cluster and # trigger the end to the search # # TODO: Do not join fragments outside of a parent fragment. # last if ( $currentAnnot->getSeq1Name() ne $nextAnnot->getSeq1Name() || $elementDistance > 20 || $nextAnnot->getSeq1End() - $currentAnnot->getSeq1End() > 15000 # max retrovirus insert || $nextAnnot->containsElement( $currentAnnot ) ); if ( $ignoreUntil ) { if ( $ignoreUntil == $nextAnnot ) { $ignoreUntil = undef; } next; } if ( $nextClassName =~ /DNA/ && $nextAnnot->getStatus() ne "JOINED" && $nextAnnot->getStatus() ne "DELETED" ) { push @dnaTransposonCluster, $nextAnnot; } if ( $nextAnnot->getRightLinkedHit() ) { $ignoreUntil = $nextAnnot->getRightLinkedHit(); } } if ( @dnaTransposonCluster ) { # Consider recruiting putative related elements to our cause &joinDNATransposonFragments( \%chainBeg, \%chainEnd, \%RepeatAnnotationData::repeatDB, $currentAnnot, \@dnaTransposonCluster ); } # printHitArrayList( $sortedAnnotationsList ); } if ( $currentAnnot->getHitName =~ /^MLT2/ ) { # middle region of MLT2 variable in length in subfamilies # NOTE: Initializes conPosCorrection{ID} for LTRs &preProcessLTR( \%chainBeg, \%chainEnd, \%conPosCorrection, $currentAnnot ); } elsif ( $currentAnnot->getClassName =~ /LINE/ ) { # I placed this here because I could. # adjust start position of LINE termini and # give generic names to too precisely categorized LINEs # NOTE: Initializes conPosCorrection{ID} for LINEs &preProcessLINE( \%chainBeg, \%chainEnd, \%conPosCorrection, $currentAnnot ); } # merge long simple repeats which were initially partly spliced out # Note that not all satellite entries represent (multiple) # units. Some are complex sequences that can contain # minisatellites in it; overlapping matches to such # subsequences cause 'funny' annotation (in particular, # the location of the match in the consensus sequence is off) if ( $currentAnnot->getClassName() =~ /Simple|Satellite/ ) { if ( $cycleAnnotIter->hasNext() ) { # # current ..----------- # next ..----- # my $proxIter = $cycleAnnotIter->getIterator(); my $nextAnnot = $proxIter->next(); if ( $currentAnnot->getSeq1Name() eq $nextAnnot->getSeq1Name() && $currentAnnot->getRevComp() eq $nextAnnot->getRevComp() && $currentAnnot->getSeq1End() > $nextAnnot->getSeq1End() ) { my $tempname = quotemeta $currentAnnot->getHitName(); if ( $nextAnnot->getSeq2Name() =~ /tempname/ ) { my $thislength = $currentAnnot->getSeq2End() - $currentAnnot->getSeq2Beg() + 1; my $nextlength = $nextAnnot->getSeq2End() - $nextAnnot->getSeq2Beg() + 1; my $Seq2Length = $thislength + $nextlength; my $SW = $nextAnnot->getSW() if ( $nextAnnot->getSW() > $currentAnnot->getSW() ); $currentAnnot->setSW( $SW ); $currentAnnot->setPctSubst( ( $currentAnnot->getPctSubst() * $thislength + $nextAnnot->getPctSubst() * $nextlength ) / $Seq2Length ); $currentAnnot->setPctDelete( ( $currentAnnot->getPctDelete() * $thislength + $nextAnnot->getPctDelete() * $nextlength ) / $Seq2Length ); $currentAnnot->setPctInsert( ( $currentAnnot->getPctInsert() * $thislength + $nextAnnot->getPctInsert() * $nextlength ) / $Seq2Length ); $nextAnnot->removeFromJoins(); $currentAnnot->addDerivedFromAnnot( $nextAnnot ) if ( $options{'source'} ); $proxIter->remove(); } } } # if ( $cycleAnnotIter->hasNext()... # if ( $currentAnnot->getClassName() =~ /Simple/ ) { # Requirement added in January 2005; Satellites should not # be considered This loop will convert all to forward # orientation. The annotation shows then the inverse # complement unit of simple repeats, but such is not # available for satellites. if ( ( $currentAnnot->getPctSubst() + $currentAnnot->getPctDelete() + $currentAnnot->getPctInsert() ) > 15 ) { if ( $currentAnnot->getHitName() =~ /AAA|\(A\)/ ) { if ( $currentAnnot->getRevComp() eq '+' ) { $currentAnnot->setHitName( "A-rich" ); } else { $currentAnnot->setHitName( "T-rich" ); } } elsif ( $currentAnnot->getHitName() =~ /GGG|\(G\)/ ) { if ( $currentAnnot->getRevComp() eq '+' ) { $currentAnnot->setHitName( "G-rich" ); } else { $currentAnnot->setHitName( "C-rich" ); } } if ( $currentAnnot->getHitName() =~ /\([GA]+\)/ ) { if ( $currentAnnot->getSW() < ( $currentAnnot->getSeq2End() - $currentAnnot->getSeq2Beg() + 1 ) * ( 9 - $currentAnnot->getPctSubst() * 16 / 100 - ( $currentAnnot->getPctDelete() + $currentAnnot->getPctInsert() ) * 23 / 100 ) ) { if ( $currentAnnot->getRevComp() eq '+' ) { $currentAnnot->setHitName( "GA-rich" ); } else { $currentAnnot->setHitName( "CT-rich" ); } } else { if ( $currentAnnot->getRevComp() eq '+' ) { $currentAnnot->setHitName( "polypurine" ); } else { $currentAnnot->setHitName( "polypyrimidine" ); } } } } if ( $currentAnnot->getHitName() =~ /^\(/ ) { my $unit = $currentAnnot->getHitName(); $unit =~ s/\((\w+)\)n/$1/; my $merness = length $unit; if ( $currentAnnot->getRevComp() eq "C" ) { unless ( $currentAnnot->getHitName() =~ /\(TA\)|\(TTAA\)|\(CG\)|\(CCGG\)/ ) { $unit = reverse $unit; $unit =~ tr/ACGT/TGCA/; $currentAnnot->setHitName( "($unit)n" ); } } while ( $currentAnnot->getSeq2Beg() > $merness ) { $currentAnnot->setSeq2Beg( $currentAnnot->getSeq2Beg() - $merness ); $currentAnnot->setSeq2End( $currentAnnot->getSeq2End() - $merness ); } } $currentAnnot->setRevComp( "+" ); $currentAnnot->set3PrimeUnaligned( 0 ); } } if ( $currentAnnot->getClassName() eq "Low_complexity" ) { # AT-rich and GC rich not strand specific $currentAnnot->setRevComp( '+' ); $currentAnnot->setSeq2End( $currentAnnot->getSeq2End() - $currentAnnot->getSeq2Beg() + 1 ); $currentAnnot->setSeq2Beg( 1 ); $currentAnnot->set3PrimeUnaligned( 0 ); } } ########################## C Y C L E 4 ################################ # # Join Overlapping Fragments # ######################################################################## #printHitArrayList( $sortedAnnotationsList ); # Sort by name, begin position, and end position descending print STDERR "\ncycle 4 "; $sortedAnnotationsList->sort( \&byNameBeginEndrevSWrev ); $i = -1; ## ## This "joins" overlapping fragments for all types of ## annotations. This in effect replaces the FuseOverlappingSeqs sub. ## ## $DEBUG = 0; $cycleAnnotIter = $sortedAnnotationsList->getIterator(); my $prevAnnot; while ( $cycleAnnotIter->hasNext() ) { $i++; print STDERR "." if ( $i + 1 ) % 1000 == 0; my $currentAnnot = $cycleAnnotIter->next(); if ( $DEBUG ) { print STDERR "Overlapping Fragments Considering:\n"; $currentAnnot->printBrief(); } next if ( $currentAnnot->getClassName() =~ /Simple|Low/ ); my $nextAnnot; my $proxIter = $cycleAnnotIter->getIterator(); my @joinList = (); my $currentEnd = $currentAnnot->getSeq1End(); my $skipUntil = undef; while ( $proxIter->hasNext() ) { $nextAnnot = $proxIter->next(); if ( $DEBUG ) { print STDERR " -vs-> (qo = " . $currentAnnot->getQueryOverlap( $nextAnnot ) . "): "; $nextAnnot->printBrief(); } # Quit once we reach our partner last if ( ( $currentAnnot->getRightLinkedHit() && $nextAnnot == $currentAnnot->getRightLinkedHit() ) || ( !$currentAnnot->getRightLinkedHit() && $currentEnd < $nextAnnot->getSeq1Beg() ) || $currentAnnot->getSeq1Name() ne $nextAnnot->getSeq1Name() ); # ASSERT: No more recursion violations if ( $currentAnnot->getRightLinkedHit() && $currentAnnot->containsElement( $nextAnnot ) && $nextAnnot->containsElement( $currentAnnot->getRightLinkedHit() ) ) { ## Currently DNA Transposons can violate recursion rules. if ( $DEBUG ) { print STDERR "\n\n\nThis violates recursion:\nFirst:\n"; $currentAnnot->printLeftRightLinks(); print STDERR "Second:\n"; $nextAnnot->printLeftRightLinks(); print STDERR "\n\n\n"; die; } } if ( $skipUntil ) { if ( $skipUntil == $nextAnnot ) { $skipUntil = undef; } else { print STDERR " -- Can't consider...inside existing join\n" if ( $DEBUG ); next; } } # # Join Overlapping Elements # # TODO: We join things that perhaps we shouldn't. This is a proposed # change to the joining mechanism. Need to consider the change # from 33 to 50 architectually before we continue with this. # # Query Overlap > 50 # ConsensusOverlap - QueryOverlap < 200 # if ( # ( # $currentEnd - $nextAnnot->getSeq1Beg() > 50 # || ( $currentAnnot->getQueryOverlap( $nextAnnot ) == # ( $nextAnnot->getSeq1End() - $nextAnnot->getSeq1Beg() + 1 ) ) # ) # && $currentAnnot->getRightLinkedHit() != $nextAnnot # && $currentAnnot->getRightLinkedHit() != $currentAnnot # && $currentAnnot->getClassName() eq $nextAnnot->getClassName() # && $currentAnnot->getRevComp() eq $nextAnnot->getRevComp() # && ( $currentAnnot->getConsensusOverlap( $nextAnnot ) - # $currentAnnot->getQueryOverlap( $nextAnnot ) ) < 200 # ) if ( ( $currentEnd - $nextAnnot->getSeq1Beg() > 33 || ( $currentAnnot->getQueryOverlap( $nextAnnot ) == ( $nextAnnot->getSeq1End() - $nextAnnot->getSeq1Beg() + 1 ) ) ) && $currentAnnot->getRightLinkedHit() != $nextAnnot && $currentAnnot->getRightLinkedHit() != $currentAnnot && $currentAnnot->getClassName() eq $nextAnnot->getClassName() && $currentAnnot->getRevComp() eq $nextAnnot->getRevComp() ) { if ( $DEBUG ) { print STDERR " Adding to overlap element cluster (co=" . $currentAnnot->getConsensusOverlap( $nextAnnot ) . ")\n"; } push @joinList, $nextAnnot; $currentEnd = $nextAnnot->getSeq1End() if ( $currentEnd < $nextAnnot->getSeq1End() ); } elsif ( $nextAnnot->getRightLinkedHit() ) { $skipUntil = $nextAnnot->getRightLinkedHit(); } } # while has next if ( @joinList ) { my $leftSide = $currentAnnot; my $equivHash = {}; if ( $DEBUG ) { print STDERR " Joining overlap cluster ---- :\n"; print STDERR " "; $leftSide->printBrief(); } foreach my $partner ( @joinList ) { if ( $DEBUG ) { print STDERR " "; $partner->printBrief(); } $leftSide->join( $partner ); # Don't ambiguate the name unless the overlap is excessive # or the element is subsummed completely by the other. if ( $leftSide->getQueryOverlap( $partner ) > 50 || $leftSide->getSeq1End() >= $partner->getSeq1End() ) { print STDERR " ls = " . $leftSide->getSeq2Name() . " par = " . $partner->getSeq2Name() . "\n" if ( $DEBUG ); $equivHash->{ $leftSide->getSeq2Name() } = 1; $equivHash->{ $partner->getSeq2Name() } = 1; } $leftSide = $partner; } if ( keys( %{$equivHash} ) ) { foreach my $element ( @joinList, $currentAnnot ) { my $newEquivHash = { %{$equivHash} }; if ( defined $equivHash->{ $element->getSeq2Name() } ) { delete $newEquivHash->{ $element->getSeq2Name() }; } $element->setEquivHash( $newEquivHash ); } } } } #printHitArrayList( $sortedAnnotationsList ); ########################## C Y C L E 5 ################################ # # This cycle is currently handling the de-fragmentation # of SINES using a new method. # ######################################################################## #printHitArrayList( $sortedAnnotationsList ); # Sort by name, begin position, and end position descending print STDERR "\ncycle 5 "; if ( $options{'refcat'} ) { $sortedAnnotationsList->sort( \&byNameBeginEndrevSWrev ); $i = -1; $cycleAnnotIter = $sortedAnnotationsList->getIterator(); $DEBUG = 0; my %conPosCorrection = (); ## Create a cycle data structure to hold last 21 join scores. ## ie. prevScoreHash{ ID }->{ HitName } = chainScore ( sum of score function ) my %prevScoreHash = (); CYCLE5: while ( $cycleAnnotIter->hasNext() ) { $i++; print STDERR "." if ( $i + 1 ) % 1000 == 0; # NOTE: An iterator's index is considered # to be in between elements of a datastructure. # To obtain the correct index for the # current element in this pass we should # get the index *before* we move the iterator. my $currentIndex = $cycleAnnotIter->getIndex(); my $currentAnnot = $cycleAnnotIter->next(); my ( $currentHitName, $currentClassName ) = split( /\#/, $currentAnnot->getSeq2Name() ); if ( $DEBUG ) { print STDERR "Considering:\n"; $currentAnnot->print(); if ( $currentAnnot->getRightLinkedHit() ) { print STDERR "Bummer this already is linked forward to:\n "; $currentAnnot->getRightLinkedHit()->print(); } } # # Find first fragment which is capable of joining to the right: # # # ----seed----> ------> # or # <----+ # | # ----seed----> ------> # # NOTE: Currently I am using a non-empty $refinementHash as a # proxy for $currentAnnot->isRefineable() # if ( defined $refinementHash{ $currentAnnot->getCATID() } && keys %{ $refinementHash{ $currentAnnot->getCATID() } } && !$currentAnnot->getRightLinkedHit() ) { if ( $DEBUG ) { print STDERR "First Candidate Seed:\n "; $currentAnnot->print(); } # # Gather a collection of candidates # my @candidateJoins = (); my $proxIter = $cycleAnnotIter->getIterator(); my $elementDistance = 0; my $ignoreUntil = undef; my $lastAnnot = $currentAnnot; my $lastDivAnnot = undef; my $classElementDistance = 0; my $totElementDistance = 0; my $highestInterveningDiv = 0; my $unAnnotatedBP = 0; my $unAnnotatedGaps = 0; while ( $proxIter->hasNext() ) { my $nextAnnot = $proxIter->next(); $elementDistance++; my ( $nextHitName, $nextClassName ) = split( /\#/, $nextAnnot->getSeq2Name() ); if ( $DEBUG ) { print STDERR " -vs-: "; $nextAnnot->print(); } my $queryGap = $lastAnnot->getQueryGap( $nextAnnot ); if ( $queryGap >= 100 ) { $unAnnotatedBP += $queryGap; # Only count a series of low/simple repeats as one gap if ( $currentAnnot->getClassName !~ /^simple|low/i ) { $unAnnotatedGaps++; } } $lastAnnot = $nextAnnot; # We don't care to record the divergence of simple/low complexity if ( $lastDivAnnot && $lastDivAnnot->getClassName() !~ /^simple|low/i ) { $highestInterveningDiv = $lastDivAnnot->getPctSubst() if ( $highestInterveningDiv < $lastDivAnnot->getPctSubst() ); } $lastDivAnnot = $nextAnnot; # Number of intervening elements of the same class $classElementDistance++ if ( $nextClassName eq $currentClassName ); # # Reasons we wouldn't consider this element in our cluster and # trigger the end to the search # # A parent join ends at this element # A much older element is reached ( higher cutlevel ) # A much older intervening ( any class ) element. # A set of unannotated gaps exceeds our threshold # When the 21st annotation is reached # if ( $nextAnnot->containsElement( $currentAnnot ) || $currentAnnot->getSeq1Name() ne $nextAnnot->getSeq1Name() || $elementDistance > 21 || ( $unAnnotatedBP * $unAnnotatedGaps ) > 10000 || ( # Think about these impacts a bit: # Don't give up on high intervening # divergence if ( LTR-int or a SINE ) !&isInternal( $nextAnnot ) && $currentClassName !~ /SINE/ && ( ( $currentAnnot->getSeq2End() - $currentAnnot->getSeq2Beg() ) > 50 && &isTooDiverged( $currentAnnot->getPctSubst(), $highestInterveningDiv ) ) ) ) { last; } # if ( $ignoreUntil ) { if ( $ignoreUntil == $nextAnnot ) { $ignoreUntil = undef; } print STDERR "ignoring...\n" if ( $DEBUG ); next; } if ( defined $refinementHash{ $nextAnnot->getCATID() } && keys %{ $refinementHash{ $nextAnnot->getCATID() } } && !$nextAnnot->getLeftLinkedHit() ) { #print STDERR "Pushing...\n"; push @candidateJoins, $nextAnnot; } if ( $nextAnnot->getRightLinkedHit() ) { print STDERR "This has a right linked hit\n" if ( $DEBUG ); $ignoreUntil = $nextAnnot->getRightLinkedHit(); } } # While ( proxIter->hasNext() ) .... searching for candidates # Do we have some candidate joins for this entry? if ( @candidateJoins ) { if ( $DEBUG ) { print STDERR "Candidates found:\n"; foreach my $tmp ( @candidateJoins ) { print STDERR " + "; $tmp->print(); print "\n"; } } # Lookup seed refinement twin # ie. Given RM Hit $currentAnnot: # -----AluJb------> # # And refinement alignments: # -----AluJo---> # -----FRAM_A---> # ----AluJb------> ====> Locate this "twin" alignment and call it $currentMatch # ----7SLRNA-> # # TODO: Only doing this to get currentLenThresh? # my $currentMatch = getMatchingRefinedEntry( $currentAnnot ); # NOTE: This is to get around strange batch overlap behaviour where the twin doesn't exist # in the refinement set. Found in hg18:chr9:131500000-132000000 $currentMatch = $currentAnnot if ( !defined $currentMatch ); if ( $DEBUG ) { print STDERR "Current match = \n "; $currentMatch->print(); } my $currentLenThresh = ( $currentMatch->getSeq1End() - $currentMatch->getSeq1Beg() + 1 ) * .70; # Filter out hits that are less than 80% of the score of # the highest scoring realignment or 200 less than the # the score of the highest scoring realignment whichever # threshold is more permissive. my $chs = getRefinedHighScore( $currentAnnot ); my $currentScoreThresh = $chs * 0.80; $currentScoreThresh = $chs - 100 if ( $chs - $currentScoreThresh < 100 ); my $prevLinkedAnnot = $currentAnnot->getRightLinkedHit(); my $continuationScoreHash = undef; if ( $prevLinkedAnnot && defined $prevScoreHash{ $currentAnnot->getCATID() } ) { $continuationScoreHash = $prevScoreHash{ $currentAnnot->getCATID() }; } # # Iterate through currentAnnot's refinement alignments and score each # one against each candidate's refinement alignments keeping the highest # score as we go. # my $forwardHighScore = 0; my $forwardHighScoringCandidateEquiv = undef; my $forwardHighScoringCurrentEquiv = undef; my $forwardHighScoringCandidate = undef; my %currentScores = (); foreach my $currentEquiv ( getEquivArray( $currentAnnot ) ) { if ( $DEBUG ) { print STDERR " - Current Seed Equiv: "; $currentEquiv->print; } # Don't bother with tiny sub-alignments produced by realignment next if ( $currentEquiv->getSeq1End() - $currentEquiv->getSeq1Beg() + 1 < $currentLenThresh ); print STDERR " - not tiny\n" if ( $DEBUG ); next if ( $currentEquiv->getSW() < $currentScoreThresh ); print STDERR " - high enough score\n" if ( $DEBUG ); # Don't bother if consensus remaining is lower next if ( ( $currentEquiv->getRevComp() ne "C" && $currentEquiv->get3PrimeUnaligned() < 20 ) || ( $currentEquiv->getRevComp() eq "C" && $currentEquiv->getSeq1Beg() < 20 ) ); print STDERR " - enough consensus remaining\n" if ( $DEBUG ); # TODO: If currentAnnot is joined previously then use the list of possible names # to constrain this search. next if ( $continuationScoreHash && ! defined $continuationScoreHash->{ $currentEquiv->getHitName() } ); print STDERR " - previous joins make this name possible\n" if ( $DEBUG ); my $sourceScore = 0; my $score = 0; my @finalJoins = (); my @finalEquivs = (); my $compatibleDistance = 1; my @prevCandidates = (); foreach my $candidate ( @candidateJoins ) { my $candidateMatch = getMatchingRefinedEntry( $candidate ); next if ( !defined $candidateMatch ); my $candidateLenThresh = ( $candidateMatch->getSeq1End() - $candidateMatch->getSeq1Beg() + 1 ) * .70; # Filter out hits that are less than 80% of the score of # the highest scoring realignment or 200 less than the # the score of the highest scoring realignment whichever # threshold is more permissive. my $chs = getRefinedHighScore( $candidate ); #print STDERR "Chs = $chs\n"; my $candidateScoreThresh = $chs * 0.80; $candidateScoreThresh = $chs - 100 if ( $chs - $candidateScoreThresh < 100 ); my $candidateEquivHighScore = 0; my $cEquiv = undef; foreach my $candidateEquiv ( getMatchingRefinedEntries( $candidate, $currentEquiv ) ) { next if ( $candidateEquiv->getSeq1End() - $candidateEquiv->getSeq1Beg() + 1 < $candidateLenThresh ); #print STDERR "Threshold = $candidateScoreThresh\n"; next if ( $candidateEquiv->getSW() < $candidateScoreThresh ); my $tmpScore = scoreRefinedSINEPair( $currentEquiv, $candidateEquiv, $compatibleDistance ); if ( $tmpScore > $candidateEquivHighScore ) { $candidateEquivHighScore = $tmpScore; $cEquiv = $candidateEquiv; } } # foreach candidateEquiv if ( $continuationScoreHash && $candidateEquivHighScore && defined $continuationScoreHash->{ $currentEquiv->getHitName() } ) { print STDERR "Adjusting candidate score due to prevScoreHash entry\n" if ( $DEBUG ); $candidateEquivHighScore += $continuationScoreHash->{ $currentEquiv->getHitName() }; } if ( $candidateEquivHighScore ) { ## Check backwards in ( @candidates ) to make sure that the ## isn't an intervening higher scoring join. my $backwardHighScore = 0; my $cEquivLengthThresh = ( $cEquiv->getSeq1End() - $cEquiv->getSeq1Beg() + 1 ) * .70; my $cDistance = 1; foreach my $highScoreCandidateEquiv ( getEquivArray( $candidate ) ) { # Don't bother with tiny sub-alignments produced by realignment next if ( $highScoreCandidateEquiv->getSeq1End() - $highScoreCandidateEquiv->getSeq1Beg() + 1 < $cEquivLengthThresh ); my $prevCandidateEquivHighScore = 0; foreach my $prevCandidate ( @prevCandidates ) { my $prevCandidateMatch = getMatchingRefinedEntry( $prevCandidate ); next if ( !defined $prevCandidateMatch ); my $prevCandidateLenThresh = ( $prevCandidateMatch->getSeq1End() - $prevCandidateMatch->getSeq1Beg() + 1 ) * .70; foreach my $prevCandidateEquiv ( getMatchingRefinedEntries( $prevCandidate ) ) { next if ( $prevCandidateEquiv->getSeq1End() - $prevCandidateEquiv->getSeq1Beg() + 1 < $prevCandidateLenThresh ); my $tmpScore = scoreRefinedSINEPair( $prevCandidateEquiv, $highScoreCandidateEquiv, $cDistance ); #print STDERR "tmpScore = $tmpScore, "; if ( $tmpScore > $prevCandidateEquivHighScore ) { $prevCandidateEquivHighScore = $tmpScore; } } } if ( $prevCandidateEquivHighScore > $backwardHighScore ) { $cDistance++; $backwardHighScore = $prevCandidateEquivHighScore; } } if ( $backwardHighScore < $candidateEquivHighScore ) { print STDERR "We have a new forward high score of $candidateEquivHighScore\n" if ( $DEBUG ); $compatibleDistance++; $currentScores{ $currentEquiv->getHitName() } = $candidateEquivHighScore; if ( $candidateEquivHighScore > $forwardHighScore ) { $forwardHighScore = $candidateEquivHighScore; $forwardHighScoringCandidateEquiv = $cEquiv; $forwardHighScoringCurrentEquiv = $currentEquiv; $forwardHighScoringCandidate = $candidate; } } else { print STDERR "Backward high score of $backwardHighScore invalidates" . " the current forward high score of $candidateEquivHighScore\n" if ( $DEBUG ); } } push @prevCandidates, $candidate; } # for each candidate } # foreach currentEquiv if ( $DEBUG ) { print STDERR "forwardHighScore = $forwardHighScore\n"; if ( keys( %currentScores ) > 1 ) { print STDERR "Other high scores:\n"; foreach my $key ( keys( %currentScores ) ) { print STDERR " $key = " . $currentScores{$key} . "\n"; } } } # Join code if ( $forwardHighScore ) { if ( $DEBUG ) { print STDERR "Joining: \n "; $forwardHighScoringCurrentEquiv->print(); print STDERR " with \n "; $forwardHighScoringCandidateEquiv->print(); } if ( defined $continuationScoreHash ) { print STDERR "Updating prevScoreHash!\n" if ( $DEBUG ); foreach my $key ( keys( %{$continuationScoreHash} ) ) { if ( defined $currentScores{$key} ) { $prevScoreHash{ $forwardHighScoringCandidate->getCATID() } ->{$key} = $prevScoreHash{ $currentAnnot->getCATID() }->{$key} + $currentScores{$key}; } } } else { print STDERR "Creating new prevScoreHash\n" if ( $DEBUG ); $prevScoreHash{ $forwardHighScoringCandidate->getCATID() } = {%currentScores}; #print "Dumper prevScoreHash: " . Dumper( \%prevScoreHash ) . "\n"; } # TODO: Trim prevScoreHash so it doesn't just keep growing! my $leftAnnot = $currentAnnot; my $leftEquiv = $forwardHighScoringCurrentEquiv; if ( $leftAnnot->getHitName() ne $leftEquiv->getHitName() ) { ## Special case: If leftAnnot was fragmented by RM then we ## need to treat the equivalent differently. ## if ( $leftAnnot->getLeftLinkedHit() && $leftAnnot->getCATID() eq $leftAnnot->getLeftLinkedHit()->getCATID() ) { &replaceRMFragmentChainWithRefinement( $leftAnnot, $leftEquiv ); } else { $leftAnnot->setHitName( $leftEquiv->getHitName() ); $leftAnnot->setSW( $leftEquiv->getSW() ); $leftAnnot->setPctSubst( $leftEquiv->getPctSubst() ); $leftAnnot->setPctDelete( $leftEquiv->getPctDelete() ); $leftAnnot->setPctInsert( $leftEquiv->getPctInsert() ); $leftAnnot->setSeq1Name( $leftEquiv->getSeq1Name() ); $leftAnnot->setSeq1Beg( $leftEquiv->getSeq1Beg() ); $leftAnnot->setSeq1End( $leftEquiv->getSeq1End() ); $leftAnnot->setLeftOver( $leftEquiv->getLeftOver() ); $leftAnnot->setRevComp( $leftEquiv->getRevComp() ); $leftAnnot->setHitName( $leftEquiv->getHitName() ); $leftAnnot->setClassName( $leftEquiv->getClassName() ); $leftAnnot->setSeq2Beg( $leftEquiv->getSeq2Beg() ); $leftAnnot->setSeq2End( $leftEquiv->getSeq2End() ); $leftAnnot->set3PrimeUnaligned( $leftEquiv->get3PrimeUnaligned() ); } $leftAnnot->setDerivedFromAnnot( $leftEquiv ); } my $rightAnnot = $forwardHighScoringCandidate; $leftAnnot->setRightLinkedHit( $rightAnnot ); $rightAnnot->setLeftLinkedHit( $leftAnnot ); my $rightEquiv = $forwardHighScoringCandidateEquiv; if ( $rightAnnot->getHitName() ne $rightEquiv->getHitName() || ( $rightAnnot->getSW() < $rightEquiv->getSW() ) ) { # Special case: Given that b1s4i3 joins to the previously joined # RM fragmented repeat b1s6i8 # # +-----------+ # ---b1s4i3---> | | # ---b1s6i8---> -----b1s6i8---> # # and given the relalignment: # # --------[b1s6i8]----------> # # we must fragment [b1s6i8] into something approximating the # original fragments and then replace the original ones. # if ( $rightAnnot->getRightLinkedHit() && $rightAnnot->getCATID() eq $rightAnnot->getRightLinkedHit()->getCATID() ) { &replaceRMFragmentChainWithRefinement( $rightAnnot, $rightEquiv ); } else { $rightAnnot->setHitName( $rightEquiv->getHitName() ); $rightAnnot->setSW( $rightEquiv->getSW() ); $rightAnnot->setPctSubst( $rightEquiv->getPctSubst() ); $rightAnnot->setPctDelete( $rightEquiv->getPctDelete() ); $rightAnnot->setPctInsert( $rightEquiv->getPctInsert() ); $rightAnnot->setSeq1Name( $rightEquiv->getSeq1Name() ); $rightAnnot->setSeq1Beg( $rightEquiv->getSeq1Beg() ); $rightAnnot->setSeq1End( $rightEquiv->getSeq1End() ); $rightAnnot->setLeftOver( $rightEquiv->getLeftOver() ); $rightAnnot->setRevComp( $rightEquiv->getRevComp() ); $rightAnnot->setHitName( $rightEquiv->getHitName() ); $rightAnnot->setClassName( $rightEquiv->getClassName() ); $rightAnnot->setSeq2Beg( $rightEquiv->getSeq2Beg() ); $rightAnnot->setSeq2End( $rightEquiv->getSeq2End() ); $rightAnnot->set3PrimeUnaligned( $rightEquiv->get3PrimeUnaligned() ); } $rightAnnot->setDerivedFromAnnot( $rightEquiv ); } $leftAnnot = $rightAnnot; } } # if ( @candidateJoins ) } # if seed } # cycle5a } # if refcat..... #printHitArrayList( $sortedAnnotationsList ); ########################## C Y C L E 6 ################################ # # LINE/LTR/SINE Joining Algorithm # ######################################################################## print STDERR "\ncycle 6 "; $cycleAnnotIter = $sortedAnnotationsList->getIterator(); $i = -1; $DEBUG = 0; while ( $cycleAnnotIter->hasNext() ) { $i++; print STDERR "." if ( $i + 1 ) % 1000 == 0; my $currentAnnot = $cycleAnnotIter->next(); my $currentHitName = $currentAnnot->getHitName(); my $currentClassName = $currentAnnot->getClassName(); # # This is not a simple/low complexity joinging loop. # Skip these and do not include them in the intervening # divergence calculations. Also skip DNA Transposons # which were joined in a previous cycle. # next if ( $currentClassName =~ /^simple|low|dna/i ); my $maxScore = 0; my $maxScoringAnnot = undef; # # Do not attempt to link fragments which are already # pre-linked. # if ( !$currentAnnot->getRightLinkedHit() ) { if ( $DEBUG ) { print STDERR "\nCYCLE6: Considering:\n"; $currentAnnot->print(); } # Look into our future my $proxIter = $cycleAnnotIter->getIterator(); my $ignoreUntil; my $classElementDistance = 0; my $totElementDistance = 0; my $highestInterveningDiv = 0; my $unAnnotatedBP = 0; my $unAnnotatedGaps = 0; my $lastDivAnnot = undef; my $lastAnnot = $currentAnnot; my $score = 0; while ( $proxIter->hasNext() ) { my $nextAnnot = $proxIter->next(); my $nextHitName = $nextAnnot->getHitName(); my $nextClassName = $nextAnnot->getClassName(); $totElementDistance++; # # A new statistic for giving up on joins # - Count all un-annoted gaps > threshold ( currently 100 ) # - Sum all bp in the above gaps # my $queryGap = $lastAnnot->getQueryGap( $nextAnnot ); if ( $queryGap >= 100 ) { $unAnnotatedBP += $queryGap; # Only count a series of low/simple repeats as one gap if ( $currentAnnot->getClassName !~ /^simple|low/i ) { $unAnnotatedGaps++; } } $lastAnnot = $nextAnnot; # We don't care to record the divergence of simple/low complexity if ( $lastDivAnnot && $lastDivAnnot->getClassName() !~ /^simple|low/i ) { $highestInterveningDiv = $lastDivAnnot->getPctSubst() if ( $highestInterveningDiv < $lastDivAnnot->getPctSubst() ); } $lastDivAnnot = $nextAnnot; # Number of intervening elements of the same class $classElementDistance++ if ( $nextClassName eq $currentClassName ); if ( $DEBUG ) { print STDERR " $totElementDistance -vs-> "; $nextAnnot->print(); print STDERR " unAnnotatedBP = $unAnnotatedBP * unAnnoatedGaps = $unAnnotatedGaps < 10000\n"; } # # Reasons we wouldn't consider this element in our cluster and # trigger the end to the search # # A parent join ends at this element # A much older element is reached ( higher cutlevel ) # A much older intervening ( any class ) element. # A set of unannotated gaps exceeds our threshold # When the 21st annotation is reached # if ( $nextAnnot->containsElement( $currentAnnot ) || $currentAnnot->getSeq1Name() ne $nextAnnot->getSeq1Name() || $totElementDistance > 21 || ( $unAnnotatedBP * $unAnnotatedGaps ) > 10000 || ( # Think about these impacts a bit: # Don't give up on high intervening # divergence if ( LTR-int or a SINE ) !&isInternal( $nextAnnot ) && $currentClassName !~ /SINE/ && ( ( $currentAnnot->getSeq2End() - $currentAnnot->getSeq2Beg() ) > 50 && &isTooDiverged( $currentAnnot->getPctSubst(), $highestInterveningDiv ) ) ) ) { if ( $DEBUG ) { print STDERR " ---> This element is beyond our " . "consideration boundary:\n"; print STDERR " totElementDistance = " . "$totElementDistance\n"; print STDERR " highestInterveningDiv = " . "$highestInterveningDiv\n"; print STDERR " unAnnotatedBP = $unAnnotatedBP\n"; print STDERR " unAnnotatedGaps = $unAnnotatedGaps\n"; print STDERR " isTooDiverged() = " . &isTooDiverged( $currentAnnot->getPctSubst(), $highestInterveningDiv ) . "\n"; print STDERR " containsElement ==> " . $nextAnnot->containsElement( $currentAnnot ) . "\n"; } last; } # # Move along if we are moving over a joined set of # fragments ( and their children ) which cannot be joined # to us. This flag is set below. # # +--------joined------------+ # | | # --current-- -----+---- --- ------ ---+--- ----- # *skip* *skip* *skip* # if ( $ignoreUntil && $ignoreUntil != $nextAnnot ) { print STDERR " --> Ignoring joined fragments\n" if ( $DEBUG ); next; } else { $ignoreUntil = undef; } # # Do not consider this one if it has a much lower divergence than # something in between. This is the symmetric opposite of the # catch above. # # TODO: Consider doing this for SINES as above # if ( !&isInternal( $nextAnnot ) && $nextAnnot->getSeq2End() - $nextAnnot->getSeq2Beg() > 50 && &isTooDiverged( $nextAnnot->getPctSubst(), $highestInterveningDiv ) ) { if ( $DEBUG ) { print STDERR " --> This has a much lower div than something " . "in between: interveningHitDiv = $highestInterveningDiv\n"; } next; } ##### TODO Uncomment this and remove the duplicate code at bottom # # Need to ignore all element until we reach the right-hand element. # if ( $nextAnnot->getRightLinkedHit() ) { if ( $DEBUG ) { print STDERR " ---> Has right linked hit...ignoring until: "; $nextAnnot->getRightLinkedHit()->print(); } $ignoreUntil = $nextAnnot->getRightLinkedHit(); } # # Don't join fragments of different classes. # if ( $nextClassName ne $currentClassName ) { print STDERR " --> Not same class...moving on\n" if ( $DEBUG ); next; } # # Only elements not already linked to something before the # current element: # # +--------joined------------+ # | | # --current-- -----+---- --- ------ ---+--- ----- # *skip* # if ( !$nextAnnot->getLeftLinkedHit() ) { # LINE Equivalences ( SINE/LTR are not handled this way ) my @currentNames = ( $currentAnnot->getSeq2Name() ); if ( defined $currentAnnot->getEquivHash() ) { push @currentNames, keys( %{ $currentAnnot->getEquivHash() } ); } #print STDERR "Dumper: " . Dumper(\@currentNames) . "\n"; my @nextNames = ( $nextAnnot->getSeq2Name() ); if ( defined $nextAnnot->getEquivHash() ) { push @nextNames, keys( %{ $nextAnnot->getEquivHash() } ); } my $savedCName = $currentAnnot->getSeq2Name(); my $savedNName = $nextAnnot->getSeq2Name(); foreach my $cName ( @currentNames ) { print STDERR " -->trying current name: $cName\n" if ( $DEBUG ); $currentAnnot->setSeq2Name( $cName ); foreach my $nName ( @nextNames ) { print STDERR " -->trying next name: $nName\n" if ( $DEBUG ); $nextAnnot->setSeq2Name( $nName ); if ( $currentClassName =~ /LINE/ ) { $score = &scoreLINEPair( $currentAnnot, $nextAnnot, $classElementDistance, \%options ); } elsif ( $currentClassName =~ /LTR/ ) { $score = &scoreLTRPair( $currentAnnot, $nextAnnot, $totElementDistance, $classElementDistance ); } elsif ( $currentClassName =~ /SINE/ && $currentClassName !~ /Alu/ ) { $score = &scoreSINEPair( $currentAnnot, $nextAnnot, $totElementDistance, $classElementDistance ); } else { $score = &scoreGenericPair( $currentAnnot, $nextAnnot, $totElementDistance, $classElementDistance ); } if ( $score > 0 ) { print STDERR " --> Ambiguous names matched:" . " $cName, $nName\n" if ( $DEBUG ); last; } } last if ( $score > 0 ); } # foreach my $cName... $currentAnnot->setSeq2Name( $savedCName ); $nextAnnot->setSeq2Name( $savedNName ); print STDERR " ---Wow saved one!\n" if ( $score > 0 && $DEBUG ); # # If we found a compatible match we now look backwards # to see if there was a better intervening match to # the candidate. I.e preserve the "best-closest-match" # concept. # # +------------good-----------------+ # | +-better-+ | # | | | | # ----current--- ----- ---+--- ---+----+--- # # Ignore this match if there is a better intervening # match. # if ( $score > 0 ) { print STDERR " \\----> score = $score\n" if ( $DEBUG ); my $revIter = $proxIter->getIterator(); $revIter->previous(); my $inBetweenDistance = 0; my $classBetweenDistance = 0; my $inBetweenScore = 0; my $prevAnnot; print STDERR " Looking backwards to see if there was a " . "better match\n" if ( $DEBUG ); while ( $revIter->hasPrevious() ) { $prevAnnot = $revIter->previous(); if ( $DEBUG ) { print STDERR " -vs-> "; $prevAnnot->printBrief(); } $inBetweenDistance++; $classBetweenDistance++; # Look backwards up to $currentAnnot last if ( $prevAnnot == $currentAnnot ); next if ( $prevAnnot->getClassName() ne $currentAnnot->getClassName() ); if ( $currentClassName =~ /LINE/ ) { $inBetweenScore = &scoreLINEPair( $prevAnnot, $nextAnnot, $inBetweenDistance, \%options ); } elsif ( $currentClassName =~ /LTR/ ) { $inBetweenScore = &scoreLTRPair( $prevAnnot, $nextAnnot, $inBetweenDistance, $classBetweenDistance ); } elsif ( $currentClassName =~ /SINE/ ) { $inBetweenScore = &scoreSINEPair( $prevAnnot, $nextAnnot, $inBetweenDistance ); } else { $inBetweenScore = &scoreGenericPair( $prevAnnot, $nextAnnot, $inBetweenDistance, $classElementDistance ); } last if ( $inBetweenScore >= $score ); } if ( $inBetweenScore >= $score ) { # Abandon match, there is an intervening better match print STDERR " \\--> Better intervening score " . "$inBetweenScore\n" if ( $DEBUG ); } else { print STDERR " ---> Sticking with first match: score=$score\n" if ( $DEBUG ); if ( $maxScore < $score ) { $maxScore = $score; $maxScoringAnnot = $nextAnnot; } } } } else { if ( $DEBUG ) { print STDERR " ---> Already linked to something on left" . "\n "; $nextAnnot->getLeftLinkedHit()->printBrief(); } } #### TODO: REMOVE THIS # # Need to ignore all element until we reach the right-hand element. # #if ( $nextAnnot->getRightLinkedHit() ) { # print STDERR " ---> Has right linked hit...ignoring until\n" # if ( $DEBUG ); # $ignoreUntil = $nextAnnot->getRightLinkedHit(); #} } # while ( $proxIter->hasNext()... if ( $maxScore > 0 ) { if ( $DEBUG ) { print STDERR " *** And the winner is: score = $maxScore\n "; $maxScoringAnnot->print(); } $currentAnnot->setRightLinkedHit( $maxScoringAnnot ); $maxScoringAnnot->setLeftLinkedHit( $currentAnnot ); } } } $DEBUG = 0; #printHitArrayList( $sortedAnnotationsList ); ########################## C Y C L E 7 ################################ # # Name joined LINE/LTR clusters # ######################################################################## print STDERR "\ncycle 7 "; ## ## Name joined fragments & pick consensus adjustment ## ## The name of a chain is determined by the highest scoring ## fragment in the highest priority model group. The model ## groups are ( in order of priority ) 3-Prime End Models, ## 5-Prime End Models, ORF Models, and Undesignated Models. ## ## Also the consensus adjustment is calculated by finding ## the highest consensus adjustment from conPosCorrection ## for the given fragment chain. ## $DEBUG = 0; $cycleAnnotIter = $sortedAnnotationsList->getIterator(); $i = -1; while ( $cycleAnnotIter->hasNext() ) { $i++; print STDERR "." if ( $i + 1 ) % 1000 == 0; my $currentAnnot = $cycleAnnotIter->next(); if ( $DEBUG ) { print STDERR "Considering: "; $currentAnnot->print(); } # RMH: IN DEVELOPMENT # TODO: Handle pre-joined non-chain fragments # TODO: Get rid of refcat an make dependent on refined datastructure if ( $options{'refcat'} && defined $refinementHash{ $currentAnnot->getCATID() } && $currentAnnot->getSeq2Name =~ /Alu/ && !$currentAnnot->getLeftLinkedHit() && !$currentAnnot->getRightLinkedHit() ) { my $currentAnnotRefinedMatch = getMatchingRefinedEntry( $currentAnnot ); # See above problem with batch overlaps $currentAnnotRefinedMatch = $currentAnnot if ( !defined $currentAnnotRefinedMatch ); my $lengthThreshold = ( $currentAnnotRefinedMatch->getSeq1End() - $currentAnnotRefinedMatch->getSeq1Beg() ) + 1 - 10; if ( $DEBUG ) { print STDERR "Match: "; $currentAnnotRefinedMatch->print(); } my $maxScore = 0; my $maxScoringElement = undef; foreach my $ele ( getEquivArray( $currentAnnot ) ) { if ( $ele->getSW() > $maxScore ) { $maxScore = $ele->getSW(); $maxScoringElement = $ele; } } print STDERR "lengthThreshold = $lengthThreshold\n" if ( $DEBUG ); if ( $maxScoringElement && ( $maxScoringElement->getSeq1End() - $maxScoringElement->getSeq1Beg + 1 > $lengthThreshold ) ) { $currentAnnot->setHitName( $maxScoringElement->getHitName() ); $currentAnnot->setSW( $maxScoringElement->getSW() ); $currentAnnot->setPctSubst( $maxScoringElement->getPctSubst() ); $currentAnnot->setPctDelete( $maxScoringElement->getPctDelete() ); $currentAnnot->setPctInsert( $maxScoringElement->getPctInsert() ); $currentAnnot->setSeq1Name( $maxScoringElement->getSeq1Name() ); $currentAnnot->setSeq1Beg( $maxScoringElement->getSeq1Beg() ); $currentAnnot->setSeq1End( $maxScoringElement->getSeq1End() ); $currentAnnot->setLeftOver( $maxScoringElement->getLeftOver() ); $currentAnnot->setRevComp( $maxScoringElement->getRevComp() ); $currentAnnot->setHitName( $maxScoringElement->getHitName() ); $currentAnnot->setClassName( $maxScoringElement->getClassName() ); $currentAnnot->setSeq2Beg( $maxScoringElement->getSeq2Beg() ); $currentAnnot->setSeq2End( $maxScoringElement->getSeq2End() ); $currentAnnot->set3PrimeUnaligned( $maxScoringElement->get3PrimeUnaligned() ); $currentAnnot->setDerivedFromAnnot( $maxScoringElement ); if ( $DEBUG ) { print STDERR "I did this with it: "; $currentAnnot->print(); } } } # We only care about LINES/LTR for the rest next if ( $currentAnnot->getSeq2Name !~ /LINE|LTR/ ); # Consider only left ( from our direction of iteration ) # fragment ends if ( $currentAnnot->getRightLinkedHit() && !$currentAnnot->getLeftLinkedHit() || !$currentAnnot->getLeftLinkedHit() && !$currentAnnot->getRightLinkedHit() ) { my @elements = (); push @elements, $currentAnnot; # Follow links until end my $nextInChain = $currentAnnot; my $highestConCorr = $conPosCorrection{ $nextInChain->getID() }; if ( $DEBUG ) { print STDERR "Naming Current:\n"; $currentAnnot->print(); } while ( $nextInChain ) { if ( $DEBUG ) { print STDERR " -- "; $nextInChain->print(); print STDERR " \\--- ID=" . $nextInChain->getID() . " con{ID} = " . $conPosCorrection{ $nextInChain->getID() } . " highest = " . $highestConCorr . "\n"; } push @elements, $nextInChain; $highestConCorr = $conPosCorrection{ $nextInChain->getID() } if ( $conPosCorrection{ $nextInChain->getID() } > $highestConCorr ); $nextInChain = $nextInChain->getRightLinkedHit(); } $nextInChain = $elements[ $#elements ]; # LTR or LINE my $newLINEName = ""; my $newLTRName = ""; my $newLTRIntName = ""; if ( $currentAnnot->getClassName =~ /LTR/ ) { @elements = sort { isLTR( $b ) <=> isLTR( $a ) || $b->getSW() <=> $a->getSW(); } @elements; my $winner = $elements[ 0 ]; $newLTRName = $winner->getHitName() if ( isLTR( $winner ) ); @elements = sort { isInternal( $b ) <=> isInternal( $a ) || $b->getSW() <=> $a->getSW(); } @elements; $winner = shift @elements; $newLTRIntName = $winner->getHitName() if ( isInternal( $winner ) ); $newLTRIntName =~ s/-int//g; } else { @elements = sort { my $nameA = $a->getSeq2Name(); my $nameB = $b->getSeq2Name(); ( $nameA ) = ( $nameA =~ /(_5end|_3end|_orf2)/ ); ( $nameB ) = ( $nameB =~ /(_5end|_3end|_orf2)/ ); $nameA = "_zzz" if ( $nameA eq "" ); $nameB = "_zzz" if ( $nameB eq "" ); $nameA cmp $nameB || $b->getSW() <=> $a->getSW(); } @elements; my $winner = shift @elements; $newLINEName = $winner->getSeq2Name(); } # Now change the names while ( $nextInChain ) { if ( $currentAnnot->getClassName() =~ /LTR/ ) { if ( isInternal( $nextInChain ) ) { if ( $newLTRName && $nextInChain->getClassName() =~ /ERVL-MaLR/ ) { # Name after LTR $nextInChain->setHitName( $newLTRName . "-int" ); } elsif ( $newLTRIntName ) { # Name after highest scoring internal $nextInChain->setHitName( $newLTRIntName . "-int" ); } } else { $nextInChain->setHitName( $newLTRName ); } } else { $nextInChain->setSeq2Name( $newLINEName ); } if ( $DEBUG ) { print STDERR "Fixing conPosCorrection(*$highestConCorr): "; $nextInChain->print(); } # NOTE: Design improvement potential. LINEs like Hal1B can # be renamed as other LINES ( ie. L1M7 ). When this # happens a different adjustment should be used to # define it's consensus coordinates. # Here is a good example: #287 28.17 8.72 4.70 big 5079 5227 (39675) C HAL1b#LINE/L1 (457) 1552 1398 5 #281 25.51 16.59 4.39 big 5136 5298 (39748) C L1M7_5end#LINE/L1 (417) 1820 1637 5 #705 15.03 0.00 8.28 big 5299 5443 (41394) C FLAM_C_short_#SINE/Alu (0) 133 1 3 #281 25.51 16.59 4.39 big 5444 5484 (39562) C L1M7_5end#LINE/L1 (601) 1636 1591 5 #345 27.33 5.26 1.32 big 5506 5657 (39389) C L1M7_5end#LINE/L1 (873) 1364 1207 5 # Consider a design change to handle this. # $conPosCorrection{ $nextInChain->getID() } = $highestConCorr; # Can be linked to itself. if ( $nextInChain == $nextInChain->getLeftLinkedHit() ) { last; } $nextInChain = $nextInChain->getLeftLinkedHit(); } # while ( $nextInChain... } } $DEBUG = 0; #printHitArrayList( $sortedAnnotationsList ); ########################## C Y C L E 8 ################################ # # Merge overlapping fragments # ######################################################################## print STDERR "\ncycle 8 "; ## ## Merge joined overlapping fragments into one fragment ## ## This is basically the old FuseOverlappingSeqs routine. ## The process is basically a cosmetic one -- simplifying ## the output. ## ## Sort by class? ## $cycleAnnotIter = $sortedAnnotationsList->getIterator(); my $prevAnnot; $DEBUG = 0; $i = -1; while ( $cycleAnnotIter->hasNext() ) { # Pick one element $i++; print STDERR "." if ( $i + 1 ) % 1000 == 0; my $currentAnnot = $cycleAnnotIter->next(); if ( $DEBUG ) { print STDERR "Considering:\n"; $currentAnnot->print(); } my $proxIter = $cycleAnnotIter->getIterator(); my $lookAhead = 0; while ( $proxIter->hasNext() ) { my $next1Annot = $proxIter->next(); if ( $DEBUG ) { print STDERR " --vs-->"; $next1Annot->print(); print STDERR " query Gap = " . $currentAnnot->getQueryGap( $next1Annot ) . "\n"; } # TODO: Now this is specific for SVA...need to generalize this for # anything which contains a tandem repeat unit. last if ( $next1Annot->getSeq1Name() ne $currentAnnot->getSeq1Name() || ( $currentAnnot->getHitName !~ /SVA/ && $currentAnnot->getQueryGap( $next1Annot ) > 10 ) || ( $currentAnnot->getHitName() =~ /SVA/ && $lookAhead++ > 2 ) ); next if ( $next1Annot->getClassName() ne $currentAnnot->getClassName() ); my $next2Annot = undef; if ( $proxIter->hasNext() ) { $next2Annot = $proxIter->next(); $proxIter->previous(); # So as to emit the one we will delete $proxIter->previous(); $proxIter->next(); } my $QO = $currentAnnot->getQueryOverlap( $next1Annot ); # # SVAs # # - Special because they contain a vntr region. This region # behaves in a similar way to satellites or simple repeats. # As a consequence we have to allow a lessor overlap?? # - also can have different cut levels due to it's placement in # the Alu(SINES) search phase.....hmmm. # - Lastly these are not joined yet. # - Query Gap < 10bp # if ( $currentAnnot->getHitName() =~ /SVA/ && $next1Annot->getHitName() =~ /SVA/ ) { if ( $currentAnnot->getRevComp() eq "C" && $currentAnnot->getSeq2End() > 425 && $next1Annot->getSeq2End() < 880 || $currentAnnot->getRevComp() eq "+" && $currentAnnot->getSeq2Beg() < 880 && $next1Annot->getSeq2Beg() > 425 ) { # rest taken care of elsewhere if ( $QO >= -10 ) { if ( $DEBUG ) { print STDERR "SVA merging:\n"; $currentAnnot->print(); $next1Annot->print(); } $currentAnnot->merge( $next1Annot ); $currentAnnot->addDerivedFromAnnot( $next1Annot ) if ( $options{'source'} ); $proxIter->remove(); } else { if ( $DEBUG ) { print STDERR "SVA joining\n"; $currentAnnot->print(); $next1Annot->print(); } $currentAnnot->join( $next1Annot ); last; } } } elsif ( $currentAnnot->getClassName() =~ /Simple|Satellite/ && $QO >= -10 ) { # Simple Repeats and Satellites my $thislength = $currentAnnot->getSeq2End() - $currentAnnot->getSeq2Beg() + 1; my $lastlength = $next1Annot->getSeq2End() - $next1Annot->getSeq2Beg() + 1; if ( $DEBUG ) { print STDERR " Simple/Satellite"; } if ( $next1Annot->getSeq1End() >= $currentAnnot->getSeq1End() ) { if ( $currentAnnot->getHitName() ne $next1Annot->getHitName() ) { my $nextoverlap = 0; if ( $next2Annot ) { if ( $DEBUG ) { print STDERR " --next->:"; $next2Annot->printBrief(); } # don't fuse (CA)n(TG)n(CA)n etc my $tempname = quotemeta $next1Annot->getHitName(); if ( $next2Annot->getSeq2Name() =~ /$tempname/ && $next1Annot->getSeq1Name() eq $next2Annot->getSeq1Name() && $next2Annot->getSeq1Beg() < $next1Annot->getSeq1End() ) { $nextoverlap = $currentAnnot->getQueryOverlap( $next2Annot ); } } #print STDERR " QO=$QO, thislength = $thislength, " . # "nextoverlap = $nextoverlap\n" if ( $DEBUG ); if ( $QO + $nextoverlap > 10 && # just to be sure it's not 0 in the following division $thislength / ( $QO + $nextoverlap ) < 2 ) { print STDERR " --> merging!\n" if ( $DEBUG ); $currentAnnot->mergeSimpleLow( $next1Annot ); $currentAnnot->addDerivedFromAnnot( $next1Annot ) if ( $options{'source'} ); $proxIter->remove(); } } else { print STDERR " --> merging!\n" if ( $DEBUG ); $currentAnnot->mergeSimpleLow( $next1Annot ); $currentAnnot->addDerivedFromAnnot( $next1Annot ) if ( $options{'source'} ); $proxIter->remove(); } } } # See if they overlap by some obvious amount if ( $currentAnnot->getSeq1End() - $next1Annot->getSeq1Beg() + 1 > 33 ) { print STDERR " -- a big overlap...is it already linked?\n" if ( $DEBUG ); if ( $currentAnnot->getRightLinkedHit() == $next1Annot ) { if ( $DEBUG ) { print STDERR "Merging a joined pair:\n"; $currentAnnot->print(); $next1Annot->print(); } $currentAnnot->merge( $next1Annot ); $currentAnnot->addDerivedFromAnnot( $next1Annot ) if ( $options{'source'} ); $proxIter->remove(); if ( $DEBUG ) { print STDERR " Outcome:\n"; $currentAnnot->print(); } if ( $conPosCorrection{ $currentAnnot->getID() } && $currentAnnot->getSeq2Beg() < $next1Annot->getSeq2Beg() ) { $conPosCorrection{ $next1Annot->getID() } = $conPosCorrection{ $currentAnnot->getID() }; } } elsif ( $DEBUG ) { print STDERR " No!\n"; #$currentAnnot->printLinks(); } # If they don't....then try again with a not so obvious # check } elsif ( $currentAnnot->getRightLinkedHit() == $next1Annot && $currentAnnot->getRevComp() eq $next1Annot->getRevComp() && $currentAnnot->getLastField == $next1Annot->getLastField() && $currentAnnot->getQueryGap( $next1Annot ) <= 10 && $currentAnnot->getConsensusGap( $next1Annot ) <= 100 && ( $currentAnnot->getConsensusOverlap( $next1Annot ) <= 20 || $currentAnnot->getQueryGap( $next1Annot ) < 0 && $currentAnnot->getConsensusOverlap( $next1Annot ) - $currentAnnot->getQueryOverlap( $next1Annot ) <= 20 ) ) { # Fuse all remaining neighboring closely related elements that # overlap or nearly join in the query and have small gaps or tiny # overlaps in the consensus. This simlplifies the output, though # there is a further disconnect with the alignments. Since this is # largely a cosmetic action, selectivity trumps sensitivity. # # TODO: Arian suggested that we only join these if the pctDel # does not exceed some threshold. This limit would keep # fragments from merging if they contain more info as # independent annotations. # # my ( $subBases, $subPct, $delBases, $delPct, $insBases, $insPct ) = $currentAnnot->getAdjustedSubstLevel( $next1Annot ); # print STDERR "Merged pair would have: subPct = $subPct, " . "delBases = $delBases, delPct = $delPct, insPct = $insPct\n" if ( $DEBUG ); #if ( $delPct < 30 ) unless ( $delPct > 50 ) { if ( $DEBUG ) { print STDERR "SHOULD MERGE THESE TWO:\n"; $currentAnnot->printBrief(); $next1Annot->printBrief(); } $currentAnnot->merge( $next1Annot ); $currentAnnot->addDerivedFromAnnot( $next1Annot ) if ( $options{'source'} ); $proxIter->remove(); if ( $DEBUG ) { print STDERR " Outcome:\n"; $currentAnnot->print(); } if ( $conPosCorrection{ $currentAnnot->getID() } && $currentAnnot->getSeq2Beg() < $next1Annot->getSeq2Beg() ) { $conPosCorrection{ $next1Annot->getID() } = $conPosCorrection{ $currentAnnot->getID() }; } } # if $delPct < 30... } # Merge close fragments elsif ( $DEBUG ) { print STDERR " Don't merge: qo = " . $currentAnnot->getQueryOverlap( $next1Annot ) . " co = " . $currentAnnot->getConsensusOverlap( $next1Annot ) . " linked = "; if ( $currentAnnot->getRightLinkedHit() == $next1Annot ) { print STDERR "yes"; } else { print STDERR "no"; } print STDERR "\n"; } } # while proxIter->hasNext()... } #printHitArrayList( $sortedAnnotationsList ); ########################## C Y C L E 9 ################################ # # Remove things included in other things??? # LINE Recombinants # Unhide elements inserted inside other hits # ######################################################################## print STDERR "\ncycle 9 "; $sortedAnnotationsList->sort( \&byNameBeginEndrevSWrev ); $i = -1; $cycleAnnotIter = $sortedAnnotationsList->getIterator(); $DEBUG = 0; CYCLE9: while ( $cycleAnnotIter->hasNext() ) { $i++; print STDERR "." if ( $i + 1 ) % 1000 == 0; my $currentAnnot = $cycleAnnotIter->next(); if ( $DEBUG ) { print STDERR "Cycle9: Considering: "; $currentAnnot->print(); } # # Remove fragments contained by a previous annotation # my $pastIter = $cycleAnnotIter->getIterator(); $pastIter->previous(); my $k = 1; while ( $k++ < 10 && $pastIter->hasPrevious() ) { my $pastAnnot = $pastIter->previous(); if ( $currentAnnot->getSeq1Name() eq $pastAnnot->getSeq1Name() ) { my $currentClassName = $currentAnnot->getClassName(); if ( $currentAnnot->getLastField() eq $pastAnnot->getLastField() && ( $currentAnnot->getSeq1End() <= $pastAnnot->getSeq1End() && ( $currentAnnot->getSeq2Name() eq $pastAnnot->getSeq2Name() || $currentAnnot->getSeq2Name() =~ /LTR8$/ && $pastAnnot->getSeq2Name() =~ /Harlequin/ ) || ( ( $currentAnnot->getSeq1End() == $pastAnnot->getSeq1End() || $currentAnnot->getSeq1Beg() == $pastAnnot->getSeq1Beg() ) && $pastAnnot->getClassName() =~ /$currentClassName/ ) ) ) { if ( $DEBUG ) { print STDERR "1Removing because it's included in the previous!\n"; $pastAnnot->print(); $currentAnnot->print(); } if ( $pastAnnot->getRightLinkedHit() == $currentAnnot ) { print STDERR "Relinking the past\n" if ( $DEBUG ); $pastAnnot->setRightLinkedHit( $currentAnnot->getRightLinkedHit() ); if ( $currentAnnot->getRightLinkedHit() ) { $currentAnnot->getRightLinkedHit() ->setLeftLinkedHit( $pastAnnot ); } } else { ### What should be done with these? They may have linked ### partners. } $currentAnnot->removeFromJoins(); $pastAnnot->addDerivedFromAnnot( $currentAnnot ) if ( $options{'source'} ); $cycleAnnotIter->remove(); next CYCLE9; } elsif ( $currentAnnot->getLastField() == 4 && $pastAnnot->getLastField() == 5 && $currentAnnot->getSeq2Name() eq $pastAnnot->getSeq2Name() && ( $currentAnnot->getSeq1End() == $pastAnnot->getSeq1End() || $currentAnnot->getSeq1Beg() == $pastAnnot->getSeq1Beg() ) ) { if ( $DEBUG ) { print STDERR "2Removing because it's included in the previous!\n"; $pastAnnot->print(); $currentAnnot->print(); } if ( $pastAnnot->getRightLinkedHit() == $currentAnnot ) { print STDERR "Relinking the past\n" if ( $DEBUG ); $pastAnnot->setRightLinkedHit( $currentAnnot->getRightLinkedHit() ); if ( $currentAnnot->getRightLinkedHit() ) { $currentAnnot->getRightLinkedHit() ->setLeftLinkedHit( $pastAnnot ); } } # See above $currentAnnot->removeFromJoins(); $pastAnnot->addDerivedFromAnnot( $currentAnnot ) if ( $options{'source'} ); $cycleAnnotIter->remove(); next CYCLE9; } } else { last; } } # March 2004; moved this block to so that recombined elements # with inserts can be joined properly. # I have seen insertions in human vs chimp (and chimp vs human) in # which the gap in the consensus was up to 175 bp # With new ORF2 consensus seqs less chance of false joining and more # consistent divergence of fragments, I've reset the allowed gap in the # consensus from 33 to a calculated distance based on difference in # divergence level, gap between fragments in query sequence. The gap # allowed is doubled when the fragments have the same subfamily # designation. if ( $currentAnnot->getClassName =~ /^LINE\/L1/ ) { # special case of (the quite common case of) LINEs integrated # after recombination this appears as an inversion of part of # the element giving a <- -> structure. matchL1frags() matches # up IDs of fragments and improves subfamily name if possible. # This is not seen in L2,L3 (Perhaps in L4; don't know yet) my $pastIter = $cycleAnnotIter->getIterator(); $pastIter->previous(); my $forwIter = $cycleAnnotIter->getIterator(); my $n = 1; my $matched = 0; while ( $n++ < 6 ) { my $pastAnnot = undef; my $pastAnnot = $pastIter->previous() if ( $pastIter->hasPrevious() ); my $nextAnnot = undef; my $nextAnnot = $forwIter->next() if ( $forwIter->hasNext() ); if ( $pastAnnot && $currentAnnot->getSeq1Name() eq $pastAnnot->getSeq1Name() && $pastAnnot->getClassName() eq "LINE/L1" && $currentAnnot->getSeq1Beg() - $pastAnnot->getSeq1End() <= 20 && $currentAnnot->getRevComp() eq '+' && $pastAnnot->getRevComp() eq "C" ) { my $gap = $currentAnnot->getSeq1Beg() - $pastAnnot->getSeq1End(); # Overlaps always indicate false extension; neither bad nor good $gap = 0 if ( $gap < 0 ); # The more different the divergence and the further apart # the fragments, the stricter the requirements my $gapallowed = 175 - 20 * abs( $currentAnnot->getPctSubst() - $pastAnnot->getPctSubst() ) - 5 * $gap; # Same subfamily name is a big bonus $gapallowed += 50 if ( $currentAnnot->getHitName() eq $pastAnnot->getHitName() ); if ( $currentAnnot->getSeq2End() - $pastAnnot->getSeq2Beg() < $gapallowed / 2 && # overlap should be smaller, but ones seen up to 46 bp $pastAnnot->getSeq2Beg() - $currentAnnot->getSeq2End() < $gapallowed ) { if ( &areLINENamesCompat( $currentAnnot, $pastAnnot ) || $currentAnnot->getHitName() =~ /^L1.*_3end$/ && $currentAnnot->getSeq2End() < 6015 && $pastAnnot->getSeq2End() - $pastAnnot->getSeq2Beg() > 100 ) { $matched = $pastAnnot; last; } } } elsif ( $nextAnnot && $nextAnnot->getSeq2Name() =~ /LINE\/L1/ && $currentAnnot->getSeq1Name() eq $nextAnnot->getSeq1Name() && $nextAnnot->getSeq1Beg() - $currentAnnot->getSeq1End() <= 20 && $currentAnnot->getRevComp() eq "C" && $nextAnnot->getRevComp() eq '+' ) { my $gap = $nextAnnot->getSeq1Beg() - $currentAnnot->getSeq1End(); $gap = 0 if ( $gap < 0 ); my $gapallowed = 175 - 20 * abs( $currentAnnot->getPctSubst() - $nextAnnot->getPctSubst() ) - 5 * $gap; $gapallowed += 50 if ( $nextAnnot->getHitName() eq $currentAnnot->getHitName() ); if ( $currentAnnot->getSeq2End() - $nextAnnot->getSeq2Beg() < $gapallowed / 2 && $nextAnnot->getSeq2Beg() - $currentAnnot->getSeq2End < $gapallowed ) { if ( &areLINENamesCompat( $currentAnnot, $nextAnnot ) || $currentAnnot->getHitName() =~ /^L1.*_3end$/ && $currentAnnot->getSeq2End() < 6015 && $nextAnnot->getSeq2End() - $nextAnnot->getSeq2Beg() > 100 ) { $matched = $nextAnnot; last; } } } } if ( $matched ) { ## TODO: This code should be moved before line clusters ## are named and corrections are determined. This ## would reduce having to do this twice. if ( $DEBUG ) { print STDERR "Joining recombinant L1's:\n"; $matched->print(); $currentAnnot->print(); } my $highestConsCorrection = $conPosCorrection{ $matched->getID() }; $highestConsCorrection = $conPosCorrection{ $currentAnnot->getID() } if ( $conPosCorrection{ $currentAnnot->getID() } > $highestConsCorrection ); $currentAnnot->join( $matched ); # Fix left my $chainNext = $currentAnnot; my $detectLoop = 0; while ( $chainNext->getLeftLinkedHit() != undef && $chainNext->getLeftLinkedHit() != $chainNext && $detectLoop < 50 ) { $chainNext = $chainNext->getLeftLinkedHit(); $conPosCorrection{ $chainNext->getID() } = $highestConsCorrection; $detectLoop++; } # Fix right $chainNext = $currentAnnot; $detectLoop = 0; while ( $chainNext->getRightLinkedHit() != undef && $chainNext->getRightLinkedHit() != $chainNext && $detectLoop < 50 ) { $chainNext = $chainNext->getRightLinkedHit(); $conPosCorrection{ $chainNext->getID() } = $highestConsCorrection; $detectLoop++; } # Fix current $conPosCorrection{ $currentAnnot->getID() } = $highestConsCorrection; $currentAnnot->setHitName( $matched->getSeq2Name() ); if ( $DEBUG ) { print STDERR "Result:"; $currentAnnot->print(); } } } # if ( $ClassName =~ /^LINE\/L1/... unless ( $options{'noint'} ) { my $flankingIter = $cycleAnnotIter->getIterator(); $flankingIter->previous(); my $prevAnnot = undef; if ( $flankingIter->hasPrevious ) { $prevAnnot = $flankingIter->previous(); $flankingIter->next(); } $flankingIter->next(); my $nextAnnot = undef; if ( $flankingIter->hasNext() ) { $nextAnnot = $flankingIter->next(); $flankingIter->previous(); } # # unhide elements inserted in older elements by breaking up the # older elements. # # ie. ---------LINE-------------------- # ------LTR----------- # # These would not get fused in the previous code because # they are of different classes. Now we need to decide # if the LINE above should be broken up? # my $newSeq2Begin = $currentAnnot->getSeq2Beg(); my $newSeq2End = $currentAnnot->getSeq2End(); my @inserts = (); # Look for all elements inserted in this entry # we're estimating the position(s) in Seq2 where the # insertion(s) took place from the position(s) in Seq1 # Look into the future until # - The future element isn't contained inside the current one. # - We have moved into a new query sequence. my $j = 0; my $prevNextAnnot = undef; if ( $DEBUG ) { print STDERR "Unhide Inserts:\n"; } while ( $flankingIter->hasNext() ) { $j++; $nextAnnot = $flankingIter->next(); if ( $DEBUG ) { print STDERR " --vs-->"; $nextAnnot->print(); } last if ( $nextAnnot->getSeq1End() > $currentAnnot->getSeq1End() || $nextAnnot->getSeq1Name() ne $currentAnnot->getSeq1Name() ); # Same class and last field???? Shouldn't these already # have been caught? if ( $nextAnnot->getSeq2Name() =~ $currentAnnot->getClassName() && $currentAnnot->getLastField() == $nextAnnot->getLastField() || # i.e. ignore not yet eliminated overlapping matches to the # same element $nextAnnot->getSeq1Beg() < $currentAnnot->getSeq1Beg() ## TODO ASSERT: IF $sortedHits[ $j ]->getSeq1Beg() ## < $BeginAlign ) { next; } # Too close to the beginning -- just shift it along elsif ( $j == 1 && $nextAnnot->getSeq1Beg() <= $currentAnnot->getSeq1Beg() + 5 ) { # Only 5 bp or less before insertion; move BeginAlign after # this element if ( $DEBUG ) { print STDERR "Only 5bp or less...begin extension...clipping it\n"; } my $diff = $nextAnnot->getSeq1Beg() - $currentAnnot->getSeq1Beg(); if ( $currentAnnot->getRevComp() eq '+' ) { $currentAnnot->setSeq2Beg( $currentAnnot->getSeq2Beg() + $diff ); } else { $currentAnnot->setSeq2End( $currentAnnot->getSeq2End() - $diff ); } $currentAnnot->setSeq1Beg( $nextAnnot->getSeq1End() + 1 ); if ( $DEBUG ) { print STDERR "Now:\n"; $currentAnnot->print(); } # There are cases where exposing an insert by cutting back the # begin position will change the sort order of a previously joined # set of fragments. Obvioulsy you don't want a left join # pointing to something on the right. $currentAnnot->resortJoins(); next; } # Too close to the end -- just shift it along elsif ( $nextAnnot->getSeq1End() >= $currentAnnot->getSeq1End() - 5 ) { # Only 5 bp or less after insertion; change EndAlign to # just before this element # First remove invalidated inserts while ( ( $nextAnnot->getSeq1Beg() - 1 ) < $inserts[ $#inserts ] ) { # Remove insert pop @inserts; pop @inserts; } if ( $DEBUG ) { print STDERR "Only 5bp or less...end extension...clipping it\n"; } my $diff = $currentAnnot->getSeq1End() - $nextAnnot->getSeq1End(); # could do + 1, but penalty for gap == match to X if ( $currentAnnot->getRevComp() eq 'C' ) { $currentAnnot->setSeq2Beg( $currentAnnot->getSeq2Beg() + $diff ); } else { $currentAnnot->setSeq2End( $currentAnnot->getSeq2End() - $diff ); } $currentAnnot->setSeq1End( $nextAnnot->getSeq1Beg() - 1 ); $currentAnnot->resortJoins(); } else { # Do not consider inserts within previous inserts # ie. ------------------------ # ----------------- # ----------- if ( $prevNextAnnot && $nextAnnot->getSeq1End() < $prevNextAnnot->getSeq1End() ) { next; } # Immediate (i.e. within 5 bp) flanking inserts are merged if ( $prevNextAnnot && $nextAnnot->getSeq1Beg() < $prevNextAnnot->getSeq1End() + 5 ) { # replace last insert end with current one; use last insert start pop @inserts; } else { push( @inserts, $nextAnnot->getSeq1Beg() ); } print STDERR "Pushing into inserts\n" if ( $DEBUG ); push( @inserts, $nextAnnot->getSeq1End() ); } $prevNextAnnot = $nextAnnot; } # while loop if ( @inserts ) { my $lengthQmatch = $currentAnnot->getSeq1End() - $currentAnnot->getSeq1Beg() + 1; for ( my $m = 0 ; $m < $#inserts ; $m += 2 ) { #print STDERR " inserts[ m, m+1 ] = " . $inserts[$m] . ", " . # $inserts[ $m + 1 ] . "\n"; my $endInsert = $currentAnnot->getSeq1End(); $endInsert = $inserts[ $m ] if ( $inserts[ $m ] < $endInsert ); $lengthQmatch -= ( $inserts[ $m + 1 ] - $endInsert ); --$inserts[ $m ]; # now last position of ++$inserts[ $m + 1 ]; } my $bpHitperbpQuery = ( $currentAnnot->getSeq2End() - $currentAnnot->getSeq2Beg() + 1 ) / $lengthQmatch; if ( $DEBUG ) { print STDERR "Total inserts = " . scalar( @inserts ) . "\n" if ( $DEBUG ); print STDERR "lengthQMatch = $lengthQmatch bpHitperbpQuery = " . $bpHitperbpQuery . "\n"; } my @edges = @inserts; unshift( @edges, $currentAnnot->getSeq1Beg() ); push( @edges, $currentAnnot->getSeq1End() ); my $outline = ""; if ( $DEBUG ) { print STDERR " Fragmenting:\n"; } $cycleAnnotIter->remove(); my $newHit = undef; my $lastNewHit = $currentAnnot->getLeftLinkedHit(); $lastNewHit = undef if ( $currentAnnot->getLeftLinkedHit() == $currentAnnot ); for ( my $m = 0 ; $m < $#edges ; $m += 2 ) { my $newBeginAlign = $edges[ $m ]; my $newEndAlign = $edges[ $m + 1 ]; my $fragsize = sprintf( "%d", ( $newEndAlign - $newBeginAlign ) * $bpHitperbpQuery ); print STDERR " frag size = $fragsize\n" if ( $DEBUG ); if ( $currentAnnot->getRevComp() eq '+' ) { $newHit = $currentAnnot->clone(); # We don't want to alter the final end position of the # original match if ( $m < $#edges - 2 ) { $newSeq2End = $newSeq2Begin + $fragsize; my $remainder = $currentAnnot->getSeq2Len() - $newSeq2End; $newHit->set3PrimeUnaligned( $remainder ); $newHit->setSeq2End( $newSeq2End ); } $newHit->setSeq1End( $newEndAlign ); $newHit->setSeq1Beg( $newBeginAlign ); $newHit->setSeq2Beg( $newSeq2Begin ); $newSeq2Begin = $newSeq2End + 1; } else { $newHit = $currentAnnot->clone(); if ( $m < $#edges - 2 ) { my $remainder = $currentAnnot->getSeq2Len() - $newSeq2End; $newSeq2Begin = $newSeq2End - $fragsize; # estimate $newHit->setSeq2Beg( $newSeq2Begin ); $newHit->set3PrimeUnaligned( $remainder ); } $newHit->setSeq1Beg( $newBeginAlign ); $newHit->setSeq1End( $newEndAlign ); $newHit->setSeq2End( $newSeq2End ); $newSeq2End = $newSeq2Begin - 1; } if ( $lastNewHit ) { $newHit->setLeftLinkedHit( $lastNewHit ); $lastNewHit->setRightLinkedHit( $newHit ); } if ( $DEBUG ) { print STDERR "Inserting\n"; $newHit->print(); #print STDERR " Linked to left:"; #$newHit->getLeftLinkedHit()->printBrief() # if ( $newHit->getLeftLinkedHit() ); #print STDERR "\n Linked to right:"; #$newHit->getRightLinkedHit()->printBrief() # if ( $newHit->getRightLinkedHit() ); #print STDERR "\n"; } $cycleAnnotIter->insert( $newHit ); $lastNewHit = $newHit; } # for loop # Note since we are adding multiple copies of these source # annotations, we need to make sure this routine makes a deep copy. #$newHit->addDerivedFromAnnot( $currentAnnot ) # if ( $options{'source'} ); if ( $newHit->getRightLinkedHit() ) { $newHit->getRightLinkedHit()->setLeftLinkedHit( $newHit ); } } # if @inserts } } #printHitArrayList( $sortedAnnotationsList ); ########################## C Y C L E 10 ################################ # Poly-A Joining Code now lives here # ######################################################################## print STDERR "\ncycle 10 "; $sortedAnnotationsList->sort( \&byNameBeginEndrevSWrev ); $i = -1; $cycleAnnotIter = $sortedAnnotationsList->getIterator(); $DEBUG = 0; # Local to cycle my $pastAnnot = undef; CYCLE10: while ( $cycleAnnotIter->hasNext() ) { $i++; print STDERR "." if ( $i + 1 ) % 1000 == 0; my $currentAnnot = $cycleAnnotIter->next(); if ( $DEBUG ) { print STDERR "Cycle10: Considering: "; $currentAnnot->print(); } # # Join Poly-A tails to SINE or LINE1 annotations: # # A Poly A tail defined as: # # - A hit to a '(A)n', '(CA)n', '(CAA)n', '(AAA)n' etc # - a previous hit (within 5) to a Alu,Flam, Fr?am, L1.*_3end, # L1_.*extended or SVA. Why would you go five back?? Is it # because you could have overlapping annotations for the previous # segment of DNA? Why not go back to the limit of the gap that you # accept? # - And for each group of element there is a tolerance for # the amount of consensus that remains unaligned at the end. # - The annotations are not > 3 bases apart # - The simple repeat isn't subsumed by more than 30 bases on # the end. I.e the IR doesn't extend past the simple repeat # by more than 30 bases. # - They are in the same orientation, same sequence etc. # - NOTE: This 30 base limit is due to a search related # constant. The maximum match to a simple repeat # is around ~20bp. # # TODO: Consider doing the same for repeat tails of other SINEs # and LINEs. Also could consider limiting this to diverged # simple repeats. Consider what is the best way to deal # with the consensus length/masked when the tail exceeds the # length of the original consensus. # # NOTE: This routine makes the assumption that simple repeats # and low-complexity regions which where fragmented by # repeatmasker are *not* rejoined above. Rejoining # is not always appropriate and would cause the code # below to connect up distant fragments to the ends of # SINEs and LINEs. # my $backIter = $cycleAnnotIter->getIterator(); $backIter->previous(); my $forwIter = $cycleAnnotIter->getIterator(); if ( ( $currentAnnot->getHitName eq '(A)n' || $currentAnnot->getHitName =~ /^\(.A{3,6}\)/ || $currentAnnot->getHitName =~ /^A-rich/ || $currentAnnot->getHitName eq '(T)n' || $currentAnnot->getHitName =~ /^\(T{3,6}.\)/ || $currentAnnot->getHitName =~ /^T-rich/ ) && !$options{'noint'} ) { my $k = 1; my $prevAnnot; my $nextAnnot; while ( $k < 5 ) { my ( $prevHitName, $prevClassName ); my ( $nextHitName, $nextClassName ); if ( $backIter->hasPrevious() ) { $prevAnnot = $backIter->previous(); ( $prevHitName, $prevClassName ) = split( /\#/, $prevAnnot->getSeq2Name() ); } if ( $forwIter->hasNext() ) { $nextAnnot = $forwIter->next(); ( $nextHitName, $nextClassName ) = split( /\#/, $nextAnnot->getSeq2Name() ); } # TODO: Meta data - Should we have a field in the DB called polyAJoiningMinUnaligned? # If set then join Poly A to element but only if unaligned is < polyAJoiningMinUnaligned # Basic scenarios # # previous current # --SINE/LINE----..(max 31 or 25 bp )..> (max gap 3bp ) -------A Rich Repeat---> # # or # # ---SINE/LINE---------.....> # --(max 31)----------A Rich Repeat-----> if ( $DEBUG ) { print STDERR "Poly-A Joining Considering:\n"; if ( $prevAnnot ) { print STDERR " Prev: "; $prevAnnot->print(); } print STDERR " Current: "; $currentAnnot->print(); if ( $nextAnnot ) { print STDERR " Next: "; $nextAnnot->print(); } } if ( $prevAnnot # prevAnnot is an Alu or L1 at it's 3' end && ( $prevHitName =~ /^Alu|^FLAM/ && $prevAnnot->get3PrimeUnaligned() <= 32 || $prevHitName =~ /^FR?AM$|^L1.*|^SVA$/ && $prevAnnot->get3PrimeUnaligned() < 26 ) # The Alu/L1 and simple repeat are no more than 4bp apart && $currentAnnot->getSeq1Beg() - $prevAnnot->getSeq1End() < 4 # Same input sequence && $currentAnnot->getSeq1Name eq $prevAnnot->getSeq1Name() # Alu/L1 is in the forward strand and simple repeat is a Poly-A repeat && ( $prevAnnot->getRevComp() ne "C" && ( $currentAnnot->getHitName eq '(A)n' || $currentAnnot->getHitName =~ /^\(.A{3,6}\)/ || $currentAnnot->getHitName =~ /^.?A-rich/ ) ) # Don't allow the Poly-A tail to extend beyond 30bp past the consensus length && ( ( $currentAnnot->getSeq1End() - $currentAnnot->getSeq1Beg() ) < ( $prevAnnot->get3PrimeUnaligned() + 30 ) ) ) { my $newend = $currentAnnot->getSeq1End(); # should not be counted as insert anymore if ( $DEBUG ) { print STDERR "Fusing ALU and Poly A: Current element:\n"; $currentAnnot->print(); print STDERR "will be renamed for prev element:\n"; $prevAnnot->print(); } # TODO: Remove $k loop #print STDERR "STRANGE>>>>>>>>WHY IS K > 1???\n" # if ( $k > 1 ); # # Special Case: Incompletely excised Poly-A tails # # Typical Case: # ---Foo1------> AAAAAAAA -Foo1-> # # Should be resolved as: # ------------Foo1--------------> # if ( $prevAnnot->getRightLinkedHit() ) { my $nextInChain = $prevAnnot->getRightLinkedHit(); if ( $nextInChain->getSW() eq $prevAnnot->getSW() && $nextInChain->getSeq1Beg() - $currentAnnot->getSeq1End() < 5 && $nextInChain->getSeq2End() - $nextInChain->getSeq2Beg() + 1 <= 40 && $nextInChain == $nextAnnot ) { # To resolve this we will remove the $prevAnnot and $nextAnnot while # converting the $currentAnnot to the final annotation. # Save original 'derived from' annotations if ( $options{'source'} ) { $currentAnnot->addDerivedFromAnnot( $prevAnnot ); $currentAnnot->addDerivedFromAnnot( $nextAnnot ); } # Copy all data fields from $prevAnnot over $currentAnnot->setFrom( $prevAnnot ); # Copy over new Seq1End from $nextAnnot $currentAnnot->setSeq1End( $nextAnnot->getSeq1End() ); $currentAnnot->setLeftOver( $nextAnnot->getLeftOver() ); # Set the new seq2End to max $currentAnnot->setSeq2End( $currentAnnot->getSeq2End() + $currentAnnot->get3PrimeUnaligned() ); # Clear out unaligned field $currentAnnot->set3PrimeUnaligned( 0 ); # Remove $past, and $next my $removedAnnot = $backIter->remove(); $pastAnnot = undef; if ( $DEBUG ) { print STDERR "Removing: "; $removedAnnot->print(); } # Fix joins if ( $removedAnnot->getLeftLinkedHit() ) { my $leftLinkedHit = $removedAnnot->getLeftLinkedHit(); $leftLinkedHit->setRightLinkedHit( $currentAnnot ); $currentAnnot->setLeftLinkedHit( $leftLinkedHit ); } $removedAnnot = $forwIter->remove(); if ( $DEBUG ) { print STDERR "Removing: "; $removedAnnot->print(); } if ( $removedAnnot->getRightLinkedHit() ) { my $rightLinkedHit = $removedAnnot->getRightLinkedHit(); $rightLinkedHit->setLeftLinkedHit( $currentAnnot ); $currentAnnot->setRightLinkedHit( $rightLinkedHit ); } if ( $DEBUG ) { print STDERR "Final call:\n"; $currentAnnot->print(); } } } else { # Save original elements $currentAnnot->addDerivedFromAnnot( $prevAnnot ) if ( $options{'source'} ); # Copy most data fields from $prevAnnot $currentAnnot->setFrom( $prevAnnot ); if ( $prevAnnot->getLeftLinkedHit() ) { my $prevInChain = $prevAnnot->getLeftLinkedHit(); $currentAnnot->setLeftLinkedHit( $prevInChain ); $prevInChain->setRightLinkedHit( $currentAnnot ); } $currentAnnot->setSeq1End( $newend ) if ( $currentAnnot->getSeq1End() < $newend ); # This is now always the case; used to be that # interrupted IRs were "broken up" after this routine $currentAnnot->setSeq2End( $currentAnnot->getSeq2End() + $currentAnnot->get3PrimeUnaligned() ); $currentAnnot->set3PrimeUnaligned( 0 ); #$prevAnnot->removeFromJoins(); if ( $DEBUG ) { print STDERR "Removing: "; $prevAnnot->print(); } $backIter->remove(); $pastAnnot = undef; if ( $DEBUG ) { print STDERR "Final call:\n"; $currentAnnot->print(); print STDERR "and its links:\n"; $currentAnnot->printLinks(); if ( $currentAnnot->getLeftLinkedHit() ) { print STDERR "foo:\n"; $currentAnnot->getLeftLinkedHit()->printLinks(); } } } last; # Reverse strand equivalent } elsif ( $nextAnnot && $currentAnnot->getSeq1Name eq $nextAnnot->getSeq1Name() && $nextAnnot->getRevComp() eq 'C' && $nextAnnot->getSeq1Beg() - $currentAnnot->getSeq1End() < 4 && ( $nextAnnot->getSeq2Name() =~ /^Alu|^FLAM/ && $nextAnnot->get3PrimeUnaligned() <= 32 || $nextAnnot->getSeq2Name() =~ /^FR?AM$|^L1.*/ && $nextAnnot->get3PrimeUnaligned() < 26 ) && ( $currentAnnot->getHitName eq '(T)n' || $currentAnnot->getHitName =~ /^\(T{3,6}.\)/ || $currentAnnot->getHitName =~ /^.?T-rich/ ) # Don't allow the Poly-A tail to extend beyond 30bp past the consensus length && ( ( $currentAnnot->getSeq1End() - $currentAnnot->getSeq1Beg() ) < ( $nextAnnot->get3PrimeUnaligned() + 30 ) ) ) { my $newbegin = $currentAnnot->getSeq1Beg(); if ( $DEBUG ) { print STDERR "Fusing ALU and Poly A: Current element:\n"; $currentAnnot->print(); print STDERR "will be renamed for next element:\n"; $nextAnnot->print(); } # # Special Case: Incompletely excised Poly-A tails # # Typical Case: # <-Foo1- AAAAAAAA <-------Foo1-------- # # Should be resolved as: # <------------Foo1-------------------- # if ( $nextAnnot->getLeftLinkedHit() ) { my $nextInChain = $nextAnnot->getLeftLinkedHit(); if ( $nextInChain->getSW() eq $nextAnnot->getSW() && $nextInChain->getSeq1End() - $currentAnnot->getSeq1Beg() < 5 && $nextInChain->getSeq2End() - $nextInChain->getSeq2Beg() + 1 <= 40 && $nextInChain == $prevAnnot ) { # To resolve this we will remove the $prevAnnot and $nextAnnot while # converting the $currentAnnot to the final annotation. # Save original 'derived from' annotations if ( $options{'source'} ) { $currentAnnot->addDerivedFromAnnot( $prevAnnot ); $currentAnnot->addDerivedFromAnnot( $nextAnnot ); } # Copy all data fields from $nextAnnot over $currentAnnot->setFrom( $nextAnnot ); # Copy over new Seq1Beg from $nextAnnot $currentAnnot->setSeq1Beg( $nextInChain->getSeq1Beg() ); $currentAnnot->setLeftOver( $nextAnnot->getLeftOver() ); # Set the new seq2End to max $currentAnnot->setSeq2End( $currentAnnot->getSeq2End() + $currentAnnot->get3PrimeUnaligned() ); # Clear out unaligned field $currentAnnot->set3PrimeUnaligned( 0 ); # Remove $past, and $next my $removedAnnot = $backIter->remove(); $pastAnnot = undef; if ( $DEBUG ) { print STDERR "Removing: "; $removedAnnot->print(); } $removedAnnot = $forwIter->remove(); if ( $DEBUG ) { print STDERR "Removing: "; $removedAnnot->print(); } $currentAnnot->setRightLinkedHit( $nextInChain ); $nextInChain->setLeftLinkedHit( $currentAnnot ); if ( $DEBUG ) { print STDERR "Final call:\n"; $currentAnnot->print(); } } } else { # Save original elements $currentAnnot->addDerivedFromAnnot( $nextAnnot ) if ( $options{'source'} ); # Copy most data fields from $nextAnnot $currentAnnot->setFrom( $nextAnnot ); if ( $nextAnnot->getRightLinkedHit() ) { my $nextInChain = $nextAnnot->getRightLinkedHit(); $currentAnnot->setRightLinkedHit( $nextInChain ); $nextInChain->setLeftLinkedHit( $currentAnnot ); } if ( $nextAnnot->getLeftLinkedHit() ) { my $prevInChain = $nextAnnot->getLeftLinkedHit(); $currentAnnot->setLeftLinkedHit( $prevInChain ); $prevInChain->setRightLinkedHit( $currentAnnot ); } # Clear original alignment data ( now under derived ) #$currentAnnot->setAlignData( undef ); # It seems better to adjust Seq2End to the end of # the consensus instead. This must (almost) always be # reached (maximally 25 bp unaligned to start; minimum # lenht of deleted simple repeat is 21 bp) and it is # confusing that IR consensus sequences appear to have # different lengths in the output (which you get with # the commented-out line). $currentAnnot->setSeq2End( $currentAnnot->getSeq2End() + $currentAnnot->get3PrimeUnaligned() ); $currentAnnot->setSeq1Beg( $newbegin ); $currentAnnot->set3PrimeUnaligned( 0 ); my $LeftOver = $currentAnnot->getLeftOver(); $LeftOver =~ tr/[\(\)]//d; $currentAnnot->setLeftOver( $LeftOver ); my $removedAnnot = $forwIter->remove(); if ( $DEBUG ) { print STDERR " REMOVED: "; $removedAnnot->print(); } last; } } # What does this do? my $ClassName = $currentAnnot->getClassName(); $ClassName =~ s/_[no][el][dw](\d{1,2})?$//; $currentAnnot->setClassName( $ClassName ); ++$k; } # While ( $k < 5 ) loop } # Join poly-A tails to SINE or LINE1 annotations # # Join Alu and L1 fragments broken by clipped-out A-rich simple # repeats, subsequently (in the previous annotation line) # attached to the poly A tail by above code # elsif ( $backIter->hasPrevious() ) { my $prevAnnot = $backIter->previous(); my ( $prevHitName, $prevClassName ) = split( /\#/, $prevAnnot->getSeq2Name() ); if ( $currentAnnot->getHitName eq $prevHitName && $currentAnnot->getSeq1Beg() - $prevAnnot->getSeq1End() == 1 && $currentAnnot->getRevComp() eq $prevAnnot->getRevComp() ) { if ( $currentAnnot->getRevComp() eq 'C' && $prevAnnot->getSeq2Beg() - $currentAnnot->getSeq2End() == 1 ) { if ( $DEBUG ) { print STDERR "Fusing SINE/LINE and Poly A: Current element:\n"; $currentAnnot->print(); print STDERR "will be renamed for prev element:\n"; $prevAnnot->print(); } $currentAnnot->addDerivedFromAnnot( $prevAnnot ) if ( $options{'source'} ); $currentAnnot->setSeq1Beg( $prevAnnot->getSeq1Beg() ); $currentAnnot->setSeq2End( $prevAnnot->getSeq2End() ); $currentAnnot->set3PrimeUnaligned( $prevAnnot->get3PrimeUnaligned() ); # Fix any previous joins to this element $prevAnnot->removeFromJoins(); $backIter->remove(); } } } # # LINE Renaming # if ( $currentAnnot->getClassName() =~ /^LINE/ ) { my $HitName = $currentAnnot->getHitName(); # Rename unresolved LINEs $HitName =~ s/_orf2$//; # TODO: This is a special case of an element which needs # a portion of it's sequence masked out in a seperate stage. # The sequence is a simple repeats which is scored too high # otherwise. I am refering to the endX designation. $HitName =~ s/(.*)_[35]endX?$/$1/; $HitName =~ s/_strong$//; # could use latter as a general tool to indicate a diagnostic fragment # rename carnivore LINEs who's name have been adjusted # temporarily to allow neighboring fragment recognition if ( $HitName =~ /^L1_Ca/ ) { $HitName =~ s/^L1_Canis0/L1_Cf/; $HitName =~ s/^L1_Canis4/L1_Canid/; $HitName =~ s/^L1_Canis5/L1_Canid2/; } $currentAnnot->setHitName( $HitName ); } # # Count tandemly repeated fragments of a repeat as one insert # only recognizes constant monomers; croaks on diverged dimers # if ( $pastAnnot && $pastAnnot->getClassName() !~ /^SINE/ # SINEs are too common and often miss > 10 bp of tail # probably safe with LINEs because of variable 5' ends && $currentAnnot->getSeq1Beg() - $pastAnnot->getSeq1End() < 30 && $currentAnnot->getHitName() eq $pastAnnot->getHitName() && $currentAnnot->getSeq1Name() eq $pastAnnot->getSeq1Name() && ( $currentAnnot->getSeq2Beg() > 10 || $currentAnnot->get3PrimeUnaligned() > 10 ) && $currentAnnot->getSeq2Beg() - $pastAnnot->getSeq2Beg() < 15 && $pastAnnot->getSeq2Beg() - $currentAnnot->getSeq2Beg() < 15 && $currentAnnot->getSeq2End() - $pastAnnot->getSeq2End() < 15 && $pastAnnot->getSeq2End() - $currentAnnot->getSeq2End() < 15 ) { $currentAnnot->setLeftLinkedHit( $pastAnnot ); } # Get the next annot my $nextAnnot = undef; if ( $cycleAnnotIter->hasNext ) { $nextAnnot = $cycleAnnotIter->next(); $cycleAnnotIter->previous(); # NOTE: If you are going to remove anything in this # cycle you should move back and forward so # the last thing returned is the previous one. $cycleAnnotIter->previous(); $cycleAnnotIter->next(); } # # Skip the following processing blocks if we are simple/low # unless ( $currentAnnot->getClassName() =~ /Low_complexity|Simple_repeat|Satellite/ ) { # # Adjust overlapping sequences when begin or end of one of repeats # is probably known # ----------- # --------- # Does the current fragment have 4-5 bp of unaligned space # at the end or begining? Or...is the current fragment a LINE # a Unknown or a Composite? if ( $currentAnnot->getRevComp eq '+' && ( $currentAnnot->get3PrimeUnaligned >= 4 ) || ( $currentAnnot->getRevComp eq "C" && $currentAnnot->getSeq2Beg >= 5 ) || ( $currentAnnot->getClassName =~ /^L1|^Unknown|Composite/ ) ) { # Does the current fragment and the next fragment: # - Have the same name? # and - Overlap by 1 or more bp in the query? # and - Is the next fragment a LTR or SINE or DNA? if ( $nextAnnot && $currentAnnot->getSeq1Name eq $nextAnnot->getSeq1Name() && $currentAnnot->getSeq1End >= $nextAnnot->getSeq1Beg() && $currentAnnot->getSeq1End < $nextAnnot->getSeq1End() && ( $nextAnnot->getSeq2Name() =~ /LTR|SINE|DNA/ ) ) { # Does the current fragment run to the end of consensus ( or close ) # at the end opposite the overlap? SINES are a special case? if ( ( $nextAnnot->getRevComp() eq '+' && $nextAnnot->getSeq2Beg() < 5 ) || ( $nextAnnot->getRevComp() eq "C" && $nextAnnot->get3PrimeUnaligned() < 4 && $nextAnnot->getSeq2Name() !~ /SINE/ ) ) { my $overlap = $currentAnnot->getSeq1End - $nextAnnot->getSeq1Beg() + 1; # Added the following checks so that we do not end up # creating consensus positions that end before they # begin. - RMH 2005/12/2 if ( $currentAnnot->getRevComp eq '+' && $currentAnnot->getSeq2End - $overlap > $currentAnnot->getSeq2Beg ) { $currentAnnot->setSeq2End( $currentAnnot->getSeq2End() - $overlap ); $currentAnnot->set3PrimeUnaligned( $currentAnnot->get3PrimeUnaligned + $overlap ); $currentAnnot->setSeq1End( $currentAnnot->getSeq1End - $overlap ); } elsif ( $currentAnnot->getSeq2Beg + $overlap < $currentAnnot->getSeq2End ) { $currentAnnot->setSeq2Beg( $currentAnnot->getSeq2Beg() + $overlap ); $currentAnnot->setSeq1End( $currentAnnot->getSeq1End - $overlap ); } } } } # End Adjust overlapping sequences when begin... # Same as above but in reverse???? if ( $currentAnnot->getRevComp eq '+' && ( $currentAnnot->getSeq2Beg >= 5 ) || ( $currentAnnot->getRevComp eq "C" && $currentAnnot->get3PrimeUnaligned >= 4 ) || ( $currentAnnot->getClassName =~ /^L1|^Unknown|Composite/ ) ) { if ( $pastAnnot && $currentAnnot->getSeq1Name eq $pastAnnot->getSeq1Name && $currentAnnot->getSeq1Beg <= $pastAnnot->getSeq1End && $currentAnnot->getSeq1End > $pastAnnot->getSeq1End + 25 ) { if ( $pastAnnot->getClassName() =~ /^LTR|^SINE|^DNA/ ) { if ( ( $pastAnnot->getRevComp eq '+' && $pastAnnot->get3PrimeUnaligned < 4 && $pastAnnot->getClassName =~ /^[^S]/ ) || ( $pastAnnot->getRevComp eq "C" && $pastAnnot->getSeq2Beg < 5 ) ) { my $overlap = $pastAnnot->getSeq1End - $currentAnnot->getSeq1Beg + 1; # Added the following checks so that we do not end up # creating consensus positions that end before they # begin. - RMH 2005/12/2 if ( $currentAnnot->getRevComp eq '+' && $currentAnnot->getSeq2Beg + $overlap < $currentAnnot->getSeq2End ) { $currentAnnot->setSeq2Beg( $currentAnnot->getSeq2Beg() + $overlap ); $currentAnnot->setSeq1Beg( $pastAnnot->getSeq1End + 1 ); } elsif ( $currentAnnot->getSeq2End - $overlap > $currentAnnot->getSeq2Beg ) { $currentAnnot->setSeq2End( $currentAnnot->getSeq2End - $overlap ); $currentAnnot->set3PrimeUnaligned( $currentAnnot->get3PrimeUnaligned() + $overlap ); $currentAnnot->setSeq1Beg( $pastAnnot->getSeq1End + 1 ); } } } } } if ( $currentAnnot->getClassName =~ /^SINE/ ) { $currentAnnot->setClassName( "srpRNA" ) if ( $currentAnnot->getHitName eq "7SLRNA" ); $currentAnnot->setClassName( "scRNA" ) if ( $currentAnnot->getHitName eq "BC200" || $currentAnnot->getHitName =~ /^BC1_/ ); # $ClassName = "scRNA" if $HitName eq "BC200" && $Seq2End > 100; if ( $currentAnnot->getHitName eq "ID_B1" && $currentAnnot->getSeq2End < 90 ) { $currentAnnot->setClassName( "SINE/ID" ); $currentAnnot->setHitName( "ID" ); } } } # Unless ( $ClassName =~ /Low_complexity... # get rid of negative LINE1 positions; also readjust MLT2 fragments if ( $conPosCorrection{ $currentAnnot->getID() } ) { # probably don't need to be so restrictive, as only those # repeats in the following if ever obtain a $conPosCorrection vallue if ( $currentAnnot->getClassName() =~ /^LINE/ && !$options{'orf2'} || $currentAnnot->getHitName() =~ /^MLT2\w\d?/ && $currentAnnot->getHitName() =~ /^MLT2B[34]$/ ) { # the adjusted pos are right for B3 & 4 #$currentAnnot->sanityCheckConsPos( \%conPosCorrection ); $currentAnnot->setSeq2Beg( $currentAnnot->getSeq2Beg() + $conPosCorrection{ $currentAnnot->getID() } ); ## TODO: Consider why this needs to be protected from ## producing negative/0 numbers. Considered: If the ## renaming of a fragment changes the length of the consensus ## ( making it shorter ) the adjustment will be off. ## Now consider how to fix this. $currentAnnot->setSeq2Beg( 1 ) if ( $currentAnnot->getSeq2Beg() < 1 ); $currentAnnot->setSeq2End( $currentAnnot->getSeq2End() + $conPosCorrection{ $currentAnnot->getID() } ); } } elsif ( $currentAnnot->getHitName() =~ /MER33|Charlie5/ && $currentAnnot->getSeq2Beg < 1 ) { $currentAnnot->setSeq2Beg( 1 ); } # TODO Get rid of this once it's in cycle 11 my $length = $currentAnnot->getSeq1End - $currentAnnot->getSeq1Beg + 1; ## following happens sometimes with overlapping simple repeats: if ( $nextAnnot && $currentAnnot->getSeq1Name eq $nextAnnot->getSeq1Name() && $currentAnnot->getSeq1End > $nextAnnot->getSeq1End() ) { $length -= $nextAnnot->getSeq1End() - $nextAnnot->getSeq1Beg() + 1; } if ( $currentAnnot->getClassName =~ /Simple_repeat|Satellite/ ) { $currentAnnot->setClassName( "Low_complexity" ) if ( $currentAnnot->getHitName =~ /rich|purine|pyrimidin/ ); if ( $currentAnnot->getHitName =~ /\(CATTC\)n|\(GAATG\)n/ ) { my $hitname = quotemeta $currentAnnot->getHitName; if ( $currentAnnot->getSeq2End > 200 || ( $pastAnnot && $pastAnnot->getHitName =~ /$hitname|HSATII/ && $currentAnnot->getSeq1Beg - $pastAnnot->getSeq1End < 100 && $currentAnnot->getSeq2End + $pastAnnot->getSeq2End > 200 ) || ( $nextAnnot && $nextAnnot->getSeq2Name() =~ /$hitname|HSATII/ && $nextAnnot->getSeq1Beg() - $currentAnnot->getSeq1End < 100 && ( ( $currentAnnot->getSeq2End + $nextAnnot->getSeq2End() > 200 ) || ( $pastAnnot && $pastAnnot->getHitName eq $currentAnnot->getHitName && $currentAnnot->getSeq2End + $pastAnnot->getSeq2End + $nextAnnot->getSeq2End() > 200 ) ) ) ) { $currentAnnot->setClassName( "Satellite" ); } } $currentAnnot->setHitName( $currentAnnot->getHitName() . '/Alpha' ) if ( $currentAnnot->getHitName eq 'ALR' ); $currentAnnot->setHitName( $currentAnnot->getHitName() . '/Beta' ) if ( $currentAnnot->getHitName eq 'BSR' ); if ( $pastAnnot && $currentAnnot->getSeq1Name eq $pastAnnot->getSeq1Name ) { if ( $pastAnnot->getHitName eq $currentAnnot->getHitName && $pastAnnot->getRevComp eq $currentAnnot->getRevComp && $currentAnnot->getSeq1Beg - $pastAnnot->getSeq1End < 65 ) { $currentAnnot->join( $pastAnnot ); } # TODO: Get rid of this once its in cycle 11 if ( $currentAnnot->getSeq1Beg <= $pastAnnot->getSeq1End && $currentAnnot->getSeq1End > $pastAnnot->getSeq1End ) { $length -= ( $pastAnnot->getSeq1End - $currentAnnot->getSeq1Beg + 1 ); } } if ( $currentAnnot->getClassName =~ /Satellite/ ) { my $tmpClassName = $currentAnnot->getClassName(); $tmpClassName =~ s/[Cc]entromeric$/centr/; $tmpClassName =~ s/telomeric$/telo/; $tmpClassName =~ s/acromeric$/acro/; $currentAnnot->setClassName( $tmpClassName ); } } # if Simple/Sattellite # Do not print (near) duplicates created by fusing overlapping matches # Remove small fragments created by RM fragmentation of alignments. if ( ( $pastAnnot && $currentAnnot->getSeq1Name eq $pastAnnot->getSeq1Name() && $currentAnnot->getSeq1End <= $pastAnnot->getSeq1End() && $currentAnnot->getSW <= $pastAnnot->getSW() && $currentAnnot->getClassName eq $pastAnnot->getClassName() && $currentAnnot->getLastField eq $pastAnnot->getLastField() ) || ( $currentAnnot->getClassName !~ /Simple/ # Removed this constraint per Arian's recommendations #&& ! $currentAnnot->getRightLinkedHit() #&& ! $currentAnnot->getLeftLinkedHit() && $currentAnnot->getSeq1End() - $currentAnnot->getSeq1Beg() < 10 || $currentAnnot->getSeq1End() - $currentAnnot->getSeq1Beg() < 5 ) || # nor low complexity DNA or simple repeats when people don't # want to see it ( $options{'nolow'} && $currentAnnot->getClassName =~ /Low_complexity|Simple_repeat/ ) ) { if ( $DEBUG ) { print STDERR "Near duplicate (created by fusing): Removing Current element:\n"; $currentAnnot->print(); if ( $pastAnnot ) { print STDERR "because of past element:\n"; $pastAnnot->print(); } } $currentAnnot->removeFromJoins(); $cycleAnnotIter->remove(); $pastAnnot->addDerivedFromAnnot( $currentAnnot ) if ( $pastAnnot && $options{'source'} ); next CYCLE10; } $pastAnnot = $currentAnnot; } # CYCLE 10 #printHitArrayList( $sortedAnnotationsList ); &generateOutput( \%options, \%seq1Lengths, $file, $filename, $dbversion, $numSearchedSeqs, $lenSearchedSeqs, $lenSearchedSeqsExcludingXNRuns, $versionmode, $engine, $sortedAnnotationsList, $poundformat ); } # else....if ( ! $poundformat print STDERR "\ndone\n"; } # For each file ## ## ## End of main loops ## ## ######################################################################### ##subroutines ######################################################################### # # replaceRMFragmentChainWithRefinement: # RepeatMasker fragments alignment's that span a ( previously ) # clipped out repeat. The RM refinement process is run prior # to RM fragmentation and therefore creates contiguous re-alignments. # RM does not fragment these re-alignments. If we choose to replace # a RM fragmented alignment with a contiguous re-alignment then # we need to fragment the re-alignment ourselves. This operation # is highly subjective as we do not have the alignment data # accessible to determine the breakpoint location. Here are # a few examples of how we are doing this: # # Given that b1s4i3 joins to the previously joined # RM fragmented repeat b1s6i8: # # +-----------+ # ---b1s4i3---> | | # ---b1s6i8---> -----b1s6i8---> # # and given the relalignment: # # --------[b1s6i8]----------> # # we must fragment [b1s6i8] into something approximating the # original fragments and then replace the original ones. # # # Scenario #1: +-----------+ # | | # RM Frags -----------------------> ----> # Refined ......................../ /....> # # Replace original fragments with new ones: # +-----------+ # | | # .......................> ....> # # # Scenario #2: +-----------+ # | | # -----------------------> ----> # ......................../ /.> # # If fragment coverage is less than 10 than remove fragment: # .......................> # else: # +-----------+ # | | # .......................> ..> # # # # Scenario #2: +-----------+ # | | # -----------------------> ----> # .....................> # # Remove uncovered fragment # .....................> # sub replaceRMFragmentChainWithRefinement { my $rightAnnot = shift; my $rightEquiv = shift; my $DEBUG = 0; my $chainCATID = $rightAnnot->getCATID(); print STDERR "replaceRMFragmentChainWithRefinement: Entered...\n" if ( $DEBUG ); # Make sure we are at the begining of the chain before we attempt to # replace with the refinement annotation. while ( $rightAnnot->getLeftLinkedHit() ) { if ( $rightAnnot->getLeftLinkedHit()->getCATID() eq $chainCATID ) { $rightAnnot = $rightAnnot->getLeftLinkedHit(); } else { last; } } my $newSeq1Width = $rightEquiv->getSeq1End() - $rightEquiv->getSeq1Beg() + 1; my $newConsSize = $rightEquiv->getSeq2End() + $rightEquiv->get3PrimeUnaligned(); my $newConsLeft = $rightEquiv->getSeq2Beg(); my $newConsPerSeq1 = ( $rightEquiv->getSeq2End() - $rightEquiv->getSeq2Beg() + 1 ) / $newSeq1Width; my $currentLinearFragPos = $rightAnnot->getSeq1Beg(); while ( 1 ) { if ( $DEBUG ) { print STDERR " Fix frag starting chain: "; $rightAnnot->print(); print STDERR " With this one: "; $rightEquiv->print(); print STDERR " Current linear frag pos = $currentLinearFragPos\n"; } if ( ( $rightAnnot->getSeq1Beg() <= $rightEquiv->getSeq1Beg() && $rightAnnot->getSeq1End() - $rightEquiv->getSeq1Beg() < 10 ) || $rightEquiv->getSeq1End() - $currentLinearFragPos < 10 ) { print STDERR " Marking as deleted\n" if ( $DEBUG ); # NOTE: This is a bit of a synthetic removal. Since we do not have the # arraylist and it's iterators all we can do is simply alter the # annotation to have a 0 length seq1Length and a silly name like # "DELETE_ME. In subsequent cycles 0 length repeats get removed. $rightAnnot->setSeq1End( $rightAnnot->setSeq1Beg() ); $rightAnnot->setHitName( "DELETE_ME" ); } else { # Fix Seq1End for last fragment if ( $currentLinearFragPos + ( $rightAnnot->getSeq1End() - $rightAnnot->getSeq1Beg() + 1 ) > $rightEquiv->getSeq1End() ) { $rightAnnot->setSeq1End( $rightAnnot->getSeq1Beg() + ( $rightEquiv->getSeq1End() - $currentLinearFragPos ) ); $rightAnnot->setLeftOver( $rightEquiv->getLeftOver() ); } # Fix Seq1Beg for first fragment if ( $rightAnnot->getSeq1Beg() < $rightEquiv->getSeq1Beg() ) { $rightAnnot->setSeq1Beg( $rightEquiv->getSeq1Beg() ); } my $newConsWidth = int( ( $rightAnnot->getSeq1End() - $rightAnnot->getSeq1Beg() + 1 ) * $newConsPerSeq1 ); $rightAnnot->setSeq2Beg( $newConsLeft ); $newConsLeft += $newConsWidth; $newConsLeft = $rightEquiv->getSeq2End() if ( $newConsLeft > $rightEquiv->getSeq2End() ); $rightAnnot->setSeq2End( $newConsLeft ); $rightAnnot->set3PrimeUnaligned( $newConsSize - $newConsLeft ); $newConsLeft++; $rightAnnot->setHitName( $rightEquiv->getHitName() ); $rightAnnot->setSW( $rightEquiv->getSW() ); $rightAnnot->setPctSubst( $rightEquiv->getPctSubst() ); $rightAnnot->setPctDelete( $rightEquiv->getPctDelete() ); $rightAnnot->setPctInsert( $rightEquiv->getPctInsert() ); $rightAnnot->setSeq1Name( $rightEquiv->getSeq1Name() ); $rightAnnot->setRevComp( $rightEquiv->getRevComp() ); $rightAnnot->setClassName( $rightEquiv->getClassName() ); } last if ( !$rightAnnot->getRightLinkedHit() || $rightAnnot->getRightLinkedHit()->getCATID() ne $rightAnnot->getCATID() ); $currentLinearFragPos += ( $rightAnnot->getSeq1End() - $rightAnnot->getSeq1Beg + 1 ); $rightAnnot = $rightAnnot->getRightLinkedHit(); } print STDERR "replaceRMFragmentChainWithRefinement: Leaving...\n" if ( $DEBUG ); } # # getRefinedHighScore: # Given a RM annotation that has associated refinement ( re-alignment ) data, # return the highest re-alignment score in the refinement set. # sub getRefinedHighScore { my $collection = shift; my $highScore = 0; if ( defined( my $refHashEntry = $refinementHash{ $collection->getCATID() } ) ) { foreach my $hitName ( keys %{$refHashEntry} ) { my $refArray = $refHashEntry->{$hitName}; foreach my $hit ( @{$refArray} ) { $highScore = $hit->getSW() if ( $highScore < $hit->getSW() ); } } } return $highScore; } # # getMatchingRefinedEntries: # Given a RM annotation that has associated refinement ( re-alignment ) data, # return the re-alignment(s) for a given consensus ID. NOTE: The consensus # ID is passed in the form of a annotation. Also note that the routine may # not return any match. # sub getMatchingRefinedEntries { my $collection = shift; my $annot = shift; $annot = $collection if ( !defined $annot ); if ( defined( my $refHashEntry = $refinementHash{ $collection->getCATID() } ) ) { if ( defined( my $refArray = $refHashEntry->{ $annot->getHitName() } ) ) { return @{$refArray}; } } return (); } # # getMatchingRefinedEntry: # Given a RM annotation that has associated refinement ( re-alignment ) data, # return the equivalent re-alignment ( same ID as the RM annotation ) annotation # or the highest scoring equivalent if more than one re-alignment annotation is # found. # sub getMatchingRefinedEntry { my $annot = shift; my $match; if ( defined( my $refHashEntry = $refinementHash{ $annot->getCATID() } ) ) { if ( defined( my $refArray = $refHashEntry->{ $annot->getHitName() } ) ) { my $highScore = 0; foreach my $refinedAnnot ( @{$refArray} ) { if ( $refinedAnnot->getSW() > $highScore ) { $highScore = $refinedAnnot->getSW(); $match = $refinedAnnot; } } } } return $match; } # # getEquivArray: # Given a RM annotation that has associated refinement ( re-alignment ) data, # return all re-alignments to all consensi for this annotation. # sub getEquivArray { my $annot = shift; my @annots = (); if ( defined( my $refHashEntry = $refinementHash{ $annot->getCATID() } ) ) { foreach my $key ( keys %{$refHashEntry} ) { push @annots, ( @{ $refHashEntry->{$key} } ); } } return @annots; } # # Final output routine # sub generateOutput { my %options = %{ shift() }; my %seq1Lengths = %{ shift() }; my $file = shift; my $filename = shift; my $dbversion = shift; my $numSearchedSeqs = shift; my $lenSearchedSeqs = shift; my $lenSearchedSeqsExcludingXNRuns = shift; my $versionmode = shift; my $engine = shift; my $sortedAnnotationsList = shift; my $poundformat = shift; print STDERR "\nGenerating output... "; my $i = -1; my $cycleAnnotIter = $sortedAnnotationsList->getIterator(); my $DEBUG = 0; # Local to cycle my $lastid = 0; my $pastAnnot; # Globals created for stats my %totlength = (); my %uniqCount = (); my %checked = (); my $seq_cnt = 0; my $frac_GC = 0; my $totseqlen = 0; my $nonNSeqLen = 0; my $totalSeqLen = 0; my $maskedpercent = 0; my $maskedlength = 0; my $annotatedlength = 0; # Aggregate Stats ( Default match is to "OTHER" ) my %aggregateStats = ( 'SINE' => { 'RE' => 'SINE' }, 'LINE' => { 'RE' => '^LINE' }, 'LINECR1' => { 'RE' => '\/CR1$|\/L2$|\/Rex-Babar$' }, 'LINEI' => { 'RE' => '\/R1$|\/LOA$|\/I$|\/Jockey$' }, 'LINERTE' => { 'RE' => '\/RTE|\/Bov-B' }, 'LINER2' => { 'RE' => '\/NeSL|\/R[24]|\/Dong' }, 'LTR' => { 'RE' => '^LTR' }, 'LTRERV' => { 'RE' => 'ERV|ERVL-MaLR' }, 'LTRBEL' => { 'RE' => 'Pao$' }, 'LTRGYP' => { 'RE' => 'DIRS$|Gypsy$' }, 'DNA' => { 'RE' => '^DNA' }, 'DNATC1' => { 'RE' => '\/TcMar|\/Stowaway' }, 'DNAHAT' => { 'RE' => '\/[hH]AT|Tam3$' }, 'DNAPIG' => { 'RE' => '\/T2$|\/PiggyBac' }, 'DNAH' => { 'RE' => 'Harbing|Tourist' }, 'DNAP' => { 'RE' => '\/P$|\/Mirage|\/Transib' }, 'OTHER' => { 'RE' => '^Unknown|^Composite|^Other' }, 'RNA' => { 'RE' => 'RNA$' }, 'SATEL' => { 'RE' => 'Satel' }, 'SIMPLE' => { 'RE' => 'Simple' }, 'LOWCOMP' => { 'RE' => '^Low' }, ); # calculate stats while ( $cycleAnnotIter->hasNext() ) { $i++; print STDERR "." if ( $i + 1 ) % 1000 == 0; my $currentAnnot = $cycleAnnotIter->next(); # # Stats # my $CN = $currentAnnot->getClassName(); $CN =~ s/\?$//; # I'm counting them with the class, even if I'm not sure if ( !defined $uniqCount{$CN} ) { $uniqCount{$CN} = 0; $totlength{$CN} = 0 if !defined $totlength{$CN}; } # Define length of coverage based on possibly overlapping # fragments. my $length = $currentAnnot->getSeq1End - $currentAnnot->getSeq1Beg + 1; if ( $pastAnnot && $currentAnnot->getSeq1Name eq $pastAnnot->getSeq1Name() && $currentAnnot->getSeq1Beg <= $pastAnnot->getSeq1End && $currentAnnot->getSeq1End > $pastAnnot->getSeq1End ) { $length -= ( $pastAnnot->getSeq1End - $currentAnnot->getSeq1Beg + 1 ); } # Determine if this is fragment of a previously counted element my $checked = 0; if ( $currentAnnot->getLeftLinkedHit() && $currentAnnot->getLeftLinkedHit() != $currentAnnot ) { $checked = 1; } # Class Name Stats $uniqCount{$CN}++ unless $checked; $totlength{$CN} += $length; $annotatedlength += $length; # Aggregate Stats my $cataloged = 0; foreach my $aggKey ( keys( %aggregateStats ) ) { my $regExp = $aggregateStats{$aggKey}->{'RE'}; if ( $CN =~ /$regExp/ ) { $aggregateStats{$aggKey}->{'count'}++ unless $checked; $aggregateStats{$aggKey}->{'length'} += $length; $cataloged = 1; } } if ( !$cataloged ) { $aggregateStats{'OTHER'}->{'count'}++ unless $checked; $aggregateStats{'OTHER'}->{'length'} += $length; } $checked{ $currentAnnot->getID() }++; $pastAnnot = $currentAnnot; } # # Now go through the dataset and renumber the ids # $cycleAnnotIter = $sortedAnnotationsList->getIterator(); my %id = (); while ( $cycleAnnotIter->hasNext() ) { my $currentAnnot = $cycleAnnotIter->next(); # # Build the id{} datastructure for renumbering the annotations # print STDERR "About to renumber\n" if ( $DEBUG ); my $tmpID = $currentAnnot->getID(); $currentAnnot->print() if ( $DEBUG ); if ( $currentAnnot->getLeftLinkedHit() && $currentAnnot->getLeftLinkedHit() != $currentAnnot ) { if ( $DEBUG ) { print STDERR "Left link is:\n"; $currentAnnot->getLeftLinkedHit()->print(); print STDERR "Setting to left's id{ " . $currentAnnot->getLeftLinkedHit()->getID() . " } = " . $id{ $currentAnnot->getLeftLinkedHit()->getID() } . "\n"; } $id{$tmpID} = $id{ $currentAnnot->getLeftLinkedHit()->getID() }; } else { print STDERR "Setting to new number $lastid + 1\n" if ( $DEBUG ); $id{$tmpID} = ++$lastid; } } # End while #printHitArrayList( $sortedAnnotationsList ); ## ## Annotation Output ## # Get column widths my %colWidths = &getMaxColWidths( $sortedAnnotationsList ); # Some option to do this if ( $options{'html'} ) { &printHTMLAnnots( "$file.out.html", \%id, \%seq1Lengths, \%colWidths, $sortedAnnotationsList ); } if ( $options{'a'} ) { &printAlignAnnots( "$file.align", \%id, \%seq1Lengths, \%colWidths, $sortedAnnotationsList ); } # # Print the out, xm, ace, poly and gff files # $cycleAnnotIter = $sortedAnnotationsList->getIterator(); my $headerWritten = 0; my $pastAnnot = undef; open( OUTFULL, ">$file.out" ) || die "can't create $file.out\n"; $options{'xm'} && ( open( OUTXM, ">$file.out.xm" ) || die "can't create $file.out.xm\n" ); $options{'ace'} && ( open( OUTACE, ">$file.out.ace" ) || die "can't create $file.out.ace\n" ); $options{'poly'} && ( open( OUTPOLY, ">$file.polyout" ) || die "can't create $file.out.polyout\n" ); if ( $options{'gff'} ) { open( OUTGFF, ">$file.out.gff" ) || die "can't create $file.out.gff\n"; print OUTGFF "##gff-version 2\n"; printf OUTGFF "##date %4d-%02d-%02d\n", ( localtime )[ 5 ] + 1900, ( localtime )[ 4 ] + 1, ( localtime )[ 3 ]; # date as 1999-09-24... ( my $seqname = $file ) =~ s/^.*\///; print OUTGFF "##sequence-region $seqname\n"; } while ( $cycleAnnotIter->hasNext() ) { my $currentAnnot = $cycleAnnotIter->next(); # Get the next annot my $nextAnnot = undef; if ( $cycleAnnotIter->hasNext ) { $nextAnnot = $cycleAnnotIter->next(); $cycleAnnotIter->previous(); # NOTE: If you are going to remove anything in this # cycle you should move back and forward so # the last thing returned is the previous one. } # # Indicate overlapping sequences in table # my $Overlapped = ""; if ( ( $pastAnnot && $currentAnnot->getSeq1Name eq $pastAnnot->getSeq1Name && $currentAnnot->getSeq1Beg <= $pastAnnot->getSeq1End && $currentAnnot->getSW < $pastAnnot->getSW && $currentAnnot->getLastField eq $pastAnnot->getLastField ) || ( $nextAnnot && $currentAnnot->getSeq1Name eq $nextAnnot->getSeq1Name() && $currentAnnot->getSeq1End >= $nextAnnot->getSeq1Beg() && $currentAnnot->getSW < $nextAnnot->getSW() && $currentAnnot->getLastField eq $nextAnnot->getLastField() ) ) { $Overlapped = "*"; } # format fields my $LeftOver = $seq1Lengths{ $currentAnnot->getSeq1Name } - $currentAnnot->getSeq1End; my $LeftOverPrint = "(" . $LeftOver . ")"; my $Seq2BeginPrint = "(" . $currentAnnot->get3PrimeUnaligned . ")"; my $LeftUnalignedPrint = $currentAnnot->getSeq2Beg(); if ( $currentAnnot->getRevComp eq '+' ) { $Seq2BeginPrint = $currentAnnot->getSeq2Beg(); $LeftUnalignedPrint = "(" . $currentAnnot->get3PrimeUnaligned . ")"; } my $printid = $id{ $currentAnnot->getID() }; if ( $options{'no_id'} ) { $printid = ""; } my $PctSubst = sprintf "%4.1f", $currentAnnot->getPctSubst; my $PctDelete = sprintf "%4.1f", $currentAnnot->getPctDelete; my $PctInsert = sprintf "%4.1f", $currentAnnot->getPctInsert; if ( $options{'xm'} ) { print OUTXM "" . $currentAnnot->getSW() . " " . $PctSubst . " " . $PctDelete . " " . $PctInsert . " " . $currentAnnot->getSeq1Name() . " " . $currentAnnot->getSeq1Beg() . " " . $currentAnnot->getSeq1End() . " " . $LeftOverPrint . " " . $currentAnnot->getRevComp() . " " . $currentAnnot->getHitName() . "\#" . $currentAnnot->getClassName() . " " . $Seq2BeginPrint . " " . $currentAnnot->getSeq2End() . " " . $LeftUnalignedPrint . " " . $Overlapped . "\n"; } if ( $options{'ace'} ) { if ( $currentAnnot->getRevComp eq "C" ) { print OUTACE "Motif_homol \"" . $currentAnnot->getHitName() . "\" \"RepeatMasker\" " . $PctSubst . " " . $currentAnnot->getSeq1Beg() . " " . $currentAnnot->getSeq1End() . " - " . $currentAnnot->getSeq2End() . " " . $currentAnnot->getSeq2Beg() . "\n"; } else { print OUTACE "Motif_homol \"" . $currentAnnot->getHitName() . "\" \"RepeatMasker\" " . $PctSubst . " " . $currentAnnot->getSeq1Beg() . " " . $currentAnnot->getSeq1End() . " + " . $currentAnnot->getSeq2End() . " " . $currentAnnot->getSeq2Beg() . "\n"; } } if ( $options{'poly'} && $currentAnnot->getClassName eq "Simple_repeat" && $PctSubst + $PctDelete + $PctInsert < 10 ) { printf OUTPOLY "%${colWidths{'SW'}}d %${colWidths{'PctSubst'}}s " . "%${colWidths{'PctDelete'}}s " . "%${colWidths{'PctInsert'}}s " . "%-${colWidths{'Seq1Name'}}s " . "%${colWidths{'BeginAlign'}}s " . "%${colWidths{'EndAlign'}}s " . "%${colWidths{'LeftOver'}}s %1s " . "%-${colWidths{'HitName'}}s " . "%-${colWidths{'class'}}s " . "%${colWidths{'Seq2Begin'}}s " . "%${colWidths{'Seq2End'}}s " . "%${colWidths{'LeftUnaligned'}}s %1s\n", $currentAnnot->getSW(), $PctSubst, $PctDelete, $PctInsert, $currentAnnot->getSeq1Name(), $currentAnnot->getSeq1Beg(), $currentAnnot->getSeq1End(), $LeftOverPrint, $currentAnnot->getRevComp(), $currentAnnot->getHitName(), $currentAnnot->getClassName(), $Seq2BeginPrint, $currentAnnot->getSeq2End(), $LeftUnalignedPrint, $Overlapped; } # if $options{'poly'} if ( $options{'gff'} ) { my $source; if ( $currentAnnot->getHitName =~ /Alu/ ) { $source = 'RepeatMasker_SINE'; } else { $source = 'RepeatMasker'; } print OUTGFF "" . $currentAnnot->getSeq1Name() . "\t" . $source . "\tsimilarity\t" . $currentAnnot->getSeq1Beg() . "\t" . $currentAnnot->getSeq1End() . "\t" . $PctSubst . "\t"; if ( $currentAnnot->getRevComp() eq "C" ) { print OUTGFF "-\t.\t"; } else { print OUTGFF "+\t.\t"; } print OUTGFF "Target \"Motif:" . $currentAnnot->getHitName() . "\" " . $currentAnnot->getSeq2Beg() . " " . $currentAnnot->getSeq2End() . "\n"; } if ( !$headerWritten ) { my $widthposquery = $colWidths{'BeginAlign'} + $colWidths{'EndAlign'} + $colWidths{'LeftOver'} + 2; my $widthposrepeat = $colWidths{'Seq2Begin'} + $colWidths{'Seq2End'} + $colWidths{'LeftUnaligned'} + 2; # First line of header printf OUTFULL "%${colWidths{'SW'}}s %${colWidths{'PctSubst'}}s " . "%${colWidths{'PctDelete'}}s " . "%${colWidths{'PctInsert'}}s " . "%-${colWidths{'Seq1Name'}}s " . "%-${widthposquery}s " . "%-${colWidths{'HitName'}}s %-${colWidths{'class'}}s " . "%${widthposrepeat}s\n", 'SW', 'perc', 'perc', 'perc', 'query', 'position in query', 'matching', 'repeat', 'position in repeat'; # Second line of header printf OUTFULL "%${colWidths{'SW'}}s %${colWidths{'PctSubst'}}s " . "%${colWidths{'PctDelete'}}s " . "%${colWidths{'PctInsert'}}s " . "%-${colWidths{'Seq1Name'}}s " . "%-${colWidths{'BeginAlign'}}s " . "%-${colWidths{'EndAlign'}}s " . "%${colWidths{'LeftOver'}}s " . "%-${colWidths{'HitName'}}s " . "%-${colWidths{'class'}}s " . "%-${colWidths{'Seq2Begin'}}s " . "%-${colWidths{'Seq2End'}}s " . "%-${colWidths{'LeftUnaligned'}}s ", 'score', 'div.', 'del.', 'ins.', 'sequence', 'begin', 'end', '(left)', 'repeat', 'class/family', 'begin', 'end', '(left)', 'ID'; unless ( $options{'no_id'} ) { printf OUTFULL "%${colWidths{'ID'}}s", 'ID'; } printf OUTFULL "\n\n"; $headerWritten = 1; } if ( $options{'lcambig'} ) { # Use repeat name case to highlight ambiguous DNA # transposon fragments $currentAnnot->setHitName( uc( $currentAnnot->getHitName() ) ); $currentAnnot->setHitName( lc( $currentAnnot->getHitName() ) ) if ( $currentAnnot->getEquivHash() && $currentAnnot->getClassName() =~ /DNA/ ); } printf OUTFULL "%${colWidths{'SW'}}d %${colWidths{'PctSubst'}}s " . "%${colWidths{'PctDelete'}}s " . "%${colWidths{'PctInsert'}}s " . "%-${colWidths{'Seq1Name'}}s " . "%${colWidths{'BeginAlign'}}s " . "%${colWidths{'EndAlign'}}s " . "%${colWidths{'LeftOver'}}s %1s " . "%-${colWidths{'HitName'}}s " . "%-${colWidths{'class'}}s " . "%${colWidths{'Seq2Begin'}}s " . "%${colWidths{'Seq2End'}}s " . "%${colWidths{'LeftUnaligned'}}s ", $currentAnnot->getSW(), $PctSubst, $PctDelete, $PctInsert, $currentAnnot->getSeq1Name(), $currentAnnot->getSeq1Beg(), $currentAnnot->getSeq1End(), $LeftOverPrint, $currentAnnot->getRevComp(), $currentAnnot->getHitName(), $currentAnnot->getClassName(), $Seq2BeginPrint, $currentAnnot->getSeq2End(), $LeftUnalignedPrint; unless ( $options{'no_id'} ) { printf OUTFULL "%${colWidths{'ID'}}s %1s\n", $printid, $Overlapped; } else { printf OUTFULL "%1s\n", $Overlapped; } ## TESTING ##if ( $alignFileData ) ##{ ##} # Currently only available in html output #if ( $options{'source'} ) { # #print STDERR "Doing source output\n"; # printSourceAnnots( $currentAnnot, \%colWidths, 1 ); #} $pastAnnot = $currentAnnot; } # while loop close OUTFULL; close OUTXM if $options{'xm'}; close OUTACE if $options{'ace'}; close OUTGFF if $options{'gff'}; close OUTPOLY if $options{'poly'}; ## ## Table Output ## # # Now mask so we can get stats on the seq for the table output # # GLOBAL for use in format statements my $usePerc = 1; my $exclnote = ""; if ( -f $options{'maskSource'} ) { print STDERR "\nmasking"; my $db = FastaDB->new( fileName => $options{'maskSource'}, openMode => SeqDBI::ReadOnly, maxIDLength => 50 ); my $maskFormat = ''; $maskFormat = 'x' if ( $options{'x'} ); $maskFormat = 'xsmall' if ( $options{'xsmall'} ); ( $seq_cnt, $totseqlen, $nonNSeqLen, $frac_GC, $maskedlength ) = &maskSequence( $maskFormat, $db, "$file.out", $options{'maskSource'} . ".masked" ); $totalSeqLen = $totseqlen; if ( $options{'excln'} ) { $totseqlen = $nonNSeqLen; $exclnote = "Runs of >=20 X/Ns in query were excluded in % calcs\n"; } $nonNSeqLen = "($nonNSeqLen bp excl N/X-runs)"; $maskedpercent = ( $maskedlength / $totseqlen ) * 100; } elsif ( $numSearchedSeqs ) { $seq_cnt = $numSearchedSeqs; $totseqlen = $lenSearchedSeqs; if ( $options{'excln'} ) { $totseqlen = $lenSearchedSeqsExcludingXNRuns; $exclnote = "Runs of >=20 X/Ns in query were excluded in % calcs\n"; } $totalSeqLen = $totseqlen; $nonNSeqLen = "($lenSearchedSeqsExcludingXNRuns bp excl N/X-runs)"; $maskedlength = $annotatedlength; $maskedpercent = ( $maskedlength / $totseqlen ) * 100; $frac_GC = "Unknown"; } else { warn "\nWarning: Calculation of statistics hampered by missing\n" . "sequence sizes in the input file *and* missing -maskSource\n" . "parameter! Cannot calculate percentage masked values.\n"; $usePerc = 0; $totseqlen = 1; $nonNSeqLen = ""; $maskedpercent = 0; $maskedlength = $annotatedlength; } my $totallength = $aggregateStats{'SINE'}->{'length'} + $aggregateStats{'LINE'}->{'length'} + $aggregateStats{'LTR'}->{'length'} + $aggregateStats{'DNA'}->{'length'} + $aggregateStats{'OTHER'}->{'length'}; my $customLib; if ( $options{'lib'} ) { $options{'lib'} =~ s/.*(\/.*?$)/\.\.\.$1/; # full path too long # obnoxiously, emacs' indentation can't handle that line; # nothing wrong $customLib = "The query was compared to "; if ( $poundformat ) { $customLib .= "classified sequences in "; } else { $customLib .= "unclassified sequences in "; } $customLib = $customLib . "\"" . $options{'lib'} . "\""; } # future aim is to use same format for at least all mammals if ( $options{'primate'} ) { my $OUT; open( $OUT, ">$file.tbl" ) || die "can't create $file.tbl\n"; printf $OUT "==================================================\n"; printf $OUT "file name: %-25s\n", $filename; printf $OUT "sequences: %7d\n", $seq_cnt; printf $OUT "total length: %10d bp %-25s\n", $totalSeqLen, $nonNSeqLen; printf $OUT "GC level: %6s \%\n", $frac_GC; printf $OUT "bases masked: %10d bp ( %4.2f \%)\n", $maskedlength, $maskedpercent; printf $OUT "==================================================\n"; printf $OUT " number of length percentage\n"; printf $OUT " elements* occupied of sequence\n"; printf $OUT "--------------------------------------------------\n"; printf $OUT "SINEs: %6d %10d bp %5.2f \%\n", $aggregateStats{'SINE'}->{'count'}, $aggregateStats{'SINE'}->{'length'}, $aggregateStats{'SINE'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT " ALUs %6d %10d bp %5.2f \%\n", $uniqCount{"SINE/Alu"}, $totlength{"SINE/Alu"}, $totlength{"SINE/Alu"} * 100 * $usePerc / $totseqlen; printf $OUT " MIRs %6d %10d bp %5.2f \%\n", $uniqCount{"SINE/MIR"}, $totlength{"SINE/MIR"}, $totlength{"SINE/MIR"} * 100 * $usePerc / $totseqlen; printf $OUT "\n"; printf $OUT "LINEs: %6d %10d bp %5.2f \%\n", $aggregateStats{'LINE'}->{'count'}, $aggregateStats{'LINE'}->{'length'}, $aggregateStats{'LINE'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT " LINE1 %6d %10d bp %5.2f \%\n", $uniqCount{"LINE/L1"}, $totlength{"LINE/L1"}, $totlength{"LINE/L1"} * 100 * $usePerc / $totseqlen; printf $OUT " LINE2 %6d %10d bp %5.2f \%\n", $uniqCount{"LINE/L2"}, $totlength{"LINE/L2"}, $totlength{"LINE/L2"} * 100 * $usePerc / $totseqlen; printf $OUT " L3/CR1 %6d %10d bp %5.2f \%\n", $uniqCount{"LINE/CR1"}, $totlength{"LINE/CR1"}, $totlength{"LINE/CR1"} * 100 * $usePerc / $totseqlen; printf $OUT "\n"; printf $OUT "LTR elements: %6d %10d bp %5.2f \%\n", $aggregateStats{'LTR'}->{'count'}, $aggregateStats{'LTR'}->{'length'}, $aggregateStats{'LTR'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT " ERVL %6d %10d bp %5.2f \%\n", $uniqCount{"LTR/ERVL"}, $totlength{"LTR/ERVL"}, $totlength{"LTR/ERVL"} * 100 * $usePerc / $totseqlen; printf $OUT " ERVL-MaLRs %6d %10d bp %5.2f \%\n", $uniqCount{"LTR/ERVL-MaLR"}, $totlength{"LTR/ERVL-MaLR"}, $totlength{"LTR/ERVL-MaLR"} * 100 * $usePerc / $totseqlen; printf $OUT " ERV_classI %6d %10d bp %5.2f \%\n", $uniqCount{"LTR/ERV1"}, $totlength{"LTR/ERV1"}, $totlength{"LTR/ERV1"} * 100 * $usePerc / $totseqlen; printf $OUT " ERV_classII%6d %10d bp %5.2f \%\n", $uniqCount{"LTR/ERVK"}, $totlength{"LTR/ERVK"}, $totlength{"LTR/ERVK"} * 100 * $usePerc / $totseqlen; printf $OUT "\n"; printf $OUT "DNA elements: %6d %10d bp %5.2f \%\n", $aggregateStats{'DNA'}->{'count'}, $aggregateStats{'DNA'}->{'length'}, $aggregateStats{'DNA'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT " hAT-Charlie %6d %10d bp %5.2f \%\n", $uniqCount{"DNA/hAT-Charlie"}, $totlength{"DNA/hAT-Charlie"}, $totlength{"DNA/hAT-Charlie"} * 100 * $usePerc / $totseqlen; printf $OUT " TcMar-Tigger%6d %10d bp %5.2f \%\n", $uniqCount{"DNA/TcMar-Tigger"}, $totlength{"DNA/TcMar-Tigger"}, $totlength{"DNA/TcMar-Tigger"} * 100 * $usePerc / $totseqlen; printf $OUT "\n"; printf $OUT "Unclassified: %6d %10d bp %5.2f \%\n", $aggregateStats{'OTHER'}->{'count'}, $aggregateStats{'OTHER'}->{'length'}, $aggregateStats{'OTHER'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT "\n"; printf $OUT "Total interspersed repeats:%9d bp %5.2f \%\n", $totallength, $totallength * 100 * $usePerc / $totseqlen; printf $OUT "\n\n"; printf $OUT "Small RNA: %6d %10d bp %5.2f \%\n", $aggregateStats{'RNA'}->{'count'}, $aggregateStats{'RNA'}->{'length'}, $aggregateStats{'RNA'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT "\n"; printf $OUT "Satellites: %6d %10d bp %5.2f \%\n", $aggregateStats{'SATEL'}->{'count'}, $aggregateStats{'SATEL'}->{'length'}, $aggregateStats{'SATEL'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT "Simple repeats: %6d %10d bp %5.2f \%\n", $aggregateStats{'SIMPLE'}->{'count'}, $aggregateStats{'SIMPLE'}->{'length'}, $aggregateStats{'SIMPLE'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT "Low complexity: %6d %10d bp %5.2f \%\n", $aggregateStats{'LOWCOMP'}->{'count'}, $aggregateStats{'LOWCOMP'}->{'length'}, $aggregateStats{'LOWCOMP'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT "==================================================\n"; printf $OUT "\n"; printf $OUT "* most repeats fragmented by insertions or deletions\n"; printf $OUT " have been counted as one element\n"; printf $OUT "%54s\n", $exclnote; printf $OUT "\n"; printf $OUT "The query species was assumed to be %-14s\n", $options{'species'}; printf $OUT "%-82s\n", substr( $versionmode, 0, 82 ); printf $OUT "$engine\n"; printf $OUT "$customLib\n" if ( $customLib ne "" ); printf $OUT "$dbversion\n"; close $OUT; } elsif ( $options{'mus'} ) { my $OUT; open( $OUT, ">$file.tbl" ) || die "can't create $file.tbl\n"; printf $OUT "==================================================\n"; printf $OUT "file name: %-25s\n", $filename; printf $OUT "sequences: %7d\n", $seq_cnt; printf $OUT "total length: %10d bp %-25s\n", $totalSeqLen, $nonNSeqLen; printf $OUT "GC level: %6s \%\n", $frac_GC; printf $OUT "bases masked: %10d bp ( %4.2f \%)\n", $maskedlength, $maskedpercent; printf $OUT "==================================================\n"; printf $OUT " number of length percentage\n"; printf $OUT " elements* occupied of sequence\n"; printf $OUT "--------------------------------------------------\n"; printf $OUT "SINEs: %6d %10d bp %5.2f \%\n", $aggregateStats{'SINE'}->{'count'}, $aggregateStats{'SINE'}->{'length'}, $aggregateStats{'SINE'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT " Alu/B1 %6d %10d bp %5.2f \%\n", $uniqCount{"SINE/Alu"}, $totlength{"SINE/Alu"}, $totlength{"SINE/Alu"} * 100 * $usePerc / $totseqlen; printf $OUT " B2-B4 %6d %10d bp %5.2f \%\n", $uniqCount{"SINE/B2"} + $uniqCount{"SINE/B4"}, $totlength{"SINE/B2"} + $totlength{"SINE/B4"}, ( $totlength{"SINE/B2"} + $totlength{"SINE/B4"} ) * 100 * $usePerc / $totseqlen; printf $OUT " IDs %6d %10d bp %5.2f \%\n", $uniqCount{"SINE/ID"}, $totlength{"SINE/ID"}, $totlength{"SINE/ID"} * 100 * $usePerc / $totseqlen; printf $OUT " MIRs %6d %10d bp %5.2f \%\n", $uniqCount{"SINE/MIR"}, $totlength{"SINE/MIR"}, $totlength{"SINE/MIR"} * 100 * $usePerc / $totseqlen; printf $OUT "\n"; printf $OUT "LINEs: %6d %10d bp %5.2f \%\n", $aggregateStats{'LINE'}->{'count'}, $aggregateStats{'LINE'}->{'length'}, $aggregateStats{'LINE'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT " LINE1 %6d %10d bp %5.2f \%\n", $uniqCount{"LINE/L1"}, $totlength{"LINE/L1"}, $totlength{"LINE/L1"} * 100 * $usePerc / $totseqlen; printf $OUT " LINE2 %6d %10d bp %5.2f \%\n", $uniqCount{"LINE/L2"}, $totlength{"LINE/L2"}, $totlength{"LINE/L2"} * 100 * $usePerc / $totseqlen; printf $OUT " L3/CR1 %6d %10d bp %5.2f \%\n", $uniqCount{"LINE/CR1"}, $totlength{"LINE/CR1"}, $totlength{"LINE/CR1"} * 100 * $usePerc / $totseqlen; printf $OUT "\n"; printf $OUT "LTR elements: %6d %10d bp %5.2f \%\n", $aggregateStats{'LTR'}->{'count'}, $aggregateStats{'LTR'}->{'length'}, $aggregateStats{'LTR'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT " ERVL %6d %10d bp %5.2f \%\n", $uniqCount{"LTR/ERVL"}, $totlength{"LTR/ERVL"}, $totlength{"LTR/ERVL"} * 100 * $usePerc / $totseqlen; printf $OUT " ERVL-MaLRs %6d %10d bp %5.2f \%\n", $uniqCount{"LTR/ERVL-MaLR"}, $totlength{"LTR/ERVL-MaLR"}, $totlength{"LTR/ERVL-MaLR"} * 100 * $usePerc / $totseqlen; printf $OUT " ERV_classI %6d %10d bp %5.2f \%\n", $uniqCount{"LTR/ERV1"}, $totlength{"LTR/ERV1"}, $totlength{"LTR/ERV1"} * 100 * $usePerc / $totseqlen; printf $OUT " ERV_classII %6d %10d bp %5.2f \%\n", $uniqCount{"LTR/ERVK"}, $totlength{"LTR/ERVK"}, $totlength{"LTR/ERVK"} * 100 * $usePerc / $totseqlen; printf $OUT "\n"; printf $OUT "DNA elements: %6d %10dbp %5.2f \%\n", $aggregateStats{'DNA'}->{'count'}, $aggregateStats{'DNA'}->{'length'}, $aggregateStats{'DNA'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT " hAT-Charlie %6d %10d bp %5.2f \%\n", $uniqCount{"DNA/hAT-Charlie"}, $totlength{"DNA/hAT-Charlie"}, $totlength{"DNA/hAT-Charlie"} * 100 * $usePerc / $totseqlen; printf $OUT " TcMar-Tigger%6d %10d bp %5.2f \%\n", $uniqCount{"DNA/TcMar-Tigger"}, $totlength{"DNA/TcMar-Tigger"}, $totlength{"DNA/TcMar-Tigger"} * 100 * $usePerc / $totseqlen; printf $OUT "\n"; printf $OUT "Unclassified: %6d %10d bp %5.2f \%\n", $aggregateStats{'OTHER'}->{'count'}, $aggregateStats{'OTHER'}->{'length'}, $aggregateStats{'OTHER'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT "\n"; printf $OUT "Total interspersed repeats:%10d bp %5.2f \%\n", $totallength, $totallength * 100 * $usePerc / $totseqlen; printf $OUT "\n\n"; printf $OUT "Small RNA: %6d %10d bp %5.2f \%\n", $aggregateStats{'RNA'}->{'count'}, $aggregateStats{'RNA'}->{'length'}, $aggregateStats{'RNA'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT "\n"; printf $OUT "Satellites: %6d %10d bp %5.2f \%\n", $aggregateStats{'SATEL'}->{'count'}, $aggregateStats{'SATEL'}->{'length'}, $aggregateStats{'SATEL'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT "Simple repeats: %6d %10d bp %5.2f \%\n", $aggregateStats{'SIMPLE'}->{'count'}, $aggregateStats{'SIMPLE'}->{'length'}, $aggregateStats{'SIMPLE'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT "Low complexity: %6d %10d bp %5.2f \%\n", $aggregateStats{'LOWCOMP'}->{'count'}, $aggregateStats{'LOWCOMP'}->{'length'}, $aggregateStats{'LOWCOMP'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT "==================================================\n"; printf $OUT "\n"; printf $OUT "* most repeats fragmented by insertions or deletions\n"; printf $OUT " have been counted as one element\n"; printf $OUT "%54s\n", $exclnote; printf $OUT "\n"; printf $OUT "The query species was assumed to be %-14s\n", $options{'species'}; printf $OUT "%-82s\n", substr( $versionmode, 0, 82 ); printf $OUT "$engine\n"; printf $OUT "$customLib\n" if ( $customLib ne "" ); printf $OUT "$dbversion\n"; close $OUT; } elsif ( $options{'mammal'} ) { my $OUT; open( $OUT, ">$file.tbl" ) || die "can't create $file.tbl\n"; printf $OUT "==================================================\n"; printf $OUT "file name: %-25s\n", $filename; printf $OUT "sequences: %7d\n", $seq_cnt; printf $OUT "total length: %10d bp %-25s\n", $totalSeqLen, $nonNSeqLen; printf $OUT "GC level: %6s \%\n", $frac_GC; printf $OUT "bases masked: %10d bp ( %4.2f \%)\n", $maskedlength, $maskedpercent; printf $OUT "==================================================\n"; printf $OUT " number of length percentage\n"; printf $OUT " elements* occupied of sequence\n"; printf $OUT "--------------------------------------------------\n"; printf $OUT "SINEs: %6d %10d bp %5.2f \%\n", $aggregateStats{'SINE'}->{'count'}, $aggregateStats{'SINE'}->{'length'}, $aggregateStats{'SINE'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT " Alu/B1 %6d %10d bp %5.2f \%\n", $uniqCount{"SINE/Alu"}, $totlength{"SINE/Alu"}, $totlength{"SINE/Alu"} * 100 * $usePerc / $totseqlen; printf $OUT " MIRs %6d %10d bp %5.2f \%\n", $uniqCount{"SINE/MIR"}, $totlength{"SINE/MIR"}, $totlength{"SINE/MIR"} * 100 * $usePerc / $totseqlen; printf $OUT "\n"; printf $OUT "LINEs: %6d %10d bp %5.2f \%\n", $aggregateStats{'LINE'}->{'count'}, $aggregateStats{'LINE'}->{'length'}, $aggregateStats{'LINE'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT " LINE1 %6d %10d bp %5.2f \%\n", $uniqCount{"LINE/L1"}, $totlength{"LINE/L1"}, $totlength{"LINE/L1"} * 100 * $usePerc / $totseqlen; printf $OUT " LINE2 %6d %10d bp %5.2f \%\n", $uniqCount{"LINE/L2"}, $totlength{"LINE/L2"}, $totlength{"LINE/L2"} * 100 * $usePerc / $totseqlen; printf $OUT " L3/CR1 %6d %10d bp %5.2f \%\n", $uniqCount{"LINE/CR1"}, $totlength{"LINE/CR1"}, $totlength{"LINE/CR1"} * 100 * $usePerc / $totseqlen; printf $OUT " RTE %6d %10d bp %5.2f \%\n", $aggregateStats{'LINERTE'}->{'count'}, $aggregateStats{'LINERTE'}->{'length'}, $aggregateStats{'LINERTE'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT "\n"; printf $OUT "LTR elements: %6d %10d bp %5.2f \%\n", $aggregateStats{'LTR'}->{'count'}, $aggregateStats{'LTR'}->{'length'}, $aggregateStats{'LTR'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT " ERVL %6d %10d bp %5.2f \%\n", $uniqCount{"LTR/ERVL"}, $totlength{"LTR/ERVL"}, $totlength{"LTR/ERVL"} * 100 * $usePerc / $totseqlen; printf $OUT " ERVL-MaLRs %6d %10d bp %5.2f \%\n", $uniqCount{"LTR/ERVL-MaLR"}, $totlength{"LTR/ERVL-MaLR"}, $totlength{"LTR/ERVL-MaLR"} * 100 * $usePerc / $totseqlen; printf $OUT " ERV_classI %6d %10d bp %5.2f \%\n", $uniqCount{"LTR/ERV1"}, $totlength{"LTR/ERV1"}, $totlength{"LTR/ERV1"} * 100 * $usePerc / $totseqlen; printf $OUT " ERV_classII %6d %10d bp %5.2f \%\n", $uniqCount{"LTR/ERVK"}, $totlength{"LTR/ERVK"}, $totlength{"LTR/ERVK"} * 100 * $usePerc / $totseqlen; printf $OUT "\n"; printf $OUT "DNA elements: %6d %10d bp %5.2f \%\n", $aggregateStats{'DNA'}->{'count'}, $aggregateStats{'DNA'}->{'length'}, $aggregateStats{'DNA'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT " hAT-Charlie %6d %10d bp %5.2f \%\n", $uniqCount{"DNA/hAT-Charlie"}, $totlength{"DNA/hAT-Charlie"}, $totlength{"DNA/hAT-Charlie"} * 100 * $usePerc / $totseqlen; printf $OUT " TcMar-Tigger%6d %10d bp %5.2f \%\n", $uniqCount{"DNA/TcMar-Tigger"}, $totlength{"DNA/TcMar-Tigger"}, $totlength{"DNA/TcMar-Tigger"} * 100 * $usePerc / $totseqlen; printf $OUT "\n"; printf $OUT "Unclassified: %6d %10d bp %5.2f \%\n", $aggregateStats{'OTHER'}->{'count'}, $aggregateStats{'OTHER'}->{'length'}, $aggregateStats{'OTHER'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT "\n"; printf $OUT "Total interspersed repeats:%10d bp %5.2f \%\n", $totallength, $totallength * 100 * $usePerc / $totseqlen; printf $OUT "\n\n"; printf $OUT "Small RNA: %6d %10d bp %5.2f \%\n", $aggregateStats{'RNA'}->{'count'}, $aggregateStats{'RNA'}->{'length'}, $aggregateStats{'RNA'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT "\n"; printf $OUT "Satellites: %6d %10d bp %5.2f \%\n", $aggregateStats{'SATEL'}->{'count'}, $aggregateStats{'SATEL'}->{'length'}, $aggregateStats{'SATEL'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT "Simple repeats: %6d %10d bp %5.2f \%\n", $aggregateStats{'SIMPLE'}->{'count'}, $aggregateStats{'SIMPLE'}->{'length'}, $aggregateStats{'SIMPLE'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT "Low complexity: %6d %10d bp %5.2f \%\n", $aggregateStats{'LOWCOMP'}->{'count'}, $aggregateStats{'LOWCOMP'}->{'length'}, $aggregateStats{'LOWCOMP'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT "==================================================\n"; printf $OUT "\n"; printf $OUT "* most repeats fragmented by insertions or deletions\n"; printf $OUT " have been counted as one element\n"; printf $OUT "%54s\n", $exclnote; printf $OUT "\n"; printf $OUT "The query species was assumed to be %-14s\n", $options{'species'}; printf $OUT "%-82s\n", substr( $versionmode, 0, 82 ); printf $OUT "$engine\n"; printf $OUT "$customLib\n" if ( $customLib ne "" ); printf $OUT "$dbversion\n"; close $OUT; } else { my $OUT; open( $OUT, ">$file.tbl" ) || die "can't create $file.tbl\n"; printf $OUT "==================================================\n"; printf $OUT "file name: %-25s\n", $filename; printf $OUT "sequences: %7d\n", $seq_cnt; printf $OUT "total length: %10d bp %-25s\n", $totalSeqLen, $nonNSeqLen; printf $OUT "GC level: %6s \%\n", $frac_GC; printf $OUT "bases masked: %10d bp ( %4.2f \%)\n", $maskedlength, $maskedpercent; printf $OUT "==================================================\n"; printf $OUT " number of length percentage\n"; printf $OUT " elements* occupied of sequence\n"; printf $OUT "--------------------------------------------------\n"; printf $OUT "Retroelements %6d %10d bp %5.2f \%\n", $aggregateStats{'LINE'}->{'count'} + $aggregateStats{'SINE'}->{'count'} + $aggregateStats{'LTR'}->{'count'}, $aggregateStats{'LINE'}->{'length'} + $aggregateStats{'SINE'}->{'length'} + $aggregateStats{'LTR'}->{'length'}, ( $aggregateStats{'LINE'}->{'length'} + $aggregateStats{'SINE'}->{'length'} + $aggregateStats{'LTR'}->{'length'} ) * 100 * $usePerc / $totseqlen; printf $OUT " SINEs: %6d %10d bp %5.2f \%\n", $aggregateStats{'SINE'}->{'count'}, $aggregateStats{'SINE'}->{'length'}, $aggregateStats{'SINE'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT " Penelope %6d %10d bp %5.2f \%\n", $uniqCount{"LINE/Penelope"}, $totlength{"LINE/Penelope"}, $totlength{"LINE/Penelope"} * 100 * $usePerc / $totseqlen; printf $OUT " LINEs: %6d %10d bp %5.2f \%\n", $aggregateStats{'LINE'}->{'count'}, $aggregateStats{'LINE'}->{'length'}, $aggregateStats{'LINE'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT " CRE/SLACS %6d %10d bp %5.2f \%\n", $uniqCount{"LINE/CRE"}, $totlength{"LINE/CRE"}, $totlength{"LINE/CRE"} * 100 * $usePerc / $totseqlen; printf $OUT " L2/CR1/Rex %6d %10d bp %5.2f \%\n", $aggregateStats{'LINECR1'}->{'count'}, $aggregateStats{'LINECR1'}->{'length'}, $aggregateStats{'LINECR1'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT " R1/LOA/Jockey %6d %10d bp %5.2f \%\n", $aggregateStats{'LINEI'}->{'count'}, $aggregateStats{'LINEI'}->{'length'}, $aggregateStats{'LINEI'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT " R2/R4/NeSL %6d %10d bp %5.2f \%\n", $aggregateStats{'LINER2'}->{'count'}, $aggregateStats{'LINER2'}->{'length'}, $aggregateStats{'LINER2'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT " RTE/Bov-B %6d %10d bp %5.2f \%\n", $aggregateStats{'LINERTE'}->{'count'}, $aggregateStats{'LINERTE'}->{'length'}, $aggregateStats{'LINERTE'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT " L1/CIN4 %6d %10d bp %5.2f \%\n", $uniqCount{"LINE/L1"}, $totlength{"LINE/L1"}, $totlength{"LINE/L1"} * 100 * $usePerc / $totseqlen; printf $OUT " LTR elements: %6d %10d bp %5.2f \%\n", $aggregateStats{'LTR'}->{'count'}, $aggregateStats{'LTR'}->{'length'}, $aggregateStats{'LTR'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT " BEL/Pao %6d %10d bp %5.2f \%\n", $aggregateStats{'LTRBEL'}->{'count'}, $aggregateStats{'LTRBEL'}->{'length'}, $aggregateStats{'LTRBEL'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT " Ty1/Copia %6d %10d bp %5.2f \%\n", $uniqCount{"LTR/Copia"}, $totlength{"LTR/Copia"}, $totlength{"LTR/Copia"} * 100 * $usePerc / $totseqlen; printf $OUT " Gypsy/DIRS1 %6d %10d bp %5.2f \%\n", $aggregateStats{'LTRGYP'}->{'count'}, $aggregateStats{'LTRGYP'}->{'length'}, $aggregateStats{'LTRGYP'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT " Retroviral %6d %10d bp %5.2f \%\n", $aggregateStats{'LTRERV'}->{'count'}, $aggregateStats{'LTRERV'}->{'length'}, $aggregateStats{'LTRERV'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT "\n"; printf $OUT "DNA transposons %6d %10d bp %5.2f \%\n", $aggregateStats{'DNA'}->{'count'}, $aggregateStats{'DNA'}->{'length'}, $aggregateStats{'DNA'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT " hobo-Activator %6d %10d bp %5.2f \%\n", $aggregateStats{'DNAHAT'}->{'count'}, $aggregateStats{'DNAHAT'}->{'length'}, $aggregateStats{'DNAHAT'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT " Tc1-IS630-Pogo %6d %10d bp %5.2f \%\n", $aggregateStats{'DNATC1'}->{'count'}, $aggregateStats{'DNATC1'}->{'length'}, $aggregateStats{'DNATC1'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT " En-Spm %6d %10d bp %5.2f \%\n", $uniqCount{"DNA/En-Spm"}, $totlength{"DNA/En-Spm"}, $totlength{"DNA/En-Spm"} * 100 * $usePerc / $totseqlen; printf $OUT " MuDR-IS905 %6d %10d bp %5.2f \%\n", $uniqCount{"DNA/MuDR"}, $totlength{"DNA/MuDR"}, $totlength{"DNA/MuDR"} * 100 * $usePerc / $totseqlen; printf $OUT " PiggyBac %6d %10d bp %5.2f \%\n", $aggregateStats{'DNAPIG'}->{'count'}, $aggregateStats{'DNAPIG'}->{'length'}, $aggregateStats{'DNAPIG'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT " Tourist/Harbinger%6d %10d bp %5.2f \%\n", $aggregateStats{'DNAH'}->{'count'}, $aggregateStats{'DNAH'}->{'length'}, $aggregateStats{'DNAH'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT " Other (Mirage, %6d %10d bp %5.2f \%\n", $aggregateStats{'DNAP'}->{'count'}, $aggregateStats{'DNAP'}->{'length'}, $aggregateStats{'DNAP'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT " P-element, Transib)\n"; printf $OUT "\n"; printf $OUT "Rolling-circles %6d %10d bp %5.2f \%\n", $uniqCount{"RC/Helicop"}, $totlength{"RC/Helicop"}, $totlength{"RC/Helicop"} * 100 * $usePerc / $totseqlen; printf $OUT "\n"; printf $OUT "Unclassified: %6d %10d bp %5.2f \%\n", $aggregateStats{'OTHER'}->{'count'}, $aggregateStats{'OTHER'}->{'length'}, $aggregateStats{'OTHER'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT "\n"; printf $OUT "Total interspersed repeats: %10d bp %5.2f \%\n", $totallength, $totallength * 100 * $usePerc / $totseqlen; printf $OUT "\n\n"; printf $OUT "Small RNA: %6d %10d bp %5.2f \%\n", $aggregateStats{'RNA'}->{'count'}, $aggregateStats{'RNA'}->{'length'}, $aggregateStats{'RNA'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT "\n"; printf $OUT "Satellites: %6d %10d bp %5.2f \%\n", $aggregateStats{'SATEL'}->{'count'}, $aggregateStats{'SATEL'}->{'length'}, $aggregateStats{'SATEL'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT "Simple repeats: %6d %10d bp %5.2f \%\n", $aggregateStats{'SIMPLE'}->{'count'}, $aggregateStats{'SIMPLE'}->{'length'}, $aggregateStats{'SIMPLE'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT "Low complexity: %6d %10d bp %5.2f \%\n", $aggregateStats{'LOWCOMP'}->{'count'}, $aggregateStats{'LOWCOMP'}->{'length'}, $aggregateStats{'LOWCOMP'}->{'length'} * 100 * $usePerc / $totseqlen; printf $OUT "==================================================\n"; printf $OUT "\n"; printf $OUT "* most repeats fragmented by insertions or deletions\n"; printf $OUT " have been counted as one element\n"; printf $OUT "%54s\n", $exclnote; printf $OUT "\n"; printf $OUT "The query species was assumed to be %-14s\n", $options{'species'}; printf $OUT "%-82s\n", substr( $versionmode, 0, 82 ); printf $OUT "$engine\n"; printf $OUT "$customLib\n" if ( $customLib ne "" ); printf $OUT "$dbversion\n"; close $OUT; } } # sub generateOutput(); ##-------------------------------------------------------------------------## ## Use: my ( $chainBegRef, $chainEndRef ) = ## &cycleReJoin( $sortedAnnotationsList ); ## ## $sortedAnnotationsList : Annotations to be processed ## ## Join fragments broken up by RepeatMasker's clipping strategy. ## The signature ( currently ) for these fragments is: ## ## +-----+ +-------+ ## | | | | ## ------ ------- -------- ## F1 F2 F3 ## ## SW PctSub PctDel PctIns ## F1 = 100 10% 3% 2% ## F2 = 100 10% 3% 2% ## F3 = 323 15% 2% 7% ## ## Fragments which come from the same alignment will have ## the same Score, PctSub, PctDel, and PctIns. Other fragments ## may be linked but do not come from the same original alignment. ## ## Returns ## Two references to hashes. The hashes store the seq2beg/end for ## each ID ie. $chainBegRef->{ID} = #. These data structures are ## currently used in cycles 2 & 3. ## ## Also now sets setRMAlignQueryLen() for each hit. The returned ## hashes should be also be made into Hit properties. ## ##-------------------------------------------------------------------------## sub cycleReJoin { my $sortedAnnotationsList = shift; $sortedAnnotationsList->sort( \&bySeqSWConbegin ); my $cycleAnnotIter = $sortedAnnotationsList->getIterator(); my %chainSeq2Beg = (); my %chainSeq2End = (); my $i = -1; while ( $cycleAnnotIter->hasNext() ) { $i++; $DEBUG = 0; print STDERR "." if ( $i + 1 ) % 1000 == 0; my $currentAnnot = $cycleAnnotIter->next(); next if ( ( $currentAnnot->getLeftLinkedHit() && $currentAnnot->getLeftLinkedHit() != $currentAnnot ) || ( $currentAnnot->getRightLinkedHit() && $currentAnnot->getRightLinkedHit() != $currentAnnot ) ); # Simple and Low Complexity repeats should not be rejoined. # There are often large gaps and misalignments which do not # necessary indicate that the fragments should be part of # one alignment. I.e consider two alus side by side with # long poly-A tails. In some cases the poly-A tails will be # joined as one alignment when in fact they are two independent # regions. next if ( $currentAnnot->getClassName() =~ /Simple|Low_/ ); # # Join fragments artificially broken up by repeatmasker # # Rules: Same HitName # Same Sequence # Same Score, divg's # Cons Boundaries match my $proxIter = $cycleAnnotIter->getIterator(); my @compatIDs = ( $currentAnnot ); my $lastSeq2Begin = $currentAnnot->getSeq2Beg(); my $lastSeq2End = $currentAnnot->getSeq2End(); my $combinedSeq1Length = $currentAnnot->getSeq1End() - $currentAnnot->getSeq1Beg() + 1; my $lowestSeq2Pos = $lastSeq2Begin; my $lastAnnot = $currentAnnot; $lowestSeq2Pos = $lastSeq2End if ( $lowestSeq2Pos > $lastSeq2End ); my $highestSeq2Pos = $lastSeq2Begin; $highestSeq2Pos = $lastSeq2End if ( $highestSeq2Pos < $lastSeq2End ); my $contained = 0; while ( $proxIter->hasNext() ) { my $nextAnnot = $proxIter->next(); if ( $nextAnnot->containsElement( $currentAnnot ) ) { $contained = 1; } last unless ( $currentAnnot->getSeq1Name() eq $nextAnnot->getSeq1Name() && $nextAnnot->getSW() == $currentAnnot->getSW() ); my $nextHitName = $nextAnnot->getHitName(); my $nextClassName = $nextAnnot->getClassName(); # Note: Currently annotations in RepeatMasker may skip reporting # segments of alignments between clipped out markers "x"s which are # smaller than 5 bp long. I upped the tolerance here to handle # these separated fragments. if ( $nextHitName eq $currentAnnot->getHitName() && !$nextAnnot->getLeftLinkedHit() && !$nextAnnot->getRightLinkedHit() && $nextAnnot->getLastField() == $currentAnnot->getLastField() && $nextAnnot->getPctSubst() == $currentAnnot->getPctSubst() && $nextAnnot->getPctInsert() == $currentAnnot->getPctInsert() && $nextAnnot->getPctDelete() == $currentAnnot->getPctDelete() && ( abs( $nextAnnot->getSeq2Beg() - $lastSeq2End ) < 5 || # I don't think I need this or abs( $nextAnnot->getSeq2End() - $lastSeq2Begin ) < 5 ) ) { if ( $DEBUG ) { print STDERR "Joining RepeatMasker fragmented alignment:\n"; $currentAnnot->print(); print STDERR " because of next:\n"; $nextAnnot->print(); } if ( $contained && $DEBUG ) { print STDERR "\n\n\n\n\n\nWARNING WARNING -- containment " . "breached!\n\n\n\n\n\n"; } $lastAnnot->join( $nextAnnot ); if ( $DEBUG ) { if ( $currentAnnot->getHitName() eq "AluSx" && $currentAnnot->getRevComp eq "C" ) { print STDERR "Special!\n"; print STDERR "nexts right partner:\n"; $nextAnnot->getRightLinkedHit()->print(); print STDERR "currents left partner:\n"; $currentAnnot->getLeftLinkedHit()->print(); } } push @compatIDs, $nextAnnot; $lastAnnot = $nextAnnot; $combinedSeq1Length += $nextAnnot->getSeq1End() - $nextAnnot->getSeq1Beg() + 1; $lastSeq2Begin = $nextAnnot->getSeq2Beg(); $lastSeq2End = $nextAnnot->getSeq2End(); $lowestSeq2Pos = $nextAnnot->getSeq2Beg() if ( $lowestSeq2Pos > $nextAnnot->getSeq2Beg() ); $lowestSeq2Pos = $nextAnnot->getSeq2End() if ( $lowestSeq2Pos > $nextAnnot->getSeq2End() ); $highestSeq2Pos = $nextAnnot->getSeq2Beg() if ( $highestSeq2Pos < $nextAnnot->getSeq2Beg() ); $highestSeq2Pos = $nextAnnot->getSeq2End() if ( $highestSeq2Pos < $nextAnnot->getSeq2End() ); } } if ( @compatIDs > 1 ) { if ( $DEBUG ) { print STDERR "Chain range = $lowestSeq2Pos - $highestSeq2Pos\n"; } foreach my $annot ( @compatIDs ) { print STDERR " Setting: chainSeq2Beg/End{ " . $annot->getID() . " }\n" if ( $DEBUG ); $chainSeq2Beg{ $annot->getID() } = $lowestSeq2Pos; $chainSeq2End{ $annot->getID() } = $highestSeq2Pos; $annot->setRMAlignQueryLen( $combinedSeq1Length ); } } else { my $singleAnnot = shift @compatIDs; if ( $DEBUG ) { print STDERR "Adding singleton = " . ( $currentAnnot->getSeq2Beg() - $currentAnnot->getSeq2End() ) . "ID=" . $currentAnnot->getID() . "\n"; $currentAnnot->print(); } $chainSeq2Beg{ $singleAnnot->getID() } = $currentAnnot->getSeq2Beg(); $chainSeq2End{ $singleAnnot->getID() } = $currentAnnot->getSeq2End(); $singleAnnot->setRMAlignQueryLen( $currentAnnot->getSeq1End() - $currentAnnot->getSeq1Beg() + 1 ); } } return ( \%chainSeq2Beg, \%chainSeq2End ); } sub printAlignAnnots { my $fileName = shift; my $id = shift; my $seq1lengths = shift; my %colWidths = %{ shift() }; my $annots = shift; my $ALIGNOUT = new FileHandle; open $ALIGNOUT, ">$fileName" || die "Cannot open $fileName: $!\n"; # # Print the align file. # my $cycleAnnotIter = $annots->getIterator(); my $pastAnnot = undef; while ( $cycleAnnotIter->hasNext() ) { my $currentAnnot = $cycleAnnotIter->next(); # Get the next annot my $nextAnnot = undef; if ( $cycleAnnotIter->hasNext ) { $nextAnnot = $cycleAnnotIter->next(); $cycleAnnotIter->previous(); } # # Indicate overlapping sequences in table # my $Overlapped = ""; if ( ( $pastAnnot && $currentAnnot->getSeq1Name eq $pastAnnot->getSeq1Name && $currentAnnot->getSeq1Beg <= $pastAnnot->getSeq1End && $currentAnnot->getSW < $pastAnnot->getSW && $currentAnnot->getLastField eq $pastAnnot->getLastField ) || ( $nextAnnot && $currentAnnot->getSeq1Name eq $nextAnnot->getSeq1Name() && $currentAnnot->getSeq1End >= $nextAnnot->getSeq1Beg() && $currentAnnot->getSW < $nextAnnot->getSW() && $currentAnnot->getLastField eq $nextAnnot->getLastField() ) ) { $Overlapped = "*"; } # format fields my $LeftOver = $seq1lengths->{ $currentAnnot->getSeq1Name } - $currentAnnot->getSeq1End; my $LeftOverPrint = "(" . $LeftOver . ")"; my $Seq2BeginPrint = "(" . $currentAnnot->get3PrimeUnaligned . ")"; my $LeftUnalignedPrint = $currentAnnot->getSeq2Beg(); if ( $currentAnnot->getRevComp eq '+' ) { $Seq2BeginPrint = $currentAnnot->getSeq2Beg(); $LeftUnalignedPrint = "(" . $currentAnnot->get3PrimeUnaligned . ")"; } my $printid = $id->{ $currentAnnot->getID() }; my $PctSubst = sprintf "%4.1f", $currentAnnot->getPctSubst; my $PctDelete = sprintf "%4.1f", $currentAnnot->getPctDelete; my $PctInsert = sprintf "%4.1f", $currentAnnot->getPctInsert; if ( $options{'lcambig'} ) { # Use repeat name case to highlight ambiguous DNA # transposon fragments $currentAnnot->setHitName( uc( $currentAnnot->getHitName() ) ); $currentAnnot->setHitName( lc( $currentAnnot->getHitName() ) ) if ( $currentAnnot->getEquivHash() && $currentAnnot->getClassName() =~ /DNA/ ); } if ( 0 ) { print $ALIGNOUT " "; printf $ALIGNOUT "%${colWidths{'SW'}}d %${colWidths{'PctSubst'}}s " . "%${colWidths{'PctDelete'}}s " . "%${colWidths{'PctInsert'}}s " . "%-${colWidths{'Seq1Name'}}s " . "%${colWidths{'BeginAlign'}}s " . "%${colWidths{'EndAlign'}}s " . "%${colWidths{'LeftOver'}}s %1s " . $currentAnnot->getHitName() . "%-${colWidths{'HitName'}}s " . "%-${colWidths{'class'}}s " . "%${colWidths{'Seq2Begin'}}s " . "%${colWidths{'Seq2End'}}s " . "%${colWidths{'LeftUnaligned'}}s ", $currentAnnot->getSW(), $PctSubst, $PctDelete, $PctInsert, $currentAnnot->getSeq1Name(), $currentAnnot->getSeq1Beg(), $currentAnnot->getSeq1End(), $LeftOverPrint, $currentAnnot->getRevComp(), $currentAnnot->getHitName(), $currentAnnot->getClassName(), $Seq2BeginPrint, $currentAnnot->getSeq2End(), $LeftUnalignedPrint; printf $ALIGNOUT "%${colWidths{'ID'}}s %1s\n", $printid, $Overlapped; } if ( defined $currentAnnot->getDerivedFromAnnot() ) { #print $ALIGNOUT "ANNOTATION EVIDENCE:\n"; #&printSourceAlignments( $ALIGNOUT, $currentAnnot, \%colWidths, 0, " " ); &printSourceAlignments( $ALIGNOUT, $currentAnnot, \%colWidths, 0, $printid ); } print $ALIGNOUT "\n"; $pastAnnot = $currentAnnot; } # while loop close $ALIGNOUT; } # sub printAlignAnnots sub printHTMLAnnots { my $fileName = shift; my $id = shift; my $seq1lengths = shift; my %colWidths = %{ shift() }; my $annots = shift; my $HTMLOUT = new FileHandle; my $hdrTmplFile = "$FindBin::RealBin/HTMLAnnotHeader.html"; open $HTMLOUT, ">$fileName" || die "Cannot open $fileName\n"; open HDR, "<$hdrTmplFile" || die "printHTMLAnnots(): Cannot open header template $hdrTmplFile\n"; while ( ) { print $HTMLOUT $_; } close HDR; my $widthposquery = $colWidths{'BeginAlign'} + $colWidths{'EndAlign'} + $colWidths{'LeftOver'} + 2; my $widthposrepeat = $colWidths{'Seq2Begin'} + $colWidths{'Seq2End'} + $colWidths{'LeftUnaligned'} + 2; print $HTMLOUT "
\n"; print $HTMLOUT "
\n"; print $HTMLOUT "
\n";

  # TODO:
  # Calculate nesting depth and include as parameter to CSS/Javascript

  #
  # Print Header
  #
  #   The header ( and all annotations lines for that matter ) are printed
  #   as preformatted ( 
 ) text in the HTML file.  It is important
  #   that the correct number of spaces are used between column headings
  #   in order to match the data.
  #
  # First line of header
  my $hdrLine =
      " " x (
       9 + $colWidths{'SW'} + $colWidths{'PctSubst'} + $colWidths{'PctDelete'} +
           $colWidths{'PctInsert'} + $colWidths{'Seq1Name'} )
      . &fmtField(
                   2 + $colWidths{'BeginAlign'} + $colWidths{'EndAlign'} +
                       $colWidths{'LeftOver'},
                   'position in query',
                   "C",
                   "-"
      )
      . " " x ( 5 + $colWidths{'HitName'} + $colWidths{'class'} )
      . &fmtField(
                   2 + $colWidths{'Seq2Begin'} + $colWidths{'Seq2End'} +
                       $colWidths{'LeftUnaligned'},
                   'position in repeat', "C", "-"
      )
      . "\n";

  print $HTMLOUT $hdrLine;

  # Second line of header
  $hdrLine =
        " " x ( 3 + $colWidths{'SW'} )
      . ''
      . &fmtField( $colWidths{'PctSubst'}, "%", "C" )
      . "" . " "
      . ''
      . &fmtField( $colWidths{'PctDelete'}, "%", "C" )
      . "" . " "
      . ''
      . &fmtField( $colWidths{'PctInsert'}, "%", "C" )
      . "" . "  "
      . ''
      . &fmtField( $colWidths{'Seq1Name'}, "query", "L" )
      . ""
      . " " x ( 5 + $colWidths{'BeginAlign'} + $colWidths{'EndAlign'} +
                $colWidths{'LeftOver'} )
      . '' . "C"
      . "" . " "
      . ''
      . &fmtField( $colWidths{'HitName'}, "matching", "L" )
      . "" . " "
      . ''
      . &fmtField( $colWidths{'class'}, "repeat", "L" )
      . "" . " "
      . ''
      . &fmtField( $colWidths{'Seq2Begin'}, "(left)", "C" )
      . "" . " "
      . ''
      . &fmtField( $colWidths{'Seq2End'}, "end", "C" )
      . "" . " "
      . ''
      . &fmtField( $colWidths{'LeftUnaligned'}, "begin", "C" )
      . "" . " "
      . ''
      . "linkage"
      . "\n";

  print $HTMLOUT $hdrLine;

  # Third line of header
  $hdrLine =
''
      . "+"
      . "" . " "
      . ''
      . &fmtField( $colWidths{'SW'}, "score", "L" )
      . "" . " "
      . ''
      . &fmtField( $colWidths{'PctSubst'}, "div.", "R" )
      . "" . " "
      . ''
      . &fmtField( $colWidths{'PctDelete'}, "del.", "R" )
      . "" . " "
      . ''
      . &fmtField( $colWidths{'PctInsert'}, "ins.", "R" )
      . "" . "  "
      . ''
      . &fmtField( $colWidths{'Seq1Name'}, "sequence", "L" )
      . "" . "  "
      . ''
      . &fmtField( $colWidths{'BeginAlign'}, "begin", "C" )
      . "" . " "
      . ''
      . &fmtField( $colWidths{'EndAlign'}, "end", "C" )
      . "" . " "
      . ''
      . &fmtField( $colWidths{'LeftOver'}, "(left)", "C" )
      . "" . " "
      . '' . "+"
      . "" . " "
      . ''
      . &fmtField( $colWidths{'HitName'}, "repeat", "L" )
      . "" . " "
      . ''
      . &fmtField( $colWidths{'class'}, "class/family", "L" )
      . "" . " "
      . ''
      . &fmtField( $colWidths{'Seq2Begin'}, "begin", "C" )
      . "" . " "
      . ''
      . &fmtField( $colWidths{'Seq2End'}, "end", "C" )
      . "" . " "
      . ''
      . &fmtField( $colWidths{'LeftUnaligned'}, "(left)", "C" )
      . "" . " "
      . ''
      . "id/graphic"
      . "\n";

  print $HTMLOUT $hdrLine;

  # First line of header
  #printf $HTMLOUT " %${colWidths{'SW'}}s  %${colWidths{'PctSubst'}}s "
  #     . "%${colWidths{'PctDelete'}}s "
  #     . "%${colWidths{'PctInsert'}}s  "
  #     . "%-${colWidths{'Seq1Name'}}s  "
  #     . "%-${widthposquery}s   "
  #     . "%-${colWidths{'HitName'}}s %-${colWidths{'class'}}s "
  #     . "%${widthposrepeat}s %s\n", 'SW', 'perc', 'perc', 'perc', 'query',
  #       position in query', 'matching', 'repeat', 'position in repeat',
  #       ' linkage';

# Second line of header
#printf $HTMLOUT " %${colWidths{'SW'}}s  %${colWidths{'PctSubst'}}s "
#     . "%${colWidths{'PctDelete'}}s "
#     . "%${colWidths{'PctInsert'}}s  "
#     . "%-${colWidths{'Seq1Name'}}s  "
#     . "%-${colWidths{'BeginAlign'}}s "
#     . "%-${colWidths{'EndAlign'}}s "
#     . "%${colWidths{'LeftOver'}}s   "
#     . "%-${colWidths{'HitName'}}s "
#     . "%-${colWidths{'class'}}s "
#     . "%-${colWidths{'Seq2Begin'}}s "
#     . "%-${colWidths{'Seq2End'}}s "
#     . "%-${colWidths{'LeftUnaligned'}}s %${colWidths{'ID'}}s", 'score', 'div.', 'del.',
#     'ins.', 'sequence', 'begin', 'end', '(left)', 'repeat',
#     'class/family', 'begin', 'end', '(left)', 'id / graphic';

  print $HTMLOUT "
\n"; print $HTMLOUT "
\n"; # # Print the out file. # my $cycleAnnotIter = $annots->getIterator(); my $pastAnnot = undef; my $blueDivToggle = 0; my $hspsID = 1; while ( $cycleAnnotIter->hasNext() ) { my $currentAnnot = $cycleAnnotIter->next(); # Get the next annot my $nextAnnot = undef; if ( $cycleAnnotIter->hasNext ) { $nextAnnot = $cycleAnnotIter->next(); $cycleAnnotIter->previous(); # NOTE: If you are going to remove anything in this # cycle you should move back and forward so # the last thing returned is the previous one. } # # Indicate overlapping sequences in table # my $Overlapped = ""; if ( ( $pastAnnot && $currentAnnot->getSeq1Name eq $pastAnnot->getSeq1Name && $currentAnnot->getSeq1Beg <= $pastAnnot->getSeq1End && $currentAnnot->getSW < $pastAnnot->getSW && $currentAnnot->getLastField eq $pastAnnot->getLastField ) || ( $nextAnnot && $currentAnnot->getSeq1Name eq $nextAnnot->getSeq1Name() && $currentAnnot->getSeq1End >= $nextAnnot->getSeq1Beg() && $currentAnnot->getSW < $nextAnnot->getSW() && $currentAnnot->getLastField eq $nextAnnot->getLastField() ) ) { $Overlapped = "*"; } # format fields my $LeftOver = $seq1lengths->{ $currentAnnot->getSeq1Name } - $currentAnnot->getSeq1End; my $LeftOverPrint = "(" . $LeftOver . ")"; my $Seq2BeginPrint = "(" . $currentAnnot->get3PrimeUnaligned . ")"; my $LeftUnalignedPrint = $currentAnnot->getSeq2Beg(); if ( $currentAnnot->getRevComp eq '+' ) { $Seq2BeginPrint = $currentAnnot->getSeq2Beg(); $LeftUnalignedPrint = "(" . $currentAnnot->get3PrimeUnaligned . ")"; } my $printid = $id->{ $currentAnnot->getID() }; my $PctSubst = sprintf "%4.1f", $currentAnnot->getPctSubst; my $PctDelete = sprintf "%4.1f", $currentAnnot->getPctDelete; my $PctInsert = sprintf "%4.1f", $currentAnnot->getPctInsert; if ( $options{'lcambig'} ) { # Use repeat name case to highlight ambiguous DNA # transposon fragments $currentAnnot->setHitName( uc( $currentAnnot->getHitName() ) ); $currentAnnot->setHitName( lc( $currentAnnot->getHitName() ) ) if ( $currentAnnot->getEquivHash() && $currentAnnot->getClassName() =~ /DNA/ ); } if ( !$currentAnnot->getLeftLinkedHit() ) { print $HTMLOUT "
\n"; } if ( $blueDivToggle ) { print $HTMLOUT "
"; } print $HTMLOUT "
";

    if ( defined $currentAnnot->getDerivedFromAnnot() ) {

      # print link
      print $HTMLOUT
"+";
    }
    else {
      print $HTMLOUT " ";
    }

    print $HTMLOUT "";
    printf $HTMLOUT "%${colWidths{'SW'}}d  %${colWidths{'PctSubst'}}s "
        . "%${colWidths{'PctDelete'}}s "
        . "%${colWidths{'PctInsert'}}s  "
        . "%-${colWidths{'Seq1Name'}}s  "
        . "%${colWidths{'BeginAlign'}}s "
        . "%${colWidths{'EndAlign'}}s "
        . "%${colWidths{'LeftOver'}}s %1s "
        . ""
        . "%-${colWidths{'HitName'}}s "
        . "%-${colWidths{'class'}}s "
        . "%${colWidths{'Seq2Begin'}}s "
        . "%${colWidths{'Seq2End'}}s "
        . "%${colWidths{'LeftUnaligned'}}s ", $currentAnnot->getSW(), $PctSubst,
        $PctDelete, $PctInsert, $currentAnnot->getSeq1Name(),
        $currentAnnot->getSeq1Beg(), $currentAnnot->getSeq1End(),
        $LeftOverPrint, $currentAnnot->getRevComp(),
        $currentAnnot->getHitName(), $currentAnnot->getClassName(),
        $Seq2BeginPrint, $currentAnnot->getSeq2End(), $LeftUnalignedPrint;

    printf $HTMLOUT "%${colWidths{'ID'}}s %1s", $printid, $Overlapped;

    print $HTMLOUT "
"; if ( defined $currentAnnot->getDerivedFromAnnot() ) { print $HTMLOUT "
\n"; print $HTMLOUT "
\n";
      print $HTMLOUT "ANNOTATION EVIDENCE:\n";
      &printSourceAlignments( $HTMLOUT, $currentAnnot, \%colWidths, 1, " " );
      print $HTMLOUT "
\n
\n"; $hspsID++; } if ( $blueDivToggle ) { print $HTMLOUT "
"; } $blueDivToggle ^= 1; print $HTMLOUT "\n"; if ( !$currentAnnot->getRightLinkedHit() ) { print $HTMLOUT "
\n"; } $pastAnnot = $currentAnnot; } # while loop close $HTMLOUT; } # sub printHTMLAnnots sub fmtField { my $fldWidth = shift; my $string = shift; my $just = shift; my $pad = shift; my $retStr = ""; if ( length( $string ) > $fldWidth ) { # Must truncate if ( defined $just && $just =~ /C/i ) { $retStr = substr( $string, int( ( length( $string ) - $fldWidth ) / 2 ), $fldWidth ); } elsif ( defined $just && $just =~ /R/i ) { $retStr = substr( $string, length( $string ) - $fldWidth, $fldWidth ); } else { $retStr = substr( $string, 0, $fldWidth ); } return $retStr; } my $padChar = " "; if ( defined $pad ) { $padChar = $pad; } # Must pad if ( defined $just && $just =~ /C/i ) { my $padPerSide = ( $fldWidth - length( $string ) ) / 2; if ( int( $padPerSide ) != $padPerSide ) { $retStr = $padChar x ( int( $padPerSide ) ) . $string . $padChar x ( int( $padPerSide ) + 1 ); } else { $retStr = $padChar x ( int( $padPerSide ) ) . $string . $padChar x ( int( $padPerSide ) ); } } elsif ( defined $just && $just =~ /R/i ) { $retStr = $padChar x ( $fldWidth - length( $string ) ) . $string; } else { $retStr = $string . $padChar x ( $fldWidth - length( $string ) ); } return $retStr; } sub printSourceAlignments { my $FILE = shift; my $annot = shift; my $widthsRef = shift; my $html = shift; my $fixedID = shift; my %colWidths = undef; %colWidths = %{$widthsRef} if ( defined $widthsRef ); if ( !defined $annot->getDerivedFromAnnot() ) { # Allow ID to be overriden. my $displayID = $annot->getID(); $displayID = $fixedID if ( $fixedID ); # print if ( defined $widthsRef ) { if ( $html ) { printf $FILE "%${colWidths{'SW'}}d %${colWidths{'PctSubst'}}s " . "%${colWidths{'PctDelete'}}s " . "%${colWidths{'PctInsert'}}s " . "%-${colWidths{'Seq1Name'}}s " . "%${colWidths{'BeginAlign'}}s " . "%${colWidths{'EndAlign'}}s " . "%${colWidths{'LeftOver'}}s %1s " . "" . "%-${colWidths{'HitName'}}s " . "%-${colWidths{'class'}}s " . "%${colWidths{'Seq2Begin'}}s " . "%${colWidths{'Seq2End'}}s " . "%${colWidths{'LeftUnaligned'}}s %${colWidths{'ID'}}s\n", $annot->getSW(), $annot->getPctSubst(), $annot->getPctDelete(), $annot->getPctInsert(), $annot->getSeq1Name(), $annot->getSeq1Beg(), $annot->getSeq1End(), $annot->getLeftOver(), $annot->getRevComp(), $annot->getHitName(), $annot->getClassName(), $annot->getSeq2Beg(), $annot->getSeq2End(), $annot->get3PrimeUnaligned(), $displayID; } else { if ( $annot->getRevComp() eq "C" ) { printf $FILE "%${colWidths{'SW'}}d %${colWidths{'PctSubst'}}s " . "%${colWidths{'PctDelete'}}s " . "%${colWidths{'PctInsert'}}s " . "%-${colWidths{'Seq1Name'}}s " . "%${colWidths{'BeginAlign'}}s " . "%${colWidths{'EndAlign'}}s " . "%${colWidths{'LeftOver'}}s %1s " . "%-${colWidths{'HitName'}}s " . "%-${colWidths{'class'}}s " . "%${colWidths{'LeftUnaligned'}}s " . "%${colWidths{'Seq2End'}}s " . "%${colWidths{'Seq2Begin'}}s %${colWidths{'ID'}}s\n", $annot->getSW(), $annot->getPctSubst(), $annot->getPctDelete(), $annot->getPctInsert(), $annot->getSeq1Name(), $annot->getSeq1Beg(), $annot->getSeq1End(), "(" . $annot->getLeftOver() . ")", $annot->getRevComp(), $annot->getHitName(), $annot->getClassName(), "(" . $annot->get3PrimeUnaligned() . ")", $annot->getSeq2End(), $annot->getSeq2Beg(), $displayID; } else { printf $FILE "%${colWidths{'SW'}}d %${colWidths{'PctSubst'}}s " . "%${colWidths{'PctDelete'}}s " . "%${colWidths{'PctInsert'}}s " . "%-${colWidths{'Seq1Name'}}s " . "%${colWidths{'BeginAlign'}}s " . "%${colWidths{'EndAlign'}}s " . "%${colWidths{'LeftOver'}}s %1s " . "%-${colWidths{'HitName'}}s " . "%-${colWidths{'class'}}s " . "%${colWidths{'Seq2Begin'}}s " . "%${colWidths{'Seq2End'}}s " . "%${colWidths{'LeftUnaligned'}}s %${colWidths{'ID'}}s\n", $annot->getSW(), $annot->getPctSubst(), $annot->getPctDelete(), $annot->getPctInsert(), $annot->getSeq1Name(), $annot->getSeq1Beg(), $annot->getSeq1End(), "(" . $annot->getLeftOver() . ")", $annot->getRevComp(), $annot->getHitName(), $annot->getClassName(), $annot->getSeq2Beg(), $annot->getSeq2End(), "(" . $annot->get3PrimeUnaligned() . ")", $displayID; } } } print $FILE $annot->getAlignData(); } else { # recurse ## May need to sort by seq1beg. Look for examples my @srcMembers = @{ $annot->getDerivedFromAnnot() }; foreach my $member ( @srcMembers ) { if ( $member == $annot ) { warn "printSourceAlignments(): Warning - a loop was detected!\n" . "Please notify the site administrator of this problem.\n"; print $FILE "WARNING: A loop was detected in the evidence reporting\n"; print $FILE "code. Please notify the site administrator of this\n"; print $FILE "problem.\n"; ## TODO MUST REMOVE BEFORE NEXT RELEASE -- WEBSERVER ONLY!! #my $command = "echo \"loop detected in run containing " # . $annot->getSeq1Name() . "\" >> /tmp/badseeds"; #`$command`; return; } &printSourceAlignments( $FILE, $member, $widthsRef, $html, $fixedID ); } } } sub getMaxColWidths { my $annotationListRef = shift; my %widths = ( 'SW' => 5, 'PctSubst' => 5, 'PctInsert' => 4, 'PctDelete' => 4, 'Seq1Name' => 8, 'BeginAlign' => 5, 'EndAlign' => 5, 'LeftOver' => 6, 'HitName' => 8, 'class' => 10, 'Seq2Begin' => 6, 'Seq2End' => 6, 'LeftUnaligned' => 6, 'ID' => 3 ); if ( !defined $annotationListRef ) { return ( %widths ); } my $annotIter = $annotationListRef->getIterator(); while ( $annotIter->hasNext() ) { my $annot = $annotIter->next(); my $Seq2BeginLen = length "(" . $annot->get3PrimeUnaligned . ")"; my $LeftUnalignedLen = length $annot->getSeq2Beg(); if ( $annot->getRevComp eq '+' ) { $Seq2BeginLen = length $annot->getSeq2Beg(); $LeftUnalignedLen = length "(" . $annot->get3PrimeUnaligned . ")"; } $widths{'SW'} = length $annot->getSW if $widths{'SW'} < length $annot->getSW; $widths{'Seq1Name'} = length $annot->getSeq1Name if $widths{'Seq1Name'} < length $annot->getSeq1Name; $widths{'BeginAlign'} = length $annot->getSeq1Beg if $widths{'BeginAlign'} < length $annot->getSeq1Beg; $widths{'EndAlign'} = length $annot->getSeq1End if $widths{'EndAlign'} < length $annot->getSeq1End; $widths{'LeftOver'} = length( $annot->getLeftOver() ) + 2 if $widths{'LeftOver'} < length( $annot->getLeftOver() ) + 2; $widths{'HitName'} = length $annot->getHitName() if $widths{'HitName'} < length $annot->getHitName(); $widths{'class'} = length $annot->getClassName if $widths{'class'} < length $annot->getClassName; $widths{'Seq2Begin'} = $Seq2BeginLen if $widths{'Seq2Begin'} < $Seq2BeginLen; $widths{'Seq2End'} = length $annot->getSeq2End if $widths{'Seq2End'} < length $annot->getSeq2End; $widths{'LeftUnaligned'} = $LeftUnalignedLen if $widths{'LeftUnaligned'} < $LeftUnalignedLen; $widths{'ID'} = length $annot->getID() if $widths{'ID'} < length $annot->getID(); } return ( %widths ); } sub printSourceAnnots { my $annot = shift; my $widthsRef = shift; my $level = shift; my %colWidths = %{$widthsRef}; return if ( !defined $annot->getDerivedFromAnnot() ); my @srcMembers = @{ $annot->getDerivedFromAnnot() }; foreach my $member ( @srcMembers ) { print OUTFULL " " x ( $level ) . "-> "; printf OUTFULL "%${colWidths{'SW'}}d %${colWidths{'PctSubst'}}s " . "%${colWidths{'PctDelete'}}s " . "%${colWidths{'PctInsert'}}s " . "%-${colWidths{'Seq1Name'}}s " . "%${colWidths{'BeginAlign'}}s " . "%${colWidths{'EndAlign'}}s " . "%${colWidths{'LeftOver'}}s %1s " . "%-${colWidths{'HitName'}}s " . "%-${colWidths{'class'}}s " . "%${colWidths{'Seq2Begin'}}s " . "%${colWidths{'Seq2End'}}s " . "%${colWidths{'LeftUnaligned'}}s %${colWidths{'ID'}}s\n", $member->getSW(), $member->getPctSubst(), $member->getPctDelete(), $member->getPctInsert(), $member->getSeq1Name(), $member->getSeq1Beg(), $member->getSeq1End(), $member->getLeftOver(), $member->getRevComp(), $member->getHitName(), $member->getClassName(), $member->getSeq2Beg(), $member->getSeq2End(), $member->get3PrimeUnaligned(), $member->getID(); &printSourceAnnots( $member, $widthsRef, $level++ ); } } sub scoreLINEPair { my $annot1 = shift; my $annot2 = shift; my $elementDistance = shift; my $optRef = shift; my $score = 0; my $DEBUG = 0; if ( $DEBUG ) { print STDERR "scoreLINEPair(): Scoring these two:\n"; $annot1->print(); $annot2->print(); } # Establish position order my $leftAnnot = $annot1; my $rightAnnot = $annot2; if ( $annot1->comparePositionOrder( $annot2 ) > 0 ) { $leftAnnot = $annot2; $rightAnnot = $annot1; } # Calculate query overlap my $queryOverlap = $rightAnnot->getQueryOverlap( $leftAnnot ); # Calculate consensus overlap & consensus gap my $consensusOverlap = $rightAnnot->getConsensusOverlap( $leftAnnot ); my $consensusGap = -$consensusOverlap; my $adjConOverlap = $consensusOverlap; $adjConOverlap -= $queryOverlap if $queryOverlap > 0; # Determine name compatibility my $namesCompat = 0; if ( $leftAnnot->getSeq2Name() eq $rightAnnot->getSeq2Name() ) { $namesCompat = 1; } else { my $nameHash = undef; if ( defined $RepeatAnnotationData::lineHash{ $leftAnnot->getHitName() } ) { $nameHash = $RepeatAnnotationData::lineHash{ $leftAnnot->getHitName() }; } elsif ( defined $RepeatAnnotationData::lineHash{ $rightAnnot->getHitName() } ) { #warn "RepeatAnnotationData does not contain data on " . # $leftAnnot->getHitName() . "\n"; $nameHash = $RepeatAnnotationData::lineHash{ $rightAnnot->getHitName() }; } else { #warn "RepeatAnnotationData does not contain data on " . # $rightAnnot->getHitName() . "\n"; } if ( $nameHash ) { my $nameEntry = undef; if ( defined $nameHash->{ $rightAnnot->getHitName() } ) { $nameEntry = $nameHash->{ $rightAnnot->getHitName() }; } elsif ( defined $nameHash->{ $leftAnnot->getHitName() } ) { #warn "RepeatAnnotationData contains a non-reciprocal entry " . # $rightAnnot->getHitName() . "\n"; $nameEntry = $nameHash->{ $leftAnnot->getHitName() }; } if ( $nameEntry ) { my $overlapThresh = $nameEntry->{'overThresh'}; print STDERR "Names compatible: " . $rightAnnot->getHitName() . " " . $leftAnnot->getHitName() . " ot=$overlapThresh\n" if ( $DEBUG ); if ( $overlapThresh >= 0 ) { $namesCompat = 1 if ( $overlapThresh == 0 || ( $queryOverlap > $overlapThresh ) ); } } # if ( $nameEntry... } # if ( $nameHash... else { } } #else names are equal # # First use essential qualifiers # - Same query sequence # - Same orientation # if ( $annot1->getSeq1Name() eq $annot2->getSeq1Name() && $annot1->getRevComp() eq $annot2->getRevComp() ) { print STDERR "scoreLINEPair() get3PrimeUnaligned = " . $leftAnnot->get3PrimeUnaligned . "\n" if ( $DEBUG ); # # Consensus Positions Make Sense # # i.e 100-----> 50-----> # or <------200 <------320 # if ( ( $leftAnnot->getRevComp() eq "+" && $rightAnnot->getSeq2Beg() >= $leftAnnot->getSeq2Beg() - 5 || $leftAnnot->getRevComp() eq "C" && $leftAnnot->getSeq2Beg() >= $rightAnnot->getSeq2Beg() - 5 ) ) { # # Is the model order valid and is there room # for model extension? # # i.e. do not allow: # 5 <--- 3 # or 3 ---> 5 # # Also do not allow ( because? ): # # 3 --->| anything # or anything |<---- 5 # my $modelCompat = 1; if ( $leftAnnot->getRevComp() eq "+" && $leftAnnot->getSeq2Name() =~ /_3end/ && ( $rightAnnot->getSeq2Name() =~ /_5end/ || $leftAnnot->get3PrimeUnaligned() < 10 ) || $rightAnnot->getRevComp() eq "C" && $rightAnnot->getSeq2Name() =~ /_3end/ && ( $leftAnnot->getSeq2Name() =~ /_5end/ || $rightAnnot->getLeftOver() < 10 ) ) { $modelCompat = 0; } if ( $modelCompat ) { if ( $namesCompat ) { # # Calculate score: # - divergence difference ( large = bad ) # - query overlap ( overlap = good, gap = worse ) # - consensus overlap ( large overlap = bad, large gap = bad ) # distfine = CO < 33 && CG <=200 || # really complex CO/QO & Div formula # # divDiff = The closer the two divergences are to each # other the closer divDiff gets to its max of # 1.54. NOTE: It's later limited to 1.2 # wha? wha? wha? my $divDiff; if ( $rightAnnot->getPctSubst() > $leftAnnot->getPctSubst() && $rightAnnot->getPctSubst() > 0 ) { $divDiff = ( ( $leftAnnot->getPctSubst() / $rightAnnot->getPctSubst() ) / 0.65 ) - ( ( $rightAnnot->getPctSubst() - $leftAnnot->getPctSubst() ) / 100 ); } elsif ( $leftAnnot->getPctSubst() > $rightAnnot->getPctSubst() && $leftAnnot->getPctSubst() > 0 ) { $divDiff = ( $rightAnnot->getPctSubst() / $leftAnnot->getPctSubst() ) / 0.65 - ( $leftAnnot->getPctSubst() - $rightAnnot->getPctSubst() ) / 100; } else { $divDiff = 1.2; } $divDiff = 1.2 if $divDiff > 1.2; # Factor decreasing with number of intervening # elements ( of same type ); should prevent way distant link-ups my $elementDistanceFactor = 1 - ( $elementDistance - 1 ) / 30; my $avgLeftUnaligned = $leftAnnot->get3PrimeUnaligned(); $avgLeftUnaligned = $rightAnnot->get3PrimeUnaligned() if ( $avgLeftUnaligned < $rightAnnot->get3PrimeUnaligned() ); if ( $DEBUG ) { print STDERR "Stats: queryOverlap = $queryOverlap,\n" . " consensusOverlap = $consensusOverlap,\n" . " divDiff = $divDiff > 0.1,\n" . " elementDistance = $elementDistance,\n" . " elementDistanceFactor = $elementDistanceFactor,\n" . " avgLeftUnaligned = $avgLeftUnaligned,\n"; print STDERR " adjConOverlap = $adjConOverlap <= " . ( ( 3.75 * $avgLeftUnaligned**0.48 - 47 ) * $elementDistanceFactor**2 * $divDiff ) . ",\n"; print STDERR " consGap = $consensusGap <= " . ( $elementDistanceFactor * $divDiff * $avgLeftUnaligned / 1.4 ) . "\n"; } if ( # The allowed conensus overlap is dependent on the position # in the LINE1 consensus. The further it is from the 3' end # ($LeftUnaligned) the more overlap is tolerated, this # somewhat has to do with the relatively poor representation # of 5' region consensus sequences, but largely because of # the presence of tandem promoter units. This should be # solved in a better way, e.g. identifying the tandem # repeats in the consensus sequences and treating those # regions as we treat satellites. # The overlap quadratically relates to the distance in IDs. ( $adjConOverlap == 0 || $adjConOverlap <= ( 3.75 * $avgLeftUnaligned**0.48 - 47 ) * $elementDistanceFactor**2 * $divDiff ) && $divDiff > 0.1 # I'm being generous here && $consensusGap <= $elementDistanceFactor * $divDiff * $avgLeftUnaligned / 1.4 ) { # consensus gap should be allowed to be larger the more 5' # you get, since these regions are getting rarer #print STDERR "****DISTFINE SET*****\n"; print STDERR " -- We call this good!\n" if ( $DEBUG ); $score = 1; } } elsif ( $DEBUG ) { print STDERR "scoreLINEPair(): Names Not Compat Rule\n"; } # ...if ( $namesCompat... } elsif ( $DEBUG ) { print STDERR "scoreLINEPair(): Invalid Model Order or " . "Insufficient Unaligned\n"; } # if ( $modelCompat... } elsif ( $DEBUG ) { print STDERR "scoreLINEPair(): Invalid Consensus Order\n"; } # Do consensus positions make sense? # # Determine ShortHitName Compat # my $shortHitNameCompatible = 0; my $HitName = $rightAnnot->getHitName(); $HitName =~ s/_orf2$//; # TODO: This is a special case of an element which needs # a portion of it's sequence masked out in a separate stage. # The sequence is a simple repeats which is scored too high # otherwise. I am refering to the endX designation. $HitName =~ s/(.*)_[35]endX?$/$1/; $HitName =~ s/_strong$//; # could use latter as a general tool to indicate a diagnostic fragment # rename carnivore LINEs who's name have been adjusted # temporarily to allow neighboring fragment recognition if ( $HitName =~ /^L1_Ca/ ) { $HitName =~ s/^L1_Canis0/L1_Cf/; $HitName =~ s/^L1_Canis4/L1_Canid/; $HitName =~ s/^L1_Canis5/L1_Canid2/; } my $shortHitName = $HitName; if ( $HitName =~ /^L1M/ ) { $shortHitName =~ s/(^\w{5}).*/$1/; } elsif ( $optRef->{'mammal'} && $HitName =~ /^L1_/ ) { $shortHitName = "L1_"; } else { $shortHitName =~ s/(\S+\d)[a-zA-Z]$/$1/; } $shortHitName = quotemeta $shortHitName; my $leftHitName = $leftAnnot->getHitName(); $leftHitName =~ s/_orf2$//; $leftHitName =~ s/(.*)_[35]endX?$/$1/; $leftHitName =~ s/_strong$//; if ( $leftHitName =~ /^L1_Ca/ ) { $leftHitName =~ s/^L1_Canis0/L1_Cf/; $leftHitName =~ s/^L1_Canis4/L1_Canid/; $leftHitName =~ s/^L1_Canis5/L1_Canid2/; } print STDERR "scoreLINEPair(): shortHitName = $shortHitName\n" if ( $DEBUG ); if ( $leftHitName =~ /^$shortHitName/ ) { $shortHitNameCompatible = 1; print STDERR "scoreLINEPair(): shortHitNameCompatible -- yes\n" if ( $DEBUG ); } # # Less stringent compatability test # # shortHitNameCompatible # gapMax = 2500 -> 3750 ( names exact ) # divDiff = abs( div1 - div2 ) # gapMax decreases with greater divDiff # gapMax decreases with greater annotation distance # if ( $score == 0 && $shortHitNameCompatible ) { print STDERR "scoreLINEPair(): last chance test ( names compat )\n" if ( $DEBUG ); my $gapMax = 2500; $gapMax *= 1.5 if ( $leftAnnot->getHitName() eq $rightAnnot->getHitName() ); # Need to revise the way div diff is calc'd my $divDiff = $leftAnnot->getPctSubst() - $rightAnnot->getPctSubst(); $divDiff = -$divDiff if ( $divDiff < 0 ); $gapMax *= ( 10 - $divDiff ) / 10; $gapMax *= ( 10 - $elementDistance ) / 10; if ( $DEBUG ) { print STDERR " divDiff = $divDiff\n"; print STDERR " gapMax = $gapMax > cg=" . $leftAnnot->getConsensusGap( $rightAnnot ) . "\n"; } if ( $leftAnnot->getConsensusOverlap( $rightAnnot ) < 33 && $leftAnnot->getConsensusOverlap( $rightAnnot ) > -$gapMax ) { print STDERR "scoreLINEPair(): Hmmm....questionable...but " . "going to do it\n" if ( $DEBUG ); $score = 0.5; } } # # What is this catching??? # # - The sequences overlap ( but not contain ) # - The sequences are shortHitNameCompatible # - Query Overlap is >= 33 # - Consensus Overlap is < 50 # if ( $score == 0 ) { print STDERR "scoreLINEPair(): lastiness test\n" if ( $DEBUG ); if ( $leftAnnot->getSeq1End() >= $rightAnnot->getSeq1Beg() && $rightAnnot->getSeq1End() > $leftAnnot->getSeq1End() ) { # # If these elements have the same class, orientation, # similar names with small overlap or simply just # a large overlap then join them!!! Arghhh # if ( $shortHitNameCompatible && $rightAnnot->getSeq1Beg() <= $leftAnnot->getSeq1End() - 33 && $leftAnnot->getConsensusOverlap( $rightAnnot ) < 50 ) { if ( !$namesCompat ) { #warn "Must add " # . $leftAnnot->getHitName() . " and " # . $rightAnnot->getHitName() # . " to the compat hash\n"; } print STDERR "scoreLINEPair(): Well if you insist\n" if ( $DEBUG ); $score = 0.25; } } } } elsif ( $DEBUG ) { print STDERR "scoreLINEPair(): Wrong Seq or Orientation Rule\n"; } # Same queryName and Same orientation if ( $score ) { my $conOverlapFactor = -1; $conOverlapFactor = 1 / abs( $adjConOverlap + $queryOverlap ) if ( $adjConOverlap + $queryOverlap != 0 ); if ( $DEBUG ) { print STDERR "adjConOverlap = $adjConOverlap + " . "queryOverlap = $queryOverlap conOverlapFactor " . "= $conOverlapFactor\n"; print STDERR "score = $score\n"; } if ( $conOverlapFactor > 0 ) { $score += $conOverlapFactor; } else { print STDERR "Using elementdistance = $elementDistance\n" if ( $DEBUG ); $score += 1 / $elementDistance; } } print STDERR "scoreLINEPair(): Final Score = $score\n" if ( $DEBUG ); return ( $score ); } sub areLINENamesCompat { my $line1 = shift; my $line2 = shift; # If name incompatible then return low score early my $compat = 0; my $line1Name = $line1->getHitName(); my $line1ClassName = $line1->getClassName(); my $line2Name = $line2->getHitName(); my $line2ClassName = $line2->getClassName(); # Establish position order my $leftAnnot = $line1; my $rightAnnot = $line2; if ( $line1->comparePositionOrder( $line2 ) > 0 ) { $leftAnnot = $line2; $rightAnnot = $line1; } my $queryOverlap = $rightAnnot->getQueryOverlap( $leftAnnot ); if ( $line1->getSeq2Name() eq $line2->getSeq2Name() ) { $compat = 1; } elsif ( defined $RepeatAnnotationData::lineHash{$line1Name}->{$line2Name} ) { my $overlapThresh = $RepeatAnnotationData::lineHash{$line1Name}->{$line2Name} ->{'overThresh'}; if ( $overlapThresh >= 0 ) { $compat = 1 if ( $overlapThresh == 0 || ( $queryOverlap > $overlapThresh ) ); } } elsif ( defined $RepeatAnnotationData::lineHash{$line2Name}->{$line1Name} ) { my $overlapThresh = $RepeatAnnotationData::lineHash{$line2Name}->{$line1Name} ->{'overThresh'}; if ( $overlapThresh >= 0 ) { $compat = 1 if ( $overlapThresh == 0 || ( $queryOverlap > $overlapThresh ) ); } } else { print STDERR "Names not compatible\n" if ( $DEBUG ); } return ( $compat ); } # # # Given a fragment annotation "c" and a set of # putative related fragments M. Also given a # set of transforms "transforms()" for c and # members of M. # # for all transforms of "c" find the maximal # scoring transform where maximal is determined # by the transform which is compatible with the # most elements of M. # # foreach transform "tc" in transforms("c") # foreach element "m" in M # foreach transform "tm" in transforms("m") # is "tc" + "m" + "tm" compatible? # # # Things to consider: # - Perhaps the score should consider # the liklihood that two annotations # would be fused? i.e. minimize the # consensus gap distance?? # # - It would be parsomonious to choose # the set name which covers the most # of the given consensus. # # sub joinDNATransposonFragments { my $chainBegHash = shift; my $chainEndHash = shift; my $repeatDB = shift; my $currentAnnot = shift; my $dnaTransposonCluster = shift; $DEBUG = 0; if ( $DEBUG ) { print STDERR "joinDNATransposonFragments(): Entered...\n"; print STDERR " Considering Element:\n "; $currentAnnot->printBrief(); print STDERR " Neighbor Cluster:\n"; foreach my $potAnnot ( @{$dnaTransposonCluster} ) { print STDERR " "; $potAnnot->printBrief(); } print STDERR "\n"; } ##### TESTING next if ( $currentAnnot->getRightLinkedHit() ); my ( $curHitName, $curClassName ) = split( /\#/, $currentAnnot->getSeq2Name() ); my $curEquivHash = $currentAnnot->getEquivHash(); $curEquivHash = {} if ( !defined $curEquivHash ); my @curNames = ( $curHitName, ( sort ( keys( %{$curEquivHash} ) ) ) ); my %nameScores = (); my %nameAnnots = (); my $proposedName = ""; my $isCompatible = 0; my ( $adjCurBegin, $adjCurEnd ); my ( $adjCandidateBegin, $adjCandidateEnd ); my $highestScore = 0; my $highName = ""; my $highCurRange; my $highMembers; # # Consider all names that the current element may go by # including it's current name and all aliases. # foreach my $curName ( @curNames ) { print STDERR " Current Element Name: $curName\n" if ( $DEBUG ); # Fake a range if hit name my @curRanges = (); if ( $curName eq $curHitName ) { push @curRanges, { 'start' => 1, 'end' => $chainEndHash->{ $currentAnnot->getID() }, 'eqstart' => 1, 'eqend' => $chainEndHash->{ $currentAnnot->getID() } }; } if ( defined $curEquivHash->{$curName} ) { push @curRanges, @{ $curEquivHash->{$curName} }; } my $highestRangeScore = 0; my $highestScoringCurRange = undef; my $highestScoringMembers; foreach my $curNameRange ( @curRanges ) { print STDERR " Current Element Range: [$curNameRange->{'start'}-" . "$curNameRange->{'end'}]-->[$curNameRange->{'eqstart'}-" . "$curNameRange->{'eqend'}]\n" if ( $DEBUG ); my $curNameRangeScore = 0; my @curNameRangeAnnots = (); ( $adjCurBegin, $adjCurEnd ) = &translateCoord( $currentAnnot->getSeq2Beg(), $currentAnnot->getSeq2End(), $curNameRange ); my $curNameRangeLastBegin = $adjCurBegin; my $curNameRangeLastEnd = $adjCurEnd; my $seedSeq1Pos = $currentAnnot->getSeq1Beg(); my $seedSeq1End = $currentAnnot->getSeq1End(); foreach my $potAnnot ( @{$dnaTransposonCluster} ) { ## Don't consider fragments that would be ## inherited from the first one. ## TODO: Doesn't it make more sense to not consider ## currentAnnots with RightLinkedHits? next if ( $potAnnot->getLeftLinkedHit() ); my ( $potHitName, $potClassName ) = split( /\#/, $potAnnot->getSeq2Name() ); my $potEquivHash = $potAnnot->getEquivHash(); $potEquivHash = {} if ( !defined $potEquivHash ); my @potNames = ( $potHitName, ( sort ( keys( %{$potEquivHash} ) ) ) ); my $adjPotStrand = $potAnnot->getRevComp(); if ( $DEBUG ) { print STDERR " Potential Element:\n " if ( $DEBUG ); $potAnnot->printBrief() if ( $DEBUG ); } foreach my $potName ( @potNames ) { # Fake a range if hit name my @potRanges = (); if ( $potName eq $potHitName ) { push @potRanges, { 'start' => 1, 'end' => $chainEndHash->{ $potAnnot->getID() }, 'eqstart' => 1, 'eqend' => $chainEndHash->{ $potAnnot->getID() } }; } if ( defined $potEquivHash->{$potName} ) { push @potRanges, ( @{ $potEquivHash->{$potName} } ); } foreach my $potNameRange ( @potRanges ) { # Test comparison print STDERR " Equiv Names: " . "$potName: [" . $potNameRange->{"start"} . "-" . $potNameRange->{"end"} . "]-->[" . $potNameRange->{"eqstart"} . "-" . $potNameRange->{"eqend"} . "]\n" if ( $DEBUG ); if ( $curName eq $potName ) { # Are the orientations correct? if ( ( $potNameRange->{'compl'} == $curNameRange->{'compl'} && $currentAnnot->getRevComp() eq $potAnnot->getRevComp() ) || ( $potNameRange->{'compl'} != $curNameRange->{'compl'} && $currentAnnot->getRevComp() ne $potAnnot->getRevComp() ) ) { ( $adjCandidateBegin, $adjCandidateEnd ) = &translateCoord( $potAnnot->getSeq2Beg(), $potAnnot->getSeq2End(), $potNameRange ); # Are the coordinates sensible? if ( $potAnnot->getRevComp() eq "C" && defined $potNameRange->{'compl'} ) { $adjPotStrand = "+"; } elsif ( $potAnnot->getRevComp() eq "+" && defined $potNameRange->{'compl'} ) { $adjPotStrand = "C"; } #$adjCurBegin, $adjCurEnd, $isCompatible = &isTransposonPairCompatible( 1, $curNameRangeLastBegin, $curNameRangeLastEnd, $adjCandidateBegin, $adjCandidateEnd, $adjPotStrand, $seedSeq1Pos, $seedSeq1End, $potAnnot->getSeq1Beg(), $potAnnot->getSeq1End() ); if ( $isCompatible ) { print STDERR " ---> isCompatible = $isCompatible\n" if ( $DEBUG ); $curNameRangeLastBegin = $adjCandidateBegin; $curNameRangeLastEnd = $adjCandidateEnd; # Seq1 coordinates move along as we connect fragments. $seedSeq1Pos = $potAnnot->getSeq1Beg(); $seedSeq1End = $potAnnot->getSeq1End(); ## TODO: Look into adding other factors into this score. ## Those would include, all things being equal take ## the higher scoring matches etc. $curNameRangeScore++; push @curNameRangeAnnots, { 'annot' => $potAnnot, 'range' => $potNameRange }; ## TODO: Make this a recursive function or fuse overlapping ## elements first. I prefer the first option. last; } } # if ( ...are orientations correct } # are the names the same } # foreach potNameRange } # foreach potName } # foreach $pot # Update stats: We now know which name produced the highest element # count for the current element, name, and range. #my $highestScoringName = (sort( keys( %curNameRangeScores )))[0]; if ( $curNameRangeScore > $highestRangeScore ) { $highestRangeScore = $curNameRangeScore; $highestScoringCurRange = $curNameRange; $highestScoringMembers = [ @curNameRangeAnnots ]; print STDERR " New high score = $curNameRangeScore\n" if ( $DEBUG ); } } # foreach curNameRange if ( $highestRangeScore > $highestScore ) { print STDERR " New high score = $highestRangeScore\n" if ( $DEBUG ); $highestScore = $highestRangeScore; $highName = $curName; $highCurRange = $highestScoringCurRange; $highMembers = [ @{$highestScoringMembers} ]; #print "Dump: " . Dumper( $highMembers ) . "\n"; } } # foreach curName print STDERR "\n highestScore = $highestScore\n" if ( $DEBUG ); if ( $highestScore > 0 ) { print STDERR " highCurrentName/Range = $highName: [" . $highCurRange->{"start"} . "-" . $highCurRange->{"end"} . "]-->[" . $highCurRange->{"eqstart"} . "-" . $highCurRange->{"eqend"} . "]\n" if ( $DEBUG ); &translateAnnotation( $currentAnnot, $highCurRange, $highName, $repeatDB ); if ( $currentAnnot->getRightLinkedHit() && !$currentAnnot->getLeftLinkedHit() ) { my $tmpAnnot = $currentAnnot; while ( $tmpAnnot->getRightLinkedHit() ) { $tmpAnnot = $tmpAnnot->getRightLinkedHit(); &translateAnnotation( $tmpAnnot, $highCurRange, $highName, $repeatDB ); } } # Correct the coordinates & orientation of the member elements my $lastAnnot = $currentAnnot; my $lastBegin = $adjCurBegin; my $lastEnd = $adjCurEnd; my $lastSeq1Begin = $currentAnnot->getSeq1Beg(); my $lastSeq1End = $currentAnnot->getSeq1End(); my %memberSeen = (); foreach my $member ( @{$highMembers} ) { print "Considering member:\n" if ( $DEBUG ); $member->{'annot'}->print() if ( $DEBUG ); my ( $memberHitName, $memberClassName ) = split( /\#/, $member->{'annot'}->getSeq2Name() ); my ( $adjMemberBegin, $adjMemberEnd ) = &translateCoord( $member->{'annot'}->getSeq2Beg(), $member->{'annot'}->getSeq2End(), $member->{'range'} ); # First check to see if this needs to be fused if ( $adjMemberBegin <= $lastEnd && $adjMemberEnd >= $lastBegin && $member->{'annot'}->getSeq1Beg() <= $lastSeq1Begin && $member->{'annot'}->getSeq1End() >= $lastSeq1End ) { # add fused element to our derived from list if ( $options{'source'} ) { $lastAnnot->addDerivedFromAnnot( $member ); } # Modify the last one # Add bases to either the end or begining of the consensus position #print STDERR "lastRange = $lastBegin - $lastEnd\n"; $lastAnnot->set3PrimeUnaligned( $lastAnnot->get3PrimeUnaligned() - ( $lastEnd - $adjMemberEnd ) ); $lastAnnot->setSeq2End( $adjMemberEnd ); # Remove ourselves $member->{'annot'}->setStatus( "DELETED" ); # Adjust other things like score, div stats etc if ( $member->{'annot'}->getSW() > $lastAnnot->getSW() ) { $lastAnnot->setSW( $member->{'annot'}->getSW() ); } if ( $member->{'annot'}->getPctSubst() < $lastAnnot->getPctSubst() ) { my $totalLastLength = $lastAnnot->getSeq1End() - $lastAnnot->getSeq2Beg(); my $lastRemainder = $totalLastLength - getOverlapSize( $member->{'annot'}->getSeq1Beg(), $member->{'annot'}->getSeq1End(), $lastAnnot->getSeq1Beg(), $lastAnnot->getSeq1End() ); $lastAnnot->setPctSubst( $member->{'annot'}->getPctSubst() + ( $lastAnnot->getPctSubst() * ( $lastRemainder / $totalLastLength ) ) ); $lastAnnot->setPctDelete( $member->{'annot'}->getPctDelete() + ( $lastAnnot->getPctDelete() * ( $lastRemainder / $totalLastLength ) ) ); $lastAnnot->setPctInsert( $member->{'annot'}->getPctInsert() + ( $lastAnnot->getPctInsert() * ( $lastRemainder / $totalLastLength ) ) ); } else { my $totalLastLength = $member->{'annot'}->getSeq1End() - $member->{'annot'}->getSeq2Beg(); my $lastRemainder = $totalLastLength - getOverlapSize( $lastAnnot->getSeq1Beg(), $lastAnnot->getSeq1End(), $member->{'annot'}->getSeq1Beg(), $member->{'annot'}->getSeq1End() ); $member->{'annot'}->setPctSubst( $lastAnnot->getPctSubst() + ( $member->{'annot'}->getPctSubst() * ( $lastRemainder / $totalLastLength ) ) ); $member->{'annot'}->setPctDelete( $lastAnnot->getPctDelete() + ( $member->{'annot'}->getPctDelete() * ( $lastRemainder / $totalLastLength ) ) ); $member->{'annot'}->setPctInsert( $lastAnnot->getPctInsert() + ( $member->{'annot'}->getPctInsert() * ( $lastRemainder / $totalLastLength ) ) ); } # Adjust seq1positions } else { &translateAnnotation( $member->{'annot'}, $member->{'range'}, $highName, $repeatDB ); if ( $member->{'annot'}->getRightLinkedHit() && !$member->{'annot'}->getLeftLinkedHit() ) { my $tmpAnnot = $member->{'annot'}; while ( $tmpAnnot->getRightLinkedHit() ) { $tmpAnnot = $tmpAnnot->getRightLinkedHit(); &translateAnnotation( $tmpAnnot, $member->{'range'}, $highName, $repeatDB ); } } if ( $DEBUG ) { print STDERR " Doing the join of\n" if ( $DEBUG ); $lastAnnot->print(); $member->{'annot'}->print(); } $lastAnnot->join( $member->{'annot'} ); $lastAnnot = $member->{'annot'}; $lastEnd = $adjMemberEnd; $lastBegin = $adjMemberBegin; $lastSeq1Begin = $member->{'annot'}->getSeq1Beg(); $lastSeq1End = $member->{'annot'}->getSeq1End(); } # is fusable else } # end foreach if ( $DEBUG ) { print STDERR " New current element:\n "; $currentAnnot->printBrief(); print STDERR " New member elements:\n"; foreach my $member ( @{$highMembers} ) { print STDERR " " . $member->{'annot'}->getStatus() . ":"; $member->{'annot'}->printBrief(); } } } print STDERR "joinDNATransposonFragments(): Exiting...\n\n" if ( $DEBUG ); } sub translateAnnotation { my $annot = shift; my $transRange = shift; my $transName = shift; my $repeatDB = shift; # Correct the coordinates & orientation of the current element first my ( $curHitName, $curClassName ) = split( /\#/, $annot->getSeq2Name() ); my ( $adjCurBegin, $adjCurEnd ) = &translateCoord( $annot->getSeq2Beg(), $annot->getSeq2End(), $transRange ); # Modify end coordinates and left unaligned if necessary. if ( $adjCurEnd == $annot->getSeq2End() && $curHitName eq $transName ) { # Nothing to do...coordinates didn't change. } elsif ( defined $repeatDB->{ lc( $transName ) }->{'conlength'} ) { $annot->setSeq2End( $adjCurEnd ); $annot->set3PrimeUnaligned( $repeatDB->{ lc( $transName ) }->{'conlength'} - $annot->getSeq2End() ); } else { die "Cannot find the consensus length for element $transName!\n" . "consensus unaligned length will be incorrect in the\n" . "annotation. Annotation is\n" . $annot->print() . "\n"; } # If we are changing the name we might have changed the class as well. if ( defined $repeatDB->{ lc( $transName ) } ) { my $className = $repeatDB->{ lc( $transName ) }->{'type'}; if ( $repeatDB->{ lc( $transName ) }->{'subtype'} ne "" ) { $className .= "/" . $repeatDB->{ lc( $transName ) }->{'subtype'}; } $annot->setClassName( $className ); } $annot->setHitName( $transName ); $annot->setSeq2Beg( $adjCurBegin ); $annot->setStatus( "JOINED" ); if ( $annot->getRevComp() eq "C" && defined $transRange->{'compl'} ) { $annot->setRevComp( "+" ); } elsif ( $annot->getRevComp() eq "+" && defined $transRange->{'compl'} ) { $annot->setRevComp( "C" ); } } # sub translateAnnotation ##sorting subroutines # Cycle 0B sub bySeqSWConbegin ($$) { ( $_[ 0 ]->getSeq1Name() ) cmp( $_[ 1 ]->getSeq1Name() ) || ( $_[ 0 ]->getSW() ) <=> ( $_[ 1 ]->getSW() ) || ( $_[ 0 ]->getSeq2Beg() ) <=> ( $_[ 1 ]->getSeq2Beg() ); } # Lots of places sub byNameBeginEndrevSWrev ($$) { ( $_[ 0 ]->getSeq1Name() ) cmp( $_[ 1 ]->getSeq1Name() ) || ( $_[ 0 ]->getSeq1Beg() ) <=> ( $_[ 1 ]->getSeq1Beg() ) || ( $_[ 1 ]->getSeq1End() ) <=> ( $_[ 0 ]->getSeq1End() ) || ( $_[ 1 ]->getSW() ) <=> ( $_[ 0 ]->getSW() ); ### note: takes the longer sequence starting at the same position first } # Cycle 0Da sub byNameClassBeginEndrevAndSWrev ($$) { my ( $aSeqName, $aClassName ) = $_[ 0 ]->getSeq2Name() =~ /(.*)#(.*)/; my ( $bSeqName, $bClassName ) = $_[ 1 ]->getSeq2Name() =~ /(.*)#(.*)/; ( $_[ 0 ]->getSeq1Name() ) cmp( $_[ 1 ]->getSeq1Name() ) || ( $aClassName ) cmp( $bClassName ) || ( $_[ 0 ]->getSeq1Beg() ) <=> ( $_[ 1 ]->getSeq1Beg() ) || ( $_[ 1 ]->getSeq1End() ) <=> ( $_[ 0 ]->getSeq1End() ) || ( $_[ 1 ]->getSW() ) <=> ( $_[ 0 ]->getSW() ); } # Line Preprocessing notes: # # # TODO: Eventually this routine should keep track of the adjustments # independent of the original annotation. That way we can # track the evidence for each final annotation. # # Marsupials are adjusted to L1_Mdo1 # To be done: HAL1_Opos1_3end is not so close to normal LINE1s, # but probably still warrants to be lined up with L1. # opt_mammals and L1* are adjusted to L1_Canid # in future all mammalian LINEs need to have one startpoint, e.g. L1M2_ORF2 # We change L1_Cf so that it can be recognized in fragment comparisons # with L1_Canis subs. Similarly for L1_Canid. # # LINE subfamilies have widely variable length 5' ends. To be able to # merge closely related LINE subfamilies, the positions need to be # adjusted to a standard. The ORF2 is basically of identical length # between all known subfamilies, so the start of ORF2 is probably the # best place to match the positions of the 5' end consensus # sequences. I've taken L1PA2 as a standard for L1, where ORF2 starts # at 2110. Many 5' end consensus sequences extend to overlap 150 bp # with the ORF2, but many others only describe sequences further 5' in # the element, usually because another subfamily has a very closely # matching sequence over ORF1. # The "left over in the consensus sequence" number (now reflecting # how far the consensus extends 3') is adjusted to reflect what # would be left in a full element (add 3294 for ORF2 + length best # matching 3' end consensus - 2x 150 bp for overlaps) # The first number fed to ChangePos is subtracted from the given # position. Thus, a consensus that is longer than the standard # gets a positive number, one that is shorter a negative number. # When making adjustments dependent on position in consensus with the # subroutine ChangePos begin-limit of doing so should be >= or < # end-limit when the position number is decreased or increased, resp. # L1PA4 ORF2 start 2110; L1 ORF2 length 3294 bp # # # L1P4b-e are adjusted to L1P4a # L1MEf_5end -- 3'end not yet known; used same as L1MDa # L1MEg_5end -- 3' end not yet known; used same as L1MDa # L3b, L3_Mars -- Are adjust to the ancient L3 consensus, even # if this consensus is incomplete. If this consensus is # updated, these adjustments need to be changed sub preProcessLINE { my $chainBegHash = shift; my $chainEndHash = shift; my $conPosCorrHash = shift; my $annot = shift; my $DEBUG = 0; if ( $DEBUG ) { print STDERR "preProcessLINE(): Before preprocessing\n"; $annot->print(); } my $HitName = $annot->getHitName(); my $ID = $annot->getID(); if ( defined $RepeatAnnotationData::preProcData{$HitName} ) { my $adjChainBeg = 0; my $adjChainEnd = 0; # Adjust consensus positions if ( defined $RepeatAnnotationData::preProcData{$HitName}->{'relToReference'} ) { $annot->setSeq2Beg( $annot->getSeq2Beg() + $RepeatAnnotationData::preProcData{$HitName}->{'relToReference'} ); $annot->setSeq2End( $annot->getSeq2End() + $RepeatAnnotationData::preProcData{$HitName}->{'relToReference'} ); $adjChainBeg = $chainBegHash->{$ID} + $RepeatAnnotationData::preProcData{$HitName}->{'relToReference'}; $adjChainEnd = $chainEndHash->{$ID} + $RepeatAnnotationData::preProcData{$HitName}->{'relToReference'}; # I.e _5ends and full length elements? # TODO: Make this a function is3End isOrf etc.. unless ( $HitName =~ /_3end|_orf2/ ) { $conPosCorrHash->{$ID} = -$RepeatAnnotationData::preProcData{$HitName}->{'relToReference'}; } } # Adjust LeftUnaligned if ( defined $RepeatAnnotationData::preProcData{$HitName}->{'3EndLength'} ) { $annot->set3PrimeUnaligned( $annot->get3PrimeUnaligned() + $RepeatAnnotationData::preProcData{$HitName}->{'3EndLength'} ); } # Rename based on adjusted consensus coordinates if ( defined $RepeatAnnotationData::preProcData{$HitName}->{'rangeNames'} ) { foreach my $range ( @{ $RepeatAnnotationData::preProcData{$HitName}->{'rangeNames'} } ) { my $newHitName = $annot->getHitName(); # range has maxEnd if ( defined $range->{'maxEnd'} && $DEBUG ) { print STDERR "Considering maxEnd= " . $range->{'maxEnd'} . " and hitname = " . $range->{'name'} . " and adjchainend=$adjChainEnd\n"; } if ( defined $range->{'maxEnd'} && $adjChainEnd <= $range->{'maxEnd'} ) { $newHitName = $range->{'name'}; } # if ( defined $range->{'maxArianEndScore'} && ( $adjChainEnd + $annot->getSW() ) <= $range->{'maxArianEndScore'} ) { $newHitName = $range->{'name'}; } # if ( defined $range->{'minBeg'} && $adjChainBeg >= $range->{'minBeg'} ) { $newHitName = $range->{'name'}; } # if ( defined $range->{'relLeftUnaligned'} && $HitName ne $newHitName ) { $annot->set3PrimeUnaligned( $annot->get3PrimeUnaligned() + $range->{'relLeftUnaligned'} ); } # $annot->setHitName( $newHitName ); } } } if ( $DEBUG ) { print STDERR "After preprocessing\n"; $annot->print(); print STDERR "cons pos correction = " . $conPosCorrHash->{$ID} . "\n"; } } sub preProcessDNATransp { my $chainBegHash = shift; my $chainEndHash = shift; my $annot = shift; my $repeatDB = shift; if ( defined $repeatDB->{ lc( $annot->getHitName() ) } ) { my $dbRec = $repeatDB->{ lc( $annot->getHitName() ) }; $DEBUG = 0; if ( defined $dbRec->{'equiv'} ) { my $rangeSlack = 6; my %compHash = (); foreach my $equivRec ( @{ $dbRec->{'equiv'} } ) { # Sorted in start order foreach my $rangeRec ( @{ $equivRec->{'ranges'} } ) { # No need to check ranges which start after we end last if ( $chainEndHash->{ $annot->getID() } < $rangeRec->{'start'} ); # Are we contained in this range? Leave some range slack # for search run through. if ( $chainBegHash->{ $annot->getID() } >= $rangeRec->{'start'} - $rangeSlack && $chainEndHash->{ $annot->getID() } <= $rangeRec->{'end'} + $rangeSlack ) { # Contained by a non-unique range # Not uniq to this consensus...push comparable push @{ $compHash{ $equivRec->{'name'} } }, $rangeRec; } } } if ( keys %compHash ) { if ( $DEBUG ) { print STDERR "\n\n***NOT UNIQUE***\n"; $annot->print(); print STDERR "Comps:\n"; foreach my $key ( keys( %compHash ) ) { print STDERR "$key: " . Dumper( $compHash{$key} ) . "\n"; } } $annot->setEquivHash( {%compHash} ); } $DEBUG = 0; } } } # Temporarily adjusts consensus sequence position info to allow # merging of matches to different subfamilies. The position is usually # later readjusted based on the "conPosCorrection" hash. Note that positive # numbers fed to the subroutine are subtracted (longer subfamilies are # adjusted with higher numbers, shorter with negatives) sub ChangePos { my $chainBegHash = shift; my $chainEndHash = shift; my $conPosCorrHash = shift; my $annot = shift; my $ID = $annot->getID(); $conPosCorrHash->{$ID} = shift; $annot->setSeq2Beg( $annot->getSeq2Beg() - $conPosCorrHash->{$ID} ); $chainBegHash->{$ID} -= $conPosCorrHash->{$ID}; $annot->setSeq2End( $annot->getSeq2End() - $conPosCorrHash->{$ID} ); $chainEndHash->{$ID} -= $conPosCorrHash->{$ID}; $annot->set3PrimeUnaligned( $annot->get3PrimeUnaligned() + $_[ 0 ] ) if $_[ 0 ]; } # # Returns overlap size for a range. # # TODO: Replace this with getQueryOverlap in object # sub getOverlapSize { my $range1Begin = shift; my $range1End = shift; my $range2Begin = shift; my $range2End = shift; my $overlap = 0; if ( $range1Begin >= $range2Begin && $range1Begin <= $range2End ) { # ------- # ------ # or # ----- # -------- if ( $range1End <= $range2End ) { # ----- # -------- $overlap = $range1End - $range1Begin + 1; } else { # ------- # ------ $overlap = $range2End - $range1Begin + 1; } } elsif ( $range1End >= $range2Begin && $range1End <= $range2End ) { # ------- # ------ # or # -------- # ----- if ( $range1End >= $range2End ) { # -------- # ----- $overlap = $range2End - $range2Begin + 1; } else { # ------- # ------ $overlap = $range1End - $range2Begin + 1; } } return $overlap; } sub isTransposonPairCompatible { my $cycle = shift; my $adjCurBegin = shift; my $adjCurEnd = shift; my $adjNextBegin = shift; my $adjNextEnd = shift; my $curStrand = shift; my $curBeginAlign = shift; my $curEndAlign = shift; my $nextBeginAlign = shift; my $nextEndAlign = shift; my $DEBUG = 0; if ( $DEBUG ) { print STDERR " adjCurBegEndStrand=$adjCurBegin $adjCurEnd $curStrand\n"; print STDERR " adjNextBegEnd=$adjNextBegin $adjNextEnd\n"; } my $conGap; my $conExt = 0; if ( $curStrand ne "C" ) { # ---current---> (gap) ---next---> $conGap = $adjNextBegin - $adjCurEnd; $conExt = ( $adjNextEnd - $adjNextBegin + 1 ) - $conGap; } else { # <--current--- (gap) <--next--- $conGap = $adjCurBegin - $adjNextEnd; $conExt = ( $adjNextEnd - $adjNextBegin + 1 ) - $conGap; } my $seqGap; if ( $curBeginAlign > $nextEndAlign ) { # ---next--- (gap) ---current--- $seqGap = $curBeginAlign - $nextEndAlign - 1; } else { # ---current--- (gap) ---next--- $seqGap = $nextBeginAlign - $curEndAlign - 1; } ## TODO: Do not attach pieces to complete consensi print STDERR "conGap = $conGap, seqGap = $seqGap: " . "Is conGap > -75 && ( conGap < 500 )\n" if ( $DEBUG ); if ( $conGap > -75 && $conGap < 500 && ( ( $curStrand eq "C" && $adjCurBegin - $conExt > -10 ) ) ## TODO: Must check that we don't go beyond the length ## of the adjusted consensus. ) { return ( 1 ); } return ( 0 ); } ## ## ## sub translateCoord { my $unBegin = shift; my $unEnd = shift; my $transHash = shift; ## Correction for run-through ranges $unBegin = $transHash->{'start'} if ( $unBegin < $transHash->{'start'} ); $unEnd = $transHash->{'end'} if ( $unEnd > $transHash->{'end'} ); my $rangeLen = $transHash->{'end'} - $transHash->{'start'} + 1; my $eqRangeLen = $transHash->{'eqend'} - $transHash->{'eqstart'} + 1; my $conBegin = sprintf( "%0.d", ( ( ( $unBegin - $transHash->{'start'} ) * $eqRangeLen ) / $rangeLen ) + $transHash->{'eqstart'} ); my $conEnd = sprintf( "%0.d", ( ( ( $unEnd - $transHash->{'start'} ) * $eqRangeLen ) / $rangeLen ) + $transHash->{'eqstart'} ); return ( $conBegin, $conEnd ); } # # isTooDiverged(): # # Return true of past element substitution percentage is 2x as # high *and* greater than 10% higher than the current elements # substitution percentage. # # A true value indicates that the past element is neither likely to # be related to the current element nor is it likely that a # fragment of the younger current element lies beyond the past element. # sub isTooDiverged { my $currentElementPctSub = shift; my $pastElementPctSub = shift; $pastElementPctSub > 2 * $currentElementPctSub && $pastElementPctSub > $currentElementPctSub + 10; } sub printHitArrayList { my $sortedAnnotationsList = shift; my $cycleAnnotIter = $sortedAnnotationsList->getIterator(); my %newID = (); my $ind = 1; while ( $cycleAnnotIter->hasNext() ) { my $currentAnnot = $cycleAnnotIter->next(); if ( $currentAnnot->getLeftLinkedHit() && $newID{ $currentAnnot->getLeftLinkedHit()->getUniqID() } > 0 ) { $newID{ $currentAnnot->getUniqID() } = $newID{ $currentAnnot->getLeftLinkedHit()->getUniqID() }; } else { $newID{ $currentAnnot->getUniqID() } = $ind++; } } $cycleAnnotIter = $sortedAnnotationsList->getIterator(); print STDERR "RESULTS OF JOINS\n"; while ( $cycleAnnotIter->hasNext() ) { my $currentAnnot = $cycleAnnotIter->next(); $currentAnnot->print( 1 ); print STDERR "" . $newID{ $currentAnnot->getUniqID() } . " "; print STDERR "<-" if ( $currentAnnot->getLeftLinkedHit() ); print STDERR "->" if ( $currentAnnot->getRightLinkedHit() ); print STDERR "\n"; if ( $currentAnnot->getSW() == 8027 && $currentAnnot->getLeftLinkedHit() ) { print STDERR " <=== "; $currentAnnot->getLeftLinkedHit()->print( 1 ); print STDERR "\n"; } } print STDERR "END RESULTS OF JOINS\n"; } sub scoreRefinedSINEPair { my $annot1 = shift; my $annot2 = shift; my $compatibleDistance = shift; # Establish position order my $DEBUG = 0; my $leftAnnot = $annot1; my $rightAnnot = $annot2; if ( $annot1->comparePositionOrder( $annot2 ) > 0 ) { $leftAnnot = $annot2; $rightAnnot = $annot1; } if ( $DEBUG ) { print STDERR " scoreRefinedSINEPair():\n "; $leftAnnot->print(); print STDERR " vs "; $rightAnnot->print(); } my $score = 0; if ( $leftAnnot->getHitName() eq $rightAnnot->getHitName() && $rightAnnot->getRevComp() eq $leftAnnot->getRevComp() && $rightAnnot->getQueryOverlap( $leftAnnot ) <= 21 && -( $rightAnnot->getConsensusOverlap( $leftAnnot ) ) <= 100 && -( $rightAnnot->getQueryOverlap( $leftAnnot ) ) <= 1000 && ( $rightAnnot->getConsensusOverlap( $leftAnnot ) <= 21 || -( $rightAnnot->getQueryOverlap( $leftAnnot ) ) < 0 && $rightAnnot->getConsensusOverlap( $leftAnnot ) - $rightAnnot->getQueryOverlap( $leftAnnot ) <= 20 ) ) { my $gap = $rightAnnot->getSeq1Beg() - $leftAnnot->getSeq1End(); $gap = 1 if ( $gap == 0 ); $score = ( $rightAnnot->getSW() + $leftAnnot->getSW() ) * ( 1 / ( $compatibleDistance ) ) * ( 1 / ( $gap ) ); if ( $DEBUG ) { print STDERR "scoreRefinedSINEPair(): Scoring ( $score ) [ " . ( $rightAnnot->getSeq1Beg() - $leftAnnot->getSeq1End() ) . " ] to:" . "\n"; print STDERR " -> "; $leftAnnot->print(); print STDERR " -> "; $rightAnnot->print(); } } print STDERR "newScore.... score = $score\n" if ( $DEBUG ); return $score; } sub scoreGenericPair { my $annot1 = shift; my $annot2 = shift; my $elementDistance = shift; my $classElementDistance = shift; # Establish position order my $DEBUG = 0; my $leftAnnot = $annot1; my $rightAnnot = $annot2; if ( $annot1->comparePositionOrder( $annot2 ) > 0 ) { $leftAnnot = $annot2; $rightAnnot = $annot1; } my $score = 0; my $currentHitName = $rightAnnot->getHitName(); my $prevHitName = $leftAnnot->getHitName(); if ( $leftAnnot->getSeq1Name() eq $rightAnnot->getSeq1Name() && $rightAnnot->getRevComp() eq $leftAnnot->getRevComp() ) { print STDERR " --- Same orient/seq\n" if ( $DEBUG ); print STDERR " --- qo = " . $rightAnnot->getQueryOverlap( $leftAnnot ) . "\n co = " . $rightAnnot->getConsensusOverlap( $leftAnnot ) . "\n" if ( $DEBUG ); if ( $rightAnnot->getLastField == $leftAnnot->getLastField() && ( $currentHitName eq $prevHitName || $currentHitName =~ /$prevHitName/ || $prevHitName =~ /$currentHitName/ ) && -( $rightAnnot->getQueryOverlap( $leftAnnot ) ) <= 10 && -( $rightAnnot->getConsensusOverlap( $leftAnnot ) ) <= 100 && ( $rightAnnot->getConsensusOverlap( $leftAnnot ) <= 21 || -( $rightAnnot->getQueryOverlap( $leftAnnot ) ) < 0 && $rightAnnot->getConsensusOverlap( $leftAnnot ) - $rightAnnot->getQueryOverlap( $leftAnnot ) <= 20 ) ) { $score = 1; } my $HitName = $rightAnnot->getHitName(); my $shortHitName = $HitName; $shortHitName =~ s/(\S+\d)[a-zA-Z]$/$1/; $shortHitName = quotemeta $shortHitName; # Less stringent if ( $score == 0 && $leftAnnot->getHitName() =~ /^$shortHitName/ ) { print STDERR "scoreGenericPair(): last chancy test ( names compat )\n" if ( $DEBUG ); my $gapMax = 2500; $gapMax *= 1.5 if ( $leftAnnot->getHitName() eq $rightAnnot->getHitName() ); # Need to revise the way div diff is calc'd my $divDiff = $leftAnnot->getPctSubst() - $rightAnnot->getPctSubst(); print STDERR "scoreGenericPair(): gapMax = $gapMax " . "divDiff = $divDiff elemedist = $elementDistance\n" if ( $DEBUG ); $divDiff = -$divDiff if ( $divDiff < 0 ); $gapMax *= ( 10 - $divDiff ) / 10; $gapMax *= ( 15 - $elementDistance ) / 15; print STDERR "scoreGenericPair(): gapMax = $gapMax " . "divDiff = $divDiff\n" if ( $DEBUG ); if ( $leftAnnot->getConsensusOverlap( $rightAnnot ) < 33 && $leftAnnot->getConsensusOverlap( $rightAnnot ) > -$gapMax ) { print STDERR "scoreGenericPair(): Hmmm....I guess so.. " . "\n" if ( $DEBUG ); $score = 0.5; } } } return ( $score ); } ##################################################################### ################ C O N T A I N S M E T A D A T A ################ ################# metadata ################ ##################################################################### # TODO: This used to be part of the SINE code. We need to include # a refinement MIR eventually. # # if SINE/ALU # # Make MIR's generic if they are not in the # # unique regions. NOTE: Unique regions are # # a good possibility for generalization. # elsif ( $HitName eq 'MIR3' ) { # if ( $chainBegHash->{$ID} >= 16 # && $chainEndHash->{$ID} <= 153 ) # { # $HitName = 'MIR'; # } # } sub preProcessLTR { my $chainBegHash = shift; my $chainEndHash = shift; my $conPosCorrHash = shift; my $annot = shift; my $ID = $annot->getID(); my $HitName = $annot->getHitName(); # all MLT2s adjusted to MLT2B3 or 4 if ( $chainBegHash->{$ID} > 180 ) { if ( $HitName =~ /^MLT2A/ ) { &ChangePos( $chainBegHash, $chainEndHash, $conPosCorrHash, $annot, -108 ) if $chainBegHash->{$ID} > 270; } elsif ( $HitName =~ /^MLT2B[12]$/ ) { &ChangePos( $chainBegHash, $chainEndHash, $conPosCorrHash, $annot, -47 ) if $chainBegHash->{$ID} > 265; } elsif ( $HitName eq "MLT2B5" ) { if ( $chainBegHash->{$ID} > 415 ) { &ChangePos( $chainBegHash, $chainEndHash, $conPosCorrHash, $annot, 162 ); } else { &ChangePos( $chainBegHash, $chainEndHash, $conPosCorrHash, $annot, 116 ); } } elsif ( $HitName eq "MLT2C1" ) { &ChangePos( $chainBegHash, $chainEndHash, $conPosCorrHash, $annot, -166 ) if $chainBegHash->{$ID} > 220; } elsif ( $HitName eq "MLT2C2" ) { &ChangePos( $chainBegHash, $chainEndHash, $conPosCorrHash, $annot, -95 ) if $chainBegHash->{$ID} > 275; } elsif ( $HitName eq "MLT2D" ) { &ChangePos( $chainBegHash, $chainEndHash, $conPosCorrHash, $annot, -146 ) if $chainBegHash->{$ID} > 225; } elsif ( $HitName eq "MLT2E" ) { &ChangePos( $chainBegHash, $chainEndHash, $conPosCorrHash, $annot, 65 ) if $chainBegHash->{$ID} > 520; } elsif ( $HitName eq "MLT2F" ) { &ChangePos( $chainBegHash, $chainEndHash, $conPosCorrHash, $annot, 100 ) if $chainBegHash->{$ID} > 550; } } } sub scoreSINEPair { my $annot1 = shift; my $annot2 = shift; my $elementDistance = shift; # Establish position order my $DEBUG = 0; my $leftAnnot = $annot1; my $rightAnnot = $annot2; if ( $annot1->comparePositionOrder( $annot2 ) > 0 ) { $leftAnnot = $annot2; $rightAnnot = $annot1; } my $score = 0; my $currentHitName = $rightAnnot->getHitName(); my $prevHitName = $leftAnnot->getHitName(); if ( $leftAnnot->getSeq1Name() eq $rightAnnot->getSeq1Name() && $rightAnnot->getRevComp() eq $leftAnnot->getRevComp() ) { print STDERR " --- Same orient/seq\n" if ( $DEBUG ); print STDERR " --- qo = " . $rightAnnot->getQueryOverlap( $leftAnnot ) . "\n co = " . $rightAnnot->getConsensusOverlap( $leftAnnot ) . "\n" if ( $DEBUG ); if ( $rightAnnot->getLastField == $leftAnnot->getLastField() && ( $currentHitName eq $prevHitName || $currentHitName =~ /$prevHitName/ || $prevHitName =~ /$currentHitName/ ) && -( $rightAnnot->getQueryOverlap( $leftAnnot ) ) <= 10 && -( $rightAnnot->getConsensusOverlap( $leftAnnot ) ) <= 100 && ( $rightAnnot->getConsensusOverlap( $leftAnnot ) <= 21 || -( $rightAnnot->getQueryOverlap( $leftAnnot ) ) < 0 && $rightAnnot->getConsensusOverlap( $leftAnnot ) - $rightAnnot->getQueryOverlap( $leftAnnot ) <= 20 ) ) { print STDERR " --- good score ( 1 )\n" if ( $DEBUG ); $score = 1; } my $HitName = $rightAnnot->getHitName(); my $shortHitName = $HitName; if ( $HitName =~ /^Alu/ ) { $shortHitName =~ s/(^\w{5}).*/$1/; } else { $shortHitName =~ s/(\S+\d)[a-zA-Z]$/$1/; } $shortHitName = quotemeta $shortHitName; # Less stringent if ( $score == 0 && $leftAnnot->getHitName() =~ /^$shortHitName/ ) { print STDERR "scoreSINEPair(): last chancy test ( names compat )\n" if ( $DEBUG ); my $gapMax = 2500; $gapMax *= 1.5 if ( $leftAnnot->getHitName() eq $rightAnnot->getHitName() ); # Need to revise the way div diff is calc'd my $divDiff = $leftAnnot->getPctSubst() - $rightAnnot->getPctSubst(); print STDERR "scoreSINEPair(): gapMax = $gapMax " . "divDiff = $divDiff elemedist = $elementDistance\n" if ( $DEBUG ); $divDiff = -$divDiff if ( $divDiff < 0 ); $gapMax *= ( 10 - $divDiff ) / 10; $gapMax *= ( 15 - $elementDistance ) / 15; print STDERR "scoreSINEPair(): gapMax = $gapMax " . "divDiff = $divDiff\n" if ( $DEBUG ); if ( $leftAnnot->getConsensusOverlap( $rightAnnot ) < 33 && $leftAnnot->getConsensusOverlap( $rightAnnot ) > -$gapMax ) { print STDERR "scoreSINEPair(): Hmmm....I guess so.. " . "\n" if ( $DEBUG ); $score = 0.5; } } } return ( $score ); } sub scoreLTRPair { my $annot1 = shift; my $annot2 = shift; my $eleDistance = shift; my $ltrDistance = shift; my $DEBUG = 0; if ( $DEBUG ) { print STDERR "scoreLTRPair(): Considering...\n"; $annot1->print(); $annot2->print(); } # Establish position order my $leftAnnot = $annot1; my $rightAnnot = $annot2; if ( $annot1->comparePositionOrder( $annot2 ) > 0 ) { $leftAnnot = $annot2; $rightAnnot = $annot1; } my $intScore = 0; my $hybrid = 0; # Ensure compatible names and orientation agreement if ( &areLTRNamesCompat( $annot1, $annot2 ) && $annot1->getRevComp() eq $annot2->getRevComp() ) { print STDERR "scoreLTRPair(): Names are compatible\n" if ( $DEBUG ); # # Model component specific checks # if ( &isInternal( $annot1 ) && &isInternal( $annot2 ) ) { print STDERR "scoreLTRPair(): Both internals\n" if ( $DEBUG ); # Both internal # Names are compatible loosely compatible. my $CO = $leftAnnot->getConsensusOverlap( $rightAnnot ); my $CG = -$CO; if ( $CO <= 111 && $CG <= 1234 || $leftAnnot->getHitName() eq $rightAnnot->getHitName() && $CO <= 111 && $CG <= 5555 ) { print STDERR "scoreLTRPair(): INT Good One\n" if ( $DEBUG ); $intScore = 1; } else { print STDERR "scoreLTRPair(): Failed: CO=$CO>111, " . "CG=$CG>1234 etc..\n" if ( $DEBUG ); } } elsif ( &isLTR( $annot1 ) && &isLTR( $annot2 ) ) { print STDERR "scoreLTRPair(): Both ltrs\n" if ( $DEBUG ); # Both LTR # Join if: # Class is MaLR or # Both are MLT2.. variations or # Hitnames are shorthitname alike # And # LastFields = 5 # CG <= 220 - ( number of LTRs distant + 1 ) * 20 # CO <= 50 # Same orientation # And # Unaligned edges are >= 10bp long if ( ( ( $annot1->getClassName() =~ /ERVL-MaLR/ && $annot2->getClassName() =~ /ERVL-MaLR/ ) || ( $annot1->getHitName() =~ /MLT2/ && $annot2->getHitName() =~ /MLT2/ ) || &areLTRNamesCompat( $annot1, $annot2 ) == 1 ) && ( $annot1->getLastField() == 5 && $annot2->getLastField() == 5 ) && ( $leftAnnot->getConsensusOverlap( $rightAnnot ) > -( 220 - $ltrDistance * 20 ) ) && ( $leftAnnot->getConsensusOverlap( $rightAnnot ) <= 50 ) && ( $leftAnnot->getRevComp() eq $rightAnnot->getRevComp() ) ) { print STDERR "scoreLTRPair(): Good distance\n" if ( $DEBUG ); if ( # --ltr--> --ltr--> ( $annot1->getRevComp() eq "+" && $leftAnnot->get3PrimeUnaligned() >= 10 && $rightAnnot->getSeq2Beg() >= 10 ) # <--ltr-- <--ltr--- || ( $annot2->getRevComp() eq "C" && $leftAnnot->getSeq2Beg() >= 10 && $rightAnnot->get3PrimeUnaligned() >= 10 ) ) { $intScore = 1; } } } else { # Mixture of internal and ltr $hybrid = 1; print STDERR "scoreLTRPair(): Mixture of ltr/internal: " . "eleDist = $eleDistance\n" if ( $DEBUG ); # Mixture of internal and ltr fragments # Stringent: # - Names are compatible: loosely compatible. # - <= 33 bp unaligned on each fragment # - Same orientation # - No more than 2 insertions # - No more than 350 bp gap in query # No more than 2 insertions ## TODO: Should lower stringency based on the exactness of the ## flanking name. if ( $eleDistance <= 2 && $annot1->getQueryOverlap( $annot2 ) > -350 ) { print STDERR "scoreLTRPair(): eleDistance OK and queryOverlap OK\n" if ( $DEBUG ); if ( # ----> ---int---> # ---int--> -----> ( $annot1->getRevComp() eq "+" && $leftAnnot->get3PrimeUnaligned() <= 33 && $rightAnnot->getSeq2Beg() <= 33 ) # <---- <--int--- # <---int-- <----- || ( $annot2->getRevComp() eq "C" && $leftAnnot->getSeq2Beg() <= 33 && $rightAnnot->get3PrimeUnaligned() <= 33 ) ) { print STDERR "scoreLTRPair(): Good One\n" if ( $DEBUG ); $intScore = 1; } } } # Less stringent # Loose applies to all combinations of elements: # - No more than 10,000 bp gap in query # - Same orientation # - No intervening element with > 1.33 * PctSubst than # the *higher-query-position* member of the pair. # - Similar names: shorthitname rules # - Gap < 500 # gapallowed = 1.5*2500 # if exact name # diff = difference in substitution ( max 10% ) # gapallowed *= %difference in substition # - gapallowed *= # of intervening elements ( max 10 ) # - Consensus overlap is < 33 and the gap < gapallowed if ( $intScore == 0 && &areLTRNamesCompat( $annot1, $annot2 ) == 1 ) { print STDERR "scoreLTRPair(): last chance test ( names compat )\n" if ( $DEBUG ); my $gapMax = 2500; $gapMax *= 1.5 if ( $annot1->getHitName() eq $annot2->getHitName() ); # Need to revise the way div diff is calc'd my $divDiff = $annot1->getPctSubst() - $annot2->getPctSubst(); $divDiff = -$divDiff if ( $divDiff < 0 ); $gapMax *= ( 10 - $divDiff ) / 10; $gapMax *= ( 10 - $eleDistance ) / 10; if ( $leftAnnot->getConsensusOverlap( $rightAnnot ) > -$gapMax && ( $hybrid || ( $leftAnnot->getConsensusOverlap( $rightAnnot ) < 33 && $annot1->getLastField() == 5 && $annot2->getLastField() == 5 ) ) ) { print STDERR "scoreLTRPair(): Ok\n" if ( $DEBUG ); $intScore = 0.5; } elsif ( $DEBUG ) { print STDERR "scoreLTRPair(): Not Ok: cg = " . $leftAnnot->getConsensusGap( $rightAnnot ) . "\n"; print STDERR "scoreLTRPair(): co = " . $leftAnnot->getConsensusOverlap( $rightAnnot ) . "< gapMax = -$gapMax or \n" . " hybrid = $hybrid = 0 , co > 33\n"; print STDERR "scoreLTRPair(): or " . $leftAnnot->getLastField() . " != " . $rightAnnot->getLastField() . " != 5\n"; } } ## Last last chance from cycle 4 if ( $intScore == 0 ) { print STDERR "scoreLTRPair(): last last chance test\n" if ( $DEBUG ); if ( $leftAnnot->getSeq1End() >= $rightAnnot->getSeq1Beg() && $rightAnnot->getSeq1End() > $leftAnnot->getSeq1End() ) { # # If these element shave the same class, orientation, # similar names with small overlap or simply just # a large overlap then join them!!! Arghhh # if ( &areLTRNamesCompat( $annot1, $annot2 ) == 1 && $rightAnnot->getSeq1Beg() <= $leftAnnot->getSeq1End() - 12 || $rightAnnot->getSeq1Beg() <= $leftAnnot->getSeq1End() - 33 ) { print STDERR "scoreLTRPair(): Well if you insist\n" if ( $DEBUG ); $intScore = 0.25; } } } } elsif ( $DEBUG ) { print STDERR "scoreLTRPair(): Names are not compatible\n"; } if ( $intScore ) { print STDERR "Using elementdistance = $eleDistance\n" if ( $DEBUG ); $intScore += 1 / $eleDistance; } print STDERR "scoreLTRPair(): Returning score = $intScore\n" if ( $DEBUG ); return $intScore; } # scoreLTRPair() sub isInternal { my $annot = shift; my $HitName = $annot->getHitName(); if ( ( $HitName !~ /LTR/ || $HitName =~ /int$/ ) && ( $HitName =~ /int|ERV|PRIMA41|Harlequin|HUERS/i || $HitName =~ /.*[-_]I(_\S+)?$/ || $HitName =~ /MMERGLN|MMURS|MMVL30|MULV|MURVY|ETn|^IAP/ ) ) { return ( 1 ); } return ( 0 ); } sub isLTR { my $annot = shift; return ( !&isInternal( $annot ) ); } sub areLTRNamesCompat { my $annot1 = shift; my $annot2 = shift; my $annot1Name = $annot1->getHitName(); my $annot1Class = $annot1->getClassName(); my $annot2Name = $annot2->getHitName(); my $annot2Class = $annot2->getClassName(); my $DEBUG = 0; if ( $DEBUG ) { print STDERR "areLTRNamesCompat(): $annot1Name vs $annot2Name\n"; } # # Rules when annot1Name is internal # if ( $annot1Class eq $annot2Class ) { my $shortHitName = $annot2Name; if ( $annot2Name =~ /^MER\d{2}|^LTR\d{2}/ ) { $shortHitName =~ s/(^\w{5}).*/$1/; } elsif ( $annot2Name =~ /^MER\d[A-Z_a-z]|^LTR\d[A-Z_a-z]|ORR1/ ) { $shortHitName =~ s/(^\w{4}).*/$1/; } else { $shortHitName =~ s/(\S+\d)[\-a-zA-Z].*/$1/; } $shortHitName = quotemeta $shortHitName; print STDERR "areLTRNamesCompat(): comparing $annot1Name =~ $shortHitName\n" if ( $DEBUG ); if ( $annot1Name =~ /$shortHitName/ ) { print STDERR "areLTRNamesCompat(): returning 1\n" if ( $DEBUG ); return ( 1 ); } if ( $annot1Class !~ /ERVL-MaLR/ && $annot2Name !~ /pTR5/ # pTR5 is not a product of retrovrial transposition but # appears like a "processed pseudogene". Obviously cant # act like an LTR. # The following are all MaLR names || ( $annot1Name =~ /^THE1/ && $annot2Name =~ /THE1|MST[AB-]/ || $annot1Name =~ /^MST/ && $annot2Name =~ /THE1[BC-]|MST|MLT1[AB-]/ || $annot1Name =~ /^MLT-int/ && $annot2Name =~ /THE1C|MST[ABCD]|MLT1[ABC]/ || $annot1Name =~ /^MLT1[A-]/ && $annot2Name =~ /MST[CD-]|MLT1[ABC-]/ || $annot1Name =~ /^MLT1F/ && $annot2Name =~ /MLT1[DEFGHIJKL]/ || $annot1Name =~ /MLT1[B-H]/ && $annot2Name =~ /MLT1/ || $annot1Name =~ /^ORR1/ && $annot2Name =~ /^ORR1|^MT/ || $annot1Name =~ /^MT/ && $annot2Name =~ /^MT|^ORR1/ ) ) { print STDERR "areLTRNamesCompat(): returning 0.5\n" if ( $DEBUG ); return ( 0.5 ); } if ( $annot2Class !~ /ERVL-MaLR/ || ( $annot2Name =~ /^THE1/ && $annot1Name =~ /THE1|MST[AB-]/ || $annot2Name =~ /^MST/ && $annot1Name =~ /THE1[BC-]|MST|MLT1[AB-]/ || $annot2Name =~ /^MLT-int/ && $annot1Name =~ /THE1C|MST[ABCD]|MLT1[ABC]/ || $annot2Name =~ /^MLT1[A-]/ && $annot1Name =~ /MST[CD-]|MLT1[ABC-]/ || $annot2Name =~ /^MLT1F/ && $annot1Name =~ /MLT1[DEFGHIJKL]/ || $annot2Name =~ /MLT1[B-H]/ && $annot1Name =~ /MLT1/ || $annot2Name =~ /^ORR1/ && $annot1Name =~ /^ORR1|^MT/ || $annot2Name =~ /^MT/ && $annot1Name =~ /^MT|^ORR1/ ) ) { print STDERR "areLTRNamesCompat(): returning 0.5\n" if ( $DEBUG ); return ( 0.5 ); } } # Not compat if ( $DEBUG ) { print STDERR "areLTRNamesCompat(): Not compatible!\n"; } return ( 0 ); } ########################################################################## ########################################################################## ########################################################################## ##-------------------------------------------------------------------------## ## Use: my ( $seq_cnt, $totalSeqLen, $nonMaskedSeqLen, $totGCLevel, ## $totBPMasked ) = ## &maskSequence ( $seqDB, $annotationFile, ## $outputFile ); ## Returns ## ## $seq_cnt: The number of sequences in the FASTA file. ## $totalSeqLen: The absoulte length of all sequences combined. ## $nonMaskedSeqLen: Length of sequence (excluding runs of >20 N's ## and X's) of the pre-masked sequence. ## $totGCLevel: The GC content of the original sequence. ## $totBPMasked: The total bp we masked ## ##-------------------------------------------------------------------------## sub maskSequence { my $maskFormat = shift; my $seqDB = shift; my $annotationFile = shift; my $outputFile = shift; print "ProcessRepeats::maskSequence()\n" if ( $DEBUG ); my %annots = (); # # Open up a search results object # my $searchResults = CrossmatchSearchEngine::parseOutput( searchOutput => $annotationFile ); # # Read in annotations and throw away the rest # my $prevResult; for ( my $i = 0 ; $i < $searchResults->size() ; $i++ ) { my $result = $searchResults->get( $i ); my $start = $result->getQueryStart(); my $end = $result->getQueryEnd(); if ( defined $prevResult && $prevResult->getQueryName() eq $result->getQueryName() && $prevResult->getQueryEnd() >= $start ) { next if ( $prevResult->getQueryEnd() >= $end ); $start = $prevResult->getQueryEnd() + 1; } push @{ $annots{ $result->getQueryName() } }, { 'begin' => $start, 'end' => $end }; $prevResult = $result; } undef $searchResults; my @seqIDs = $seqDB->getIDs(); my $seq_cnt = scalar( @seqIDs ); my $sublength = $seqDB->getSubtLength(); my $totGCLevel = 100 * $seqDB->getGCLength() / $sublength; $totGCLevel = sprintf "%4.2f", $totGCLevel; my $totalSeqLen = 0; my $totBPMasked = 0; my $nonMaskedSeqLen = 0; my $workseq = ""; open OUTFILE, ">$outputFile"; foreach my $seqID ( @seqIDs ) { my $seq = $seqDB->getSequence( $seqID ); $totalSeqLen += length $seq; $workseq = $seq; $nonMaskedSeqLen += length $workseq; # This was counting all N' rather than only ones in blocks of 20 or more #$nonMaskedSeqLen -= ( $workseq =~ tr/[N,X]{20,}// ); while ( $workseq =~ /([X,N]{20,})/ig ) { $nonMaskedSeqLen -= length( $1 ); } foreach my $posRec ( @{ $annots{$seqID} } ) { my $beginPos = $posRec->{'begin'}; my $endPos = $posRec->{'end'}; my $repLen = $endPos - $beginPos + 1; substr( $workseq, $beginPos - 1, $repLen ) = "0" x ( $repLen ); if ( $maskFormat eq 'xsmall' ) { substr( $seq, $beginPos - 1, $repLen ) = lc( substr( $seq, $beginPos - 1, $repLen ) ); } elsif ( $maskFormat eq 'x' ) { substr( $seq, $beginPos - 1, $repLen ) = "X" x ( $repLen ); } else { substr( $seq, $beginPos - 1, $repLen ) = "N" x ( $repLen ); } $totBPMasked += $repLen; } print OUTFILE ">" . $seqID; my $desc = $seqDB->getDescription( $seqID ); if ( $desc ne "" ) { print OUTFILE " " . $desc; } print OUTFILE "\n"; $seq =~ s/(\S{50})/$1\n/g; $seq .= "\n" unless ( $seq =~ /.*\n+$/s ); print OUTFILE $seq; } close OUTFILE; return ( $seq_cnt, $totalSeqLen, $nonMaskedSeqLen, $totGCLevel, $totBPMasked ); } sub parseCATFile { my %nameValuePairs = @_; # Create ourselves as a hash my $retObj = new ArrayList; croak( "parseCATFile(): Missing 'file' parameter!\n" ) if ( !defined $nameValuePairs{'file'} ); open( XMATCH, $nameValuePairs{'file'} ) || die "parseCATFile(): Cannot " . "open $nameValuePairs{'file'}\n"; my $id = -1; my $matrixGCLevel = ""; my $matrixSubLevel = ""; my $alignData = ""; my $hit = undef; while ( ) { if ( /^[^\#].+\(\d+\)/ ) { # only occurs in parse lines if ( $hit && $alignData ) { $hit->setAlignData( $alignData ); $alignData = ""; } $hit = new Hit( $_ ); $id += 1; $hit->setID( $id ); $hit->setMatrixSubLevel( $matrixSubLevel ); $hit->setMatrixGCLevel( $matrixGCLevel ); $retObj->add( $hit ); } elsif ( $id >= 0 ) { $alignData .= $_ unless ( defined $nameValuePairs{'noAlignData'} ); } if ( /^Matrix =/ ) { /(\d\d)p(\d\d)g.matrix/; $matrixSubLevel = $1 if ( $1 ); $matrixGCLevel = $2 if ( $2 ); } } if ( $hit && $alignData ) { $hit->setAlignData( $alignData ); $alignData = ""; } close XMATCH; return $retObj; } ###################################################################### package Hit; sub new { my $class = shift; # @_ = (HIT, $parseLine) my $parseLine = shift; my $this = {}; bless $this, $class; # create a reference for each parsed line, the line # is parsed into a list of refs to hashes (LoH) $this->addParsedLine( $parseLine ); return $this; } ##-------------------------------------------------------------------------## ## ##=head2 clone() ## ## Use: my $newObj = $obj->clone(); ## ## Clone a Hit *duplicating* all the values of the old ## object in the new one. ## ##=cut ## ##-------------------------------------------------------------------------## sub clone { my $this = shift; my %newHash = %{$this}; my $newObj = \%newHash; bless $newObj, ref( $this ); return $newObj; } # Just these fields for now sub setFrom { my $this = shift; my $source = shift; $this->setSW( $source->getSW() ); $this->setPctSubst( $source->getPctSubst() ); $this->setPctDelete( $source->getPctDelete() ); $this->setPctInsert( $source->getPctInsert() ); $this->setSeq1Name( $source->getSeq1Name() ); $this->setSeq1Beg( $source->getSeq1Beg() ); $this->setSeq1End( $source->getSeq1End() ); $this->setLeftOver( $source->getLeftOver() ); $this->setRevComp( $source->getRevComp() ); $this->setHitName( $source->getHitName() ); $this->setClassName( $source->getClassName() ); $this->setSeq2Beg( $source->getSeq2Beg() ); $this->setSeq2End( $source->getSeq2End() ); $this->set3PrimeUnaligned( $source->get3PrimeUnaligned() ); $this->setLastField( $source->getLastField() ); $this->setID( $source->getID() ); $this->setEquivHash( $source->getEquivHash() ); $this->setMatrixSubLevel( $source->getMatrixSubLevel() ); $this->setMatrixGCLevel( $source->getMatrixGCLevel() ); $this->setAlignData( $source->getAlignData() ); $this->setCATID( $source->getCATID() ); # WARNING: Do not inherit "derivedFromAnnots". # These are the fields we are ignoring currently # setScore # setPctMatch # setSeq1Len # setSeq2Len # setStatus # setLeftLinkedHit # setRightLinkedHit return $this; } sub addParsedLine { my ( $this, $line ) = @_; # # parse the line passed, the returned refs will correspond to # the start and end of a hit, its name and orientation (rel to # the first sequence in crossmatch. the next elements ([4] - [end]) # will contain the simscore (%match) and start and ending points) # $line =~ s/^\s+//; my @tokens = split( /\s+/, $line ); $this->{'SW'} = $tokens[ 0 ]; # smith-waterman score $this->{'SUBS'} = $tokens[ 1 ]; # % substitution (bidirectional) # Flipped insertions and deletions to reflect nature and that seq2 # is the consensus/ancestor in RepeatMasker. Otherwise tricky to # understand logic elsewhere in script. $this->{'DEL'} = $tokens[ 2 ]; # % deletions seq1 vs seq2 $this->{'INS'} = $tokens[ 3 ]; # % insertions "" $this->{'SEQ1NAME'} = $tokens[ 4 ]; # name of seq1 $this->{'SEQ1BEG'} = $tokens[ 5 ]; # beginning of alignment $this->{'SEQ1END'} = $tokens[ 6 ]; # end of alignment $this->{'SEQ1REMAINING'} = $tokens[ 7 ]; ## ## TODO: Should do some fail early validation of the input...ie. sequence ## positions etc. ## $this->{'MATCH'} = 100 - $this->{'SUBS'}; if ( $tokens[ 8 ] eq "C" ) { # the C for complemented seqs $this->{'SEQ2NAME'} = $tokens[ 9 ]; # name of seq2 $this->{'REVCOMP'} = "C"; # to be added it to the array $this->{'SEQ2REMAINING'} = $tokens[ 10 ]; # 3' unaligned portion $this->{'SEQ2END'} = $tokens[ 11 ]; # end of seq2 alignment $this->{'SEQ2BEG'} = $tokens[ 12 ]; # beginning of seq2 alignment if ( defined $tokens[ 13 ] ) { if ( $tokens[ 13 ] eq '*' ) { if ( defined $tokens[ 14 ] ) { $this->{'LASTFIELD'} = $tokens[ 14 ]; $this->{'CATID'} = $tokens[ 15 ]; } else { $this->{'LASTFIELD'} = 1; # -nocut option } } else { $this->{'LASTFIELD'} = $tokens[ 13 ]; $this->{'CATID'} = $tokens[ 14 ]; } } else { $this->{'LASTFIELD'} = 1; # -nocut option } } elsif ( $tokens[ 8 ] eq '+' ) { # for cycles 1-4 $this->{'SEQ2NAME'} = $tokens[ 9 ]; $this->{'REVCOMP'} = '+'; $this->{'SEQ2BEG'} = $tokens[ 10 ]; $this->{'SEQ2END'} = $tokens[ 11 ]; # $this->{'SEQ2REMAINING'} = $tokens[ 12 ]; if ( $tokens[ 13 ] eq '*' ) { $this->{'LASTFIELD'} = $tokens[ 14 ]; $this->{'CATID'} = $tokens[ 15 ]; } else { $this->{'LASTFIELD'} = $tokens[ 13 ]; $this->{'CATID'} = $tokens[ 14 ]; } } else { $this->{'REVCOMP'} = '+'; $this->{'SEQ2NAME'} = $tokens[ 8 ]; $this->{'SEQ2BEG'} = $tokens[ 9 ]; $this->{'SEQ2END'} = $tokens[ 10 ]; $this->{'SEQ2REMAINING'} = $tokens[ 11 ]; if ( defined $tokens[ 12 ] ) { if ( $tokens[ 12 ] eq '*' ) { if ( defined $tokens[ 13 ] ) { $this->{'LASTFIELD'} = $tokens[ 13 ]; $this->{'CATID'} = $tokens[ 14 ]; } else { $this->{'LASTFIELD'} = 1; } } else { $this->{'LASTFIELD'} = $tokens[ 12 ]; $this->{'CATID'} = $tokens[ 13 ]; } } else { $this->{'LASTFIELD'} = 1; } } # we will return an array with the beginning and end positions # of the hit, its name, and its orientation $this->{'SEQ1REMAINING'} =~ tr/[()]//d; # remove flanking "()" $this->{'SEQ2REMAINING'} =~ tr/[()]//d; # remove flanking "()" $this->setClassName( "Unknown" ) unless ( $this->getClassName() ); } # the following methods are used to get the data sub getMatrixSubLevel { my $this = shift; return $this->{'MATRIXSUB'}; } sub setMatrixSubLevel { my $this = shift; $this->{'MATRIXSUB'} = shift; } sub getMatrixGCLevel { my $this = shift; return $this->{'MATRIXGC'}; } sub setMatrixGCLevel { my $this = shift; $this->{'MATRIXGC'} = shift; } # smith-waterman score sub getSW { my $this = shift; return $this->{'SW'}; } sub setSW { my $this = shift; $this->{'SW'} = shift; } sub getAlignData { my $this = shift; return $this->{'ALIGNDATA'}; } sub setAlignData { my $this = shift; $this->{'ALIGNDATA'} = shift; } sub getScore { my $this = shift; return $this->{'SW'}; } sub setScore { my $this = shift; $this->{'SW'} = shift; } # % substitution (seq1 vs seq2) sub getPctSubst { my $this = shift; return $this->{'SUBS'}; } sub setPctSubst { my $this = shift; $this->{'SUBS'} = shift; } # leftover sequence of query sub getLeftOver { my $this = shift; return $this->{'SEQ1REMAINING'}; } sub setLeftOver { my $this = shift; $this->{'SEQ1REMAINING'} = shift; } # percent match sub getPctMatch { my $this = shift; return $this->{'MATCH'}; } sub setPctMatch { my $this = shift; $this->{'MATCH'} = shift; } ## Flipped insertions and deletions to reflect RepeatMasker standard: ## deletions and insertions should be with respect to the consensus ## (ancestral) sequences (seq2 in RepeatMasker) # % deletions sub getPctDelete { my $this = shift; return $this->{'DEL'}; } sub setPctDelete { my $this = shift; $this->{'DEL'} = shift; } # % insertions sub getPctInsert { my $this = shift; return $this->{'INS'}; } sub setPctInsert { my $this = shift; $this->{'INS'} = shift; } # name of sequence 1 sub getSeq1Name { my $this = shift; return $this->{'SEQ1NAME'}; } sub setSeq1Name { my $this = shift; $this->{'SEQ1NAME'} = shift; } # beginning of alignment sub getSeq1Beg { my $this = shift; return $this->{'SEQ1BEG'}; } sub setSeq1Beg { my $this = shift; $this->{'SEQ1BEG'} = shift; } # end of alignment sub getSeq1End { my $this = shift; return $this->{'SEQ1END'}; } sub setSeq1End { my $this = shift; $this->{'SEQ1END'} = shift; } #len of seq1 sub getSeq1Len { my $this = shift; return $this->{'SEQ1LEN'}; } sub setSeq1Len { my $this = shift; $this->{'SEQ1LEN'} = shift; } # name of hit (tokens[9]) sub getSeq2Name { my $this = shift; return $this->{'SEQ2NAME'}; } sub setSeq2Name { my $this = shift; $this->{'SEQ2NAME'} = shift; } # returns the orientation of the hit sub getRevComp { my $this = shift; return $this->{'REVCOMP'}; } sub setRevComp { my $this = shift; $this->{'REVCOMP'} = shift; } # returns beginning of seq 2 alignment sub getSeq2Beg { my $this = shift; return $this->{'SEQ2BEG'}; } sub setSeq2Beg { my $this = shift; $this->{'SEQ2BEG'} = shift; } # returns end of seq 2 alignment sub getSeq2End { my $this = shift; return $this->{'SEQ2END'}; } sub setSeq2End { my $this = shift; $this->{'SEQ2END'} = shift; } # returns length of 3' unaligned portion sub get3PrimeUnaligned { my $this = shift; return $this->{'SEQ2REMAINING'}; } sub set3PrimeUnaligned { my $this = shift; $this->{'SEQ2REMAINING'} = shift; } # returns the length of Seq2 sub getSeq2Len { my $this = shift; #if ( ! defined $this->{'SEQ2LEN'} || $this->{'SEQ2LEN'} !~ /[\d-\.]+/ ) #{ $this->{'SEQ2LEN'} = $this->{'SEQ2END'} + $this->{'SEQ2REMAINING'}; # for "dotplots" #} return $this->{'SEQ2LEN'}; } # NOTE: THIS ISN'T USED.... sub setSeq2Len { my $this = shift; $this->{'SEQ2LEN'} = shift; } # returns the length of Seq2 sub getEquivHash { my $this = shift; return $this->{'EQUIVHASH'}; } sub setEquivHash { my $this = shift; $this->{'EQUIVHASH'} = shift; } #returns level of condensation of DNA sub getLastField { my $this = shift; return $this->{'LASTFIELD'}; } sub setLastField { my $this = shift; $this->{'LASTFIELD'} = shift; } #returns line ID number sub getID { my $this = shift; return $this->{'LINEID'}; } sub setID { my $this = shift; $this->{'LINEID'} = shift; } # The ID used in the cat file sub setCATID { my $this = shift; $this->{'CATID'} = shift; } sub getCATID { my $this = shift; return $this->{'CATID'}; } #returns status ( currently only used for DNA transposon analysis ) sub getStatus { my $this = shift; return $this->{'STATUS'}; } sub setStatus { my $this = shift; $this->{'STATUS'} = shift; } sub getBlockLeftJoin { my $this = shift; return $this->{'BLOCKLEFTJOIN'}; } sub setBlockLeftJoin { my $this = shift; $this->{'BLOCKLEFTJOIN'} = shift; } sub getBlockRightJoin { my $this = shift; return $this->{'BLOCKRIGHTJOIN'}; } sub setBlockRightJoin { my $this = shift; $this->{'BLOCKRIGHTJOIN'} = shift; } sub getRMAlignQueryLen { my $this = shift; return $this->{'RMALIGNQUERYLEN'}; } sub setRMAlignQueryLen { my $this = shift; $this->{'RMALIGNQUERYLEN'} = shift; } #returns element ( if set ) who is linked to our left edge ( Seq1Begin ) # sub getLeftLinkedHit { my $this = shift; return $this->{'LEFTHIT'}; } sub setLeftLinkedHit { my $this = shift; $this->{'LEFTHIT'} = shift; } #returns element ( if set ) who is linked to our right edge ( Seq1End ) # sub getRightLinkedHit { my $this = shift; return $this->{'RIGHTHIT'}; } sub setRightLinkedHit { my $this = shift; $this->{'RIGHTHIT'} = shift; } sub getClassName { my $this = shift; my ( $hitName, $className ) = split( /\#/, $this->{'SEQ2NAME'} ); return $className; } sub setClassName { my $this = shift; my $newClassName = shift; my ( $hitName, $className ) = split( /\#/, $this->{'SEQ2NAME'} ); $this->{'SEQ2NAME'} = $hitName . "#" . $newClassName; } sub getHitName { my $this = shift; my ( $hitName, $className ) = split( /\#/, $this->{'SEQ2NAME'} ); return $hitName; } sub setHitName { my $this = shift; my $newHitName = shift; my ( $hitName, $className ) = split( /\#/, $this->{'SEQ2NAME'} ); $this->{'SEQ2NAME'} = $newHitName . "#" . $className; } sub getUniqID { my $this = shift; return ( $this->{'SW'} . $this->{'SUBS'} . $this->{'DEL'} . $this->{'INS'} . $this->{'SEQ1NAME'} . $this->{'SEQ1BEG'} . $this->{'SEQ1END'} . $this->{'SEQ1REMAINING'} . $this->{'REVCOMP'} . $this->{'SEQ2NAME'} . $this->{'SEQ2BEG'} . $this->{'SEQ2END'} . $this->{'SEQ2REMAINING'} ); } sub getDerivedFromAnnot { my $this = shift; return ( $this->{'DERIVEDFROM'} ); } sub setDerivedFromAnnot { my $this = shift; my $member = shift; my $annot = new Hit; $annot->setFrom( $member ); $this->{'DERIVEDFROM'} = [ $member ]; } sub addDerivedFromAnnot { my $this = shift; my $member = shift; #print STDERR "Pushing DERIVED\n"; my $annot = new Hit; $annot->setFrom( $member ); ## This needs to be a deep copy if ( $member->getDerivedFromAnnot() ) { foreach my $subAnnot ( @{ $member->getDerivedFromAnnot() } ) { $annot->addDerivedFromAnnot( $subAnnot ); } } push @{ $this->{'DERIVEDFROM'} }, $annot; } sub print { my $this = shift; my $noLineTerm = shift; my $lineTerm = "\n" if ( !defined $noLineTerm ); my $outStr = sprintf( "%4d %3.1f %3.1f %3.1f %-18.18s %8d %8d " . "%8d %1s %-18.18s %8d %8d %8d %3d %3d %6s %6s$lineTerm", $this->{'SW'}, $this->{'SUBS'}, $this->{'DEL'}, $this->{'INS'}, $this->{'SEQ1NAME'}, $this->{'SEQ1BEG'}, $this->{'SEQ1END'}, $this->{'SEQ1REMAINING'}, $this->{'REVCOMP'}, $this->{'SEQ2NAME'}, $this->{'SEQ2BEG'}, $this->{'SEQ2END'}, $this->{'SEQ2REMAINING'}, $this->{'LASTFIELD'}, $this->{'LINEID'}, $this->{'CATID'}, $this->{'STATUS'} ); print STDERR $outStr; } sub printBrief { my $this = shift; my $outStr = sprintf( "%4d %-18.18s %8d %8d " . "%1s %-18.18s %8d %8d\n", $this->{'SW'}, $this->{'SEQ1NAME'}, $this->{'SEQ1BEG'}, $this->{'SEQ1END'}, $this->{'REVCOMP'}, $this->{'SEQ2NAME'}, $this->{'SEQ2BEG'}, $this->{'SEQ2END'} ); print STDERR $outStr; } sub printLeftRightLinks { my $this = shift; if ( $this->getLeftLinkedHit() != undef ) { if ( $this->getLeftLinkedHit() == $this ) { print STDERR " BLOCKED ( links to self )\n"; } else { print STDERR " "; $this->getLeftLinkedHit()->printBrief(); } } else { print STDERR " UNDEF\n"; } print STDERR " -->"; $this->printBrief(); if ( $this->getRightLinkedHit() != undef ) { if ( $this->getRightLinkedHit() == $this ) { print STDERR " BLOCKED ( links to self )\n"; } else { print STDERR " "; $this->getRightLinkedHit()->printBrief(); } } else { print STDERR " UNDEF\n"; } } sub sanityCheckConsPos { my $this = shift; my $hashRef = shift; # Find begining print STDERR "Sanity Checking:\n"; my $firstInChain = $this; my $detectLoop = 0; while ( $firstInChain->getLeftLinkedHit() != undef && $detectLoop < 50 ) { # Ok to link to self. if ( $firstInChain == $firstInChain->getLeftLinkedHit() ) { last; } $firstInChain = $firstInChain->getLeftLinkedHit(); $detectLoop++; } if ( $detectLoop >= 50 ) { print STDERR "WARNING! This chain contains a loop!!!!\n"; } # Now print my $nextInChain = $firstInChain; do { if ( $nextInChain == $this ) { print STDERR "--> ("; } else { print STDERR " ("; } print STDERR "" . $hashRef->{ $nextInChain->getID() } . "): "; $nextInChain->printBrief(); # Link to itself is also allowed if ( $nextInChain == $nextInChain->getRightLinkedHit() ) { last; } $nextInChain = $nextInChain->getRightLinkedHit(); } while ( $nextInChain ) } # # Print an element and ( if exists ) all of it chain members in order # sub printLinks { my $this = shift; # Find begining my $firstInChain = $this; my $detectLoop = 0; while ( $firstInChain->getLeftLinkedHit() != undef && $firstInChain != $firstInChain->getLeftLinkedHit() && $detectLoop < 50 ) { $firstInChain = $firstInChain->getLeftLinkedHit(); $detectLoop++; } if ( $detectLoop >= 50 ) { print STDERR "WARNING! This chain contains a loop!!!!\n"; } # Now print my $nextInChain = $firstInChain; do { if ( $nextInChain == $this ) { print STDERR "--> "; } else { print STDERR " "; } $nextInChain->printBrief(); # Link to itself is also allowed if ( $nextInChain == $nextInChain->getRightLinkedHit() ) { last; } $nextInChain = $nextInChain->getRightLinkedHit(); } while ( $nextInChain ) } # Intelligently relink an element's edges prior to removal sub removeFromJoins { my $this = shift; my $DEBUG = 0; my $left = $this->getLeftLinkedHit(); my $right = $this->getRightLinkedHit(); if ( $DEBUG ) { print STDERR "Remove from joins:\n"; $this->printLinks(); } # Just reset these since we are being removed $this->setLeftLinkedHit( undef ) unless ( $this == $left ); # don't removedlinks to itself. $this->setRightLinkedHit( undef ) unless ( $this == $right ); # don't remove links to itself. $left = undef if ( $this == $left ); $right = undef if ( $this == $right ); if ( defined $left && !defined $right ) { # Remove left's edge reference $left->setRightLinkedHit( undef ); } elsif ( defined $left && defined $right ) { # Join our neighbors together $left->setRightLinkedHit( $right ); $right->setLeftLinkedHit( $left ); } elsif ( !defined $left && defined $right ) { # Remove rights's edge reference $right->setLeftLinkedHit( undef ); } } # Intelligently relink an elements edges for insertion sub join { my $this = shift; my $partner = shift; if ( $DEBUG ) { print STDERR "join this:\n"; $this->printLinks(); print STDERR "to partner\n"; $partner->printLinks(); } die "join(): Invalid join! \$this == \$partner" if ( $this == $partner ); my @cluster = (); foreach my $chain ( $this, $partner ) { push @cluster, $chain; my $nextInChain = $chain; while ( $nextInChain->getLeftLinkedHit() && $nextInChain->getLeftLinkedHit() != $nextInChain ) { $nextInChain = $nextInChain->getLeftLinkedHit(); push @cluster, $nextInChain; } $nextInChain = $chain; while ( $nextInChain->getRightLinkedHit() && $nextInChain->getRightLinkedHit() != $nextInChain ) { $nextInChain = $nextInChain->getRightLinkedHit(); push @cluster, $nextInChain; } } # Sort cluster @cluster = sort { $a->comparePositionOrder( $b ) } ( @cluster ); my $lastAnnot = undef; foreach my $annot ( @cluster ) { next if ( $annot == $lastAnnot ); $annot->setLeftLinkedHit( $lastAnnot ); $annot->setRightLinkedHit( undef ); if ( $lastAnnot ) { $lastAnnot->setRightLinkedHit( $annot ); } $lastAnnot = $annot; } if ( $DEBUG ) { print STDERR "Now look what we did with this thing:\n"; $this->printLinks(); } } # Intelligently relink an elements edges for insertion sub resortJoins { my $this = shift; my @cluster = (); push @cluster, $this; my $nextInChain = $this; while ( $nextInChain->getLeftLinkedHit() && $nextInChain->getLeftLinkedHit() != $nextInChain ) { $nextInChain = $nextInChain->getLeftLinkedHit(); push @cluster, $nextInChain; } $nextInChain = $this; while ( $nextInChain->getRightLinkedHit() && $nextInChain->getRightLinkedHit() != $nextInChain ) { $nextInChain = $nextInChain->getRightLinkedHit(); push @cluster, $nextInChain; } # Sort cluster @cluster = sort { $a->comparePositionOrder( $b ) } ( @cluster ); my $lastAnnot = undef; foreach my $annot ( @cluster ) { next if ( $annot == $lastAnnot ); $annot->setLeftLinkedHit( $lastAnnot ); $annot->setRightLinkedHit( undef ); if ( $lastAnnot ) { $lastAnnot->setRightLinkedHit( $annot ); } $lastAnnot = $annot; } if ( $DEBUG ) { print STDERR "Now look what we did with this thing:\n"; $this->printLinks(); } } # Intelligently merge $partner information with us. Then remove # our partner's links to ready it for removal. sub mergeSimpleLow { my $this = shift; my $partner = shift; my $DEBUG = 0; if ( $DEBUG ) { print STDERR "mergeSimpleLow(): THIS links:\n"; $this->printLinks(); print STDERR "mergeSimpleLow(): PARTNER links\n"; $partner->printLinks(); } # Pick appropriate substitution/insertion/deletion stats for # merged element. if ( $this->getHitName() eq $partner->getHitName() ) { $this->adjustSubstLevel( $partner, "noQueryOverlapAdj" ); $this->setSW( $partner->getSW() ) if ( $this->getSW() < $partner->getSW() ); } else { if ( $this->getSW() < $partner->getSW() ) { $this->setPctSubst( $partner->getPctSubst() ); $this->setPctDelete( $partner->getPctDelete() ); $this->setPctInsert( $partner->getPctInsert() ); $this->setSW( $partner->getSW() ); $this->setSeq2Name( $partner->getSeq2Name() ); } # else....keep ours } # Pick the lower query begin position $this->setSeq1Beg( $partner->getSeq1Beg() ) if ( $this->getSeq1Beg() > $partner->getSeq1Beg() ); # Pick the higher query end position if ( $this->getSeq1End() < $partner->getSeq1End() ) { $this->setSeq1End( $partner->getSeq1End() ); $this->setLeftOver( $partner->getLeftOver() ); } my $newQuerySize = $this->getSeq1End() - $this->getSeq1Beg() + 1; my $newSeq2End = $newQuerySize + ( $this->getPctDelete() - $this->getPctInsert() ) * $newQuerySize / 100; $this->setSeq2Beg( 1 ); $this->setSeq2End( sprintf( "%d", $newSeq2End ) ); ## TODO: What happens when our partner has joins that ## should point to the new merger? $partner->removeFromJoins(); if ( $DEBUG ) { print STDERR "mergeSimpleLow(): Final THIS links:\n"; $this->printLinks(); print STDERR "mergeSimpleLow(): Leaving\n"; } } # Intelligently merge $partner information with us. Then remove # our partner's links to ready it for removal. sub merge { my $this = shift; my $partner = shift; my $DEBUG = 0; if ( $DEBUG ) { print STDERR "merge(): Entered\n"; print STDERR "merge(): THIS links:\n"; $this->printLinks(); print STDERR "merge(): PARTNER links\n"; $partner->printLinks(); } # Pick appropriate substitution/insertion/deletion stats for # merged element. $this->adjustSubstLevel( $partner ); # Pick the lower query begin position $this->setSeq1Beg( $partner->getSeq1Beg() ) if ( $this->getSeq1Beg() > $partner->getSeq1Beg() ); # Pick the higher query end position if ( $this->getSeq1End() < $partner->getSeq1End() ) { $this->setSeq1End( $partner->getSeq1End() ); $this->setLeftOver( $partner->getLeftOver() ); } # Find the lower consensus begin position my $minBeg = $partner->getSeq2Beg(); $minBeg = $this->getSeq2Beg() if ( $this->getSeq2Beg() < $minBeg ); # Find the higher consensus end position my $maxEnd = $partner->getSeq2End(); $maxEnd = $this->getSeq2End() if ( $this->getSeq2End() > $maxEnd ); # Sanity check consensus position choices. If # the hybrid range ( lowest begin & highest end ) of # the two elements is > 2* query length then # use the original range of the higher scoring # element instead of the hybrid. if ( ( $maxEnd - $minBeg + 1 ) > ( 2 * ( $this->getSeq1End() - $this->getSeq1Beg() + 1 ) ) ) { # Use the highest scoring annotation's range if ( $this->getSW() < $partner->getSW() ) { $this->setSeq2Beg( $partner->getSeq2Beg() ); $this->setSeq2End( $partner->getSeq2End() ); $this->set3PrimeUnaligned( $partner->get3PrimeUnaligned() ); } } else { # Use a hybrid range $this->setSeq2Beg( $minBeg ); if ( $this->getSeq2End() < $partner->getSeq2End() ) { $this->setSeq2End( $maxEnd ); $this->set3PrimeUnaligned( $partner->get3PrimeUnaligned() ); } } # Use the higher score and it's associated consensus name if ( $this->getSW() < $partner->getSW() ) { $this->setSW( $partner->getSW() ); $this->setSeq2Name( $partner->getSeq2Name() ); } $partner->removeFromJoins(); if ( $DEBUG ) { print STDERR "merge(): Final THIS links:\n"; $this->printLinks(); print STDERR "merge(): Leaving\n"; } } #### NEW TESTING sub getAdjustedSubstLevel { my $this = shift; my $partner = shift; my $method = shift; my $DEBUG = 0; if ( $DEBUG ) { print STDERR "getAdjustedSubstLevel( this, partner, method = $method ): Entered\n"; print STDERR "this: "; $this->print(); print STDERR "partner: "; $partner->print(); } warn "getAdjustedSubstLevel(): Unknown method $method" if ( defined $method && $method !~ /noQueryOverlapAdj/ ); # For calculation, choose best matching consensus in overlapped region my $subBases; my $delBases; my $insBases; # # This change allows us to calculate these stats based on # the complete alignemnt returned by RM. Previously this would # only account for one fragment. # TODO: Allow this change after a sanity check! # my $thisLength = $this->getSeq1End() - $this->getSeq1Beg() + 1; #my $thisLength = $this->getRMAlignQueryLen(); my $partnerLength = $partner->getSeq1End() - $partner->getSeq1Beg() + 1; #my $partnerLength = $partner->getRMAlignQueryLen(); my $qo = 0; $qo = $this->getQueryOverlap( $partner ) unless ( $method eq "noQueryOverlapAdj" ); my $co = $this->getConsensusOverlap( $partner ); print STDERR "adjustSubstLevel(): qo = $qo, co = $co, " . "thisLength = $thisLength, " . "partnerLength = $partnerLength\n" if ( $DEBUG ); if ( $this->getPctSubst() <= $partner->getPctSubst() ) { $subBases = $thisLength * $this->getPctSubst() + ( $partnerLength - $qo ) * $partner->getPctSubst(); $delBases = $thisLength * $this->getPctDelete() + ( $partnerLength - $qo ) * $partner->getPctDelete(); $insBases = $thisLength * $this->getPctInsert() + ( $partnerLength - $qo ) * $partner->getPctInsert(); } else { $subBases = $partnerLength * $partner->getPctSubst() + ( $thisLength - $qo ) * $this->getPctSubst(); $delBases = $partnerLength * $partner->getPctDelete() + ( $thisLength - $qo ) * $this->getPctDelete(); $insBases = $partnerLength * $partner->getPctInsert() + ( $thisLength - $qo ) * $this->getPctInsert(); } # Include the query or consensus gap in $PctInsert or $PctDelete # Expressed in percent (not bases) if ( $qo <= 10 && $method ne "noQueryOverlapAdj" ) { if ( $co - $qo > 0 ) { $insBases += ( $co - $qo ) * 100; } else { $delBases -= ( $co - $qo ) * 100; } } my $totalLength = $thisLength + $partnerLength - $qo; $totalLength = 1 if $totalLength < 1; print STDERR "getAdjustedSubstLevel(): subBases=" . ( $subBases / 100 ) . ", delBases=" . ( $delBases / 100 ) . ", insBases= " . ( $insBases / 100 ) . ", totalLength = $totalLength\n" if ( $DEBUG ); return ( ( $subBases / 100 ), sprintf( "%0.2f", ( $subBases / $totalLength ) ), ( $delBases / 100 ), sprintf( "%0.2f", ( $delBases / $totalLength ) ), ( $insBases / 100 ), sprintf( "%0.2f", ( $insBases / $totalLength ) ) ); } # Adjust the current element's divergence characteristics based # on the overlap between it and a past element ( j ). If $ignorediv # is set simply set the current to the past element -- note this # is possibly not needed anymore. sub adjustSubstLevel { my $this = shift; my $partner = shift; my $method = shift; my $DEBUG = 0; if ( $DEBUG ) { print STDERR "adjustSubstLevel( $this, $partner, $method ): Entered\n"; $this->print(); $partner->print(); } warn "adjustSubstLevel(): Unknown method $method" if ( defined $method && $method !~ /noQueryOverlapAdj/ ); my ( $subBases, $subPct, $delBases, $delPct, $insBases, $insPct ) = getAdjustedSubstLevel( $this, $partner, $method ); $this->setPctSubst( $subPct ); $this->setPctDelete( $delPct ); $this->setPctInsert( $insPct ); } sub comparePositionOrder { my $this = shift; my $refElement = shift; # Am I on the left or right of the reference? return ( $this->getSeq1Name() ) cmp( $refElement->getSeq1Name() ) || ( $this->getSeq1Beg() ) <=> ( $refElement->getSeq1Beg() ) || ( $refElement->getSeq1End() ) <=> ( $this->getSeq1End() ) || ( $refElement->getSW() ) <=> ( $this->getSW() ); } # # Tests if this current element is joined # so that it "contains" the refElement. # # # +-------------------------------------+ # | | # ------- .. --refElement--- .. ---this---- # # or # # +-------------------------------------+ # | | # --this--- .. --refElement--- .. -------- # sub containsElement { my $this = shift; my $refElement = shift; # Am I on the left or right of the reference? if ( $this->comparePositionOrder( $refElement ) > 0 ) { # I am on the right my $leftLink = $this->getLeftLinkedHit(); return 0 if ( $leftLink == undef || $leftLink == $this ); return 1 if ( $leftLink->comparePositionOrder( $refElement ) < 0 ); } else { # I am on the left my $rightLink = $this->getRightLinkedHit(); return 0 if ( $rightLink == undef || $rightLink == $this ); return 1 if ( $rightLink->comparePositionOrder( $refElement ) > 0 ); } } sub getConsensusGap { my $this = shift; my $partner = shift; return ( -$this->getConsensusOverlap( $partner ) ); } sub getConsensusOverlap { my $this = shift; my $partner = shift; warn "getConsensusOverlap() mismatched orientations!\n" if ( $this->getRevComp() != $partner->getRevComp() ); my $left = $this; my $right = $partner; if ( $left->comparePositionOrder( $right ) > 0 ) { # I am on the right $left = $partner; $right = $this; } if ( $this->getRevComp() eq "C" ) { # 0 # <--left--, <--right--- # return ( $right->getSeq2End() - $left->getSeq2Beg() + 1 ); } else { # 0 # --left-->, --right--> # return ( $left->getSeq2End() - $right->getSeq2Beg() + 1 ); } } sub getQueryGap { my $this = shift; my $partner = shift; return ( -$this->getQueryOverlap( $partner ) ); } sub getQueryOverlap { my $this = shift; my $partner = shift; my $sign = 1; $sign = -1 if ( $this->getSeq1Beg() > $partner->getSeq1End() || $partner->getSeq1Beg > $this->getSeq1End() ); my $left = $this->getSeq1Beg(); $left = $partner->getSeq1Beg() if ( $partner->getSeq1Beg() > $left ); my $right = $this->getSeq1End(); $right = $partner->getSeq1End() if ( $partner->getSeq1End() < $right ); return ( abs( $right - $left + 1 ) * $sign ); } sub getChainConsensusExtents { my $this = shift; my $seq2Beg = $this->getSeq2Beg(); my $seq2End = $this->getSeq2End(); my $nextInChain = $this; while ( $nextInChain = $nextInChain->getRightLinkedHit() ) { $seq2Beg = $nextInChain->getSeq2Beg() if ( $seq2Beg > $nextInChain->getSeq2Beg() ); } my $nextInChain = $this; while ( $nextInChain = $nextInChain->getRightLinkedHit() ) { $seq2End = $nextInChain->getSeq2End() if ( $seq2End < $nextInChain->getSeq2End() ); } return ( $seq2Beg, $seq2End ); } package main;