#!/usr/bin/env perl # # A first-stage fasta filter for RepeatScout libraries: removes anything under 50 # nt. Removes anything that is over 50% low-complexity vis a vis TRF or NSEG. # # Status is printed to STDERR, the resulting "good" repeats to STDOUT. # # # Created: 20050416 # Author: Neil Jones #----------------------------------------------------------------------------------- use File::Temp; if( grep /^--?h$/, @ARGV ) { exec("perldoc $0"); } # # Same parameters as Alkes' earlier version # my $NSEG_THRESHOLD = .5; my $TRF_THRESHOLD = .5; my $MIN_LENGTH = 50; my $TRF_COMMAND = $ENV{'TRF_COMMAND'} || "trf"; my $NSEG_COMMAND = $ENV{'NSEG_COMMAND'} || "nseg"; my ($head, $body); my $repeat_num = 1; my $num_deleted = 0; my $num_skipped = 0; my $line = 0; while(<>) { ++$line; chomp; next if /^#/; # Allow embedded comments. if( /^>/ || eof() ) { $body .= $_ if eof(); my $xx = $_; my $nseg; my $trf; if( length($body) > $MIN_LENGTH) { my $tmp = new File::Temp( DIR => '.'); print $tmp "$head (RR=$repeat_num)\n"; print $tmp "$body\n"; $nseg = nseg($tmp, $body); $trf = trf($tmp, $body); $tmp->close(); $tmp = undef; # Remove any reference counts so it's deleted. if( $nseg <= $NSEG_THRESHOLD && $trf <= $TRF_THRESHOLD ) { printf("$head (RR=$repeat_num. TRF=%.3f NSEG=%.3f)\n", $trf, $nseg); for(my $ii = 0; $ii < length($body); $ii += 80) { print substr($body, $ii, 80), "\n"; } } else { print STDERR "deleting $header: $nseg / $trf\n"; ++$num_deleted; } ++$repeat_num; } else { ++$num_skipped; } $head = $xx; $body = ''; } else { die("Improperly formatted file: offensive line #$line was\n$_\n") if /[^ATGC]/; $body .= $_; } } print STDERR "$num_deleted deleted. $repeat_num saved. $num_skipped skipped for length.\n"; sub nseg { my ($file, $text) = @_; local *NSEG; my $nseg = $NSEG_COMMAND; open(NSEG, "$nseg $file -x -c 100 |"); my $len = 0; while() { chomp; next if /^>/; s/n//g; s/N//g; $len += 1*length($_); } close(NSEG); return (length($text)-$len) / length($text); } sub trf { my ($file, $text) = @_; local *TRF; my $trf = $TRF_COMMAND; system("$trf $file 2 7 7 80 10 50 500 -f -d -m > /dev/null"); open(TRF, "<$file.2.7.7.80.10.50.500.mask") or die($!); my $len = 0; while() { chomp; next if /^>/; s/n//g; s/N//g; $len += 1*length($_); } close(TRF); # TRF produces many output files. unlink "$file.2.7.7.80.10.50.500.mask"; unlink "$file.2.7.7.80.10.50.500.dat"; unlink "$file.2.7.7.80.10.50.500.1.txt.html"; unlink "$file.2.7.7.80.10.50.500.1.html"; return (length($text)-$len) / length($text); } __END__ =head1 NAME filter-stage-1.prl -- a first stage post-processing tool for RepeatScout output. =head1 SYNOPSIS cat repeats.fa | filter-stage-1.prl > repeats-filtered.prl =head1 OPTIONS none other than "-h" (the output of which you're reading), but you will either want trf and nseg in your PATH, or you will want to set the environment variables TRF_COMMAND and NSEG_COMMAND to provide the executable. =head1 DESCRIPTION This tool takes a repeat library, which is a Fasta-formatted sequence file, and filters out any sequence that is deemed to be more than 50% low-complexity by either TRF or NSEG or both. Note that one algorithm needs to make the determination; we don't check the total number of unique bases masked out by TRF and NSEG individually. =head1 ENVIRONMENT VARIABLES In order for this program to find TRF and NSEG, you need to either place said programs in your PATH, or you need to add the environment variables TRF_COMMAND and NSEG_COMMAND. The value of those variables should be the path at which the respective program can be found. =cut