#!/usr/bin/env perl =head1 NAME reads2fasta.pl =head1 SYNOPSIS reads2fasta.pl -pd -of -oq -help readlistfile1 readlistfile2 ... etc Note:all given options are required except help =head1 DESCRIPTION This script takes as input a readlist or readlist to retrieve. The read id to be retrieved must be in the 1st (whitespace separated) column of the readlist(s) supplied on the command line. If the phd file for the read exists in the phd_dir specified by -pd by the command line, the fasta and quals are written. Note: Newbler ace file information (eg. _left,_right,.pr) is stripped from the read id to enable use of readlists generated by NewblerAce2ReadPair.pl Options are as follows: -pd phd_dir where the phd files are kept -of path to output fasta file. This file will contain the fasta sequences. -of path to output qual file. This file will contain the quality scores. space separated command line arguments are the read lists. =head1 VERSION $Revision: 1.3 $ $Date: 2009-08-26 17:18:15 $ =head1 HISTORY =over =item * Brian Foster 2008/12/12 Creation =back =cut # Includes use strict; use Carp; use vars qw( $RealBin $optphddir $optofasta $optoqual $opth ); use FindBin qw($RealBin); use Getopt::Long; use Pod::Usage; use lib "$RealBin/../lib"; use File::Path; use File::Basename; #============================================================================# # INPUT VALIDATION #============================================================================# if( !GetOptions( "pd=s"=>\$optphddir, "of=s"=>\$optofasta, "oq=s"=>\$optoqual, "h"=>\$opth, ) ) { printHelp(); } pod2usage(-verbose=>2) and exit 1 if defined $opth; # Check ARGVs # if (@ARGV < 1){ print STDERR "no input read files given\n"; printHelp(); } else{ foreach my $file (@ARGV){ if (! -e $file){ print STDERR "$file given on command line not accessible\n"; printHelp(); } } } # Validate all options # if (! defined $optphddir || ! defined $optofasta || ! defined $optoqual ){ printHelp(); } if (-e $optofasta || -e $optoqual){ confess "$optofasta or $optoqual already exist\n"; printHelp(); } $optphddir =~ s/\/$//; if (! -d $optphddir){ print STDERR "$optphddir is not found\n"; printHelp(); } if ( ! -d dirname($optofasta)){ mkpath (dirname($optofasta)); } if ( ! -w dirname($optofasta)){ print STDERR dirname($optofasta), "Not writable\n"; printHelp(); } if ( ! -d dirname($optoqual)){ mkpath (dirname($optoqual)); } if ( ! -w dirname($optoqual)){ print STDERR dirname($optoqual), "Not writable\n"; printHelp(); } #============================================================================# # MAIN #============================================================================# # get hash of phd files key = read id , value = full path to phd # opendir(DIR, "$optphddir")or confess "Cannot opendir $optphddir $!"; # note what about phd.2 etc my %phdfiles = map{$_=~/^(.*)\.phd\.\d+$/,"${optphddir}/$_"} grep {$_=~/^.*\.phd\.\d+$/} readdir(DIR); closedir(DIR); # get read/phd info # my %seq; my (@F,$read); foreach my $file (@ARGV){ open (IN , $file) or confess "can't open $file\n"; while(){ chomp; @F = (split/\s+/,$_); ($read = $F[0]) =~ s/\_(?:left|right).*$//; $read =~ s/\.\d+\-\d+.*$//; $read =~ s/\.pr\d+.*$//; if (exists $phdfiles{$read}){ ($seq{$read}{seq},$seq{$read}{qual}) = _parsePhd($phdfiles{$read}); } } close(IN); } # print fasta and quals if found # if (scalar(keys %seq)){ open(FASTA,">$optofasta") or die "Can't open $optofasta\n"; open(QUAL,">$optoqual") or die "Can't open $optoqual\n"; foreach my $read (keys %seq){ print FASTA ">$read\n", seq2fasta($seq{$read}{seq}); print QUAL ">$read\n", qual2fasta($seq{$read}{qual}); } close(FASTA); close(QUAL); } #============================================================================# sub printHelp { my $optVerbose = shift || 1; pod2usage(-verbose=>$optVerbose); exit 1; } #============================================================================# sub _parsePhd{ my ($file) = @_; my $parse = 0; my ($seq,$qual); my ($b,$q,$rest); open (IN_PHD,"$file") or die "Can't open $file\n"; $seq = $qual = ""; while(){ chomp; if ($_ =~ /BEGIN_DNA/){ $parse = 1; next; } next unless ($parse); last if ($_ =~ /END_DNA/); ($b,$q,$rest) = split/\s+/,$_; $seq .= $b; $qual.= " " . $q; } close(IN_PHD); return $seq,$qual; } #============================================================================# sub seq2fasta{ my $seq = $_[0]; $seq =~ s/(.{50})/$1\n/g; ($seq .= "\n") =~ s/\n+$/\n/; return $seq; } #==================================================================================== sub qual2fasta{ my $qual = $_[0]; $qual =~ s/^\s*/ /; $qual =~ s/((?:\s\d+){50})/$1\n/g; ($qual .= "\n") =~ s/(?:\n\s|\n+)/\n/g; $qual =~ s/^\s*//; return $qual; } exit 0;