#!/usr/bin/env perl use strict; use Getopt::Long; use Pod::Usage; sub version { print < \$opt_eid, 'iid|i=s' => \$opt_iid, 'mean|m=i' => \$opt_mean, 'sd|s=i' => \$opt_sd, 'help' => sub { pod2usage(-verbose => 1) }, 'man' => sub { pod2usage(-verbose => 2) }, 'version' => \&version); for (@ARGV) { die "cannot read `$_'" unless $_ eq '-' || -r } # Output the library record (LIB). print "{LIB\neid:$opt_eid\niid:$opt_iid\n"; print "{DST\nmea:$opt_mean\nstd:$opt_sd\n}\n" if defined $opt_mean && defined $opt_sd; print "}\n"; sub getMateID($) { my $id = shift; return $id =~ s%/1$%/2% || $id =~ s%/2$%/1% ? $id : undef; } my ($g_red_iid, $g_frg_iid, @ctg_eids, @ctg_seqs, %reds, %frgs, %tles); # Output a read (RED) and possibly a fragment (FRG). sub createRead($$$) { my ($eid, $seq, $qlt) = @_; die "error: duplicate sequence ID `$eid'" if exists $reds{$eid}; my $red_iid = ++$g_red_iid; (my $frg_eid = $eid) =~ s/\/[12]$//; my ($my_frg_iid, $mate_iid); if (exists $frgs{$frg_eid}) { $my_frg_iid = delete $frgs{$frg_eid}; my $mate_eid = getMateID($eid); die unless defined $mate_eid; $mate_iid = delete $reds{$mate_eid}; die unless defined $mate_iid; } else { $my_frg_iid = $frgs{$frg_eid} = ++$g_frg_iid; $reds{$eid} = $red_iid; } # Output a read (RED) record. my $qlength = length $seq; print "{RED\nclr:0,$qlength\niid:$red_iid\neid:$eid\n", "frg:$my_frg_iid\n", "seq:\n$seq\n.\nqlt:\n$qlt\n.\n}\n"; # Output a fragment (FRG) record. if (defined $mate_iid) { print "{FRG\nrds:$mate_iid,$red_iid\nlib:$opt_iid\n", "eid:$frg_eid\niid:$my_frg_iid\ntyp:I\n}\n"; } return $red_iid; } # Return the left and right soft clipping of this CIGAR string. sub parseCigar($) { my $cigar = shift; my $clipLeft = $cigar =~ /^([0-9]+)S/ ? $1 : 0; my $clipRight = $cigar =~ /([0-9]+)S$/ ? $1 : 0; return ($clipLeft, $clipRight); } # Record the alignment (TLE) records. while (<>) { chomp; next if /^#/ || /^@/; if (/^>([^ ]+)/) { my $eid = $1; chomp (my $seq = <>); push @ctg_eids, $eid; push @ctg_seqs, $seq; next; } my ($qid, $flag, $tid, $tstart, $mapq, $cigar, $rnext, $pnext, $tlen, $qseq, $qqual) = split '\t'; die unless defined $qqual; $tstart--; # convert to zero-based coordinate next if $flag & 0x100; # secondary alignment $qid .= "/1" if $flag & 0x40; #FREAD1 $qid .= "/2" if $flag & 0x80; #FREAD2 my $rc = $flag & 0x10; #FREVERSE if ($rc) { # Reverse and complement the sequence. $qseq =~ tr/ACGTacgt/TGCAtgca/; $qseq = reverse $qseq; $qqual = reverse $qqual; } my $riid = createRead($qid, $qseq, $qqual); next if $flag & 0x4; #FUNMAP my $qlength = length $qseq; die if length $qqual != $qlength; my ($qstart, $clipRight) = parseCigar($cigar); my $qend = $qlength - $clipRight; die unless $qstart < $qend; my $clr = $rc ? "$qend,$qstart" : "$qstart,$qend"; $tles{$tid} .= "{TLE\nclr:$clr\noff:$tstart\nsrc:$riid\n}\n"; } # Output the contig (CTG) and alignment (TLE) records. my $ctg_iid = 0; for my $ctg_eid (@ctg_eids) { my $seq = shift @ctg_seqs; next if length $tles{$ctg_eid} == 0; # Split long lines. my $qlt = 'I' x (length $seq); $seq =~ s/.{60}/$&\n/sg; $qlt =~ s/.{60}/$&\n/sg; # Contig sequence. $ctg_iid++; print "{CTG\niid:$ctg_iid\n", "eid:$ctg_eid\n", "seq:\n", $seq, "\n.\n", "qlt:\n", $qlt, "\n.\n"; print $tles{$ctg_eid}; print "}\n"; } =pod =head1 NAME abyss-samtoafg - create an AMOS AFG file from a SAM file =head1 SYNOPSIS B F F >F B B<-cb> F B<-m> F B F =head1 DESCRIPTION Create an AMOS AFG file from a FASTA file and a SAM file. =head1 OPTIONS =over =item B<-e>,B<--eid> the EID of the library =item B<-i>,B<--iid> the IID of the library =item B<-m>,B<--mean> the mean of the fragment-size =item B<-s>,B<--sd> the standard deviation of the fragment-size =back =head1 AUTHOR Written by Shaun Jackman. =head1 REPORTING BUGS Report bugs to . =head1 COPYRIGHT Copyright 2012 Canada's Michael Smith Genome Science Centre =head1 SEE ALSO http://www.bcgsc.ca/platform/bioinfo/software/abyss http://amos.sourceforge.net/hawkeye