=head1 LICENSE Copyright (c) 2007-2009 Illumina, Inc. This software is covered by the "Illumina Genome Analyzer Software License Agreement" and the "Illumina Source Code License Agreement", and certain third party copyright/licenses, and any user of this source file is bound by the terms therein (see accompanying files Illumina_Genome_Analyzer_Software_License_Agreement.pdf and Illumina_Source_Code_License_Agreement.pdf and third party copyright/license notices). This file is part of the Consensus Assessment of Sequence And VAriation (CASAVA) software package. =head1 NAME Casava::PostAlignment::Sequencing::RmDupAndSplit - =head1 SYNOPSIS duplicate removal =head1 DESCRIPTION duplicate removal =head1 SUBROUTINES rmDupAndSplitPairFiles Process the unsorted exportpair files for the norm, orph and anom cases to: (1) sort by pair start position (2) remove PCR duplicates (3) split the paired read record into individual read records, preserving the sorted state for a subset of the reads where possible. No intermediate files are written out during these steps, except temporaries used by the sort program. =cut # POD for for each subroutines is right before the implementation # More POD after __END__ package Casava::PostAlignment::Sequencing::RmDupAndSplit; use strict; use warnings "all"; use Exporter 'import'; our @EXPORT_OK = qw(&rmDupAndSplitPairFiles); use Data::Dumper; use IO::File; use Casava::Common::Log; use Casava::PostAlignment::Sequencing::Config qw(%CONF_PROJ %CONF_APP %runsConfig); use Casava::PostAlignment::Sequencing::DnaSeqLib qw( &splitSortNormInit &splitSortNormRecord &splitSortNormClose &splitSortOrphInit &splitSortOrphRecord &splitSortOrphClose &splitSortAnomInit &splitSortAnomRecord &splitSortAnomClose); use Casava::PostAlignment::Sequencing::GenomicIOLib qw(%doubleExportFields); use Casava::PostAlignment::Sequencing::SortLib qw(sortFileList); my $fieldsRef = \%doubleExportFields; sub makeDupReport($\%) { my ($label,$dupStatsRef) = @_; return sprintf "After processing %s, out of all %d reads:\n" . "\t%d were not marked as duplicates (%.1f%%)\n" . "\t%d were marked as duplicates (%.1f%%)\n", $label, $dupStatsRef->{total}, $dupStatsRef->{nonDupCount}, ( $dupStatsRef->{nonDupCount} / $dupStatsRef->{total} ) * 100, $dupStatsRef->{dupCount}, ( $dupStatsRef->{dupCount} / $dupStatsRef->{total} ) * 100; } sub getDupDiff(\%) { my ($sref) = @_; return $sref->{total} - ( $sref->{nonDupCount} + $sref->{dupCount} ); } sub checkDupStats($\%$) { my ($label,$dupStatsRef,$isRmDup) = @_; return if ((not $isRmDup) or ($dupStatsRef->{total} <= 0)); my $difference = getDupDiff(%$dupStatsRef); if($difference != 0){ my $dupReport = makeDupReport($label,%$dupStatsRef); errorExit("ERROR: wrong number of duplicates by $difference\n.$dupReport"); } } sub stringToFile($$) { my ($str,$filename) = @_; my $FH = IO::File->new( ">$filename" ) || errorExit "ERROR: Couldn't create/open file handle for $filename $!\n"; print $FH $str; close $FH; } # SUSPENDED: # # # For back-compatibility, we prevent unanchored read-pairs from being # # selected when an anchored pair is available, even if the anchored pair # # has a lower PE score: # (($_[1]->[$fieldsRef->{SingleScore}]+$_[1][$fieldsRef->{read2SingleScore}])==0) <=> # ($_[0]->[$fieldsRef->{SingleScore}]+$_[0][$fieldsRef->{read2SingleScore}]) || # Pick the non-filtered read with the best alignment score. Order # candidates by other fields as well to be consistent across different # CASAVA builds. # sub norm_sorter($$) { $_[1]->[$fieldsRef->{Filter}] cmp $_[0]->[$fieldsRef->{Filter}] || $_[1]->[$fieldsRef->{PairedScore}] <=> $_[0]->[$fieldsRef->{PairedScore}] || $_[0]->[$fieldsRef->{Seq}] cmp $_[1]->[$fieldsRef->{Seq}] || $_[0]->[$fieldsRef->{read2Seq}] cmp $_[1]->[$fieldsRef->{read2Seq}] || $_[0]->[$fieldsRef->{QualityString}] cmp $_[1]->[$fieldsRef->{QualityString}] || $_[0]->[$fieldsRef->{read2QualityString}] cmp $_[1]->[$fieldsRef->{read2QualityString}]; } sub pickBestSrtNorm(\%\%\%$) { my ( $ref2Store, $dupStatsRef, $splitSortNormStateRef, $isKeepDupReads ) = @_; for (sort keys %$ref2Store) { my $reads = $ref2Store->{$_}; my $is_first=1; for my $read (sort norm_sorter @$reads) { if($is_first) { $dupStatsRef->{nonDupCount}++; splitSortNormRecord($read,$splitSortNormStateRef); $is_first=0; } else { $dupStatsRef->{dupCount}++; splitSortNormRecord($read,$splitSortNormStateRef,"DUP") if($isKeepDupReads); } } $dupStatsRef->{dist}{norm} {scalar(@$reads)}++; } %$ref2Store = (); } # Pick the non-filtered read with the best single alignment # score. Order candidates by other fields as well to be consistent # across different CASAVA builds. sub orph_sorter($$) { $_[1]->[$fieldsRef->{Filter}] cmp $_[0]->[$fieldsRef->{Filter}] || $_[1]->[$fieldsRef->{SingleScore}] <=> $_[0]->[$fieldsRef->{SingleScore}] || $_[0]->[$fieldsRef->{Seq}] cmp $_[1]->[$fieldsRef->{Seq}] || $_[0]->[$fieldsRef->{QualityString}] cmp $_[1]->[$fieldsRef->{QualityString}] } sub pickBestSrtOrph(\%\%\%$) { my ( $ref2Store, $dupStatsRef, $splitSortOrphStateRef, $isKeepDupReads ) = @_; for (sort keys %$ref2Store) { my $reads = $ref2Store->{$_}; my $is_first=1; for my $read (sort orph_sorter @$reads) { if($is_first) { $dupStatsRef->{nonDupCount}++; splitSortOrphRecord($read,$splitSortOrphStateRef); $is_first=0; } else { $dupStatsRef->{dupCount}++; splitSortOrphRecord($read,$splitSortOrphStateRef,"DUP") if($isKeepDupReads); } } $dupStatsRef->{dist}{orph} { scalar(@$reads) }++; } %$ref2Store = (); } # Pick the non-filtered read with the best sum single alignment # score. Order candidates by other fields as well to be consistent # across different CASAVA builds. sub anom_sorter($$) { $_[1]->[$fieldsRef->{Filter}] cmp $_[0]->[$fieldsRef->{Filter}] || ($_[1]->[$fieldsRef->{SingleScore}]+$_[1][$fieldsRef->{read2SingleScore}]) <=> ($_[0]->[$fieldsRef->{SingleScore}]+$_[0][$fieldsRef->{read2SingleScore}]) || $_[0]->[$fieldsRef->{Seq}] cmp $_[1]->[$fieldsRef->{Seq}] || $_[0]->[$fieldsRef->{read2Seq}] cmp $_[1]->[$fieldsRef->{read2Seq}] || $_[0]->[$fieldsRef->{QualityString}] cmp $_[1]->[$fieldsRef->{QualityString}] || $_[0]->[$fieldsRef->{read2QualityString}] cmp $_[1]->[$fieldsRef->{read2QualityString}]; } sub pickBestSrtAnom(\%\%\%$) { my ( $ref2Store, $dupStatsRef, $splitSortAnomStateRef, $isKeepDupReads ) = @_; for (sort keys %$ref2Store) { my $reads = $ref2Store->{$_}; my $is_first=1; for my $read (sort anom_sorter @$reads) { if($is_first) { $dupStatsRef->{nonDupCount}++; splitSortAnomRecord($read,$splitSortAnomStateRef); $is_first=0; } else { $dupStatsRef->{dupCount}++; splitSortAnomRecord($read,$splitSortAnomStateRef,"DUP") if($isKeepDupReads); } } $dupStatsRef->{dist}{anom} { scalar(@$reads) }++; } %$ref2Store = (); } { my $sortByField = 1; sub getSortFH($$$$) { my ( $chrom, $binId, $fileType, $isCompressPair ) = @_; my $u_file; if ($fileType eq 'norm'){ $u_file = $CONF_APP{f_unsort}; } elsif($fileType eq 'anom') { $u_file = $CONF_APP{f_unsort_anom}; } elsif($fileType eq 'orph') { $u_file = $CONF_APP{f_unsort_orph}; } else { errorExit("ERROR: Invalid fileType argument to getSortFH '$fileType'\n"); } my $dirBuildExportSets = File::Spec->catdir( $CONF_PROJ{dirBuildExport}, 'sets' ); my @fileList; for my $i (1 .. scalar(@{$runsConfig{exportFiles}})) { my $fileUnsorted = File::Spec->catfile($dirBuildExportSets, $i, $chrom, $binId, $u_file); if ( -e $fileUnsorted ) { push @fileList, $fileUnsorted; } } my $outFilter; if($isCompressPair) { my $libexecDir = '/home/psgendb/local/pkg/CASAVA_v1.8.2-build/libexec/CASAVA-1.8.2'; my $cxp = File::Spec->catfile($libexecDir,'compressXPair'); $outFilter = "$cxp -d"; } return sortFileList(@fileList,'-',$sortByField,%CONF_PROJ,%CONF_APP,$outFilter); } } sub rmDupAndSplitPairFiles($$$$) { my ($sourceDir, $resultsDir, $chrom, $binId) = @_; # conf values # my $binSize = $CONF_PROJ{binSizeBuild}; my $isRmDup = ( defined $CONF_PROJ{rmDup} ) ? ( $CONF_PROJ{rmDup} eq "YES" ) : 1; my $isKeepAllReads = ((defined $CONF_PROJ{sortKeepAllReads}) and $CONF_PROJ{sortKeepAllReads}); my $isCompressPair = (not ((defined $CONF_PROJ{sortNoCompressPair}) and ($CONF_PROJ{sortNoCompressPair}))); my $isKeepDupReads = $isKeepAllReads; # duplicate statistics: # my %dupStats = ( dupCount => 0, total => 0, nonDupCount => 0, dist => {} ); #################################################### Norm Case my $sortInfo = getSortFH($chrom,$binId,'norm',$isCompressPair); if (defined $sortInfo) { my %STORE; my $lastRead; my %splitSortNormState; splitSortNormInit($binSize, $resultsDir, $fieldsRef, %splitSortNormState); my $sortFH = $sortInfo->{sortFH}; while (<$sortFH>) { $dupStats{total}++; chomp; my ($thisRead, $thisEnd, @temp) = split("\t"); ## TODO: Non rmDup case needs to be handled more efficiently if ( not $isRmDup ) { splitSortNormRecord(\@temp,\%splitSortNormState); } else { if ( ( defined $lastRead ) && ( $thisRead != $lastRead ) ) { errorExit("ERROR: unexpected order") if($thisRead < $lastRead); pickBestSrtNorm( %STORE, %dupStats, %splitSortNormState, $isKeepDupReads ); } push @{ $STORE{$thisEnd} }, \@temp; $lastRead = $thisRead; } } close($sortFH) || errorExit("ERROR: sort process failed: $!\n"); $sortInfo = undef; if ( $isRmDup and (defined $lastRead) ) { pickBestSrtNorm( %STORE, %dupStats, %splitSortNormState, $isKeepDupReads ); } splitSortNormClose(%splitSortNormState); } checkDupStats("normals",%dupStats,$isRmDup); #################################################### Orph Case $sortInfo = getSortFH($chrom,$binId,'orph',$isCompressPair); if (defined $sortInfo) { my %STORE; my $lastRead; my %splitSortOrphState; splitSortOrphInit($binSize, $resultsDir, $fieldsRef, %splitSortOrphState); my $sortFH = $sortInfo->{sortFH}; while (<$sortFH>) { $dupStats{total}++; chomp; my ($thisRead, $thisEnd, @temp) = split("\t"); if ( not $isRmDup ) { splitSortOrphRecord(\@temp, \%splitSortOrphState); } else { if ( ( defined $lastRead ) && ( $thisRead != $lastRead ) ) { errorExit("ERROR: unexpected order") if($thisRead < $lastRead); pickBestSrtOrph( %STORE, %dupStats, %splitSortOrphState, $isKeepDupReads ); } my $strand = $temp[ $fieldsRef->{Strand} ]; my $mateSeq = substr( $temp[ $fieldsRef->{read2Seq} ], 0, 8 ); my $key = "$thisEnd|$strand|$mateSeq"; push @{ $STORE{$key} }, \@temp; $lastRead = $thisRead; } } close($sortFH) || errorExit("ERROR: sort process failed: $!\n"); $sortInfo = undef; if ( $isRmDup and (defined $lastRead) ) { pickBestSrtOrph( %STORE, %dupStats, %splitSortOrphState, $isKeepDupReads ); } splitSortOrphClose(%splitSortOrphState); } checkDupStats("orphans",%dupStats,$isRmDup); ################################################# Anom Case $sortInfo = getSortFH($chrom,$binId,'anom',$isCompressPair); if (defined $sortInfo) { my %STORE; my $lastRead; my %splitSortAnomState; splitSortAnomInit($chrom, $binSize, $resultsDir, $fieldsRef, %splitSortAnomState); my $sortFH = $sortInfo->{sortFH}; while (<$sortFH>) { $dupStats{total}++; chomp; my ($thisRead, $thisEnd, @temp) = split("\t"); if ( not $isRmDup ) { splitSortAnomRecord(\@temp, \%splitSortAnomState); } else { if ( ( defined $lastRead ) && ( $thisRead != $lastRead ) ) { errorExit("ERROR: unexpected order") if($thisRead < $lastRead); pickBestSrtAnom( %STORE, %dupStats, %splitSortAnomState, $isKeepDupReads ); } my $key = $thisEnd; if ($key !~ /\:/){ my $strand1 = $temp[ $fieldsRef->{Strand} ]; my $strand2 = $temp[ $fieldsRef->{read2Strand} ]; if ( $strand1 ne $strand2 ) { # if we have an RF or FR treat them as the same thing # they are either an oversized pair or an outie $key .= "|FR"; } elsif (($strand1 eq "F") and ($strand2 eq "F")) { $key .= "|FF"; } elsif (($strand1 eq "R") and ($strand2 eq "R")) { $key .= "|RR"; } else { errorExit "ERROR: Read pair not currently accounted for:\n". join("\t",@temp) . "\n"; } } push @{ $STORE{$key} }, \@temp; $lastRead = $thisRead; } } close($sortFH) || errorExit("ERROR: sort process failed: $!\n"); $sortInfo = undef; if ( $isRmDup and (defined $lastRead) ) { pickBestSrtAnom( %STORE, %dupStats, %splitSortAnomState, $isKeepDupReads ); } splitSortAnomClose(%splitSortAnomState); } if ( $dupStats{total} > 0 ) { my $dupReport = makeDupReport("anoms",%dupStats); my $fileDupCount = File::Spec->catfile($sourceDir, "dupCount.txt"); stringToFile($dupReport,$fileDupCount); my $distReport = Dumper($dupStats{dist}); my $fileDupDist = File::Spec->catfile($sourceDir, "dupDist.txt"); stringToFile($distReport,$fileDupDist); my $difference = getDupDiff(%dupStats); if ( $difference != 0 and $isRmDup ) { errorExit("ERROR: wrong number of duplicates by $difference\n.$dupReport"); } } } 1; __END__ =pod =head1 CONFIGURATION AND ENVIRONMENT Name and location of configuration files, with a complete description of the properties that can be set. Name and description of the relevant environment variables =head1 DEPENDENCIES =over 4 =item Standard perl modules strict, warnings, Exporter =item External perl modules =item Casava perl modules =head1 BUGS AND LIMITATIONS There are no known bugs in this module. All documented features are fully implemented. Please report problems to Illumina Technical Support (support@illumina.com) Patches are welcome. =head1 AUTHOR Richard Carter =cut