// -*- mode: c++; indent-tabs-mode: nil; -*- // // Copyright 2009 Illumina, Inc. // // This software is covered by the "Illumina Genome Analyzer Software // License Agreement" and the "Illumina Source Code License Agreement", // and certain third party copyright/licenses, and any user of this // source file is bound by the terms therein (see accompanying files // Illumina_Genome_Analyzer_Software_License_Agreement.pdf and // Illumina_Source_Code_License_Agreement.pdf and third party // copyright/license notices). // // /// \file /// \author Chris Saunders /// #include "blt_common/position_snp_call_pprob_nploid.hh" #include "blt_util/log.hh" #include #include #include #include #include void nploid_write(const nploid_info& ninfo, const nploid_genotype& ngt, std::ostream& os) { os << std::setprecision(10) << std::fixed; os << "P(snp): " << (1.-ngt.pprob[ngt.ref_gt]) << " max_gtype: " << ninfo.label(ngt.max_gt) << " P(max_gtype): " << ngt.pprob[ngt.max_gt] << " max2_gtype: " << ninfo.label(ngt.max2_gt) << " P(max2_gtype): " << ngt.pprob[ngt.max2_gt]; os.unsetf(std::ios::fixed); } /// /// The likelihood of an observed 'column' of observations for any /// genotype: P(obs=ACT|genotype=AT) is a sum over all possible /// columns: /// /// sum { c in {A,C,G,T}**3 } { P(obs=ACT|col=c) * P(col=c|genotype=AT) } /// /// Enumerating all {A,C,G,T}**col_size possible column values is /// inefficent, so we only consider those columns for which /// P(col|genotype) is nonzero. For homozygous genotypes this always /// leaves a single column; for heterozygous genotypes this is a sum /// over all valid columns: /// /// P(obs=ACT|genotype=AT)= /// sum { c in {A,T}**3 } { P(obs=ACT|col=c) * P(col=c|genotype=AT } /// /// that is... /// /// P(obs|AAA)*P(AAA|genotype)+ /// P(obs|AAT)*P(AAT|genotype)+...etc /// /// the first term is a product based on error probabilities: /// P(obs=ACT|AAA) = (1-P(e1))*(P(e2)/3)*(P(e3)/3) /// /// the second term is a product of genotype base frequencies: /// P(AAA|genotype=AT) = .5**3 /// /// The likelihood calculated in the function below is an equivelent /// factorization of the calculation described above. /// void position_snp_call_pprob_nploid(const double snp_prob, const snp_pos_info& pi, const nploid_info& ninfo, nploid_genotype& ngt){ if(pi.ref_base=='N') return; const unsigned n_calls(pi.calls.size()); const unsigned ref_id(base_to_id(pi.ref_base)); // check that a non-reference call meeting quality criteria even exists: bool is_test(false); for(unsigned i(0);i lhood(n_gt,0.); static const double one_third(1./3.); const double freq_chunk(ninfo.expect_freq_chunk()); const unsigned n_freq(ninfo.expect_freq_level_size()); std::vector ln_obs_prob_cache(n_freq); for(unsigned i(0);i prior(n_gt,0.); const double nonref_prob(snp_prob/static_cast(n_gt-1)); for(unsigned gt(0);gt max){ max2 = max; max = ngt.pprob[gt]; ngt.max2_gt = ngt.max_gt; ngt.max_gt = gt; } else if(ngt.pprob[gt] > max2) { max2 = ngt.pprob[gt]; ngt.max2_gt = gt; } } // \todo debatable call criteria: ngt.is_snp=(ngt.max_gt != ngt.ref_gt); double sum(0.); for(unsigned gt(0);gt