#ifndef FREEBAYES_GENOTYPE_H #define FREEBAYES_GENOTYPE_H #include #include #include // pair #include #include #include #include #include #include #include #include #include "Allele.h" #include "Sample.h" #include "Utility.h" #include "Multinomial.h" #include "CNV.h" #include "Ewens.h" #include "Bias.h" #include "join.h" #include "convert.h" using namespace std; // each genotype is a vetor of GenotypeElements, each is a count of alleles class GenotypeElement { friend ostream& operator<<(ostream& out, GenotypeElement& rhs); public: Allele allele; int count; GenotypeElement(const Allele& a, int c) : allele(a), count(c) { } }; class Genotype : public vector { friend ostream& operator<<(ostream& out, const pair& rhs); friend ostream& operator<<(ostream& out, const Genotype& g); friend bool operator<(Genotype& a, Genotype& b); public: int ploidy; vector alleles; map alleleCounts; bool homozygous; long double permutationsln; // aka, multinomialCoefficientLn(ploidy, counts()) Genotype(vector& ungroupedAlleles) { alleles = ungroupedAlleles; sort(alleles.begin(), alleles.end()); vector > groups = groupAlleles_copy(alleles); for (vector >::const_iterator group = groups.begin(); group != groups.end(); ++group) { this->push_back(GenotypeElement(group->front(), group->size())); alleleCounts[group->front().currentBase] = group->size(); } ploidy = getPloidy(); homozygous = isHomozygous(); permutationsln = 0; if (!homozygous) { permutationsln = multinomialCoefficientLn(ploidy, counts()); } } vector uniqueAlleles(void); int getPloidy(void); int alleleCount(const string& base); int alleleCount(Allele& allele); bool containsAllele(Allele& allele); bool containsAllele(const string& base); // returns true when the genotype is composed of a subset of the alleles bool matchesAlleles(vector& alleles); vector alternateAlleles(string& refbase); vector alternateBases(string& refbase); vector counts(void); // the probability of drawing each allele out of the genotype, ordered by allele vector alleleProbabilities(void); vector alleleProbabilities(Bias& observationBias); double alleleSamplingProb(const string& base); double alleleSamplingProb(Allele& allele); string str(void) const; string relativeGenotype(string& refbase, vector& altbases); void relativeGenotype(vector& spec, string& refbase, vector& altbases); string relativeGenotype(string& refbase, string& altbase); void relativeGenotype(vector& rg, vector& alleles); bool isHeterozygous(void); bool isHomozygous(void); bool isHomozygousAlternate(void); bool isHomozygousReference(void); int containedAlleleTypes(void); vector alleleObservationCounts(Sample& sample); int alleleObservationCount(Sample& sample); bool sampleHasSupportingObservations(Sample& sample); bool sampleHasSupportingObservationsForAllAlleles(Sample& sample); bool hasNullAllele(void); vector nullMatchingGenotypes(vector& gts); }; string IUPAC(Genotype& g); string IUPAC2GenotypeStr(string iupac); vector allPossibleGenotypes(int ploidy, vector& potentialAlleles); class SampleDataLikelihood { public: string name; Genotype* genotype; long double prob; long double marginal; Sample* sample; bool hasObservations; int rank; // the rank of this data likelihood relative to others for the sample, 0 is best SampleDataLikelihood(string n, Sample* s, Genotype* g, long double p, int r) : name(n) , sample(s) , genotype(g) , prob(p) , rank(r) , marginal(0) , hasObservations(true) { } bool hasSupportingObservations(void) const { return genotype->sampleHasSupportingObservations(*sample); } int supportingObservationCount(void) const { return genotype->alleleObservationCount(*sample); } }; class AlleleCounter { public: int frequency; int observations; int forwardStrand; // supporting reads on the forward strand int reverseStrand; // supporting reads on the reverse strand int placedLeft; // supporting reads placed to the left of the allele int placedRight; // supporting reads placed to the right of the allele int placedStart; // supporting reads for which the allele occurs in the first half of the read (5'-3') int placedEnd; // supporting reads for which the allele occurs in the second half of the read (5'-3') AlleleCounter(void) : frequency(0) , observations(0) , forwardStrand(0) , reverseStrand(0) , placedLeft(0) , placedRight(0) , placedStart(0) , placedEnd(0) { } }; // a combination of genotypes for the population of samples in the analysis class GenotypeCombo : public vector { public: // GenotypeCombo::prob is equal to the sum of probs in the combo. We // factor it out so that we can construct the probabilities efficiently as // we generate the genotype combinations long double probObsGivenGenotypes; // aka data likelihood long double permutationsln; // the number of perutations of unphased genotypes in the combo // these *must* be generated at construction time // for efficiency they can be updated as each genotype combo is generated //map alleleCounts; // frequencies of each allele in the combo //map > alleleStrandCounts; // map from allele spec to (forword, reverse) counts //map > alleleReadPlacementCounts; // map from allele spec to (left, right) counts //map > alleleReadPositionCounts; // map from allele spec to (left, right) counts map alleleCounters; map genotypeCounts; GenotypeCombo(void) : probObsGivenGenotypes(0) , posteriorProb(0) , priorProb(0) , priorProbG_Af(0) , priorProbAf(0) , priorProbObservations(0) , permutationsln(0) { } void init(bool useObsExpectations); void addPriorAlleleCounts(map& priorACs); // appends the other combo to this one, // updates the counts, and multiplies the probabilites, // assuming independence between the two combos void appendIndependentCombo(GenotypeCombo& other); int numberOfAlleles(void); vector alleleProbs(void); // scales counts() by the total number of alleles int ploidy(void); // the number of copies of the locus in this combination int alleleCount(Allele& allele); int alleleCount(const string& allele); long double alleleFrequency(Allele& allele); long double alleleFrequency(const string& allele); long double genotypeFrequency(Genotype* genotype); void updateCachedCounts(Sample* sample, Genotype* oldGenotype, Genotype* newGenotype, bool useObsExpectations); map countAlleles(void); map countFrequencies(void); int hetCount(void); vector counts(void); // the counts of frequencies of the alleles in the genotype combo vector observationCounts(void); // the counts of observations of the alleles (in sorted order) int observationTotal(void); vector alleles(void); // the string representations of alleles in the genotype combo bool isHomozygous(void); // returns true if the combination is 100% homozygous across all individuals // e.g. if there is no variation // posterior long double posteriorProb; // p(genotype combo) * p(observations | genotype combo) // priors long double priorProb; // p(genotype combo) = p(genotype combo | allele frequency) * p(allele frequency) * p(observations) long double priorProbG_Af; // p(genotype combo | allele frequency) long double priorProbAf; // p(allele frequency) long double priorProbObservations; // p(observations) long double priorProbGenotypesGivenHWE; //GenotypeCombo* combo, void calculatePosteriorProbability( long double theta, bool pooled, bool ewensPriors, bool permute, bool hwePriors, bool obsBinomialPriors, bool alleleBalancePriors, long double diffusionPriorScalarln); long double probabilityGivenAlleleFrequencyln(bool permute); long double hweExpectedFrequencyln(Genotype* genotype); long double hweProbGenotypeFrequencyln(Genotype* genotype); long double hweComboProb(void); }; struct GenotypeComboResultSorter { bool operator()(const GenotypeCombo& gc1, const GenotypeCombo& gc2) { if (gc1.posteriorProb == gc2.posteriorProb) { return gc1 > gc2; } else { return gc1.posteriorProb > gc2.posteriorProb; } } }; // for comparing GenotypeCombos which are empty struct GenotypeComboResultEqual { bool operator()(const GenotypeCombo& gc1, const GenotypeCombo& gc2) { return gc1.posteriorProb == gc2.posteriorProb; } }; // for sorting data likelihoods struct SampleDataLikelihoodCompare { bool operator()(const SampleDataLikelihood& a, const SampleDataLikelihood& b) { return a.prob > b.prob; } }; struct SampleMarginalCompare { bool operator()(const SampleDataLikelihood& a, const SampleDataLikelihood& b) { return a.marginal > b.marginal; } }; struct SampleLikelihoodCompare { bool operator()(const SampleDataLikelihood& a, const SampleDataLikelihood& b) { return (a.marginal + a.prob) > (b.marginal + b.prob); } }; struct SampleMarginalAndObsCompare { bool operator()(const SampleDataLikelihood& a, const SampleDataLikelihood& b) { int aObsCount = a.supportingObservationCount(); int bObsCount = b.supportingObservationCount(); if (aObsCount != bObsCount) { if (aObsCount == 0) { return false; } else if (bObsCount == 0) { return true; } } return (a.marginal + a.prob) > (b.marginal + b.prob); } }; // a set of probabilities for a set of genotypes for a set of samples typedef vector > SampleDataLikelihoods; void sortSampleDataLikelihoods(vector& likelihoods); bool sortSampleDataLikelihoodsByMarginals(vector& likelihoods); bool sortSampleDataLikelihoodsByMarginals(SampleDataLikelihoods& samplesLikelihoods); bool sortSampleDataLikelihoodsByMarginalsAndObs(SampleDataLikelihoods& samplesLikelihoods); bool sortSampleDataLikelihoodsScaledByMarginals(vector& likelihoods); bool sortSampleDataLikelihoodsScaledByMarginals(SampleDataLikelihoods& samplesLikelihoods); typedef map GenotypeComboMap; void genotypeCombo2Map(GenotypeCombo& gc, GenotypeComboMap& gcm); void orderedGenotypeCombo( GenotypeCombo& combo, GenotypeCombo& orderedCombo, SampleDataLikelihoods& sampleDataLikelihoods, long double theta, bool pooled, bool ewensPriors, bool permute, bool hwePriors, bool binomialObsPriors, bool alleleBalancePriors, long double diffusionPriorScalar); void makeComboByDatalLikelihoodRank( GenotypeCombo& combo, vector& initialPosition, SampleDataLikelihoods& variantSampleDataLikelihoods, SampleDataLikelihoods& invariantSampleDataLikelihoods, map& priorACs, long double theta, bool pooled, bool ewensPriors, bool permute, bool hwePriors, bool binomialObsPriors, bool alleleBalancePriors, long double diffusionPriorScalar); void dataLikelihoodMaxGenotypeCombo( GenotypeCombo& combo, SampleDataLikelihoods& sampleDataLikelihoods, long double theta, bool pooled, bool ewensPriors, bool permute, bool hwePriors, bool binomialObsPriors, bool alleleBalancePriors, long double diffusionPriorScalar); bool bandedGenotypeCombinations( list& combos, GenotypeCombo& comboKing, SampleDataLikelihoods& variantDataLikelihoods, SampleDataLikelihoods& invariantDataLikelihoods, Samples& samples, map& priorACs, int bandwidth, int banddepth, long double theta, bool pooled, bool ewensPriors, bool permute, bool hwePriors, bool binomialObsPriors, bool alleleBalancePriors, long double diffusionPriorScalar); void allLocalGenotypeCombinations( list& combos, GenotypeCombo& comboKing, SampleDataLikelihoods& sampleDataLikelihoods, Samples& samples, map& priorACs, long double theta, bool pooled, bool ewensPriors, bool permute, bool hwePriors, bool binomialObsPriors, bool alleleBalancePriors, long double diffusionPriorScalar, bool keepCombos); void convergentGenotypeComboSearch( list& combos, GenotypeCombo& comboKing, SampleDataLikelihoods& sampleDataLikelihoods, SampleDataLikelihoods& variantDataLikelihoods, SampleDataLikelihoods& invariantDataLikelihoods, Samples& samples, vector& genotypeAlleles, map& priorACs, int bandwidth, int banddepth, long double theta, bool pooled, bool ewensPriors, bool permute, bool hwePriors, bool binomialObsPriors, bool alleleBalancePriors, long double diffusionPriorScalar, int maxiterations, int& totaliterations, bool addHomozygousCombos); void addAllHomozygousCombos( list& combos, SampleDataLikelihoods& sampleDataLikelihoods, SampleDataLikelihoods& variantSampleDataLikelihoods, SampleDataLikelihoods& invariantSampleDataLikelihoods, Samples& samples, vector& genotypeAlleles, long double theta, bool pooled, bool ewensPriors, bool permute, bool hwePriors, bool binomialObsPriors, bool alleleBalancePriors, long double diffusionPriorScalar); vector > alternateAlleles(GenotypeCombo& combo, string referenceBase); pair alternateAndReferenceCount(vector& observations, string& refbase, string altbase); ostream& operator<<(ostream& out, list& combo); ostream& operator<<(ostream& out, GenotypeCombo& g); map > getGenotypesByPloidy(vector& ploidies, vector& genotypeAlleles); void combinePopulationCombos(list& genotypeCombos, map >& genotypeCombosByPopulation); #endif