//
//  freebayes
//
//  A bayesian genetic variant detector.
//

// standard includes
//#include <cstdio>
#include <iostream>
#include <fstream>
#include <string>
#include <vector>
#include <list>
#include <map>
#include <set>
#include <utility>
#include <iterator>
#include <algorithm>
#include <cmath>
#include <cstdlib>
#include <csignal>
#include <cassert>

// private libraries
#ifdef HAVE_BAMTOOLS
#include "api/BamReader.h"
#endif
#include "TryCatch.h"
#include "Parameters.h"
#include "Allele.h"
#include "Sample.h"
#include "AlleleParser.h"
#include "Utility.h"
#include "SegfaultHandler.h"
#include "multichoose.h"
#include "multipermute.h"
#include "Genotype.h"
#include "DataLikelihood.h"
#include "Marginals.h"
#include "ResultData.h"
#include "Bias.h"
#include "Contamination.h"
#include "NonCall.h"
#include "Logging.h"

using namespace std;

// todo
// generalize the main function to take the parameters as input
// so that we can invoke the entire algorithm on different regions
// when requested to run in parallel
// take the targets (or whole genome) and make small jobs
// run the main function for each region in an omp parallel for loop
// only do this if the --parallel flag is set > 1

// freebayes main
int main (int argc, char *argv[]) {

    // install segfault handler
    signal(SIGSEGV, segfaultHandler);

    AlleleParser* parser = new AlleleParser(argc, argv);
    Parameters& parameters = parser->parameters;
    list<Allele*> alleles;

    Samples samples;
    NonCalls nonCalls;

    ostream& out = *(parser->output);

    Bias observationBias;
    if (!parameters.alleleObservationBiasFile.empty()) {
        observationBias.open(parameters.alleleObservationBiasFile);
    }

    Contamination contaminationEstimates(0.5 + parameters.probContamination,
                                         parameters.probContamination);
    if (!parameters.contaminationEstimateFile.empty()) {
        contaminationEstimates.open(parameters.contaminationEstimateFile);
    }

    // this can be uncommented to force operation on a specific set of genotypes
    vector<Allele> allGenotypeAlleles;
    allGenotypeAlleles.push_back(genotypeAllele(ALLELE_GENOTYPE, "A", 1));
    allGenotypeAlleles.push_back(genotypeAllele(ALLELE_GENOTYPE, "T", 1));
    allGenotypeAlleles.push_back(genotypeAllele(ALLELE_GENOTYPE, "G", 1));
    allGenotypeAlleles.push_back(genotypeAllele(ALLELE_GENOTYPE, "C", 1));

    int allowedAlleleTypes = ALLELE_REFERENCE;
    if (parameters.allowSNPs) {
        allowedAlleleTypes |= ALLELE_SNP;
    }
    if (parameters.allowIndels) {
        allowedAlleleTypes |= ALLELE_INSERTION;
        allowedAlleleTypes |= ALLELE_DELETION;
    }
    if (parameters.allowMNPs) {
        allowedAlleleTypes |= ALLELE_MNP;
    }
    if (parameters.allowComplex) {
        allowedAlleleTypes |= ALLELE_COMPLEX;
    }

    // output VCF header
    if (parameters.output == "vcf") {
        out << parser->variantCallFile.header << endl;
    }

    if (0 < parameters.limitCoverage) {
        srand(13);
    }

    Allele nullAllele = genotypeAllele(ALLELE_NULL, "N", 1, "1N");

    unsigned long total_sites = 0;
    unsigned long processed_sites = 0;

    while (parser->getNextAlleles(samples, allowedAlleleTypes)) {

        ++total_sites;

        DEBUG2("at start of main loop");
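        // each pass through this loop handles one reference position:
        //   1. flush any pending gVCF chunk if we changed sequence or filled a chunk
        //   2. filter on coverage, optionally downsampling to --limit-coverage
        //   3. assemble candidate genotype alleles and haplotype alleles
        //   4. compute per-sample genotype data likelihoods
        //   5. search the space of genotype combinations for the posterior maximum
        //   6. emit a VCF record if P(variant) passes the threshold, otherwise
        //      record the site for gVCF output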
        // did we switch chromosomes or exceed our gVCF chunk size, or do we not want to use chunks?
        // if so, we may need to output a gVCF record
        Results results;
        if (parameters.gVCFout && !(nonCalls.empty())
            && ((parameters.gVCFNoChunk)
                || (nonCalls.begin()->first != parser->currentSequenceName)
                || (parameters.gVCFchunk
                    && nonCalls.lastPos().second - nonCalls.firstPos().second >= parameters.gVCFchunk))) {
            vcflib::Variant var(parser->variantCallFile);
            out << results.gvcf(var, nonCalls, parser) << endl;
            nonCalls.clear();
        }

        // don't process non-ATGCN's in the reference
        string cb = parser->currentReferenceBaseString();
        if (cb != "A" && cb != "T" && cb != "C" && cb != "G" && cb != "N") {
            DEBUG2("current reference base is not in { A T G C N }");
            continue;
        }

        int coverage = countAlleles(samples);

        DEBUG("position: " << parser->currentSequenceName << ":"
              << (long unsigned int) parser->currentPosition + 1
              << " coverage: " << coverage);

        bool skip = false;
        if (!parser->hasInputVariantAllelesAtCurrentPosition()) {
            // skip 0-coverage regions
            if (coverage == 0) {
                DEBUG("no alleles left at this site after filtering");
                skip = true;
            } else if (coverage < parameters.minCoverage) {
                DEBUG("post-filtering coverage of " << coverage
                      << " is less than --min-coverage of " << parameters.minCoverage);
                skip = true;
            } else if (parameters.onlyUseInputAlleles) {
                DEBUG("no input alleles, but using only input alleles for analysis, skipping position");
                skip = true;
            } else if (0 < parameters.limitCoverage) {
                // go through each sample
                for (Samples::iterator s = samples.begin(); s != samples.end(); ++s) {
                    string sampleName = s->first;
                    Sample& sample = s->second;
                    // get the coverage for this sample
                    int sampleCoverage = 0;
                    for (Sample::iterator sg = sample.begin(); sg != sample.end(); ++sg) {
                        sampleCoverage += sg->second.size();
                    }
                    if (sampleCoverage <= parameters.limitCoverage) {
                        continue;
                    }
                    DEBUG("coverage " << sampleCoverage << " for sample " << sampleName
                          << " was > " << parameters.limitCoverage << ", so we will remove "
                          << (sampleCoverage - parameters.limitCoverage) << " allele observations");
                    do {
                        double probRemove = (sampleCoverage - parameters.limitCoverage) / (double) sampleCoverage;
                        vector<string> genotypesToErase;
                        // iterate through the genotypes
                        for (Sample::iterator sg = sample.begin(); sg != sample.end(); ++sg) {
                            vector<Allele*> allelesToKeep;
                            // iterate through each allele
                            for (int alleleIndex = 0; alleleIndex < sg->second.size(); alleleIndex++) {
                                // only if we have more alleles to remove
                                if (parameters.limitCoverage < sampleCoverage) {
                                    double r = rand() / (double) RAND_MAX;
                                    if (r < probRemove) {
                                        // skip over this allele
                                        sampleCoverage--;
                                        continue;
                                    }
                                }
                                // keep it
                                allelesToKeep.push_back(sg->second[alleleIndex]);
                            }
                            // re-assign the alleles to this genotype
                            if (allelesToKeep.size() < sg->second.size()) {
                                sg->second.assign(allelesToKeep.begin(), allelesToKeep.end());
                            }
                            // if no more alleles for this genotype, remove it later
                            if (sg->second.empty()) {
                                genotypesToErase.push_back(sg->first);
                            }
                        }
                        // remove empty genotypes
                        for (vector<string>::iterator gt = genotypesToErase.begin();
                             gt != genotypesToErase.end(); ++gt) {
                            sample.erase(*gt);
                        }
                    } while (parameters.limitCoverage < sampleCoverage);
                    sampleCoverage = 0;
                    for (Sample::iterator sg = sample.begin(); sg != sample.end(); ++sg) {
                        sampleCoverage += sg->second.size();
                    }
                    DEBUG("coverage for sample " << sampleName << " is now " << sampleCoverage);
                }
                // update coverage
                coverage = countAlleles(samples);
            }
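            // worked example of the downsampling above: with sampleCoverage = 150
            // and --limit-coverage 100, each allele observation is dropped with
            // probability (150 - 100) / 150 = 1/3 per pass, and the do/while
            // repeats until the sample's observation count is at or below the limit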
            DEBUG2("coverage " << parser->currentSequenceName << ":" << parser->currentPosition
                   << " == " << coverage);

            // establish a set of possible alternate alleles to evaluate at this location
            if (!parameters.reportMonomorphic
                && !sufficientAlternateObservations(samples, parameters.minAltCount, parameters.minAltFraction)) {
                DEBUG("insufficient alternate observations");
                skip = true;
            }
            if (parameters.reportMonomorphic) {
                DEBUG("calling at site even though there are no alternate observations");
            }
        } else {
            /*
            cerr << "has input variants at " << parser->currentSequenceName
                 << ":" << parser->currentPosition << endl;
            vector<Allele>& inputs = parser->inputVariantAlleles[parser->currentSequenceName][parser->currentPosition];
            for (vector<Allele>::iterator a = inputs.begin(); a != inputs.end(); ++a) {
                cerr << *a << endl;
            }
            */
        }

        if (skip) {
            // record data for gVCF
            if (parameters.gVCFout) {
                nonCalls.record(parser->currentSequenceName, parser->currentPosition, samples);
            }
            // and step ahead
            continue;
        }

        // to ensure proper ordering of output stream
        vector<string> sampleListPlusRef;
        for (vector<string>::iterator s = parser->sampleList.begin(); s != parser->sampleList.end(); ++s) {
            sampleListPlusRef.push_back(*s);
        }
        if (parameters.useRefAllele) {
            sampleListPlusRef.push_back(parser->currentSequenceName);
        }

        // establish genotype alleles using input filters
        map<string, vector<Allele*> > alleleGroups;
        groupAlleles(samples, alleleGroups);
        DEBUG2("grouped alleles by equivalence");

        vector<Allele> genotypeAlleles = parser->genotypeAlleles(alleleGroups, samples, parameters.onlyUseInputAlleles);

        // always include the reference allele as a possible genotype, even when we don't include it by default
        if (!parameters.useRefAllele) {
            vector<Allele> refAlleleVector;
            refAlleleVector.push_back(genotypeAllele(ALLELE_REFERENCE,
                                                     string(1, parser->currentReferenceBase), 1, "1M"));
            genotypeAlleles = alleleUnion(genotypeAlleles, refAlleleVector);
        }

        map<string, vector<Allele*> > partialObservationGroups;
        map<Allele*, set<Allele*> > partialObservationSupport;

        // build haplotype alleles matching the current longest allele (often will do nothing)
        // this will adjust genotypeAlleles if changes are made
        DEBUG("building haplotype alleles, currently there are " << genotypeAlleles.size() << " genotype alleles");
        DEBUG(genotypeAlleles);
        parser->buildHaplotypeAlleles(genotypeAlleles, samples, alleleGroups,
                                      partialObservationGroups, partialObservationSupport,
                                      allowedAlleleTypes);
        DEBUG("built haplotype alleles, now there are " << genotypeAlleles.size() << " genotype alleles");
        DEBUG(genotypeAlleles);

        string referenceBase = parser->currentReferenceHaplotype();

        // for debugging
        /*
        for (Samples::iterator s = samples.begin(); s != samples.end(); ++s) {
            string sampleName = s->first;
            Sample& sample = s->second;
            cerr << sampleName << ": " << sample << endl;
        }
        */

        // re-calculate coverage, as this could change now that we've built haplotype alleles
        coverage = countAlleles(samples);

        // estimate theta using the haplotype length
        long double theta = parameters.TH * parser->lastHaplotypeLength;

        // if we have only one viable allele, we don't have evidence for variation at this site
        if (!parser->hasInputVariantAllelesAtCurrentPosition()
            && !parameters.reportMonomorphic
            && genotypeAlleles.size() <= 1
            && genotypeAlleles.front().isReference()) {
            DEBUG("no alternate genotype alleles passed filters at "
                  << parser->currentSequenceName << ":" << parser->currentPosition);
            continue;
        }

        DEBUG("genotype alleles: " << genotypeAlleles);

        // add the null genotype
        bool usingNull = false;
        if (parameters.excludeUnobservedGenotypes && genotypeAlleles.size() > 2) {
            genotypeAlleles.push_back(nullAllele);
            usingNull = true;
        }

        ++processed_sites;
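        // the genotype enumeration below grows as the multiset coefficient
        // C(k + p - 1, p) for k candidate alleles at ploidy p; e.g. a diploid
        // sample with 3 candidate alleles A, B, C has C(4, 2) = 6 genotypes:
        // AA, AB, AC, BB, BC, CC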
        // generate possible genotypes
        // for each possible ploidy in the dataset, generate all possible genotypes
        vector<int> ploidies = parser->currentPloidies(samples);
        map<int, vector<Genotype> > genotypesByPloidy = getGenotypesByPloidy(ploidies, genotypeAlleles);
        int numCopiesOfLocus = parser->copiesOfLocus(samples);

        DEBUG2("generated all possible genotypes:");
        if (parameters.debug2) {
            for (map<int, vector<Genotype> >::iterator s = genotypesByPloidy.begin();
                 s != genotypesByPloidy.end(); ++s) {
                vector<Genotype>& genotypes = s->second;
                for (vector<Genotype>::iterator g = genotypes.begin(); g != genotypes.end(); ++g) {
                    DEBUG2(*g);
                }
            }
        }

        // get estimated allele frequencies using sum of estimated qualities
        map<string, double> estimatedAlleleFrequencies = samples.estimatedAlleleFrequencies();
        double estimatedMaxAlleleFrequency = 0;
        double estimatedMaxAlleleCount = 0;
        double estimatedMajorFrequency = estimatedAlleleFrequencies[referenceBase];
        if (estimatedMajorFrequency < 0.5) estimatedMajorFrequency = 1 - estimatedMajorFrequency;
        double estimatedMinorFrequency = 1 - estimatedMajorFrequency;
        //cerr << "num copies of locus " << numCopiesOfLocus << endl;
        int estimatedMinorAllelesAtLocus = max(1, (int) ceil((double) numCopiesOfLocus * estimatedMinorFrequency));
        //cerr << "estimated minor frequency " << estimatedMinorFrequency << endl;
        //cerr << "estimated minor count " << estimatedMinorAllelesAtLocus << endl;

        map<string, vector<vector<SampleDataLikelihood> > > sampleDataLikelihoodsByPopulation;
        map<string, vector<vector<SampleDataLikelihood> > > variantSampleDataLikelihoodsByPopulation;
        map<string, vector<vector<SampleDataLikelihood> > > invariantSampleDataLikelihoodsByPopulation;

        map<string, int> inputAlleleCounts;
        int inputLikelihoodCount = 0;

        DEBUG2("calculating data likelihoods");
        calculateSampleDataLikelihoods(
            samples,
            results,
            parser,
            genotypesByPloidy,
            parameters,
            usingNull,
            observationBias,
            genotypeAlleles,
            contaminationEstimates,
            estimatedAlleleFrequencies,
            sampleDataLikelihoodsByPopulation,
            variantSampleDataLikelihoodsByPopulation,
            invariantSampleDataLikelihoodsByPopulation);
        DEBUG2("finished calculating data likelihoods");

        // if somehow we get here without any possible sample genotype likelihoods, bail out
        bool hasSampleLikelihoods = false;
        for (map<string, vector<vector<SampleDataLikelihood> > >::iterator s = sampleDataLikelihoodsByPopulation.begin();
             s != sampleDataLikelihoodsByPopulation.end(); ++s) {
            if (!s->second.empty()) {
                hasSampleLikelihoods = true;
                break;
            }
        }
        if (!hasSampleLikelihoods) {
            continue;
        }

        DEBUG2("calculating combo posteriors over " << parser->populationSamples.size() << " populations");

        // XXX
        // TODO skip these steps in the case that there is only one population?

        // we provide p(var|data), or the probability that the location has
        // variation between individuals relative to the probability that it
        // has no variation
        //
        // in other words:
        // p(var|d) = 1 - p(AA|d) - p(TT|d) - P(GG|d) - P(CC|d)
        //
        // the approach is to go through all the homozygous combos
        // and then subtract this from 1... resolving p(var|d)

        BigFloat pVar = 1.0;
        BigFloat pHom = 0.0;
        long double bestComboOddsRatio = 0;

        bool bestOverallComboIsHet = false;
        GenotypeCombo bestCombo; // = NULL;
        GenotypeCombo bestGenotypeComboByMarginals;

        vector<vector<SampleDataLikelihood> > allSampleDataLikelihoods;

        DEBUG("searching genotype space");

        // resample the posterior, this time without bounds on the
        // samples we vary, ensuring that we can generate marginals for
        // all sample/genotype combinations
        //SampleDataLikelihoods marginalLikelihoods = sampleDataLikelihoods; // heavyweight copy...
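        // illustrative numbers for the p(var|d) computation described above:
        // if the homozygous-reference combination carries normalized posterior
        // mass 0.99, then pHom = 0.99 and pVar = 1 - 0.99 = 0.01, and the site
        // is emitted only if 1 - pHom meets the --pvar threshold (parameters.PVL)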
        map<string, list<GenotypeCombo> > genotypeCombosByPopulation;
        int genotypingTotalIterations = 0; // tally total iterations required to reach convergence
        map<string, list<GenotypeCombo> > glMaxCombos;

        for (map<string, SampleDataLikelihoods>::iterator p = sampleDataLikelihoodsByPopulation.begin();
             p != sampleDataLikelihoodsByPopulation.end(); ++p) {

            const string& population = p->first;
            SampleDataLikelihoods& sampleDataLikelihoods = p->second;
            list<GenotypeCombo>& populationGenotypeCombos = genotypeCombosByPopulation[population];

            DEBUG2("generating banded genotype combinations from " << sampleDataLikelihoods.size()
                   << " sample genotypes in population " << population);

            // cap the number of iterations at 2 x the number of alternate alleles
            // max it at parameters.genotypingMaxIterations iterations, min at 10
            int itermax = min(max(10, 2 * estimatedMinorAllelesAtLocus), parameters.genotypingMaxIterations);
            //int itermax = parameters.genotypingMaxIterations;

            // XXX HACK
            // passing 0 for bandwidth and banddepth means "exhaustive local search"
            // this produces properly normalized GQ's at polyallelic sites
            int adjustedBandwidth = 0;
            int adjustedBanddepth = 0;
            // however, this can lead to huge performance problems at complex sites,
            // so we implement this hack...
            if (parameters.genotypingMaxBandDepth > 0
                && genotypeAlleles.size() > parameters.genotypingMaxBandDepth) {
                adjustedBandwidth = 1;
                adjustedBanddepth = parameters.genotypingMaxBandDepth;
            }

            GenotypeCombo nullCombo;
            SampleDataLikelihoods nullSampleDataLikelihoods;

            // this is the genotype-likelihood maximum
            if (parameters.reportGenotypeLikelihoodMax) {
                GenotypeCombo comboKing;
                vector<int> initialPosition;
                initialPosition.assign(sampleDataLikelihoods.size(), 0);
                SampleDataLikelihoods nullDataLikelihoods; // dummy variable
                makeComboByDatalLikelihoodRank(comboKing,
                                               initialPosition,
                                               sampleDataLikelihoods,
                                               nullDataLikelihoods,
                                               inputAlleleCounts,
                                               theta,
                                               parameters.pooledDiscrete,
                                               parameters.ewensPriors,
                                               parameters.permute,
                                               parameters.hwePriors,
                                               parameters.obsBinomialPriors,
                                               parameters.alleleBalancePriors,
                                               parameters.diffusionPriorScalar);
                glMaxCombos[population].push_back(comboKing);
            }

            // search much longer for convergence
            convergentGenotypeComboSearch(
                populationGenotypeCombos,
                nullCombo,
                sampleDataLikelihoods, // vary everything
                sampleDataLikelihoods,
                nullSampleDataLikelihoods,
                samples,
                genotypeAlleles,
                inputAlleleCounts,
                adjustedBandwidth,
                adjustedBanddepth,
                theta,
                parameters.pooledDiscrete,
                parameters.ewensPriors,
                parameters.permute,
                parameters.hwePriors,
                parameters.obsBinomialPriors,
                parameters.alleleBalancePriors,
                parameters.diffusionPriorScalar,
                itermax,
                genotypingTotalIterations,
                true); // add homozygous combos
            // ^^ combo results are sorted by default
        }

        // generate the GL max combo
        GenotypeCombo glMax;
        if (parameters.reportGenotypeLikelihoodMax) {
            list<GenotypeCombo> glMaxGenotypeCombos;
            combinePopulationCombos(glMaxGenotypeCombos, glMaxCombos);
            glMax = glMaxGenotypeCombos.front();
        }

        // accumulate combos from independently-calculated populations into the list of combos
        list<GenotypeCombo> genotypeCombos; // build new combos into this list
        combinePopulationCombos(genotypeCombos, genotypeCombosByPopulation);

        // TODO factor out the following blocks as they are repeated from above

        // re-get posterior normalizer
        vector<long double> comboProbs;
        for (list<GenotypeCombo>::iterator gc = genotypeCombos.begin(); gc != genotypeCombos.end(); ++gc) {
            comboProbs.push_back(gc->posteriorProb);
        }
        long double posteriorNormalizer = logsumexp_probs(comboProbs);
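        // logsumexp_probs computes log(sum_i(exp(x_i))) over the log-space
        // combo posteriors, so exp(posteriorProb - posteriorNormalizer) is a
        // properly normalized posterior probability summing to 1 over all combos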
        // reset pVar and pHom, to be recalculated over the combined combos
        pVar = 1.0;
        pHom = 0.0;

        // calculates pvar and gets the best het combo
        list<GenotypeCombo>::iterator gc = genotypeCombos.begin();
        bestCombo = *gc;
        for ( ; gc != genotypeCombos.end(); ++gc) {
            if (gc->isHomozygous() && gc->alleles().front() == referenceBase) {
                pVar -= big_exp(gc->posteriorProb - posteriorNormalizer);
                pHom += big_exp(gc->posteriorProb - posteriorNormalizer);
            } else if (gc == genotypeCombos.begin()) {
                bestOverallComboIsHet = true;
            }
        }

        // odds ratio between the first and second-best combinations
        if (genotypeCombos.size() > 1) {
            bestComboOddsRatio = genotypeCombos.front().posteriorProb - (++genotypeCombos.begin())->posteriorProb;
        }
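        // note: posteriorProb values are natural-log-scaled, so the difference
        // taken for bestComboOddsRatio above is the log of the posterior odds
        // ratio between the two best genotypings; e.g. a difference of
        // ln(10) ~= 2.30 means the best combination is ten times more probable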
        if (parameters.calculateMarginals) {
            // make a combined, all-populations sample data likelihoods vector to accumulate marginals
            SampleDataLikelihoods allSampleDataLikelihoods;
            for (map<string, SampleDataLikelihoods>::iterator p = sampleDataLikelihoodsByPopulation.begin();
                 p != sampleDataLikelihoodsByPopulation.end(); ++p) {
                SampleDataLikelihoods& sdls = p->second;
                allSampleDataLikelihoods.reserve(allSampleDataLikelihoods.size() + distance(sdls.begin(), sdls.end()));
                allSampleDataLikelihoods.insert(allSampleDataLikelihoods.end(), sdls.begin(), sdls.end());
            }
            // calculate the marginal likelihoods over the combined populations
            marginalGenotypeLikelihoods(genotypeCombos, allSampleDataLikelihoods);
            // store the marginal data likelihoods in the results, for easy parsing
            // like a vector -> map conversion...
            results.update(allSampleDataLikelihoods);
        }

        map<string, int> repeats;
        if (parameters.showReferenceRepeats) {
            repeats = parser->repeatCounts(parser->currentSequencePosition(), parser->currentSequence, 12);
        }

        vector<Allele> alts;
        if (parameters.onlyUseInputAlleles
            || parameters.reportAllHaplotypeAlleles
            || parameters.pooledContinuous) {
            //alts = genotypeAlleles;
            for (vector<Allele>::iterator a = genotypeAlleles.begin(); a != genotypeAlleles.end(); ++a) {
                if (!a->isReference()) {
                    alts.push_back(*a);
                }
            }
        } else {
            // get the unique alternate alleles in this combo, sorted by frequency in the combo
            vector<pair<Allele, int> > alternates = alternateAlleles(bestCombo, referenceBase);
            for (vector<pair<Allele, int> >::iterator a = alternates.begin(); a != alternates.end(); ++a) {
                Allele& alt = a->first;
                if (!alt.isNull() && !alt.isReference()) alts.push_back(alt);
            }
            // if there are no alternate alleles in the best combo, use the genotype alleles
            // XXX ...
            if (alts.empty()) {
                for (vector<Allele>::iterator a = genotypeAlleles.begin(); a != genotypeAlleles.end(); ++a) {
                    if (!a->isReference()) {
                        alts.push_back(*a);
                    }
                }
            }
        }

        // reporting the GL maximum *over all alleles*
        if (parameters.reportGenotypeLikelihoodMax) {
            bestCombo = glMax;
        } else {
            // the default behavior is to report the GL maximum genotyping over
            // the alleles in the best posterior genotyping:
            // select the maximum-likelihood GL given the alternates we have
            // this is not the same thing as the GL max over all alleles!
            // it is the GL max over the selected alleles at this point
            vector<Allele> alleles = alts;
            for (vector<Allele>::iterator a = genotypeAlleles.begin(); a != genotypeAlleles.end(); ++a) {
                if (a->isReference()) {
                    alleles.push_back(*a);
                }
            }
            map<string, list<GenotypeCombo> > glMaxComboBasedOnAltsByPop;
            for (map<string, SampleDataLikelihoods>::iterator p = sampleDataLikelihoodsByPopulation.begin();
                 p != sampleDataLikelihoodsByPopulation.end(); ++p) {
                const string& population = p->first;
                SampleDataLikelihoods& sampleDataLikelihoods = p->second;
                GenotypeCombo glMaxBasedOnAlts;
                for (SampleDataLikelihoods::iterator v = sampleDataLikelihoods.begin();
                     v != sampleDataLikelihoods.end(); ++v) {
                    SampleDataLikelihood* m = NULL;
                    for (vector<SampleDataLikelihood>::iterator d = v->begin(); d != v->end(); ++d) {
                        if (d->genotype->matchesAlleles(alleles)) {
                            m = &*d;
                            break;
                        }
                    }
                    assert(m != NULL);
                    glMaxBasedOnAlts.push_back(m);
                }
                glMaxComboBasedOnAltsByPop[population].push_back(glMaxBasedOnAlts);
            }
            list<GenotypeCombo> glMaxBasedOnAltsGenotypeCombos; // build new combos into this list
            combinePopulationCombos(glMaxBasedOnAltsGenotypeCombos, glMaxComboBasedOnAltsByPop);
            bestCombo = glMaxBasedOnAltsGenotypeCombos.front();
        }

        DEBUG("best combo: " << bestCombo);

        // output
        if ((!alts.empty() && (1 - pHom.ToDouble()) >= parameters.PVL) || parameters.PVL == 0) {
            // write the last gVCF record(s)
            if (parameters.gVCFout && !nonCalls.empty()) {
                vcflib::Variant var(parser->variantCallFile);
                out << results.gvcf(var, nonCalls, parser) << endl;
                nonCalls.clear();
            }
            vcflib::Variant var(parser->variantCallFile);
            out << results.vcf(
                var,
                pHom,
                bestComboOddsRatio,
                samples,
                referenceBase,
                alts,
                repeats,
                genotypingTotalIterations,
                parser->sampleList,
                coverage,
                bestCombo,
                alleleGroups,
                partialObservationGroups,
                partialObservationSupport,
                genotypesByPloidy,
                parser->sequencingTechnologies,
                parser) << endl;
        } else if (parameters.gVCFout) {
            // record statistics for gVCF output
            nonCalls.record(parser->currentSequenceName, parser->currentPosition, samples);
        }

        DEBUG2("finished position");

    }

    // write the last gVCF record
    // NOTE: for some reason this is only needed when using gVCF chunks; with
    // minimal chunking it is not required, and in fact it breaks....
    if (parameters.gVCFout && !nonCalls.empty() && !parameters.gVCFNoChunk) {
        Results results;
        vcflib::Variant var(parser->variantCallFile);
        out << results.gvcf(var, nonCalls, parser) << endl;
        nonCalls.clear();
    }

    DEBUG("total sites: " << total_sites << endl
          << "processed sites: " << processed_sites << endl
          << "ratio: " << (float) processed_sites / (float) total_sites);

    delete parser;

    return 0;
}