// Copyright 2010 Martin C. Frith // These are routines for masking simple regions (low-complexity and // short-period tandem repeats) in biological sequences. To // understand them in detail, see the published article (in // preparation). // Typically, you would just use the maskSequences routine. The other // routines are more specialized. The inputs to maskSequences are as // follows. // seqBeg: pointer to the start of the sequence. // seqEnd: pointer to one-past-the-end of the sequence. // maxRepeatOffset: the maximum tandem-repeat period-size to consider. // likelihoodRatioMatrix: a matrix of the form Qxy / (Px * Py), where // Qxy is the probability of seeing letters x and y in equivalent // positions of a tandem repeat, and Px and Py are the background // probabilities of the letters. This matrix is related to a scoring // matrix (e.g. blosum62) by: // likelihoodRatioMatrix[x][y] = exp(lambda * scoringMatrix[x][y]). // The uchars in the sequence will be used as indexes into the matrix // (e.g. likelihoodRatioMatrix[x][y]). So typically the uchars are // small integers (e.g. 0, 1, 2, 3). The matrix needs to have entries // for any uchars that can occur. // repeatProb: the probability of a repetitive segment starting per position. // repeatEndProb: the probability of a repetitive segment ending per position. // repeatOffsetProbDecay: the probability of a period-(i) repeat // divided by the probability of a period-(i-1) repeat. // firstGapProb: the probability of initiating an insertion or // deletion in a repetitive region. // otherGapProb: the probability of extending an insertion or deletion // by one more letter. // minMaskProb: mask letters whose posterior probability of being // repetitive is >= this. // maskTable: how to do the masking. Letter x will be changed to // maskTable[x]. So maskTable needs to have entries for any uchar // that can occur. // Typical usage: // tantan::maskSequences(seqBeg, seqEnd, 100, likelihoodRatioMatrix, // 0.005, 0.05, 0.9, 0, 0, 0.5, maskTable) #ifndef TANTAN_HH #define TANTAN_HH namespace tantan { typedef unsigned char uchar; typedef const double *const_double_ptr; void maskSequences(uchar *seqBeg, uchar *seqEnd, int maxRepeatOffset, const const_double_ptr *likelihoodRatioMatrix, double repeatProb, double repeatEndProb, double repeatOffsetProbDecay, double firstGapProb, double otherGapProb, double minMaskProb, const uchar *maskTable); // The following routine gets the posterior probability that each // letter is repetitive. It stores the results in "probabilities", // which must point to enough pre-allocated space to fit the results. void getProbabilities(const uchar *seqBeg, const uchar *seqEnd, int maxRepeatOffset, const const_double_ptr *likelihoodRatioMatrix, double repeatProb, double repeatEndProb, double repeatOffsetProbDecay, double firstGapProb, double otherGapProb, float *probabilities); // The following routine masks each letter whose corresponding entry // in "probabilities" is >= minMaskProb. void maskProbableLetters(uchar *seqBeg, uchar *seqEnd, const float *probabilities, double minMaskProb, const uchar *maskTable); // The following routine counts the expected number of transitions // from the background (non-repeat) state to other states. It adds // the results to "transitionCounts", which must point to // pre-initialized space for (maxRepeatOffset+1) items. The // background->background transition count is stored in // transitionCounts[0]. The background->(period-i repeat) transition // count is stored in transitionCounts[i]. // (In this routine, the HMM begin and end states are counted as // background states. Thus, begin->X is added to background->X, and // X->end is added to X->background.) void countTransitions(const uchar *seqBeg, const uchar *seqEnd, int maxRepeatOffset, const const_double_ptr *likelihoodRatioMatrix, double repeatProb, double repeatEndProb, double repeatOffsetProbDecay, double firstGapProb, double otherGapProb, double *transitionCounts); } #endif