// -*- mode: c++; indent-tabs-mode: nil; -*- // // Copyright 2009 Illumina, Inc. // // This software is covered by the "Illumina Genome Analyzer Software // License Agreement" and the "Illumina Source Code License Agreement", // and certain third party copyright/licenses, and any user of this // source file is bound by the terms therein (see accompanying files // Illumina_Genome_Analyzer_Software_License_Agreement.pdf and // Illumina_Source_Code_License_Agreement.pdf and third party // copyright/license notices). // // /// \file /// /// \author Chris Saunders /// #ifndef __INDEL_HH #define __INDEL_HH #include "blt_util/blt_types.hh" #include "blt_util/pos_range.hh" #include "starling_types.hh" #include #include #include #include // // Breakpoints refer to both insertions and deletions which exceed // varling's MAX_INDEL_SIZE. In this case we try to account for the // breakpoint during snp-calling, but do not attempt to call the indel // using the same methods used for small indels. // // Positions: Internally all positions are stored using zero-indexed // position numbers. For small indels and left breakpoints, we store // the position of the first affected position. For right breakpoints, // we store the first positions *after* the last affected // position. Positions are stored in this manner so that the indels // follow the starling range convention // // Large indel breakpoints are expected to be rare and to possibly // have an unknown size (especially for insertions). Thus the indel // length is not used in this case to accomodate multiple different // lengths at a single location -- You may insert an advisory length // but it will not be used as part of any alignment calculation. For a // large deletion, "seq" is expected to represent the sequence on the // other side of the breakpoint, starling will not look it up. In this // way the break-point for *any* type of large-scale event can be // accomidated for snp-calling. The seq convention for a breakpoint is // to set the seq size equal to MAX_INDEL_SIZE // // TODO -- find a way to deal with combination insertion-deletions (sequence swap) // perhaps this is another indel type? ...another solution would be to introduce // a "grouping/same haplotype" concept to indels, so that a swap would not be combinatorically // recombined as per other indels. -- I like solution 2 more... // // Skips (introns) are *not* added here. They are treated as invariant // alignment elements in the original read. // // namespace INDEL { enum index_t { NONE, INSERT, DELETE, SV_BP_LEFT, SV_BP_RIGHT }; inline const char* get_index_label(index_t id) { switch(id) { case INSERT: return "INSERT"; case DELETE: return "DELETE"; case SV_BP_LEFT: return "BP_LEFT"; case SV_BP_RIGHT: return "BP_RIGHT"; default: return "UNKNOWN"; } } } // policy (for now) is that two indels which are the same except for // their sorted sequence are treated as the same, the insert sequence // is the first sequence encountered: // struct indel_key { indel_key(const pos_t p=0, const INDEL::index_t t=INDEL::NONE, const unsigned l=0) : pos(p), type(t), length(l) {} // default sort is based on left-most position of the indel (note // we consider breakpoints to have the same left and right // locations) // bool operator<(const indel_key& rhs) const { if(pos < rhs.pos) return true; if(pos == rhs.pos) { if(type < rhs.type) return true; if(type == rhs.type) { if((type == INDEL::NONE) or (type == INDEL::SV_BP_LEFT) or (type == INDEL::SV_BP_RIGHT)) return false; if(length < rhs.length) return true; } } return false; } bool operator==(const indel_key& rhs) const { return ((pos == rhs.pos) and (type == rhs.type) and (length == rhs.length)); } pos_t right_pos() const { if(type==INDEL::DELETE) return pos+length; return pos; } // correct pos range to use when we view sv's as breakpoints: known_pos_range breakpoint_pos_range() const { return known_pos_range(pos,right_pos()); } // correct pos range to use when we view sv's as ranges // (ie. candidate indel interference within a single read:) pos_range open_pos_range() const { if (type == INDEL::SV_BP_LEFT) { pos_range pr; pr.set_begin_pos(pos); return pr; } else if(type == INDEL::SV_BP_RIGHT) { pos_range pr; pr.set_end_pos(pos); return pr; } return breakpoint_pos_range(); } bool is_breakpoint() const { return ((type == INDEL::SV_BP_LEFT) or (type == INDEL::SV_BP_RIGHT)); } pos_t pos; INDEL::index_t type; unsigned length; }; struct right_pos_indel_key_sorter { bool operator()(const indel_key& i1, const indel_key& i2) const { if(i1.right_pos() < i2.right_pos()) return true; if(i1.right_pos() == i2.right_pos()) { if(i1.type < i2.type) return true; if(i1.type == i2.type) { if((i1.type == INDEL::NONE) or (i1.type == INDEL::SV_BP_LEFT) or (i1.type == INDEL::SV_BP_RIGHT)) return false; if(i1.length < i2.length) return true; } } return false; } }; // Holds the alignment scores created by each read aligning across an // indel which express the relative probability of the read aligning // to the indel or the reference (or elsewhere in the genome). Used // for indel genotyping. // struct read_path_scores { read_path_scores(const double r=0, const double i=0, const double n=0, const uint16_t rlen=0) : ref(r), indel(i), nonsite(n), read_length(rlen) {} double ref; double indel; double nonsite; typedef std::map alt_indel_t; alt_indel_t alt_indel; uint16_t read_length; // read length is used to set the expected het allele ratio }; struct indel_data { indel_data() : //is_conflict(false), is_candidate_indel_cached(false) {} std::string seq; // <- TODO: potentially this becomes a // compilation of all insert sequences and some // type of consensus is created, for now it's // just the first sequence encountered // map_read_ids refers to the read(s) that support the indel // through their genomic alignments, given that those alignments // meet the snp-caller's mapping thresholds. // // submap_read_ids are genomic alignments that fall below the // mapping thresholds. // // all_read_ids contains the list of reads which either have a // genomic alignment passing the mapping criteria or have a contig // alignment. // // contig_ids are the grouper contig(s) which support the indel // typedef std::set evidence_t; evidence_t contig_ids; evidence_t all_read_ids; evidence_t map_read_ids; evidence_t submap_read_ids; // this structure represents support for the indel among all reads // which cross an indel breakpoint by a sufficient margin after // re-alignment: // typedef std::map score_t; score_t read_path_lnp; evidence_t suboverlap_read_ids; // the reads which cross an indel breakpoint, but not by enough // to be entered into the scores list // indel interferes with another candidate indel and has weaker evidence: // bool is_conflict; mutable bool is_candidate_indel_cached; mutable bool is_candidate_indel; }; struct indel { indel_key key; indel_data data; }; // debuging dumps: std::ostream& operator<<(std::ostream& os, const read_path_scores& rps); std::ostream& operator<<(std::ostream& os, const indel_key& ik); std::ostream& operator<<(std::ostream& os, const indel_data& id); std::ostream& operator<<(std::ostream& os, const indel& in); #endif