// -*- mode: c++; indent-tabs-mode: nil; -*- // // Copyright 2009 Illumina, Inc. // // This software is covered by the "Illumina Genome Analyzer Software // License Agreement" and the "Illumina Source Code License Agreement", // and certain third party copyright/licenses, and any user of this // source file is bound by the terms therein (see accompanying files // Illumina_Genome_Analyzer_Software_License_Agreement.pdf and // Illumina_Source_Code_License_Agreement.pdf and third party // copyright/license notices). // // /// \file /// /// \author Chris Saunders /// #include "alignment/GlobalUtilities.hh" #include "blt_util/blt_exception.hh" #include "blt_util/log.hh" #include "blt_util/parse_util.hh" #include "blt_util/seq_util.hh" #include "starling/align_path.hh" #include "boost/lexical_cast.hpp" #include #include #include #include #include enum { INDEL_BEGIN='^', INDEL_END='$' }; static void unknown_md_error(const char* const md, const char* const mdptr){ std::ostringstream oss; oss << "ERROR: can't parse match descriptor string: " << md << "\n" << "\tunexpected character: '" << *mdptr << "' at position: " << (mdptr-md+1) << "\n"; throw blt_exception(oss.str().c_str()); } static void unknown_cigar_error(const char* const cigar, const char* const cptr){ std::ostringstream oss; oss << "ERROR: can't parse cigar string: " << cigar << "\n" << "\tunexpected character: '" << *cptr << "' at position: " << (cptr-cigar+1) << "\n"; throw blt_exception(oss.str().c_str()); } namespace ALIGNPATH { static void apath_push(path_t& apath, path_segment& ps, const align_t t){ if( (0==ps.length) or (ps.type==t) ) return; apath.push_back(ps); ps.clear(); } static void export_md_to_apath_impl(const char* md, path_t& apath) { using casava::blt_util::parse_unsigned; const char* mdptr(md); path_segment ps; while(*mdptr) { if (isdigit(*mdptr)) { apath_push(apath,ps,MATCH); const unsigned mlen(parse_unsigned(mdptr)); ps.length += mlen; ps.type = MATCH; } else if(is_valid_base(*mdptr)){ apath_push(apath,ps,MATCH); mdptr++; ps.length++; ps.type = MATCH; } else if(*mdptr == INDEL_BEGIN){ mdptr++; // eat INDEL_BEGIN while(*mdptr != INDEL_END){ if (isdigit(*mdptr)){ apath_push(apath,ps,INSERT); const unsigned mlen(parse_unsigned(mdptr)); ps.length=mlen; ps.type=INSERT; } else if(is_valid_base(*mdptr)){ apath_push(apath,ps,DELETE); mdptr++; ps.length++; ps.type=DELETE; } else { unknown_md_error(md,mdptr); } } mdptr++; // eat INDEL_END } else { unknown_md_error(md,mdptr); } } apath_push(apath,ps,NONE); } void export_md_to_apath(const char* md, const bool is_fwd_strand, path_t& apath, const bool is_edge_deletion_error){ // to make best use of previous code, we parse the MD in the // alignment direction and then orient apath to the forward strand // as a second step if required // assert(NULL != md); apath.clear(); export_md_to_apath_impl(md,apath); unsigned as(apath.size()); if( ((as>0) and (apath.front().type == DELETE)) or ((as>1) and (apath.back().type == DELETE)) ) { std::ostringstream oss; if(is_edge_deletion_error) { oss << "ERROR: "; } else { oss << "WARNING: "; } std::string cigar; apath_to_cigar(apath,cigar); oss << "alignment path: " << cigar << " contains meaningless edge deletion.\n"; if(is_edge_deletion_error) { throw blt_exception(oss.str().c_str()); } else { log_os << oss.str(); path_t apath2; for(unsigned i(0);i1) ) { std::reverse(apath.begin(),apath.end()); } } void apath_to_cigar(const path_t& apath, std::string& cigar) { cigar.clear(); const unsigned as(apath.size()); for(unsigned i(0);i(ps.length); cigar.push_back(segment_type_to_cigar_code(ps.type)); } } void apath_to_export_md(path_t& apath, const char* ref_seq, const char* ref_end, const int32_t ref_pos, const std::string& read_bases, const bool is_fwd_strand, std::string& md) { md.clear(); if(is_fwd_strand) { const char* pRead = read_bases.c_str(); const char* pReference = ref_seq + ref_pos - 1; fwd_apath_to_export_md(apath, ref_seq, pReference, ref_end, pRead, md); } else { uint32_t numRefBases = 0; path_t::const_iterator pCIter; for(pCIter = apath.begin(); pCIter != apath.end(); ++pCIter) { if((pCIter->type == MATCH) || (pCIter->type == DELETE)) { numRefBases += pCIter->length; } } const char* pRead = read_bases.c_str(); const char* pReference = ref_seq + ref_pos + numRefBases - 2; rev_apath_to_export_md(apath, ref_seq, pReference, ref_end, pRead, md); } } void fwd_apath_to_export_md(path_t& apath, const char* ref_begin, const char* ref_bases, const char* ref_end, const char* read_bases, std::string& md) { // process the align path bool foundUnsupportedCigar = false; path_t::const_iterator pCIter; for(pCIter = apath.begin(); pCIter != apath.end(); ++pCIter) { if(pCIter->type == DELETE) { // handle deletion md.push_back('^'); for(uint32_t i = 0; i < pCIter->length; ++i, ++ref_bases) { md.push_back(*ref_bases); } md.push_back('$'); } else if(pCIter->type == INSERT) { // handle insertion md.push_back('^'); md += boost::lexical_cast(pCIter->length); read_bases += pCIter->length; md.push_back('$'); } else if(pCIter->type == MATCH) { // handle match/mismatch uint32_t numMatchingBases = 0; for(uint32_t i = 0; i < pCIter->length; ++i, ++ref_bases, ++read_bases) { // handle circular genome if((ref_bases < ref_begin) || (ref_bases > ref_end)) { md.push_back('N'); continue; } if(*ref_bases != *read_bases) { // write the number of preceding matching bases if(numMatchingBases != 0) { md += boost::lexical_cast(numMatchingBases); numMatchingBases = 0; } // output the mismatched base md.push_back(*ref_bases); } else ++numMatchingBases; } // write the number of trailing matching bases if(numMatchingBases != 0) { md += boost::lexical_cast(numMatchingBases); } } else { // handle unsupported CIGAR operation foundUnsupportedCigar = true; break; } } if(foundUnsupportedCigar) md = "UNSUPPORTED"; } void rev_apath_to_export_md(path_t& apath, const char* ref_begin, const char* ref_bases, const char* ref_end, const char* read_bases, std::string& md) { // process the align path bool foundUnsupportedCigar = false; path_t::const_reverse_iterator pCRIter; for(pCRIter = apath.rbegin(); pCRIter != apath.rend(); ++pCRIter) { if(pCRIter->type == DELETE) { // handle deletion md.push_back('^'); for(uint32_t i = 0; i < pCRIter->length; ++i, --ref_bases) { md.push_back(reverseCharASCII[(uint)*ref_bases]); } md.push_back('$'); } else if(pCRIter->type == INSERT) { // handle insertion md.push_back('^'); md += boost::lexical_cast(pCRIter->length); read_bases += pCRIter->length; md.push_back('$'); } else if(pCRIter->type == MATCH) { // recreate the the match descriptor for this non-INDEL region uint32_t numMatchingBases = 0; for(uint32_t i = 0; i < pCRIter->length; ++i, --ref_bases, ++read_bases) { // handle circular genome if((ref_bases < ref_begin) || (ref_bases > ref_end)) { md.push_back('N'); continue; } const char rcRefBase = reverseCharASCII[(uint)*ref_bases]; if(rcRefBase != *read_bases) { // write the number of preceding matching bases if(numMatchingBases != 0) { md += boost::lexical_cast(numMatchingBases); numMatchingBases = 0; } // output the mismatched base md.push_back(rcRefBase); } else ++numMatchingBases; } // write the number of trailing matching bases if(numMatchingBases != 0) { md += boost::lexical_cast(numMatchingBases); } } else { // handle unsupported CIGAR operation foundUnsupportedCigar = true; break; } } if(foundUnsupportedCigar) md = "UNSUPPORTED"; } void cigar_to_apath(const char* cigar, path_t& apath){ using casava::blt_util::parse_unsigned; assert(NULL != cigar); apath.clear(); path_segment lps; const char* cptr(cigar); while(*cptr) { path_segment ps; // expect sequences of digits and cigar codes: if((not isdigit(*cptr)) or (*cptr=='0')) unknown_cigar_error(cigar,cptr); ps.length = parse_unsigned(cptr); ps.type = cigar_code_to_segment_type(*cptr); if(ps.type == NONE) unknown_cigar_error(cigar,cptr); cptr++; if((ps.type == PAD) or (ps.length == 0)) continue; if(ps.type != lps.type){ if(lps.type != NONE) apath.push_back(lps); lps = ps; } else { lps.length += ps.length; } } if(lps.type != NONE) apath.push_back(lps); } unsigned apath_read_length(const path_t& apath) { unsigned val(0); const unsigned as(apath.size()); for(unsigned i(0);i0) { ps.length = lead; apath2.push_back(ps); } apath2.insert(apath2.end(),apath.begin(),apath.end()); if(trail>0) { ps.length = trail; apath2.push_back(ps); } apath=apath2; } std::pair get_nonclip_end_segments(const path_t& apath) { const unsigned as(apath.size()); std::pair res(as,as); bool is_nonclip(false); for(unsigned i(0);i0) and ((apath[0].type == SOFT_CLIP) or (apath[as-1].type==SOFT_CLIP))); } bool is_edge_clipped(const path_t& apath){ const unsigned as(apath.size()); return ((as>0) and ((is_segment_type_unaligned_read_edge(apath[0].type)) or (is_segment_type_unaligned_read_edge(apath[as-1].type)))); } bool is_seq_swap(const path_t& apath) { bool is_last_indel(false); const unsigned as(apath.size()); for(unsigned i(0);i