// -*- mode: c++; indent-tabs-mode: nil; -*- // // Copyright 2009 Illumina, Inc. // // This software is covered by the "Illumina Genome Analyzer Software // License Agreement" and the "Illumina Source Code License Agreement", // and certain third party copyright/licenses, and any user of this // source file is bound by the terms therein (see accompanying files // Illumina_Genome_Analyzer_Software_License_Agreement.pdf and // Illumina_Source_Code_License_Agreement.pdf and third party // copyright/license notices). // // /// \file /// /// \author Chris Saunders /// #include "grouper_contig_util.hh" #include "blt_util/blt_exception.hh" #include "blt_util/log.hh" #include "starling/starling_shared.hh" #include #include #include // for EOF... #include #include // reads which extend off the end of the contig on either side are // soft-clipped // bool map_grouper_contig_read_to_genome(const grouper_contig& ctg, alignment& read_al){ using namespace ALIGNPATH; // Read alignment is expected to perfectly align to the contig // (but soft-clip is allowed): // const unsigned rp_size(read_al.path.size()); assert(rp_size>=1); for(unsigned i(0);i=0); // else read doesn't align within the contig bool is_read_path_begin(false); path_t new_path; // add leading soft-clip segment if it exists: if(read_al.path[0].type == SOFT_CLIP) { new_path.push_back(read_al.path[0].type); } // mark any sequence hanging off the end of the contig as soft-clipped: if(read_begin_pos<0){ path_segment seg; seg.type = SOFT_CLIP; seg.length = -read_begin_pos; new_path.push_back(seg); read_begin_pos=0; } pos_t contig_segment_end_pos(0); pos_t ref_segment_end_pos(ctg.pos); const unsigned cpath_size(ctg.path.size()); for(unsigned i(0);i= contig_segment_end_pos) continue; is_read_path_begin=true; const pos_t shift(read_begin_pos-contig_last_segment_end_pos); read_al.pos = std::min(ref_last_segment_end_pos+shift,ref_segment_end_pos); assert(shift(seg.length)); seg.length -= shift; } const bool is_final_seg(read_end_pos <= contig_segment_end_pos); if(is_final_seg){ const pos_t shift(contig_segment_end_pos-read_end_pos); assert(shift(seg.length)); seg.length -= shift; } new_path.push_back(seg); if(is_final_seg) break; } assert(is_read_path_begin); // else read doesn't align within the contig // add trailing soft-clip if it exists: if((rp_size > 1) and (read_al.path[rp_size-1].type == SOFT_CLIP)) { new_path.push_back(read_al.path[rp_size-1]); } // mark any read segment hanging off the end of the contig as soft-clipped: const unsigned al_read_length(apath_read_length(read_al.path)); const unsigned new_read_length(apath_read_length(new_path)); if(new_read_length < al_read_length) { path_segment seg; seg.type = SOFT_CLIP; seg.length = (al_read_length-new_read_length); new_path.push_back(seg); } // copy and condense new path, check that a MATCH segment exists: bool is_genome_align(false); { path_t& rp(read_al.path); rp.clear(); align_t last_type(NONE); const unsigned nps(new_path.size()); for(unsigned i(0);i(header.substr(begin_pos,pos-begin_pos))-1; } else if(i==3){ const std::string cigar(header.substr(begin_pos,pos-begin_pos)); cigar_to_apath(cigar.c_str(),ctg.path); } else if(i==4){ //intentional throwaway: //pos_t indel_begin_pos=boost::lexical_cast(header.substr(begin_pos,pos-begin_pos))-1; } else { bad_header_error(header); } } /// \todo TODO -- handle this case for circular chromosome support: assert(ctg.pos>=0); } // read seq: { if(is.peek() == '>') { throw blt_exception("ERROR: Unexpected format in GROUPER contig file.\n"); } static const char seq_delim[] = " \t\n\r"; std::string line_buff; std::string::size_type begin_pos,pos(1); std::string& seq(ctg.seq); while (std::getline(is,line_buff)){ begin_pos=line_buff.find_first_not_of(seq_delim); pos=line_buff.find_last_not_of(seq_delim)+1; if(ctg.is_usable) { seq += line_buff.substr(begin_pos,pos-begin_pos); if(seq.size() > MAX_CONTIG_SIZE) { log_os << "WARNING: GROUPER contig: '" << ctg.id << "' exceeds maximum contig size. Skipping...\n"; ctg.is_usable=false; } } if(is.peek() == '>') break; } return true; } }