// -*- mode: c++; indent-tabs-mode: nil; -*- // // Copyright 2010 Illumina, Inc. // // This software is covered by the "Illumina Genome Analyzer Software // License Agreement" and the "Illumina Source Code License Agreement", // and certain third party copyright/licenses, and any user of this // source file is bound by the terms therein (see accompanying files // Illumina_Genome_Analyzer_Software_License_Agreement.pdf and // Illumina_Source_Code_License_Agreement.pdf and third party // copyright/license notices). // // /// \file sortedToBam.cpp /// /// \author Chris Saunders /// #include "blt_util/bam_dumper.hh" #include "blt_util/bam_util.hh" #include "blt_util/blt_exception.hh" #include "blt_util/export_stream_reader.hh" #include "blt_util/seq_util.hh" #include "starling/align_path.hh" #include "starling/align_path_bam_util.hh" #include "boost/program_options.hpp" #include "boost/regex.hpp" #include "boost/utility.hpp" #include #include #include #include #include #include #include std::ostream& log_os(std::cerr); /* qconverter: 128-entry lookup table translating a raw quality byte into the quality value returned by convert(). With is_qphred=true, indices 0-63 map to 0 and index i+64 maps to i; otherwise entry i is computed from the formula 10*ln(1+10^((i-64)/10))/ln(10)+.499 shown in the constructor. NOTE(review): convert() indexes the table with an unchecked uint8_t, so q >= 128 would read past convertq - confirm callers guarantee q < 128. */ struct qconverter { qconverter(const bool is_qphred) { if(is_qphred) { for(unsigned i(0);i<64;++i) { convertq[i] = 0; convertq[i+64] = i; } } else { static const double log10(std::log(10)); for(int i(0);i<128;++i) { convertq[i] = static_cast(10*std::log(1.+std::pow(10.,((i-64)/10.)))/log10+.499); } } } inline uint8_t convert(const uint8_t q) const { return convertq[q]; } private: uint8_t convertq[128]; }; /* Write the read key for one export line into key, formatted as machine_run:lane:tile:x:y from the parser's accessors. */ static void get_read_key_from_export_line(const export_line_parser& exl, std::string& key){ // note that this has been changed to the slower numerical version to // guarantee that matching reads will be identified as such even after GROUPER // removes leading zeros // std::ostringstream oss; oss << exl.machine() << '_' << exl.run_number() << ':' << exl.lane() << ':' << exl.tile() << ':' << exl.x_coordinate() << ':' << exl.y_coordinate(); key = oss.str(); } /* Return the target id that bam_get_tid reports for seq_name in header; any value below -1 is replaced by -1. */ static int32_t get_bam_target_id(const char* seq_name, const bam_header_t* header){ int32_t
tid(bam_get_tid(header,seq_name)); if(tid<-1) tid=-1; return tid; } /* Complement a single match-descriptor character: A/T and C/G are swapped, '^' and '$' are swapped, every other character is returned unchanged. */ inline char comp_md_base(char a){ switch(a) { case 'A': return 'T'; case 'C': return 'G'; case 'G': return 'C'; case 'T': return 'A'; case '^': return '$'; case '$': return '^'; default: return a; } } /* Reverse the range [b,e) in place, passing every element through comp_md_base; the middle element of an odd-length range is complemented where it stands. */ template void reverseCompMDChar(Iter b,Iter e){ char t; for(;b!=e;++b){ if((--e)==b) { *b=comp_md_base(*b); break; } t=comp_md_base(*b); *b=comp_md_base(*e); *e=t; } } /* Reverse-complement the NUL-terminated match descriptor md in place; per the comment below, the digits of each number are then put back in their original order. */ static void reverse_comp_match_descriptor(char* md) { const unsigned len(strlen(md)); reverseCompMDChar(md,md+len); // Unreverse the digits of numbers. bool is_in_digits(false); unsigned starti(0); for(unsigned i(0);i(md)); } static const char xdtag[] = {'X','D'}; bam_aux_append(&br,xdtag,'Z', (strlen(md)+1),(uint8_t*) (md)); if(NULL==strchr(md,'^')){ // no-indel case: apath.push_back(ALIGNPATH::path_segment(ALIGNPATH::MATCH,read_size)); } else { // indel case: export_md_to_apath(md,true,apath); } } edit_bam_cigar(apath,br); } /* Options for one sortedToBam run. Defaults set by the constructor: is_paired false, is_qphred true, and neither mapped nor unmapped reads excluded; the three file names start empty. */ struct stb_options { stb_options() : is_paired(false) , is_qphred(true) , is_exclude_mapped(false) , is_exclude_unmapped(false) {} bool is_paired; bool is_qphred; bool is_exclude_mapped; bool is_exclude_unmapped; std::string header_filename; std::string sorted_filename; std::string bam_filename; }; // Note this is intentionally written to be an exact copy of // sorted2sam.pl, warts and all, because the rest of the casava // pipeline has been functioning correctly with that logic for some // time. The primary points that need to get cleaned up are circular // genome corner cases, which we'll just fix in ELAND. // // On that subject -- the only explicit difference from the // sorted2sam.pl logic in here is that SAM pos=max(0,pos) [ or // pos=max(-1,pos) in BAM coordinates]. This logic was not implemented // by the sorted2sam.pl script itself, but was done automatically by // samtools view to produce the final bam file. 
// /* Convert export records to BAM. Input comes from std::cin when opt.sorted_filename is empty or "-", otherwise from that file; the BAM header is read from opt.header_filename and records are written through a bam_dumper opened on opt.bam_filename. Exits with EXIT_FAILURE when the header file cannot be opened and throws blt_exception when a read violates the no-mapped or no-unmapped option. NOTE(review): header is never released in the code visible here - confirm whether bam_dumper takes ownership. */ static void stb_run(const stb_options& opt){ static const char stream_filename[] = "-"; // open sorted file: // std::auto_ptr exr; if(opt.sorted_filename.empty() or (opt.sorted_filename==stream_filename)){ exr.reset(new export_stream_reader(std::cin,"stdin")); } else { exr.reset(new export_file_reader(opt.sorted_filename.c_str())); } // read bam header and open bam file: bam_header_t *header(NULL); { // read the header tamFile fph = sam_open(opt.header_filename.c_str()); if (fph == 0) { log_os << "ERROR: failed to read the header from file: " << opt.header_filename << "\n"; exit(EXIT_FAILURE); } header = sam_header_read(fph); sam_close(fph); } qconverter qc(opt.is_qphred); bam_dumper bamd(opt.bam_filename.c_str(),header); bam1_t* brp(bam_init1()); bam1_t& br(*brp); bam1_core_t& brc(br.core); int32_t cached_target_id(-1); std::string tmp_key; boost::cmatch rex_result; std::string partner_chrom; while(exr->next()) { const export_line_parser& exl(*(exr->exline())); // reset portions of bam record: brc.flag = 0; brc.pos = -1; brc.mpos = -1; brc.tid = -1; brc.mtid = -1; brc.isize = 0; br.data_len -= br.l_aux; br.l_aux = 0; // Note that the only information we have here is whether the // entire post-alignment build is paired-end or single-end, // and we don't actually know the read's // 'paired-in-sequencing' status. // // The resolution to this problem is that we only set // 'paired-in-sequencing' for paired-end builds (even though // this may be wrong for SE builds), and to preserve legal // read numbers in all cases. Also note that the // paired-in-sequencing bit is mis-interpreted by starling // downstream as 'paired-in-alignment', so CASAVA would not // currently work correctly were this flag set according to // its definition. 
// if(opt.is_paired) brc.flag |= BAM_FLAG::PAIRED; const int readnum(exl.read_number()); if(opt.is_paired) assert((readnum == 1) or (readnum == 2)); if (readnum==1) { brc.flag |= BAM_FLAG::FIRST_READ; } else if(readnum==2) { brc.flag |= BAM_FLAG::SECOND_READ; } if(not exl.is_passed_filter()) { brc.flag |= BAM_FLAG::FILTER; } // set qname get_read_key_from_export_line(exl,tmp_key); edit_bam_qname(tmp_key.c_str(),br); const char* pre_cigar(NULL); { // search extra export fields: static const char dupstr[] = "DUP"; static const char cigarstr[] = "CIGAR:"; static const unsigned cigarlen(strlen(cigarstr)); static const char amstr[] = "AM:i:0"; bool is_amzero(false); const unsigned xsize(exl.extra_field_count()); for(unsigned i(0);i(const_cast(exl.quality()))); for(unsigned i(0);i 0))) { brc.tid=cached_target_id; brc.pos = std::max(-1,static_cast(pos-1)); if(is_fwd_strand) { // if shadow, set mate reversed flag based on query strand brc.flag |= BAM_FLAG::MATE_STRAND; } brc.mtid=brc.tid; brc.mpos=brc.pos; } else { ret_val=false; } } else { // mapped: const char* read(exl.read()); if(not is_fwd_strand) { brc.flag |= BAM_FLAG::STRAND; char* uread(const_cast(read)); reverseComp(uread,uread+read_size); std::reverse(qual,qual+read_size); } edit_bam_read_and_quality(read,qual,br); brc.tid=get_bam_target_id(chrom,header); // cache the chrom id for the shadow workaround: cached_target_id=brc.tid; brc.pos = std::max(-1,static_cast(pos-1)); process_match_descriptor(exl,is_fwd_strand,read_size,pre_cigar,br); } // at this point ret_val is fixed, so go ahead and check // against exclusions: // if(opt.is_exclude_unmapped and (not ret_val)) { std::ostringstream oss; oss << "ERROR: detected unmapped read"; if(opt.is_paired) { oss << " pair"; } oss << " in no-unmapped mode\n"; throw blt_exception(oss.str().c_str()); } if(opt.is_exclude_mapped and ret_val) { std::ostringstream oss; oss << "ERROR: detected mapped read"; if(opt.is_paired) { oss << " pair"; } oss << " in no-mapped mode\n"; 
throw blt_exception(oss.str().c_str()); } // set mapq: if(has_coor) { if(is_semap or is_pemap) { brc.qual = std::min(254,std::max(semap,pemap)); } else { brc.qual = 255; } } else { brc.qual = 0; } if(opt.is_paired) { if(has_coor and (is_pemap and pemap != 0)) { brc.flag |= BAM_FLAG::PROPER_PAIR; } const bool is_pstrand(exl.is_partner_strand()); const char pstrand(is_pstrand ? exl.partner_strand() : '\0'); if((not is_pstrand) or (pstrand=='N')){ // The partner info is bogus for shadows (single score // -1). If it is a shadow, we do not set the mate // unmapped flag, since by definition the mate is // mapped. NOTE that all of the other bogus partner // info is copied through into SAM as is - on the // grounds that downstream analysis still requires the // bogus form. if ((not is_semap) or (semap != -1)){ brc.flag |= BAM_FLAG::MATE_UNMAPPED; } } else { const bool is_fwd_pstrand(pstrand=='F'); if(not is_fwd_pstrand) { brc.flag |= BAM_FLAG::MATE_STRAND; } if(partner_chrom.empty()){ brc.mtid=brc.tid; if(has_coor) { brc.mpos = std::max(-1,static_cast(pos-1+partner_offset)); brc.isize = partner_offset; if(is_fwd_strand!=is_fwd_pstrand) { if(is_fwd_strand) { brc.isize += static_cast(read_size); } else { brc.isize -= static_cast(read_size); } } } } else { brc.mtid=get_bam_target_id(partner_chrom.c_str(),header); brc.mpos=std::max(-1,static_cast(partner_offset-1)); } } } // add remaining optional fields: if(has_coor) { // Values as per spec. 
Include value if non-blank, even if // 0 or negative: // if(is_semap) { static const char smtag[] = {'S','M'}; bam_aux_append(brp,smtag,'i',4,reinterpret_cast(&semap)); } if(opt.is_paired and is_pemap) { static const char astag[] = {'A','S'}; bam_aux_append(brp,astag,'i',4,const_cast(reinterpret_cast(&pemap))); } } bam_update_bin(br); bamd.put_record(brp); } bam_destroy1(brp); } /* Parse the command line into stb_options and call stb_run. Prints usage and exits with EXIT_FAILURE on --help, on an option-parser error, or when no arguments are given; also exits with EXIT_FAILURE when the header file or BAM file name is missing. */ static void try_main(int argc,char* argv[]){ //const time_t start_time(time(0)); //const char* progname(argv[0]); static const char progname[] = "sortedToBam"; std::string cmdline; for(int i(0);i(&(opt.sorted_filename)),"sorted (export) file (default: stdin)") ("header-file", po::value(&(opt.header_filename)),"file containing SAM header with reference sequence sizes") ("bam-file", po::value(&(opt.bam_filename)),"BAM output file") ("paired","treat input as paired-end (default: single-end) Note that read numbers are preserved in either case.") ("no-mapped","treat mapped reads/read-pairs as an error") ("no-unmapped","treat unmapped reads/read-pairs as an error") ("qlogodds","treat quality scores as log-odds scores, aka Solexa scores (default: treat as Phred64)"); po::options_description help("help"); help.add_options() ("help,h","print this message"); po::options_description visible("options"); visible.add(req).add(help); bool po_parse_fail(false); po::variables_map vm; try { po::store(po::parse_command_line(argc, argv, visible), vm); po::notify(vm); } catch(const boost::program_options::error& e) { log_os << "ERROR: Exception thrown by option parser: " << e.what() << "\n"; po_parse_fail=true; } if (vm.count("help") or po_parse_fail or argc==1) { log_os << "\n" << progname << " converts sorted files to BAM format\n\n"; log_os << "usage: " << progname << " [options] \n\n"; log_os << visible << "\n"; exit(EXIT_FAILURE); } opt.is_paired=(vm.count("paired")); opt.is_qphred=(vm.count("qlogodds")==0); opt.is_exclude_mapped=(vm.count("no-mapped")); opt.is_exclude_unmapped=(vm.count("no-unmapped")); 
if(opt.header_filename.empty()) { log_os << "\nERROR: must specify bam header file\n\n"; exit(EXIT_FAILURE); } if(opt.bam_filename.empty()) { log_os << "ERROR: must specify bam file\n"; exit(EXIT_FAILURE); } stb_run(opt); } static void dump_cl(int argc, char* argv[], std::ostream& os) { os << "cmdline:"; for(int i(0);i