/** ** Copyright (c) 2007-2010 Illumina, Inc. ** ** This software is covered by the "Illumina Genome Analyzer Software ** License Agreement" and the "Illumina Source Code License Agreement", ** and certain third party copyright/licenses, and any user of this ** source file is bound by the terms therein (see accompanying files ** Illumina_Genome_Analyzer_Software_License_Agreement.pdf and ** Illumina_Source_Code_License_Agreement.pdf and third party ** copyright/license notices). ** ** This file is part of the Consensus Assessment of Sequence And VAriation ** (CASAVA) software package. ** ** @file BamAlignmentReader.h ** ** @brief Provides a wrapper for bam_streamer that fills the Alignment data ** structure. ** ** @author Michael Stromberg **/ #include "alignment/GlobalUtilities.hh" #include "blt_util/seq_util.hh" #include "common/StringUtilities.hh" #include "starling/align_path.hh" #include "starling/align_path_bam_util.hh" #include "variance/BamAlignmentReader.hh" #include using namespace std; namespace cc = casava::common; namespace ca { namespace variance_detection { // constructor BamAlignmentReader::BamAlignmentReader(const bool isFilterUnanchored) : mIsOpen(false) , mpInStream(NULL) , mUseCasavaReadNameStyle(true) , mIsFilterUnanchored(isFilterUnanchored) { } // destructor BamAlignmentReader::~BamAlignmentReader(void) { Close(); } // closes the underlying file stream(s) void BamAlignmentReader::Close(void) { if(mIsOpen) { delete mpInStream; mIsOpen = false; } } // converts the bases from a bam_seq to an alignment string void BamAlignmentReader::AddBases(const bam_seq& br, cc::Alignment& al, const bool onForwardStrand) { const unsigned int seqLen = br.size(); string s; s.resize(seqLen); char* pString = (char*)s.data(); for(uint32_t i = 0; i < seqLen; ++i, ++pString) { *pString = (onForwardStrand ? br.get_char(i) : reverseCharASCII[(uint)br.get_char(seqLen - i - 1)]); } al.setData(s); } // converts the base qualities from an array to an alignment string void BamAlignmentReader::AddBaseQualities(const uint8_t* bq, const uint32_t numBases, cc::Alignment& al, const bool onForwardStrand) { string s; s.resize(numBases); char* pString = (char*)s.data(); if(onForwardStrand) { // handle the forward orientation for(uint32_t i = 0; i < numBases; ++i, ++pString, ++bq) { *pString = *bq + 64; } } else { // handle the reverse orientation bq += (numBases - 1); for(uint32_t i = 0; i < numBases; ++i, ++pString, --bq) { *pString = *bq + 64; } } al.setQuality(s); } // converts the index sequence to an alignment string void BamAlignmentReader::AddIndex(const char* indexSeq, casava::common::Alignment& al) { string s = (indexSeq ? indexSeq : "0"); al.setIndex(s); } // extracts the read name into machine name, run id, etc. void BamAlignmentReader::AddReadName(cc::Alignment& al, const char* readName, bool& useCasavaReadNameStyle) { // handle CASAVA read names if(useCasavaReadNameStyle) { const size_t BUFFER_SIZE = strlen(readName); const char* pBuffer = readName; const char* pEnd = pBuffer + BUFFER_SIZE; // initialize pointers char* pUnderline1 = NULL; char* pColon1 = NULL; char* pColon2 = NULL; char* pColon3 = NULL; char* pColon4 = NULL; bool foundError = false; pUnderline1 = (char*)memchr(pBuffer, '_', BUFFER_SIZE); if(!pUnderline1) foundError = true; if(!foundError) pColon1 = (char*)memchr(pUnderline1 + 1, ':', BUFFER_SIZE - (pUnderline1 - pBuffer)); if(!pColon1) foundError = true; if(!foundError) pColon2 = (char*)memchr(pColon1 + 1, ':', BUFFER_SIZE - (pColon1 - pBuffer)); if(!pColon2) foundError = true; if(!foundError) pColon3 = (char*)memchr(pColon2 + 1, ':', BUFFER_SIZE - (pColon2 - pBuffer)); if(!pColon3) foundError = true; if(!foundError) pColon4 = (char*)memchr(pColon3 + 1, ':', BUFFER_SIZE - (pColon3 - pBuffer)); if(!pColon4) foundError = true; if(!foundError) { // EAS15_25:1:59:590:393 string machineName; string runNumber; string laneNumber; string tileNumber; string xCoord; string yCoord; cc::StringUtilities::CopyString(machineName, pBuffer, pUnderline1); cc::StringUtilities::CopyString(runNumber, pUnderline1 + 1, pColon1); cc::StringUtilities::CopyString(laneNumber, pColon1 + 1, pColon2); cc::StringUtilities::CopyString(tileNumber, pColon2 + 1, pColon3); cc::StringUtilities::CopyString(xCoord, pColon3 + 1, pColon4); cc::StringUtilities::CopyString(yCoord, pColon4 + 1, pEnd); al.setMachineName(machineName); al.setRunNumber(atoi(runNumber.c_str())); al.setLaneNumber(atoi(laneNumber.c_str())); al.setTileNumber(atoi(tileNumber.c_str())); al.setX(atoi(xCoord.c_str())); al.setY(atoi(yCoord.c_str())); //cout << "DEBUG: Retrieved read name: [" << laneNumber << ':' << tileNumber << ':' << xCoord << ':' << yCoord << "]" << endl; return; } else useCasavaReadNameStyle = false; } // handle non-CASAVA read names al.setMachineName(readName); al.setRunNumber(0); al.setLaneNumber(0); al.setTileNumber(0); al.setX(0); al.setY(0); } // provides the next alignment and returns false if no more reads are available bool BamAlignmentReader::GetNextRead(cc::Alignment& al) { // skip if the file is not currently open if(!mIsOpen) return false; // grab the next alignment that does not have purity filtering, pcr // duplicates, or unsupported CIGAR operations const bam_record* pBamAlignment = NULL; while(true) { // skip if there are no more alignments available if(!mpInStream->next()) return false; // retrieve the next alignment pBamAlignment = mpInStream->get_record_ptr(); // skip unwanted alignments if(IsGoodAlignment(pBamAlignment, mReferenceLen, mIsFilterUnanchored)) break; } // get the orientations and set the index const bool onForwardStrand = pBamAlignment->is_fwd_strand(); const bool mateOnForwardStrand = pBamAlignment->is_mate_fwd_strand(); cc::Match::Strand referenceStrand = (onForwardStrand ? cc::Match::Forward : cc::Match::Reverse); // get the reference IDs const int referenceNum = pBamAlignment->target_id(); const int mateReferenceNum = pBamAlignment->mate_target_id(); string chromosome = mpInStream->target_id_to_name(referenceNum); // set the read name, barcode sequence, read number, and filter status AddReadName(al, pBamAlignment->qname(), mUseCasavaReadNameStyle); AddIndex(pBamAlignment->get_string_tag("BC"), al); al.setReadNumber((unsigned int)pBamAlignment->read_no()); al.setPassed(!pBamAlignment->is_filter()); if(pBamAlignment->is_unmapped()) { // ====================== // handle unaligned reads // ====================== // unaligned reads are always given in the forward orientation AddBases(pBamAlignment->get_bam_read(), al, true); AddBaseQualities(pBamAlignment->qual(), al.getLength(), al, true); // set the match descriptor and SE AQ al.setMatchDescriptor("-"); al.setSingleReadScore(-1); // if the other mate was aligned, set this mate to the opposite orientation // N.B. This sucks because it assumes a paired-end protocol rather than mate-pair if(pBamAlignment->is_paired() && (!pBamAlignment->is_mate_unmapped())) { referenceStrand = (mateOnForwardStrand ? cc::Match::Reverse : cc::Match::Forward); } // assume that we have an NM read chromosome = "NM"; // check for QC tag const char* qcTag = pBamAlignment->get_string_tag("XC"); if(qcTag) chromosome = "QC"; // check if we have too many matches (seed errors instead of chromosome name) int32_t seedErrors[3]; bool checkSeedErrors[3]; checkSeedErrors[0] = pBamAlignment->get_num_tag("H0", seedErrors[0]); checkSeedErrors[1] = pBamAlignment->get_num_tag("H1", seedErrors[1]); checkSeedErrors[2] = pBamAlignment->get_num_tag("H2", seedErrors[2]); if(checkSeedErrors[0] && checkSeedErrors[1] && checkSeedErrors[2]) { ostringstream sb; sb << seedErrors[0] << ':' << seedErrors[1] << ':' << seedErrors[2]; chromosome = sb.str(); } // set the match data cc::Match match(chromosome, "", pBamAlignment->pos(), referenceStrand, true); al.setMatch(match); } else { // ==================== // handle aligned reads // ==================== AddBases(pBamAlignment->get_bam_read(), al, onForwardStrand); AddBaseQualities(pBamAlignment->qual(), al.getLength(), al, onForwardStrand); // get the reference position const int referencePos = pBamAlignment->pos(); // set the single-end alignment quality al.setSingleReadScore(pBamAlignment->se_map_qual()); // convert the BAM CIGAR to CASAVA match descriptor // // N.B. IsGoodAlignment is used to ensure that only supported CIGAR // strings make it this far. The md is set to "UNSUPPORTED" if an // illegal CIGAR operation is encountered below. ALIGNPATH::path_t apath; string md; bam_cigar_to_apath(pBamAlignment->raw_cigar(), pBamAlignment->n_cigar(), apath); ALIGNPATH::apath_to_export_md(apath, mpReferenceBases, mpReferenceEnd, referencePos, al.getData().c_str(), onForwardStrand, md); al.setMatchDescriptor(md); if(pBamAlignment->is_paired()) { // --------------------------------- // handle paired-end/mate-pair reads // --------------------------------- // set the paired-end alignment quality al.setPairedReadScore(pBamAlignment->pe_map_qual()); // get the aligned mate position and orientation int matePos = pBamAlignment->mate_pos(); cc::Match::Strand mateStrand = (pBamAlignment->is_mate_fwd_strand() ? cc::Match::Forward : cc::Match::Reverse); // set the mate chromosome only if it occurs on a different chromosome string mateChromosome; if(mateReferenceNum == -1) { // NM mateChromosome.clear(); mateStrand = cc::Match::NM; } else if(mateReferenceNum != referenceNum) { // mate occurs on a different chromosome mateChromosome = mpInStream->target_id_to_name(pBamAlignment->mate_target_id()); } else { // mate occurs on the same chromosome matePos -= referencePos; } // set the paired-end match data cc::Match mateMatch(mateChromosome, "", matePos, mateStrand, true); al.setPartnerMatch(mateMatch); } else { // ----------------------- // handle single-end reads // ----------------------- al.setIsPaired(false); } // set the match data cc::Match match(chromosome, "", referencePos, referenceStrand, true); al.setMatch(match); } return true; } // returns true if the alignment contains supported CIGAR operations, does not fail QC, and is not a duplicate bool BamAlignmentReader::IsGoodAlignment(const bam_record* pBamAlignment, const uint32_t refLen, const bool isFilterUnanchored) { // check for PCR duplicates and alignments that fail vendor QC if(pBamAlignment->is_dup() || pBamAlignment->is_filter()) return false; // check the cigar operations const uint32_t* rawCigar = pBamAlignment->raw_cigar(); for(uint32_t i = 0; i < pBamAlignment->n_cigar(); i++) { const int op = rawCigar[i] & BAM_CIGAR_MASK; if((op == BAM_CREF_SKIP) || (op == BAM_CSOFT_CLIP) || (op == BAM_CHARD_CLIP) || (op == BAM_CPAD)) return false; } // ignore reads that straddle the endpoint (circular genome) const uint32_t alignmentBegin = pBamAlignment->pos(); const uint32_t alignmentEnd = alignmentBegin + pBamAlignment->read_size(); if((alignmentBegin < refLen) && (alignmentEnd >= refLen)) return false; // optionally ignore reads from unanchored pairs: if(isFilterUnanchored and pBamAlignment->is_unanchored()) { return false; } return true; } // returns true if the underlying file stream is open bool BamAlignmentReader::IsOpen(void) const { return mIsOpen; } // opens the underlying file stream void BamAlignmentReader::Open(const std::string& bamFilename, const std::string& refFilename, const char* bamRegion) { Close(); // open the BAM file mFilename = bamFilename; mpInStream = new bam_streamer(bamFilename.c_str(), bamRegion); // retrieve the reference get_ref_seq(refFilename.c_str(),mReferenceBases); standardize_ref_seq(refFilename.c_str(),mReferenceBases); mReferenceLen = mReferenceBases.size(); mpReferenceBases = mReferenceBases.c_str(); mpReferenceEnd = mpReferenceBases + mReferenceLen - 1; // set the isOpen flag mIsOpen = true; } } }