#include #include #include #include "alignment/GlobalUtilities.hh" extern "C" { #include void bam_view1(const bam_header_t *header, const bam1_t *b); } #include "applications/Sam2Export.hh" /** * Project : CASAVA * Module : $RCSfile: Sam2Export.cpp,v $ * @author : Roman Petrovski * Copyright : Copyright (c) Illumina 2008, 2009. All rights reserved. * ** This software is covered by the "Illumina Genome Analyzer Software ** License Agreement" and the "Illumina Source Code License Agreement", ** and certain third party copyright/licenses, and any user of this ** source file is bound by the terms therein (see accompanying files ** Illumina_Genome_Analyzer_Software_License_Agreement.pdf and ** Illumina_Source_Code_License_Agreement.pdf and third party ** copyright/license notices). * */ #include #include #include #include "common/Alignment.hh" namespace ca { namespace applications { using namespace std; using namespace casava::common; /* char solexa2Phred[] = { 34, 34, 35, 35, 36, 36, 37, 37, 38, 38, 39, 40, 41, 42, 43, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, }; */ // for(char solexaQval = -5; solexaQval < 60; ++ solexaQval) // { // double samQval = (10.0 * log(1 + pow(10.0, solexaQval / 10.0)) / log(10)); // cout << 33 + (char)(samQval + 0.499) << endl; // double solexaQval1 = (log10( exp((samQval) * log(10) / 10.0) - 1) * 10.0); // // if(solexaQval != (char)(solexaQval1 + (solexaQval1 > 0 ? 0.499 : -0.499))) // { // cout << "Mismatch" << endl; // } // } // for(char phredQval = 0; phredQval < 65; ++ phredQval) // { // double solexaQval = (log10( exp((phredQval) * log(10) / 10.0) - 1) * 10.0); // cout << 64 + (char)(solexaQval + 0.499) << ", "; // double samQval = (10.0 * log(1 + pow(10.0, solexaQval / 10.0)) / log(10)); // // if(phredQval != (char)(samQval + (samQval > 0 ? 0.499 : -0.499))) // { // cout << "Mismatch" << endl; // } // } /** * Computed table. when converting export to sam (using samtools export2sam.pl) and back, * the I returns as J. * TODO: have a note explaining why this is not one to one with solexa */ /* char phred2Solexa[] = { 0, 59, 63, 64, 66, 67, 69, 70, 71, 72, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100,101,102,103, 104,105,106,107,108,109,110,111,112,113, 114,115,116,117,118,119,120,121,122,123, 124,125,126,127,128, }; */ template class ValueCast { const uint8_t *mBuffer; public: ValueCast(const uint8_t *s) : mBuffer(s) { } operator T() { return *(T*) mBuffer; } }; template<> class ValueCast { const uint8_t *mBuffer; public: ValueCast(const uint8_t *s) : mBuffer(s) { } operator const char*() { return (const char*) mBuffer; } }; const uint8_t *skipValue(uint8_t type, const uint8_t* s) { if (type == 'A') { ++s; } else if (type == 'C') { ++s; } // printf("i:%u", *s); else if (type == 'c') { ++s; } //printf("i:%d", *s); else if (type == 'S') { s += 2; } //printf("i:%u", *(uint16_t*)s); else if (type == 's') { s += 2; } //printf("i:%d", *(int16_t*)s); else if (type == 'I') { s += 4; } //printf("i:%u", *(uint32_t*)s); else if (type == 'i') { s += 4; } //printf("i:%d", *(int32_t*)s); else if (type == 'f') { s += 4; } //printf("f:%g", *(float*)s); else if (type == 'Z' || type == 'H') //{ printf("%c:", type); while (*s) putchar(*s++); ++s; } { while (*s++) ; } return s; } template T getKeyValue(const bam1_t &b, const char *searchKey, T defval) { const uint8_t *s = bam1_aux(&b); while (s < b.data + b.data_len) { const uint8_t *foundKey = s; s += 2; uint8_t type = *s; ++s; if (*(const uint16_t*) searchKey == *(const uint16_t*) foundKey) { return ValueCast (s); } s = skipValue(type, s); } return defval; } struct SamRecord { std::string sequence; std::string qval; casava::common::Match::Strand strand; casava::common::Match::Strand mstrand; std::string qname; std::string machineName; bool paired; int readNumber; std::string rname; std::string mrnm; int pos; int mpos; uint32_t mapq; unsigned int ilmnLane; unsigned int ilmnTile; unsigned int ilmnRunNumber; std::string ilmnIndex; int ilmnClusterX; int ilmnClusterY; std::string ilmnMatchDescriptor; int ilmnSingleReadAlignScore; }; boost::char_separator colon(":"); typedef boost::tokenizer > tokenizer; void complement(char &c) { c = reverseCharASCII[(int)c];//TODO:check if export files can hav "atcgnN" (small ones and n) } bool readSamRecord(tamFile fp, bam_header_t *header, SamRecord &sr) { bam1_t b = { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 0, 0, 0, 0 }; if (sam_read1(fp, header, &b) < 0) { return false; } bam_view1(header, &b); //TODO remove this debug output uint8_t *s = bam1_seq(&b), *t = bam1_qual(&b); sr.sequence.reserve(b.core.l_qseq); sr.sequence.resize(0); sr.qval.reserve(b.core.l_qseq); sr.qval.resize(0); for (int i = 0; i < b.core.l_qseq; ++i) { sr.sequence += bam_nt16_rev_table[bam1_seqi(s, i)]; //sr.qval += phred2Solexa[t[i]]; sr.qval += (t[i] + 31); } if(!!(b.core.flag & 0x0010)) { sr.strand = casava::common::Match::Reverse; std::reverse(sr.sequence.begin(), sr.sequence.end()); std::for_each(sr.sequence.begin(), sr.sequence.end(), complement); std::reverse(sr.qval.begin(), sr.qval.end()); } else { sr.strand = casava::common::Match::Forward; } if(!!(b.core.flag & 0x0020)) { sr.mstrand = casava::common::Match::Reverse; } else { sr.mstrand = casava::common::Match::Forward; } sr.qname = bam1_qname(&b); tokenizer tokens(sr.qname, colon); std::vector tokensArray(tokens.begin(), tokens.end()); sr.machineName = tokensArray[0]; sr.ilmnLane = sr.ilmnTile = sr.ilmnRunNumber = 1; sr.ilmnClusterX = sr.ilmnClusterY = 1; if (5 == tokensArray.size()) { //Looks like QNAME composed of Illumina export file columns sr.ilmnLane = boost::lexical_cast(tokensArray[1]); sr.ilmnTile = boost::lexical_cast(tokensArray[2]); sr.ilmnClusterX = boost::lexical_cast(tokensArray[3]); sr.ilmnClusterY = boost::lexical_cast(tokensArray[4]); std::string::size_type posRunNr = sr.machineName.rfind('_'); if (std::string::npos != posRunNr) { try { sr.ilmnRunNumber = boost::lexical_cast( sr.machineName.substr(posRunNr + 1)); sr.machineName.erase(posRunNr); } catch (boost::bad_lexical_cast &) { } //Couldn't interpret as read number... no big deal } }// otherwise leave dummy values sr.paired = !!(b.core.flag & 0x0001); sr.readNumber = sr.paired ? (b.core.flag >> 6) & 0x0003 : 0; //TODO: resolving rname through header makes no sense for SAM files //TODO: header is built on index. samtools index fasta produces improper name on E_coli.fa sr.rname = (b.core.tid < 0) ? "" : header->target_name[b.core.tid]; sr.mrnm = (b.core.mtid < 0) ? "" : (b.core.mtid == b.core.tid) ? "" : header->target_name[b.core.mtid]; sr.pos = b.core.pos + 1; //though in format it's 1-based in bam1_core_t it's 0-based sr.mpos = b.core.mpos + 1; //though in format it's 1-based in bam1_core_t it's 0-based //TODO can't print blank index. 0 prints. const char * index = getKeyValue(b, "BC", ""); //TODO: check if this works on different endianness sr.ilmnIndex = index[0] ? boost::lexical_cast(index) : 0; sr.ilmnMatchDescriptor = getKeyValue(b, "MD", ""); sr.ilmnSingleReadAlignScore = getKeyValue(b, "SM", 0); sr.mapq = b.core.qual; return true; } int Sam2Export::run() { cout << "[" << "Start" << "]\n"; const casava::common::Tile customTile("a name", 11, 12, 13); const casava::common::Spot customSpot(customTile, 24, 25); std::string samFilePath = _options.inputReadsFilePath(); std::string indexPath = _options.indexPath(); std::string exportFilePath = _options.outputFilePath(); std::string export2FilePath = _options.output2FilePath(); casava::common::AlignmentWriter alignmentWriter(exportFilePath, casava::common::CompressionFactory::none()); if (alignmentWriter.is_open() && alignmentWriter.good()) { cout << "Writing " << exportFilePath << "\n"; } else { cout << "Error: failed to open: " << exportFilePath << "\n"; exit(1); } casava::common::AlignmentWriter alignment2Writer(casava::common::CompressionFactory::none()); if(!export2FilePath.empty()){ alignment2Writer.open(export2FilePath); if (alignment2Writer.is_open() && alignment2Writer.good()) { cout << "Writing " << export2FilePath << "\n"; } else { cout << "Error: failed to open: " << export2FilePath << "\n"; exit(1); } } bam_header_t *header = sam_header_read2(indexPath.c_str()); tamFile fp = sam_open(samFilePath.c_str()); bam1_t b = { { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, 0, 0, 0, 0 }; setvbuf(stdout, NULL, _IONBF, 0); SamRecord sr; while (readSamRecord(fp, header, sr)) { casava::common::Tile customTile(sr.machineName, sr.ilmnRunNumber, sr.ilmnLane, sr.ilmnTile); casava::common::Spot customSpot(customTile, sr.ilmnClusterX, sr.ilmnClusterY); casava::common::Match match(sr.rname, "", //TODO: what are contigs? sr.pos, sr.strand, true); casava::common::Sequence read1Sequence(customSpot, sr.ilmnIndex, sr.readNumber, sr.sequence, sr.qval, true); if (sr.paired) { casava::common::Match matchMate(sr.mrnm, "", sr.mpos - sr.pos, sr.mstrand, true); casava::common::Alignment readAlignment(read1Sequence, match, sr.ilmnMatchDescriptor, sr.ilmnSingleReadAlignScore, matchMate, sr.mapq, sr.paired); if (alignment2Writer.is_open() && 1 == sr.readNumber) { alignment2Writer << readAlignment; } else { alignmentWriter << readAlignment; } } else { casava::common::Alignment readAlignment(read1Sequence, match, sr.ilmnMatchDescriptor, sr.ilmnSingleReadAlignScore); alignmentWriter << readAlignment; } } free(b.data); sam_close(fp); bam_header_destroy(header); alignment2Writer.close(); alignmentWriter.close(); cout << "[" << "End" << "]\n"; return 0; } } } // end namespace casava{ namespace { applications