// -*- mode: c++; indent-tabs-mode: nil; -*- // /// \file /** * Project : CASAVA * Module : $RCSfile: SmallAssembler.cpp,v $ * @author : Ole Schulz-Trieglaff * Copyright : Copyright (c) Illumina 2008, 2009. All rights reserved. * ** This software is covered by the "Illumina Genome Analyzer Software ** License Agreement" and the "Illumina Source Code License Agreement", ** and certain third party copyright/licenses, and any user of this ** source file is bound by the terms therein (see accompanying files ** Illumina_Genome_Analyzer_Software_License_Agreement.pdf and ** Illumina_Source_Code_License_Agreement.pdf and third party ** copyright/license notices). * */ #include "common/Alignment.hh" #include "common/SequenceUtils.hh" #include "applications/SmallAssembler.hh" #include "assembly/SmallAssemblerImpl.hh" // switch on to print all assembled contigs. //#define LIST_CONTIGS 1 using namespace std; using namespace casava::common; namespace ca { namespace applications { void SmallAssembler::updateAlignment(Alignment& al, const int pos, const unsigned numContigs, const std::string& cseq) { const unsigned cseq_size(cseq.size()); Match ma(al.getMatch()); stringstream sst; sst << "IDC" << numContigs; ma.setChromosome(sst.str()); ma.setPosition(pos+1); // 1-indexed al.setMatch(ma); // update match desc const unsigned rl(al.getData().size()); const unsigned read_end(static_cast(rl)+pos); // read must overlap with contig: assert(pos(cseq_size)); assert(read_end>0); unsigned begin_overhang(0); unsigned end_overhang(0); // beginning of read does not overlap with contig: if(pos < 0){ begin_overhang = std::abs(pos); } //end of read does not overlap with contig: if( read_end > cseq_size ) { end_overhang=read_end-cseq_size; } const unsigned overhang(begin_overhang+end_overhang); assert(overhang<=rl); const unsigned matches(rl-overhang); sst.str(""); for(unsigned i(0);i cvec; SmallAssemblerImpl sai; sai.iterateWordLengths(asmOptions,shadows,cvec); #ifdef DEBUG_ASBL dbg_os << "Assembled " << cvec.size() << " contig(s) from " << shadows.size() << " shadow reads.\n" << endl; #endif const unsigned n_contigs(cvec.size()); for (unsigned i(0);i numContigReads) { cerr << "ERROR:: unexpected cluster output : aligned seed : " << numSeedReads << " " << numContigReads << endl; cerr << "contig=" << asmStats.numContigs << " " << cluster_info << endl; exit(EXIT_FAILURE); } /// this functions as an assertion -- should never happen: if (numContigReads > shadow_size) { cerr << "ERROR:: unexpected cluster output : cluster aligned : " << numContigReads << " " << shadow_size << endl; cerr << "contig=" << asmStats.numContigs << " " << cluster_info << endl; exit(EXIT_FAILURE); } #ifdef DEBUG_ASBL dbg_os << "# seeds aligned shadows : " << numSeedReads << " " << numContigReads << " " << shadow_size << endl; #endif // passed all tests! now print out contig reads and contig itself // print out reads contributing to contig, change export line // to include contig name, relative start position (which can // often be negative), and neutral match descriptor: // // also, recalculate min-max range for this contig from the // aligning reads: ++asmStats.numContigs; bool isFirstRead = true; int contigPosMin = 0; int contigPosMax = 0; // index vector of shadow reads participating in assembly vector > sortedContigReads; typedef map::const_iterator muici; const muici cre(ci.contigReads.end()); for(muici crj(ci.contigReads.begin());crj!=cre;++crj){ sortedContigReads.push_back(make_pair(crj->second,crj->first)); } // sort reads contributing to contig according to position sort(sortedContigReads.begin(),sortedContigReads.end()); for(unsigned j(0);j contigPosMax) { contigPosMax=almax; } } } // print out contig itself: // parse group number from shadow output file boost::char_separator sp2("|"); LineTokenizer tok2(cluster_info,sp2); LineTokenizer::iterator tt2(tok2.begin()); #ifdef LIST_CONTIGS cout << "|contig=" << asmStats.numContigs << endl; #endif #ifdef DEBUG_ASBL dbg_os << ">contig=" << asmStats.numContigs << "|" << *tt2 << "|reads=" << numContigReads << "|start=" << contigPosMin << "|end=" << contigPosMax << "\n"; dbg_os << cseq << "\n" << endl; #endif // write header contigs_os << ">contig=" << asmStats.numContigs << "|" << *tt2 << "|reads=" << numContigReads << "|start=" << contigPosMin << "|end=" << contigPosMax << "\n"; // write sequence contigs_os << cseq << endl; } } //TODO: refactor this. int SmallAssembler::run() { cout << "Starting small assembler" << endl; cout << "Input file : "<< _options.inputReadsFilePath() << endl; cout << "Output contig file : " << _options.outputFilePath() << endl; cout << "Output reads file : " << _options.outputReadsFilePath() << endl; //const SmallAssemblerOptions& asmOptions(_options.asmOptions); SmallAssemblerOptions& asmOptions(_options.asmOptions); //#ifdef DEBUG_ASBL //dbg_os << "word length " << asmOptions.wordLength << "\n" cout << "word length " << asmOptions.wordLength << "\n" << "max word length " << asmOptions.maxWordLength << "\n" << "min contig length " << asmOptions.minContigLength << "\n" << "min coverage " << asmOptions.minCoverage << "\n" << "max error " << asmOptions.maxError << "\n" << "min reads " << asmOptions.minSeedReads << endl; //#endif ifstream infile (_options.inputReadsFilePath().c_str()); if (!infile.is_open()) { cerr << "Could not open input read file : " << _options.inputReadsFilePath() << endl; return (1); } ofstream outfile(_options.outputFilePath().c_str()); if (!outfile.is_open()) { cerr << "Could not open output file : " << _options.outputFilePath() << endl; return (1); } ofstream creads(_options.outputReadsFilePath().c_str()); if (!creads.is_open()) { cerr << "Could not open output read file : " << _options.outputReadsFilePath() << endl; return (1); } // determine read length and update word length accordingly unsigned read_length = 0; ifstream infile2 (_options.inputReadsFilePath().c_str()); if (!infile2.is_open()) { cerr << "Could not open input read file to determine read length : " << _options.inputReadsFilePath() << endl; return (1); } while (!infile2.eof()) { string line; getline(infile2,line); // split the input line // TODO: use pipeline tokenizer? typedef boost::tokenizer > tokenizer; boost::char_separator sep(" "); tokenizer tok(line, sep); tokenizer::iterator tt = tok.begin(); // skip comment if (tok.begin() == tok.end() || *tt == "#") continue; if (*tt == "CLUSTER_START" || *tt == "CLUSTER_END") continue; int minOff; int maxOff; Alignment thisShadow; stringstream ss(line); ss >> minOff; ss >> maxOff; ss.ignore(); ss >> thisShadow; read_length = thisShadow.getData().size(); assert(read_length != 0); } cout << "Determined read length of " << read_length << endl; if (read_length < 100) { asmOptions.wordLength = 13; asmOptions.maxWordLength = 19; cout << "Changing default min word length to " << asmOptions.wordLength << endl; cout << "Changing default max word length to " << asmOptions.maxWordLength << endl; } // shadow reads, contains sequence and running number SmallAssemblerImpl::ShadowReadVec shadows; // stores the shadow reads and their alignments ShadowAlignmentVec shadowLines; AssemblerStats asmStats; bool is_in_usable_cluster(false); bool is_in_ignored_cluster(false); while (!infile.eof() ) { string line; getline (infile,line); // split the input line // TODO: use pipeline tokenizer? typedef boost::tokenizer > tokenizer; boost::char_separator sep(" "); tokenizer tok(line, sep); tokenizer::iterator tt = tok.begin(); // skip comments and empty lines if (tok.begin() == tok.end() || *tt == "#") continue; if (*tt == "CLUSTER_END") { if (is_in_ignored_cluster) { is_in_ignored_cluster = false; continue; } ++asmStats.numClusters; assert(is_in_usable_cluster); is_in_usable_cluster=false; ++tt; const std::string cluster_info(*tt); process_cluster(asmOptions,asmStats,shadows,shadowLines,cluster_info,creads,outfile); // write tag //creads << line << endl; // end of shadow read cluster, empty container shadows.clear(); shadowLines.clear(); } else if (*tt == "CLUSTER_START") { #ifdef DEBUG_ASBL dbg_os << "Line : " << line << endl; #endif // Consider any cluster type - whether or not `type' specified. bool type_key_found(false); bool usable_type_val_found(false); for ( ; tt != tok.end(); ++tt) { if (*tt == "type") { type_key_found = true; } else if (type_key_found) { // && (*tt == "Shadow/SemiAligned")) { usable_type_val_found = true; } } if (type_key_found && !usable_type_val_found) { #ifdef DEBUG_ASBL cerr << "Skipping cluster : " << line << endl; #endif is_in_ignored_cluster = true; continue; } #ifdef DEBUG_ASBL cerr << "Using cluster : " << line << endl; #endif assert(not is_in_usable_cluster); is_in_usable_cluster=true; // writeShadows(shadowLines,creads); // write tag // creads << line << endl; // discard all previously collected reads and their alignments //shadows.clear(); //shadowLines.clear(); } else { if (not is_in_usable_cluster) continue; // a shadow read, either in a cluster or outside. // all reads are written to the read file, but only the reads // between CLUSTER_START and CLUSTER_END are assembled. int minOff; int maxOff; Alignment thisShadow; stringstream ss(line); ss >> minOff; ss >> maxOff; ss.ignore(); ss >> thisShadow; //cout << "Read: " << line << endl; //cout << "minOff : " << minOff << " maxOff: " << maxOff << endl; // assert(minOff<=maxOff); // assert((minOff>=0) && (maxOff>=0)); // unlikely to happen as word length is adopted to fit the read length if(thisShadow.getData().size() < asmOptions.maxWordLength) { continue; } if (thisShadow.getMatch().getStrand()==Match::Reverse) { // is reverse match, take reverse complement of sequence string s = thisShadow.getData(); SequenceUtils::reverseComp(s); //shadows.push_back(make_pair(shadows.size(),s)); shadows.push_back(SmallAssemblerImpl::ShadowRead(s,shadows.size(),false)); } else { //shadows.push_back(make_pair(shadows.size(),thisShadow.getData())); shadows.push_back(SmallAssemblerImpl::ShadowRead(thisShadow.getData(),shadows.size(),false)); } // store shadow read lines pair coords(minOff,maxOff); shadowLines.push_back(make_pair(coords,thisShadow)); } } // while infile // all shadows parsed, close output streams. infile.close(); outfile.close(); creads.close(); cout << endl; cout << "----------------------------------- SUMMARY ---------------------------------------------" << endl; cout << "numClusters : " << asmStats.numClusters << " numContigs : " << asmStats.numContigs << endl; cout << "Discarded contigs : " << (asmStats.clustLen+asmStats.numSeeds+asmStats.others) << " (one shadow read cluster can give rise to several contigs)" << endl; cout << "Reasons : length: " << asmStats.clustLen << " numSeeds : " << asmStats.numSeeds << " others : " << asmStats.others << endl; return 0; } } // end of namespace } // end of namespace