#ifndef FORCE_DEBUG #define NDEBUG #endif #include #include "base/CommandLineParser.h" #include "aligns/KmerAlignCore.h" #include "analysis/DNAVector.h" #include #include "base/FileParser.h" #include "util/mutil.h" #include "analysis/KmerTable.h" bool Irregular(char l) { if (l == 'A' || l == 'C' || l == 'G' || l == 'T') return false; //cout << "Irregular char: " << l << endl; return true; } int FindPartner(svec & ids, const string & name) { char tmp[256]; strcpy(tmp, name.c_str()); tmp[strlen(tmp)-1] = '1'; IDS dummy; dummy.SetName(tmp); int index = BinSearch(ids, dummy); return index; } int FindPartnerRead(vecDNAVector & dna, const string & name) { char tmp[256]; strcpy(tmp, name.c_str()); tmp[strlen(tmp)-1] = '1'; int index = dna.NameIndex(tmp); return index; } bool IsFW(const string & name) { if (name[name.size()-1] == '2') { //cout << "Add fw " << name << endl; return true; } return false; } class ReadPlaces { public: ReadPlaces() { } void AddFW(int i, int pos) { m_fw.push_back(i); m_fwPos.push_back(pos); } void AddRC(int i, int pos) { m_rc.push_back(i); m_rcPos.push_back(pos); } int GetFWCount() const {return m_fw.isize();} int GetFW(int i) const {return m_fw[i];} int GetFWPos(int i) const {return m_fwPos[i];} int GetRCCount() const {return m_rc.isize();} int GetRC(int i) const {return m_rc[i];} int GetRCPos(int i) const {return m_rcPos[i];} int GetRCPosByRead(int read) const { for (int i=0; i m_fw; svec m_fwPos; svec m_rc; svec m_rcPos; }; void ReadThePlaces(svec & plac, const string & file) { CMReadFileStream f; f.Open(file.c_str()); int n; f.Read(n); plac.resize(n); for (int i=0; i & plac, const string & file) { CMWriteFileStream f; f.Open(file.c_str()); f.Write(plac.isize()); for (int i=0; i aStringCmmd("-i","read fasta file"); commandArg fStringCmmd("-f","fasta file"); commandArg oStringCmmd("-o","fasta output"); commandArg kCmmd("-k","kmer size", 24); commandArg strandCmmd("-doublestrand","not strand-specific", false); commandArg padCmmd("-pad","pad witn N's if no merge", false); commandArg contCmmd("-cont","continue w/ previous run", false); //commandArg cCmmd("-nc","do not fully connected graph", false); commandLineParser P(argc,argv); P.SetDescription("Assembles k-mer sequences."); P.registerArg(aStringCmmd); P.registerArg(fStringCmmd); P.registerArg(oStringCmmd); P.registerArg(kCmmd); P.registerArg(strandCmmd); P.registerArg(padCmmd); P.registerArg(contCmmd); //P.registerArg(cCmmd); P.parse(); string aString = P.GetStringValueFor(aStringCmmd); string fString = P.GetStringValueFor(fStringCmmd); string oString = P.GetStringValueFor(oStringCmmd); int k = P.GetIntValueFor(kCmmd); bool bStrand = P.GetBoolValueFor(strandCmmd); bool bCont = P.GetBoolValueFor(contCmmd); bool bPad = P.GetBoolValueFor(padCmmd); string placementFile = fString + ".placements"; int i, j; //vecbasevector contigBases; vecDNAVector seq; seq.Read(aString); vecDNAVector dna; dna.Read(fString); int readlen = seq[0].isize(); KmerSequence kmers(k, &seq); if (!bCont) { kmers.AddRestrict(seq, dna); //kmers.Add(seq); } else { cout << "Continuing..." << endl; } long long m = kmers.GetBoundValue(); vecDNAVector out; //dna.Read(fString); //svec used; //used.resize(seq.isize(), 0); int offset = seq[0].isize(); int broken = 0; int extra = 100; svec contigs; contigs.resize(seq.size(), -1); svec places; if (!bCont) { places.resize(dna.size()); } else { cout << "Reading placements..." << endl; ReadThePlaces(places, placementFile); cout << "done." << endl; } int maxDist = 400; cout << "Assigning contigs to reads." << endl; for (i=0; i ids; int edge = 0; kmers.BasesToNumberCountPlus(ids, n1, sub, edge); for (int x=0; x 0) continue; double ff = FracAlign(one, j, seq[ids[x].ID()], ids[x].Start()); if (ff > 0.98) { if (IsFW(seq.Name(ids[x].ID()))) { pl.AddFW(ids[x].ID(), one.isize()-j); } else { pl.AddRC(ids[x].ID(), j+readlen); } contigs[ids[x].ID()] = i; } } } } } //if (!bCont) //WriteThePlaces(places, placementFile); for (i=0; i possible; possible.resize(dna.size(), 0); //cout << "Checking contig " << i << endl; ReadPlaces & pl = places[i]; double currCov = (double)(pl.GetFWCount() + pl.GetRCCount() * readlen)/(double)d.isize(); int best = 0; int max = 0; for (j=0; j maxDist) continue; possible[cont]++; if (possible[cont] > max) { max = possible[cont]; best = cont; } } if (max > 2) { DNAVector & second = dna[best]; double otherCov = (double)(places[best].GetFWCount() + places[best].GetRCCount() * readlen)/(double)second.isize(); if (IsJoinOK(currCov, otherCov, max)) { cout << "Joining " << dna.Name(i) << " to " << dna.Name(best) << " support: " << max << endl; int firstLen = d.isize(); int secondLen = second.isize(); bool bMerged = false; if (d.Append(second, 12, 25, 0.95)) { cout << "Merged sequences w/o N's." << endl; second.resize(0); bMerged = true; } else { if (bPad) { cout << "Padding with N's" << endl; DNAVector nn = d; nn.resize(d.isize() + 8); for (j=d.isize(); j