//#ifndef FORCE_DEBUG //#define NDEBUG //#endif #include #include "base/CommandLineParser.h" #include "aligns/KmerAlignCore.h" #include "analysis/DNAVector.h" #include #include "analysis/TranscriptomeGraph.h" //int K = 12; class SeqChunk { public: SeqChunk() { m_size = 100000; m_counter = 0; // cerr << "Initial resizing m_seq to " << m_size << endl; m_seq.resize(m_size, 0); } long long Offset() const {return m_counter;} long long lsize() const {return m_size;} const char * Seq(long long i) const { if (i >= m_seq.isize()) { cerr << "Error! i=" << i << " size=" << m_seq.isize() << endl; return NULL; } return &m_seq[i]; } bool Add(long long & from, long long & to, const string & s) { //cout << "Adding " << s << endl; //cout << "m_counter=" << m_counter << endl; long long len = strlen(s.c_str()); if (len + m_counter >= m_size) { // cerr << "Overflow: " << len + m_counter << endl; return false; } from = m_counter; if (m_seq.lsize() == 0) { //cerr << "Resizing m_seq to " << m_size << endl; m_seq.resize(m_size, 0); } for (long long i=0; i m_seq; long long m_counter; long long m_size; }; class KmerSequence; KmerSequence * GetGlobalSeq(); class KmerEntry { public: KmerEntry() {m_index = -1;} KmerEntry(long long i) {m_index = i;} void SetIndex(long long index) { m_index = index; } long long Index() const {return m_index;} bool operator < (const KmerEntry & k) const; private: long long m_index; }; class KmerSequence { public: KmerSequence() { m_k = 24; m_seq.resize(256); m_count = 0; m_kmerIndex = 0; } void Clear() { m_count = 0; m_kmerIndex = 0; m_seq.clear(); m_seq.resize(256); m_kmers.clear(); m_current = ""; } void SetK(long long k) {m_k = k;} long long K() const {return m_k;} long long GetBoundValue() const {return m_kmerIndex+2;} const char * Sequence(long long i) const { if (i == -1) { //cout << "Called 'Sequence', returning " << m_current << endl; return m_current.c_str(); } long long index = i / m_seq[0].lsize(); long long i2 = i - index * m_seq[0].lsize(); //cout << "index=" << index << " offset=" << i2 << endl; if (index > m_seq.isize()) { cerr << "ERROR! " << index << " " << m_seq.isize() << endl; } const SeqChunk & c = m_seq[index]; const char * pRet = c.Seq(i2); if (pRet == NULL) { cerr << "Err: index=" << index << " i2=" << i2 << endl; } return pRet; } void Add(const string & line) { long long from = -1; long long to = -1; if (!m_seq[m_count].Add(from, to, line)) { m_count++; // cerr << "Added chunk " << m_count << endl; m_seq[m_count].Add(from, to, line); } long long index = from + m_count * m_seq[m_count].lsize(); long long len = strlen(line.c_str()); for (long long i=0; i<=len-m_k; i++) { if (m_kmerIndex >= m_kmers.lsize()) { //cerr << "Resizing to " << m_kmerIndex + 100000 << endl; m_kmers.resize(m_kmerIndex + 100000); //cerr << "size=" << m_kmers.lsize() << endl; } m_kmers[m_kmerIndex].SetIndex(index + i); m_kmerIndex++; } } void Setup() { //cerr << "Sorting k-mers" << endl; long long i; //for (i=0; i= m_kmers.isize()) //cout << "ERROR: " << ret << endl; //if (ret < 0) //ret = m_kmerIndex+1; return ret; } private: svec m_seq; svec m_kmers; long long m_kmerIndex; long long m_count; long long m_k; string m_current; }; KmerSequence * GetGlobalSeq() { static KmerSequence seq; return &seq; } bool KmerEntry::operator < (const KmerEntry & k) const { long long i; //cout << "my index=" << m_index << " comp=" << k.Index() << endl; const char * pOne = GetGlobalSeq()->Sequence(m_index); const char * pTwo = GetGlobalSeq()->Sequence(k.Index()); int n = GetGlobalSeq()->K(); for (i=0; i pTwo[i]) return false; if (pOne[i] < pTwo[i]) return true; } return false; } //======================================================================== //======================================================================== //======================================================================== class KmerNode { public: KmerNode(long long id, long long prev, long long mult, const DNAVector & d) { m_id = id; m_prev = prev; m_kmer = d; m_mult = mult; } const DNAVector & Kmer() const {return m_kmer;} long long ID() const {return m_id;} long long Prev() const {return m_prev;} long long Mult() const {return m_mult;} private: DNAVector m_kmer; long long m_id; long long m_prev; long long m_mult; }; class KmerGraph { public: KmerGraph() { m_high = 0; } long long Add(long long id, long long last, long long supp, const DNAVector & d) { //cout << "Adding node " << id << " last=" << last << endl; m_nodes.push_back(KmerNode(id, last, supp, d)); return m_high; } long long Add(long long last, long long supp, const DNAVector & d) { //cout << "Adding node last=" << last << endl; m_nodes.push_back(KmerNode(m_high, last, supp, d)); m_high++; return m_high-1; } void Print(FILE * pOut, const svec & ids) const { if (m_nodes.lsize() < 2) return; //cout << "Prlong longing graph (verbose) " << endl; //fprlong longf(pOut, "Partial graph\n"); long long i, j; bool bCont = false; if (m_nodes.lsize() >= 10 || ids.lsize() > 1) { bCont = true; } if (!bCont) return; for (i=0; i= 10); } void Clear() { m_nodes.clear(); //m_high = 0; } private: svec m_nodes; long long m_high; }; class KmerSearch { public: KmerSearch(long long k, bool connect) { m_k = k; //TranslateBasesToNumberExact trans; //trans.SetSize(k); long long m = GetGlobalSeq()->GetBoundValue(); m_used.resize(m, 0); m_usedLocal.resize(m, 0); m_graphID.resize(m, -1); m_connect = connect; if (!connect) m_local.reserve(100000); m_counter = 0; } void Extend(long long last, DNAVector & seq, const svec & count, DNAVector & longest); void Clear(); long long Used(long long i) const {return m_used[i];} void SetUsed(long long i) {m_used[i]++;} void ShiftLeft(vecDNAVector & out, const DNAVector & in); void Print(FILE * p) { m_branch.push_back(m_counter); //cout << "Before: " << m_branch.isize() << endl; UniqueSort(m_branch); //cout << "After: " << m_branch.isize() << endl; m_graph.Print(p, m_branch); } private: void Shift(vecDNAVector & out, const DNAVector & in); void Right(DNAVector & out, const DNAVector & in); long long m_k; svec m_used; svec m_usedLocal; svec m_local; svec m_graphID; svec m_branch; KmerGraph m_graph; bool m_connect; long long m_counter; bool m_bPrinted; }; bool Irregular(char l) { if (l == 'A' || l == 'C' || l == 'G' || l == 'T') return false; //cout << "Irregular char: " << l << endl; return true; } void KmerSearch::Clear() { if (m_graph.Valid()) m_counter++; long long i; if (!m_connect) { for (i=0; i & count, const DNAVector & r, int from) { long long cc = 0; long long num = GetGlobalSeq()->BasesToNumber(r, from); if (num >= 0) cc += count[num]; //DNAVector tmp; //tmp.SetToSubOf(r, from, GetGlobalSeq()->K()); //tmp.ReverseComplement(); //num = GetGlobalSeq()->BasesToNumber(tmp, 0); //if (num >= 0) //cc += count[num]; return cc; } void KmerSearch::Extend(long long last, DNAVector & seq, // kmer const svec & count, // counts of kmers DNAVector & longest) { //TranslateBasesToNumberExact trans; //trans.SetSize(m_k); DNAVector curr; Right(curr, seq); long long currNum = GetGlobalSeq()->BasesToNumber(curr, 0); if (currNum < 0) return; long long support = count[currNum]; //if (support < 3) //return; last = m_graph.Add(last, support, curr); m_used[currNum]++; m_graphID[currNum] = m_counter; m_usedLocal[currNum] = last; if (!m_connect) m_local.push_back(currNum); vecDNAVector right; Shift(right, seq); static int plusminus = 0; //cout << "Entering Extend" << endl; /*cout << "+"; plusminus++; if (plusminus > 80) { cout << endl; plusminus = 0; }*/ long long valid = 0; long long hi = 0; long long hi_index = -1; for (size_t j=0; jBasesToNumber(r, 0); //long long num = GetFullCount(count, r, 0); long long cc = 0; if (num >= 0) cc = count[num]; if (cc > hi) { hi = cc; hi_index = j; } } if (hi_index > 0) { //cout << "!!!!!" << endl; DNAVector tmp = right[0]; right[0] = right[hi_index]; right[hi_index] = tmp; } for (size_t j=0; j longest.lsize()) { //cout << "Got seq, l=" << seq.isize() << endl; longest = seq; } /* cout << "-"; plusminus++; if (plusminus > 80) { cout << endl; plusminus = 0; } */ /* if (valid == 0 && seq.isize() > 100) { cout << ">Hypothesis" << endl; for (j=0; j0 && j % 80 == 0) cout << endl; cout << seq[j]; } cout << endl; }*/ } //==================================================== int TranscriptomeGraph(vecDNAVector & seq, FILE * pOut, int k, bool connect) { bool bAppend = true; //cerr << "Sequences: " << seq.isize() << endl; //vecbasevector contigBases; //vecDNAVector seq; //seq.Read(aString); size_t i; if (seq.size() == 1) { // only one sequence, the graph is simple: linear set of overlapping kmers /*FILE * pOut = NULL; if (bAppend) pOut = fopen(oString.c_str(), "a"); else pOut = fopen(oString.c_str(), "w"); */ DNAVector & d = seq[0]; for (i=0; i<=d.isize()-k; i++) { fprintf(pOut, "%lu\t%lu\t1\t", i, i-1); //cout << i << "\t" << i-1 << "\t1\t"; for (size_t x=i; xAdd(tmp); delete [] tmp; // why is below disabled? Do we need it if not strand-specific data? (or do we accommodate this elsewhere?) /* d.ReverseComplement(); for (j=0; jAdd(tmp); */ } // trans.SetSize(k); GetGlobalSeq()->Setup(); long long m = GetGlobalSeq()->GetBoundValue(); //cout << "Bound value: " << m << endl; svec counts; counts.resize(m, 0); //cout << "WARNING: no check for left seed extensions!!" << endl; for (i=0; iBasesToNumber(d, 0); //cerr << "iworm(" << i << "," << j << ") = " << d.AsString() << " set to num2: " << num2 << endl; if (num2 < 0 || search.Used(num2) > 0) continue; /* vecDNAVector left; search.ShiftLeft(left, d); bool bLeft = false; for (long long y=0; y 0 && search.Used(num2) == 0) bLeft = true; } if (bLeft) continue; */ search.Clear(); //cout << "Trying..." << endl; DNAVector longest; search.Extend(-1, d, counts, longest); /* if (longest.lsize() > 100) { cout << ">Sequence_" << i << endl; for (long long x=0; x0 && x % 80 == 0) cout << endl; cout << longest[x]; } cout << endl; }*/ //cout << "Printing." << endl; search.Print(pOut); search.SetUsed(num2); } } //cout << "All done!" << endl; //fclose(pOut); return 0; }