// Copyright 2009 Toshiyuki Sato // Setup: // First, read a genetic code with either operator>> or fromString. // Then, initialize with either codeTableSet or initCodons. #ifndef GENETICCODE_HH #define GENETICCODE_HH #include #include #include #include #include // size_t namespace cbrc{ typedef unsigned char uchar; class Alphabet; const int maxDnaAlphabetSize = 54; class GeneticCode{ private: std::string AAs; std::string Base[3]; std::vector genome2residue; std::vector genome2residueWithoutLowercaseMasking; uchar codonToAminoAcid[256]; static int codon2number(const uchar *codon) { const int n = maxDnaAlphabetSize; return n * n * codon[0] + n * codon[1] + codon[2]; } static int codon2number2( const uchar* codon, const Alphabet& dnaAlph ); friend std::istream& operator>>( std::istream& stream, GeneticCode& codon ); public: // Converts a name to a text string defining a genetic code. // If the name isn't known, it assumes it's a file and tries to read it. static std::string stringFromName(const std::string &name); void fromString( const std::string& s ); // Setup translation from DNA to amino acids void codeTableSet( const Alphabet& aaAlph, const Alphabet& dnaAlph ); // Setup translation from DNA to codons (0=aaa, 1=aac, ..., 63=ttt, // 64=delimiter, 65=unknown). Also setup codonToAminoAcid. Any DNA // triplet with non-ACGT (or with lowercase if isMaskLowercase is // true) gets translated to 65=unknown. // If isMaskLowercase and isUnmaskLowercase are both true, also // setup translateWithoutMasking. void initCodons( const uchar *ntToNumber, const uchar *aaToNumber, bool isMaskLowercase, bool isUnmaskLowercase ); void translate( const uchar* beg, const uchar* end, uchar* dest ) const; void translateWithoutMasking( const uchar* beg, const uchar* end, uchar* dest ) const; uchar translation( const uchar* codon ) const { return genome2residue[ codon2number( codon ) ]; } const uchar *getCodonToAmino() const { return codonToAminoAcid; } }; // Convert an amino-acid (translated) coordinate to a DNA coordinate inline size_t aaToDna( size_t aaCoordinate, size_t frameSize ){ if( frameSize == 0 ) return aaCoordinate; // for non-translated sequences size_t frame = aaCoordinate / frameSize; size_t offset = aaCoordinate % frameSize; return frame + offset * 3; } // Convert a DNA coordinate to an amino-acid (translated) coordinate inline size_t dnaToAa( size_t dnaCoordinate, size_t frameSize ){ if( frameSize == 0 ) return dnaCoordinate; // for non-translated sequences size_t frame = dnaCoordinate % 3; size_t offset = dnaCoordinate / 3; return frame * frameSize + offset; } // Convert begin and end coordinates to a size and a frameshift inline void sizeAndFrameshift( size_t beg, size_t end, size_t frameSize, // 0 means not translated size_t& size, size_t& frameshift ){ if( frameSize ){ // if it's a translated sequence: size_t dnaBeg = aaToDna( beg, frameSize ); size_t dnaEnd = aaToDna( end, frameSize ); size_t dnaSize = dnaEnd - dnaBeg; assert( dnaBeg <= dnaEnd + 1 ); // allow a -1 frameshift size = ( dnaSize + 1 ) / 3; frameshift = ( dnaSize + 3 ) % 3; } else{ // if it's not a translated sequence: assert( beg <= end ); size = end - beg; frameshift = 0; } } } // end namespace cbrc #endif