/*  File: dna.h
 *  Author: Ed Griffiths (edgrif@sanger.ac.uk)
 *  Copyright (C) J Thierry-Mieg and R Durbin, 1998
 * -------------------------------------------------------------------
 * Acedb is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 * or see the on-line version at http://www.gnu.org/copyleft/gpl.txt
 * -------------------------------------------------------------------
 * This file is part of the ACEDB genome database package, written by
 *      Richard Durbin (Sanger Centre, UK) rd@sanger.ac.uk, and
 *      Jean Thierry-Mieg (CRBM du CNRS, France) mieg@kaa.crbm.cnrs-mop.fr
 *
 * Description: Declarations for the DNA base encoding (bit masks and
 *              encode/decode tables), DNA retrieval, codon translation
 *              and the DNA / FastA / ace dump and parse entry points.
 * Exported functions: see the prototypes below.
 * HISTORY:
 * Last edited: Dec  4 11:58 2003 (rnc)
 * * May 24 10:26 1999 (edgrif): Add some missing extern func decs.
 * * Jul 23 14:41 1998 (edgrif): Remove redeclarations of fmap functions.
 * * Jul 23 09:06 1998 (edgrif): I have added this header, which was
 *              missing (that's why it shows me as author). I have removed the
 *              declaration of fMapFindDNA which is now in fmaps public header.
 * Created: Thu Jul 23 09:06:15 1998 (edgrif)
 * CVS info:   $Id: dna.h,v 1.36 2004/04/09 22:23:22 mieg Exp $
 *-------------------------------------------------------------------
 */

#ifndef DEF_DNA_H
#define DEF_DNA_H

/* NOTE(review): the three #include directives below have no header name in
 * this copy of the file (the operands appear to have been lost).  They must
 * name the headers that declare Array, KEY, KEYSET, BOOL, STORE_HANDLE,
 * ACEIN, ACEOUT and ParseFuncReturn, which are used by the prototypes in
 * this file - restore them from the original source before compiling.
 */
#include
#include
#include

/*
   We use the standard IUPAC coding for representing DNA with a single
   character per base.  Columns: code, bases covered, ~complementary code,
   mnemonic.

   Exactly known bases
      A
      T   (or U for RNA)
      G
      C

   Double ambiguity
      R   AG     ~Y   puRine
      Y   CT     ~R   pYrimidine
      M   AC     ~K   aMino
      K   GT     ~M   Keto
      S   CG     ~S   Strong
      W   AT     ~W   Weak

   Triple ambiguity
      H   ACT    ~D   not G
      B   CGT    ~V   not A
      V   ACG    ~B   not T
      D   AGT    ~H   not C

   Total ambiguity
      N   ACGT   ~N   unkNown

   Possible existence of a base
      -   0      padding

   Note that:
     We do NOT use U internally, but just use it for display if tag RNA
     is set.

     NCBI sometimes uses X in place of N, but we only use X for peptides.

     The - padding character is used when no sequencing is available and
     therefore means that the exact length is unknown: a run of 300 '-'
     might not mean exactly 300 unknown bases.

   In memory, we use one byte per base; the lower 4 bits correspond to
   the 4 bases A_, T_, G_, C_.  More than one bit is set in the ambiguous
   cases.  The - padding is represented as zero.  In some parts of the
   code, the upper 4 bits are used as flags.

   On disk, we store the dna as 4 bases per byte, if there is no
   ambiguity, or 2 bases per byte otherwise.
*/

/* Single-base bit masks (one bit per base in the low nibble). */
#define A_ 1
#define T_ 2
/* U_ is deliberately the same bit as T_: U is display-only (see above). */
#define U_ 2
#define G_ 4
#define C_ 8

/* Two-base ambiguity codes. */
#define R_ (A_ | G_)
#define Y_ (T_ | C_)
#define M_ (A_ | C_)
#define K_ (G_ | T_)
#define S_ (C_ | G_)
#define W_ (A_ | T_)

/* Three-base ambiguity codes (each is "not" the one missing base). */
#define H_ (A_ | C_ | T_)
#define B_ (G_ | C_ | T_)
#define V_ (G_ | A_ | C_)
#define D_ (G_ | A_ | T_)

/* Any base. */
#define N_ (A_ | T_ | G_ | C_)

extern char dnaDecodeChar[] ; /* this is the mapping used to decode a single base */
extern char dnaEncodeChar[] ; /* this is the mapping used to encode a single base */
/* NOTE(review): presumably the RNA counterpart of dnaDecodeChar (U shown
 * instead of T) - confirm against the definition. */
extern char rnaDecodeChar[] ;
extern char complementBase[] ; /* complement of single base */
extern char *aaName[] ; /* maps single letter code to full name */

char * dnaDecodeString(char *buf, char *cp, int n) ; /* Decodes n char from cp, result stored in buf */
void dnaEncodeString(char *cp) ; /* reverse of decoding, works in place */
void dnaDecodeArray (Array a) ; /* works in place */
void rnaDecodeArray (Array a) ; /* works in place */
void dnaEncodeArray (Array a) ; /* works in place */

Array dnaGet (KEY key) ;
Array dnaHandleGet(KEY key, STORE_HANDLE handle);
Array dnaRawGet(KEY key) ;
Array dnaFromObject(KEY key, int dna_start, int dna_end) ;

#ifndef NEW_SMAP_DNA
/* Use the second of these if you want to control whether mismatches are */
/* reported or not. (dnaAdd resolves to a call to dnaDoAdd anyway).      */
/* Both return: -1: error, 0: mismatch, 1: absent, 2: success            */
int dnaAdd (Array a, KEY seq, int start, int stop, BOOL noMismatch) ;
int dnaDoAdd(Array a, KEY seq, int start, int stop, BOOL noMismatch, BOOL reportErrors) ;

void dnaStoreDestroy (KEY key, Array dna) ; /* beware - sets length in sequence object */
void dnaExonsSort (Array a) ; /* a is of BSunit from flattening Source_exons */
#endif

char codon (char *cp) ; /* next 3 bases -> amino acid */
char reverseCodon (char *cp) ; /* next 3 bases on opp strand -> amino acid */
char antiCodon (char* cp);

Array dnaCopy (Array dna) ; /* array copy, but adding a terminal zero, avoids subsequent doubling */
void reverseComplement(Array dna) ; /* acts in place */

/* Handles for dnacpt package */
void dnaAnalyse (void) ;
int dnacptPickMatch (char *cp, int n, char *tp, int maxError, int maxN) ;
void dnaCptFindRepeats(void) ;

/* Dump _just_ the sequence. */
BOOL dnaRawDump(Array dna, int from, int to, ACEOUT dump_out) ;

/* FastA dumping routines */
BOOL dnaRawDumpFastA(char *seqname, char *dna, ACEOUT dump_out) ;
BOOL dnaDumpFastA (Array a, int from, int to, char *text, ACEOUT dump_out);
BOOL dnaDumpFastAKey (ACEOUT dump_out, KEY key);
BOOL dnaDumpFastAKeyWithErrors (ACEOUT dump_out, KEY key);
BOOL dnaZoneDumpFastAKey(ACEOUT dump_out, KEY key, BOOL allow_dna_errors, BOOL cds_only, BOOL spliced, int x1, int x2, char style) ;
int dnaDumpFastAKeySet(ACEOUT dump_out, KEYSET kSet);
int dnaDumpCDSFastAKeySet(ACEOUT dump_out, KEYSET kSet) ;
void dnacptFastaDump(void);
void dnacptDontUseKeySet (void) ;
ACEOUT dnaFileOpen (STORE_HANDLE handle); /* ACEOUT allocated upon given handle */

/* ace dump routines */
BOOL dnaDump (ACEOUT dump_out, KEY key); /* DumpFuncType */
ParseFuncReturn dnaParse (ACEIN parse_in, KEY key, char **errtext); /* ParseFuncType */
BOOL baseQualityDump (ACEOUT dump_out, KEY key) ; /* DumpFuncType */
ParseFuncReturn baseQualityParse (ACEIN parse_in, KEY key, char **errtext) ; /* ParseFuncType */
BOOL basePositionDump (ACEOUT dump_out, KEY key); /* DumpFuncType */
ParseFuncReturn basePositionParse (ACEIN parse_in, KEY key, char **errtext) ; /* ParseFuncType */

/* the text values are filled in dnasubs.c

   G   Guanine
   A   Adenine
   T   Thymine
   C   Cytosine
   R   Purine               A,G
   Y   Pyrimidine           T,C
   M   Amino                A,C
   K   Ketone               G,T
   S   Strong interaction   C,G
   W   Weak interaction     A,T
   H   not-G                A,C,T
   B   not-A                G,C,T
   V   not-T                A,C,G
   D   not-C                A,G,T
   N   any                  G,A,T,C
*/

/* NO CODE AFTER THIS */
#endif /* DEF_DNA_H */