/* File: smapconvert.c * Author: Simon Kelley (srk@sanger.ac.uk) & Ed Griffiths (edgrif@sanger.ac.uk) * Copyright (c) J Thierry-Mieg and R Durbin, 2001 *------------------------------------------------------------------- * Acedb is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * or see the on-line version at http://www.gnu.org/copyleft/gpl.txt *------------------------------------------------------------------- * This file is part of the ACEDB genome database package, written by * Richard Durbin (Sanger Centre, UK) rd@sanger.ac.uk, and * Jean Thierry-Mieg (CRBM du CNRS, France) mieg@kaa.crbm.cnrs-mop.fr * * Description: Uses the smap system to produce arrays of features, * these arrays are then used by the GFF dumper or various * display routines. * * Exported functions: See smapconvert.h * HISTORY: * Last edited: Mar 26 14:39 2004 (edgrif) * * Created: Wed Dec 12 10:17:57 2001 (edgrif) * CVS info: $Id: smapconvert.c,v 1.64 2004/03/26 14:41:17 edgrif Exp $ *------------------------------------------------------------------- */ #undef FMAP_DEBUG #include #include #include #include #include #include #include #include #include #include #include #include /* We should be thinking about moving some of the fmap_.h stuff into a new header and * only including that header in this file so that smap and fmap are truly split apart. */ #include typedef struct _SMapFeatureMapRec { STORE_HANDLE handle ; KEY seqOrig ; KEY seqKey ; SMap *smap ; int start ; int stop ; int length ; int fullLength ; /* probably don't need this... */ int origin ; int zoneMin, zoneMax ; Array dna ; KEYSET method_set ; DICT *featDict ; DICT *sources_dict; BOOL include_methods; BOOL include_sources; MethodCache mcache ; Array segs ; Array seqInfo ; Array homolInfo ; Array feature_info ; BitSet homolBury ; /* same index as homolInfo */ BitSet homolFromTable ; /* MatchTable or Tree? (index as homolInfo) */ } SMapFeatureMapRec ; /* Passed to each object conversion routine in turn, keeps all the state necessary for such * conversions. The alternative to this struct would be to have the routines all have a large * number of arguments which seems not very attractive. */ typedef struct { KEY seq_orig ; /* id of original sequence to be mapped. */ SMap *smap ; /* Current smap. */ SMapKeyInfo *info ; SMapStatus status ; int y1, y2 ; /* Position of current feature. */ KEY key ; /* Key of current object. */ BOOL is_Sequence_class ; /* Is current obj of class ?Sequence */ OBJ obj ; /* Current object. */ Array units ; /* Used for bsFlatten of features. */ KEYSET method_set ; /* which methods are to be smap'ed. */ DICT sources_dict; BOOL include_methods; BOOL include_sources; Array segs_array ; /* Segs array to add features to. */ STORE_HANDLE segs_handle ; /* For any segs related allocation. */ Array seqinfo ; /* Separate of info. describing sets of segs. */ Array homol_info ; Array feature_info ; BitSet homolBury ; /* same index as homolInfo */ BitSet homolFromTable ; /* MatchTable or Tree? (index as homolInfo) */ int length ; /* only needed once, can we get rid of */ /* it altogether....*/ /* These are needed for when one object produces a hierachy of segs (e.g. a sequence with exons) * and we need to keep track of the first seg that represents the "parent" of the subsequent * segs. */ int parent_seg_index ; /* Index of sequence class seg from which exons etc. are created. */ int parent_seqinfo_index ; /* Index to current seqinfo. */ } SmapConvInfo ; /* structs for holding read pair information (e.g. ESTs). */ typedef struct { KEY key ; /* homol key. */ int pos ; /* seg coord of homol nearest to its */ /* paired read. */ } ReadPosStruct, *ReadPos ; typedef struct { int homol_index ; /* a paired read homol seg, used for copying parent key etc. */ ReadPosStruct five_prime ; ReadPosStruct three_prime ; } ReadPairStruct, *ReadPair ; static SMapFeatureMap doFeatureMap(KEY seq, int start, int stop, BOOL get_dna, DICT *methods, DICT *sources, DICT *features, BOOL include_methods, BOOL include_sources, BOOL include_features) ; static BOOL dumpGFF(SMapFeatureMap map, int version, KEY *refSeq, int *offPtr, BOOL isList, DICT *sourceSet, DICT *featSet, ACEOUT gff_out) ; /* Conversion routines for objects of different classes. */ static void convertObj(SmapConvInfo *conv_info, DICT *featDict) ; /* Conversion routines for features hanging off an object. */ static void convertSequence(SmapConvInfo *conv_info) ; static void convertFeatureObj(SmapConvInfo *conv_info) ; static void convertHomol(SmapConvInfo *conv_info) ; static void convertExons(SmapConvInfo *conv_inf, BOOL *exons_found) ; static void convertCDS(SmapConvInfo *conv_info, int *cds_min_out, BOOL exons_found) ; static void convertStart(SmapConvInfo *conv_info, int cds_min) ; static void convertFeature(SmapConvInfo *conv_info, DICT *featDict) ; static void convertVisible(SmapConvInfo *conv_info) ; static void convertAssemblyTags(SmapConvInfo *conv_info) ; static void convertAllele(SmapConvInfo *conv_info) ; static void convertCloneEnd(SmapConvInfo *conv_info, KEY clone_tag) ; static void convertOligo(SmapConvInfo *conv_info) ; static void convertConfirmedIntrons(SmapConvInfo *conv_info) ; static void convertEMBLFeature(SmapConvInfo *conv_info) ; static void convertSplices(SmapConvInfo *conv_info) ; static void convertCoding(SmapConvInfo *conv_info) ; static void addOldSegs (BOOL complement, Array segs, Array oldSegs, MethodCache mcache) ; static void removeDuplicateIntrons(SmapConvInfo *conv_info) ; static void removeSelfHomol(Array segs) ; /* Utility functions. */ static void sMap2Seg(SMapKeyInfo *info, SEG *seg, int y1, int y2, SegType seg_type) ; static BOOL isOldAlleleModel(OBJ obj) ; static SEQINFO *makeSeqInfo(OBJ obj, Array seqinfo, int *index_out) ; static void setDefaultMethod(OBJ obj, SEQINFO *sinf) ; static BOOL getCDSPosition(Array segs, Array seqinfo, Array dna_in, KEY parent, Array *cds, Array *index, BOOL cds_only) ; static int homolOrder(void *a, void *b) ; static void homolLines(Array segs_array, int homols_start, int homols_end) ; static void addReadPair(Array all_paired_reads, Associator key2readpair, KEY this_read, KEY paired_read, Array segs, int homols_start, int homols_end) ; static void makePairedReadSegs(Array segs, Array all_paired_reads) ; static int segOrder(void *a, void *b) ; static BOOL segFindBounds(Array segs, SegType type, int *min, int *max) ; static void segSetClip(SEG *seg, SMapStatus status, SegType feature, SegType feature_up) ; static consMapDNAErrorReturn callback(KEY key, int position) ; static KEYSET allMethods(void) ; static void addToSet (ACEIN command_in, DICT *dict) ; static BOOL IsRequired(SmapConvInfo *conv_info); static BOOL FilterOut(SmapConvInfo *conv_info, KEY method); /* For dumping seg information. */ static void segDumpToStdout(Array segs) ; /* Globals */ static BOOL reportErrors_G ; /* */ /* ---------------- External routines --------------------------------- */ /* */ /* The business: this is the routine that will construct the array of * features to be used by fmap, the GFF dumper etc. * * Some notes - * * This routine does _NOT_ know about fmap, if you introduce fmap into this * routine you WILL BE SHOT. * * The arrays passed in are expected to be allocated but empty and will be * filled with seg/seqinfo/homol data. This gives the caller the freedom to * create/recreate/destroy arrays on handles or not as they see fit. * * The args here are crazy and should be reduced/simplfied, but one step at a * time..... */ BOOL sMapFeaturesMake(SMap *smap, KEY seqKey, KEY seqOrig, int start, int stop, int length, Array oldsegs, BOOL complement, MethodCache mcache, Array dna, KEYSET method_set, BOOL include_methods, DICT *featDict, DICT *sources_dict, BOOL include_sources, Array segs_inout, Array seqinfo_inout, Array homolInfo_inout, Array featureinfo_inout, BitSet homolBury_inout, BitSet homolFromTable_inout, STORE_HANDLE segsHandle) { KEYSET allKeys; int i; SEG *seg; Array segs = segs_inout ; Array seqinfo = seqinfo_inout ; Array units = NULL; /* some more thought needed about how this is filled in and persists in */ /* this function, some room for optimising. */ SmapConvInfo conv_info ; int seq_class = pickWord2Class("Sequence") ; allKeys = sMapKeys(smap, NULL); /* The master seg, n.b. has no parent. */ seg = arrayp(segs, arrayMax(segs), SEG); seg->parent = seg->source = 0; seg->key = seqKey; seg->x1 = start; seg->x2 = stop; seg->type = MASTER; seg->flags = SEGFLAG_UNSET ; /* Process all the objects that were smap'd successfully. */ for (i = 0 ; i < keySetMax(allKeys) ; i++) { KEY key ; SMapKeyInfo *info ; SMapStatus status ; int y1, y2 ; key = keySet(allKeys, i) ; info = sMapKeyInfo(smap, key) ; status = sMapMap(info, 1, 0, &y1, &y2, NULL, NULL) ; /* If an object overlaps the smap'd area at all, then process it. */ if (SMAP_STATUS_INAREA(status)) { OBJ obj ; if ((obj = bsCreate(key))) { /* Create segs for any features hanging off this object. */ units = arrayReCreate (units, 256, BSunit); conv_info.smap = smap ; conv_info.info = info ; conv_info.key = key ; if (class(conv_info.key) == seq_class) conv_info.is_Sequence_class = TRUE ; else conv_info.is_Sequence_class = FALSE ; conv_info.obj = obj ; conv_info.status = status ; conv_info.y1 = y1 ; conv_info.y2 = y2 ; conv_info.units = units ; conv_info.method_set = method_set ; conv_info.sources_dict = sources_dict; conv_info.include_methods = include_methods; conv_info.segs_array = segs ; conv_info.segs_handle = segsHandle ; conv_info.seqinfo = seqinfo ; conv_info.homol_info = homolInfo_inout ; conv_info.feature_info = featureinfo_inout ; conv_info.homolBury = homolBury_inout ; conv_info.homolFromTable = homolFromTable_inout ; conv_info.length = length ; /* Make these invalid so we fail if we look at this when we shouldn't */ conv_info.parent_seqinfo_index = -1 ; conv_info.parent_seg_index = -1 ; conv_info.seq_orig = seqOrig ; /* This is where it all happens, convert any features contained within object. */ convertObj(&conv_info, featDict) ; bsDestroy(obj); } } } #ifdef ED_G_NEVER_INCLUDE_THIS_CODE /* uncomment to see dump of all segs.... */ segDumpToStdout(segs) ; #endif /* ED_G_NEVER_INCLUDE_THIS_CODE */ /* The previously displayed segs contain segs that were calculated * (e.g. genefinder), these must be transferred to the new segs. */ if (oldsegs) addOldSegs(complement, segs_inout, oldsegs, mcache) ; arraySort(segs, segOrder) ; /* must sort for FindCoding and */ /* RemoveSelfHomol */ convertCoding(&conv_info) ; /* make CODING segs */ removeDuplicateIntrons(&conv_info) ; /* Unify confirmed introns with introns. */ removeSelfHomol(conv_info.segs_array) ; #ifdef ED_G_NEVER_INCLUDE_THIS_CODE /* ACEMBLY only, add later. */ fMapTraceFindMultiplets (look) ; /* make virtual multiplets */ #endif /* ED_G_NEVER_INCLUDE_THIS_CODE */ return TRUE ; } void sMapFeaturesComplement(int length, Array segs, Array seqinfo, Array homolinfo) { int i, j, tmp, top = length - 1 ; SEG *seg ; SEQINFO *sinf ; /* Transform all the seg coordinates. */ seg = arrp(segs, 1, SEG) ; for (i = 1 ; i < arrayMax(segs) ; ++i, ++seg) { /* Do the complement. */ tmp = seg->x1 ; seg->x1 = top - seg->x2 ; seg->x2 = top - tmp ; if (seg->type >= SEQUENCE && seg->type <= ALLELE_UP) { seg->type ^= 1 ; /* There must be a more elegant way to do this but I can't think of it at the moment... */ { BOOL top = FALSE, bot = FALSE ; if (seg->flags & SEGFLAG_CLIPPED_TOP) { seg->flags &= !SEGFLAG_CLIPPED_TOP ; bot = TRUE ; } if (seg->flags & SEGFLAG_CLIPPED_BOTTOM) { seg->flags &= !SEGFLAG_CLIPPED_BOTTOM ; top = TRUE ; } if (bot) seg->flags |= SEGFLAG_CLIPPED_BOTTOM ; if (top) seg->flags |= SEGFLAG_CLIPPED_TOP ; } } /* Now do some seg-type specific stuff. */ if ((seg->type | 0x1) == SPLICED_cDNA_UP) seg->source = top - seg->source ; } /* Gaps arrays attached to (some) homols have coordinates in which must be munged. */ for (i = 0 ; i < arrayMax(homolinfo) ; i++) { HOMOLINFO *hinf = arrp(homolinfo, i, HOMOLINFO) ; if (hinf->gaps) for (j = 0; j < arrayMax(hinf->gaps); j++) { SMapMap *m = arrp(hinf->gaps, j, SMapMap); m->r1 = top - m->r1 + 2; m->r2 = top - m->r2 + 2; } } /* Some exon segs have CDS information attached, this information is in */ /* the seqinfo array and includes coordinates that must be transformed. */ for (i = 0 ; i < arrayMax(seqinfo) ; i++) { sinf = arrp(seqinfo, i, SEQINFO) ; if (sinf->flags & SEQ_CDS) { tmp = sinf->cds_info.cds_seg.x1 ; sinf->cds_info.cds_seg.x1 = top - sinf->cds_info.cds_seg.x2 ; sinf->cds_info.cds_seg.x2 = top - tmp ; sinf->cds_info.cds_seg.type ^= 1 ; /* There must be a more elegant way to do this but I can't think of it at the moment... */ { BOOL top = FALSE, bot = FALSE ; if (sinf->cds_info.cds_seg.flags & SEGFLAG_CLIPPED_TOP) { sinf->cds_info.cds_seg.flags &= !SEGFLAG_CLIPPED_TOP ; bot = TRUE ; } if (sinf->cds_info.cds_seg.flags & SEGFLAG_CLIPPED_BOTTOM) { sinf->cds_info.cds_seg.flags &= !SEGFLAG_CLIPPED_BOTTOM ; top = TRUE ; } if (bot) sinf->cds_info.cds_seg.flags |= SEGFLAG_CLIPPED_BOTTOM ; if (top) sinf->cds_info.cds_seg.flags |= SEGFLAG_CLIPPED_TOP ; } } } return ; } /* smap based gif get code, doesn't use fex but does use smap and NO fmap */ /* look. It also */ /* */ /* this is my version of fMapGifGet() which allows for selection of which */ /* sets of features will be included. */ /* */ /* When we come to fex stuff we can easily add an smapfexget call which will */ /* be the same except we'll produce fex features instead of segs. */ /* */ /* */ SMapFeatureMap sMapFeaturesCreate(ACEIN command_in, ACEOUT result_out, SMapFeatureMap map_in) { SMapFeatureMap map = NULL ; KEY key ; int x1, x2 ; char *word ; STORE_HANDLE handle = NULL ; DICT *features_dict = NULL ; DICT *methods_dict = NULL ; DICT *sources_dict = NULL ; BOOL dna_get = FALSE ; BOOL already_features = FALSE; BOOL already_sources = FALSE; BOOL already_methods = FALSE; BOOL include_methods = FALSE; BOOL include_sources = FALSE; BOOL include_features = FALSE; if (map_in) sMapFeaturesDestroy(map_in) ; handle = handleCreate() ; /* try to get from graph */ if (!(word = aceInWord(command_in))) { aceOutPrint(result_out, "// gif smapget error: no sequence specified.\n") ; goto usage ; } /* this test is probably wrong now, we probably want to do non-sequence */ /* objects...in fact that is sure to be true. */ if (!lexword2key(word, &key, _VSequence)) { aceOutPrint (result_out, "// gif smapget error: Sequence %s not known\n", word) ; goto usage ; } already_features = FALSE ; x1 = 1 ; x2 = 0 ; /* default for whole sequence */ while (aceInCheck (command_in, "w")) { word = aceInWord(command_in) ; if (strcmp (word, "-dna") == 0) { dna_get = TRUE ; } else if (strcmp (word, "-coords") == 0) { if (!aceInInt (command_in, &x1) || !aceInInt (command_in, &x2) || (x2 == x1)) goto usage ; } else if (strcmp (word+1, "feature") == 0) { if (word[0] == '+') include_features = TRUE; else include_features = FALSE; if (aceInCheck (command_in, "w")) { if (already_features) goto usage ; features_dict = dictHandleCreate(16, handle) ; addToSet(command_in, features_dict) ; already_features = TRUE ; } } else if (strcmp (word+1, "method") == 0) { if (word[0] == '+') include_methods = TRUE; else include_methods = FALSE; if (aceInCheck (command_in, "w")) { if (already_methods) goto usage ; methods_dict = dictHandleCreate(16, handle) ; addToSet(command_in, methods_dict) ; already_methods = TRUE ; } } else if (strcmp (word+1, "source") == 0) { if (word[0] == '+') include_sources = TRUE; else include_sources = FALSE; if (aceInCheck (command_in, "w")) { if (already_sources) goto usage ; sources_dict = dictHandleCreate(16, handle) ; addToSet(command_in, sources_dict) ; already_sources = TRUE ; } } else goto usage ; } #ifdef ED_G_NEVER_INCLUDE_THIS_CODE if (x1 < x2 || (x1 == 1 && x2 == 0)) look = fMapCreateLook(key, x1, x2, FALSE, fmap_data) ; else look = fMapCreateLook(key, x2, x1, FALSE, fmap_data) ; #endif /* ED_G_NEVER_INCLUDE_THIS_CODE */ if (x1 < x2 || (x1 == 1 && x2 == 0)) map = doFeatureMap(key, x1, x2, dna_get, methods_dict, sources_dict, features_dict, include_methods, include_sources, include_features) ; else map = doFeatureMap(key, x2, x1, dna_get, methods_dict, sources_dict, features_dict, include_methods, include_sources, include_features) ; if (!map) { aceOutPrint (result_out, "// gif smapget error: could not make map for %s\n", name(key)) ; if (handle) handleDestroy (handle) ; return NULL ; } #ifdef ED_G_NEVER_INCLUDE_THIS_CODE /* Unsure what to do here...fill in later... */ /* Is the zone used in the gff dump ????? */ /* set the zone */ if (x1 == 1 && x2 == 0) { int i ; for (i = 1 ; i < arrayMax(look->segs) ; ++i) { if (arrp(look->segs,i,SEG)->key == key) { setZone(look, arrp(look->segs,i,SEG)->x1, arrp(look->segs,i,SEG)->x2 + 1) ; break ; } } } else if (x1 > x2) fMapRC (look) ; #endif /* ED_G_NEVER_INCLUDE_THIS_CODE */ if (handle) handleDestroy (handle) ; return map ; usage: aceOutPrint (result_out, "// gif smapseqget error, usage:\n" "\tsmapget [sequence [-coords x1 x2][-feature feat1|feat2]]\n") ; if (handle) handleDestroy (handle) ; return NULL ; } void sMapFeaturesDestroy(SMapFeatureMap map) { if (map) { handleDestroy(map->handle) ; messfree (map) ; } return; } /* Dump features in GFF format. */ /* */ /* Returns TRUE if dump successful, FALSE otherwise. */ /* */ BOOL sMapFeaturesGFF(ACEIN command_in, ACEOUT result_out, SMapFeatureMap map) { BOOL result = FALSE ; KEY refseq = 0 ; int x1, x2, offset ; int version = 2 ; char *word ; STORE_HANDLE handle = handleCreate() ; DICT *sourceSet = dictHandleCreate (16, handle) ; DICT *featSet = dictHandleCreate (16, handle) ; BOOL isList = FALSE ; ACEOUT fo = NULL, dump_out = NULL ; if (command_in && result_out && map) { /* by default output goes to same place as normal command results */ dump_out = aceOutCopy(result_out, 0); result = TRUE ; while (result && aceInCheck(command_in, "w")) { word = aceInWord (command_in) ; if (strcmp (word, "-coords") == 0 && aceInInt (command_in, &x1) && aceInInt (command_in, &x2) && (x2 != x1)) { ; #ifdef ED_G_NEVER_INCLUDE_THIS_CODE /* I need to sort out the zone stuff.... */ setZoneUserCoords (look, x1, x2) ; #endif /* ED_G_NEVER_INCLUDE_THIS_CODE */ } else if (strcmp (word, "-file") == 0 && aceInCheck (command_in, "w") && (fo = aceOutCreateToFile (aceInPath (command_in), "w", 0))) { /* replace default output with file output */ aceOutDestroy (dump_out); dump_out = fo; } else if (strcmp (word, "-refseq") == 0 && aceInCheck (command_in, "w") && (lexword2key (aceInWord (command_in), &refseq, _VSequence))) { ; } else if (strcmp (word, "-version") == 0 && aceInInt (command_in, &version)) { ; } else if (strcmp (word, "-list") == 0) { isList = TRUE ; } else if (strcmp (word, "-source") == 0 && aceInCheck (command_in, "w")) { addToSet (command_in, sourceSet) ; } else if (strcmp (word, "-feature") == 0 && aceInCheck (command_in, "w")) { addToSet (command_in, featSet) ; } else { result = FALSE ; aceOutPrint (result_out, "// gif seqfeatures error: usage: SEQFEATURES [-coords x1 x2] ") ; aceOutPrint (result_out, "[-file fname] [-refseq sequence] [-version 1|2] ") ; aceOutPrint (result_out, "[-list] [-source source(s)] [-feature feature(s)]\n") ; } } } if (result) result = dumpGFF(map, version, &refseq, &offset, isList, sourceSet, featSet, dump_out) ; if (result) aceOutPrint(result_out, "// FMAP_FEATURES %s %d %d\n", freeprotect (name(refseq)), (map->zoneMin + 1 + offset), (map->zoneMax+offset)) ; if (dump_out) aceOutDestroy(dump_out); if (handle) handleDestroy(handle) ; return result ; } /* This routine makes a keyset containing keys of method objects in one of */ /* two ways: */ /* If add = TRUE then we start with an empty keyset and add all the */ /* keys of methods found in the database whose names match one of the names */ /* in the "features" dict parameter. */ /* else we subtract from the set of all method keys, the keys of */ /* those methods matching the names in the "features" dict. */ /* */ /* i.e. this routine can be used to include or exclude methods. */ /* */ /* Returns a valid keyset of method objects or NULL if no methods match the */ /* supplied "feature" list. It is the callers responsibility to destroy the */ /* keyset when finished with. */ /* */ KEYSET sMapFeaturesSet(STORE_HANDLE handle, DICT methods, BOOL include_methods) { KEYSET result = NULL; KEYSET wild = NULL; int i,j ; char *KSMethod; /* KeySet method */ char CLMethod[255+1]; /* Command Line method */ BOOL ThisMethodFound = FALSE; BOOL AnyMethodNotFound = FALSE; if (include_methods) result = keySetHandleCreate(handle) ; else result = allMethods() ; wild = allMethods(); if (result && methods) /* only if some methods have been specified */ { for (i = 0 ; i < dictMax(methods) ; i++) { KEY kp = KEY_UNDEFINED ; ThisMethodFound = FALSE; /* wildcard */ /* If the command line method includes a trailing asterisk, ** then strip that off and search the entire keyset for methods ** that start with that pattern. */ strcpy(CLMethod, dictName(methods,i)); if (CLMethod[strlen(CLMethod)-1] == '*') { CLMethod[strlen(CLMethod)-1] = '\0'; for (j = 0; jx1 = (y1 - 1), seg->x2 = (y2 - 1), seg->type = SEG_DOWN(seg_type) ; } else if (y2 < y1) { seg->x1 = (y2 - 1), seg->x2 = (y1 - 1), seg->type = SEG_UP(seg_type) ; } else { seg->x1 = (y1 - 1), seg->x2 = (y2 - 1), seg->type = ((sMapIsReverse(info, y1, y2)) ? SEG_UP(seg_type) : SEG_DOWN(seg_type)) ; } return ; } static consMapDNAErrorReturn callback(KEY key, int position) { BOOL OK; if (!reportErrors_G) return sMapErrorReturnContinue; OK = messQuery("Mismatch error at %d in %s.\n" "Do you wish to see further errors?", position, name(key)); return OK ? sMapErrorReturnContinue : sMapErrorReturnSilent; } /* if (x1 < x2 || (x1 == 1 && x2 == 0)) look = fMapCreateLook(key, x1, x2, FALSE, fmap_data) ; else look = fMapCreateLook(key, x2, x1, FALSE, fmap_data) ; */ static SMapFeatureMap doFeatureMap(KEY seq, int start, int stop, BOOL get_dna, DICT *methods_dict, DICT *sources_dict, DICT *features_dict, BOOL include_methods, BOOL include_sources, BOOL include_features) { SMapFeatureMap map = NULL ; BOOL status = TRUE ; BOOL make_map ; map = (SMapFeatureMap)messalloc(sizeof(SMapFeatureMapRec)) ; map->handle = handleCreate() ; map->seqOrig = seq ; if (sMapTreeRoot(seq, start, stop, &seq, &start, &stop)) map->seqKey = seq ; else status = FALSE ; if (status) { map->smap = sMapCreate(map->handle, map->seqKey, start, stop, NULL); } if (status) { if (get_dna) { #ifdef ED_G_NEVER_INCLUDE_THIS_CODE reportErrors_G = look->reportDNAMismatches; #endif /* ED_G_NEVER_INCLUDE_THIS_CODE */ if (!(map->dna = sMapDNA(map->smap, map->handle, callback))) status = FALSE ; } } if (status) { if (methods_dict) map->method_set = sMapFeaturesSet(map->handle, methods_dict, include_methods) ; if (map->method_set == NULL) status = FALSE; map->include_methods = include_methods; } if (status) { if (sources_dict) map->sources_dict = sources_dict; map->include_sources = include_sources; map->length = sMapMax(map->smap) ; map->zoneMin = 0 ; map->zoneMax = stop - start + 1 ; map->start = start - 1 ; map->stop = stop - 1 ; /* just for testing, don't think this is needed for gif etc. style stuff. */ map->fullLength = sMapLength (seq) ; map->featDict = dictHandleCreate (1000, map->handle) ; dictAdd (map->featDict, "null", 0) ; /* so real entries are non-zero */ map->mcache = methodCacheCreate(map->handle) ; map->segs = arrayHandleCreate(256, SEG, map->handle) ; map->homolInfo = arrayHandleCreate(256, HOMOLINFO, map->handle) ; map->homolBury = NULL ; map->homolFromTable = NULL ; map->seqInfo = arrayHandleCreate(32, SEQINFO, map->handle) ; map->sources_dict = sources_dict; map->include_sources = include_sources; } /* Produce the segs array. */ if (status) { if (!(make_map = sMapFeaturesMake(map->smap, map->seqKey, map->seqOrig, map->start, map->stop, map->length, NULL, FALSE, map->mcache, map->dna, map->method_set, include_methods, map->featDict, sources_dict, include_sources, map->segs, map->seqInfo, map->homolInfo, map->feature_info, map->homolBury, map->homolFromTable, map->handle))) status = FALSE ; } if (!status) { sMapFeaturesDestroy(map) ; map = NULL ; } return map ; } /* Hope this is correct for the gff dumper...need to check....with simon.... */ /* Allowable results must be class sequence, not have exons and * must be Genomic_canonical or have the Link tag set. */ static BOOL pred(KEY key) { if (class(key) != _VSequence) return FALSE; if (bIndexTag(key, str2tag("Source_exons"))) return FALSE; return TRUE; } /* this routine is the old gff dumper extracted from fmapcontrol.c it needs */ /* considerable tidying up. */ static BOOL dumpGFF(SMapFeatureMap map, int version, KEY *refSeq, int *offPtr, BOOL isList, DICT *sourceSet, DICT *featSet, ACEOUT gff_out) { int i ; SEG *seg ; HOMOLINFO *hinf ; SEQINFO *sinf ; METHOD *meth ; KEY seqKey, sourceKey ; int x, y, type, offset = 0 ; int chopped ; /* only used for version 1 */ int tmp = 0, reversed = 0 ; /* used by AcePerl */ char *featName, *sourceName, *tagText = 0 ; float score ; char strand, frame ; BOOL flipped ; BOOL isScore ; STORE_HANDLE handle = handleCreate() ; Associator key2sinf = assHandleCreate (handle) ; Associator key2source = assHandleCreate (handle) ; Associator key2feat = assHandleCreate (handle) ; Array stats = arrayHandleCreate (64, int, handle) ; DICT *listSet = 0 ; /* first establish seqKey and offset */ seqKey = 0 ; if (refSeq && !*refSeq && version > 1) *refSeq = map->seqOrig ; if (refSeq && *refSeq) { for (i = 1 ; i < arrayMax(map->segs) ; ++i) { seg = arrp(map->segs, i, SEG) ; if ((seg->type == SEQUENCE || seg->type == SEQUENCE_UP) && seg->key == *refSeq) break ; } if (i < arrayMax(map->segs)) { if (seg->type == SEQUENCE || seg->type == SEQUENCE_UP) { seqKey = *refSeq ; offset = - seg->x1 ; reversed = seg->type == SEQUENCE_UP ? 1 : 0; } } else { messout ("Can't find reference sequence %s", name (*refSeq)) ; return FALSE ; } } /* find minimal spanning sequence */ if (!seqKey) { x = map->zoneMin + 1 ; y = map->zoneMax ; seqKey = 0 ; if (!sMapFindSpan(map->smap, &seqKey, &x, &y, pred)) seqKey = map->seqKey ; if (x > y) /* what if reversed? */ { messout ("Can't GFF dump from reversed sequences for now. Sorry.") ; return FALSE ; } offset = x - (map->zoneMin + 1) ; /* x changed in sMapFindSpan */ } if (refSeq) *refSeq = seqKey ; if (offPtr) *offPtr = offset ; if (!isList) { aceOutPrint (gff_out, "##gff-version %d\n", version) ; aceOutPrint (gff_out, "##date %s\n", timeShow (timeParse ("today"))) ; aceOutPrint (gff_out, "##sequence-region %s %d %d %s\n", name(seqKey), (map->zoneMin + 1 + offset), (map->zoneMax + offset), reversed ? "(reversed)" : "") ; } if (isList) listSet = dictHandleCreate (64, handle) ; for (i = 0 ; i < arrayMax(map->segs) ; ++i) { seg = arrp(map->segs, i, SEG) ; if (seg->x1 > map->zoneMax || seg->x2 < map->zoneMin) continue ; if (seg->type == MASTER || seg->type == VISIBLE || seg->type == CDS || seg->type == CDS_UP || seg->type == DNA_SEQ || seg->type == PEP_SEQ || seg->type == ORF || seg->type == TRANS_SEQ || seg->type == TRANS_SEQ_UP) continue ; if (seg->type & 0x01 && (seg->type >= SEQUENCE && seg->type <= ALLELE_UP)) { type = seg->type - 1 ; x = seg->x2+1 ; y = seg->x1+1 ; } else { type = seg->type ; x = seg->x1+1 ; y = seg->x2+1 ; } chopped = 0 ; if (x <= y) { flipped = FALSE ; strand = '+' ; if (version == 1 && x <= map->zoneMin) chopped = map->zoneMin + 1 - x ; } else { int tmp = x ; x = y ; y = tmp ; flipped = TRUE ; strand = '-' ; if (version == 1 && y > map->zoneMax) chopped = y - map->zoneMax ; } if (version == 1) /* clip */ { if (x <= map->zoneMin) x = map->zoneMin + 1 ; if (y > map->zoneMax) y = map->zoneMax ; } x += offset ; y += offset ; frame = '.' ; /* other defaults */ isScore = FALSE ; score = 0.0 ; featName = 0 ; sourceKey = 0 ; sourceName = ""; /* LS otherwise everything ends up being a SEQUENCE */ switch (type) { case SEQUENCE: case EXON: case INTRON: sinf = arrp(map->seqInfo, seg->data.i, SEQINFO) ; if ( seg->key && (version == 2) ) { featName = className(seg->key); } else featName = (type == SEQUENCE) ? "sequence" : (type == EXON) ? "exon" : "intron" ; if (sinf->method) { sourceKey = sinf->method ; } if (type == SEQUENCE) { assInsert (key2sinf, assVoid(seg->key), sinf) ; if (sinf->flags & SEQ_SCORE) { score = sinf->score ; isScore = TRUE ; } } break ; case CODING: if (version == 1) featName = "coding_exon" ; else featName = "CDS" ; assFind (key2sinf, assVoid(seg->parent), &sinf) ; if (sinf->method) sourceKey = sinf->method ; switch (seg->data.i%3) { case 0: frame = '0' ; break ; case 1: frame = '2' ; break ; case 2: frame = '1' ; break ; } break ; case HOMOL: hinf = arrp(map->homolInfo, seg->data.i, HOMOLINFO) ; score = hinf->score ; sourceKey = hinf->method ; featName = "similarity" ; meth = methodCacheGet(map->mcache, hinf->method, "") ; if (!(meth->flags & METHOD_STRAND_SENSITIVE)) strand = '.' ; if (meth->flags & METHOD_SCORE) isScore = TRUE ; if (meth->flags & METHOD_FRAME_SENSITIVE) frame = '0' ; break ; case FEATURE: case ATG: case SPLICE5: case SPLICE3: sourceKey = seg->key ; if (type == SPLICE3) featName = "splice3" ; else if (type == SPLICE5) featName = "splice5" ; else if (type == ATG) featName = "atg" ; else if (version == 1) featName = "feature" ; else featName = "misc_feature" ; meth = methodCacheGet(map->mcache, seg->key, "") ; if (meth->flags & METHOD_SCORE) { score = seg->data.f ; isScore = TRUE ; } if (!(meth->flags & METHOD_STRAND_SENSITIVE)) strand = '.' ; if (meth->flags & METHOD_FRAME_SENSITIVE) frame = '0' ; break ; case ASSEMBLY_TAG: tagText = strnew (seg->data.s, handle) ; sourceName = "assembly_tag" ; featName = tagText ; { char *cp ; for (cp = tagText ; *cp && *cp != ':' && *cp != ' ' && *cp != '\t' && *cp != '\n' ; ++cp) ; if (*cp) { *cp++ = 0 ; while (*cp == ' ' || *cp == '\t' || *cp == '\n') ++cp ; } tagText = cp ; } break ; case ALLELE: if (version == 1) sourceName = "variation" ; if (seg->data.s) { if (*seg->data.s == '-') featName = "deletion" ; else { char *cp = seg->data.s ; featName = "variation" ; while (*cp) if (!strchr ("AGCTagct", *cp++)) featName = (version == 1) ? "insertion_site" : "insertion" ; } } break ; case EMBL_FEATURE: featName = name(seg->key) ; break ; case CLONE_END: featName = name(seg->data.k) ; strand = '.' ; break ; default: ; } /* finalise source and feat fields */ if (sourceKey) { if (assFind (key2source, assVoid(sourceKey), &sourceName)) assFind (key2feat, assVoid(sourceKey), &featName) ; else { OBJ obj = bsCreate (sourceKey) ; sourceName = name(sourceKey) ; if (obj) { bsGetData (obj, str2tag("GFF_source"), _Text, &sourceName) ; if (bsGetData (obj, str2tag("GFF_feature"), _Text, &featName)) { featName = strnew (featName, handle) ; assInsert (key2feat, assVoid(sourceKey), featName) ; } } sourceName = strnew(sourceName, handle); assInsert (key2source, assVoid(sourceKey), sourceName) ; if (obj) /* only after strnew(sourceName) !!! */ bsDestroy(obj); } } if (!sourceName) sourceName = "unknown" ; if (!featName) /* from method or from seg->type above */ featName = fMapSegTypeName[type] ; if (frame >= '0' && frame <= '2' && chopped) { frame = frame + 3 - (chopped % 3) ; if (frame == '3') frame = '0' ; if (frame == '4') frame = '1' ; if (frame == '5') frame = '2' ; } if (sourceSet && featSet && (dictMax (sourceSet) || dictMax (featSet)) && !dictFind (sourceSet, sourceName, 0) && !dictFind (featSet, featName, 0)) continue ; if (isList) /* just accumulate stats */ { int k ; dictAdd (listSet, messprintf("%s\t%s",sourceName,featName), &k) ; ++array(stats, k, int) ; continue ; /* move on to next line */ } /* !isList from here on */ /* write the main part of the line */ /* LS/AcePerl: fixup reversed reference sequence */ if (reversed) { tmp = map->zoneMax + offset + 1 - y; y = map->zoneMax + offset + 1 - x; x = tmp; if ( strand == '+' ) strand = '-'; else if ( strand == '-' ) strand = '+'; } if (isScore) aceOutPrint (gff_out, "%s\t%s\t%s\t%d\t%d\t%g\t%c\t%c", name(seqKey), sourceName, featName, x, y, score, strand, frame) ; else aceOutPrint (gff_out, "%s\t%s\t%s\t%d\t%d\t.\t%c\t%c", name(seqKey), sourceName, featName, x, y, strand, frame) ; /* extras */ switch (type) { case SEQUENCE: case EXON: case CODING: if (seg->parent) { if (version == 1) aceOutPrint (gff_out, "\t%s", name(seg->parent)) ; else aceOutPrint (gff_out, "\t%s \"%s\"", className(seg->parent), name(seg->parent)) ; } break ; case INTRON: sinf = arrp(map->seqInfo, seg->data.i, SEQINFO) ; if (version == 1) { if (seg->parent) { /* RD 010129 added class for version 2 like EXON */ if (version == 1) aceOutPrint (gff_out, "\t%s", name(seg->parent)) ; else aceOutPrint (gff_out, "\t%s \"%s\"", className(seg->parent), name(seg->parent)) ; if (sinf->flags & SEQ_CONFIRMED) aceOutPrint (gff_out, " Confirmed") ; } else if (sinf->flags & SEQ_CONFIRMED) aceOutPrint (gff_out, "\tConfirmed") ; if (sinf->flags & SEQ_CONFIRM_EST) aceOutPrint (gff_out, " by_EST") ; if (sinf->flags & SEQ_CONFIRM_CDNA) aceOutPrint (gff_out, " by_cDNA") ; if (sinf->flags & SEQ_CONFIRM_HOMOL) aceOutPrint (gff_out, " by_homology") ; if (sinf->flags & SEQ_CONFIRM_UTR) aceOutPrint (gff_out, " in_UTR") ; if (sinf->flags & SEQ_CONFIRM_INCONSIST) aceOutPrint (gff_out, " inconsistent") ; if (sinf->flags & SEQ_CONFIRM_FALSE) aceOutPrint (gff_out, " as_false") ; } else { BOOL isData = FALSE ; if (seg->parent) { aceOutPrint (gff_out, "\tSequence \"%s\"", name(seg->parent)) ; isData = TRUE ; } if (sinf->flags & SEQ_CONFIRM_EST) { aceOutPrint (gff_out, "%s", isData ? " ; " : "\t") ; isData = TRUE ; aceOutPrint (gff_out, "Confirmed_by_EST") ; } if (sinf->flags & SEQ_CONFIRM_CDNA) { aceOutPrint (gff_out, "%s", isData ? " ; " : "\t") ; isData = TRUE ; aceOutPrint (gff_out, "Confirmed_by_cDNA") ; } if (sinf->flags & SEQ_CONFIRM_HOMOL) { aceOutPrint (gff_out, "%s", isData ? " ; " : "\t") ; isData = TRUE ; aceOutPrint (gff_out, "Confirmed_by_homology") ; } if (sinf->flags & SEQ_CONFIRM_UTR) { aceOutPrint (gff_out, "%s", isData ? " ; " : "\t") ; isData = TRUE ; aceOutPrint (gff_out, "Confirmed_in_UTR") ; } if (sinf->flags & SEQ_CONFIRM_FALSE) { aceOutPrint (gff_out, "%s", isData ? " ; " : "\t") ; isData = TRUE ; aceOutPrint (gff_out, "Confirmed_as_false") ; } } break ; case HOMOL: if (version == 1) aceOutPrint (gff_out, "\t%s:%s", className(seg->key), name(seg->key)) ; else aceOutPrint (gff_out, "\tTarget \"%s:%s\"", className(seg->key), name(seg->key)) ; hinf = arrp(map->homolInfo, seg->data.i, HOMOLINFO) ; if (flipped) aceOutPrint (gff_out, " %d %d", hinf->x2, hinf->x1) ; else aceOutPrint (gff_out, " %d %d", hinf->x1, hinf->x2) ; break ; case FEATURE: case SPLICE5: case SPLICE3: case ATG: { BOOL isTab = FALSE ; if (seg->parent) { aceOutPrint (gff_out, "\tNote \"%s\"", utCleanNewlines(dictName(map->featDict, -seg->parent), handle)) ; isTab = TRUE ; } #ifdef ED_G_NEVER_INCLUDE_THIS_CODE /* need to find out about this chosen lark.... */ if (assFind (look->chosen, SEG_HASH(seg), 0)) aceOutPrint (gff_out, "%sSelected", isTab ? " ; " : "\t") ; else if (assFind (look->antiChosen, SEG_HASH(seg), 0)) aceOutPrint (gff_out, "%sAntiselected", isTab ? " ; " : "\t") ; #endif /* ED_G_NEVER_INCLUDE_THIS_CODE */ } break ; case ASSEMBLY_TAG: if (tagText && *tagText) { aceOutPrint (gff_out, "\tNote \"%s\"", utCleanNewlines(tagText, handle)) ; } break ; case ALLELE: aceOutPrint (gff_out, "\tAllele \"%s\"", name(seg->key)) ; if (seg->data.s && *seg->data.s != '-') { if (strcmp (featName, "variation") == 0) aceOutPrint (gff_out, " ; Variant \"%s\"", utCleanNewlines(seg->data.s, handle)) ; else aceOutPrint (gff_out, " ; Insert \"%s\"", utCleanNewlines(seg->data.s, handle)) ; } break ; case EMBL_FEATURE: if (seg->data.s) { aceOutPrint (gff_out, "\tNote \"%s\"", utCleanNewlines(seg->data.s, handle)) ; } break ; case CLONE_END: aceOutPrint (gff_out, "\tClone \"%s\"", name(seg->key)) ; break ; default: ; } aceOutPrint (gff_out, "\n") ; } if (isList) for (i = 0 ; i < arrayMax(stats) ; ++i) aceOutPrint (gff_out, "%s\t%d\n", dictName (listSet, i), arr(stats,i,int)) ; handleDestroy (handle) ; return TRUE ; } /* Actually this routine will need to be externally callable as I will want */ /* to add features to an fmap on the fly....youch.... */ /* */ /* You should be aware when altering this code that some of the calls are */ /* _order_ dependant because some routines need information from preceding */ /* calls to decide what to do, e.g. the exon and cds routines. */ /* */ static void convertObj(SmapConvInfo *conv_info, DICT *featDict) { static KEY _Feature = KEY_UNDEFINED, _Transcribed_gene, _Clone_left_end, _Clone_right_end, _Confirmed_intron, _EMBL_feature, _Splices, _Method ; int cds_min = 0 ; BOOL exons_found = FALSE ; BOOL AllRequired = FALSE; OBJ obj ; if (_Feature == KEY_UNDEFINED) { _Feature = str2tag("Feature") ; _Transcribed_gene = str2tag("Transcribed_gene") ; /* Appears to be unused..... */ _Clone_left_end = str2tag("Clone_left_end") ; _Clone_right_end = str2tag("Clone_right_end") ; _Confirmed_intron = str2tag("Confirmed_intron") ; _EMBL_feature = str2tag("EMBL_feature") ; _Splices = str2tag("Splices") ; _Method = str2tag("Method"); } obj = conv_info->obj ; /* If the parent object has a required method, then any child objects which * lack methods are also included. If no methods specified, we want everything. */ if (IsRequired(conv_info) && (conv_info->key != conv_info->seq_orig /* not for master seg */ || !conv_info->method_set || !conv_info->include_methods)) AllRequired = TRUE; /* Currently the sequence class has some "special case" processing that means that we * may need to create a sort of "virtual" sequence seg. NOTE also that this must be * done before the Source_exons and cds and start_not_found. */ if (conv_info->is_Sequence_class) convertSequence(conv_info) ; else convertFeatureObj(conv_info) ; if (bsFindTag(obj, _Homol)) /* homols need to look at methods at a deeper level */ convertHomol(conv_info) ; if (AllRequired) { if (bsFindTag(obj, str2tag("Source_Exons"))) convertExons(conv_info, &exons_found) ; if (bsFindTag(obj, str2tag("CDS"))) convertCDS(conv_info, &cds_min, exons_found); if (bsFindTag(obj, str2tag("Start_not_found"))) convertStart(conv_info, cds_min) ; } if (bsFindTag(obj, _Feature)) convertFeature(conv_info,featDict) ; if (AllRequired) { if (bsFindTag(obj, _Visible)) convertVisible(conv_info) ; if (bsFindTag(obj, _Assembly_tags)) convertAssemblyTags(conv_info) ; if (bsFindTag(obj, _Allele)) convertAllele(conv_info) ; if (bsFindTag(obj, _Clone_left_end)) convertCloneEnd(conv_info, _Clone_left_end) ; if (bsFindTag(obj, _Clone_right_end)) convertCloneEnd(conv_info, _Clone_right_end) ; if (bsFindTag(obj, _Oligo)) convertOligo(conv_info) ; } if (bsFindTag (obj, _Confirmed_intron)) convertConfirmedIntrons(conv_info) ; if (AllRequired) { if (bsFindTag(obj, _EMBL_feature)) convertEMBLFeature(conv_info) ; } if (bsFindTag(obj, _Splices)) convertSplices(conv_info) ; return ; } static void convertSequence(SmapConvInfo *conv_info) { KEY method_key = str2tag("Method"); SEQINFO *sinf ; if (IsRequired(conv_info)) { SEG *seg ; /* Set these up for features that are specified within this sequence object, they will * need these indexes to get information from the sequence segs/seqinfo. */ conv_info->parent_seg_index = arrayMax(conv_info->segs_array); conv_info->parent_seqinfo_index = arrayMax(conv_info->seqinfo) ; /* Set up a seg for this object. */ seg = arrayp(conv_info->segs_array, conv_info->parent_seg_index, SEG); seg->source = sMapParent(conv_info->info); seg->key = seg->parent = conv_info->key; seg->data.i = conv_info->parent_seqinfo_index ; seg->flags = SEGFLAG_UNSET ; sMap2Seg(conv_info->info, seg, conv_info->y1, conv_info->y2, SEQUENCE) ; segSetClip(seg, conv_info->status, SEQUENCE, SEQUENCE_UP) ; /* Set up method, gaps etc. for this feature. * If the feature is gappy, hold a pointer to the Map array in the sMap */ sinf = arrayp(conv_info->seqinfo, conv_info->parent_seqinfo_index, SEQINFO) ; if (bsGetKey (conv_info->obj, method_key, &sinf->method)) { if (bsGetData (conv_info->obj, _bsRight, _Float, &sinf->score)) sinf->flags |= SEQ_SCORE ; } else setDefaultMethod(conv_info->obj, sinf) ; /* MAY NEED TO MOVE THESE INTO MY ROUTINE THAT ALLOCATES SEGS..... */ /* check when this is executed and if its ever used ?.......... */ if (conv_info->status & SMAP_STATUS_INTERNAL_GAPS) sinf->gaps = arrayHandleCopy(sMapMapArray(conv_info->info), conv_info->segs_handle); if (bsFindTag(conv_info->obj, str2tag("Assembled_from"))) /* Not known in databases ? */ sinf->flags |= SEQ_VIRTUAL_ERRORS ; if (bsFindTag(conv_info->obj, str2tag("Genomic_canonical")) || bsFindTag(conv_info->obj, str2tag("Genomic"))) sinf->flags |= SEQ_CANONICAL ; } return ; } /* This routine creates segs for objects that are themselves features as opposed * to the features specified by the magic tag "Feature" within an object. */ static void convertFeatureObj(SmapConvInfo *conv_info) { KEY method_tag = str2tag("Method"); KEY method_key = KEY_UNDEFINED ; /* Note, no method key, means no processing.... */ if (bIndexTag(conv_info->key, method_tag) && bsGetKey(conv_info->obj, method_tag, &method_key) && !FilterOut(conv_info, method_key)) { SEG *seg ; SegType seg_type ; int feature_index ; BOOL exons = FALSE, cds = FALSE ; /* Needed for gene type objects, perhaps not for simple features.... */ conv_info->parent_seg_index = arrayMax(conv_info->segs_array) ; /* This may well cause a storm but we'll see.....the worm people don't want CDS without * Source_exon's */ exons = bsFindTag(conv_info->obj, str2tag("Source_Exons")) ; cds = bsFindTag(conv_info->obj, str2tag("CDS")) ; if (cds && !exons) { messerror("Object %s has the CDS tag set but no Source_Exons defined.", nameWithClass(conv_info->key)) ; } if (exons || cds) { seg_type = SEQUENCE ; /* Set these up for features that are specified within this sequence object, they will * need these indexes to get information from the sequence segs/seqinfo. */ conv_info->parent_seqinfo_index = arrayMax(conv_info->seqinfo) ; } else { feature_index = arrayMax(conv_info->feature_info) ; seg_type = FEATURE_OBJ ; } /* Set up a seg for this object. */ seg = arrayp(conv_info->segs_array, conv_info->parent_seg_index, SEG) ; seg->source = sMapParent(conv_info->info) ; seg->key = seg->parent = conv_info->key ; seg->data_type = SEG_DATA_UNSET ; sMap2Seg(conv_info->info, seg, conv_info->y1, conv_info->y2, seg_type) ; segSetClip(seg, conv_info->status, seg_type, SEG_UP(seg_type)) ; /* For exon/intron like objects, set up method, gaps etc. for this feature. * If the feature is gappy, hold a pointer to the Map array in the sMap */ if (seg_type == SEQUENCE) { SEQINFO *sinf ; seg->data.i = conv_info->parent_seqinfo_index ; sinf = arrayp(conv_info->seqinfo, conv_info->parent_seqinfo_index, SEQINFO) ; sinf->method = method_key ; if (bsGetData (conv_info->obj, _bsRight, _Float, &sinf->score)) sinf->flags |= SEQ_SCORE ; /* MAY NEED TO MOVE THESE INTO MY ROUTINE THAT ALLOCATES SEGS..... */ /* check when this is executed and if its ever used ?.......... */ if (conv_info->status & SMAP_STATUS_INTERNAL_GAPS) sinf->gaps = arrayHandleCopy(sMapMapArray(conv_info->info), conv_info->segs_handle); } else { FeatureInfo *feat ; seg->data.i = feature_index ; feat = arrayp(conv_info->feature_info, feature_index, FeatureInfo) ; feat->method = method_key ; } #ifdef ED_G_NEVER_INCLUDE_THIS_CODE printf("convertFeatureObj(), obj: %s at %d %d, parent: %s\n", nameWithClass(seg->key), conv_info->y1, conv_info->y2, nameWithClass(seg->source)) ; #endif /* ED_G_NEVER_INCLUDE_THIS_CODE */ } return ; } /* There is a big loop in this routine which is because the underlying database * model cannot simply be bsFlatten'd: * * Homol Float Int UNIQUE Int Int UNIQUE Int #Homol_info * * #Homol_info Align Int UNIQUE Int UNIQUE Int * // not needed if full alignment is ungapped * // one row per ungapped alignment section * // first int is position in query, second in target * // third int is length - only necessary when gaps in * // both sequences align. * * mark1 cursors down * mark2 cursors down * mark3 cursors down Float * mark4 cursors down Int * mark5 cursors down Int * * There are extra complications in that we record each set of homols processed * add information to join the homols up with lines and to join EST read pairs. */ static void convertHomol(SmapConvInfo *conv_info) { static KEY Clone = KEY_UNDEFINED, Show_in_reverse_orientation, Join_blocks, Paired_read ; KEY key = conv_info->key ; SMapKeyInfo *info = conv_info->info ; OBJ obj = conv_info->obj ; BSMARK mark1 = 0, mark2 = 0, mark3 = 0, mark4 = 0, mark5 = 0, mark6 = 0 ; KEY method, homol, tag2 ; float score; int pos1, pos2, target1, target2; BOOL show_in_reverse ; KEY prev_method = KEY_UNDEFINED ; int homols_start, homols_end ; BOOL join_blocks, multiple_homols ; BOOL paired_read ; KEY paired_key = KEY_UNDEFINED ; Array all_paired_reads ; Associator key2readpair ; if (Clone == KEY_UNDEFINED) { Clone = str2tag("Clone") ; Show_in_reverse_orientation = str2tag("Show_in_reverse_orientation") ; Join_blocks = str2tag("Join_blocks") ; Paired_read = str2tag("Paired_read") ; } all_paired_reads = arrayCreate(30, ReadPairStruct) ; key2readpair = assCreate() ; if (bsGetKeyTags(obj, _Homol, &tag2)) do { mark1 = bsMark(obj, mark1); if (bsGetKey(obj, _bsRight, &homol)) do { mark2 = bsMark(obj, mark2); /* Tags used for EST read pair processing. */ show_in_reverse = bIndexTag(homol, Show_in_reverse_orientation) ; paired_read = bIndexGetKey(homol, Paired_read, &paired_key) ; if (bsGetKey(obj, _bsRight, &method)) do { /* skip this one if filtering based on method ** and the method doesn't match. */ if (FilterOut(conv_info, method)) continue; mark3 = bsMark(obj, mark3); homols_start = 0 ; multiple_homols = FALSE ; join_blocks = bIndexTag(method, Join_blocks) ; if (bsGetData(obj, _bsRight, _Float, &score)) do { mark4 = bsMark(obj, mark4); if (bsGetData(obj, _bsRight, _Int, &pos1)) do { mark5 = bsMark(obj, mark5); if (bsGetData(obj, _bsRight, _Int, &pos2) && bsGetData(obj, _bsRight, _Int, &target1)) do { mark6 = bsMark(obj, mark6); if (bsGetData(obj, _bsRight, _Int, &target2)) { int y1 = 0, y2 = 0, clip_pos1 = 0, clip_pos2 = 0 ; SMapStatus status ; status = sMapMap(info, pos1, pos2, &y1, &y2, &clip_pos1, &clip_pos2) ; /* We should be recording homols which are off end of this * bit of sequence, and we should be doing this for exons * as well...then we could create segs to show this. */ if (SMAP_STATUS_INAREA(status)) { HOMOLINFO *hinf ; SEG *seg ; int i; homols_end = arrayMax(conv_info->segs_array) ; seg = arrayp(conv_info->segs_array, homols_end, SEG); seg->source = key ; seg->key = seg->parent = homol; /* target */ bIndexGetKey(homol, Clone, &(seg->parent)) ; /* Make clone the parent if it exists. */ seg->data.i = arrayMax(conv_info->homol_info) ; hinf = arrayp(conv_info->homol_info, seg->data.i, HOMOLINFO) ; hinf->method = method; hinf->score = score; /* into Homo_info */ { BOOL errors = FALSE ; bsPushObj(obj); /* We now pass in the clipped coords but actually it makes no difference as this function takes no notice, one day I will fix it and then it will.*/ if ((hinf->gaps = sMapLocalMap(obj, clip_pos1, clip_pos2, conv_info->segs_handle))) { /* Because sMapLocalMap() is faulty we must check our mapping and currently we don't map any gaps if we fail on any of them. */ for (i = 0 ; i < arrayMax(hinf->gaps) && !errors ; i++) { SMapMap *m = arrp(hinf->gaps, i, SMapMap) ; int l1 = m->r1 ; int l2 = m->r2 ; status = sMapMap(info, l1, l2, &m->r1, &m->r2, NULL, NULL) ; if (!SMAP_STATUS_INAREA(status)) errors = TRUE ; } if (errors) { arrayDestroy(hinf->gaps) ; hinf->gaps = NULL ; messerror("Homol \"%s\" has a gaps array that does not " " map correctly, see Align data in parent " " \"%s\"", nameWithClass(homol), nameWithClass(conv_info->key)) ; } } } /* Some features (e.g. for EST read pairs) need to be shown on the opposite strand so flip coords and flag it. */ if (show_in_reverse) { int tmp ; tmp = target1 ; target1 = target2 ; target2 = tmp ; seg->flags |= SEGFLAG_FLIPPED ; } if (target1 > target2) { sMap2Seg(conv_info->info, seg, y2, y1, HOMOL); hinf->x1 = target2 ; hinf->x2 = target1 ; } else { sMap2Seg(conv_info->info, seg, y1, y2, HOMOL); hinf->x1 = target1; hinf->x2 = target2; } if (prev_method != method) { prev_method = method ; homols_start = homols_end ; /* reset start. */ } else multiple_homols = TRUE ; } } bsGoto(obj, mark6); } while (bsGetData(obj, _bsDown, _Int, &target1)); bsGoto(obj, mark5); } while (bsGetData(obj, _bsDown, _Int, &pos1)); bsGoto(obj, mark4); } while (bsGetData(obj, _bsDown, _Float, &score)); if (homols_start != 0 && join_blocks) { /* Found some homols and their method says join them with lines */ /* Sorting gives us positional order of homol blocks. */ if (homols_start != homols_end) arraySortPos(conv_info->segs_array, homols_start, homolOrder) ; if (paired_read) addReadPair(all_paired_reads, key2readpair, homol, paired_key, conv_info->segs_array, homols_start, homols_end) ; if (multiple_homols) homolLines(conv_info->segs_array, homols_start, homols_end) ; } prev_method = KEY_UNDEFINED ; /* Reset to detect new method. */ bsGoto(obj, mark3); } while (bsGetKey(obj, _bsDown, &method)); bsGoto(obj, mark2); } while (bsGetKey(obj, _bsDown, &homol)); bsGoto(obj, mark1); } while (bsGetKeyTags(obj, _bsDown, &tag2)); /* Now process any paired reads into line segs. */ if (arrayMax(all_paired_reads) > 0) makePairedReadSegs(conv_info->segs_array, all_paired_reads) ; /* Clear up. */ bsMarkFree(mark1); bsMarkFree(mark2); bsMarkFree(mark3); bsMarkFree(mark4); bsMarkFree(mark5); bsMarkFree(mark6); arrayDestroy(all_paired_reads) ; assDestroy(key2readpair) ; #ifdef ED_G_NEVER_INCLUDE_THIS_CODE /* At the end of this there are two bitExtend calls which were part of the match * table stuff, without these homologies get excluded by the draw code. Simon * says this is all redundant...but need to check. */ /* This was completely missing....aggggghhhhh.......and causes serious problems.... * because homologies come and go like nobodies business.... */ /* THIS COULD BE A SEPARATE ROUTINE AND PERHAPS I'LL SPLIT IT OFF ONCE ITS WORKING... */ /* MatchTable - more homologies */ { Array ma ; MATCH *m ; HOMOLINFO *hinf ; KEY match_key ; if (bsGetKey(obj, str2tag("Match_table"), &match_key) && (ma = arrayGet (match_key, MATCH, matchFormat))) { int i ; FLAGSET fBury = flag ("Match", "Bury") ; for (i = 0 ; i < arrayMax(ma) ; ++i) { m = arrp (ma, i, MATCH) ; if (class(m->meth) != _VMethod || !m->q1 || !m->q2) continue ; seg = arrayp(conv_info->segs_array, arrayMax(conv_info->segs_array), SEG) ; seg->source = match_key ; /* the match table key */ seg->key = seg->parent = m->key ; seg->data.i = arrayMax(conv_info->homol_info) ; hinf = arrayp(conv_info->homol_info, seg->data.i, HOMOLINFO) ; hinf->method = m->meth ; hinf->score = m->score ; hinf->gaps = 0; bitSet (conv_info->homolFromTable, seg->data.i) ; if (m->t1 > m->t2) { pos1 = m->q2 ; pos2 = m->q1 ; hinf->x1 = m->t2 ; hinf->x2 = m->t1 ; } else { pos1 = m->q1 ; pos2 = m->q2 ; hinf->x1 = m->t1 ; hinf->x2 = m->t2 ; } POS_PROCESS(HOMOL,HOMOL_UP) ; if (m->flags & fBury) bitSet (conv_info->homolBury, seg->data.i) ; } arrayDestroy (ma) ; } } #endif /* ED_G_NEVER_INCLUDE_THIS_CODE */ /* necessary to ensure array long enough */ bitExtend (conv_info->homolBury, arrayMax(conv_info->homol_info)) ; bitExtend (conv_info->homolFromTable, arrayMax(conv_info->homol_info)) ; return ; } /* Notes: * - exons can legitmately be 1 base long, in this case SMAP2SEG may give * seg the wrong type so we get type from the sequence which holds the * exon information which will have the correct type. * The same is done for introns although biologically I think this must * be an error. */ static void convertExons(SmapConvInfo *conv_info, BOOL *exons_found) { int pos1, pos2 ; int i; SEG *seg; int y1, y2; BOOL have_exon = FALSE; OBJ obj = conv_info->obj ; Array units = conv_info->units ; if (bsFlatten (obj, 2, units)) { KEY key = conv_info->key ; SEQINFO *sinf ; int seg_index, sinf_index ; BOOL exon_no_overlap = FALSE ; seg_index = conv_info->parent_seg_index ; sinf_index = conv_info->parent_seqinfo_index ; sinf = arrayp(conv_info->seqinfo, sinf_index, SEQINFO) ; dnaExonsSort (units) ; /* correct if first exon doesn't start at posn 1, since sMap will have truncated to the boundaries of the exons. Cannot do anything about premature end of last exon at the moment */ if (arr(units, 0, BSunit).i > 1) { SEG *parent_seg = arrp(conv_info->segs_array, seg_index, SEG) ; if (parent_seg->type == SEQUENCE) parent_seg->x1 -= arr(units, 0, BSunit).i - 1; else if (parent_seg->type == SEQUENCE_UP) parent_seg->x2 += arr(units, 0, BSunit).i - 1; } for (pos1 = 1, i = 0 ; i < arrayMax(units) ; i += 2) { SMapStatus status ; pos2 = arr(units, i, BSunit).i ; if ((have_exon && (pos2 > (pos1+1))) || exon_no_overlap == TRUE) /* Exons may abut */ { status = sMapUnsplicedMap(conv_info->info, pos1, pos2, &y1, &y2, NULL, NULL) ; if (SMAP_STATUS_OVERLAP(status)) { seg = arrayp (conv_info->segs_array, arrayMax(conv_info->segs_array), SEG); seg->key = 0 ; seg->parent = seg->source = key ; seg->data.i = sinf_index ; sMap2Seg(conv_info->info, seg, y1, y2, INTRON) ; segSetClip(seg, status, INTRON, INTRON_UP) ; /* Adjust intron coords so they lie _between_ exon coords. Note that * coords can be clipped down to just one or two pixels by SMap mapping so * must be careful in adjusting coords. */ if (seg->x2 - seg->x1 < 2) seg->x2 = seg->x1 ; else { seg->x1++ ; seg->x2-- ; } if (fMapDebug) printf("INTRON: %s %d --> %d\n", name(seg->parent), seg->x1, seg->x2) ; } } pos1 = pos2 ; pos2 = arr(units, i+1, BSunit).i ; if (pos2) { status = sMapUnsplicedMap(conv_info->info, pos1, pos2, &y1, &y2, NULL, NULL) ; if (SMAP_STATUS_OVERLAP(status)) { seg = arrayp (conv_info->segs_array, arrayMax(conv_info->segs_array), SEG); seg->key = 0 ; seg->parent = seg->source = key ; seg->data.i = sinf_index ; seg->flags = SEGFLAG_UNSET ; sMap2Seg(conv_info->info, seg, y1, y2, EXON) ; segSetClip(seg, status, EXON, EXON_UP) ; have_exon = TRUE ; exon_no_overlap = FALSE ; if (fMapDebug) printf("EXON: %s %d --> %d\n", name(seg->parent), seg->x1, seg->x2) ; } else { exon_no_overlap = TRUE ; } } else messerror ("Missing pos2 for exon in %s", nameWithClass(key)) ; pos1 = pos2 ; } sinf->flags |= SEQ_EXONS ; } *exons_found = have_exon; return ; } /* OK, pay attention, when the CDS gets drawn (fMapShowSequence), the * CDS seg must come _before_ the exon segs, this is guaranteed by * the numerical value of its seg->type (fmap_.h) combined with the arraySort at * the end of smapconvert. * * If the CDS start or stop positions are set then we will need to * convert positions for finding the spliced DNA, e.g. for later * protein translation. */ static void convertCDS(SmapConvInfo *conv_info, int *cds_min_out, BOOL exons_found) { KEY key = conv_info->key ; SMapKeyInfo *info = conv_info->info ; OBJ obj = conv_info->obj ; int cds_min, cds_max ; int y1, y2 ; SMapStatus status ; BOOL cds_coords = FALSE ; cds_min = 1 ; cds_max = 0 ; if (bsGetData(obj, _bsRight, _Int, &cds_min)) { cds_coords = TRUE ; bsGetData(obj, _bsRight, _Int, &cds_max) ; } /* Try to map the CDS, if we fail because of bad start/end in database */ /* then redo as whole of source exons, otherwise just fail. */ status = sMapMap(info, cds_min, cds_max, &y1, &y2, NULL, NULL) ; if (status & SMAP_STATUS_NO_OVERLAP) { #ifdef ED_G_NEVER_INCLUDE_THIS_CODE if (cds_coords) { int tmp_min = cds_min, tmp_max = cds_max ; cds_min = 1 ; cds_max = 0 ; status = sMapMap(info, cds_min, cds_max, &y1, &y2, NULL, NULL) ; if (status == SMAP_STATUS_PERFECT_MAP || status == SMAP_STATUS_INTERNAL_GAPS) { messerror("The start and/or end of the CDS in object %s" " is outside of the spliced DNA coordinates for that object" " CDS start = %d & end = %d)." " Make sure the CDS positions have been specified" " in spliced DNA, not Source_exon, coordinates." " CDS start/end positions have been reset to start/end of" " spliced DNA.", nameWithClass(key), tmp_min, tmp_max) ; } else { messerror("Cannot smap CDS for object %s, please check your data for this object.", nameWithClass(key)) ; } } #endif /* ED_G_NEVER_INCLUDE_THIS_CODE */ } if (SMAP_STATUS_INAREA(status)) { SEQINFO *sinf ; SEG *seg ; seg = arrayp(conv_info->segs_array, arrayMax(conv_info->segs_array), SEG) ; seg->key = _CDS ; seg->parent = seg->source = key ; seg->flags = SEGFLAG_UNSET ; sMap2Seg(conv_info->info, seg, y1, y2, CDS) ; segSetClip(seg, status, CDS, CDS_UP) ; /* We need to record position and other CDS information for later */ /* drawing of the CDS. */ sinf = arrayp(conv_info->seqinfo, conv_info->parent_seqinfo_index, SEQINFO) ; sinf->flags |= SEQ_CDS ; sinf->cds_info.cds_seg = *seg ; sinf->cds_info.cds_only = TRUE ; /* record cds_min for use in convertStart() */ *cds_min_out = cds_min ; if (fMapDebug) printf("CDS: %d --> %d\n\n", seg->x1, seg->x2) ; /* Possible that database may specify a CDS without any accompanying */ /* source exons so synthesize an exon which covers the whole of the */ /* object. */ if (!exons_found) { SEG *seg ; seg = arrayp(conv_info->segs_array, arrayMax(conv_info->segs_array), SEG) ; *seg = arr(conv_info->segs_array, conv_info->parent_seg_index, SEG) ; /* n.b. struct copy. */ seg->source = key ; seg->key = 0 ; /* unusual because we are copying from parent seg... */ sMap2Seg(conv_info->info, seg, (seg->x1 + 1), (seg->x2 + 1), EXON) ; sinf->flags |= SEQ_EXONS ; } } return ; } /* N.B. should be called after convertCDS() as it may need cds_min for */ /* checking. */ /* */ static void convertStart(SmapConvInfo *conv_info, int cds_min) { SEQINFO *sinf ; KEY key = conv_info->key ; OBJ obj = conv_info->obj ; sinf = arrayp(conv_info->seqinfo, conv_info->parent_seqinfo_index, SEQINFO) ; /* Tag signals that this object is incomplete, there is more upstream */ /* somewhere. Tag can be followed by a frame shift to correct any */ /* resultant reading frame error (0 means ignore setting, default). */ /* If there is a CDS, then it must start at position 1 for this tag to */ /* be valid. */ sinf->start_not_found = 0 ; if (sinf->cds_info.cds_only && (cds_min != 1)) { messerror("Data inconsistency: the Start_not_found tag is set for object %s," " but the CDS begins at position %d instead of at position 1" " of the first exon, tag will be ignored", name(key), cds_min) ; } else { /* Record any value found in seqinfo for later use. */ if (bsGetData(obj, _bsRight, _Int, &sinf->start_not_found)) { if (sinf->start_not_found < 1 || sinf->start_not_found > 3) { messerror("Data inconsistency: the Start_not_found tag for object %s" " should be given the value 1, 2 or 3 but has been" " set to %d," " tag will be ignored", name(key), sinf->start_not_found) ; sinf->start_not_found = 0 ; } } } return ; } /* Processes the following model fragment which can be embedded in any class: * * Feature ?Method Int Int UNIQUE Float UNIQUE Text #Feature_info * // Float is score, Text is note * // note is shown on select, and same notes are neighbours * // again, each method has a column double-click shows the method. * // Absorb Assembly_tags? * */ static void convertFeature(SmapConvInfo *conv_info, DICT *featDict) { KEY key = conv_info->key ; SMapKeyInfo *info = conv_info->info ; OBJ obj = conv_info->obj ; Array units = conv_info->units ; if (bsFlatten (obj, 5, units)) { int j ; for (j = 0 ; j < arrayMax(units) ; j += 5) { BSunit *u = arrayp(units, j, BSunit); SEG *seg; int y1, y2; SMapStatus status ; if (class(u[0].k != _VMethod) || !u[1].i || !u[2].i) continue; /* skip this one if methods specified & it doesn't match * Note that bsFlatten has created a one-dimensional array * in which the 5 elements of the Feature tag are loaded * into consecutive elements of the units array. We don't * use j to index into it as we point u at the set we want. */ if (FilterOut(conv_info, u[0].k)) continue; status = sMapMap(info, u[1].i, u[2].i, &y1, &y2, NULL, NULL) ; if (!SMAP_STATUS_INAREA(status)) continue; seg = arrayp(conv_info->segs_array, arrayMax(conv_info->segs_array), SEG); seg->source = key; seg->key = u[0].k; /* n.b. assigns Method obj to key. */ seg->data.f = u[3].f; sMap2Seg(conv_info->info, seg, y1, y2, FEATURE); if (u[4].s) { int tmp; dictAdd(featDict, u[4].s, &tmp); seg->parent = -tmp; } else seg->parent = 0; } } return ; } static void convertVisible(SmapConvInfo *conv_info) { KEY key = conv_info->key ; SMapKeyInfo *info = conv_info->info ; OBJ obj = conv_info->obj ; Array units = conv_info->units ; if (bsFlatten(obj, 2, units)) { int j ; for (j = 0 ; j < arrayMax(units) ; j += 2) { int y1, y2 ; SEG *seg ; SMapStatus status ; status = sMapMap(info, 1, 0, &y1, &y2, NULL, NULL); /* cannot fail */ if (!SMAP_STATUS_INAREA(status)) continue ; if (!iskey (arr(units,j,BSunit).k) || class(arr(units,j,BSunit).k)) continue ; /* should be a tag */ if (!iskey (arr(units,j+1,BSunit).k)) continue ; /* there is a problem if the model says: Visible foo Text a hack because iskey() could be true for a text pointer should store these as different type, e.g. VISIBLE_TEXT */ seg = arrayp(conv_info->segs_array, arrayMax(conv_info->segs_array), SEG); seg->source = seg->parent = key; seg->key = arr(units, j+1, BSunit).k; seg->data.k = arr(units, j, BSunit).k; sMap2Seg(conv_info->info, seg, y1, y2, VISIBLE) ; } } return ; } /* Conversion of assembly data, the sub-part of the Sequence class is: */ /* */ /* Assembly_tags Text Int Int Text // type, start, stop, comment */ /* */ /* I make the assumption that if the start or stop are not given then we do */ /* _NOT_ map the feature, it _must_ have these at least to be mapped. This */ /* is a change in behaviour from the old code. */ /* */ static void convertAssemblyTags(SmapConvInfo *conv_info) { KEY key = conv_info->key ; SMapKeyInfo *info = conv_info->info ; OBJ obj = conv_info->obj ; Array units = conv_info->units ; int pos1, pos2 ; int y1, y2 ; SMapStatus status ; if (bsFlatten (obj, 4, units)) { int j ; for (j = 0 ; j < arrayMax(units)/4 ; ++j) { int length; char *string; char *type_text, *text_text; pos1 = arr(units, 4*j+1, BSunit).i; pos2 = arr(units, 4*j+2, BSunit).i; /* At this point we could report errors if required. */ if (pos1 == 0 || pos2 == 0) continue ; status = sMapMap(info, pos1, pos2, &y1, &y2, NULL, NULL) ; if (SMAP_STATUS_INAREA(status)) { SEG *seg ; seg = arrayp(conv_info->segs_array, arrayMax(conv_info->segs_array), SEG); seg->parent = seg->source = key; seg->key = _Assembly_tags; /* This seems incorrect, some of the assembly_tag information */ /* seems to be strand specific, _BUT_ the code has not in */ /* the past paid any attention to this. Thus we cannot do our */ /* usual sMap2Seg(info, ) conversion but must instead just pass the */ /* coords through. */ #ifdef ED_G_NEVER_INCLUDE_THIS_CODE sMap2Seg(info, seg, y1, y2, ASSEMBLY_TAG) ; #endif /* ED_G_NEVER_INCLUDE_THIS_CODE */ seg->x1 = (y1 - 1) ; seg->x2 = (y2 - 1) ; seg->type = ASSEMBLY_TAG ; /* Get the text */ length = 0; type_text = arr(units, 4*j, BSunit).s; if (type_text) length += strlen(type_text); text_text = arr(units, 4*j+3, BSunit).s; if (text_text) length += strlen(text_text); seg->data.s = string = halloc(length+3, conv_info->segs_handle); string[0] = '\0'; if (type_text) strcpy(string, type_text) ; strcat(string, ": "); if (text_text) strcat(string, text_text); } } } return ; } /* Conversion of allele data. * * Alleles are now represented in two ways which are both processed by this * routine. This routine reads allele data in both forms and converts the data * into ALLELE segs. * * The object coming in will be either a Sequence class object or some other * SMap'd class of object. * * If the class is _not_ a Sequence class object then the position has already * been mapped and we just need to check that the Allele tag format is correct: * * Allele * * and then check that there is a method. In the new world anything without a * method will _NOT_ be mapped. * * If the class is a Sequence class object then we must check that the data * following the Allele tag is in the old format: * * Allele ?Allele XREF Sequence UNIQUE Int UNIQUE Int UNIQUE Text * * Note I make the assumption that if the start or stop are not given then we do * _NOT_ map the allele, it _must_ have these at least to be mapped. As with * old code we allow the Text to be NULL. * * Its important to check the format because some databases have reused the * Allele tag as part of their SMap tags: * * Allele ?Allele XREF Sequence UNIQUE Int UNIQUE Int #SMap_info * */ static void convertAllele(SmapConvInfo *conv_info) { KEY key = conv_info->key ; SMapKeyInfo *info = conv_info->info ; OBJ obj = conv_info->obj ; Array units = conv_info->units ; KEY method_key = str2tag("Method"); if (!(conv_info->is_Sequence_class)) { /* non-sequence class objects. */ SEG *seg ; KEY method = KEY_UNDEFINED ; /* Really we should check that the Allele tag is in the correct format here, * there is a big question about how/if we want to go down this road..... * We don't want to do this sort of checking at this level because the * performance implications are BAD....but we do have a problem because with Smap * the same tag may be used in one object as part of the Smap tags and in another * to signify that an object is an allele etc.... */ /* To be shown must have a method + if filtering based on method, method must match. */ if (bsGetKey(conv_info->obj, method_key, &method) && !FilterOut(conv_info, method)) { seg = arrayp(conv_info->segs_array, arrayMax(conv_info->segs_array), SEG) ; seg->source = sMapParent(conv_info->info) ; seg->key = conv_info->key ; seg->parent = KEY_UNDEFINED ; seg->data_type = SEG_DATA_UNSET ; seg->data.i = 0 ; if (conv_info->y1 || conv_info->y2) seg->parent = seg->key ; /* flag to draw it */ else seg->parent = seg->source ; /* couple with parent */ sMap2Seg(conv_info->info, seg, conv_info->y1, conv_info->y2, ALLELE) ; } } else { if (isOldAlleleModel(obj) && bsFlatten (obj, 4, units)) { int i ; for (i = 0 ; i < arrayMax (units) ; i += 4) { BSunit *u = arrayp(units, i, BSunit) ; SEG *seg ; char *string ; int y1, y2 ; SMapStatus status ; /* The indices here are hardcoded as we point u to the set of * array elements we're interested in. */ /* Only process Allele data if there is position information and that * position is within smap'd sequence. */ if (u[1].i == 0 || u[2].i == 0) continue ; status = sMapMap(info, u[1].i, u[2].i, &y1, &y2, NULL, NULL) ; if (!SMAP_STATUS_INAREA(status)) continue; seg = arrayp(conv_info->segs_array, arrayMax(conv_info->segs_array), SEG) ; seg->source = key ; seg->key = u[0].k ; seg->parent = 0 ; seg->data_type = SEG_DATA_UNSET ; seg->data.i = 0 ; if (y1 || y2) seg->parent = seg->key ; /* flag to draw it */ else seg->parent = seg->source ; /* couple with parent */ sMap2Seg(conv_info->info, seg, y1, y2, ALLELE) ; if ((string = u[3].s)) { seg->data.s = strnew(string, conv_info->segs_handle) ; seg->data_type = SEG_DATA_STRING ; } } } } return ; } /* Expects to be located at Allele tag in current object and then checks to * see if model looks like: * * Allele ?Allele XREF Sequence UNIQUE Int UNIQUE Int UNIQUE Text * * .......although only checks the final Text field, returns TRUE if final * field is "Text", FALSE otherwise. */ static BOOL isOldAlleleModel(OBJ obj) { BOOL result = TRUE ; BS bsm = NULL ; KEY node_key = KEY_UNDEFINED ; char *text = NULL ; bsm = bsModelCurrent(obj) ; node_key = bsModelKey(bsm) ; if (strcmp(name(node_key), "Allele") != 0) messcrash("Wrongly located, supposed to be at Allele tag but actually at: %s", name(node_key)) ; while ((bsm = bsModelRight(bsm))) { node_key = bsModelKey(bsm) ; text = name(node_key) ; } if (text && strcmp(text, "Text") == 0) result = TRUE ; else result = FALSE ; return result ; } /* Conversion of Clone End data, the sub-part of the Sequence class is: * * Clone_left_end ?Clone XREF Clone_left_end UNIQUE Int * Clone_right_end ?Clone XREF Clone_right_end UNIQUE Int * */ static void convertCloneEnd(SmapConvInfo *conv_info, KEY clone_tag) { KEY key = conv_info->key ; SMapKeyInfo *info = conv_info->info ; OBJ obj = conv_info->obj ; Array units = conv_info->units ; if (bsFlatten(obj, 2, units)) { int i ; int y1, y2 ; for (i = 0 ; i < arrayMax(units) ; i += 2) { BSunit *u = arrayp(units, i, BSunit) ; SMapStatus status ; status = sMapMap(info, u[1].i, u[1].i, &y1, &y2, NULL, NULL) ; if (SMAP_STATUS_INAREA(status)) { SEG *seg ; KEY seqKey ; seg = arrayp(conv_info->segs_array, arrayMax(conv_info->segs_array), SEG) ; seg->source = key ; seg->key = u[0].k ; /* Clone */ sMap2Seg(conv_info->info, seg, y1, y2, CLONE_END) ; seg->data.k = clone_tag ; lexaddkey (name(seg->key), &seqKey, _VSequence) ; seg->parent = seqKey ; } } } return ; } /* Conversion of oligo data, the sub-part of the Sequence class is: */ /* */ /* Oligo ?Oligo XREF In_sequence Int UNIQUE Int */ /* // for OSP and human mapping mostly */ /* I make the assumption that if the start or stop are not given then we do */ /* _NOT_ map the oligo, it _must_ have these at least to be mapped. */ /* */ static void convertOligo(SmapConvInfo *conv_info) { static KEY _Tm = KEY_UNDEFINED, _Temporary = KEY_UNDEFINED ; KEY key = conv_info->key ; SMapKeyInfo *info = conv_info->info ; OBJ obj = conv_info->obj ; Array units = conv_info->units ; if (_Tm == KEY_UNDEFINED) { _Tm = str2tag ("Tm") ; _Temporary = str2tag ("Temporary") ; } if (bsFlatten (obj, 3, units)) { int i ; for (i = 0 ; i < arrayMax (units) ; i += 3) { BSunit *u = arrayp(units, i, BSunit) ; int y1, y2 ; SMapStatus status ; /* The old code used to call fMapOspPositionOligo() */ /* I'm not sure we really want fMapOspPositionOligo(), it actually */ /* alters the database...or tries to, to insert the oligo position.*/ if (u[1].i == 0 || u[2].i == 0) continue ; status = sMapMap(info, u[1].i, u[2].i, &y1, &y2, NULL, NULL) ; if (SMAP_STATUS_INAREA(status)) { SEG *seg ; OBJ obj1 ; seg = arrayp(conv_info->segs_array, arrayMax(conv_info->segs_array), SEG) ; seg->source = key ; seg->key = u[0].k ; seg->parent = seg->key ; seg->data.i = 0 ; sMap2Seg(conv_info->info, seg, y1, y2, OLIGO) ; if ((obj1 = bsCreate(seg->key))) { float xf ; int xi ; if (bsGetData(obj1, _Tm, _Float, &xf)) { xi = 10 * xf ; seg->data.i = (xi & 0x3FF) << 16 ; } if (bsGetData(obj1, _Score, _Float, &xf)) { xi = xf ; seg->data.i |= (xi & 0xfff) ; } if (!bsFindTag(obj1, _Temporary)) seg->data.i |= 0x4000 ; /* old */ bsDestroy (obj1) ; } } } } return ; } /* Converts this tag set: * * Confirmed_intron Int Int #Splice_confirmation * * where: * * #Splice_confirmation cDNA ?Sequence * EST ?Sequence * Homology * UTR ?Sequence * False ?Sequence * Inconsistent ?Sequence * * BUT note that the Sequence is a new addition and may not be there in many * databases. * * */ static void convertConfirmedIntrons(SmapConvInfo *conv_info) { static KEY _EST = KEY_UNDEFINED, _cDNA, _Homology, _UTR, _False, _Inconsistent ; KEY key = conv_info->key ; SMapKeyInfo *info = conv_info->info ; OBJ obj = conv_info->obj ; Array units = conv_info->units ; SEG *seg ; /* if we're including methods, then we don't want anything * that doesn't have a method at all...think this might be redundant... */ if (!conv_info->method_set || conv_info->include_methods == FALSE) { if (_EST == KEY_UNDEFINED) { _EST = str2tag("EST") ; _cDNA = str2tag("cDNA") ; _Homology = str2tag("Homology") ; _UTR = str2tag("UTR") ; _False = str2tag("False") ; _Inconsistent = str2tag("Inconsistent") ; } if (bsFlatten (obj, 4, units)) { int i ; SEQINFO *sinf ; BOOL intron_was_mapped ; for (i = 0, intron_was_mapped = FALSE, sinf = NULL ; i < arrayMax(units) ; i += 4) { BSunit *u ; int y1 = 0, y2 = 0 ; ConfirmedIntronInfo confirm = NULL ; u = arrayp(units, i, BSunit) ; if (!u[0].i || !u[1].i) /* No coords, can't do anything. */ continue ; /* Each intron can be confirmed by multiple sources, we just want one * SEG and seqinfo for each intron but must record the multiple confirmations * for each intron, i.e. intron coords stay same but confirmation data changes. */ if (i == 0 || u[-4].i != u[0].i || u[-3].i != u[1].i) { SMapStatus status ; status = sMapMap(info, u[0].i, u[1].i, &y1, &y2, NULL, NULL) ; if (SMAP_STATUS_INAREA(status)) { intron_was_mapped = TRUE ; seg = arrayp(conv_info->segs_array, arrayMax(conv_info->segs_array), SEG) ; seg->source = key ; seg->key = 0 ; seg->data.i = 0 ; sMap2Seg(conv_info->info, seg, y1, y2, INTRON) ; segSetClip(seg, conv_info->status, INTRON, INTRON_UP) ; /* If we allocated a new seg then allocate a new seqinfo. */ seg->data.i = arrayMax(conv_info->seqinfo) ; sinf = arrayp(conv_info->seqinfo, seg->data.i, SEQINFO) ; sinf->confirmation = arrayHandleCreate(SEQ_CONFIRM_TYPES, ConfirmedIntronInfoStruct, conv_info->segs_handle) ; } else { intron_was_mapped = FALSE ; continue ; } } /* If previous intron was _not_ mapped and current intron has same coords as * previous intron then we should ignore the current intron as well. */ if (!intron_was_mapped) continue ; /* Record the confirmation information. */ confirm = arrayp(sinf->confirmation, arrayMax(sinf->confirmation), ConfirmedIntronInfoStruct) ; /* I'm making all the strings be just the type otherwise it makes parsing of * confirmed introns in GFF etc. harder. */ if (u[2].k == _EST) { sinf->flags |= SEQ_CONFIRM_EST ; confirm->confirm_str = "EST" ; if (u[3].k) confirm->confirm_sequence = u[3].k ; } else if (u[2].k == _cDNA) { sinf->flags |= SEQ_CONFIRM_CDNA ; confirm->confirm_str = "cDNA" ; if (u[3].k) confirm->confirm_sequence = u[3].k ; } else if (u[2].k == _Homology) { sinf->flags |= SEQ_CONFIRM_HOMOL ; confirm->confirm_str = "homology" ; if (u[3].k) confirm->confirm_sequence = u[3].k ; } else if (u[2].k == _UTR) { sinf->flags |= SEQ_CONFIRM_UTR ; confirm->confirm_str = "UTR" ; if (u[3].k) confirm->confirm_sequence = u[3].k ; } else if (u[2].k == _False) { sinf->flags |= SEQ_CONFIRM_FALSE ; confirm->confirm_str = "false" ; if (u[3].k) confirm->confirm_sequence = u[3].k ; } else if (u[2].k == _Inconsistent) { sinf->flags |= SEQ_CONFIRM_INCONSIST ; confirm->confirm_str = "inconsistent" ; if (u[3].k) confirm->confirm_sequence = u[3].k ; } else { sinf->flags |= SEQ_CONFIRM_UNKNOWN ; confirm->confirm_str = "unknown" ; } } } } return ; } static void convertEMBLFeature(SmapConvInfo *conv_info) { SMapKeyInfo *info = conv_info->info ; OBJ obj = conv_info->obj ; Array units = conv_info->units ; if (bsFlatten(obj, 4, units)) { int i ; for (i = 0 ; i < arrayMax (units) ; i += 4) { BSunit *u = arrayp(units, i, BSunit) ; int y1, y2 ; SMapStatus status ; status = sMapMap(info, u[1].i, u[2].i, &y1, &y2, NULL, NULL) ; if (SMAP_STATUS_INAREA(status)) { char *string ; KEY key ; SEG *seg ; seg = arrayp(conv_info->segs_array, arrayMax(conv_info->segs_array), SEG) ; seg->source = conv_info->key ; seg->parent = arrayMax(conv_info->segs_array) ; seg->key = u[0].k ; sMap2Seg(conv_info->info, seg, y1, y2, EMBL_FEATURE) ; /* 4'th column: can't distinguish KEY from Text securely assume if iskey() that it is a key, e.g. ?Text */ if (iskey(key = u[3].k)) seg->data.s = strnew (name(key), conv_info->segs_handle) ; else if ((string = u[3].s)) seg->data.s = strnew(string, conv_info->segs_handle) ; else seg->data.s = NULL ; } } } return ; } static void convertSplices(SmapConvInfo *conv_info) { static KEY _Predicted_5 = KEY_UNDEFINED, _Predicted_3 ; KEY key = conv_info->key ; SMapKeyInfo *info = conv_info->info ; OBJ obj = conv_info->obj ; Array units = conv_info->units ; if (_Predicted_5 == KEY_UNDEFINED) { _Predicted_5 = str2tag("Predicted_5") ; _Predicted_3 = str2tag("Predicted_3") ; } if (bsFlatten(obj, 5, units)) { int i ; for (i = 0 ; i < arrayMax(units) ; i += 5) { BSunit *u = arrayp(units, i, BSunit) ; int y1, y2, unused ; SMapStatus status ; if (u[0].k != _Predicted_5 && u[0].k != _Predicted_3) continue ; if (class(u[1].k) != _VMethod || !u[2].i || !u[3].i) continue; if (conv_info->method_set && !(keySetFind(conv_info->method_set, u[1].k, &unused))) continue ; status = sMapMap(info, u[2].i, u[3].i, &y1, &y2, NULL, NULL) ; if (SMAP_STATUS_INAREA(status)) { SEG *seg ; seg = arrayp(conv_info->segs_array, arrayMax(conv_info->segs_array), SEG) ; seg->source = key ; seg->key = u[1].k ; if (u[0].k == _Predicted_5) sMap2Seg(conv_info->info, seg, y1, y2, SPLICE5) ; else sMap2Seg(conv_info->info, seg, y1, y2, SPLICE3) ; seg->data.f = u[4].f ; } } } return ; } /* This encapsulates all the data from Acembly */ /* NOTE that this code is currently untested, and will probably need some ajustment of models. I will liase with Jean */ static void convertMieg(SmapConvInfo *conv_info) { KEY key = conv_info->key ; SMapKeyInfo *info = conv_info->info ; OBJ obj = conv_info->obj ; Array units = conv_info->units ; int i; SEG *seg; KEY M_TRANSCRIPT = defaultSubsequenceKey ("TRANSCRIPT", DARKGRAY, 2.1, FALSE) ; static KEY _Transcribed_gene = KEY_UNDEFINED ; if (_Transcribed_gene == KEY_UNDEFINED) { _Transcribed_gene = str2tag("Transcribed_gene") ; } if (bsFindTag (obj, _Primer) && bsFlatten (obj, 1, units)) for (i = 0 ; i < arrayMax(units) ; ++i) { seg = arrayp (conv_info->segs_array, arrayMax(conv_info->segs_array), SEG); seg->parent = seg->source = key; seg->key = arr(units,i, BSunit).k ; seg->type = PRIMER ; } if (bsFindTag (obj, _SCF_File)) { seg = arrayp (conv_info->segs_array, arrayMax(conv_info->segs_array), SEG); seg->parent = seg->source = key; seg->type = VIRTUAL_SUB_SEQUENCE_TAG ; /* TODO if (!isDown) seg->type = VIRTUAL_SUB_SEQUENCE_TAG_UP ; */ } if (bsFindTag (obj, _Transcribed_gene) && bsFlatten (obj, 3, units)) { for (i = 0 ; i < arrayMax(units) ; i += 3) { int y1, y2; KEYSET tmp; BSunit *u = arrp(units,i,BSunit) ; SMapStatus status ; if (!u[0].k || !u[1].i || !u[2].i) continue ; status = sMapMap(info, u[1].i, u[2].i, &y1, &y2, NULL, NULL) ; if (!SMAP_STATUS_INAREA(status)) continue; y1--; y2--; if (y2 < 0 || y1 >= conv_info->length) continue; seg = arrayp (conv_info->segs_array, arrayMax(conv_info->segs_array), SEG); seg->key = seg->parent = seg->source = u[0].k ; seg->data.i = 0 ; seg->type = y1 > y2 ? TRANSCRIPT : TRANSCRIPT_UP; tmp = queryKey (seg->key, ">Annotations Fully_edited") ; if (keySetMax (tmp)) seg->data.i = 0x1 ; keySetDestroy (tmp) ; tmp = queryKey(seg->key, "Transpliced_to = SL1") ; if (keySetMax (tmp)) seg->data.i |= 2 ; keySetDestroy (tmp) ; tmp = queryKey(seg->key, "Transpliced_to = SL2") ; if (keySetMax (tmp)) seg->data.i |= 4 ; keySetDestroy (tmp) ; if (seg->data.i >= 6) seg->data.i ^= 0x0F ; /* leave bit 1, zero bit 2 and 4 , set bit 8 */ } } /* transcript pieces, mieg */ if (bsFindTag (obj, str2tag("Splicing")) && bsFlatten (obj, 4, units)) { seg = arrayp (conv_info->segs_array, arrayMax(conv_info->segs_array), SEG); seg->key = M_TRANSCRIPT ; /* TODO seg->type = (isDown) ? TRANSCRIPT : TRANSCRIPT_UP ; */ for (i = 0 ; i < arrayMax(units) ; i += 4) { int y1, y2; BSunit *u = arrayp(units, i, BSunit); char *cp ; SMapStatus status ; if (arrayMax(units) < i + 3) continue ; status = sMapMap(info, u[0].i, u[1].i, &y1, &y2, NULL, NULL) ; if (!SMAP_STATUS_INAREA(status)) continue; seg = arrayp (conv_info->segs_array, arrayMax(conv_info->segs_array), SEG); seg->key = u[2].k ; seg->parent = seg->source = key; /* TODO seg->type = (isDown) ? TRANSCRIPT : TRANSCRIPT_UP ; */ cp = arr(units, i + 3, BSunit).s ; if (cp && *cp && (seg->key == str2tag("Intron") || seg->key == str2tag("Alternative_Intron"))) if (!lexword2key(cp, &(seg->data.k), 0)) lexaddkey ("Other", &seg->data.k,0) ; } } } /* selective for CALCULATED segs */ static void addOldSegs (BOOL complement, Array segs, Array oldSegs, MethodCache mcache) { SEG *seg, *oldMaster, *newMaster ; int i, j, length, offset ; METHOD *meth; oldMaster = arrp(oldSegs,0,SEG) ; newMaster = arrp(segs,0,SEG) ; if (newMaster->key != oldMaster->key) return ; length = newMaster->x2 - newMaster->x1 + 1 ; if (complement) offset = oldMaster->x2 - newMaster->x2 ; else offset = newMaster->x1 - oldMaster->x1 ; #ifdef FMAP_DEBUG printf("In add oldsegs, offset = %d length = %d\n", offset, length); printf("Old: %d,%d New: %d,%d\n", oldMaster->x1, oldMaster->x2, newMaster->x1, newMaster->x2); #endif j = arrayMax(segs) ; for (i = 1 ; i < arrayMax(oldSegs) ; ++i) { seg = arrp(oldSegs, i, SEG) ; switch (seg->type) { case FEATURE: case FEATURE_UP: case ATG: case ATG_UP: case SPLICE5: case SPLICE5_UP: case SPLICE3: case SPLICE3_UP: if (class(seg->key) != _VMethod) messcrash ("Non-method key in addOldSegs") ; meth = methodCacheGet(mcache, seg->key, ""); if (meth && meth->flags & METHOD_CALCULATED) { seg->x1 -= offset ; seg->x2 -= offset ; if (seg->x1 >= 0 && seg->x2 < length) array(segs, j++, SEG) = *seg ; } break ; default: /* needed for picky compiler */ break ; } } return; } /* addOldSegs */ /* Need to look at original fMapFindCoding() to see what the code used to */ /* do..... */ /* This was the fMapFindCoding() routine in the old fmap code. */ /* */ static void convertCoding(SmapConvInfo *conv_info) { int i, j, j1 ; Array index = NULL ; SEG *seg ; KEY parent ; for (i = 1 ; i < arrayMax(conv_info->segs_array) ; ++i) { #ifdef ED_G_NEVER_INCLUDE_THIS_CODE /* In the worm this is a place of considerable delay in rendering large*/ /* contigs.... */ /* Probably all this should go in favour of allowing interrupting at */ /* a higher level. */ /* User can interrupt by pressing F4 */ if (messIsInterruptCalled()) { if (messQuery("Finding coding - do you really want to interrupt fmap ?")) { if (index != NULL) arrayDestroy(index) ; result = FALSE ; break ; } else messResetInterrupt() ; } #endif /* ED_G_NEVER_INCLUDE_THIS_CODE */ seg = arrp(conv_info->segs_array, i, SEG) ; if (seg->type == CDS && (parent = seg->parent) && getCDSPosition(conv_info->segs_array, conv_info->seqinfo, NULL, parent, NULL, &index, TRUE)) { j1 = 0 ; for (j = 0 ; j < arrayMax(index) ; ++j) if (j == arrayMax(index) - 1 || arr(index,j+1,int) > arr(index,j,int)+1) /* boundary */ { seg = arrayp (conv_info->segs_array, arrayMax(conv_info->segs_array), SEG) ; seg->key = _CDS ; seg->type = CODING ; seg->x1 = arr(index, j1, int) ; seg->x2 = arr(index, j, int) ; seg->parent = parent ; seg->data.i = j1 ; j1 = j+1 ; } arrayDestroy (index) ; index = NULL ; } else if (seg->type == CDS_UP && (parent = seg->parent) && getCDSPosition(conv_info->segs_array, conv_info->seqinfo, NULL, parent, NULL, &index, TRUE)) { j1 = 0 ; for (j = 0 ; j < arrayMax(index) ; ++j) if (j == arrayMax(index) - 1 || arr(index,j+1,int) < arr(index,j,int)-1) /* boundary */ { seg = arrayp (conv_info->segs_array, arrayMax(conv_info->segs_array), SEG) ; seg->key = _CDS ; seg->type = CODING_UP ; seg->x1 = arr(index,j,int) ; seg->x2 = arr(index,j1,int) ; seg->parent = parent ; seg->data.i = j1 ; j1 = j+1 ; } arrayDestroy (index) ; index = NULL ; } } return ; } /* We create intron segs when we process the Source_exons tag data but also when we process * the Confirmed_introns tag, these may have identical positions, in which case we unify them * into a single seg. */ static void removeDuplicateIntrons(SmapConvInfo *conv_info) { int i ; SEG *seg ; for (i = 1 ; i < arrayMax(conv_info->segs_array) ; ++i) { seg = arrp(conv_info->segs_array, i, SEG) ; /* remove duplicate INTRON segs */ if (seg->type == INTRON || seg->type == INTRON_UP) { SEG *seg2 = seg ; int flags = 0 ; Array confirmation = NULL ; BOOL existsParent = FALSE ; SEQINFO *sinf ; while (i < arrayMax(conv_info->segs_array) - 1 && seg->type == seg2->type && (seg->x1 == seg2->x1 && seg->x2 == seg2->x2)) { sinf = arrp(conv_info->seqinfo, seg->data.i, SEQINFO) ; flags |= sinf->flags ; /* there is a problem here in that we should really merge in the new confirmation * with the existing one but this is not straight forward if a flag occurs more * than once and the occurences specify _different_ sequences. Which sequence * do we output ? This is a hack for now.... */ if (!confirmation) confirmation = sinf->confirmation ; if (seg->parent) existsParent = TRUE ; i++ ; seg++ ; } for (--i, --seg ; seg2 <= seg ; ++seg2) if (seg2->parent || !existsParent) { sinf = arrp(conv_info->seqinfo, seg2->data.i, SEQINFO) ; if (sinf->flags != flags) { sinf = arrayp(conv_info->seqinfo, arrayMax(conv_info->seqinfo), SEQINFO) ; *sinf = arr(conv_info->seqinfo, seg2->data.i, SEQINFO) ; sinf->flags = flags ; sinf->confirmation = confirmation ; seg2->data.i = arrayMax(conv_info->seqinfo)-1 ; } existsParent = TRUE ; } else { *seg2 = array(conv_info->segs_array,arrayMax(conv_info->segs_array)-1,SEG) ; --arrayMax(conv_info->segs_array) ; } } } return ; } static void removeSelfHomol(Array segs) { int i, max ; SEG *seg, *cds, *cdsMin, *cdsMax ; char *cp ; segFindBounds(segs, CDS, &i, &max) ; cdsMin = arrp(segs, i, SEG) ; cdsMax = arrp(segs, max-1, SEG) + 1 ; /* -1+1 for arrayCheck */ segFindBounds(segs, HOMOL, &i, &max) ; for ( ; i < max && i < arrayMax(segs) ; ++i) /* arrayMax(segs) can change */ { seg = arrp(segs, i, SEG) ; for (cp = name(seg->key) ; *cp && *cp != ':' ; ++cp) ; if (*cp) ++cp ; else cp = name(seg->key) ; for (cds = cdsMin ; cds < cdsMax && cds->x1 < seg->x2 ; ++cds) if (cds->x1 <= seg->x1 && cds->x2 >= seg->x2 && !strcmp (cp, name(cds->parent))) { *seg = array(segs,arrayMax(segs)-1,SEG) ; --arrayMax(segs) ; break ; } } segFindBounds(segs, CDS_UP, &i, &max) ; cdsMin = arrp(segs, i, SEG) ; cdsMax = arrp(segs, max-1, SEG) + 1 ; /* -1+1 for arrayCheck */ segFindBounds(segs, HOMOL_UP, &i, &max) ; for ( ; i < max && i < arrayMax(segs) ; ++i) /* arrayMax(segs) can change */ { seg = arrp(segs, i, SEG) ; for (cp = name(seg->key) ; *cp && *cp != ':' ; ++cp) ; if (*cp) ++cp ; else cp = name(seg->key) ; for (cds = cdsMin ; cds < cdsMax && cds->x1 < seg->x2 ; ++cds) if (cds->x1 <= seg->x1 && cds->x2 >= seg->x2 && !strcmp (cp, name(cds->parent))) { *seg = array(segs,arrayMax(segs)-1,SEG) ; --arrayMax(segs) ; break ; } } return; } /* Returns a new entry in the seq info array, returns the index of the SEQINFO in the array. * if index_out is non-NULL. * Should be considered as never failing as underlying memory allocation routine will exit * the application if it fails. */ static SEQINFO *makeSeqInfo(OBJ obj, Array seqinfo, int *index_out) { SEQINFO *sinf = NULL ; KEY method_key = str2tag("Method") ; if (index_out) *index_out = arrayMax(seqinfo) ; sinf = arrayp(seqinfo, arrayMax(seqinfo), SEQINFO) ; if (bsGetKey (obj, method_key, &sinf->method)) { if (bsGetData (obj, _bsRight, _Float, &sinf->score)) sinf->flags |= SEQ_SCORE ; } else setDefaultMethod(obj, sinf) ; return sinf ; } /* Set up a default method for a feature, this code should gradually * shrink to nothing. */ static void setDefaultMethod(OBJ obj, SEQINFO *sinf) { static KEY M_Transposon = KEY_UNDEFINED, M_SPLICED_cDNA, M_RNA, M_TRANSCRIPT, M_Pseudogene, M_Coding ; if (M_Transposon == KEY_UNDEFINED) { M_Coding = defaultSubsequenceKey ("Coding", BLUE, 2.0, TRUE) ; M_RNA = defaultSubsequenceKey ("RNA", GREEN, 2.0, TRUE) ; M_Pseudogene = defaultSubsequenceKey ("Pseudogene", DARKGRAY, 2.0, TRUE) ; M_Transposon = defaultSubsequenceKey ("Transposon", DARKGRAY, 2.05, FALSE) ; M_TRANSCRIPT = defaultSubsequenceKey ("TRANSCRIPT", DARKGRAY, 2.1, FALSE) ; M_SPLICED_cDNA = defaultSubsequenceKey ("SPLICED_cDNA", DARKGRAY, 2.2, FALSE) ; } if (bsFindTag (obj, _Pseudogene)) sinf->method = M_Pseudogene ; else if (bsFindTag (obj, _Transposon)) sinf->method = M_Transposon ; else if (bsFindTag (obj, _tRNA) || bsFindTag (obj, _rRNA) || bsFindTag (obj, _snRNA)) sinf->method = M_RNA ; else if (bsFindTag (obj, _CDS)) sinf->method = M_Coding ; else sinf->method = KEY_UNDEFINED ; return ; } /* I think all we need here is to use the correct smap calls to get */ /* position information, I will insert them once I am sure I have the */ /* dumper working correctly...for now I've hacked this so it can be called */ /* without dna... */ /* */ /* */ /* This is the old fMapGetDNA() routine which I think is complete overkill */ /* here....for the moment I will leave it as it is... */ /* */ /* Worth noting that this routine does _not_ query the database but instead */ /* uses whatever is stored in the segs, corrolary of this is that user must */ /* do an fmap recalculate to see effects of any DB changes they make. */ static BOOL getCDSPosition(Array segs, Array seqinfo, Array dna_in, KEY parent, Array *cds, Array *index, BOOL cds_only) { int i, j=0, jCode = 0, cds1, cds2, tmp, dnaMax ; SEG *seg = NULL ; BOOL isUp = FALSE ; Array dna = NULL ; BOOL first_exon ; if (!iskey(parent)) return FALSE ; cds1 = cds2 = 0 ; /* used to check if CDS found. */ for (i = 1 ; i < arrayMax(segs) ; ++i) { seg = arrp(segs,i,SEG) ; if (seg->parent == parent) { switch (seg->type) { case CDS: case CDS_UP: if (cds_only) { cds1 = seg->x1 ; /* N.B. these are coords on displayed */ cds2 = seg->x2 ; /* DNA. */ } break ; case SEQUENCE: isUp = FALSE ; break ; case SEQUENCE_UP: isUp = TRUE ; break ; default: break ; } } } /* Couldn't find the required CDS so no translation done. */ if (cds1 == cds2 && cds_only) return FALSE ; if (dna_in) { dna = dna_in ; dnaMax = arrayMax(dna) ; } /* Must have dna to return cds dna... */ if (dna && cds) *cds = arrayCreate (1000, char) ; /* wild guess */ if (index) *index = arrayCreate (1000, int) ; for (i = 1, first_exon = TRUE ; i < arrayMax(segs) ; ++i) { seg = arrp(segs,i,SEG) ; if (seg->parent == parent) switch (seg->type) { /* pick cds */ case EXON: case EXON_UP: { if (first_exon) { /* May have to modify beginning of dna for translation because obj may */ /* have begun in a previous exon so we don't have the beginning, the */ /* Start_not_found allows us to correct the reading frame by setting */ /* a new start position. Default is start of obj (start_not_found = 1).*/ int new_start ; first_exon = FALSE ; new_start = arrp(seqinfo, seg->data.i, SEQINFO)->start_not_found ; if (new_start != 0) new_start-- ; cds1 += new_start ; j = seg->x1 + new_start ; } else j = seg->x1 ; for( ; j <= seg->x2 ; ++j) { if (cds_only && (j < cds1 || j > cds2)) continue ; if (cds) { if (j >= 0 && j < dnaMax) array(*cds,jCode,char) = arr(dna,j,char) ; else array(*cds,jCode,char) = 0 ; } if (index) array(*index,jCode,int) = j ; ++jCode ; } } break ; default: break ; } } if (isUp) { if (cds) reverseComplement(*cds) ; if (index) for (j = 0, i = jCode-1 ; j < i ; ++j, --i) { tmp = arr(*index,j,int) ; arr(*index,j,int) = arr(*index,i,int) ; arr(*index,i,int) = tmp ; } } return TRUE ; } /* Cribbed from fMapOrder, purpose is to obtain a set of sorted segs from which to * draw lines between homology blocks that should be joined up, but can be use * generally to order homols. */ static int homolOrder(void *a, void *b) { SEG *seg1 = (SEG*)a, *seg2 = (SEG*)b ; int diff ; int result = 0 ; diff = seg1->type - seg2->type ; if (diff > 0) return 1 ; else if (diff < 0) return -1 ; diff = seg1->x1 - seg2->x1 ; if (diff > 0) result = 1 ; else if (diff < 0) result = -1 ; else { diff = seg1->x2 - seg2->x2 ; if (diff > 0) result = 1 ; else if (diff < 0) result = -1 ; } return result ; } /* Creates segs of the type HOMOL_GAP/_UP which are lines to be * drawn in between HOMOL features. * N.B. we assume that the homol segs from homols_start to the end of the segs_array * have been sorted by seg coords so we can determine the coords of the lines * to go between the homol blocks directly from the segs_array. */ static void homolLines(Array segs_array, int homols_start, int homols_end) { int i ; for (i = homols_start ; i <= homols_end ; i++) { SEG *homol_line = arrayp(segs_array, arrayMax(segs_array), SEG) ; /* Must be first in case of relocation. */ SEG *curr_homol = arrayp(segs_array, i, SEG) ; SEG *next_homol = arrayp(segs_array, (i + 1), SEG) ; *(homol_line) = *(curr_homol) ; /* struct copy */ if (SEG_IS_DOWN(homol_line)) homol_line->type = HOMOL_GAP ; else homol_line->type = HOMOL_GAP_UP ; homol_line->x1 = curr_homol->x2 + 1 ; homol_line->x2 = next_homol->x1 - 1 ; } return ; } /* Code to add an EST object to the array of read pairs. * In the array of read pairs, each pair holds the keys of the EST objects and * also the seg coordinates of the ends of those objects nearest each other. From this * we can create a HOMOL_GAP seg to span the gap between the EST read pair (the seg. * has a flag set so that a dashed line is used). * * We use an associator to find out if the pair for any one read is already in the array * and return its index in the array if it is. The EST homol segs (homols_start to * homols_end) are position sorted so we can record the relevant top/bottom of the * EST object by whether it is 5' or 3' and record it + what the read pair is. * If the pair of this EST is already in the array, we just add this one to it, * if not we create a new array element with this EST's information. For the latter * we make a copy of the HOMOL seg for the EST object so that we get all the * parent etc. information. */ static void addReadPair(Array all_paired_reads, Associator key2readpair, KEY this_read, KEY paired_read, Array segs, int homols_start, int homols_end) { static KEY EST_5 = KEY_UNDEFINED, EST_3 ; int read_index ; KEY orientation ; ReadPair readpair = NULL ; if (EST_5 == KEY_UNDEFINED) { EST_5 = str2tag("EST_5") ; EST_3 = str2tag("EST_3") ; } if (bIndexFind(this_read, EST_5)) orientation = EST_5 ; else if (bIndexFind(this_read, EST_3)) orientation = EST_3 ; else orientation = KEY_UNDEFINED ; /* Complain ?? */ #ifdef ED_G_NEVER_INCLUDE_THIS_CODE { /* For debug..... */ SEG *seg ; int i ; for (i = homols_start ; i <= homols_end ; i++) { seg = arrayp(segs, i, SEG) ; printf("addReadPair() Homol Seg %d: %s is at %d %d\n", i, fMapSegTypeName[seg->type], seg->x1, seg->x2) ; } } #endif /* ED_G_NEVER_INCLUDE_THIS_CODE */ if (orientation) { BOOL found ; /* n.b. has more than one use below. */ /* If we can't find this EST's read pair in the array, add a new array element. */ if (!(found = assFind(key2readpair, assVoid(paired_read), &read_index))) read_index = arrayMax(all_paired_reads) ; readpair = arrayp(all_paired_reads, read_index, ReadPairStruct) ; if (!found) memset(readpair, 0, sizeof(ReadPairStruct)) ; /* Needed for KEY_UNDEFINED tests. */ /* Add EST to read pair array according to orientation and add coord. info. */ if (orientation == EST_5 && readpair->five_prime.key == KEY_UNDEFINED) { found = TRUE ; readpair->five_prime.key = this_read ; if (SEG_IS_DOWN(arrayp(segs, homols_start, SEG))) { readpair->five_prime.pos = arrayp(segs, homols_end, SEG)->x2 ; readpair->homol_index = homols_end ; } else { readpair->five_prime.pos = arrayp(segs, homols_start, SEG)->x1 ; readpair->homol_index = homols_start ; } #ifdef ED_G_NEVER_INCLUDE_THIS_CODE printf("addReadPair() - 5' EST: at index %d, %s at %d\n", read_index, name(readpair->five_prime.key), readpair->five_prime.pos) ; #endif /* ED_G_NEVER_INCLUDE_THIS_CODE */ } else if (orientation == EST_3 && readpair->three_prime.key == KEY_UNDEFINED) { found = TRUE ; readpair->three_prime.key = this_read ; if (SEG_IS_DOWN(arrayp(segs, homols_start, SEG))) { readpair->three_prime.pos = arrayp(segs, homols_start, SEG)->x1 ; readpair->homol_index = homols_start ; } else { readpair->three_prime.pos = arrayp(segs, homols_end, SEG)->x2 ; readpair->homol_index = homols_end ; } #ifdef ED_G_NEVER_INCLUDE_THIS_CODE printf("addReadPair() - 3' EST: at index %d, %s at %d\n", read_index, name(readpair->three_prime.key), readpair->three_prime.pos) ; #endif /* ED_G_NEVER_INCLUDE_THIS_CODE */ } else { found = FALSE ; /* complain ? */ } /* Update the associator with the new ESTs -> array index. */ if (found) assInsert(key2readpair, assVoid(this_read), assVoid(read_index)) ; } return ; } /* Make homol line segs to join up each EST read pair recorded. The segs * have the dashed line flag set. */ static void makePairedReadSegs(Array segs, Array all_paired_reads) { SEG *seg ; ReadPair readpair ; int i ; for (i = 0 ; i < arrayMax(all_paired_reads) ; i++) { readpair = arrayp(all_paired_reads, i, ReadPairStruct) ; if (readpair->five_prime.key && readpair->three_prime.key) { seg = arrayp(segs, arrayMax(segs), SEG) ; *(seg) = *(arrayp(segs, readpair->homol_index, SEG)) ; /* struct copy */ if (SEG_IS_DOWN(seg)) { seg->type = HOMOL_GAP ; seg->x1 = readpair->five_prime.pos ; seg->x2 = readpair->three_prime.pos ; } else { seg->type = HOMOL_GAP_UP ; seg->x1 = readpair->three_prime.pos ; seg->x2 = readpair->five_prime.pos ; } #ifdef ED_G_NEVER_INCLUDE_THIS_CODE printf("makePairedReadSegs() - Seg from %s & %s : %s at %d %d\n", name(readpair->five_prime.key), name(readpair->three_prime.key), fMapSegTypeName[seg->type], seg->x1, seg->x2) ; #endif /* ED_G_NEVER_INCLUDE_THIS_CODE */ seg->flags |= SEGFLAG_LINE_DASHED ; } } return ; } /* Just a copy of fMapOrder, main seg sorting routine which sorts by type * and position */ static int segOrder(void *a, void *b) { SEG *seg1 = (SEG*)a, *seg2 = (SEG*)b ; int diff ; diff = seg1->type - seg2->type ; if (diff > 0) return 1 ; else if (diff < 0) return -1 ; if ((seg1->type | 0x1) == SPLICED_cDNA_UP) { if (!(seg1->source * seg2->source)) return seg1->source ? -1 : 1 ; diff = seg1->source - seg2->source ; if (diff) return diff ; diff = seg1->parent - seg2->parent ; if (diff) return diff ; { int x1, x2, y1, y2 ; x1 = seg1->data.i >> 14 & 0x3FFF ; x2 = seg1->data.i & 0x3FFF ; y1 = seg2->data.i >> 14 & 0x3FFF ; y2 = seg2->data.i & 0x3FFF ; if ((x1 - x2) * (y1 - y2) < 0) return x1 < x2 ? -1 : 1 ; } } diff = seg1->x1 - seg2->x1 ; if (diff > 0) return 1 ; else if (diff < 0) return -1 ; diff = seg1->x2 - seg2->x2 ; if (diff > 0) return 1 ; else if (diff < 0) return -1 ; return 0 ; } static BOOL segFindBounds(Array segs, SegType type, int *min, int *max) { for (*min = 1 ; *min < arrayMax(segs) ; ++*min) { if (arrp(segs, *min, SEG)->type == type) break ; } for (*max = *min ; *max < arrayMax(segs) ; ++*max) { if (arrp(segs, *max, SEG)->type != type) break ; } return (*min < arrayMax(segs)) ; } #ifdef ED_G_NEVER_INCLUDE_THIS_CODE /* THIS IS HOPEFULLY REDUNDANT NOW AS WE HAVE THE sMapIsReverse() function to deal * WITH 1 BASE LONG FEATURES. */ /* Use this routine to set up any segs that represent features that are * 1 base pair long, this is needed because in this case SMAP2SEG may assign * the incorrect SegType so we get the type from the parent which will be correct. * An example would be exons where it is legitimate for them to be 1 base long * where they are incompletely known. */ static void segSetPosType(SEG *parent_seg, SEG *seg, SMapStatus status, int y1, int y2, SegType down) { sMap2Seg(info, seg, y1, y2, down) ; if (y1 == y2) { if (SEG_IS_DOWN(parent_seg)) seg->type = down ; else seg->type = up ; } segSetClip(seg, status, down, up) ; return ; } #endif /* ED_G_NEVER_INCLUDE_THIS_CODE */ /* Translate the smap clipping flags into seg clipping for use by drawing */ /* code. */ static void segSetClip(SEG *seg, SMapStatus status, SegType feature, SegType feature_up) { if (SMAP_STATUS_SET(status, SMAP_STATUS_CLIP)) { if (SMAP_STATUS_SET(status, SMAP_STATUS_CLIP)) { if (SMAP_STATUS_SET(status, SMAP_STATUS_X1_CLIP)) { if (seg->type == feature) seg->flags |= SEGFLAG_CLIPPED_TOP ; else seg->flags |= SEGFLAG_CLIPPED_BOTTOM ; } if (SMAP_STATUS_SET(status, SMAP_STATUS_X2_CLIP)) { if (seg->type == feature) seg->flags |= SEGFLAG_CLIPPED_BOTTOM ; else seg->flags |= SEGFLAG_CLIPPED_TOP ; } } } else seg->flags = SEGFLAG_UNSET ; return ; } /* */ /* Internal routines for setting up the keyset of methods, and hence, */ /* features that should be included in the smapconvert() */ /* */ /* This routine finds all the Method objects in the database and returns */ /* them as a keyset. */ /* */ /* The routine returns a keyset if there are any method objects and NULL if */ /* not. It is the callers responsibility to destroy the keyset when */ /* finished with. */ /* */ static KEYSET allMethods(void) { KEYSET result = NULL ; /* Get all the methods. */ result = query(NULL, "FIND method") ; /* A refinement here would be to check all the objects to see if they any */ /* tags defined in them, I assume that if they don't then they are useless */ /* but this might not be so. */ /* A more thorough approach would be to see if a method object was */ /* referenced by any sequence objects, if not then there would no point in */ /* showing it. */ return result ; } /* Adding features to the dictionary. */ static void addToSet (ACEIN command_in, DICT *dict) { char cutter, *cutset, *word ; if (aceInStep (command_in, '"')) cutset = "|\"" ; else cutset = "| \t" ; while ((word = aceInWordCut (command_in, cutset, &cutter))) { dictAdd (dict, word, 0) ; if (cutter != '|') break ; } return; } /* This function determines whether or not to create a seg. * I did design some really snappy code to condense the * decision into a few lines, but it was so opaque * I threw it away, preferring dozens of lines you can * understand easily to a few you can't. */ static BOOL IsRequired(SmapConvInfo *conv_info) { BOOL IsRequired = FALSE; KEY method_key = str2tag("Method"), method ; KEY source_key = str2tag("GFF_source") ; int unused; if (!conv_info->method_set && !conv_info->sources_dict) IsRequired = TRUE; if (conv_info->method_set) /* is there a method set? */ { if (conv_info->key == conv_info->seq_orig) /* is this the parent seg? */ { IsRequired = TRUE; } else { if (bIndexTag(conv_info->key, method_key) /* is there a method tag? */ && bsGetKey(conv_info->obj, method_key, &method)) /* can I get the key? */ { if (keySetFind(conv_info->method_set, method, &unused)) /* is it in the set? */ { IsRequired = TRUE; } else /* ie not in the set */ { IsRequired = FALSE; } } else /* ie no method */ if (conv_info->include_methods) IsRequired = FALSE; else IsRequired = TRUE; } /* if (key == seqKey) */ } /* if (method_set) */ /* it may be I can store some of this stuff so I don't have to create an object */ /* each time, but for now I'll go along with this. */ if (conv_info->sources_dict) /* is there a list of sources? */ { if (bIndexTag(conv_info->key, method_key) /* does obj have a method tag? */ && bsGetKey(conv_info->obj, method_key, &method)) /* can I get the key? */ { OBJ method_obj; char *source; if ((method_obj = bsCreate(method))) { if (bsFindTag(method_obj, source_key) /* is there a GFF_source tag? */ && bsGetData(method_obj, source_key, _Text, &source) /* can I get the data from it? */ && dictFind(conv_info->sources_dict, source, &unused)) /* can I find it in the set? */ { if (conv_info->include_sources) IsRequired = TRUE; else IsRequired = FALSE; } } else { if (conv_info->include_sources) IsRequired = FALSE; else IsRequired = TRUE; } } else { if (conv_info->include_sources) IsRequired = FALSE; else IsRequired = TRUE; } } return IsRequired; } /* This is rather similar to the IsRequired() function and I may want to merge the two. * This is only called for objects that have methods. * Note that filtering on source has not been fully implemented, so don't expect it to be right. * Filtering on method and source are currently mutually exclusive, so one function is enough. */ static BOOL FilterOut(SmapConvInfo *conv_info, KEY method) { BOOL result = FALSE; int unused; /* check method */ if (conv_info->method_set && !(keySetFind(conv_info->method_set, method, &unused))) result = TRUE; /* check source */ if (conv_info->sources_dict) { OBJ method_obj; char *source; KEY source_key = str2tag("GFF_source"); if ((method_obj = bsCreate(method))) { if (bsFindTag(method_obj, source_key) /* is there a GFF_source tag? */ && bsGetData(method_obj, source_key, _Text, &source) /* can I get the data from it? */ && dictFind(conv_info->sources_dict, source, &unused)) /* can I find it in the set? */ { if (conv_info->include_sources == FALSE) result = TRUE; } else if (conv_info->include_sources == TRUE) result = TRUE; } } return result; } /* * Some debug utilities. * * (these should be rationalised with stuff in fmapcontrol.c) */ static void segDumpToStdout(Array segs) { ACEOUT dest = aceOutCreateToStdout(0); fmapDumpSegs(segs, dest); aceOutDestroy(dest); return ; } /**********************************end of file**********************************************/