/* File: smapconvert.c * Author: Simon Kelley (srk@sanger.ac.uk) & Ed Griffiths (edgrif@sanger.ac.uk) * Copyright (c) J Thierry-Mieg and R Durbin, 2001 *------------------------------------------------------------------- * Acedb is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * or see the on-line version at http://www.gnu.org/copyleft/gpl.txt *------------------------------------------------------------------- * This file is part of the ACEDB genome database package, written by * Richard Durbin (Sanger Centre, UK) rd@sanger.ac.uk, and * Jean Thierry-Mieg (CRBM du CNRS, France) mieg@kaa.crbm.cnrs-mop.fr * * Description: Uses the smap system to produce arrays of features, * these arrays are then used by the GFF dumper or various * display routines. * * Exported functions: See smapconvert.h * HISTORY: * Last edited: Nov 12 10:15 2002 (edgrif) * Created: Wed Dec 12 10:17:57 2001 (edgrif) * CVS info: $Id: smapconvert.c,v 1.34 2002/11/12 10:36:39 edgrif Exp $ *------------------------------------------------------------------- */ #include #include #include #include #include #include #include #include #include #include #include #include /* THERE IS A LOOMING PROBLEM HERE, WHAT IF Y1 == Y2 AS IT DOES WHEN STUFF */ /* GETS CLIPPED OR MAY DO FOR SOME ONE PIXEL LONG FEATURES ????????? */ /* */ /* This can happen for exons but that's all at the moment, we handle it for * exons by looking in the exons parent for strandedness. */ /* This macro sets the coords and type in the seg. * Note that co-ords in segs are ZERO-based, with the first base of the * sequence in the MASTER seg being zero. The results of sMapMap are * ONE-based. Whenever we call sMapMap, we need to reduce the results by * one. Hmph. * Also note that the coords that come out of smap will be reversed for * features that are flipped, this means that they must be flipped back to * go into the seg. In segs _all_ coords are held the same way round: * smallest first, it is the "type" field (e.g. SEQUENCE, SEQUENCE_UP) that * determines the orientation. * I'd prefer this to be a subroutine but performance is likely to be an * issue as this must be done for all segs.*/ #define SMAP2SEG(SEG, Y1, Y2, TYPE, TYPE_UP) \ ((Y1) < (Y2) ? ((SEG)->x1 = ((Y1) - 1), (SEG)->x2 = ((Y2) - 1), (SEG)->type = (TYPE)) \ : ((SEG)->x1 = ((Y2) - 1), (SEG)->x2 = ((Y1) - 1), (SEG)->type = (TYPE_UP))) typedef struct _SMapFeatureMapRec { STORE_HANDLE handle ; KEY seqOrig ; KEY seqKey ; SMap *smap ; int start ; int stop ; int length ; int fullLength ; /* probably don't need this... */ int origin ; int zoneMin, zoneMax ; Array dna ; KEYSET feature_set ; DICT *featDict ; MethodCache mcache ; Array segs ; Array seqInfo ; Array homolInfo ; BitSet homolBury ; /* same index as homolInfo */ BitSet homolFromTable ; /* MatchTable or Tree? (index as homolInfo) */ } SMapFeatureMapRec ; typedef struct { SMap *smap ; /* Current smap. */ SMapKeyInfo *info ; KEY key ; /* Key of current object. */ OBJ obj ; /* Current object. */ Array units ; /* Used for bsFlatten of features. */ KEYSET feature_set ; /* which features are to be smap'ed. */ Array segs_array ; /* Segs array to add features to. */ int parent_seg_index ; /* Index of parent seg in segs_array. */ STORE_HANDLE segs_handle ; /* For any segs related allocation. */ Array seqinfo ; /* Separate of info. describing sets of segs. */ int seqinfo_index ; /* Index to current seqinfo. */ Array homol_info ; BitSet homolBury ; /* same index as homolInfo */ BitSet homolFromTable ; /* MatchTable or Tree? (index as homolInfo) */ int length ; /* only needed once, can we get rid of */ /* it altogether....*/ } SmapConvInfo ; /* structs for holding read pair information (e.g. ESTs). */ typedef struct { KEY key ; /* homol key. */ int pos ; /* seg coord of homol nearest to its */ /* paired read. */ } ReadPosStruct, *ReadPos ; typedef struct { int homol_index ; /* a paired read homol seg, used for copying parent key etc. */ ReadPosStruct five_prime ; ReadPosStruct three_prime ; } ReadPairStruct, *ReadPair ; static SMapFeatureMap doFeatureMap(KEY seq, int start, int stop, BOOL get_dna, DICT *features, BOOL include_features) ; static BOOL dumpGFF(SMapFeatureMap map, int version, KEY *refSeq, int *offPtr, BOOL isList, DICT *sourceSet, DICT *featSet, ACEOUT gff_out) ; /* Master routine, calls the individual convertor routines. */ static void convertObj(SmapConvInfo *conv_info, DICT *featDict) ; /* Conversion routines, one per type of feature. */ /* */ static void convertHomol(SmapConvInfo *conv_info) ; static void convertExons(SmapConvInfo *conv_inf, BOOL *exons_found) ; static void convertCDS(SmapConvInfo *conv_info, int *cds_min_out, BOOL exons_found) ; static void convertStart(SmapConvInfo *conv_info, int cds_min) ; static void convertFeature(SmapConvInfo *conv_info, DICT *featDict) ; static void convertVisible(SmapConvInfo *conv_info) ; static void convertAssemblyTags(SmapConvInfo *conv_info) ; static void convertAllele(SmapConvInfo *conv_info) ; static void convertCloneEnd(SmapConvInfo *conv_info, KEY clone_tag) ; static void convertOligo(SmapConvInfo *conv_info) ; static void convertConfirmedIntrons(SmapConvInfo *conv_info) ; static void convertEMBLFeature(SmapConvInfo *conv_info) ; static void convertSplices(SmapConvInfo *conv_info) ; static void convertCoding(SmapConvInfo *conv_info) ; static void addOldSegs (Array segs, Array oldSegs, MethodCache mcache) ; static void removeDuplicateIntrons(SmapConvInfo *conv_info) ; static void removeSelfHomol(Array segs) ; /* Utility functions. */ static void setDefaultMethod(OBJ obj, SEQINFO *sinf) ; static BOOL getCDSPosition(Array segs, Array seqinfo, Array dna_in, KEY parent, Array *cds, Array *index, BOOL cds_only) ; static int homolOrder(void *a, void *b) ; static void homolLines(Array segs_array, int homols_start, int homols_end) ; static void addReadPair(Array all_paired_reads, Associator key2readpair, KEY this_read, KEY paired_read, Array segs, int homols_start, int homols_end) ; static void makePairedReadSegs(Array segs, Array all_paired_reads) ; static int segOrder(void *a, void *b) ; static BOOL segFindBounds(Array segs, SegType type, int *min, int *max) ; static void segSetClip(SEG *seg, SMapStatus status, SegType feature, SegType feature_up) ; static void segSetPosType(SEG *parent_seg, SEG *seg, SMapStatus status, int y1, int y2, SegType down, SegType up) ; static consMapDNAErrorReturn callback(KEY key, int position) ; static KEYSET allMethods(void) ; static void addToSet (ACEIN command_in, DICT *dict) ; /* For dumping seg information. */ static void segDump(Array segs, ACEOUT dest) ; static void fMapLookDataDump(FeatureMap look, ACEOUT dest) ; static BOOL reportErrors_G ; /* */ /* ---------------- External routines --------------------------------- */ /* */ /* The business: this is the routine that will construct the array of */ /* features to be used by fmap, the GFF dumper etc. */ /* */ /* Some notes - */ /* */ /* This routine does _NOT_ know about fmap, if you introduce fmap into this */ /* routine you WILL BE SHOT. */ /* */ /* The arrays passed in are expected to be allocated but empty and will be */ /* filled with seg/seqinfo/homol data. This gives the caller the freedom to */ /* create/recreate/destroy arrays on handles or not as they see fit. */ /* */ BOOL sMapFeaturesMake(SMap *smap, KEY seqKey, int start, int stop, int length, Array oldsegs, BOOL complement, MethodCache mcache, Array dna, KEYSET feature_set, DICT *featDict, Array segs_inout, Array seqinfo_inout, Array homolInfo_inout, BitSet homolBury_inout, BitSet homolFromTable_inout, STORE_HANDLE segsHandle) { BOOL result = FALSE ; KEYSET allKeys; int i; SEG *seg; Array segs = segs_inout ; Array seqinfo = seqinfo_inout ; Array units = NULL; KEY method_key = str2tag("Method") ; SEQINFO *sinf ; /* some more thought needed about how this is filled in and persists in */ /* this function, some room for optimising. */ SmapConvInfo conv_info ; int tmp_segindex = 0, tmp_seqinfoindex = 0 ; /* Sanity check, if caller accidentally excludes original sequence then */ /* nothing will get mapped properly (I think that's right), so warn and */ /* return. */ if (feature_set && bIndexTag(seqKey, method_key)) { OBJ obj = NULL ; KEY method = KEY_UNDEFINED ; int unused = 0 ; if ((obj = bsCreate(seqKey))) { BOOL no_sequence = FALSE ; if (bsGetKey(obj, method_key, &method) && !(keySetFind(feature_set, method, &unused))) no_sequence = TRUE ; bsDestroy(obj) ; if (no_sequence) { messerror("Cannot smapconvert sequence %s because its method (\"%s\") " "was excluded from the convert.", name(seqKey), name(method)) ; return FALSE ; } } } allKeys = sMapKeys(smap, NULL); /* The master seg, n.b. has no parent. */ seg = arrayp(segs, arrayMax(segs), SEG); seg->parent = seg->source = 0; seg->key = seqKey; seg->x1 = start; seg->x2 = stop; seg->type = MASTER; seg->flags = SEGFLAG_UNSET ; for (i = 0 ; i < keySetMax(allKeys) ; i++) { KEY key = keySet(allKeys, i); SMapKeyInfo *info = sMapKeyInfo(smap, key); int y1, y2; SMapStatus status ; status = sMapMap(info, 1, 0, &y1, &y2, NULL, NULL) ; if (SMAP_STATUS_OVERLAP(status)) { int sinf_index = arrayMax(seqinfo) ; OBJ obj = NULL ; if ((obj = bsCreate(key))) { KEY method = KEY_UNDEFINED ; int unused = 0 ; int parent_seg_index; /* note: only exclude if object contains a method and it's not */ /* in the feature set. */ if (feature_set) { if (bIndexTag(key, method_key) && bsGetKey(obj, method_key, &method) && !(keySetFind(feature_set, method, &unused))) { bsDestroy(obj) ; continue ; } } parent_seg_index = arrayMax(segs); seg = arrayp(segs, parent_seg_index, SEG); seg->source = sMapParent(info); seg->key = seg->parent = key; seg->data.i = sinf_index ; seg->flags = SEGFLAG_UNSET ; SMAP2SEG(seg, y1, y2, SEQUENCE, SEQUENCE_UP) ; /* Set up method, gaps etc. for this feature. * If the sequence is gappy, hold a pointer to the Map array in the sMap */ sinf = arrayp(seqinfo, sinf_index, SEQINFO) ; if (bsGetKey (obj, str2tag("Method"), &sinf->method)) { if (bsGetData (obj, _bsRight, _Float, &sinf->score)) sinf->flags |= SEQ_SCORE ; } else setDefaultMethod(obj, sinf) ; if (status & SMAP_STATUS_INTERNAL_GAPS) sinf->gaps = sMapMapArray(info); /* Create segs for any features hanging off this sequence obj. */ units = arrayReCreate (units, 256, BSunit); conv_info.smap = smap ; conv_info.info = info ; conv_info.key = key ; conv_info.obj = obj ; conv_info.units = units ; conv_info.feature_set = feature_set ; conv_info.segs_array = segs ; conv_info.parent_seg_index = parent_seg_index ; conv_info.segs_handle = segsHandle ; conv_info.seqinfo = seqinfo ; conv_info.seqinfo_index = sinf_index ; conv_info.homol_info = homolInfo_inout ; conv_info.homolBury = homolBury_inout ; conv_info.homolFromTable = homolFromTable_inout ; conv_info.length = length ; convertObj(&conv_info, featDict) ; bsDestroy(obj); } } } /* We used to do the complementing here but this seems to come out in the wash * now, do a sanity check with Simon about this. */ /* The previously displayed segs contain segs that were calculated * (e.g. genefinder), these must be transferred to the new segs. */ if (oldsegs) addOldSegs(segs_inout, oldsegs, mcache) ; arraySort(segs, segOrder) ; /* must sort for FindCoding and */ /* RemoveSelfHomol */ convertCoding(&conv_info) ; /* make CODING segs */ /* What causes the duplicates, surely better not to make them in the first */ /* place ?? something to investigate further.... */ removeDuplicateIntrons(&conv_info) ; removeSelfHomol(conv_info.segs_array) ; #ifdef ED_G_NEVER_INCLUDE_THIS_CODE /* ACEMBLY only, add later. */ fMapTraceFindMultiplets (look) ; /* make virtual multiplets */ #endif /* ED_G_NEVER_INCLUDE_THIS_CODE */ return TRUE ; } void sMapFeaturesComplement(int length, Array segs, Array seqinfo) { int i, tmp, top = length - 1 ; SEG *seg ; SEQINFO *sinf ; /* Transform all the seg coordinates. */ seg = arrp(segs, 1, SEG) ; for (i = 1 ; i < arrayMax(segs) ; ++i, ++seg) { /* Do the complement. */ tmp = seg->x1 ; seg->x1 = top - seg->x2 ; seg->x2 = top - tmp ; if (seg->type >= SEQUENCE && seg->type <= ALLELE_UP) { seg->type ^= 1 ; /* There must be a more elegant way to do this but I can't think of it at the moment... */ { BOOL top = FALSE, bot = FALSE ; if (seg->flags & SEGFLAG_CLIPPED_TOP) { seg->flags &= !SEGFLAG_CLIPPED_TOP ; bot = TRUE ; } if (seg->flags & SEGFLAG_CLIPPED_BOTTOM) { seg->flags &= !SEGFLAG_CLIPPED_BOTTOM ; top = TRUE ; } if (bot) seg->flags |= SEGFLAG_CLIPPED_BOTTOM ; if (top) seg->flags |= SEGFLAG_CLIPPED_TOP ; } } /* Now do some seg-type specific stuff. */ if ((seg->type | 0x1) == SPLICED_cDNA_UP) seg->source = top - seg->source ; } /* Some exon segs have CDS information attached, this information is in */ /* the seqinfo array and includes coordinates that must be transformed. */ for (i = 0 ; i < arrayMax(seqinfo) ; i++) { sinf = arrp(seqinfo, i, SEQINFO) ; if (sinf->flags & SEQ_CDS) { tmp = sinf->cds_info.cds_seg.x1 ; sinf->cds_info.cds_seg.x1 = top - sinf->cds_info.cds_seg.x2 ; sinf->cds_info.cds_seg.x2 = top - tmp ; sinf->cds_info.cds_seg.type ^= 1 ; /* There must be a more elegant way to do this but I can't think of it at the moment... */ { BOOL top = FALSE, bot = FALSE ; if (sinf->cds_info.cds_seg.flags & SEGFLAG_CLIPPED_TOP) { sinf->cds_info.cds_seg.flags &= !SEGFLAG_CLIPPED_TOP ; bot = TRUE ; } if (sinf->cds_info.cds_seg.flags & SEGFLAG_CLIPPED_BOTTOM) { sinf->cds_info.cds_seg.flags &= !SEGFLAG_CLIPPED_BOTTOM ; top = TRUE ; } if (bot) sinf->cds_info.cds_seg.flags |= SEGFLAG_CLIPPED_BOTTOM ; if (top) sinf->cds_info.cds_seg.flags |= SEGFLAG_CLIPPED_TOP ; } } } return ; } /* smap based gif get code, doesn't use fex but does use smap and NO fmap */ /* look. It also */ /* */ /* this is my version of fMapGifGet() which allows for selection of which */ /* sets of features will be included. */ /* */ /* When we come to fex stuff we can easily add an smapfexget call which will */ /* be the same except we'll produce fex features instead of segs. */ /* */ /* */ SMapFeatureMap sMapFeaturesCreate(ACEIN command_in, ACEOUT result_out, SMapFeatureMap map_in) { SMapFeatureMap map = NULL ; KEY key ; int x1, x2 ; char *word ; STORE_HANDLE handle = NULL ; DICT *features = NULL ; BOOL dna_get = FALSE ; BOOL already_features, inclusive ; if (map_in) sMapFeaturesDestroy(map_in) ; handle = handleCreate() ; /* try to get from graph */ if (!(word = aceInWord(command_in))) { aceOutPrint(result_out, "// gif smapget error: no sequence specified.\n") ; goto usage ; } /* this test is probably wrong now, we probably want to do non-sequence */ /* objects...in fact that is sure to be true. */ if (!lexword2key(word, &key, _VSequence)) { aceOutPrint (result_out, "// gif smapget error: Sequence %s not known\n", word) ; goto usage ; } already_features = FALSE ; x1 = 1 ; x2 = 0 ; /* default for whole sequence */ while (aceInCheck (command_in, "w")) { word = aceInWord(command_in) ; if (strcmp (word, "-dna") == 0) { dna_get = TRUE ; } else if (strcmp (word, "-coords") == 0) { if (!aceInInt (command_in, &x1) || !aceInInt (command_in, &x2) || (x2 == x1)) goto usage ; } else if (strcmp (word, "-feature") == 0 && aceInCheck (command_in, "w")) { if (already_features) goto usage ; features = dictHandleCreate(16, handle) ; addToSet(command_in, features) ; inclusive = TRUE ; already_features = TRUE ; } else if (strcmp (word, "+feature") == 0 && aceInCheck (command_in, "w")) { if (already_features) goto usage ; features = dictHandleCreate(16, handle) ; addToSet(command_in, features) ; inclusive = FALSE ; already_features = TRUE ; } else goto usage ; } #ifdef ED_G_NEVER_INCLUDE_THIS_CODE if (x1 < x2 || (x1 == 1 && x2 == 0)) look = fMapCreateLook(key, x1, x2, FALSE, fmap_data) ; else look = fMapCreateLook(key, x2, x1, FALSE, fmap_data) ; #endif /* ED_G_NEVER_INCLUDE_THIS_CODE */ if (x1 < x2 || (x1 == 1 && x2 == 0)) map = doFeatureMap(key, x1, x2, dna_get, features, inclusive) ; else map = doFeatureMap(key, x2, x1, dna_get, features, inclusive) ; if (!map) { aceOutPrint (result_out, "// gif smapget error: could not make map for %s\n", name(key)) ; if (handle) handleDestroy (handle) ; return NULL ; } #ifdef ED_G_NEVER_INCLUDE_THIS_CODE /* Unsure what to do here...fill in later... */ /* Is the zone used in the gff dump ????? */ /* set the zone */ if (x1 == 1 && x2 == 0) { int i ; for (i = 1 ; i < arrayMax(look->segs) ; ++i) { if (arrp(look->segs,i,SEG)->key == key) { setZone(look, arrp(look->segs,i,SEG)->x1, arrp(look->segs,i,SEG)->x2 + 1) ; break ; } } } else if (x1 > x2) fMapRC (look) ; #endif /* ED_G_NEVER_INCLUDE_THIS_CODE */ if (handle) handleDestroy (handle) ; return map ; usage: aceOutPrint (result_out, "// gif smapseqget error, usage:\n" "\tsmapget [sequence [-coords x1 x2][-feature feat1|feat2]]\n") ; if (handle) handleDestroy (handle) ; return NULL ; } void sMapFeaturesDestroy(SMapFeatureMap map) { if (map) { handleDestroy(map->handle) ; messfree (map) ; } return; } /* Dump features in GFF format. */ /* */ /* Returns TRUE if dump successful, FALSE otherwise. */ /* */ BOOL sMapFeaturesGFF(ACEIN command_in, ACEOUT result_out, SMapFeatureMap map) { BOOL result = FALSE ; KEY refseq = 0 ; int x1, x2, offset ; int version = 2 ; char *word ; STORE_HANDLE handle = handleCreate() ; DICT *sourceSet = dictHandleCreate (16, handle) ; DICT *featSet = dictHandleCreate (16, handle) ; BOOL isList = FALSE ; ACEOUT fo = NULL, dump_out = NULL ; if (command_in && result_out && map) { /* by default output goes to same place as normal command results */ dump_out = aceOutCopy(result_out, 0); result = TRUE ; while (result && aceInCheck(command_in, "w")) { word = aceInWord (command_in) ; if (strcmp (word, "-coords") == 0 && aceInInt (command_in, &x1) && aceInInt (command_in, &x2) && (x2 != x1)) { ; #ifdef ED_G_NEVER_INCLUDE_THIS_CODE /* I need to sort out the zone stuff.... */ setZoneUserCoords (look, x1, x2) ; #endif /* ED_G_NEVER_INCLUDE_THIS_CODE */ } else if (strcmp (word, "-file") == 0 && aceInCheck (command_in, "w") && (fo = aceOutCreateToFile (aceInPath (command_in), "w", 0))) { /* replace default output with file output */ aceOutDestroy (dump_out); dump_out = fo; } else if (strcmp (word, "-refseq") == 0 && aceInCheck (command_in, "w") && (lexword2key (aceInWord (command_in), &refseq, _VSequence))) { ; } else if (strcmp (word, "-version") == 0 && aceInInt (command_in, &version)) { ; } else if (strcmp (word, "-list") == 0) { isList = TRUE ; } else if (strcmp (word, "-source") == 0 && aceInCheck (command_in, "w")) { addToSet (command_in, sourceSet) ; } else if (strcmp (word, "-feature") == 0 && aceInCheck (command_in, "w")) { addToSet (command_in, featSet) ; } else { result = FALSE ; aceOutPrint (result_out, "// gif seqfeatures error: usage: SEQFEATURES [-coords x1 x2] ") ; aceOutPrint (result_out, "[-file fname] [-refseq sequence] [-version 1|2] ") ; aceOutPrint (result_out, "[-list] [-source source(s)] [-feature feature(s)]\n") ; } } } if (result) result = dumpGFF(map, version, &refseq, &offset, isList, sourceSet, featSet, dump_out) ; if (result) aceOutPrint(result_out, "// FMAP_FEATURES %s %d %d\n", freeprotect (name(refseq)), (map->zoneMin + 1 + offset), (map->zoneMax+offset)) ; if (dump_out) aceOutDestroy(dump_out); if (handle) handleDestroy(handle) ; return result ; } /* This routine makes a keyset containing keys of method objects in one of */ /* two ways: */ /* If add = TRUE then we start with an empty keyset and add all the */ /* keys of methods found in the database whose names match one of the names */ /* in the "features" dict parameter. */ /* else we subtract from the set of all method keys, the keys of */ /* those methods matching the names in the "features" dict. */ /* */ /* i.e. this routine can be used to include or exclude methods. */ /* */ /* Returns a valid keyset of method objects or NULL if no methods match the */ /* supplied "feature" list. It is the callers responsibility to destroy the */ /* keyset when finished with. */ /* */ KEYSET sMapFeaturesSet(STORE_HANDLE handle, DICT *features, BOOL inclusive) { KEYSET result = NULL; int i ; if (inclusive) result = keySetHandleCreate(handle) ; else result = allMethods() ; if (result) { for (i = 0 ; i < dictMax(features) ; i++) { KEY kp = KEY_UNDEFINED ; if (lexword2key(dictName(features, i), &kp, _VMethod)) { if (inclusive) keySetInsert(result, kp) ; else keySetRemove(result, kp) ; } } if (keySetMax(result) == 0) { keySetDestroy(result) ; result = NULL ; } } return result ; } /* */ /* ---------------- Internal routines --------------------------------- */ /* */ static consMapDNAErrorReturn callback(KEY key, int position) { BOOL OK; if (!reportErrors_G) return sMapErrorReturnContinue; OK = messQuery("Mismatch error at %d in %s.\n" "Do you wish to see further errors?", position, name(key)); return OK ? sMapErrorReturnContinue : sMapErrorReturnSilent; } /* if (x1 < x2 || (x1 == 1 && x2 == 0)) look = fMapCreateLook(key, x1, x2, FALSE, fmap_data) ; else look = fMapCreateLook(key, x2, x1, FALSE, fmap_data) ; */ static SMapFeatureMap doFeatureMap(KEY seq, int start, int stop, BOOL get_dna, DICT *features, BOOL include_features) { SMapFeatureMap map = NULL ; BOOL status = TRUE ; BOOL make_map ; map = (SMapFeatureMap)messalloc(sizeof(SMapFeatureMapRec)) ; map->handle = handleCreate() ; map->seqOrig = seq ; if (sMapTreeRoot(seq, start, stop, &seq, &start, &stop)) map->seqKey = seq ; else status = FALSE ; if (status) { map->smap = sMapCreate(map->handle, map->seqKey, start, stop, NULL); } if (status) { if (get_dna) { #ifdef ED_G_NEVER_INCLUDE_THIS_CODE reportErrors_G = look->reportDNAMismatches; #endif /* ED_G_NEVER_INCLUDE_THIS_CODE */ if (!(map->dna = sMapDNA(map->smap, map->handle, callback))) status = FALSE ; } } if (status) { if (features) map->feature_set = sMapFeaturesSet(map->handle, features, include_features) ; map->length = sMapMax(map->smap) ; map->zoneMin = 0 ; map->zoneMax = stop - start + 1 ; map->start = start - 1 ; map->stop = stop - 1 ; /* just for testing, don't think this is needed for gif etc. style stuff. */ map->fullLength = sMapLength (seq) ; map->featDict = dictHandleCreate (1000, map->handle) ; dictAdd (map->featDict, "null", 0) ; /* so real entries are non-zero */ map->mcache = methodCacheCreate(map->handle) ; map->segs = arrayHandleCreate(256, SEG, map->handle) ; map->homolInfo = arrayHandleCreate(256, HOMOLINFO, map->handle) ; map->homolBury = NULL ; map->homolFromTable = NULL ; map->seqInfo = arrayHandleCreate(32, SEQINFO, map->handle) ; } /* Produce the segs array. */ if (status) { if (!(make_map = sMapFeaturesMake(map->smap, map->seqKey, map->start, map->stop, map->length, NULL, FALSE, map->mcache, map->dna, map->feature_set, map->featDict, map->segs, map->seqInfo, map->homolInfo, map->homolBury, map->homolFromTable, map->handle))) status = FALSE ; } if (!status) { sMapFeaturesDestroy(map) ; map = NULL ; } return map ; } /* Hope this is correct for the gff dumper...need to check....with simon.... */ /* Allowable results must be class sequence, not have exons and * must be Genomic_canonical or have the Link tag set. */ static BOOL pred(KEY key) { if (class(key) != _VSequence) return FALSE; if (bIndexTag(key, str2tag("Source_exons"))) return FALSE; return TRUE; } /* this routine is the old gff dumper extracted from fmapcontrol.c it needs */ /* considerable tidying up. */ static BOOL dumpGFF(SMapFeatureMap map, int version, KEY *refSeq, int *offPtr, BOOL isList, DICT *sourceSet, DICT *featSet, ACEOUT gff_out) { int i ; SEG *seg ; HOMOLINFO *hinf ; SEQINFO *sinf ; METHOD *meth ; KEY seqKey, sourceKey ; int x, y, type, offset = 0 ; int chopped ; /* only used for version 1 */ int tmp = 0, reversed = 0 ; /* used by AcePerl */ char *featName, *sourceName, *tagText = 0 ; float score ; char strand, frame ; BOOL flipped ; BOOL isScore ; STORE_HANDLE handle = handleCreate() ; Associator key2sinf = assHandleCreate (handle) ; Associator key2source = assHandleCreate (handle) ; Associator key2feat = assHandleCreate (handle) ; Array stats = arrayHandleCreate (64, int, handle) ; DICT *listSet = 0 ; /* first establish seqKey and offset */ seqKey = 0 ; if (refSeq && !*refSeq && version > 1) *refSeq = map->seqOrig ; if (refSeq && *refSeq) { for (i = 1 ; i < arrayMax(map->segs) ; ++i) { seg = arrp(map->segs, i, SEG) ; if ((seg->type == SEQUENCE || seg->type == SEQUENCE_UP) && seg->key == *refSeq) break ; } if (i < arrayMax(map->segs)) { if (seg->type == SEQUENCE || seg->type == SEQUENCE_UP) { seqKey = *refSeq ; offset = - seg->x1 ; reversed = seg->type == SEQUENCE_UP ? 1 : 0; } } else { messout ("Can't find reference sequence %s", name (*refSeq)) ; return FALSE ; } } /* find minimal spanning sequence */ if (!seqKey) { x = map->zoneMin + 1 ; y = map->zoneMax ; seqKey = 0 ; if (!sMapFindSpan(map->smap, &seqKey, &x, &y, pred)) seqKey = map->seqKey ; if (x > y) /* what if reversed? */ { messout ("Can't GFF dump from reversed sequences for now. Sorry.") ; return FALSE ; } offset = x - (map->zoneMin + 1) ; /* x changed in sMapFindSpan */ } if (refSeq) *refSeq = seqKey ; if (offPtr) *offPtr = offset ; if (!isList) { aceOutPrint (gff_out, "##gff-version %d\n", version) ; aceOutPrint (gff_out, "##date %s\n", timeShow (timeParse ("today"))) ; aceOutPrint (gff_out, "##sequence-region %s %d %d %s\n", name(seqKey), (map->zoneMin + 1 + offset), (map->zoneMax + offset), reversed ? "(reversed)" : "") ; } if (isList) listSet = dictHandleCreate (64, handle) ; for (i = 0 ; i < arrayMax(map->segs) ; ++i) { seg = arrp(map->segs, i, SEG) ; if (seg->x1 > map->zoneMax || seg->x2 < map->zoneMin) continue ; if (seg->type == MASTER || seg->type == VISIBLE || seg->type == CDS || seg->type == CDS_UP || seg->type == DNA_SEQ || seg->type == PEP_SEQ || seg->type == ORF || seg->type == TRANS_SEQ || seg->type == TRANS_SEQ_UP) continue ; if (seg->type & 0x01 && (seg->type >= SEQUENCE && seg->type <= ALLELE_UP)) { type = seg->type - 1 ; x = seg->x2+1 ; y = seg->x1+1 ; } else { type = seg->type ; x = seg->x1+1 ; y = seg->x2+1 ; } chopped = 0 ; if (x <= y) { flipped = FALSE ; strand = '+' ; if (version == 1 && x <= map->zoneMin) chopped = map->zoneMin + 1 - x ; } else { int tmp = x ; x = y ; y = tmp ; flipped = TRUE ; strand = '-' ; if (version == 1 && y > map->zoneMax) chopped = y - map->zoneMax ; } if (version == 1) /* clip */ { if (x <= map->zoneMin) x = map->zoneMin + 1 ; if (y > map->zoneMax) y = map->zoneMax ; } x += offset ; y += offset ; frame = '.' ; /* other defaults */ isScore = (version == 1) ? TRUE : FALSE ; score = 0.0 ; sourceName = (version == 1 || version == 2) /* mieg, why not in version2 ? */ ? fMapSegTypeName[type] : 0 ; featName = 0 ; sourceKey = 0 ; sourceName = ""; /* LS otherwise everything ends up being a SEQUENCE */ switch (type) { case SEQUENCE: case EXON: case INTRON: sinf = arrp(map->seqInfo, seg->data.i, SEQINFO) ; if ( seg->key && (version == 2) ) { featName = className(seg->key); } else featName = (type == SEQUENCE) ? "sequence" : (type == EXON) ? "exon" : "intron" ; if (sinf->method) { sourceKey = sinf->method ; } if (type == SEQUENCE) { assInsert (key2sinf, assVoid(seg->key), sinf) ; if (sinf->flags & SEQ_SCORE) { score = sinf->score ; isScore = TRUE ; } } break ; case CODING: if (version == 1) featName = "coding_exon" ; else featName = "CDS" ; assFind (key2sinf, assVoid(seg->parent), &sinf) ; if (sinf->method) sourceKey = sinf->method ; switch (seg->data.i%3) { case 0: frame = '0' ; break ; case 1: frame = '2' ; break ; case 2: frame = '1' ; break ; } break ; case HOMOL: hinf = arrp(map->homolInfo, seg->data.i, HOMOLINFO) ; score = hinf->score ; sourceKey = hinf->method ; featName = "similarity" ; meth = methodCacheGet(map->mcache, hinf->method, "") ; if (!(meth->flags & METHOD_STRAND_SENSITIVE)) strand = '.' ; if (meth->flags & METHOD_SCORE) isScore = TRUE ; if (meth->flags & METHOD_FRAME_SENSITIVE) frame = '0' ; break ; case FEATURE: case ATG: case SPLICE5: case SPLICE3: sourceKey = seg->key ; if (type == SPLICE3) featName = "splice3" ; else if (type == SPLICE5) featName = "splice5" ; else if (type == ATG) featName = "atg" ; else if (version == 1) featName = "feature" ; else featName = "misc_feature" ; meth = methodCacheGet(map->mcache, seg->key, "") ; if (meth->flags & METHOD_SCORE) { score = seg->data.f ; isScore = TRUE ; } if (!(meth->flags & METHOD_STRAND_SENSITIVE)) strand = '.' ; if (meth->flags & METHOD_FRAME_SENSITIVE) frame = '0' ; break ; case ASSEMBLY_TAG: tagText = strnew (seg->data.s, handle) ; sourceName = "assembly_tag" ; featName = tagText ; { char *cp ; for (cp = tagText ; *cp && *cp != ':' && *cp != ' ' && *cp != '\t' && *cp != '\n' ; ++cp) ; if (*cp) { *cp++ = 0 ; while (*cp == ' ' || *cp == '\t' || *cp == '\n') ++cp ; } tagText = cp ; } break ; case ALLELE: if (version == 1) sourceName = "variation" ; if (seg->data.s) { if (*seg->data.s == '-') featName = "deletion" ; else { char *cp = seg->data.s ; featName = "variation" ; while (*cp) if (!strchr ("AGCTagct", *cp++)) featName = (version == 1) ? "insertion_site" : "insertion" ; } } break ; case EMBL_FEATURE: featName = name(seg->key) ; break ; case CLONE_END: featName = name(seg->data.k) ; strand = '.' ; break ; default: ; } /* finalise source and feat fields */ if (sourceKey) { if (assFind (key2source, assVoid(sourceKey), &sourceName)) assFind (key2feat, assVoid(sourceKey), &featName) ; else { OBJ obj = bsCreate (sourceKey) ; sourceName = name(sourceKey) ; if (obj) { bsGetData (obj, str2tag("GFF_source"), _Text, &sourceName) ; if (bsGetData (obj, str2tag("GFF_feature"), _Text, &featName)) { featName = strnew (featName, handle) ; assInsert (key2feat, assVoid(sourceKey), featName) ; } } sourceName = strnew(sourceName, handle); assInsert (key2source, assVoid(sourceKey), sourceName) ; if (obj) /* only after strnew(sourceName) !!! */ bsDestroy(obj); } } if (!sourceName) /* only possible if version > 1 */ sourceName = "unknown" ; if (!featName) /* from method or from seg->type above */ { if (version == 1) featName = sourceName ; else featName = fMapSegTypeName[type] ; } if (frame >= '0' && frame <= '2' && chopped) { frame = frame + 3 - (chopped % 3) ; if (frame == '3') frame = '0' ; if (frame == '4') frame = '1' ; if (frame == '5') frame = '2' ; } if (sourceSet && featSet && (dictMax (sourceSet) || dictMax (featSet)) && !dictFind (sourceSet, sourceName, 0) && !dictFind (featSet, featName, 0)) continue ; if (isList) /* just accumulate stats */ { int k ; dictAdd (listSet, messprintf("%s\t%s",sourceName,featName), &k) ; ++array(stats, k, int) ; continue ; /* move on to next line */ } /* !isList from here on */ /* write the main part of the line */ /* LS/AcePerl: fixup reversed reference sequence */ if (reversed) { tmp = map->zoneMax + offset + 1 - y; y = map->zoneMax + offset + 1 - x; x = tmp; if ( strand == '+' ) strand = '-'; else if ( strand == '-' ) strand = '+'; } if (isScore) aceOutPrint (gff_out, "%s\t%s\t%s\t%d\t%d\t%g\t%c\t%c", name(seqKey), sourceName, featName, x, y, score, strand, frame) ; else aceOutPrint (gff_out, "%s\t%s\t%s\t%d\t%d\t.\t%c\t%c", name(seqKey), sourceName, featName, x, y, strand, frame) ; /* extras */ switch (type) { case SEQUENCE: case EXON: case CODING: if (seg->parent) { if (version == 1) aceOutPrint (gff_out, "\t%s", name(seg->parent)) ; else aceOutPrint (gff_out, "\t%s \"%s\"", className(seg->parent), name(seg->parent)) ; } break ; case INTRON: sinf = arrp(map->seqInfo, seg->data.i, SEQINFO) ; if (version == 1) { if (seg->parent) { /* RD 010129 added class for version 2 like EXON */ if (version == 1) aceOutPrint (gff_out, "\t%s", name(seg->parent)) ; else aceOutPrint (gff_out, "\t%s \"%s\"", className(seg->parent), name(seg->parent)) ; if (sinf->flags & SEQ_CONFIRMED) aceOutPrint (gff_out, " Confirmed") ; } else if (sinf->flags & SEQ_CONFIRMED) aceOutPrint (gff_out, "\tConfirmed") ; if (sinf->flags & SEQ_CONFIRM_EST) aceOutPrint (gff_out, " by_EST") ; if (sinf->flags & SEQ_CONFIRM_CDNA) aceOutPrint (gff_out, " by_cDNA") ; if (sinf->flags & SEQ_CONFIRM_HOMOL) aceOutPrint (gff_out, " by_homology") ; if (sinf->flags & SEQ_CONFIRM_UTR) aceOutPrint (gff_out, " in_UTR") ; if (sinf->flags & SEQ_CONFIRM_FALSE) aceOutPrint (gff_out, " as_false") ; } else { BOOL isData = FALSE ; if (seg->parent) { aceOutPrint (gff_out, "\tSequence \"%s\"", name(seg->parent)) ; isData = TRUE ; } if (sinf->flags & SEQ_CONFIRM_EST) { aceOutPrint (gff_out, "%s", isData ? " ; " : "\t") ; isData = TRUE ; aceOutPrint (gff_out, "Confirmed_by_EST") ; } if (sinf->flags & SEQ_CONFIRM_CDNA) { aceOutPrint (gff_out, "%s", isData ? " ; " : "\t") ; isData = TRUE ; aceOutPrint (gff_out, "Confirmed_by_cDNA") ; } if (sinf->flags & SEQ_CONFIRM_HOMOL) { aceOutPrint (gff_out, "%s", isData ? " ; " : "\t") ; isData = TRUE ; aceOutPrint (gff_out, "Confirmed_by_homology") ; } if (sinf->flags & SEQ_CONFIRM_UTR) { aceOutPrint (gff_out, "%s", isData ? " ; " : "\t") ; isData = TRUE ; aceOutPrint (gff_out, "Confirmed_in_UTR") ; } if (sinf->flags & SEQ_CONFIRM_FALSE) { aceOutPrint (gff_out, "%s", isData ? " ; " : "\t") ; isData = TRUE ; aceOutPrint (gff_out, "Confirmed_as_false") ; } } break ; case HOMOL: if (version == 1) aceOutPrint (gff_out, "\t%s:%s", className(seg->key), name(seg->key)) ; else aceOutPrint (gff_out, "\tTarget \"%s:%s\"", className(seg->key), name(seg->key)) ; hinf = arrp(map->homolInfo, seg->data.i, HOMOLINFO) ; if (flipped) aceOutPrint (gff_out, " %d %d", hinf->x2, hinf->x1) ; else aceOutPrint (gff_out, " %d %d", hinf->x1, hinf->x2) ; break ; case FEATURE: case SPLICE5: case SPLICE3: case ATG: { BOOL isTab = FALSE ; if (seg->parent) { if (version == 1) aceOutPrint (gff_out, "\t%s", utCleanNewlines(dictName(map->featDict, -seg->parent), handle)) ; else aceOutPrint (gff_out, "\tNote \"%s\"", utCleanNewlines(dictName(map->featDict, -seg->parent), handle)) ; isTab = TRUE ; } #ifdef ED_G_NEVER_INCLUDE_THIS_CODE /* need to find out about this chosen lark.... */ if (assFind (look->chosen, SEG_HASH(seg), 0)) aceOutPrint (gff_out, "%sSelected", isTab ? " ; " : "\t") ; else if (assFind (look->antiChosen, SEG_HASH(seg), 0)) aceOutPrint (gff_out, "%sAntiselected", isTab ? " ; " : "\t") ; #endif /* ED_G_NEVER_INCLUDE_THIS_CODE */ } break ; case ASSEMBLY_TAG: if (tagText && *tagText) { if (version == 1) aceOutPrint (gff_out, "\t%s", utCleanNewlines(tagText, handle)) ; else aceOutPrint (gff_out, "\tNote \"%s\"", utCleanNewlines(tagText, handle)) ; } break ; case ALLELE: if (version == 1) aceOutPrint (gff_out, "\t%s", name(seg->key)) ; else aceOutPrint (gff_out, "\tAllele \"%s\"", name(seg->key)) ; if (seg->data.s && *seg->data.s != '-') { if (version == 1) aceOutPrint (gff_out, "\t%s", utCleanNewlines(seg->data.s, handle)) ; else if (strcmp (featName, "variation") == 0) aceOutPrint (gff_out, " ; Variant \"%s\"", utCleanNewlines(seg->data.s, handle)) ; else aceOutPrint (gff_out, " ; Insert \"%s\"", utCleanNewlines(seg->data.s, handle)) ; } break ; case EMBL_FEATURE: if (seg->data.s) { if (version == 1) aceOutPrint (gff_out, "\t%s", utCleanNewlines(seg->data.s, handle)) ; else aceOutPrint (gff_out, "\tNote \"%s\"", utCleanNewlines(seg->data.s, handle)) ; } break ; case CLONE_END: if (version == 1) aceOutPrint (gff_out, "\t%s", name(seg->key)) ; else aceOutPrint (gff_out, "\tClone \"%s\"", name(seg->key)) ; break ; default: ; } aceOutPrint (gff_out, "\n") ; } if (isList) for (i = 0 ; i < arrayMax(stats) ; ++i) aceOutPrint (gff_out, "%s\t%d\n", dictName (listSet, i), arr(stats,i,int)) ; handleDestroy (handle) ; return TRUE ; } /* Actually this routine will need to be externally callable as I will want */ /* to add features to an fmap on the fly....youch.... */ /* */ /* You should be aware when altering this code that some of the calls are */ /* _order_ dependant because some routines need information from preceding */ /* calls to decide what to do, e.g. the exon and cds routines. */ /* */ static void convertObj(SmapConvInfo *conv_info, DICT *featDict) { static KEY _Feature = KEY_UNDEFINED, _Transcribed_gene, _Clone_left_end, _Clone_right_end, _Confirmed_intron, _EMBL_feature, _Splices ; SEQINFO *sinf = NULL ; int cds_min = 0 ; BOOL exons_found = FALSE ; OBJ obj ; if (_Feature == KEY_UNDEFINED) { _Feature = str2tag("Feature") ; _Transcribed_gene = str2tag("Transcribed_gene") ; _Clone_left_end = str2tag("Clone_left_end") ; _Clone_right_end = str2tag("Clone_right_end") ; _Confirmed_intron = str2tag("Confirmed_intron") ; _EMBL_feature = str2tag("EMBL_feature") ; _Splices = str2tag("Splices") ; } obj = conv_info->obj ; sinf = arrayp(conv_info->seqinfo, conv_info->seqinfo_index, SEQINFO); if (bsFindTag(obj, str2tag("Assembled_from"))) sinf->flags |= SEQ_VIRTUAL_ERRORS ; if (bsFindTag(obj, str2tag("Genomic_canonical")) || bsFindTag(obj, str2tag("Genomic"))) sinf->flags |= SEQ_CANONICAL ; if (bsFindTag(obj, _Homol)) convertHomol(conv_info) ; if (bsFindTag(obj, str2tag("Source_Exons"))) convertExons(conv_info, &exons_found) ; if (bsFindTag(obj, str2tag("CDS"))) convertCDS(conv_info, &cds_min, exons_found); if (bsFindTag(obj, str2tag("Start_not_found"))) convertStart(conv_info, cds_min) ; if (bsFindTag(obj, _Feature)) convertFeature(conv_info,featDict) ; if (bsFindTag(obj, _Visible)) convertVisible(conv_info) ; if (bsFindTag(obj, _Assembly_tags)) convertAssemblyTags(conv_info) ; if (bsFindTag(obj, _Allele)) convertAllele(conv_info) ; if (bsFindTag(obj, _Clone_left_end)) convertCloneEnd(conv_info, _Clone_left_end) ; if (bsFindTag(obj, _Clone_right_end)) convertCloneEnd(conv_info, _Clone_right_end) ; if (bsFindTag(obj, _Oligo)) convertOligo(conv_info) ; if (bsFindTag (obj, _Confirmed_intron)) convertConfirmedIntrons(conv_info) ; if (bsFindTag(obj, _EMBL_feature)) convertEMBLFeature(conv_info) ; if (bsFindTag(obj, _Splices)) convertSplices(conv_info) ; return ; } /* There is a big loop in this routine which is because the underlying database * model cannot simply be bsFlatten'd: * * Homol Float Int UNIQUE Int Int UNIQUE Int #Homol_info * * #Homol_info Align Int UNIQUE Int UNIQUE Int * // not needed if full alignment is ungapped * // one row per ungapped alignment section * // first int is position in query, second in target * // third int is length - only necessary when gaps in * // both sequences align. * * mark1 cursors down * mark2 cursors down * mark3 cursors down Float * mark4 cursors down Int * mark5 cursors down Int * * There are extra complications in that we record each set of homols processed * add information to join the homols up with lines and to join EST read pairs. */ static void convertHomol(SmapConvInfo *conv_info) { static KEY Clone = KEY_UNDEFINED, Show_in_reverse_orientation, Join_blocks, Paired_read ; KEY key = conv_info->key ; SMapKeyInfo *info = conv_info->info ; OBJ obj = conv_info->obj ; BSMARK mark1 = 0, mark2 = 0, mark3 = 0, mark4 = 0, mark5 = 0, mark6 = 0 ; KEY method, homol, tag2 ; float score; int pos1, pos2, target1, target2; BOOL show_in_reverse ; KEY prev_method = KEY_UNDEFINED ; int homols_start, homols_end ; BOOL join_blocks, multiple_homols ; BOOL paired_read ; KEY paired_key = KEY_UNDEFINED ; Array all_paired_reads ; Associator key2readpair ; if (Clone == KEY_UNDEFINED) { Clone = str2tag("Clone") ; Show_in_reverse_orientation = str2tag("Show_in_reverse_orientation") ; Join_blocks = str2tag("Join_blocks") ; Paired_read = str2tag("Paired_read") ; } all_paired_reads = arrayCreate(30, ReadPairStruct) ; key2readpair = assCreate() ; if (bsGetKeyTags(obj, _Homol, &tag2)) do { mark1 = bsMark(obj, mark1); if (bsGetKey(obj, _bsRight, &homol)) do { mark2 = bsMark(obj, mark2); /* Tags used for EST read pair processing. */ show_in_reverse = bIndexTag(homol, Show_in_reverse_orientation) ; paired_read = bIndexGetKey(homol, Paired_read, &paired_key) ; if (bsGetKey(obj, _bsRight, &method)) do { int unused ; /* continue based on "method" at this point. */ if (conv_info->feature_set && !(keySetFind(conv_info->feature_set, method, &unused))) continue ; mark3 = bsMark(obj, mark3); homols_start = 0 ; multiple_homols = FALSE ; join_blocks = bIndexTag(method, Join_blocks) ; if (bsGetData(obj, _bsRight, _Float, &score)) do { mark4 = bsMark(obj, mark4); if (bsGetData(obj, _bsRight, _Int, &pos1)) do { mark5 = bsMark(obj, mark5); if (bsGetData(obj, _bsRight, _Int, &pos2) && bsGetData(obj, _bsRight, _Int, &target1)) do { mark6 = bsMark(obj, mark6); if (bsGetData(obj, _bsRight, _Int, &target2)) { int y1, y2; SMapStatus status ; status = sMapMap(info, pos1, pos2, &y1, &y2, NULL, NULL) ; /* We should be recording homols which are off end of this * bit of sequence, and we should be doing this for exons * as well...then we could create segs to show this. */ if (SMAP_STATUS_OVERLAP(status)) { HOMOLINFO *hinf ; SEG *seg ; homols_end = arrayMax(conv_info->segs_array) ; seg = arrayp(conv_info->segs_array, homols_end, SEG); seg->source = key ; seg->key = seg->parent = homol; /* target */ bIndexGetKey(homol, Clone, &(seg->parent)) ; /* Make clone the parent if it exists. */ seg->data.i = arrayMax(conv_info->homol_info) ; hinf = arrayp(conv_info->homol_info, seg->data.i, HOMOLINFO) ; hinf->method = method; hinf->score = score; bsPushObj(obj); /* into Homo_info */ hinf->gaps = sMapLocalMap(obj, pos1, pos2, conv_info->segs_handle); /* Some features (e.g. for EST read pairs) need to be shown on the opposite strand so flip coords and flag it. */ if (show_in_reverse) { int tmp ; tmp = target1 ; target1 = target2 ; target2 = tmp ; seg->flags |= SEGFLAG_FLIPPED ; } if (target1 > target2) { SMAP2SEG(seg, y2, y1, HOMOL, HOMOL_UP); hinf->x1 = target2 ; hinf->x2 = target1 ; } else { SMAP2SEG(seg, y1, y2, HOMOL, HOMOL_UP); hinf->x1 = target1; hinf->x2 = target2; } if (prev_method != method) { prev_method = method ; homols_start = homols_end ; /* reset start. */ } else multiple_homols = TRUE ; } } bsGoto(obj, mark6); } while (bsGetData(obj, _bsDown, _Int, &target1)); bsGoto(obj, mark5); } while (bsGetData(obj, _bsDown, _Int, &pos1)); bsGoto(obj, mark4); } while (bsGetData(obj, _bsDown, _Float, &score)); if (homols_start != 0 && join_blocks) { /* Found some homols and their method says join them with lines */ /* Sorting gives us positional order of homol blocks. */ if (homols_start != homols_end) arraySortPos(conv_info->segs_array, homols_start, homolOrder) ; if (paired_read) addReadPair(all_paired_reads, key2readpair, homol, paired_key, conv_info->segs_array, homols_start, homols_end) ; if (multiple_homols) homolLines(conv_info->segs_array, homols_start, homols_end) ; } prev_method = KEY_UNDEFINED ; /* Reset to detect new method. */ bsGoto(obj, mark3); } while (bsGetKey(obj, _bsDown, &method)); bsGoto(obj, mark2); } while (bsGetKey(obj, _bsDown, &homol)); bsGoto(obj, mark1); } while (bsGetKeyTags(obj, _bsDown, &tag2)); /* Now process any paired reads into line segs. */ if (arrayMax(all_paired_reads) > 0) makePairedReadSegs(conv_info->segs_array, all_paired_reads) ; bsMarkFree(mark1); bsMarkFree(mark2); bsMarkFree(mark3); /* Clear up. */ bsMarkFree(mark4); bsMarkFree(mark5); bsMarkFree(mark6); arrayDestroy(all_paired_reads) ; assDestroy(key2readpair) ; #ifdef ED_G_NEVER_INCLUDE_THIS_CODE /* At the end of this there are two bitExtend calls which were part of the match * table stuff, without these homologies get excluded by the draw code. Simon * says this is all redundant...but need to check. */ /* This was completely missing....aggggghhhhh.......and causes serious problems.... * because homologies come and go like nobodies business.... */ /* THIS COULD BE A SEPARATE ROUTINE AND PERHAPS I'LL SPLIT IT OFF ONCE ITS WORKING... */ /* MatchTable - more homologies */ { Array ma ; MATCH *m ; HOMOLINFO *hinf ; KEY match_key ; if (bsGetKey(obj, str2tag("Match_table"), &match_key) && (ma = arrayGet (match_key, MATCH, matchFormat))) { int i ; FLAGSET fBury = flag ("Match", "Bury") ; for (i = 0 ; i < arrayMax(ma) ; ++i) { m = arrp (ma, i, MATCH) ; if (class(m->meth) != _VMethod || !m->q1 || !m->q2) continue ; seg = arrayp(conv_info->segs_array, arrayMax(conv_info->segs_array), SEG) ; seg->source = match_key ; /* the match table key */ seg->key = seg->parent = m->key ; seg->data.i = arrayMax(conv_info->homol_info) ; hinf = arrayp(conv_info->homol_info, seg->data.i, HOMOLINFO) ; hinf->method = m->meth ; hinf->score = m->score ; hinf->gaps = 0; bitSet (conv_info->homolFromTable, seg->data.i) ; if (m->t1 > m->t2) { pos1 = m->q2 ; pos2 = m->q1 ; hinf->x1 = m->t2 ; hinf->x2 = m->t1 ; } else { pos1 = m->q1 ; pos2 = m->q2 ; hinf->x1 = m->t1 ; hinf->x2 = m->t2 ; } POS_PROCESS(HOMOL,HOMOL_UP) ; if (m->flags & fBury) bitSet (conv_info->homolBury, seg->data.i) ; } arrayDestroy (ma) ; } } #endif /* ED_G_NEVER_INCLUDE_THIS_CODE */ /* necessary to ensure array long enough */ bitExtend (conv_info->homolBury, arrayMax(conv_info->homol_info)) ; bitExtend (conv_info->homolFromTable, arrayMax(conv_info->homol_info)) ; return ; } /* Notes: * - exons can legitmately be 1 base long, in this case SMAP2SEG may give * seg the wrong type so we get type from the sequence which holds the * exon information which will have the correct type. * The same is done for introns although biologically I think this must * be an error. */ static void convertExons(SmapConvInfo *conv_info, BOOL *exons_found) { int pos1, pos2 ; int i; SEG *seg, *parent_seg; int y1, y2; BOOL have_exon = FALSE; OBJ obj = conv_info->obj ; Array units = conv_info->units ; if (bsFlatten (obj, 2, units)) { KEY key = conv_info->key ; SEQINFO *sinf = arrayp(conv_info->seqinfo, conv_info->seqinfo_index, SEQINFO); BOOL exon_no_overlap = FALSE ; dnaExonsSort (units) ; /* correct if first exon doesn't start at posn 1, since sMap will have truncated to the boundaries of the exons. Cannot do anything about premature end of last exon at the moment */ if (arr(units, 0, BSunit).i > 1) { SEG *parent_seg = arrp(conv_info->segs_array, conv_info->parent_seg_index, SEG) ; if (parent_seg->type == SEQUENCE) parent_seg->x1 -= arr(units, 0, BSunit).i - 1; else if (parent_seg->type == SEQUENCE_UP) parent_seg->x2 += arr(units, 0, BSunit).i - 1; } for (pos1 = 1, i = 0 ; i < arrayMax(units) ; i += 2) { SMapStatus status ; pos2 = arr(units, i, BSunit).i ; if ((have_exon && (pos2 > (pos1+1))) || exon_no_overlap == TRUE) /* Exons may abut */ { status = sMapUnsplicedMap(conv_info->info, pos1, pos2, &y1, &y2, NULL, NULL) ; if (SMAP_STATUS_OVERLAP(status)) { seg = arrayp (conv_info->segs_array, arrayMax(conv_info->segs_array), SEG); seg->key = 0 ; seg->parent = seg->source = key ; seg->data.i = conv_info->seqinfo_index ; segSetPosType(arrp(conv_info->segs_array, conv_info->parent_seg_index, SEG), seg, status, y1, y2, INTRON, INTRON_UP) ; /* Adjust intron coords so they lie _between_ exon coords. Note that * coords can be clipped down to just one or two pixels by SMap mapping so * must be careful in adjusting coords. */ if (seg->x2 - seg->x1 < 2) seg->x2 = seg->x1 ; else { seg->x1++ ; seg->x2-- ; } if (fMapDebug) printf("INTRON: %s %d --> %d\n", name(seg->parent), seg->x1, seg->x2) ; } } pos1 = pos2 ; pos2 = arr(units, i+1, BSunit).i ; if (pos2) { status = sMapUnsplicedMap(conv_info->info, pos1, pos2, &y1, &y2, NULL, NULL) ; if (SMAP_STATUS_OVERLAP(status)) { seg = arrayp (conv_info->segs_array, arrayMax(conv_info->segs_array), SEG); seg->key = 0 ; seg->parent = seg->source = key ; seg->data.i = conv_info->seqinfo_index ; seg->flags = SEGFLAG_UNSET ; segSetPosType(arrp(conv_info->segs_array, conv_info->parent_seg_index, SEG), seg, status, y1, y2, EXON, EXON_UP) ; have_exon = TRUE ; exon_no_overlap = FALSE ; if (fMapDebug) printf("EXON: %s %d --> %d\n", name(seg->parent), seg->x1, seg->x2) ; } else { exon_no_overlap = TRUE ; } } else messerror ("Missing pos2 for exon in %s", nameWithClass(key)) ; pos1 = pos2 ; } sinf->flags |= SEQ_EXONS ; } *exons_found = have_exon; return ; } /* OK, pay attention, when the CDS gets drawn (fMapShowSequence), the * CDS seg must come _before_ the exon segs, this is guaranteed by * the numerical value of its seg->type (fmap_.h) combined with the arraySort at * the end of smapconvert. * * If the CDS start or stop positions are set then we will need to * convert positions for finding the spliced DNA, e.g. for later * protein translation. */ static void convertCDS(SmapConvInfo *conv_info, int *cds_min_out, BOOL exons_found) { KEY key = conv_info->key ; SMapKeyInfo *info = conv_info->info ; OBJ obj = conv_info->obj ; SEQINFO *sinf = arrayp(conv_info->seqinfo, conv_info->seqinfo_index, SEQINFO); SEG *seg ; int cds_min, cds_max ; int y1, y2 ; SMapStatus status ; BOOL cds_coords = FALSE ; /* CDS defaults to the whole of the spliced dna. */ cds_min = 1 ; cds_max = 0 ; if (bsGetData(obj, _bsRight, _Int, &cds_min)) { cds_coords = TRUE ; bsGetData(obj, _bsRight, _Int, &cds_max) ; } /* Try to map the CDS, if we fail because of bad start/end in database */ /* then redo as whole of source exons, otherwise just fail. */ status = sMapMap(info, cds_min, cds_max, &y1, &y2, NULL, NULL) ; if (status & SMAP_STATUS_NO_OVERLAP) { #ifdef ED_G_NEVER_INCLUDE_THIS_CODE if (cds_coords) { int tmp_min = cds_min, tmp_max = cds_max ; cds_min = 1 ; cds_max = 0 ; status = sMapMap(info, cds_min, cds_max, &y1, &y2, NULL, NULL) ; if (status == SMAP_STATUS_PERFECT_MAP || status == SMAP_STATUS_INTERNAL_GAPS) { messerror("The start and/or end of the CDS in object %s" " is outside of the spliced DNA coordinates for that object" " CDS start = %d & end = %d)." " Make sure the CDS positions have been specified" " in spliced DNA, not Source_exon, coordinates." " CDS start/end positions have been reset to start/end of" " spliced DNA.", nameWithClass(key), tmp_min, tmp_max) ; } else { messerror("Cannot smap CDS for object %s, please check your data for this object.", nameWithClass(key)) ; } } #endif /* ED_G_NEVER_INCLUDE_THIS_CODE */ } if (SMAP_STATUS_OVERLAP(status)) { seg = arrayp(conv_info->segs_array, arrayMax(conv_info->segs_array), SEG) ; seg->key = _CDS ; seg->parent = seg->source = key ; seg->flags = SEGFLAG_UNSET ; segSetPosType(arrp(conv_info->segs_array, conv_info->parent_seg_index, SEG), seg, status, y1, y2, CDS, CDS_UP) ; /* We need to record position and other CDS information for later */ /* drawing of the CDS. */ sinf->flags |= SEQ_CDS ; sinf->cds_info.cds_seg = *seg ; sinf->cds_info.cds_only = TRUE ; /* record cds_min for use in convertStart() */ *cds_min_out = cds_min ; if (fMapDebug) printf("CDS: %d --> %d\n\n", seg->x1, seg->x2) ; /* Possible that database may specify a CDS without any accompanying */ /* source exons so synthesize an exon which covers the whole of the */ /* object. */ if (!exons_found) { SEG *seg ; seg = arrayp(conv_info->segs_array, arrayMax(conv_info->segs_array), SEG) ; *seg = arr(conv_info->segs_array, conv_info->parent_seg_index, SEG) ; /* n.b. struct copy. */ seg->source = key ; seg->key = 0 ; /* unusual because we are copying from parent seg... */ SMAP2SEG(seg, (seg->x1 + 1), (seg->x2 + 1), EXON, EXON_UP) ; sinf->flags |= SEQ_EXONS ; } } return ; } /* N.B. should be called after convertCDS() as it may need cds_min for */ /* checking. */ /* */ static void convertStart(SmapConvInfo *conv_info, int cds_min) { SEQINFO *sinf = arrayp(conv_info->seqinfo, conv_info->seqinfo_index, SEQINFO); KEY key = conv_info->key ; OBJ obj = conv_info->obj ; /* Tag signals that this object is incomplete, there is more upstream */ /* somewhere. Tag can be followed by a frame shift to correct any */ /* resultant reading frame error (0 means ignore setting, default). */ /* If there is a CDS, then it just start at position 1 for this tag to */ /* be valid. */ sinf->start_not_found = 0 ; if (sinf->cds_info.cds_only && (cds_min != 1)) { messerror("Data inconsistency: the Start_not_found tag is set for object %s," " but the CDS begins at position %d instead of at 1," " tag will be ignored", name(key), cds_min) ; } else { /* Record any value found in seqinfo for later use. */ if (bsGetData(obj, _bsRight, _Int, &sinf->start_not_found)) { if (sinf->start_not_found < 1 || sinf->start_not_found > 3) { messerror("Data inconsistency: the Start_not_found tag for object %s" " should be given the value 1, 2 or 3 but has been" " set to %d," " tag will be ignored", name(key), sinf->start_not_found) ; sinf->start_not_found = 0 ; } } } return ; } static void convertFeature(SmapConvInfo *conv_info, DICT *featDict) { KEY key = conv_info->key ; SMapKeyInfo *info = conv_info->info ; OBJ obj = conv_info->obj ; Array units = conv_info->units ; if (bsFlatten (obj, 5, units)) { int j ; for (j = 0 ; j < arrayMax(units) ; j += 5) { BSunit *u = arrayp(units, j, BSunit); SEG *seg; int y1, y2; int unused ; SMapStatus status ; if (class(u[0].k != _VMethod) || !u[1].i || !u[2].i || (conv_info->feature_set && !(keySetFind(conv_info->feature_set, u[0].k, &unused)))) /* Do last for efficiency. */ continue; status = sMapMap(info, u[1].i, u[2].i, &y1, &y2, NULL, NULL) ; if (status & SMAP_STATUS_NO_OVERLAP) continue; seg = arrayp(conv_info->segs_array, arrayMax(conv_info->segs_array), SEG); seg->source = key; seg->key = u[0].k; seg->data.f = u[3].f; SMAP2SEG(seg, y1, y2, FEATURE, FEATURE_UP); if (u[4].s) { int tmp; dictAdd(featDict, u[4].s, &tmp); seg->parent = -tmp; } else seg->parent = 0; } } return ; } static void convertVisible(SmapConvInfo *conv_info) { KEY key = conv_info->key ; SMapKeyInfo *info = conv_info->info ; OBJ obj = conv_info->obj ; Array units = conv_info->units ; if (bsFlatten(obj, 2, units)) { int j ; for (j = 0 ; j < arrayMax(units) ; j += 2) { int y1, y2 ; SEG *seg ; SMapStatus status ; status = sMapMap(info, 1, 0, &y1, &y2, NULL, NULL); /* cannot fail */ if (status & SMAP_STATUS_NO_OVERLAP) continue ; if (!iskey (arr(units,j,BSunit).k) || class(arr(units,j,BSunit).k)) continue ; /* should be a tag */ if (!iskey (arr(units,j+1,BSunit).k)) continue ; /* there is a problem if the model says: Visible foo Text a hack because iskey() could be true for a text pointer should store these as different type, e.g. VISIBLE_TEXT */ seg = arrayp(conv_info->segs_array, arrayMax(conv_info->segs_array), SEG); seg->source = seg->parent = key; seg->key = arr(units, j+1, BSunit).k; seg->data.k = arr(units, j, BSunit).k; SMAP2SEG(seg, y1, y2, VISIBLE, VISIBLE) ; } } return ; } /* Conversion of assembly data, the sub-part of the Sequence class is: */ /* */ /* Assembly_tags Text Int Int Text // type, start, stop, comment */ /* */ /* I make the assumption that if the start or stop are not given then we do */ /* _NOT_ map the feature, it _must_ have these at least to be mapped. This */ /* is a change in behaviour from the old code. */ /* */ static void convertAssemblyTags(SmapConvInfo *conv_info) { KEY key = conv_info->key ; SMapKeyInfo *info = conv_info->info ; OBJ obj = conv_info->obj ; Array units = conv_info->units ; int pos1, pos2 ; int y1, y2 ; SMapStatus status ; if (bsFlatten (obj, 4, units)) { int j ; for (j = 0 ; j < arrayMax(units)/4 ; ++j) { int length; char *string; char *type_text, *text_text; pos1 = arr(units, 4*j+1, BSunit).i; pos2 = arr(units, 4*j+2, BSunit).i; /* At this point we could report errors if required. */ if (pos1 == 0 || pos2 == 0) continue ; status = sMapMap(info, pos1, pos2, &y1, &y2, NULL, NULL) ; if (SMAP_STATUS_OVERLAP(status)) { SEG *seg ; seg = arrayp(conv_info->segs_array, arrayMax(conv_info->segs_array), SEG); seg->parent = seg->source = key; seg->key = _Assembly_tags; /* This seems incorrect, some of the assembly_tag information */ /* seems to be strand specific, _BUT_ the code has not in */ /* the past paid any attention to this. Thus we cannot do our */ /* usual SMAP2SEG() conversion but must instead just pass the */ /* coords through. */ #ifdef ED_G_NEVER_INCLUDE_THIS_CODE SMAP2SEG(seg, y1, y2, ASSEMBLY_TAG, ASSEMBLY_TAG) ; #endif /* ED_G_NEVER_INCLUDE_THIS_CODE */ seg->x1 = (y1 - 1) ; seg->x2 = (y2 - 1) ; seg->type = ASSEMBLY_TAG ; /* Get the text */ length = 0; type_text = arr(units, 4*j, BSunit).s; if (type_text) length += strlen(type_text); text_text = arr(units, 4*j+3, BSunit).s; if (text_text) length += strlen(text_text); seg->data.s = string = halloc(length+3, conv_info->segs_handle); string[0] = '\0'; if (type_text) strcpy(string, type_text) ; strcat(string, ": "); if (text_text) strcat(string, text_text); } } } return ; } /* Conversion of allele data, the sub-part of the Sequence class is: */ /* */ /* Allele ?Allele XREF Sequence UNIQUE Int UNIQUE Int UNIQUE Text */ /* // start, stop, replacement sequence */ /* */ /* I make the assumption that if the start or stop are not given then we do */ /* _NOT_ map the allele, it _must_ have these at least to be mapped. As with */ /* old code we allow the Text to be NULL. */ /* */ /* Note that there is no Method for alleles currently so no way to exclude */ /* them, this would require alleles to be S_Child mapped really. */ /* */ static void convertAllele(SmapConvInfo *conv_info) { KEY key = conv_info->key ; SMapKeyInfo *info = conv_info->info ; OBJ obj = conv_info->obj ; Array units = conv_info->units ; if (bsFlatten (obj, 4, units)) { int i ; for (i = 0 ; i < arrayMax (units) ; i += 4) { BSunit *u = arrayp(units, i, BSunit) ; SEG *seg ; char *string ; int y1, y2 ; SMapStatus status ; /* If no postion specified or position won't map then continue. */ /* It's at this point we should offer option of reporting the */ /* error.....speak to Simon.... */ if (u[1].i == 0 || u[2].i == 0) continue ; status = sMapMap(info, u[1].i, u[2].i, &y1, &y2, NULL, NULL) ; if (status & SMAP_STATUS_NO_OVERLAP) continue; seg = arrayp(conv_info->segs_array, arrayMax(conv_info->segs_array), SEG) ; seg->source = key ; seg->key = u[0].k ; seg->parent = 0 ; seg->data.i = 0 ; if (y1 || y2) seg->parent = seg->key ; /* flag to draw it */ else seg->parent = key ; /* couple with parent */ SMAP2SEG(seg, y1, y2, ALLELE, ALLELE_UP) ; if ((string = u[3].s)) seg->data.s = strnew(string, conv_info->segs_handle) ; else seg->data.s = NULL ; } } return ; } static void convertCloneEnd(SmapConvInfo *conv_info, KEY clone_tag) { KEY key = conv_info->key ; SMapKeyInfo *info = conv_info->info ; OBJ obj = conv_info->obj ; Array units = conv_info->units ; if (bsFlatten(obj, 2, units)) { int i ; int y1, y2 ; for (i = 0 ; i < arrayMax(units) ; i += 2) { BSunit *u = arrayp(units, i, BSunit) ; SMapStatus status ; status = sMapMap(info, u[1].i, u[1].i, &y1, &y2, NULL, NULL) ; if (SMAP_STATUS_OVERLAP(status)) { SEG *seg ; KEY seqKey ; seg = arrayp(conv_info->segs_array, arrayMax(conv_info->segs_array), SEG) ; seg->source = key ; seg->key = u[0].k ; /* Clone */ SMAP2SEG(seg, y1, y2, CLONE_END, CLONE_END) ; seg->data.k = clone_tag ; lexaddkey (name(seg->key), &seqKey, _VSequence) ; seg->parent = seqKey ; } } } return ; } /* Conversion of oligo data, the sub-part of the Sequence class is: */ /* */ /* Oligo ?Oligo XREF In_sequence Int UNIQUE Int */ /* // for OSP and human mapping mostly */ /* I make the assumption that if the start or stop are not given then we do */ /* _NOT_ map the oligo, it _must_ have these at least to be mapped. */ /* */ static void convertOligo(SmapConvInfo *conv_info) { static KEY _Tm = KEY_UNDEFINED, _Temporary = KEY_UNDEFINED ; KEY key = conv_info->key ; SMapKeyInfo *info = conv_info->info ; OBJ obj = conv_info->obj ; Array units = conv_info->units ; if (_Tm == KEY_UNDEFINED) { _Tm = str2tag ("Tm") ; _Temporary = str2tag ("Temporary") ; } if (bsFlatten (obj, 3, units)) { int i ; for (i = 0 ; i < arrayMax (units) ; i += 3) { BSunit *u = arrayp(units, i, BSunit) ; int y1, y2 ; SMapStatus status ; /* The old code used to call fMapOspPositionOligo() */ /* I'm not sure we really want fMapOspPositionOligo(), it actually */ /* alters the database...or tries to, to insert the oligo position.*/ if (u[1].i == 0 || u[2].i == 0) continue ; status = sMapMap(info, u[1].i, u[2].i, &y1, &y2, NULL, NULL) ; if (SMAP_STATUS_OVERLAP(status)) { SEG *seg ; OBJ obj1 ; seg = arrayp(conv_info->segs_array, arrayMax(conv_info->segs_array), SEG) ; seg->source = key ; seg->key = u[0].k ; seg->parent = seg->key ; seg->data.i = 0 ; SMAP2SEG(seg, y1, y2, OLIGO, OLIGO_UP) ; if ((obj1 = bsCreate(seg->key))) { float xf ; int xi ; if (bsGetData(obj1, _Tm, _Float, &xf)) { xi = 10 * xf ; seg->data.i = (xi & 0x3FF) << 16 ; } if (bsGetData(obj1, _Score, _Float, &xf)) { xi = xf ; seg->data.i |= (xi & 0xfff) ; } if (!bsFindTag(obj1, _Temporary)) seg->data.i |= 0x4000 ; /* old */ bsDestroy (obj1) ; } } } } return ; } static void convertConfirmedIntrons(SmapConvInfo *conv_info) { static KEY _EST = KEY_UNDEFINED, _cDNA, _Homology, _UTR, _False ; KEY key = conv_info->key ; SMapKeyInfo *info = conv_info->info ; OBJ obj = conv_info->obj ; Array units = conv_info->units ; SEG *seg ; if (_EST == KEY_UNDEFINED) { _EST = str2tag("EST") ; _cDNA = str2tag("cDNA") ; _Homology = str2tag("Homology") ; _UTR = str2tag("UTR") ; _False = str2tag("False"); } /* Start with the current sequence seg. */ seg = arrayp(conv_info->segs_array, conv_info->parent_seg_index, SEG) ; if (bsFlatten (obj, 3, units)) { int i ; for (i = 0 ; i < arrayMax(units) ; i += 3) { BSunit *u ; SEQINFO *sinf ; int y1, y2 ; u = arrayp(units, i, BSunit) ; if (!u[0].i || !u[1].i) /* No coords, can't do anything. */ continue ; /* I think this is meant to deal with when a confirmed intron has */ /* been duplicated for some reason such as recording different */ /* methods of confirmation ????? So only create a new seg for new */ /* coords... */ if (!i || u[-3].i != u[0].i || u[-2].i != u[1].i) { SMapStatus status ; status = sMapMap(info, u[0].i, u[1].i, &y1, &y2, NULL, NULL) ; if (SMAP_STATUS_OVERLAP(status)) { seg = arrayp(conv_info->segs_array, arrayMax(conv_info->segs_array), SEG) ; seg->source = key ; seg->key = 0 ; seg->data.i = 0 ; SMAP2SEG(seg, y1, y2, INTRON, INTRON_UP) ; } } /* _IF_ we allocated a new seg then allocate a new seqinfo. */ if (seg->data.i == 0) seg->data.i = arrayMax(conv_info->seqinfo) ; sinf = arrayp (conv_info->seqinfo, seg->data.i, SEQINFO) ; if (u[2].k == _EST) sinf->flags |= SEQ_CONFIRM_EST ; else if (u[2].k == _cDNA) sinf->flags |= SEQ_CONFIRM_CDNA ; else if (u[2].k == _Homology) sinf->flags |= SEQ_CONFIRM_HOMOL ; else if (u[2].k == _UTR) sinf->flags |= SEQ_CONFIRM_UTR ; else if (u[2].k == _False) sinf->flags |= SEQ_CONFIRM_FALSE ; else sinf->flags |= SEQ_CONFIRM_UNKNOWN ; } } return ; } static void convertEMBLFeature(SmapConvInfo *conv_info) { SMapKeyInfo *info = conv_info->info ; OBJ obj = conv_info->obj ; Array units = conv_info->units ; if (bsFlatten(obj, 4, units)) { int i ; for (i = 0 ; i < arrayMax (units) ; i += 4) { BSunit *u = arrayp(units, i, BSunit) ; int y1, y2 ; SMapStatus status ; status = sMapMap(info, u[1].i, u[2].i, &y1, &y2, NULL, NULL) ; if (SMAP_STATUS_OVERLAP(status)) { char *string ; KEY key ; SEG *seg ; seg = arrayp(conv_info->segs_array, arrayMax(conv_info->segs_array), SEG) ; seg->source = conv_info->key ; seg->parent = arrayMax(conv_info->segs_array) ; seg->key = u[0].k ; SMAP2SEG(seg, y1, y2, EMBL_FEATURE, EMBL_FEATURE_UP) ; /* 4'th column: can't distinguish KEY from Text securely assume if iskey() that it is a key, e.g. ?Text */ if (iskey(key = u[3].k)) seg->data.s = strnew (name(key), conv_info->segs_handle) ; else if ((string = u[3].s)) seg->data.s = strnew(string, conv_info->segs_handle) ; else seg->data.s = NULL ; } } } return ; } static void convertSplices(SmapConvInfo *conv_info) { static KEY _Predicted_5 = KEY_UNDEFINED, _Predicted_3 ; KEY key = conv_info->key ; SMapKeyInfo *info = conv_info->info ; OBJ obj = conv_info->obj ; Array units = conv_info->units ; if (_Predicted_5 == KEY_UNDEFINED) { _Predicted_5 = str2tag("Predicted_5") ; _Predicted_3 = str2tag("Predicted_3") ; } if (bsFlatten(obj, 5, units)) { int i ; for (i = 0 ; i < arrayMax(units) ; i += 5) { BSunit *u = arrayp(units, i, BSunit) ; int y1, y2 ; SMapStatus status ; if (u[0].k != _Predicted_5 && u[0].k != _Predicted_3) continue ; if (class(u[1].k) != _VMethod || !u[2].i || !u[3].i) continue ; status = sMapMap(info, u[2].i, u[3].i, &y1, &y2, NULL, NULL) ; if (SMAP_STATUS_OVERLAP(status)) { SEG *seg ; seg = arrayp(conv_info->segs_array, arrayMax(conv_info->segs_array), SEG) ; seg->source = key ; seg->key = u[1].k ; if (u[0].k == _Predicted_5) SMAP2SEG(seg, y1, y2, SPLICE5, SPLICE5_UP) ; else SMAP2SEG(seg, y1, y2, SPLICE3, SPLICE3_UP) ; seg->data.f = u[4].f ; } } } return ; } /* This encapsulates all the data from Acembly */ /* NOTE that this code is currently untested, and will probably need some ajustment of models. I will liase with Jean */ static void convertMieg(SmapConvInfo *conv_info) { KEY key = conv_info->key ; SMapKeyInfo *info = conv_info->info ; OBJ obj = conv_info->obj ; Array units = conv_info->units ; int i; SEG *seg; KEY M_TRANSCRIPT = defaultSubsequenceKey ("TRANSCRIPT", DARKGRAY, 2.1, FALSE) ; static KEY _Transcribed_gene = KEY_UNDEFINED ; if (_Transcribed_gene == KEY_UNDEFINED) { _Transcribed_gene = str2tag("Transcribed_gene") ; } if (bsFindTag (obj, _Primer) && bsFlatten (obj, 1, units)) for (i = 0 ; i < arrayMax(units) ; ++i) { seg = arrayp (conv_info->segs_array, arrayMax(conv_info->segs_array), SEG); seg->parent = seg->source = key; seg->key = arr(units,i, BSunit).k ; seg->type = PRIMER ; } if (bsFindTag (obj, _SCF_File)) { seg = arrayp (conv_info->segs_array, arrayMax(conv_info->segs_array), SEG); seg->parent = seg->source = key; seg->type = VIRTUAL_SUB_SEQUENCE_TAG ; /* TODO if (!isDown) seg->type = VIRTUAL_SUB_SEQUENCE_TAG_UP ; */ } if (bsFindTag (obj, _Transcribed_gene) && bsFlatten (obj, 3, units)) { for (i = 0 ; i < arrayMax(units) ; i += 3) { int y1, y2; KEYSET tmp; BSunit *u = arrp(units,i,BSunit) ; SMapStatus status ; if (!u[0].k || !u[1].i || !u[2].i) continue ; status = sMapMap(info, u[1].i, u[2].i, &y1, &y2, NULL, NULL) ; if (status & SMAP_STATUS_NO_OVERLAP) continue; y1--; y2--; if (y2 < 0 || y1 >= conv_info->length) continue; seg = arrayp (conv_info->segs_array, arrayMax(conv_info->segs_array), SEG); seg->key = seg->parent = seg->source = u[0].k ; seg->data.i = 0 ; seg->type = y1 > y2 ? TRANSCRIPT : TRANSCRIPT_UP; tmp = queryKey (seg->key, ">Annotations Fully_edited") ; if (keySetMax (tmp)) seg->data.i = 0x1 ; keySetDestroy (tmp) ; tmp = queryKey(seg->key, "Transpliced_to = SL1") ; if (keySetMax (tmp)) seg->data.i |= 2 ; keySetDestroy (tmp) ; tmp = queryKey(seg->key, "Transpliced_to = SL2") ; if (keySetMax (tmp)) seg->data.i |= 4 ; keySetDestroy (tmp) ; if (seg->data.i >= 6) seg->data.i ^= 0x0F ; /* leave bit 1, zero bit 2 and 4 , set bit 8 */ } } /* transcript pieces, mieg */ if (bsFindTag (obj, str2tag("Splicing")) && bsFlatten (obj, 4, units)) { seg = arrayp (conv_info->segs_array, arrayMax(conv_info->segs_array), SEG); seg->key = M_TRANSCRIPT ; /* TODO seg->type = (isDown) ? TRANSCRIPT : TRANSCRIPT_UP ; */ for (i = 0 ; i < arrayMax(units) ; i += 4) { int y1, y2; BSunit *u = arrayp(units, i, BSunit); char *cp ; SMapStatus status ; if (arrayMax(units) < i + 3) continue ; status = sMapMap(info, u[0].i, u[1].i, &y1, &y2, NULL, NULL) ; if (status & SMAP_STATUS_NO_OVERLAP) continue; seg = arrayp (conv_info->segs_array, arrayMax(conv_info->segs_array), SEG); seg->key = u[2].k ; seg->parent = seg->source = key; /* TODO seg->type = (isDown) ? TRANSCRIPT : TRANSCRIPT_UP ; */ cp = arr(units, i + 3, BSunit).s ; if (cp && *cp && (seg->key == str2tag("Intron") || seg->key == str2tag("Alternative_Intron"))) if (!lexword2key(cp, &(seg->data.k), 0)) lexaddkey ("Other", &seg->data.k,0) ; } } } /* selective for CALCULATED segs */ static void addOldSegs (Array segs, Array oldSegs, MethodCache mcache) { SEG *seg, *oldMaster, *newMaster ; int i, j, length, offset ; METHOD *meth; oldMaster = arrp(oldSegs,0,SEG) ; newMaster = arrp(segs,0,SEG) ; if (newMaster->key != oldMaster->key) return ; length = newMaster->x2 - newMaster->x1 + 1 ; offset = newMaster->x1 - oldMaster->x1 ; j = arrayMax(segs) ; for (i = 1 ; i < arrayMax(oldSegs) ; ++i) { seg = arrp(oldSegs, i, SEG) ; switch (seg->type) { case FEATURE: case FEATURE_UP: case ATG: case ATG_UP: case SPLICE5: case SPLICE5_UP: case SPLICE3: case SPLICE3_UP: if (class(seg->key) != _VMethod) messcrash ("Non-method key in addOldSegs") ; meth = methodCacheGet(mcache, seg->key, ""); if (meth && meth->flags & METHOD_CALCULATED) { seg->x1 -= offset ; seg->x2 -= offset ; if (seg->x1 >= 0 && seg->x2 < length) array(segs, j++, SEG) = *seg ; } break ; default: /* needed for picky compiler */ break ; } } return; } /* addOldSegs */ /* Need to look at original fMapFindCoding() to see what the code used to */ /* do..... */ /* This was the fMapFindCoding() routine in the old fmap code. */ /* */ static void convertCoding(SmapConvInfo *conv_info) { int i, j, j1 ; Array index = NULL ; SEG *seg ; KEY parent ; for (i = 1 ; i < arrayMax(conv_info->segs_array) ; ++i) { #ifdef ED_G_NEVER_INCLUDE_THIS_CODE /* In the worm this is a place of considerable delay in rendering large*/ /* contigs.... */ /* Probably all this should go in favour of allowing interrupting at */ /* a higher level. */ /* User can interrupt by pressing F4 */ if (messIsInterruptCalled()) { if (messQuery("Finding coding - do you really want to interrupt fmap ?")) { if (index != NULL) arrayDestroy(index) ; result = FALSE ; break ; } else messResetInterrupt() ; } #endif /* ED_G_NEVER_INCLUDE_THIS_CODE */ seg = arrp(conv_info->segs_array, i, SEG) ; if (seg->type == CDS && (parent = seg->parent) && getCDSPosition(conv_info->segs_array, conv_info->seqinfo, NULL, parent, NULL, &index, TRUE)) { j1 = 0 ; for (j = 0 ; j < arrayMax(index) ; ++j) if (j == arrayMax(index) - 1 || arr(index,j+1,int) > arr(index,j,int)+1) /* boundary */ { seg = arrayp (conv_info->segs_array, arrayMax(conv_info->segs_array), SEG) ; seg->key = _CDS ; seg->type = CODING ; seg->x1 = arr(index, j1, int) ; seg->x2 = arr(index, j, int) ; seg->parent = parent ; seg->data.i = j1 ; j1 = j+1 ; } arrayDestroy (index) ; index = NULL ; } else if (seg->type == CDS_UP && (parent = seg->parent) && getCDSPosition(conv_info->segs_array, conv_info->seqinfo, NULL, parent, NULL, &index, TRUE)) { j1 = 0 ; for (j = 0 ; j < arrayMax(index) ; ++j) if (j == arrayMax(index) - 1 || arr(index,j+1,int) < arr(index,j,int)-1) /* boundary */ { seg = arrayp (conv_info->segs_array, arrayMax(conv_info->segs_array), SEG) ; seg->key = _CDS ; seg->type = CODING_UP ; seg->x1 = arr(index,j,int) ; seg->x2 = arr(index,j1,int) ; seg->parent = parent ; seg->data.i = j1 ; j1 = j+1 ; } arrayDestroy (index) ; index = NULL ; } } return ; } /* I think the calls to find the coding are complete overkill, it seems like */ /* all we want to do is find the position of the cds not build some huge */ /* index of positions by going right through the segs, seems completely */ /* crazy.... */ static void removeDuplicateIntrons(SmapConvInfo *conv_info) #ifdef ED_G_NEVER_INCLUDE_THIS_CODE /* Will be a replacement for fmapfindcoding() */ BOOL fMapFindCoding (FeatureMap look) #endif /* ED_G_NEVER_INCLUDE_THIS_CODE */ { int i, j, j1 ; Array index = NULL ; SEG *seg ; for (i = 1 ; i < arrayMax(conv_info->segs_array) ; ++i) { seg = arrp(conv_info->segs_array, i, SEG) ; /* Why do we have duplicates in the first place ???????? */ /* remove duplicate INTRON segs */ if (seg->type == INTRON || seg->type == INTRON_UP) { SEG *seg2 = seg ; int flags = 0 ; BOOL existsParent = FALSE ; SEQINFO *sinf ; while (i < arrayMax(conv_info->segs_array)-1 && seg->type == seg2->type && seg->x1 == seg2->x1 && seg->x2 == seg2->x2) { sinf = arrp(conv_info->seqinfo, seg->data.i, SEQINFO) ; flags |= sinf->flags ; if (seg->parent) existsParent = TRUE ; ++i ; ++seg ; } for (--i, --seg ; seg2 <= seg ; ++seg2) if (seg2->parent || !existsParent) { sinf = arrp(conv_info->seqinfo, seg2->data.i, SEQINFO) ; if (sinf->flags != flags) { sinf = arrayp(conv_info->seqinfo, arrayMax(conv_info->seqinfo), SEQINFO) ; *sinf = arr(conv_info->seqinfo, seg2->data.i, SEQINFO) ; sinf->flags = flags ; seg2->data.i = arrayMax(conv_info->seqinfo)-1 ; } existsParent = TRUE ; } else { *seg2 = array(conv_info->segs_array,arrayMax(conv_info->segs_array)-1,SEG) ; --arrayMax(conv_info->segs_array) ; } } } return ; } static void removeSelfHomol(Array segs) { int i, max ; SEG *seg, *cds, *cdsMin, *cdsMax ; char *cp ; segFindBounds(segs, CDS, &i, &max) ; cdsMin = arrp(segs, i, SEG) ; cdsMax = arrp(segs, max-1, SEG) + 1 ; /* -1+1 for arrayCheck */ segFindBounds(segs, HOMOL, &i, &max) ; for ( ; i < max && i < arrayMax(segs) ; ++i) /* arrayMax(segs) can change */ { seg = arrp(segs, i, SEG) ; for (cp = name(seg->key) ; *cp && *cp != ':' ; ++cp) ; if (*cp) ++cp ; else cp = name(seg->key) ; for (cds = cdsMin ; cds < cdsMax && cds->x1 < seg->x2 ; ++cds) if (cds->x1 <= seg->x1 && cds->x2 >= seg->x2 && !strcmp (cp, name(cds->parent))) { *seg = array(segs,arrayMax(segs)-1,SEG) ; --arrayMax(segs) ; break ; } } segFindBounds(segs, CDS_UP, &i, &max) ; cdsMin = arrp(segs, i, SEG) ; cdsMax = arrp(segs, max-1, SEG) + 1 ; /* -1+1 for arrayCheck */ segFindBounds(segs, HOMOL_UP, &i, &max) ; for ( ; i < max && i < arrayMax(segs) ; ++i) /* arrayMax(segs) can change */ { seg = arrp(segs, i, SEG) ; for (cp = name(seg->key) ; *cp && *cp != ':' ; ++cp) ; if (*cp) ++cp ; else cp = name(seg->key) ; for (cds = cdsMin ; cds < cdsMax && cds->x1 < seg->x2 ; ++cds) if (cds->x1 <= seg->x1 && cds->x2 >= seg->x2 && !strcmp (cp, name(cds->parent))) { *seg = array(segs,arrayMax(segs)-1,SEG) ; --arrayMax(segs) ; break ; } } return; } /* Set up a default method for a feature, this code should gradually * shrink to nothing. */ static void setDefaultMethod(OBJ obj, SEQINFO *sinf) { static KEY M_Transposon = KEY_UNDEFINED, M_SPLICED_cDNA, M_RNA, M_TRANSCRIPT, M_Pseudogene, M_Coding ; if (M_Transposon == KEY_UNDEFINED) { M_Coding = defaultSubsequenceKey ("Coding", BLUE, 2.0, TRUE) ; M_RNA = defaultSubsequenceKey ("RNA", GREEN, 2.0, TRUE) ; M_Pseudogene = defaultSubsequenceKey ("Pseudogene", DARKGRAY, 2.0, TRUE) ; M_Transposon = defaultSubsequenceKey ("Transposon", DARKGRAY, 2.05, FALSE) ; M_TRANSCRIPT = defaultSubsequenceKey ("TRANSCRIPT", DARKGRAY, 2.1, FALSE) ; M_SPLICED_cDNA = defaultSubsequenceKey ("SPLICED_cDNA", DARKGRAY, 2.2, FALSE) ; } if (bsFindTag (obj, _Pseudogene)) sinf->method = M_Pseudogene ; else if (bsFindTag (obj, _Transposon)) sinf->method = M_Transposon ; else if (bsFindTag (obj, _tRNA) || bsFindTag (obj, _rRNA) || bsFindTag (obj, _snRNA)) sinf->method = M_RNA ; else if (bsFindTag (obj, _CDS)) sinf->method = M_Coding ; else sinf->method = KEY_UNDEFINED ; return ; } /* I think all we need here is to use the correct smap calls to get */ /* position information, I will insert them once I am sure I have the */ /* dumper working correctly...for now I've hacked this so it can be called */ /* without dna... */ /* */ /* */ /* This is the old fMapGetDNA() routine which I think is complete overkill */ /* here....for the moment I will leave it as it is... */ /* */ /* Worth noting that this routine does _not_ query the database but instead */ /* uses whatever is stored in the segs, corrolary of this is that user must */ /* do an fmap recalculate to see effects of any DB changes they make. */ static BOOL getCDSPosition(Array segs, Array seqinfo, Array dna_in, KEY parent, Array *cds, Array *index, BOOL cds_only) { int i, j=0, jCode = 0, cds1, cds2, tmp, dnaMax ; SEG *seg = NULL ; BOOL isUp = FALSE ; Array dna = NULL ; BOOL first_exon ; if (!iskey(parent)) return FALSE ; cds1 = cds2 = 0 ; /* used to check if CDS found. */ for (i = 1 ; i < arrayMax(segs) ; ++i) { seg = arrp(segs,i,SEG) ; if (seg->parent == parent) { switch (seg->type) { case CDS: case CDS_UP: if (cds_only) { cds1 = seg->x1 ; /* N.B. these are coords on displayed */ cds2 = seg->x2 ; /* DNA. */ } break ; case SEQUENCE: isUp = FALSE ; break ; case SEQUENCE_UP: isUp = TRUE ; break ; default: break ; } } } /* Couldn't find the required CDS so no translation done. */ if (cds1 == cds2 && cds_only) return FALSE ; if (dna_in) { dna = dna_in ; dnaMax = arrayMax(dna) ; } /* Must have dna to return cds dna... */ if (dna && cds) *cds = arrayCreate (1000, char) ; /* wild guess */ if (index) *index = arrayCreate (1000, int) ; for (i = 1, first_exon = TRUE ; i < arrayMax(segs) ; ++i) { seg = arrp(segs,i,SEG) ; if (seg->parent == parent) switch (seg->type) { /* pick cds */ case EXON: case EXON_UP: { if (first_exon) { /* May have to modify beginning of dna for translation because obj may */ /* have begun in a previous exon so we don't have the beginning, the */ /* Start_not_found allows us to correct the reading frame by setting */ /* a new start position. Default is start of obj (start_not_found = 1).*/ int new_start ; first_exon = FALSE ; new_start = arrp(seqinfo, seg->data.i, SEQINFO)->start_not_found ; if (new_start != 0) new_start-- ; cds1 += new_start ; j = seg->x1 + new_start ; } else j = seg->x1 ; for( ; j <= seg->x2 ; ++j) { if (cds_only && (j < cds1 || j > cds2)) continue ; if (cds) { if (j >= 0 && j < dnaMax) array(*cds,jCode,char) = arr(dna,j,char) ; else array(*cds,jCode,char) = 0 ; } if (index) array(*index,jCode,int) = j ; ++jCode ; } } break ; default: break ; } } if (isUp) { if (cds) reverseComplement(*cds) ; if (index) for (j = 0, i = jCode-1 ; j < i ; ++j, --i) { tmp = arr(*index,j,int) ; arr(*index,j,int) = arr(*index,i,int) ; arr(*index,i,int) = tmp ; } } return TRUE ; } /* Cribbed from fMapOrder, purpose is to obtain a set of sorted segs from which to * draw lines between homology blocks that should be joined up, but can be use * generally to order homols. */ static int homolOrder(void *a, void *b) { SEG *seg1 = (SEG*)a, *seg2 = (SEG*)b ; int diff ; int result = 0 ; diff = seg1->type - seg2->type ; if (diff > 0) return 1 ; else if (diff < 0) return -1 ; diff = seg1->x1 - seg2->x1 ; if (diff > 0) result = 1 ; else if (diff < 0) result = -1 ; else { diff = seg1->x2 - seg2->x2 ; if (diff > 0) result = 1 ; else if (diff < 0) result = -1 ; } return result ; } /* Creates segs of the type HOMOL_GAP/_UP which are lines to be * drawn in between HOMOL features. * N.B. we assume that the homol segs from homols_start to the end of the segs_array * have been sorted by seg coords so we can determine the coords of the lines * to go between the homol blocks directly from the segs_array. */ static void homolLines(Array segs_array, int homols_start, int homols_end) { int i ; for (i = homols_start ; i <= homols_end ; i++) { SEG *homol_line = arrayp(segs_array, arrayMax(segs_array), SEG) ; /* Must be first in case of relocation. */ SEG *curr_homol = arrayp(segs_array, i, SEG) ; SEG *next_homol = arrayp(segs_array, (i + 1), SEG) ; *(homol_line) = *(curr_homol) ; /* struct copy */ if (SEG_IS_DOWN(homol_line)) homol_line->type = HOMOL_GAP ; else homol_line->type = HOMOL_GAP_UP ; homol_line->x1 = curr_homol->x2 + 1 ; homol_line->x2 = next_homol->x1 - 1 ; } return ; } /* Code to add an EST object to the array of read pairs. * In the array of read pairs, each pair holds the keys of the EST objects and * also the seg coordinates of the ends of those objects nearest each other. From this * we can create a HOMOL_GAP seg to span the gap between the EST read pair (the seg. * has a flag set so that a dashed line is used). * * We use an associator to find out if the pair for any one read is already in the array * and return its index in the array if it is. The EST homol segs (homols_start to * homols_end) are position sorted so we can record the relevant top/bottom of the * EST object by whether it is 5' or 3' and record it + what the read pair is. * If the pair of this EST is already in the array, we just add this one to it, * if not we create a new array element with this EST's information. For the latter * we make a copy of the HOMOL seg for the EST object so that we get all the * parent etc. information. */ static void addReadPair(Array all_paired_reads, Associator key2readpair, KEY this_read, KEY paired_read, Array segs, int homols_start, int homols_end) { static KEY EST_5 = KEY_UNDEFINED, EST_3 ; int read_index ; KEY orientation ; ReadPos read = NULL ; ReadPair readpair = NULL ; if (EST_5 == KEY_UNDEFINED) { EST_5 = str2tag("EST_5") ; EST_3 = str2tag("EST_3") ; } if (bIndexFind(this_read, EST_5)) orientation = EST_5 ; else if (bIndexFind(this_read, EST_3)) orientation = EST_3 ; else orientation = KEY_UNDEFINED ; /* Complain ?? */ #ifdef ED_G_NEVER_INCLUDE_THIS_CODE { SEG *seg ; int i ; for (i = homols_start ; i <= homols_end ; i++) { seg = arrayp(segs, i, SEG) ; printf("addReadPair() Homol Seg %d: %s is at %d %d\n", i, fMapSegTypeName[seg->type], seg->x1, seg->x2) ; } } #endif /* ED_G_NEVER_INCLUDE_THIS_CODE */ if (orientation) { BOOL found ; /* n.b. has more than one use below. */ /* If we can't find this EST's read pair in the array, add a new array element. */ if (!(found = assFind(key2readpair, assVoid(paired_read), &read_index))) read_index = arrayMax(all_paired_reads) ; readpair = arrayp(all_paired_reads, read_index, ReadPairStruct) ; if (!found) memset(readpair, 0, sizeof(ReadPairStruct)) ; /* Needed for KEY_UNDEFINED tests. */ /* Add EST to read pair array according to orientation and add coord. info. */ if (orientation == EST_5 && readpair->five_prime.key == KEY_UNDEFINED) { found = TRUE ; readpair->five_prime.key = this_read ; if (SEG_IS_DOWN(arrayp(segs, homols_start, SEG))) { readpair->five_prime.pos = arrayp(segs, homols_end, SEG)->x2 ; readpair->homol_index = homols_end ; } else { readpair->five_prime.pos = arrayp(segs, homols_start, SEG)->x1 ; readpair->homol_index = homols_start ; } #ifdef ED_G_NEVER_INCLUDE_THIS_CODE printf("addReadPair() - 5' EST: at index %d, %s at %d\n", read_index, name(readpair->five_prime.key), readpair->five_prime.pos) ; #endif /* ED_G_NEVER_INCLUDE_THIS_CODE */ } else if (orientation == EST_3 && readpair->three_prime.key == KEY_UNDEFINED) { found = TRUE ; readpair->three_prime.key = this_read ; if (SEG_IS_DOWN(arrayp(segs, homols_start, SEG))) { readpair->three_prime.pos = arrayp(segs, homols_start, SEG)->x1 ; readpair->homol_index = homols_start ; } else { readpair->three_prime.pos = arrayp(segs, homols_end, SEG)->x2 ; readpair->homol_index = homols_end ; } #ifdef ED_G_NEVER_INCLUDE_THIS_CODE printf("addReadPair() - 3' EST: at index %d, %s at %d\n", read_index, name(readpair->three_prime.key), readpair->three_prime.pos) ; #endif /* ED_G_NEVER_INCLUDE_THIS_CODE */ } else { found = FALSE ; /* complain ? */ } /* Update the associator with the new ESTs -> array index. */ if (found) assInsert(key2readpair, assVoid(this_read), assVoid(read_index)) ; } return ; } /* Make homol line segs to join up each EST read pair recorded. The segs * have the dashed line flag set. */ static void makePairedReadSegs(Array segs, Array all_paired_reads) { SEG *seg ; ReadPair readpair ; int i ; for (i = 0 ; i < arrayMax(all_paired_reads) ; i++) { readpair = arrayp(all_paired_reads, i, ReadPairStruct) ; if (readpair->five_prime.key && readpair->three_prime.key) { seg = arrayp(segs, arrayMax(segs), SEG) ; *(seg) = *(arrayp(segs, readpair->homol_index, SEG)) ; /* struct copy */ if (SEG_IS_DOWN(seg)) { seg->type = HOMOL_GAP ; seg->x1 = readpair->five_prime.pos ; seg->x2 = readpair->three_prime.pos ; } else { seg->type = HOMOL_GAP_UP ; seg->x1 = readpair->three_prime.pos ; seg->x2 = readpair->five_prime.pos ; } #ifdef ED_G_NEVER_INCLUDE_THIS_CODE printf("makePairedReadSegs() - Seg from %s & %s : %s at %d %d\n", name(readpair->five_prime.key), name(readpair->three_prime.key), fMapSegTypeName[seg->type], seg->x1, seg->x2) ; #endif /* ED_G_NEVER_INCLUDE_THIS_CODE */ seg->flags |= SEGFLAG_LINE_DASHED ; } } return ; } /* Just a copy of fMapOrder, main seg sorting routine which sorts by type * and position */ static int segOrder(void *a, void *b) { SEG *seg1 = (SEG*)a, *seg2 = (SEG*)b ; int diff ; diff = seg1->type - seg2->type ; if (diff > 0) return 1 ; else if (diff < 0) return -1 ; if ((seg1->type | 0x1) == SPLICED_cDNA_UP) { if (!(seg1->source * seg2->source)) return seg1->source ? -1 : 1 ; diff = seg1->source - seg2->source ; if (diff) return diff ; diff = seg1->parent - seg2->parent ; if (diff) return diff ; { int x1, x2, y1, y2 ; x1 = seg1->data.i >> 14 & 0x3FFF ; x2 = seg1->data.i & 0x3FFF ; y1 = seg2->data.i >> 14 & 0x3FFF ; y2 = seg2->data.i & 0x3FFF ; if ((x1 - x2) * (y1 - y2) < 0) return x1 < x2 ? -1 : 1 ; } } diff = seg1->x1 - seg2->x1 ; if (diff > 0) return 1 ; else if (diff < 0) return -1 ; diff = seg1->x2 - seg2->x2 ; if (diff > 0) return 1 ; else if (diff < 0) return -1 ; return 0 ; } static BOOL segFindBounds(Array segs, SegType type, int *min, int *max) { for (*min = 1 ; *min < arrayMax(segs) ; ++*min) { if (arrp(segs, *min, SEG)->type == type) break ; } for (*max = *min ; *max < arrayMax(segs) ; ++*max) { if (arrp(segs, *max, SEG)->type != type) break ; } return (*min < arrayMax(segs)) ; } /* Use this routine to set up any segs that represent features that are * 1 base pair long, this is needed because in this case SMAP2SEG may assign * the incorrect SegType so we get the type from the parent which will be correct. * An example would be exons where it is legitimate for them to be 1 base long * where they are incompletely known. */ static void segSetPosType(SEG *parent_seg, SEG *seg, SMapStatus status, int y1, int y2, SegType down, SegType up) { SMAP2SEG(seg, y1, y2, down, up) ; if (y1 == y2) { if (SEG_IS_DOWN(parent_seg)) seg->type = down ; else seg->type = up ; } segSetClip(seg, status, down, up) ; return ; } /* Translate the smap clipping flags into seg clipping for use by drawing */ /* code. */ static void segSetClip(SEG *seg, SMapStatus status, SegType feature, SegType feature_up) { if (SMAP_STATUS_SET(status, SMAP_STATUS_CLIP)) { if (SMAP_STATUS_SET(status, SMAP_STATUS_CLIP)) { if (SMAP_STATUS_SET(status, SMAP_STATUS_X1_CLIP)) { if (seg->type == feature) seg->flags |= SEGFLAG_CLIPPED_TOP ; else seg->flags |= SEGFLAG_CLIPPED_BOTTOM ; } if (SMAP_STATUS_SET(status, SMAP_STATUS_X2_CLIP)) { if (seg->type == feature) seg->flags |= SEGFLAG_CLIPPED_BOTTOM ; else seg->flags |= SEGFLAG_CLIPPED_TOP ; } } } else seg->flags = SEGFLAG_UNSET ; return ; } /* */ /* Internal routines for setting up the keyset of methods, and hence, */ /* features that should be included in the smapconvert() */ /* */ /* This routine finds all the Method objects in the database and returns */ /* them as a keyset. */ /* */ /* The routine returns a keyset if there are any method objects and NULL if */ /* not. It is the callers responsibility to destroy the keyset when */ /* finished with. */ /* */ static KEYSET allMethods(void) { KEYSET result = NULL ; /* Get all the methods. */ result = query(NULL, "FIND method") ; /* A refinement here would be to check all the objects to see if they any */ /* tags defined in them, I assume that if they don't then they are useless */ /* but this might not be so. */ /* A more thorough approach would be to see if a method object was */ /* referenced by any sequence objects, if not then there would no point in */ /* showing it. */ return result ; } /* Adding features to the dictionary. */ static void addToSet (ACEIN command_in, DICT *dict) { char cutter, *cutset, *word ; if (aceInStep (command_in, '"')) cutset = "|\"" ; else cutset = "| \t" ; while ((word = aceInWordCut (command_in, cutset, &cutter))) { dictAdd (dict, word, 0) ; if (cutter != '|') break ; } return; } /* */ /* Get rid of look from these routines.... */ /* */ /* */ /* Some utilities written by Simon to dump out look data, especially the */ /* seg data...very illuminating.... */ /* */ void segDumpToStdout(Array segs) { ACEOUT dest = aceOutCreateToStdout(0); segDump(segs, dest); aceOutDestroy(dest); } static void segDump(Array segs, ACEOUT dest) { int i; for (i=0; i< arrayMax(segs); i++) { SEG * seg = arrp(segs, i, SEG); aceOutPrint(dest, "%s: %d-%d key=%s ", fMapSegTypeName[seg->type], seg->x1, seg->x2, name(seg->key)); if (seg->source) aceOutPrint(dest, "source=%s ", name(seg->source)); if (seg->parent) aceOutPrint(dest, "parent=%s ", name(seg->parent)); aceOutPrint(dest, "\n"); } } static void fMapLookDataDump(FeatureMap look, ACEOUT dest) { aceOutPrint(dest, "seqKey = %s\n", name(look->seqKey)); aceOutPrint(dest, "mapKey = %s\n", name(look->mapKey)); aceOutPrint(dest, "dnaKey = %s\n", name(look->dnaKey)); aceOutPrint(dest, "start=%d stop=%d min=%d max=%d length=%d\n", look->start, look->stop, look->min, look->max, look->length); aceOutPrint(dest, "fullLength=%d origin=%d zoneMin=%d zoneMax=%d\n", look->fullLength, look->origin, look->zoneMin, look->zoneMax); aceOutPrint(dest, "dnaStart=%d dnaWidth=%d dnaSkip=%d\n", look->dnaStart, look->dnaWidth, look->dnaSkip); } void fMapLookDump(FeatureMap look) { ACEOUT dest = aceOutCreateToStdout(0); fMapLookDataDump(look, dest); segDump(look->segs, dest); aceOutDestroy(dest); }