/* file: smap.c * Author: Richard Durbin (rd@sanger.ac.uk) * Copyright (c) J Thierry-Mieg and R Durbin, 1999 * ------------------------------------------------------------------- * Acedb is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * or see the on-line version at http://www.gnu.org/copyleft/gpl.txt * ------------------------------------------------------------------- * This file is part of the ACEDB genome database package, written by * Richard Durbin (Sanger Centre, UK) rd@sanger.ac.uk, and * Jean Thierry-Mieg (CRBM du CNRS, France) mieg@kaa.crbm.cnrs-mop.fr * * Description: * Exported functions: See smap.h * HISTORY: * Last edited: Feb 4 17:03 2004 (edgrif) * * Aug 26 20:20 (rd): added support for Source_exons * * Aug 23 21:15 (rd): added sMapDNA and changed mismatch semantics * * Aug 23 19:50 (rd): added bsIndexFind calls for efficiency * these are VERY IMPORTANT * * Aug 23 19:34 (rd): added sMapKeySet() * Created: Wed Jul 28 22:11:20 1999 (rd) * CVS info: $Id: smap.c,v 1.50 2004/02/05 11:01:28 edgrif Exp $ *------------------------------------------------------------------- */ #include #include #include #include #include #include #include #include #include #include /* The SMap package supports building a sequence coordinate system out of multiple objects according to the following model: in parent object ---------------- S_Child XREF UNIQUE Int UNIQUE Int #SMap_info // Ints are start, end in parent #SMap_info Align Int UNIQUE Int UNIQUE Int // for each ungapped block, parent_start child_start [length] // starts are starts with respect to child (relevant if sequences // are opposite orientations // if no length then until next block (so no double gap) // if no Align assume ungapped starting at 1 in child AlignDNAPep Int UNIQUE Int UNIQUE Int AlignPepDNA Int UNIQUE Int UNIQUE Int // These two tags are analogous to Align, but scale length // for the case of a dna alignment to peptide or vice-versa. Mismatch Int UNIQUE Int // start end of mismatch region in child coords // mismatches from this sequence or children are ignored // if no end then only the specified base // if no Ints then mismatches OK anywhere in this sequence Contents Homols_only Features_only No_DNA in child object --------------- SMap S_Parent UNIQUE UNIQUE // must be just _one_ parent. UNIQUE length ; } /******************************************************************/ SMapKeyInfo* sMapKeyInfo (SMap* smap, KEY key) { SMapKeyInfo *info ; if (smap->lastKeyInfo && smap->lastKeyInfo->key == key) return smap->lastKeyInfo ; if (!assFind (smap->key2index, assVoid(key), &info)) return 0 ; smap->lastKeyInfo = info ; #ifdef ACEDB4 if (smap->lastKeyUnspliced) arrayDestroy(smap->lastKeyUnspliced); #endif return info ; } /******************************************************************/ /* Return parent, or NULL of root of tree. Note that this is parent wrt this smap: it may be a child if the mapping was traversed backwards! */ KEY sMapParent(SMapKeyInfo* info) { return info->parent; } /* Return Map array for this key */ Array sMapMapArray(SMapKeyInfo* info) { return info->map; } /* fundamental coordinate conversion routine * x1, x2, y1, y2 as sMapConvert() * nx1, nx2 are the clipped values that actually map to y1, y2 - for code simplicity * these are always set, even if NO_OVERLAP status is returned (in which case they * are x1, x2) * * return values are all a mess currently, if sMapMapEx() fails then y1 & y2 will be * RUBBISH and yet they are still returned...sigh...what is going on here... * * return value is status as above */ SMapStatus sMapMap(SMapKeyInfo* info, int x1, int x2, int *y1, int *y2, int *nx1, int *nx2) { struct SMapMapReturn ret; SMapStatus status ; status = sMapMapEx(info->smap, info->length, info->map, x1, x2, &ret); if (y1) *y1 = ret.y1; if (y2) *y2 = ret.y2; if (nx1) *nx1 = ret.nx1; if (nx2) *nx2 = ret.nx2; return status ; } /* and its equivalent for mapping the other way */ /* as for sMapMap(), but maps in the reverse direction */ SMapStatus sMapInverseMap (SMapKeyInfo* info, int x1, int x2, int *y1, int *y2, int *nx1, int *nx2) { struct SMapMapReturn ret; SMapStatus status ; if (!info->inverseMap) { info->inverseMap = arrayHandleCopy (info->map, info->smap->handle) ; sMapInvertMap (info->inverseMap) ; } status = sMapMapEx(info->smap, info->length, info->inverseMap, x1, x2, &ret); if (y1) *y1 = ret.y1; if (y2) *y2 = ret.y2; if (nx1) *nx1 = ret.nx1; if (nx2) *nx2 = ret.nx2; /* SMAP_STATUS_OUTSIDE_AREA may get set in sMapMapEx. This is bogus for inverse maps since we are mapping into object coordinates and not Map coordinates. Just reset that bit here. */ return status & ~SMAP_STATUS_OUTSIDE_AREA; } /* Is the mapping from this key to the root on the reverse strand. * True means that for mapping from x1,x2 to y1,y2 gives y1>y2 when x1strand == strand; } /* NOTE that we require the object length to be supplied because objects may be clipped * during mapping, if this happens then we _cannot_ place any child features correctly within * them when those features are _reversed_. * This is because for forward features we know the object starts at 1 and so can always recover * this information, but for reversed features we cannot now reliably recover the length * as it is now clipped, i.e. we must be told it. */ static SMapStatus sMapMapEx(SMap *smap, int obj_length, Array map, int x1, int x2, struct SMapMapReturn *ret) { int z1, z2 ; SMapMap *m1, *m2 ; BOOL isReverse = FALSE ; SMapStatus status = SMAP_STATUS_PERFECT_MAP ; /* default is perfect map without gaps */ /* Passing a coord of zero gives the maximum extent of this object */ if (x1 == 0) x1 = obj_length ; if (x2 == 0) x2 = obj_length ; /* set default nx1, nx2 before possible reverse * this means they are set even if NO_OVERLAP, but that seems OK */ if (ret) { ret->nx1 = x1 ; ret->nx2 = x2 ; } /* Is this really ok ? What about if x1 == x2 ?? */ if (x1 > x2) { int t = x1 ; x1 = x2 ; x2 = t ; isReverse = TRUE ; } /* Are coords completely outside the supplied map ? */ if (x1 > arrp(map, arrayMax(map)-1, SMapMap)->s2) /* off end of last mapped segment */ return SMAP_STATUS_X2_NO_OVERLAP_EXTERNAL ; else if (x2 < arrp(map, 0, SMapMap)->s1) /* off start of first mapped segment */ return SMAP_STATUS_X1_NO_OVERLAP_EXTERNAL ; /* find map segment for x1 by binary search */ /* could cache in KeyInfo for rapid return on dense sorted features? */ m1 = lowestUpperSeg (map, x1) ; if (x1 < m1->s1) /* clip */ { z1 = m1->r1 ; if (m1 == arrp(map, 0, SMapMap)) status |= SMAP_STATUS_X1_EXTERNAL_CLIP ; else status |= SMAP_STATUS_X1_INTERNAL_CLIP ; if (ret) { if (isReverse) ret->nx2 = m1->s1; else ret->nx1 = m1->s1 ; } } else if (m1->r2 > m1->r1) z1 = m1->r1 + (x1 - m1->s1) ; else z1 = m1->r1 - (x1 - m1->s1) ; /* x2 likewise */ m2 = highestLowerSeg (map, x2) ; /* completely contained in internal gap */ if (m2 < m1) return SMAP_STATUS_NO_OVERLAP_INTERNAL ; if (m1 < m2) status |= SMAP_STATUS_INTERNAL_GAPS ; if (x2 > m2->s2) /* clip */ { z2 = m2->r2 ; if (m2 == arrp(map, arrayMax(map)-1, SMapMap)) status |= SMAP_STATUS_X2_EXTERNAL_CLIP ; else status |= SMAP_STATUS_X2_INTERNAL_CLIP ; if (ret) { if (isReverse) ret->nx1 = m2->s2 ; else ret->nx2 = m2->s2 ; } } else if (m2->r2 > m2->r1) z2 = m2->r2 - (m2->s2 - x2) ; else z2 = m2->r2 + (m2->s2 - x2) ; /* If an smap was also supplied then check whether coords are also completely outside * requested area, note that its possible for coords to be successfully mapped but * still be _outside_ the requested area. */ if (smap) { if (((z1 <= z2) && (z2 < smap->area1)) || ((z1 <= z2) && (z1 > smap->area2)) || ((z1 > z2) && (z1 < smap->area1)) || ((z1 > z2) && (z2 > smap->area2))) status |= SMAP_STATUS_OUTSIDE_AREA; } /* done, now fill in results and return */ if (ret) { if (isReverse) { ret->y1 = z2 ; ret->y2 = z1 ; } else { ret->y1 = z1 ; ret->y2 = z2 ; } } return status; } /* Return human readable error as static string. * Note that at most one bit should be set in status, if more * than one is set in your result, call this multiple times and * cat the return values to taste. May return NULL for unused bits. * * Used to provide string versions of return codes for a text based/command line interface to * smap (see "gif smap" command in gifcommand.c). */ char *sMapErrorString(SMapStatus status) { char *result ; switch (status) { case SMAP_STATUS_PERFECT_MAP: result = "PERFECT_MAP"; break ; case SMAP_STATUS_ERROR: result = "ERROR"; break ; case SMAP_STATUS_INTERNAL_GAPS: result = "INTERNAL_GAPS"; break ; case SMAP_STATUS_OUTSIDE_AREA: result = "OUTSIDE_AREA"; break ; case SMAP_STATUS_X1_NO_OVERLAP_EXTERNAL: result = "X1_NO_OVERLAP_EXTERNAL"; break ; case SMAP_STATUS_X2_NO_OVERLAP_EXTERNAL: result = "X2_NO_OVERLAP_EXTERNAL"; break ; case SMAP_STATUS_NO_OVERLAP_INTERNAL: result = "NO_OVERLAP_INTERNAL"; break ; case SMAP_STATUS_X1_EXTERNAL_CLIP: result = "X1_EXTERNAL_CLIP"; break ; case SMAP_STATUS_X1_INTERNAL_CLIP: result = "X1_INTERNAL_CLIP"; break ; case SMAP_STATUS_X2_EXTERNAL_CLIP: result = "X2_EXTERNAL_CLIP"; break ; case SMAP_STATUS_X2_INTERNAL_CLIP: result = "X2_INTERNAL_CLIP"; break ; default: result = NULL; break ; } return result ; } /************************************************************/ /* standard coordinate conversion routine for a full SMap object */ /* Converts interval x1..x2 in key's coordinate system to y1..y2 in smap's. Returns TRUE if succeeds, FALSE if e.g. key not known in smap, or both x1, x2 out of range (on the same side), or both fall in the same gap in the map. If part of the interval is out of range then it clips. */ BOOL sMapConvert (SMap* smap, KEY key, int x1, int x2, int *y1, int *y2) { BOOL result = FALSE ; SMapKeyInfo *info; struct SMapMapReturn ret; if ((info = sMapKeyInfo (smap, key)) && (sMapMapEx(info->smap, info->length, info->map, x1, x2, &ret) & SMAP_STATUS_NO_OVERLAP) == 0) { if (y1) *y1 = ret.y1 ; if (y2) *y2 = ret.y2 ; result = TRUE ; } return result ; } #ifdef ACEDB4 /* As for sMapMap, but for objects with source exons, the conversion is from the unspliced co-ordinate system. Converts interval x1..x2 in key's coordinate system to y1..y2 in smap's. Returns SMapStatus as above, note well that it will clip coords so you may end up with the returned y1,y2 being identical. Beware that this call is _more costly than sMapMap(). */ /* Map into unspliced co-ordinates even if an object has source exons. Currently first call on each object costs a bsCreate/Destroy and two arrayCreate/Destroys. We could make this cheaper at the cost of holding more data in every node. */ SMapStatus sMapUnsplicedMap(SMapKeyInfo* info, int x1, int x2, int *y1, int *y2, int *nx1, int *nx2) { SMap *smap = info->smap; SMapStatus result = SMAP_STATUS_ERROR ; struct SMapMapReturn ret; Array exonMap, unsplicedMap = smap->lastKeyUnspliced ; if (!unsplicedMap) { /* If no source exons we just do ordinary map */ if (!(exonMap = getExonMap(info->key))) unsplicedMap = info->map; else { /* spliced -> unspliced */ sMapInvertMap(exonMap); /* root -> spliced -> unspliced */ unsplicedMap = smap->lastKeyUnspliced = sMapComposeMaps(smap, exonMap, info->length, info->map, smap->handle); arrayDestroy(exonMap); } } result = sMapMapEx(info->smap, info->length, unsplicedMap, x1, x2, &ret) ; if (y1) *y1 = ret.y1; if (y2) *y2 = ret.y2; if (nx1) *nx1 = ret.nx1; if (nx2) *nx2 = ret.nx2; return result ; } #endif /* ACEDB4 */ /*************************************************************/ static void sMapKeyAdd (KEYSET kset, SMapKeyInfo *info) { int i ; keySet(kset, keySetMax(kset)) = info->key ; for (i = 0 ; i < arrayMax (info->children) ; ++i) { sMapKeyAdd (kset, arr(info->children, i, SMapKeyInfo*)) ; } return ; } /* this gives all keys that map onto this smap */ KEYSET sMapKeys (SMap* smap, STORE_HANDLE handle) { KEYSET kset = keySetHandleCreate (handle) ; sMapKeyAdd (kset, smap->root) ; return kset ; } /*************************************************************/ /* Given an smap, will return the dna sequence for that smap. * If the dna can be found it is returned as an array allocated on handle, * otherwise NULL is returned. * NOTE that there is no requirement for the dna to be complete, there may * be holes in the array which will be exported as a series of "-" chars. */ /* sMapDNAErrorCallback may be NULL, which is equivalent to a callBack function which always returns sMapErrorReturnFail. */ Array sMapDNA(SMap* smap, STORE_HANDLE handle, sMapDNAErrorCallback callback) { Array dna ; int dna_length ; consMapDNAErrorReturn errorLocation = sMapErrorReturnContinue ; /* The dna length comes from the requested "area" of the smap, this would * correspond to the calculated area in say the fmap display. */ dna_length = smap->area2 - smap->area1 + 1 ; /* this is where we build the result */ dna = arrayHandleCreate(dna_length, unsigned char, handle) ; /* recursively add DNA */ if (sMapAddDNA(dna, smap->root, callback, &errorLocation) && errorLocation != sMapErrorReturnFail && errorLocation != sMapErrorReturnContinueFail) { /* now fill in low byte from high byte if necessary */ int i ; unsigned char *cp = arrp(dna, 0, unsigned char) ; for (i = 0 ; i < arrayMax(dna) ; i++, cp++) { if (*cp & 0xf) *cp &= 0xf ; else *cp >>= 4 ; } /* We must reset the arrayMax of the dna array to be original array * length because sMapAddDNA() leaves it as the last position that * it found dna for, we need to return the whole array. */ arrayMax(dna) = dna_length ; } else { arrayDestroy(dna) ; dna = NULL ; } return dna ; } /* NB sMapDNA is called by dnaNewGet() if there is no directly attached DNA, and it calls dnaGet to get the dna for keys with directly attached DNA. We have been careful to ensure there is no loop here. */ /* We return TRUE if the object itself contains dna or if any of its */ /* children contain dna, otherwise FALSE. */ /* */ static BOOL sMapAddDNA(Array dna, SMapKeyInfo *info, sMapDNAErrorCallback callback, consMapDNAErrorReturn *errorLocation) { BOOL result = FALSE ; int i ; #ifdef ED_G_NEVER_INCLUDE_THIS_CODE /* This code means that we stop as soon as we find any dna on our way down through * parents to children in an smap, so we never see dna below.... * I have removed this restriction but it needs further testing. */ if (bIndexTag(info->key, str2tag("DNA"))) return sMapAddRawDNA(dna, info, callback, errorLocation); #endif /* ED_G_NEVER_INCLUDE_THIS_CODE */ if (bIndexTag(info->key, str2tag("DNA"))) { if (sMapAddRawDNA(dna, info, callback, errorLocation)) result = TRUE ; } /* This sequence has no dna directly attached so look in children. */ for (i = 0 ; i < arrayMax(info->children) ; ++i) { /* N.B. we return TRUE if one or more of the children contain dna. */ if (sMapAddDNA(dna, arr(info->children, i, SMapKeyInfo*), callback, errorLocation)) result = TRUE ; } return result; } static BOOL sMapAddRawDNA(Array dna, SMapKeyInfo *info, sMapDNAErrorCallback callback, consMapDNAErrorReturn *errorLocation) { /* Try and get this objects dna. */ OBJ obj = NULL ; Array a = NULL ; KEY key = KEY_UNDEFINED ; unsigned char *cp, *cq ; int i, j, length ; int area1 = info->smap->area1; int area2 = info->smap->area2; if (!(obj = bsCreate (info->key))) return FALSE; if (!bsGetKey (obj, str2tag("DNA"), &key) || !(a = dnaRawGet(key))) { bsDestroy(obj); return FALSE; } /* take minimum of declared and actual lengths of DNA */ if (!bsGetData (obj, _bsRight, _Int, &length) || (length > arrayMax(a))) length = arrayMax(a); if (length < 1) { arrayDestroy(a); bsDestroy(obj); return FALSE; /* don't have to worry about zero length later */ } /* shift sequence allowed to mismatch into high byte */ if (info->mismatch) for (i = 0 ; i < arrayMax (info->mismatch) ; ++i) { SMapMismatch *mis = arrp(info->mismatch, i, SMapMismatch) ; for (j = mis->s1 - 1 ; (j < mis->s2 - 1) && (j < length) ; ++j) arr(a,j,unsigned char) <<= 4 ; } /* strategy is to complain if low half-bytes don't match then OR with new sequence, allowing high byte ambiguity codes */ for (i = 0 ; i < arrayMax (info->map) ; ++i) { SMapMap *map = arrp(info->map, i, SMapMap); int s1 = map->s1; int s2 = map->s2; int r1 = map->r1; int r2 = map->r2; if (info->strand == strand_down) { /* May not have enough DNA: adjust. */ if (s2 > length) { r2 -= s2 - length; s2 = length; } /* Now truncate based on Area and adjust r-coords to take it into account. area1 <= area2 always */ if (r1 < area1) { s1 += area1 - r1; r1 = area1; } if (r2 > area2) { s2 -= r2 - area2; r2 = area2; } if (s1 > s2) /* no overlap with area at all. */ continue; { int max_length = r2 - area1 + 1 ; if (arrayMax(dna) < max_length) arrayMax(dna) = max_length ; } cp = arrp(dna, (r1 - area1), unsigned char) ; cq = arrp(a, (s1 - 1), unsigned char) ; for (j = 0; j <= s2-s1 ; j++) { if ((*cp & 0xf) && (*cp & 0xf) != (*cq & 0xf) && (*errorLocation == sMapErrorReturnContinue || *errorLocation == sMapErrorReturnContinueFail)) *errorLocation = callback ? (*callback)(key, s1 + j) : sMapErrorReturnFail; *cp++ |= *cq++ ; } } else { /* up-strand version. */ /* May not have enough DNA: adjust. */ if (s2 > length) { r2 += s2 - length; s2 = length; } /* Now truncate based on Area and adjust r-coords to take it into account. area1 <= area2 always */ if (r2 < area1) { s2 -= area1 - r2; r2 = area1; } if (r1 > area2) { s1 += r1 - area2; r1 = area2; } if (s1 > s2) /* no overlap with area at all. */ continue; if (arrayMax(dna) < r1 - area1 + 1) arrayMax(dna) = r1 - area1 + 1; cp = arrp(dna, (r1 - area1), unsigned char) ; cq = arrp(a, (s1 - 1), unsigned char) ; for (j = 0; j <= s2-s1 ; j++) { unsigned char base = complementBase[(int)(*cq & 0xf)] ; if ((*cp & 0xf) && (*cp & 0xf) != base && (*errorLocation == sMapErrorReturnContinue || *errorLocation == sMapErrorReturnContinueFail)) *errorLocation = callback ? (*callback)(key, s1 + j) : sMapErrorReturnFail; *cp-- |= (complementBase[(int)(*cq++ >> 4)] << 4) | base; } } } bsDestroy (obj) ; arrayDestroy (a) ; return TRUE; } /*************************************************************/ /************ now the stuff to build smaps *******************/ static int SMapMapOrder (void *va, void *vb) { SMapMap *a = (SMapMap*)va ; SMapMap *b = (SMapMap*)vb ; return (a->s1 - b->s1) ; } /******************************/ /* For a given span find the sequence object that contains that span * without gaps. If sMapSpanPredicate is non-NULL, it gets used to filter the output. */ /* Finds smallest containing smap'd object that fits the predicate specified * by predicate function, or any smallest containing smap'd object if no * predicate. * * This is richards comment about this routine when it was in fmapcontrol.c * * "Mar 13 22:55 2002 (rd): fixed fMapFindSpan() to use sMapInverseMap() not SEGs * and removed fMapFindZoneFather(), replacing calls with fMapFindSpan()" * * You should NOTE WELL that smap coords run 1 -> length (i.e. do not start at 0) * */ BOOL sMapFindSpan(SMap *smap, KEY *keyp, int *start, int *end, sMapSpanPredicate predicate) { BOOL found ; KEYSET ks ; int i, min = INT_MAX ; int x, y, xResult=0, yResult=0 ; KEY key, result = KEY_UNDEFINED ; SMapStatus status ; SMapKeyInfo *info ; messAssert(smap != NULL && keyp != NULL && start != NULL && end != NULL) ; ks = sMapKeys(smap, 0) ; for (i = 0 ; i < keySetMax(ks) ; ++i) { int key_len ; key = keySet (ks, i) ; if (predicate && !(*predicate)(key)) continue ; info = sMapKeyInfo (smap, key) ; status = sMapInverseMap (info, *start, *end, &x, &y, 0, 0) ; if (status == SMAP_STATUS_PERFECT_MAP && (key_len = sMapLength(key)) < min) { result = key ; xResult = x ; yResult = y ; min = key_len ; } } keySetDestroy(ks) ; if (!result) found = FALSE ; else { *keyp = result ; *start = xResult ; *end = yResult ; found = TRUE ; } return found ; } /******************************/ static Array sMapLocalMapEx (OBJ obj, /* in parent located at #SmapInfo */ int pStart, /* start coord in parent */ int pEnd, /* end coord in parent */ STORE_HANDLE handle, /* return array on this. */ Strand *strand) /* fill in for direction.*/ { SMapMap *m ; Array u = arrayCreate (16, BSunit) ; BSunit *ui ; int i ; Array localMap ; int DNAfac, pepFac; BSMARK mark ; Strand alignStrand = strand_down; /* first get Align info into u */ mark = bsMark(obj, 0) ; if (bsGetArray (obj, str2tag("Align"), u, 3)) { DNAfac = 1 ; pepFac = 1 ; } else if (bsGetArray (obj, str2tag("AlignDNAPep"), u, 3)) { DNAfac = 3 ; pepFac = 1 ; } else if (bsGetArray (obj, str2tag("AlignPepDNA"), u, 3)) { DNAfac = 1 ; pepFac = 3 ; } else { arrayDestroy(u); bsGoto (obj, mark) ; bsMarkFree (mark) ; return NULL ; } bsGoto (obj, mark) ; bsMarkFree (mark) ; /* next convert u into localMap */ localMap = arrayHandleCreate (arrayMax(u)/3, SMapMap, handle) ; for (i = 0 ; i < arrayMax(u)/3 ; ++i) { m = arrayp(localMap, i, SMapMap) ; ui = arrp(u, 3*i, BSunit) ; m->s1 = ui[1].i ; m->r1 = ui[0].i ; if (ui[2].i) { /* negative length means invert, but only when pStart == pEnd, * this in the database: * s_child s_sequence 100 100 Align 100 1 -1 * gives a single base sequence on reverse strand. */ if (ui[2].i < 0) { alignStrand = strand_up; m->s2 = m->s1 - ui[2].i - 1 ; } else m->s2 = m->s1 + ui[2].i - 1 ; } } arraySort(localMap, SMapMapOrder) ; /** could/should check colinearity here? **/ /* fill in blank ->s1 and all ->r2 values; must come after sort */ for (i = 0 ; i < arrayMax(localMap)-1 ; ++i) { m = arrp(localMap, i, SMapMap) ; if (pEnd > pStart) { /* forward alignment */ if (!m->s2) { if (((m+1)->s1 - m->s1) / pepFac < (((m+1)->r1 - m->r1) / DNAfac)) m->s2 = (m+1)->s1 - 1 ; else m->s2 = m->s1 + ((((m+1)->r1 - m->r1) / DNAfac) - 1) * pepFac ; } m->r2 = m->r1 + ((((m->s2 - m->s1 + 1) * DNAfac) / pepFac) - 1) ; } else { /* reversed alignment */ if (!m->s2) { if (((m+1)->s1 - m->s1) / pepFac < ((m->r1 - (m+1)->r1) / DNAfac)) m->s2 = (m+1)->s1 - 1 ; else m->s2 = m->s1 + (((m->r1 - (m+1)->r1) / DNAfac) - 1) * pepFac ; } m->r2 = m->r1 - ((((m->s2 - m->s1 + 1) * DNAfac) / pepFac) - 1) ; } } m = arrp(localMap, arrayMax(localMap)-1, SMapMap) ; m->r2 = pEnd ; if (!m->s2) { if (pEnd > pStart) m->s2 = m->s1 + ((m->r2 - m->r1) / DNAfac) * pepFac ; else m->s2 = m->s1 + ((m->r1 - m->r2) / DNAfac) * pepFac ; } /** could check total lengths match here if given **/ if (strand) { if (pEnd > pStart) /* forward alignment */ *strand = strand_down; else if (pEnd < pStart) *strand = strand_up; else /* pStart == pEnd */ *strand = alignStrand; } arrayDestroy (u) ; return localMap ; } Array sMapLocalMap (OBJ obj, /* in parent located at #SmapInfo */ int pStart, /* start coord in parent */ int pEnd, /* end coord in parent */ STORE_HANDLE handle) /* return array on this. */ { return sMapLocalMapEx(obj, pStart, pEnd, handle, NULL); } /******************************/ static Array sMapComposeMaps(SMap *smap, Array localMap, int parent_length, Array parentMap, STORE_HANDLE handle) { Array productMap ; SMapMap *m1, *m2, *m3 ; SMapStatus status; struct SMapMapReturn mapRet ; int i; /* Warning: a picture helps A LOT if you want to follow this function */ /* Notation: m1 in localMap maps x-space to y-space m2 in parentMap maps y-space to z-space m3 in the productMap maps x-space to z-space */ productMap = arrayHandleCreate (arrayMax(localMap) + arrayMax(parentMap) - 1, SMapMap, handle) ; /* First loop over localMap, creating 0, 1 or more segments in * productMap for each localMap segment. Treat separately the two * cases where localMap is reversed or not reversed. */ if (arrp(localMap,0,SMapMap)->r1 < arrp(localMap,arrayMax(localMap)-1,SMapMap)->r2) /* localMap is not reversed - easier to think about */ for (i = 0 ; i < arrayMax(localMap) ; ++i) { m1 = arrp(localMap, i, SMapMap) ; status = sMapMapEx(smap, parent_length, parentMap, m1->r1, m1->r2, &mapRet) ; if (status & SMAP_STATUS_NO_OVERLAP) continue ; m3 = arrayp(productMap, arrayMax(productMap), SMapMap) ; /* make at least one segment */ if (!(status & SMAP_STATUS_INTERNAL_GAPS)) /* easy case */ { m3->s1 = m1->s1 + (mapRet.nx1 - m1->r1) ; m3->r1 = mapRet.y1 ; m3->s2 = m1->s2 - (m1->r2 - mapRet.nx2) ; m3->r2 = mapRet.y2 ; } else /* complex case when INTERNAL_GAPS */ { /* start first segment as above */ m3->s1 = m1->s1 + (mapRet.nx1 - m1->r1) ; m3->r1 = mapRet.y1 ; m2 = lowestUpperSeg (parentMap, m1->r1) ; while (m2->s2 < mapRet.nx2) /* all the rest of m2 is in the m1 alignment */ { m3->s2 = m1->s1 + (m2->s2 - m1->r1) ; m3->r2 = m2->r2 ; /* finish this segment */ ++m2 ; m3 = arrayp(productMap, arrayMax(productMap), SMapMap) ; /* and start another */ m3->s1 = m1->s1 + (m2->s1 - m1->r1) ; m3->r1 = m2->r1 ; } m3->s2 = m1->s2 - (m1->r2 - mapRet.nx2) ; m3->r2 = mapRet.y2 ; /* finish final segment */ } } else /* localMap is reversed */ for (i = 0 ; i < arrayMax(localMap) ; ++i) { m1 = arrp(localMap, i, SMapMap) ; status = sMapMapEx(smap, parent_length, parentMap, m1->r1, m1->r2, &mapRet) ; if (status & SMAP_STATUS_NO_OVERLAP) continue ; m3 = arrayp(productMap, arrayMax(productMap), SMapMap) ; /* make at least one segment */ if (!(status & SMAP_STATUS_INTERNAL_GAPS)) /* easy case */ { m3->s1 = m1->s1 + (m1->r1 - mapRet.nx1) ; m3->r1 = mapRet.y1 ; m3->s2 = m1->s2 - (mapRet.nx2 - m1->r2) ; m3->r2 = mapRet.y2 ; } else /* complex case when INTERNAL_GAPS */ { /* start first segment as above */ m3->s1 = m1->s1 + (m1->r1 - mapRet.nx1) ; m3->r1 = mapRet.y1 ; m2 = highestLowerSeg (parentMap, m1->r1) ; while (m2->s1 > mapRet.nx2) /* all the rest of m2 is in the m1 alignment */ { /* finish this segment */ m3->s2 = m1->s1 + (m1->r1 - m2->s1) ; m3->r2 = m2->r1 ; --m2 ; /* and start another */ m3 = arrayp(productMap, arrayMax(productMap), SMapMap) ; m3->s1 = m1->s1 + (m1->r1 - m2->s2) ; m3->r1 = m2->r2 ; } m3->s2 = m1->s2 - (mapRet.nx2 - m1->r2) ; m3->r2 = mapRet.y2 ; /* finish final segment */ } } /* finally must go through and check that we don't have abutting segments - if so join them */ /* reuse m1, m2, m3 all in productMap */ m2 = m1 = arrp(productMap, 0, SMapMap) ; for (i = 0 ; i < arrayMax(productMap)-1 ; ++m2, ++i) { m3 = m2 + 1 ; if (m3->s1 > m2->s2 + 1 || m3->r1 > m2->r2 + 1 || m3->r1 < m2->r2 - 1) { if (m1 != m2) { m1->s2 = m2->s2 ; m1->r2 = m2->r2 ; } ++m1 ; if (m1 != m3) { m1->s1 = m3->s1 ; m1->r1 = m3->r1 ; } } } if (m1 != m2) { m1->s2 = m2->s2 ; m1->r2 = m2->r2 ; } arrayMax(productMap) = (m1 - arrp(productMap, 0, SMapMap)) + 1 ; return productMap ; } /******************************/ static void sMapInvertMap (Array map) { int i, t ; SMapMap *m, *n ; /* N.B. we compare the first segment with the _last_ to detect reverse */ /* coords, comparing within a segment does not if r1 == r2 ! */ if (arrp(map, 0, SMapMap)->r1 < arrp(map, (arrayMax(map) - 1), SMapMap)->r2) for (i = 0 ; i < arrayMax(map) ; ++i) { m = arrp(map, i, SMapMap) ; t = m->s1 ; m->s1 = m->r1 ; m->r1 = t ; t = m->s2 ; m->s2 = m->r2 ; m->r2 = t ; } else for (i = 0 ; i < arrayMax(map) ; ++i) { m = arrp(map, i, SMapMap) ; n = arrp(map, arrayMax(map)-1-i, SMapMap) ; t = m->s1 ; m->s1 = n->r2 ; n->r2 = t ; t = m->s2 ; m->s2 = n->r1 ; n->r1 = t ; } } /*********************************************************/ static SMapKeyInfo *sMapNewInfo (SMap* smap, KEY key, KEY parent) { SMapKeyInfo *info = (SMapKeyInfo*) halloc (sizeof(SMapKeyInfo), smap->handle) ; info->smap = smap ; info->key = key ; info->parent = parent; assInsert (smap->key2index, assVoid(key), info) ; info->children = arrayHandleCreate (8, SMapKeyInfo*, smap->handle) ; return info ; } /*****************/ static SMapKeyInfo *sMapAddNode(SMap *smap, SMapKeyInfo *parentInfo, KEY key, /* key to add */ int pStart, /* start coord in parent */ int pEnd, /* end coord in parent */ OBJ parentObj, /* located at #SmapInfo */ BOOL invert) { SMapKeyInfo *info = NULL; OBJ obj = NULL ; Array localMap = NULL, u = NULL ; SMapMismatch *mis = NULL ; SMapMap *m = NULL ; int i ; Strand strand; if (assFind (smap->key2index, assVoid(key), 0)) /* we have it already */ return NULL; info = sMapNewInfo (smap, key, parentInfo->key) ; array(parentInfo->children, arrayMax(parentInfo->children), SMapKeyInfo*) = info ; /* We need to keep a record of the proper length of the child so that we can calculate * the position of reverse strand features properly. pStart/pEnd may be the same as length * but don't have to be (e.g. their may be Align information that * means that pStart and pEnd are inside the childs true length). */ if ((obj = bsCreate (key))) { if (!(info->length = getLengthInternal(obj))) { info->length = getLengthOfChildren(obj) ; /* No need to get parent coords we already have them. */ if (info->length < abs(pEnd - pStart) + 1) info->length = abs(pEnd - pStart) + 1 ; } bsDestroy(obj) ; } /* make local map then compose with parent map to give map */ if (!(localMap = sMapLocalMapEx(parentObj, pStart, pEnd, 0, &strand))) { localMap = arrayCreate (1, SMapMap) ; m = arrayp (localMap, 0, SMapMap) ; m->s1 = 1 ; m->r1 = pStart ; m->r2 = pEnd ; if (pEnd >= pStart) /* default for 1 base block is down strand */ { m->s2 = pEnd - pStart + 1; strand = strand_down; } else { m->s2 = pStart - pEnd + 1; strand = strand_up; } } /* DON'T KNOW WHAT THIS COMMENT IS REFERRING TO.... */ /* Note that ExonMap is always forward so we can ignore that */ if (strand == strand_up) info->strand = invertStrand(parentInfo->strand); else info->strand = parentInfo->strand; #ifdef ACEDB4 if (!invert) { Array exonMap; if ((exonMap = getExonMap(key))) { Array unspliceMap = sMapComposeMaps(smap, localMap, parentInfo->length, parentInfo->map, smap->handle); info->map = sMapComposeMaps(smap, exonMap, info->length, unspliceMap, smap->handle) ; arrayDestroy(exonMap); arrayDestroy(unspliceMap); } else info->map = sMapComposeMaps(smap, localMap, parentInfo->length, parentInfo->map, smap->handle); } else { /* The next bit is REALLY TRICKY. If we are following a mapping backwards and the parent (really child) has source exons, then the mapping we have for the parent will include the mapping from the source exons. But the child (really parent) maps to the unspliced (ie no exons) version. So before doing anything else we have to remove the source exons mapping. Then we apply the reverse SE mapping (if any) for the child (really parent) and finally the smap mapping. You are not expected to understand this. */ Array parentExonMap; sMapInvertMap(localMap); if ((parentExonMap = getExonMap(parentInfo->key))) { Array unspliceMap; sMapInvertMap(parentExonMap); unspliceMap = sMapComposeMaps(smap, parentExonMap, parentInfo->length, parentInfo->map, smap->handle); info->map = sMapComposeMaps(smap, localMap, info->length, unspliceMap, smap->handle); arrayDestroy(parentExonMap); arrayDestroy(unspliceMap); } else info->map = sMapComposeMaps(smap, localMap, parentInfo->length, parentInfo->map, smap->handle); } #else if (invert) sMapInvertMap(localMap); info->map = sMapComposeMaps(smap, localMap, parentInfo->length, parentInfo->map, smap->handle) ; #endif /* !ACEDB4 */ arrayDestroy(localMap); /* get and store info->mismatch */ u = arrayCreate (16, BSunit) ; if (bsGetArray (parentObj, str2tag("Mismatch"), u, 2)) { info->mismatch = arrayHandleCreate (arrayMax(u)/2, SMapMismatch, smap->handle) ; for (i = 0 ; i < arrayMax(u) ; i += 2) { mis = arrayp(info->mismatch, arrayMax(info->mismatch), SMapMismatch) ; mis->s1 = arr(u,i,BSunit).i ; mis->s2 = arr(u,i+1,BSunit).i ? arr(u,i+1,BSunit).i : mis->s1 ; } } else if (bsFindTag (parentObj, str2tag("Mismatch"))) /* whole sequence */ { info->mismatch = arrayHandleCreate (1, SMapMismatch, smap->handle) ; mis = arrayp(info->mismatch, 0, SMapMismatch) ; mis->s1 = arrp(info->map, 0, SMapMap)->s1 ; mis->s2 = arrp(info->map, arrayMax(info->map), SMapMap)->s2 ; } else info->mismatch = NULL; arrayDestroy (u) ; if (info->mismatch && !arrayMax (info->mismatch)) arrayDestroy (info->mismatch) ; return info; } /* ParentDone flag for efficiency, set when going down tree, cleared going up. * aqlCondition does nothing yet */ static void sMapAddNeighbours(SMap *smap, SMapKeyInfo *keyInfo, char *aqlCondition, BOOL parentDone) { int pos1, pos2 ; BSMARK mark1 = 0, mark2 = 0 ; OBJ obj ; KEY child, key = keyInfo->key ; SMapKeyInfo *newInfo; /* check if any children first before opening */ if (bIndexTag (key, str2tag("S_Child")) || (!parentDone && bIndexTag(key, str2tag("S_Parent"))) #ifdef ACEDB4 || bIndexTag (key, str2tag("Subsequence")) || (!parentDone && bIndexTag(key, str2tag("Source"))) #endif ) { if (!(obj = bsCreate (key))) return ; if (bsGetKeyTags (obj, str2tag("S_Child"), 0)) do { mark1 = bsMark (obj, mark1) ; if (bsGetKey (obj, _bsRight, &child)) do { mark2 = bsMark (obj, mark2) ; if (bsGetData (obj, _bsRight, _Int, &pos1) && bsGetData (obj, _bsRight, _Int, &pos2) && !(sMapMapEx(smap, keyInfo->length, keyInfo->map, pos1, pos2, NULL) & (SMAP_STATUS_NO_OVERLAP | SMAP_STATUS_OUTSIDE_AREA))) { bsPushObj (obj) ; if ((newInfo = sMapAddNode(smap, keyInfo, child, pos1, pos2, obj, FALSE))) sMapAddNeighbours (smap, newInfo, aqlCondition, TRUE) ; } bsGoto (obj, mark2) ; } while (bsGetKey (obj, _bsDown, &child)) ; bsGoto (obj, mark1) ; } while (bsGetKeyTags (obj, _bsDown, 0)) ; #ifdef ACEDB4 /* support Subsequence not under S_Child temporarily */ bsGoto (obj, 0) ; if (bsGetKey (obj, str2tag("Subsequence"), &child)) do { mark1 = bsMark (obj, mark1) ; if (bsGetData (obj, _bsRight, _Int, &pos1) && bsGetData (obj, _bsRight, _Int, &pos2) && !(sMapMapEx(smap, keyInfo->length, keyInfo->map, pos1, pos2, NULL) & (SMAP_STATUS_NO_OVERLAP | SMAP_STATUS_OUTSIDE_AREA))) { bsPushObj (obj) ; if ((newInfo = sMapAddNode(smap, keyInfo, child, pos1, pos2, obj, FALSE))) sMapAddNeighbours (smap, newInfo, aqlCondition, TRUE) ; } bsGoto (obj, mark1) ; } while (bsGetKey (obj, _bsDown, &child)) ; #endif /* ACEDB4 */ if (!parentDone) { KEY parent = 0; OBJ parentObj; if (bsGetKeyTags(obj, str2tag("S_Parent"), 0) && bsGetKey(obj, _bsRight, &parent)) { if ((parentObj = bsCreate(parent))) { if (bsFindKey2 (parentObj, str2tag("S_Child"), key, 0) && bsGetData (parentObj, _bsRight, _Int, &pos1) && bsGetData (parentObj, _bsRight, _Int, &pos2) && (newInfo = sMapAddNode(smap, keyInfo, parent, pos1, pos2, parentObj, TRUE))) /* Note child, parent swap here */ sMapAddNeighbours (smap, newInfo, aqlCondition, FALSE) ; bsDestroy(parentObj); } } #ifdef ACEDB4 else if (bsGetKey (obj, str2tag("Source"), &parent)) { if ((parentObj = bsCreate(parent))) { if (bsFindKey (parentObj, str2tag("Subsequence"), key) && bsGetData (parentObj, _bsRight, _Int, &pos1) && bsGetData (parentObj, _bsRight, _Int, &pos2) && (newInfo = sMapAddNode(smap, keyInfo, parent, pos1, pos2, parentObj, TRUE))) /* Note child, parent swap here */ sMapAddNeighbours (smap, newInfo, aqlCondition, FALSE) ; bsDestroy(parentObj); } } #endif } bsMarkFree (mark1) ; bsMarkFree (mark2) ; bsDestroy (obj) ; } return ; } /* Recursively builds map down from key between start and stop inclusive * in key's coordinate system. * The idea is that the aqlCondition is applied to test whether to open * each object, but this is not implemented yet. */ SMap *sMapCreate(STORE_HANDLE handle, KEY key, int x1, int x2, char *aqlCondition) { int area; messAssert(x1 != 0 && x2 != 0) ; if (x1 < x2) area = 1 + x2 - x1; else area = 1 + x1 - x2; return sMapCreateEx(handle, key, x1, x2, 1, area, aqlCondition); } /* area1 and area2 are limits in maps coordinate system. (ie 1 based ) Only objects which overlap area1 to area2 are included, (but they are not truncated to that area if they partialy overlap). DNA is only returned from area1 to area2 The first base if DNA is at coordinate area1 */ SMap* sMapCreateEx(STORE_HANDLE handle, KEY key, int x1, int x2, int area1, int area2, char *aqlCondition) { STORE_HANDLE newHandle = handleHandleCreate(handle); SMap* smap = (SMap*) handleAlloc (0, newHandle, sizeof(SMap)) ; SMapMap *m ; int end ; /* removed handle from messAssert as it's sometimes legitimately null */ messAssert( key != KEY_UNDEFINED && !(x1 == 0 && x2 == 0) && !(area1 == 0 && area2 == 0)) ; smap->handle = newHandle; smap->key2index = assHandleCreate (newHandle) ; /* Can be reversed if caller has got the coords of a reverse strand feature and is setting * the area from those coords. */ if (area1 < area2) { smap->area1 = area1 ; smap->area2 = area2 ; } else { smap->area1 = area2 ; smap->area2 = area1 ; } /* insert self into smap as root */ smap->root = sMapNewInfo (smap, key, 0) ; smap->root->map = arrayHandleCreate (1, SMapMap, smap->handle) ; m = arrayp(smap->root->map, 0, SMapMap) ; smap->root->length = sMapLength(key) ; if (x1 == 0) x1 = smap->root->length ; else if (x2 == 0) x2 = smap->root->length ; if (x2 > x1) { smap->root->strand = strand_down; m->s1 = x1 ; m->s2 = x2 ; m->r1 = 1 ; m->r2 = end = x2 - x1 + 1 ; } else { smap->root->strand = strand_up; m->s1 = x2 ; m->s2 = x1 ; m->r1 = end = x1 - x2 + 1 ; m->r2 = 1 ; } smap->length = end ; /* now recursively add all neighbours */ sMapAddNeighbours (smap, smap->root, aqlCondition, FALSE) ; return smap ; } void sMapDestroy (SMap* smap) { handleDestroy(smap->handle) ; return ; } /******************************************************************/ /* binary searches to find segments used in sMapMap and sMapComposeMaps */ static SMapMap *lowestUpperSeg (Array map, int x) { int i = 0, j = arrayMax(map)-1 ; while (i < j) if (x > arrp(map, (i+j)/2, SMapMap)->s2) i = (i+j)/2 + 1 ; else j = (i+j)/2 ; return arrp(map, i, SMapMap) ; } static SMapMap *highestLowerSeg (Array map, int x) { int i = 0, j = arrayMax(map)-1 ; while (i < j) if (x < arrp(map, (i+j)/2+1, SMapMap)->s1) j = (i+j)/2 ; else i = (i+j)/2 + 1 ; return arrp(map, j, SMapMap) ; } /************************************************************************/ /******** routines to be used independently/prior to SMAP creation ******/ /* * The remaining functions don't need SMap structure - they can be used * independently, e.g. sMapTreeRoot() to find the ultimate ancestor prior to * calling sMapCreate(). * */ /* find parent and return map between key and parent. inobj may be bsCreate(key) or NULL */ static Array mapToParent(KEY key, OBJ keyObj, KEY *kp, int *length_in_parent) { OBJ obj = 0; KEY parent ; int z1, z2; Array map ; int obj_length ; /* find self in parent */ /* slightly convoluted to allow use of Source #ifdef ACEDB4 */ parent = 0; if (bsGetKeyTags (keyObj, str2tag("S_Parent"), 0)) bsGetKey (keyObj, _bsRight, &parent) ; if (!(parent && (obj = bsCreate (parent)) && bsFindKey2 (obj, str2tag("S_Child"), key, 0) && bsGetData (obj, _bsRight, _Int, &z1) && bsGetData (obj, _bsRight, _Int, &z2))) { if (obj) bsDestroy (obj) ; #ifndef ACEDB4 return NULL; #else /* accept Source not under S_Parent temporarily */ obj = bsCreate (key) ; parent = 0; bsGetKey (obj, str2tag("Source"), &parent); bsDestroy (obj) ; if (!(parent && (obj = bsCreate (parent)) && bsFindKey (obj, str2tag("Subsequence"), key) && bsGetData (obj, _bsRight, _Int, &z1) && bsGetData (obj, _bsRight, _Int, &z2))) { if (obj) bsDestroy (obj) ; return NULL; } #endif /* ACEDB4 */ } obj_length = abs(z2 - z1) + 1 ; bsPushObj (obj) ; /* go into #SMap_info */ if (!(map = sMapLocalMap(obj, z1, z2, 0))) { SMapMap *m ; map = arrayCreate (1, SMapMap) ; m = arrayp (map, 0, SMapMap) ; m->r1 = z1 ; m->r2 = z2 ; m->s1 = 1 ; m->s2 = (z2 > z1) ? (z2 - z1 + 1) : (z1 - z2 + 1) ; } bsDestroy (obj) ; #ifdef ACEDB4 /* This routine maps from child coords back into parent coords. If the child has Source_exons tags we need that mapping too. */ { Array temp, exonMap; if ((exonMap = getExonMap(key))) { temp = sMapComposeMaps(NULL, exonMap, obj_length, map, 0); arrayDestroy(map); arrayDestroy(exonMap); map = temp; } } #endif if (kp) *kp = parent; if (length_in_parent) *length_in_parent = obj_length ; return map; } /* finds next enclosing sequence, with coords in that sequence * returns FALSE if no enclosing object, or can't map coords into it * x2 == 0 means end of sequence */ /* finds next enclosing sequence, with coords in that sequence * N.B. does _not_ return SMAP_STATUS_OUTSIDE_AREA because does * not involve an smap. */ BOOL sMapEnclosedBy(KEY key, int x1, int x2, KEY *kp, int *y1, int *y2) { Array map; KEY parent; SMapStatus status; struct SMapMapReturn ret; OBJ obj; int length_in_parent ; /* Is there an enclosing object (i.e. parent) ?? */ if (hasNoParent(key)) return FALSE; if (!(obj = bsCreate(key))) return FALSE; if (!(map = mapToParent(key, obj, &parent, &length_in_parent))) { bsDestroy(obj); return FALSE; } /* We only fail if there is no overlap at all, otherwise we return coords */ /* that may be clipped. */ status = sMapMapEx(NULL, length_in_parent, map, x1, x2, &ret); arrayDestroy (map) ; bsDestroy(obj); if (status & SMAP_STATUS_NO_OVERLAP) return FALSE ; else { if (kp) *kp = parent ; if (y1) *y1 = ret.y1 ; if (y2) *y2 = ret.y2 ; } return TRUE ; } /*****************/ /* Finds x1,x2 coords of key in top parent of keys smap tree by iteration of * sMapEnclosedBy() until FALSE, * returns TRUE if first call to sMapEnclosedBy returns TRUE */ /* Finds x1,x2 coords of key in top parent of smap tree that contains key. * Returns TRUE if key _and_ coords map, FALSE otherwise. * * Note that this is just a cover function for sMapTree(), this function * has been retained because its used in quite a few places in the code. */ BOOL sMapTreeRoot(KEY key, int x1, int x2, KEY *kp, int *y1, int *y2) { BOOL result = FALSE ; KEY target_key = KEY_UNDEFINED ; /* sMapTree() will go to root key if target_key is KEY_UNDEFINED, note * we only change *kp if we succeed. */ if ((result = sMapTreeCoords(key, x1, x2, &target_key, y1, y2))) { if (kp) *kp = target_key ; } return result ; } /* Finds x1,x2 coords of key in target_parent, which must be a parent of * of key in keys smap tree, if target_parent is set to KEY_UNDEFINED it * does the same as sMapTreeRoot(). */ /* Find the x1,x2 coords of "key" in "parent" and return them in y1,y2. * If "parent" has the value KEY_UNDEFINED then returns the coordinates * of "key" in the top or root object of the smap. * Returns FALSE if the coords are completely outside or there is some * other problem like "parent" is not in the smap that contains "key". * * NOTE: sMapLength() must _not_ call this function, otherwise there will be recursion... */ BOOL sMapTreeCoords(KEY key, int x1, int x2, KEY *parent, int *y1, int *y2) { BOOL result = FALSE ; static Associator hashSet = NULL ; KEY next_parent = KEY_UNDEFINED, tmp_parent = KEY_UNDEFINED ; int k1 = 0, k2 = 0 ; int length; if (iskey(key) != LEXKEY_IS_OBJ || x1 < 0 || x2 < 0 || (x1 == 0 && x2 == 0) || (*parent != KEY_UNDEFINED && iskey(*parent) != LEXKEY_IS_OBJ)) return FALSE ; /* simplifies coding to have dummy variables for results if not supplied by caller. */ if (!parent) parent = &tmp_parent ; if (!y1) y1 = &k1 ; if (!y2) y2 = &k2 ; if (sMapEnclosedBy(key, x1, x2, &next_parent, y1, y2)) { /* Enclosing parent so look for coords in that parent. */ /* use hashSet to break loops in object tree. */ hashSet = assReCreate (hashSet) ; assInsert (hashSet, assVoid(key), assVoid(1)) ; while (next_parent != *parent && assInsert(hashSet, assVoid(next_parent), assVoid(1)) && sMapEnclosedBy(next_parent, *y1, *y2, &next_parent, y1, y2)) ; if (next_parent == *parent) result = TRUE ; else if (*parent == KEY_UNDEFINED) { result = TRUE ; *parent = next_parent ; } } else if ((length = sMapLength(key)) && (x1 > 0 || x2 > 0) && (x1 <= length || x2 <= length)) { /* No enclosing object/parent so return key + coords of this object. */ /* In some ways we should really do an sMapMap() here for purity but */ /* perhaps this short cut is acceptable. */ *parent = key ; if (x1 < x2 && x1 == 0) { *y1 = length ; *y2 = x2 ; } else if (x2 < x1 && x2 == 0) { *y1 = x1 ; *y2 = length ; } else if (x1 <= x2) { if (x1 < 0) *y1 = 1 ; else *y1 = x1 ; if (x2 > length) *y2 = length ; else *y2 = x2 ; } else /* x2 <= x1 */ { if (x2 < 0) *y2 = 1 ; else *y2 = x2 ; if (x1 > length) *y1 = length ; else *y1 = x1 ; } result = TRUE ; } return result ; } /* THE CORRECT LOGIC WOULD SEEM TO BE SOMETHING LIKE: * * If there is internal length use that * else if max(s_parent length, s_child length) * * REMEMBER that this is not relative to an smap, its just an attempt to get the length * of an object....... * * Remember also that any internal length (ie DNA tag) overrules anything else I think otherwise * the database is in error....if we don't do this I think everything becomes a bit of a nonsense, * possible to do but a bit of a nonsense.... * */ /* Try various strategies (DNA, S_parent, S_children) to find the sequence length of an object * could do all and check consistency? * N.B. does _not_ return SMAP_STATUS_OUTSIDE_AREA because does * not involve an smap. * * NOTE: You should not call sMapTreeRoot() from here either directly or indirectly * othewise there will be recursion in a run away sense.....*/ int sMapLength (KEY key) { int max = 0 ; OBJ obj = NULL ; if (!bIndexTag (key, str2tag("DNA")) && !bIndexTag (key, str2tag("S_Parent")) && !bIndexTag (key, str2tag("S_Child")) #ifdef ACEDB4 && !bIndexTag (key, str2tag("Source")) && !bIndexTag (key, str2tag("Subsequence")) #endif ) { max = 0 ; } else if (!(obj = bsCreate (key))) { max = 0 ; } else if ((max = getLengthInternal(obj))) { /* check first if object has its own intrinsic length, e.g. has its own DNA */ max = max ; } else { /* Otherwise we record whichever is the greater of the child and parent lengths. */ int child, parent ; child = getLengthOfChildren(obj) ; parent = getLengthInParent(obj) ; max = (child > parent ? child : parent) ; } bsDestroy (obj) ; return max ; } /* We may wish to promote the following set of getLengthXXX() routines to be part * of the external smap interface at some time but for now they will be an * internal routines only. */ /* Looks only in the object itself for tags that might explicitly give the length of * this object, i.e. we do not look at child or parent lengths. */ static int getLengthInternal(OBJ obj) { int result = 0, length = 0 ; /* Should we look for Source_exons as well to get that max length...?????? */ if (bsGetKey(obj, str2tag("DNA"), NULL) && bsGetData (obj, _bsRight, _Int, &length)) result = length ; return result ; } /* Looks in the object for S_Child information that implicitly could give the length of * this object, i.e. we derive the length of this object from the maximum span of the children. */ static int getLengthOfChildren(OBJ obj) { int result = 0, length = 0, i ; static Array units = NULL ; units = arrayReCreate(units, 48, BSunit) ; /* Note, there may be a mixture of S_Child and Subsequence tags, * it is vital to check both, do not ignore the Subsequences if * S_Childs exist. */ if (bsGetArray(obj, str2tag("S_Child"), units, 4)) { /* find highest coord of all the S_Child'ren */ for (i = 2 ; i < arrayMax (units) ; i += 4) { if (arr(units,i,BSunit).i > length) length = arr(units,i,BSunit).i ; if (arr(units,i+1,BSunit).i > length) length = arr(units,i+1,BSunit).i ; } } #ifdef ACEDB4 if (bsGetArray(obj, str2tag("Subsequence"), units, 3)) { /* or Subsequence in ACEDB 4 */ for (i = 1 ; i < arrayMax (units) ; i += 3) { if (arr(units,i,BSunit).i > length) length = arr(units,i,BSunit).i ; if (arr(units,i+1,BSunit).i > length) length = arr(units,i+1,BSunit).i ; } } #endif /* ACEDB4 */ if (length > 0) result = length ; return result ; } /* actually , is this correct even ? Why do we look for clipped coords ???? */ /* Maps object into parent to get length of object as recorded in parent. */ static int getLengthInParent(OBJ obj) { int result = 0, length_in_parent ; Array map ; if ((map = mapToParent(bsKey(obj), obj, NULL, &length_in_parent))) { /* next if S_parent */ struct SMapMapReturn ret ; if (!(sMapMapEx(NULL, length_in_parent, map, 1, 0, &ret) & SMAP_STATUS_NO_OVERLAP)) result = (ret.nx2 > ret.nx1) ? (ret.nx2 - ret.nx1 + 1) : (ret.nx1 - ret.nx2 + 1) ; arrayDestroy(map) ; } return result ; } #ifdef ACEDB4 /***********************************************************/ /******* a utility needed for Source_exons sorting *********/ static int dnaExonsOrder (void *a, void *b) { ExonStruct *ea = (ExonStruct *)a ; ExonStruct *eb = (ExonStruct *)b ; return ea->x - eb->x ; } /* can't just arraySort, because a is in BSunits, not * pairs of BSunits */ static void exonsSort (Array a) { int i, n ; Array b = 0 ; n = arrayMax(a) / 2 ; b = arrayCreate (n, ExonStruct) ; for (i = 0 ; i < n ; ++i) { arrayp(b,i,ExonStruct)->x = arr(a,2*i,BSunit).i ; arrp(b,i,ExonStruct)->y = arr(a,2*i+1,BSunit).i ; } arraySort (b, dnaExonsOrder) ; for (i = 0 ; i < n ; ++i) { arr(a,2*i,BSunit).i = arrp(b,i,ExonStruct)->x ; arr(a,2*i+1,BSunit).i = arrp(b,i,ExonStruct)->y ; } arrayDestroy (b) ; return ; } static Array getExonMap(KEY key) { Array u, exonMap = 0; OBJ obj = 0 ; int x, i ; SMapMap *m; u = arrayCreate (16, BSunit) ; if (bIndexTag(key, str2tag("Source_exons")) && (obj = bsCreate(key)) && bsGetArray (obj, str2tag("Source_exons"), u, 2)) { exonsSort (u) ; exonMap = arrayCreate(arrayMax(u)/2, SMapMap) ; x = 1 ; for (i = 0 ; i < arrayMax(u) ; i += 2) { m = arrayp (exonMap, i/2, SMapMap) ; m->r1 = arr(u,i,BSunit).i ; m->r2 = arr(u,i+1,BSunit).i ; m->s1 = x ; m->s2 = x + (m->r2 - m->r1) ; x += m->r2 - m->r1 + 1 ; } } if (obj) bsDestroy (obj) ; arrayDestroy(u); return exonMap; } #endif /* Is this object at the top of a hierachy (i.e. has no parent tags) ? */ /* */ static BOOL hasNoParent(KEY key) { BOOL result = FALSE ; if (bIndexTag(key, str2tag("S_Parent"))) result = FALSE ; else result = TRUE ; #ifdef ACEDB4 /* Objects either have S_Parent or Source, not both... */ if (result == TRUE) { if (bIndexTag(key, str2tag("Source"))) result = FALSE ; else result = TRUE ; } #endif return result ; } static void sMapDumpInfo(SMapKeyInfo *info, ACEOUT dest, int offset) { int i; aceOutPrint(dest, "\n"); for (i=0; ikey), sMapIsReverse(info, 1, 1) ? "Up" : "Down"); if (info->map) { aceOutPrint(dest, " map:"); for (i=0; imap); i++) { SMapMap *m = arrayp(info->map, i, SMapMap); aceOutPrint(dest, "%d,%d->%d,%d ", m->s1, m->s2, m->r1, m->r2); } } if (info->mismatch) { aceOutPrint(dest," mis:"); for (i=0; imap); i++) { SMapMismatch *m = arrayp(info->mismatch, i, SMapMismatch); aceOutPrint(dest, "%d,%d ", m->s1, m->s2); } } if (info->children) for (i=0; ichildren); i++) { SMapKeyInfo *inf = array(info->children, i, SMapKeyInfo *); sMapDumpInfo(inf, dest, offset+3); } return ; } /* human-readable dump of an smap to dest */ void sMapDump(SMap *smap, ACEOUT dest) { KEY rootkey; int i, k, k1, start, rx1, rx2 ; char *cp, *cq ; char buffer [8000] ; Array dna; aceOutPrint(dest, "\n"); aceOutPrint(dest, "sMap->nnnn:\n") ; aceOutPrint(dest, "\tsMap->length: %d\n", smap->length) ; aceOutPrint(dest, "\tsMap->area1: %d, sMap->area2: %d\n", smap->area1, smap->area2) ; aceOutPrint(dest, "sMap->root->nnnn:\n") ; if (!sMapTreeRoot(smap->root->key, 1, 0, &rootkey, &rx1, &rx2)) aceOutPrint(dest, "\tNO ROOT\n"); else aceOutPrint(dest, "\tRoot: %s [%d,%d]\n", nameWithClass(rootkey), rx1, rx2); aceOutPrint(dest, "\tsMapLength = %d\n", sMapLength(smap->root->key)); sMapDumpInfo(smap->root, dest, 0); aceOutPrint(dest, "\n\n"); aceOutPrint(dest, "sMap dna:\n") ; if (!(dna = sMapDNA(smap, 0, NULL))) aceOutPrint(dest, "NO DNA\n"); else { i = arrayMax(dna) ; aceOutPrint(dest, "Length = %d", i) ; if (i > 1000) { aceOutPrint(dest, " (only first 1,000 bases printed)", i); i = 1000 ; } aceOutPrint(dest, "\n") ; cp = arrp(dna,0,char) ; cq = buffer ; while(i > 0) { cq = buffer ; for (k=0 ; k < 4000/(50 + 3) ; k++) if (i > 0) { start = cp - arrp(dna,0,char) + smap->area1; cq += sprintf(cq, "%5d ", start); k1 = 50 ; while (k1-- && i--) *cq++ = dnaDecodeChar[*cp++ & 0xff] ; *cq++ = '\n' ; *cq = 0 ; } aceOutPrint(dest, "%s", buffer) ; } arrayDestroy(dna); } aceOutPrint(dest, "\n"); return ; } /*********************** end of file ************************/