/* File: fmapgff.c * Author: Ed Griffiths (edgrif@sanger.ac.uk) * Copyright (c) J Thierry-Mieg and R Durbin, 2002 *------------------------------------------------------------------- * Acedb is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * or see the on-line version at http://www.gnu.org/copyleft/gpl.txt *------------------------------------------------------------------- * This file is part of the ACEDB genome database package, written by * Richard Durbin (Sanger Centre, UK) rd@sanger.ac.uk, and * Jean Thierry-Mieg (CRBM du CNRS, France) mieg@kaa.crbm.cnrs-mop.fr * * Description: Contains routines needed to dump GFF features. * Main dumping routine extracts information from segs * but then calls a callback function which does output * in whatever format the caller wants. Currently straight * GFF and DASGFF are output. * * Exported functions: See wh/gff.h * HISTORY: * Last edited: Mar 26 14:31 2004 (edgrif) * Created: Fri Apr 19 10:04:23 2002 (edgrif) * CVS info: $Id: gff.c,v 1.14 2004/03/26 14:32:01 edgrif Exp $ *------------------------------------------------------------------- */ #include #include #include #include #include #include #include #include static char *cleanNewlines (char *s, STORE_HANDLE handle) ; static char strandOfKey(FeatureMap look, KEY key, int rootReversed) ; /* * -------------------- EXTERNAL FUNCTIONS -------------------- */ /* Establishes the reference sequence, offset, start/stop of the target * key, start/stop of features to be output (maybe outside start/stop * of target key. This is a separate routine from the dump routine * because some of this information is needed by the caller before * dumping starts. */ BOOL GFFRefSeqPos(FeatureMap look, int version, KEY *refSeq, KEY *seqKey_out, BOOL *reversed_out, int *offset_out, int *key_start_out, int *key_end_out, int *feature_start_out, int *feature_end_out) { BOOL result = TRUE ; int i ; int key_start, key_end, offset = 0, feature_start, feature_end ; int x, y ; SEG *seg ; KEY seqKey ; messAssert(look && version > 1 && seqKey_out && reversed_out && offset_out && key_start_out && key_end_out && feature_start_out && feature_end_out) ; /* first establish seqKey and offset */ seqKey = 0 ; if (refSeq && !*refSeq) *refSeq = look->seqOrig ; if (refSeq && *refSeq) { for (i = 1 ; i < arrayMax(look->segs) ; ++i) { seg = arrp(look->segs, i, SEG) ; if ((seg->type == SEQUENCE || seg->type == SEQUENCE_UP || seg->type == FEATURE_OBJ || seg->type == FEATURE_OBJ_UP) && seg->key == *refSeq) break ; } if (i < arrayMax(look->segs)) { if (seg->type == SEQUENCE || seg->type == SEQUENCE_UP || seg->type == FEATURE_OBJ || seg->type == FEATURE_OBJ_UP) { seqKey = *refSeq ; if (look->seqOrig == look->seqKey) { if (look->noclip) { /* Note that we do not use zoneMin/Max here, they don't make sense if you * have asked to see everything, why clip it with zoneMin/Max ? */ offset = 0 ; key_start = (look->start + (look->zoneMin + 1)) ; key_end = (look->start + look->zoneMax) ; feature_start = 0 ; feature_end = (look->fullLength - 1) ; } else { offset = look->start ; key_start = (offset + (look->zoneMin + 1)) ; key_end = (offset + look->zoneMax) ; feature_start = look->zoneMin ; feature_end = (look->zoneMax - 1) ; } } else { /* Find position of seqOrig in seqKey and use that and start to calculate * offset. */ int y1 = 0, y2 = 0 ; KEY key = KEY_UNDEFINED ; int begin, end ; if (seg->type == SEQUENCE || seg->type == FEATURE_OBJ) begin = 1, end = 0 ; else begin = 0, end = 1 ; /* not sure if the above screws everything up or not... */ begin = 1, end = 0 ; /* WHAT...WHAT IS THIS.... */ if (!sMapTreeRoot (look->seqOrig, begin, end, &key, &y1, &y2) && key != look->seqKey) messcrash("whoops") ; if (look->noclip) { /* Note that we do not use zoneMin/Max here, they don't make sense if you * have asked to see everything, why clip it with zoneMin/Max ? */ offset = -(look->start) ; key_start = y1 + offset ; key_end = y2 + offset ; feature_start = 0 ; feature_end = (look->fullLength - 1) ; } else { offset = (look->start + 1) - y1 ; key_start = (offset + (look->zoneMin + 1)) ; key_end = (offset + look->zoneMax) ; feature_start = look->zoneMin ; feature_end = (look->zoneMax - 1) ; } } *reversed_out = (seg->type == SEQUENCE_UP ? TRUE : FALSE) ; } } else { messout ("Can't find reference sequence %s", name (*refSeq)) ; return FALSE ; } } if (!seqKey) /* find minimal spanning sequence */ { x = look->zoneMin ; y = look->zoneMax ; seqKey = 0 ; if (!fMapFindSpanSequence(look, &seqKey, &x, &y)) seqKey = look->seqKey ; if (x > y) /* what if reversed? */ { messout("Can't GFF dump from reversed sequences for now. Sorry.") ; return FALSE ; } offset = x - look->zoneMin ; } if (refSeq) *refSeq = seqKey ; *seqKey_out = seqKey ; *key_start_out = key_start ; *key_end_out = key_end ; *offset_out = offset ; *feature_start_out = feature_start ; *feature_end_out = feature_end ; return result ; } /* This routine contains the loop that goes through the segs in an fmap look * and extracts information from them appropriately for gff style dumps, it * then calls the supplied GffCallBack routine which does the actual output. * So far there are callback routines to output in standard GFF format and * also in DASGFF format. * * All output fields apart from score and comment are mandatory, but most * default to GFF_UNKNOWN_FIELD (currently a dot). Those that must have * proper data are feature, start and end. */ BOOL GFFDump(GffCallBack app_cb, void *app_data, FeatureMap look, int version, KEY seqKey, int offset, int feature_start, int feature_end, GffListType list_type, DICT listSet, Array stats, DICT *sourceSet, DICT *featSet, DICT *methodSet, BOOL include_source, BOOL include_feature, BOOL include_method) { BOOL result = TRUE ; int i ; SEG *seg ; HOMOLINFO *hinf ; SEQINFO *sinf ; METHOD *meth ; KEY sourceKey ; char *sourceName, *featName ; char *methodName, *tagText = NULL ; int x, y, type ; int tmp = 0, reversed = 0 ; /* used by AcePerl */ float score ; char strand, frame ; char *comment = NULL ; BOOL flipped ; BOOL isScore ; STORE_HANDLE handle = handleCreate() ; Associator key2sinf = assHandleCreate (handle) ; /* bizarrely unused.... */ Associator key2source = assHandleCreate (handle) ; Associator key2feat = assHandleCreate (handle) ; GString *attributes ; enum {GFF_ATTR_INITLEN = 256} ; /* Vague guess at initial length. */ BOOL is_structural ; messAssert(app_cb != NULL && look != NULL && version > 1 && seqKey) ; /* THIS ROUTINE IS IN THE MIDDLE OF BEING REWRITTEN, WE WANT TO GET TO THE POINT WHERE * ANYTHING THAT DOESN'T HAVE A METHOD AND APPROPRIATE DUMPING TAGS IN THE METHOD * SIMPLY DOESN'T GET DUMPED......WOULD SIMPLIFY MUCH OF THE CODE BELOW..... */ /* I am unsure how to fill this in at the moment....probably it is meant to be * anything that is in the smap hierachy, if so this would require some extra * work to ascertain such as a bIndex search of the object for the smap tags.... */ is_structural = FALSE ; attributes = g_string_sized_new(GFF_ATTR_INITLEN) ; /* Allocate reusable/appendable string for attributes.*/ for (i = 0 ; i < arrayMax(look->segs) ; ++i) { FeatureInfo *feat = NULL ; seg = arrp(look->segs, i, SEG) ; /* Exclude anything outside the range we are interested in. */ if (seg->x1+offset > feature_end || seg->x2+offset < feature_start) continue ; /* Exclude segs that are created to support FMap like stuff, but mean nothing in the * context of a GFF style dump. */ if (seg->type == MASTER || seg->type == VISIBLE || seg->type == CDS || seg->type == CDS_UP || seg->type == HOMOL_GAP || seg->type == HOMOL_GAP_UP || seg->type == DNA_SEQ || seg->type == PEP_SEQ || seg->type == ORF || seg->type == TRANS_SEQ || seg->type == TRANS_SEQ_UP) continue ; /* Normalise the type and set the coords of the feature, note slightly tricky * code here, we only need to change things for features that are strand * specific _and_ are on the reverse strand. _All_ other features are * straight forward. */ if (seg->type & 0x01 && (seg->type >= SEQUENCE && seg->type <= ALLELE_UP)) { type = seg->type - 1 ; x = seg->x2+1 ; y = seg->x1+1 ; } else { type = seg->type ; x = seg->x1+1 ; y = seg->x2+1 ; } /* BUT some segs have been flipped from one strand to the other * for display (usually for EST 3' reads), we flip them back to their original * orientation here. Otherwise people don't know where they are for the GFF. */ if (seg->flags & SEGFLAG_FLIPPED) { int tmp = x ; x = y ; y = tmp ; } if (x < y) { flipped = FALSE ; strand = '+' ; /* Note that this might be overridden. */ } else if (x > y) { int tmp = x ; x = y ; y = tmp ; flipped = TRUE ; strand = '-' ; } else /* x = y */ { strand = strandOfKey(look, seg->source, reversed); if (strand == '.') strand = strandOfKey(look, seg->parent, reversed); } x += offset ; y += offset ; /* Set some defaults. */ frame = '.' ; isScore = FALSE ; score = 0.0 ; featName = NULL ; methodName = NULL; sourceKey = KEY_UNDEFINED ; sourceName = ""; /* LS otherwise everything ends up being a SEQUENCE */ switch (type) { case EXON: case INTRON: case SEQUENCE: sinf = arrp(look->seqInfo, seg->data.i, SEQINFO) ; if (seg->key) { featName = className(seg->key); } else featName = (type == SEQUENCE) ? "sequence" : (type == EXON) ? "exon" : "intron" ; if (sinf->method) { sourceKey = sinf->method ; } if (type == SEQUENCE) { assInsert (key2sinf, assVoid(seg->key), sinf) ; if (sinf->flags & SEQ_SCORE) { score = sinf->score ; isScore = TRUE ; } } break ; case CODING: featName = "coding_exon" ; assFind (key2sinf, assVoid(seg->parent), &sinf) ; if (sinf->method) sourceKey = sinf->method ; switch (seg->data.i%3) { case 0: frame = '0' ; break ; case 1: frame = '2' ; break ; case 2: frame = '1' ; break ; } break ; case HOMOL: hinf = arrp(look->homolInfo, seg->data.i, HOMOLINFO) ; score = hinf->score ; sourceKey = hinf->method ; featName = "similarity" ; meth = methodCacheGet(look->mcache, hinf->method, "") ; if (!(meth->flags & METHOD_STRAND_SENSITIVE)) strand = '.' ; if (meth->flags & METHOD_SCORE) isScore = TRUE ; if (meth->flags & METHOD_FRAME_SENSITIVE) frame = '0' ; break ; case FEATURE_OBJ: /* Some things should always be true, e.g. there should always be a method. */ feat = arrp(look->feature_info, seg->data.i, FeatureInfo) ; featName = "misc_feature" ; /* default to rubbish. */ sourceKey = feat->method ; #ifdef ED_G_NEVER_INCLUDE_THIS_CODE /* IS THIS NEEDED......YES, WHEN WE COMBINE IT WITH MORE GENERAL "FEATURE" stuff. */ if (feat->flags & SEQ_SCORE) { score = sinf->score ; isScore = TRUE ; } #endif /* ED_G_NEVER_INCLUDE_THIS_CODE */ break ; case FEATURE: case ATG: case SPLICE5: case SPLICE3: sourceKey = seg->key ; /* Set to method by convert routine. */ if (type == SPLICE3) featName = "splice3" ; else if (type == SPLICE5) featName = "splice5" ; else if (type == ATG) featName = "atg" ; else featName = "misc_feature" ; meth = methodCacheGet(look->mcache, seg->key, "") ; if (meth->flags & METHOD_SCORE) { score = seg->data.f ; isScore = TRUE ; } if (!(meth->flags & METHOD_STRAND_SENSITIVE)) strand = '.' ; if (meth->flags & METHOD_FRAME_SENSITIVE) frame = '0' ; break ; case ASSEMBLY_TAG: tagText = strnew(seg->data.s, handle) ; sourceName = "assembly_tag" ; featName = tagText ; { char *cp ; for (cp = tagText ; *cp && *cp != ':' && *cp != ' ' && *cp != '\t' && *cp != '\n' ; ++cp) ; if (*cp) { *cp++ = 0 ; while (*cp == ' ' || *cp == '\t' || *cp == '\n') ++cp ; } tagText = cp ; } break ; case ALLELE: if (seg->data.s) { if (*seg->data.s == '-') featName = "deletion" ; else { char *cp = seg->data.s ; featName = "variation" ; while (*cp) { if (!strchr ("AGCTagct", *cp++)) featName = "insertion" ; } } } break ; case EMBL_FEATURE: featName = name(seg->key) ; break ; case CLONE_END: featName = name(seg->data.k) ; strand = '.' ; break ; default: /* stuff like oligos come out here. */ ; } /* Finalise source and feature fields. */ if (sourceKey) { char *tmp_featname = NULL ; if (assFind (key2source, assVoid(sourceKey), &sourceName)) assFind (key2feat, assVoid(sourceKey), &tmp_featname) ; else { OBJ obj = bsCreate (sourceKey) ; sourceName = name(sourceKey) ; methodName = name(sourceKey) ; if (obj) { bsGetData (obj, str2tag("GFF_source"), _Text, &sourceName) ; if (bsGetData (obj, str2tag("GFF_feature"), _Text, &tmp_featname)) { tmp_featname = strnew(tmp_featname, handle) ; assInsert(key2feat, assVoid(sourceKey), tmp_featname) ; } } sourceName = strnew(sourceName, handle); methodName = strnew(methodName, handle); assInsert (key2source, assVoid(sourceKey), sourceName) ; if (obj) /* only after strnew(sourceName) !!! */ bsDestroy(obj); } /* Feature name is more tricky, in cases where a seg is really a child seg * as in exons/introns we want to keep the feature type we've already * set (e.g. "exon"), the GFF_feature string applies only to the parent seg. */ if (type != EXON && type != INTRON && type != CODING) { if (tmp_featname) featName = tmp_featname ; } } /* GFF requires the first 8 fields to be present, where there is no sensible value, * the field is set to "." */ if (!sourceName || !*sourceName) sourceName = GFF_UNKNOWN_FIELD ; if (!methodName || !*methodName) /* if still no methodName, do your best. */ { if (sourceKey) { methodName = name(sourceKey); } else methodName = GFF_UNKNOWN_FIELD ; } /* ROB, I would like to remove this in the end but I think we need to leave it * in place for now..... */ if (!featName) /* from method or from seg->type above */ featName = fMapSegTypeName[type] ; /* * Now throw stuff away - if any restrictions apply, then * if we're excluding things and we find this one in the set * then exclude it. * if we're including and this isn't what we want, exclude it. */ /* * if (sourceSet && featSet && * (dictMax (sourceSet) || dictMax (featSet)) && * !dictFind (sourceSet, sourceName, 0) && * !dictFind (featSet, featName, 0)) * continue ; */ if (featSet && dictMax(featSet)) { if (!include_feature && dictFind(featSet, featName, 0)) continue; if (include_feature && !dictFind(featSet, featName, 0)) continue; } if (sourceSet && dictMax(sourceSet)) { if (!include_source && dictFind(sourceSet, sourceName, 0)) continue; if (include_source && !dictFind(sourceSet, sourceName, 0)) continue; } if (methodSet && dictMax(methodSet)) { if (!include_method) { if (dictFind(methodSet, methodName, 0)) { continue; } else { /* well, maybe its parent object has this method */ OBJ obj = bsCreate(seg->parent); KEY method_key = str2tag("Method"); KEY parent_method; if (obj) { if (bsFindTag(obj, method_key) && bsGetKey(obj, method_key, &parent_method )) { if (dictFind(methodSet, name(parent_method), 0)) { bsDestroy(obj) ; continue; } } } } } else /* ie methods to be included */ { if (!dictFind(methodSet, methodName, 0)) { /* well, maybe its parent object has this method */ OBJ obj = bsCreate(seg->parent); KEY method_key = str2tag("Method"); KEY parent_method; if (obj) { if (bsFindTag(obj, method_key) && bsGetKey(obj, method_key, &parent_method )) { if (!dictFind(methodSet, name(parent_method), 0)) { bsDestroy(obj) ; continue; } } else /* nope, method not in this object or its parent */ { bsDestroy(obj); continue; } } else continue; } } } /* IF.....user just wants list of source/features, just accumulate stats, * don't do ANYTHING ELSE. */ if (list_type != GFF_LIST_NONE) { int k ; if (list_type == GFF_LIST_FEATURES) { dictAdd(listSet, messprintf("%s\t%s",sourceName,featName), &k) ; ++array(stats, k, int) ; } else /* list_type == GFF_LIST_KEYS */ { char *seg_name = NULL ; if ((seg_name = fmapSeg2SourceName(seg))) dictAdd(listSet, seg_name, &k) ; } continue ; } /* ELSE...we go on to write out the GFF line. */ /* LS/AcePerl: fixup reversed reference sequence */ if (reversed) { tmp = look->zoneMax + offset + 1 - y; y = look->zoneMax + offset + 1 - x; x = tmp; if ( strand == '+' ) strand = '-'; else if ( strand == '-' ) strand = '+'; } /* GFF attribute field (i.e. all the extras following the mandatory 7 fields.) */ switch (type) { case SEQUENCE: case EXON: case CODING: if (seg->parent) { g_string_sprintfa(attributes, "\t%s \"%s\"", className(seg->parent), name(seg->parent)) ; } break ; case INTRON: { BOOL isData = FALSE ; /* Introns derived from Source_exons always have a parent. Confirmed introns * only have a parent if they have been combined with existing Introns. */ if (seg->parent) { g_string_sprintfa(attributes, "\t%s \"%s\"", className(seg->parent), name(seg->parent)) ; isData = TRUE ; } sinf = arrp(look->seqInfo, seg->data.i, SEQINFO) ; if (sinf->flags & SEQ_CONFIRMED) { int i ; for (i = 0 ; i < arrayMax(sinf->confirmation) ; i++) { ConfirmedIntronInfo confirm = arrayp(sinf->confirmation, i, ConfirmedIntronInfoStruct) ; g_string_sprintfa(attributes, "%s", isData ? " ; " : "\t") ; isData = TRUE ; g_string_sprintfa(attributes, "Confirmed_%s", confirm->confirm_str) ; if (confirm->confirm_sequence) g_string_sprintfa(attributes, " %s", name(confirm->confirm_sequence)) ; } } break ; } case HOMOL: { g_string_sprintfa(attributes, "\tTarget \"%s:%s\"", className(seg->key), name(seg->key)) ; hinf = arrp(look->homolInfo, seg->data.i, HOMOLINFO) ; if (flipped) g_string_sprintfa(attributes, " %d %d", hinf->x2, hinf->x1) ; else g_string_sprintfa(attributes, " %d %d", hinf->x1, hinf->x2) ; break ; } case FEATURE_OBJ: if (seg->parent) { g_string_sprintfa(attributes, "\t%s \"%s\"", className(seg->parent), name(seg->parent)) ; } break ; case FEATURE: case SPLICE5: case SPLICE3: case ATG: { BOOL isTab = FALSE ; if (seg->parent) { g_string_sprintfa(attributes, "\tNote \"%s\"", cleanNewlines(dictName (look->featDict, -seg->parent), handle)) ; isTab = TRUE ; } if (assFind (look->chosen, SEG_HASH(seg), 0)) g_string_sprintfa(attributes, "%sSelected", isTab ? " ; " : "\t") ; else if (assFind (look->antiChosen, SEG_HASH(seg), 0)) g_string_sprintfa(attributes, "%sAntiselected", isTab ? " ; " : "\t") ; } break ; case ASSEMBLY_TAG: if (tagText && *tagText) { g_string_sprintfa(attributes, "\tNote \"%s\"", cleanNewlines(tagText, handle)) ; } break ; case ALLELE: g_string_sprintfa(attributes, "\tAllele \"%s\"", name(seg->key)) ; if (seg->data.s && *seg->data.s != '-') { if (strcmp (featName, "variation") == 0) g_string_sprintfa(attributes, " ; Variant \"%s\"", cleanNewlines(seg->data.s, handle)) ; else g_string_sprintfa(attributes, " ; Insert \"%s\"", cleanNewlines(seg->data.s, handle)) ; } break ; case EMBL_FEATURE: if (seg->data.s) { g_string_sprintfa(attributes, "\tNote \"%s\"", cleanNewlines(seg->data.s, handle)) ; } break ; case CLONE_END: g_string_sprintfa(attributes, "\tClone \"%s\"", name(seg->key)) ; break ; default: ; } /* Call the applications function to output the GFF record. */ { float *tmp_score = isScore ? &score : NULL ; /* float == NULL => score not relevant. */ (*app_cb)(app_data, name(seqKey), className(seqKey), methodName, is_structural, sourceName, featName, x, y, tmp_score, strand, frame, attributes->str, comment, type) ; /* passing seg type just for debugging */ } attributes = g_string_truncate(attributes, 0) ; /* Reset attribute string to empty. */ } /* Clean up. */ handleDestroy(handle) ; g_string_free(attributes, TRUE) ; return result ; } /* * -------------------- INTERNAL FUNCTIONS -------------------- */ static char *cleanNewlines (char *s, STORE_HANDLE handle) { int n = 0 ; char *cp, *cq, *copy ; for (cp = s ; *cp ; ++cp) if (*cp == '\n') ++n ; if (!n) return s ; copy = halloc (cp-s+n+1, handle) ; for (cp = s, cq = copy ; *cp ; ++cp) if (*cp == '\n') { *cq++ = '\\' ; *cq++ = 'n' ; } else *cq++ = *cp ; *cq = 0 ; return copy ; } /***********************************/ /* Determine direction of key in Dump. If the dump is of a sequence * which is reversed wrt to the root of the fMap, rootReversed == 1) */ static char strandOfKey(FeatureMap look, KEY key, int rootReversed) { SMapKeyInfo *info = sMapKeyInfo(look->smap, key); BOOL isReverse; if (!info) return '.'; isReverse = sMapIsReverse(info, 1, 0); if (isReverse) return rootReversed == 0 ? '-' : '+'; return rootReversed == 1 ? '-' : '+'; }