/* File: asn.c * Author: LaDeana Hillier, WashU, and help from Jim Ostell, NCBI. * Copyright (c) J Thierry-Mieg and R Durbin, 1999 * ------------------------------------------------------------------- * Acedb is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 * of the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * or see the on-line version at http://www.gnu.org/copyleft/gpl.txt * ------------------------------------------------------------------- * This file is part of the ACEDB genome database package, written by * Richard Durbin (Sanger Centre, UK) rd@sanger.ac.uk, and * Jean Thierry-Mieg (CRBM du CNRS, France) mieg@kaa.crbm.cnrs-mop.fr * * Description: * This is a modification of the embl.c file (comment below) to produce * an ASN.1 format direct submission for NCBI. It assumes the NCBI * toolkit is installed and that either the environment variable "NCBI" is * set to a directory in which the file ".ncbirc" exists or that ".ncbirc" * is in the current directory. * * ".ncbirc" contains a path to /ncbi/data which contains the data files * seqcode.val and gc.val, sequence alphabets and genetic codes, respectively * used by the toolkit. If you have Entrez installed, then this is already * properly configured. * * These instructions apply only to UNIX machines. * HISTORY: * original version: 11/15/93 * modified version: 9/95 (update for ACEDB 4) Dave Ficenec, WashU. 
* modified version: 3/96 (update for human) Dave Ficenec, WashU. * modified version: 10/96 (update for human) Suemee Shin, WashU. * Last edited: Oct 28 12:53 1999 (fw) * * Aug 26 16:52 1999 (fw): added this header * Created: Thu Aug 26 16:51:07 1999 (fw) * CVS info: $Id: asn.c,v 1.5 1999/10/28 11:54:28 fw Exp $ *------------------------------------------------------------------- */ /**************************************************************************** * * Original File: embl.c * Author: Richard Durbin (rd@mrc-lmb.cam.ac.uk) * Copyright (C) J Thierry-Mieg and R Durbin, 1992 * * This file is part of the ACEDB genome database package, written by * Richard Durbin (MRC LMB, UK) rd@mrc-lmb.cam.ac.uk, and * Jean Thierry-Mieg (CRBM du CNRS, France) mieg@kaa.cnrs-mop.fr * ***************************************************************************/ /* WRG 961004 Fixed partial interval problems WRG 960929 Sequence similarity comments will appear at beginning of /note= WRG 960929 Added calls to new NCBI subutil functions for WASHU and htgs_3 tags WRG 960917 Added conditional code related to HUMAN. */ #include "acedb.h" #include "display.h" #include "bs.h" #include "classes.h" #include "tags.h" #include "systags.h" #include "dna.h" #include "lex.h" #include "gmap.h" #include "subutil.h" /* this file includes NCBI stuff */ enum {NOPROJECT, ELEGANS, YEAST, HUMAN, BRIGGSAE, FUGU}; static char Institution[] = "WUGSC"; /* submittor tag */ static int gProject; /* Project id */ static int gAsnDebug; /* Debgging output off/on. */ static int gDay,gMon,gYear; /* Submit date. 
*/ static char gChromosome[10]; /* Chromosome name */ static char gSequence[128]; /* Current sequence name */ static char gRawClone[128]; /* Unadulterated clone name */ static char gClone[128]; /* Current clone name */ static char gClonePat[128]; /* Current clone+gene name pattern */ static char gRawClonePat[128]; /* Current clone+gene name pattern */ static char gCloneType[128]; /* Current clone type */ static char gLibrary[128]; /* Library */ static char keyword[10][256]; /* keywords */ static char gMapPosition[128]; /* Map position, xq28 */ static char gLocus[256]; /* Current locus name */ static int gLength; /* Length of current DNA sequence */ static char gBuf[32000]; /* All-purpose char buffer */ static int gOffset; /* Subseq in superseq coords */ static BOOL gOffForward; /* Subseq codirect. with superseq */ static int gMiss1,gMiss2; /* Amount clipped from subseq */ static NCBISubPtr nsp; /* NCBI submission pointer */ static SeqEntryPtr nucprot_entry;/* NCBI nucleotide & protein entry */ static SeqEntryPtr nuc_entry; /* NCBI nucleotide entry */ static SeqEntryPtr prot_entry; /* NCBI protein entry */ static SeqEntryPtr the_entry; /* NCBI entry (one of above) */ static KEY _Database; static KEY _scRNA; static KEY _misc_RNA; static KEY _pseudogene; static KEY _Feature; static KEY _Clone_Library; static KEY _EMBL_feature; static KEY _EMBL_threshold; static KEY _EMBL_qualifier; static KEY _EMBL_dump_YES; static KEY _EMBL_dump_NO; char *catnames(char *, char *); unsigned long get_cDNA_NID(char *); /********************************************************************* * checkFileExists: * * Check if file in current directory; prompt user to continue * if it does; exit at user request. ********************************************************************/ static int checkFileExists(char *filename) { struct stat sbuf; char a[256]; strcpy(a, ""); if (stat(filename, &sbuf) == 0) { printf(" ASN file %s already exists. Abort this ASN dump? 
[n] ", filename); fgets(a, sizeof(a), stdin); } return (a[0] == 'Y' || a[0] == 'y'); } /********************************************************************* * sortExons: * * Swap exon limits if start > end; then sort exons by start. ********************************************************************/ static void sortExons(Array a) { int i,j,lo,hi; for (i=0; i arr(a,i+1,BSunit).i ) { j = arr(a,i+1,BSunit).i; arr(a,i+1,BSunit).i = arr(a,i,BSunit).i; arr(a,i,BSunit).i = j; } } for (j=2; j0 && arr(a,i,BSunit).i > lo) { arr(a,i+2,BSunit).i = arr(a,i+0,BSunit).i; arr(a,i+3,BSunit).i = arr(a,i+1,BSunit).i; i-=2; } arr(a,i+2,BSunit).i = lo; arr(a,i+3,BSunit).i = hi; } } /********************************************************************* * offAdjust: * * Converts limits to dump sequence coordinates. ********************************************************************/ static void offAdjust (int *x1, int *x2) { if (gOffForward) { *x1 -= gOffset - 1 ; *x2 -= gOffset - 1 ; } else { *x1 = gOffset + 1 - *x1 ; *x2 = gOffset + 1 - *x2 ; } } /********************************************************************* * clipAdd: * * Accumulate counts of nucleotides outside superseq. * Modified from EMBL version: does not build exon array. ********************************************************************/ static void clipAdd (int x1, int x2) { if (x2 < 1) { gMiss1 += x2 - x1 + 1 ; return ; } if (x1 > gLength) { gMiss2 += x2 - x1 + 1 ; return ; } if (x1 < 1) { gMiss1 += 1 - x1 ; x1 = 1 ; } if (x2 > gLength) { gMiss2 += x2 - gLength ; x2 = gLength ; } } /********************************************************************* * FixCloneName: * * Makes character string uppercase and returns a pointer past "H_". 
********************************************************************/ static char * FixCloneName(char *name) { int i; int l = strlen(name); char *newname; for (i=0; i strlen(lastname) ) { return 1; } else { return 0; } } /********************************************************************* * AddFinishers: * * Adds finisher reference from key to NCBI submission. ********************************************************************/ static void AddFinishers(OBJ this, NCBISubPtr sp, SeqEntryPtr ep, char *title) { OBJ obj; PubPtr pub; static Array a; int i; char *finisher; char lastname[80]; char firstini[80]; a = arrayReCreate(a, 32, BSunit); if (bsFindTag(this, _From_Author) && bsFlatten(this,1,a)) { if (gAsnDebug) printf(" Adding finisher reference...\n"); pub = CitArtBuild(sp, title, NULL, NULL, NULL, NULL, gMon, gDay, gYear, PUB_STATUS_UNPUBLISHED); for (i=0; i gLength) || !isPlus && (subBeg < 1 || subEnd > gLength)) { return(1); } if (!(obj = bsCreate(key))) { printf(" Cannot create obj from key %s\n",name(key)); return(1); } if (!bsFindTag(obj, _CDS) && !bsFindTag(obj, _Transposon) && !bsFindTag(obj, _Transcript)) { bsDestroy(obj); return(0); } fflush(stdout); fflush(stderr); if (gAsnDebug) printf(" Entering subsequence %s from %d to %d.\n",name(key),subBeg,subEnd); /* Add transposon feature: assumes text field is transposon type. 
*/ if (bsGetData(obj, _Transposon, _Text, &text)) { if (gAsnDebug) printf(" Is a transposon.\n"); sfp = FeatureBuild(nsp, nuc_entry, FALSE, FALSE, FALSE, NULL); AddIntervalToFeature(nsp,sfp,nuc_entry,NULL,subBeg,subEnd,isPlus,FALSE,FALSE); MakeImpFeature (nsp,sfp,"source"); switch (gProject) { case YEAST: AddQualToImpFeature(nsp,sfp,"organism","Saccharomyes cerevisiae"); break; case ELEGANS: AddQualToImpFeature(nsp,sfp,"organism","Caenorhabditis elegans"); break; case BRIGGSAE: AddQualToImpFeature(nsp,sfp,"organism","Caenorhabditis briggsae"); break; case HUMAN: AddQualToImpFeature(nsp,sfp,"organism","Homo sapiens"); break; default: break; } AddQualToImpFeature(nsp,sfp,"transposon",text); } /* Add RNA feature: assume tRNA text is "codon name letter" */ if (bsFindTag(obj, _Transcript) && bsFlatten(obj, 2, a)) { if (gAsnDebug) printf(" Is a transcript.\n"); gpkey = arr(a,0,BSunit).k; if (gpkey == _mRNA) { if (gAsnDebug) printf(" Is a mRNA.\n"); strcpy(rnaname,name(arr(a,1,BSunit).k)); } else if (gpkey == _tRNA) { if (gAsnDebug) printf(" Is a tRNA.\n"); fflush(stdout); if (arr(a,1,BSunit).s == NULL) { printf(" FATAL: ===================================\n"); printf(" Could not find the text field for %s.\n",name(key)); printf(" Please enter codon, rna name, and rna aa \n"); printf(" in tRNA text field and try again.\n"); printf(" Example text: \"TAC Tyr Y\"\n"); printf(" FATAL: ===================================\n"); return(1); } else { strcpy(gBuf,arr(a,1,BSunit).s); } if (gAsnDebug) printf(" tRNA info: %s\n",gBuf); fflush(stdout); if (sscanf(gBuf,"%s %s %s",trna_codon,rnaname,trna_AA) != 3) { printf(" FATAL: Error in tRNA text \"%s\".\n",gBuf); return(1); } if ( strcmp(rnaname,"Sup") == 0 ) { printf(" Comment: adding tRNA-Sup (putative) entry.\n"); strcpy(rnaname,"tRNA-Sup (putative)"); } } else { if (gAsnDebug) printf(" Is other RNA.\n"); if (arr(a,1,BSunit).s != NULL) { strcpy(rnaname,arr(a,1,BSunit).s); } else if (bsGetKey(obj, _Locus, &gpkey)) { if (gAsnDebug) 
printf(" Locus found; using %s as name of rna.\n",name(gpkey)); strcpy(rnaname, name(gpkey)); } else { printf(" Error in RNA: no text or locus.\n"); } } sfp = FeatureBuild(nsp, nuc_entry, FALSE, FALSE, FALSE, rnaname); AddIntervalToFeature(nsp, sfp, nuc_entry, NULL, subBeg, subEnd, isPlus, FALSE, FALSE); if (gpkey == _mRNA) { MakeRNAFeature(nsp,sfp,RNA_TYPE_mRNA,FALSE,NULL,NULL,NULL); } else if (gpkey == _tRNA) { MakeRNAFeature(nsp,sfp,RNA_TYPE_tRNA,FALSE,NULL,trna_AA,trna_codon); } else if (gpkey == _snRNA) { MakeRNAFeature(nsp,sfp,RNA_TYPE_snRNA,FALSE,NULL,NULL,NULL); } else if (gpkey == _scRNA) { MakeRNAFeature(nsp,sfp,RNA_TYPE_scRNA,FALSE,NULL,NULL,NULL); } else { MakeRNAFeature(nsp,sfp,RNA_TYPE_other,FALSE,NULL,NULL,NULL); } } /* Return if subsequence is not CDS. */ if (!bsFindTag(obj, _CDS) ) { bsDestroy(obj); if (gAsnDebug) printf(" Exiting subsequence: no CDS found.\n"); fflush(stdout); fflush(stderr); return 0; } /*Add CDS; check for matching cDNA first and add to feature comment (start buf) */ strcpy(gbp = gBuf, ""); has_cDNA = (bsFindTag(obj, _Matching_cDNA) && bsFlatten(obj,1,a)); if (has_cDNA) { char *cp; unsigned long nid; if (gAsnDebug) printf(" Found matching cDNA.\n"); switch (gProject) { case ELEGANS: for (i = 0; i < arrayMax(a); ++i) { if (i > 0) gbp = Nlm_StrMove(gbp, "; "); gbp += sprintf(gbp, "coded for by C. elegans cDNA %s", name(arr(a,i,BSunit).k)); } break; case HUMAN: gbp += sprintf(gbp, "coded for by human cDNA%s %s", (arrayMax(a) > 1 ? 
"s" : ""), cp = name(arr(a,0,BSunit).k)); if ((nid = get_cDNA_NID(cp)) != 0) { gbp += sprintf(gbp, " (NID:g%lu)", nid); } if (arrayMax(a) > 1) { for (i = 1; i < arrayMax(a) - 1; ++i) { gbp += sprintf(gbp, ", %s", cp = name(arr(a,i,BSunit).k)); if ((nid = get_cDNA_NID(cp)) != 0) { gbp += sprintf(gbp, " (NID:g%lu)", nid); } } gbp += sprintf(gbp, " and %s", cp = name(arr(a,i,BSunit).k)); if ((nid = get_cDNA_NID(cp)) != 0) { gbp += sprintf(gbp, " (NID:g%lu)", nid); } } cDNA_description = StrSave(gBuf); strcpy(gbp = gBuf, ""); break; default: break; } } /* Add TSL_site to feature comment. */ if (bsGetData(obj, _TSL_site, _Int, &i)) { if (gAsnDebug) printf(" Adding TSL site in comment field.\n"); if (gBuf[0] != NULLB) gbp = StrMove(gbp, "; possible"); else gbp = StrMove(gbp, "Possible"); gbp += sprintf(gbp, " TSL site at %d",isPlus?i+subBeg-1:subBeg+1-i); } /* Check for locus and set gene name (use subsequence if no locus). */ has_Locus = bsGetKey(obj, _Locus, &gpkey); lcp = MemNew(strlen(name(gpkey)) + 2); sprintf(lcp, "%s", name(gpkey)); if (has_Locus && (*lcp != '*')) { if (gAsnDebug) printf(" Locus found; using \"%s\" as name of gene.\n", name(gpkey)); strcpy(gene, name(gpkey)); /*ss --where locus is set to gene*/ } else { if (gAsnDebug) printf(" No locus found; using \"%s\" as name of gene.\n", name(key)); strcpy(gene, ""); switch (gProject) { case HUMAN: if (strncasecmp(name(key), gClonePat, strlen(gClonePat)) == 0 || strncasecmp(name(key), gRawClonePat, strlen(gRawClonePat)) == 0) { strcpy(gene, Institution); strcat(gene, ":"); } break; default: break; } strcat(gene, name(key)); } lcp = MemFree(lcp); /* Build CDS feature. 
*/ /* Get source exons and gene limits (cdsBeg,cdsEnd) */ if (gAsnDebug) printf(" Start analysis of source exons...\n"); cdsBeg = subEnd; cdsEnd = subBeg; gMiss1 = gMiss2 = 0; if (bsFindTag(obj, _Source_Exons) && bsFlatten(obj, 2, a)) { sortExons(a); for (i = 0; i < arrayMax(a) ; i += 2) { if (gAsnDebug) printf(" >>> %5d %5d\n", arr(a,i,BSunit).i,arr(a,i+1,BSunit).i); if (isPlus) { arr(a,i ,BSunit).i += subBeg - 1; arr(a,i+1,BSunit).i += subBeg - 1; clipAdd(arr(a,i,BSunit).i, arr(a,i+1,BSunit).i); } else { arr(a,i ,BSunit).i = subBeg + 1 - arr(a,i ,BSunit).i; arr(a,i+1,BSunit).i = subBeg + 1 - arr(a,i+1,BSunit).i; clipAdd(arr(a,i+1,BSunit).i, arr(a,i,BSunit).i); } } cdsBeg = arr(a,0 ,BSunit).i; cdsEnd = arr(a,arrayMax(a)-1,BSunit).i; } else { if (isPlus) clipAdd(subBeg, subEnd); else clipAdd(subEnd, subBeg); } if (isPlus) { if (!gMiss1 && bsFindTag (obj, _Start_not_found)) if (bsGetData (obj, _bsRight, _Int, &i)) gMiss1 = 2 + i; else gMiss1 = 3; /* no frame change */ if (!gMiss2 && bsFindTag (obj, _End_not_found)) gMiss2 = 3; } else { if (!gMiss2 && bsFindTag (obj, _Start_not_found)) if (bsGetData (obj, _bsRight, _Int, &i)) gMiss2 = 2 + i; else gMiss2 = 3; /* no frame change */ if (!gMiss1 && bsFindTag (obj, _End_not_found)) gMiss1 = 3; } start_not_found = FALSE; if (bsFindTag(obj, _Start_not_found)) { start_not_found = TRUE; if (gAsnDebug) printf(" WARNING: Start was NOT found\n"); } end_not_found = FALSE; if (bsFindTag(obj, _End_not_found)) { end_not_found = TRUE; if (gAsnDebug) printf(" WARNING: End was NOT found\n"); } if (has_Locus || has_cDNA) sfp = FeatureBuild(nsp, the_entry, (start_not_found || end_not_found), EVIDENCE_NOT_SET, FALSE, gBuf); else sfp = FeatureBuild(nsp, the_entry, (start_not_found || end_not_found), EVIDENCE_NOT_EXPERIMENTAL, FALSE, gBuf); for (i = 0; i < arrayMax(a) - 1; i += 2) { snf = enf = FALSE; if (i == 0) snf = start_not_found; if (i == arrayMax(a) - 2) enf = end_not_found; if (gAsnDebug && snf) printf(" AddIntervalToFeature, 
start_not_found.\n"); if (gAsnDebug && enf) printf(" AddIntervalToFeature, end_not_found.\n"); AddIntervalToFeature(nsp, sfp, nuc_entry, NULL, arr(a, i, BSunit).i, arr(a, i+1, BSunit).i, isPlus, snf, enf); } if (gAsnDebug) printf(" Source exons added.\n"); MakeCdRegionFeature(nsp, sfp, 1, 0, NULL, NULL); prot_entry = TranslateCdRegion(nsp, sfp, nucprot_entry, name(key), NULL, NULL, 0); AddCompleteness(nsp, prot_entry, sfp); if (gAsnDebug) printf(" Coding translation successful.\n"); if (prot_entry != NULL) { if (gAsnDebug) printf(" Adding protein entry.\n"); /* Only give protein name1 if we have synonyms for the protein and */ /* and we actually know the product: only know what the protein */ /* encodes if know the locus (at least in elegans). */ /* For HUMAN, if there is no Locus or it starts with a '*', then it is assumed the protein has been positively identified, but we just don't have a gene name. In this case, the '*' is skipped over and not dumped, and what will be seen in the GenBank flat file is a /product="{briefID}" and /gene="{pseudoname}". 
*/ has_DBRemark = bsGetKey(obj, _DB_remark, &gpkey); if (has_DBRemark) { switch (gProject) { case HUMAN: if (bsGetKey(obj, _Brief_identification, &gpkey)) { while (bsGetKey(obj, _bsDown, &gpkey)) /* goto last Brief_identification */ ; name1 = StringSaveNoNull(name(gpkey)); /*ss --such as /product="semaphorin IV" from BI*/ } default: break; } bsGetKey(obj, _DB_remark, &gpkey); while (bsGetKey(obj, _bsDown, &gpkey)) /* goto last db remark */ ; if (has_Locus) { if (gAsnDebug) printf(" Locus taken from DB remark: %s.\n", name(gpkey)); if (name1 == NULL) name1 = StringSaveNoNull(name(gpkey)); switch (gProject) { /*char *cp;*/ case HUMAN: if (strncasecmp(name(key), gRawClonePat, strlen(gRawClonePat)) == 0) { /*cp = MemNew(strlen(name(key)) + strlen(Institution) + 2); sprintf(cp, "%s:%s", Institution, name(key)); descname = catnames(cp, name(gpkey)); cp = MemFree(cp);*/ descname = StringSaveNoNull(name(gpkey)); /*ss --such as /note="" from DB remark*/ } else descname = catnames(name(key), name(gpkey)); break; default: descname = StringSaveNoNull(name(key)); break; } } else { if (gAsnDebug) printf(" No locus found from DB remark.\n"); if (gProject != HUMAN) name1 = MemFree(name1); if (gAsnDebug) printf(" name1 = \"%s\"\n", (name1 != NULL ? 
name1 : "")); if (name1 == NULL || *name1 != '*') { descname = catnames(name1, name(gpkey)); name1 = MemFree(name1); } else { descname = StringSaveNoNull(name(gpkey)); } } } /* has_DBRemark */ else { if (gAsnDebug) printf(" No DBremark...checking for briefID\n"); if (bsGetKey(obj, _Brief_identification, &gpkey)) { while (bsGetKey(obj, _bsDown, &gpkey)) /* goto last Brief_identification */ ; if (gAsnDebug && name1 != NULL) printf(" Chucking name1=\"%s\"\n", name1); name1 = MemFree(name1); if (has_Locus) { if (gAsnDebug) printf(" Locus found from brief id: %s.\n",name(gpkey)); name1 = StringSaveNoNull(name(gpkey)); descname = StringSaveNoNull(name(key)); } else { char *cp; if (gAsnDebug) printf(" No locus found from brief id.\n"); cp = MemNew(strlen(name(gpkey)) + 100); switch (gProject) { case HUMAN: sprintf(cp, "%s", name(gpkey)); break; default: sprintf(cp, "Similar to %s", name(gpkey)); break; } /* Prepend cp to the growing gBuf */ if (gAsnDebug) printf(" concatenating \"%s\" & \"%s\"\n", cp, gBuf); /* if (*cp == '*') { name1 = cp; descname = gBuf; } else {*/ descname = catnames(cp, gBuf); cp = MemFree(cp); } } } descname = catnames(descname, cDNA_description); /*ss --where cDNA gets added*/ sfp = FeatureBuild(nsp, prot_entry, (start_not_found || end_not_found), FALSE, FALSE, NULL); AddIntervalToFeature(nsp, sfp, prot_entry, NULL, 1, -1, TRUE, start_not_found, end_not_found); /* Argument name1 is the thing that goes in for product and */ /* should only be used when we KNOW what the product is. */ if (gAsnDebug) { if (name1 != NULL) printf(" Protein name1 is %s\n", name1); else printf(" Protein name1 is NULL.\n"); if (descname != NULL) printf(" Protein descriptive name is %s\n", descname); else printf(" Protein descriptive name is NULL.\n"); } MakeProteinFeature(nsp, sfp, (name1 == NULL || *name1 != '*' ? 
name1 : name1+1), NULL, NULL, descname, NULL, NULL, NULL, NULL, NULL, NULL); name1 = MemFree(name1); descname = MemFree(descname); if (gAsnDebug) printf(" Adding gene %s from %d to %d.\n", gene, cdsBeg, cdsEnd); sfp = FeatureBuild(nsp, nuc_entry, (start_not_found || end_not_found), FALSE, FALSE, NULL); /* MakeGeneFeature args: nsp, sfp, gene_name, allele, */ /* descriptive name, map_location, is_pseudogene, */ /* genetic_database, gene_id, synonym1, syn2, syn3) */ MakeGeneFeature(nsp,sfp,gene,NULL,NULL,NULL,FALSE,NULL,NULL,NULL,NULL,NULL); AddIntervalToFeature(nsp,sfp,nuc_entry,NULL,cdsBeg,cdsEnd,isPlus,start_not_found,end_not_found); } if (gAsnDebug) printf(" All done with subsequence.\n"); bsDestroy(obj); arrayDestroy(a); return 0; } /********************************************************************* * emblFeature: * * Adds EMBL_feature to NCBI submission. ********************************************************************/ static int emblFeature(KEY tag, int beg, int end, BSunit u) { SeqFeatPtr sfp; BOOL isPlus = (beg <= end); if (u.s) strcpy(gBuf,u.s); else strcpy(gBuf," "); if (gAsnDebug) printf(" Adding feature %s from %d to %d.\n",name(tag),beg,end); sfp = FeatureBuild(nsp, nuc_entry, FALSE, FALSE, FALSE, gBuf); AddIntervalToFeature(nsp, sfp, nuc_entry, NULL, beg, end, isPlus, FALSE, FALSE); MakeImpFeature(nsp, sfp, name(tag)); return(0); } /********************************************************************* * parseQualifierString: * * Parses text string from ACeDB Feature type GB_*. Text string * contains multiple GenBank qualifers. See dumpGBFeatures for * format of the qualifier string. * * D. Ficenec WUGSC. 
* ********************************************************************/ static int parseQualString(char *qstr, char *qbuf, char *vbuf) { static char *qhere = NULL; static char *nextq = NULL; char *currq; int nscan; if (qstr != NULL) { if (strlen(qstr) == 0) { printf(" WARNING: Empty qualifer string.\n"); qhere = NULL; return 0; } if (strncmp(qstr,"/",1) != 0) { printf(" WARNING: Qualifer string must start with /\n"); qhere = NULL; return 0; } qhere = (char *) malloc(strlen(qstr)); strcpy(qhere,qstr); nextq = qhere + 1; } if (nextq == NULL) { qhere = NULL; return 0; } currq = nextq; if ((nextq = (char *) strstr(currq+2," /")) != NULL) { nextq[0] = '\0'; nextq += 2; } nscan = sscanf(currq,"%[^= ]%*[= ]%[^\f]",qbuf,vbuf); if (nscan == 1) { strcpy(vbuf,""); } return 1; } /********************************************************************* * dumpGBFeatures: * * Adds other features (attached to main sequence only) to NCBI * submission. Only features which start with GB_ and have the * EMBL_dump_YES flag set are dumped. The text contains the feature * note and/or qualifier(s); each one starting with a blank followed * by a / (/ in qualifier must be preceded by a non-space character). * If the qualifier has a value, then use "/qualifer=value" with * optional spaces between the qualifier and the the equals sign. * * D. Ficenec WUGSC. 
* ********************************************************************/ static int dumpGBFeatures(KEY tag) { OBJ mobj,sobj; KEY mkey; BOOL isPlus; BSMARK mmark = NULL; BSMARK smark = NULL; SeqFeatPtr sfp; int dumpYes; int hasEMBLfeature; int mdone,sdone; int mbeg,mend; float msig; /* char qbuf[128]; char vbuf[2048];*/ char ftype[128]; char ntxt[256]; char slashname[256]; char *mtxt, *cp; char *etype; sobj = bsCreate(tag); if (gAsnDebug) printf(" Adding GB_ features...\n"); if (bsFindTag(sobj, _Feature)) { bsGetKeyTags(sobj,_bsRight,&mkey); mdone = 0; while (!mdone) { mobj = bsCreate(mkey); hasEMBLfeature = bsGetData(mobj, _EMBL_feature, _Text, &etype); if (hasEMBLfeature) strcpy(ftype,etype); bsDestroy(mobj); if (strncmp(name(mkey),"GB_",3) == 0 && hasEMBLfeature) { mmark = bsMark(sobj,mmark); bsGetData(sobj,_bsRight,_Int,&mbeg); sdone = 0; while (!sdone) { smark = bsMark(sobj,smark); bsGetData(sobj,_bsRight,_Int,&mend); bsGetData(sobj,_bsRight,_Float,&msig); bsGetData(sobj,_bsRight,_Text,&mtxt); strcpy(ntxt, mtxt); slashname[0] = NULLB; if (mtxt[0] == '/' && isalpha(mtxt[1]) && (cp = strchr(mtxt,'=')) != NULL) { memcpy(slashname, mtxt+1, cp-mtxt-1); slashname[cp-mtxt-1] = NULLB; strcpy(ntxt, cp+1); } dumpYes = (bsPushObj(sobj) && bsFindTag(sobj,_EMBL_dump_YES)) ? 
1 : 0; if (dumpYes) { isPlus = (mbeg <= mend); if (strncasecmp(ntxt,"Grail",5) ==0) { sfp = FeatureBuild(nsp, nuc_entry, FALSE, EVIDENCE_NOT_EXPERIMENTAL, FALSE, NULL); } else { sfp = FeatureBuild(nsp, nuc_entry, FALSE, FALSE, FALSE, NULL); } MakeImpFeature(nsp, sfp, ftype); AddIntervalToFeature(nsp, sfp, nuc_entry, NULL, mbeg, mend, isPlus, FALSE, FALSE); /*if (parseQualString(mtxt,qbuf,vbuf)) { AddQualToImpFeature(nsp, sfp, qbuf, vbuf); while (parseQualString(NULL,qbuf,vbuf)) { AddQualToImpFeature(nsp, sfp, qbuf, vbuf); } }*/ if (strcmp(ftype, "exon") == 0) { if (slashname[0] == NULLB) strcpy(slashname, "note"); AddQualToImpFeature(nsp,sfp,slashname,ntxt); if (gAsnDebug) { printf(" >%s as %s [%6d,%6d] /%s=\"%s\"\n",name(mkey),ftype,mbeg,mend,slashname,ntxt); } } else { if (slashname[0] == NULLB) strcpy(slashname, "rpt_family"); AddQualToImpFeature(nsp,sfp,slashname,ntxt); if (gAsnDebug) { printf(" >%s as %s [%6d,%6d] /%s=\"%s\"\n",name(mkey),ftype,mbeg,mend,slashname,ntxt); } } } bsGoto(sobj,smark); sdone = !bsGetData(sobj,_bsDown,_Int,&mbeg); } bsGoto(sobj,mmark); } mdone = !bsGetKeyTags(sobj,_bsDown,&mkey); } } bsDestroy(sobj); bsMarkFree(mmark); bsMarkFree(smark); } /********************************************************************* * dumpMiscFeatures: * * Adds other features (attached to main sequence only) to NCBI * submission. * * S. Shin WUGSC. * ********************************************************************/ static int dumpMiscFeatures(KEY tag) { OBJ mobj,sobj; KEY mkey; BOOL isPlus; BSMARK mmark = NULL; BSMARK smark = NULL; SeqFeatPtr sfp; int dumpYes; int hasEMBLfeature; int mdone,sdone; int mbeg,mend; Boolean cpg_island; float msig; /* char qbuf[128]; char vbuf[2048];*/ char qtxt[2176]; char ftype[128]; char *mtxt; char *etype; sobj = bsCreate(tag); if (gAsnDebug) printf(" Adding Misc. 
features...\n"); if (bsFindTag(sobj, _Feature)) { bsGetKeyTags(sobj,_bsRight,&mkey); mdone = 0; while (!mdone) { mobj = bsCreate(mkey); hasEMBLfeature = bsGetData(mobj, _EMBL_feature, _Text, &etype); if (hasEMBLfeature) strcpy(ftype,etype); bsDestroy(mobj); if (strncasecmp(name(mkey),"Predicted_CpG_island",20) == 0 && hasEMBLfeature) { mmark = bsMark(sobj,mmark); bsGetData(sobj,_bsRight,_Int,&mbeg); sdone = 0; while (!sdone) { smark = bsMark(sobj,smark); bsGetData(sobj,_bsRight,_Int,&mend); bsGetData(sobj,_bsRight,_Float,&msig); bsGetData(sobj,_bsRight,_Text,&mtxt); strcpy(qtxt, "CpG_island "); /* enclose mtxt between matching parentheses */ if (mtxt[0] != '(') /* open parenthesis (if not already present) */ strcat(qtxt, "("); strcat(qtxt,mtxt); if (mtxt[strlen(mtxt)-1] != ')') /* close parenthesis (if not present) */ strcat(qtxt, ")"); /*dumpYes = (bsPushObj(sobj) && bsFindTag(sobj,_EMBL_dump_YES)) ? 1 : 0;*/ dumpYes = 1; if (dumpYes) { isPlus = (mbeg <= mend); sfp = FeatureBuild(nsp, nuc_entry, FALSE, EVIDENCE_NOT_SET, FALSE, NULL); MakeImpFeature(nsp, sfp, ftype); AddIntervalToFeature(nsp, sfp, nuc_entry, NULL, mbeg, mend, isPlus, FALSE, FALSE); /* if (parseQualString(mtxt,qbuf,vbuf)) { AddQualToImpFeature(nsp, sfp, qbuf, vbuf); while (parseQualString(NULL,qbuf,vbuf)) { AddQualToImpFeature(nsp, sfp, qbuf, vbuf); } }*/ AddQualToImpFeature(nsp,sfp,"note",qtxt); if (gAsnDebug) { printf(" >%s as %s [%6d,%6d] %s\n",name(mkey),ftype,mbeg,mend,mtxt); } } bsGoto(sobj,smark); sdone = !bsGetData(sobj,_bsDown,_Int,&mbeg); } bsGoto(sobj,mmark); } mdone = !bsGetKeyTags(sobj,_bsDown,&mkey); } } bsDestroy(sobj); bsMarkFree(mmark); bsMarkFree(smark); } /********************************************************************* * asnDoDump: * * Called from asnDumpKey. 
* * Process: * [1] Check sequence (DNA, Genomic_Canonical); * [2] Get sequence info (locus, finisher, etc); * [3] Add sequence info and publications; * [4] Add EMBL features; * [5] Add method features; * [6] Add subsequence features; ********************************************************************/ static int asnDoDump (KEY seq) { KEY key; /* General purpose key */ KEY clone; /* Key for clone */ OBJ Seq; /* Sequence object (key seq) */ OBJ obj; /* General purpose object */ SeqFeatPtr sfp; /* General purpose seq feature ptr */ PubPtr pub; /* General purpose publication ptr */ static Array a; /* Working array for bsFlatten */ Array dna; /* Holds nucleotide sequence */ int i,ii; /* General purpose int */ int isCDS=0; /* Flag for CDS in subsequences */ int status; /* Subroutine return status */ char *message = 0; /* Error message */ char *dnastr; /* DNA string. */ char namebuf1[64]; /* Last name or first initial */ char namebuf2[64]; /* Last name or first initial */ char *asnbr = NULL; /* Pointer to accession number */ char *cp; CharPtr gbp; /* Initialize globals and set working array. */ if (! _scRNA) { lexaddkey("scRNA", &_scRNA, 0); lexaddkey("misc_RNA", &_misc_RNA, 0); lexaddkey("pseudogene", &_pseudogene, 0); lexaddkey("Database", &_Database, 0); lexaddkey("Clone_Library", &_Clone_Library, 0); lexaddkey("Feature", &_Feature, 0); lexaddkey("EMBL_feature", &_EMBL_feature, 0); lexaddkey("EMBL_threshold", &_EMBL_threshold, 0); lexaddkey("EMBL_qualifier", &_EMBL_qualifier, 0); lexaddkey("EMBL_dump_YES", &_EMBL_dump_YES, 0); lexaddkey("EMBL_dump_NO", &_EMBL_dump_NO, 0); } nucprot_entry = NULL; nuc_entry = NULL; a = arrayReCreate(a, 32, BSunit); /* Check sequence. */ if (!(class(seq) == _VSequence) || !(Seq = bsCreate(seq))) { message = " Sequence object missing."; goto end_asnDoDump; } if (!bsFindTag(Seq, _Genomic_Canonical)) { gProject = NOPROJECT; message = " Sequence must be Genomic_Canonical to do ASN.1 dump."; goto end_asnDoDump; } /* Get DNA array. 
*/ if (!bsGetKey(Seq, _DNA, &key) || !(dna = dnaGet(key))) { message = " No DNA attached to Sequence object."; goto end_asnDoDump; } /* Get info: name, clone, length, accession number, locus, chromosome. */ strcpy(gSequence,FixCloneName(name(seq))); bsGetKey(Seq, _Clone, &clone); strcpy(gRawClone, name(clone)); strcpy(gClone, FixCloneName(gRawClone)); strcpy(gClonePat, gClone); strcat(gClonePat, "."); strcpy(gRawClonePat, gRawClone); strcat(gRawClonePat, "."); obj = bsCreate(clone); if (bsFindTag(obj, _Type) && bsFlatten(obj, 1, a)) { strcpy(gCloneType,name(arr(a,0,BSunit).k)); } else { printf(" WARNING: Could not find Clone name.\n"); strcpy(gCloneType,"clone"); } if (bsFindTag(obj, _Clone_Library) && bsFlatten(obj, 1, a)) { strcpy(gLibrary,name(arr(a,0,BSunit).k)); } else { strcpy(gLibrary,"library"); printf(" WARNING: Could not find Clone_Library name.\n"); } if (bsFindTag(obj, _Map) && bsFlatten(obj, 1, a)) { strcpy(gMapPosition,name(arr(a,0,BSunit).k)); } else { printf(" WARNING: Could not find library map position under clone.\n"); strcpy(gMapPosition,"map_position"); } if (gAsnDebug) { printf(" Sequence is %s\n",gSequence); printf(" Clone is %s of type %s\n",gClone,gCloneType); printf(" Library is %s\n",gLibrary); printf(" Map position is %s\n",gMapPosition); } gLength = arrayMax(dna); if (gAsnDebug) printf(" DNA length is %d.\n", gLength); if (bsFindTag(Seq, _Database) && bsFlatten(Seq,3,a) && getenv("ASNACCESSION") == NULL) { asnbr = arr(a,2,BSunit).s; } if (bsFindTag(Seq, _Keyword) && bsFlatten (Seq, 1, a)) { ii=0; for (i = 0; i < arrayMax(a) ; ++i) { sprintf(keyword[ii++],"%s",name(arr(a,i,BSunit).k)); } } if (gAsnDebug) printf("keyword[0] is %s\n",keyword[0]); switch (gProject) { case YEAST: sprintf(gLocus, "YSCL%s", gSequence); strcpy(gChromosome,"XII"); if (gAsnDebug) printf(" Yeast chromosome XII.\n"); break; case BRIGGSAE: sprintf(gLocus, "CBR%s", gSequence); if (bsGetKey(Seq, _Map, &key)) { sscanf(name(key),"Sequence-%s",gChromosome); if 
(gAsnDebug) printf(" Briggsae chromosome %s.\n",gChromosome); } else { printf(" WARNING: Could not find chromosome for C. briggsae.\n"); strcpy(gChromosome,"unknown"); } break; case ELEGANS: sprintf(gLocus, "CEL%s", gSequence); if (bsGetKey(Seq, _Map, &key)) { sscanf(name(key),"Sequence-%s",gChromosome); if (gAsnDebug) printf(" Elegans chromosome %s.\n",gChromosome); } else { message = " Could not find chromosome (no map)."; goto end_asnDoDump; } break; case HUMAN: if (asnbr != NULL && asnbr[0] != NULLB) sprintf(gLocus, "HS%s", asnbr); /*sprintf(gLocus, "HUM%s", gSequence);*/ if (bsGetKey(Seq, _Map, &key)) { sscanf(name(key),"Sequence-%s",gChromosome); if (gAsnDebug) printf(" Human chromosome %s.\n",gChromosome); } else { message = " Could not find chromosome (no map)."; goto end_asnDoDump; } break; default: message = " Project not implemented yet."; goto end_asnDoDump; } /* * Determine is any CDS subsequences. Create appropriate * sequence entry type. Add DNA. */ if (bsFindTag(Seq, _Subsequence) && bsFlatten(Seq, 3, a)) { for (i=0; i (size_t) 0) { AddCommentToEntry(nsp, nuc_entry, gBuf); } break; case BRIGGSAE: AddOrganismToEntry(nsp, the_entry, "Caenorhabditis briggsae", "C. 
briggsae hermaphrodite mixed whole animal DNA.", NULL, "GujArat G16", NULL, NULL, NULL ); sfp = FeatureBuild(nsp, nuc_entry, FALSE, FALSE, FALSE, NULL); AddIntervalToFeature(nsp,sfp,nuc_entry,NULL,1,gLength,TRUE,FALSE,FALSE); MakeImpFeature (nsp,sfp,"source"); AddQualToImpFeature(nsp,sfp,"organism","Caenorhabditis briggsae"); if (strcmp(gChromosome,"unknown") != 0) { AddQualToImpFeature(nsp,sfp,"chromosome",gChromosome); } AddQualToImpFeature(nsp,sfp,"clone",gSequence); AddQualToImpFeature(nsp,sfp,"strain","GujArat G16"); strcpy(gBuf,"Eukaryota; Animalia; Eumetazoa; Nematoda; Secernentea; "); strcat(gBuf,"Rhabditida; Rhabditina; Rhabditoidea; Rhabditidae."); AddGenBankBlockToEntry(nsp, the_entry, gBuf, "INV", NULL, NULL, NULL); sprintf(gBuf, "Caenorhabditis briggsae cosmid %s", gSequence) ; AddTitleToEntry(nsp, nuc_entry, gBuf); AddBriggsaeReference(nsp, the_entry); /* Remove when briggsae papers in ACeDB. */ AddReferences(Seq, nsp, the_entry); sprintf(gBuf, "The sequence of C. briggsae cosmid %s", gSequence) ; AddFinishers(Seq, nsp, the_entry, gBuf); strcpy(gBuf,"Submitted by: ~"); strcat(gBuf," Genome Sequencing Center ~"); strcat(gBuf," Department of Genetics, Washington University, ~"); strcat(gBuf," St. Louis, MO 63110, USA ~"); strcat(gBuf," e-mail: mmarra@watson.wustl.edu ~"); if (bsFindTag(Seq, _DB_remark) && bsFlatten (Seq, 1, a)) { strcat(gBuf, "~~ NEIGHBORING COSMID INFORMATION:~"); for (i = 0; i < arrayMax(a) ; ++i) { sprintf(gBuf+strlen(gBuf), "~~%s",name(arr(a,i,BSunit).k)); } } if (isCDS) { if (gAsnDebug) printf(" Adding comment about GeneFinder predictions.\n"); sprintf(gBuf+strlen(gBuf), " ~ ~NOTES:~~Coding sequences below are predicted from"); sprintf(gBuf+strlen(gBuf), " computer analysis, using the program Genefinder"); sprintf(gBuf+strlen(gBuf), "(P. Green and L. 
Hillier, ms in preparation)."); } if (bsFindTag (Seq, _TSL_site)) { sprintf(gBuf+strlen(gBuf), "~\"TSL\" = trans-spliced leader."); } if (strlen(gBuf) > (size_t) 0) AddCommentToEntry(nsp, nuc_entry, gBuf); break; case ELEGANS: AddOrganismToEntry(nsp, the_entry, "Caenorhabditis elegans", "C. elegans hermaphrodite mixed whole animal DNA.", NULL, "Bristol N2", NULL, NULL, NULL ); sfp = FeatureBuild(nsp, nuc_entry, FALSE, FALSE, FALSE, NULL); AddIntervalToFeature(nsp,sfp,nuc_entry,NULL,1,gLength,TRUE,FALSE,FALSE); MakeImpFeature (nsp,sfp,"source"); AddQualToImpFeature(nsp,sfp,"organism","Caenorhabditis elegans"); AddQualToImpFeature(nsp,sfp,"chromosome",gChromosome); AddQualToImpFeature(nsp,sfp,"clone",gSequence); AddQualToImpFeature(nsp,sfp,"strain","Bristol N2"); strcpy(gBuf,"Eukaryota; Animalia; Eumetazoa; Nematoda; Secernentea; "); strcat(gBuf,"Rhabditida; Rhabditina; Rhabditoidea; Rhabditidae."); AddGenBankBlockToEntry(nsp, the_entry, gBuf, "INV", NULL, NULL, NULL); sprintf(gBuf, "Caenorhabditis elegans cosmid %s", gSequence) ; AddTitleToEntry(nsp, nuc_entry, gBuf); AddElegansReference(nsp, the_entry); /* Remove when elegans papers in ACeDB. */ AddReferences(Seq, nsp, the_entry); sprintf(gBuf, "The sequence of C. elegans cosmid %s", gSequence) ; AddFinishers(Seq, nsp, the_entry, gBuf); strcpy(gBuf,"Submitted by: ~"); strcat(gBuf," Genome Sequencing Center ~"); strcat(gBuf," Department of Genetics, Washington University, ~"); strcat(gBuf," St. 
Louis, MO 63110, USA, and ~"); strcat(gBuf," Sanger Centre, Hinxton Hall~"); strcat(gBuf," Cambridge CB10 IRQ, England ~"); strcat(gBuf," e-mail: rw@nematode.wustl.edu and jes@sanger.ac.uk~"); strcat(gBuf,"~~NOTICE: This sequence may not be the entire insert of this clone.~"); strcat(gBuf,"It may be shorter because we only sequence overlapping sections~"); strcat(gBuf,"once, or longer because we provide a small overlap between ~"); strcat(gBuf,"neighboring submissions.~"); strcat(gBuf,"~This sequence was finished as follows unless otherwise noted:~"); strcat(gBuf,"all regions were double stranded or sequenced with an alternate ~"); strcat(gBuf,"chemistry; an attempt was made to resolve all sequencing problems,~"); strcat(gBuf,"such as compressions and repeats; all regions were covered by ~"); strcat(gBuf,"sequence from more than one subclone ~"); if (bsFindTag(Seq, _DB_remark) && bsFlatten (Seq, 1, a)) { strcat(gBuf, "~~ NEIGHBORING COSMID INFORMATION:~"); for (i = 0; i < arrayMax(a) ; ++i) { sprintf(gBuf+strlen(gBuf), "~~%s",name(arr(a,i,BSunit).k)); } } if (isCDS) { if (gAsnDebug) printf(" Adding comment about GeneFinder predictions.\n"); sprintf(gBuf+strlen(gBuf), " ~ ~NOTES:~~Coding sequences below are predicted from"); sprintf(gBuf+strlen(gBuf), " computer analysis, using the program Genefinder"); sprintf(gBuf+strlen(gBuf), "(P. Green and L. Hillier, ms in preparation)."); } if (bsFindTag (Seq, _TSL_site)) { sprintf(gBuf+strlen(gBuf), "~\"TSL\" = trans-spliced leader."); } if (strlen(gBuf) > (size_t) 0) AddCommentToEntry(nsp, nuc_entry, gBuf); break; case HUMAN: AddOrganismToEntry(nsp, the_entry, "Homo sapiens", "H. 
sapiens", NULL, NULL, NULL, NULL, NULL ); sfp = FeatureBuild(nsp, nuc_entry, FALSE, FALSE, FALSE, NULL); AddIntervalToFeature(nsp,sfp,nuc_entry,NULL,1,gLength,TRUE,FALSE,FALSE); MakeImpFeature (nsp,sfp,"source"); AddQualToImpFeature(nsp,sfp,"organism","Homo sapiens"); AddQualToImpFeature(nsp,sfp,"chromosome",gChromosome); AddQualToImpFeature(nsp,sfp,"clone",gRawClone); AddQualToImpFeature(nsp,sfp,"clone_lib",gLibrary); AddQualToImpFeature(nsp,sfp,"map",gMapPosition); gbp = StrMove(gBuf,"Eukaryota; Animalia; Chordata; Vertebrata; Mammalia; Theria; "); gbp = StrMove(gbp,"Eutheria; Primates; Haplorhini; Catarrhini; Hominidae."); /* keywords go in last three fields */ /* AddGenBankBlockToEntry(nsp, the_entry, gBuf, "PRI", NULL, NULL, NULL);*/ if (ii==0) AddGenBankBlockToEntry(nsp, the_entry, gBuf, "PLN",NULL,NULL,NULL); else if (ii==1) AddGenBankBlockToEntry(nsp, the_entry, gBuf, "PLN", keyword[0],NULL,NULL); else if (ii==2) AddGenBankBlockToEntry(nsp, the_entry, gBuf, "PLN", keyword[0], keyword[1], NULL); else if (ii==3) AddGenBankBlockToEntry(nsp, the_entry, gBuf, "PLN", keyword[0], keyword[1], keyword[2]); gbp = gBuf; gbp += sprintf(gBuf, "Human %s clone %s", gCloneType, gClone); if (gMapPosition[0] != '\0') { gbp = StrMove(gbp, " from "); gbp = StrMove(gbp, gMapPosition); } AddTitleToEntry(nsp, nuc_entry, gBuf); AddReferences(Seq, nsp, the_entry); sprintf(gBuf, "The sequence of H. sapiens %s clone %s", gCloneType, gClone); AddFinishers(Seq, nsp, the_entry, gBuf); strcpy(gbp = gBuf,"Submitted by:~"); gbp = StrMove(gbp," Genome Sequencing Center~"); gbp = StrMove(gbp," Department of Genetics, Washington University~"); gbp = StrMove(gbp," St. Louis, MO 63108, USA~"); gbp = StrMove(gbp," e-mail: sapiens@watson.wustl.edu~"); gbp = StrMove(gbp,"~NOTICE: This sequence may not represent the entire insert of this clone. 
"); gbp = StrMove(gbp,"It may be shorter because we only sequence overlapping sections "); gbp = StrMove(gbp,"once, or longer because we provide a small overlap between "); gbp = StrMove(gbp,"neighboring submissions."); gbp = StrMove(gbp,"~~This sequence was finished as follows unless otherwise noted:~"); gbp = StrMove(gbp,"all regions were double stranded or sequenced with an alternate "); gbp = StrMove(gbp,"chemistry; an attempt was made to resolve all sequencing problems, " ); gbp = StrMove(gbp,"such as compressions and repeats; all regions were covered by "); gbp = StrMove(gbp,"sequence from more than one subclone; and the assembly was "); gbp = StrMove(gbp,"confirmed by restriction digest."); if (strcmp(gLibrary,"LL0XNCC01-U")==0) { gbp = StrMove(gbp,"~~SOURCE INFORMATION: This clone is from a chromosome X specific cosmid library LL0XNCC01 \"U\". "); gbp = StrMove(gbp,"The source of the chromosomes was a human/hamster hybrid, GM07297-F, from "); gbp = StrMove(gbp,"Robert Nussbaum at University of Pennsylvania School of Medicine."); gbp = StrMove(gbp, "~VECTOR: Lawrist16"); gbp = StrMove(gbp,"~~Clone reference:~"); gbp = StrMove(gbp,"M. Grieff, R. Mazzarella, M.P. Whyte, R.V. Thakker, R. Wilson, S. Chissoe, and D. Schlessinger. "); gbp = StrMove(gbp,"\"X-linked hypophosphatemia candidate gene region: sequence data on 80 kb "); gbp = StrMove(gbp,"containing spermine synthase and the 5' region of PEX\", in preparation."); } if (strcmp(gLibrary,"LL22NC03")==0) { gbp = StrMove(gbp,"~~SOURCE INFORMATION:~"); gbp = StrMove(gbp, "This clone is from the human chromosome 22-specific cosmid library "); gbp = StrMove(gbp, "LL22NCO3, constructed at the Biomedical Sciences Division, Lawrence "); gbp = StrMove(gbp, "Livermore National Laboratory, Livermore, CA 94550 under the "); gbp = StrMove(gbp, "auspices of the National Laboratory Gene Library Project sponsored "); gbp = StrMove(gbp, "by the US Department of Energy. 
The source of the flow sorted "); gbp = StrMove(gbp, "chromosomes was a human/hamster hybrid containing chromosomes Y, 22 and 9. "); gbp = StrMove(gbp, "This clone is part of a cosmid contig isolated using YACs from the "); gbp = StrMove(gbp, "Sanger Centre chromosome 22 YAC contig described by "); gbp = StrMove(gbp, "Collins et al., Nature 377 Suppl., 367-379."); gbp = StrMove(gbp,"~VECTOR: Lawrist16"); } if (strcmp(gLibrary,"LLNL3")==0) { gbp = StrMove(gbp,"~~SOURCE INFORMATION:~"); gbp = StrMove(gbp,"This clone is from a chromosome 3 specific library described by "); gbp = StrMove(gbp,"Wei et al., Cancer Research 56:1487-1492 (1996)."); gbp = StrMove(gbp, "~VECTOR: pWE15"); } if (strcmp(gLibrary,"Stratagene_placenta_951202")==0) { gbp = StrMove(gbp,"~~SOURCE INFORMATION:~"); gbp = StrMove(gbp,"This human genomic clone is from a male, caucasian placental cosmid library purchased from Stratagene."); gbp = StrMove(gbp, "~VECTOR: pWE15"); } if (strcmp(gLibrary,"GSBAC1")==0) { gbp = StrMove(gbp,"~~SOURCE INFORMATION:~"); gbp = StrMove(gbp,"This clone is from Genome Systems first BAC library."); gbp = StrMove(gbp,"~Cell line: lymphoblastoid"); gbp = StrMove(gbp,"~Haplotypes: two"); gbp = StrMove(gbp,"~VECTOR: pBELO"); gbp = StrMove(gbp,"~Selection: chloramphenicol"); } if (strcmp(gLibrary,"RPCI-1")==0 || strcmp(gLibrary,"RPCI-3")==0 || strcmp(gLibrary,"RPCI-4")==0 || strcmp(gLibrary,"RPCI-5")==0 || strcmp(gLibrary,"RPCI-6")==0) { gbp = StrMove(gbp,"~~SOURCE INFORMATION:~"); gbp = StrMove(gbp,"The sequence for this clone was derived from a human PAC library "); gbp = StrMove(gbp,"prepared by Pieter de Jong and coworkers at Roswell Park "); gbp = StrMove(gbp,"Cancer Institute using the method described by Ioannou et al., "); gbp = StrMove(gbp,"Nature Genetics 6:84-89 (1994)."); if (strcmp(gLibrary,"RPCI-6")==0) { gbp = StrMove(gbp," The library is from one female donor."); gbp = StrMove(gbp, "~VECTOR: pPAC4"); } else { gbp = StrMove(gbp," The library is from one male 
donor."); gbp = StrMove(gbp, "~VECTOR: pCYPAC2"); } } if (strcmp(gLibrary,"CITB-978SK-B")==0) { gbp = StrMove(gbp,"~~SOURCE INFORMATION:~"); gbp = StrMove(gbp,"This clone is from the first release of the human BAC library. The library contains cloned DNA "); gbp = StrMove(gbp,"from a human male fibroblast cell line 978SK. For references see: "); gbp = StrMove(gbp,"Shizuya et al., Proc. Natl. Acad. Sci. 89:8794-8797 (1992); "); gbp = StrMove(gbp,"Kim et al., Genomics 34:213-218 (1996)."); gbp = StrMove(gbp,"~VECTOR: pBELO~Selection: chloramphenicol"); } if (strcmp(gLibrary,"CITB-HS-A")==0) { gbp = StrMove(gbp,"~~SOURCE INFORMATION:~"); gbp = StrMove(gbp,"This clone is from a release of the human BAC library. The library contains cloned DNA "); gbp = StrMove(gbp,"from a human sperm. For references see: "); gbp = StrMove(gbp,"Shizuya et al., Proc. Natl. Acad. Sci. 89:8794-8797 (1992); "); gbp = StrMove(gbp,"Kim et al., Genomics 34:213-218 (1996)."); gbp = StrMove(gbp,"~VECTOR: pBELO~Selection: chloramphenicol"); } if (bsFindTag(Seq, _DB_remark) && bsFlatten (Seq, 1, a)) { gbp = StrMove(gbp, "~~NEIGHBORING SEQUENCE INFORMATION:"); for (i = 0; i < arrayMax(a) ; ++i) { gbp += sprintf(gbp, "~%s", name(arr(a,i,BSunit).k)); if (i < arrayMax(a) - 1) gbp = StrMove(gbp, "~"); } } if (gBuf[0] != NULLB) AddCommentToEntry(nsp, nuc_entry, gBuf); break; default: message = " Project not implemented yet."; goto end_asnDoDump; } /* Now dump EMBL_feature, Subsequence, and Feature (human only). */ /* Recurse through parent. 
*/ key = seq; obj = bsCreate(key); gOffset = 1; gOffForward = TRUE; while (TRUE) { KEY new; int i,x1=0,x2=0; if (gAsnDebug) { printf(" Analyzing subseq/features from %s.\n",name(key)); } if (bsFindTag(obj, _EMBL_feature) && bsFlatten(obj, 4, a)) { for (i = 0; i < arrayMax(a); i += 4) { status = emblFeature(arr(a,i+0,BSunit).k, arr(a,i+1,BSunit).i, arr(a,i+2,BSunit).i, arr(a,i+3,BSunit)); } if (status) { bsDestroy(obj); goto end_asnDoDump; } } if (bsFindTag(obj, _Subsequence) && bsFlatten(obj, 3, a)) { for (i = 0; i < arrayMax(a); i += 3) { status = subseqFeatures(arr(a,i+0,BSunit).k, arr(a,i+1,BSunit).i, arr(a,i+2,BSunit).i); } if (status) { bsDestroy(obj); goto end_asnDoDump; } } if (gProject == HUMAN) { status = dumpGBFeatures(key); status = dumpMiscFeatures(key); if (status) { bsDestroy(obj); goto end_asnDoDump; } } if (gProject == HUMAN) { status = dumpMiscFeatures(key); if (status) { bsDestroy(obj); goto end_asnDoDump; } } if (!bsGetKey(obj, _Source, &new)) { bsDestroy (obj); break; } if (gAsnDebug) { printf(" Using %s found _Source %s.\n",name(key),name(new)); } bsDestroy(obj); if (!(obj = bsCreate (new))) { break; } if (!bsFindKey (obj, _Subsequence, key) || !bsGetData (obj, _bsRight, _Int, &x1) || !bsGetData (obj, _bsRight, _Int, &x2)) { break; } if (gAsnDebug) { printf(" New key %s has subseq with left/right limits.\n",name(new)); } if (x2 > x1) gOffset += x1 - 1; else { gOffForward = !gOffForward; gOffset = x1 + 1 - gOffset; } key = new; } end_asnDoDump: if (message) printf(" Fatal error dumping %s: %s\n", name(seq), message); bsDestroy(Seq); arrayDestroy(dna); return (message) ? 1 : 0; } /********************************************************************* * asnDumpKey: * * Called from displayBlock (pointer to function passed from asnDump). 
*
 * Process:
 * [0] Check for valid ACEDB_PROJECT;
 * [1] Check for valid ASN output file;
 * [2] Get submission date and call NCBISubCreate;
 * [3] Add author for NCBI reference;
 * [4] Call asnDoDump to do the rest;
 * [5] Validate NCBI submission and report errors.
 ********************************************************************/
/* Dump the sequence object `key` as an NCBI ASN.1 submission.
 * Reads the ASNDEBUG, ASNAUTO, ASNCASE, ASNPROMPT and ACEDB_PROJECT
 * environment variables, creates the submission (nsp) with NCBISubCreate,
 * delegates the sequence content to asnDoDump, then validates and writes
 * the result.  Returns nothing: each failure path visible here prints an
 * "Abort ASN.1 dump" message and returns early. */
void
asnDumpKey (KEY key)
{
  struct tm *date = NULL; /* MDH 11-14-94 for submit date. */
  time_t clock; /* MDH 11-14-94 for submit date. */
  BOOL PubHold = FALSE; /* Do not hold submission until publication. */
  PubPtr pub; /* Pointer to NCBI publication feature. */
  OBJ Seq; /* Object for key seq. */
  char asnfile[1024]; /* Absolute full file name for ASN.1 output. */
  char dname[DIR_BUFFER_SIZE] = {0}; /* Path name of ASN.1 output.*/
  char fname[FIL_BUFFER_SIZE] = {0}; /* ASN.1 output file (no extension). */
  char *ptype; /* Pointer to project name (from getenv). */
  int i; /* Loop variable. */
  int asnauto = 0; /* Is set to 1 if ASNAUTO defined in env. */
  int asncase = 0; /* Is set to 1 if ASNCASE defined in env. */
  int asnprompt = 0; /* Is set to 1 if ASNPROMPT defined in env. */
  Int2 nasnerrors; /* Number of errors in the ASN.1 submission */
  FILE *fil = NULL; /* Temp file pointer to test file open. */
  char *sub_email; /* Submitter e-mail handed to NCBISubCreate; chosen per project below. */

  printf("Begin ASN.1 dump for sequence %s.\n",name(key));

  /* Determine if debug output on. */
  gAsnDebug = (getenv("ASNDEBUG") != NULL);

  /* Determine if output file determined from sequence name. */
  asnauto = (getenv("ASNAUTO") != NULL);

  /* Determine if output file should be lower case.
   * Only honoured when ASNAUTO is also set. */
  asncase = (asnauto && getenv("ASNCASE") != NULL);

  /* Determine if prompt before overwriting ASN file.
   * Only honoured when ASNAUTO is also set. */
  asnprompt = (asnauto && getenv("ASNPROMPT") != NULL);

  /* Determine project type from ACEDB_PROJECT; return if invalid project */
  /* NOTE(review): "fugu" is accepted here, but the project switch in
   * asnDoDump has no FUGU case and ends in "Project not implemented yet."
   * -- confirm whether fugu is meant to be supported. */
  ptype = getenv("ACEDB_PROJECT");
  if (ptype == NULL) gProject = NOPROJECT;
  else if (!strcmp(ptype,"elegans")) gProject = ELEGANS;
  else if (!strcmp(ptype,"yeast")) gProject = YEAST;
  else if (!strcmp(ptype,"human")) gProject = HUMAN;
  else if (!strcmp(ptype,"briggsae")) gProject = BRIGGSAE;
  else if (!strcmp(ptype,"fugu")) gProject = FUGU;
  else gProject = NOPROJECT;

  /* An unset and an unrecognised ACEDB_PROJECT are both fatal. */
  if (gProject == NOPROJECT)
    {
      printf(" Fatal error: Must set ACEDB_PROJECT to valid project name.\n");
      printf("Abort ASN.1 dump for sequence %s.\n",name(key));
      return;
    }

  /* Get dname/fname.asn and create file name string for NCBI toolkit. */
  if (asnauto)
    {
      /* With ASNAUTO the output file is named after the sequence. */
      sprintf(asnfile, "%s.asn", name(key));
      if (asncase > 0)
        {
          /* NOTE(review): the text between "for (i=0; i" and "tm_mday;"
           * appears to have been lost from this copy; the statement below
           * is not valid C as it stands.  Presumably the missing part
           * lower-cased the file name, resolved/opened the output file and
           * fetched the local time into `date` -- restore it from the
           * original source before building. */
          for (i=0; itm_mday; /* days of month (1-31) */
  gMon = 1 + date->tm_mon; /* months since jan (0-11) */
  gYear = 1900 + date->tm_year; /* years since 1900 */

  /* Create submission. */
  /* The contact address depends on the project: human has its own. */
  switch (gProject)
    {
    case HUMAN:
      sub_email = "humansub@watson.wustl.edu";
      break;
    default:
      sub_email = "lhillier@watson.wustl.edu";
      break;
    }

  nsp = NCBISubCreate("Hillier","LaDeana",NULL,"L.",NULL,
                      "Washington University","Department of Genetics",
                      "4444 Forest Park Avenue","St. Louis","Missouri",
                      "USA","63108","314-286-1800","314-286-1810",
                      sub_email,PubHold,gMon,gDay,gYear);
  if (nsp == NULL)
    {
      printf(" Fatal Error: NCBISubCreate returned NULL pointer.\n");
      printf("Abort ASN.1 dump for sequence %s.\n",name(key));
      return;
    }

  /* Human submissions additionally get the submittor key and the
   * htgs_3 technique tag; other projects add nothing here. */
  switch (gProject)
    {
    case HUMAN:
      DefineSubmittorKey(nsp, Institution);
      AddTechToEntry(nsp, the_entry, MI_TECH_htgs_3);
      break;
    default:
      break;
    }

  /* Build the submission citation and add its author. */
  pub = CitSubBuild(nsp, gMon, gDay, gYear, MEDIUM_EMAIL);
  CitSubForSubmission(nsp, pub);
  AddAuthorToPub(nsp, pub, "Waterston", "Robert", NULL, "R.", NULL);

  /* asnDoDump returns non-zero on a fatal error.
   * NOTE(review): this early return does not call NCBISubFree(nsp),
   * unlike the normal exit path -- looks like a leak; confirm. */
  if (asnDoDump(key))
    {
      printf("Abort ASN.1 dump for sequence %s.\n",name(key));
      return;
    }

  /* Validate submission and report errors; write ASN.1 file.
*/ if ((nasnerrors = NCBISubValidate(nsp, stderr)) == 0) printf(" NCBI validate: SUCCESS!\n"); else printf(" NCBI validate: %d ERRORS in ASN.1 submission.\n", (int)nasnerrors); NCBISubWrite(nsp, asnfile); NCBISubFree(nsp); printf("End ASN.1 dump for sequence %s.\n",name(key)); } /********************************************************************* * asnDump: * * Entry point for ASN.1 dump; calling routine in w7/fmapcontrol.c ********************************************************************/ void asnDump (void) { displayBlock(asnDumpKey,"I will NCBI dump the corresponding sequence ") ; } /********************************************************************* * catnames: * * Return a malloced string that is the concatenation of name1 & name2 ********************************************************************/ char * catnames (char *name1, char *name2) { char *cp, *cp0; size_t len1, len2; size_t len; name1 = (name1 == NULL ? "" : name1); name2 = (name2 == NULL ? "" : name2); len1 = strlen(name1); len2 = strlen(name2); len = len1 + len2 + 3; /* make room for "; " + NUL byte */ cp0 = cp = Nlm_MemNew(len); cp = Nlm_StrMove(cp, name1); if (*name1 != NULLB && *name2 != NULLB) cp = Nlm_StrMove(cp, "; "); MemCpy(cp, name2, len2 + 1); return cp0; } /**************************************************************************** * get_acc_nid: * * Return the NID associated with the given accession ****************************************************************************/ static unsigned long get_acc_nid(char *fname, char *acc) { FILE *pp; char *cp; char buf[4096]; unsigned long nid = 0; sprintf(buf, "getfa %s %s", fname, acc); if ((pp = popen(buf, "r")) == NULL) return nid; if (fgets(buf, sizeof buf, pp) == NULL || buf[0] != '>') goto Return; if ((cp = StringStr(buf, "gi|")) == NULL) goto Return; sscanf(cp + 3, "%lu", &nid); Return: pclose(pp); return nid; } /**************************************************************************** * get_cDNA_NID: * * Return the NID 
associated with the given accession
 * -- looking in PRImate and EST divisions only.
 ****************************************************************************/
unsigned long
get_cDNA_NID(char *acc)
{
  unsigned long gi;

  /* Try "pri.nt" first; "est.nt" is consulted only when that lookup
   * yields 0.  The result is 0 when neither lookup finds the accession. */
  gi = get_acc_nid("pri.nt", acc);
  if (gi == 0)
    gi = get_acc_nid("est.nt", acc);

  return gi;
}

/****************************************************************************
Commented out for now since info is in DB_remark; at least for C.
elegans...This goes in the build of the main comment; do not delete.
Save for future reference. Note that this code has never been tested.

if (clone && (obj = bsCreate(clone))) {
  KEY sseq; OBJ Sseq;
  if (bsFindKey(Seq, _Clone_left_end, clone) && bsGetData(Seq, _bsRight, _Int, &i))
    sprintf(gBuf+strlen(gBuf)," The true left end of clone %s is at %d in this sequence.~", name(clone), i);
  else if (bsGetKey(obj, _Clone_left_end, &sseq) && (Sseq = bsCreate (sseq))) {
    if (bsFindKey(Sseq, _Clone_left_end, clone) && bsGetData(Sseq, _bsRight, _Int, &i))
      sprintf(gBuf+strlen(gBuf)," The true left end of clone %s is at %d in sequence CEL%s~", name(clone), i, name(sseq));
    bsDestroy(Sseq);
  }
  if (bsFindKey(Seq, _Clone_right_end, clone) && bsGetData(Seq, _bsRight, _Int, &i))
    sprintf(gBuf+strlen(gBuf)," The true right end of clone %s is at %d in this sequence.~", name(clone), i);
  else if (bsGetKey(obj, _Clone_right_end, &sseq) && (Sseq == bsCreate (sseq))) {
    if (bsFindKey(Sseq, _Clone_right_end, clone) && bsGetData(Sseq, _bsRight, _Int, &i))
      sprintf(gBuf+strlen(gBuf)," The true right end of clone %s is at %d in sequence CEL%s~", name(clone), i, name(sseq));
    bsDestroy (Sseq);
  }
  if (bsFindTag(Seq, _Clone_left_end) && bsFlatten(Seq, 2, a))
    for (i = 0 ; i < arrayMax(a) ; i += 2)
      if (arr(a, i, BSunit).k != clone && arr(a, i+1, BSunit).i)
        sprintf(gBuf+strlen(gBuf)," The true left end of clone %s is at %d in this sequence.~", name(arr(a, i, BSunit).k), arr(a, i+1, BSunit).i);
  if (bsFindTag(Seq,
_Clone_right_end) && bsFlatten(Seq, 2, a)) for (i = 0 ; i < arrayMax(a) ; i += 2) if (arr(a, i, BSunit).k != clone && arr(a, i+1, BSunit).i) sprintf(gBuf+strlen(gBuf)," The true right end of clone %s is at %d in this sequence.~", name(arr(a, i, BSunit).k), arr(a, i+1, BSunit).i); bsDestroy(obj); } if (bsGetKey(Seq, _Overlap_left, &key)) { strcat(gBuf, "The start of this sequence "); if (obj = bsCreate(key)) { int len, off; KEY dna; if (bsGetKey (obj, _DNA, &dna) && bsGetData (obj, _bsRight, _Int, &len) && bsFindKey (obj, _Overlap_right, seq) && bsGetData (obj, _bsRight, _Int, &off)) sprintf(gBuf+strlen(gBuf), "(1..%d) ", len - off + 1); bsDestroy (obj); } sprintf(gBuf+strlen(gBuf),"overlaps with the end of sequence CEL%s.~",name (key)); } if (bsGetKey(Seq, _Overlap_right, &key)) { strcat (gBuf, "The end of this sequence "); if (bsGetData(Seq, _bsRight, _Int, &i)) sprintf(gBuf+strlen(gBuf), "(%d..%d) ", i, gLength); sprintf(gBuf+strlen(gBuf), "overlaps with the start of sequence CEL%s.~",name (key)); } *****************************************************************************/