#!/bin/csh
# GBUPDATE, Version 10/31/2007
# csh script to download GenBank files
#
# Revision history:
# 31 Oct 2007  Added RM_CMD and FTPCOMMAND variables, to get
#              around differences among Unix and Linux implementations.
# 24 Feb 2007  As of GenBank Release 158.0, the Accession index
#              has been split among several files, named gbacc1.idx,
#              gbacc2.idx etc. gbupdate now handles these files in the
#              same way it handles sequence files. Simply include 'acc'
#              in filelist as if it was another division.
# 29 Apr 2004  GenPept files have a new naming convention. For example,
#              in GenBank Release 141, rel141.fsa_aa
# 21 Aug 2000  Updated to extract gzipped files, rather than tar.Z files.
#
# Usage: gbupdate filelist
#
# Example: {uses 'at' command to run a delayed batch job}
#   at 1am
#   at>gbupdate filelist &
#   at>ctrl-D {ends the command }
#
# Environment variables read by this script:
#   MAILID   full Internet address (userid@hostname); sent as the
#            anonymous FTP password.
#   GENBANK  directory the script changes to before downloading.
#   GP, DOC  destination directories used further down when a GenPept
#            (*.fsa_aa) file or gbrel.txt is downloaded.
#
# The existing files in the GenBank directory will be replaced with the
# new ones, and so must be writeable.
# Remember to include the index and documentation files in filelist.
# When disk space is tight, edit the 'filelist' file to download files in
# order of decreasing size. In general, you need to have at least twice as
# much empty space as is necessary to hold the largest file to be downloaded.
# This can be circumvented by setting $tmpdir to /tmp, rather than ".".

#----------------------- Set environment variables -------------

# Check to see if $MAILID is set.
# Exit with a non-zero status so that callers can detect the failure.
if (${?MAILID} == 0) then
    echo Environment variable MAILID must be set to your full Internet address
    echo in the form userid@hostname
    echo This is best done in your .cshrc file.
    exit 1
endif

# $GENBANK is needed by the 'cd' below; report a missing value here
# instead of letting csh abort with "Undefined variable".
if (${?GENBANK} == 0) then
    echo Environment variable GENBANK must be set to the directory
    echo in which the GenBank files are to be stored.
    exit 1
endif

# RM_CMD - command to be used for removing files and directories.
# Prefer an absolute path; fall back to whatever 'rm' is on the path.
if (-e /usr/bin/rm) then
    set RM_CMD = /usr/bin/rm
else if (-e /bin/rm) then
    set RM_CMD = /bin/rm
else
    set RM_CMD = rm
endif

# generate FTP command
# We need to run in the passive mode, which is required by
# some firewalls.
# There is a lot of inconsistency from system to system as far
# as how ftp is run in the passive mode. One or more of these
# works on each system, but none works on all systems:
#     ftp -p
#     ftp, input 'passive' from ftp.input
#     pftp
#
# 'which pftp' is taken to have found the program when it prints
# exactly one word.
set RESULT = `which pftp | wc -w`
if ( "$RESULT" == "1" ) then
    set FTPCOMMAND = 'pftp'
else
    set FTPCOMMAND = 'ftp -p'
endif

#- - - - - - - - - directory to store temporary files - - - - - - - - -
# Unless you are just scraping the last bytes of the
# filesystem in which GenBank is to reside, then the current working
# directory should be fine.
set tmpdir = .

# /tmp and /var/tmp are two alternatives, if you're running out of space
# in the target filesystem. Beware! if you max out /tmp, you will be
# interfering with the ability of everyone else on the system to
# do business as usual. /tmp on some systems can be surprisingly small,
# if it is taken from /swap.
#set tmpdir = /tmp

# On some systems, /var/tmp is an alternative place for temporary files
#set tmpdir = /var/tmp
#- - - - - - - - - - - - - - - - - - - - - -

# Anonymous FTP login: the mail address is sent as the password.
set GBUSERID = anonymous
set GBPASSWD = $MAILID

# RLENGTH and LLENGTH tell which field gives the length of a file
# printed by 'ls -l' respectively, on remote and local hosts.
set RLENGTH = 5
set LLENGTH = 5

# All downloads and temporary control files are relative to $GENBANK.
cd $GENBANK

# GenBank download site, and directory at that site.
# Mirror sites and directories are commented out.
# - - - - - - NCBI
#set GBHOST = ftp.ncbi.nih.gov
#set GBDIR = genbank
#set RLENGTH = 5
#- - - - - - -JAPAN
#set GBHOST = bio-mirror.jp.apan.net
#set GBDIR = pub/biomirror/genbank
#- - - - - - -AUSTRALIA
#set GBHOST = bio-mirror.au.apan.net
#set GBDIR = biomirror/genbank
#- - - - - - -SINGAPORE
#set GBHOST = bio-mirror.sg.apan.net
#set GBDIR = biomirrors/genbank
#- - - - - - -CHINA
#set GBHOST = bio-mirror.im.ac.cn
#set GBDIR = genbank
#- - - - - - -USA - Indiana University
set GBHOST = bio-mirror.net
set GBDIR = biomirror/genbank
set GBUSERID = anonymous
set GBPASSWD = $MAILID
#- - - - - - -USA - San Diego Supercomputing Center
#set GBHOST = genbank.sdsc.edu
#set GBDIR = pub
#set GBUSERID = anonymous
#set GBPASSWD = $MAILID
# - - - - - - - - - - - - - - -

# The first argument names the file listing what to download. Check it
# before contacting the server: with no argument, 'cat $1' below would
# read standard input instead.
if ($#argv < 1) then
    echo "Usage: $0 filelist"
    exit 1
endif
if (! -r "$1") then
    echo "${0}: cannot read file list $1"
    exit 1
endif

# Use 'ls -l' to write a list of GenBank files on remote server,
# Send output to ls.out

# generate FTP command file
echo user $GBUSERID $GBPASSWD > ftp.input
echo cd $GBDIR >> ftp.input
echo bin >> ftp.input
echo ls -l ls.out >> ftp.input
echo bye >> ftp.input

# run FTP
$FTPCOMMAND -i -n $GBHOST < ftp.input

#----------------------- MAIN LOOP -------------
# Each entry of the file list is either a division name (e.g. 'est',
# 'acc') that is expanded to all matching gb<division>N.idx.gz and
# gb<division>N.seq.gz files in ls.out, or the name of a single file.
foreach entry (`cat $1`)
    echo $entry

    # Create a temporary list of all files for a
    # GenBank division.
    egrep -e gb$entry'[0-9]*\.idx\.gz' ls.out > $$.temp
    egrep -e gb$entry'[0-9]*\.seq\.gz' ls.out >> $$.temp
    cat $$.temp

    if ( -z $$.temp) then
        # division is in a single file
        echo $entry > $$.filelist
    else
        # division is split among several files.
        # Squeeze runs of blanks, then take the 9th blank-separated
        # field (the file name). cut needs -d' ' here because its
        # default field delimiter is TAB.
        tr -s ' ' ' ' < $$.temp | cut -d' ' -f9 > $$.filelist
    endif
    cat $$.filelist
    $RM_CMD $$.temp

    foreach file (`cat $$.filelist`)
        #Nomenclature:
        # $file - original gzipped file eg. gbest1.seq.gz
        # $name - $file minus .gz extension eg. gbest1.seq
        # $base - raw file name eg. gbest1
        if ($file:e == gz || $file:e == Z) then
            set name = $file:r
            if ($name:e == seq) then
                set base = $name:r
                # Make some space by deleting the current GenBank
                # division, if it exists.
                # NOTE(review): the glob below is built from $name
                # (e.g. gbest1.seq.*), while the message refers to
                # $base (e.g. gbest1.*) - confirm which is intended
                # before changing what gets deleted.
                echo "Removing file(s) for $base, if they exist"
                $RM_CMD $name.*
            endif
        else
            # Not compressed: the file is stored under its own name.
            set name = $file
        endif

        # Create input file for ftp command. Logs in, moves to correct
        # directory, and downloads the data. Then logs out.
        echo user $GBUSERID $GBPASSWD > ftp.input
        echo cd $GBDIR >> ftp.input
        echo bin >> ftp.input
        echo get $file $tmpdir/$file >> ftp.input
        echo bye >> ftp.input

        # Get the file from GenBank
        #nice $FTPCOMMAND -i -n $GBHOST < ftp.input
        nice +2 $FTPCOMMAND -i -n $GBHOST < ftp.input

        # Make sure that the file received is the same length as the
        # original file. The size is field $RLENGTH of the remote
        # listing and field $LLENGTH of the local 'ls -l' output.
        set ORIGINAL = `grep $file ls.out | tr -s ' ' ' ' | cut -d' ' -f$RLENGTH`
        set RECEIVED = `ls -l $tmpdir/$file | tr -s ' ' ' ' | cut -d' ' -f$LLENGTH`
        echo 'ORIGINAL= '$ORIGINAL
        echo 'RECEIVED= '$RECEIVED

        # Both sides are quoted so that an empty value (file missing from
        # ls.out, or nothing downloaded) is a failed comparison rather
        # than a csh expression syntax error that aborts the script.
        if ("$ORIGINAL" != "" && "$ORIGINAL" == "$RECEIVED") then
            ls -l $tmpdir/$file >> files_received

            if ($file:e == gz || $file:e == Z) then
                if ($name:e == seq) then
                    #Uncompress the file
                    nice +10 gunzip -f $tmpdir/$file

                    #Run splitdb to create the database
                    nice +10 splitdb -c $tmpdir/$name $base.ano $base.wrp $base.ind
                    set success = $status
                    chmod a+r $base.*

                    if ($success == 0) then
                        #Remove the .seq file if splitdb exited with a
                        #return code of 0.
                        $RM_CMD $tmpdir/$name
                    else
                        # Otherwise, remove the partially split files to
                        # make room. These will have to be split manually.
                        $RM_CMD $base.ano $base.wrp $base.ind
                    endif
                else
                    # Any other compressed file is expanded into the
                    # current directory under its uncompressed name.
                    # nice +10 zcat -c -f $tmpdir/$file > $name
                    nice +10 gunzip -c -f $tmpdir/$file > $name
                    $RM_CMD $tmpdir/$file
                    chmod a+r $name
                endif
            endif

            # Post-processing for special files. This runs only for a
            # download whose size was verified above.
            if ($name:e == fsa_aa) then
                # GenPept: replace the previous release in $GP and point
                # genpept.wrp at the new file.
                $RM_CMD $GP/*.fsa_aa
                mv $name $GP
                $RM_CMD $GP/genpept.wrp
                ln -s $name $GP/genpept.wrp
            else if ($name == gbrel.txt) then
                mv gbrel.txt $DOC/GenBank
            endif
        else
            # Size mismatch or nothing received: record the file so it
            # can be fetched again later.
            echo $file >> files_missed
        endif
    end #foreach file
    $RM_CMD $$.filelist
end #foreach entry