#! /bin/sh ######################################################################## # SCRIPT: make-pattern.sh # # Simultaneously constructs an amino-acid class covering (AACC) pattern and # a pattern-induced multi-sequence alignment for the sequences in # each cluster in cluster_file. If secondary structure sequences are # provided for one or more of the primary sequences (one of which must be # designated as a "reference sequence") then the sequences are # multiple-aligned using a secondary structure-dependent gap penalty # algorithm. # # cluster-file format: CLUSTER_NAMEcluster-nested-list # (as created by make-cluster.sh) # # If present, the secondary structure sequences in the file # sec_struct_seq_file should have a locus name that matches # the locus name of it's corresponding primary sequence # with the suffix '.ss' (e.g. 2PLV.ss). Each element # of an alpha-helix, 3-1 helix and beta-strand must be designated # 'h', 'g', and 'e', repectively. All other characters in the secondary # strucuture sequences will be ignored with respect to the # the structure-dependent gap penalty. # ss_gap_penalty is the penalty for placing a gap # within a alpha- or 3-10 helix or a beta-strand. # # Output files created: # "CLUSTER_NAME-[ML|SB][.ext].pima" # The pattern-induced multiple-sequence alignment of each clustered # sequence set; includes the "nodal" patterns used to align the # sequences (the nodal patterns have the locus name # cluster_name.ext -- extensions added to the sequence names # match the extension of the nodal-pattern used to align the # corresponding sequence subset, e.g. seq_1.1 and seq_2.1 would be # aligned by nodal-pattern cluster_name.1 . # # References: # Smith, R.F. and Smith, T.F. (1990). Automatic generation of primary # sequence patterns from sets of related protein sequences. # PNAS 87:118-122. # Smith, R.F. and Smith, T.F. (1991). Pattern-Induced Multi-Sequence # Alignment (PIMA) algorithm employing secondary structure-dependent # gap penalties for use in comparitive protein modelling. # Submitted to Protein Engineering. # # Smith, Randall F. and Temple F. Smith (1991) # Molecular Biology Computer Research Resource # Galleria Level 1 # Dana-Farber Cancer Institute and School of Public Health # Harvard University # 44 Binney St., Boston MA 02115 USA (617)732-3746 # Email: rsmith@bcm.tmc.edu # # Copyright (c) 1990,1991 MBCRR, Dana-Farber Cancer Institute # and Harvard University. # # Last modified: 5-16-95 ######################################################################## if test "$1" = -h; then echo "Usage: make-pattern [options] seq_file cluster_file [sec_struct_seq_file]" echo echo " -d PENALTY use length dependent gap penalty of PENALTY (gap extension)" echo " -h print this help" echo " -i PENALTY use length independent gap penalty of PENALTY (gap open)" echo " -l SCORE use minimum local score of SCORE" echo " -m MATRIX_FILE use weight matrix file of MATRIX_FILE instead of patgen.mat" echo " -n do not output node extensions" echo " -t PENALTY use sec. struct. gap penalty of PENALTY" echo " -u CHARS use secondary structure characters CHARS instead of \"hge\"" echo " -w WIDTH use minimum local alignment width of WIDTH instead of 15" echo echo "Default values for the gap penalties and the minimum local score depend on" echo "the weight matrix file used. All options must be specified separately" exit 0 fi while true; do case "$1" in -n) N_OPT=-n ;; -d|-i|-l|-m|-t|-u|-w) OPTIONS="$OPTIONS $1 $2" shift ;; *) break ;; esac shift done if test $# -ne 2 -a $# -ne 3; then echo 'make-pattern: bad arguments, run "make-pattern -h" for help.' >&2 exit 1 fi SEQ_FILE=$1 if test ! -s $SEQ_FILE; then echo "make-pattern: sequence-file \"$SEQ_FILE\" not found." >&2 exit 1 fi CLUSTER_FILE=$2 if test ! -s $CLUSTER_FILE; then echo "make-pattern: sequence-file \"$CLUSTER_FILE\" not found." >&2 exit 1 fi if test $# -gt 2; then SS_SEQ_FILE=$3 if test ! -s $SS_SEQ_FILE; then echo "make-pattern: sec_struct_seq_file \"$SS_SEQ_FILE\" not found." >&2 exit 1 fi fi # # Save the output of extract-records in a file and open it on file descriptor # 3. Changed from a pipe to avoid having the while loop become a subshell. # The while loop will also become a subshell if redirection is used, so avoid # that also. Want to avoid having the while loop be a subshell otherwise # there is no way to catch an error and variables are not preserved. # extract-records -i $CLUSTER_FILE > tmp-extract.$$ exec 3 tmp-cluster.$$ N_WORDS=`wc -w < tmp-cluster.$$` #skip this loop if the original cluster size = 1 entry if test $N_WORDS -gt 2; then while test $N_WORDS -gt 2; do SEQ_NUM=`expr $SEQ_NUM + 1` # Avoid a pipe, see note above while loop. Be sure print-cluster # works, otherwise tmp-cluster.$$ won't change nor will $N_WORDS # and the while loop right above will become infinite. print-cluster -m tmp-cluster.$$ > tmp-print.$$ if test $? -ne 0 -o ! -s tmp-cluster.$$; then echo 'make-pattern: print-cluster error.' >&2 exit 1 fi exec 4 tmp-infile-$J.$$ if test $# -gt 2; then echo "$I".ss | extract-records - $SS_SEQ_FILE \ >> tmp-infile-$J.$$ fi fi if test ! -s tmp-infile-$J.$$; then echo "make-pattern: input file tmp-infile-$J.$$ is empty." >&2 exit 1 fi J=2 # Should use `expr $J + 1`, but only happens once. done EXTRA_OPTIONS= if test $# -gt 2; then EXTRA_OPTIONS=-s fi if test ! -n "$N_OPT"; then EXTRA_OPTIONS="$EXTRA_OPTIONS -e .$SEQ_NUM" fi pima-pm $OPTIONS $EXTRA_OPTIONS \ tmp-infile-1.$$ tmp-infile-2.$$ tmp-pat-out.$$ \ $CLUSTER_NAME > tmp-score.$$ if test $? -ne 0; then echo 'make-pattern pima-pm error.' >&2 exit 1 fi NEW_SCORE=`tail -1 tmp-score.$$` print-cluster -s "$SEQ1" "$SCORE" "$SEQ2" $CLUSTER_NAME.$SEQ_NUM tmp-cluster.$$ > tmp-new-cluster.$$ mv tmp-pat-out.$$ $CLUSTER_NAME.$SEQ_NUM echo "$SEQ_NUM $SEQ1 $SEQ2 $NEW_SCORE" mv tmp-new-cluster.$$ tmp-cluster.$$ rm tmp-infile-1.$$ tmp-infile-2.$$ done N_WORDS=`wc -w < tmp-cluster.$$` done if test -n "$N_OPT"; then mv $CLUSTER_NAME_FILE.$SEQ_NUM $CLUSTER_NAME_FILE.pima else trim-root-num $CLUSTER_NAME_FILE.$SEQ_NUM > $CLUSTER_NAME_FILE.pima rm $CLUSTER_NAME_FILE.$SEQ_NUM fi rm tmp-cluster.$$ tmp-print.$$ tmp-score.$$ # Set a flag that a pattern was generated. Just in case all the # clusters have just a single sequence. MADE_PATTERN=TRUE #skip this whole loop if the original cluster size = 1 entry fi echo done rm tmp-extract.$$ if test ! "$MADE_PATTERN"; then echo 'make-pattern: no patterns generated, score cutoff probably to high.' >&2 rm tmp-cluster.$$ exit 1 fi exit 0