#! /bin/sh ######################################################################## # SCRIPT: pima.sh # # Given a set of (presumably related) primary sequences in sequence_file, # clusters the sequences into one or more families using two different # linkage rules: 1) maximal linkage (Smith and Smith, 1990) and # 2) sequential branching (see Smith and Smith, 1991). For the latter, # all pairwise scores are sorted high-to-low and the first sequence from # the highest scoring pair is chosen as the "reference sequence" # and the sequences clustered based strictly on the order of # similarity to the reference sequence. Maximal linkage clusters are # labelled '-ML' appended to the CLUSTER_NAME; sequential branching # clusters are labeled '-SB'. Each cluster is then multiply-aligned # using a pattern-based alignment algorithm (Smith and Smith, 1991). # # If secondary structure sequences are provided # for one or more of the primary sequences (one of which must be # designated a priori as the reference sequence" then the sequences are # clustered using the sequentially branching rule and the clusters # multiply-aligned using a secondary structure-dependent gap penalty # algorithm (Smith and Smith, 1991). # # Where: # CLUSTER_NAME is a name used to label the cluster # CLUSTER_SCORE_CUTOFF is the lowest match score to be used within a cluster # (recommended value: 25.0); a value of 0 will group all input seqs into # a single cluster, but the final pattern may be completely degenerate. # REF_SEQ_NAME [optional; if specified, then sec_struct_seq_file must also # be specified] is the locus name of one of the primary sequences for # which the secondary structure is in the file seq_struct_seq_file, # sec_struct_seq_file [optional; if specified, then REF_SEQ_NAME must also # be specified] contains secondary structure # sequences for one or more of the primary sequences in the set. # The secondary structure sequences in the file # sec_struct_seq_file should have a locus name that matches # the locus name of it's corresponding # primary sequence with the suffix '.ss' (e.g. 2PLV.ss). Each element # of an alpha-helix, 3-10 helix and beta-strand must be designated # 'h', 'g', and 'e', repectively. All other characters in the secondary # strucuture sequences will be ignored with respect to the # the structure-dependent gap penalty. # ss_gap_penalty [optional; default: 66.7] is the penalty for placing a gap # within a alpha- or 3-10 helix or a beta-strand. # # Output: 2 files for all clusters: # CLUSTER_NAME-[ML|SB].cluster, CLUSTER_NAME-[ML|SB].pattern, and # CLUSTER_NAME-[ML|SB].pima # # References: # Smith, R.F. and Smith, T.F. (1990). Automatic generation of primary # sequence patterns from sets of related protein sequences. # PNAS 87:118-122. # Smith, R.F. and Smith, T.F. (1991). Pattern-Induced Multi-Sequence # Alignment (PIMA) algorithm employing secondary structure-dependent # gap penalties for use in comparitive protein modelling. # Submitted to Protein Engineering. # # # Smith, Randall F. and Temple F. Smith (1991) # Molecular Biology Computer Research Resource # Galleria Level 1 # Dana-Farber Cancer Institute and School of Public Health # Harvard University # 44 Binney St., Boston MA 02115 USA (617)732-3746 # # Internet: rsmith@bcm.tmc.edu # # # Last modified: 5-16-95 ######################################################################## # # Interpret options and arguments. # while true; do case "$1" in # make-cluster options. -c) C_OPT="-c $2" shift ;; -M) M_OPT=-m ;; # make-pattern options. -n) PATTERN_OPTIONS="$PATTERN_OPTIONS $1" ;; -t|-u) PATTERN_OPTIONS="$PATTERN_OPTIONS $1 $2" shift ;; # Common options. -d|-i|-l|-m|-w) OPTIONS="$OPTIONS $1 $2" shift ;; # Print help and exit. -h) echo "Usage: pima [options] pattern_name seq_file [ref_seq_name sec_struct_seq_file]" echo echo " -c CUTOFF use match score cutoff of CUTOFF instead of 0" echo " -d PENALTY use length dependent gap penalty of PENALTY (gap extension)" echo " -h print this help" echo " -i PENALTY use length independent gap penalty of PENALTY (gap open)" echo " -l SCORE use minimum local score of SCORE" echo " -m MATRIX_FILE use weight matrix file of MATRIX_FILE instead of patgen.mat" echo " -n do not output node extensions" echo " -t PENALTY use sec. struct. gap penalty of PENALTY" echo " -u CHARS use secondary structure characters CHARS instead of \"hge\"" echo " -w WIDTH use minimum local alignment width of WIDTH instead of 15" echo " -M only perform maximal (complete) linkage" echo echo "Default values for the gap penalties and the minimum local score depend on" echo "the weight matrix file used. All options must be specified separately" exit 0 ;; *) break ;; esac shift done if test $# -ne 2 -a $# -ne 4; then echo 'pima: bad arguments, run "pima -h" for help.' >&2 exit 1 fi CLUSTER_NAME=$1 SEQ_FILE=$2 if test ! -s $SEQ_FILE; then echo "pima: sequence-file \"$SEQ_FILE\" not found." >&2 exit 1 fi if test $# -gt 2; then if test -n "$M_OPT"; then echo 'pima: -M option not accepted, secondary structure uses sequential branching.' >&2 exit 1 fi REF_SEQ_NAME=$3 SS_SEQ_FILE=$4 if test ! -s $SS_SEQ_FILE; then echo "pima: sec_struct_seq_file \"$SS_SEQ_FILE\" not found." >&2 exit 1 fi fi echo echo ">>Making cluster:" if test $# -eq 2; then make-cluster $OPTIONS $C_OPT $M_OPT $CLUSTER_NAME $SEQ_FILE else make-cluster $OPTIONS $C_OPT $CLUSTER_NAME $SEQ_FILE $REF_SEQ_NAME $SS_SEQ_FILE fi if test $? -ne 0; then echo 'pima: make-cluster error.' >&2 exit 1 fi CLUSTER_FILES=`echo $CLUSTER_NAME*.cluster` if test "$CLUSTER_FILES" = "$CLUSTER_NAME*.cluster"; then echo 'pima: missing cluster files.' >&2 exit 1 fi echo echo "Cluster(s) created:" cat $CLUSTER_FILES echo echo ">>Generating pattern(s) + multi-sequence alignment(s)" for CLUSTER_FILE in $CLUSTER_FILES; do if test $# -eq 2; then make-pattern $OPTIONS $PATTERN_OPTIONS $SEQ_FILE $CLUSTER_FILE else make-pattern $OPTIONS $PATTERN_OPTIONS $SEQ_FILE $CLUSTER_NAME-SB.cluster $SS_SEQ_FILE fi if test $? -ne 0; then echo 'pima: make-pattern error.' >&2 exit 1 fi done echo PATTERN_FILES=`echo $CLUSTER_NAME*.pima` if test "$PATTERN_FILES" = "$CLUSTER_NAME*.pima"; then echo 'pima: missing pattern files.' >&2 exit 1 fi echo "Pattern(s) created:" for FILE in $PATTERN_FILES; do ROOT_NAME=`echo $FILE | sed 's/\.pima//'` extract-root-pat $FILE > $ROOT_NAME.pattern echo cat $ROOT_NAME.pattern echo done echo ">>Progran completed." echo ">>Files created:" ls -sF $CLUSTER_NAME* exit 0