#! /bin/sh
########################################################################
#                    SCRIPT: pima.sh
#
# Given a set of (presumably related) primary sequences in sequence_file,
#   clusters the sequences into one or more families  using two different
#   linkage rules: 1) maximal linkage (Smith and Smith, 1990) and
#   2) sequential branching (see Smith and Smith, 1991).  For the latter,
#   all pairwise scores are sorted high-to-low and the first sequence from
#   the highest scoring pair is chosen as the "reference sequence"
#   and the sequences clustered based strictly on the order of
#   similarity to the reference sequence. Maximal linkage clusters are
#   labelled '-ML' appended to the CLUSTER_NAME; sequential branching
#   clusters are labeled '-SB'.  Each cluster is then multiply-aligned
#   using a pattern-based alignment algorithm (Smith and Smith, 1991).
#
#   If secondary structure sequences are provided
#   for one or more of the primary sequences (one of which must be
#   designated a priori as the reference sequence" then the sequences are
#   clustered using the sequentially branching rule and the clusters
#   multiply-aligned using a secondary structure-dependent gap penalty
#   algorithm (Smith and Smith, 1991).
#   
# Where:
#   CLUSTER_NAME is a name used to label the cluster
#   CLUSTER_SCORE_CUTOFF is the lowest match score to be used within a cluster
#       (recommended value: 25.0); a value of 0 will group all input seqs into
#       a single cluster, but the final pattern may be completely degenerate.
#   REF_SEQ_NAME [optional; if specified, then sec_struct_seq_file must also
#       be specified] is the locus name of one of the primary sequences for 
#       which the secondary structure is in the file seq_struct_seq_file,
#   sec_struct_seq_file [optional; if specified, then REF_SEQ_NAME must also
#       be specified] contains secondary structure
#       sequences for one or more of the primary sequences in the set.
#       The secondary structure sequences in the file
#       sec_struct_seq_file should have a locus name that matches
#       the locus name of it's corresponding
#       primary sequence with the suffix '.ss' (e.g. 2PLV.ss).  Each element
#       of an alpha-helix, 3-10 helix and beta-strand must be designated
#       'h', 'g', and 'e', repectively.  All other characters in the secondary
#       strucuture sequences will be ignored with respect to the
#       the structure-dependent gap penalty.
#   ss_gap_penalty [optional; default: 66.7] is the penalty for placing a gap 
#       within a alpha- or 3-10 helix or a beta-strand.
#   
# Output: 2 files for all clusters:
#   CLUSTER_NAME-[ML|SB].cluster, CLUSTER_NAME-[ML|SB].pattern, and
#   CLUSTER_NAME-[ML|SB].pima
#
# References:
#   Smith, R.F. and Smith, T.F. (1990).  Automatic generation of primary
#     sequence patterns from sets of related protein sequences.
#     PNAS 87:118-122.
#   Smith, R.F. and Smith, T.F. (1991).  Pattern-Induced Multi-Sequence
#     Alignment (PIMA) algorithm employing secondary structure-dependent
#     gap penalties for use in comparitive protein modelling.
#     Submitted to Protein Engineering.
#
#
# Smith, Randall F. and Temple F. Smith (1991)
# Molecular Biology Computer Research Resource
# Galleria Level 1
# Dana-Farber Cancer Institute and School of Public Health
# Harvard University
# 44 Binney St., Boston MA 02115 USA     (617)732-3746
#
# Internet: rsmith@bcm.tmc.edu
#
#
# Last modified: 5-16-95
########################################################################

#
# Interpret options and arguments.
#

while true; do
    case "$1" in
	# make-cluster options.
	-c)
	    C_OPT="-c $2"
	    shift ;;
	-M)
	    M_OPT=-m ;;
	# make-pattern options.
	-n)
	    PATTERN_OPTIONS="$PATTERN_OPTIONS $1" ;;
	-t|-u)
	    PATTERN_OPTIONS="$PATTERN_OPTIONS $1 $2"
	    shift ;;
	# Common options.
	-d|-i|-l|-m|-w)
	    OPTIONS="$OPTIONS $1 $2"
	    shift ;;
	# Print help and exit.
	-h)
	    echo "Usage: pima [options] pattern_name seq_file [ref_seq_name sec_struct_seq_file]"
	    echo
	    echo "  -c CUTOFF       use match score cutoff of CUTOFF instead of 0"
	    echo "  -d PENALTY      use length dependent gap penalty of PENALTY (gap extension)"
	    echo "  -h              print this help"
	    echo "  -i PENALTY      use length independent gap penalty of PENALTY (gap open)"
	    echo "  -l SCORE        use minimum local score of SCORE"
	    echo "  -m MATRIX_FILE  use weight matrix file of MATRIX_FILE instead of patgen.mat"
	    echo "  -n              do not output node extensions"
	    echo "  -t PENALTY      use sec. struct. gap penalty of PENALTY"
	    echo "  -u CHARS        use secondary structure characters CHARS instead of \"hge\""
	    echo "  -w WIDTH        use minimum local alignment width of WIDTH instead of 15"
	    echo "  -M              only perform maximal (complete) linkage"
	    echo
	    echo "Default values for the gap penalties and the minimum local score depend on"
	    echo "the weight matrix file used.  All options must be specified separately"
	    exit 0 ;;
	*)
	   break ;;
    esac
    shift
done

if test $# -ne 2 -a $# -ne 4; then
    echo 'pima: bad arguments, run "pima -h" for help.' >&2
    exit 1
fi

CLUSTER_NAME=$1
SEQ_FILE=$2
if test ! -s $SEQ_FILE; then
    echo "pima: sequence-file \"$SEQ_FILE\" not found." >&2
    exit 1
fi

if test $# -gt 2; then
    if test -n "$M_OPT"; then
	echo 'pima: -M option not accepted, secondary structure uses sequential branching.' >&2
	exit 1
    fi
    REF_SEQ_NAME=$3
    SS_SEQ_FILE=$4
    if test ! -s $SS_SEQ_FILE; then
	echo "pima: sec_struct_seq_file \"$SS_SEQ_FILE\" not found." >&2
	exit 1
    fi
fi

echo
echo ">>Making cluster:"
if test $# -eq 2; then
    make-cluster $OPTIONS $C_OPT $M_OPT $CLUSTER_NAME $SEQ_FILE
else
    make-cluster $OPTIONS $C_OPT $CLUSTER_NAME $SEQ_FILE $REF_SEQ_NAME $SS_SEQ_FILE
fi
if test $? -ne 0; then
    echo 'pima: make-cluster error.' >&2
    exit 1
fi

CLUSTER_FILES=`echo $CLUSTER_NAME*.cluster`
if test "$CLUSTER_FILES" = "$CLUSTER_NAME*.cluster"; then
    echo 'pima: missing cluster files.' >&2
    exit 1
fi

echo
echo "Cluster(s) created:"
cat $CLUSTER_FILES
echo

echo ">>Generating pattern(s) + multi-sequence alignment(s)"
for CLUSTER_FILE in $CLUSTER_FILES; do
    if test $# -eq 2; then
	make-pattern $OPTIONS $PATTERN_OPTIONS $SEQ_FILE $CLUSTER_FILE
    else
	make-pattern $OPTIONS $PATTERN_OPTIONS $SEQ_FILE $CLUSTER_NAME-SB.cluster $SS_SEQ_FILE
    fi
    if test $? -ne 0; then
	echo 'pima: make-pattern error.' >&2
	exit 1
    fi
done

echo

PATTERN_FILES=`echo $CLUSTER_NAME*.pima`
if test "$PATTERN_FILES" = "$CLUSTER_NAME*.pima"; then
    echo 'pima: missing pattern files.' >&2
    exit 1
fi

echo "Pattern(s) created:"
for FILE in $PATTERN_FILES; do
    ROOT_NAME=`echo $FILE | sed 's/\.pima//'`
    extract-root-pat $FILE > $ROOT_NAME.pattern
    echo
    cat $ROOT_NAME.pattern
    echo
done

echo ">>Progran completed."
echo ">>Files created:"
ls -sF $CLUSTER_NAME*

exit 0