#! /bin/sh
########################################################################
#                  SCRIPT: make-cluster.sh
#
# Clusters the sequences in seq_input_file into one or more cluster
#   (i.e. family) trees using two different linkage rules: 1) maximal
#   linkage (Smith and Smith, 1990) and 2) sequential branching
#   (see Smith and Smith, 1991).  All pairwise scores are
#   sorted high-to-low and the first sequence from the highest scoring
#   pair is chosen as the "reference sequence" and the sequences
#   clustered based strictly on the order of similarity to the reference
#   sequence. Maximal linkage clusters are labelled '-ML' appended to
#   the CLUSTER_NAME; sequential branching clusters are labeled '-SB'.
# 
#   If secondary structure sequences are provided for one or more of the
#   primary sequences (one of which must be designated as the reference
#   sequence) then only the sequential branching clustering is performed.
#
# Where:
#	CLUSTER_NAME is a name used to label the cluster
#	SCORE_CUTOFF is the lowest match score to be used within a cluster
#		(recommended value: 25.0); a value of 0 will group all input
#		seqs into a single cluster, but the final pattern may be completely
#       degenerate.
#   REF_SEQ_NAME [optional; if specified, then sec_struct_seq_file must also
#       be specified] is the locus name of one of the primary sequences for 
#       which the secondary structure is in the file seq_struct_seq_file,
#   sec_struct_seq_file [optional; if specified, then REF_SEQ_NAME must also
#       be specified] contains secondary structure
#       sequences for one or more of the primary sequences in the set.
#       The secondary structure sequences in the file
#       sec_struct_seq_file should have a locus name that matches
#       the locus name of it's corresponding
#       primary sequence with the suffix '.ss' (e.g. 2PLV.ss).  Each element
#       of an alpha-helix, 3-10 helix and beta-strand must be designated
#       'h', 'g', and 'e', repectively.  All other characters in the secondary
#       structure sequences will be ignored with respect to the
#       the structure-dependent gap penalty.
#
# Output files: CLUSTER_NAME-[ML|SB].cluster
#
# References:
#   Smith, R.F. and Smith, T.F. (1990).  Automatic generation of primary
#     sequence patterns from sets of related protein sequences.
#     PNAS 87:118-122.
#   Smith, R.F. and Smith, T.F. (1991).  Pattern-Induced Multi-Sequence
#     Alignment (PIMA) algorithm employing secondary structure-dependent
#     gap penalties for use in comparitive protein modelling.
#     Submitted to Protein Engineering.
#
#
# Smith, Randall F. and Temple F. Smith (1991)
# Molecular Biology Computer Research Resource, LG-127
# Dana-Farber Cancer Institute and School of Public Health
# Harvard University
# 44 Binney St., Boston MA 02115 USA     (617)732-3746
#
# Internet: rsmith@mbcrr.harvard.edu
# Bitnet:   rsmith%mbcrr@husc6.bitnet
#
# Copyright (c) 1989,1990,1991 MBCRR, Dana-Farber Cancer Institute 
#    and Harvard University.
#
# Last modified: 5-16-95
########################################################################

DO_SEQ_BRANCH="TRUE"
CUTOFF=0

if test "$1" = -h; then
    echo "Usage: make-cluster [options] pattern_name seq_file [ref_seq_name sec_struct_seq_file]"
    echo
    echo "  -c CUTOFF       use match score cutoff of CUTOFF instead of 0"
    echo "  -d PENALTY      use length dependent gap penalty of PENALTY (gap extension)"
    echo "  -h              print this help"
    echo "  -i PENALTY      use length independent gap penalty of PENALTY (gap open)"
    echo "  -l SCORE        use minimum local score of SCORE"
    echo "  -m MATRIX_FILE  use weight matrix file of MATRIX_FILE instead of patgen.mat"
    echo "  -w WIDTH        use minimum local alignment width of WIDTH instead of 15"
    echo "  -M              only perform maximal (complete) linkage"
    echo
    echo "Default values for the gap penalties and the minimum local score depend on"
    echo "the weight matrix file used.  All options must be specified separately"
    exit 0
fi

while true; do
    case "$1" in
	# make-cluster options.
	-c)
	    CUTOFF="$2"
	    shift ;;
	-M)
	    M_OPT=-m ;;
	# Common options.
	-d|-i|-l|-m|-w)
	    OPTIONS="$OPTIONS $1 $2"
	    shift ;;
	*)
	   break ;;
    esac
    shift
done

if test -n "$M_OPT"; then
    DO_SEQ_BRANCH=FALSE
fi

if test $# -ne 2 -a $# -ne 4; then
    echo 'make-cluster: bad arguments, run "make-cluster -h" for help.' >&2
    exit 1
fi

CLUSTER_NAME=$1
SEQ_FILE=$2

if test ! -s $SEQ_FILE; then
    echo "make-cluster: sequence-file \"$SEQ_FILE\" not found." >&2
    exit 1
fi

if test $# -gt 2; then
    REF_SEQ_NAME=$3
    SS_SEQ_FILE=$4
    if test ! -s $SS_SEQ_FILE; then
	echo "make-cluster: sec_struct_seq_file \"$SS_SEQ_FILE\" not found." >&2
	exit 1
    fi
fi

if test $# -eq 2; then
    MATCH_PROGRAM_NAME="pima-mso -x"
    INPUT_FILE_1=$SEQ_FILE
else
    MATCH_PROGRAM_NAME="pima-mso"
    echo $REF_SEQ_NAME | extract-records - $SEQ_FILE > tmp-ref.$$
    INPUT_FILE_1=tmp-ref.$$
fi

$MATCH_PROGRAM_NAME $OPTIONS $INPUT_FILE_1 $SEQ_FILE tmp-out.$$
if test $? -ne 0; then
    echo 'make-cluster: pima-mso error.' >&2
    exit 1
fi

#
# Check that there are some match scores, otherwise assume there was less than
# two sequences.
#
if test ! -s tmp-out.$$; then
    echo 'make-cluster: two or more sequences with unique names are required.' >&2
    exit 1
fi

# Sort scores 
sort -rn +2 tmp-out.$$ > tmp-sorted.$$

if test "$DO_SEQ_BRANCH" = TRUE; then
    REF_SEQ=`head -1 tmp-sorted.$$ | awk '{print $1}'`
    grep "$REF_SEQ	" tmp-sorted.$$ > tmp-sorted-sb.$$
fi

if test $# -gt 2; then
    rm -f tmp-sorted.$$
fi

for FILE in tmp-sorted*.$$; do
    if test "$FILE" = "tmp-sorted-sb.$$"; then
	# SINGLE linkage
	CLUSTER_METHOD=single
	NEW_CLUSTER_NAME=$CLUSTER_NAME-SB
    else
	# MAXIMAL linkage
	CLUSTER_METHOD=complete
	if test -n "$M_OPT"; then
	    NEW_CLUSTER_NAME=$CLUSTER_NAME
	else
	    NEW_CLUSTER_NAME=$CLUSTER_NAME-ML
	fi
    fi

    rm -f $NEW_CLUSTER_NAME.cluster tmp-cluster.$$

    cluster-pima -c 2 -m $CLUSTER_METHOD -s $CUTOFF < $FILE > tmp-cluster.$$
    if test $? -ne 0; then
	echo 'make-cluster: cluster-pima error.' >&2
	exit 1
    fi

    # Make certain REF_SEQ_NAME is first member of cluster
    if test $# -gt 2; then
	print-cluster -n -r $REF_SEQ_NAME tmp-cluster.$$ > tmp-new-cluster.$$
	mv tmp-new-cluster.$$ tmp-cluster.$$
    fi

    # Add loci not in any family to cluster
    awk '{print NR,$0}' tmp-cluster.$$ | extract-cluster-loci \
    | awk '{print $2}' | sort | uniq > tmp-cluster-loci.$$
    extract-records -i $SEQ_FILE | sort | uniq > tmp-family-loci.$$
    comm -23 tmp-family-loci.$$ tmp-cluster-loci.$$ >> tmp-cluster.$$

    # Add name to cluster(s)
    N_CLUSTERS=`wc -l < tmp-cluster.$$`
    if test $N_CLUSTERS -eq 1; then
	echo $NEW_CLUSTER_NAME"	"`cat tmp-cluster.$$` \
	>> $NEW_CLUSTER_NAME.cluster
    else
	print-cluster -i $NEW_CLUSTER_NAME tmp-cluster.$$ \
	>> $NEW_CLUSTER_NAME.cluster
    fi


    if test ! -s $NEW_CLUSTER_NAME.cluster; then
	exec >&2
	echo
	echo "*********************************************************************"
	echo "> $CLUSTER_NAME.cluster is EMPTY !!"
	echo "> Probable cause: No match scores above the score-cutoff [$CUTOFF]"
	echo "> -- Check the scores in the $CLUSTER_NAME.cluster-scores file"
	echo "> -- (Use a cutoff below that of the highest score generated)"
	echo "> -- Also make sure sequences have unique locus names."
	echo "*********************************************************************"
	echo
	exit 1
    fi
    echo
done

rm tmp-*.$$
exit 0