#! /bin/sh
########################################################################
#                  SCRIPT: make-pattern.sh
#
# Simultaneously constructs an amino-acid class covering (AACC) pattern and
#   a pattern-induced multi-sequence alignment for the sequences in
#   each cluster in cluster_file.  If secondary structure sequences are
#   provided for one or more of the primary sequences (one of which must be
#   designated as a "reference sequence") then the sequences are
#	multiple-aligned using a secondary structure-dependent gap penalty
#   algorithm.
#
#   cluster-file format: CLUSTER_NAME<tab>cluster-nested-list
#        (as created by make-cluster.sh)
#
#   If present, the secondary structure sequences in the file
#        sec_struct_seq_file should have a locus name that matches
#        the locus name of it's corresponding primary sequence
#        with the suffix '.ss' (e.g. 2PLV.ss).  Each element
#        of an alpha-helix, 3-1 helix and beta-strand must be designated
#        'h', 'g', and 'e', repectively.  All other characters in the secondary
#        strucuture sequences will be ignored with respect to the
#        the structure-dependent gap penalty.
#    ss_gap_penalty is the penalty for placing a gap 
#       within a alpha- or 3-10 helix or a beta-strand.
#   
# Output files created:
#   "CLUSTER_NAME-[ML|SB][.ext].pima" 
#        The pattern-induced multiple-sequence alignment of each clustered
#        sequence set; includes the "nodal" patterns used to align the
#        sequences (the nodal patterns have the locus name 
#        cluster_name.ext -- extensions added to the sequence names
#        match the extension of the nodal-pattern used to align the
#        corresponding sequence subset, e.g. seq_1.1 and seq_2.1 would be
#        aligned by nodal-pattern cluster_name.1 .
#
# References:
#   Smith, R.F. and Smith, T.F. (1990).  Automatic generation of primary
#     sequence patterns from sets of related protein sequences.
#     PNAS 87:118-122.
#   Smith, R.F. and Smith, T.F. (1991).  Pattern-Induced Multi-Sequence
#     Alignment (PIMA) algorithm employing secondary structure-dependent
#     gap penalties for use in comparitive protein modelling.
#     Submitted to Protein Engineering.
#
# Smith, Randall F. and Temple F. Smith (1991)
# Molecular Biology Computer Research Resource
# Galleria Level 1
# Dana-Farber Cancer Institute and School of Public Health
# Harvard University
# 44 Binney St., Boston MA 02115 USA     (617)732-3746
# Email: rsmith@bcm.tmc.edu
#
# Copyright (c) 1990,1991 MBCRR, Dana-Farber Cancer Institute 
#  and Harvard University.
#
# Last modified: 5-16-95
########################################################################

if test "$1" = -h; then
    echo "Usage: make-pattern [options] seq_file cluster_file [sec_struct_seq_file]"
    echo
    echo "  -d PENALTY      use length dependent gap penalty of PENALTY (gap extension)"
    echo "  -h              print this help"
    echo "  -i PENALTY      use length independent gap penalty of PENALTY (gap open)"
    echo "  -l SCORE        use minimum local score of SCORE"
    echo "  -m MATRIX_FILE  use weight matrix file of MATRIX_FILE instead of patgen.mat"
    echo "  -n              do not output node extensions"
    echo "  -t PENALTY      use sec. struct. gap penalty of PENALTY"
    echo "  -u CHARS        use secondary structure characters CHARS instead of \"hge\""
    echo "  -w WIDTH        use minimum local alignment width of WIDTH instead of 15"
    echo
    echo "Default values for the gap penalties and the minimum local score depend on"
    echo "the weight matrix file used.  All options must be specified separately"
    exit 0
fi

while true; do
    case "$1" in
	-n)
	    N_OPT=-n ;;
	-d|-i|-l|-m|-t|-u|-w)
	    OPTIONS="$OPTIONS $1 $2"
	    shift ;;
	*)
	   break ;;
    esac
    shift
done

if test $# -ne 2 -a $# -ne 3; then
    echo 'make-pattern: bad arguments, run "make-pattern -h" for help.' >&2
    exit 1
fi

SEQ_FILE=$1
if test ! -s $SEQ_FILE; then
    echo "make-pattern: sequence-file \"$SEQ_FILE\" not found." >&2
    exit 1
fi

CLUSTER_FILE=$2
if test ! -s $CLUSTER_FILE; then
    echo "make-pattern: sequence-file \"$CLUSTER_FILE\" not found." >&2
    exit 1
fi

if test $# -gt 2; then
    SS_SEQ_FILE=$3
    if test ! -s $SS_SEQ_FILE; then
	echo "make-pattern: sec_struct_seq_file \"$SS_SEQ_FILE\" not found." >&2
	exit 1
    fi
fi

#
# Save the output of extract-records in a file and open it on file descriptor
# 3.  Changed from a pipe to avoid having the while loop become a subshell.
# The while loop will also become a subshell if redirection is used, so avoid
# that also.  Want to avoid having the while loop be a subshell otherwise
# there is no way to catch an error and variables are not preserved.
#
extract-records -i $CLUSTER_FILE > tmp-extract.$$
exec 3<tmp-extract.$$
while read CLUSTER_NAME_FILE <&3; do
    SEQ_NUM=0

    echo $CLUSTER_NAME_FILE | extract-records - $CLUSTER_FILE > tmp-cluster.$$
    N_WORDS=`wc -w < tmp-cluster.$$`

    #skip this loop if the original cluster size = 1 entry
    if test $N_WORDS -gt 2; then
	while test $N_WORDS -gt 2; do
	    SEQ_NUM=`expr $SEQ_NUM + 1`

	    # Avoid a pipe, see note above while loop.  Be sure print-cluster
	    # works, otherwise tmp-cluster.$$ won't change nor will $N_WORDS
	    # and the while loop right above will become infinite.
	    print-cluster -m tmp-cluster.$$ > tmp-print.$$
	    if test $? -ne 0 -o ! -s tmp-cluster.$$; then
		echo 'make-pattern: print-cluster error.' >&2
		exit 1
	    fi
	    exec 4<tmp-print.$$

	    while read CLUSTER_NAME SEQ1 SCORE SEQ2 <&4; do
		J=1
		for I in "$SEQ1" "$SEQ2"; do
		    # Use leading "./" just in case locus begins with a "/".
		    if test -s ./"$I"; then
			mv "$I" tmp-infile-$J.$$	
		    else
			echo "$I" | extract-records - $SEQ_FILE \
			> tmp-infile-$J.$$
			if test $# -gt 2; then 
			    echo "$I".ss | extract-records - $SS_SEQ_FILE \
			    >> tmp-infile-$J.$$
			fi
		    fi
		    if test ! -s tmp-infile-$J.$$; then
			echo "make-pattern: input file tmp-infile-$J.$$ is empty." >&2
			exit 1
		    fi
		    J=2 # Should use `expr $J + 1`, but only happens once.
		done

		EXTRA_OPTIONS=
		if test $# -gt 2; then
		    EXTRA_OPTIONS=-s
		fi
		if test ! -n "$N_OPT"; then
		    EXTRA_OPTIONS="$EXTRA_OPTIONS -e .$SEQ_NUM"
		fi

		pima-pm $OPTIONS $EXTRA_OPTIONS \
		tmp-infile-1.$$ tmp-infile-2.$$ tmp-pat-out.$$ \
		$CLUSTER_NAME > tmp-score.$$ 

		if test $? -ne 0; then
		    echo 'make-pattern pima-pm error.' >&2
		    exit 1
		fi
		NEW_SCORE=`tail -1 tmp-score.$$`

		print-cluster -s "$SEQ1" "$SCORE" "$SEQ2" $CLUSTER_NAME.$SEQ_NUM tmp-cluster.$$ > tmp-new-cluster.$$

		mv tmp-pat-out.$$ $CLUSTER_NAME.$SEQ_NUM

		echo "$SEQ_NUM	$SEQ1	$SEQ2	$NEW_SCORE"

		mv tmp-new-cluster.$$ tmp-cluster.$$
		rm tmp-infile-1.$$ tmp-infile-2.$$
	    done
	    N_WORDS=`wc -w < tmp-cluster.$$`
	done

	if test -n "$N_OPT"; then
	    mv $CLUSTER_NAME_FILE.$SEQ_NUM $CLUSTER_NAME_FILE.pima
	else
	    trim-root-num $CLUSTER_NAME_FILE.$SEQ_NUM > $CLUSTER_NAME_FILE.pima
	    rm $CLUSTER_NAME_FILE.$SEQ_NUM
	fi
	rm tmp-cluster.$$ tmp-print.$$ tmp-score.$$ 

	# Set a flag that a pattern was generated.  Just in case all the
	# clusters have just a single sequence.
	MADE_PATTERN=TRUE

	#skip this whole loop if the original cluster size = 1 entry
    fi
    echo
done
rm tmp-extract.$$

if test ! "$MADE_PATTERN"; then
    echo 'make-pattern: no patterns generated, score cutoff probably to high.' >&2
    rm tmp-cluster.$$
    exit 1
fi

exit 0