#! /bin/sh ######################################################################## # SCRIPT: make-cluster.sh # # Clusters the sequences in seq_input_file into one or more cluster # (i.e. family) trees using two different linkage rules: 1) maximal # linkage (Smith and Smith, 1990) and 2) sequential branching # (see Smith and Smith, 1991). All pairwise scores are # sorted high-to-low and the first sequence from the highest scoring # pair is chosen as the "reference sequence" and the sequences # clustered based strictly on the order of similarity to the reference # sequence. Maximal linkage clusters are labelled '-ML' appended to # the CLUSTER_NAME; sequential branching clusters are labeled '-SB'. # # If secondary structure sequences are provided for one or more of the # primary sequences (one of which must be designated as the reference # sequence) then only the sequential branching clustering is performed. # # Where: # CLUSTER_NAME is a name used to label the cluster # SCORE_CUTOFF is the lowest match score to be used within a cluster # (recommended value: 25.0); a value of 0 will group all input # seqs into a single cluster, but the final pattern may be completely # degenerate. # REF_SEQ_NAME [optional; if specified, then sec_struct_seq_file must also # be specified] is the locus name of one of the primary sequences for # which the secondary structure is in the file seq_struct_seq_file, # sec_struct_seq_file [optional; if specified, then REF_SEQ_NAME must also # be specified] contains secondary structure # sequences for one or more of the primary sequences in the set. # The secondary structure sequences in the file # sec_struct_seq_file should have a locus name that matches # the locus name of it's corresponding # primary sequence with the suffix '.ss' (e.g. 2PLV.ss). Each element # of an alpha-helix, 3-10 helix and beta-strand must be designated # 'h', 'g', and 'e', repectively. All other characters in the secondary # structure sequences will be ignored with respect to the # the structure-dependent gap penalty. # # Output files: CLUSTER_NAME-[ML|SB].cluster # # References: # Smith, R.F. and Smith, T.F. (1990). Automatic generation of primary # sequence patterns from sets of related protein sequences. # PNAS 87:118-122. # Smith, R.F. and Smith, T.F. (1991). Pattern-Induced Multi-Sequence # Alignment (PIMA) algorithm employing secondary structure-dependent # gap penalties for use in comparitive protein modelling. # Submitted to Protein Engineering. # # # Smith, Randall F. and Temple F. Smith (1991) # Molecular Biology Computer Research Resource, LG-127 # Dana-Farber Cancer Institute and School of Public Health # Harvard University # 44 Binney St., Boston MA 02115 USA (617)732-3746 # # Internet: rsmith@mbcrr.harvard.edu # Bitnet: rsmith%mbcrr@husc6.bitnet # # Copyright (c) 1989,1990,1991 MBCRR, Dana-Farber Cancer Institute # and Harvard University. # # Last modified: 5-16-95 ######################################################################## DO_SEQ_BRANCH="TRUE" CUTOFF=0 if test "$1" = -h; then echo "Usage: make-cluster [options] pattern_name seq_file [ref_seq_name sec_struct_seq_file]" echo echo " -c CUTOFF use match score cutoff of CUTOFF instead of 0" echo " -d PENALTY use length dependent gap penalty of PENALTY (gap extension)" echo " -h print this help" echo " -i PENALTY use length independent gap penalty of PENALTY (gap open)" echo " -l SCORE use minimum local score of SCORE" echo " -m MATRIX_FILE use weight matrix file of MATRIX_FILE instead of patgen.mat" echo " -w WIDTH use minimum local alignment width of WIDTH instead of 15" echo " -M only perform maximal (complete) linkage" echo echo "Default values for the gap penalties and the minimum local score depend on" echo "the weight matrix file used. All options must be specified separately" exit 0 fi while true; do case "$1" in # make-cluster options. -c) CUTOFF="$2" shift ;; -M) M_OPT=-m ;; # Common options. -d|-i|-l|-m|-w) OPTIONS="$OPTIONS $1 $2" shift ;; *) break ;; esac shift done if test -n "$M_OPT"; then DO_SEQ_BRANCH=FALSE fi if test $# -ne 2 -a $# -ne 4; then echo 'make-cluster: bad arguments, run "make-cluster -h" for help.' >&2 exit 1 fi CLUSTER_NAME=$1 SEQ_FILE=$2 if test ! -s $SEQ_FILE; then echo "make-cluster: sequence-file \"$SEQ_FILE\" not found." >&2 exit 1 fi if test $# -gt 2; then REF_SEQ_NAME=$3 SS_SEQ_FILE=$4 if test ! -s $SS_SEQ_FILE; then echo "make-cluster: sec_struct_seq_file \"$SS_SEQ_FILE\" not found." >&2 exit 1 fi fi if test $# -eq 2; then MATCH_PROGRAM_NAME="pima-mso -x" INPUT_FILE_1=$SEQ_FILE else MATCH_PROGRAM_NAME="pima-mso" echo $REF_SEQ_NAME | extract-records - $SEQ_FILE > tmp-ref.$$ INPUT_FILE_1=tmp-ref.$$ fi $MATCH_PROGRAM_NAME $OPTIONS $INPUT_FILE_1 $SEQ_FILE tmp-out.$$ if test $? -ne 0; then echo 'make-cluster: pima-mso error.' >&2 exit 1 fi # # Check that there are some match scores, otherwise assume there was less than # two sequences. # if test ! -s tmp-out.$$; then echo 'make-cluster: two or more sequences with unique names are required.' >&2 exit 1 fi # Sort scores sort -rn +2 tmp-out.$$ > tmp-sorted.$$ if test "$DO_SEQ_BRANCH" = TRUE; then REF_SEQ=`head -1 tmp-sorted.$$ | awk '{print $1}'` grep "$REF_SEQ " tmp-sorted.$$ > tmp-sorted-sb.$$ fi if test $# -gt 2; then rm -f tmp-sorted.$$ fi for FILE in tmp-sorted*.$$; do if test "$FILE" = "tmp-sorted-sb.$$"; then # SINGLE linkage CLUSTER_METHOD=single NEW_CLUSTER_NAME=$CLUSTER_NAME-SB else # MAXIMAL linkage CLUSTER_METHOD=complete if test -n "$M_OPT"; then NEW_CLUSTER_NAME=$CLUSTER_NAME else NEW_CLUSTER_NAME=$CLUSTER_NAME-ML fi fi rm -f $NEW_CLUSTER_NAME.cluster tmp-cluster.$$ cluster-pima -c 2 -m $CLUSTER_METHOD -s $CUTOFF < $FILE > tmp-cluster.$$ if test $? -ne 0; then echo 'make-cluster: cluster-pima error.' >&2 exit 1 fi # Make certain REF_SEQ_NAME is first member of cluster if test $# -gt 2; then print-cluster -n -r $REF_SEQ_NAME tmp-cluster.$$ > tmp-new-cluster.$$ mv tmp-new-cluster.$$ tmp-cluster.$$ fi # Add loci not in any family to cluster awk '{print NR,$0}' tmp-cluster.$$ | extract-cluster-loci \ | awk '{print $2}' | sort | uniq > tmp-cluster-loci.$$ extract-records -i $SEQ_FILE | sort | uniq > tmp-family-loci.$$ comm -23 tmp-family-loci.$$ tmp-cluster-loci.$$ >> tmp-cluster.$$ # Add name to cluster(s) N_CLUSTERS=`wc -l < tmp-cluster.$$` if test $N_CLUSTERS -eq 1; then echo $NEW_CLUSTER_NAME" "`cat tmp-cluster.$$` \ >> $NEW_CLUSTER_NAME.cluster else print-cluster -i $NEW_CLUSTER_NAME tmp-cluster.$$ \ >> $NEW_CLUSTER_NAME.cluster fi if test ! -s $NEW_CLUSTER_NAME.cluster; then exec >&2 echo echo "*********************************************************************" echo "> $CLUSTER_NAME.cluster is EMPTY !!" echo "> Probable cause: No match scores above the score-cutoff [$CUTOFF]" echo "> -- Check the scores in the $CLUSTER_NAME.cluster-scores file" echo "> -- (Use a cutoff below that of the highest score generated)" echo "> -- Also make sure sequences have unique locus names." echo "*********************************************************************" echo exit 1 fi echo done rm tmp-*.$$ exit 0