#!/usr/bin/env Rscript ## Identifies the approximate number of most relevant transcripts based on finding the point representing the elbow in the curve of expression ~ ordered transcript main = function () { suppressPackageStartupMessages(library("argparse")) suppressPackageStartupMessages(library("tidyverse")) parser = ArgumentParser() parser$add_argument("--E_inputs", help="file.isoform.TMM.EXPR.matrix.E-inputs", required=TRUE, nargs=1) parser$add_argument("--out_pdf", help="name for output pdf filename", required=FALSE, nargs=1, default="estimate_TPM_threshold.pdf") args = parser$parse_args() E_inputs_filename = args$E_inputs output_pdf_filename = args$out_pdf message("-parsing:", E_inputs_filename) if (grepl(".gz$", E_inputs_filename)) { data = read.table(gzfile(E_inputs_filename), header=T, com='', sep="\t", stringsAsFactors = F) } else { data = read.table(E_inputs_filename, header=T, com='', sep="\t", stringsAsFactors = F) } data = data %>% filter(max_expr_over_samples > 0.05 & max_expr_over_samples <= 2^5) # must have some evidence of expression if (nrow(data) < 1000) { message("Too few data points in the low-expression range to perform a meaningful analysis here.") quit(save = "no", status = 0, runLast = FALSE) } data = data %>% arrange(desc(max_expr_over_samples)) %>% mutate(r=row_number()) data = data %>% mutate(logexpr = log2(max_expr_over_samples+1)) normalize = function(x) { x = unlist(x) min_x = min(x) max_x = max(x) vals = (x-min_x)/(max_x-min_x) return(vals) } data$x = normalize(list(data$logexpr)) data$y = normalize(list(data$r)) message("-performing elbow analysis to select threshold.") get_dist_point_line <- function(point, line_coord1, line_coord2) { # derived from https://github.com/ropensci/pathviewr/blob/HEAD/R/analytical_functions.R ## Compute v1 <- line_coord1 - line_coord2 v2 <- point - line_coord1 m <- cbind(v1, v2) dist <- abs(det(m)) / sqrt(sum(v1 * v1)) ## export return(dist) } # get a line from the first and last point of the elbow curve data = data %>% arrange(x) first_pt = data %>% head(n=1) last_pt = data %>% tail(n=1) xs = c(first_pt$x, last_pt$x) ys = c(first_pt$y, last_pt$y) line = lm(ys ~ xs) coeffs = coefficients(line) intercept = coeffs[1] slope = coeffs[2] dist_to_line_df = do.call(rbind, apply(data.frame(x=data$x, y=data$y), 1, function(row) { x = row[1] y = row[2] point_dist = get_dist_point_line(c(x,y), c(first_pt$x, first_pt$y), c(last_pt$x, last_pt$y)) return(data.frame(dist=point_dist)) }) ) data = bind_cols(data, dist_to_line_df) # define threshold as that transcript with greatest distance to the line (elbow transcript) threshold_entry = data %>% arrange(desc(dist)) %>% head(n=1) tpm_x = threshold_entry$max_expr_over_samples # compute some stats based on that threshold filtered_data = data %>% filter(max_expr_over_samples >= tpm_x) %>% arrange(desc(length)) sum_length = sum(filtered_data$length) filtered_data = filtered_data %>% mutate(cumsum_len = cumsum(length)) half_length = sum_length / 2 N50_entry = filtered_data %>% filter(cumsum_len <= half_length) %>% filter(row_number() == n()) Ex = threshold_entry %>% pull(X.Ex) N50 = N50_entry %>% pull(length) logexpr_x = threshold_entry$logexpr # ensure line is above the elbow line_y = threshold_entry$x * slope + intercept pdf(output_pdf_filename) p = data %>% ggplot(aes(x=logexpr, y=r)) + geom_point() if (line_y < threshold_entry$y) { message("Not finding a lower elbow curve, cannot estimate min threshold") } else { num_transcripts = data %>% filter(max_expr_over_samples >= tpm_x) %>% nrow() message("-Number of transcripts >= tpm threshold: ", num_transcripts) message("-Fraction of expression data represented: ", Ex) message("-Contig N50 based on these transcripts: ", N50) message("-selected TPM threshold: ", tpm_x) p = p + geom_vline(xintercept=logexpr_x, color='red') + annotate("text", x=threshold_entry$logexpr, y=num_transcripts, label=paste0(" # transcripts =", num_transcripts, " Ex=", Ex, " N50=", N50, " TPMthresh=", tpm_x), hjust=0) } plot(p) message("-Done. See pdf: estimate_TPM_threshold.pdf") quit(save = "no", status = 0, runLast = FALSE) } if (length(sys.calls())==0) { main() }