#22.12.08 - 22.12.09 R Workshop Days 1 & 2

### dplyr
# Load libraries
library(tidyverse)
library(here)

# load file
Ecoli_citrate2 <- read.csv(here("data_in", "Ecoli_citrate.csv"))
Ecoli_citrate2

# Selecting columns and filtering rows
select(Ecoli_citrate, sample, clade, cit, genome_size)
filter(Ecoli_citrate, cit == "plus")
filter(Ecoli_citrate, cit != "plus")


# Pipes
Ecoli_citrate %>%
  filter(cit == "plus") %>%
  select(sample, generation, clade, cit)


select(filter(Ecoli_citrate, cit == "plus"), sample, generation, clade)

#Create a new object with this smaller version of the data
Ecoli_citplus <- Ecoli_citrate %>%
  filter(cit == "plus") %>%
  select(sample, generation, clade)
Ecoli_citplus

### EXERCISE

#a. Using pipes, subset `Ecoli_citrate` to include rows where the clade is ‘Cit+’ and keep only the columns `sample`, `cit`, and `genome_size`.
#How many rows are in that tibble?
#HINT: You can use the `nrow()` function to find out how many rows are in a tibble.

#b. Using pipes, subset `Ecol_citrate` to include rows where the genome_size is greater than 4.6 and generation is greater than 20 000. Then keep only the columns sample, run, clade


# Mutate
Ecoli_citrate %>%
  mutate(genome_bp = genome_size *1e6)


# remove NAs
Ecoli_citrate %>%
  mutate(genome_bp = genome_size *1e6) %>%
  filter(!is.na(clade))

# Split-apply-combine data analysis and the summarize() function
Ecoli_citrate %>%
  group_by(cit) %>%
  summarize(n())

#Calculate mean `genome_size` by mutant status:
Ecoli_citrate %>%
  group_by(cit) %>%
  summarize(mean_size = mean(genome_size, na.rm = TRUE))

# group by multiple columns too:
Ecoli_citrate %>%
  group_by(cit, clade) %>%
  summarize(mean_size = mean(genome_size, na.rm = TRUE))

#discard rows where clade is missing
Ecoli_citrate %>%
  group_by(cit, clade) %>%
  summarize(mean_size = mean(genome_size, na.rm = TRUE)) %>%
  filter(!is.na(clade))

#summarize multiple variables at the same time:
Ecoli_citrate %>%
  group_by(cit, clade) %>%
  summarize(mean_size = mean(genome_size, na.rm = TRUE),
            min_generation = min(generation))

### EXERCISE
#Create a tibble that has removed all of the rows where the clade is "unknown". Then find the mean genome size of each unique clade and the resultant rank of  mean genome size. (note that ranking genome size will not sort the table; the row order will be unchanged. You can use the `arrange()` function to sort the table).
#There are several functions for ranking observations, which handle tied values differently. For this exercise it doesn’t matter which function you choose. Use the help options to find a ranking function.

#*Reminder that unknown is part of the dataset - R exlusively uses NA to indicate no data

Ecoli_citrate %>%
  group_by(clade) %>%
  filter(clade != "unknown") %>%
  summarise(means = mean(genome_size)) %>%
  mutate(rank = rank(means)) %>%
  arrange(rank)

# Key Points
# Use the dplyr package (within the tidyverse) to manipulate tibbles.
# Use select() to choose variables from a tibbles.
# Use filter() to choose data based on values.
# Use group_by() and summarize() to work with subsets of data.
# Use mutate() to create new variables.

############
# EDA
############

# load the data
Calb_R <- read_csv(here("data_in", "Calb_resistance.csv"))
glimpse(Calb_R)


# rename the two numerical variables
Calb_R <- Calb_R %>%
  rename(MIC = `MIC (ug/mL)`, disk = `disk (mm)`)
glimpse(Calb_R)

# Plotting with `ggplot2`


## Plotting the distribution of one varible
ggplot(data = Calb_R, mapping = aes(disk)) +
  geom_histogram()


# change the the number of
ggplot(data = Calb_R, mapping = aes(disk)) +
  geom_histogram(binwidth = 1)


# get rid of NAs
ggplot(data = Calb_R) +
  geom_histogram(mapping = aes(disk), binwidth =1, na.rm = TRUE)

# specify `binwidth`
ggplot(data = Calb_R) +
  geom_histogram(mapping = aes(disk)) +
  stat_bin(mapping = aes(disk), binwidth = 1)


#The difference between geom and stat layers is somewhat nuanced and I refer you to this stackoverflow post
#https://stackoverflow.com/questions/38775661/what-is-the-difference-between-geoms-and-stats-in-ggplot2.
ggplot(data = Calb_R, mapping = aes(disk)) +
  geom_histogram(binwidth = 1) +    # change the number of bins
  theme_bw() +   # change the theme
  labs(x = "disk diffusion zone of inhibition (mm)" , y = "Number of strains")

## Plotting with multiple types of data

# overlay information from a categorical variable
ggplot(data = Calb_R, mapping = aes(disk, fill = site)) +
  geom_histogram(binwidth = 1, na.rm = TRUE) +    # change the number of bins
  theme_bw() +   # change the theme
  labs(x = "disk diffusion zone of inhibition (mm)" , y = "Number of strains")

# separate into three different panels using the function `facet_wrap`:
ggplot(data = Calb_R, mapping = aes(disk)) +
  geom_histogram(binwidth = 1, na.rm = TRUE) +
  theme_bw() +
  labs(x = "disk diffusion zone of inhibition (mm)" , y = "Number of strains") +
  facet_wrap(~site)

#use a different (but related) geom, `geom_freqpoly`, which also requires us to specify that the stat we want is density.

ggplot(data = Calb_R, mapping = aes(disk)) +
  geom_freqpoly(mapping = aes(disk, stat(density)), binwidth = 1, na.rm = TRUE) +
  labs(x = "disk diffusion zone of inhibition (mm)" , y = "Density of strains") +
  facet_wrap(~site)

### EXERCISE:
#Create a new figure to plot the `disk` variable with two panels, one for each `sex` that colours the different strain `site` differently. Use the help menus or "google fu" to figure out how to add a title to the figure (add whatever you like). Since dividing up the data into six categories makes it a bit sparse, play around with either `bin` or `binwidth` to find something that looks more pleasing.

#Extra things to do:
#If you get that done, can you figure out how to stack the different figures/facets vertically instead of horizonatally? (remember that google is your friend). Can you figure out how to change bar colours?
#Can you change the name of the legend?
#What about the legend labels - can you change it from f & m to female and male? (don't recode the dataset, just change it in the figure)
# Change the panel titles from “f” and “m” to “female” and “male” (don’t recode the actual dataset, just #change the title labels)
#Look  up different themes and find one you like
#Change the default colours that are used

gplot(data = Calb_R, mapping = aes(disk, fill = site)) +
  geom_histogram(binwidth = 2, na.rm = TRUE) +
  theme_bw() +
  labs(x = "disk diffusion zone of inhibition (mm)" , y = "Number of strains") +
  facet_wrap(~gender)

# Let's change to another geom method, `geom_boxplot`.
ggplot(data = Calb_R) +
  geom_boxplot(mapping = aes(x = site, y = disk), na.rm = TRUE) +
  theme_bw()

p <- ggplot(data = Calb_R) +
  geom_boxplot(mapping = aes(x = site, y = disk), na.rm = TRUE) +
  theme_bw()
p

#change the axes labels (this is almost always true) and add the tukey test results using the `annotate` layer.
p +
  scale_y_reverse(limits = c(50, 0)) +
  labs(y = "disk diffusion zone of inhibition (mm)" , y = "site of strain isolation")

#violin plot
p2 <- ggplot(data = Calb_R) +
  geom_violin(mapping = aes(x = site, y = disk), na.rm = TRUE) +
  theme_bw() +
  scale_y_reverse(limits = c(50, 0)) +
  labs(y = "disk diffusion zone of inhibition (mm)" , y = "site of strain isolation")
p2


#add different summary statistics directly on top, like the mean or the median.
p2 +
  stat_summary(fun="mean",
               geom="point",
               shape=23,
               size=2)

p3 <- ggplot(data = Calb_R, mapping = aes(x = site, y = disk)) +
  geom_violin(na.rm = TRUE) +
  theme_bw() +
  scale_y_reverse(limits = c(50, 0)) +
  labs(y = "disk diffusion zone of inhibition (mm)" , y = "site of strain isolation") +  stat_summary(fun="mean", geom="point", shape=23, size=3, na.rm = TRUE, fill = "red")

p3

# sina plot.
install.packages("ggforce")
library(ggforce)
p4 <- ggplot(data = Calb_R, mapping = aes(x = site, y = disk)) +
  geom_sina(na.rm=TRUE) +
  theme_bw() +
  scale_y_reverse(limits = c(50, 0)) +
  labs(y = "disk diffusion zone of inhibition (mm)" , y = "site of strain isolation") +
  stat_summary(fun="mean", geom="point", shape=23, size=3, na.rm = TRUE, fill = "red")
p4

p4 <- ggplot(data = Calb_R, mapping = aes(x = site, y = disk)) +
  geom_sina(na.rm=TRUE) +
  theme_bw() +
  scale_y_reverse(limits = c(50, 0)) +
  labs(y = "disk diffusion zone of inhibition (mm)" , y = "site of strain isolation") +
  stat_summary(fun="mean", geom="crossbar", linewidth=0.5, na.rm = TRUE, colour = "red")
p4


# histogram:
pM <- ggplot(data = Calb_R, mapping = aes(MIC)) +
  geom_histogram(na.rm = TRUE)

pM

pM +
  scale_x_continuous(trans="log2")

# switch to `geom_bar()`:
ggplot(data = Calb_R, mapping = aes(MIC)) +
  geom_bar(na.rm = TRUE) +
  scale_x_continuous(trans="log2", breaks = unique(Calb_R$MIC)) +
  labs(x = expression(MIC[50]), y = "Number of strains")

# add the site information into this plot?

Calb_R_sum <- Calb_R %>%
  group_by(site, MIC) %>%
  summarize(num_strains = n())

Calb_R_sum


pSum <- ggplot(Calb_R_sum, mapping = aes(MIC, num_strains, fill= site)) +
  geom_bar(stat = "identity", position=position_dodge(preserve = "single"), na.rm = TRUE, width=0.75) +
  scale_x_continuous(trans="log2", breaks = unique(Calb_R$MIC)) +
  labs(x = expression(MIC[50]), y = "Number of strains") +
  scale_fill_discrete(name = "Site of strain isolation")
pSum

### EXERCISE
# Using the help menu, explore the options for `geom_bar`. What does `stat = "identity"` do? What about `position_dodge`? How do you change the width of the bar to be equal to the total width of all elements at a position?

## Two numerical variables
gP <- ggplot(Calb_R, aes(MIC, disk)) +
  scale_x_continuous(trans="log2", breaks = unique(Calb_R$MIC)) +
  scale_y_reverse(limits = c(50, 0)) +
  labs(y = "disk diffusion zone of inhibition (mm)" , x = expression(MIC[50])) +
  theme_bw()

gP +
  geom_point(na.rm =TRUE)

# add a line of fit:

gP +
  geom_point(na.rm =TRUE) +
  geom_smooth(method = "lm", na.rm=TRUE)

# change the opacity of the points (which is refereed to as alpha).
gP +
  geom_smooth(method = "lm", na.rm=TRUE) +
  geom_jitter(alpha = 0.5, color = "tomato", width = 0.2)

### EXERCISE:
#The way we plotted the scatterplot ignored the other variables in our dataset that we know are important. How can you improve this figure to add in that additional information? There are many different options, see how many you can think of (and implement).

# Saving graphs

pdf(here("figures_out", "190411Calb_R_MIC_DDA.pdf"), width=4, height=4)
gP +
  geom_smooth(method = "lm", na.rm=TRUE) +
  geom_jitter(alpha = 0.5, color = "tomato", width = 0.2)
dev.off()
system("open figures_out/190411Calb_R_MIC_DDA.pdf")

## Linking statistical analysis to data visualization

# one-dimensional visualizations, i.e., histograms
Calb_R <- read_csv(here("data_in", "Calb_resistance.csv"))

Calb_R <- Calb_R %>%
  rename(MIC = `MIC (ug/mL)`, disk = `disk (mm)`)

Calb_R

ggplot(data = Calb_R, mapping = aes(disk)) +
  geom_histogram()

### EXERCISE
#By eye, does this variable look normally distributed? Why or why not?


# test for normality using the `shapiro.test()
shapiro.test(Calb_R$disk)


# look at multiple variables.
#A reminder of what our dataset it:

glimpse(Calb_R)


#subset the data and test for normality.
Calb_R_f <- filter(Calb_R, sex == "f")
Calb_R_m <- filter(Calb_R, sex == "m")
shapiro.test(Calb_R_f$disk)
shapiro.test(Calb_R_m$disk)

# non-parametric wilcoxon test (or Mann-Whitney U test) that compares data ranks
# specify x and y
wilcox.test(Calb_R_f$disk, Calb_R_m$disk)

# use equation format
wilcox.test(disk ~ sex, data = Calb_R)

ggplot(data = Calb_R, mapping = aes(disk, fill = site)) +
  geom_histogram(binwidth = 2, na.rm = TRUE) +
  theme_bw() +
  labs(x = "disk diffusion zone of inhibition (mm)" , y = "Number of strains")

#ANOVA
aov(disk ~ site, data = Calb_R)
anova_test <- aov(disk ~ site, data = Calb_R)
summary(anova_test)

#install one more package, the broom package, that will clean up this output.
install.packages("broom")
library(broom)
tidy(anova_test)


anova_test_tidy <- tidy(anova_test)
anova_test_tidy

anova_test_tidy$p.value[1]

# tukey test:
TukeyHSD(anova_test)

### Exercise
#Conduct a t-test comparing skin and oral samples. First subset the data frame as needed, then check for normality.
#?t.test might be helpful

# two-way ANOVA.
full_anova_test <- aov(disk ~ site*sex, data = Calb_R)
tidy(full_anova_test)

### Exercise
#Conduct a  statistical test to determine whether site or sex (or their interaction) has a significant effect on MIC.

ggplot(Calb_R, aes(MIC, disk)) +
  scale_x_continuous(trans="log2", breaks = unique(Calb_R$MIC)) +
  scale_y_reverse(limits = c(50, 0)) +
  labs(y = "disk diffusion zone of inhibition (mm)" , x = expression(MIC[50])) +
  theme_bw() +
  geom_point(na.rm =TRUE) +
  geom_jitter(alpha = 0.5, color = "tomato", width = 0.2)

# look for a correlation between these two resistance variables.
cor_test <- cor.test(Calb_R$MIC, Calb_R$disk, method = "spearman")
cor_test
tidy(cor_test)

ggplot(Calb_R, aes(MIC, disk)) +
  scale_x_continuous(trans="log2", breaks = unique(Calb_R$MIC)) +
  scale_y_reverse(limits = c(50, 0)) +
  labs(y = "disk diffusion zone of inhibition (mm)" , x = expression(MIC[50])) +
  theme_bw() +
  geom_point(na.rm =TRUE) +
  geom_jitter(alpha = 0.5, color = "tomato", width = 0.2)+
  geom_smooth(method = "lm", na.rm=TRUE)

# Attributes
#This code was created by Aleeza Gerstein, https://microstatslab.ca, aleeza.gerstein@umanitoba.ca at the University of Manitoba based on material from:
# The Carpentries https://carpentries.org/
# Wikipedia en.wikipedia.org
#Applied Biostats: https://bookdown.org/ybrandvain/Applied-Biostats/
#Modern Dive: https://moderndive.com
#Diva Jain: https://codeburst.io/2-important-statistics-terms-you-need-to-know-in-data-science-skewness-and-kurtosis-388fef94eeaa
#Alan Downey, :Probably Overthinking it, http://allendowney.blogspot.com
#Introduction to Statistical Ideas and Methods online modules, https://stats.onlinelearning.utoronto.ca/
#Statistics Teacher: What is power? http://www.statisticsteacher.org/2017/09/15/what-is-power/ #Concepts and Applications of Inferential Statistics, http://vassarstats.net
#Towards Data Science, blog post: https://towardsdatascience.com/intro-to-descriptive-statistics-252e9c464ac9, https://towardsdatascience.com/data-types-in-statistics-347e152e8bee

#Made available under the [Creative Commons Attribution license](https://creativecommons.org/licenses/by/4.0/). [License](https://datacarpentry.org/R-genomics/LICENSE.html).