#22.12.08 R Workshop Day 2

### ggplot & stats

# Download the data from the internet. This only needs to happen once, so the
# download is skipped when the file is already present locally.
# Assumes the `here` and `readr` packages are already attached in this session.
calb_csv <- here("data_in", "Calb_resistance.csv")
if (!file.exists(calb_csv)) {
  # Make sure the destination folder exists before writing into it
  dir.create(dirname(calb_csv), recursive = TRUE, showWarnings = FALSE)
  download.file(
    "http://home.cc.umanitoba.ca/~gersteia/MBIO7040/Calb_resistance.csv",
    calb_csv
  )
}

# load the data
Calb_R <- read_csv(calb_csv)
# look at the data

# rename the two numerical variables

# Plotting the distribution of one variable: histogram of `disk`,
# using the default binning
ggplot(Calb_R, aes(x = disk)) +
  geom_histogram()

# Plotting with multiple types of data: histogram of `disk` with the bars
# filled according to `type`
# NOTE(review): this uses `type`, while a later histogram fills by `site` -
# confirm which column name the data actually has.
ggplot(Calb_R, aes(x = disk, fill = type)) +
  geom_histogram(na.rm = TRUE, binwidth = 1) +   # bins one unit wide
  xlab("disk diffusion zone of inhibition (mm)") +
  ylab("Number of strains") +
  theme_bw()   # black-and-white theme instead of the default

############################################
# EXERCISE
#Create a new figure to plot the `disk` variable with two panels, one for each `sex`, that colours the different strain `site` differently. Use the help menus or "google fu" to figure out how to add a title to the figure (add whatever you like). Since dividing up the data into six categories makes it a bit sparse, play around with either `bins` or `binwidth` to find something that looks more pleasing.

#Extra things to do:
#If you get that done, can you figure out how to stack the different figures/facets vertically instead of horizontally? (remember that google is your friend). Can you figure out how to change bar colours?
#Can you change the name of the legend?
#What about the legend labels - can you change it from f & m to female and male? (don't recode the dataset, just change it in the figure)
#Change the panel titles from “f” and “m” to “female” and “male” (don’t recode the actual dataset, just change the title labels)
#Look  up different themes and find one you like
#Change the default colours that are used
############################################


# Base plot: boxplot of `disk` for each `type`, stored in `p` and then shown
p <- ggplot(Calb_R) +
  geom_boxplot(aes(type, disk), na.rm = TRUE) +
  theme_bw()
p

# Second continuous variable: MIC
# Default histogram, stored in `pM` (it is not displayed here)
pM <- ggplot(Calb_R, aes(x = MIC)) +
  geom_histogram(na.rm = TRUE)

# Bar chart of MIC on a log2 x axis, with an axis break at every distinct
# value found in the MIC column
ggplot(Calb_R, aes(x = MIC)) +
  geom_bar(na.rm = TRUE) +
  scale_x_continuous(breaks = unique(Calb_R[["MIC"]]), trans = "log2") +
  xlab(expression(MIC[50])) +
  ylab("Number of strains")

#######################################
## EXERCISE
#Using the help menu, explore the options for geom_bar. What does stat = "identity" do? What about position_dodge? How do you change the width of the bar to be equal to the total width of all elements at a position?

#Conduct a statistical test to determine whether type or gender (or their interaction) has a significant effect on MIC.
#######################################

# Two numerical variables: MIC (x, log2 axis) against disk (y, reversed axis
# running from 50 down to 0). `gP` only holds the mapping, scales, labels and
# theme; no geom layer is added at this point.
gP <- ggplot(data = Calb_R, mapping = aes(x = MIC, y = disk)) +
  theme_bw() +
  xlab(expression(MIC[50])) +
  ylab("disk diffusion zone of inhibition (mm)") +
  scale_y_reverse(limits = c(50, 0)) +
  scale_x_continuous(breaks = unique(Calb_R[["MIC"]]), trans = "log2")

#######################################
## EXERCISE
#The way we plotted the scatterplot ignored the other variables in our dataset that we know are important. How can you improve this figure to add in that additional information? There are many different options, see how many you can think of (and implement).
#######################################

# Saving graphs
# Build the figure once so both output files are guaranteed to hold the same
# plot: the `gP` base with a linear-model smooth and jittered points on top.
mic_disk_plot <- gP +
  geom_smooth(method = "lm", na.rm = TRUE) +
  geom_jitter(alpha = 0.5, color = "tomato", width = 0.2)

# PDF through the graphics device. print() is explicit because ggplot objects
# are only drawn automatically at the interactive top level; when the script
# is run with source() nothing would be drawn and the PDF would have no pages.
pdf(here("figures_out", "201123Calb_R_MIC_DDA.pdf"), width = 4, height = 4)
print(mic_disk_plot)
dev.off()

# PNG through ggsave(). The plot is passed explicitly rather than relying on
# the last plot displayed, and it is written to the same `figures_out` folder
# as the PDF (previously "./Figures", which did not match the PDF location).
ggsave(filename = "201123Calb_R_MIC_DDA.png",
       plot = mic_disk_plot,
       device = "png",
       path = here("figures_out"),
       width = 5,
       height = 5,
       units = "in")


## Statistics

# One-dimensional data: histogram of `disk` (default binning), used to judge
# the shape of the distribution by eye
ggplot(Calb_R, aes(x = disk)) +
  geom_histogram()

### EXERCISE
#By eye, does this variable look normally distributed? Why or why not?

#We can statistically test for normality using the `shapiro.test()`:


#First let's ignore the different sites and test whether disk diffusion resistance differed between strains isolated from males and females. To do that we're going to use a two-sample t-test. A t-test assumes normally distributed data, so first let's subset the data and test for normality.

# do the non-parametric wilcoxon test (or Mann-Whitney U test) that compares data ranks instead.

# Histogram of `disk` with bars filled by `site`, using bins two units wide
ggplot(Calb_R, aes(x = disk, fill = site)) +
  geom_histogram(na.rm = TRUE, binwidth = 2) +
  xlab("disk diffusion zone of inhibition (mm)") +
  ylab("Number of strains") +
  theme_bw()

#We can similarly ignore sex and test the effect of site. In this case we have more than two groups, so we're going to use an ANOVA test.

#When you run the ANOVA we don't actually get all the information we need out of just the model `aov` call. We need to wrap that in a second function to pull out additional information:

#We're going to install one more package, the broom package, that will clean up this output.

# Install broom only when it is not already available, so that re-running the
# script does not download and reinstall the package every time.
if (!requireNamespace("broom", quietly = TRUE)) {
  install.packages("broom")
}
library(broom)


#If we want to know which groups are different from each other, we can use the post-hoc (or "after the event") tukey test:

# TukeyHSD() needs a fitted model as its first argument; calling it with no
# arguments is an error. Fit the one-way ANOVA of disk on site and run the
# post-hoc comparison on that model.
# `site` is wrapped in factor() so the grouping term is a factor in the
# fitted model regardless of how the column was read in.
# NOTE(review): assumes Calb_R has columns named `disk` and `site`, as used by
# the histogram earlier in this section - confirm against the data.
site_aov <- aov(disk ~ factor(site), data = Calb_R)
TukeyHSD(site_aov)

### Exercise
#Conduct a t-test comparing skin and oral samples. First subset the data frame as needed, then check for normality.
# ?t.test might be helpful

#there are two different categorical variables here that could influence the disk diffusion resistance (site and sex), and we can include them both in one test, a two-way ANOVA.

### Exercise
#Conduct a statistical test to determine whether site or sex (or their interaction) has a significant effect on MIC.

# Scatterplot of MIC (log2 x axis) against disk (reversed y axis, 50 to 0).
# Two point layers are drawn in this order: the raw points first, then
# semi-transparent tomato-coloured points jittered horizontally.
ggplot(data = Calb_R, mapping = aes(x = MIC, y = disk)) +
  geom_point(na.rm = TRUE) +
  geom_jitter(width = 0.2, color = "tomato", alpha = 0.5) +
  scale_x_continuous(breaks = unique(Calb_R[["MIC"]]), trans = "log2") +
  scale_y_reverse(limits = c(50, 0)) +
  xlab(expression(MIC[50])) +
  ylab("disk diffusion zone of inhibition (mm)") +
  theme_bw()

# look for a correlation between these two resistance variables.