#20.11.23 R Workshop Day 2 ### dplyr # Load libraries library(tidyverse) library(here) # Download and then load a new package, tidylog install.packages("tidylog") library(tidylog) # load file Ecoli_citrate2 <- read.csv(here("data_in", "Ecoli_citrate.csv")) Ecoli_citrate2 # Selecting columns and filtering rows select(Ecoli_citrate, sample, clade, cit, genome_size) filter(Ecoli_citrate, cit == "plus") filter(Ecoli_citrate, cit != "plus") # Pipes Ecoli_citrate %>% filter(cit == "plus") %>% select(sample, generation, clade, cit) select(filter(Ecoli_citrate, cit == "plus"), sample, generation, clade) Ecoli_citplus <- Ecoli_citrate %>% filter(cit == "plus") %>% select(sample, generation, clade) ######################################### ## EXERCISE #Using pipes, subset Ecoli_citrate to include rows where the clade is ‘Cit+’ and keep only the columns "sample", "cit", and "genome_size". #How many rows are in that tibble? #HINT: You can use the nrow() function to find out how many rows are in a tibble. #Using pipes, subset Ecol_citrate to include rows where the genome_size is greater than 4.6 and generation is greater than 20 000. Then keep only the columns "sample", "run", "clade" ######################################### # Mutate Ecoli_citrate %>% mutate(genome_bp = genome_size *1e6) Ecoli_citrate %>% mutate(genome_bp = genome_size *1e6) %>% filter(!is.na(clade)) # Split-apply-combine data analysis and the summarize() function Ecoli_citrate %>% group_by(cit) %>% summarize(n()) Ecoli_citrate %>% group_by(cit) %>% summarize(mean_size = mean(genome_size, na.rm = TRUE)) Ecoli_citrate %>% group_by(cit, clade) %>% summarize(mean_size = mean(genome_size, na.rm = TRUE)) Ecoli_citrate %>% group_by(cit, clade) %>% summarize(mean_size = mean(genome_size, na.rm = TRUE)) %>% filter(!is.na(clade)) Ecoli_citrate %>% group_by(cit, clade) %>% summarize(mean_size = mean(genome_size, na.rm = TRUE), min_generation = min(generation)) ######################################### ##EXERCISE #Create a tibble that has removed all of the rows where the clade is “unknown”. Then find the mean genome size of each unique clade and the resultant rank of mean genome size. (note that ranking genome size will not sort the table; the row order will be unchanged. You can use the arrange() function to sort the table). #There are several functions for ranking observations, which handle tied values differently. For this exercise it doesn’t matter which function you choose. Use the help options to find a ranking function. #*Reminder that unknown is part of the dataset - R exlusively uses NA to indicate no data ######################################### # Long to wide example counts <- tibble(replicate = c(1, 1, 2, 2, 3), num_colonies = c(40, 70, 25, 60, 15), dilution = c("d10", "d5", "d10", "d5", "d5") ) counts counts_wide <- counts %>% pivot_wider(names_from = dilution, values_from = num_colonies) counts_wide <- counts %>% pivot_wider(names_from = dilution, values_from = c(num_colonies), values_fill=list(num_colonies = 0)) counts_long_0 <- counts %>% complete(dilution, replicate, fill=list(num_colonies=0)) count_long <- counts_wide %>% pivot_longer(cols = c(d10, d5), names_to = "dilution", values_to = "count") ### ggplot & stats # download the data from the internet (remember that you only need to do this once)! download.file("http://home.cc.umanitoba.ca/~gersteia/MBIO7040/Calb_resistance.csv", here("data_in", "Calb_resistance.csv")) # load the data Calb_R <- read_csv(here("data_in", "Calb_resistance.csv")) # look at the data glimpse(Calb_R) # rename the two numerical variables Calb_R <- Calb_R %>% rename(MIC = `MIC (ug/mL)`, disk = `disk (mm)`) # download a new package "skimr" install.packages("skimr") library(skimr) Calb_R %>% select(MIC, disk) %>% skim() # Plotting the distribution of one varible ggplot(data = Calb_R, mapping = aes(disk)) + geom_histogram() ggplot(data = Calb_R, mapping = aes(disk)) + geom_histogram(binwidth = 1) Calb_R %>% filter(is.na(disk)) ggplot(data = Calb_R) + geom_histogram(mapping = aes(disk), binwidth =1, na.rm = TRUE) ggplot(data = Calb_R) + # geom_histogram(mapping = aes(disk)) + stat_bin(mapping = aes(disk), binwidth = 1) ggplot(data = Calb_R, mapping = aes(disk)) + geom_histogram(binwidth = 1) + # change the number of bins theme_bw() + # change the theme labs(x = "disk diffusion zone of inhibition (mm)" , y = "Number of strains") shapiro.test(Calb_R$disk) # Plotting with multiple types of data ggplot(data = Calb_R, mapping = aes(disk, fill = type)) + geom_histogram(binwidth = 1, na.rm = TRUE) + # change the number of bins theme_bw() + # change the theme labs(x = "disk diffusion zone of inhibition (mm)" , y = "Number of strains") ggplot(data = Calb_R, mapping = aes(disk)) + geom_histogram(binwidth = 1, na.rm = TRUE) + theme_bw() + labs(x = "disk diffusion zone of inhibition (mm)" , y = "Number of strains") + facet_wrap(~type) ggplot(data = Calb_R, mapping = aes(disk)) + geom_freqpoly(mapping = aes(disk, stat(density)), binwidth = 1, na.rm = TRUE) + theme_bw() + labs(x = "disk diffusion zone of inhibition (mm)" , y = "Density of strains") + facet_wrap(~type) ############################################ # EXERCISE # Create a new figure that plots variable disk using two panels, one panel for each gender that colours the different strain types differently. Use the help menus or “google fu” to figure out how to add a title to the figure (whatever you like). # Since dividing up the data into six categories makes it a bit sparse, play around with either bin or binwidth to find something that looks more pleasing. # Stretch goals: # figure out how to stack the different figures/facets vertically instead of horizonatally? (remember that google is your friend). # Change the bar colours # Change the name of the legend and the legend labels? # Change the panel titles from “f” and “m” to “female” and “male” (don’t recode the actual dataset, just change the title labels) # Look up different themes and find one you like ############################################ # Statistical tests install.packages("broom") library(broom) wilcox.test(disk ~ gender, data = Calb_R) aov(disk ~ type, data = Calb_R) anova_test <- aov(disk ~ type, data = Calb_R) summary(anova_test) tidy(anova_test) # base plot p <- ggplot(data = Calb_R) + geom_boxplot(mapping = aes(x = type, y = disk), na.rm = TRUE) + theme_bw() p p + scale_y_reverse(limits = c(50, 0)) + labs(y = "disk diffusion zone of inhibition (mm)" , y = "site of strain isolation") + annotate("text",x = c(1, 2, 3), y = c(0, 0, 0), label = c("a", "b", "b")) p2 <- ggplot(data = Calb_R) + geom_violin(mapping = aes(x = type, y = disk), na.rm = TRUE) + theme_bw() + scale_y_reverse(limits = c(50, 0)) + labs(y = "disk diffusion zone of inhibition (mm)" , y = "site of strain isolation") + annotate("text",x = c(1, 2, 3), y = c(0, 0, 0), label = c("a", "b", "b")) p2 + stat_summary(fun.y=mean, geom="point", shape=23, size=2) p3 <- ggplot(data = Calb_R, mapping = aes(x = type, y = disk)) + geom_violin(na.rm = TRUE) + theme_bw() + scale_y_reverse(limits = c(50, 0)) + labs(y = "disk diffusion zone of inhibition (mm)" , y = "site of strain isolation") + annotate("text",x = c(1, 2, 3), y = c(0, 0, 0), label = c("a", "b", "b")) + stat_summary(fun.y=mean, geom="point", shape=23, size=3, na.rm = TRUE, fill = "red") library(ggforce) p4 <- ggplot(data = Calb_R, mapping = aes(x = type, y = disk)) + geom_sina() + theme_bw() + scale_y_reverse(limits = c(50, 0)) + labs(y = "disk diffusion zone of inhibition (mm)" , y = "site of strain isolation") + annotate("text",x = c(1, 2, 3), y = c(0, 0, 0), label = c("a", "b", "b")) + stat_summary(fun.y=mean, geom="point", shape=23, size=3, na.rm = TRUE, fill = "red") # second continuous variable pM <- ggplot(data = Calb_R, mapping = aes(MIC)) + geom_histogram(na.rm = TRUE) pM + scale_x_continuous(trans="log2") ggplot(data = Calb_R, mapping = aes(MIC)) + geom_bar(na.rm = TRUE) + scale_x_continuous(trans="log2", breaks = unique(Calb_R$MIC)) + labs(x = expression(MIC[50]), y = "Number of strains") Calb_R_sum <- Calb_R %>% group_by(type, MIC) %>% summarize(num_strains = n()) pSum <- ggplot(Calb_R_sum, mapping = aes(MIC, num_strains, fill= type)) + geom_bar(stat = "identity", position=position_dodge(preserve = "single"), na.rm = TRUE, width=0.75) + scale_x_continuous(trans="log2", breaks = unique(Calb_R$MIC)) + labs(x = expression(MIC[50]), y = "Number of strains") + scale_fill_discrete(name = "Site of strain isolation") ####################################### ## EXERCISE #Using the help menu, explore the options for geom_bar. What does stat = "identity" do? What about position_dodge? How do you change the width of the bar to be equal to the total width of all elements at a position? #Conduct a statistical test to determine whether type or gender (or their interaction) has a significant effect on MIC. ####################################### # Two numerical variables gP <- ggplot(Calb_R, aes(MIC, disk)) + scale_x_continuous(trans="log2", breaks = unique(Calb_R$MIC)) + scale_y_reverse(limits = c(50, 0)) + labs(y = "disk diffusion zone of inhibition (mm)" , x = expression(MIC[50])) + theme_bw() gP + geom_point(na.rm =TRUE) + geom_smooth(method = "lm", na.rm=TRUE) gP + geom_smooth(method = "lm", na.rm=TRUE) + geom_jitter(alpha = 0.5, color = "tomato", width = 0.2) cor_test <- cor.test(Calb_R$MIC, Calb_R$disk, method = "spearman") tidy(cor_test) ####################################### ## EXERCISE #The way we plotted the scatterplot ignored the other variables in our dataset that we know are important. How can you improve this figure to add in that additional information? There are many different options, see how many you can think of (and implement). ####################################### # Saving graphs pdf(here("figures_out", "201123Calb_R_MIC_DDA.pdf"), width=4, height=4) gP + geom_smooth(method = "lm", na.rm=TRUE) + geom_jitter(alpha = 0.5, color = "tomato", width = 0.2) dev.off() ggsave(filename = "201123Calb_R_MIC_DDA.png", device = "png", path = "./Figures", width = 5, height = 5, units = "in")