## ----setup, message=FALSE, warning=FALSE-------------------------------------- library(ggpubr) library(gtools) library(purrr) library(scales) library(pheatmap) library(data.table) library(gridExtra) library(png) library(knitr) library(grid) library(styler) library(FactoMineR) library(factoextra) library(magick) library(rlang) library(GGally) library(ggplotify) library(remotes) library(dplyr) library(tidyr) knitr::opts_chunk$set(echo = TRUE, message=FALSE,warning = FALSE, fig.align = 'center', dev = "png", tidy='styler', tidy.opts=list(strict=TRUE)) ## Color Format colFmt <- function(x,color) { outputFormat <- knitr::opts_knit$get("rmarkdown.pandoc.to") if(outputFormat == 'latex') { ret <- paste("\\textcolor{",color,"}{",x,"}",sep="") } else if(outputFormat == 'html') { ret <- paste("",x,"",sep="") } else { ret <- x } return(ret) } ## ----install_package, eval=FALSE---------------------------------------------- # ## install from BioConductor # if (!require("BiocManager", quietly = TRUE)) # install.packages("BiocManager") # BiocManager::install('protGear') ## ----call_protGear------------------------------------------------------------ ## load the package suppressWarnings(library(protGear)) ## ----array_params------------------------------------------------------------- ## specify the the parameters to process the data genepix_vars <- array_vars(channel="635" , chip_path = system.file("extdata/array_data/", package="protGear"), totsamples = 21, blockspersample = 2, sampleID_path = system.file("extdata/array_sampleID/", package="protGear"), mig_prefix = "_first", machine =1, ## optional date_process = "0520") ## ----gpr_header--------------------------------------------------------------- header_gpr <- readLines(system.file("extdata/array_data/machine1/KK2-06.txt", package="protGear"), n=40) header_gpr <- gsub("\"", "", header_gpr[1:32]) header_gpr[1:32] ## ----slide_struct, fig.align='left'------------------------------------------- visualize_slide(infile=system.file("extdata/array_data/machine1/KK2-06.txt", package="protGear"), MFI_var ='F635 Median' ) ## ----slide_struct_2d, fig.align='left'---------------------------------------- visualize_slide_2d(infile =system.file("extdata/array_data/machine1/KK2-06.txt", package="protGear"), MFI_var ='B635 Median' ) ## ----array_files-------------------------------------------------------------- #### read in all the datasets ### list all the file names under data folder filenames <- list.files(file.path(genepix_vars$paths[[1]]), pattern="*.txt$|*.gpr$", full.names=FALSE) #' @___________________read_in_the_files_with_the_text_data_from_the_chip_____________________________ ### read all the data files and save them in a list data_path <- paste0(genepix_vars$paths[[1]],"/") data_files <- purrr::map(.x = filenames, .f = read_array_files, data_path=data_path , genepix_vars=genepix_vars) data_files <- set_names(data_files, purrr::map(filenames, name_of_files)) ## ----bg_data------------------------------------------------------------------ ## utilising the map package we process a number of files under data_files list dfs <- names(data_files) allData_bg <- purrr::map(.x=dfs, .f=extract_bg,data_files=data_files,genepix_vars) allData_bg <- set_names(allData_bg, purrr::map(filenames, name_of_files)) allData_bg <- plyr::ldply(allData_bg) ## ----bg_vs_fg----------------------------------------------------------------- p1 <- plot_FB(allData_bg,antigen_name="antigen", bg_MFI="BG_Median",FG_MFI="FBG_Median",log=FALSE) p1 ## ----bg_plot------------------------------------------------------------------ p2 <- plot_bg(df=allData_bg, x_axis="Block",bg_MFI="BG_Median", log_mfi=TRUE) p2 ## ----bg_correct--------------------------------------------------------------- sample_ID_merged_dfs <- purrr::map(.x=dfs, .f=merge_sampleID ,data_files=data_files , genepix_vars, method="subtract_local") sample_ID_merged_dfs <- set_names(sample_ID_merged_dfs, purrr::map(filenames, name_of_files)) ## ----buffer_spots_data-------------------------------------------------------- buffer_transp <- purrr::map(.x=sample_ID_merged_dfs, .f=buffer_spots , buffer_spot="buffer") buffer_transp <- set_names(buffer_transp, purrr::map(filenames, name_of_files)) buffers <- plyr::ldply(buffer_transp) plot_buffer(buffers,buffer_names="antigen",buffer_mfi="FMedianBG_correct",slide_id=".id") ## ----replicates_plot---------------------------------------------------------- #' @________________________________calculated_cv_for_each_data_file_______________________________________ #' data without the selected mean for the best 2 CVs dataCV <- purrr::map(.x=sample_ID_merged_dfs, .f=cv_estimation ,lab_replicates=3 , cv_cut_off=20, sampleID_var='sampleID', antigen_var='antigen' ,replicate_var='replicate' , mfi_var='FMedianBG_correct') lab_replicates=3 dataCV <- set_names(dataCV, purrr::map(filenames, name_of_files)) aa <- plyr::ldply(dataCV) GGally::ggpairs(aa,aes(color=cvCat_all) , columns = paste(seq_len(lab_replicates)), title = "", axisLabels = "show") + theme_light() ## ----cv_summary--------------------------------------------------------------- #' @________________________________summary_of_cv_for_each_sample________________________________________ #' creates summaries by cv's greater than 20 and less than 20 cv_cut_off <- 20 dataCV_sample <- purrr::map(.x=dataCV, .f=protGear::cv_by_sample_estimation , cv_variable="cvCat_all", lab_replicates=3) dataCV_sample <- set_names(dataCV_sample, purrr::map(filenames, name_of_files)) all_cv_sample <- plyr::ldply(dataCV_sample) ## ----cv_by_sample------------------------------------------------------------- less_20 <- rlang::sym(paste0("CV <= ",cv_cut_off, "_perc")) gt_20 <- rlang::sym(paste0("CV > ",cv_cut_off, "_perc")) less_20_per <- rlang::sym(paste0("% CV <=",cv_cut_off)) gt_20_per <- rlang::sym(paste0("% CV >",cv_cut_off)) ggplot(all_cv_sample)+ geom_violin(aes(.id,`CV <= 20_perc`, color="% CV =<20")) + geom_violin(aes(.id,`CV > 20_perc`, color="% CV >20")) + geom_violin(aes(.id,Others_perc,color="Others")) + ylab("% of CV") + theme_minimal() + ggtitle("% of CV >20 or <=20 for each slide all repeats considered") ## ----best_reps---------------------------------------------------------------- #' @________________________________data_with_selected_best_2_CV_______________________________________ #' data with the selected mean for the best 2 CVs dataCV_best2 <- purrr::map(.x=dataCV, .f=best_CV_estimation , slide_id="iden",lab_replicates=3, cv_cut_off=20) ## give the names to the returned list dataCV_best2 <- set_names(dataCV_best2, purrr::map(filenames, name_of_files)) dataCV_sample_best2 <- purrr::map(.x=dataCV_best2, .f=cv_by_sample_estimation , cv_variable="best_CV_cat",lab_replicates=3) dataCV_sample_best2 <- set_names(dataCV_sample_best2, purrr::map(filenames, name_of_files)) all_cv_sample_best2 <- plyr::ldply(dataCV_sample_best2) ## ----best_cv_plot------------------------------------------------------------- ## plot only the CV perccentages ggplot(all_cv_sample_best2)+ geom_violin(aes(.id,`CV <= 20_perc`, color="% CV =<20")) + geom_violin(aes(.id,`CV > 20_perc`, color="% CV >20")) + geom_violin(aes(.id,Others_perc,color="Others")) + ylab("% of CV") + theme_minimal() + ggtitle("% of CV >20 or <=20 for each slide") ## ----tag_file----------------------------------------------------------------- tag_file <- read.csv(system.file("extdata/TAG_antigens.csv", package="protGear")) tag_antigens <- c("CD4TAG" , "GST", "MBP") batch_vars <- list(machine="m1", day="0520") ## ----tag_glimpse-------------------------------------------------------------- tb1 <- data.frame(head(tag_file, n=10)) tb1 %>% knitr::kable() ## ----tag_subtract------------------------------------------------------------- #' @________________________________subtract_the_tag_values_______________________________________ #' ## tag subtract ## read in the KILCHip TAG file to substract GST-1, MBP -2 and CD4TAG - 0 file dataCV_tag <- purrr::map(.x=dataCV_best2, .f=tag_subtract , tag_antigens=tag_antigens, mean_best_CV_var="mean_best_CV",tag_file=tag_file, antigen_var='antigen', batch_vars=batch_vars) dataCV_tag <- set_names(dataCV_tag, purrr::map(filenames, name_of_files)) dataCV_tag <- plyr::ldply(dataCV_tag) ## ----before_after_tag--------------------------------------------------------- aaa <- dataCV_tag %>% filter(TAG_name=="GST") aaa <- aaa %>% dplyr::select(.id,sampleID,antigen,mean_best_CV,mean_best_CV_tag) aaa <- aaa %>% gather(measure,mfi,-c(.id:antigen)) ggplot(aaa,aes(as.factor(antigen),mfi,color=measure)) + geom_boxplot(aes(fill=measure),alpha=0.5)+ theme_light() + xlab("antigen name")+ ggtitle("Before and after TAG subtraction") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) ## ----normalisation------------------------------------------------------------ df_to_normalise <- dataCV_tag %>% ungroup() %>% dplyr::select(slide=.id,sampleID,sample_array_ID,antigen,mean_best_CV) %>% group_by(sampleID, slide) df_to_normalise$sample_index <- group_indices(.data =df_to_normalise ) ### to_normalise <- df_to_normalise %>% ungroup() %>% dplyr::select(-slide,-sampleID,-sample_array_ID) %>% dplyr::select(antigen, sample_index, everything()) %>% gather(variable, value, -(antigen:sample_index)) %>% unite(temp, antigen ) %>% dplyr::select(-variable) %>% spread(temp, value) %>% as.data.frame(.) ### get the row names of the machine data row.names(to_normalise) <- to_normalise$sample_index #batch_all <- as.factor(paste0(to_normalise$machine,"/",to_normalise$day)) #machines <- as.factor(to_normalise$machine) #day_batches <- as.factor(to_normalise$day) ## create the matrix to normalise matrix_antigen <- to_normalise %>% dplyr::select(-sample_index) %>% as.matrix(.) ## create the matrix to hold the important parameters ## in place of AMA1 you use one of your features or antigen array_matrix <- df_to_normalise %>% filter(antigen=="AMA1") %>% ungroup() %>% dplyr::select(sample_array_ID,sample_index,slide) control_antigens <- c("CommercialHumanIgG","CD4TAG") ## ----normalised_df------------------------------------------------------------ normlise_df <- matrix_normalise(matrix_antigen, method = "vsn", array_matrix=array_matrix, return_plot = TRUE,control_antigens=control_antigens) normlise_df$plot_normalisation ## ----compare_methods---------------------------------------------------------- control_antigens <- c("CommercialHumanIgG","CD4TAG") ## no normalisation normalise_list_none <- matrix_normalise(matrix_antigen=matrix_antigen, method = "none", array_matrix=array_matrix, return_plot = TRUE, control_antigens=control_antigens) names(normalise_list_none) <- c("matrix_antigen_none" ,"plot_none") ## log2 normalisation normalise_list_log <- matrix_normalise(matrix_antigen=matrix_antigen, method = "log2", array_matrix=array_matrix, return_plot = TRUE, control_antigens=control_antigens) names(normalise_list_log) <- c("matrix_antigen_log" ,"plot_log") ## vsn normalisation normalise_list_vsn <- matrix_normalise(matrix_antigen=matrix_antigen, method = "vsn", array_matrix=array_matrix, return_plot = TRUE, control_antigens=control_antigens) names(normalise_list_vsn) <- c("matrix_antigen_vsn" ,"plot_vsn") ## cyclic loess with log normalise_list_cyclic_loess_log <- matrix_normalise(matrix_antigen=matrix_antigen, method = "cyclic_loess_log", array_matrix=array_matrix, return_plot = TRUE, control_antigens=control_antigens) names(normalise_list_cyclic_loess_log) <- c("matrix_antigen_cyclic_loess_log" , "plot_cyclic_loess_log") normalise_list_rlm <- matrix_normalise(matrix_antigen=matrix_antigen, method = "rlm", array_matrix=array_matrix, return_plot = TRUE, control_antigens=control_antigens) names(normalise_list_rlm) <- c("matrix_antigen_rlm" ,"plot_rlm") ## create a list after normalisation normalised_list <- c(normalise_list_none , normalise_list_log, normalise_list_vsn, normalise_list_cyclic_loess_log, normalise_list_rlm) ## normalised_list_plot <- normalised_list[grepl("plot",names(normalised_list))] ## ----plot_comparison, fig.align='center', fig.width=12, fig.height=15--------- p <- do.call("grid.arrange", c(normalised_list_plot, ncol=2)) ## ----heat_norm---------------------------------------------------------------- norm_df <- normlise_df$matrix_antigen_normalised norm_df <- norm_df %>% dplyr::select(-control_antigens) p3 <- pheatmap::pheatmap(norm_df ,scale = "none", cluster_rows = FALSE, main=paste('VSN',"Normalised Data"), silent = TRUE) #------- ## if you want to save the file # p3 <- ggplotify::as.ggplot(p3) # p <- p3 + theme_void() # ggsave(p , # filename ="heatmap.PNG" , # width = 16 , height = 12 , # limitsize = FALSE, # dpi=200 ) #------- p3 ## ----pca, fig.align='center',fig.width=16,fig.height=12----------------------- norm_df <- normlise_df$matrix_antigen_normalised res_pca <- prcomp( norm_df, scale = TRUE) var <- get_pca_var(res_pca) vars_visualise=20 #Visualize the PCA ## individuals contributing to the PCA p1 <- fviz_pca_ind(res_pca, col.var = "contrib", # Color by contributions to the PC gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), repel = TRUE # Avoid text overlapping )+ theme_minimal() # Select the top vars_visualise contributing variables p2 <-fviz_pca_biplot(res_pca, label="var", select.var = list(contrib = vars_visualise)) + theme_minimal() # Total cos2 of variables on Dim.1 and Dim.2 p3 <- fviz_cos2(res_pca, choice = "var", axes = 1:2 , top = vars_visualise) # Color by cos2 values: quality on the factor map p4 <- fviz_pca_var(res_pca, col.var = "cos2", gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"), select.var = list(contrib = vars_visualise), repel = TRUE # Avoid text overlapping ) ## combine the plots into one grid ## combine the plots into one grid ## combine the plots into one grid p_pca <- gridExtra::grid.arrange(p1,p2,p3,p4, ncol=2 ) ## If you want to save the file # ggsave(p_pca , # filename ="p_pca.PNG" , # width = 16 , height = 12 , # units = "in", # limitsize = FALSE, # dpi=300) p_pca ## ----shiny_launch, eval=FALSE------------------------------------------------- # protGear::launch_protGear_interactive() ## ----------------------------------------------------------------------------- sessionInfo()