## ----setup, include = FALSE--------------------------------------------------- knitr::opts_chunk$set( collapse = TRUE, comment = "#>", crop = NULL ## Related to https://stat.ethz.ch/pipermail/bioc-devel/2020-April/016656.html ) ## ----installation, eval=FALSE------------------------------------------------- # if(!requireNamespace('BiocManager', quietly = TRUE)) # install.packages('BiocManager') # # BiocManager::install("syntenet") ## ----load_package, message=FALSE---------------------------------------------- # Load package after installation library(syntenet) ## ----data--------------------------------------------------------------------- # Protein sequences data(proteomes) head(proteomes) # Annotation (ranges) data(annotation) head(annotation) ## ----fasta2AAStringSetlist---------------------------------------------------- # Path to directory containing FASTA files fasta_dir <- system.file("extdata", "sequences", package = "syntenet") fasta_dir dir(fasta_dir) # see the contents of the directory # Read all FASTA files in `fasta_dir` aastringsetlist <- fasta2AAStringSetlist(fasta_dir) aastringsetlist ## ----gff2GRangesList---------------------------------------------------------- # Path to directory containing FASTA files gff_dir <- system.file("extdata", "annotation", package = "syntenet") gff_dir dir(gff_dir) # see the contents of the directory # Read all FASTA files in `fasta_dir` grangeslist <- gff2GRangesList(gff_dir) grangeslist ## ----check_input-------------------------------------------------------------- check_input(proteomes, annotation) ## ----process_input------------------------------------------------------------ pdata <- process_input(proteomes, annotation) # Looking at the processed data pdata$seq pdata$annotation ## ----run_diamond-------------------------------------------------------------- data(blast_list) if(diamond_is_installed()) { blast_list <- run_diamond(seq = pdata$seq) } ## ----blast_inspect------------------------------------------------------------ # List names names(blast_list) # Inspect first data frame head(blast_list$Olucimarinus_Olucimarinus) ## ----infer_syntenet----------------------------------------------------------- # Infer synteny network net <- infer_syntenet(blast_list, pdata$annotation) # Look at the output head(net) ## ----create_species_id_table-------------------------------------------------- # Get a 2-column data frame of species IDs and names id_table <- create_species_id_table(names(proteomes)) id_table ## ----cluster_network---------------------------------------------------------- # Load example data and take a look at it data(network) head(network) # Cluster network clusters <- cluster_network(network) head(clusters) ## ----phylogenomic_profile----------------------------------------------------- # Phylogenomic profiling profiles <- phylogenomic_profile(clusters) # Exploring the output head(profiles) ## ----plot_profiles------------------------------------------------------------ # 1) Create a named vector of custom species order to plot species_order <- setNames( # vector elements c( "vra", "van", "pvu", "gma", "cca", "tpr", "mtr", "adu", "lja", "Lang", "car", "pmu", "ppe", "pbr", "mdo", "roc", "fve", "Mnot", "Zjuj", "jcu", "mes", "rco", "lus", "ptr" ), # vector names c( "V. radiata", "V. angularis", "P. vulgaris", "G. max", "C. cajan", "T. pratense", "M. truncatula", "A. duranensis", "L. japonicus", "L. angustifolius", "C. arietinum", "P. mume", "P. persica", "P. bretschneideri", "M. domestica", "R. occidentalis", "F. vesca", "M. notabilis", "Z. jujuba", "J. curcas", "M. esculenta", "R. communis", "L. usitatissimum", "P. trichocarpa" ) ) species_order # 2) Create a metadata data frame containing the family of each species species_annotation <- data.frame( Species = species_order, Family = c( rep("Fabaceae", 11), rep("Rosaceae", 6), "Moraceae", "Ramnaceae", rep("Euphorbiaceae", 3), "Linaceae", "Salicaceae" ) ) head(species_annotation) # 3) Plot phylogenomic profiles, but using Ruzicka distances plot_profiles( profiles, species_annotation, cluster_species = species_order, dist_function = labdsv::dsvdis, dist_params = list(index = "ruzicka") ) ## ----find_GS_clusters--------------------------------------------------------- # Find group-specific clusters gs_clusters <- find_GS_clusters(profiles, species_annotation) head(gs_clusters) # How many family-specific clusters are there? nrow(gs_clusters) ## ----heatmap_filtered--------------------------------------------------------- # Filter profiles matrix to only include group-specific clusters idx <- rownames(profiles) %in% gs_clusters$Cluster p_gs <- profiles[idx, ] # Plot heatmap plot_profiles( p_gs, species_annotation, cluster_species = species_order, cluster_columns = TRUE ) ## ----plot_network------------------------------------------------------------- # 1) Visualize a network of first 5 GS-clusters id <- gs_clusters$Cluster[1:5] plot_network(network, clusters, cluster_id = id) # 2) Coloring nodes by family genes <- unique(c(network$node1, network$node2)) gene_df <- data.frame( Gene = genes, Species = unlist(lapply(strsplit(genes, "_"), head, 1)) ) gene_df <- merge(gene_df, species_annotation)[, c("Gene", "Family")] head(gene_df) plot_network(network, clusters, cluster_id = id, color_by = gene_df) # 3) Interactive visualization (zoom out and in to explore it) plot_network( network, clusters, cluster_id = id, interactive = TRUE, dim_interactive = c(500, 300) ) ## ----binarize----------------------------------------------------------------- bt_mat <- binarize_and_transpose(profiles) # Looking at the first 5 rows and 5 columns of the matrix bt_mat[1:5, 1:5] ## ----infer_phylogeny---------------------------------------------------------- # Leave only 6 legume species and P. mume as an outgroup for testing purposes included <- c("gma", "pvu", "vra", "van", "cca", "pmu") bt_mat <- bt_mat[rownames(bt_mat) %in% included, ] # Remove non-variable sites bt_mat <- bt_mat[, colSums(bt_mat) != length(included)] if(iqtree_is_installed()) { phylo <- infer_microsynteny_phylogeny(bt_mat, outgroup = "pmu", threads = 1) } ## ----vis_phylogeny, message = FALSE, warning = FALSE, fig.height = 12, fig.width = 7---- data(angiosperm_phylogeny) # Plotting the tree library(ggtree) ggtree(angiosperm_phylogeny) + geom_tiplab(size = 3) + xlim(0, 0.3) ## ----------------------------------------------------------------------------- # Load data data(scerevisiae_annot) data(scerevisiae_diamond) # Take a look at the data head(scerevisiae_annot) names(scerevisiae_diamond) head(scerevisiae_diamond$Scerevisiae_Scerevisiae) # Detect intragenome synteny intra_syn <- intraspecies_synteny( scerevisiae_diamond, scerevisiae_annot ) intra_syn # see where the .collinearity file is # Read .collinearity file scerevisiae_syn <- parse_collinearity(intra_syn) head(scerevisiae_syn) ## ----------------------------------------------------------------------------- # Keep only interspecies DIAMOND comparisons names(blast_list) diamond_inter <- blast_list[c(2, 3)] # Double-check if we have processed annotation for these 2 species names(pdata$annotation) # Detect interspecies synteny intersyn <- interspecies_synteny(diamond_inter, pdata$annotation) intersyn # see where the .collinearity file is # Read .collinearity file ostreoccocus_syn <- parse_collinearity(intersyn) head(ostreoccocus_syn) ## ----parse_collinearity_examples---------------------------------------------- # 1) Get anchors with `parse_collinearity()` anchors <- parse_collinearity(intra_syn) head(anchors) # 2) Get synteny block with `parse_collinearity()` blocks <- parse_collinearity(intra_syn, as = "blocks") head(blocks) # 3) Get synteny blocks and anchor pairs in a single data frame all <- parse_collinearity(intra_syn, as = "all") head(all) ## ----faq1--------------------------------------------------------------------- # Add example directory /home/username/bioinfo_tools to PATH Sys.setenv( PATH = paste( Sys.getenv("PATH"), "/home/username/bioinfo_tools", sep = ":" ) ) ## ----faq3-p1------------------------------------------------------------------ # Path to directory containing data data_dir <- system.file( "extdata", "RefSeq_parsing_example", package = "syntenet" ) dir(data_dir) # Reading the files to a format that syntenet understands seqs <- fasta2AAStringSetlist(data_dir) annot <- gff2GRangesList(data_dir) # Taking a look at the data seqs head(names(seqs$Aalosa)) annot ## ----faq3-p2------------------------------------------------------------------ # Remove whitespace and everything after it names(seqs$Aalosa) <- gsub(" .*", "", names(seqs$Aalosa)) # Taking a look at the new names head(names(seqs$Aalosa)) ## ----faq3-p3------------------------------------------------------------------ # Show only rows for which `type` is "CDS" head(annot$Aalosa[annot$Aalosa$type == "CDS"]) ## ----faq-p4------------------------------------------------------------------- # Create a list of data frames containing protein-to-gene ID correspondences protein2gene <- lapply(annot, function(x) { # Extract only CDS ranges cds_ranges <- x[x$type == "CDS"] # Create the ID correspondence data frame df <- data.frame( protein_id = cds_ranges$Name, gene_id = cds_ranges$gene ) # Remove duplicate rows df <- df[!duplicated(df$protein_id), ] return(df) }) # Taking a look at the list protein2gene ## ----faq-p5------------------------------------------------------------------- # Collapse protein IDs to gene IDs in list of sequences new_seq <- collapse_protein_ids(seqs, protein2gene) # Looking at the new sequences new_seq ## ----sessionInfo-------------------------------------------------------------- sessionInfo()