## ----setup, echo=FALSE, message=FALSE, warning=FALSE--------------------------
knitr::opts_chunk$set(echo = TRUE)

## ----install_bioc, eval=FALSE-------------------------------------------------
# BiocManager::install("scPCA")

## ----install_github, eval=FALSE-----------------------------------------------
# remotes::install_github("PhilBoileau/scPCA")

## ----load_libs, message=FALSE, warning=FALSE----------------------------------
library(dplyr)
library(ggplot2)
library(ggpubr)
library(elasticnet)
library(scPCA)
library(microbenchmark)

## ----PCA_sim------------------------------------------------------------------
# set seed for reproducibility
set.seed(1742)

# load data
data(toy_df)

# perform PCA
pca_sim <- prcomp(toy_df[, 1:30])

# plot the 2D rep using first 2 components
df <- as_tibble(list("PC1" = pca_sim$x[, 1],
                     "PC2" = pca_sim$x[, 2],
                     "label" = as.character(toy_df[, 31])))
p_pca <- ggplot(df, aes(x = PC1, y = PC2, colour = label)) +
  ggtitle("PCA on Simulated Data") +
  geom_point(alpha = 0.5) +
  theme_minimal()
p_pca

## ----sPCA_sim-----------------------------------------------------------------
# perform sPCA on toy_df for a range of L1 penalty terms
penalties <- exp(seq(log(10), log(1000), length.out = 6))
df_ls <- lapply(penalties, function(penalty) {
  spca_sim_p <- elasticnet::spca(toy_df[, 1:30], K = 2, para = rep(penalty, 2),
                     type = "predictor", sparse = "penalty")$loadings
  spca_sim_p <- as.matrix(toy_df[, 1:30]) %*% spca_sim_p
  spca_out <- list("SPC1" = spca_sim_p[, 1],
                   "SPC2" = spca_sim_p[, 2],
                   "penalty" = round(rep(penalty, nrow(toy_df))),
                   "label"  = as.character(toy_df[, 31])) %>%
    as_tibble()
  return(spca_out)
})
df <- dplyr::bind_rows(df_ls)

# plot the results of sPCA
p_spca <- ggplot(df, aes(x = SPC1, y = SPC2, colour = label)) +
  geom_point(alpha = 0.5) +
  ggtitle("SPCA on Simulated Data for Varying L1 Penalty Terms") +
  facet_wrap(~ penalty, nrow = 2) +
  theme_minimal()
p_spca

## ----cPCA_sim-----------------------------------------------------------------
cpca_sim <- scPCA(target = toy_df[, 1:30],
                  background = background_df,
                  penalties = 0,
                  n_centers = 4)

# create a dataframe to be plotted
cpca_df <- cpca_sim$x %>%
  as_tibble() %>%
  mutate(label = toy_df[, 31] %>% as.character)
colnames(cpca_df) <- c("cPC1", "cPC2", "label")

# plot the results
p_cpca <- ggplot(cpca_df, aes(x = cPC1, y = cPC2, colour = label)) +
  geom_point(alpha = 0.5) +
  ggtitle("cPCA of Simulated Data") +
  theme_minimal()
p_cpca

## ----scPCA_sim, warning=FALSE-------------------------------------------------
# run scPCA for using 40 logarithmically seperated contrastive parameter values
# and possible 20 L1 penalty terms
scpca_sim <- scPCA(target = toy_df[, 1:30],
                   background = background_df,
                   n_centers = 4,
                   penalties = exp(seq(log(0.01), log(0.5), length.out = 10)),
                   alg = "var_proj")

# create a dataframe to be plotted
scpca_df <- scpca_sim$x %>%
  as_tibble() %>%
  mutate(label = toy_df[, 31] %>% as.character)
colnames(scpca_df) <- c("scPC1", "scPC2", "label")

# plot the results
p_scpca <- ggplot(scpca_df, aes(x = scPC1, y = scPC2, colour = label)) +
  geom_point(alpha = 0.5) +
  ggtitle("scPCA of Simulated Data") +
  theme_minimal()
p_scpca


# create the loadings comparison plot
scpca_sim$rotation[, 1] <- -scpca_sim$rotation[, 1]
load_diff_df <- bind_rows(
  cpca_sim$rotation %>% as.data.frame,
  scpca_sim$rotation %>% as.data.frame
  ) %>%
  mutate(
    sparse = c(rep("0", 30), rep("1", 30)),
    coef = rep(1:30, 2)
  )
colnames(load_diff_df) <- c("comp1", "comp2", "sparse", "coef")

p1 <- load_diff_df %>%
  ggplot(aes(y = comp1, x = coef, fill = sparse)) +
  geom_bar(stat = "identity") +
  xlab("") +
  ylab("") +
  ylim(-1.4, 1.4) +
  ggtitle("First Component") +
  scale_fill_discrete(name = "Method", labels = c("cPCA", "scPCA")) +
  theme_minimal()

p2 <- load_diff_df %>%
  ggplot(aes(y = comp2, x = coef, fill = sparse)) +
  geom_bar(stat = "identity") +
  xlab("") +
  ylab("") +
  ylim(-1.4, 1.4) +
  ggtitle("Second Component") +
  scale_fill_discrete(name = "Method", labels = c("cPCA", "scPCA")) +
  theme_minimal()

# build full plot via ggpubr
annotate_figure(
  ggarrange(p1, p2, nrow = 1, ncol = 2,
            common.legend = TRUE, legend = "right"),
  top = "Leading Loadings Vectors Comparison",
  left = "Loading Weights",
  bottom = "Variable Index"
)

## ----cPCA_sim_cv--------------------------------------------------------------
cpca_cv_sim <- scPCA(target = toy_df[, 1:30],
                     background = background_df,
                     penalties = 0,
                     n_centers = 4,
                     cv = 3)

# create a dataframe to be plotted
cpca_cv_df <- cpca_cv_sim$x %>%
  as_tibble() %>%
  dplyr::mutate(label = toy_df[, 31] %>% as.character)
colnames(cpca_cv_df) <- c("cPC1", "cPC2", "label")

# plot the results
p_cpca_cv <- ggplot(cpca_cv_df, aes(x = cPC1, y = cPC2, colour = label)) +
  geom_point(alpha = 0.5) +
  ggtitle("cPCA of Simulated Data") +
  theme_minimal()

## ----scPCA_sim_cv-------------------------------------------------------------
scpca_cv_sim <- scPCA(target = toy_df[, 1:30],
                      background = background_df,
                      n_centers = 4,
                      cv = 3,
                      penalties = exp(seq(log(0.01), log(0.5), length.out = 10)),
                      alg = "var_proj")

# create a dataframe to be plotted
scpca_cv_df <- scpca_cv_sim$x %>%
  as_tibble() %>%
  mutate(label = toy_df[, 31] %>% as.character)
colnames(scpca_cv_df) <- c("scPC1", "scPC2", "label")

# plot the results
p_scpca_cv <- ggplot(scpca_cv_df, aes(x = -scPC1, y = -scPC2, colour = label)) +
  geom_point(alpha = 0.5) +
  ggtitle("scPCA of Simulated Data") +
  theme_minimal()

## ----plot_cv_cPCA_scPCA, echo=FALSE-------------------------------------------
ggarrange(
  p_cpca_cv, p_scpca_cv,
  nrow = 1,
  common.legend = TRUE,
  legend = "right"
)

## ----microbenchmark_comparison, warning=FALSE, message=FALSE------------------
timing_scPCA <- microbenchmark(
  "Zou et al." = scPCA(target = toy_df[, 1:30],
                       background = background_df,
                       n_centers = 4,
                       alg = "iterative"),
  "Erichson et al. SPCA" = scPCA(target = toy_df[, 1:30],
                                 background = background_df,
                                 n_centers = 4,
                                 alg = "var_proj"),
  "Erichson et al. RSPCA" = scPCA(target = toy_df[, 1:30],
                                  background = background_df,
                                  n_centers = 4,
                                  alg = "rand_var_proj"),
  times = 3
)

autoplot(timing_scPCA, log = TRUE) +
  ggtitle("Running Time Comparison") +
  theme_minimal()

## ----sce_setup, message=FALSE-------------------------------------------------
library(splatter)
library(SingleCellExperiment)

# Simulate the three groups of cells. Mask cell heterogeneity with batch effect
params <- newSplatParams(
  seed = 6757293,
  nGenes = 500,
  batchCells = c(150, 150),
  batch.facLoc = c(0.05, 0.05),
  batch.facScale = c(0.05, 0.05),
  group.prob = c(0.333, 0.333, 0.334),
  de.prob = c(0.1, 0.05, 0.1),
  de.downProb = c(0.1, 0.05, 0.1),
  de.facLoc = rep(0.2, 3),
  de.facScale = rep(0.2, 3)
)
sim_sce <- splatSimulate(params, method = "groups")

## ----make_sce_subs, message=FALSE---------------------------------------------
# rank genes by variance
n_genes <- 250
vars <- assay(sim_sce) %>%
  log1p %>%
  rowVars
names(vars) <- rownames(sim_sce)
vars <- sort(vars, decreasing = TRUE)

# subset SCE to n_genes with highest variance
sce_sub <- sim_sce[names(vars[seq_len(n_genes)]),]
sce_sub

# split the subsetted SCE into target and background SCEs
tg_sce <- sce_sub[, sce_sub$Group %in% c("Group1", "Group3")]
bg_sce <- sce_sub[, sce_sub$Group %in% c("Group2")]

## ----perform_dimred, message=FALSE--------------------------------------------
# for both cPCA and scPCA, we'll set the penalties and contrasts arguments
contrasts <- exp(seq(log(0.1), log(100), length.out = 5))
penalties <- exp(seq(log(0.001), log(0.1), length.out = 3))

# first, PCA
pca_out <- prcomp(t(log1p(counts(tg_sce))), center = TRUE, scale. = TRUE)

# next, cPCA
cpca_out <- scPCA(t(log1p(counts(tg_sce))),
                  t(log1p(counts(bg_sce))),
                  n_centers = 2,
                  n_eigen = 2,
                  contrasts = contrasts,
                  penalties = 0,
                  center = TRUE,
                  scale = TRUE)

# finally, scPCA
scpca_out <- scPCA(t(log1p(counts(tg_sce))),
                   t(log1p(counts(bg_sce))),
                   n_centers = 2,
                   n_eigen = 2,
                   contrasts = contrasts,
                   penalties = penalties,
                   center = TRUE,
                   scale = TRUE,
                   alg = "var_proj")

# store new representations in the SingleCellExperiment object
reducedDims(tg_sce) <- SimpleList(PCA = pca_out$x[, 1:2],
                                  cPCA = cpca_out$x,
                                  scPCA = scpca_out$x)
tg_sce

## ----plot_red_dims, echo=FALSE, fig.asp=.25, out.width="1600px"---------------
# plot the 2D representations

# prepare PCA
pca_df <- data.frame(
  PC1 = pca_out$x[, 1],
  PC2 = pca_out$x[, 2],
  group = tg_sce$Group,
  batch = tg_sce$Batch
)
pca_p <- pca_df %>%
  ggplot(aes(x = PC1, y = PC2, colour = group, shape = batch)) +
  geom_point(alpha = 0.75) +
  ggtitle("PCA") +
  scale_colour_viridis_d(name = "Target Group", labels = c("1", "2"),
                         begin = 0.1, end = 0.9) +
  scale_shape_discrete(name = "Batch") +
  theme_minimal()

# prepare cPCA
cpca_df <- data.frame(
  cPC1 = cpca_out$x[, 1],
  cPC2 = cpca_out$x[, 2],
  group = tg_sce$Group,
  batch = tg_sce$Batch
)
cpca_p <- cpca_df %>%
  ggplot(aes(x = cPC1, y = cPC2, colour = group, shape = batch)) +
  geom_point(alpha = 0.75) +
  ggtitle("cPCA") +
  scale_colour_viridis_d(name = "Target Group", labels = c("1", "2"),
                         begin = 0.1, end = 0.9) +
  scale_shape_discrete(name = "Batch") +
  theme_minimal()

# prepare scPCA
scpca_df <- data.frame(
  scPC1 = scpca_out$x[, 1],
  scPC2 = scpca_out$x[, 2],
  group = tg_sce$Group,
  batch = tg_sce$Batch
)
scpca_p <- scpca_df %>%
  ggplot(aes(x = scPC1, y = scPC2, colour = group, shape = batch)) +
  geom_point(alpha = 0.75) +
  ggtitle("scPCA") +
  scale_colour_viridis_d(name = "Target Group", labels = c("1", "2"),
                         begin = 0.1, end = 0.9) +
  scale_shape_discrete(name = "Batch") +
  theme_minimal()

# combine the plots
ggarrange(
  pca_p, cpca_p, scpca_p,
  nrow = 1,
  common.legend = TRUE,
  legend = "right"
)

## ----bpscpca, eval=FALSE------------------------------------------------------
# # perform the same operations in parallel using BiocParallel
# library(BiocParallel)
# param <- MulticoreParam()
# register(param)
# 
# # perfom cPCA
# cpca_bp <- scPCA(
#   target = toy_df[, 1:30],
#   background = background_df,
#   contrasts = exp(seq(log(0.1), log(100), length.out = 10)),
#   penalties = 0,
#   n_centers = 4,
#   parallel = TRUE
# )
# 
# # perform scPCA
# scpca_bp <- scPCA(
#   target = toy_df[, 1:30],
#   background = background_df,
#   contrasts = exp(seq(log(0.1), log(100), length.out = 10)),
#   penalties = seq(0.1, 1, length.out = 9),
#   n_centers = 4,
#   parallel = TRUE
# )

## ----session_info, echo=FALSE-------------------------------------------------
sessionInfo()