## ----include = FALSE----------------------------------------------------------
knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>"
)

## ----echo=FALSE, fig.cap="Fig 1. BioTIP workflow with five key analytical steps. RTF: relative transcript fluctuation; PCC: Pearson correlation coefficient; DNB: Dynamic Network Biomarker, of which the score was called as Module-Criticality Index (MCI) in the R package BioTIP; Ic: index of critical transition.; Ic*: our redefined Ic, which is also referred to as the BioTIP score", fig.align='center', out.width = '60%'----
knitr::include_graphics("FigS1_BioTIP_pipeline_detailed_v7.jpg")

## ----echo=FALSE, fig.cap="Table 1. The key R functions and customaized parameters for the five analytical steps.", fig.align='center', out.width = '60%'----
knitr::include_graphics("Fig_Table1_xy.jpg")

## ----warning=FALSE, message=FALSE---------------------------------------------
# load BioTIP and dependent packages
library(BioTIP)
library(cluster)
library(GenomicRanges)
library(MASS)
require(stringr)
require(psych)
require(igraph)

## -----------------------------------------------------------------------------
data(GSE6136_matrix)
dim(GSE6136_matrix)  
row.names(GSE6136_matrix) = GSE6136_matrix$ID_REF

# remove the downloaded first row after assigning it to be column-name of the final numeric data matrix
GSE6136 = GSE6136_matrix[,-1]
dim(GSE6136) 

## -----------------------------------------------------------------------------
data(GSE6136_cli)
##dim(GSE6136_cli) optional: check dimension

cli = t(GSE6136_cli)

colnames(cli) = str_split_fixed(cli[1,],'_',2)[,2]
cli = cli[-1,]
cli = data.frame(cli)
cli[,"cell-type:ch1"] = str_split_fixed(cli$characteristics_ch1.1,": ",2)[,2]
cli[,"Ig clonality:ch1"] = str_split_fixed(cli$characteristics_ch1.3,": ",2)[,2]

colnames(cli)[colnames(cli) == "cell-type:ch1"] = "group"
cli$Row.names = cli[,1]
head(cli[,1:3])

## -----------------------------------------------------------------------------
dat <- GSE6136
df <- log2(dat+1)
head(df[,1:6])

## ----warning=FALSE------------------------------------------------------------
cli$group = factor(cli$group,
                   levels = c('resting','activated','lymphoma_marginal','lymphoma_transitional','lymphoma_aggressive'))
samplesL <- split(cli[,"geo_accession"],f = cli$group)
lapply(samplesL, length)
test <- sd_selection(df, samplesL,0.01)
head(test[["activated"]])

## ----warning=FALSE------------------------------------------------------------
igraphL <- getNetwork(test, fdr = 1)
cluster <- getCluster_methods(igraphL)

## ----echo=TRUE, warning=FALSE-------------------------------------------------
names(cluster)

## ----echo=TRUE, warning=FALSE-------------------------------------------------
membersL_noweight <- getMCI(cluster,test)
plotBar_MCI(membersL_noweight,ylim = c(0,6))

## ----echo=TRUE, warning=FALSE-------------------------------------------------
maxMCIms <- getMaxMCImember(membersL_noweight[[1]],membersL_noweight[[2]],min =10)
names(maxMCIms)
names(maxMCIms[[1]])
names(maxMCIms[[2]])

## ----echo=TRUE, warning=FALSE-------------------------------------------------
head(maxMCIms[['idx']])
head(maxMCIms[['members']][['lymphoma_aggressive']])

## -----------------------------------------------------------------------------
biomodules = getMaxStats(membersL_noweight[['members']],maxMCIms[[1]])
maxMCI = getMaxStats(membersL_noweight[['MCI']],maxMCIms[[1]])
maxMCI = maxMCI[order(maxMCI,decreasing=TRUE)]
head(maxMCI)
topMCI = getTopMCI(membersL_noweight[[1]],membersL_noweight[[2]],membersL_noweight[['MCI']],min =10)
head(topMCI)

## -----------------------------------------------------------------------------
maxSD = getMaxStats(membersL_noweight[['sd']],maxMCIms[[1]])
head(maxSD)

## -----------------------------------------------------------------------------
CTS = getCTS(topMCI, maxMCIms[[2]])

## ----echo=TRUE, warning=FALSE-------------------------------------------------
par(mar = c(10,5,0,2))
plotMaxMCI(maxMCIms,membersL_noweight[[2]],states = names(samplesL),las = 2)

## ----warning=FALSE, results=FALSE---------------------------------------------
simuMCI <- simulationMCI(3,samplesL,df, B=100)
plot_MCI_Simulation(topMCI[1],simuMCI,las=2)

## -----------------------------------------------------------------------------
IC <- getIc(df,samplesL,CTS[[1]],PCC_sample.target = 'average')
par(mar = c(10,5,0,2))
plotIc(IC,las = 2)

## ----warning=FALSE, results=FALSE---------------------------------------------
simuIC <- simulation_Ic(length(CTS[[1]]),samplesL,df,B=100)
par(mar = c(10,5,0,2))
plot_Ic_Simulation(IC,simuIC,las = 2)

## -----------------------------------------------------------------------------
sample_Ic = simulation_Ic_sample(df, sampleNo=3, genes=CTS[[1]], plot=TRUE)
##simulated_Ic = plot_simulation_sample(df,length(samplesL[['lymphoma_aggressive']]),IC[['lym#phoma_aggressive']],CTS)

## ----echo=FALSE, fig.cap="Fig 2. BioTIP Single-Cell Expression Data-Analytical Pipeline.", fig.align='center', out.width = '60%'----
knitr::include_graphics("Fig1_scRNARNAseq_pipeline_2021_xy.jpg")

## ----echo=FALSE, out.height='60%', out.width='60%'----------------------------
knitr::include_graphics("subway_map.jpg")

## ----warning=FALSE, message=FALSE, results=FALSE------------------------------
library(BioTIP)

## ----warning=FALSE, message=FALSE---------------------------------------------
#Load stringr library 
library(stringr)
#BioTIP dependent libraries 
library(cluster)
library(psych)
library(stringr)
library(GenomicRanges)
# library(Hmisc)
library(MASS)
library(igraph)
# library(RCurl)

## -----------------------------------------------------------------------------
## familiarize yourself with data
cell_info = read.delim("cell_info.tsv", head = T, sep = '\t', row.names=1)
dim(cell_info) 
## in case R automatically reformated characters in cell labels, match cell labels in the data and in the infor table  
rownames(cell_info) = sub('LT-HSC', 'LT.HSC', rownames(cell_info))
## focus on one branch, e.g., S4-S3-S0-S1  
## no filter because it was already 4k genes, 10% of the originally measured genes in the single cell experiment
cell_info <- subset(cell_info, branch_id_alias %in% c("('S4', 'S3')", "('S3', 'S0')", "('S1', 'S0')" ))
## check cell sub-population sizes along the STREAM outputs
(t = table(cell_info$label, cell_info$branch_id_alias))
## focus on sub-populations of cells with more than 10 cells
( idx = apply(t, 2, function(x) names(which(x>10))) )
## generate a list of samples (cells) of interest
 samplesL  = sapply(names(idx), 
                   function(x) sapply(idx[[x]], function(y) 
                   rownames(subset(cell_info, branch_id_alias == x & label == y))))
## check the number of sub-populations along the pseudotime trajectory (generated by STREAM for example)
## recommend to focus on each branch, respectively
names(samplesL[[1]])
## merge sub-populations at each pseudo-time 'state'
samplesL = do.call(c, samplesL)  
lengths(samplesL)

## load normalized gene expression matrix, refer to the example below if not yet normalized
## e.g, here is a STREAM-published data matrix of normalized gene expression matrix

## when analyzing SingleCellExperiment object SCE, translate to matrix using the following code
# counts <- logcounts(SCE)
## data matrix imported from our github repository due to large size
githuburl = "https://github.com/ysun98/BioTIPBigData/blob/master/data_Nestorowa.tsv?raw=true"
counts = read.table(url(githuburl), head=T, sep="\t", row.names=1)
dim(counts) 
if (class(counts)=="data.frame") counts = as.matrix(counts)

all(colnames(counts) %in% as.character(rownames(cell_info)))
## log2 transformation example
if(max(counts)>20) counts = log2(counts)
## Check the overall distribution using histogram
# hist (counts, 100)  

## ----results=FALSE------------------------------------------------------------
## First, estimate the random Ic-scores by permuting the expression values of genes
  RandomIc_g = list()
  set.seed(2020)
  C= 10 # for real data analysis C = at least 500
  i = 1  
  n <- 200 # randomly pick up 200 genes
  RandomIc_g[[i]]  <- simulation_Ic(n, samplesL, counts, 
                                    B=C,
                                    fun="BioTIP")
  names(RandomIc_g)[i] = paste0("Simulation",i,"gene")

  medianIc = apply(RandomIc_g[[i]],1,median)
  par(mfrow = c(1,1))
  plot_Ic_Simulation(medianIc, RandomIc_g[[i]], las = 2, ylab="BioTIP",
                       main= paste("Simulation using",n,"transcripts"),
                       fun="boxplot", ylim=c(0,0.3))

## -----------------------------------------------------------------------------
## setup parameters for gene preselection
B=10 ##for demonstration purposes use 10, when running dataset we recommend at least 100 
##optimize and one nonoptimize for single cell use optimize with B 100 or higher for demo purpose only we run 
opt.cutoff = 0.2
opt.per = 0.8

## Commented out because of calculation expense, recommend to save the calculated data file after one calculation for future use
## set.seed(100)
## subcounts = optimize.sd_selection(counts, samplesL, cutoff = opt.cutoff, percent=opt.per, B)
## save( subcounts, file='BioTIP_Output_xy/subcounts.optimize_80per.rData')
data("subcounts.optimize_80per")

## ----warning=FALSE------------------------------------------------------------
## use 0.01-0.2 to ensure gene-gene co-expression is significant
## increase fdr cutoff to obtain larger gene sets
network.fdr = 0.1
min.size = 50
igraphL = getNetwork(subcounts, fdr=network.fdr)
names(igraphL)

## Network partition using random walk
cluster = getCluster_methods(igraphL)

tmp = igraphL[["('S3', 'S0').CMP"]]
E(tmp)$width <- E(tmp)$weight*3
V(tmp)$community= cluster[["('S3', 'S0').CMP"]]$membership
mark.groups = table(cluster[["('S3', 'S0').CMP"]]$membership)
colrs = rainbow(length(mark.groups), alpha = 0.3)
V(tmp)$label <- NA
plot(tmp, vertex.color=colrs[V(tmp)$community],vertex.size = 5,
     mark.groups=cluster[["('S3', 'S0').CMP"]])

## ----warning=FALSE------------------------------------------------------------
fun = 'BioTIP'

##  Commented out because of calculation expense, recommend to save the calculated data file after one calculation for future use
## membersL = getMCI(cluster, subcounts, adjust.size = F, fun)
## save(membersL, file="membersL.RData", compress=TRUE)
data("membersL")

par(mar=c(1,1,2,1))
plotBar_MCI(membersL, ylim=c(0,30), min=50)

## Decide how many states of interest, here is 4
n.state.candidate <- 4
topMCI = getTopMCI(membersL[["members"]], membersL[["MCI"]], membersL[["MCI"]], 
                   min=min.size, 
                   n=n.state.candidate)
names(topMCI)

## Obtain state ID and MCI statistics for the n=3 leading MCI scores per state
maxMCIms = getMaxMCImember(membersL[["members"]], membersL[["MCI"]], 
                           min =min.size, 
                           n=3)
names(maxMCIms)

## list the maximum MCI score per state, for all states
maxMCI = getMaxStats(membersL[['MCI']], maxMCIms[['idx']])
unlist(maxMCI)

#### extract biomodule candidates in the following steps: ####
## record the gene members per toppest module for each of these states of interest 
CTS = getCTS(maxMCI[names(topMCI)], maxMCIms[["members"]][names(topMCI)])

## tmp calculates the number of bars within each named state
tmp = unlist(lapply(maxMCIms[['idx']][names(topMCI)], length))
## here returns all the groups with exactly 2 bars
(whoistop2nd = names(tmp[tmp==2]))
## here returns all the groups with exactly 3 bars
(whoistop3rd = names(tmp[tmp==3]))

## add the gene members of the 2nd toppest biomodue in the states with exactly 2 bars
if(length(whoistop2nd)>0)  CTS = append(CTS, maxMCIms[["2topest.members"]][whoistop2nd])
names(CTS)
## add the gene members of the 2nd toppest biomodue in the states with exactly 3 bars
if(length(whoistop3rd)>0)  CTS = append(CTS, maxMCIms[["2topest.members"]][whoistop3rd])  
names(CTS)

## add the gene members of the 3rd toppest biomodue in the states with exactly 3 bars
if(length(whoistop3rd)>0)  CTS = append(CTS, maxMCIms[["3topest.members"]][whoistop3rd])  
names(CTS)

## ----results=FALSE------------------------------------------------------------
#### extract CTS scores for each biomodule candidate in the following steps: ####
## first to record the max MCI for the n.state.candidate 
maxMCI <- maxMCI[names(CTS)[1:n.state.candidate]]
maxMCI

## then applendix the 2nd highest MCI score (if existing) for the states with exactly 2 bars
if(length(whoistop2nd)>0) maxMCI <- c(maxMCI, getNextMaxStats(membersL[['MCI']], idL=maxMCIms[['idx']], whoistop2nd))
names(maxMCI)
## applendix the 2nd highest MCI score (if existing) for the states with exactly 3 bars
if(length(whoistop3rd)>0) maxMCI <- c(maxMCI, getNextMaxStats(membersL[['MCI']], idL=maxMCIms[['idx']], whoistop3rd))
names(maxMCI)

## then applendix the 3rd highest MCI score (if existing) for the states with exactly 3 bars
if(length(whoistop3rd)>0) maxMCI <- c(maxMCI, getNextMaxStats(membersL[['MCI']], idL=maxMCIms[['idx']], whoistop3rd, which.next=3))
maxMCI

## to ensure the same order between maxMCI  and CTS
all(names(CTS) == names(maxMCI))

##  estimate empritical significance from the MCI scores

## M is precalculated correlation matrix for large dataset (>2k genes), will be reused in the downstream simulation analysis
#counts = read.table("data_Nestorowa.tsv")
#if (class(counts)=="data.frame") counts = as.matrix(counts)
#M <- cor.shrink(counts, Y = NULL, MARGIN = 1, shrink = TRUE)
#save(M, file="cor.shrink_M.RData", compress=TRUE) 
#dim(M)

## C is the runs of permutations to estimate random scores
#C = 10 # for real data analysis C = at least 200
#RandomMCI = list()
#n <- length(CTS)  # number of CTS candidates
#set.seed(2020)
#for (i in 1:n) #
#  i=1; par(mfrow=c(1,1))
#  x <- length(CTS[[i]])
#  RandomMCI[[i]] <- simulationMCI(x, samplesL, counts,  B=C, fun="BioTIP", M=M)
#  dim(RandomMCI)  

#  plot_MCI_Simulation(maxMCI[i], RandomMCI[[i]], las=2, 
#                      ylim=c(0, max(maxMCI[i], 2*RandomMCI[[i]])),  
#                      main=paste(names(maxMCI)[i], length(CTS[[i]]), "genes",
#                                 "\n","vs. ",C, "times of gene-permutation"),
#                      which2point=names(maxMCI)[i])

  
######## Finding Tipping Point #################
  
  newIc_score = lapply(CTS, function(x) getIc(counts, samplesL, x, fun="BioTIP", PCC_sample.target = 'average'))
  names(newIc_score) <- names(CTS)

## ----echo=FALSE, fig.align='center', out.width = '60%'------------------------
knitr::include_graphics("DNB_Sim_M.jpg")

## ----results=FALSE------------------------------------------------------------
######## verify using IC score #################
## First, estimate the random Ic-scores by permuating the expresion values of genes
  RandomIc_g = list()
  set.seed(2020)
  C= 10 # for real data analysis C = at least 500
#  for(i in 1:length(CTS)){ Not to run the full loop B# } 
  i = 1  
  CTS <- CTS[[i]]
  n <- length(CTS)
  RandomIc_g[[i]]  <- simulation_Ic(n, samplesL, counts, 
                                    B=C,
                                    fun="BioTIP")
  names(RandomIc_g)[i] = names(CTS)[i]

#  par(mfrow=c(1,length(int)))
#  for(i in 1:length(newIc_score)){ Not to run the full loop B plotting
    par(mfrow = c(1,1))
    n = length(CTS[[i]])
    plot_Ic_Simulation(newIc_score[[i]], RandomIc_g[[i]], las = 2, ylab="BioTIP",
                       main= paste(names(newIc_score)[i],"(",n," transcripts)"),
                       #fun="matplot", which2point=names(newIc_score)[i])
                       fun="boxplot", which2point=names(newIc_score)[i], ylim=c(0,0.5))
    interesting = which(names(samplesL) == names(newIc_score[i]))
    p = length(which(RandomIc_g[[i]][interesting,] >= newIc_score[[i]][names(newIc_score)[i]]))
    p = p/ncol(RandomIc_g[[i]])
    # first p value (p1) calculated for exactly at tipping point
    p2 = length(which(RandomIc_g[[i]] >= newIc_score[[i]][names(newIc_score)[i]]))
    p2 = p2/ncol(RandomIc_g[[i]])
    p2 = p2/nrow(RandomIc_g[[i]])
    # second p value (p2) calculated across all statuses
    ## local Kernel Density Plot  
    d <- density(RandomIc_g[[i]]) # returns the density data
    plot(d, xlim=range(c(newIc_score[[i]],RandomIc_g[[i]])),
         main=paste("Random genes: p.Local=",p)) # plots the results
    abline(v=newIc_score[[i]][names(newIc_score)[i]], col="green")
    ## global Kernel Density Plot  
    d <- density(unlist(RandomIc_g)) # returns the density data
    plot(d, xlim=range(c(newIc_score[[i]],unlist(RandomIc_g))),
         main=paste("Random genes: p.Global=",p2)) # plots the results
    abline(v=newIc_score[[i]][names(newIc_score)[i]], col="green")
    
#  }  Not to run the full loop B plotting

    ## Second, estimate the random Ic-scores by randomly shulffing cell labels
    RandomIc_s = list()
    set.seed(2020)
    #  for(i in 1:length(CTS)){ Not to run the full loop C
    i = 1
    RandomIc_s[[i]] <- matrix(nrow=length(samplesL), ncol=C)
    rownames(RandomIc_s[[i]]) = names(samplesL)
    for(j in 1:length(samplesL)) {
      ns <- length(samplesL[[j]])  # of cells at the state of interest
      RandomIc_s[[i]][j,] <- simulation_Ic_sample(counts, ns, 
                                                  Ic=BioTIP_score[x],
                                                  genes=CTS, 
                                                  B=C,
                                                  fun="BioTIP")
    }
    names(RandomIc_s)[i] = names(CTS)[i]
    #  } Not to run the full loop C
    
    #  par(mfrow=c(1,length(int)))
    #  for(i in 1:length(newIc_score)){ Not to run the full loop C plotting
    n = length(CTS[[i]])
    plot_Ic_Simulation(newIc_score[[i]], RandomIc_s[[i]], las = 2, ylab="BioTIP",
                       main= paste(names(newIc_score)[i],"(",n," transcripts)"),
                       fun="boxplot", which2point=names(newIc_score)[i])
    interesting = which(names(samplesL) == names(newIc_score[i]))
    p = length(which(RandomIc_s[[i]][interesting,] >= newIc_score[[i]][names(newIc_score)[i]]))
    p = p/ncol(RandomIc_s[[i]])
    # first p value (p1) calculated for exactly at tipping point
    p2 = length(which(RandomIc_s[[i]] >= newIc_score[[i]][names(newIc_score)[i]]))
    p2 = p2/ncol(RandomIc_s[[i]])
    p2 = p2/nrow(RandomIc_s[[i]])
    p
    p2
    #  }  Not to run the full loop C plotting
    
    
## Alternatively, p values is estimated from delta scores
P3 = plot_SS_Simulation(newIc_score[[i]], RandomIc_s[[i]], 
                       xlim=c(0, max(newIc_score[[i]], RandomIc_s[[i]])),
                       las=2, 
                       main=paste(names(CTS)[i], length(CTS[[i]]), "genes, ", "\n", C, "Shuffling labels"))

## -----------------------------------------------------------------------------
P3

## ----echo=FALSE, fig.align='center', out.width = '60%'------------------------
knitr::include_graphics("CTS_Id.jpg")
knitr::include_graphics("delta.jpg")

## ----echo=FALSE, fig.align='center', out.width = '65%'------------------------
knitr::include_graphics("Fig2.jpg")

## -----------------------------------------------------------------------------
library(BioTIP)
data(gencode)
head(gencode)

## ----"python code", eval = FALSE----------------------------------------------
# gtf = ("Your/PATH/TO/THE/FILE")
# outF = open("gtf_summary_transbiotype.txt","w")
# 
# def getquote(str,f,target):
#     targetLen = len(target)+2
#     strInd = str.find(target)
#     st = strInd + len(target)+2
#     ed = st + str[st:].find("";")
#     #print(st,ed)
#     f.write("\t"+str[st:ed]) if strInd!= -1 else f.write("\t"+"NA.")
# 
# with open(gtf, "r") as f:
#      for line in f:
#         if line[0] != "#":
#             chromosome = line.split("\t")[0]
#             st = line.split("\t")[3]
#             ed = line.split("\t")[4]
#             strand = line.split("\t")[6]
#             type = line.split("\t")[2]
#             outF.write(chromosome+"\t"+st+"\t"+ed+"\t"+strand+"\t"+type)
#             c = "transcript_id"
#             g = "gene_name"
#             t = "transcript_type"
#             getquote(line,outF,c)
#             getquote(line,outF,g)
#             getquote(line,outF,t)
#             outF.write("\n")
# outF.close()

## -----------------------------------------------------------------------------
library(BioTIP)
library(GenomicRanges)
data(gencode)
head(gencode)

## -----------------------------------------------------------------------------
query <- GRanges(c("chr1:2-10:+","chr1:6-10:-"), Row.names = c("trans1","trans2"), score = c(1,2))
head(query)

## -----------------------------------------------------------------------------
library(BioTIP)
gr <- GRanges(c("chr1:1-5:+","chr1:2-3:+"),biotype = c("lincRNA","CPC"))
head(gr)

## -----------------------------------------------------------------------------
intron <- GRanges("chr1:6-8:+")
head(intron)

## -----------------------------------------------------------------------------
intron_trncp <- getBiotypes(query, gr, intron)
intron_trncp

## -----------------------------------------------------------------------------
library(BioTIP)
data("intron")
data("ILEF")
data("gencode")

gencode_gr = GRanges(gencode)
ILEF_gr = GRanges(ILEF)
cod_gr = GRanges(cod)
intron_gr = GRanges(intron)

non_coding <- getBiotypes(ILEF_gr, gencode_gr, intron_gr)
dim(non_coding)
head(non_coding[,1:3])

## -----------------------------------------------------------------------------
coding <- getBiotypes(ILEF_gr, gencode_gr)
dim(coding)
head(coding[,1:3])

## -----------------------------------------------------------------------------
library(BioTIP)

data(ILEF)
data(cod)
ILEF_gr = GRanges(ILEF)
cod_gr = GRanges(cod)

rdthrough <- getReadthrough(ILEF_gr, cod_gr)
head(rdthrough)

## ----SessionInfo--------------------------------------------------------------
sessionInfo()