################################################### ### chunk number 1: startUP ################################################### ## This rather nasty chunk of code is only necessary for reproducing ## the random division into training and validation sets library("RbcBook1") library("kidpack") data(eset) set.seed(32) x <- t(exprs(eset)) y <- pData(phenoData(eset))$type y <- as.factor(y) y.train <- y[-c(1:10) ] x.train <- x[-c(1:10),] t.size <- length(y.train) l.size <- round((2/3)*t.size) v.size <- t.size-l.size l.samp <- NULL K <- nlevels(y.train) props <- round(l.size/t.size * table(y.train)) props[1] <- l.size - sum(props[2:K]) for (k in 1:K) { y.num <- as.numeric(y.train) l.samp <- c(l.samp, sample(which(y.num==k))[1:props[k]]) } v.samp <- (1:t.size)[-l.samp] library("multtest") yl.num <- as.numeric(y.train[l.samp])-1 xl.mtt <- t(x.train[l.samp,]) f.stat <- mt.teststat(xl.mtt, yl.num, test = "f") best.genes <- rev(order(f.stat))[1:200] x.learn <- x.train[l.samp, best.genes] x.valid <- x.train[v.samp, best.genes] y.learn <- y.train[l.samp] y.valid <- y.train[v.samp] library("sma") yl.num <- as.numeric(y.learn) yv.num <- as.numeric(y.valid)-1 dlda.predic <- stat.diag.da(x.learn, yl.num, x.valid) dlda.error <- mean(dlda.predic$pred-1 != yv.num) library("class") knn.error <- numeric(3) for(k in c(1,3,5)) { i <- ((k-1)/2)+1 knn.predic <- knn(x.learn, x.valid, y.learn, k = k) knn.error[i] <- mean(knn.predic!=y.valid) } library("e1071") svm.error <- matrix(0, nrow = 3, ncol = 3) for(cost in 0:2) { for(gamma in (-1):1) { i <- cost+1 j <- gamma+2 svm.fit <- svm(x.learn, y.learn, cost = 2^cost, gamma = 2^gamma/ncol(x.learn)) svm.predic <- predict(svm.fit, newdata = x.valid) svm.error[i,j] <- mean(svm.predic != y.valid) } } randiv.repr <- function(x,y) { t.size <- length(y) l.size <- round((2/3)*t.size) v.size <- t.size-l.size l.samp <- NULL K <- nlevels(y) props <- round(l.size/t.size * table(y)) props[1] <- l.size - sum(props[2:K]) for (k in 1:K) { y.num <- as.numeric(y) l.samp <- c(l.samp, sample(which(y.num==k))[1:props[k]]) } v.samp <- (1:t.size)[-l.samp] yl.num <- as.numeric(y[l.samp])-1 f.stat <- mt.teststat(t(x[l.samp,]), yl.num, test = "f") best.genes <- rev(order(f.stat))[1:200] x.learn <- x[l.samp, best.genes] x.valid <- x[v.samp, best.genes] y.learn <- y[l.samp] y.valid <- y[v.samp] yl.num <- as.numeric(y.learn) yv.num <- as.numeric(y.valid)-1 dlda.predic <- stat.diag.da(x.learn, as.numeric(y.learn), x.valid) dlda.error <- mean(dlda.predic$pred-1 != yv.num) knn.error <- numeric(3) for(k in c(1,3,5)) { i <- ((k-1)/2)+1 knn.predic <- knn(x.learn, x.valid, y.learn, k = k) knn.error[i] <- mean(knn.predic!=y.valid) } svm.error <- matrix(0, nrow = 3, ncol = 3) for(cost in 0:2) { for(gamma in (-1):1) { i <- cost+1 j <- gamma+2 svm.fit <- svm(x.learn, y.learn, cost = 2^cost, gamma = 2^gamma/ncol(x.learn)) svm.predic <- predict(svm.fit, newdata = x.valid) svm.error[i,j] <- mean(svm.predic != y.valid) } } list(dlda = dlda.error, knn = knn.error, svm = svm.error, lsamp = l.samp, vsamp = v.samp) } runs <- 50 dlda.errors <- numeric(runs) knn.errors <- matrix(0, nrow = 3, ncol = runs) svm.errors <- array(0, dim = c(3, 3, runs)) vsamp <- matrix(0, runs, length(v.samp)) lsamp <- matrix(0, runs, length(l.samp)) for(r in 1:runs) { results <- randiv.repr(x.train, y.train) dlda.errors[ r] <- results$dlda knn.errors[, r] <- results$knn svm.errors[,,r] <- results$svm vsamp[r,] <- results$vsamp lsamp[r,] <- results$lsamp cat("This was run", r, "of", runs, "\n") } ################################################### ### chunk number 2: loadkidney ################################################### set.seed(32) library("kidpack") data(eset) ################################################### ### chunk number 3: wegzehn ################################################### test <- (1:10) train <- (1:length(eset$type))[-test] # needed! trEset <- eset[,train] ################################################### ### chunk number 4: ################################################### t.size <- length(trEset$type) l.size <- round((2/3)*t.size) v.size <- t.size-l.size ################################################### ### chunk number 5: ################################################### K <- nlevels(factor(trEset$type)) l.samp <- NULL props <- round(l.size/t.size * table(trEset$type)) props[1] <- l.size - sum(props[2:K]) for (k in 1:K) { y.num <- as.numeric(factor(trEset$type)) l.samp <- c(l.samp, sample(which(y.num==k))[1:props[k]]) } v.samp <- (1:t.size)[-l.samp] ################################################### ### chunk number 6: ################################################### table(trEset$type[l.samp]) table(trEset$type[v.samp]) ################################################### ### chunk number 7: ################################################### library("multtest") yl.num <- as.numeric(factor(trEset$type[l.samp]))-1 xl.mtt <- exprs(trEset[,l.samp]) f.stat <- mt.teststat(xl.mtt, yl.num, test = "f") best.genes <- rev(order(f.stat))[1:200] trselEset <- trEset[best.genes,] ################################################### ### chunk number 8: ################################################### library("sma") library("MLInterfaces") l.samp <- as.integer(l.samp) dlda.predic <- stat.diag.daB(trselEset, "type", l.samp) conf.matrix <- confuMat(dlda.predic) error.rate <- function(cm) 1-sum(diag(cm))/sum(cm) dlda.error <- error.rate(conf.matrix) ################################################### ### chunk number 9: ################################################### library("class") knn.error <- numeric(3) for(k in c(1,3,5)) { i <- ((k-1)/2)+1 knn.predic <- knnB(trselEset, "type", l.samp, k = k, prob = FALSE) knn.error[i] <- error.rate(confuMat(knn.predic)) } ################################################### ### chunk number 10: ################################################### library("e1071") svm.error <- matrix(0, nrow = 3, ncol = 3) for(cost in 0:2) { for(gamma in (-1):1) { i <- cost+1 j <- gamma+2 svm.fit <- svmB(trselEset, "type", l.samp, cost=2^cost, gamma = 2^gamma/nrow(exprs(trselEset)), type="C-classification") svm.error[i,j] <- error.rate(confuMat(svm.fit)) } } ################################################### ### chunk number 11: ################################################### dlda.error knn.error svm.error ################################################### ### chunk number 12: ################################################### comment <- function(x) x ################################################### ### chunk number 13: ################################################### randiv <- function(trEset) { "\#body suppressed; repeat all the code from Section 24.3" list(dlda=dlda.error, knn=knn.error, svm=svm.error) } ################################################### ### chunk number 14: ################################################### randiv <- function(trEset, run) { ## Defining the size of learning and validation sets t.size <- length(trEset$type) l.size <- round((2/3)*t.size) v.size <- t.size-l.size ## Balanced sampling v.samp <- vsamp[run,] l.samp <- lsamp[run,] ## Variable selection yl.num <- as.numeric(factor(trEset$type[l.samp]))-1 xl.mtt <- exprs(trEset[,l.samp]) f.stat <- mt.teststat(xl.mtt, yl.num, test = "f") best.genes <- rev(order(f.stat))[1:200] trselEset <- trEset[best.genes,] ## Classification with DLDA l.samp <- as.integer(l.samp) dlda.predic <- stat.diag.daB(trselEset, "type", l.samp) conf.matrix <- confuMat(dlda.predic) dlda.error <- error.rate(conf.matrix) ## Classification with Nearest Neighbors knn.error <- numeric(3) for(k in c(1,3,5)) { i <- ((k-1)/2)+1 knn.predic <- knnB(trselEset, "type", l.samp, k = k, prob = FALSE) knn.error[i] <- error.rate(confuMat(knn.predic)) } ## Classification with Support Vector Machines svm.error <- matrix(0, nrow = 3, ncol = 3) for(cost in 0:2) { for(gamma in (-1):1) { i <- cost+1 j <- gamma+2 svm.fit <- svmB(trselEset, "type", l.samp, cost=2^cost, gamma = 2^gamma/nrow(exprs(trselEset)), type="C-classification") svm.error[i,j] <- error.rate(confuMat(svm.fit)) } } ## Output list(dlda = dlda.error, knn = knn.error, svm = svm.error) } ################################################### ### chunk number 15: ################################################### runs <- 50 dlda.errors <- numeric(runs) knn.errors <- matrix(0, nrow = 3, ncol = runs) svm.errors <- array(0, dim = c(3, 3, runs)) for(r in 1:runs) { results <- randiv(trEset, r) dlda.errors[ r] <- results$dlda knn.errors[, r] <- results$knn svm.errors[,,r] <- results$svm cat("This was run", r, "of", runs, "\n") } ################################################### ### chunk number 16: ################################################### mean(dlda.errors) apply(knn.errors, 1, mean) apply(svm.errors, c(1,2), mean) ################################################### ### chunk number 17: ################################################### ## These functions are for drawing boxplots and barplots of error rates ## Loading the required package library("grid") ## Check the settings if (!exists("pushViewport")) pushViewport <- push.viewport if (!exists("popViewport")) popViewport <- pop.viewport ## The function for the barplots grid.barplot <- function(x, xlab = FALSE, main = FALSE, xscale = NULL, yticks, ylabels) { ## Customizing the data y <- table(x) y.alt <- table(x) x.alt <- x x <- sort(unique(x)) if(!is.null(xscale)) { index <- which(x > xscale[1] & x < xscale[2]) x <- x[index] y <- y[index] } ## Horizontal bars grid.segments(x0 = unit(x, "native"), y0 = unit(0, "native"), x1 = unit(x, "native"), y1 = unit(y.alt, "native")) ## Drawing the lines grid.lines(unit(x, "native"), unit(y, "native")) ## Plotting the mean brks <- as.numeric(attributes(table(x))$dimnames$x) start <- max(which(brks 0) { if(!is.null(xscale)) index <- which(x$out > xscale[1] & x$out < xscale[2]) else index <- 1:n if(length(index) > 0) { grid.points(unit(x$out[index],"native"),unit(rep(0.5,length(index)), "npc"), size = unit(0.5, "char")) } } ## Annotation if(xlab) grid.xaxis() if(!is.null(ylab)) { grid.text(ylab, x = unit(0, "npc") - unit(0.5, "lines"), gp = gpar(fontface = "bold"), just = c("right", "centre")) } } ## Plotting the results grid.boxNbar <- function(daten, mname) { grid.newpage() pushViewport(plotViewport(c(4, 6, 4, 4))) #pushViewport(viewport(layout=lyt)) n <- ncol(daten) ## Calibration of the x-axis xsc <- range(daten) + c(-1, 1) * max(daten)/15 xsc2 <- xsc ## Calibration of the y-axis ysc <- c(0,max(sapply(apply(daten,2,function(x) table(x)),max))) ysc <- ysc * 1.1 ylabels <- c("0", "10", "20", "30", "40", "50") ynumb <- floor((ysc[2]+3)/10)-1 yticks <- (0:ynumb)*10 ylabels <- ylabels[1:(ynumb+1)] ## The plot contains 3 pieces lyt <- grid.layout(1,3,widths=unit(c(0.5-0.03/2,0.03,0.5-0.03/2),"npc")) pushViewport(viewport(layout=lyt)) ## Left side: boxplots lyt <- grid.layout(n,1,heights=unit(rep(1/n, n), "npc")) pushViewport(viewport(layout.pos.row=1, layout.pos.col=1,xsc=xsc)) pushViewport(viewport(layout=lyt)) grid.rect() for(i in n:1) { pushViewport(viewport(layout.pos.r=i,layout.pos.c=1,xsc=xsc)) main <- (i < 2) grid.boxplot(daten[,i],m=main,xl=(i>(n-1)),yl=mname[i],xsc=xsc) popViewport(1) } popViewport(1) grid.text("Boxplot", y = unit(-2.5, "lines")) ## Middle: an empty frame popViewport(1) #schrift <- gpar(fontface="bold") #grid.text(namen[j], y=unit(1,"npc")+unit(2,"lines"), gp=schrift) ## Right side: density plots lyt <- grid.layout(n,1,hei=unit(rep(1/n,n),"npc")) pushViewport(viewport(layout.pos.row=1,layout.pos.col=3,xscale=xsc)) pushViewport(viewport(layout=lyt)) grid.rect() for(i in n:1) { pushViewport(viewport(layout.pos.r=i,layout.pos.c=1,xs=xsc2,ys=ysc)) main <- (i < 2) grid.barplot(daten[,i],main,xsc2,yticks,ylabels,xlab=(i>(n-1))) popViewport(1) } popViewport(1) grid.text("Density", y=unit(-2.5,"lines")) popViewport(3) } ################################################### ### chunk number 18: ################################################### daten <- cbind(dlda.errors, knn.errors[2,], svm.errors[3,1,], svm.errors[2,1,]) mname <- c("DLDA", "kNN", "SVM1", "SVM2") grid.boxNbar(daten, mname) ################################################### ### chunk number 19: ################################################### yt.num <- as.numeric(factor(trEset$type))-1 xt.mtt <- exprs(trEset) f.stat <- mt.teststat(xt.mtt, yt.num, test = "f") best.genes <- rev(order(f.stat))[1:200] selEset <- eset[best.genes,] dlda.predic <- stat.diag.daB(selEset, "type", train) dlda.predic@RObject$pred-1 ################################################### ### chunk number 20: ################################################### confuMat(dlda.predic)