# # R Code for reproducing results of the article # # The importance of knowing when to stop - # a sequential stopping rule for component-wise gradient boosting # # by A. Mayr, B. Hofner and M. Schmid # # Content # (1) Simulation study # (2) Data analysis # ########################################### # Note: Results were computed with R 2.13.0 library(mboost) # mboost 2.1-0 library(snow) # snow 0.3-3 library(penalized) # penalized 0.9-37 for the lasso # Code was written for the usage on a Cluster with 25 knots, using the snow package # and a pvm but can be easily adopted to run on a single knot ################# (1) Simulation study ####################################### # function AMP_sim carries out one simulation step # id = step number # n = number of observation # p = number of candidate covariates # maxstop = number of maximum boosting iteration for "bootstrap large grid" # approach AMP_sim <- function(id,n,p, maxstop){ p <- p-1 # because of intercept set.seed(id) # explanatory variables - uncorrelated X.train <- matrix(nrow=n, ncol=p, data=rnorm(n*p)) X.test <- matrix(nrow=1000, ncol=p, data=rnorm(1000*p)) # outcome with gaussian error term # setting as in Chang et al. y.train <- 1 + 5*X.train[,1] + 2*X.train[,2] + X.train[,9] + rnorm(n) y.test <- 1 + 5*X.test[,1] + 2*X.test[,2] + X.test[,9] + rnorm(1000) # data sets dat.train <- data.frame(y.train, X.train) dat.test <- data.frame(y.test, X.test) names(dat.train) <- c("y", paste("x", 1:p, sep="")) names(dat.test) <- names(dat.train) # cleanup rm(X.train, X.test, y.train, y.test) # set the band-with equal steplength actstop <- 40 # steplength # initial modelm = 40 gb1 <- glmboost(y~.,control=boost_control(mstop=actstop), data=dat.train) # search for best AIC model: mAIC while( mstop(AIC(gb1)) > 0.9*actstop){ actstop <- actstop + 40 # adding steplength gb1[actstop] } # set to best AIC model mstop.AIC <- mstop(AIC(gb1)) gb1 <- gb1[mstop.AIC] risk.AIC <- mean((dat.test$y - predict(gb1, newdata= dat.test))^2) selected.AIC <- selected(gb1) coefs.AIC <- coef(gb1, which="") # now resampling: subsample after AIC set.seed(id + 804) cvr <- cvrisk(gb1 ,papply =lapply, folds=cv(model.weights(gb1), type="subsampling")) mstop.s <- mstop(cvr) gb1 <- gb1[mstop.s] risk.s <- mean((dat.test$y - predict(gb1, newdata= dat.test))^2) selected.s <- selected(gb1) coefs.s <- coef(gb1, which="") # now subsampling with B=10: 10 fold subsampling after AIC set.seed(id + 804) cvr <- cvrisk(gb1 ,papply =lapply, folds=cv(model.weights(gb1), type="subsampling", B=10)) mstop.sb10 <- mstop(cvr) gb1 <- gb1[mstop.sb10] risk.sb10 <- mean((dat.test$y - predict(gb1, newdata= dat.test))^2) # compare to no early stopping gb1 <- gb1[maxstop] risk.max <- mean((dat.test$y - predict(gb1, newdata= dat.test))^2) # compare to full subsampling set.seed(id + 804) cvr <- cvrisk(gb1, papply =lapply, folds=cv(model.weights(gb1), type="subsampling")) mstop.smax <- mstop(cvr) gb1 <- gb1[mstop.smax] risk.smax <- mean((dat.test$y - predict(gb1, newdata= dat.test))^2) # compare to traditional cv gb1 <- gb1[maxstop] set.seed(id + 804) cvr <- cvrisk(gb1, papply =lapply, folds=cv(model.weights(gb1), type="kfold")) mstop.cv<- mstop(cvr) gb1 <- gb1[mstop.cv] risk.cv <- mean((dat.test$y - predict(gb1, newdata= dat.test))^2) # compare to lasso opt <-optL1(dat.train$y, penalized = dat.train[,-1]) pen <- penalized(dat.train$y, penalized = dat.train[,-1], lambda1=opt$lambda) risk.lasso <- mean((dat.test$y - predict(pen, penalized= dat.test[,-1])[,1])^2) list(risk.AIC = risk.AIC, risk.s = risk.s, risk.max = risk.max, risk.lasso = risk.lasso, mstop.AIC= mstop.AIC, mstop.s = mstop.s, mstop.max = maxstop, mstop.smax=mstop.smax, risk.smax =risk.smax, risk.sb10 =risk.sb10, mstop.sb10=mstop.sb10, mstop.cv = mstop.cv, risk.cv = risk.cv, selected.s = selected.s, coefs.s = coefs.s, selected.AIC = selected.AIC, coefs.AIC = coefs.AIC) } #### Carry out 200 simulation runs id <- 1:200 ### Note: infrastructure has to be set up ### PVM with 25 knots ### alternatively one can use AMP with a simple loop on one knot CL <- makePVMcluster(25) asd<-clusterEvalQ(CL, library(mboost, lib.loc="/net/192.168.245.150/export/cluster-home/amayr/R/i486-pc-linux-gnu-library/2.10")) asd<-clusterEvalQ(CL, library(penalized)) # classic setting ERG_class <- clusterApplyLB(CL,id, fun=AMP_sim, n=100, p=100, maxstop=1500) # save(ERG_class, file = "ERGn100_p100_stop1500_revisionmstop.RData") # high-dimensional setting ERG_high <- clusterApplyLB(CL,id, fun=AMP_sim, n=100, p=500, maxstop=10000) # save(ERG_high, file = "ERGn100_p500_stop10000_revisionmstop.RData") # ## Analyze results # # # ## classic setting # load("ERGn100_p100_stop1500_revisionmstop.RData") mstop.AIC <- sapply(ERG_class, function(x) x$mstop.AIC) mstop.s <- sapply(ERG_class, function(x) x$mstop.s) mstop.max <- sapply(ERG_class, function(x) x$mstop.max) mstop.smax <- sapply(ERG_class, function(x) x$mstop.smax) mstop.sb10 <- sapply(ERG_class, function(x) x$mstop.sb10) mstop.cv <- sapply(ERG_class, function(x) x$mstop.cv) risk.AIC <- sapply(ERG_class, function(x) x$risk.AIC) risk.s <- sapply(ERG_class, function(x) x$risk.s) risk.max <- sapply(ERG_class, function(x) x$risk.max) risk.smax <- sapply(ERG_class, function(x) x$risk.smax) risk.sb10 <- sapply(ERG_class, function(x) x$risk.sb10) risk.lasso <- sapply(ERG_class, function(x) x$risk.lasso) risk.cv <- sapply(ERG_class, function(x) x$risk.cv) ## high dimensional setting # load("ERGn100_p500_stop10000_revisionmstop.RData") mstop.AICh <- sapply(ERG_high, function(x) x$mstop.AIC) mstop.sh <- sapply(ERG_high, function(x) x$mstop.s) mstop.maxh <- sapply(ERG_high, function(x) x$mstop.max) mstop.smaxh <- sapply(ERG_high, function(x) x$mstop.smax) mstop.sb10h <- sapply(ERG_high, function(x) x$mstop.sb10) mstop.cvh <- sapply(ERG_high, function(x) x$mstop.cv) risk.AICh <- sapply(ERG_high, function(x) x$risk.AIC) risk.sh <- sapply(ERG_high, function(x) x$risk.s) risk.maxh <- sapply(ERG_high, function(x) x$risk.max) risk.smaxh <- sapply(ERG_high, function(x) x$risk.smax) risk.lassoh <- sapply(ERG_high, function(x) x$risk.lasso) risk.cvh <- sapply(ERG_high, function(x) x$risk.cv) risk.sb10h <- sapply(ERG_high, function(x) x$risk.sb10) #### Plots # stopping iterations postscript("Figure.eps", paper="a4", horizontal=FALSE) par(mfrow=c(3,2)) par(mar=c(4,4,1,0)+.1) boxplot(cbind(mstop.AIC, mstop.s, mstop.smax)[1:200,], names=rep("",3), ylab="mstop", las=1) mtext(side=1, at=1:3, c("AIC","subsampling", "subsampling"), line=1) mtext(side=1, at=1:3, c("","after AIC", "large grid"), line=2) for(i in 1:200) lines(1:3, c(mstop.AIC[i], mstop.s[i],mstop.smax[i]),col="lightgrey" ) boxplot(cbind(mstop.AIC, mstop.s, mstop.smax)[1:200,], add=T, names=rep("",3), las=1) boxplot(cbind(mstop.AICh, mstop.sh, mstop.smaxh)[1:200,],names=rep("",3), ylab="mstop", las=1) for(i in 1:200) lines(1:3, c(mstop.AICh[i], mstop.sh[i],mstop.smaxh[i]),col="lightgrey" ) mtext(side=1, at=1:3, c("AIC","subsampling", "subsampling"), line=1) mtext(side=1, at=1:3, c("","after AIC", "large grid"), line=2) boxplot(cbind(mstop.AICh, mstop.sh, mstop.smaxh)[1:200,], add=T, names=rep("",3), las=1) # risks boxplot(cbind(risk.AIC, risk.s, risk.smax)[1:200,], names=rep("",3), ylab="predictive risk", las=1) mtext(side=1, at=1:3, c("AIC","subsampling", "subsampling"), line=1) mtext(side=1, at=1:3, c("","after AIC", "large grid"), line=2) for(i in 1:200) lines(1:3, c(risk.AIC[i], risk.s[i],risk.smax[i]),col="lightgrey" ) boxplot(cbind(risk.AIC, risk.s, risk.smax)[1:200,], add=T, names=rep("",3) , las=1) boxplot(cbind(risk.AICh, risk.sh, risk.smaxh)[1:200,],names=rep("",3), ylab="predictive risk", las=1) for(i in 1:200) lines(1:3, c(risk.AICh[i], risk.sh[i],risk.smaxh[i]),col="lightgrey" ) mtext(side=1, at=1:3, c("AIC","subsampling", "subsampling"), line=1) mtext(side=1, at=1:3, c("","after AIC", "large grid"), line=2) boxplot(cbind(risk.AICh, risk.sh, risk.smaxh)[1:200,], add=T, names=rep("",3), las=1) # Read runnning-times from private laptop #load("K:/paper/mstop/R-Code/Revision/run_times_class.RData") #load("K:/paper/mstop/R-Code/Revision/run_times_high.RData") # boxplot(cbind(rt.aic, rt.s + rt.aic, rt.smax)[1:100,], names=rep("",3), ylab="running-time [s]", las=1) # mtext(side=1, at=1:3, c("AIC","subsampling", "subsampling"), line=1) # mtext(side=1, at=1:3, c("","after AIC", "large grid"), line=2) # for(i in 1:200) lines(1:3, c(rt.aic[i], rt.s[i] + rt.aic[i],rt.smax[i]),col="lightgrey" ) # boxplot(cbind(rt.aic, rt.s+rt.aic, rt.smax)[1:100,], add=T, names=rep("",3) , las=1) # boxplot(cbind(rt.aich, rt.sh + rt.aich, rt.smaxh)[1:100,],names=rep("",3), ylab="running time [s]", las=1) # for(i in 1:100) lines(1:3, c(rt.aich[i], rt.sh[i] +rt.aich[i],rt.smaxh[i]),col="lightgrey" ) # mtext(side=1, at=1:3, c("AIC","subsampling", "subsampling"), line=1) # mtext(side=1, at=1:3, c("","after AIC", "large grid"), line=2) # boxplot(cbind(rt.aich, rt.sh + rt.aich, rt.smaxh)[1:100,], add=T, names=rep("",3), las=1) dev.off() # #### variable selection # # ## classic setting selected_s <- matrix(ncol=length(ERG_class[[1]]$coefs.s),nrow=200) selected_AIC <- matrix(ncol=length(ERG_class[[1]]$coefs.AIC),nrow=200) for(i in 1:200){ selected_s[i,] <- as.numeric(1:dim(selected_s)[2] %in% ERG_class[[i]]$selected.s) selected_AIC[i,] <- as.numeric(1:dim(selected_AIC)[2] %in% ERG_class[[i]]$selected.AIC) } # # # selection rate of non-inf: base-learners nr 2,3 and 10 refer to informative covariates # # X1, X2 and X9 Note: X9 was changed in the manuscript to X3 in order to reduce complexity # # of the simulation setting mean(apply(selected_s, mean, MARGIN=2)[-c(2,3,10)]) mean(apply(selected_AIC, mean, MARGIN=2)[-c(2,3,10)]) # # # high-dim setting selected_sh <- matrix(ncol=length(ERG_high[[1]]$coefs.s),nrow=200) selected_AICh <- matrix(ncol=length(ERG_high[[1]]$coefs.AIC),nrow=200) for(i in 1:200){ selected_sh[i,] <- as.numeric(1:dim(selected_sh)[2] %in% ERG_high[[i]]$selected.s) selected_AICh[i,] <- as.numeric(1:dim(selected_AICh)[2] %in% ERG_high[[i]]$selected.AIC) } # # # selection rate of non-inf mean(apply(selected_sh, mean, MARGIN=2)[-c(2,3,10)]) mean(apply(selected_AICh, mean, MARGIN=2)[-c(2,3,10)]) ######################## (2) Data analysis ##################################### # Get the Barrier et al data expr <- read.csv("http://www.stat.berkeley.edu/users/sandrine/Docs/Papers/JCO06/Normalized%20data.txt", header = TRUE, sep = "\t", colClasses = c("character", rep("numeric", 50))) genes <- expr[,1] patnr <- colnames(expr)[-1] expr <- t(as.matrix(expr[,-1])) rownames(expr) <- patnr colnames(expr) <- genes #save(expr, file = "Barrier_et_al_expr.Rda") # function AMP_barrier carries out one LOO step # id = step number # # Again, code was designed to run on a multi-knot cluster but can # be easiy adopted to run in a loop on a single machine # load data load("Barrier_et_al_expr.Rda") AMP_barrier <- function(id, expr){ # Data processing event <- c(rep(FALSE,25), rep(TRUE, 25)) y <- as.factor(event) # Leave-one-out y.train <- y[-id] expr.train <- expr[-id,] y.test <- y[id] expr.test <- expr[id,] expr.test <- as.matrix(t(expr.test)) colnames(expr.test) <- colnames(expr.train) # step width actstop <- 40 # initial model gb1 <- glmboost(y=y.train , x=expr.train , control=boost_control(mstop=actstop), family=Binomial()) # cleanup rm(y, expr) gc(reset=T) # search for best AIC model while( mstop(AIC(gb1, method="classical")) > 0.9*actstop ){ actstop <- actstop + 40 # adding steplength gb1[actstop] } # set to best AIC model mstop.AIC <- mstop(AIC(gb1, method="classical")) gb1 <- gb1[mstop.AIC] coef.AIC <- coef(gb1) selected.AIC <- selected(gb1) # get the predictions on test obs predsAIC <- predict(gb1, newdata=(expr.test)) # subsampling after AIC set.seed(id + 804) cvr <- cvrisk(gb1, papply=lapply, folds=cv(model.weights(gb1), type="subsampling", B=25)) mstop.s <- mstop(cvr) gb1 <- gb1[mstop.s] preds.s <- predict(gb1, newdata=as.matrix(expr.test)) coef.s <- coef(gb1) selected.s <- selected(gb1) # compare to lasso opt <-optL1(y.train, penalized = expr.train) pen <- penalized(y.train, penalized = expr.train, lambda1=opt$lambda) predslasso <- predict(pen, penalized= expr.test) list(mstop.AIC, mstop.s, predsAIC, preds.s, coef.AIC, selected.AIC, coef.s, selected.s, predslasso) } ### ### run function on cluster # ids for leace-one-out id <- 1:50 # cluster with 25 knots, via pvm and snow library(snow) CL <- makePVMcluster(25) asd<-clusterEvalQ(CL, library(mboost, lib.loc="/net/192.168.245.150/export/cluster-home/amayr/R/i486-pc-linux-gnu-library/2.10")) asd<-clusterEvalQ(CL, library(penalized)) ERG_barrier <- clusterApplyLB(CL, id, expr, fun=AMP_barrier) # save(ERG_barrier, file="ERG_barrier_revisionmstop.RData") # load("ERG_barrier_revisionmstop.RData") ## Get results out of the ERG list mstopAIC <- sapply(ERG_barrier, function(x) x[[1]]) mstops <- sapply(ERG_barrier, function(x) x[[2]]) predAIC <- sapply(ERG_barrier, function(x) x[[3]]) preds <- sapply(ERG_barrier, function(x) x[[4]]) coefAIC <- sapply(ERG_barrier, function(x) x[[5]]) selectedAIC <- sapply(ERG_barrier, function(x) x[[6]]) coefs <- sapply(ERG_barrier, function(x) x[[7]]) selecteds <- sapply(ERG_barrier, function(x) x[[8]]) predslasso <- sapply(ERG_barrier, function(x) x[[9]]) ## outcome event <- c(rep(FALSE,25), rep(TRUE, 25)) ## get the performance measures the old-fashioned way sensAIC <- sum(predAIC >0 & event)/sum(event) sensbs <- sum(preds >0 & event)/sum(event) ## spezAIC <- sum(predAIC <0 & !event)/sum(!event) spezbs <- sum(preds <0 & !event)/sum(!event) ## perrAIC <- (sum(predAIC > 0 & !event) + sum(predAIC < 0 & event)) /length(predAIC) perrbs <- (sum(preds > 0 & !event) + sum(preds < 0 & event)) /length(preds) ## overall accuracy: 1 - perrAIC 1 - perrbs ## Get the AUC library(Epi) ROC(test=predAIC, stat=event, plot=F)$AUC ROC(test=preds, stat=event, plot=F)$AUC ROC(test=predslasso, stat=event, plot=F)$AUC ## get number of selected genes median(sapply(selectedAIC, function(x) length(unique(x)))) range(sapply(selectedAIC, function(x) length(unique(x)))) ## median(sapply(selecteds, function(x) length(unique(x)))) range(sapply(selecteds, function(x) length(unique(x)))) # To get various classifiers in one plot: library(Daim) ## With lasso rAIC <-roc(predAIC, labels=event, labpos=TRUE, bestcut=F) rbs <-roc(preds, labels=event, labpos=TRUE) rlasso <-roc(predslasso, labels=event, labpos=TRUE) ## Build the plot par(mar=c(4,4,0,0)+.1) plot(x=rbs$FPR, y=rbs$TPR, xlab="false positive rate", ylab="true positive rate", main="", las=1, type="n") abline(h=seq(0,1, by=0.2), col="lightgrey", lty=3) abline(v=seq(0,1, by=0.2), col="lightgrey", lty=3) lines(x=rbs$FPR, y=rbs$TPR, lwd=2) lines(x=rAIC$FPR, y=rAIC$TPR, lty=2, lwd=2) lines(x=rlasso$FPR, y=rlasso$TPR, lty=3, lwd=2) abline(a=0,b=1, col="grey") legend("bottomright", bty="n", lwd=2, lty=c(1:3), c("sequential stopping", "AIC based stopping", "Lasso")) ################################################################ ## Last mod Feb 12 2012 ## ## For questions with respect to the code or the article please contact ## the corresponding author: andreas.mayr at imbe.med.uni-erlangen.de