# 09/14/2007 Functional Regression Tree #Construction of the theoretical tree # with four terminal nodes. # See # David Nerini and Badih Ghattas, Classifying densities using functional regression trees: # Applications in oceanology # Computational Statistics & Data Analysis, Volume 51, Issue 10, 15 June 2007, Pages 4984-4993 # for more details # # see EOF for mymet object to be compiled before launching the construction # of the tree graphics.off() # for this source, see EOF source("mymet.R") library(rpart) library(flexmix) # Generate n densities following the theoretical decision rules on X # n is the whole number of observations # s is the number of points randomly drawn to estimate the densities into each regions # nobs is the size of the mesh used to sample each kernel estimated density s<-100 nobs<-128 n = 300 X1 = runif(n,-1,1) X2 = runif(n,-1,1) X3 = runif(n,-1,1) X4 = runif(n,-1,1) ind = numeric(n) ind[X3 < 0.5 & X4 < 0.5] = 1 ind[X3 < 0.5 & X4 > 0.5] = 2 ind[X3 > 0.5 & X4 < 0.5] = 3 ind[X3 > 0.5 & X4 > 0.5] = 4 yy = list() for(i in 1:n){ if(X3[i] < 0.5 & X4[i] < 0.5) yy[[i]] = c(rnorm(3*s/5,-1,0.5),rnorm(2*s/5,1,0.5)) if(X3[i] < 0.5 & X4[i] > 0.5) yy[[i]] = rnorm(s,0,1.5) if(X3[i] > 0.5 & X4[i] < 0.5) yy[[i]] = c(rnorm(2*s/5,-1,0.5),rnorm(3*s/5,1,0.5)) if(X3[i] > 0.5 & X4[i] > 0.5) yy[[i]] = rnorm(s,0,1) } dens<-list() xabs<-seq(-7,7,length=nobs) for(i in 1:n){ dens[[i]]<-density(yy[[i]],from=-7,to=7,n=nobs,bw=bw.nrd0(yy[[i]]))$y } xx= cbind(X1,X2,X3,X4) yy<-matrix(abs(unlist(dens)),n,nobs,byrow=T) # plot the random sampled density estimations ranged by colour (the regions) x11() matplot(xabs,t(yy),type="l",col=ind,lty=1,main="sampled densities",xlab="time",ylab="density") # construct a test sample, 1/3 of the initial data # build a tree on a learning sample testsamp<-sample(1:n,100,replace=F) yyc<-yy[-testsamp,] xxc<-xx[-testsamp,] X1<-xxc[,1] X2<-xxc[,2] X3<-xxc[,3] X4<-xxc[,4] form0 = as.formula(paste("yyc~ ","X1+X2+X3+X4")) # construct the tree using L2 distance as splitting criterion # and the RPART package # see mymet (EOF) for more information # mymet can easily be modified to change the metrics # results are strongly sensible to the arguments in rpart.control # xval controls the number of cross-validation steps # play carefully with it restree = rpart(form0,method=mymet,control=rpart.control(cp=.01,xval=0,minbucket=5, maxsurrogate=0,minsplit=1)) # plot of the constructed tree # error Rtot estimated with the test sample # is calculated by Riemman's sum colnames(xx)<-paste("X",1:4,sep="") step<-(max(xabs)-min(xabs))/(nobs-1) yy.pred<-predict(restree,newdata=data.frame(xx[testsamp,]),type="matrix") yy.res<-(yy[testsamp,]-yy.pred)^2 Rtot<-1/100*sum(yy.res*step) x11() plot(restree,main=paste("Rtot=",round(Rtot,6))) text(restree) # predicted densities in each leaves # data are plotted for each terminal nodes # from the left to the right nleaves = sum(restree$frame$var == "") ll = split(names(restree$where),restree$where) ll = lapply(ll,as.numeric) numrow<-which(restree$frame$var == "") x11() par(mfrow=c(2,2)) for(i in 1:nleaves){ matplot(xabs,t(yyc[ll[[i]],]),type="l",col="dark grey",lty=1, xlab="time",ylab="density",las=1,ylim=c(0,0.5),xlim=c(-4.5,4.5)) y.m<-colMeans(yyc[ll[[i]],]) points(xabs,y.m,type="l",col=1,lwd=2) } ######################## mymet file ########################################### mymet = list() #mymet is a list of length 4 and allows to calculate the criterion at each step #of the splitting process using the RPART R package. #See EOF example to call the method with RPART. #Here, Euclidean distances between functional observations are used but the functions #can easily be modified to compute other distances # An error can occur when compiling this function, but it has no matter... # Things must be improved.... mymet$eval = function (y, w, p) { if (is.null(nrow(y))) risk = 0 else { moy = colMeans(y) risk = sweep(y, 2, moy, "-") risk = sum(risk^2) } list(label = moy, deviance = risk) } mymet$split = function (Y, w, X, p, continuous=T) { crit = function(xx) { if (is.null(nrow(xx))) yy = 0 else { moy = colMeans(xx) yy = sweep(xx,2,moy,"-") yy = sum(yy^2) } yy } N = length(X) res = numeric() seuils = numeric() ldev = numeric() val0 = crit(Y) for (i in 1:(N - 1)) { ind = 1:i res[i] = crit(Y[ind, ]) + crit(Y[-ind, ]) } res = val0-res cat("resmin=",min(res),fill=T) dir= rep(1,length(res)) if(continuous) res0=list(goodness = res, direction=dir) else res0=list(goodness =, direction=) res0 } mymet$init = function (y, offset, wt) { resume = function(){"test" } list(y = y, numy = ncol(y), numresp = ncol(y), parms = NULL,summary=resume) } mymet$numy = ncol(y) #Example using a 3 dimensional response variable (datay) and 5 colums of the predictor matrix (datax) #to build a functional regression tree #Argument values in function RPART are available with help files of package RPART # res.tree = rpart(as.formula(paste("as.matrix(datay[,1:3]) ~ # ",paste("datax[,",1:5,"]",collapse="+"))),method=mymet, # control=rpart.control(cp=.0002,xval=0,minbucket=3,maxsurrogate=0,minsplit=2))