# This file contains the commands to obtain the results
# for the paper "Variable selection for Additive Models by Nonnegative Garrote"
# by E. Cantoni, J. Mills Flemming and E. Ronchetti 
#
#
# Warning: factor(PERIOD)+factor(NAREA)
# has always to be the first variable of the model

library(mgcv)
source("NPgarrBlueSharks.R")
# BLUE is the initial data frame, read from a whitespace-separated text file
# with a header line.
BLUE <- read.table("BLUE.txt", header = TRUE)

# Data need to be centered: the six continuous covariates are centered
# (column means subtracted) but not rescaled.
contBLUE <- as.matrix(
  BLUE[, c("DOFY", "NLIGHTST", "SOAKTIME", "AVGHKDEP", "OCEAND1", "TEMP1")]
)
scaledBLUE <- scale(contBLUE, center = TRUE, scale = FALSE)

# Working data frame: centered covariates (columns 1-6), the untouched
# columns BSH, NAREA, PERIOD, TOTHOOKS (columns 7-10) and the response
# logBSH = log(BSH + 1) (column 11).
allBLUE <- data.frame(
  scaledBLUE,
  BLUE[, c("BSH", "NAREA", "PERIOD", "TOTHOOKS")],
  logBSH = log(BLUE[, "BSH"] + 1)
)

# Take out 1 observation, the only one with PERIOD=3,
# because problems with the validation sample, if not in construction sample.
# A logical mask is used instead of -which(...): if no row had PERIOD == 3,
# -which(...) would be -integer(0) and would silently drop EVERY row.
# %in% treats NA as "not equal", so rows with a missing PERIOD are kept,
# exactly as which() did.
allBLUE <- allBLUE[!(allBLUE$PERIOD %in% 3), ]


# Initial fit: additive model for logBSH with one smooth per centered
# covariate and log(TOTHOOKS) as offset. No sp= is supplied in this call.
initialfit <- gam(
  logBSH ~ s(DOFY) + s(NLIGHTST) + s(SOAKTIME) + s(AVGHKDEP) + s(OCEAND1) +
    s(TEMP1) + offset(log(TOTHOOKS)),
  data = allBLUE
)

# Smoothing parameters of the initial fit; they are passed as sp= to every
# later fit in this script.
Bluespar <- initialfit[["sp"]]

# CROSS-VALIDATION choice of s
# s.grid holds the candidate values of s (passed as sumck= further below).
folds <- 10
s.grid <- seq(from = 0.1, to = 10, by = 0.2)
n <- nrow(allBLUE)

# Output variables:
#   Blue.ME  - one row per fold, one column per value of s
#   Blue.PE  - one entry per value of s
#   Blues.ck - one row per value of s, one column per smooth term (6)
Blue.ME <- matrix(0, nrow = folds, ncol = length(s.grid))
Blue.PE <- numeric(length(s.grid))
Blues.ck <- matrix(0, nrow = length(s.grid), ncol = 6)

# Splits: fixed seed so that the random permutation is reproducible
set.seed(412)
# Data randomization: shuffle the rows before cutting them into folds
randomsplit <- sample(seq_len(n), size = n)
Blue.CVdata <- allBLUE[randomsplit, ]

# Data split: `folds` consecutive groups of n %/% folds rows each; if the
# data cannot be split evenly, the leftover n %% folds rows all go to the
# last fold, which is therefore the (only) larger one.
foldid <- c(
  rep(seq_len(folds), each = n %/% folds),
  rep(folds, times = n %% folds)
)

# Cross-validation loop: for every candidate s (outer loop) and every fold
# (inner loop), fit on the construction sample, predict the validation sample
# with the per-term fits multiplied by the cks returned by NPgarrBlueSharks,
# and store the mean squared prediction error in Blue.ME[fold, s].

# Covariates that are re-centered within each fold, and the columns that are
# carried over unchanged. Their order makes logBSH column 11 of the
# per-fold data frames, matching ycolumn=11 below.
cv.xcols <- c("DOFY", "NLIGHTST", "SOAKTIME", "AVGHKDEP", "OCEAND1", "TEMP1")
cv.ycols <- c("BSH", "NAREA", "PERIOD", "TOTHOOKS", "logBSH")

for (j in seq_along(s.grid)) {
  print(j)  # progress indicator

  for (i in seq_len(folds)) {
    in.valid <- foldid == i

    # Centering of the x's of the construction sample
    cscaled <- scale(Blue.CVdata[!in.valid, cv.xcols],
                     center = TRUE, scale = FALSE)
    thiscsample <- data.frame(cscaled, Blue.CVdata[!in.valid, cv.ycols])

    # Subtract the mean of the construction sample from the x's of the
    # validation sample. The means must come from the UNcentered construction
    # data (scale() stores them in the "scaled:center" attribute); the means
    # of the already centered thiscsample are ~0 and would leave the
    # validation sample unshifted.
    thisvsample <- data.frame(
      sweep(Blue.CVdata[in.valid, cv.xcols], 2, attr(cscaled, "scaled:center")),
      Blue.CVdata[in.valid, cv.ycols]
    )

    # cks for this construction sample and this value of s
    NPgarr.cross <- NPgarrBlueSharks(logBSH~s(DOFY)+s(NLIGHTST)+s(SOAKTIME)+s(AVGHKDEP)+s(OCEAND1)+s(TEMP1)+offset(log(TOTHOOKS)),sp=Bluespar,ycolumn=11,data=thiscsample,sumck=s.grid[j])

    # Additive fit on the construction sample with the fixed smoothing
    # parameters of the initial fit
    thisgam <- gam(logBSH~s(DOFY)+s(NLIGHTST)+s(SOAKTIME)+s(AVGHKDEP)+s(OCEAND1)+s(TEMP1)+offset(log(TOTHOOKS)),sp=Bluespar,data=thiscsample)

    # Prediction on the validation sample: per-term predictions (terms in
    # rows after t(), so the 6 cks recycle down each column), summed over
    # terms, plus the first coefficient of the fit and the log(TOTHOOKS)
    # offset, which are added back by hand.
    sy <- colSums(t(predict.gam(thisgam, newdata = thisvsample, type = "terms")) *
                    NPgarr.cross$cks) +
      thisgam$coefficients[1] +
      log(Blue.CVdata[in.valid, "TOTHOOKS"])

    # Need mean() instead of sum() because of different sizes
    # of the validation samples
    Blue.ME[i, j] <- mean((Blue.CVdata[in.valid, "logBSH"] - sy)^2)
  }

  # ck's on the full sample: they depend only on s.grid[j], not on the fold,
  # so they are computed once per value of s rather than once per fold.
  Blues.ck[j, ] <- NPgarrBlueSharks(logBSH~s(DOFY)+s(NLIGHTST)+s(SOAKTIME)+s(AVGHKDEP)+s(OCEAND1)+s(TEMP1)+offset(log(TOTHOOKS)),sp=Bluespar,ycolumn=11,data=Blue.CVdata,sumck=s.grid[j])$cks
}

# Cross-validated prediction error for each value of s: sum over the folds
# of the per-fold mean squared errors stored in Blue.ME.
Blue.PE <- colSums(Blue.ME)

# Index and value of the s with the smallest prediction error.
# which.min() always returns a single index (the first minimum), whereas
# which(Blue.PE == min(Blue.PE)) returns several indices on ties (and none if
# Blue.PE contains NA), which would break Blues.ck[s.ind, ] further below.
s.ind <- which.min(Blue.PE)
s.min <- s.grid[s.ind]
plot(s.grid, Blue.PE)

# Final fit: same additive model as the initial fit, on the full data set,
# with the smoothing parameters held at Bluespar.
Bluefinal.gam <- gam(
  logBSH ~ s(DOFY) + s(NLIGHTST) + s(SOAKTIME) + s(AVGHKDEP) + s(OCEAND1) +
    s(TEMP1) + offset(log(TOTHOOKS)),
  sp = Bluespar,
  data = allBLUE
)

# Per-term predictions, transposed so that each row is one term and each
# column one observation.
Bluegam.toplot <- t(predict.gam(Bluefinal.gam, type = "terms"))

# Model summary and the estimated smooths, all on a single page
summary(Bluefinal.gam)
plot(Bluefinal.gam, pages = 1)


# Paths of the six ck's (columns of Blues.ck) as functions of s, each curve
# annotated with its covariate name at a hand-picked position, plus a thick
# vertical line at the selected s. Wrapped in local() so the helper vectors
# do not leak into the workspace.
local({
  ck.label <- c("DOFY", "NLIGHTST", "SOAKTIME", "AVGHKDEP", "OCEAND1", "TEMP1")
  ck.label.x <- c(1.4, 4.3, 5.9, 4, 2.8, 0.5)
  ck.label.y <- c(0.58, 0.35, 1.75, 1.35, 0.7, 0.75)

  for (k in seq_along(ck.label)) {
    if (k == 1) {
      # The first curve opens the plot and fixes the axes
      plot(s.grid, Blues.ck[, k], type = "l",
           ylim = c(0, 2.5), xlim = c(0, 10), xlab = "s", ylab = "ck")
    } else {
      lines(s.grid, Blues.ck[, k])
    }
    text(ck.label.x[k], ck.label.y[k], ck.label[k], cex = 0.8)
  }

  abline(v = s.min, lwd = 2)
})


# Per-term predictions of the final fit together with their standard errors
# (a list with components $fit and $se.fit, observations in rows).
Bluefinal.toplot <- predict.gam(Bluefinal.gam, type = "terms", se.fit = TRUE)

# Shrunken fits: after t() the terms are in rows, so the vector of the six
# selected ck's recycles down each column and multiplies term k by ck[k].
Bluefinal.fitted <- t(Bluefinal.toplot$fit) * Blues.ck[s.ind, ]

# Standard errors of the shrunken fits, treating the ck's as fixed:
# SE(c * f) = |c| * SE(f). The factor c^2 applies to the variance, not to the
# standard error, so the standard errors are scaled by ck (which is
# nonnegative), not by ck^2.
Bluefinal.se <- t(Bluefinal.toplot$se.fit) * Blues.ck[s.ind, ]

# Shrunken fits with +/- 2 standard error bands for the terms TEMP1 (row 6),
# DOFY (row 1) and OCEAND1 (row 5) on a 2 x 2 layout, then copy of the
# current device to an EPS file.
par(mfrow = c(2, 2))

local({
  # Draw row k of Bluefinal.fitted against covariate x (sorted), with dashed
  # bands at fitted +/- 2 * Bluefinal.se.
  draw_term <- function(x, k, xlab, ylab) {
    o <- order(x)
    centre <- Bluefinal.fitted[k, o]
    halfwidth <- 2 * Bluefinal.se[k, o]
    plot(x[o], centre, type = "l", xlab = xlab, ylab = ylab, ylim = c(-3, 2))
    lines(x[o], centre + halfwidth, lty = 2)
    lines(x[o], centre - halfwidth, lty = 2)
  }

  draw_term(allBLUE$TEMP1, 6, "TEMP", "s(TEMP)")
  draw_term(allBLUE$DOFY, 1, "DOFY", "s(DOFY)")
  # The OCEAND1 bands previously used fitted +/- 2 * fitted (copy-paste
  # slip); like the other two panels they must use the standard errors.
  draw_term(allBLUE$OCEAND1, 5, "OCEAND", "s(OCEAND)")
})

dev.print(file = "NNGfinalBlue.eps", horizontal = FALSE)

# ck's at the selected value of s
Blues.ck[s.ind,]
# Console output recorded by the authors (kept as a comment: as bare text it
# is not valid R and prevents the file from being parsed/sourced):
# [1] 0.8208313 0.0000000 0.0000000 0.0000000 0.6201715 0.8598416


# Unshrunken per-term fits of the final gam (rows of Bluegam.toplot) against
# the sorted covariate, for the three terms shown in the final figure.
plot(sort(allBLUE$DOFY),Bluegam.toplot[1,order(allBLUE$DOFY)],type="l",xlab="DOFY",ylab="s(DOFY)")
plot(sort(allBLUE$OCEAND1),Bluegam.toplot[5,order(allBLUE$OCEAND1)],type="l",xlab="OCEAND",ylab="s(OCEAND)")
plot(sort(allBLUE$TEMP1),Bluegam.toplot[6,order(allBLUE$TEMP1)],type="l",xlab="TEMP",ylab="s(TEMP)")

# For the remaining terms: unshrunken fit (Bluegam.toplot) followed by the
# shrunken fit. The shrunken fits live in the matrix Bluefinal.fitted;
# Bluefinal.toplot is the list returned by predict.gam(..., se.fit=TRUE) and
# cannot be indexed as [row, columns] (error "incorrect number of dimensions").
plot(sort(allBLUE$NLIGHTST),Bluegam.toplot[2,order(allBLUE$NLIGHTST)],type="l")
plot(sort(allBLUE$NLIGHTST),Bluefinal.fitted[2,order(allBLUE$NLIGHTST)],type="l")

plot(sort(allBLUE$SOAKTIME),Bluegam.toplot[3,order(allBLUE$SOAKTIME)],type="l")
plot(sort(allBLUE$SOAKTIME),Bluefinal.fitted[3,order(allBLUE$SOAKTIME)],type="l")

plot(sort(allBLUE$AVGHKDEP),Bluegam.toplot[4,order(allBLUE$AVGHKDEP)],type="l")
plot(sort(allBLUE$AVGHKDEP),Bluefinal.fitted[4,order(allBLUE$AVGHKDEP)],type="l")
