#### code for the paper 'A regularised hidden Markov model for analysing the hot shoe in football ####

## There are two files of R-code: 
## This file contains the code for the simulation study (Section 3)
## The file 'hotshoe_application.R'  contains the code for the 'hot shoe' case study presented in the manuscript (Section 4)

## the code was tested in R (version 4.0.2) on Windows 10 with packages...

# ...Rcpp (version 1.0.5)
# ...mvtnorm (version 1.1.1)
# ...ggplot2 (version 3.3.2)
# ...dplyr (version 1.0.2)
# ...gridExtra (version 2.3)

# authors: Marius Oetting and Andreas Groll
# Email for correspondence: marius.oetting@uni-bielefeld.de

# load packages
library(Rcpp)
library(mvtnorm)
library(ggplot2)
library(dplyr)
library(gridExtra)

# load C++ file for faster likelihood computation
sourceCpp("nLogLike.cpp")


# functions ---------------------------------------------------------------

# likelihood
L.sim.pen.elnet <- function(theta.star, x, N, lambda, alpha, nr.covariates){
  Gamma1 <- diag(N)
  Gamma1[!Gamma1] <- exp(theta.star[1:2])
  Gamma1 <- Gamma1/rowSums(Gamma1)
  
  beta.0 <- theta.star[3:4]
  beta.1 <- theta.star[5:(5 + 3 + nr.covariates - 1)]
  delta <- c(0.5, 0.5)
  
  allprobs <- matrix(1, nrow(x), N)
  # covariates
  covariates <- as.matrix(x[, -1])
  # index NAs
  ind <- which(!is.na(x$PenaltyMade))
  
  for(j in 1:N){
    allprobs[ind, j] <- dbinom(x$PenaltyMade[ind], 1, plogis(beta.0[j] + as.vector(covariates[ind,] %*% beta.1)))
  }
  
  foo <- delta %*% diag(allprobs[1,])
  l <- log(sum(foo))
  phi <- foo/sum(foo)
  # rcpp
  l <- nLogLike_Rcpp(allprobs, Gamma1, foo, nrow(x))
  # for(t in 2:nrow(x)){
  #   foo <- phi %*% Gamma1 %*% diag(allprobs[t,])
  #   l <- l + log(sum(foo))
  #   phi <- foo/sum(foo)
  # }
  
  # add penalty
  penalty <- lambda * (alpha * sum(sqrt(beta.1^2 + 1e-5)) + (1 - alpha) * sum(beta.1^2))
  res.likelihood <- -l + penalty
  return(res.likelihood)
}

# function for forecasting
bernoulli.HMM.forecast <- function(x, N = 2, H = 1, vector.estimates, new.data){
  # t.p.m.
  Gamma.temp <- diag(N)
  Gamma.temp[!Gamma.temp] <- exp(vector.estimates[1:2])
  Gamma.temp <- Gamma.temp / rowSums(Gamma.temp)
  
  delta.temp <- solve(t(diag(N) - Gamma.temp + 1), rep(1, N))
  xrange <- 0:1
  
  n <- nrow(x)
  
  beta.0 <- vector.estimates[3:4]
  beta.1 <- vector.estimates[5:length(vector.estimates)]
  
  allprobs <- matrix(1, n, N)
  covariates <- as.matrix(x[, -1])
  
  for(j in 1:N){
    allprobs[, j] <- dbinom(x$PenaltyMade, 1, plogis(beta.0[j] + as.vector(covariates %*% beta.1)))
  }
  
  foo <- delta.temp * allprobs[1,]
  sumfoo <- sum(foo)
  lscale <- log(sumfoo)
  foo <- foo / sumfoo
  for(i in 2:n){
    foo <- foo %*% Gamma.temp * allprobs[i, ]
    sumfoo <- sum(foo)
    lscale <- lscale + log(sumfoo)
    foo <- foo / sumfoo
  }
  xi <- matrix(NA, nrow = N, ncol = H)
  
  allprobs.new <- matrix(0, nrow = H, ncol = length(xrange))
  covariates.new <- as.matrix(new.data[, -1])
  
  for(i in 1:H){
    foo <- foo %*% Gamma.temp
    xi[, i] <- foo
    
    for(j in 1:N){
      allprobs.new[i,] <- allprobs.new[i,] + foo[j] * dbinom(xrange, size = 1, prob = plogis(beta.0[j] + as.vector(covariates.new %*% beta.1)[i]))
    }
  }
  return(allprobs.new)
}


# simulate data -----------------------------------------------------------

### correlated setting

pi.beta <- qlogis(c(0.75, 0.35))
length.ts <- 5000 
forecast.h <- 100
nr.covariates <- 47

beta.1 <- beta.2 <- beta.3 <- beta.4 <- beta.5 <- beta.6 <- 0.3

# covariance matrix
sigma_matrix <- matrix(c(1.5, rep(1.3, 2), 
                         1.3, 1.5, 1.3, 
                         1.3, 1.3, 1.5), ncol = 3)

data_list_cor <- list()
set.seed(2305)

for(a in 1:100){
  #### generate data
  group1 <- rmvnorm(n = length.ts + forecast.h, mean = c(rep(0, 3)), sigma = sigma_matrix)
  group2 <- rmvnorm(n = length.ts + forecast.h, mean = c(rep(0, 3)), sigma = sigma_matrix)
  
  x1.sim <- group1[,1]
  x2.sim <- group1[,2]
  x3.sim <- group1[,3]
  
  x4.sim <- group2[,1]
  x5.sim <- group2[,2]
  x6.sim <- group2[,3]
  
  delta <- c(0.5, 0.5)
  # first state and first observation
  s1 <- rep(NA, length.ts + forecast.h) # states
  penalty.sim <- rep(NA, length.ts + forecast.h) # outcomes
  
  s1[1] <- sample(1:2, size = 1, prob = delta)
  penalty.sim[1] <- rbinom(1, size = 1, prob = plogis(pi.beta[s1[1]] + beta.1 * 
                                                        x1.sim[1] + beta.2 * x2.sim[1] + beta.3 * x3.sim[1] +
                                                        beta.4 * x4.sim[1] + beta.5 * x5.sim[1] + 
                                                        beta.6 * x6.sim[1]))
  
  # gamma matrix
  Gamma1 <- matrix(c(0.9, 0.1, 0.1, 0.9), byrow = TRUE, ncol = 2)
  
  for (t in 2:(length.ts + forecast.h)){
    s1[t] <- sample(1:2, size = 1, prob = Gamma1[s1[t - 1], ])
    
    penalty.sim[t] <- rbinom(1, size = 1, prob = plogis(pi.beta[s1[t]] + beta.1 * 
                                                          x1.sim[t] + beta.2 * x2.sim[t] + 
                                                          beta.3 * x3.sim[t] + beta.4 * x4.sim[t] + 
                                                          beta.5 * x5.sim[t] + beta.6 * x6.sim[t]))
  }
  
  zwerg <- data.frame(PenaltyMade = penalty.sim[1:length.ts], active1 = x1.sim[1:length.ts], 
                      active2 = x2.sim[1:length.ts], active3 = x3.sim[1:length.ts],
                      active4 = x4.sim[1:length.ts], active5 = x5.sim[1:length.ts],
                      active6 = x6.sim[1:length.ts])
  zwerg.forecast <- data.frame(PenaltyMade = penalty.sim[(length.ts + 1):(length.ts + forecast.h)], 
                               active1 = x1.sim[(length.ts + 1):(length.ts + forecast.h)], 
                               active2 = x2.sim[(length.ts + 1):(length.ts + forecast.h)],
                               active3 = x3.sim[(length.ts + 1):(length.ts + forecast.h)],
                               active4 = x4.sim[(length.ts + 1):(length.ts + forecast.h)],
                               active5 = x5.sim[(length.ts + 1):(length.ts + forecast.h)],
                               active6 = x6.sim[(length.ts + 1):(length.ts + forecast.h)])
  
  ### add noise covariates
  nr.covariates <- 44
  covariates.useless <- runif((length.ts) * nr.covariates, -2, 2)
  covariate.matrix <- data.frame(zwerg$PenaltyMade, zwerg$active1, zwerg$active2, zwerg$active3,
                                 zwerg$active4, zwerg$active5, zwerg$active6)
  covariate.matrix <- cbind(covariate.matrix, matrix(covariates.useless, nrow = length.ts, ncol = nr.covariates))
  colnames(covariate.matrix)[1:7] <- c("PenaltyMade", "active1", "active2", "active3", "active4",
                                       "active5", "active6")
  colnames(covariate.matrix)[8:ncol(covariate.matrix)] <- paste0("covariate", 1:nr.covariates)
  
  # forecast covariate values
  covariate.matrix.forecast <- data.frame(zwerg.forecast$PenaltyMade, zwerg.forecast$active1, 
                                          zwerg.forecast$active2, zwerg.forecast$active3,
                                          zwerg.forecast$active4, zwerg.forecast$active5,
                                          zwerg.forecast$active6)
  covariates.useless <- runif(forecast.h * nr.covariates, -2, 2)
  covariate.matrix.forecast <- cbind(covariate.matrix.forecast, matrix(covariates.useless, nrow = forecast.h, ncol = nr.covariates))
  colnames(covariate.matrix.forecast)[1:7] <- c("PenaltyMade", "active1", "active2", "active3", "active4",
                                                "active5", "active6")
  colnames(covariate.matrix.forecast)[8:ncol(covariate.matrix.forecast)] <- paste0("covariate", 1:nr.covariates)  
  covariate_matrix_inlist <- rbind(covariate.matrix, covariate.matrix.forecast)
  
  data_list_cor[[a]] <- covariate_matrix_inlist
}

### uncorrelated setting
pi.beta <- qlogis(c(0.75, 0.35))
length.ts <- 5000 
forecast.h <- 100
nr.covariates <- 47

beta.1 <- beta.2 <- beta.3 <- beta.4 <- beta.5 <- beta.6 <- 0.3

# covariance matrix
sigma_matrix <- matrix(c(1.5, rep(0, 2), 
                         0, 1.5, 0, 
                         0, 0, 1.5), ncol = 3)

data_list_uncor <- list()
set.seed(2305)

for(a in 1:100){
  #### generate data
  group1 <- rmvnorm(n = length.ts + forecast.h, mean = c(rep(0, 3)), sigma = sigma_matrix)
  group2 <- rmvnorm(n = length.ts + forecast.h, mean = c(rep(0, 3)), sigma = sigma_matrix)
  
  x1.sim <- group1[,1]
  x2.sim <- group1[,2]
  x3.sim <- group1[,3]
  
  x4.sim <- group2[,1]
  x5.sim <- group2[,2]
  x6.sim <- group2[,3]
  
  delta <- c(0.5, 0.5)
  # first state and first observation
  s1 <- rep(NA, length.ts + forecast.h) # states
  penalty.sim <- rep(NA, length.ts + forecast.h) # outcomes
  
  s1[1] <- sample(1:2, size = 1, prob = delta)
  penalty.sim[1] <- rbinom(1, size = 1, prob = plogis(pi.beta[s1[1]] + beta.1 * 
                                                        x1.sim[1] + beta.2 * x2.sim[1] + beta.3 * x3.sim[1] +
                                                        beta.4 * x4.sim[1] + beta.5 * x5.sim[1] + 
                                                        beta.6 * x6.sim[1]))
  
  # gamma matrix
  Gamma1 <- matrix(c(0.9, 0.1, 0.1, 0.9), byrow = TRUE, ncol = 2)
  
  for (t in 2:(length.ts + forecast.h)){
    s1[t] <- sample(1:2, size = 1, prob = Gamma1[s1[t - 1], ])
    penalty.sim[t] <- rbinom(1, size = 1, prob = plogis(pi.beta[s1[t]] + beta.1 * 
                                                          x1.sim[t] + beta.2 * x2.sim[t] + 
                                                          beta.3 * x3.sim[t] + beta.4 * x4.sim[t] + 
                                                          beta.5 * x5.sim[t] + beta.6 * x6.sim[t]))
  }
  
  zwerg <- data.frame(PenaltyMade = penalty.sim[1:length.ts], active1 = x1.sim[1:length.ts], 
                      active2 = x2.sim[1:length.ts], active3 = x3.sim[1:length.ts],
                      active4 = x4.sim[1:length.ts], active5 = x5.sim[1:length.ts],
                      active6 = x6.sim[1:length.ts])
  zwerg.forecast <- data.frame(PenaltyMade = penalty.sim[(length.ts + 1):(length.ts + forecast.h)], 
                               active1 = x1.sim[(length.ts + 1):(length.ts + forecast.h)], 
                               active2 = x2.sim[(length.ts + 1):(length.ts + forecast.h)],
                               active3 = x3.sim[(length.ts + 1):(length.ts + forecast.h)],
                               active4 = x4.sim[(length.ts + 1):(length.ts + forecast.h)],
                               active5 = x5.sim[(length.ts + 1):(length.ts + forecast.h)],
                               active6 = x6.sim[(length.ts + 1):(length.ts + forecast.h)])
  
  ### add noise covariates
  nr.covariates <- 44
  covariates.useless <- runif((length.ts) * nr.covariates, -2, 2)
  covariate.matrix <- data.frame(zwerg$PenaltyMade, zwerg$active1, zwerg$active2, zwerg$active3,
                                 zwerg$active4, zwerg$active5, zwerg$active6)
  covariate.matrix <- cbind(covariate.matrix, matrix(covariates.useless, nrow = length.ts, ncol = nr.covariates))
  colnames(covariate.matrix)[1:7] <- c("PenaltyMade", "active1", "active2", "active3", "active4",
                                       "active5", "active6")
  colnames(covariate.matrix)[8:ncol(covariate.matrix)] <- paste0("covariate", 1:nr.covariates)
  
  # forecast covariate values
  covariate.matrix.forecast <- data.frame(zwerg.forecast$PenaltyMade, zwerg.forecast$active1, 
                                          zwerg.forecast$active2, zwerg.forecast$active3,
                                          zwerg.forecast$active4, zwerg.forecast$active5,
                                          zwerg.forecast$active6)
  covariates.useless <- runif(forecast.h * nr.covariates, -2, 2)
  covariate.matrix.forecast <- cbind(covariate.matrix.forecast, matrix(covariates.useless, nrow = forecast.h, ncol = nr.covariates))
  colnames(covariate.matrix.forecast)[1:7] <- c("PenaltyMade", "active1", "active2", "active3", "active4",
                                                "active5", "active6")
  colnames(covariate.matrix.forecast)[8:ncol(covariate.matrix.forecast)] <- paste0("covariate", 1:nr.covariates)  
  covariate_matrix_inlist <- rbind(covariate.matrix, covariate.matrix.forecast)
  
  data_list_uncor[[a]] <- covariate_matrix_inlist
}


# run simulation ----------------------------------------------------------


# run simulation -- uncorrelated setting ----------------------------------

# true values
pi.beta <- qlogis(c(0.75, 0.35))
beta.1 <- beta.2 <- beta.3 <- beta.4 <- beta.5 <- beta.6 <- 0.3

length.ts <- 5000 
forecast.h <- 100
nr.covariates <- 47

# define grid
lambdas <- exp(seq(log(20000), log(1e-4), length.out = 50))
alphas <- c(0, 0.2, 0.4, 0.6, 0.8, 1)
lambdas_alphas_grid <- expand.grid(lambdas, alphas)
colnames(lambdas_alphas_grid) <- c("lambda", "alpha")
# starting values
theta.star.vec <- c(-1.4605264, -1.7160739, -0.3921979,  0.4610380, rep(0.1, nr.covariates + 3))
theta.star.mat <- matrix(0, ncol = length(theta.star.vec), nrow = nrow(lambdas_alphas_grid) + 1)
theta.star.mat[1,] <- theta.star.vec

# define empty vectors for MSEs etc.
bic.vec <- rep(NA, nrow(lambdas_alphas_grid))
aic.vec <- rep(NA, nrow(lambdas_alphas_grid))
cv.vec <- rep(NA, nrow(lambdas_alphas_grid))
intercept1.vec <- rep(NA, nrow(lambdas_alphas_grid))
intercept2.vec <- rep(NA, nrow(lambdas_alphas_grid))
gamma11.vec <- rep(NA, nrow(lambdas_alphas_grid))
gamma22.vec <- rep(NA, nrow(lambdas_alphas_grid))


mse.intercepts.aic <- rep(NA, 100)
mse.intercepts.bic <- rep(NA, 100)
mse.intercepts.mle <- rep(NA, 100)
mse.intercepts.cv <- rep(NA, 100)
mse.intercepts.lasso.aic <- rep(NA, 100)
mse.intercepts.lasso.bic <- rep(NA, 100)
mse.intercepts.lasso.cv <- rep(NA, 100)
mse.intercepts.ridge.aic <- rep(NA, 100)
mse.intercepts.ridge.bic <- rep(NA, 100)
mse.intercepts.ridge.cv <- rep(NA, 100)


mse.betas.aic <- rep(NA, 100)
mse.betas.bic <- rep(NA, 100)
mse.betas.mle <- rep(NA, 100)
mse.betas.cv <- rep(NA, 100)
mse.betas.lasso.aic <- rep(NA, 100)
mse.betas.lasso.bic <- rep(NA, 100)
mse.betas.lasso.cv <- rep(NA, 100)
mse.betas.ridge.aic <- rep(NA, 100)
mse.betas.ridge.bic <- rep(NA, 100)
mse.betas.ridge.cv <- rep(NA, 100)


mse.betas.noise.aic <- rep(NA, 100)
mse.betas.nonnoise.aic <- rep(NA, 100)
mse.betas.noise.bic <- rep(NA, 100)
mse.betas.nonnoise.bic <- rep(NA, 100)
mse.betas.noise.mle <- rep(NA, 100)
mse.betas.nonnoise.mle <- rep(NA, 100)
mse.betas.noise.cv <- rep(NA, 100)
mse.betas.nonnoise.cv <- rep(NA, 100)

mse.betas.noise.lasso.aic <- rep(NA, 100)
mse.betas.nonnoise.lasso.aic <- rep(NA, 100)
mse.betas.noise.lasso.bic <- rep(NA, 100)
mse.betas.nonnoise.lasso.bic <- rep(NA, 100)
mse.betas.noise.lasso.cv <- rep(NA, 100)
mse.betas.nonnoise.lasso.cv <- rep(NA, 100)
mse.betas.noise.ridge.aic <- rep(NA, 100)
mse.betas.nonnoise.ridge.aic <- rep(NA, 100)
mse.betas.noise.ridge.bic <- rep(NA, 100)
mse.betas.nonnoise.ridge.bic <- rep(NA, 100)
mse.betas.noise.ridge.cv <- rep(NA, 100)
mse.betas.nonnoise.ridge.cv <- rep(NA, 100)


mse.gammas.aic <- rep(NA, 100)
mse.gammas.bic <- rep(NA, 100)
mse.gammas.mle <- rep(NA, 100)
mse.gammas.cv <- rep(NA, 100)
mse.gammas.lasso.aic <- rep(NA, 100)
mse.gammas.lasso.bic <- rep(NA, 100)
mse.gammas.lasso.cv <- rep(NA, 100)
mse.gammas.ridge.aic <- rep(NA, 100)
mse.gammas.ridge.bic <- rep(NA, 100)
mse.gammas.ridge.cv <- rep(NA, 100)


tpr.betas.aic <- rep(NA, 100)
tpr.betas.bic <- rep(NA, 100)
tpr.betas.mle <- rep(NA, 100)
tpr.betas.cv <- rep(NA, 100)
tpr.betas.lasso.aic <- rep(NA, 100)
tpr.betas.lasso.bic <- rep(NA, 100)
tpr.betas.lasso.cv <- rep(NA, 100)
tpr.betas.ridge.aic <- rep(NA, 100)
tpr.betas.ridge.bic <- rep(NA, 100)
tpr.betas.ridge.cv <- rep(NA, 100)


fpr.betas.aic <- rep(NA, 100)
fpr.betas.bic <- rep(NA, 100)
fpr.betas.mle <- rep(NA, 100)
fpr.betas.cv <- rep(NA, 100)
fpr.betas.lasso.aic <- rep(NA, 100)
fpr.betas.lasso.bic <- rep(NA, 100)
fpr.betas.lasso.cv <- rep(NA, 100)
fpr.betas.ridge.aic <- rep(NA, 100)
fpr.betas.ridge.bic <- rep(NA, 100)
fpr.betas.ridge.cv <- rep(NA, 100)


brier.aic <- rep(NA, 100)
brier.bic <- rep(NA, 100)
brier.mle <- rep(NA, 100)
brier.cv <- rep(NA, 100)
brier.lasso.aic <- rep(NA, 100)
brier.lasso.bic <- rep(NA, 100)
brier.lasso.cv <- rep(NA, 100)
brier.ridge.aic <- rep(NA, 100)
brier.ridge.bic <- rep(NA, 100)
brier.ridge.cv <- rep(NA, 100)

avg.pred.prob.aic <- rep(NA, 100)
avg.pred.prob.bic <- rep(NA, 100)
avg.pred.prob.mle <- rep(NA, 100)
avg.pred.prob.cv <- rep(NA, 100)
avg.pred.prob.lasso.aic <- rep(NA, 100)
avg.pred.prob.lasso.bic <- rep(NA, 100)
avg.pred.prob.lasso.cv <- rep(NA, 100)
avg.pred.prob.ridge.aic <- rep(NA, 100)
avg.pred.prob.ridge.bic <- rep(NA, 100)
avg.pred.prob.ridge.cv <- rep(NA, 100)

opt.alpha.aic <- rep(NA, 100)
opt.alpha.bic <- rep(NA, 100)
opt.alpha.cv <- rep(NA, 100)

opt.lambda.aic <- rep(NA, 100)
opt.lambda.bic <- rep(NA, 100)
opt.lambda.cv <- rep(NA, 100)
opt.lambda.lasso.aic <- rep(NA, 100)
opt.lambda.lasso.bic <- rep(NA, 100)
opt.lambda.lasso.cv <- rep(NA, 100)
opt.lambda.ridge.cv <- rep(NA, 100)

true.betas <- c(beta.1, beta.2, beta.3, beta.4, beta.5, beta.6, rep(0, 44))
true.gammas <- c(0.9, 0.9)
true.icepts <- pi.beta


## run simulation 100 times 
set.seed(2305)
for(a in 1:100){
  covariate_matrix_complete <- data_list_uncor[[a]]
  covariate.matrix <- covariate_matrix_complete[1:5000,]
  covariate.matrix.forecast <- covariate_matrix_complete[5001:5100,]
  
  nr.covariates <- 44
  
  original.means <- apply(covariate.matrix[2:ncol(covariate.matrix)], 2, mean)
  original.sds <- apply(covariate.matrix[2:ncol(covariate.matrix)], 2, sd)
  
  covariate.matrix[,2:ncol(covariate.matrix)] <- scale(covariate.matrix[,2:ncol(covariate.matrix)])
  
  # forecast covariate values
  covariate.matrix.forecast[,2:ncol(covariate.matrix.forecast)] <- scale(covariate.matrix.forecast[,2:ncol(covariate.matrix.forecast)])
  
  #### optimal lambda
  theta.star.mat <- matrix(0, ncol = length(theta.star.vec), nrow = nrow(lambdas_alphas_grid) + 1)
  theta.star.mat[1,] <- theta.star.vec
  
  bic.vec <- rep(NA, nrow(lambdas_alphas_grid))
  aic.vec <- rep(NA, nrow(lambdas_alphas_grid))
  cv.vec <- rep(NA, nrow(lambdas_alphas_grid))
  llk.vec <- rep(NA, nrow(lambdas_alphas_grid))
  
  intercept1.vec <- rep(NA, nrow(lambdas_alphas_grid))
  intercept2.vec <- rep(NA, nrow(lambdas_alphas_grid))
  
  gamma11.vec <- rep(NA, nrow(lambdas_alphas_grid))
  gamma22.vec <- rep(NA, nrow(lambdas_alphas_grid))

  for(i in 1:nrow(lambdas_alphas_grid)){
    cur.lambda <- lambdas_alphas_grid[i,]$lambda
    cur.alpha <- lambdas_alphas_grid[i,]$alpha
    mod <- nlm(L.sim.pen.elnet, theta.star.vec, x = covariate.matrix, N = 2, lambda = cur.lambda,
               alpha = cur.alpha, nr.covariates = 47, print.level = 1, iterlim = 10000)
    
    loglik.temp <- L.sim.pen.elnet(mod$estimate, covariate.matrix, 2, 0, cur.alpha, 47)
    # if alpha = 0, i.e. in the ridge case, coefficients are not set to zero
    if(cur.alpha == 0) edf <- length(mod$estimate[5:length(mod$estimate)])
    else edf <- sum(abs(mod$estimate[5:length(mod$estimate)]) > 0.001)
    llk.vec[i] <- loglik.temp
    bic.vec[i] <- 2*(loglik.temp) + (edf + 4) * log(length.ts)
    aic.vec[i] <- 2*(loglik.temp) + (edf + 4) * 2
    intercept1.vec[i] <- mod$estimate[3] - sum(mod$estimate[5:length(mod$estimate)] * original.means / original.sds)
    intercept2.vec[i] <- mod$estimate[4] - sum(mod$estimate[5:length(mod$estimate)] * original.means / original.sds)
    
    Gamma.temp <- diag(2)
    Gamma.temp[!Gamma.temp] <- exp(mod$estimate[1:2])
    Gamma.temp <- Gamma.temp / rowSums(Gamma.temp)
    gamma11.vec[i] <- Gamma.temp[1,1]
    gamma22.vec[i] <- Gamma.temp[2,2]
    
    theta.star.mat[i + 1,] <- mod$estimate
    
    ### cross validation
    
    nfolds <- 10
    oos_llk <- rep(NA, nfolds)
    for(f in 1:nfolds){
      # sample rows
      index_rows_to_na <- sample(nrow(covariate.matrix), 5000 * 0.1, replace = FALSE)
      index_rows_to_na <- index_rows_to_na[index_rows_to_na > 1]
      # calibration set
      calibration_set <- covariate.matrix
      calibration_set$PenaltyMade[index_rows_to_na] <- NA
      # test set
      na_for_testset <- setdiff(2:5000, index_rows_to_na)
      test_set <- covariate.matrix
      test_set$PenaltyMade[na_for_testset] <- NA
      
      mod_cv <- tryCatch(nlm(L.sim.pen.elnet, theta.star.vec, x = calibration_set, N = 2, lambda = cur.lambda,
                             alpha = cur.alpha, nr.covariates = 47, print.level = 1, iterlim = 10000),
                         error = function(e) NA)
      # out of sample llk
      if(is.na(mod_cv[1])) oos_llk[f] <- NA
      else oos_llk[f] <- L.sim.pen.elnet(mod_cv$estimate, test_set, 2, 0, cur.alpha, 47)
    }
    cv.vec[i] <- mean(oos_llk, na.rm = TRUE)
  }
  
  opt.aic <- which.min(aic.vec)
  opt.bic <- which.min(bic.vec)
  opt.cv <- which.min(cv.vec)
  
  # rows of the grid corresponding to lasso and ridge
  lasso_rows <- which(lambdas_alphas_grid$alpha == 1)
  ridge_rows <- which(lambdas_alphas_grid$alpha == 0)
  opt.lasso.aic <- which(aic.vec == min(aic.vec[lasso_rows]))
  opt.lasso.bic <- which(bic.vec == min(bic.vec[lasso_rows]))
  opt.lasso.cv <- which(cv.vec == min(cv.vec[lasso_rows]))
  opt.ridge.aic <- which(aic.vec == min(aic.vec[ridge_rows]))
  opt.ridge.bic <- which(bic.vec == min(bic.vec[ridge_rows]))
  opt.ridge.cv <- which(cv.vec == min(cv.vec[ridge_rows]))
  
  
  opt.alpha.aic[a] <- lambdas_alphas_grid[opt.aic,]$alpha
  opt.alpha.bic[a] <- lambdas_alphas_grid[opt.bic,]$alpha
  opt.alpha.cv[a] <- lambdas_alphas_grid[opt.cv,]$alpha
  
  # save optimal lambdas
  opt.lambda.aic[a] <- lambdas_alphas_grid[opt.aic,]$lambda
  opt.lambda.bic[a] <- lambdas_alphas_grid[opt.bic,]$lambda
  opt.lambda.cv[a] <- lambdas_alphas_grid[opt.cv,]$lambda
  opt.lambda.lasso.aic[a] <- lambdas_alphas_grid[opt.lasso.aic,]$lambda
  opt.lambda.lasso.bic[a] <- lambdas_alphas_grid[opt.lasso.bic,]$lambda
  opt.lambda.lasso.cv[a] <- lambdas_alphas_grid[opt.lasso.cv,]$lambda
  opt.lambda.ridge.cv[a] <- lambdas_alphas_grid[opt.ridge.cv,]$lambda
  
  # save estimates from chosen model
  theta.final.bic <- theta.star.mat[opt.bic + 1,]
  theta.final.aic <- theta.star.mat[opt.aic + 1,]
  theta.final.cv <- theta.star.mat[opt.cv + 1,]
  
  theta.final.lasso.aic <- theta.star.mat[opt.lasso.aic + 1,]
  theta.final.lasso.bic <- theta.star.mat[opt.lasso.bic + 1,]
  theta.final.lasso.cv <- theta.star.mat[opt.lasso.cv + 1,]
  theta.final.ridge.aic <- theta.star.mat[opt.ridge.aic + 1,]
  theta.final.ridge.bic <- theta.star.mat[opt.ridge.bic + 1,]
  theta.final.ridge.cv <- theta.star.mat[opt.ridge.cv + 1,]
  
  # estimates with betas < 0.001 set to zero: clean estimates
  theta.final.aic.clean <- theta.final.aic
  theta.final.bic.clean <- theta.final.bic
  theta.final.cv.clean <- theta.final.cv
  
  theta.final.lasso.aic.clean <- theta.final.lasso.aic
  theta.final.lasso.bic.clean <- theta.final.lasso.bic
  theta.final.lasso.cv.clean <- theta.final.lasso.cv
  theta.final.ridge.aic.clean <- theta.final.ridge.aic
  theta.final.ridge.bic.clean <- theta.final.ridge.bic
  theta.final.ridge.cv.clean <- theta.final.ridge.cv
  
  if(opt.alpha.aic[a] > 0) theta.final.aic.clean[abs(theta.final.aic) <= 0.001] <- 0
  if(opt.alpha.bic[a] > 0) theta.final.bic.clean[abs(theta.final.bic) <= 0.001] <- 0
  if(opt.alpha.cv[a] > 0) theta.final.cv.clean[abs(theta.final.cv) <= 0.001] <- 0
  
  theta.final.lasso.aic.clean[abs(theta.final.lasso.aic) <= 0.001] <- 0
  theta.final.lasso.bic.clean[abs(theta.final.lasso.bic) <= 0.001] <- 0
  theta.final.lasso.cv.clean[abs(theta.final.lasso.cv) <= 0.001] <- 0
  
  # re-standardize intercepts
  icept1.aic <- intercept1.vec[opt.aic]
  icept2.aic <- intercept2.vec[opt.aic]
  icepts.aic <- sort(c(icept1.aic, icept2.aic), decreasing = TRUE)
  
  icept1.bic <- intercept1.vec[opt.bic]
  icept2.bic <- intercept2.vec[opt.bic]
  icepts.bic <- sort(c(icept1.bic, icept2.bic), decreasing = TRUE)
  
  icept1.cv <- intercept1.vec[opt.cv]
  icept2.cv <- intercept2.vec[opt.cv]
  icepts.cv <- sort(c(icept1.cv, icept2.cv), decreasing = TRUE)
  
  icept1.lasso.aic <- intercept1.vec[opt.lasso.aic]
  icept2.lasso.aic <- intercept2.vec[opt.lasso.aic]
  icepts.lasso.aic <- sort(c(icept1.lasso.aic, icept2.lasso.aic), decreasing = TRUE)
  
  icept1.lasso.bic <- intercept1.vec[opt.lasso.bic]
  icept2.lasso.bic <- intercept2.vec[opt.lasso.bic]
  icepts.lasso.bic <- sort(c(icept1.lasso.bic, icept2.lasso.bic), decreasing = TRUE)
  
  icept1.lasso.cv <- intercept1.vec[opt.lasso.cv]
  icept2.lasso.cv <- intercept2.vec[opt.lasso.cv]
  icepts.lasso.cv <- sort(c(icept1.lasso.cv, icept2.lasso.cv), decreasing = TRUE)
  
  icept1.ridge.aic <- intercept1.vec[opt.ridge.aic]
  icept2.ridge.aic <- intercept2.vec[opt.ridge.aic]
  icepts.ridge.aic <- sort(c(icept1.ridge.aic, icept2.ridge.aic), decreasing = TRUE)
  
  icept1.ridge.bic <- intercept1.vec[opt.ridge.bic]
  icept2.ridge.bic <- intercept2.vec[opt.ridge.bic]
  icepts.ridge.bic <- sort(c(icept1.ridge.bic, icept2.ridge.bic), decreasing = TRUE)
  
  icept1.ridge.cv <- intercept1.vec[opt.ridge.cv]
  icept2.ridge.cv <- intercept2.vec[opt.ridge.cv]
  icepts.ridge.cv <- sort(c(icept1.ridge.cv, icept2.ridge.cv), decreasing = TRUE)
  
  
  # gammas
  gamma11.optaic <- gamma11.vec[opt.aic]
  gamma11.optbic <- gamma11.vec[opt.bic]
  gamma11.optcv <- gamma11.vec[opt.cv]
  gamma11.lasso.optaic <- gamma11.vec[opt.lasso.aic]
  gamma11.lasso.optbic <- gamma11.vec[opt.lasso.bic]
  gamma11.lasso.optcv <- gamma11.vec[opt.lasso.cv]
  gamma11.ridge.optaic <- gamma11.vec[opt.ridge.aic]
  gamma11.ridge.optbic <- gamma11.vec[opt.ridge.bic]
  gamma11.ridge.optcv <- gamma11.vec[opt.ridge.cv]
  
  gamma22.optaic <- gamma22.vec[opt.aic]
  gamma22.optbic <- gamma22.vec[opt.bic]
  gamma22.optcv <- gamma22.vec[opt.cv]
  gamma22.lasso.optaic <- gamma22.vec[opt.lasso.aic]
  gamma22.lasso.optbic <- gamma22.vec[opt.lasso.bic]
  gamma22.lasso.optcv <- gamma22.vec[opt.lasso.cv]
  gamma22.ridge.optaic <- gamma22.vec[opt.ridge.aic]
  gamma22.ridge.optbic <- gamma22.vec[opt.ridge.bic]
  gamma22.ridge.optcv <- gamma22.vec[opt.ridge.cv]
  
  
  # MSE for gammas
  mse.gammas.aic[a] <- 1 / 2 * sum((c(gamma11.optaic, gamma22.optaic) - true.gammas)^2)
  mse.gammas.bic[a] <- 1 / 2 * sum((c(gamma11.optbic, gamma22.optbic) - true.gammas)^2)
  mse.gammas.cv[a] <- 1 / 2 * sum((c(gamma11.optcv, gamma22.optcv) - true.gammas)^2)
  
  mse.gammas.lasso.aic[a] <- 1 / 2 * sum((c(gamma11.lasso.optaic, gamma22.lasso.optaic) - true.gammas)^2)
  mse.gammas.lasso.bic[a] <- 1 / 2 * sum((c(gamma11.lasso.optbic, gamma22.lasso.optbic) - true.gammas)^2)
  mse.gammas.lasso.cv[a] <- 1 / 2 * sum((c(gamma11.lasso.optcv, gamma22.lasso.optcv) - true.gammas)^2)
  mse.gammas.ridge.aic[a] <- 1 / 2 * sum((c(gamma11.ridge.optaic, gamma22.ridge.optaic) - true.gammas)^2)
  mse.gammas.ridge.bic[a] <- 1 / 2 * sum((c(gamma11.ridge.optbic, gamma22.ridge.optbic) - true.gammas)^2)
  mse.gammas.ridge.cv[a] <- 1 / 2 * sum((c(gamma11.ridge.optcv, gamma22.ridge.optcv) - true.gammas)^2)
  
  # MSE for intercepts
  mse.intercepts.aic[a] <- 1 / 2 * sum((icepts.aic - true.icepts)^2)
  mse.intercepts.bic[a] <- 1 / 2 * sum((icepts.bic - true.icepts)^2)
  mse.intercepts.cv[a] <- 1 / 2 * sum((icepts.cv - true.icepts)^2)
  
  mse.intercepts.lasso.aic[a] <- 1 / 2 * sum((icepts.lasso.aic - true.icepts)^2)
  mse.intercepts.lasso.bic[a] <- 1 / 2 * sum((icepts.lasso.bic - true.icepts)^2)
  mse.intercepts.lasso.cv[a] <- 1 / 2 * sum((icepts.lasso.cv - true.icepts)^2)
  mse.intercepts.ridge.aic[a] <- 1 / 2 * sum((icepts.ridge.aic - true.icepts)^2)
  mse.intercepts.ridge.bic[a] <- 1 / 2 * sum((icepts.ridge.bic - true.icepts)^2)
  mse.intercepts.ridge.cv[a] <- 1 / 2 * sum((icepts.ridge.cv - true.icepts)^2)
  
  # TPR 
  selected.betas.aic <- sum(abs(theta.final.aic.clean[5:10]) > 0.001)
  tpr.betas.aic[a] <- selected.betas.aic / 6
  selected.betas.bic <- sum(abs(theta.final.bic.clean[5:10]) > 0.001)
  tpr.betas.bic[a] <- selected.betas.bic / 6
  selected.betas.cv <- sum(abs(theta.final.cv.clean[5:10]) > 0.001)
  tpr.betas.cv[a] <- selected.betas.cv / 6
  
  selected.betas.lasso.aic <- sum(abs(theta.final.lasso.aic.clean[5:10]) > 0.001)
  tpr.betas.lasso.aic[a] <- selected.betas.lasso.aic / 6
  selected.betas.lasso.bic <- sum(abs(theta.final.lasso.bic.clean[5:10]) > 0.001)
  tpr.betas.lasso.bic[a] <- selected.betas.lasso.bic / 6
  selected.betas.lasso.cv <- sum(abs(theta.final.lasso.cv.clean[5:10]) > 0.001)
  tpr.betas.lasso.cv[a] <- selected.betas.lasso.cv / 6
  selected.betas.ridge.aic <- sum(abs(theta.final.ridge.aic.clean[5:10]) > 0.001)
  tpr.betas.ridge.aic[a] <- selected.betas.ridge.aic / 6
  selected.betas.ridge.bic <- sum(abs(theta.final.ridge.bic.clean[5:10]) > 0.001)
  tpr.betas.ridge.bic[a] <- selected.betas.ridge.bic / 6
  selected.betas.ridge.cv <- sum(abs(theta.final.ridge.cv.clean[5:10]) > 0.001)
  tpr.betas.ridge.cv[a] <- selected.betas.ridge.cv / 6
  
  # FPR
  false.detected.aic <- sum(abs(theta.final.aic.clean[11:length(theta.final.aic.clean)]) > 0.001)
  fpr.betas.aic[a] <- false.detected.aic / nr.covariates
  false.detected.bic <- sum(abs(theta.final.bic.clean[11:length(theta.final.bic.clean)]) > 0.001)
  fpr.betas.bic[a] <- false.detected.bic / nr.covariates
  false.detected.cv <- sum(abs(theta.final.cv.clean[11:length(theta.final.cv.clean)]) > 0.001)
  fpr.betas.cv[a] <- false.detected.cv / nr.covariates
  
  false.detected.lasso.aic <- sum(abs(theta.final.lasso.aic.clean[11:length(theta.final.lasso.aic.clean)]) > 0.001)
  fpr.betas.lasso.aic[a] <- false.detected.lasso.aic / nr.covariates
  false.detected.lasso.bic <- sum(abs(theta.final.lasso.bic.clean[11:length(theta.final.lasso.bic.clean)]) > 0.001)
  fpr.betas.lasso.bic[a] <- false.detected.lasso.bic / nr.covariates
  false.detected.lasso.cv <- sum(abs(theta.final.lasso.cv.clean[11:length(theta.final.lasso.cv.clean)]) > 0.001)
  fpr.betas.lasso.cv[a] <- false.detected.lasso.cv / nr.covariates
  false.detected.ridge.aic <- sum(abs(theta.final.ridge.aic.clean[11:length(theta.final.ridge.aic.clean)]) > 0.001)
  fpr.betas.ridge.aic[a] <- false.detected.ridge.aic / nr.covariates
  false.detected.ridge.bic <- sum(abs(theta.final.ridge.bic.clean[11:length(theta.final.ridge.bic.clean)]) > 0.001)
  fpr.betas.ridge.bic[a] <- false.detected.ridge.bic / nr.covariates
  false.detected.ridge.cv <- sum(abs(theta.final.ridge.cv.clean[11:length(theta.final.ridge.cv.clean)]) > 0.001)
  fpr.betas.ridge.cv[a] <- false.detected.ridge.cv / nr.covariates
  
  # re-standardize betas
  theta.final.bic.clean[5:length(theta.final.bic.clean)] <- theta.final.bic.clean[5:length(theta.final.bic.clean)] / original.sds
  theta.final.aic.clean[5:length(theta.final.aic.clean)] <- theta.final.aic.clean[5:length(theta.final.aic.clean)] / original.sds
  theta.final.cv.clean[5:length(theta.final.cv.clean)] <- theta.final.cv.clean[5:length(theta.final.cv.clean)] / original.sds
  
  theta.final.lasso.aic.clean[5:length(theta.final.lasso.aic.clean)] <- theta.final.lasso.aic.clean[5:length(theta.final.lasso.aic.clean)] / original.sds
  theta.final.lasso.bic.clean[5:length(theta.final.lasso.bic.clean)] <- theta.final.lasso.bic.clean[5:length(theta.final.lasso.bic.clean)] / original.sds
  theta.final.lasso.cv.clean[5:length(theta.final.lasso.cv.clean)] <- theta.final.lasso.cv.clean[5:length(theta.final.lasso.cv.clean)] / original.sds
  
  theta.final.ridge.aic.clean[5:length(theta.final.ridge.aic.clean)] <- theta.final.ridge.aic.clean[5:length(theta.final.ridge.aic.clean)] / original.sds
  theta.final.ridge.bic.clean[5:length(theta.final.ridge.bic.clean)] <- theta.final.ridge.bic.clean[5:length(theta.final.ridge.bic.clean)] / original.sds
  theta.final.ridge.cv.clean[5:length(theta.final.ridge.cv.clean)] <- theta.final.ridge.cv.clean[5:length(theta.final.ridge.cv.clean)] / original.sds
  
  # MSE for betas
  mse.betas.aic[a] <- 1 / (nr.covariates + 6) * sum((theta.final.aic.clean[5:length(theta.final.aic.clean)] - true.betas)^2)
  mse.betas.bic[a] <- 1 / (nr.covariates + 6) * sum((theta.final.bic.clean[5:length(theta.final.bic.clean)] - true.betas)^2)
  mse.betas.cv[a] <- 1 / (nr.covariates + 6) * sum((theta.final.cv.clean[5:length(theta.final.cv.clean)] - true.betas)^2)
  
  mse.betas.lasso.aic[a] <- 1 / (nr.covariates + 6) * sum((theta.final.lasso.aic.clean[5:length(theta.final.lasso.aic.clean)] - true.betas)^2)
  mse.betas.lasso.bic[a] <- 1 / (nr.covariates + 6) * sum((theta.final.lasso.bic.clean[5:length(theta.final.lasso.bic.clean)] - true.betas)^2)
  mse.betas.lasso.cv[a] <- 1 / (nr.covariates + 6) * sum((theta.final.lasso.cv.clean[5:length(theta.final.lasso.cv.clean)] - true.betas)^2)
  mse.betas.ridge.aic[a] <- 1 / (nr.covariates + 6) * sum((theta.final.ridge.aic.clean[5:length(theta.final.ridge.aic.clean)] - true.betas)^2)
  mse.betas.ridge.bic[a] <- 1 / (nr.covariates + 6) * sum((theta.final.ridge.bic.clean[5:length(theta.final.ridge.bic.clean)] - true.betas)^2)
  mse.betas.ridge.cv[a] <- 1 / (nr.covariates + 6) * sum((theta.final.ridge.cv.clean[5:length(theta.final.ridge.cv.clean)] - true.betas)^2)
  
  # separate MSE for noise and non-noise coefficients
  aic.noise <- theta.final.aic.clean[11:length(theta.final.aic.clean)]
  aic.nonnoise <- theta.final.aic.clean[5:10]
  bic.noise <- theta.final.bic.clean[11:length(theta.final.bic.clean)]
  bic.nonnoise <- theta.final.bic.clean[5:10]
  cv.noise <- theta.final.cv.clean[11:length(theta.final.cv.clean)]
  cv.nonnoise <- theta.final.cv.clean[5:10]
  
  lasso.aic.noise <- theta.final.lasso.aic.clean[11:length(theta.final.lasso.aic.clean)]
  lasso.aic.nonnoise <- theta.final.lasso.aic.clean[5:10]
  lasso.bic.noise <- theta.final.lasso.bic.clean[11:length(theta.final.lasso.bic.clean)]
  lasso.bic.nonnoise <- theta.final.lasso.bic.clean[5:10]
  lasso.cv.noise <- theta.final.lasso.cv.clean[11:length(theta.final.lasso.cv.clean)]
  lasso.cv.nonnoise <- theta.final.lasso.cv.clean[5:10]
  ridge.aic.noise <- theta.final.ridge.aic.clean[11:length(theta.final.ridge.aic.clean)]
  ridge.aic.nonnoise <- theta.final.ridge.aic.clean[5:10]
  ridge.bic.noise <- theta.final.ridge.bic.clean[11:length(theta.final.ridge.bic.clean)]
  ridge.bic.nonnoise <- theta.final.ridge.bic.clean[5:10]
  ridge.cv.noise <- theta.final.ridge.cv.clean[11:length(theta.final.ridge.cv.clean)]
  ridge.cv.nonnoise <- theta.final.ridge.cv.clean[5:10]
  
  mse.betas.noise.aic[a] <- 1 / nr.covariates * sum(aic.noise^2)
  mse.betas.nonnoise.aic[a] <- 1 / 6 * sum((aic.nonnoise - true.betas[1:6])^2)
  mse.betas.noise.bic[a] <- 1 / nr.covariates * sum(bic.noise^2)
  mse.betas.nonnoise.bic[a] <- 1 / 6 * sum((bic.nonnoise - true.betas[1:6])^2)
  mse.betas.noise.cv[a] <- 1 / nr.covariates * sum(cv.noise^2)
  mse.betas.nonnoise.cv[a] <- 1 / 6 * sum((cv.nonnoise - true.betas[1:6])^2)
  
  mse.betas.noise.lasso.aic[a] <- 1 / nr.covariates * sum(lasso.aic.noise^2)
  mse.betas.nonnoise.lasso.aic[a] <- 1 / 6 * sum((lasso.aic.nonnoise - true.betas[1:6])^2)
  mse.betas.noise.lasso.bic[a] <- 1 / nr.covariates * sum(lasso.bic.noise^2)
  mse.betas.nonnoise.lasso.bic[a] <- 1 / 6 * sum((lasso.bic.nonnoise - true.betas[1:6])^2)
  mse.betas.noise.lasso.cv[a] <- 1 / nr.covariates * sum(lasso.cv.noise^2)
  mse.betas.nonnoise.lasso.cv[a] <- 1 / 6 * sum((lasso.cv.nonnoise - true.betas[1:6])^2)
  mse.betas.noise.ridge.aic[a] <- 1 / nr.covariates * sum(ridge.aic.noise^2)
  mse.betas.nonnoise.ridge.aic[a] <- 1 / 6 * sum((ridge.aic.nonnoise - true.betas[1:6])^2)
  mse.betas.noise.ridge.bic[a] <- 1 / nr.covariates * sum(ridge.bic.noise^2)
  mse.betas.nonnoise.ridge.bic[a] <- 1 / 6 * sum((ridge.bic.nonnoise - true.betas[1:6])^2)
  mse.betas.noise.ridge.cv[a] <- 1 / nr.covariates * sum(ridge.cv.noise^2)
  mse.betas.nonnoise.ridge.cv[a] <- 1 / 6 * sum((ridge.cv.nonnoise - true.betas[1:6])^2)
  
  
  #### MSEs for unpenalised maximum likelihood
  mod.mle <- nlm(L.sim.pen.elnet, theta.star.vec, x = covariate.matrix, N = 2, lambda = 0, alpha = 0,
                 nr.covariates = 47, print.level = 1, iterlim = 1000)
  theta.final.mle <- mod.mle$estimate
  
  # re-standardize intercepts
  icept1.mle <- theta.final.mle[3] - sum(theta.final.mle[5:length(theta.final.mle)] * original.means / original.sds)
  icept2.mle <- theta.final.mle[4] - sum(theta.final.mle[5:length(theta.final.mle)] * original.means / original.sds)
  icepts.mle <- sort(c(icept1.mle, icept2.mle), decreasing = TRUE)
  
  # gammas
  Gamma.temp.mle <- diag(2)
  Gamma.temp.mle[!Gamma.temp.mle] <- exp(theta.final.mle[1:2])
  Gamma.temp.mle <- Gamma.temp.mle / rowSums(Gamma.temp.mle)
  
  gamma11.mle <- Gamma.temp.mle[1,1]
  gamma22.mle <- Gamma.temp.mle[2,2]
  
  # MSE for gammas
  mse.gammas.mle[a] <- 1 / 2 * sum((c(gamma11.mle, gamma22.mle) - true.gammas)^2)
  
  # MSE for intercepts
  mse.intercepts.mle[a] <- 1 / 2 * sum((icepts.mle - true.icepts)^2)
  
  # TPR 
  selected.betas.mle <- sum(abs(theta.final.mle[5:10]) > 0.001)
  tpr.betas.mle[a] <- selected.betas.mle / 6
  
  # FPR
  false.detected.mle <- sum(abs(theta.final.mle[11:length(theta.final.mle)]) > 0.001)
  fpr.betas.mle[a] <- false.detected.mle / nr.covariates
  
  # re-standardize betas
  theta.final.mle[5:length(theta.final.mle)] <- theta.final.mle[5:length(theta.final.mle)] / original.sds
  
  # MSE for betas
  mse.betas.mle[a] <- 1 / (nr.covariates + 6) * sum((theta.final.mle[5:length(theta.final.mle)] - true.betas)^2)
  
  # separate MSE for noise and non-noise coefficients
  mle.noise <- theta.final.mle[11:length(theta.final.mle)]
  mle.nonnoise <- theta.final.mle[5:10]
  
  mse.betas.noise.mle[a] <- 1 / nr.covariates * sum(mle.noise^2)
  mse.betas.nonnoise.mle[a] <- 1 / 6 * sum((mle.nonnoise - true.betas[1:6])^2)
  
  
  #### forecasts
  which.1 <- which(covariate.matrix.forecast$PenaltyMade == 1)
  which.0 <- which(covariate.matrix.forecast$PenaltyMade == 0)
  
  forecast.vec.aic <- theta.final.aic
  if(opt.alpha.aic[a] > 0) forecast.vec.aic[abs(forecast.vec.aic) <= 0.001] <- 0
  forecast.aic <- bernoulli.HMM.forecast(covariate.matrix, N = 2, H = forecast.h, forecast.vec.aic, covariate.matrix.forecast)
  
  brier.aic[a] <- 1 / 100 * sum((forecast.aic[,2] - covariate.matrix.forecast$PenaltyMade)^2)
  avg.pred.prob.aic[a] <- mean(c(forecast.aic[which.1, 2] , forecast.aic[which.0, 1]))
  
  forecast.vec.bic <- theta.final.bic
  if(opt.alpha.bic[a] > 0) forecast.vec.bic[abs(forecast.vec.bic) <= 0.001] <- 0
  forecast.bic <- bernoulli.HMM.forecast(covariate.matrix, N = 2, H = forecast.h, forecast.vec.bic, covariate.matrix.forecast)
  brier.bic[a] <- 1 / 100 * sum((forecast.bic[,2] - covariate.matrix.forecast$PenaltyMade)^2)
  avg.pred.prob.bic[a] <- mean(c(forecast.bic[which.1, 2] , forecast.bic[which.0, 1]))
  
  forecast.vec.cv <- theta.final.cv
  if(opt.alpha.cv[a] > 0) forecast.vec.cv[abs(forecast.vec.cv) <= 0.001] <- 0
  forecast.cv <- bernoulli.HMM.forecast(covariate.matrix, N = 2, H = forecast.h, forecast.vec.cv, covariate.matrix.forecast)
  brier.cv[a] <- 1 / 100 * sum((forecast.cv[,2] - covariate.matrix.forecast$PenaltyMade)^2)
  avg.pred.prob.cv[a] <- mean(c(forecast.cv[which.1, 2] , forecast.cv[which.0, 1]))
  
  forecast.vec.lasso.aic <- theta.final.lasso.aic
  forecast.vec.lasso.aic[abs(forecast.vec.lasso.aic) <= 0.001] <- 0
  forecast.lasso.aic <- bernoulli.HMM.forecast(covariate.matrix, N = 2, H = forecast.h, forecast.vec.lasso.aic, covariate.matrix.forecast)
  brier.lasso.aic[a] <- 1 / 100 * sum((forecast.lasso.aic[,2] - covariate.matrix.forecast$PenaltyMade)^2)
  avg.pred.prob.lasso.aic[a] <- mean(c(forecast.lasso.aic[which.1, 2] , forecast.lasso.aic[which.0, 1]))
  
  forecast.vec.lasso.bic <- theta.final.lasso.bic
  forecast.vec.lasso.bic[abs(forecast.vec.lasso.bic) <= 0.001] <- 0
  forecast.lasso.bic <- bernoulli.HMM.forecast(covariate.matrix, N = 2, H = forecast.h, forecast.vec.lasso.bic, covariate.matrix.forecast)
  brier.lasso.bic[a] <- 1 / 100 * sum((forecast.lasso.bic[,2] - covariate.matrix.forecast$PenaltyMade)^2)
  avg.pred.prob.lasso.bic[a] <- mean(c(forecast.lasso.bic[which.1, 2] , forecast.lasso.bic[which.0, 1]))
  
  forecast.vec.lasso.cv <- theta.final.lasso.cv
  forecast.vec.lasso.cv[abs(forecast.vec.lasso.cv) <= 0.001] <- 0
  forecast.lasso.cv <- bernoulli.HMM.forecast(covariate.matrix, N = 2, H = forecast.h, forecast.vec.lasso.cv, covariate.matrix.forecast)
  brier.lasso.cv[a] <- 1 / 100 * sum((forecast.lasso.cv[,2] - covariate.matrix.forecast$PenaltyMade)^2)
  avg.pred.prob.lasso.cv[a] <- mean(c(forecast.lasso.cv[which.1, 2] , forecast.lasso.cv[which.0, 1]))
  
  forecast.vec.ridge.aic <- theta.final.ridge.aic
  forecast.ridge.aic <- bernoulli.HMM.forecast(covariate.matrix, N = 2, H = forecast.h, forecast.vec.ridge.aic, covariate.matrix.forecast)
  brier.ridge.aic[a] <- 1 / 100 * sum((forecast.ridge.aic[,2] - covariate.matrix.forecast$PenaltyMade)^2)
  avg.pred.prob.ridge.aic[a] <- mean(c(forecast.ridge.aic[which.1, 2] , forecast.ridge.aic[which.0, 1]))
  
  forecast.vec.ridge.bic <- theta.final.ridge.bic
  forecast.ridge.bic <- bernoulli.HMM.forecast(covariate.matrix, N = 2, H = forecast.h, forecast.vec.ridge.bic, covariate.matrix.forecast)
  brier.ridge.bic[a] <- 1 / 100 * sum((forecast.ridge.bic[,2] - covariate.matrix.forecast$PenaltyMade)^2)
  avg.pred.prob.ridge.bic[a] <- mean(c(forecast.ridge.bic[which.1, 2] , forecast.ridge.bic[which.0, 1]))
  
  forecast.vec.ridge.cv <- theta.final.ridge.cv
  forecast.ridge.cv <- bernoulli.HMM.forecast(covariate.matrix, N = 2, H = forecast.h, forecast.vec.ridge.cv, covariate.matrix.forecast)
  brier.ridge.cv[a] <- 1 / 100 * sum((forecast.ridge.cv[,2] - covariate.matrix.forecast$PenaltyMade)^2)
  avg.pred.prob.ridge.cv[a] <- mean(c(forecast.ridge.cv[which.1, 2] , forecast.ridge.cv[which.0, 1]))
  
  forecast.mle <- bernoulli.HMM.forecast(covariate.matrix, N = 2, H = forecast.h, theta.final.mle, covariate.matrix.forecast)
  brier.mle[a] <- 1 / 100 * sum((forecast.mle[,2] - covariate.matrix.forecast$PenaltyMade)^2)
  avg.pred.prob.mle[a] <- mean(c(forecast.mle[which.1, 2] , forecast.mle[which.0, 1]))
  
  erg <- data.frame(opt.alpha.aic, opt.alpha.bic, opt.alpha.cv, opt.lambda.aic, opt.lambda.bic,
                    opt.lambda.cv, opt.lambda.lasso.aic, opt.lambda.lasso.bic, opt.lambda.lasso.cv,
                    opt.lambda.ridge.cv,
                    mse.intercepts.aic, mse.intercepts.bic, mse.intercepts.mle, mse.intercepts.cv,
                    mse.intercepts.lasso.aic, mse.intercepts.lasso.bic, mse.intercepts.lasso.cv,
                    mse.intercepts.ridge.aic, mse.intercepts.ridge.bic, mse.intercepts.ridge.cv,
                    mse.betas.aic, mse.betas.noise.aic, mse.betas.nonnoise.aic,
                    mse.betas.bic, mse.betas.noise.bic, mse.betas.nonnoise.bic,
                    mse.betas.mle, mse.betas.noise.mle, mse.betas.nonnoise.mle,
                    mse.betas.cv, mse.betas.noise.cv, mse.betas.nonnoise.cv,
                    mse.betas.lasso.aic, mse.betas.lasso.bic, mse.betas.lasso.cv,
                    mse.betas.ridge.aic, mse.betas.ridge.bic, mse.betas.ridge.cv,
                    mse.gammas.aic, mse.gammas.bic, mse.gammas.mle, mse.gammas.cv,
                    mse.gammas.lasso.aic, mse.gammas.lasso.bic, mse.gammas.lasso.cv,
                    mse.gammas.ridge.aic, mse.gammas.ridge.bic, mse.gammas.ridge.cv,
                    tpr.betas.aic, tpr.betas.bic, tpr.betas.mle, tpr.betas.cv,
                    tpr.betas.lasso.aic, tpr.betas.lasso.bic, tpr.betas.lasso.cv,
                    tpr.betas.ridge.aic, tpr.betas.ridge.bic, tpr.betas.ridge.cv,
                    fpr.betas.aic, fpr.betas.bic, fpr.betas.mle, fpr.betas.cv,
                    fpr.betas.lasso.aic, fpr.betas.lasso.bic, fpr.betas.lasso.cv,
                    fpr.betas.ridge.aic, fpr.betas.ridge.bic, fpr.betas.ridge.cv,
                    brier.aic, brier.bic, brier.cv, brier.mle,
                    brier.lasso.aic, brier.lasso.bic, brier.lasso.cv,
                    brier.ridge.aic, brier.ridge.bic, brier.ridge.cv,
                    avg.pred.prob.aic, avg.pred.prob.bic,
                    avg.pred.prob.lasso.aic, avg.pred.prob.lasso.bic, avg.pred.prob.lasso.cv,
                    avg.pred.prob.ridge.aic, avg.pred.prob.ridge.bic, avg.pred.prob.ridge.cv,
                    avg.pred.prob.mle)
}
res_uncor <- erg



# run simulation -- correlated setting ------------------------------------
pi.beta <- qlogis(c(0.75, 0.35))
beta.1 <- beta.2 <- beta.3 <- beta.4 <- beta.5 <- beta.6 <- 0.3

length.ts <- 5000 
forecast.h <- 100
nr.covariates <- 47

# define grid
lambdas <- exp(seq(log(20000), log(1e-4), length.out = 50))
alphas <- c(0, 0.2, 0.4, 0.6, 0.8, 1)
lambdas_alphas_grid <- expand.grid(lambdas, alphas)
colnames(lambdas_alphas_grid) <- c("lambda", "alpha")
# starting values
theta.star.vec <- c(-1.4605264, -1.7160739, -0.3921979,  0.4610380, rep(0.1, nr.covariates + 3))
theta.star.mat <- matrix(0, ncol = length(theta.star.vec), nrow = nrow(lambdas_alphas_grid) + 1)
theta.star.mat[1,] <- theta.star.vec

bic.vec <- rep(NA, nrow(lambdas_alphas_grid))
aic.vec <- rep(NA, nrow(lambdas_alphas_grid))
cv.vec <- rep(NA, nrow(lambdas_alphas_grid))
intercept1.vec <- rep(NA, nrow(lambdas_alphas_grid))
intercept2.vec <- rep(NA, nrow(lambdas_alphas_grid))
gamma11.vec <- rep(NA, nrow(lambdas_alphas_grid))
gamma22.vec <- rep(NA, nrow(lambdas_alphas_grid))

# define empty vectors for MSEs etc.
mse.intercepts.aic <- rep(NA, 100)
mse.intercepts.bic <- rep(NA, 100)
mse.intercepts.mle <- rep(NA, 100)
mse.intercepts.cv <- rep(NA, 100)
mse.intercepts.lasso.aic <- rep(NA, 100)
mse.intercepts.lasso.bic <- rep(NA, 100)
mse.intercepts.lasso.cv <- rep(NA, 100)
mse.intercepts.ridge.aic <- rep(NA, 100)
mse.intercepts.ridge.bic <- rep(NA, 100)
mse.intercepts.ridge.cv <- rep(NA, 100)


mse.betas.aic <- rep(NA, 100)
mse.betas.bic <- rep(NA, 100)
mse.betas.mle <- rep(NA, 100)
mse.betas.cv <- rep(NA, 100)
mse.betas.lasso.aic <- rep(NA, 100)
mse.betas.lasso.bic <- rep(NA, 100)
mse.betas.lasso.cv <- rep(NA, 100)
mse.betas.ridge.aic <- rep(NA, 100)
mse.betas.ridge.bic <- rep(NA, 100)
mse.betas.ridge.cv <- rep(NA, 100)


mse.betas.noise.aic <- rep(NA, 100)
mse.betas.nonnoise.aic <- rep(NA, 100)
mse.betas.noise.bic <- rep(NA, 100)
mse.betas.nonnoise.bic <- rep(NA, 100)
mse.betas.noise.mle <- rep(NA, 100)
mse.betas.nonnoise.mle <- rep(NA, 100)
mse.betas.noise.cv <- rep(NA, 100)
mse.betas.nonnoise.cv <- rep(NA, 100)

mse.betas.noise.lasso.aic <- rep(NA, 100)
mse.betas.nonnoise.lasso.aic <- rep(NA, 100)
mse.betas.noise.lasso.bic <- rep(NA, 100)
mse.betas.nonnoise.lasso.bic <- rep(NA, 100)
mse.betas.noise.lasso.cv <- rep(NA, 100)
mse.betas.nonnoise.lasso.cv <- rep(NA, 100)
mse.betas.noise.ridge.aic <- rep(NA, 100)
mse.betas.nonnoise.ridge.aic <- rep(NA, 100)
mse.betas.noise.ridge.bic <- rep(NA, 100)
mse.betas.nonnoise.ridge.bic <- rep(NA, 100)
mse.betas.noise.ridge.cv <- rep(NA, 100)
mse.betas.nonnoise.ridge.cv <- rep(NA, 100)


mse.gammas.aic <- rep(NA, 100)
mse.gammas.bic <- rep(NA, 100)
mse.gammas.mle <- rep(NA, 100)
mse.gammas.cv <- rep(NA, 100)
mse.gammas.lasso.aic <- rep(NA, 100)
mse.gammas.lasso.bic <- rep(NA, 100)
mse.gammas.lasso.cv <- rep(NA, 100)
mse.gammas.ridge.aic <- rep(NA, 100)
mse.gammas.ridge.bic <- rep(NA, 100)
mse.gammas.ridge.cv <- rep(NA, 100)


tpr.betas.aic <- rep(NA, 100)
tpr.betas.bic <- rep(NA, 100)
tpr.betas.mle <- rep(NA, 100)
tpr.betas.cv <- rep(NA, 100)
tpr.betas.lasso.aic <- rep(NA, 100)
tpr.betas.lasso.bic <- rep(NA, 100)
tpr.betas.lasso.cv <- rep(NA, 100)
tpr.betas.ridge.aic <- rep(NA, 100)
tpr.betas.ridge.bic <- rep(NA, 100)
tpr.betas.ridge.cv <- rep(NA, 100)


fpr.betas.aic <- rep(NA, 100)
fpr.betas.bic <- rep(NA, 100)
fpr.betas.mle <- rep(NA, 100)
fpr.betas.cv <- rep(NA, 100)
fpr.betas.lasso.aic <- rep(NA, 100)
fpr.betas.lasso.bic <- rep(NA, 100)
fpr.betas.lasso.cv <- rep(NA, 100)
fpr.betas.ridge.aic <- rep(NA, 100)
fpr.betas.ridge.bic <- rep(NA, 100)
fpr.betas.ridge.cv <- rep(NA, 100)


brier.aic <- rep(NA, 100)
brier.bic <- rep(NA, 100)
brier.mle <- rep(NA, 100)
brier.cv <- rep(NA, 100)
brier.lasso.aic <- rep(NA, 100)
brier.lasso.bic <- rep(NA, 100)
brier.lasso.cv <- rep(NA, 100)
brier.ridge.aic <- rep(NA, 100)
brier.ridge.bic <- rep(NA, 100)
brier.ridge.cv <- rep(NA, 100)

avg.pred.prob.aic <- rep(NA, 100)
avg.pred.prob.bic <- rep(NA, 100)
avg.pred.prob.mle <- rep(NA, 100)
avg.pred.prob.cv <- rep(NA, 100)
avg.pred.prob.lasso.aic <- rep(NA, 100)
avg.pred.prob.lasso.bic <- rep(NA, 100)
avg.pred.prob.lasso.cv <- rep(NA, 100)
avg.pred.prob.ridge.aic <- rep(NA, 100)
avg.pred.prob.ridge.bic <- rep(NA, 100)
avg.pred.prob.ridge.cv <- rep(NA, 100)

opt.alpha.aic <- rep(NA, 100)
opt.alpha.bic <- rep(NA, 100)
opt.alpha.cv <- rep(NA, 100)

opt.lambda.aic <- rep(NA, 100)
opt.lambda.bic <- rep(NA, 100)
opt.lambda.cv <- rep(NA, 100)
opt.lambda.lasso.aic <- rep(NA, 100)
opt.lambda.lasso.bic <- rep(NA, 100)
opt.lambda.lasso.cv <- rep(NA, 100)
opt.lambda.ridge.cv <- rep(NA, 100)

true.betas <- c(beta.1, beta.2, beta.3, beta.4, beta.5, beta.6, rep(0, 44))
true.gammas <- c(0.9, 0.9)
true.icepts <- pi.beta


set.seed(2305)
for(a in 1:100){
  covariate_matrix_complete <- data_list_cor[[a]]
  covariate.matrix <- covariate_matrix_complete[1:5000,]
  covariate.matrix.forecast <- covariate_matrix_complete[5001:5100,]
  
  nr.covariates <- 44
  
  original.means <- apply(covariate.matrix[2:ncol(covariate.matrix)], 2, mean)
  original.sds <- apply(covariate.matrix[2:ncol(covariate.matrix)], 2, sd)
  
  covariate.matrix[,2:ncol(covariate.matrix)] <- scale(covariate.matrix[,2:ncol(covariate.matrix)])
  
  # forecast covariate values
  covariate.matrix.forecast[,2:ncol(covariate.matrix.forecast)] <- scale(covariate.matrix.forecast[,2:ncol(covariate.matrix.forecast)])
  
  #### optimal lambda
  theta.star.mat <- matrix(0, ncol = length(theta.star.vec), nrow = nrow(lambdas_alphas_grid) + 1)
  theta.star.mat[1,] <- theta.star.vec
  
  bic.vec <- rep(NA, nrow(lambdas_alphas_grid))
  aic.vec <- rep(NA, nrow(lambdas_alphas_grid))
  cv.vec <- rep(NA, nrow(lambdas_alphas_grid))
  llk.vec <- rep(NA, nrow(lambdas_alphas_grid))
  
  intercept1.vec <- rep(NA, nrow(lambdas_alphas_grid))
  intercept2.vec <- rep(NA, nrow(lambdas_alphas_grid))
  
  gamma11.vec <- rep(NA, nrow(lambdas_alphas_grid))
  gamma22.vec <- rep(NA, nrow(lambdas_alphas_grid))
  
  for(i in 1:nrow(lambdas_alphas_grid)){
    cur.lambda <- lambdas_alphas_grid[i,]$lambda
    cur.alpha <- lambdas_alphas_grid[i,]$alpha
    mod <- nlm(L.sim.pen.elnet, theta.star.vec, x = covariate.matrix, N = 2, lambda = cur.lambda,
               alpha = cur.alpha, nr.covariates = 47, print.level = 1, iterlim = 10000)
    
    loglik.temp <- L.sim.pen.elnet(mod$estimate, covariate.matrix, 2, 0, cur.alpha, 47)
    # if alpha = 0, i.e. in the ridge case, coefficients are not set to zero
    if(cur.alpha == 0) edf <- length(mod$estimate[5:length(mod$estimate)])
    else edf <- sum(abs(mod$estimate[5:length(mod$estimate)]) > 0.001)
    llk.vec[i] <- loglik.temp
    bic.vec[i] <- 2*(loglik.temp) + (edf + 4) * log(length.ts)
    aic.vec[i] <- 2*(loglik.temp) + (edf + 4) * 2
    intercept1.vec[i] <- mod$estimate[3] - sum(mod$estimate[5:length(mod$estimate)] * original.means / original.sds)
    intercept2.vec[i] <- mod$estimate[4] - sum(mod$estimate[5:length(mod$estimate)] * original.means / original.sds)
    
    Gamma.temp <- diag(2)
    Gamma.temp[!Gamma.temp] <- exp(mod$estimate[1:2])
    Gamma.temp <- Gamma.temp / rowSums(Gamma.temp)
    gamma11.vec[i] <- Gamma.temp[1,1]
    gamma22.vec[i] <- Gamma.temp[2,2]
    
    theta.star.mat[i + 1,] <- mod$estimate
    
    ### cross validation
    
    nfolds <- 10
    oos_llk <- rep(NA, nfolds)
    for(f in 1:nfolds){
      # sample rows
      index_rows_to_na <- sample(nrow(covariate.matrix), 5000 * 0.1, replace = FALSE)
      index_rows_to_na <- index_rows_to_na[index_rows_to_na > 1]
      # calibration set
      calibration_set <- covariate.matrix
      calibration_set$PenaltyMade[index_rows_to_na] <- NA
      # test set
      na_for_testset <- setdiff(2:5000, index_rows_to_na)
      test_set <- covariate.matrix
      test_set$PenaltyMade[na_for_testset] <- NA
      
      mod_cv <- tryCatch(nlm(L.sim.pen.elnet, theta.star.vec, x = calibration_set, N = 2, lambda = cur.lambda,
                             alpha = cur.alpha, nr.covariates = 47, print.level = 1, iterlim = 10000),
                         error = function(e) NA)
      # out of sample llk
      if(is.na(mod_cv[1])) oos_llk[f] <- NA
      else oos_llk[f] <- L.sim.pen.elnet(mod_cv$estimate, test_set, 2, 0, cur.alpha, 47)
    }
    cv.vec[i] <- mean(oos_llk, na.rm = TRUE)
  }
  
  opt.aic <- which.min(aic.vec)
  opt.bic <- which.min(bic.vec)
  opt.cv <- which.min(cv.vec)
  
  # rows of the grid corresponding to lasso and ridge
  lasso_rows <- which(lambdas_alphas_grid$alpha == 1)
  ridge_rows <- which(lambdas_alphas_grid$alpha == 0)
  opt.lasso.aic <- which(aic.vec == min(aic.vec[lasso_rows]))
  opt.lasso.bic <- which(bic.vec == min(bic.vec[lasso_rows]))
  opt.lasso.cv <- which(cv.vec == min(cv.vec[lasso_rows]))
  opt.ridge.aic <- which(aic.vec == min(aic.vec[ridge_rows]))
  opt.ridge.bic <- which(bic.vec == min(bic.vec[ridge_rows]))
  opt.ridge.cv <- which(cv.vec == min(cv.vec[ridge_rows]))
  
  
  opt.alpha.aic[a] <- lambdas_alphas_grid[opt.aic,]$alpha
  opt.alpha.bic[a] <- lambdas_alphas_grid[opt.bic,]$alpha
  opt.alpha.cv[a] <- lambdas_alphas_grid[opt.cv,]$alpha
  
  # save optimal lambdas
  opt.lambda.aic[a] <- lambdas_alphas_grid[opt.aic,]$lambda
  opt.lambda.bic[a] <- lambdas_alphas_grid[opt.bic,]$lambda
  opt.lambda.cv[a] <- lambdas_alphas_grid[opt.cv,]$lambda
  opt.lambda.lasso.aic[a] <- lambdas_alphas_grid[opt.lasso.aic,]$lambda
  opt.lambda.lasso.bic[a] <- lambdas_alphas_grid[opt.lasso.bic,]$lambda
  opt.lambda.lasso.cv[a] <- lambdas_alphas_grid[opt.lasso.cv,]$lambda
  opt.lambda.ridge.cv[a] <- lambdas_alphas_grid[opt.ridge.cv,]$lambda
  
  # save estimates from chosen model
  theta.final.bic <- theta.star.mat[opt.bic + 1,]
  theta.final.aic <- theta.star.mat[opt.aic + 1,]
  theta.final.cv <- theta.star.mat[opt.cv + 1,]
  
  theta.final.lasso.aic <- theta.star.mat[opt.lasso.aic + 1,]
  theta.final.lasso.bic <- theta.star.mat[opt.lasso.bic + 1,]
  theta.final.lasso.cv <- theta.star.mat[opt.lasso.cv + 1,]
  theta.final.ridge.aic <- theta.star.mat[opt.ridge.aic + 1,]
  theta.final.ridge.bic <- theta.star.mat[opt.ridge.bic + 1,]
  theta.final.ridge.cv <- theta.star.mat[opt.ridge.cv + 1,]
  
  # estimates with betas < 0.001 set to zero: clean estimates
  theta.final.aic.clean <- theta.final.aic
  theta.final.bic.clean <- theta.final.bic
  theta.final.cv.clean <- theta.final.cv
  
  theta.final.lasso.aic.clean <- theta.final.lasso.aic
  theta.final.lasso.bic.clean <- theta.final.lasso.bic
  theta.final.lasso.cv.clean <- theta.final.lasso.cv
  theta.final.ridge.aic.clean <- theta.final.ridge.aic
  theta.final.ridge.bic.clean <- theta.final.ridge.bic
  theta.final.ridge.cv.clean <- theta.final.ridge.cv
  
  if(opt.alpha.aic[a] > 0) theta.final.aic.clean[abs(theta.final.aic) <= 0.001] <- 0
  if(opt.alpha.bic[a] > 0) theta.final.bic.clean[abs(theta.final.bic) <= 0.001] <- 0
  if(opt.alpha.cv[a] > 0) theta.final.cv.clean[abs(theta.final.cv) <= 0.001] <- 0
  
  theta.final.lasso.aic.clean[abs(theta.final.lasso.aic) <= 0.001] <- 0
  theta.final.lasso.bic.clean[abs(theta.final.lasso.bic) <= 0.001] <- 0
  theta.final.lasso.cv.clean[abs(theta.final.lasso.cv) <= 0.001] <- 0
  
  # re-standardize intercepts
  icept1.aic <- intercept1.vec[opt.aic]
  icept2.aic <- intercept2.vec[opt.aic]
  icepts.aic <- sort(c(icept1.aic, icept2.aic), decreasing = TRUE)
  
  icept1.bic <- intercept1.vec[opt.bic]
  icept2.bic <- intercept2.vec[opt.bic]
  icepts.bic <- sort(c(icept1.bic, icept2.bic), decreasing = TRUE)
  
  icept1.cv <- intercept1.vec[opt.cv]
  icept2.cv <- intercept2.vec[opt.cv]
  icepts.cv <- sort(c(icept1.cv, icept2.cv), decreasing = TRUE)
  
  icept1.lasso.aic <- intercept1.vec[opt.lasso.aic]
  icept2.lasso.aic <- intercept2.vec[opt.lasso.aic]
  icepts.lasso.aic <- sort(c(icept1.lasso.aic, icept2.lasso.aic), decreasing = TRUE)
  
  icept1.lasso.bic <- intercept1.vec[opt.lasso.bic]
  icept2.lasso.bic <- intercept2.vec[opt.lasso.bic]
  icepts.lasso.bic <- sort(c(icept1.lasso.bic, icept2.lasso.bic), decreasing = TRUE)
  
  icept1.lasso.cv <- intercept1.vec[opt.lasso.cv]
  icept2.lasso.cv <- intercept2.vec[opt.lasso.cv]
  icepts.lasso.cv <- sort(c(icept1.lasso.cv, icept2.lasso.cv), decreasing = TRUE)
  
  icept1.ridge.aic <- intercept1.vec[opt.ridge.aic]
  icept2.ridge.aic <- intercept2.vec[opt.ridge.aic]
  icepts.ridge.aic <- sort(c(icept1.ridge.aic, icept2.ridge.aic), decreasing = TRUE)
  
  icept1.ridge.bic <- intercept1.vec[opt.ridge.bic]
  icept2.ridge.bic <- intercept2.vec[opt.ridge.bic]
  icepts.ridge.bic <- sort(c(icept1.ridge.bic, icept2.ridge.bic), decreasing = TRUE)
  
  icept1.ridge.cv <- intercept1.vec[opt.ridge.cv]
  icept2.ridge.cv <- intercept2.vec[opt.ridge.cv]
  icepts.ridge.cv <- sort(c(icept1.ridge.cv, icept2.ridge.cv), decreasing = TRUE)
  
  
  # gammas
  gamma11.optaic <- gamma11.vec[opt.aic]
  gamma11.optbic <- gamma11.vec[opt.bic]
  gamma11.optcv <- gamma11.vec[opt.cv]
  gamma11.lasso.optaic <- gamma11.vec[opt.lasso.aic]
  gamma11.lasso.optbic <- gamma11.vec[opt.lasso.bic]
  gamma11.lasso.optcv <- gamma11.vec[opt.lasso.cv]
  gamma11.ridge.optaic <- gamma11.vec[opt.ridge.aic]
  gamma11.ridge.optbic <- gamma11.vec[opt.ridge.bic]
  gamma11.ridge.optcv <- gamma11.vec[opt.ridge.cv]
  
  gamma22.optaic <- gamma22.vec[opt.aic]
  gamma22.optbic <- gamma22.vec[opt.bic]
  gamma22.optcv <- gamma22.vec[opt.cv]
  gamma22.lasso.optaic <- gamma22.vec[opt.lasso.aic]
  gamma22.lasso.optbic <- gamma22.vec[opt.lasso.bic]
  gamma22.lasso.optcv <- gamma22.vec[opt.lasso.cv]
  gamma22.ridge.optaic <- gamma22.vec[opt.ridge.aic]
  gamma22.ridge.optbic <- gamma22.vec[opt.ridge.bic]
  gamma22.ridge.optcv <- gamma22.vec[opt.ridge.cv]
  
  
  # MSE for gammas
  mse.gammas.aic[a] <- 1 / 2 * sum((c(gamma11.optaic, gamma22.optaic) - true.gammas)^2)
  mse.gammas.bic[a] <- 1 / 2 * sum((c(gamma11.optbic, gamma22.optbic) - true.gammas)^2)
  mse.gammas.cv[a] <- 1 / 2 * sum((c(gamma11.optcv, gamma22.optcv) - true.gammas)^2)
  
  mse.gammas.lasso.aic[a] <- 1 / 2 * sum((c(gamma11.lasso.optaic, gamma22.lasso.optaic) - true.gammas)^2)
  mse.gammas.lasso.bic[a] <- 1 / 2 * sum((c(gamma11.lasso.optbic, gamma22.lasso.optbic) - true.gammas)^2)
  mse.gammas.lasso.cv[a] <- 1 / 2 * sum((c(gamma11.lasso.optcv, gamma22.lasso.optcv) - true.gammas)^2)
  mse.gammas.ridge.aic[a] <- 1 / 2 * sum((c(gamma11.ridge.optaic, gamma22.ridge.optaic) - true.gammas)^2)
  mse.gammas.ridge.bic[a] <- 1 / 2 * sum((c(gamma11.ridge.optbic, gamma22.ridge.optbic) - true.gammas)^2)
  mse.gammas.ridge.cv[a] <- 1 / 2 * sum((c(gamma11.ridge.optcv, gamma22.ridge.optcv) - true.gammas)^2)
  
  # MSE for intercepts
  mse.intercepts.aic[a] <- 1 / 2 * sum((icepts.aic - true.icepts)^2)
  mse.intercepts.bic[a] <- 1 / 2 * sum((icepts.bic - true.icepts)^2)
  mse.intercepts.cv[a] <- 1 / 2 * sum((icepts.cv - true.icepts)^2)
  
  mse.intercepts.lasso.aic[a] <- 1 / 2 * sum((icepts.lasso.aic - true.icepts)^2)
  mse.intercepts.lasso.bic[a] <- 1 / 2 * sum((icepts.lasso.bic - true.icepts)^2)
  mse.intercepts.lasso.cv[a] <- 1 / 2 * sum((icepts.lasso.cv - true.icepts)^2)
  mse.intercepts.ridge.aic[a] <- 1 / 2 * sum((icepts.ridge.aic - true.icepts)^2)
  mse.intercepts.ridge.bic[a] <- 1 / 2 * sum((icepts.ridge.bic - true.icepts)^2)
  mse.intercepts.ridge.cv[a] <- 1 / 2 * sum((icepts.ridge.cv - true.icepts)^2)
  
  # TPR 
  selected.betas.aic <- sum(abs(theta.final.aic.clean[5:10]) > 0.001)
  tpr.betas.aic[a] <- selected.betas.aic / 6
  selected.betas.bic <- sum(abs(theta.final.bic.clean[5:10]) > 0.001)
  tpr.betas.bic[a] <- selected.betas.bic / 6
  selected.betas.cv <- sum(abs(theta.final.cv.clean[5:10]) > 0.001)
  tpr.betas.cv[a] <- selected.betas.cv / 6
  
  selected.betas.lasso.aic <- sum(abs(theta.final.lasso.aic.clean[5:10]) > 0.001)
  tpr.betas.lasso.aic[a] <- selected.betas.lasso.aic / 6
  selected.betas.lasso.bic <- sum(abs(theta.final.lasso.bic.clean[5:10]) > 0.001)
  tpr.betas.lasso.bic[a] <- selected.betas.lasso.bic / 6
  selected.betas.lasso.cv <- sum(abs(theta.final.lasso.cv.clean[5:10]) > 0.001)
  tpr.betas.lasso.cv[a] <- selected.betas.lasso.cv / 6
  selected.betas.ridge.aic <- sum(abs(theta.final.ridge.aic.clean[5:10]) > 0.001)
  tpr.betas.ridge.aic[a] <- selected.betas.ridge.aic / 6
  selected.betas.ridge.bic <- sum(abs(theta.final.ridge.bic.clean[5:10]) > 0.001)
  tpr.betas.ridge.bic[a] <- selected.betas.ridge.bic / 6
  selected.betas.ridge.cv <- sum(abs(theta.final.ridge.cv.clean[5:10]) > 0.001)
  tpr.betas.ridge.cv[a] <- selected.betas.ridge.cv / 6
  
  # FPR
  false.detected.aic <- sum(abs(theta.final.aic.clean[11:length(theta.final.aic.clean)]) > 0.001)
  fpr.betas.aic[a] <- false.detected.aic / nr.covariates
  false.detected.bic <- sum(abs(theta.final.bic.clean[11:length(theta.final.bic.clean)]) > 0.001)
  fpr.betas.bic[a] <- false.detected.bic / nr.covariates
  false.detected.cv <- sum(abs(theta.final.cv.clean[11:length(theta.final.cv.clean)]) > 0.001)
  fpr.betas.cv[a] <- false.detected.cv / nr.covariates
  
  false.detected.lasso.aic <- sum(abs(theta.final.lasso.aic.clean[11:length(theta.final.lasso.aic.clean)]) > 0.001)
  fpr.betas.lasso.aic[a] <- false.detected.lasso.aic / nr.covariates
  false.detected.lasso.bic <- sum(abs(theta.final.lasso.bic.clean[11:length(theta.final.lasso.bic.clean)]) > 0.001)
  fpr.betas.lasso.bic[a] <- false.detected.lasso.bic / nr.covariates
  false.detected.lasso.cv <- sum(abs(theta.final.lasso.cv.clean[11:length(theta.final.lasso.cv.clean)]) > 0.001)
  fpr.betas.lasso.cv[a] <- false.detected.lasso.cv / nr.covariates
  false.detected.ridge.aic <- sum(abs(theta.final.ridge.aic.clean[11:length(theta.final.ridge.aic.clean)]) > 0.001)
  fpr.betas.ridge.aic[a] <- false.detected.ridge.aic / nr.covariates
  false.detected.ridge.bic <- sum(abs(theta.final.ridge.bic.clean[11:length(theta.final.ridge.bic.clean)]) > 0.001)
  fpr.betas.ridge.bic[a] <- false.detected.ridge.bic / nr.covariates
  false.detected.ridge.cv <- sum(abs(theta.final.ridge.cv.clean[11:length(theta.final.ridge.cv.clean)]) > 0.001)
  fpr.betas.ridge.cv[a] <- false.detected.ridge.cv / nr.covariates
  
  # re-standardize betas
  theta.final.bic.clean[5:length(theta.final.bic.clean)] <- theta.final.bic.clean[5:length(theta.final.bic.clean)] / original.sds
  theta.final.aic.clean[5:length(theta.final.aic.clean)] <- theta.final.aic.clean[5:length(theta.final.aic.clean)] / original.sds
  theta.final.cv.clean[5:length(theta.final.cv.clean)] <- theta.final.cv.clean[5:length(theta.final.cv.clean)] / original.sds
  
  theta.final.lasso.aic.clean[5:length(theta.final.lasso.aic.clean)] <- theta.final.lasso.aic.clean[5:length(theta.final.lasso.aic.clean)] / original.sds
  theta.final.lasso.bic.clean[5:length(theta.final.lasso.bic.clean)] <- theta.final.lasso.bic.clean[5:length(theta.final.lasso.bic.clean)] / original.sds
  theta.final.lasso.cv.clean[5:length(theta.final.lasso.cv.clean)] <- theta.final.lasso.cv.clean[5:length(theta.final.lasso.cv.clean)] / original.sds
  
  theta.final.ridge.aic.clean[5:length(theta.final.ridge.aic.clean)] <- theta.final.ridge.aic.clean[5:length(theta.final.ridge.aic.clean)] / original.sds
  theta.final.ridge.bic.clean[5:length(theta.final.ridge.bic.clean)] <- theta.final.ridge.bic.clean[5:length(theta.final.ridge.bic.clean)] / original.sds
  theta.final.ridge.cv.clean[5:length(theta.final.ridge.cv.clean)] <- theta.final.ridge.cv.clean[5:length(theta.final.ridge.cv.clean)] / original.sds
  
  # MSE for betas
  mse.betas.aic[a] <- 1 / (nr.covariates + 6) * sum((theta.final.aic.clean[5:length(theta.final.aic.clean)] - true.betas)^2)
  mse.betas.bic[a] <- 1 / (nr.covariates + 6) * sum((theta.final.bic.clean[5:length(theta.final.bic.clean)] - true.betas)^2)
  mse.betas.cv[a] <- 1 / (nr.covariates + 6) * sum((theta.final.cv.clean[5:length(theta.final.cv.clean)] - true.betas)^2)
  
  mse.betas.lasso.aic[a] <- 1 / (nr.covariates + 6) * sum((theta.final.lasso.aic.clean[5:length(theta.final.lasso.aic.clean)] - true.betas)^2)
  mse.betas.lasso.bic[a] <- 1 / (nr.covariates + 6) * sum((theta.final.lasso.bic.clean[5:length(theta.final.lasso.bic.clean)] - true.betas)^2)
  mse.betas.lasso.cv[a] <- 1 / (nr.covariates + 6) * sum((theta.final.lasso.cv.clean[5:length(theta.final.lasso.cv.clean)] - true.betas)^2)
  mse.betas.ridge.aic[a] <- 1 / (nr.covariates + 6) * sum((theta.final.ridge.aic.clean[5:length(theta.final.ridge.aic.clean)] - true.betas)^2)
  mse.betas.ridge.bic[a] <- 1 / (nr.covariates + 6) * sum((theta.final.ridge.bic.clean[5:length(theta.final.ridge.bic.clean)] - true.betas)^2)
  mse.betas.ridge.cv[a] <- 1 / (nr.covariates + 6) * sum((theta.final.ridge.cv.clean[5:length(theta.final.ridge.cv.clean)] - true.betas)^2)
  
  # separate MSE for noise and non-noise coefficients
  aic.noise <- theta.final.aic.clean[11:length(theta.final.aic.clean)]
  aic.nonnoise <- theta.final.aic.clean[5:10]
  bic.noise <- theta.final.bic.clean[11:length(theta.final.bic.clean)]
  bic.nonnoise <- theta.final.bic.clean[5:10]
  cv.noise <- theta.final.cv.clean[11:length(theta.final.cv.clean)]
  cv.nonnoise <- theta.final.cv.clean[5:10]
  
  lasso.aic.noise <- theta.final.lasso.aic.clean[11:length(theta.final.lasso.aic.clean)]
  lasso.aic.nonnoise <- theta.final.lasso.aic.clean[5:10]
  lasso.bic.noise <- theta.final.lasso.bic.clean[11:length(theta.final.lasso.bic.clean)]
  lasso.bic.nonnoise <- theta.final.lasso.bic.clean[5:10]
  lasso.cv.noise <- theta.final.lasso.cv.clean[11:length(theta.final.lasso.cv.clean)]
  lasso.cv.nonnoise <- theta.final.lasso.cv.clean[5:10]
  ridge.aic.noise <- theta.final.ridge.aic.clean[11:length(theta.final.ridge.aic.clean)]
  ridge.aic.nonnoise <- theta.final.ridge.aic.clean[5:10]
  ridge.bic.noise <- theta.final.ridge.bic.clean[11:length(theta.final.ridge.bic.clean)]
  ridge.bic.nonnoise <- theta.final.ridge.bic.clean[5:10]
  ridge.cv.noise <- theta.final.ridge.cv.clean[11:length(theta.final.ridge.cv.clean)]
  ridge.cv.nonnoise <- theta.final.ridge.cv.clean[5:10]
  
  mse.betas.noise.aic[a] <- 1 / nr.covariates * sum(aic.noise^2)
  mse.betas.nonnoise.aic[a] <- 1 / 6 * sum((aic.nonnoise - true.betas[1:6])^2)
  mse.betas.noise.bic[a] <- 1 / nr.covariates * sum(bic.noise^2)
  mse.betas.nonnoise.bic[a] <- 1 / 6 * sum((bic.nonnoise - true.betas[1:6])^2)
  mse.betas.noise.cv[a] <- 1 / nr.covariates * sum(cv.noise^2)
  mse.betas.nonnoise.cv[a] <- 1 / 6 * sum((cv.nonnoise - true.betas[1:6])^2)
  
  mse.betas.noise.lasso.aic[a] <- 1 / nr.covariates * sum(lasso.aic.noise^2)
  mse.betas.nonnoise.lasso.aic[a] <- 1 / 6 * sum((lasso.aic.nonnoise - true.betas[1:6])^2)
  mse.betas.noise.lasso.bic[a] <- 1 / nr.covariates * sum(lasso.bic.noise^2)
  mse.betas.nonnoise.lasso.bic[a] <- 1 / 6 * sum((lasso.bic.nonnoise - true.betas[1:6])^2)
  mse.betas.noise.lasso.cv[a] <- 1 / nr.covariates * sum(lasso.cv.noise^2)
  mse.betas.nonnoise.lasso.cv[a] <- 1 / 6 * sum((lasso.cv.nonnoise - true.betas[1:6])^2)
  mse.betas.noise.ridge.aic[a] <- 1 / nr.covariates * sum(ridge.aic.noise^2)
  mse.betas.nonnoise.ridge.aic[a] <- 1 / 6 * sum((ridge.aic.nonnoise - true.betas[1:6])^2)
  mse.betas.noise.ridge.bic[a] <- 1 / nr.covariates * sum(ridge.bic.noise^2)
  mse.betas.nonnoise.ridge.bic[a] <- 1 / 6 * sum((ridge.bic.nonnoise - true.betas[1:6])^2)
  mse.betas.noise.ridge.cv[a] <- 1 / nr.covariates * sum(ridge.cv.noise^2)
  mse.betas.nonnoise.ridge.cv[a] <- 1 / 6 * sum((ridge.cv.nonnoise - true.betas[1:6])^2)
  
  
  #### MSEs for unpenalised maximum likelihood
  mod.mle <- nlm(L.sim.pen.elnet, theta.star.vec, x = covariate.matrix, N = 2, lambda = 0, alpha = 0,
                 nr.covariates = 47, print.level = 1, iterlim = 1000)
  theta.final.mle <- mod.mle$estimate
  
  # re-standardize intercepts
  icept1.mle <- theta.final.mle[3] - sum(theta.final.mle[5:length(theta.final.mle)] * original.means / original.sds)
  icept2.mle <- theta.final.mle[4] - sum(theta.final.mle[5:length(theta.final.mle)] * original.means / original.sds)
  icepts.mle <- sort(c(icept1.mle, icept2.mle), decreasing = TRUE)
  
  # gammas
  Gamma.temp.mle <- diag(2)
  Gamma.temp.mle[!Gamma.temp.mle] <- exp(theta.final.mle[1:2])
  Gamma.temp.mle <- Gamma.temp.mle / rowSums(Gamma.temp.mle)
  
  gamma11.mle <- Gamma.temp.mle[1,1]
  gamma22.mle <- Gamma.temp.mle[2,2]
  
  # MSE for gammas
  mse.gammas.mle[a] <- 1 / 2 * sum((c(gamma11.mle, gamma22.mle) - true.gammas)^2)
  
  # MSE for intercepts
  mse.intercepts.mle[a] <- 1 / 2 * sum((icepts.mle - true.icepts)^2)
  
  # TPR 
  selected.betas.mle <- sum(abs(theta.final.mle[5:10]) > 0.001)
  tpr.betas.mle[a] <- selected.betas.mle / 6
  
  # FPR
  false.detected.mle <- sum(abs(theta.final.mle[11:length(theta.final.mle)]) > 0.001)
  fpr.betas.mle[a] <- false.detected.mle / nr.covariates
  
  # re-standardize betas
  theta.final.mle[5:length(theta.final.mle)] <- theta.final.mle[5:length(theta.final.mle)] / original.sds
  
  # MSE for betas
  mse.betas.mle[a] <- 1 / (nr.covariates + 6) * sum((theta.final.mle[5:length(theta.final.mle)] - true.betas)^2)
  
  # separate MSE for noise and non-noise coefficients
  mle.noise <- theta.final.mle[11:length(theta.final.mle)]
  mle.nonnoise <- theta.final.mle[5:10]
  
  mse.betas.noise.mle[a] <- 1 / nr.covariates * sum(mle.noise^2)
  mse.betas.nonnoise.mle[a] <- 1 / 6 * sum((mle.nonnoise - true.betas[1:6])^2)
  
  
  #### forecasts
  which.1 <- which(covariate.matrix.forecast$PenaltyMade == 1)
  which.0 <- which(covariate.matrix.forecast$PenaltyMade == 0)
  
  forecast.vec.aic <- theta.final.aic
  if(opt.alpha.aic[a] > 0) forecast.vec.aic[abs(forecast.vec.aic) <= 0.001] <- 0
  forecast.aic <- bernoulli.HMM.forecast(covariate.matrix, N = 2, H = forecast.h, forecast.vec.aic, covariate.matrix.forecast)
  
  brier.aic[a] <- 1 / 100 * sum((forecast.aic[,2] - covariate.matrix.forecast$PenaltyMade)^2)
  avg.pred.prob.aic[a] <- mean(c(forecast.aic[which.1, 2] , forecast.aic[which.0, 1]))
  
  forecast.vec.bic <- theta.final.bic
  if(opt.alpha.bic[a] > 0) forecast.vec.bic[abs(forecast.vec.bic) <= 0.001] <- 0
  forecast.bic <- bernoulli.HMM.forecast(covariate.matrix, N = 2, H = forecast.h, forecast.vec.bic, covariate.matrix.forecast)
  brier.bic[a] <- 1 / 100 * sum((forecast.bic[,2] - covariate.matrix.forecast$PenaltyMade)^2)
  avg.pred.prob.bic[a] <- mean(c(forecast.bic[which.1, 2] , forecast.bic[which.0, 1]))
  
  forecast.vec.cv <- theta.final.cv
  if(opt.alpha.cv[a] > 0) forecast.vec.cv[abs(forecast.vec.cv) <= 0.001] <- 0
  forecast.cv <- bernoulli.HMM.forecast(covariate.matrix, N = 2, H = forecast.h, forecast.vec.cv, covariate.matrix.forecast)
  brier.cv[a] <- 1 / 100 * sum((forecast.cv[,2] - covariate.matrix.forecast$PenaltyMade)^2)
  avg.pred.prob.cv[a] <- mean(c(forecast.cv[which.1, 2] , forecast.cv[which.0, 1]))
  
  forecast.vec.lasso.aic <- theta.final.lasso.aic
  forecast.vec.lasso.aic[abs(forecast.vec.lasso.aic) <= 0.001] <- 0
  forecast.lasso.aic <- bernoulli.HMM.forecast(covariate.matrix, N = 2, H = forecast.h, forecast.vec.lasso.aic, covariate.matrix.forecast)
  brier.lasso.aic[a] <- 1 / 100 * sum((forecast.lasso.aic[,2] - covariate.matrix.forecast$PenaltyMade)^2)
  avg.pred.prob.lasso.aic[a] <- mean(c(forecast.lasso.aic[which.1, 2] , forecast.lasso.aic[which.0, 1]))
  
  forecast.vec.lasso.bic <- theta.final.lasso.bic
  forecast.vec.lasso.bic[abs(forecast.vec.lasso.bic) <= 0.001] <- 0
  forecast.lasso.bic <- bernoulli.HMM.forecast(covariate.matrix, N = 2, H = forecast.h, forecast.vec.lasso.bic, covariate.matrix.forecast)
  brier.lasso.bic[a] <- 1 / 100 * sum((forecast.lasso.bic[,2] - covariate.matrix.forecast$PenaltyMade)^2)
  avg.pred.prob.lasso.bic[a] <- mean(c(forecast.lasso.bic[which.1, 2] , forecast.lasso.bic[which.0, 1]))
  
  forecast.vec.lasso.cv <- theta.final.lasso.cv
  forecast.vec.lasso.cv[abs(forecast.vec.lasso.cv) <= 0.001] <- 0
  forecast.lasso.cv <- bernoulli.HMM.forecast(covariate.matrix, N = 2, H = forecast.h, forecast.vec.lasso.cv, covariate.matrix.forecast)
  brier.lasso.cv[a] <- 1 / 100 * sum((forecast.lasso.cv[,2] - covariate.matrix.forecast$PenaltyMade)^2)
  avg.pred.prob.lasso.cv[a] <- mean(c(forecast.lasso.cv[which.1, 2] , forecast.lasso.cv[which.0, 1]))
  
  forecast.vec.ridge.aic <- theta.final.ridge.aic
  forecast.ridge.aic <- bernoulli.HMM.forecast(covariate.matrix, N = 2, H = forecast.h, forecast.vec.ridge.aic, covariate.matrix.forecast)
  brier.ridge.aic[a] <- 1 / 100 * sum((forecast.ridge.aic[,2] - covariate.matrix.forecast$PenaltyMade)^2)
  avg.pred.prob.ridge.aic[a] <- mean(c(forecast.ridge.aic[which.1, 2] , forecast.ridge.aic[which.0, 1]))
  
  forecast.vec.ridge.bic <- theta.final.ridge.bic
  forecast.ridge.bic <- bernoulli.HMM.forecast(covariate.matrix, N = 2, H = forecast.h, forecast.vec.ridge.bic, covariate.matrix.forecast)
  brier.ridge.bic[a] <- 1 / 100 * sum((forecast.ridge.bic[,2] - covariate.matrix.forecast$PenaltyMade)^2)
  avg.pred.prob.ridge.bic[a] <- mean(c(forecast.ridge.bic[which.1, 2] , forecast.ridge.bic[which.0, 1]))
  
  forecast.vec.ridge.cv <- theta.final.ridge.cv
  forecast.ridge.cv <- bernoulli.HMM.forecast(covariate.matrix, N = 2, H = forecast.h, forecast.vec.ridge.cv, covariate.matrix.forecast)
  brier.ridge.cv[a] <- 1 / 100 * sum((forecast.ridge.cv[,2] - covariate.matrix.forecast$PenaltyMade)^2)
  avg.pred.prob.ridge.cv[a] <- mean(c(forecast.ridge.cv[which.1, 2] , forecast.ridge.cv[which.0, 1]))
  
  forecast.mle <- bernoulli.HMM.forecast(covariate.matrix, N = 2, H = forecast.h, theta.final.mle, covariate.matrix.forecast)
  brier.mle[a] <- 1 / 100 * sum((forecast.mle[,2] - covariate.matrix.forecast$PenaltyMade)^2)
  avg.pred.prob.mle[a] <- mean(c(forecast.mle[which.1, 2] , forecast.mle[which.0, 1]))
  
  erg <- data.frame(opt.alpha.aic, opt.alpha.bic, opt.alpha.cv, opt.lambda.aic, opt.lambda.bic,
                    opt.lambda.cv, opt.lambda.lasso.aic, opt.lambda.lasso.bic, opt.lambda.lasso.cv,
                    opt.lambda.ridge.cv,
                    mse.intercepts.aic, mse.intercepts.bic, mse.intercepts.mle, mse.intercepts.cv,
                    mse.intercepts.lasso.aic, mse.intercepts.lasso.bic, mse.intercepts.lasso.cv,
                    mse.intercepts.ridge.aic, mse.intercepts.ridge.bic, mse.intercepts.ridge.cv,
                    mse.betas.aic, mse.betas.noise.aic, mse.betas.nonnoise.aic,
                    mse.betas.bic, mse.betas.noise.bic, mse.betas.nonnoise.bic,
                    mse.betas.mle, mse.betas.noise.mle, mse.betas.nonnoise.mle,
                    mse.betas.cv, mse.betas.noise.cv, mse.betas.nonnoise.cv,
                    mse.betas.lasso.aic, mse.betas.lasso.bic, mse.betas.lasso.cv,
                    mse.betas.ridge.aic, mse.betas.ridge.bic, mse.betas.ridge.cv,
                    mse.gammas.aic, mse.gammas.bic, mse.gammas.mle, mse.gammas.cv,
                    mse.gammas.lasso.aic, mse.gammas.lasso.bic, mse.gammas.lasso.cv,
                    mse.gammas.ridge.aic, mse.gammas.ridge.bic, mse.gammas.ridge.cv,
                    tpr.betas.aic, tpr.betas.bic, tpr.betas.mle, tpr.betas.cv,
                    tpr.betas.lasso.aic, tpr.betas.lasso.bic, tpr.betas.lasso.cv,
                    tpr.betas.ridge.aic, tpr.betas.ridge.bic, tpr.betas.ridge.cv,
                    fpr.betas.aic, fpr.betas.bic, fpr.betas.mle, fpr.betas.cv,
                    fpr.betas.lasso.aic, fpr.betas.lasso.bic, fpr.betas.lasso.cv,
                    fpr.betas.ridge.aic, fpr.betas.ridge.bic, fpr.betas.ridge.cv,
                    brier.aic, brier.bic, brier.cv, brier.mle,
                    brier.lasso.aic, brier.lasso.bic, brier.lasso.cv,
                    brier.ridge.aic, brier.ridge.bic, brier.ridge.cv,
                    avg.pred.prob.aic, avg.pred.prob.bic,
                    avg.pred.prob.lasso.aic, avg.pred.prob.lasso.bic, avg.pred.prob.lasso.cv,
                    avg.pred.prob.ridge.aic, avg.pred.prob.ridge.bic, avg.pred.prob.ridge.cv,
                    avg.pred.prob.mle)
}
res_cor <- erg


# visualise results -------------------------------------------------------
# the following code generates Figures 2 and 3
# we start with the results obtained in the uncorrelated setting


# visualise results -- uncorrelated setting -------------------------------

erg <- res_uncor
erg <- na.omit(erg)
erg$run <- 1:nrow(erg)

# stack data frame so that ggplot can be used
erg2 <- data.frame(erg[ncol(erg)], stack(erg[1:(ncol(erg) - 1)]))
erg2$log_values <- log(erg2$values)

# reorder levels
erg2$ind <- as.factor(erg2$ind)
erg2$ind <- as.character(erg2$ind)

p.betas <- ggplot(erg2 %>% filter(ind == "mse.betas.aic" | ind == "mse.betas.bic" | 
                                    ind == "mse.betas.cv" | ind == "mse.betas.lasso.aic" |
                                    ind == "mse.betas.lasso.bic" | ind == "mse.betas.lasso.cv" |
                                    ind == "mse.betas.ridge.cv" | ind == "mse.betas.mle"), 
                  aes(x = ind, y = log_values)) + geom_boxplot() + ylab("log(MSE)") + xlab("") +
  scale_x_discrete(labels = c("elastic net AIC", "elastic net BIC", "elastic net CV", 
                              "LASSO AIC", "LASSO BIC", "LASSO CV",
                              "Ridge CV", "MLE"), limits = c("mse.betas.aic", "mse.betas.bic",
                                                             "mse.betas.cv", "mse.betas.lasso.aic",
                                                             "mse.betas.lasso.bic",
                                                             "mse.betas.lasso.cv",
                                                             "mse.betas.ridge.cv", "mse.betas.mle")) + 
  ggtitle(expression(paste(beta[1],",...,",beta[50]))) + 
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

p.betas


# intercepts
p.intercepts <- ggplot(erg2 %>% filter(ind == "mse.intercepts.aic" | ind == "mse.intercepts.bic" | 
                                         ind == "mse.intercepts.cv" | ind == "mse.intercepts.lasso.aic" |
                                         ind == "mse.intercepts.lasso.bic" | ind == "mse.intercepts.lasso.cv" |
                                         ind == "mse.intercepts.ridge.cv" |
                                         ind == "mse.intercepts.mle"), 
                       aes(x = ind, log_values)) + geom_boxplot() + ylab("log(MSE)") + xlab("") +
  scale_x_discrete(labels = c("elastic net AIC", "elastic net BIC", "elastic net CV", 
                              "LASSO AIC", "LASSO BIC", "LASSO CV",
                              "Ridge CV", "MLE"), limits = c("mse.intercepts.aic", "mse.intercepts.bic",
                                                             "mse.intercepts.cv", "mse.intercepts.lasso.aic",
                                                             "mse.intercepts.lasso.bic",
                                                             "mse.intercepts.lasso.cv",
                                                             "mse.intercepts.ridge.cv", "mse.intercepts.mle")) +
  ggtitle(expression(paste(beta[0]^{(1)}, ", ", beta[0]^{(2)}))) +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

# gammas
p.gammas <- ggplot(erg2 %>% filter(ind == "mse.gammas.aic" | ind == "mse.gammas.bic" | 
                                     ind == "mse.gammas.cv" | ind == "mse.gammas.lasso.aic" |
                                     ind == "mse.gammas.lasso.bic" | ind == "mse.gammas.lasso.cv" |
                                     ind == "mse.gammas.ridge.cv" |
                                     ind == "mse.gammas.mle"), 
                   aes(x = ind, log_values)) + geom_boxplot() + ylab("") + xlab("") +
  scale_x_discrete(labels = c("elastic net AIC", "elastic net BIC", "elastic net CV", 
                              "LASSO AIC", "LASSO BIC", "LASSO CV",
                              "Ridge CV", "MLE"), limits = c("mse.gammas.aic", "mse.gammas.bic",
                                                             "mse.gammas.cv", "mse.gammas.lasso.aic",
                                                             "mse.gammas.lasso.bic",
                                                             "mse.gammas.lasso.cv",
                                                             "mse.gammas.ridge.cv", "mse.gammas.mle")) +
  ggtitle(expression(paste(gamma[11], ", ", gamma[22]))) +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))


# brier score (Figure A2 in the appendix, left panel)
p.brier <- ggplot(erg2 %>% filter(ind == "brier.aic" | ind == "brier.bic" |
                                    ind == "brier.cv" | ind == "brier.lasso.aic" |
                                    ind == "brier.lasso.bic" | ind == "brier.lasso.cv" |
                                    ind == "brier.ridge.cv" |
                                    ind == "brier.mle"),
                  aes(x = ind, values)) + geom_boxplot() + ylab("Brier score") + xlab("") +
  scale_x_discrete(labels = c("elastic net AIC", "elastic net BIC", "elastic net CV",
                              "LASSO AIC", "LASSO BIC", "LASSO CV",
                              "Ridge CV", "MLE"), limits = c("brier.aic", "brier.bic",
                                                             "brier.cv", "brier.lasso.aic",
                                                             "brier.lasso.bic",
                                                             "brier.lasso.cv",
                                                             "brier.ridge.cv", "brier.mle")) +
  ggtitle("Brier scores (scenario I)") +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

### Figure 2
plots.fin <- grid.arrange(grobs = list(p.intercepts, p.gammas, p.betas),
                          layout_matrix = matrix(c(1, 3, 2, 3), nrow = 2))



# visualise results -- correlated setting ---------------------------------
erg <- res_cor

erg <- na.omit(erg)
erg$run <- 1:nrow(erg)
# stack data frame so that ggplot can be used
erg2 <- data.frame(erg[ncol(erg)], stack(erg[1:(ncol(erg) - 1)]))
erg2$log_values <- log(erg2$values)

# reorder levels
erg2$ind <- as.factor(erg2$ind)
erg2$ind <- as.character(erg2$ind)

p.betas <- ggplot(erg2 %>% filter(ind == "mse.betas.aic" | ind == "mse.betas.bic" | 
                                    ind == "mse.betas.cv" | ind == "mse.betas.lasso.aic" |
                                    ind == "mse.betas.lasso.bic" | ind == "mse.betas.lasso.cv" |
                                    ind == "mse.betas.ridge.cv" |
                                    ind == "mse.betas.mle"), 
                  aes(x = ind, y = log_values)) + geom_boxplot() + ylab("log(MSE)") + xlab("") +
  scale_x_discrete(labels = c("elastic net AIC", "elastic net BIC", "elastic net CV", 
                              "LASSO AIC", "LASSO BIC", "LASSO CV",
                              "Ridge CV", "MLE"), limits = c("mse.betas.aic", "mse.betas.bic",
                                                             "mse.betas.cv", "mse.betas.lasso.aic",
                                                             "mse.betas.lasso.bic",
                                                             "mse.betas.lasso.cv",
                                                             "mse.betas.ridge.cv", "mse.betas.mle")) + 
  ggtitle(expression(paste(beta[1],",...,",beta[50]))) + 
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

p.betas

# intercepts
p.intercepts <- ggplot(erg2 %>% filter(ind == "mse.intercepts.aic" | ind == "mse.intercepts.bic" | 
                                         ind == "mse.intercepts.cv" | ind == "mse.intercepts.lasso.aic" |
                                         ind == "mse.intercepts.lasso.bic" | ind == "mse.intercepts.lasso.cv" |
                                         ind == "mse.intercepts.ridge.cv" |
                                         ind == "mse.intercepts.mle"), 
                       aes(x = ind, log_values)) + geom_boxplot() + ylab("log(MSE)") + xlab("") +
  scale_x_discrete(labels = c("elastic net AIC", "elastic net BIC", "elastic net CV", 
                              "LASSO AIC", "LASSO BIC", "LASSO CV",
                              "Ridge CV", "MLE"), limits = c("mse.intercepts.aic", "mse.intercepts.bic",
                                                             "mse.intercepts.cv", "mse.intercepts.lasso.aic",
                                                             "mse.intercepts.lasso.bic",
                                                             "mse.intercepts.lasso.cv",
                                                             "mse.intercepts.ridge.cv", "mse.intercepts.mle")) +
  ggtitle(expression(paste(beta[0]^{(1)}, ", ", beta[0]^{(2)}))) +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

# gammas
p.gammas <- ggplot(erg2 %>% filter(ind == "mse.gammas.aic" | ind == "mse.gammas.bic" | 
                                     ind == "mse.gammas.cv" | ind == "mse.gammas.lasso.aic" |
                                     ind == "mse.gammas.lasso.bic" | ind == "mse.gammas.lasso.cv" |
                                     ind == "mse.gammas.ridge.cv" |
                                     ind == "mse.gammas.mle"), 
                   aes(x = ind, log_values)) + geom_boxplot() + ylab("") + xlab("") +
  scale_x_discrete(labels = c("elastic net AIC", "elastic net BIC", "elastic net CV", 
                              "LASSO AIC", "LASSO BIC", "LASSO CV",
                              "Ridge CV", "MLE"), limits = c("mse.gammas.aic", "mse.gammas.bic",
                                                             "mse.gammas.cv", "mse.gammas.lasso.aic",
                                                             "mse.gammas.lasso.bic",
                                                             "mse.gammas.lasso.cv",
                                                             "mse.gammas.ridge.cv", "mse.gammas.mle")) +
  ggtitle(expression(paste(gamma[11], ", ", gamma[22]))) +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))


# brier score (Figure A2 in the appendix, right panel)
p.brier.cor <- ggplot(erg2 %>% filter(ind == "brier.aic" | ind == "brier.bic" |
                                        ind == "brier.cv" | ind == "brier.lasso.aic" |
                                        ind == "brier.lasso.bic" | ind == "brier.lasso.cv" |
                                        ind == "brier.ridge.cv" |
                                        ind == "brier.mle"),
                      aes(x = ind, values)) + geom_boxplot() + ylab("") + xlab("") +
  scale_x_discrete(labels = c("elastic net AIC", "elastic net BIC", "elastic net CV",
                              "LASSO AIC", "LASSO BIC", "LASSO CV",
                              "Ridge CV", "MLE"), limits = c("brier.aic", "brier.bic",
                                                             "brier.cv", "brier.lasso.aic",
                                                             "brier.lasso.bic",
                                                             "brier.lasso.cv",
                                                             "brier.ridge.cv", "brier.mle")) +
  ggtitle("Brier scores (scenario II)") +
  theme(plot.title = element_text(hjust = 0.5),
        axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

### Figure 3
plots.fin <- grid.arrange(grobs = list(p.intercepts, p.gammas, p.betas), 
                          layout_matrix = matrix(c(1, 3, 2, 3), nrow = 2))