#### R Code for the paper 'Integrating multiple data sources in match-fixing warning systems' --------
#### This file contains the code for the scoring rates model, i.e. all estimations, Figure 3  and Tables 3 and 4

library(gamlss) # version 5.0-1
library(gamboostLSS) # version 2.0-0
# load the Bivariate Poisson implementation for gamboostLSS
source("0_PoissonMVlog.R")

# read data
data_odds <- read.csv("data_odds.csv")

# add intercept
data_odds$int <- rep(1, nrow(data_odds))
# make dummy variables as factor
data_odds$relhome <- as.factor(data_odds$relhome)
data_odds$promhome <- as.factor(data_odds$promhome)
data_odds$relaway <- as.factor(data_odds$relaway)
data_odds$promaway <- as.factor(data_odds$promaway)


# model based on all seasons prior to season 2009/10 ----------------------
data.until08 <- data_odds[data_odds$season <= 2008, ]

# estimate the model
mod_odds_09 <- gamboostLSS(list(lambda1 = y1 ~ bols(int, intercept = FALSE) + bbs(mvhome) + 
                                  bbs(mvaway) + bols(relhome, intercept = FALSE) + bols(promhome, intercept = FALSE) + 
                                  bols(relaway, intercept = FALSE) + bols(promaway, intercept = FALSE) + bbs(PL4Home) + 
                                  bbs(PL4Away) + bbs(matchday), lambda2 = y2 ~ bols(int, intercept = FALSE) + bbs(mvhome) + 
                                  bbs(mvaway) + bols(relhome, intercept = FALSE) + bols(promhome, intercept = FALSE) + 
                                  bols(relaway, intercept = FALSE) + bols(promaway, intercept = FALSE) + bbs(PL4Home) + 
                                  bbs(PL4Away) + bbs(matchday), lambda3 = y1 ~ bols(int, intercept = FALSE) + bbs(mvhome) + 
                                  bbs(mvaway) + bols(relhome, intercept = FALSE) + bols(promhome, intercept = FALSE) + 
                                  bols(relaway, intercept = FALSE) + bols(promaway, intercept = FALSE) + bbs(PL4Home) + 
                                  bbs(PL4Away) + bbs(matchday)), data = data.until08, control = boost_control(trace = TRUE, 
                                                                                                              mstop = 100, nu = 0.1), families = PoissonMVlog(lambda1 = NULL, lambda2 = NULL, 
                                                                                                                                                              lambda3 = NULL, y1 = data.until08$y1, y2 = data.until08$y2, scale.grad = TRUE))


# early stopping
set.seed(2305)
folds <- cv(model.weights(mod_odds_09), type = "subsampling")
# create grid of possible stopping values
grid <- make.grid(max = c(lambda1 = 150, lambda2 = 250, lambda3 = 250), min = 10, 
                  length.out = 5, dense_mu_grid = FALSE)
# apply early stopping. Please note: Executing the next line may take some hours
cvr <- cvrisk(mod_odds_09, grid = grid, folds = folds)
# mstop(cvr) gives the optimal stopping values, which are also stated in the next line
# optimal values: lambda1 = 20, lambda2 = 112, lambda3 = 112
mstop(mod_odds_09) <- c(20, 112, 112)


# prepare data for season 2009/10
data.09 <- data_odds[data_odds$season == 2009, ]

# predict values for the next season
pred.values <- predict(mod_odds_09, newdata = data.09, type = "response")

# odds estimation by simulation
odds.1 <- odds.x <- odds.2 <- odds.u15 <- odds.o15 <- odds.u25 <- odds.o25 <- odds.u35 <- odds.o35 <- c()
set.seed(2305)

# loop over all matches
for (j in 1:nrow(data.09)) {
  # take predicted values for current match
  lambda.hat <- c(pred.values$lambda1[j], pred.values$lambda2[j], pred.values$lambda3[j])
  # n = number of random numbers generated 
  n <- 1e+05
  # generate n random numbers from the Poisson distribution with the respective lambda parameter
  X_i <- array(dim = c(n, 3))
  for (i in 1:3) {
    X_i[, i] <- rpois(n, lambda.hat[i])
  }
  # create simulated home (Y_1) and away goals (Y_2)
  Y_1 <- X_i[, 1] + X_i[, 3]
  Y_2 <- X_i[, 2] + X_i[, 3]
  
  # take empirical proportions of a home win, draw etc. as estimate for the corresponding probability
  # of this event
  odds.1 <- c(odds.1, 1/(sum(Y_1 > Y_2)/n))
  odds.x <- c(odds.x, 1/(sum(Y_1 == Y_2)/n))
  odds.2 <- c(odds.2, 1/(sum(Y_1 < Y_2)/n))
  odds.u15 <- c(odds.u15, 1/(sum(Y_1 + Y_2 < 2)/n))
  odds.o15 <- c(odds.o15, 1/(sum(Y_1 + Y_2 > 1)/n))
  odds.u25 <- c(odds.u25, 1/(sum(Y_1 + Y_2 < 3)/n))
  odds.o25 <- c(odds.o25, 1/(sum(Y_1 + Y_2 > 2)/n))
  odds.u35 <- c(odds.u35, 1/(sum(Y_1 + Y_2 < 4)/n))
  odds.o35 <- c(odds.o35, 1/(sum(Y_1 + Y_2 > 3)/n))
}


# model based on all seasons prior to season 2010/11 ----------------------
data.until09 <- data_odds[data_odds$season <= 2009, ]

# estimate the model
mod_odds_10 <- gamboostLSS(list(lambda1 = y1 ~ bols(int, intercept = FALSE) + bbs(mvhome) + 
                                  bbs(mvaway) + bols(relhome, intercept = FALSE) + bols(promhome, intercept = FALSE) + 
                                  bols(relaway, intercept = FALSE) + bols(promaway, intercept = FALSE) + bbs(PL4Home) + 
                                  bbs(PL4Away) + bbs(matchday), lambda2 = y2 ~ bols(int, intercept = FALSE) + bbs(mvhome) + 
                                  bbs(mvaway) + bols(relhome, intercept = FALSE) + bols(promhome, intercept = FALSE) + 
                                  bols(relaway, intercept = FALSE) + bols(promaway, intercept = FALSE) + bbs(PL4Home) + 
                                  bbs(PL4Away) + bbs(matchday), lambda3 = y1 ~ bols(int, intercept = FALSE) + bbs(mvhome) + 
                                  bbs(mvaway) + bols(relhome, intercept = FALSE) + bols(promhome, intercept = FALSE) + 
                                  bols(relaway, intercept = FALSE) + bols(promaway, intercept = FALSE) + bbs(PL4Home) + 
                                  bbs(PL4Away) + bbs(matchday)), data = data.until09, control = boost_control(trace = TRUE, 
                                                                                                              mstop = 100, nu = 0.1), families = PoissonMVlog(lambda1 = NULL, lambda2 = NULL, 
                                                                                                                                                              lambda3 = NULL, y1 = data.until09$y1, y2 = data.until09$y2, scale.grad = TRUE))


# early stopping
set.seed(2305)
folds <- cv(model.weights(mod_odds_10), type = "subsampling")
# create grid of possible stopping values
grid <- make.grid(max = c(lambda1 = 200, lambda2 = 200, lambda3 = 300), min = 10, 
                  length.out = 5, dense_mu_grid = FALSE)
# apply early stopping. Please note: Executing the next line may take some hours
cvr <- cvrisk(mod_odds_10, grid = grid, folds = folds)
# mstop(cvr) gives the optimal stopping values, which are also stated in the next line
# optimal values: lambda1 = 21, lambda2 = 95, lambda3 = 128
mstop(mod_odds_10) <- c(21, 95, 128)



# prepare data for season 2010/11
data.10 <- data_odds[data_odds$season == 2010, ]

# predict values for the next season
pred.values <- predict(mod_odds_10, newdata = data.10, type = "response")

# odds estimation by simulation
odds.1 <- odds.x <- odds.2 <- odds.u15 <- odds.o15 <- odds.u25 <- odds.o25 <- odds.u35 <- odds.o35 <- c()
set.seed(2305)

# loop over all matches
for (j in 1:nrow(data.10)) {
  # take predicted values for current match
  lambda.hat <- c(pred.values$lambda1[j], pred.values$lambda2[j], pred.values$lambda3[j])
  # n = number of random numbers generated 
  n <- 1e+05
  # generate n random numbers from the Poisson distribution with the respective lambda parameter
  X_i <- array(dim = c(n, 3))
  for (i in 1:3) {
    X_i[, i] <- rpois(n, lambda.hat[i])
  }
  # create simulated home (Y_1) and away goals (Y_2)
  Y_1 <- X_i[, 1] + X_i[, 3]
  Y_2 <- X_i[, 2] + X_i[, 3]
  
  # take empirical proportions of a home win, draw etc. as estimate for the corresponding probability
  # of this event
  odds.1 <- c(odds.1, 1/(sum(Y_1 > Y_2)/n))
  odds.x <- c(odds.x, 1/(sum(Y_1 == Y_2)/n))
  odds.2 <- c(odds.2, 1/(sum(Y_1 < Y_2)/n))
  odds.u15 <- c(odds.u15, 1/(sum(Y_1 + Y_2 < 2)/n))
  odds.o15 <- c(odds.o15, 1/(sum(Y_1 + Y_2 > 1)/n))
  odds.u25 <- c(odds.u25, 1/(sum(Y_1 + Y_2 < 3)/n))
  odds.o25 <- c(odds.o25, 1/(sum(Y_1 + Y_2 > 2)/n))
  odds.u35 <- c(odds.u35, 1/(sum(Y_1 + Y_2 < 4)/n))
  odds.o35 <- c(odds.o35, 1/(sum(Y_1 + Y_2 > 3)/n))
}


# model based on all seasons prior to season 2011/12 ----------------------
data.until10 <- data_odds[data_odds$season <= 2010, ]

# estimate the model
mod_odds_11 <- gamboostLSS(list(lambda1 = y1 ~ bols(int, intercept = FALSE) + bbs(mvhome) + 
                                  bbs(mvaway) + bols(relhome, intercept = FALSE) + bols(promhome, intercept = FALSE) + 
                                  bols(relaway, intercept = FALSE) + bols(promaway, intercept = FALSE) + bbs(PL4Home) + 
                                  bbs(PL4Away) + bbs(matchday), lambda2 = y2 ~ bols(int, intercept = FALSE) + bbs(mvhome) + 
                                  bbs(mvaway) + bols(relhome, intercept = FALSE) + bols(promhome, intercept = FALSE) + 
                                  bols(relaway, intercept = FALSE) + bols(promaway, intercept = FALSE) + bbs(PL4Home) + 
                                  bbs(PL4Away) + bbs(matchday), lambda3 = y1 ~ bols(int, intercept = FALSE) + bbs(mvhome) + 
                                  bbs(mvaway) + bols(relhome, intercept = FALSE) + bols(promhome, intercept = FALSE) + 
                                  bols(relaway, intercept = FALSE) + bols(promaway, intercept = FALSE) + bbs(PL4Home) + 
                                  bbs(PL4Away) + bbs(matchday)), data = data.until10, control = boost_control(trace = TRUE, 
                                                                                                              mstop = 100, nu = 0.1), families = PoissonMVlog(lambda1 = NULL, lambda2 = NULL, 
                                                                                                                                                              lambda3 = NULL, y1 = data.until10$y1, y2 = data.until10$y2, scale.grad = TRUE))

# early stopping
set.seed(2305)
folds <- cv(model.weights(mod_odds_11), type = "subsampling")
# create grid of possible stopping values
grid <- make.grid(max = c(lambda1 = 200, lambda2 = 200, lambda3 = 200), min = 10, 
                  length.out = 5, dense_mu_grid = FALSE)
# apply early stopping. Please note: Executing the next line may take some hours
cvr <- cvrisk(mod_odds_11, grid = grid, folds = folds)
# mstop(cvr) gives the optimal stopping values, which are also stated in the next line
# optimal values: lambda1 = 95, lambda2 = 21, lambda3 = 95
mstop(mod_odds_11) <- c(95, 21, 95)

# prepare data for season 2011/12
data.11 <- data_odds[data_odds$season == 2011, ]

# predict values for the next season
pred.values <- predict(mod_odds_11, newdata = data.11, type = "response")

# odds estimation by simulation
odds.1 <- odds.x <- odds.2 <- odds.u15 <- odds.o15 <- odds.u25 <- odds.o25 <- odds.u35 <- odds.o35 <- c()
set.seed(2305)

# loop over all matches
for (j in 1:nrow(data.11)) {
  # take predicted values for current match
  lambda.hat <- c(pred.values$lambda1[j], pred.values$lambda2[j], pred.values$lambda3[j])
  # n = number of random numbers generated
  n <- 1e+05
  # generate n random numbers from the Poisson distribution with the respective lambda parameter
  X_i <- array(dim = c(n, 3))
  for (i in 1:3) {
    X_i[, i] <- rpois(n, lambda.hat[i])
  }
  # create simulated home (Y_1) and away goals (Y_2)
  Y_1 <- X_i[, 1] + X_i[, 3]
  Y_2 <- X_i[, 2] + X_i[, 3]
  
  # take empirical proportions of a home win, draw etc. as estimate for the corresponding probability
  # of this event
  odds.1 <- c(odds.1, 1/(sum(Y_1 > Y_2)/n))
  odds.x <- c(odds.x, 1/(sum(Y_1 == Y_2)/n))
  odds.2 <- c(odds.2, 1/(sum(Y_1 < Y_2)/n))
  odds.u15 <- c(odds.u15, 1/(sum(Y_1 + Y_2 < 2)/n))
  odds.o15 <- c(odds.o15, 1/(sum(Y_1 + Y_2 > 1)/n))
  odds.u25 <- c(odds.u25, 1/(sum(Y_1 + Y_2 < 3)/n))
  odds.o25 <- c(odds.o25, 1/(sum(Y_1 + Y_2 > 2)/n))
  odds.u35 <- c(odds.u35, 1/(sum(Y_1 + Y_2 < 4)/n))
  odds.o35 <- c(odds.o35, 1/(sum(Y_1 + Y_2 > 3)/n))
}



# model based on all seasons prior to season 2012/13 ----------------------
data.until11 <- data_odds[data_odds$season <= 2011, ]

# estimate the model
mod_odds_12 <- gamboostLSS(list(lambda1 = y1 ~ bols(int, intercept = FALSE) + bbs(mvhome) + 
                                  bbs(mvaway) + bols(relhome, intercept = FALSE) + bols(promhome, intercept = FALSE) + 
                                  bols(relaway, intercept = FALSE) + bols(promaway, intercept = FALSE) + bbs(PL4Home) + 
                                  bbs(PL4Away) + bbs(matchday), lambda2 = y2 ~ bols(int, intercept = FALSE) + bbs(mvhome) + 
                                  bbs(mvaway) + bols(relhome, intercept = FALSE) + bols(promhome, intercept = FALSE) + 
                                  bols(relaway, intercept = FALSE) + bols(promaway, intercept = FALSE) + bbs(PL4Home) + 
                                  bbs(PL4Away) + bbs(matchday), lambda3 = y1 ~ bols(int, intercept = FALSE) + bbs(mvhome) + 
                                  bbs(mvaway) + bols(relhome, intercept = FALSE) + bols(promhome, intercept = FALSE) + 
                                  bols(relaway, intercept = FALSE) + bols(promaway, intercept = FALSE) + bbs(PL4Home) + 
                                  bbs(PL4Away) + bbs(matchday)), data = data.until11, control = boost_control(trace = TRUE, 
                                                                                                              mstop = 100, nu = 0.1), families = PoissonMVlog(lambda1 = NULL, lambda2 = NULL, 
                                                                                                                                                              lambda3 = NULL, y1 = data.until11$y1, y2 = data.until11$y2, scale.grad = TRUE))

# early stopping
set.seed(2305)
folds <- cv(model.weights(mod_odds_12), type = "subsampling")
# create grid of possible stopping values
grid <- make.grid(max = c(lambda1 = 150, lambda2 = 200, lambda3 = 250), min = 10, 
                  length.out = 5, dense_mu_grid = FALSE)
# apply early stopping. Please note: Executing the next line may take some hours
cvr <- cvrisk(mod_odds_12, grid = grid, folds = folds)
# mstop(cvr) gives the optimal stopping values, which are also stated in the next line
# optimal values: lambda1 = 20, lambda2 = 95, lambda3 = 112
mstop(mod_odds_12) <- c(20, 95, 112)


# prepare data for season 2012/13
data.12 <- data_odds[data_odds$season == 2012, ]

# predict values for the next season
pred.values <- predict(mod_odds_12, newdata = data.12, type = "response")

# odds estimation by simulation
odds.1 <- odds.x <- odds.2 <- odds.u15 <- odds.o15 <- odds.u25 <- odds.o25 <- odds.u35 <- odds.o35 <- c()
set.seed(2305)

# loop over all matches
for (j in 1:nrow(data.12)) {
  # take predicted values for current match
  lambda.hat <- c(pred.values$lambda1[j], pred.values$lambda2[j], pred.values$lambda3[j])
  # n = number of random numbers generated 
  n <- 1e+05
  # generate n random numbers from the Poisson distribution with the respective lambda parameter
  X_i <- array(dim = c(n, 3))
  for (i in 1:3) {
    X_i[, i] <- rpois(n, lambda.hat[i])
  }
  # create simulated home (Y_1) and away goals (Y_2)
  Y_1 <- X_i[, 1] + X_i[, 3]
  Y_2 <- X_i[, 2] + X_i[, 3]
  
  # take empirical proportions of a home win, draw etc. as estimate for the corresponding probability
  # of this event
  odds.1 <- c(odds.1, 1/(sum(Y_1 > Y_2)/n))
  odds.x <- c(odds.x, 1/(sum(Y_1 == Y_2)/n))
  odds.2 <- c(odds.2, 1/(sum(Y_1 < Y_2)/n))
  odds.u15 <- c(odds.u15, 1/(sum(Y_1 + Y_2 < 2)/n))
  odds.o15 <- c(odds.o15, 1/(sum(Y_1 + Y_2 > 1)/n))
  odds.u25 <- c(odds.u25, 1/(sum(Y_1 + Y_2 < 3)/n))
  odds.o25 <- c(odds.o25, 1/(sum(Y_1 + Y_2 > 2)/n))
  odds.u35 <- c(odds.u35, 1/(sum(Y_1 + Y_2 < 4)/n))
  odds.o35 <- c(odds.o35, 1/(sum(Y_1 + Y_2 > 3)/n))
}



# model based on all seasons prior to season 2013/14 ----------------------
data.until12 <- data_odds[data_odds$season <= 2012, ]

# estimate the model
mod_odds_13 <- gamboostLSS(list(lambda1 = y1 ~ bols(int, intercept = FALSE) + bbs(mvhome) + 
                                  bbs(mvaway) + bols(relhome, intercept = FALSE) + bols(promhome, intercept = FALSE) + 
                                  bols(relaway, intercept = FALSE) + bols(promaway, intercept = FALSE) + bbs(PL4Home) + 
                                  bbs(PL4Away) + bbs(matchday), lambda2 = y2 ~ bols(int, intercept = FALSE) + bbs(mvhome) + 
                                  bbs(mvaway) + bols(relhome, intercept = FALSE) + bols(promhome, intercept = FALSE) + 
                                  bols(relaway, intercept = FALSE) + bols(promaway, intercept = FALSE) + bbs(PL4Home) + 
                                  bbs(PL4Away) + bbs(matchday), lambda3 = y1 ~ bols(int, intercept = FALSE) + bbs(mvhome) + 
                                  bbs(mvaway) + bols(relhome, intercept = FALSE) + bols(promhome, intercept = FALSE) + 
                                  bols(relaway, intercept = FALSE) + bols(promaway, intercept = FALSE) + bbs(PL4Home) + 
                                  bbs(PL4Away) + bbs(matchday)), data = data.until12, control = boost_control(trace = TRUE, 
                                                                                                              mstop = 100, nu = 0.1), families = PoissonMVlog(lambda1 = NULL, lambda2 = NULL, 
                                                                                                                                                              lambda3 = NULL, y1 = data.until12$y1, y2 = data.until12$y2, scale.grad = TRUE))

# early stopping
set.seed(2305)
folds <- cv(model.weights(mod_odds_13), type = "subsampling")
# create grid of possible stopping values
grid <- make.grid(max = c(lambda1 = 150, lambda2 = 160, lambda3 = 190), min = 80, 
                  length.out = 5, dense_mu_grid = FALSE)
# apply early stopping. Please note: Executing the next line may take some hours
cvr <- cvrisk(mod_odds_13, grid = grid, folds = folds)
# mstop(cvr) gives the optimal stopping values, which are also stated in the next line
# optimal values: lambda1 = 128, lambda2 = 135, lambda3 = 153
mstop(mod_odds_13) <- c(128, 135, 153)

# prepare data for season 2013/14
data.13 <- data_odds[data_odds$season == 2013, ]

# predict values for the next season
pred.values <- predict(mod_odds_13, newdata = data.13, type = "response")

# odds estimation by simulation
odds.1 <- odds.x <- odds.2 <- odds.u15 <- odds.o15 <- odds.u25 <- odds.o25 <- odds.u35 <- odds.o35 <- c()
set.seed(2305)

# loop over all matches
for (j in 1:nrow(data.13)) {
  # take predicted values for current match
  lambda.hat <- c(pred.values$lambda1[j], pred.values$lambda2[j], pred.values$lambda3[j])
  # n = number of random numbers generated 
  n <- 1e+05
  # generate n random numbers from the Poisson distribution with the respective lambda parameter
  X_i <- array(dim = c(n, 3))
  for (i in 1:3) {
    X_i[, i] <- rpois(n, lambda.hat[i])
  }
  # create simulated home (Y_1) and away goals (Y_2)
  Y_1 <- X_i[, 1] + X_i[, 3]
  Y_2 <- X_i[, 2] + X_i[, 3]
  
  # take empirical proportions of a home win, draw etc. as estimate for the corresponding probability
  # of this event
  odds.1 <- c(odds.1, 1/(sum(Y_1 > Y_2)/n))
  odds.x <- c(odds.x, 1/(sum(Y_1 == Y_2)/n))
  odds.2 <- c(odds.2, 1/(sum(Y_1 < Y_2)/n))
  odds.u15 <- c(odds.u15, 1/(sum(Y_1 + Y_2 < 2)/n))
  odds.o15 <- c(odds.o15, 1/(sum(Y_1 + Y_2 > 1)/n))
  odds.u25 <- c(odds.u25, 1/(sum(Y_1 + Y_2 < 3)/n))
  odds.o25 <- c(odds.o25, 1/(sum(Y_1 + Y_2 > 2)/n))
  odds.u35 <- c(odds.u35, 1/(sum(Y_1 + Y_2 < 4)/n))
  odds.o35 <- c(odds.o35, 1/(sum(Y_1 + Y_2 > 3)/n))
}



# model based on all seasons prior to season 2014/15 ----------------------
data.until13 <- data_odds[data_odds$season <= 2013, ]

# estimate the model
mod_odds_14 <- gamboostLSS(list(lambda1 = y1 ~ bols(int, intercept = FALSE) + bbs(mvhome) + 
                                  bbs(mvaway) + bols(relhome, intercept = FALSE) + bols(promhome, intercept = FALSE) + 
                                  bols(relaway, intercept = FALSE) + bols(promaway, intercept = FALSE) + bbs(PL4Home) + 
                                  bbs(PL4Away) + bbs(matchday), lambda2 = y2 ~ bols(int, intercept = FALSE) + bbs(mvhome) + 
                                  bbs(mvaway) + bols(relhome, intercept = FALSE) + bols(promhome, intercept = FALSE) + 
                                  bols(relaway, intercept = FALSE) + bols(promaway, intercept = FALSE) + bbs(PL4Home) + 
                                  bbs(PL4Away) + bbs(matchday), lambda3 = y1 ~ bols(int, intercept = FALSE) + bbs(mvhome) + 
                                  bbs(mvaway) + bols(relhome, intercept = FALSE) + bols(promhome, intercept = FALSE) + 
                                  bols(relaway, intercept = FALSE) + bols(promaway, intercept = FALSE) + bbs(PL4Home) + 
                                  bbs(PL4Away) + bbs(matchday)), data = data.until13, control = boost_control(trace = TRUE, 
                                                                                                              mstop = 100, nu = 0.1), families = PoissonMVlog(lambda1 = NULL, lambda2 = NULL, 
                                                                                                                                                              lambda3 = NULL, y1 = data.until13$y1, y2 = data.until13$y2, scale.grad = TRUE))


# early stopping
set.seed(2305)
folds <- cv(model.weights(mod_odds_14), type = "subsampling")
# create grid of possible stopping values
grid <- make.grid(max = c(lambda1 = 200, lambda2 = 200, lambda3 = 250), min = 10, 
                  length.out = 5, dense_mu_grid = FALSE)
# apply early stopping. Please note: Executing the next line may take some hours
cvr <- cvrisk(mod_odds_14, grid = grid, folds = folds)
# mstop(cvr) gives the optimal stopping values, which are also stated in the next line
# optimal values: lambda1 = 21, lambda2 = 95, lambda3 = 112
mstop(mod_odds_14) <- c(21, 95, 112)

# prepare data for season 2014/15
data.14 <- data_odds[data_odds$season == 2014, ]

# predict values for the next season
pred.values <- predict(mod_odds_14, newdata = data.14, type = "response")

# odds estimation by simulation
odds.1 <- odds.x <- odds.2 <- odds.u15 <- odds.o15 <- odds.u25 <- odds.o25 <- odds.u35 <- odds.o35 <- c()
set.seed(2305)

# loop over all matches
for (j in 1:nrow(data.14)) {
  # take predicted values for current match
  lambda.hat <- c(pred.values$lambda1[j], pred.values$lambda2[j], pred.values$lambda3[j])
  # n = number of random numbers generated 
  n <- 1e+05
  # generate n random numbers from the Poisson distribution with the respective lambda parameter
  X_i <- array(dim = c(n, 3))
  for (i in 1:3) {
    X_i[, i] <- rpois(n, lambda.hat[i])
  }
  # create simulated home (Y_1) and away goals (Y_2)
  Y_1 <- X_i[, 1] + X_i[, 3]
  Y_2 <- X_i[, 2] + X_i[, 3]
  
  # take empirical proportions of a home win, draw etc. as estimate for the corresponding probability
  # of this event
  odds.1 <- c(odds.1, 1/(sum(Y_1 > Y_2)/n))
  odds.x <- c(odds.x, 1/(sum(Y_1 == Y_2)/n))
  odds.2 <- c(odds.2, 1/(sum(Y_1 < Y_2)/n))
  odds.u15 <- c(odds.u15, 1/(sum(Y_1 + Y_2 < 2)/n))
  odds.o15 <- c(odds.o15, 1/(sum(Y_1 + Y_2 > 1)/n))
  odds.u25 <- c(odds.u25, 1/(sum(Y_1 + Y_2 < 3)/n))
  odds.o25 <- c(odds.o25, 1/(sum(Y_1 + Y_2 > 2)/n))
  odds.u35 <- c(odds.u35, 1/(sum(Y_1 + Y_2 < 4)/n))
  odds.o35 <- c(odds.o35, 1/(sum(Y_1 + Y_2 > 3)/n))
}



# model based on all seasons prior to season 2015/16 ----------------------
data.until14 <- data_odds[data_odds$season <= 2014, ]

# estimate the model
mod_odds_15 <- gamboostLSS(list(lambda1 = y1 ~ bols(int, intercept = FALSE) + bbs(mvhome) + 
                                  bbs(mvaway) + bols(relhome, intercept = FALSE) + bols(promhome, intercept = FALSE) + 
                                  bols(relaway, intercept = FALSE) + bols(promaway, intercept = FALSE) + bbs(PL4Home) + 
                                  bbs(PL4Away) + bbs(matchday), lambda2 = y2 ~ bols(int, intercept = FALSE) + bbs(mvhome) + 
                                  bbs(mvaway) + bols(relhome, intercept = FALSE) + bols(promhome, intercept = FALSE) + 
                                  bols(relaway, intercept = FALSE) + bols(promaway, intercept = FALSE) + bbs(PL4Home) + 
                                  bbs(PL4Away) + bbs(matchday), lambda3 = y1 ~ bols(int, intercept = FALSE) + bbs(mvhome) + 
                                  bbs(mvaway) + bols(relhome, intercept = FALSE) + bols(promhome, intercept = FALSE) + 
                                  bols(relaway, intercept = FALSE) + bols(promaway, intercept = FALSE) + bbs(PL4Home) + 
                                  bbs(PL4Away) + bbs(matchday)), data = data.until14, control = boost_control(trace = TRUE, 
                                                                                                              mstop = 100, nu = 0.1), families = PoissonMVlog(lambda1 = NULL, lambda2 = NULL, 
                                                                                                                                                              lambda3 = NULL, y1 = data.until14$y1, y2 = data.until14$y2, scale.grad = TRUE))


# early stopping
set.seed(2305)
folds <- cv(model.weights(mod_odds_15), type = "subsampling")
# create grid of possible stopping values
grid <- make.grid(max = c(lambda1 = 200, lambda2 = 200, lambda3 = 250), min = 10, 
                  length.out = 5, dense_mu_grid = FALSE)
# apply early stopping. Please note: Executing the next line may take some hours
cvr <- cvrisk(mod_odds_15, grid = grid, folds = folds)
# mstop(cvr) gives the optimal stopping values, which are also stated in the next line
# optimal values: lambda1 = 21, lambda2 = 95, lambda3 = 112
mstop(mod_odds_15) <- c(21, 95, 112)

# prepare data for season 2015/16
data.15 <- data_odds[data_odds$season == 2015, ]

# predict values for the next season
pred.values <- predict(mod_odds_15, newdata = data.15, type = "response")

# odds estimation by simulation
odds.1 <- odds.x <- odds.2 <- odds.u15 <- odds.o15 <- odds.u25 <- odds.o25 <- odds.u35 <- odds.o35 <- c()
set.seed(2305)

# loop over all matches
for (j in 1:nrow(data.15)) {
  # take predicted values for current match
  lambda.hat <- c(pred.values$lambda1[j], pred.values$lambda2[j], pred.values$lambda3[j])
  # n = number of random numbers generated 
  n <- 1e+05
  # generate n random numbers from the Poisson distribution with the respective lambda parameter
  X_i <- array(dim = c(n, 3))
  for (i in 1:3) {
    X_i[, i] <- rpois(n, lambda.hat[i])
  }
  # create simulated home (Y_1) and away goals (Y_2)
  Y_1 <- X_i[, 1] + X_i[, 3]
  Y_2 <- X_i[, 2] + X_i[, 3]
  
  # take empirical proportions of a home win, draw etc. as estimate for the corresponding probability
  # of this event
  odds.1 <- c(odds.1, 1/(sum(Y_1 > Y_2)/n))
  odds.x <- c(odds.x, 1/(sum(Y_1 == Y_2)/n))
  odds.2 <- c(odds.2, 1/(sum(Y_1 < Y_2)/n))
  odds.u15 <- c(odds.u15, 1/(sum(Y_1 + Y_2 < 2)/n))
  odds.o15 <- c(odds.o15, 1/(sum(Y_1 + Y_2 > 1)/n))
  odds.u25 <- c(odds.u25, 1/(sum(Y_1 + Y_2 < 3)/n))
  odds.o25 <- c(odds.o25, 1/(sum(Y_1 + Y_2 > 2)/n))
  odds.u35 <- c(odds.u35, 1/(sum(Y_1 + Y_2 < 4)/n))
  odds.o35 <- c(odds.o35, 1/(sum(Y_1 + Y_2 > 3)/n))
}


## Analysis of outliers continues in file 'OutlierDetection_RCode.R'



# Plots -------------------------------------------------------------------
# Plots for the model fitted to data from season 2009/10 to 2014/15
# The code below generates Figure 3
# Please note: As we provide artificial data, 
# the following results differ slightly from those presented in the paper

data.plot <- data.until14

# for all selected base learners, create a sequence from min(covariate) to
# max(covariate) to draw the estimated effect on an appropriate grid

data.plot$mvhome <- seq(min(data.until14$mvhome), max(data.until14$mvhome), length = nrow(data.until14))
data.plot$mvaway <- seq(min(data.until14$mvaway), max(data.until14$mvaway), length = nrow(data.until14))
data.plot$matchday <- seq(min(data.until14$matchday), max(data.until14$matchday), 
                          length = nrow(data.until14))
data.plot$PL4Home <- seq(min(data.until14$PL4Home), max(data.until14$PL4Home), length = nrow(data.until14))
data.plot$PL4Away <- seq(min(data.until14$PL4Away), max(data.until14$PL4Away), length = nrow(data.until14))

layout(matrix(c(rep(1, 20), rep(2, 20), rep(3, 20), rep(4, 12), rep(5, 12), rep(6, 12), 
                rep(7, 12), rep(8, 12), rep(9, 15), rep(10, 15), rep(11, 15), rep(12, 15)), 
              3, 60, byrow = TRUE))
par(mai = c(0.45, 0.2, 0.15, 0.1) + 0.1)


### Plots for lambda 1 
## mvhome 
# calculate values for the fitted function
pred.values <- as.numeric(predict(mod_odds_15, parameter = "lambda1", which = "mvhome", 
                                  newdata = data.plot))

# Plot the effect of mvhome
plot(data.plot$mvhome, pred.values, type = "l", lwd = 2, ylim = c(-1, 0.05), xlab = "mvhome", 
     main = c(bquote(widehat(f)[11])))
# add a short vertical line for each original data point
points(data.until14$mvhome, rep(-1.04, nrow(data.plot)), pch = "|", col = "black", 
       cex = 0.7)

## mvaway 
# calculate values for the fitted function
pred.values <- as.numeric(predict(mod_odds_15, parameter = "lambda1", which = "mvaway", 
                                  newdata = data.plot))

# Plot the effect of mvaway
plot(data.plot$mvaway, pred.values, type = "l", lwd = 2, ylim = c(-1, 0.05), xlab = "mvaway", 
     main = c(bquote(widehat(f)[21])))
# add a short vertical line for each original data point
points(data.until14$mvaway, rep(-1.04, nrow(data.plot)), pch = "|", col = "black", 
       cex = 0.7)


## p4away 
# calculate values for the fitted function
pred.values <- as.numeric(predict(mod_odds_15, parameter = "lambda1", which = "PL4Away", 
                                  newdata = data.plot))

# Plot the effect of p4away
plot(data.plot$PL4Away, pred.values, type = "l", lwd = 2, ylim = c(-1, 0.05), xlab = "avgp4ateam", 
     main = c(bquote(widehat(f)[31])))
# add a short vertical line for each original data point
points(data.until14$PL4Away, rep(-1.04, nrow(data.plot)), pch = "|", col = "black", 
       cex = 0.7)


### Plots for lambda 2 
## mvhome 
# calculate values for the fitted function
pred.values <- as.numeric(predict(mod_odds_15, parameter = "lambda2", which = "mvhome", 
                                  newdata = data.plot))

# Plot the effect of mvhome
plot(data.plot$mvhome, pred.values, type = "l", lwd = 2, ylim = c(-0.87, 0.1), xlab = "mvhome", 
     main = c(bquote(widehat(f)[12])))
# add a short vertical line for each original data point
points(data.until14$mvhome, rep(-0.91, nrow(data.plot)), pch = "|", col = "black", 
       cex = 0.7)


## mvaway 
# calculate values for the fitted function
pred.values <- as.numeric(predict(mod_odds_15, parameter = "lambda2", which = "mvaway", 
                                  newdata = data.plot))

# Plot the effect of mvaway
plot(data.plot$mvaway, pred.values, type = "l", lwd = 2, ylim = c(-0.87, 0.1), xlab = "mvaway", 
     main = c(bquote(widehat(f)[22])))
# add a short vertical line for each original data point
points(data.until14$mvaway, rep(-0.91, nrow(data.plot)), pch = "|", col = "black", 
       cex = 0.7)

## matchday 
# calculate values for the fitted function
pred.values <- as.numeric(predict(mod_odds_15, parameter = "lambda2", which = "matchday", 
                                  newdata = data.plot))

# Plot the effect of matchday
plot(data.plot$matchday, pred.values, type = "l", lwd = 2, ylim = c(-0.87, 0.1), 
     xlab = "matchday", main = c(bquote(widehat(f)[32])))
# add a short vertical line for each original data point
points(data.until14$matchday, rep(-0.91, nrow(data.plot)), pch = "|", col = "black", 
       cex = 0.7)


## p4home 
# calculate values for the fitted function
pred.values <- as.numeric(predict(mod_odds_15, parameter = "lambda2", which = "PL4Home", 
                                  newdata = data.plot))

# Plot the effect of p4home
plot(data.plot$PL4Home, pred.values, type = "l", lwd = 2, ylim = c(-0.87, 0.1), xlab = "avgp4hteam", 
     main = c(bquote(widehat(f)[42])))
# add a short vertical line for each original data point
points(data.until14$PL4Home, rep(-0.91, nrow(data.plot)), pch = "|", col = "black", 
       cex = 0.7)


## p4away 
# calculate values for the fitted function
pred.values <- as.numeric(predict(mod_odds_15, parameter = "lambda2", which = "PL4Away", 
                                  newdata = data.plot))

# Plot the effect of p4away
plot(data.plot$PL4Away, pred.values, type = "l", lwd = 2, ylim = c(-0.87, 0.1), xlab = "avgp4ateam", 
     main = c(bquote(widehat(f)[52])))
# add a short vertical line for each original data point
points(data.until14$PL4Home, rep(-0.91, nrow(data.plot)), pch = "|", col = "black", 
       cex = 0.7)


### Plots for lambda 3 
## mvhome 
# calculate values for the fitted function
pred.values <- as.numeric(predict(mod_odds_15, parameter = "lambda3", which = "mvhome", 
                                  newdata = data.plot))

# Plot the effect of mvhome
plot(data.plot$mvhome, pred.values, type = "l", lwd = 2, ylim = c(-2.48, 0.05), xlab = "mvhome", 
     main = c(bquote(widehat(f)[13])))
# add a short vertical line for each original data point
points(data.until14$mvhome, rep(-2.55, nrow(data.plot)), pch = "|", col = "black", 
       cex = 0.7)


## mvaway 
# calculate values for the fitted function
pred.values <- as.numeric(predict(mod_odds_15, parameter = "lambda3", which = "mvaway", 
                                  newdata = data.plot))

# Plot the effect of mvaway
plot(data.plot$mvaway, pred.values, type = "l", lwd = 2, ylim = c(-2.48, 0.05), xlab = "mvaway", 
     main = c(bquote(widehat(f)[23])))
# add a short vertical line for each original data point
points(data.until14$mvaway, rep(-2.55, nrow(data.plot)), pch = "|", col = "black", 
       cex = 0.7)


## matchday 
# calculate values for the fitted function
pred.values <- as.numeric(predict(mod_odds_15, parameter = "lambda3", which = "matchday", 
                                  newdata = data.plot))

# Plot the effect of matchday
plot(data.plot$matchday, pred.values, type = "l", lwd = 2, ylim = c(-2.48, 0.05), 
     xlab = "matchday", main = c(bquote(widehat(f)[33])))
# add a short vertical line for each original data point
points(data.until14$matchday, rep(-2.57, nrow(data.plot)), pch = "|", col = "black", 
       cex = 0.7)


## p4away 
# calculate values for the fitted function
pred.values <- as.numeric(predict(mod_odds_15, parameter = "lambda3", which = "PL4Away", 
                                  newdata = data.plot))

# Plot the effect of p4away
plot(data.plot$PL4Away, pred.values, type = "l", lwd = 2, ylim = c(-2.48, 0.05), 
     xlab = "avgp4ateam", main = c(bquote(widehat(f)[43])))
# add a short vertical line for each original data point
points(data.until14$PL4Home, rep(-2.55, nrow(data.plot)), pch = "|", col = "black", 
       cex = 0.7)


# Descriptive statistics --------------------------------------------------
## Data for Table 3
summary(data_odds)



# Selected base learners for all season -----------------------------------
## Information for Table 4

# selected base learners for season 2009/10
names(coef(mod_odds_09)$lambda1)
names(coef(mod_odds_09)$lambda2)
names(coef(mod_odds_09)$lambda3)

# selected base learners for season 2010/11
names(coef(mod_odds_10)$lambda1)
names(coef(mod_odds_10)$lambda2)
names(coef(mod_odds_10)$lambda3)

# selected base learners for season 2011/12
names(coef(mod_odds_11)$lambda1)
names(coef(mod_odds_11)$lambda2)
names(coef(mod_odds_11)$lambda3)

# selected base learners for season 2012/13
names(coef(mod_odds_12)$lambda1)
names(coef(mod_odds_12)$lambda2)
names(coef(mod_odds_12)$lambda3)

# selected base learners for season 2013/14
names(coef(mod_odds_13)$lambda1)
names(coef(mod_odds_13)$lambda2)
names(coef(mod_odds_13)$lambda3)

# selected base learners for season 2014/15
names(coef(mod_odds_14)$lambda1)
names(coef(mod_odds_14)$lambda2)
names(coef(mod_odds_14)$lambda3)

# selected base learners for season 2015/16
names(coef(mod_odds_15)$lambda1)
names(coef(mod_odds_15)$lambda2)
names(coef(mod_odds_15)$lambda3)