# R Code for the paper 'Integrating multiple data sources in match-fixing warning systems -------- 
# by Marius tting, Roland Langrock and Christian Deutscher

# This file contains the code for the betting volume model, i.e. all estimations, Figures 1 and 2 and Tables 1 and 2

library(gamlss) # version 5.0-1
library(gamboostLSS) # version 2.0-0

# read data
data.bv.boost <- read.csv("data_volumes.csv")

# create a column of ones for the intercept
data.bv.boost$intcpt <- rep(1, nrow(data.bv.boost))
# turn the MatchID into factor
data.bv.boost$MatchID <- as.factor(data.bv.boost$MatchID)

# model based on season 2009/10 -------------------------------------------

# subset the data for the current season(s)
data.bv.boost.2009 <- data.bv.boost[data.bv.boost$season == 2009, ]
data.bv.boost.2009$MatchID <- droplevels(data.bv.boost.2009$MatchID)

# estimate the model
mod_bv_boost_09 <- gamboostLSS(volume_thsd ~ bols(intcpt, intercept = FALSE) + bols(selection, 
                                                                                    intercept = FALSE) + bols(weekday, intercept = FALSE) + bbs(matchday) + bbs(mvhome) + 
                                 bbs(mvaway) + bbs(certainty) + bbs(certainty, by = selection, center = TRUE) + 
                                 brandom(MatchID), families = as.families("LOGNO"), data = data.bv.boost.2009, 
                               control = boost_control(trace = TRUE, mstop = 300, nu = 0.1))


# early stopping
set.seed(2305)
folds <- cv(model.weights(mod_bv_boost_09), type = "subsampling")
# create grid of possible stopping values
grid <- make.grid(max = c(mu = 3000, sigma = 1000), min = c(mu = 1000, sigma = 100), 
                  length.out = 5, dense_mu_grid = TRUE)
# apply early stopping. Please note: Executing the next line may take some hours
cvr <- cvrisk(mod_bv_boost_09, grid = grid, folds = folds)
# mstop(cvr) gives the optimal stopping values, which are also stated in the next line
# optimal values: mu = 2417; sigma = 316
mstop(mod_bv_boost_09) <- c(2417, 316)


# compute fitted values for season 2009/10 for mu and sigma
fitted_mu <- fitted(mod_bv_boost_09, parameter = "mu")
fitted_sigma <- exp(fitted(mod_bv_boost_09, parameter = "sigma"))

# compute quantile residuals for season 2009/10
quant.residuals <- qnorm(pLOGNO(data.bv.boost.2009$volume_thsd, mu = fitted_mu, sigma = fitted_sigma))
data.bv.boost.2009$QuantRes <- quant.residuals


# subset data for season 2010/11
data.bv.boost.10 <- data.bv.boost[data.bv.boost$season == 2010, ]

# since there are more matches for season 2009 + 2010 (resulting in new MatchIDs),
# the MatchIDs from season 2009 are assigned to the first matches of season 2010 and all matches
# after that are assigned a "1" as MatchID. This is only done for technical reasons
# to calculate the predictions; afterwards the random intercept is substracted for 
# each match, i.e. effectively the random intercept is set to zero for each new prediction.
number.ids <- length(levels(data.bv.boost.2009$MatchID))
data.bv.boost.10$MatchID <- c(1:number.ids, rep(1, nrow(data.bv.boost.10) - number.ids))
data.bv.boost.10$MatchID <- as.factor(data.bv.boost.10$MatchID)

# a few matches were played on a day of the week which was not included as a dummy in
# the previous season just because there was no match at these days of the week.
# Hence we set the day of the week of these matches to another day, make predictions
# and exclude the observations afterwards
not.inc.weekdays <- c("Mittwoch", "Donnerstag")
idx.not.inc.weekdays <- which(data.bv.boost.10$weekday == "Mittwoch" | data.bv.boost.10$weekday == 
                                "Donnerstag")
data.bv.boost.10$weekday <- as.character(data.bv.boost.10$weekday)
data.bv.boost.10[idx.not.inc.weekdays, ]$weekday <- "Samstag"
data.bv.boost.10$double <- 0
data.bv.boost.10$double[idx.not.inc.weekdays] <- 1
data.bv.boost.10$weekday <- as.factor(data.bv.boost.10$weekday)

# predict mu and sigma for the next season
fitted <- predict(mod_bv_boost_09, newdata = data.bv.boost.10)
# set the random intercept = 0 for the prediction for mu 
# by subtracting the respective estimate for the random intercept
fitted_mu <- as.numeric(fitted$mu) - c(as.numeric(coef(mod_bv_boost_09)$mu$`brandom(MatchID)`), 
                                       rep(as.numeric(coef(mod_bv_boost_09)$mu$`brandom(MatchID)`)[1], nrow(data.bv.boost.10) - 
                                             number.ids))

# set the random intercept = 0 for the prediction for sigma
# by subtracting the respective estimate for the random intercept
fitted_sigma <- as.numeric(fitted$sigma) - c(as.numeric(coef(mod_bv_boost_09)$sigma$`brandom(MatchID)`), 
                                             rep(as.numeric(coef(mod_bv_boost_09)$sigma$`brandom(MatchID)`)[1], nrow(data.bv.boost.10) - 
                                                   number.ids))
fitted_sigma <- exp(fitted_sigma)

# estimate the quantile residuals
quant.residuals <- qnorm(pLOGNO(data.bv.boost.10$volume_thsd, mu = fitted_mu, sigma = fitted_sigma))

# the observations with "new" weekdays as described above are set to NA 
idx.na <- which(data.bv.boost.10$double == 1)
quant.residuals[idx.na] <- NA




# model based on all seasons prior to season 2011/12 ----------------------

# subset the data for the current season(s)
data.bv.boost.2010 <- data.bv.boost[data.bv.boost$season <= 2010, ]
data.bv.boost.2010$MatchID <- droplevels(data.bv.boost.2010$MatchID)

# estimate the model
mod_bv_boost_10 <- gamboostLSS(volume_thsd ~ bols(intcpt, intercept = FALSE) + bols(selection, 
                                                                                    intercept = FALSE) + bols(weekday, intercept = FALSE) + bbs(matchday) + bbs(mvhome) + 
                                 bbs(mvaway) + bbs(certainty) + bbs(certainty, by = selection, center = TRUE) + 
                                 brandom(MatchID), families = as.families("LOGNO"), data = data.bv.boost.2010, 
                               control = boost_control(trace = TRUE, mstop = 300, nu = 0.1))

# early stopping
set.seed(2305)
folds <- cv(model.weights(mod_bv_boost_10), type = "subsampling")
# create grid of possible stopping values
grid <- make.grid(max = c(mu = 10000, sigma = 600), min = c(mu = 1000, sigma = 100), 
                  length.out = 5, dense_mu_grid = FALSE)
# apply early stopping. Please note: Executing the next line may take some hours
cvr <- cvrisk(mod_bv_boost_10, grid = grid, folds = folds)
# mstop(cvr) gives the optimal stopping values, which are also stated in the next line
# optimal values: mu = 3162; sigma = 383
mstop(mod_bv_boost_10) <- c(3162, 383)


# subset data for season 2011/12
data.bv.boost.11 <- data.bv.boost[data.bv.boost$season == 2011, ]

# reorder the MatchIDs to make predictions - see line 80 for further explanations
number.ids <- length(levels(data.bv.boost.2010$MatchID))
data.bv.boost.11$MatchID <- c(1:number.ids, rep(1, nrow(data.bv.boost.11) - number.ids))
data.bv.boost.11$MatchID <- as.factor(data.bv.boost.11$MatchID)

# predict mu and sigma for the next season
fitted <- predict(mod_bv_boost_10, newdata = data.bv.boost.11)
# set the random intercept = 0 for the prediction for mu 
# by subtracting the respective estimate for the random intercept
fitted_mu <- as.numeric(fitted$mu) - c(as.numeric(coef(mod_bv_boost_10)$mu$`brandom(MatchID)`), 
                                       rep(as.numeric(coef(mod_bv_boost_10)$mu$`brandom(MatchID)`)[1], nrow(data.bv.boost.11) - 
                                             number.ids))

# set the random intercept = 0 for the prediction for sigma
# by subtracting the respective estimate for the random intercept
fitted_sigma <- as.numeric(fitted$sigma) - c(as.numeric(coef(mod_bv_boost_10)$sigma$`brandom(MatchID)`), 
                                             rep(as.numeric(coef(mod_bv_boost_10)$sigma$`brandom(MatchID)`)[1], nrow(data.bv.boost.11) - 
                                                   number.ids))
fitted_sigma <- exp(fitted_sigma)

# estimate the quantile residuals
quant.residuals <- qnorm(pLOGNO(data.bv.boost.11$volume_thsd, mu = fitted_mu, sigma = fitted_sigma))



# model based on all seasons prior to season 2012/13 ----------------------

# subset the data for the current season(s)
data.bv.boost.2011 <- data.bv.boost[data.bv.boost$season <= 2011, ]
data.bv.boost.2011$MatchID <- droplevels(data.bv.boost.2011$MatchID)


# estimate the model
mod_bv_boost_11 <- gamboostLSS(volume_thsd ~ bols(intcpt, intercept = FALSE) + bols(selection, 
                                                                                    intercept = FALSE) + bols(weekday, intercept = FALSE) + bbs(matchday) + bbs(mvhome) + 
                                 bbs(mvaway) + bbs(certainty) + bbs(certainty, by = selection, center = TRUE) + 
                                 brandom(MatchID), families = as.families("LOGNO"), data = data.bv.boost.2011, 
                               control = boost_control(trace = TRUE, mstop = 300, nu = 0.1))

# early stopping
set.seed(2305)
folds <- cv(model.weights(mod_bv_boost_11), type = "subsampling")
# create grid of possible stopping values
grid <- make.grid(max = c(mu = 15000, sigma = 2000), min = c(mu = 9000, sigma = 500), 
                  length.out = 5, dense_mu_grid = TRUE)
# apply early stopping. Please note: Executing the next line may take some hours
cvr <- cvrisk(mod_bv_boost_11, grid = grid, folds = folds)
# mstop(cvr) gives the optimal stopping values, which are also stated in the next line
# optimal values: mu = 5552; sigma = 707
mstop(mod_bv_boost_11) <- c(5552, 707)


# subset data for season 2012/13
data.bv.boost.12 <- data.bv.boost[data.bv.boost$season == 2012, ]

# reorder the MatchIDs to make predictions - see line 80 for further explanations
number.ids <- length(levels(data.bv.boost.2011$MatchID))
data.bv.boost.12$MatchID <- c(1:number.ids, rep(1, nrow(data.bv.boost.12) - number.ids))
data.bv.boost.12$MatchID <- as.factor(data.bv.boost.12$MatchID)

# predict mu and sigma for the next season
fitted <- predict(mod_bv_boost_11, newdata = data.bv.boost.12)
# set the random intercept = 0 for the prediction for mu 
# by subtracting the respective estimate for the random intercept
fitted_mu <- as.numeric(fitted$mu) - c(as.numeric(coef(mod_bv_boost_11)$mu$`brandom(MatchID)`), 
                                       rep(as.numeric(coef(mod_bv_boost_11)$mu$`brandom(MatchID)`)[1], nrow(data.bv.boost.12) - 
                                             number.ids))

# set the random intercept = 0 for the prediction for sigma
# by subtracting the respective estimate for the random intercept
fitted_sigma <- as.numeric(fitted$sigma) - c(as.numeric(coef(mod_bv_boost_11)$sigma$`brandom(MatchID)`), 
                                             rep(as.numeric(coef(mod_bv_boost_11)$sigma$`brandom(MatchID)`)[1], nrow(data.bv.boost.12) - 
                                                   number.ids))
fitted_sigma <- exp(fitted_sigma)

# estimate the quantile residuals
quant.residuals <- qnorm(pLOGNO(data.bv.boost.12$volume_thsd, mu = fitted_mu, sigma = fitted_sigma))



# model based on all seasons prior to season 2013/14 ----------------------

# subset the data for the current season(s)
data.bv.boost.2012 <- data.bv.boost[data.bv.boost$season <= 2012, ]
data.bv.boost.2012$MatchID <- droplevels(data.bv.boost.2012$MatchID)


# estimate the model
mod_bv_boost_12 <- gamboostLSS(volume_thsd ~ bols(intcpt, intercept = FALSE) + bols(selection, 
                                                                                    intercept = FALSE) + bols(weekday, intercept = FALSE) + bbs(matchday) + bbs(mvhome) + 
                                 bbs(mvaway) + bbs(certainty) + bbs(certainty, by = selection, center = TRUE) + 
                                 brandom(MatchID), families = as.families("LOGNO"), data = data.bv.boost.2012, 
                               control = boost_control(trace = TRUE, mstop = 300, nu = 0.1))

# early stopping
set.seed(2305)
folds <- cv(model.weights(mod_bv_boost_12), type = "subsampling")
# create grid of possible stopping values
grid <- make.grid(max = c(mu = 15000, sigma = 2000), min = c(mu = 9000, sigma = 500), 
                  length.out = 5, dense_mu_grid = TRUE)
# apply early stopping. Please note: Executing the next line may take some hours
cvr <- cvrisk(mod_bv_boost_12, grid = grid, folds = folds)
# mstop(cvr) gives the optimal stopping values, which are also stated in the next line
# optimal values: mu = 7482; sigma = 1000
mstop(mod_bv_boost_12) <- c(7482, 1000)


# subset data for season 2013/14
data.bv.boost.13 <- data.bv.boost[data.bv.boost$season == 2013, ]

# reorder the MatchIDs to make predictions - see line 80 for further explanations
number.ids <- length(levels(data.bv.boost.2012$MatchID))
data.bv.boost.13$MatchID <- c(1:number.ids, rep(1, nrow(data.bv.boost.13) - number.ids))
data.bv.boost.13$MatchID <- as.factor(data.bv.boost.13$MatchID)

# predict mu and sigma for the next season
fitted <- predict(mod_bv_boost_12, newdata = data.bv.boost.13)
# set the random intercept = 0 for the prediction for mu 
# by subtracting the respective estimate for the random intercept
fitted_mu <- as.numeric(fitted$mu) - c(as.numeric(coef(mod_bv_boost_12)$mu$`brandom(MatchID)`), 
                                       rep(as.numeric(coef(mod_bv_boost_12)$mu$`brandom(MatchID)`)[1], nrow(data.bv.boost.13) - 
                                             number.ids))

# set the random intercept = 0 for the prediction for sigma
# by subtracting the respective estimate for the random intercept
fitted_sigma <- as.numeric(fitted$sigma) - c(as.numeric(coef(mod_bv_boost_12)$sigma$`brandom(MatchID)`), 
                                             rep(as.numeric(coef(mod_bv_boost_12)$sigma$`brandom(MatchID)`)[1], nrow(data.bv.boost.13) - 
                                                   number.ids))
fitted_sigma <- exp(fitted_sigma)

# estimate the quantile residuals
quant.residuals <- qnorm(pLOGNO(data.bv.boost.13$volume_thsd, mu = fitted_mu, sigma = fitted_sigma))


# model based on all seasons prior to season 2014/15 ----------------------

# subset the data for the current season(s)
data.bv.boost.2013 <- data.bv.boost[data.bv.boost$season <= 2013, ]
data.bv.boost.2013$MatchID <- droplevels(data.bv.boost.2013$MatchID)


# estimate the model
mod_bv_boost_13 <- gamboostLSS(volume_thsd ~ bols(intcpt, intercept = FALSE) + bols(selection, 
                                                                                    intercept = FALSE) + bols(weekday, intercept = FALSE) + bbs(matchday) + bbs(mvhome) + 
                                 bbs(mvaway) + bbs(certainty) + bbs(certainty, by = selection, center = TRUE) + 
                                 brandom(MatchID), families = as.families("LOGNO"), data = data.bv.boost.2013, 
                               control = boost_control(trace = TRUE, mstop = 300, nu = 0.1))

# early stopping
set.seed(2305)
folds <- cv(model.weights(mod_bv_boost_13), type = "subsampling")
# create grid of possible stopping values
grid <- make.grid(max = c(mu = 15000, sigma = 2000), min = c(mu = 9000, sigma = 500), 
                  length.out = 5, dense_mu_grid = TRUE)
# apply early stopping. Please note: Executing the next line may take some hours
cvr <- cvrisk(mod_bv_boost_13, grid = grid, folds = folds)
# mstop(cvr) gives the optimal stopping values, which are also stated in the next line
# optimal values: mu = 11869; sigma = 1000
mstop(mod_bv_boost_13) <- c(11869, 1000)


# subset data for season 2014/15
data.bv.boost.14 <- data.bv.boost[data.bv.boost$season == 2014, ]

# reorder the MatchIDs to make predictions - see line 80 for further explanations
number.ids <- length(levels(data.bv.boost.2013$MatchID))
missing.rows <- number.ids - nrow(data.bv.boost.14)
data.bv.boost.14[(nrow(data.bv.boost.14) + 1):(nrow(data.bv.boost.14) + missing.rows), 
                 ] <- data.bv.boost.14[1, ]
data.bv.boost.14$MatchID <- c(1:number.ids)
data.bv.boost.14$MatchID <- as.factor(data.bv.boost.14$MatchID)

# predict mu and sigma for the next season
fitted <- predict(mod_bv_boost_13, newdata = data.bv.boost.14)
# set the random intercept = 0 for the prediction for mu 
# by subtracting the respective estimate for the random intercept
fitted_mu <- as.numeric(fitted$mu) - as.numeric(coef(mod_bv_boost_13)$mu$`brandom(MatchID)`)
# set the random intercept = 0 for the prediction for sigma
# by subtracting the respective estimate for the random intercept
fitted_sigma <- as.numeric(fitted$sigma) - as.numeric(coef(mod_bv_boost_13)$sigma$`brandom(MatchID)`)
fitted_sigma <- exp(fitted_sigma)

# eliminate duplicated observations which were created to make predictions
fitted_mu <- fitted_mu[1:(nrow(data.bv.boost.14) - missing.rows)]
fitted_sigma <- fitted_sigma[1:(nrow(data.bv.boost.14) - missing.rows)]
data.bv.boost.14 <- data.bv.boost.14[1:(nrow(data.bv.boost.14) - missing.rows), ]

# estimate the quantile residuals
quant.residuals <- qnorm(pLOGNO(data.bv.boost.14$volume_thsd, mu = fitted_mu, sigma = fitted_sigma))




# model based on all seasons prior to season 2015/16 ----------------------

# subset the data for the current season(s)
data.bv.boost.2014 <- data.bv.boost[data.bv.boost$season <= 2014, ]
data.bv.boost.2014$MatchID <- droplevels(data.bv.boost.2014$MatchID)


# estimate the model
mod_bv_boost_14 <- gamboostLSS(volume_thsd ~ bols(intcpt, intercept = FALSE) + bols(selection, 
                                                                                    intercept = FALSE) + bols(weekday, intercept = FALSE) + bbs(matchday) + bbs(mvhome) + 
                                 bbs(mvaway) + bbs(certainty) + bbs(certainty, by = selection, center = TRUE) + 
                                 brandom(MatchID), families = as.families("LOGNO"), data = data.bv.boost.2014, 
                               control = boost_control(trace = TRUE, mstop = 300, nu = 0.1))


# early stopping
set.seed(2305)
folds <- cv(model.weights(mod_bv_boost_14), type = "subsampling")
# create grid of possible stopping values
grid <- make.grid(max = c(mu = 15000, sigma = 2000), min = c(mu = 9000, sigma = 500), 
                  length.out = 5, dense_mu_grid = TRUE)
# apply early stopping. Please note: Executing the next line may take some hours
cvr <- cvrisk(mod_bv_boost_14, grid = grid, folds = folds)
# mstop(cvr) gives the optimal stopping values, which are also stated in the next line
# optimal values: mu = 14821; sigma = 1414
mstop(mod_bv_boost_14) <- c(14821, 1414)


## These are the results presented in Table 2
# Pleas note: As we provide artificial data for the betting volumes etc., 
# the following results differ slightly from those presented in the paper

# estimated effects for the dummy variables on the mean 
coef(mod_bv_boost_14)$mu$`bols(selection, intercept = FALSE)`
coef(mod_bv_boost_14)$mu$`bols(weekday, intercept = FALSE)`
# estimated effects for the dummy variables and the standard deviation
coef(mod_bv_boost_14)$sigma$`bols(selection, intercept = FALSE)`
coef(mod_bv_boost_14)$sigma$`bols(weekday, intercept = FALSE)`


# subset data for season 2015/16
data.bv.boost.15 <- data.bv.boost[data.bv.boost$season == 2015, ]

# reorder the MatchIDs to make predictions - see line 80 for further explanations
number.ids <- length(levels(data.bv.boost.2014$MatchID))
missing.rows <- number.ids - nrow(data.bv.boost.15)
data.bv.boost.15[(nrow(data.bv.boost.15) + 1):(nrow(data.bv.boost.15) + missing.rows), 
                 ] <- data.bv.boost.15[1, ]
data.bv.boost.15$MatchID <- c(1:number.ids)  #, rep(1, nrow(data.bv.boost.15) - number.ids))
data.bv.boost.15$MatchID <- as.factor(data.bv.boost.15$MatchID)

# predict mu and sigma for the next season
fitted <- predict(mod_bv_boost_14, newdata = data.bv.boost.15)

# set the random intercept = 0 for the prediction for mu 
# by subtracting the respective estimate for the random intercept
fitted_mu <- as.numeric(fitted$mu) - as.numeric(coef(mod_bv_boost_14)$mu$`brandom(MatchID)`)

# set the random intercept = 0 for the prediction for sigma
# by subtracting the respective estimate for the random intercept
fitted_sigma <- as.numeric(fitted$sigma) - as.numeric(coef(mod_bv_boost_14)$sigma$`brandom(MatchID)`)
fitted_sigma <- exp(fitted_sigma)

# eliminate duplicated observations which were created to make predictions
fitted_mu <- fitted_mu[1:(nrow(data.bv.boost.15) - missing.rows)]
fitted_sigma <- fitted_sigma[1:(nrow(data.bv.boost.15) - missing.rows)]
data.bv.boost.15 <- data.bv.boost.15[1:(nrow(data.bv.boost.15) - missing.rows), ]


# estimate the quantile residuals
quant.residuals <- qnorm(pLOGNO(data.bv.boost.15$volume_thsd, mu = fitted_mu, sigma = fitted_sigma))
## plot quantile residuals -- Figure 2
qqplot(quant.residuals)


# -> Analysis of the quantile residuals continues in file
# 'OutlierDetection_RCode.R'. The outlier analysis in there can be executed
# without running all models in this file.




# Plots -------------------------------------------------------------------
# Plots for the model fitted to data from season 2009/10 to 2014/15
# The code below generates Figure 1

# selected base learners for mu and sigma, respectively:
names(coef(mod_bv_boost_14)$mu)
names(coef(mod_bv_boost_14)$sigma)

# for all selected base learners, create a sequence from min(covariate) to
# max(covariate) to draw the estimated effect on an appropriate grid
data.plot <- data.bv.boost.2014
data.plot$matchday <- seq(min(data.bv.boost.2014$matchday), max(data.bv.boost.2014$matchday), 
                          length = nrow(data.bv.boost.2014))
data.plot$mvhome <- seq(min(data.bv.boost.2014$mvhome), max(data.bv.boost.2014$mvhome), 
                        length = nrow(data.bv.boost.2014))
data.plot$mvaway <- seq(min(data.bv.boost.2014$mvaway), max(data.bv.boost.2014$mvaway), 
                        length = nrow(data.bv.boost.2014))
data.plot$certainty <- seq(min(data.bv.boost.2014$certainty), max(data.bv.boost.2014$certainty), 
                           length = nrow(data.bv.boost.2014))



# Plots: mu ---------------------------------------------------------------
layout(matrix(c(rep(1, 4), rep(2, 4), rep(3, 4), rep(4, 3), rep(5, 3), rep(6, 3), 
                rep(7, 3)), 2, 12, byrow = TRUE))
par(mai = c(1.05, 0.25, 0.43, 0.1) + 0.1, mar = c(5, 2, 2.7, 1) + 0.1)

## matchday 
# calculate values for the fitted function
pred.values <- as.numeric(predict(mod_bv_boost_14, parameter = "mu", which = "matchday", 
                                  newdata = data.plot))

# Plot the effect of matchday
plot(data.plot$matchday, pred.values, type = "l", lwd = 2, ylim = c(0, 1.6), xlab = "matchday", 
     main = c(bquote(widehat(f)[11])))
# add a short vertical line for each original data point
points(data.bv.boost.2014$matchday, rep(-0.05, nrow(data.plot)), pch = "|", col = "black", 
       cex = 0.7)


## mvhome 
# calculate values for the fitted function
pred.values <- as.numeric(predict(mod_bv_boost_14, parameter = "mu", which = "mvhome", 
                                  newdata = data.plot))

# Plot the effect of mvhome
plot(data.plot$mvhome, pred.values, type = "l", lwd = 2, ylim = c(0, 1.6), xlab = "mvhome", 
     main = c(bquote(widehat(f)[21])))
# add a short vertical line for each original data point
points(data.bv.boost.2014$mvhome, rep(-0.05, nrow(data.plot)), pch = "|", col = "black", 
       cex = 0.7)


## mvaway 
# calculate values for the fitted function
pred.values <- as.numeric(predict(mod_bv_boost_14, parameter = "mu", which = "mvaway", 
                                  newdata = data.plot))

# Plot the effect of mvaway
plot(data.plot$mvaway, pred.values, type = "l", lwd = 2, ylim = c(0, 1.6), xlab = "mvaway", 
     main = c(bquote(widehat(f)[31])))
# add a short vertical line for each original data point
points(data.bv.boost.2014$mvaway, rep(-0.05, nrow(data.plot)), pch = "|", col = "black", 
       cex = 0.7)


## certainty + corresponding interactions
# calculate values for the fitted function
pred.values <- predict(mod_bv_boost_14, parameter = "mu", which = "certainty", newdata = data.plot)
pred.values.1 <- pred.values[, 1] + pred.values[, 2]

# index for the several betting type categories
idx.selectionMO <- which(data.bv.boost.2014$selection == "MatchOdds")
idx.selectionOU15 <- which(data.bv.boost.2014$selection == "OU1.5")
idx.selectionOU25 <- which(data.bv.boost.2014$selection == "OU2.5")
idx.selectionOU35 <- which(data.bv.boost.2014$selection == "OU3.5")

# Plot the effect of uncertainty for the match odds category
plot(data.plot$certainty[idx.selectionMO], pred.values.1[idx.selectionMO], type = "l", 
     lwd = 2, ylim = c(0, 1.7), xlab = "certainty", main = c(bquote(widehat(f)[41])))
# add a short vertical line for each original data point
points(data.bv.boost.2014$certainty[idx.selectionMO], rep(-0.05, length(data.bv.boost.2014$certainty[idx.selectionMO])), 
       pch = "|", col = "black", cex = 0.7)


# Plot the effect of uncertainty for the over/under 1.5 category
plot(data.plot$certainty[idx.selectionOU15], pred.values.1[idx.selectionOU15], type = "l", 
     lwd = 2, ylim = c(0, 1.7), xlab = "certainty", main = c(bquote(widehat(f)[41] + 
                                                                      widehat(f)[51])))
points(data.bv.boost.2014$certainty[idx.selectionOU15], rep(-0.05, length(data.bv.boost.2014$certainty[idx.selectionOU15])), 
       pch = "|", col = "black", cex = 0.7)


# Plot the effect of uncertainty for the over/under 2.5 category
plot(data.plot$certainty[idx.selectionOU25], pred.values.1[idx.selectionOU25], type = "l", 
     lwd = 2, ylim = c(0, 1.7), xlab = "certainty", main = c(bquote(widehat(f)[41] + 
                                                                      widehat(f)[61])))
points(data.bv.boost.2014$certainty[idx.selectionOU25], rep(-0.05, length(data.bv.boost.2014$certainty[idx.selectionOU25])), 
       pch = "|", col = "black", cex = 0.7)


# Plot the effect of uncertainty for the over/under 3.5 category
plot(data.plot$certainty[idx.selectionOU35], pred.values.1[idx.selectionOU35], type = "l", 
     lwd = 2, ylim = c(0, 1.7), xlab = "certainty", main = c(bquote(widehat(f)[41] + 
                                                                      widehat(f)[71])))
points(data.bv.boost.2014$certainty[idx.selectionOU35], rep(-0.05, length(data.bv.boost.2014$certainty[idx.selectionOU35])), 
       pch = "|", col = "black", cex = 0.7)




# Plots: sigma ------------------------------------------------------------
layout(matrix(c(rep(1, 4), rep(2, 4), rep(3, 4), rep(4, 3), rep(5, 3), rep(6, 3), 
                rep(7, 3)), 2, 12, byrow = TRUE))
par(mai = c(1.05, 0.25, 0.43, 0.1) + 0.1, mar = c(5, 2, 2.7, 1) + 0.1)


## matchday 
# calculate values for the fitted function
pred.values <- as.numeric(predict(mod_bv_boost_14, parameter = "sigma", which = "matchday", 
                                  newdata = data.plot))

# Plot the effect of matchday
plot(data.plot$matchday, pred.values, type = "l", lwd = 2, ylim = c(-0.7, 0.2), xlab = "matchday", 
     main = c(bquote(widehat(f)[12])))
# add a short vertical line for each original data point
points(data.bv.boost.2014$matchday, rep(-0.73, nrow(data.plot)), pch = "|", col = "black", 
       cex = 0.7)


## mvhome 
# calculate values for the fitted function
pred.values <- as.numeric(predict(mod_bv_boost_14, parameter = "sigma", which = "mvhome", 
                                  newdata = data.plot))

# Plot the effect of mvhome
plot(data.plot$mvhome, pred.values, type = "l", lwd = 2, ylim = c(-0.7, 0.2), xlab = "mvhome", 
     main = c(bquote(widehat(f)[22])))
# add a short vertical line for each original data point
points(data.bv.boost.2014$mvhome, rep(-0.73, nrow(data.plot)), pch = "|", col = "black", 
       cex = 0.7)


## mvaway 
# calculate values for the fitted function
pred.values <- as.numeric(predict(mod_bv_boost_14, parameter = "sigma", which = "mvaway", 
                                  newdata = data.plot))

# Plot the effect of mvaway
plot(data.plot$mvaway, pred.values, type = "l", lwd = 2, ylim = c(-0.7, 0.2), xlab = "mvaway", 
     main = c(bquote(widehat(f)[32])))
# add a short vertical line for each original data point
points(data.bv.boost.2014$mvaway, rep(-0.73, nrow(data.plot)), pch = "|", col = "black", 
       cex = 0.7)



## certainty + corresponding interactions 
# calculate values for the fitted function
pred.values <- predict(mod_bv_boost_14, parameter = "sigma", which = "certainty", 
                       newdata = data.plot)
pred.values.1 <- pred.values[, 1] + pred.values[, 2]


# Plot the effect of uncertainty for the match odds category
plot(data.plot$certainty[idx.selectionMO], pred.values.1[idx.selectionMO], type = "l", 
     lwd = 2, ylim = c(-0.7, 0.2), xlab = "certainty", main = c(bquote(widehat(f)[42])))
# add a short vertical line for each original data point
points(data.bv.boost.2014$certainty[idx.selectionMO], rep(-0.73, length(data.bv.boost.2014$certainty[idx.selectionMO])), 
       pch = "|", col = "black", cex = 0.7)


# Plot the effect of uncertainty for the over/under 1.5 category
plot(data.plot$certainty[idx.selectionOU15], pred.values.1[idx.selectionOU15], type = "l", 
     lwd = 2, ylim = c(-0.7, 0.2), xlab = "certainty", main = c(bquote(widehat(f)[42] + 
                                                                         widehat(f)[52])))
points(data.bv.boost.2014$certainty[idx.selectionOU15], rep(-0.73, length(data.bv.boost.2014$certainty[idx.selectionOU15])), 
       pch = "|", col = "black", cex = 0.7)


# Plot the effect of uncertainty for the over/under 2.5 category
plot(data.plot$certainty[idx.selectionOU25], pred.values.1[idx.selectionOU25], type = "l", 
     lwd = 2, ylim = c(-0.7, 0.2), xlab = "certainty", main = c(bquote(widehat(f)[42] + 
                                                                         widehat(f)[62])))
points(data.bv.boost.2014$certainty[idx.selectionOU25], rep(-0.73, length(data.bv.boost.2014$certainty[idx.selectionOU25])), 
       pch = "|", col = "black", cex = 0.7)


# Plot the effect of uncertainty for the over/under 3.5 category
plot(data.plot$certainty[idx.selectionOU35], pred.values.1[idx.selectionOU35], type = "l", 
     lwd = 2, ylim = c(-0.7, 0.2), xlab = "certainty", main = c(bquote(widehat(f)[42] + 
                                                                         widehat(f)[72])))
points(data.bv.boost.2014$certainty[idx.selectionOU35], rep(-0.73, length(data.bv.boost.2014$certainty[idx.selectionOU35])), 
       pch = "|", col = "black", cex = 0.7)



# Descriptive statistics --------------------------------------------------
## Data for Table 1
# Please note: As we provide artificial data, 
# the following results differ slightly from those presented in the paper
summary(data.bv.boost)

