# R Code for the paper 'Integrating multiple data sources in match-fixing warning systems -------- 
# by Marius tting, Roland Langrock and Christian Deutscher

# This file contains the code for the outlier detection and can be executed
# without running the other files. The information for Tables 5, 6 and 7 and for 
# Figure 4 are also contained here.



# classification of fixed matches by betting volumes ---------------------- 
# read data
data_outlier_volume <- read.csv("data_outlier_volume.csv")

# define range of possible cut-off values; column "QuantRes" incorporates the 
# quantile residuals of the betting volume model estimated using the original
# data
dev <- data_outlier_volume$QuantRes
quantiles <- seq(min(dev, na.rm = TRUE), max(dev, na.rm = TRUE), by = 0.01)

data_outlier_volume$FixedMatch <- as.numeric(as.character(data_outlier_volume$FixedMatch))
# create empty vectors for the results (true positive rate, false positive rate,
# positive predicted values, negative predicted values)
tpr <- fpr <- ppv <- npv <- c()
for (i in 1:length(quantiles)) {
  # flag all matches as suspicious which are greather than the certain quantile
  flag.match.idx <- which(dev > quantiles[i])
  flagged.matches <- data_outlier_volume[flag.match.idx, ]
  # if a match is flagged for multiple betting types, only flag them once as
  # suspicious
  fixed.sum <- aggregate(flagged.matches$FixedMatch, by = list(Category = flagged.matches$MatchID), 
                         FUN = sum)
  fixed.sum$fixed.final <- ifelse(fixed.sum$x >= 1, 1, 0)
  
  # calculate FPR and TPR for the current cut-off value
  found.matches <- sum(fixed.sum$fixed.final)
  pred.notcorrect <- length(unique(fixed.sum$Category)) - found.matches
  tpr <- c(tpr, found.matches/24)
  fpr <- c(fpr, pred.notcorrect/3195)
  
  # PPV and negative PV
  fp <- length(unique((flagged.matches$MatchID))) - found.matches
  ppv <- c(ppv, found.matches/(found.matches + fp))
  fn <- 24 - found.matches
  tn <- 3195 - fp
  npv <- c(npv, tn/(fn + tn))
}


# Classification by betting volumes: choose optimal cutoff via Youden's index
# Please note: since the quantile residuals in this dataset are the ones estimated using the original data, 
# these results are exactly the same as in the paper

# Youden index
max(tpr - fpr)
# corresponding cutoff for which the Youden index reaches the maximum
quantiles[which.max(tpr - fpr)]
# TPR for the optimal cutoff
tpr[which.max(tpr - fpr)]
# FPR for the optimal cutoff
fpr[which.max(tpr - fpr)]

# optimal cutoff via positive predicted value
quantiles[which.max(ppv)]
# corresponding PPV and NPV for the optimal cut off
ppv[which.max(ppv)]
npv[which.max(ppv)]

# For the optimal cut-off via Younden index, check how many matches were
# correctly identified
flag.match.idx <- which(dev > quantiles[which.max(tpr - fpr)])
flagged.matches <- data_outlier_volume[flag.match.idx, ]
# make sure that each match is contained at most once when counting the flagged matches
# (it could be possible that one match if flagged multiple times, i.e. for multiple betting types)
flagged.matches.short <- unique(flagged.matches[, c("season", "MatchID", 
                                                    "FixedMatch")])
# -> Information contained in Table 6
# predicted fixed
(pred.fix <- nrow(flagged.matches.short))
# predicted normal
(pred.normal <- 3219 - nrow(flagged.matches.short))
# take only the ones which are proven as fixed
flagged.matches.short <- flagged.matches.short[flagged.matches.short$FixedMatch == 
                                                 1, ]
# predicted fixed and acutal fixed
nrow(flagged.matches.short)
# predicted normal and actual fixed
24 - nrow(flagged.matches.short)
# predicted fixed and actual normal
pred.fix - nrow(flagged.matches.short)
# predicted normal and actual normal
pred.normal - (24 - nrow(flagged.matches.short))


# Classification by betting volumes: choose optimal cutoff via PPV
# For the optimal cut-off via PPV, check how many matches were correctly
# identified
flag.match.idx <- which(dev > quantiles[which.max(ppv)])
flagged.matches <- data_outlier_volume[flag.match.idx, ]
flagged.matches.short <- unique(flagged.matches[, c("season", "MatchID", 
                                                    "FixedMatch")])
# -> Information contained in Table 7
# predicted fixed
(pred.fix <- nrow(flagged.matches.short))
# predicted normal
(pred.normal <- 3219 - nrow(flagged.matches.short))
# take only the ones which are proven as fixed
flagged.matches.short <- flagged.matches.short[flagged.matches.short$FixedMatch == 
                                                 1, ]
# predicted fixed and acutal fixed
nrow(flagged.matches.short)
# predicted normal and actual fixed
24 - nrow(flagged.matches.short)
# predicted fixed and actual normal
pred.fix - nrow(flagged.matches.short)
# predicted normal and actual normal
pred.normal - (24 - nrow(flagged.matches.short))



# classification of fixed matches by betting odds ------------------------- 
# read data
data_outlier_odds <- read.csv("data_outlier_odds.csv")

# define range of possible cut-off values
cutoffs <- seq(0, 1, by = 0.01)
# create empty vectors for the results (true positive rate, false positive rate,
# positive predicted values, negative predicted values)
tpr.odds <- fpr.odds <- ppv <- npv <- c()
outlier.all <- list()
for (i in 1:length(cutoffs)) {
  akt.quantile <- cutoffs[i]
  # outlier home
  quantile.cutoff <- quantile(data_outlier_odds$odds.1.mod/data_outlier_odds$BetfairH, 
                              probs = akt.quantile, na.rm = TRUE)
  susp.idx <- which(data_outlier_odds$odds.1.mod/data_outlier_odds$BetfairH > quantile.cutoff)
  outlier.home <- data_outlier_odds[susp.idx, ]
  
  # outlier draw
  quantile.cutoff <- quantile(data_outlier_odds$odds.x.mod/data_outlier_odds$BetfairD, 
                              probs = akt.quantile, na.rm = TRUE)
  susp.idx <- which(data_outlier_odds$odds.x.mod/data_outlier_odds$BetfairD > quantile.cutoff)
  outlier.draw <- data_outlier_odds[susp.idx, ]
  
  # outlier away
  quantile.cutoff <- quantile(data_outlier_odds$odds.2.mod/data_outlier_odds$BetfairA, 
                              probs = akt.quantile, na.rm = TRUE)
  susp.idx <- which(data_outlier_odds$odds.2.mod/data_outlier_odds$BetfairA > quantile.cutoff)
  outlier.away <- data_outlier_odds[susp.idx, ]
  
  # outlier U1.5
  quantile.cutoff <- quantile(data_outlier_odds$odds.u15.mod/data_outlier_odds$BetfairU1.5, 
                              probs = akt.quantile, na.rm = TRUE)
  susp.idx <- which(data_outlier_odds$odds.u15.mod/data_outlier_odds$BetfairU1.5 > 
                      quantile.cutoff)
  outlier.u15 <- data_outlier_odds[susp.idx, ]
  
  # outlier O1.5
  quantile.cutoff <- quantile(data_outlier_odds$odds.o15.mod/data_outlier_odds$BetfairO1.5, 
                              probs = akt.quantile, na.rm = TRUE)
  susp.idx <- which(data_outlier_odds$odds.o15.mod/data_outlier_odds$BetfairO1.5 > 
                      quantile.cutoff)
  outlier.o15 <- data_outlier_odds[susp.idx, ]
  
  # outlier U2.5
  quantile.cutoff <- quantile(data_outlier_odds$odds.u25.mod/data_outlier_odds$BetfairU2.5, 
                              probs = akt.quantile, na.rm = TRUE)
  susp.idx <- which(data_outlier_odds$odds.u25.mod/data_outlier_odds$BetfairU2.5 > 
                      quantile.cutoff)
  outlier.u25 <- data_outlier_odds[susp.idx, ]
  
  # outlier O2.5
  quantile.cutoff <- quantile(data_outlier_odds$odds.o25.mod/data_outlier_odds$BetfairO2.5, 
                              probs = akt.quantile, na.rm = TRUE)
  susp.idx <- which(data_outlier_odds$odds.o25.mod/data_outlier_odds$BetfairO2.5 > 
                      quantile.cutoff)
  outlier.o25 <- data_outlier_odds[susp.idx, ]
  
  # outlier U3.5
  quantile.cutoff <- quantile(data_outlier_odds$odds.u35.mod/data_outlier_odds$BetfairU3.5, 
                              probs = akt.quantile, na.rm = TRUE)
  susp.idx <- which(data_outlier_odds$odds.u35.mod/data_outlier_odds$BetfairU3.5 > 
                      quantile.cutoff)
  outlier.u35 <- data_outlier_odds[susp.idx, ]
  
  # outlier O3.5
  quantile.cutoff <- quantile(data_outlier_odds$odds.o35.mod/data_outlier_odds$BetfairO3.5, 
                              probs = akt.quantile, na.rm = TRUE)
  susp.idx <- which(data_outlier_odds$odds.o35.mod/data_outlier_odds$BetfairO3.5 > 
                      quantile.cutoff)
  outlier.o35 <- data_outlier_odds[susp.idx, ]
  
  outlier.all[[i]] <- rbind(outlier.home, outlier.draw, outlier.away, outlier.u15, 
                            outlier.o15, outlier.u25, outlier.o25, outlier.u35, outlier.o35)
  
  # FPR and TPR
  flagged <- nrow(unique(outlier.all[[i]]))
  proven <- sum(as.numeric(as.character(unique(outlier.all[[i]])$FixedMatch)))
  tpr.odds <- c(tpr.odds, proven/24)
  fpr.odds <- c(fpr.odds, flagged/3195)
  
  # PPV and negative PV
  fp <- flagged - proven
  ppv <- c(ppv, proven/(proven + fp))
  fn <- 24 - proven
  tn <- 3195 - fp
  npv <- c(npv, tn/(fn + tn))
}


# Classification by betting odds: choose optimal cutoff via Youden's index
# Please note: As we provide artificial data for the Betfair betting odds, 
# the following results differ slightly from those presented in the paper

# Youden index
max(tpr.odds - fpr.odds)
# corresponding cutoff for which the Youden index reaches the maximum
cutoffs[which.max(tpr.odds - fpr.odds)]
# TPR for the optimal cutoff
tpr.odds[which.max(tpr.odds - fpr.odds)]
# FPR for the optimal cutoff
fpr.odds[which.max(tpr.odds - fpr.odds)]

# optimal cutoff via positive predicted value
cutoffs[which.max(ppv)]
# corresponding PPV and NPV for the optimal cut off
ppv[which.max(ppv)]
npv[which.max(ppv)]



# Please note: As we provide artificial data for the Betfair betting odds, 
# the following results differ slightly from those presented in the paper

# Youden index
idx <- which.max(tpr.odds - fpr.odds)
# For the optimal cut-off via Younden index, check how many matches were
# correctly identified
# -> Information contained in Table 6
# predicted fixed
(pred.fix <- nrow(unique(outlier.all[[idx]])))
# predicted normal
(pred.normal <- 3219 - nrow(unique(outlier.all[[idx]])))
# predicted fixed and acutal fixed
sum(as.numeric(as.character(unique(outlier.all[[idx]])$FixedMatch)))
# predicted normal and actual fixed
24 - sum(as.numeric(as.character(unique(outlier.all[[idx]])$FixedMatch)))
# predicted fixed and actual normal
pred.fix - sum(as.numeric(as.character(unique(outlier.all[[idx]])$FixedMatch)))
# predicted normal and actual normal
pred.normal - (24 - sum(as.numeric(as.character(unique(outlier.all[[idx]])$FixedMatch))))


# Classification by betting odds: choose optimal cutoff via PPV
# Please note: As we provide artificial data for the Betfair betting odds, 
# the following results differ slightly from those presented in the paper
idx <- which(cutoffs == cutoffs[which.max(ppv)])

# For the optimal cut-off via PPV, check how many matches were 
# correctly identified
# -> Information contained in Table 7
# predicted fixed
(pred.fix <- nrow(unique(outlier.all[[idx]])))
# predicted normal
(pred.normal <- 3219 - nrow(unique(outlier.all[[idx]])))
# predicted fixed and acutal fixed
sum(as.numeric(as.character(unique(outlier.all[[idx]])$FixedMatch)))
# predicted normal and actual fixed
24 - sum(as.numeric(as.character(unique(outlier.all[[idx]])$FixedMatch)))
# predicted fixed and actual normal
pred.fix - sum(as.numeric(as.character(unique(outlier.all[[idx]])$FixedMatch)))
# predicted normal and actual normal
pred.normal - (24 - sum(as.numeric(as.character(unique(outlier.all[[idx]])$FixedMatch))))





# ROC curve plot ----------------------------------------------------------
# Figure 4 is generated here
# Please note: since the classification based on betting odds uses artificial
# data for the betting odds, this plot differs slightly from the one presented
# in the paper
plot(fpr, tpr, type = "l", xlab = "FPR", ylab = "TPR")
lines(fpr.odds, tpr.odds, col = "black", lwd = 2, lty = 3)
abline(0, 1, col = "lightgray")

legend(x = -0.03, y = 1.02, c("Volume model", "Odds model"), lty = c(1, 3), cex = 0.75, 
       lwd = 2, bty = "n", y.intersp = 2)


# Combined approach using volumes and odds ----------------------------------------------

# generate sequences for possible cutoffs for the two models
dev <- data_outlier_volume$QuantRes
quantiles <- seq(min(dev, na.rm = TRUE), max(dev, na.rm = TRUE), by = 0.05)
cutoffs <- seq(0, 0.99, by = 0.01)

# create empty vectors for the results
tpr.odds <- fpr.odds <- tpr.one <- fpr.one <- ppv <- npv <- flagged <- 
  found.matches <- proven <- c()

# define grid off all combinations of cutoffs for both models
cutoffs.grid <- expand.grid(quantiles, cutoffs)

data_outlier_volume$FixedMatch <- as.numeric(as.character(data_outlier_volume$FixedMatch))
# Please note: Executing the for-loop may take several hours. The corresponding
# indices for the optimal combination of cut off values are stated in line 439 
# and 471
for (i in 1:nrow(cutoffs.grid)) {
  # set cut-offs
  akt.quantile <- cutoffs.grid[i, 1]
  akt.cutoff <- cutoffs.grid[i, 2]
  ## betting volumes; flag matches according to the current cutoff
  flag.match.idx <- which(dev > akt.quantile)
  flagged.matches.df <- data_outlier_volume
  # create column which indicates flagged matches
  flagged.matches.df$FlaggedBV <- 0
  flagged.matches.df$FlaggedBV[flag.match.idx] <- 1
  
  # save matches flagged by the betting volume model in a data frame
  flagged.matches.short <- unique(flagged.matches.df[flag.match.idx, c("season", "HomeTeam", "AwayTeam",
                                                                       "MatchID", "FlaggedBV", "FixedMatch")])
  
  
  ## betting odds; flag matches according to the current cutoff outlier home
  quantile.cutoff <- quantile(data_outlier_odds$odds.1.mod/data_outlier_odds$BetfairH, 
                              probs = akt.cutoff, na.rm = TRUE)
  susp.idx <- which(data_outlier_odds$odds.1.mod/data_outlier_odds$BetfairH > quantile.cutoff)
  outlier.home <- data_outlier_odds[susp.idx, ]
  
  # outlier draw
  quantile.cutoff <- quantile(data_outlier_odds$odds.x.mod/data_outlier_odds$BetfairD, 
                              probs = akt.cutoff, na.rm = TRUE)
  susp.idx <- which(data_outlier_odds$odds.x.mod/data_outlier_odds$BetfairD > quantile.cutoff)
  outlier.draw <- data_outlier_odds[susp.idx, ]
  
  # outlier away
  quantile.cutoff <- quantile(data_outlier_odds$odds.2.mod/data_outlier_odds$BetfairA, 
                              probs = akt.cutoff, na.rm = TRUE)
  susp.idx <- which(data_outlier_odds$odds.2.mod/data_outlier_odds$BetfairA > quantile.cutoff)
  outlier.away <- data_outlier_odds[susp.idx, ]
  
  # outlier U1.5
  quantile.cutoff <- quantile(data_outlier_odds$odds.u15.mod/data_outlier_odds$BetfairU1.5, 
                              probs = akt.cutoff, na.rm = TRUE)
  susp.idx <- which(data_outlier_odds$odds.u15.mod/data_outlier_odds$BetfairU1.5 > 
                      quantile.cutoff)
  outlier.u15 <- data_outlier_odds[susp.idx, ]
  
  # outlier O1.5
  quantile.cutoff <- quantile(data_outlier_odds$odds.o15.mod/data_outlier_odds$BetfairO1.5, 
                              probs = akt.cutoff, na.rm = TRUE)
  susp.idx <- which(data_outlier_odds$odds.o15.mod/data_outlier_odds$BetfairO1.5 > 
                      quantile.cutoff)
  outlier.o15 <- data_outlier_odds[susp.idx, ]
  
  # outlier U2.5
  quantile.cutoff <- quantile(data_outlier_odds$odds.u25.mod/data_outlier_odds$BetfairU2.5, 
                              probs = akt.cutoff, na.rm = TRUE)
  susp.idx <- which(data_outlier_odds$odds.u25.mod/data_outlier_odds$BetfairU2.5 > 
                      quantile.cutoff)
  outlier.u25 <- data_outlier_odds[susp.idx, ]
  
  # outlier O2.5
  quantile.cutoff <- quantile(data_outlier_odds$odds.o25.mod/data_outlier_odds$BetfairO2.5, 
                              probs = akt.cutoff, na.rm = TRUE)
  susp.idx <- which(data_outlier_odds$odds.o25.mod/data_outlier_odds$BetfairO2.5 > 
                      quantile.cutoff)
  outlier.o25 <- data_outlier_odds[susp.idx, ]
  
  # outlier U3.5
  quantile.cutoff <- quantile(data_outlier_odds$odds.u35.mod/data_outlier_odds$BetfairU3.5, 
                              probs = akt.cutoff, na.rm = TRUE)
  susp.idx <- which(data_outlier_odds$odds.u35.mod/data_outlier_odds$BetfairU3.5 > 
                      quantile.cutoff)
  outlier.u35 <- data_outlier_odds[susp.idx, ]
  
  # outlier O3.5
  quantile.cutoff <- quantile(data_outlier_odds$odds.o35.mod/data_outlier_odds$BetfairO3.5, 
                              probs = akt.cutoff, na.rm = TRUE)
  susp.idx <- which(data_outlier_odds$odds.o35.mod/data_outlier_odds$BetfairO3.5 > 
                      quantile.cutoff)
  outlier.o35 <- data_outlier_odds[susp.idx, ]
  
  outlier.gesamt <- rbind(outlier.home, outlier.draw, outlier.away, outlier.u15, 
                          outlier.o15, outlier.u25, outlier.o25, outlier.u35, outlier.o35)
  outlier.unique <- unique(outlier.gesamt)
  # save the matches flagged by betting odds in a data frame
  outlier.unique <- outlier.unique[, c("season", "HomeTeam", "AwayTeam", "FixedMatch")]
  outlier.unique$FlaggedOdds <- 1
  
  # merge the two data frames with flagged matches
  intmd.result <- merge(flagged.matches.short, outlier.unique, all = TRUE, by.x = c("season", 
                                                                                    "HomeTeam", "AwayTeam"), by.y = c("season", "HomeTeam", "AwayTeam"))
  intmd.result$FixedMatch <- NA
  # insert in the merged data frame (i.e. in "intmd.result") which matches were
  # flagged by which model
  intmd.result$FixedMatch.x <- as.numeric(as.character(intmd.result$FixedMatch.x))
  intmd.result$FixedMatch.y <- as.numeric(as.character(intmd.result$FixedMatch.y))
  for (r in 1:nrow(intmd.result)) {
    intmd.result$FixedMatch[r] <- ifelse(is.na(intmd.result$FixedMatch.x[r]), 
                                         intmd.result$FixedMatch.y[r], intmd.result$FixedMatch.x[r])
  }
  intmd.result$FixedMatch <- as.numeric(as.character(intmd.result$FixedMatch))
  # The column 'FlaggedFixOneWay' shows whether the match is flagged by at least
  # one model or not
  intmd.result$FlaggedFixOneWay <- intmd.result$FlaggedBV | intmd.result$FlaggedOdds
  
  # take all proven fixed matches of the ones which were at least flagged by 
  # one model
  fixed.sum.oneway <- intmd.result[intmd.result$FixedMatch == 1, ]$FlaggedFixOneWay
  
  # TPR and FPR for the combined approach
  found.matches[i] <- sum(fixed.sum.oneway, na.rm = TRUE)
  tpr.one <- c(tpr.one, found.matches[i]/24)
  
  pred.notcorrect <- sum(intmd.result$FlaggedFixOneWay) - found.matches[i]
  fpr.one <- c(fpr.one, pred.notcorrect/3195)
  
  # PPV and NPV for the combinded approach
  flagged[i] <- sum(intmd.result$FlaggedFixOneWay)
  fp <- flagged[i] - found.matches[i]
  ppv <- c(ppv, found.matches[i]/(found.matches[i] + fp))
  fn <- 24 - found.matches[i]
  tn <- 3195 - fp
  npv <- c(npv, tn/(fn + tn))
  
  if (i%%5000 == 0) print(i)
}

# Classification by betting volumes and betting odds: choose optimal cutoff via Youden index
# Please note: As we provide artificial data for the Betfair betting odds, 
# the following results differ slightly from those presented in the paper

# Youden index
max(tpr.one - fpr.one)
idx.max <- which((tpr.one - fpr.one) == max(tpr.one - fpr.one))  
# resulting index:
# idx.max <- 25704

# optimal cutoffs via Youden index
cutoffs.grid[idx.max, ]
# corresponding TPR and FPR for the optimal cut off
tpr.one[idx.max]
fpr.one[idx.max]

# For the optimal cut-off via Younden index, check how many matches were
# correctly identified
# -> Information contained in Table 6
# predicted fixed
(pred.fix <- flagged[idx.max])
# predicted normal
(pred.normal <- 3219 - flagged[idx.max])
# predicted fixed and acutal fixed
found.matches[idx.max]
# predicted normal and actual fixed
24 - found.matches[idx.max]
# predicted fixed and actual normal
pred.fix - found.matches[idx.max]
# predicted normal and actual normal
pred.normal - ((24 - found.matches[idx.max]))


# Classification by betting volumes and betting odds: choose optimal cutoff via PPV
# Please note: As we provide artificial data for the Betfair betting odds, 
# the following results differ slightly from those presented in the paper

# optimal cutoff for the combined approach via PPV
idx.max.ppv <- which.max(ppv) 
# resulting index:
# idx.max.ppv <- 25726
cutoffs.grid[idx.max.ppv,]
# corresponding PPV and NPV for the optimal cut off
ppv[idx.max.ppv]
npv[idx.max.ppv]

# For the optimal cut-off via PPV, check how many matches were
# correctly identified
# -> Information contained in Table 7
# predicted fixed
(pred.fix <- flagged[idx.max.ppv])
# predicted normal
(pred.normal <- 3219 - flagged[idx.max])
# predicted fixed and acutal fixed
found.matches[idx.max.ppv]
# predicted normal and actual fixed
24 - found.matches[idx.max.ppv]
# predicted fixed and actual normal
pred.fix - found.matches[idx.max.ppv]
# predicted normal and actual normal
pred.normal - ((24 - found.matches[idx.max.ppv]))



# Pseudo-likelihood statistics -- Table 5 --------------------------------------------

# create a dummy indicating whether a certain bet would have been won
data_outlier_odds$U1.5 <- ifelse(data_outlier_odds$y1 + data_outlier_odds$y2 < 2, 1, 0)
data_outlier_odds$O1.5 <- ifelse(data_outlier_odds$y1 + data_outlier_odds$y2 > 1, 1, 0)
data_outlier_odds$U2.5 <- ifelse(data_outlier_odds$y1 + data_outlier_odds$y2 < 3, 1, 0)
data_outlier_odds$O2.5 <- ifelse(data_outlier_odds$y1 + data_outlier_odds$y2 > 2, 1, 0)
data_outlier_odds$U3.5 <- ifelse(data_outlier_odds$y1 + data_outlier_odds$y2 < 3, 1, 0)
data_outlier_odds$O3.5 <- ifelse(data_outlier_odds$y1 + data_outlier_odds$y2 > 2, 1, 0)
data_outlier_odds$Res1 <- ifelse(data_outlier_odds$y1 > data_outlier_odds$y2, 1, 0)
data_outlier_odds$ResX <- ifelse(data_outlier_odds$y1 == data_outlier_odds$y2, 1, 0)
data_outlier_odds$Res2 <- ifelse(data_outlier_odds$y1 < data_outlier_odds$y2, 1, 0)

# check for which betting types there are no betfair odds available
nas <- apply(data_outlier_odds[23:31], 2, function(y) is.na(y))
nas <- as.data.frame(nas)

# "bet.types" includes the names of the columns for the betfair odds
bet.types <- colnames(nas)
# "res.types" includes the names for the columns indicating whether the certain bet was won
res.types <- c("Res1", "ResX", "Res2", "U1.5", "O1.5", "U2.5", "O2.5", "U3.5", "O3.5")
# "model.types" includes the names of the columns for the model-based odds
model.types <- colnames(data_outlier_odds)[14:22]
# empty matrix for the results
res <- as.data.frame(matrix(nrow = 2, ncol = 9))

# loop for each betting type
for (i in 1:length(bet.types)) {
  # check whether (for the current betting type) Betfair odds are offered
  na.1 <- !nas[[bet.types[i]]]
  # index for the bets which were won
  idx.win <- data_outlier_odds[[res.types[i]]] == 1
  # take observations where Betfair odds are offered and where the bet 
  # was won into the calculation
  idx.take <- na.1 == 1 & idx.win == 1 
  win <- data_outlier_odds[idx.take, ]
  # calculate pseudo-likelihood statistics for the Betfair odds and the model-based odds
  pl.betfair <- exp(mean(log(1/win[[bet.types[i]]]), na.rm = TRUE))
  pl.model <- exp(mean(log(1/win[[model.types[i]]]), na.rm = TRUE))
  res[, i] <- c(pl.betfair, pl.model)
}
row.names(res) <- c("Betfair", "Model")
colnames(res) <- c("1", "X", "2", "U1.5", "O1.5", "U2.5", "O2.5", "U3.5", "O3.5")
# Please note: As we provide artificial data for the Betfair betting odds, 
# the following results differ slightly from those presented in the paper
res
