library(glmnet)
library(Matrix)
library(xtable)
source('code/npm.r')

# load data
data = read.csv('data/all2015.csv', header = FALSE)
colnames(data) = read.csv('data/fields.csv')$Header

# remove plate appearances for which pitcher is batter
data = data[data$BAT_FLD_CD != 1, ]

# remove batters and pitchers with sub-minimum playing time
batterID = as.character(data$BAT_ID)
PA = table(data$BAT_ID)
minPA = sort(PA, decreasing = TRUE)[390]
replace = is.element(data$BAT_ID, names(which(PA < minPA)))
batterID[replace] = paste('replace', data$BAT_FLD_CD[replace], sep = '')
pitcherID = as.character(data$PIT_ID)
BF = table(data$PIT_ID)
minBF = sort(BF, decreasing = TRUE)[360]
pitcherID[is.element(data$PIT_ID, names(which(BF < minBF)))] = 'replace1'

# define the response
events = c('F', 'G', 'K', 'BB', 'HBP', '1B', '2B', '3B', 'HR')
y = as.character(data$EVENT_CD)
y[(y == 2 | y == 18 | y == 19) & data$BATTEDBALL_CD != 'G'] = 'F'
y[(y == 2 | y == 18 | y == 19) & data$BATTEDBALL_CD == 'G' & !data$SH_FL] = 'G'
y[y == 3] = 'K'
y[y == 14] = 'BB'
y[y == 16] = 'HBP'
y[y == 20] = '1B'
y[y == 21] = '2B'
y[y == 22] = '3B'
y[y == 23] = 'HR'
subset = is.element(y, events)
y = as.factor(y[subset])
Y = model.matrix(~ y - 1)

hand = (data$RESP_BAT_HAND_CD != data$RESP_PIT_HAND_CD)[subset]
home = data$BAT_HOME_ID[subset] == 1
stadium = as.numeric(data$HOME_TEAM_ID[subset]) + 2
batter = as.numeric(as.factor(batterID[subset])) + max(stadium)
pitcher = as.numeric(as.factor(pitcherID[subset])) + max(batter)

x = sparseMatrix(c(which(hand), which(home), rep(1:sum(subset), 3)),
  c(rep(1, sum(hand)), rep(2, sum(home)), stadium, batter, pitcher))



# get naive rate estimates
counts = rbind(aggregate(y, by = list(hand), table),
  aggregate(y, by = list(home), table),
  aggregate(y, by = list(as.character(data$HOME_TEAM_ID[subset])), table),
  aggregate(y, by = list(batter), table),
  aggregate(y, by = list(pitcher), table))[, 2]
rates = counts/rowSums(counts)
rownames(rates) = c('samehand', 'oppohand', 'away', 'home',
  sort(unique(as.character(data$HOME_TEAM)[subset])),
  sort(unique(batterID[subset])), sort(unique(pitcherID[subset])))

pca.bat = prcomp(rates[2+min(batter):max(batter), ])
pcs.bat = pca.bat$rotation[events, ]
varexp.bat = pca.bat$sdev^2/sum(pca.bat$sdev^2)
pca.bat.table = rbind(pcs.bat, 100*varexp.bat, rep(NA, 9))
rownames(pca.bat.table) = c(events, '% Variance', 'explained')
print(xtable(pca.bat.table, digits = 1), file = 'tabs/pca-bat.tex',
  include.colnames = FALSE, only.contents = TRUE, hline.after = 9)
pdf('figs/pca-bat.pdf', width = 10)
barplot(varexp.bat, axes = FALSE, col = 'lightgreen', border = NA)
dev.off()

pca.pit = prcomp(rates[2+min(pitcher):max(pitcher), ])
pcs.pit = pca.pit$rotation[events, ]
varexp.pit = pca.pit$sdev^2/sum(pca.pit$sdev^2)
pca.pit.table = rbind(pcs.pit, 100*varexp.pit, rep(NA, 9))
rownames(pca.pit.table) = c(events, '% Variance', 'explained')
print(xtable(pca.pit.table, digits = 1), file = 'tabs/pca-pit.tex',
  include.colnames = FALSE, only.contents = TRUE, hline.after = 9)
pdf('figs/pca-pit.pdf', width = 10)
barplot(varexp.pit, axes = FALSE, col = 'lightgreen', border = NA)
dev.off()


pdf('figs/pca-biplot-bat.pdf', height = 5, width = 5)
c = 5
plot(pca.bat$x[, 1], pca.bat$x[, 2], col = 'dodgerblue', axes = FALSE,
  cex = 0.6,
  xlab = 'First principal component', ylab = 'Second principal component',
  xlim = range(c(pca.bat$rotation[, 1]/c*1.1, pca.bat$x[, 1])),
  ylim = range(c(pca.bat$rotation[, 2]/c*1.1, pca.bat$x[, 2])))
axis(1)
axis(2)
axis(3, at = (-2:2)/10, label = c*(-2:2)/10)
axis(4, at = (-3:3)/20, label = c*(-3:3)/20)
for (k in 1:ncol(pcbr)) {
  x = pca.bat$rotation[k, 1]
  y = pca.bat$rotation[k, 2]
  if (x^2 + y^2 > .01) {
    arrows(0, 0, x/c, y/c, lwd = 2, length = 0.1)
    c2 = 40*(abs(x) + abs(y))
    text(x/c + x/c2, y/c + y/c2, rownames(pca.bat$rotation)[k], cex = 1.5)
  }
}
dev.off()


pdf('figs/pca-biplot-pit.pdf', height = 5, width = 5)
c = 5
plot(pca.pit$x[, 1], pca.pit$x[, 2], col = 'dodgerblue', axes = FALSE,
  cex = 0.6,
  xlab = 'First principal component', ylab = 'Second principal component',
  xlim = range(c(pca.pit$rotation[, 1]/c*1.1, pca.pit$x[, 1])),
  ylim = range(c(pca.pit$rotation[, 2]/c*1.1, pca.pit$x[, 2])))
axis(1)
axis(2)
axis(3, at = (-2:2)/10, label = c*(-2:2)/10)
axis(4, at = (-2:1)/10, label = c*(-2:1)/10)
for (k in 1:ncol(pcbr)) {
  x = pca.pit$rotation[k, 1]
  y = pca.pit$rotation[k, 2]
  if (x^2 + y^2 > .01) {
    arrows(0, 0, x/c, y/c, lwd = 2, length = 0.1)
    c2 = 40*(abs(x) + abs(y))
    text(x/c + x/c2, y/c + y/c2, rownames(pca.pit$rotation)[k], cex = 1.5)
  }
}
dev.off()


#training = 1
#
#set.seed(8394)
#permutation = sample(1:nrow(x))
#block = rep(1:20, length = nrow(x))[permutation]
#foldid = sort(rep(1:10, length = nrow(x)))[permutation]
#test = block > training
#
### multinomial model
#middle = -4
#range = 8
#path = exp(seq(middle + range/2, middle - range/2, length = 9))
#
#time = Sys.time()
#fit.glmnet = cv.glmnet(x[!test, ], y[!test], family = 'multinomial', alpha = 0,
#  standardize = FALSE, lambda = path, foldid = foldid[!test])
#Sys.time() - time
#
#pred.glmnet.train = predict(fit.glmnet, as.matrix(x[!test, ]),
#  s = 'lambda.min', type = 'response')[, , 1]
#pred.glmnet.test = predict(fit.glmnet, as.matrix(x[test, ]),
#  s = 'lambda.min', type = 'response')[, , 1]
#
#deviance.glmnet.train = -2 * mean(log(rowSums(Y[!test, ] * pred.glmnet.train)))
#deviance.glmnet.test = -2 * mean(log(rowSums(Y[test, ] * pred.glmnet.test)))
#
### npmr
#middle = 3.75
#range = 2^(-1)
#path = exp(seq(middle + range/2, middle - range/2, length = 9))
#
#group = rep(0:3, c(2, length(unique(stadium)),
#    length(unique(batter)), length(unique(pitcher))))
#
#Sys.time = Sys.time()
#fit.npmr = cv.npmr(x[!test, ], model.matrix(~ y - 1)[!test, ], lambda = path,
#  s = .01, eps = 1e-8, foldid = foldid[!test],
#  group = group)
#Sys.time() - Sys.time
#
#pred.npmr.train = predict(fit.npmr, as.matrix(x[!test, ]))
#pred.npmr.test = predict(fit.npmr, as.matrix(x[test, ]))
#
#deviance.npmr.train = -2 * mean(log(rowSums(Y[!test, ] * pred.npmr.train)))
#deviance.npmr.test = -2 * mean(log(rowSums(Y[test, ] * pred.npmr.test)))
#
#save.image(paste('results/baseball', training, '.rda', sep = ''))
#
#P = predict(fit.npmr, diag(ncol(x)))
#rownames(P) = c('oppohand', 'home',
#  sort(unique(as.character(data$HOME_TEAM)[subset])),
#  sort(unique(batterID[subset])), sort(unique(pitcherID[subset])))
#colnames(P) = sort(events)
#
#B.bat = fit.npmr$fit$B[min(batter):max(batter), , 1]
#SVD.bat = svd(B.bat)
#V.bat = SVD.bat$v
#UD.bat = SVD.bat$u %*% diag(SVD.bat$d)
#rownames(V.bat) = sort(events)
#rownames(UD.bat) = sort(unique(batterID[subset]))
#
#B.pit = fit.npmr$fit$B[min(pitcher):max(pitcher), , 1]
#SVD.pit = svd(B.pit)
#V.pit = SVD.pit$v
#UD.pit = SVD.pit$u %*% diag(SVD.pit$d)
#rownames(V.pit) = sort(events)
#rownames(UD.pit) = sort(unique(pitcherID[subset]))
#
##pdf('figs/baseball.pdf', height = 5)
##plot(-UD[33:796, 1], -UD[33:796, 2], pch = c(rep(2, 403), rep(1, 361)),
##  col = c(rep('darkorange', 403), rep('dodgerblue', 361)),
##  xlab = 'Latent variable 1', ylab = 'Latent variable 2')
##dev.off()
##
##latent.var.table = rbind(V, SVD$d, rep(NA, 9))
##rownames(latent.var.table) = c(events, 'Singular', 'value')
##print(xtable(latent.var.table, digits = 1), file = 'tabs/latent-var.tex',
##  include.colnames = FALSE, only.contents = TRUE, hline.after = 9)
##pdf('figs/latent-var.pdf', width = 10)
##barplot(SVD$d, axes = FALSE, col = 'lightgreen', border = NA)
##dev.off()
##
##
