#### code for the paper 'A regularised hidden Markov model for analysing the hot shoe in football' ####

## There are two files of R-code: 
## This file contains the code for the 'hot shoe' case study presented in the manuscript (Section 4)
## The file 'hotshoe_simulation.R' contains the code for the simulation study (Section 3)

## the code was tested in R (version 4.0.2) on Windows 10 with packages...

# ...Rcpp (version 1.0.5)
# ...data.table (version 1.13.0)
# ...ggplot2 (version 3.3.2)
# ...gridExtra (version 2.3)
# ...dplyr (version 1.0.2)
# ...lubridate (version 1.7.9)
# ...footballpenaltiesBL (version 1.0.0)

## authors: Marius Oetting and Andreas Groll
## Email for correspondence: marius.oetting@uni-bielefeld.de

## load packages
library(Rcpp)
library(data.table)
library(ggplot2)
library(gridExtra)
library(dplyr)
library(lubridate)
library(footballpenaltiesBL)

## import C++ function for faster likelihood evaluation
sourceCpp("nLogLike.cpp")

# application -------------------------------------------------------------

# import data
data(penalties)
# filter player with at least 5 attempts
penaltytaker_table <- as.data.frame(table(penalties$penaltytaker))
players <- as.character(unique(penaltytaker_table[penaltytaker_table$Freq >= 5,]$Var1))
hotshoe_data <- penalties[penalties$penaltytaker %in% players, ]

# recode home dummy
hotshoe_data$homegame[hotshoe_data$homegame == 2] <- 0
# generate goal column
hotshoe_data$goal <- ifelse(hotshoe_data$result == "Tor", 1, 0)
## generate further covariates
hotshoe_data$year <- gsub("-.+$", "", hotshoe_data$date)
hotshoe_data$year <- as.numeric(as.character(hotshoe_data$year))
hotshoe_data$yearbefore1985 <- ifelse(hotshoe_data$year <= 1985, 1, 0)
hotshoe_data$year8695 <- ifelse(hotshoe_data$year > 1985 & hotshoe_data$year <= 1995, 1, 0)
hotshoe_data$year1996 <- ifelse(hotshoe_data$year == 1996, 1, 0)
hotshoe_data$year9717 <- ifelse(hotshoe_data$year >= 1997, 1, 0)
# categorised score difference
hotshoe_data$spielstandmore2 <- ifelse(hotshoe_data$goaldiff > 2, 1, 0)
hotshoe_data$spielstand2 <- ifelse(hotshoe_data$goaldiff == 2, 1, 0)
hotshoe_data$spielstand1 <- ifelse(hotshoe_data$goaldiff == 1, 1, 0)
hotshoe_data$spielstand0 <- ifelse(hotshoe_data$goaldiff == 0, 1, 0)
hotshoe_data$spielstandmin1 <- ifelse(hotshoe_data$goaldiff == -1, 1, 0)
hotshoe_data$spielstandmin2 <- ifelse(hotshoe_data$goaldiff == -2, 1, 0)
hotshoe_data$spielstandminmore2 <- ifelse(hotshoe_data$goaldiff < -2, 1, 0)
# interactions
hotshoe_data$spielstandmore2minute <- hotshoe_data$spielstandmore2 * hotshoe_data$minute
hotshoe_data$spielstand2minute <- hotshoe_data$spielstand2 * hotshoe_data$minute
hotshoe_data$spielstand1minute <- hotshoe_data$spielstand1 * hotshoe_data$minute
hotshoe_data$spielstandmin1minute <- hotshoe_data$spielstandmin1 * hotshoe_data$minute
hotshoe_data$spielstandmin2minute <- hotshoe_data$spielstandmin2 * hotshoe_data$minute
hotshoe_data$spielstandminmore2minute <- hotshoe_data$spielstandminmore2 * hotshoe_data$minute
hotshoe_data$erfahrunghome <- hotshoe_data$homegame * hotshoe_data$ptexp

# design matrix
hotshoe_data$matchdaysq <- hotshoe_data$matchday^2

form.obj <- formula(goal ~ homegame + ptexp + gkexp + minute + matchday +
                      matchdaysq +
                      erfahrunghome + yearbefore1985 + year8695 + year1996 + 
                      spielstandmore2 + spielstand2 + spielstand1 + spielstandmin1 + 
                      spielstandmin2 + spielstandminmore2 + 
                      spielstandmore2minute + spielstand2minute + spielstand1minute + spielstandmin1minute + 
                      spielstandmin2minute + spielstandminmore2minute + 
                      factor(penaltytaker) + factor(goalkeeper)) 

X <- model.matrix(form.obj, data = hotshoe_data)[,-1]

Design <- scale(X)
Design <- as.data.frame(Design)
Design <- cbind(Design , hotshoe_data$penaltytaker)
colnames(Design)[ncol(Design)] <- "penaltytaker"
Design <- cbind(Design, hotshoe_data$goal)
colnames(Design)[ncol(Design)] <- "goal"

# store design matrices in list
Design$penaltytaker <- as.character(Design$penaltytaker)
data.list <- split(Design, f = Design$penaltytaker)

original.means <- apply(X, 2, mean)
original.sds <- apply(X, 2, sd)


# motivating example: time series of Gerd Mueller -------------------------

## the following code generates Figure 4
tab.figure2 <- model.matrix(form.obj, data = hotshoe_data)[,-1] %>% as.data.frame
tab.figure2$goal <- hotshoe_data$goal
tab.figure2$date <- hotshoe_data$date

idx <- which(tab.figure2$`factor(penaltytaker)Mller4, Gerd` == 1)

tab.gerd <- tab.figure2[idx,]

tab.gerd$date <- ymd(tab.gerd$date)

cbbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#D55E00", "#CC79A7")
timeline_plot <- ggplot(tab.gerd, aes(x = date, y = 0, col = factor(goal))) + 
  scale_color_manual(name = "", values = cbbPalette, labels = c("missed", "scored")) + geom_hline(yintercept=0, 
                                                                                                  color = "black", size = 0.3) +
  geom_point(aes(y = 0), size = 4) + theme_classic() + ylim(-0.1, 0.1) +
  theme(axis.line.y=element_blank(),
        axis.text.y=element_blank(),
        axis.title.x=element_blank(),
        axis.title.y=element_blank(),
        axis.ticks.y=element_blank(),
        axis.text.x =element_blank(),
        axis.ticks.x =element_blank(),
        axis.line.x =element_blank(),
        legend.box = "horizontal",
        legend.direction = "horizontal", 
        #legend.position = "bottom"
        legend.position = c(0.5, 0.15)
  ) +
  geom_text(data = tab.gerd[seq(1, nrow(tab.gerd), by = 3) ,], 
            aes(x = date, y = -0.035, label = date),
            size = 2.5, vjust = 0.5, color = 'black', angle = 90) 


timeline_plot


# grid search for alpha and lambda ----------------------------------------

# likelihood function
L.sim.pen.elnet.appl <- function(theta.star, x, N, lambda, alpha, nr.covariates){
  Gamma1 <- diag(N)
  Gamma1[!Gamma1] <- exp(theta.star[1:(N * (N - 1))])
  Gamma1 <- Gamma1/rowSums(Gamma1)
  
  beta.0 <- theta.star[(N * (N - 1) + 1):(N * (N - 1) + N)]
  beta.1 <- theta.star[(N * (N - 1) + N + 1):((N * (N - 1) + N + 1) + nr.covariates - 1)]
  delta <- solve(t(diag(N) - Gamma1 + 1), rep(1, N))
  
  l.all <- 0
  # loop over all players
  for(p in 1:length(x)){
    
    covariates <- as.matrix(x[[p]][, c(-ncol(x[[p]]), -(ncol(x[[p]])) + 1)])
    allprobs <- matrix(1, nrow(x[[p]]), N)
    for(j in 1:N){
      allprobs[, j] <- dbinom(x[[p]]$goal, 1, plogis(beta.0[j] + as.vector(covariates %*% beta.1)))
    }
    foo <- delta %*% diag(allprobs[1,])
    l <- log(sum(foo))
    phi <- foo/sum(foo)
    
    # rcpp
    l <- nLogLike_Rcpp(allprobs, Gamma1, foo, nrow(x[[p]]))
    l.all <- l.all + l
  }
  
  # add penalty
  penalty <- lambda * (alpha * sum(sqrt(beta.1^2 + 1e-5)) + (1 - alpha) * sum(beta.1^2))
  res.likelihood <- -l.all + penalty
  return(res.likelihood)
}



# number of states
N <- 2
# numer of covariates
nr.covariates <- ncol(X)
# sample soze
nr.obs <- nrow(rbindlist(data.list))

# grid for alpha and lambda
lambdas <- seq(5000, 0.5, length.out = 50)
alphas <- c(0, 0.2, 0.4, 0.6, 0.8, 1)
lambdas_alphas_grid <- expand.grid(lambdas, alphas)
colnames(lambdas_alphas_grid) <- c("lambda", "alpha")

# starting values for numerical optimisation
theta.star.vec <- c(-1.5, -1.5, -0.2, 0.2, rep(0, nr.covariates))
theta.star.mat <- matrix(0, ncol = length(theta.star.vec), nrow = nrow(lambdas_alphas_grid) + 1)
theta.star.mat[1,] <- theta.star.vec

bic.vec <- rep(NA, nrow(lambdas_alphas_grid))
aic.vec <- rep(NA, nrow(lambdas_alphas_grid))

# Note that running the grid search may take several days on a usual desktop computer.
# The results of the grid search are stored in an .RData file which is loaded in 
# line 215

for(i in 1:nrow(lambdas_alphas_grid)){
  # select current alpha and lambda
  cur.lambda <- lambdas_alphas_grid[i,]$lambda
  cur.alpha <- lambdas_alphas_grid[i,]$alpha
  
  theta.star <- c(-1.5, -1.5, -0.2, 0.2, rep(0, nr.covariates))
  # fit model
  mod <- nlm(L.sim.pen.elnet.appl, theta.star, x = data.list, N = 2, lambda = cur.lambda,
             alpha = cur.alpha, nr.covariates = ncol(X), print.level = 2, iterlim = 10000)
  
  loglik.temp <- L.sim.pen.elnet.appl(mod$estimate, data.list, 2, 0, cur.alpha, ncol(X))
  # calculate edf
  # if alpha = 0, i.e. in the ridge case, coefficients are not set to zero
  if(cur.alpha == 0) edf <- length(mod$estimate[5:length(mod$estimate)])
  else edf <- sum(abs(mod$estimate[5:length(mod$estimate)]) > 0.001)
  
  # compute AIC and BIC
  aic.vec[i] <- 2 * (loglik.temp) + (edf + 4) * 2
  bic.vec[i] <- 2 * (loglik.temp) + (edf + 4) * log(nr.obs)
  
  theta.star.mat[i + 1,] <- mod$estimate
}

# load results: one theta.star matrix for each alpha
load("hotshoe_results.RData")


### results of the application
cbPalette <- c("#999999", "#D55E00", "#E69F00", "#56B4E9", "#009E73", "#F0E442", "#0072B2", "#CC79A7")

# overview of AIC and BIC values for each alpha and lambda
df_aicbicplot <- data.frame(aic = aic.vec, bic = bic.vec, 
                            alpha = rep(alphas, each = 50),
                            lambda = rep(lambdas, 6))

# check the minimum AIC for each alpha
idx_minima <- df_aicbicplot %>% group_by(alpha) %>% summarise(which.min(aic)) %>% pull(2)

min_lambdas <- lambdas[idx_minima]
min_log_lambdas <- log(min_lambdas)


### the following code generates Figure 5 (top panel)
aicbic_plot <- ggplot(df_aicbicplot[df_aicbicplot$alpha > 0,], 
                      aes(x = log(lambda), y = aic, color = factor(alpha))) + geom_line(size = 1.1) +
  scale_colour_manual(name = "alpha", values = cbPalette) + geom_vline(xintercept = min_log_lambdas[-1],
                                                                       linetype = "dashed") +
  coord_cartesian(xlim = c(-0.5, 9), ylim = c(3665, 4000)) + xlab(expression(log(lambda))) + 
  ylab("AIC") + theme_minimal()

# select theta star mat for alpha = 0.2
theta.star.clean <- theta_star_mat_alpha02
# set coefficients <= 0.001 to zero
theta.star.clean[abs(theta.star.clean) <= 0.001] <- 0

# data frame for coefficient paths
df.plot.paths <- data.frame(rep(lambdas, 657) , c(theta.star.clean[1:(length(lambdas)), 5:ncol(theta.star.clean)]))
df.plot.paths$beta <- sort(rep(1:657, 50))
colnames(df.plot.paths) <- c("lambda", "estimate", "beta")

# vector with estimates at the opimum
theta.final.aic <- theta.star.clean[40,] 
names(theta.final.aic) <- c("gamma.1","gamma.2", "Icept.1", "Icept.2", colnames(data.list[[1]])[c(-length(colnames(data.list[[1]])), 
                                                                                                  -length(colnames(data.list[[1]])) + 1)])

### the following code generates Figure 5 (bottom panel)
plot.paths <- ggplot(df.plot.paths, aes(x = log(lambda), y = estimate, group = beta)) + 
  geom_line(alpha = 0.05) + xlab(expression(log(lambda))) + 
  ylab(expression(hat(beta[j]))) +
  coord_cartesian(ylim = c(-0.17, 0.17), xlim = c(log(0.49), 8)) +
  geom_abline(intercept = 0, slope = 0, linetype = "dashed") + 
  geom_vline(xintercept = log(lambdas[40]), linetype = "dashed") +
  # highlight effects
  annotate("text", x = 5.5, y = -0.1, label = "Jean-Marie\n Pfaff", color = "#F0E442", size = 3) +
  annotate("text", x = 1.5, y = -0.09, label = "Gnter Herrmann", color = "#0072B2", size = 3) +
  annotate("text", x = 1.6, y = 0.13, label = "home effect\n (not selected)", color = "darkred", size = 3) +
  geom_line(aes(x = log(lambdas), y = estimate, group = beta), df.plot.paths %>% filter(beta == 1),
            color = "darkred") +
  geom_line(aes(x = log(lambdas), y = estimate, group = beta), df.plot.paths %>% filter(beta == 545),
            color = "#F0E442") +
  geom_line(aes(x = log(lambdas), y = estimate, group = beta), df.plot.paths %>% filter(beta == 123),
            color = "#0072B2") + theme_minimal()

### Figure 5
plots_fin <- grid.arrange(grobs = list(aicbic_plot, plot.paths), ncol = 1)


