library(tidyverse)

# Set working directory.
#setwd(".../Code_and_data_v1/")

# Ropes functions:
source("Algorithm/BAwR_internalROPES.R")
source("Algorithm/ropes.R")

# Data:
df_bpm <- read.csv("BPM/df_bpm.csv")
length(unique(df_bpm$player_link))
#[1] 3075

# Our ultimate goal will be to predict the BPM in the season 2017-2018, for those players who played 
# at least in one season before the season 2017-2018 and also played in the season 2017-2018 itself.

# So, first filter the players who belong to this test set:
df_bpm1 <- df_bpm %>%
  filter(season != "2018")
df_bpm2 <- df_bpm %>%
  filter(season == "2018")
players_test <- intersect(df_bpm1$player_link, df_bpm2$player_link)
length(players_test)
#[1] 385

# Training and validation data frame:
df_tr_va <- df_bpm %>%
  filter(!player_link %in% players_test)
length(unique(df_tr_va$player_link))
#[1] 2690 # 3075 - 385 = 2690

# Test data frame:
df_test <- df_bpm2 %>%
  filter(player_link %in% players_test) 

intersect(df_tr_va$player_link, df_test$player_link)
#character(0)

# Create folds for cross-validation:
n_folds <- 10 # 5
df_tr_va$fold = cut(1:nrow(df_tr_va), breaks = n_folds, labels = FALSE)
#table(df_tr_va$fold)

# Load b and w matrices:
load("BPM/b_bpm.RData")
load("BPM/w_bpm.RData")

# Filter in b and w the training and validation players:
b_bpm_tr_va <- b_bpm[rownames(b_bpm) %in% unique(df_tr_va$player_link),]
w_bpm_tr_va <- w_bpm[rownames(w_bpm) %in% unique(df_tr_va$player_link),]

# Optimize only one parameter, setting the other two to some rather small values:
lambda2Ages <- seq(0,1000,100) 
lambda1Ages <- 0
lambda0Ages <- 10
combs <- expand.grid(lambda2Ages = lambda2Ages, lambda1Ages = lambda1Ages, lambda0Ages = lambda0Ages)
name_cols <- sapply(1:nrow(combs), function(i, x) paste(x[i,1], x[i,2], x[i,3], sep = "_"), combs)

# MSE matrix:
mse_mat <- matrix(NA, nrow = n_folds, ncol = nrow(combs))
colnames(mse_mat) <- name_cols
# MAE matrix:
mae_mat <- matrix(NA, nrow = n_folds, ncol = nrow(combs))
colnames(mae_mat) <- name_cols

# Run ropes for cross-validation:
s0 <- Sys.time()
for (i in 1:n_folds) {
  cat(paste("Starting fold:", i, "\n"))
  for (j in 1:ncol(mse_mat)) {
    cat(paste("Starting lambda combination:", j, "\n"))
    
    df_tr_va_fold <- df_tr_va %>%
      filter(fold == i)
  
    # Filter in b and w the training and validation players:
    b_bpm_tr_va_fold <- b_bpm_tr_va[rownames(b_bpm_tr_va) %in% unique(df_tr_va_fold$player_link),]
    w_bpm_tr_va_fold <- w_bpm_tr_va[rownames(w_bpm_tr_va) %in% unique(df_tr_va_fold$player_link),]
  
    # Divide the training and validation set with 60% and 40% rows:
    set.seed(2018)
    sample_train <- sample(1:nrow(df_tr_va_fold), size = floor(0.6 * nrow(df_tr_va_fold)))
    df_train <- df_tr_va_fold[sample_train,]
    df_valid <- df_tr_va_fold[-sample_train,]
    
    # For the validation players, replace in 'b' the BPM value by NA, and in 'w' the 1 value by 0:
    for (k in 1:nrow(df_valid)) {
      b_bpm_tr_va_fold[rownames(b_bpm_tr_va_fold) %in% df_valid$player_link[k], 
                       colnames(b_bpm_tr_va_fold) %in% df_valid$age[k]] <- NA
      
      w_bpm_tr_va_fold[rownames(w_bpm_tr_va_fold) %in% df_valid$player_link[k], 
                       colnames(w_bpm_tr_va_fold) %in% df_valid$age[k]] <- 0
    }  
    
    # To count the number of non-NA-values in each row of b:
    nonNA_rows <- apply(b_bpm_tr_va_fold, 1, FUN = function(x) length(x[!is.na(x)]) )
    pl_na <- c(which(nonNA_rows == 0))#, which(nonNA_rows == 1))
    
    # Remove the rows with all NAs (otherwise ropes won't work): 
    b_bpm_tr_va_fold1 <- b_bpm_tr_va_fold[!rownames(b_bpm_tr_va_fold) %in% names(pl_na),]
    b_bpm_tr_va_fold2 <- as.matrix(b_bpm_tr_va_fold1)
    
    w_bpm_tr_va_fold1 <- w_bpm_tr_va_fold[!rownames(w_bpm_tr_va_fold) %in% names(pl_na),]
    w_bpm_tr_va_fold2 <- as.matrix(w_bpm_tr_va_fold1)
    
    df_valid1 <- df_valid %>%
      filter(!player_link %in% names(pl_na))
    
    result <- ropes(b_bpm_tr_va_fold2, w_bpm_tr_va_fold2, 
                    lambda2Ages = combs[j,1], lambda1Ages = combs[j,2], lambda0Ages = combs[j,3], 
                    lambda0Inds = 0.1, trace = 0, method = "cg")
    
    preds <- result$Z
    dimnames(preds) <- dimnames(b_bpm_tr_va_fold2)
    
    for (l in 1:nrow(df_valid1)) {
      df_valid1$bpm_pred[l] <- preds[rownames(preds) %in% df_valid1$player_link[l], 
                                     colnames(preds) %in% df_valid1$age[l]]
    }
    
    mse_mat[i,j] <- round(mean( abs(df_valid1$bpm_pred - df_valid1$bpm) ), 2)
    cat("\n")
    cat("MSE:")
    print(mse_mat[i,j])
    
    mae_mat[i,j] <- round(mean( (df_valid1$bpm_pred - df_valid1$bpm)^2 ), 2)
    cat("\n")
    cat("MAE:")
    print(mae_mat[i,j])
  }
}  
s1 <- Sys.time() - s0
s1
#Time difference of 5.51849 hours

mse_mat
mae_mat

# Calculate average across folds:
mse_ave <- apply(mse_mat, 2, "mean")
mse_ave
names(which.min(mse_ave))
#[1] "900_0_10"
mae_ave <- apply(mae_mat, 2, "mean")
mae_ave
names(which.min(mae_ave))
#[1] "1000_0_10"

save(mse_mat, file = "Validation/mse_mat_lambda2.RData")
save(mae_mat, file = "Validation/mae_mat_lambda2.RData")

# Plot:
df <- data.frame(combs = factor(names(mse_ave), levels = names(mse_ave)), 
                 mse_ave = as.vector(mse_ave),
                 minimum_mse = c(rep("No", 9), "Yes", "No"))
ggplot(data = df, aes(x = combs, y = mse_ave, fill = minimum_mse)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = mse_ave), vjust = -0.3, size = 2) +
  labs(x = "", y = "MSE") +
  theme(axis.text.x = element_text(angle = 45))
ggsave("Validation/mse_lambda2.eps", width = 6, height = 4)
