library(tidyverse)

# Set working directory:
#setwd("~/.../Code_and_Data_v1")

# Data: read the BPM table and report how many distinct players
# (identified by player_link) it contains.
df_bpm <- read.csv(file = "BPM/df_bpm.csv")
df_bpm$player_link %>%
  unique() %>%
  length()
#[1] 3075

# Our ultimate goal is to predict the BPM in the 2017-2018 season for those players who played
# in at least one season other than 2017-2018 and who also played in the 2017-2018 season itself.

# Identify the test-set players: split the rows into those whose season is
# not "2018" and those whose season is "2018", then keep the players
# (player_link) that appear in both parts.
df_bpm1 <- filter(df_bpm, season != "2018")
df_bpm2 <- filter(df_bpm, season == "2018")
players_test <- df_bpm1$player_link %>%
  intersect(df_bpm2$player_link)
length(players_test)
#[1] 385

# Test data frame: the season-"2018" rows of the players in the test set.
df_test <- filter(df_bpm2, player_link %in% players_test)

# Load b and w matrices. load() restores the objects stored in each file into
# the workspace; the code below reads b_bpm (rows matched on player_link,
# columns matched on age). Presumably the second file provides w_bpm, which
# is not used in the visible part of this script -- TODO confirm.
load(file = "BPM/b_bpm.RData")
load(file = "BPM/w_bpm.RData")

# Average method:
# For every test row, predict the BPM as the mean of the player's non-missing
# BPM values in the columns of b_bpm that come before the column of the age to
# predict. Rows of b_bpm are matched on player_link and columns on age.
df_test$bpm_pred <- NA_real_
for (i in seq_len(nrow(df_test))) {
  # Position of the column holding the age to predict.
  age_pred <- which(colnames(b_bpm) %in% df_test$age[i])

  # Number of columns strictly before the age to predict. With seq_len() this
  # gives an empty selection when that age is the first column or is not a
  # column of b_bpm at all; 1:(age_pred - 1) would instead give c(1, 0) and
  # pick the target column itself (leaking the value to predict), or fail on
  # a zero-length index.
  n_prior <- if (length(age_pred) > 0) age_pred[1] - 1 else 0

  # drop = FALSE keeps the two-dimensional shape whatever the number of
  # selected rows/columns, so the conversion below always behaves the same.
  bpm_values <- b_bpm[
    rownames(b_bpm) %in% df_test$player_link[i],
    seq_len(n_prior),
    drop = FALSE
  ]
  bpm_values1 <- as.vector(as.matrix(bpm_values))

  # There might be cases where there are only NAs before the age to predict.
  # For example, when the age to predict is the first non-NA of the time series.
  # The same branch covers an empty selection (no earlier column, or no row
  # for this player), since all() of an empty vector is TRUE.
  if (all(is.na(bpm_values1))) {
    df_test$bpm_pred[i] <- NA_real_
  } else {
    df_test$bpm_pred[i] <- round(mean(bpm_values1, na.rm = TRUE), 2)
  }
}

# Results:
# Mean and standard deviation of the observed BPM in the test set.
mean(df_test[["bpm"]])
#[1] -0.9106494
sd(df_test[["bpm"]])
#[1] 3.313776

# Same summaries for the predictions, ignoring rows without a prediction.
mean(df_test[["bpm_pred"]], na.rm = TRUE)
#[1] -1.153844
sd(df_test[["bpm_pred"]], na.rm = TRUE)
#[1] 2.717292

# Absolute (Dif) and squared (Dif2) prediction errors, rounded to 2 decimals.
df_test <- df_test %>%
  mutate(
    Dif = round(abs(bpm_pred - bpm), 2),
    Dif2 = round((bpm_pred - bpm)^2, 2)
  )

# Mean and standard deviation of the rounded squared errors.
mean(df_test[["Dif2"]], na.rm = TRUE)
#[1] 7.587662
sd(df_test[["Dif2"]], na.rm = TRUE)
#[1] 15.41463

# Save the vector of predictions (one entry per row of df_test, NA where no
# prediction could be made):
pred_average <- df_test[["bpm_pred"]]
save(list = "pred_average", file = "Validation/pred_average.RData")
