library(tidyverse)

# Set working directory:
#setwd(".../Code_and_data_v1/")

# Load the raw advanced-stats history:
df0 <- read_csv("df_adv_hist.csv")

# Drop rows with very few games played. They produce extreme BPM outliers,
# such as Kuzminskas (-49.3) or Muresan (-86.7):
#   https://www.basketball-reference.com/players/k/kuzmimi01.html
#   https://www.basketball-reference.com/players/m/muresgh01.html
# With g > 5 the greatest BPM is still Ben Davis 1999, which is not in line
# with https://www.basketball-reference.com/about/bpm.html, where LeBron and
# Jordan are the best (that page is not fully updated: Westbrook got the best
# BPM in 2017). Using g > 10 instead would also filter out Ben Davis.
df1 <- filter(df0, g > 5)

# Keep `link` together with `player`: some different players share a name, so
# spread() would not work on the name alone. Some players have NA in their
# link, so those links have to be added manually.
df1 <- select(df1, player, link, bpm, age, g, season)

# Correct missing links: collect the distinct names of the players whose link
# is NA (in order of first appearance) and show them.
link_miss_players <- df1 %>%
  filter(is.na(link)) %>%
  pull(player) %>%
  unique()
link_miss_players
#[1] "Ray Allen*"            "Steve Nash*"           "Grant Hill*"           "Jason Kidd*"          
#[5] "Tracy McGrady*"        "Shaquille O'Neal*"     "Allen Iverson*"        "Yao Ming*"            
#[9] "Alonzo Mourning*"      "Dikembe Mutombo*"      "Gary Payton*"          "Reggie Miller*"       
#[13] "Karl Malone*"          "Scottie Pippen*"       "Michael Jordan*"       "David Robinson*"      
#[17] "Arvydas Sabonis*"      "John Stockton*"        "Patrick Ewing*"        "Hakeem Olajuwon*"     
#[21] "Mitch Richmond*"       "Chris Mullin*"         "Charles Barkley*"      "Dennis Rodman*"       
#[25] "Joe Dumars*"           "Dominique Wilkins*"    "Clyde Drexler*"        "Sarunas Marciulionis*"
#[29] "Robert Parish*"        "Dino Radja*"           "Magic Johnson*"        "Moses Malone*"        
#[33] "Isiah Thomas*"         "James Worthy*"         "Maurice Cheeks*"       "Bernard King*"        
#[37] "Kevin McHale*"         "Drazen Petrovic*"      "Larry Bird*"           "Alex English*"        
#[41] "Ralph Sampson*"        "Adrian Dantley*"       "Dennis Johnson*"       "Kareem Abdul-Jabbar*" 
#[45] "Artis Gilmore*"        "Julius Erving*"        "George Gervin*"        "Bob McAdoo*"          
#[49] "Bill Walton*"          "Jamaal Wilkes*"        "Dan Issel*"            "Tiny Archibald*"      
#[53] "Elvin Hayes*"          "Bob Lanier*"           "David Thompson*"       "Dave Cowens*"         
#[57] "Spencer Haywood*"      "Calvin Murphy*"        "George McGinnis*"      "Wes Unseld*"          
#[61] "Jo Jo White*"          "Rick Barry*"           "Phil Jackson*"         "Pete Maravich*"       
#[65] "Earl Monroe*"          "Charlie Scott*"        "Louie Dampier*"        "Walt Frazier*"        
#[69] "Gail Goodrich*"        "Dave Bing*"            "John Havlicek*"        "Bill Bradley*"        
#[73] "Mel Daniels*"          "Nate Thurmond*"        "Billy Cunningham*"     "Connie Hawkins*"      
#[77] "Don Nelson*"           "Pat Riley*"            "Jerry Sloan*"          "Zelmo Beaty*"         
#[81] "Chet Walker*"          "Lenny Wilkens*"        "Walt Bellamy*"         "Dave DeBusschere*"    
#[85] "Jerry Lucas*"          "Willis Reed*"          "Oscar Robertson*"      "Jerry West*" 

# Hand-written Basketball-Reference links for the players with a missing link.
# The entries are aligned BY POSITION with the `link_miss_players` listing
# printed above (four per row, same order), so entry i belongs to player i.
# Only the part between the last "/" and ".html" (the player ID) is used later.
# Every entry uses the same "/players/<letter>/<id>.html" form; entry 10
# (Dikembe Mutombo) previously held a garbled path and is now "mutomdi01".
link_miss <- c("/players/a/allenra02.html", "/players/n/nashst01.html", "/players/h/hillgr01.html", "/players/k/kiddja01.html", 
               "/players/m/mcgratr01.html", "/players/o/onealsh01.html", "/players/i/iversal01.html", "/players/m/mingya01.html", 
               "/players/m/mournal01.html", "/players/m/mutomdi01.html", "/players/p/paytoga01.html", "/players/m/millere01.html", 
               "/players/m/malonka01.html", "/players/p/pippesc01.html", "/players/j/jordami01.html", "/players/r/robinda01.html", 
               "/players/s/sabonar01.html", "/players/s/stockjo01.html", "/players/e/ewingpa01.html", "/players/o/olajuha01.html",
               "/players/r/richmmi01.html", "/players/m/mullich01.html", "/players/b/barklch01.html", "/players/r/rodmade01.html", 
               "/players/d/dumarjo01.html", "/players/w/wilkido01.html", "/players/d/drexlcl01.html", "/players/m/marcisa01.html", 
               "/players/p/parisro01.html", "/players/r/radjadi01.html", "/players/j/johnsma02.html", "/players/m/malonmo01.html", 
               "/players/t/thomais01.html", "/players/w/worthja01.html", "/players/c/cheekma01.html", "/players/k/kingbe01.html", 
               "/players/m/mchalke01.html", "/players/p/petrodr01.html", "/players/b/birdla01.html", "/players/e/englial01.html",
               "/players/s/sampsra01.html", "/players/d/dantlad01.html", "/players/j/johnsde01.html", "/players/a/abdulka01.html", 
               "/players/g/gilmoar01.html", "/players/e/ervinju01.html", "/players/g/gervige01.html", "/players/m/mcadobo01.html", 
               "/players/w/waltobi01.html", "/players/w/wilkeja01.html", "/players/i/isselda01.html", "/players/a/architi01.html", 
               "/players/h/hayesel01.html", "/players/l/laniebo01.html", "/players/t/thompda01.html", "/players/c/cowenda01.html", 
               "/players/h/haywosp01.html", "/players/m/murphca01.html", "/players/m/mcginge01.html", "/players/u/unselwe01.html",
               "/players/w/whitejo01.html", "/players/b/barryri01.html", "/players/j/jacksph01.html", "/players/m/maravpe01.html", 
               "/players/m/monroea01.html", "/players/s/scottch01.html", "/players/d/dampilo01.html", "/players/f/fraziwa01.html", 
               "/players/g/goodrga01.html", "/players/b/bingda01.html", "/players/h/havlijo01.html", "/players/b/bradlbi01.html", 
               "/players/d/danieme01.html", "/players/t/thurmna01.html", "/players/c/cunnibi01.html", "/players/h/hawkico01.html", 
               "/players/n/nelsodo01.html", "/players/r/rileypa01.html", "/players/s/sloanje01.html", "/players/b/beatyze01.html",
               "/players/w/walkech01.html", "/players/w/wilkele01.html", "/players/b/bellawa01.html", "/players/d/debusda01.html", 
               "/players/l/lucasje01.html", "/players/r/reedwi01.html", "/players/r/roberos01.html", "/players/w/westje01.html")

#df1_copy <- df1
# Fill in the missing links. `link_miss` is matched to `link_miss_players` by
# position, so both vectors must have the same length; stop with an error
# otherwise instead of assigning NA or misaligned links.
stopifnot(length(link_miss) == length(link_miss_players))
# Only rows whose link is actually NA are filled in; rows that already carry a
# link are left untouched. match() looks up each player's position in
# `link_miss_players`, which also works when there are no NA links at all
# (unlike 1:length(x), which iterates over c(1, 0) for an empty vector).
link_is_na <- is.na(df1$link)
df1$link[link_is_na] <-
  link_miss[match(df1$player[link_is_na], link_miss_players)]


# Create b and w matrices for the ropes algorithm.
# First reduce each link to its player ID, i.e. the text between the last "/"
# and the following ".", then merge name and ID into a single `player_link`
# key (joined with unite()'s default separator). Regex taken from:
# https://stackoverflow.com/questions/23518325/how-to-extract-substring-between-symbol-and
# Example:
#gsub(".*[//]([^.]+)[.].*", "\\1", df1$link[1])
#[1] "abrinal01"
df2 <- df1
df2$link <- gsub(".*[//]([^.]+)[.].*", "\\1", df2$link)
df2 <- unite(df2, player_link, player, link)
range(df2$age)
#[1] 18 43

# Keep the long table (with g and season) for later use:
df2_copy <- df2

# Wide table: one row per player_link, one column per age, cells hold bpm.
df2 <- select(df2, -c(g, season))
b_aux <- spread(df2, key = age, value = bpm)

# Move player_link from a column into the row names, leaving only age columns.
b_aux1 <- as.data.frame(b_aux)
rownames(b_aux1) <- b_aux1[["player_link"]]
b <- b_aux1[, setdiff(colnames(b_aux1), "player_link")]

# w has the shape of b: 1 where b has a value, 0 where b is NA.
w <- b
w[] <- lapply(b, function(bpm_col) as.numeric(!is.na(bpm_col)))
dim(b)
dim(w)
#[1] 3075   26
#[1] 3075   26

# Inspect the oldest ages (40 to 43): show only the players that have at least
# one non-NA value in those columns.
old_ages <- b[, intersect(colnames(b), as.character(40:43))]
old_ages[rowSums(is.na(old_ages)) < 4, ]
#                                 40  41  42  43
#Charles Jones_jonesch01         0.2  NA  NA  NA
#Charles Oakley_oaklech01        0.1  NA  NA  NA
#Clifford Robinson_robincl02     0.3  NA  NA  NA
#Dikembe Mutombo*_mutomdi01      5.2 2.3 0.2  NA
#Grant Hill*_hillgr01           -0.1  NA  NA  NA
#Herb Williams_willihe01         0.1  NA  NA  NA
#James Edwards_edwarja01        -0.2  NA  NA  NA
#Jason Terry_terryja01           0.7  NA  NA  NA
#John Long_longjo01              0.1  NA  NA  NA
#John Stockton*_stockjo01        9.0  NA  NA  NA
#Kareem Abdul-Jabbar*_abdulka01  5.3 2.9  NA  NA
#Karl Malone*_malonka01          4.3  NA  NA  NA
#Kevin Willis_willike02          1.6 0.9 0.0  NA
#Kurt Thomas_thomaku01           1.1  NA  NA  NA
#Manu Ginobili_ginobma01         2.2  NA  NA  NA
#Rick Mahorn_mahorri01           0.0  NA  NA  NA
#Robert Parish*_parisro01        4.0 1.4 1.7 0.9
#Vince Carter_cartevi01          4.0 1.2  NA  NA

# Restrict both matrices to the age columns 18 to 40 and store each one in its
# own .RData file. b and w share the same column names, so one mask serves both.
keep_ages <- colnames(b) %in% 18:40
b_bpm <- b[, keep_ages]
w_bpm <- w[, keep_ages]
save(b_bpm, file = "BPM/b_bpm.RData")
save(w_bpm, file = "BPM/w_bpm.RData")

# Long-format BPM table for ages up to 40 (the same age cut as b_bpm/w_bpm),
# built from the copy that still has the season column. Rows are sorted by
# player and then from oldest to youngest age.
#df_bpm <- df2 %>%
df_bpm <- df2_copy %>%
  filter(age < 41) %>%
  select(-g) %>%
  arrange(player_link, desc(age))
# The output file is passed positionally: the `path` argument of write_csv()
# is deprecated in recent readr versions (renamed to `file`), while the
# positional form works with both old and new versions.
write_csv(df_bpm, "BPM/df_bpm.csv")
# Check for NAs in the data (each table should show only FALSE):
table(is.na(df_bpm$bpm))
table(is.na(df_bpm$age))
table(is.na(df_bpm$player_link))

# Background on BPM:
# https://www.basketball-reference.com/about/bpm.html
# https://www.sports-reference.com/blog/2014/10/introducing-box-plusminus-bpm-2/
range(df_bpm$bpm)
#[1] -31.5  21.0
# Player-seasons with a BPM above 12, best first:
df_bpm[, c("player_link", "bpm", "season")] %>%
  filter(bpm > 12) %>%
  arrange(desc(bpm))
