knitr::opts_chunk$set(eval = FALSE)
library(finetune)
#install.packages("finetune") #in case you are new to all this stuffs
library(tidyverse)
#install.packages("tidyverse")
library(tidymodels)
#install.packages("tidymodels")
library(doParallel)
#install.packages("doParallel") 
library(vip)
#install.packages("vip")
library(embed)
#install.packages("embed")

# Line 1 for parallelization: create a cluster using all but one core
cluster <- makeCluster(detectCores() - 1)
# Line 2 for parallelization: register it so the tuning functions can use it
registerDoParallel(cluster)

1 Case 6 - Hackathon

1.1 Load and Split da data

load("Case6-Spring23-Hackathon.RData")
# investigate how the predictors actually relate to price

ALL <- rbind(CARS_TRAIN, CARS_HOLDOUT)

ALL$luxury <- ifelse(ALL$company %in%
  c(
    "alfaromeo", "astonmartin", "bentley",
    "ferrari", "jaguar", "lamborghini", "maserati",
    "mercedesbenz", "maybach", "mclaren", "porsche", 
    "rollsroyce", "renault", "landrover", "lotus", 
    "peugeot", "tesla")
  , 1, 0)

ALL$new <- ifelse(ALL$kilometers <= 20921, 1, 0)
ALL$old <- ifelse(ALL$kilometers >= 210000, 1, 0)

# table(ALL$old)
# table(ALL$new)
# table(ALL$luxury)
TRAIN <- ALL[1:5000, ]
HOLDOUT <- ALL[5001:7853, ]
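# Quick optional sanity check (my addition, not in the original script):
# do the engineered flags line up with price the way we expect?
aggregate(price_in_aed ~ luxury, data = TRAIN, FUN = median)
aggregate(price_in_aed ~ new, data = TRAIN, FUN = median)
aggregate(price_in_aed ~ old, data = TRAIN, FUN = median)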

rec <- recipe(price_in_aed ~ ., data = TRAIN) %>%
  step_lencode_glm(all_nominal_predictors(), outcome = vars(price_in_aed)) #%>%
  #step_normalize(all_numeric_predictors()) # can toggle this to see if it makes a difference
  #step_normalize(all_predictors(), -kilometers)

#GGally::ggpairs(TRAIN[, c(1, 2, 6, 7, 18, 19, 20)])

#train <- juice(prep(rec)) # make sure that rec is good to go
#hold <- bake(prep(rec), HOLDOUT)
folds <- vfold_cv(TRAIN, v = 5) # create resampling folds in the training data
# racing may need more folds; more folds means less test data in each fold
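# Optional peek (my addition, not in the original): step_lencode_glm replaces
# each category with a per-level effect estimate; tidy() on the prepped recipe
# shows the values it learned for the nominal predictors.
prepped <- prep(rec)
tidy(prepped, number = 1)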

1.2 Off to the races! šŸ‡

# Same two parallelization lines as above, repeated in case this chunk is run on its own
cluster <- makeCluster(detectCores() - 1)
registerDoParallel(cluster)


xgb_spec <- boost_tree( #model spec basically showing what we wanna do    
    trees = tune(),
    min_n = tune(),
    mtry = tune(),
    learn_rate = tune(),
    sample_size = tune(),
    tree_depth = tune(),
    loss_reduction = tune()) %>% 
  set_engine("xgboost") %>% #see ?set_engine for a full list of possibilites
  set_mode("regression") #"classification


# Workflow
xgb_wf <- workflow() %>%  #add the preproc with the model spec
  add_recipe(rec) %>% 
  add_model(xgb_spec)


# this will take a while depending on processor speed and # of records
# With the shop_hq data it took upwards of 10 minutes to race the models!


xgb_grid <- grid_latin_hypercube( 
  # cover all bases in the ~7-dimensional space of possible hyperparameters
  trees(range = c(1700,2400)),
  tree_depth(range = c(4,20)),
  min_n(range = c(1,10)),
  loss_reduction(),
  sample_size = sample_prop(range = c(.4,.9)),
  mtry(range = c(4,12)),
  learn_rate(range = c(-4,-1)),
  size = 10
  )
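# Optional check (my addition): look at the sampled candidates before racing
# them; each row of xgb_grid is one combination drawn from the ranges above.
xgb_grid
summary(xgb_grid)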

xgb_params <- parameters( #for annealing
  trees(range = c(1700,2400)),
  tree_depth(range = c(4,20)),
  min_n(range = c(1,10)),
  loss_reduction(),
  sample_size = sample_prop(range = c(.2,.9)),
  mtry(range = c(4,12)),
  learn_rate(range = c(-4,-1))
  )

reg_xgb <- expand.grid( # the lowest-RMSE combination found so far, saved here so it can be re-fit
  trees = c(2047),
  tree_depth = c(16),
  min_n=c(3),
  loss_reduction= 0.000000226 ,
  sample_size = c(0.792),
  mtry= c(5),
  learn_rate = c(.00916)
  )

xgb_best <- tune_grid(
  object = xgb_wf,
  resamples = folds,
  metrics = metric_set(rmse),
  grid = reg_xgb # the single saved hyperparameter combination to evaluate
)
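# Optional (my addition): read off the resampled RMSE for the saved
# hyperparameter set evaluated above.
collect_metrics(xgb_best)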

xgb_rs <- tune_race_anova(
  object = xgb_wf,
  resamples = folds,
  metrics = metric_set(rmse),
  grid = xgb_grid, # the candidate hyperparameter combinations to race
  control = control_race(verbose_elim = TRUE)
)


xgb_anneal <- tune_sim_anneal(
  object = xgb_wf,
  resamples = folds,
  initial = xgb_rs, # switch this to xgb_anneal to continue a previous search
  param_info = xgb_params,
  iter = 50, # number of simulated-annealing iterations to run
  control = control_sim_anneal(verbose_iter = TRUE)
)
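# Continuing the search (my addition, sketching what the comment above means):
# feed the previous annealing results back in as the new starting point.
# `xgb_anneal2` is just a hypothetical name for the continued run.
xgb_anneal2 <- tune_sim_anneal(
  object = xgb_wf,
  resamples = folds,
  initial = xgb_anneal,
  param_info = xgb_params,
  iter = 25,
  control = control_sim_anneal(verbose_iter = TRUE)
)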


show_best(xgb_rs, metric = "rmse") # see the best models from the race
show_best(xgb_anneal, metric = "rmse")


autoplot(xgb_rs) # plot performance across the tuned parameters
autoplot(xgb_anneal)


plot_race(xgb_rs)+ #see the race in action lol 
  theme_bw()

xgb_last <- xgb_wf %>% # finalize with the chosen hyperparameters and refit on all of TRAIN
  finalize_workflow(select_by_one_std_err(xgb_rs, trees, metric = "rmse")) %>% # sorted by trees (simplest within one SE); swap in xgb_anneal if you `annealed`
  fit(TRAIN)

xgb_last %>%
  extract_fit_parsnip() %>%
  vip(geom = "point", num_features = 20)

predictions <- predict(
  xgb_last,
  new_data = HOLDOUT
)

SS <- data.frame(ID = 5001:7853, price_in_aed = predictions$.pred)
write.csv(SS, file = "XGBoostV2.csv", row.names = FALSE)
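# Housekeeping (my addition, not in the original): once the heavy tuning is
# done, release the parallel workers and fall back to sequential processing.
stopCluster(cluster)
registerDoSEQ()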
y.preds <- predict(xgb_last, new_data = TRAIN)$.pred # numeric vector of training-set predictions

diagnostic <- TRAIN
diagnostic$y_preds <- y.preds
diagnostic$y_true <- TRAIN$price_in_aed
diagnostic$error <- y.preds - TRAIN$price_in_aed
diagnostic$ABSerror <- abs(y.preds - TRAIN$price_in_aed)
ggplot(diagnostic, aes(y_preds, price_in_aed, color = kilometers))+
  geom_point()+
  geom_abline(slope = 1,intercept = 0, col = 'red')
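# Optional (my addition): summarize the in-sample fit numerically with
# yardstick. These are training-set numbers, so expect them to look rosier
# than the resampled RMSE from show_best().
train_metrics <- metric_set(rmse, mae)
train_metrics(diagnostic, truth = y_true, estimate = y_preds)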

IDK how I got this, but it was found more or less at random.


1.2.0.0.1 *Disclaimer*

This takes a long a$$ time to do, so you might wanna go on a fun walk or go shopping while your model is at the gym training away. No Pun Intended.
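Since the racing and annealing runs are the slow part, one thing you could do (my suggestion, not part of the original workflow) is cache those results to disk so a restarted R session doesn't mean re-tuning from scratch:

# file names here are just placeholders
saveRDS(xgb_rs, "xgb_rs.rds")
saveRDS(xgb_anneal, "xgb_anneal.rds")
# later: xgb_rs <- readRDS("xgb_rs.rds")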

Julia Silge provided most of the framework needed to get this .Rmd off the ground.

Original source found here.