
############# Beginning of File #################

# Project code written by Stefan Skinner @ stefansdatascience@gmail.com

##### establishing constants #####

the_variables_we_are_using <- as.data.frame(cbind(coffee_full_scaled, crude_full_scaled, silver_full_scaled, sugar_full_scaled, wheat_full_scaled))

features <- ncol(the_variables_we_are_using)

# gold_open_full <- Gold_dt$Gold_Open 

prediction <- 90

delay <- prediction

##### data frame setup #####

num_train_samples  <- floor(nrow(the_variables_we_are_using) * .3)
num_val_samples    <- floor((nrow(the_variables_we_are_using) * .5))
num_test_samples_1 <- floor((nrow(the_variables_we_are_using) - (num_train_samples + num_val_samples))/2)
num_test_samples_2 <- num_test_samples_1



train_df  <- as.data.frame(the_variables_we_are_using[seq(from = 1, length.out = num_train_samples), 1:features])
val_df    <- as.data.frame(the_variables_we_are_using[seq(from = num_train_samples + 1, length.out = num_val_samples), 1:features])
test_1_df <- as.data.frame(the_variables_we_are_using[seq(from = (num_train_samples + num_val_samples + 1), length.out = num_test_samples_1), 1:features])
# test_2_df <- as.data.frame(the_variables_we_are_using[seq(from = (num_train_samples + num_val_samples + num_test_samples_1 + 1), length.out = num_test_samples_2), 1:features])


train_df_dates <- as.data.frame(Gold_dt$Gold_Date[seq(1:num_train_samples)])
val_df_dates   <- as.data.frame(Gold_dt$Gold_Date[seq(from = num_train_samples + 1, length.out = num_val_samples)])
test_1_df_dates  <- as.data.frame(Gold_dt$Gold_Date[seq(from = num_train_samples + num_val_samples + 1, length.out = num_test_samples_1)])
test_2_df_dates  <- as.data.frame(Gold_dt$Gold_Date[seq(from = num_train_samples + num_val_samples + num_test_samples_1 + 1, length.out = num_test_samples_2)])


##### Lagging and array creation #####

x_train_data <- list()

for(i in 1:features)
{
  x_train_data[[i]] <- t(sapply(1:(nrow(train_df) - delay - prediction + 1),
                            function(x) train_df[x:(x + delay - 1), i]
                  ))

} # end for

x_train_arr <- array(
  data = as.numeric(unlist(x_train_data)),
  dim = c(
    nrow(x_train_data[[1]]),
    prediction,
    features
  )
)


gold_open_scaled_train <- the_variables_we_are_using[seq(from = 1, length.out = num_train_samples), 1]

y_train_data <- t(sapply(
  (1 + delay):(length(gold_open_scaled_train) - prediction + 1),
  function(x) gold_open_scaled_train[x:(x + prediction - 1)]
))

y_train_arr <- array(
  data = as.numeric(unlist(y_train_data)),
  dim = c(
    nrow(y_train_data),
    prediction,
    1
  )
)


##### val setup

x_val_data <- list()

for(i in 1:features)
{
  x_val_data[[i]]  <- t(sapply(1:(nrow(val_df) - delay - prediction + 1),
                           function(x) val_df[x:(x + delay - 1), i]
  ))

} # end for

x_val_arr <- array(
  data = as.numeric(unlist(x_val_data)),
  dim = c(
    nrow(x_val_data[[1]]),
    prediction,
    features
  )
)

gold_open_scaled_val <- the_variables_we_are_using[seq(from = (num_train_samples + 1), length.out = num_val_samples), 1]

y_val_data <- t(sapply((1 + delay):(length(gold_open_scaled_val) - prediction + 1),
                  function(x) gold_open_scaled_val[x:(x + prediction - 1)]
  ))


y_val_arr <- array(
  data = as.numeric(unlist(y_val_data)),
  dim = c(
    nrow(y_val_data),
    prediction,
    1
  )
)


##### test_1 setup

x_test_1_data <- list() 

for(i in 1:features)
{  
     x_test_1_data[[i]] <-  t(sapply(1:(nrow(test_1_df) - delay - prediction + 1),
                                  function(x) test_1_df[x:(x + delay + 1),i]
  ))

} # end for

x_test_1_arr <- array(
  data = as.numeric(unlist(x_test_1_data)),
  dim = c(
    nrow(x_test_1_data[[1]]),
    prediction,
    features
  )
) # array

gold_open_scaled_test_1 <- the_variables_we_are_using[seq(from = (num_train_samples + num_val_samples + 1), length.out = num_test_samples_1), 1]

y_test_1_data <- t(sapply((1 + delay):(length(gold_open_scaled_test_1) - prediction + 1),
                            function(x) gold_open_scaled_test_1[x:(x + prediction - 1)]
  ))


y_test_1_arr <- array(
  data = as.numeric(unlist(y_test_1_data)),
  dim = c(
    nrow(y_test_1_data),
    prediction,
    1
  )
) # end array


##### test_2 setup
# 
# x_test_2_data <- list()
# 
# for(i in 1:features)
# {
#   x_test_2_data[[i]] <- t(sapply(1:(nrow(test_2_df) - delay - prediction + 1),
#                             function(x) test_2_df[x:(x + delay + 1), i]
#   ))
# } # end for
# 
# x_test_2_arr <- array(
#   data = as.numeric(unlist(x_test_2_data)),
#   dim = c(
#     nrow(x_test_2_data[[1]]),
#     prediction,
#     features
#   )
# )
# 
# gold_open_scaled_test_2 <- the_variables_we_are_using[seq(from = (num_train_samples + num_val_samples + num_test_samples_1 + 1), length.out = num_test_samples_2), 1] 
# 
# y_test_2_data <- t(sapply(
#   (1 + delay):(length(gold_open_scaled_test_2) - prediction + 1),
#   function(x) gold_open_scaled_test_2[x:(x + prediction - 1)]
# ))
# 
# y_test_2_arr <- array(
#   data = as.numeric(unlist(y_test_2_data)),
#   dim = c(
#     nrow(y_test_2_data),
#     prediction,
#     1
#   )
# )


##### Model Section #####

# keras::reset_states(lstm_model)
# rm(lstm_model)

lstm_model <- keras_model_sequential()

# , return_sequences = TRUE

lstm_model %>%
  layer_lstm(units = features, batch_input_shape = c(delay, prediction, features), return_sequences = TRUE) %>%
  layer_dropout(rate = 0.3) %>%  
  # layer_lstm(units = features, return_sequences = TRUE) %>%
  # layer_dropout(rate = 0.1) %>%
  # layer_lstm(units = features, return_sequences = TRUE) %>%
  # layer_dropout(rate = 0.1) %>%
  layer_lstm(units = features) %>%
  layer_dropout(rate = 0.3) %>%
  layer_dense(1) # end lstm_model


callbacks <- list(
  callback_early_stopping(monitor = "val_loss", patience = 15),

  callback_model_checkpoint(filepath = "myModel_{epoch}", monitor = "val_loss", save_best_only = TRUE)

) # end callback list


lstm_model %>%
  compile(loss = 'mse', optimizer = 'rmsprop', metrics = 'mae') 

lstm_model_object <- lstm_model %>% fit(
  x = x_train_arr,
  y = y_train_arr,
  epochs = 30,
  shuffle = FALSE,
  validation_data = list(x_val_arr, y_val_arr),
  callbacks = callbacks
)

# lstm_model <- load_model_tf("/hdd_storage/cuda/LSTM projects/multivariate/Gold_Open_LSTM_multivariate/Gold_Open_LSTM_multivariate/myModel_26")

# plot(lstm_model_object)

# model_summary <-  as.data.table(summary(lstm_model))

# write_csv2(model_summary, "lstm_model_summary.csv")

# save_model_tf(lstm_model, "")

##### Prediction Setup and predictions #####

x_pred_arr <- array(
  data = as.numeric(unlist(test_1_df[(nrow(test_1_df)-delay):nrow(test_1_df),])),
  dim = c(
    delay,  
    prediction,
    features
  )
)

predictions <- lstm_model %>% predict(x_pred_arr)

gold_test_1 <- Gold_dt$Gold_Open[seq(from = (num_train_samples + num_val_samples + 1), length.out = num_test_samples_1)]

predictions <- (predictions * sd(gold_test_1)) + mean(gold_test_1)

prediction_comparision <- as.data.frame(Gold_dt$Gold_Open[seq(from = (num_train_samples + num_val_samples + num_test_samples_1 + 1), length.out = prediction)])

paste("The average difference between LSTM predictions and what actually occured is: ", round(sum(abs(predictions - prediction_comparision))/prediction,2))

# comparison <- cbind(round(predictions,1), prediction_comparision)
# 
# colnames(comparison) <- c("Predictions", "Actual")
# 
write_csv2(comparison, "comparison.csv")


##### Visualization #####

predictions_w_trading_days <- as.data.frame(matrix(ncol = 2, nrow = (prediction)))

predictions_w_trading_days[,1] <- test_2_df_dates[1:(prediction),]

predictions_w_trading_days[,2] <- round(predictions[1:(prediction)],2)

colnames(predictions_w_trading_days) <- c("Dates", "Predictions")


pred_ggplot <- ggplot(predictions_w_trading_days, aes(x = Dates, y = Predictions), --embed-resources, --standalone) +
        geom_point(aes(Dates, Predictions), color = "#18DE83") +
        geom_line(color = "#18DE83") +
        #geom_smooth(color = "cornflowerblue", se = FALSE) +
        scale_x_date(date_labels = "%Y-%m-%d") +
        theme(axis.text.x = element_text(angle = 30, hjust = 1)) +
        ggtitle("LSTM Gold Open Price Prediction, 0.* dropout * * w/ adam optimizer, 30 epochs, and 0.*-0.* data split & an average difference of $**.**") +
        ylab("Predicted Prices $") 

ggplotly(pred_ggplot)


# ----- # 

prediction_comparison_ggplot_df <- as.data.frame(matrix(ncol = 2, nrow = prediction))

start_test_2 <- (num_train_samples + num_val_samples + num_test_samples_1 + 1)

prediction_comparison_ggplot_df[,1] <- test_2_df_dates[1:prediction,]  
prediction_comparison_ggplot_df[,2] <- round(Gold_dt$Gold_Open[start_test_2:((start_test_2-1)+prediction)],2)



colnames(prediction_comparison_ggplot_df) <- c("Dates", "Actual") 

prediction_comparison_ggplot <- ggplot(prediction_comparison_ggplot_df, aes(Dates, Actual), --embed-resources, --standalone) +
                    geom_point(color = "royalblue") +
                    geom_line(color = "forestgreen") +
                    geom_smooth(color = "cornflowerblue", se = FALSE) +
                    scale_x_date(date_labels = "%Y-%m-%d") +
                    theme(axis.text.x = element_text(angle = 30, hjust = 1))  +
                    ggtitle("LSTM Gold Open Price (Historical, not prediction)") +
                    ylab("Actual Historical Prices $") 

ggplotly(prediction_comparison_ggplot)



#-----

data_used_for_prediction_df <- as.data.frame(matrix(ncol = 2, nrow = prediction))

data_used_for_prediction_df[,1] <- test_1_df_dates[(nrow(test_1_df_dates)-delay+1):nrow(test_1_df_dates)]

data_used_for_prediction_df[,2] <- round(test_1_df[(nrow(test_1_df)-delay+1):nrow(test_1_df)],2)

colnames(data_used_for_prediction_df) <- c("Dates", "Actual") 

data_used_for_prediction <- ggplot(data_used_for_prediction_df, aes(Dates, Actual), --embed-resources, --standalone) +
  geom_point(color = "forestgreen") +
  geom_line(color = "royalblue") +
  geom_smooth(color = "cornflowerblue", se = FALSE) +
  scale_x_date(date_labels = "%Y-%m-%d") +
  theme(axis.text.x = element_text(angle = 30, hjust = 1))  +
  ggtitle("Data used for Prediction, LSTM Gold Open Price (Historical, not prediction)") +
  ylab("Actual Historical Prices $") 


ggplotly(data_used_for_prediction)


#-----

##### Forecast #####


# df1 <- data.frame(time = test_1_df_dates, data = test_1_df$V1, metadata = "historical data")
# 
# colnames(df1) <- c("time", "data", "metadata")
# 
# df2 <- data.frame(time = test_2_df_dates[1:prediction], data = predictions, metadata = "predictions")
# 
# colnames(df2) <- c("time", "data", "metadata")
# 
# df = rbind(df1, df2)
# 
# pred_plot_w_hist <- ggplot(df, aes(x = time, y = data, color = metadata)) + geom_line() + geom_smooth() + scale_x_date(date_labels = "%Y-%m-%d")
# 
# ggplotly(pred_plot_w_hist)
# 
# 
# historical_gold_full_ggplot <- ggplot(Gold_dt, aes(Gold_Date, Gold_Open), --embed-resources, --standalone) +
#   geom_point(color = "green", size = 0.25) +
#   geom_line(color = "purple") +
#   ylab("Prices $") +
#   xlab("Dates") +
#   scale_x_date(date_labels = "%Y-%m-%d") +
#   theme(axis.text.x = element_text(angle = 30, hjust = 1))  +
#   ggtitle("Gold Open Prices (Historical, not prediction)")
# 
# ggplotly(historical_gold_full_ggplot)



############# End of File #################

print(paste("End of File"))

############# EOF #################

