#' ===================================================================================
#' Author: Tom Herman, Research and Modeling Team at National City Bank
#' Date: October 16, 2024
#' Analysis: Propensity Model for a New Product: Line of Credit Against a Household's USED Car
#' Primary Objective: Identify the Next 100 Customers from a Prospective Customer List to Contact
#'                    (HHs with Greatest Probability of Accepting the Offer)
#' Secondary Objective: Provide Customer Profiles for Both Current and Prospective Customers

# Set working directory
# NOTE(review): an absolute personal path makes this script non-portable;
# consider an RStudio project or the `here` package instead.
setwd("~/Desktop/Data Mining Class/DM_Class_Git/personalFiles/Cases/Case 2 Banking Propensity")

# Bring in libraries and set options
library(ggplot2)      # Visualizations
library(tidyr)        # Data reshaping (pivot_longer)
library(ggthemes)     # Visual themes
library(dplyr)        # Data manipulation
library(vtreat)       # Variable treatment before modeling
library(MLmetrics)    # Evaluation metrics
library(pROC)         # Receiver Operating Characteristic (ROC) curves
library(caret)        # For confusion matrix and accuracy
library(rpart.plot)   # For better Decision Tree diagram (also attaches rpart)
library(randomForest)
options(scipen = 999) # Suppress scientific notation in printed output

# FIX: seed the RNG once so the random partitions below (and every downstream
# model fit that consumes random numbers) are reproducible run-to-run.
# The original script had no seed, so results changed on every execution.
set.seed(1234)

################################## SAMPLE ##################################

# Import the data sets
CurrentCustomers <- read.csv('CurrentCustomerMktgResults.csv')
ProspectCustomers <- read.csv('ProspectiveCustomers.csv')
HHAxiom <- read.csv('householdAxiomData.csv')
HHCredit <- read.csv('householdCreditData.csv')
HHVehicle <- read.csv('householdVehicleData.csv')

### DATA PRE-PROCESSING AND ENRICHMENT ###

# Join my Current Customers data with HH Axiom, Credit and Vehicle
CurrentCustomersJoined <- left_join(CurrentCustomers, HHAxiom, by = "HHuniqueID")
CurrentCustomersJoined <- left_join(CurrentCustomersJoined, HHCredit, by = "HHuniqueID")
CurrentCustomersJoined <- left_join(CurrentCustomersJoined, HHVehicle, by = "HHuniqueID")

# Removed the "CallStart" and "CallEnd" columns since they are not present in my
# ProspectCustomers data. NOTE: I did still perform some EDA on these variables...
# See the "EXPLORE" section below
CurrentCustomersJoined <- select(CurrentCustomersJoined, -c("CallStart", "CallEnd"))

# Move the outcome variable "Y_AcceptedOffer" to the far right
CurrentCustomersJoined <- CurrentCustomersJoined %>% relocate("Y_AcceptedOffer", .after = last_col())

# Drop Race column; do not want to base offer acceptance predictions on race
CurrentCustomersJoined$EstRace <- NULL

# REPLACE NA's in "Communication," "Job," "past_Outcome," "Education" and "carYr" rows
CurrentCustomersJoined$Communication[is.na(CurrentCustomersJoined$Communication)] <- "other"
CurrentCustomersJoined$Job[is.na(CurrentCustomersJoined$Job)] <- "unknown"
CurrentCustomersJoined$past_Outcome[is.na(CurrentCustomersJoined$past_Outcome)] <- ""
CurrentCustomersJoined$Education[is.na(CurrentCustomersJoined$Education)] <- "other"
# NOTE(review): carYr looks numeric; writing "unknown" coerces the whole column
# to character, so any model treats it as categorical -- confirm that is intended.
CurrentCustomersJoined$carYr[is.na(CurrentCustomersJoined$carYr)] <- "unknown"

# REMOVE OUTLIERS (exploratory boxplots left commented out)
#boxplot(CurrentCustomersJoined$NoOfContacts)
CurrentCustomersJoined <- CurrentCustomersJoined[CurrentCustomersJoined$NoOfContacts < 25,]   # Max 25 contacts
#boxplot(CurrentCustomersJoined$DaysPassed)
CurrentCustomersJoined <- CurrentCustomersJoined[CurrentCustomersJoined$DaysPassed < 730,]    # Cap at 2 years (730 days)
#boxplot(CurrentCustomersJoined$PrevAttempts)
CurrentCustomersJoined <- CurrentCustomersJoined[CurrentCustomersJoined$PrevAttempts < 15,]   # Cap at 15 previous attempts
#boxplot(CurrentCustomersJoined$RecentBalance)
CurrentCustomersJoined <- CurrentCustomersJoined[CurrentCustomersJoined$RecentBalance < 40000,] # Cap at 40,000
# This caused a loss of 32 records, or about 0.8% of the data

# ALIGN PROSPECT DATA WITH CURRENT CUSTOMERS DATA
# Join my Prospect Customers data with HH Axiom, Credit and Vehicle
ProspectCustomersJoined <- left_join(ProspectCustomers, HHAxiom, by = "HHuniqueID")
ProspectCustomersJoined <- left_join(ProspectCustomersJoined, HHCredit, by = "HHuniqueID")
ProspectCustomersJoined <- left_join(ProspectCustomersJoined, HHVehicle, by = "HHuniqueID")

# Move the outcome variable "Y_AcceptedOffer" to the far right
ProspectCustomersJoined <- ProspectCustomersJoined %>% relocate("Y_AcceptedOffer", .after = last_col())

# Drop Race column; do not want to base offer acceptance predictions on race
ProspectCustomersJoined$EstRace <- NULL

# REPLACE NA's, mirroring the Current Customers treatment above. The unknown
# outcome is marked "TBD" because prospects have not been contacted yet.
ProspectCustomersJoined$Communication[is.na(ProspectCustomersJoined$Communication)] <- "other"
ProspectCustomersJoined$Job[is.na(ProspectCustomersJoined$Job)] <- "unknown"
ProspectCustomersJoined$past_Outcome[is.na(ProspectCustomersJoined$past_Outcome)] <- ""
ProspectCustomersJoined$Education[is.na(ProspectCustomersJoined$Education)] <- "other"
ProspectCustomersJoined$carYr[is.na(ProspectCustomersJoined$carYr)] <- "unknown"
ProspectCustomersJoined$Y_AcceptedOffer[is.na(ProspectCustomersJoined$Y_AcceptedOffer)] <- "TBD"

### CREATE VTREAT, TRAINING AND VALIDATION PARTITIONS (10% for vtreat, then 80/20 on the remainder) ###
vtreat_partition <- sample(seq_len(nrow(CurrentCustomersJoined)), nrow(CurrentCustomersJoined) * .1)
vtreat_rows <- CurrentCustomersJoined[vtreat_partition,]  # 10% reserved to design the vtreat plan
remainder1  <- CurrentCustomersJoined[-vtreat_partition,] # remaining 90%
training_index <- sample(seq_len(nrow(remainder1)), nrow(remainder1) * .8) # 80%/20% split
trainingSet <- remainder1[training_index,]   # 80% of the 90% (= 72% overall)
# FIX: the old code re-sampled ALL of remainder2's rows (which only shuffled
# them) and its comment wrongly claimed the result was "half of the 20%".
# The validation set is the full remaining 20% of the 90% (= 18% overall).
validationSet <- remainder1[-training_index,]

# Remove unnecessary variables
rm(remainder1)

################################## EXPLORE #################################

# While I did previously remove the call times from my training data, I still performed some EDA here to (con't)
# understand the impact of call duration on offer acceptance:

# Parse the "CallStart"/"CallEnd" clock strings into POSIXct and derive the call
# length in seconds; difftime keeps the unit of the calculation explicit.
# NOTE(review): times carry no date, so a call crossing midnight would yield a
# negative duration -- confirm the data cannot contain such calls.
CurrentCustomers <- CurrentCustomers %>%
  mutate(
    CallStart    = as.POSIXct(CallStart, format = "%H:%M:%S"),
    CallEnd      = as.POSIXct(CallEnd,   format = "%H:%M:%S"),
    CallDuration = as.numeric(difftime(CallEnd, CallStart, units = "secs"))
  )

# Box plot: distribution of call duration for accepted vs. declined offers.
call_duration_plot <- ggplot(CurrentCustomers,
                             aes(x = Y_AcceptedOffer, y = CallDuration, fill = Y_AcceptedOffer)) +
  geom_boxplot() +
  labs(title = "Call Duration by Offer Acceptance",
       x = "Offer Status",
       y = "Call Duration (seconds)") +
  theme_minimal(base_size = 18) +
  scale_fill_manual(values = c("Accepted" = "mediumseagreen", "DidNotAccept" = "salmon")) +
  theme(
    plot.title   = element_text(size = 18),
    axis.title.x = element_text(size = 18),
    axis.title.y = element_text(size = 18),
    axis.text.x  = element_text(size = 16),
    axis.text.y  = element_text(size = 16)
  )
call_duration_plot

# Determine summary statistics (mean and median call times, etc.)
# Summarize the call-duration distribution for each offer outcome
CallTimeStats <- CurrentCustomers %>%
  group_by(Y_AcceptedOffer) %>%
  summarize(
    Min    = min(CallDuration),
    Q1     = quantile(CallDuration, 0.25),
    Median = median(CallDuration),
    Mean   = mean(CallDuration),
    Q3     = quantile(CallDuration, 0.75),
    Max    = max(CallDuration),
    SD     = sd(CallDuration), # Standard deviation
    Count  = n()               # Number of observations
  )

# Display summary statistics
CallTimeStats
# Minimum call time for an accepted offer was 25 seconds

# PERFORM MORE EDA HERE ON JUST THE TRAINING DATA SET
summary(trainingSet)
names(trainingSet)
table(trainingSet$Y_AcceptedOffer)
# My outcomes are somewhat balanced, with 1,148 having accepted the offer, and 1,709 declining the offer

# Histogram of Age; the dashed reference line marks age 39
# NOTE(review): `size` for line geoms is deprecated in ggplot2 >= 3.4 in favor
# of `linewidth`; kept as-is for compatibility with older installs.
ggplot(trainingSet, aes(x = Age)) +
  geom_histogram(binwidth = 1, fill = 'mediumseagreen', color = 'black') +
  geom_vline(xintercept = 39, color = "salmon", linetype = "dashed", size = 1) +
  theme_minimal(base_size = 18) +
  labs(title = "Histogram of Customer Ages", x = "Age", y = "Count")

# Bar plot of Gender
ggplot(trainingSet, aes(x = headOfhouseholdGender, fill = headOfhouseholdGender)) +
  geom_bar(color = 'black', width = 0.5) +                                   # Narrower bars
  scale_fill_manual(values = c("F" = "salmon", "M" = "mediumseagreen")) +
  theme_minimal(base_size = 18) +
  labs(title = "Count of Customer Gender", x = "Gender", y = "Count") +
  theme(
    legend.text  = element_text(size = 14),
    legend.title = element_text(size = 16)
  )
# Genders are relatively evenly represented in the data set

# Look for a trend of certain age groups having a higher acceptance rate
acceptance_rate <- trainingSet %>%
  group_by(Age) %>%
  summarize(
    Total = n(),
    Accepted = sum(Y_AcceptedOffer == "Accepted"),
    AcceptanceRate = Accepted / Total
  )

ggplot(acceptance_rate, aes(x = Age, y = AcceptanceRate)) +
  geom_point(color = "mediumseagreen", size = 3, alpha = 0.7) +
  geom_smooth(method = "loess", color = "salmon", se = FALSE) +      # Trend line
  geom_hline(yintercept = 0.5, linetype = "dashed", color = "grey") + # Reference at 0.5
  labs(title = "Scatter Plot of Acceptance Rate by Age Group", x = "Age", y = "Acceptance Rate") +
  theme_minimal(base_size = 18)

################################## MODIFY ##################################

# PREPARE DATA FOR MODELING
# Used varImp() and step() to determine the top 8 most important features and
# circled back here to input them directly:
informativeFeatureNames <- c('Age', 'Communication', 'RecentBalance', 'Job',
                             'Education', 'Marital', 'LastContactMonth', 'past_Outcome')
# FIX: refer to the outcome by NAME rather than by position.
# `names(trainingSet)[26]` silently breaks (or targets the wrong column) if any
# column is added, dropped or reordered upstream; the comment already said the
# intended column was "Y_AcceptedOffer", so say so explicitly.
outcomeVariableName <- "Y_AcceptedOffer" # success level is "Accepted"

# Design the categorical-outcome treatment plan on the held-out 10% slice so
# the treatment design never sees training/validation rows, then apply it to
# all three data sets. These treated frames feed all three models.
plan <- designTreatmentsC(vtreat_rows, informativeFeatureNames, outcomeVariableName, 'Accepted')
treatedTraining   <- prepare(plan, trainingSet)
treatedValidation <- prepare(plan, validationSet)
treatedProspect   <- prepare(plan, ProspectCustomersJoined)

################################## MODEL ###################################

# ******************** MODEL 1: GLM / LOGISTIC REGRESSION ********************
GLMfit <- glm(as.factor(Y_AcceptedOffer) ~ ., data = treatedTraining, family = 'binomial')
summary(GLMfit)

# Make a parsimonious model by using step() to drop noise variables
GLMbetterFit <- step(GLMfit, direction = 'backward')
summary(GLMbetterFit)
# AIC is down from 3303 to 3288
# Step says communication, past outcome, job, marital status, car model, and last contact month are the most influential
# ...And my model is down from 48 variables to the 22 most influential!
# Coefficient counts before and after backward elimination (48 -> 22 terms)
length(coefficients(GLMfit))
length(coefficients(GLMbetterFit))

# Get predictions using parsimonious model (GLMbetterFit)
GLMacceptedProbability <- predict(GLMbetterFit, treatedTraining, type='response')
head(GLMacceptedProbability)

# Classify.
# IMPORTANT: the mapping below only LOOKS inverted. Y_AcceptedOffer was modeled
# as a factor whose levels sort to ("Accepted", "DidNotAccept"); a binomial
# glm() treats the FIRST level as failure and models the probability of the
# second, so type='response' returns P(DidNotAccept). Hence probabilities at or
# above the cutoff correctly map to 'DidNotAccept'.
cutoff <- 0.50
GLMtrain_classes <- ifelse(GLMacceptedProbability >= cutoff, 'DidNotAccept', 'Accepted')

# Organize predicted class and probability alongside the actual outcome
GLMtrain_results <- data.frame(actual = treatedTraining$Y_AcceptedOffer, probability = GLMacceptedProbability, classes = GLMtrain_classes)
head(GLMtrain_results)
tail(GLMtrain_results)

# Create a confusion matrix.
# NOTE(review): ConfusionMatrix() / Accuracy() with capital letters come from
# MLmetrics, not caret -- caret's function is confusionMatrix(). The original
# comment attributed these to caret incorrectly.
GLMconf_mat <- ConfusionMatrix(GLMtrain_results$classes, GLMtrain_results$actual)
GLMconf_mat
Accuracy(GLMtrain_results$classes, GLMtrain_results$actual)
# Parsimonious GLM accuracy is 71.7%

# ************************* MODEL 2: DECISION TREE ***************************

# Create a custom control object for tuning
DTtuningControl <- trainControl(
  method = "cv",          # Cross-validation
  number = 10,            # Number of folds
  search = "grid",        # Exhaustive grid search over the cp values supplied below
  verboseIter = TRUE,     # Display progress
  returnData = FALSE,     # Don't return data for each resampling iteration
  returnResamp = "final"  # Return resampling results for the final model only
)

# Fit the decision tree, tuning only the complexity parameter cp
DTfit <- train(
  Y_AcceptedOffer ~ .,
  data = treatedTraining,
  method = "rpart",
  trControl = DTtuningControl,
  tuneGrid = data.frame(cp = c(0.01, 0.05, 0.1))
)

# Print the best hyperparameters
print(DTfit)
# The final value used for the model was cp = 0.01 (aggressive pruning)

# Make predictions on the training set
DTpred_classes <- predict(DTfit, treatedTraining)

# Confusion matrix on the training set (caret's confusionMatrix this time)
confusionMatrix(DTpred_classes, as.factor(treatedTraining$Y_AcceptedOffer))

# Refit a plain rpart tree at the winning cp so rpart.plot can draw it
# (rpart is attached as a dependency of rpart.plot)
DTplot <- rpart(Y_AcceptedOffer ~., data = treatedTraining, cp = .01)
# Draw the tree; extra=106 shows the class probability and observation percentage per node
rpart.plot(DTplot, extra=106)

# ************************* MODEL 3: RANDOM FOREST ***************************

# Using a cross validation training scheme with 10 folds
RFcontrol <- trainControl(method="cv", number=10, verboseIter = TRUE)

# Train the random forest (no mtry tuning -- a single fixed value is evaluated)
RFfit <- train(Y_AcceptedOffer ~., data = treatedTraining, method ="rf", ntree = 500, # 500 trees
               trControl = RFcontrol, tuneGrid = data.frame(mtry = 3)) # 3 variables randomly sampled at each split

# Variable Importance
varImp(RFfit)
plot(varImp(RFfit), top = 10)
# Age is the most important variable, followed by recent balance, last contact month and past outcome

# Estimated probabilities for each class on the TRAINING set
RFpred_probs <- predict(RFfit, treatedTraining, type = "prob")
head(RFpred_probs)

# Actual predicted class based on the highest probability
RFpred_classes <- predict(RFfit, treatedTraining)
head(RFpred_classes)

# Confusion matrix on the training set.
# NOTE(review): RFconf_matrix is reassigned later in the ASSESS section with
# the validation-set matrix -- the training-set version is lost after that.
RFconf_matrix <- confusionMatrix(RFpred_classes, as.factor(treatedTraining$Y_AcceptedOffer))
RFconf_matrix
# Kappa of .55 suggests the model is better than just random guessing, but not perfect

# Find F1 score (harmonic mean of precision and recall for the positive class)
f1_score <- RFconf_matrix$byClass["F1"]
f1_score
# An F1 score of .69 suggests that the model achieves both high precision and recall

################################## ASSESS ##################################

# **************** ASSESS MODEL 1: GLM / LOGISTIC REGRESSION *****************

# Predict on validation data; as in training, type='response' is P(DidNotAccept)
# because "DidNotAccept" is the second factor level
GLMvalidation_preds <- predict(GLMbetterFit, treatedValidation, type='response')
head(GLMvalidation_preds)

# Create class predictions based on the cutoff (same direction as training)
GLMvalidation_classes <- ifelse(GLMvalidation_preds >= cutoff, 'DidNotAccept', 'Accepted')
head(GLMvalidation_classes)

# Organize with actual
GLMvalidation_results <- data.frame(actual = treatedValidation$Y_AcceptedOffer, probability = GLMvalidation_preds, classes = GLMvalidation_classes)

# Inspect types before aligning class/levels in the next step
class(treatedValidation$Y_AcceptedOffer)
class(GLMvalidation_results$classes)
# Coerce predicted classes to a factor so comparisons against the actuals are well-defined
GLMvalidation_results$classes <- factor(GLMvalidation_results$classes, levels = c("DidNotAccept", "Accepted"))

# Get a confusion matrix (MLmetrics::ConfusionMatrix)
GLMconf_mat_validation <- ConfusionMatrix(GLMvalidation_results$classes, GLMvalidation_results$actual)
GLMconf_mat_validation
Accuracy(GLMvalidation_results$classes, GLMvalidation_results$actual)
# Accuracy went from 71.7% on training data to 70.3% on validation data

# GLM precision, recall, and F1 calculations ("Accepted" is the positive class)
GLMprecision <- Precision(y_pred = GLMvalidation_classes, y_true = treatedValidation$Y_AcceptedOffer, positive = "Accepted")
GLMrecall <- Recall(y_pred = GLMvalidation_classes, y_true = treatedValidation$Y_AcceptedOffer, positive = "Accepted")
GLMf1_score <- F1_Score(y_pred = GLMvalidation_classes, y_true = treatedValidation$Y_AcceptedOffer, positive = "Accepted")

# Print the metrics
print(paste("Logistic Regression Precision:", GLMprecision))
print(paste("Logistic Regression Recall:", GLMrecall))
print(paste("Logistic Regression F1 Score:", GLMf1_score))

# ********************** ASSESS MODEL 2: DECISION TREE ***********************

DTpred_classes_validation <- predict(DTfit, newdata = treatedValidation)

# Get the confusion matrix for the validation set
confusionMatrix(DTpred_classes_validation, as.factor(treatedValidation$Y_AcceptedOffer))

# Calculate precision, recall, and F1 for the Decision Tree.
# Business goal: maximize precision -- when the model predicts "Accepted" it
# should usually be right, so the top-100 list contains "sure bets".
DTprecision <- Precision(y_pred = DTpred_classes_validation, y_true = treatedValidation$Y_AcceptedOffer, positive = "Accepted")
# NOTE(review): the original comment said "Want to minimize recall", which is
# incorrect -- recall is not something to minimize; it may simply end up lower
# as a side effect of prioritizing precision.
DTrecall <- Recall(y_pred = DTpred_classes_validation, y_true = treatedValidation$Y_AcceptedOffer, positive = "Accepted")
# F1 closer to 1 means precision and recall are balanced: few false positives
# while still capturing most actual positives
DTf1_score <- F1_Score(y_pred = DTpred_classes_validation, y_true = treatedValidation$Y_AcceptedOffer, positive = "Accepted")

# Print out the metrics
print(paste("Decision Tree Precision:", DTprecision))
print(paste("Decision Tree Recall:", DTrecall))
print(paste("Decision Tree F1 Score:", DTf1_score))
# Accuracy went from 73.0% on training data to 72.6% on validation data

# ********************** ASSESS MODEL 3: RANDOM FOREST ***********************

##### APPLY RANDOM FOREST TO VALIDATION SET #######
RFpred_probs_validation <- predict(RFfit, treatedValidation, type = "prob")
head(RFpred_probs_validation)

# Predicted class based on the highest probability
RFpred_classes_validation <- predict(RFfit, treatedValidation)
head(RFpred_classes_validation)

# Confusion matrix.
# NOTE(review): this overwrites the training-set RFconf_matrix created earlier.
RFconf_matrix <- confusionMatrix(RFpred_classes_validation, as.factor(treatedValidation$Y_AcceptedOffer))
RFconf_matrix

# Calculate precision, recall, and F1 using MLmetrics
RFprecision <- Precision(y_pred = RFpred_classes_validation, y_true = treatedValidation$Y_AcceptedOffer, positive = "Accepted")
RFrecall <- Recall(y_pred = RFpred_classes_validation, y_true = treatedValidation$Y_AcceptedOffer, positive = "Accepted")
RFf1_score <- F1_Score(y_pred = RFpred_classes_validation, y_true = treatedValidation$Y_AcceptedOffer, positive = "Accepted")

# Print out the metrics.
# Precision: of all households predicted to accept the offer, how many truly accepted?
# (The original "defaulted" wording was a copy-paste from a default-prediction
# project and did not apply to this analysis.)
print(paste("RF Precision:", RFprecision))
# Recall: of all households that actually accepted, how many did the model find?
print(paste("RF Recall:", RFrecall))
print(paste("RF F1 Score:", RFf1_score))
# Accuracy went from 79.5% on training data to 71.6% on validation data

# Model selection criteria: prefer the most parsimonious model with consistent,
# high accuracy, and above all maximize precision -- when the model predicts a
# positive it should usually be right, so the top-100 mailing list is full of
# "sure bets" rather than money wasted on unlikely prospects.

# ********************* COMPARISON OF THE THREE MODELS ***********************
# Watch for a wide train/validation accuracy gap on DT and RF
# (a narrow gap generalizes better to unseen data)
ModelKPI_Comparison <- data.frame(
  Algorithm = c("GLM", "Decision Tree", "Random Forest"),
  Precision = c(GLMprecision, DTprecision, RFprecision),
  Recall    = c(GLMrecall, DTrecall, RFrecall),
  F1_Score  = c(GLMf1_score, DTf1_score, RFf1_score)
)
ModelKPI_Comparison

# Reshape wide -> long so each (model, metric) pair becomes one row for plotting
ModelKPI_Long <- ModelKPI_Comparison %>%
  pivot_longer(cols = Precision:F1_Score, names_to = "Metric", values_to = "Value")

# Grouped bar chart of the three metrics across the three models
kpi_plot <- ggplot(ModelKPI_Long, aes(x = Algorithm, y = Value, fill = Metric)) +
  geom_bar(stat = "identity", position = position_dodge(width = 0.7), width = 0.5) +
  geom_text(aes(label = round(Value, 2)),              # Value labels, 2 decimals
            position = position_dodge(width = 0.7), vjust = -0.3, size = 5) +
  labs(title = "Model KPI Comparison", x = "Algorithm", y = "Value") +
  theme_minimal(base_size = 18) +
  scale_fill_manual(values = c("mediumseagreen", "salmon", "skyblue"))
kpi_plot

# Compare the random forest with the decision tree via their CV resamples
DT_vs_RF <- resamples(list(DT = DTfit, RF = RFfit))
summary(DT_vs_RF)

# Free axis scales for the lattice comparison plots that follow
scales <- list(x = list(relation = "free"), y = list(relation = "free"))
# Box-and-whisker comparison of the resampled performance distributions
bwplot(DT_vs_RF, scales = scales)

# Density comparison of the resamples, with a legend
densityplot(DT_vs_RF, scales = scales, pch = "|",
            auto.key = list(title = "Model", columns = 1))

########## BEST MODEL APPLICATION AND TOP 100 PROSPECT GENERATION ##########

# APPLY DECISION TREE TO PROSPECT SET
# Per-class probabilities ("Accepted" / "DidNotAccept") for every prospect
DTprospect_pred_probs <- predict(DTfit, treatedProspect, type = "prob")
head(DTprospect_pred_probs)

# Attach a row ID to both tables so the probabilities can be joined back onto
# the row-aligned prospect data.
# FIX: seq_len() instead of 1:nrow() so a zero-row frame cannot silently
# produce the degenerate c(1, 0) index sequence.
DTprospect_pred_probs   <- cbind(ID = seq_len(nrow(DTprospect_pred_probs)), DTprospect_pred_probs)
ProspectCustomersJoined <- cbind(ID = seq_len(nrow(ProspectCustomersJoined)), ProspectCustomersJoined)

# Join, rank by P(Accepted) descending, keep the 100 best prospects, and drop
# the bookkeeping columns -- one pipeline instead of four reassignments.
Top_100_Prospect_List <- ProspectCustomersJoined %>%
  left_join(DTprospect_pred_probs, by = "ID") %>%
  arrange(desc(Accepted)) %>%
  slice_head(n = 100) %>%
  select(-ID, -Y_AcceptedOffer)

# Export to a CSV file
write.csv(Top_100_Prospect_List, "Top_100_Prospect_List.csv", row.names = FALSE)

##################### END. Thanks for following along! ######################