# =============================================================================
#' Author: Tom Herman, Marketing Data Scientist at Bedding Bathing & Yonder
#' Date: December 10, 2024
#' Analysis: Household revenue model and spending predictions for prospective
#'           BBY customers
#' Primary Objective: Create a model to predict HH revenue from existing
#'           loyalty customers and predict the amount of household spending
#'           for prospective customers at BBY
#' Secondary Objective: Provide insights/personas around our best (loyalty)
#'           customers
# =============================================================================

# Set working directory
# NOTE(review): a hard-coded setwd() makes the script non-portable; an RStudio
# project (or the here package) would be preferable. Kept to match the
# author's environment.
setwd("~/Desktop/Data Mining Class/DM_Class_Git/personalFiles/Cases/Case 3 HH Revenue")

# Bring in libraries and set options
library(ggplot2)        # Visualizations
library(tidyr)          # Reshape, transform, and organize data
library(ggthemes)       # Visual themes
library(dplyr)          # Data manipulation
library(vtreat)         # Variable treatment before modeling
library(caret)          # ML models
library(caretEnsemble)  # Ensemble package
library(MLmetrics)      # Evaluation metrics
library(ModelMetrics)   # Evaluation metrics
library(leaflet)        # For interactive map
library(leaflet.extras) # For cluster map
options(scipen = 999)   # No scientific notation in printed output

################################## SAMPLE ##################################

# Import the data sets (each source is split into prospects / testing / training)
# Consumer Data
ConsumerDataProspects <- read.csv('consumerData_prospects6K_studentVersion.csv')
ConsumerDataTesting   <- read.csv('consumerData_testing5K_studentVersion.csv')
ConsumerDataTraining  <- read.csv('consumerData_training15K_studentVersion.csv')
# Donation Data
DonationsDataProspects <- read.csv('DonationsData_prospects6K_studentVersion.csv')
DonationsDataTesting   <- read.csv('DonationsData_testing5K_studentVersion.csv')
DonationsDataTraining  <- read.csv('DonationsData_training15K_studentVersion.csv')
# In House Data
InHouseDataProspects <- read.csv('inHouseData_prospects6K_studentVersion.csv')
InHouseDataTesting   <- read.csv('inHouseData_testing5K_studentVersion.csv')
InHouseDataTraining  <- read.csv('inHouseData_training15K_studentVersion.csv')
# Magazine Data
MagazineDataProspects <- read.csv('magazineData_prospects6K_studentVersion.csv')
MagazineDataTesting   <- read.csv('magazineData_testing5K_studentVersion.csv')
MagazineDataTraining  <- read.csv('magazineData_training15K_studentVersion.csv')
# Political Data
PoliticalDataProspects <- read.csv('politicalData_prospects6K_studentVersion.csv')
PoliticalDataTesting   <- read.csv('politicalData_testing5K_studentVersion.csv')
PoliticalDataTraining  <- read.csv('politicalData_training15K_studentVersion.csv')

### DATA PRE-PROCESSING AND ENRICHMENT ###

# All five sources share the household key "tmpID". Enrich the in-house data
# with each external source via left joins -- one helper instead of three
# copy-pasted pipelines, so the three splits cannot drift apart.
joinHouseholdSources <- function(inHouse, consumer, donations, magazine, political) {
  inHouse %>%
    left_join(consumer,  by = "tmpID") %>%
    left_join(donations, by = "tmpID") %>%
    left_join(magazine,  by = "tmpID") %>%
    left_join(political, by = "tmpID")
}

# Join TRAINING Data and rename the target yHat to HHSpend
TrainingData <- joinHouseholdSources(InHouseDataTraining, ConsumerDataTraining,
                                     DonationsDataTraining, MagazineDataTraining,
                                     PoliticalDataTraining) %>%
  rename(HHSpend = yHat)

# Join TESTING Data
TestingData <- joinHouseholdSources(InHouseDataTesting, ConsumerDataTesting,
                                    DonationsDataTesting, MagazineDataTesting,
                                    PoliticalDataTesting) %>%
  rename(HHSpend = yHat)

# Join PROSPECTS Data (prospects carry no observed spend)
ProspectsData <- joinHouseholdSources(InHouseDataProspects, ConsumerDataProspects,
                                      DonationsDataProspects, MagazineDataProspects,
                                      PoliticalDataProspects)

# Keep a side version of prospects data with all fields still included for the
# final step -- data science peers and marketing get all the info for their
# campaign if needed.
ProspectsDataOriginal <- ProspectsData

# Add an (empty) HHSpend column so prospects share the training/testing layout
ProspectsData <- ProspectsData %>% mutate(HHSpend = "")

# Remove potentially problematic or unethical variables to avoid using
# sensitive attributes. A single shared vector keeps all three splits in sync
# (the original repeated this 17-column list verbatim three times).
SensitiveCols <- c(
  "EthnicDescription", "BroadEthnicGroupings", "ReligiousContributorInHome",
  "PoliticalContributerInHome", "ReligiousMagazineInHome",
  "InterestinCurrentAffairsPoliticsInHousehold", "PartiesDescription",
  "ReligionsDescription", "LikelyUnionMember", "GunOwner",
  "supportsAffordableCareAct", "supportsGayMarriage", "supportsGunControl",
  "supportsTaxesRaise", "overallsocialviews", "DonatestoConservativeCauses",
  "DonatestoLiberalCauses"
)
TrainingData  <- TrainingData  %>% select(-all_of(SensitiveCols))
TestingData   <- TestingData   %>% select(-all_of(SensitiveCols))
ProspectsData <- ProspectsData %>% select(-all_of(SensitiveCols))

# Replace all instances of "Unknown" with an empty string in every dataset.
# NOTE(review): gsub() coerces every column to character; the numeric columns
# that matter downstream (Age, HHSpend, lat/lon, magazine counts, education)
# are explicitly converted back later in the script -- verify nothing else
# relied on staying numeric here.
TrainingData[]  <- lapply(TrainingData,  function(x) gsub("Unknown", "", x))
TestingData[]   <- lapply(TestingData,   function(x) gsub("Unknown", "", x))
ProspectsData[] <- lapply(ProspectsData, function(x) gsub("Unknown", "", x))

# Calculate the proportion of NA or blank values for each column
missing_info <- colSums(is.na(TrainingData) | TrainingData == "") / nrow(TrainingData)
# Display columns with the least amount of information (most missing or blank values)
# missing_info[order(missing_info, decreasing = TRUE)]

# Remove columns that are mostly empty (>90%) and/or not useful, plus direct
# identifiers (name, phone). Shared vector again replaces three verbatim copies.
SparseOrIdCols <- c(
  "BuyerofAntiquesinHousehold", "DonatestoInternationalAidCauses",
  "DonatestoInternationalAidCauses1", "BooksAudioReadinginHousehold",
  "DonatestoArtsandCulture", "HorseOwner", "BusinessOwner",
  "UpscaleBuyerInHome", "DonatestoVeteransCauses",
  "CulinaryInterestMagazineInHome", "Veteran", "FemaleOrientedMagazineInHome",
  "DonatesEnvironmentCauseInHome", "DonatestoAnimalWelfare",
  "GardeningMagazineInHome", "DonatestoChildrensCauses", "CatOwner",
  "FirstName", "LastName", "TelephonesFullPhone"
)
TrainingData  <- TrainingData  %>% select(-all_of(SparseOrIdCols))
TestingData   <- TestingData   %>% select(-all_of(SparseOrIdCols))
ProspectsData <- ProspectsData %>% select(-all_of(SparseOrIdCols))

# Remove noise text in some of the columns: keep only the leading integer
# (e.g. "3 <label text>" -> 3)
TrainingData$BookBuyerInHome  <- as.numeric(sub("^(\\d+).*", "\\1", TrainingData$BookBuyerInHome))
TestingData$BookBuyerInHome   <- as.numeric(sub("^(\\d+).*", "\\1", TestingData$BookBuyerInHome))
ProspectsData$BookBuyerInHome <- as.numeric(sub("^(\\d+).*", "\\1", ProspectsData$BookBuyerInHome))
TrainingData$FamilyMagazineInHome <- as.numeric(sub("^(\\d+).*", "\\1", TrainingData$FamilyMagazineInHome))
TestingData$FamilyMagazineInHome  <- as.numeric(sub("^(\\d+).*", "\\1", TestingData$FamilyMagazineInHome))
# Strip descriptive noise text from count-style columns by keeping only the
# leading integer (e.g. "2 <label text>" -> 2), coercing the result to numeric.
leadingNumber <- function(x) as.numeric(sub("^(\\d+).*", "\\1", x))

ProspectsData$FamilyMagazineInHome <- leadingNumber(ProspectsData$FamilyMagazineInHome)

TrainingData$HealthFitnessMagazineInHome  <- leadingNumber(TrainingData$HealthFitnessMagazineInHome)
TestingData$HealthFitnessMagazineInHome   <- leadingNumber(TestingData$HealthFitnessMagazineInHome)
ProspectsData$HealthFitnessMagazineInHome <- leadingNumber(ProspectsData$HealthFitnessMagazineInHome)

TrainingData$DoItYourselfMagazineInHome  <- leadingNumber(TrainingData$DoItYourselfMagazineInHome)
TestingData$DoItYourselfMagazineInHome   <- leadingNumber(TestingData$DoItYourselfMagazineInHome)
ProspectsData$DoItYourselfMagazineInHome <- leadingNumber(ProspectsData$DoItYourselfMagazineInHome)

TrainingData$FinancialMagazineInHome  <- leadingNumber(TrainingData$FinancialMagazineInHome)
TestingData$FinancialMagazineInHome   <- leadingNumber(TestingData$FinancialMagazineInHome)
ProspectsData$FinancialMagazineInHome <- leadingNumber(ProspectsData$FinancialMagazineInHome)

# Round Ages down to the nearest whole number
TrainingData$Age  <- floor(as.numeric(TrainingData$Age))
TestingData$Age   <- floor(as.numeric(TestingData$Age))
ProspectsData$Age <- floor(as.numeric(ProspectsData$Age))

# Round HHSpend (the target) to the nearest cent; prospects have no target yet
TrainingData$HHSpend <- round(as.numeric(TrainingData$HHSpend), 2)
TestingData$HHSpend  <- round(as.numeric(TestingData$HHSpend), 2)

# Convert coordinates to numeric for the maps exploration below
TrainingData$lat <- as.numeric(as.character(TrainingData$lat))
TrainingData$lon <- as.numeric(as.character(TrainingData$lon))
TestingData$lat  <- as.numeric(as.character(TestingData$lat))
TestingData$lon   <- as.numeric(as.character(TestingData$lon))
ProspectsData$lat <- as.numeric(as.character(ProspectsData$lat))
ProspectsData$lon <- as.numeric(as.character(ProspectsData$lon))

# Clean up global environment by removing data sets no longer needed
rm(ConsumerDataProspects, ConsumerDataTesting, ConsumerDataTraining)
rm(DonationsDataProspects, DonationsDataTesting, DonationsDataTraining)
rm(InHouseDataProspects, InHouseDataTesting, InHouseDataTraining)
rm(MagazineDataProspects, MagazineDataTesting, MagazineDataTraining)
rm(PoliticalDataTesting, PoliticalDataTraining, PoliticalDataProspects, missing_info)

# FEATURE ENGINEERING (done here instead of the "MODIFY" section so the new
# columns are ready for the vtreat partition)

# Age x Education interaction
TrainingData$MedianEducationYears     <- as.numeric(TrainingData$MedianEducationYears)
TrainingData$AgeEducationInteraction  <- TrainingData$Age * TrainingData$MedianEducationYears
TestingData$MedianEducationYears      <- as.numeric(TestingData$MedianEducationYears)
TestingData$AgeEducationInteraction   <- TestingData$Age * TestingData$MedianEducationYears
ProspectsData$MedianEducationYears    <- as.numeric(ProspectsData$MedianEducationYears)
ProspectsData$AgeEducationInteraction <- ProspectsData$Age * ProspectsData$MedianEducationYears

# Magazine enthusiast score: TOTAL number of magazines across the four retained
# categories. (FIX: the original comment claimed "at least one magazine in all
# categories", but rowSums() computes a count, not an all-categories flag.)
MagazineCols <- c("FamilyMagazineInHome", "HealthFitnessMagazineInHome",
                  "DoItYourselfMagazineInHome", "FinancialMagazineInHome")
TrainingData$MagEnthusiast  <- rowSums(TrainingData[, MagazineCols],  na.rm = TRUE)
TestingData$MagEnthusiast   <- rowSums(TestingData[, MagazineCols],   na.rm = TRUE)
ProspectsData$MagEnthusiast <- rowSums(ProspectsData[, MagazineCols], na.rm = TRUE)

# Move HHSpend to the far right for simplicity
TrainingData  <- TrainingData  %>% relocate(HHSpend, .after = last_col())
TestingData   <- TestingData   %>% relocate(HHSpend, .after = last_col())
ProspectsData <- ProspectsData %>% relocate(HHSpend, .after = last_col())

### CREATE VTREAT PARTITIONS (10% for vtreat design, the rest stays in TrainingData) ###
set.seed(2024)  # Reproducible split
VtreatPartition <- sample(seq_len(nrow(TrainingData)), nrow(TrainingData) * .1)
VtreatRows   <- TrainingData[VtreatPartition, ]   # 10%
TrainingData <- TrainingData[-VtreatPartition, ]  # 90%

################################## EXPLORE #################################

# Shared plot styling: brand blue and enlarged text, used by every EDA plot
# below (replaces five verbatim copies of the same rgb() call and theme()).
BBYBlue <- rgb(47, 163, 238, maxColorValue = 255)
BigTextTheme <- theme(
  plot.title   = element_text(size = 18, face = "bold"),
  axis.title.x = element_text(size = 16, face = "bold"),
  axis.title.y = element_text(size = 16, face = "bold"),
  axis.text.x  = element_text(size = 14),
  axis.text.y  = element_text(size = 14)
)

# Map out and review the actual physical locations of our customers for trends
leaflet(data = TrainingData) %>%
  addTiles() %>%
  addHeatmap(
    lng = ~lon, lat = ~lat,
    intensity = 1,  # Uniform intensity
    blur = 10,      # Sharper transitions
    radius = 40,    # Larger influence radius
    max = 0.8       # Emphasizes high-density regions
  )
# We have customers in all 50 states, with the largest concentration around the
# Tri-State area and eastern seaboard

leaflet(data = TrainingData) %>%
  addTiles() %>%
  addMarkers(~lon, ~lat,
             popup = ~paste("Customer ID:", tmpID),
             clusterOptions = markerClusterOptions())  # Enable clustering
# Cluster map shows clustered numbers of our customers

# Now let's see if we can identify our biggest spenders and any themes
summary(TrainingData$HHSpend)
# Our lowest spender is $18.66, median is $314.65 and highest is $591.99

# Histogram of HH spending by bin (R decides the intervals with 6 bins)
ggplot(TrainingData, aes(x = cut(HHSpend, breaks = 6, include.lowest = TRUE))) +
  geom_bar(fill = BBYBlue, color = NA) +  # Brand fill, no outline
  labs(title = "Histogram of Loyalty Customer Spending",
       x = "Spending Range",
       y = "Number of Customers") +
  theme_minimal() +
  BigTextTheme

# Scatterplot with trend line, y-axis cut off at zero, y-axis in dollar format
ggplot(TrainingData, aes(x = as.numeric(MagEnthusiast), y = HHSpend)) +
  geom_jitter(width = 0.2, alpha = 0.7, color = BBYBlue) +
  geom_smooth(method = "lm", se = FALSE, color = "red") +  # Linear trend, no CI band
  theme_minimal() +
  BigTextTheme +
  coord_cartesian(ylim = c(0, NA)) +  # Limit the y-axis from 0 upwards
  labs(x = "Number of Magazines in Home",
       y = "Household Spending") +
  scale_y_continuous(labels = scales::dollar_format())  # Dollar-formatted axis
#table(TrainingData$MagEnthusiast)

# HH spend by net worth
ggplot(TrainingData, aes(x = NetWorth, y = HHSpend)) +
  geom_boxplot(fill = BBYBlue, color = "black") +
  labs(title = "Household Spending by Net Worth",
       x = "Net Worth Range",
       y = "Household Spending ($)") +
  theme_minimal() +
  BigTextTheme +
  theme(axis.text.x = element_text(size = 14, angle = 45, hjust = 1))  # Rotate x labels

# HH spend by Education
ggplot(TrainingData, aes(x = Education, y = HHSpend)) +
  geom_boxplot(fill = BBYBlue, color = "black") +
  labs(title = "Household Spending by Education Level",
       x = "Education Level",
       y = "Household Spending ($)") +
  theme_minimal() +
  BigTextTheme +
  theme(axis.text.x = element_text(size = 14, angle = 45, hjust = 1))  # Rotate x labels
table(TrainingData$Education)

# HH Spend by Computer Owner
ggplot(TrainingData, aes(x = HHSpend, fill = ComputerOwnerInHome)) +
  geom_density(alpha = 0.6) +
  labs(title = "Household Spending Distribution by Computer Ownership",
       x = "Household Spending ($)",
       y = "Density") +
  theme_minimal() +
  BigTextTheme
table(TrainingData$ComputerOwnerInHome)

# Box plot of computer owner spending, medians labeled in red
ggplot(TrainingData, aes(x = ComputerOwnerInHome, y = HHSpend)) +
  geom_boxplot(fill = BBYBlue, color = "black") +
  stat_summary(
    fun = median,
    geom = "text",
    # FIX: after_stat(y) replaces the ..y.. notation deprecated in ggplot2 3.4
    aes(label = sprintf("$%.2f", after_stat(y))),
    color = "red", vjust = -0.5, size = 5
  ) +
  labs(title = "Household Spending by Computer Ownership",
       x = "Computer Ownership in Home",
       y = "Household Spending ($)") +
  theme_minimal() +
  BigTextTheme

# Aggregate MEDIAN spending by MosaicZ4 and flag the five highest segments
# (original comment said "mean" but the code computes median())
MosaicSummary <- TrainingData %>%
  group_by(MosaicZ4) %>%
  summarize(MedianSpend = median(HHSpend, na.rm = TRUE)) %>%
  mutate(Top5 = ifelse(MedianSpend %in% sort(MedianSpend, decreasing = TRUE)[1:5],
                       "Top 5", "Other"))

# Horizontal bar plot with different colors for top 5, without a legend
ggplot(MosaicSummary, aes(x = MedianSpend, y = reorder(MosaicZ4, MedianSpend), fill = Top5)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("Top 5" = BBYBlue, "Other" = "lightgray")) +
  labs(title = "Median Household Spending by MosaicZ4 Categories",
       x = "Median Household Spending ($)",
       y = "MosaicZ4 Categories") +
  theme_minimal() +
  BigTextTheme +
  theme(axis.text.y = element_text(size = 12),
        legend.position = "none")  # Remove the legend

################################## MODIFY ##################################

# PREPARE DATA FOR MODELING
# FIRST I RAN A BASELINE RANDOM FOREST MODEL AND USED varImp() TO DETERMINE
# LEAST IMPORTANT VARIABLES FOR REMOVAL. I DID THIS A COUPLE TIMES IN CIRCULAR
# FASHION TO WHITTLE DOWN THE NUMBER OF VARIABLES (TO ACHIEVE PARSIMONIOUS MODEL)
# LEAST IMPORTANT VARIABLES THAT WERE REMOVED: state, LandValue, fips,
# stateFips, ISPSA, HomeOwnerRenter, PresenceOfChildrenCode
# This dropped approximately 113 variables from my original Vtreat set (~240).
# See "MODEL" section.
# Drop the low-importance variables (identified via varImp() on a baseline RF)
# from every split, INCLUDING the vtreat design rows. One shared vector keeps
# the four data sets in sync (the original repeated the list four times).
LowImportanceCols <- c("state", "LandValue", "fips", "stateFips", "ISPSA",
                       "HomeOwnerRenter", "PresenceOfChildrenCode")
TrainingData  <- TrainingData  %>% select(-all_of(LowImportanceCols))
TestingData   <- TestingData   %>% select(-all_of(LowImportanceCols))
ProspectsData <- ProspectsData %>% select(-all_of(LowImportanceCols))
VtreatRows    <- VtreatRows    %>% select(-all_of(LowImportanceCols))

# TREAT DATA WITH VTREAT
# FIX: refer to the outcome and predictors by NAME rather than by hard-coded
# positions ([38] and [2:37]), which silently break whenever a column is added
# or removed upstream. Predictors = everything except the id and the target.
OutcomeVariableName <- "HHSpend"
PredictorNames <- setdiff(names(VtreatRows), c("tmpID", OutcomeVariableName))
Plan <- designTreatmentsN(VtreatRows, PredictorNames, OutcomeVariableName)
TreatedTraining  <- prepare(Plan, TrainingData)
TreatedTesting   <- prepare(Plan, TestingData)
TreatedProspects <- prepare(Plan, ProspectsData)
# Use these as a foundation for all four models

################################## MODEL ###################################

# ************************* MODEL 1: RANDOM FOREST ***************************
# Cross-validation training scheme with 10 folds.
# ***Used for all subsequent models***
TrainControl <- trainControl(method = "cv",
                             number = 10,
                             savePredictions = 'all',
                             verboseIter = TRUE)

# Train model
RFfit <- train(HHSpend ~ ., data = TreatedTraining,
               method = "rf",
               ntree = 200,                      # 200 trees
               trControl = TrainControl,
               tuneGrid = data.frame(mtry = 5))  # 5 variables sampled per split

# Variable Importance
#VarImportance <- varImp(RFfit)
#plot(VarImportance, top = 10)
# Sort by the Overall importance score in ascending order (least important first)
#LeastImportant <- VarImportance$importance[order(VarImportance$importance$Overall), , drop = FALSE]
# View the least important variables along with their scores
#print(LeastImportant)
# Circle back to modify section and remove low influence variables

RFfit$results

# ****************************** MODEL 2: GLM ********************************
GLMfit <- train(HHSpend ~ ., data = TreatedTraining,
                method = "glm",
                trControl = TrainControl)
summary(GLMfit)
GLMfit$results

# ****************************** MODEL 3: GBM ********************************
# Gradient Boosting Machine: builds an ensemble of decision trees sequentially,
# each new tree correcting the errors made by the previous ones.
GBMfit <- train(HHSpend ~ ., data = TreatedTraining,
                method = "gbm",
                trControl = TrainControl,
                verbose = FALSE)  # Suppress training output
GBMfit$results

# **************************** MODEL 4: XGBoost ******************************
# Optimized version of GBM
TuneGrid <- expand.grid(
  nrounds = c(50, 100),       # Number of boosting iterations
  max_depth = c(3, 6, 9),     # Maximum depth of the tree
  eta = c(0.1, 0.3),          # Learning rate
  gamma = c(0, 1),            # Minimum loss reduction
  colsample_bytree = c(0.8),  # Subsample ratio of columns
  min_child_weight = c(1),    # Minimum sum of instance weights
  subsample = c(0.8)          # Subsample ratio of the training set
)
XGBfit <- train(HHSpend ~ ., data = TreatedTraining,
                method = "xgbTree",
                trControl = TrainControl,
                tuneGrid = TuneGrid)
XGBfit$results

################################## ASSESS ##################################

# Compare models using caret function resamples()
TrainResults <- resamples(list(RF = RFfit, GLM = GLMfit, GBM = GBMfit, XGB = XGBfit))
summary(TrainResults)

# Visualize trained models (Rsquared separated from RMSE/MAE: different scales)
bwplot(TrainResults, metric = "Rsquared", main = "Comparison of Models (R-squared)")
bwplot(TrainResults, metric = c("RMSE", "MAE"), main = "Comparison of Models (RMSE, MAE)")

##### Apply trained models to TreatedTesting data set
RFPredictions  <- predict(RFfit,  newdata = TreatedTesting)
GLMPredictions <- predict(GLMfit, newdata = TreatedTesting)
GBMPredictions <- predict(GBMfit, newdata = TreatedTesting)
XGBPredictions <- predict(XGBfit, newdata = TreatedTesting)

# Combine predictions into one data frame
CombinedPredictions <- data.frame(
  Actual = TreatedTesting$HHSpend,
  RF  = RFPredictions,
  GLM = GLMPredictions,
  GBM = GBMPredictions,
  XGB = XGBPredictions
)
#CombinedPredictions

# Melt the data frame to long format for faceting
LongPredictions <- reshape2::melt(CombinedPredictions, id.vars = "Actual")

# Plot predictions vs actuals, one facet per model
ggplot(LongPredictions, aes(x = Actual, y = value)) +
  geom_point() +
  facet_wrap(~ variable, scales = "free") +
  xlab("Actual") +
  ylab("Predicted") +
  ggtitle("Model Predictions Comparison")

# Actual target variable values
actual <- TreatedTesting$HHSpend

# RMSE (symmetric in its two arguments, so order cannot cause a bug here)
RFRMSE  <- ModelMetrics::rmse(actual, RFPredictions)
GLMRMSE <- ModelMetrics::rmse(actual, GLMPredictions)
GBMRMSE <- ModelMetrics::rmse(actual, GBMPredictions)
XGBRMSE <- ModelMetrics::rmse(actual, XGBPredictions)
ValidationRMSEResults <- data.frame(
  Model = c("RF", "GLM", "GBM", "XGB"),
  RMSE  = c(RFRMSE, GLMRMSE, GBMRMSE, XGBRMSE)
)
ValidationRMSEResults

# MAPE - average percent difference between actuals and predictions
# FIX: MLmetrics::MAPE() signature is MAPE(y_pred, y_true). The original calls
# passed (actual, predictions), which put the PREDICTIONS in the denominator
# and reported a subtly wrong error rate.
RFMAPE  <- MLmetrics::MAPE(RFPredictions,  actual)
GLMMAPE <- MLmetrics::MAPE(GLMPredictions, actual)
GBMMAPE <- MLmetrics::MAPE(GBMPredictions, actual)
XGBMAPE <- MLmetrics::MAPE(XGBPredictions, actual)
ValidationMAPEResults <- data.frame(
  Model = c("RF", "GLM", "GBM", "XGB"),
  MAPE  = c(RFMAPE, GLMMAPE, GBMMAPE, XGBMAPE)
)
ValidationMAPEResults

# Hold-out R-squared: 1 - SS_res/SS_tot.
# Can go negative when a model underperforms the mean of the actuals.
calculate_r_squared <- function(actual, predicted) {
  SS_res <- sum((actual - predicted)^2)
  SS_tot <- sum((actual - mean(actual))^2)
  1 - (SS_res / SS_tot)
}
RFr2  <- calculate_r_squared(actual, RFPredictions)
GLMr2 <- calculate_r_squared(actual, GLMPredictions)
GBMr2 <- calculate_r_squared(actual, GBMPredictions)
XGBr2 <- calculate_r_squared(actual, XGBPredictions)
ValidationR2Results <- data.frame(
  Model = c("RF", "GLM", "GBM", "XGB"),
  R2 = c(RFr2, GLMr2, GBMr2, XGBr2)
)
ValidationR2Results

# ***************************** ENSEMBLE MODEL *******************************
# CREATED AN ENSEMBLE MODEL FROM TOP 3 MODELS (RF, GBM, XGB). HOWEVER...
# DID NOT END UP USING THIS BECAUSE IT COMPARED SIMILARLY TO GBM, BUT R-SQUARE NUMBER TANKED
# Create a list of top 3 models
#TopModels <- list(RF = RFfit, GBM = GBMfit, XGB = XGBfit)
# Stack the models using caretStack
#EnsembleFit <- caretStack(TopModels, method = "glm", trControl = TrainControl)
# Make predictions
#EnsemblePredictions <- predict(EnsembleFit, newdata = TreatedTesting)
#EnsemblePredictions <- unlist(EnsemblePredictions)
# Assess
#EnsembleRMSE <- ModelMetrics::rmse(actual, EnsemblePredictions)
#EnsembleRMSE # 64.85 - Somewhere between GBM and RF
#EnsembleMAPE <- MLmetrics::MAPE(actual, EnsemblePredictions)
#EnsembleMAPE # .1890
#EnsembleRsquared <- R2_Score(actual, EnsemblePredictions)
#EnsembleRsquared # .23, well below underlying models
#EnsembleMAE <- mae(actual = actual, predicted = EnsemblePredictions)
#EnsembleMAE # 50.88 - similar to XGB

########## BEST MODEL APPLICATION AND TOP 100 PROSPECT GENERATION ##########

# APPLY XGB TO PROSPECTS DATA SET
HHSpendPrediction <- predict(XGBfit, newdata = TreatedProspects)
HHSpendPrediction <- as.data.frame(HHSpendPrediction)

# Row-position IDs let us join predictions back to the untouched prospect data
# (prepare() preserves row order, so positions line up by construction)
HHSpendPrediction     <- cbind(ID = seq_len(nrow(HHSpendPrediction)), HHSpendPrediction)
ProspectsDataOriginal <- cbind(ID = seq_len(nrow(ProspectsDataOriginal)), ProspectsDataOriginal)

# Create new prospect list containing the predicted household spend amounts,
# drop the temporary join key, and sort so the highest predicted spenders come first
ProspectsPredictedSpending <- left_join(HHSpendPrediction, ProspectsDataOriginal, by = "ID") %>%
  select(-ID) %>%
  arrange(desc(HHSpendPrediction))

# Round HHSpendPrediction to the nearest cent
ProspectsPredictedSpending$HHSpendPrediction <- round(ProspectsPredictedSpending$HHSpendPrediction, 2)

# Export to a CSV file
write.csv(ProspectsPredictedSpending, "ProspectsPredictedSpending.csv", row.names = FALSE)

# PROSPECTS DATA EDA
leaflet(data = ProspectsPredictedSpending) %>%
  addTiles() %>%
  addMarkers(~lon, ~lat,
             popup = ~paste("Customer ID:", tmpID),
             clusterOptions = markerClusterOptions())  # Enable clustering

# Aggregate MEDIAN predicted spending by MosaicZ4 and flag the top 5 segments
MosaicSummary <- ProspectsPredictedSpending %>%
  group_by(MosaicZ4) %>%
  summarize(MedianSpend = median(HHSpendPrediction, na.rm = TRUE)) %>%
  mutate(Top5 = ifelse(MedianSpend %in% sort(MedianSpend, decreasing = TRUE)[1:5],
                       "Top 5", "Other"))

# Horizontal bar plot with different colors for top 5, without a legend
ggplot(MosaicSummary, aes(x = MedianSpend, y = reorder(MosaicZ4, MedianSpend), fill = Top5)) +
  geom_bar(stat = "identity") +
  scale_fill_manual(values = c("Top 5" = rgb(47, 163, 238, maxColorValue = 255),
                               "Other" = "lightgray")) +
  labs(title = "Median Predicted Household Spending by MosaicZ4 Categories",
       x = "Median Predicted Household Spending ($)",
       y = "MosaicZ4 Categories") +
  theme_minimal() +
  theme(
    plot.title   = element_text(size = 18, face = "bold"),
    axis.title.x = element_text(size = 16, face = "bold"),
    axis.title.y = element_text(size = 16, face = "bold"),
    axis.text.x  = element_text(size = 14),
    axis.text.y  = element_text(size = 12),
    legend.position = "none"  # Remove the legend
  )

table(ProspectsPredictedSpending$ComputerOwnerInHome)

##################### END. Thanks for following along! ######################