### Tom Herman Project ###
#
# Loads the diabetes data set, recodes implausible zeros as NA, drops
# incomplete rows, makes a 70/30 train/test split and fits a logistic
# regression. Objects created here: data, data_clean, sample_index,
# train_data, test_data, log_model.

library(ggplot2)
library(dplyr)
library(car)
library(randomForest)
library(caret)
library(pROC)
library(MLmetrics)
library(gridExtra)

### SAMPLE ###

# Read the file by path instead of calling setwd(), so running the script
# does not change the session's working directory.
data <- read.csv(file.path("~", "Desktop", "Healthcare-Diabetes.csv"))

### MODIFY ###

# Set zero values (which are implausible) to NA
Fix_Zero_Values <- c(
  "Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"
)
# Fail early with a clear message if the CSV lacks an expected column.
stopifnot(all(Fix_Zero_Values %in% names(data)))
# which() skips values that are already NA, so only true zeros are recoded.
data[Fix_Zero_Values] <- lapply(
  data[Fix_Zero_Values],
  function(x) replace(x, which(x == 0), NA)
)

# Check for missing values
print(colSums(is.na(data)))

# Drop rows with missing values.
# na.omit() removes a row if ANY column is NA, including Insulin and
# BloodPressure, which the logistic regression formula below does not use.
data_clean <- na.omit(data)

print(table(data$Outcome))        # before NA removal
print(table(data_clean$Outcome))  # after NA removal

### EXPLORE ###

# NOTE(review): the exploratory code below is disabled. If the factor
# relabelling is re-enabled, Outcome's levels become "No Diabetes" /
# "Diabetes", while the assessment code later in this script uses
# positive = "1" -- confirm before uncommenting.

# Ensure Outcome is a factor with readable labels
# data_clean$Outcome <- factor(data_clean$Outcome, levels = c(0, 1),
#                              labels = c("No Diabetes", "Diabetes"))

# Create individual box plots
# p1 <- ggplot(data_clean, aes(x = Outcome, y = Pregnancies, fill = Outcome)) +
#   geom_boxplot() + ggtitle("Pregnancies")
# p2 <- ggplot(data_clean, aes(x = Outcome, y = Glucose, fill = Outcome)) +
#   geom_boxplot() + ggtitle("Glucose")
# p3 <- ggplot(data_clean, aes(x = Outcome, y = SkinThickness, fill = Outcome)) +
#   geom_boxplot() + ggtitle("Skin Thickness")
# p4 <- ggplot(data_clean, aes(x = Outcome, y = BMI, fill = Outcome)) +
#   geom_boxplot() + ggtitle("BMI")
# p5 <- ggplot(data_clean,
#              aes(x = Outcome, y = DiabetesPedigreeFunction, fill = Outcome)) +
#   geom_boxplot() + ggtitle("Diabetes Pedigree Function")
# p6 <- ggplot(data_clean, aes(x = Outcome, y = Age, fill = Outcome)) +
#   geom_boxplot() + ggtitle("Age")

# Arrange plots in a 2-row grid
# grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 3)

### MODEL ###

# 70/30 train/test split. sample.int(n, k) draws the same indices as
# sample(1:n, k) for a given seed, but is safe when n is 0 or 1, and the
# floor() makes the truncation of the fractional size explicit.
set.seed(123)
n_rows <- nrow(data_clean)
sample_index <- sample.int(n_rows, size = floor(0.7 * n_rows))

train_data <- data_clean[sample_index, ]
# setdiff() rather than data_clean[-sample_index, ]: negative indexing with
# an empty index returns zero rows instead of all rows.
test_data <- data_clean[setdiff(seq_len(n_rows), sample_index), ]

# GLM Model
# Outcome is still numeric 0/1 here; family = binomial fits a logistic
# regression on the six listed predictors.
log_model <- glm(
  Outcome ~ Pregnancies + Glucose + SkinThickness + BMI +
    DiabetesPedigreeFunction + Age,
  data = train_data,
  family = binomial
)
# Summarise the logistic regression and check predictor collinearity.
# Wrapped in print() so the output also appears when the script is run via
# source(), which does not auto-print top-level values.
print(summary(log_model))
print(vif(log_model))

# RF Model

# Convert Outcome to factor with a fixed level set, so train, test and
# predicted labels always share the levels "0" and "1" even if one class is
# absent from a subset.
# NOTE(review): assumes Outcome is coded 0/1 (the calls below use
# positive = "1"); any other value would become NA here.
outcome_levels <- c(0, 1)
as_outcome_factor <- function(x) factor(x, levels = outcome_levels)

train_data$Outcome <- as_outcome_factor(train_data$Outcome)
test_data$Outcome <- as_outcome_factor(test_data$Outcome)

# The forest uses every column except Id as a predictor, whereas the
# logistic regression uses only the predictors named in its formula.
rf_model <- randomForest(
  Outcome ~ .,
  data = dplyr::select(train_data, -Id),
  importance = TRUE,
  ntree = 500
)
print(rf_model)

### ASSESS ###

# GLM

# Predict probabilities
log_probs_train <- predict(log_model, train_data, type = "response")
log_probs_test <- predict(log_model, test_data, type = "response")

# Classify with 0.5 threshold
class_threshold <- 0.5
log_pred_train <- ifelse(log_probs_train > class_threshold, 1, 0)
log_pred_test <- ifelse(log_probs_test > class_threshold, 1, 0)

# plot(fitted(log_model), residuals(log_model, type = "deviance"),
#      xlab = "Fitted Values", ylab = "Deviance Residuals",
#      main = "Residuals vs Fitted Values")
# abline(h = 0, col = "red")

# RANDOM FOREST

# Predict classes.
# NOTE(review): predicting on train_data scores rows the trees were grown
# on, so the training metrics are optimistic; rf_model$predicted holds the
# out-of-bag predictions if an honest training estimate is wanted.
rf_pred_train <- predict(rf_model, train_data)
rf_pred_test <- predict(rf_model, test_data)
# Select the positive-class probability by column name, not position.
rf_probs_test <- predict(rf_model, test_data, type = "prob")[, "1"]

# Logistic regression metrics
cat("Logistic Regression - Training:\n")
print(confusionMatrix(
  as_outcome_factor(log_pred_train),
  train_data$Outcome,
  positive = "1"
))

cat("Logistic Regression - Test:\n")
print(confusionMatrix(
  as_outcome_factor(log_pred_test),
  test_data$Outcome,
  positive = "1"
))

# Random forest metrics
cat("Random Forest - Training:\n")
print(confusionMatrix(rf_pred_train, train_data$Outcome, positive = "1"))

cat("Random Forest - Test:\n")
print(confusionMatrix(rf_pred_test, test_data$Outcome, positive = "1"))

varImpPlot(rf_model, main = "Variable Importance in Random Forest")

# Logistic Regression Metrics
# levels = c(control, case) and direction = "<" (controls score lower than
# cases) are set explicitly so pROC does not pick the direction from the
# data, which can overstate the AUC of a poor model.
cat("Logistic Regression Metrics (Test Data):\n")
log_roc <- roc(
  test_data$Outcome,
  log_probs_test,
  levels = c("0", "1"),
  direction = "<"
)
cat("AUC:", auc(log_roc), "\n")
cat(
  "F1 Score:",
  F1_Score(
    y_pred = as_outcome_factor(log_pred_test),
    y_true = test_data$Outcome,
    positive = "1"
  ),
  "\n"
)

# Random Forest Metrics
cat("\nRandom Forest Metrics (Test Data):\n")
rf_roc <- roc(
  test_data$Outcome,
  rf_probs_test,
  levels = c("0", "1"),
  direction = "<"
)
cat("AUC:", auc(rf_roc), "\n")
cat(
  "F1 Score:",
  F1_Score(
    y_pred = as_outcome_factor(rf_pred_test),
    y_true = test_data$Outcome,
    positive = "1"
  ),
  "\n"
)