# Pima Indians Diabetes — exploratory analysis and logistic regression.
# (Original file had lost its newlines, leaving the library calls and data
# loading commented out; restored here as runnable code.)

library(mlbench)    # provides the PimaIndiansDiabetes dataset
library(ggplot2)
library(gridExtra)
library(dplyr)
library(MASS)
library(tidyr)
library(patchwork)  # for combining plots if desired

data(PimaIndiansDiabetes)
# ?PimaIndiansDiabetes  # run interactively for variable descriptions
pid <- PimaIndiansDiabetes
head(pid)

# Context
# This dataset is originally from the National Institute of Diabetes and
# Digestive and Kidney Diseases. The objective of the dataset is to
# diagnostically predict whether or not a patient has diabetes, based on
# certain diagnostic measurements included in the dataset.
# Several constraints were placed on the selection of these instances from a
# larger database. In particular, all patients here are females at least
# 21 years old of Pima Indian heritage.
#
# Content
# The dataset consists of several medical predictor variables and one target
# variable, Outcome. Predictor variables include the number of pregnancies
# the patient has had, their BMI, insulin level, age, and so on.
#
# Acknowledgements
# Smith, J.W., Everhart, J.E., Dickson, W.C., Knowler, W.C., & Johannes, R.S.
# (1988). Using the ADAP learning algorithm to forecast the onset of diabetes
# mellitus. In Proceedings of the Symposium on Computer Applications and
# Medical Care (pp. 261--265). IEEE Computer Society Press.
# Class balance ----
# Check for class imbalance (unbalanced classes).
table(pid$diabetes)
# Percentage distribution.
prop.table(table(pid$diabetes))

# Per-predictor boxplots by diagnosis ----
# Helper: one boxplot of the named column split by diabetes status.
# `var` is a column name as a string; `.data[[var]]` looks it up in `pid`.
box_by_diabetes <- function(var, title, ylab) {
  ggplot(pid, aes(x = diabetes, y = .data[[var]], fill = diabetes)) +
    geom_boxplot(alpha = 0.7) +
    labs(title = title, x = "Diabetes", y = ylab) +
    theme_minimal()
}

p1 <- box_by_diabetes("glucose", "Glucose Levels", "Glucose")
p2 <- box_by_diabetes("pregnant", "Number of Pregnancies", "Number of Pregnancies")
p3 <- box_by_diabetes("pressure", "Blood Pressure", "Blood Pressure")
p4 <- box_by_diabetes("triceps", "Skin Thickness", "Skin Thickness (triceps)")
p5 <- box_by_diabetes("insulin", "Insulin", "Insulin")
p6 <- box_by_diabetes("mass", "BMI", "BMI (mass)")
p7 <- box_by_diabetes("pedigree", "Diabetes Pedigree", "Diabetes Pedigree")
p8 <- box_by_diabetes("age", "Age", "Age")
p1; p2; p3; p4; p5; p6; p7; p8

grid.arrange(p1, p2, p3, p4, nrow = 2)
grid.arrange(p5, p6, p7, p8, nrow = 2)

# All boxplots at once via facets on the long-format data.
pid_long <- pid %>%
  pivot_longer(cols = -diabetes, names_to = "variable", values_to = "value")

# BUG FIX: the original mapped `fill = type`, but no `type` column exists in
# this dataset — the grouping factor is `diabetes`.
ggplot(pid_long, aes(x = diabetes, y = value, fill = diabetes)) +
  geom_boxplot(alpha = 0.7) +
  facet_wrap(~ variable, scales = "free_y") +
  labs(
    title = "Boxplots of Predictors by Diabetes Diagnosis (Pima Data)",
    x = "Diabetes (type)",
    y = "Value"
  ) +
  scale_fill_brewer(palette = "Set2") +
  theme_minimal(base_size = 14)

# Logistic models ----
# Full model, then two reduced models dropping the least significant terms.
model <- glm(diabetes ~ ., data = pid, family = binomial)
summary(model)

model1 <- glm(diabetes ~ . - triceps, data = pid, family = binomial)
summary(model1)

model2 <- glm(diabetes ~ . - triceps - insulin, data = pid, family = binomial)
summary(model2)

# Odds ratios for the reduced model.
# Example: BMI (variable `mass`) increases diabetes odds by ~9% per unit.
exp(coef(model2))

# ROC ----
# Use requireNamespace() (not require()) to test availability without
# attaching; library() then loads it properly.
if (!requireNamespace("pROC", quietly = TRUE)) install.packages("pROC")
library(pROC)

# In-sample predicted probabilities from the full model.
probs <- predict(model, type = "response")
roc_obj <- roc(pid$diabetes, probs)
roc_obj$auc

# Optimal threshold (Youden's J statistic).
coords_best <- coords(roc_obj, "best",
                      ret = c("threshold", "sensitivity", "specificity"))
coords_best

# Confusion matrices ----
# Helper: confusion matrix plus accuracy / sensitivity / specificity,
# assuming factor levels "pos" (diabetic) and "neg" (healthy).
confusion_metrics <- function(observed, predicted) {
  cm <- table(Observed = observed, Predicted = predicted)
  print(cm)
  TP <- cm["pos", "pos"]
  TN <- cm["neg", "neg"]
  FP <- cm["neg", "pos"]
  FN <- cm["pos", "neg"]
  data.frame(
    Accuracy    = (TP + TN) / sum(cm),
    Sensitivity = TP / (TP + FN),
    Specificity = TN / (TN + FP)
  )
}

# Threshold = 0.5.
pid$predicted <- predict(model, type = "response")
pid$pred_class <- ifelse(pid$predicted >= 0.5, "pos", "neg")
confusion_metrics(pid$diabetes, pid$pred_class)

# Threshold = Youden-optimal (~0.354).
best_t <- coords_best["threshold"]
pid$pred_class_best <- ifelse(pid$predicted >= best_t[[1]], "pos", "neg")
confusion_metrics(pid$diabetes, pid$pred_class_best)

# ROC plot with the chance diagonal for reference.
plot(roc_obj, col = "blue", lwd = 2, main = "ROC Curve for Pima Indians Data")
abline(a = 0, b = 1, lty = 2, col = "gray")

# The predicted probability curve (S-curve) ----
# The most intuitive "logistic" visual: how the probability of diabetes
# climbs as glucose increases, with every other predictor held at its median.
predicted_pid <- data.frame(
  glucose = seq(min(pid$glucose), max(pid$glucose), length.out = 100)
)
for (v in c("pregnant", "pressure", "triceps", "insulin",
            "mass", "pedigree", "age")) {
  predicted_pid[[v]] <- median(pid[[v]])
}

predicted_pid$prob <- predict(model, newdata = predicted_pid, type = "response")

# `linewidth` replaces the deprecated `size` aesthetic for lines
# (ggplot2 >= 3.4).
ggplot(predicted_pid, aes(x = glucose, y = prob)) +
  geom_line(color = "blue", linewidth = 1) +
  labs(
    title = "The 'Risk' Curve",
    subtitle = "How Glucose levels impact the probability of Diabetes",
    x = "Glucose Level",
    y = "Probability of Diabetes (0 to 1)"
  ) +
  scale_y_continuous(labels = scales::percent) +
  theme_minimal()

# Density plot ----
# Instead of a boxplot, a density plot shows where the "bulk" of each group lies.
# If the "Diabetes" cloud is shifted far to the right of the "Healthy" cloud,
# the audience immediately sees that Glucose is a strong predictor.
# BUG FIX: the original plotted `df`, which is never defined anywhere in this
# script — the data frame is `pid`.
ggplot(pid, aes(x = glucose, fill = diabetes)) +
  geom_density(alpha = 0.5) +
  labs(
    title = "Where do patients 'cluster'?",
    subtitle = "Healthy vs. Diabetic distribution based on Glucose",
    x = "Glucose",
    y = "Density"
  ) +
  theme_minimal()

# The Precision-Recall curve (best for unbalanced data) ----
# With unbalanced classes the ROC curve can be "too optimistic"; a
# precision-recall curve is more honest. It shows: "if we predict someone has
# diabetes, how often are we right (precision) and how many did we catch
# (recall)?"
# requireNamespace() tests availability without attaching (require() is for
# loading and merely returns FALSE on failure).
if (!requireNamespace("precrec", quietly = TRUE)) install.packages("precrec")
library(precrec)

# BUG FIX: labels also referenced the undefined `df`; use `pid$diabetes`.
precrec_obj <- evalmod(scores = probs, labels = pid$diabetes)
autoplot(precrec_obj, curvetype = "PRC") +
  labs(
    title = "Precision-Recall Curve",
    subtitle = "Better for Unbalanced Data analysis"
  )