# Pima Indians Diabetes — exploratory analysis and logistic regression.
# (Original file had lost its newlines, leaving the library calls and data
# loading commented out; restored here as runnable code.)

library(mlbench)    # provides the PimaIndiansDiabetes dataset
library(ggplot2)
library(gridExtra)
library(dplyr)
library(MASS)
library(tidyr)
library(patchwork)  # for combining plots if desired

data(PimaIndiansDiabetes)
# ?PimaIndiansDiabetes  # run interactively for variable descriptions
pid <- PimaIndiansDiabetes
head(pid)

# Context
# This dataset is originally from the National Institute of Diabetes and
# Digestive and Kidney Diseases. The objective of the dataset is to
# diagnostically predict whether or not a patient has diabetes, based on
# certain diagnostic measurements included in the dataset.
# Several constraints were placed on the selection of these instances from a
# larger database. In particular, all patients here are females at least
# 21 years old of Pima Indian heritage.
#
# Content
# The dataset consists of several medical predictor variables and one target
# variable, Outcome. Predictor variables include the number of pregnancies
# the patient has had, their BMI, insulin level, age, and so on.
#
# Acknowledgements
# Smith, J.W., Everhart, J.E., Dickson, W.C., Knowler, W.C., & Johannes, R.S.
# (1988). Using the ADAP learning algorithm to forecast the onset of diabetes
# mellitus. In Proceedings of the Symposium on Computer Applications and
# Medical Care (pp. 261--265). IEEE Computer Society Press.
# Class balance ----
# Check for class imbalance (unbalanced classes).
table(pid$diabetes)
# Percentage distribution.
prop.table(table(pid$diabetes))

# Per-predictor boxplots by diagnosis ----
# Helper: one boxplot of the named column split by diabetes status.
# `var` is a column name as a string; `.data[[var]]` looks it up in `pid`.
box_by_diabetes <- function(var, title, ylab) {
  ggplot(pid, aes(x = diabetes, y = .data[[var]], fill = diabetes)) +
    geom_boxplot(alpha = 0.7) +
    labs(title = title, x = "Diabetes", y = ylab) +
    theme_minimal()
}

p1 <- box_by_diabetes("glucose", "Glucose Levels", "Glucose")
p2 <- box_by_diabetes("pregnant", "Number of Pregnancies", "Number of Pregnancies")
p3 <- box_by_diabetes("pressure", "Blood Pressure", "Blood Pressure")
p4 <- box_by_diabetes("triceps", "Skin Thickness", "Skin Thickness (triceps)")
p5 <- box_by_diabetes("insulin", "Insulin", "Insulin")
p6 <- box_by_diabetes("mass", "BMI", "BMI (mass)")
p7 <- box_by_diabetes("pedigree", "Diabetes Pedigree", "Diabetes Pedigree")
p8 <- box_by_diabetes("age", "Age", "Age")
p1; p2; p3; p4; p5; p6; p7; p8

grid.arrange(p1, p2, p3, p4, nrow = 2)
grid.arrange(p5, p6, p7, p8, nrow = 2)

# All boxplots at once via facets on the long-format data.
pid_long <- pid %>%
  pivot_longer(cols = -diabetes, names_to = "variable", values_to = "value")

# BUG FIX: the original mapped `fill = type`, but no `type` column exists in
# this dataset — the grouping factor is `diabetes`.
ggplot(pid_long, aes(x = diabetes, y = value, fill = diabetes)) +
  geom_boxplot(alpha = 0.7) +
  facet_wrap(~ variable, scales = "free_y") +
  labs(
    title = "Boxplots of Predictors by Diabetes Diagnosis (Pima Data)",
    x = "Diabetes (type)",
    y = "Value"
  ) +
  scale_fill_brewer(palette = "Set2") +
  theme_minimal(base_size = 14)

# Logistic models ----
# Full model, then two reduced models dropping the least significant terms.
model <- glm(diabetes ~ ., data = pid, family = binomial)
summary(model)

model1 <- glm(diabetes ~ . - triceps, data = pid, family = binomial)
summary(model1)

model2 <- glm(diabetes ~ . - triceps - insulin, data = pid, family = binomial)
summary(model2)

# Odds ratios for the reduced model.
# Example: BMI (variable `mass`) increases diabetes odds by ~9% per unit.
exp(coef(model2))

# ROC ----
# Use requireNamespace() (not require()) to test availability without
# attaching; library() then loads it properly.
if (!requireNamespace("pROC", quietly = TRUE)) install.packages("pROC")
library(pROC)

# In-sample predicted probabilities from the full model.
probs <- predict(model, type = "response")
roc_obj <- roc(pid$diabetes, probs)
roc_obj$auc

# Optimal threshold (Youden's J statistic).
coords_best <- coords(roc_obj, "best",
                      ret = c("threshold", "sensitivity", "specificity"))
coords_best

# Confusion matrices ----
# Helper: confusion matrix plus accuracy / sensitivity / specificity,
# assuming factor levels "pos" (diabetic) and "neg" (healthy).
confusion_metrics <- function(observed, predicted) {
  cm <- table(Observed = observed, Predicted = predicted)
  print(cm)
  TP <- cm["pos", "pos"]
  TN <- cm["neg", "neg"]
  FP <- cm["neg", "pos"]
  FN <- cm["pos", "neg"]
  data.frame(
    Accuracy    = (TP + TN) / sum(cm),
    Sensitivity = TP / (TP + FN),
    Specificity = TN / (TN + FP)
  )
}

# Threshold = 0.5.
pid$predicted <- predict(model, type = "response")
pid$pred_class <- ifelse(pid$predicted >= 0.5, "pos", "neg")
confusion_metrics(pid$diabetes, pid$pred_class)

# Threshold = Youden-optimal (~0.354).
best_t <- coords_best["threshold"]
pid$pred_class_best <- ifelse(pid$predicted >= best_t[[1]], "pos", "neg")
confusion_metrics(pid$diabetes, pid$pred_class_best)

# ROC plot with the chance diagonal for reference.
plot(roc_obj, col = "blue", lwd = 2, main = "ROC Curve for Pima Indians Data")
abline(a = 0, b = 1, lty = 2, col = "gray")

# The predicted probability curve (S-curve) ----
# The most intuitive "logistic" visual: how the probability of diabetes
# climbs as glucose increases, with every other predictor held at its median.
predicted_pid <- data.frame(
  glucose = seq(min(pid$glucose), max(pid$glucose), length.out = 100)
)
for (v in c("pregnant", "pressure", "triceps", "insulin",
            "mass", "pedigree", "age")) {
  predicted_pid[[v]] <- median(pid[[v]])
}

predicted_pid$prob <- predict(model, newdata = predicted_pid, type = "response")

# `linewidth` replaces the deprecated `size` aesthetic for lines
# (ggplot2 >= 3.4).
ggplot(predicted_pid, aes(x = glucose, y = prob)) +
  geom_line(color = "blue", linewidth = 1) +
  labs(
    title = "The 'Risk' Curve",
    subtitle = "How Glucose levels impact the probability of Diabetes",
    x = "Glucose Level",
    y = "Probability of Diabetes (0 to 1)"
  ) +
  scale_y_continuous(labels = scales::percent) +
  theme_minimal()

# Density plot ----
# Instead of a boxplot, a density plot shows where the "bulk" of each group lies.
# If the "Diabetes" cloud is shifted far to the right of the "Healthy" cloud,
# the audience immediately sees that Glucose is a strong predictor.
# BUG FIX: the original plotted `df`, which is never defined anywhere in this
# script — the data frame is `pid`.
ggplot(pid, aes(x = glucose, fill = diabetes)) +
  geom_density(alpha = 0.5) +
  labs(
    title = "Where do patients 'cluster'?",
    subtitle = "Healthy vs. Diabetic distribution based on Glucose",
    x = "Glucose",
    y = "Density"
  ) +
  theme_minimal()

# The Precision-Recall curve (best for unbalanced data) ----
# With unbalanced classes the ROC curve can be "too optimistic"; a
# precision-recall curve is more honest. It shows: "if we predict someone has
# diabetes, how often are we right (precision) and how many did we catch
# (recall)?"
# requireNamespace() tests availability without attaching (require() is for
# loading and merely returns FALSE on failure).
if (!requireNamespace("precrec", quietly = TRUE)) install.packages("precrec")
library(precrec)

# BUG FIX: labels also referenced the undefined `df`; use `pid$diabetes`.
precrec_obj <- evalmod(scores = probs, labels = pid$diabetes)
autoplot(precrec_obj, curvetype = "PRC") +
  labs(
    title = "Precision-Recall Curve",
    subtitle = "Better for Unbalanced Data analysis"
  )