# Wine Quality datasets (UCI) -- loaded for reference only; the analysis
# below uses rattle's 'wine' dataset instead.
# For Red Wine
red_wine <- read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv",
                     sep = ";")
# For White Wine
white_wine <- read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv",
                       sep = ";")

# Load necessary libraries (install on first run, then attach)
if (!require(rattle)) install.packages("rattle")
if (!require(corrplot)) install.packages("corrplot")
library(rattle)
library(corrplot)
library(ggplot2)

# data(wine)
# Note: This version is often the "Wine" dataset (origin), not
# "Wine Quality" (physicochemical).
?wine

# Load data and remove the categorical 'Type' for a pure regression exercise
data(wine)
wine_reg <- wine[, -1]  # Removing 'Type' column
head(wine_reg)

# NOTE(fix): the original script called predict(optimal_model) at this point,
# before any model had been fitted -- a guaranteed runtime error. It would
# also have added a 'Predicted' column that leaks into the 'Alcohol ~ .'
# full model fitted later. Predictions are computed after model selection,
# in the plotting section.

# 1. Full Model (The "Kitchen Sink" approach), then manual backward
# elimination: at each step the least significant predictor (highest
# p-value) is dropped and the model is refitted.
p_model <- lm(Alcohol ~ Malic + Ash + Alcalinity + Magnesium + Phenols +
                Flavanoids + Nonflavanoids + Proanthocyanins + Color + Hue +
                Dilution + Proline, data = wine_reg)
summary(p_model)

# Drop Magnesium
p_model <- lm(Alcohol ~ Malic + Ash + Alcalinity + Phenols + Flavanoids +
                Nonflavanoids + Proanthocyanins + Color + Hue + Dilution +
                Proline, data = wine_reg)
summary(p_model)

# Drop Flavanoids
p_model <- lm(Alcohol ~ Malic + Ash + Alcalinity + Phenols + Nonflavanoids +
                Proanthocyanins + Color + Hue + Dilution + Proline,
              data = wine_reg)
summary(p_model)

# Drop Phenols
p_model <- lm(Alcohol ~ Malic + Ash + Alcalinity + Nonflavanoids +
                Proanthocyanins + Color + Hue + Dilution + Proline,
              data = wine_reg)
summary(p_model)

# Drop Ash and Nonflavanoids
p_model <- lm(Alcohol ~ Malic + Alcalinity + Proanthocyanins + Color + Hue +
                Dilution + Proline, data = wine_reg)
summary(p_model)

# Drop Hue
p_model <- lm(Alcohol ~ Malic + Alcalinity + Proanthocyanins + Color +
                Dilution + Proline, data = wine_reg)
summary(p_model)

# Drop Proanthocyanins -- final manually selected model
p_final_model <- lm(Alcohol ~ Malic + Alcalinity + Color + Dilution + Proline,
                    data = wine_reg)
summary(p_final_model)
summary(p_final_model)$fstatistic
# Inference on the manually selected model
confint(p_final_model)
extractAIC(p_final_model)
## sigma() would report the residual standard error if needed

# Model Selection (Stepwise Backward Elimination) - p-value
# NOTE(fix): fit on the constituent columns only, so a stray 'Predicted'
# column (added elsewhere in this script) can never leak into the
# 'Alcohol ~ .' formula or the correlation matrices below.
constituents <- wine_reg[, setdiff(names(wine_reg), "Predicted"), drop = FALSE]
full_model <- lm(Alcohol ~ ., data = constituents)
summary(full_model)

# Model Selection (Stepwise Backward Elimination) - STEP Function
# This automatically removes variables that do not contribute to the
# model's quality (AIC criterion).
optimal_model <- step(full_model, direction = "backward", trace = 1)

# Stepwise Regression with full visibility
# direction = "both" checks for adding AND removing variables at each step
stepwise_process <- step(full_model, direction = "both", trace = 1)

# To see a summary table of the steps taken:
stepwise_process$anova
optimal_model$anova

# Final Model Summary
summary(optimal_model)
confint(optimal_model)
extractAIC(optimal_model)

# Visualization 1: Correlation Matrix (To check for multicollinearity)
cor_matrix <- cor(constituents)
corrplot(cor_matrix, method = "color", type = "upper",
         tl.col = "black", tl.srt = 45)

# Visual Correlation Heatmap with Numbers
if (!require(ggcorrplot)) install.packages("ggcorrplot")
library(ggcorrplot)
corr_matrix <- cor(constituents)
ggcorrplot(corr_matrix, hc.order = TRUE, type = "lower",
           lab = TRUE, lab_size = 3, method = "circle",
           colors = c("tomato2", "white", "springgreen3"),
           title = "Correlogram of Wine Constituents")

# Regression Diagnostics (The 4-panel plot)
# Visualization 2: Regression Diagnostics
# This generates the 4 essential plots:
# (1) Linearity, (2) Normality of Residuals,
# (3) Homoscedasticity, (4) Outliers
par(mfrow = c(2, 2))
plot(optimal_model, col = "steelblue", pch = 16)
par(mfrow = c(1, 1))  # restore the default plotting layout
# Partial Residual Plots
# Requires the 'car' package.
# NOTE(fix): install conditionally (matching the if (!require(...)) pattern
# used elsewhere in this file) instead of re-installing on every run.
if (!require(car)) install.packages("car")
library(car)
crPlots(optimal_model)

# Variable Importance Plot
# Using the 'caret' package for easy scaling (same conditional-install fix).
if (!require(caret)) install.packages("caret")
library(caret)
importance <- varImp(optimal_model, scale = FALSE)
plot(importance, main = "Variable Importance in Wine Alcohol Prediction")

# Get the predicted values from your optimal model
# (the model exists by now, so this is the correct place for this step)
wine_reg$Predicted <- predict(optimal_model)

# Create the observed-vs-predicted plot
ggplot(wine_reg, aes(x = Predicted, y = Alcohol)) +
  geom_point(color = "darkred", alpha = 0.6, size = 2) +  # the actual data points
  geom_abline(intercept = 0, slope = 1, linetype = "dashed",
              color = "blue", size = 1) +  # the 1:1 line ('size' -> 'linewidth' in ggplot2 >= 3.4)
  geom_smooth(method = "lm", color = "black", se = TRUE) +  # fitted regression line
  labs(title = "Optimal Model Performance: Alcohol Content",
       subtitle = "Observed Alcohol vs. Model Predictions",
       x = "Predicted Alcohol (%)",
       y = "Actual Alcohol (%)") +
  theme_minimal()

# EXAMPLE ------------------------------------------------------------------
# Create the hypothetical base data (17 points following a trend)
set.seed(42)  # For reproducibility
x_base <- runif(17, 2, 8)
y_base <- 10 + 2 * x_base + rnorm(17, 0, 1)

# Inject the three strategic "modified" points
x_all <- c(x_base,
           5.0,   # X location for the outlier (near mean)
           14.0,  # X location for high leverage (extreme right)
           14.0)  # X location for influence (extreme right)
y_all <- c(y_base,
           10.0,  # Y location for outlier (far from trend line)
           38.0,  # Y location for high leverage (on trend line)
           15.0)  # Y location for influence (far from trend line)

# Label the points for identification
point_type <- factor(c(rep("Standard Data", 17),
                       "Outlier (Large Residual)",
                       "High Leverage (Unusual X)",
                       "Influential (Leverage + Residual)"))

# Compile into a data frame
df_hyp <- data.frame(X = x_all, Y = y_all, Type = point_type)

# Perform basic regression on the full set to show the pulled trend line
full_lm <- lm(Y ~ X, data = df_hyp)
intercept <- coef(full_lm)[1]
slope <- coef(full_lm)[2]
# Create the visualization using ggplot2: each injected point gets its own
# color so the three kinds of unusual observation are distinguishable.
unusual_colors <- c(
  "Standard Data" = "steelblue",
  "Outlier (Large Residual)" = "#FF4136",           # Red
  "High Leverage (Unusual X)" = "#2ECC40",          # Green
  "Influential (Leverage + Residual)" = "#FF851B"   # Orange
)

ggplot(df_hyp, aes(x = X, y = Y)) +
  geom_point(aes(color = Type), size = 4) +
  # The regression line fitted on all points, showing how it gets pulled
  geom_abline(intercept = intercept, slope = slope,
              linetype = "solid", color = "black", size = 1) +
  scale_color_manual(values = unusual_colors) +
  labs(title = "Isolating the Taxonomy of Unusual Observations",
       subtitle = "Hypothetical Biological Variable",
       x = "Biological Predictor (e.g., pH or Color)",
       y = "Biological Outcome (e.g., Alcohol %)") +
  theme_minimal() +
  theme(legend.position = "bottom")

# Optional: Print Cook's Distance to confirm mathematical influence
# points 18, 19, 20 correspond to our injected points
print(cooks.distance(full_lm))