# Wine Quality datasets (UCI) -- loaded for reference only; the analysis
# below uses rattle's 'wine' dataset instead.
# For Red Wine
red_wine <- read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv",
                     sep = ";")
# For White Wine
white_wine <- read.csv("https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-white.csv",
                       sep = ";")

# Load necessary libraries (install on first run, then attach)
if (!require(rattle)) install.packages("rattle")
if (!require(corrplot)) install.packages("corrplot")
library(rattle)
library(corrplot)
library(ggplot2)

# data(wine)
# Note: This version is often the "Wine" dataset (origin), not
# "Wine Quality" (physicochemical).
?wine

# Load data and remove the categorical 'Type' for a pure regression exercise
data(wine)
wine_reg <- wine[, -1]  # Removing 'Type' column
head(wine_reg)

# NOTE(fix): the original script called predict(optimal_model) at this point,
# before any model had been fitted -- a guaranteed runtime error. It would
# also have added a 'Predicted' column that leaks into the 'Alcohol ~ .'
# full model fitted later. Predictions are computed after model selection,
# in the plotting section.

# 1. Full Model (The "Kitchen Sink" approach), then manual backward
# elimination: at each step the least significant predictor (highest
# p-value) is dropped and the model is refitted.
p_model <- lm(Alcohol ~ Malic + Ash + Alcalinity + Magnesium + Phenols +
                Flavanoids + Nonflavanoids + Proanthocyanins + Color + Hue +
                Dilution + Proline, data = wine_reg)
summary(p_model)

# Drop Magnesium
p_model <- lm(Alcohol ~ Malic + Ash + Alcalinity + Phenols + Flavanoids +
                Nonflavanoids + Proanthocyanins + Color + Hue + Dilution +
                Proline, data = wine_reg)
summary(p_model)

# Drop Flavanoids
p_model <- lm(Alcohol ~ Malic + Ash + Alcalinity + Phenols + Nonflavanoids +
                Proanthocyanins + Color + Hue + Dilution + Proline,
              data = wine_reg)
summary(p_model)

# Drop Phenols
p_model <- lm(Alcohol ~ Malic + Ash + Alcalinity + Nonflavanoids +
                Proanthocyanins + Color + Hue + Dilution + Proline,
              data = wine_reg)
summary(p_model)

# Drop Ash and Nonflavanoids
p_model <- lm(Alcohol ~ Malic + Alcalinity + Proanthocyanins + Color + Hue +
                Dilution + Proline, data = wine_reg)
summary(p_model)

# Drop Hue
p_model <- lm(Alcohol ~ Malic + Alcalinity + Proanthocyanins + Color +
                Dilution + Proline, data = wine_reg)
summary(p_model)

# Drop Proanthocyanins -- final manually selected model
p_final_model <- lm(Alcohol ~ Malic + Alcalinity + Color + Dilution + Proline,
                    data = wine_reg)
summary(p_final_model)
summary(p_final_model)$fstatistic
# Inference on the manually selected model
confint(p_final_model)
extractAIC(p_final_model)
## sigma() would report the residual standard error if needed

# Model Selection (Stepwise Backward Elimination) - p-value
# NOTE(fix): fit on the constituent columns only, so a stray 'Predicted'
# column (added elsewhere in this script) can never leak into the
# 'Alcohol ~ .' formula or the correlation matrices below.
constituents <- wine_reg[, setdiff(names(wine_reg), "Predicted"), drop = FALSE]
full_model <- lm(Alcohol ~ ., data = constituents)
summary(full_model)

# Model Selection (Stepwise Backward Elimination) - STEP Function
# This automatically removes variables that do not contribute to the
# model's quality (AIC criterion).
optimal_model <- step(full_model, direction = "backward", trace = 1)

# Stepwise Regression with full visibility
# direction = "both" checks for adding AND removing variables at each step
stepwise_process <- step(full_model, direction = "both", trace = 1)

# To see a summary table of the steps taken:
stepwise_process$anova
optimal_model$anova

# Final Model Summary
summary(optimal_model)
confint(optimal_model)
extractAIC(optimal_model)

# Visualization 1: Correlation Matrix (To check for multicollinearity)
cor_matrix <- cor(constituents)
corrplot(cor_matrix, method = "color", type = "upper",
         tl.col = "black", tl.srt = 45)

# Visual Correlation Heatmap with Numbers
if (!require(ggcorrplot)) install.packages("ggcorrplot")
library(ggcorrplot)
corr_matrix <- cor(constituents)
ggcorrplot(corr_matrix, hc.order = TRUE, type = "lower",
           lab = TRUE, lab_size = 3, method = "circle",
           colors = c("tomato2", "white", "springgreen3"),
           title = "Correlogram of Wine Constituents")

# Regression Diagnostics (The 4-panel plot)
# Visualization 2: Regression Diagnostics
# This generates the 4 essential plots:
# (1) Linearity, (2) Normality of Residuals,
# (3) Homoscedasticity, (4) Outliers
par(mfrow = c(2, 2))
plot(optimal_model, col = "steelblue", pch = 16)
par(mfrow = c(1, 1))  # restore the default plotting layout
# Partial Residual Plots
# Requires the 'car' package.
# NOTE(fix): install conditionally (matching the if (!require(...)) pattern
# used elsewhere in this file) instead of re-installing on every run.
if (!require(car)) install.packages("car")
library(car)
crPlots(optimal_model)

# Variable Importance Plot
# Using the 'caret' package for easy scaling (same conditional-install fix).
if (!require(caret)) install.packages("caret")
library(caret)
importance <- varImp(optimal_model, scale = FALSE)
plot(importance, main = "Variable Importance in Wine Alcohol Prediction")

# Get the predicted values from your optimal model
# (the model exists by now, so this is the correct place for this step)
wine_reg$Predicted <- predict(optimal_model)

# Create the observed-vs-predicted plot
ggplot(wine_reg, aes(x = Predicted, y = Alcohol)) +
  geom_point(color = "darkred", alpha = 0.6, size = 2) +  # the actual data points
  geom_abline(intercept = 0, slope = 1, linetype = "dashed",
              color = "blue", size = 1) +  # the 1:1 line ('size' -> 'linewidth' in ggplot2 >= 3.4)
  geom_smooth(method = "lm", color = "black", se = TRUE) +  # fitted regression line
  labs(title = "Optimal Model Performance: Alcohol Content",
       subtitle = "Observed Alcohol vs. Model Predictions",
       x = "Predicted Alcohol (%)",
       y = "Actual Alcohol (%)") +
  theme_minimal()

# EXAMPLE ------------------------------------------------------------------
# Create the hypothetical base data (17 points following a trend)
set.seed(42)  # For reproducibility
x_base <- runif(17, 2, 8)
y_base <- 10 + 2 * x_base + rnorm(17, 0, 1)

# Inject the three strategic "modified" points
x_all <- c(x_base,
           5.0,   # X location for the outlier (near mean)
           14.0,  # X location for high leverage (extreme right)
           14.0)  # X location for influence (extreme right)
y_all <- c(y_base,
           10.0,  # Y location for outlier (far from trend line)
           38.0,  # Y location for high leverage (on trend line)
           15.0)  # Y location for influence (far from trend line)

# Label the points for identification
point_type <- factor(c(rep("Standard Data", 17),
                       "Outlier (Large Residual)",
                       "High Leverage (Unusual X)",
                       "Influential (Leverage + Residual)"))

# Compile into a data frame
df_hyp <- data.frame(X = x_all, Y = y_all, Type = point_type)

# Perform basic regression on the full set to show the pulled trend line
full_lm <- lm(Y ~ X, data = df_hyp)
intercept <- coef(full_lm)[1]
slope <- coef(full_lm)[2]
# Create the visualization using ggplot2: each injected point gets its own
# color so the three kinds of unusual observation are distinguishable.
unusual_colors <- c(
  "Standard Data" = "steelblue",
  "Outlier (Large Residual)" = "#FF4136",           # Red
  "High Leverage (Unusual X)" = "#2ECC40",          # Green
  "Influential (Leverage + Residual)" = "#FF851B"   # Orange
)

ggplot(df_hyp, aes(x = X, y = Y)) +
  geom_point(aes(color = Type), size = 4) +
  # The regression line fitted on all points, showing how it gets pulled
  geom_abline(intercept = intercept, slope = slope,
              linetype = "solid", color = "black", size = 1) +
  scale_color_manual(values = unusual_colors) +
  labs(title = "Isolating the Taxonomy of Unusual Observations",
       subtitle = "Hypothetical Biological Variable",
       x = "Biological Predictor (e.g., pH or Color)",
       y = "Biological Outcome (e.g., Alcohol %)") +
  theme_minimal() +
  theme(legend.position = "bottom")

# Optional: Print Cook's Distance to confirm mathematical influence
# points 18, 19, 20 correspond to our injected points
print(cooks.distance(full_lm))