library(ggplot2) library(dplyr) library(lme4) library(tidyr) library(reshape2) library(GGally) library(tidyverse) # ###################### # SIMULATE plants data ###################### # set.seed(2026) # Setting the seed for 2026! # # 1. Setup n_plants <- 10 n_weeks <- 6 weeks <- 1:6 # # 2. Define Parameters # Baseline height: 10cm # Control growth: 1.0 cm/week # Fertilizer boost: +2.5 cm/week (Significant Effect) # Random Intercept SD: 2.0 (Plants start at very different heights) # Random Slope SD: 0.6 (Plants have different "genetic" growth potential) # plant_metadata <- data.frame( plant_id = factor(1:n_plants), treatment = rep(c("Control", "Fertilizer"), each = n_plants/2), u0i = rnorm(n_plants, 0, 2.0), # Random Intercept deviation u1i = rnorm(n_plants, 0, 0.6) # Random Slope deviation ) # # 3. Create the Long-Form Dataset plant_growth <- plant_metadata %>% crossing(week = weeks) %>% mutate( # The "Fixed" part of the biology base_height = 10, group_slope = ifelse(treatment == "Fertilizer", 3.5, 1.0), # 1.0 vs 3.5 is a huge gap # The "Random" part + Error height = base_height + u0i + (group_slope + u1i) * week + rnorm(n(), 0, 0.5), # Clean up for the students height = round(height, 2) ) %>% select(plant_id, treatment, week, height) # Quick look at the "Spread" summary(plant_growth$height) plants = plant_growth head(plants,10) # # This dataset represents a typical biological experiment involving repeated measurements on # individual organisms. Each plant is measured weekly, leading to correlated observations # within each plant. Linear mixed‑effects models allow us to model both the fixed effects # (treatment, time) and the random biological variability among individual plants. # # # INDIVIDUAL PLANT GROWTH CURVES (raw data) # This produces: # • one subplot per plant # • same axes across panels # • clear visualization of variability in growth trajectories ggplot(plants, aes(x = week, y = height)) + geom_line(color = "steelblue", size = 1) + geom_point(color = "steelblue", size = 2) + facet_wrap(~ plant_id, ncol = 5) + labs( title = "Individual Plant Growth Curves (Raw Data)", x = "Week", y = "Height (cm)" ) + theme_minimal(base_size = 14) # # # RAW DATA - All together # ggplot(plants, aes(x = week, y = height)) + geom_point(alpha = 0.6) + labs(title = "Raw Data", x = "Week", y = "Height (cm)") + theme_minimal(base_size = 14) # # # RAW DATA - Points by Group # ggplot(plants, aes(x = week, y = height, color = treatment)) + geom_point(alpha = 0.6) + # geom_smooth(method = "lm", se = FALSE, size = 0.8) + labs(title = "Raw Data (by Treatment)", # subtitle = "Assumes independence; forces a single trend per treatment", x = "Week", y = "Height (cm)") + theme_minimal(base_size = 14) # # RAW DATA - Profiles # ggplot(plants_lm, aes(x = week, y = height, group = plant_id, color = treatment)) + # Spaghetti lines (individual plants) geom_line(alpha = 0.4) + geom_point(alpha = 0.4) + # # Linear model fitted trend per treatment # geom_line(aes(y = fit_lm), size = 1, alpha = 0.9) + labs( title = "Raw Data - Profiles", # subtitle = "LM forces a single trend per treatment and ignores plant-level variability", x = "Week", y = "Height (cm)" ) + theme_minimal(base_size = 14) # # # LINEAR MODEL lm(height ~ treatment + week, data = plants) # # This shows why the linear model is inappropriate: it assumes all observations are # independent and fits a single trend per treatment. # Fit the WRONG model lm_mod <- lm(height ~ treatment * week, data = plants) # Plot: LM fitted lines ggplot(plants, aes(x = week, y = height, color = treatment)) + geom_point(alpha = 0.6) + geom_smooth(method = "lm", se = FALSE, size = 0.8) + labs(title = "Linear Model (No Random Effects)", subtitle = "Assumes independence; forces a single trend per treatment", x = "Week", y = "Height (cm)") + theme_minimal(base_size = 14) # # LM LIST # Add LM predictions - lmList # plants_lm <- plants %>% mutate(fit_lm = predict(lm_mod)) # ggplot(plants_lm, aes(x = week, y = height, group = plant_id, color = treatment)) + # Spaghetti lines (individual plants) geom_line(alpha = 0.4) + geom_point(alpha = 0.4) + # # Linear model fitted trend per treatment geom_line(aes(y = fit_lm), size = 1, alpha = 0.9) + labs( title = "LM List Plot: Wrong Model for Repeated Measurements", subtitle = "LM forces a single trend per treatment and ignores plant-level variability", x = "Week", y = "Height (cm)" ) + theme_minimal(base_size = 14) # # # Add LM fitted values plants_lm <- plants %>% mutate(fit_lm = predict(lm_mod)) ggplot(plants_lm, aes(x = week, y = height)) + geom_line(color = "steelblue", size = 0.6) + geom_point(color = "steelblue", size = 2) + # WRONG model fitted line (same in every panel) geom_line(aes(y = fit_lm), color = "red", size = 0.6) + facet_wrap(~ plant_id, ncol = 5) + labs( title = "Per_Plant Growth Curves with Individual LM Fitted Line", subtitle = "LM fitted using data per plant", x = "Week", y = "Height (cm)" ) + theme_minimal(base_size = 14) # # # # SINGLE LINE DOESN'T GIVE JUSTICE # # Linear model fitted trend per treatment geom_line(aes(y = fit_lm), size = 1.5, alpha = 0.9) + labs( title = "LM List Plot: Wrong Model for Repeated Measurements", subtitle = "LM forces a single trend per treatment and ignores plant-level variability", x = "Week", y = "Height (cm)" ) + theme_minimal(base_size = 14) # # What this plot illustrates # The LM draws one line per treatment without considering that plants differ. # It treats the 120 measurements as independent → violates independence assumption. # Real plants grow differently → LM underestimates uncertainty and overstates significance. # # BOX PLOT ggplot(plants, aes(x = factor(week), y = height, fill = treatment)) + geom_boxplot(alpha = 0.7) + labs( title = "Plant Height by Week and Treatment", x = "Week", y = "Height (cm)" ) + scale_fill_brewer(palette = "Set2") + theme_minimal(base_size = 14) # # Variability per week per treatment # week_sd_treat <- plants %>% group_by(week, treatment) %>% summarise(sd_height = sd(height), .groups = "drop") # ggplot(week_sd_treat, aes(x = week, y = sd_height, color = treatment)) + geom_line(size = 1.2) + geom_point(size = 3) + labs( title = "Variability Changes Over Time", x = "Week", y = "Standard deviation (cm)" ) + theme_minimal(base_size = 14) # # # CORRELATION PLOT # Reshape Data plants_wide <- plants %>% select(plant_id, week, height) %>% pivot_wider(names_from = week, values_from = height, names_prefix = "week_") head(plants_wide) # Compute Correlation Matrix cor_mat <- cor(plants_wide %>% select(starts_with("week_"))) cor_mat # Plot (heatmap style) cor_long <- melt(cor_mat) ggplot(cor_long, aes(Var1, Var2, fill = value)) + geom_tile(color = "white") + scale_fill_gradient2( low = "steelblue", mid = "white", high = "firebrick", midpoint = 0.8, limits = c(0, 1) ) + labs( title = "Pairwise Correlation Between Weeks", subtitle = "Strong correlations illustrate repeated-measures dependence", x = "Week", y = "Week", fill = "Correlation" ) + theme_minimal(base_size = 14) + theme(axis.text.x = element_text(angle = 45, hjust = 1)) # # Pairwise Scatterplots ggpairs( plants_wide %>% select(starts_with("week_")), title = "Pairwise Scatterplots of Plant Height Across Weeks" ) # # # Random intercept model lmer_int <- lmer(height ~ treatment + week + (1 | plant_id), data = plants) summary(lmer_int) # # Get fitted values plants$fit_int <- fitted(lmer_int) ggplot(plants, aes(x = week, y = height, group = plant_id, color = treatment)) + geom_point(alpha = 0.5) + geom_line(aes(y = fit_int), size = 0.6) + labs(title = "Mixed-Effects Model (Random Intercept)", subtitle = "Each plant gets its own baseline height", x = "Week", y = "Height (cm)") + theme_minimal(base_size = 14) # # What this plot illustrates # Plants start at different heights → the model captures individual baselines # Growth trend still shared within treatment # Much more realistic than LM # # # Random intercept + random slope model lmer_slope <- lmer(height ~ treatment + week + (week | plant_id), data = plants) summary(lmer_slope) # # Fitted values plants$fit_slope <- fitted(lmer_slope) # ggplot(plants, aes(x = week, y = height, group = plant_id, color = treatment)) + geom_point(alpha = 0.5) + geom_line(aes(y = fit_slope), size = 0.6) + labs(title = "Mixed-Effects Model (Random Intercept + Slope)", subtitle = "Each plant has its own baseline height & growth rate", x = "Week", y = "Height (cm)") + theme_minimal(base_size = 14) # # What this plot illustrates - Each plant has: # its own intercept # its own growth rate # Lines fan out naturally — capturing biological variability. # This is the statistically and biologically correct model for these data. # # # Combined Comparison Plot (LM vs LME vs LME‑Slope) # library(dplyr) # # Fit models lm_mod <- lm(height ~ treatment * week, data = plants) lmer_int <- lmer(height ~ treatment * week + (1 | plant_id), data = plants) lmer_slope <- lmer(height ~ treatment * week + (week | plant_id), data = plants) # # Add fits plants_plot <- plants %>% mutate( fit_lm = predict(lm_mod), fit_int = fitted(lmer_int), fit_slope = fitted(lmer_slope) ) # Long format for plotting library(tidyr) # long <- plants_plot %>% select(plant_id, treatment, week, height, fit_lm, fit_int, fit_slope) %>% pivot_longer( cols = c(fit_lm, fit_int, fit_slope), names_to = "model", values_to = "fit" ) # # Rename models for plot readability long$model <- factor(long$model, levels = c("fit_lm", "fit_int", "fit_slope"), labels = c("LM", "LME_random intercept", "LME: random intercept + slope") ) # # Plot ggplot(long, aes(x = week, y = height, color = treatment)) + geom_point(alpha = 0.4) + geom_line(aes(y = fit, group = plant_id), size = 0.4) + facet_wrap(~ model, ncol = 1) + labs( title = "Comparison of Models for Repeated Biological Measurements", x = "Week", y = "Height (cm)" ) + theme_minimal(base_size = 14) # # # Optional: Side‑by‑side comparison in one figure ggplot(long, aes(x = week, y = fit, color = treatment, group = plant_id)) + geom_line(size = 0.4) + facet_wrap(~ model) + labs( title = "Fitted Growth Curves Across Models", subtitle = "LM vs LME (RI) vs LME (RI + RS)", x = "Week", y = "Predicted Height (cm)" ) + theme_minimal(base_size = 14 ) # # anova(lmer_int,lmer_slope) #