library(ggplot2)
library(dplyr)
library(lme4)
library(tidyr)
library(reshape2)
library(GGally)
library(tidyverse)
#
######################
# SIMULATE plants data
######################
#
set.seed(2026) # Setting the seed for 2026!
#
# 1. Setup
n_plants <- 10
n_weeks <- 6
weeks <- 1:6
#
# 2. Define Parameters
# Baseline height: 10cm
# Control growth: 1.0 cm/week
# Fertilizer boost: +2.5 cm/week (Significant Effect)
# Random Intercept SD: 2.0 (Plants start at very different heights)
# Random Slope SD: 0.6 (Plants have different "genetic" growth potential)
#
plant_metadata <- data.frame(
  plant_id = factor(1:n_plants),
  treatment = rep(c("Control", "Fertilizer"), each = n_plants/2),
  u0i = rnorm(n_plants, 0, 2.0), # Random Intercept deviation
  u1i = rnorm(n_plants, 0, 0.6)  # Random Slope deviation
)
#
# 3. Create the Long-Form Dataset
plant_growth <- plant_metadata %>%
  crossing(week = weeks) %>%
  mutate(
    # The "Fixed" part of the biology
    base_height = 10,
    group_slope = ifelse(treatment == "Fertilizer", 3.5, 1.0), # 1.0 vs 3.5 is a huge gap
    
    # The "Random" part + Error
    height = base_height + u0i + (group_slope + u1i) * week + rnorm(n(), 0, 0.5),
    
    # Clean up for the students
    height = round(height, 2)
  ) %>%
  select(plant_id, treatment, week, height)

# Quick look at the "Spread"
summary(plant_growth$height)
plants = plant_growth
head(plants,10)
#
# This dataset represents a typical biological experiment involving repeated measurements on 
# individual organisms. Each plant is measured weekly, leading to correlated observations 
# within each plant. Linear mixed‑effects models allow us to model both the fixed effects 
# (treatment, time) and the random biological variability among individual plants.
#
#
# INDIVIDUAL PLANT GROWTH CURVES (raw data)
# This produces:
# • one subplot per plant
# • same axes across panels
# • clear visualization of variability in growth trajectories
ggplot(plants, aes(x = week, y = height)) +
  geom_line(color = "steelblue", size = 1) +
  geom_point(color = "steelblue", size = 2) +
  facet_wrap(~ plant_id, ncol = 5) +
  labs(
    title = "Individual Plant Growth Curves (Raw Data)",
    x = "Week",
    y = "Height (cm)"
  ) +
  theme_minimal(base_size = 14)
#
#
# RAW DATA - All together
#
ggplot(plants, aes(x = week, y = height)) +
  geom_point(alpha = 0.6) +
  labs(title = "Raw Data",
       x = "Week",
       y = "Height (cm)") +
  theme_minimal(base_size = 14)
#
#
# RAW DATA - Points by Group
#
ggplot(plants, aes(x = week, y = height, color = treatment)) +
  geom_point(alpha = 0.6) +
#  geom_smooth(method = "lm", se = FALSE, size = 0.8) +
  labs(title = "Raw Data (by Treatment)",
#       subtitle = "Assumes independence; forces a single trend per treatment",
       x = "Week",
       y = "Height (cm)") +
  theme_minimal(base_size = 14)
#
# RAW DATA - Profiles
#
ggplot(plants_lm, aes(x = week, y = height, group = plant_id, color = treatment)) +
  # Spaghetti lines (individual plants)
  geom_line(alpha = 0.4) +
  geom_point(alpha = 0.4) +
  #
  # Linear model fitted trend per treatment
#  geom_line(aes(y = fit_lm), size = 1, alpha = 0.9) +
  labs(
    title = "Raw Data - Profiles",
#    subtitle = "LM forces a single trend per treatment and ignores plant-level variability",
    x = "Week",
    y = "Height (cm)"
  ) +
  theme_minimal(base_size = 14)
#
#
# LINEAR MODEL
lm(height ~ treatment + week, data = plants)
#
# This shows why the linear model is inappropriate: it assumes all observations are 
# independent and fits a single trend per treatment.
# Fit the WRONG model
lm_mod <- lm(height ~ treatment * week, data = plants)
# Plot: LM fitted lines
ggplot(plants, aes(x = week, y = height, color = treatment)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = FALSE, size = 0.8) +
  labs(title = "Linear Model (No Random Effects)",
       subtitle = "Assumes independence; forces a single trend per treatment",
       x = "Week",
       y = "Height (cm)") +
  theme_minimal(base_size = 14)
#
# LM LIST
# Add LM predictions - lmList
#
plants_lm <- plants %>%
  mutate(fit_lm = predict(lm_mod))
#
ggplot(plants_lm, aes(x = week, y = height, group = plant_id, color = treatment)) +
  # Spaghetti lines (individual plants)
  geom_line(alpha = 0.4) +
  geom_point(alpha = 0.4) +
#
# Linear model fitted trend per treatment
  geom_line(aes(y = fit_lm), size = 1, alpha = 0.9) +
  labs(
    title = "LM List Plot: Wrong Model for Repeated Measurements",
    subtitle = "LM forces a single trend per treatment and ignores plant-level variability",
    x = "Week",
    y = "Height (cm)"
  ) +
  theme_minimal(base_size = 14)
#
#
# Add LM fitted values
plants_lm <- plants %>%
  mutate(fit_lm = predict(lm_mod))

ggplot(plants_lm, aes(x = week, y = height)) +
  geom_line(color = "steelblue", size = 0.6) +
  geom_point(color = "steelblue", size = 2) +
  
  # WRONG model fitted line (same in every panel)
  geom_line(aes(y = fit_lm), color = "red", size = 0.6) +
  
  facet_wrap(~ plant_id, ncol = 5) +
  labs(
    title = "Per_Plant Growth Curves with Individual LM Fitted Line",
    subtitle = "LM fitted using data per plant",
    x = "Week",
    y = "Height (cm)"
  ) +
  theme_minimal(base_size = 14)
#
#
#  
# SINGLE LINE DOESN'T GIVE JUSTICE  
#
# Linear model fitted trend per treatment
  geom_line(aes(y = fit_lm), size = 1.5, alpha = 0.9) +
  
  labs(
    title = "LM List Plot: Wrong Model for Repeated Measurements",
    subtitle = "LM forces a single trend per treatment and ignores plant-level variability",
    x = "Week",
    y = "Height (cm)"
  ) +
  
  theme_minimal(base_size = 14)
#
# What this plot illustrates
# The LM draws one line per treatment without considering that plants differ.
# It treats the 120 measurements as independent → violates independence assumption.
# Real plants grow differently → LM underestimates uncertainty and overstates significance.
# 
# BOX PLOT
ggplot(plants, aes(x = factor(week), y = height, fill = treatment)) +
    geom_boxplot(alpha = 0.7) +
    labs(
      title = "Plant Height by Week and Treatment",
      x = "Week",
      y = "Height (cm)"
    ) +
    scale_fill_brewer(palette = "Set2") +
    theme_minimal(base_size = 14)
#
# Variability per week per treatment
#
week_sd_treat <- plants %>%
  group_by(week, treatment) %>%
  summarise(sd_height = sd(height), .groups = "drop")
#
ggplot(week_sd_treat, aes(x = week, y = sd_height, color = treatment)) +
  geom_line(size = 1.2) +
  geom_point(size = 3) +
  labs(
    title = "Variability Changes Over Time",
    x = "Week",
    y = "Standard deviation (cm)"
  ) +
  theme_minimal(base_size = 14)
#
#
# CORRELATION PLOT
# Reshape Data
plants_wide <- plants %>%
  select(plant_id, week, height) %>%
  pivot_wider(names_from = week, values_from = height, names_prefix = "week_")
head(plants_wide)
# Compute Correlation Matrix
cor_mat <- cor(plants_wide %>% select(starts_with("week_")))
cor_mat
# Plot (heatmap style)
cor_long <- melt(cor_mat)
ggplot(cor_long, aes(Var1, Var2, fill = value)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(
    low = "steelblue",
    mid = "white",
    high = "firebrick",
    midpoint = 0.8,
    limits = c(0, 1)
  ) +
  labs(
    title = "Pairwise Correlation Between Weeks",
    subtitle = "Strong correlations illustrate repeated-measures dependence",
    x = "Week",
    y = "Week",
    fill = "Correlation"
  ) +
  theme_minimal(base_size = 14) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
#
# Pairwise Scatterplots
ggpairs(
  plants_wide %>% select(starts_with("week_")),
  title = "Pairwise Scatterplots of Plant Height Across Weeks"
)
#
#
# Random intercept model
lmer_int <- lmer(height ~ treatment + week + (1 | plant_id), data = plants)
summary(lmer_int)
#
# Get fitted values
plants$fit_int <- fitted(lmer_int)
ggplot(plants, aes(x = week, y = height, group = plant_id, color = treatment)) +
  geom_point(alpha = 0.5) +
  geom_line(aes(y = fit_int), size = 0.6) +
  labs(title = "Mixed-Effects Model (Random Intercept)",
       subtitle = "Each plant gets its own baseline height",
       x = "Week",
       y = "Height (cm)") +
  theme_minimal(base_size = 14)
#
# What this plot illustrates
# Plants start at different heights → the model captures individual baselines
# Growth trend still shared within treatment
# Much more realistic than LM
#
#
# Random intercept + random slope model
lmer_slope <- lmer(height ~ treatment + week + (week | plant_id), data = plants)
summary(lmer_slope)
#
# Fitted values
plants$fit_slope <- fitted(lmer_slope)
#
ggplot(plants, aes(x = week, y = height, group = plant_id, color = treatment)) +
  geom_point(alpha = 0.5) +
  geom_line(aes(y = fit_slope), size = 0.6) +
  labs(title = "Mixed-Effects Model (Random Intercept + Slope)",
       subtitle = "Each plant has its own baseline height & growth rate",
       x = "Week",
       y = "Height (cm)") +
  theme_minimal(base_size = 14)
#
# What this plot illustrates - Each plant has: 
# its own intercept
# its own growth rate
# Lines fan out naturally — capturing biological variability.
# This is the statistically and biologically correct model for these data.
#
#
# Combined Comparison Plot (LM vs LME vs LME‑Slope)
#
library(dplyr)
#
# Fit models
lm_mod     <- lm(height ~ treatment * week, data = plants)
lmer_int   <- lmer(height ~ treatment * week + (1 | plant_id), data = plants)
lmer_slope <- lmer(height ~ treatment * week + (week | plant_id), data = plants)
#
# Add fits
plants_plot <- plants %>%
  mutate(
    fit_lm     = predict(lm_mod),
    fit_int    = fitted(lmer_int),
    fit_slope  = fitted(lmer_slope)
  )

# Long format for plotting
library(tidyr)
#
long <- plants_plot %>%
  select(plant_id, treatment, week, height, fit_lm, fit_int, fit_slope) %>%
  pivot_longer(
    cols = c(fit_lm, fit_int, fit_slope),
    names_to = "model",
    values_to = "fit"
  )
#
# Rename models for plot readability
long$model <- factor(long$model,
                     levels = c("fit_lm", "fit_int", "fit_slope"),
                     labels = c("LM",
                                "LME_random intercept",
                                "LME: random intercept + slope")
)
#
# Plot
ggplot(long, aes(x = week, y = height, color = treatment)) +
  geom_point(alpha = 0.4) +
  geom_line(aes(y = fit, group = plant_id), size = 0.4) +
  facet_wrap(~ model, ncol = 1) +
  labs(
    title = "Comparison of Models for Repeated Biological Measurements",
    x = "Week",
    y = "Height (cm)"
  ) +
  theme_minimal(base_size = 14)
#
#
# Optional: Side‑by‑side comparison in one figure
ggplot(long, aes(x = week, y = fit, color = treatment, group = plant_id)) +
  geom_line(size = 0.4) +
  facet_wrap(~ model) +
  labs(
    title = "Fitted Growth Curves Across Models",
    subtitle = "LM vs LME (RI) vs LME (RI + RS)",
    x = "Week",
    y = "Predicted Height (cm)"
  ) +
  theme_minimal(base_size = 14 )
#
#
anova(lmer_int,lmer_slope)
#