############ Linear models
# Data: Kaggle smoking dataset
smoke <- read.csv("C:/Users/steli/Desktop/Σορβόνη/smoking.csv")

# Data exploration
# 55695 observations, 27 variables
dim(smoke)
# Variable names
names(smoke)
# Which variables are quantitative?
sapply(smoke, is.numeric)
# Which variables are stored as character?
char_columns <- sapply(smoke, is.character)

# Convert all character columns to factors.
# NOTE: lapply (not sapply) is required here -- sapply() simplifies the
# result to a matrix, so the factor class would be lost on assignment.
smoke[char_columns] <- lapply(smoke[char_columns], as.factor)
# No missing data
colSums(is.na(smoke))
# Keep the observations with ID < 1000 (roughly the first 1000 rows)
smoke_1000 <- subset(smoke, ID < 1000)
# Flag variables with fewer than two distinct observed values
bad_cols <- sapply(smoke_1000, function(x) length(unique(na.omit(x))) < 2)
# The variable "oral" is constant in this subset and must be dropped
which(bad_cols)
# Drop by the flag rather than a hard-coded column index
smoke_1000 <- smoke_1000[, !bad_cols]
dim(smoke_1000)
# Scatter plot for two continuous variables
# to inspect their correlation
pairs(smoke_1000[, c(5, 14)])
cor(smoke_1000$weight.kg., smoke_1000$Cholesterol)
# Fit a model using every explanatory variable (ID excluded)
systolic_lm <- lm(systolic ~ . - ID, data = smoke_1000)
# Output
summary(systolic_lm)

# Compare two nested models with a partial F test
systoliclm2 <- lm(systolic ~ gender + age + height.cm., data = smoke_1000)
summary(systoliclm2)
systoliclm3 <- lm(systolic ~ gender + age + height.cm. + tartar,
                  data = smoke_1000)
summary(systoliclm3)
# Does tartar add anything to the smaller model?
anova(systoliclm2, systoliclm3)
# At alpha = 0.05 we do not reject the null hypothesis,
# so tartar is not important
# The (asymptotically equivalent) likelihood ratio test
library(lmtest)
lrtest(systoliclm2, systoliclm3)
# Same conclusion as expected:
# the larger model has a slightly higher log-likelihood,
# but the difference is not significant

# Model diagnostics for systoliclm2
# Normal Q-Q plot of the residuals:
# the points follow the reference line closely, so the
# normality assumption looks reasonable
plot(systoliclm2, which = 2)
# Residuals vs fitted values: a fairly flat band, consistent with
# homogeneous (equal-variance) and independent errors
plot(systoliclm2, which = 1)

# Confidence intervals for the 3 explanatory variables
confint(systoliclm2)

# Confidence bands for the mean systolic of all patients
predict(systoliclm2, smoke_1000[, 2:4], interval = "confidence")
# Prediction bands for the individual response of patients 5-8
predict(systoliclm2, smoke_1000[5:8, 2:4], interval = "prediction")
# Wider intervals (this band adds 1 under the square root in the formula)

# Variable-selection methods
# install.packages("leaps")
library(leaps)
# For pedagogical purposes keep the variables that appear important in
# summary(systolic_lm), i.e. p-value < 0.05:
# gender, age, height, weight, relaxation, tartar, smoking
# columns 2, 3, 4, 5, 12, 25, 26
names(smoke_1000[, c(2, 3, 4, 5, 12, 25, 26)])
# Best-subset selection: 2^7 = 128 candidate models, ranked by RSS
best1 <- regsubsets(
  systolic ~ gender + age + height.cm. + weight.kg. + relaxation +
    tartar + smoking,
  data = smoke_1000
)
# From the summary below, the best model
# with 1 variable uses relaxation
# with 2: age and relaxation
# with 3: age, weight and relaxation
# with 4: age, height, weight and relaxation
# with 5: age, height, weight, relaxation, tartar
# with 6: the previous minus tartar, plus smoking and gender

best_sum <- summary(best1)  # reports up to 8 model sizes by default

best_sum$adjr2  # adjusted R^2 of the best model of each size

plot(best_sum$adjr2, xlab = "number of variables",
     ylab = "Radjusted", type = "l")
# The 7-variable model attains the maximum, but the gain is tiny
which.max(best_sum$adjr2)
# By adjusted R^2 we choose the most parsimonious 3-variable model
points(3, best_sum$adjr2[3], col = "green", pch = 20, cex = 2)
# According to BIC (Bayes information criterion)
# the 4-variable model is selected
best_sum$bic
which.min(best_sum$bic)
plot(best_sum$bic, xlab = "number of variables", ylab = "BIC", type = "l")
points(4, best_sum$bic[4], col = "green", pch = 20, cex = 2)



# Forward stepwise selection: fits only 1 + p(p + 1)/2 models
# instead of all 2^p subsets
bestforward <- regsubsets(
  systolic ~ gender + age + height.cm. + weight.kg. + relaxation +
    tartar + smoking,
  method = "forward", data = smoke_1000
)
bestforward_sum <- summary(bestforward)
bestforward_sum$adjr2  # adjusted R^2 of the best model of each size
plot(bestforward_sum$adjr2, xlab = "number of variables",
     ylab = "Radjusted", type = "l")
which.max(bestforward_sum$adjr2)
# Again the parsimonious 3-variable model is chosen
points(3, bestforward_sum$adjr2[3], col = "green", pch = 20, cex = 2)

# The two methods usually give similar results, but forward selection
# remains feasible when the number of candidate variables is large.
# 
#### Stepwise selection by AIC
# NOTE(review): step() takes `direction`, not `method`; the misspelled
# argument fell into `...` and was ignored, so both original calls
# silently performed the default (backward) search -- which is why they
# "agreed". A forward search must also start from the null model, with
# the full model as the upper scope.
null_model <- lm(systolic ~ 1, data = smoke_1000)
best_forward_model <- step(null_model,
                           scope = formula(terms(systolic_lm)),
                           direction = "forward")
final_variablesf <- attr(terms(best_forward_model), "term.labels")
final_variablesf
#### AIC backward
best_backward_model <- step(systolic_lm, direction = "backward")
final_variables <- attr(terms(best_backward_model), "term.labels")
final_variables

# Compare the two selected variable sets; in practice this is the
# model we would choose

############
# Generalized linear models: the logistic model
# Predictions and comparisons
fit_1 <- glm(smoking ~ age + gender + waist.cm. + systolic + triglyceride,
             data = smoke, family = binomial)
fit_1
fit_2 <- glm(smoking ~ age + gender + waist.cm. + systolic +
               triglyceride + tartar,
             data = smoke, family = binomial)
fit_2
# Predicted probabilities for model 1
prob_1 <- predict(fit_1, type = "response")

# Nested-model comparison via likelihood ratio test
lrtest(fit_1, fit_2)
# Here tartar turns out to be important

# What if the models are not nested?

library(pROC)
# 2. ROC curves
# Model 1: age, gender, waist.cm., systolic, triglyceride
roc_1 <- roc(smoke$smoking, prob_1)
# Model 2: age, systolic, tartar, Urine.protein, relaxation
fit_2 <- glm(smoking ~ age + systolic + tartar + Urine.protein + relaxation,
             data = smoke, family = "binomial")
prob_2 <- predict(fit_2, type = "response")
roc_2 <- roc(smoke$smoking, prob_2)

# 3. AUC of each model
print(auc(roc_1))
print(auc(roc_2))

# 4. Plot both curves
plot(roc_1, col = "blue", main = "ROC Comparison: Model 1 vs Model 2",
     lwd = 2)
lines(roc_2, col = "red", lwd = 2)


################### Mixed effect models

library("joineR")
# First 6 rows
head(heart.valve)
# 988 rows, 25 variables
dim(heart.valve)
# Aortic valve replacement surgery data:
# a longitudinal observational study on the effects of different heart
# valves (differing in tissue type) implanted in the aortic position.
# Three heart-function outcomes are measured repeatedly after surgery;
# several baseline covariates and survival data are also available.
?heart.valve
# Two different packages for mixed effect models
library(nlme)
library(lme4)
valve <- na.omit(heart.valve)
# Generalized least squares with a compound-symmetry
# correlation structure within each patient
gls_cs <- gls(lvmi ~ sex + dm + time,
              correlation = corCompSymm(form = ~ 1 | num),
              data = valve,
              na.action = na.omit)

kk <- summary(gls_cs)


valve <- na.omit(heart.valve)
# 629 patients remain.
# Complete-case analysis is the casual way of handling missing data:
# it relies on the MCAR (missing completely at random) assumption,
# which is often not valid, yet is widely used.
dim(valve)
# Ordinary linear model, ignoring the grouping
lmefit0 <- lm(lvmi ~ sex + dm + time, data = valve)
summary(lmefit0)
# Population-average fitted values
valve$fitted_pop0 <- predict(lmefit0)
# nlme package: each patient gets his/her own behavior.
# Mixed effect model with a random intercept per patient
lmefit2 <- lme(lvmi ~ sex + dm + time, random = ~ 1 | num, data = valve)
a <- summary(lmefit2)
lmefit2$method  # REML (restricted maximum likelihood)
# Fixed coefficients
fixef(lmefit2)
# Full table for the fixed terms
a$tTable
# Variance-covariance matrix of the fixed-effect estimates
vcov(lmefit2)
# Confidence intervals for the fixed effects
intervals(lmefit2, which = "fixed")
# Estimated residual standard deviation
a$sigma
# Random-effect part (one intercept per patient)
ranef(lmefit2)
# Per-patient coefficient table: each intercept is the sum of the
# fixed intercept and the patient's random intercept
coef(lmefit2)
# Variance-covariance matrix of the random term
# (a scalar in this case)
getVarCov(lmefit2)
# Variance and s.d. of the random intercept and the residual.
# Note that the intra-class correlation
# 3253.527/(3253.527+1560.692) equals the gls correlation parameter,
# because these two models are equivalent
VarCorr(lmefit2)
# Conditional and marginal variance-covariance matrices.
# There is no correlation in the "random" matrix
# because only the intercept is random
getVarCov(lmefit2, type = "conditional")
getVarCov(lmefit2, type = "marginal")
# Log-likelihood without and with REML:
# a slightly lower value without REML
logLik(lmefit2, REML = FALSE)
# This one ignores the fixed effects
logLik(lmefit2, REML = TRUE)
# Akaike information criterion
AIC(lmefit2)
# Bayesian information criterion
# (larger than AIC for sample size > 8)
BIC(lmefit2)
# When comparing models we choose the one with the smaller AIC or BIC

# Per-patient predictions
fitted(lmefit2)
# Same as fitted()
predict(lmefit2, level = 1)
# Total (within-group) residuals
resid(lmefit2, type = "response")
# Population-level residuals (fixed part only)
resid(lmefit2, type = "response", level = 0)

# Random intercept and slope on time,
# with correlation between the random intercept and slope
lmefit <- lme(lvmi ~ sex + dm + time, random = ~ time | num, data = valve)
# No correlation between the random intercept and slope
lmefit_2 <- lme(lvmi ~ sex + dm + time,
                random = list(num = pdDiag(~time)), data = valve)
k <- summary(lmefit)
# NOTE(review): summarize the uncorrelated model lmefit_2 here;
# the original re-summarized lmefit2 (already stored in `a`),
# which looks like a typo.
k2 <- summary(lmefit_2)
# Residuals vs fitted values
plot(lmefit)
# A more informative check of the constant-variance assumption:
# standardized residuals against time.
# The dot stands for the current model.
# (A stray console continuation "+" from the original paste
# has been removed.)
plot(lmefit,
     resid(., type = "pearson") ~ time)
library(lattice)
# Bin time into 5 intervals (use your own cut points if preferred)
valve$time_bins <- cut(valve$time, breaks = 5)
library("psych")
valve$std_resid <- resid(lmefit, type = "p")
# Descriptive statistics of the residuals inside each bin
# (describeBy replaces the deprecated describe.by)
describeBy(valve$std_resid, group = valve$time_bins)
# Box plots of the residuals per bin
bwplot(resid(lmefit, type = "p") ~ time_bins,
       data = valve,
       ylab = "Pearson Residuals",
       xlab = "Time Point",
       panel = panel.bwplot,  # standard lattice panel function
       pch = "|")             # line for the median
# Normal Q-Q plots of Pearson residuals and predicted random effects
plot_df <- data.frame(
  res = resid(lmefit, type = "p"),  # standardized residuals
  bins = valve$time_bins
)
plot_df
# qqmath: this Q-Q plot checks the normality assumption per bin
qqmath(~ res | bins,
       data = plot_df,
       layout = c(5, 1),
       prepanel = prepanel.qqmathline,
       panel = function(x, ...) {
         panel.qqmathline(x, ...)   # the reference straight line
         panel.qqmath(x, ...)       # the data points
       },
       xlab = "Theoretical Quantiles",
       ylab = "Standardized Residuals")
# This "tests" the normality assumption on the random effects
qqnorm(lmefit, ~ranef(.))
# Predictions from the random intercept + slope model
k$fitted
# Coefficients of the fixed part, and the total (fixed + random)
# intercept and slope of each individual
k$coefficients
# The same model as lmefit, written in lme4 syntax
lmefit_1 <- lmer(lvmi ~ sex + dm + time + (time | num), data = valve)
summary(lmefit_1)


library(ggplot2)
### Spaghetti plot of the raw data
# Attach the fitted values to the data set
valve$fitted_indiv <- predict(lmefit, level = 1)  # subject-specific
valve$fitted_pop <- predict(lmefit, level = 0)    # population average

# Plot the observed response trajectories
ggplot(valve, aes(x = time, y = lvmi, group = num)) +
  geom_line(alpha = 0.4, color = "gray50") +  # raw individual lines
  geom_point(alpha = 0.4) +                   # raw data points
  theme_minimal() +
  labs(title = "Raw Response Profiles (Spaghetti Plot)",
       subtitle = "Observed LVMI values for each individual over time",
       x = "Time", y = "LVMI")

# Plot of the fitted responses
ggplot(valve, aes(x = time, y = lvmi, group = num)) +
  # Raw data points
  geom_point(alpha = 0.3) +
  # Subject-specific fitted lines
  geom_line(aes(y = fitted_indiv), color = "blue", alpha = 0.3) +
  # Population trend line (fixed effects only); a separate data
  # layer avoids conflicts with the 'group' aesthetic
  geom_line(aes(x = time, y = fitted_pop),
            color = "red",
            linewidth = 1.2,
            data = valve,
            inherit.aes = FALSE) +
  theme_minimal() +
  labs(title = " Response Plot",
       subtitle = "Red = Fixed Effect (Population), Blue = Random Effects (Individuals)")



# Model comparisons: testing whether the random effects are needed
# Null model: no random effects at all (plain GLS fit)
m00= gls(formula(lvmi ~ sex + dm + time),data=valve)

# Random intercept only
m0= lme(lvmi ~ sex + dm + time, random = ~ 1 | num, data = valve, method = "REML")
# Random slope on time but no random intercept
# (auxiliary model used by exactRLRT below)
m01=update(m0,random = ~0 + time|num,data=valve, method = "REML")
# Random intercept and slope, uncorrelated
m02= lme(lvmi ~ sex + dm + time, random = list(num = pdDiag(~time)), data = valve)
# Random intercept and slope, correlated
m03=lme(lvmi ~ sex + dm + time, random = ~ 1+time|num, data = valve, method = "REML")

# Test for the random intercept

###
# A likelihood ratio test.
# The variance parameter sits on the boundary of its parameter space,
# so the null distribution is the 50:50 mixture of chi0^2 and chi1^2;
# halving the naive p-value applies this correction.
out_1=anova(m00,m0)
(out_1[["p-value"]][2])/2 # mixture chi0^2 + chi1^2

## An exact simulation-based approach to the same test

library(RLRsim)
exactRLRT(m0)

# Test for the random slope
# A likelihood ratio test; boundary null distribution is the
# 50:50 mixture of chi1^2 and chi2^2
out_2=anova(m0,m02)
RLRT2=out_2[["L.Ratio"]][2]
0.5 * pchisq(RLRT2, 1, lower.tail = FALSE) +0.5 * pchisq(RLRT2, 2, lower.tail = FALSE)
## mixture of chi1^2 and chi2^2

##### Simulation approach:
# m01 is the auxiliary model containing only the effect being tested
exactRLRT(m=m01,m0=m0,mA=m02)

# A simulation approach to see whether the test is conservative
sim_result=simulate( m0,m2=m02,nsim=1000)
# View the empirical p-value and distribution:
# all tests are conservative, whether using chi-square with one
# degree of freedom, two degrees, or the mixture
plot(sim_result,df=c(1,2),abline = c(0,1, lty=2))


###### Reaction-time data (lme4's sleepstudy)
?sleepstudy

# Fit a mixed model with a random intercept and slope per subject.
# NOTE(review): the original referenced model_sleep before it was
# defined (a run-time error); that stray line has been removed.
model_sleep <- lmer(Reaction ~ Days + (Days | Subject), data = sleepstudy)

# View the results
summary(model_sleep)

####
# Compare the mixed model with the ordinary linear model via
# cross-validation, to show that using random effects gives a
# better predictive model.

# Create the data folds
library(caret)

# Set up 5-fold cross-validation
set.seed(13)  # for reproducibility
k <- 5
# Randomly split the observations into 5 folds
folds <- createFolds(sleepstudy$Reaction, k = k)

# Preallocate vectors for the per-fold test errors (RMSE)
rmse_lm <- numeric(k)
rmse_lmm <- numeric(k)

# Cross-validation loop
for (i in seq_len(k)) {

  # Split into training (~80%) and testing (~20%) sets
  test_idx <- folds[[i]]

  train_data <- sleepstudy[-test_idx, ]
  test_data <- sleepstudy[test_idx, ]

  # MODEL 1: the usual linear model -- one single line for everyone
  mod_lm <- lm(Reaction ~ Days, data = train_data)
  # Predictions on the held-out data
  pred_lm <- predict(mod_lm, newdata = test_data)

  # Test RMSE for the linear model
  rmse_lm[i] <- sqrt(mean((test_data$Reaction - pred_lm)^2))

  # MODEL 2: the linear mixed model -- individual intercepts and
  # slopes for each subject
  mod_lmm <- lmer(Reaction ~ Days + (Days | Subject), data = train_data)
  # allow.new.levels guards against a subject whose rows all fall in
  # the test fold; such rows are predicted from the population level
  pred_lmm <- predict(mod_lmm, newdata = test_data,
                      allow.new.levels = TRUE)

  # Test RMSE for the mixed model
  rmse_lmm[i] <- sqrt(mean((test_data$Reaction - pred_lmm)^2))
}

# Compare the final results
cat("Average Error (RMSE) for Standard Linear Model: ", mean(rmse_lm), "\n")
cat("Average Error (RMSE) for Linear Mixed Model:    ", mean(rmse_lmm), "\n")

# Visualization
# Ordinary linear model fitted on the full data
mod_lm1 <- lm(Reaction ~ Days, data = sleepstudy)
# Linear mixed model fitted on the full data
mod_lmm1 <- lmer(Reaction ~ Days + (Days | Subject), data = sleepstudy)

# Store both sets of predictions as new columns in the data set
sleepstudy$Pred_LM <- predict(mod_lm1)
sleepstudy$Pred_LMM <- predict(mod_lmm1)

# Build the comparison plot
ggplot(sleepstudy, aes(x = Days, y = Reaction)) +
  # Raw data points
  geom_point(alpha = 0.6) +
  # Standard linear model line (red, dashed)
  geom_line(aes(y = Pred_LM), color = "red", linetype = "dashed",
            linewidth = 1) +
  # Linear mixed model line (blue, solid)
  geom_line(aes(y = Pred_LMM), color = "blue", linewidth = 1) +
  # One panel per subject
  facet_wrap(~ Subject) +
  # Clean aesthetics
  theme_minimal() +
  labs(title = "Standard Linear Model vs. Linear Mixed Model",
       subtitle = "Red Dashed =  Linear Model | Solid Blue = Mixed Model",
       x = "Days of Sleep Deprivation",
       y = "Reaction Time (ms)")


