
#
########### Linear models (LM) ################
#
# Import DATA (read directly from the course web page)
#
smoke   = read.csv('http://users.uoa.gr/~fsiannis/data/smoking.csv')    # Kaggle
Newborn = read.csv('http://users.uoa.gr/~fsiannis/data/newborn.csv')
attica  = read.csv('http://users.uoa.gr/~fsiannis/data/ATTICA.csv')
#
head(Newborn)
# NEWBORN data: predict the weight of a newborn
#
# weight:   newborn weight
# age:      mother's age
# weightM:  mother's weight at last menstrual period
# height:   father's height in cm
# hyper:    mother's hypertension history
# smoke:    mother's smoking habit
# (X appears to be a row-index column - it is dropped from every model below)
#
head(smoke)
head(attica)
#
# Visualize the data: scatter plots for each pair of continuous variables,
# to get a first impression of the pairwise correlations
pairs(Newborn[,2:5])
#
# Fit the model considering all explanatory variables (row index X excluded)
weightlm = lm(weight ~ .-X,data=Newborn)
summary(weightlm)
anova(weightlm)
#
# At level alpha=0.05 only smoke and hyper seem not to be significant
# The overall F test is significant
# Compare 2 nested models with the partial F test
weightlm2 = lm(weight ~ age + weightM + height + smoke,data=Newborn)
summary(weightlm2)
weightlm3 = lm(weight ~ age + weightM + height,        data=Newborn)
summary(weightlm3)
#
anova(weightlm2,weightlm3)
# NOTE(review): the two models differ in `smoke` (not hyper); a non-significant
# partial F test means smoke can be dropped, a significant one means keep it
# Compare with a likelihood ratio test (equivalent)
anova(weightlm2,weightlm3,test="Chisq")
#
# Interpretation of coefficients for the full model
# continuous variable:
#  e.g. for a one-unit increase in weightM the expected weight increases by
#  about 0.11, keeping all the other variables constant
# categorical variable:
#  e.g. mothers who smoke tend to give birth to children with lower weight by
#  about 0.01 relative to mothers who do not smoke, keeping all the other
#  variables constant
#
# Checking the assumptions for weightlm3
# Residuals vs Fitted plot (this check was described here but the plot was
# never drawn in the original script - added so it can actually be inspected)
plot(weightlm3,which=1)
# homogeneity of variance and independence of the errors look reasonable
#
# QQplot of the standardized residuals
plot(weightlm3,which=2)
#
# Almost perfect fit
# Confidence intervals for the 3 regression coefficients
confint(weightlm3)
#
# Confidence band for the mean of weight
predict(weightlm3,Newborn[5:8,2:4],interval = "confidence")
# Prediction band for a new response weight
predict(weightlm3,Newborn[5:8,2:4],interval = "prediction")
# Wider intervals (this band adds 1 under the square root in the SE formula)
#
#methods for model comparison / variable selection
#install.packages("leaps")
library(leaps)
#best subset selection: with p predictors there are 2^p candidate models
#(here p = 5: age, weightM, height, hyper, smoke -> 32 models;
# TODO confirm - the original note said 2^4=16)
best1=regsubsets(weight~.-X,data=Newborn)#(RSS )
best_sum=summary(best1)#(default up to 8)
#the asterisks mark which variables enter the best model of each size
best_sum$adjr2 #adjusted R square for best models
plot(best_sum$adjr2,xlab = "number of variables",ylab="Radjusted",type="l")
which.max(best_sum$adjr2)#the maximum is attained at a larger model, but the
#difference from the 3- and 4-variable models is very small,
#so we choose the most parsimonious one
points(3,best_sum$adjr2[3],col="green",pch=20,cex=2)

#forward selection: at most 1+p(p+1)/2 comparisons
bestforward=regsubsets(weight~.-X,data=Newborn,method = "forward")
bestforward_sum=summary(bestforward)
bestforward_sum$adjr2 #adjusted R square for best models
plot(bestforward_sum$adjr2,xlab = "number of variables",ylab="Radjusted",type="l")
which.max(bestforward_sum$adjr2)#again the differences near the top are small
#so we choose the most parsimonious model
points(3,bestforward_sum$adjr2[3],col="green",pch=20,cex=2)

#In general the 2 methods give the same results most of the time,but the second
#one can be used for a large number of independent  variables. 

################################
##### Mixed effect models ######
################################
# Install once from the console, then keep commented out so the script does
# not reinstall the packages on every run (same style as the other
# install.packages notes in this file)
#install.packages("lme4")
#install.packages("nlme")
#install.packages("ggplot2")
library(lme4);  library(nlme);  library(ggplot2)
#
# Growth of chickens over time
head(ChickWeight)
?ChickWeight
#
# Orthodont data (orthodontic distance measurements on children over time)
head(Orthodont)
?Orthodont
#The pituitary gland is a small organ located deep in the center of your head, right behind your eyes.
#The pterygomaxillary fissure is a small space near the back of your upper jaw, close to where your jaw and skull meet.
attributes(Orthodont)
formula(Orthodont)
#
# Visualization of the Orthodont data: one response profile per subject
plot(Orthodont)
ggplot(Orthodont, aes(x = age, y = distance, group = Subject, color = Subject)) +
  geom_line() +  # Connect the measurements of each subject
  geom_point(size = 1) +  # Show actual data points
  labs(title = "Response Profile Over Time for Orthodont Data",
       x = "Time (Years)",
       y = "Distance (mm)") +
  theme_minimal() 
#
# ChickWeight visualization: one response profile per chick
ggplot(ChickWeight, aes(x = Time, y = weight, group = Chick, color = Chick)) +
  geom_line() +  # Connect points for each chick
  geom_point(size = 1) +  # Show actual data points
  labs(title = "Response Profile Over Time for Each Chicken",
       x = "Time (Days)",
       y = "Weight (grams)") +
  theme_minimal() 
#
### ANALYSIS ###
#
#############
# Orthodont #
#############
#
# Keep only the female subjects
levels(Orthodont$Sex)
OrthoFem=Orthodont[Orthodont$Sex=="Female",]
#
# Separate linear model per subject (nothing random yet)
lmF=lmList(distance ~ age, data=OrthoFem)
coef(lmF)
intervals(lmF)
plot(intervals(lmF))
#
# Centre age at 11, i.e. I(age-11) [meaningful intercept + removes collinearity]
lmF2=update(lmF,distance ~ I(age-11))
plot(intervals(lmF2))
#
# Random Effects - Random Intercept Only
lmeF = lme(distance ~ age,data=OrthoFem,random=~1)          # Using REML
summary(lmeF)
lmeF0=lme(distance ~ I(age-11),data=OrthoFem,random=~1)        # Makes the correlation
#between Intercept and I(age-11) zero
summary(lmeF0)
#
lmeF1=update(lmeF,method='ML')                    # Using ML
summary(lmeF1)
#
# Random Effects - Random Intercept & Slope
# (random part omitted: lme defaults to random effects for the fixed formula)
lmeF2=lme(distance~age,data=OrthoFem)              # Using REML
summary(lmeF2)
#
anova(lmeF,lmeF2)                           # LRT on 2 DFs (slope variance and
#intercept-slope covariance); assesses whether the random slope is needed
#
###############
# ChickWeight #
###############
#
# REML variance-component estimates are in general less biased than the ML ones
fit_chicken = lmer(weight ~ (1 | Chick) + Time + Diet, data = ChickWeight) 
summary(fit_chicken)
# Intraclass correlation: correlation between two measurements of the same
# chick (between 0 and 1). Computed from the fitted variance components
# instead of hard-coding the numbers printed by summary()
vc_chicken = as.data.frame(VarCorr(fit_chicken))
vc_chicken$vcov[vc_chicken$grp == "Chick"] / sum(vc_chicken$vcov)
#
#check if the random intercept is important 
library(lmerTest)
fit_chicken <- lmer(weight ~ Time + Diet + (1 | Chick), data = ChickWeight)
ranova(fit_chicken) # the random intercept seems significant, but the starting
# weights are essentially the same, which argues against needing it
#
# The fixed effects are interpreted exactly as in linear models
# Confidence intervals (FALSE spelled out: the alias F can be reassigned)
confint(fit_chicken,oldNames=FALSE) 
#
# Estimates (more accurate predictions, since we have random variables) of
# conditional means 
# Best Linear Unbiased Prediction (BLUP)
ranef(fit_chicken) 
#
# Predictions for the first chick (its own random effect is included)
predict(fit_chicken,newdata=ChickWeight[1:12,2:4])
#
# Check if diet is important (likelihood ratio test vs the model without it)
fit_chicken0= lmer(weight ~ (1 | Chick)+Time, data = ChickWeight) 
anova(fit_chicken0,fit_chicken) 
# Diet is an important variable for the development of the chicks
# Plot
library(ggeffects)
library(dplyr)
# Marginal response (population level, fixed effects only)
marginal_preds <- ggpredict(fit_chicken, terms = "Time")
# %>% is the pipe operator:
# the output of each function is passed as the input to the next function.
# Select two chicks (Chick 2 and Chick 4)
selected_chickens <- ChickWeight %>% filter(Chick %in% c("2", "4"))
#
# Conditional response (including the random effects)
selected_chickens$predicted <- predict(fit_chicken, newdata = selected_chickens)
#
# Plotting: Marginal vs. Conditional responses
# (`linewidth` replaces the deprecated `size` aesthetic for lines, ggplot2 >= 3.4)
ggplot() +
  # Population-level (marginal) response
  geom_line(data = marginal_preds, aes(x = x, y = predicted), color = "red",
            linetype = "dashed", linewidth = 1) +
  #
  # Conditional responses of the two selected chicks
  geom_line(data = selected_chickens, aes(x = Time, y = predicted, group = Chick,
                                          color = Chick), linewidth = 1) +
  #
  # Observed data points for the selected chicks
  geom_point(data = selected_chickens, aes(x = Time, y = weight, color = Chick),
             size = 1.2) +
  #
  # Labels and titles
  labs(title = "Responses for Two Chickens around population response",
       x = "Time (Days)",
       y = "Weight (grams)",
       color = "Chick") +  # Legend for chickens
  theme_minimal() # minimalist theme 
#

# Random Intercept and Slope model
# (Time-1 | Chick): random slope only, no random intercept
fit_chicken_new0 = lmer(weight ~ (Time-1 | Chick) + Diet + Time, data = ChickWeight) 
summary(fit_chicken_new0)
# (1+Time | Chick): correlated random intercept and slope
fit_chicken_new = lmer(weight ~ (1+Time | Chick) + Diet + Time, data = ChickWeight) 
summary(fit_chicken_new)
#
# Chicks with higher initial weights tend to have smaller slopes (increase)
# over time, due to the negative estimated intercept-slope correlation
# Confidence intervals (FALSE spelled out: the alias F can be reassigned)
confint(fit_chicken_new,oldNames=FALSE) 
#
# Estimates (more accurate predictions, since we have random variables) of
# conditional means 
# Best Linear Unbiased Prediction (BLUP)
ranef(fit_chicken_new) 
#
# Predictions for the first chick (its own random effect is included)
predict(fit_chicken_new,newdata=ChickWeight[1:12,2:4])
# Check if diet is important (with a likelihood ratio test)
# NOTE(review): the name fit_chicken_new0 is reused here for a different model
fit_chicken_new0= lmer(weight ~ (1+Time | Chick)+Time, data = ChickWeight) 
anova(fit_chicken_new0,fit_chicken_new) 
# Diet is again an important variable for the development of the chicks
# Plot
# Marginal response (population level, fixed effects only)
marginal_preds_new <- ggpredict(fit_chicken_new, terms = "Time")
# The two chicks (Chick 2 and Chick 4) selected earlier are reused
#
# Conditional response (including the random effects)
selected_chickens$predicted <- predict(fit_chicken_new, newdata = selected_chickens)
#
# Plotting: Marginal vs. Conditional responses
# (`linewidth` replaces the deprecated `size` aesthetic for lines, ggplot2 >= 3.4)
ggplot() +
  # Population-level (marginal) response
  geom_line(data = marginal_preds_new, aes(x = x, y = predicted), color = "red",
            linetype = "dashed", linewidth = 1) +
  #
  # Conditional responses of the two selected chicks
  geom_line(data = selected_chickens, aes(x = Time, y = predicted, group = Chick,
                                          color = Chick), linewidth = 1) +
  #
  # Observed data points for the selected chicks
  geom_point(data = selected_chickens, aes(x = Time, y = weight, color = Chick),
             size = 1.2) +
  #
  # Labels and titles
  labs(title = "Responses for Two Chickens around population response",
       x = "Time (Days)",
       y = "Weight (grams)",
       color = "Chick") +  # Legend for chickens
  theme_minimal() # minimalist theme 
#

# Likelihood ratio test: is the random slope important?
# The null distribution is a 50:50 mixture of chi-squared distributions
# (the slope variance being zero puts the parameter on the boundary)
lrt_slope = anova(fit_chicken,fit_chicken_new)
lrt_slope
# More appropriate p-value: take the statistic from the anova table instead of
# hard-coding the printed value
lrt_stat = lrt_slope$Chisq[2]
1/2*pchisq(lrt_stat,1,lower.tail = FALSE)+1/2*pchisq(lrt_stat,2,lower.tail = FALSE)
# or, alternatively, use a more conservative level such as a=0.1
#
########################################################
#################### MANOVA ############################
########################################################
#
# Independent variable: smoking status
# 2 dependent variables: Cholesterol, triglyceride
# Could be done with Hotelling's T^2, but we use MANOVA instead
# NOTE(review): `Smoke` (capital S) is only created much further down in this
# file (Smoke=read.csv(...) near the end) - run that line first or this errors
library(tidyr)
#convert wide to long format
Smoke_new <- pivot_longer(Smoke, cols = c( Cholesterol, triglyceride),
                          names_to = "variable", values_to = "value")
Smoke_new$smoking <- factor(Smoke_new$smoking)

mean_values <- Smoke_new %>%
  group_by(variable, smoking) %>%
  summarise(mean_value = mean(value, na.rm = TRUE))
mean_values$smoking <- factor(mean_values$smoking)
#plot 
ggplot(Smoke_new, aes(x = variable, y = value, color = smoking)) +
  geom_jitter(width = 0.2, size = 2, alpha = 0.7) +  
  # Horizontal mean lines per group
  theme_minimal(base_size = 14) +
  labs(
    title = "Cholesterol-triglyceride by Smoking",
    x = "Variable",
    y = "Value",
    color = "Smoking"
  )+facet_wrap(~variable) +  # Creates separate plots for each variable
  # Add the horizontal mean  lines 
  geom_hline(data = mean_values, aes(yintercept = mean_value, color = smoking), 
             linewidth = 0.5, linetype = "solid")


# Summary of the MANOVA test
manova_result <- manova(cbind(Cholesterol,triglyceride) ~ smoking, data =Smoke )
summary(manova_result)
#the null is rejected, so there is an indication of a difference between
#smokers and non-smokers in at least one dependent variable
#follow-up univariate one-way ANOVAs show which dependent variables differ
#(one way anova is discussed below)
ANOVA=aov(cbind(Cholesterol,triglyceride) ~ smoking, data =Smoke)
summary(ANOVA)
#both dependent variables appear important




library(haven)
# NOTE(review): hard-coded local path - adjust to your own machine.
# The path must be a single string: the original wrapped it across two source
# lines, which embedded a newline inside the file name and broke read_sav()
ATTICA <- read_sav("C:/Users/steli/Desktop/work/uni/anoixto panepisthmio/ATTICA Study 10 yr FU  NEO !.sav")
View(ATTICA)
#Data from the ATTICA study: patients followed up for CVD (cardiovascular
#disease) over a 10-year period
ATTICA_NEW=as.data.frame(ATTICA[,c(5,14,15,19)])
head(ATTICA_NEW)
ATTICA_CLEAR=na.omit(ATTICA_NEW)
View(ATTICA_CLEAR)
head(ATTICA_CLEAR)
#after removing the missing values we keep:
#bmi_grou (4 levels) as the independent variable
#sbp, dbp and tgl as the dependent variables
ATTICA_CLEAR$bmi_grou <- factor(ATTICA_CLEAR$bmi_grou)
# Also good to check and convert any numeric columns if needed
ATTICA_CLEAR$sbp <- as.numeric(ATTICA_CLEAR$sbp)
ATTICA_CLEAR$dbp <- as.numeric(ATTICA_CLEAR$dbp)
ATTICA_CLEAR$tgl <- as.numeric(ATTICA_CLEAR$tgl)
# Create the plot
#convert wide to long format
ATTICA_long <- pivot_longer(ATTICA_CLEAR, cols = c( sbp, dbp,tgl),
                            names_to = "variable", values_to = "value")
mean_values <- ATTICA_long %>%
  group_by(variable, bmi_grou) %>%
  summarise(mean_value = mean(value, na.rm = TRUE))
ggplot(ATTICA_long, aes(x = variable, y = value, color = bmi_grou)) +
  geom_jitter(width = 0.2, size = 2, alpha = 0.7) +  # Jittered dots
  # Horizontal mean lines per group
  theme_minimal(base_size = 14) +
  labs(
    title = "SBP, DBP, TGL by BMI Group",
    x = "Variable",
    y = "Value",
    color = "BMI Group"
  )+facet_wrap(~variable) +  # Creates separate plots for each variable
  # Add the horizontal mean lines (`linewidth`: `size` is deprecated for lines)
  geom_hline(data = mean_values, aes(yintercept = mean_value, color = bmi_grou), 
             linewidth = 0.5, linetype = "solid")

# Summary of the MANOVA test
manova_result <- manova(cbind(sbp, dbp, tgl) ~ bmi_grou, data = ATTICA_CLEAR)
summary(manova_result)
#the null is rejected, so there is an indication of a difference between the
#bmi groups in at least one dependent variable
#follow-up univariate one-way ANOVAs show which dependent variables differ
#(one way anova is discussed below)
ANOVA=aov(cbind(sbp, dbp, tgl) ~ bmi_grou, data = ATTICA_CLEAR)
summary(ANOVA)
#all three dependent variables are important






View(PlantGrowth)
?PlantGrowth
###################### ONE WAY ANOVA ########################
# Get data (built-in dataset: plant yields under a control and 2 treatments)
data("PlantGrowth")
str(PlantGrowth)                      #check the data
levels(PlantGrowth[,"group"])
#
#visualize the data
stripchart(weight~group,vertical=T,pch=1,data=PlantGrowth)
boxplot(weight~group,data=PlantGrowth)
#
# mean plot
# mean plot of weight across the groups
#
library(dplyr)
meanweight = PlantGrowth %>%
  group_by(group) %>%
  summarise(mean=mean(weight))
#
aggregate(PlantGrowth[,1],list(PlantGrowth$group), mean)
#
library(ggplot2)
ggplot(meanweight,aes(x=group,y=mean))+
  geom_point(size = 1)+
  geom_line(group=1)+
  labs(title="Mean plot of average weight across groups",x="Group",y="Weight")+
  theme_minimal()
#
# fit one-way ANOVA
fit_plant = aov(weight~group,data=PlantGrowth)
summary(fit_plant)
#
# coefficients (treatment contrasts: intercept = reference-group mean)
coef(fit_plant)
#
# check the reference group
dummy.coef(fit_plant)
#
# second way (the contrast is the sum to zero (a's))
fit_plant$contrasts
#
# Here the control group is the reference group
#
# manual calculation of the same results
# group means computed once (rows 1-10 = ctrl, 11-20 = trt1, 21-30 = trt2)
group_means = tapply(PlantGrowth$weight, PlantGrowth$group, mean)
#
# estimated coefficients: reference mean and the differences from it
a1 = group_means[["ctrl"]]                      #reference
a2 = group_means[["trt1"]] - a1
a3 = group_means[["trt2"]] - a1
#
# print the coefficients at once
c(a1,a2,a3)
# between-groups sum of squares (SSB); 10 observations per group
SSB = 10*sum((group_means - mean(PlantGrowth$weight))^2)
#
# within-groups variation (SSW, a.k.a. SSE)
# store every observation's group mean as a new column of the data frame
PlantGrowth$MEAN = rep(unname(group_means), each = 10)
SSW = sum((PlantGrowth$weight - PlantGrowth$MEAN)^2)
#
# F statistic; named f_stat because `F` would mask the logical constant FALSE
# df between = 3 - 1 = 2, df within = 30 - 3 = 27
f_stat = (SSB/2)/(SSW/27)
# p-value (pf is the F cumulative distribution function)
p_value = 1 - pf(f_stat,2,27)
p_value
#
# predictions in R
# the fitted (predicted) mean for each group
predict(fit_plant,new=data.frame(group=c("ctrl","trt1","trt2")))
#
# check the assumption of normality
# QQ plot: we want the points as close as possible to the straight line
plot(fit_plant,which=2)
# 
# or 
library(car)
qqPlot(fit_plant,distribution = "norm")
#
# Residuals vs fitted values plot
# residuals are the observed y values minus the fitted values (here the group means)
# this plot can be used to check the homoscedasticity assumption
# also reveals trends in the data, outliers and violations of independence.
#
## Anova is a special case of linear models
# Equivalent way of doing one way anova for PlantGrowth data using 
# a linear model with 2 dummy variables
lin_plant=lm(weight~group,data=PlantGrowth)
summary(lin_plant)
#
# compare with coef(fit_plant) and observe that they are exactly the same
summary(fit_plant); coef(fit_plant)
#
# residuals vs fitted without the smoother; add.smooth is a logical flag
# (the original passed the string "false", which only works by accidental coercion)
plot(fit_plant,which=1,add.smooth = FALSE)
#
# non parametric approach (no normality assumption needed)
# Kruskal-Wallis rank sum test
k_fit_plant=kruskal.test(weight~group,data=PlantGrowth)
k_fit_plant
#
#contrasts
#install.packages("multcomp")
#
library(multcomp)
# the package is useful because it allows us to test any linear hypothesis,
# e.g. other hypotheses like mu1 = (mu2+mu3)/2 for 3 groups
# the coefficients of a contrast must add up to zero
#
plant_test=glht(fit_plant,linfct = mcp(group=c(1,-1/2,-1/2)))
# the test result
summary(plant_test)
# confidence intervals for contrasts
confint(plant_test)
# the estimate is 5.032-mean(c(4.661,5.526))
# also observe that the interval includes zero, a sign against H1
#
# Bonferroni correction
# Say we have 2 contrasts, c(1,-1/2,-1/2) and c(1,-1,0)
# multiple comparisons require a correction (the probability of at least one
# type I error in n independent comparisons is 1-(1-a)^n)
# create a matrix with  contrasts
K=rbind(c(1,-1/2,-1/2),c(1,-1,0))
plant_K=glht(fit_plant,linfct=mcp(group=K))
# non-corrected
summary(plant_K,test=adjusted("none"))
# Bonferroni corrected
# The alpha is divided by n(the correction)
summary(plant_K,test=adjusted("bonferroni"))
# confidence intervals
confint(plant_K,calpha =univariate_calpha()) 
# other corrections
# Bonferroni-Holm
# the smallest p-value is compared against a/n, the second smallest against
# a/(n-1), and so on (here essentially the same conclusion as "none")
summary(plant_K,test=adjusted("holm"))
#
#
# Tukey HSD (all pairwise comparisons)
TukeyHSD(fit_plant)#comparison for each pair of groups
# visualization
plot(TukeyHSD(fit_plant))
# this method can be done also with multcomp
plant_T=glht(fit_plant,linfct=mcp(group="Tukey"))
summary(plant_T)
# usage of t-tests for multiple pairwise comparisons
# no corrections
pairwise.t.test(PlantGrowth$weight, PlantGrowth$group,
                p.adjust.method = "none")
#with corrections
pairwise.t.test(PlantGrowth$weight, PlantGrowth$group,
                p.adjust.method = "bonferroni")
#
########################################################
########## Two way anova with interactions #############
########################################################
#
View(ToothGrowth)
?ToothGrowth
# count the observations in each cell of the design
# here we have a balanced design (equal cell counts)
#
xtabs(~supp+dose,data=ToothGrowth)
table(ToothGrowth$supp,ToothGrowth$dose)
#
# interaction plot
# if the lines for the two supplement types were parallel, the interaction
#effect would not be significant
with(ToothGrowth,interaction.plot(x.factor =dose ,trace.factor = supp,response=len))
#
# create elegant plots
library(ggplot2)
ggplot(ToothGrowth,aes(x=dose,y=len,lty=supp,shape=supp))+
  geom_point(size=1)+
  stat_summary(fun=mean,geom="line",aes(group=supp),colour="red")+
  theme_bw(base_size = 10) +  
  theme(
    legend.title = element_blank(),
    legend.position = "top",
    panel.grid.major = element_line(color = "gray85"),
    panel.grid.minor = element_blank(),
    axis.title = element_text(face = "bold"))+
  labs(
    x = "Dose of Supplement",
    y = "Tooth Length",
    title = "Interaction Plot"
  )
#
# fit the model - F-test
#
fit_tooth = aov(len ~ supp*as.factor(dose),data=ToothGrowth)
summary(fit_tooth)
#
# take the coefficients of the model
dummy.coef(fit_tooth)
fit_tooth$contrasts
# First we check the interaction!
# If the interaction is significant, the main effects stay in the model even if
#they are not significant themselves (not the case here)
# If the interaction is not significant, each main effect is assessed on its own
#to decide whether it remains in the model
#
# combine the two factors into one treatment factor (6 level combinations)
ToothGrowth[,"trt.comb"] <- interaction(ToothGrowth[, "supp"],ToothGrowth[, "dose"])
levels(ToothGrowth[,"trt.comb"])
fit_tooth_new=aov(len~trt.comb,data=ToothGrowth)
summary(fit_tooth_new)
#
# (group mean - population mean)
dummy.coef(fit_tooth_new)
fit_tooth_new$contrasts
#
# compare the two supplements at dose 0.5: significant
fit_half=glht(fit_tooth_new,linfct = mcp(trt.comb=c(-1,1,0,0,0,0)))
summary(fit_half)
#
# compare the two supplements at dose 1: significant
fit_one=glht(fit_tooth_new,linfct = mcp(trt.comb=c(0,0,-1,1,0,0)))
summary(fit_one)
# compare the two supplements at dose 2: not significant
fit_two=glht(fit_tooth_new,linfct = mcp(trt.comb=c(0,0,0,0,-1,1)))
summary(fit_two)
#
#diagnostic plots
#
#QQ plot of the standardized residuals
plot(fit_tooth, which = 2)
# manual version: QQ plot of the model residuals
# (the original plotted rnorm() noise instead of the residuals)
qqnorm(residuals(fit_tooth))
qqline(residuals(fit_tooth))
#check the equal variance assumption(funnel like shape or not)
plot(fit_tooth, which = 1, add.smooth = FALSE)
#

#Smoke data
# NOTE(review): hard-coded local path whose folder name is mis-encoded
# ("????? R") - adjust to your own machine. This appears to be the same
# smoking.csv that is read from the web at the top of the script - confirm.
Smoke=read.csv("C:/Users/steli/Desktop/DIDAKTORIKO/????? R/smoking.csv")
#independent variable: smoking status
#2 dependent variables: Cholesterol, triglyceride
#Could be done with Hotelling's T^2, but we use MANOVA instead
library(tidyr)
#convert wide to long format
Smoke_new <- pivot_longer(Smoke, cols = c( Cholesterol, triglyceride),
                            names_to = "variable", values_to = "value")
Smoke_new$smoking <- factor(Smoke_new$smoking)

mean_values <- Smoke_new %>%
  group_by(variable, smoking) %>%
  summarise(mean_value = mean(value, na.rm = TRUE))
mean_values$smoking <- factor(mean_values$smoking)
#plot 
ggplot(Smoke_new, aes(x = variable, y = value, color = smoking)) +
  geom_jitter(width = 0.2, size = 2, alpha = 0.7) +  
  # Horizontal mean lines per group
  theme_minimal(base_size = 14) +
  labs(
    title = "Cholesterol-triglyceride by Smoking",
    x = "Variable",
    y = "Value",
    color = "Smoking"
  )+facet_wrap(~variable) +  # Creates separate plots for each variable
  # Add the horizontal mean  lines 
  geom_hline(data = mean_values, aes(yintercept = mean_value, color = smoking), 
             linewidth = 0.5, linetype = "solid")


# Summary of the MANOVA test
manova_result <- manova(cbind(Cholesterol,triglyceride) ~ smoking, data =Smoke )
summary(manova_result)
#the null is rejected, so there is an indication of a difference between
#smokers and non-smokers in at least one dependent variable
#follow-up univariate one-way ANOVAs show which dependent variables differ
#(one way anova is discussed below)
ANOVA=aov(cbind(Cholesterol,triglyceride) ~ smoking, data =Smoke)
summary(ANOVA)
#both dependent variables appear important



library(haven)
# NOTE(review): hard-coded local path - adjust to your own machine
ATTICA <- read_sav("C:/Users/steli/Desktop/work/uni/anoixto panepisthmio/ATTICA Study 10 yr FU  NEO !.sav")
View(ATTICA)
#Data from the ATTICA study: patients followed up for CVD (cardiovascular disease)
#over a 10-year period
ATTICA_NEW=as.data.frame(ATTICA[,c(5,14,15,19)])
head(ATTICA_NEW)
ATTICA_CLEAR=na.omit(ATTICA_NEW)
View(ATTICA_CLEAR)
head(ATTICA_CLEAR)
#after removing the missing values we keep:
#bmi_grou (4 levels) as the independent variable
#sbp, dbp and tgl as the dependent variables
ATTICA_CLEAR$bmi_grou <- factor(ATTICA_CLEAR$bmi_grou)
# Also good to check and convert any numeric columns if needed
ATTICA_CLEAR$sbp <- as.numeric(ATTICA_CLEAR$sbp)
ATTICA_CLEAR$dbp <- as.numeric(ATTICA_CLEAR$dbp)
ATTICA_CLEAR$tgl <- as.numeric(ATTICA_CLEAR$tgl)
# Create the plot
#convert wide to long format
ATTICA_long <- pivot_longer(ATTICA_CLEAR, cols = c( sbp, dbp,tgl),
                            names_to = "variable", values_to = "value")
mean_values <- ATTICA_long %>%
  group_by(variable, bmi_grou) %>%
  summarise(mean_value = mean(value, na.rm = TRUE))
ggplot(ATTICA_long, aes(x = variable, y = value, color = bmi_grou)) +
  geom_jitter(width = 0.2, size = 2, alpha = 0.7) +  # Jittered dots
  # Horizontal mean lines per group
  theme_minimal(base_size = 14) +
  labs(
    title = "SBP, DBP, TGL by BMI Group",
    x = "Variable",
    y = "Value",
    color = "BMI Group"
  )+facet_wrap(~variable) +  # Creates separate plots for each variable
  # Add the horizontal mean lines (`linewidth`: `size` is deprecated for lines)
  geom_hline(data = mean_values, aes(yintercept = mean_value, color = bmi_grou), 
             linewidth = 0.5, linetype = "solid")

# Summary of the MANOVA test
manova_result <- manova(cbind(sbp, dbp, tgl) ~ bmi_grou, data = ATTICA_CLEAR)
summary(manova_result)
#the null is rejected, so there is an indication of a difference between the
#bmi groups in at least one dependent variable
#follow-up univariate one-way ANOVAs show which dependent variables differ
#(one way anova is discussed below)
ANOVA=aov(cbind(sbp, dbp, tgl) ~ bmi_grou, data = ATTICA_CLEAR)
summary(ANOVA)
#all three dependent variables are important






View(PlantGrowth)
?PlantGrowth
###################### ONE WAY ANOVA ########################
# Get data (built-in dataset: plant yields under a control and 2 treatments)
data("PlantGrowth")
str(PlantGrowth)                      #check the data
levels(PlantGrowth[,"group"])
#
#visualize the data
stripchart(weight~group,vertical=T,pch=1,data=PlantGrowth)
boxplot(weight~group,data=PlantGrowth)
#
# mean plot
# mean plot of weight across the groups
#
library(dplyr)
meanweight = PlantGrowth %>%
  group_by(group) %>%
  summarise(mean=mean(weight))
#
aggregate(PlantGrowth[,1],list(PlantGrowth$group), mean)
#
library(ggplot2)
ggplot(meanweight,aes(x=group,y=mean))+
  geom_point(size = 1)+
  geom_line(group=1)+
  labs(title="Mean plot of average weight across groups",x="Group",y="Weight")+
  theme_minimal()
#
# fit one-way ANOVA
fit_plant = aov(weight~group,data=PlantGrowth)
summary(fit_plant)
#
# coefficients (treatment contrasts: intercept = reference-group mean)
coef(fit_plant)
#
# check the reference group
dummy.coef(fit_plant)
#
# second way (the contrast is the sum to zero (a's))
fit_plant$contrasts
#
# Here the control group is the reference group
#
# manual calculation of the above results
# group means computed once (rows 1-10 = ctrl, 11-20 = trt1, 21-30 = trt2)
group_means = tapply(PlantGrowth$weight, PlantGrowth$group, mean)
#
# estimated coefficients: reference mean and the differences from it
a1 = group_means[["ctrl"]]                      #reference
a2 = group_means[["trt1"]] - a1
a3 = group_means[["trt2"]] - a1
#
# print the coefficients at once
c(a1,a2,a3)
# between-groups sum of squares (SSB); 10 observations per group
SSB = 10*sum((group_means - mean(PlantGrowth$weight))^2)
#
# within-groups variation (SSW, a.k.a. SSE)
# store every observation's group mean as a new column of the data frame
PlantGrowth$MEAN = rep(unname(group_means), each = 10)
SSW = sum((PlantGrowth$weight - PlantGrowth$MEAN)^2)
#
# F statistic; named f_stat because `F` would mask the logical constant FALSE
# df between = 3 - 1 = 2, df within = 30 - 3 = 27
f_stat = (SSB/2)/(SSW/27)
# p-value (pf is the F cumulative distribution function)
p_value = 1 - pf(f_stat,2,27)
p_value
#
# predictions in R
# the fitted (predicted) mean for each group
predict(fit_plant,new=data.frame(group=c("ctrl","trt1","trt2")))
#
# check the assumption of normality
# QQ plot: we want the points as close as possible to the straight line
plot(fit_plant,which=2)
# 
# or 
library(car)
qqPlot(fit_plant,distribution = "norm")
#
# Residuals vs fitted values plot
# residuals are the observed y values minus the fitted values (here the group means)
# this plot can be used to check the homoscedasticity assumption
# also reveals trends in the data, outliers and violations of independence.


## Anova is a special case of linear models
# Equivalent way of doing one way anova for PlantGrowth data using 
# a linear model with 2 dummy variables
lin_plant=lm(weight~group,data=PlantGrowth)
summary(lin_plant)
#
# compare with coef(fit_plant) and observe that they are exactly the same
summary(fit_plant); coef(fit_plant)
#
# residuals vs fitted without the smoother; add.smooth is a logical flag
# (the original passed the string "false", which only works by accidental coercion)
plot(fit_plant,which=1,add.smooth = FALSE)

#
# non parametric approach (no normality assumption needed)
# Kruskal-Wallis rank sum test
k_fit_plant=kruskal.test(weight~group,data=PlantGrowth)
k_fit_plant
#
#contrasts
#install.packages("multcomp")
#
library(multcomp)
# the package is useful because it allows us to test any linear hypothesis,
# e.g. other hypotheses like mu1 = (mu2+mu3)/2 for 3 groups
# the coefficients of a contrast must add up to zero
#
plant_test=glht(fit_plant,linfct = mcp(group=c(1,-1/2,-1/2)))
# the test result
summary(plant_test)
# confidence intervals for contrasts
confint(plant_test)
# the estimate is 5.032-mean(c(4.661,5.526))
# also observe that the interval includes zero, a sign against H1
#
# Bonferroni correction
# Say we have 2 contrasts, c(1,-1/2,-1/2) and c(1,-1,0)
# multiple comparisons require a correction (the probability of at least one
# type I error in n independent comparisons is 1-(1-a)^n)
# create a matrix with  contrasts
K=rbind(c(1,-1/2,-1/2),c(1,-1,0))
plant_K=glht(fit_plant,linfct=mcp(group=K))
# non-corrected
summary(plant_K,test=adjusted("none"))
# Bonferroni corrected
# The alpha is divided by n(the correction)
summary(plant_K,test=adjusted("bonferroni"))
# confidence intervals
confint(plant_K,calpha =univariate_calpha()) 
# other corrections
# Bonferroni-Holm
# the smallest p-value is compared against a/n, the second smallest against
# a/(n-1), and so on (here essentially the same conclusion as "none")
summary(plant_K,test=adjusted("holm"))
#
#
# Tukey HSD (all pairwise comparisons)
TukeyHSD(fit_plant)#comparison for each pair of groups
# visualization
plot(TukeyHSD(fit_plant))
# this method can be done also with multcomp
plant_T=glht(fit_plant,linfct=mcp(group="Tukey"))
summary(plant_T)
# usage of t-tests for multiple pairwise comparisons
# no corrections
pairwise.t.test(PlantGrowth$weight, PlantGrowth$group,
                p.adjust.method = "none")
#with corrections
pairwise.t.test(PlantGrowth$weight, PlantGrowth$group,
                p.adjust.method = "bonferroni")
#
########################################################
########## Two way anova with interactions #############
########################################################
#
View(ToothGrowth)
?ToothGrowth
# count the observations in each cell of the design
# here we have a balanced design (equal cell counts)
#
xtabs(~supp+dose,data=ToothGrowth)
table(ToothGrowth$supp,ToothGrowth$dose)
#
# interaction plot
# if the lines for the two supplement types were parallel, the interaction
#effect would not be significant
with(ToothGrowth,interaction.plot(x.factor =dose ,trace.factor = supp,response=len))
#
# create elegant plots
library(ggplot2)
ggplot(ToothGrowth,aes(x=dose,y=len,lty=supp,shape=supp))+
  geom_point(size=1)+
  stat_summary(fun=mean,geom="line",aes(group=supp),colour="red")+
  theme_bw(base_size = 10) +  
  theme(
    legend.title = element_blank(),
    legend.position = "top",
    panel.grid.major = element_line(color = "gray85"),
    panel.grid.minor = element_blank(),
    axis.title = element_text(face = "bold"))+
  labs(
    x = "Dose of Supplement",
    y = "Tooth Length",
    title = "Interaction Plot"
  )
#
# fit the model - F-test
#
fit_tooth = aov(len ~ supp*as.factor(dose),data=ToothGrowth)
summary(fit_tooth)
#
# take the coefficients of the model
dummy.coef(fit_tooth)
fit_tooth$contrasts
# First we check the interaction!
# If the interaction is significant, the main effects stay in the model even if
#they are not significant themselves (not the case here)
# If the interaction is not significant, each main effect is assessed on its own
#to decide whether it remains in the model
#
# combine the two factors into one treatment factor (6 level combinations)
ToothGrowth[,"trt.comb"] <- interaction(ToothGrowth[, "supp"],ToothGrowth[, "dose"])
levels(ToothGrowth[,"trt.comb"])
fit_tooth_new=aov(len~trt.comb,data=ToothGrowth)
summary(fit_tooth_new)
#
# (group mean - population mean)
dummy.coef(fit_tooth_new)
fit_tooth_new$contrasts
#
# compare the two supplements at dose 0.5: significant
fit_half=glht(fit_tooth_new,linfct = mcp(trt.comb=c(-1,1,0,0,0,0)))
summary(fit_half)
#
# compare the two supplements at dose 1: significant
fit_one=glht(fit_tooth_new,linfct = mcp(trt.comb=c(0,0,-1,1,0,0)))
summary(fit_one)
# compare the two supplements at dose 2: not significant
fit_two=glht(fit_tooth_new,linfct = mcp(trt.comb=c(0,0,0,0,-1,1)))
summary(fit_two)
#
#diagnostic plots
#
#QQ plot of the standardized residuals
plot(fit_tooth, which = 2)
# manual version: QQ plot of the model residuals
# (the original plotted rnorm() noise instead of the residuals)
qqnorm(residuals(fit_tooth))
qqline(residuals(fit_tooth))
#check the equal variance assumption(funnel like shape or not)
plot(fit_tooth, which = 1, add.smooth = FALSE)
#

