#######################
### Heptathlon data ###
#######################

# Read the heptathlon results from CSV and preview the first rows to check
# that the import looks sensible.
# NOTE(review): the path is absolute and machine-specific; the script only
# runs as-is where this file exists at exactly this location.
heptathlon <- utils::read.csv(
  file = "C:/Users/loukia/Desktop/multivariate/Labs/Lab2_R_PCA/heptathlon.csv"
)
head(heptathlon, n = 6L)

# Exploratory look at the analysis variables: every column except the first
# and the last one of the data frame (presumably the id and score columns).
event_data <- heptathlon[, -c(1, ncol(heptathlon))]

# Scatterplot matrix: graphical check of the pairwise relationships
pairs(event_data)

# Sample covariance matrix, shown to 4 decimals
# (much larger variances for Javelin and 800m)
round(cov(event_data), digits = 4)

# Sample correlation matrix, shown to 3 decimals
round(cor(event_data), digits = 3)

# Eigen-decompose the covariance matrix (S) and the correlation matrix (R)
# of the analysis variables (all columns except the first and the last).
event_data <- heptathlon[, -c(1, ncol(heptathlon))]

S <- cov(event_data)
eigS <- eigen(S)

R <- cor(event_data)
eigR <- eigen(R)

# phi: root mean square of the off-diagonal correlations.
# R is a correlation matrix, so its diagonal is all ones; sum(R^2) - p is
# therefore the sum of the squared off-diagonal entries, and p * (p - 1) is
# the number of such entries.
p <- ncol(R)
n_off_diag <- p * (p - 1)
off_diag_ss <- sum(R^2) - p
sqrt(off_diag_ss / n_off_diag)

# Summary table of the eigenvalues of S and of R. For each component it
# holds the eigenvalue, the proportion of the total it accounts for and the
# cumulative proportion (columns *S for the covariance matrix, *R for the
# correlation matrix).
# The component index `it` is derived from the eigenvalues themselves, not
# from the width of `heptathlon`, so the table still builds correctly after
# extra columns (component scores, ranks) have been added to the data frame.
prop_S <- eigS$values / sum(eigS$values)
prop_R <- eigR$values / sum(eigR$values)
eigTable <- data.frame(
  it = seq_along(eigS$values),
  EigValueS = eigS$values,
  PercS = prop_S,
  ceigS = cumsum(prop_S),
  EigValueR = eigR$values,
  PercR = prop_R,
  ceigR = cumsum(prop_R)
)
round(eigTable, 3)

# Eigenvectors of the covariance matrix, rounded to 6 decimals:
# first with one eigenvector per column, then transposed (one per row)
round(eigS$vectors, digits = 6)
t(round(eigS$vectors, digits = 6))

# The same two views for the eigenvectors of the correlation matrix
round(eigR$vectors, digits = 6)
t(round(eigR$vectors, digits = 6))

# Normalization check: the sum of squared entries of every eigenvector
# (every column) should equal 1
colSums(eigS$vectors * eigS$vectors)
colSums(eigR$vectors * eigR$vectors)

# Flip the sign of the three running events so that, for every variable,
# a 'large' value is a 'good' one
# (presumably these are recorded as times, where smaller is better).
time_cols <- c("X100m", "X200m", "X800m")
heptathlon[time_cols] <- -heptathlon[time_cols]

# Recompute the covariance and correlation matrices on the sign-adjusted
# data (printing each one) and redo both eigen-decompositions
event_data <- heptathlon[, -c(1, ncol(heptathlon))]
(S <- cov(event_data))
(R <- cor(event_data))
eigS <- eigen(S)
eigR <- eigen(R)

# Scatterplot matrix of the sign-adjusted variables:
# all variables are positively correlated now
pairs(heptathlon[-c(1, ncol(heptathlon))])

# Eigenvectors of the correlation matrix (6 decimals), one per column and
# then transposed
round(eigR$vectors, digits = 6)
t(round(eigR$vectors, digits = 6))

# The same eigenvectors with all signs reversed; this is the orientation
# used for the plots and component scores further down
-round(eigR$vectors, digits = 6)

# Plot of the coefficients of each variable on the first two principal
# components of the correlation matrix (eigenvectors with reversed sign).
pc_coef <- -eigR$vectors[, 1:2]
plot(pc_coef[, 1], pc_coef[, 2], xlim = c(-1, 1), ylim = c(-1, 1),
     xlab = "1st component", ylab = "2nd component")
abline(h = 0, v = 0, lty = 2)
# Row i of the eigenvector matrix belongs to column i of R, so the labels
# are taken from R's column names instead of a hand-typed vector; this keeps
# every point correctly labelled even if the columns are renamed or reordered.
text(x = pc_coef[, 1], y = pc_coef[, 2], labels = colnames(R), pos = 1)

# 1st and 2nd component scores: the standardized analysis variables
# multiplied by the first two eigenvectors of R (with reversed sign).
# The variables are selected by name through colnames(R) rather than by
# position. A positional "drop first and last column" selection would also
# pick up `score` and the rank columns once those have been appended, which
# makes the matrix product non-conformable when this section is re-run.
std_events <- scale(heptathlon[, colnames(R)])
pc_scores <- std_events %*% (-eigR$vectors[, 1:2])
# Assigning by name overwrites Comp1/Comp2 in place if they already exist
# and appends them otherwise, so no explicit reset is needed.
heptathlon[, c("Comp1", "Comp2")] <- pc_scores

# Ranks according to the total score (rank 1 = highest score).
# rank() returns the ranks in the current row order of `heptathlon`, so the
# result is correct whether or not the rows are sorted by id. Ties are broken
# by row position and missing scores are ranked last, which matches numbering
# the rows of a data frame sorted with order(-score).
heptathlon$rankScore <- rank(-heptathlon$score, na.last = TRUE,
                             ties.method = "first")

# Preview of the result, ordered by id
dd <- heptathlon[order(heptathlon$id), ]
head(dd)

# Ranks according to the 1st principal component (rank 1 = largest score on
# the component).
# rank() returns the ranks in the current row order of `heptathlon`, so the
# result is correct whether or not the rows are sorted by id. Ties are broken
# by row position and missing values are ranked last, which matches numbering
# the rows of a data frame sorted with order(-Comp1).
heptathlon$rankComp1 <- rank(-heptathlon$Comp1, na.last = TRUE,
                             ties.method = "first")

# Preview of the result, ordered by id
dd <- heptathlon[order(heptathlon$id), ]
head(dd)

# Scatterplot of the scores on the first two components; each point is
# labelled with the athlete's rank on the first principal component
pc1 <- heptathlon$Comp1
pc2 <- heptathlon$Comp2
plot(x = pc1, y = pc2, xlab = "1st component", ylab = "2nd component")
abline(h = 0, v = 0, lty = 2)  # dashed reference axes through the origin
text(x = pc1, y = pc2, labels = heptathlon$rankComp1, pos = 3, cex = 0.7)

# Scatterplot of the scores on the first two components; each point is
# labelled with the athlete's actual rank (rank on the total score)
pc1 <- heptathlon$Comp1
pc2 <- heptathlon$Comp2
plot(x = pc1, y = pc2, xlab = "1st component", ylab = "2nd component")
abline(h = 0, v = 0, lty = 2)  # dashed reference axes through the origin
text(x = pc1, y = pc2, labels = heptathlon$rankScore, pos = 3, cex = 0.7)

# First principal component score against the total score; each point is
# labelled with the athlete's actual rank (rank on the total score)
pc1 <- heptathlon$Comp1
total_score <- heptathlon$score
plot(x = pc1, y = total_score, xlab = "1st component", ylab = "Score")
text(x = pc1, y = total_score, labels = heptathlon$rankScore,
     pos = 3, cex = 0.7)


# Scree plot: eigenvalues of the correlation matrix against the component
# number, drawn as connected points
plot(
  x = eigTable$it,
  y = eigTable$EigValueR,
  type = "b",
  pch = 19,
  lwd = 2,
  col = "lightblue",
  xlab = "Component number",
  ylab = "Eigenvalue"
)
