#############################
### Discriminant analysis ###
#############################

# Load libraries
library(MASS)
library(psych)
library(Morpho)
library(mvtnorm)
library(klaR)

##############################
###     Simulated  data    ###
##############################

# Two-group example
n = 200
dataTrain <- data.frame(group = rep(1:2,each = n/2))
dataTrain$x1 <- rep(NA,n)
dataTrain$x2 <- rep(NA,n)
dataTrain$x1[dataTrain$group==1] <- rnorm(sum(dataTrain$group==1),mean = - 2,sd = 1.2)
dataTrain$x1[dataTrain$group==2] <- rnorm(sum(dataTrain$group==2),mean = + 2,sd = 1.2)
dataTrain$x2[dataTrain$group%in%1:2] = dataTrain$x1[dataTrain$group%in%1:2] + rnorm(2*n/3,sd = 0.8)

hist(dataTrain$x1)
hist(dataTrain$x2)
plot(dataTrain$x1,dataTrain$x2)

# Fit an lda model
fit <- lda(group ~ x1 + x2, data = dataTrain,prior = c(1/2,1/2))
fit

# Means by group
m1 <- colMeans(dataTrain[dataTrain$group==1,c("x1","x2")])
m2 <- colMeans(dataTrain[dataTrain$group==2,c("x1","x2")])
m1
m2

# Covariance matrix by group
s1 <- cov(dataTrain[dataTrain$group==1,c("x1","x2")])
s2 <- cov(dataTrain[dataTrain$group==2,c("x1","x2")]) 
s1
s2

# Number of items by group
n1 = sum(dataTrain$group==1)
n2 = sum(dataTrain$group==2)

# Pooled covariance matrix
Sp <- ((n1-1)*s1 + (n2-1)*s2)/(n1+n2-2)
Sp

# Predict class membership
dataTrain$predClass <- predict(fit,dataTrain)$class
dataTrain$prob1 <- NA
dataTrain$prob2 <- NA
dataTrain[,c("prob1","prob2")] <- predict(fit,dataTrain)$posterior

xx1 <- dmvnorm(dataTrain[,c("x1","x2")],mean = m1,sigma = Sp)
xx2 <- dmvnorm(dataTrain[,c("x1","x2")],mean = m2,sigma = Sp)
dataTrain$predClass2 <- 1*(xx1>xx2) + 2*(xx2>xx1)

# Check
table(xx1/(xx1+xx2) - dataTrain$prob1 + 1)
table(xx2/(xx1+xx2) - dataTrain$prob2 + 1)
table(dataTrain$predClass,dataTrain$predClass2)

# Three-group example
n = 3000
dataTrain <- data.frame(group = rep(1:3,each = n/3))
dataTrain$x1 <- rep(NA,n)
dataTrain$x2 <- rep(NA,n)
dataTrain$x1[dataTrain$group==1] <- rnorm(sum(dataTrain$group==1),mean = - 2,sd = 1.2)
dataTrain$x1[dataTrain$group==2] <- rnorm(sum(dataTrain$group==2),mean = + 2,sd = 1.2)
dataTrain$x1[dataTrain$group==3] <- rnorm(sum(dataTrain$group==3),mean = + 3,sd = 1.2)
dataTrain$x2[dataTrain$group%in%1:2] = dataTrain$x1[dataTrain$group%in%1:2] + rnorm(2*n/3,sd = 0.8)
dataTrain$x2[dataTrain$group==3] = -dataTrain$x1[dataTrain$group==3] + rnorm(n/3,sd = 0.8)

# Fit an lda model
fit <- lda(group ~ x1 + x2, data = dataTrain,prior = c(1/3,1/3,1/3))
fit

# Means by group
m1 <- colMeans(dataTrain[dataTrain$group==1,c("x1","x2")])
m2 <- colMeans(dataTrain[dataTrain$group==2,c("x1","x2")])
m3 <- colMeans(dataTrain[dataTrain$group==3,c("x1","x2")])
m1
m2
m3

# Covariance matrix by group
s1 <- cov(dataTrain[dataTrain$group==1,c("x1","x2")])
s2 <- cov(dataTrain[dataTrain$group==2,c("x1","x2")]) 
s3 <- cov(dataTrain[dataTrain$group==3,c("x1","x2")])
s1
s2
s3

# Number of items by group
n1 = sum(dataTrain$group==1)
n2 = sum(dataTrain$group==2)
n3 = sum(dataTrain$group==3)

# Pooled covariance matrix
Sp <- ((n1-1)*s1 + (n2-1)*s2 + (n3-1)*s3 )/(n1+n2+n3-3)
Sp

# Predict class membership
dataTrain$predClass <- predict(fit,dataTrain)$class
dataTrain$prob1 <- NA
dataTrain$prob2 <- NA
dataTrain$prob3 <- NA
dataTrain[,c("prob1","prob2","prob3")] <- predict(fit,dataTrain)$posterior

xx1 <- dmvnorm(dataTrain[,c("x1","x2")],mean = m1,sigma = Sp)
xx2 <- dmvnorm(dataTrain[,c("x1","x2")],mean = m2,sigma = Sp)
xx3 <- dmvnorm(dataTrain[,c("x1","x2")],mean = m3,sigma = Sp)
dataTrain$predClass2 <- 1*(xx1>xx2 & xx1>xx3) + 2*(xx2>xx1 & xx2>xx3) + 3*(xx3>xx1 & xx3>xx2)

# Check
table(xx1/(xx1+xx2+xx3) - dataTrain$prob1 + 1)
table(xx2/(xx1+xx2+xx3) - dataTrain$prob2 + 1)
table(xx3/(xx1+xx2+xx3) - dataTrain$prob3 + 1)
table(dataTrain$predClass,dataTrain$predClass2)

############################################################################

##############################
###       Iris  data       ###
##############################


# Load data
data(iris)

# Matrix plot
pairs.panels(iris[1:4],
             gap = 0,smooth = F,ellipses = F,
             bg = c("red", "green", "blue")[iris$Species],
             pch = 21)

# Training and testing datasets
set.seed(123)
ind <- sample(2, nrow(iris),
              replace = TRUE,
              prob = c(0.6, 0.4))
training <- iris[ind==1,]
testing <- iris[ind==2,]

# Linear LDA
linear <- lda(Species ~ Sepal.Length + Sepal.Width + Petal.Length + Petal.Width, data = training,
              prior = c(1/3,1/3,1/3))
linear
linear$svd^2/sum(linear$svd^2)

p <- predict(linear, data = training)
ldahist(data = p$x[,1], g = training$Species)
ldahist(data = p$x[,2], g = training$Species)

# Confusion matrix and accuracy - training data
p1 <- predict(linear, training)$class
tab <- table(Predicted = p1, Actual = training$Species)
tab
sum(diag(tab))/sum(tab)

# Confusion matrix and accuracy - testing data
p2 <- predict(linear, testing)$class
tab1 <- table(Predicted = p2, Actual = testing$Species)
tab1
sum(diag(tab1))/sum(tab1)