                                  ### INTRODUCTION ###

getwd() # print the current working directory
# Template - replace the path with a folder that exists on YOUR machine. A
# hard-coded personal path makes setwd() fail everywhere else, so it is kept
# commented out, like the other setwd() templates further down in this script.
# setwd("path/to/your/folder") # change the working directory
getwd()

library(survival) # load a library that is already installed
# Templates - replace name_of_library with a real package name before running;
# as written, library(name_of_library) stops with "there is no package called".
# install.packages("name_of_library") # install a new library (needed once)
# library(name_of_library) # and load it (needed in every session)

?seq # the help page of the function written after ? opens in the help window

                          ### OPERATIONS & BASIC MATHEMATICAL FUNCTIONS ###

# Arithmetic operators
2 + 2
2 - 4
2 * 4
2 / 4

2^4
2**4 # ** is an alternative spelling of ^

sqrt(16) # square root
log(1) # natural logarithm
log10(10) # base-10 logarithm
exp(1) # exponential function
sin(pi / 2)
factorial(3)

# Comparisons return logical values
2 < 10
10 < 2
# WRONG way to test whether 10 equals 2: a single = is assignment, so running
# the next line raises "invalid (do_set) left-hand side to assignment" and
# aborts a sourced script. It is kept commented out on purpose; run it by hand
# to see the error.
# 10 = 2
10 == 2 # the correct way: == compares, it does not assign

x <- 2
y <- 2
x == y

!(x == y) # ! negates a logical value (TRUE becomes FALSE and vice versa)

(2 < 4) & (3 < 4) # & is TRUE only if both expressions are TRUE
(2 < 4) & (3 < 2)
(2 < 1) & (3 < 2)

(2 < 4) | (3 < 4) # | is TRUE if at least one of the two expressions is TRUE
(2 < 4) | (3 < 2)
(2 < 1) | (3 < 2)

rm(x, y) # rm() removes the selected objects from the workspace

                                ### Assign values - R OBJECTS ###

x <- 3 # assign the value 3 to a variable called x (Alt + - types <- in RStudio)
x
x <- 2 + 3 # x now holds 5; its old value 3 no longer exists
X <- 3 # R is case-sensitive: X and x are two different objects
rm(x, X)

# Numerics (doubles, integers)
x <- 4
class(x) # returns the class of x
is.integer(x) # asks whether x is stored as an integer

x <- 4L # the L suffix tells R explicitly that this is an integer
class(x)
is.integer(x)

# Logicals
x <- 2 > 3 # remember what the result of 2 > 3 is
x
class(x)
is.logical(x)
as.logical(1) # the as.<class>() functions convert their input to that class
as.logical(100) # as.logical() returns TRUE for every numeric value except 0
as.logical(0)

# Characters
x <- "abc"
class(x)
is.character(x)
as.character(5)

# Only x exists at this point: y was already removed at the end of the
# previous section, so asking rm() to remove it again would raise a warning.
rm(x)


                          ### VECTORS ###

# --- Building vectors ---
x <- c(2, 6, 9)
x
x <- c(2, 6, "a") # a vector holds ONE class, so 2 and 6 become the strings "2" and "6"
x

x <- c(1, 2, 3, 4, 5)
x
y <- 1:5 # shorter way to write consecutive integers
y
help("seq")
z <- seq(from = 1, to = 5, by = 1) # the same values through seq()
z
seq(from = 10, to = 5, by = -1) # a decreasing sequence needs a negative step

help("rep")
rep(3, times = 7)
rep(c(1, 2, 3), times = 7) # the whole vector c(1, 2, 3), repeated 7 times
rep(c(1, 2, 3), each = 7) # seven 1s, then seven 2s, then seven 3s
rep(c(1, 2, 3), times = c(1, 2, 3)) # with `times` as long as the input, element k is repeated times[k] times

x <- 1:100
length(x) # number of elements of the vector

# --- Extracting elements ---
x <- 11:20
x
c(x, 30) # x with the value 30 appended (x itself is not modified)

x[6] # [] extracts the element sitting at the given position
x[c(6, 8)] # several positions go inside c(); x[6,8] is wrong
x[c(8, 6)] # positions may be given in any order
x[6:8]
x[-1] # a negative index drops that position
x[-6]
x[-(1:3)]


x[x > 15] # keep the elements for which the logical expression is TRUE
x > 15
# check the behavior of the commands below (the shorter vector is recycled)
x > c(15, 16, 17)
x > c(15, 16)


# --- Element-wise arithmetic ---
x <- 1:5
y <- 6:10
x + y
x - y
x * y # multiplies the elements pairwise, position by position


sum(x) # sum of the elements of a vector

exp(x) # a function applied to a vector is applied to each element and returns a vector of the results

rm(x, y, z)
                   ### LISTS ###

# A list may mix classes: two numbers, a string and a logical
L <- list(1, 2, "hello", FALSE)
L

L[1] # single brackets return a (shorter) LIST, not the element itself
is.numeric(L[1])
L[1:2] # single brackets are the tool for pulling out several elements at once

L[[1]] # double brackets return the element itself
is.numeric(L[[1]]) # so [[ ]] is the appropriate way to extract one single object
is.character(L[[3]])

rm(L)

                 ### MATRICES ###

x <- c(1, 2, 3)
y <- c(4, 5, 6)

# Binding vectors into matrices
mat1 <- rbind(x, y) # rbind(): x and y become the ROWS
mat1
dim(mat1) # dimensions, as (number of rows, number of columns)
nrow(mat1) # number of rows
ncol(mat1) # number of columns

mat2 <- cbind(x, y) # cbind(): x and y become the COLUMNS
mat2
dim(mat2)

# matrix(): byrow decides whether the values fill the matrix row by row or column by column
mat3 <- matrix(1:12, nrow = 3, ncol = 4, byrow = TRUE)
mat4 <- matrix(1:12, nrow = 3, ncol = 4, byrow = FALSE)
mat3
mat4

# Indexing
mat3[1, ] # mat[i, ] is the i-th row
mat3[, 2] # mat[, j] is the j-th column
mat3[1, 2] # mat[i, j] is the element in row i, column j

mat3
mat3[1:2, 3:4] # sub-matrix: rows 1-2, columns 3-4

# Arithmetic with matrices
mat1 <- rbind(c(1, 2), c(3, 4))
mat2 <- rbind(c(1, 2), c(3, 4))
mat1

mat1 + mat2
mat1 * mat2 # * multiplies element by element
mat1 %*% mat2 # %*% is the matrix product

rm(x, y, mat1, mat2, mat3, mat4)

                    ### DATAFRAMES ###
# Three vectors of equal length, one per future column
a <- c(1:5)

b <- c("Mon", "Tue", "Mon", "Wed", "Wed")

# Spell out TRUE/FALSE: T and F are ordinary variables that can be reassigned.
# (Naming a vector `c` works - R still finds the function c() when it is
# called - but avoid such names in your own code.)
c <- c(TRUE, TRUE, FALSE, TRUE, FALSE)

df <- data.frame(a, b, c)
df

names(df) # column names; they come from the names of the respective vectors
names(df) <- c("id", "Var1", "Var2") # and can be changed
df

df[1, ] # row/column indexing works as in matrices
df[, 2]
df$id # $ followed by the column name extracts that column (i.e. variable)
df$Var1

df[df$id != 1, ] # subset of rows defined by a condition on one of the variables
df[df$id != 1, c("Var1", "Var2")] # same rows, keeping only particular variables


              ### FACTORS ### (probably the most appropriate class for a categorical variable)

x <- rep(c(0, 1), times = 5)
x # consider x to be a binary variable (eg gender); 0 for the first category, 1 for the other

# Convert x to a factor. `levels` lists the codes found in the data and
# `labels` names them one-to-one, so both must have the same length
# (factor() stops with "invalid 'labels'" when they do not).
x <- factor(x, levels = c(0, 1), labels = c("male", "female"))
x
class(x)
levels(x) # get the levels of a factor

x == "male"
x == 0 # all FALSE: the code 0 no longer exists, the values are now the labels
as.numeric(x) # the reference (first) level becomes 1, the next level 2 and so on

y <- relevel(x, ref = "female")
y
levels(y) # only the order of the levels has changed; useful e.g. to change the reference level of a variable in a model


      ### READ DATASET IN DATAFRAME - Useful functions - missing values ###

#setwd("...") # set appropriate wd
# heart.csv is comma-separated (sep = ",") and its first line holds the variable names (header = TRUE)
data <- read.table(file = "heart.csv", sep = ",", header = TRUE)

str(data) # compact overview of the dataframe: variable names and classes

data <- data[, -1] # drop the first column

summary(data) # quick summary statistics for every variable

# --- Turn the categorical variables into factors ---
unique(data$AHD) # unique() lists the distinct values of a variable

data$AHD <- factor(data$AHD, levels = c("No", "Yes"), labels = c("No", "Yes"))
levels(data$AHD)

unique(data$ChestPain)

# without `levels`, factor() uses the distinct values of the variable in sorted (alphabetical) order
data$ChestPain <- factor(data$ChestPain)

unique(data$Sex)
data$Sex <- factor(data$Sex, levels = c(0, 1), labels = c("female", "male")) # code 0 is labelled female, code 1 male

str(data)
summary(data)

# --- Frequency tables ---
table(data$Sex) # counts per category
prop.table(table(data$Sex)) # proportions instead of counts

table(data$Sex, data$AHD) # cross-tabulation of two variables

table(data$Sex, data$AHD, data$ChestPain) # one Sex-by-AHD table for each level of ChestPain


mean(data$Age) # mean of a variable
sum(data$Age) # sum of the components of a vector
sum(data$Age) / nrow(data) # mean age, computed by hand

var(data$Chol) # variance
sd(data$Chol) # standard deviation

median(data$RestBP)

quantile(data$Chol) # by default returns min, 1st, 2nd, 3rd quartiles, max
IQR(data$Chol)

quantile(data$Chol, probs = c(0.5, 0.7)) # `probs` lets you ask for any quantile

mean(data$Chol[data$Sex == "male"]) # mean of a variable for particular values of another

sort(data$Age) # sort values
sort(data$Age, decreasing = TRUE)

# Suppose we want to sort the dataframe by Age. Doing data$Age <- sort(data$Age)
# is a mistake: it changes only the Age variable, while all the other variables
# stay where they were, so the rows no longer belong together.
# We have to use order() instead.
order(data$Age)
# The first value returned by order() is the position of the smallest Age, so
# the two lines below print the same number. (Looking the position up instead
# of hard-coding a row number keeps this correct for any version of the file.)
data$Age[order(data$Age)[1]]
min(data$Age)

# So order() does not return the sorted values: it returns, for each place in
# the sorted sequence (increasing by default), the position that value
# occupies in the original vector.

data <- data[order(data$Age), ] # the correct way to sort the dataframe by Age

# Build a categorical version of Age, in two different ways
# Way 1: start from a constant column, then overwrite part of it through []
data$age_categ1 <- 1 # every row starts in category 1
data$age_categ1[data$Age > 45] <- 2 # rows meeting the condition move to category 2
data$age_categ1 <- factor(
  data$age_categ1,
  levels = c(1, 2),
  labels = c("<= 45", "> 45")
)
table(data$age_categ1)

# Way 2: the same result in one step with ifelse()
data$age_categ2 <- ifelse(data$Age <= 45, yes = 1, no = 2)
data$age_categ2 <- factor(
  data$age_categ2,
  levels = c(1, 2),
  labels = c("<= 45", "> 45")
)
table(data$age_categ2)

# try creating a new categ. version of age, with categories (<=45, (45-55], >55))
# check also cut() function

# Missing values
# a few values are set to NA here purely for demonstration
data$Age[c(2, 4, 6)] <- NA
data$Sex[c(3, 8, 9)] <- NA
head(is.na(data$Age))
sum(is.na(data$Age)) # number of missing values of Age
summary(data)

mean(data$Age) # NA, because Age now contains missing values
mean(data$Age, na.rm = TRUE) # the mean after removing the missing values
table(data$Sex) # missing values are left out by default

table(data$Sex, useNA = "ifany") # also show the missing values




      ### SAVE & LOAD ###

getwd() # files below are written to / read from this directory
save.image(file = "env.RData") # store the whole current workspace in a .RData file
load(file = "env.RData") # bring back the workspace stored in a .RData file
source(file = "script.R") # run every command contained in another script



                                   ### BONE DATASET, PLOTS###

#setwd("....")
load("bone.RData")
class(bone)
str(bone)
summary(bone) # summary of all variables of the dataframe

hist(bone$age) # histogram of the age variable
hist(bone$age, xlab = "Age", main = "Histogram of Age", col = "steelblue")
hist(bone$age, xlab = "Age", main = "Histogram of Age", col = "steelblue", breaks = "FD") # a different rule for choosing the breaks

# Histograms of age by gender

Age_males <- bone$age[bone$gender == "male"] # ages of the males of our sample
Age_females <- bone$age[bone$gender == "female"] # ages of the females of our sample

# Put two plots side by side. par() returns the previous settings, which are
# put back afterwards; this returns to one plot per page without closing the
# graphics device (closing it with dev.off() would discard what was drawn).
old_par <- par(mfrow = c(1, 2))

hist(Age_males, xlab = "Age", main = "Males", col = "red")

hist(Age_females, xlab = "Age", main = "Females", col = "green")
par(old_par)

boxplot(bone$age) # boxplot of age
boxplot(bone$age, col = "orange", ylab = "Age")
boxplot(bone$age, col = "orange", xlab = "Age", horizontal = TRUE)

levels(bone$gender) # check the order of the levels so that `names` below is given in the same order
boxplot(bone$age ~ bone$gender, col = c("gold", "blue"), ylab = "Age", xlab = "Gender", names = c("Females", "Males")) # boxplot of age by gender
boxplot(age ~ gender, data = bone, col = c("gold", "blue"), ylab = "Age", xlab = "Gender", names = c("Females", "Males")) # same plot through `data =`, without using $
boxplot(age ~ gender, data = bone, col = c("gold", "blue"), xlab = "Age", ylab = "Gender", names = c("Females", "Males"), horizontal = TRUE) # same but horizontal, so the axis labels swap



# Basic scatterplot, then the same plot with axis labels and a colour
plot(x = bone$spnbmd, y = bone$age)
plot(x = bone$spnbmd, y = bone$age, col = "red", xlab = "SPNBMD", ylab = "AGE")
plot(bone$age ~ bone$spnbmd, col = "red", xlab = "SPNBMD", ylab = "AGE") # formula interface (y ~ x), same picture


# pch selects the plotting symbol (see ?points for the full list)
plot(x = bone$spnbmd, y = bone$age, col = "red", pch = 19, xlab = "SPNBMD", ylab = "AGE")
plot(x = bone$spnbmd, y = bone$age, col = "red", pch = 5, xlab = "SPNBMD", ylab = "AGE")
plot(x = bone$spnbmd, y = bone$age, col = "red", pch = 3, xlab = "SPNBMD", ylab = "AGE")

# cex scales the size of the points
plot(x = bone$spnbmd, y = bone$age, col = "blue4", pch = 19, cex = 0.6, xlab = "SPNBMD", ylab = "AGE")
plot(x = bone$spnbmd, y = bone$age, col = "blue4", pch = 19, cex = 1.5, xlab = "SPNBMD", ylab = "AGE")
# main adds a title
plot(x = bone$spnbmd, y = bone$age, col = "blue4", pch = 19, cex = 0.6, xlab = "SPNBMD", ylab = "AGE", main = "A plot")
# cex.main scales the title
plot(x = bone$spnbmd, y = bone$age, col = "blue4", pch = 19, cex = 0.6, xlab = "SPNBMD", ylab = "AGE", main = "A plot", cex.main = 1.3)
# cex.lab scales the axis labels
plot(x = bone$spnbmd, y = bone$age, col = "blue4", pch = 19, cex = 0.6, xlab = "SPNBMD", ylab = "AGE", main = "A plot", cex.main = 1.3, cex.lab = 0.8)
# cex.axis scales the tick annotations
plot(x = bone$spnbmd, y = bone$age, col = "blue4", pch = 19, cex = 0.6, xlab = "SPNBMD", ylab = "AGE", main = "A plot", cex.main = 1.3, cex.axis = 0.8)


# pch = 21 is a symbol with separate border (col) and fill (bg) colours
plot(bone$age ~ bone$spnbmd, col = "blue", pch = 21, xlab = "SPNBMD", ylab = "AGE")
plot(bone$age ~ bone$spnbmd, col = "blue", bg = "red3", pch = 21, cex = 1.2, xlab = "SPNBMD", ylab = "AGE") # bg only has an effect for symbols that can be filled



# same plot using separate color by gender (here we can also see how we can add a plot to an already created plot)

bonefemale <- bone[bone$gender == "female", ] # dataframe with the females only
bonemale <- bone[bone$gender == "male", ] # dataframe with the males only

plot(spnbmd ~ age, data = bone, type = "n") # empty plot (type = "n"): axes only
points(spnbmd ~ age, data = bonefemale, pch = 19, col = "red", cex = 0.7) # add the points for females
points(spnbmd ~ age, data = bonemale, pch = 19, col = "blue", cex = 0.7) # add the points for males
# `legend =` is named explicitly: the second positional argument of legend()
# is the y coordinate, not the legend text
legend(x = "topright", legend = c("females", "males"), col = c("red", "blue"), pch = 19) # add a legend

plot(spnbmd ~ age, data = bone, type = "n")
points(spnbmd ~ age, data = bonefemale, pch = 19, col = "red", cex = 0.7)
points(spnbmd ~ age, data = bonemale, pch = 19, col = "blue", cex = 0.7)
legend(x = 23, y = 0.2, legend = c("females", "males"), col = c("red", "blue"), pch = 19) # place the legend by coordinates



# another way to give different colours according to a third variable (without having to create separate datasets)
levels(bone$gender)
# palette() sets the colours that the numbers 1, 2, ... refer to and returns
# the palette that was active before, which is restored after the plot
old_palette <- palette(c("blue", "green"))
plot(bone$age, bone$spnbmd, pch = 19, col = bone$gender, cex = 0.8) # col is set to the variable that defines the colours
legend("topright", legend = levels(bone$gender), pch = 19, col = seq_len(nlevels(bone$gender)))
palette(old_palette)


# same idea as above but now place the plots side by side
old_par <- par(mfrow = c(1, 2))
plot(spnbmd ~ age, data = bonefemale, pch = 19, col = "red", cex = 0.7) # first plot
plot(spnbmd ~ age, data = bonemale, pch = 19, col = "blue", cex = 0.7) # second plot
par(old_par) # back to the previous layout, so that later plots are not squeezed into half a page



### FUNCTIONS ###

# Applies the linear map 3 * x + 2 to x (a single number or a numeric vector).
my_func1 <- function(x) {
  3 * x + 2 # the value of the last expression is what the function returns
}

my_func1(2) # call the function with a particular value of x
my_func1(1:10) # it also works element by element on a vector

# Function with two arguments: returns sqrt(x^2 + y^2).
my_func2 <- function(x, y) {
  sum_of_squares <- x^2 + y^2
  sqrt(sum_of_squares)
}

my_func2(3, 4)

# Returns TWO results, packed in a list because a function returns one object:
#   1st element - TRUE when sqrt(x^2 + y^2) has no fractional part
#   2nd element - sqrt(x^2 + y^2) itself
my_func3 <- function(x, y) {
  root <- sqrt(x^2 + y^2)
  is_whole <- root %% 1 == 0 # %% gives the remainder of a division; remainder 0 after dividing by 1 means no fractional part
  list(is_whole, root)
}

my_func3(3, 4)

my_func3(3, 9)

# create a function that calculates the coefficient of variation(s/m) of a vector of data_points


#### FOR, IF, IF-ELSE, WHILE ###

y <- rep(0, 10) # initialise the vector that the loop will fill

for (i in seq_along(y)) { # i runs over the positions of y, i.e. from 1 to 10
  y[i] <- 3 * i^2 - 1
}

y

3 * (1:10)^2 - 1 # equivalent and faster: arithmetic is vectorised

# using a for loop create a 3x3 matrix, with 1s in the first row, 2s in the second row,...

mat <- matrix(0, nrow = 3, ncol = 3)
mat

# seq_len(nrow(mat)) follows the size of the matrix instead of hard-coding 1:3
for (i in seq_len(nrow(mat))) {
  mat[i, ] <- rep(i, times = ncol(mat)) # explicit: a full row of i's
}
for (i in seq_len(nrow(mat))) {
  mat[i, ] <- i # same result: the single value is recycled along the row
}

mat

# using a for loop create a vector of length 10 whose i-th component equals the previous component^2 (first component is 2)

vec <- rep(2, 10)

for (i in seq_along(vec)[-1]) { # positions 2, 3, ..., length(vec)
  vec[i] <- vec[i - 1]^2
}

vec
# The same recursion without an explicit loop.
# NOTE(review): the original comment announced "R's built-in function for that
# purpose" but no code followed; Reduce() with accumulate = TRUE is one
# built-in that produces the same ten values - confirm this is what was meant.
Reduce(function(prev, step) prev^2, x = seq_len(9), init = 2, accumulate = TRUE)

rm(mat, i, vec, y)

# while

# use while to create a vector with random numbers from N(0, 1) till the sum of its absolute values exceeds a particular threshold

vec <- vector("numeric") # numeric vector of length zero
sum(vec) # the sum of an empty vector is 0, so the loop below starts

while (sum(abs(vec)) < 10) { # the loop stops as soon as this condition is FALSE

  vec <- c(vec, rnorm(1, 0, 1)) # append one more draw from N(0, 1)
}
vec


# find how many consecutive natural numbers are needed in order for their sum to exceed 100

SUM <- 0
i <- 0

while (SUM <= 100) { # the process stops when the condition is FALSE; be careful: if it never becomes FALSE, the process never stops

  i <- i + 1
  SUM <- SUM + i
}
i
SUM

# indeed
sum(1:13)
sum(1:14)

# Clean up what this section created (i, SUM, vec) plus the x left over from
# the factors section. y is not listed: it was already removed after the for
# loops, so naming it here would raise an "object not found" warning.
rm(i, SUM, vec, x)



# if
x <- 2
y <- 5


if (x > 1) {
  y <- 7 # executed if and only if x > 1
}
y


y <- 1:20 # replace each even component of the vector with 0 (use of if)

for (i in seq_along(y)) { # seq_along(y) is 1, 2, ..., length(y) and is safe for empty vectors

  if (y[i] %% 2 == 0) {

    y[i] <- 0 # executed only when the expression inside if is TRUE
  }
}
y

y <- 1:20 # replace each even component of the vector with 0 and each odd component with 1 (use of if and else)

for (i in seq_along(y)) {

  if (y[i] %% 2 == 0) {

    y[i] <- 0
  } else { # else is evaluated when the expression of if is FALSE

    y[i] <- 1
  }
}
y

y <- 1:20 # replace components <= 10 with 1, > 10 & <= 15 with 2 and otherwise with 3 (use of if, else if and else)

for (i in seq_along(y)) {

  if (y[i] <= 10) {

    y[i] <- 1

  } else if (y[i] > 10 && y[i] <= 15) { # && is the single-value "and" meant for if(); & is for whole vectors

    y[i] <- 2

  } else {

    y[i] <- 3
  }

}

# alternative way without if & for: logical indexing on the whole vector
# (this works in sequence only because the new codes 1, 2, 3 are all <= 10
# and therefore never match the later conditions)
y <- 1:20
y[y <= 10] <- 1

y[y > 10 & y <= 15] <- 2

y[y > 15] <- 3
y


#### Practice ###
# a) Check built-in cumsum() function. Then write your own function that replicates its behavior.
# b) Write a function that accepts a numeric vector and returns another containing the successive differences of the elements of the input
# c) Write a function that takes a vector and returns it in reverse order.
# d) Write a function that accepts a numeric vector and a number (threshold), multiplies the elements of the vector consecutively and stops once the cumulative product exceeds the threshold
# It has to return the product and the position of the vector at the moment the multiplication stopped, or, in case that the threshold is never exceeded, an appropriate message
# e) Write a function that returns the first element in a vector that appears twice consecutively.
#################

#Random numbers, Probability distributions

# Sample from vector without replacement (each component has the same probability)

x <- 1:100

sample(x, size = 10, replace = FALSE) # 10 distinct elements of x

# Sample from a vector WITH replacement (each component has the same probability);
# replacement is required here because 10 values are drawn from only 5

sample(1:5, size = 10, replace = TRUE)

# Sample from a vector with each component having a pre-specified probability of being picked

sample(c("female", "male"), size = 20, prob = c(0.7, 0.3), replace = TRUE)



# pick a random sample of the rows of a dataframe
data <- data.frame(id = 1:20, age = round(rnorm(n = 20, mean = 40, sd = 3)))
data

# draw 10 distinct row numbers, then keep those rows;
# seq_len(nrow(data)) is 1, 2, ..., nrow(data) and stays correct for an empty dataframe
data_sample <- data[sample(seq_len(nrow(data)), size = 10, replace = FALSE), ]
data_sample

#Generate random sample from a probability distribution


rnorm(10, mean = 0, sd = 2) # 10 random draws from N(0, 4): sd = 2 means variance 4
hist(rnorm(500, mean = 0, sd = 2))
rexp(10, rate = 2) # 10 random draws from the exponential distribution with rate 2

# Reproducibility (using set.seed())


# two calls without a seed: the samples are not expected to be identical
rnorm(10, mean = 0, sd = 2)
rnorm(10, mean = 0, sd = 2)


set.seed(10)
rnorm(10, mean = 0, sd = 2)
set.seed(10) # setting the same seed again reproduces the same sample
rnorm(10, mean = 0, sd = 2)

#Probabilities (p-functions)

pnorm(1.96, mean = 0, sd = 1, lower.tail = TRUE) # P(X <= 1.96), X ~ N(0, 1)
pnorm(1.96, mean = 0, sd = 1, lower.tail = FALSE) # P(X > 1.96), X ~ N(0, 1)

pbinom(2, size = 5, prob = 0.6, lower.tail = TRUE) # P(X <= 2), X ~ Bin(5, 0.6)

#Densities (d-functions)

dnorm(2, mean = 0, sd = 3) # density of N(0, 9) evaluated at x = 2 (this is NOT P(X = 2)!)
dpois(2, lambda = 3) # P(X = 2), X ~ Poisson(3) (Poisson is discrete, so this is a probability)

#Quantiles (q-functions)

qnorm(0.025, mean = 0, sd = 1, lower.tail = FALSE) # value with probability 0.025 ABOVE it under N(0, 1)
qnorm(0.975, mean = 0, sd = 1, lower.tail = TRUE) # value with probability 0.975 BELOW it under N(0, 1)


