Code
getwd()Εισαγωγή στην R
Find current working directory:
getwd()
Change current working directory (using /):
setwd("C:/Users/MyDirectory/")
or (using \\):
setwd("C:\\Users\\MyDirectory\\")
Load an already installed library:
library(survival)
Install a new library and then load it:
install.packages("library_of_interest")
library(library_of_interest)
Get details about a function of interest (e.g. about the function seq()):
?seq | seq | R Documentation |
Generate regular sequences. seq is a standard generic with a
default method. seq.int is a primitive which can be
much faster but has a few restrictions. seq_along and
seq_len are very fast primitives for two common cases.
seq(...)
## Default S3 method:
seq(from = 1, to = 1, by = ((to - from)/(length.out - 1)),
length.out = NULL, along.with = NULL, ...)
seq.int(from, to, by, length.out, along.with, ...)
seq_along(along.with)
seq_len(length.out)
seq(0, 1, length.out = 11)
seq(stats::rnorm(20)) # effectively 'along'
seq(1, 9, by = 2) # matches 'end'
seq(1, 9, by = pi) # stays below 'end'
seq(1, 6, by = 3)
seq(1.575, 5.125, by = 0.05)
seq(17) # same as 1:17, or even better seq_len(17)
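If setting the working directory fails, check that the path exists and is written for your operating system (on Windows use / or \\ as the separator, never a single \).
Installing a package does not load it: install it once with install.packages(), then call library() in every new R session before using it.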
2+2
## [1] 4
2-4
## [1] -2
2*4
## [1] 8
2/4
## [1] 0.5
2^4
## [1] 16
2**4 # double * (**) is equivalent to ^
## [1] 16sqrt(16) # square root
## [1] 4
log(1) # natural logarithm
## [1] 0
log10(10) # base 10 logarithm
## [1] 1
exp(1) # exponential function
## [1] 2.718282
sin(pi/2)
## [1] 1
factorial(3)
## [1] 62<3
## [1] TRUE
2>3
## [1] FALSE
2 == 2 # use == to test if two values are equal
## [1] TRUE
2 != 2
## [1] FALSE(2<4) & (3<4) # equals TRUE only if both expressions are TRUE
## [1] TRUE
(2<4) & (3<2)
## [1] FALSE
(2<1) & (3<2)
## [1] FALSE
(2<4) | (3<4) # equals TRUE only if at least one of the two expressions is TRUE
## [1] TRUE
(2<4) | (3<2)
## [1] TRUE
(2<1) | (3<2)
## [1] FALSEx <- 3 # assign the value of 3 in a variable called x, Alt + - is the shortcut for <-
x
## [1] 3
x = 3 # = equivalent with <-
x <- 2+3 # old value of 3 for x does not exist anymore
x
## [1] 5
X <- 3 # R distinguishes uppercase with lowercase
x
## [1] 5
X
## [1] 3
rm(x, X) # rm() removes selected objects from global environmentx <- 4
class(x) # returns the class of x
## [1] "numeric"
is.double(x) # ask if x is double (by default all numerics will be doubles even if they do not contain decimals)
## [1] TRUE
is.integer(x) # ask if x is integer
## [1] FALSE
x <- 4L # use L to explicitly denote to R that it's an integer
class(x)
## [1] "integer"
is.double(x)
## [1] FALSE
is.integer(x)
## [1] TRUEx <- 2>3 # remember what the result of 2>3 is
x
## [1] FALSE
class(x)
## [1] "logical"
is.logical(x)
## [1] TRUE
as.logical(1) # as.`class` functions convert appropriately the input values to the desired class
## [1] TRUE
as.logical(100) # as.logical() will return TRUE for all numeric values except 0.
## [1] TRUE
as.logical(0)
## [1] FALSEx <- "abc"
class(x)
## [1] "character"
is.character(x)
## [1] TRUE
as.character(5)
## [1] "5"
Collections of values of the same type.
c() function:
x <- c(1, 5, 10)
x
## [1] 1 5 10
is.vector(x)
## [1] TRUE
x <- c(1, 5, "a")
x
## [1] "1" "5" "a"seq(), rep() functions for vectors of specific pattern:x <- seq(from = 1, to = 5, by = 1)
x
## [1] 1 2 3 4 5
x <- 1:5 # equivalent to seq(from = 1, to = 5, by = 1)
x
## [1] 1 2 3 4 5
x <- seq(from = 0, to = 10, by = 2)
x
## [1] 0 2 4 6 8 10x <- rep(2, times = 5)
x
## [1] 2 2 2 2 2
x <- rep(c(2, 3), times = 2)
x
## [1] 2 3 2 3
x <- rep(c(2, 3), each = 2)
x
## [1] 2 2 3 3
x <- rep(c(2,3), times = c(4, 5))
x
## [1] 2 2 2 2 3 3 3 3 3length(x)
## [1] 9x <- 1:10
x[5] # return the 5th element
## [1] 5
x[c(2, 6)] # return the 2nd and 6th elements (x[2,6] is wrong)
## [1] 2 6
x[x>3] # return elements of x that are greater than 3
## [1] 4 5 6 7 8 9 10x <- 1:5
y <- 6:10
x+y
## [1] 7 9 11 13 15
x-y
## [1] -5 -5 -5 -5 -5
x*y # just multiplies the respective elements
## [1] 6 14 24 36 50x^2
## [1] 1 4 9 16 25
exp(x) # apply exp() function at each element of x, returns a vector
## [1] 2.718282 7.389056 20.085537 54.598150 148.413159
sum(x) # sum all elements of x, returns scalar
## [1] 15
cumsum(x) # returns a vector where each element is the cumulative sum of the elements of x up to that position
## [1] 1 3 6 10 15
When a function that normally operates on a single value (such as log()) is applied to a vector, it is applied to each element separately and the result is a vector of the same length.
Two-dimensional arrays with elements of the same data type (mainly numeric).
rbind(), cbind() functions:x <- c(1, 2, 3)
y <- c(4, 5, 6)
mat1 <- rbind(x, y)
mat1| x | 1 | 2 | 3 |
| y | 4 | 5 | 6 |
mat2 <- cbind(x, y)
mat2| x | y |
|---|---|
| 1 | 4 |
| 2 | 5 |
| 3 | 6 |
matrix() function:mat3 <- matrix(1:12, nrow = 3, ncol = 4, byrow = TRUE) # using byrow you control whether elements of the vector fill the matrix by row or by columns
mat4 <- matrix(1:12, nrow = 3, ncol = 4, byrow = FALSE)
mat3| 1 | 2 | 3 | 4 |
| 5 | 6 | 7 | 8 |
| 9 | 10 | 11 | 12 |
mat4| 1 | 4 | 7 | 10 |
| 2 | 5 | 8 | 11 |
| 3 | 6 | 9 | 12 |
dim(), nrow(), ncol() functions:dim(mat1)
## [1] 2 3
nrow(mat1)
## [1] 2
ncol(mat1)
## [1] 3mat1[1,]
## [1] 1 2 3
mat1[, 2]
## x y
## 2 5
mat1[2, 3]
## y
## 6mat1 <- rbind(c(1, 2), c(3, 4))
mat2 <- rbind(c(1, 2), c(3, 4))
mat1+mat2| 2 | 4 |
| 6 | 8 |
mat1*mat2 # * with matrices returns a matrix where each element is the result of multiplication of the corresponding elements of the initial matrices (element-wise multiplication/Hadamard product)| 1 | 4 |
| 9 | 16 |
mat1 %*% mat2 # matrix multiplication| 7 | 10 |
| 15 | 22 |
Two-dimensional table-like structures where each column contains values of a single variable, and each row corresponds to a specific observation.
You can think of them as matrices, but with the flexibility to store different data types in different columns.
# Build three equal-length vectors of different types (integer, character,
# logical) and combine them column-wise into a data frame. The column names
# are taken from the names of the vectors.
a <- 1:5
b <- c("Mon", "Tue", "Mon", "Wed", "Wed")
# Spell out TRUE/FALSE: T and F are ordinary variables that can be reassigned.
c <- c(TRUE, TRUE, FALSE, TRUE, FALSE)
df <- data.frame(a, b, c)
df| a | b | c |
|---|---|---|
| 1 | Mon | TRUE |
| 2 | Tue | TRUE |
| 3 | Mon | FALSE |
| 4 | Wed | TRUE |
| 5 | Wed | FALSE |
names(df) # names of the columns, they come from the names of the respective vectors
## [1] "a" "b" "c"
names(df) <- c("id", "Var1", "Var2") # we can change them
df| id | Var1 | Var2 |
|---|---|---|
| 1 | Mon | TRUE |
| 2 | Tue | TRUE |
| 3 | Mon | FALSE |
| 4 | Wed | TRUE |
| 5 | Wed | FALSE |
df[1, ]| id | Var1 | Var2 |
|---|---|---|
| 1 | Mon | TRUE |
df[, 2]
## [1] "Mon" "Tue" "Mon" "Wed" "Wed"df$id
## [1] 1 2 3 4 5df[df$id != 1,] # selecting subset of dataframe based on expression defined by some of its variables| id | Var1 | Var2 | |
|---|---|---|---|
| 2 | 2 | Tue | TRUE |
| 3 | 3 | Mon | FALSE |
| 4 | 4 | Wed | TRUE |
| 5 | 5 | Wed | FALSE |
df[df$id != 1,c("Var1", "Var2")] # selecting only particular variables from it| Var1 | Var2 | |
|---|---|---|
| 2 | Tue | TRUE |
| 3 | Mon | FALSE |
| 4 | Wed | TRUE |
| 5 | Wed | FALSE |
Usually, it’s the most appropriate class to represent a categorical variable.
Example: Consider x to be a binary variable (e.g. gender); with 0 representing the first category, 1 the other.
x <- rep(c(0, 1), times = 5)
x
## [1] 0 1 0 1 0 1 0 1 0 1x <- factor(x, levels = c(0, 1), labels = c("male", "female")) # convert x to factor
x
## [1] male female male female male female male female male female
## Levels: male female
class(x)
## [1] "factor"
levels(x)
## [1] "male" "female"x=="male"
## [1] TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE FALSEx==0 # zero no longer exists
## [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSEy <- relevel(x, ref = "female")
y
## [1] male female male female male female male female male female
## Levels: female male
levels(y)
## [1] "female" "male"Always take note of the order in which the levels of a factor are stored.
getwd(), setwd() functions.
data <- read.table("R\\heart.csv", header = TRUE, sep = ",") # sep = "," because it's a csv; header = TRUE to read the first line as the names of the variables
str(data) # quick description of a dataframe: names and classes of its variables
## 'data.frame': 303 obs. of 7 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : int 63 67 67 37 41 56 62 57 63 53 ...
## $ Sex : int 1 1 1 1 0 1 0 0 1 1 ...
## $ ChestPain: chr "typical" "asymptomatic" "asymptomatic" "nonanginal" ...
## $ RestBP : int 145 160 120 130 130 120 140 120 130 140 ...
## $ Chol : int 233 286 229 250 204 236 268 354 254 203 ...
## $ AHD : chr "No" "Yes" "Yes" "No" ...summary(data) # quick summary statistics| X | Age | Sex | ChestPain | RestBP | Chol | AHD | |
|---|---|---|---|---|---|---|---|
| Min. : 1.0 | Min. :29.00 | Min. :0.0000 | Length:303 | Min. : 94.0 | Min. :126.0 | Length:303 | |
| 1st Qu.: 76.5 | 1st Qu.:48.00 | 1st Qu.:0.0000 | Class :character | 1st Qu.:120.0 | 1st Qu.:211.0 | Class :character | |
| Median :152.0 | Median :56.00 | Median :1.0000 | Mode :character | Median :130.0 | Median :241.0 | Mode :character | |
| Mean :152.0 | Mean :54.44 | Mean :0.6799 | NA | Mean :131.7 | Mean :246.7 | NA | |
| 3rd Qu.:227.5 | 3rd Qu.:61.00 | 3rd Qu.:1.0000 | NA | 3rd Qu.:140.0 | 3rd Qu.:275.0 | NA | |
| Max. :303.0 | Max. :77.00 | Max. :1.0000 | NA | Max. :200.0 | Max. :564.0 | NA |
data <- data[,-1] # removing first columnunique(data$AHD)
## [1] "No" "Yes"data$AHD <- factor(data$AHD, levels = c("No", "Yes"))
levels(data$AHD)
## [1] "No" "Yes"unique(data$ChestPain)
## [1] "typical" "asymptomatic" "nonanginal" "nontypical"
data$ChestPain <- factor(data$ChestPain)unique(data$Sex)
## [1] 1 0
data$Sex <- factor(data$Sex, levels = c(0, 1), labels = c("female", "male"))summary(data) # meaningful summary statistics for the categorical ones after the conversion to factor| Age | Sex | ChestPain | RestBP | Chol | AHD | |
|---|---|---|---|---|---|---|
| Min. :29.00 | female: 97 | asymptomatic:144 | Min. : 94.0 | Min. :126.0 | No :164 | |
| 1st Qu.:48.00 | male :206 | nonanginal : 86 | 1st Qu.:120.0 | 1st Qu.:211.0 | Yes:139 | |
| Median :56.00 | NA | nontypical : 50 | Median :130.0 | Median :241.0 | NA | |
| Mean :54.44 | NA | typical : 23 | Mean :131.7 | Mean :246.7 | NA | |
| 3rd Qu.:61.00 | NA | NA | 3rd Qu.:140.0 | 3rd Qu.:275.0 | NA | |
| Max. :77.00 | NA | NA | Max. :200.0 | Max. :564.0 | NA |
mean(), median(), table(), etc.)mean(data$Age)
## [1] 54.43894
median(data$Age)
## [1] 56
sd(data$Age)
## [1] 9.038662
min(data$Age)
## [1] 29
max(data$Age)
## [1] 77
table(data$ChestPain)| asymptomatic | nonanginal | nontypical | typical |
|---|---|---|---|
| 144 | 86 | 50 | 23 |
prop.table(table(data$ChestPain)) # get percentages instead of counts| asymptomatic | nonanginal | nontypical | typical |
|---|---|---|---|
| 0.4752475 | 0.2838284 | 0.1650165 | 0.0759076 |
mean(data$Age[data$Sex == "female"])
## [1] 55.72165
mean(data$Age[data$Sex == "female" & data$AHD == "No"])
## [1] 54.55556sort(data$Age, decreasing = FALSE)
## [1] 29 34 34 35 35 35 35 37 37 38 38 39 39 39 39 40 40 40 41 41 41 41 41 41 41
## [26] 41 41 41 42 42 42 42 42 42 42 42 43 43 43 43 43 43 43 43 44 44 44 44 44 44
## [51] 44 44 44 44 44 45 45 45 45 45 45 45 45 46 46 46 46 46 46 46 47 47 47 47 47
## [76] 48 48 48 48 48 48 48 49 49 49 49 49 50 50 50 50 50 50 50 51 51 51 51 51 51
## [101] 51 51 51 51 51 51 52 52 52 52 52 52 52 52 52 52 52 52 52 53 53 53 53 53 53
## [126] 53 53 54 54 54 54 54 54 54 54 54 54 54 54 54 54 54 54 55 55 55 55 55 55 55
## [151] 55 56 56 56 56 56 56 56 56 56 56 56 57 57 57 57 57 57 57 57 57 57 57 57 57
## [176] 57 57 57 57 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 59 59
## [201] 59 59 59 59 59 59 59 59 59 59 59 59 60 60 60 60 60 60 60 60 60 60 60 60 61
## [226] 61 61 61 61 61 61 61 62 62 62 62 62 62 62 62 62 62 62 63 63 63 63 63 63 63
## [251] 63 63 64 64 64 64 64 64 64 64 64 64 65 65 65 65 65 65 65 65 66 66 66 66 66
## [276] 66 66 67 67 67 67 67 67 67 67 67 68 68 68 68 69 69 69 70 70 70 70 71 71 71
## [301] 74 76 77
sort(data$Age, decreasing = TRUE)
## [1] 77 76 74 71 71 71 70 70 70 70 69 69 69 68 68 68 68 67 67 67 67 67 67 67 67
## [26] 67 66 66 66 66 66 66 66 65 65 65 65 65 65 65 65 64 64 64 64 64 64 64 64 64
## [51] 64 63 63 63 63 63 63 63 63 63 62 62 62 62 62 62 62 62 62 62 62 61 61 61 61
## [76] 61 61 61 61 60 60 60 60 60 60 60 60 60 60 60 60 59 59 59 59 59 59 59 59 59
## [101] 59 59 59 59 59 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 58 57
## [126] 57 57 57 57 57 57 57 57 57 57 57 57 57 57 57 57 56 56 56 56 56 56 56 56 56
## [151] 56 56 55 55 55 55 55 55 55 55 54 54 54 54 54 54 54 54 54 54 54 54 54 54 54
## [176] 54 53 53 53 53 53 53 53 53 52 52 52 52 52 52 52 52 52 52 52 52 52 51 51 51
## [201] 51 51 51 51 51 51 51 51 51 50 50 50 50 50 50 50 49 49 49 49 49 48 48 48 48
## [226] 48 48 48 47 47 47 47 47 46 46 46 46 46 46 46 45 45 45 45 45 45 45 45 44 44
## [251] 44 44 44 44 44 44 44 44 44 43 43 43 43 43 43 43 43 42 42 42 42 42 42 42 42
## [276] 41 41 41 41 41 41 41 41 41 41 40 40 40 39 39 39 39 38 38 37 37 35 35 35 35
## [301] 34 34 29
The following command will not do the job: it sorts only the Age variable and leaves the other columns unchanged, so the ages no longer match their rows.
data$Age <- sort(data$Age) # not the way to sort the dataframe by age
We want the rows of the dataset to be arranged according to the (increasing/decreasing) ordering of Age. The order() function returns the indices that sort a vector: the first index is the position of the smallest element, the second the position of the next smallest, and so on, so that x[order(x)] equals sort(x).
#e.g.
order(c(2, 7, 6, 1))
## [1] 4 1 3 2data <- data[order(data$Age),]
head(data)| Age | Sex | ChestPain | RestBP | Chol | AHD | |
|---|---|---|---|---|---|---|
| 133 | 29 | male | nontypical | 130 | 204 | No |
| 102 | 34 | male | typical | 118 | 182 | No |
| 226 | 34 | female | nontypical | 118 | 210 | No |
| 118 | 35 | female | asymptomatic | 138 | 183 | No |
| 139 | 35 | male | asymptomatic | 120 | 198 | Yes |
| 169 | 35 | male | asymptomatic | 126 | 282 | Yes |
data$age_categ1 <- 1 # Initialize a variable
data$age_categ1[data$Age > 45] <- 2 # change values according to condition
data$age_categ1 <- factor(data$age_categ1, levels = c(1, 2), labels = c("<= 45", "> 45"))
table(data$age_categ1)| <= 45 | > 45 |
|---|---|
| 63 | 240 |
ifelse() functiondata$age_categ2 <- ifelse(data$Age <= 45, yes = 1, no = 2)
data$age_categ2 <- factor(data$age_categ2, levels = c(1, 2), labels = c("<= 45", "> 45"))
table(data$age_categ2)| <= 45 | > 45 |
|---|---|
| 63 | 240 |
# assign missing values for demonstration
data$Age[c(2, 4, 6)] <- NA
data$Sex[c(3, 8, 9)] <- NA
head(is.na(data$Age))
## [1] FALSE TRUE FALSE TRUE FALSE TRUE
sum(is.na(data$Age)) # count missing values of Age
## [1] 3
summary(data)| Age | Sex | ChestPain | RestBP | Chol | AHD | age_categ1 | age_categ2 | |
|---|---|---|---|---|---|---|---|---|
| Min. :29.00 | female: 95 | asymptomatic:144 | Min. : 94.0 | Min. :126.0 | No :164 | <= 45: 63 | <= 45: 63 | |
| 1st Qu.:48.00 | male :205 | nonanginal : 86 | 1st Qu.:120.0 | 1st Qu.:211.0 | Yes:139 | > 45 :240 | > 45 :240 | |
| Median :56.00 | NA’s : 3 | nontypical : 50 | Median :130.0 | Median :241.0 | NA | NA | NA | |
| Mean :54.64 | NA | typical : 23 | Mean :131.7 | Mean :246.7 | NA | NA | NA | |
| 3rd Qu.:61.00 | NA | NA | 3rd Qu.:140.0 | 3rd Qu.:275.0 | NA | NA | NA | |
| Max. :77.00 | NA | NA | Max. :200.0 | Max. :564.0 | NA | NA | NA | |
| NA’s :3 | NA | NA | NA | NA | NA | NA | NA |
mean(data$Age)
## [1] NA
mean(data$Age, na.rm = TRUE) # calculate the mean after removing the missing values
## [1] 54.63667
table(data$Sex)| female | male |
|---|---|
| 95 | 205 |
table(data$Sex, useNA = "ifany") # display missing values| female | male | NA |
|---|---|---|
| 95 | 205 | 3 |
When inspecting data, always check for the existence of missing values.
load("R\\bone.Rdata")
summary(bone)| idnum | age | gender | spnbmd | |
|---|---|---|---|---|
| Min. : 1.0 | Min. : 9.40 | female:259 | Min. :-0.064103 | |
| 1st Qu.: 60.0 | 1st Qu.:12.70 | male :226 | 1st Qu.: 0.005858 | |
| Median :124.0 | Median :15.40 | NA | Median : 0.026591 | |
| Mean :151.5 | Mean :16.10 | NA | Mean : 0.039252 | |
| 3rd Qu.:240.0 | 3rd Qu.:19.15 | NA | 3rd Qu.: 0.064127 | |
| Max. :384.0 | Max. :25.55 | NA | Max. : 0.219913 |
hist(bone$age) # histogram of Age variable
hist(bone$age, xlab = "Age", main = "Histogram of Age", col = "steelblue")
# Split the ages by gender and draw the two histograms side by side.
Age_males <- bone$age[bone$gender == "male"] # vector of Age of males of our sample
Age_females <- bone$age[bone$gender == "female"] # vector of Age of females of our sample
# par() returns the previous settings, which are kept so the layout can be restored.
old_par <- par(mfrow = c(1, 2)) # preparing the plot environment to display two plots side by side
hist(Age_males, xlab = "Age", main = "Males", col = "red")
hist(Age_females, xlab = "Age", main = "Females", col = "green")
par(old_par) # restore the previous layout so the following plots are not drawn into the 1 x 2 grid
boxplot(bone$age)
boxplot(bone$age, col = "orange", ylab = "Age")
boxplot(bone$age, col = "orange", xlab = "Age", horizontal = TRUE)
levels(bone$gender) # check the order of levels in order to pass them respectively to names argument
## [1] "female" "male"
boxplot(bone$age ~ bone$gender, col = c("gold", "blue"), ylab = "Age", xlab = "Gender", names = c("Females", "Males")) # boxplot of Age by gender
boxplot(bone$age ~ bone$gender, col = c("gold", "blue"), xlab = "Age", ylab = "Gender", names = c("Females", "Males"), horizontal = TRUE) # horizontal
plot(x = bone$spnbmd, y = bone$age)
plot(x = bone$spnbmd, y = bone$age, xlab = "SPNBMD", ylab = "AGE", col = "red")
plot(x = bone$spnbmd, y = bone$age, xlab = "SPNBMD", ylab = "AGE", col = "red", pch = 19) # pch controls the type of points (type ?points to see all the options)
plot(x = bone$spnbmd, y = bone$age, xlab = "SPNBMD", ylab = "AGE", col = "blue4", pch = 19, cex = 0.7) # cex controls the size of points
plot(x = bone$spnbmd, y = bone$age, xlab = "SPNBMD", ylab = "AGE", col = "blue4", pch = 19, cex = 0.6, main = "A plot", cex.main = 1.3) # with title (of adjusted size)
plot(x = bone$spnbmd, y = bone$age, xlab = "SPNBMD", ylab = "AGE", col = "blue4", pch = 19, cex = 0.6, main = "A plot", cex.main = 1.3, cex.lab = 0.8) # adjusting size of axes text
plot(x = bone$spnbmd, y = bone$age, xlab = "SPNBMD", ylab = "AGE", col = "blue4", pch = 19, cex = 0.6, main = "A plot", cex.main = 1.3, cex.axis = 0.8) # adjusting size of axes ticks
plot(bone$age ~ bone$spnbmd, xlab = "SPNBMD", ylab = "AGE", col = "blue", bg = "red3",pch = 21, cex = 1.2) # pch = 21 type of points can have colour on the contour and on the interior, bg (background color) can be used when type of points allows to use color to fill the shape
bonefemale <- bone[bone$gender == "female",] # dataframe only for females
bonemale <- bone[bone$gender == "male",] # dataframe only for males
plot(spnbmd~age, data=bone, type="n") # empty plot (type="n")
points(spnbmd~age, data=bonefemale, pch = 19, col="red", cex = 0.7) # adding points for females
points(spnbmd~age, data=bonemale, pch = 19, col="blue", cex = 0.7)# adding points for males
legend(x = "topright", c("females", "males"), col = c("red", "blue"), pch = 19) # adding legend
plot(spnbmd~age, data=bone, type="n")
points(spnbmd~age, data=bonefemale, pch = 19, col="red", cex = 0.7)
points(spnbmd~age, data=bonemale, pch = 19, col="blue", cex = 0.7)
legend(x = 23, y = 0.2, c("females", "males"), col = c("red", "blue"), pch = 19) # placing legends based on coordinates
# different color and type of point for females and males
plot(spnbmd~age, data=bone, type="n")
points(spnbmd~age, data=bonefemale, pch = 15, col="red", cex = 0.7)
points(spnbmd~age, data=bonemale, pch = 19, col="blue", cex = 0.7)
legend(x = 23, y = 0.2, c("females", "males"), col = c("red", "blue"), pch = c(15, 19)) 
# another way to provide different colours according to third variable (without having to create separate datasets)
# palette() returns the palette that was in effect, which is kept so it can be restored afterwards.
old_palette <- palette(c("blue", "green")) # customizing your own colors to fill
# col is set to the factor that defines the colors: its integer codes index the palette
plot(bone$age, bone$spnbmd, pch = 19, col = bone$gender, cex = 0.8)
legend("topright", levels(bone$gender), pch = 19, col = seq_len(nlevels(bone$gender)))
palette(old_palette) # restore the previous palette so later numeric colours are not affected
# same idea as above but now place the plots side by side
old_par <- par(mfrow = c(1, 2)) # 1 x 2 plotting grid; keep the previous settings
plot(spnbmd ~ age, data = bonefemale, pch = 19, col = "red", cex = 0.7, main = "Females") # first plot
plot(spnbmd ~ age, data = bonemale, pch = 19, col = "blue", cex = 0.7, main = "Males") # second plot
par(old_par) # restore the previous layout for any plots that follow
# Evaluate the linear expression 3 * x + 2.
#
# x: the argument of the function (a number or a numeric vector).
# Returns 3 * x + 2.
my_func1 <- function(x) {
  slope <- 3
  intercept <- 2
  slope * x + intercept # the value of the last expression is what the function returns
}
my_func1(2)
## [1] 8
# Square root of the sum of squares of two numbers.
#
# x, y: numeric values (a function can have multiple arguments).
# Returns sqrt(x^2 + y^2).
my_func2 <- function(x, y) {
  sqrt(x^2 + y^2)
}
my_func2(4, 5)
## [1] 6.403124
# Compute sqrt(x^2 + y^2) and report whether it is a whole number.
#
# x, y: numeric values.
# Returns an unnamed list of two elements: a logical that is TRUE when the
# square root has no fractional part, and the square root itself. Several
# values can only be returned together by wrapping them in a list.
my_func3 <- function(x, y) {
  root <- sqrt(x^2 + y^2)
  # %% returns the remainder of a division; a remainder of 0 modulo 1 means a whole number
  list((root %% 1) == 0, root)
}
my_func3(3, 4)
## [[1]]
## [1] TRUE
##
## [[2]]
## [1] 5
my_func3(3, 9)
## [[1]]
## [1] FALSE
##
## [[2]]
## [1] 9.486833
# Fill a preallocated vector element by element with 3 * i^2 - 1.
y <- numeric(10) # initializing a vector of ten zeros
for (i in seq_along(y)) { # i goes from 1 to 10
  y[i] <- 3 * i^2 - 1
}
y
## [1] 2 11 26 47 74 107 146 191 242 299
x <- 1:10
3*x^2-1 # equivalent and faster
## [1] 2 11 26 47 74 107 146 191 242 299# using a for loop create a 3x3 matrix, with 1s in the first row, 2s in the second row,...
mat <- matrix(0, nrow = 3, ncol = 3)
mat | 0 | 0 | 0 |
| 0 | 0 | 0 |
| 0 | 0 | 0 |
for (i in 1:3){
mat[i,] <- rep(i, times = ncol(mat))
}
mat| 1 | 1 | 1 |
| 2 | 2 | 2 |
| 3 | 3 | 3 |
# using a for loop create a vector of length 5 whose i-th component equals to the previous component^2 (first component is 2)
vec <- rep(2, 5)
for (i in 2:length(vec)){
vec[i] <- vec[i-1]^2
}
vec
## [1] 2 4 16 256 65536# use while to create a vector with random numbers from N(0, 1) till the sum of its absolute values exceeds a particular threshold
vec <- vector("numeric") # vector of zero length
sum(vec)
## [1] 0
while(sum(abs(vec)) < 10){ # when this condition is not true, loop stops
vec <- c(vec, rnorm(1, 0, 1))
}
vec
## [1] 0.38238480 -0.72387134 -1.30788257 0.03003481 -0.13337262 0.66771329
## [7] -0.95807299 -0.84425747 -0.45168428 -1.51895371 0.69116709 1.39426467
## [13] -0.98404690# find how many consecutive natural numbers are needed in order for their sum to exceed 100
SUM <- 0
i <- 0
while (SUM <= 100){
i <- i + 1
SUM <- SUM + i
}
i
## [1] 14
SUM
## [1] 105
# indeed
sum(1:13)
## [1] 91
sum(1:14)
## [1] 105
When using while loops, always ensure that the loop will terminate after a finite number of iterations.
# if
x <- 2
y <- 5
if (x > 1){
y <- 7 # executed if and only if x > 1
}
y
## [1] 7
# replace each even component of the vector with 0 (use of if)
y <- 1:20
for (i in seq_along(y)) { # seq_along() is safe even for an empty vector, unlike 1:length(y)
  if (y[i] %% 2 == 0) {
    y[i] <- 0
  }
}
y
## [1] 1 0 3 0 5 0 7 0 9 0 11 0 13 0 15 0 17 0 19 0
# replace each even component of the vector with 0 and each odd component with 1 (use of if and else)
y <- 1:20
for (i in seq_along(y)) { # seq_along() is safe even for an empty vector, unlike 1:length(y)
  if (y[i] %% 2 == 0) {
    y[i] <- 0
  } else { # else is evaluated when expression of if is FALSE
    y[i] <- 1
  }
}
y
## [1] 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0 1 0
# replace components <= 10 with 1, >10 & <= 15 with 2 and otherwise with 3 (use of if, else if and else)
y <- 1:20
for (i in seq_along(y)) { # seq_along() is safe even for an empty vector, unlike 1:length(y)
  if (y[i] <= 10) {
    y[i] <- 1
  } else if (y[i] > 10 && y[i] <= 15) { # && is the scalar, short-circuit AND meant for if conditions
    y[i] <- 2
  } else {
    y[i] <- 3
  }
}
y
## [1] 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3
# of course, you do not need for, if, etc. to do it!
y <- 1:20
y[y<=10] <- 1
y[y > 10 & y <= 15] <- 2
y[y>15] <- 3
y
## [1] 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 3 3 3 3 3
# Natural logarithm that refuses non-positive input.
#
# x: a single numeric value.
# Returns log(x) when x > 0; otherwise stops with the error
# "Input is not positive".
nlog <- function(x) {
  if (!(x > 0)) {
    stop("Input is not positive")
  }
  log(x)
}
nlog(-1)
## Error in nlog(-1): Input is not positive
nlog(1)
## [1] 0
# Sum of the even natural numbers that do not exceed floor(x).
#
# x: a single positive number; it is rounded down with floor() first.
# Returns the sum 2 + 4 + ... up to floor(x) (0 when floor(x) < 2).
# Stops with "Input is not positive" when x <= 0.
sum_even <- function(x) {
  if (x <= 0) {
    stop("Input is not positive")
  }
  upper <- floor(x)
  # seq_len() gives an empty vector when there is no even number to add,
  # whereas 1:upper would count backwards (1, 0) for upper == 0.
  evens <- 2 * seq_len(upper %/% 2)
  sum(evens)
}
sum_even(10)
## [1] 30
sum_even(10.3)
## [1] 30
sum_even(-2)
## Error in sum_even(-2): Input is not positive
# Collatz sequence starting from n.
#
# n: a single positive whole number.
# Returns the vector n, f(n), f(f(n)), ... up to and including the first 1,
# where f(k) = k / 2 for even k and 3 * k + 1 for odd k.
# Stops with an error when n is not a positive whole number.
collatz <- function(n) {
  # Without this guard the loop below never terminates for n <= 0
  # (0 maps to 0, -1 and -2 map to each other) and diverges for non-integer n.
  if (length(n) != 1 || is.na(n) || n < 1 || n != floor(n)) {
    stop("Input must be a positive integer")
  }
  next_term <- function(k) {
    if (k %% 2 == 0) {
      k / 2
    } else {
      3 * k + 1
    }
  }
  seqn <- n
  current <- n
  # The final length is not known in advance, so the vector is extended term by term.
  while (current != 1) {
    current <- next_term(current)
    seqn <- c(seqn, current)
  }
  seqn
}
collatz(12)
## [1] 12 6 3 10 5 16 8 4 2 1
# findInterval() function can be helpful.
# Draw a random sample from a mixture of normal distributions.
#
# p: vector of mixture weights; they must sum to 1 (up to floating-point
#    tolerance). Component j is chosen with probability p[j].
# n: number of observations to draw.
# Returns a numeric vector of length n; an observation from component j is
# drawn from a normal distribution with mean 8 * j and standard deviation 2.
# Stops with "Weights do not sum up to 1" when the weights are invalid.
mixt_Normals <- function(p, n) {
  # Compare with a tolerance: decimal weights need not add up to exactly 1
  # in floating-point arithmetic, so an exact != test can reject valid input.
  if (!isTRUE(all.equal(sum(p), 1))) {
    stop("Weights do not sum up to 1")
  }
  sample_to_return <- numeric(n)
  P <- c(0, cumsum(p)) # cumulative weights: break points of the components on [0, 1]
  for (i in seq_len(n)) { # seq_len() does nothing for n = 0, unlike 1:n
    u <- runif(1)
    # Index of the interval of P that contains u; all.inside = TRUE keeps it
    # within 1..length(p) even if the last break point is marginally below 1.
    j <- findInterval(u, P, all.inside = TRUE)
    sample_to_return[i] <- rnorm(1, mean = 8 * j, sd = 2)
  }
  sample_to_return
}
mixt_Normals(c(0.2, 0.5, 0.2), 1000)
## Error in mixt_Normals(c(0.2, 0.5, 0.2), 1000): Weights do not sum up to 1
SAMPLE <- mixt_Normals(c(0.2, 0.5, 0.3), 1000)
hist(SAMPLE)
x <- 1:100
sample(x, size = 10, replace = FALSE)
## [1] 14 67 5 32 1 54 53 28 57 59sample(1:5, size = 10, replace = TRUE)
## [1] 3 3 1 2 2 3 2 3 5 2sample(c("female", "male"), size = 20, prob = c(0.7, 0.3), replace = TRUE)
## [1] "female" "female" "male" "male" "female" "female" "female" "male"
## [9] "female" "female" "male" "female" "male" "female" "female" "female"
## [17] "female" "female" "female" "female"# pick a random sample of your dataframe
data <- data.frame(id = 1:20, age = round(rnorm(n = 20, 40, 3)))
data| id | age |
|---|---|
| 1 | 44 |
| 2 | 40 |
| 3 | 39 |
| 4 | 37 |
| 5 | 40 |
| 6 | 41 |
| 7 | 40 |
| 8 | 42 |
| 9 | 43 |
| 10 | 36 |
| 11 | 40 |
| 12 | 39 |
| 13 | 38 |
| 14 | 36 |
| 15 | 40 |
| 16 | 36 |
| 17 | 45 |
| 18 | 37 |
| 19 | 42 |
| 20 | 43 |
data_sample <- data[sample(1:nrow(data), size = 10, replace = FALSE),]
data_sample| id | age | |
|---|---|---|
| 14 | 14 | 36 |
| 2 | 2 | 40 |
| 5 | 5 | 40 |
| 6 | 6 | 41 |
| 3 | 3 | 39 |
| 4 | 4 | 37 |
| 11 | 11 | 40 |
| 10 | 10 | 36 |
| 18 | 18 | 37 |
| 1 | 1 | 44 |
rnorm(n = 10, mean = 0, sd = 2) # random sample of 10 obs. from N(0, 4)
## [1] 0.5516024 -0.1228111 1.7801494 -4.0597208 1.4345765 3.1640528
## [7] 1.8173622 -4.2852295 0.6825142 -0.3183423
hist(rnorm(n = 500, mean = 0, sd = 2))
rexp(n = 10, rate = 2) # random sample of 10 obs. from exp(2)
## [1] 0.4368936 1.8802163 0.3642066 0.7559150 0.1848147 0.4300541 0.2026118
## [8] 0.1585136 0.4865363 0.2780168
Reproducibility of random samples (see set.seed()):
# they are not expected to produce identical samples
rnorm(n = 10, mean = 0, sd = 2)
## [1] -1.0200077 -0.4378028 -0.1407837 -0.3930933 -3.0438833 -0.5243348
## [7] -2.8118697 0.8141556 2.6820943 -2.5982150
rnorm(n = 10, mean = 0, sd = 2)
## [1] 1.3256739 -0.8990232 0.8228794 -0.9710408 -2.7398450 3.7158174
## [7] -2.6080774 -1.3429773 -0.2303607 -2.5862817set.seed(10)
rnorm(n = 10, mean = 0, sd = 2)
## [1] 0.03749234 -0.36850508 -2.74266110 -1.19833543 0.58909025 0.77958860
## [7] -2.41615235 -0.72735203 -3.25334536 -0.51295679
set.seed(10) # use same seed to get identical results
rnorm(n = 10, mean = 0, sd = 2)
## [1] 0.03749234 -0.36850508 -2.74266110 -1.19833543 0.58909025 0.77958860
## [7] -2.41615235 -0.72735203 -3.25334536 -0.51295679pnorm(q = 1.96, 0, 1, lower.tail = TRUE) # P(X <= 1.96), X~N(0, 1)
## [1] 0.9750021
pnorm(q = 1.96, 0, 1, lower.tail = FALSE) # P(X > 1.96), X~N(0, 1)
## [1] 0.0249979
pbinom(q = 2, size = 5, prob = 0.6, lower.tail = TRUE) # P(X <= 2), X ~ Bin(5, 0.6)
## [1] 0.31744dnorm(2, mean = 0, sd = 3) # pdf of N(0, 9) evaluated at x = 2 (Not P(X = 2)!!!)
## [1] 0.1064827
dpois(2, lambda = 3) # P(X = 2), X ~ Poisson(3) (Note that Poisson is discrete)
## [1] 0.2240418qnorm(0.025, mean = 0, sd = 1, lower.tail = FALSE) # 0.025-upper quantile of a N(0, 1)
## [1] 1.959964
qnorm(0.975, mean = 0, sd = 1, lower.tail = TRUE) # 0.975-lower quantile of a N(0, 1)
## [1] 1.959964