# Basic types ----
# Introductory tour of R vectors: creation, type conversion, arithmetic,
# comparisons, sequences and recycling. One statement per line so that the
# trailing comments no longer swallow the code that follows them.
numeric(5) # vector with 5 zeros
logical(7)
x <- c(1.2, 1.3, 1.5)
x
is.numeric(x) # gives a TRUE or FALSE value
as.numeric(x)
as.character(x) # converts x to a character vector
y <- 1:10 # vector from 1 to 10 with step 1
as.logical(-1:1) # zero is FALSE and every other number is TRUE
mode(x) # shows the type of the object
x1 <- 3
x2 <- x1 < 3
x2 # this gives FALSE
x1 <- "stelios" # character

# define a complex number
x <- complex(real = 5, imaginary = 6)
x
typeof(x) # similar to mode() but more specific about the storage type
y <- 1.5:4 # increases by 1 at each step
y
# NA means missing data; NaN means "not a number" and usually arises in
# arithmetic calculations. NaN implies NA but not the inverse.

# Calculations ----
x <- 3 * sqrt(5) + 5 * cos(1)
x
x <- 3
y <- 5
x * y
x / y
y %% x # remainder of the division
x %/% y # integer (floor) part of the division

# Comparison operators ----
x <- 3
y <- 4
x == y
# NOTE(review): the source read `x(4==5)`, which calls x as a function and
# fails. It looks like `x<y (3<5)>(4==5)` with the text between `<` and `>`
# stripped; reconstructed accordingly - confirm against the original notes.
x < y
(3 < 5) > (4 == 5) # 3 < 5 is TRUE, 4 == 5 is FALSE, and TRUE > FALSE (1 > 0)

# we can define step functions
x <- 3
xnew <- (x < 1) * 10 + (x >= 1) * 5
# more general
x <- -5:10
y <- (x < 1)
y2 <- 1 - y
xnew <- y * 10 + (1 - y) * 5 # we avoid repeating the comparison by storing it
x <- 1:5
y <- -1:3
# NOTE(review): the source read `x5)`; reconstructed the same way as above
# (`x<y (x<4) & (x>5)`) - confirm.
x < y
(x < 4) & (x > 5)
(x < 4) | (x > 5)

# Mathematical functions ----
# binomial coefficient
choose(5, 2)
lchoose(5, 2) # log of choose
help(lchoose) # was misspelled `hepl`
factorial(30)
pi
# rm() deletes something that was previously defined
# to avoid overflow, first take logs and exponentiate at the end

# Vectors ----
# c() stands for "combine"
x <- c(1, 2)
y <- c(1, x)
length(x)
names(x) <- c("banana", "apple") # give a name to each value

# `:` is an abbreviation of seq()
1:10
10:1
-3:-5
-1.3:-5.3
1.1:-2.5
seq(from = 1, to = 10, by = 2)
seq(from = 1, by = 10, length.out = 15)
x <- 1:5
seq(from = 1, by = 10, along.with = x) # use the length of x as the length
rep(x, times = 2) # repeat the vector one more time
rep(x, times = 2, each = 2) # duplicate each element, then the whole vector
rep(x, times = 1:5) # each element is repeated as many times as `times` says

# Operations with vectors: recycling ----
x <- 1:5
y <- rep(1, 3)
x + y # the result has the length of the longer vector (warns: 5 is not a multiple of 3)
# x and y: the shorter vector y is recycled (it starts again from its first element)
x + 2
x / 2
x * 2
y1 <- 2:6
x * y1 # same length here

# Logical calculations ----
height <- c(1.47, 1.50, 1.65, 1.80)
height < 1.5
height < 1.5 | height > 1.7
logical(3)
x <- c(TRUE, FALSE)
is.na(x) # checks for missing values

# Subsets of vectors ----
height[1] # the index inside [] defines the position
height[c(1, 2)]
height[height < 1.5]
height[-c(1, 2)] # skips the first 2 elements

# common arithmetic functions
mean(height)
var(height)
prod(height)

# Sorting and ranking ----
sort(height)
sort(height, decreasing = TRUE)
rev(sort(height)) # reverse of the ascending sort: exactly the same as above
x <- c(1, 3, 2, 5, 8)
rank(x) # number of observations smaller than or equal to each element
y <- c(1, 2, 2, 3, 4, 5, 6) # ties: by default each tie gets the mean of the tied ranks
rank(y)
y1 <- c(9, 2, 4, 3)
order(y1) # positions of the elements in ascending order
any(y1 > 8)
all(y1 < 5)

# Example: greatest common divisor ----
a <- 10
b <- 12
c <- max(a, b) # variable only; calls such as c(1, 2) still reach the function
test <- 1:c
y1 <- a %% test == 0 # which candidates divide a
y2 <- b %% test == 0 # which candidates divide b
y3 <- y1 & y2 # common divisors of a and b
max(test[y3]) # the gcd

# Greatest common divisor of two non-negative whole numbers, by trial division.
#   a, b: non-negative whole numbers.
# Returns the largest value in 1..min(a, b) dividing both; if one argument is
# 0 the other one is returned (gcd(0, b) = b).
# (A gcd function is also available in add-on packages.)
gcd <- function(a, b) {
  smaller <- min(a, b)
  if (smaller == 0) {
    return(max(a, b))
  }
  candidates <- seq_len(smaller)
  is_common <- (a %% candidates == 0) & (b %% candidates == 0)
  # candidates[is_common] keeps the candidates at the TRUE positions
  max(candidates[is_common])
}

# Example: trimmed mean ----
x <- c(51, 46, 58, 30, 41, 43, 28, 29, 28, 50, 61, 39, 48, 32, 45, 69, 34, 48, 49, 44, 59, 68)
# let's say that trim = 0.05 (5 percent)
# first method: drop the y_new smallest and y_new largest sorted values.
# The sorted vector must be stored; a bare sort(x) leaves x unsorted.
x_sorted <- sort(x)
y <- length(x)
y_new <- trunc(y * 0.05)
trim <- mean(x_sorted[(y_new + 1):(y - y_new)])
# with the R function
mean(x, trim = 0.05) # same result
# another way, with rank; ties.method = "first" gives distinct ranks, so tied
# extremes (the two 28s) are not both kept
r <- rank(x, ties.method = "first")
y <- length(x)
y_new <- trunc(y * 0.05)
trim <- mean(x[(r > y_new) & (r <= (y - y_new))])

# Example: median ----
# with the R function
median(x)
# another method, just for practice
med <- NULL
y <- sort(x)
n <- length(x)
artios <- (n %% 2 == 0) # TRUE when n is even
# test the flag itself; `n == artios` compared n with TRUE and was never true
if (artios) {
  med <- (y[n / 2] + y[n / 2 + 1]) / 2
} else {
  # (n + 1) / 2 is a whole number here because n is odd
  med <- y[(n + 1) / 2]
}

# Character vectors ----
# combine 2 words
x <- c("MITSOS")
y <- c("Giannakos")
paste(x, y)
# choose the separator
x <- c("Mpamphs")
y <- c("Giota")
z <- c("love forever")
paste(paste(x, y, sep = "+"), z, sep = "=")
# combine the elements of one vector (needs two elements for collapse to act)
x <- c("apple", "bananas")
paste(x, collapse = ",")
date()
paste("TODAY IS", date())
paste0("A", 1:3)

# Matrices ----
x <- c(1:3, 11:13, 21:23, 31:33)
y <- matrix(x, ncol = 3) # the row index moves faster
y <- matrix(x, nrow = 3) # fills the columns by default, from left to right
y <- matrix(x, nrow = 3, byrow = TRUE) # fills the rows from left to right
z <- matrix(c(1, 2, 3, 4), ncol = 2)
# names can also be given inside matrix() with dimnames =
dimnames(z) <- list(c("row1", "row2"), c("col1", "col2"))
# dimension
dim(z)
# turn a vector into a matrix
x <- 1:10
dim(x) <- c(2, 5)
x
# build a matrix from 2 vectors
x <- 1:5
y <- rep(1, 5)
z <- cbind(x, y) # as columns
rbind(x, y) # as rows
# diagonal matrix
diag(x)
diag(2) # identity matrix of dimension 2
# calculations
z^2
sqrt(x)
z == 5 # logical matrix
# submatrices
x <- matrix(1:12, ncol = 3, byrow = TRUE)
x[1, 3] # (1,3) element
x[1, ] # first row
x[, 3] # third column
x[1, c(1, 3)] # vector
x[c(1, 2), c(2, 3)] # matrix
x[x < 5]
# matrix operations
x <- matrix(1:4, ncol = 2, byrow = TRUE)
y <- matrix(1:4, ncol = 2)
x %*% y
solve(x) # inverse
t(x) # transpose
z <- c(1, 2)
z %*% x # z is treated as a row matrix

# Arrays ----
# a set of (two-dimensional) matrices, i.e. a multidimensional structure
x <- 1:24
y <- array(x, dim = c(3, 4, 2)) # two 3x4 matrices
# subarrays
dim(x) <- c(3, 4, 2)
x
x[1, 2, 2] # the (1,2) element of the second matrix
x[, 2, 2] # all rows of the second column of the second matrix

# Categorical variables ----
# factors for nominal data
x <- c(0, 1, 2)
y <- as.factor(x)
factor(c("0", "1", "2"), levels = c("1", "0", "2")) # specify the levels exactly
ordered(x) # ordinal data

# split the data
x <- rnorm(1000) # 1000 random observations from the normal distribution
# set.seed() can be called first to get the same values each time
x_cut <- cut(x, c(-Inf, -2, -1, 0, 1, 2, Inf)) # bin the continuous values
table(x_cut) # frequency table
# the split function
# split() allows us to split the data into sub-populations
x <- 1:10
y <- c(1, 1, 1, 1, 2, 2, 2, 2, 3, 3)
z <- split(x, y)
mean(z[["1"]])

# Data frames ----
# a 2-dimensional structure like a matrix, but different columns may hold
# different types of elements, so it can represent a data matrix
height <- seq(from = 1.45, to = 1.80, by = 0.05)
id <- seq_along(height)
sex <- c(rep("M", 3), rep("F", 5))
data.frame(id, height, sex) # the id is not necessary here
data <- data.frame(ypsos = height, fylo = sex) # give your own column names
# give names to the rows
data.frame(height, sex, row.names = c("MPAMPHS", "KOSTAS", "NIKOS", "ALEX", "GEORGE", "STEFANOS", "STELIOS", "NIKH"))
# sub-elements
data[1, 2]
data[1, ]
data[, "fylo"] # the column holding sex
data[-1, ] # drop the first row
data$ypsos

# Lists ----
# like an array, but the elements may have different structures
x <- 1:5
y <- list(data = data, x = x) # a data frame and a vector
y$x
y[[1]] # the first element of the list
y[[2]] # the second element of the list
y[[1]][, 2] # the second column of the first element
z <- list(ames = data, vec = x) # give different names

# Import data ----
# R's data editor (interactive: the calls below wait for user input)
x <- data.frame() # empty
x
edit(x) # opens the editor (pop-up window)
# at this step any values typed in are NOT saved
x <- edit(x) # the result of edit() has to be assigned
x # now the data are saved
# for a list
x2 <- list(x1 = 1:10, x2 = matrix(1:4, 2, 2))
x2 <- edit(x2) # we can edit our current list
# another method that reads the data directly
x <- scan(n = 10) # type the values of a vector on the keyboard
x3 <- scan(n = 3, what = character()) # specify the type we want to read
# readline() is a similar function
# comment: braces can be used in R to pass a set of commands as one argument
# menu() can be used to create questionnaires
options <- menu(c("YES", "NO"), title = "Do you like basketball?")
# this one creates a pop-up menu
options <- menu(c("YES", "NO"), title = "Do you like basketball?", graphics = TRUE)
# for multiple selections
select.list(
  c("car", "motorcycle", "bisycle", "TV"),
  title = "Which of the following products do you prefer to own ? ",
  multiple = TRUE,
  graphics = TRUE
)

# Files and directories ----
# show the current working directory
getwd()
# change the working directory (from now on everything is saved there)
setwd("C:/Users/steli/Desktop/R")
getwd()
# we can use paste to create paths (file.path() does the same job)
drive <- "C:"
path1 <- "Users"
path2 <- "steli"
path3 <- "Desktop"
path4 <- "R"
paste(drive, path1, path2, path3, path4, sep = "/")
# browse for a file
file.choose()
# the chosen path can be stored in x
x <- file.choose()
# import data from txt, excel, etc.
x <- scan("scan_example.txt") # scan() returns a vector
# if the file is not in the working directory the full path is needed
# combine scan() with file.choose()
y <- file.choose()
# NOTE(review): length(y) is 1 (y is a single path), so this builds a 1-row matrix
matrix(scan(y), length(y))
# read.table() returns a data.frame instead of a vector
read.table("C:/Users/steli/Desktop/R/A numerical illustration.dat", header = TRUE)
# header = TRUE treats the first row as the column names
# lines can be skipped with skip =
# stringsAsFactors converts character columns to factors

# code can be written in a text file and run with source()
# workspace objects can be saved to a file
x <- 1:4
y <- 2:5
z <- cbind(x, y)
# NOTE(review): this overwrites scan_example.txt with R's save() format
save(x, z, file = "scan_example.txt")
rm(z) # delete
load(file = "scan_example.txt") # then we bring it back
# save.image() saves the whole workspace
# another way of saving things: sink() diverts printed output to a file in
# the working directory (here the results of length and sum)
sink("output_example.txt")
x <- rnorm(100)
length(x)
sum(x)
sink()
# packages for nicer reports (including images):
# R2wd for Word, Sweave for exporting to LaTeX, etc.

# import files from excel
# install only when missing instead of on every run
if (!requireNamespace("gdata", quietly = TRUE)) {
  install.packages("gdata")
}
library(gdata)
# read_xlsx() has no `header` argument; col_names = TRUE uses the first row as names
readxl::read_xlsx("C:/Users/steli/Desktop/developer/пяоцяаллатислос лахглатым.xlsx", sheet = 1, col_names = TRUE)
# CSV type (saved from Excel as MS-DOS csv)
read.csv2("C:/Users/steli/Desktop/developer/EXAMPLE.csv", header = TRUE)
# from SPSS
library(foreign)
read.spss("C:/Users/steli/Downloads/ATTICA Study 10 yr FU NEO !.sav", to.data.frame = TRUE)
# similarly from SAS with read.xport("path")

# Histograms ----
# Base-graphics tour: histograms, boxplots, scatter plots and line plots.
# random sample with replacement from 0 to 100 with sample size 200
x <- sample(0:100, size = 200, replace = TRUE) # discrete uniform
hist(x) # the default number of intervals is [log n] + 1
hist(x, breaks = 20) # we suggest the number of classes
# we tell R which breaks to use
hist(x, breaks = c(0, 20, 50, 100))
# with probability = TRUE the bars are scaled so that their total area is 1,
# which gives an estimate of the probability distribution of the data
hist(x, probability = TRUE)
# col = sets the colour of the bars
# with plot = FALSE the output is the breaks and counts, without the drawing
# main = gives our own title to the plot
# xlab and ylab set our own names for the axes
hist(x, main = "Plot of discrete dist")
# xlim and ylim give the desired range for the axes, e.g. ylim = c(0, 20)

# Boxplots ----
boxplot(x, col = 5) # 2 ways of specifying colours
boxplot(x, col = "cyan")
# outlier
x[1] <- 180
y <- x
boxplot(y) # whiskers: q3 + 1.5 * IQR, q1 - 1.5 * IQR
Y <- sample(50:200, size = 200, replace = TRUE)
# 2 boxplots in the same picture
boxplot(x, Y, col = c("red", "green"), names = c("y", "Y"))

# Scatter plots ----
x <- sample(0:100, size = 200, replace = TRUE)
# 200 random draws from a normal distribution with mean = 0 and sd = 10
e <- rnorm(200, 0, 10)
plot(x, e) # independence
# now we set a linear relationship; e can be considered the noise/error
y <- 2 * x + e
plot(x, y) # positive correlation
# insert a line
# vertical line; lty and lwd are also possible options
abline(v = 100, col = "blue")
# horizontal line
abline(h = 200, col = "red")
# line with intercept = 0 and slope = 2; lwd sets the width of the line
abline(0, 2, col = "magenta", lwd = 4)
# type = argument:
# "p" for dots, "l" for lines, "s" for step functions, "n" draws nothing

# Line plots ----
# normal distribution
x1 <- seq(-4, 4, length.out = 1000)
plot(x1, dnorm(x1, 0, 0.5), type = "l")
# lines() adds to an existing plot, so one plot must be drawn first
lines(x1, dnorm(x1, 0, 1), col = "red")
lines(x1, dnorm(x1, 0, 2), col = "green")
lines(x1, dnorm(x1, 0, 3), col = "blue")
# plot ln(x)
x2 <- seq(0.01, 1000, length.out = 10000)
y <- log(x2)
plot(x2, y, type = "l")
# plot x^4 - x^3 + x^2
x4 <- seq(-10000, 10000, length.out = 100000)
y1 <- x4^4 - x4^3 + x4^2 # last term was `x^2`, which recycled an unrelated x
plot(x4, y1, type = "l")

# Add more information to scatter plots ----
weight <- c(rnorm(100, 70, 12), rnorm(100, 55, 7))
height <- c(rnorm(100, 175, 14), rnorm(100, 158, 10))
sex <- rep(c("M", "F"), each = 100) # first 100 values are males, last 100 females
plot(height, weight, type = "p")
# colour the groups; pch draws the dots in different shapes
points(height[sex == "M"], weight[sex == "M"], col = "red", pch = 1)
points(height[sex == "F"], weight[sex == "F"], col = "green", pch = 3)
# instead of colours you can use characters with labels =
# legend() adds more information to the diagram. It is drawn here, right after
# the scatter plot it describes, with the same pch/col as the points above.
legend("topright", pch = c(1, 3), legend = c("Males", "Females"), col = c("red", "green"))
# place the legend with a mouse click; locator() only works interactively
if (interactive()) {
  legend(locator(1), pch = c(1, 3), legend = c("Males", "Females"), col = c("red", "green"))
}

# multiple scatter plots
e <- rnorm(200, 0, 15)
# NOTE(review): x1, y and e have different lengths here (x1 and y come from the
# earlier line plots), so they are recycled silently - confirm the intended inputs.
z <- 30 - 3 * x1 + e
matrix1 <- cbind(x1, y, z)
dimnames(matrix1) <- list(NULL, c("X", "Y", "Z"))
pairs(matrix1) # all pairwise scatter plots

# change the size of the plotting window (dev.new() works on every platform)
dev.new(height = 7, width = 3.5)
# multiple diagrams in one output (same box)
plot(height[sex == "M"], weight[sex == "M"], xlim = c(120, 230), ylim = c(20, 130), col = "red")
par(new = TRUE) # the next plot is drawn on top: adds the data for females
plot(height[sex == "F"], weight[sex == "F"], xlim = c(120, 230), ylim = c(20, 130), col = "green")
# multiple plots in one output (different boxes)
# this command gives us 2 * 2 = 4 plots
par(mfrow = c(2, 2))
x <- rnorm(100, 0, 1)
hist(x)
y <- rnorm(100, 0, 2)
hist(y)
boxplot(x)
boxplot(y)
# split.screen() is a similar command
hist(x)
lines(x)
dev.off() # closes the current graphics device
graphics.off() # similar command: closes all devices

# add a density line to a histogram
x <- rnorm(100, 0, 1)
hist(x, probability = TRUE)
# col belongs to lines(); inside density() it was silently ignored
lines(density(x), col = "red")

# ggplot package ----
library("tidyverse") # ggplot works with data frames
x <- rnorm(100, 0, 1)
y <- rnorm(100, 0, 2)
z <- rep(c("F", "M"), each = 50)
data <- data.frame(x = x, y = y, z = z)
ggplot(data, mapping = aes(x, y)) +
  geom_point(alpha = 0.5, col = "red")
# geom_point (scatter), geom_boxplot, geom_line

# if statement ----
x <- 5
y <- 4
if (x >= 5) y <- 0 # y becomes zero
y
c(x, y)
if (x < 5) y <- 10
y # y stays the same (the condition is FALSE)
# ...same: y is unchanged above because the condition x < 5 is FALSE.
# Several statements under one condition go inside braces.
if (x <= 5) {
  y <- 10
  x <- x^2
}
# or on one line
if (x <= 5) {
  y <- 10; x <- x^2
} # x and y change
if (x > 0) print("positive number") # this prints a phrase
# in general we try to avoid comparisons

# 2 branches or more
x <- 4
y <- 5
if (x < 5) {
  y <- 0
  x <- x^2
} else {
  x <- 0
  y <- 10
}
c(x, y) # the first condition is true
# this can also be written on one line
# if() does not work on vectors: older R versions checked only the first
# element, and since R 4.2 a condition of length > 1 is an error

# first solution when the check involves a vector: boolean expressions
u <- 1:10
t <- (u < 5) * (-1) + (u >= 5) * 1
t # faster than if statements, so we use them when we can

# multiple ifs
x <- 5
if (x == 10) {
  print("same")
} else if (x > 1) {
  print("bigger")
} else {
  print("smaller")
}

# second choice for comparisons on vectors (also matrices and arrays)
x <- 1:10
ifelse(x < 5, 1, 2) # second argument: value if TRUE, third: value if FALSE
ifelse(x < 5, 1:5, 1:3) # yes/no values are recycled and taken position by position

# Loops: for ----
x <- 1:5
y <- seq(10, 18, by = 2)
for (i in x) y[i] <- -y[i]
y # the opposites of y
z <- c(1, 2, 3)
for (i in 1:2) z[i] <- 2 * i
z
for (i in 1:5) {
  x[i] <- x[i]^2
  y[i] <- y[i] * 2 + 1
}
# the loop index does not have to be used in the body
for (i in 1:5) {
  x <- x^2
} # each pass squares the whole of x again
# for loops are slow, so we use another method when we can, for example
x <- rnorm(10000, 0, 1)
x <- x + 1 # vectorised
for (i in seq_along(x)) x[i] <- x[i] + 1 # same operation with a loop

# nested for loops
# this algorithm produces the opposite (negated) matrix of x
x <- matrix(1:16, 4, 4)
for (i in 1:4) {
  for (j in 1:4) {
    x[i, j] <- -x[i, j]
  }
}
x

# Calculate sums with loops (example 6.1) ----
x <- c(3, 4, 6, 7, 8, 1, 7, 4, 9, 0, 4, 5, 6, 12, 17, 19, 35, 45, 20, 18)

# Sum of squared pairwise differences over the first n elements of the global
# vector x, divided by 2 * length(x)^2.
#   n: how many leading elements of x to use.
# Prints the running total after each outer iteration and returns the scaled
# total. Reads x from the calling environment, as in the original example.
ex_sum <- function(n) {
  sum1 <- 0
  for (i in seq_len(n)) {
    for (j in seq_len(n)) {
      sum1 <- sum1 + (x[i] - x[j])^2
    }
    print(sum1) # print each partial result
  }
  sum1 / (2 * length(x)^2)
}

# the first printed value corresponds to this loop
sum_first <- numeric(length(x))
for (i in seq_along(x)) sum_first[i] <- (x[i] - x[1])^2
sum(sum_first) # the first result
# the whole sum can be calculated without loops
((length(x) - 1) / length(x)) * var(x)

# it is preferable to leave the control variable (e.g. i) unchanged, for example
for (i in 1:5) {
  i <- i + 10
  print(i)
}
# as we see,
# the commands are repeated 5 times and it does not matter that we change i
# (other languages have a problem with that change)

# Loops: while ----
# while also creates loops, but driven by a logical condition:
# the commands are repeated as long as the condition is TRUE
i <- 1
while (i < 100) {
  print(i)
  i <- i + 1
}
# we have to be careful not to create endless loops

# another example: solve the non-linear equation pnorm(x) = 0.7,
# i.e. find the (lower) quantile of the normal distribution
t <- -1 # initial value
while (pnorm(t, 0, 1) <= 0.7) t <- t + 0.001
t # approximate solution
qnorm(0.7) # same result

# Illustration only - do NOT run. In older R versions only the first element of
# the condition was checked, giving an endless loop; since R 4.2 a condition of
# length > 1 is an error. Either way the rest of the script would never run.
# while (c(1, 2) < 2) print("kalhmera")

# apply ----
# used to avoid loops; a faster method that works on arrays or matrices
x <- matrix(1:16, 4, 4)
apply(x, 2, mean) # the means of the columns
apply(x, 1, mean) # the means of the rows
# we can also pass our own function
y <- array(1:16, dim = c(2, 2, 4))
apply(y, 1, sum) # sums over the first index (rows) across all matrices
apply(y, c(1, 2), sum) # sum of each (row, column) cell across all matrices

# Functions ----
# benefits: better memory management (variables inside are local), no need to
# rerun the same commands again and again, problems are easier to fix inside
# the function, and others understand the program better

# Range (max minus min) of a vector.
#   x: a numeric vector.
# Returns a single number.
range_ours <- function(x) {
  max(x) - min(x)
}
y <- rnorm(100)
range_ours(y)
# or
z <- range(y)
z[2] - z[1] # range

E <- matrix(1:4, 2, 2)
# Element-wise square of a matrix.
#   A: a numeric matrix.
# Returns a matrix of the same shape with every element squared.
square_matrix <- function(A) {
  A^2
}
# return() is not strictly necessary: the last expression is the result
# in contrast with other languages we do not declare the argument types,
# but the value passed in each call must be of a type the function can use
# it is good practice to initialise variables inside the function, to avoid
# confusion with (non-local) variables used elsewhere; in general it is
# better not to use global variables inside functions
# if a function has more than one output...
# ...we use list() to return several outputs from a function

# Square x and cube y.
# Returns a list with elements x and y.
examp <- function(x, y) {
  x <- x^2
  y <- y^3
  list(x = x, y = y)
}
examp(y = 1, x = 2) # with named arguments the order does not matter
# the output could also be returned as a vector with c(x, y), or printed

# as an example we write a function that is a sort of trimmed mean

# Helper: logical mask, TRUE for the values of x that lie within 2 standard
# deviations of the mean. When sd(x) is 0 or NA (constant or single value)
# no value can be an outlier, so everything is kept.
within_two_sd <- function(x) {
  std <- sd(x)
  if (is.na(std) || std == 0) {
    return(rep(TRUE, length(x)))
  }
  abs((x - mean(x)) / std) <= 2
}

# Mean of the values lying at most 2 standard deviations from the mean.
#   x: a numeric vector.
# Returns a single number.
trimmedmean <- function(x) {
  mean(x[within_two_sd(x)])
}
x <- rnorm(1000, 1, 2)
trimmedmean(x)
y <- rgamma(150, shape = 1, rate = 1) # gamma distribution
trimmedmean(y)

# multiple outcomes: same as above, but also report how many values were used
#   x: a numeric vector.
# Returns list(trimmed = trimmed mean, number_obs = number of values included).
trimmedmean1 <- function(x) {
  keep <- within_two_sd(x)
  # as.numeric keeps the count a double, as the loop counter used to be
  list(trimmed = mean(x[keep]), number_obs = as.numeric(sum(keep)))
}
trimmedmean1(x)
trimmedmean1(x)$trimmed # pick a specific output with $

# of course we can set default values for arguments, e.g.
# Returns list(k = x^2, s = 2 * l); l defaults to 2.
eg <- function(x, l = 2) {
  k <- x^2
  s <- l * 2
  list(k = k, s = s)
}
eg(5) # we did not pass l at all

# functions can be defined inside other functions, but this is debatable;
# the only use is when we do not want to bind more memory
# a function does not have to return a specific result: it can just draw a plot

# Plot the density of a normal distribution.
#   m: mean (default 0); s: standard deviation (default 1).
# The plotted range covers the 0.1% to 99.9% quantiles of that distribution,
# so it follows m and s instead of always spanning the standard normal.
plotnormal <- function(m = 0, s = 1) {
  lowerbound <- qnorm(0.001, m, s)
  upperbound <- qnorm(0.999, m, s)
  x <- seq(lowerbound, upperbound, length.out = 1000)
  plot(x, dnorm(x, m, s), ylab = "density", type = "l")
}
plotnormal()
# books: Grolemund