# Lines that begin with the same character as this one are comment lines.
# They will be ignored by R and just serve as notes for the user.
# Execute this R script line by line.
# Most students and many scientists prefer the program RStudio for this.
# (Free version avaiable from https://www.rstudio.com/)
# If you only have the basic R version on your computer, the command to execute
# a line depends on your operating system:
# On Windows: Start R. Oben the script from the menu (File -> Open script)
# and execute each line with the key combination Ctrl-R in the script window.
# On the Mac: Start R, open the script via the menu and execute each
# line with the key combination CMD-Return in the script window.
# Any computer on which the Emacs editor and the ESS ("Emacs Speaks Statistics")
# package are installed (cf. http://ess.r-project.org/):
# Open the script with emacs. When the window with the script has the focus,
# the will be an icon (blue arrow over a black line) which will execute one line.
# Any other system: Open R in a command line console. Open the script and
# copy-and-paste each line to the line with the ">" in the console window and
# use the return key. You can use the console interactively
# You won't learn anything if you just click through the script.
# Use your brain and the online help system to understand the commands given
# below. Try out what you get if you cange the syntax or options in the commands.
# This will take some time and maybe also some cups of tea or coffee.
###############################################################
###############################################################
##
## Section 1:
## First steps: R replaces your pocket calculator
3+4
3+2*2 # Of course, R knows that this is 3+(2*2)
(3+2)*2
exp(1) # 'exp()' is the exponential function
exp(log(5)) # 'log()' is the natural log with base e
log(8,base=2) # log with base 2
sqrt(9) # SQuare RooT
sqrt(8)
3^3
## Exercise 1: Use R to compute the product of the sum of 3 and 5 and the
## difference of the square root of 121 and the 3rd power of 1.5.
#########################
## Section 2:
## Getting Help:
help("log") # Shows help pages for command "log()"
?sin # Same as help("sin")
help.start() # Starts a browser with starting page for R online helpS
help.search("t-test") # Searches for commands related to t-test
## Exercise 2a: Use R to determine an angle with cosine 0.7. Use the R online
## help system to find out how this works.
## Exercise 2b: Compute the third root of 125.
#########################
## Section 3:
## Variables
a <- 3 # The variable a now takes the value 3
a
b <- 4
b
a+b
b = 5 # same as b <- 5 but may be confused with b==5
b == 5 # Is b equal to 5?
b == 6 # Is b equal to 6?
b
4 -> b # allowed but not recommended
b
## Exercise 3: add 10 to 3 and store the result in a variable. Add 2 to this
## variable. Check wether the result of this variable is then larger than 14.
#########################
## Section 4:
## Vectors
## Objects of the same type can be be coerced into a vector with c()
c(1,2,4) # Vector of integers
x <- c(1,2.3,pi) # Vector of real numbers
class(x) # What's the type of x?
c( c(1,2), c(3,pi,4) ) # c() coerces two or more vectors into one.
z <- c( "a", "character string", "This sentence no verb.")
z # Vector of character strings
class(z) # The type of z is 'character', which means that it contains character strings.
class(x)
# Elementary numerical operations and many functions are applied element-wise
4*c(1,2,4) # gives the vector (4*1,4*2,4*4)
c(1,2,4)+c(2,3,5) # gives the vector (1+2,2+3,4+5)
c(1,2)+c(2,3,1,5) # same as c(1,2,1,2)+c(2,3,1,5)
# If the vector lengths are different, the shorter is repeated until it fits
c(100,200,300,400)+c(1,2)
c(1,2,4)*c(2,3,1) # gives (1*2,2*3,4*1)
c(1,2,3)^2 # gives (1^2,2^2,3^2)
c(1,2,3,4,5)^c(2,3) # gives (1^2,2^3,3^2,4^3,5^2) and a warning.
exp(c(1,4,log(2))) # gives (exp(1), exp(4), 2)
## generating useful sequences of numbers
1:5
(1:5)*2
rep(3,5)
rep( c(2,7) ,3)
seq(from=0,to=1,by=0.1) # Sequence from 0 to 1 in steps of 0.1
## Exercise 4a: create a vector that contains all negative integers larger
## than -100 and all positive even numbers that are smaller than 100 and all
## multiples of 0.3 between 30 and 40. Store the result in a vector v.
## Exercise 4b: compute the ten smallest square numbers.
## component-wise comparisons
x <- c(1,3,5,3)
x == 3
x > 3
x != 5
## Exercise 4c: Check with R whether there are integers n between 1 and 100
## for which n^2 is by more than 500 larger than the 4th power of n-50
## accessing certain elements of a vector
x <- 3:6
x
x[4] # 4th element of x
x[c(2,4)] # 2nd and 4th element; (Try to find out why x[2,4] does not work.)
x[c(FALSE,FALSE,TRUE,TRUE)] # same as x[c(3,4)]
x[x>4] # note that x>4 is same as (FALSE,FALSE,TRUE,TRUE)
# Indexing with TRUE/FALSE-Vectors (also called boolean vectors) can be extremly useful:
size <- 1.70 + seq(from=0.01,to=0.1,by=0.01)
sampling.site <- rep(c("Kathmandu","Bangkok"),5)
size[sampling.site=="Bangkok"]
## Exercise 4d: For WHICH integers n between 1 and 100 is n^2 is by more than
## 500 larger than the 4th power of n-50?
#########################
## Section 5:
## Mean, variance and standard deviation of a sample
x <- c(4:6,0,-5)
x
length(x)
sum(x)
mean(x)
var(x)
sd(x)
median(x)
## Exercise 5a: Compute the mean and the standard deviation of all multiples of 0.1 between 0 and 100.
## Exercise 5b: Compute median, mean and variance of the vector v that you computed in exercise 4a.
#########################
## Section 6:
## Plotting
x <- seq(from=0,to=1,by=0.1)
plot(x,col="red")
plot(x,x^2,col="red")
plot(x,x^2,col="red",pch=16) # point character number 16 is solid bullets
plot(1:20,rep(1,20),pch=1:20) # available point characters 1 to 20
plot(x,x^2,col="red",type="l") # plot line instead of single points
plot(sin,from=-3,to=3) # plot sine function from -3 to 3
abline(v=2) # Add a vertical line through (2,0)
abline(h=1) # Add horizontal line through (0,1)
## Exercise 6a: Plot the graphs of f(x)=2*x^2-30*x+5 and g(x)=(x-10)^3 with two
## different colors into the same coordinate system. For this, use the online
## help to learn about the commands "points" and "lines". Choose an
## interesting range for x.
## Exercise 6b: Create a plot that can be considered as a graphical solution of exercise 4d.
#########################
## Section 7:
## Generating random numbers
plot(dnorm,from=-3,to=3) # dnorm is the density function of the standard normal distribution
x <- rnorm(10) # generates 10 independent values from a standard normal distribution
x
y <- rnorm(1000)
hist(y) # Histogram of y
hist(y, breaks=seq(from=-4,to=4,by=0.2), col="orange") # histogram of higher resolution
z <- 1:6
sample(z, size=6) # draw 6 values from z without replacement. This is just a random permutation
s <- sample(z, size=10, replace=TRUE)
s # draw 10 values from z WITH replacement. This simulates rolling a dice 10 times.
mean(s)
sd(s)
sample(z, size=10, replace=TRUE)
## Exercise 7: draw a sample from a standard normal distribution of size n=10
## and compute the mean and the standard deviation of this sample. Repeat this
## several times for n=10, and also for n=100 and n=10000.
#########################
## Section 8:
## A for loop can repeat commands for different values
s <- 0
for( a in c(1,4,6,8)) {
s <- s + a
cat("a now has the value ",a,"\n")
## '\n' means "end of line"
}
s
sum(c(1,4,6,8))
## Exercise 8a: create a vector w of length 100. Then, simulate for each n from
## 1 to 100 a standard normal distribution of size n and store its mean in
## w[n]. After that, visualize how w[n] depends on n.
## Exercise 8b: create a vector w of length 100. Then, simulate for each n from
## 1 to 100 a standard normal distribution of size n^2 and store its mean in
## w[n]. After that, visualize how w[n] depends on n^2.
# nested loops:
x <- numeric()
for ( i in 1:3) {
x[i] <- 0
for (j in (1:3)) {
cat(i," plus ",j," is ",i+j,"\n")
x[i] <- x[i]+i*j
}
}
x
## Exercise 8c (ADVANCED): To find all prime numbers smaller than 10000,
## define the vector u=c(FALSE,rep(TRUE,9999)) and sort out all numbers n that
## are multiples of numbers between 2 and 100 by setting u[n]=FALSE (Sieve of
## Eratosthenes). Understand why you can use (1:10000)[u] to print the result.
#########################
## Section 9:
## A data.frame can store a data table
bavaria <- data.frame(city=c("Munich","Nuremberg","Augsburg"),
area=c(310.43,186.38,146.93),
popul=c(1356594,500132,263477))
bavaria
str(bavaria) # Get an overview of the data frame (str also works with
# other types of data structures)
bavaria$city # Use '$'-operator to access column of table
bavaria$area
class(bavaria$city) # Note that the class is "factor", not "character". More about this later.
class(bavaria$area)
bavaria$area[1] # bavaria$area is a vector
bavaria$city[2]
bavaria[1]
class(bavaria[1])
length(bavaria[1])
bavaria[[1]]
bavaria[1,2] # first line, second column
bavaria[3,1] # third line, first column
bavaria[2:3,1:2] # lines 2 and 3, columns 1 and 2
bavaria[2:3,] # lines 2 and 3
bavaria[,2:3] # columns 2 and 3
bavaria[ bavaria$area<200 , ] # all lines with bavaria$area < 200
bavaria[ bavaria$area<200, c("city","area") ]
# If you are too lazy to always type "bavaria$"
city # does not exist
attach(bavaria) # Tell R to use this table per default
city # 'city', 'area' and 'popul' are known now
bavaria[ area<200, c("city","area") ]
mean(area)
var(area)
detach(bavaria)
city # not known any more
## Exercise 9: Compute the population densities of the three cities and assign
## the results to a new column in the data frame.
#########################
## Section 10:
## reading data from files
# R can read data from (ascii) text files.
# If data are separated by blanks or tabs us the command read.table()
# For comma-seperated values (csv) use read.csv()
# If the first line contains the names of the columns, use the option
# header=TRUE
# We assume that the data file 'swarth1.txt' is stored in the current
# directory (or folder). You can check it with getwd() and set it
# to "xyz" with setwd("xyz").
finches <- read.table("swarth1.txt",header=TRUE)
finches
head(finches) # Show first lines
str(finches) # Other summary of data frame
attach(finches)
mean(beak)
sd(beak)
mean(beak[island=="Floreana"])
detach(finches)
## Exercise 10: Create a dataset with a spreadsheet program like Excel or
## LibreOffice Calc, store it, and load it into R as a data frame. Use the
## command "str" to inspect the data frame.
#########################
## Section 11:
## Plotting data with boxplot(), stripchart() and hist()
attach(finches)
boxplot(beak~island) # boxplots of beak lengths, separated by islands
plot(beak~island) # same plot as default since island is a factor variable
plot(island,beak) # same plot as default since island is a factor variable
stripchart(beak~island)
stripchart(beak~island,pch=16,col="steelblue",ylim=c(0.5,3.5),method="stack")
stripchart(beak~island,pch=16,col="tomato2",ylim=c(0.5,3.5),method="jitter")
colors() # all names of pre-defined colors
hist(beak)
abline(v=mean(beak))
abline(v=mean(beak),lwd=3) # lwd="line width"
abline(v=mean(beak)+sd(beak),lwd=3,lty="dashed")
abline(v=mean(beak)-sd(beak),lwd=3,lty="dashed")
detach(finches)
#########################
## Section 12:
## density polygons
attach(finches)
hist(beak[island=="Santa_Cruz"], main="Beobachtete beaklĂ¤ngen auf Santa Cruz")
hist(beak[island=="Floreana"], main="Beobachtete beaklĂ¤ngen auf Floreana")
# Compute histogram data but do not plot
h.SCru <- hist(beak[island=="Santa_Cruz"], plot=FALSE)
h.Fl <- hist(beak[island=="Floreana"], plot=FALSE)
# h.Fl$mids are the centers of the histogram bars, h.Fl$dens are the heights
# we use these values for the density polygons.
# First generate an empty plot
plot(-1,-1, main="density polygons of beak length distributions",
sub="on Floreana and Santa Cruz",
xlim=c(min(h.SCru$mids, h.Fl$mids),
max(h.SCru$mids, h.Fl$mids)),
ylim=c(0,max(h.SCru$dens, h.Fl$dens)),
xlab="beak lengths",ylab="density")
# add density polygons
points(h.SCru$mids, h.SCru$dens, lty=1, lwd=2, type="l")
points(h.Fl$mids, h.Fl$dens, lty=3, lwd=2, type="l")
detach(finches)
#########################
## Section 13:
## factors
attach(finches)
# Factor variables are used to group data into classes
class(island)
island
levels(island)
# Sometimes a numeric variable is interpreted a factor by mistake.
# Correcting this needs some care:
f <- factor(c(3,5,4,10))
f
as.numeric(f) # This is the internal representation of f and probably not what you want
as.numeric(as.vector(f)) # This is what we want
as.vector(f) # converts factor into vector of character strings
#########################
## Section 14:
## finish session
q()
##################################################
##################################################
## Section 15:
## Example with Darwin's finches
## Use setwd("C:where/your/data/are") or the corresponding menu item to specify where to
## find the data
finches <- read.table("FinchesSulloway.txt",header=TRUE,sep="\t") # read data from file
attach(finches)
boxplot(BeakH~SpeciesID)
boxplot(WingL~SpeciesID)
stripchart(WingL~SpeciesID,method="stack",vertical=TRUE)
#pdf(file="wingbox.pdf",width=10,height=7)
boxplot(WingL~IslandID,col="yellow",horizontal=TRUE,main="Wing Lengths by Island")
#dev.off()
#pdf(file="wingstrip.pdf",width=10,height=7)
stripchart(WingL~IslandID,method="stack",col="red",cex=3,lwd=2)
#dev.off()
#pdf(file="wingboxstrip.pdf",width=10,height=7)
boxplot(WingL~IslandID,col="yellow",horizontal=TRUE,main="Wing Lengths by Island")
stripchart(WingL~IslandID,method="jitter",col="red",cex=3,pch=1,lwd=2,add=TRUE)
#dev.off()
#pdf(file="winghist.pdf",width=10,height=7)
h1 <- hist(WingL[IslandID==""],freq=FALSE,col="#FF990044",xlim=c(50,95),breaks=-1:10*5+50,
main="Histogramm (Densities!) with transparen colors",xlab="Wing Lengths")
h2 <- hist(WingL[IslandID=="SCris_Chat"],freq=FALSE,col="#FF009944",breaks=-1:10*5+50,add=TRUE)
h3 <- hist(WingL[IslandID=="Flor_Chrl"],freq=FALSE,col="#00FF0044",breaks=-1:10*5+50,add=TRUE)
h4 <- hist(WingL[IslandID=="Snti_Jams"],freq=FALSE,col="#0000FF44",breaks=-1:10*5+50,add=TRUE)
legend(80,0.18,legend=c("","SCris_Chat","Flor_Chrl","Snti_Jams"),
col=c("#FF990044","#FF009944","#00FF0044","#0000FF44"),pch=16)
#dev.off()
#pdf(file="wingbar.pdf",width=10,height=7)
M <- rbind(h1$counts,h2$counts,h3$counts,h4$counts)
colnames(M) <- h1$mids
barplot(M,beside=TRUE,col=c("#FF9900","#FF0099","#00FF00","#0000FF"),main="Barplot of Wing Lengths (Numbers)")
legend(40,6,legend=c("","SCris_Chat","Flor_Chrl","Snti_Jams"),col=c("#FF9900","#FF0099","#00FF00","#0000FF"),pch=16)
#dev.off()
#pdf(file="wingdens.pdf",width=10,height=7)
plot(h1$mids,h1$density,col="#FF9900",lwd=3,t="l",xlim=c(50,100),main="Density Polygons",
xlab="Wing Lengths",ylab="Density")
lines(h2$mids,h2$density,col="#FF0099",lwd=3)
lines(h3$mids,h3$density,col="#00FF00",lwd=3)
lines(h4$mids,h4$density,col="#0000FF",lwd=3)
legend(80,0.18,legend=c("","SCris_Chat","Flor_Chrl","Snti_Jams"),col=c("#FF9900","#FF0099","#00FF00","#0000FF"),pch=16)
#dev.off()
#pdf(file="beakbox.pdf",width=10,height=7)
boxplot(BeakH~SpeciesID,col="yellow",main="Beak Sizes by Species")
#dev.off()
#pdf(file="beakboxstrip.pdf",width=10,height=7)
boxplot(BeakH~SpeciesID,col="yellow",main="Beak Sizes by Species")
stripchart(BeakH~SpeciesID,method="jitter",vertical=TRUE,add=TRUE,pch=1,col="red")
#dev.off()
detach(finches)
#########################
## Section 16:
## Some advanced aspects
c(1,2,4) %*% c(2,3,1) # scalar product
## mixing your own colors
y <- rnorm(100,mean=50,sd=10)
hist(y,breaks=0:20*5,col="tomato")
colors()
mycolor <- rgb(red=0.0,green=0.3,blue=0.8,alpha=0.5)
mycolor
hist(x,breaks=0:20*5,col=mycolor,add=TRUE)
x <- rep(0:100,101)/100
y <- rep(0:100,rep(101,101))/100
plot(x,y,col=rgb(red=x,green=y,blue=0),pch=15)
plot(x,y,col=rgb(red=x,green=y,blue=1.0),pch=15)
# Indexing vectors with names
x <- 1:5
names(x) <- c("Charles","Robert","Darwin","Albert","Einstein")
x
x[c("Robert","Charles")]
x[["Robert"]]
x[[c("Robert","Charles")]] # Does not work
x<0
x[x<0]
## Lists
## data.frames are a special type of lists.
## Try to understand the differnce between L[..] and L[[..]]
V <- c(2,3,4)
V[2]+V[3]
L <- list(2,3,4)
L
L2 <- L[ c(1,3) ] # L2 is also a list. It contains the first and the third value of L
L2
class(L2)
L[2] # again a list. it contains only the second value
class(L[ 2 ] )
L[[2]] # the second value
class(L[[ 2 ]] )
L[2]+L[3] ## does not work because you cannot add lists
L[[2]] + L[[3]] ## that's how it works, because you can add the values
L[2:3] + L[3] ## does not work
L[[2:3]] + L[[3]] ## does not work
unlist(L[2:3])+ L[[3]] ## works (hopefully)
## see also http://cran.r-project.org/doc/manuals/R-intro.html#Lists
## An important difference between vectors and lists is that lists can be nested.
L <- list(c("hello",31,7) , c(4,5))
L
V <- c(c("hello",31,7) , c(4,5))
V
L[1]
V[1]
L[2]
L[2][1]
L[[2]][1]
class(L)
class(L[1])
class(L[[1]])
L <- list( Joe=c(1,"hello",3) , Jack=c(4,5))
L[1]
L[[1]]
L["Joe"]
L[["Joe"]]
L$Joe
### SOLUTIONS OF EXERCISES
## Exercise 1: Use R to compute the product of the sum of 3 and 5 and the
## difference of the square root of 121 and the 3rd power of 1.5.
(3+5)*(sqrt(121)-1.5^3)
## Exercise 2a: Use R to determine an angle with cosine 0.7. Use the R online
## help system to find out how this works.
?cos
acos(0.7)
acos(0.7)/(pi*2)*360
## Exercise 2b: Compute the third root of 125.
125^(1/3)
## Exercise 3: add 10 to 3 and store the result in a variable. Add 2 to this
## variable. Check wether the result of this variable is then larger than 14.
z <- 10+3
z <- z+2
z > 14
## Exercise 4a: create a vector that contains all negative integers larger
## than -100 and all positive even numbers that are smaller than 100 and all
## multiples of 0.3 between 30 and 40. Store the result in a vector v.
v <- c(-99:-1,(1:50)*2,seq(from=30.3,to=40,by=0.3))
v
## Exercise 4b: compute the ten smallest square numbers.
(1:10)^2 ## or (0:9)^2 if 0 counts as square number
## Exercise 4c: Check with R whether there are integers n between 1 and 100
## for which n^2 is by more than 500 larger than the 4th power of n-50
n <- 1:100
n^2 > (n-50)^4+500
## Exercise 4d: For WHICH integers n between 1 and 100 is n^2 by more than
## 500 larger than the 4th power of n-50?
n[n^2 > (n-50)^4+500]
## Exercise 5a: Compute the mean and the standard deviation of all multiples of 0.1 between 0 and 100.
a <- seq(from=0.1,t=99.9,by=0.1)
mean(a)
sd(a)
## Exercise 5b: Compute median, mean and variance of the vector v that you computed in exercise 4a.
median(v)
mean(v)
var(v)
## Exercise 6a: Plot the graphs of f(x)=2*x^2-30*x+5 and g(x)=(x-10)^3 with two
## different colors into the same coordinate system. For this, use the online
## help to learn about the commands "points" and "lines". Choose an
## interesting range for x.
?points
?lines
x <- seq(0,30,by=0.1)
plot(x,2*x^2-30*x+5,col="red",ylim=c(-200,1000),t="l")
lines(x,(x-10)^3,col="blue")
## Exercise 6b: Create a plot that can be considered as a graphical solution of exercise 4d.
plot(n,n^2 - (n-50)^4,col="red")
abline(h=500)
plot(n,n^2 - (n-50)^4,col="red",ylim=c(-3000,3000))
abline(h=500)
plot(n,n^2)
points(n,(n-50)^4+500,col="red")
## Exercise 7: draw a sample from a standard normal distribution of size n=10
## and compute the mean and the standard deviation of this sample. Repeat this
## several times for n=10, and also for n=100 and n=10000.
y <- rnorm(10)
mean(y)
sd(y)
y <- rnorm(10)
mean(y)
sd(y)
y <- rnorm(10)
mean(y)
sd(y)
y <- rnorm(10)
mean(y)
sd(y)
y <- rnorm(100)
mean(y)
sd(y)
y <- rnorm(100)
mean(y)
sd(y)
y <- rnorm(100)
mean(y)
sd(y)
y <- rnorm(100)
mean(y)
sd(y)
y <- rnorm(10000)
mean(y)
sd(y)
y <- rnorm(10000)
mean(y)
sd(y)
y <- rnorm(10000)
mean(y)
sd(y)
y <- rnorm(10000)
mean(y)
sd(y)
## Exercise 8a: create a vector w of length 100. Then, simulate for each n from
## 1 to 100 a standard normal distribution of size n and store its mean in
## w[n]. After that, visualize how w[n] depends on n.
w <- rep(0,100)
for(n in 1:100) {
w[n] <- mean(rnorm(n))
}
plot(1:100,w)
## Exercise 8b: create a vector w of length 100. Then, simulate for each n from
## 1 to 100 a standard normal distribution of size n^2 and store its mean in
## w[n]. After that, visualize how w[n] depends on n^2.
w <- rep(0,100)
for(n in 1:100) {
w[n] <- mean(rnorm(n^2))
}
plot(1:100,w)
## Exercise 8c (ADVANCED): To find all prime numbers smaller than 10000,
## define the vector u=c(FALSE,rep(TRUE,9999)) and sort out all numbers n that
## are multiples of numbers between 2 and 100 by setting u[n]=FALSE (Sieve of
## Eratosthenes). Understand why you can use (1:10000)[u] to print the result.
u <- c(FALSE,rep(TRUE,9999))
for(i in 2:100) {
if(u[i]) {
j <- 2*i
while(j<=10000) {
u[j] <- FALSE
j <- j+i
}
}
}
(1:10000)[u]
## Exercise 9: Compute the population densities of the three cities and assign
## the results to a new column in the data frame.
bavaria$dens <- bavaria$popul / bavaria$area
bavaria