# Simulate values for a population of size 1000000
# The distribution is a gamma distribution but this is not important here.
x <- rgamma(1000000,2,1)
# Define breaks for histogram and density plots
# (sequence from min(x) to max(x) in steps of size 0.01
br <- seq(from=min(x),to=max(x)+0.01,by=0.01)
# Store the histogram of population in a variable
originaldensity <- hist(x,breaks=br)
# Plot the density polygon
plot(originaldensity$mids,originaldensity$density,col="blue",t='l')
n <- 10
# take a sample of size n from the population
sample(x,n)
# Here are the mean of the population distribution and of a sample of size n
mean(x)
mean(sample(x,n))
population <- x
################################################################
# Now we repeat the above steps and compare
# the distribution of the sample mean with a normal distribution
truemean <- mean(population)
# Simulate 1000 samples of size n from the population
n <- 10
samplemean <- numeric(1000) # will be the vector of sample means
isin <- logical(1000) # will store whether or not the true value is in [m-se,m+se]
for (m in 1:1000) {
mysample <- sample(population,n) # draw sample
samplemean[m] <- mean(mysample) # compute sample meam
sem <- sd(mysample)/sqrt(n) # standard error of the mean
if(truemean < samplemean[m]-sem | truemean > samplemean[m]+sem)
isin[m] <- FALSE # in this case the true value is not in [m-sem,m+sem]
else
isin[m] <- TRUE # in this case the true value is in [m-sem,m+sem]
}
# Note that sum(c(TRUE,TRUE,FALSE))=2 as TRUE is represented by 1 and FALSE is represented
# by 0
cat("the true mean is in the interval [m-se,m+se] for ",
sum(isin)/length(isin)*100, " % of the samples\n")
# Define breaks for histogram and density plots
br <- seq(min(population),max(population)+0.2,by=0.2)
# Store the histogram of population in a variable
originaldensity <- hist(population,breaks=br,probability=TRUE)
# Store the histogram of the vector of sample means in a variable
samplemeandens <- hist(samplemean,breaks=br,probability=TRUE)
# The following variables help to plot everything into the same window
meanmax <- max(samplemean)
xAxis <- seq(from=min(samplemean),to=max(samplemean)+0.01,by=0.01)
ymax <- max(samplemeandens$density,dnorm(xAxis,mean(population),sd(population)/sqrt(n)))
# Plot the density polygon of the distribution of the population.
# The option ylim=c(a,b) determines the interval [a,b] of the y-axis.
plot(originaldensity$mids,originaldensity$density,col="blue",t='l',ylim=c(0,ymax))
# The following commands add vertical lines through the mean and through mean+-se
abline(v=mean(population),col="blue",lty=2)
abline(v=mean(population)-sd(population)/sqrt(n),col="blue",lty=4)
abline(v=mean(population)+sd(population)/sqrt(n),col="blue",lty=4)
# plot the density polygon of the distribution of the population
lines(samplemeandens$mids,samplemeandens$density,col="red")
# add the density of the appropriate normal distribution
lines(xAxis,dnorm(xAxis,mean(population),sd(population)/sqrt(n)),lwd=2)
# add a legend
legend("topright",legend=c("population","sample means", "normal distribution"),
col=c("blue","red","black"),lty=1)
######################################################
# The following function generates a sample of size m
# of some population (whose distribution is not important
# except that it is not unimodal (eingipfelig)
populationsimulator <- function(m) {
s <- numeric(m)
for (i in 1:m) {
r <- runif(1,min=0,max=1)
if (r < 0.4) s[i] <- abs(rnorm(1,2,0.2))+1
else if (r > 0.8) s[i] <- abs(rnorm(1,8,0.5))+1
else s[i] <- abs(rnorm(1,12,4))+0.3
}
return(s)
}
y <- populationsimulator(1000000)
hist(y,breaks=seq(from=0,to=max(y)+0.5,by=0.5))