# Simulate values for a population of size 1000000 # The distribution is a gamma distribution but this is not important here. x <- rgamma(1000000,2,1) # Define breaks for histogram and density plots # (sequence from min(x) to max(x) in steps of size 0.01 br <- seq(from=min(x),to=max(x)+0.01,by=0.01) # Store the histogram of population in a variable originaldensity <- hist(x,breaks=br) # Plot the density polygon plot(originaldensity$mids,originaldensity$density,col="blue",t='l') n <- 10 # take a sample of size n from the population sample(x,n) # Here are the mean of the population distribution and of a sample of size n mean(x) mean(sample(x,n)) population <- x ################################################################ # Now we repeat the above steps and compare # the distribution of the sample mean with a normal distribution truemean <- mean(population) # Simulate 1000 samples of size n from the population n <- 10 samplemean <- numeric(1000) # will be the vector of sample means isin <- logical(1000) # will store whether or not the true value is in [m-se,m+se] for (m in 1:1000) { mysample <- sample(population,n) # draw sample samplemean[m] <- mean(mysample) # compute sample meam sem <- sd(mysample)/sqrt(n) # standard error of the mean if(truemean < samplemean[m]-sem | truemean > samplemean[m]+sem) isin[m] <- FALSE # in this case the true value is not in [m-sem,m+sem] else isin[m] <- TRUE # in this case the true value is in [m-sem,m+sem] } # Note that sum(c(TRUE,TRUE,FALSE))=2 as TRUE is represented by 1 and FALSE is represented # by 0 cat("the true mean is in the interval [m-se,m+se] for ", sum(isin)/length(isin)*100, " % of the samples\n") # Define breaks for histogram and density plots br <- seq(min(population),max(population)+0.2,by=0.2) # Store the histogram of population in a variable originaldensity <- hist(population,breaks=br,probability=TRUE) # Store the histogram of the vector of sample means in a variable samplemeandens <- hist(samplemean,breaks=br,probability=TRUE) # The following variables help to plot everything into the same window meanmax <- max(samplemean) xAxis <- seq(from=min(samplemean),to=max(samplemean)+0.01,by=0.01) ymax <- max(samplemeandens$density,dnorm(xAxis,mean(population),sd(population)/sqrt(n))) # Plot the density polygon of the distribution of the population. # The option ylim=c(a,b) determines the interval [a,b] of the y-axis. plot(originaldensity$mids,originaldensity$density,col="blue",t='l',ylim=c(0,ymax)) # The following commands add vertical lines through the mean and through mean+-se abline(v=mean(population),col="blue",lty=2) abline(v=mean(population)-sd(population)/sqrt(n),col="blue",lty=4) abline(v=mean(population)+sd(population)/sqrt(n),col="blue",lty=4) # plot the density polygon of the distribution of the population lines(samplemeandens$mids,samplemeandens$density,col="red") # add the density of the appropriate normal distribution lines(xAxis,dnorm(xAxis,mean(population),sd(population)/sqrt(n)),lwd=2) # add a legend legend("topright",legend=c("population","sample means", "normal distribution"), col=c("blue","red","black"),lty=1) ###################################################### # The following function generates a sample of size m # of some population (whose distribution is not important # except that it is not unimodal (eingipfelig) populationsimulator <- function(m) { s <- numeric(m) for (i in 1:m) { r <- runif(1,min=0,max=1) if (r < 0.4) s[i] <- abs(rnorm(1,2,0.2))+1 else if (r > 0.8) s[i] <- abs(rnorm(1,8,0.5))+1 else s[i] <- abs(rnorm(1,12,4))+0.3 } return(s) } y <- populationsimulator(1000000) hist(y,breaks=seq(from=0,to=max(y)+0.5,by=0.5))