####################
# INTRODUCTION TO R
####################

### Lecture 5 - Rearranging and manipulating data

###########################################################################
# Data set 1: Data on fish abundance
# Files:
#   Fish_survey.csv
#   Water_data.csv
#   GPS_data.csv
# Steps of analysis:
#   Part 1: Reshape the data
#   Part 2: Combine all three files into one file
###########################################################################

##########################
# PART 1: Reshape the data
##########################

# Load necessary libraries
library(dplyr)
library(tidyr)

# Import the data
Fish_survey <- read.csv("data/Fish_survey.csv", header = TRUE)
head(Fish_survey)

# Columns 4 to 6 each hold the abundance of one species. To stack them into a
# single key column (Species) and a single value column (Abundance), use the
# function gather() from the tidyr package.
# Note: in tidyr >= 1.0.0, gather()/spread() are superseded by
# pivot_longer()/pivot_wider(); they are kept here because this lecture was
# recorded with tidyr 0.8.0 (see SESSION INFORMATION below).
Fish_survey_long <- gather(Fish_survey, Species, Abundance, 4:6)

# Check the data
head(Fish_survey_long)
tail(Fish_survey_long)

# To convert the data back into a format with separate columns for each
# species, use the function spread() from the tidyr package:
Fish_survey_wide <- spread(Fish_survey_long, Species, Abundance)
head(Fish_survey_wide)

#################################################
# PART 2: Combine all three files into one file
#################################################

# Import Water_data.csv and GPS_data.csv
Water_data <- read.csv("data/Water_data.csv", header = TRUE)
GPS_location <- read.csv("data/GPS_data.csv", header = TRUE)

# To combine the data sets we will use the package dplyr.
# inner_join() keeps only the rows whose key values occur in both tables.

# Join water characteristics to fish abundance data, matching on Site and Month
Fish_and_Water <- inner_join(
  Fish_survey_long,
  Water_data,
  by = c("Site", "Month")
)

# Add GPS locations to the new Fish_and_Water data set, matching on Site and
# Transect
Fish_survey_combined <- inner_join(
  Fish_and_Water,
  GPS_location,
  by = c("Site", "Transect")
)

# Get an overview of your data
str(Fish_survey_combined)
summary(Fish_survey_combined)

# Clean up before moving on to the next data set.
# Only the objects created above are removed; rm(list = ls()) would also
# delete every unrelated object in the user's workspace.
rm(
  Fish_survey, Fish_survey_long, Fish_survey_wide,
  Water_data, GPS_location,
  Fish_and_Water, Fish_survey_combined
)

###########################################################################
# Data set 2: Bird behaviour
# File:
#   Bird_Behaviour.csv
# Steps of analysis:
#   Part 1: Adding new variables
#   Part 2: Combining/separating variables
#   Part 3: Subsetting data
#   Part 4: Summarizing data
###########################################################################

# Load data (keep text columns as character vectors instead of factors)
Bird_Behaviour <- read.csv(
  "data/Bird_Behaviour.csv",
  header = TRUE,
  stringsAsFactors = FALSE
)

# Get an overview of the data
str(Bird_Behaviour)

#################################################
# PART 1: Adding new variables
#################################################

# We want to add the new variable (column) log_FID.
# The three possibilities below are equivalent; each one (re)creates the same
# column.

# Possibility 1: Using $
Bird_Behaviour$log_FID <- log(Bird_Behaviour$FID)

# Possibility 2: Using the []-operator
Bird_Behaviour[, "log_FID"] <- log(Bird_Behaviour$FID)

# Possibility 3: Using mutate() from the dplyr package
Bird_Behaviour <- mutate(Bird_Behaviour, log_FID = log(FID))
head(Bird_Behaviour)

#################################################
# PART 2: Combining/separating variables
#################################################

# We now want to split genus from species.
# We can split one column into two using the function separate() from the
# tidyr package. remove = TRUE drops the original column, which is then
# replaced by the new Genus and Species columns.
Bird_Behaviour <- separate(
  Bird_Behaviour,
  Species,
  into = c("Genus", "Species"),
  sep = "_",
  remove = TRUE
)
head(Bird_Behaviour)

# We can combine two columns into one using the function unite() from the
# tidyr package:
Bird_Behaviour <- unite(
  Bird_Behaviour,
  "Species",
  c(Genus, Species),
  sep = "_",
  remove = TRUE
)
head(Bird_Behaviour)

#################################################
# PART 3: Subsetting data
#################################################

# Subsetting data with the []-operator: data[rows, columns]

# selects the first 4 columns
Bird_Behaviour[, 1:4]

# selects rows 2 and 3
Bird_Behaviour[c(2, 3), ]

# selects the rows 1 to 3 and columns 1 to 4
Bird_Behaviour[1:3, 1:4]

# selects the rows 1 to 3 and 6, and the columns 1 to 4 and 8
Bird_Behaviour[c(1:3, 6), c(1:4, 8)]

# Subsetting data with the []- and $-operators
# Example: select all rows with males
Bird_Behaviour[Bird_Behaviour$Sex == "male", ]

# Subsetting rows with dplyr

# selects rows 3 to 5
Bird_Behaviour.slice <- slice(Bird_Behaviour, 3:5)

# selects rows that meet certain criteria (here: FID smaller than 5)
Bird_Behaviour.filter <- filter(Bird_Behaviour, FID < 5)

# randomly takes 50% of the rows (without replacement)
# Note: the random draws below differ between runs because no seed is set.
Bird_Behaviour.50 <- sample_frac(
  Bird_Behaviour,
  size = 0.5,
  replace = FALSE
)

# randomly takes 50 rows (without replacement)
Bird_Behaviour_50Rows <- sample_n(
  Bird_Behaviour,
  50,
  replace = FALSE
)

# Subsetting columns with select() from dplyr

# selects the columns Ind, Sex, Fledglings
Bird_Behaviour_col <- select(Bird_Behaviour, Ind, Sex, Fledglings)

# excludes the variable Disturbance
Bird_Behaviour_reduced <- select(Bird_Behaviour, -Disturbance)

#################################################
# PART 4: Summarizing data
#################################################

# We can use summarize() from dplyr to get statistics of our data
summarize(Bird_Behaviour, mean.FID = mean(FID))

# We can add more variables to our summary
summarize(
  Bird_Behaviour,
  mean.FID = mean(FID),   # mean
  min.FID = min(FID),     # minimum
  max.FID = max(FID),     # maximum
  med.FID = median(FID),  # median
  sd.FID = sd(FID),       # standard deviation
  var.FID = var(FID),     # variance
  n.FID = n()             # sample size
)

# To summarize the data for each species we can use group_by() from the dplyr
# package; summarize() then returns one row per species.
Bird_Behaviour_by_Species <- group_by(Bird_Behaviour, Species)
summary_species <- summarize(
  Bird_Behaviour_by_Species,
  mean.FID = mean(FID),   # mean
  min.FID = min(FID),     # minimum
  max.FID = max(FID),     # maximum
  med.FID = median(FID),  # median
  sd.FID = sd(FID),       # standard deviation
  var.FID = var(FID),     # variance
  n.FID = n()             # sample size
)
summary_species

# summary_species is a tibble (a special kind of data frame) - we can convert
# it to a plain data frame with as.data.frame()
summary_species_df <- as.data.frame(summary_species)

#########################
# SESSION INFORMATION
#########################

sessionInfo()
# R version 3.3.3 RC (2017-02-27 r72279)
# Platform: x86_64-pc-linux-gnu (64-bit)
# Running under: Ubuntu 16.04.3 LTS
#
# locale:
#  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C
#  [3] LC_TIME=de_DE.UTF-8        LC_COLLATE=en_GB.UTF-8
#  [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=en_GB.UTF-8
#  [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C
#  [9] LC_ADDRESS=C               LC_TELEPHONE=C
# [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C
#
# attached base packages:
# [1] stats     graphics  grDevices utils     datasets  methods   base
#
# other attached packages:
# [1] tidyr_0.8.0   dplyr_0.8.0.1
#
# loaded via a namespace (and not attached):
#  [1] tidyselect_0.2.5 magrittr_1.5     assertthat_0.2.0 R6_2.2.2
#  [5] tools_3.3.3      pillar_1.3.1     glue_1.3.0       tibble_2.0.1
#  [9] crayon_1.3.4     Rcpp_1.0.0       stringi_1.1.6    pkgconfig_2.0.2
# [13] rlang_0.3.1      purrr_0.2.4

Sys.time()
# [1] "2019-03-07 20:49:24 CET"