####################
# INTRODUCTION TO R
####################

### Lecture 4 - Reading and writing data

###########################################################################
# Our data:
# An example on marine snails provided by www.environmentalcomputing.net
# There are some problems with the data we have to take care of
# File: Snail_feeding.csv

# Steps of analysis:
# Part 1: Import the data
# Part 2: Check and clean the data
# Part 3: Export the data
# Part 4: Clean R
###########################################################################

##########################
# PART 1: Import the data
##########################

# Read the raw CSV from the current working directory.
# stringsAsFactors = TRUE is given explicitly: the checks in Part 2 use
# levels() and per-level counts from summary() on 'Sex', which need a factor,
# and read.csv() only creates factors by default in R versions before 4.0.0.
# NOTE(review): according to the read.table documentation, na.strings is
# tested after white space has been stripped, so together with
# strip.white = TRUE the value " " may never match - confirm whether ""
# was intended.
Snail_data <- read.csv(
  file = "Snail_feeding.csv",
  header = TRUE,
  strip.white = TRUE,
  na.strings = " ",
  stringsAsFactors = TRUE
)

###################################
# PART 2: Check and clean the data
###################################

# Get an overview of your data
str(Snail_data)
summary(Snail_data)

# Specific problems:
# Problem 1: What are the last 3 columns? -> We remove them
# Problem 2: Why does the variable 'Sex' have 4 levels? (male, female - ?)
# Problem 3: Possible problem with variable 'Depth' - Max: 162, Mean: 1.7?
# General:
# Problem 4: Do we have any duplicated rows?

# We now take care of all these problems one-by-one

## Problem 1: Remove the last 3 columns, i.e. only take the first 7 columns
Snail_data <- Snail_data[, 1:7]

# Check data again
str(Snail_data)

## Problem 2: Why does the variable 'Sex' have 4 levels?
# The checks below (levels(), per-level counts in summary()) need 'Sex' to be
# a factor; as.factor() leaves the column unchanged if it already is one.
Snail_data$Sex <- as.factor(Snail_data$Sex)

unique(Snail_data$Sex)
# or
levels(Snail_data$Sex)

# Ah, possibly typos - we can assign 'males' and 'Male' the value 'male'
Snail_data$Sex[which(Snail_data$Sex == "males")] <- "male"
Snail_data$Sex[which(Snail_data$Sex == "Male")] <- "male"

# Or - both assignments at once
# Both conditions must be combined with '|' INSIDE which(); combining the
# integer result of which() with a logical vector does not select the
# intended rows.
Snail_data$Sex[which(Snail_data$Sex == "males" |
                       Snail_data$Sex == "Male")] <- "male"

# Check if it worked
levels(Snail_data$Sex)
summary(Snail_data$Sex)

# We still have the extra levels, but the summary shows that we now only have
# 'male' and 'female' as expected -> we have to remove the extra levels
Snail_data$Sex <- factor(Snail_data$Sex)

# Test again
levels(Snail_data$Sex)
# Okay - done.

## Problem 3: Possible problem with variable 'Depth' - Max: 162, Mean: 1.7?
summary(Snail_data$Depth)

# The lecturer tells us she had introduced a typo - it should be 1.62
# Ah, makes much more sense -> we have to reassign the value

# Get the row with the maximum depth
which.max(Snail_data$Depth)

# Row 8 -> change the value in column 'Depth' (column 6) to 1.62
# The column is addressed by name so the fix does not depend on its position.
Snail_data[8, "Depth"] <- 1.62
summary(Snail_data$Depth)

# Any more problems?
# The lecturer tells us the depth should be below 2 m - the snails do not live
# in deeper water
# Test for that
Snail_data[which(Snail_data$Depth > 2), ]
# You do not get any more rows, okay - done.

## Problem 4: Do I have any duplicated rows?
# The lecturer tells us that in this case there should not be any
# duplicated rows -> if we find any, remove them

# Identify duplicated rows
duplicated(Snail_data)

# How many?
sum(duplicated(Snail_data))

# Which row?
Snail_data[which(duplicated(Snail_data)), ]
#    Snail.ID  Sex  Size Feeding Distance Depth Temp
# 17        1 male small   FALSE     0.87  1.95   18

# CAREFUL: It is ROW number 17 - not Snail.ID!
# Have a look at the first 20 rows of the data
head(Snail_data, n = 20)

# Get rid of the duplicates
Snail_data <- unique(Snail_data)

# Check data
sum(duplicated(Snail_data))
# Okay
head(Snail_data, n = 20)

# Wait - row 17 is now missing
# We have to renumber the rows
# seq_len() is used instead of 1:nrow() because it also gives the right
# (empty) result for a data frame with zero rows.
row.names(Snail_data) <- seq_len(nrow(Snail_data))

# Check again...
head(Snail_data, n = 20)
# Okay - done.

##########################
# PART 3: Export the data
##########################

# row.names = FALSE: do not write the row numbers as an extra column
write.csv(Snail_data, file = "Snail_data_checked.csv", row.names = FALSE)

# If you are really paranoid now (understandably and totally fine) - read in the
# new data and check it again
# stringsAsFactors = TRUE is given explicitly so that summary() shows counts
# per level for text columns in every R version (read.csv() only creates
# factors by default before R 4.0.0).
Snail_data_checked <- read.csv(
  file = "Snail_data_checked.csv",
  header = TRUE,
  strip.white = TRUE,
  na.strings = " ",
  stringsAsFactors = TRUE
)

summary(Snail_data_checked)
str(Snail_data_checked)
sum(duplicated(Snail_data_checked))

# OKAY - REALLY DONE!

##########################
# PART 4: Cleaning R
##########################

# List of objects
ls()
# [1] "Snail_data"         "Snail_data_checked"

# Clean R's environment
# Only the objects created by this script are removed; rm(list = ls()) would
# also delete every other object in the workspace.
rm(Snail_data, Snail_data_checked)

#########################
# SESSION INFORMATION
#########################

sessionInfo()

# R version 3.2.0 (2015-04-16)
# Platform: x86_64-pc-linux-gnu (64-bit)
# Running under: Ubuntu 14.04.2 LTS
#
# locale:
#  [1] LC_CTYPE=de_DE.UTF-8       LC_NUMERIC=C
#  [3] LC_TIME=de_DE.UTF-8        LC_COLLATE=de_DE.UTF-8
#  [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=de_DE.UTF-8
#  [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C
#  [9] LC_ADDRESS=C               LC_TELEPHONE=C
# [11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C
#
# attached base packages:
# [1] stats     graphics  grDevices utils     datasets  methods   base
#
# loaded via a namespace (and not attached):
# [1] tools_3.2.0

Sys.time()
# [1] "2019-03-07 11:42:33 CET"