###WQP Insect Occurrence Records Cleaning File ##Author: Laura Twardochleb ##This script combines and explores all "Benthic Macroinvertebrate" and "Invertebrate" data from WQP for the continental US ##This script contains code for initial data cleaning up to removing records with georeferencing errors ##This script takes cleaned WQP data and assigns taxonomic names for each taxonomic level- from species to order to WQP data by creating a separate column for each taxonomic level ##A dataset that has removed records with georeferencing errors or missing datum type "WQP_invertebrate_1979_2016_2.csv" #Clear all existing data rm(list=ls()) #Close graphics devices graphics.off() setwd("~/Documents/WaterCube/Aquaxterra_code_data/aquatic_insects-master/code/Data_cleaning_WQP_traits") load(file='WQP_cleaning.RData') #####Read in all data############################################ #this is the data from the "benthic macroinvertebrate" assemblage search biodata<-read.csv("macroinvertebrate_biological.csv", header=TRUE) #read in biological count data sitedata<-read.csv("macroinvertebrate_site_data.csv", header=TRUE) #read in site data that includes lat and lon for each site #this is the data from the "invertebrate" assemblage search invert.biodata<-read.csv("invertebrate_biological.csv", header=TRUE) invert.sitedata<-read.csv("invertebrate_site.csv", header=TRUE) #before merging the datasets, need to remove tissue sample records (samples for chemical pollutants) from invertebrate assemblage dataset unique(invert.biodata$BiologicalIntentName) invert.biodata1<-subset(invert.biodata, (BiologicalIntentName == "Population Census") | (BiologicalIntentName == "Species Density")) #####Merge WQP databases######################################### #merge the site-level and biological invertebrate sample data invertdata<-merge(invert.biodata1, invert.sitedata, by="MonitoringLocationIdentifier", all=TRUE) macroinvertdata<-merge(biodata, sitedata, by="MonitoringLocationIdentifier", all=TRUE) #merge the two datasets wqp<-rbind(invertdata, macroinvertdata) #how many initial records? ######Remove marine samples############################################################################## #next remove marine samples- estuaries, oceans, superfund sites, other types can be removed later, e.g., springs, land; also remove samples with NAs in the location type names(wqp) unique(wqp$MonitoringLocationTypeName) wqp1<-subset(wqp, (MonitoringLocationTypeName != "") & (MonitoringLocationTypeName !="Facility Municipal Sewage (POTW)")&(MonitoringLocationTypeName !="Storm Sewer")&(MonitoringLocationTypeName !="Ocean")&(MonitoringLocationTypeName !="Estuary")&(MonitoringLocationTypeName !="Facility Industrial")&(MonitoringLocationTypeName !="Facility Public Water Supply (PWS)")&(MonitoringLocationTypeName !="CERCLA Superfund Site")&(MonitoringLocationTypeName !="Facility Other")&(MonitoringLocationTypeName !="Canal Irrigation")&(MonitoringLocationTypeName !="Waste Sewer")&(MonitoringLocationTypeName !="Land Runoff")&(MonitoringLocationTypeName !="Canal Drainage")&(MonitoringLocationTypeName !="Canal Transport")&(MonitoringLocationTypeName !="Other-Surface Water")&(MonitoringLocationTypeName !="Land")) unique(wqp1$MonitoringLocationTypeName) #######Remove non-count or presence/absence records###################################################### #next remove all tissue samples- or any samples that are not population census, population density unique(wqp1$BiologicalIntentName) #examine what is in each of these categories wqptoxicity<-subset(wqp1, BiologicalIntentName == "Toxicity") #this is count data on aquatic inverts- keep in database wqpfreq<-subset(wqp1, BiologicalIntentName == "Frequency Class") #this is invertebrate count data- keep in database wqptissue<-subset(wqp1, BiologicalIntentName == "Targeted Sampling") #this is presence/absence data- need to remove records that have 'N' in ResultMeasureValue- subset by this next wqpIndividual<-subset(wqp1, BiologicalIntentName == "Individual") #this is invertebrate count data- keep in database wqpsummary<-subset(wqp1, BiologicalIntentName == "Group Summary")#some are counts and some are lengths and weights- keep in database, after this, will need to check CharacteristicName and ResultMeasure.MeasureUnitCode wqptiss<-subset(wqp1, BiologicalIntentName == "Tissue") #this is tissue samples from unidentified invertebrates- remove from database #now remove BiologicalIntentName="Tissue" wqp2<-subset(wqp1, (BiologicalIntentName != "Tissue")) #now remove records that have 'N' in ResultMeasureValue or NA wqp3<-subset(wqp2, (ResultMeasureValue != "N")) wqp4<-subset(wqp3, (ResultMeasureValue != "NA")) ####################Remove columns that only contain NAs################################################# wqp5<-wqp4[, colSums(is.na(wqp4)) != nrow(wqp4)] #how many records in WQP5? ######Remove records with NA in lat or long############################################################## wqp6<-subset(wqp5, wqp5$LatitudeMeasure != "NA") wqp7<-subset(wqp6, wqp6$LongitudeMeasure != "NA") dim(wqp7) #2,492,283 occurence records names(wqp7) unique(wqp7$MonitoringLocationIdentifier) # there are 35,428 sampling locations #Still need to create columns for each type of taxonomic group and remove non-target insect orders. Need to subset by streams/rivers for subsequent analyses #Possibly need to remove records that have errors in the geodatum- some are coded as 'UNKWN' for the Vertical and Horizontal coordinate reference system datum name #Exploring data some more unique(wqp7$MonitoringLocationTypeName) unique(wqp7$HorizontalCoordinateReferenceSystemDatumName) unique(wqp7$VerticalCoordinateReferenceSystemDatumName) #One type of vertical reference is given as SEALV- trying to determine what this datum is sealv<-subset(wqp7, VerticalCoordinateReferenceSystemDatumName=="SEALV") names(sealv) unique(sealv$OrganizationFormalName.x) #SEALV is probably mean sea level, likely equivalent to NGVD29 #One type of horizontal reference is given as OTHER- trying to determine what this datum is other<-subset(wqp7, HorizontalCoordinateReferenceSystemDatumName=="OTHER") unknown<-subset(wqp7, HorizontalCoordinateReferenceSystemDatumName=="UNKWN") ###############Dropping all records that do not have horizontal datum defined######################## wqp8<-subset(wqp7, (wqp7$HorizontalCoordinateReferenceSystemDatumName != "OTHER") & (wqp7$HorizontalCoordinateReferenceSystemDatumName !="UNKWN")) unique(wqp8$HorizontalCoordinateReferenceSystemDatumName) #how many records remain? write.csv(wqp8, "WQP_invertebrate_1979_2016_2.csv") #comprehensive WQP insect occurrence dataset without geo-referencing errors ###########Assign taxonomic names for each taxonomic level- from species to order to WQP data by creating a separate column for each taxonomic level##### #####read in the data#### wqp_insect<-read.csv("WQP_invertebrate_1979_2016_2.csv") #####create a list of unique taxa and get records using 'taxize' library###### utaxon<-unique(wqp_insect$SubjectTaxonomicName) #install 'taxize' library to extract taxonomic names install.packages('taxize') install.packages('jsonlite') install.packages('data.table') library(taxize) #can use this package to query phylogenetic databases #now practice extracting higher taxonomic names for a single species using the ITIS database classification("Enallagma boreale", db='itis') #that works, now need to extract it for all in the list of WQP taxa ###############extract higher taxonomic names for all taxa from WQP############################################# specieslist<-classification(utaxon, db='itis') class(specieslist) class(specieslist$`Rhyacophila pellisa`) specieslist$`Rhyacophila pellisa`$rank specieslist$`Rhyacophila pellisa`$name specieslist$`Cricotopus (Cricotopus)` test<-rbind(specieslist$`Rhyacophila pellisa`$name, specieslist$`Cricotopus (Cricotopus)`$name) #need to rbind all of these rankings and then rename column headers with the rank. Some taxa go to a lower taxonomic rank- e.g., some go to genus and some to species. #will need to replace species rankings with 'NA' for taxa that are not ID'd to species in WQP. After we have these, we add these columns for each taxa in the 'utaxon' list in WQP. #after assigning rankings, we can extract a list of all taxa with species or genus level records in WQP, and download the GBIF data for all of these taxa. ranks <- do.call("rbind", lapply(specieslist, "[[", 1)) #this extracts all of the "name" elements from each list in specieslist and rbinds them to result in a matrix with one column for #taxonomic name in WQP, and then a column for each higher rank from ITIS. The resultant matrix can be combined with WQP data ##do.call 'rbinds' or applies any other 'function' separately over each element within the list ##########Create a 'lookup' table that has the species name from WQP and the taxonomic name for each rank############################# ##########this extracts the names of lists, the first element- 'names', and the second element- 'ranks' from the list of lists and binds them into a dataframe id<-names(specieslist) names<-lapply(specieslist, '[[', 1) ranksnew1<-lapply(specieslist, '[', 2) ranksnew1 names(ranksnew1) <- NULL rank.test<-do.call(rbind, Map(data.frame, rank=ranksnew1, name=names, id=id)) #this is sort of what I want, except reshaped so taht each column is a separate rank #rename id to SubjectTaxonomicName library(plyr) rank.test3<-rename(rank.test, c("id"="SubjectTaxonomicName")) #writing this rank table to a csv file write.csv(rank.test3, file="taxonomic_ranks.csv") ###########Now, I need to reshape the rank.test3 database from long to wide format, creating a separate column for each taxonomic rank, before I #can combine this database with the WQP portal- use 'dcast' function for casting data frames- 'melt' function transforms data frame from wide to long format library(reshape2) ?dcast #rank is variable to swing into column names and name is the value, SubjectTaxonomicName is what we want to have its own row rank.test4<-dcast(rank.test3, SubjectTaxonomicName ~ rank, value.var="name") #this worked! Now can merge this table with the WQP databse #################Merge taxonomic rank lookup table with WQP database############################################################# wqp_insect2<-merge(wqp_insect, rank.test4, by="SubjectTaxonomicName", all=TRUE) #need to merge by more than one criterion? names(wqp_insect2) wqp_insect2$class #write resultant database to csv file write.csv(wqp_insect2, file="WQP_insect_w_ranks.csv") ################Subset records to insects only##################### insects<-read.csv("WQP_insect_w_ranks.csv", stringsAsFactors = FALSE) insects2<-subset(insects, class=="Insecta") #remove unneeded variables insects3<-insects2[,c(3:131)] #there are 1.9 million insect records names(insects3) write.csv(insects3, "insects2001v3.csv") #this is the database subset without extra columns with taxonomic information ###############Subset records to 2001 to present##################### insects4<-subset(insects3, ActivityStartDate>="2001-01-01") #1.2 million insect records from 2001 to present save.image(file="WQP_cleaning.RData")