--- title: "QUBES Lesson: Oysters and Water Quality in Virginia" author: "Julia Josephs" output: html_notebook --- This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Cmd+Shift+Enter*. ```{r} #load your libraries - if you don't have them use the install.packages("") function to install them. knitr::opts_chunk$set(echo = TRUE) library(readr) library(tidyverse) library(lubridate) library(ggplot2) library(knitr) #bring in the James River oyster data - all located in the lower James oysterjames <- read.csv("LowerJamesOysters.csv") #look at the data oysterjames #note that the data is in character form summary(oysterjames) ``` ```{r} #manipulate the data using tidyverse- I will do it step by step and then all together #separate out the columns you need - Year, Total oysterjames%>% select(Year, Total) -> oysterjames1 oysterjames1 #convert year and total to numeric from character oysterjames1 %>% mutate_if(is.character,as.numeric)-> oysterjames2 oysterjames2 #arrange by year oysterjames2%>% arrange(Year) -> oysterjames3 oysterjames3 #get average Total for each year oysterjames3%>% group_by(Year)%>% mutate(mean(Total))%>% rename("average" = "mean(Total)")%>% select(Year, average)-> oysterjames4 oysterjames4 ``` ```{r} #all of the steps above but all together oysterjames%>% select(Year, Total)%>% mutate_if(is.character,as.numeric)%>% arrange(Year)%>% group_by(Year)%>% mutate(mean(Total))%>% rename("average" = "mean(Total)")%>% select(Year, average)-> oystersjamesave oystersjamesave #get rid of duplicate rows oystersjamesave%>% distinct(Year, average, .keep_all = TRUE)->oystersjamesave2 oystersjamesave2 ``` ```{r} #look at a histogram of the data to see if it looks normal hist(oystersjamesave2$Year) hist(oystersjamesave2$average) #the year data looks normally distributed but the average data does not so we can log it to transform it and make it more normal hist(log(oystersjamesave2$average)) ``` ```{r} #plot data to visualize how the number of oysters changes through time #We are going to create two scatterplots in order to visually see the relationship between the two variables, total oysters and year. We must do this before running the other statistical tests because if the plot does not show any increasing or decreasing trends, a linear regression model will not be useful. oystersjamesave%>% ggplot(aes(x=Year, y=average))+ geom_point()+ expand_limits(x = 2020)+ ggtitle("Average Total Oysters in the James River by Year")+ ylab("Average Total Oysters")+ geom_line() #plot the data to visualize the relationship between oysters and year ggplot(data=oystersjamesave2, aes(Year, average)) + geom_point()+ labs(x="Year", y = "Average Total Oysters")+ geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x)+ expand_limits(x=2020) ``` ```{r} #run a pearson's correlation test to measure how strong a relationship is between two variables by looking at the correlation coefficient: 1 indicates a strong positive relationship, -1 indicates a strong negative relationship, and a result of zero indicates no relationship at all. cor.test(oystersjamesave2$Year, oystersjamesave2$average) #the correlation coefficient is 0.8215589, which is very close to 1 meaning there is a strong positive relationship between year and oysters #make a linear regression model to find a relationship between year and oysters, Linear regression attempts to model the relationship between two variables by fitting a linear equation to observed data. fit.0 <- lm(Year ~ average, data=oystersjamesave2) fit.0 summary(fit.0) #because the p-value is low and the multiple R-squared is high, there is a good fit and a relationship does exist ``` ```{r} #bring in the lower James river water quality data waterqualityjames <- read.csv("LowerJamesWaterQuality.csv") waterqualityjames ``` ```{r} #we are going to start by looking at total nitrogen #manipulate the data- select only the columns we need, filter out the parameter we need, total nitrogen, separate the date, make the data numeric, make the years go in chronological order, get the average total nitrogen per year waterqualityjames%>% select(Parameter, MeasureValue, SampleDate)%>% filter(Parameter=="TN")%>% separate(SampleDate, c("month", "day", "year"))%>% mutate_if(is.character,as.numeric)%>% arrange(year)%>% group_by(year)%>% mutate(mean(MeasureValue))%>% rename("average" = "mean(MeasureValue)")%>% select(year, average)%>% rename(Year=year)-> waterjamesTN waterjamesTN #remove duplicate rows waterjamesTN%>% distinct(Year, average, .keep_all = TRUE)->waterjamesTN2 waterjamesTN2 ``` ```{r} #plot total nitrogen data through time with labels and a title waterjamesTN2%>% ggplot(aes(x=Year, y= average)) + geom_point()+ expand_limits(x=2020)+ ggtitle("Total Nitrogen in the James River by Year")+ ylab("Average Total Nitrogen") #plot the data to visualize the relationship between total nitrogen and year ggplot(data=waterjamesTN2, aes(Year, average)) + geom_point()+ labs(x="Year", y = "Total Nitrogen")+ geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x)+ expand_limits(x=2020) ``` ```{r} #run a pearson's correlation test to see if the variables year and total nitrogen are related cor.test(waterjamesTN2$Year, waterjamesTN2$average) #make a linear regression model to find a relationship between year and total nitrogen fit.1.0 <- lm(Year ~ average, data=waterjamesTN2) fit.1.0 summary(fit.1.0) ``` ```{r} #next we are going to look at chlorophyll-A in the James #manipulate the data as before but with the parameter CHLA waterqualityjames%>% select(Parameter, MeasureValue, SampleDate)%>% filter(Parameter=="CHLA")%>% separate(SampleDate, c("month", "day", "year"))%>% mutate_if(is.character,as.numeric)%>% filter(year>=2008)%>% arrange(year)%>% group_by(year)%>% mutate(mean(MeasureValue))%>% rename("average" = "mean(MeasureValue)")%>% select(year, average)%>% rename(Year=year)-> waterjamesCHL waterjamesCHL #remove duplicate rows waterjamesCHL%>% distinct(Year, average, .keep_all = TRUE)->waterjamesCHL2 waterjamesCHL2 ``` ```{r} #plot the data to visualize how it changes through time waterjamesCHL2%>% ggplot(aes(x=Year, y= average)) + geom_point()+ ggtitle("Chlorophyll-A in the James River by Year")+ ylab("Chlorophyll-A")+ expand_limits(x=2020) #plot the data to visualize the relationship between chlorophyll-A and year ggplot(data=waterjamesCHL2, aes(Year, average)) + geom_point()+ labs(x="Year", y = "Total Chlorophyll-A")+ geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x)+ expand_limits(x=2020) ``` ```{r} #run a pearson's correlation test to see if the variables year and chlorophyll-A are related cor.test(waterjamesCHL2$Year, waterjamesCHL2$average) #make a linear regression model to find a relationship between year and chlorophyll-A fit.2 <- lm(Year ~ average, data=waterjamesCHL2) fit.2 summary(fit.2) ``` ```{r} #next is total phosphorus in the James #manipulate the data as before but with TP waterqualityjames%>% select(Parameter, MeasureValue, SampleDate)%>% filter(Parameter=="TP")%>% separate(SampleDate, c("month", "day", "year"))%>% mutate_if(is.character,as.numeric)%>% arrange(year)%>% group_by(year)%>% mutate(mean(MeasureValue))%>% rename("average" = "mean(MeasureValue)")%>% select(year, average)%>% rename(Year=year)-> waterjamesTP waterjamesTP #remove duplicate rows waterjamesTP%>% distinct(Year, average, .keep_all = TRUE)->waterjamesTP2 waterjamesTP2 ``` ```{r} #plot data waterjamesTP2%>% ggplot(aes(x=Year, y= average)) + geom_point()+ expand_limits(x=2020)+ ggtitle("Total Phosphorus in the James River by Year")+ ylab("Average Total Phosphorus") #plot the data to visualize the relationship between TP and year ggplot(data=waterjamesTP2, aes(Year, average)) + geom_point()+ labs(x="Year", y = "Total Phosphorus")+ geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x)+ expand_limits(x=2020) ``` ```{r} #run a pearson's correlation test to see if the variables year and total phosphorus are related cor.test(waterjamesTP2$Year, waterjamesTP2$average) #make a linear regression model to find a relationship between year and total phosphorus fit.3 <- lm(Year ~ average, data=waterjamesTP2) fit.3 summary(fit.3) ``` ```{r} #last is turbidity in the james #manipulate the data as before but with TURB_NTU waterqualityjames waterqualityjames%>% select(Parameter, MeasureValue, SampleDate)%>% filter(Parameter=="TURB_NTU")%>% separate(SampleDate, c("month", "day", "year"))%>% mutate_if(is.character,as.numeric)%>% arrange(year)%>% group_by(year)%>% mutate(mean(MeasureValue))%>% rename("average" = "mean(MeasureValue)")%>% select(year, average)%>% rename(Year=year)-> waterjamesTURB waterjamesTURB #remove duplicate rows waterjamesTURB%>% distinct(Year, average, .keep_all = TRUE)->waterjamesTURB2 waterjamesTURB2 ``` ```{r} #lets look at suspended solids instead since turbidity is missing a lot of data TSS <- read.csv("TSSlowerjames.csv") TSS%>% select(Parameter, MeasureValue, SampleDate)%>% filter(Parameter=="TSS")%>% separate(SampleDate, c("month", "day", "year"))%>% mutate_if(is.character,as.numeric)%>% arrange(year)%>% group_by(year)%>% mutate(mean(MeasureValue))%>% rename("average" = "mean(MeasureValue)")%>% select(year, average)%>% rename(Year=year)-> waterjamesTSS waterjamesTSS waterjamesTSS%>% distinct(Year, average, .keep_all = TRUE)->waterjamesTSS2 waterjamesTSS2 ``` ```{r} #plot data waterjamesTSS2%>% ggplot(aes(x=Year, y= average)) + geom_point()+ expand_limits(x=2020)+ ggtitle("Total Suspended Solids in the James River by Year")+ ylab("Total Suspended Solids") #plot the data to visualize the relationship between TTS and year ggplot(data=waterjamesTSS2, aes(Year, average)) + geom_point()+ labs(x="Year", y = "Total Suspended Solids")+ geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x)+ expand_limits(x=2020) ``` ```{r} #run a pearson's correlation test to see if the variables year and total suspended solids are related cor.test(waterjamesTSS2$Year, waterjamesTSS2$average) #make a linear regression model to find a relationship between year and total suspended solids fit.4 <- lm(Year ~ average, data=waterjamesTSS2) fit.4 summary(fit.4) ``` ```{r} #this is where the students'.RMD file becomes different - they will not have this code, they will have to do it on their own for homework #load in the Rappahannock oyster data oysterrapp1 <- read_csv("LowerRappOysters.csv") #look at the data oysterrapp1 #note that the data is in character form summary(oysterrapp1) ``` ```{r} #separate out the columns you need - Year, Total #convert year and total to numeric from character #arrange by year #get average Total for each year oysterrapp1%>% select(Year, Total)%>% mutate_if(is.character,as.numeric)%>% arrange(Year)%>% group_by(Year)%>% mutate(mean(Total))%>% rename("average" = "mean(Total)")%>% select(Year, average)-> oystersaverage2 oystersaverage2 #remove duplicate rows oystersaverage2%>% distinct(Year, average, .keep_all = TRUE)->oystersrappave3 oystersrappave3 ``` ```{r} #plot total oysters through time oystersrappave3%>% ggplot(aes(x=Year, y=average))+ geom_point()+ expand_limits(x = 2020)+ ggtitle("Average Total Oysters in the Rappahannock by Year")+ ylab("Average Total Oysters")+ geom_line() #plot total oysters in relation to year ggplot(data=oystersrappave3, aes(Year, average)) + geom_point()+ labs(x="Year", y = "Total Average Oysters")+ geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) ``` ```{r} #correlation test cor.test(oystersrappave3$Year, oystersrappave3$average) #linear regression fit.5 <- lm(Year ~ average, data=oystersrappave3) fit.5 summary(fit.5) ``` ```{r} #get water quality data for RAPP waterqualityrapp2 <- read_csv("LowerRappahannockWaterQuality.csv") waterqualityrapp2 ``` ```{r} #first we are going to look at total nitrogen #the Rappahannock oyster data starts in 2000, so we have to manipulate the water quality data to start in 2000 as well waterqualityrapp2%>% select(Parameter, MeasureValue, SampleDate)%>% filter(Parameter=="TN")%>% separate(SampleDate, c("month", "day", "year"))%>% filter(year>=2000)%>% mutate_if(is.character,as.numeric)%>% arrange(year)%>% group_by(year)%>% mutate(mean(MeasureValue))%>% rename("average" = "mean(MeasureValue)")%>% select(year, average)%>% rename(Year=year)-> waterrappTN waterrappTN #remove duplicate rows waterrappTN%>% distinct(Year, average, .keep_all = TRUE)->waterrappTN2 waterrappTN2 ``` ```{r} #let's take a look at the data hist(oystersaverage$average) hist(waterrappTN$average) #data does not look normally distributed, so we are going to log it to transform it hist(log(oystersaverage$average, 10), main = NULL, xlab="Logged Average Total Oysters") hist(log(waterrappTN$average, 10), main = NULL, xlab="Logged Average Total Nitrogen") ``` ```{r} #plot total nitrogen data through time with labels and a title waterrappTN2%>% ggplot(aes(x=Year, y= average)) + geom_point()+ expand_limits(x=2020)+ ggtitle("Average Total Nitrogen in the Rappahannock by Year")+ ylab("Average Total Nitrogen") #plot the relationship between TN and year ggplot(data=waterrappTN2, aes(Year, average)) + geom_point()+ labs(x="Year", y = "Total Nitrogen")+ geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) #since there is no visible trend, the line is not increasing or decreasing, we will not proceed with the correlation test or the linear regression model because we cannot visually establish a relationship between the variables. ``` ```{r} #chlorophyll-A RAPP #manipulate data waterqualityrapp2%>% select(Parameter, MeasureValue, SampleDate)%>% filter(Parameter=="CHLA")%>% separate(SampleDate, c("month", "day", "year"))%>% filter(year>=2000)%>% mutate_if(is.character,as.numeric)%>% arrange(year)%>% group_by(year)%>% mutate(mean(MeasureValue))%>% rename("average" = "mean(MeasureValue)")%>% select(year, average)%>% rename(Year=year)-> waterrappCHL waterrappCHL #remove duplicate rows waterrappCHL%>% distinct(Year, average, .keep_all = TRUE)->waterrappCHL2 waterrappCHL2 ``` ```{r} #plot data through time waterrappCHL2%>% ggplot(aes(x=Year, y= average)) + geom_point()+ expand_limits(x=2020)+ ggtitle("Total Chlorophyll-A in the Rappahannock by Year")+ ylab("Total Chlorophyll-A") #plot the relationship between CHL and year ggplot(data=waterrappCHL2, aes(Year, average)) + geom_point()+ labs(x="Year", y = "Total Chlorophyll-A")+ geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) ``` ```{r} #correlation test cor.test(waterrappCHL2$Year, waterrappCHL2$average) #linear regression fit.7 <- lm(Year ~ average, data=waterrappCHL2) fit.7 summary(fit.7) ``` ```{r} #total phosphorus RAPP waterqualityrapp2%>% select(Parameter, MeasureValue, SampleDate)%>% filter(Parameter=="TP")%>% separate(SampleDate, c("month", "day", "year"))%>% filter(year>=2000)%>% mutate_if(is.character,as.numeric)%>% arrange(year)%>% group_by(year)%>% mutate(mean(MeasureValue))%>% rename("average" = "mean(MeasureValue)")%>% select(year, average)%>% rename(Year=year)-> waterrappTP waterrappTP #remove duplicate rows waterrappTP%>% distinct(Year, average, .keep_all = TRUE)->waterrappTP2 waterrappTP2 #look at the data distribution hist(waterrappTP$average) hist(log(waterrappTP$average, 10), main = NULL, xlab="Logged Average Total Phosphorus") ``` ```{r} #plot the data through time waterrappTP2%>% ggplot(aes(x=Year, y= average)) + geom_point()+ expand_limits(x=2020)+ ggtitle("Average Total Phosphorus in the Rappahannock by Year")+ ylab("Average Total Phosphorus") #plot the relationship between TP and year ggplot(data=waterrappTP2, aes(Year, average)) + geom_point()+ labs(x="Year", y = "Total Phosphorus")+ geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) ``` ```{r} #correlation test cor.test(waterrappTP2$Year, waterrappTP2$average) #linear regression fit.8 <- lm(Year ~ average, data=waterrappTP2) fit.8 summary(fit.8) ``` ```{r} #turbitidy RAPP #TURB_NTU waterqualityrapp2%>% select(Parameter, MeasureValue, SampleDate)%>% filter(Parameter=="TURB_NTU")%>% separate(SampleDate, c("month", "day", "year"))%>% filter(year>=2000)%>% mutate_if(is.character,as.numeric)%>% arrange(year)%>% group_by(year)%>% mutate(mean(MeasureValue))%>% rename("average" = "mean(MeasureValue)")%>% select(year, average)%>% rename(Year=year)-> waterrappTURB waterrappTURB #remove duplicate rows waterrappTURB%>% distinct(Year, average, .keep_all = TRUE)->waterrappTURB2 waterrappTURB2 ``` ```{r} #plot turbidity through time to visualize with title and labels waterrappTURB2%>% ggplot(aes(x=Year, y= average)) + geom_point()+ ggtitle("Average Turbidity in the Rappahannock by Year")+ ylab("Average Turbidity") #plot the relationship between Turbididy and year ggplot(data=waterrappTURB2, aes(Year, average)) + geom_point()+ labs(x="Year", y = "Total Turbidity")+ geom_smooth(method = "lm", se=FALSE, color="black", formula = y ~ x) #since there is no visible trend, the line is not increasing or decreasing, we will not proceed with the correlation test or the linear regression model because we cannot visually establish a relationship between the variables. ```