---
title: "QUBES Lesson: Oysters and Water Quality in Virginia"
author: "Julia Josephs"
output: html_notebook
---

This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Cmd+Shift+Enter*.

```{r}
# load your libraries - if you don't have them, use the install.packages("")
# function to install them.
knitr::opts_chunk$set(echo = TRUE)
library(readr)
library(tidyverse)
library(lubridate)
library(ggplot2)
library(knitr)

# bring in the James River oyster data - all located in the lower James
oysterjames <- read.csv("LowerJamesOysters.csv")

# look at the data
oysterjames

# note that the data is in character form
summary(oysterjames)
```

```{r}
# manipulate the data - I will do it step by step and then all together

# separate out the columns you need - Year, Total
oysterjames1 <- oysterjames %>%
  select(Year, Total)
oysterjames1

# convert year and total to numeric from character
oysterjames2 <- oysterjames1 %>%
  mutate(across(where(is.character), as.numeric))
oysterjames2

# arrange by year
oysterjames3 <- oysterjames2 %>%
  arrange(Year)
oysterjames3

# get average Total for each year
# (every row of a year carries that year's average; the column is named
# directly in mutate() and the grouping is removed once it is no longer needed)
oysterjames4 <- oysterjames3 %>%
  group_by(Year) %>%
  mutate(average = mean(Total)) %>%
  ungroup() %>%
  select(Year, average)
oysterjames4
```

```{r}
# all of the steps above but all together
oystersjamesave <- oysterjames %>%
  select(Year, Total) %>%
  mutate(across(where(is.character), as.numeric)) %>%
  arrange(Year) %>%
  group_by(Year) %>%
  mutate(average = mean(Total)) %>%
  ungroup() %>%
  select(Year, average)
oystersjamesave

# get rid of duplicate rows so there is one row per year
oystersjamesave2 <- oystersjamesave %>%
  distinct(Year, average)
oystersjamesave2
```

```{r}
# look at a histogram of the data to see if it looks normal
hist(oystersjamesave2$Year)
hist(oystersjamesave2$average)

# the year data looks normally distributed but the average data does not,
# so we can log it to transform it and make it more normal
hist(log(oystersjamesave2$average))
```

```{r}
# plot data to visualize how the number of oysters changes through time
# We are going to create two scatterplots in order to visually see the
# relationship between the two variables, total oysters and year. We must do
# this before running the other statistical tests because if the plot does not
# show any increasing or decreasing trends, a linear regression model will not
# be useful.
# (the de-duplicated table is used so each year is drawn once, like every
# other plot in this notebook)
oystersjamesave2 %>%
  ggplot(aes(x = Year, y = average)) +
  geom_point() +
  expand_limits(x = 2020) +
  ggtitle("Average Total Oysters in the James River by Year") +
  ylab("Average Total Oysters") +
  geom_line()

# plot the data to visualize the relationship between oysters and year
ggplot(data = oystersjamesave2, aes(Year, average)) +
  geom_point() +
  labs(x = "Year", y = "Average Total Oysters") +
  geom_smooth(method = "lm", se = FALSE, color = "black", formula = y ~ x) +
  expand_limits(x = 2020)
```

```{r}
# run a pearson's correlation test to measure how strong a relationship is
# between two variables by looking at the correlation coefficient: 1 indicates
# a strong positive relationship, -1 indicates a strong negative relationship,
# and a result of zero indicates no relationship at all.
cor.test(oystersjamesave2$Year, oystersjamesave2$average)
# the correlation coefficient is 0.8215589, which is very close to 1 meaning
# there is a strong positive relationship between year and oysters

# make a linear regression model between year and oysters to find a
# relationship. Linear regression attempts to model the relationship between
# two variables by fitting a linear equation to observed data.
# The response (average) goes on the left of ~ and the predictor (Year) on the
# right, so the fitted line is the same one drawn by geom_smooth() above.
# R-squared and the p-value are the same whichever variable is the response.
fit.0 <- lm(average ~ Year, data = oystersjamesave2)
fit.0
summary(fit.0)
# because the p-value is low and the multiple R-squared is high, there is a
# good fit and a relationship does exist
```

```{r}
# bring in the lower James river water quality data
waterqualityjames <- read.csv("LowerJamesWaterQuality.csv")
waterqualityjames
```

```{r}
# we are going to start by looking at total nitrogen
# manipulate the data - filter out the parameter we need (total nitrogen),
# select only the columns we need, separate the date, make the data numeric,
# make the years go in chronological order, get the average total nitrogen
# per year
# (Parameter is dropped before the numeric conversion so its text values are
# not turned into NA with a coercion warning)
waterjamesTN <- waterqualityjames %>%
  filter(Parameter == "TN") %>%
  select(MeasureValue, SampleDate) %>%
  separate(SampleDate, c("month", "day", "year")) %>%
  mutate(across(where(is.character), as.numeric)) %>%
  arrange(year) %>%
  group_by(year) %>%
  mutate(average = mean(MeasureValue)) %>%
  ungroup() %>%
  select(Year = year, average)
waterjamesTN

# remove duplicate rows
waterjamesTN2 <- waterjamesTN %>%
  distinct(Year, average)
waterjamesTN2
```

```{r}
# plot total nitrogen data through time with labels and a title
waterjamesTN2 %>%
  ggplot(aes(x = Year, y = average)) +
  geom_point() +
  expand_limits(x = 2020) +
  ggtitle("Total Nitrogen in the James River by Year") +
  ylab("Average Total Nitrogen")

# plot the data to visualize the relationship between total nitrogen and year
ggplot(data = waterjamesTN2, aes(Year, average)) +
  geom_point() +
  labs(x = "Year", y = "Total Nitrogen") +
  geom_smooth(method = "lm", se = FALSE, color = "black", formula = y ~ x) +
  expand_limits(x = 2020)
```

```{r}
# run a pearson's correlation test to see if the variables year and total
# nitrogen are related
cor.test(waterjamesTN2$Year, waterjamesTN2$average)

# make a linear regression model to find a relationship between year and
# total nitrogen (total nitrogen is the response, year is the predictor)
fit.1.0 <- lm(average ~ Year, data = waterjamesTN2)
fit.1.0
summary(fit.1.0)
```

```{r}
# next we are going to look at chlorophyll-A (CHLA) in the James
# manipulate the data as before but with the parameter CHLA
# (only years from 2008 onward are kept)
waterjamesCHL <- waterqualityjames %>%
  filter(Parameter == "CHLA") %>%
  select(MeasureValue, SampleDate) %>%
  separate(SampleDate, c("month", "day", "year")) %>%
  mutate(across(where(is.character), as.numeric)) %>%
  filter(year >= 2008) %>%
  arrange(year) %>%
  group_by(year) %>%
  mutate(average = mean(MeasureValue)) %>%
  ungroup() %>%
  select(Year = year, average)
waterjamesCHL

# remove duplicate rows
waterjamesCHL2 <- waterjamesCHL %>%
  distinct(Year, average)
waterjamesCHL2
```

```{r}
# plot the data to visualize how it changes through time
waterjamesCHL2 %>%
  ggplot(aes(x = Year, y = average)) +
  geom_point() +
  ggtitle("Chlorophyll-A in the James River by Year") +
  ylab("Chlorophyll-A") +
  expand_limits(x = 2020)

# plot the data to visualize the relationship between chlorophyll-A and year
ggplot(data = waterjamesCHL2, aes(Year, average)) +
  geom_point() +
  labs(x = "Year", y = "Total Chlorophyll-A") +
  geom_smooth(method = "lm", se = FALSE, color = "black", formula = y ~ x) +
  expand_limits(x = 2020)
```

```{r}
# run a pearson's correlation test to see if the variables year and
# chlorophyll-A are related
cor.test(waterjamesCHL2$Year, waterjamesCHL2$average)

# make a linear regression model to find a relationship between year and
# chlorophyll-A (chlorophyll-A is the response, year is the predictor)
fit.2 <- lm(average ~ Year, data = waterjamesCHL2)
fit.2
summary(fit.2)
```

```{r}
# next we are going to look at the total phosphorus (TP) in the James
# manipulate the data as before but with the parameter TP
waterjamesTP <- waterqualityjames %>%
  filter(Parameter == "TP") %>%
  select(MeasureValue, SampleDate) %>%
  separate(SampleDate, c("month", "day", "year")) %>%
  mutate(across(where(is.character), as.numeric)) %>%
  arrange(year) %>%
  group_by(year) %>%
  mutate(average = mean(MeasureValue)) %>%
  ungroup() %>%
  select(Year = year, average)
waterjamesTP

# remove duplicate rows
waterjamesTP2 <- waterjamesTP %>%
  distinct(Year, average)
waterjamesTP2
```

```{r}
# plot data through time
waterjamesTP2 %>%
  ggplot(aes(x = Year, y = average)) +
  geom_point() +
  expand_limits(x = 2020) +
  ggtitle("Total Phosphorus in the James River by Year") +
  ylab("Average Total Phosphorus")

# plot the data to visualize the relationship between TP and year
ggplot(data = waterjamesTP2, aes(Year, average)) +
  geom_point() +
  labs(x = "Year", y = "Total Phosphorus") +
  geom_smooth(method = "lm", se = FALSE, color = "black", formula = y ~ x) +
  expand_limits(x = 2020)
```

```{r}
# run a pearson's correlation test to see if the variables year and total
# phosphorus are related
cor.test(waterjamesTP2$Year, waterjamesTP2$average)

# make a linear regression model to find a relationship between year and
# total phosphorus (total phosphorus is the response, year is the predictor)
fit.3 <- lm(average ~ Year, data = waterjamesTP2)
fit.3
summary(fit.3)
```

```{r}
# the last parameter is turbidity in the James
# manipulate the data as before but with TURB_NTU
waterqualityjames

waterjamesTURB <- waterqualityjames %>%
  filter(Parameter == "TURB_NTU") %>%
  select(MeasureValue, SampleDate) %>%
  separate(SampleDate, c("month", "day", "year")) %>%
  mutate(across(where(is.character), as.numeric)) %>%
  arrange(year) %>%
  group_by(year) %>%
  mutate(average = mean(MeasureValue)) %>%
  ungroup() %>%
  select(Year = year, average)
waterjamesTURB

# remove duplicate rows
waterjamesTURB2 <- waterjamesTURB %>%
  distinct(Year, average)
waterjamesTURB2
```

```{r}
# let's look at total suspended solids (TSS) instead since turbidity is
# missing a lot of data

# bring in data
TSS <- read.csv("TSSlowerjames.csv")

waterjamesTSS <- TSS %>%
  filter(Parameter == "TSS") %>%
  select(MeasureValue, SampleDate) %>%
  separate(SampleDate, c("month", "day", "year")) %>%
  mutate(across(where(is.character), as.numeric)) %>%
  arrange(year) %>%
  group_by(year) %>%
  mutate(average = mean(MeasureValue)) %>%
  ungroup() %>%
  select(Year = year, average)
waterjamesTSS

# remove duplicate rows
waterjamesTSS2 <- waterjamesTSS %>%
  distinct(Year, average)
waterjamesTSS2
```

```{r}
# plot data through time
waterjamesTSS2 %>%
  ggplot(aes(x = Year, y = average)) +
  geom_point() +
  expand_limits(x = 2020) +
  ggtitle("Total Suspended Solids in the James River by Year") +
  ylab("Total Suspended Solids")

# plot the data to visualize the relationship between TSS and year
ggplot(data = waterjamesTSS2, aes(Year, average)) +
  geom_point() +
  labs(x = "Year", y = "Total Suspended Solids") +
  geom_smooth(method = "lm", se = FALSE, color = "black", formula = y ~ x) +
  expand_limits(x = 2020)
```

```{r}
# run a pearson's correlation test to see if the variables year and total
# suspended solids are related
cor.test(waterjamesTSS2$Year, waterjamesTSS2$average)

# make a linear regression model to find a relationship between year and
# total suspended solids (TSS is the response, year is the predictor)
fit.4 <- lm(average ~ Year, data = waterjamesTSS2)
fit.4
summary(fit.4)
```

```{r}
# load in the Rappahannock oyster data
oysterrapp1 <- read_csv("LowerRappOysters.csv")

# look at the data
oysterrapp1

# note that the data is in character form
summary(oysterrapp1)
```

```{r}
# separate out the columns you need - Year, Total

# convert year and total to numeric from character

# arrange by year

# get average Total for each year

# remove duplicate rows

```

```{r}
# plot total oysters through time in the Rappahannock with labels and a title
# using GGPLOT

# plot the relationship between total average oysters and year

```

```{r}
# run a Pearson's correlation test between total average oysters and year

# make a linear regression model between total average oysters and year

```

```{r}
# get water quality data for RAPP
waterqualityrapp2 <- read_csv("LowerRappahannockWaterQuality.csv")
waterqualityrapp2
```

```{r}
# start with total nitrogen (TN)
# manipulate data - select only the columns you need, isolate TN, separate
# date, isolate years 2000 or greater, group by the year, get the average
# per year
# the Rappahannock oyster data starts in 2000, so we have to manipulate the
# water quality data to start in 2000 as well

# remove duplicate rows

```

```{r}
# plot total nitrogen data through time with labels and a title using GGPLOT

# plot the relationship between total nitrogen and year

# we will not proceed with the correlation test or the linear regression model

```

```{r}
# chlorophyll-A RAPP - CHLA
# manipulate data - select only the columns you need, isolate CHLA, separate
# date, isolate years 2000 or greater, group by the year, get the average
# per year

# remove duplicate rows

```

```{r}
# plot the total CHLA through time using GGPLOT

# plot the relationship between CHLA and year

```

```{r}
# run a correlation test between chlorophyll-A and year

# run a linear regression model between chlorophyll-A and year

```

```{r}
# total phosphorus RAPP (TP)
# manipulate data like before to get the year and average total per year

# remove duplicate rows

```

```{r}
# plot the total phosphorus through time using GGPLOT

# plot the relationship between total phosphorus and year

```

```{r}
# run a correlation test between total phosphorus and year

# run a linear regression model between total phosphorus and year

```

```{r}
# turbidity RAPP (TURB_NTU)
# manipulate data as before

# remove duplicate rows

```

```{r}
# plot turbidity through time using GGPLOT

# plot the relationship between turbidity and year

# we will not proceed with the correlation test or the linear regression model

```