---
title: "QUBES Lesson: Oysters and Water Quality in Virginia"
author: "Julia Josephs" 
output: html_notebook
---

This is an [R Markdown](http://rmarkdown.rstudio.com) Notebook. When you execute code within the notebook, the results appear beneath the code. 

Try executing this chunk by clicking the *Run* button within the chunk or by placing your cursor inside it and pressing *Cmd+Shift+Enter*. 

```{r}
# Load the libraries used in this lesson. If one is missing, install it first
# with install.packages("<package name>").
knitr::opts_chunk$set(echo = TRUE)
library(readr)
library(tidyverse)
library(lubridate)
library(ggplot2)
library(knitr)

# Bring in the James River oyster data - all located in the lower James.
oysterjames <- read.csv("LowerJamesOysters.csv")

# Look at the data (printed once so the notebook output is not duplicated).
oysterjames

# Note that the data is in character form.
summary(oysterjames)
```

```{r}
# Manipulate the data - first one step at a time (here), then all together in
# the next chunk. Each intermediate result is printed so it can be inspected.

# Step 1: keep only the columns we need - Year and Total.
oysterjames1 <- oysterjames %>%
  select(Year, Total)
oysterjames1

# Step 2: convert Year and Total from character to numeric.
oysterjames2 <- oysterjames1 %>%
  mutate_if(is.character, as.numeric)
oysterjames2

# Step 3: put the rows in chronological order.
oysterjames3 <- oysterjames2 %>%
  arrange(Year)
oysterjames3

# Step 4: get the average Total for each year. mutate() keeps every original
# row, so each row of a given year carries that year's average.
oysterjames4 <- oysterjames3 %>%
  group_by(Year) %>%
  mutate(average = mean(Total)) %>%
  select(Year, average)
oysterjames4
```

```{r}
# The same steps as above, chained into a single pipeline: select the columns,
# convert them to numeric, sort by year, and attach each year's average Total.
oystersjamesave <- oysterjames %>%
  select(Year, Total) %>%
  mutate_if(is.character, as.numeric) %>%
  arrange(Year) %>%
  group_by(Year) %>%
  mutate(average = mean(Total)) %>%
  select(Year, average)
oystersjamesave

# Every row of a year holds the same average, so drop the repeated rows and
# keep one row per Year/average pair.
oystersjamesave2 <- oystersjamesave %>%
  distinct(Year, average, .keep_all = TRUE)
oystersjamesave2
```

```{r}
# Look at histograms of the de-duplicated yearly data (one histogram for Year,
# one for the yearly average) to judge whether each variable looks normal.
hist(oystersjamesave2$Year)
hist(oystersjamesave2$average)

# In the lesson data the Year histogram looks normally distributed but the
# average histogram does not, so we take the natural log of the averages to
# transform them and make the distribution look more normal.
hist(log(oystersjamesave2$average))
```

```{r}
# Plot the data to visualize how the number of oysters changes through time.
# We create two scatterplots to see the relationship between the two variables,
# average total oysters and year. We do this before running the statistical
# tests because if the plot shows no increasing or decreasing trend, a linear
# regression model will not be useful.

# Plot 1: yearly averages joined by a line. The de-duplicated data frame is
# used so that each year is drawn exactly once, as in every other plot in this
# lesson. expand_limits() stretches the x axis out to 2020.
oystersjamesave2 %>%
  ggplot(aes(x = Year, y = average)) +
  geom_point() +
  expand_limits(x = 2020) +
  ggtitle("Average Total Oysters in the James River by Year") +
  ylab("Average Total Oysters") +
  geom_line()

# Plot 2: the same points with a fitted straight line (no confidence band) to
# visualize the relationship between oysters and year.
ggplot(data = oystersjamesave2, aes(Year, average)) +
  geom_point() +
  labs(x = "Year", y = "Average Total Oysters") +
  geom_smooth(method = "lm", se = FALSE, color = "black", formula = y ~ x) +
  expand_limits(x = 2020)
```


```{r}
# Run a Pearson's correlation test to measure how strong the relationship is
# between two variables by looking at the correlation coefficient: 1 indicates
# a strong positive relationship, -1 indicates a strong negative relationship,
# and 0 indicates no relationship at all.
cor.test(oystersjamesave2$Year, oystersjamesave2$average)
# The correlation coefficient is 0.8215589, which is very close to 1, meaning
# there is a strong positive relationship between year and oysters.

# Make a linear regression model between year and oysters. Linear regression
# models the relationship between two variables by fitting a linear equation
# to the observed data. The yearly average is the response and Year is the
# predictor, matching the trend line drawn in the scatterplot (y = average,
# x = Year), so the slope reads as the change in average oysters per year.
fit.0 <- lm(average ~ Year, data = oystersjamesave2)
fit.0
summary(fit.0)
# Because the p-value is low and the multiple R-squared is high, there is a
# good fit and a relationship does exist.
```


```{r}
# Read the lower James River water quality measurements from the CSV file and
# display them.
waterqualityjames <- read.csv(file = "LowerJamesWaterQuality.csv")
waterqualityjames
```

```{r}
# We start by looking at total nitrogen (TN).
# Manipulate the data:
#   1. keep only the total nitrogen rows,
#   2. keep only the columns we still need (the measurement and its date);
#      Parameter is dropped here so the numeric conversion below does not try
#      to turn its text labels into numbers (which would give NA and a warning),
#   3. split the date into month, day and year
#      (assumes SampleDate is written month/day/year - confirm in the CSV),
#   4. make the remaining character columns numeric,
#   5. put the years in chronological order,
#   6. attach the average total nitrogen of each year to every row of that year.
waterjamesTN <- waterqualityjames %>%
  filter(Parameter == "TN") %>%
  select(MeasureValue, SampleDate) %>%
  separate(SampleDate, c("month", "day", "year")) %>%
  mutate_if(is.character, as.numeric) %>%
  arrange(year) %>%
  group_by(year) %>%
  mutate(average = mean(MeasureValue)) %>%
  select(year, average) %>%
  rename(Year = year)
waterjamesTN

# Remove duplicate rows so that only one row per year remains.
waterjamesTN2 <- waterjamesTN %>%
  distinct(Year, average, .keep_all = TRUE)
waterjamesTN2
```

```{r}
# Scatterplot of the yearly average total nitrogen through time, with a title
# and a y-axis label; the x axis is stretched out to 2020.
ggplot(waterjamesTN2, aes(x = Year, y = average)) +
  geom_point() +
  expand_limits(x = 2020) +
  labs(
    title = "Total Nitrogen in the James River by Year",
    y = "Average Total Nitrogen"
  )

# Same points with a fitted straight line (no confidence band) to visualize
# the relationship between total nitrogen and year.
waterjamesTN2 %>%
  ggplot(aes(x = Year, y = average)) +
  geom_point() +
  xlab("Year") +
  ylab("Total Nitrogen") +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "black") +
  expand_limits(x = 2020)
```

```{r}
# Run a Pearson's correlation test to see if the variables year and total
# nitrogen are related.
cor.test(waterjamesTN2$Year, waterjamesTN2$average)

# Make a linear regression model to find a relationship between year and total
# nitrogen. The yearly average is the response and Year is the predictor,
# matching the trend line in the scatterplot (y = average, x = Year), so the
# slope reads as the change in average total nitrogen per year.
fit.1.0 <- lm(average ~ Year, data = waterjamesTN2)
fit.1.0
summary(fit.1.0)
```


```{r}
# Next we look at chlorophyll-A (CHLA) in the James.
# Manipulate the data as before but with the parameter CHLA:
#   - keep only the CHLA rows, then only the measurement and its date
#     (Parameter is dropped before the numeric conversion so its text labels
#     are not coerced to NA with a warning),
#   - split the date into month, day and year
#     (assumes SampleDate is written month/day/year - confirm in the CSV),
#   - make the remaining character columns numeric,
#   - keep only the years from 2008 onward,
#   - sort by year and attach each year's average to its rows.
waterjamesCHL <- waterqualityjames %>%
  filter(Parameter == "CHLA") %>%
  select(MeasureValue, SampleDate) %>%
  separate(SampleDate, c("month", "day", "year")) %>%
  mutate_if(is.character, as.numeric) %>%
  filter(year >= 2008) %>%
  arrange(year) %>%
  group_by(year) %>%
  mutate(average = mean(MeasureValue)) %>%
  select(year, average) %>%
  rename(Year = year)
waterjamesCHL

# Remove duplicate rows so that only one row per year remains.
waterjamesCHL2 <- waterjamesCHL %>%
  distinct(Year, average, .keep_all = TRUE)
waterjamesCHL2
```

```{r}
# Scatterplot of the yearly average chlorophyll-A to see how it changes
# through time; the x axis is stretched out to 2020.
ggplot(waterjamesCHL2, aes(x = Year, y = average)) +
  geom_point() +
  labs(
    title = "Chlorophyll-A in the James River by Year",
    y = "Chlorophyll-A"
  ) +
  expand_limits(x = 2020)

# Same points with a fitted straight line (no confidence band) to visualize
# the relationship between chlorophyll-A and year.
waterjamesCHL2 %>%
  ggplot(aes(x = Year, y = average)) +
  geom_point() +
  xlab("Year") +
  ylab("Total Chlorophyll-A") +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "black") +
  expand_limits(x = 2020)
```
```{r}
# Run a Pearson's correlation test to see if the variables year and
# chlorophyll-A are related.
cor.test(waterjamesCHL2$Year, waterjamesCHL2$average)

# Make a linear regression model to find a relationship between year and
# chlorophyll-A. The yearly average is the response and Year is the predictor,
# matching the trend line in the scatterplot (y = average, x = Year), so the
# slope reads as the change in average chlorophyll-A per year.
fit.2 <- lm(average ~ Year, data = waterjamesCHL2)
fit.2
summary(fit.2)
```


```{r}
# Next we look at the total phosphorus (TP) in the James.
# Manipulate the data as before but with the parameter TP:
#   - keep only the TP rows, then only the measurement and its date
#     (Parameter is dropped before the numeric conversion so its text labels
#     are not coerced to NA with a warning),
#   - split the date into month, day and year
#     (assumes SampleDate is written month/day/year - confirm in the CSV),
#   - make the remaining character columns numeric,
#   - sort by year and attach each year's average to its rows.
waterjamesTP <- waterqualityjames %>%
  filter(Parameter == "TP") %>%
  select(MeasureValue, SampleDate) %>%
  separate(SampleDate, c("month", "day", "year")) %>%
  mutate_if(is.character, as.numeric) %>%
  arrange(year) %>%
  group_by(year) %>%
  mutate(average = mean(MeasureValue)) %>%
  select(year, average) %>%
  rename(Year = year)
waterjamesTP

# Remove duplicate rows so that only one row per year remains.
waterjamesTP2 <- waterjamesTP %>%
  distinct(Year, average, .keep_all = TRUE)
waterjamesTP2
```


```{r}
# Scatterplot of the yearly average total phosphorus through time, with a
# title and a y-axis label; the x axis is stretched out to 2020.
ggplot(waterjamesTP2, aes(x = Year, y = average)) +
  geom_point() +
  expand_limits(x = 2020) +
  labs(
    title = "Total Phosphorus in the James River by Year",
    y = "Average Total Phosphorus"
  )

# Same points with a fitted straight line (no confidence band) to visualize
# the relationship between TP and year.
waterjamesTP2 %>%
  ggplot(aes(x = Year, y = average)) +
  geom_point() +
  xlab("Year") +
  ylab("Total Phosphorus") +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "black") +
  expand_limits(x = 2020)

```
```{r}
# Run a Pearson's correlation test to see if the variables year and total
# phosphorus are related.
cor.test(waterjamesTP2$Year, waterjamesTP2$average)

# Make a linear regression model to find a relationship between year and total
# phosphorus. The yearly average is the response and Year is the predictor,
# matching the trend line in the scatterplot (y = average, x = Year), so the
# slope reads as the change in average total phosphorus per year.
fit.3 <- lm(average ~ Year, data = waterjamesTP2)
fit.3
summary(fit.3)
```


```{r}
# The last parameter is turbidity in the James.
# Manipulate the data as before but with the parameter TURB_NTU:
#   - keep only the TURB_NTU rows, then only the measurement and its date
#     (Parameter is dropped before the numeric conversion so its text labels
#     are not coerced to NA with a warning),
#   - split the date into month, day and year
#     (assumes SampleDate is written month/day/year - confirm in the CSV),
#   - make the remaining character columns numeric,
#   - sort by year and attach each year's average to its rows.
waterjamesTURB <- waterqualityjames %>%
  filter(Parameter == "TURB_NTU") %>%
  select(MeasureValue, SampleDate) %>%
  separate(SampleDate, c("month", "day", "year")) %>%
  mutate_if(is.character, as.numeric) %>%
  arrange(year) %>%
  group_by(year) %>%
  mutate(average = mean(MeasureValue)) %>%
  select(year, average) %>%
  rename(Year = year)
waterjamesTURB

# Remove duplicate rows so that only one row per year remains.
waterjamesTURB2 <- waterjamesTURB %>%
  distinct(Year, average, .keep_all = TRUE)
waterjamesTURB2
```


```{r}
# Let's look at total suspended solids (TSS) instead, since turbidity is
# missing a lot of data.
# Bring in the data.
TSS <- read.csv("TSSlowerjames.csv")

# Manipulate the data as before but with the parameter TSS:
#   - keep only the TSS rows, then only the measurement and its date
#     (Parameter is dropped before the numeric conversion so its text labels
#     are not coerced to NA with a warning),
#   - split the date into month, day and year
#     (assumes SampleDate is written month/day/year - confirm in the CSV),
#   - make the remaining character columns numeric,
#   - sort by year and attach each year's average to its rows.
waterjamesTSS <- TSS %>%
  filter(Parameter == "TSS") %>%
  select(MeasureValue, SampleDate) %>%
  separate(SampleDate, c("month", "day", "year")) %>%
  mutate_if(is.character, as.numeric) %>%
  arrange(year) %>%
  group_by(year) %>%
  mutate(average = mean(MeasureValue)) %>%
  select(year, average) %>%
  rename(Year = year)
waterjamesTSS

# Remove duplicate rows so that only one row per year remains.
waterjamesTSS2 <- waterjamesTSS %>%
  distinct(Year, average, .keep_all = TRUE)
waterjamesTSS2
```

```{r}
# Scatterplot of the yearly average total suspended solids through time, with
# a title and a y-axis label; the x axis is stretched out to 2020.
ggplot(waterjamesTSS2, aes(x = Year, y = average)) +
  geom_point() +
  expand_limits(x = 2020) +
  labs(
    title = "Total Suspended Solids in the James River by Year",
    y = "Total Suspended Solids"
  )

# Same points with a fitted straight line (no confidence band) to visualize
# the relationship between TSS and year.
waterjamesTSS2 %>%
  ggplot(aes(x = Year, y = average)) +
  geom_point() +
  xlab("Year") +
  ylab("Total Suspended Solids") +
  geom_smooth(method = "lm", formula = y ~ x, se = FALSE, color = "black") +
  expand_limits(x = 2020)
```

```{r}
# Run a Pearson's correlation test to see if the variables year and total
# suspended solids are related.
cor.test(waterjamesTSS2$Year, waterjamesTSS2$average)

# Make a linear regression model to find a relationship between year and total
# suspended solids. The yearly average is the response and Year is the
# predictor, matching the trend line in the scatterplot (y = average,
# x = Year), so the slope reads as the change in average TSS per year.
fit.4 <- lm(average ~ Year, data = waterjamesTSS2)
fit.4
summary(fit.4)
```

```{r}
# Read the lower Rappahannock oyster data from its CSV file.
oysterrapp1 <- read_csv(file = "LowerRappOysters.csv")

# Display the data to inspect it.
oysterrapp1

# Summarize every column; note that the data is in character form.
summary(oysterrapp1)
```

```{r}
#separate out the columns you need - Year, Total


#convert year and total to numeric from character


#arrange by year 


#get average Total for each year


#remove duplicate rows 

```

```{r}
 #plot total oysters through time in the Rappahannock with labels and a title using GGPLOT


#plot the relationship between total average oysters and year 

```

```{r}
#run a Pearson's correlation test between total average oysters and year 


#make a linear regression model between total average oysters and year 

```


```{r}
# Read the lower Rappahannock water quality data from its CSV file and
# display it.
waterqualityrapp2 <- read_csv(file = "LowerRappahannockWaterQuality.csv")
waterqualityrapp2
```

```{r}
#start with total nitrogen (TN)
#manipulate data -  select only the columns you need, isolate TN, separate date, isolate years 2000 or greater, group by the year, get the average per year
#the Rappahannock oyster data starts in 2000, so we have to manipulate the water quality data to start in 2000 as well 


#remove duplicate rows 

```

```{r}
#plot total nitrogen data through time with labels and a title using GGPLOT 


#plot the relationship between total nitrogen and year


#we will not proceed with the correlation test or the linear regression model 
```


```{r}
#chlorophyll-A RAPP - CHLA
#manipulate data -  select only the columns you need, isolate CHLA, separate date, isolate years 2000 or greater, group by the year, get the average per year


#remove duplicate rows 

```

```{r}
#plot the total CHLA through time using GGPLOT


#plot the relationship between CHLA and year 

```

```{r}
#run a correlation test between chlorophyll-A and year 


#run a linear regression model between chlorophyll-A and year
```

```{r}
#total phosphorus RAPP (TP)
#manipulate data like before to get the year and average total per year 


#remove duplicate rows 
```

```{r}
#plot the total phosphorus through time using GGPLOT


#plot the relationship between total phosphorus and year 

```

```{r}
#run a correlation test between total phosphorus and year 


#run a linear regression model between total phosphorus and year
```

```{r}
#turbidity RAPP (TURB_NTU)
#manipulate data as before 


#remove duplicate rows 
```

```{r}
#plot turbidity through time using GGPLOT


#plot the relationship between turbidity and year 

#we will not proceed with the correlation test or the linear regression model 
```