#Statistics with epidemiology of COVID-19
#Maria Shumskaya, Shakira Benjamin, Matthew G. Niepielko, Nicholas Lorusso. Kean University, NJ, USA.
#2021


rm(list= ls())
################################# REQUIREMENTS ######################################################
# Programs: RStudio 
# R packages: ggplot2, dplyr, stringr, maps, mapdata
# Files: covid19Data.csv file. 
###############################   DATA SOURCE   #####################################################
#The dataset in covid19Data.csv is cleaned and extensively modified for teaching and is based on the original data collected 
#by Genesis Laboratory Management, NJ, USA. Modifications included deleting of patients ID and 
# some other personal information, removing inconclusive data and other information unrelated to this module.
#For other than teaching purposes, the original data must be requested from Genesis Laboratory Management.
#####################################################################################################     


#INSTRUCTIONS:
#   	
# Download the covid19Data.csv file and store it in the directory you can find later.
#
# Use the bottom right frame of R Studio, find the Packages tab, and search for and install packages 
#ggplot2, dplyr, stringr, maps, mapdata if they are not already installed. 
#Alternatively, to install the packages you can run: 


install.packages(c("ggplot2", "devtools", "dplyr", "stringr"))

install.packages(c("maps", "mapdata"))

# Check the packages in the bottom right frame: make sure the the boxes next to these packages have check marks in them.
#Alternatively, you can load up the libraries using the R commands:
library(ggplot2)
library(ggmap)
library(maps)
library(mapdata)
library(stringr)
library(dplyr)
library(devtools)

# In the bottom right frame of R Studio, go to the Files tab and navigate to your recently
# downloaded file covid19Data.csv.  Left click on the file name with the mouse, then choose "Import Dataset". 


# After importing the file, the "View()" command will be automatically executed, causing the 
# dataset to appear in a tab in the top left quarter of R studio (right above this text). 
# Click on the tab and note the spreadsheet format of the data. If the deadwood dataset tab is
# closed (go ahead and close it) you can access it again by typing: 

View(covid19Data)
#ANSWER QUESTION 1 in your Word file. 

#PART 1. HEAT MAPS TO SHOW COVID19 SPREAD ACROSS NJ

#STEP 1.1. Test your maps package
#Let's plot the US map outline just to make sure your maps and ggplot2 packages are installed and everything is working OK.
#R can convert the data on outlines for continents, countries, states and counties from maps package into data frames ggplot2 can deal with:
usa = map_data("usa")
USA_base = ggplot(data = usa, mapping = aes(x = long, y = lat, group = group)) + 
  coord_fixed(1.3) + 
  geom_polygon(color = "black", fill = "white")
USA_base

#ANSWER QUESTION 2.

#Now let's add outlines for each state:
states = map_data("state")
USA_base + theme_nothing() + 
  geom_polygon(data = states, fill = NA, color = "black") +
  geom_polygon(color = "black", fill = NA)  

#STEP 1.2. Plot a map of New Jersey and its counties. 

nj_df = subset(states, region == "new jersey")
nj = ggplot(data = nj_df, mapping = aes(x = long, y = lat, group = group)) + 
  coord_fixed(1.3) + 
  geom_polygon(color = "white", fill = "gray")
nj + theme_nothing()
counties = map_data("county")
nj_county = subset(counties, region == "new jersey")
nj + theme_nothing() + 
  geom_polygon(data = nj_county, fill = "gray", color = "white") +
  geom_polygon(color = "white", fill = NA)

#STEP 1.3. Building a heat map for NJ to illustrate the number of positive cases per county, normalized to the county population
#First we need to make a new data frame county_data with the number of positive cases by county, and county population
county_data = aggregate(TEST_NUMERICAL ~ subregion + CountyPop, FUN=sum, data=covid19Data)
#Let's check if we made the table we want (counts of positive tests by county), and view the top of the data frame:
head(county_data)
#Now let's normalize the number of tests to each county population:
county_data$normalized = county_data$TEST_NUMERICAL/county_data$CountyPop
head(county_data)

#Now let's merge these numbers to the county outlines in the map. Subregion column in the covid19Data file 
#contains names of the counties that R can use to merge with our data frame county_data and create a new data frame CovbyCounty:
CovbyCounty <- inner_join(nj_county, county_data, by = "subregion")

#Now let's make a blank map:

BWMAP <- theme(
  axis.text = element_blank(),
  axis.line = element_blank(),
  axis.ticks = element_blank(),
  panel.border = element_blank(),
  panel.grid = element_blank(),
  axis.title = element_blank()
)

#FIll the blank map with values from CovbyCounty to produce heat map

CovidCountyMap <- nj + 
  geom_polygon(data = CovbyCounty, aes(fill = normalized), color = "white") +
  geom_polygon(color = "gray", fill = "NA") +
  theme_bw() +
  ggtitle("Summary of positive cases per county normalized for population 
          between March and December 2020")+
  BWMAP + 
  scale_fill_gradient(trans = "log10")
CovidCountyMap

#Now let's add some county labels
county_names <- covid19Data[!duplicated(covid19Data$County),] #creates a new data frame with no duplicated county names
#county_names contains the county names, location, and population density ranks (1-21)
#type View(county_names) in the console and look at the new data frame, notice that it only has 1 sample from each county

#Add each label to the map
CovidCountyMap <- CovidCountyMap + geom_text(data = county_names, aes(x = long, y = lat, label= County), size = 2.5, color = "firebrick2") #adds the county names
CovidCountyMap

#Lets save a high resolution image of the state map
#The saved image will be in your working directory
#To determine your working directory, type getwd() in your Console
ggsave(plot = CovidCountyMap, "CovidCountyMap.png", width = 17, height = 20, units = "cm", dpi = 300) #saves the map as a high resolution image


#STEP 1.4. Building a heat map for only one month at a time.
#Now, let's see what things look like if we look only at the data from one month. Lets make another data frame, county_data2,
#and add the information about the month there:

county_data2 = aggregate(TEST_NUMERICAL ~ subregion + CountyPop + MONTH, FUN=sum, data=covid19Data)
#Let's check if we made the table we want 
head(county_data2)
#Now let's normalize the number of tests to each county population
county_data2$normalized = county_data2$TEST_NUMERICAL/county_data2$CountyPop
head(county_data2)
#ANSWER QUESTION 3.

#STEP 1.4A. Now, let's pick March (month number 3) from this data frame and make a map for it:
march_data = subset(county_data2, MONTH==3)
MarchCounty <- inner_join(nj_county, march_data, by = "subregion")

#Fill the NJ county map with the information 
MarchMap <- nj + 
  geom_polygon(data = MarchCounty, aes(fill = normalized), color = "white") +
  geom_polygon(color = "gray", fill = "NA") +
  theme_bw() +
  ggtitle("March") +
  BWMAP +
  scale_fill_gradient(trans = "log10") +
  geom_text(data = county_names, aes(x = long, y = lat, label= County), size = 2.5, color = "firebrick2")+ #adds county names
  geom_text(data = county_names, aes(x = long, y = lat-.05, label= rank), size = 2.5, color = "firebrick2") #adds county population density ranks
MarchMap #adds the county names
#notice that grey areas are counties that do not contain any data (no positive tests were collected) - don't worry if you get a warning message because of this when plotting the maps

#ANSWER QUESTION 4.

#STEP 1.4B. Now, build a heat map of COVID19 positive test results for June. Use the script in STEP 4A as an example.
#You will have to make necessary substitutions.
#Create a data frame for June as a subset of county_data2, then plot the data on the county map. Write your script below.

#ANSWER QUESTION 5. 

#STEP 1.4c. Now, repeat the same for December.

#ANSWER QUESTION 6.

#ANSWER QUESTION 7. 

#PART 2. ANALYSIS THE INFECTION RATES IN DIFFERENT GENDERS
#Now, we are going to test the hypothesis that the rate of infection correlates with the gender.

#STEP 2.1. Compare the number of postitive results for males in females detected in NJ in 2020
#First, let's create two vectors out of our covid19Data dataset. Each one will contain either only male or female data.
maletest =  subset(covid19Data, PATIENT_SEX == 'M', select = c("TEST_NUMERICAL","PATIENT_SEX"))
femaletest = subset(covid19Data, PATIENT_SEX == 'F', select = c("TEST_NUMERICAL","PATIENT_SEX"))

#Let's count the number of tests and positive tests by sex
lenm = length(maletest$TEST_NUMERICAL)
lenm
lenf = length(femaletest$TEST_NUMERICAL)
lenf
#ANSWER QUESTION 8. 


summ = sum(maletest$TEST_NUMERICAL)
summ
sumf = sum(femaletest$TEST_NUMERICAL)
sumf
#ANSWER QUESTION 9. 

# Make a bar plot to compare the number of positive tests between the genders:
barplot(c(summ,sumf), main = "Number of positive tests by gender", names.arg = c("Male","Female"), ylab = "Raw numbers of positive tests", col = c("blue", "red"), ylim = c(0,14000))
#ANSWER QUESTION 10. 

#copy the line that plots the data, but change the colors for males to green, and for females to purple.

#ANSWER QUESTION 11.

#Now let's calculate % of the male or female populations with a positive test:
MalePosRate = (summ/lenm)*100
MalePosRate
FemalePosRate = (sumf/lenf)*100
FemalePosRate
# Make a bar plot of the % data. For the names.arg, type the labels for the bars, and for the ylab type the lable for y axis: 
barplot(c(MalePosRate,FemalePosRate), main = "Percentage of positive tests by gender", names.arg = c("INSERT_LABEL","INSERT_LABEL"), ylab = "INSERT_LABEL", col = c("blue", "red"), ylim = c(0,11))


#ANSWER QUESTION 12.

#STEP 2.2.
#Now let's see if the number of positive tests detected in males and females is significantly different
# We will use a non-parametric equivalent of the t-test: the Mann-Whitney-Wilcoxon test

#First, combine two vectors together 
SexData = rbind(maletest,femaletest)
#perform the test
SexTest = wilcox.test(TEST_NUMERICAL ~ PATIENT_SEX, data=SexData)
SexTest
#ANSWER QUESTION 13.


#Save your workspace by clicking on the environment tab on the right side and clicking the blue disc icon
#Save the workspace environment as NJ_Covid.RData 
############################  BREAK  ########################################################

#If you have time, please continue to 
#PART 3. GIF of COVID-19 infection numbers change over time

#Now we can create a GIF to visualize how COVID-19 infections change over time
#For this, we will need the data generated in STEPS 1 and 2 and the gifski package

#STEP 3.1, Download, install, and import the gifski package
#To install gifski: On the RStudio toolbar, Select Tools, Install packages...
                    #Type gifski, dependencies should be checked
                    #Click Install


#Use the library function to import the necessary packages to create the gif.Complete the line below with the name of the package
library() #import gifski

#STEP 3.2. create unique heat map text naming labels

#Our data is limited to March-December 2020
#Let's create a data frame that contains all of the months in our data as represented by two digit text labels
#Example: "03" is March
#Identifying Months using two digits as text labels is needed to maintain the proper order of the data and unique names

#To create the naming text labels, we will use the formatC function
#type help("formatC") in the Console to learn more about this function

#Task 3.2, change formatC inputs to work with your data and store it as a data frame in the following way:
#use 3:12 to obtain all of the month numbers (March to December) found in our data
#set width = 2 so that each number will be 2 digits
#set flag = 0 to place a 0 in front of single digit numbers
month_num = formatC(10:100, width = 25, flag = 000)


#Let's store our month text labels found in month_num in a data frame called month_frame. Complete the line below:
month_frame = data.frame() #creates a data frame with month text labels
#Note, the numbers we create are considered "text" and not numeric values

#STEP 3.3, set working directory to save images
#Before continuing, let's tell RStudio where to save an image for each heat map that we create

#Task 3.3, create an empty folder to store images
#On the Rstudio toolbar:
                      #Go to Session, Set Working Directory, Choose Directory, choose a location and create and name an empty folder 
#An empty folder is needed to store all of the saved heat maps so that unwanted images are not added to the gif

#STEP 3.4, use a for loop to automatically create and save each map from the data
#Type help("for") in the Console to learn more about the "for loop"

#Our "for loop" will iterate through each month stored in month_frame, one at a time
#For each month, it will create the NJ heat map of the Covid-19 data and save it to the new folder that you created in STEP 3.3

for (i in month_frame$month_num){
  #In the for loop, "i" will equal the month text label, one at a time
  month_number = as.numeric(i) #Convert the month text label into a numeric value since the numeric value in the Covid data
  month_data = subset(county_data2, MONTH==month_number)
  monthCounty <- inner_join(nj_county, month_data, by = "subregion")
  month_name = month.name[month_number] #Converts the month number into a month name 
  #month_name will be used in the map title
  #type help(month.name) in the Console for more information
  
  #Fill the NJ county map with the information 
  month_map <- nj + 
    geom_polygon(data = monthCounty, aes(fill = normalized), color = "white") +
    geom_polygon(color = "gray", fill = "NA") +
    theme_bw() +
    ggtitle(month_name) + #adds the month name to the title
    BWMAP +
    #the labs function allows for you to customize the legend title using fill = "New Title"
    #the labs function allows to add a caption area to be added on the map for additional info   
    labs(fill = "Cases per person", caption = "2020 Data: Genesis Laboratory Management")+
    
  #Task 3.4, keep the color values equal for every image using limits = c(low_value, high_value) within the scale_fill_gradient function
  #for this data, the low_value should be 8.011344e-06 and the high_value should be 0.003304063
    scale_fill_gradient(limits = c(,)) + #Use limits = to keep the same sale for everyone image
    #Keeping a set scale is needed to compare maps
    geom_text(data = county_names, aes(x = long, y = lat, label= County), size = 2.5, color = "firebrick2") #adds county names

  
  #Let's create a unique name for our map, including the image file type
  #We can combine the month text label with the .png file type using the paste0 function
  #Type help("paste0") in the Console for more information about this function
  image_name = paste0(i,".png")
  
  #Now we are all set to save each of the heat maps as an individual .png image file
  #Let's use the ggsave function to save a high resolution image for each heat map created
  #Images will be automatically saved to our working directory
  #type help("ggsave") in console to learn more about the function
  #plot = is used to designate what ggplot is being saved, it should be month_map
  #the next item in ggsave is the name of the unique image, which is image_name
  #width, hieght, and unites designates how large you want the image to be
  #dpi = allows you to adjust the resolution, for publications and posters 300dpi is required
  
  ggsave(plot = month_map,  image_name , width = 17, height = 20, units = "cm", dpi = 300) #saves the map as a high resolution image
}

#All of the images should now be saved to our working directory 
#Before continuing, please check to see if the images are there

#If you are missing images in your folder, please troubleshoot with your instructor
#If all images are there, let's move on and create a GIF from all of the saved heat maps

#STEP 3.5, organize and identify the saved heat map images

#Let's replace month labels with their full unique image names, including the .png
month_frame$month_num = paste0(month_frame$month_num,".png") #create a data frame with unique full image names


#Task 3.5, Create a variable called locations that represents the file location where the images are saved
#This should also be your working directory
#Example: locations = "C:/Users/User_name/Desktop/Test" 
locations = 

#Let's get all the .png files that are located in the working directly using the list.files function
#Type help("list.files") in the Console for more information

png_files = list.files(locations, pattern = ".png", full.names = TRUE)
#type png_files in the Console to see the list of heat maps created by the for loop

#STEP 3.6, Create the GIF
#use the gifski function to generate a GIF of all the heat maps
#the gifski function take the list of png_files

#Task 3.6, complete the gifski function
#To set the name of the file, use gif_file = "NJ_covid.gif"
#Set the width = 2007, height = 2362 so that the images are not squished or stretched
#These are the pixel dimension for 17x20cm images that were saved in the for loop
#Delay = 1 means that there is a 1 sec delay between images. 

#Type help(gifski) in the Console for more information
gifski(png_files, gif_file = "", width = , height = , delay = )

#STEP 3.7, open the GIF
#Task 3.7, go to your computer file location to view the heat map gif

#Answer Questions 14-15

############################  THE END  ########################################################