#Statistics with epidemiology of COVID-19 #Maria Shumskaya, Shakira Benjamin, Matthew G. Niepielko, Nicholas Lorusso. Kean University, NJ, USA. #2021 rm(list= ls()) ################################# REQUIREMENTS ###################################################### # Programs: RStudio # R packages: ggplot2, dplyr, stringr, maps, mapdata # Files: covid19Data.csv file. ############################### DATA SOURCE ##################################################### #The dataset in covid19Data.csv is cleaned and extensively modified for teaching and is based on the original data collected #by Genesis Laboratory Management, NJ, USA. Modifications included deleting of patients ID and # some other personal information, removing inconclusive data and other information unrelated to this module. #For other than teaching purposes, the original data must be requested from Genesis Laboratory Management. ##################################################################################################### #INSTRUCTIONS: # # Download the covid19Data.csv file and store it in the directory you can find later. # # Use the bottom right frame of R Studio, find the Packages tab, and search for and install packages #ggplot2, dplyr, stringr, maps, mapdata if they are not already installed. #Alternatively, to install the packages you can run: install.packages(c("ggplot2", "devtools", "dplyr", "stringr")) install.packages(c("maps", "mapdata")) # Check the packages in the bottom right frame: make sure the the boxes next to these packages have check marks in them. #Alternatively, you can load up the libraries using the R commands: library(ggplot2) library(ggmap) library(maps) library(mapdata) library(stringr) library(dplyr) library(devtools) # In the bottom right frame of R Studio, go to the Files tab and navigate to your recently # downloaded file covid19Data.csv. Left click on the file name with the mouse, then choose "Import Dataset". # After importing the file, the "View()" command will be automatically executed, causing the # dataset to appear in a tab in the top left quarter of R studio (right above this text). # Click on the tab and note the spreadsheet format of the data. If the deadwood dataset tab is # closed (go ahead and close it) you can access it again by typing: View(covid19Data) #ANSWER QUESTION 1 in your Word file. #PART 1. HEAT MAPS TO SHOW COVID19 SPREAD ACROSS NJ #STEP 1.1. Test your maps package #Let's plot the US map outline just to make sure your maps and ggplot2 packages are installed and everything is working OK. #R can convert the data on outlines for continents, countries, states and counties from maps package into data frames ggplot2 can deal with: usa = map_data("usa") USA_base = ggplot(data = usa, mapping = aes(x = long, y = lat, group = group)) + coord_fixed(1.3) + geom_polygon(color = "black", fill = "white") USA_base #ANSWER QUESTION 2. #Now let's add outlines for each state: states = map_data("state") USA_base + theme_nothing() + geom_polygon(data = states, fill = NA, color = "black") + geom_polygon(color = "black", fill = NA) #STEP 1.2. Plot a map of New Jersey and its counties. nj_df = subset(states, region == "new jersey") nj = ggplot(data = nj_df, mapping = aes(x = long, y = lat, group = group)) + coord_fixed(1.3) + geom_polygon(color = "white", fill = "gray") nj + theme_nothing() counties = map_data("county") nj_county = subset(counties, region == "new jersey") nj + theme_nothing() + geom_polygon(data = nj_county, fill = "gray", color = "white") + geom_polygon(color = "white", fill = NA) #STEP 1.3. Building a heat map for NJ to illustrate the number of positive cases per county, normalized to the county population #First we need to make a new data frame county_data with the number of positive cases by county, and county population county_data = aggregate(TEST_NUMERICAL ~ subregion + CountyPop, FUN=sum, data=covid19Data) #Let's check if we made the table we want (counts of positive tests by county), and view the top of the data frame: head(county_data) #Now let's normalize the number of tests to each county population: county_data$normalized = county_data$TEST_NUMERICAL/county_data$CountyPop head(county_data) #Now let's merge these numbers to the county outlines in the map. Subregion column in the covid19Data file #contains names of the counties that R can use to merge with our data frame county_data and create a new data frame CovbyCounty: CovbyCounty <- inner_join(nj_county, county_data, by = "subregion") #Now let's make a blank map: BWMAP <- theme( axis.text = element_blank(), axis.line = element_blank(), axis.ticks = element_blank(), panel.border = element_blank(), panel.grid = element_blank(), axis.title = element_blank() ) #FIll the blank map with values from CovbyCounty to produce heat map CovidCountyMap <- nj + geom_polygon(data = CovbyCounty, aes(fill = normalized), color = "white") + geom_polygon(color = "gray", fill = "NA") + theme_bw() + ggtitle("Summary of positive cases per county normalized for population between March and December 2020")+ BWMAP + scale_fill_gradient(trans = "log10") CovidCountyMap #Now let's add some county labels county_names <- covid19Data[!duplicated(covid19Data$County),] #creates a new data frame with no duplicated county names #county_names contains the county names, location, and population density ranks (1-21) #type View(county_names) in the console and look at the new data frame, notice that it only has 1 sample from each county #Add each label to the map CovidCountyMap <- CovidCountyMap + geom_text(data = county_names, aes(x = long, y = lat, label= County), size = 2.5, color = "firebrick2") #adds the county names CovidCountyMap #Lets save a high resolution image of the state map #The saved image will be in your working directory #To determine your working directory, type getwd() in your Console ggsave(plot = CovidCountyMap, "CovidCountyMap.png", width = 17, height = 20, units = "cm", dpi = 300) #saves the map as a high resolution image #STEP 1.4. Building a heat map for only one month at a time. #Now, let's see what things look like if we look only at the data from one month. Lets make another data frame, county_data2, #and add the information about the month there: county_data2 = aggregate(TEST_NUMERICAL ~ subregion + CountyPop + MONTH, FUN=sum, data=covid19Data) #Let's check if we made the table we want head(county_data2) #Now let's normalize the number of tests to each county population county_data2$normalized = county_data2$TEST_NUMERICAL/county_data2$CountyPop head(county_data2) #ANSWER QUESTION 3. #STEP 1.4A. Now, let's pick March (month number 3) from this data frame and make a map for it: march_data = subset(county_data2, MONTH==3) MarchCounty <- inner_join(nj_county, march_data, by = "subregion") #Fill the NJ county map with the information MarchMap <- nj + geom_polygon(data = MarchCounty, aes(fill = normalized), color = "white") + geom_polygon(color = "gray", fill = "NA") + theme_bw() + ggtitle("March") + BWMAP + scale_fill_gradient(trans = "log10") + geom_text(data = county_names, aes(x = long, y = lat, label= County), size = 2.5, color = "firebrick2")+ #adds county names geom_text(data = county_names, aes(x = long, y = lat-.05, label= rank), size = 2.5, color = "firebrick2") #adds county population density ranks MarchMap #adds the county names #notice that grey areas are counties that do not contain any data (no positive tests were collected) - don't worry if you get a warning message because of this when plotting the maps #ANSWER QUESTION 4. #STEP 1.4B. Now, build a heat map of COVID19 positive test results for June. Use the script in STEP 4A as an example. #You will have to make necessary substitutions. #Create a data frame for June as a subset of county_data2, then plot the data on the county map. Write your script below. #ANSWER QUESTION 5. #STEP 1.4c. Now, repeat the same for December. #ANSWER QUESTION 6. #ANSWER QUESTION 7. #PART 2. ANALYSIS THE INFECTION RATES IN DIFFERENT GENDERS #Now, we are going to test the hypothesis that the rate of infection correlates with the gender. #STEP 2.1. Compare the number of postitive results for males in females detected in NJ in 2020 #First, let's create two vectors out of our covid19Data dataset. Each one will contain either only male or female data. maletest = subset(covid19Data, PATIENT_SEX == 'M', select = c("TEST_NUMERICAL","PATIENT_SEX")) femaletest = subset(covid19Data, PATIENT_SEX == 'F', select = c("TEST_NUMERICAL","PATIENT_SEX")) #Let's count the number of tests and positive tests by sex lenm = length(maletest$TEST_NUMERICAL) lenm lenf = length(femaletest$TEST_NUMERICAL) lenf #ANSWER QUESTION 8. summ = sum(maletest$TEST_NUMERICAL) summ sumf = sum(femaletest$TEST_NUMERICAL) sumf #ANSWER QUESTION 9. # Make a bar plot to compare the number of positive tests between the genders: barplot(c(summ,sumf), main = "Number of positive tests by gender", names.arg = c("Male","Female"), ylab = "Raw numbers of positive tests", col = c("blue", "red"), ylim = c(0,14000)) #ANSWER QUESTION 10. #copy the line that plots the data, but change the colors for males to green, and for females to purple. #ANSWER QUESTION 11. #Now let's calculate % of the male or female populations with a positive test: MalePosRate = (summ/lenm)*100 MalePosRate FemalePosRate = (sumf/lenf)*100 FemalePosRate # Make a bar plot of the % data. For the names.arg, type the labels for the bars, and for the ylab type the lable for y axis: barplot(c(MalePosRate,FemalePosRate), main = "Percentage of positive tests by gender", names.arg = c("INSERT_LABEL","INSERT_LABEL"), ylab = "INSERT_LABEL", col = c("blue", "red"), ylim = c(0,11)) #ANSWER QUESTION 12. #STEP 2.2. #Now let's see if the number of positive tests detected in males and females is significantly different # We will use a non-parametric equivalent of the t-test: the Mann-Whitney-Wilcoxon test #First, combine two vectors together SexData = rbind(maletest,femaletest) #perform the test SexTest = wilcox.test(TEST_NUMERICAL ~ PATIENT_SEX, data=SexData) SexTest #ANSWER QUESTION 13. #Save your workspace by clicking on the environment tab on the right side and clicking the blue disc icon #Save the workspace environment as NJ_Covid.RData ############################ BREAK ######################################################## #If you have time, please continue to #PART 3. GIF of COVID-19 infection numbers change over time #Now we can create a GIF to visualize how COVID-19 infections change over time #For this, we will need the data generated in STEPS 1 and 2 and the gifski package #STEP 3.1, Download, install, and import the gifski package #To install gifski: On the RStudio toolbar, Select Tools, Install packages... #Type gifski, dependencies should be checked #Click Install #Use the library function to import the necessary packages to create the gif.Complete the line below with the name of the package library() #import gifski #STEP 3.2. create unique heat map text naming labels #Our data is limited to March-December 2020 #Let's create a data frame that contains all of the months in our data as represented by two digit text labels #Example: "03" is March #Identifying Months using two digits as text labels is needed to maintain the proper order of the data and unique names #To create the naming text labels, we will use the formatC function #type help("formatC") in the Console to learn more about this function #Task 3.2, change formatC inputs to work with your data and store it as a data frame in the following way: #use 3:12 to obtain all of the month numbers (March to December) found in our data #set width = 2 so that each number will be 2 digits #set flag = 0 to place a 0 in front of single digit numbers month_num = formatC(10:100, width = 25, flag = 000) #Let's store our month text labels found in month_num in a data frame called month_frame. Complete the line below: month_frame = data.frame() #creates a data frame with month text labels #Note, the numbers we create are considered "text" and not numeric values #STEP 3.3, set working directory to save images #Before continuing, let's tell RStudio where to save an image for each heat map that we create #Task 3.3, create an empty folder to store images #On the Rstudio toolbar: #Go to Session, Set Working Directory, Choose Directory, choose a location and create and name an empty folder #An empty folder is needed to store all of the saved heat maps so that unwanted images are not added to the gif #STEP 3.4, use a for loop to automatically create and save each map from the data #Type help("for") in the Console to learn more about the "for loop" #Our "for loop" will iterate through each month stored in month_frame, one at a time #For each month, it will create the NJ heat map of the Covid-19 data and save it to the new folder that you created in STEP 3.3 for (i in month_frame$month_num){ #In the for loop, "i" will equal the month text label, one at a time month_number = as.numeric(i) #Convert the month text label into a numeric value since the numeric value in the Covid data month_data = subset(county_data2, MONTH==month_number) monthCounty <- inner_join(nj_county, month_data, by = "subregion") month_name = month.name[month_number] #Converts the month number into a month name #month_name will be used in the map title #type help(month.name) in the Console for more information #Fill the NJ county map with the information month_map <- nj + geom_polygon(data = monthCounty, aes(fill = normalized), color = "white") + geom_polygon(color = "gray", fill = "NA") + theme_bw() + ggtitle(month_name) + #adds the month name to the title BWMAP + #the labs function allows for you to customize the legend title using fill = "New Title" #the labs function allows to add a caption area to be added on the map for additional info labs(fill = "Cases per person", caption = "2020 Data: Genesis Laboratory Management")+ #Task 3.4, keep the color values equal for every image using limits = c(low_value, high_value) within the scale_fill_gradient function #for this data, the low_value should be 8.011344e-06 and the high_value should be 0.003304063 scale_fill_gradient(limits = c(,)) + #Use limits = to keep the same sale for everyone image #Keeping a set scale is needed to compare maps geom_text(data = county_names, aes(x = long, y = lat, label= County), size = 2.5, color = "firebrick2") #adds county names #Let's create a unique name for our map, including the image file type #We can combine the month text label with the .png file type using the paste0 function #Type help("paste0") in the Console for more information about this function image_name = paste0(i,".png") #Now we are all set to save each of the heat maps as an individual .png image file #Let's use the ggsave function to save a high resolution image for each heat map created #Images will be automatically saved to our working directory #type help("ggsave") in console to learn more about the function #plot = is used to designate what ggplot is being saved, it should be month_map #the next item in ggsave is the name of the unique image, which is image_name #width, hieght, and unites designates how large you want the image to be #dpi = allows you to adjust the resolution, for publications and posters 300dpi is required ggsave(plot = month_map, image_name , width = 17, height = 20, units = "cm", dpi = 300) #saves the map as a high resolution image } #All of the images should now be saved to our working directory #Before continuing, please check to see if the images are there #If you are missing images in your folder, please troubleshoot with your instructor #If all images are there, let's move on and create a GIF from all of the saved heat maps #STEP 3.5, organize and identify the saved heat map images #Let's replace month labels with their full unique image names, including the .png month_frame$month_num = paste0(month_frame$month_num,".png") #create a data frame with unique full image names #Task 3.5, Create a variable called locations that represents the file location where the images are saved #This should also be your working directory #Example: locations = "C:/Users/User_name/Desktop/Test" locations = #Let's get all the .png files that are located in the working directly using the list.files function #Type help("list.files") in the Console for more information png_files = list.files(locations, pattern = ".png", full.names = TRUE) #type png_files in the Console to see the list of heat maps created by the for loop #STEP 3.6, Create the GIF #use the gifski function to generate a GIF of all the heat maps #the gifski function take the list of png_files #Task 3.6, complete the gifski function #To set the name of the file, use gif_file = "NJ_covid.gif" #Set the width = 2007, height = 2362 so that the images are not squished or stretched #These are the pixel dimension for 17x20cm images that were saved in the for loop #Delay = 1 means that there is a 1 sec delay between images. #Type help(gifski) in the Console for more information gifski(png_files, gif_file = "", width = , height = , delay = ) #STEP 3.7, open the GIF #Task 3.7, go to your computer file location to view the heat map gif #Answer Questions 14-15 ############################ THE END ########################################################