# HeightAnalysis
#
# This example is modified from the text
# Mathematics for the Life Sciences
# by Erin N. Bodine, Suzanne Lenhart, and Louis J. Gross
# © 2014 by Princeton University Press
#
# The objective is to analyze the data collected on heights
# before and after sleeping for participants in a class project.
#
# First step is to have pair and share groups in class consider
# what factors might affect the amount (or whether it occurs) of
# height change overnight.
#
# Second step is to have pair and share groups make one or more
# hypotheses about height change overnight based upon the factors
# they considered might affect it.
#
# Third step is to collect data from participants. For this example
# data set the participants collected data over 4 nights each and
# provided the raw data in files that were then combined
# across all participants.
#
# Data provided were placed into a file fulldat.txt with
# spaces as column delimiters.
#
# Metadata for the columns in fulldat.txt, in order:
#   Gender (1=female, 2=male), Age in years, Height at night
#   in mm before sleep, Hours of sleep, Height in morning in mm
#   upon awakening, Height in morning minus Height at night in mm
# Each row in the data contains an observation for a different
# night, with 4 rows for each participant.
# Participants are not identified (i.e. the data are combined
# with no way to determine which participant provided which data).
#
# We next carry out basic data analysis to investigate the data
# using descriptive statistics, histograms and linear regressions.
#
# Step 1: Read in the data
# In RStudio use the Import Dataset link on the upper left window
# to read in the data in fulldat.txt
# In doing this, it is best to maintain the column headings: the code
# below refers to the columns by the names Gender, Age, HtNight,
# HrsSleep, HtMorn and HtDiff, and expects the imported data frame to be
# called fulldat.
#
# Step 2: Rename the dataset
# To more easily access the data, rename the data set by assigning
# it the name F
# NOTE: this masks R's built-in shorthand F (for FALSE), so spell out
# TRUE/FALSE in anything run alongside this script.
# In the Console (lower left window) type
F <- fulldat
#
# Step 3: Means of the dataset
# To find the mean (arithmetic average) of age type in the Console
mean(F[, 2])
# which takes the average of the second column (Age)
# Similarly the mean height before sleep is obtained from
mean(F[, 3])
# and the mean height in the morning is
mean(F[, 5])
# and the mean height difference is
mean(F[, 6])
# and the mean hours of sleep is
mean(F[, 4])
#
# Step 4: Separating Males and Females
# To look at potential differences between genders, we need to split
# the dataset by gender - we create a new dataset FM that just has the
# male data and a new dataset FF that just has the female data.
# which() turns the comparison into row numbers, so rows with a missing
# Gender are dropped rather than kept as all-NA rows.
# In the Console window type
FF <- F[which(F$Gender == 1), ]
FM <- F[which(F$Gender == 2), ]
#
# Step 5: Histograms of the full dataset
# To look at a histogram of age we type
hist(F[, 2])
# and note that this is equivalent to the command
hist(F$Age)
# and to change the number of classes we type
hist(F[, 2], nclass = 30)
# The histogram for hours of sleep is
hist(F$HrsSleep)
# Rather than using the standard labels for the hist() command
# we can modify these in several ways, for example
# by finding the range of the values
AgeRange <- max(F$Age) - min(F$Age)
# Calculate class width if we want 15 classes using the
# ceiling command which rounds up to the nearest integer.
# The bins start at min(Age) - 0.5, so the 15 classes together have to
# span AgeRange + 0.5. Dividing AgeRange alone leaves the largest age
# beyond the last break (e.g. whenever AgeRange is a multiple of 15),
# which makes hist() stop with an error, and gives a class width of zero
# when every age is the same.
cw <- ceiling((AgeRange + 0.5) / 15)
# Determine values where histogram bars should start using
# the seq command: 16 break points, i.e. 15 classes of width cw
startvals <- seq(min(F$Age) - 0.5, min(F$Age) - 0.5 + 15 * cw, by = cw)
# and adding labels in the
# hist() function and axis function.
# The x axis is suppressed in hist() (xaxt = "n") and redrawn by axis()
# so that the tick marks sit exactly on the class boundaries in startvals.
hist(
  F$Age,
  main = "Age Histogram of Participants",
  breaks = startvals,
  xlab = "Age (yrs)",
  ylab = "Number",
  xlim = c(min(F$Age) - 0.5, min(F$Age) - 0.5 + 15 * cw),
  xaxt = "n"
)
axis(1, at = startvals)
#
# Similar to the above, to see the histogram of heights before
# sleep, in the morning and the differences
hist(F$HtNight, nclass = 30)
hist(F$HtMorn, nclass = 30)
# (nclass must be given as nclass = 30; a bare nclass30 is read as the
# name of an object that does not exist and the call fails)
hist(F$HtDiff, nclass = 30)
# You may notice some problems with these graphs - we'll get to
# these after we look at some other graphs
#
# Step 6: Histograms by Gender
# Similar to the above, using the datasets that separate out
# heights by gender, the histograms are for heights at night
hist(FM$HtNight, nclass = 30)
hist(FF$HtNight, nclass = 30)
# and for height differences
hist(FM$HtDiff, nclass = 30)
hist(FF$HtDiff, nclass = 30)
#
# Step 7: Cleaning the Data
# There are clearly some problems with the data and one way to
# deal with it is to remove all observations with clearly incorrect
# heights. We can do this by removing data in F to create a new
# dataset FC that keeps only observations whose night and morning
# heights are both above 900 mm and below 2200 mm (roughly 3 ft to 7 ft)
FC <- F[which(F$HtNight < 2200 & F$HtNight > 900 &
                F$HtMorn < 2200 & F$HtMorn > 900), ]
# Looking at the histograms for the Heights at night and
# morning for the FC dataset
hist(FC$HtNight)
hist(FC$HtMorn)
# shows that these now have reasonable heights for the age range
# of the class participants.
# To check, look at
hist(FC$HtDiff)
# and note that there is still a major problem. So the data still
# require further cleaning, in which we take out observations
# with HtDiff less than -200mm. FC already satisfies the height limits,
# so only the extra condition on HtDiff has to be applied here.
FCC <- FC[which(FC$HtDiff > -200), ]
# and the mean height change overnight for this cleaned dataset is
mean(FCC$HtDiff)
#
# Step 8: Investigating Hypotheses
# The mean height change and histogram of the differences from the cleaned
# data indicate a positive change in height overnight in the dataset.
# A simple t-test can be used and provides a confidence interval
# for the true mean at the 99% level
t.test(FCC$HtDiff, mu = 0.0, conf.level = .99)
#
# Other hypotheses might include whether the magnitude of the overnight
# height change is related to height or to hours of sleep or to gender
# For the full cleaned dataset, a scatter plot of height change vs height
plot(FCC$HtNight, FCC$HtDiff)
# or to separate out by gender first make a fully-cleaned dataset for
# each gender
FCCF <- FCC[which(FCC$Gender == 1), ]
FCCM <- FCC[which(FCC$Gender == 2), ]
# then plot with one color (red) for females and blue for males.
# The axis limits come from the combined data so that both groups fit
# on the same axes.
xmin <- min(FCC$HtNight)
xmax <- max(FCC$HtNight)
ymin <- min(FCC$HtDiff)
ymax <- max(FCC$HtDiff)
plot(
  FCCF$HtNight, FCCF$HtDiff,
  xlim = c(xmin, xmax), ylim = c(ymin, ymax),
  col = "red",
  xlab = "Height(mm)", ylab = "Height change (mm)",
  main = "Red are Females, Blue are Males"
)
# points() adds the males to the plot that is already open, instead of
# drawing a second complete plot (axes, labels and title) on top of it
points(FCCM$HtNight, FCCM$HtDiff, col = "blue")
# Similar plots allow investigation of whether there is a
# relationship of height change to amount of sleep overall
plot(FCC$HrsSleep, FCC$HtDiff)
# and as above we can separate these out by gender
xmin <- min(FCC$HrsSleep)
xmax <- max(FCC$HrsSleep)
ymin <- min(FCC$HtDiff)
ymax <- max(FCC$HtDiff)
plot(
  FCCF$HrsSleep, FCCF$HtDiff,
  xlim = c(xmin, xmax), ylim = c(ymin, ymax),
  col = "red",
  xlab = "Hours of Sleep", ylab = "Height change (mm)",
  main = "Red are Females, Blue are Males"
)
points(FCCM$HrsSleep, FCCM$HtDiff, col = "blue")
#
# Step 9: Other Analyses
# To assess whether there are gender differences in height change
# overnight, a two-sample t-test (females vs males) can be done using
t.test(FCCF$HtDiff, FCCM$HtDiff, mu = 0.0, conf.level = .99)
#
# To assess whether there are any correlations in the data,
# the correlation coefficient is obtained from
cor(FCC$HtNight, FCC$HtDiff)
# and this is also calculated when a Pearson correlation test is
# done for example on the full cleaned dataset of height at night
# and height change
cor.test(FCC$HtNight, FCC$HtDiff, conf.level = .99)
# or on hours of sleep and height change
cor.test(FCC$HrsSleep, FCC$HtDiff, conf.level = .99)
#
# Although there is no indication that any linear dependence in
# the observed variables exist, for completeness note that a
# linear regression could be carried out. As an example
# consider the possibility that height change overnight is a
# linear function of height before sleep. The below produces
# a fitted model C; coef(C) is a vector with first component the
# intercept and second component the slope of the best-fit line.
# The model is written with plain column names and data = FCC so that
# predict() can later match new data to the HtNight term by name.
C <- lm(HtDiff ~ HtNight, data = FCC)
# and to display the best fit line
cat(sprintf("Equation for regression: HtDiff = %f HtNight + %f",
            coef(C)[2], coef(C)[1]), "\n")
# and to plot this line on the graph of the data
xmin <- min(FCC$HtNight)
xmax <- max(FCC$HtNight)
ymin <- min(FCC$HtDiff)
ymax <- max(FCC$HtDiff)
# The new data must carry a column called HtNight; a column with any
# other name is ignored and the prediction then silently falls back to
# whatever FCC holds in the workspace.
HtPredict <- predict(C, newdata = data.frame(HtNight = FCC$HtNight))
plot(
  FCC$HtNight, FCC$HtDiff,
  xlim = c(xmin, xmax), ylim = c(ymin, ymax),
  xlab = "Height(mm)", ylab = "Height change (mm)",
  main = "Height change vs Height"
)
# lines() draws the fitted line onto the existing scatter plot
lines(FCC$HtNight, HtPredict, col = "red")