library(ggplot2) # for graphics
## Warning: package 'ggplot2' was built under R version 4.1.2
library(MASS) # for maximum likelihood estimation
z <- read.table("Cleaned2021VegetationData.csv",header=TRUE,sep=",")
str(z)
## 'data.frame': 288 obs. of 6 variables:
## $ Date : chr "10/4/2021" "10/4/2021" "10/4/2021" "10/4/2021" ...
## $ PlotID : chr "1A" "1A" "1A" "1A" ...
## $ Plant : chr "Milkweed" "Milkweed" "Coneflower" "Coneflower" ...
## $ VegID : chr "M1" "M2" "C1" "C2" ...
## $ Height_cm : num 16.5 24 12.5 19 64 36.5 21.5 23 21 17 ...
## $ PercentGreenCover: num 5.91 NA NA NA NA NA 4.49 NA NA NA ...
summary(z)
## Date PlotID Plant VegID
## Length:288 Length:288 Length:288 Length:288
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
##
## Height_cm PercentGreenCover
## Min. : 8.0 Min. : 4.49
## 1st Qu.:20.5 1st Qu.:10.82
## Median :27.0 Median :20.59
## Mean :35.0 Mean :23.35
## 3rd Qu.:44.0 3rd Qu.:34.40
## Max. :85.0 Max. :52.13
## NA's :241
# quick and dirty, a truncated normal distribution to work on the solution set
#z <- rnorm(n=3000,mean=0.2)
#z <- data.frame(1:3000,z)
#names(z) <- list("ID","myVar")
#z <- z[z$myVar>0,]
#str(z)
#summary(z$myVar)
p1 <- ggplot(data=z, aes(x=Height_cm, y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p1 <- p1 + geom_density(linetype="dotted",size=0.75)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
normPars <- fitdistr(z$Height_cm,"normal")
print(normPars)
## mean sd
## 34.9982639 19.3204653
## ( 1.1384693) ( 0.8050194)
str(normPars)
## List of 5
## $ estimate: Named num [1:2] 35 19.3
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ sd : Named num [1:2] 1.138 0.805
## ..- attr(*, "names")= chr [1:2] "mean" "sd"
## $ vcov : num [1:2, 1:2] 1.296 0 0 0.648
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:2] "mean" "sd"
## .. ..$ : chr [1:2] "mean" "sd"
## $ n : int 288
## $ loglik : num -1261
## - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
## mean
## 34.99826
Normal
Probability DensitymeanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]
xval <- seq(0,max(z$Height_cm),len=length(z$Height_cm))
stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$Height_cm), args = list(mean = meanML, sd = sdML))
p1 + stat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Exponential
Probability DensityexpoPars <- fitdistr(z$Height_cm,"exponential")
rateML <- expoPars$estimate["rate"]
stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$Height_cm), args = list(rate=rateML))
p1 + stat + stat2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Uniform
Probability Densitystat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$Height_cm), args = list(min=min(z$myVar), max=max(z$myVar)))
## Warning in min(z$myVar): no non-missing arguments to min; returning Inf
## Warning in max(z$myVar): no non-missing arguments to max; returning -Inf
p1 + stat + stat2 + stat3
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in (function (x, min = 0, max = 1, log = FALSE) : NaNs produced
## Warning: Removed 288 row(s) containing missing values (geom_path).
Gamma
Probability DensitygammaPars <- fitdistr(z$Height_cm,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]
stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$Height_cm), args = list(shape=shapeML, rate=rateML))
p1 + stat + stat2 + stat3 + stat4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in (function (x, min = 0, max = 1, log = FALSE) : NaNs produced
## Warning: Removed 288 row(s) containing missing values (geom_path).
Beta
Probability DensitypSpecial <- ggplot(data=z, aes(x=Height_cm/(max(Height_cm + 0.1)), y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2) +
xlim(c(0,1)) +
geom_density(size=0.75,linetype="dotted")
betaPars <- fitdistr(x=z$Height_cm/max(z$Height_cm + 0.1),start=list(shape1=1,shape2=2),"beta")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]
statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$Height_cm), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).
Gamma is the best fitting distribution.
#generate data using gamma
z <- rgamma(n=288,
shape=3.568224649,
rate=0.101954357)
z <- data.frame(1:288,z)
names(z) <- list("ID","Height_cm")
z <- z[z$Height_cm>0,]
str(z)
## 'data.frame': 288 obs. of 2 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Height_cm: num 16.1 35.1 30.1 23.3 74 ...
summary(z$Height_cm)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 7.245 22.132 33.393 36.396 45.732 115.216
#plot histogram
p1 <- ggplot(data=z, aes(x=Height_cm, y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#add probability density curve
p1 <- p1 + geom_density(linetype="dotted",size=0.75)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Histogram of Original Data
z <- read.table("Cleaned2021VegetationData.csv",header=TRUE,sep=",")
p1 <- ggplot(data=z, aes(x=Height_cm, y=..density..)) +
geom_histogram(color="grey60",fill="cornsilk",size=0.2)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
How do the two histogram profiles compare? Do you think the model is doing a good job of simulating realistic data that match your original measurements? Why or why not?
The actual data has more of a bimodal distribution so it deviates from the simulated one which is unimodal skewed right, but for the left hand side of the plot it is similar.