Open Libraries

library(ggplot2) # for graphics
## Warning: package 'ggplot2' was built under R version 4.1.2
library(MASS) # for maximum likelihood estimation

My Own Data

z <- read.table("Cleaned2021VegetationData.csv",header=TRUE,sep=",")
str(z)
## 'data.frame':    288 obs. of  6 variables:
##  $ Date             : chr  "10/4/2021" "10/4/2021" "10/4/2021" "10/4/2021" ...
##  $ PlotID           : chr  "1A" "1A" "1A" "1A" ...
##  $ Plant            : chr  "Milkweed" "Milkweed" "Coneflower" "Coneflower" ...
##  $ VegID            : chr  "M1" "M2" "C1" "C2" ...
##  $ Height_cm        : num  16.5 24 12.5 19 64 36.5 21.5 23 21 17 ...
##  $ PercentGreenCover: num  5.91 NA NA NA NA NA 4.49 NA NA NA ...
summary(z)
##      Date              PlotID             Plant              VegID          
##  Length:288         Length:288         Length:288         Length:288        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##                                                                             
##    Height_cm    PercentGreenCover
##  Min.   : 8.0   Min.   : 4.49    
##  1st Qu.:20.5   1st Qu.:10.82    
##  Median :27.0   Median :20.59    
##  Mean   :35.0   Mean   :23.35    
##  3rd Qu.:44.0   3rd Qu.:34.40    
##  Max.   :85.0   Max.   :52.13    
##                 NA's   :241

Read in Data Vector

# quick and dirty, a truncated normal distribution to work on the solution set

#z <- rnorm(n=3000,mean=0.2)
#z <- data.frame(1:3000,z)
#names(z) <- list("ID","myVar")
#z <- z[z$myVar>0,]
#str(z)
#summary(z$myVar)

Plot Histogram of Data

p1 <- ggplot(data=z, aes(x=Height_cm, y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) 
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Add Emperical Density Curve

p1 <-  p1 +  geom_density(linetype="dotted",size=0.75)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Get Maximum Likelihood Parameters for Normal

normPars <- fitdistr(z$Height_cm,"normal")
print(normPars)
##       mean          sd    
##   34.9982639   19.3204653 
##  ( 1.1384693) ( 0.8050194)
str(normPars)
## List of 5
##  $ estimate: Named num [1:2] 35 19.3
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ sd      : Named num [1:2] 1.138 0.805
##   ..- attr(*, "names")= chr [1:2] "mean" "sd"
##  $ vcov    : num [1:2, 1:2] 1.296 0 0 0.648
##   ..- attr(*, "dimnames")=List of 2
##   .. ..$ : chr [1:2] "mean" "sd"
##   .. ..$ : chr [1:2] "mean" "sd"
##  $ n       : int 288
##  $ loglik  : num -1261
##  - attr(*, "class")= chr "fitdistr"
normPars$estimate["mean"] # note structure of getting a named attribute
##     mean 
## 34.99826

Plot Normal Probability Density

meanML <- normPars$estimate["mean"]
sdML <- normPars$estimate["sd"]
xval <- seq(0,max(z$Height_cm),len=length(z$Height_cm))
stat <- stat_function(aes(x = xval, y = ..y..), fun = dnorm, colour="red", n = length(z$Height_cm), args = list(mean = meanML, sd = sdML))
p1 + stat
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot Exponential Probability Density

expoPars <- fitdistr(z$Height_cm,"exponential")
rateML <- expoPars$estimate["rate"]
stat2 <- stat_function(aes(x = xval, y = ..y..), fun = dexp, colour="blue", n = length(z$Height_cm), args = list(rate=rateML))
p1 + stat + stat2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Plot Uniform Probability Density

stat3 <- stat_function(aes(x = xval, y = ..y..), fun = dunif, colour="darkgreen", n = length(z$Height_cm), args = list(min=min(z$myVar), max=max(z$myVar)))
## Warning in min(z$myVar): no non-missing arguments to min; returning Inf
## Warning in max(z$myVar): no non-missing arguments to max; returning -Inf
p1 + stat + stat2 + stat3
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in (function (x, min = 0, max = 1, log = FALSE) : NaNs produced
## Warning: Removed 288 row(s) containing missing values (geom_path).

Plot Gamma Probability Density

gammaPars <- fitdistr(z$Height_cm,"gamma")
shapeML <- gammaPars$estimate["shape"]
rateML <- gammaPars$estimate["rate"]
stat4 <- stat_function(aes(x = xval, y = ..y..), fun = dgamma, colour="brown", n = length(z$Height_cm), args = list(shape=shapeML, rate=rateML))
p1 + stat + stat2 + stat3 + stat4
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning in (function (x, min = 0, max = 1, log = FALSE) : NaNs produced
## Warning: Removed 288 row(s) containing missing values (geom_path).

Plot Beta Probability Density

pSpecial <- ggplot(data=z, aes(x=Height_cm/(max(Height_cm + 0.1)), y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) + 
  xlim(c(0,1)) +
  geom_density(size=0.75,linetype="dotted")

betaPars <- fitdistr(x=z$Height_cm/max(z$Height_cm + 0.1),start=list(shape1=1,shape2=2),"beta")
## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced

## Warning in densfun(x, parm[1], parm[2], ...): NaNs produced
shape1ML <- betaPars$estimate["shape1"]
shape2ML <- betaPars$estimate["shape2"]
statSpecial <- stat_function(aes(x = xval, y = ..y..), fun = dbeta, colour="orchid", n = length(z$Height_cm), args = list(shape1=shape1ML,shape2=shape2ML))
pSpecial + statSpecial
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 2 rows containing missing values (geom_bar).

Gamma is the best fitting distribution.

Simulate New Data Set

#generate data using gamma
z <- rgamma(n=288,
            shape=3.568224649,
            rate=0.101954357)
z <- data.frame(1:288,z)
names(z) <- list("ID","Height_cm")
z <- z[z$Height_cm>0,]
str(z)
## 'data.frame':    288 obs. of  2 variables:
##  $ ID       : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Height_cm: num  16.1 35.1 30.1 23.3 74 ...
summary(z$Height_cm)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   7.245  22.132  33.393  36.396  45.732 115.216
#plot histogram
p1 <- ggplot(data=z, aes(x=Height_cm, y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) 
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#add probability density curve
p1 <-  p1 +  geom_density(linetype="dotted",size=0.75)
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Histogram of Original Data

z <- read.table("Cleaned2021VegetationData.csv",header=TRUE,sep=",")
p1 <- ggplot(data=z, aes(x=Height_cm, y=..density..)) +
  geom_histogram(color="grey60",fill="cornsilk",size=0.2) 
print(p1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Questions

How do the two histogram profiles compare? Do you think the model is doing a good job of simulating realistic data that match your original measurements? Why or why not?

The actual data has more of a bimodal distribution so it deviates from the simulated one which is unimodal skewed right, but for the left hand side of the plot it is similar.