Homework 8

Study used

The long-term response of phosphorus concentrations is modeled by the equation Y = 53.74 – 47.04 X for the Missisquoi Bay where Y = predicted bay annual average total phosphorus concentration (µg/L) 70 years after watershed load reduction is achieved X = percent (as decimal fraction) watershed phosphorus load reduction from base period rates

Creating the simulated data for 50% phosphorus loading capacity

beta1 <- 53.74 # slope
beta0 <- 47.04 # intercept
noise <- rnorm(n=100, mean=0, sd=0.05) # adds variation to the data set
x_normalP <- rnorm(n=100, mean=0.50, sd=0.050)
y_normalP <- beta0 + beta1*x_normalP +noise
head(x_normalP)
## [1] 0.4952682 0.5528420 0.5701698 0.4606714 0.4267123 0.5407199
head(y_normalP)
## [1] 73.60135 76.73353 77.65101 71.80613 70.02512 76.08037

Other data sets for 75%, 25%, and 50% with low SD, respectively

x_highP <- rnorm(n=100, mean=0.75, sd=0.050)
y_highP <- beta0 + beta1*x_highP + noise
head(x_highP)
## [1] 0.8176820 0.7113466 0.7933653 0.7578386 0.7546398 0.8473084
head(y_highP)
## [1] 90.92787 85.25157 89.64553 87.77589 87.64795 92.55644
x_lowP <- rnorm(n=100, mean=0.25, sd=0.050)
y_lowP <- beta0 + beta1*x_lowP + noise
head(x_lowP)
## [1] 0.2134774 0.2844823 0.3035201 0.1663823 0.2924710 0.2693256
head(y_lowP)
## [1] 58.45792 62.31188 63.32126 55.99103 62.81099 61.49564
x_lowsdP <- rnorm(n=100, mean=0.50, sd=0.005)
y_lowsdP <- beta0 + beta1*x_lowsdP + noise
head(x_lowsdP)
## [1] 0.4957265 0.4930910 0.5009467 0.4932078 0.4922352 0.5050748
head(y_lowsdP)
## [1] 73.62598 73.52251 73.93096 73.55463 73.54633 74.16480

Creating a data frame

d_frame <- data.frame(x_normalP,y_normalP,x_highP,y_highP,x_lowP,y_lowP,x_lowsdP,y_lowsdP)

Data Analysis with Linear Regression of 50% P

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.1.2
reg_model <- lm(y_normalP~x_normalP,data=d_frame)

slope <- summary(reg_model)$coefficients[1,1]
intercept <- summary(reg_model)$coefficients[1,2]
slope == beta0
## [1] FALSE
intercept == beta1
## [1] FALSE
reg_plot <- ggplot(data=d_frame, aes(x=x_normalP, y=y_normalP)) +
            geom_point() +
            stat_smooth(method=lm,se=0.95)
print(reg_plot)
## `geom_smooth()` using formula 'y ~ x'

Different Means

75% P

reg_model <- lm(y_highP~x_highP,data=d_frame)

slope <- summary(reg_model)$coefficients[1,1]
intercept <- summary(reg_model)$coefficients[1,2]
slope == beta0
## [1] FALSE
intercept == beta1
## [1] FALSE
reg_plot <- ggplot(data=d_frame, aes(x=x_highP, y=y_highP)) +
            geom_point() +
            stat_smooth(method=lm,se=0.95)
print(reg_plot)
## `geom_smooth()` using formula 'y ~ x'

25% P

reg_model <- lm(y_lowP~x_lowP,data=d_frame)

slope <- summary(reg_model)$coefficients[1,1]
intercept <- summary(reg_model)$coefficients[1,2]
slope == beta0
## [1] FALSE
intercept == beta1
## [1] FALSE
reg_plot <- ggplot(data=d_frame, aes(x=x_lowP, y=y_lowP)) +
            geom_point() +
            stat_smooth(method=lm,se=0.95)
print(reg_plot)
## `geom_smooth()` using formula 'y ~ x'

Different Standard Deviation

low SD, 50% P

reg_model <- lm(y_lowsdP~x_lowsdP,data=d_frame)

slope <- summary(reg_model)$coefficients[1,1]
intercept <- summary(reg_model)$coefficients[1,2]
slope == beta0
## [1] FALSE
intercept == beta1
## [1] FALSE
reg_plot <- ggplot(data=d_frame, aes(x=x_lowsdP, y=y_lowsdP)) +
            geom_point() +
            stat_smooth(method=lm,se=0.95)
print(reg_plot)
## `geom_smooth()` using formula 'y ~ x'

Different Sample Size

n=10

x_small <- rnorm(n=10, mean=0.50, sd=0.050)
y_small <- beta0 + beta1*x_small + noise
head(x_small)
## [1] 0.4576254 0.5632679 0.5789608 0.5067136 0.5003470 0.5116168
head(y_small)
## [1] 71.57843 77.29382 78.12344 74.28044 73.98225 74.51637
d_frame <- data.frame(x_small,y_small)

reg_model <- lm(y_small~x_small,data=d_frame)

slope <- summary(reg_model)$coefficients[1,1]
intercept <- summary(reg_model)$coefficients[1,2]
slope == beta0
## [1] FALSE
slope
## [1] 46.94899
intercept == beta1
## [1] FALSE
reg_plot <- ggplot(data=d_frame, aes(x=x_small, y=y_small)) +
            geom_point() +
            stat_smooth(method=lm,se=0.95)
print(reg_plot)
## `geom_smooth()` using formula 'y ~ x'

The slope is still very close to the original at this small sample size. I added noise for all simulations above to try and make the regression less close to the original equation.