# Q1, part (c) -------------

set.seed(1)

# Set the parameters of the experiment:
nrep <- 10000
n <- 100
beta0 <- 2
beta1 <- -4

# Create the vectors to store the OLS estimates:
b0 <- b1 <- numeric(nrep)

# Generate the x data:
x <- runif(n)

# Start the simulation loop:
for (i in 1:nrep) {
  # Generate the y data. Note that epsilon will have mean = 2.
  y <- beta0 + beta1 * x + rnorm(n, mean = 2)
  # Estimate OLS:
  ols <- lm(y ~ x)
  # Store the OLS estimates:
  b0[i] <- ols$coefficients[1]
  b1[i] <- ols$coefficients[2]
}

# Examine the simulated bias of the intercept and slope estimators:
mean(b0) - beta0
mean(b1) - beta1
# We notice that the bias of b0 is close to the mean of epsilon, while
# the bias of b1 is close to zero.

# Q2, part (b) -------------

# We will use a similar data generating process as above, except we
# will generate a new 'x2' variable 10000 times, add it to the regression,
# and record the value of R-squared before and after the addition of the
# new variable.

# Empty vectors to store R-squared:
R2before <- R2after <- numeric(nrep)

for (i in 1:nrep) {
  # Generate x2. It will not be used to generate y.
  x2 <- rnorm(n)
  # Generate the y data. Note that epsilon has mean = 0 as usual.
  y <- beta0 + beta1 * x + rnorm(n)
  # Estimate OLS before and after the addition of x2:
  olsbefore <- lm(y ~ x)
  olsafter  <- lm(y ~ x + x2)
  # Store the R-squared values:
  R2before[i] <- summary(olsbefore)$r.squared
  R2after[i]  <- summary(olsafter)$r.squared
}

# Count how many times R-squared is higher with the addition of the
# (irrelevant) variable (we will see that R2after is always higher):
sum(R2after > R2before)

# Q3, part (c) -------------

# Empty vectors to store the slope estimators:
b1.tilde <- b1.ols <- numeric(nrep)

for (i in 1:nrep) {
  # Generate the y data.
  y <- beta0 + beta1 * x + rnorm(n)
  # Calculate and store the new estimator:
  b1.tilde[i] <- (mean(y[((n / 2) + 1):n]) - mean(y[1:(n / 2)])) /
    (mean(x[((n / 2) + 1):n]) - mean(x[1:(n / 2)]))
  # Estimate and store the OLS slope estimator:
  b1.ols[i] <- lm(y ~ x)$coefficients[2]
}

# See that the simulated variance of the new estimator is indeed
# higher than the variance of OLS:
var(b1.tilde) > var(b1.ols)

# Q4, part (b) -------------

# A sufficient condition for an estimator to be consistent is that both its
# bias and its variance approach zero as the sample size grows. Hence, we will
# simulate these two estimators using a very large sample size.
n <- 5000
x <- runif(n)

# Empty vectors to store the estimators of sigma-squared:
s2 <- sig.tilde2 <- numeric(nrep)

for (i in 1:nrep) {
  # Generate the y data. Note that sigma-squared = 4.
  y <- beta0 + beta1 * x + rnorm(n, sd = 2)
  # Estimate OLS:
  olsQ4 <- lm(y ~ x)
  # Extract the sum of squared residuals:
  SSE <- sum(olsQ4$residuals ^ 2)
  # Calculate and store the two estimators of sigma-squared:
  s2[i] <- SSE / (n - 2)
  sig.tilde2[i] <- SSE / n
}

# Now, we examine the simulated bias and variance of these two
# estimators, and see that both are close to zero for large n:
mean(s2) - 4
var(s2)
mean(sig.tilde2) - 4
var(sig.tilde2)
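# (A quick sanity check, not part of the original exercise:) under the
# classical assumptions E[SSE] = (n - 2) * sigma^2, so the theoretical bias of
# sig.tilde2 = SSE / n is -2 * sigma^2 / n. With sigma^2 = 4 and n = 5000 this
# is tiny and should be close to the simulated bias reported above:
-2 * 4 / n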
# Q5, part (b) -------------

n <- 100
beta0 <- 2
beta1 <- -2
# It will turn out that the higher the absolute value of beta2, the larger
# the bias in b1.
beta2 <- 3
# The "p" parameter will control the strength of the correlation between x1
# and x2. It will turn out that the higher the "p", the higher the bias of b1.
p <- 0.5

# Generate x1 and x2. Note that x1 depends on x2.
x2 <- runif(n)
x1 <- p * x2 + runif(n)

# Create empty vectors to store the OLS estimators:
b0 <- b1 <- numeric(nrep)

for (i in 1:nrep) {
  # Generate the y variable. Note that x2 determines y.
  y <- beta0 + beta1 * x1 + beta2 * x2 + rnorm(n)
  # Estimate OLS. Note that the estimated model does not include x2.
  olsQ5 <- lm(y ~ x1)
  # Store the OLS estimates:
  b0[i] <- olsQ5$coefficients[1]
  b1[i] <- olsQ5$coefficients[2]
}

# Check the bias of the estimators (it will depend on "p" and "beta2"):
mean(b0) - beta0
mean(b1) - beta1
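# (A quick check, not part of the original exercise:) because x1 and x2 are
# held fixed across replications, the expected omitted-variable bias of b1 is
# beta2 times the slope from regressing x2 on x1, i.e.
# beta2 * Cov(x1, x2) / Var(x1). This should match the simulated bias above:
beta2 * cov(x1, x2) / var(x1)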