# Q1, part (c) -------------

set.seed(1)

# Set the parameters of the experiment:
nrep <- 10000
n <- 100
beta0 <- 2
beta1 <- -4

# Create the vectors to store the OLS estimates:
b0 <- b1 <- numeric(nrep)

# Generate the x data:
x <- runif(n)

# Start the simulation loop:
for (i in 1:nrep) {
  # Generate the y data. Note that epsilon will have mean = 2.
  y <- beta0 + beta1 * x + rnorm(n, mean = 2)
  # Estimate OLS:
  ols <- lm(y ~ x)
  # Store the OLS estimates:
  b0[i] <- ols$coefficients[1]
  b1[i] <- ols$coefficients[2]
}

# Examine the simulated bias of the intercept and slope estimators:
mean(b0) - beta0
mean(b1) - beta1
# We notice that the bias of b0 is close to the mean of epsilon, while
# the bias of b1 is close to zero.

# Q2, part (b) -------------

# We will use a similar data generating process as above, except we
# will generate a new 'x2' variable 10000 times, add it to the regression,
# and record the value of R-squared before and after the addition of the
# new variable.

# Empty vectors to store R-squared:
R2before <- R2after <- numeric(nrep)

for (i in 1:nrep) {
  # Generate x2. It will not be used to generate y.
  x2 <- rnorm(n)
  # Generate the y data. Note that epsilon has mean = 0 as usual.
  y <- beta0 + beta1 * x + rnorm(n)
  # Estimate OLS before and after the addition of x2:
  olsbefore <- lm(y ~ x)
  olsafter  <- lm(y ~ x + x2)
  # Store the R-squared values:
  R2before[i] <- summary(olsbefore)$r.squared
  R2after[i]  <- summary(olsafter)$r.squared
}

# Count how many times R-squared is higher with the addition of the
# (irrelevant) variable (we will see that R2after is always higher):
sum(R2after > R2before)

# Q3, part (c) -------------

# Empty vectors to store the slope estimators:
b1.tilde <- b1.ols <- numeric(nrep)

for (i in 1:nrep) {
  # Generate the y data.
  y <- beta0 + beta1 * x + rnorm(n)
  # Calculate and store the new estimator:
  b1.tilde[i] <- (mean(y[((n / 2) + 1):n]) - mean(y[1:(n / 2)])) /
    (mean(x[((n / 2) + 1):n]) - mean(x[1:(n / 2)]))
  # Estimate and store the OLS slope estimator:
  b1.ols[i] <- lm(y ~ x)$coefficients[2]
}

# See that the simulated variance of the new estimator is indeed
# higher than the variance of OLS:
var(b1.tilde) > var(b1.ols)

# Q4, part (b) -------------

# A sufficient condition for an estimator to be consistent is that both its
# bias and its variance approach zero as the sample size grows. Hence, we will
# simulate these two estimators using a very large sample size.
n <- 5000
x <- runif(n)

# Empty vectors to store the estimators of sigma-squared:
s2 <- sig.tilde2 <- numeric(nrep)

for (i in 1:nrep) {
  # Generate the y data. Note that sigma-squared = 4.
  y <- beta0 + beta1 * x + rnorm(n, sd = 2)
  # Estimate OLS:
  olsQ4 <- lm(y ~ x)
  # Extract the sum of squared residuals:
  SSE <- sum(olsQ4$residuals ^ 2)
  # Calculate and store the two estimators of sigma-squared:
  s2[i] <- SSE / (n - 2)
  sig.tilde2[i] <- SSE / n
}

# Now, we examine the simulated bias and variance of these two
# estimators, and see that both are close to zero for large n:
mean(s2) - 4
var(s2)
mean(sig.tilde2) - 4
var(sig.tilde2)
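# (A quick sanity check, not part of the original exercise:) under the
# classical assumptions E[SSE] = (n - 2) * sigma^2, so the theoretical bias of
# sig.tilde2 = SSE / n is -2 * sigma^2 / n. With sigma^2 = 4 and n = 5000 this
# is tiny and should be close to the simulated bias reported above:
-2 * 4 / n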
# Q5, part (b) -------------

n <- 100
beta0 <- 2
beta1 <- -2
# It will turn out that the higher the absolute value of beta2, the larger
# the bias in b1.
beta2 <- 3
# The "p" parameter will control the strength of the correlation between x1
# and x2. It will turn out that the higher the "p", the higher the bias of b1.
p <- 0.5

# Generate x1 and x2. Note that x1 depends on x2.
x2 <- runif(n)
x1 <- p * x2 + runif(n)

# Create empty vectors to store the OLS estimators:
b0 <- b1 <- numeric(nrep)

for (i in 1:nrep) {
  # Generate the y variable. Note that x2 determines y.
  y <- beta0 + beta1 * x1 + beta2 * x2 + rnorm(n)
  # Estimate OLS. Note that the estimated model does not include x2.
  olsQ5 <- lm(y ~ x1)
  # Store the OLS estimates:
  b0[i] <- olsQ5$coefficients[1]
  b1[i] <- olsQ5$coefficients[2]
}

# Check the bias of the estimators (it will depend on "p" and "beta2"):
mean(b0) - beta0
mean(b1) - beta1
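# (A quick check, not part of the original exercise:) because x1 and x2 are
# held fixed across replications, the expected omitted-variable bias of b1 is
# beta2 times the slope from regressing x2 on x1, i.e.
# beta2 * Cov(x1, x2) / Var(x1). This should match the simulated bias above:
beta2 * cov(x1, x2) / var(x1)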