Class Size Regression Analysis

Load Packages

# note I have already installed all of these here!

library(tidyverse) # always! 
── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
✔ ggplot2 3.3.6      ✔ purrr   0.3.4 
✔ tibble  3.1.8      ✔ dplyr   1.0.10
✔ tidyr   1.2.1      ✔ stringr 1.4.1 
✔ readr   2.1.2      ✔ forcats 0.5.2 
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
library(haven) # for importing .dta files
library(broom) # for tidy regression analysis

Import the Data

# need haven loaded (done in first chunk)
ca_school <- read_dta("https://metricsF22.classes.ryansafner.com/files/data/CASchool.dta")

Explore the Data

ca_school
ca_school %>%
  glimpse()
Rows: 420
Columns: 21
$ observat <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
$ dist_cod <dbl> 75119, 61499, 61549, 61457, 61523, 62042, 68536, 63834, 62331…
$ county   <chr> "Alameda", "Butte", "Butte", "Butte", "Butte", "Fresno", "San…
$ district <chr> "Sunol Glen Unified", "Manzanita Elementary", "Thermalito Uni…
$ gr_span  <chr> "KK-08", "KK-08", "KK-08", "KK-08", "KK-08", "KK-08", "KK-08"…
$ enrl_tot <dbl> 195, 240, 1550, 243, 1335, 137, 195, 888, 379, 2247, 446, 987…
$ teachers <dbl> 10.90, 11.15, 82.90, 14.00, 71.50, 6.40, 10.00, 42.50, 19.00,…
$ calw_pct <dbl> 0.5102, 15.4167, 55.0323, 36.4754, 33.1086, 12.3188, 12.9032,…
$ meal_pct <dbl> 2.0408, 47.9167, 76.3226, 77.0492, 78.4270, 86.9565, 94.6237,…
$ computer <dbl> 67, 101, 169, 85, 171, 25, 28, 66, 35, 0, 86, 56, 25, 0, 31, …
$ testscr  <dbl> 690.80, 661.20, 643.60, 647.70, 640.85, 605.55, 606.75, 609.0…
$ comp_stu <dbl> 0.34358975, 0.42083332, 0.10903226, 0.34979424, 0.12808989, 0…
$ expn_stu <dbl> 6384.911, 5099.381, 5501.955, 7101.831, 5235.988, 5580.147, 5…
$ str      <dbl> 17.88991, 21.52466, 18.69723, 17.35714, 18.67133, 21.40625, 1…
$ avginc   <dbl> 22.690001, 9.824000, 8.978000, 8.978000, 9.080333, 10.415000,…
$ el_pct   <dbl> 0.000000, 4.583333, 30.000002, 0.000000, 13.857677, 12.408759…
$ read_scr <dbl> 691.6, 660.5, 636.3, 651.9, 641.8, 605.7, 604.5, 605.5, 608.9…
$ math_scr <dbl> 690.0, 661.9, 650.9, 643.5, 639.9, 605.4, 609.0, 612.5, 616.1…
$ aowijef  <dbl> 35.77982, 43.04933, 37.39445, 34.71429, 37.34266, 42.81250, 3…
$ es_pct   <dbl> 1.000000, 3.583333, 29.000002, 1.000000, 12.857677, 11.408759…
$ es_frac  <dbl> 0.01000000, 0.03583334, 0.29000002, 0.01000000, 0.12857677, 0…
# scatterplot
scatter <- ggplot(data = ca_school)+
  aes(x = str,
      y = testscr)+
  geom_point(color = "blue")+
  labs(x = "Student to Teacher Ratio",
       y = "Test Score")+
  theme_bw(base_family = "Fira Sans Condensed",
           base_size = 20)

scatter

Regression

# run regression of testscr on str
school_reg <- lm(testscr ~ str, 
                 data = ca_school)

# look at reg object
school_reg 

Call:
lm(formula = testscr ~ str, data = ca_school)

Coefficients:
(Intercept)          str  
     698.93        -2.28  
# get full summary
school_reg %>% summary()

Call:
lm(formula = testscr ~ str, data = ca_school)

Residuals:
    Min      1Q  Median      3Q     Max 
-47.727 -14.251   0.483  12.822  48.540 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept) 698.9330     9.4675  73.825  < 2e-16 ***
str          -2.2798     0.4798  -4.751 2.78e-06 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 18.58 on 418 degrees of freedom
Multiple R-squared:  0.05124,   Adjusted R-squared:  0.04897 
F-statistic: 22.58 on 1 and 418 DF,  p-value: 2.783e-06
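As a quick check on the math, the bivariate OLS estimates can be reproduced by hand: the slope is the sample covariance of str and testscr divided by the variance of str, and the intercept is the mean of testscr minus the slope times the mean of str. A minimal sketch (the beta_0 and beta_1 names are just illustrative labels):

# verify the OLS coefficients manually
ca_school %>%
  summarize(beta_1 = cov(str, testscr) / var(str), # slope: cov(x,y) / var(x)
            beta_0 = mean(testscr) - beta_1 * mean(str)) # intercept: ybar - slope * xbar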
# scatterplot with regression line
scatter + geom_smooth(method = "lm", color = "red")
`geom_smooth()` using formula 'y ~ x'

Tidy Regression

# need broom loaded (done in first chunk)

# tidy regression output
school_reg %>% 
  tidy()
# now with confidence intervals
school_reg_tidy <- school_reg %>%
  tidy(conf.int = TRUE)

school_reg_tidy
# look at regression statistics and diagnostics
school_reg %>% 
  glance()
# add regression-based values to data
aug_reg <- school_reg %>% 
  augment()

aug_reg

Predictions

some_district <- tibble(str = 18) # make a dataframe of "new data"

some_district # look at it just to see
predict(school_reg, # regression lm object
        newdata = some_district) # a dataframe of new data
       1 
657.8964 
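This matches what you get by plugging str = 18 into the estimated line by hand: 698.93 - 2.28 * 18 is about 657.9. A minimal check using the stored coefficients:

# the same prediction computed manually from the coefficients
coef(school_reg)[1] + coef(school_reg)[2] * 18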

Presenting Results

library(modelsummary)
modelsummary(models = list("Test Score" = school_reg),
             fmt = 2, # round to 2 decimals
             output = "html",
             coef_rename = c("(Intercept)" = "Constant",
                             "str" = "STR"),
             gof_map = list(
               list("raw" = "nobs", "clean" = "n", "fmt" = 0),
               list("raw" = "r.squared", "clean" = "R<sup>2</sup>", "fmt" = 2),
               #list("raw" = "adj.r.squared", "clean" = "Adj. R<sup>2</sup>", "fmt" = 2),
               list("raw" = "rmse", "clean" = "SER", "fmt" = 2)
             ),
             escape = FALSE,
             stars = c('*' = .1, '**' = .05, '***' = 0.01)
)
                Test Score
Constant        698.93***
                (9.47)
STR             −2.28***
                (0.48)
n               420
R²              0.05
SER             18.54
* p < 0.1, ** p < 0.05, *** p < 0.01
modelplot(school_reg,
          coef_omit = 'Intercept') # don't show intercept

school_reg %>%
  tidy(conf.int = TRUE) %>%
  filter(term == "str") %>%
  ggplot()+
  aes(x = estimate,
      y = term)+
  geom_point(size = 3)+
  geom_errorbarh(aes(xmin = conf.low, xmax = conf.high),
                 height = 0.1)+ # height of whiskers
  theme_light()

Diagnostics

ggplot(data = aug_reg)+
  aes(x = .resid)+
  geom_histogram(color="white", fill = "pink")+
  labs(x = expression(paste("Residual, ", hat(u))))+
  scale_y_continuous(limits = c(0, 40),
                     expand = c(0,0))+
  theme_light(base_family = "Fira Sans Condensed",
           base_size=20)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

aug_reg %>%
  summarize(E_u = mean(.resid),
            sd_u = sd(.resid))
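By construction, the residuals average to (essentially) zero, and their standard deviation is close to the residual standard error reported in the regression summary (18.58).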
ggplot(data = aug_reg)+
  aes(x = .fitted,
      y = .resid)+
  geom_point(color = "blue")+
  geom_hline(aes(yintercept = 0), color = "red")+
  labs(x = expression(paste("Predicted Test Score,", hat(y)[i])),
       y = expression(paste("Residual, ", hat(u)[i])))+
  theme_light(base_family = "Fira Sans Condensed",
           base_size = 20)

Heteroskedasticity

library(lmtest)
Loading required package: zoo

Attaching package: 'zoo'
The following objects are masked from 'package:base':

    as.Date, as.Date.numeric
school_reg %>% bptest()

    studentized Breusch-Pagan test

data:  .
BP = 5.7936, df = 1, p-value = 0.01608
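With a p-value of about 0.016 we reject the null hypothesis of homoskedasticity at the 5% level, so heteroskedasticity-robust standard errors are warranted.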
library(estimatr)
school_reg_robust <- lm_robust(testscr ~ str, data = ca_school,
                              se_type = "stata")

school_reg_robust
              Estimate Std. Error   t value      Pr(>|t|)   CI Lower   CI Upper
(Intercept) 698.932952 10.3643599 67.436191 9.486678e-227 678.560192 719.305713
str          -2.279808  0.5194892 -4.388557  1.446737e-05  -3.300945  -1.258671
             DF
(Intercept) 418
str         418
school_reg_robust %>% summary()

Call:
lm_robust(formula = testscr ~ str, data = ca_school, se_type = "stata")

Standard error type:  HC1 

Coefficients:
            Estimate Std. Error t value   Pr(>|t|) CI Lower CI Upper  DF
(Intercept)   698.93    10.3644  67.436 9.487e-227  678.560  719.306 418
str            -2.28     0.5195  -4.389  1.447e-05   -3.301   -1.259 418

Multiple R-squared:  0.05124 ,  Adjusted R-squared:  0.04897 
F-statistic: 19.26 on 1 and 418 DF,  p-value: 1.447e-05
school_reg_robust %>% tidy()
school_reg_robust %>% glance()
modelsummary(models = list("Normal SE" = school_reg,
                           "Robust SE" = school_reg_robust),
             fmt = 2, # round to 2 decimals
             output = "html",
             coef_rename = c("(Intercept)" = "Constant",
                             "str" = "STR"),
             gof_map = list(
               list("raw" = "nobs", "clean" = "n", "fmt" = 0),
               list("raw" = "r.squared", "clean" = "R<sup>2</sup>", "fmt" = 2),
               #list("raw" = "adj.r.squared", "clean" = "Adj. R<sup>2</sup>", "fmt" = 2),
               list("raw" = "rmse", "clean" = "SER", "fmt" = 2)
             ),
             escape = FALSE,
             stars = c('*' = .1, '**' = .05, '***' = 0.01)
)
                Normal SE    Robust SE
Constant        698.93***    698.93***
                (9.47)       (10.36)
STR             −2.28***     −2.28***
                (0.48)       (0.52)
n               420          420
R²              0.05         0.05
SER             18.54        18.54
* p < 0.1, ** p < 0.05, *** p < 0.01

Multivariate Regression (4.1)

Correlation Table

ca_school %>%
  select("str","testscr","el_pct", "expn_stu") %>%
  cor()
                str    testscr      el_pct    expn_stu
str       1.0000000 -0.2263628  0.18764237 -0.61998215
testscr  -0.2263628  1.0000000 -0.64412374  0.19127277
el_pct    0.1876424 -0.6441237  1.00000000 -0.07139604
expn_stu -0.6199821  0.1912728 -0.07139604  1.00000000
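Note that str is positively correlated with el_pct (0.19) and el_pct is strongly negatively correlated with testscr (-0.64), exactly the pattern that generates omitted variable bias in the simple regression of testscr on str.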

Conditional Distribution (of Test Score) by Low/High ESL

# make a new variable called ESL
# = "High ESL" (if el_pct is above median) or "Low ESL" (if below)
ca_school <- ca_school %>% # next we create a new dummy variable called ESL
  mutate(ESL = ifelse(el_pct > median(el_pct), # test if el_pct is above its median
                     yes = "High ESL", # if yes, set ESL to "High ESL"
                     no = "Low ESL")) # if no, set ESL to "Low ESL"

# get average test score by high/low ESL
ca_school %>%
  group_by(ESL) %>%
  summarize(Average_test_score=mean(testscr))
# make distributions
ggplot(data = ca_school)+
  aes(x = testscr,
      fill = ESL)+
  geom_density(alpha=0.5)+
  labs(x = "Test Score",
       y = "Density")+
  theme_bw(
    base_family = "Fira Sans Condensed",
    base_size=20
    )+
  theme(legend.position = "bottom")

# make scatterplot

esl_scatter<-ggplot(data = ca_school)+
  aes(x = str,
      y = testscr,
      color = ESL)+
  geom_point()+
  geom_smooth(method="lm")+
  labs(x = "STR",
       y = "Test Score")+
  theme_bw(
    base_family = "Fira Sans Condensed",
    base_size=20
    )+
  theme(legend.position = "bottom")
esl_scatter
`geom_smooth()` using formula 'y ~ x'

# facet instead

esl_scatter+
  facet_grid(~ESL)+ #<<
  guides(color = "none") #<<
`geom_smooth()` using formula 'y ~ x'

Multivariate Regression (with %EL)

# run regression of testscr on str and el_pct
elreg <- lm(testscr ~ str + el_pct, 
                 data = ca_school)

# look at it (just gives coefficients)
elreg

Call:
lm(formula = testscr ~ str + el_pct, data = ca_school)

Coefficients:
(Intercept)          str       el_pct  
   686.0322      -1.1013      -0.6498  
# get summary
elreg %>% summary()

Call:
lm(formula = testscr ~ str + el_pct, data = ca_school)

Residuals:
    Min      1Q  Median      3Q     Max 
-48.845 -10.240  -0.308   9.815  43.461 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 686.03225    7.41131  92.566  < 2e-16 ***
str          -1.10130    0.38028  -2.896  0.00398 ** 
el_pct       -0.64978    0.03934 -16.516  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 14.46 on 417 degrees of freedom
Multiple R-squared:  0.4264,    Adjusted R-squared:  0.4237 
F-statistic:   155 on 2 and 417 DF,  p-value: < 2.2e-16
# do it tidier with broom
elreg %>% tidy()
# get regression fit statistics
elreg %>% glance()
# make a regression output table
modelsummary(models = list("Simple Model" = school_reg,
                           "Multivariate Model" = elreg),
             fmt = 2, # round to 2 decimals
             output = "html",
             coef_rename = c("(Intercept)" = "Constant",
                             "str" = "STR",
                             "el_pct" = "% ESL Students"),
             gof_map = list(
               list("raw" = "nobs", "clean" = "N", "fmt" = 0),
               #list("raw" = "r.squared", "clean" = "R<sup>2</sup>", "fmt" = 2),
               list("raw" = "adj.r.squared", "clean" = "Adj. R<sup>2</sup>", "fmt" = 2),
               list("raw" = "rmse", "clean" = "SER", "fmt" = 2)
             ),
             escape = FALSE,
             stars = c('*' = .1, '**' = .05, '***' = 0.01)
)
                 Simple Model    Multivariate Model
Constant         698.93***       686.03***
                 (9.47)          (7.41)
STR              −2.28***        −1.10***
                 (0.48)          (0.38)
% ESL Students                   −0.65***
                                 (0.04)
N                420             420
Adj. R²          0.05            0.42
SER              18.54           14.41
* p < 0.1, ** p < 0.05, *** p < 0.01
library(modelsummary)
modelplot(elreg,  # our regression object
          coef_omit = 'Intercept') # don't show intercept

Measuring OVB

# "true" regression (str and %EL)
true<-lm(testscr~str+el_pct, data=ca_school)
true %>% tidy()
# "omitted" regression (omitting %EL)
omitted<-lm(testscr~str, data=ca_school)
omitted %>% tidy()
# "auxiliary" regression (%EL on str)
aux<-lm(el_pct~str, data=ca_school)
aux %>% tidy()
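The omitted variable bias formula says the bias in the str coefficient equals the el_pct coefficient from the "true" regression times the str coefficient from the auxiliary regression. A minimal numerical check (the object names below are just for illustration):

# pull out the relevant coefficients with broom + dplyr
alpha_1 <- tidy(omitted) %>% filter(term == "str") %>% pull(estimate) # biased str coefficient
beta_1 <- tidy(true) %>% filter(term == "str") %>% pull(estimate) # str coefficient controlling for el_pct
beta_2 <- tidy(true) %>% filter(term == "el_pct") %>% pull(estimate) # el_pct coefficient
delta_1 <- tidy(aux) %>% filter(term == "str") %>% pull(estimate) # str coefficient in auxiliary regression

beta_2 * delta_1 # bias implied by the OVB formula
alpha_1 - beta_1 # actual gap between the biased and "true" estimates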

Variance and VIF

library(car) # needs car package
Loading required package: carData

Attaching package: 'car'
The following object is masked from 'package:dplyr':

    recode
The following object is masked from 'package:purrr':

    some
true %>% vif()
     str   el_pct 
1.036495 1.036495 
# calculate VIF manually

# look at aux reg stats for R^2
# extract our R-squared from aux regression (R_j^2)
aux_r_sq<-glance(aux) %>%
  select(r.squared)

aux_r_sq # look at it
# calculate VIF manually

our_vif<-1/(1-aux_r_sq) # VIF formula 

our_vif
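Since the VIF multiplies the variance of the coefficient, the standard error is inflated by a factor of sqrt(VIF); with a VIF of about 1.04, that inflation is negligible here.

# implied standard error inflation factor (should be close to 1 here)
sqrt(our_vif)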