Data Sources

For this analysis, I combine multiple sources produced by the DC government. The DC FY19 Economic Development Return on Investment Data spreadsheet data set pulls together economic development investments from across a variety of agencies and programs in the DC government for FY15-19. The data set was created by the DC Deputy Mayor for Planning and Economic Development (DMPED) as required by the Economic Development Return on Investment Accountability Amendment Act of 2018. Fore more information see DMPED’s FY19 Economic Development Return on Investment Accountability Report.

The second source is a database of affordable housing projects projects maintained by the DC Department of Housing and Community Development. These projects span a number of years, but most are after 2010.

The final source is a data set of affordable housing production and preservation projects that encompasses comprehensively covers affordable housing projects which started or completed since January of 2015. The data includes affordable housing projects (production and preservation, rental and for-sale) which were subsidized by DMPED, DHCD, DCHFA, or DCHA, and those which were produced as a result of Planned Unit Development (PUD) proffers or Inclusionary Zoning (IZ) requirements. This data was used only for mapping affordable housing projects.

Github Repo: DSPP Final Project

Data Import and Cleaning

# Removing unneeded variables and making the variable names easier to work with 
dc_housing_investment <- read_csv("data/dc-housing-investment.csv") %>% 
  select(-`Is recipient in compliance with District resident employment requirements?`, 
         -`Amenity Produced or Enhanced`,
         -`Affordable Housing Compliance?  (Covenanted)`,
         -`CBE Compliant?`,
         -`Additional Incentive Benefits`,
         -`Notes`,
         -`Incentive Type`,
         -`Agency`) %>% 
  rename(
    dmped_year = `FY`,
    incentive_name = `Incentive Name`,
    recipient_name = `Recipient Name`,
    incentive_amount = `Incentive Amount`, 
    investment_address = `MAR Address of Investment`,
    ward = `MAR_Ward`,
    ami_30 = `30% AMI`,
    ami_50 = `50% AMI`,
    ami_60 = `60% AMI`,
    ami_80 = `80% AMI`,
    one_br = `1 BRs`,
    two_br = `2 BRs`,
    three_br = `3 BRs`,
    four_plus_br = `4+ BRs`,
    total_dmped_affordable = `Total Affordable Units Produced or Preserved`,
    dc_residents_employed = `Number of District Residents Employed`,
    num_cbes = `Number of CBEs`
    ) %>% 
  # Ensuring variables are consistent and converting to numeric
  # Note that the "HFA Revenue Bond Issuance" was renamed to "HFA Revenue Bond" in 2019 so this code makes all years consistent
  mutate(
    investment_address = str_to_upper(investment_address),
    ami_30 = as.numeric(ami_30),
    ami_50 = as.numeric(ami_50),
    ami_60 = as.numeric(ami_60),
    ami_80 = as.numeric(ami_80),
    one_br = as.numeric(one_br),
    two_br = as.numeric(two_br),
    three_br = as.numeric(three_br),
    four_plus_br = as.numeric(four_plus_br),
    total_dmped_affordable = as.numeric(total_dmped_affordable),
    total_bedrooms = one_br + 2*two_br + 3*three_br + 4*four_plus_br,
    # For both dc_residents_employed and num_cbes some observations have N/A values, while others have Not Available
    # I infer that N/A means "Not Applicable" which would reasonably translate to 0 for my purposes, whereas I replace Not Available values with NA
    num_cbes = as.numeric(str_replace_all(num_cbes, c("Not Available" = "NA", "N/A" = 0))),
    dc_residents_employed = as.numeric(str_replace_all(dc_residents_employed, c("Not Available" = "NA", "N/A" = 0))),
    incentive_name = str_replace_all(incentive_name, c("HFA Revenue Bond Issuance" = "HFARB", "HFA Revenue Bond" = "HFARB", "4% Low Income Housing Tax Credit" = "LIHTC", "9% Low Income Housing Tax Credit" = "LIHTC", "Community Development Block Grant \\(CDBG\\)" = "CDBG", "Creative and Open Space Modernization Grant" = "COSMG", "District of Columbia Business Capital Program" = "CBCP", "DMPED Grant" = "DMPED", "Economic Development Special Account" = "EDSA", "HOME Investment Partnerships Program" = "HOME", "Housing Preservation Fund" = "HPF", "Housing Production Trust Fund" = "HPTF", "Industrial Revenue Bond" = "IRB", "Land Disposition Agreement" = "LDA", "Legacy Business" = "LBZ", "Makerspace Marketplace Grant" = "MMG", "Neighborhood Prosperity Fund" = "NPF", "New Communities Initiative" = "NCI", "Qualified Supermarket Tax Incentive" = "TAX", "Real Property Tax Abatement" = "TAX", "Tax Exemption" = "TAX", "Tax Increment Financing" = "TAX")),
    dmped_lihtc_amount = if_else(incentive_name == "LIHTC", incentive_amount, 0),
    ward = na_if(ward, "N/A")
  ) %>% 
  # Standardizing quadrant parts of address 
  mutate_if(
    is.character, 
    str_replace_all, 
    pattern = "NORTHEAST", 
    replacement = "NE"
    ) %>% 
  mutate_if(
    is.character, 
    str_replace_all, 
    pattern = "SOUTHEAST", 
    replacement = "SE"
    ) %>% 
  mutate_if(
    is.character, 
    str_replace_all, 
    pattern = "NORTHWEST", 
    replacement = "NW"
    ) %>% 
  mutate_if(
    is.character, 
    str_replace_all, 
    pattern = "SOUTHWEST", 
    replacement = "SW"
    ) %>% 
  mutate_if(
    is.character, 
    str_replace_all, 
    pattern = "STREET", 
    replacement = "ST"
    ) %>% 
  mutate_if(
    is.character, 
    str_replace_all, 
    pattern = "ROAD", 
    replacement = "RD"
    ) %>% 
  mutate_if(
    is.character, 
    str_replace_all, 
    pattern = "PLACE", 
    replacement = "PL"
    ) %>% 
  mutate_if(
    is.character, 
    str_replace_all, 
    pattern = "AVENUE", 
    replacement = "AVE"
    )

# Removing commas and periods from addresses for consistency
dc_housing_investment$investment_address <- str_remove_all(dc_housing_investment$investment_address, "[,.]")
# Doing similar importing and cleaning for DHCD data set
affordable_proj_market_rate <- read_csv("data/affordable-projects-market-rate.csv") %>% 
  select(-`Projected or Actual Construction Completion Date`,
         -`External Project Website`,
         -`Project Type/Scope`) %>% 
  rename(
    investment_address = `Address`,
    total_units = `Total Units`,
    loan_grant_amount = `Loan/Grant Amount`,
    dhcd_lihtc_amount = `LIHTC Allocation`,
    ami_0_30 = `0-30% AMI units`,
    ami_31_50 = `31-50% AMI Units`,
    ami_51_80 = `51-80% AMI Units`,
    ami_81 = `81%+ AMI / Market Rate`,
    dhcd_year = `Projected or Actual Closing Date`,
    total_dhcd_affordable = `Affordable Units`,
    psh_units = `PSH Units`,
    funding_sources = `Funding Sources`
  ) %>%
  mutate(
    investment_address = str_to_upper(investment_address),
    market_rate_units = total_units - total_dhcd_affordable,
    total_units = as.numeric(total_units),
    loan_grant_amount = replace_na(as.numeric(gsub('[\\$,]', '', loan_grant_amount)), 0),
    dhcd_lihtc_amount = replace_na(as.numeric(gsub('[\\$,]', '', dhcd_lihtc_amount)), 0),
    dhcd_year = as.Date(dhcd_year, "%m/%d/%y"),
    total_investment = dhcd_lihtc_amount + loan_grant_amount
  ) %>% 
  separate(funding_sources, c("funding_source1", "funding_source2")) %>% 
  filter(!is.na(total_units)) %>% 
  mutate_if(
    is.character, 
    str_replace_all, 
    pattern = "NORTHEAST", 
    replacement = "NE"
    ) %>% 
  mutate_if(
    is.character, 
    str_replace_all, 
    pattern = "SOUTHEAST", 
    replacement = "SE"
    ) %>% 
  mutate_if(
    is.character, 
    str_replace_all, 
    pattern = "NORTHWEST", 
    replacement = "NW"
    ) %>% 
  mutate_if(
    is.character, 
    str_replace_all, 
    pattern = "SOUTHWEST", 
    replacement = "SW"
    ) %>% 
  mutate_if(
    is.character, 
    str_replace_all, 
    pattern = "STREET", 
    replacement = "ST"
    ) %>% 
  mutate_if(
    is.character, 
    str_replace_all, 
    pattern = "ROAD", 
    replacement = "RD"
    ) %>% 
  mutate_if(
    is.character, 
    str_replace_all, 
    pattern = "PLACE", 
    replacement = "PL"
    ) %>% 
  mutate_if(
    is.character, 
    str_replace_all, 
    pattern = "AVENUE", 
    replacement = "AVE"
    )

affordable_proj_market_rate$investment_address <- str_remove_all(affordable_proj_market_rate$investment_address, "[,.]")
# Getting total investment data by project for both data sets

investment_by_project <- affordable_proj_market_rate %>% 
  group_by(investment_address) %>% 
  summarize(total_dhcd_investment = sum(total_investment))

affordable_proj_market_rate <- 
  left_join(affordable_proj_market_rate, investment_by_project, by = "investment_address")

total_proj_data <- dc_housing_investment %>% 
  group_by(investment_address) %>% 
  summarize(total_dmped_investment = sum(incentive_amount))

dc_housing_investment <- 
  left_join(dc_housing_investment, total_proj_data, by = "investment_address")
# Merging the existing data set with market rate data set and recoding some variables
# Since I will be doing project-level analysis rather than investment-level, I remove duplicate entries by address

# The investment and unit data from the two data sets do not always line up
# I compare the investment amount and take the larger of the values assuming that the data set with a larger investment value is more up to date with recent funding allocation
# I do the same with the affordable unit data 
# Before comparing, I replace missing values with 0 for the corresponding variables from the other data set that are created when joining
# This process also affects projects in the DMPED data set that had NA values for total affordable housing; this makes sense because these NA values existed for investments that would not have been expected to produce any affordable housing (i.e. a business grant or tax exemption for a museum)
# I also align the AMI variables so all projects with data for these variables are using the same variable, with the DHCD data set as the default
# I then use the address variable to create a separate quadrant variable

combined_data <- 
  full_join(dc_housing_investment, affordable_proj_market_rate, by = "investment_address") %>% 
  distinct(investment_address, .keep_all= TRUE) %>% 
  mutate(total_dmped_investment = replace_na(total_dmped_investment, 0),
         total_dhcd_investment = replace_na(total_dhcd_investment, 0),
         total_dmped_affordable = replace_na(total_dmped_affordable, 0),
         total_dhcd_affordable = replace_na(total_dhcd_affordable, 0),
         dhcd_lihtc_amount = replace_na(dhcd_lihtc_amount, 0),
         dmped_lihtc_amount = replace_na(dmped_lihtc_amount, 0),
         total_proj_investment = if_else(total_dmped_investment > total_dhcd_investment, total_dmped_investment, total_dhcd_investment),
         total_proj_affordable = if_else(total_dmped_affordable > total_dhcd_affordable, total_dmped_affordable, total_dhcd_affordable),
         ami_0_30 = if_else((is.na(ami_0_30) & !is.na(ami_30)), ami_30, ami_0_30),
         ami_31_50 = if_else((is.na(ami_31_50) & !is.na(ami_50)), ami_50, ami_31_50),
         ami_51_80 = if_else((is.na(ami_51_80) & !is.na(ami_80) & !is.na(ami_60)), ami_60 + ami_80, ami_51_80),
         proj_funding_source = as.factor(if_else(is.na(incentive_name), funding_source1, incentive_name)),
         proj_lihtc_amount = if_else(dhcd_lihtc_amount > dmped_lihtc_amount, dhcd_lihtc_amount, dmped_lihtc_amount),
         investment_year = as.factor(if_else(is.na(dhcd_year) & !is.na(dmped_year), as.character(str_sub(dmped_year, -2, -1)), as.character(format(dhcd_year, '%y')))),
         quadrant = str_sub(investment_address, -2, -1),
         quadrant = if_else(quadrant %in% c("NE", "NW", "SW", "SE"), quadrant, "NA")
         )
# Plotting project investment, total unit production, and investment efficiency
# For this, I exclude projects that did not produce any housing (N = 147), projects without investment data (N = 85), and those that produced affordable units, but did not have clear data on total units for the project (N = 30)

total_unit_investment_data <- combined_data %>% 
  filter(!is.na(total_units) & total_proj_investment > 0) %>% 
  mutate(investment_per_unit = total_proj_investment/total_units)

Visualizing Project Investment and Affordable Unit Production

Given the limited local government funds available and the sizable need for housing in general, and affordable housing in particular, it is imperative that investments are made in a way that maximizes public benefit. Below I plot the relationship between government investment in a project and the total number of housing units produced. There is a weak positive relationship with significant variance between projects. These differences between projects deserves further research to determine the different factors that lead to a greater return for government investment.

total_unit_investment_data %>% 
  filter(total_units > 0) %>% 
  ggplot(aes(x = total_proj_investment, 
        y = total_units)) +
  geom_point(
    aes(color = if_else(market_rate_units > 0, "Yes", "No"))) +
  geom_smooth(method = lm, se = FALSE) +
  stat_cor(aes(label = ..rr.label..), geom = "label") +
  labs(title = "Weak positive relationship between project investment and affordable units",
       color = "Project with Market Rate Units") +
  scale_x_continuous(labels = dollar_format()) +
  scale_color_manual(values = c("black", "red")) +
  xlab('Total Government Investment in Project') +
  ylab('Total Housing Units') +
  theme_minimal()
## `geom_smooth()` using formula 'y ~ x'

One possible variable that could contribute to a greater return for government investment is the presence of market rate housing in a project. These units could hypothetically offset some of the subsidy required by developers and thereby may necessitate less funding from the government. However, there were relatively few projects in the data sets examined that contained any market rate housing. This could be partly due to incomplete data as only one of the data sets used contained market rate unit data. Of the projects that did contain market rate units, they did not appear to differ significantly in the investment efficiency, calculated as the dollars of government investment per unit of affordable housing produced.

total_unit_investment_data %>% 
  mutate(has_market_rate = if_else(market_rate_units > 0, TRUE, FALSE)) %>% 
  ggplot() +
  geom_histogram(
    aes(x = investment_per_unit),
    fill="blue",
    binwidth = 10000) +
  scale_x_continuous(labels = dollar_format()) +
  xlab("Investment Amount Per Unit Produced") +
  ggtitle("No difference in investment efficiency for projects with market rate units") +
  facet_wrap(~ has_market_rate,
             labeller = labeller(has_market_rate = c("FALSE" = "No Market Rate Units", "TRUE" = "Contains Market Rate Units"))
             ) +
  theme_light() + 
  theme(panel.spacing.x = unit(1, "cm"))

Similarly, we might expect that those projects with higher number of affordable housing units could require more investment per unit since these affordable units would require subsidy to offset the lower rents paid for them. Plotting the total affordable units in a project and the government investment per unit does not appear to show a relationship.

We also might expect that there would be relationships between different levels of affordability and the investment per unit produced. Affordable housing typically has income restrictions that are calculated based on the Area Median Income (AMI). For example some units that are considered “deeply” affordable housing are restricted to renters making below 30% AMI, while others are reserved for more moderate income renters making around 80% AMI. There does not appear to be a relationship between investment efficiency and either the number of deeply affordable housing units or moderately affordable housing units.

# Want to look at the investment per unit and relationship to number of affordable units produced
total_unit_investment_data %>% 
  filter(total_proj_affordable > 0) %>% 
  ggplot(aes(x = investment_per_unit, 
        y = total_proj_affordable)) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE) +
  stat_cor(aes(label = ..rr.label..), geom = "label") +
  labs(title = "No relationship between investment efficiency and affordable units in project",
       color = "Project with Market Rate Units") +
  scale_x_continuous(labels = dollar_format()) +
  xlab('Government Investment Per Unit Produced') +
  ylab('Total Affordable Units') +
  theme_minimal()
## `geom_smooth()` using formula 'y ~ x'

total_unit_investment_data %>% 
  filter(ami_0_30 > 0) %>% 
  ggplot(aes(x = investment_per_unit, 
        y = ami_0_30)) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE) +
  stat_cor(aes(label = ..rr.label..), geom = "label") +
  labs(title = "No relationship between investment efficiency and deeply affordable units",
       color = "Project with Market Rate Units") +
  scale_x_continuous(labels = dollar_format()) +
  xlab('Government Investment Per Unit Produced') +
  ylab('Total 30% AMI Affordable Units') +
  theme_minimal()
## `geom_smooth()` using formula 'y ~ x'

total_unit_investment_data %>% 
  mutate(moderate_market_units = (ami_81 + market_rate_units)) %>% 
  filter(moderate_market_units > 0) %>% 
  ggplot(aes(x = investment_per_unit, 
        y = moderate_market_units)) +
  geom_point() +
  geom_smooth(method = lm, se = FALSE) +
  stat_cor(aes(label = ..rr.label..), geom = "label") +
  labs(title = "No relationship between investment efficiency and moderate income units",
       color = "Project with Market Rate Units") +
  scale_x_continuous(labels = dollar_format()) +
  xlab('Government Investment Per Unit Produced') +
  ylab('Total 81%+ AMI Units') +
  theme_minimal()
## `geom_smooth()` using formula 'y ~ x'

Mapping Affordable Housing Production

I use a third data set to map affordable housing production since 2015. As has been emphasized in previous research on housing in DC, new affordable housing construction has been concentrated in areas east of the Anacostia River and in the central part of DC. While many Census tracts have received some affordable housing production, there has been a notable lack of affordable housing production in areas west of Rock Creek Park.

The distribution of deeply affordable housing (available to those making less than 30% AMI) is even more concentrated in particular Census tracts and less evenly spread throughout the city. This is likely due in part to the fact that many of these units are replacement units for public housing redevelopment projects, which are typically very large developments.

# I use a separate data set for mapping as this other data set has latitude and longitude, but unfortunately does not have investment data and does not align well with the other two data sets. 

dc_aff_housing_open <- 
  read_csv("data/dc-aff-housing-open.csv", 
                            col_type = cols(
                              LATITUDE = col_character(),
                              LONGITUDE = col_character()
                            )) %>% 
  rename_with(tolower) %>% 
  filter(!is.na(address_id) & !is.na(longitude) & !is.na(latitude))

dc_aff_housing_open <- st_as_sf(
  dc_aff_housing_open,
  coords = c("longitude", "latitude"),
  crs = 4326,
  remove = FALSE)

dc_tracts <- st_read("data/Census_Tracts_in_2010.shp") %>% 
  select(TRACT, GEOID, geometry)
## Reading layer `Census_Tracts_in_2010' from data source `/Users/Christina/Documents/mccourt/intro-data-science/final-project/data/Census_Tracts_in_2010.shp' using driver `ESRI Shapefile'
## Simple feature collection with 179 features and 67 fields
## geometry type:  POLYGON
## dimension:      XY
## bbox:           xmin: -77.11976 ymin: 38.79165 xmax: -76.9094 ymax: 38.99581
## geographic CRS: WGS 84
# Affordable housing data doesn't have tract or geoID information, adding additional data source
dc_addresses <- st_read("data/Address_Points.csv") %>% 
  select(address_id = ADDRESS_ID, TRACT = CENSUS_TRACT) %>% 
  mutate(address_id = as.numeric(address_id))
## Reading layer `Address_Points' from data source `/Users/Christina/Documents/mccourt/intro-data-science/final-project/data/Address_Points.csv' using driver `CSV'
dc_aff_housing_open <- 
  left_join(dc_aff_housing_open, dc_addresses, by = "address_id")

dc_aff_housing_merged <- st_join(
  dc_aff_housing_open, # points
  dc_tracts, # polygons
  join = st_within
)

dc_aff_housing_merged <- st_set_geometry(dc_aff_housing_merged, NULL)

dc_aff_housing_merged_agg <- dc_aff_housing_merged %>%
  group_by(GEOID) %>%
  summarize(
    total_affordable_units = sum(total_affordable_units),
    total_0_30_ami = sum(affordable_units_at_0_30_ami)
  )

dc_aff_housing_merged_agg <- dc_tracts %>%
  left_join(dc_aff_housing_merged_agg, by = "GEOID")

dc_aff_housing_merged_agg %>% 
  mutate(total_affordable_units = replace_na(total_affordable_units, 0)) %>% 
  ggplot() +
  geom_sf(aes(fill = total_affordable_units)) +
  scale_fill_distiller(direction = 1, name="Affordable Housing Units Produced", palette = "YlGnBu") +
  ggtitle("Affordable housing concentrated east of Anacostia River and center of city") +
  theme_void()

dc_aff_housing_merged_agg %>% 
  mutate(total_0_30_ami = replace_na(total_0_30_ami, 0)) %>% 
  ggplot() +
  geom_sf(aes(fill = total_0_30_ami)) + 
  scale_fill_distiller(direction = 1, name="Deeply Affordable Housing Units Produced", palette = "YlGnBu") +
  ggtitle("Deeply affordable housing is even more concentrated in particular Census tracts") +
  theme_void()

Predicting DC Government Investment by Project

I construct two machine learning models to predict the amount of DC government investment in a particular project. These include not just affordable housing projects, but other economic development investments including some related to small businesses, nonprofits, and arts and culture.

Notably, these investments are heavily right-skewed, with most investments under $1.5M and a long tail reaching a maximum of over $215M for one project. Due to this skew, I log-transform the project investment variable before building my predictive models.

# Predicting amount of investment in a project
# I previously imputed 0 for projects that had NA values for total affordable housing produced as these only existed in the DMPED data set and are assumed to have not produced any housing (at any affordability levels) based on the description of the investments; I therefore repeat this for the other housing unit affordability level variables and the total_bedrooms variable

# Outcome variable is right-skewed so will normalize
combined_data %>% 
  filter(total_proj_investment > 0) %>% 
  ggplot() +
  geom_histogram(
    aes(x = total_proj_investment),
    bins = 200,
    fill = "blue") +
  scale_x_continuous(labels = dollar_format()) + 
  xlab('Total Government Investment in Project') +
  ylab('Project Count') +
  ggtitle(label = 'Project investment concentrated at low amounts', subtitle = 'The majority of DC Government investments are for relatively small amounts') +
  theme_minimal()

combined_data %>% 
  select(total_proj_investment) %>% 
  filter(total_proj_investment > 0) %>% 
  tbl_summary(
    label = list(total_proj_investment ~ "Project Investment"),
    type = all_continuous() ~ "continuous2",
    statistic = all_continuous() ~ c("{N_nonmiss}",
                                     "{median}",
                                     "{p25}", 
                                     "{p75}", 
                                     "{min}", "{max}")
  ) %>% 
  modify_header(
    update = list(label ~ "**Variable**",
                  stat_0 ~ "")
  )
Variable
Project Investment
N 440
Median 1,324,822
25% 487,406
75% 6,333,148
Minimum 30,000
Maximum 216,925,241
predict_investment_data <- combined_data %>% 
  select(total_proj_investment, proj_lihtc_amount, total_proj_affordable, quadrant, ami_0_30, ami_31_50, ami_51_80, ami_81, ward, dc_residents_employed, num_cbes, total_bedrooms, investment_year, proj_funding_source) %>% 
  mutate(ami_0_30 = if_else(total_proj_affordable == 0, 0, ami_0_30),
         ami_31_50 = if_else(total_proj_affordable == 0, 0, ami_31_50),
         ami_51_80 = if_else(total_proj_affordable == 0, 0, ami_51_80),
         ami_81 = if_else(total_proj_affordable == 0, 0, ami_81),
         # As recommended in this thread (https://github.com/tidymodels/workflows/issues/63), I'm log transforming the outcome variable before the initial split
         log_total_proj_investment = log(total_proj_investment)
         ) %>% 
  # remove any projects that didn't receive any investment
  filter(total_proj_investment > 0) %>% 
  select(-total_proj_investment)



set.seed(20201212)
# create a split object
predict_investment_split <- initial_split(data = predict_investment_data, prop = 0.75)
# create the training and testing data
predict_investment_train <- training(x = predict_investment_split)
predict_investment_test <- testing(x = predict_investment_split)

# create resamples
predict_investment_folds <- vfold_cv(data = predict_investment_train, v = 10)

As can be seen in the below visualization of missing values across the data set, there were a number of variables that were contained in one data set but not the other, leading to projects that had missing values for these variables. These were imputed as part of the preprocessing steps.

# Predicting investment in a project

# Significant number of missing values across particular variables occur because I combine two data sets, and some projects were only present in one of the data sets which did not have data for particular variables including dc_residents_employed, num_cbes, total_bedrooms, total_funding_sources, and investment_year
vis_miss(predict_investment_train, cluster = TRUE)

# I have no zero-variance predictors, but a few near-zero predictors so these will be filtered out as part of the recipe
nearZeroVar(predict_investment_train, saveMetrics = TRUE) %>% 
  rownames_to_column() %>% 
  filter(nzv)
##                 rowname freqRatio percentUnique zeroVar  nzv
## 1                ami_81  55.00000      9.393939   FALSE TRUE
## 2 dc_residents_employed  25.83333      4.545455   FALSE TRUE
## 3              num_cbes  39.33333      5.757576   FALSE TRUE
predict_investment_recipe <- 
  recipe(log_total_proj_investment ~ ., data = predict_investment_train) %>%
  step_nzv(all_predictors()) %>% 
  step_bagimpute(all_predictors()) %>% 
  step_other(investment_year, threshold = 0.10, 
             other = "other") %>% 
  step_other(proj_funding_source, threshold = 0.05, 
             other = "OTHER")

I first train a K-Nearest Neighbor model, which outperforms a previous KNN model I conducted on just one of the data sets. These improvements are likely due to the additional data as well as additional preprocessing steps conducted. The root mean squared error term is not easily interpretable because it is in log-units, which will be reversed in a later step for further model evaluation.

# KNN Model to predict investment

# create model object
predict_investment_knn_mod <-
  nearest_neighbor(neighbors = tune()) %>%
  set_engine(engine = "kknn") %>%
  set_mode(mode = "regression")

# create workflow
predict_investment_knn_workflow <-
  workflow() %>%
  add_model(spec = predict_investment_knn_mod) %>%
  add_recipe(recipe = predict_investment_recipe)

# create tuning grid for knn hyperparameters
predict_investment_knn_grid <- tibble(neighbors = c(7, 9, 11, 13))

# estimate models
predict_investment_knn_res <-
  predict_investment_knn_workflow %>%
  tune_grid(resamples = predict_investment_folds,
            grid = predict_investment_knn_grid,
            control = control_grid(save_pred = TRUE),
            metrics = metric_set(rmse))

show_best(predict_investment_knn_res, "rmse", n = 10)
## # A tibble: 4 x 7
##   neighbors .metric .estimator  mean     n std_err .config
##       <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>  
## 1        13 rmse    standard    1.47    10  0.0690 Model4 
## 2        11 rmse    standard    1.48    10  0.0682 Model3 
## 3         9 rmse    standard    1.50    10  0.0702 Model2 
## 4         7 rmse    standard    1.53    10  0.0681 Model1

I then train a Random Forest model to predict project investment. This model performs better than the KNN model based on a lower RMSE value. I also visualize the variable importance for the RF model, which shows that total affordable housing units, amount of Low Income Housing Tax Credit investment, units at 30% AMI, and the project funding source play the most important roles in the model. While the initial visualizations appeared to show that there was not much of a relationship between investment and affordable units or deeply affordable units, this predictive model relied heavily on these two variables suggesting the possibility of some relationship.

# Random Forest model to predict investment

# create model object
predict_investment_rf_mod <- 
    rand_forest(mtry = tune(), trees = tune(), min_n = tune()) %>%
    set_mode("regression") %>%
    set_engine("ranger", importance = "impurity")

# create workflow
predict_investment_rf_workflow <-
  workflow() %>%
  add_model(spec = predict_investment_rf_mod) %>%
  add_recipe(recipe = predict_investment_recipe)

n_features <- length(setdiff(names(predict_investment_train), "total_proj_investment"))

# create tuning grid for rf hyperparamters
predict_investment_rf_grid <- expand.grid(
  mtry = floor(n_features * c(.25, .333, .4)),
  min_n = c(1, 5, 10), 
  trees = c(n_features*10)
)

# estimate models
predict_investment_rf_res <- 
  predict_investment_rf_workflow %>%
  tune_grid(resamples = predict_investment_folds,
            grid = predict_investment_rf_grid,
            control = control_grid(save_pred = TRUE),
            metrics = metric_set(rmse))

show_best(predict_investment_rf_res, "rmse", n = 10)
## # A tibble: 9 x 9
##    mtry trees min_n .metric .estimator  mean     n std_err .config
##   <dbl> <dbl> <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>  
## 1     4   140     5 rmse    standard    1.26    10  0.0568 Model5 
## 2     3   140     1 rmse    standard    1.26    10  0.0564 Model1 
## 3     3   140     5 rmse    standard    1.27    10  0.0560 Model4 
## 4     4   140     1 rmse    standard    1.27    10  0.0541 Model2 
## 5     5   140     1 rmse    standard    1.27    10  0.0511 Model3 
## 6     5   140    10 rmse    standard    1.28    10  0.0602 Model9 
## 7     4   140    10 rmse    standard    1.28    10  0.0531 Model8 
## 8     3   140    10 rmse    standard    1.28    10  0.0543 Model7 
## 9     5   140     5 rmse    standard    1.28    10  0.0552 Model6
# Re-running model with best specification to get variable importance
predict_investment_rf_mod_vip <- 
    rand_forest(mtry = 4, trees = 140, min_n = 5) %>%
    set_mode("regression") %>%
    set_engine("ranger", importance = "impurity")

# Fitting model to training data
predict_investment_rf_fit <-
  workflow() %>%
  add_model(spec = predict_investment_rf_mod_vip) %>%
  add_recipe(recipe = predict_investment_recipe) %>% 
  fit(predict_investment_train)

# Plotting variable importance
predict_investment_rf_fit %>%
  pull_workflow_fit() %>%
  vip(num_features = 10)

Finally, I use the identified best model to make predictions on the testing data set and then undo the log transformation to allow for further analysis. With a root mean squared error of $24,544,726 this model does not initially appear to perform very well considering that the vast majority of investments were well below this amount. Upon closer analysis, however, we see that the mean absolute error was much lower at $9,182,117. While still relatively high compared to most investments, this demonstrates the impact of outlier projects that were significantly larger than most other projects. This can be seen further by plotting the absolute error for individual projects against the actual investment and by considering the distribution of absolute error. We can see that for many project investment predictions there was relatively low error. Based on the composition of the data set, it makes sense that the model would perform better with the lower investment amounts.

# Fitting the best model and making predictions with test data

# create model object
predict_investment_rf_best_mod <-
  rand_forest(mtry = 4, trees = 140, min_n = 5) %>%
  set_mode("regression") %>%
  set_engine("ranger")

predict_investment_rf_best_fit <-
  workflow() %>%
  add_model(spec = predict_investment_rf_best_mod) %>%
  add_recipe(recipe = predict_investment_recipe) %>% 
  fit(predict_investment_train)

# make predictions with testing data
predict_investment_rf_best_predictions <-
  bind_cols(
    predict_investment_test,
    predict(object = predict_investment_rf_best_fit, new_data = predict_investment_test)
  )

# calculate the rmse for the testing data
rmse(data = predict_investment_rf_best_predictions, 
     truth = log_total_proj_investment, 
     estimate = .pred)
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        1.47
# undo log transform
predict_investment_rf_best_predictions <- 
  predict_investment_rf_best_predictions %>% 
  mutate(total_proj_investment = exp(log_total_proj_investment),
         predicted_proj_investment = exp(.pred),
         abs_error = abs(total_proj_investment - predicted_proj_investment))

# calculate RMSE after undoing log transformation of outcome variable
rmse(data = predict_investment_rf_best_predictions, 
     truth = total_proj_investment, 
     estimate = predicted_proj_investment)
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard   24544726.
mae(data = predict_investment_rf_best_predictions, 
     truth = total_proj_investment, 
     estimate = predicted_proj_investment)
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 mae     standard    9182117.
summary(predict_investment_rf_best_predictions$total_proj_investment)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##     39473    463510   1516268  11485842  12853145 216925241
predict_investment_rf_best_predictions %>% 
  ggplot() +
  geom_point(
    aes(x = total_proj_investment, y = abs_error),
    alpha = .5) +
  scale_x_continuous(labels = dollar_format()) + 
  scale_y_continuous(labels = dollar_format()) +
  xlab("Government Investment in Project") +
  ylab("Absolute Error of Prediction") + 
  ggtitle("Model accuracy decreases with increasing investment") +
  theme_minimal()

mean_line <- data.frame(vlines = c(25376579), labels = c("Mean: 9,182,117"))

predict_investment_rf_best_predictions %>% 
  ggplot() + 
  geom_histogram (
    aes(x = abs_error),
    bins = 75
  ) +
  # Add mean line
  geom_vline(
    aes(xintercept = mean(abs_error)),
    color = "blue",
    linetype = "dashed",
    size = 1, 
    show.legend = TRUE) +
  geom_text(data = mean_line, aes(x = vlines, y = 10, label = labels)) +
  scale_x_continuous(labels = dollar_format()) + 
  xlab("Absolute Error") + 
  ggtitle("Distribution of absolute error is right skewed") +
  theme_minimal()

# INCOMPLETE: Boosted Tree model to predict investment in a project
# Couldn't get this to work in time

# create model object
predict_investment_xgboost_mod <- 
  boost_tree(
    trees = tune(),
    min_n = tune(),
    tree_depth = tune(),
    learn_rate = tune(),
    loss_reduction = tune()) %>%
  set_engine("xgboost") %>% 
  set_mode("regression")

predict_investment_xgboost_recipe <- 
  recipe(total_proj_investment ~ ., data = predict_investment_train) %>%
  step_integer(all_nominal()) %>%
  step_nzv(all_predictors()) %>% 
  step_bagimpute(all_predictors()) %>% 
  step_BoxCox(all_numeric()) %>% 
  step_other(investment_year, threshold = 0.10, 
             other = "other") %>% 
  step_other(proj_funding_source, threshold = 0.05, 
             other = "OTHER")

# create workflow
predict_investment_xgboost_workflow <-
  workflow() %>%
  add_model(spec = predict_investment_xgboost_mod) %>%
  add_recipe(recipe = predict_investment_xgboost_recipe)

predict_investment_xgboost_grid <- expand.grid(
  trees = c(100, 500, 1000),
  min_n = 1,
  tree_depth = 6,
  learn_rate = .3,
  loss_reduction = 0
)

# estimate models
predict_investment_xgboost_res <- 
  predict_investment_xgboost_workflow %>%
  tune_grid(resamples = predict_investment_folds,
            grid = predict_investment_xgboost_grid,
            control = control_grid(save_pred = TRUE),
            metrics = metric_set(rmse))

show_best(predict_investment_rf_res, "rmse", n = 10)

# grid specification
xgboost_params <- 
  dials::parameters(
    min_n(),
    tree_depth(),
    learn_rate(),
    loss_reduction()
  )

xgboost_grid <- 
  dials::grid_max_entropy(
    xgboost_params, 
    size = 60
  )

Predicting Affordable Unit Production by Project

Similarly, I next produce a model to predict the number of affordable housing units produced by project. The number of affordable units per project is also right-skewed in the data with the majority of projects producing fewer than 60 units of affordable housing, but with a few producing over 500.

# Predicting affordable units in a project

# Outcome variable is right skewed
combined_data %>% 
  filter(total_proj_affordable > 0) %>% 
  ggplot() +
  geom_histogram(
    aes(x = total_proj_affordable),
    bins = 100,
    fill = "blue") +
  scale_x_continuous(labels = comma) + 
  xlab('Affordable Units Produced in Project') +
  ylab('Project Count') +
  ggtitle(label = 'Most affordable housing projects are relatively small') +
  theme_minimal()

combined_data %>% 
  select(total_proj_affordable) %>% 
  filter(total_proj_affordable > 0) %>% 
  tbl_summary(
    label = list(total_proj_affordable ~ "Project Affordable Units"),
    type = all_continuous() ~ "continuous2",
    statistic = all_continuous() ~ c("{N_nonmiss}",
                                     "{median}",
                                     "{p25}", 
                                     "{p75}", 
                                     "{min}", "{max}")
  ) %>% 
  modify_header(
    update = list(label ~ "**Variable**",
                  stat_0 ~ "")
  )
Variable
Project Affordable Units
N 378
Median 60
25% 27
75% 124
Minimum 2
Maximum 653
predict_aff_units_data <- combined_data %>% 
  select(total_proj_investment, proj_lihtc_amount, total_proj_affordable, quadrant, ami_0_30, ami_31_50, ami_51_80, ami_81, ward, dc_residents_employed, num_cbes, total_bedrooms, investment_year, proj_funding_source) %>% 
  mutate(ami_0_30 = if_else(total_proj_affordable == 0, 0, ami_0_30),
         ami_31_50 = if_else(total_proj_affordable == 0, 0, ami_31_50),
         ami_51_80 = if_else(total_proj_affordable == 0, 0, ami_51_80),
         ami_81 = if_else(total_proj_affordable == 0, 0, ami_81),
         log_total_proj_affordable = log(total_proj_affordable)
         ) %>% 
  # remove any projects that didn't have any affordable units
  filter(total_proj_affordable > 0) %>% 
  select(-total_proj_affordable)


set.seed(20201212)
# create a split object
predict_aff_units_split <- initial_split(data = predict_aff_units_data, prop = 0.75)
# create the training and testing data
predict_aff_units_train <- training(x = predict_aff_units_split)
predict_aff_units_test <- testing(x = predict_aff_units_split)

# create resamples
predict_aff_units_folds <- vfold_cv(data = predict_aff_units_train, v = 10)
# Predicting the affordable units in a project

predict_aff_units_recipe <- 
  recipe(log_total_proj_affordable ~ ., data = predict_aff_units_train) %>%
  step_nzv(all_predictors()) %>% 
  step_bagimpute(all_predictors()) %>% 
  step_other(investment_year, threshold = 0.10, 
             other = "other") %>% 
  step_other(proj_funding_source, threshold = 0.05, 
             other = "other") %>% 
  step_other(ward, threshold = 0.10, 
             other = "other")

I first train multiple specifications of a K-Nearest Neighbor model.

# KNN Model to predict affordable units in project

# create model object
predict_aff_knn_mod <-
  nearest_neighbor(neighbors = tune()) %>%
  set_engine(engine = "kknn") %>%
  set_mode(mode = "regression")

# create workflow
predict_aff_knn_workflow <-
  workflow() %>%
  add_model(spec = predict_aff_knn_mod) %>%
  add_recipe(recipe = predict_aff_units_recipe)

# create tuning grid for knn hyperparameters
predict_aff_knn_grid <- tibble(neighbors = c(7, 9, 11, 13))

# estimate models
predict_aff_knn_res <-
  predict_aff_knn_workflow %>%
  tune_grid(resamples = predict_aff_units_folds,
            grid = predict_aff_knn_grid,
            control = control_grid(save_pred = TRUE),
            metrics = metric_set(rmse))
## x Fold09: model 1/1 (predictions): Error in model.frame.default(Terms, newdata, na...
show_best(predict_aff_knn_res, "rmse", n = 10)
## # A tibble: 4 x 7
##   neighbors .metric .estimator  mean     n std_err .config
##       <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>  
## 1         7 rmse    standard   0.600     9  0.0269 Model1 
## 2         9 rmse    standard   0.601     9  0.0265 Model2 
## 3        11 rmse    standard   0.604     9  0.0263 Model3 
## 4        13 rmse    standard   0.607     9  0.0267 Model4

I then train a number of specifications of a Random Forest model, which signficantly outperforms the KNN model. Again, the root mean squared error term is not initially meaningfully interpretable because of the log transformation of the outcome variable.

# Random forest model to predict affordable housing units

# create model object
predict_aff_rf_mod <- 
    rand_forest(mtry = tune(), trees = tune(), min_n = tune()) %>%
    set_mode("regression") %>%
    set_engine("ranger")

# create workflow
predict_aff_rf_workflow <-
  workflow() %>%
  add_model(spec = predict_aff_rf_mod) %>%
  add_recipe(recipe = predict_aff_units_recipe)

n_features <- length(setdiff(names(predict_aff_units_train), "log_total_proj_affordable"))

# create tuning grid for rf hyperparamters
predict_aff_rf_grid <- expand.grid(
  mtry = floor(n_features * c(.25, .333, .4)),
  min_n = c(1, 5, 10), 
  trees = c(n_features*10)
)

# estimate models
predict_aff_rf_res <- 
  predict_aff_rf_workflow %>%
  tune_grid(resamples = predict_aff_units_folds,
            grid = predict_aff_rf_grid,
            control = control_grid(save_pred = TRUE),
            metrics = metric_set(rmse))

show_best(predict_aff_rf_res, "rmse", n = 10)
## # A tibble: 9 x 9
##    mtry trees min_n .metric .estimator  mean     n std_err .config
##   <dbl> <dbl> <dbl> <chr>   <chr>      <dbl> <int>   <dbl> <chr>  
## 1     5   130     1 rmse    standard   0.388     9  0.0395 Model3 
## 2     5   130     5 rmse    standard   0.401     9  0.0390 Model6 
## 3     5   130    10 rmse    standard   0.409     9  0.0409 Model9 
## 4     4   130     1 rmse    standard   0.413     9  0.0409 Model2 
## 5     4   130     5 rmse    standard   0.413     9  0.0394 Model5 
## 6     4   130    10 rmse    standard   0.422     9  0.0369 Model8 
## 7     3   130     1 rmse    standard   0.433     9  0.0390 Model1 
## 8     3   130     5 rmse    standard   0.436     9  0.0401 Model4 
## 9     3   130    10 rmse    standard   0.453     9  0.0435 Model7
# Re-running model with best specification to get variable importance
predict_aff_rf_mod_vip <- 
    rand_forest(mtry = 5, trees = 130, min_n = 1) %>%
    set_mode("regression") %>%
    set_engine("ranger", importance = "impurity")

# Fitting model to training data
predict_aff_rf_fit_vip <-
  workflow() %>%
  add_model(spec = predict_aff_rf_mod_vip) %>%
  add_recipe(recipe = predict_aff_units_recipe) %>% 
  fit(predict_aff_units_train)

# Plotting variable importance
predict_aff_rf_fit_vip %>%
  pull_workflow_fit() %>%
  vip(num_features = 10)

After selecting the optimal Random Forest model specification, I use this model to make predictions for the test data. After undoing the log transformation, the root mean squared error is approximately 51 units. This model performs better in comparison to the median than the model to predict investment in projects. Similarly, though, the mean absolute error is significantly lower with a value of approximately 19 units. This is a fairly good margin of error except for the smallest of projects.

The model does somewhat better at predicting projects in the middle of the distribution than the model for predicting investment. This is likely due to the fact that the affordable units per project were not as severely skewed as the investment per project. Overall, this model performs somewhat well and could be used as a way to flag particular projects that are proposed to produce a number of affordable units that are far outside the predictions of the model.

# Fitting the best model and making predictions with test data

# create model object
predict_aff_rf_best_mod <-
  rand_forest(mtry = 5, trees = 130, min_n = 1) %>%
  set_mode("regression") %>%
  set_engine("ranger")

predict_aff_rf_best_fit <-
  workflow() %>%
  add_model(spec = predict_aff_rf_best_mod) %>%
  add_recipe(recipe = predict_aff_units_recipe) %>% 
  fit(predict_aff_units_train)

# make predictions with testing data
predict_aff_rf_best_predictions <-
  bind_cols(
    predict_aff_units_test,
    predict(object = predict_aff_rf_best_fit, new_data = predict_aff_units_test)
  )

# calculate the rmse for the testing data
rmse(data = predict_aff_rf_best_predictions, 
     truth = log_total_proj_affordable, 
     estimate = .pred)
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard       0.335
# undo log transform
predict_aff_rf_best_predictions <- 
  predict_aff_rf_best_predictions %>% 
  mutate(total_proj_affordable = exp(log_total_proj_affordable),
         predicted_proj_affordable = exp(.pred),
         abs_error = abs(total_proj_affordable - predicted_proj_affordable)) 

# calculate RMSE after undoing log transformation of outcome variable
rmse(data = predict_aff_rf_best_predictions, 
     truth = total_proj_affordable, 
     estimate = predicted_proj_affordable)
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 rmse    standard        51.6
mae(data = predict_aff_rf_best_predictions, 
     truth = total_proj_affordable, 
     estimate = predicted_proj_affordable)
## # A tibble: 1 x 3
##   .metric .estimator .estimate
##   <chr>   <chr>          <dbl>
## 1 mae     standard        18.8
summary(predict_aff_rf_best_predictions$total_proj_affordable)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    4.00   23.25   60.00   88.93  118.25  653.00
predict_aff_rf_best_predictions %>% 
  ggplot() +
  geom_point(
    aes(x = total_proj_affordable, y = abs_error),
    alpha = .5) +
  scale_x_continuous(labels = comma) + 
  scale_y_continuous(labels = comma) +
  xlab("Total Affordable Units in Project") +
  ylab("Absolute Error") +
  ggtitle("Model more accurately predicts affordable units for smaller projects") +
  theme_minimal()

mean_line <- data.frame(vlines = c(44), labels = c("Mean: 18.8"))

predict_aff_rf_best_predictions %>% 
  ggplot() + 
  geom_histogram (
    aes(x = abs_error),
    bins = 75
  ) +
  # Add mean line
  geom_vline(
    aes(xintercept = mean(abs_error)),
    color = "blue",
    linetype = "dashed",
    size = 1) +
  geom_text(data = mean_line, aes(x = vlines, y = 12, label = labels)) +
  scale_x_continuous(labels = comma) + 
  scale_y_continuous(labels = comma) +
  ggtitle("Distribution of absolute error is right skewed") +
  xlab("Absolute Error") +
  theme_minimal()