%>%
train_metrics select(model_name, id, .metric, .estimate) %>%
pivot_wider(names_from = .metric, values_from = .estimate) %>%
ggplot(aes(rmse, rsq, color = model_name)) +
geom_point() +
scale_x_continuous(label = dollar) +
labs(x = "Root Mean Squared Error",
y = "R^2")
Click here to view the full dashboard
Model discussion
I trained 3 models against the assessment data:
- Linear model
- Random Forest
- Bagged Tree
I chose the Bagged Tree model because it performed about as well as the Random Forest model, but it predicts against new data much faster. Prediction speed is important because the UI for the dashboard has to update very quickly.
Total observations: 120,872
Training set metrics (75% of total observations)
I used 10-fold cross-validation to assess model performance against the training set:
Test set metrics (25% of total observations)
%>%
test_metrics select(.metric, .estimate)
# A tibble: 3 × 2
.metric .estimate
<chr> <dbl>
1 rmse 251406.
2 rsq 0.607
3 mape 3724488.
%>%
model_results ggplot(aes(.resid)) +
geom_density() +
geom_vline(xintercept = 0, lty = 2) +
scale_x_continuous(label = label_dollar())
%>%
model_results ggplot(aes(sale_price_adj, .pred_dollar)) +
geom_density_2d_filled(contour_var = "count") +
scale_x_log10(label = label_dollar()) +
scale_y_log10(label = label_dollar()) +
guides(fill = guide_coloursteps()) +
labs(x = "Inflation-adjusted sale price log10 scale",
y = "Prediction",
fill = "Sales")
The model becomes less effective as the actual sale price increases.
%>%
model_results ggplot(aes(sale_price_adj, .resid)) +
geom_point(alpha = .01) +
scale_x_log10(label = dollar) +
scale_y_continuous(label = dollar) +
labs(x = "Inflation-adjusted sale price log10 scale",
y = "Residual")
<- st_read("post_data/unified_geo_ids/unified_geo_ids.shp",
geo_ids quiet = T)
<- model_results %>%
geo_id_median_resid group_by(geo_id) %>%
summarize(median_resid = median(.resid))
<- colorNumeric(
pal palette = "viridis",
domain = geo_id_median_resid$median_resid)
%>%
geo_ids left_join(geo_id_median_resid) %>%
leaflet() %>%
addProviderTiles(providers$OpenStreetMap.Mapnik,
options = providerTileOptions(noWrap = TRUE,
minZoom = 9
#maxZoom = 8
%>%
)) addPolygons(popup = ~ str_c(geo_id, " ", "median residual: ", round(median_resid, 2), sep = ""),
fillColor = ~pal(median_resid),
fillOpacity = .7,
color = "black",
weight = 3) %>%
addLegend("bottomright", pal = pal, values = ~median_resid,
title = "Median of residual",
opacity = 1)
Joining with `by = join_by(geo_id)`