library(tidyverse)
library(tidymodels)
library(mapboxapi)
library(tidycensus)
library(janitor)
library(lehdr)
library(tigris)
library(sf)
library(hrbrthemes)
options(tigris_use_cache = TRUE,
scipen = 999,
digits = 4)
theme_set(theme_ipsum())
sf_use_s2(FALSE)
Intro
In this post I will use the Mapbox API to calculate metrics for major commuter routes in Allegheny County. The API will provide the distance and duration of the trip, as well as turn-by-turn directions. The route duration should be considered a “minimum duration” because it does not consider traffic. Then I will estimate the duration of the trips with a linear model and compare that to the actual duration from the Mapbox API. I will use the difference between the actual and estimated duration to identify neighborhoods that experience longer or shorter commutes than expected.
Gather data
The first step is to download the census tract shapefiles for the county:
#get tracts
<- tracts(state = "PA", county = "Allegheny", cb = TRUE) %>%
allegheny_county_tracts select(GEOID)
<- function(x, y) {
st_erase st_difference(x, st_union(y))
}
<- area_water("PA", "Allegheny", class = "sf")
ac_water
<- st_erase(allegheny_county_tracts, ac_water) allegheny_county_tracts
Then I download the “Origin-Destination” LODES file from the Census for Pennsylvania in 2017 and subset it to commuters within Allegheny County.
#load od tract-level data
<- grab_lodes(state = "pa", year = 2017,
lodes_od_ac_main lodes_type = "od", job_type = "JT00",
segment = "S000", state_part = "main",
agg_geo = "tract", use_cache = TRUE) %>%
select(state, w_tract, h_tract, S000, year) %>%
rename(commuters = S000) %>%
mutate(intra_tract_commute = h_tract == w_tract) %>%
semi_join(allegheny_county_tracts, by = c("w_tract" = "GEOID")) %>%
semi_join(allegheny_county_tracts, by = c("h_tract" = "GEOID"))
This analysis only considers routes where the commuter changed census tracts. 96% of commuters in Allegheny County change census tracts.
%>%
lodes_od_ac_main group_by(intra_tract_commute) %>%
summarize(commuters = sum(commuters)) %>%
ungroup() %>%
mutate(pct = commuters / sum(commuters))
# A tibble: 2 × 3
intra_tract_commute commuters pct
<lgl> <dbl> <dbl>
1 FALSE 465637 0.963
2 TRUE 18127 0.0375
Get directions
This is the code that identifies the center of each tract, geocodes those centroids to get an address, and gets the turn-by-turn directions and route data for each pair of home and work addresses. I will focus on the top 20% of these routes (in terms of cumulative percent of commuters) because the Mapbox API is not designed for the size of query I would need to get directions for all combinations of census tracts.
Note that I manually replaced the geocoded address for the Wexford and Swissvale areas because the geocoder returned results outside of the county, probably because the center of those tracts intersect with highways.
#filter out rows where commuter doesn't change tracts
<- lodes_od_ac_main %>%
combined_tract_sf arrange(desc(commuters)) %>%
filter(w_tract != h_tract)
#calculate cumulative pct of commuters, keep only top 20%
<- combined_tract_sf %>%
combined_tract_sf_small select(h_tract, w_tract, commuters) %>%
arrange(desc(commuters)) %>%
mutate(id = row_number(),
pct_commuters = commuters / sum(commuters),
cumulative_pct_commuters = cumsum(pct_commuters)) %>%
filter(cumulative_pct_commuters < .2) %>%
select(h_tract, w_tract, commuters)
#add census centroid geometry
<- combined_tract_sf_small %>%
combined_tract_sf_small left_join(st_centroid(allegheny_county_tracts), by = c("h_tract" = "GEOID")) %>%
rename(h_tract_geo = geometry) %>%
left_join(st_centroid(allegheny_county_tracts), by = c("w_tract" = "GEOID")) %>%
rename(w_tract_geo = geometry) %>%
select(h_tract, h_tract_geo, w_tract, w_tract_geo, commuters)
|>
combined_tract_sf_small st_sf() |>
ggplot() +
geom_sf(data = allegheny_county_tracts) +
geom_sf()
#get addresses for tract centroids
<- combined_tract_sf_small %>%
tract_od_directions mutate(home_address = map_chr(h_tract_geo, mb_reverse_geocode),
work_address = map_chr(w_tract_geo, mb_reverse_geocode))
#replace bad address with good address
<- "3321 Wexford Rd, Gibsonia, PA 15044"
wexford_good_address <- "1118 S Braddock Ave, Swissvale, PA 15218"
swissvale_good_address
<- tract_od_directions %>%
tract_od_directions #fix wexford address
mutate(home_address = case_when(h_tract == "42003409000" ~ wexford_good_address,
!= "42003409000" ~ home_address),
h_tract work_address = case_when(w_tract == "42003409000" ~ wexford_good_address,
!= "42003409000" ~ work_address)) |>
w_tract #fix swissvale address
mutate(home_address = case_when(h_tract == "42003515401" ~ swissvale_good_address,
TRUE ~ home_address))
#define error-safe mb_directions function
<- possibly(mb_directions, otherwise = NA)
mb_directions_possibly
#geocode addresses, get directions
<- tract_od_directions %>%
tract_od_directions mutate(home_address_location_geocoded = map(home_address, mb_geocode),
work_address_location_geocoded = map(work_address, mb_geocode)) %>%
mutate(directions = map2(home_address, work_address, ~ mb_directions_possibly(origin = .x,
destination = .y,
steps = TRUE,
profile = "driving"))) %>%
select(h_tract, h_tract_geo, home_address, home_address_location_geocoded,
w_tract, w_tract_geo, work_address, work_address_location_geocoded, directions, commuters)
The core of the above code is combining map2
and mb_directions_possibly
. This maps the mb_directions_possibly
function against two inputs (the home address and work address).
The result is a dataframe with a row per turn-by-turn direction for each commuter route.
This summarizes the data so there is one row per commuter route and creates summarized route data.
#summarize direction data
<- tract_od_directions %>%
tract_od_stats group_by(h_tract, home_address, w_tract, work_address) %>%
summarize(duration = sum(duration),
distance = sum(distance),
steps = n(),
commuters = unique(commuters)) %>%
ungroup()
As expected, route duration and distance are highly correlated. The median duration of a trip is 16.7 minutes.
#graph od stats
%>%
tract_od_stats ggplot(aes(distance, duration, size = commuters)) +
geom_point(alpha = .3) +
geom_abline(linetype = 2, color = "red") +
coord_equal() +
theme_ipsum() +
labs(title = "Commutes between census tracts",
subtitle = "Allegheny County, PA",
x = "Distance in KM",
y = "Duration in minutes",
size = "Commuters")
<- tract_od_stats %>%
median_duration uncount(weights = commuters) %>%
summarize(median_duration = median(duration)) %>%
pull(median_duration)
%>%
tract_od_stats uncount(weights = commuters) %>%
ggplot(aes(duration)) +
geom_density(fill = "grey") +
geom_vline(xintercept = median_duration, lty = 2, color = "red") +
annotate("text", x = 21, y = .05, label = "median", color = "red") +
theme_ipsum() +
labs(title = "Trip duration",
x = "Duration in minutes",
y = "Density of observations")
This map shows the main roads that commuter routes use I-376, I-279, and Route 28 are major arteries, as expected.
#map routes
%>%
tract_od_stats ggplot() +
geom_sf(data = allegheny_county_tracts, linewidth = .1, fill = "black") +
geom_sf(aes(alpha = commuters, linewidth = commuters), color = "#ffcc01", alpha = .1) +
guides(linewidth = guide_legend(override.aes= list(alpha = 1))) +
scale_linewidth_continuous(range = c(.1, 5)) +
theme_void() +
labs(title = "Commuter routes between Allegheny County census tracts",
subtitle = "Driving routes",
linewidth = "Commuters")
A high-resolution image of this map is available here. An animation of the routes is here.
People that live closer to downtown Pittsburgh have shorter commutes, on average.
%>%
allegheny_county_tracts st_drop_geometry() %>%
left_join(tract_od_stats %>%
st_drop_geometry() |>
select(h_tract, w_tract, duration) %>%
pivot_longer(contains("tract")) %>%
group_by(name, value) %>%
summarize(avg_duration = mean(duration)) %>%
ungroup(),
by = c("GEOID" = "value")) %>%
complete(GEOID, name) %>%
filter(!is.na(name)) %>%
left_join(allegheny_county_tracts, by = "GEOID") %>%
mutate(name = case_when(name == "h_tract" ~ "Origin tract",
== "w_tract" ~ "Destination tract"),
name name = as.factor(name) %>% fct_rev()) %>%
st_sf() %>%
ggplot() +
geom_sf(aes(fill = avg_duration), size = .1) +
facet_wrap(~name, ncol = 1) +
scale_fill_viridis_c(na.value = "grey90") +
labs(title = "Average commute duration",
fill = "Minutes") +
theme_void()
Model
The next step is to create a model that estimates the duration of a given commute. I will use the number of steps in the turn-by-turn directions and the distance as predictors. Additionally, I will calculate which rivers a commute route crosses and use those as logical variables in the model.
This collects the geometry for the main rivers in the county.
<- ac_water %>%
main_rivers group_by(FULLNAME) %>%
summarize(AWATER = sum(AWATER)) %>%
arrange(desc(AWATER)) %>%
slice(1:4)
This code calculates whether a given commuter route crosses a river.
<- tract_od_stats %>%
tract_od_stats_rivers mutate(intersects_ohio = st_intersects(., main_rivers %>%
filter(FULLNAME == "Ohio Riv")) %>% as.logical(),
intersects_allegheny = st_intersects(., main_rivers %>%
filter(FULLNAME == "Allegheny Riv")) %>% as.logical(),
intersects_monongahela = st_intersects(., main_rivers %>%
filter(FULLNAME == "Monongahela Riv")) %>% as.logical(),
intersects_youghiogheny = st_intersects(., main_rivers %>%
filter(FULLNAME == "Youghiogheny Riv")) %>% as.logical()) %>%
replace_na(list(intersects_ohio = FALSE,
intersects_allegheny = FALSE,
intersects_monongahela = FALSE,
intersects_youghiogheny = FALSE)) %>%
st_drop_geometry()
glimpse(tract_od_stats_rivers)
Rows: 780
Columns: 12
$ h_tract <chr> "42003020100", "42003020300", "42003030500", "…
$ home_address <chr> "445 Wood Street, Pittsburgh, Pennsylvania 152…
$ w_tract <chr> "42003982200", "42003020100", "42003020100", "…
$ work_address <chr> "4215 Fifth Avenue, Pittsburgh, Pennsylvania 1…
$ duration <dbl> 14.087, 9.225, 7.460, 10.951, 2.510, 10.517, 1…
$ distance <dbl> 5.6640, 2.6518, 1.5922, 3.4968, 0.7169, 4.2762…
$ steps <int> 8, 7, 7, 5, 6, 6, 9, 7, 10, 11, 14, 10, 10, 8,…
$ commuters <dbl> 58, 106, 287, 68, 81, 129, 87, 121, 116, 59, 1…
$ intersects_ohio <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS…
$ intersects_allegheny <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS…
$ intersects_monongahela <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS…
$ intersects_youghiogheny <lgl> FALSE, FALSE, FALSE, FALSE, FALSE, FALSE, FALS…
<- tract_od_stats_rivers %>%
tract_od_stats_rivers mutate(od_id = str_c("h_tract: ", h_tract, ", ", "w_tract: ", w_tract, sep = ""))
First I set the seed and split the data into training and testing sets.
set.seed(1234)
#split data
<- initial_split(tract_od_stats_rivers, prop = .75)
splits
<- training(splits)
training_data <- testing(splits) testing_data
Then I use {tidymodels}
to define a linear model, cross-validate it, and extract the coefficients.
#recipe
<- recipe(duration ~ .,
model_recipe data = training_data) %>%
update_role(od_id, new_role = "id") %>%
step_rm(h_tract, home_address, w_tract, work_address, commuters) %>%
step_normalize(distance, steps) %>%
step_zv(all_predictors())
%>%
model_recipe prep() %>%
summary()
# A tibble: 8 × 4
variable type role source
<chr> <list> <chr> <chr>
1 distance <chr [2]> predictor original
2 steps <chr [2]> predictor original
3 intersects_ohio <chr [1]> predictor original
4 intersects_allegheny <chr [1]> predictor original
5 intersects_monongahela <chr [1]> predictor original
6 intersects_youghiogheny <chr [1]> predictor original
7 od_id <chr [3]> id original
8 duration <chr [2]> outcome original
<- model_recipe %>%
model_recipe_prep prep()
#apply cv to training data
<- vfold_cv(training_data, v = 10, repeats = 2) training_vfold
#model specification
<- linear_reg(mode = "regression") %>%
lm_model set_engine("lm")
#linear regression workflow
<- workflow() %>%
lm_workflow add_recipe(model_recipe) %>%
add_model(lm_model)
#fit against training resamples
<- control_resamples(save_pred = TRUE)
keep_pred
<- lm_workflow %>%
lm_training_fit fit_resamples(training_vfold, control = keep_pred) %>%
mutate(model = "lm")
#get results from training cv
%>%
lm_training_fit collect_metrics()
# A tibble: 2 × 6
.metric .estimator mean n std_err .config
<chr> <chr> <dbl> <int> <dbl> <chr>
1 rmse standard 2.88 20 0.0590 Preprocessor1_Model1
2 rsq standard 0.870 20 0.00571 Preprocessor1_Model1
The model averaged an R-squared of .82 on the training data, which is pretty good.
The predictions from the training set fit the actual duration pretty well.
#graph predictions from assessment sets
%>%
lm_training_fit collect_predictions() %>%
ggplot(aes(duration, .pred)) +
geom_point(alpha = .3) +
geom_abline(linetype = 2, color = "red") +
coord_equal() +
labs(x = "Actual duration",
y = "Predicted duration")
Next I fit the model against the test data to extract the coefficients. Holding the other variables constant, distance is by far the most influential variable in the model. For every kilometer increase in distance, the duration of the commute can be expected to increase by around 5 minutes. Crossing the Monongahela will add around 2 minutes to a commute, while crossing the Allegheny and Ohio actually decrease commute times. This is probably related to the bridge that the commuter uses.
#variable importance
%>%
lm_workflow fit(testing_data) %>%
pull_workflow_fit() %>%
tidy() %>%
filter(term != "(Intercept)") %>%
mutate(term = fct_reorder(term, estimate)) %>%
ggplot(aes(estimate, term)) +
geom_col(fill = "grey", color = "black")
This fits the model to the full dataset and plots the predicted duration against the actual duration. The fit is tighter than just plotting distance vs. duration.
#final model
<- lm_workflow %>%
tract_od_pred fit(testing_data) %>%
predict(tract_od_stats_rivers) %>%
bind_cols(tract_od_stats_rivers) %>%
select(h_tract, w_tract, distance, steps, duration, .pred, commuters)
%>%
tract_od_pred ggplot(aes(duration, .pred, size = commuters)) +
geom_point(alpha = .3) +
geom_abline(lty = 2, color = "red") +
coord_equal() +
labs(x = "Duration in minutes",
y = "Predicted duration",
size = "Number of commuters")
This calculates how far off the model’s estimation of duration was for each census tract in the dataset (origin and destination). Commuters originating from neighborhoods between State Route 51 and the Monongahela River experience longer than expected commutes.
%>%
allegheny_county_tracts st_drop_geometry() %>%
left_join(tract_od_pred %>%
mutate(.resid = duration - .pred) %>%
select(h_tract, w_tract, .resid) %>%
pivot_longer(contains("tract")) %>%
group_by(name, value) %>%
summarize(avg_resid = mean(.resid)) %>%
ungroup(),
by = c("GEOID" = "value")) %>%
complete(GEOID, name) %>%
filter(!is.na(name)) %>%
left_join(allegheny_county_tracts) %>%
mutate(name = case_when(name == "h_tract" ~ "Origin tract",
== "w_tract" ~ "Destination tract"),
name name = as.factor(name) %>% fct_rev()) %>%
st_sf() %>%
ggplot() +
geom_sf(aes(fill = avg_resid), size = .1) +
facet_wrap(~name, ncol = 1) +
scale_fill_viridis_c(na.value = "grey90") +
labs(title = "Commute duration above/below expected",
fill = "Minutes") +
theme_void()