library(tidyverse)
library(ggmap)
library(lubridate)
library(viridis)
library(stringr)
library(gghighlight)
library(knitr)
library(kableExtra)
This post is about mapping the Healthy Ride dataset in R.
This is my third post about the Healthy Ride bike service in Pittsburgh. You can find the first post and second post on my blog.
First, load the R packages we will be using:
Then load the data from the WPRDC (hosted on my GitHub page):
<- read_csv("https://raw.githubusercontent.com/conorotompkins/healthy_ride/master/data/data.csv") data
Again, we need to format the data and the column names to make them more useful for analysis. Since this is a repeat of the script from the last post, I will just do it all in one go:
colnames(data) <- tolower(colnames(data))
colnames(data) <- gsub(" ", "_", colnames(data))
<- data %>%
data_long rename(start_date_time = starttime,
stop_date_time = stoptime) %>%
gather(date_time_type, date_time, c(start_date_time, stop_date_time)) %>%
select(date_time_type, date_time, everything()) %>%
mutate(date_time_2 = date_time) %>%
separate(date_time, " ", into = c("date", "time")) %>%
mutate(id = row_number(),
date = mdy(date),
year = year(date),
month = month(date, label = TRUE),
week = week(date),
time = hm(time),
hour = hour(time),
wday = wday(date, label = TRUE),
is_weekday = ifelse(wday %in% c("Mon", "Tues", "Wed", "Thurs", "Fri"), "weekday", "weekend"),
yday = yday(date),
mday = mday(date)) %>%
mutate(trip_duration = (tripduration / 60) / 60) %>%
gather(station_id_type, station_id, c(from_station_id, to_station_id)) %>%
gather(station_name_type, station_name, c(from_station_name, to_station_name)) %>%
select(date_time_type,
is_weekday,
date,
year,
month,
hour,
wday,
yday,
mday,
date_time_2,
station_id_type,
station_id,
station_name_type,
station_name,everything(),
-time)
1:10, 1:5] %>%
data_long[kable("html") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
date_time_type | is_weekday | date | year | month |
---|---|---|---|---|
start_date_time | weekend | 2015-05-31 | 2015 | May |
start_date_time | weekend | 2015-05-31 | 2015 | May |
start_date_time | weekend | 2015-05-31 | 2015 | May |
start_date_time | weekend | 2015-05-31 | 2015 | May |
start_date_time | weekend | 2015-05-31 | 2015 | May |
start_date_time | weekend | 2015-05-31 | 2015 | May |
start_date_time | weekend | 2015-05-31 | 2015 | May |
start_date_time | weekend | 2015-05-31 | 2015 | May |
start_date_time | weekend | 2015-05-31 | 2015 | May |
start_date_time | weekend | 2015-05-31 | 2015 | May |
Importantly, we will be excluding trips where the rider started and ended their trip at the same station. The data lacks the granularity to analyze rider location beyond the points where they began and ended their trip.
<- data_long %>%
data_long spread(station_name_type, station_name) %>%
filter(from_station_name != to_station_name) %>%
gather(station_name_type, station_name, c(from_station_name, to_station_name))
1:10, 1:5] %>%
data_long[kable("html") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
date_time_type | is_weekday | date | year | month |
---|---|---|---|---|
start_date_time | weekend | 2015-05-31 | 2015 | May |
start_date_time | weekend | 2015-05-31 | 2015 | May |
start_date_time | weekend | 2015-05-31 | 2015 | May |
start_date_time | weekend | 2015-05-31 | 2015 | May |
start_date_time | weekend | 2015-05-31 | 2015 | May |
start_date_time | weekend | 2015-05-31 | 2015 | May |
start_date_time | weekend | 2015-05-31 | 2015 | May |
start_date_time | weekend | 2015-05-31 | 2015 | May |
start_date_time | weekend | 2015-05-31 | 2015 | May |
start_date_time | weekend | 2015-05-31 | 2015 | May |
We also need to load the CSV with the longitude and latitude for the Healthy Ride stations
<- read_csv("https://raw.githubusercontent.com/conorotompkins/healthy_ride/master/data/stations/station_locations.csv") data_station_locations
<- data_long %>%
df_station_totals group_by(station_name) %>%
summarize(number_of_trips = n()) %>%
arrange(desc(number_of_trips), station_name) %>%
left_join(data_station_locations) %>%
select(station_name, number_of_trips, longitude, latitude)
%>%
df_station_totals head() %>%
kable("html") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
station_name | number_of_trips | longitude | latitude |
---|---|---|---|
Forbes Ave & Market Square | 51924 | -80.00308 | 40.44088 |
21st St & Penn Ave | 47700 | -79.98354 | 40.45212 |
21st St & Penn Ave | 47700 | -79.98322 | 40.45174 |
Liberty Ave & Stanwix St | 45692 | -80.00468 | 40.44133 |
10th St & Penn Ave (David L. Lawrence Convention Center) | 42868 | -79.99580 | 40.44467 |
S 27th St & Sidney St. (Southside Works) | 37356 | -79.96611 | 40.42790 |
Where are the Healthy Ride Stations?
<- get_map(c(lon = -79.973859, lat = 40.447095), zoom = 13)
pgh_map <- ggmap(pgh_map)
pgh_map
+
pgh_map geom_point(data = df_station_totals, aes(longitude, latitude, size = number_of_trips),
alpha = .75) +
scale_size_continuous("Number of trips", range = c(.1, 5)) +
theme_minimal() +
theme(axis.text = element_blank(),
axis.title = element_blank())
Next, join the two dataframes:
<- data_long %>%
df_long select(station_name, station_name_type) %>%
group_by(station_name, station_name_type) %>%
summarize(number_of_trips = n()) %>%
arrange(desc(number_of_trips)) %>%
left_join(data_station_locations) %>%
ungroup()
%>%
df_long head() %>%
kable("html") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
station_name | station_name_type | number_of_trips | station_number | number_of_racks | latitude | longitude |
---|---|---|---|---|---|---|
Forbes Ave & Market Square | to_station_name | 29040 | 1001 | 19 | 40.44088 | -80.00308 |
21st St & Penn Ave | to_station_name | 25748 | 1017 | 19 | 40.45212 | -79.98354 |
21st St & Penn Ave | to_station_name | 25748 | 1017 | 18 | 40.45174 | -79.98322 |
Liberty Ave & Stanwix St | to_station_name | 25380 | 1000 | 16 | 40.44133 | -80.00468 |
Forbes Ave & Market Square | from_station_name | 22884 | 1001 | 19 | 40.44088 | -80.00308 |
10th St & Penn Ave (David L. Lawrence Convention Center) | to_station_name | 22040 | 1010 | 15 | 40.44467 | -79.99580 |
Do some stations function more as starting points or ending points for trips?
+
pgh_map geom_point(data = df_long, aes(longitude, latitude, size = number_of_trips, color = station_name_type),
alpha = .75) +
scale_size_continuous("Number of trips",range = c(.1, 5)) +
scale_color_discrete("Station type") +
facet_wrap(~station_name_type) +
theme_minimal() +
theme(axis.text = element_blank(),
axis.title = element_blank())
No differences are discernible in this view.
A scatter plot shows the differences more effectively:
<- df_long %>%
df_from_to spread(station_name_type, number_of_trips) %>%
rename(from_trips = from_station_name,
to_trips = to_station_name) %>%
select(station_name, from_trips, to_trips) %>%
mutate(differential = abs(from_trips - to_trips))
%>%
df_from_to ggplot(aes(from_trips, to_trips)) +
geom_point(size = 1) +
gghighlight(label_key = station_name,
> 4000) +
differential scale_x_continuous(limits = c(0, 30000)) +
scale_y_continuous(limits = c(0, 30000)) +
coord_equal() +
geom_abline() +
labs(x = "From trips",
y = "To trips") +
theme_bw()
What are the top 10 stations in terms of absolute difference between departures and arrivals?
%>%
df_from_to ungroup() %>%
arrange(desc(differential)) %>%
top_n(10, differential) %>%
kable("html") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
station_name | from_trips | to_trips | differential |
---|---|---|---|
Forbes Ave & Market Square | 22884 | 29040 | 6156 |
S 27th St & Sidney St. (Southside Works) | 15804 | 21552 | 5748 |
Liberty Ave & Stanwix St | 20312 | 25380 | 5068 |
42nd St & Butler St | 11152 | 15580 | 4428 |
42nd St & Butler St | 11152 | 15580 | 4428 |
42nd & Penn Ave. | 9184 | 4760 | 4424 |
Taylor St & Liberty Ave | 8948 | 5072 | 3876 |
21st St & Penn Ave | 21952 | 25748 | 3796 |
21st St & Penn Ave | 21952 | 25748 | 3796 |
Penn Ave & N Fairmount St | 6604 | 2816 | 3788 |
Let’s map the connections between stations by drawing lines between the stations.
First, widen the data:
<- data_long %>%
df_wide spread(station_name_type, station_name) %>%
select(from_station_name, to_station_name) %>%
left_join(data_station_locations, by = c("from_station_name" = "station_name")) %>%
rename(from_latitude = latitude,
from_longitude = longitude) %>%
left_join(data_station_locations, by = c("to_station_name" = "station_name")) %>%
rename(to_latitude = latitude,
to_longitude = longitude) %>%
group_by(from_station_name, to_station_name, from_longitude, from_latitude, to_longitude, to_latitude) %>%
summarise(number_of_trips = n()) %>%
arrange(desc(number_of_trips)) %>%
mutate(from_station_type = ifelse(from_station_name == to_station_name,
"Same station", "Different station"))
%>%
df_wide head() %>%
kable("html") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
from_station_name | to_station_name | from_longitude | from_latitude | to_longitude | to_latitude | number_of_trips | from_station_type |
---|---|---|---|---|---|---|---|
10th St & Penn Ave (David L. Lawrence Convention Center) | 21st St & Penn Ave | -79.99580 | 40.44467 | -79.98354 | 40.45212 | 4432 | Different station |
10th St & Penn Ave (David L. Lawrence Convention Center) | 21st St & Penn Ave | -79.99580 | 40.44467 | -79.98322 | 40.45174 | 4432 | Different station |
Boulevard of the Allies & Parkview Ave | Fifth Ave & S Bouquet St | -79.95188 | 40.43434 | -79.95760 | 40.44232 | 3888 | Different station |
21st St & Penn Ave | 10th St & Penn Ave (David L. Lawrence Convention Center) | -79.98354 | 40.45212 | -79.99580 | 40.44467 | 3640 | Different station |
21st St & Penn Ave | 10th St & Penn Ave (David L. Lawrence Convention Center) | -79.98322 | 40.45174 | -79.99580 | 40.44467 | 3640 | Different station |
Fifth Ave & S Bouquet St | Boulevard of the Allies & Parkview Ave | -79.95760 | 40.44232 | -79.95188 | 40.43434 | 3560 | Different station |
Then, layer the data over the map:
+
pgh_map geom_segment(data = df_wide, aes(x = from_longitude, xend = to_longitude,
y = from_latitude, yend = to_latitude,
alpha = number_of_trips)) +
geom_point(data = df_wide, aes(from_longitude, from_latitude), shape = 21, size = 3, fill = "white") +
geom_point(data = df_wide, aes(to_longitude, to_latitude), shape = 21, size = 3, fill = "white") +
scale_alpha_continuous("Number of trips", range = c(.0001, 1)) +
theme_minimal() +
theme(axis.text = element_blank(),
axis.title = element_blank())
We can also facet by the from_station_name variable to see where trips originating from certain stations end at. This plot shows the top 6 stations in terms of trips that began from that station:
<- df_wide %>%
top_from_stations group_by(from_station_name) %>%
summarize(number_of_trips = sum(number_of_trips)) %>%
arrange(desc(number_of_trips)) %>%
top_n(6) %>%
select(from_station_name) %>%
unlist()
<- df_wide %>%
df_wide_specific_station filter(from_station_name %in% top_from_stations)
+
pgh_map geom_segment(data = df_wide_specific_station, aes(x = from_longitude, xend = to_longitude,
y = from_latitude, yend = to_latitude,
alpha = number_of_trips), arrow = arrow(length = unit(0.03, "npc"))) +
geom_point(data = df_wide_specific_station, aes(from_longitude, from_latitude),
shape = 1, size = 2) +
geom_point(data = df_wide_specific_station, aes(to_longitude, to_latitude),
shape = 1, size = 2) +
scale_alpha_continuous("Number of trips", range = c(.1, 1)) +
facet_wrap(~from_station_name,
nrow = 2) +
theme_minimal() +
theme(axis.text = element_blank(),
axis.title = element_blank())
We can use the same method to examine the top stations in terms of absolute difference between departing and arriving rides:
<- df_from_to %>%
top_diff_stations arrange(desc(differential)) %>%
distinct() %>%
top_n(10) %>%
select(station_name) %>%
unlist()
<- df_wide %>%
df_wide_diff_station filter(from_station_name %in% top_diff_stations)
+
pgh_map geom_point(data = df_wide_diff_station, aes(from_longitude, from_latitude),
shape = 1, size = 2) +
geom_point(data = df_wide_diff_station, aes(to_longitude, to_latitude),
shape = 1, size = 2) +
geom_segment(data = df_wide_diff_station, aes(x = from_longitude, xend = to_longitude,
y = from_latitude, yend = to_latitude,
alpha = number_of_trips),
arrow = arrow(length = unit(0.03, "npc"))) +
scale_alpha_continuous("Number of trips", range = c(.1, 1)) +
facet_wrap(~from_station_name,
nrow = 2) +
theme_minimal() +
theme(axis.text = element_blank(),
axis.title = element_blank())
There does not appear to be a stark difference in the way the network behaves on weekdays vs. weekends:
<- data_long %>%
df_wide_day spread(station_name_type, station_name) %>%
select(from_station_name, to_station_name, is_weekday) %>%
left_join(data_station_locations, by = c("from_station_name" = "station_name")) %>%
rename(from_latitude = latitude,
from_longitude = longitude) %>%
left_join(data_station_locations, by = c("to_station_name" = "station_name")) %>%
rename(to_latitude = latitude,
to_longitude = longitude) %>%
group_by(is_weekday, from_station_name, to_station_name, from_longitude, from_latitude, to_longitude, to_latitude) %>%
summarise(number_of_trips = n()) %>%
arrange(desc(number_of_trips))
+
pgh_map geom_segment(data = df_wide_day, aes(x = from_longitude, xend = to_longitude,
y = from_latitude, yend = to_latitude,
alpha = number_of_trips)) +
geom_point(data = df_wide_day, aes(from_longitude, from_latitude), shape = 21, size = 3, fill = "white") +
geom_point(data = df_wide_day, aes(to_longitude, to_latitude), shape = 21, size = 3, fill = "white") +
scale_alpha_continuous("Number of trips", range = c(.05, .3)) +
facet_wrap(~is_weekday) +
theme_minimal() +
theme(axis.text = element_blank(),
axis.title = element_blank())
There are clear differences in the number of rides across different times of day, but the geographic pattern of departures and arrivals does not appear to change:
<- data_long %>%
df_wide_tod spread(station_name_type, station_name) %>%
select(from_station_name, to_station_name, hour) %>%
mutate(time_of_day = cut(hour, breaks = c(-Inf, 3, 6, 9, 12, 15, 18, 21, Inf),
labels = c("0-3", "3-6", "6-9", "9-12", "12-15", "15-18", "18-21", "21-24"),
ordered_result = TRUE)) %>%
left_join(data_station_locations, by = c("from_station_name" = "station_name")) %>%
rename(from_latitude = latitude,
from_longitude = longitude) %>%
left_join(data_station_locations, by = c("to_station_name" = "station_name")) %>%
rename(to_latitude = latitude,
to_longitude = longitude) %>%
group_by(time_of_day, from_station_name, to_station_name,
from_longitude, from_latitude, %>%
to_longitude, to_latitude) summarise(number_of_trips = n()) %>%
arrange(desc(number_of_trips))
+
pgh_map geom_segment(data = df_wide_tod, aes(x = from_longitude, xend = to_longitude,
y = from_latitude, yend = to_latitude,
alpha = number_of_trips)) +
geom_point(data = df_wide_tod, aes(from_longitude, from_latitude), shape = 21, size = 2, fill = "white") +
geom_point(data = df_wide_tod, aes(to_longitude, to_latitude), shape = 21, size = 2, fill = "white") +
scale_alpha_continuous("Number of trips", range = c(.05, .3)) +
facet_wrap(~time_of_day) +
labs(title = "Time of day") +
theme_minimal() +
theme(axis.text = element_blank(),
axis.title = element_blank())