Mapping Healthy Ride Data

This post is about mapping the Healthy Ride dataset in R.

This is my third post about the Healthy Ride bike service in Pittsburgh. You can find the first post and second post on my blog.

First, load the R packages we will be using:

library(tidyverse)
library(ggmap)
library(lubridate)
library(viridis)
library(stringr)
library(gghighlight)
library(knitr)
library(kableExtra)

Then load the data from the WPRDC (hosted on my GitHub page):

data <- read_csv("https://raw.githubusercontent.com/conorotompkins/healthy_ride/master/data/data.csv")

Again, we need to format the data and the column names to make them more useful for analysis. Since this is a repeat of the script from the last post, I will just do it all in one go:

colnames(data) <- tolower(colnames(data))
colnames(data) <- gsub(" ", "_", colnames(data))

data_long <- data %>% 
  rename(start_date_time = starttime,
         stop_date_time = stoptime) %>% 
  gather(date_time_type, date_time, c(start_date_time, stop_date_time)) %>% 
  select(date_time_type, date_time, everything()) %>% 
  mutate(date_time_2 = date_time) %>% 
  separate(date_time, " ", into = c("date", "time")) %>% 
  mutate(id = row_number(),
         date = mdy(date),
         year = year(date),
         month = month(date, label = TRUE),
         week = week(date),
         time = hm(time),
         hour = hour(time),
         wday = wday(date, label = TRUE),
         is_weekday = ifelse(wday %in% c("Mon", "Tues", "Wed", "Thurs", "Fri"), "weekday", "weekend"),
         yday = yday(date),
         mday = mday(date)) %>% 
  mutate(trip_duration = (tripduration / 60) / 60) %>% 
  gather(station_id_type, station_id, c(from_station_id, to_station_id)) %>% 
  gather(station_name_type, station_name, c(from_station_name, to_station_name)) %>% 
  select(date_time_type, 
         is_weekday, 
         date, 
         year,
         month,
         hour,
         wday,
         yday,
         mday,
         date_time_2, 
         station_id_type, 
         station_id, 
         station_name_type,
         station_name,
         everything(),
         -time)

data_long[1:10, 1:5] %>% 
  kable("html") %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
date_time_type is_weekday date year month
start_date_time weekend 2015-05-31 2015 May
start_date_time weekend 2015-05-31 2015 May
start_date_time weekend 2015-05-31 2015 May
start_date_time weekend 2015-05-31 2015 May
start_date_time weekend 2015-05-31 2015 May
start_date_time weekend 2015-05-31 2015 May
start_date_time weekend 2015-05-31 2015 May
start_date_time weekend 2015-05-31 2015 May
start_date_time weekend 2015-05-31 2015 May
start_date_time weekend 2015-05-31 2015 May

Importantly, we will be excluding trips where the rider started and ended their trip at the same station. The data lacks the granularity to analyze rider location beyond the points where they began and ended their trip.

data_long <- data_long %>% 
  spread(station_name_type, station_name) %>% 
  filter(from_station_name != to_station_name) %>% 
  gather(station_name_type, station_name, c(from_station_name, to_station_name))

data_long[1:10, 1:5] %>% 
  kable("html") %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
date_time_type is_weekday date year month
start_date_time weekend 2015-05-31 2015 May
start_date_time weekend 2015-05-31 2015 May
start_date_time weekend 2015-05-31 2015 May
start_date_time weekend 2015-05-31 2015 May
start_date_time weekend 2015-05-31 2015 May
start_date_time weekend 2015-05-31 2015 May
start_date_time weekend 2015-05-31 2015 May
start_date_time weekend 2015-05-31 2015 May
start_date_time weekend 2015-05-31 2015 May
start_date_time weekend 2015-05-31 2015 May

We also need to load the CSV with the longitude and latitude for the Healthy Ride stations

data_station_locations <- read_csv("https://raw.githubusercontent.com/conorotompkins/healthy_ride/master/data/stations/station_locations.csv")
df_station_totals <- data_long %>% 
  group_by(station_name) %>% 
  summarize(number_of_trips = n()) %>% 
  arrange(desc(number_of_trips), station_name) %>% 
  left_join(data_station_locations) %>% 
  select(station_name, number_of_trips, longitude, latitude)

df_station_totals %>% 
  head() %>% 
  kable("html") %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
station_name number_of_trips longitude latitude
Forbes Ave & Market Square 51924 -80.00308 40.44088
21st St & Penn Ave 47700 -79.98354 40.45212
21st St & Penn Ave 47700 -79.98322 40.45174
Liberty Ave & Stanwix St 45692 -80.00468 40.44133
10th St & Penn Ave (David L. Lawrence Convention Center) 42868 -79.99580 40.44467
S 27th St & Sidney St. (Southside Works) 37356 -79.96611 40.42790

Where are the Healthy Ride Stations?

pgh_map <- get_map(c(lon = -79.973859, lat = 40.447095), zoom = 13)
pgh_map <- ggmap(pgh_map)

pgh_map +
  geom_point(data = df_station_totals, aes(longitude, latitude, size = number_of_trips),
             alpha = .75) +
  scale_size_continuous("Number of trips", range = c(.1, 5)) +
  theme_minimal() +
  theme(axis.text = element_blank(),
        axis.title = element_blank())

Next, join the two dataframes:

df_long <- data_long %>% 
  select(station_name, station_name_type) %>% 
  group_by(station_name, station_name_type) %>% 
  summarize(number_of_trips = n()) %>% 
  arrange(desc(number_of_trips)) %>% 
  left_join(data_station_locations) %>% 
  ungroup()

df_long %>% 
  head() %>% 
  kable("html") %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
station_name station_name_type number_of_trips station_number number_of_racks latitude longitude
Forbes Ave & Market Square to_station_name 29040 1001 19 40.44088 -80.00308
21st St & Penn Ave to_station_name 25748 1017 19 40.45212 -79.98354
21st St & Penn Ave to_station_name 25748 1017 18 40.45174 -79.98322
Liberty Ave & Stanwix St to_station_name 25380 1000 16 40.44133 -80.00468
Forbes Ave & Market Square from_station_name 22884 1001 19 40.44088 -80.00308
10th St & Penn Ave (David L. Lawrence Convention Center) to_station_name 22040 1010 15 40.44467 -79.99580

Do some stations function more as starting points or ending points for trips?

pgh_map +
  geom_point(data = df_long, aes(longitude, latitude, size = number_of_trips, color = station_name_type),
             alpha = .75) +
  scale_size_continuous("Number of trips",range = c(.1, 5)) +
  scale_color_discrete("Station type") +
  facet_wrap(~station_name_type) +
  theme_minimal() +
  theme(axis.text = element_blank(),
        axis.title = element_blank())

No differences are discernible in this view.

A scatter plot shows the differences more effectively:

df_from_to <- df_long %>%
  spread(station_name_type, number_of_trips) %>% 
  rename(from_trips = from_station_name,
         to_trips = to_station_name) %>% 
  select(station_name, from_trips, to_trips) %>% 
  mutate(differential = abs(from_trips - to_trips))

df_from_to %>% 
  ggplot(aes(from_trips, to_trips)) +
  geom_point(size = 1) +
  gghighlight(label_key = station_name, 
              differential > 4000) +
  scale_x_continuous(limits = c(0, 30000)) +
  scale_y_continuous(limits = c(0, 30000)) +
  coord_equal() +
  geom_abline() +
  labs(x = "From trips",
       y = "To trips") +
  theme_bw()

What are the top 10 stations in terms of absolute difference between departures and arrivals?

df_from_to %>% 
  ungroup() %>% 
  arrange(desc(differential)) %>% 
  top_n(10, differential) %>% 
  kable("html") %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
station_name from_trips to_trips differential
Forbes Ave & Market Square 22884 29040 6156
S 27th St & Sidney St. (Southside Works) 15804 21552 5748
Liberty Ave & Stanwix St 20312 25380 5068
42nd St & Butler St 11152 15580 4428
42nd St & Butler St 11152 15580 4428
42nd & Penn Ave. 9184 4760 4424
Taylor St & Liberty Ave 8948 5072 3876
21st St & Penn Ave 21952 25748 3796
21st St & Penn Ave 21952 25748 3796
Penn Ave & N Fairmount St 6604 2816 3788

Let’s map the connections between stations by drawing lines between the stations.

First, widen the data:

df_wide <- data_long %>%
  spread(station_name_type, station_name) %>% 
  select(from_station_name, to_station_name) %>% 
  left_join(data_station_locations, by = c("from_station_name" = "station_name")) %>%
  rename(from_latitude = latitude,
         from_longitude = longitude) %>% 
  left_join(data_station_locations, by = c("to_station_name" = "station_name")) %>% 
  rename(to_latitude = latitude,
         to_longitude = longitude) %>% 
  group_by(from_station_name, to_station_name, from_longitude, from_latitude, to_longitude, to_latitude) %>% 
  summarise(number_of_trips = n()) %>% 
  arrange(desc(number_of_trips)) %>% 
  mutate(from_station_type = ifelse(from_station_name == to_station_name,
                               "Same station", "Different station"))
## `summarise()` regrouping output by 'from_station_name', 'to_station_name', 'from_longitude', 'from_latitude', 'to_longitude' (override with `.groups` argument)
df_wide %>% 
  head() %>% 
  kable("html") %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"))
from_station_name to_station_name from_longitude from_latitude to_longitude to_latitude number_of_trips from_station_type
10th St & Penn Ave (David L. Lawrence Convention Center) 21st St & Penn Ave -79.99580 40.44467 -79.98354 40.45212 4432 Different station
10th St & Penn Ave (David L. Lawrence Convention Center) 21st St & Penn Ave -79.99580 40.44467 -79.98322 40.45174 4432 Different station
Boulevard of the Allies & Parkview Ave Fifth Ave & S Bouquet St -79.95188 40.43434 -79.95760 40.44232 3888 Different station
21st St & Penn Ave 10th St & Penn Ave (David L. Lawrence Convention Center) -79.98354 40.45212 -79.99580 40.44467 3640 Different station
21st St & Penn Ave 10th St & Penn Ave (David L. Lawrence Convention Center) -79.98322 40.45174 -79.99580 40.44467 3640 Different station
Fifth Ave & S Bouquet St Boulevard of the Allies & Parkview Ave -79.95760 40.44232 -79.95188 40.43434 3560 Different station

Then, layer the data over the map:

pgh_map +
  geom_segment(data = df_wide, aes(x = from_longitude, xend = to_longitude, 
                                   y = from_latitude, yend = to_latitude, 
                                   alpha = number_of_trips)) +
  geom_point(data = df_wide, aes(from_longitude, from_latitude), shape = 21, size = 3, fill = "white") +
  geom_point(data = df_wide, aes(to_longitude, to_latitude), shape = 21, size = 3, fill = "white") +
  scale_alpha_continuous("Number of trips", range = c(.0001, 1)) +
  theme_minimal() +
  theme(axis.text = element_blank(),
        axis.title = element_blank())

We can also facet by the from_station_name variable to see where trips originating from certain stations end at. This plot shows the top 6 stations in terms of trips that began from that station:

top_from_stations <- df_wide %>% 
  group_by(from_station_name) %>% 
  summarize(number_of_trips = sum(number_of_trips)) %>% 
  arrange(desc(number_of_trips)) %>% 
  top_n(6) %>% 
  select(from_station_name) %>% 
  unlist()

df_wide_specific_station <- df_wide %>% 
  filter(from_station_name %in% top_from_stations)

pgh_map +
  geom_segment(data = df_wide_specific_station, aes(x = from_longitude, xend = to_longitude, 
                                   y = from_latitude, yend = to_latitude, 
                                   alpha = number_of_trips), arrow = arrow(length = unit(0.03, "npc"))) +
  geom_point(data = df_wide_specific_station, aes(from_longitude, from_latitude), 
             shape = 1, size = 2) +
  geom_point(data = df_wide_specific_station, aes(to_longitude, to_latitude), 
             shape = 1, size = 2) +
  scale_alpha_continuous("Number of trips", range = c(.1, 1)) +
  facet_wrap(~from_station_name,
             nrow = 2) +
  theme_minimal() +
  theme(axis.text = element_blank(),
        axis.title = element_blank())

We can use the same method to examine the top stations in terms of absolute difference between departing and arriving rides:

top_diff_stations <- df_from_to %>% 
  arrange(desc(differential)) %>% 
  distinct() %>% 
  top_n(10) %>% 
  select(station_name) %>% 
  unlist()
  
df_wide_diff_station <- df_wide %>% 
  filter(from_station_name %in% top_diff_stations)

pgh_map +
  geom_point(data = df_wide_diff_station, aes(from_longitude, from_latitude), 
             shape = 1, size = 2) +
  geom_point(data = df_wide_diff_station, aes(to_longitude, to_latitude), 
             shape = 1, size = 2) +
  geom_segment(data = df_wide_diff_station, aes(x = from_longitude, xend = to_longitude, 
                                   y = from_latitude, yend = to_latitude, 
                                   alpha = number_of_trips),
               arrow = arrow(length = unit(0.03, "npc"))) +
  scale_alpha_continuous("Number of trips", range = c(.1, 1)) +
  facet_wrap(~from_station_name,
             nrow = 2) +
  theme_minimal() +
  theme(axis.text = element_blank(),
        axis.title = element_blank())

There does not appear to be a stark difference in the way the network behaves on weekdays vs. weekends:

df_wide_day <- data_long %>% 
  spread(station_name_type, station_name) %>% 
  select(from_station_name, to_station_name, is_weekday) %>% 
  left_join(data_station_locations, by = c("from_station_name" = "station_name")) %>%
  rename(from_latitude = latitude,
         from_longitude = longitude) %>% 
  left_join(data_station_locations, by = c("to_station_name" = "station_name")) %>% 
  rename(to_latitude = latitude,
         to_longitude = longitude) %>% 
  group_by(is_weekday, from_station_name, to_station_name, from_longitude, from_latitude, to_longitude, to_latitude) %>% 
  summarise(number_of_trips = n()) %>% 
  arrange(desc(number_of_trips))

pgh_map +
  geom_segment(data = df_wide_day, aes(x = from_longitude, xend = to_longitude, 
                                   y = from_latitude, yend = to_latitude, 
                                   alpha = number_of_trips)) +
  geom_point(data = df_wide_day, aes(from_longitude, from_latitude), shape = 21, size = 3, fill = "white") +
  geom_point(data = df_wide_day, aes(to_longitude, to_latitude), shape = 21, size = 3, fill = "white") +
  scale_alpha_continuous("Number of trips", range = c(.05, .3)) +
  facet_wrap(~is_weekday) +
  theme_minimal() +
  theme(axis.text = element_blank(),
        axis.title = element_blank())

There are clear differences in the number of rides across different times of day, but the geographic pattern of departures and arrivals does not appear to change:

df_wide_tod <- data_long %>% 
  spread(station_name_type, station_name) %>% 
  select(from_station_name, to_station_name, hour) %>% 
  mutate(time_of_day = cut(hour, breaks = c(-Inf, 3, 6, 9, 12, 15, 18, 21, Inf), 
                           labels = c("0-3", "3-6", "6-9", "9-12", "12-15", "15-18", "18-21", "21-24"), 
                           ordered_result = TRUE)) %>% 
  left_join(data_station_locations, by = c("from_station_name" = "station_name")) %>%
  rename(from_latitude = latitude,
         from_longitude = longitude) %>% 
  left_join(data_station_locations, by = c("to_station_name" = "station_name")) %>% 
  rename(to_latitude = latitude,
         to_longitude = longitude) %>% 
  group_by(time_of_day, from_station_name, to_station_name, 
           from_longitude, from_latitude, 
           to_longitude, to_latitude) %>% 
  summarise(number_of_trips = n()) %>% 
  arrange(desc(number_of_trips))

pgh_map +
  geom_segment(data = df_wide_tod, aes(x = from_longitude, xend = to_longitude, 
                                   y = from_latitude, yend = to_latitude, 
                                   alpha = number_of_trips)) +
  geom_point(data = df_wide_tod, aes(from_longitude, from_latitude), shape = 21, size = 2, fill = "white") +
  geom_point(data = df_wide_tod, aes(to_longitude, to_latitude), shape = 21, size = 2, fill = "white") +
  scale_alpha_continuous("Number of trips", range = c(.05, .3)) +
  facet_wrap(~time_of_day) +
  labs(title = "Time of day") +
  theme_minimal() +
  theme(axis.text = element_blank(),
        axis.title = element_blank())

Related