library(tidyverse)
library(lubridate)
library(tidyquant)
library(janitor)
library(hrbrthemes)
library(vroom)
library(ggrepel)
library(gganimate)
set.seed(1234)
theme_set(theme_bw(base_size = 16))
This will be a quick post on cumulative bird observations in Allegheny County. Cumulative graphs show overall trends, seasonality, and quirks in how the data was recorded. They are also fun to turn into animated gifs with gganimate
.
Load the relevant libraries:
This reads in data from the eBird data portal:
<- vroom("post_data/ebd_US-PA-003_201001_202003_relFeb-2020.zip", delim = "\t") %>%
df clean_names() %>%
mutate_at(vars(observer_id, locality, observation_date, time_observations_started, protocol_type), str_replace_na, "NA") %>%
mutate(observation_count = as.numeric(str_replace(observation_count, "X", as.character(NA))),
observation_event_id = str_c(observer_id, locality, observation_date, time_observations_started, sep = "-"),
observation_date = ymd(observation_date)) %>%
filter(all_species_reported == 1)
glimpse(df)
Rows: 908,622
Columns: 48
$ global_unique_identifier <chr> "URN:CornellLabOfOrnithology:EBIRD:OBS815…
$ last_edited_date <dttm> 2018-08-03 11:44:16, 2018-08-03 11:44:34…
$ taxonomic_order <dbl> 493, 20638, 20638, 20638, 20638, 20638, 2…
$ category <chr> "species", "species", "species", "species…
$ common_name <chr> "American Black Duck", "American Crow", "…
$ scientific_name <chr> "Anas rubripes", "Corvus brachyrhynchos",…
$ subspecies_common_name <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ subspecies_scientific_name <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ observation_count <dbl> 1, 7, 3, 2, 4, 3, NA, NA, 25, 2, 7, 2, 3,…
$ breeding_bird_atlas_code <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ breeding_bird_atlas_category <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ age_sex <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ country <chr> "United States", "United States", "United…
$ country_code <chr> "US", "US", "US", "US", "US", "US", "US",…
$ state <chr> "Pennsylvania", "Pennsylvania", "Pennsylv…
$ state_code <chr> "US-PA", "US-PA", "US-PA", "US-PA", "US-P…
$ county <chr> "Allegheny", "Allegheny", "Allegheny", "A…
$ county_code <chr> "US-PA-003", "US-PA-003", "US-PA-003", "U…
$ iba_code <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ bcr_code <dbl> 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 2…
$ usfws_code <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ atlas_block <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ locality <chr> "Rachel Carson Riverfront Park", "Southsi…
$ locality_id <chr> "L2640334", "L841018", "L329082", "L69207…
$ locality_type <chr> "H", "H", "H", "P", "P", "H", "H", "H", "…
$ latitude <dbl> 40.53731, 40.43124, 40.54348, 40.65688, 4…
$ longitude <dbl> -79.79531, -79.97032, -79.90623, -80.1138…
$ observation_date <date> 2010-01-14, 2010-01-31, 2010-01-23, 2010…
$ time_observations_started <chr> "11:05:00", "16:45:00", "13:15:00", "14:4…
$ observer_id <chr> "obsr39944", "obsr197993", "obsr197993", …
$ sampling_event_identifier <chr> "S5760087", "S5839167", "S5798726", "S580…
$ protocol_type <chr> "Traveling", "Traveling", "Area", "Travel…
$ protocol_code <chr> "P22", "P22", "P23", "P22", "P21", "P23",…
$ project_code <chr> "EBIRD", "EBIRD", "EBIRD", "EBIRD", "EBIR…
$ duration_minutes <dbl> 25, 30, 90, 90, 120, 30, 35, 30, 70, 60, …
$ effort_distance_km <dbl> 0.483, 0.483, NA, 8.047, NA, NA, NA, NA, …
$ effort_area_ha <dbl> NA, NA, 24.2811, NA, NA, 4.0469, 4.0469, …
$ number_observers <dbl> 1, 2, 2, 1, NA, 2, 2, 2, 4, 2, 1, 1, 1, 4…
$ all_species_reported <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ group_identifier <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ has_media <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ approved <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ reviewed <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ reason <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ trip_comments <chr> NA, NA, NA, NA, NA, "Temperature 8F Winds…
$ species_comments <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ x47 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ observation_event_id <chr> "obsr39944-Rachel Carson Riverfront Park-…
I focus on the two main ways people use the eBird app: traveling and stationary. I also filter to only observations from 2016 onwards, since that is when eBird usage became stable in the county.
<- df %>%
df_top_protocols count(protocol_type, sort = TRUE) %>%
slice(1:2)
<- df %>%
df semi_join(df_top_protocols) %>%
filter(year(observation_date) >= 2016)
This identifies the top 10 birds in terms of total observations:
<- df %>%
df_species_count group_by(common_name) %>%
summarize(observation_count = sum(observation_count, na.rm = TRUE)) %>%
arrange(desc(observation_count)) %>%
slice(1:10)
This code filters on the top 10 birds and caculates the cumulative number of sightings and the rolling 21 day average of sightings.
<- df %>%
df_cumulative semi_join(df_species_count, by = c("common_name")) %>%
group_by(common_name, observation_date) %>%
summarize(observation_count = sum(observation_count, na.rm = TRUE)) %>%
ungroup() %>%
arrange(common_name, observation_date) %>%
group_by(common_name) %>%
mutate(observation_count_cumulative = cumsum(observation_count)) %>%
tq_mutate(
# tq_mutate args
select = observation_count,
mutate_fun = rollapply,
# rollapply args
width = 21,
align = "right",
FUN = mean,
# mean args
na.rm = TRUE,
# tq_mutate args
col_rename = "mean_21"
)
This plots the cumulative observations by bird and creates an animation with gganimate
:
<- df_cumulative %>%
plot ggplot(aes(observation_date, observation_count_cumulative, group = common_name)) +
geom_line(alpha = .5) +
geom_segment(aes(xend = last(df_cumulative$observation_date) + 240, yend = observation_count_cumulative), linetype = 2, colour = 'grey') +
geom_point(aes(size = mean_21)) +
geom_label(aes(x = last(df_cumulative$observation_date) + 210, label = common_name), size = 6) +
scale_y_comma() +
scale_size_continuous("21 day rolling average of observation count", range = c(2, 10), labels = scales::comma) +
scale_x_date(limits = c(first(df_cumulative$observation_date), last(df_cumulative$observation_date) + 250)) +
labs(x = NULL,
y = "Cumulative observations",
title = "eBird observations in Allegheny County",
subtitle = "Top 10 birds 2016 through January 2020",
caption = "@conor_tompkins") +
coord_cartesian(clip = 'off') +
transition_reveal(observation_date)
plot