Cumulative eBird Sightings in Allegheny County

R
Author

Conor Tompkins

Published

April 29, 2020

This will be a quick post on cumulative bird observations in Allegheny County. Cumulative graphs show overall trends, seasonality, and quirks in how the data was recorded. They are also fun to turn into animated gifs with gganimate.

Load the relevant libraries:

library(tidyverse)
library(lubridate)
library(tidyquant)
library(janitor)
library(hrbrthemes)
library(vroom)
library(ggrepel)
library(gganimate)

set.seed(1234)

theme_set(theme_bw(base_size = 16))

This reads in data from the eBird data portal:

df <- vroom("post_data/ebd_US-PA-003_201001_202003_relFeb-2020.zip", delim = "\t") %>% 
  clean_names() %>% 
  mutate_at(vars(observer_id, locality, observation_date, time_observations_started, protocol_type), str_replace_na, "NA") %>% 
  mutate(observation_count = as.numeric(str_replace(observation_count, "X", as.character(NA))),
         observation_event_id = str_c(observer_id, locality, observation_date, time_observations_started, sep = "-"),
         observation_date = ymd(observation_date)) %>%
  filter(all_species_reported == 1)

glimpse(df)
Rows: 908,622
Columns: 48
$ global_unique_identifier     <chr> "URN:CornellLabOfOrnithology:EBIRD:OBS815…
$ last_edited_date             <dttm> 2018-08-03 11:44:16, 2018-08-03 11:44:34…
$ taxonomic_order              <dbl> 493, 20638, 20638, 20638, 20638, 20638, 2…
$ category                     <chr> "species", "species", "species", "species…
$ common_name                  <chr> "American Black Duck", "American Crow", "…
$ scientific_name              <chr> "Anas rubripes", "Corvus brachyrhynchos",…
$ subspecies_common_name       <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ subspecies_scientific_name   <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ observation_count            <dbl> 1, 7, 3, 2, 4, 3, NA, NA, 25, 2, 7, 2, 3,…
$ breeding_bird_atlas_code     <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ breeding_bird_atlas_category <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ age_sex                      <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ country                      <chr> "United States", "United States", "United…
$ country_code                 <chr> "US", "US", "US", "US", "US", "US", "US",…
$ state                        <chr> "Pennsylvania", "Pennsylvania", "Pennsylv…
$ state_code                   <chr> "US-PA", "US-PA", "US-PA", "US-PA", "US-P…
$ county                       <chr> "Allegheny", "Allegheny", "Allegheny", "A…
$ county_code                  <chr> "US-PA-003", "US-PA-003", "US-PA-003", "U…
$ iba_code                     <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ bcr_code                     <dbl> 28, 28, 28, 28, 28, 28, 28, 28, 28, 28, 2…
$ usfws_code                   <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ atlas_block                  <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ locality                     <chr> "Rachel Carson Riverfront Park", "Southsi…
$ locality_id                  <chr> "L2640334", "L841018", "L329082", "L69207…
$ locality_type                <chr> "H", "H", "H", "P", "P", "H", "H", "H", "…
$ latitude                     <dbl> 40.53731, 40.43124, 40.54348, 40.65688, 4…
$ longitude                    <dbl> -79.79531, -79.97032, -79.90623, -80.1138…
$ observation_date             <date> 2010-01-14, 2010-01-31, 2010-01-23, 2010…
$ time_observations_started    <chr> "11:05:00", "16:45:00", "13:15:00", "14:4…
$ observer_id                  <chr> "obsr39944", "obsr197993", "obsr197993", …
$ sampling_event_identifier    <chr> "S5760087", "S5839167", "S5798726", "S580…
$ protocol_type                <chr> "Traveling", "Traveling", "Area", "Travel…
$ protocol_code                <chr> "P22", "P22", "P23", "P22", "P21", "P23",…
$ project_code                 <chr> "EBIRD", "EBIRD", "EBIRD", "EBIRD", "EBIR…
$ duration_minutes             <dbl> 25, 30, 90, 90, 120, 30, 35, 30, 70, 60, …
$ effort_distance_km           <dbl> 0.483, 0.483, NA, 8.047, NA, NA, NA, NA, …
$ effort_area_ha               <dbl> NA, NA, 24.2811, NA, NA, 4.0469, 4.0469, …
$ number_observers             <dbl> 1, 2, 2, 1, NA, 2, 2, 2, 4, 2, 1, 1, 1, 4…
$ all_species_reported         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ group_identifier             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ has_media                    <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ approved                     <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
$ reviewed                     <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,…
$ reason                       <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ trip_comments                <chr> NA, NA, NA, NA, NA, "Temperature 8F Winds…
$ species_comments             <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ x47                          <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N…
$ observation_event_id         <chr> "obsr39944-Rachel Carson Riverfront Park-…

I focus on the two main ways people use the eBird app: traveling and stationary. I also filter to only observations from 2016 onwards, since that is when eBird usage became stable in the county.

df_top_protocols <- df %>% 
  count(protocol_type, sort = TRUE) %>% 
  slice(1:2)

df <- df %>% 
  semi_join(df_top_protocols) %>% 
  filter(year(observation_date) >= 2016)

This identifies the top 10 birds in terms of total observations:

df_species_count <- df %>% 
  group_by(common_name) %>% 
  summarize(observation_count = sum(observation_count, na.rm = TRUE)) %>% 
  arrange(desc(observation_count)) %>% 
  slice(1:10)

This code filters on the top 10 birds and caculates the cumulative number of sightings and the rolling 21 day average of sightings.

df_cumulative <- df %>% 
  semi_join(df_species_count, by = c("common_name")) %>% 
  group_by(common_name, observation_date) %>% 
  summarize(observation_count = sum(observation_count, na.rm = TRUE)) %>% 
  ungroup() %>% 
  arrange(common_name, observation_date) %>% 
  group_by(common_name) %>% 
  mutate(observation_count_cumulative = cumsum(observation_count)) %>% 
  tq_mutate(
    # tq_mutate args
    select     = observation_count,
    mutate_fun = rollapply, 
    # rollapply args
    width      = 21,
    align      = "right",
    FUN        = mean,
    # mean args
    na.rm      = TRUE,
    # tq_mutate args
    col_rename = "mean_21"
  )

This plots the cumulative observations by bird and creates an animation with gganimate:

plot <- df_cumulative %>% 
  ggplot(aes(observation_date, observation_count_cumulative, group = common_name)) +
  geom_line(alpha = .5) +
  geom_segment(aes(xend = last(df_cumulative$observation_date) + 240, yend = observation_count_cumulative), linetype = 2, colour = 'grey') +
  geom_point(aes(size = mean_21)) +
  geom_label(aes(x = last(df_cumulative$observation_date) + 210, label = common_name), size = 6) +
  scale_y_comma() +
  scale_size_continuous("21 day rolling average of observation count", range = c(2, 10), labels = scales::comma) +
  scale_x_date(limits = c(first(df_cumulative$observation_date), last(df_cumulative$observation_date) + 250)) +
  labs(x = NULL,
       y = "Cumulative observations",
       title = "eBird observations in Allegheny County",
       subtitle = "Top 10 birds 2016 through January 2020",
       caption = "@conor_tompkins") +
  coord_cartesian(clip = 'off') +
  transition_reveal(observation_date)

plot