library(tidyverse)
library(lubridate)
library(vroom)
library(janitor)
library(rebird)
library(hrbrthemes)
library(ggrepel)
library(gganimate)
library(widyr)
library(tidygraph)
library(ggraph)
library(tidytext)
options(scipen = 999, digits = 2)
theme_set(theme_ipsum())
set.seed(1234)
In this post I will do some exploratory analysis on eBird data. I’ve picked up birdwatching as a hobby during quarantine, and eBird has a ton of cool data on bird sightings.
Setup
Load and filter data
I downloaded data for bird sightings in Allegheny County from the eBird data portal. This code loads the data in R and prepares it for analysis.
<- vroom("post_data/ebd_US-PA-003_201001_202003_relFeb-2020.zip", delim = "\t") %>%
df clean_names() %>%
mutate_at(vars(observer_id, locality, observation_date, time_observations_started, protocol_type), str_replace_na, "NA") %>%
mutate(observation_date = ymd(observation_date),
observation_count_old = observation_count,
observation_count = as.numeric(str_replace(observation_count_old, "X", as.character(NA))),
observation_event_id = str_c(observer_id, locality, observation_date, time_observations_started)) %>%
filter(all_species_reported == 1)
I will focus on the two major “types” of observation protocols. The others are related to specific birding events, and might not be representative of the overall data.
<- df %>%
df_top_protocols count(protocol_type, sort = TRUE) %>%
slice(1:2)
df_top_protocols
# A tibble: 2 × 2
protocol_type n
<chr> <int>
1 Traveling 545872
2 Stationary 293991
<- df %>%
df semi_join(df_top_protocols)
This shows that Allegheny County birders began submitting data regularly in 2016. I will focus my analysis on recent observations.
%>%
df count(observation_date) %>%
mutate(recent_observation = year(observation_date) >= 2016,
observation_date = ymd(observation_date)) %>%
ggplot(aes(observation_date, n, color = recent_observation)) +
geom_line() +
scale_y_comma() +
scale_color_discrete("Recent Observation") +
labs(y = "Number of observations",
x = "Observation date")
<- df %>%
df filter(year(observation_date) >= 2016)
There is wide variation in the number of times a species was sighted in a given observation.
%>%
df ggplot(aes(observation_count, group = common_name)) +
geom_density() +
scale_x_log10() +
labs(x = "Species count (log10)")
This calculates the 99th percentile of bird count per observation, and highlights the birds that are seen in large flocks.
%>%
df group_by(common_name) %>%
summarize(observation_count_99 = quantile(observation_count, probs = c(.99), na.rm = TRUE)) %>%
arrange(desc(observation_count_99))
# A tibble: 306 × 2
common_name observation_count_99
<chr> <dbl>
1 Ring-billed Gull 3400
2 crow sp. 2000
3 gull sp. 500
4 Herring Gull 248.
5 Tundra Swan 230.
6 European Starling 176.
7 Horned Lark 175
8 Bufflehead 170.
9 Canada Goose 148
10 Larus sp. 143.
# ℹ 296 more rows
This shows that the high count of Ring-billed Gulls is explained by groups of birders seeing flocks of thousands of the species. This also highlights that the same bird sighting can be counted twice because of simultaneous observation.
%>%
df filter(common_name == "Ring-billed Gull") %>%
arrange(desc(observation_count)) %>%
slice(1:10) %>%
select(observer_id, group_identifier, common_name, observation_date, duration_minutes, observation_count, locality)
# A tibble: 10 × 7
observer_id group_identifier common_name observation_date duration_minutes
<chr> <chr> <chr> <date> <dbl>
1 obsr160352 G2943178 Ring-billed G… 2018-02-13 38
2 obsr40545 G2943178 Ring-billed G… 2018-02-13 38
3 obsr40545 G2930894 Ring-billed G… 2018-02-09 50
4 obsr160352 G2930894 Ring-billed G… 2018-02-09 75
5 obsr160352 <NA> Ring-billed G… 2016-01-29 25
6 obsr160352 G1578214 Ring-billed G… 2016-01-30 71
7 obsr101818 G1578214 Ring-billed G… 2016-01-30 71
8 obsr40545 G2940041 Ring-billed G… 2018-02-12 55
9 obsr160352 <NA> Ring-billed G… 2018-02-10 55
10 obsr620338 G2940041 Ring-billed G… 2018-02-12 55
# ℹ 2 more variables: observation_count <dbl>, locality <chr>
Species counts
This does a basic count of the major species, not accounting for simultaneous observation:
<- df %>%
df_counts group_by(common_name) %>%
summarize(species_count = sum(observation_count, na.rm = TRUE)) %>%
arrange(desc(species_count))
df_counts
# A tibble: 306 × 2
common_name species_count
<chr> <dbl>
1 Ring-billed Gull 253373
2 American Crow 189804
3 European Starling 180456
4 American Robin 180148
5 Canada Goose 159040
6 Mallard 103066
7 Northern Cardinal 84604
8 Mourning Dove 70975
9 Blue Jay 64894
10 Song Sparrow 63571
# ℹ 296 more rows
%>%
df_counts mutate(common_name = fct_reorder(common_name, species_count)) %>%
slice(1:20) %>%
ggplot(aes(species_count, common_name)) +
geom_col() +
scale_x_comma() +
labs(x = "Observations",
y = NULL)
Species correlations
I was interested in which birds are correlated most with two common and popular birds, the Cardinal and the Blue Jay. This calculates the pairwise correlation and plots the top 10 birds correlated with the two target species:
<- c("Northern Cardinal", "Blue Jay") species_list
<- df %>%
df_pair_corr pairwise_cor(common_name, observation_event_id, diag = FALSE, upper = FALSE)
%>%
df_pair_corr filter(item1 %in% species_list) %>%
drop_na(correlation) %>%
arrange(item1, desc(correlation)) %>%
group_by(item1) %>%
slice(1:10) %>%
ungroup() %>%
mutate(item2 = reorder_within(x = item2, by = correlation, within = item1)) %>%
ggplot(aes(correlation, item2, fill = item1)) +
geom_col(alpha = .9) +
facet_wrap(~item1, scales = "free_y") +
scale_y_reordered() +
scale_fill_manual(values = c("blue", "red")) +
guides(fill = FALSE) +
labs(x = "Correlation",
y = NULL)
Cardinals and Blue Jays share many birds in common, but there are some distinctions, and the level of correlation can vary. I use a slopegraph to show the difference in how much a bird is correlated with Blue Jays vs. Cardinals.
This grabs the data for the two target species and gets the top 20 correlated birds for each species:
<- df_pair_corr %>%
df_slopegraph filter(item1 %in% species_list) %>%
drop_na(correlation) %>%
arrange(item1, desc(correlation)) %>%
group_by(item1) %>%
slice(1:20) %>%
ungroup()
This calculates the difference in correlation for each bird:
<- df_slopegraph %>%
df_corr_diff pivot_wider(names_from = item1, values_from = correlation, names_prefix = "corr_") %>%
clean_names() %>%
mutate(corr_diff = abs(corr_blue_jay - corr_northern_cardinal)) %>%
select(item2, corr_diff)
df_corr_diff
# A tibble: 31 × 2
item2 corr_diff
<chr> <dbl>
1 Northern Cardinal NA
2 Red-bellied Woodpecker 0.0347
3 Tufted Titmouse 0.0142
4 Carolina Wren NA
5 Downy Woodpecker NA
6 White-breasted Nuthatch 0.00495
7 Song Sparrow 0.118
8 Northern Flicker 0.0199
9 Mourning Dove NA
10 Eastern Towhee NA
# ℹ 21 more rows
%>%
df_slopegraph left_join(df_corr_diff) %>%
ggplot(aes(item1, correlation)) +
geom_line(aes(group = item2, color = corr_diff), size = 2) +
geom_point(size = 2) +
geom_text_repel(data = filter(df_slopegraph, item1 == species_list[2]),
aes(y = correlation, label = item2), direction = "both", nudge_x = -.3, segment.alpha = .2) +
geom_text_repel(data = filter(df_slopegraph, item1 == species_list[1]),
aes(y = correlation, label = item2), direction = "both", nudge_x = .3, segment.alpha = .2) +
scale_color_viridis_c("Absolute difference in correlation") +
labs(x = NULL,
y = "Correlation") +
theme(panel.grid.minor.y = element_blank(),
panel.grid.major.y = element_blank(),
axis.title.x = element_blank())
Network Graph
Network graphs are fun ways to show counts or correlations between groups. Since birds often exist in distinct environments and vary by season, graph analysis should be able to visualize connections.
This takes the top 100 birds in terms of total count and makes a network graph object consisting of nodes and edges.
<- df_pair_corr %>%
graph_object_corr semi_join(df_counts %>% slice(1:75), by = c("item1" = "common_name")) %>%
semi_join(df_counts %>% slice(1:75), by = c("item2" = "common_name")) %>%
as_tbl_graph(directed = FALSE) %>%
activate(nodes) %>%
filter(!node_is_isolated()) %>%
activate(edges) %>%
filter(abs(correlation) >= .05) %>%
activate(nodes) %>%
filter(!node_is_isolated()) %>%
left_join(df_counts, by = c("name" = "common_name"))
graph_object_corr
# A tbl_graph: 74 nodes and 1925 edges
#
# An undirected simple graph with 1 component
#
# Node Data: 74 × 2 (active)
name species_count
<chr> <dbl>
1 American Crow 189804
2 American Goldfinch 60297
3 American Robin 180148
4 Belted Kingfisher 2754
5 Black-capped Chickadee 12379
6 Blue Jay 64894
7 Brown-headed Cowbird 8877
8 Bufflehead 5758
9 Canada Goose 159040
10 Carolina Chickadee 21862
# ℹ 64 more rows
#
# Edge Data: 1,925 × 3
from to correlation
<int> <int> <dbl>
1 1 2 0.262
2 1 3 0.259
3 2 3 0.371
# ℹ 1,922 more rows
This plots the network graph to show all the connections that fit the criteria in the code above:
<- graph_object_corr %>%
plot ggraph() +
geom_edge_link(aes(width = correlation, alpha = correlation)) +
geom_node_point(aes(size = species_count, alpha = species_count)) +
scale_size_continuous("Total observations", labels = scales::comma) +
scale_alpha_continuous("Total observations", labels = scales::comma) +
scale_edge_alpha("Observations together", range = c(.01, .3)) +
scale_edge_width("Observations together", range = c(.1, 2)) +
theme_void()
plot
There is a dense group of birds that are highly correlated with each other. This high correlation could be caused by a shared environment or seasonal migration patterns.
Highlight species
I also wanted to be able to see where a given species fits in the network graph. This code prepares a network graph object and filters out species that are not highly connected to the main group.
<- df_pair_corr %>%
graph_object_corr as_tbl_graph(directed = FALSE) %>%
activate(edges) %>%
filter(abs(correlation) > .2) %>%
activate(nodes) %>%
mutate(centrality = centrality_authority()) %>%
filter(centrality > .01) %>%
filter(!node_is_isolated())
This code identifies the node ID for the Northern Cardinal and makes a dataframe I will use to filter with in the next code chunk.
<- c("Northern Cardinal")
species_list
<- graph_object_corr %>%
df_species_corr_nodes activate(nodes) %>%
as_tibble() %>%
mutate(node_id = row_number()) %>%
filter(name %in% species_list)
This code identifes which node is the Northern Cardinal, and which edges connect to the Northern Cardinal. This identifies all the species that are connected to the bird, given the criteria I set above.
<- graph_object_corr %>%
plot_corr activate(nodes) %>%
mutate(name_label = case_when(name %in% species_list ~ name,
TRUE ~ as.character(NA))) %>%
activate(edges) %>%
left_join(df_species_corr_nodes, by = c("from" = "node_id")) %>%
left_join(df_species_corr_nodes, by = c("to" = "node_id")) %>%
mutate(name = coalesce(name.x, name.y),
centrality = coalesce(name.x, name.y)) %>%
select(-matches(".x|.y")) %>%
mutate(species_flag = case_when(from %in% df_species_corr_nodes$node_id | to %in% df_species_corr_nodes$node_id ~ TRUE,
TRUE ~ FALSE),
name = case_when(is.na(name) ~ "Other species",
TRUE ~ name)) %>%
ggraph() +
geom_edge_link(aes(alpha = species_flag, width = species_flag)) +
geom_node_point(aes(shape = !is.na(name_label), size = !is.na(name_label), color = name_label)) +
geom_node_label(aes(label = name_label, color = name_label), repel = TRUE) +
scale_edge_width_manual(values = c(.3, 1)) +
scale_edge_alpha_discrete(range = c(0.1, .5)) +
scale_shape_manual(values = c(1, 19)) +
scale_size_manual(values = c(2, 3)) +
guides(edge_alpha = FALSE,
edge_width = FALSE,
size = FALSE,
shape = FALSE,
color = FALSE) +
theme_void()
plot_corr