library(tidyverse)
library(lubridate)
library(janitor)
library(vroom)
library(broom)
library(hrbrthemes)
theme_set(theme_bw())
set.seed(1234)
In this post, I use k-means clustering to identify clusters of bird species based on frequency of observations per month. I use bird sightings in Allegheny County from eBird.
Load the relevant libraries:
Load and filter the data:
<- vroom("post_data/ebd_US-PA-003_201001_202003_relFeb-2020.zip", delim = "\t") %>%
df clean_names() %>%
mutate_at(vars(observer_id, locality, observation_date, time_observations_started, protocol_type), str_replace_na, "NA") %>%
mutate(observation_count = as.numeric(str_replace(observation_count, "X", as.character(NA))),
observation_event_id = str_c(observer_id, locality, observation_date, time_observations_started, sep = "-"),
observation_date = ymd(observation_date)) %>%
filter(all_species_reported == 1)
<- df %>%
df_top_protocols count(protocol_type, sort = TRUE) %>%
slice(1:2)
<- df %>%
df semi_join(df_top_protocols) %>%
filter(year(observation_date) >= 2016)
%>%
df select(common_name, observation_date, observation_count) %>%
glimpse()
Rows: 533,493
Columns: 3
$ common_name <chr> "American Black Duck", "American Black Duck", "Ameri…
$ observation_date <date> 2016-01-31, 2016-01-24, 2016-01-30, 2016-01-31, 201…
$ observation_count <dbl> 2, 2, 2, 3, 2, 3, 57, 7, 2, 4, 1, 5, 8, 1, 1, 4, 1, …
This graph shows general seasonality in bird observations:
%>%
df count(observation_date) %>%
ggplot(aes(observation_date, n)) +
geom_line() +
labs(x = "Observation date",
y = "Observation events") +
scale_y_comma()
This code chunk calculates the average number of observations by species and month. Then, it interpolates a value of 0
for birds where there were no sightings in a given month:
<- df %>%
months mutate(observation_month = month(observation_date, label = TRUE)) %>%
distinct(observation_month) %>%
pull(observation_month)
<- df %>%
df_seasonality mutate(observation_month = month(observation_date, label = TRUE),
observation_year = year(observation_date)) %>%
group_by(common_name, observation_year, observation_month) %>%
summarize(observation_count = sum(observation_count, na.rm = TRUE)) %>%
group_by(common_name, observation_month) %>%
summarize(observation_count_mean = mean(observation_count) %>% round(1)) %>%
ungroup() %>%
complete(common_name, observation_month = months) %>%
replace_na(list(observation_count_mean = 0)) %>%
arrange(common_name, observation_month)
glimpse(df_seasonality)
Rows: 3,672
Columns: 3
$ common_name <chr> "Acadian Flycatcher", "Acadian Flycatcher", "Ac…
$ observation_month <ord> Jan, Feb, Mar, Apr, May, Jun, Jul, Aug, Sep, Oc…
$ observation_count_mean <dbl> 0.0, 0.0, 0.0, 0.0, 184.8, 98.5, 67.0, 19.0, 22…
This transforms the mean monthly observation into log10:
<- df_seasonality %>%
df_seasonality mutate(observation_count_mean_log10 = log10(observation_count_mean),
observation_count_mean_log10 = case_when(is.infinite(observation_count_mean_log10) ~ 0,
TRUE ~ observation_count_mean_log10)) %>%
select(-observation_count_mean)
These graphs show that observations generally increase in the spring and fall, but there is wide variation:
%>%
df_seasonality ggplot(aes(observation_month, observation_count_mean_log10)) +
geom_boxplot() +
labs(x = "Observation month",
y = "Mean observation count (log10)")
This tile graph shows the seasonality trends per species. I sort the birds by ascending mean observation count by month. It shows there are birds that appear year-round, some that appear seasonally, and some that only appear sporadically:
<- df_seasonality %>%
vec_common_name pivot_wider(names_from = observation_month, values_from = observation_count_mean_log10, names_prefix = "month_") %>%
clean_names() %>%
arrange(month_jan, month_feb, month_mar, month_apr, month_may, month_jun, month_jul, month_aug, month_sep, month_oct, month_nov, month_dec) %>%
pull(common_name)
%>%
df_seasonality mutate(common_name = factor(common_name, levels = vec_common_name)) %>%
ggplot(aes(observation_month, common_name, fill = observation_count_mean_log10)) +
geom_tile() +
scale_fill_viridis_c("Mean observation count (log10)") +
scale_x_discrete(expand = c(0,0)) +
scale_y_discrete(expand = c(0,0)) +
labs(x = "Observation month",
y = "Species") +
theme(panel.grid = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank())
You can subjectively see clusters of bird types in the above graph. I will use k-means to attempt to find those clusters.
This code chunk pivots the data wide to prepare it for clustering:
<- df_seasonality %>%
df_seasonality_wide select(common_name, observation_month, observation_count_mean_log10) %>%
pivot_wider(names_from = observation_month, values_from = observation_count_mean_log10, names_prefix = "month_") %>%
clean_names()
glimpse(df_seasonality_wide)
Rows: 306
Columns: 13
$ common_name <chr> "Acadian Flycatcher", "Accipiter sp.", "Alder Flycatcher",…
$ month_jan <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0…
$ month_feb <dbl> 0.000000, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000…
$ month_mar <dbl> 0.0000000, 0.1760913, 0.0000000, 0.0000000, 0.0000000, 0.0…
$ month_apr <dbl> 0.0000000, 0.3010300, 0.0000000, 0.0000000, 0.0000000, 0.6…
$ month_may <dbl> 2.2667020, 0.3010300, 0.1760913, 0.4771213, 0.0000000, 0.0…
$ month_jun <dbl> 1.993436, 0.000000, 0.000000, 0.000000, 0.000000, 0.000000…
$ month_jul <dbl> 1.8260748, 0.0000000, 0.0000000, 0.4771213, 0.0000000, 0.0…
$ month_aug <dbl> 1.2787536, 0.0000000, 0.0000000, 0.3979400, 0.9030900, 0.0…
$ month_sep <dbl> 1.3424227, 0.0000000, 0.0000000, 1.0086002, 1.8450980, 0.0…
$ month_oct <dbl> 0.0000000, 0.3010300, 0.0000000, 0.0000000, 0.0000000, 0.0…
$ month_nov <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0…
$ month_dec <dbl> 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0000000, 0.0…
This uses purrr
to cluster the data with varying numbers of clusters (1 to 9):
<- tibble(k = 1:9) %>%
kclusts mutate(
kclust = map(k, ~kmeans(df_seasonality_wide %>% select(-common_name), .x)),
tidied = map(kclust, tidy),
glanced = map(kclust, glance),
augmented = map(kclust, augment, df_seasonality_wide %>% select(-common_name))
)
kclusts
# A tibble: 9 × 5
k kclust tidied glanced augmented
<int> <list> <list> <list> <list>
1 1 <kmeans> <tibble [1 × 15]> <tibble [1 × 4]> <tibble [306 × 13]>
2 2 <kmeans> <tibble [2 × 15]> <tibble [1 × 4]> <tibble [306 × 13]>
3 3 <kmeans> <tibble [3 × 15]> <tibble [1 × 4]> <tibble [306 × 13]>
4 4 <kmeans> <tibble [4 × 15]> <tibble [1 × 4]> <tibble [306 × 13]>
5 5 <kmeans> <tibble [5 × 15]> <tibble [1 × 4]> <tibble [306 × 13]>
6 6 <kmeans> <tibble [6 × 15]> <tibble [1 × 4]> <tibble [306 × 13]>
7 7 <kmeans> <tibble [7 × 15]> <tibble [1 × 4]> <tibble [306 × 13]>
8 8 <kmeans> <tibble [8 × 15]> <tibble [1 × 4]> <tibble [306 × 13]>
9 9 <kmeans> <tibble [9 × 15]> <tibble [1 × 4]> <tibble [306 × 13]>
<- kclusts %>%
clusters unnest(tidied)
<- kclusts %>%
assignments unnest(augmented)
<- kclusts %>%
clusterings unnest(glanced)
This scree plot shows that 2 clusters is probably optimal, but 4 could also be useful:
ggplot(clusterings, aes(k, tot.withinss)) +
geom_line() +
geom_vline(xintercept = 2, linetype = 2) +
geom_vline(xintercept = 4, linetype = 2) +
scale_x_continuous(breaks = seq(1:9)) +
labs(x = "Number of clusters")
This graph shows how the clustering performed by comparing the observation value in January to the value in the other months, for each k
cluster value 1 through 4:
%>%
assignments select(k, .cluster, contains("month_")) %>%
mutate(id = row_number()) %>%
pivot_longer(cols = contains("month_"), names_to = "observation_month", values_to = "observation_count_mean_log10") %>%
mutate(month_jan = case_when(observation_month == "month_jan" ~ observation_count_mean_log10,
TRUE ~ as.numeric(NA))) %>%
group_by(k, .cluster, id) %>%
fill(month_jan, .direction = c("down")) %>%
ungroup() %>%
filter(observation_month != "month_jan",
<= 4) %>%
k mutate(k = str_c(k, "cluster(s)", sep = " ")) %>%
ggplot(aes(observation_count_mean_log10, month_jan, color = .cluster)) +
geom_point() +
facet_grid(k ~ observation_month) +
labs(x = "Observation month",
y = "January")
Subjectively, I think the optimal number of clusters is 4. It is noiser, but could show more interesting granularity in seasonality.
This clusters the data using 4 clusters:
<- df_seasonality_wide %>%
df_kmeans select(-common_name) %>%
kmeans(centers = 4)
<- augment(df_kmeans, df_seasonality_wide) %>%
df_clustered select(common_name, .cluster)
df_clustered
# A tibble: 306 × 2
common_name .cluster
<chr> <fct>
1 Acadian Flycatcher 4
2 Accipiter sp. 3
3 Alder Flycatcher 3
4 Alder/Willow Flycatcher (Traill's Flycatcher) 3
5 American Avocet 3
6 American Bittern 3
7 American Black Duck 2
8 American Coot 2
9 American Crow 1
10 American Golden-Plover 3
# ℹ 296 more rows
This shows the same style of tile graph as shown previously, but facets it by cluster.
<- df_seasonality %>%
vec_common_name_cluster left_join(df_clustered) %>%
pivot_wider(names_from = observation_month, values_from = observation_count_mean_log10, names_prefix = "month_") %>%
clean_names() %>%
arrange(cluster, month_jan, month_feb, month_mar, month_apr, month_may, month_jun, month_jul, month_aug, month_sep, month_oct, month_nov, month_dec) %>%
pull(common_name)
<- df_seasonality %>%
df_seasonality_clustered left_join(df_clustered) %>%
mutate(common_name = factor(common_name, levels = vec_common_name_cluster))
%>%
df_seasonality_clustered mutate(.cluster = str_c("Cluster", .cluster, sep = " ")) %>%
ggplot(aes(observation_month, common_name, fill = observation_count_mean_log10)) +
geom_tile() +
facet_wrap(~.cluster, scales = "free_y") +
scale_fill_viridis_c("Mean observation count (log10)") +
scale_x_discrete(expand = c(0,0)) +
scale_y_discrete(expand = c(0,0)) +
labs(x = "Observation month",
y = "Species") +
theme(panel.grid = element_blank(),
axis.text.y = element_blank(),
axis.ticks.y = element_blank())
Cluster 1 shows birds that only appear sporadically. I think these are birds that migrate through Allegheny County, but do not stick around. Cluster 2 shows birds that are generally around all year. Cluster 3 shows birds that are seen mostly during the summer, and cluster 4 contains birds that appear in the winter.
This shows a sample of each cluster:
<- df_clustered %>%
df_cluster_sample group_by(.cluster) %>%
sample_n(10, replace = FALSE) %>%
ungroup()
%>%
df_seasonality_clustered semi_join(df_cluster_sample) %>%
mutate(.cluster = str_c("Cluster", .cluster, sep = " ")) %>%
ggplot(aes(observation_month, common_name, fill = observation_count_mean_log10)) +
geom_tile() +
facet_wrap(~.cluster, scales = "free_y") +
scale_fill_viridis_c("Mean observation count (log10)") +
scale_x_discrete(expand = c(0,0)) +
scale_y_discrete(expand = c(0,0)) +
labs(x = "Observation month",
y = NULL) +
theme(panel.grid = element_blank())