library(tidyverse)
library(tidytext)
library(ggraph)
library(igraph)
library(widyr)
library(SnowballC)
library(lubridate)
library(viridis)
library(scales)
library(knitr)
library(kableExtra)
Pittsburgh Mayor Bill Peduto uses Twitter to communicate with his constituents, express his political opinions, and comment about Pittsburgh sports. I will use various R packages (mainly the Tidyverse, Tidytext, and rtweet) to analyze how the Mayor uses Twitter.
Before getting started, we need to load the packages required for this analysis:
First, we will download @billpeduto’s recent tweets using rtweet
::tweets_bill <- get_timelines("BillPeduto", n = 3200) rtweet
I have already downloaded the data, so I will load it from my GitHub repo. I will also do some data munging to make the data easier to work with.
<- "https://raw.githubusercontent.com/conorotompkins/pittsburgh_twitter/master/data/df_billpeduto.csv"
url
read_csv(url) %>%
mutate(created_at = with_tz(created_at, "US/Eastern"),
date = ymd(str_sub(created_at, 1, 10)),
year = year(date),
month = month(date, label = TRUE),
week = week(date),
wday = wday(date, label = TRUE),
hour = hour(created_at),
month_year = str_c(month, year, sep = "-"),
month_year = factor(month_year, levels = unique(month_year)),
wday = factor(wday, levels = c("Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"))) -> df_bill
1:5, 1:5] %>%
df_bill[kable("html") %>%
kable_styling()
status_id | created_at | source | is_quote | is_retweet |
---|---|---|---|---|
x892785891055468548 | 2017-08-02 12:35:18 | Twitter for iPhone | FALSE | TRUE |
x892804872390795264 | 2017-08-02 13:50:43 | Twitter for iPhone | FALSE | TRUE |
x892820785823404032 | 2017-08-02 14:53:57 | Twitter for iPhone | FALSE | TRUE |
x892926762933506048 | 2017-08-02 21:55:04 | Twitter for iPhone | FALSE | TRUE |
x892953788679692288 | 2017-08-02 23:42:28 | Twitter for iPhone | FALSE | FALSE |
Exploratory Analysis
This plot shows that the Mayor tweets about 15 times a day, though there is considerable daily variation.
%>%
df_bill count(date) %>%
ggplot(aes(date, n)) +
geom_jitter(alpha = .5) +
geom_smooth(size = 2) +
scale_x_date(date_breaks = "month",
date_labels = "%b-%Y") +
labs(title = title,
x = "",
y = "Number of tweets",
caption = caption)
The Mayor tweets the most on weekdays; Tuesday has the highest median number of tweets. The weekends see fewer tweets, and Saturday and Sunday are very similar.
%>%
df_bill count(wday, date) %>%
group_by(wday) %>%
mutate(median = median(n)) %>%
ggplot(aes(wday, n, color = wday)) +
geom_jitter(alpha = .3, width = .2) +
geom_point(aes(wday, median), size = 5) +
scale_color_viridis(name = "Day of the week", discrete = TRUE) +
labs(title = title,
subtitle = "One small dot = one day, large dot = median for weekday",
x = "",
y = "Number of tweets in a given day",
caption = caption)
The Mayor does most of his tweeting around lunchtime and in the late evening.
%>%
df_bill count(hour) %>%
ggplot(aes(hour, y = 1, fill = n)) +
geom_tile() +
coord_equal() +
scale_y_continuous(expand = c(0,0)) +
scale_x_continuous(expand = c(0,0),
breaks = seq(0, 23, by = 3)) +
scale_fill_viridis(name = "Number of tweets",
option = 3) +
guides(fill = guide_colorbar(title.position = "top")) +
labs(title = title,
x = "Hour",
y = "",
caption = caption) +
theme(axis.text.y = element_blank(),
axis.ticks.y = element_blank(),
legend.position = "bottom",
legend.text = element_text(size = 7))
This view combines the day of the week and hour variables, and confirms what we observed when looking at the variables independently.
%>%
df_bill mutate(wday = factor(wday, levels = rev(levels(df_bill$wday)))) %>%
count(wday, hour) %>%
complete(wday, hour = 0:23) %>%
replace_na(list(n = 0)) %>%
ggplot(aes(hour, wday, fill = n)) +
geom_tile() +
coord_equal() +
scale_x_continuous(expand = c(0,0),
breaks = seq(0, 24, by = 3)) +
scale_y_discrete(expand = c(0,0)) +
scale_fill_viridis(option = 3) +
labs(title = title,
x = "Hour",
y = "",
caption = caption) +
guides(fill = guide_colorbar("Number of tweets")) +
theme(legend.position = "bottom")
This plot shows the percentage of the Mayor’s tweets that are regular tweets, retweets, quotes, and replies.
%>%
df_bill select(date, month_year, month, week, is_retweet, is_quote, reply_to_screen_name) %>%
mutate(tweet_type = case_when(is.na(reply_to_screen_name) == FALSE ~ "Reply",
== FALSE & is_quote == FALSE ~ "Regular tweet",
is_retweet == TRUE ~ "Retweet",
is_retweet == TRUE ~ "Quote")) -> df_bill_tweet_types
is_quote
%>%
df_bill_tweet_types count(month_year, tweet_type) %>%
mutate(tweet_type = factor(tweet_type, levels = rev(c("Retweet", "Regular tweet", "Quote", "Reply")))) -> df_bill_tweet_types_month_year
%>%
df_bill_tweet_types_month_year ggplot(aes(month_year, n, fill = tweet_type, group = tweet_type)) +
geom_area(position = "fill") +
scale_fill_viridis(name = "Tweet type", discrete = TRUE) +
scale_x_discrete(expand = c(0,0)) +
scale_y_continuous(expand = c(0,0), labels = scales::percent) +
labs(title = "Types of @billpeduto tweets",
x = "",
y = "Percentage of tweets",
caption = caption) +
theme(axis.text.x = element_text(size = 10))
This heatmap shows that the Mayor replies to tweets most often in the late evening.
%>%
df_bill filter(!is.na(reply_to_screen_name), is_quote == FALSE, is_retweet == FALSE) %>%
count(wday, hour) %>%
complete(wday, hour = 0:23) %>%
replace_na(list(n = 0)) %>%
ggplot(aes(hour, wday, fill = n)) +
geom_tile() +
coord_equal() +
scale_x_continuous(expand = c(0,0),
breaks = seq(0, 24, by = 3)) +
scale_y_discrete(expand = c(0,0)) +
scale_fill_viridis(option = 3) +
labs(x = "Hour",
y = "Day of the week") +
guides(fill = guide_colorbar("Number of tweets")) +
theme(legend.position = "bottom")
Text Analysis
The R package Tidytext makes it very easy to parse text data and pull insights from it. It interfaces very easily with ggraph, which creates network graphs.
set.seed(1234)
<- read_csv("https://raw.githubusercontent.com/conorotompkins/pittsburgh_twitter/master/data/bill_peduto_tweets.tweets.csv") df_bill
By separating the Mayor’s tweets into bigrams (two-word chunks), we can identify which words are used together. This code cuts the tweets into bigrams and counts their occurences. I exclude retweets and quotes from this analysis.
<- function(dataset, custom_stopwords) {
count_twitter_bigrams <- "https://t.co/[A-Za-z\\d]+|http://[A-Za-z\\d]+|&|<|>|RT|https|'s"
replace_reg
%>%
dataset filter(is_quote == FALSE, is_retweet == FALSE) %>%
mutate(text = str_replace_all(text, replace_reg, "")) %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!word1 %in% stop_words$word,
!word2 %in% stop_words$word,
!word1 %in% custom_stopwords,
!word2 %in% custom_stopwords) %>%
count(word1, word2, sort = TRUE)
}
<- c("t.co", "https", "amp")
bill_stop_words
<- count_twitter_bigrams(df_bill, bill_stop_words)
tweets_bill
%>%
tweets_bill rename(count = n) %>%
head() %>%
kable("html") %>%
kable_styling()
word1 | word2 | count |
---|---|---|
public | safety | 12 |
affordable | housing | 10 |
amazon | hq2 | 8 |
climate | change | 6 |
common | sense | 6 |
u.s | cities | 6 |
This network graph shows which words the Mayor uses together frequently. “public” + “safety”, and “affordable” + “housing” are the main standouts. I think this plot pretty accurately captures the topics the Mayor tweets about.
<- function(bigrams, minimum, text_size = 3, title = NULL, subtitle = NULL, caption = NULL) {
visualize_bigrams set.seed(2016)
<- grid::arrow(type = "closed",
a length = unit(.1, "inches"))
%>%
bigrams filter(n >= minimum) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), size = text_size, vjust = 1, hjust = 1) +
geom_edge_link(aes(edge_alpha = n, edge_width = n), show.legend = TRUE, arrow = a, end_cap = circle(.25, 'inches')) +
scale_edge_width_continuous("Count", range = c(.5, 1.5)) +
scale_edge_alpha_continuous("Count", range = c(.3, .7)) +
labs(title = title,
subtitle = subtitle,
caption = caption) +
theme_void(base_size = 18)
}
visualize_bigrams(tweets_bill,
minimum = 4,
text_size = 5,
title = "@billpeduto tweets",
subtitle = "Bigram network",
caption = caption)
This code determines which words are correlated with each other:
<- function(dataframe, minimum, custom_stopwords){
word_correlations <- "https://t.co/[A-Za-z\\d]+|http://[A-Za-z\\d]+|&|<|>|RT|https"
replace_reg <- "([^A-Za-z_\\d#@']|'(?![A-Za-z_\\d#@]))"
unnest_reg %>%
dataframe filter(is_quote == FALSE, is_retweet == FALSE) %>%
select(status_id, text) %>%
mutate(section = row_number() %/% 10) %>%
filter(section > 0) %>%
mutate(text = str_replace_all(text, replace_reg, "")) %>%
unnest_tokens(word, text, token = "regex", pattern = unnest_reg) %>%
filter(!word %in% stop_words$word,
!word %in% custom_stopwords,
str_detect(word, "[a-z]")) %>%
mutate(word = str_replace(word, "'", ""),
word = str_replace(word, "'", ""),
word = SnowballC::wordStem(word)) %>%
group_by(word) %>%
filter(n() >= minimum) %>%
pairwise_cor(word, section, sort = TRUE)
}
<- c("t.co", "https", "amp")
bill_stopwords
<- word_correlations(df_bill, minimum = 10, custom_stopwords = bill_stopwords) bill_words
This plot shows which words are correlated highly with each other. Some words are “stemmed” (shortened and/or modified), to capture very similar words. For example, “business” and “businesses” could be stemmed to “busine”.
“amazon” and “hq2” are used together overwhelmingly. The “pwsa” + “water” + “decad” combination indicate the ongoing water quality problems, and the plans to reinvest in the infrastructure. This plot shows how some of the policy issues the Mayor focuses on are connected.
<- function(dataframe, title, subtitle, caption){
visualize_word_correlations %>%
dataframe filter(correlation > .3) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation, edge_width = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_label(aes(label = name), size = 5, repel = TRUE) +
scale_edge_alpha_continuous(range = c(.1, .5)) +
theme_void(base_size = 18) +
labs(title = title,
subtitle = subtitle,
caption = caption)
}
visualize_word_correlations(bill_words,
title = "@billpeduto tweets",
subtitle = "Word correlation network",
caption = "@conor_tompkins")
References
- https://github.com/mkearney/rtweet
- http://rtweet.info/
- http://rtweet.info/articles/intro.html
- https://stackoverflow.com/questions/47681690/no-twitter-authorization-prompt-when-using-rtweet-package/48275078#48275078
- https://stackoverflow.com/questions/47910979/setting-up-rtweet-for-r-in-aws-ubuntu-server
- https://github.com/r-lib/httr/issues/156
- https://github.com/mkearney/rtweet/issues/75
- https://github.com/geoffjentry/twitteR/issues/65
- https://github.com/r-lib/httr/blob/master/demo/oauth1-twitter.r