https://github.com/favstats/ica22_conf
https://github.com/favstats/ica22_conf
Last synced: 3 months ago
JSON representation
- Host: GitHub
- URL: https://github.com/favstats/ica22_conf
- Owner: favstats
- Created: 2022-05-24T12:46:57.000Z (about 4 years ago)
- Default Branch: master
- Last Pushed: 2022-05-24T12:54:11.000Z (about 4 years ago)
- Last Synced: 2025-07-17T01:44:07.064Z (11 months ago)
- Size: 1.75 MB
- Stars: 1
- Watchers: 1
- Forks: 0
- Open Issues: 0
-
Metadata Files:
- Readme: README.Rmd
Awesome Lists containing this project
README
---
title: "ICA22 Twitter Analysis"
author: "Fabio"
date: "2022-05-24"
output: github_document
---
This is a short notebook outlining the code used to scrape tweets related to the ICA22 conference in Paris.
```{r setup, include=FALSE}
# Chunk defaults for the whole notebook: echo the code, but keep messages and
# warnings out of the rendered output. Spelled-out TRUE/FALSE are used because
# the T/F shorthands are ordinary variables that can be reassigned.
knitr::opts_chunk$set(
  echo = TRUE,
  message = FALSE,
  error = FALSE,
  warning = FALSE
)
```
## Packages
Load the necessary packages
```{r}
# install pacman once if not available on your machine
# install.packages("pacman")
# ggthemes and tidytext are only called through `::` further down, but they
# still have to be installed for those calls to work, so they are listed here
# alongside the packages whose functions are used unqualified.
pacman::p_load(
  tidyverse, rtweet, ggraph, igraph, tidygraph,
  ggthemes, tidytext
)
```
## Get Data
Call the Twitter API. To collect the data yourself, you need to register for a free Twitter developer account, which gives you a personal access token. Check out [`rtweet`](https://github.com/mkearney/rtweet/) and follow the instructions there.
```{r, eval = F}
# twitter_token <- readRDS("twitter_token.rds")
# Query both spellings of the conference hashtag, retweets included, and wait
# out rate limits instead of stopping early.
# NOTE(review): `since`/`until` are passed through as extra arguments here;
# confirm against the rtweet docs that they actually restrict the search window.
rt <- search_tweets(
  "#ICA22 OR #ica22",
  n = 100000,
  include_rts = TRUE,
  retryonratelimit = TRUE,
  since = "2022-05-01",
  until = "2022-05-31"
)
# save() does not create missing directories, so make sure data/ exists first
dir.create("data", showWarnings = FALSE)
save(rt, file = "data/rt.Rdata")
```
Let's first look at the data structure and column names. Twitter returns over 1,200 unique tweets.
```{r}
# restore the `rt` data frame that the download step saved to disk
load("data/rt.Rdata")
# one line per column: name, type and the first few values
glimpse(rt)
```
## The Top Ten Retweeted Tweets
```{r, results="asis"}
rt %>%
  ## keep original tweets only: drop flagged retweets and manual "RT ..." copies
  filter(!is_retweet, !str_detect(text, "^RT")) %>%
  select(screen_name, text, retweet_count) %>%
  ## collapse line breaks so every tweet fits into a single table cell
  mutate(text = str_replace_all(text, "\\\n", " ")) %>%
  ## ten most retweeted tweets, highest first. The ranking column is named
  ## explicitly: top_n() without `wt` ranks by whichever column comes last.
  slice_max(retweet_count, n = 10) %>%
  knitr::kable(format = "markdown")
```
## Timeline
```{r, fig.height = 6}
rt %>%
  ## Express timestamps in the local time of the conference venue (Paris).
  ## The former "Europe/Germany" is not a valid time zone name, so the
  ## intended conversion to local time was not applied.
  mutate(
    created_at = lubridate::as_datetime(created_at, tz = "Europe/Paris"),
    ## calendar day and hour of day, both in the time zone set above
    cdate = lubridate::as_date(created_at),
    hour = lubridate::hour(created_at)
  ) %>%
  ## select relevant time period
  filter(cdate >= as.Date("2022-05-24") & cdate <= as.Date("2022-05-31")) %>%
  ## count tweets per day and hour
  count(cdate, hour) %>%
  ggplot(aes(hour, n)) +
  geom_line() +
  ## one panel per day, stacked vertically
  facet_wrap(~cdate, ncol = 1) +
  ggthemes::theme_hc() +
  scale_x_continuous(labels = seq(5, 24, 3), breaks = seq(5, 24, 3)) +
  # scale_y_continuous(labels = seq(0, 60, 20),
  #                    breaks = seq(0, 60, 20),
  #                    minor_breaks = seq(0, 60, 20)) +
  ggtitle("Number of Tweets by Hour of the Day mentioning #ICA22") +
  xlab("Hour of the Day") +
  ylab("Number of Tweets")
```
## Retweet Network
```{r, fig.width = 15, fig.height=15}
## Directed retweet graph: an edge runs from the retweeting account to the
## account that was retweeted, weighted by the number of retweets (n).
rt_graph <- rt %>%
  ## select relevant variables
  dplyr::select(screen_name, retweet_screen_name) %>%
  ## flatten retweet_screen_name; `cols` is named explicitly because calling
  ## unnest() without it is deprecated
  unnest(cols = retweet_screen_name) %>%
  ## tweets that are not retweets have no retweeted account
  drop_na() %>%
  ## count the number of co-occurrences
  count(screen_name, retweet_screen_name, sort = TRUE) %>%
  ## keep pairs that occur at least 2 times
  filter(n > 1) %>%
  ## transform the data frame into a graph object
  as_tbl_graph() %>%
  ## in-degree centrality of every node
  mutate(centrality = centrality_degree(mode = "in"))

## Components shared by all three plots below, defined once so the plots
## cannot drift apart.
retweet_layers <- list(
  ## edge aesthetics: transparency, width and colour all follow the edge weight
  geom_edge_fan(aes(alpha = n, edge_width = n, color = n)),
  ## scale down link saturation
  scale_edge_alpha(range = c(.5, .9)),
  ## edge colour gradient
  scale_edge_color_gradient(low = "gray50", high = "#1874CD"),
  ## node size follows centrality
  geom_node_point(aes(size = centrality), color = "gray30"),
  ## equal width and height
  coord_fixed(),
  ## plain theme
  theme_void(),
  ## title
  ggtitle("#ICA22 Retweet Network")
)

## 1) Kamada-Kawai layout, no labels
ggraph(rt_graph, layout = "kk") +
  retweet_layers

## 2) Kamada-Kawai layout with repelled node labels and a centred title
ggraph(rt_graph, layout = "kk") +
  retweet_layers +
  geom_node_text(aes(label = name), repel = TRUE, fontface = "bold") +
  theme(plot.title = element_text(size = 20, hjust = 0.5))

## 3) circular layout with node labels
ggraph(rt_graph, layout = "circle") +
  retweet_layers +
  geom_node_text(aes(label = name), repel = FALSE, fontface = "bold")
```
## Mentions Network
```{r, fig.width = 15, fig.height=15}
## Directed mentions graph: an edge runs from the tweeting account to each
## account it mentions, weighted by the number of such mentions (n).
rt_graph <- rt %>%
  ## remove retweets
  filter(!is_retweet) %>%
  ## select relevant variables
  dplyr::select(screen_name, mentions_screen_name) %>%
  ## one row per mentioned account; `cols` is named explicitly because calling
  ## unnest() without it is deprecated
  unnest(cols = mentions_screen_name) %>%
  ## tweets without mentions carry a missing value
  drop_na() %>%
  ## count the number of co-occurrences
  count(screen_name, mentions_screen_name, sort = TRUE) %>%
  ## keep pairs that occur at least 2 times
  filter(n > 1) %>%
  ## transform the data frame into a graph object
  as_tbl_graph() %>%
  ## in-degree centrality of every node
  mutate(centrality = centrality_degree(mode = "in"))

## Components shared by all three plots below, defined once so the plots
## cannot drift apart.
mention_layers <- list(
  ## edge aesthetics: transparency, width and colour all follow the edge weight
  geom_edge_fan(aes(alpha = n, edge_width = n, color = n)),
  ## scale down link saturation
  scale_edge_alpha(range = c(.5, .9)),
  ## edge colour gradient
  scale_edge_color_gradient(low = "gray50", high = "#1874CD"),
  ## node size follows centrality
  geom_node_point(aes(size = centrality), color = "gray30"),
  ## equal width and height
  coord_fixed(),
  ## plain theme
  theme_void(),
  ## title
  ggtitle("#ICA22 Twitter Mentions Network")
)

## 1) Kamada-Kawai layout, no labels
ggraph(rt_graph, layout = "kk") +
  mention_layers

## 2) Kamada-Kawai layout with repelled node labels
ggraph(rt_graph, layout = "kk") +
  mention_layers +
  geom_node_text(aes(label = name), repel = TRUE, fontface = "bold")

## 3) circular layout with node labels
ggraph(rt_graph, layout = "circle") +
  mention_layers +
  geom_node_text(aes(label = name), repel = FALSE, fontface = "bold")
```
### Smaller Mentions Network (n > 2)
```{r, fig.width = 15, fig.height=15}
## Same construction as the mentions network, with a stricter edge threshold.
rt_graph2 <- rt %>%
  ## remove retweets, as in the full mentions network; without this step the
  ## "smaller" network would also count mentions carried over by retweets
  filter(!is_retweet) %>%
  ## select relevant variables
  dplyr::select(screen_name, mentions_screen_name) %>%
  ## one row per mentioned account; `cols` is named explicitly because calling
  ## unnest() without it is deprecated
  unnest(cols = mentions_screen_name) %>%
  ## tweets without mentions carry a missing value
  drop_na() %>%
  ## count the number of co-occurrences
  count(screen_name, mentions_screen_name, sort = TRUE) %>%
  ## keep pairs that occur more than 2 times
  filter(n > 2) %>%
  ## transform the data frame into a graph object
  as_tbl_graph() %>%
  ## in-degree centrality of every node
  mutate(centrality = centrality_degree(mode = "in"))

ggraph(rt_graph2, layout = "kk") +
  ## edge aesthetics: transparency, width and colour all follow the edge weight
  geom_edge_fan(aes(alpha = n, edge_width = n, color = n)) +
  ## scale down link saturation
  scale_edge_alpha(range = c(.5, .9)) +
  ## edge colour gradient
  scale_edge_color_gradient(low = "gray50", high = "#1874CD") +
  ## node size follows centrality
  geom_node_point(aes(size = centrality), color = "gray30") +
  ## node labels, pushed apart to limit overlap
  geom_node_text(aes(label = name), repel = TRUE, fontface = "bold") +
  ## equal width and height
  coord_fixed() +
  ## plain theme
  theme_void() +
  ## title
  ggtitle("#ICA22 Twitter Mentions Network")
```
## Most Frequent Hashtags
```{r}
## Frequency table of hashtags used in original (non-retweet) tweets.
rt_hashtags <- rt %>%
  filter(!is_retweet) %>%
  select(hashtags) %>%
  ## one row per hashtag; `cols` is named explicitly because calling unnest()
  ## without it is deprecated
  unnest(cols = hashtags) %>%
  ## tweets without hashtags carry a missing value
  drop_na(hashtags) %>%
  ## clean hashtags
  mutate(
    hashtags = hashtags %>%
      stringr::str_to_lower() %>%
      ## fold the long spelling of the year into the short one so that e.g.
      ## "ica2022" and "ica22" are counted together (this was still "2018",
      ## which never matches for this conference)
      str_replace_all("2022", "22") %>%
      ## add the # symbol back
      paste0("#", .)
  ) %>%
  ## count each hashtag and sort
  count(hashtags, sort = TRUE) %>%
  ## keep hashtags used more than 5 times
  filter(n > 5)

rt_hashtags %>%
  ## the conference hashtag itself is in (almost) every tweet, so leave it out
  filter(hashtags != "#ica22") %>%
  ## order bars by frequency
  mutate(hashtags = forcats::fct_reorder(hashtags, n)) %>%
  ggplot(aes(hashtags, n)) +
  geom_col(alpha = .7) +
  coord_flip() +
  theme_minimal() +
  ggtitle("Most Frequent Hashtags related to #ICA22")
```
## Most Frequent Bigram Network
```{r}
## Graph of the 50 most frequent word pairs (bigrams) in original tweets,
## after URLs and stop words have been removed.
gg_bigram <- rt %>%
  ## remove retweets, both flagged ones and manual "RT ..." copies
  filter(!is_retweet, !stringr::str_detect(text, "^RT")) %>%
  select(text) %>%
  mutate(
    ## remove urls
    text = stringr::str_remove_all(text, "https?[:]//[[:graph:]]+"),
    ## tweet id, needed to glue the cleaned words back together;
    ## row_number() is also safe when no rows are left
    id = row_number()
  ) %>%
  ## split text into words
  tidytext::unnest_tokens(word, text, token = "words") %>%
  ## Drop the noise tokens "w" and "amp" as whole words. The former regex
  ## "w |amp " also matched inside ordinary text and glued words together
  ## ("new paper" became "nepaper").
  filter(!word %in% c("w", "amp")) %>%
  ## remove stop words
  anti_join(tidytext::stop_words, by = "word") %>%
  ## paste the remaining words back into one text per tweet
  group_by(id) %>%
  summarise(text = paste(word, collapse = " ")) %>%
  ungroup() %>%
  ## split the text again, this time into bigrams (pairs of adjacent words)
  tidytext::unnest_tokens(bigram, text, token = "ngrams", n = 2) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  ## remove incomplete pairs before ranking, so they cannot take up one of
  ## the 50 slots
  drop_na(word1, word2) %>%
  ## remove the hashtag and count bigrams
  filter(word1 != "ica22", word2 != "ica22") %>%
  count(word1, word2, sort = TRUE) %>%
  ## select first 50
  slice_head(n = 50) %>%
  ## create tidy graph object
  as_tbl_graph() %>%
  ## calculate node centrality
  mutate(centrality = centrality_degree(mode = "in"))
```
```{r}
## Draw the bigram graph with the default layout: edge width and transparency
## follow the bigram count, node size follows centrality.
ggraph(gg_bigram) +
  geom_edge_link(aes(edge_alpha = n, edge_width = n)) +
  geom_node_point(aes(size = centrality)) +
  ## word labels, pushed apart to limit overlap
  geom_node_text(aes(label = name), repel = TRUE) +
  ## keep edges fairly light; the empty string hides the legend title
  scale_edge_alpha("", range = c(0.3, .6)) +
  theme_void() +
  ggtitle("Top Bigram Network from Tweets using hashtag #ICA22")
```
```{r}
# print R and package version information for this session
utils::sessionInfo()
```