Ecosyste.ms: Awesome
An open API service indexing awesome lists of open source software.
https://github.com/tonofshell/bankers-empire-text-analysis
A text analysis of Bankers and Empire by Peter James Hudson
- Host: GitHub
- URL: https://github.com/tonofshell/bankers-empire-text-analysis
- Owner: tonofshell
- Created: 2019-01-29T08:03:30.000Z (almost 6 years ago)
- Default Branch: master
- Last Pushed: 2019-04-11T13:23:42.000Z (over 5 years ago)
- Last Synced: 2024-05-14T15:36:10.587Z (6 months ago)
- Size: 3.12 MB
- Stars: 1
- Watchers: 1
- Forks: 0
- Open Issues: 0
- Metadata Files:
  - Readme: README.Rmd
README
---
title: "*Bankers and Empire* Text Analysis"
output: github_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(tidyr)
library(dplyr)
library(ggplot2)
library(tidytext)
library(stringr)
library(here)
library(topicmodels)
library(igraph)
library(ggraph)
library(parallel)
library(wordcloud)

source("http://lingtools.uoregon.edu/scripts/english_syllable_counter-102.R")
set.seed(60615)
windows.options(antialias = "cleartype")
options(device = Cairo::CairoWin)

#setup workers for parallelization
setup_cl = function(seed = round(Sys.time())) {
  require(parallel)
  if (exists("cl")) {
    print("Stopping existing cluster")
    try(parallel::stopCluster(cl))
  }
  assign("cl", parallel::makeCluster(parallel::detectCores() - 1, outfile = "out.txt"), envir = globalenv())
  RNGkind("L'Ecuyer-CMRG")
  print(paste("Using", as.numeric(seed), "as parallel RNG seed"))
  clusterSetRNGStream(cl, seed)
}
setup_cl(60615)
```

## Loading and Cleaning
```{r load-text, cache=TRUE,echo=TRUE}
#load lines of book from text file
book_text = here("bankers_and_empire.txt") %>%
  read_file() %>%
  tibble(txt = .) %>%
  unnest_tokens(line, txt, token = "lines")

# load sentences of book from text file
book_sentences = here("bankers_and_empire.txt") %>%
  read_file() %>%
  tibble(txt = .) %>%
  unnest_tokens(sentence, txt, token = "sentences")

#remove captions
temp = tibble(sentence = character())
for (line_index in 1:length(book_sentences[[1]])) {
  if (!(str_length(book_sentences[line_index,]) < 50)) {
    temp = bind_rows(temp, book_sentences[line_index,])
  }
}
book_sentences = temp
#remove page numbers
temp = tibble(line = character())
for (line_index in 1:length(book_text[[1]])) {
  if (!str_detect(book_text[line_index,], "/") && !(str_length(book_text[line_index,]) < 50)) {
    temp = bind_rows(temp, book_text[line_index,])
  }
}
book_words = temp %>% unnest_tokens(word, line, token = "words")
rm(temp, book_text)

#remove possessive "'s" from words
for (line_index in 1:length(book_words[[1]])) {
  if (str_detect(book_words[line_index,], "’s") || str_detect(book_words[line_index,], "'s")) {
    book_words[line_index,] = str_sub(book_words[line_index,],
                                      start = 1, end = (str_length(book_words[line_index,]) - 2))
  }
}
```

## Wordcloud
```{r wordcloud, message=FALSE, warning=FALSE, cache=TRUE}
book_words %>%
  anti_join(stop_words) %>%
  count(word) %>%
  with(wordcloud(word, n, max.words = 150, colors = brewer.pal(8, "Set2")))
```

## Topic Modeling
```{r topic-modeling, message=FALSE, warning=FALSE}
book_words_cleaned = book_words %>% anti_join(stop_words)
model = tibble(document = 1,
               term = count(book_words_cleaned, word)[[1]],
               count = count(book_words_cleaned, word)[[2]]) %>%
  cast_dtm(document, term, count) %>%
  LDA(2, control = list(seed = 60615))

topics <- tidy(model, matrix = "beta")
top_terms <- topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)

top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

ggsave(here("Charts", "topic_model.pdf"), device = "pdf")
```

There doesn't seem to be enough heterogeneity in the text for an LDA model to pick out distinct topics.
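One likely contributor is that the whole book is cast into the document-term matrix as a single document (`document = 1`), so the model has nothing to contrast. A minimal sketch of splitting the text into pseudo-documents before refitting; this is not part of the original analysis, and the 2,000-word chunk size and the name `chunked_model` are arbitrary assumptions:

```{r topic-modeling-chunked, eval=FALSE}
# sketch: break the cleaned words into 2,000-word pseudo-documents so the
# LDA has multiple documents to contrast (chunk size is an assumption)
chunked_model = book_words_cleaned %>%
  mutate(chunk = (row_number() - 1) %/% 2000) %>%
  count(chunk, word) %>%
  cast_dtm(chunk, word, n) %>%
  LDA(2, control = list(seed = 60615))
```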
## Flesch-Kincaid Grade Level
The Flesch-Kincaid grade level is determined by the following formula: ![Flesch-Kincaid grade level formula](https://wikimedia.org/api/rest_v1/media/math/render/svg/8e68f5fc959d052d1123b85758065afecc4150c3)
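For reference, the same formula written out (it is exactly what the chunk below computes) is:

$$0.39\left(\frac{\text{total words}}{\text{total sentences}}\right) + 11.8\left(\frac{\text{total syllables}}{\text{total words}}\right) - 15.59$$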
```{r flesch-kincaid, echo=TRUE, message=FALSE, warning=FALSE, cache=TRUE}
n_words = length(book_words[[1]])
n_sent = length(book_sentences[[1]])
#warning: this will take a surprisingly long amount of time, even parallelized
n_syll = parSapply(cl, book_words$word, english_syllable_count) %>% sum()
f_k_score = 0.39 * (n_words / n_sent) + 11.8 * (n_syll / n_words) - 15.59
```
*Bankers and Empire* has a Flesch-Kincaid score of a `r round(f_k_score, digits = 0)`^th^ grade reading level.

## N-Gram
```{r n-gram, message=FALSE, warning=FALSE, cache=TRUE}
book_n_grams = nest(book_words) %>%
  mutate(text = map(data, unlist),
         text = map_chr(text, paste, collapse = " ")) %>%
  select(text) %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

# #filter out stop (uninteresting) words
# cont_stop = function(x) {
#   return(x %in% tidytext::stop_words)
# }

book_n_grams = separate(book_n_grams, bigram, c("word1", "word2"), sep = " ")
# # Calculate the number of cores
# no_cores <- detectCores() - 1
#
# # Initiate cluster
# cl <- makeCluster(no_cores)

book_n_grams$has_stop_one = book_n_grams$word1 %in% stop_words$word
book_n_grams$has_stop_two = book_n_grams$word2 %in% stop_words$word
bigrams = book_n_grams %>%
  filter(has_stop_one == FALSE) %>%
  filter(has_stop_two == FALSE) %>%
  unite(bigram, "word1", "word2", sep = " ")

bigram_count = count(bigrams, bigram, sort = TRUE)
bigram_count$n = as.numeric(bigram_count$n)

ggplot(data = filter(bigram_count, n > 48), aes(y = n, x = reorder(bigram, n))) +
  geom_col() +
  coord_flip() +
  labs(title = expression(paste("Top 15 Word Pairings in ", italic("Bankers and Empire"))), x = NULL, y = "Number of Observations")

ggsave(here("Charts", "word_pairings.pdf"), device = "pdf")
bigram_count = separate(bigram_count, bigram, c("word1", "word2"), sep = " ")
bigram_graph <- bigram_count %>%
  filter(n > 20) %>%
  graph_from_data_frame()

remove_axes <- theme(
  axis.text = element_blank(),
  axis.line = element_blank(),
  axis.ticks = element_blank(),
  panel.border = element_blank(),
  panel.grid = element_blank(),
  axis.title = element_blank()
)

ggraph(bigram_graph, layout = "fr") +
  geom_edge_link(color = "grey") +
  geom_node_point(color = "grey") +
  geom_node_text(aes(label = name), nudge_x = 0, nudge_y = 0, repel = TRUE) +
  labs(title = expression(paste("Common Word Pairings in ", italic("Bankers and Empire")))) +
  theme_light() +
  remove_axes

ggsave(here("Charts", "word_pair_graph.pdf"), device = "pdf")
```