Ecosyste.ms: Awesome

An open API service indexing awesome lists of open source software.

Awesome Lists | Featured Topics | Projects

https://github.com/tonofshell/bankers-empire-text-analysis

A text analysis of Bankers and Empire by Peter James Hudson
https://github.com/tonofshell/bankers-empire-text-analysis

Last synced: 3 months ago
JSON representation

A text analysis of Bankers and Empire by Peter James Hudson

Awesome Lists containing this project

README

        

---
title: "*Bankers and Empire* Text Analysis"
output: github_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
library(tidyverse)
library(tidyr)
library(dplyr)
library(ggplot2)
library(tidytext)
library(stringr)
library(here)
library(topicmodels)
library(igraph)
library(ggraph)
library(parallel)
library(wordcloud)

source("http://lingtools.uoregon.edu/scripts/english_syllable_counter-102.R")

set.seed(60615)
windows.options(antialias = "cleartype")
options(device = Cairo::CairoWin)

#setup workers for parallelization
setup_cl = function(seed = round(Sys.time())) {
require(parallel)
if (exists("cl")) {
print("Stopping existing cluster")
try(parallel::stopCluster(cl))
}
assign("cl", parallel::makeCluster(parallel::detectCores() - 1, outfile = "out.txt"), envir = globalenv())
RNGkind("L'Ecuyer-CMRG")
print(paste("Using", as.numeric(seed), "as parallel RNG seed"))
clusterSetRNGStream(cl, seed)
}
setup_cl(60615)
```

## Loading and Cleaning

```{r load-text, cache=TRUE,echo=TRUE}
#load lines of book from text file
book_text = here("bankers_and_empire.txt") %>%
read_file() %>%
tibble(txt = .) %>%
unnest_tokens(line, txt, token = "lines")

# load sentences of book from text file
book_sentences = here("bankers_and_empire.txt") %>%
read_file() %>%
tibble(txt = .) %>%
unnest_tokens(sentence, txt, token = "sentences")

#remove captions
temp = tibble(sentence = character())
for (line_index in 1:length(book_sentences[[1]])) {
if (!(str_length(book_sentences[line_index,]) < 50)) {
temp = bind_rows(temp, book_sentences[line_index,])
}
}

book_sentences = temp

#remove page numbers
temp = tibble(line = character())
for (line_index in 1:length(book_text[[1]])) {
if (!str_detect(book_text[line_index,], "/") && !(str_length(book_text[line_index,]) < 50)) {
temp = bind_rows(temp, book_text[line_index,])
}
}

book_words = temp %>% unnest_tokens(word, line, token = "words")
rm(temp, book_text)

#remove possessive "'s" from words
for (line_index in 1:length(book_words[[1]])) {
if (str_detect(book_words[line_index,], "’s") || str_detect(book_words[line_index,], "'s")) {
book_words[line_index,] = str_sub(book_words[line_index,],
start = 1, end = (str_length(book_words[line_index,]) - 2))
}
}
```

## Wordcloud
```{r wordcloud, message=FALSE, warning=FALSE, cache=TRUE}
book_words %>%
anti_join(stop_words) %>%
count(word) %>%
with(wordcloud(word, n, max.words = 150, colors = brewer.pal(8, "Set2")))
```

## Topic Modeling
```{r topic-modeling, message=FALSE, warning=FALSE}
book_words_cleaned = book_words %>% anti_join(stop_words)
model = tibble(document = 1, term = count(book_words_cleaned, word)[[1]], count = count(book_words_cleaned, word)[[2]]) %>%
cast_dtm(document, term, count) %>%
LDA(2, control = list(seed = 60615))

topics <- tidy(model, matrix = "beta")

top_terms <- topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)

top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()

ggsave(here("Charts", "topic_model.pdf"), device = "pdf")
```

There doesn't seem to be enough heterogenity in the text for an LDA model to pick out different topics.

## Flesch-Kincaid Grade Level

The Flesch-Kincaid grade level is determined by the function ![Flesch-Kincaid grade level formula](https://wikimedia.org/api/rest_v1/media/math/render/svg/8e68f5fc959d052d1123b85758065afecc4150c3)
```{r flesch-kincaid, echo=TRUE, message=FALSE, warning=FALSE, cache=TRUE}
n_words = length(book_words[[1]])
n_sent = length(book_sentences[[1]])
#warning: this will take a surprisingly long amount of time, even parallelized
n_syll = parSapply(cl, book_words$word, english_syllable_count) %>% sum()
f_k_score = 0.39 * (n_words / n_sent) + 11.8 * (n_syll / n_words) - 15.59
```
*Bankers and Empire* has a Flesch-Kincaid score of a `r round(f_k_score, digits = 0)`^th^ grade reading level.

## N-Gram
```{r n-gram, message=FALSE, warning=FALSE, cache=TRUE}
book_n_grams = nest(book_words) %>%
mutate(text = map(data, unlist),
text = map_chr(text, paste, collapse = " ")) %>%
select(text) %>% unnest_tokens(bigram, text, token = "ngrams", n = 2)

# #filter out stop (uninteresting) words
# cont_stop = function(x) {
# return(x %in% tidytext::stop_words)
# }

book_n_grams = separate(book_n_grams, bigram, c("word1", "word2"), sep = " ")

# # Calculate the number of cores
# no_cores <- detectCores() - 1
#
# # Initiate cluster
# cl <- makeCluster(no_cores)

book_n_grams$has_stop_one = book_n_grams$word1 %in% stop_words$word
book_n_grams$has_stop_two = book_n_grams$word2 %in% stop_words$word

bigrams = book_n_grams %>%
filter(has_stop_one == FALSE) %>%
filter(has_stop_two == FALSE) %>%
unite(bigram, "word1", "word2", sep = " ")

bigram_count = count(bigrams, bigram, sort = TRUE)
bigram_count$n = as.numeric(bigram_count$n)

ggplot(data = filter(bigram_count, n > 48), aes(y = n, x = reorder(bigram, n))) +
geom_col() +
coord_flip() +
labs(title = expression(paste("Top 15 Word Pairings in ", italic("Bankers and Empire"))), x = NULL, y = "Number of Observations")

ggsave(here("Charts", "word_pairings.pdf"), device = "pdf")

bigram_count = separate(bigram_count, bigram, c("word1", "word2"), sep = " ")

bigram_graph <- bigram_count %>%
filter(n > 20) %>%
graph_from_data_frame()

remove_axes <- theme(
axis.text = element_blank(),
axis.line = element_blank(),
axis.ticks = element_blank(),
panel.border = element_blank(),
panel.grid = element_blank(),
axis.title = element_blank()
)

ggraph(bigram_graph, layout = "fr") +
geom_edge_link(color = "grey") +
geom_node_point(color = "grey") +
geom_node_text(aes(label = name), nudge_x = 0, nudge_y = 0, repel = TRUE) +
labs(title = expression(paste("Common Word Pairings in ", italic("Bankers and Empire")))) +
theme_light() +
remove_axes

ggsave(here("Charts", "word_pair_graph.pdf"), device = "pdf")
```