https://github.com/mkearney/trumptweets

Download data on all of Donald Trump's (@realDonaldTrump) tweets
https://github.com/mkearney/trumptweets

dataset donald-trump mkearney-dataset r r-rtweet tweets twitter twitter-api

Last synced: about 1 month ago
JSON representation

Download data on all of Donald Trump's (@realDonaldTrump) tweets

Host: GitHub
URL: https://github.com/mkearney/trumptweets
Owner: mkearney
Created: 2017-01-26T02:10:26.000Z (over 8 years ago)
Default Branch: master
Last Pushed: 2018-10-02T04:25:01.000Z (about 7 years ago)
Last Synced: 2025-04-12T10:55:42.005Z (7 months ago)
Topics: dataset, donald-trump, mkearney-dataset, r, r-rtweet, tweets, twitter, twitter-api
Language: R
Size: 37.8 MB
Stars: 42
Watchers: 3
Forks: 5
Open Issues: 2
Metadata Files:
- Readme: README.Rmd

Awesome Lists containing this project

README

---
output: github_document
---

## Read data

```{r}

## read the archived tweets shipped in the data/ directory

## (CSV alternative, left commented out)

#rdt <- read.csv("data/trumptweets-1515775693.tweets.csv")

## load the serialized R object instead of the CSV

rdt <- readRDS("data/trumptweets-1515775693.rds")

## preview data (printing the object at top level shows it in the output)

rdt

```

## Download all of Donald Trump's tweets using R

1. Install and load rtweet.

```{r, eval=FALSE}

## install rtweet package from CRAN
install.packages("rtweet")

## alternatively, install dev version from GitHub; this needs devtools,
## which is installed first when it is not already available.
## requireNamespace() checks for the package directly rather than scanning
## the whole installed.packages() matrix.
if (!requireNamespace("devtools", quietly = TRUE)) {
  install.packages("devtools")
}
devtools::install_github("mkearney/rtweet")

## load rtweet
library(rtweet)

```

2. Read in the following functions. You'll use the last function, `trumptweets()`, to download the data.

```{r, eval=FALSE}

#' get_trumptwitterarchive
#'
#' Returns data from trumptwitterarchive.com.
#'
#' @param years Numeric vector of years from which to collect data. The
#'   defaults (NULL, TRUE, or "all") expand to 2009 through the current year.
#' @return Returns data frame (tbl) with a single \code{status_id} column. The
#'   "data" attribute holds the list of data by year (one element per
#'   requested year) and the "years" attribute holds the requested years.
#' @importFrom tibble as_tibble
#' @examples
#' \dontrun{
#' ## get data
#' tta <- get_trumptwitterarchive()
#' }
#' @export
get_trumptwitterarchive <- function(years = NULL) {
  ## default to years 2009 through current
  if (is.null(years) || isTRUE(years) || identical(tolower(years), "all")) {
    years <- seq(2009, as.integer(format(Sys.Date(), "%Y")))
  }
  stopifnot(is.numeric(years))

  ## get data for each year
  tta <- lapply(years, trumptwitterarchive_)

  ## make status IDs data frame with tta as attribute;
  ## as.character() keeps the column a zero-length character vector (rather
  ## than NULL) when no year returned any IDs
  ids <- lapply(tta, "[[", "id_str")
  ids <- tibble::as_tibble(
    list(status_id = as.character(unlist(ids, use.names = FALSE)))
  )

  ## list of data (element = year)
  attr(ids, "data") <- tta
  attr(ids, "years") <- years
  ids
}

#' trumptwitterarchive_data
#'
#' Extracts full trumptwitterarchive data
#'
#' @param data Data frame returned by \code{\link{get_trumptwitterarchive}} with
#'   "data" attribute.
#' @param years Optional integer used to subset data to return only certain years.
#'   Defaults to NULL, which means all data is returned.
#' @return List of full data from trumptwitterarchive.com. When \code{years} is
#'   supplied, only the elements whose "years" attribute entry is in
#'   \code{years} are returned. If the "years" attribute is missing or does
#'   not line up with the data, all data is returned with a warning.
#' @export
trumptwitterarchive_data <- function(data, years = NULL) {
  if (!"data" %in% names(attributes(data))) {
    stop("Archive data not found", call. = FALSE)
  }
  ## exact = TRUE avoids partial matching against other attribute names
  tta <- attr(data, "data", exact = TRUE)

  ## by default, return tta data
  if (is.null(years)) {
    return(tta)
  }

  ## if years to subset are provided
  data_years <- attr(data, "years", exact = TRUE)

  ## the years attribute must label the data one-to-one; compare it with the
  ## data itself (not with the requested years) before using it as an index
  if (is.null(data_years) || length(data_years) != length(tta)) {
    warning(
      "Length of years attribute differs from length of data. Returning all extracted data",
      call. = FALSE
    )
  } else {
    tta <- tta[data_years %in% years]
  }
  tta
}

#' trumptwitterarchive_
#'
#' Internal function used to retrieve trumptwitterarchive data
#'
#' @param year Integer, specifying year of data to return.
#' @param fromJSON Logical, indicating whether to convert response object to
#'   nested list object.
#' @return Response object from trumptwitterarchive request converted (by default)
#'   to R-friendly list object. An empty data frame is returned when the body
#'   is empty or is an HTML page instead of JSON.
#' @importFrom httr content GET
#' @importFrom jsonlite fromJSON
#' @noRd
#' @keywords internal
trumptwitterarchive_ <- function(year, fromJSON = TRUE) {
  ## build and send request
  url <- paste0(
    "http://trumptwitterarchive.com/",
    "data/realdonaldtrump/",
    year,
    ".json"
  )

  ## response object
  r <- httr::GET(url)

  ## check http status (warns, does not stop)
  httr::warn_for_status(r)

  ## if fromJSON then convert to list otherwise return response object
  if (fromJSON) {
    ## explicit encoding avoids httr guessing
    r <- httr::content(r, "text", encoding = "UTF-8")

    ## an empty/unreadable body or an html page cannot be parsed as JSON.
    ## NOTE: "<" must not be backslash-escaped here; in R regular
    ## expressions "\\<" is a word-boundary assertion, not a literal "<".
    no_body <- length(r) != 1L || is.na(r) || !nzchar(r)
    if (no_body || grepl("^[[:space:]]*<!DOCTYPE", r, ignore.case = TRUE)) {
      r <- data.frame()
    } else {
      r <- jsonlite::fromJSON(r)
    }
  }
  r
}

## function to download status ids
##
## trumptwitterarchive: if TRUE (default), collect ids from
##   trumptwitterarchive.com via get_trumptwitterarchive(); otherwise read
##   the id file stored in the mkearney/trumptweets GitHub repository.
## years: years requested from trumptwitterarchive.com (only used when
##   trumptwitterarchive is TRUE). Defaults to 2009:2017.
##
## returns a character vector of status ids
trumpids <- function(trumptwitterarchive = TRUE, years = 2009:2017) {
  if (trumptwitterarchive) {
    ## scrape from trumptwitterarchive.com
    ids <- get_trumptwitterarchive(years)[["status_id"]]
  } else {
    ## or from my github page (note: this one is unlikely to
    ## be updated as frequently as trumptwitterarchive).
    ## The raw file URL is used because the "/blob/" page is HTML, not CSV.
    csv_url <- paste0(
      "https://raw.githubusercontent.com/mkearney/trumptweets/",
      "master/data/realdonaldtrump-ids-2009-2017.csv"
    )
    ## read ids as character: they are too large to be stored exactly
    ## as doubles
    ids <- read.csv(
      csv_url,
      stringsAsFactors = FALSE,
      colClasses = "character"
    )
  }

  ## return ids as a plain character vector
  as.character(unlist(ids, use.names = FALSE))
}

## function to download twitter data
##
## Collects the archived status ids with trumpids(), requests the
## timeline using the last archived id as since_id, then looks up the
## archived statuses in (at most) two batches.
##
## returns the combined data frame, or a list of the three pieces if they
## cannot be row-bound
trumptweets <- function() {
  ## get archive of status ids
  ids <- trumpids()
  n_ids <- length(ids)
  if (n_ids == 0L) {
    stop("No status IDs were retrieved", call. = FALSE)
  }

  ## get newest trump tweets (set to 1000 to be safe)
  rt1 <- get_timeline(
    "realdonaldtrump",
    n = 1000,
    since_id = ids[n_ids]
  )

  ## download archive; the first batch holds at most 16000 ids so that
  ## shorter id vectors are not padded with NA
  message("    Downloading ", n_ids, " tweets...")
  first_batch <- seq_len(min(16000L, n_ids))
  rt2 <- lookup_statuses(ids[first_batch])
  message("    You're halfway there...")

  ## second batch only when ids remain (avoids a descending 16001:n index)
  remaining <- ids[-first_batch]
  rt3 <- if (length(remaining) > 0L) lookup_statuses(remaining) else NULL
  message("    Huzzah!!!")

  ## combine data into list
  rt <- list(rt1, rt2, rt3)

  ## collapse into data frame (or salvage list if error)
  tryCatch(
    do.call("rbind", rt),
    error = function(e) rt
  )
}

```

3. Download all of Trump's tweets.

```{r, eval=FALSE}

## run function to download Trump's twitter archive

## (trumptweets() returns a list of pieces instead of a single data frame

## if the pieces cannot be combined with rbind)

djt <- trumptweets()

```

4. Save the data file.

```{r, eval=FALSE}

## To save as an excel file (requires the openxlsx package):

install.packages("openxlsx")

openxlsx::write.xlsx(djt, "realdonaltrump-fullarchive.xlsx")

## To save as csv file (base R, without row names)

write.csv(djt, "realdonaltrump-fullarchive.csv",

          row.names = FALSE)

## To preserve meta information and save as csv file

## NOTE: this writes to the same path as the write.csv() call above, so

## running both leaves only the readr version on disk.

## NOTE(review): file names are spelled "realdonaltrump" (no second "d");

## confirm whether that is intended.

install.packages("readr")

readr::write_csv(djt, "realdonaltrump-fullarchive.csv")

```

## Inspecting the data

```{r, eval=FALSE}

## first rows of the downloaded data
head(djt)

## 100 most frequent hashtags: split the space-separated hashtag strings,
## lower-case them, tabulate and sort by count
djt$hashtags %>%
  strsplit(split = " ") %>%
  unlist(use.names = FALSE) %>%
  tolower() %>%
  table() %>%
  sort(decreasing = TRUE) %>%
  head(n = 100)

## 100 most frequent mentions, tabulated the same way
djt$mentions_screen_name %>%
  strsplit(split = " ") %>%
  unlist(use.names = FALSE) %>%
  tolower() %>%
  table() %>%
  sort(decreasing = TRUE) %>%
  head(n = 100)

## text of the first 50 tweets in the data
djt$text[seq_len(50)]

```

## Plotting the data

```{r, eval = FALSE}

## build the time series first: four groups of hashtags, aggregated in
## 2-day intervals (p must exist before any of the plotting calls below)
p <- ts_filter(
  djt, "2 days",
  txt = "hashtags",
  filter = c(
    "makeamericagreatagain|maga",
    "trump",
    "debate",
    "draintheswamp|americafirst"
  ),
  key = c(
    "MakeAmericaGreatAgain",
    "Trump",
    "Debates",
    "DrainTheSwamp/AmericaFirst"
  )
)

## use the built in rtweet function
ts_plot(p, theme = "nerdy")

## you can continue plotting with rtweet functions but
## the current version (0.4.0) prints incorrect labels for
## the x-axis for multi-year plots.
ts_plot(p, theme = "spacegray")

## ggplot2 doesn't have that problem and is more robust and
## flexible anyway

## install and load ggplot2
install.packages("ggplot2")
library(ggplot2)

## uncomment following line and final line to save image
## png("trumptweets.png", 7, 5, "in", res = 127.5)

## p is passed to ggplot() directly, so no pipe operator is needed
ggplot(p, aes(x = time, y = freq, color = filter)) +
  theme_bw() +
  geom_line() +
  facet_wrap(~ filter, ncol = 2) +
  labs(
    x = "", y = "",
    title = "Hashtags used by Donald Trump",
    subtitle = "Used entire archive of @realDonaldTrumpTweets"
  ) +
  theme(
    legend.position = "none",
    text = element_text(
      size = 12,
      family = "Avenir Next Condensed"
    ),
    plot.title = element_text(
      family = "Avenir Next Condensed Medium", size = 20
    )
  )

## dev.off()

## image I created using this code displayed below
## note: Avenir Next Condensed will only work if it is currently
## installed on your machine. If it is not, then either
## delete the family arguments or replace Avenir with the font
## of your choosing.

```

ecosyste.ms

Data

Tools

Indexes

Applications

Experiments

Awesome

https://github.com/mkearney/trumptweets

Awesome Lists containing this project

README