Skip to content

Instantly share code, notes, and snippets.

@andrewbtran
Last active December 16, 2019 22:31
Show Gist options
  • Save andrewbtran/7f7fc61066af56fc7b498853447d71e0 to your computer and use it in GitHub Desktop.
Save andrewbtran/7f7fc61066af56fc7b498853447d71e0 to your computer and use it in GitHub Desktop.
NiemanLab Predictions Predictor
# Load libraries
library(dplyr)
library(rvest)
# markovifyR is installed from GitHub; only install it when it is not
# already available so that re-running the script does not trigger a
# fresh download and reinstall every time
if (!requireNamespace("markovifyR", quietly = TRUE)) {
  devtools::install_github("abresler/markovifyR")
}
library(markovifyR)
# The url for the 2019 predictions
year2019 <- "https://www.niemanlab.org/collection/predictions-2019"
# Download and parse the index page once; both the headlines and the
# links to the individual predictions are extracted from this document
index_page <- read_html(year2019)
# Scraping the headlines of 2019 predictions
headlines <- index_page %>%
  html_nodes(".rejigger") %>%
  html_text()
# Cleaning up: drop everything up to and including the first ", "
# (the match is non-greedy, so only the leading part is removed)
headlines <- sub(".*?, ", "", headlines)
# Scraping the urls to specific predictions
urls <- index_page %>%
  html_nodes("div.predix2019-alllist-itemtext a") %>%
  html_attr("href")
# Scrape the body paragraphs of every prediction page.
# One list slot per url is preallocated so the result is not re-copied
# on every iteration, and seq_along() makes the loop a no-op when no
# urls were found (1:length(urls) would iterate over 1 and 0).
page_paragraphs <- vector("list", length(urls))
for (i in seq_along(urls)) {
  # pulling the paragraph text from the specific link
  page_paragraphs[[i]] <- read_html(urls[i]) %>%
    html_nodes(".predix-storybody > p") %>%
    html_text()
  # printing the status
  print(paste0(i, " of ", length(urls)))
}
# Flatten into a single character vector. The leading empty string is
# kept so bulk_text has the same contents as when it was seeded with ""
# and appended to page by page.
bulk_text <- c("", unlist(page_paragraphs, use.names = FALSE))
# Build the markovify model for the headlines.
# Both models below are created with identical settings: a state size
# of 2 and the same overlap limits (total 25, ratio 0.85).
markov_model_headline <- generate_markovify_model(
  input_text = headlines,
  markov_state_size = 2L,
  max_overlap_total = 25,
  max_overlap_ratio = 0.85
)
# Build the markovify model for the body paragraphs
markov_model_body <- generate_markovify_model(
  input_text = bulk_text,
  markov_state_size = 2L,
  max_overlap_total = 25,
  max_overlap_ratio = 0.85
)
##################################
## RUN CODE BELOW TO REGENERATE ##
## NEW TEXT WITHOUT SCRAPING ##
##################################
# Generate a single headline from the headline model.
# The generated text is returned in the column "textLinnemanBot".
headline_text <- markovify_text(
  markov_model = markov_model_headline,
  maximum_sentence_length = NULL,
  output_column_name = "textLinnemanBot",
  count = 1,
  tries = 100,
  only_distinct = TRUE,
  return_message = TRUE
)
# Generate the body from the body model: same settings as the headline,
# but requesting 15 results instead of 1
body_text <- markovify_text(
  markov_model = markov_model_body,
  maximum_sentence_length = NULL,
  output_column_name = "textLinnemanBot",
  count = 15,
  tries = 100,
  only_distinct = TRUE,
  return_message = TRUE
)
# generating a text file: the headline framed by "#" rulers, followed by
# the generated body text, one vector element per line
generated_text <- c(
  "############",
  headline_text$textLinnemanBot,
  "##########\n",
  body_text$textLinnemanBot
)
# base writeLines() is used here because write_lines() belongs to readr,
# which this script never attaches; writeLines() likewise writes each
# element followed by a newline
writeLines(generated_text, "generated_prediction.txt")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment