Last active: December 16, 2019 at 22:31
-
-
Save andrewbtran/7f7fc61066af56fc7b498853447d71e0 to your computer and use it in GitHub Desktop.
NiemanLab Predictions Predictor
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load libraries | |
library(dplyr) | |
library(rvest) | |
devtools::install_github("abresler/markovifyR") | |
library(markovifyR) | |
# The url for the 2019 predictions
year2019 <- "https://www.niemanlab.org/collection/predictions-2019"

# Parse the collection page once and reuse the parsed document for both
# scrapes below (the original fetched and parsed the same page twice)
predictions_page <- read_html(year2019)

# Scraping the headlines of the 2019 predictions
headlines <- predictions_page %>%
  html_nodes(".rejigger") %>%
  html_text()

# Cleaning up by dropping everything up to and including the first ", "
headlines <- sub(".*?, ", "", headlines)

# Scraping the urls to the individual prediction pages
urls <- predictions_page %>%
  html_nodes("div.predix2019-alllist-itemtext a") %>%
  html_attr("href")

# Start from a zero-length character vector. The original used "" here,
# which injected a spurious empty-string element into the scraped corpus
# (and therefore into the body-text Markov model).
bulk_text <- character(0)
# Loop to scrape the body text from each prediction page.
# Fixes over the original:
#  - seq_along(urls) is safe when `urls` is empty; 1:length(urls) would
#    iterate over c(1, 0) and attempt to fetch urls[0] / urls[1].
#  - results are collected in a preallocated list and flattened once,
#    instead of growing a character vector with c() on every iteration
#    (which copies the whole vector each time — accidental O(n^2)).
scraped_pages <- vector("list", length(urls))
for (i in seq_along(urls)) {
  small_url <- urls[i]
  # pulling the paragraph text from this specific prediction
  scraped_pages[[i]] <- read_html(small_url) %>%
    html_nodes(".predix-storybody > p") %>%
    html_text()
  # printing the scraping progress
  print(paste0(i, " of ", length(urls)))
}
# appending all scraped text to the bulk_text vector in one step
bulk_text <- c(bulk_text, unlist(scraped_pages))
# Both corpora are modelled with identical settings (two-word state,
# the same overlap limits), so fit them through one shared helper.
fit_prediction_model <- function(corpus) {
  generate_markovify_model(
    input_text = corpus,
    markov_state_size = 2L,
    max_overlap_total = 25,
    max_overlap_ratio = .85
  )
}

# Markov model of the headlines
markov_model_headline <- fit_prediction_model(headlines)

# Markov model of the body text
markov_model_body <- fit_prediction_model(bulk_text)
##################################
## RUN CODE BELOW TO REGENERATE ##
## NEW TEXT WITHOUT SCRAPING ##
##################################

# Draw `n_sentences` generated sentences from a fitted Markov model;
# the generated text ends up in the `textLinnemanBot` column.
generate_sentences <- function(model, n_sentences) {
  markovify_text(
    markov_model = model,
    maximum_sentence_length = NULL,
    output_column_name = "textLinnemanBot",
    count = n_sentences,
    tries = 100,
    only_distinct = TRUE,
    return_message = TRUE
  )
}

# One generated headline
headline_text <- generate_sentences(markov_model_headline, 1)

# Fifteen generated body sentences
body_text <- generate_sentences(markov_model_body, 15)
# Assembling the generated text and writing it out as a text file.
# Fix: the original called write_lines(), which lives in readr — a package
# this script never loads, so the call would fail at runtime. Base R's
# writeLines() writes one element per line and needs no extra package.
generated_text <- c(
  "############",
  headline_text$textLinnemanBot,
  "##########\n",
  body_text$textLinnemanBot
)
writeLines(generated_text, "generated_prediction.txt")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment