Last active: December 16, 2019 at 22:31
-
-
Save andrewbtran/7f7fc61066af56fc7b498853447d71e0 to your computer and use it in GitHub Desktop.
NiemanLab Predictions Predictor
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Load libraries | |
library(dplyr) | |
library(rvest) | |
devtools::install_github("abresler/markovifyR") | |
library(markovifyR) | |
# The url for the 2019 predictions
year2019 <- "https://www.niemanlab.org/collection/predictions-2019"

# Parse the collection page once and reuse the parsed document for both
# scrapes below (the original fetched and parsed the same page twice)
predictions_page <- read_html(year2019)

# Scraping the headlines of the 2019 predictions
headlines <- predictions_page %>%
  html_nodes(".rejigger") %>%
  html_text()

# Cleaning up by dropping everything up to and including the first ", "
headlines <- sub(".*?, ", "", headlines)

# Scraping the urls to the individual prediction pages
urls <- predictions_page %>%
  html_nodes("div.predix2019-alllist-itemtext a") %>%
  html_attr("href")

# Start from a zero-length character vector. The original used "" here,
# which injected a spurious empty-string element into the scraped corpus
# (and therefore into the body-text Markov model).
bulk_text <- character(0)
# Loop to scrape the body text from each prediction page.
# Fixes over the original:
#  - seq_along(urls) is safe when `urls` is empty; 1:length(urls) would
#    iterate over c(1, 0) and attempt to fetch urls[0] / urls[1].
#  - results are collected in a preallocated list and flattened once,
#    instead of growing a character vector with c() on every iteration
#    (which copies the whole vector each time — accidental O(n^2)).
scraped_pages <- vector("list", length(urls))
for (i in seq_along(urls)) {
  small_url <- urls[i]
  # pulling the paragraph text from this specific prediction
  scraped_pages[[i]] <- read_html(small_url) %>%
    html_nodes(".predix-storybody > p") %>%
    html_text()
  # printing the scraping progress
  print(paste0(i, " of ", length(urls)))
}
# appending all scraped text to the bulk_text vector in one step
bulk_text <- c(bulk_text, unlist(scraped_pages))
# Both corpora are modelled with identical settings (two-word state,
# the same overlap limits), so fit them through one shared helper.
fit_prediction_model <- function(corpus) {
  generate_markovify_model(
    input_text = corpus,
    markov_state_size = 2L,
    max_overlap_total = 25,
    max_overlap_ratio = .85
  )
}

# Markov model of the headlines
markov_model_headline <- fit_prediction_model(headlines)

# Markov model of the body text
markov_model_body <- fit_prediction_model(bulk_text)
##################################
## RUN CODE BELOW TO REGENERATE ##
## NEW TEXT WITHOUT SCRAPING ##
##################################

# Draw `n_sentences` generated sentences from a fitted Markov model;
# the generated text ends up in the `textLinnemanBot` column.
generate_sentences <- function(model, n_sentences) {
  markovify_text(
    markov_model = model,
    maximum_sentence_length = NULL,
    output_column_name = "textLinnemanBot",
    count = n_sentences,
    tries = 100,
    only_distinct = TRUE,
    return_message = TRUE
  )
}

# One generated headline
headline_text <- generate_sentences(markov_model_headline, 1)

# Fifteen generated body sentences
body_text <- generate_sentences(markov_model_body, 15)
# Assembling the generated text and writing it out as a text file.
# Fix: the original called write_lines(), which lives in readr — a package
# this script never loads, so the call would fail at runtime. Base R's
# writeLines() writes one element per line and needs no extra package.
generated_text <- c(
  "############",
  headline_text$textLinnemanBot,
  "##########\n",
  body_text$textLinnemanBot
)
writeLines(generated_text, "generated_prediction.txt")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment