##################################################################
# ADDRESSING https://github.com/AdamSpannbauer/lexRankr/issues/8
##################################################################
# Monduiz's EXAMPLE CODE
##################################################################
library(rvest)
library(tidyverse)
library(stringr)
library(purrr)
library(lexRankr)
gm_headlines <- read_html("https://beta.theglobeandmail.com/politics/")
gm_links <- gm_headlines %>%
  html_nodes(".o-card__link") %>%
  html_attr("href") %>%
  xml2::url_absolute("https://beta.theglobeandmail.com")
# fetch each article page, then pull the body text out of each page
pages <- gm_links %>% map(read_html)
gm_articles <- pages %>%
  map(. %>%
        html_nodes(".c-article-body__text") %>%
        html_text())
gm_titles <- gm_headlines %>%
  html_nodes(".o-card__content-text") %>%
  html_text()
# combine into one data frame; gm_articles is a list column
# (one character vector of body paragraphs per article)
gm <- data_frame(gm_titles, gm_links, gm_articles)
# Remove duplicates and video links
gm <- gm %>%
  distinct(gm_titles, .keep_all = TRUE) %>%
  filter(!str_detect(gm_links, "video")) %>%
  mutate(doc_id = row_number())
### summarization
gm_unnest <- gm %>%
  select(doc_id, gm_articles) %>%
  unnest(gm_articles)
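# at this point gm_unnest has one row per paragraph of article text:
# doc_id (int) | gm_articles (chr), ready to be split and lexranked below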
##################################################################
# MODIFICATION TO GET TOP LEXRANK PER DOC
##################################################################
# function to get top lexranked sentence(s) in a df
get_top_sentences <- function(df_in, text_col = "text", n = 1) {
  # perform piped lexrank process and extract top ranked sentence(s)
  lex_df <- lexRankr::unnest_sentences_(df_in, "sentences", text_col) %>% # parse sentences
    lexRankr::bind_lexrank(sentences, sent_id, level = "sentences") %>%   # perform lexrank
    arrange(desc(lexrank)) %>%                                            # order by rank
    slice(1:n)                                                            # keep top n
  return(lex_df)
}
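# quick sanity check of the helper on a toy one-document data frame
# (hypothetical example data, not from the original gist or the scrape above)
toy_df <- data.frame(
  text = "Dogs run fast. Cats sleep all day. Dogs bark loudly at cats.",
  stringsAsFactors = FALSE
)
get_top_sentences(toy_df, text_col = "text", n = 1)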
# wrap with purrr::possibly so a single failing article yields NULL
# (with a printed message) instead of stopping the whole map below
safe_top_sent <- purrr::possibly(get_top_sentences, otherwise = NULL, quiet = FALSE)
# get top sentence(s) per document:
# split into a list with one df per document
gm_rank_doc_level <- split(gm_unnest, gm_unnest$doc_id) %>%
  # apply lexrank function to extract top n ranked sentences
  map(safe_top_sent, text_col = "gm_articles", n = 2) %>%
  # recombine into single df (NULLs from failed docs are dropped)
  bind_rows()
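# optional: pair each top-ranked sentence back with its headline
# (a sketch, not part of the original gist; relies only on the shared
# doc_id column created above)
gm_summaries <- gm_rank_doc_level %>%
  left_join(select(gm, doc_id, gm_titles), by = "doc_id") %>%
  select(doc_id, gm_titles, sentences, lexrank)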