@AdamSpannbauer
Created November 24, 2017 18:39
##################################################################
# ADDRESSING https://github.com/AdamSpannbauer/lexRankr/issues/8
##################################################################
# GET EXAMPLE DATA
#----------------------------------------------------------
library(xml2)
library(rvest)
options(stringsAsFactors = FALSE)
#two URLs with stories from cnn.com
urls = c("http://money.cnn.com/2017/11/20/technology/google-pixel-buds-review/index.html",
         "http://money.cnn.com/2017/11/23/technology/battlesgrounds-game-tencent-china/index.html")
#css selector to get story text
selector = c("#storytext p , .speakable")
#iterate over url list indices
my_df_list = lapply(seq_along(urls), function(i) {
  #get url i
  u = urls[i]
  #read page
  raw_html = xml2::read_html(u)
  #extract text with selector
  story_text = rvest::html_nodes(raw_html, selector)
  #drop html tags
  text_lines = rvest::html_text(story_text)
  #put in df with id info
  df_out = data.frame(doc_id = i, url = u, text = text_lines)
  return(df_out)
})
#combine into single df
my_df = do.call('rbind', my_df_list)
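As an aside, the `lapply()` + `do.call('rbind', ...)` pattern above can be written in one step with `purrr::map_dfr()`, which iterates and row-binds in a single call (a sketch only; `my_df_alt` is a hypothetical name, and it assumes the `urls` and `selector` objects defined above):

```r
library(purrr)
#scrape each url and row-bind the per-document data frames in one call
#(re-reads the pages; shown for comparison with the lapply approach)
my_df_alt = map_dfr(seq_along(urls), function(i) {
  raw_html = xml2::read_html(urls[i])
  text_lines = rvest::html_text(rvest::html_nodes(raw_html, selector))
  data.frame(doc_id = i, url = urls[i], text = text_lines)
})
```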
#----------------------------------------------------------
# POSSIBLE TIDYVERSE SOLUTION TO ISSUE USING `purrr::map()`
#----------------------------------------------------------
library(dplyr)
library(purrr)
#convert to tibble
my_tbl = as_tibble(my_df)
#function to get top lexranked sentence in a df
get_top_sentences = function(df_in, text_col = "text", n = 1) {
  #perform piped lexrank process and extract top ranked sentence(s)
  lex_df = lexRankr::unnest_sentences_(df_in, "sentences", text_col) %>% #parse sentences
    lexRankr::bind_lexrank(sentences, sent_id, level = "sentences") %>%  #perform lexrank
    arrange(desc(lexrank)) %>% #sort by lexrank score (descending)
    slice(1:n)                 #keep the top n ranked sentence(s)
  return(lex_df)
}
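A quick usage sketch: calling the helper on a single document's rows returns that document's top-ranked sentences (assumes the `my_tbl` object defined above; `doc1_top` is a hypothetical name):

```r
#top 2 sentences from document 1 only
doc1_top = get_top_sentences(dplyr::filter(my_tbl, doc_id == 1), n = 2)
doc1_top$sentences
```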
#get top sentence(s) per document
#split into a list with document dfs as elements
top_sent_df = split(my_tbl, my_tbl$doc_id) %>%
  #apply lexrank function to extract top n ranked sentences
  map(get_top_sentences, n = 1) %>%
  #recombine into single df
  bind_rows()
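The `split()`/`map()`/`bind_rows()` chain above is one way to do the grouped lexranking; a `nest()`-based version is another (a sketch only; `top_sent_df_alt` is a hypothetical name, and tidyr's nesting syntax has changed across versions):

```r
library(tidyr)
#equivalent split-apply-combine using nest()/unnest() instead of split()
top_sent_df_alt = my_tbl %>%
  nest(-doc_id, -url) %>%                             #nest each document's rows
  mutate(data = map(data, get_top_sentences, n = 1)) %>% #lexrank each nested df
  unnest()                                            #flatten back to one df
```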
#----------------------------------------------------------
# OUTPUT
#----------------------------------------------------------
top_sent_df$sentences
# [1] " But when Google (GOOG) announced its new Pixel Buds in October,
# touting the ability to translate a conversation between different
# languages in near real time, it promised something unique."
# [2] " Chinese tech giant Tencent (TCEHY) has announced plans to
# distribute PlayerUnknown's \"Battlegrounds\" in its home market
# after modifying the violent game to comply with \"socialist core
# values.\" "
top_sent_df$lexrank
# [1] 0.06505914 0.07053404
#----------------------------------------------------------