Skip to content

Instantly share code, notes, and snippets.

@allatambov
Last active June 11, 2021 20:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save allatambov/267fb7a50b039656ba0ab5af7c329318 to your computer and use it in GitHub Desktop.
### installing and loading packages ###
# Install rvest only if it is not already present -- unconditional
# install.packages() re-downloads the package on every run of the script.
if (!requireNamespace("rvest", quietly = TRUE)) {
  install.packages("rvest")
}
library(rvest)
library(tidyverse)
### parsing HTML ###
# read HTML by link
main <- "https://nplus1.ru"
page <- read_html(main)
# find parts with tag <a> using html_nodes()
# get links stored in attributes href using html_attr()
# to compare: html_nodes() returns all matches while
# html_node() returns only the first occurrence
hrefs <- page %>% html_nodes("a") %>% html_attr("href")
# keep only those links that contain /news/
# since we want to work with news only (no blogs/rubrics),
# then build absolute links by prefixing with the site root
# https://nplus1.ru
links <- hrefs[str_detect(hrefs, "/news/")]
# paste0() is the idiomatic shorthand for paste(..., sep = "")
full_links <- paste0(main, links)
link1 <- full_links[1]
### MAIN FUNCTION ###
# some comments to the function below
# it takes a link to the news and returns a list of info
# tables: we choose parts with tags <p> and class table (<p class="table"></p>)
# rubrics: we take the 1st table, find links to rubrics and extract texts inside links
# rubrics_un: we merge all rubrics into one string using a comma
# ntime: we take the 2nd table, find <span> and take text from inside
# ndate: the same, but date is inside attribute "content" in <time>
# ndate_unix: the same, but date in UNIX (POSIX) format
# diffc: take 3rd table, find class "difficult-value" (. stands for class here), get text
# and transform to the number
# author: find nodes with tag <meta> and name "mediator_author", extract value from attribute "content"
# parts: paragraph of texts are stored in <p> with no attributes
# so, we negate any attributes (@*) in xpath (* – any value)
# then we merge paragraphs into one string and crop extra text
get_info <- function(link1){
  # Scrape one nplus1.ru news page and return its metadata and text.
  #
  # Args:
  #   link1: absolute URL of a single news article.
  #
  # Returns:
  #   A named list with elements: rubrics (comma-separated string),
  #   ntime, ndate, ndate_unix, difficulty (numeric), author, text.
  page1 <- read_html(link1)
  # the info blocks of the page are <p class="table"> elements
  tables <- page1 %>% html_nodes(xpath = "//p[@class='table']")
  # 1st table: rubric links; collapse their texts into one string
  rubrics <- tables[1] %>% html_nodes("a") %>% html_text()
  rubrics_un <- paste(rubrics, collapse = " , ")
  # 2nd table: publication time and date (date also in UNIX format,
  # stored in the "content" / "data-unix" attributes of <time>)
  ntime <- tables[2] %>% html_node("span") %>% html_text()
  ndate <- tables[2] %>% html_node("time") %>% html_attr("content")
  ndate_unix <- tables[2] %>% html_node("time") %>% html_attr("data-unix")
  # 3rd table: difficulty value, converted to a number
  diffc <- tables[3] %>% html_node(".difficult-value") %>% html_text() %>% as.numeric()
  author <- page1 %>% html_nodes(xpath = "//meta[@name='mediator_author']") %>%
    html_attr("content")
  # some pages have no author meta tag; character(0) would later break
  # as.data.frame(), so fall back to an empty string
  if (length(author) == 0) {
    author <- ""
  }
  # article paragraphs are <p> tags with no attributes at all
  parts <- page1 %>% html_nodes(xpath = "//p[not(@*)]") %>% html_text()
  text <- paste(parts, collapse = " ")
  # crop the trailing "found a typo?" boilerplate; fixed() makes the
  # pattern literal -- as a regex, the "?" would mean "optional у"
  text_final <- str_split(text, fixed("Нашли опечатку?"))[[1]][1]
  # replace literal newlines with spaces (fixed = TRUE: no regex needed)
  text_final <- gsub("\n", " ", text_final, fixed = TRUE)
  L <- list(rubrics = rubrics_un,
            ntime = ntime,
            ndate = ndate,
            ndate_unix = ndate_unix,
            difficulty = diffc,
            author = author,
            text = text_final)
  return(L)
}
get_info(full_links[5])
### LAPPLY() and SAPPLY() ###
# functions to avoid for-loops
# apply a function to every element of a list and get a new list/vector
# lapply() - list
# sapply() - vector
# Ldat is a list of info lists (one per news item);
# Ddat is a list of data frames (one data frame for each piece of news)
Ldat <- lapply(full_links, get_info)
# replace any missing author (character(0)) with an empty string so
# as.data.frame() below does not fail -- more robust than patching a
# hard-coded element index, which breaks as soon as the front page changes
Ldat <- lapply(Ldat, function(info) {
  if (length(info$author) == 0) {
    info$author <- ""
  }
  info
})
Ddat <- lapply(Ldat, as.data.frame)
# combine the per-news data frames into one.
# Growing a data frame with rbind() inside a for-loop copies the whole
# accumulated frame on every iteration (O(n^2)); bind_rows() from dplyr
# (already loaded via tidyverse) stacks the list in one pass and returns
# an empty data frame when the list is empty.
dat <- bind_rows(Ddat)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment