Skip to content

Instantly share code, notes, and snippets.

@allatambov
Last active June 11, 2021 20:39
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save allatambov/267fb7a50b039656ba0ab5af7c329318 to your computer and use it in GitHub Desktop.
### installing and loading packages ###
# Install rvest only if it is not already present -- unconditional
# install.packages() re-downloads the package on every run of the script.
if (!requireNamespace("rvest", quietly = TRUE)) {
  install.packages("rvest")
}
library(rvest)
library(tidyverse)
### parsing HTML ###
# read HTML by link
main <- "https://nplus1.ru"
page <- read_html(main)
# find parts with tag <a> using html_nodes()
# get links stored in attributes href using html_attr()
# to compare: html_nodes() returns all matches while
# html_node() returns only the first occurrence
hrefs <- page %>% html_nodes("a") %>% html_attr("href")
# keep only those links that contain /news/
# since we want to work with news only (no blogs/rubrics),
# then build absolute links by prefixing with the site root
# https://nplus1.ru
links <- hrefs[str_detect(hrefs, "/news/")]
# paste0() is the idiomatic shorthand for paste(..., sep = "")
full_links <- paste0(main, links)
link1 <- full_links[1]
### MAIN FUNCTION ###
# some comments to the function below
# it takes a link to the news and returns a list of info
# tables: we choose parts with tags <p> and class table (<p class="table"></p>)
# rubrics: we take the 1st table, find links to rubrics and extract texts inside links
# rubrics_un: we merge all rubrics into one string using a comma
# ntime: we take the 2nd table, find <span> and take text from inside
# ndate: the same, but date is inside attribute "content" in <time>
# ndate_unix: the same, but date in UNIX (POSIX) format
# diffc: take 3rd table, find class "difficult-value" (. stands for class here), get text
# and transform to the number
# author: find nodes with tag <meta> and name "mediator_author", extract value from attribute "content"
# parts: paragraph of texts are stored in <p> with no attributes
# so, we negate any attributes (@*) in xpath (* – any value)
# then we merge paragraphs into one string and crop extra text
get_info <- function(link1){
  # Scrape one nplus1.ru news page and return its metadata and text.
  #
  # Args:
  #   link1: absolute URL of a single news article.
  #
  # Returns:
  #   A named list with elements: rubrics (comma-separated string),
  #   ntime, ndate, ndate_unix, difficulty (numeric), author, text.
  page1 <- read_html(link1)
  # the info blocks of the page are <p class="table"> elements
  tables <- page1 %>% html_nodes(xpath = "//p[@class='table']")
  # 1st table: rubric links; collapse their texts into one string
  rubrics <- tables[1] %>% html_nodes("a") %>% html_text()
  rubrics_un <- paste(rubrics, collapse = " , ")
  # 2nd table: publication time and date (date also in UNIX format,
  # stored in the "content" / "data-unix" attributes of <time>)
  ntime <- tables[2] %>% html_node("span") %>% html_text()
  ndate <- tables[2] %>% html_node("time") %>% html_attr("content")
  ndate_unix <- tables[2] %>% html_node("time") %>% html_attr("data-unix")
  # 3rd table: difficulty value, converted to a number
  diffc <- tables[3] %>% html_node(".difficult-value") %>% html_text() %>% as.numeric()
  author <- page1 %>% html_nodes(xpath = "//meta[@name='mediator_author']") %>%
    html_attr("content")
  # some pages have no author meta tag; character(0) would later break
  # as.data.frame(), so fall back to an empty string
  if (length(author) == 0) {
    author <- ""
  }
  # article paragraphs are <p> tags with no attributes at all
  parts <- page1 %>% html_nodes(xpath = "//p[not(@*)]") %>% html_text()
  text <- paste(parts, collapse = " ")
  # crop the trailing "found a typo?" boilerplate; fixed() makes the
  # pattern literal -- as a regex, the "?" would mean "optional у"
  text_final <- str_split(text, fixed("Нашли опечатку?"))[[1]][1]
  # replace literal newlines with spaces (fixed = TRUE: no regex needed)
  text_final <- gsub("\n", " ", text_final, fixed = TRUE)
  L <- list(rubrics = rubrics_un,
            ntime = ntime,
            ndate = ndate,
            ndate_unix = ndate_unix,
            difficulty = diffc,
            author = author,
            text = text_final)
  return(L)
}
get_info(full_links[5])
### LAPPLY() and SAPPLY() ###
# functions to avoid for-loops
# apply a function to every element of a list and get a new list/vector
# lapply() - list
# sapply() - vector
# Ldat is a list of info lists (one per news item);
# Ddat is a list of data frames (one data frame for each piece of news)
Ldat <- lapply(full_links, get_info)
# replace any missing author (character(0)) with an empty string so
# as.data.frame() below does not fail -- more robust than patching a
# hard-coded element index, which breaks as soon as the front page changes
Ldat <- lapply(Ldat, function(info) {
  if (length(info$author) == 0) {
    info$author <- ""
  }
  info
})
Ddat <- lapply(Ldat, as.data.frame)
# combine the per-news data frames into one.
# Growing a data frame with rbind() inside a for-loop copies the whole
# accumulated frame on every iteration (O(n^2)); bind_rows() from dplyr
# (already loaded via tidyverse) stacks the list in one pass and returns
# an empty data frame when the list is empty.
dat <- bind_rows(Ddat)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment