### installing and loading packages ###

install.packages("rvest")
library(rvest)
library(tidyverse)
### parsing HTML ###

# read HTML from a link
main <- "https://nplus1.ru"
page <- read_html(main)

# find parts with tag <a> using html_nodes()
# and get the links stored in the href attributes using html_attr()
# to compare: html_nodes() returns all matching nodes, while
# html_node() returns only the first match (see the small demo below)
hrefs <- page %>% html_nodes("a") %>% html_attr("href")
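# a minimal check of the difference, assuming the page still has at least one <a> tag:
first_href <- page %>% html_node("a") %>% html_attr("href")   # single value
all_hrefs <- page %>% html_nodes("a") %>% html_attr("href")   # character vector
identical(first_href, all_hrefs[1])                           # should be TRUE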
# keep only those links that contain /news/,
# since we want to work with news only (no blogs/rubrics),
# then get absolute links by prepending the base URL
# https://nplus1.ru (toy illustration below)
links <- hrefs[str_detect(hrefs, "/news/")]
full_links <- paste(main, links, sep = "")
link1 <- full_links[1]
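# toy example of the filtering and prepending steps (made-up paths, not real nplus1 links):
str_detect(c("/news/2021/06/10/foo", "/blog/bar"), "/news/")  # TRUE FALSE
paste(main, "/news/2021/06/10/foo", sep = "")                 # absolute link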
### MAIN FUNCTION ###

# some comments on the function below
# it takes a link to a news item and returns a list of info
# tables: we choose parts with tag <p> and class "table" (<p class="table"></p>)
# rubrics: we take the 1st table, find links to rubrics and extract the text inside the links
# rubrics_un: we merge all rubrics into one string separated by commas
# ntime: we take the 2nd table, find <span> and take the text inside
# ndate: the same, but the date is stored in the "content" attribute of <time>
# ndate_unix: the same, but the date is in UNIX (POSIX) format
# diffc: take the 3rd table, find class "difficult-value" (. stands for class here),
# get the text and convert it to a number
# author: find nodes with tag <meta> and name "mediator_author", extract the value of the "content" attribute
# parts: paragraphs of text are stored in <p> tags with no attributes,
# so we negate any attribute (@*) in XPath (* stands for any value)
# then we merge the paragraphs into one string and crop the extra footer text
get_info <- function(link1){
  page1 <- read_html(link1)
  tables <- page1 %>% html_nodes(xpath = "//p[@class='table']")
  rubrics <- tables[1] %>% html_nodes("a") %>% html_text()
  rubrics_un <- paste(rubrics, collapse = " , ")
  ntime <- tables[2] %>% html_node("span") %>% html_text()
  ndate <- tables[2] %>% html_node("time") %>% html_attr("content")
  ndate_unix <- tables[2] %>% html_node("time") %>% html_attr("data-unix")
  diffc <- tables[3] %>% html_node(".difficult-value") %>% html_text() %>% as.numeric()
  author <- page1 %>% html_nodes(xpath = "//meta[@name='mediator_author']") %>%
    html_attr("content")
  parts <- page1 %>% html_nodes(xpath = "//p[not(@*)]") %>% html_text()
  text <- paste(parts, collapse = " ")
  # "Нашли опечатку?" ("Found a typo?") marks the footer on every page;
  # fixed() treats it as a literal string, not a regular expression
  # (the "?" would otherwise be a regex quantifier)
  text_final <- str_split(text, fixed("Нашли опечатку?"))[[1]][1]
  text_final <- gsub("\n", " ", text_final)
  L <- list(rubrics = rubrics_un,
            ntime = ntime,
            ndate = ndate,
            ndate_unix = ndate_unix,
            difficulty = diffc,
            author = author,
            text = text_final)
  return(L)
}
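# toy illustration of the crop step above (made-up strings, not real page text):
toy <- paste(c("First paragraph.", "Second paragraph.",
               "Нашли опечатку? Footer junk."), collapse = " ")
str_split(toy, fixed("Нашли опечатку?"))[[1]][1]
# [1] "First paragraph. Second paragraph. "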
get_info(full_links[5])
### LAPPLY() and SAPPLY() ###

# functions that help to avoid for-loops:
# they apply a function to every element of a list and return a new list/vector
# lapply() returns a list
# sapply() returns a vector (see the sketch after this block)
# below we: apply the function to every link,
# add a value to an empty element (the 9th item apparently has no author,
# and as.data.frame() cannot handle a zero-length field),
# and transform the lists into data frames
# Ddat is a list of data frames (one data frame for each piece of news)
Ldat <- lapply(full_links, get_info)
Ldat[[9]]$author <- ""
Ddat <- lapply(Ldat, as.data.frame)
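# a quick sketch of sapply() for comparison (the field name comes from get_info() above):
# extracting one field from every list element gives a plain character vector
authors <- sapply(Ldat, function(x) x$author)
head(authors)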
# create an empty data frame for all news
dat <- data.frame(rubrics = as.character(),
                  ntime = as.character(),
                  ndate = as.character(),
                  ndate_unix = as.character(),
                  difficulty = as.character(),
                  author = as.character(),
                  text = as.character())

# and fill it row by row
for (i in Ddat){
  dat <- rbind(dat, i)
}
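# a common alternative to growing dat inside the loop
# (same result, assuming all data frames share the same columns):
dat2 <- do.call(rbind, Ddat)          # base R
dat3 <- dplyr::bind_rows(Ddat)        # tidyverse (dplyr is loaded via library(tidyverse))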