Skip to content

Instantly share code, notes, and snippets.

@allatambov
Created May 25, 2021 14:31
Show Gist options
  • Save allatambov/9eb41ec03dcfd895438352bc34187e36 to your computer and use it in GitHub Desktop.
Save allatambov/9eb41ec03dcfd895438352bc34187e36 to your computer and use it in GitHub Desktop.
#' Scrape one news article page into a named list
#'
#' Parses the page with `read_html()` and pulls out the title, body text,
#' author, publication time and difficulty score using fixed CSS/XPath
#' selectors. The selectors are tied to one specific site layout
#' (NOTE(review): which site is not visible here - confirm before reuse).
#'
#' @param link1 Passed straight to `read_html()`; typically the article URL.
#'
#' @return A named list with elements:
#'   * `title` - `content` of `<meta name="twitter:title">`.
#'   * `text` - text of every attribute-less `<p>`, joined with spaces,
#'     truncated at the first literal "Нашли опечатку?" and with newlines
#'     replaced by spaces.
#'   * `author` - `content` of `<meta name="mediator_author">`.
#'   * `date` - `content` attribute of the `<time>` element(s) found in the
#'     second `p.table` block.
#'   * `date_unix` - `data-unix` attribute of the same element(s).
#'   * `time` - text of the first `<span>` inside each of those elements.
#'   * `dicciculty` - numeric value of `.difficult-value` in the third
#'     `p.table` block. The misspelled name is kept on purpose so that
#'     existing callers reading `$dicciculty` keep working.
get_news <- function(link1) {
  page <- read_html(link1)

  # The metadata lives in consecutive <p class="table"> blocks; they are
  # addressed by position, so the page is assumed to contain at least three
  # of them (TODO confirm behaviour on pages with fewer).
  tables <- page %>% html_nodes(xpath = "//p[@class='table']")

  time_nodes <- tables[2] %>% html_nodes("time")
  published <- time_nodes %>% html_attr("content")
  published_unix <- time_nodes %>% html_attr("data-unix")
  published_human <- time_nodes %>%
    html_node("span") %>%
    html_text()

  difficulty <- tables[3] %>%
    html_node(".difficult-value") %>%
    html_text() %>%
    as.numeric()

  # Body paragraphs are the <p> elements that carry no attributes at all.
  paragraphs <- page %>%
    html_nodes(xpath = "//p[not(@*)]") %>%
    html_text()
  full_text <- paste(paragraphs, collapse = " ")

  # Keep only what precedes the "Нашли опечатку?" marker. The marker is
  # matched literally via fixed(): as a regex the trailing "?" would be a
  # quantifier making the last letter optional, not a question mark.
  body_text <- str_split(full_text, fixed("Нашли опечатку?"))[[1]][1]
  body_text <- gsub("\\n", " ", body_text)

  author <- page %>%
    html_nodes(xpath = "//meta[@name='mediator_author']") %>%
    html_attr("content")
  title <- page %>%
    html_nodes(xpath = "//meta[@name='twitter:title']") %>%
    html_attr("content")

  list(
    title = title,
    text = body_text,
    author = author,
    date = published,
    date_unix = published_unix,
    time = published_human,
    dicciculty = difficulty
  )
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment