Skip to content

Instantly share code, notes, and snippets.

@briatte
Created August 15, 2017 13:18
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save briatte/dc1ee2b18ea34ec0c1cd9c3dd3691025 to your computer and use it in GitHub Desktop.
Save briatte/dc1ee2b18ea34ec0c1cd9c3dd3691025 to your computer and use it in GitHub Desktop.
Find dead feeds in an OPML export (supports Atom and RSS 2.0; parsing may occasionally fail).
library(dplyr)
library(httr)
library(readr)
library(rvest)
# Parse the OPML export and build one row per feed subscription.
# Each <outline> element carries the feed title ("text" attribute) and the
# feed URL ("xmlUrl" attribute); folder/category outlines have no xmlUrl
# and are dropped by the filter below.
d <- read_xml("~/Downloads/digg_reader_subscriptions.xml") %>%
  xml_nodes("outline")

# data_frame() is deprecated (defunct in current tibble); use tibble(),
# which dplyr re-exports, so no new package is needed.
d <- tibble(
  title = xml_attr(d, "text"),
  url = xml_attr(d, "xmlUrl")
) %>%
  filter(!is.na(url)) %>%
  mutate(last = NA_character_) # last publication date, filled in by the loop
# Visit each feed URL and record its most recent publication date in d$last.
# Iterates over the URLs in reverse, preserving the original run order.
# NOTE(review): the printed index via which() assumes feed URLs are unique;
# duplicate URLs would print several indices at once.
for (i in rev(d$url)) {
  cat(which(d$url == i))
  f <- try(read_html(i), silent = TRUE)
  # inherits() is the idiomatic class test (vs. %in% class(f))
  if (inherits(f, "try-error")) {
    # unreachable host, timeout, or unparseable response
    cat(": error\n")
  } else {
    # Atom feeds: first <entry><published> element
    j <- html_node(f, "entry published") %>%
      html_text()
    if (is.na(j)) {
      # RSS 2.0 feeds: first <item><pubDate> element
      # (read_html() lower-cases tag names, hence the "pubdate" selector)
      j <- html_node(f, "item pubdate") %>%
        html_text()
    }
    d$last[ d$url == i ] <- j
    cat(":", parse_url(i)$hostname, "last updated", j, "\n")
  }
}
# Summarise how many feeds yielded no publication date at all.
print(table(is.na(d$last)))

# Interactively inspect feeds whose last post does not mention 2016/2017 —
# these are the likely-dead subscriptions.
stale <- filter(d, !is.na(last), !grepl("201[67]", last))
View(stale)

# Persist the full results, including the raw date strings.
write_csv(d, "opml_dates.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment