Skip to content

Instantly share code, notes, and snippets.

@riceissa
Created February 26, 2020 02:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save riceissa/80b62818d910e71e177b8d2c84a648de to your computer and use it in GitHub Desktop.
Save riceissa/80b62818d910e71e177b8d2c84a648de to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import datetime
import re
from xml.sax.saxutils import escape

import dateutil.parser
import requests
from bs4 import BeautifulSoup
# Sitemap whose entries are turned into RSS items; this value is passed to
# find_all_urls() by the script body below.
# Alternative sitemap URLs are kept commented out -- uncomment exactly one
# assignment to switch sites.
# url = "http://reflectivedisequilibrium.blogspot.com/sitemap.xml"
# url = "https://terrytao.wordpress.com/sitemap.xml"
# url = "https://deluks917.wordpress.com/sitemap.xml"
# url = "https://nostalgebraist.tumblr.com/sitemap.xml"
# url = "https://grognor.tumblr.com/sitemap.xml"
# url = "https://www.gwern.net/sitemap.xml"
url = "http://www.overcomingbias.com/sitemap.xml"
# url = "https://blog.jessriedel.com/sitemap.xml"
def find_all_urls(feed_url, timeout=30):
    """Return ``(link, date)`` pairs for every page listed in a sitemap.

    The document at *feed_url* is fetched and parsed as XML.  Each ``<url>``
    element contributes one pair; each ``<sitemap>`` element (as found in a
    sitemap index) is followed recursively and its pairs are appended.

    The date for a link is chosen in this order:

    1. a date embedded in the link's path (see ``parse_url_date``),
    2. the entry's ``<lastmod>`` value, if present and parseable,
    3. ``None``.

    Args:
        feed_url: URL of a sitemap or sitemap-index document.
        timeout: Seconds to wait for the HTTP response before giving up.
            Passed straight to ``requests.get``.

    Returns:
        A list of ``(link, datetime_or_None)`` tuples.

    Raises:
        requests.exceptions.RequestException: If the HTTP request fails or
            times out.
    """
    # Some hosts reject the default python-requests agent, so present a
    # browser-like one.
    headers = {'User-Agent': 'Mozilla/5.0 (Ubuntu; Linux x86_64; rv:62.0) Gecko/20100101 Firefox/62.0'}
    r = requests.get(feed_url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(r.content, "lxml-xml")

    result = []
    for entry in soup.find_all('url'):
        if entry.loc is None:
            # Malformed entry without a location: nothing to link to.
            continue
        # Pretty-printed sitemaps put whitespace/newlines around the text.
        link = entry.loc.text.strip()
        date = parse_url_date(link)
        if date is None and entry.lastmod is not None:
            try:
                date = dateutil.parser.parse(entry.lastmod.text.strip())
            except (ValueError, OverflowError):
                # An unparseable <lastmod> should not abort the whole run;
                # treat it like a missing one.
                date = None
        result.append((link, date))

    # A sitemap index lists further sitemaps; collect their entries too.
    for sitemap in soup.find_all('sitemap'):
        if sitemap.loc is None:
            continue
        result.extend(find_all_urls(sitemap.loc.text.strip(), timeout=timeout))
    return result
def parse_url_date(url):
    """Extract a date embedded in a URL path, if any.

    Recognises ``/YYYY/MM/DD/`` and, failing that, ``/YYYY/MM/`` path
    segments (both must be followed by a slash).  When several candidates
    exist, the last one in the URL is used.  A year/month-only match is
    pinned to the first day of that month so the result does not depend on
    the day the script is run.

    Args:
        url: The URL string to inspect.

    Returns:
        A naive ``datetime.datetime`` at midnight, or ``None`` when the URL
        contains no such segment or the digits do not form a valid date.
    """
    full = re.match(r'.*/(\d\d\d\d)/(\d\d)/(\d\d)/', url)
    if full:
        try:
            return datetime.datetime(*(int(part) for part in full.groups()))
        except ValueError:
            # Digits matched but are not a real calendar date (e.g. day 31
            # in February); fall through to the year/month form.
            pass

    partial = re.match(r'.*/(\d\d\d\d)/(\d\d)/', url)
    if partial:
        try:
            return datetime.datetime(int(partial.group(1)), int(partial.group(2)), 1)
        except ValueError:
            # e.g. month 13 or year 0000: not a date after all.
            pass
    return None
# Emit an RSS 2.0 document on stdout with one <item> per sitemap entry.
print("""<?xml version="1.0" encoding="UTF-8" ?>
<rss version="2.0">
<channel>""")
for u, d in find_all_urls(url):
    # Derive a human-readable title from the last non-empty path segment,
    # turning slug separators into spaces.  rstrip/rsplit (rather than
    # indexing a split list) also copes with an empty link.
    slug = u.rstrip('/').rsplit('/', 1)[-1]
    title = slug.replace("-", " ").replace("_", " ")
    print(" <item>")
    # Escape &, < and > so URLs with query strings still yield valid XML.
    print(" <title>{}</title>".format(escape(title)))
    print(" <link>{}</link>".format(escape(u)))
    if d:
        if d.tzinfo is None:
            # Dates taken from the URL path carry no zone; %z would render
            # as an empty string and produce an invalid RSS date.  Treat
            # them as UTC.
            d = d.replace(tzinfo=datetime.timezone.utc)
        print(" <pubDate>{}</pubDate>".format(d.strftime("%a, %d %b %Y %H:%M:%S %z")))
    print(" </item>")
print("""</channel>
</rss>""")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment