Skip to content

Instantly share code, notes, and snippets.

@cato-
Last active December 15, 2015 04:39
Show Gist options
  • Save cato-/5203636 to your computer and use it in GitHub Desktop.
Save cato-/5203636 to your computer and use it in GitHub Desktop.
Fetches the news feed from heise.de and rewrites each link to its short URL.
import requests
import re
from lxml import etree
from StringIO import StringIO
# Download the heise.de newsticker feed and parse it into an lxml tree.
r = requests.get("http://www.heise.de/newsticker/heise.rdf")
# Fail loudly on an HTTP error status instead of handing an error page to the
# XML parser, where it would only surface as a misleading syntax error.
r.raise_for_status()
# The parser is pinned to UTF-8, so the decoded response text is re-encoded
# as UTF-8 bytes before parsing to keep both sides in agreement.
parser = etree.XMLParser(encoding="utf8")
tree = etree.parse(StringIO(r.text.encode("utf8")), parser)
def get_plain_direct(link):
    """Decode the ``0<LETTER>`` escape sequences contained in *link*.

    Every ``0`` followed by an uppercase ASCII letter is looked up in a
    fixed table (``0A`` -> ``0``, ``0B`` -> ``.``, ``0C`` -> ``/``,
    ``0E`` -> ``-``, ``0L`` -> ``http://``, ``0S`` -> ``www.``) and replaced
    by the mapped text.  A pair whose letter is not in the table is left
    untouched rather than raising ``KeyError``, so one unexpected escape
    does not abort processing.

    :param link: the encoded link string.
    :returns: the link with all known escape sequences decoded.
    """
    d = {'A': '0', 'B': '.', 'C': '/', 'E': '-', 'L': 'http://', 'S': 'www.'}

    def decode(match):
        # Fall back to the matched text itself for unknown escape letters.
        return d.get(match.group(1), match.group(0))

    return re.sub('0([A-Z])', decode, link)
def get_plain_head(link):
    """Return the final URL reached when *link* is requested with HTTP HEAD.

    ``requests.head`` does not follow redirects unless asked to, in which
    case the response URL is simply the URL that was requested.  Redirects
    are therefore enabled explicitly so the returned URL is the redirect
    target.

    :param link: the URL to resolve.
    :returns: the URL of the final response.
    """
    response = requests.head(link, allow_redirects=True)
    return response.url
def convertlink(link):
    """Return the ``http://heise.de/-<id>`` short form of *link* if possible.

    The link is first decoded with ``get_plain_direct``.  If the decoded URL
    contains ``-<digits>.html``, the digits are used as the id of the short
    URL.  Otherwise the original, undecoded *link* is returned unchanged.

    :param link: the (possibly encoded) link string.
    :returns: the short URL, or *link* itself when no id was found.
    """
    plink = get_plain_direct(link)
    # The dot is escaped so only a literal ".html" suffix counts; unescaped
    # it would accept any character in that position.
    m = re.search(r"-([0-9]+)\.html", plink)
    if m is None:
        return link
    return "http://heise.de/-%s" % m.group(1)
# Rewrite the text of every <link> element to its short URL, then print the
# serialized, modified feed.
for link_node in tree.xpath('//link'):
    # An element without text content has text None, which the regex-based
    # conversion cannot handle; leave such elements as they are.
    if link_node.text is not None:
        link_node.text = convertlink(link_node.text)
# A single parenthesised argument prints the same under the Python 2 print
# statement and is also valid as a function call.
print(etree.tostring(tree))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment