Skip to content

Instantly share code, notes, and snippets.

@ntnn
Created May 3, 2015 14:17
Show Gist options
  • Save ntnn/10fdddda724c1691d591 to your computer and use it in GitHub Desktop.
Save ntnn/10fdddda724c1691d591 to your computer and use it in GitHub Desktop.
from lxml import html
import requests
from difflib import unified_diff
import pynma
# Module-level notifier shared by parse(), which calls P.push() whenever a
# watched page has changed.
# NOTE(review): the constructor is given an empty string here -- presumably a
# placeholder for the notification service API key; confirm before running.
P = pynma.PyNMA('')
def parse(name, url, xpath, url_base=''):
    """Scrape a page, compare it with the previous run and notify on change.

    The document at ``url`` is downloaded and parsed, every element matched
    by ``xpath`` is flattened to text with ``printElems``, and the result is
    compared with the contents of ``<name>.txt`` saved by the previous run.
    If the two differ, the unified diff is pushed through the module-level
    notifier ``P``, ``Pling!`` is printed and ``<name>.txt`` is overwritten
    with the new text.

    Args:
        name: Label used in the notification event and as the stem of the
            state file (``<name>.txt`` in the current working directory).
        url: Address of the page to download.
        xpath: XPath expression selecting the elements to watch.
        url_base: Prefix handed to ``printElems`` for link attributes.

    Returns:
        None.
    """
    state_file = '{}.txt'.format(name)

    # NOTE(review): the request has no timeout and the HTTP status is not
    # checked, so an error page is scraped like a regular one.
    tree = html.fromstring(requests.get(url).content)
    output = ''.join(printElems(elem, url_base) for elem in tree.xpath(xpath))

    # The scraped text contains non-ASCII characters (e.g. umlauts), so the
    # state file is read and written as UTF-8 instead of the locale default.
    try:
        with open(state_file, 'r', encoding='utf-8') as f:
            old_output = f.readlines()
    except FileNotFoundError:
        # First run for this page: there is no previous state, so everything
        # that was scraped counts as new.
        old_output = []

    # unified_diff() returns a generator; join it exactly once so the same
    # text is used for both the emptiness check and the notification body.
    # The previous state goes first so that new entries show up as '+' lines.
    diff = ''.join(unified_diff(old_output, output.splitlines(keepends=True)))
    if diff:
        P.push("KIT Scraper", "New: {}".format(name), diff)
        print('Pling!')
        with open(state_file, 'w', encoding='utf-8') as f:
            f.write(output)
def printElems(elem, url_base=''):
    """Flatten an element and its descendants into newline-terminated text.

    For ``elem`` and then, depth-first in document order, each descendant:
    the element's ``text`` is emitted when it is not ``None``; afterwards, if
    the element's first attribute value contains ``'pdf'``, that value is
    emitted prefixed with ``url_base``.

    Args:
        elem: Element-like object providing ``text``, ``values()`` (list of
            attribute values) and iteration over its direct children.
        url_base: String prepended to the emitted attribute value.

    Returns:
        The concatenated lines, each terminated by ``'\\n'``; an empty string
        if nothing was emitted.
    """
    lines = []
    if elem.text is not None:
        lines.append(elem.text + '\n')

    # Only the first attribute value is inspected, so a link is picked up
    # only when it is the element's first attribute.
    # NOTE(review): this relies on attribute order in the page markup.
    attr_values = elem.values()
    if attr_values and 'pdf' in attr_values[0]:
        lines.append(url_base + attr_values[0] + '\n')

    # Iterate the element directly; getchildren() is deprecated.
    lines.extend(printElems(child, url_base) for child in elem)
    return ''.join(lines)
# Course pages whose exercise sheets are links with "Übungsblatt" in the
# link text.
parse('numerik', 'http://www.math.kit.edu/ianm3/lehre/numinfing2015s/', '//a[contains(text(), "Übungsblatt")]')
parse('lina', 'http://www.math.kit.edu/iag3/lehre/la2info2015s/de', '//a[contains(text(), "Übungsblatt")]')

# This page is matched through highlighted table rows; url_base is passed so
# the emitted link values are prefixed with the page address.
parse('ro', 'http://ti.itec.uka.de/TI-2/Uebungen/', '//tr[@bgcolor="#fff5a9"]/td/p', url_base='http://ti.itec.uka.de/TI-2/Uebungen/')

# Pages on the ITI host share one layout and are addressed by page id.
ITI_HOST = 'https://crypto.iti.kit.edu/'
ITI_XPATH = '//h2[text() = "Ablauf" or text() = "Übungsblätter"]/following-sibling::ul[position()=1]/li'
ITI_URLS = ['algo-sose15', 'sic-sose15']
for url in ITI_URLS:
    # ITI_HOST already ends with '/', so no extra separator is inserted
    # (the previous format string produced '...kit.edu//index.php').
    parse(url, '{}index.php?id={}'.format(ITI_HOST, url), ITI_XPATH, ITI_HOST)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment