Skip to content

Instantly share code, notes, and snippets.

@jokey2k
Created April 18, 2015 14:59
Show Gist options
  • Save jokey2k/c5082d53325d5e69f97b to your computer and use it in GitHub Desktop.
Save jokey2k/c5082d53325d5e69f97b to your computer and use it in GitHub Desktop.
import requests
import html5lib
import unicodecsv
import tqdm
# Home page URLs of the Linux user groups (LUGs) that this script checks.
# Fetched from http://www.linux-user.de/LUG on 18.04.2015 (DD.MM.YYYY).
# NOTE: "http://www.lug-ts.de" is listed twice, exactly as in the fetched data.
LUG_URLS = [
    "http://area51.fh-swf.de",
    "http://berklix.org/bim/",
    "http://bse.42.org",
    "http://ccw.iscool.net",
    "http://cologne.linuxaudio.org",
    "http://fibalug.de",
    "http://friedrichshain.homelinux.org",
    "http://gulag.de",
    "http://linux-werkstatt.huettenbusch.de/",
    "http://linux.baltic.net",
    "http://linux.eichsfeld.net",
    "http://linux.griebel-web.eu",
    "http://linux.php4u.org",
    "http://linux.studentensiedlung.de",
    "http://linuxahlen.li.funpic.de",
    "http://linuxheaven.cjb.net",
    "http://linuxnode.eichsfeld.net",
    "http://linuxusergrouplenningen.de.vu",
    "http://linuxwiki.de/LugMosbach",
    "http://linuxwiki.de/LugRoemerbergSpeyer",
    "http://liolug.liebigschule-giessen.de",
    "http://llugb.amsee.de",
    "http://luene-lug.org",
    "http://lug-bremen.info",
    "http://lug-dd.schlittermann.de",
    "http://lug-eisenach.de",
    "http://lug-hannover.de",
    "http://lug-owl.de/Lokales/Bielefeld/",
    "http://lug-owl.de/Lokales/Detmold/",
    "http://lug-owl.de/Lokales/Guetersloh/",
    "http://lug-owl.de/Lokales/Paderborn/",
    "http://lug-owl.de/LugWiki/GLUGHF",
    "http://lug-slf.de",
    "http://lug.fto.de",
    "http://lug.lohr-am-main.de",
    "http://lug.rhoen.de",
    "http://lugbuchholz.cwsurf.de",
    "http://lugdui.ihg.uni-duisburg.de",
    "http://lugnmb.dyndns.org",
    "http://lugo.signum-media.de",
    "http://lugrot.de",
    "http://lugulm.de",
    "http://ni-linux.de",
    "http://oldenburg.linux.de",
    "http://preetzlug.de",
    "http://rhlx01.rz.fht-esslingen.de/lug/",
    "http://ruhr.pm.org",
    "http://tuebingen.linux.de",
    "http://tux.hm",
    "http://tuxe.renchtal.com",
    "http://w3-net.ri-web.de/cont/lugnbg/index.php",
    "http://wlug.acos.net",
    "http://www.ahrlug.de",
    "http://www.align.de",
    "http://www.allgaeu.org/lugli",
    "http://www.alug.de",
    "http://www.amtuxtisch.de",
    "http://www.az-muelheim.de/penhour/",
    "http://www.bdpeng.de.vu",
    "http://www.belug.de",
    "http://www.bglug.de",
    "http://www.bluefrogs.de",
    "http://www.blug.de",
    "http://www.bonn.linux.de",
    "http://www.bralug.de",
    "http://www.brelug.de",
    "http://www.cc-itzehoe.de",
    "http://www.ccac.rwth-aachen.de",
    "http://www.cceev.de",
    "http://www.cch-holzminden.de",
    "http://www.cco-online.de/linux",
    "http://www.ccwn.org",
    "http://www.cham.baynet.de/lugo/",
    "http://www.clug.de",
    "http://www.colug.de",
    "http://www.cvr.de/linux",
    "http://www.dalug.org",
    "http://www.damme.de",
    "http://www.dlug.de",
    "http://www.dolug.de",
    "http://www.dorlug.de",
    "http://www.dulug.de",
    "http://www.elug.de",
    "http://www.erlug.de",
    "http://www.fen-net.de/flug",
    "http://www.fmi.uni-passau.de/~lug/",
    "http://www.freiburg.linux.de",
    "http://www.frilug.de",
    "http://www.gaos.org/lug-l/",
    "http://www.ghks.de/glug/",
    "http://www.gluga.de",
    "http://www.goelug.de",
    "http://www.grozilug.de",
    "http://www.gulug.info",
    "http://www.gunnet.de/linux/",
    "http://www.halix.info",
    "http://www.hallertux.de",
    "http://www.hatlug.de",
    "http://www.hilug-ng.de",
    "http://www.hmh-ev.de",
    "http://www.hulug.de",
    "http://www.ic.pirmasens.de",
    "http://www.infnet.verein.de/linux/",
    "http://www.init4.de",
    "http://www.kaalug.de",
    "http://www.karlsruhe.linux.de",
    "http://www.kglug.de",
    "http://www.koeln-lug.de",
    "http://www.kronachonline.de",
    "http://www.lalug.de",
    "http://www.lalug.net",
    "http://www.lanlug.org",
    "http://www.ldknet.org/lug/",
    "http://www.linux-bayreuth.de",
    "http://www.linux-mitterteich.de",
    "http://www.linuxag.hegau.org",
    "http://www.linuxdu.de",
    "http://www.linuxob.de",
    "http://www.linuxstammtisch.de",
    "http://www.linuxuser-luebeck.de",
    "http://www.linuxwiki.de/LugSinsheim",
    "http://www.listig.org",
    "http://www.loelug.de",
    "http://www.lug-aichach.de",
    "http://www.lug-albtal.de",
    "http://www.lug-an.de",
    "http://www.lug-balista.de",
    "http://www.lug-bamberg.de",
    "http://www.lug-bhv.de",
    "http://www.lug-bk.de",
    "http://www.lug-bruchsal.de",
    "http://www.lug-burghausen.org",
    "http://www.lug-bz.de",
    "http://www.lug-celle.de",
    "http://www.lug-coesfeld.de",
    "http://www.lug-datteln.de",
    "http://www.lug-delitzsch.de",
    "http://www.lug-eggenfelden.org",
    "http://www.lug-erding.de",
    "http://www.lug-erkelenz.de",
    "http://www.lug-erwitte.de",
    "http://www.lug-fs.de",
    "http://www.lug-grafing.org",
    "http://www.lug-hbs.de",
    "http://www.lug-hdh.de",
    "http://www.lug-hgw.de",
    "http://www.lug-in.de",
    "http://www.lug-kassel.de",
    "http://www.lug-ketsch.de",
    "http://www.lug-kiel.de",
    "http://www.lug-kl.de",
    "http://www.lug-kr.de",
    "http://www.lug-kronach.de",
    "http://www.lug-lauf.de",
    "http://www.lug-ld.de",
    "http://www.lug-loerrach.de",
    "http://www.lug-luenen.de",
    "http://www.lug-marl.de",
    "http://www.lug-meppen.de",
    "http://www.lug-myk.de",
    "http://www.lug-nb.de",
    "http://www.lug-nd.de",
    "http://www.lug-norderstedt.de",
    "http://www.lug-ottobrunn.de",
    "http://www.lug-owl.de",
    "http://www.lug-peine.org",
    "http://www.lug-qlb.de",
    "http://www.lug-raum-olpe.de.vu",
    "http://www.lug-reutlingen.de",
    "http://www.lug-rhwd.de",
    "http://www.lug-ro.org",
    "http://www.lug-s.org",
    "http://www.lug-saar.de",
    "http://www.lug-salem.de",
    "http://www.lug-sauerland.de",
    "http://www.lug-schaumburg.de",
    "http://www.lug-stormarn.de",
    "http://www.lug-sw.de",
    "http://www.lug-taunus.org",
    "http://www.lug-trier.de",
    "http://www.lug-ts.de",
    "http://www.lug-ts.de",
    "http://www.lug-untermain.de",
    "http://www.lug-viersen.de",
    "http://www.lug-vs.org",
    "http://www.lug-waldkraiburg.org",
    "http://www.lug-walsrode.de",
    "http://www.lug-westerwald.de",
    "http://www.lug-whv.de",
    "http://www.lug-wr.de",
    "http://www.lug-zw.de",
    "http://www.lug.wolfsburg.de",
    "http://www.luga.de",
    "http://www.luga.net",
    "http://www.lugab.de",
    "http://www.lugah.de",
    "http://www.lugal.org",
    "http://www.lugbb.org",
    "http://www.lugfl.de",
    "http://www.lugfrankfurt.de",
    "http://www.lugg.de",
    "http://www.luggg.de",
    "http://www.lugh.de",
    "http://www.lugl.net",
    "http://www.lugmoe.de",
    "http://www.lugmoers.de",
    "http://www.lugo.de",
    "http://www.lugog.de",
    "http://www.lugoland.de",
    "http://www.lugr.de",
    "http://www.lugrav.de",
    "http://www.lugse.de",
    "http://www.lugwue.de",
    "http://www.lulug.de",
    "http://www.lusc.de",
    "http://www.luug-hn.org",
    "http://www.luug-nordheide.de",
    "http://www.luusa.org",
    "http://www.mathematik.tu-darmstadt.de/dalug/",
    "http://www.mdlug.de",
    "http://www.mefia.org",
    "http://www.mglug.de",
    "http://www.moewa-lug.de",
    "http://www.mr-lug.de",
    "http://www.muc-lug.de",
    "http://www.mueslihq.de",
    "http://www.nluug.de",
    "http://www.osfang.de",
    "http://www.outerspace.de/lugrudo/",
    "http://www.pf-lug.de",
    "http://www.pug.org",
    "http://www.qlug.net",
    "http://www.schneifeltux.de",
    "http://www.si.unix-ag.org",
    "http://www.spelle.net/lugs",
    "http://www.t-online.de/home/mumumu/",
    "http://www.talug.de",
    "http://www.tlug.de",
    "http://www.trilug.fh-trier.de",
    "http://www.trolug.de",
    "http://www.ubuntu-berlin.de",
    "http://www.ufo.uni-mainz.de",
    "http://www.uliweb.de/lssg",
    "http://www.uni-koeln.de/themen/linux/",
    "http://www.unix-ag.uni-kl.de/~linux/",
    "http://www.unix.necoac.de",
    "http://www.uplug.de",
    "http://www.uugrn.org",
    "http://www.vlug.de",
    "http://www.wemelug.de",
    "http://www.witlug.de",
    "http://www.wlug.de",
    "http://www.wolug.de",
    "http://www.wuelug.de",
    "http://www.wuplug.org",
    "http://www.yalug.de",
    "http://www.zlug.org",
    "http://yalug.de",
]
def _page_title(lug_url):
    """Return the HTML title of ``lug_url`` or a short failure description.

    The page is requested with a 5 second timeout. When the request fails,
    the response status is not 200, or the document has no ``<title>``
    element, a human-readable text describing the problem is returned instead
    of a title.
    """
    try:
        req = requests.get(lug_url, timeout=5)
    except requests.exceptions.Timeout:
        # Must be tested before ConnectionError: a connect timeout derives
        # from both ConnectionError and Timeout and would otherwise be
        # reported as a missing domain.
        return "Timeout"
    except requests.exceptions.ConnectionError:
        return "No such domain"
    except requests.exceptions.TooManyRedirects:
        return "Too many redirects"
    except requests.exceptions.RequestException as e:
        return "Unknown error: %s" % str(e)

    if req.status_code != 200:
        return "HTTP Status: %s" % req.status_code

    # html5lib's default tree is an ElementTree whose tags live in the XHTML
    # namespace, hence the fully qualified tag names below.
    content = html5lib.parse(req.text)
    head_element = content.find("{http://www.w3.org/1999/xhtml}head")
    if head_element is not None:
        title_element = head_element.find("{http://www.w3.org/1999/xhtml}title")
    else:
        title_element = content.find("{http://www.w3.org/1999/xhtml}title")
    if title_element is None:
        return "No header in page"

    # An empty <title> has text None, and real titles are often padded with
    # newlines and indentation; collapse all of that into a single clean line.
    return " ".join((title_element.text or "").split())


# Maps every checked URL to its page title or to the failure description.
state = {}
for lug_url in tqdm.tqdm(LUG_URLS):
    state[lug_url] = _page_title(lug_url)
# Write one "URL;title-or-error" row per entry of LUG_URLS, in list order.
# unicodecsv encodes each row itself and hands bytes to the file object, so
# the file has to be opened in binary mode: text mode fails on Python 3 and
# mangles line endings on Windows under Python 2.
with open("lugs.csv", "wb") as outfile:
    writer = unicodecsv.writer(outfile, delimiter=";", quotechar='"', encoding='utf-8')
    writer.writerow(["lugsite", "page header"])
    for lug_url in LUG_URLS:
        writer.writerow([lug_url, state[lug_url]])
@jokey2k
Copy link
Author

jokey2k commented Apr 18, 2015

virtualenv my-test-env
source my-test-env/bin/activate
pip install unicodecsv tqdm requests html5lib

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment