Skip to content

Instantly share code, notes, and snippets.

@mammuth
Last active August 29, 2015 14:11
Show Gist options
  • Save mammuth/221d97cfe4c60a3644c0 to your computer and use it in GitHub Desktop.
Save mammuth/221d97cfe4c60a3644c0 to your computer and use it in GitHub Desktop.
Crawls the Wikipedia "Liste geflügelter Worte" and saves each quotation together with its absolute link in an SQLite database
"""
This Python 3 script extracts all quotations (and their links) from the Wikipedia 'Liste geflügelter Worte', a list of familiar quotations,
and stores them in a database.
FYI: the list currently contains 2125 quotations.
"""
from urllib import request
import re
import string
import sqlite3 as lite
import sys
# Base URL of the Wikipedia list (note the trailing slash).  A letter is
# appended to it to address one letter's page, and the letter plus an
# "#anchor" fragment to link to a single quotation on that page.
wikiUrl = "https://de.wikipedia.org/wiki/Liste_gefl%C3%BCgelter_Worte/"
# Fetch the HTML-Code of the Wiki-page
def fetchRawHtml(url):
    """Download *url* and return the response body as text.

    The response is opened in a ``with`` block so the underlying connection
    is released even if reading or decoding fails.  The body is decoded with
    the charset announced in the response headers, falling back to UTF-8
    when none is declared.

    Args:
        url: Address of the page to fetch.

    Returns:
        The decoded page content as a ``str``.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        UnicodeDecodeError: If the body is not valid in the chosen charset.
    """
    with request.urlopen(url) as response:
        # Prefer the server-declared charset; UTF-8 was the implicit default
        # of the bare ``decode()`` call this replaces.
        charset = response.headers.get_content_charset() or "utf-8"
        return response.read().decode(charset)
# One table-of-contents entry, e.g.
#   <a href="#Quellennachweise"><span class="tocnumber">161</span> <span class="toctext">Quellennachweise</span></a>
# Group 1 is the in-page anchor ("#Quellennachweise"), group 2 the entry text.
# The "(?!</a>)" look-aheads keep a match inside a single <a> element, so
# several entries sharing one line are matched one by one instead of being
# swallowed by a single greedy match.
_TOC_ENTRY_RE = re.compile(
    r'<a href="(#[^"]*)"[^>]*>'
    r'(?:(?!</a>).)*?'
    r'<span [^>]*toctext">((?:(?!</a>).)*?)</span></a>'
)


def genNameLinkTuples(rawHtml, listLetter, baseUrl=None):
    """Extract ``(quotation, absolute link)`` tuples from a letter page.

    Every table-of-contents anchor that carries a ``toctext`` span yields one
    tuple.  HTML entities in the text and in the anchor are unescaped.  The
    "Quellennachweise" (references) entry is not a quotation and is skipped,
    as are anchors without a ``toctext`` span.

    Args:
        rawHtml: HTML source of one letter page.
        listLetter: Letter of that page; it is inserted between the base URL
            and the anchor when the link is built.
        baseUrl: Prefix for the generated links.  Defaults to the
            module-level ``wikiUrl``.

    Returns:
        A list of ``(word, link)`` tuples in document order; empty when the
        page contains no matching entry.
    """
    import html  # function-scope import: not part of the file's import block

    if baseUrl is None:
        baseUrl = wikiUrl

    entries = []
    for match in _TOC_ENTRY_RE.finditer(rawHtml):
        anchor, word = (html.unescape(part) for part in match.groups())
        if word == "Quellennachweise":  # drop the references entry
            continue
        entries.append((word, baseUrl + listLetter + anchor))
    return entries
# Save list of (Wort,Link) tuples into database
def dicToDb(tupleList, dbPath='Worte.db'):
    """Rebuild the ``Worte`` table from a list of ``(Wort, Link)`` tuples.

    Any existing ``Worte`` table is dropped and recreated, then one row per
    tuple is inserted with a running ``Id`` that starts at 0.  The connection
    is always closed afterwards; ``with con`` on its own only commits or
    rolls back and would leave it open.

    Args:
        tupleList: Iterable of sequences whose first item is the quotation
            and whose second item is its link.
        dbPath: SQLite database file to write to.  Defaults to ``'Worte.db'``
            in the current working directory.
    """
    rows = ((rowId, t[0], t[1]) for rowId, t in enumerate(tupleList))
    con = lite.connect(dbPath)
    try:
        with con:  # commit on success, roll back if an insert fails
            cur = con.cursor()
            cur.execute("DROP TABLE IF EXISTS Worte")
            cur.execute("CREATE TABLE Worte(Id INT, Wort TEXT, Link TEXT)")
            cur.executemany("INSERT INTO Worte VALUES (?, ?, ?)", rows)
    finally:
        con.close()
def main():
    """Crawl the page of every letter except X and store all quotations.

    For each uppercase ASCII letter the matching page is fetched, its
    (quotation, link) tuples are extracted and collected; the combined list
    is written to the database once all letters have been processed.
    """
    quotes = []
    for letter in string.ascii_uppercase:
        # X is deliberately left out (presumably there is no page for it).
        if letter == 'X':
            print("Skipping " + letter)
            continue
        print("Fetching List " + letter)
        # Download this letter's page and pull out its (quote, link) tuples.
        page = fetchRawHtml(wikiUrl + letter)
        quotes.extend(genNameLinkTuples(page, letter))
    # Persist everything that was collected.
    dicToDb(quotes)
# Start the crawl only when the file is executed as a script, so that merely
# importing it does not hit the network or drop the database table.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment