Last active
August 29, 2015 14:11
-
-
Save mammuth/221d97cfe4c60a3644c0 to your computer and use it in GitHub Desktop.
Crawls the Wikipedia "Liste geflügelter Worte" and saves the word itself and its absolute link in a dictionary
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""
This Python 3 script extracts all the quotes (and their links) from the Wikipedia 'Liste geflügelter Worte', a list of familiar quotations.
It then stores them in a SQLite database.
FYI: At the time of writing, the list is 2125 quotations long.
"""
from urllib import request | |
import re | |
import string | |
import sqlite3 as lite | |
import sys | |
# Base URL of the Wikipedia list. Callers append the letter of a sub-page
# (e.g. "A") directly to this string, which is why it ends with a slash.
wikiUrl = "https://de.wikipedia.org/wiki/Liste_gefl%C3%BCgelter_Worte/"
# Fetch the HTML-Code of the Wiki-page | |
def fetchRawHtml(url):
    """Download *url* and return the response body as text.

    Args:
        url: Address passed to ``urllib.request.urlopen``.

    Returns:
        The complete response body decoded as UTF-8.

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # The context manager closes the response (and the underlying
    # connection) even when reading or decoding raises.
    with request.urlopen(url) as response:
        return response.read().decode("utf-8")
def genNameLinkTuples(rawHtml, listLetter, baseUrl=None):
    """Extract ``(quote, link)`` tuples from the table of contents of a page.

    A table-of-contents entry looks like::

        <a href="#Quellennachweise"><span class="tocnumber">161</span> <span class="toctext">Quellennachweise</span></a>

    The text inside the ``toctext`` span becomes the quote and the ``href``
    anchor is turned into an absolute link.

    Args:
        rawHtml: HTML source of one list page.
        listLetter: Name of the sub-page (e.g. ``"A"``); it is inserted
            between the base URL and the ``#anchor``.
        baseUrl: Prefix for the generated links. Defaults to the module-level
            ``wikiUrl``.

    Returns:
        A list of ``(quote, absoluteLink)`` tuples in document order. The
        "Quellennachweise" entry and anchors without a ``toctext`` span are
        left out.
    """
    if baseUrl is None:
        baseUrl = wikiUrl

    # One in-page anchor whose content starts with a <span> and ends with
    # </span>. ``[^"]*`` keeps the anchor name from running past its closing
    # quote, and the ``(?!</a>)`` guard keeps a match from spilling into the
    # next anchor when several share one line.
    entryPattern = re.compile(
        r'<a href="#([^"]*)"[^>]*><span (?:(?!</a>).)*</span></a>'
    )
    # Greedy on purpose: the title may itself contain nested spans, so take
    # everything up to the last </span> of the entry.
    wordPattern = re.compile(r'toctext">(.*)</span>')

    result = []
    for entry in entryPattern.finditer(rawHtml):
        wordMatch = wordPattern.search(entry.group(0))
        if wordMatch is None:
            # Not a table-of-contents entry (e.g. a footnote link).
            continue
        word = wordMatch.group(1)
        if word == "Quellennachweise":  # kick out the references section
            continue
        result.append((word, baseUrl + listLetter + "#" + entry.group(1)))
    return result
# Save list of (Wort,Link) tuples into database | |
def dicToDb(tupleList, dbPath='Worte.db'):
    """Store ``(Wort, Link)`` tuples in the ``Worte`` table of a SQLite file.

    An existing ``Worte`` table is dropped and recreated, so each call
    replaces the previous contents. Rows get consecutive ``Id`` values
    starting at 0, in the order of *tupleList*.

    Args:
        tupleList: Iterable of tuples whose first item is the quote and whose
            second item is its link.
        dbPath: Database file to write. Defaults to ``'Worte.db'`` in the
            current working directory.
    """
    con = lite.connect(dbPath)
    try:
        # ``with con`` commits on success and rolls back on an exception, but
        # it does not close the connection; the ``finally`` below does.
        with con:
            cur = con.cursor()
            cur.execute("DROP TABLE IF EXISTS Worte")
            cur.execute("CREATE TABLE Worte(Id INT, Wort TEXT, Link TEXT)")
            cur.executemany(
                "INSERT INTO Worte VALUES (?, ?, ?)",
                ((rowId, t[0], t[1]) for rowId, t in enumerate(tupleList)),
            )
    finally:
        con.close()
def main():
    """Crawl the list page of every letter and store all quotes.

    Walks through ``string.ascii_uppercase``, printing a progress line per
    letter. The letter 'X' is not fetched (presumably there is no list page
    for it -- confirm against the wiki). The collected ``(quote, link)``
    tuples of all letters are written to the database in one go at the end.
    """
    quotes = []
    for letter in string.ascii_uppercase:
        if letter != 'X':
            print("Fetching List " + letter)
            # Download the page of this letter and pull out its tuples.
            pageHtml = fetchRawHtml(wikiUrl + letter)
            quotes.extend(genNameLinkTuples(pageHtml, letter))
        else:
            print("Skipping " + letter)
    # Persist everything that was gathered.
    dicToDb(quotes)
# Start the crawl only when the file is executed as a script, so importing
# the module does not trigger network and database access.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment