# mammuth/GefluegelteWorteFetcher.py

## GefluegelteWorteFetcher.py
"""
This Python 3 script extracts all quotes (and their links) from the German Wikipedia page 'Liste geflügelter Worte', a list of familiar quotations.
It then stores them in an SQLite database.
FYI: At the time of writing the list was 2125 quotations long.
"""

from urllib import request
import re
import string
import sqlite3 as lite
import sys

# Base URL of the list. It ends with "/" because main() appends the letter of
# each sub-page directly to it (wikiUrl + c).
wikiUrl = "https://de.wikipedia.org/wiki/Liste_gefl%C3%BCgelter_Worte/"


# Fetch the HTML-Code of the Wiki-page
def fetchRawHtml(url):
	"""Download *url* and return the response body as text.

	Args:
		url: Address to fetch; anything urllib.request.urlopen accepts.

	Returns:
		The body decoded with the charset announced in the Content-Type
		header, or with UTF-8 when the header names none.

	Raises:
		urllib.error.URLError: If the request fails.
	"""
	# The context manager closes the response even if read() raises.
	with request.urlopen(url) as response:
		body = response.read()
		charset = response.headers.get_content_charset() or "utf-8"
	return body.decode(charset)


def genNameLinkTuples(rawHtml, listLetter):
	"""Extract (quote, link) tuples from the table of contents of one list page.

	A table-of-contents entry looks like
	<a href="#Quellennachweise"><span class="tocnumber">161</span> <span class="toctext">Quellennachweise</span></a>
	and yields the word "Quellennachweise" plus the link
	wikiUrl + listLetter + "#Quellennachweise".

	Args:
		rawHtml: HTML source of the page as a string.
		listLetter: Letter of the sub-page; it is inserted between wikiUrl
			and the "#..." fragment when the link is built.

	Returns:
		A list of (word, link) tuples in document order. The
		"Quellennachweise" (list of sources) entry and anchors that have no
		"toctext" span are left out.
	"""
	# One anchor with a fragment href whose content starts with a <span> and
	# ends with </span></a>. The quantifiers are bounded ([^"], [^>], lazy and
	# never crossing a </a>) so that several entries on the same line are
	# matched one by one instead of being merged into a single match.
	anchorPattern = re.compile(
		r'<a href="#[^"]*"[^>]*><span [^>]*>(?:(?!</a>).)*?</span></a>'
	)
	# Greedy on purpose: inside a single anchor the text runs up to the last
	# </span>, which keeps nested markup inside the quote intact.
	wordPattern = re.compile(r'toctext">(.*)</span>')
	linkPattern = re.compile(r'href="(#[^"]*)"')

	result = []
	for anchor in anchorPattern.findall(rawHtml):
		wordMatch = wordPattern.search(anchor)
		linkMatch = linkPattern.search(anchor)
		if wordMatch is None or linkMatch is None:
			# Not a table-of-contents entry (no toctext span): skip it rather
			# than storing a row without a quote.
			continue
		word = wordMatch.group(1)
		if word == "Quellennachweise":  # kick out Quellennachweis-links
			continue
		result.append((word, wikiUrl + listLetter + linkMatch.group(1)))

	return result


# Save list of (Wort,Link) tuples into database
def dicToDb(tupleList, dbPath='Worte.db'):
	"""Store (Wort, Link) tuples in the table "Worte" of an SQLite database.

	An existing "Worte" table is dropped and recreated, so every call
	replaces the previous content. Rows are numbered from 0 in list order.

	Args:
		tupleList: Iterable of sequences whose first item is the quote and
			whose second item is its link.
		dbPath: Path of the SQLite database file; defaults to 'Worte.db'.
	"""
	con = lite.connect(dbPath)
	try:
		# "with con" runs the statements as one transaction (commit on
		# success, rollback on error). It does not close the connection,
		# hence the surrounding try/finally.
		with con:
			con.execute("DROP TABLE IF EXISTS Worte")
			con.execute("CREATE TABLE Worte(Id INT, Wort TEXT, Link TEXT)")
			con.executemany(
				"INSERT INTO Worte VALUES (?, ?, ?)",
				((rowId, t[0], t[1]) for rowId, t in enumerate(tupleList)),
			)
	finally:
		con.close()


def main():
	"""Fetch every letter page, collect its (quote, link) tuples and save them.

	The letter 'X' is skipped (the script gives no reason; presumably there
	is no such sub-page). The database is written once, after all pages have
	been processed.
	"""
	collected = []
	for letter in string.ascii_uppercase:
		if letter == 'X':
			print("Skipping " + letter)
			continue

		print("Fetching List " + letter)
		# Download the page of this letter and pull out its (quote, link) tuples
		pageHtml = fetchRawHtml(wikiUrl + letter)
		collected.extend(genNameLinkTuples(pageHtml, letter))

	# Persist the complete list
	dicToDb(collected)


# Run the scraper only when the file is executed as a script, so that
# importing the module does not trigger network and database access.
if __name__ == "__main__":
	main()
	"""
	This Python 3 script extracts all quotes (and their links) from the German Wikipedia page 'Liste geflügelter Worte', a list of familiar quotations.
	It then stores them in an SQLite database.
	FYI: At the time of writing the list was 2125 quotations long.
	"""

	from urllib import request
	import re
	import string
	import sqlite3 as lite
	import sys

	# Base URL of the list. It ends with "/" because main() appends the letter of
	# each sub-page directly to it (wikiUrl+c).
	wikiUrl = "https://de.wikipedia.org/wiki/Liste_gefl%C3%BCgelter_Worte/"



	# Fetch the HTML-Code of the Wiki-page
	def fetchRawHtml(url):
		"""Download *url* and return the response body as text.

		Args:
			url: Address to fetch; anything urllib.request.urlopen accepts.

		Returns:
			The body decoded with the charset announced in the Content-Type
			header, or with UTF-8 when the header names none.

		Raises:
			urllib.error.URLError: If the request fails.
		"""
		# The context manager closes the response even if read() raises.
		with request.urlopen(url) as response:
			body = response.read()
			charset = response.headers.get_content_charset() or "utf-8"
		return body.decode(charset)



	def genNameLinkTuples(rawHtml, listLetter):
		"""Extract (quote, link) tuples from the table of contents of one list page.

		A table-of-contents entry looks like
		<a href="#Quellennachweise"><span class="tocnumber">161</span> <span class="toctext">Quellennachweise</span></a>
		and yields the word "Quellennachweise" plus the link
		wikiUrl + listLetter + "#Quellennachweise".

		Args:
			rawHtml: HTML source of the page as a string.
			listLetter: Letter of the sub-page; it is inserted between wikiUrl
				and the "#..." fragment when the link is built.

		Returns:
			A list of (word, link) tuples in document order. The
			"Quellennachweise" (list of sources) entry and anchors that have
			no "toctext" span are left out.
		"""
		# One anchor with a fragment href whose content starts with a <span>
		# and ends with </span></a>. The quantifiers are bounded ([^"], [^>],
		# lazy and never crossing a </a>) so that several entries on the same
		# line are matched one by one instead of being merged.
		anchorPattern = re.compile(
			r'<a href="#[^"]*"[^>]*><span [^>]*>(?:(?!</a>).)*?</span></a>'
		)
		# Greedy on purpose: inside a single anchor the text runs up to the
		# last </span>, which keeps nested markup inside the quote intact.
		wordPattern = re.compile(r'toctext">(.*)</span>')
		linkPattern = re.compile(r'href="(#[^"]*)"')

		result = []
		for anchor in anchorPattern.findall(rawHtml):
			wordMatch = wordPattern.search(anchor)
			linkMatch = linkPattern.search(anchor)
			if wordMatch is None or linkMatch is None:
				# Not a table-of-contents entry (no toctext span): skip it
				# rather than storing a row without a quote.
				continue
			word = wordMatch.group(1)
			if word == "Quellennachweise":  # kick out Quellennachweis-links
				continue
			result.append((word, wikiUrl + listLetter + linkMatch.group(1)))

		return result


	# Save list of (Wort,Link) tuples into database
	def dicToDb(tupleList, dbPath='Worte.db'):
		"""Store (Wort, Link) tuples in the table "Worte" of an SQLite database.

		An existing "Worte" table is dropped and recreated, so every call
		replaces the previous content. Rows are numbered from 0 in list order.

		Args:
			tupleList: Iterable of sequences whose first item is the quote
				and whose second item is its link.
			dbPath: Path of the SQLite database file; defaults to 'Worte.db'.
		"""
		con = lite.connect(dbPath)
		try:
			# "with con" runs the statements as one transaction (commit on
			# success, rollback on error). It does not close the connection,
			# hence the surrounding try/finally.
			with con:
				con.execute("DROP TABLE IF EXISTS Worte")
				con.execute("CREATE TABLE Worte(Id INT, Wort TEXT, Link TEXT)")
				con.executemany(
					"INSERT INTO Worte VALUES (?, ?, ?)",
					((rowId, t[0], t[1]) for rowId, t in enumerate(tupleList)),
				)
		finally:
			con.close()



	def main():
		"""Fetch every letter page, collect its (quote, link) tuples and save them.

		The letter 'X' is skipped (the script gives no reason; presumably
		there is no such sub-page). The database is written once, after all
		pages have been processed.
		"""
		collected = []
		# Run fetching method for every letter in the list
		for c in string.ascii_uppercase:
			if c == 'X':
				print("Skipping " + c)
			else:
				print("Fetching List " + c)
				# fetch Html-Page and extract (Quote,Link) tuples
				nextLetter = genNameLinkTuples(fetchRawHtml(wikiUrl+c),c)
				collected.extend(nextLetter) # merge the next letter into the current list

		# Save to Database
		dicToDb(collected)



	# Run the scraper only when the file is executed as a script, so that
	# importing the module does not trigger network and database access.
	if __name__ == "__main__":
		main()