Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save SimonMayerhofer/7c95d377b2f45c5ec0d4ff945289f663 to your computer and use it in GitHub Desktop.
Save SimonMayerhofer/7c95d377b2f45c5ec0d4ff945289f663 to your computer and use it in GitHub Desktop.
Script to parse the Wikipedia pages for common German typos and convert them to Alfred Snippets. See: https://de.wikipedia.org/wiki/Wikipedia:Liste_von_Tippfehlern
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
'''
Script to parse the Wikipedia pages for common German typos and convert them
to Alfred snippets.
See: https://de.wikipedia.org/wiki/Wikipedia:Liste_von_Tippfehlern
'''
import json
import os
import re
import urllib.request
import uuid
# Module-level counter of collected corrections; incremented in get_all_typos().
entries = 0
def get_correction_list(page_source):
    """Extract typo corrections from the HTML of one typo-list sub page.

    Matches list items of the form ``<li>typo (correction)``. Every ``+`` in
    the typo is replaced by a space, and an optional ``*`` directly after the
    typo or the correction is ignored. A trailing ``+`` on the correction is
    part of the captured text and is kept as-is.

    Args:
        page_source: HTML source of the page as a string.

    Returns:
        A list with one dict per match, in document order, shaped like
        ``{"alfredsnippet": {"keyword": ..., "name": ..., "snippet": ...,
        "uid": ...}}``. ``keyword`` and ``snippet`` carry a trailing space;
        ``uid`` is a freshly generated upper-case UUID4 string.
    """
    # Raw string: the pattern is full of backslash escapes (\s, \w, \( ...)
    # that are invalid escape sequences inside a normal string literal.
    regex = r"<li>\s*([\w+]+)\*?\s?\(([\w\s]+\+?)\*?\)"
    correction_list = []
    for typo, correction in re.findall(regex, page_source, re.UNICODE):
        name = typo.replace("+", " ")
        correction_list.append({"alfredsnippet": {
            "keyword": name + " ",
            "name": name,
            "snippet": correction + " ",
            "uid": str(uuid.uuid4()).upper(),
        }})
    return correction_list
def parse_sub_page(sub_page):
    """Download one sub page of the typo list and extract its corrections.

    Args:
        sub_page: Name of the sub page (for example ``"A"`` or ``"PQ"``); it
            is appended verbatim to the base URL of the Wikipedia typo list.

    Returns:
        The result of ``get_correction_list`` for the downloaded HTML.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    url = ("https://de.wikipedia.org/wiki/Wikipedia:Liste_von_Tippfehlern/" +
           sub_page)
    # The context manager closes the HTTP response even if reading fails.
    with urllib.request.urlopen(url) as response:
        # The Content-Type header may not declare a charset, in which case
        # get_content_charset() returns None; fall back to UTF-8 then.
        charset = response.headers.get_content_charset() or "utf-8"
        page_source = response.read().decode(charset)
    return get_correction_list(page_source)
def get_all_typos():
    """Collect the corrections from every sub page of the typo list.

    Fetches the sub pages one after another through ``parse_sub_page``,
    printing the page name as progress output, and adds the number of
    collected corrections to the module-level ``entries`` counter.

    Returns:
        One flat list holding the corrections of all sub pages, in page order.
    """
    global entries
    sub_pages = [
        'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L',
        'M', 'N', 'O', 'PQ', 'R', 'S', 'T', 'U', 'V', 'W', 'XYZ',
    ]
    collected = []
    print("Start fetching data...")
    for sub_page in sub_pages:
        print(sub_page + "...")
        size_before = len(collected)
        collected.extend(parse_sub_page(sub_page))
        # Count exactly the corrections this page contributed.
        entries += len(collected) - size_before
    return collected
def write_to_files(typo_list):
    """Write every correction to its own Alfred snippet JSON file.

    Each file is created in the ``german_typos_wikipedia/`` directory
    (relative to the current working directory) and is named
    ``"<name> [<uid>].json"``. It contains the entry serialized as one line
    of JSON. A summary with the number of written files is printed at the
    end.

    Args:
        typo_list: Iterable of dicts shaped like
            ``{"alfredsnippet": {"name": ..., "uid": ..., ...}}``.
    """
    directory = "german_typos_wikipedia/"
    # Create the output directory once instead of re-checking per entry.
    os.makedirs(directory, exist_ok=True)
    written = 0
    for line in typo_list:
        item = line["alfredsnippet"]
        filename = item["name"] + " [" + item["uid"] + "].json"
        with open(directory + filename, encoding='utf-8', mode='w') as f:
            # json.dumps escapes quotes and backslashes properly; swapping
            # quote characters in the dict's repr produces invalid JSON as
            # soon as a value contains a quote. ensure_ascii=False keeps
            # umlauts readable in the UTF-8 encoded file.
            f.write(json.dumps(line, ensure_ascii=False) + "\n")
        written += 1
    print("finished " + u'\u2713')
    # Report the files actually written rather than relying on module state.
    print("Entries: " + str(written))
if __name__ == "__main__":
    # Script entry point: fetch all corrections, then write one snippet file
    # per correction.
    write_to_files(typo_list=get_all_typos())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment