Last active
July 13, 2021 20:35
-
-
Save wincentbalin/7c3cda669094ef830c5f93decb145dd5 to your computer and use it in GitHub Desktop.
Generate a text corpus of German laws
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from html.parser import HTMLParser | |
# Matches URLs of partial list pages, e.g. ".../Teilliste_A.html"
RE_TEILLISTE = re.compile(r'/Teilliste_\w\.html$', re.IGNORECASE)
def get_url(attrs):
    """Return the value of the href attribute joined with START_URL, or None.

    attrs is the (name, value) pair list that HTMLParser passes to tag
    handlers.
    """
    hrefs = [value for name, value in attrs if name == 'href']
    if hrefs:
        return urljoin(START_URL, hrefs[0])
    return None
class AktuellParser(HTMLParser):
    """Collects URLs of partial law list pages from anchors in the page."""

    def __init__(self):
        super().__init__()
        # URLs of partial list pages, in document order.
        self.urls = []

    def error(self, message):
        logging.error('HTML parse error: {}'.format(message))

    def handle_starttag(self, tag, attrs):
        # Only anchors can contribute; keep those matching the partial-list pattern.
        if tag != 'a':
            return
        url = get_url(attrs)
        if url is not None and RE_TEILLISTE.search(url):
            self.urls.append(url)

    def get_urls(self):
        """Return all collected partial-list URLs."""
        return self.urls
# Download the start page and collect the partial-list URLs it links to.
parser = AktuellParser()
with urlopen(START_URL) as response:
    page = response.read().decode('iso-8859-1')
parser.feed(page)
partial_list_urls = parser.get_urls()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/sh
#
# Create a textual dataset using MSXSL
#
# Usage: create_dataset.sh directory_with_gii_xml_files dataset_directory
#
# Applies the giitotext.xsl stylesheet to every .xml file in the input
# directory, writing one .txt file per law into the dataset directory.

if [ $# -ne 2 ]
then
    echo "Usage: create_dataset.sh directory_with_gii_xml_files dataset_directory"
    exit 1
fi

for xmlfile in "$1"/*.xml
do
    echo "Processing $xmlfile"
    # Quote every expansion so file names containing spaces survive word
    # splitting; $(...) replaces the legacy backtick substitution.
    txtfile=$(basename "$xmlfile" .xml).txt
    msxsl "$xmlfile" giitotext.xsl > "$2/$txtfile"
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
"""This is German laws corpus builder.""" | |
import re | |
import sys | |
import logging | |
import argparse | |
from pathlib import Path | |
from html.parser import HTMLParser | |
from urllib.request import urlopen, urlretrieve | |
from urllib.error import HTTPError, URLError | |
from urllib.parse import urljoin | |
# Entry page that links to the partial lists of all current German laws.
START_URL = 'http://www.gesetze-im-internet.de/aktuell.html'
# Matches URLs of partial list pages, e.g. ".../Teilliste_A.html"
RE_TEILLISTE = re.compile(r'/Teilliste_\w\.html$', re.IGNORECASE)
# Captures the law abbreviation from an index URL, e.g. ".de/bgb/index.html" -> "bgb"
RE_GESETZ = re.compile(r'\.de/(.+)/index\.html$', re.IGNORECASE)
def get_url(attrs):
    """Return the href attribute value joined with START_URL, or None.

    attrs is the (name, value) pair list HTMLParser hands to tag handlers.
    """
    candidates = [value for name, value in attrs if name == 'href']
    return urljoin(START_URL, candidates[0]) if candidates else None
def get_law_title(attrs):
    """Return the value of the first 'title' attribute in attrs, else None."""
    return next((value for name, value in attrs if name == 'title'), None)
class AktuellParser(HTMLParser):
    """Extracts partial-list page URLs from the aktuell.html start page."""

    def __init__(self):
        super().__init__()
        # Partial-list URLs found so far, in document order.
        self.urls = []

    def error(self, message):
        logging.error('HTML parse error: {}'.format(message))

    def handle_starttag(self, tag, attrs):
        if tag != 'a':
            return
        url = get_url(attrs)
        # Keep only anchors that point at a partial list page.
        if url is not None and RE_TEILLISTE.search(url):
            self.urls.append(url)

    def get_urls(self):
        """Return the collected partial-list URLs."""
        return self.urls
class TeillisteParser(HTMLParser):
    """Parses a partial list page into law records: {'name', 'url', 'title'}."""

    def __init__(self):
        super().__init__()
        # Pending record for the law whose <a> tag was seen most recently.
        self.law_props = {}
        # Completed records, in document order.
        self.laws = []

    def error(self, message):
        logging.error('HTML parse error: {}'.format(message))

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            url = get_url(attrs)
            if url is None:
                return
            # Anchors to ".../<abbrev>/index.html" open a new law record.
            matched = RE_GESETZ.search(url)
            if matched:
                self.law_props = {'name': matched.group(1), 'url': url}
        elif tag == 'abbr' and self.law_props:
            # The <abbr> following a law anchor carries the full title.
            title = get_law_title(attrs)
            if title is None:
                raise ValueError('Unknown law with abbreviation {}'.format(self.law_props['name']))
            self.law_props['title'] = title
            self.laws.append(self.law_props)
            self.law_props = {}

    def get_laws(self):
        """Return the completed law records."""
        return self.laws
def fetch(args: argparse.Namespace):
    """fetch command.

    Downloads the law metadata (start page plus all partial lists).  With
    args.list set, prints "name<TAB>title" for every law and returns.
    Otherwise downloads each law's xml.zip archive (optionally restricted to
    the abbreviations in args.only) into the args.cache directory.
    """
    logging.info('Downloading law metadata...')
    parser = AktuellParser()
    try:
        logging.debug('Fetching {}'.format(START_URL))
        with urlopen(START_URL) as response:
            # The site serves Latin-1 encoded pages.
            parser.feed(response.read().decode('iso-8859-1'))
    except (HTTPError, URLError):
        logging.error('Error fetching {}'.format(START_URL))
        return
    partial_list_urls = parser.get_urls()
    parser = TeillisteParser()
    for url in partial_list_urls:
        try:
            logging.debug('Fetching {}'.format(url))
            with urlopen(url) as response:
                parser.feed(response.read().decode('iso-8859-1'))
        except (HTTPError, URLError):
            logging.error('Error fetching {}'.format(url))
            return
    laws = parser.get_laws()
    if args.list:
        for law in laws:
            print('{}\t{}'.format(law['name'], law['title']))
        return
    # Create the cache directory once, up front.  parents=True also creates
    # missing intermediate directories (the original bare mkdir() raised
    # FileNotFoundError there and re-checked existence on every iteration).
    args.cache.mkdir(parents=True, exist_ok=True)
    for index, law in enumerate(laws, 1):
        if args.only and law['name'] not in args.only:
            continue
        logging.info('Downloading "{}" ({}) [{}/{}]...'.format(law['title'], law['name'], index, len(laws)))
        url = urljoin(law['url'], 'xml.zip')
        try:
            logging.debug('Fetching {}'.format(url))
            # urlretrieve stores the download in a temporary file.
            local_filename, _ = urlretrieve(url)
        except (HTTPError, URLError) as error:
            logging.warning('Error fetching {}: {}'.format(url, error))
            continue
        target_filename = args.cache / '{}.xml.zip'.format(law['name'])
        logging.debug('Moving downloaded file to target filename {}'.format(target_filename))
        # NOTE(review): Path.replace can fail across filesystems (temp dir on
        # a different device than the cache) -- confirm, or use shutil.move.
        Path(local_filename).replace(target_filename)
def main():
    """Parse command line arguments, configure logging and dispatch a command."""

    class SplitArgs(argparse.Action):
        """Command line argument as comma separated list"""

        def __call__(self, parser, namespace, values, option_string=None):
            # Lowercase each element and drop empties (e.g. trailing commas).
            items = [value.lower() for value in values.split(',') if value]
            setattr(namespace, self.dest, items)

    parser = argparse.ArgumentParser(description=sys.modules[__name__].__doc__)
    # Without a subcommand, just show usage.
    parser.set_defaults(func=lambda args: parser.print_usage())
    parser.add_argument('-c', '--cache', help='Cache directory for fetched files', type=Path, default=Path.cwd() / 'cache')
    parser.add_argument('--debug', action='store_true', help='Print debug messages')
    parser.add_argument('--quiet', action='store_true', help='Print errors only')
    subparsers = parser.add_subparsers(title='Commands')
    fetch_parser = subparsers.add_parser('fetch', help='Fetch laws in xml.zip format into the cache directory')
    fetch_parser.add_argument('-o', '--only', action=SplitArgs, help='List of comma separated law abbreviations (example: BGB,hGb)')
    fetch_parser.add_argument('-l', '--list', action='store_true', help='List laws and exit')
    fetch_parser.set_defaults(func=fetch)
    args = parser.parse_args()

    # --debug wins over --quiet; default verbosity is INFO.
    if args.debug:
        logging_level = logging.DEBUG
    elif args.quiet:
        logging_level = logging.WARNING
    else:
        logging_level = logging.INFO
    logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging_level)
    args.func(args)


if __name__ == '__main__':
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from html.parser import HTMLParser | |
from urllib.request import urlopen | |
class Linkparser(HTMLParser):
    """Collects the href targets of all <a> tags fed to the parser."""

    def __init__(self):
        super().__init__()
        # Collected href values, in document order.
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            href = self.get_href(attrs)
            # Skip anchors without an href attribute: the original appended
            # None here, polluting the result list with non-links.
            if href is not None:
                self.links.append(href)

    def get_href(self, attrs):
        """Return the value of the href attribute, or None when absent."""
        for key, value in attrs:
            if key == 'href':
                return value
        return None

    def get_links(self):
        """Return all collected link targets."""
        return self.links
# Fetch the start page and print every link target found on it.
parser = Linkparser()
with urlopen('https://www.gesetze-im-internet.de/') as response:
    markup = response.read().decode('iso-8859-1')
parser.feed(markup)
for link in parser.get_links():
    print(link)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from html.parser import HTMLParser | |
# Captures the law abbreviation from an index URL, e.g. ".de/bgb/index.html" -> "bgb"
RE_GESETZ = re.compile(r'\.de/(.+)/index\.html$', re.IGNORECASE)
def get_law_title(attrs):
    """Return the value of the first 'title' attribute in attrs, else None."""
    for attr_name, attr_value in attrs:
        if attr_name == 'title':
            return attr_value
    return None
class TeillisteParser(HTMLParser):
    """Turns a partial list page into law records: {'name', 'url', 'title'}."""

    def __init__(self):
        super().__init__()
        # Pending record of the law whose <a> tag was seen most recently.
        self.law_props = {}
        # Completed law records, in document order.
        self.laws = []

    def error(self, message):
        logging.error('HTML parse error: {}'.format(message))

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            self._handle_anchor(attrs)
        elif tag == 'abbr' and self.law_props:
            self._handle_abbreviation(attrs)

    def _handle_anchor(self, attrs):
        # An anchor to ".../<abbrev>/index.html" starts a new law record.
        url = get_url(attrs)
        if url is None:
            return
        matched = RE_GESETZ.search(url)
        if matched:
            self.law_props = {'name': matched.group(1), 'url': url}

    def _handle_abbreviation(self, attrs):
        # The <abbr> following a law anchor carries the full title.
        title = get_law_title(attrs)
        if title is None:
            raise ValueError('Unknown law with abbreviation {}'.format(self.law_props['name']))
        self.law_props['title'] = title
        self.laws.append(self.law_props)
        self.law_props = {}

    def get_laws(self):
        """Return the completed law records."""
        return self.laws
# Parse every partial list page and collect the law records.
parser = TeillisteParser()
for url in partial_list_urls:
    with urlopen(url) as response:
        parser.feed(response.read().decode('iso-8859-1'))
laws = parser.get_laws()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment