Generate German laws
import re
import logging
from html.parser import HTMLParser
from urllib.request import urlopen
from urllib.parse import urljoin

START_URL = 'http://www.gesetze-im-internet.de/aktuell.html'
RE_TEILLISTE = re.compile(r'/Teilliste_\w\.html$', re.IGNORECASE)

def get_url(attrs):
    """Find the href attribute and join it with the base URL."""
    for key, value in attrs:
        if key == 'href':
            return urljoin(START_URL, value)
    return None

class AktuellParser(HTMLParser):
    """Collects the URLs of the partial lists (Teillisten) linked from the start page."""

    def __init__(self):
        super(AktuellParser, self).__init__()
        self.urls = []

    def error(self, message):
        logging.error('HTML parse error: {}'.format(message))

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            url = get_url(attrs)
            if url is None:
                return
            if RE_TEILLISTE.search(url):
                self.urls.append(url)

    def get_urls(self):
        return self.urls

parser = AktuellParser()
with urlopen(START_URL) as response:
    parser.feed(response.read().decode('iso-8859-1'))
partial_list_urls = parser.get_urls()
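
To see what AktuellParser extracts, one can feed it a hand-written HTML fragment (the fragment below is illustrative, not actual page content):

    demo = AktuellParser()
    demo.feed('<a href="Teilliste_A.html">A</a> <a href="impressum.html">Impressum</a>')
    print(demo.get_urls())
    # ['http://www.gesetze-im-internet.de/Teilliste_A.html']

Relative links are resolved against START_URL by urljoin, and only links matching RE_TEILLISTE are kept.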
#!/bin/sh
#
# Create a textual dataset using MSXSL
#
# Usage: create_dataset.sh directory_with_gii_xml_files dataset_directory

if [ $# -ne 2 ]
then
    echo "Usage: create_dataset.sh directory_with_gii_xml_files dataset_directory"
    exit 1
fi

for xmlfile in "$1"/*.xml
do
    echo "Processing $xmlfile"
    txtfile=$(basename "$xmlfile" .xml).txt
    msxsl "$xmlfile" giitotext.xsl > "$2/$txtfile"
done
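
An example invocation, assuming the unpacked law XML files sit in ./xml and the text files should go to ./dataset (both directory names are placeholders, and the dataset directory must already exist):

    sh create_dataset.sh xml dataset

Note that msxsl is a Windows command-line tool; on other platforms an equivalent XSLT processor such as xsltproc would have to be substituted.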
#!/usr/bin/env python3
"""Corpus builder for German laws."""
import re
import sys
import logging
import argparse
from pathlib import Path
from html.parser import HTMLParser
from urllib.request import urlopen, urlretrieve
from urllib.error import HTTPError, URLError
from urllib.parse import urljoin

START_URL = 'http://www.gesetze-im-internet.de/aktuell.html'
RE_TEILLISTE = re.compile(r'/Teilliste_\w\.html$', re.IGNORECASE)
RE_GESETZ = re.compile(r'\.de/(.+)/index\.html$', re.IGNORECASE)


def get_url(attrs):
    """Find the href attribute and join it with the base URL."""
    for key, value in attrs:
        if key == 'href':
            return urljoin(START_URL, value)
    return None


def get_law_title(attrs):
    """Find the law title in the title attribute."""
    for key, value in attrs:
        if key == 'title':
            return value
    return None


class AktuellParser(HTMLParser):
    """Collects the URLs of the partial lists (Teillisten) linked from the start page."""

    def __init__(self):
        super(AktuellParser, self).__init__()
        self.urls = []

    def error(self, message):
        logging.error('HTML parse error: {}'.format(message))

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            url = get_url(attrs)
            if url is None:
                return
            if RE_TEILLISTE.search(url):
                self.urls.append(url)

    def get_urls(self):
        return self.urls


class TeillisteParser(HTMLParser):
    """Collects name, URL and title of every law listed on a partial list page."""

    def __init__(self):
        super(TeillisteParser, self).__init__()
        self.law_props = {}
        self.laws = []

    def error(self, message):
        logging.error('HTML parse error: {}'.format(message))

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            url = get_url(attrs)
            if url is None:
                return
            match = RE_GESETZ.search(url)
            if match:
                self.law_props = {'name': match.group(1), 'url': url}
        elif tag == 'abbr' and self.law_props:
            # The abbr tag following a law link carries the full title of the law
            title = get_law_title(attrs)
            if title is None:
                raise ValueError('Unknown law with abbreviation {}'.format(self.law_props['name']))
            self.law_props['title'] = title
            self.laws.append(self.law_props)
            self.law_props = {}

    def get_laws(self):
        return self.laws


def fetch(args: argparse.Namespace):
    """fetch command"""
    logging.info('Downloading law metadata...')
    parser = AktuellParser()
    try:
        logging.debug('Fetching {}'.format(START_URL))
        with urlopen(START_URL) as response:
            parser.feed(response.read().decode('iso-8859-1'))
    except (HTTPError, URLError):
        logging.error('Error fetching {}'.format(START_URL))
        return
    partial_list_urls = parser.get_urls()
    parser = TeillisteParser()
    for url in partial_list_urls:
        try:
            logging.debug('Fetching {}'.format(url))
            with urlopen(url) as response:
                parser.feed(response.read().decode('iso-8859-1'))
        except (HTTPError, URLError):
            logging.error('Error fetching {}'.format(url))
            return
    laws = parser.get_laws()
    if args.list:
        for law in laws:
            print('{}\t{}'.format(law['name'], law['title']))
    else:
        for index, law in enumerate(laws, 1):
            if args.only and law['name'] not in args.only:
                continue
            logging.info('Downloading "{}" ({}) [{}/{}]...'.format(law['title'], law['name'], index, len(laws)))
            try:
                url = urljoin(law['url'], 'xml.zip')
                logging.debug('Fetching {}'.format(url))
                local_filename, _ = urlretrieve(url)
            except (HTTPError, URLError) as error:
                logging.warning('Error fetching {}: {}'.format(url, error))
                continue
            target_filename = args.cache / '{}.xml.zip'.format(law['name'])
            logging.debug('Moving downloaded file to target filename {}'.format(target_filename))
            if not args.cache.exists():
                args.cache.mkdir()
            Path(local_filename).replace(target_filename)


def main():
    class SplitArgs(argparse.Action):
        """Parse a command line argument as a comma-separated list."""
        def __call__(self, parser, namespace, values, option_string=None):
            setattr(namespace, self.dest, [v.lower() for v in values.split(',') if v])

    parser = argparse.ArgumentParser(description=sys.modules[__name__].__doc__)
    parser.set_defaults(func=lambda args: parser.print_usage())
    parser.add_argument('-c', '--cache', help='Cache directory for fetched files', type=Path, default=Path.cwd() / 'cache')
    parser.add_argument('--debug', action='store_true', help='Print debug messages')
    parser.add_argument('--quiet', action='store_true', help='Print errors only')
    subparsers = parser.add_subparsers(title='Commands')
    fetch_parser = subparsers.add_parser('fetch', help='Fetch laws in xml.zip format into the cache directory')
    fetch_parser.add_argument('-o', '--only', action=SplitArgs, help='List of comma separated law abbreviations (example: BGB,hGb)')
    fetch_parser.add_argument('-l', '--list', action='store_true', help='List laws and exit')
    fetch_parser.set_defaults(func=fetch)
    args = parser.parse_args()
    logging_level = logging.DEBUG if args.debug else logging.WARNING if args.quiet else logging.INFO
    logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s', level=logging_level)
    args.func(args)


if __name__ == '__main__':
    main()
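
The script is driven through argparse subcommands. Assuming it is saved as build_corpus.py (the gist does not show the file name), typical invocations would be:

    # list all laws with their abbreviations and titles
    python3 build_corpus.py fetch --list

    # fetch only selected laws into the default ./cache directory
    python3 build_corpus.py fetch --only BGB,HGB

    # fetch everything into a custom cache directory, with debug output
    python3 build_corpus.py --debug --cache /tmp/laws fetch

The global options --cache, --debug and --quiet belong to the main parser and therefore have to precede the fetch subcommand.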
from html.parser import HTMLParser
from urllib.request import urlopen

class Linkparser(HTMLParser):
    """Collects the href attribute of every anchor tag."""

    def __init__(self):
        super(Linkparser, self).__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            href = self.get_href(attrs)
            if href is not None:  # skip anchors without an href attribute
                self.links.append(href)

    def get_href(self, attrs):
        for key, value in attrs:
            if key == 'href':
                return value
        return None

    def get_links(self):
        return self.links

parser = Linkparser()
with urlopen('https://www.gesetze-im-internet.de/') as response:
    parser.feed(response.read().decode('iso-8859-1'))
for link in parser.get_links():
    print(link)
import re
import logging
from html.parser import HTMLParser
from urllib.request import urlopen

# get_url, START_URL and partial_list_urls are defined in the first snippet above
RE_GESETZ = re.compile(r'\.de/(.+)/index\.html$', re.IGNORECASE)

def get_law_title(attrs):
    """Find the law title in the title attribute."""
    for key, value in attrs:
        if key == 'title':
            return value
    return None

class TeillisteParser(HTMLParser):
    """Collects name, URL and title of every law listed on a partial list page."""

    def __init__(self):
        super(TeillisteParser, self).__init__()
        self.law_props = {}
        self.laws = []

    def error(self, message):
        logging.error('HTML parse error: {}'.format(message))

    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            url = get_url(attrs)
            if url is None:
                return
            match = RE_GESETZ.search(url)
            if match:
                self.law_props = {'name': match.group(1), 'url': url}
        elif tag == 'abbr' and self.law_props:
            title = get_law_title(attrs)
            if title is None:
                raise ValueError('Unknown law with abbreviation {}'.format(self.law_props['name']))
            self.law_props['title'] = title
            self.laws.append(self.law_props)
            self.law_props = {}

    def get_laws(self):
        return self.laws

parser = TeillisteParser()
for url in partial_list_urls:
    with urlopen(url) as response:
        parser.feed(response.read().decode('iso-8859-1'))
laws = parser.get_laws()
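
After the fetch command has run, the cache directory holds one <name>.xml.zip archive per law, while create_dataset.sh expects a directory of plain .xml files. A minimal unpacking step (assumed here, not part of the gist; the directory names cache and xml are the same placeholders as above) could look like this:

    import zipfile
    from pathlib import Path

    cache = Path('cache')   # default cache directory of the fetch command
    xml_dir = Path('xml')   # hypothetical input directory for create_dataset.sh
    xml_dir.mkdir(exist_ok=True)

    for archive in cache.glob('*.xml.zip'):
        # each archive contains the law text as XML; extract only the XML members
        with zipfile.ZipFile(archive) as zf:
            for member in zf.namelist():
                if member.endswith('.xml'):
                    zf.extract(member, xml_dir)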