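"""Parse cached court case files and load them into DocumentDB.

Example invocation (the script name here is illustrative; the arguments
match the parser configured in get_cmd_args below):

    python load_cases.py scc --cache-dir scc_cache --test-load
"""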
import base64
import gzip
import json
import logging
import os
import re
from argparse import ArgumentParser
from glob import glob

from pydocumentdb.document_client import DocumentClient

from KnoDB.crawler import bcca
from KnoDB.crawler.case_denormalizer import flatten, MissingCitationError
from KnoDB.crawler.scc import case_parser

logging.basicConfig(level=logging.INFO)
parse_logger = logging.getLogger('crawler.generate')
parse_logger.setLevel(logging.INFO)
load_logger = logging.getLogger('caseloader')
load_logger.setLevel(logging.INFO)
DOCDB_HOST = os.environ.get('DOCDB_HOST')
DOCDB_KEY = os.environ.get('DOCDB_KEY')
DOCDB_LAWS_COLLECTION = os.environ.get('DOCDB_LAWS_COLLECTION')

# if not (DOCDB_HOST and DOCDB_KEY and DOCDB_LAWS_COLLECTION):
#     raise OSError(
#         'One or more DocumentDB environment variables not set: '
#         'DOCDB_HOST, DOCDB_KEY, DOCDB_LAWS_COLLECTION')
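# DOCDB_HOST is assumed to be the DocumentDB account endpoint, e.g.
# 'https://<account>.documents.azure.com:443/'.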

def _get_ncit_from_key(unique_key):
    """Extract a neutral citation from the unique_key in the
    resource info cache file.

    Args:
        unique_key: the unique_key value from the ri cache file,
            e.g. '2015 SCC 12'.

    Returns:
        the extracted citation, or None if none is found.
    """
    re_ncit = re.compile(r'(?P<year>\d{4}) (?P<code>[a-z]+) (?P<index>\d+)', re.I)
    match = re_ncit.match(unique_key)
    return match.group(0) if match else None

def _get_cache_file_list(cache_dir):
    """Get all resource info files from the cache dir.

    Args:
        cache_dir: the relative path to the cache directory.

    Returns:
        a list of file names.
    """
    return glob('%s/ri_cache/*.ri' % cache_dir)

def _ri_to_dict(ri_str):
    """Create a dictionary from an ri string.

    Args:
        ri_str: the resource info file contents as a string.

    Returns:
        a resource info dictionary.
    """
    # Separate data from the other keys/values: find where the 'data'
    # key starts and split the string there. maxsplit=1 guards against
    # the pattern recurring inside the data itself.
    key_vals, data = re.split(r'\ndata :\s+', ri_str, maxsplit=1)
    # Create a list [key, value, key, value, ...]
    key_vals_list = re.split(r'\s:\s+|\n', key_vals)
    # Zip the list into a dictionary, then add data to the dict.
    ri_dict = dict(zip(key_vals_list[::2], key_vals_list[1::2]))
    ri_dict['data'] = data
    return ri_dict
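
# For reference, _ri_to_dict assumes cache files shaped roughly like
# the following (the field values here are illustrative; the field
# names are the ones consumed in load() below):
#
#   unique_key :   2015 SCC 12
#   title :        Some Style of Cause
#   url :          http://example.org/case.html
#   data :         <html>...</html>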

def _parse_case(parser, html, url, ncit, soc):
    """Call the given parser and parse the case.

    Compress the html using gzip, then make it json-compatible
    by converting to base64.

    Assign the fallback ncit and soc from the resource info
    if they're not in the case after parsing.

    Assign name and title to be the same as citation and soc.

    Rename 'paragraphs' to 'children' (maybe this should be
    done at the API layer instead of here).

    Remove the errors key generated during parsing.

    Args:
        parser: the parser to use, e.g., the scc or bcca parser.
        html: the html to parse the case from.
        url: the url obtained from the ri cache file.
        ncit: a fallback neutral citation obtained from the ri file.
        soc: a fallback style of cause obtained from the ri file.

    Returns:
        the parsed case.
    """
    # Run the parser to generate the case dict.
    case = parser.parse(html, url)
    # Compress case html using gzip, then base64-encode it. Decode the
    # result to str rather than wrapping the bytes in str(), which
    # would embed the "b'...'" repr in the json.
    compressed_html = gzip.compress(case['html'].encode('utf-8'))
    case['html'] = base64.b64encode(compressed_html).decode('ascii')
    # Set citation and soc if they're missing from the case dict.
    case['citation'] = case.get('citation') or ncit
    case['soc'] = case.get('soc') or soc
    # Make sure name and title have valid values.
    case['name'] = case['citation']
    case['title'] = case['soc']
    # Rename 'paragraphs' key to 'children'.
    case['children'] = case.pop('paragraphs', None)
    # Remove 'errors' key (tolerate its absence).
    case.pop('errors', None)
    return case
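
# Note: the stored html can be recovered later with, for example:
#   gzip.decompress(base64.b64decode(case['html'])).decode('utf-8')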

def _flatten_and_validate(case):
    """Flatten case and perform basic validation of data.

    Args:
        case: the case dict obtained from the parser.

    Returns:
        A list of the case's nodes, or None if validation fails.
    """
    try:
        case_nodes = flatten(case)
    except MissingCitationError:
        load_logger.error(
            'WILL NOT LOAD: '
            'case is missing a properly formatted "citation" key')
        return None
    citation = case.get('citation')
    if citation:
        load_logger.info('case citation present: %s', citation)
    else:
        load_logger.error(
            'WILL NOT LOAD: '
            'case has no "citation" key')
        return None
    soc = case.get('soc')
    if soc:
        load_logger.info('case soc present: %s', soc)
    else:
        load_logger.error(
            'WILL NOT LOAD: '
            'case has no "soc" key')
        return None
    if len(case_nodes) > 1:
        load_logger.info('case has paragraphs: %d', len(case_nodes) - 1)
    else:
        load_logger.warning('case has no paragraphs')
    return case_nodes

def _load_case(case):
    """Load a case into DocDB.

    Args:
        case: the case dict obtained from the parser.

    Returns:
        None
    """
    client = DocumentClient(DOCDB_HOST, {'masterKey': DOCDB_KEY})
    filename = case['name'].replace(' ', '_').replace('CanLII', 'SCC') + '.json'
    # Write the case to file (join the filename into the path rather
    # than passing it as open()'s mode argument).
    try:
        output_path = os.path.join(
            '/Projects/knomos/scc_cache/generated_docs/', filename)
        with open(output_path, 'w') as output_file:
            json.dump(case, output_file, indent=4)
    except TypeError:
        parse_logger.error('Failed to write citation ref to file')
    case_nodes = _flatten_and_validate(case) or []
    for node in case_nodes:
        # TODO: For now filter out any links that aren't strings
        # (assign the filtered list back so the filter takes effect).
        links = node.get('links') or []
        node['links'] = [link for link in links if isinstance(link, str)]
        # Create or upsert the node.
        # client.UpsertDocument(DOCDB_LAWS_COLLECTION, node)
        # client.CreateDocument(DOCDB_LAWS_COLLECTION, node)
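        # Note: when re-enabled, these calls expect a collection link,
        # typically of the form 'dbs/<database_id>/colls/<collection_id>',
        # which DOCDB_LAWS_COLLECTION is assumed to hold.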

def load(source, cache_dir, test_load=False):
    """Parse and load case files from a cache to DocumentDB.

    Args:
        source: the source name, e.g., 'scc', which is the
            name of the parser module directory to use.
        cache_dir: the relative path to the cache directory.
        test_load: if True, only print case stats; don't load.

    Returns:
        None
    """
    # Alias the parser based on source.
    parser = {
        'scc': case_parser,
        # 'bcca': bcca.case_parser
    }.get(source)
    # Raise a ValueError if the source is invalid.
    if not parser:
        raise ValueError(
            'source "%s" does not produce a valid parser.' % source)
    # Create a list of files in the cache.
    file_list = _get_cache_file_list(cache_dir)
    # For each file name, load the cache file and parse the html.
    for name in file_list:
        with open(name) as infile:
            # Load dict from ri file.
            ri_dict = _ri_to_dict(infile.read())
            # Get metadata from the ri dict.
            html = ri_dict['data']
            url = ri_dict.get('url', 'Error parsing url')
            soc = ri_dict['title']
            ncit = _get_ncit_from_key(ri_dict['unique_key'])
            case = _parse_case(parser, html, url, ncit, soc)
            if test_load:
                print('')
                load_logger.info('Test loading case %s, %s',
                                 case.get('citation'), case.get('soc'))
                # Perform a test load (print stats).
                _flatten_and_validate(case)
            else:
                print('')
                load_logger.info('Loading case %s, %s',
                                 case.get('citation'), case.get('soc'))
                # Load the case into docdb.
                _load_case(case)

def get_cmd_args():
    """Create, configure, and return the argument parser's arguments."""
    parser = ArgumentParser()
    parser.add_argument(
        "source",
        help="The source type (parser module directory name). "
             "Valid options are 'bcca' or 'scc'.")
    parser.add_argument(
        "-c", "--cache-dir",
        required=True,
        help="The relative cache directory to load data from")
    parser.add_argument(
        "-t", "--test-load",
        action='store_true',
        default=False,
        help="Print case stats and don't load the case.")
    return parser.parse_args()

def _main(args):
    """Run from the command line.

    Args:
        args: the parsed command-line arguments.

    Returns:
        None
    """
    load(args.source, args.cache_dir, args.test_load)


if __name__ == '__main__':
    _main(get_cmd_args())