Created
March 9, 2017 19:48
-
-
Save jabney/86f28629a0e00924a9aed6a8d1961c65 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import re | |
import gzip | |
import base64 | |
import json | |
from glob import glob | |
from argparse import ArgumentParser | |
from KnoDB.crawler.scc import case_parser | |
from KnoDB.crawler import bcca | |
from KnoDB.crawler.case_denormalizer import flatten, MissingCitationError | |
from pydocumentdb.document_client import DocumentClient | |
import logging | |
# Root logging config at INFO so parse/load progress is visible on stdout.
logging.basicConfig(level=logging.INFO)
# Logger used for parsing progress and file-write errors.
parse_logger = logging.getLogger('crawler.generate')
parse_logger.setLevel(logging.INFO)
# Logger used for load/validation progress and errors.
load_logger = logging.getLogger('caseloader')
load_logger.setLevel(logging.INFO)
# DocumentDB connection settings, read from the environment.
# NOTE(review): each may be None when unset — the guard below is
# currently disabled, so failures surface later at connection time.
DOCDB_HOST = os.environ.get('DOCDB_HOST')
DOCDB_KEY = os.environ.get('DOCDB_KEY')
DOCDB_LAWS_COLLECTION = os.environ.get('DOCDB_LAWS_COLLECTION')
# if not (DOCDB_HOST and DOCDB_KEY and DOCDB_LAWS_COLLECTION):
#     raise OSError(
#         'One or more DocumentDB environment variables not set: '
#         'DOCDB_HOST, DOCDB_KEY, DOCDB_LAWS_COLLECTION')
def _get_ncit_from_key(unique_key): | |
"""Extract a neutral citation out of the unique_key from | |
the resource info cache file. | |
Args: | |
unique_key: the unique_key value from the ri cache file. | |
Returns: | |
the extracted citation or None. | |
""" | |
re_ncit = re.compile(r'(?P<year>\d{4}) (?P<code>[a-z]+) (?P<index>\d+)', re.I) | |
match = re_ncit.match(unique_key) | |
return match and match.group(0) or None | |
def _get_cache_file_list(cache_dir): | |
"""Get all resource info files from the cache dir. | |
Args: | |
cache_dir: the relative path to the cache directory. | |
Returns: | |
a list of file names. | |
""" | |
return glob('%s/ri_cache/*.ri' % cache_dir) | |
def _ri_to_dict(ri_str): | |
"""Create a dictionary from an ri string. | |
Args: | |
ri_str: the resource info file contents as a string. | |
Returns: | |
a resource info dictionary. | |
""" | |
# Separate data from the other keys/values. | |
# Find where the 'data' key starts. Split the string there. | |
keyvals_and_data = re.split(r'\ndata :\s+', ri_str) | |
key_vals = keyvals_and_data[0] | |
data = keyvals_and_data[1] | |
# Create a list [key, value, key, value, ...] | |
key_vals_list = re.split(r'\s:\s+|\n', key_vals) | |
# Zip the list into a dictionary. Add data to the dict. | |
ri_dict = dict(zip(key_vals_list[::2], key_vals_list[1::2])) | |
ri_dict['data'] = data | |
return ri_dict | |
def _parse_case(parser, html, url, ncit, soc): | |
"""Call the given parser and parse the case. | |
Compress the html using gzip, then make it json-compatible | |
by converting to base64. | |
Assign the fallback ncit and soc from the resource info | |
if it's not in the case after parsing. | |
Assign name and title to be the same as citation and soc. | |
Rename 'paragraphs' to 'children' (maybe this should be | |
done at the API layer instead of here). | |
Remove the errors key generated during parsing. | |
Args: | |
parser: the parser to use, e.g., the scc or bcca parser. | |
html: the html to parse the case from. | |
url: the url obtained from the ri cache file. | |
ncit: a fallback neutral citation obtained from ri file. | |
soc: a fallback style of cause obtained from ri file. | |
Returns: | |
the parsed case. | |
""" | |
# Run the parser to generate the case dict. | |
case = parser.parse(html, url) | |
# Compress case html using gzip. | |
compressed_html = gzip.compress(case['html'].encode('utf-8')) | |
case['html'] = str(base64.b64encode(compressed_html)) | |
# Set citation and soc if they're missing from the case dict. | |
case['citation'] = case.get('citation') or ncit | |
case['soc'] = case.get('soc') or soc | |
# Make sure name and title have valid values. | |
case['name'] = case['citation'] | |
case['title'] = case['soc'] | |
# Rename 'paragraphs' key to 'children'. | |
paragraphs = case.get('paragraphs') | |
case['children'] = paragraphs | |
case.pop('paragraphs', None) | |
# Remove 'errors' key. | |
case.pop('errors') | |
return case | |
def _flatten_and_validate(case):
    """Flatten case and perform basic validation of data.

    Args:
        case: the case dict obtained from the parser.

    Returns:
        A list of the case's nodes, or None if validation fails.
    """
    try:
        case_nodes = flatten(case)
    except MissingCitationError as e:
        # Include the exception detail instead of discarding it
        # (the original bound `e` but never used it).
        load_logger.error(
            'WILL NOT LOAD: '
            'case is missing a properly formatted "citation" key: %s', e)
        return None
    # Guard clauses: bail out on each missing field, log on success.
    citation = case.get('citation')
    if not citation:
        load_logger.error(
            'WILL NOT LOAD: '
            'case has no "citation" key')
        return None
    load_logger.info('case citation present: %s', citation)
    soc = case.get('soc')
    if not soc:
        load_logger.error(
            'WILL NOT LOAD: '
            'case has no "soc" key')
        return None
    load_logger.info('case soc present: %s', soc)
    # One node is the case itself; anything beyond that is paragraphs.
    if len(case_nodes) > 1:
        load_logger.info('case has paragraphs: %d', len(case_nodes) - 1)
    else:
        load_logger.warning('case has no paragraphs')
    return case_nodes
def _load_case(case):
    """Load a case into DocDB.

    Writes a JSON copy of the case to a local cache directory, then
    flattens the case into nodes ready for upsert (the DocumentDB
    writes themselves are currently commented out).

    Args:
        case: the case dict obtained from the parser.

    Returns:
        None
    """
    client = DocumentClient(DOCDB_HOST, {'masterKey': DOCDB_KEY})
    filename = case['name'].replace(' ', '_').replace('CanLII', 'SCC') + '.json'
    # TODO(review): hard-coded absolute path; make this configurable.
    output_dir = '/Projects/knomos/scc_cache/generated_docs/'
    # Write the case to file.
    try:
        # BUG FIX: the original passed `filename` as open()'s *mode*
        # argument (open(path, filename, 'w')), so the write always
        # failed; join the filename into the path instead.
        with open(os.path.join(output_dir, filename), 'w') as output_file:
            json.dump(case, output_file, indent=4)
    except TypeError:
        parse_logger.error('Failed to write citation ref to file')
    case_nodes = _flatten_and_validate(case) or []
    for node in case_nodes:
        # TODO: For now filter out any links that aren't strings.
        if 'links' in node:
            # BUG FIX: the filtered list was built but never stored
            # back on the node; assign it so the upsert (when enabled)
            # sees the cleaned links.
            node['links'] = [
                link for link in (node['links'] or [])
                if isinstance(link, str)]
        # Create or upsert the node.
        # client.UpsertDocument(DOCDB_LAWS_COLLECTION, node)
        # client.CreateDocument(DOCDB_LAWS_COLLECTION, node)
def load(source, cache_dir, test_load=False):
    """Parse and load case files from a cache to DocumentDB.

    Args:
        source: the source name, e.g., 'scc', which is the
            name of the parser module directory to use.
        cache_dir: the relative cache directory holding ri files.
        test_load: when True, only validate and print case stats
            instead of loading into DocDB.

    Returns:
        None

    Raises:
        ValueError: if `source` has no registered parser.
    """
    # Alias the parser based on source.
    parser = ({
        'scc': case_parser
        # 'bcca': bcca.case_parser
    }).get(source)
    # Raise a ValueError if the source is invalid.
    if not parser:
        raise ValueError(
            'source "%s" does not produce a valid parser.' % source)
    # For each file name in the cache, load the cache file and parse
    # the html.
    for name in _get_cache_file_list(cache_dir):
        with open(name) as infile:
            # Load dict from ri file; close the handle immediately
            # rather than holding it open through parse/load.
            ri_dict = _ri_to_dict(infile.read())
        # Get metadata from ri dict.
        html = ri_dict['data']
        url = ri_dict.get('url', 'Error parsing url')
        soc = ri_dict['title']
        ncit = _get_ncit_from_key(ri_dict['unique_key'])
        case = _parse_case(parser, html, url, ncit, soc)
        print('')
        if test_load:
            load_logger.info('Test loading case %s, %s',
                             case.get('citation'), case.get('soc'))
            # Perform a test load (print stats).
            _flatten_and_validate(case)
        else:
            load_logger.info('Loading case %s, %s',
                             case.get('citation'), case.get('soc'))
            # Load the case in docdb.
            _load_case(case)
def get_cmd_args():
    """Create, configure, and return the argument parser's arguments.

    Returns:
        the parsed argparse Namespace with `source`, `cache_dir`,
        and `test_load` attributes.
    """
    parser = ArgumentParser()
    parser.add_argument(
        "source",
        # BUG FIX: the adjacent string literals were joined with no
        # separator, rendering as "...directory name)Valid options...".
        help="The source type (parser module directory name). "
             "Valid options are 'bcca' or 'scc'.")
    parser.add_argument(
        "-c", "--cache-dir",
        required=True,
        help="The relative cache directory to load data from")
    parser.add_argument(
        "-t", "--test-load",
        action='store_true',
        default=False,
        help="Print case stats and don't load the case.")
    return parser.parse_args()
def _main(args):
    """Command-line entry point.

    Args:
        args: a parsed argument namespace (see get_cmd_args).

    Returns:
        None
    """
    source, cache_dir, test_load = args.source, args.cache_dir, args.test_load
    load(source, cache_dir, test_load)
# Script entry point: parse command-line arguments and run the loader.
if __name__ == '__main__':
    _main(get_cmd_args())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment