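"""Parse cached court case files and load them into DocumentDB.

Example invocation (the script name here is illustrative; the arguments
match the parser configured in get_cmd_args below):

    python load_cases.py scc --cache-dir scc_cache --test-load
"""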
import base64
import gzip
import json
import logging
import os
import re
from argparse import ArgumentParser
from glob import glob

from pydocumentdb.document_client import DocumentClient

from KnoDB.crawler import bcca
from KnoDB.crawler.case_denormalizer import flatten, MissingCitationError
from KnoDB.crawler.scc import case_parser

logging.basicConfig(level=logging.INFO)
parse_logger = logging.getLogger('crawler.generate')
parse_logger.setLevel(logging.INFO)
load_logger = logging.getLogger('caseloader')
load_logger.setLevel(logging.INFO)
DOCDB_HOST = os.environ.get('DOCDB_HOST')
DOCDB_KEY = os.environ.get('DOCDB_KEY')
DOCDB_LAWS_COLLECTION = os.environ.get('DOCDB_LAWS_COLLECTION')

# if not (DOCDB_HOST and DOCDB_KEY and DOCDB_LAWS_COLLECTION):
#     raise OSError(
#         'One or more DocumentDB environment variables not set: '
#         'DOCDB_HOST, DOCDB_KEY, DOCDB_LAWS_COLLECTION')
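# DOCDB_HOST is assumed to be the DocumentDB account endpoint, e.g.
# 'https://<account>.documents.azure.com:443/'.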

def _get_ncit_from_key(unique_key):
    """Extract a neutral citation from the unique_key in the
    resource info cache file.

    Args:
        unique_key: the unique_key value from the ri cache file,
            e.g. '2015 SCC 12'.

    Returns:
        the extracted citation, or None if none is found.
    """
    re_ncit = re.compile(r'(?P<year>\d{4}) (?P<code>[a-z]+) (?P<index>\d+)', re.I)
    match = re_ncit.match(unique_key)
    return match.group(0) if match else None

def _get_cache_file_list(cache_dir):
    """Get all resource info files from the cache dir.

    Args:
        cache_dir: the relative path to the cache directory.

    Returns:
        a list of file names.
    """
    return glob('%s/ri_cache/*.ri' % cache_dir)

def _ri_to_dict(ri_str):
    """Create a dictionary from an ri string.

    Args:
        ri_str: the resource info file contents as a string.

    Returns:
        a resource info dictionary.
    """
    # Separate data from the other keys/values: find where the 'data'
    # key starts and split the string there. maxsplit=1 guards against
    # the pattern recurring inside the data itself.
    key_vals, data = re.split(r'\ndata :\s+', ri_str, maxsplit=1)
    # Create a list [key, value, key, value, ...]
    key_vals_list = re.split(r'\s:\s+|\n', key_vals)
    # Zip the list into a dictionary, then add data to the dict.
    ri_dict = dict(zip(key_vals_list[::2], key_vals_list[1::2]))
    ri_dict['data'] = data
    return ri_dict
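
# For reference, _ri_to_dict assumes cache files shaped roughly like
# the following (the field values here are illustrative; the field
# names are the ones consumed in load() below):
#
#   unique_key :   2015 SCC 12
#   title :        Some Style of Cause
#   url :          http://example.org/case.html
#   data :         <html>...</html>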

def _parse_case(parser, html, url, ncit, soc):
    """Call the given parser and parse the case.

    Compress the html using gzip, then make it json-compatible
    by converting to base64.

    Assign the fallback ncit and soc from the resource info
    if they're not in the case after parsing.

    Assign name and title to be the same as citation and soc.

    Rename 'paragraphs' to 'children' (maybe this should be
    done at the API layer instead of here).

    Remove the errors key generated during parsing.

    Args:
        parser: the parser to use, e.g., the scc or bcca parser.
        html: the html to parse the case from.
        url: the url obtained from the ri cache file.
        ncit: a fallback neutral citation obtained from the ri file.
        soc: a fallback style of cause obtained from the ri file.

    Returns:
        the parsed case.
    """
    # Run the parser to generate the case dict.
    case = parser.parse(html, url)
    # Compress case html using gzip, then base64-encode it. Decode the
    # result to str rather than wrapping the bytes in str(), which
    # would embed the "b'...'" repr in the json.
    compressed_html = gzip.compress(case['html'].encode('utf-8'))
    case['html'] = base64.b64encode(compressed_html).decode('ascii')
    # Set citation and soc if they're missing from the case dict.
    case['citation'] = case.get('citation') or ncit
    case['soc'] = case.get('soc') or soc
    # Make sure name and title have valid values.
    case['name'] = case['citation']
    case['title'] = case['soc']
    # Rename 'paragraphs' key to 'children'.
    case['children'] = case.pop('paragraphs', None)
    # Remove 'errors' key (tolerate its absence).
    case.pop('errors', None)
    return case
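
# Note: the stored html can be recovered later with, for example:
#   gzip.decompress(base64.b64decode(case['html'])).decode('utf-8')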

def _flatten_and_validate(case):
    """Flatten case and perform basic validation of data.

    Args:
        case: the case dict obtained from the parser.

    Returns:
        A list of the case's nodes, or None if validation fails.
    """
    try:
        case_nodes = flatten(case)
    except MissingCitationError:
        load_logger.error(
            'WILL NOT LOAD: '
            'case is missing a properly formatted "citation" key')
        return None
    citation = case.get('citation')
    if citation:
        load_logger.info('case citation present: %s', citation)
    else:
        load_logger.error(
            'WILL NOT LOAD: '
            'case has no "citation" key')
        return None
    soc = case.get('soc')
    if soc:
        load_logger.info('case soc present: %s', soc)
    else:
        load_logger.error(
            'WILL NOT LOAD: '
            'case has no "soc" key')
        return None
    if len(case_nodes) > 1:
        load_logger.info('case has paragraphs: %d', len(case_nodes) - 1)
    else:
        load_logger.warning('case has no paragraphs')
    return case_nodes

def _load_case(case):
    """Load a case into DocDB.

    Args:
        case: the case dict obtained from the parser.

    Returns:
        None
    """
    client = DocumentClient(DOCDB_HOST, {'masterKey': DOCDB_KEY})
    filename = case['name'].replace(' ', '_').replace('CanLII', 'SCC') + '.json'
    # Write the case to file (join the filename into the path rather
    # than passing it as open()'s mode argument).
    try:
        output_path = os.path.join(
            '/Projects/knomos/scc_cache/generated_docs/', filename)
        with open(output_path, 'w') as output_file:
            json.dump(case, output_file, indent=4)
    except TypeError:
        parse_logger.error('Failed to write citation ref to file')
    case_nodes = _flatten_and_validate(case) or []
    for node in case_nodes:
        # TODO: For now filter out any links that aren't strings
        # (assign the filtered list back so the filter takes effect).
        links = node.get('links') or []
        node['links'] = [link for link in links if isinstance(link, str)]
        # Create or upsert the node.
        # client.UpsertDocument(DOCDB_LAWS_COLLECTION, node)
        # client.CreateDocument(DOCDB_LAWS_COLLECTION, node)
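        # Note: when re-enabled, these calls expect a collection link,
        # typically of the form 'dbs/<database_id>/colls/<collection_id>',
        # which DOCDB_LAWS_COLLECTION is assumed to hold.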

def load(source, cache_dir, test_load=False):
    """Parse and load case files from a cache to DocumentDB.

    Args:
        source: the source name, e.g., 'scc', which is the
            name of the parser module directory to use.
        cache_dir: the relative path to the cache directory.
        test_load: if True, only print case stats; don't load.

    Returns:
        None
    """
    # Alias the parser based on source.
    parser = {
        'scc': case_parser,
        # 'bcca': bcca.case_parser
    }.get(source)
    # Raise a ValueError if the source is invalid.
    if not parser:
        raise ValueError(
            'source "%s" does not produce a valid parser.' % source)
    # Create a list of files in the cache.
    file_list = _get_cache_file_list(cache_dir)
    # For each file name, load the cache file and parse the html.
    for name in file_list:
        with open(name) as infile:
            # Load dict from ri file.
            ri_dict = _ri_to_dict(infile.read())
            # Get metadata from the ri dict.
            html = ri_dict['data']
            url = ri_dict.get('url', 'Error parsing url')
            soc = ri_dict['title']
            ncit = _get_ncit_from_key(ri_dict['unique_key'])
            case = _parse_case(parser, html, url, ncit, soc)
            if test_load:
                print('')
                load_logger.info('Test loading case %s, %s',
                                 case.get('citation'), case.get('soc'))
                # Perform a test load (print stats).
                _flatten_and_validate(case)
            else:
                print('')
                load_logger.info('Loading case %s, %s',
                                 case.get('citation'), case.get('soc'))
                # Load the case into docdb.
                _load_case(case)

def get_cmd_args():
    """Create, configure, and return the argument parser's arguments."""
    parser = ArgumentParser()
    parser.add_argument(
        "source",
        help="The source type (parser module directory name). "
             "Valid options are 'bcca' or 'scc'.")
    parser.add_argument(
        "-c", "--cache-dir",
        required=True,
        help="The relative cache directory to load data from")
    parser.add_argument(
        "-t", "--test-load",
        action='store_true',
        default=False,
        help="Print case stats and don't load the case.")
    return parser.parse_args()

def _main(args):
    """Run from the command line.

    Args:
        args: the parsed command-line arguments.

    Returns:
        None
    """
    load(args.source, args.cache_dir, args.test_load)


if __name__ == '__main__':
    _main(get_cmd_args())