Skip to content

Instantly share code, notes, and snippets.

@jermnelson
Last active April 1, 2016 21:08
Show Gist options
  • Save jermnelson/7f234fb736a2115364e2 to your computer and use it in GitHub Desktop.
Save jermnelson/7f234fb736a2115364e2 to your computer and use it in GitHub Desktop.
DPLA JSON to Redis Protocol Script
*.swp
*.resp
package main
import (
// "bufio"
// "crypto/sha1"
"encoding/json"
"fmt"
//"io"
"os"
)
// check panics with e when e is a non-nil error and otherwise does nothing.
func check(e error) {
	if e == nil {
		return
	}
	panic(e)
}
// rec_parser opens the JSON document at filepath, decodes it into a generic
// map and prints the record's "@id" (as hex bytes), its provider "@id" and
// the "name" of every entry in sourceResource.language.
//
// It panics (through check) when the file cannot be opened or the JSON
// cannot be decoded. The unchecked type assertions also panic when the
// document does not have the expected nested-object layout.
func rec_parser(filepath string) {
	f, err := os.Open(filepath)
	check(err)
	// Release the file handle when the function returns.
	defer f.Close()

	var v map[string]interface{}
	// Surface decode errors instead of continuing with a nil map.
	check(json.NewDecoder(f).Decode(&v))

	source := v["_source"].(map[string]interface{})
	provider := source["provider"].(map[string]interface{})
	subject := []byte(source["@id"].(string))
	fmt.Printf("\tsubject %x: Provider %s", subject, provider["@id"])

	sourceResource := source["sourceResource"].(map[string]interface{})
	language := sourceResource["language"].([]interface{})
	for _, entry := range language {
		field := entry.(map[string]interface{})
		fmt.Printf("\t%s\n", field["name"])
	}
}
// main announces the start of parsing and runs rec_parser on a single
// hard-coded record file.
func main() {
	const recordPath = "E:/2016/tmp/uiuc-rec1.json"

	fmt.Println("Starting parsing")
	rec_parser(recordPath)
}
"""Python script uses the ijson (https://pypi.python.org/pypi/ijson/) module to
iterate through DPLA JSON files and generate Redis Protocol
(http://redis.io/topics/mass-insert) for mass insertion into a Redis instance.
"""
__author__ = "Jeremy Nelson"
import click
import datetime
import hashlib
import ijson
import sys
import redis
import urllib.parse
from rdflib import Namespace
from urllib.request import urlopen
# Redis client created with the library's default connection settings when the
# module is imported.
# NOTE(review): no function visible in this file reads CACHE -- confirm whether
# it is still needed.
CACHE = redis.StrictRedis() # Local Redis cache for key checkup
# Base namespace that expand_to_url() joins sourceResource keys against.
DPLA = Namespace("http://dp.la/items/context")
class ConversionError(Exception):
    """Error raised when a DPLA JSON to RESP conversion cannot proceed.

    The message passed to the constructor is kept on ``value`` and its
    ``repr`` is what ``str()`` returns.
    """

    def __init__(self, error_message):
        # Hand the message to Exception as well so ``args`` is populated;
        # without this the exception carries an empty ``args`` tuple.
        super().__init__(error_message)
        self.value = error_message

    def __str__(self):
        return repr(self.value)
def expand_to_url(key):
    """Return ``key`` resolved against the DPLA namespace as a URL string."""
    base_url = str(DPLA)
    return urllib.parse.urljoin(base_url, key)
def generate_redis_protocol(cmd):
    """Serialise one Redis command into its RESP text representation.

    Args:
        cmd -- sequence of strings: the command name followed by its arguments

    Returns:
        A string of the form ``*<argc>\\r\\n`` followed by
        ``$<length>\\r\\n<arg>\\r\\n`` for every argument.
    """
    parts = ["*{}\r\n".format(len(cmd))]
    for arg in cmd:
        # RESP bulk-string lengths are counted in bytes, not characters, so
        # measure the UTF-8 encoding. errors="ignore" matches the way the
        # output file in this script is opened, so the length equals what is
        # actually written.
        byte_length = len(arg.encode("utf-8", errors="ignore"))
        parts.append("${}\r\n{}\r\n".format(byte_length, arg))
    return "".join(parts)
def handle_source_resource(prefix, event, value, source_resource=None):
    """Record one parser event in a source-resource dict.

    Only events whose name starts with ``"string"`` are stored. The key is
    the last component of ``prefix`` after dropping the first two components
    and the first ``"item"`` component (if any). The value is stored as a
    one-element list, replacing whatever was stored under that key before.

    Args:
        prefix -- dotted path of the event, e.g.
                  ``item._source.sourceResource.title``
        event -- event name reported by the parser
        value -- value attached to the event
        source_resource -- dict to update in place; a new dict is created
                           when omitted

    Returns:
        The updated dict (the same object as ``source_resource`` when one
        was supplied).
    """
    # A fresh dict per call: a ``{}`` default would be shared by every call
    # that omits the argument and would leak fields between records.
    if source_resource is None:
        source_resource = {}
    if not event.startswith("string"):
        return source_resource
    keys = prefix.split(".")[2:]
    # Drop the first "item" component before picking the key.
    if 'item' in keys:
        keys.remove('item')
    # Nothing left to use as a key: leave the dict untouched instead of
    # failing with an IndexError.
    if not keys:
        return source_resource
    source_resource[keys[-1]] = [value]
    return source_resource
def process_dpla(parser, output):
    """Function takes an ijson parser and an output filename and generates
    RESP and saves to file.

    Every ``(prefix, event, value)`` triple under
    ``item._source.sourceResource`` is collected into the current record.
    When the record's closing ``end_map`` event arrives, the record is
    converted with resource_to_resp() and appended to the output file. A
    ``.`` is written to stdout every 10 records and the running count every
    100 records.

    Args:
        parser -- ijson parser
        output -- output RESP filename
    """
    dpla_doc = {}
    count = 0
    # Open the output once for the whole run instead of once per record.
    with open(output, "a", encoding='utf8', errors='ignore') as output_resp:
        for prefix, event, value in parser:
            if prefix == 'item' and event.startswith("start_map"):
                dpla_doc = {}
            if prefix.startswith("item._source.sourceResource"):
                # Always pass this record's own dict so no state is shared
                # between records.
                dpla_doc['sourceResource'] = handle_source_resource(
                    prefix,
                    event,
                    value,
                    dpla_doc.setdefault('sourceResource', {}))
            if prefix == "item" and event.startswith("end_map"):
                count += 1
                if not count % 10:
                    sys.stdout.write(".")
                if not count % 100:
                    sys.stdout.write(" {} ".format(count))
                output_resp.write(resource_to_resp(dpla_doc))
# SHA1 digests of the subjects and predicates whose lookup ``SET`` command has
# already been generated, so each of them is emitted only once per run.
ADDED_SHA1 = set()


def resource_to_resp(source_resource):
    """Function takes a source resource and returns Redis protocol as a
    string.

    The ``@id`` entry is removed from ``source_resource['sourceResource']``
    and used as the subject. For every remaining key (expanded to a
    predicate URL) and every value under it, the result contains a ``SET``
    mapping the value's SHA1 to the value and three ``SADD`` commands that
    index the triple by subject, by predicate and by object. ``SET``
    commands for the subject and for each predicate are emitted only the
    first time their SHA1 is seen.

    Args:
        source_resource -- Dict of source resource; its ``sourceResource``
                           entry must map ``@id`` and every other key to a
                           list of strings

    Raises:
        KeyError -- when ``sourceResource`` or its ``@id`` entry is missing
    """
    fields = source_resource['sourceResource']
    subject = ''.join(fields.pop('@id'))
    subject_sha1 = hashlib.sha1(subject.encode()).hexdigest()
    # Collect the commands first and serialise them in one pass at the end.
    commands = []
    if subject_sha1 not in ADDED_SHA1:
        commands.append(["SET", subject_sha1, subject])
        ADDED_SHA1.add(subject_sha1)
    for key, value in fields.items():
        predicate_url = expand_to_url(key)
        predicate_sha1 = hashlib.sha1(predicate_url.encode()).hexdigest()
        if predicate_sha1 not in ADDED_SHA1:
            commands.append(["SET", predicate_sha1, predicate_url])
            ADDED_SHA1.add(predicate_sha1)
        for row in value:
            obj_sha1 = hashlib.sha1(row.encode()).hexdigest()
            commands.append(["SET", obj_sha1, row])
            commands.append(
                ["SADD",
                 "{}:pred-obj".format(subject_sha1),
                 "{}:{}".format(predicate_sha1, obj_sha1)])
            commands.append(
                ["SADD",
                 "{}:subj-obj".format(predicate_sha1),
                 "{}:{}".format(subject_sha1, obj_sha1)])
            commands.append(
                ["SADD",
                 "{}:subj-pred".format(obj_sha1),
                 "{}:{}".format(subject_sha1, predicate_sha1)])
    return ''.join(generate_redis_protocol(command) for command in commands)
@click.command()
@click.option('--filepath', help='Full file path to DPLA JSON file', default=None)
@click.option('--url', help='URL to DPLA JSON', default=None)
@click.option('--output', help="Output filepath for RESP", default="output.resp")
def convert_dpla(filepath, url, output):
    """Function takes either a file path to DPLA JSON or an URL, iterates through
    the JSON using ijson module, and generates RESP for mass insertion into a single
    REDIS instance.
    Args:
    filepath -- File path to DPLA JSON
    url -- URL to DPLA JSON
    output -- Output file for RESP, defaults to output.resp
    """
    # The docstring above doubles as the command's --help text, so its
    # wording is left as it was.
    # Raises ConversionError unless exactly one of filepath and url is given.
    if filepath is not None and url is not None:
        raise ConversionError("Both Filename and Url cannot have values")
    if filepath is None and url is None:
        raise ConversionError("Need either a Filename or URL")
    if filepath is not None:
        source = open(filepath, encoding='utf8', errors='ignore')
    else:
        source = urlopen(url)
    # Close the input file or HTTP response once the conversion is finished
    # or has failed.
    with source:
        parser = ijson.parse(source)
        start = datetime.datetime.utcnow()
        print("Starting conversion from DPLA JSON to RESP at {}".format(start.isoformat()))
        process_dpla(parser, output)
        end = datetime.datetime.utcnow()
    # total_seconds() covers the whole duration; the ``seconds`` attribute
    # alone leaves out whole days.
    print("Finished conversion of DPLA to RESP at {}, total time {} minutes".format(
        end.isoformat(),
        (end - start).total_seconds() / 60.0))
# Run the click command only when this file is executed as a script, not
# when it is imported.
if __name__ == '__main__':
    convert_dpla()
ijson>=2.0
click>=5.0
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment