Last active
April 1, 2016 21:08
-
-
Save jermnelson/7f234fb736a2115364e2 to your computer and use it in GitHub Desktop.
DPLA JSON to Redis Protocol Script
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
*.swp
*.resp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
// "bufio" | |
// "crypto/sha1" | |
"encoding/json" | |
"fmt" | |
//"io" | |
"os" | |
) | |
// check is the script's fail-fast error handler: it panics with e when e is
// non-nil and returns normally otherwise.
func check(e error) {
	if e != nil {
		panic(e)
	}
}
func rec_parser(filepath string) { | |
f, err := os.Open(filepath) | |
check(err) | |
dpla_decoder := json.NewDecoder(f) | |
var v map[string]interface{} | |
dpla_decoder.Decode(&v) | |
source := v["_source"].(map[string]interface{}) | |
provider := source["provider"].(map[string]interface{}) | |
subject := []byte(source["@id"].(string)) | |
fmt.Printf("\tsubject %x: Provider %s", subject, provider["@id"]) | |
sourceResource := source["sourceResource"].(map[string]interface{}) | |
language := sourceResource["language"].([]interface{}) | |
for row := range language { | |
field := language[row].(map[string]interface{}) | |
fmt.Printf("\t%s\n", field["name"]) | |
} | |
} | |
func main() { | |
fmt.Printf("Starting parsing\n") | |
rec_parser("E:/2016/tmp/uiuc-rec1.json") | |
} | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Python script uses the ijson (https://pypi.python.org/pypi/ijson/) module to | |
iterate through DPLA JSON files and generate Redis Protocol | |
(http://redis.io/topics/mass-insert) for mass insertion into a Redis instance. | |
""" | |
__author__ = "Jeremy Nelson" | |
import click | |
import datetime | |
import hashlib | |
import ijson | |
import sys | |
import redis | |
import urllib.parse | |
from rdflib import Namespace | |
from urllib.request import urlopen | |
# Client for the local Redis instance, intended as a cache for key checkup.
CACHE = redis.StrictRedis()
# Base URL that expand_to_url resolves sourceResource field names against.
DPLA = Namespace("http://dp.la/items/context")
class ConversionError(Exception):
    """Raised when the DPLA JSON to RESP conversion cannot proceed.

    Attributes:
        value -- the error message passed to the constructor
    """

    def __init__(self, error_message):
        # Forward the message to Exception so that ``args`` is populated
        # like it is for any built-in exception.
        super().__init__(error_message)
        self.value = error_message

    def __str__(self):
        """Returns the repr() of the stored message."""
        return repr(self.value)
def expand_to_url(key):
    """Function resolves a field name against the DPLA namespace URL.

    Args:
        key -- field name to resolve

    Returns the string produced by urllib.parse.urljoin with str(DPLA) as
    the base and key as the relative reference.
    """
    return urllib.parse.urljoin(str(DPLA), key)
def generate_redis_protocol(cmd):
    """Function serializes one Redis command into the Redis protocol (RESP).

    Args:
        cmd -- sequence of strings: the command name followed by its arguments

    Returns the command as a RESP array of bulk strings: "*<count>" followed
    by "$<length>" and the argument itself for every element, each terminated
    by CRLF.
    """
    parts = ["*{}\r\n".format(len(cmd))]
    for arg in cmd:
        # A bulk string is prefixed with its length in BYTES. Measure the
        # UTF-8 encoding rather than the character count, otherwise any
        # non-ASCII argument gets a too-short length and corrupts the stream.
        # errors="ignore" mirrors how the output file is written, so the
        # length matches the bytes that actually reach the file.
        byte_length = len(arg.encode("utf-8", errors="ignore"))
        parts.append("${}\r\n{}\r\n".format(byte_length, arg))
    return "".join(parts)
def handle_source_resource(prefix, event, value, source_resource=None):
    """Helper function takes an event and value, extracts key and value

    Args:
        prefix -- dotted ijson prefix of the current event
        event -- ijson event name; only events starting with "string" are
                 recorded
        value -- value carried by the event
        source_resource -- dict to add the field to; a new dict is created
                           when omitted

    Returns source_resource. For a string event the last component of the
    prefix (after dropping the first two components and the first 'item'
    array marker) is set to a one-element list holding value.
    """
    # A fresh dict per call: a shared ``{}`` default would carry the fields
    # of one record over into every later record.
    if source_resource is None:
        source_resource = {}
    if not event.startswith("string"):
        return source_resource
    keys = prefix.split(".")[2:]
    # Remove item
    if 'item' in keys:
        keys.remove('item')
    if keys:
        # NOTE(review): a repeated key replaces the earlier value list
        # instead of extending it -- confirm that is intended.
        source_resource[keys[-1]] = [value]
    return source_resource
def process_dpla(parser, output):
    """Function takes an ijson parser and an output filename and generates
    RESP and saves to file.

    Args:
        parser -- ijson parser
        output -- output RESP filename

    The RESP of each top-level item is appended to output when the item's
    closing end_map event is seen. A "." is written to stdout every 10 items
    and the running count every 100 items.
    """
    dpla_doc = {}
    count = 0
    # Open the output once instead of once per record. newline='' turns off
    # newline translation so the protocol's "\r\n" terminators are written
    # unchanged on every platform.
    with open(output, "a", encoding='utf8', errors='ignore',
              newline='') as output_resp:
        for prefix, event, value in parser:
            if prefix == 'item' and event.startswith("start_map"):
                dpla_doc = {}
            if prefix.startswith("item._source.sourceResource"):
                # Always hand over an explicit dict (a new one for the first
                # field of a record) so no state is shared between records.
                dpla_doc['sourceResource'] = handle_source_resource(
                    prefix,
                    event,
                    value,
                    dpla_doc.get('sourceResource', {}))
            if prefix == "item" and event.startswith("end_map"):
                count += 1
                if not count % 10:
                    sys.stdout.write(".")
                if not count % 100:
                    sys.stdout.write(" {} ".format(count))
                output_resp.write(resource_to_resp(dpla_doc))
# SHA1 digests of the subjects and predicates whose SET command has already
# been generated. resource_to_resp reads and updates this set; without a
# definition the first call fails with NameError.
ADDED_SHA1 = set()


def resource_to_resp(source_resource):
    """Function takes a source resource and returns Redis protocol as a
    string.

    Args:
        source_resource -- Dict of source resource

    The '@id' entry is popped from source_resource['sourceResource'] and used
    as the subject; every remaining key is expanded to a predicate URL and
    each of its values becomes an object. Subject, predicate and object are
    stored under their SHA1 hex digest with SET, and each triple is indexed
    with three SADD commands (by subject, by predicate and by object).
    SET commands for a subject or predicate are generated only the first
    time that digest is seen.

    Raises KeyError when 'sourceResource' or its '@id' entry is missing.
    """
    commands = []
    resource = source_resource['sourceResource']
    subject = ''.join(resource.pop('@id'))
    subject_sha1 = hashlib.sha1(subject.encode()).hexdigest()
    if subject_sha1 not in ADDED_SHA1:
        commands.append(
            generate_redis_protocol(["SET", subject_sha1, subject]))
        ADDED_SHA1.add(subject_sha1)
    triple_subj_key = "{}:pred-obj".format(subject_sha1)
    for key, value in resource.items():
        predicate_url = expand_to_url(key)
        predicate_sha1 = hashlib.sha1(predicate_url.encode()).hexdigest()
        if predicate_sha1 not in ADDED_SHA1:
            commands.append(
                generate_redis_protocol(
                    ["SET",
                     predicate_sha1,
                     predicate_url]))
            ADDED_SHA1.add(predicate_sha1)
        triple_pred_key = "{}:subj-obj".format(predicate_sha1)
        for row in value:
            obj_sha1 = hashlib.sha1(row.encode()).hexdigest()
            commands.append(generate_redis_protocol(["SET", obj_sha1, row]))
            commands.append(
                generate_redis_protocol(
                    ["SADD",
                     triple_subj_key,
                     "{}:{}".format(predicate_sha1, obj_sha1)]))
            commands.append(
                generate_redis_protocol(
                    ["SADD",
                     triple_pred_key,
                     "{}:{}".format(subject_sha1, obj_sha1)]))
            triple_obj_key = "{}:subj-pred".format(obj_sha1)
            commands.append(
                generate_redis_protocol(
                    ["SADD",
                     triple_obj_key,
                     "{}:{}".format(subject_sha1, predicate_sha1)]))
    return ''.join(commands)
@click.command()
@click.option('--filepath', help='Full file path to DPLA JSON file', default=None)
@click.option('--url', help='URL to DPLA JSON', default=None)
@click.option('--output', help="Output filepath for RESP", default="output.resp")
def convert_dpla(filepath, url, output):
    """Function takes either a file path to DPLA JSON or an URL, iterates through
    the JSON using ijson module, and generates RESP for mass insertion into a single
    REDIS instance.

    Args:
        filepath -- File path to DPLA JSON
        url -- URL to DPLA JSON
        output -- Output file for RESP, defaults to output.resp
    """
    # Exactly one input source must be given; ConversionError is raised
    # when both or neither are supplied.
    if filepath is not None and url is not None:
        raise ConversionError("Both Filename and Url cannot have values")
    if filepath is None and url is None:
        raise ConversionError("Need either a Filename or URL")
    if filepath is not None:
        source = open(filepath, encoding='utf8', errors='ignore')
    else:
        source = urlopen(url)
    # Close the file or HTTP response once the conversion finishes or fails.
    with source:
        start = datetime.datetime.utcnow()
        print("Starting conversion from DPLA JSON to RESP at {}".format(start.isoformat()))
        process_dpla(ijson.parse(source), output)
    end = datetime.datetime.utcnow()
    # total_seconds() covers the whole elapsed time; the ``seconds``
    # attribute alone leaves out the days component of the timedelta.
    print("Finished conversion of DPLA to RESP at {}, total time {} minutes".format(
        end.isoformat(),
        (end-start).total_seconds() / 60.0))
# Run the click command only when the file is executed as a script.
if __name__ == '__main__':
    convert_dpla()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
ijson>=2.0
click>=5.0
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment