Created
June 14, 2012 08:23
-
-
Save epoz/2928984 to your computer and use it in GitHub Desktop.
Converting DNB Bibliography ntriples to BibJSON
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import json | |
import ntriples | |
import time | |
from datetime import datetime | |
import httplib | |
ES_URL = "localhost:9200"  # host:port handed to httplib.HTTPConnection
ES_PATH = "/bibserver/"  # path prefix every request in this script is sent under


def post_data(data):
    """PUT one record to ``ES_PATH + 'record/' + data['_id']`` as JSON.

    The dict is modified in place before it is serialised: ``collection``,
    ``owner``, ``_created`` and ``_last_modified`` are (over)written.

    data -- dict holding at least the key ``_id``, which ends up in the
            request path.

    Returns the HTTP status code of the response.  A status of 400 or above
    is reported on stderr rather than raised, so one rejected record does not
    abort a long import.  Network errors raised by httplib propagate.
    """
    # A single timestamp for both fields, so they can never differ.
    stamp = datetime.now().strftime("%Y%m%d%H%M%S")
    data['collection'] = 'dnb'
    data['owner'] = 'test'
    data['_created'] = stamp
    data['_last_modified'] = stamp

    conn = httplib.HTTPConnection(ES_URL)
    try:
        conn.request('PUT', ES_PATH + 'record/' + data['_id'], json.dumps(data))
        # Consume the reply: it is the only place a failure becomes visible.
        response = conn.getresponse()
        body = response.read()
        if response.status >= 400:
            sys.stderr.write('PUT of record %s failed: %s %s\n%s\n' % (
                data['_id'], response.status, response.reason, body))
        return response.status
    finally:
        # Release the socket explicitly instead of waiting for garbage
        # collection; this function runs once per record.
        conn.close()
class CountSink(object):
    """Parser sink that groups triples by subject into BibJSON-style records.

    Triples are expected to arrive grouped by subject.  Whenever the subject
    changes, the record collected so far is handed to ``post_data`` and a new
    one is started.  The record of the final subject is therefore still held
    in ``self.data`` when parsing ends; posting it is up to the caller.

    profile -- when true, print a throughput line every 40000 triples.
    """

    DC_TERMS = 'http://purl.org/dc/terms/'
    BIBO = 'http://purl.org/ontology/bibo/'
    ISBD = 'http://iflastandards.info/ns/isbd/elements/'
    RDF_TYPE = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'

    # Dublin Core terms that are copied into the record at all.
    DC_FIELDS = ('title', 'subject', 'identifier', 'creator', 'contributor',
                 'format', 'language', 'extent')
    BIBO_IDENTIFIERS = ('isbn10', 'issn', 'isbn13')
    # ISBD element -> record key.  The lower-case spelling 'p1016' is what
    # this script originally matched; it is kept next to the upper-case form
    # used by its sibling 'P1018' so neither spelling is missed.
    ISBD_FIELDS = {
        'p1016': 'publicationPlace',
        'P1016': 'publicationPlace',
        'P1018': 'publicationDate',
    }

    def __init__(self, profile=False):
        self.profile = profile
        self.length = 0                 # triples stored in records so far
        self.start_time = time.time()   # reference point for the rate report
        self.batch = 0                  # triples seen since the last report
        self.last_id_seen = None        # subject of the record being built
        self.data = {}                  # the record being built

    @staticmethod
    def _record_id(subject):
        """Return a stable hexadecimal record id derived from the subject."""
        # id(subject) is a memory address that CPython reuses as soon as an
        # object is freed, so it neither identifies a subject uniquely nor
        # stays the same between runs.  A digest of the text does both.
        import hashlib
        if isinstance(subject, unicode):
            subject = subject.encode('utf-8')
        return hashlib.md5(subject).hexdigest()

    def _report_progress(self):
        """Count one triple and print the throughput every 40000 triples."""
        self.batch += 1
        if self.batch > 39999:
            self.batch = 0
            duration = time.time() - self.start_time
            print('%s per second at %s' % (int(self.length / duration), self.length))

    def _add(self, key, value):
        """Append ``value`` to the list stored under ``key`` in the record."""
        self.data.setdefault(key, []).append(value)

    def triple(self, s, p, o):
        """Fold one (subject, predicate, object) triple into the current record."""
        if self.profile:
            self._report_progress()

        # Blank-node subjects carry nothing we store.  Skip them before the
        # subject-change logic so they neither cut the surrounding record in
        # two nor create a record that holds only an ``_id``.
        if isinstance(s, ntriples.bNode):
            return

        if s != self.last_id_seen:
            if self.data:
                post_data(self.data)
            self.last_id_seen = s
            self.data = {'_id': self._record_id(s)}

        if p.startswith(self.DC_TERMS):
            term = p[len(self.DC_TERMS):]
            if term in self.DC_FIELDS:
                if term in ('creator', 'contributor'):
                    self._add('author', {'name': o})
                elif term == 'subject':
                    self._add('subject', {'about': o})
                elif term == 'identifier':
                    self._add('identifier', {'id': o})
                else:
                    self._add(term, o)
        elif p.startswith(self.BIBO):
            kind = p[len(self.BIBO):]
            if kind in self.BIBO_IDENTIFIERS:
                self._add('identifier', {'type': kind, 'id': o})
        elif p.startswith(self.ISBD):
            field = self.ISBD_FIELDS.get(p[len(self.ISBD):])
            if field is not None:
                self._add(field, o)
        elif p == self.RDF_TYPE:
            # Keep only the last path segment of the type URI.
            self._add('type', o.split('/')[-1])

        self.length += 1
# Make the Collection object
_stamp = datetime.now().strftime("%Y%m%d%H%M%S")
collection = {
    "owner": "test",
    "_created": _stamp,
    "_last_modified": _stamp,
    "_id": "59e84ee0191543c180b9c318bdde4ad5",
    "collection": "dnb",
    "label": "DEUTSCHEN NATIONALBIBLIOTHEK",
}
c = httplib.HTTPConnection(ES_URL)
try:
    c.request('PUT', ES_PATH + 'collection/' + collection['_id'], json.dumps(collection))
    # Read the reply before closing so the request is known to have completed.
    c.getresponse().read()
finally:
    c.close()

# Stream the N-Triples file named on the command line through the sink.
parser = ntriples.NTriplesParser(sink=CountSink(profile=True))
with open(sys.argv[1]) as source:
    sink = parser.parse(source)

# The sink posts a record only when the *next* subject shows up, so the
# record of the last subject in the file is still pending at this point.
if sink.data:
    post_data(sink.data)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
N-Triples Parser | |
Copyright 2004, Sean B. Palmer, inamidst.com | |
Licensed under GPL 2, W3C, BSD, MIT, or EFL 2 | |
Documentation: | |
http://inamidst.com/proj/rdf/ntriples-doc | |
Command line usage: | |
./ntriples.py <URI> - parses URI as N-Triples | |
./ntriples.py --help - prints out this help message | |
# @@ fully empty document? | |
""" | |
import re | |
# Pattern sources for the three kinds of term.  They are kept as plain
# strings because ``litinfo`` embeds ``uriref``.
uriref = r'<([^:]+:[^\s"<>]+)>'
literal = r'"([^"\\]*(?:\\.[^"\\]*)*)"'
# Optional suffix of a literal: either "@lang" or "^^<datatype>".
litinfo = ''.join((r'(?:@([a-z]+(?:-[a-z0-9]+)*)|\^\^', uriref, r')?'))

# One physical line up to and including its CRLF, CR or LF terminator.
r_line = re.compile(r'([^\r\n]*)(?:\r\n|\r|\n)')
# Optional / mandatory run of blanks between tokens.
r_wspace = re.compile(r'[ \t]*')
r_wspaces = re.compile(r'[ \t]+')
# The full stop closing a statement, with blanks allowed around it.
r_tail = re.compile(r'[ \t]*\.[ \t]*')

r_uriref = re.compile(uriref)
r_nodeid = re.compile(r'_:([A-Za-z][A-Za-z0-9]*)')
r_literal = re.compile(''.join((literal, litinfo)))

# Number of characters requested from the input per read() call.
bufsiz = 2048
# When False, unquote() and uriquote() are replaced further down by
# versions that do no validation.
validate = False
class Node(unicode):
    """Base class of all parsed terms; behaves like its text."""


class URI(Node):
    """A term written as ``<...>``."""


class bNode(Node):
    """A blank node; the text is the label without the ``_:`` prefix."""


class Literal(Node):
    """A literal term whose text is the literal's lexical value.

    ``lang`` and ``dtype`` hold the language tag and datatype URI given to
    the constructor (``None`` when absent).  They do not take part in the
    string value, so comparison, hashing and serialisation only see the
    lexical value.
    """

    def __new__(cls, lit, lang=None, dtype=None):
        obj = unicode.__new__(cls, lit)
        # Keep the qualifiers reachable instead of discarding them.
        obj.lang = lang
        obj.dtype = dtype
        return obj
class Sink(object):
    """Default triple consumer: counts the triples it is given and prints each."""

    def __init__(self):
        # Number of triples received so far.
        self.length = 0

    def triple(self, s, p, o):
        """Count one triple and print it as an ``(s, p, o)`` tuple."""
        self.length = self.length + 1
        print((s, p, o))
class ParseError(Exception):
    """Raised when the input is not acceptable N-Triples."""
# Single-character escapes and the character each one stands for.
quot = {'t': '\t', 'n': '\n', 'r': '\r', '"': '"', '\\': '\\'}
# Printable ASCII except the double quote and the backslash.
r_safe = re.compile(r'([\x20\x21\x23-\x5B\x5D-\x7E]+)')
r_quot = re.compile(r'\\(t|n|r|"|\\)')
# \uXXXX or \UXXXXXXXX, upper-case hex digits only.
r_uniquot = re.compile(r'\\u([0-9A-F]{4})|\\U([0-9A-F]{8})')


def unquote(s):
    """Unquote an N-Triples string, validating every character.

    Raises ParseError for an unknown escape, for a character outside the
    safe set, and for a code point above U+10FFFF.
    """
    pieces = []
    rest = s
    while rest:
        # A run of characters that need no translation.
        plain = r_safe.match(rest)
        if plain is not None:
            pieces.append(plain.group(1))
            rest = rest[plain.end():]
            continue

        # A two-character escape such as \n or \".
        simple = r_quot.match(rest)
        if simple is not None:
            pieces.append(quot[simple.group(1)])
            rest = rest[2:]
            continue

        # Anything left has to be a numeric escape.
        coded = r_uniquot.match(rest)
        if coded is None:
            if rest.startswith('\\'):
                raise ParseError("Illegal escape at: %s..." % rest[:10])
            raise ParseError("Illegal literal character: %r" % rest[0])

        rest = rest[coded.end():]
        short_form, long_form = coded.groups()
        codepoint = int(short_form or long_form, 16)
        if codepoint > 0x10FFFF:
            raise ParseError("Disallowed codepoint: %08X" % codepoint)
        pieces.append(unichr(codepoint))

    return unicode(''.join(pieces))
if not validate:
    # Validation is switched off: replace the checking implementation with
    # a direct codec call.
    def unquote(s):
        """Decode the escapes in ``s`` with the 'unicode-escape' codec."""
        decoded = s.decode('unicode-escape')
        return decoded
# Any single character in the range 0x80-0xFF.
r_hibyte = re.compile(r'([\x80-\xFF])')


def _percent_encode(match):
    """Render the matched character as ``%XX`` (two upper-case hex digits)."""
    return '%%%02X' % ord(match.group(1))


def uriquote(uri):
    """Return ``uri`` with every character in 0x80-0xFF percent-encoded."""
    return r_hibyte.sub(_percent_encode, uri)
if not validate:
    # Validation is switched off: URIs pass through untouched.
    def uriquote(uri):
        """Return ``uri`` unchanged."""
        return uri
class NTriplesParser(object):
    """An N-Triples Parser.

    Usage:
        p = NTriplesParser(sink=MySink())
        sink = p.parse(f) # file; use parsestring for a string

    Every parsed statement is delivered as ``sink.triple(s, p, o)``.
    """

    def __init__(self, sink=None):
        """Use ``sink`` as the triple consumer, or a printing ``Sink``."""
        if sink is None:
            sink = Sink()
        self.sink = sink

    def parse(self, f):
        """Parse f as an N-Triples file and return the sink.

        Raises ParseError when ``f`` has no ``read`` attribute or when a
        line is not valid; the message carries the complete offending line
        and the reason.
        """
        if not hasattr(f, 'read'):
            raise ParseError("Item to parse must be a file-like object.")

        self.file = f
        self.buffer = ''
        while True:
            self.line = self.readline()
            if self.line is None:
                break
            # parseline() eats self.line token by token, so keep the
            # untouched text for the error message.
            raw_line = self.line
            try:
                self.parseline()
            except ParseError as err:
                raise ParseError("Invalid line: %r (%s)" % (raw_line, err))
        return self.sink

    def parsestring(self, s):
        """Parse s as an N-Triples string and return the sink."""
        if not isinstance(s, basestring):
            raise ParseError("Item to parse must be a string instance.")
        from cStringIO import StringIO
        f = StringIO()
        f.write(s)
        f.seek(0)
        return self.parse(f)

    def readline(self):
        """Read an N-Triples line from buffered input.

        Returns the line without its terminator, or None at end of input.
        A last line that lacks a terminator is returned like any other.
        """
        # N-Triples lines end in either CRLF, CR, or LF
        # Therefore, we can't just use f.readline()
        if not self.buffer:
            chunk = self.file.read(bufsiz)
            if not chunk:
                return None
            self.buffer = chunk

        while True:
            m = r_line.match(self.buffer)
            if m:  # the more likely prospect
                self.buffer = self.buffer[m.end():]
                return m.group(1)
            chunk = self.file.read(bufsiz)
            if not chunk:
                # End of input inside a line: hand back what is left rather
                # than rejecting a file whose last line has no newline.
                last_line, self.buffer = self.buffer, ''
                return last_line
            self.buffer += chunk

    def parseline(self):
        """Parse ``self.line`` and pass the triple it holds to the sink.

        Blank lines and ``#`` comment lines are ignored.
        """
        self.eat(r_wspace)
        if (not self.line) or self.line.startswith('#'):
            return  # The line is empty or a comment

        subject = self.subject()
        self.eat(r_wspaces)

        predicate = self.predicate()
        self.eat(r_wspaces)

        objt = self.object()
        self.eat(r_tail)

        if self.line:
            raise ParseError("Trailing garbage")
        self.sink.triple(subject, predicate, objt)

    def peek(self, token):
        """Tell whether the unparsed rest of the line starts with ``token``."""
        return self.line.startswith(token)

    def eat(self, pattern):
        """Consume ``pattern`` at the start of the line and return the match.

        Raises ParseError when the pattern does not match there.
        """
        m = pattern.match(self.line)
        if not m:
            # Report the pattern source, not the compiled object's repr.
            raise ParseError("Failed to eat %s" % pattern.pattern)
        self.line = self.line[m.end():]
        return m

    def subject(self):
        """Parse a subject: a URI reference or a blank node."""
        # @@ Consider using dictionary cases
        subj = self.uriref() or self.nodeid()
        if not subj:
            raise ParseError("Subject must be uriref or nodeID")
        return subj

    def predicate(self):
        """Parse a predicate, which must be a URI reference."""
        pred = self.uriref()
        if not pred:
            raise ParseError("Predicate must be uriref")
        return pred

    def object(self):
        """Parse an object: a URI reference, a blank node or a literal."""
        objt = self.uriref() or self.nodeid() or self.literal()
        # The helpers return False for "not this kind of term".  Testing for
        # that exact value keeps the empty literal "" (a falsy but valid
        # Literal) from being rejected.
        if objt is False:
            raise ParseError("Unrecognised object type")
        return objt

    def uriref(self):
        """Return the URI at the start of the line, or False if none starts there."""
        if self.peek('<'):
            uri = self.eat(r_uriref).group(1)
            uri = unquote(uri)
            uri = uriquote(uri)
            return URI(uri)
        return False

    def nodeid(self):
        """Return the blank node at the start of the line, or False."""
        if self.peek('_'):
            return bNode(self.eat(r_nodeid).group(1))
        return False

    def literal(self):
        """Return the literal at the start of the line, or False."""
        if self.peek('"'):
            lit, lang, dtype = self.eat(r_literal).groups()
            if lang and dtype:
                raise ParseError("Can't have both a language and a datatype")
            lit = unquote(lit)
            return Literal(lit, lang, dtype)
        return False
def parseURI(uri):
    """Fetch ``uri``, parse the response as N-Triples and print the triple count.

    The parser is created without a sink, so it falls back to its default
    one.  The URL handle is closed even when parsing raises.
    """
    import urllib
    parser = NTriplesParser()
    u = urllib.urlopen(uri)
    try:
        sink = parser.parse(u)
    finally:
        u.close()
    print('Length of input: %s' % sink.length)
def main():
    """Command-line entry point.

    With exactly one argument, parse it as a URI; otherwise print the
    module docstring as the help text.
    """
    import sys
    arguments = sys.argv[1:]
    if len(arguments) != 1:
        print(__doc__)
        return
    parseURI(arguments[0])


if __name__ == "__main__":
    main()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os, sys | |
# Split the text read from stdin into files named FILE_PREFIX + number.
# Each file starts with every '@prefix' line seen so far, and a new file is
# only started on a line beginning with '<http://'.
prefixes = []
FILE_PREFIX = 'a'
count = batch = numfiles = bufsize = 0
buf = []
MB = pow(2, 20)


def _write_chunk(index, header_lines, body_lines):
    """Write one output file and report its name on stderr."""
    filename = '%s%s' % (FILE_PREFIX, index)
    with open(filename, 'w') as out:
        out.write(''.join(header_lines))
        out.write('\n')
        out.write(''.join(body_lines))
    sys.stderr.write('Wrote file named %s\n' % filename)


while True:
    line = sys.stdin.readline()
    if not line:
        break  # end of input
    if line.startswith('@prefix'):
        prefixes.append(line)
        continue
    if line.startswith('<http://'):
        count += 1
        batch += 1
        # Cut only here, so whatever follows a '<http://' line stays in the
        # same file as that line.
        if bufsize > (50 * MB):
            batch = 0
            _write_chunk(numfiles, prefixes, buf)
            numfiles += 1
            buf = []
            bufsize = 0
    buf.append(line)
    bufsize += len(line)

# Whatever is still buffered at end of input is the last chunk; without this
# write it would be dropped.
if buf:
    _write_chunk(numfiles, prefixes, buf)
    numfiles += 1

print('%s found' % count)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment