# markmacgillivray/bnb2bibjson.py

## bnb2bibjson.py
# Used to convert the data at: http://thedatahub.org/dataset/jiscopenbib-bl_bnb-1
# to a JSON format suitable for importing into BibServer

# NOTE - there will be an error in the output files. I noticed this after running,
# so used another script to fix - see the attached file
# also, the final file on this gist shows how to upload to an elasticsearch.
# I also made some changes to the JSON in that file before indexing it.

# these bits and pieces could be put into one file and done without the various writing to disk.
# But it was fine for the way I was doing this (intermittently)

# this is a conversion of https://gist.github.com/1634031 that reads and writes to files
# put the BNB data in a folder called xml then run this
# the files will be converted to bibjson into the json folder

import os, sys, json, re
from xml.etree.ElementTree import iterparse

# Convert every RDF/XML file found in storedir into a BibJSON array written to
# targetdir as <n>.json, where n is the 1-based position of the file in the
# directory listing.
#
# Records are separated by commas (no trailing comma) and empty chunks are
# skipped, so each output file is a valid JSON array.  The original loop wrote
# a ',' after every chunk and an empty '{}' whenever an event arrived outside
# any rdf:Description, which produced output ending in '},{},]'.
targetdir = 'json/'
storedir = 'xml/'
dirList = os.listdir(storedir)

RDF_NS = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}'
DC_NS = '{http://purl.org/dc/terms/}'
SKOS_NS = '{http://www.w3.org/2004/02/skos/core#}'

RDF_DESCRIPTION = RDF_NS + 'Description'
RDF_RESOURCE = RDF_NS + 'resource'
DC_TITLE = DC_NS + 'title'
SKOS_IN_SCHEME = SKOS_NS + 'inScheme'

# dcterms elements whose text content is collected into the record.
COLL_TYPES = ('type', 'contributor', 'publisher', 'issued', 'coverage',
              'language', 'extent', 'description', 'subject', 'identifier')
# Fully qualified tag -> short field name: one dict lookup per event instead
# of formatting and comparing ten tag strings.
COLL_TAGS = dict((DC_NS + name, name) for name in COLL_TYPES)

# Raw strings so that \d is unambiguously a regex escape.
LIFESPAN_RE = re.compile(r'(\d\d\d\d)-(\d\d\d\d)')
YEAR_RE = re.compile(r'(\d\d\d\d)')


def _build_record(chunk, comp_types):
    """Return the BibJSON dict for one collected chunk.

    chunk maps a field name to the list of values collected for it.
    comp_types is updated in place: every field that carried more than one
    value is flagged with True.

    Field handling:
      identifier  -> list of {'id': ..., 'type': 'BNB' | 'ISBN'}; the type is
                     omitted when the value has neither known prefix
      contributor -> appended to 'author' as {'name': ...} plus 'birth' and
                     'death' when four-digit years are found in the name
      subject     -> list of dicts; plain strings are wrapped as {'about': ...}
      coverage, type -> list kept as is
      anything else  -> non-empty values joined with newlines
    """
    record = {}
    for key, values in chunk.items():
        if key == 'identifier':
            idents = []
            for value in values:
                if value.startswith('GB'):
                    idents.append({'id': value, 'type': 'BNB'})
                elif value.startswith('URN:ISBN:'):
                    idents.append({'id': value, 'type': 'ISBN'})
                else:
                    idents.append({'id': value})
            record[key] = idents
        elif key == 'contributor':
            for value in values:
                person = {'name': value}
                # Prefer a "birth-death" pair; otherwise treat the first
                # four-digit run as the birth year.
                found = LIFESPAN_RE.search(value)
                if found:
                    person['birth'], person['death'] = found.groups()
                else:
                    found = YEAR_RE.search(value)
                    if found:
                        person['birth'] = found.group(1)
                record.setdefault('author', []).append(person)
        elif key == 'subject':
            record[key] = [value if isinstance(value, dict) else {'about': value}
                           for value in values]
        elif key in ('coverage', 'type'):
            record[key] = values
        else:
            record[key] = '\n'.join(value for value in values if value)
        if len(values) > 1:
            comp_types[key] = True
    return record


count = 0
for filename in dirList:
    count += 1
    print('conversion ' + str(count) + ' - doing file ' + filename)
    # The context manager closes the output file even if parsing fails midway.
    with open(targetdir + str(count) + '.json', 'w') as outfile:
        source = storedir + filename
        context = iter(iterparse(source, events=("start", "end")))
        # The first event is the start of the root element; keep a handle on
        # it so already-processed children can be dropped while streaming.
        event, root = next(context)
        idx = 0                   # number of records written to this file
        desc_buf = []             # one entry per currently open rdf:Description
        chunk = {}                # field name -> values for the current record
        text_collect = []         # text fragments of the field being read
        collecting_text = False
        subject_in_scheme = None  # rdf:resource of the last skos:inScheme seen
        COMP_TYPES = {}           # fields seen with more than one value
        outfile.write('[')
        for event, elem in context:
            root.clear()
            if event == 'start':
                if elem.tag == RDF_DESCRIPTION:
                    desc_buf.append(True)
                # NOTE(review): the ElementTree docs say an element's text is
                # not guaranteed to be populated on 'start' events; reading it
                # here is kept exactly as the original did - confirm against
                # real data before relying on titles/text being complete.
                if elem.tag == DC_TITLE:
                    chunk['title'] = [elem.text]
                if elem.tag in COLL_TAGS:
                    collecting_text = True
                if collecting_text and elem.text:
                    text = elem.text.strip()
                    if text:
                        text_collect.append(text)
            elif event == 'end':
                if elem.tag == RDF_DESCRIPTION:
                    desc_buf.pop()
                if elem.tag == SKOS_IN_SCHEME:
                    scheme = elem.attrib.get(RDF_RESOURCE)
                    if scheme:
                        subject_in_scheme = scheme
                field = COLL_TAGS.get(elem.tag)
                if field is not None:
                    joined = ' '.join(text_collect)
                    if subject_in_scheme:
                        chunk.setdefault(field, []).append(
                            {'about': joined, 'type': subject_in_scheme})
                    else:
                        chunk.setdefault(field, []).append(joined)
                    collecting_text, text_collect, subject_in_scheme = False, [], None

            # A record is complete once no rdf:Description is open.  Nothing is
            # written while one is still open or when nothing was collected.
            if desc_buf or not chunk:
                continue
            if idx:
                outfile.write(',')
            outfile.write(json.dumps(_build_record(chunk, COMP_TYPES), indent=2))
            idx += 1
            chunk = {}

        outfile.write(']')

## bnb2es.py
# takes files converted from BNB to bibjson and sends them to an elasticsearch index
# this also performs some transforms before sending the data
# remember to set your ES params correctly

# this version uses ijson so pip install ijson and apt-get install yajl-tools

import os, json, ijson, httplib, uuid

# Send every record of every BibJSON file in storedir to elasticsearch, one
# HTTP PUT per record, after a few small transforms:
#   * 'issued' is renamed to 'year'
#   * newlines in 'description' become spaces
#   * the 'URN:ISBN:' prefix is removed from ISBN identifiers
#   * the BNB identifier is copied to 'cid'
#   * 'collection' and 'owner' get defaults, and a fresh uuid becomes 'id'
es_url = "localhost:9200"
es_path = "/bibserver_bnb/record"

storedir = 'json/'
dirList = os.listdir(storedir)

ISBN_PREFIX = 'URN:ISBN:'

count = 0
for filename in dirList:
    count += 1
    print('round ' + str(count) + ' doing file ' + filename)
    # The context manager closes the input file even if an upload raises.
    with open(storedir + filename, 'r') as infile:
        # ijson streams the top-level array one item at a time.
        for record in ijson.items(infile, 'item'):
            if 'issued' in record:
                record['year'] = record.pop('issued')
            if 'description' in record:
                record['description'] = record['description'].replace('\n', ' ')
            for val in record.get('identifier', ()):
                kind = val.get('type', '')
                if kind == 'ISBN':
                    # str.strip('URN:ISBN:') would remove any of those
                    # characters from BOTH ends of the value; drop the
                    # literal prefix instead.
                    if val['id'].startswith(ISBN_PREFIX):
                        val['id'] = val['id'][len(ISBN_PREFIX):]
                elif kind == 'BNB':
                    record['cid'] = val['id']
            # Empty records carry nothing worth indexing.
            if not record:
                continue
            record.setdefault('collection', 'bnb')
            record.setdefault('owner', 'bibsoup')
            record['id'] = uuid.uuid4().hex
            c = httplib.HTTPConnection(es_url)
            try:
                c.request('PUT', es_path + '/' + record['id'], json.dumps(record))
                result = c.getresponse()
                # Anything other than 201 is reported together with the record.
                if result.status != 201:
                    print('%s %s %s' % (result.status, result.reason, record))
            finally:
                # One connection is opened per record; release its socket.
                c.close()

## fixbnb2bibjson.py
# fixes the JSON  output of bnb2bibjson.py
# NOTE: this was done inefficiently to fix the problem after the fact...

import os

# Copy every file from storedir to targetdir, replacing its final line with
# '}]' so that the copy ends with a properly closed JSON array.
targetdir = 'json2/'
storedir = 'json/'
dirList = os.listdir(storedir)

count = 0
for filename in dirList:
    count += 1
    print(str(count))
    # Context managers close both handles even if reading or writing fails.
    with open(storedir + filename, 'r') as infile:
        lines = infile.readlines()
    with open(targetdir + filename, 'w') as outfile:
        # Everything except the last line is copied unchanged.
        outfile.writelines(lines[:-1])
        outfile.write('}]')
	# Used to convert the data at: http://thedatahub.org/dataset/jiscopenbib-bl_bnb-1
	# to a JSON format suitable for importing into BibServer

	# NOTE - there will be an error in the output files. I noticed this after running,
	# so used another script to fix - see the attached file
	# also, the final file on this gist shows how to upload to an elasticsearch.
	# I also made some changes to the JSON in that file before indexing it.

	# these bits and pieces could be put into one file and done without the various writing to disk.
	# But it was fine for the way I was doing this (intermittently)

	# this is a conversion of https://gist.github.com/1634031 that reads and writes to files
	# put the BNB data in a folder called xml then run this
	# the files will be converted to bibjson into the json folder

	import os, sys, json, re
	from xml.etree.ElementTree import iterparse

	# Second copy of the bnb2bibjson conversion.  In the original text of this
	# copy the block indentation had been flattened to a single tab, so the
	# loop structure is restored here.
	#
	# Convert every RDF/XML file found in storedir into a BibJSON array written
	# to targetdir as <n>.json, where n is the 1-based position of the file in
	# the directory listing.
	#
	# Records are separated by commas (no trailing comma) and empty chunks are
	# skipped, so each output file is a valid JSON array.
	targetdir = 'json/'
	storedir = 'xml/'
	dirList = os.listdir(storedir)

	RDF_NS = '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}'
	DC_NS = '{http://purl.org/dc/terms/}'
	SKOS_NS = '{http://www.w3.org/2004/02/skos/core#}'

	RDF_DESCRIPTION = RDF_NS + 'Description'
	RDF_RESOURCE = RDF_NS + 'resource'
	DC_TITLE = DC_NS + 'title'
	SKOS_IN_SCHEME = SKOS_NS + 'inScheme'

	# dcterms elements whose text content is collected into the record.
	COLL_TYPES = ('type', 'contributor', 'publisher', 'issued', 'coverage',
	              'language', 'extent', 'description', 'subject', 'identifier')
	# Fully qualified tag -> short field name: one dict lookup per event.
	COLL_TAGS = dict((DC_NS + name, name) for name in COLL_TYPES)

	# Raw strings so that \d is unambiguously a regex escape.
	LIFESPAN_RE = re.compile(r'(\d\d\d\d)-(\d\d\d\d)')
	YEAR_RE = re.compile(r'(\d\d\d\d)')


	def _build_record(chunk, comp_types):
	    """Return the BibJSON dict for one collected chunk.

	    chunk maps a field name to the list of values collected for it.
	    comp_types is updated in place: every field that carried more than
	    one value is flagged with True.

	    identifier values become {'id': ..., 'type': ...} dicts, contributor
	    values are appended to 'author' with optional 'birth'/'death' years,
	    subject values become dicts, 'coverage' and 'type' stay lists, and
	    every other field is joined into one newline-separated string.
	    """
	    record = {}
	    for key, values in chunk.items():
	        if key == 'identifier':
	            idents = []
	            for value in values:
	                if value.startswith('GB'):
	                    idents.append({'id': value, 'type': 'BNB'})
	                elif value.startswith('URN:ISBN:'):
	                    idents.append({'id': value, 'type': 'ISBN'})
	                else:
	                    idents.append({'id': value})
	            record[key] = idents
	        elif key == 'contributor':
	            for value in values:
	                person = {'name': value}
	                # Prefer a "birth-death" pair; otherwise treat the first
	                # four-digit run as the birth year.
	                found = LIFESPAN_RE.search(value)
	                if found:
	                    person['birth'], person['death'] = found.groups()
	                else:
	                    found = YEAR_RE.search(value)
	                    if found:
	                        person['birth'] = found.group(1)
	                record.setdefault('author', []).append(person)
	        elif key == 'subject':
	            record[key] = [value if isinstance(value, dict) else {'about': value}
	                           for value in values]
	        elif key in ('coverage', 'type'):
	            record[key] = values
	        else:
	            record[key] = '\n'.join(value for value in values if value)
	        if len(values) > 1:
	            comp_types[key] = True
	    return record


	count = 0
	for filename in dirList:
	    count += 1
	    print('conversion ' + str(count) + ' - doing file ' + filename)
	    # The context manager closes the output file even if parsing fails.
	    with open(targetdir + str(count) + '.json', 'w') as outfile:
	        source = storedir + filename
	        context = iter(iterparse(source, events=("start", "end")))
	        # The first event is the start of the root element; keep a handle
	        # on it so processed children can be dropped while streaming.
	        event, root = next(context)
	        idx = 0                   # number of records written to this file
	        desc_buf = []             # one entry per open rdf:Description
	        chunk = {}                # field name -> values for current record
	        text_collect = []         # text fragments of the field being read
	        collecting_text = False
	        subject_in_scheme = None  # rdf:resource of the last skos:inScheme
	        COMP_TYPES = {}           # fields seen with more than one value
	        outfile.write('[')
	        for event, elem in context:
	            root.clear()
	            if event == 'start':
	                if elem.tag == RDF_DESCRIPTION:
	                    desc_buf.append(True)
	                # NOTE(review): the ElementTree docs say an element's text
	                # is not guaranteed to be populated on 'start' events;
	                # confirm against real data.
	                if elem.tag == DC_TITLE:
	                    chunk['title'] = [elem.text]
	                if elem.tag in COLL_TAGS:
	                    collecting_text = True
	                if collecting_text and elem.text:
	                    text = elem.text.strip()
	                    if text:
	                        text_collect.append(text)
	            elif event == 'end':
	                if elem.tag == RDF_DESCRIPTION:
	                    desc_buf.pop()
	                if elem.tag == SKOS_IN_SCHEME:
	                    scheme = elem.attrib.get(RDF_RESOURCE)
	                    if scheme:
	                        subject_in_scheme = scheme
	                field = COLL_TAGS.get(elem.tag)
	                if field is not None:
	                    joined = ' '.join(text_collect)
	                    if subject_in_scheme:
	                        chunk.setdefault(field, []).append(
	                            {'about': joined, 'type': subject_in_scheme})
	                    else:
	                        chunk.setdefault(field, []).append(joined)
	                    collecting_text, text_collect, subject_in_scheme = False, [], None

	            # A record is complete once no rdf:Description is open.
	            if desc_buf or not chunk:
	                continue
	            if idx:
	                outfile.write(',')
	            outfile.write(json.dumps(_build_record(chunk, COMP_TYPES), indent=2))
	            idx += 1
	            chunk = {}

	        outfile.write(']')
	# takes files converted from BNB to bibjson and sends them to an elasticsearch index
	# this also performs some transforms before sending the data
	# remember to set your ES params correctly

	# this version uses ijson so pip install ijson and apt-get install yajl-tools

	import os, json, ijson, httplib, uuid

	# Second copy of the bnb2es upload.  In the original text of this copy the
	# block indentation had been flattened to a single tab, so the loop
	# structure is restored here.
	#
	# Send every record of every BibJSON file in storedir to elasticsearch,
	# one HTTP PUT per record, after renaming 'issued' to 'year', flattening
	# newlines in 'description', removing the 'URN:ISBN:' prefix from ISBN
	# identifiers, copying the BNB identifier to 'cid' and filling in
	# 'collection', 'owner' and a fresh uuid 'id'.
	es_url = "localhost:9200"
	es_path = "/bibserver_bnb/record"

	storedir = 'json/'
	dirList = os.listdir(storedir)

	ISBN_PREFIX = 'URN:ISBN:'

	count = 0
	for filename in dirList:
	    count += 1
	    print('round ' + str(count) + ' doing file ' + filename)
	    # The context manager closes the input file even if an upload raises.
	    with open(storedir + filename, 'r') as infile:
	        # ijson streams the top-level array one item at a time.
	        for record in ijson.items(infile, 'item'):
	            if 'issued' in record:
	                record['year'] = record.pop('issued')
	            if 'description' in record:
	                record['description'] = record['description'].replace('\n', ' ')
	            for val in record.get('identifier', ()):
	                kind = val.get('type', '')
	                if kind == 'ISBN':
	                    # str.strip('URN:ISBN:') would remove any of those
	                    # characters from BOTH ends of the value; drop the
	                    # literal prefix instead.
	                    if val['id'].startswith(ISBN_PREFIX):
	                        val['id'] = val['id'][len(ISBN_PREFIX):]
	                elif kind == 'BNB':
	                    record['cid'] = val['id']
	            # Empty records carry nothing worth indexing.
	            if not record:
	                continue
	            record.setdefault('collection', 'bnb')
	            record.setdefault('owner', 'bibsoup')
	            record['id'] = uuid.uuid4().hex
	            c = httplib.HTTPConnection(es_url)
	            try:
	                c.request('PUT', es_path + '/' + record['id'], json.dumps(record))
	                result = c.getresponse()
	                # Anything other than 201 is reported with the record.
	                if result.status != 201:
	                    print('%s %s %s' % (result.status, result.reason, record))
	            finally:
	                # One connection is opened per record; release its socket.
	                c.close()
	# fixes the JSON output of bnb2bibjson.py
	# NOTE: this was done inefficiently to fix the problem after the fact...

	import os

	# Second copy of the fix-up script.  In the original text of this copy the
	# block indentation had been flattened to a single tab, so the loop
	# structure is restored here.
	#
	# Copy every file from storedir to targetdir, replacing its final line
	# with '}]' so that the copy ends with a properly closed JSON array.
	targetdir = 'json2/'
	storedir = 'json/'
	dirList = os.listdir(storedir)

	count = 0
	for filename in dirList:
	    count += 1
	    print(str(count))
	    # Context managers close both handles even if reading or writing fails.
	    with open(storedir + filename, 'r') as infile:
	        lines = infile.readlines()
	    with open(targetdir + filename, 'w') as outfile:
	        # Everything except the last line is copied unchanged.
	        outfile.writelines(lines[:-1])
	        outfile.write('}]')