# NZ items in ORCID's public data files
# -*- coding: utf-8 -*-
"""
Created on Wed Jul 05 11:53:20 2017
Can we get JSON to do the same tricks as XML, and then pull the results into MongoDB for reporting?
ORCID public data files are tar.gz archives with all the JSON records stored before the XML ones,
so we can speed the process up a little by reading only the JSON and stopping at the first XML file.
The real question is whether the json library is quicker with JSON than ElementTree is with XML.
NB: version 2.0 of ORCID schema
@author: Jason
"""
import os, json
import tarfile
# Default locations used when the script is run directly.
tarpath = 'D:/DATA/json_play.tar.gz'
ERRORLOG_PATH = 'D:/DATA/json_play.error.log'
WRITE_DIR = 'D:/DATA/json_write'
ERROR_DIR = 'D:/DATA/json_error'
COUNTRY = 'NZ'

# Lookup failures that simply mean "this part of the record is absent":
# a missing key, an empty list, or a null where a container was expected.
_MISSING = (KeyError, IndexError, TypeError)


def _person_in_country(data, country):
    """Return True when the record's first listed personal address is in *country*.

    Only the first address entry is inspected; later entries are ignored.
    """
    try:
        first_address = data['person']['addresses']['address'][0]
        return first_address['country']['value'] == country
    except _MISSING:
        # person address doesn't exist
        return False


def _affiliation_in_country(data, group, summary_key, country):
    """Return True when any affiliation under activities-summary/*group* is in *country*.

    *group* / *summary_key* are e.g. 'educations' / 'education-summary'.
    Entries lacking an organization address are skipped individually, so one
    incomplete entry does not hide a matching entry further down the list.
    """
    try:
        summaries = iter(data['activities-summary'][group][summary_key])
    except _MISSING:
        # no affiliations of this kind
        return False
    for summary in summaries:
        try:
            if summary['organization']['address']['country'] == country:
                return True
        except _MISSING:
            continue
    return False


def _record_in_country(data, country):
    """Return True when the person, an education or an employment is in *country*."""
    return (
        _person_in_country(data, country)
        or _affiliation_in_country(data, 'educations', 'education-summary', country)
        or _affiliation_in_country(data, 'employments', 'employment-summary', country)
    )


def main(archive_path=tarpath, errorlog_path=ERRORLOG_PATH, write_dir=WRITE_DIR,
         error_dir=ERROR_DIR, country=COUNTRY):
    """Copy every JSON record with a *country* link out of the tar.gz archive.

    Members are read in archive order. Processing stops at the first '.xml'
    member; members that are not '.json' are skipped. A record matches when
    its personal address, an education or an employment is in *country*;
    matching records are written unchanged to *write_dir* under their base
    file name. Records that cannot be parsed or written are named in
    *errorlog_path* and copied to *error_dir*.

    The output directories must already exist.

    Returns the number of matching records written.
    """
    matched = 0
    with tarfile.open(archive_path, 'r:gz') as tar, \
            open(errorlog_path, 'w') as errorlog:
        for i, tar_info in enumerate(tar, 1):
            # tarfile remembers every TarInfo it has read; dropping them keeps
            # memory flat while streaming through a very large archive.
            tar.members = []
            filename = tar_info.name
            # can stop when we hit the xml
            if filename.endswith('.xml'):
                break
            if not filename.endswith('.json'):
                continue
            # counter is just to monitor progress
            if i % 1000 == 0:
                print('%d %s' % (i, filename))
            filename = filename.split('/')[-1]
            member = tar.extractfile(tar_info)
            if member is None:
                # not a regular file (e.g. a directory entry): nothing to read
                continue
            payload = member.read()
            try:
                # Parsing happens inside the try so that one malformed record
                # is logged instead of aborting the whole run.
                found = _record_in_country(json.loads(payload), country)
                print(filename + ', ' + str(int(found)))
                if found:
                    # Binary mode: the record is copied byte-for-byte.
                    out_path = os.path.join(write_dir, filename)
                    with open(out_path, 'wb') as outfile:
                        outfile.write(payload)
                    matched += 1
            except Exception:
                # Exception (not a bare except) so Ctrl-C still stops the run.
                errorlog.write('Problem with: ' + filename + '\n')
                with open(os.path.join(error_dir, filename), 'wb') as outfile:
                    outfile.write(payload)
    return matched


if __name__ == '__main__':
    main()
# (Gist page footer, not part of the script: "Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment")