# Source: GitHub gist benosteen/7dd20109bbdf7716218ba73279c70a3c (last active November 9, 2016 18:41).
# GitHub notice: this file contains bidirectional Unicode text that may be interpreted or compiled
# differently than what appears below. To review, open the file in an editor that reveals hidden
# Unicode characters.
# NB: this will flatten the data. Some fields (authors, pdf, imgs) have extra data that will be lost if you include them in this export.
# For example, the "authors" field has a variety of nuances (creator, editor, etc.) that will be lost.
# Intended for use with https://dx.doi.org/10.21250/DB21
# MIT Licence 2016
import csv
import json
# Path of the CSV file that the export is written to.
EXPORTFILENAME = "book_data.csv"

# Columns of the exported CSV, in output order. Each name is also the key that
# is looked up in every record of the JSON input.
FIELDS = [
    'datefield',
    'shelfmarks',
    'title',
    'publisher',
    'edition',
    'flickr_url_to_book_images',
    'place',
    'issuance',
    'authors',
    'identifier',
    'corporate',
    'date',
    # 'imgs' and 'pdf' are left disabled: they carry extra data that would be
    # lost when flattened into a single CSV cell.
    # 'imgs', 'pdf',
]
def flatten(item):
    """Yield every leaf value contained in a nested JSON-style structure.

    Dicts contribute their values (keys are discarded) and lists/tuples
    contribute their elements; both are walked recursively, in iteration
    order. Anything else is treated as a leaf.

    Args:
        item: A dict, list, tuple, string, other scalar, or None.

    Yields:
        One text value per leaf. Strings are yielded whole and unchanged;
        other scalars (numbers, booleans) are converted to text so the
        result can always be passed to ``u",".join``. ``None`` yields
        nothing.
    """
    if item is None:
        return

    if isinstance(item, dict):
        # ``values()`` exists on both Python 2 and 3, unlike ``itervalues()``.
        children = item.values()
    elif isinstance(item, (list, tuple)):
        children = item
    else:
        # A string is a leaf: it must not be iterated, or it would be split
        # into individual characters.
        if isinstance(item, (type(u""), bytes)):
            yield item
        else:
            yield u"{0}".format(item)
        return

    for child in children:
        for value in flatten(child):
            yield value
# Load the whole JSON export into memory. The loop below treats it as a
# sequence of dicts, one per book, keyed by the names in FIELDS.
with open("book_data.json", "r") as bkd:
    doc = json.load(bkd)

# This script targets Python 2, whose csv module writes byte strings and
# needs the file opened in binary mode; text mode produces doubled "\r" row
# endings on Windows.
with open(EXPORTFILENAME, "wb") as exfh:
    outdoc = csv.DictWriter(exfh, fieldnames=FIELDS)
    outdoc.writeheader()
    for row_number, row in enumerate(doc, 1):
        line_of_data = {}
        for f in FIELDS:
            # A missing field or a JSON null becomes an empty cell instead
            # of aborting the export.
            value = row.get(f)
            if value is None:
                text = u""
            elif isinstance(value, basestring):
                text = value
            elif isinstance(value, (list, tuple, dict)):
                # Nested data is collapsed into one comma-separated cell.
                text = u",".join(flatten(value))
            else:
                # Numbers and booleans are written as their text form.
                text = u"{0}".format(value)
            line_of_data[f] = text.encode("utf-8")
        outdoc.writerow(line_of_data)
        # Progress report every 1000 rows.
        if row_number % 1000 == 0:
            print("Processing row {0}".format(row_number))
# (GitHub page footer: sign up for free to join this conversation on GitHub; already have an account? Sign in to comment.)