Skip to content

Instantly share code, notes, and snippets.

@ceteri
Created December 11, 2019 22:02
Show Gist options
  • Save ceteri/bdfb2be4b7ea169c2fc813ab80557623 to your computer and use it in GitHub Desktop.
Save ceteri/bdfb2be4b7ea169c2fc813ab80557623 to your computer and use it in GitHub Desktop.
Python code to handle unicode and codecs properly during CSV => JSON translation
#!/usr/bin/env python
# encoding: utf-8
import codecs
import csv
import json
import unicodedata
# Input CSV; columns are read positionally as
# dataset, doi, journal, search_term, title (after one header row).
filename = "SurveyofDoctorateRecipients_linkages.csv"
# Alternative input file:
#filename = "SurveyofEarnedDoctorates_linkages.csv"

# Number of columns every data row must have.
EXPECTED_COLUMNS = 5


def load_pubs(csv_path):
    """Read the linkage CSV at *csv_path* and return a list of publication dicts.

    The first row is treated as a header and skipped. Each remaining row must
    have exactly five columns, in the order
    ``dataset, doi, journal, search_term, title``. Blank lines are ignored.

    Each returned dict has the keys ``"datasets"`` (a one-element list holding
    the dataset value), ``"title"`` and ``"journal"`` (both NFKD-normalized),
    ``"doi"`` and ``"search_term"``.

    Raises:
        ValueError: if a non-blank row does not have exactly five columns.
    """
    pubs = []

    # Use the built-in open() with newline="" so the csv module performs its
    # own line splitting.  A codecs.open() stream splits lines on every
    # Unicode line boundary (e.g. U+2028, U+0085), which tears apart fields
    # that merely contain such a character.
    with open(csv_path, encoding="utf-8", newline="") as f:
        csv_reader = csv.reader(f, delimiter=",")

        # skip header; the default keeps an empty file from raising StopIteration
        next(csv_reader, None)

        for row in csv_reader:
            if not row:
                # csv.reader yields an empty list for a blank line
                continue

            if len(row) != EXPECTED_COLUMNS:
                raise ValueError(
                    "{}: line {}: expected {} columns, got {}".format(
                        csv_path, csv_reader.line_num, EXPECTED_COLUMNS, len(row)
                    )
                )

            dataset, doi, journal, search_term, title = row

            pubs.append({
                "datasets": [ dataset ],
                "title": unicodedata.normalize("NFKD", title),
                "journal": unicodedata.normalize("NFKD", journal),
                "doi": doi,
                "search_term": search_term
            })

    return pubs


def write_json(pubs, json_path):
    """Serialize *pubs* to *json_path* as UTF-8 JSON.

    Output is indented by two spaces with sorted keys; non-ASCII characters
    are written literally rather than as ``\\uXXXX`` escapes.
    """
    # newline="\n" disables platform newline translation, so the bytes written
    # are the same on every OS.
    with open(json_path, "w", encoding="utf-8", newline="\n") as f:
        json.dump(pubs, f, indent=2, sort_keys=True, ensure_ascii=False)


def main():
    """Convert the CSV named by ``filename`` into ``sample.json``."""
    write_json(load_pubs(filename), "sample.json")


# Run the conversion only when executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment