Created
February 17, 2012 11:08
-
-
Save richard-jones/1852712 to your computer and use it in GitHub Desktop.
Convert Medline XML to JSON for indexing in Elasticsearch
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Convert a Medline XML file into a JSON array of flat records.
#
# Reads "medline11n0001.xml", turns every child of the root element into a
# dict of the fields we care about, and writes the dicts to "kv.json" as a
# JSON list (one pretty-printed object per record).
import json

from lxml import etree


def _set_text(doc, key, parent, tag):
    """Copy the text of ``parent``'s first direct ``tag`` child to ``doc[key]``.

    Does nothing when ``parent`` is None or has no such child, so optional
    fields are simply left out of the record.  The stored value is the
    element's ``.text`` and may therefore be None for an empty element.
    """
    if parent is None:
        return
    child = parent.find(tag)
    if child is not None:
        doc[key] = child.text


def _author_name(author):
    """Return "ForeName LastName" for an author element.

    Missing or empty name parts are skipped, so an author with only a
    LastName yields just that name; an element with neither yields "".
    """
    parts = (author.findtext("ForeName"), author.findtext("LastName"))
    return " ".join(part for part in parts if part)


def _record_to_doc(record):
    """Build the JSON-serialisable dict for one record element.

    Every lookup is local to ``record``: nothing is carried over from the
    previously processed record, and a missing optional element only omits
    the field(s) that depend on it.

    Raises AttributeError if the record has no PMID child.
    """
    doc = {}
    doc["collection"] = ["medline"]
    # there is always a pmid; a record without one is an error, not skipped
    doc["pmid"] = record.find("PMID").text

    _set_text(doc, "affiliation", record, "Affiliation")

    # "keywords" and "grants" are always emitted, as empty lists if absent
    keyword_list = record.find("KeywordList")
    doc["keywords"] = []
    if keyword_list is not None:
        doc["keywords"] = [keyword.text for keyword in keyword_list]

    grant_list = record.find("GrantList")
    doc["grants"] = []
    if grant_list is not None:
        for grant in grant_list:
            agency = grant.find("Agency")
            # a grant without an agency is skipped rather than ending the list
            if agency is not None:
                doc["grants"].append(agency.text)

    article = record.find("Article")
    if article is None:
        # every remaining field lives under Article
        return doc

    _set_text(doc, "title", article, "ArticleTitle")
    _set_text(doc, "language", article, "Language")

    # an article may carry several ELocationIDs (e.g. pii and doi)
    for location in article.findall("ELocationID"):
        if location.get("EIdType") == "doi":
            doc["doi"] = location.text
            break

    author_list = article.find("AuthorList")
    doc["author"] = []
    if author_list is not None:
        for author in author_list:
            name = _author_name(author)
            if name:
                doc["author"].append(name)

    journal = article.find("Journal")
    _set_text(doc, "journal", journal, "Title")
    _set_text(doc, "journaliso", journal, "ISOAbbreviation")
    _set_text(doc, "issn", journal, "ISSN")

    issue = journal.find("JournalIssue") if journal is not None else None
    _set_text(doc, "volume", issue, "Volume")

    pub_date = issue.find("PubDate") if issue is not None else None
    _set_text(doc, "year", pub_date, "Year")
    _set_text(doc, "month", pub_date, "Month")

    # the article date, when present, overrides the journal issue date
    article_date = article.find("ArticleDate")
    _set_text(doc, "year", article_date, "Year")
    _set_text(doc, "month", article_date, "Month")
    _set_text(doc, "day", article_date, "Day")

    return doc


# parse first so that a bad input file does not leave a truncated kv.json
tree = etree.parse("medline11n0001.xml")

with open("kv.json", "w+") as outfile:
    outfile.write("[\n")
    # for every item in the xml file, parse it and create a JSON object of it
    for index, record in enumerate(tree.getroot()):
        if index:
            # separate this object from the previous one
            outfile.write(",")
        # dump the dict to JSON
        outfile.write(json.dumps(_record_to_doc(record), indent=4) + "\n\n")
    outfile.write("]")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment