Created
January 2, 2014 17:36
-
-
Save jherskovic/8222898 to your computer and use it in GitHub Desktop.
Functions to iterate over the Medline Base collection files from the NLM. Requires https://github.com/martinblech/xmltodict and a MEDLINE baseline distribution in gzip format.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import glob | |
import xmltodict | |
import sys | |
import os | |
import logging | |
import hashlib | |
from gzip import GzipFile | |
from pprint import pprint | |
try: | |
PUBMED_PATH = sys.argv[1] | |
except IndexError: | |
PUBMED_PATH = "/Volumes/HDD/Installers/PubMed/ftp.nlm.nih.gov/projects/medleasebaseline/gz/" | |
def compute_md5_file(file_object, block_size=65535): | |
m_sum = hashlib.md5() | |
while True: | |
block = file_object.read(block_size) | |
if block == '': | |
break | |
m_sum.update(block) | |
return m_sum.hexdigest() | |
def medline_files(path, default_extension=".gz", check_integrity=True): | |
filelist = glob.glob(os.path.join(path, "*" + default_extension)) | |
for each_file in filelist: | |
if check_integrity: | |
expected_sum_file = each_file + ".md5" | |
# The actual md5 sum is the righmost element contained in the file | |
expected_sum = open(expected_sum_file, 'rU').read().strip().split()[-1] | |
with open(each_file, "rb") as reader: | |
file_sum = compute_md5_file(reader) | |
if file_sum != expected_sum: | |
raise ValueError("The md5 sum for %r is incorrect. It should be %r but it is %s.", each_file, | |
expected_sum, file_sum) | |
yield each_file | |
return | |
def parse_articles(medline_filename, callback): | |
logging.info("Parsing %r", medline_filename) | |
xmltodict.parse(GzipFile(medline_filename), item_depth=2, item_callback=callback, dict_constructor=dict) | |
def test_parse_callback(_, MedlineCitation): | |
if 'MeshHeadingList' in MedlineCitation: | |
if 'Journal' in MedlineCitation: | |
pprint(MedlineCitation['Journal']) | |
pprint(MedlineCitation['MeshHeadingList']) | |
return True | |
def parse_all_medline_serial(callback): | |
for m in medline_files(PUBMED_PATH): | |
parse_articles(m, callback) | |
if __name__ == "__main__": | |
parse_all_medline_serial(test_parse_callback) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment