Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
Functions to iterate over the MEDLINE baseline collection files from the NLM. Requires xmltodict and a MEDLINE baseline distribution in gzip format.
import glob
import xmltodict
import sys
import os
import logging
import hashlib
from gzip import GzipFile
from pprint import pprint
# Directory holding the MEDLINE baseline archives: taken from the first
# command-line argument, falling back to a hard-coded local path when the
# script is started without arguments.
try:
    PUBMED_PATH = sys.argv[1]
except IndexError:
    PUBMED_PATH = "/Volumes/HDD/Installers/PubMed/"
def compute_md5_file(file_object, block_size=65535):
    """Return the hexadecimal MD5 digest of the remaining content of a file.

    The file is consumed in chunks so that large archives never have to be
    held in memory at once.

    :param file_object: object with a ``read(size)`` method; it should be
        opened in binary mode so the chunks can be fed to the hash.
    :param block_size: maximum number of bytes requested per ``read`` call.
    :return: the MD5 digest as a lowercase hexadecimal string.
    """
    m_sum = hashlib.md5()
    while True:
        block = file_object.read(block_size)
        # An empty chunk marks end-of-file. Testing truthiness (rather than
        # comparing with '') also matches b'' returned by binary streams.
        if not block:
            break
        m_sum.update(block)
    return m_sum.hexdigest()
def medline_files(path, default_extension=".gz", check_integrity=True):
    """Yield the path of every archive found directly inside ``path``.

    :param path: directory that is searched (non-recursively).
    :param default_extension: suffix a file name must end with to be yielded.
    :param check_integrity: when true, each archive is hashed and compared
        with the checksum stored in the sibling file ``<archive>.md5`` before
        it is yielded.
    :return: generator of file paths, in the order returned by ``glob``.
    :raises ValueError: if a computed MD5 sum differs from the expected one.
    :raises IOError: if ``check_integrity`` is true and the ``.md5`` file of
        an archive cannot be opened.
    """
    filelist = glob.glob(os.path.join(path, "*" + default_extension))
    for each_file in filelist:
        if check_integrity:
            expected_sum_file = each_file + ".md5"
            # The actual md5 sum is the rightmost element contained in the file
            with open(expected_sum_file, "r") as sum_reader:
                expected_sum = sum_reader.read().strip().split()[-1]
            with open(each_file, "rb") as reader:
                file_sum = compute_md5_file(reader)
            if file_sum != expected_sum:
                # Interpolate explicitly: exceptions, unlike logging calls,
                # do not apply %-formatting to their extra arguments.
                raise ValueError(
                    "The md5 sum for %r is incorrect. It should be %r but it is %s."
                    % (each_file, expected_sum, file_sum))
        yield each_file
def parse_articles(medline_filename, callback):
    """Stream-parse one gzip-compressed MEDLINE XML file.

    :param medline_filename: path of the ``.gz`` archive to parse.
    :param callback: callable handed to xmltodict as ``item_callback``; with
        ``item_depth=2`` xmltodict calls it as ``callback(path, item)`` for
        each element at the second nesting level and expects a true return
        value to keep parsing.
    """
    logging.info("Parsing %r", medline_filename)
    # The context manager closes the archive even if parsing is interrupted.
    with GzipFile(medline_filename) as medline_stream:
        xmltodict.parse(medline_stream, item_depth=2, item_callback=callback, dict_constructor=dict)
def test_parse_callback(_, MedlineCitation):
    """Example item callback that pretty-prints selected parts of a record.

    :param _: first callback argument supplied by xmltodict (unused here).
    :param MedlineCitation: mapping holding the parsed record.
    :return: always ``True``; xmltodict stops streaming when an item callback
        returns a false value.
    """
    # NOTE(review): the bodies of both checks were missing from the source;
    # printing the matching section is a reconstruction - confirm.
    if 'MeshHeadingList' in MedlineCitation:
        pprint(MedlineCitation['MeshHeadingList'])
    if 'Journal' in MedlineCitation:
        pprint(MedlineCitation['Journal'])
    return True


# Despite its name this is a parsing callback, not a unit test: keep test
# runners that collect ``test_*`` functions from trying to execute it.
test_parse_callback.__test__ = False
def parse_all_medline_serial(callback):
    """Parse every archive under ``PUBMED_PATH`` sequentially.

    Each file yielded by ``medline_files`` is passed, together with
    ``callback``, to ``parse_articles`` before the next one is requested.

    :param callback: item callback forwarded unchanged to ``parse_articles``.
    """
    for archive_path in medline_files(PUBMED_PATH):
        parse_articles(archive_path, callback)
if __name__ == "__main__":
    # NOTE(review): the guard's body was missing from the source; running the
    # serial parser with the example callback is a reconstruction - confirm.
    parse_all_medline_serial(test_parse_callback)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment