Created
June 27, 2016 18:46
-
-
Save HansNelsen/aeec93279dcd1792855d39fc37bead2e to your computer and use it in GitHub Desktop.
Example of looping over openFDA drug event zip files.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
''' Simple example of reading all of the zip files from the openFDA download
and doing something with them. In this case, we build an index of every
medicinalproduct value, grouped by its drugcharacterization value, with a
count of how often each combination occurred.
'''
from collections import defaultdict | |
import czipfile as zipfile | |
import glob | |
import simplejson as json | |
from os.path import basename | |
# Change this to the location where you downloaded the zip files.
DATA_FILES = './data/export/2016-02-05/drug/event/*/*.zip' | |
def _load_export(filename):
    '''Return the parsed JSON document stored inside one export zip.

    The archive member is expected to be named like the zip file itself
    minus its trailing '.zip' extension.
    '''
    member = basename(filename)
    # Strip only the trailing extension; str.replace would also remove a
    # '.zip' that happens to appear in the middle of the file name.
    if member.endswith('.zip'):
        member = member[:-len('.zip')]
    # Single parenthesized argument: prints the same text whether `print`
    # is the Python 2 statement or the Python 3 function.
    print('Processing file %s from %s' % (member, filename))
    # try/finally instead of `with`: this file aliases czipfile as zipfile
    # and its context-manager support is not visible from here, while
    # close() is part of the plain ZipFile interface.
    archive = zipfile.ZipFile(filename, 'r')
    try:
        handle = archive.open(member)
        try:
            return json.load(handle)
        finally:
            handle.close()
    finally:
        # Release the file handle before moving on to the next archive.
        archive.close()


def build_histogram(pattern=None):
    '''Count (drugcharacterization, medicinalproduct) pairs across zip files.

    pattern -- glob pattern selecting the zip files to read; defaults to
               the module-level DATA_FILES when omitted.

    Returns a nested defaultdict: hist[characterization][product] is the
    number of drug entries seen with that pair. A drug entry lacking either
    key is counted under 'unknown' for that key. Rows without a 'patient'
    entry, or patients without a 'drug' entry, contribute nothing.

    Raises KeyError if a document has no 'results' key, or if an archive
    does not contain the expected member.
    '''
    if pattern is None:
        pattern = DATA_FILES
    hist = defaultdict(lambda: defaultdict(int))
    for filename in glob.glob(pattern):
        json_data = _load_export(filename)
        for row in json_data['results']:
            for drug in row.get('patient', {}).get('drug', []):
                characterization = drug.get('drugcharacterization', 'unknown')
                product = drug.get('medicinalproduct', 'unknown')
                hist[characterization][product] += 1
    return hist
# Build the characterization -> product -> count index from the downloaded
# archives, then save it as indented JSON in the current working directory.
histogram = build_histogram()
with open('histogram.json', 'w') as json_out:
    # Serialize first, then write the finished text in one call.
    json_out.write(json.dumps(histogram, indent=2))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment