Skip to content

Instantly share code, notes, and snippets.

@jaklinger
Created May 27, 2021 12:36
Show Gist options
  • Save jaklinger/5f2ed4fc8d3f752d3de50e63b0445bcd to your computer and use it in GitHub Desktop.
Save jaklinger/5f2ed4fc8d3f752d3de50e63b0445bcd to your computer and use it in GitHub Desktop.
Read CORD19 data
import csv
import shutil
import tarfile
from io import StringIO
from tempfile import TemporaryFile

import requests
# URL template of a historical CORD-19 release tarball; `{date}` is filled in
# by cord_csv() with a release date string such as '2020-03-13'.
URL = 'https://ai2-semanticscholar-cord-19.s3-us-west-2.amazonaws.com/historical_releases/cord-19_{date}.tar.gz'
# Path template of the metadata CSV member inside that tarball; `{date}` is
# filled in by cord_csv() with the same release date string.
CSV_PATH = '{date}/all_sources_metadata_{date}.csv'
def stream_to_file(url, fileobj, timeout=60):
    """Stream the response body of ``url`` into ``fileobj``.

    The body is copied in chunks rather than loaded into memory, and
    ``fileobj`` is rewound to position 0 afterwards so it can be read
    straight away.

    Args:
        url: Address to fetch with an HTTP GET.
        fileobj: Writable, seekable binary file object receiving the bytes.
        timeout: Seconds to wait for the connection and for each socket read
            before giving up (passed to ``requests.get``). ``None`` waits
            forever.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    # Without a timeout a stalled server would block this call indefinitely.
    with requests.get(url, stream=True, timeout=timeout) as response:
        response.raise_for_status()
        # Undo any transport-level Content-Encoding so the bytes written are
        # the payload itself rather than a compressed wrapper around it.
        response.raw.decode_content = True
        shutil.copyfileobj(response.raw, fileobj)
    fileobj.seek(0)  # rewind so callers read from the start
def cord_csv(date):
    """
    Return only the CSV metadata file from the CORD19 dataset for ``date``.

    The release tarball is streamed into a temporary file to keep the
    in-memory processing to a minimum, since the largest CORD tarfile is 9GB.
    Only the CSV member is then read and decoded.

    Args:
        date: Release date string substituted into ``URL`` and ``CSV_PATH``,
            e.g. ``'2020-03-13'``.

    Returns:
        io.StringIO: The decoded CSV text, positioned at the start.

    Raises:
        KeyError: If the tarball has no member at the expected CSV path.
        ValueError: If that member exists but is not a regular file.
    """
    url = URL.format(date=date)
    filename = CSV_PATH.format(date=date)
    with TemporaryFile(suffix='.tar.gz') as fileobj:
        stream_to_file(url, fileobj)
        with tarfile.open(fileobj=fileobj) as tf:
            # Named `member`, not `csv`, so the csv module is not shadowed.
            member = tf.extractfile(filename)
            if member is None:
                # extractfile() returns None for directories, links, etc.
                raise ValueError(
                    '{!r} is not a regular file in {}'.format(filename, url)
                )
            with member:
                # The text is fully read here, so the returned StringIO stays
                # usable after the tarball and temporary file are closed.
                return StringIO(member.read().decode('latin'))
def cord_data(date):
    """Generate one dict per row of the CORD19 metadata CSV for ``date``."""
    with cord_csv(date) as csv_text:
        # DictReader maps each row onto the header names of the first line.
        yield from csv.DictReader(csv_text)
# Eagerly load every metadata row of the 2020-03-13 release into a list.
# NOTE: this triggers the download as soon as this line executes.
data = list(cord_data('2020-03-13'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment