@Teque5
Created October 24, 2023 21:39
Dataverse DOI Scraper
#!/usr/bin/env python3
# SPDX-FileContributor: 2023 The Aerospace Corporation
# SPDX-DocumentComment: Approved 2023-10-23 Request # OSS23-0008
# SPDX-License-Identifier: LGPL-3.0-or-later
"""Dataverse Scraper"""
import argparse
from pathlib import Path
import hashlib
import logging
import subprocess
import numpy as np
try:
    import pyDataverse
    from pyDataverse.api import NativeApi
except ImportError:
    print("pyDataverse not found, run\n $ pip install pyDataverse\n")
    raise
try:
    from rich.logging import RichHandler
except ImportError:
    print("rich logging not found, run\n $ pip install rich\n")
    raise

def scrape(base_path: str, base_url: str, doi: str, shuffle: bool = True) -> None:
    """
    Download from Dataverse by DOI.

    Especially helpful for large datasets or slow connections.

    Features
    --------
    * Skip files that are already complete
    * Get files in random order
    * Resume incomplete downloads
    * Check MD5 checksum of each file when complete

    Arguments
    ---------
    base_path : str
        Relative path where the dataset is saved.
    base_url : str
        Cloud server hosting the dataset, like `https://dataverse.no`.
    doi : str
        Digital object identifier, like `10.18710/Q8OSON`.
    shuffle : bool, default True
        When true, retrieve files in random order so the dataset can be
        inspected as it is retrieved.

    References
    ----------
    [1] https://guides.dataverse.org/en/5.12.1/user/find-use-data.html
    [2] https://pydataverse.readthedocs.io/en/latest/user/basic-usage.html#download-and-save-a-dataset-to-disk

    Future Improvements
    -------------------
    * Currently only supports public datasets. Private datasets require an API token.
    * Parse the dataset URL into `base_url` and `doi` automatically
      (see the sketch in the comments below).
    * Download multiple files in parallel. In practice this wasn't useful
      since there was a global connection speed limit.
    """
log.info(f"Scrape {doi} from {base_url}")
log.info(f"Destination: {base_path}")
try:
api = NativeApi(base_url)
info = api.get_info_version().json()
assert info["status"] == "OK"
except AssertionError:
log.error("Connection Issue?")
raise
except OSError:
log.error("Proxy Issue?")
raise
except pyDataverse.exceptions.OperationFailedError:
log.error("Dataverse Issue. Service Outage?")
raise
else:
log.info("Dataverse API OK")
try:
dataset = api.get_dataset(f"doi:{doi}")
except pyDataverse.exceptions.OperationFailedError:
log.error("DOI not found. Should be formatted like `10.18710/KLISS5`")
raise
else:
log.info("DOI OK")
try:
files_list = dataset.json()["data"]["latestVersion"]["files"]
assert len(files_list) > 0
except KeyError:
log.error("Expected dataset to have different structure.")
raise
except AssertionError:
log.error("Dataset contains no files.")
raise
else:
log.info(f"Dataset found with {len(files_list)} files.")
    if shuffle:
        # shuffle the file list with a fixed seed so files are retrieved
        # in the same pseudo-random order on every run
        np.random.seed(0xDEADBEEF)
        np.random.shuffle(files_list)
    for fdx, file in enumerate(files_list):
        file_name = file["dataFile"]["filename"]
        file_id = file["dataFile"]["id"]
        file_md5 = file["dataFile"]["md5"]
        file_dir = file.get("directoryLabel", "")
        # create directory if necessary
        out_dir = Path(f"{base_path}/{file_dir}/").resolve()
        out_dir.mkdir(parents=True, exist_ok=True)
        out_file = out_dir / file_name
        # check if file already exists and MD5 is okay
        if out_file.exists():
            with open(out_file, "rb") as derp:
                result_md5 = hashlib.md5(derp.read()).hexdigest()
            if file_md5 == result_md5:
                # log and exit early
                log.info(f"# {fdx}/{len(files_list)}, {file_name}, id {file_id} OK")
                continue
        # actually get the file
        log.info(f"# {fdx}/{len(files_list)}, {file_name}, id {file_id} WGET")
        # `-c` continues broken downloads; wget will exit when the file is the correct size
        rc = subprocess.call(
            f'wget -c {base_url}/api/access/datafile/{file_id} -O "{out_file}"',
            shell=True,
        )
        assert rc == 0, "wget did not exit cleanly"
        # verify the MD5 checksum of the downloaded file
        with open(out_file, "rb") as derp:
            result_md5 = hashlib.md5(derp.read()).hexdigest()
        try:
            assert file_md5 == result_md5
        except AssertionError:
            log.error(f"MD5 mismatch. Delete corrupt file:\n{out_file}")
            raise
        else:
            log.info("MD5 checksum OK")
    log.info("Done")


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("doi", help="Digital object identifier like `10.18710/Q8OSON` or `10.18710/BVG5VY`")
    parser.add_argument("--base_url", default="https://dataverse.no", help="Cloud domain like `https://dataverse.no`")
    parser.add_argument("-o", "--output_path", default=".", help="Where to save dataset")
    parser.add_argument("-v", "--verbose", action="count", default=0)
    args = parser.parse_args()
    log = logging.getLogger("scraper")
    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        handlers=[RichHandler()],
    )
    scrape(base_path=args.output_path, base_url=args.base_url, doi=args.doi)
Teque5 commented Oct 24, 2023

Running the script will look something like this:

teque5 ¥ ./get_dataverse_doi.py -o /tmp/trashme/ 10.18710/BVG5VY
[10/24/23 14:42:22] INFO     INFO:scraper:Scrape 10.18710/BVG5VY from https://dataverse.no                                        get_dataverse_doi.py:62
                    INFO     INFO:scraper:Destination: /tmp/trashme/                                                              get_dataverse_doi.py:63
[10/24/23 14:42:23] INFO     INFO:scraper:Dataverse API OK                                                                        get_dataverse_doi.py:78
[10/24/23 14:42:24] INFO     INFO:scraper:DOI OK                                                                                  get_dataverse_doi.py:86
                    INFO     INFO:scraper:Dataset found with 3 files.                                                             get_dataverse_doi.py:98
                    INFO     INFO:scraper:# 0/3, Compilation of survey responses.pdf, id 188649 OK                               get_dataverse_doi.py:122
                    INFO     INFO:scraper:# 1/3, 00_ReadMe.txt, id 188648 OK                                                     get_dataverse_doi.py:122
                    INFO     INFO:scraper:# 2/3, AHP calculation.csv, id 188771 OK                                               get_dataverse_doi.py:122
                    INFO     INFO:scraper:Done                                                                                   get_dataverse_doi.py:143
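
The docstring's Future Improvements list mentions parsing the dataset URL into `base_url` and `doi` automatically. This is not part of the original script, but a minimal sketch of that split, assuming the usual Dataverse landing-page form with a `persistentId` query parameter (e.g. https://dataverse.no/dataset.xhtml?persistentId=doi:10.18710/BVG5VY), could look like this:

from urllib.parse import parse_qs, urlparse

def split_dataset_url(url: str) -> tuple[str, str]:
    """Split a Dataverse dataset landing-page URL into (base_url, doi)."""
    parts = urlparse(url)
    base_url = f"{parts.scheme}://{parts.netloc}"
    # landing pages carry the DOI as ?persistentId=doi:<prefix>/<suffix>
    persistent_id = parse_qs(parts.query)["persistentId"][0]
    return base_url, persistent_id.removeprefix("doi:")

# example: returns ('https://dataverse.no', '10.18710/BVG5VY')
base_url, doi = split_dataset_url("https://dataverse.no/dataset.xhtml?persistentId=doi:10.18710/BVG5VY")

The result could then feed scrape() or the existing --base_url and doi command-line arguments.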
