Dataverse DOI Scraper
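To run the script you need `wget` on your `PATH` plus the Python packages it imports; a minimal setup, following the script's own install hints (with `numpy`, which it also imports):

$ pip install pyDataverse rich numpy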
#!/usr/bin/env python3
# SPDX-FileContributor: 2023 The Aerospace Corporation
# SPDX-DocumentComment: Approved 2023-10-23 Request # OSS23-0008
# SPDX-License-Identifier: LGPL-3.0-or-later
"""Dataverse Scraper"""
import argparse
from pathlib import Path
import hashlib
import logging
import subprocess
import numpy as np
try:
    import pyDataverse
    from pyDataverse.api import NativeApi
except ImportError:
    print("pyDataverse not found, run\n $ pip install pyDataverse\n")
    raise
try:
    from rich.logging import RichHandler
except ImportError:
    print("rich logging not found, run\n $ pip install rich\n")
    raise

# module-level logger so `scrape` also works when imported, not only when run as a script
log = logging.getLogger("scraper")


def md5sum(path: Path) -> str:
    """Compute a file's MD5 in chunks so large files are not read into memory at once."""
    digest = hashlib.md5()
    with open(path, "rb") as handle:
        for chunk in iter(lambda: handle.read(1 << 20), b""):
            digest.update(chunk)
    return digest.hexdigest()

def scrape(base_path: str, base_url: str, doi: str, shuffle: bool = True) -> None:
    """
    Download a dataset from Dataverse by DOI.

    Especially helpful for large datasets or slow connections.

    Features
    --------
    * Skip files that are already complete
    * Retrieve files in random order
    * Resume incomplete downloads
    * Check the MD5 checksum of each file when complete

    Arguments
    ---------
    base_path : str
        Relative path where the dataset is saved.
    base_url : str
        Cloud server hosting the dataset, like `https://dataverse.no`.
    doi : str
        Digital object identifier, like `10.18710/Q8OSON`.
    shuffle : bool, default True
        When true, retrieve files in random order so the dataset can be
        inspected as it is downloaded.

    References
    ----------
    [1] https://guides.dataverse.org/en/5.12.1/user/find-use-data.html
    [2] https://pydataverse.readthedocs.io/en/latest/user/basic-usage.html#download-and-save-a-dataset-to-disk

    Future Improvements
    -------------------
    * Currently only supports public datasets; private datasets require an API token.
    * Parse a dataset URL into `base_url` and `doi` automatically (see the sketch after the script).
    * Download multiple files in parallel. In practice this wasn't useful,
      since there was a global connection speed limit.
    """
log.info(f"Scrape {doi} from {base_url}") | |
log.info(f"Destination: {base_path}") | |
try: | |
api = NativeApi(base_url) | |
info = api.get_info_version().json() | |
assert info["status"] == "OK" | |
except AssertionError: | |
log.error("Connection Issue?") | |
raise | |
except OSError: | |
log.error("Proxy Issue?") | |
raise | |
except pyDataverse.exceptions.OperationFailedError: | |
log.error("Dataverse Issue. Service Outage?") | |
raise | |
else: | |
log.info("Dataverse API OK") | |
try: | |
dataset = api.get_dataset(f"doi:{doi}") | |
except pyDataverse.exceptions.OperationFailedError: | |
log.error("DOI not found. Should be formatted like `10.18710/KLISS5`") | |
raise | |
else: | |
log.info("DOI OK") | |
try: | |
files_list = dataset.json()["data"]["latestVersion"]["files"] | |
assert len(files_list) > 0 | |
except KeyError: | |
log.error("Expected dataset to have different structure.") | |
raise | |
except AssertionError: | |
log.error("Dataset contains no files.") | |
raise | |
else: | |
log.info(f"Dataset found with {len(files_list)} files.") | |
    if shuffle:
        # shuffle with a fixed seed so repeated runs retrieve files in the same pseudo-random order
        np.random.seed(0xDEADBEEF)
        np.random.shuffle(files_list)
    for fdx, file in enumerate(files_list):
        file_name = file["dataFile"]["filename"]
        file_id = file["dataFile"]["id"]
        file_md5 = file["dataFile"]["md5"]
        file_dir = file.get("directoryLabel", "")
        # create the output directory if necessary
        out_dir = Path(f"{base_path}/{file_dir}/").resolve()
        out_dir.mkdir(parents=True, exist_ok=True)
        out_file = out_dir / file_name
        # skip files that already exist with a matching MD5
        if out_file.exists():
            if file_md5 == md5sum(out_file):
                log.info(f"# {fdx + 1}/{len(files_list)}, {file_name}, id {file_id} OK")
                continue
        # actually get the file
        log.info(f"# {fdx + 1}/{len(files_list)}, {file_name}, id {file_id} WGET")
        # `-c` resumes broken downloads; wget exits once the file is the correct size.
        # Passing an argument list (no shell=True) avoids quoting problems in file paths.
        rc = subprocess.call(
            ["wget", "-c", f"{base_url}/api/access/datafile/{file_id}", "-O", str(out_file)]
        )
        assert rc == 0, "wget did not exit cleanly"
        # verify the download against the MD5 published in the dataset metadata
        try:
            assert file_md5 == md5sum(out_file)
        except AssertionError:
            log.error(f"MD5 mismatch. Delete corrupt file:\n{out_file}")
            raise
        else:
            log.info("MD5 checksum OK")
    log.info("Done")

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("doi", help="Digital object identifier like `10.18710/Q8OSON` or `10.18710/BVG5VY`")
    parser.add_argument("--base_url", default="https://dataverse.no", help="Cloud domain like `https://dataverse.no`")
    parser.add_argument("-o", "--output_path", default=".", help="Where to save the dataset")
    parser.add_argument("-v", "--verbose", action="count", default=0)
    args = parser.parse_args()
    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        handlers=[RichHandler()],
    )
    scrape(base_path=args.output_path, base_url=args.base_url, doi=args.doi)
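One of the docstring's future improvements is parsing a full dataset URL into `base_url` and `doi` automatically. A minimal sketch of how that could work, assuming the usual Dataverse landing-page URL shape (`.../dataset.xhtml?persistentId=doi:...`); the helper name `parse_dataset_url` is hypothetical and not part of the script above:

from urllib.parse import parse_qs, urlparse

def parse_dataset_url(url: str) -> tuple[str, str]:
    """Hypothetical helper: split a Dataverse dataset URL into (base_url, doi).

    Assumes URLs shaped like
    https://dataverse.no/dataset.xhtml?persistentId=doi:10.18710/Q8OSON
    """
    parts = urlparse(url)
    base_url = f"{parts.scheme}://{parts.netloc}"
    # raises KeyError if the URL has no persistentId query parameter
    persistent_id = parse_qs(parts.query)["persistentId"][0]
    # strip the scheme prefix, e.g. "doi:10.18710/Q8OSON" -> "10.18710/Q8OSON"
    return base_url, persistent_id.removeprefix("doi:")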
Running the script will look something like this:
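(Illustrative only: the script filename, file names, ids, and counts below are placeholders, and the log lines simply mirror the script's own log.info messages; real runs also show wget's progress output.)

$ python3 scraper.py 10.18710/Q8OSON -o ./data
INFO  Scrape 10.18710/Q8OSON from https://dataverse.no
INFO  Destination: ./data
INFO  Dataverse API OK
INFO  DOI OK
INFO  Dataset found with 12 files.
INFO  # 1/12, example.csv, id 12345 WGET
INFO  MD5 checksum OK
...
INFO  # 12/12, another_file.h5, id 12356 OK
INFO  Done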