Created June 27, 2023 10:57
-
-
Save pbsds/62177a447abac942cff5741cb71e11f9 to your computer and use it in GitHub Desktop.
Example: `python download-from-unit-bird-no.py https://hdl.handle.net/11250/3059978`
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import requests | |
from requests.utils import parse_url | |
from functools import lru_cache | |
from pathlib import Path | |
import shlex | |
from tqdm.auto import tqdm | |
import re | |
import typer | |
import os | |
import math | |
# Directory containing this script.
HERE = Path(__file__).parent
# Working directory at start-up; used as the default download destination.
CWD = Path.cwd()
def get(url: str, **headers) -> requests.Response:
    """Issue a GET request to *url*, sending the keyword arguments as HTTP headers.

    Each keyword name becomes a header name and its value the header value,
    e.g. ``get(url, Referer="https://example.org/")``.
    """
    response = requests.get(url, headers=headers)
    return response
@lru_cache(maxsize=1)
def auth() -> dict:
    """Return the ``Authorization`` header for anonymous API access.

    Requests a token from the anonymous login endpoint, sending
    bird.unit.no ``Origin``/``Referer`` headers, and wraps the stripped
    response text as a bearer token.  The result is cached after the first
    successful call, so later calls reuse the same token.

    Returns:
        A dict with a single ``Authorization`` key, suitable for passing
        to ``get`` as ``**auth()``.

    Raises:
        requests.HTTPError: if the login endpoint answers with a 4xx/5xx status.
    """
    resp = get(
        "https://api.loke.aws.unit.no/dlr-gui-backend-login/v1/anonymous",
        Origin="https://bird.unit.no",
        Referer="https://bird.unit.no/",
    )
    # Explicit status check: an `assert` would be stripped under `python -O`.
    resp.raise_for_status()
    return {"Authorization": f"Bearer {resp.text.strip()}"}
def resolve_handle(url: str) -> str:
    """Return the redirect target of a handle URL.

    Sends a HEAD request to *url* and returns the value of the response's
    ``Location`` header, i.e. one redirect hop is resolved.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        KeyError: if the response carries no ``Location`` header.
    """
    # Redirects must NOT be followed, otherwise the Location header of the
    # first response is lost.  This is already the default for HEAD in
    # requests; it is spelled out here because the function depends on it.
    resp = requests.head(url, allow_redirects=False)
    # Explicit status check: an `assert` would be stripped under `python -O`.
    resp.raise_for_status()
    return resp.headers["Location"]
def get_download_url(url: str) -> str:
    """Translate a handle or bird.unit.no URL into its content download URL.

    A ``hdl.handle.net`` URL is first resolved through ``resolve_handle``.
    The last path segment of the resulting bird.unit.no URL is used as the
    resource id when querying the resources-content API, and the
    ``features.dlr_content_url`` field of the JSON answer is returned.

    Raises:
        ValueError: if the (resolved) URL is not on bird.unit.no or has no
            resource id in its path.
        requests.HTTPError: if the API answers with a 4xx/5xx status.
    """
    if parse_url(url).host == "hdl.handle.net":
        url = resolve_handle(url)

    parsed = parse_url(url)
    # Raise explicitly instead of asserting: asserts vanish under `python -O`.
    if parsed.host != "bird.unit.no":
        raise ValueError(f"expected a bird.unit.no URL, got: {url}")

    # parse_url yields path=None for a URL without a path component.
    bird_unit_id = (parsed.path or "").rsplit("/", 1)[-1]
    if not bird_unit_id:
        raise ValueError(f"no resource id found in URL: {url}")

    resp = get(
        f"https://api.loke.aws.unit.no/dlr-gui-backend-resources-content/v2/resources/{bird_unit_id}/contents/default",
        **auth(),
    )
    resp.raise_for_status()
    return resp.json()["features"]["dlr_content_url"]
def download(url, target: Path = CWD):
    """Stream the file at *url* into the directory *target* with a progress bar.

    The file name is taken from the ``filename=`` parameter of the
    response's ``Content-Disposition`` header; the progress total comes from
    ``Content-Length`` (0 when the header is absent).

    Args:
        url: Direct download URL.
        target: Existing directory the file is written into.

    Raises:
        requests.HTTPError: if the server answers with a 4xx/5xx status.
        KeyError: if the response has no ``Content-Disposition`` header.
        ValueError: if the header does not yield exactly one usable file name.
    """
    # The response is used as a context manager so the streamed connection
    # is released even when writing fails half-way.
    with requests.get(url, stream=True) as resp:
        # Fail early rather than saving an error page as the download.
        resp.raise_for_status()

        disposition = resp.headers['content-disposition']
        fname, = shlex.split(disposition.split("filename=", 1)[1])

        # The header is server-controlled: keep only the final path
        # component so the file cannot land outside of `target`.
        safe_name = Path(fname).name
        if safe_name in ("", ".", ".."):
            raise ValueError(f"unusable file name in Content-Disposition: {fname!r}")

        size = int(resp.headers.get('content-length', 0))

        # tqdm.wrapattr only wraps the stream and does not close it, so the
        # file gets its own `with` block.
        with (target / safe_name).open('wb') as out:
            with tqdm.wrapattr(
                out, "write",
                unit='B', unit_scale=True, unit_divisor=1024, miniters=1,
                desc=safe_name, total=size,
            ) as f:
                for chunk in resp.iter_content(chunk_size=4096):
                    f.write(chunk)
# Sample handle URL; none of the functions in this file read this global
# (they all take their own `url` parameter).
url = "https://hdl.handle.net/11250/3059978"

# Command-line application object: shell-completion options are disabled and
# running without arguments prints the help text.
app = typer.Typer(add_completion=False, no_args_is_help=True)
@app.command()
def main(url: str, dir: Path = CWD):
    # CLI entry point: turn the given handle / bird.unit.no URL into its
    # direct content URL, then save that file into `dir`.
    # (Kept as comments rather than a docstring so the generated --help
    # text stays unchanged.)
    download(get_download_url(url), dir)
# Start the command-line interface only when run as a script, not on import.
if __name__ == "__main__":
    app()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
The same thing in bash:
Why is `requests` so bad?