Skip to content

Instantly share code, notes, and snippets.

@pbsds
Created June 27, 2023 10:57
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pbsds/62177a447abac942cff5741cb71e11f9 to your computer and use it in GitHub Desktop.
Save pbsds/62177a447abac942cff5741cb71e11f9 to your computer and use it in GitHub Desktop.
Example: `python download-from-unit-bird-no.py https://hdl.handle.net/11250/3059978`
#!/usr/bin/env python
import requests
from requests.utils import parse_url
from functools import lru_cache
from pathlib import Path
import shlex
from tqdm.auto import tqdm
import re
import typer
import os
import math
# Directory that contains this script.
HERE = Path(__file__).parents[0]
# Working directory at start-up; used as the default download destination.
CWD = Path.cwd()
def get(url: str, **headers) -> requests.Response:
    """Send a GET request to *url*, passing the keyword arguments as HTTP headers.

    Each keyword name is used as a header name and its value as the header
    value, e.g. ``get(url, Origin="https://example.org")``.
    """
    response = requests.get(url, headers=headers)
    return response
@lru_cache(maxsize=1)
def auth() -> dict:
    """Fetch an anonymous bearer token and return it as an ``Authorization`` header.

    A successful result is memoised by ``lru_cache``, so later calls reuse the
    same token instead of hitting the login endpoint again.

    Returns:
        A single-entry dict ``{"Authorization": "Bearer <token>"}`` that can be
        splatted into ``get(url, **auth())``.

    Raises:
        requests.HTTPError: If the login endpoint answers with an error status.
    """
    resp = get(
        "https://api.loke.aws.unit.no/dlr-gui-backend-login/v1/anonymous",
        Origin="https://bird.unit.no",
        Referer="https://bird.unit.no/",
    )
    # Raise explicitly instead of ``assert``: assertions are stripped under
    # ``python -O`` and would let an error body be used as the token.
    resp.raise_for_status()
    return {"Authorization": f"Bearer {resp.text.strip()}"}
def resolve_handle(url: str) -> str:
    """Return the redirect target that *url* points to.

    Sends a HEAD request without following redirects and reads the
    ``Location`` header of the response.

    Args:
        url: The URL to resolve (the caller passes ``hdl.handle.net`` URLs).

    Returns:
        The value of the ``Location`` response header.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the response carries no ``Location`` header.
    """
    # The redirect itself is the answer we want, so it must not be followed.
    resp = requests.head(url, allow_redirects=False)
    # Explicit check instead of ``assert``, which is stripped under ``python -O``.
    resp.raise_for_status()
    try:
        return resp.headers["Location"]
    except KeyError:
        raise KeyError(f"no Location header in response from {url}") from None
def get_download_url(url: str) -> str:
    """Translate a handle or bird.unit.no URL into the URL of its default content.

    Args:
        url: Either a ``hdl.handle.net`` URL (resolved through
            ``resolve_handle`` first) or a ``bird.unit.no`` URL. The last path
            segment of the bird.unit.no URL is used as the resource id.

    Returns:
        The ``features.dlr_content_url`` value from the content API's JSON
        response.

    Raises:
        ValueError: If the (resolved) URL is not on ``bird.unit.no`` or has no
            resource id in its path.
        requests.HTTPError: If the content API answers with an error status.
    """
    if parse_url(url).host == "hdl.handle.net":
        url = resolve_handle(url)

    parsed = parse_url(url)
    # Validate with real exceptions; ``assert`` disappears under ``python -O``.
    if parsed.host != "bird.unit.no":
        raise ValueError(f"expected a bird.unit.no URL, got: {url}")

    # ``path`` is None for a URL without a path component.
    bird_unit_id = (parsed.path or "").rsplit("/", 1)[-1]
    if not bird_unit_id:
        raise ValueError(f"no resource id found in URL: {url}")

    resp = get(
        f"https://api.loke.aws.unit.no/dlr-gui-backend-resources-content/v2/resources/{bird_unit_id}/contents/default",
        **auth(),
    )
    resp.raise_for_status()
    return resp.json()["features"]["dlr_content_url"]
def download(url, target: Path = CWD):
    """Stream *url* into the directory *target*, showing a progress bar.

    The file name is taken from the ``filename=`` parameter of the response's
    ``content-disposition`` header; the progress total comes from
    ``content-length`` (0 when the header is absent).

    Args:
        url: Direct download URL.
        target: Existing directory the file is written into.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the response has no ``content-disposition`` header.
        IndexError: If that header has no ``filename=`` parameter.
        ValueError: If the header does not yield exactly one usable file name.
    """
    # Context manager releases the streamed connection even if writing fails.
    with requests.get(url, stream=True) as resp:
        # Do not write an error page to disk as if it were the file.
        resp.raise_for_status()

        disposition = resp.headers['content-disposition']
        fname, = shlex.split(disposition.split("filename=", 1)[1])
        # The name is supplied by the server: keep only the final path
        # component so it cannot escape ``target`` (e.g. "../../x").
        fname = Path(fname).name
        if fname in ("", ".", ".."):
            raise ValueError(f"unusable file name in content-disposition: {disposition!r}")

        total = int(resp.headers.get('content-length', 0))
        # Open the file in its own ``with`` so it is closed deterministically,
        # independent of what the progress wrapper does on exit.
        with (target / fname).open('wb') as raw:
            with tqdm.wrapattr(
                raw, "write",
                unit='B', unit_scale=True, unit_divisor=1024, miniters=1,
                desc=fname, total=total,
            ) as f:
                for chunk in resp.iter_content(chunk_size=4096):
                    f.write(chunk)
# Sample handle URL; appears unused by the code shown here (main() takes its
# own ``url`` parameter).
url = "https://hdl.handle.net/11250/3059978"

# Typer application object that the command below is registered on: shell
# completion options are disabled, and running with no arguments prints help.
app = typer.Typer(add_completion=False, no_args_is_help=True)
@app.command()
def main(url: str, dir: Path = CWD):
    # CLI entry point: resolve URL to a direct download link, then save the
    # file into DIR (defaults to CWD).
    # Documented with comments rather than a docstring so that the help text
    # Typer generates for this command stays exactly as before.
    download(get_download_url(url), dir)
# Start the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    app()
@pbsds
Copy link
Author

pbsds commented Jun 27, 2023

The same thing in bash:

# Handle URL to download.
HANDLE="https://hdl.handle.net/11250/3059978"
# Read the redirect target from the response headers. HTTP header names are
# case-insensitive ("Location:" vs "location:"), hence grep -i.
BIRD_UNIT_URL="$(curl -si "$HANDLE" | grep -i '^location: ' | cut -d' ' -f2- | tr -d '\r')"
# The resource id is the last path segment of the redirect target.
BIRD_UNIT_ID="$(echo "$BIRD_UNIT_URL" | rev | cut -d/ -f1 | rev)"
# Anonymous bearer token from the login endpoint.
BEARER="$(curl -s 'https://api.loke.aws.unit.no/dlr-gui-backend-login/v1/anonymous' -H 'Origin: https://bird.unit.no' -H 'Referer: https://bird.unit.no/')"
# Ask the content API for the download URL and hand it to wget.
curl -s "https://api.loke.aws.unit.no/dlr-gui-backend-resources-content/v2/resources/$BIRD_UNIT_ID/contents/default" -H "Authorization: Bearer $BEARER" | jq .features.dlr_content_url -r | xargs wget

Why is requests so bad?

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment