Download LAADS data from https://ladsweb.modaps.eosdis.nasa.gov
#!/usr/bin/env python3
#
# Download LAADS data from https://ladsweb.modaps.eosdis.nasa.gov
#
import argparse
import concurrent.futures
import datetime
import shutil
import sys
import threading
from pathlib import Path

import requests
from urllib3.util import Retry
USERAGENT = "tis/download.py_1.0--" + sys.version.replace("\n", "").replace("\r", "")
def buildSession(token):
session = requests.Session()
# Authorize against API
session.headers = {"Authorization": "Bearer " + token, "User-Agent": USERAGENT}
# Mount a retry adapter to account for server errors
# total:
# Total number of retries to allow. Takes precedence over other counts.
#
# backoff_factor:
# A backoff factor to apply between attempts after the second try
# (most errors are resolved immediately by a second try without a delay).
# urllib3 will sleep for:
# {backoff factor} * (2 ** ({number of total retries} - 1)) seconds.
# If the backoff_factor is 0.1, then .sleep will sleep for [0.0s, 0.2s, 0.4s, ...]
# between retries. It will never be longer than Retry.BACKOFF_MAX.
#
# status_forcelist:
# A set of integer HTTP status codes that we should force a retry on.
retry_adapter = requests.adapters.HTTPAdapter(
max_retries=Retry(total=5, backoff_factor=0.5, status_forcelist=[502, 503, 504])
)
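    # With the values used here (total=5, backoff_factor=0.5), that formula
    # works out to sleeps of roughly 0.0s, 1.0s, 2.0s, 4.0s and 8.0s between
    # successive retries (always capped at Retry.BACKOFF_MAX).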
session.mount("http://", retry_adapter)
session.mount("https://", retry_adapter)
return session


def downloadFile(
    session, dayBaseUrl, file, dstDir, deleteIncomplete=False, verbose=False
):
    fileUrl = f"{dayBaseUrl}/{file['name']}"
    # Create the destination directory (if it does not exist)
    dstDir.mkdir(parents=True, exist_ok=True)
    dstFile = dstDir / file["name"]
    # Check whether the file already exists and has the expected size.
    # Default mode to open the destination file with (truncate)
    dstFileMode = "wb"
    extraHeaders = {}
    currentFileSize = 0
    if dstFile.exists():
        currentFileSize = dstFile.stat().st_size
        if currentFileSize == file["size"]:
            if verbose:
                print(f"File exists: {file['name']}")
            return True
        elif currentFileSize < file["size"]:
            # Try to resume the download
            extraHeaders["Range"] = f"bytes={currentFileSize}-"
            # Open the destination file in append mode
            dstFileMode = "ab"
        else:
            # The local file is bigger than the source file, start over
            dstFile.unlink()
            currentFileSize = 0
    with session.get(fileUrl, stream=True, headers=extraHeaders) as r:
        r.raise_for_status()
        what = "Downloading"
        if currentFileSize > 0:
            if r.status_code == 206:
                what = f"Resuming (from {currentFileSize})"
            elif r.status_code == 200:
                # Tried to resume the download, but the server did not respond
                # with partial content. Truncate the file instead of appending.
                dstFileMode = "wb"
        if verbose:
            print(f"{what} file: {file['name']} md5: {file.get('md5sum', 'N/A')}")
        with dstFile.open(dstFileMode) as f:
            shutil.copyfileobj(r.raw, f)
    if dstFile.stat().st_size != file["size"]:
        if verbose:
            print(f"File size mismatch: {file['name']}")
        if deleteIncomplete:
            dstFile.unlink()
        return False
    return True
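

# The LAADS listing also carries an "md5sum" per file, which downloadFile()
# only reports. A minimal, hypothetical verification helper (an addition,
# not part of the original download flow) could look like this:
def verifyMD5(dstFile, expectedMD5):
    import hashlib  # local import, keeps this optional helper self-contained

    md5 = hashlib.md5()
    with dstFile.open("rb") as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            md5.update(chunk)
    return md5.hexdigest() == expectedMD5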


def downloadFileWorker(
    thread_sessions,
    token,
    dayBaseUrl,
    file,
    dstDir,
    deleteIncomplete=False,
    verbose=False,
):
    # Sessions are not thread safe, so ensure there is one per thread
    threadID = threading.get_ident()
    session = thread_sessions.get(threadID)
    if session is None:
        session = buildSession(token)
        thread_sessions[threadID] = session
    return downloadFile(session, dayBaseUrl, file, dstDir, deleteIncomplete, verbose)
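

# An equivalent way to keep one session per thread would be threading.local()
# (a sketch under that assumption, not what this script does):
#
#     _tls = threading.local()
#
#     def getThreadSession(token):
#         if getattr(_tls, "session", None) is None:
#             _tls.session = buildSession(token)
#         return _tls.session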


def productDayListingUrl(baseUrl, product, currentDay):
    return f"{baseUrl}/{product}/{currentDay.year}/{currentDay.strftime('%j')}"


def getProductDayJSON(session, dayBaseUrl):
    r = session.get(dayBaseUrl + ".json")
    r.raise_for_status()
    j = r.json()
    # There are two different JSON formats in existence
    # (or LAADS switches between them...). Account for the list
    # of files inside a "content" key as well as at the JSON root.
    if "content" in j:
        return filter(lambda e: e["kind"] == "FILE", j["content"])
    else:
        return j
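

# Illustrative (assumed) shapes of the two listing formats handled above,
# inferred from the keys the rest of this script accesses:
#
#     {"content": [{"kind": "FILE", "name": "...", "size": 123,
#                   "md5sum": "..."}, ...]}
#
# or a plain list at the JSON root:
#
#     [{"name": "...", "size": 123, "md5sum": "..."}, ...]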


def downloadAll(
    token,
    start,
    end,
    baseUrl,
    products,
    dst,
    threads=5,
    deleteIncomplete=False,
    verbose=False,
    filterFunc=None,
):
    # The session used to fetch the file listings with.
    # Each download thread creates its own session (as sessions are not thread safe).
    session = buildSession(token)
    currentDay = start
    thread_sessions = {}
    url_futures = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        while currentDay <= end:
            for product in products:
                dayBaseUrl = productDayListingUrl(baseUrl, product, currentDay)
                fileListing = getProductDayJSON(session, dayBaseUrl)
                for file in fileListing:
                    # Filter
                    if filterFunc is not None:
                        if not filterFunc(product, file):
                            continue
                    dstDir = (
                        dst / product / str(currentDay.year) / currentDay.strftime("%j")
                    )
                    f = executor.submit(
                        downloadFileWorker,
                        thread_sessions,
                        token,
                        dayBaseUrl,
                        file,
                        dstDir,
                        deleteIncomplete,
                        verbose,
                    )
                    url_futures[f] = file
            currentDay += datetime.timedelta(days=1)
        for future in concurrent.futures.as_completed(url_futures):
            file = url_futures[future]
            try:
                result = future.result()
            except Exception as exc:
                print(f"{file['name']!r} generated an exception: {exc}")
            else:
                if not result:
                    print(f"{file['name']} failed")


def filterByFileName(product, file):
    if product.endswith("EFR") or product.endswith("RBT"):
        for s in ("_1080_", "_1260_", "_1440_", "_1620_", "_1800_"):
            if s in file["name"]:
                return True
        return False
    return True
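

# filterFunc may be any callable taking (product, file) and returning a bool.
# A (hypothetical) alternative that keeps only NetCDF granules would be:
#
#     def filterNetCDF(product, file):
#         return file["name"].endswith(".nc")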


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Download LAADS data")
    parser.add_argument("--token", type=str, required=True)
    parser.add_argument("--start", type=datetime.date.fromisoformat, required=True)
    parser.add_argument("--end", type=datetime.date.fromisoformat, required=True)
    parser.add_argument("--products", type=str, nargs="+", required=True)
    parser.add_argument(
        "--src",
        type=str,
        default="https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/450/",
    )
    parser.add_argument("--dst", type=Path, required=True)
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("--delete-incomplete", action="store_true")
    parser.add_argument("--threads", type=int, default=5)
    args = parser.parse_args()
    downloadAll(
        args.token,
        args.start,
        args.end,
        args.src,
        args.products,
        args.dst,
        args.threads,
        args.delete_incomplete,
        args.verbose,
        filterByFileName,
    )
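

# Example invocation (all values are illustrative: --token must be a valid
# LAADS/Earthdata application token, and the product names depend on what
# actually exists under --src):
#
#     ./download.py --token "$LAADS_TOKEN" \
#         --start 2021-01-01 --end 2021-01-07 \
#         --products VNP02IMG VNP02MOD \
#         --dst ./data --threads 5 --verbose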