Download LAADS data from https://ladsweb.modaps.eosdis.nasa.gov
#!/usr/bin/env python3
#
# Download LAADS data from https://ladsweb.modaps.eosdis.nasa.gov
#
import argparse
import concurrent.futures
import datetime
import shutil
import sys
import threading
from pathlib import Path

import requests
from urllib3.util import Retry
USERAGENT = "tis/download.py_1.0--" + sys.version.replace("\n", "").replace("\r", "")
def buildSession(token):
session = requests.Session()
# Authorize against API
session.headers = {"Authorization": "Bearer " + token, "User-Agent": USERAGENT}
# Mount a retry adapter to account for server errors
# total:
# Total number of retries to allow. Takes precedence over other counts.
#
# backoff_factor:
# A backoff factor to apply between attempts after the second try
# (most errors are resolved immediately by a second try without a delay).
# urllib3 will sleep for:
# {backoff factor} * (2 ** ({number of total retries} - 1)) seconds.
# If the backoff_factor is 0.1, then .sleep will sleep for [0.0s, 0.2s, 0.4s, ...]
# between retries. It will never be longer than Retry.BACKOFF_MAX.
#
# status_forcelist:
# A set of integer HTTP status codes that we should force a retry on.
retry_adapter = requests.adapters.HTTPAdapter(
max_retries=Retry(total=5, backoff_factor=0.5, status_forcelist=[502, 503, 504])
)
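    # With the values used here (total=5, backoff_factor=0.5), that formula
    # works out to sleeps of roughly 0.0s, 1.0s, 2.0s, 4.0s and 8.0s between
    # successive retries (always capped at Retry.BACKOFF_MAX).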
session.mount("http://", retry_adapter)
session.mount("https://", retry_adapter)
return session


def downloadFile(
    session, dayBaseUrl, file, dstDir, deleteIncomplete=False, verbose=False
):
    fileUrl = f"{dayBaseUrl}/{file['name']}"
    # Create the destination directory (if it does not exist)
    dstDir.mkdir(parents=True, exist_ok=True)
    dstFile = dstDir / file["name"]
    # Check whether the file already exists and has the expected size.
    # Default mode to open the destination file with (truncate)
    dstFileMode = "wb"
    extraHeaders = {}
    currentFileSize = 0
    if dstFile.exists():
        currentFileSize = dstFile.stat().st_size
        if currentFileSize == file["size"]:
            if verbose:
                print(f"File exists: {file['name']}")
            return True
        elif currentFileSize < file["size"]:
            # Try to resume the download
            extraHeaders["Range"] = f"bytes={currentFileSize}-"
            # Open the destination file in append mode
            dstFileMode = "ab"
        else:
            # The local file is bigger than the source file, start over
            dstFile.unlink()
            currentFileSize = 0
    with session.get(fileUrl, stream=True, headers=extraHeaders) as r:
        r.raise_for_status()
        what = "Downloading"
        if currentFileSize > 0:
            if r.status_code == 206:
                what = f"Resuming (from {currentFileSize})"
            elif r.status_code == 200:
                # Tried to resume the download, but the server did not respond
                # with partial content. Truncate the file instead of appending.
                dstFileMode = "wb"
        if verbose:
            print(f"{what} file: {file['name']} md5: {file.get('md5sum', 'N/A')}")
        with dstFile.open(dstFileMode) as f:
            shutil.copyfileobj(r.raw, f)
    if dstFile.stat().st_size != file["size"]:
        if verbose:
            print(f"File size mismatch: {file['name']}")
        if deleteIncomplete:
            dstFile.unlink()
        return False
    return True
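

# The LAADS listing also carries an "md5sum" per file, which downloadFile()
# only reports. A minimal, hypothetical verification helper (an addition,
# not part of the original download flow) could look like this:
def verifyMD5(dstFile, expectedMD5):
    import hashlib  # local import, keeps this optional helper self-contained

    md5 = hashlib.md5()
    with dstFile.open("rb") as f:
        for chunk in iter(lambda: f.read(1024 * 1024), b""):
            md5.update(chunk)
    return md5.hexdigest() == expectedMD5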


def downloadFileWorker(
    thread_sessions,
    token,
    dayBaseUrl,
    file,
    dstDir,
    deleteIncomplete=False,
    verbose=False,
):
    # Sessions are not thread safe, so ensure there is one per thread
    threadID = threading.get_ident()
    session = thread_sessions.get(threadID)
    if session is None:
        session = buildSession(token)
        thread_sessions[threadID] = session
    return downloadFile(session, dayBaseUrl, file, dstDir, deleteIncomplete, verbose)
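

# An equivalent way to keep one session per thread would be threading.local()
# (a sketch under that assumption, not what this script does):
#
#     _tls = threading.local()
#
#     def getThreadSession(token):
#         if getattr(_tls, "session", None) is None:
#             _tls.session = buildSession(token)
#         return _tls.session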


def productDayListingUrl(baseUrl, product, currentDay):
    return f"{baseUrl}/{product}/{currentDay.year}/{currentDay.strftime('%j')}"


def getProductDayJSON(session, dayBaseUrl):
    r = session.get(dayBaseUrl + ".json")
    r.raise_for_status()
    j = r.json()
    # There are two different JSON formats in existence
    # (or LAADS switches between them...). Account for the list
    # of files inside a "content" key as well as at the JSON root.
    if "content" in j:
        return filter(lambda e: e["kind"] == "FILE", j["content"])
    else:
        return j
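

# Illustrative (assumed) shapes of the two listing formats handled above,
# inferred from the keys the rest of this script accesses:
#
#     {"content": [{"kind": "FILE", "name": "...", "size": 123,
#                   "md5sum": "..."}, ...]}
#
# or a plain list at the JSON root:
#
#     [{"name": "...", "size": 123, "md5sum": "..."}, ...]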


def downloadAll(
    token,
    start,
    end,
    baseUrl,
    products,
    dst,
    threads=5,
    deleteIncomplete=False,
    verbose=False,
    filterFunc=None,
):
    # The session used to fetch the file listings with.
    # Each download thread creates its own session (as sessions are not thread safe).
    session = buildSession(token)
    currentDay = start
    thread_sessions = {}
    url_futures = {}
    with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
        while currentDay <= end:
            for product in products:
                dayBaseUrl = productDayListingUrl(baseUrl, product, currentDay)
                fileListing = getProductDayJSON(session, dayBaseUrl)
                for file in fileListing:
                    # Filter
                    if filterFunc is not None:
                        if not filterFunc(product, file):
                            continue
                    dstDir = (
                        dst / product / str(currentDay.year) / currentDay.strftime("%j")
                    )
                    f = executor.submit(
                        downloadFileWorker,
                        thread_sessions,
                        token,
                        dayBaseUrl,
                        file,
                        dstDir,
                        deleteIncomplete,
                        verbose,
                    )
                    url_futures[f] = file
            currentDay += datetime.timedelta(days=1)
        for future in concurrent.futures.as_completed(url_futures):
            file = url_futures[future]
            try:
                result = future.result()
            except Exception as exc:
                print(f"{file['name']!r} generated an exception: {exc}")
            else:
                if not result:
                    print(f"{file['name']} failed")


def filterByFileName(product, file):
    if product.endswith("EFR") or product.endswith("RBT"):
        for s in ("_1080_", "_1260_", "_1440_", "_1620_", "_1800_"):
            if s in file["name"]:
                return True
        return False
    return True
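

# filterFunc may be any callable taking (product, file) and returning a bool.
# A (hypothetical) alternative that keeps only NetCDF granules would be:
#
#     def filterNetCDF(product, file):
#         return file["name"].endswith(".nc")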


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Download LAADS data")
    parser.add_argument("--token", type=str, required=True)
    parser.add_argument("--start", type=datetime.date.fromisoformat, required=True)
    parser.add_argument("--end", type=datetime.date.fromisoformat, required=True)
    parser.add_argument("--products", type=str, nargs="+", required=True)
    parser.add_argument(
        "--src",
        type=str,
        default="https://ladsweb.modaps.eosdis.nasa.gov/archive/allData/450/",
    )
    parser.add_argument("--dst", type=Path, required=True)
    parser.add_argument("-v", "--verbose", action="store_true")
    parser.add_argument("--delete-incomplete", action="store_true")
    parser.add_argument("--threads", type=int, default=5)
    args = parser.parse_args()
    downloadAll(
        args.token,
        args.start,
        args.end,
        args.src,
        args.products,
        args.dst,
        args.threads,
        args.delete_incomplete,
        args.verbose,
        filterByFileName,
    )
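

# Example invocation (all values are illustrative: --token must be a valid
# LAADS/Earthdata application token, and the product names depend on what
# actually exists under --src):
#
#     ./download.py --token "$LAADS_TOKEN" \
#         --start 2021-01-01 --end 2021-01-07 \
#         --products VNP02IMG VNP02MOD \
#         --dst ./data --threads 5 --verbose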