@lbesnard
Created September 25, 2019 01:05
Ocean Colour - download data from oceandata.sci.gsfc.nasa.gov
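The script below scrapes the MODIS-Aqua Mapped 8-Day 9km directory listings on oceandata.sci.gsfc.nasa.gov and downloads every NetCDF file it finds for the configured variables and years, re-downloading a file only if the local copy cannot be opened as a NetCDF dataset. It relies on three third-party packages matching its imports: beautifulsoup4, retry and netCDF4 (e.g. pip install beautifulsoup4 retry netCDF4).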
import os
import re
import urllib.error
import urllib.request

from bs4 import BeautifulSoup
from netCDF4 import Dataset
from retry import retry

YEARS_TO_DOWNLOAD = list(range(1999, 2019))
VARS_TO_DOWNLOAD = ['par']
download_path = '/tmp/manu_data'
@retry(urllib.error.URLError, tries=4, delay=3, backoff=2)
def urlopen_with_retry(f):
    if not os.path.exists(download_path):
        os.makedirs(download_path)

    output_path = os.path.join(download_path, os.path.basename(f))

    # If the file already exists, re-download it only if it is corrupted
    if os.path.exists(output_path):
        if is_valid_netcdf(output_path):
            return
        os.remove(output_path)

    print('downloading {file}'.format(file=f))
    urllib.request.urlretrieve(f, output_path)
def is_valid_netcdf(netcdf_path):
    """Return True if the file can be opened as a NetCDF dataset."""
    try:
        with Dataset(netcdf_path, 'r'):
            return True
    except Exception:
        return False
def list_netcdf_links():
    netcdf_links = []
    for var in VARS_TO_DOWNLOAD:
        for year in YEARS_TO_DOWNLOAD:
            page_link = ('https://oceandata.sci.gsfc.nasa.gov/MODIS-Aqua/Mapped/'
                         '8-Day/9km/{var}/{year}/'.format(var=var, year=year))
            try:
                html_page = urllib.request.urlopen(page_link)
                soup = BeautifulSoup(html_page, 'html.parser')
                # collect every link to a NetCDF file in the directory listing
                for link in soup.find_all('a', attrs={'href': re.compile(r'^https://.*\.nc')}):
                    netcdf_links.append(link.get('href'))
            except urllib.error.URLError:
                print('{page} does not exist'.format(page=page_link))

    return netcdf_links
## Main part of the script
netcdf_links = list_netcdf_links()
for f in netcdf_links:
urlopen_with_retry(f)
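To reuse the script for another product or period, only the configuration constants at the top normally need editing. A minimal sketch, assuming the chlorophyll-a product (chlor_a) is published under the same Mapped/8-Day/9km directory layout; the values below are hypothetical:

YEARS_TO_DOWNLOAD = list(range(2015, 2019))  # hypothetical period
VARS_TO_DOWNLOAD = ['chlor_a']               # hypothetical variable; the script ships with 'par'
download_path = '/tmp/chlor_a_data'          # hypothetical local destination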