@lbesnard
Created September 25, 2019 01:05
Ocean Colour - download data from oceandata.sci.gsfc.nasa.gov
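The script below scrapes the MODIS-Aqua Mapped 8-Day 9km directory listings on oceandata.sci.gsfc.nasa.gov and downloads every NetCDF file it finds for the configured variables and years, re-downloading a file only if the local copy cannot be opened as a NetCDF dataset. It relies on three third-party packages matching its imports: beautifulsoup4, retry and netCDF4 (e.g. pip install beautifulsoup4 retry netCDF4).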
import os
import re
import urllib.error
import urllib.request

from bs4 import BeautifulSoup
from netCDF4 import Dataset
from retry import retry

YEARS_TO_DOWNLOAD = list(range(1999, 2019))
VARS_TO_DOWNLOAD = ['par']
download_path = '/tmp/manu_data'
@retry(urllib.error.URLError, tries=4, delay=3, backoff=2)
def urlopen_with_retry(f):
    if not os.path.exists(download_path):
        os.makedirs(download_path)

    output_path = os.path.join(download_path, os.path.basename(f))

    # If the file already exists, re-download it only if it is corrupted
    if os.path.exists(output_path):
        if is_valid_netcdf(output_path):
            return
        os.remove(output_path)

    print('downloading {file}'.format(file=f))
    urllib.request.urlretrieve(f, output_path)
def is_valid_netcdf(netcdf_path):
    """Return True if the file can be opened as a NetCDF dataset."""
    try:
        with Dataset(netcdf_path, 'r'):
            return True
    except Exception:
        return False
def list_netcdf_links():
    netcdf_links = []
    for var in VARS_TO_DOWNLOAD:
        for year in YEARS_TO_DOWNLOAD:
            page_link = ('https://oceandata.sci.gsfc.nasa.gov/MODIS-Aqua/Mapped/'
                         '8-Day/9km/{var}/{year}/'.format(var=var, year=year))
            try:
                html_page = urllib.request.urlopen(page_link)
                soup = BeautifulSoup(html_page, 'html.parser')
                # collect every link to a NetCDF file in the directory listing
                for link in soup.find_all('a', attrs={'href': re.compile(r'^https://.*\.nc')}):
                    netcdf_links.append(link.get('href'))
            except urllib.error.URLError:
                print('{page} does not exist'.format(page=page_link))

    return netcdf_links
## Main part of the script
netcdf_links = list_netcdf_links()
for f in netcdf_links:
urlopen_with_retry(f)
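To reuse the script for another product or period, only the configuration constants at the top normally need editing. A minimal sketch, assuming the chlorophyll-a product (chlor_a) is published under the same Mapped/8-Day/9km directory layout; the values below are hypothetical:

YEARS_TO_DOWNLOAD = list(range(2015, 2019))  # hypothetical period
VARS_TO_DOWNLOAD = ['chlor_a']               # hypothetical variable; the script ships with 'par'
download_path = '/tmp/chlor_a_data'          # hypothetical local destination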