@j08lue
Last active December 16, 2015 06:08
Fills out and submits a form on the NODC.NOAA website and extracts download links from the resulting page. The links can then be downloaded using download_from_url.py
"""Fetch WOA09 files from NODC.NOAA website via WOAselect"""
import re
import mechanize
import os
import urllib2
import shutil
import time
def abs2relftp(url):
    """Replace the leading slash in an absolute ftp url with '%2f'

    http://stackoverflow.com/questions/1162053/any-way-to-specify-absolute-paths-in-ftp-urls
    http://tools.ietf.org/html/draft-casey-url-ftp-00"""
    return '/'.join(['/'.join(url.split('/')[:2]),
                     re.sub('/', '/%2f', '/'.join(url.split('/')[2:]), count=1)])
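# Usage sketch (hypothetical URL), assuming the FTP server expects the
# leading slash of absolute paths to be percent-encoded as in the draft above:
#   abs2relftp('ftp://ftp.nodc.noaa.gov/pub/WOA09/data.csv.gz')
#   -> 'ftp://ftp.nodc.noaa.gov/%2fpub/WOA09/data.csv.gz'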
def get_woa_links(parameters=dict(T=1, S=2)):
    """Fetch WOA09 files from NODC.NOAA website via WOAselect

    Parameters
    ----------
    parameters : dict
        Ocean parameters to download,
        mapping a parameter ID (used in file names etc.)
        to the parameter number in the web interface,
        e.g. {'T': 1} for temperature

    Functioning
    -----------
    Fills out and submits a form on the NODC.NOAA website
    http://www.nodc.noaa.gov/cgi-bin/OC5/SELECT/woaselect.pl
    and extracts download links from the resulting page. Stores
    the links together with target file names for later download."""
    # run through parameters (Temp, Sal, etc.) to download
    for parID, parN in parameters.iteritems():
        # create empty files for the URL lists
        with open('urls_{}_csv.txt'.format(parID), 'w') as f:
            pass
        with open('urls_{}_shp.txt'.format(parID), 'w') as f:
            pass
        # open browser
        br = mechanize.Browser()
        br.open('http://www.nodc.noaa.gov/cgi-bin/OC5/SELECT/woaselect.pl?parameter={}'.format(parN))
        #br.addheaders = [('User-agent','Mozilla/5.0')]
        # run through months and levels
        for mon in xrange(1, 12+1):
            for lev in xrange(1, 33+1):
                print 'Parameter {}, month {}, level {}'.format(parID, mon, lev)
                # fill data selection form
                print 'submitting values ...'
                br.select_form(name='submitform')
                br.form.set_all_readonly(False)
                br['north'] = '90'
                br['south'] = '-90'
                br['east'] = '180'
                br['west'] = '-180'
                br['grid'] = ['2']  # 1/4 grid
                br['figure_type'] = ['0']
                br['time_period'] = ['{}'.format(mon)]  # month, 1 through 12
                br['depth'] = ['{}'.format(lev)]  # depth level, 1 through 33
                # submit values
                print 'waiting for online processing ...'
                datasite = br.submit()  #; print datasite.read()
                # define base file name
                fname = '{}_{:02d}_{:02d}'.format(parID, mon, lev)
                # store url to ASCII file
                url = br.find_link(text='ASCII').url
                fnm = fname + '.csv.gz'
                with open('urls_{}_csv.txt'.format(parID), 'a') as f:
                    f.write(','.join([url, fnm]) + '\n')
                # store url to ArcGIS file
                url = br.find_link(text='ArcGIS').url
                fnm = fname + '.tar.gz'
                with open('urls_{}_shp.txt'.format(parID), 'a') as f:
                    f.write(','.join([url, fnm]) + '\n')
                # return to previous page
                br.back()
        # close browser when done
        br.close()
    print '... done'
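# The resulting urls_<parID>_csv.txt and urls_<parID>_shp.txt files hold one
# comma-separated "url,filename" pair per line, e.g. (download URL omitted,
# file name follows the naming scheme above):
#   <download url>,T_01_01.csv.gz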
def download(url, localFileName):
    """Download *url* to *localFileName*"""
    req = urllib2.Request(url)
    r = urllib2.urlopen(req)
    with open(localFileName, 'wb') as f:
        shutil.copyfileobj(r, f)
def download_urls(urls_lst, dataDir, maxt=60, dt=10):
    """Download all (url, filename) pairs in *urls_lst* to *dataDir*,
    retrying each URL every *dt* seconds for at most *maxt* seconds."""
    print 'processing {} URLs'.format(len(urls_lst))
    done_url = []
    for k in xrange(len(urls_lst)):
        url = urls_lst[k][0]
        fnm = urls_lst[k][1].rstrip()
        fname = os.path.join(dataDir, fnm)
        if os.path.isfile(fname):
            print 'file {} exists'.format(fname)
            continue
        else:
            t = 0
            while t <= maxt:
                print 't={} {} --> {} ...'.format(t, url, fnm)
                try:
                    download(url, fname)
                except Exception:
                    print 'not successful. Waiting {}s'.format(dt)
                    time.sleep(dt)
                    t += dt
                else:
                    done_url.append(k)
                    break
            else:
                # the retry loop timed out without a successful download
                print 'URL {} could not be retrieved.'.format(url)
                raise RuntimeError('download of {} failed'.format(url))
    return done_url
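# Minimal usage sketch (hypothetical URL and file name); returns the list
# indices of the URLs that were newly downloaded:
#   done = download_urls([['http://example.com/T_01_01.csv.gz', 'T_01_01.csv.gz\n']],
#                        './data', maxt=30, dt=5)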
def download_files(dataDir='.', maxn=1, parIDs=['S', 'T']):
    """Download the files to *dataDir*, trying *maxn* times"""
    n = 0
    while n < maxn:
        for parID in parIDs:
            # read (url, filename) lines from the URL list files
            urls_csv = []
            with open('urls_{}_csv.txt'.format(parID), 'r') as f:
                for line in f:
                    urls_csv.append(line.split(','))
            urls_shp = []
            with open('urls_{}_shp.txt'.format(parID), 'r') as f:
                for line in f:
                    urls_shp.append(line.split(','))
            # download ASCII files
            download_urls(urls_csv, dataDir)
            # download ArcGIS files
            download_urls(urls_shp, dataDir)
        n += 1
        if n < maxn:
            # wait before the next attempt
            print 'waiting for 2 hours ...'
            time.sleep(7200)
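# Example call (assumed directory name), retrying up to three times with a
# 2-hour pause in between:
#   download_files(dataDir='./woa09', maxn=3, parIDs=['T', 'S'])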
if __name__ == "__main__":
    get_woa_links(parameters=dict(T=1, S=2))
    download_files(parIDs=['S', 'T'])