import urllib2
import re
import os

url = "http://www.globsnow.info/se/archive_v2.1/{}/D4SC/"

start_year = 2003
end_year = 2003

for year in range(start_year, end_year + 1):
    year_url = url.format(year)

    # Get the HTML of the directory listing
    x = urllib2.urlopen(year_url).read()

    # Match every name that starts with GlobSnow and ends with .nc.gz;
    # the ? makes the match non-greedy
    fnames = re.findall(r"GlobSnow.*?\.nc\.gz", x)
    print len(fnames)

    fnames = set(fnames)  # Eliminate duplicates
    print len(fnames)

    for fname in fnames:
        if os.path.isfile(fname):  # No need to download the same file twice
            continue

        # year_url already ends with "/"; os.path.join is meant for
        # filesystem paths (it would insert "\\" on Windows), so build
        # the download link by plain concatenation
        flink = year_url + fname
        print "Downloading {} ....".format(flink)

        # Open in binary mode: the .nc.gz archives are binary data,
        # and text mode ("w") would corrupt them on Windows
        with open(fname, "wb") as f:
            f.write(urllib2.urlopen(flink).read())

    print "Downloaded data for year {}".format(year)

print "All downloads finished successfully"
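
The script above targets Python 2 (urllib2 and print statements no longer exist in Python 3). As a minimal sketch of the same download loop on Python 3, assuming the GlobSnow server still serves a plain HTML directory listing (untested):

import os
import re
from urllib.request import urlopen

url = "http://www.globsnow.info/se/archive_v2.1/{}/D4SC/"

for year in range(2003, 2004):
    year_url = url.format(year)

    # urlopen returns bytes on Python 3; decode before applying the regex
    html = urlopen(year_url).read().decode("utf-8", errors="replace")

    for fname in set(re.findall(r"GlobSnow.*?\.nc\.gz", html)):
        if os.path.isfile(fname):  # Skip files already downloaded
            continue
        flink = year_url + fname
        print("Downloading {} ....".format(flink))
        with open(fname, "wb") as f:  # Binary mode for gzipped archives
            f.write(urlopen(flink).read())

    print("Downloaded data for year {}".format(year))

print("All downloads finished successfully")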