Skip to content

Instantly share code, notes, and snippets.

@mhogeweg
Last active June 11, 2016 15:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mhogeweg/6f396f4b4623a71b5f028425ab3efdfc to your computer and use it in GitHub Desktop.
Save mhogeweg/6f396f4b4623a71b5f028425ab3efdfc to your computer and use it in GitHub Desktop.
#! /usr/bin/env python
# -*- coding: utf-8 -*-
import datetime
import httplib
import re
import sys
import urllib
import urllib2
import urlparse
from base64 import b64encode
# Harvest source: the Web Accessible Folder to walk, given as the site root
# plus the path inside it where the crawl starts.
site_root = "http://data.usgs.gov"
page = "/metadata/"

# Publishing target: one shared connection to the geoportal server.
# Replace YOURSERVER:THEPORT with the real host and port before running.
connection = httplib.HTTPConnection('YOURSERVER:THEPORT')

# HTTP Basic credentials (replace USERNAME:PASSWORD), sent with every PUT
# together with the XML content type.
userAndPass = b64encode(b"USERNAME:PASSWORD").decode("ascii")
headers = {
    'Authorization': 'Basic %s' % userAndPass,
    'Content-type': 'application/xml',
}

# Per-document results and the final summary are written here.
log = sys.stdout

# Number of documents sent to the geoportal server so far.
count = 0
def publish_metadata(f_name):
    """Download one metadata document and PUT it to the geoportal server.

    f_name is the URL of the document to fetch. Its body is sent unchanged
    to '/geoportal/rest/metadata/item' over the module-level ``connection``
    using the module-level ``headers``. A line with the HTTP status of the
    PUT and the source URL is written to ``log``, and the module-level
    ``count`` is incremented once for every PUT response received,
    whatever its status.
    """
    global count
    f = urllib2.urlopen(f_name)  # file-like object over the remote document
    try:
        body_content = f.read()
    finally:
        # Release the download handle even when the read fails, and before
        # the upload starts, so a failing PUT cannot leak it.
        f.close()
    connection.request('PUT', '/geoportal/rest/metadata/item', body_content, headers)
    result = connection.getresponse()
    # Consume the response body; the same connection object is reused for
    # the next document.
    result.read()
    count += 1
    log.write('%s %s\n' % (result.status, f_name))
def crawl(page):
    """Recursively harvest the folder listing found at the URL *page*.

    Every href on the page is examined in document order:

    * links starting with "?" are ignored;
    * the first remaining link is skipped because it points at the parent
      folder, and following it would recurse up and down forever;
    * links starting with "ftp" are ignored;
    * anything that does not resolve to an http(s) URL is ignored;
    * a resolved URL ending in ".xml" is passed to publish_metadata();
    * every other resolved URL is crawled as a further listing.

    NOTE(review): absolute http(s) links are followed whatever host they
    point at, and visited pages are not remembered, so the source folder is
    assumed to be a plain tree of listings and XML files.
    """
    listing = urllib.urlopen(page)
    try:
        html = listing.read()
    finally:
        # Close the listing before descending so open handles do not pile
        # up along the recursion.
        listing.close()

    first = True
    for link in re.findall('''href=["'](.[^"']+)["']''', html, re.I):
        if link.startswith("?"):
            continue
        # skip first link on the page as that's the one to the parent
        # if you were to follow this one, it results in endless recursion up and down
        if first:
            first = False
            continue
        if link.startswith("ftp"):
            continue

        # urljoin leaves absolute URLs untouched and resolves relative ones
        # against the current page; plain concatenation broke on
        # root-relative links and on pages whose URL does not end in "/".
        next_page = urlparse.urljoin(page, link)
        if not next_page.lower().startswith(("http://", "https://")):
            # e.g. mailto: or javascript: links, which cannot be fetched
            continue

        if next_page.endswith(".xml"):
            publish_metadata(next_page)
        else:
            crawl(next_page)
def main():
    """Harvest the configured folder, then write a run summary to ``log``.

    The crawl starts at ``site_root + page``. After it finishes, the start
    time, the end time and the number of published documents are written,
    one per line.
    """
    stamp_format = "%Y-%m-%d %H:%M:%S"
    started_at = datetime.datetime.now().strftime(stamp_format)
    crawl(site_root + page)
    finished_at = datetime.datetime.now().strftime(stamp_format)

    summary = (
        'Start %s\n' % started_at,
        'End %s\n' % finished_at,
        'Count %d\n' % count,
    )
    for line in summary:
        log.write(line)


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment