Last active
June 11, 2016 15:44
-
-
Save mhogeweg/6f396f4b4623a71b5f028425ab3efdfc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import httplib | |
from base64 import b64encode | |
import urllib2 | |
import sys | |
import re | |
import urllib | |
import datetime | |
# Persistent connection to the Geoportal server's REST API.
# 'YOURSERVER:THEPORT' is a placeholder -- set to the real host:port.
connection = httplib.HTTPConnection('YOURSERVER:THEPORT')
# HTTP Basic auth value (base64 of "user:password"); credentials are placeholders.
userAndPass = b64encode(b"USERNAME:PASSWORD").decode("ascii")
headers = {'Authorization': 'Basic %s' % userAndPass, 'Content-type': 'application/xml'}
# Destination for per-document status lines and the final summary.
log = sys.stdout
# Web Accessible Folder to harvest: site root plus the starting path.
site_root = "http://data.usgs.gov"
page = "/metadata/"
# Running total of metadata documents published; updated by publish_metadata().
count = 0
def publish_metadata(f_name):
    """Fetch the metadata XML at *f_name* and PUT it to the Geoportal REST API.

    Side effects: increments the global ``count`` and writes the HTTP status
    plus the source URL to ``log``.
    """
    global count
    f = urllib2.urlopen(f_name)  # file-like handle on the remote XML document
    try:
        body_content = f.read()
    finally:
        # Close the URL handle even when read() raises -- the original
        # only closed it on the success path, leaking the socket on error.
        f.close()
    connection.request('PUT', '/geoportal/rest/metadata/item', body_content, headers)
    result = connection.getresponse()
    # Drain the response body so the keep-alive connection can be reused
    # for the next request.
    result.read()
    count += 1
    log.write('%s %s\n' % (result.status, f_name))
def crawl(page):
    """Recursively walk the directory listing at *page*.

    Publishes every ``.xml`` link via publish_metadata() and descends into
    every other non-ftp link. The first real link on each listing page is
    skipped: it points back to the parent directory, and following it would
    recurse endlessly up and down the tree.
    """
    listing = urllib.urlopen(page).read()
    skip_parent = True
    for href in re.findall('''href=["'](.[^"']+)["']''', listing, re.I):
        # Query-string links (e.g. sort toggles) are dropped outright and
        # deliberately do NOT consume the parent-link skip below.
        if href.startswith("?"):
            continue
        if skip_parent:
            # First real link == parent directory; never follow it.
            skip_parent = False
            continue
        if href.startswith("ftp"):
            continue
        # Absolute URLs are taken as-is; anything else is relative to *page*.
        target = href if href.startswith("http") else page + href
        if target.endswith(".xml"):
            publish_metadata(target)
        else:
            crawl(target)
def main():
    """Harvest the WAF, then report start/end timestamps and publish count."""
    stamp = "%Y-%m-%d %H:%M:%S"
    started = datetime.datetime.now().strftime(stamp)
    crawl(site_root + page)
    finished = datetime.datetime.now().strftime(stamp)
    # The summary is written only after the full crawl completes.
    log.write('Start %s\n' % started)
    log.write('End %s\n' % finished)
    log.write('Count %d\n' % count)
# Run the harvest only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment