Last active
June 11, 2016 15:44
-
-
Save mhogeweg/6f396f4b4623a71b5f028425ab3efdfc to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import httplib | |
from base64 import b64encode | |
import urllib2 | |
import sys | |
import re | |
import urllib | |
import datetime | |
# Persistent connection to the Geoportal server's REST API.
# 'YOURSERVER:THEPORT' is a placeholder -- set to the real host:port.
connection = httplib.HTTPConnection('YOURSERVER:THEPORT')
# HTTP Basic auth value (base64 of "user:password"); credentials are placeholders.
userAndPass = b64encode(b"USERNAME:PASSWORD").decode("ascii")
headers = {'Authorization': 'Basic %s' % userAndPass, 'Content-type': 'application/xml'}
# Destination for per-document status lines and the final summary.
log = sys.stdout
# Web Accessible Folder to harvest: site root plus the starting path.
site_root = "http://data.usgs.gov"
page = "/metadata/"
# Running total of metadata documents published; updated by publish_metadata().
count = 0
def publish_metadata(f_name):
    """Fetch the metadata XML at *f_name* and PUT it to the Geoportal REST API.

    Side effects: increments the global ``count`` and writes the HTTP status
    plus the source URL to ``log``.
    """
    global count
    f = urllib2.urlopen(f_name)  # file-like handle on the remote XML document
    try:
        body_content = f.read()
    finally:
        # Close the URL handle even when read() raises -- the original
        # only closed it on the success path, leaking the socket on error.
        f.close()
    connection.request('PUT', '/geoportal/rest/metadata/item', body_content, headers)
    result = connection.getresponse()
    # Drain the response body so the keep-alive connection can be reused
    # for the next request.
    result.read()
    count += 1
    log.write('%s %s\n' % (result.status, f_name))
def crawl(page):
    """Recursively walk the directory listing at *page*.

    Publishes every ``.xml`` link via publish_metadata() and descends into
    every other non-ftp link. The first real link on each listing page is
    skipped: it points back to the parent directory, and following it would
    recurse endlessly up and down the tree.
    """
    listing = urllib.urlopen(page).read()
    skip_parent = True
    for href in re.findall('''href=["'](.[^"']+)["']''', listing, re.I):
        # Query-string links (e.g. sort toggles) are dropped outright and
        # deliberately do NOT consume the parent-link skip below.
        if href.startswith("?"):
            continue
        if skip_parent:
            # First real link == parent directory; never follow it.
            skip_parent = False
            continue
        if href.startswith("ftp"):
            continue
        # Absolute URLs are taken as-is; anything else is relative to *page*.
        target = href if href.startswith("http") else page + href
        if target.endswith(".xml"):
            publish_metadata(target)
        else:
            crawl(target)
def main():
    """Harvest the WAF, then report start/end timestamps and publish count."""
    stamp = "%Y-%m-%d %H:%M:%S"
    started = datetime.datetime.now().strftime(stamp)
    crawl(site_root + page)
    finished = datetime.datetime.now().strftime(stamp)
    # The summary is written only after the full crawl completes.
    log.write('Start %s\n' % started)
    log.write('End %s\n' % finished)
    log.write('Count %d\n' % count)
# Run the harvest only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment