wassname/get_nz_strat_names.py

## get_nz_strat_names.py
#!/usr/bin/python
# -*- coding: iso-8859-15 -*-


# Python script to scrape all geo names from http://data.gns.cri.nz/stratlex/view.jsp?id=981&page=1
# There are ~7205 pages, so for each id,
# on each page, get
# <td class="lighter">NAME</td>
# add it to a list, and output the list as a text file (.dic) with one word per line.

# Instructions I followed: http://stackoverflow.com/questions/2081586/web-scraping-with-python

import urllib2
from BeautifulSoup import BeautifulSoup
# or if you're using BeautifulSoup4:
# from bs4 import BeautifulSoup
from random import random
from time import sleep

# Scrape one name per record id from the stratlex site and write the names
# to a .dic text file, one name per line.
names = []  # every scraped name, in id order
outFile = 'nz_strat_units.dic'

# `with` guarantees the file is flushed and closed even if the scrape aborts
# part-way through (Ctrl-C, an unexpected exception, ...).
with open(outFile, 'w') as outFileObj:
    for n in range(7205):
        # The multiplication must be parenthesised: `%` and `*` share the same
        # precedence and associate left-to-right, so the unparenthesised form
        # formatted random() (always 0 under %i) and then repeated the whole
        # string ten times.
        user_agent = 'Mozilla/6 (Solaris %i) Gecko' % (random() * 10)
        headers = {'User-Agent': user_agent}
        sleep(0.02)  # short pause between consecutive requests
        url = 'http://data.gns.cri.nz/stratlex/view.jsp?id=%i' % n
        try:
            # Actually send the User-Agent header built above; previously the
            # headers dict was created but never attached to the request.
            request = urllib2.Request(url, headers=headers)
            soup = BeautifulSoup(urllib2.urlopen(request).read())
        except urllib2.HTTPError as error:
            print(error.read())
            sleep(40)  # back off before moving on to the next id
            # Skip this id: without `continue` the code below would reuse the
            # previous page's soup (recording a duplicate name) or hit a
            # NameError if the very first request failed.
            continue
        lights = soup('td', {'class': 'lighter'})
        # The name is read from the second matching cell, so require at least
        # two matches instead of risking an IndexError on lights[1].
        if len(lights) > 1:
            # NOTE(review): getText() presumably returns unicode; a name with
            # non-ASCII characters may need explicit encoding before write()
            # -- confirm against real pages.
            name = lights[1].getText()
            names.append(name)
            print('%d %s' % (n, name))
            outFileObj.write(name + '\n')
	#!/usr/bin/python
	# -- coding: iso-8859-15 --


	# Python script to scrape all geo names from http://data.gns.cri.nz/stratlex/view.jsp?id=981&page=1
	# There are ~7205 pages, so for each id,
	# on each page, get
	# <td class="lighter">NAME</td>
	# add it to a list, and output the list as a text file (.dic) with one word per line.

	# Instructions I followed: http://stackoverflow.com/questions/2081586/web-scraping-with-python

	import urllib2
	from BeautifulSoup import BeautifulSoup
	# or if you're using BeautifulSoup4:
	# from bs4 import BeautifulSoup
	from random import random
	from time import sleep

	# Scrape one name per record id from the stratlex site and write the names
	# to a .dic text file, one name per line. The nesting of the loop, try and
	# if bodies is restored here; every line keeps this listing's leading tab.
	names = []  # every scraped name, in id order
	outFile = 'nz_strat_units.dic'

	# `with` guarantees the file is flushed and closed even if the scrape
	# aborts part-way through (Ctrl-C, an unexpected exception, ...).
	with open(outFile, 'w') as outFileObj:
	    for n in range(7205):
	        # The multiplication must be parenthesised: `%` and `*` share the
	        # same precedence and associate left-to-right, so the
	        # unparenthesised form formatted random() (always 0 under %i) and
	        # then repeated the whole string ten times.
	        user_agent = 'Mozilla/6 (Solaris %i) Gecko' % (random() * 10)
	        headers = {'User-Agent': user_agent}
	        sleep(0.02)  # short pause between consecutive requests
	        url = 'http://data.gns.cri.nz/stratlex/view.jsp?id=%i' % n
	        try:
	            # Actually send the User-Agent header built above; previously
	            # the headers dict was created but never attached.
	            request = urllib2.Request(url, headers=headers)
	            soup = BeautifulSoup(urllib2.urlopen(request).read())
	        except urllib2.HTTPError as error:
	            print(error.read())
	            sleep(40)  # back off before moving on to the next id
	            # Skip this id: without `continue` the code below would reuse
	            # the previous page's soup (recording a duplicate name) or hit
	            # a NameError if the very first request failed.
	            continue
	        lights = soup('td', {'class': 'lighter'})
	        # The name is read from the second matching cell, so require at
	        # least two matches instead of risking an IndexError on lights[1].
	        if len(lights) > 1:
	            # NOTE(review): getText() presumably returns unicode; a name
	            # with non-ASCII characters may need explicit encoding before
	            # write() -- confirm against real pages.
	            name = lights[1].getText()
	            names.append(name)
	            print('%d %s' % (n, name))
	            outFileObj.write(name + '\n')