Simple python script to scrape all geological names from the New Zealand Stratigraphic Lexicon
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python
# -*- coding: iso-8859-15 -*-
# Python script to scrape all geological unit names from
# http://data.gns.cri.nz/stratlex/view.jsp?id=981&page=1
# There are ~7205 pages, so for each id
# on each page get
# <td class="lighter">NAME</td>
# add it to a list and output the list as a text file (.dic) with one word per line.
# Instructions I used: http://stackoverflow.com/questions/2081586/web-scraping-with-python
import urllib2 | |
from BeautifulSoup import BeautifulSoup | |
# or if your're using BeautifulSoup4: | |
# from bs4 import BeautifulSoup | |
from random import random | |
from time import sleep | |
names=[] # put names here | |
outFile='nz_strat_units.dic' | |
outFileObj=open(outFile,'w') | |
for n in range(7205): | |
user_agent = 'Mozilla/6 (Solaris %i) Gecko' % random()*10 | |
headers = { 'User-Agent' : user_agent } | |
sleep(0.02) | |
url='http://data.gns.cri.nz/stratlex/view.jsp?id=%i' % n | |
try: | |
soup = BeautifulSoup(urllib2.urlopen(url).read()) | |
except urllib2.HTTPError, error: | |
print error.read() | |
sleep(40) | |
lights = soup('td', {'class' : 'lighter'}) | |
if lights: | |
name=lights[1].getText() | |
names.append(name) | |
print n, name | |
outFileObj.write(name+'\n') | |
outFileObj.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment