Skip to content

Instantly share code, notes, and snippets.

@andrewharvey
Created August 2, 2012 09:54
Show Gist options
  • Save andrewharvey/3236004 to your computer and use it in GitHub Desktop.
Save andrewharvey/3236004 to your computer and use it in GitHub Desktop.
Download and index 250k scanned geological maps from Geoscience Australia
#!/usr/bin/python
# This script is licensed CC0 by Andrew Harvey <andrew.harvey4@gmail.com>
#
# To the extent possible under law, the person who associated CC0
# with this work has waived all copyright and related or neighboring
# rights to this work.
# http://creativecommons.org/publicdomain/zero/1.0/
import re
import sys
from bs4 import BeautifulSoup
# define function to return the first match of a re.search
def assignFirstIfExists(i):
    """Return capture group 1 of a regex match, or "" when nothing matched.

    i -- the value returned by re.search()/re.match(): a match object, or
         None when the pattern did not match.
    """
    # None is a singleton, so test identity rather than equality.
    if i is not None:
        return i.group(1)
    return ""
def parseHtmlDocument(html):
    """Print one tab-separated index row for a single HTML document.

    The row holds, in order: JPG file name, map ID(s), map tile(s), edition
    and publication year. A field that cannot be found is printed empty.

    html -- the text of one HTML document.
    """
    # Name the parser explicitly so the parse does not depend on which
    # optional parsers happen to be installed next to Beautiful Soup.
    soup = BeautifulSoup(html, "html.parser")

    # Text inside the <menu> tag; if there are several, the last one wins.
    # Start from "" so a document without a <menu> produces empty fields
    # instead of an UnboundLocalError further down.
    menuText = ""
    for menu in soup.find_all("menu"):
        menuText = menu.get_text()

    # href of the link whose anchor text is "250 DPI" (last one wins).
    # Fall back to "" when there is no such link or it has no href, so the
    # regex below always receives a string.
    imageURL = ""
    for link in soup.find_all("a", text="250 DPI"):
        imageURL = link.get('href') or ""

    # Extract just the .jpg file name from the URL. A raw string keeps the
    # pattern unchanged while avoiding the invalid "\/" string escape.
    jpgFile = assignFirstIfExists(
        re.search(r'^.*250dpi\/(.*)&slowConnection', imageURL))

    # Extract the labelled values from the menu text.
    mapId = assignFirstIfExists(
        re.search(r'Map ID\(s\):\s*(.*)', menuText)).strip()
    mapTile = assignFirstIfExists(
        re.search(r'Map Tile\(s\):\s*(.*)', menuText)).strip()
    edition = assignFirstIfExists(
        re.search(r'Edition:\s*(.*)\s*', menuText)).strip()
    publicationYear = assignFirstIfExists(
        re.search(r'Publication Year:\s*(.*)\s*', menuText)).strip()

    # Print the result as tab-delimited values.
    print('\t'.join([jpgFile, mapId, mapTile, edition, publicationYear]))
def main():
    """Split stdin into HTML documents and parse each one.

    stdin may contain many HTML files concatenated together. Lines are
    collected until one starts with '</html>'; the collected text is then
    handed to parseHtmlDocument() and collection starts afresh. Text that
    follows the last '</html>' line is never parsed.
    """
    # Collect lines in a list and join once per document rather than
    # growing a string with repeated concatenation.
    currentLines = []
    for line in sys.stdin:
        currentLines.append(line)
        if line.startswith('</html>'):
            # End of the current document: parse it and reset the buffer.
            parseHtmlDocument("".join(currentLines))
            currentLines = []


# Only consume stdin when run as a script, not when imported.
if __name__ == "__main__":
    main()
## About
# This script will download the maps and associated metadata for 250k
# Geological Maps at http://www.geoscience.gov.au/geoportal-geologicalmaps/
#
# Running `make all` is all that is needed to run this script.
# ## License
# This script is licensed CC0 by Andrew Harvey <andrew.harvey4@gmail.com>
#
# To the extent possible under law, the person who associated CC0
# with this work has waived all copyright and related or neighboring
# rights to this work.
# http://creativecommons.org/publicdomain/zero/1.0/
# None of these targets is the name of a file its recipe creates, so declare
# them phony; otherwise a stray file or directory with the same name would
# make the target look up to date and its recipe would be skipped.
.PHONY: all clean download-html-index parse-jpg-links build-index download-jpg rename-jpg

# Run the whole pipeline from a clean slate.
all: clean download-html-index parse-jpg-links build-index download-jpg rename-jpg

# Fetch the sheet index page and save it as 250k_index.html.
download-html-index:
	wget -O 250k_index.html 'http://www.geoscience.gov.au/cgi-bin/mapserv?mapsize=450+450&mapext=-2200000.000000+-5250000.000000+2100000.000000+-950000.00000&map=%2Fnas%2Fweb%2Fops%2Fprod%2Fapps_www-c%2Fmapserver%2Fgeoportal-geologicalmaps%2Findex.map&mode=itemnquery&layer=map250&qlayer=map250&qitem=qmapname&map_map250_query_template=sheetindex.html&qstring=%2F%2F'

# Pull the 250dpi download paths out of the index page and turn them into
# absolute URLs, one per line, in 250dpi.txt.
# The dot before "jpg" is escaped so it only matches a literal ".".
parse-jpg-links: 250k_index.html
	grep 'download?map' $< | grep -o '/geoportal-geologicalmaps/download?map=250dpi/.*\.jpg' | sed 's/^/http:\/\/www.geoscience.gov.au/' > 250dpi.txt

# Build the tab-separated metadata index from the index page.
build-index: 250k_index.html
	./index_parser.py < $< > 250k_index.tsv

# Download every URL listed in 250dpi.txt into the 250dpi/ directory.
download-jpg: 250dpi.txt
	wget --directory-prefix=250dpi -i $<

# Strip everything up to and including "%2F" from the downloaded file names.
rename-jpg:
	rename 's/^250dpi\/.*%2F/250dpi\//' 250dpi/*.jpg

# Remove every generated file and the download directory.
clean:
	rm -f 250dpi.txt 250k_index.html 250k_index.tsv
	rm -rf 250dpi
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment