Skip to content

Instantly share code, notes, and snippets.

@mdornseif
Created May 27, 2012 16:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mdornseif/2814922 to your computer and use it in GitHub Desktop.
Save mdornseif/2814922 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Hackish Script to leech Karst/Cave Data from
# http://environnement.wallonie.be/cartosig/index.asp
# See http://cavehackers.de/77347676 for some background
import codecs
import urllib2
import re
import subprocess
# To get decent "GPS" Coordinates:
# gdaltransform -s_srs EPSG:31370 -t_srs EPSG:4326
# Scraped site records, keyed by their normalised "interest" string; every
# value is a list of per-site dicts appended by extract().
#   out_s -- categories whose interest string contains u'spéléologique'
#   out_o -- all remaining categories
out_s, out_o = {}, {}
# Patterns for the individual fields of a site page.  They are compiled once
# at import time; DOTALL lets ``.`` span the line breaks between table cells.
_NAME_RE = re.compile(
    r'Nom :</td>.+?<td align=left wrap class=text >.+?<b>(.*?)</b>.+?</td>',
    re.DOTALL)
_LAMBERT_RE = re.compile(
    r'Coordonn&eacute;es Lambert 72:.+?</td>.+?<td align=left wrap class=text>(.+?)</td>',
    re.DOTALL)
_DESCRIPTION_RE = re.compile(
    r'Description : </td>.+?<td align=left class=text>(.*?)</td>',
    re.DOTALL)
_LENGTH_RE = re.compile(
    r'veloppement.*?:.*?</td>.+?<td align=left class=text>(.*?)</td>',
    re.DOTALL)
_INTEREST_RE = re.compile(
    r'ts du site.*?:.*?</td>.+?<td align=left class=text>(.*?)</td>',
    re.DOTALL)

# Command that converts Belgian Lambert 72 (EPSG:31370) to WGS84 (EPSG:4326).
_GDALTRANSFORM = ['gdaltransform', '-s_srs', 'EPSG:31370', '-t_srs', 'EPSG:4326']


def _field(pattern, text):
    """Return the stripped first group of *pattern* in *text*, or u'' if absent."""
    found = pattern.search(text)
    return found.group(1).strip() if found else u''


def _xml_escape(text):
    """Escape ``&``, ``<`` and ``>`` so *text* is valid XML character data."""
    # '&' must be handled first, otherwise the entities produced for '<' and
    # '>' would be escaped a second time.
    return text.replace('&', '&amp;').replace('<', '&lt;').replace('>', '&gt;')


def _normalise_interest(raw):
    """Turn the free-text interest cell into a sorted, comma-joined word list."""
    text = raw.strip().lower()
    text = text.replace('<i>', '').replace('</i>', '')
    for separator in ('&', '/', ','):
        text = text.replace(separator, ' ')
    # Drop punctuation around each word, then discard empty words and the
    # French conjunction "et"; the set removes duplicates.
    words = {word.strip('.()? ') for word in text.split()} - {'', 'et'}
    return ', '.join(sorted(words, reverse=True)) or '?'


def _lambert_to_wgs84(lambert):
    """Convert an 'x y' Lambert 72 string to a (lon, lat) pair of strings."""
    # The coordinates are passed on stdin instead of being interpolated into
    # a shell command line, so scraped text can never be executed.
    proc = subprocess.Popen(_GDALTRANSFORM, stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE)
    stdout, _ = proc.communicate((lambert + u'\n').encode('utf-8'))
    if proc.returncode != 0:
        raise subprocess.CalledProcessError(proc.returncode, _GDALTRANSFORM)
    fields = stdout.decode('ascii', 'replace').split()
    if len(fields) < 2:
        raise ValueError('unexpected gdaltransform output for %r: %r'
                         % (lambert, stdout))
    return fields[0], fields[1]


def extract(url):
    """Fetch one karst site page, parse it and file the record in out_s/out_o.

    The page at *url* is downloaded, decoded as latin-1 and searched for the
    site's name, Lambert 72 coordinates, description, development (length)
    and interest cells.  The coordinates are converted to EPSG:4326 with the
    external ``gdaltransform`` tool.

    The record is a dict with the keys ``name``, ``lambert``, ``gps``
    (``'lon, lat'``), ``beschreibung``, ``laenge``, ``interest`` and ``url``.
    ``name``, ``beschreibung`` and ``laenge`` are stored XML-escaped.  The
    record is appended to ``out_s[interest]`` when the interest string
    contains u'spéléologique', otherwise to ``out_o[interest]``, and printed.

    Returns:
        False when the page has no interest cell (treated as "no such site"),
        True otherwise.  A page that has an interest cell but no usable
        coordinates is reported and skipped, and still returns True.

    Raises:
        subprocess.CalledProcessError: ``gdaltransform`` exited non-zero.
        ValueError: ``gdaltransform`` printed fewer than two values.
        Errors raised by ``urllib2.urlopen`` propagate unchanged.
    """
    response = urllib2.urlopen(url)
    try:
        raw = response.read()
    finally:
        response.close()
    # Normalise non-breaking spaces (HTML entity or raw latin-1 0xA0) to plain
    # spaces so that the literal spaces in the patterns above can match.
    data = raw.decode('latin-1').replace(u'&nbsp;', u' ').replace(u'\xa0', u' ')

    found = _INTEREST_RE.search(data)
    if not found:
        # nothing found
        return False
    interest = _normalise_interest(found.group(1))

    # Strip the "X:", "Y:", "m" and "(site a ...)" decoration around the
    # numbers and collapse the remaining whitespace.
    lambert = re.sub(r'[XY:m,site a\(\)]', ' ', _field(_LAMBERT_RE, data))
    lambert = ' '.join(lambert.split())
    if not lambert:
        print('skipping %s: no Lambert 72 coordinates found' % url)
        return True
    lon, lat = _lambert_to_wgs84(lambert)

    name = ' '.join(_field(_NAME_RE, data).split()) or u'?'
    record = dict(
        name=_xml_escape(name),
        lambert=lambert,
        gps='%s, %s' % (lon, lat),
        beschreibung=_xml_escape(_field(_DESCRIPTION_RE, data)),
        laenge=_xml_escape(_field(_LENGTH_RE, data)),
        interest=interest,
        url=url,
    )

    # u'sp\xe9l\xe9ologique' is u'spéléologique', spelled with escapes so the
    # comparison does not depend on the encoding of this source file.
    out = out_s if u'sp\xe9l\xe9ologique' in interest else out_o
    out.setdefault(interest, []).append(record)
    print(record)
    return True
## Around Raeren (AKWANUM series 431 and 432)
# for count in range(1, 110):
# url = 'http://carto1.wallonie.be/documents/Karst/fiche_karst.idc?AKWANUM=431-%03d' % count
# print "---------"
# print url
# extract(url)
# for count in range(1, 74):
# url = 'http://carto1.wallonie.be/documents/Karst/fiche_karst.idc?AKWANUM=432-%03d' % count
# print "---------"
# print url
# extract(url)
# Everything else: probe the AKWANUM ids 401-001 .. 609-499.  Within each
# prefix the numbering is walked upwards until extract() reports a page
# without a site record, at which point the next prefix is tried.
base_url = \
    'http://carto1.wallonie.be/documents/Karst/fiche_karst.idc?AKWANUM=%d%d-%03d'
for a in range(40, 61):
    for b in range(1, 10):
        for count in range(1, 500):
            url = base_url % (a, b, count)
            print('---------')
            print(url)
            found = extract(url)
            if not found:
                # Only leaves the innermost loop, i.e. ends this prefix.
                break
# http://carto1.wallonie.be/documents/karst/site/42650z.html
# http://carto1.wallonie.be/documents/karst/site/42659z.html
# http://carto1.wallonie.be/documents/karst/site/47744z.html
# http://carto1.wallonie.be/documents/karst/site/47810z.html
# http://carto1.wallonie.be/documents/karst/site/47825z.html
# http://carto1.wallonie.be/documents/karst/site/48217z.html
# http://carto1.wallonie.be/documents/karst/site/48521z.html
# http://carto1.wallonie.be/documents/karst/site/492139z.html
# http://carto1.wallonie.be/documents/karst/site/492151z.html
# http://carto1.wallonie.be/documents/karst/site/492153z.html
# http://carto1.wallonie.be/documents/karst/site/492155z.html
# http://carto1.wallonie.be/documents/karst/site/4926z.html
# http://carto1.wallonie.be/documents/karst/site/49367z.html
# http://carto1.wallonie.be/documents/karst/site/49377z.html
# http://carto1.wallonie.be/documents/karst/site/49382z.html
# http://carto1.wallonie.be/documents/karst/site/496101z.html
# http://carto1.wallonie.be/documents/karst/site/5065z.html
# http://carto1.wallonie.be/documents/karst/site/53442z.html
# http://carto1.wallonie.be/documents/karst/site/538245b.html
# http://carto1.wallonie.be/documents/karst/site/55113Z.html
# http://carto1.wallonie.be/documents/karst/site/5522z.html
# http://carto1.wallonie.be/documents/karst/site/5522z.html
# http://carto1.wallonie.be/documents/karst/site/5555Z.html
# http://carto1.wallonie.be/documents/karst/site/59235z.html
# Generate the KML file: one top-level folder per group (Speleo / Other),
# inside it one folder per interest category, and one placemark per site.
# The ``with`` block guarantees the file is flushed and closed even if
# writing fails part-way through.
with codecs.open('carto1.wallonie.be.kml', mode='w', encoding='utf-8') as fd:
    fd.write('''<?xml version="1.0" encoding="UTF-8"?>
<kml xmlns="http://www.opengis.net/kml/2.2">
<Document>
''')
    for out, fname in [(out_s, 'Speleo'), (out_o, 'Other')]:
        fd.write(''' <Folder><name>%s</name>\n''' % fname)
        for kat in sorted(out.keys()):
            fd.write(''' <Folder><name>%s</name>''' % kat)
            for loc in out[kat]:
                # The stored gps value is 'lon, lat', but KML does not allow
                # whitespace inside a coordinate tuple (whitespace separates
                # tuples), so the space is removed for the output only.
                placemark = dict(loc, gps=loc['gps'].replace(' ', ''))
                fd.write(''' <Placemark><name>%(name)s</name>
<description>%(beschreibung)s
%(laenge)s
%(interest)s
%(url)s
</description>
<Point><coordinates>%(gps)s,0</coordinates>
</Point>\n </Placemark>\n''' % placemark)
            fd.write(' </Folder>\n')
        fd.write(' </Folder>\n')
    fd.write('</Document>\n</kml>')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment