mdornseif/georipper.py

## georipper.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Hackish Script to leech Karst/Cave Data from
# http://environnement.wallonie.be/cartosig/index.asp
# See http://cavehackers.de/77347676 for some background

import codecs
import urllib2
import re
import subprocess

# To get decent "GPS" Coordinates:
# gdaltransform -s_srs EPSG:31370 -t_srs EPSG:4326

# Result stores filled by extract().  Both map a normalised interest string
# to the list of record dicts scraped for it.
# out_s: records whose interest string contains u'spéléologique'.
out_s = {}
# out_o: all other records.
out_o = {}


# Patterns for the individual table rows of a karst site sheet.  They are
# compiled once here instead of on every extract() call.
_NAME_RE = re.compile(
    r'Nom :</td>.+?<td align=left wrap class=text >.+?<b>(.*?)</b>.+?</td>',
    re.DOTALL | re.MULTILINE)
_LAMBERT_RE = re.compile(
    r'Coordonn&eacute;es Lambert 72:.+?</td>.+?<td align=left wrap class=text>(.+?)</td>',
    re.DOTALL)
_DESCRIPTION_RE = re.compile(
    r'Description : </td>.+?<td align=left class=text>(.*?)</td>',
    re.DOTALL | re.MULTILINE)
_LAENGE_RE = re.compile(
    r'veloppement.*?:.*?</td>.+?<td align=left class=text>(.*?)</td>',
    re.DOTALL | re.MULTILINE)
_INTERESSE_RE = re.compile(
    r'ts du site.*?:.*?</td>.+?<td align=left class=text>(.*?)</td>',
    re.DOTALL | re.MULTILINE)


def _field(pattern, data):
    """Return the stripped first group of *pattern* in *data*, u'' if absent."""
    match = pattern.search(data)
    if match is None:
        return u''
    return match.group(1).strip()


def _normalise_interest(raw):
    """Turn the raw "interests" cell into a sorted, comma separated word list."""
    text = raw.strip().lower()
    for old, new in (('<i>', ''), ('</i>', ''), ('&', ' '), ('/', ' '),
                     (' et ', ' '), (',', ' ')):
        text = text.replace(old, new)
    words = set(word.strip('.()? ') for word in text.strip('. ').split())
    # Drop tokens that were pure punctuation and the filler word "et".
    words.discard('')
    words.discard('et')
    return ', '.join(sorted(words, reverse=True)) or '?'


def _lambert_to_wgs84(lambert):
    """Convert an "x y" Lambert 72 pair to EPSG:4326 with gdaltransform.

    The coordinates are fed to the tool on stdin instead of being pasted
    into a shell command line, so text scraped from the web page can never
    be interpreted by a shell.

    Returns the first two output values as a (lon, lat) tuple.  Raises
    subprocess.CalledProcessError if gdaltransform exits with an error.
    """
    proc = subprocess.Popen(
        ['gdaltransform', '-s_srs', 'EPSG:31370', '-t_srs', 'EPSG:4326'],
        stdin=subprocess.PIPE, stdout=subprocess.PIPE)
    stdout, _ = proc.communicate((lambert + '\n').encode('latin-1'))
    if proc.returncode != 0:
        raise subprocess.CalledProcessError(proc.returncode, 'gdaltransform')
    (lon, lat, _h) = stdout.split()
    return (lon, lat)


def extract(url):
    """Scrape one karst site sheet and store it in ``out_s`` or ``out_o``.

    Downloads *url*, pulls name, Lambert 72 coordinates, description,
    length and interests out of the HTML, converts the coordinates with
    gdaltransform and appends the record to the module-level ``out_s``
    dictionary (interest string contains u'spéléologique') or to ``out_o``
    (everything else), keyed by the normalised interest string.  The
    stored record is also printed.

    Name and description are stored XML-escaped because they are written
    into a KML document later on.  Rows missing from the sheet yield an
    empty string; a sheet without coordinates is reported and skipped.

    Returns False if the page has no interests row, True otherwise.
    """
    from xml.sax.saxutils import escape

    data = urllib2.urlopen(url).read().replace('&nbsp;', ' ').decode('latin-1')

    # (The original notes mention a further row, "Denivellation : 6.0";
    # it is not extracted.)

    interest = _INTERESSE_RE.search(data)
    if not interest:
        # nothing found
        return False

    # Keep only the numbers of the coordinate cell: collapse whitespace,
    # then blank out the labelling characters around them.
    lambert = ' '.join(_field(_LAMBERT_RE, data).split())
    lambert = re.sub(r'[XY:m,site a\(\)]', ' ', lambert)
    lambert = re.sub(r' +', ' ', lambert).strip()
    if not lambert:
        # The sheet exists but cannot be placed on a map.
        print('no Lambert 72 coordinates found, skipping %s' % url)
        return True
    (lon, lat) = _lambert_to_wgs84(lambert)

    interest = _normalise_interest(interest.group(1))

    record = dict(
        name=escape(' '.join(_field(_NAME_RE, data).split())),
        lambert=lambert,
        gps='%s, %s' % (lon, lat),
        beschreibung=escape(_field(_DESCRIPTION_RE, data)),
        laenge=_field(_LAENGE_RE, data),
        interest=interest,
        url=url,
    )

    if u'spéléologique' in interest:
        out = out_s
    else:
        out = out_o
    out.setdefault(interest, []).append(record)
    print(record)
    return True


## Around Raeren (sheet series 431 and 432)
# for count in range(1, 110):
#    url = 'http://carto1.wallonie.be/documents/Karst/fiche_karst.idc?AKWANUM=431-%03d' % count
#    print "---------"
#    print url
#    extract(url)
# for count in range(1, 74):
#    url = 'http://carto1.wallonie.be/documents/Karst/fiche_karst.idc?AKWANUM=432-%03d' % count
#    print "---------"
#    print url
#    extract(url)

# Everything else: walk the sheet numbers "<a><b>-<count>" and move on to the
# next <a><b> series as soon as extract() reports that a sheet was not found.

for a, b in ((major, minor)
             for major in range(40, 61)
             for minor in range(1, 10)):
    for count in range(1, 500):
        url = (
            'http://carto1.wallonie.be/documents/Karst/fiche_karst.idc?AKWANUM=%d%d-%03d'
            % (a, b, count)
        )
        print('---------')
        print(url)
        if extract(url):
            continue
        break

# http://carto1.wallonie.be/documents/karst/site/42650z.html
# http://carto1.wallonie.be/documents/karst/site/42659z.html
# http://carto1.wallonie.be/documents/karst/site/47744z.html
# http://carto1.wallonie.be/documents/karst/site/47810z.html
# http://carto1.wallonie.be/documents/karst/site/47825z.html
# http://carto1.wallonie.be/documents/karst/site/48217z.html
# http://carto1.wallonie.be/documents/karst/site/48521z.html
# http://carto1.wallonie.be/documents/karst/site/492139z.html
# http://carto1.wallonie.be/documents/karst/site/492151z.html
# http://carto1.wallonie.be/documents/karst/site/492153z.html
# http://carto1.wallonie.be/documents/karst/site/492155z.html
# http://carto1.wallonie.be/documents/karst/site/4926z.html
# http://carto1.wallonie.be/documents/karst/site/49367z.html
# http://carto1.wallonie.be/documents/karst/site/49377z.html
# http://carto1.wallonie.be/documents/karst/site/49382z.html
# http://carto1.wallonie.be/documents/karst/site/496101z.html
# http://carto1.wallonie.be/documents/karst/site/5065z.html
# http://carto1.wallonie.be/documents/karst/site/53442z.html
# http://carto1.wallonie.be/documents/karst/site/538245b.html
# http://carto1.wallonie.be/documents/karst/site/55113Z.html
# http://carto1.wallonie.be/documents/karst/site/5522z.html
# http://carto1.wallonie.be/documents/karst/site/5522z.html
# http://carto1.wallonie.be/documents/karst/site/5555Z.html
# http://carto1.wallonie.be/documents/karst/site/59235z.html

# Generate the KML file: one top-level folder for the speleological sites and
# one for the others, each holding one sub-folder per interest string with a
# placemark for every scraped record.

from xml.sax.saxutils import escape

# Layout of a single placemark; filled from a record dict.
_PLACEMARK = (
    '      <Placemark><name>%(name)s</name>\n'
    '            <description>%(beschreibung)s\n'
    '\n'
    '            %(laenge)s\n'
    '\n'
    '            %(interest)s\n'
    '\n'
    '            %(url)s\n'
    '            </description>\n'
    '            <Point><coordinates>%(gps)s,0</coordinates>\n'
    '            </Point>\n'
    '      </Placemark>\n'
)

# The with-block guarantees the file is flushed and closed, also on errors.
with codecs.open('carto1.wallonie.be.kml', mode='w', encoding='utf-8') as fd:
    fd.write('<?xml version="1.0" encoding="UTF-8"?>\n'
             '<kml xmlns="http://www.opengis.net/kml/2.2">\n'
             '<Document>\n')
    for out, fname in [(out_s, 'Speleo'), (out_o, 'Other')]:
        fd.write('  <Folder><name>%s</name>\n' % fname)
        for kat in sorted(out.keys()):
            fd.write('    <Folder><name>%s</name>' % escape(kat))
            for loc in out[kat]:
                fields = dict(loc)
                # name and beschreibung were already entity-escaped by
                # extract(); escape the remaining text fields here.
                for key in ('laenge', 'interest', 'url'):
                    fields[key] = escape(fields[key])
                # A KML coordinate tuple must not contain spaces, but the
                # record stores "lon, lat".
                fields['gps'] = fields['gps'].replace(' ', '')
                fd.write(_PLACEMARK % fields)
            fd.write('    </Folder>\n')
        fd.write(' </Folder>\n')
    fd.write('</Document>\n</kml>')
	#!/usr/bin/env python
	# -- coding: utf-8 --

	# Hackish Script to leech Karst/Cave Data from
	# http://environnement.wallonie.be/cartosig/index.asp
	# See http://cavehackers.de/77347676 for some background

	import codecs
	import urllib2
	import re
	import subprocess

	# To get decent "GPS" Coordinates:
	# gdaltransform -s_srs EPSG:31370 -t_srs EPSG:4326

	# Result stores filled by extract().  Both map a normalised interest
	# string to the list of record dicts scraped for it.
	# out_s: records whose interest string contains u'spéléologique'.
	out_s = {}
	# out_o: all other records.
	out_o = {}


	# Patterns for the individual table rows of a karst site sheet.  They are
	# compiled once here instead of on every extract() call.
	_NAME_RE = re.compile(
	    r'Nom :</td>.+?<td align=left wrap class=text >.+?<b>(.*?)</b>.+?</td>',
	    re.DOTALL | re.MULTILINE)
	_LAMBERT_RE = re.compile(
	    r'Coordonn&eacute;es Lambert 72:.+?</td>.+?<td align=left wrap class=text>(.+?)</td>',
	    re.DOTALL)
	_DESCRIPTION_RE = re.compile(
	    r'Description : </td>.+?<td align=left class=text>(.*?)</td>',
	    re.DOTALL | re.MULTILINE)
	_LAENGE_RE = re.compile(
	    r'veloppement.*?:.*?</td>.+?<td align=left class=text>(.*?)</td>',
	    re.DOTALL | re.MULTILINE)
	_INTERESSE_RE = re.compile(
	    r'ts du site.*?:.*?</td>.+?<td align=left class=text>(.*?)</td>',
	    re.DOTALL | re.MULTILINE)


	def _field(pattern, data):
	    """Return the stripped first group of *pattern* in *data*, u'' if absent."""
	    match = pattern.search(data)
	    if match is None:
	        return u''
	    return match.group(1).strip()


	def _normalise_interest(raw):
	    """Turn the raw "interests" cell into a sorted, comma separated word list."""
	    text = raw.strip().lower()
	    for old, new in (('<i>', ''), ('</i>', ''), ('&', ' '), ('/', ' '),
	                     (' et ', ' '), (',', ' ')):
	        text = text.replace(old, new)
	    words = set(word.strip('.()? ') for word in text.strip('. ').split())
	    # Drop tokens that were pure punctuation and the filler word "et".
	    words.discard('')
	    words.discard('et')
	    return ', '.join(sorted(words, reverse=True)) or '?'


	def _lambert_to_wgs84(lambert):
	    """Convert an "x y" Lambert 72 pair to EPSG:4326 with gdaltransform.

	    The coordinates are fed to the tool on stdin instead of being pasted
	    into a shell command line, so text scraped from the web page can never
	    be interpreted by a shell.

	    Returns the first two output values as a (lon, lat) tuple.  Raises
	    subprocess.CalledProcessError if gdaltransform exits with an error.
	    """
	    proc = subprocess.Popen(
	        ['gdaltransform', '-s_srs', 'EPSG:31370', '-t_srs', 'EPSG:4326'],
	        stdin=subprocess.PIPE, stdout=subprocess.PIPE)
	    stdout, _ = proc.communicate((lambert + '\n').encode('latin-1'))
	    if proc.returncode != 0:
	        raise subprocess.CalledProcessError(proc.returncode, 'gdaltransform')
	    (lon, lat, _h) = stdout.split()
	    return (lon, lat)


	def extract(url):
	    """Scrape one karst site sheet and store it in ``out_s`` or ``out_o``.

	    Downloads *url*, pulls name, Lambert 72 coordinates, description,
	    length and interests out of the HTML, converts the coordinates with
	    gdaltransform and appends the record to the module-level ``out_s``
	    dictionary (interest string contains u'spéléologique') or to ``out_o``
	    (everything else), keyed by the normalised interest string.  The
	    stored record is also printed.

	    Name and description are stored XML-escaped because they are written
	    into a KML document later on.  Rows missing from the sheet yield an
	    empty string; a sheet without coordinates is reported and skipped.

	    Returns False if the page has no interests row, True otherwise.
	    """
	    from xml.sax.saxutils import escape

	    data = urllib2.urlopen(url).read().replace('&nbsp;', ' ').decode('latin-1')

	    # (The original notes mention a further row, "Denivellation : 6.0";
	    # it is not extracted.)

	    interest = _INTERESSE_RE.search(data)
	    if not interest:
	        # nothing found
	        return False

	    # Keep only the numbers of the coordinate cell: collapse whitespace,
	    # then blank out the labelling characters around them.
	    lambert = ' '.join(_field(_LAMBERT_RE, data).split())
	    lambert = re.sub(r'[XY:m,site a\(\)]', ' ', lambert)
	    lambert = re.sub(r' +', ' ', lambert).strip()
	    if not lambert:
	        # The sheet exists but cannot be placed on a map.
	        print('no Lambert 72 coordinates found, skipping %s' % url)
	        return True
	    (lon, lat) = _lambert_to_wgs84(lambert)

	    interest = _normalise_interest(interest.group(1))

	    record = dict(
	        name=escape(' '.join(_field(_NAME_RE, data).split())),
	        lambert=lambert,
	        gps='%s, %s' % (lon, lat),
	        beschreibung=escape(_field(_DESCRIPTION_RE, data)),
	        laenge=_field(_LAENGE_RE, data),
	        interest=interest,
	        url=url,
	    )

	    if u'spéléologique' in interest:
	        out = out_s
	    else:
	        out = out_o
	    out.setdefault(interest, []).append(record)
	    print(record)
	    return True


	## Um Raeren
	# for count in range(1, 110):
	# url = 'http://carto1.wallonie.be/documents/Karst/fiche_karst.idc?AKWANUM=431-%03d' % count
	# print "---------"
	# print url
	# extract(url)
	# for count in range(1, 74):
	# url = 'http://carto1.wallonie.be/documents/Karst/fiche_karst.idc?AKWANUM=432-%03d' % count
	# print "---------"
	# print url
	# extract(url)

	# Everything else: walk the sheet numbers "<a><b>-<count>" and move on to
	# the next <a><b> series as soon as extract() reports a missing sheet.

	for a in range(40, 61):
	    for b in range(1, 10):
	        for count in range(1, 500):
	            url = (
	                'http://carto1.wallonie.be/documents/Karst/fiche_karst.idc?AKWANUM=%d%d-%03d'
	                % (a, b, count)
	            )
	            print('---------')
	            print(url)
	            if not extract(url):
	                break

	# http://carto1.wallonie.be/documents/karst/site/42650z.html
	# http://carto1.wallonie.be/documents/karst/site/42659z.html
	# http://carto1.wallonie.be/documents/karst/site/47744z.html
	# http://carto1.wallonie.be/documents/karst/site/47810z.html
	# http://carto1.wallonie.be/documents/karst/site/47825z.html
	# http://carto1.wallonie.be/documents/karst/site/48217z.html
	# http://carto1.wallonie.be/documents/karst/site/48521z.html
	# http://carto1.wallonie.be/documents/karst/site/492139z.html
	# http://carto1.wallonie.be/documents/karst/site/492151z.html
	# http://carto1.wallonie.be/documents/karst/site/492153z.html
	# http://carto1.wallonie.be/documents/karst/site/492155z.html
	# http://carto1.wallonie.be/documents/karst/site/4926z.html
	# http://carto1.wallonie.be/documents/karst/site/49367z.html
	# http://carto1.wallonie.be/documents/karst/site/49377z.html
	# http://carto1.wallonie.be/documents/karst/site/49382z.html
	# http://carto1.wallonie.be/documents/karst/site/496101z.html
	# http://carto1.wallonie.be/documents/karst/site/5065z.html
	# http://carto1.wallonie.be/documents/karst/site/53442z.html
	# http://carto1.wallonie.be/documents/karst/site/538245b.html
	# http://carto1.wallonie.be/documents/karst/site/55113Z.html
	# http://carto1.wallonie.be/documents/karst/site/5522z.html
	# http://carto1.wallonie.be/documents/karst/site/5522z.html
	# http://carto1.wallonie.be/documents/karst/site/5555Z.html
	# http://carto1.wallonie.be/documents/karst/site/59235z.html

	# Generate the KML file: one top-level folder for the speleological sites
	# and one for the others, each holding one sub-folder per interest string
	# with a placemark for every scraped record.

	from xml.sax.saxutils import escape

	# Layout of a single placemark; filled from a record dict.  Written as
	# separate literals so source indentation cannot leak into the output.
	_PLACEMARK = (
	    '      <Placemark><name>%(name)s</name>\n'
	    '            <description>%(beschreibung)s\n'
	    '\n'
	    '            %(laenge)s\n'
	    '\n'
	    '            %(interest)s\n'
	    '\n'
	    '            %(url)s\n'
	    '            </description>\n'
	    '            <Point><coordinates>%(gps)s,0</coordinates>\n'
	    '            </Point>\n'
	    '      </Placemark>\n'
	)

	# The with-block guarantees the file is flushed and closed, also on errors.
	with codecs.open('carto1.wallonie.be.kml', mode='w', encoding='utf-8') as fd:
	    fd.write('<?xml version="1.0" encoding="UTF-8"?>\n'
	             '<kml xmlns="http://www.opengis.net/kml/2.2">\n'
	             '<Document>\n')
	    for out, fname in [(out_s, 'Speleo'), (out_o, 'Other')]:
	        fd.write('  <Folder><name>%s</name>\n' % fname)
	        for kat in sorted(out.keys()):
	            fd.write('    <Folder><name>%s</name>' % escape(kat))
	            for loc in out[kat]:
	                fields = dict(loc)
	                # name and beschreibung were already entity-escaped by
	                # extract(); escape the remaining text fields here.
	                for key in ('laenge', 'interest', 'url'):
	                    fields[key] = escape(fields[key])
	                # A KML coordinate tuple must not contain spaces, but the
	                # record stores "lon, lat".
	                fields['gps'] = fields['gps'].replace(' ', '')
	                fd.write(_PLACEMARK % fields)
	            fd.write('    </Folder>\n')
	        fd.write(' </Folder>\n')
	    fd.write('</Document>\n</kml>')