Skip to content

Instantly share code, notes, and snippets.

@tcabrol
Created February 6, 2012 23:05
Show Gist options
  • Save tcabrol/1755738 to your computer and use it in GitHub Desktop.
Save tcabrol/1755738 to your computer and use it in GitHub Desktop.
French Open Data :: getting Tax data
#!/usr/bin/env python
# encoding: utf-8
"""
open_data.py
Created by Thomas Cabrol on 2012-01-27.
"""
import re
import os
import urllib
from BeautifulSoup import BeautifulSoup
import xlrd
import simplejson
import sys
import codecs
from datetime import datetime
import csv
from pandas import *
# Working directories, both located next to this script (symlinks in the
# script path are resolved first). RAW_DATA_DIR is where the downloaded
# spreadsheets are stored.
RAW_DATA_DIR, DATA_DIR = (
    os.path.join(os.path.dirname(os.path.realpath(__file__)), folder)
    for folder in ('raw_data', 'data')
)
class Downloader(object):
    ''' Crawl the data.gouv.fr search results and download the Excel files.

    Pure hackery: the search query (producer, keyword and year facets) is
    baked into ``search_string``. Please do not try to use it as is for
    anything else than the Tax data we are looking for!
    '''

    def __init__(self):
        self.base_url = "http://www.data.gouv.fr/"
        self.search_url = "content/search/(offset)/"
        self.search_string = "?SearchText=&Type=data&Contexte=q%3Dtype%253Adata%26add_hit_meta%3Dhtml_simple_view%2540html_simple_view%26sort_ascending%3D0%26r%3DTop%252Fprimary_producer%252Fministere%2Bdu%2Bbudget%252C%2Bdes%2Bcomptes%2Bpublics%2Bet%2Bde%2Bla%2Breforme%2Bde%2Bl%2527etat%26r%3DTop%252Fkeywords%252Fimpot%2Bsur%2Ble%2Brevenu%26r%3DTop%252Fyear_interval%252F2009&Facet=Top/year_interval/2009"
        # Links collected by the most recent get_files() call; defined here
        # so the attribute exists even before the first crawl.
        self.files = []

    def get_files(self, max_offset):
        ''' Return a list of all Excel files to download.

        Fetches the search result pages for offsets 0, 10, 20, ... (strictly
        below ``max_offset``) and collects the first link of every
        ``<p class="download">`` element whose href contains '.xls'.
        Duplicates are dropped and first-seen order is kept. The list is
        also stored on ``self.files``.
        '''
        self.files = []
        # Step of 10: presumably the number of results per page -- confirm.
        for offset in xrange(0, max_offset, 10):
            search_full_url = self.base_url + self.search_url + str(offset) + self.search_string
            response = urllib.urlopen(search_full_url)
            try:
                html = response.read()
            finally:
                # Release the connection even when the read fails.
                response.close()
            soup = BeautifulSoup(html)
            for data in soup.findAll('p', { 'class' : 'download' }):
                anchors = data('a')
                if not anchors:
                    # Download paragraph without any link: nothing to fetch.
                    continue
                link = anchors[0].get('href')
                if link and '.xls' in link and link not in self.files:
                    self.files.append(link)
        print >>sys.stdout, "%i files found...\n" % len(self.files)
        return self.files

    def download(self, max_offset=200):
        ''' Actually download the files into RAW_DATA_DIR.

        ``max_offset`` is the exclusive upper bound of the search offsets
        crawled by get_files(); it defaults to the previously hard-coded 200.
        Each file is saved under the last path component of its link.
        '''
        if not os.path.isdir(RAW_DATA_DIR):
            # The os module has no "makedir"; makedirs also creates any
            # missing parent directory.
            os.makedirs(RAW_DATA_DIR)
        for xl_file in self.get_files(max_offset):
            # NOTE(review): plain concatenation assumes the hrefs are
            # relative to the site root -- confirm against the live pages.
            xl_file_url = self.base_url + xl_file
            xl_file_name = xl_file.split('/')[-1]
            xl_file_local = os.path.join(RAW_DATA_DIR, xl_file_name)
            print >>sys.stdout, "Downloading %s..." % xl_file_url
            urllib.urlretrieve(xl_file_url, xl_file_local)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment