tcabrol/tax_data.py

## tax_data.py
#!/usr/bin/env python
# encoding: utf-8
"""
tax_data.py

Created by Thomas Cabrol on 2012-02-12.
"""


import re
import os
import urllib
from BeautifulSoup import BeautifulSoup
import xlrd
import simplejson
import sys
import codecs
from datetime import datetime

# Set some constants....
RAW_DATA_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'raw_data')
DATA_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data')

class Downloader(object):
	''' Pure hackery to crawl data.gouv.fr search results
	and download the files of interest. Please do not try to use it as is for anything else
	than the Tax data where are looking for ! '''

	def __init__(self):
		self.base_url = "http://www.data.gouv.fr/"
		self.search_url = "content/search/(offset)/"
		self.search_string = "?SearchText=&Type=data&Contexte=q%3Dtype%253Adata%26add_hit_meta%3Dhtml_simple_view%2540html_simple_view%26sort_ascending%3D0%26r%3DTop%252Fprimary_producer%252Fministere%2Bdu%2Bbudget%252C%2Bdes%2Bcomptes%2Bpublics%2Bet%2Bde%2Bla%2Breforme%2Bde%2Bl%2527etat%26r%3DTop%252Fkeywords%252Fimpot%2Bsur%2Ble%2Brevenu%26r%3DTop%252Fyear_interval%252F2009&Facet=Top/year_interval/2009"

	def get_files(self, max_offset):
		''' Returns a list of all Excel files to download '''
		self.files = []
		for offset in xrange(0, max_offset, 10):
			search_full_url = self.base_url + self.search_url + str(offset) + self.search_string
			html = urllib.urlopen(search_full_url).read()
			soup = BeautifulSoup(html)
			for data in soup.findAll('p', { 'class' : 'download' }):
				link = data('a')[0]['href']
				if '.xls' in link:
					if link not in self.files:
						self.files.append(link)
		print >>sys.stdout, "%i files found...\n" % len(self.files)
		return self.files

	def download(self):
		''' Actually download the files '''
		if not os.path.isdir(RAW_DATA_DIR):
			os.makedir(RAW_DATA_DIR)
		for xl_file in self.get_files(200):
			xl_file_url = self.base_url + xl_file
			xl_file_name = xl_file.split('/')[-1]
			xl_file_local = os.path.join(RAW_DATA_DIR, xl_file_name)
			print >>sys.stdout, "Downloading %s..." % xl_file_url
			urllib.urlretrieve(xl_file_url, xl_file_local)


class ExcelExtractor(object):
	''' Simple class that takes data from the source xls
	files and append it into a tsv file '''

	def __init__(self):
		''' Instantiate the object with some values related to
		the schema of the xls files '''
                if not os.path.isdir(DATA_DIR):
			os.makedir(DATA_DIR)
		self.outFile = os.path.join(DATA_DIR, 'raw_tax_data.tsv')
		self.sheet_index = 0
		self.min_row = 27
		self.max_row = 20000
		self.min_col = 1
		self.max_col = 14
		self.columns_map = {
		  	0 : {"description" : u"DEP", "name" : u"dep"} ,
		  	1 : {"description" : u"Commune", "name" : u"commune"} ,
			2 : {"description" : u"Libellé de la name", "name" : u"label_commune"} ,
		  	3 : {"description" : u"Revenu fiscal de référence par tranche (en euros)", "name" : u"segment_fiscal_revenue"} ,
		  	4 : {"description" : u"Nombre de foyers fiscaux", "name" : u"count_fiscal_hh"} ,
		  	5 : {"description" : u"Revenu fiscal de référence des foyers fiscaux", "name" : u"ref_fiscal_rev_fiscal_hh"} ,
		  	6 : {"description" : u"Impôt net (total)", "name" : u"net_tax"} ,
		  	7 : {"description" : u"Nombre de foyers fiscaux imposables", "name" : u"count_fiscal_hh_taxable"} ,
		  	8 : {"description" : u"Revenu fiscal de référence des foyers fiscaux imposables", "name" : u"ref_fiscal_rev_fiscal_hh_taxable"} ,
		  	9 : {"description" : u"Traitements et salaires - Nombre de foyers concernés", "name" : u"salaries_actual_hh_count"} ,
		  	10: {"description" : u"Traitements et salaires - Montant", "name" : u"salaries_amount"} ,
		  	11: {"description" : u"Retraites et pensions - Nombre de foyers concernés", "name" : u"retirement_actual_hh_count"} ,
		  	12: {"description" : u"Retraites et pensions - Montant", "name" : u"retirement_amount"}
		}

	def list_xls(self):
		''' Returns the list of xls files to be parsed '''
		return [f for f in os.listdir(RAW_DATA_DIR) if re.match('^(\d{3})(.xls)', f)]

	def printable(self, item):
		if isinstance(item, basestring):
			return item
		return str(item)

	def xls_to_tsv(self):
		t = codecs.open(self.outFile, 'w', 'utf-8')
		t.write('\t'.join(v['description'] for k, v in e.columns_map.items()) + '\n')
		for infile in self.list_xls():
			try:
				excel = xlrd.open_workbook(os.path.join(RAW_DATA_DIR, infile))
			except:
				print >>sys.stdout, "Failed to open %s..." % infile
			sheet = excel.sheet_by_index(self.sheet_index)
			for row in xrange(self.min_row, self.max_row):
				try:
					record = sheet.row_values(row, start_colx =  self.min_col, end_colx = self.max_col)
					if not len(record) == len(self.columns_map):
						print >>sys.stdout, "Line does not match : %s" % record
					t.write('\t'.join(map(self.printable, record)) + '\n')
				except:
					#print >>sys.stdout, sys.exc_info()[1]
					pass
		t.close()


if __name__ == '__main__':
	print "%s : Process starts..." % datetime.now()
	d = Downloader()
	d.download()
	e = ExcelExtractor()
	e.xls_to_tsv()
	print "%s : Process ends..." % datetime.now()
	#!/usr/bin/env python
	# encoding: utf-8
	"""
	tax_data.py

	Created by Thomas Cabrol on 2012-02-12.
	"""


	import re
	import os
	import urllib
	from BeautifulSoup import BeautifulSoup
	import xlrd
	import simplejson
	import sys
	import codecs
	from datetime import datetime

	# Set some constants....
	RAW_DATA_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'raw_data')
	DATA_DIR = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'data')

	class Downloader(object):
	''' Pure hackery to crawl data.gouv.fr search results
	and download the files of interest. Please do not try to use it as is for anything else
	than the Tax data where are looking for ! '''

	def __init__(self):
	self.base_url = "http://www.data.gouv.fr/"
	self.search_url = "content/search/(offset)/"
	self.search_string = "?SearchText=&Type=data&Contexte=q%3Dtype%253Adata%26add_hit_meta%3Dhtml_simple_view%2540html_simple_view%26sort_ascending%3D0%26r%3DTop%252Fprimary_producer%252Fministere%2Bdu%2Bbudget%252C%2Bdes%2Bcomptes%2Bpublics%2Bet%2Bde%2Bla%2Breforme%2Bde%2Bl%2527etat%26r%3DTop%252Fkeywords%252Fimpot%2Bsur%2Ble%2Brevenu%26r%3DTop%252Fyear_interval%252F2009&Facet=Top/year_interval/2009"

	def get_files(self, max_offset):
	''' Returns a list of all Excel files to download '''
	self.files = []
	for offset in xrange(0, max_offset, 10):
	search_full_url = self.base_url + self.search_url + str(offset) + self.search_string
	html = urllib.urlopen(search_full_url).read()
	soup = BeautifulSoup(html)
	for data in soup.findAll('p', { 'class' : 'download' }):
	link = data('a')[0]['href']
	if '.xls' in link:
	if link not in self.files:
	self.files.append(link)
	print >>sys.stdout, "%i files found...\n" % len(self.files)
	return self.files

	def download(self):
	''' Actually download the files '''
	if not os.path.isdir(RAW_DATA_DIR):
	os.makedir(RAW_DATA_DIR)
	for xl_file in self.get_files(200):
	xl_file_url = self.base_url + xl_file
	xl_file_name = xl_file.split('/')[-1]
	xl_file_local = os.path.join(RAW_DATA_DIR, xl_file_name)
	print >>sys.stdout, "Downloading %s..." % xl_file_url
	urllib.urlretrieve(xl_file_url, xl_file_local)


	class ExcelExtractor(object):
	''' Simple class that takes data from the source xls
	files and append it into a tsv file '''

	def __init__(self):
	''' Instantiate the object with some values related to
	the schema of the xls files '''
	if not os.path.isdir(DATA_DIR):
	os.makedir(DATA_DIR)
	self.outFile = os.path.join(DATA_DIR, 'raw_tax_data.tsv')
	self.sheet_index = 0
	self.min_row = 27
	self.max_row = 20000
	self.min_col = 1
	self.max_col = 14
	self.columns_map = {
	0 : {"description" : u"DEP", "name" : u"dep"} ,
	1 : {"description" : u"Commune", "name" : u"commune"} ,
	2 : {"description" : u"Libellé de la name", "name" : u"label_commune"} ,
	3 : {"description" : u"Revenu fiscal de référence par tranche (en euros)", "name" : u"segment_fiscal_revenue"} ,
	4 : {"description" : u"Nombre de foyers fiscaux", "name" : u"count_fiscal_hh"} ,
	5 : {"description" : u"Revenu fiscal de référence des foyers fiscaux", "name" : u"ref_fiscal_rev_fiscal_hh"} ,
	6 : {"description" : u"Impôt net (total)", "name" : u"net_tax"} ,
	7 : {"description" : u"Nombre de foyers fiscaux imposables", "name" : u"count_fiscal_hh_taxable"} ,
	8 : {"description" : u"Revenu fiscal de référence des foyers fiscaux imposables", "name" : u"ref_fiscal_rev_fiscal_hh_taxable"} ,
	9 : {"description" : u"Traitements et salaires - Nombre de foyers concernés", "name" : u"salaries_actual_hh_count"} ,
	10: {"description" : u"Traitements et salaires - Montant", "name" : u"salaries_amount"} ,
	11: {"description" : u"Retraites et pensions - Nombre de foyers concernés", "name" : u"retirement_actual_hh_count"} ,
	12: {"description" : u"Retraites et pensions - Montant", "name" : u"retirement_amount"}
	}

	def list_xls(self):
	''' Returns the list of xls files to be parsed '''
	return [f for f in os.listdir(RAW_DATA_DIR) if re.match('^(\d{3})(.xls)', f)]

	def printable(self, item):
	if isinstance(item, basestring):
	return item
	return str(item)

	def xls_to_tsv(self):
	t = codecs.open(self.outFile, 'w', 'utf-8')
	t.write('\t'.join(v['description'] for k, v in e.columns_map.items()) + '\n')
	for infile in self.list_xls():
	try:
	excel = xlrd.open_workbook(os.path.join(RAW_DATA_DIR, infile))
	except:
	print >>sys.stdout, "Failed to open %s..." % infile
	sheet = excel.sheet_by_index(self.sheet_index)
	for row in xrange(self.min_row, self.max_row):
	try:
	record = sheet.row_values(row, start_colx = self.min_col, end_colx = self.max_col)
	if not len(record) == len(self.columns_map):
	print >>sys.stdout, "Line does not match : %s" % record
	t.write('\t'.join(map(self.printable, record)) + '\n')
	except:
	#print >>sys.stdout, sys.exc_info()[1]
	pass
	t.close()


	if __name__ == '__main__':
	print "%s : Process starts..." % datetime.now()
	d = Downloader()
	d.download()
	e = ExcelExtractor()
	e.xls_to_tsv()
	print "%s : Process ends..." % datetime.now()