Created October 15, 2012 10:55
Extract Associations from nimes.fr
#!/usr/local/bin/python
# -*- coding: utf-8 -*-
import urllib2
import BeautifulSoup
import csv

user_agent = ('Mozilla/5.0 (X11; U; Linux i686) '
              'Gecko/20071127 Firefox/2.0.0.11')
origin_site = ('http://typo3.nimes.fr/index.php?'
               'id=annuaire_assos&theme=0&rech=&num_page=%s')
# the directory spans 35 pages, so range(1, 36) yields page numbers 1..35
pages = range(1, 36)
# collects one row per association for the final CSV
assoc_table = []
for page_no in pages:
    url = origin_site % page_no
    try:
        # identify ourselves with the user agent string defined above
        doc = urllib2.urlopen(urllib2.Request(url,
                              headers={'User-Agent': user_agent}))
    except urllib2.URLError:
        continue
    # parse the listing page
    soup = BeautifulSoup.BeautifulSoup(doc)
    for row in soup.findAll('tr', {'class': 'menu2'}):
        row_soup = BeautifulSoup.BeautifulSoup(row.renderContents())
        # the Thème sits in a <u> tag and starts this association's row
        assoc_data = [assoc_theme.renderContents()
                      for assoc_theme in row_soup.findAll('u')]
        # the Nom de l'association sits in the 70%-wide cell
        assoc_data.extend(assoc_name.renderContents()
                          for assoc_name in row_soup.findAll('td', {'width': '70%'}))
        # follow the links to the individual detail pages; the target URL
        # is embedded in the onclick handler, between single quotes
        for link in row_soup.findAll('a', {'href': '#'}):
            if 'associations' not in link['onclick']:
                continue
            detail_url = link['onclick'].split("'")[1]
            try:
                doc = urllib2.urlopen(urllib2.Request(detail_url,
                                      headers={'User-Agent': user_agent}))
            except urllib2.URLError:
                continue
            # take a snapshot of the detail page
            detail = BeautifulSoup.BeautifulSoup(doc)
            # extract email and web address from the anchors on the page
            emails = []
            web_sites = []
            for tag in detail.recursiveChildGenerator():
                if isinstance(tag, BeautifulSoup.Tag) and tag.name == 'a':
                    assoc_link = tag.string
                    if '@' in str(assoc_link):
                        # sometimes there is more than one email address,
                        # so collect them all and use only the first
                        emails.append(assoc_link)
                    if 'http' in str(assoc_link):
                        web_sites.append(assoc_link)
            if emails:
                assoc_data.append(emails[0])
            else:
                assoc_data.append('pas de email')
            if web_sites:
                assoc_data.append(web_sites[0])
            else:
                assoc_data.append('pas de site web')
            # the 49%-wide cells hold the address, the president's name
            # and the post code
            assoc_data.extend(assoc_cont.renderContents() for assoc_cont in
                              detail.findAll('td', {'width': '49%', 'class': 'menu2'}))
            # the 45%-wide cells hold three entries; only the first
            # (the telephone number) is needed
            assoc_tel = detail.findAll('td', {'width': '45%', 'class': 'menu2'})
            assoc_data.append(assoc_tel[0].renderContents())
        print assoc_data
        assoc_table.append(assoc_data)
# write the collected rows to .csv
with open('nimes_assoc.csv', 'wb') as f:
    csv.writer(f).writerows(assoc_table)
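
For anyone revisiting this in a modern environment, here is a minimal sketch of the same listing-page scrape ported to Python 3. The use of requests and BeautifulSoup 4 is my assumption (the gist itself targets Python 2 and BeautifulSoup 3), and the selectors assume the 2012-era nimes.fr markup (tr.menu2 rows, a <u> tag for the theme, a 70%-wide name cell) is unchanged:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Python 3 sketch of the listing-page scrape above; assumes the
# requests and beautifulsoup4 packages and the original 2012 markup.
import csv

import requests
from bs4 import BeautifulSoup

USER_AGENT = ('Mozilla/5.0 (X11; U; Linux i686) '
              'Gecko/20071127 Firefox/2.0.0.11')
ORIGIN_SITE = ('http://typo3.nimes.fr/index.php?'
               'id=annuaire_assos&theme=0&rech=&num_page=%s')

assoc_table = []
for page_no in range(1, 36):
    try:
        resp = requests.get(ORIGIN_SITE % page_no,
                            headers={'User-Agent': USER_AGENT}, timeout=30)
        resp.raise_for_status()
    except requests.RequestException:
        continue
    soup = BeautifulSoup(resp.text, 'html.parser')
    for row in soup.find_all('tr', class_='menu2'):
        # theme from the <u> tag, name from the 70%-wide cell, as above
        assoc_data = [u.get_text(strip=True) for u in row.find_all('u')]
        assoc_data.extend(td.get_text(strip=True)
                          for td in row.find_all('td', attrs={'width': '70%'}))
        assoc_table.append(assoc_data)

with open('nimes_assoc.csv', 'w', newline='') as f:
    csv.writer(f).writerows(assoc_table)

The detail-page hop (pulling each URL out of the onclick attribute) carries over the same way; the sketch stops at the listing pages to stay short.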