Skip to content

Instantly share code, notes, and snippets.

@nkhine
Created October 15, 2012 10:55
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nkhine/3891927 to your computer and use it in GitHub Desktop.
Extract Associations from nimes.fr
nimes_assoc.csv
#!/usr/local/bin/python
# -*- coding: utf-8 -*-
import urllib2
import BeautifulSoup
import csv
# Browser identity sent with each HTTP request.
user_agent = 'Mozilla/5.0 (X11; U; Linux i686)Gecko/20071127 Firefox/2.0.0.11'
# Listing-page URL template; %s receives the 1-based page number.
origin_site = 'http://typo3.nimes.fr/index.php?id=annuaire_assos&theme=0&rech=&num_page=%s'
# The directory spans 35 listing pages, numbered 1 through 35.
pages = range(1, 36)
# Accumulates one row per association; written out as CSV at the end.
assoc_table = []
for page_no in pages:
req = origin_site % page_no
try:
doc = urllib2.urlopen(req)
except urllib2.URLError, e:
continue
# do something with the page
soup = BeautifulSoup.BeautifulSoup(doc)
for row in soup.findAll('tr', { "class" : "menu2" }):
item = row.renderContents()
soup = BeautifulSoup.BeautifulSoup(item)
# we get the Thème and build the initial list to store the assoc data
assoc_data = [assoc_theme.renderContents()
for assoc_theme in soup.findAll('u')]
# we get the Nom de l'association
assoc_data.extend(assoc_name.renderContents()
for assoc_name in soup.findAll('td', { "width": "70%"}))
# we list all the links to the indivudual pages
for i in soup.findAll('a', {'href':'#'}):
if 'associations' in i.attrMap['onclick']:
req = i.attrMap['onclick'].split('\'')[1]
try:
doc = urllib2.urlopen(req)
except urllib2.URLError, e:
continue
# take a snapshot of the page
soup = BeautifulSoup.BeautifulSoup(doc)
# extract email and web address
emails = []
web_sites = []
for tag in soup.recursiveChildGenerator():
if isinstance(tag,BeautifulSoup.Tag) and tag.name in ('a'):
assoc_link = tag.string
if '@' in str(assoc_link):
# some times there are more than one emails addresses
# so we put it in a list and use just the first address
emails.append(assoc_link)
if 'http' in str(assoc_link):
web_sites.append(assoc_link)
if emails:
assoc_data.append(emails[0])
else:
assoc_data.append('pas de email')
if web_sites:
assoc_data.append(web_sites[0])
else:
assoc_data.append('pas de site web')
# 'td' width 49% - address and name of president and post code
assoc_data.extend(assoc_cont.renderContents() for assoc_cont in
soup.findAll('td', {'width': '49%', 'class': 'menu2' }))
# 'td' width 45% has 3 entries, we only need the 1st
assoc_tel = soup.findAll('td', {'width': '45%', 'class': 'menu2'})
assoc_data.append(assoc_tel[0].renderContents())
print assoc_data
assoc_table.append(assoc_data)
# Persist the collected rows to nimes_assoc.csv.
# Binary mode ('wb') is required by the Python 2 csv module to avoid
# platform newline translation.
output = open('nimes_assoc.csv', 'wb')
try:
    writer = csv.writer(output)
    writer.writerows(assoc_table)
finally:
    output.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment