Created October 15, 2012 10:55
Extract Associations from nimes.fr
#!/usr/local/bin/python
# -*- coding: utf-8 -*-
import urllib2
import BeautifulSoup
import csv

user_agent = ('Mozilla/5.0 (X11; U; Linux i686) '
              'Gecko/20071127 Firefox/2.0.0.11')
origin_site = ('http://typo3.nimes.fr/index.php?'
               'id=annuaire_assos&theme=0&rech=&num_page=%s')
# the directory spans 35 pages, so range(1, 36) yields page numbers 1..35
pages = range(1, 36)
# collects one row per association for the final CSV
assoc_table = []
for page_no in pages:
    url = origin_site % page_no
    try:
        # identify ourselves with the user agent string defined above
        doc = urllib2.urlopen(urllib2.Request(url,
                              headers={'User-Agent': user_agent}))
    except urllib2.URLError:
        continue
    # parse the listing page
    soup = BeautifulSoup.BeautifulSoup(doc)
    for row in soup.findAll('tr', {'class': 'menu2'}):
        row_soup = BeautifulSoup.BeautifulSoup(row.renderContents())
        # the Thème sits in a <u> tag and starts this association's row
        assoc_data = [assoc_theme.renderContents()
                      for assoc_theme in row_soup.findAll('u')]
        # the Nom de l'association sits in the 70%-wide cell
        assoc_data.extend(assoc_name.renderContents()
                          for assoc_name in row_soup.findAll('td', {'width': '70%'}))
        # follow the links to the individual detail pages; the target URL
        # is embedded in the onclick handler, between single quotes
        for link in row_soup.findAll('a', {'href': '#'}):
            if 'associations' not in link['onclick']:
                continue
            detail_url = link['onclick'].split("'")[1]
            try:
                doc = urllib2.urlopen(urllib2.Request(detail_url,
                                      headers={'User-Agent': user_agent}))
            except urllib2.URLError:
                continue
            # take a snapshot of the detail page
            detail = BeautifulSoup.BeautifulSoup(doc)
            # extract email and web address from the anchors on the page
            emails = []
            web_sites = []
            for tag in detail.recursiveChildGenerator():
                if isinstance(tag, BeautifulSoup.Tag) and tag.name == 'a':
                    assoc_link = tag.string
                    if '@' in str(assoc_link):
                        # sometimes there is more than one email address,
                        # so collect them all and use only the first
                        emails.append(assoc_link)
                    if 'http' in str(assoc_link):
                        web_sites.append(assoc_link)
            if emails:
                assoc_data.append(emails[0])
            else:
                assoc_data.append('pas de email')
            if web_sites:
                assoc_data.append(web_sites[0])
            else:
                assoc_data.append('pas de site web')
            # the 49%-wide cells hold the address, the president's name
            # and the post code
            assoc_data.extend(assoc_cont.renderContents() for assoc_cont in
                              detail.findAll('td', {'width': '49%', 'class': 'menu2'}))
            # the 45%-wide cells hold three entries; only the first
            # (the telephone number) is needed
            assoc_tel = detail.findAll('td', {'width': '45%', 'class': 'menu2'})
            assoc_data.append(assoc_tel[0].renderContents())
        print assoc_data
        assoc_table.append(assoc_data)
# write the collected rows to .csv
with open('nimes_assoc.csv', 'wb') as f:
    csv.writer(f).writerows(assoc_table)
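
For anyone revisiting this in a modern environment, here is a minimal sketch of the same listing-page scrape ported to Python 3. The use of requests and BeautifulSoup 4 is my assumption (the gist itself targets Python 2 and BeautifulSoup 3), and the selectors assume the 2012-era nimes.fr markup (tr.menu2 rows, a <u> tag for the theme, a 70%-wide name cell) is unchanged:

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Python 3 sketch of the listing-page scrape above; assumes the
# requests and beautifulsoup4 packages and the original 2012 markup.
import csv

import requests
from bs4 import BeautifulSoup

USER_AGENT = ('Mozilla/5.0 (X11; U; Linux i686) '
              'Gecko/20071127 Firefox/2.0.0.11')
ORIGIN_SITE = ('http://typo3.nimes.fr/index.php?'
               'id=annuaire_assos&theme=0&rech=&num_page=%s')

assoc_table = []
for page_no in range(1, 36):
    try:
        resp = requests.get(ORIGIN_SITE % page_no,
                            headers={'User-Agent': USER_AGENT}, timeout=30)
        resp.raise_for_status()
    except requests.RequestException:
        continue
    soup = BeautifulSoup(resp.text, 'html.parser')
    for row in soup.find_all('tr', class_='menu2'):
        # theme from the <u> tag, name from the 70%-wide cell, as above
        assoc_data = [u.get_text(strip=True) for u in row.find_all('u')]
        assoc_data.extend(td.get_text(strip=True)
                          for td in row.find_all('td', attrs={'width': '70%'}))
        assoc_table.append(assoc_data)

with open('nimes_assoc.csv', 'w', newline='') as f:
    csv.writer(f).writerows(assoc_table)

The detail-page hop (pulling each URL out of the onclick attribute) carries over the same way; the sketch stops at the listing pages to stay short.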