@alexpatel
Last active August 29, 2015 14:05
Scrapes student and faculty contact information from the Harvard Student Organizations site (http://osl.fas.harvard.edu/student-organizations).
#!/usr/bin/env python
''' Script to scrape contact information from Harvard Student Organizations site
(http://osl.fas.harvard.edu/student-organizations).
Run with 'python /path/to/scraper'.
-- License --
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
'''
from bs4 import BeautifulSoup
import requests
import csv
BASE_URL = "http://usodb.fas.harvard.edu/public/index.cgi"
OUTPUT = "clubs.csv"
def soupify(url):
    """ Download a URL and parse its text into a BeautifulSoup object. """
    req = requests.get(url)
    data = req.text
    return BeautifulSoup(data, 'html.parser')
def get_contact_info(url):
    """ Gets all email addresses from a club page. """
    soup = soupify(url)
    contacts = []
    for link in soup.find_all('a'):
        href = link.get('href')
        # Skip anchors with no href and bare 'mailto:' links with no address.
        if href and href.startswith('mailto:') and len(href) > 7:
            contacts.append(href[7:].encode('ascii', 'ignore'))
    return contacts
def get_clubs():
    """ Gets all clubs from the main Clubs & Organizations page. """
    clubs = {}
    soup = soupify(BASE_URL)
    for a in soup.find_all('a'):
        href = a.get('href')
        # Skip anchors without an href or without link text.
        if not href or not a.string:
            continue
        link = BASE_URL + href
        clubs[a.string.encode('ascii', 'ignore')] = {
            'url': link.encode('ascii', 'ignore'),
            'contacts': get_contact_info(link),
        }
    return clubs
def write(data, file):
    """ Write data to CSV. """
    with open(file, "wb") as f:
        writer = csv.writer(f, delimiter=',')
        for line in data:
            writer.writerow(line)
def format(clubs):
    """ Flatten the clubs dict into rows for the CSV writer. """
    data = [['Club', 'URL', 'Contact Information']]
    for club in clubs:
        row = [club, clubs[club]['url']]
        row.extend(clubs[club]['contacts'])
        data.append(row)
    return data
def main():
    clubs = get_clubs()
    data = format(clubs)
    write(data, OUTPUT)

if __name__ == '__main__':
    main()
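
A quick way to sanity-check the output (a sketch, not part of the script above; it assumes the scraper has already been run and clubs.csv exists in the current directory):

import csv

with open('clubs.csv', 'rb') as f:  # 'rb' matches the Python 2 csv module used above
    rows = list(csv.reader(f))

print(rows[0])         # header: ['Club', 'URL', 'Contact Information']
print(len(rows) - 1)   # number of clubs scraped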