Created
February 21, 2017 07:28
-
-
Save allisonmorgan/61409e78f00c87a344ed8f7454a2b773 to your computer and use it in GitHub Desktop.
Campus Group Information
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
import csv
import sys
from collections import Counter

import dryscrape
from bs4 import BeautifulSoup, Comment
# Text labels matched against the scraped page content.
contact_information = "Contact Information"
theme_information = "Theme"
email_information = "Organization Email"
web_information = "Organization Website"
fb_information = "Facebook Page"
head_name_information = "Communicator Name"
head_email_information = "Communicator Email"

# Keys used in each group's dict; also written as the CSV header row.
org_theme = "org_theme"
org_name = "org_name"
org_email = "org_email"
org_web = "org_web"
org_fb = "org_fb"
head_name = "head_name"
head_email = "head_email"
def get_url(url):
    """Visit *url* in a dryscrape session and return the page body.

    A desktop Chrome User-Agent header is set on the session before the
    visit so the request mimics a regular browser.
    """
    # Start javascript session (mimic browser request)
    session = dryscrape.Session()
    session.set_header("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36")
    session.visit(url)
    # Return the page's HTML
    return session.body()
def strip_label(text, label):
    """Return *text* without its leading *label* and ": " separator.

    ``str.lstrip(label + ": ")`` must not be used for this: its argument is
    a set of characters, so it also eats the start of the value whenever the
    value begins with letters that occur in the label.
    """
    text = text.strip()
    if text.startswith(label):
        text = text[len(label):]
    # Only the separator (colon / spaces) is removed character-wise.
    return text.lstrip(": ").strip()


def group_row(group, columns):
    """Build one CSV row for *group*, in *columns* order.

    Fields the group does not have become empty cells instead of raising
    ``KeyError``.  Python 2's ``csv`` module expects byte strings, so values
    are UTF-8 encoded there; on Python 3 they are written as text.
    """
    values = [group.get(column, "") for column in columns]
    if sys.version_info[0] < 3:
        return [value.encode('utf-8') for value in values]
    return values


def collect_groups(html):
    """Collect campus groups from the parsed page.

    *html* is the BeautifulSoup document.  Every ``td`` of every ``table``
    is inspected: a cell whose text starts with the theme label provides the
    theme, a cell containing an ``h3`` provides the group name, and a cell
    whose ``h4`` reads the contact-information label provides the contact
    fields (one per ``p`` tag).

    Returns a dict mapping group name to that group's dict of fields.
    """
    # (label on the page, key in the group dict); first matching label wins.
    contact_fields = (
        (email_information, org_email),
        (web_information, org_web),
        (fb_information, org_fb),
        (head_name_information, head_name),
        (head_email_information, head_email),
    )

    groups = {}
    # Data is stored in tables
    for table in html.find_all('table'):
        group = {}
        for cell in table.find_all('td'):
            title = cell.find('h3')    # Name of group stored in h3
            contact = cell.find('h4')  # Beginning of contact information
            theme = cell.get_text()    # Grab group themes
            if theme and theme.strip().startswith(theme_information):
                group[org_theme] = strip_label(theme, theme_information)
            elif title:
                group_name = title.get_text()
                if org_name not in group:
                    group[org_name] = group_name
                elif group[org_name] != group_name:
                    # If this is a new group, add our old group and start a new one
                    groups[group[org_name]] = group
                    group = {org_name: group_name}
            # Search through all of a group's possible contact info (stored in p tags)
            elif contact and contact.get_text() == contact_information:
                for paragraph in cell.find_all('p'):
                    string = paragraph.get_text()
                    for label, key in contact_fields:
                        if string.startswith(label):
                            group[key] = strip_label(string, label)
                            break
        # The last group of a table has no following title to trigger the
        # store above, so flush it here.  setdefault never replaces an entry
        # that was already recorded under the same name.
        if org_name in group:
            groups.setdefault(group[org_name], group)
    return groups


if __name__ == "__main__":
    # Get URL
    url = "http://sofo.colorado.edu/SOFOsdg.php"
    r = get_url(url)
    html = BeautifulSoup(r, "html.parser")

    # Remove comments and any javascript
    for comment in html.find_all(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    for script in html.find_all(["script", "style"]):
        script.extract()

    # Get groups
    groups = collect_groups(html)
    print ("Number of groups found: {0}".format(len(groups)))

    columns = [org_name, org_theme, org_email, org_web, org_fb, head_name, head_email]
    # csv wants a binary file on Python 2 and a text file opened with
    # newline="" on Python 3.
    if sys.version_info[0] < 3:
        out = open("campus_groups.csv", 'wb')
    else:
        out = open("campus_groups.csv", 'w', newline="", encoding="utf-8")
    # The with-block guarantees the file is flushed and closed.
    with out:
        writer = csv.writer(out)
        writer.writerow(columns)
        for group in groups.values():
            writer.writerow(group_row(group, columns))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment