Skip to content

Instantly share code, notes, and snippets.

@allisonmorgan
Created February 21, 2017 07:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save allisonmorgan/61409e78f00c87a344ed8f7454a2b773 to your computer and use it in GitHub Desktop.
Campus Group Information
# -*- coding: utf-8 -*-
from bs4 import BeautifulSoup, Comment
from collections import Counter
import dryscrape
import csv
# Literal labels that prefix each field in the scraped page's text.
# These must match the site's markup exactly; they are compared with
# str.startswith()/== against cell and paragraph text below.
contact_information = "Contact Information"
theme_information = "Theme"
email_information = "Organization Email"
web_information = "Organization Website"
fb_information = "Facebook Page"
head_name_information = "Communicator Name"
head_email_information = "Communicator Email"
# Dictionary keys for each parsed group; doubled as CSV header column names.
org_theme = "org_theme"
org_name = "org_name"
org_email = "org_email"
org_web = "org_web"
org_fb = "org_fb"
head_name = "head_name"
head_email = "head_email"
def get_url(url):
    """Fetch *url* in a JavaScript-capable browser session and return the rendered HTML.

    dryscrape executes the page's JavaScript, so content injected client-side
    is present in the returned body (a plain requests.get would miss it).
    """
    browser = dryscrape.Session()
    # Present a desktop Chrome user agent so the server sends the normal page.
    browser.set_header(
        "User-Agent",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36",
    )
    browser.visit(url)
    return browser.body()
if __name__ == "__main__":
# Get URL
url = "http://sofo.colorado.edu/SOFOsdg.php"
r = get_url(url)
html = BeautifulSoup(r, "html.parser")
# Remove comments and any javascript
comments = html.findAll(text=lambda text:isinstance(text, Comment))
[comment.extract() for comment in comments]
scripts = html.findAll(["script", "style"])
[script.extract() for script in scripts]
# Get groups
groups = {}
# Data is stored in tables
tables = html.find_all('table')
for table in tables:
group = {}
for cell in table.find_all('td'):
title = cell.find('h3') # Name of group stored in h3
contact = cell.find('h4') # Beginning of contact information
theme = cell.get_text() # Grab group themes
if theme and theme.strip().startswith(theme_information):
group[org_theme] = theme.lstrip(theme_information + ": ").strip()
elif title:
group_name = title.get_text()
if not group.has_key(org_name):
group[org_name] = group_name
elif group.has_key(org_name) and group[org_name] != group_name:
# If this is a new group, add our old group and start a new one
groups[group[org_name]] = group;
group = {}; group[org_name] = group_name
# Search through all of a group's possible contact info (stored in p tags)
elif contact and contact.get_text() == contact_information:
for paragraph in cell.find_all('p'):
string = paragraph.get_text()
if string.startswith(email_information):
group[org_email] = string.lstrip(email_information + ": ").strip()
elif string.startswith(web_information):
group[org_web] = string.lstrip(web_information + ": ").strip()
elif string.startswith(fb_information):
group[org_fb] = string.lstrip(fb_information + ": ").strip()
elif string.startswith(head_name_information):
group[head_name] = string.lstrip(head_name_information + ": ").strip()
elif string.startswith(head_email_information):
group[head_email] = string.lstrip(head_email_information + ": ").strip()
print ("Number of groups found: {0}".format(len(groups)))
writer = csv.writer(open("campus_groups.csv", 'w'))
writer.writerow([org_name, org_theme, org_email, org_web, org_fb, head_name, head_email])
for (name, group) in groups.items():
#print group
writer.writerow([group[org_name].encode('utf-8'), group[org_theme].encode('utf-8'),group[org_email].encode('utf-8'), group[org_web].encode('utf-8'), group[org_fb].encode('utf-8'), group[head_name].encode('utf-8'), group[head_email].encode('utf-8')])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment