# allisonmorgan/campus_groups.py

## campus_groups.py
# -*- coding: utf-8 -*-

from bs4 import BeautifulSoup, Comment
from collections import Counter

import dryscrape
import csv

# Heading text (h4) that marks the cell holding a group's contact details.
contact_information = "Contact Information"

# Labels as printed on the page; cell/paragraph text is matched against
# these with startswith().  Listed in CSV column order.
theme_information = "Theme"
email_information = "Organization Email"
web_information = "Organization Website"
fb_information = "Facebook Page"
head_name_information = "Communicator Name"
head_email_information = "Communicator Email"

# Keys of each scraped group record; also written as the CSV header row.
org_name = "org_name"
org_theme = "org_theme"
org_email = "org_email"
org_web = "org_web"
org_fb = "org_fb"
head_name = "head_name"
head_email = "head_email"

def get_url(url):
    """Load ``url`` in a dryscrape session and return the page's HTML.

    A dryscrape (javascript-capable) session is used to mimic a browser
    request; a desktop Chrome User-Agent header is sent with it.
    """
    user_agent = ("Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) "
                  "AppleWebKit/537.36 (KHTML, like Gecko) "
                  "Chrome/55.0.2883.95 Safari/537.36")

    browser = dryscrape.Session()
    browser.set_header("User-Agent", user_agent)
    browser.visit(url)

    return browser.body()

def strip_label(text, label):
    """Return the value part of a ``"<label>: <value>"`` string.

    ``str.lstrip(chars)`` removes any leading characters found in *chars*
    (a set, not a prefix), so stripping with the label also ate the start
    of values made of the label's letters: ``"Theme: The Arts"`` became
    ``"Arts"``.  Here the label is removed as an exact prefix, followed by
    one separating colon and the surrounding whitespace.

    Args:
        text: Raw text of a table cell or paragraph.
        label: Label the text is expected to start with.

    Returns:
        The value without label, separator and surrounding whitespace.
        Text that does not start with ``label`` is only whitespace-stripped.
    """
    value = text.strip()
    if value.startswith(label):
        value = value[len(label):].lstrip()
        if value.startswith(":"):
            value = value[1:]
    return value.strip()


def store_group(groups, group):
    """Add ``group`` to ``groups`` under its name; nameless records are skipped.

    Records are merged rather than overwritten so that seeing the same
    group name again (find_all('table') also returns nested tables) cannot
    replace a complete record with a partial one.
    """
    name = group.get(org_name)
    if name is not None:
        groups.setdefault(name, {}).update(group)


def parse_groups(html):
    """Collect the student groups listed in the parsed page.

    Args:
        html: BeautifulSoup document of the listing page.

    Returns:
        Dict mapping each group name to a record dict keyed by the
        ``org_*`` / ``head_*`` constants; fields missing on the page are
        simply absent from the record.
    """
    contact_fields = (
        (email_information, org_email),
        (web_information, org_web),
        (fb_information, org_fb),
        (head_name_information, head_name),
        (head_email_information, head_email),
    )

    groups = {}

    # Data is stored in tables
    for table in html.find_all('table'):
        group = {}
        for cell in table.find_all('td'):
            title = cell.find('h3')    # Name of group stored in h3
            contact = cell.find('h4')  # Beginning of contact information
            text = cell.get_text().strip()

            if text.startswith(theme_information):
                group[org_theme] = strip_label(text, theme_information)

            elif title:
                group_name = title.get_text()
                if org_name in group and group[org_name] != group_name:
                    # A different name starts a new group: keep the old one
                    store_group(groups, group)
                    group = {}
                group[org_name] = group_name

            # A group's contact details are stored in p tags
            elif contact and contact.get_text() == contact_information:
                for paragraph in cell.find_all('p'):
                    line = paragraph.get_text().strip()
                    for label, key in contact_fields:
                        if line.startswith(label):
                            group[key] = strip_label(line, label)
                            break

        # The last group of a table has no following title to trigger the
        # store above, so it must be saved here or it is silently lost.
        store_group(groups, group)

    return groups


def csv_cell(value):
    """Return ``value`` in the string type the ``csv`` module writes.

    Python 2's csv module needs byte strings, so unicode text is encoded
    as UTF-8 there; on Python 3 text is passed through unchanged (encoding
    it would put ``b'...'`` into the file).
    """
    return value if isinstance(value, str) else value.encode('utf-8')


def open_csv(path):
    """Open ``path`` for writing in the mode the ``csv`` module expects."""
    import sys

    if sys.version_info[0] >= 3:
        return open(path, 'w', newline='', encoding='utf-8')
    return open(path, 'wb')


if __name__ == "__main__":

    # Get URL
    url = "http://sofo.colorado.edu/SOFOsdg.php"
    html = BeautifulSoup(get_url(url), "html.parser")

    # Remove comments and any javascript
    for comment in html.find_all(text=lambda text: isinstance(text, Comment)):
        comment.extract()
    for script in html.find_all(["script", "style"]):
        script.extract()

    # Get groups
    groups = parse_groups(html)
    print("Number of groups found: {0}".format(len(groups)))

    columns = [org_name, org_theme, org_email, org_web, org_fb, head_name, head_email]
    # The with-block closes (and flushes) the file even if a row fails
    with open_csv("campus_groups.csv") as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(columns)
        for group in groups.values():
            # Not every group lists every field; write an empty cell instead
            writer.writerow([csv_cell(group.get(column, "")) for column in columns])
	# -*- coding: utf-8 -*-

	from bs4 import BeautifulSoup, Comment
	from collections import Counter

	import dryscrape
	import csv

	# Heading text (h4) that marks the cell holding a group's contact details.
	contact_information = "Contact Information"

	# Labels as printed on the page; cell/paragraph text is matched against
	# these with startswith().  Listed in CSV column order.
	theme_information = "Theme"
	email_information = "Organization Email"
	web_information = "Organization Website"
	fb_information = "Facebook Page"
	head_name_information = "Communicator Name"
	head_email_information = "Communicator Email"

	# Keys of each scraped group record; also written as the CSV header row.
	org_name = "org_name"
	org_theme = "org_theme"
	org_email = "org_email"
	org_web = "org_web"
	org_fb = "org_fb"
	head_name = "head_name"
	head_email = "head_email"

	def get_url(url):
	    """Load ``url`` in a dryscrape session and return the page's HTML.

	    A dryscrape (javascript-capable) session is used to mimic a browser
	    request; a desktop Chrome User-Agent header is sent with it.
	    """
	    # The body must be indented under the def; this copy had lost its
	    # nesting, which left the function without a suite.
	    session = dryscrape.Session()
	    session.set_header("User-Agent", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36")
	    session.visit(url)

	    # Return the page's HTML
	    return session.body()

	def strip_label(text, label):
	    """Return the value part of a ``"<label>: <value>"`` string.

	    ``str.lstrip(chars)`` removes any leading characters found in *chars*
	    (a set, not a prefix), so stripping with the label also ate the start
	    of values made of the label's letters: ``"Theme: The Arts"`` became
	    ``"Arts"``.  Here the label is removed as an exact prefix, followed by
	    one separating colon and the surrounding whitespace.

	    Args:
	        text: Raw text of a table cell or paragraph.
	        label: Label the text is expected to start with.

	    Returns:
	        The value without label, separator and surrounding whitespace.
	        Text that does not start with ``label`` is only whitespace-stripped.
	    """
	    value = text.strip()
	    if value.startswith(label):
	        value = value[len(label):].lstrip()
	        if value.startswith(":"):
	            value = value[1:]
	    return value.strip()


	def store_group(groups, group):
	    """Add ``group`` to ``groups`` under its name; nameless records are skipped.

	    Records are merged rather than overwritten so that seeing the same
	    group name again (find_all('table') also returns nested tables) cannot
	    replace a complete record with a partial one.
	    """
	    name = group.get(org_name)
	    if name is not None:
	        groups.setdefault(name, {}).update(group)


	def parse_groups(html):
	    """Collect the student groups listed in the parsed page.

	    Args:
	        html: BeautifulSoup document of the listing page.

	    Returns:
	        Dict mapping each group name to a record dict keyed by the
	        ``org_*`` / ``head_*`` constants; fields missing on the page are
	        simply absent from the record.
	    """
	    contact_fields = (
	        (email_information, org_email),
	        (web_information, org_web),
	        (fb_information, org_fb),
	        (head_name_information, head_name),
	        (head_email_information, head_email),
	    )

	    groups = {}

	    # Data is stored in tables
	    for table in html.find_all('table'):
	        group = {}
	        for cell in table.find_all('td'):
	            title = cell.find('h3')    # Name of group stored in h3
	            contact = cell.find('h4')  # Beginning of contact information
	            text = cell.get_text().strip()

	            if text.startswith(theme_information):
	                group[org_theme] = strip_label(text, theme_information)

	            elif title:
	                group_name = title.get_text()
	                if org_name in group and group[org_name] != group_name:
	                    # A different name starts a new group: keep the old one
	                    store_group(groups, group)
	                    group = {}
	                group[org_name] = group_name

	            # A group's contact details are stored in p tags
	            elif contact and contact.get_text() == contact_information:
	                for paragraph in cell.find_all('p'):
	                    line = paragraph.get_text().strip()
	                    for label, key in contact_fields:
	                        if line.startswith(label):
	                            group[key] = strip_label(line, label)
	                            break

	        # The last group of a table has no following title to trigger the
	        # store above, so it must be saved here or it is silently lost.
	        store_group(groups, group)

	    return groups


	def csv_cell(value):
	    """Return ``value`` in the string type the ``csv`` module writes.

	    Python 2's csv module needs byte strings, so unicode text is encoded
	    as UTF-8 there; on Python 3 text is passed through unchanged (encoding
	    it would put ``b'...'`` into the file).
	    """
	    return value if isinstance(value, str) else value.encode('utf-8')


	def open_csv(path):
	    """Open ``path`` for writing in the mode the ``csv`` module expects."""
	    import sys

	    if sys.version_info[0] >= 3:
	        return open(path, 'w', newline='', encoding='utf-8')
	    return open(path, 'wb')


	if __name__ == "__main__":

	    # Get URL
	    url = "http://sofo.colorado.edu/SOFOsdg.php"
	    html = BeautifulSoup(get_url(url), "html.parser")

	    # Remove comments and any javascript
	    for comment in html.find_all(text=lambda text: isinstance(text, Comment)):
	        comment.extract()
	    for script in html.find_all(["script", "style"]):
	        script.extract()

	    # Get groups
	    groups = parse_groups(html)
	    print("Number of groups found: {0}".format(len(groups)))

	    columns = [org_name, org_theme, org_email, org_web, org_fb, head_name, head_email]
	    # The with-block closes (and flushes) the file even if a row fails
	    with open_csv("campus_groups.csv") as csv_file:
	        writer = csv.writer(csv_file)
	        writer.writerow(columns)
	        for group in groups.values():
	            # Not every group lists every field; write an empty cell instead
	            writer.writerow([csv_cell(group.get(column, "")) for column in columns])