Skip to content

Instantly share code, notes, and snippets.

@rizumu
Created October 22, 2011 16:30
Show Gist options
  • Save rizumu/1306175 to your computer and use it in GitHub Desktop.
Save rizumu/1306175 to your computer and use it in GitHub Desktop.
university_locator.py
from BeautifulSoup import BeautifulSoup
from urlparse import urlparse
import re
import urllib2
# Index page listing U.S. universities alphabetically, one per <li> element.
UNIVERSITY_INDEX_URL = 'http://www.utexas.edu/world/univ/alpha/'

# Each pattern is applied to the serialized HTML of a single <li> element.
# Host part of the first plain-http link in the item.
URL_PATTERN = re.compile(r'http://[a-zA-Z0-9.-]*')
# Text that follows the first '">' (i.e. the end of the opening <a ...> tag).
# NOTE: matching stops at the first character outside this class, so names
# containing digits, commas, apostrophes, etc. are truncated at that point.
NAME_PATTERN = re.compile(r'">([a-zA-Z.&/ -]*)')
# Upper-case state code(s) following the first '(' -- e.g. "TX" or "CA/NY".
STATE_PATTERN = re.compile(r'\(([A-Z/]*)')


def fetch_html(url):
    """Return the raw response body of *url*.

    The connection is closed even when reading fails, so a network error
    part-way through the download does not leak the socket.
    """
    usock = urllib2.urlopen(url)
    try:
        return usock.read()
    finally:
        usock.close()


def parse_list_item(item_html):
    """Extract ``(url, name, state)`` from the HTML of one list item.

    Args:
        item_html: the serialized ``<li>...</li>`` markup as a string.

    Returns:
        A 3-tuple ``(url, name, state)`` where ``state`` is ``''`` when the
        item carries no parenthesised state code, or ``None`` when the item
        has no ``http://`` link or no link text (for example a navigation
        entry), so callers can skip it instead of crashing.
    """
    url_match = URL_PATTERN.search(item_html)
    name_match = NAME_PATTERN.search(item_html)
    if url_match is None or name_match is None:
        return None

    # Some items do not have state data; search once and reuse the match.
    state_match = STATE_PATTERN.search(item_html)
    state = state_match.group(1) if state_match else ''

    return (url_match.group(), name_match.group(1), state)


def main():
    """Download the university index and print one tuple per university."""
    soup = BeautifulSoup(fetch_html(UNIVERSITY_INDEX_URL))

    # Work on the string form of every list item on the page.
    list_items = [str(embed) for embed in soup.findAll('li')]

    # Items that are not university links parse to None and are dropped.
    parsed = (parse_list_item(item) for item in list_items)
    colleges = [college for college in parsed if college is not None]

    for college in colleges:
        # Parenthesised single argument: prints the tuple identically whether
        # `print` is a statement (Python 2) or a function.
        print(college)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment