# rizumu/university_locator.py

## university_locator.py
from BeautifulSoup import BeautifulSoup
from urlparse import urlparse
import re
import urllib2

# Page that lists the colleges, one per <li> element.
SOURCE_URL = 'http://www.utexas.edu/world/univ/alpha/'

# These patterns are applied to the raw HTML text of a single <li> element.
URL_PATTERN = re.compile(r'http\://[a-zA-Z0-9-\.]*')
NAME_PATTERN = re.compile(r'"\>[a-zA-Z-\.&/\ ]*')
STATE_PATTERN = re.compile(r'\([A-Z/]*')


def parse_college(item_html):
    """Extract a (url, name, state) tuple from the HTML of one list item.

    item_html -- the markup of a single <li> element, as a string.

    Returns a tuple of three strings, or None when the item contains no
    http:// URL or no link text to read a name from.

    url   -- the scheme and host part of the first http:// URL in the item.
    name  -- the text following the first closing quote of an attribute
             immediately followed by a tag end; it stops at the first
             character that is not a letter, space, '-', '.', '&' or '/'.
    state -- the upper-case letters (and '/') after the first '(' in the
             item, or '' when the item has no parenthesis.
    """
    url_match = URL_PATTERN.search(item_html)
    name_match = NAME_PATTERN.search(item_html)
    # search() returns None when nothing matches; calling .group() on that
    # would raise AttributeError, so report "not a college entry" instead.
    if url_match is None or name_match is None:
        return None

    # Some entries carry no state information at all.
    state_match = STATE_PATTERN.search(item_html)
    state = state_match.group() if state_match else ''

    # Drop the delimiter characters that only served to anchor the patterns.
    return (url_match.group(), name_match.group().strip('">'), state.strip('('))


def fetch_html(url):
    """Download url with urllib2 and return the response body."""
    usock = urllib2.urlopen(url)
    try:
        return usock.read()
    finally:
        # Close the connection even when read() fails.
        usock.close()


def main():
    """Download the listing page and print one tuple per college found."""
    soup = BeautifulSoup(fetch_html(SOURCE_URL))

    colleges = []
    for item in soup.findAll('li'):
        college = parse_college(str(item))
        # List items without a link (parse_college returned None) are skipped.
        if college is not None:
            colleges.append(college)

    for college in colleges:
        # Parenthesised form prints the same tuple under the print statement
        # and is also valid where print is a function.
        print(college)


if __name__ == "__main__":
    main()