Created
July 29, 2013 03:42
-
-
Save nharrell04/6102026 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from urllib2 import urlopen | |
def gethtml(url):
    """Fetch *url* and return the full response body.

    Args:
        url: Address handed straight to ``urlopen``.

    Returns:
        Whatever ``read()`` yields for the opened response, i.e. the whole
        body in one piece.

    Raises:
        Any error raised by ``urlopen`` or ``read`` propagates unchanged.
    """
    response = urlopen(url)
    try:
        return response.read()
    finally:
        # Close explicitly instead of leaving the connection to be reclaimed
        # whenever the response object happens to be garbage collected.
        response.close()
def extract(url):
    """Locate the first mention of "guides" in the page at *url*.

    Args:
        url: Address of the page to download via ``gethtml``.

    Returns:
        The index of the first occurrence of ``'guides'`` in the downloaded
        HTML, or ``-1`` when the substring does not occur (``find``
        semantics).
    """
    return gethtml(url).find('guides')
def lookfor(url):
    """Count how many times "guides" appears in the page at *url*.

    The previous implementation looped with ``for guides in z``, which binds
    the name ``guides`` to each successive character of the HTML rather than
    searching for the word. It therefore returned the total number of
    characters in the page, which is why the result was far higher than a
    manual search of the HTML showed.

    Args:
        url: Address of the page to download via ``gethtml``.

    Returns:
        The number of non-overlapping occurrences of ``'guides'`` in the
        downloaded HTML (``0`` when it never occurs).
    """
    return gethtml(url).count('guides')
def parse_guides(url):
    """Split the page at *url* into pieces, one per research-guide path.

    The page is cut at the first occurrence of the start marker
    ``/library/research/guides/abbre`` and everything from there on is split
    on the common path prefix ``/library/research/guides/``.

    Args:
        url: Address of the page to download via ``gethtml``.

    Returns:
        A list of the HTML fragments found between occurrences of the path
        prefix. Because the slice begins with the prefix itself, the first
        element is the empty string. The fragments still contain the
        surrounding markup (no further cleanup is done here). An empty list
        is returned when the start marker is not present in the page.
    """
    marker = "/library/research/guides/abbre"
    prefix = "/library/research/guides/"

    html = gethtml(url)
    start_guide = html.find(marker)
    # Diagnostic output kept from the original: shows where the marker was
    # found (-1 when it is missing). The parenthesised single-argument form
    # prints the same text under Python 2 and Python 3.
    print(start_guide)

    if start_guide == -1:
        # Without this guard, html[-1:] would silently hand back just the
        # last character of the page and the caller would get a bogus
        # one-element list.
        return []

    return html[start_guide:].split(prefix)
# Page that every demo call is run against.
GUIDES_URL = "http://www.law.georgetown.edu/library/research/guides/index.cfm"

# Uncomment to dump the raw HTML of the page:
# print(gethtml(GUIDES_URL))

# Run each helper in turn and print its result, in the same order as before.
for report in (extract, lookfor, parse_guides):
    print(report(GUIDES_URL))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment