Skip to content

Instantly share code, notes, and snippets.

@benkant
Last active August 29, 2015 14:16
Show Gist options
  • Save benkant/c93aeb50046aae2ab2e9 to your computer and use it in GitHub Desktop.
Save benkant/c93aeb50046aae2ab2e9 to your computer and use it in GitHub Desktop.
check a bunch of sites for a bunch of terms
#!/usr/bin/env python
import re
import sys
import urllib
from urlparse import urljoin, urlparse, urlunparse
# Lower-case search terms; page content is lower-cased before matching.
terms = ['deepmind', 'recursive']
# Start pages; each crawl stays on the host of its start page (or a subdomain).
sites = ['http://torch.ch', 'http://www.arcadelearningenvironment.org/']

# Compiled once: captures the target of every quoted href attribute.
_HREF_RE = re.compile(r'''href=["'](.[^"']+)["']''', re.I)


def _fetch(url):
    """Return the body of *url*, or None if it cannot be retrieved.

    A failure is reported on stderr instead of being raised, so that one
    dead link does not abort the crawl of the remaining pages and sites.
    """
    try:
        response = urllib.urlopen(url)
        try:
            return response.read()
        finally:
            # Always release the connection, even if read() fails.
            response.close()
    except IOError as err:
        sys.stderr.write("could not fetch {}: {}\n".format(url, err))
        return None


def _first_matching_term(content, terms):
    """Return the first of *terms* that occurs in *content*, else None.

    *content* is lower-cased once before the comparison.
    """
    lowered = content.lower()
    for term in terms:
        if term in lowered:
            return term
    return None


def _same_site_links(content, page_url, site_host):
    """Yield the http(s) links found in *content* that stay on *site_host*.

    Each href is resolved against *page_url*, so relative and
    protocol-relative links point where a browser would take them.
    Fragments are dropped because they do not change the fetched document.
    A link is on-site when its host equals *site_host* or is a subdomain
    of it.
    """
    for href in _HREF_RE.findall(content):
        try:
            target = urlparse(urljoin(page_url, href))
            host = target.hostname or ''
        except ValueError:
            # some garbage
            continue
        if target.scheme not in ('http', 'https'):
            # mailto:, javascript:, ftp: ...
            continue
        if host != site_host and not host.endswith('.' + site_host):
            # link to another site
            continue
        yield urlunparse(target[:5] + ('',))


def _search_site(site, terms):
    """Crawl *site* until a page mentions one of *terms*.

    Pages are taken from the end of the pending list (most recently
    discovered first).  On the first hit, prints "<url> contains <term>"
    and returns True; returns False once every reachable same-site page
    has been checked without a hit.
    """
    start = urlparse(site)
    site_host = start.hostname or ''
    start_url = start.geturl()
    # Every URL ever queued, so a page is never queued or fetched twice.
    seen = {start_url}
    pending = [start_url]
    while pending:
        page_url = pending.pop()
        content = _fetch(page_url)
        if content is None:
            continue
        term = _first_matching_term(content, terms)
        if term is not None:
            # if any terms are found, move to the next site
            print("{} contains {}".format(page_url, term))
            return True
        for link in _same_site_links(content, page_url, site_host):
            if link not in seen:
                seen.add(link)
                pending.append(link)
    return False


for site in sites:
    _search_site(site, terms)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment