benkant/site_terms.py

## site_terms.py
#!/usr/bin/env python
# Crawl each site in ``sites`` (following same-site links only) and print the
# first page whose body mentions one of ``terms``.  The crawl of a site stops
# as soon as one term is found.  Works on Python 2.7 and Python 3.

import re
import sys
import urllib  # noqa: F401 -- kept: the original script imported it
from contextlib import closing

try:  # Python 3
    from urllib.parse import urldefrag, urljoin, urlparse, urlunparse  # noqa: F401
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urlparse import urldefrag, urljoin, urlparse, urlunparse  # noqa: F401
    from urllib import urlopen

terms = ['deepmind', 'recursive']
sites = ['http://torch.ch', 'http://www.arcadelearningenvironment.org/']

# Captures the quoted value of every href attribute, case-insensitively.
HREF_RE = re.compile(r'''href=["']([^"']+)["']''', re.I)


def fetch_page(url):
    """Return the body of ``url`` as text, or None if it cannot be retrieved.

    Retrieval failures (IOError/ValueError from ``urlopen`` or ``read``) are
    reported on stderr instead of aborting the whole crawl.
    """
    try:
        # closing() guarantees the connection is released on both Python 2
        # and Python 3, even when read() raises.
        with closing(urlopen(url)) as response:
            body = response.read()
    except (IOError, ValueError) as err:
        sys.stderr.write("skipping {}: {}\n".format(url, err))
        return None
    # Python 3 returns bytes; decode so text searches and the regex work.
    if not isinstance(body, str):
        body = body.decode('utf-8', 'replace')
    return body


def find_term(content, search_terms):
    """Return the first of ``search_terms`` found in ``content``, else None.

    The comparison is case-insensitive.
    """
    lowered = content.lower()  # lower-case the page once, not once per term
    for term in search_terms:
        if term.lower() in lowered:
            return term
    return None


def is_same_site(url, base_netloc):
    """Return True if ``url`` is http(s) and on ``base_netloc`` or a subdomain.

    Raises ValueError if ``url`` cannot be parsed.
    """
    parts = urlparse(url)
    if parts.scheme.lower() not in ('http', 'https'):
        return False
    host = parts.netloc.lower()
    base = base_netloc.lower()
    # Exact host or a dot-separated subdomain; a plain substring test would
    # also accept look-alike hosts such as "notexample.test".
    return host == base or host.endswith('.' + base)


def extract_links(content, page_url, base_netloc):
    """Return the same-site links found in ``content`` as absolute URLs.

    Each href is resolved against ``page_url`` and its fragment is removed.
    Links in document order; duplicates are not removed here.
    """
    links = []
    for href in HREF_RE.findall(content):
        try:
            absolute = urldefrag(urljoin(page_url, href.strip()))[0]
            if is_same_site(absolute, base_netloc):
                links.append(absolute)
        except ValueError:
            # some garbage that cannot be parsed as a URL
            continue
    return links


def search_site(site, search_terms, fetch=fetch_page):
    """Crawl ``site`` until a page mentions one of ``search_terms``.

    ``fetch`` maps a URL to its text (or None when unavailable).  Pages are
    taken from the end of the pending list, i.e. most recently found first.
    Returns ``(url, term)`` for the first hit, or None if no page matches.
    """
    base = urlparse(site)
    start = base.geturl()
    seen = {start}  # every URL already fetched or queued
    pending = [start]

    while pending:
        url = pending.pop()
        content = fetch(url)
        if content is None:
            continue

        term = find_term(content, search_terms)
        if term is not None:
            return url, term

        for link in extract_links(content, url, base.netloc):
            if link not in seen:
                seen.add(link)
                pending.append(link)
    return None


def main():
    """Search every configured site and print the first hit of each."""
    for site in sites:
        hit = search_site(site, terms)
        if hit is not None:
            # single parenthesised argument: valid on Python 2 and 3
            print("{} contains {}".format(*hit))


if __name__ == "__main__":
    main()
	#!/usr/bin/env python
# Crawl each site in ``sites`` (following same-site links only) and print the
# first page whose body mentions one of ``terms``.  The crawl of a site stops
# as soon as one term is found.  Works on Python 2.7 and Python 3.

import re
import sys
import urllib  # noqa: F401 -- kept: the original listing imported it
from contextlib import closing

try:  # Python 3
    from urllib.parse import urldefrag, urljoin, urlparse, urlunparse  # noqa: F401
    from urllib.request import urlopen
except ImportError:  # Python 2
    from urlparse import urldefrag, urljoin, urlparse, urlunparse  # noqa: F401
    from urllib import urlopen

terms = ['deepmind', 'recursive']
sites = ['http://torch.ch', 'http://www.arcadelearningenvironment.org/']

# Captures the quoted value of every href attribute, case-insensitively.
HREF_RE = re.compile(r'''href=["']([^"']+)["']''', re.I)


def fetch_page(url):
    """Return the body of ``url`` as text, or None if it cannot be retrieved.

    Retrieval failures (IOError/ValueError from ``urlopen`` or ``read``) are
    reported on stderr instead of aborting the whole crawl.
    """
    try:
        # closing() guarantees the connection is released on both Python 2
        # and Python 3, even when read() raises.
        with closing(urlopen(url)) as response:
            body = response.read()
    except (IOError, ValueError) as err:
        sys.stderr.write("skipping {}: {}\n".format(url, err))
        return None
    # Python 3 returns bytes; decode so text searches and the regex work.
    if not isinstance(body, str):
        body = body.decode('utf-8', 'replace')
    return body


def find_term(content, search_terms):
    """Return the first of ``search_terms`` found in ``content``, else None.

    The comparison is case-insensitive.
    """
    lowered = content.lower()  # lower-case the page once, not once per term
    for term in search_terms:
        if term.lower() in lowered:
            return term
    return None


def is_same_site(url, base_netloc):
    """Return True if ``url`` is http(s) and on ``base_netloc`` or a subdomain.

    Raises ValueError if ``url`` cannot be parsed.
    """
    parts = urlparse(url)
    if parts.scheme.lower() not in ('http', 'https'):
        return False
    host = parts.netloc.lower()
    base = base_netloc.lower()
    # Exact host or a dot-separated subdomain; a plain substring test would
    # also accept look-alike hosts such as "notexample.test".
    return host == base or host.endswith('.' + base)


def extract_links(content, page_url, base_netloc):
    """Return the same-site links found in ``content`` as absolute URLs.

    Each href is resolved against ``page_url`` and its fragment is removed.
    Links in document order; duplicates are not removed here.
    """
    links = []
    for href in HREF_RE.findall(content):
        try:
            absolute = urldefrag(urljoin(page_url, href.strip()))[0]
            if is_same_site(absolute, base_netloc):
                links.append(absolute)
        except ValueError:
            # some garbage that cannot be parsed as a URL
            continue
    return links


def search_site(site, search_terms, fetch=fetch_page):
    """Crawl ``site`` until a page mentions one of ``search_terms``.

    ``fetch`` maps a URL to its text (or None when unavailable).  Pages are
    taken from the end of the pending list, i.e. most recently found first.
    Returns ``(url, term)`` for the first hit, or None if no page matches.
    """
    base = urlparse(site)
    start = base.geturl()
    seen = {start}  # every URL already fetched or queued
    pending = [start]

    while pending:
        url = pending.pop()
        content = fetch(url)
        if content is None:
            continue

        term = find_term(content, search_terms)
        if term is not None:
            return url, term

        for link in extract_links(content, url, base.netloc):
            if link not in seen:
                seen.add(link)
                pending.append(link)
    return None


def main():
    """Search every configured site and print the first hit of each."""
    for site in sites:
        hit = search_site(site, terms)
        if hit is not None:
            # single parenthesised argument: valid on Python 2 and 3
            print("{} contains {}".format(*hit))


if __name__ == "__main__":
    main()