A quick-and-dirty script that spiders a base URL, iterates over its internal links, and stores each response code and URL in link_checker_results.csv. Mainly used for testing.
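For example, to crawl google.com for 20 iterations (this mirrors the usage string in the script itself):

    python link_checker.py google.com 20

Each iteration pops one URL off the queue, so larger sites need a higher iteration count.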
# encoding: utf-8
import sys
import csv
from urlparse import urlsplit  # Python 2; see the Python 3 note below

import cssselect  # noqa -- lxml needs the cssselect package for tree.cssselect()
from lxml import html
import requests


def standardise_url(url, base_url):
    "Take a url and return a standardised version of it"
    parsed_url = urlsplit(url)
    if not parsed_url.netloc:
        # Relative link: anchor it to the base domain
        parsed_url = parsed_url._replace(scheme="http", netloc=base_url)
    return parsed_url.geturl()


queue = set()
visited = set()


def run(base_url):
    # Get another URL; guard against an exhausted queue, since
    # set.pop() raises KeyError on an empty set
    if not queue:
        return
    url = queue.pop()
    print("Trying: {0}".format(url))

    # Only visit new pages
    if url in visited:
        return

    # Get links on page
    page = requests.get(url)
    tree = html.fromstring(page.text)
    all_hrefs = []
    for elm in tree.cssselect('a'):
        try:
            all_hrefs.append(standardise_url(elm.attrib['href'], base_url))
        except KeyError:
            # <a> element without an href attribute
            pass

    # Throw away external links
    links = [l for l in all_hrefs if urlsplit(l).netloc == base_url]

    # Add new links to queue for further crawling
    for link in links:
        if link not in visited:
            queue.add(link)

    # Save result
    with open('link_checker_results.csv', 'ab') as f:
        writer = csv.writer(f)
        writer.writerow((page.status_code, page.url))

    visited.add(url)


if __name__ == '__main__':
    if len(sys.argv) != 3:
        sys.exit("""link_checker requires two arguments: the base url and number of iterations.

Usage: python link_checker.py google.com 20
""")

    BASE = sys.argv[1]
    iterations = int(sys.argv[2])
    queue.add('http://{0}'.format(BASE))

    # Write the CSV header; run() appends one row per visited page
    with open('link_checker_results.csv', 'wb') as f:
        writer = csv.writer(f)
        writer.writerow(("Response", "URL"))

    for _ in range(iterations):
        run(base_url=BASE)
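The script above targets Python 2. A minimal sketch of the changes needed for Python 3, not part of the original gist and untested against it: urlsplit moves to urllib.parse, and the csv module wants text-mode files opened with newline='' instead of the binary 'wb'/'ab' modes used above.

    import csv
    from urllib.parse import urlsplit  # Python 3 home of urlsplit

    # csv expects text-mode files in Python 3: open with newline=''
    # rather than 'wb'; use 'a' with newline='' where run() appends rows.
    with open('link_checker_results.csv', 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(("Response", "URL"))

Everything else in the script, including the requests and lxml calls, works unchanged under Python 3.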