Created
January 30, 2014 06:48
-
-
Save migurski/8703759 to your computer and use it in GitHub Desktop.
Scripts used in the creation of the new Code for America website.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
from requests import get | |
from csv import writer | |
from urlparse import urljoin, urlsplit, urlunsplit | |
from re import compile | |
# Crawl the site breadth-first from base_url. Every HTML page reached is
# written to parsed-links.csv together with its click depth; every non-200
# response and every empty link is written to checked-links.csv.

# Paths (relative to base_url) that are never parsed for links.
ignore = compile(r'/blog/')

# Root of the site being crawled. For a local copy, use
# 'http://localhost/~migurski/Codeforamerica.org' instead.
base_url = 'http://alpha.codeforamerica.org'

# FIFO work queue of (url, referer, hops); hops counts link clicks from base_url.
urls = [(base_url, None, 0)]

# Every URL already requested, plus every final URL reached through a redirect.
seen = set()

# Context managers guarantee both CSV files are flushed and closed, even if
# the crawl is interrupted by an exception.
with open('parsed-links.csv', 'w') as parsed_file, \
     open('checked-links.csv', 'w') as problems_file:

    parsed = writer(parsed_file)
    parsed.writerow(('URL', 'Clicks'))

    problems = writer(problems_file)
    problems.writerow(('Problem', 'URL', 'Referer'))

    while urls:
        url, referer, hops = urls.pop(0)

        if url in seen:
            print('%d seen %s' % (len(urls), url))
            continue

        seen.add(url)
        got = get(url)

        # got.url is the address that finally answered. Only treat it as a
        # separate page when it differs from the requested URL; otherwise it
        # equals the entry just added to `seen`, and every page that was not
        # redirected would be discarded here without being parsed.
        if got.url != url:
            if got.url in seen:
                print('%d seen %s' % (len(urls), got.url))
                continue

            seen.add(got.url)

        # Check for off-site URLs first, so the ignore pattern below is only
        # ever matched against a path that really is relative to base_url.
        if not got.url.startswith(base_url):
            print('%d skipping %s' % (len(urls), got.url))
            continue

        if ignore.match(got.url[len(base_url):]):
            print('%d ignoring %s' % (len(urls), got.url))
            continue

        if got.status_code != 200:
            problems.writerow((got.status_code, url, referer))
            continue

        # A response without a Content-Type header is treated as non-HTML
        # rather than raising KeyError.
        if not got.headers.get('content-type', '').startswith('text/html'):
            print('%d skipping %s' % (len(urls), got.url))
            continue

        print('%d %s' % (len(urls), got.url))
        parsed.writerow((got.url, hops))

        # Prefer links inside <main>; fall back to the whole document.
        soup = BeautifulSoup(got.content)
        main = soup.find('main') or soup
        hrefs = [a.get('href', '') for a in main.find_all('a')]

        for href in set(hrefs):
            if href in ('#', ''):
                problems.writerow(('Empty', href, got.url))
                continue

            if href.startswith('#'):
                # ignore internal anchors
                continue

            # Resolve against the current page and drop the fragment, so the
            # same page is not queued once per anchor.
            link = urljoin(got.url, href)
            scheme, host, path, query, _ = urlsplit(link)
            link = urlunsplit((scheme, host, path, query, ''))

            if not link.startswith(base_url):
                # ignore external links
                continue

            if link not in seen:
                urls.append((link, got.url, hops + 1))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from requests import get | |
from urlparse import urlparse | |
from csv import DictWriter, writer | |
# For each request line in wordpress-paths.txt, fetch the same path from the
# live site and from the new local build, then write one row to site-paths.csv
# describing where each request ended up and how the two results compare.

# Output columns, in the order the row tuple is assembled below.
cols = 'original_path', 'original_code', \
       'live_path', 'live_code', \
       'new_path', 'new_code', \
       'status', 'hits'

# URL prefix under which the new site is served on localhost.
local_dir = '/~migurski/Codeforamerica.org'

with open('wordpress-paths.txt') as paths, open('site-paths.csv', 'w') as out:

    results = writer(out)
    results.writerow(cols)

    for line in paths:
        fields = line.split()

        if not fields:
            # A blank line (e.g. at the end of the file) carries no request.
            continue

        live_code, live_path, new_code, new_path = '', '', '', ''
        hits, method, original_path, original_code = fields

        # Only paths that originally answered 2xx or 3xx are worth checking.
        if original_code[0] not in ('2', '3'):
            continue

        print('%s %s' % (hits, original_path))

        try:
            # Where does the live site send this path?
            url = 'http://www.codeforamerica.org' + original_path
            got = get(url)
            live_code = got.status_code
            _, live_host, live_path, _, _, _ = urlparse(got.url)

            if live_host != 'www.codeforamerica.org':
                # Redirected off-site: record the full destination.
                live_path = 'http://' + live_host + live_path

            # Where does the new local build send the same path?
            url = 'http://localhost' + local_dir + original_path
            got = get(url)
            new_code = got.status_code
            _, new_host, new_path, _, _, _ = urlparse(got.url)

            if new_host != 'localhost':
                new_path = 'http://' + new_host + new_path
            elif new_path.startswith(local_dir):
                # Strip the local prefix so paths compare with the live site.
                new_path = new_path[len(local_dir):]

            if live_code == new_code:
                status = 'OK'
            elif new_code != 404:
                status = '?'
            elif new_path == original_path \
                    and live_path not in (new_path + '/', original_path):
                # The new site 404s on the original path itself, while the
                # live site moved it somewhere other than a trailing-slash
                # variant: the new site needs an equivalent redirect.
                status = 'Needs redirect'
            else:
                status = 'Missing'

        except Exception as e:
            # Record the failure in the row instead of aborting the run;
            # fields gathered before the error are still written.
            status = 'Error: ' + unicode(e)

        results.writerow((original_path, original_code,
                          live_path, live_code,
                          new_path, new_code,
                          status, hits))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment