-
-
Save migurski/0ac8c84a462e95a2a119831d833e5251 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import sys | |
import csv | |
import argparse | |
import urllib.parse | |
import logging | |
import datetime | |
import re | |
import bs4 | |
import requests | |
import boto3 | |
import django.utils.text | |
BUCKET = 'help.osm.org-static-archive' | |
BASE_URL = 'https://help.openstreetmap.org/' | |
logging.basicConfig(level=logging.INFO, stream=sys.stdout) | |
# href patterns of interactive links that cannot work in a static archive.
# Raw strings keep `\d` a regex escape rather than an invalid string escape.
_VOTE_HREF = re.compile(r'/vote/\d+/(down|up)/$')
_FAVORITE_HREF = re.compile(r'/mark_favorite/\d+/$')


def _remove_first(root, description, name, **attrs):
    """Decompose the first `name` tag under `root` that matches `attrs`.

    `root` may be None (the expected container is missing from this page),
    in which case nothing happens. Logs 'Removed <description>' and returns
    True only when a tag was actually removed.
    """
    if root is None:
        return False
    tag = root.find(name, **attrs)
    if tag is None:
        return False
    tag.decompose()
    logging.info('Removed %s', description)
    return True


def _remove_all(root, description, name, **attrs):
    """Decompose every `name` tag under `root` that matches `attrs`.

    Logs 'Removed <description>' once per removed tag.
    """
    for tag in root.find_all(name, **attrs):
        tag.decompose()
        logging.info('Removed %s', description)


def mangle_html(old_html):
    '''Strip interactive elements from a page and return it as UTF-8 bytes.

    Parses `old_html` with BeautifulSoup's "html.parser", removes the page
    elements listed below when they are present, and replaces the search form
    with a notice stating the date the page was archived (today's date).
    Elements that are absent from a given page are skipped silently; one
    INFO log line is emitted per element removed.

    Removed: search form, "ask a question" link, answer form, pager and
    page-size controls, vote and favorite links, subscription box, sign-in
    link, notify bar, sorting/filtering tabs, and feed icons.

    Returns the serialized document encoded as UTF-8 bytes.
    '''
    soup = bs4.BeautifulSoup(old_html, features="html.parser")
    today = datetime.date.today().strftime('%B %d, %Y')

    # - Search form; replace with archive notice
    search_bar = soup.find('div', id='searchBar')
    search_form = search_bar.find('form') if search_bar is not None else None
    if search_form is not None:
        # The notice is appended to the form's parent before the form itself
        # is destroyed, so it ends up where the search box used to live.
        search_form.parent.append(
            bs4.BeautifulSoup(
                f'<p style="margin:.5em;font-size:1.5em">This static page was archived {today}</p>',
                features="html.parser",
            )
        )
        search_form.decompose()
        logging.info('Removed search form')

    # - “Ask a question”
    _remove_first(soup, 'ask-a-question', 'a', id='nav_ask')

    # - Your Answer form
    _remove_first(soup, 'your-answer', 'form', id='fmanswer')

    # - Pagination options (page, next, posts per page)
    tail = soup.find('div', id='tail')
    _remove_first(tail, 'pager', 'div', class_='pager')
    _remove_first(tail, 'pagesize', 'div', class_='pagesize')

    # - Vote buttons
    _remove_all(soup, 'one vote button', 'a', href=_VOTE_HREF)

    # - Star buttons
    _remove_all(soup, 'one star button', 'a', href=_FAVORITE_HREF)

    # - Follow this question
    _remove_first(soup, 'subscription box', 'div', id='subscription_box')

    # - Login page
    top = soup.find('div', id='top')
    _remove_first(top, 'login link', 'a', href='/account/signin/')

    # - "Notify" bar
    _remove_first(soup, 'notify bar', 'div', class_='notify')

    # - Sorting options (active, newest, hottest, most voted, unanswered)
    # - Filtering options (active, oldest, newest, popular)
    _remove_all(soup, 'one set of sorting/filtering tabs', 'div', class_='tabsA')

    # - Most RSS
    _remove_all(soup, 'one feed icon', 'a', class_='feed-icon')

    new_html = str(soup)
    return new_html.encode('utf8')
# Command line: a single CSV file that has at least `url` and `status`
# columns. Only rows whose status is '200' are fetched and archived.
parser = argparse.ArgumentParser()
parser.add_argument('filename')
args = parser.parse_args()

s3 = boto3.client('s3')

# newline='' lets the csv module do its own newline handling, as its
# documentation requires for files passed to a reader.
with open(args.filename, newline='') as file:
    for row in csv.DictReader(file):
        logging.info('{status} {url}'.format(**row))
        if row['status'] != '200':
            continue

        url = urllib.parse.urljoin(BASE_URL, row['url'])
        # Without a timeout a single stalled connection would hang the whole
        # run indefinitely.
        got = requests.get(
            url,
            headers={'User-Agent': 'Michal Migurski <mike@teczno.com>'},
            timeout=60,
        )
        if not got.ok:
            # Do not store an error page in the archive under the real key.
            logging.warning('Skipping {}: HTTP {}'.format(url, got.status_code))
            continue

        # Fall back to a generic type instead of raising KeyError when the
        # server sends no Content-Type header.
        content_type = got.headers.get('Content-Type', 'application/octet-stream')

        raw_response = got.content
        if 'text/html' in content_type:
            raw_response = mangle_html(raw_response)

        # S3 keys carry no leading slash; lstrip (rather than [1:]) avoids
        # eating a real character when the CSV URL has no leading slash.
        key_name = row['url'].lstrip('/')
        if row['url'].endswith('/'):
            # Special case directory listings
            key_name += 'index.html'

        logging.info('Putting to s3://{}/{}'.format(BUCKET, key_name))
        s3.put_object(
            Bucket=BUCKET,
            Key=key_name,
            Body=raw_response,
            ACL='public-read',
            ContentType=content_type,
        )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
url | method | status | count | |
---|---|---|---|---|
/questions/ | GET | 200 | -1 | |
/questions/unanswered/ | GET | 200 | -1 | |
/questions/61943/how-to-tag-a-syphon | GET | 200 | -1 | |
/questions/71100/how-to-tag-an-area-with-residential-levels-on-top-of-shops | GET | 200 | -1 | |
/tags/ | GET | 200 | -1 | |
/tags/tagging/ | GET | 200 | -1 | |
/users/ | GET | 200 | -1 | |
/users/104/frederik-ramm | GET | 200 | -1 | |
/badges/5/good-question | GET | 200 | -1 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment