|
#!/usr/bin/env python3 |
|
import sys |
|
import csv |
|
import argparse |
|
import urllib.parse |
|
import logging |
|
import datetime |
|
import re |
|
|
|
import bs4 |
|
import requests |
|
import boto3 |
|
import django.utils.text |
|
|
|
# Name of the S3 bucket that uploaded objects are written to.
BUCKET = 'help.osm.org-static-archive'

# Root of the live site; URL paths are joined onto this before fetching.
BASE_URL = 'https://help.openstreetmap.org/'

# Emit INFO-and-above log records on stdout.
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
|
|
# Hrefs of the interactive vote and favorite links, compiled once at import.
_VOTE_HREF = re.compile(r'/vote/\d+/(down|up)/$')
_STAR_HREF = re.compile(r'/mark_favorite/\d+/$')


def _find_in_div(soup, div_id, *args, **kwargs):
    ''' Return the first find(*args, **kwargs) match inside <div id=div_id>.

        Returns None when either the div or the nested element is absent.
    '''
    container = soup.find('div', id=div_id)
    if container is None:
        return None
    return container.find(*args, **kwargs)


def _discard(tag, message):
    ''' Decompose tag and log message at INFO; do nothing when tag is None.
    '''
    if tag is None:
        return
    tag.decompose()
    logging.info(message)


def mangle_html(old_html):
    ''' Return page HTML with its interactive controls stripped out.

        old_html is parsed with BeautifulSoup's "html.parser". The search
        form is replaced by a notice carrying today's date, and these
        elements are deleted when present: the ask-a-question link, the
        answer form, pager and page-size controls, vote and star links,
        the subscription box, the sign-in link, the notify bar, the
        sorting/filtering tab sets and the feed icons. Anything that is
        not found is skipped; one INFO line is logged per removed element.

        Returns the modified document serialized to a string and encoded
        as UTF-8 bytes.
    '''
    soup = bs4.BeautifulSoup(old_html, features="html.parser")
    today = datetime.date.today().strftime('%B %d, %Y')

    # - Search form; replace with archive notice
    search_form = _find_in_div(soup, 'searchBar', 'form')
    if search_form is not None:
        # Append the notice to the form's parent before removing the form,
        # so the notice lands in the same container.
        search_form.parent.append(
            bs4.BeautifulSoup(
                f'<p style="margin:.5em;font-size:1.5em">This static page was archived {today}</p>',
                features="html.parser",
            )
        )
        _discard(search_form, 'Removed search form')

    # - "Ask a question"
    _discard(soup.find('a', id='nav_ask'), 'Removed ask-a-question')

    # - Your Answer form
    _discard(soup.find('form', id='fmanswer'), 'Removed your-answer')

    # - Pagination options (page, next, posts per page)
    _discard(_find_in_div(soup, 'tail', 'div', class_='pager'), 'Removed pager')
    _discard(_find_in_div(soup, 'tail', 'div', class_='pagesize'), 'Removed pagesize')

    # - Vote buttons
    for vote_button in soup.find_all('a', href=_VOTE_HREF):
        _discard(vote_button, 'Removed one vote button')

    # - Star buttons
    for star_button in soup.find_all('a', href=_STAR_HREF):
        _discard(star_button, 'Removed one star button')

    # - Follow this question
    _discard(soup.find('div', id='subscription_box'), 'Removed subscription box')

    # - Login page
    _discard(
        _find_in_div(soup, 'top', 'a', href='/account/signin/'),
        'Removed login link',
    )

    # - "Notify" bar
    _discard(soup.find('div', class_='notify'), 'Removed notify bar')

    # - Sorting options (active, newest, hottest, most voted, unanswered)
    # - Filtering options (active, oldest, newest, popular)
    for tab_set in soup.find_all('div', class_='tabsA'):
        _discard(tab_set, 'Removed one set of sorting/filtering tabs')

    # - Most RSS
    for feed_icon in soup.find_all('a', class_='feed-icon'):
        _discard(feed_icon, 'Removed one feed icon')

    new_html = str(soup)
    return new_html.encode('utf8')
|
|
|
# Seconds to wait on the live site per request. Without a timeout, requests
# waits on a stalled connection indefinitely and the whole run hangs.
REQUEST_TIMEOUT = 60

# Command line: a single CSV file with at least "status" and "url" columns.
parser = argparse.ArgumentParser()
parser.add_argument('filename')

args = parser.parse_args()
s3 = boto3.client('s3')

# newline='' is what the csv module asks for so that it can handle line
# endings embedded in quoted fields itself.
with open(args.filename, newline='') as file:
    for row in csv.DictReader(file):
        logging.info('%s %s', row['status'], row['url'])

        # Only rows recorded with status 200 are archived.
        if row['status'] != '200':
            continue

        url = urllib.parse.urljoin(BASE_URL, row['url'])

        try:
            got = requests.get(
                url,
                headers={'User-Agent': 'Michal Migurski <mike@teczno.com>'},
                timeout=REQUEST_TIMEOUT,
            )
            # A non-2xx reply here would otherwise be uploaded as if it
            # were the real page.
            got.raise_for_status()
        except requests.RequestException as err:
            logging.warning('Skipping %s: %s', url, err)
            continue

        # The header can be absent; fall back to a generic binary type
        # instead of raising KeyError.
        content_type = got.headers.get('Content-Type', 'application/octet-stream')
        raw_response = got.content

        if 'text/html' in content_type:
            raw_response = mangle_html(raw_response)

        # Object keys are the URL path without its leading slash.
        path = row['url']
        key_name = path[1:] if path.startswith('/') else path
        if path.endswith('/'):
            # Special case directory listings
            key_name += 'index.html'

        logging.info('Putting to s3://%s/%s', BUCKET, key_name)
        s3.put_object(
            Bucket=BUCKET,
            Key=key_name,
            Body=raw_response,
            ACL='public-read',
            ContentType=content_type,
        )