Skip to content

Instantly share code, notes, and snippets.

@migurski

migurski/mirror-and-mangle.py Secret

Last active Jun 4, 2020
Embed
What would you like to do?
#!/usr/bin/env python3
import sys
import csv
import argparse
import urllib.parse
import logging
import datetime
import re
import bs4
import requests
import boto3
import django.utils.text
BUCKET = 'help.osm.org-static-archive'
BASE_URL = 'https://help.openstreetmap.org/'
logging.basicConfig(level=logging.INFO, stream=sys.stdout)
# Raw strings keep "\d" a regex escape instead of an invalid string escape.
_VOTE_HREF = re.compile(r'/vote/\d+/(down|up)/$')
_FAVORITE_HREF = re.compile(r'/mark_favorite/\d+/$')


def _find_within(soup, container_id, name, **attrs):
    '''Return the first `name` tag matching attrs inside div#container_id, or None.'''
    container = soup.find('div', id=container_id)
    if container is None:
        return None
    return container.find(name, **attrs)


def _remove(element, message):
    '''Decompose element and log message; do nothing when element is None.'''
    if element is None:
        return
    element.decompose()
    logging.info(message)


def _remove_all(elements, message):
    '''Decompose every element in elements, logging message once per element.'''
    for element in elements:
        _remove(element, message)


def mangle_html(old_html):
    '''Strip interactive page elements and return the result as UTF-8 bytes.

    The markup is parsed with BeautifulSoup's "html.parser". The search form
    inside div#searchBar is replaced by a paragraph stating that the page was
    archived today. The following are then removed wherever they are found:
    the "ask a question" link, the answer form, the pager and page-size
    controls inside div#tail, vote and mark-favorite links, the subscription
    box, the sign-in link inside div#top, the first "notify" div, all
    "tabsA" divs and all "feed-icon" links.

    Elements that are absent are skipped silently; each removal is logged at
    INFO level.

    Args:
        old_html: Markup accepted by bs4.BeautifulSoup (the caller in this
            file passes the raw response body).

    Returns:
        The modified document serialized with str() and encoded as UTF-8.
    '''
    soup = bs4.BeautifulSoup(old_html, features="html.parser")
    today = datetime.date.today().strftime('%B %d, %Y')

    # - Search form; replace with archive notice
    search_form = _find_within(soup, 'searchBar', 'form')
    if search_form is not None:
        notice = bs4.BeautifulSoup(
            f'<p style="margin:.5em;font-size:1.5em">This static page was archived {today}</p>',
            features="html.parser",
        )
        # Append the notice before decomposing the form, while the form's
        # parent is still reachable.
        search_form.parent.append(notice)
        _remove(search_form, 'Removed search form')

    # - “Ask a question”
    _remove(soup.find('a', id='nav_ask'), 'Removed ask-a-question')

    # - Your Answer form
    _remove(soup.find('form', id='fmanswer'), 'Removed your-answer')

    # - Pagination options (page, next, posts per page)
    _remove(_find_within(soup, 'tail', 'div', class_='pager'), 'Removed pager')
    _remove(_find_within(soup, 'tail', 'div', class_='pagesize'), 'Removed pagesize')

    # - Vote buttons
    _remove_all(soup.find_all('a', href=_VOTE_HREF), 'Removed one vote button')

    # - Star buttons
    _remove_all(soup.find_all('a', href=_FAVORITE_HREF), 'Removed one star button')

    # - Follow this question
    _remove(soup.find('div', id='subscription_box'), 'Removed subscription box')

    # - Login page
    _remove(
        _find_within(soup, 'top', 'a', href='/account/signin/'),
        'Removed login link',
    )

    # - "Notify" bar
    _remove(soup.find('div', class_='notify'), 'Removed notify bar')

    # - Sorting options (active, newest, hottest, most voted, unanswered)
    # - Filtering options (active, oldest, newest, popular)
    _remove_all(
        soup.find_all('div', class_='tabsA'),
        'Removed one set of sorting/filtering tabs',
    )

    # - Most RSS
    _remove_all(soup.find_all('a', class_='feed-icon'), 'Removed one feed icon')

    return str(soup).encode('utf8')
# Command-line driver: read a CSV of crawled URLs, fetch each page that the
# CSV marks with status 200, strip interactive elements from HTML responses
# and upload the result to the S3 bucket named by BUCKET.
# Each CSV row must provide "url" and "status" columns.

# Seconds to wait on the server before giving up; without a timeout a
# stalled connection would block the whole run indefinitely.
REQUEST_TIMEOUT = 60

parser = argparse.ArgumentParser()
parser.add_argument('filename')
args = parser.parse_args()

s3 = boto3.client('s3')

# newline='' lets the csv module do its own line-ending handling.
with open(args.filename, newline='') as file:
    for row in csv.DictReader(file):
        logging.info('{status} {url}'.format(**row))
        if row['status'] != '200':
            continue

        url = urllib.parse.urljoin(BASE_URL, row['url'])
        got = requests.get(
            url,
            headers={'User-Agent': 'Michal Migurski <mike@teczno.com>'},
            timeout=REQUEST_TIMEOUT,
        )

        # The CSV status comes from an earlier crawl; check the live response
        # too so an error page is never published as archive content.
        if got.status_code != 200:
            logging.warning('Skipping {}: got HTTP {}'.format(url, got.status_code))
            continue

        # Fall back to a generic type instead of raising KeyError when the
        # server sends no Content-Type header.
        content_type = got.headers.get('Content-Type', 'application/octet-stream')

        raw_response = got.content
        if 'text/html' in content_type:
            raw_response = mangle_html(raw_response)

        # Drop the first character of the URL path (expected to be "/") to
        # form the object key.
        key_name = row['url'][1:]
        if row['url'].endswith('/'):
            # Special case directory listings
            key_name += 'index.html'

        logging.info('Putting to s3://{}/{}'.format(BUCKET, key_name))
        s3.put_object(
            Bucket=BUCKET,
            Key=key_name,
            Body=raw_response,
            ACL='public-read',
            ContentType=content_type,
        )
url method status count
/questions/ GET 200 -1
/questions/unanswered/ GET 200 -1
/questions/61943/how-to-tag-a-syphon GET 200 -1
/questions/71100/how-to-tag-an-area-with-residential-levels-on-top-of-shops GET 200 -1
/tags/ GET 200 -1
/tags/tagging/ GET 200 -1
/users/ GET 200 -1
/users/104/frederik-ramm GET 200 -1
/badges/5/good-question GET 200 -1
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment