Skip to content

Instantly share code, notes, and snippets.

@koutoftimer
Last active October 25, 2015 12:23
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save koutoftimer/8053332bae362a3430f0 to your computer and use it in GitHub Desktop.
Save koutoftimer/8053332bae362a3430f0 to your computer and use it in GitHub Desktop.
habraproxy
# ~*~ encoding: utf-8 ~*~
import argparse
import re
import requests
from urllib import urlencode
from urlparse import urljoin, urlparse
from bs4 import BeautifulSoup
from flask import Flask, Response, stream_with_context, request
#: Flask application object; the routes below are registered on it.
app = Flask(__name__)
#: Base URL of the upstream site being proxied (replaced by ``--host`` when
#: the module is run as a script).
app.HOST = 'http://habrahabr.ru'
#: Value sent as the ``Host`` header to the upstream site; filled in from
#: ``app.HOST`` in the ``__main__`` section.
app.DOMAIN = None
#: Substrings of a ``Content-Type`` header that mark a response as text whose
#: words should be rewritten.
ALLOWED_CONTENT_TYPES = ('text/html', 'text/xml', 'text/xhtml', 'text/plain')
#: Matches every standalone six-character word.  The word boundaries are
#: zero-width lookarounds on purpose: consuming the separator as part of the
#: match would leave the following word without a usable prefix, so in
#: ``"abcdef ghijkl"`` only the first word would be found.
WORD_RE = re.compile(r'(?<!\w)(?P<word>\w{6})(?!\w)', re.UNICODE)
#: Replacement template: the matched word followed by the trademark sign
#: (U+2122).  The backslash is escaped so ``\g`` is never read as a string
#: escape sequence.
WORD_REPLACEMENT = u'\\g<word>\u2122'
def html_content_type(content_type):
    """
    Tell whether a ``Content-Type`` value names one of the textual formats
    listed in ``ALLOWED_CONTENT_TYPES``.

    The check is a substring test, so parameters such as ``; charset=utf-8``
    after the media type do not prevent a match.

    :type content_type: str
    :rtype: bool
    """
    for mime in ALLOWED_CONTENT_TYPES:
        if content_type.count(mime):
            return True
    return False
def replace(text):
    """
    Append the trademark sign to every six-character word in the readable
    text of an HTML document.

    Text inside ``<script>`` and ``<style>`` elements, as well as comments,
    doctypes and other special string nodes, is left untouched: rewriting
    it would corrupt the page's JavaScript/CSS instead of its visible text.

    :param text: markup of the page to rewrite
    :type text: str
    :return: serialized markup with the words tagged
    :rtype: str
    """
    # Imported here so the file's top-level import block stays as it is.
    from bs4 import NavigableString

    skipped_parents = ('script', 'style')
    html = BeautifulSoup(text, 'html.parser')
    for node in html.find_all(text=True):
        # Comment, Doctype, CData, ... are subclasses of NavigableString;
        # ordinary page text is an instance of exactly NavigableString, so
        # an exact type check (not isinstance) is what is wanted here.
        if type(node) is not NavigableString:
            continue
        parent = node.parent
        if parent is not None and parent.name in skipped_parents:
            continue
        node.replace_with(WORD_RE.sub(WORD_REPLACEMENT, node))
    return str(html)
@app.route('/<path:url>')
def home(url):
    """
    Fetch ``url`` from the upstream site and relay the answer to the client.

    Textual responses with a status below 300 are passed through
    :func:`replace`; everything else (binary content, errors) is streamed
    back unchanged together with the upstream status code.

    :param url: path of the requested page, relative to ``app.HOST``
    :type url: str
    :return: rewritten markup, or a streaming ``Response``
    """
    headers = {k: v for k, v in request.headers if v}
    headers.update({'Host': app.DOMAIN})
    # NOTE(review): urljoin() returns its second argument unchanged when that
    # argument is itself an absolute URL with a different scheme -- confirm
    # whether the path can ever carry one and whether that is acceptable.
    target = urljoin(app.HOST, url)
    # Let requests build the query string: it inserts the "?" separator,
    # encodes non-ASCII values, and ``items(multi=True)`` keeps repeated keys.
    response = requests.get(
        target, params=list(request.args.items(multi=True)), stream=True,
        headers=headers, cookies=request.cookies)
    # The upstream may omit Content-Type; treat that as "not rewritable"
    # instead of failing with KeyError.
    content_type = response.headers.get('content-type')
    if response.status_code >= 300 or \
            not html_content_type(content_type or ''):
        # iter_content() defaults to one byte per chunk; use a sane size.
        body = stream_with_context(response.iter_content(chunk_size=8192))
        return Response(body, status=response.status_code,
                        content_type=content_type)
    return replace(response.text)
@app.route('/')
def index():
    """
    Handle requests for ``/`` by delegating to :func:`home` with an empty
    path.
    """
    return home(url='')
if __name__ == '__main__':
    # Command-line entry point: optionally override the proxied site, then
    # start Flask's built-in server.
    parser = argparse.ArgumentParser()
    parser.add_argument('--host', help='Site host name, including protocol: '
                                       'http://host, https://host')
    args = parser.parse_args()

    #: Configure flask app.
    if args.host:
        parsed_host = urlparse(args.host)
        # Without a scheme urlparse() yields an empty netloc, which would
        # leave the Host header blank and make every upstream request fail;
        # reject such a value up front with a usage error.
        if not (parsed_host.scheme and parsed_host.netloc):
            parser.error('--host must include the protocol, '
                         'e.g. http://host or https://host')
        app.HOST = args.host
    app.DOMAIN = urlparse(app.HOST).netloc
    app.run()
beautifulsoup4
flask
requests
$ python habraproxy.py --help
usage: habraproxy.py [-h] [--host HOST]
optional arguments:
-h, --help show this help message and exit
--host HOST Site host name, including protocol: http://host, https://host
$ python habraproxy.py --host https://docs.python.org
* Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment