Created
July 6, 2016 22:58
-
-
Save deliro/cd1f79bdc0adb5d158714fdd880de42a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# coding: utf-8
# requirements:
# requests
# beautifulsoup4
# flask
# html5lib
import argparse
import re
import webbrowser
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from flask import Flask
# Parsed command-line options; stays None until the __main__ block assigns
# the result of parse_args(). handler() passes this value to Rewriter.
init_args = None

# The Flask application object on which the routes below are registered.
app = Flask(__name__)
class Rewriter(object):
    """Fetch one page of the proxied site and rewrite its HTML.

    The page is downloaded and parsed in the constructor. ``process`` then
    applies every rewrite step and returns the final markup.

    Attributes:
        args: The namespace passed to the constructor (``args.site`` is read).
        site: ``args.site`` with trailing slashes removed.
        page_url: Absolute URL of the fetched page (``site + url``).
        escape_tags: Names of tags whose direct text is left untouched.
        soup: The document parsed by ``BeautifulSoup`` with ``html5lib``.
    """

    # Tags whose text must never receive the trademark sign.
    DEFAULT_ESCAPE_TAGS = ('script', 'noscript', 'style', 'head')

    # Exactly six word characters delimited by word boundaries.
    _SIX_CHAR_WORD = re.compile(r'\b(\w{6})\b')

    def __init__(self, url, args, escape_tags=None, timeout=10):
        """Download and parse the page at ``args.site + url``.

        Args:
            url: Path on the proxied site, starting with ``/``.
            args: Object exposing the upstream base URL as ``args.site``.
            escape_tags: Optional collection of tag names to skip in
                ``add_tms``; defaults to ``DEFAULT_ESCAPE_TAGS``.
            timeout: Seconds passed to ``requests.get`` so that an
                unresponsive upstream cannot block the request forever.

        Raises:
            Whatever ``requests.get`` raises (connection failure, timeout)
            propagates unchanged.
        """
        if escape_tags is None:
            escape_tags = self.DEFAULT_ESCAPE_TAGS
        self.escape_tags = escape_tags
        self.args = args
        # Normalise once so "site + path" never produces a double slash.
        self.site = args.site.rstrip('/')
        self.page_url = self.site + url
        content = requests.get(self.page_url, timeout=timeout).content
        self.soup = BeautifulSoup(content, 'html5lib')

    @classmethod
    def _mark_words(cls, text):
        """Return *text* with a trademark sign after each six-character word."""
        return cls._SIX_CHAR_WORD.sub(r'\1™', text)

    def _to_relative(self, href):
        """Return *href* as a root-relative link if it points at the site."""
        if not href.startswith(self.site):
            return href
        rest = href[len(self.site):]
        if not rest:
            # A link to the bare site must lead to the proxy root, not "".
            return '/'
        if rest[0] == '/':
            return rest
        if rest[0] in '?#':
            return '/' + rest
        # Only shares the prefix (e.g. "https://site.example.evil/..."):
        # a different host, so leave it alone.
        return href

    def _to_absolute(self, link):
        """Resolve *link* against the fetched page's URL.

        Links that are already absolute are returned as they are;
        protocol-relative, root-relative and document-relative links become
        absolute upstream URLs.
        """
        return urljoin(self.page_url, link)

    def add_tms(self):
        """
        Append a trademark sign to every six-character word in the page text,
        skipping text whose direct parent is one of ``escape_tags``
        (scripts, styles and head by default).
        """
        # ``soup.strings`` yields only ordinary text nodes, so the doctype
        # and HTML comments are not replaced by plain text. Materialise the
        # generator first because the tree is modified while looping.
        for el in list(self.soup.strings):
            if el.parent.name not in self.escape_tags:
                el.replace_with(self._mark_words(el))

    def replace_links(self):
        """
        Turn absolute links to the proxied site into relative ones so that
        navigation stays on the proxy.
        """
        for el in self.soup.find_all('a', href=True):
            el['href'] = self._to_relative(el['href'])

    def bypass_styles(self):
        """
        Point stylesheet links at the upstream site so they are loaded
        directly, bypassing the proxy.
        """
        for el in self.soup.find_all('link', attrs={'rel': 'stylesheet'}, href=True):
            el['href'] = self._to_absolute(el['href'])

    def bypass_scripts(self):
        """
        Point script sources at the upstream site so they are loaded
        directly, bypassing the proxy.
        """
        for el in self.soup.find_all('script', attrs={'src': True}):
            el['src'] = self._to_absolute(el['src'])

    def process(self):
        """Apply every rewrite step and return the page as a string."""
        self.add_tms()
        self.replace_links()
        self.bypass_styles()
        self.bypass_scripts()
        # The doctype is preserved by add_tms, so the output needs no
        # stripping of a stray leading "html".
        return str(self.soup)
@app.route('/', defaults={'path': ''})
@app.route('/<path:path>')
def handler(path):
    """Serve the rewritten version of the upstream page at *path*.

    Args:
        path: Request path captured by the routes above; empty for the
            site root.

    Returns:
        The rewritten HTML produced by ``Rewriter.process``.
    """
    # Local import: only this view needs the request object.
    from flask import request

    if not path.startswith('/'):
        path = '/' + path
    # The route's path variable does not include the query string, so
    # forward it explicitly; otherwise "/search?q=x" is fetched as "/search".
    query = request.query_string.decode('utf-8', 'replace')
    if query:
        path += '?' + query
    return Rewriter(path, init_args).process()
def parse_args(argv=None):
    """Parse the proxy's command-line options.

    Args:
        argv: Optional list of argument strings. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` with ``host``, ``port`` and ``site``.
        ``site`` always starts with an explicit ``http://`` or ``https://``
        scheme and has no trailing slash.
    """
    parser = argparse.ArgumentParser(description='Habraproxy')
    parser.add_argument('--host', dest='host', type=str, default='localhost',
                        help='Host (eg: localhost).')
    parser.add_argument('--port', dest='port', type=int, default=5000,
                        help='Port (eg: 5000).')
    parser.add_argument('--site', dest='site', type=str,
                        default='https://habrahabr.ru',
                        help='Site (eg: https://habrahabr.ru).')
    args = parser.parse_args(argv)

    # Drop trailing slashes so that "site + '/path'" has a single slash.
    site = args.site.rstrip('/')
    # Test for a real scheme: a bare host such as "httpbin.org" also starts
    # with "http" and must still get the default scheme.
    if not site.lower().startswith(('http://', 'https://')):
        site = 'http://' + site
    args.site = site
    return args
if __name__ == '__main__':
    # Store the parsed options in the module-level name that handler() reads.
    init_args = parse_args()
    host, port = init_args.host, init_args.port
    # Open the proxy's root page in a browser tab, then start the Flask app
    # on the same host and port.
    webbrowser.open_new_tab('http://%s:%s/' % (host, port))
    app.run(host=host, port=port)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment