Skip to content

Instantly share code, notes, and snippets.

@deliro
Created July 6, 2016 22:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save deliro/cd1f79bdc0adb5d158714fdd880de42a to your computer and use it in GitHub Desktop.
Save deliro/cd1f79bdc0adb5d158714fdd880de42a to your computer and use it in GitHub Desktop.
# coding: utf-8
# requirements:
# requests
# beautifulsoup4
# flask
# html5lib
import argparse
import re
import webbrowser
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from bs4.element import PreformattedString
from flask import Flask, request
# Flask application object; the proxy route is registered on it with
# @app.route further down in this file.
app = Flask(__name__)
# Parsed command-line options. Stays None until the __main__ block assigns
# the result of parse_args(); the request handler passes it to Rewriter.
init_args = None
class Rewriter(object):
    """Fetch one page of the proxied site and rewrite it for local serving.

    The page at ``args.site + url`` is downloaded and parsed with html5lib.
    ``process()`` then:

    * appends a trademark sign to every six-character word in the text,
    * turns links that point at the proxied site into local links, so that
      navigation stays inside the proxy,
    * makes stylesheet and script URLs absolute, so that the browser loads
      them straight from the origin instead of through the proxy.

    Parameters:
        url: path on the proxied site, starting with ``/`` (may carry a
            query string).
        args: object with a ``site`` attribute holding the base URL of the
            proxied site (scheme + host).
        escape_tags: names of tags whose direct text children must not be
            touched by ``add_tms``; defaults to
            ``('script', 'noscript', 'style', 'head')``.

    Raises:
        requests.exceptions.RequestException: when the page cannot be
            fetched, including a timeout after ``REQUEST_TIMEOUT`` seconds.
    """

    # Every standalone run of exactly six word characters.
    _SIX_LETTER_WORD = re.compile(r'\b(\w{6})\b')

    # Seconds to wait for the upstream site; without a timeout a stalled
    # upstream would block the request forever.
    REQUEST_TIMEOUT = 10

    def __init__(self, url, args, escape_tags=None):
        if escape_tags is None:
            self.escape_tags = ('script', 'noscript', 'style', 'head')
        else:
            self.escape_tags = escape_tags
        self.args = args
        response = requests.get(args.site + url, timeout=self.REQUEST_TIMEOUT)
        # URL the page was finally served from (after redirects); relative
        # asset URLs are resolved against it.
        self.page_url = response.url
        self.soup = BeautifulSoup(response.content, 'html5lib')

    @classmethod
    def _mark_words(cls, text):
        """Return *text* with a trademark sign after each six-character word."""
        return cls._SIX_LETTER_WORD.sub(r'\1™', text)

    def _to_local(self, href):
        """Return *href* as a site-relative path if it points at the proxied site.

        Only a leading occurrence of the site URL is removed, and only when
        it is followed by ``/``, ``?``, ``#`` or the end of the string, so
        look-alike hosts and URLs that merely mention the site stay intact.
        """
        site = self.args.site.rstrip('/')
        if not href.startswith(site):
            return href
        rest = href[len(site):]
        if not rest:
            # A link to the bare site must lead to the proxy root; an empty
            # href would point at the current page instead.
            return '/'
        if rest[0] == '/':
            return rest
        if rest[0] in '?#':
            return '/' + rest
        # Same prefix but a different host (e.g. "site.example" vs
        # "site.example.org"): not ours.
        return href

    def _to_absolute(self, url):
        """Return *url* resolved against the fetched page's URL.

        Protocol-relative URLs (``//host/...``) are returned untouched;
        ``urljoin`` leaves URLs that already carry a scheme unchanged.
        """
        if url.startswith('//'):
            return url
        return urljoin(self.page_url, url)

    def add_tms(self):
        """Append a trademark sign to six-character words in the page text.

        Text nodes whose direct parent tag is listed in ``escape_tags`` are
        left alone. Comments, the doctype and other special strings are
        skipped as well: replacing them with a plain string would turn them
        into visible page text.
        """
        for el in self.soup.find_all(string=True):
            if isinstance(el, PreformattedString):
                continue
            if el.parent.name in self.escape_tags:
                continue
            marked = self._mark_words(el)
            if marked != el:
                el.replace_with(marked)

    def replace_links(self):
        """Rewrite absolute links to the proxied site as relative links."""
        for el in self.soup.find_all('a', href=True):
            el['href'] = self._to_local(el['href'])

    def bypass_styles(self):
        """Point stylesheet links at the origin so they bypass the proxy."""
        for el in self.soup.find_all('link', attrs={'rel': 'stylesheet'}, href=True):
            el['href'] = self._to_absolute(el['href'])

    def bypass_scripts(self):
        """Point script sources at the origin so they bypass the proxy."""
        for el in self.soup.find_all('script', attrs={'src': True}):
            el['src'] = self._to_absolute(el['src'])

    def process(self):
        """Apply all rewrites and return the resulting page as a string."""
        self.add_tms()
        self.replace_links()
        self.bypass_styles()
        self.bypass_scripts()
        # The doctype is no longer clobbered by add_tms, so the serialized
        # document needs no post-processing.
        return str(self.soup)
@app.route('/', defaults={'path': ''})
@app.route('/<path:path>')
def handler(path):
    """Serve the rewritten upstream page for any requested path.

    Parameters:
        path: the part of the request URL after the leading slash, as
            captured by the route (empty for the root).

    Returns:
        The rewritten page produced by ``Rewriter.process()``.
    """
    if not path.startswith('/'):
        path = '/' + path
    # Forward the query string too; otherwise search and paginated pages
    # would always be fetched without their parameters.
    query = request.query_string.decode('utf-8', 'replace')
    if query:
        path = '%s?%s' % (path, query)
    return Rewriter(path, init_args).process()
def parse_args(argv=None):
    """Parse the proxy's command-line options.

    Parameters:
        argv: list of argument strings to parse; ``None`` (the default)
            makes argparse read ``sys.argv[1:]``.

    Returns:
        An ``argparse.Namespace`` with ``host``, ``port`` and ``site``.
        ``site`` always carries an ``http://`` or ``https://`` scheme and
        has no trailing slash, so it can be concatenated with a path that
        starts with ``/``.

    Raises:
        SystemExit: raised by argparse on invalid options or an empty site.
    """
    parser = argparse.ArgumentParser(description='Habraproxy')
    parser.add_argument('--host', dest='host', type=str, default='localhost',
                        help='Host (eg: localhost).')
    parser.add_argument('--port', dest='port', type=int, default=5000,
                        help='Port (eg: 5000).')
    parser.add_argument('--site', dest='site', type=str,
                        default='https://habrahabr.ru',
                        help='Site (eg: https://habrahabr.ru).')
    args = parser.parse_args(argv)

    site = args.site.strip().rstrip('/')
    if not site:
        parser.error('--site must not be empty')
    # Test for a real scheme prefix: a bare startswith('http') would treat
    # hosts such as "httpbin.org" as already having a scheme.
    if not site.lower().startswith(('http://', 'https://')):
        site = 'http://' + site
    args.site = site
    return args
if __name__ == '__main__':
    # Script entry point: read the options, open the proxy's root page in a
    # new browser tab, then start serving (app.run blocks until shutdown).
    init_args = parse_args()
    listen_host, listen_port = init_args.host, init_args.port
    webbrowser.open_new_tab('http://%s:%s/' % (listen_host, listen_port))
    app.run(host=listen_host, port=listen_port)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment