deliro/habraproxy.py

## habraproxy.py
# coding: utf-8

# requirements:
# requests
# beautifulsoup4
# flask
# html5lib

import argparse
import re
import webbrowser
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from bs4.element import PreformattedString
from flask import Flask
from flask import request

# Flask application object; handler() below is registered on it via @app.route.
app = Flask(__name__)

# Parsed command-line options. Assigned in the __main__ block and read by
# handler() on every request; stays None when the module is merely imported.
init_args = None

class Rewriter(object):
    """Download one page of the upstream site and rewrite it for the proxy.

    The page is fetched and parsed with html5lib in ``__init__``;
    ``process()`` then runs every rewrite step and returns the HTML text.
    """

    # Tags whose text must never receive a trademark sign.
    DEFAULT_ESCAPE_TAGS = ('script', 'noscript', 'style', 'head')

    # Seconds to wait for the upstream site. Without a timeout a stalled
    # upstream would block the request forever.
    REQUEST_TIMEOUT = 10

    # A "word" is exactly six word characters between two word boundaries.
    SIX_CHAR_WORD = re.compile(r'\b(\w{6})\b')

    def __init__(self, url, args, escape_tags=None):
        """Fetch ``args.site + url`` and parse the response body.

        :param url: path on the upstream site (the caller passes it with a
            leading ``/``).
        :param args: object with a ``site`` attribute holding the upstream
            base URL.
        :param escape_tags: names of tags whose text is left untouched by
            ``add_tms``; defaults to ``DEFAULT_ESCAPE_TAGS``.
        :raises requests.RequestException: when the upstream request fails
            or times out.
        """
        if escape_tags is None:
            escape_tags = self.DEFAULT_ESCAPE_TAGS
        self.escape_tags = escape_tags
        self.args = args
        response = requests.get(args.site + url, timeout=self.REQUEST_TIMEOUT)
        # Relative asset references are resolved against the URL that was
        # actually served, i.e. after any redirects.
        self.page_url = response.url
        self.soup = BeautifulSoup(response.content, 'html5lib')

    def _absolute_url(self, ref):
        """Return *ref* as an absolute upstream URL.

        Protocol-relative references (``//host/...``) are returned unchanged;
        everything else is resolved against the fetched page, so already
        absolute URLs survive intact and page-relative ones get the right
        directory.
        """
        if ref.startswith('//'):
            return ref
        return urljoin(self.page_url, ref)

    def add_tms(self):
        """Append a trademark sign to every six-character word.

        Text inside ``escape_tags`` is skipped. Comments, the doctype and
        other preformatted nodes are skipped as well: replacing them with a
        plain string would turn them into visible page text.
        """
        for node in self.soup.find_all(string=True):
            if isinstance(node, PreformattedString):
                continue
            if node.parent.name in self.escape_tags:
                continue
            marked = self.SIX_CHAR_WORD.sub(r'\1™', node)
            if marked != node:
                node.replace_with(marked)

    def replace_links(self):
        """Turn absolute links to the upstream site into proxy-local paths.

        Only a leading occurrence of the site URL is removed, so links to
        other hosts that merely mention the site (for example in a query
        string) stay intact. A link to the bare site becomes ``/``.
        """
        site = self.args.site.rstrip('/')
        # The site must be followed by a URL delimiter; otherwise a host
        # such as "<site>.example.com" would be truncated too.
        prefixes = tuple(site + delimiter for delimiter in '/?#')
        for link in self.soup.find_all('a', href=True):
            href = link['href']
            if href == site or href.startswith(prefixes):
                local = href[len(site):]
                link['href'] = local if local.startswith('/') else '/' + local

    def bypass_styles(self):
        """Make stylesheet links absolute so they are loaded past the proxy."""
        for link in self.soup.find_all('link', attrs={'rel': 'stylesheet'},
                                       href=True):
            link['href'] = self._absolute_url(link['href'])

    def bypass_scripts(self):
        """Make script sources absolute so they are loaded past the proxy."""
        for script in self.soup.find_all('script', attrs={'src': True}):
            script['src'] = self._absolute_url(script['src'])

    def process(self):
        """Run every rewrite step and return the document as an HTML string."""
        self.add_tms()
        self.replace_links()
        self.bypass_styles()
        self.bypass_scripts()

        # add_tms() keeps the doctype node intact, so the serialized document
        # never starts with a stray "html" that would have to be trimmed.
        return str(self.soup)


@app.route('/', defaults={'path': ''})
@app.route('/<path:path>')
def handler(path):
    """Serve the rewritten upstream page for any requested path.

    :param path: request path captured by the route, without the query
        string.
    :return: HTML text produced by ``Rewriter.process()``.
    """
    if not path.startswith('/'):
        path = '/' + path
    # The route only captures the path. Forward the query string as well,
    # otherwise URLs such as /search?q=... always fetch the bare path.
    query = request.query_string.decode('utf-8', 'replace')
    if query:
        path = path + '?' + query
    return Rewriter(path, init_args).process()


def parse_args(argv=None):
    """Parse the proxy's command-line options.

    :param argv: list of argument strings to parse; ``None`` (the default)
        makes argparse read ``sys.argv[1:]``.
    :return: namespace with ``host``, ``port`` and ``site``. ``site`` always
        carries an explicit scheme and has no trailing slash.
    """
    parser = argparse.ArgumentParser(description='Habraproxy')
    parser.add_argument('--host', dest='host', type=str, default='localhost',
                        help='Host (eg: localhost).')
    parser.add_argument('--port', dest='port', type=int, default=5000,
                        help='Port (eg: 5000).')
    parser.add_argument('--site', dest='site', type=str,
                        default='https://habrahabr.ru',
                        help='Site (eg: https://habrahabr.ru).')

    args = parser.parse_args(argv)
    # Test for a real scheme: a bare host such as "httpbin.org" also starts
    # with "http" but still needs one.
    if not args.site.lower().startswith(('http://', 'https://')):
        args.site = 'http://' + args.site
    # Request paths are appended with their own leading slash, so a trailing
    # slash here would produce "//" in upstream URLs.
    args.site = args.site.rstrip('/')
    return args


if __name__ == '__main__':
    # Parse the options once; handler() reads them from this module global.
    init_args = parse_args()

    # Open the proxy's start page in a new browser tab, then serve requests.
    start_page = 'http://{0}:{1}/'.format(init_args.host, init_args.port)
    webbrowser.open_new_tab(start_page)
    app.run(host=init_args.host, port=init_args.port)
	# coding: utf-8

	# requirements:
	# requests
	# beautifulsoup4
	# flask
	# html5lib

	import re
	import webbrowser
	import argparse

	import requests
	from bs4 import BeautifulSoup
	from flask import Flask

	# Flask application object; handler() below is registered on it via @app.route.
	app = Flask(__name__)

	# Parsed command-line options. Assigned in the __main__ block and read by
	# handler() on every request.
	init_args = None

	class Rewriter(object):
	    """Download one page of the upstream site and rewrite it for the proxy.

	    The page is fetched and parsed with html5lib in ``__init__``;
	    ``process()`` then runs every rewrite step and returns the HTML text.
	    """

	    # Tags whose text must never receive a trademark sign.
	    DEFAULT_ESCAPE_TAGS = ('script', 'noscript', 'style', 'head')

	    # Seconds to wait for the upstream site. Without a timeout a stalled
	    # upstream would block the request forever.
	    REQUEST_TIMEOUT = 10

	    # A "word" is exactly six word characters between two word boundaries.
	    SIX_CHAR_WORD = re.compile(r'\b(\w{6})\b')

	    def __init__(self, url, args, escape_tags=None):
	        """Fetch ``args.site + url`` and parse the response body.

	        :param url: path on the upstream site (the caller passes it with a
	            leading ``/``).
	        :param args: object with a ``site`` attribute holding the upstream
	            base URL.
	        :param escape_tags: names of tags whose text is left untouched by
	            ``add_tms``; defaults to ``DEFAULT_ESCAPE_TAGS``.
	        :raises requests.RequestException: when the upstream request fails
	            or times out.
	        """
	        if escape_tags is None:
	            escape_tags = self.DEFAULT_ESCAPE_TAGS
	        self.escape_tags = escape_tags
	        self.args = args
	        response = requests.get(args.site + url, timeout=self.REQUEST_TIMEOUT)
	        # Relative asset references are resolved against the URL that was
	        # actually served, i.e. after any redirects.
	        self.page_url = response.url
	        self.soup = BeautifulSoup(response.content, 'html5lib')

	    def _absolute_url(self, ref):
	        """Return *ref* as an absolute upstream URL.

	        Protocol-relative references (``//host/...``) are returned
	        unchanged; everything else is resolved against the fetched page,
	        so already absolute URLs survive intact and page-relative ones get
	        the right directory.
	        """
	        if ref.startswith('//'):
	            return ref
	        return urljoin(self.page_url, ref)

	    def add_tms(self):
	        """Append a trademark sign to every six-character word.

	        Text inside ``escape_tags`` is skipped. Comments, the doctype and
	        other preformatted nodes are skipped as well: replacing them with
	        a plain string would turn them into visible page text.
	        """
	        for node in self.soup.find_all(string=True):
	            if isinstance(node, PreformattedString):
	                continue
	            if node.parent.name in self.escape_tags:
	                continue
	            marked = self.SIX_CHAR_WORD.sub(r'\1™', node)
	            if marked != node:
	                node.replace_with(marked)

	    def replace_links(self):
	        """Turn absolute links to the upstream site into proxy-local paths.

	        Only a leading occurrence of the site URL is removed, so links to
	        other hosts that merely mention the site (for example in a query
	        string) stay intact. A link to the bare site becomes ``/``.
	        """
	        site = self.args.site.rstrip('/')
	        # The site must be followed by a URL delimiter; otherwise a host
	        # such as "<site>.example.com" would be truncated too.
	        prefixes = tuple(site + delimiter for delimiter in '/?#')
	        for link in self.soup.find_all('a', href=True):
	            href = link['href']
	            if href == site or href.startswith(prefixes):
	                local = href[len(site):]
	                link['href'] = local if local.startswith('/') else '/' + local

	    def bypass_styles(self):
	        """Make stylesheet links absolute so they are loaded past the proxy."""
	        for link in self.soup.find_all('link', attrs={'rel': 'stylesheet'},
	                                       href=True):
	            link['href'] = self._absolute_url(link['href'])

	    def bypass_scripts(self):
	        """Make script sources absolute so they are loaded past the proxy."""
	        for script in self.soup.find_all('script', attrs={'src': True}):
	            script['src'] = self._absolute_url(script['src'])

	    def process(self):
	        """Run every rewrite step and return the document as an HTML string."""
	        self.add_tms()
	        self.replace_links()
	        self.bypass_styles()
	        self.bypass_scripts()

	        # add_tms() keeps the doctype node intact, so the serialized
	        # document never starts with a stray "html" that needs trimming.
	        return str(self.soup)


	@app.route('/', defaults={'path': ''})
	@app.route('/<path:path>')
	def handler(path):
	    """Serve the rewritten upstream page for any requested path.

	    :param path: request path captured by the route, without the query
	        string.
	    :return: HTML text produced by ``Rewriter.process()``.
	    """
	    if not path.startswith('/'):
	        path = '/' + path
	    # The route only captures the path. Forward the query string as well,
	    # otherwise URLs such as /search?q=... always fetch the bare path.
	    query = request.query_string.decode('utf-8', 'replace')
	    if query:
	        path = path + '?' + query
	    return Rewriter(path, init_args).process()


	def parse_args(argv=None):
	    """Parse the proxy's command-line options.

	    :param argv: list of argument strings to parse; ``None`` (the default)
	        makes argparse read ``sys.argv[1:]``.
	    :return: namespace with ``host``, ``port`` and ``site``. ``site``
	        always carries an explicit scheme and has no trailing slash.
	    """
	    parser = argparse.ArgumentParser(description='Habraproxy')
	    parser.add_argument('--host', dest='host', type=str, default='localhost',
	                        help='Host (eg: localhost).')
	    parser.add_argument('--port', dest='port', type=int, default=5000,
	                        help='Port (eg: 5000).')
	    parser.add_argument('--site', dest='site', type=str,
	                        default='https://habrahabr.ru',
	                        help='Site (eg: https://habrahabr.ru).')

	    args = parser.parse_args(argv)
	    # Test for a real scheme: a bare host such as "httpbin.org" also
	    # starts with "http" but still needs one.
	    if not args.site.lower().startswith(('http://', 'https://')):
	        args.site = 'http://' + args.site
	    # Request paths are appended with their own leading slash, so a
	    # trailing slash here would produce "//" in upstream URLs.
	    args.site = args.site.rstrip('/')
	    return args


	if __name__ == '__main__':
	    # Parse the options once; handler() reads them from this module global.
	    init_args = parse_args()

	    # Open the proxy's start page in a new browser tab, then serve requests.
	    webbrowser.open_new_tab('http://%s:%s/' % (init_args.host, init_args.port))
	    app.run(host=init_args.host, port=init_args.port)