valignatev/habraproxy.py

## habraproxy.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import string
import webbrowser
import re
from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer

# pip install requests
# pip install beautifulsoup4
# pip install lxml
import requests
from bs4 import BeautifulSoup, Comment
from bs4.dammit import EntitySubstitution
from bs4.element import NavigableString

# Upstream site whose pages are fetched and rewritten by the proxy
SITE = 'https://habrahabr.ru'
# Local TCP port the proxy server listens on
PORT = 8232

# Matches a standalone run of exactly six word characters (Unicode-aware).
# The capturing group makes re.split() keep the matched words in its result.
REGEXP = re.compile(ur'\b(\w{6})\b', re.UNICODE)

# Tags whose direct text content must not be modified
NOT_ALLOWED_TAGS = [
    'body',
    'head',
    'link',
    'meta',
    'style',
    'title',
    'script',
]


def all_but_comments(text):
    """
    Filter for find_all(string=...): accept text nodes that contain
    a six-character word, unless the node is an HTML comment.

    Returns None when the text holds no six-character word, otherwise
    a bool that is True for anything that is not a Comment.
    """
    found = REGEXP.search(text)
    if not found:
        # No six-character word here: hand back the (None) search result
        return found
    return not isinstance(text, Comment)


def add_trademark(word):
    """
    Return word followed by the trademark sign if it is six characters long

    Leading and trailing punctuation and whitespace are ignored when the
    length is measured, but they are kept in the returned text.
    Any other input is returned unchanged.
    """
    ignorable = string.punctuation + string.whitespace
    if len(word.strip(ignorable)) != 6:
        return word
    return word + u'\u2122'


def transform_content(page_tree):
    """
    Adds trademark symbol to every six-character word in the page text

    page_tree is a BeautifulSoup object; its text nodes are replaced
    in place. Text that sits directly inside one of NOT_ALLOWED_TAGS
    is left untouched.

    Returns the whole document serialized as UTF-8 encoded bytes.
    """
    for content in page_tree.find_all(string=all_but_comments):
        # Check the parent first so no work is done for text we must not touch
        if content.parent.name in NOT_ALLOWED_TAGS:
            continue
        pieces = REGEXP.split(unicode(content))
        # REGEXP has one capturing group, so split() alternates between
        # the text around the words (even indexes) and the six-character
        # words themselves (odd indexes). Only the words may be marked:
        # a gap such as u' ab cde' also strips down to six characters and
        # would otherwise receive a trademark sign of its own.
        transformed_text = u''.join(
            add_trademark(piece) if index % 2 else piece
            for index, piece in enumerate(pieces)
        )
        content.replace_with(transformed_text)
    return page_tree.encode('utf-8', formatter=habraformatter)


def habraformatter(entity):
    """
    Output formatter that escapes XML entities only inside <code> tags

    Text inside <code> is run through EntitySubstitution.substitute_xml
    so that markup shown as source code stays visible as text.
    Everything else is returned exactly as received, because escaping
    all strings would also mangle ordinary page text
    (plus signs on the front page, for example).
    """
    inside_code = (
        isinstance(entity, NavigableString) and entity.parent.name == 'code'
    )
    if not inside_code:
        return entity
    return EntitySubstitution.substitute_xml(entity)


class HttpProcessor(BaseHTTPRequestHandler):
    """
    Request handler that proxies GET requests to SITE

    The upstream body has every occurrence of SITE removed so that
    absolute links keep pointing at the proxy. HTML responses are
    additionally passed through transform_content.
    """

    # Seconds to wait for the upstream site before giving up; without
    # a timeout a stalled upstream would block this handler forever
    UPSTREAM_TIMEOUT = 30

    def do_GET(self):
        """
        Fetch the requested path from SITE and relay the response

        Replies with 502 Bad Gateway when the upstream request fails.
        """
        try:
            r = requests.get(SITE + self.path, timeout=self.UPSTREAM_TIMEOUT)
        except requests.RequestException:
            self.send_error(502, 'Upstream request failed')
            return

        # The header may be absent; treat that as "not HTML"
        content_type = r.headers.get('content-type', '')

        # Replace url for better navigation
        request_content = r.content.replace(SITE, '')

        # Build the whole body before any header is sent, so a failure
        # while transforming cannot leave a half-written response behind
        if 'text/html' in content_type:
            page_tree = BeautifulSoup(request_content, 'lxml')
            request_content = transform_content(page_tree)
            # transform_content always serializes as UTF-8, so advertise
            # that instead of whatever charset the upstream declared
            content_type = 'text/html; charset=utf-8'

        self.send_response(r.status_code)
        if content_type:
            self.send_header('Content-type', content_type)
        self.end_headers()
        self.wfile.write(request_content)


def main():
    """
    Start the proxy on localhost:PORT and open it in the web browser

    Blocks while serving requests. Ctrl+C stops the server, after which
    the listening socket is released.
    """
    server = HTTPServer(("localhost", PORT), HttpProcessor)
    webbrowser.open_new('http://localhost:%s' % PORT)
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        # Ctrl+C is the normal way to stop the proxy; exit without traceback
        pass
    finally:
        server.server_close()


# Start the proxy only when this file is executed as a script
if __name__ == '__main__':
    main()
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-

	import string
	import webbrowser
	import re
	from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer

	# pip install requests
	# pip install beautifulsoup4
	# pip install lxml
	import requests
	from bs4 import BeautifulSoup, Comment
	from bs4.dammit import EntitySubstitution
	from bs4.element import NavigableString

	# Upstream site whose pages are fetched and rewritten by the proxy
	SITE = 'https://habrahabr.ru'
	# Local TCP port the proxy server listens on
	PORT = 8232

	# Matches a standalone run of exactly six word characters (Unicode-aware).
	# The capturing group makes re.split() keep the matched words in its result.
	REGEXP = re.compile(ur'\b(\w{6})\b', re.UNICODE)

	# Tags whose direct text content must not be modified
	NOT_ALLOWED_TAGS = [
	'body',
	'head',
	'link',
	'meta',
	'style',
	'title',
	'script',
	]


	def all_but_comments(text):
	    """
	    Filter for find_all(string=...): accept text nodes that contain
	    a six-character word, unless the node is an HTML comment.

	    Returns None when the text holds no six-character word, otherwise
	    a bool that is True for anything that is not a Comment.
	    """
	    found = REGEXP.search(text)
	    if not found:
	        # No six-character word here: hand back the (None) search result
	        return found
	    return not isinstance(text, Comment)


	def add_trademark(word):
	    """
	    Return word followed by the trademark sign if it is six characters long

	    Leading and trailing punctuation and whitespace are ignored when the
	    length is measured, but they are kept in the returned text.
	    Any other input is returned unchanged.
	    """
	    ignorable = string.punctuation + string.whitespace
	    if len(word.strip(ignorable)) != 6:
	        return word
	    return word + u'\u2122'


	def transform_content(page_tree):
	    """
	    Adds trademark symbol to every six-character word in the page text

	    page_tree is a BeautifulSoup object; its text nodes are replaced
	    in place. Text that sits directly inside one of NOT_ALLOWED_TAGS
	    is left untouched.

	    Returns the whole document serialized as UTF-8 encoded bytes.
	    """
	    for content in page_tree.find_all(string=all_but_comments):
	        # Check the parent first so no work is done for text we must not touch
	        if content.parent.name in NOT_ALLOWED_TAGS:
	            continue
	        pieces = REGEXP.split(unicode(content))
	        # REGEXP has one capturing group, so split() alternates between
	        # the text around the words (even indexes) and the six-character
	        # words themselves (odd indexes). Only the words may be marked:
	        # a gap such as u' ab cde' also strips down to six characters and
	        # would otherwise receive a trademark sign of its own.
	        transformed_text = u''.join(
	            add_trademark(piece) if index % 2 else piece
	            for index, piece in enumerate(pieces)
	        )
	        content.replace_with(transformed_text)
	    return page_tree.encode('utf-8', formatter=habraformatter)


	def habraformatter(entity):
	    """
	    Output formatter that escapes XML entities only inside <code> tags

	    Text inside <code> is run through EntitySubstitution.substitute_xml
	    so that markup shown as source code stays visible as text.
	    Everything else is returned exactly as received, because escaping
	    all strings would also mangle ordinary page text
	    (plus signs on the front page, for example).
	    """
	    if isinstance(entity, NavigableString) and entity.parent.name == 'code':
	        return EntitySubstitution.substitute_xml(entity)
	    else:
	        return entity


	class HttpProcessor(BaseHTTPRequestHandler):
	    """
	    Request handler that proxies GET requests to SITE

	    The upstream body has every occurrence of SITE removed so that
	    absolute links keep pointing at the proxy. HTML responses are
	    additionally passed through transform_content.
	    """

	    # Seconds to wait for the upstream site before giving up; without
	    # a timeout a stalled upstream would block this handler forever
	    UPSTREAM_TIMEOUT = 30

	    def do_GET(self):
	        """
	        Fetch the requested path from SITE and relay the response

	        Replies with 502 Bad Gateway when the upstream request fails.
	        """
	        try:
	            r = requests.get(SITE + self.path, timeout=self.UPSTREAM_TIMEOUT)
	        except requests.RequestException:
	            self.send_error(502, 'Upstream request failed')
	            return

	        # The header may be absent; treat that as "not HTML"
	        content_type = r.headers.get('content-type', '')

	        # Replace url for better navigation
	        request_content = r.content.replace(SITE, '')

	        # Build the whole body before any header is sent, so a failure
	        # while transforming cannot leave a half-written response behind
	        if 'text/html' in content_type:
	            page_tree = BeautifulSoup(request_content, 'lxml')
	            request_content = transform_content(page_tree)
	            # transform_content always serializes as UTF-8, so advertise
	            # that instead of whatever charset the upstream declared
	            content_type = 'text/html; charset=utf-8'

	        self.send_response(r.status_code)
	        if content_type:
	            self.send_header('Content-type', content_type)
	        self.end_headers()
	        self.wfile.write(request_content)


	def main():
	    """
	    Start the proxy on localhost:PORT and open it in the web browser

	    Blocks while serving requests. Ctrl+C stops the server, after which
	    the listening socket is released.
	    """
	    server = HTTPServer(("localhost", PORT), HttpProcessor)
	    webbrowser.open_new('http://localhost:%s' % PORT)
	    try:
	        server.serve_forever()
	    except KeyboardInterrupt:
	        # Ctrl+C is the normal way to stop the proxy; exit without traceback
	        pass
	    finally:
	        server.server_close()


	# Start the proxy only when this file is executed as a script
	if __name__ == '__main__':
	    main()