Skip to content

Instantly share code, notes, and snippets.

@brawaga
Forked from anonymous/gist:06e0bd519490c8f03404
Last active August 26, 2015 19:49
Show Gist options
  • Save brawaga/e45a1df9021301efa9d9 to your computer and use it in GitHub Desktop.
Save brawaga/e45a1df9021301efa9d9 to your computer and use it in GitHub Desktop.
#!/usr/bin/python2
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import re
from BeautifulSoup import BeautifulSoup
import chardet
from tornado.httpclient import AsyncHTTPClient
import tornado.ioloop
from tornado.options import define, options
import tornado.web
import zlib
def tag_editable_selector(tag):
    """Tell whether *tag* declares itself as editable content.

    The ``contenteditable`` attribute is read through ``tag.get``. The tag
    counts as editable when the attribute is present with any value other
    than ``'false'`` (an empty value therefore counts as editable).
    """
    marker = tag.get('contenteditable', None)
    if marker is None:
        # Attribute absent: not editable.
        return False
    return marker != 'false'
def tag_selector(tag):
    """Decide whether text located directly inside *tag* may be rewritten.

    Returns ``False`` when:

    * the tag is one of the structural / non-prose elements listed below
      (document root, ``html``, ``head``, ``meta``, ``code``, ``style``,
      ``script``), or
    * the tag itself, or any of its ancestors, is marked as editable
      according to ``tag_editable_selector``.

    Returns ``True`` otherwise.
    """
    if tag.name in (
        '[document]', 'html', 'head', 'meta', 'code', 'style', 'script'
    ):
        return False
    # findParent() only walks the ancestors of ``tag``; the tag itself has to
    # be tested separately, otherwise text sitting directly inside a
    # contenteditable element would still be rewritten.
    if tag_editable_selector(tag) or tag.findParent(tag_editable_selector):
        return False
    return True
def prank_data(data):
    """Append a trademark sign to every six-letter word of the page text.

    *data* is the HTML document to process. It is parsed with BeautifulSoup,
    every plain text node whose parent passes ``tag_selector`` gets its
    six-character words (``\\w{6}`` between word boundaries, Unicode-aware)
    suffixed with the trademark sign, and the document is serialized back
    with ``str(soup)``.

    Returns the serialized document.
    """
    # Local import: the module header only imports the ``BeautifulSoup``
    # class from the package.
    from BeautifulSoup import NavigableString

    soup = BeautifulSoup(data)
    for txt in soup.findAll(text=True):
        # findAll(text=True) also yields comments, CDATA sections,
        # declarations and processing instructions (NavigableString
        # subclasses).  Replacing one of those with a plain string would drop
        # its markup and leak the content into the visible page, so only
        # exact NavigableString nodes are rewritten.
        if type(txt) is not NavigableString:
            continue
        if not tag_selector(txt.parent):
            continue
        newtext = re.sub(r"(?u)\b(\w{6})\b", r"\1™", txt)
        txt.replaceWith(newtext)
    return str(soup)
class MainHandler(tornado.web.RequestHandler):
    """Proxy handler.

    Every GET request is forwarded to ``options.host``; HTML responses are
    passed through ``prank_data`` before being returned to the client, all
    other responses are relayed unchanged.
    """

    def handle_response(self, resp):
        """Relay the upstream response *resp* to the client.

        Used as the callback of ``AsyncHTTPClient.fetch``.  On an upstream
        error an error page with the matching status is sent; otherwise the
        body is (optionally) un-gzipped, HTML is rewritten and re-labelled
        as UTF-8, and the request is finished.
        """
        if resp.error:
            # 599 is the pseudo status Tornado's HTTP client reports when no
            # HTTP response was received; it is not a real HTTP status, so
            # answer with 502 Bad Gateway instead.
            status = 502 if resp.code == 599 else resp.code
            # send_error() sets the response status and finishes the request
            # on its own; calling finish() again afterwards would raise.
            self.send_error(status)
            return

        # The body may be missing altogether (e.g. an empty response).
        page = resp.body or b''
        if 'gzip' in resp.headers.get('Content-Encoding', ''):
            # 16 + MAX_WBITS tells zlib to expect a gzip header/trailer.
            page = zlib.decompress(page, 16+zlib.MAX_WBITS)

        content_type = resp.headers.get('Content-Type', None)
        print('Content-Type: %s' % content_type)

        if 'text/html' in (content_type or ''):
            # chardet reports None when it cannot guess the encoding.
            enc = chardet.detect(page)['encoding'] or 'utf-8'
            page = prank_data(page.decode(enc, 'replace'))
            # Only the rewritten HTML is re-encoded, so only here may the
            # declared charset be changed; other content types keep the
            # charset their untouched body was sent with.
            # NOTE(review): relies on prank_data() returning UTF-8 bytes
            # (BeautifulSoup 3 default output encoding) - confirm.
            content_type, replaced = re.subn(
                r"(charset\=)(.*)", r"\1utf-8", content_type)
            if not replaced:
                content_type += '; charset=utf-8'

        if content_type is not None:
            self.set_header('Content-Type', content_type)
        self.write(page)
        self.finish()

    @tornado.web.asynchronous
    def get(self, path):
        """Fetch the requested resource from ``options.host`` asynchronously.

        *path* is the part of the URL captured by the route pattern.  The
        upstream URL is built from ``self.request.uri`` instead, so that the
        query string is forwarded as well and the path is sent exactly as
        the client wrote it.
        """
        httpclient = AsyncHTTPClient()
        url = 'http://' + options.host + self.request.uri
        print(url)
        httpclient.fetch(url, self.handle_response)
# Single catch-all route: the whole request path is captured and handed to
# MainHandler.
application = tornado.web.Application(
    handlers=[(r"/(.*)", MainHandler)],
)
def _run_server():
    """Declare the command-line options, parse them and serve forever.

    Prints the effective host and port, binds ``application`` to the chosen
    port and starts the Tornado I/O loop.
    """
    define("host", default="habrahabr.ru", help="Source host to tm-sorce")
    define("port", default=8888, help="Port to listen on", type=int)
    options.parse_command_line()
    print('%s %s' % (options.host, options.port))
    application.listen(options.port)
    tornado.ioloop.IOLoop.current().start()


if __name__ == "__main__":
    _run_server()
@brawaga
Copy link
Author

brawaga commented Aug 26, 2015

Just my version of this.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment