Skip to content

Instantly share code, notes, and snippets.

@maxpoletaev
Forked from anonymous/gist:06e0bd519490c8f03404
Last active September 4, 2015 13:16
Show Gist options
  • Save maxpoletaev/b946d40a410e0a6493a9 to your computer and use it in GitHub Desktop.
Save maxpoletaev/b946d40a410e0a6493a9 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
from SimpleHTTPServer import SimpleHTTPRequestHandler
from SocketServer import TCPServer
from StringIO import StringIO
from lxml import html
import webbrowser
import urllib2
import gzip
import re
# TCP port the local proxy server listens on; also used when rewriting links
# so that they point back at this proxy.
PORT = 8232
# Host name of the upstream site whose pages are fetched and rewritten.
WEBSITE = 'habrahabr.ru'
class Proxy(SimpleHTTPRequestHandler):
    """HTTP request handler that relays GET requests to ``WEBSITE``.

    Every requested path is fetched from ``http://WEBSITE<path>``; HTML
    responses are passed through ``process_content`` before being returned
    to the client, all other content types are relayed unchanged.
    """

    def do_GET(self):
        """Fetch ``self.path`` from ``WEBSITE`` and send it to the client.

        Upstream HTTP errors are relayed with their original status code;
        failures to reach the upstream host are reported as 502 Bad Gateway.
        """
        try:
            response = urllib2.urlopen('http://' + WEBSITE + self.path)
        except urllib2.HTTPError as error:
            # HTTPError is a subclass of URLError, so it must be caught first.
            self.send_error(error.code)
            return
        except urllib2.URLError:
            # The upstream host could not be reached at all.
            self.send_error(502)
            return

        # Read everything up front so the upstream connection is always closed.
        try:
            info = response.info()
            content = response.read()
        finally:
            response.close()

        if info.get('Content-Encoding') == 'gzip':
            # The body is decompressed here, so it is served without a
            # Content-Encoding header below.
            content = gzip.GzipFile(fileobj=StringIO(content)).read()

        # The header may be missing; treat that as "not HTML".
        content_type = info.get('Content-Type') or ''
        if 'text/html' in content_type:
            content = process_content(content)

        # A status line and headers are required for a well-formed response.
        self.send_response(200)
        if content_type:
            self.send_header('Content-Type', content_type)
        self.send_header('Content-Length', str(len(content)))
        self.end_headers()
        self.wfile.write(content)
# Matches standalone words of exactly six word characters (Unicode-aware).
_SIX_CHAR_WORD = re.compile(r'(\b\w{6}\b)', flags=re.UNICODE)

# Elements whose textual content must be left untouched.
_IGNORED_TAGS = frozenset(['pre', 'code', 'script', 'style'])


def process_content(content):
    """Return ``content`` with six-character words marked and links rewritten.

    Inside ``<body>``, a trademark sign is appended to every word of exactly
    six characters, except for text located inside ``pre``, ``code``,
    ``script`` or ``style`` elements. ``href`` attributes of ``<a>`` elements
    that point at ``WEBSITE`` are rewritten to point at the local proxy.

    :param content: HTML document as received from the upstream server.
    :returns: the serialized, modified HTML document; empty or
        whitespace-only input is returned unchanged.
    """
    # lxml refuses to parse an empty document, so pass such input through.
    if not content or not content.strip():
        return content

    tree = html.fromstring(content)
    local_host = 'localhost:' + str(PORT)

    def add_tm(text):
        # u'\\1\u2122' is "group 1 followed by the trademark sign"; spelling
        # it with an escape keeps the literal independent of source encoding.
        return _SIX_CHAR_WORD.sub(u'\\1\u2122', unicode(text))

    def fix_link(text):
        # The local proxy speaks plain HTTP only, so downgrade https links
        # before handling http:// and protocol-relative // links.
        text = text.replace('https://%s' % WEBSITE, 'http://' + local_host)
        return text.replace('//%s' % WEBSITE, '//' + local_host)

    def inside_ignored(node):
        # True when any ancestor of ``node`` is an ignored element.
        return any(parent.tag in _IGNORED_TAGS
                   for parent in node.iterancestors())

    for node in tree.xpath('.//body/descendant-or-self::*'):
        protected = inside_ignored(node)
        # ``text`` is the content at the start of the element itself.
        if node.text and not protected and node.tag not in _IGNORED_TAGS:
            node.text = add_tm(node.text)
        # ``tail`` follows the element's closing tag and therefore belongs to
        # the parent: only the ancestors decide whether it is protected.
        if node.tail and not protected:
            node.tail = add_tm(node.tail)
        if node.tag == 'a' and 'href' in node.attrib:
            node.attrib['href'] = fix_link(node.attrib['href'])
    return html.tostring(tree)
def main():
    """Start the proxy server on ``PORT`` and open it in a web browser.

    Blocks serving requests until interrupted (for example with Ctrl+C),
    then releases the listening socket.
    """
    # Create the server first: the constructor binds and starts listening, so
    # a port conflict fails before a browser tab is opened and the browser's
    # first request finds a socket that is already accepting connections.
    server = TCPServer(('', PORT), Proxy)
    try:
        webbrowser.open('http://localhost:%d' % PORT)
        server.serve_forever()
    except KeyboardInterrupt:
        # Ctrl+C is the expected way to stop the server; exit quietly.
        pass
    finally:
        server.server_close()


# Run the proxy only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment