Skip to content

Instantly share code, notes, and snippets.

@sdfsdhgjkbmnmxc
Created May 28, 2015 18:39
Show Gist options
  • Save sdfsdhgjkbmnmxc/0825bd4cb3da7cf0062e to your computer and use it in GitHub Desktop.
Save sdfsdhgjkbmnmxc/0825bd4cb3da7cf0062e to your computer and use it in GitHub Desktop.
habraproxy.py
# -*- coding: utf-8 -*-
import BaseHTTPServer
import requests
from bs4 import BeautifulSoup
import string
# Listening address of the proxy; main() combines these into the
# (HOST, PORT) pair handed to HTTPServer.
# The empty host string means "bind to every local interface".
HOST = ''
# TCP port the proxy accepts connections on.
PORT = 8232
def main():
    """Start the proxy on (HOST, PORT) and serve requests until interrupted.

    Blocks inside ``serve_forever()``. Any exception that ends the loop
    (including ``KeyboardInterrupt``) still propagates to the caller, but the
    listening socket is released first.
    """
    httpd = BaseHTTPServer.HTTPServer((HOST, PORT), Handler)
    try:
        httpd.serve_forever()
    finally:
        # Without this the listening socket stays open until the interpreter
        # tears the object down, e.g. after Ctrl-C.
        httpd.server_close()
class Handler(BaseHTTPServer.BaseHTTPRequestHandler):
    """Request handler that mirrors pages of ``site`` through this proxy.

    Every GET is re-issued against ``http://<site><path>``. Absolute links
    back to the site are made relative so navigation stays on the proxy, and
    HTML responses are passed through ``transform_html`` before being sent.
    """

    # Upstream host whose pages are proxied.
    site = 'habrahabr.ru'
    # Seconds to wait for the upstream server. Deliberately not called
    # ``timeout``: the base handler class already uses that attribute for the
    # client socket.
    upstream_timeout = 30

    def do_GET(self):
        """Fetch the requested path from ``site`` and relay it to the client.

        Replies with 502 when the upstream request fails; otherwise the
        upstream status code is forwarded together with the (possibly
        transformed) body.
        """
        url = 'http://' + self.site + self.path
        try:
            # requests waits indefinitely unless a timeout is given.
            upstream = requests.get(url, timeout=self.upstream_timeout)
        except requests.RequestException:
            self.send_error(502, 'Upstream request failed')
            return

        # Strip the scheme and host from absolute links so they resolve
        # against the proxy instead of the original site.
        content = upstream.content.replace('http://' + self.site, '')

        # The header may be missing; treat that as "not HTML".
        content_type = upstream.headers.get('content-type', '')
        if content_type.startswith('text/html'):
            # Transform before any header is written, so a failure here does
            # not leave a half-sent response behind.
            content = transform_html(content)
            # transform_html always returns UTF-8 bytes, whatever charset the
            # upstream server declared.
            content_type = 'text/html; charset=utf-8'

        # Forward the real status instead of always answering 200.
        self.send_response(upstream.status_code)
        if content_type:
            self.send_header('Content-type', content_type)
        self.end_headers()
        self.wfile.write(content)
def transform_html(html):
    """Apply ``transform`` to every visible text node of an HTML document.

    Text inside ``<style>``, ``<script>``, ``<head>`` and ``<title>``, text
    directly under the document root, and HTML comments are left untouched.
    Whitespace-only nodes are skipped as well.

    Returns the re-serialized (prettified) document encoded as UTF-8 bytes.
    """
    # Function-scope import: Comment is the only bs4 name this block needs
    # beyond what the module already imports.
    from bs4 import Comment

    skip = ('style', 'script', '[document]', 'head', 'title')
    # Name the parser explicitly; otherwise bs4 picks whichever parser happens
    # to be installed (and warns), so output could differ between machines.
    soup = BeautifulSoup(html, 'html.parser')
    for node in soup.find_all(text=True):
        # Comments are string nodes too; replacing one with plain text would
        # make the comment visible on the page.
        if isinstance(node, Comment) or node.parent.name in skip:
            continue
        text = node.strip()
        if text:
            node.replace_with(transform(text))
    return soup.prettify().encode('utf-8')
def is_tm_word(s):
    """Return True when *s* is exactly six characters long and its last
    character is not ASCII punctuation.

    Tokens such as ``"hello,"`` are five-letter words followed by a
    punctuation mark and therefore do not qualify.
    """
    # endswith() only tests several alternative suffixes when given a tuple;
    # passing string.punctuation directly would look for the entire
    # punctuation string as one suffix and never match.
    return len(s) == 6 and not s.endswith(tuple(string.punctuation))
def transform(s):
    """Append a trademark sign to each whitespace-separated token of *s* that
    ``is_tm_word`` accepts, and re-join all tokens with single spaces."""
    words = []
    for word in s.split():
        if is_tm_word(word):
            word = word + u'™'
        words.append(word)
    return u' '.join(words)
# Start the proxy only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment