Skip to content

Instantly share code, notes, and snippets.

@MyGodIsHe
Last active September 5, 2016 11:23
Show Gist options
  • Save MyGodIsHe/22a34069cf25f645578ebb96886b7c5c to your computer and use it in GitHub Desktop.
Save MyGodIsHe/22a34069cf25f645578ebb96886b7c5c to your computer and use it in GitHub Desktop.
# Requires BeautifulSoup 3 (imported below as `BeautifulSoup`): pip install BeautifulSoup
#!/usr/bin/python
import SocketServer
import SimpleHTTPServer
import urllib
import re
from BeautifulSoup import BeautifulSoup, NavigableString, Tag, PageElement
# Host name of the upstream site that the proxy fetches pages from.
TARGET = 'habrahabr.ru'
# Address and port the local proxy server binds to.
HOST = '127.0.0.1'
PORT = 8232
# Base URL of the local proxy; substituted for the upstream origin in links.
PROXY_URL = 'http://%s:%s' % (HOST, PORT)
# Matches the "http(s)://<TARGET>" prefix of an absolute link. TARGET is
# escaped so that "." only matches a literal dot instead of any character.
URL_REGEX = re.compile('^https?://%s' % re.escape(TARGET))
# Fix for the "ampersand bug": replace PageElement._sub_entity with a version
# that returns the first character of the regex match as-is.
# NOTE(review): presumably this stops BeautifulSoup from turning "&" (and
# other special characters) into entities on output -- confirm against the
# BeautifulSoup 3 sources.
def _keep_matched_char(self, match):
    return match.group(0)[0]


PageElement._sub_entity = _keep_matched_char
def find_text(tag):
    """Collect the plain text nodes found under *tag*, depth first.

    As a side effect, the ``href`` of every ``<a>`` element visited is
    rewritten so that a leading match of URL_REGEX is replaced by PROXY_URL.

    Both type checks compare the exact type, so instances of subclasses of
    ``Tag`` or ``NavigableString`` are treated as neither; they are only
    descended into when they have a ``contents`` attribute.

    Returns a list of ``NavigableString`` nodes in document order. The
    contents of ``<script>`` and ``<style>`` elements are skipped.
    """
    node_type = type(tag)

    # A plain text node is a leaf: report it and stop.
    if node_type is NavigableString:
        return [tag]

    if node_type is Tag:
        name = tag.name
        # Code and style sheets are not human-readable text.
        if name in ('script', 'style'):
            return []
        # Point links to the target site at the proxy instead.
        if name == 'a' and tag.get('href'):
            tag['href'] = URL_REGEX.sub(PROXY_URL, tag['href'])

    # Recurse into the children, if this node has any.
    found = []
    for child in getattr(tag, 'contents', ()):
        found.extend(find_text(child))
    return found
def tm_injection(data):
    """Yield the pieces of *data* with a trademark sign after six-letter words.

    *data* is iterated character by character. A "word" is a maximal run of
    characters for which ``isalpha()`` is true; whenever such a run is exactly
    six characters long, the trademark sign (U+2122) is emitted right after
    it. Joining the yielded pieces with ``''.join(...)`` gives the resulting
    text.

    The sign is written as a unicode escape rather than as a raw non-ASCII
    byte-string literal: the latter is a syntax error in a Python 2 source
    file without an encoding declaration and cannot be concatenated with
    unicode text.
    """
    trademark = u'\u2122'
    run_length = 0
    for c in data:
        if c.isalpha():
            run_length += 1
        else:
            # A non-letter ends the current word; mark it if it had 6 letters.
            if run_length == 6:
                c = trademark + c
            run_length = 0
        yield c
    # The text may end in the middle of a word, with no terminating character.
    if run_length == 6:
        yield trademark
class HabrWrapper(object):
    """File-like wrapper that rewrites an HTML page as it is read.

    The wrapped object's whole content is parsed with BeautifulSoup, links
    are rewritten by ``find_text`` and every text node is passed through
    ``tm_injection``. Any attribute other than ``read`` is delegated to the
    wrapped file object.
    """

    def __init__(self, f):
        """Wrap *f*, a file-like object whose ``read`` returns the page."""
        self.file = f

    def read(self, size=None):
        """Read the whole underlying file and return the modified document.

        *size* is accepted for file-object compatibility but ignored: the
        underlying file is always drained completely.

        Returns the modified ``BeautifulSoup`` object (not a string), or
        ``None`` when the underlying file yields no data, which is what
        happens on any call after the first one.
        NOTE(review): callers presumably rely on the writer converting the
        returned object with ``str()`` -- confirm before reusing elsewhere.
        """
        # Collect the chunks in a list and join once instead of repeatedly
        # concatenating strings.
        chunks = []
        while True:
            buf = self.file.read(16 * 1024)
            if not buf:
                break
            chunks.append(buf)
        if not chunks:
            return None
        page = ''.join(chunks)

        soup = BeautifulSoup(page)
        soup.convertHTMLEntities = False
        # find_text also rewrites link targets while it walks the tree.
        for tag in find_text(soup):
            m_str = ''.join(tm_injection(tag.string))
            tag.replaceWith(m_str)
        return soup

    def __getattr__(self, attr):
        """Delegate unknown attributes to the wrapped file object."""
        # __getattr__ only runs when normal lookup fails. If "file" itself is
        # missing (instance created without __init__), looking it up again
        # here would recurse without bound, so report it as missing.
        if attr == 'file':
            raise AttributeError(attr)
        return getattr(self.file, attr)
class Proxy(SimpleHTTPServer.SimpleHTTPRequestHandler):
    """Request handler that relays GET requests to TARGET."""

    def do_GET(self):
        """Fetch the requested path from TARGET and copy it to the client.

        HTML responses are passed through ``HabrWrapper`` so that they are
        rewritten on the way; everything else is relayed unchanged.

        NOTE(review): no status line or response headers are sent, only the
        body is written to the client -- confirm this is intended.
        """
        print(self.requestline)
        upstream = urllib.urlopen('http://%s%s' % (TARGET, self.path))
        try:
            # The header may be absent; treat that as "not HTML" instead of
            # failing with KeyError.
            content_type = upstream.headers.get('Content-Type') or ''
            # Compare only the media type: it is case-insensitive and may or
            # may not be followed by ";charset=..." parameters.
            media_type = content_type.split(';', 1)[0].strip().lower()
            body = upstream
            if media_type == 'text/html':
                body = HabrWrapper(upstream)
            self.copyfile(body, self.wfile)
        finally:
            # Always release the upstream connection, even if copying fails.
            upstream.close()
if __name__ == '__main__':
    # Run the proxy on HOST:PORT until interrupted.
    httpd = SocketServer.TCPServer((HOST, PORT), Proxy)
    print('Starting proxy server at %s' % PROXY_URL)
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop the server; exit without a
        # traceback.
        pass
    finally:
        # Release the listening socket.
        httpd.server_close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment