Skip to content

Instantly share code, notes, and snippets.

@ba1dr
Last active September 26, 2015 08:00
Show Gist options
  • Save ba1dr/158c4699e45eafa9d598 to your computer and use it in GitHub Desktop.
Save ba1dr/158c4699e45eafa9d598 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
__author__ = 'Alexey Kolyanov'
__email__ = 'alexey.kolyanov@gmail.com'
"""
Third-party requirements:
* lxml (+ beautifulsoup)
* requests
Limitations:
* HTTP headers are not passed through
* Error handling is minimal
* Hardcoded encoding: utf-8
"""
import re
import requests
import BaseHTTPServer
from lxml.html import soupparser
# Lower-case tag names whose content must be left untouched by the rewrite.
SKIPTAGS = ('script', 'style')

# Matches a stand-alone run of exactly six word characters (captured as
# group 1), case-insensitively and with Unicode-aware character classes.
word_re = re.compile(r'\b([\w]{6})\b', re.IGNORECASE | re.UNICODE)
def _mark_words(text):
    """Return *text* with a trademark sign appended to each six-letter word."""
    return word_re.sub(u"\\1™", text)


def _ul_texts(el):
    """Default ``ul`` handler: report the list's texts and tail unchanged."""
    return list(el.itertext()), el.tail


# NOTE(review): because of this default, text inside <ul> elements (and the
# tail following them) is never rewritten. Kept for backward compatibility;
# confirm whether that is intended.
_DEFAULT_HANDLERS = {'ul': _ul_texts}


def itertext(root, handlers=None):
    """Rewrite the text of *root*'s subtree in place, yielding as it goes.

    The tree is walked depth-first. Every ``text`` and ``tail`` string that
    is visited gets a trademark sign appended to each six-letter word (see
    ``word_re``). The rewrite happens lazily, so the caller has to exhaust
    the generator for the whole tree to be processed.

    Args:
        root: element whose subtree is rewritten in place.
        handlers: optional mapping ``tag name -> callable(element)``; a child
            whose tag is in the mapping is handed to the callable (which must
            return an iterable) instead of being rewritten. ``None`` selects
            the default mapping, which only contains a ``ul`` handler.

    Yields:
        The rewritten strings, the skipped elements themselves, and whatever
        the handlers return.
    """
    if handlers is None:
        handlers = _DEFAULT_HANDLERS

    if root.text:  # replace in texts
        root.text = _mark_words(root.text)
        yield root.text

    for el in root:
        # Comments, processing instructions and entities carry a factory
        # function as their tag rather than a string, so they have to be
        # detected before any string method is called on the tag.
        if callable(el.tag) or el.tag.lower() in SKIPTAGS:
            yield el  # do NOT replace inside forbidden tags or comments
            # The tail is ordinary page text that merely follows the
            # skipped node, so it still has to be rewritten.
            if el.tail:
                el.tail = _mark_words(el.tail)
                yield el.tail
            continue

        handler = handlers.get(el.tag)
        if handler is not None:
            pieces = handler(el)
        else:
            # Pass the mapping down so custom handlers apply at every depth.
            pieces = itertext(el, handlers)
        for piece in pieces:
            yield piece

    if root.tail:  # replace in tails
        root.tail = _mark_words(root.tail)
        yield root.tail
class Proxy(BaseHTTPServer.BaseHTTPRequestHandler):
    """Request handler that mirrors one upstream site and rewrites its HTML.

    Every GET path is fetched from ``upstream``. HTML responses are parsed,
    run through ``itertext`` (which edits the tree in place) and re-serialised
    as UTF-8; any other content is relayed unchanged.
    """

    # Base URL that the request path is appended to.
    upstream = "http://habrahabr.ru"
    # Seconds to wait for the upstream before giving up on the request.
    upstream_timeout = 30

    def do_GET(self):
        """Fetch ``self.path`` from the upstream site and relay the result.

        Upstream HTTP errors are logged to the console and relayed with their
        original status code. A failure to reach the upstream at all is
        answered with ``502 Bad Gateway``.
        """
        url = "%s%s" % (self.upstream, self.path)
        try:
            resp = requests.get(url, timeout=self.upstream_timeout)
        except requests.RequestException as exc:
            print("Upstream request failed: %s" % exc)
            self.send_error(502, "Upstream request failed")
            return

        if not resp.ok:
            # just output to console, but continue processing
            print("HTTP Error %s: %s" % (resp.status_code, resp.reason))

        # The header may be missing altogether; treat that as "not HTML".
        contenttype = resp.headers.get('content-type') or ''
        # An empty body is relayed as-is: the HTML parser rejects empty
        # documents.
        is_html = 'text/html' in contenttype.lower() and resp.content.strip()

        if is_html:
            doc = soupparser.html.fromstring(resp.content)
            # itertext() is a generator that edits the tree as it is
            # consumed, so it only needs to be exhausted.
            for _ in itertext(doc):
                pass
            # NOTE(review): '%SOUP-ENCODING%' looks like a placeholder left
            # behind by a BeautifulSoup-based parse - confirm it can still
            # occur with the parser used above.
            hdata = soupparser.etree.tostring(
                doc, method='html', encoding=unicode
            ).replace('%SOUP-ENCODING%', 'utf-8')
            body = hdata.encode('utf-8')
            # The body is always re-encoded as UTF-8, so say so explicitly.
            contenttype = "text/html; charset=utf-8"
        else:
            body = resp.content

        self.send_response(resp.status_code)
        if contenttype:
            self.send_header("Content-type", contenttype)
        self.end_headers()
        self.wfile.write(body)
def run(server_class=BaseHTTPServer.HTTPServer,
        handler_class=BaseHTTPServer.BaseHTTPRequestHandler,
        port=8000):
    """Serve HTTP requests on all interfaces until the server stops.

    Args:
        server_class: callable taking ``(address, handler_class)`` and
            returning an object with ``serve_forever`` and ``server_close``.
        handler_class: request handler passed on to *server_class*.
        port: TCP port to listen on.

    Any exception raised by ``serve_forever`` (Ctrl-C included) propagates to
    the caller after the listening socket has been closed.
    """
    server_address = ('', port)
    httpd = server_class(server_address, handler_class)
    try:
        httpd.serve_forever()
    finally:
        # Release the listening socket no matter how serving ended.
        httpd.server_close()
def main():
    """Script entry point: start the server with ``Proxy`` as the handler."""
    run(handler_class=Proxy)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment