Skip to content

Instantly share code, notes, and snippets.

@AHAPX
Last active October 11, 2017 12:52
Show Gist options
  • Save AHAPX/8058a98872b4b010039ea2f9ec71f7df to your computer and use it in GitHub Desktop.
habrproxy
import re
from http.server import HTTPServer, BaseHTTPRequestHandler
from urllib.parse import urljoin, urlparse
import requests
from lxml.html import tostring, document_fromstring
class MyProxy(BaseHTTPRequestHandler):
    """HTTP handler that proxies GET requests to ``to_host``.

    HTML responses are rewritten before being returned: links that point at
    the upstream host are redirected to this proxy, and every six-letter
    word found in text nodes (outside ``script``/``style``) gets a trailing
    ``™``.  Responses of any other content type are passed through as-is.
    """

    # Upstream site whose pages are fetched and rewritten.
    to_host = 'http://habrahabr.ru'
    # Seconds to wait for the upstream server before giving up.
    upstream_timeout = 30
    # Elements whose text is code/CSS rather than prose; never rewritten.
    excluded_tags = ('script', 'style')
    # Exactly six word characters, none of them digits, between word
    # boundaries.  Compiled once instead of on every text node.
    _six_letter_word = re.compile(r'\b([^\W\d]{6})\b', flags=re.U)

    def text_replace(self, text):
        """Return *text* with ``™`` appended to every six-letter word."""
        return self._six_letter_word.sub(r'\1™', text)

    def _rewrite_href(self, href, proxy_netloc):
        """Return *href* redirected to the proxy if it targets ``to_host``.

        Absolute and protocol-relative URLs whose host equals the host of
        ``to_host`` get their scheme set to ``http`` and their host set to
        *proxy_netloc*.  Relative links (they already resolve against the
        proxy), links to other hosts, and unparsable values are returned
        unchanged.
        """
        try:
            parts = urlparse(href)
        except ValueError:
            # Malformed URL (e.g. unbalanced IPv6 bracket): leave it alone.
            return href
        upstream_netloc = urlparse(self.to_host).netloc
        if parts.netloc.lower() != upstream_netloc.lower():
            return href
        return parts._replace(scheme='http', netloc=proxy_netloc).geturl()

    def do_GET(self):
        """Fetch the requested path from ``to_host`` and relay the response.

        Replies with 502 when the upstream request fails and with 500 when
        an HTML body cannot be parsed; otherwise the upstream status code
        is forwarded together with the (possibly rewritten) body.
        """
        # Plain concatenation instead of urljoin: urljoin would let a
        # request path such as "//other.host/x" replace the upstream host.
        url = self.to_host.rstrip('/') + '/' + self.path.lstrip('/')
        try:
            response = requests.get(url, timeout=self.upstream_timeout)
        except requests.RequestException as exc:
            # Pass the text as `explain` (response body), not as the status
            # line message, which must be latin-1 encodable.
            self.send_error(502, explain=str(exc))
            return

        output = response.content
        content_type = response.headers.get('content-type', '')

        if 'text/html' in content_type.lower():
            try:
                page = document_fromstring(response.text)
            except Exception as exc:
                # Handler boundary: any parser failure becomes an error
                # response instead of killing the request thread.
                self.send_error(500, explain=str(exc))
                return

            proxy_netloc = '{}:{}'.format(*self.server.server_address[:2])
            for element in page.xpath('//*[@href]'):
                element.set(
                    'href',
                    self._rewrite_href(element.get('href'), proxy_netloc),
                )

            for node in page.iter('*'):
                if node.tag in self.excluded_tags:
                    continue
                # './text()' yields node.text plus the tails of node's
                # children; all of it is content of `node`, so only
                # `node` itself is checked against excluded_tags.
                for text in node.xpath('./text()'):
                    if len(text.strip()) < 6:
                        # Too short to contain a six-letter word.
                        continue
                    owner = text.getparent()
                    if text.is_text:
                        owner.text = self.text_replace(text)
                    elif text.is_tail:
                        owner.tail = self.text_replace(text)

            output = tostring(page, encoding='utf-8')
            # The body was re-serialized as UTF-8, so the upstream charset
            # (if any) no longer applies.
            content_type = 'text/html; charset=utf-8'

        self.send_response(response.status_code)
        if content_type:
            self.send_header("Content-Type", content_type)
        self.send_header("Content-Length", str(len(output)))
        self.end_headers()
        self.wfile.write(output)
if __name__ == '__main__':
    # Serve the proxy on localhost:8000 until interrupted with Ctrl+C.
    httpd = HTTPServer(('127.0.0.1', 8000), MyProxy)
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        # Ctrl+C is the normal way to stop the server; exit quietly.
        pass
    finally:
        # Release the listening socket even if serve_forever() fails with
        # something other than KeyboardInterrupt.
        httpd.server_close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment