Last active
October 11, 2017 12:52
-
-
Save AHAPX/8058a98872b4b010039ea2f9ec71f7df to your computer and use it in GitHub Desktop.
habrproxy
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re
from http.server import HTTPServer, BaseHTTPRequestHandler
from urllib.parse import urljoin, urlparse

import requests
from lxml.html import tostring, document_fromstring
class MyProxy(BaseHTTPRequestHandler):
    """Proxy GET requests to ``to_host`` and decorate the returned HTML.

    For ``text/html`` responses the handler:

    * rewrites ``href`` attributes that point at ``to_host`` so that they
      point back at this proxy, and
    * appends a trademark sign to every standalone six-letter word found
      in text outside ``<script>`` / ``<style>`` elements.

    Every other content type is passed through byte-for-byte.
    """

    # Upstream site that every incoming path is resolved against.
    to_host = 'http://habrahabr.ru'

    # Seconds to wait for the upstream site before answering 502.
    upstream_timeout = 30

    # Elements whose text is code/CSS rather than prose.
    _excluded_tags = ('script', 'style')

    # Exactly six consecutive word characters that are not digits (letters
    # of any script, plus underscore), delimited by word boundaries.
    # Compiled once instead of on every text node.
    _six_letter_word = re.compile(r'\b([^\W\d]{6})\b', flags=re.U)

    def text_replace(self, text):
        """Return *text* with a trademark sign after each six-letter word."""
        return self._six_letter_word.sub(r'\1™', text)

    def _proxied_href(self, href, server_netloc):
        """Return *href* redirected to *server_netloc* if it targets ``to_host``.

        Relative links already resolve against the proxy, and links to other
        hosts must keep working, so both are returned unchanged.

        Raises:
            ValueError: if ``urlparse`` rejects *href* (e.g. bad IPv6 literal).
        """
        parts = urlparse(href)
        upstream = urlparse(self.to_host)
        if parts.netloc.lower() != upstream.netloc.lower():
            return href
        # Force plain http: this handler does no TLS itself, so an https
        # link aimed at the proxy address could not be served.
        return parts._replace(scheme='http', netloc=server_netloc).geturl()

    def _rewrite_links(self, page):
        """Point every upstream ``href`` in *page* back at this proxy."""
        host, port = self.server.server_address[:2]
        server_netloc = '{}:{}'.format(host, port)
        for element in page.xpath('//*[@href]'):
            try:
                rewritten = self._proxied_href(element.get('href'), server_netloc)
            except ValueError:
                # Unparseable URL: leave the link exactly as upstream sent it.
                continue
            element.set('href', rewritten)

    def _mark_words(self, page):
        """Apply :meth:`text_replace` to the prose text nodes of *page*."""
        for node in page.iter('*'):
            if node.tag in self._excluded_tags:
                continue
            # './text()' yields the node's own text plus the tails of its
            # children; each result knows which element stores it.
            for text in node.xpath('./text()'):
                if len(text.strip()) < 6:
                    # Too short to contain a six-letter word.
                    continue
                owner = text.getparent()
                if text.is_text:
                    owner.text = self.text_replace(text)
                elif text.is_tail:
                    # A tail is the text *after* ``owner``, so it is visible
                    # prose even when ``owner`` is a <script> or <style>.
                    owner.tail = self.text_replace(text)

    def do_GET(self):
        """Fetch the requested path from ``to_host`` and relay the response.

        Answers 502 when the upstream request fails and 500 when an HTML
        body cannot be parsed.
        """
        url = urljoin(self.to_host, self.path)
        try:
            response = requests.get(url, timeout=self.upstream_timeout)
        except requests.RequestException as exc:
            self.send_error(502, explain=str(exc))
            return

        content_type = response.headers.get('content-type', '')
        output = response.content
        if 'text/html' in content_type.lower():
            try:
                page = document_fromstring(response.text)
            except Exception as exc:
                # Details go in the body (``explain``), not the status line,
                # which cannot carry arbitrary characters.
                self.send_error(500, explain=str(exc))
                return
            self._rewrite_links(page)
            self._mark_words(page)
            output = tostring(page, encoding='utf-8')
            # The body was re-serialized as UTF-8, so the upstream charset
            # declaration no longer applies.
            content_type = 'text/html; charset=utf-8'

        self.send_response(response.status_code)
        if content_type:
            self.send_header("Content-Type", content_type)
        self.end_headers()
        self.wfile.write(output)
if __name__ == '__main__':
    # Serve the proxy on 127.0.0.1:8000 until interrupted.
    httpd = HTTPServer(('127.0.0.1', 8000), MyProxy)
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        # Ctrl+C is the expected way to stop the server; exit quietly.
        pass
    finally:
        # Release the listening socket on every exit path, not only
        # after a keyboard interrupt.
        httpd.server_close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment