Skip to content

Instantly share code, notes, and snippets.

@radxene
Created February 7, 2019 08:35
Show Gist options
  • Save radxene/bcdca69952e57b8f035fdcba79b552c4 to your computer and use it in GitHub Desktop.
Save radxene/bcdca69952e57b8f035fdcba79b552c4 to your computer and use it in GitHub Desktop.
Habraproxy - local HTTP proxy server (Хабрапрокси)
from html import escape
from http.server import BaseHTTPRequestHandler, HTTPServer
from urllib.parse import urlparse, urljoin

import lxml.etree as etree
import requests
class ManipStr(object):
    """Stateless string helpers used while rewriting proxied pages."""

    # Hosts whose absolute links are rewritten to host-relative ones.
    HABR_HOSTS = ('habrahabr.ru', 'habr.com')
    # Punctuation that may directly follow a word and must stay after the mark.
    TRAILING_SYMBOLS = ('.', ',', ':', ';', '!', '?')
    # Character appended to every six-character word.
    MARK = '™'

    @staticmethod
    def rm_host_habra(url):
        """Turn an absolute Habr URL into a host-relative one.

        URLs whose host is not one of ``HABR_HOSTS`` are returned unchanged.
        For Habr URLs the scheme and host are removed while the path,
        path parameters, query string and fragment are all kept, so links
        such as ``https://habr.com/ru/search/?q=x#top`` keep working.
        A Habr URL without any path maps to ``/``.
        """
        pr = urlparse(url)
        if pr.netloc not in ManipStr.HABR_HOSTS:
            return url

        # An empty path would produce an empty href; point at the root instead.
        relative = pr.path or '/'
        if pr.params:
            relative += ';' + pr.params
        if pr.query:
            relative += '?' + pr.query
        if pr.fragment:
            relative += '#' + pr.fragment
        return relative

    @staticmethod
    def mark_tm(data):
        """Append the trademark sign to every six-character word in *data*.

        The text is split on single spaces and each piece is stripped of
        surrounding whitespace. A piece made of exactly six alphanumeric
        characters gets the mark appended; a piece of six alphanumeric
        characters followed by one punctuation symbol gets the mark inserted
        before that symbol. The pieces are joined back with single spaces.
        """
        symbols = ManipStr.TRAILING_SYMBOLS
        mark = ManipStr.MARK

        marked = []
        for word in data.split(' '):
            word = word.strip()
            if len(word) == 6 and word.isalnum():
                word += mark
            elif (len(word) == 7 and word[:-1].isalnum()
                    and word[-1] in symbols):
                word = word[:-1] + mark + word[-1]
            marked.append(word)
        return ' '.join(marked)
class CollectorTarget(object):
    """Parser target that re-serialises a parsed page into an HTML string.

    An instance is handed to ``etree.HTMLParser(target=...)``; the parser
    then calls ``start``/``end``/``data``/``comment`` while reading the
    document and ``close`` to obtain the result. While serialising, links
    to Habr hosts in ``<a href>`` are made host-relative and text content
    is passed through ``ManipStr.mark_tm``.
    """

    # HTML void elements: they have no content and no closing tag.
    VOID_TAGS = frozenset([
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ])
    # Elements whose text content is left out of the output.
    # NOTE(review): inline script/style bodies are dropped entirely, not
    # copied verbatim - confirm this is intended.
    SKIPPED_TEXT_TAGS = frozenset(['style', 'script'])

    def __init__(self):
        # Output fragments; joined once on demand instead of growing a
        # string with repeated concatenation.
        self._chunks = ['<!DOCTYPE html>']
        # Stack of currently open tags, innermost last.
        self._open_tags = []
        # Name of the innermost open element ('' outside any element).
        self.cur_elem = ''

    @property
    def parsed(self):
        """The HTML collected so far, as a single string."""
        return ''.join(self._chunks)

    def start(self, tag, attrib):
        """Emit the opening tag for *tag* with its attributes *attrib*."""
        self._open_tags.append(tag)
        self.cur_elem = tag

        pieces = ['<' + tag]
        if attrib:
            for key, value in attrib.items():
                if tag == 'a' and key == 'href':
                    value = ManipStr.rm_host_habra(value)
                # Escape so that quotes/ampersands inside a value cannot
                # terminate the attribute or the tag.
                pieces.append(' {}="{}"'.format(key, escape(value, quote=True)))
        # Void elements are closed right here; end() emits nothing for them.
        pieces.append('/>' if tag in self.VOID_TAGS else '>')
        self._chunks.append(''.join(pieces))

    def end(self, tag):
        """Emit the closing tag for *tag* (nothing for void elements)."""
        if self._open_tags:
            self._open_tags.pop()
        # Restore the parent as current element so text that follows a
        # closed <script>/<style> is no longer mistaken for its content.
        self.cur_elem = self._open_tags[-1] if self._open_tags else ''
        if tag not in self.VOID_TAGS:
            self._chunks.append('</{}>'.format(tag))

    def data(self, data):
        """Emit text content, marked by ``ManipStr.mark_tm`` and escaped."""
        if self.cur_elem in self.SKIPPED_TEXT_TAGS:
            return
        # Mark first, then escape, so entity names are never seen as words.
        # Assumes the parser delivers text with entities already decoded.
        self._chunks.append(escape(ManipStr.mark_tm(data), quote=False))

    def comment(self, comment):
        """Emit an HTML comment with the given text."""
        self._chunks.append('<!--{}-->'.format(comment))

    def close(self):
        """Return the complete serialised document."""
        return self.parsed
class ProxyHandler(BaseHTTPRequestHandler):
    """HTTP request handler that serves Habr pages through the local proxy.

    GET requests are mapped to an upstream Habr URL and fetched with
    ``requests``. HTML responses are re-serialised through
    ``CollectorTarget``; anything else is passed through unchanged.
    """

    OLD_HOST = 'http://habrahabr.ru'
    NEW_HOST = 'https://habr.com'
    # Seconds to wait for the upstream server before giving up, so a stalled
    # upstream cannot block the (single-threaded) server forever.
    UPSTREAM_TIMEOUT = 10

    def _set_headers(self, status=200, content_type='text/html; charset=utf-8'):
        """Send the status line and the Content-type header.

        The default content type declares UTF-8 because rewritten pages are
        always encoded as UTF-8 before being written.
        """
        self.send_response(status)
        self.send_header('Content-type', content_type)
        self.end_headers()

    def _upstream_url(self):
        """Map the requested path to the upstream URL to fetch."""
        if self.path == '/':
            return urljoin(self.NEW_HOST, '/ru/top')
        if self.path.startswith('/ru/'):
            return urljoin(self.NEW_HOST, self.path)
        return urljoin(self.OLD_HOST, self.path)

    @staticmethod
    def _build_body(res):
        """Return ``(content_type, body_bytes)`` for a successful response.

        Responses that declare a non-HTML content type are passed through
        byte-for-byte; everything else is parsed and rewritten as HTML.
        """
        upstream_type = res.headers.get('Content-Type', '')
        if upstream_type and 'html' not in upstream_type.lower():
            return upstream_type, res.content

        parser = etree.HTMLParser(target=CollectorTarget())
        # With a parser target, etree.HTML returns the target's close()
        # value, i.e. the serialised string.
        page = etree.HTML(res.text, parser)
        return 'text/html; charset=utf-8', page.encode('utf-8')

    def do_GET(self):
        """Fetch the upstream resource and relay it to the client.

        Upstream network failures are answered with 502; a non-200 upstream
        status is relayed as an error response with that status.
        """
        try:
            res = requests.get(self._upstream_url(),
                               timeout=self.UPSTREAM_TIMEOUT)
        except requests.RequestException:
            res = None

        payload = None
        if res is not None and res.status_code == requests.codes.ok:
            payload = self._build_body(res)

        try:
            if res is None:
                self.send_error(502, 'Upstream request failed')
            elif payload is None:
                self.send_error(res.status_code)
            else:
                content_type, body = payload
                self._set_headers(content_type=content_type)
                self.wfile.write(body)
        except BrokenPipeError:
            # The client went away before the response was written;
            # there is nobody left to report to.
            pass
if __name__ == '__main__':
    # Script entry point: serve the proxy on localhost until interrupted.
    from sys import argv

    # The only recognised command-line argument is an optional port number.
    listen_port = int(argv[1]) if len(argv) == 2 else 8232
    listen_host = 'localhost'

    httpd = HTTPServer((listen_host, listen_port), ProxyHandler)
    print('Server Starts - http://{}:{}'.format(listen_host, listen_port))
    try:
        httpd.serve_forever()
    except KeyboardInterrupt:
        # Ctrl+C is the expected way to stop; continue to the cleanup below.
        pass
    httpd.server_close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment