Skip to content

Instantly share code, notes, and snippets.

@muravjov
Created October 17, 2016 12:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save muravjov/192db0cfa53d2d43d1308a2f1be376f6 to your computer and use it in GitHub Desktop.
Save muravjov/192db0cfa53d2d43d1308a2f1be376f6 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# coding: utf-8
"""
Setup:
$ pip install urllib3 lxml git+https://github.com/bpabel/html5charref.git
"""
from __future__ import print_function
import re
# Python 2 has no built-in HTML5 entity decoder (Python 3 has html.unescape),
# so the third-party html5charref package is used instead.
import html5charref
def main():
    """Run a local HTTP proxy for habrahabr.ru that decorates page text.

    Every GET request is forwarded to ``http://habrahabr.ru`` using the
    requested path.  For ``text/html`` responses the document is parsed with
    lxml, ``append_tm`` is applied to the text of every node (the contents of
    ``<script>`` and ``<style>`` are left alone), and absolute links to the
    proxied domain are rewritten to site-relative ones so that navigation
    stays on the proxy.  Any other content type is passed through unchanged.

    The server is started through the stdlib ``BaseHTTPServer.test`` helper.
    """
    import SimpleHTTPServer as shttps
    import urllib3
    # :TODO: http://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
    urllib3.disable_warnings()
    import lxml.html as l_html

    http = urllib3.PoolManager()
    domain = "habrahabr.ru"
    prefixes = ["http://", "https://"]
    # Loop-invariant: absolute URL prefixes that point at the proxied site.
    site_prefixes = [prefix + domain for prefix in prefixes]
    # Elements whose text is code rather than prose; decorating it would
    # corrupt it (e.g. CSS identifiers such as "margin" or "border").
    raw_text_tags = ("script", "style")

    def localize_href(href):
        """Return *href* as a site-relative URL if it targets the proxied site.

        https://habrahabr.ru/company/plarium/blog/312318/ =>
        /company/plarium/blog/312318/

        Links to other hosts (including ones that merely start with the same
        characters, such as "habrahabr.ru.example.com" or an explicit port)
        are returned unchanged.
        """
        for prefix in site_prefixes:
            if not href.startswith(prefix):
                continue
            rest = href[len(prefix):]
            if rest.startswith("/"):
                return rest
            if not rest or rest[0] in "?#":
                # Bare site root (optionally with query/fragment): an empty or
                # "?..."/"#..." href would resolve against the current page,
                # so anchor it at "/".
                return "/" + rest
            # Anything else means the host name continues past the domain.
        return href

    class Handler(shttps.SimpleHTTPRequestHandler):
        """Request handler that proxies GET requests to the remote site."""

        def do_GET(self):
            """Fetch the requested path upstream, decorate HTML, and reply."""
            url = "http://" + domain + self.path
            resp = http.urlopen("GET", url, preload_content=False)
            status = resp.status
            ctype = resp.getheader("Content-Type", "text/plain")
            body = resp.read()

            # Compare only the media type, ignoring parameters, case and
            # surrounding whitespace ("Text/HTML ; charset=utf-8").
            media_type = ctype.split(";")[0].strip().lower()
            # lxml refuses to parse an empty document, so pass such bodies
            # through untouched.
            if media_type == "text/html" and body.strip():
                tree = l_html.fromstring(body, parser=l_html.html_parser)
                for elem in tree.iter():
                    if elem.tag not in raw_text_tags:
                        elem.text = append_tm(elem.text)
                    # The tail is the text that follows the element's closing
                    # tag, i.e. ordinary page text even after script/style.
                    elem.tail = append_tm(elem.tail)

                    href = elem.attrib.get("href")
                    if href:
                        local_href = localize_href(href)
                        if local_href != href:
                            elem.attrib["href"] = local_href

                doctype = tree.getroottree().docinfo.doctype
                body = l_html.tostring(tree, encoding="utf-8", doctype=doctype)

            # output
            self.send_response(status)
            self.send_header("Content-type", ctype)
            # Length of the (possibly rewritten) body, not the upstream one.
            self.send_header("Content-Length", str(len(body)))
            self.end_headers()
            self.wfile.write(body)

    shttps.BaseHTTPServer.test(Handler, shttps.BaseHTTPServer.HTTPServer)
def on_word(match):
    """Regex substitution callback that marks six-letter words.

    The matched token is returned with a trademark sign appended when its
    length, not counting "-" characters, is exactly 6; any other token is
    returned unchanged.
    """
    word = match.group()
    letter_count = len(word) - word.count("-")
    return word + u"™" if letter_count == 6 else word
def append_tm(txt):
    """Return *txt* with every word run through ``on_word``.

    HTML character references in *txt* are first decoded with
    ``html5charref.unescape``; each run of word characters and hyphens is then
    replaced by whatever ``on_word`` returns for it.

    Falsy input (``None`` or an empty string) is returned unchanged, so the
    function can be applied directly to values that may be missing.
    """
    if not txt:  # None or "" -- nothing to process
        return txt
    txt = html5charref.unescape(txt)
    # Including "-" in the character class is a Habr-specific tweak.
    # A plain raw string is used instead of the ``ur""`` prefix: the pattern
    # is pure ASCII, so it is the same regex, and ``ur""`` is a syntax error
    # on Python 3.
    return re.sub(r"\b[\w-]+\b", on_word, txt, flags=re.U)
# Start the proxy only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment