# kafeg/kfproxy.py

## kfproxy.py
# demo: http://forsk.ru:8232/company/yandex/blog/258673/
# pip install html5lib beautifulsoup4
# -*- coding: utf-8 -*-
import BaseHTTPServer
import requests
import string
from bs4 import BeautifulSoup

def main():
    """Serve the proxy on all interfaces, port 8232, until the loop exits.

    Blocks inside ``serve_forever()``.  Whatever ends the loop (including
    an exception such as ``KeyboardInterrupt``, which still propagates to
    the caller), the listening socket is closed before returning.
    """
    httpd = BaseHTTPServer.HTTPServer(('0.0.0.0', 8232), TMHandler)
    try:
        httpd.serve_forever()
    finally:
        # Release the bound port even when the loop is interrupted.
        httpd.server_close()


class TMHandler(BaseHTTPServer.BaseHTTPRequestHandler):
    """Forward GET requests to ``site`` and decorate HTML responses.

    The request path is appended to ``http://<site>`` and fetched with
    ``requests``.  Absolute links back to the upstream host are made
    relative so that navigation stays on the proxy, and ``text/html``
    bodies are passed through ``parsehtml`` before being returned.
    """

    # Upstream host that every request path is forwarded to.
    site = 'habrahabr.ru'
    # Seconds to wait for the upstream before answering 502.
    upstream_timeout = 30

    def do_GET(self):
        """Fetch the requested path upstream and relay the response.

        Replies 502 when the upstream request fails; otherwise relays the
        upstream status code, content type and (possibly rewritten) body.
        """
        url = 'http://' + self.site + self.path
        try:
            req = requests.get(url, timeout=self.upstream_timeout)
        except requests.RequestException:
            self.send_error(502, 'Upstream request failed')
            return

        # Strip the upstream origin so absolute links resolve to the proxy.
        content = req.content.replace('http://' + self.site, '')
        # Tolerate responses without a Content-Type header.
        content_type = req.headers.get('content-type', '')
        if content_type.startswith('text/html'):
            content = parsehtml(content)
            # parsehtml always returns UTF-8 bytes, whatever charset the
            # upstream header announced.
            content_type = 'text/html; charset=utf-8'

        # Relay the real upstream status instead of a blanket 200.
        self.send_response(req.status_code)
        if content_type:
            self.send_header('Content-type', content_type)
        self.send_header('Content-Length', str(len(content)))
        self.end_headers()
        self.wfile.write(content)


def parsehtml(html):
    """Return *html* re-serialised with its text nodes run through ``transform``.

    The markup is parsed with the ``html5lib`` tree builder.  Text whose
    parent is one of the ``skip`` tags is left alone, as are comments,
    CDATA sections, doctypes and other preformatted markup strings.  Every
    remaining non-blank text node is replaced by ``transform`` of its
    stripped content.

    Returns the prettified document encoded as UTF-8 bytes.
    """
    # Local import: the module header only brings in BeautifulSoup itself.
    from bs4.element import PreformattedString

    skip = ('style', 'script', '[document]', 'head', 'title')
    doc = BeautifulSoup(html, 'html5lib')
    for node in doc.find_all(text=True):
        # Comments and similar nodes are strings too; replacing one with a
        # plain string would turn hidden markup into visible page text.
        if isinstance(node, PreformattedString):
            continue
        if node.parent.name in skip:
            continue
        s = node.strip()
        if s:
            node.replace_with(transform(s))
    return doc.prettify().encode('utf-8')


def isTMWord(s):
    """Tell whether token *s* qualifies for a trademark sign.

    A token qualifies when it is exactly six characters long and does not
    end with a comma or a full stop.
    """
    trailing_punctuation = (',', '.')
    # A token ending in punctuation never qualifies, whatever its length.
    if s.endswith(trailing_punctuation):
        return False
    return len(s) == 6


def transform(s):
    """Return *s* with a trademark sign appended to each qualifying word.

    *s* is split on whitespace, every token accepted by ``isTMWord`` gets
    U+2122 appended, and the tokens are re-joined with single spaces.
    """
    # Written as an escape so the literal does not depend on the
    # source-encoding declaration.
    trademark = u'\u2122'
    return u' '.join(
        word + trademark if isTMWord(word) else word for word in s.split()
    )


# Start the proxy only when this file is executed as a script.
if __name__ == "__main__":
    main()
	# demo: http://forsk.ru:8232/company/yandex/blog/258673/
	# pip install html5lib beautifulsoup4
	# -*- coding: utf-8 -*-
	import BaseHTTPServer
	import requests
	import string
	from bs4 import BeautifulSoup

	def main():
		"""Serve the proxy on all interfaces, port 8232, until the loop exits.

		Blocks inside ``serve_forever()``; the listening socket is closed
		on the way out, and any exception still propagates to the caller.
		"""
		httpd = BaseHTTPServer.HTTPServer(('0.0.0.0', 8232), TMHandler)
		try:
			httpd.serve_forever()
		finally:
			# Release the bound port even when the loop is interrupted.
			httpd.server_close()


	class TMHandler(BaseHTTPServer.BaseHTTPRequestHandler):
		"""Forward GET requests to ``site`` and decorate HTML responses.

		The request path is appended to ``http://<site>`` and fetched with
		``requests``.  Absolute links back to the upstream host are made
		relative, and ``text/html`` bodies are passed through ``parsehtml``.
		"""

		# Upstream host that every request path is forwarded to.
		site = 'habrahabr.ru'
		# Seconds to wait for the upstream before answering 502.
		upstream_timeout = 30

		def do_GET(self):
			"""Fetch the requested path upstream and relay the response.

			Replies 502 when the upstream request fails; otherwise relays
			the upstream status code, content type and body.
			"""
			url = 'http://' + self.site + self.path
			try:
				req = requests.get(url, timeout=self.upstream_timeout)
			except requests.RequestException:
				self.send_error(502, 'Upstream request failed')
				return

			# Strip the upstream origin so absolute links resolve to the proxy.
			content = req.content.replace('http://' + self.site, '')
			# Tolerate responses without a Content-Type header.
			content_type = req.headers.get('content-type', '')
			if content_type.startswith('text/html'):
				content = parsehtml(content)
				# parsehtml always returns UTF-8 bytes.
				content_type = 'text/html; charset=utf-8'

			# Relay the real upstream status instead of a blanket 200.
			self.send_response(req.status_code)
			if content_type:
				self.send_header('Content-type', content_type)
			self.send_header('Content-Length', str(len(content)))
			self.end_headers()
			self.wfile.write(content)


	def parsehtml(html):
		"""Return *html* re-serialised with its text nodes run through ``transform``.

		Text whose parent is one of the ``skip`` tags is left alone, as are
		comments, CDATA sections, doctypes and other preformatted markup
		strings.  Returns the prettified document encoded as UTF-8 bytes.
		"""
		# Local import: the module header only brings in BeautifulSoup itself.
		from bs4.element import PreformattedString

		skip = ('style', 'script', '[document]', 'head', 'title')
		doc = BeautifulSoup(html, 'html5lib')
		for node in doc.find_all(text=True):
			# Replacing a comment with a plain string would make it visible.
			if isinstance(node, PreformattedString):
				continue
			if node.parent.name in skip:
				continue
			s = node.strip()
			if s:
				node.replace_with(transform(s))
		return doc.prettify().encode('utf-8')


	def isTMWord(s):
		"""Tell whether token *s* qualifies for a trademark sign.

		A token qualifies when it is exactly six characters long and does
		not end with a comma or a full stop.
		"""
		trailing_punctuation = (',', '.')
		# A token ending in punctuation never qualifies, whatever its length.
		if s.endswith(trailing_punctuation):
			return False
		return len(s) == 6


	def transform(s):
		"""Return *s* with a trademark sign appended to each qualifying word.

		*s* is split on whitespace, every token accepted by ``isTMWord``
		gets U+2122 appended, and the tokens are re-joined with single spaces.
		"""
		# Written as an escape so the literal does not depend on the
		# source-encoding declaration.
		trademark = u'\u2122'
		return u' '.join(
			word + trademark if isTMWord(word) else word for word in s.split()
		)


	# Start the proxy only when this file is executed as a script.
	if __name__ == '__main__':
		main()