Created June 24, 2015 at 09:42.
Save gist GeyseR/417513a184a7466ce28f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding=utf-8 | |
from __future__ import unicode_literals | |
from bs4 import BeautifulSoup | |
import urlparse | |
import re | |
import requests | |
import socket | |
import time | |
DELAY = 0.001 | |
def main(): | |
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) | |
try: | |
sock.bind(('127.0.0.1', 8888)) | |
sock.listen(1) | |
while True: | |
time.sleep(DELAY) | |
conn, address = sock.accept() | |
headers = conn.recv(1024) | |
method, address, protocol = headers.splitlines()[0].split(' ') | |
if method == 'GET': | |
response = requests.get(urlparse.urljoin('http://habrahabr.ru', address)) | |
content = response.content | |
if response.status_code == 200: | |
content = process_links(content) | |
content = process_words(content) | |
conn.sendall(content.encode('utf-8')) | |
else: | |
print 'Wrong habrahabr response code: %s' % response.status_code | |
else: | |
print 'Processing only GET requests' | |
conn.close() | |
except Exception, ex: | |
print 'Exception in proxy: %s' % ex | |
finally: | |
sock.close() | |
def process_links(html):
    """Make site-relative resource URLs absolute against habrahabr.ru.

    Touches only <img src>, <link href> and <script src> attributes, and
    only when the value starts with a single '/'; protocol-relative '//'
    URLs and absolute URLs are left untouched.  Returns the rewritten
    document as a unicode string.
    """
    soup = BeautifulSoup(html)
    targets = (('img', 'src'), ('link', 'href'), ('script', 'src'))
    for tag_name, attr in targets:
        for node in soup.find_all(tag_name):
            href = node.get(attr, '')
            site_relative = href.startswith('/') and not href.startswith('//')
            if site_relative:
                node[attr] = urlparse.urljoin('http://habrahabr.ru', href)
    return unicode(soup)
def process_words(html): | |
soup = BeautifulSoup(html) | |
word_re = re.compile(ur'(\W|^)(\w{6})(?=\W|$)', re.UNICODE) | |
for tag in soup.find_all(text=True): | |
container = tag.parent | |
if container and tag.string and tag.string.strip(): | |
if container.name not in ['script', 'body', 'code', 'head', 'title', '[document]']: | |
if word_re.findall(unicode(tag.string)): | |
tag.string.replace_with(word_re.sub(ur'\1\2™', unicode(tag.string))) | |
return unicode(soup) | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment