Created June 24, 2015 at 09:42.
Save gist GeyseR/417513a184a7466ce28f to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#coding=utf-8 | |
from __future__ import unicode_literals | |
from bs4 import BeautifulSoup | |
import urlparse | |
import re | |
import requests | |
import socket | |
import time | |
DELAY = 0.001 | |
def main(): | |
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) | |
try: | |
sock.bind(('127.0.0.1', 8888)) | |
sock.listen(1) | |
while True: | |
time.sleep(DELAY) | |
conn, address = sock.accept() | |
headers = conn.recv(1024) | |
method, address, protocol = headers.splitlines()[0].split(' ') | |
if method == 'GET': | |
response = requests.get(urlparse.urljoin('http://habrahabr.ru', address)) | |
content = response.content | |
if response.status_code == 200: | |
content = process_links(content) | |
content = process_words(content) | |
conn.sendall(content.encode('utf-8')) | |
else: | |
print 'Wrong habrahabr response code: %s' % response.status_code | |
else: | |
print 'Processing only GET requests' | |
conn.close() | |
except Exception, ex: | |
print 'Exception in proxy: %s' % ex | |
finally: | |
sock.close() | |
def process_links(html):
    """Make site-relative resource URLs absolute against habrahabr.ru.

    Touches only <img src>, <link href> and <script src> attributes, and
    only when the value starts with a single '/'; protocol-relative '//'
    URLs and absolute URLs are left untouched.  Returns the rewritten
    document as a unicode string.
    """
    soup = BeautifulSoup(html)
    targets = (('img', 'src'), ('link', 'href'), ('script', 'src'))
    for tag_name, attr in targets:
        for node in soup.find_all(tag_name):
            href = node.get(attr, '')
            site_relative = href.startswith('/') and not href.startswith('//')
            if site_relative:
                node[attr] = urlparse.urljoin('http://habrahabr.ru', href)
    return unicode(soup)
def process_words(html): | |
soup = BeautifulSoup(html) | |
word_re = re.compile(ur'(\W|^)(\w{6})(?=\W|$)', re.UNICODE) | |
for tag in soup.find_all(text=True): | |
container = tag.parent | |
if container and tag.string and tag.string.strip(): | |
if container.name not in ['script', 'body', 'code', 'head', 'title', '[document]']: | |
if word_re.findall(unicode(tag.string)): | |
tag.string.replace_with(word_re.sub(ur'\1\2™', unicode(tag.string))) | |
return unicode(soup) | |
if __name__ == '__main__': | |
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment