Skip to content

Instantly share code, notes, and snippets.

@GeyseR
Created June 24, 2015 09:42
Show Gist options
  • Save GeyseR/417513a184a7466ce28f to your computer and use it in GitHub Desktop.
Save GeyseR/417513a184a7466ce28f to your computer and use it in GitHub Desktop.
#coding=utf-8
from __future__ import unicode_literals
from bs4 import BeautifulSoup
import urlparse
import re
import requests
import socket
import time
# Pause, in seconds, that main() sleeps before each accept() call.
DELAY = 0.001
def _handle_connection(conn):
    """Serve a single accepted client connection; the caller closes it.

    Reads the start of the request, and for a GET fetches the requested path
    from http://habrahabr.ru. A 200 response is passed through
    process_links() and process_words() and sent back UTF-8 encoded. For a
    non-GET request or a non-200 upstream response a message is printed and
    nothing is sent to the client.

    Raises:
        ValueError: if the request line does not split into exactly three
            space-separated parts.
    """
    # Only the first 1024 bytes are read; just the request line is used.
    request = conn.recv(1024)
    request_lines = request.splitlines()
    if not request_lines:
        # The client connected and closed (or sent nothing); indexing the
        # first line would raise IndexError.
        return

    method, path, _protocol = request_lines[0].split(' ')
    if method != 'GET':
        print('Processing only GET requests')
        return

    # NOTE(review): urljoin keeps an absolute or scheme-relative request
    # target as-is, so such a request is fetched from that host rather than
    # from habrahabr.ru. Confirm this is acceptable for a local-only proxy.
    response = requests.get(urlparse.urljoin('http://habrahabr.ru', path))
    if response.status_code != 200:
        print('Wrong habrahabr response code: %s' % response.status_code)
        return

    content = process_links(response.content)
    content = process_words(content)
    conn.sendall(content.encode('utf-8'))


def main():
    """Run a blocking, one-connection-at-a-time proxy for habrahabr.ru.

    Binds to 127.0.0.1:8888 and handles accepted connections sequentially,
    sleeping DELAY seconds before each accept().

    An exception raised while handling one connection is printed and the
    proxy keeps serving; the client connection is closed in every case. An
    exception from the listening socket itself (bind/listen/accept) is
    printed and ends the loop. The listening socket is always closed on exit.
    """
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        sock.bind(('127.0.0.1', 8888))
        sock.listen(1)
        while True:
            time.sleep(DELAY)
            conn, _peer = sock.accept()
            try:
                _handle_connection(conn)
            except Exception as ex:
                # One bad request must not take the whole proxy down.
                print('Exception in proxy: %s' % ex)
            finally:
                conn.close()
    except Exception as ex:
        print('Exception in proxy: %s' % ex)
    finally:
        sock.close()
def process_links(html):
    """Point root-relative resource URLs at http://habrahabr.ru.

    Parses ``html`` and, for every ``img``/``script`` ``src`` and every
    ``link`` ``href`` whose value starts with a single ``/``, replaces the
    value with the absolute URL on http://habrahabr.ru. Values starting with
    ``//`` and all other values are left alone.

    Returns the re-serialized document as a unicode string.
    """
    document = BeautifulSoup(html)
    url_attr_by_tag = (('img', 'src'), ('link', 'href'), ('script', 'src'))
    for tag_name, attr_name in url_attr_by_tag:
        for element in document.find_all(tag_name):
            url = element.get(attr_name, '')
            if not url.startswith('/'):
                continue
            if url.startswith('//'):
                # Scheme-relative URL: already names its own host.
                continue
            element[attr_name] = urlparse.urljoin('http://habrahabr.ru', url)
    return unicode(document)
def process_words(html):
    """Append a trademark sign to every six-character word in page text.

    Parses ``html`` and rewrites each non-blank text node, replacing every
    run of exactly six word characters (``\\w`` with ``re.UNICODE``) by the
    same run followed by the trademark sign.

    Text nodes are left untouched when they are HTML comments or when their
    direct parent is ``script``, ``style``, ``body``, ``code``, ``head``,
    ``title`` or the document root. ``style`` and comments are skipped so
    that CSS keywords are not corrupted and comments are not turned into
    visible text.

    Returns the re-serialized document as a unicode string.
    """
    # Function-scope import keeps this block self-contained; bs4 is already
    # a dependency of this module.
    from bs4 import Comment

    skipped_parents = (
        'script', 'style', 'body', 'code', 'head', 'title', '[document]',
    )
    # Plain r'' literals are unicode here because the module enables
    # unicode_literals.
    word_re = re.compile(r'(\W|^)(\w{6})(?=\W|$)', re.UNICODE)

    soup = BeautifulSoup(html)
    # find_all() returns a list, so replacing nodes while looping is safe.
    for node in soup.find_all(text=True):
        if isinstance(node, Comment):
            continue
        parent = node.parent
        if parent is None or parent.name in skipped_parents:
            continue
        text = unicode(node)
        if not text.strip():
            continue
        # subn reports whether anything matched, avoiding a second scan.
        marked, replaced = word_re.subn(r'\1\2™', text)
        if replaced:
            node.replace_with(marked)
    return unicode(soup)
# Start the proxy only when this file is run as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment