-
-
Save valignatev/2156782b65cb2b164bd4 to your computer and use it in GitHub Desktop.
Append 'tm' (™) to every word that is exactly 6 letters long
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import string | |
import webbrowser | |
import re | |
from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer | |
# pip install requests | |
# pip install beautifulsoup4 | |
# pip install lxml | |
import requests | |
from bs4 import BeautifulSoup, Comment | |
from bs4.dammit import EntitySubstitution | |
from bs4.element import NavigableString | |
# Base URL of the upstream site; request paths are appended to it and it is
# stripped from response bodies so links resolve against the local proxy.
SITE = 'https://habrahabr.ru'
# Local port the proxy server listens on.
PORT = 8232

# Matches a standalone run of exactly six word characters and captures it.
# NOTE: ``\w`` also matches digits and the underscore, not only letters.
# Written as an escaped ``u''`` literal instead of ``ur''`` because the
# ``ur`` prefix is a syntax error on Python 3; the pattern text is identical.
REGEXP = re.compile(u'\\b(\\w{6})\\b', re.UNICODE)

# Tags whose direct text content must not be rewritten.
NOT_ALLOWED_TAGS = [
    'body',
    'head',
    'link',
    'meta',
    'style',
    'title',
    'script',
]
def all_but_comments(text):
    """
    Filter predicate for ``find_all(string=...)``.

    Truthy only for strings that contain at least one six-character word
    and are not HTML comments, so comment contents are never rewritten.
    """
    found = REGEXP.search(text)
    if found is None:
        # No six-character word at all: nothing to transform here.
        return found
    return not isinstance(text, Comment)
def add_trademark(word):
    """
    Return ``word`` with a trademark sign appended when it is six
    characters long, otherwise return it unchanged.

    Leading and trailing punctuation and whitespace are ignored when
    measuring the length, but are kept in the returned value.
    """
    ignorable = string.punctuation + string.whitespace
    core = word.strip(ignorable)
    if len(core) != 6:
        return word
    return word + u'\u2122'
def transform_content(page_tree):
    """
    Add a trademark sign to every six-character word in the page text.

    page_tree is a BeautifulSoup object; it is modified in place.
    Strings whose direct parent tag is listed in NOT_ALLOWED_TAGS and
    HTML comments are left untouched.

    Returns the serialized document encoded as UTF-8, formatted with
    habraformatter.
    """
    for content in page_tree.find_all(string=all_but_comments):
        # Decide first whether this string may be touched at all, so no
        # split/join work is wasted on protected tags.
        if content.parent.name in NOT_ALLOWED_TAGS:
            continue
        pieces = re.split(REGEXP, unicode(content))
        # REGEXP has exactly one capture group, so re.split() alternates
        # separators (even indices) with the captured words (odd indices).
        # Only the captured words are candidates for the trademark sign;
        # feeding separators to add_trademark() would wrongly mark chunks
        # such as u' ab cde ' whose stripped length happens to be six.
        pieces[1::2] = [add_trademark(word) for word in pieces[1::2]]
        content.replace_with(u''.join(pieces))
    return page_tree.encode('utf-8', formatter=habraformatter)
def habraformatter(entity):
    """
    Output formatter passed to BeautifulSoup when serializing the page.

    Only text nodes sitting directly inside a <code> tag get XML entity
    substitution, so that markup shown as a code sample stays visible.
    Everything else is returned unchanged - we don't want to escape all
    the things (plus signs on the front page, for example).
    """
    in_code_tag = (
        isinstance(entity, NavigableString) and entity.parent.name == 'code'
    )
    if not in_code_tag:
        return entity
    return EntitySubstitution.substitute_xml(entity)
class HttpProcessor(BaseHTTPRequestHandler):
    """
    Request handler that proxies GET requests to SITE and rewrites HTML
    responses with transform_content before returning them.
    """

    # Seconds to wait for the upstream site; without a timeout a stalled
    # upstream connection would block the handler indefinitely.
    upstream_timeout = 30

    def do_GET(self):
        """
        Fetch ``SITE + self.path`` and relay status, content type and body.

        Absolute links to SITE are stripped from the body so navigation
        stays on the proxy. HTML bodies are additionally passed through
        transform_content. If the upstream request fails, a 502 error is
        sent instead of letting the exception escape the handler.
        """
        try:
            r = requests.get(SITE + self.path, timeout=self.upstream_timeout)
        except requests.RequestException:
            self.send_error(502, 'Upstream request failed')
            return

        # The upstream response is not guaranteed to carry a content type;
        # indexing the headers directly would raise KeyError in that case.
        content_type = r.headers.get('content-type', '')

        # Replace url for better navigation
        request_content = r.content.replace(SITE, '')

        # Build the final body before any header is written, so a failure
        # during transformation does not leave the client with a success
        # status line followed by an empty body.
        if 'text/html' in content_type:
            page_tree = BeautifulSoup(request_content, 'lxml')
            request_content = transform_content(page_tree)

        self.send_response(r.status_code)
        if content_type:
            self.send_header('Content-type', content_type)
        self.end_headers()
        self.wfile.write(request_content)
def main():
    """
    Start the proxy on localhost:PORT, open it in the default browser and
    serve requests until interrupted.
    """
    server = HTTPServer(("localhost", PORT), HttpProcessor)
    try:
        # The server socket is already bound and listening at this point,
        # so the browser's first request is queued until serve_forever()
        # starts handling it.
        webbrowser.open_new('http://localhost:%s' % PORT)
        server.serve_forever()
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop the proxy; exit quietly.
        pass
    finally:
        # Always release the listening socket.
        server.server_close()


if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment