Skip to content

Instantly share code, notes, and snippets.

@valignatev
Forked from anonymous/gist:06e0bd519490c8f03404
Last active October 19, 2016 12:45
Show Gist options
  • Save valignatev/2156782b65cb2b164bd4 to your computer and use it in GitHub Desktop.
Save valignatev/2156782b65cb2b164bd4 to your computer and use it in GitHub Desktop.
Add a trademark sign (™) to every word that is exactly 6 letters long
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import string
import webbrowser
import re
from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
# pip install requests
# pip install beautifulsoup4
# pip install lxml
import requests
from bs4 import BeautifulSoup, Comment
from bs4.dammit import EntitySubstitution
from bs4.element import NavigableString
# Upstream site whose pages are fetched and rewritten, and the local port
# the HTTP server listens on.
SITE = 'https://habrahabr.ru'
PORT = 8232

# Matches a standalone word made of exactly six word characters (\w,
# Unicode-aware). Spelled as a u'' literal with escaped backslashes because
# the ur'' prefix is a SyntaxError on Python 3; the pattern is unchanged.
REGEXP = re.compile(u'\\b(\\w{6})\\b', re.UNICODE)

# Tags whose text content must not be rewritten.
NOT_ALLOWED_TAGS = [
    'body',
    'head',
    'link',
    'meta',
    'style',
    'title',
    'script',
]
def all_but_comments(text):
    """
    Filter predicate for page strings.

    Truthy only for strings that contain at least one REGEXP match and are
    not HTML comments, so comment nodes are never selected for rewriting.
    """
    found = REGEXP.search(text)
    if not found:
        # No six-letter word at all: hand back the falsy search result.
        return found
    return not isinstance(text, Comment)
# The stripped core of a candidate must be six word characters and nothing
# else; a bare length check would also accept fragments such as u'abcd e'.
_SIX_WORD_CHARS = re.compile(r'\w{6}\Z', re.UNICODE)


def add_trademark(word):
    """
    Append the trademark sign to a six-letter word.

    Surrounding punctuation and whitespace are ignored when measuring the
    word. The sign is appended to the end of ``word`` as given.

    :param word: text fragment to inspect
    :return: ``word`` followed by u'\\u2122' when its stripped core is
        exactly six word characters, otherwise ``word`` unchanged
    """
    core = word.strip(string.punctuation + string.whitespace)
    if _SIX_WORD_CHARS.match(core):
        return word + u'\u2122'
    return word
def transform_content(page_tree):
    """
    Adds trademark symbol to every six-letter word in the page text.

    Only words matched by REGEXP are offered to add_trademark; the text
    between matches is left untouched. (Splitting the string and feeding
    every piece to add_trademark also marked in-between fragments whose
    stripped length happened to be six, e.g. u'abcd e. '.)

    :param page_tree: BeautifulSoup object, modified in place
    :return: the document encoded as UTF-8, rendered with habraformatter
    """
    def mark(match):
        return add_trademark(match.group(0))

    for content in page_tree.find_all(string=all_but_comments):
        # Strings that sit directly inside an excluded tag are left as is.
        if content.parent.name in NOT_ALLOWED_TAGS:
            continue
        content.replace_with(REGEXP.sub(mark, content))
    return page_tree.encode('utf-8', formatter=habraformatter)
def habraformatter(entity):
    """
    Output formatter: escape XML entities only for strings inside <code>.

    Markup shown inside <code> stays invisible unless it is converted to
    valid html, so those strings are escaped. Everything else is returned
    as is, because escaping all the things is not wanted (plus signs on
    the frontpage for example).
    """
    inside_code = (
        isinstance(entity, NavigableString)
        and entity.parent.name == 'code'
    )
    if not inside_code:
        return entity
    return EntitySubstitution.substitute_xml(entity)
class HttpProcessor(BaseHTTPRequestHandler):
    """
    Request handler that fetches the requested path from SITE and relays
    the response, passing HTML pages through transform_content first.
    """

    # Seconds to wait for the upstream site. Without a timeout a stalled
    # upstream request would block the handler indefinitely.
    UPSTREAM_TIMEOUT = 10

    def do_GET(self):
        """Fetch SITE + path and write the (possibly rewritten) body back."""
        try:
            r = requests.get(SITE + self.path, timeout=self.UPSTREAM_TIMEOUT)
        except requests.RequestException:
            # Upstream unreachable or timed out: answer with a gateway error.
            self.send_error(502, 'Upstream request failed')
            return
        # The header may be absent; fall back to an empty value rather than
        # raising KeyError.
        content_type = r.headers.get('content-type', '')
        # Replace url for better navigation (r.content is a byte string)
        request_content = r.content.replace(SITE.encode('ascii'), b'')
        self.send_response(r.status_code)
        if content_type:
            self.send_header('Content-type', content_type)
        self.end_headers()
        if 'text/html' in content_type:
            page_tree = BeautifulSoup(request_content, 'lxml')
            request_content = transform_content(page_tree)
        self.wfile.write(request_content)
def main():
    """
    Start the HTTP server on localhost:PORT and open it in a web browser.

    Serves until interrupted; Ctrl-C stops the server quietly and the
    listening socket is always closed on the way out.
    """
    server = HTTPServer(("localhost", PORT), HttpProcessor)
    # The socket is already bound and listening at this point, so the
    # browser can connect before serve_forever starts accepting.
    webbrowser.open_new('http://localhost:%s' % PORT)
    try:
        server.serve_forever()
    except KeyboardInterrupt:
        # Interrupting is the way to stop the server; no traceback needed.
        pass
    finally:
        server.server_close()


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment