Skip to content

Instantly share code, notes, and snippets.

@kidig
Last active August 29, 2015 14:27
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save kidig/ff514ab803198e0d0964 to your computer and use it in GitHub Desktop.
Save kidig/ff514ab803198e0d0964 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# coding: utf-8
import re
import requests
from bottle import route, run, response
from bs4 import BeautifulSoup, Comment, Doctype
# Address the local proxy server binds to (passed to bottle's run()).
HOST = 'localhost'
PORT = 8088

# Upstream site whose pages are fetched and rewritten by the proxy.
SITE_URL = 'http://habrahabr.ru'
def main():
    """Start bottle's server bound to HOST and PORT."""
    run(port=PORT, host=HOST)
@route('/')
@route('/<path:path>')
def index(path=None):
    """Fetch the matching page from SITE_URL and return it to the client.

    HTML responses are passed through parse() before being returned; any
    other content type is relayed unchanged.

    :param path: request path below the site root, or None for the root.
    :returns: the (possibly rewritten) response body as bytes.
    """
    url = '%s/%s' % (SITE_URL, path) if path else SITE_URL

    # Without a timeout a stalled upstream would block this worker forever.
    r = requests.get(url, timeout=10)

    # Mirror the upstream status so that e.g. a 404 is not served as 200.
    response.status = r.status_code

    content = r.content

    # The header is optional; a plain [] lookup would raise KeyError and
    # turn a header-less upstream response into a 500.
    content_type = r.headers.get('content-type', '')
    if content_type:
        response.content_type = content_type

    if 'text/html' in content_type:
        content = parse(content)
    return content
def parse(content):
    """Rewrite an HTML page so it can be served through the proxy.

    Two transformations are applied:

    * ``<a>`` links pointing at SITE_URL are made site-relative so that
      navigation stays on the proxy.
    * Every visible text node is passed through add_trademark().

    :param content: raw HTML markup of the upstream page.
    :returns: the prettified document encoded as UTF-8 bytes.
    """
    soup = BeautifulSoup(content, "lxml")

    for a in soup('a'):
        href = a.get('href')
        if not href or not href.startswith(SITE_URL):
            # Only links that *begin* with the site URL are ours; a blind
            # str.replace() would also mangle foreign URLs that merely
            # embed it (e.g. "share this page" links).
            continue
        rest = href[len(SITE_URL):]
        # Make sure the match ended on a URL component boundary, and never
        # emit an empty href: browsers resolve "" to the current page
        # rather than to the site root.
        if rest[:1] in ('', '/', '?', '#'):
            a['href'] = rest if rest.startswith('/') else '/' + rest

    # Text inside these parents is not rendered as page text.
    skipped_parents = ('style', 'script', 'noscript', 'head', '[document]')
    for node in soup.find_all(string=lambda s: not isinstance(s, (Comment, Doctype))):
        if node.parent.name in skipped_parents:
            continue
        # Skip whitespace-only nodes, but substitute on the untouched
        # string: stripping it first would drop leading/trailing whitespace
        # that is significant inside e.g. <pre> blocks.
        if node.strip():
            node.replace_with(add_trademark(node))

    return soup.prettify().encode('utf-8')
# A standalone run of exactly six word characters that is not attached to a
# hyphen on either side. Compiled once at import time instead of on every
# call. A plain raw string is used because the ``ur''`` prefix is a
# SyntaxError on Python 3; the pattern is pure ASCII, so it behaves the same
# on Python 2.
_SIX_CHAR_WORD = re.compile(r'\b(?<!-)(\w{6})(?!-)\b', re.UNICODE)


def add_trademark(text):
    """Append a trademark sign to every six-character word in *text*.

    A "word" is a run of ``\\w`` characters (Unicode letters, digits and
    underscore) delimited by word boundaries. Words directly preceded or
    followed by a hyphen are left alone.

    :param text: unicode text to process.
    :returns: *text* with the trademark sign inserted after each match.
    """
    return _SIX_CHAR_WORD.sub(u'\\1™', text)
# Start the proxy only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment