Skip to content

Instantly share code, notes, and snippets.

@lesha-co
Last active December 9, 2016 13:43
Show Gist options
  • Save lesha-co/80d54e89a01fe18122f16bdb91469e2d to your computer and use it in GitHub Desktop.
Save lesha-co/80d54e89a01fe18122f16bdb91469e2d to your computer and use it in GitHub Desktop.
# coding=utf-8
import re
from urlparse import urlparse, urlunparse
from flask import Flask
from bs4 import BeautifulSoup, NavigableString, Tag
import requests
# Flask application object; the route decorators further down register on it.
app = Flask(__name__)
# Host whose pages are fetched and whose absolute links get rewritten.
DOMAIN = 'habrahabr.ru'
# Names of tags whose contents recursive_replace() does not descend into.
IGNORED_TAGS = ['style', 'script']
# Local port the app listens on; also used when rewriting links to localhost.
PORT = 5000
# Sign appended to every matched word.
TM = u'™'
# The (?u) flag makes \w match Cyrillic (and other non-ASCII) letters on
# Python 2. An inline global flag has to be the very first thing in the
# pattern: putting it mid-pattern is deprecated since Python 3.6 and is
# rejected with re.error by Python 3.11+.
six_letters_word = r'(?u)\b(\w{6})\b'
# Replacement template: the matched word followed by TM.
substitute = r'\1' + TM
def recursive_replace(tag):
    """Append TM to every six-letter word in the text under *tag*, in place.

    Walks the direct children of ``tag``: plain text nodes are rewritten with
    the ``six_letters_word`` / ``substitute`` regex pair, and child tags are
    processed recursively unless their name is listed in ``IGNORED_TAGS``.

    :param tag: a BeautifulSoup ``Tag`` (or the soup object itself) to modify.
    :returns: None. An ``AttributeError`` raised while walking is printed and
        ends processing of this ``tag`` instead of propagating to the caller.
    """
    try:
        # Iterate over a snapshot: replace_with() edits tag.contents itself,
        # and mutating a list while iterating over it is fragile.
        for item in list(tag.contents):
            # Exact type checks on purpose: bs4 models comments, CDATA and
            # doctypes as NavigableString subclasses, and those must not be
            # rewritten, so isinstance() would be wrong here.
            if type(item) is NavigableString:
                text = unicode(item)
                new_text = re.sub(six_letters_word, substitute, text)
                # Leave the tree alone when nothing matched.
                if new_text != text:
                    item.replace_with(new_text)
            elif type(item) is Tag:
                if item.name not in IGNORED_TAGS:
                    recursive_replace(item)
    except AttributeError as ae:
        # Best effort: report the problem and keep serving the page.
        print(ae)
# Seconds to wait for the upstream site; without a timeout a stalled upstream
# would block the request handler indefinitely.
_UPSTREAM_TIMEOUT = 10


@app.route('/', defaults={'path': ''})
@app.route('/<path:path>')
def main(path):
    """Fetch ``path`` from DOMAIN and return it, decorating HTML pages.

    HTML responses are parsed, passed through ``recursive_replace`` and have
    their absolute links to DOMAIN pointed back at ``localhost:PORT``. Any
    other content is returned unmodified together with its original
    Content-Type. The upstream status code is forwarded in both cases.

    :param path: URL path requested from this app (empty for the root).
    :returns: a Flask response tuple ``(body, status)`` or
        ``(body, status, headers)``.
    :raises requests.exceptions.RequestException: if the upstream request
        fails or times out.
    """
    r = requests.get('http://{}/{}'.format(DOMAIN, path),
                     timeout=_UPSTREAM_TIMEOUT)
    # The header may be absent; treat that the same as "not HTML".
    content_type = r.headers.get('content-type', '')

    # JS, CSS, images etc. need no fixing: pass them through untouched, but
    # keep their Content-Type so they are not re-served as text/html.
    if 'text/html' not in content_type:
        if content_type:
            return r.content, r.status_code, {'Content-Type': content_type}
        return r.content, r.status_code

    soup = BeautifulSoup(r.content, 'html.parser')
    recursive_replace(soup)

    # Fix up links that use absolute URLs so navigation stays on the proxy.
    local_netloc = "localhost:{}".format(PORT)
    for link in soup.find_all('a', href=True):
        try:
            parts = urlparse(link['href'])
        except ValueError:
            # A malformed href must not break the whole page; leave it as is.
            continue
        if DOMAIN == parts.netloc:  # external links are left untouched
            new_parts = parts._replace(netloc=local_netloc, scheme='http')
            link['href'] = urlunparse(new_parts)
    return str(soup), r.status_code
# Start Flask's built-in server only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    app.run(port=PORT)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment