Skip to content

Instantly share code, notes, and snippets.

@adilkhash
Last active January 6, 2017 09:03
Show Gist options
  • Save adilkhash/050d7ba461f29c6977d0a1f9a67297ac to your computer and use it in GitHub Desktop.
Save adilkhash/050d7ba461f29c6977d0a1f9a67297ac to your computer and use it in GitHub Desktop.
Ivelum test
# -*- coding: utf8 -*-
import re
import string
from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
import requests
from bs4 import BeautifulSoup
def get_response(url):
    """Fetch *url* with an HTTP GET and return the ``requests`` response.

    The request is bounded by a ``(10, 20)`` timeout pair, which
    ``requests`` interprets as (connect, read) seconds.
    """
    connect_timeout, read_timeout = 10, 20
    return requests.get(url, timeout=(connect_timeout, read_timeout))
def escape(s):
    """Return *s* with ``<`` and ``>`` replaced by HTML entities.

    Only the angle brackets are rewritten; ``&`` is left untouched
    (presumably so entities already present in the text, such as
    ``&nbsp;``, pass through unchanged -- confirm against callers).
    """
    for raw, entity in (('<', '&lt;'), ('>', '&gt;')):
        s = s.replace(raw, entity)
    return s
def trademark_it(word):
    """Append a trademark sign to *word* when it holds a six-letter word.

    "Letters" are Cyrillic/Latin letters and the underscore; everything
    else (punctuation, digits, ``&nbsp;`` entities) is ignored when
    counting.  Leading and trailing non-letter characters are preserved
    around the marked word, e.g. ``"привет",`` -> ``"привет™",``.

    If the letters do not form one contiguous run inside *word* (for
    example ``ab-cdef`` or ``abc&nbsp;def``), the word is returned
    unchanged: there is no single six-letter word to mark, and slicing
    around a failed ``find`` would garble the text.

    Returns a unicode string in every case.
    """
    letters = re.sub(
        u'[^а-яА-ЯёЁa-zA-Z_]+',
        '',
        word.replace('&nbsp;', ''),
        flags=re.IGNORECASE,
    )
    if len(letters) != 6:
        return u'{0}'.format(word)

    start = word.find(letters)
    if start < 0:
        # The six letters are interrupted by other characters inside the
        # word, so they are not one word; leave the input untouched.
        return u'{0}'.format(word)

    # Restore the characters trimmed from both ends around the marked word.
    end = start + len(letters)
    return u'{0}{1}™{2}'.format(word[:start], letters, word[end:])
def proccess_text(text):
    """Return *text* with six-letter words trademarked and ``<``/``>`` escaped.

    The text is split on the regular space character only, each piece is
    passed through ``trademark_it`` and then ``escape``, and the pieces
    are re-joined with single spaces, so the original spacing is kept.

    Non-breaking spaces are converted to the ``&nbsp;`` entity first so
    they are not lost; they do not act as word separators.
    """
    # Convert the non-breaking space to an entity so it is not lost.
    text = text.replace(u'\N{NO-BREAK SPACE}', '&nbsp;')
    # Split directly on the regular space.  A temporary "&csp;" marker is
    # unnecessary and would turn a literal "&csp;" in the page into a
    # space; the pieces contain no spaces, so no stripping is needed.
    words = [trademark_it(piece) for piece in text.split(' ')]
    return u' '.join(escape(word) for word in words)
def process_text_nodes(nodes):
    """Rewrite the text of each direct child of every node in *nodes*.

    For every child whose ``.string`` is not ``None``, that string is
    replaced in the tree by ``proccess_text`` of its unicode value.
    Children whose ``.string`` is ``None`` are skipped.  The tree is
    modified in place; nothing is returned.

    *nodes* is expected to be an iterable of BeautifulSoup tags (the
    callers in this file pass ``soup.find_all(...)`` results).
    """
    for node in nodes:
        # Iterate over a snapshot: replace_with() edits the parent's
        # contents list, and a direct text child lives in node.contents.
        for child in list(node.contents):
            text_node = child.string
            if text_node is None:
                continue
            text_node.replace_with(proccess_text(unicode(text_node)))
def replace_habr_urls(content):
    """Rewrite Habr URLs in the HTML *content* so the page works via the proxy.

    Two case-insensitive passes are applied:

    1. Every absolute ``https://habrahabr.ru`` prefix is removed, which
       turns links into site-relative ones (served through this proxy).
    2. ``link href="/..."`` attributes are pointed back at
       ``https://habrahabr.ru`` so ``<link>`` resources load from the
       origin site.  Protocol-relative URLs (``href="//host/..."``) are
       left alone, since prefixing them would yield a broken address such
       as ``https://habrahabr.ru//host/...``.

    Returns the rewritten text.
    """
    relative = re.sub(r'https://habrahabr\.ru', '', content, flags=re.IGNORECASE)
    # (?!/) skips protocol-relative hrefs; the lookahead consumes nothing,
    # so the replacement text is the same as for a plain leading slash.
    return re.sub(
        r'link href="/(?!/)',
        'link href="https://habrahabr.ru/',
        relative,
        flags=re.IGNORECASE,
    )
def hijack_habr(url):
    """Download the page at *url* and return it with text nodes rewritten.

    The response body has its Habr URLs adjusted by ``replace_habr_urls``,
    is parsed with the ``html.parser`` backend, and a fixed list of
    tag/class selections is passed to ``process_text_nodes``.  The result
    is the document encoded as UTF-8 with no output formatter.
    """
    # (tag name, CSS class) pairs, processed in this order.
    targets = (
        ('h1', 'post__title'),
        ('h2', 'post__title'),
        ('div', 'content'),
        ('span', 'tab-item__value'),
        ('div', 'buttons'),
    )
    page = get_response(url)
    soup = BeautifulSoup(replace_habr_urls(page.content), 'html.parser')
    for tag_name, css_class in targets:
        process_text_nodes(soup.find_all(tag_name, class_=css_class))
    return soup.encode(encoding='utf-8', formatter=None)
def build_url(uri):
    """Return the absolute Habr URL for the site-relative path *uri*."""
    template = 'https://habrahabr.ru{0}'
    return template.format(uri)
class HttpProcessor(BaseHTTPRequestHandler):
    """Request handler that proxies GET requests to habrahabr.ru.

    Paths under ``/fonts/`` are relayed as-is (status, headers, body);
    every other path is fetched and rewritten by ``hijack_habr`` and
    served as HTML.
    """

    # Upstream headers that describe the wire framing/encoding of the
    # original response.  ``requests`` hands back an already-decoded body,
    # so relaying these would mislabel what is actually written.
    _SKIPPED_HEADERS = frozenset([
        'connection',
        'content-encoding',
        'content-length',
        'transfer-encoding',
    ])

    def handle_fonts(self):
        """Relay the upstream response for ``self.path`` to the client.

        The upstream status code and headers are forwarded, except for
        the framing headers in ``_SKIPPED_HEADERS``; ``Content-Length``
        is recomputed from the body that is really sent.
        """
        # Use the shared helper so this request is time-bounded like
        # the page requests.
        response = get_response(build_url(self.path))
        body = response.content
        self.send_response(response.status_code)
        for name, value in response.headers.items():
            if name.lower() not in self._SKIPPED_HEADERS:
                self.send_header(name, value)
        self.send_header('Content-Length', str(len(body)))
        self.end_headers()
        self.wfile.write(body)

    def _handle_page(self):
        """Serve the rewritten HTML page for ``self.path``."""
        # Build the body before sending anything, so an upstream failure
        # can still be reported with a proper error status.
        body = hijack_habr(build_url(self.path))
        self.send_response(200)
        # hijack_habr() returns UTF-8 encoded bytes.
        self.send_header('content-type', 'text/html; charset=utf-8')
        self.end_headers()
        self.wfile.write(body)

    def do_GET(self):
        """Dispatch a GET request to the font relay or the page rewriter.

        Both handlers perform the upstream fetch before writing any part
        of the response, so a ``requests`` failure is answered with 502.
        """
        try:
            if self.path.startswith('/fonts/'):
                self.handle_fonts()
            else:
                self._handle_page()
        except requests.RequestException:
            self.send_error(502, 'Upstream request failed')
# Start the proxy on localhost:8080 and serve requests until interrupted.
serv = HTTPServer(
    server_address=('localhost', 8080),
    RequestHandlerClass=HttpProcessor,
)
serv.serve_forever()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment