Created
September 10, 2013 05:34
-
-
Save yamingd/6505352 to your computer and use it in GitHub Desktop.
HTML highlighter for words and sentences
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import re | |
# Lower-case names of the HTML tags that are allowed to remain in cleaned text.
keeps = ('em', 'i', 'b', 'strong', 'p', 'br', 'img', 'u')

# Any HTML tag.  Group 1 is the optional leading slash of a closing tag,
# group 2 is everything else between the angle brackets (name + attributes).
# re.S lets '.' cross newlines so that a tag whose attributes are wrapped over
# several lines is still recognised as one tag.
CLEANBODY_RE = re.compile(r'<(/?)(.+?)>', re.M | re.S)

# A double-quoted ``src`` attribute.  Group 1 is the whole attribute,
# group 2 is the URL between the quotes.
IMG_SRC = re.compile(r'(src="(.*?)")', re.M)

# A single word with optional surrounding whitespace / tag punctuation.
# Group 3 is the word itself.
# NOTE(review): not referenced by any other code visible in this file.
WORD_RE = re.compile(r'(([\s</]*?)(\w+)([\s>]*?))', re.M)
def force_unicode(s, encoding='utf-8', errors='ignore'):
    """
    Return a text (unicode) object representing ``s``.

    * ``None`` becomes the empty string.
    * Text is returned unchanged.
    * Bytestrings are decoded with ``encoding`` and the ``errors`` policy.
    * Any other object is converted through ``__unicode__`` when it defines
      one, otherwise through ``str()``.

    The function runs on both Python 2 and Python 3.

    Raises ``UnicodeDecodeError`` when a bytestring cannot be decoded under
    the given ``errors`` policy (only possible with a non-lenient policy such
    as ``'strict'``).
    """
    if s is None:
        return u''

    try:
        text_type = unicode  # Python 2
    except NameError:
        text_type = str  # Python 3: ``str`` is already the text type

    if isinstance(s, text_type):
        return s

    try:
        if isinstance(s, (bytes, bytearray)):
            # Use .decode() rather than text_type(s, encoding, errors) so a
            # bytestring subclass that overrides decode() (the original code
            # mentions SafeString -> SafeUnicode) keeps control of the result.
            return s.decode(encoding, errors)
        if hasattr(s, '__unicode__'):
            return text_type(s)
        try:
            if text_type is str:
                return str(s)
            return text_type(str(s), encoding, errors)
        except UnicodeEncodeError:
            if not isinstance(s, Exception):
                raise
            # The caller passed an Exception populated with non-ASCII data
            # that str() cannot render.  Approximate the standard str()
            # output from its arguments instead of raising again.
            return u' '.join(
                [force_unicode(arg, encoding, errors) for arg in s.args])
    except UnicodeDecodeError:
        if not isinstance(s, Exception):
            # Re-raise the original error untouched.  Building a new
            # UnicodeDecodeError from (s, *e.args) passes six arguments to a
            # five-argument constructor and would surface as a TypeError.
            raise
        # An Exception carrying non-ASCII bytestring data and no working
        # unicode method: force each argument individually.
        return u' '.join(
            [force_unicode(arg, encoding, errors) for arg in s.args])
class HtmlLighter(object):
    """Reduce HTML to a small tag whitelist and underline picked phrases.

    ``strip_tags`` removes every tag that is not whitelisted and rewrites
    ``<img>`` tags so their ``src`` is absolute with respect to ``domain``.
    ``words``, ``sentence`` and ``light`` wrap occurrences of the requested
    words / short sentences in ``<u>...</u>``.
    """

    # Case-insensitive so ``SRC="..."`` is found without lowercasing the tag,
    # which would also lowercase (and thereby corrupt) the URL.
    _IMG_SRC_RE = re.compile(r'src="(.*?)"', re.I)
    # Splits markup into alternating chunks: text at even indexes, tags at odd.
    _TAG_SPLIT_RE = re.compile(r'(<[^>]*>)')
    # Equivalent to \b...\b for ordinary words, but still usable when the
    # phrase starts or ends with punctuation (e.g. "c++").
    _BOUNDED = r'(?<!\w)%s(?!\w)'
    # URL prefixes that must not have ``domain`` prepended.
    _ABSOLUTE_PREFIXES = ('http://', 'https://', '//')

    def __init__(self, domain):
        """Remember ``domain``, the base URL prepended to relative image paths."""
        self.domain = domain

    def _format_img(self, ct):
        """Return a minimal ``<img src="..." />`` for the raw tag text ``ct``.

        Returns ``''`` when the tag has no double-quoted ``src`` attribute.
        Relative URLs are joined to ``self.domain`` with exactly one slash.
        """
        found = self._IMG_SRC_RE.search(ct)
        if found is None:
            return ''
        src = found.group(1)
        if not src.lower().startswith(self._ABSOLUTE_PREFIXES):
            src = u'%s/%s' % (self.domain.rstrip('/'), src.lstrip('/'))
        return '<img src="%s" />' % src

    def _repl_tags(self, match):
        """Substitution callback for ``CLEANBODY_RE``: keep, rewrite or drop a tag.

        ``match`` must expose the optional closing slash as group 1 and the
        rest of the tag (name plus attributes) as group 2.
        """
        pieces = match.group(2).split()
        # Split on any whitespace and ignore a self-closing slash so that
        # "<br/>" and "<p\tclass=x>" resolve to "br" and "p".
        tag = pieces[0].rstrip('/').lower() if pieces else ''
        if tag == 'p':
            # Paragraphs are kept but lose all their attributes.
            return '<%sp>' % match.group(1)
        if tag == 'img':
            # Pass the tag with its original casing; URLs are case-sensitive.
            return self._format_img(match.group(0))
        if tag in keeps:
            # NOTE(review): attributes of whitelisted tags pass through
            # (lowercased); sanitise them if the input is untrusted.
            return match.group(0).lower()
        return u''

    def strip_tags(self, text):
        """Return ``text`` as unicode with all non-whitelisted tags removed."""
        text = force_unicode(text)
        return CLEANBODY_RE.sub(self._repl_tags, text)

    def _sub_in_text(self, regex, repl, text):
        """Apply ``regex.sub(repl, ...)`` only to the text between tags.

        Tag chunks (anything of the form ``<...>``) are copied through
        untouched so tag names and attribute values are never rewritten.
        """
        chunks = self._TAG_SPLIT_RE.split(text)
        for index in range(0, len(chunks), 2):
            if chunks[index]:
                chunks[index] = regex.sub(repl, chunks[index])
        return u''.join(chunks)

    def _repl_words(self, match):
        """Substitution callback: underline a matched word.

        A match beginning with ``<`` is returned unchanged so that a tag
        name is never wrapped.
        """
        if match.group(0).startswith('<'):
            return match.group(0)
        return '<u>' + match.group(0) + '</u>'

    def words(self, text, words):
        """Underline every whole-word, case-insensitive occurrence of ``words``.

        ``text`` is returned untouched when ``words`` is empty; otherwise it
        is converted to unicode first.  Each word is matched literally
        (regex metacharacters are escaped) and only outside of tags.
        """
        if not words:
            return text
        text = force_unicode(text)
        for w in words:
            if not w.strip():
                # An empty pattern would match at every position.
                continue
            regex = re.compile(
                self._BOUNDED % re.escape(w.lower()), re.I | re.M)
            text = self._sub_in_text(regex, self._repl_words, text)
        return text

    def _repl_sentences(self, match):
        """Substitution callback: underline a matched sentence."""
        return '<u>' + match.group(0) + "</u>"

    def sentence(self, text, sentence, limit=4):
        """Underline occurrences of ``sentence`` when it has at most ``limit`` words.

        ``text`` is returned untouched when ``sentence`` is blank or longer
        than ``limit`` words.  Words are matched literally, case-insensitively,
        separated by any run of whitespace, and only outside of tags.
        """
        tokens = sentence.split()
        if not tokens or len(tokens) > limit:
            return text
        text = force_unicode(text)
        body = r'\s+'.join([re.escape(token) for token in tokens])
        regex = re.compile(self._BOUNDED % body, re.I | re.M)
        return self._sub_in_text(regex, self._repl_sentences, text)

    def light(self, text, picks, limit=4):
        """Underline every pick in ``text``.

        Picks without a space are handled by ``words``; picks containing a
        space are handled one by one by ``sentence`` with the given ``limit``.
        """
        if not picks:
            return text
        singles = [item for item in picks if ' ' not in item]
        phrases = [item for item in picks if ' ' in item]
        if singles:
            text = self.words(text, singles)
        for phrase in phrases:
            text = self.sentence(text, phrase, limit=limit)
        return text
if __name__ == '__main__':
    # Demo run: clean profile.txt, then underline a few sample picks.
    hb = HtmlLighter(u'http://www.51voa.com')
    # Read raw bytes; strip_tags converts them to unicode itself.
    with open('profile.txt', 'rb') as f:
        text = hb.strip_tags(f.read())
    # The payload is explicitly UTF-8 encoded, so the files are opened in
    # binary mode: a text-mode handle rejects bytes on Python 3 and would
    # apply newline translation to them on Windows.
    with open('profile.clean.html', 'wb') as f:
        f.write(text.encode('utf8'))
    text2 = hb.light(text, ["News", "strong", "hurricane", "the storm", "according to", "scientists wrote the report"])
    with open('profile.clean2.html', 'wb') as f:
        f.write(text2.encode('utf8'))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment