Skip to content

Instantly share code, notes, and snippets.

@yamingd
Created September 10, 2013 05:34
Show Gist options
  • Save yamingd/6505352 to your computer and use it in GitHub Desktop.
Save yamingd/6505352 to your computer and use it in GitHub Desktop.
HTML highlighter for words and sentences
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re
# Tag names that HtmlLighter._repl_tags lets through; a tag whose name is
# not listed here is replaced with an empty string.
keeps = (
    'em', 'i', 'b', 'strong',
    'p', 'br', 'img', 'u',
)

# Any "<...>" span: group 1 is the optional closing slash, group 2 is the
# tag name together with whatever attributes follow it.
CLEANBODY_RE = re.compile(r'<(/?)(.+?)>', flags=re.MULTILINE)

# A double-quoted src attribute: group 1 is the whole attribute, group 2 is
# the quoted value only.
IMG_SRC = re.compile(r'(src="(.*?)")', flags=re.MULTILINE)

# A run of word characters with optional markup/space characters around it.
# Not referenced by the code visible in this file.
WORD_RE = re.compile(r'(([\s</]*?)(\w+)([\s>]*?))', flags=re.MULTILINE)
def force_unicode(s, encoding='utf-8', errors='ignore'):
    """
    Return a unicode (text) object representing ``s``.

    ``None`` becomes an empty string, bytestrings are decoded with
    ``encoding``/``errors``, text is returned unchanged, and any other object
    is converted through ``__unicode__`` (when it has one) or ``str()``.

    :param s: value to convert.
    :param encoding: codec used to decode bytestrings.
    :param errors: codec error policy; the default ``'ignore'`` silently
        drops undecodable bytes.
    :returns: the text form of ``s``.
    :raises UnicodeDecodeError: when decoding fails under a strict ``errors``
        policy and ``s`` is not an exception instance.
    """
    # Resolve the text/string types locally so the function runs unchanged on
    # Python 2 (unicode/basestring) and Python 3 (str/bytes).
    try:
        text_type, string_types = unicode, basestring
    except NameError:
        text_type, string_types = str, (str, bytes)

    if s is None:
        return u''
    try:
        if not isinstance(s, string_types):
            if hasattr(s, '__unicode__'):
                s = text_type(s)
            else:
                try:
                    # Convert into a temporary so ``s`` still refers to the
                    # original object if either step below raises.
                    converted = str(s)
                    if not isinstance(converted, text_type):
                        converted = converted.decode(encoding, errors)
                    s = converted
                except UnicodeEncodeError:
                    if not isinstance(s, Exception):
                        raise
                    # The caller passed an Exception carrying non-ASCII
                    # data that str() cannot render. Approximate the
                    # standard str() output from its individual arguments
                    # instead of raising a further exception.
                    s = u' '.join(
                        [force_unicode(arg, encoding, errors)
                         for arg in s.args])
        elif not isinstance(s, text_type):
            # Use .decode() rather than the text constructor so that string
            # subclasses get the chance to return their own text subclass.
            s = s.decode(encoding, errors)
    except UnicodeDecodeError:
        if not isinstance(s, Exception):
            # Re-raise the original error untouched. Building a new
            # UnicodeDecodeError from (s, *e.args) passes six arguments to a
            # five-argument constructor and fails with TypeError instead.
            raise
        # The caller passed an Exception populated with undecodable
        # bytestring data; force each of its arguments to text individually.
        s = u' '.join(
            [force_unicode(arg, encoding, errors) for arg in s.args])
    return s
class HtmlLighter(object):
    """
    Whitelist-based HTML cleaner and keyword highlighter.

    ``strip_tags`` removes every tag whose name is not in the module-level
    ``keeps`` tuple and rewrites ``<img>`` tags so that their ``src`` is
    prefixed with ``domain`` when it is not already an http(s) URL.
    ``words``, ``sentence`` and ``light`` wrap matching text in
    ``<u>...</u>``; existing tags are never modified by highlighting.
    """

    # Leading tag name inside "<...>": everything up to the first whitespace
    # character or "/", so "br/" and "img\tsrc=..." give "br" and "img".
    _TAG_NAME_RE = re.compile(r'[^\s/]*')

    def __init__(self, domain):
        """Store ``domain``, the prefix put in front of relative image paths."""
        self.domain = domain

    @staticmethod
    def _compile_picks(patterns):
        """
        Build the highlight regex for already-escaped ``patterns``.

        The first alternative consumes a whole tag so the replacement callback
        can hand it back untouched; this keeps tag names and attribute values
        (for example an image URL) from being wrapped in ``<u>``. The
        lookarounds act as word boundaries that also work for picks that start
        or end with a non-word character such as "c++".
        """
        body = '|'.join(patterns)
        return re.compile(
            r'<[^>]*>|(?<!\w)(?:%s)(?!\w)' % body, re.I | re.U)

    def _format_img(self, ct):
        """
        Return a normalized ``<img src="..." />`` for the raw tag text ``ct``.

        Returns an empty string when no double-quoted ``src`` is present.
        """
        # Reuse the module-level pattern but match case-insensitively, so the
        # tag does not have to be lowercased first (which would corrupt
        # case-sensitive URLs).
        found = re.search(IMG_SRC.pattern, ct, IMG_SRC.flags | re.I)
        if found is None:
            return ''
        src = found.group(2)
        if not src.lower().startswith(('http://', 'https://')):
            # Join domain and path with exactly one slash between them.
            if self.domain.endswith('/') and src.startswith('/'):
                src = src[1:]
            elif not self.domain.endswith('/') and not src.startswith('/'):
                src = '/' + src
            src = u'%s%s' % (self.domain, src)
        return '<img src="%s" />' % src

    def _repl_tags(self, match):
        """
        Replacement callback for ``CLEANBODY_RE``.

        ``<p>`` tags lose their attributes, ``<img>`` tags are rebuilt by
        ``_format_img``, other whitelisted tags are lowercased, and everything
        else is removed.
        """
        # Stop the name at "/" as well as whitespace so "<br/>" is "br".
        tag = self._TAG_NAME_RE.match(match.group(2)).group(0).lower()
        if tag == 'p':
            return '<%sp>' % match.group(1)
        if tag == 'img':
            # Pass the tag in its original case to keep the URL intact.
            return self._format_img(match.group(0))
        if tag in keeps:
            return match.group(0).lower()
        return u''

    def strip_tags(self, text):
        """Return ``text`` as unicode with non-whitelisted tags removed."""
        return CLEANBODY_RE.sub(self._repl_tags, force_unicode(text))

    def _repl_words(self, match):
        """Underline a matched word; return a matched tag unchanged."""
        found = match.group(0)
        if found.startswith('<'):
            return found
        return '<u>' + found + '</u>'

    def words(self, text, words):
        """
        Underline every whole-word, case-insensitive occurrence of ``words``.

        :param text: HTML text to search.
        :param words: iterable of words; empty or blank entries are ignored.
        :returns: ``text`` unchanged when there is nothing to look for,
            otherwise the highlighted unicode text.
        """
        if not words:
            return text
        # An empty word would compile to an empty pattern and insert
        # "<u></u>" at every boundary, so drop blanks up front.
        wanted = [w for w in words if w and w.strip()]
        if not wanted:
            return text
        text = force_unicode(text)
        # Escape each word so regex metacharacters match literally, and try
        # longer alternatives first so "c++" wins over "c".
        escaped = sorted(
            set(re.escape(force_unicode(w)) for w in wanted),
            key=lambda p: (-len(p), p))
        return self._compile_picks(escaped).sub(self._repl_words, text)

    def _repl_sentences(self, match):
        """Underline a matched phrase; return a matched tag unchanged."""
        found = match.group(0)
        if found.startswith('<'):
            return found
        return '<u>' + found + "</u>"

    def sentence(self, text, sentence, limit=4):
        """
        Underline case-insensitive occurrences of a multi-word ``sentence``.

        The words must appear in order, separated by whitespace, and the
        phrase must start and end on a word boundary.

        :param text: HTML text to search.
        :param sentence: phrase to highlight; it is split on whitespace.
        :param limit: phrases with more words than this are not highlighted.
        :returns: ``text`` unchanged when the phrase is blank or longer than
            ``limit`` words, otherwise the highlighted unicode text.
        """
        tokens = sentence.split()
        # A blank sentence would compile to an empty pattern and corrupt the
        # text, so it is treated as "nothing to highlight".
        if not tokens or len(tokens) > limit:
            return text
        text = force_unicode(text)
        phrase = r'\s+'.join(re.escape(force_unicode(t)) for t in tokens)
        return self._compile_picks([phrase]).sub(self._repl_sentences, text)

    def light(self, text, picks, limit=4):
        """
        Highlight all ``picks`` in ``text``.

        Picks without a space are handled by ``words``; picks containing a
        space are handled one at a time by ``sentence`` with ``limit``.
        """
        if not picks:
            return text
        singles = [item for item in picks if ' ' not in item]
        phrases = [item for item in picks if ' ' in item]
        if singles:
            text = self.words(text, singles)
        for phrase in phrases:
            text = self.sentence(text, phrase, limit=limit)
        return text
if __name__ == '__main__':
    # Demo: clean profile.txt, then underline a few sample words and phrases.
    hb = HtmlLighter(u'http://www.51voa.com')
    # Binary mode throughout: strip_tags decodes the raw bytes itself, and
    # the results are written as explicitly UTF-8-encoded bytes, which a
    # text-mode handle would either reject or newline-translate.
    with open('profile.txt', 'rb') as f:
        text = hb.strip_tags(f.read())
    with open('profile.clean.html', 'wb') as f:
        f.write(text.encode('utf8'))
    text2 = hb.light(text, ["News", "strong", "hurricane", "the storm", "according to", "scientists wrote the report"])
    with open('profile.clean2.html', 'wb') as f:
        f.write(text2.encode('utf8'))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment