yamingd/gist:6505352

## gistfile1.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import re

# Tag names that HtmlLighter.strip_tags leaves in the output.
keeps = (
    'em', 'i', 'b', 'strong',
    'p', 'br', 'img', 'u',
)

# Any tag: group 1 is the optional closing slash, group 2 is everything
# between it and the first '>'.
CLEANBODY_RE = re.compile(r'<(/?)(.+?)>', re.MULTILINE)
# A double-quoted src attribute: group 1 is the whole attribute, group 2
# is the quoted value.
IMG_SRC = re.compile(r'(src="(.*?)")', re.MULTILINE)
# A word with optional leading/trailing tag punctuation or whitespace.
# NOTE(review): not referenced anywhere else in the code shown here.
WORD_RE = re.compile(r'(([\s</]*?)(\w+)([\s>]*?))', re.MULTILINE)


def force_unicode(s, encoding='utf-8', errors='ignore'):
    """
    Return a unicode (text) object representing ``s``.

    Bytestrings are decoded with the ``encoding`` codec using the ``errors``
    policy. Other objects are converted through ``__unicode__`` when they
    define it, otherwise through ``str()``. ``None`` becomes the empty
    string, and text that is already unicode is returned unchanged.

    Raises UnicodeDecodeError when a bytestring cannot be decoded under the
    requested ``errors`` policy (for example ``errors='strict'``).
    """
    # Resolve the text/bytes types at call time so the helper runs on both
    # Python 2 (unicode/basestring) and Python 3 (str/bytes).
    try:
        text_type, string_types = unicode, basestring
    except NameError:
        text_type, string_types = str, (str, bytes)

    if s is None:
        return u''

    try:
        if not isinstance(s, string_types):
            if hasattr(s, '__unicode__'):
                s = text_type(s)
            else:
                try:
                    text = str(s)
                    if not isinstance(text, text_type):
                        text = text.decode(encoding, errors)
                    s = text
                except UnicodeEncodeError:
                    if not isinstance(s, Exception):
                        raise
                    # The caller passed an Exception populated with
                    # non-ASCII data that str() cannot render. Approximate
                    # the standard str() output by joining its arguments
                    # instead of raising a further exception.
                    s = u' '.join([force_unicode(arg, encoding, errors)
                                   for arg in s.args])
        elif not isinstance(s, text_type):
            # .decode() rather than text_type(s, encoding, errors) so that a
            # bytestring subclass can return its own unicode subclass.
            s = s.decode(encoding, errors)
    except UnicodeDecodeError:
        if not isinstance(s, Exception):
            # Re-raise the original error. Rebuilding it with an extra
            # leading argument would fail with TypeError, because
            # UnicodeDecodeError takes exactly five arguments.
            raise
        # An Exception holding non-ASCII bytestring data without a working
        # unicode conversion: force each argument individually.
        s = u' '.join([force_unicode(arg, encoding, errors)
                       for arg in s.args])
    return s


class HtmlLighter(object):
    """
    Reduce HTML to a small set of tags and underline picked terms.

    ``strip_tags`` removes every tag whose name is not in the module-level
    ``keeps`` tuple, strips attributes from ``<p>`` tags and rewrites
    ``<img>`` tags down to a single ``src`` attribute. ``words``,
    ``sentence`` and ``light`` wrap matching text in ``<u>...</u>``.
    """

    # Case-insensitive so that ``SRC="..."`` is found without lower-casing
    # the tag text, which would also lower-case the (case-sensitive) URL.
    _IMG_SRC_RE = re.compile(r'src="(.*?)"', re.I)

    def __init__(self, domain):
        """Store ``domain``, the prefix applied to relative image sources."""
        self.domain = domain

    def _format_img(self, ct):
        """
        Build a minimal ``<img>`` tag from the tag text ``ct``.

        Returns '' when ``ct`` has no double-quoted ``src`` attribute. A
        source that does not start with ``http://`` or ``https://`` is
        prefixed with ``self.domain``, joined by exactly one ``/``.
        """
        found = self._IMG_SRC_RE.search(ct)
        if found is None:
            return ''
        src = found.group(1)
        if not src.lower().startswith(('http://', 'https://')):
            base = self.domain
            if base.endswith('/'):
                # Avoid 'http://host//path' when both sides carry a slash.
                base = base[:-1]
            if not src.startswith('/'):
                src = '/' + src
            src = u'%s%s' % (base, src)
        return '<img src="%s" />' % src

    def _repl_tags(self, match):
        """
        Replacement callback for ``CLEANBODY_RE``.

        ``<p>`` tags lose their attributes, ``<img>`` tags are rebuilt by
        ``_format_img``, other tags named in ``keeps`` are lower-cased and
        kept, and everything else is removed.
        """
        # Split on any whitespace and drop a trailing '/', so that '<br/>'
        # and '<p\tclass="x">' are recognised by their tag name.
        parts = match.group(2).split()
        tag = parts[0].rstrip('/').lower() if parts else ''
        if tag == 'p':
            return '<%sp>' % match.group(1)
        if tag == 'img':
            # Pass the original text: lower-casing it would corrupt the URL.
            return self._format_img(match.group(0))
        if tag in keeps:
            return match.group(0).lower()
        return u''

    def strip_tags(self, text):
        """Return ``text`` as unicode with every tag passed through ``_repl_tags``."""
        text = force_unicode(text)
        return CLEANBODY_RE.sub(self._repl_tags, text)

    def _repl_words(self, match):
        """Underline a matched word unless the match starts with '<' (a tag name)."""
        found = match.group(0)
        if found.startswith('<'):
            return found
        return '<u>' + found + '</u>'

    def words(self, text, words):
        """
        Underline every whole-word, case-insensitive occurrence of ``words``.

        ``text`` is returned untouched when ``words`` is empty; otherwise it
        is coerced to unicode first. Empty entries are skipped, and each
        word is matched literally.
        """
        if not words:
            return text
        text = force_unicode(text)
        for w in words:
            if not w:
                # An empty word would match at every word boundary.
                continue
            # re.escape keeps words such as 'c++' from being read as regex
            # syntax.
            pattern = r'(([</]*?)\b%s\b)' % re.escape(w.lower())
            regex = re.compile(pattern, re.I | re.M)
            text = regex.sub(self._repl_words, text)
        return text

    def _repl_sentences(self, match):
        """Wrap a matched phrase in ``<u>...</u>``."""
        return '<u>' + match.group(0) + "</u>"

    def sentence(self, text, sentence, limit=4):
        """
        Underline case-insensitive occurrences of the phrase ``sentence``.

        The phrase is split on whitespace and its words may be separated by
        any amount of whitespace in ``text``. ``text`` is returned untouched
        when the phrase has no words or more than ``limit`` words.
        """
        tokens = [t for t in re.split(r'\s+', sentence) if t]
        # A phrase without words would compile to a pattern that matches
        # the empty string at every position.
        if not tokens or len(tokens) > limit:
            return text
        text = force_unicode(text)
        joined = r'(\s*?)'.join(['(%s)' % re.escape(t) for t in tokens])
        regex = re.compile('(' + joined + ')', re.I | re.M)
        return regex.sub(self._repl_sentences, text)

    def light(self, text, picks, limit=4):
        """
        Underline every pick in ``text``.

        Picks without a space are handled by ``words``; picks containing a
        space are handled one by one by ``sentence`` with ``limit``.
        """
        if not picks:
            return text
        single = [item for item in picks if ' ' not in item]
        phrases = [item for item in picks if ' ' in item]
        if single:
            text = self.words(text, single)
        for phrase in phrases:
            text = self.sentence(text, phrase, limit=limit)
        return text

if __name__ == '__main__':
    # Demo run: clean profile.txt, save the result, then underline a few
    # sample picks and save that version as well.
    hb = HtmlLighter(u'http://www.51voa.com')

    with open('profile.txt', 'r') as source:
        raw = source.read()
    text = hb.strip_tags(raw)

    with open('profile.clean.html', 'w+') as cleaned:
        cleaned.write(text.encode('utf8'))

    picks = [
        "News",
        "strong",
        "hurricane",
        "the storm",
        "according to",
        "scientists wrote the report",
    ]
    text2 = hb.light(text, picks)

    with open('profile.clean2.html', 'w+') as lit:
        lit.write(text2.encode('utf8'))
	#!/usr/bin/env python
	# -*- coding: utf-8 -*-

	import re

	# Tag names that HtmlLighter.strip_tags leaves in the output.
	keeps = ('em', 'i', 'b', 'strong', 'p', 'br', 'img', 'u')

	# Any tag: group 1 is the optional closing slash, group 2 is everything
	# between it and the first '>'.
	CLEANBODY_RE = re.compile(r'<(/?)(.+?)>', re.M)
	# A double-quoted src attribute: group 1 is the whole attribute, group 2
	# is the quoted value.
	IMG_SRC = re.compile(r'(src="(.*?)")', re.M)
	# The '*' quantifiers were lost when this copy was pasted; restored so
	# the pattern matches the definition earlier in the file.
	WORD_RE = re.compile(r'(([\s</]*?)(\w+)([\s>]*?))', re.M)


	def force_unicode(s, encoding='utf-8', errors='ignore'):
	    """
	    Return a unicode (text) object representing ``s``.

	    Bytestrings are decoded with the ``encoding`` codec using the ``errors``
	    policy. Other objects are converted through ``__unicode__`` when they
	    define it, otherwise through ``str()``. ``None`` becomes the empty
	    string, and text that is already unicode is returned unchanged.

	    Raises UnicodeDecodeError when a bytestring cannot be decoded under the
	    requested ``errors`` policy (for example ``errors='strict'``).
	    """
	    # Resolve the text/bytes types at call time so the helper runs on both
	    # Python 2 (unicode/basestring) and Python 3 (str/bytes).
	    try:
	        text_type, string_types = unicode, basestring
	    except NameError:
	        text_type, string_types = str, (str, bytes)

	    if s is None:
	        return u''

	    try:
	        if not isinstance(s, string_types):
	            if hasattr(s, '__unicode__'):
	                s = text_type(s)
	            else:
	                try:
	                    text = str(s)
	                    if not isinstance(text, text_type):
	                        text = text.decode(encoding, errors)
	                    s = text
	                except UnicodeEncodeError:
	                    if not isinstance(s, Exception):
	                        raise
	                    # The caller passed an Exception populated with
	                    # non-ASCII data that str() cannot render. Approximate
	                    # the standard str() output by joining its arguments
	                    # instead of raising a further exception.
	                    s = u' '.join([force_unicode(arg, encoding, errors)
	                                   for arg in s.args])
	        elif not isinstance(s, text_type):
	            # .decode() rather than text_type(s, encoding, errors) so that a
	            # bytestring subclass can return its own unicode subclass.
	            s = s.decode(encoding, errors)
	    except UnicodeDecodeError:
	        if not isinstance(s, Exception):
	            # Re-raise the original error. Rebuilding it with an extra
	            # leading argument would fail with TypeError, because
	            # UnicodeDecodeError takes exactly five arguments.
	            raise
	        # An Exception holding non-ASCII bytestring data without a working
	        # unicode conversion: force each argument individually.
	        s = u' '.join([force_unicode(arg, encoding, errors)
	                       for arg in s.args])
	    return s


	class HtmlLighter(object):
	    """
	    Reduce HTML to a small set of tags and underline picked terms.

	    ``strip_tags`` removes every tag whose name is not in the module-level
	    ``keeps`` tuple, strips attributes from ``<p>`` tags and rewrites
	    ``<img>`` tags down to a single ``src`` attribute. ``words``,
	    ``sentence`` and ``light`` wrap matching text in ``<u>...</u>``.
	    """

	    # Case-insensitive so that ``SRC="..."`` is found without lower-casing
	    # the tag text, which would also lower-case the (case-sensitive) URL.
	    _IMG_SRC_RE = re.compile(r'src="(.*?)"', re.I)

	    def __init__(self, domain):
	        """Store ``domain``, the prefix applied to relative image sources."""
	        self.domain = domain

	    def _format_img(self, ct):
	        """
	        Build a minimal ``<img>`` tag from the tag text ``ct``.

	        Returns '' when ``ct`` has no double-quoted ``src`` attribute. A
	        source that does not start with ``http://`` or ``https://`` is
	        prefixed with ``self.domain``, joined by exactly one ``/``.
	        """
	        found = self._IMG_SRC_RE.search(ct)
	        if found is None:
	            return ''
	        src = found.group(1)
	        if not src.lower().startswith(('http://', 'https://')):
	            base = self.domain
	            if base.endswith('/'):
	                # Avoid 'http://host//path' when both sides carry a slash.
	                base = base[:-1]
	            if not src.startswith('/'):
	                src = '/' + src
	            src = u'%s%s' % (base, src)
	        return '<img src="%s" />' % src

	    def _repl_tags(self, match):
	        """
	        Replacement callback for ``CLEANBODY_RE``.

	        ``<p>`` tags lose their attributes, ``<img>`` tags are rebuilt by
	        ``_format_img``, other tags named in ``keeps`` are lower-cased and
	        kept, and everything else is removed.
	        """
	        # Split on any whitespace and drop a trailing '/', so that '<br/>'
	        # and '<p\tclass="x">' are recognised by their tag name.
	        parts = match.group(2).split()
	        tag = parts[0].rstrip('/').lower() if parts else ''
	        if tag == 'p':
	            return '<%sp>' % match.group(1)
	        if tag == 'img':
	            # Pass the original text: lower-casing it would corrupt the URL.
	            return self._format_img(match.group(0))
	        if tag in keeps:
	            return match.group(0).lower()
	        return u''

	    def strip_tags(self, text):
	        """Return ``text`` as unicode with every tag passed through ``_repl_tags``."""
	        text = force_unicode(text)
	        return CLEANBODY_RE.sub(self._repl_tags, text)

	    def _repl_words(self, match):
	        """Underline a matched word unless the match starts with '<' (a tag name)."""
	        found = match.group(0)
	        if found.startswith('<'):
	            return found
	        return '<u>' + found + '</u>'

	    def words(self, text, words):
	        """
	        Underline every whole-word, case-insensitive occurrence of ``words``.

	        ``text`` is returned untouched when ``words`` is empty; otherwise it
	        is coerced to unicode first. Empty entries are skipped, and each
	        word is matched literally.
	        """
	        if not words:
	            return text
	        text = force_unicode(text)
	        for w in words:
	            if not w:
	                # An empty word would match at every word boundary.
	                continue
	            # re.escape keeps words such as 'c++' from being read as regex
	            # syntax.
	            pattern = r'(([</]*?)\b%s\b)' % re.escape(w.lower())
	            regex = re.compile(pattern, re.I | re.M)
	            text = regex.sub(self._repl_words, text)
	        return text

	    def _repl_sentences(self, match):
	        """Wrap a matched phrase in ``<u>...</u>``."""
	        return '<u>' + match.group(0) + "</u>"

	    def sentence(self, text, sentence, limit=4):
	        """
	        Underline case-insensitive occurrences of the phrase ``sentence``.

	        The phrase is split on whitespace and its words may be separated by
	        any amount of whitespace in ``text``. ``text`` is returned untouched
	        when the phrase has no words or more than ``limit`` words.
	        """
	        tokens = [t for t in re.split(r'\s+', sentence) if t]
	        # A phrase without words would compile to a pattern that matches
	        # the empty string at every position.
	        if not tokens or len(tokens) > limit:
	            return text
	        text = force_unicode(text)
	        joined = r'(\s*?)'.join(['(%s)' % re.escape(t) for t in tokens])
	        regex = re.compile('(' + joined + ')', re.I | re.M)
	        return regex.sub(self._repl_sentences, text)

	    def light(self, text, picks, limit=4):
	        """
	        Underline every pick in ``text``.

	        Picks without a space are handled by ``words``; picks containing a
	        space are handled one by one by ``sentence`` with ``limit``.
	        """
	        if not picks:
	            return text
	        single = [item for item in picks if ' ' not in item]
	        phrases = [item for item in picks if ' ' in item]
	        if single:
	            text = self.words(text, single)
	        for phrase in phrases:
	            text = self.sentence(text, phrase, limit=limit)
	        return text

	if __name__ == '__main__':
	    # Demo run: clean profile.txt, save the result, then underline a few
	    # sample picks and save that version as well.
	    hb = HtmlLighter(u'http://www.51voa.com')

	    with open('profile.txt', 'r') as f:
	        text = hb.strip_tags(f.read())
	    with open('profile.clean.html', 'w+') as f:
	        f.write(text.encode('utf8'))

	    picks = ["News", "strong", "hurricane", "the storm", "according to",
	             "scientists wrote the report"]
	    text2 = hb.light(text, picks)
	    with open('profile.clean2.html', 'w+') as f:
	        f.write(text2.encode('utf8'))