# waylan/htmlspellchecker.py

## htmlspellchecker.py
try:
    from html.parser import HTMLParser
except ImportError:
    from HTMLParser import HTMLParser
from enchant.checker import SpellChecker


class HTMLSpellChecker(HTMLParser):
    """
    Check the spelling of an HTML document using the given SpellChecker.

    Only the text nodes (and title and alt attributes) are spellchecked by
    default. The content of a tag can be excluded by passing that tag name
    to `exclude`. An attribute's content will be included by passing that
    attribute name to `attrs`. Declarations, processing instructions, and
    unknown declarations are always ignored. Character and entity
    references are ignored if they are not converted to text by the parser.

    Every misspelled word is reported on standard output in the form
    `ERROR (line N): word`.

    Keywords:

    *   `checker`: A pyenchant SpellChecker instance. Only `set_text()` and
        iteration (yielding objects with a `word` attribute) are used.
    *   `exclude`: An iterable of tag names to exclude from checking.
        Defaults to nothing. Note that `<script>` and `<style>` tags
        are always skipped and do not need to be included in the exclude list.
    *   `attrs`: An iterable of attribute names to include in the checker.
        The content of each included attribute is spell checked regardless
        of the tag it is associated with. Defaults to `['title', 'alt']`.
        Note that passing in a list of attributes is not additive. Only
        the attributes passed in will be checked.
    *   `comments`: Check the content of comments. Defaults to `False`.
    *   Any keywords accepted by the `html.parser.HTMLParser` class.
        Note that, unlike with `HTMLParser`, 'convert_charrefs' defaults
        to `True`.
    """

    def __init__(self, checker, exclude=None, attrs=None, comments=False, **kwargs):
        self.checker = checker
        # <script> and <style> hold code rather than prose: always skip them.
        self.excluded_tags = self._name_set(exclude or ()) | set(['script', 'style'])
        # An explicit empty `attrs` means "check no attributes at all".
        self.included_attrs = self._name_set(['title', 'alt'] if attrs is None else attrs)
        self.check_comments = comments
        kwargs.setdefault('convert_charrefs', True)
        try:
            super(HTMLSpellChecker, self).__init__(**kwargs)
        except TypeError:
            # Python 2: HTMLParser is an old-style class (so super() fails)
            # and its constructor takes no keyword arguments.
            # NOTE: an unsupported keyword on Python 3 ends up here as well,
            # in which case all keywords are dropped.
            HTMLParser.__init__(self)

    @staticmethod
    def _name_set(names):
        """
        Return `names` as a set of lower-cased strings.

        A lone string is treated as one name instead of being split into
        characters. Names are lower-cased because HTMLParser reports tag and
        attribute names in lower case.
        """
        if isinstance(names, str):
            names = [names]
        return set(name.lower() for name in names)

    def reset(self):
        """Clear the stack of open tags and reset the underlying parser."""
        self.stack = []
        # Called directly rather than through super() so that this also
        # works with Python 2's old-style HTMLParser class.
        HTMLParser.reset(self)

    def checkspelling(self, txt):
        """
        Spell check `txt` line by line and print each misspelled word.

        Lines are numbered starting from the parser's current line
        (`self.lineno`). Blank lines are skipped.
        """
        for n, line in enumerate(txt.split('\n'), start=self.lineno):
            if not line.strip():
                continue
            self.checker.set_text(line)
            for err in self.checker:
                # Single pre-formatted argument: valid (and identical output)
                # under both the Python 2 statement and Python 3 function.
                print("ERROR (line %d): %s" % (n, err.word))

    def in_excluded_tag(self):
        """Return True if any currently open tag is an excluded tag."""
        # Could be in a child (or grandchild) element of an excluded tag.
        # Therefore, check that no excluded tags are anywhere in stack.
        return not self.excluded_tags.isdisjoint(self.stack)

    def handle_starttag(self, tag, attrs):
        """Record the opened `tag` and spell check its included attributes."""
        # Force a <p> tag to always close any previously unclosed <p> tag.
        # An unclosed child's exclusion should not carry to the parent's sibling.
        # TODO: perhaps have all block level elements close a <p> tag.
        if tag == 'p' and tag in self.stack:
            self.handle_endtag(tag)

        self.stack.append(tag)
        if self.in_excluded_tag():
            return
        for name, value in attrs:
            # Attributes written without a value (e.g. `<img alt>`) are
            # reported with a value of None; there is nothing to check.
            if value and name in self.included_attrs:
                self.checkspelling(value)

    def handle_endtag(self, tag):
        """Pop `tag`, and any unclosed children above it, off the stack."""
        # Some child tags may not have been closed.
        # Remove all unclosed children from stack.
        # If a tag was never opened just ignore it.
        if tag in self.stack:
            while self.stack.pop() != tag:
                pass

    def handle_data(self, data):
        """Spell check a text node unless it is inside an excluded tag."""
        if not self.in_excluded_tag():
            self.checkspelling(data)

    def handle_comment(self, data):
        """Spell check a comment when comment checking is enabled."""
        if self.check_comments and not self.in_excluded_tag():
            self.checkspelling(data)


# Sample document for a demonstration run. The misspellings are deliberate:
# they sit in text nodes, in `alt`/`title` attributes, in a comment, and
# inside the tags that the run below excludes.
html = """
<div>
    <script># A script</script>

    <p>
        This is line one of a parragraph.
        This is <em>line two</em>.
    </p>

    <p>
        <img src="example.jpg"
             alt="Some allt text"
             title="A tittle for the image" />
        Some <a href="example.html" title="link ttitle">textt</a>
        in parragraph 2. Some <code>codde</code>.
    </p>
    <!-- a commment -->
</div>
<section>
    <!-- Comment in section -->
    <p>
        A <strong>grandchildr</strong>
        <img src="example.jpg"
             alt="Some alt text in section"
             title="A title for the image in section" />
    </p>
</section>
"""


# Check the sample with a US English dictionary, skipping <code> and
# <section> content and including comments.
checker = HTMLSpellChecker(
    SpellChecker('en_US'),
    exclude=['code', 'section'],
    comments=True,
)
checker.feed(html)
	try:
	from html.parser import HTMLParser
	except ImportError:
	from HTMLParser import HTMLParser
	from enchant.checker import SpellChecker


	class HTMLSpellChecker(HTMLParser):
	    """
	    Check the spelling of an HTML document using the given SpellChecker.

	    Only the text nodes (and title and alt attributes) are spellchecked by
	    default. The content of a tag can be excluded by passing that tag name
	    to `exclude`. An attribute's content will be included by passing that
	    attribute name to `attrs`. Declarations, processing instructions, and
	    unknown declarations are always ignored. Character and entity
	    references are ignored if they are not converted to text by the parser.

	    Every misspelled word is reported on standard output in the form
	    `ERROR (line N): word`.

	    Keywords:

	    *   `checker`: A pyenchant SpellChecker instance. Only `set_text()` and
	        iteration (yielding objects with a `word` attribute) are used.
	    *   `exclude`: An iterable of tag names to exclude from checking.
	        Defaults to nothing. Note that `<script>` and `<style>` tags
	        are always skipped and do not need to be included in the exclude list.
	    *   `attrs`: An iterable of attribute names to include in the checker.
	        The content of each included attribute is spell checked regardless
	        of the tag it is associated with. Defaults to `['title', 'alt']`.
	        Note that passing in a list of attributes is not additive. Only
	        the attributes passed in will be checked.
	    *   `comments`: Check the content of comments. Defaults to `False`.
	    *   Any keywords accepted by the `html.parser.HTMLParser` class.
	        Note that, unlike with `HTMLParser`, 'convert_charrefs' defaults
	        to `True`.
	    """

	    def __init__(self, checker, exclude=None, attrs=None, comments=False, **kwargs):
	        self.checker = checker
	        # <script> and <style> hold code rather than prose: always skip them.
	        self.excluded_tags = self._name_set(exclude or ()) | set(['script', 'style'])
	        # An explicit empty `attrs` means "check no attributes at all".
	        self.included_attrs = self._name_set(['title', 'alt'] if attrs is None else attrs)
	        self.check_comments = comments
	        kwargs.setdefault('convert_charrefs', True)
	        try:
	            super(HTMLSpellChecker, self).__init__(**kwargs)
	        except TypeError:
	            # Python 2: HTMLParser is an old-style class (so super() fails)
	            # and its constructor takes no keyword arguments.
	            # NOTE: an unsupported keyword on Python 3 ends up here as well,
	            # in which case all keywords are dropped.
	            HTMLParser.__init__(self)

	    @staticmethod
	    def _name_set(names):
	        """
	        Return `names` as a set of lower-cased strings.

	        A lone string is treated as one name instead of being split into
	        characters. Names are lower-cased because HTMLParser reports tag and
	        attribute names in lower case.
	        """
	        if isinstance(names, str):
	            names = [names]
	        return set(name.lower() for name in names)

	    def reset(self):
	        """Clear the stack of open tags and reset the underlying parser."""
	        self.stack = []
	        # Called directly rather than through super() so that this also
	        # works with Python 2's old-style HTMLParser class.
	        HTMLParser.reset(self)

	    def checkspelling(self, txt):
	        """
	        Spell check `txt` line by line and print each misspelled word.

	        Lines are numbered starting from the parser's current line
	        (`self.lineno`). Blank lines are skipped.
	        """
	        for n, line in enumerate(txt.split('\n'), start=self.lineno):
	            if not line.strip():
	                continue
	            self.checker.set_text(line)
	            for err in self.checker:
	                # Single pre-formatted argument: valid (and identical output)
	                # under both the Python 2 statement and Python 3 function.
	                print("ERROR (line %d): %s" % (n, err.word))

	    def in_excluded_tag(self):
	        """Return True if any currently open tag is an excluded tag."""
	        # Could be in a child (or grandchild) element of an excluded tag.
	        # Therefore, check that no excluded tags are anywhere in stack.
	        return not self.excluded_tags.isdisjoint(self.stack)

	    def handle_starttag(self, tag, attrs):
	        """Record the opened `tag` and spell check its included attributes."""
	        # Force a <p> tag to always close any previously unclosed <p> tag.
	        # An unclosed child's exclusion should not carry to the parent's sibling.
	        # TODO: perhaps have all block level elements close a <p> tag.
	        if tag == 'p' and tag in self.stack:
	            self.handle_endtag(tag)

	        self.stack.append(tag)
	        if self.in_excluded_tag():
	            return
	        for name, value in attrs:
	            # Attributes written without a value (e.g. `<img alt>`) are
	            # reported with a value of None; there is nothing to check.
	            if value and name in self.included_attrs:
	                self.checkspelling(value)

	    def handle_endtag(self, tag):
	        """Pop `tag`, and any unclosed children above it, off the stack."""
	        # Some child tags may not have been closed.
	        # Remove all unclosed children from stack.
	        # If a tag was never opened just ignore it.
	        if tag in self.stack:
	            while self.stack.pop() != tag:
	                pass

	    def handle_data(self, data):
	        """Spell check a text node unless it is inside an excluded tag."""
	        if not self.in_excluded_tag():
	            self.checkspelling(data)

	    def handle_comment(self, data):
	        """Spell check a comment when comment checking is enabled."""
	        if self.check_comments and not self.in_excluded_tag():
	            self.checkspelling(data)


	# Sample document for a demonstration run. The misspellings are deliberate:
	# they sit in text nodes, in `alt`/`title` attributes, in a comment, and
	# inside the tags that the run below excludes.
	html = """
	<div>
	<script># A script</script>

	<p>
	This is line one of a parragraph.
	This is <em>line two</em>.
	</p>

	<p>
	<img src="example.jpg"
	alt="Some allt text"
	title="A tittle for the image" />
	Some <a href="example.html" title="link ttitle">textt</a>
	in parragraph 2. Some <code>codde</code>.
	</p>
	<!-- a commment -->
	</div>
	<section>
	<!-- Comment in section -->
	<p>
	A <strong>grandchildr</strong>
	<img src="example.jpg"
	alt="Some alt text in section"
	title="A title for the image in section" />
	</p>
	</section>
	"""


	# Check the sample with a US English dictionary, skipping <code> and
	# <section> content and including comments.
	checker = HTMLSpellChecker(
	    SpellChecker('en_US'),
	    exclude=['code', 'section'],
	    comments=True,
	)
	checker.feed(html)