Skip to content

Instantly share code, notes, and snippets.

@waylan
Created April 11, 2015 00:50
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save waylan/86026a6baca01a14ba0c to your computer and use it in GitHub Desktop.
Save waylan/86026a6baca01a14ba0c to your computer and use it in GitHub Desktop.
Check the spelling of an HTML document.
try:
from html.parser import HTMLParser
except ImportError:
from HTMLParser import HTMLParser
from enchant.checker import SpellChecker
class HTMLSpellChecker(HTMLParser):
    """
    Check the spelling of an HTML document using the given SpellChecker.

    Only the text nodes (and title and alt attributes) are spellchecked by
    default. The content of a tag can be excluded by passing that tag name
    to `exclude`. An attribute's content will be included by passing that
    attribute name to `attrs`. Declarations, processing instructions, and
    unknown declarations are always ignored. Character and entity
    references are ignored if they are not converted to text by the parser.

    Each misspelled word is reported by printing a line of the form
    ``ERROR (line N): word`` to standard output.

    Keywords:

    * `checker`: A pyenchant SpellChecker instance (any object providing
      `set_text(text)` and iteration over errors with a `word` attribute).
    * `exclude`: An iterable of tag names to exclude from checking.
      Defaults to empty. Note that `<script>` and `<style>` tags are
      always skipped and do not need to be included in the exclude list.
    * `attrs`: An iterable of attribute names to include in the checker.
      The content of each included attribute is spell checked regardless
      of the tag it is associated with. Defaults to `['title', 'alt']`.
      Note that passing in a list of attributes is not additive. Only
      the attributes passed in will be checked.
    * `comments`: Check the content of comments. Defaults to `False`.
    * Any keywords accepted by the `html.parser.HTMLParser` class.
      Note that, unlike with `HTMLParser`, 'convert_charrefs' defaults
      to `True`.
    """

    # Elements that never have content or an end tag. They must not be
    # pushed on the open-element stack: written without a trailing slash
    # (`<img>`), nothing would ever pop them again.
    void_tags = frozenset([
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ])

    def __init__(self, checker, exclude=None, attrs=None, comments=False, **kwargs):
        self.checker = checker
        # <script> and <style> hold code, not prose; always skip them.
        self.excluded_tags = self._name_set(exclude or ()) | {'script', 'style'}
        # An explicit empty `attrs` means "check no attributes at all".
        self.included_attrs = self._name_set(
            attrs if attrs is not None else ['title', 'alt']
        )
        self.check_comments = comments
        kwargs.setdefault('convert_charrefs', True)
        try:
            super(HTMLSpellChecker, self).__init__(**kwargs)
        except TypeError:
            # Python 2 (old-style class, so super() fails) or a parser that
            # does not accept these keywords: use the plain constructor.
            HTMLParser.__init__(self)

    @staticmethod
    def _name_set(names):
        """Return `names` as a set of lower-cased tag/attribute names."""
        if isinstance(names, str):
            # A lone string is one name, not an iterable of characters.
            names = [names]
        # The parser reports tag and attribute names lower-cased.
        return set(name.lower() for name in names)

    def reset(self):
        """Reset the parser and forget all currently open elements."""
        self.stack = []
        HTMLParser.reset(self)

    def checkspelling(self, txt):
        """Spell check `txt` line by line and print each misspelled word.

        Line numbers are counted from the parser's current line, so a
        multi-line text node reports the correct source line per word.
        """
        if not txt:
            # Valueless attributes (`<img alt>`) arrive as None.
            return
        for n, line in enumerate(txt.split('\n'), start=self.lineno):
            if line.strip():
                self.checker.set_text(line)
                for err in self.checker:
                    # Single pre-formatted argument: prints identically
                    # under Python 2 and Python 3.
                    print("ERROR (line %d): %s" % (n, err.word))

    def in_excluded_tag(self):
        """Return True if any currently open element is an excluded tag."""
        # Could be in a child (or grandchild) element of an excluded tag.
        # Therefore, check that no excluded tags are anywhere in stack.
        return bool(self.excluded_tags.intersection(self.stack))

    def handle_starttag(self, tag, attrs):
        """Track the newly opened element and check its included attributes."""
        # Force a <p> tag to always close any previously unclosed <p> tag,
        # as an unclosed child's exclusion should not carry over to the
        # parent's sibling.
        # TODO: perhaps have all block level elements close a <p> tag.
        if tag == 'p' and tag in self.stack:
            self.handle_endtag(tag)
        is_void = tag in self.void_tags
        if not is_void:
            self.stack.append(tag)
        # A void tag is never on the stack, so test its own exclusion here.
        excluded = self.in_excluded_tag() or (
            is_void and tag in self.excluded_tags
        )
        if excluded:
            return
        for k, v in attrs:
            if k in self.included_attrs:
                self.checkspelling(v)

    def handle_endtag(self, tag):
        """Close `tag` together with any children left unclosed inside it."""
        # Some child tags may not have been closed.
        # Remove all unclosed children from stack.
        # If a tag was never opened just ignore it.
        if tag in self.stack:
            while self.stack:
                t = self.stack.pop()
                if t == tag:
                    break

    def handle_data(self, data):
        """Spell check a text node unless it sits inside an excluded tag."""
        if not self.in_excluded_tag():
            self.checkspelling(data)

    def handle_comment(self, data):
        """Spell check a comment when enabled and not inside an excluded tag."""
        if self.check_comments and not self.in_excluded_tag():
            self.checkspelling(data)
# Demo document containing deliberate misspellings in text nodes, in
# `alt`/`title` attributes, inside <code>, inside a comment and inside a
# <section>, so that each inclusion/exclusion rule gets exercised.
html = """
<div>
<script># A script</script>
<p>
This is line one of a parragraph.
This is <em>line two</em>.
</p>
<p>
<img src="example.jpg"
alt="Some allt text"
title="A tittle for the image" />
Some <a href="example.html" title="link ttitle">textt</a>
in parragraph 2. Some <code>codde</code>.
</p>
<!-- a commment -->
</div>
<section>
<!-- Comment in section -->
<p>
A <strong>grandchildr</strong>
<img src="example.jpg"
alt="Some alt text in section"
title="A title for the image in section" />
</p>
</section>
"""

# Run the demo: skip everything inside <code> and <section>, and spell
# check comment text in addition to the default text nodes and attributes.
checker = HTMLSpellChecker(
    SpellChecker('en_US'),
    exclude=['code', 'section'],
    comments=True,
)
checker.feed(html)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment