Created
April 11, 2015 00:50
-
-
Save waylan/86026a6baca01a14ba0c to your computer and use it in GitHub Desktop.
Check the spelling of an HTML document.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
try: | |
from html.parser import HTMLParser | |
except ImportError: | |
from HTMLParser import HTMLParser | |
from enchant.checker import SpellChecker | |
class HTMLSpellChecker(HTMLParser):
    """
    Check the spelling of an HTML document using the given SpellChecker.

    Only the text nodes (and title and alt attributes) are spellchecked by
    default. The content of a tag can be excluded by passing that tag name
    to `exclude`. An attribute's content will be included by passing that
    attribute name to `attrs`. Declarations, processing instructions, and
    unknown declarations are always ignored. Character and entity
    references are ignored if they are not converted to text by the parser.

    Each misspelled word is reported on standard output as
    ``ERROR (line N): word``. Text nodes and comments are reported with the
    line the word appears on; attribute values are reported relative to the
    line on which their tag starts.

    Keywords:

    * `checker`: A pyenchant SpellChecker instance (any object providing
      `set_text(text)` and iteration over errors with a `word` attribute).
    * `exclude`: An iterable of tag names to exclude from checking.
      Defaults to empty. Note that `<script>` and `<style>` tags
      are always skipped and do not need to be included in the exclude list.
    * `attrs`: An iterable of attribute names to include in the checker.
      The content of each included attribute is spell checked regardless
      of the tag it is associated with. Defaults to `['title', 'alt']`.
      Note that passing in a list of attributes is not additive. Only
      the attributes passed in will be checked.
    * `comments`: Check the content of comments. Defaults to `False`.
    * Any keywords accepted by the `html.parser.HTMLParser` class.
      Note that, unlike with `HTMLParser`, 'convert_charrefs' defaults
      to `True`.
    """

    # Elements that never have content or an end tag. They must not be kept
    # on the open-element stack, otherwise an excluded void tag (e.g. `br`)
    # would silently exclude everything that follows it.
    VOID_ELEMENTS = frozenset([
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ])

    # Raw-text elements whose content is never natural language.
    ALWAYS_EXCLUDED = ('script', 'style')

    def __init__(self, checker, exclude=None, attrs=None, comments=False, **kwargs):
        self.checker = checker
        # The parser reports tag and attribute names in lower case, so
        # normalise the caller's names the same way. Any iterable is accepted.
        self.excluded_tags = set(name.lower() for name in (exclude or ()))
        self.excluded_tags.update(self.ALWAYS_EXCLUDED)
        if attrs is None:
            attrs = ['title', 'alt']
        self.included_attrs = set(name.lower() for name in attrs)
        self.check_comments = comments
        kwargs.setdefault('convert_charrefs', True)
        try:
            super(HTMLSpellChecker, self).__init__(**kwargs)
        except TypeError:
            # Python 2 (old-style base class) or a parser that does not
            # accept the given keywords: fall back to the bare constructor.
            HTMLParser.__init__(self)

    def reset(self):
        """Reset the parser and forget all currently open elements."""
        self.stack = []
        # Explicit base call rather than super(): the base class is an
        # old-style class on Python 2.
        HTMLParser.reset(self)

    def checkspelling(self, txt):
        """
        Spell check `txt` line by line and print each misspelled word.

        Lines are numbered starting from the parser's current line number.
        Empty or missing text (e.g. a valueless attribute) is ignored.
        """
        if not txt:
            return
        for n, line in enumerate(txt.split('\n'), start=self.lineno):
            if not line.strip():
                continue
            self.checker.set_text(line)
            for err in self.checker:
                # Single-argument call form works on both Python 2 and 3.
                print("ERROR (line %d): %s" % (n, err.word))

    def in_excluded_tag(self):
        """Return True if any currently open element is an excluded tag."""
        # Could be in a child (or grandchild) element of an excluded tag.
        # Therefore, check that no excluded tags are anywhere in stack.
        return bool(self.excluded_tags.intersection(self.stack))

    def handle_starttag(self, tag, attrs):
        """Track the opened element and spell check its included attributes."""
        # Force a <p> tag to always close any previously unclosed <p> tag.
        # An unclosed child's exclusion should not carry to the parent's sibling.
        # TODO: perhaps have all block level elements close a <p> tag.
        if tag == 'p' and tag in self.stack:
            self.handle_endtag(tag)
        # An excluded tag's own attributes are skipped as well.
        excluded = tag in self.excluded_tags or self.in_excluded_tag()
        if tag not in self.VOID_ELEMENTS:
            self.stack.append(tag)
        if excluded:
            return
        for name, value in attrs:
            if name in self.included_attrs:
                self.checkspelling(value)

    def handle_endtag(self, tag):
        """Close `tag` together with any of its still-open descendants."""
        # Some child tags may not have been closed.
        # Remove all unclosed children from stack.
        # If a tag was never opened just ignore it.
        if tag not in self.stack:
            return
        while self.stack:
            if self.stack.pop() == tag:
                break

    def handle_data(self, data):
        """Spell check a text node unless it is inside an excluded element."""
        if not self.in_excluded_tag():
            self.checkspelling(data)

    def handle_comment(self, data):
        """Spell check a comment when comment checking is enabled."""
        if self.check_comments and not self.in_excluded_tag():
            self.checkspelling(data)
# Demonstration document containing deliberate misspellings in text nodes,
# `alt`/`title` attributes, comments, and inside excluded elements.
html = """
<div>
<script># A script</script>
<p>
This is line one of a parragraph.
This is <em>line two</em>.
</p>
<p>
<img src="example.jpg"
alt="Some allt text"
title="A tittle for the image" />
Some <a href="example.html" title="link ttitle">textt</a>
in parragraph 2. Some <code>codde</code>.
</p>
<!-- a commment -->
</div>
<section>
<!-- Comment in section -->
<p>
A <strong>grandchildr</strong>
<img src="example.jpg"
alt="Some alt text in section"
title="A title for the image in section" />
</p>
</section>
"""

# Skip everything inside <code> and <section>; also check comment text.
checker = HTMLSpellChecker(SpellChecker('en_US'), exclude=['code', 'section'], comments=True)
checker.feed(html)
# Signal end of input so the parser processes anything it is still buffering.
checker.close()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment