Skip to content

Instantly share code, notes, and snippets.

@MattDietz
Created April 28, 2011 07:45
Show Gist options
  • Save MattDietz/945979 to your computer and use it in GitHub Desktop.
Save MattDietz/945979 to your computer and use it in GitHub Desktop.
pyhole
diff --git a/pyhole/utils.py b/pyhole/utils.py
index 86a0180..5ff46e3 100644
--- a/pyhole/utils.py
+++ b/pyhole/utils.py
@@ -71,18 +71,20 @@ def spawn(func):
def decode_entities(html):
"""Strip HTML entities from a string"""
- html = re.sub("<[^>]*?>", "", html)
- html = re.sub("&nbsp;", " ", html)
- html = re.sub("&amp;", "&", html)
- html = re.sub("&quot;", "\"", html)
- html = re.sub("&#8212;", "-", html)
- html = re.sub("&#8217;", "'", html)
- html = re.sub("&#8220;", "\"", html)
- html = re.sub("&#8221;", "\"", html)
- html = re.sub("&#8230;", "...", html)
- html = filter(lambda x: x in string.printable, html)
-
- return html.strip()
+ entities = [
+ ("<[^>]*?>",""),
+ ("&nbsp;"," "),
+ ("&amp;","&"),
+ ("&quot;","\""),
+ ("&#8212;","-"),
+ ("&#8217;","'"),
+ ("&#8220;","\""),
+ ("&#8221;","\""),
+ ("&#8230;","...")]
+
+ # Apply each (pattern, replacement) pair to the text in list order
+ html = reduce(lambda a,b: re.sub(b[0], b[1], a), entities, html)
+ return filter(lambda x: ord(x) > 9 and ord(x) < 127, html).strip()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment