Skip to content

Instantly share code, notes, and snippets.

@FZambia
Last active December 27, 2015 01:59
Show Gist options
  • Save FZambia/7249557 to your computer and use it in GitHub Desktop.
Save FZambia/7249557 to your computer and use it in GitHub Desktop.
utils to clean html
# coding: utf-8
# bleach==1.2.2
# beautifulsoup4==4.3.2
from bleach import clean
from bs4 import BeautifulSoup
# HTML tags that clean_html() passes to bleach as the tag whitelist.
# Each tag is listed exactly once.
ALLOWED_TAGS = [
    'font', 'del', 'br', 'a', 'p', 'img', 'blockquote',
    'span', 'strong', 'em', 'pre', 'code', 'ul', 'ol',
    'li', 'iframe', 'i', 'hr', 'h1', 'h2',
    'h3', 'h4', 'h5', 'h6', 'table', 'tbody', 'th', 'tr', 'td',
]
# CSS property names that clean_html() passes to bleach as the style
# whitelist. Kept in alphabetical order.
ALLOWED_STYLES = [
    'background',
    'background-color',
    'background-image',
    'background-position',
    'background-repeat',
    'background-size',
    'border',
    'border-bottom',
    'border-bottom-color',
    'border-bottom-left-radius',
    'border-bottom-right-radius',
    'border-bottom-style',
    'border-bottom-width',
    'border-collapse',
    'border-color',
    'border-left',
    'border-left-color',
    'border-left-style',
    'border-left-width',
    'border-radius',
    'border-right',
    'border-right-color',
    'border-right-style',
    'border-right-width',
    'border-spacing',
    'border-style',
    'border-top',
    'border-top-color',
    'border-top-left-radius',
    'border-top-right-radius',
    'border-top-style',
    'border-top-width',
    'border-width',
    'bottom',
    'box-shadow',
    'clear',
    'color',
    'content',
    'cursor',
    'display',
    'float',
    'font',
    'font-family',
    'font-size',
    'font-size-adjust',
    'font-stretch',
    'font-style',
    'font-variant',
    'font-weight',
    'height',
    'left',
    'line-height',
    'list-style',
    'list-style-image',
    'list-style-position',
    'list-style-type',
    'margin',
    'margin-bottom',
    'margin-left',
    'margin-right',
    'margin-top',
    'max-height',
    'max-width',
    'min-height',
    'min-width',
    'opacity',
    'outline',
    'outline-color',
    'outline-offset',
    'outline-style',
    'outline-width',
    'overflow',
    'overflow-style',
    'overflow-x',
    'overflow-y',
    'padding',
    'padding-bottom',
    'padding-left',
    'padding-right',
    'padding-top',
    'position',
    'quotes',
    'resize',
    'right',
    'text-align',
    'text-align-last',
    'text-decoration',
    'text-emphasis',
    'text-height',
    'text-indent',
    'text-justify',
    'text-outline',
    'text-overflow',
    'text-shadow',
    'text-transform',
    'text-wrap',
    'top',
    'vertical-align',
    'visibility',
    'white-space',
    'white-space-collapse',
    'width',
    'word-break',
    'word-spacing',
    'word-wrap',
    'z-index',
]
# Attribute names that clean_html() passes to bleach as the attribute
# whitelist.
ALLOWED_ATTRIBUTES = [
    'href',
    'style',
    'target',
    'title',
    'width',
    'height',
    'src',
    'frameborder',
    'allowfullscreen',
    'webkitAllowFullScreen',
    'mozallowfullscreen',
    'color',
]
def clean_html(html):
    """Sanitize *html* with bleach using this module's whitelists.

    The markup is passed to ``bleach.clean`` together with ALLOWED_TAGS,
    ALLOWED_ATTRIBUTES and ALLOWED_STYLES. ``strip=False`` and
    ``strip_comments=True`` are forwarded unchanged; see the bleach
    documentation for how non-whitelisted markup and HTML comments are
    treated under those flags.
    """
    options = {
        'tags': ALLOWED_TAGS,
        'attributes': ALLOWED_ATTRIBUTES,
        'styles': ALLOWED_STYLES,
        'strip': False,
        'strip_comments': True,
    }
    return clean(html, **options)
def split_by_separator(content, separator):
    """Return the part of *content* that precedes the first *separator*.

    If *separator* does not occur in *content*, *content* is returned
    unchanged. An empty or ``None`` separator also returns *content*
    unchanged, because there is nothing meaningful to split on.
    """
    # An empty separator is "contained" in every string, and splitting on
    # it raises ValueError, so treat it as "no separator".
    if not separator:
        return content
    head, found, _ = content.partition(separator)
    # partition() leaves `found` empty when the separator is absent.
    return head if found else content
def prettify_html(content):
    """Parse *content* with BeautifulSoup's "html.parser" backend and
    return the result of the parsed tree's ``prettify()`` call.
    """
    return BeautifulSoup(content, "html.parser").prettify()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment