Skip to content

Instantly share code, notes, and snippets.

@hanleybrand

hanleybrand/clean_attrs.py

Last active Mar 19, 2019
Embed
What would you like to do?
Remove unwanted attributes from tags in HTML snippets with BeautifulSoup (3.2.1 - might work with BS4)
from BeautifulSoup import BeautifulSoup
def clean(html):
    """Strip every non-whitelisted attribute from the tags inside *html*.

    The tree is modified in place; the same object is also returned so the
    call can be chained.

    Args:
        html: A parsed BeautifulSoup 3 document (or tag). It must provide
            ``findAll(True)`` returning its descendant tags, each of which
            exposes ``attrs`` as a sequence of ``(name, value)`` pairs and
            supports ``del tag[name]``.

    Returns:
        The ``html`` object that was passed in.
    """
    whitelist = frozenset([
        'backColor', 'backcolor', 'bgcolor',
        'color', 'fg', 'fontName', 'fontSize',
        'fontname', 'fontsize', 'href', 'name',
        'textColor', 'textcolor',
    ])
    # The root's own attributes are discarded wholesale, whitelisted or not.
    # NOTE(review): ``None`` is kept for backward compatibility; this looks
    # intended for a BeautifulSoup document object -- confirm it is safe if a
    # plain tag is ever passed as the root.
    html.attrs = None
    for e in html.findAll(True):
        # Collect the names to drop *before* deleting anything: in
        # BeautifulSoup 3, ``del e[name]`` removes entries from ``e.attrs``
        # itself, and deleting while iterating that list makes the loop skip
        # the attribute that follows each removed one.
        unwanted = [attribute[0] for attribute in list(e.attrs)
                    if attribute[0] not in whitelist]
        for name in unwanted:
            del e[name]
    return html
# Demo input: a prose caption followed by two links that each carry a
# ``rel`` attribute, which is not on the whitelist used by clean().
test = """
A full view of the location of the Ben Shahn mural shown in the previous image.
The mural is a fresco And is located at the entrance to the school gym. Below it are a group of photographs by several well known photographers ((including Dorothea Lange and Arthur Rothstein, documenti
ng the early days of the community.
<a href="http://music.columbia.edu/roosevelt/pop_mural_video.html"
rel="nofollow">music.columbia.edu/roosevelt/pop_mural_video.html</a>
<a href="http://music.columbia.edu/roosevelt/pop_mural.html" rel="no
follow">music.columbia.edu/roosevelt/pop_mural.html</a>.
"""
# clean() edits the parsed tree in place and hands the same object back,
# so the parse and the clean-up can be chained into one assignment.
soup = clean(BeautifulSoup(test))
# Parenthesised form prints the same text as the bare print statement.
print(soup)
# output
"""
A full view of the location of the Ben Shahn mural shown in the previous image.
The mural is a fresco And is located at the entrance to the school gym. Below it are a group of photographs by several well known photographers ((including Dorothea Lange and Arthur Rothstein, documenti
ng the early days of the community.
<a href="http://music.columbia.edu/roosevelt/pop_mural_video.html">music.columbia.edu/roosevelt/pop_mural_video.html</a>
<a href="http://music.columbia.edu/roosevelt/pop_mural.html">music.columbia.edu/roosevelt/pop_mural.html</a>.
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment