Skip to content

Instantly share code, notes, and snippets.

@nguaman
Forked from revotu/remove_attrs.py
Created June 25, 2018 23:58
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nguaman/e40e2bf8ee34e3d19521fd9fbd0468d8 to your computer and use it in GitHub Desktop.
Save nguaman/e40e2bf8ee34e3d19521fd9fbd0468d8 to your computer and use it in GitHub Desktop.
Remove all HTML attributes with BeautifulSoup, except on some tags (<a>, <img>, ...)
from bs4 import BeautifulSoup
def _remove_all_attrs(soup):
    """Remove all attributes from every tag in *soup*.

    The tree is modified in place: each tag returned by
    ``soup.find_all(True)`` gets its ``attrs`` replaced with a new empty
    dict.

    Args:
        soup: A parsed BeautifulSoup object (anything exposing
            ``find_all`` whose results have an ``attrs`` attribute).

    Returns:
        The same ``soup`` object that was passed in.
    """
    for tag in soup.find_all(True):
        tag.attrs = {}
    return soup
def _remove_all_attrs_except(soup, whitelist=('a', 'img')):
    """Remove all attributes from every tag except whitelisted tags.

    Tags whose ``name`` is in *whitelist* keep all of their attributes;
    every other tag gets its ``attrs`` replaced with a new empty dict.
    The tree is modified in place.

    Args:
        soup: A parsed BeautifulSoup object (anything exposing
            ``find_all`` whose results have ``name`` and ``attrs``).
        whitelist: Iterable of tag names that are left untouched.
            Defaults to ``('a', 'img')``.

    Returns:
        The same ``soup`` object that was passed in.
    """
    # Build the lookup set once instead of scanning a list per tag.
    allowed = frozenset(whitelist)
    for tag in soup.find_all(True):
        if tag.name not in allowed:
            tag.attrs = {}
    return soup
def _remove_all_attrs_except_saving(
    soup,
    whitelist=('a', 'img'),
    keep_attrs=('src', 'href'),
):
    """Remove all attributes, keeping only selected ones on selected tags.

    Tags whose ``name`` is not in *whitelist* get their ``attrs``
    replaced with a new empty dict. Whitelisted tags keep only the
    attributes named in *keep_attrs*; all others are deleted from the
    tag's existing ``attrs`` mapping. The tree is modified in place.

    Args:
        soup: A parsed BeautifulSoup object (anything exposing
            ``find_all`` whose results have ``name`` and ``attrs``).
        whitelist: Iterable of tag names allowed to retain attributes.
            Defaults to ``('a', 'img')``.
        keep_attrs: Iterable of attribute names preserved on
            whitelisted tags. Defaults to ``('src', 'href')``.

    Returns:
        The same ``soup`` object that was passed in.
    """
    # Build both lookup sets once, outside the loops.
    allowed_tags = frozenset(whitelist)
    kept = frozenset(keep_attrs)
    for tag in soup.find_all(True):
        if tag.name not in allowed_tags:
            tag.attrs = {}
            continue
        # Snapshot the keys first: deleting from a dict while iterating
        # over it directly raises RuntimeError.
        for attr in list(tag.attrs):
            if attr not in kept:
                del tag.attrs[attr]
    return soup
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment