Skip to content

Instantly share code, notes, and snippets.

@Andrew-Chen-Wang
Forked from revotu/remove_attrs.py
Created January 27, 2024 06:48
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Andrew-Chen-Wang/2bcc3b4511ab8dda77ea1cffb8444b98 to your computer and use it in GitHub Desktop.
Save Andrew-Chen-Wang/2bcc3b4511ab8dda77ea1cffb8444b98 to your computer and use it in GitHub Desktop.
Remove all HTML attributes with BeautifulSoup, except on some tags (<a>, <img>, ...)
from bs4 import BeautifulSoup
# remove all attributes
def _remove_all_attrs(soup):
for tag in soup.find_all(True):
tag.attrs = {}
return soup
# remove all attributes, except on whitelisted tags (which keep all of theirs)
def _remove_all_attrs_except(soup):
whitelist = ['a','img']
for tag in soup.find_all(True):
if tag.name not in whitelist:
tag.attrs = {}
return soup
# remove all attributes, except on whitelisted tags, which keep only their 'href' and 'src' attributes
def _remove_all_attrs_except_saving(soup):
whitelist = ['a','img']
for tag in soup.find_all(True):
if tag.name not in whitelist:
tag.attrs = {}
else:
attrs = dict(tag.attrs)
for attr in attrs:
if attr not in ['src','href']:
del tag.attrs[attr]
return soup
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment