Skip to content

Instantly share code, notes, and snippets.

@alexmill
Forked from dmattera/soup_prettify2.py
Last active October 2, 2019 01:33
Show Gist options
  • Save alexmill/1dbd8b865353994bd5621ad1e884c491 to your computer and use it in GitHub Desktop.
Save alexmill/1dbd8b865353994bd5621ad1e884c491 to your computer and use it in GitHub Desktop.
Visualize HTML structure of a BeautifulSoup object
# Visualize HTML structure of a BeautifulSoup object with:
# - vertical connecting lines
# - option to remove attributes
# Forked from Dan Mattera's gist: https://gist.github.com/danmattera/ef11cb37c31d732f9e5d2347eea876c2
# By Alex Miller https://alex.miller.im
from bs4 import BeautifulSoup as BS
def BeautifulSoup(X):
    """Parse ``X`` into a soup object using the ``html.parser`` backend.

    The parser is pinned here so that, per the original author's note,
    fragments are not always wrapped in extra ``<html><body>`` tags.

    Args:
        X: The markup to parse; passed straight through to ``BS``.

    Returns:
        Whatever ``BS(X, "html.parser")`` returns.
    """
    parser_name = "html.parser"
    parsed = BS(X, parser_name)
    return parsed
def soup_viz(soup, spacing=1, with_attrs=False, output='print'):
    """Draw the structure of ``soup`` as a tree with vertical connecting lines.

    The drawing is built from ``soup.prettify()``: every line of that text
    becomes one row, rendered by ``write_new_line``.

    Args:
        soup: Any object exposing ``prettify()`` that returns the indented
            markup as a string.
        spacing: Width of one indentation column; forwarded to
            ``write_new_line``.
        with_attrs: Forwarded to ``write_new_line``; when false, opening
            tags are reduced to ``<name>``.
        output: ``'print'`` prints the drawing and returns ``None``; any
            other value returns the drawing as a string instead.

    Returns:
        ``None`` when ``output == 'print'``, otherwise the drawing.
    """
    lines = soup.prettify().split("\n")
    # If the prettified text ends with a newline, split() leaves an empty
    # final element; drawing it would add a stray bar-only row at the end.
    if lines[-1] == "":
        lines.pop()

    rows = []
    previous_indent = 0
    for line in lines:
        text = str(line)
        body = text.lstrip()
        if body.startswith("<"):
            # Tag line: its depth is the amount of leading whitespace.
            current_indent = len(text) - len(body)
            # A tag can be at most one level deeper than the previous row;
            # anything deeper is treated as a direct child of it.
            if current_indent > previous_indent + 1:
                current_indent = previous_indent + 1
        else:
            # Text or script content (even if it contains '<' somewhere in
            # the middle) is treated as a child of the previous row. This
            # matches how write_new_line classifies lines.
            current_indent = previous_indent + 1
        previous_indent = current_indent
        rows.append(
            write_new_line(text, current_indent, spacing=spacing, with_attrs=with_attrs)
        )

    pretty_soup = "".join(rows)
    if output == 'print':
        print(pretty_soup)
        return None
    return pretty_soup
def write_new_line(line, current_indent, spacing=1, with_attrs=False):
    """Render one line of prettified markup as one row of the tree drawing.

    Args:
        line: A single line of prettified markup; surrounding whitespace is
            ignored.
        current_indent: Nesting depth of the line. One vertical bar column
            is drawn per level; a depth of zero or less draws none.
        spacing: Width in characters of each indentation column (the bar
            itself plus ``spacing - 1`` spaces of padding).
        with_attrs: When true, opening tags are shown verbatim, attributes
            included. When false, they are reduced to ``<name>``.

    Returns:
        The rendered row, terminated by ``"\\n"``.
    """
    import re  # local import: the file header only imports bs4

    stripped = str(line).strip()
    padding = " " * (spacing - 1)
    gutter = ("│" + padding) * current_indent

    if not stripped.startswith("<"):
        # Plain text line: no corner glyph, just padding before the text.
        connector = padding
        line_content = stripped
    elif stripped.startswith("</"):
        # Closing tag line.
        connector = "└"
        line_content = stripped
    else:
        # Opening tag line (or another "<..." construct such as a comment).
        connector = "┌"
        line_content = stripped
        if not with_attrs:
            # Read the tag name straight from the text instead of parsing
            # the line into a soup. Lines that are not elements (comments,
            # doctype, declarations) have no tag name and are kept
            # verbatim rather than raising on a missing tag. The name is
            # lower-cased to match what the html.parser-based lookup
            # this replaces would have reported.
            tag_match = re.match(r"<([a-zA-Z][^\s/>]*)", stripped)
            if tag_match:
                line_content = "<{}>".format(tag_match.group(1).lower())

    return gutter + connector + line_content + "\n"
# Example: render a small nested document.
# soup_viz() prints and returns None in its default 'print' mode, so ask for
# the string explicitly (any other `output` value) before printing it here.
soup = BeautifulSoup("""<div><div><span>a</span><span>b</span>
<a>link</a></div><a>link1</a><a>link2</a></div>""")
viz = soup_viz(soup, spacing=3, with_attrs=True, output='return')
print(viz)
@alexmill
Copy link
Author

Output of script above should look like:

┌<div>
│  ┌<div>
│  │  ┌<span>
│  │  │    a
│  │  └</span>
│  │  ┌<span>
│  │  │    b
│  │  └</span>
│  │  ┌<a>
│  │  │    link
│  │  └</a>
│  └</div>
│  ┌<a>
│  │    link1
│  └</a>
│  ┌<a>
│  │    link2
│  └</a>
└</div>

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment