-
-
Save devnullone/88e734b5b12c921d709adfbcb23d5e4f to your computer and use it in GitHub Desktop.
Beautiful soup cheat sheet: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# 10_basic.py | |
# 15_make_soup.py | |
# 20_search.py | |
# 25_navigation.py | |
# 30_edit.py | |
# 40_encoding.py | |
# 50_parse_only_part.py |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Quick-start tour of the Beautiful Soup API, following
# https://www.crummy.com/software/BeautifulSoup/bs4/doc/
# NOTE: `html_doc` (the markup string) and `css_soup` (a second soup object)
# are not defined in this snippet; the surrounding code must supply them.
# The result comments assume the sample document used in the bs4 docs.
from bs4 import BeautifulSoup

soup = BeautifulSoup(html_doc, 'html.parser')
soup.title  # <title>The Dormouse's story</title>
soup.title.name  # 'title'
soup.title.string  # "The Dormouse's story"
soup.title.parent.name  # 'head'

# various finders
css_soup.select("p.strikeout.body")  # css finder
soup.p  # <p class="title"><b>The Dormouse's story</b></p>
soup.p['class']  # ['title'] -- class is multi-valued, so a list is returned
soup.a  # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
soup.find_all('a')  # [<a ..>, ..]
soup.find(id="link3")  # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>

for link in soup.find_all('a'):
    # prints one href per link: http://example.com/elsie, http://example.com/lacie, ...
    print(link.get('href'))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup

# Build a soup from a file. The context manager closes the handle once the
# document has been parsed (the bare open() call leaked it).
with open("index.html") as fp:
    soup = BeautifulSoup(fp, "html.parser")

# Build a soup from a markup string. The parser is named explicitly so the
# result does not depend on which optional parsers happen to be installed.
soup = BeautifulSoup("<html>data</html>", "html.parser")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# HTML output
soup.prettify() # pretty-printed markup
str(soup) # markup without pretty-printing
# Text output
soup.get_text() # all text under the element
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# search.py -- https://www.crummy.com/software/BeautifulSoup/bs4/doc/
#-------------------------
# css selector
#-------------------------
# NOTE: `soup` and `css_soup` are soup objects built elsewhere.
css_soup.select("p.strikeout.body")  # <p> carrying both CSS classes
soup.select("p:nth-of-type(3)")  # the 3rd <p> among its siblings
soup.select("head > title")
soup.select("p > a:nth-of-type(2)")
soup.select("p > #link1")  # direct child
soup.select("#link1 ~ .sister")  # sibling
soup.select('a[href]')  # existence of an attribute
soup.select_one(".sister")  # first match only
# attribute value
soup.select('a[href="http://example.com/elsie"]')  # exact attribute
soup.select('a[href^="http://example.com/"]')  # prefix match
soup.select('a[href$="tillie"]')  # end match
soup.select('a[href*=".com/el"]')  # middle match
#-------------------------
# basic
#-------------------------
soup.find_all('b') # match by tag name
soup.find_all(re.compile("^b")) # match tags whose name matches the regex
soup.find_all(["a", "b"]) # match any tag named in the list
# function (complex condition)
def has_class_but_no_id(tag):
    """Return True when *tag* has a ``class`` attribute but no ``id``.

    Intended as a filter for ``find_all``: the only thing required of
    *tag* is a ``has_attr(name)`` method.
    """
    return tag.has_attr('class') and not tag.has_attr('id')
soup.find_all(has_class_but_no_id)  # match by predicate function
#-------------------------
# find_all_api
#-------------------------
# Signature (for reference only, not executable):
#   find_all(name, attrs, recursive, string, limit, **kwargs)
soup.find_all("title")  # tag condition
soup.find_all("p", "title")  # tag name and CSS class
# [<p class="title"><b>The Dormouse's story</b></p>]
soup.find_all("a")
# keyword arguments
soup.find_all(id="link2")
soup.find_all(href=re.compile("elsie"), id='link1')
soup.find(string=re.compile("sisters"))  # text containing "sisters"
# css class (class is a reserved keyword, hence class_)
soup.find_all("a", class_="sister")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# NOTE: `soup`, `tag`, `original_tag` and `a_tag` are objects obtained
# elsewhere; the lines below are independent examples, not a script.
#----------------------------
# change existing tag
#----------------------------
tag.name = "blockquote"  # modify tag name
tag['class'] = 'verybold'  # modify tag attribute
del tag['class']  # delete attribute
tag.string = 'not too bold'  # modify tag contents string
tag.append(" but bolder than usual")  # append tag contents
#----------------------------
# insert tag
#----------------------------
new_tag = soup.new_tag("a", href="http://www.example.com")
original_tag.append(new_tag)  # create child
new_tag.string = "Link text."  # can edit element after creating child
soup.b.string.insert_before(tag)
soup.b.i.insert_after(soup.new_string(" ever "))
#----------------------------
# delete tag
#----------------------------
soup.i.clear()  # removes the contents
i_tag = soup.i.extract()  # completely removes a tag from tree and returns the element
soup.i.decompose()  # completely removes a tag from tree and discards the tag
#----------------------------
# replace/wrap/unwrap tag
#----------------------------
a_tag.i.replace_with(soup.new_tag("b"))
# The class is spelled BeautifulSoup (capital S); the parser is named so the
# fragment is not wrapped in extra <html>/<body> tags by another parser.
a_tag.i.replace_with(BeautifulSoup("<b>bold element</b>", "html.parser"))  # replace inner html
soup.p.string.wrap(soup.new_tag("b"))
a_tag.i.unwrap()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Output encodings: pass the target encoding to get encoded output.
# NOTE: `soup` and `tag` are objects obtained elsewhere.
soup.prettify("latin-1")  # pretty-printed document encoded as Latin-1
tag.encode("utf-8")
tag.encode("latin-1")
tag.encode("ascii")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# The SoupStrainer class allows you to choose which parts of an
# incoming document are parsed
from bs4 import SoupStrainer

# conditions
only_a_tags = SoupStrainer("a")  # keep only <a> tags
only_tags_with_id_link2 = SoupStrainer(id="link2")  # keep only tags with id="link2"
def is_short_string(string):
    """Return True when *string* is present and shorter than 10 characters.

    Used as a ``string=`` filter. The ``None`` check keeps the filter from
    raising ``TypeError`` when it is called without a string value.
    """
    return string is not None and len(string) < 10
only_short_strings = SoupStrainer(string=is_short_string)  # keep only strings accepted by the predicate

# execute parse: each call parses only the parts selected by the strainer
# NOTE: `html_doc` is the markup string, defined elsewhere.
BeautifulSoup(html_doc, "html.parser", parse_only=only_a_tags)
BeautifulSoup(html_doc, "html.parser", parse_only=only_tags_with_id_link2)
BeautifulSoup(html_doc, "html.parser", parse_only=only_short_strings)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment