Last active
August 29, 2015 14:24
-
-
Save dsaiztc/348acaa521f8d676cfdb to your computer and use it in GitHub Desktop.
Screen Scraping with BeautifulSoup for Python.
http://www.crummy.com/software/BeautifulSoup/bs4/doc/
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Cheat sheet: screen scraping with BeautifulSoup 4.
# The names `kind_of_tag`, `attribute_name`, 'tag-name', etc. are placeholders
# to be replaced with the real tag / attribute names of the document at hand.
import re

from bs4 import BeautifulSoup

# NOTE(review): `html_doc` is not defined in this snippet; it is assumed to
# hold the HTML markup to parse (a str or an open file handle) - confirm.
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.prettify())

# Navigating to a tag: dotted access returns the FIRST <kind_of_tag> element
# in the document, or None when there is no such tag.
tag = soup.kind_of_tag
tag.name  # name of the tag
tag.string  # text of the tag
tag.attrs  # attributes of the tag as a dictionary
tag['attribute_name']  # access to an attribute (raises KeyError if absent)
tag.get('attribute_name')  # same, but returns None if the attribute is absent
# NOTE: `tag.some_name` is NOT attribute access; it is shorthand for
# `tag.find('some_name')`, i.e. it looks up the first child tag with that name.

# search for tags
soup.find_all('tag-name')  # find all 'tag-name' tags and return a list of them
for link in soup.find_all('a'):
    print(link.get('href'))

# search for tags (with attributes of them)
soup.find(id='my_id')
soup.find(id=True)  # the id attribute has a value
# The dot is escaped so that only hrefs really ending in ".es" match
# (an unescaped '.' would match any character, e.g. ".../names").
soup.find_all(href=re.compile(r'\.es$'))
soup.find_all(attrs={'data-foo': 'value'})
soup.find_all('a', attrs={'class': 'sister'})
soup.find_all(class_=re.compile('title'))  # search by CSS class: <p class="title"><b>The Dormouse's story</b></p>
soup.find_all('a', class_='sister')  # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
# Any keyword argument that is not recognized will be turned into a filter on
# one of a tag's attributes.
# Some attributes, like the data-* attributes in HTML 5, have names that can't
# be used as the names of keyword arguments; pass those through `attrs={...}`.

# search for strings instead of tags
# `string=` is the documented keyword (bs4 >= 4.4.0); `text=` is its legacy alias.
soup.find_all('a', string='Elsie')
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment