Skip to content

Instantly share code, notes, and snippets.

@leandroluk
Last active March 30, 2022 17:09
Show Gist options
  • Save leandroluk/1fb573ac6bae98062c3119075924fbae to your computer and use it in GitHub Desktop.
Save leandroluk/1fb573ac6bae98062c3119075924fbae to your computer and use it in GitHub Desktop.
Crawler that extracts the highlighted news title from globo.com using requests and BeautifulSoup4
import requests
from bs4 import BeautifulSoup
class GloboBot:
    """Scrape the title of the highlighted news item on the globo.com front page."""

    # Page that is downloaded and parsed.
    URL = "https://www.globo.com/"

    # Desktop Chrome User-Agent sent with the request (presumably so the site
    # serves the same markup a regular browser gets -- TODO confirm).
    USER_AGENT = (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHT"
        "ML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
    )

    # Seconds to wait for the server; without a timeout requests can block
    # indefinitely on an unresponsive connection.
    TIMEOUT = 10

    # (tag, CSS class) steps walked, outermost first, to reach the headline link.
    _HEADLINE_PATH = (
        ('div', 'area-destaque'),
        ('div', 'headline'),
        ('a', 'post__link'),
    )

    def fetch_first_page(self) -> str:
        """Download the front page and return its HTML as decoded text.

        Returns:
            The response body decoded to ``str``.

        Raises:
            requests.RequestException: On connection failure, timeout, or an
                HTTP error status.
        """
        resp = requests.get(
            self.URL,
            headers={"User-agent": self.USER_AGENT},
            timeout=self.TIMEOUT,
        )
        # Fail here on 4xx/5xx instead of trying to parse an error page later.
        resp.raise_for_status()
        # ``str(resp.content)`` would yield the bytes repr ("b'...'") with
        # non-ASCII characters escaped; ``resp.text`` is the decoded document.
        return resp.text

    def extract_highlighted_notice_title(self, html_doc: str) -> str:
        """Return the text of the highlighted headline link found in *html_doc*.

        Args:
            html_doc: HTML source of the front page.

        Returns:
            The text content of the ``a.post__link`` element nested inside
            ``div.area-destaque`` > ``div.headline``.

        Raises:
            AttributeError: If any element along that path is missing.
        """
        node = BeautifulSoup(html_doc, 'html.parser')
        for tag, css_class in self._HEADLINE_PATH:
            node = node.find(tag, css_class)
            if node is None:
                # Name the missing element rather than failing with
                # "'NoneType' object has no attribute 'find'".
                raise AttributeError(
                    f"could not find <{tag} class={css_class!r}> in the page"
                )
        return node.text

    def run(self) -> str:
        """Fetch the front page and return the highlighted headline title."""
        html_doc = self.fetch_first_page()
        return self.extract_highlighted_notice_title(html_doc)
# Script entry point. The guard must test ``__name__``: ``__file__`` holds the
# path of this file and does not equal '__main__' when the script is run
# normally, so the crawler would not run.
if __name__ == '__main__':
    bot = GloboBot()
    result = bot.run()
    print(result)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment