Skip to content

Instantly share code, notes, and snippets.

@mateuspestana
Last active April 27, 2024 13:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mateuspestana/b291ace100a6c9f9ad9151fa7703ffe0 to your computer and use it in GitHub Desktop.
Save mateuspestana/b291ace100a6c9f9ad9151fa7703ffe0 to your computer and use it in GitHub Desktop.
Raspa BP
import numpy as np
import pandas as pd
import selenium
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.service import Service as FirefoxService
from webdriver_manager.firefox import GeckoDriverManager
# Edge
# from selenium.webdriver.edge.service import Service as EdgeService
# from webdriver_manager.microsoft import EdgeChromiumDriverManager
# Chrome
# from selenium.webdriver.chrome.service import Service as ChromeService
# from webdriver_manager.chrome import ChromeDriverManager
from bs4 import BeautifulSoup
import time
# Scrape the Brasil Paralelo search results for "lula", open every result that
# is not an "episodios-programas" or "colunas" link, collect a few fields from
# each article page and write all rows to ``news_lula.xlsx``.


def _text_or_none(tag):
    """Return ``tag.text``, or ``None`` when the lookup found no element.

    BeautifulSoup's ``find`` returns ``None`` for a missing element; calling
    ``.text`` on it would raise ``AttributeError`` and abort the whole run.
    """
    return tag.text if tag is not None else None


driver = webdriver.Firefox(service=FirefoxService(GeckoDriverManager().install()))
news_lula = []
try:
    driver.get('https://www.brasilparalelo.com.br/search?query=lula')
    # Fixed pause before reading page_source (presumably to let the page
    # finish rendering its results -- confirm whether an explicit wait fits).
    time.sleep(5)
    soup = BeautifulSoup(driver.page_source, 'lxml')
    noticias = soup.find_all('div', {'class': 'search-result-item'})

    for noticia in noticias:
        # A result without an anchor/href cannot be visited; skip it instead
        # of failing on ``None.get`` or on ``str + None``.
        ancora = noticia.find('a')
        href = ancora.get('href') if ancora is not None else None
        if not href:
            continue

        titulo = _text_or_none(noticia.find('h2'))
        descricao = _text_or_none(noticia.find('p'))
        link = 'https://www.brasilparalelo.com.br' + href

        # Only plain article pages are scraped.
        if 'episodios-programas' in link or 'colunas' in link:
            continue

        driver.get(link)
        time.sleep(2)
        pagina = BeautifulSoup(driver.page_source, 'lxml')

        tipo = _text_or_none(pagina.find('div', {'class': 'reading-time-type'}))
        tempo_leitura = _text_or_none(pagina.find('div', {'id': 'reading-time'}))

        # Prefer the list of 'noticia-tag-dot' entries; when the page has
        # none, fall back to the text of the single 'noticia-tag' element.
        temas = [tema.text for tema in pagina.find_all('div', {'class': 'noticia-tag-dot'})]
        if not temas:
            temas = _text_or_none(pagina.find('div', {'class': 'noticia-tag'}))

        data_pub = _text_or_none(pagina.find('span', {'class': 'published-at'}))
        materia = _text_or_none(pagina.find('div', {'class': 'w-richtext'}))

        news_lula.append({
            'titulo': titulo,
            'descricao': descricao,
            'link': link,
            'tipo': tipo,
            'tempo_leitura': tempo_leitura,
            'temas': temas,
            'data_pub': data_pub,
            'materia': materia,
        })
finally:
    # Always shut the browser down, even if the scrape fails part-way, so no
    # Firefox/geckodriver process is left running.
    driver.quit()

df_bp = pd.DataFrame(news_lula)
df_bp.to_excel('news_lula.xlsx', index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment