Skip to content

Instantly share code, notes, and snippets.

@zepadovani
Last active May 16, 2023 00:51
Show Gist options
  • Save zepadovani/a06e8f1b83abee59de958c0c90da09c0 to your computer and use it in GitHub Desktop.
Save zepadovani/a06e8f1b83abee59de958c0c90da09c0 to your computer and use it in GitHub Desktop.
Make a dataframe (and a CSV file) with the names of bird species and their Portuguese common names
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import pandas as pd
def getWikiAvesDF(filename=None, savecsv=False):
    """
    Scrape the WikiAves species table and return it as a DataFrame.

    The page is loaded with a Selenium Chrome driver so that the HTML is
    captured after JavaScript execution, then the first ``<table>`` in the
    page is parsed with BeautifulSoup.

    Parameters:
    - filename (str): Path of the CSV file to write. Required when
      ``savecsv`` is True; ignored otherwise.
    - savecsv (bool): When True, write the scraped table (all of its
      columns, not only the two returned ones) to ``filename``.

    Returns:
    - DataFrame: DataFrame with 'Espécie' and 'Nome Comum' columns.

    Raises:
    - ValueError: if ``savecsv`` is True but no ``filename`` was given.
    - RuntimeError: if the loaded page contains no ``<table>`` element.
    """
    # Validate before launching a browser: DataFrame.to_csv(None) would only
    # return the CSV text and silently write nothing.
    if savecsv and filename is None:
        raise ValueError("savecsv=True requires a filename")

    url = "https://www.wikiaves.com.br/especies.php?t=t"

    # Make sure the appropriate Chrome driver is installed.
    driver = webdriver.Chrome()
    try:
        driver.get(url)  # Load the page using Selenium
        content = driver.page_source  # HTML after JavaScript execution
    finally:
        # Always close the browser, even if loading the page failed.
        driver.quit()

    soup = BeautifulSoup(content, "html.parser")
    table = soup.find('table')
    if table is None:
        raise RuntimeError("No <table> element found at " + url)

    # Column names come from the header cells.
    headers = [th.text for th in table.find_all('th')]

    # One list of cell texts per table row; rows without <td> cells
    # (e.g. the header row) are skipped.
    data = []
    for tr in table.find_all('tr'):
        row = [td.text for td in tr.find_all('td')]
        if row:
            data.append(row)

    df = pd.DataFrame(data, columns=headers)

    def _strip_common_name(row):
        # The scraped species cell comes prefixed with the common name
        # (reason unknown); drop the first occurrence of it.
        species = row['Espécie']
        common = row['Nome Comum']
        if pd.notnull(species) and pd.notnull(common):
            return species.replace(common, '', 1)
        return species

    # Skip the row-wise clean-up when no data rows were scraped; there is
    # nothing to fix in that case.
    if not df.empty:
        df['Espécie'] = df.apply(_strip_common_name, axis=1)

    df = df.dropna(subset=['Espécie'])
    df = df.reset_index(drop=True)
    outdf = df[["Espécie", "Nome Comum"]]

    if savecsv:
        # Note: the full scraped table is saved, not just the two columns
        # that are returned.
        df.to_csv(filename, index=False)
    return outdf
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment