Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save genkio/ff276c35f9318dd7fef7e66f059da15b to your computer and use it in GitHub Desktop.
Save genkio/ff276c35f9318dd7fef7e66f059da15b to your computer and use it in GitHub Desktop.
Extract Data from wikipedia for World Cup 2022 Dataset

Extract Code

generated by OpenAI ChatGPT, data fetched from

import requests
from bs4 import BeautifulSoup
import csv

# Define the URL of the Wikipedia page
# NOTE(review): the URL is an empty string in this copy of the script, so
# requests.get() cannot succeed until the squads page address is filled in.
url = ''

# Fetch the HTML content of the page. A timeout keeps the script from hanging
# indefinitely on an unresponsive server, and raise_for_status() stops the run
# on an HTTP error instead of silently parsing an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(response.content, 'html.parser')

# Team names, in the order their paragraphs appear on the page
team_names = []

# Scan every paragraph for the ' announced' marker; whatever precedes the
# first occurrence of that marker is taken as the team name.
for p in soup.find_all('p'):
    text = p.text.strip()
    if ' announced' in text:
        team_name = text.split(' announced')[0]
        # Record the name so it can later be paired with a squad table
        team_names.append(team_name)

# Find all of the tables containing squad information
tables = soup.find_all('table', {'class': 'wikitable'})

# Rows collected for the CSV: one list per player
data = []

# Pair each table with a team name by position; zip() stops at the shorter of
# the two sequences.
for team_index, (table, team) in enumerate(zip(tables, team_names)):
    # Every four consecutive teams share a letter: 0-3 -> 'A', 4-7 -> 'B', ...
    # The loop index is used instead of team_names.index(team) so the letter
    # stays correct even if the same name occurs more than once.
    team_group = chr(65 + team_index // 4)

    # Extract the data for each player in the table
    for row in table.find_all('tr'):
        cells = row.find_all('td')
        name_cell = row.find('th')
        # Skip rows without a 'th' cell or without the six 'td' cells that
        # are read below (e.g. header rows).
        if name_cell is None or len(cells) < 6:
            continue

        no = cells[0].text.strip()
        pos = cells[1].text.strip()
        # The player name is taken from the row's first 'th' element
        player = name_cell.text.strip()
        dob = cells[2].text.strip()
        caps = cells[3].text.strip()
        goals = cells[4].text.strip()
        club = cells[5].text.strip()

        # Add the team name and player data to the list
        data.append([team, team_group, no, pos, player, dob, caps, goals, club])

# Write the extracted data to a CSV file.
# newline='' is what the csv module expects for files it writes (it avoids
# blank lines between rows on Windows); UTF-8 keeps non-ASCII text intact.
with open('world_cup_squads.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    # Header row first, then one row per collected player
    writer.writerow(['Team', 'Group', 'No.', 'Pos.', 'Player', 'DOB', 'Caps', 'Goals', 'Club'])
    writer.writerows(data)


ChatGPT conversation

How I asked OpenAI ChatGPT to help generate this code:

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment