Example of web scraping using Python and BeautifulSoup.
'''
Example of web scraping using Python and BeautifulSoup.
Scraping ESPN College Football recruiting data:
http://www.espn.com/college-sports/football/recruiting/databaseresults/_/sportid/24/class/2006/sort/school/starsfilter/GT/ratingfilter/GT/statuscommit/Commitments/statusuncommit/Uncommited
The script loops through a defined number of pages to extract footballer data.
'''
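
# Dependencies (assumption: Python 3 with pip available):
#   pip install requests beautifulsoup4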
from bs4 import BeautifulSoup
import requests
import os
import csv
import time

def writerows(rows, filename):
    '''
    Append the given rows to the CSV file.
    '''
    # newline='' prevents blank lines between rows on Windows
    with open(filename, 'a', newline='', encoding='utf-8') as toWrite:
        writer = csv.writer(toWrite)
        writer.writerows(rows)

def getlistings(listingurl):
    '''
    Scrape footballer data from the page and return it as a list of rows.
    '''
    # prepare headers
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'}

    # fetch the url, exiting with the error if the request fails
    try:
        response = requests.get(listingurl, headers=headers)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        raise SystemExit(e)

    soup = BeautifulSoup(response.text, "html.parser")
    listings = []

    # loop through the table rows, reading data from the columns
    for row in soup.find_all("tr"):
        # data rows carry an "oddrow" or "evenrow" class; skip everything else
        if ("oddrow" in row.get("class", [])) or ("evenrow" in row.get("class", [])):
            name = row.find("div", class_="name").a.get_text()
            hometown = row.find_all("td")[1].get_text()
            # the hometown cell reads "City, STSchool"; the +4 offset assumes
            # a two-letter state code immediately after the comma
            school = hometown[hometown.find(",") + 4:]
            city = hometown[:hometown.find(",") + 4]
            position = row.find_all("td")[2].get_text()
            grade = row.find_all("td")[4].get_text()
            # append data to the list
            listings.append([name, school, city, position, grade])
    return listings
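
# For reference, getlistings() assumes table rows shaped roughly like the
# sketch below (reconstructed from the selectors above, not copied from ESPN):
#
#   <tr class="oddrow">
#     <td><div class="name"><a>Player Name</a></div></td>
#     <td>Hometown, STHigh School</td>
#     <td>QB</td>
#     <td>...</td>
#     <td>80</td>
#   </tr>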
if __name__ == "__main__":
'''
Set CSV file name.
Remove if file alreay exists to ensure a fresh start
'''
filename = "footballers.csv"
if os.path.exists(filename):
os.remove(filename)
'''
Url to fetch consists of 3 parts:
baseurl, page number, year, remaining url
'''
baseurl = "http://www.espn.com/college-sports/football/recruiting/databaseresults/_/page/"
page = 1
parturl = "/sportid/24/class/2006/sort/school/starsfilter/GT/ratingfilter/GT/statuscommit/Commitments/statusuncommit/Uncommited"
# scrap all pages
while page < 259:
listingurl = baseurl + str(page) + parturl
listings = getlistings(listingurl)
# write to CSV
writerows(listings, filename)
# take a break
time.sleep(3)
page += 1
if page > 1:
print("Listings fetched successfully.")