"""
Example of web scraping using Python and BeautifulSoup.
Scraping ESPN College Football data.
The script loops through a defined number of pages to extract footballer data.
"""
from bs4 import BeautifulSoup
import requests
import os
import os.path
import csv
import time
def writerows(rows, filename):
    """Append rows to a CSV file.

    Args:
        rows: Iterable of rows; each row is an iterable of field values.
        filename: Path of the CSV file. It is opened in append mode, so it
            is created when missing and extended when it already exists.
    """
    # newline='' lets the csv module control line endings itself; without it
    # every row is followed by a blank line on platforms that translate '\n'.
    with open(filename, 'a', encoding='utf-8', newline='') as toWrite:
        writer = csv.writer(toWrite)
        writer.writerows(rows)
def getlistings(listingurl):
    """Scrape footballer data from one listing page.

    Fetches ``listingurl`` and reads every ``<tr>`` whose class list contains
    ``oddrow`` or ``evenrow``. Rows with any other class, or no class at all,
    are skipped.

    Args:
        listingurl: URL of the listing page to fetch.

    Returns:
        A list with one ``[name, school, city, position, grade]`` list per
        matching table row. Empty when the page has no matching rows.

    Raises:
        requests.exceptions.RequestException: If the request fails, times
            out, or the server answers with an HTTP error status.
    """
    # prepare headers
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'}

    # fetching the url, raising error if operation fails
    try:
        # requests has no default timeout; without one a stalled server
        # would hang the whole scrape.
        response = requests.get(listingurl, headers=headers, timeout=30)
        # requests.get does not raise on 4xx/5xx by itself.
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Could not fetch {listingurl}: {e}")
        raise

    soup = BeautifulSoup(response.text, "html.parser")

    listings = []

    # loop through the table, get data from the columns
    for row in soup.find_all("tr"):
        # Tag.get returns None for a <tr> without a class attribute, whereas
        # row["class"] would raise KeyError on such rows.
        row_classes = row.get("class") or []
        if "oddrow" not in row_classes and "evenrow" not in row_classes:
            continue

        cells = row.find_all("td")
        name = row.find("div", class_="name").a.get_text()

        # The second cell is split 4 characters past its first comma: the
        # leading part is stored as city, the remainder as school.
        # NOTE(review): presumably the cell reads "City, ST" followed by the
        # school name - confirm against the page markup.
        hometown = cells[1].get_text()
        split_at = hometown.find(",") + 4
        city = hometown[:split_at]
        school = hometown[split_at:]

        position = cells[2].get_text()
        grade = cells[4].get_text()

        # append data to the list
        listings.append([name, school, city, position, grade])

    return listings
if __name__ == "__main__":
    # Set CSV file name.
    # Remove the file if it already exists to ensure a fresh start:
    # writerows opens it in append mode, so leftovers from an earlier run
    # would otherwise be kept.
    filename = "footballers.csv"
    if os.path.exists(filename):
        os.remove(filename)

    # The URL to fetch is assembled from 3 parts:
    # baseurl + page number + parturl (the year is embedded in parturl
    # as "/class/2006/").
    # NOTE(review): baseurl is empty here, so every request will fail until
    # it is set to the listing site's URL prefix - confirm the intended value.
    baseurl = ""
    page = 1
    parturl = "/sportid/24/class/2006/sort/school/starsfilter/GT/ratingfilter/GT/statuscommit/Commitments/statusuncommit/Uncommited"

    # scrape all pages
    while page < 259:
        listingurl = baseurl + str(page) + parturl
        listings = getlistings(listingurl)

        # write to CSV
        writerows(listings, filename)

        # take a break between requests so the server is not hammered
        time.sleep(3)

        page += 1

    if page > 1:
        print("Listings fetched successfully.")
