Skip to content

Instantly share code, notes, and snippets.

@vinovator
Last active March 4, 2016 16:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save vinovator/fc8ac49fd81717db854d to your computer and use it in GitHub Desktop.
Save vinovator/fc8ac49fd81717db854d to your computer and use it in GitHub Desktop.
Scrap Workd T20 schedule from ICC website using BeautifulSoup & Requests and format the excel output
# world_t20_itinerary.py
# Python 2.7.6
"""
Scrap Workd T20 schedule from ICC website using BeautifulSoup & Requests
Load the schedule into an excel file using pandas
Format the excel file using openpyxl
- Apply border, wrap text and color headers
- Highlight India matches
"""
import requests
from BeautifulSoup import BeautifulSoup
import pandas as pd
import openpyxl as op
# Contstants
url = "http://www.espncricinfo.com/icc-world-twenty20-2016/content/series/901359.html?template=fixtures"
xl_name = "World T20 Schedule.xlsx"
sheet_name = "Itinerary"
def scrap_schedule():
"""
Scrap WT20 schedule from cricinfo website
"""
resp = requests.get(url)
# Scrap schedule
soup = BeautifulSoup(resp.content)
# print soup.prettify()
schedule_ul = soup.findAll("ul", {"class": "large-20 columns"})
match_list = list()
for section in schedule_ul:
schedule_li = section.findAll(
"li", {"class": "large-20 medium-20 columns"})
for li in schedule_li:
match = dict()
divs = li.findAll("div")
for div in divs:
# Match date and time details
if (div["class"] == "large-5 medium-5 small-20 columns"):
fixture = div.findAll("span", {"class": "fixture_date"})
match["date"] = fixture[0].getText()
match["time"] = fixture[1].getText().replace(" ", " ")
local_time = div.getText().split("(")[-1]
local_time = local_time.split(")")[0].replace(
" ", " ")
match["local time"] = local_time
# Teams and venue details
elif (div["class"] == "large-11 medium-11 small-20 columns"):
# print div.getText()
spans = div.findAll("span")
match["match details"] = spans[0].getText()
match["venue"] = spans[1].getText()
match["live"] = spans[2].getText()
if len(spans) > 3:
match["match type"] = spans[3].getText()
else:
match["match type"] = "D"
# This div is just there for empty space; so pass
elif (div["class"] ==
"large-4 medium-4 columns hide-for-small"):
pass
match_list.append(match)
return match_list # List of all matches
def load_to_excel(match_list):
"""
Load the match list into excel file using pandas
"""
# Load list to dataframe
wt20_df = pd.DataFrame(match_list)
wt20_df.index += 1 # So that the excel column starts from 1
# Write dataframe to excel
xlw = pd.ExcelWriter(xl_name)
wt20_df.to_excel(xlw, sheet_name=sheet_name, index_label="#")
xlw.save()
def format_excel(xl, sheet="Sheet1"):
"""
Get the excel file path and format the file
If no sheet name is passed, by default take Sheet1
"""
# Open the excel file
wb = op.load_workbook(xl)
ws = wb.get_sheet_by_name(sheet)
# Freeze panes
ws.freeze_panes = "B2"
# Adjust column width
cols = ("A", "E", "H")
widths = (5, 50, 40)
for combo in zip(cols, widths):
ws.column_dimensions[combo[0]].width = combo[1]
# define color formmatting
blue_fill = op.styles.PatternFill(start_color="00aadd",
fill_type='solid')
green_fill = op.styles.PatternFill(start_color="00ff00",
fill_type='solid')
# define border style
thin_border = op.styles.borders.Border(left=op.styles.Side(style='thin'),
right=op.styles.Side(style='thin'),
top=op.styles.Side(style='thin'),
bottom=op.styles.Side(style='thin'))
# define Text wrap
text_wrap = op.styles.Alignment(wrap_text=True)
# Format the header row
for row in range(1, 2): # Loop only the 1st row
for col in range(1, ws.max_column + 1): # loop through all columns
ws.cell(row=row, column=col).fill = blue_fill
# Format all cells
for row in ws.iter_rows():
for cell in row:
# Draw borders
cell.border = thin_border
# Wrap all columns
cell.alignment = text_wrap
# Highlight India matches
if (cell.col_idx == 5) and (cell.value.find("India") > 0):
cell.fill = green_fill
# Save back as same file name
wb.save(xl)
if __name__ == "__main__":
""" Starting block """
# Scrap the schedule from cricinfo website
schedule = scrap_schedule()
# Load the match list to excel
load_to_excel(schedule)
# Format the excel file
format_excel(xl_name, sheet_name)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment