# A simple way to extract and save data from boxofficemojo.com for further
# analysis. This method is intended for educational purposes only.
# !pip install requests requests-html pandas  (notebook magic; run in a shell as: pip install ...)
# if python < python3.7
# pip install dataclasses
from dataclasses import dataclass, field
import pathlib
from typing import Optional

import pandas as pd
import requests
from requests_html import HTML
@dataclass
class ScrapeBoxOffice:
    """Scrape a worldwide box-office table from boxofficemojo.com.

    Downloads the yearly (or all-time) worldwide grosses page, parses the
    results table into rows and headers, and optionally saves the raw HTML
    and/or a CSV under ``output_dir``. Educational use only.
    """

    base_endpoint: str = "https://www.boxofficemojo.com/year/world/"
    year: Optional[int] = None   # None -> scrape the combined 'world' page
    save_raw: bool = False       # write fetched HTML to <output_dir>/html/
    save: bool = False           # write parsed table to <output_dir>/<name>.csv
    output_dir: str = "."
    table_selector: str = '.imdb-scroll-table'
    # Per-instance scrape results. default_factory fixes the original
    # shared-mutable-class-attribute bug (`table_data = []` on the class was
    # shared by every instance); init=False keeps __init__'s signature intact.
    table_data: list = field(default_factory=list, init=False)
    table_header_names: list = field(default_factory=list, init=False)
    df: pd.DataFrame = field(default_factory=pd.DataFrame, init=False)

    @property
    def name(self):
        """Basename for output files: the year, or 'world' when no year is set."""
        return self.year if isinstance(self.year, int) else 'world'

    def get_endpoint(self):
        """Return the URL to scrape; a trailing year segment is added when set."""
        endpoint = self.base_endpoint
        if isinstance(self.year, int):
            endpoint = f"{endpoint}{self.year}/"
        return endpoint

    def get_output_dir(self):
        """Return ``output_dir`` as a ``pathlib.Path``."""
        return pathlib.Path(self.output_dir)

    def extract_html_str(self, endpoint=None):
        """Fetch the page and return ``(html_text, status_code)``.

        ``html_text`` is None for any non-200 response. When ``save_raw`` is
        set, the raw HTML is written to ``<output_dir>/html/<name>.html``.
        """
        url = endpoint if endpoint is not None else self.get_endpoint()
        r = requests.get(url, stream=True)
        status = r.status_code
        if status != 200:
            return None, status
        html_text = r.text
        if self.save_raw:
            raw_output_dir = self.get_output_dir() / 'html'
            raw_output_dir.mkdir(exist_ok=True, parents=True)
            # Explicit encoding: the locale-default encoding can fail on
            # non-ASCII movie titles (e.g. on Windows).
            output_path = raw_output_dir / f"{self.name}.html"
            output_path.write_text(html_text, encoding='utf-8')
        return html_text, status

    def parse_html(self, html_str=''):
        """Parse the first table matching ``table_selector``.

        Returns ``(rows, header_names)`` and caches both on the instance,
        or None when no matching table is present in ``html_str``.
        """
        r_html = HTML(html=html_str)
        r_table = r_html.find(self.table_selector)
        if len(r_table) == 0:
            return None
        parsed_table = r_table[0]
        rows = parsed_table.find("tr")
        # First row holds the <th> column names; the remaining rows are data.
        header_names = [x.text for x in rows[0].find('th')]
        table_data = [[col.text for col in row.find("td")] for row in rows[1:]]
        self.table_data = table_data
        self.table_header_names = header_names
        return self.table_data, self.table_header_names

    def to_df(self, data=None, columns=None):
        """Build a DataFrame from parsed rows and column names (empty by default)."""
        # None sentinels replace the original mutable [] default arguments.
        data = [] if data is None else data
        columns = [] if columns is None else columns
        return pd.DataFrame(data, columns=columns)

    def run(self, save=False):
        """Scrape, parse, store ``self.df``, and optionally write a CSV.

        Raises Exception on a non-2xx response or when the expected table is
        missing from the fetched page.
        """
        save = self.save if save is False else save
        endpoint = self.get_endpoint()
        html_str, status = self.extract_html_str(endpoint=endpoint)
        # range(200, 300): the original range(200, 299) wrongly excluded 299.
        if status not in range(200, 300):
            raise Exception(f"Extraction failed, endpoint status {status} at {endpoint}")
        parsed = self.parse_html(html_str if html_str is not None else '')
        if parsed is None:
            # Previously a None result fell through to an opaque TypeError
            # on tuple unpacking; fail with a clear message instead.
            raise Exception(f"No table matching {self.table_selector!r} found at {endpoint}")
        data, headers = parsed
        self.df = self.to_df(data=data, columns=headers)
        if save:
            out_dir = self.get_output_dir()
            # The CSV directory may not exist yet when save_raw is False.
            out_dir.mkdir(exist_ok=True, parents=True)
            self.df.to_csv(out_dir / f'{self.name}.csv', index=False)
        return self.df
# Example usage: scrape the 2010 worldwide grosses, saving both the raw HTML
# and the parsed CSV under ./data/ (requires network access).
box_office = ScrapeBoxOffice(year=2010, save=True, save_raw=True, output_dir='data')
df = box_office.run()
df.head()