Scrape & Save Data from Box Office Mojo (Educational)

This is a simple way to extract tabular data from boxofficemojo.com and save it for further analysis. This method is meant for educational purposes only.

!pip install requests requests-html pandas

# if python < 3.7:
# pip install dataclasses
from dataclasses import dataclass, field
from typing import Optional
import pathlib

import pandas as pd
import requests
from requests_html import HTML


@dataclass
class ScrapeBoxOffice:
    base_endpoint: str = "https://www.boxofficemojo.com/year/world/"
    year: Optional[int] = None  # None uses the base worldwide endpoint as-is
    save_raw: bool = False
    save: bool = False
    output_dir: str = "."
    table_selector: str = '.imdb-scroll-table'
    # mutable defaults need field(default_factory=...) so each instance
    # gets its own copy instead of sharing one class-level object
    table_data: list = field(default_factory=list)
    table_header_names: list = field(default_factory=list)
    df: pd.DataFrame = field(default_factory=pd.DataFrame)
    
    @property
    def name(self):
        return self.year if isinstance(self.year, int) else 'world'
    
    def get_endpoint(self):
        endpoint = self.base_endpoint
        if isinstance(self.year, int):
            endpoint = f"{endpoint}{self.year}/"
        return endpoint
    
    def get_output_dir(self):
        return pathlib.Path(self.output_dir)
    
    def extract_html_str(self, endpoint=None):
        url = endpoint if endpoint is not None else self.get_endpoint()
        r = requests.get(url, stream=True)
        html_text = None
        status = r.status_code
        if status == 200:
            html_text = r.text
            if self.save_raw:
                # mirror the raw page to <output_dir>/html/<name>.html
                raw_output_dir = self.get_output_dir() / 'html'
                raw_output_dir.mkdir(exist_ok=True, parents=True)
                (raw_output_dir / f"{self.name}.html").write_text(html_text)
        return html_text, status
    
    def parse_html(self, html_str=''):
        r_html = HTML(html=html_str)
        r_table = r_html.find(self.table_selector)
        if len(r_table) == 0:
            # no matching table; return empty results so run() can still unpack
            return [], []
        parsed_table = r_table[0]
        rows = parsed_table.find("tr")
        # the first row holds the column headers
        header_names = [x.text for x in rows[0].find('th')]
        table_data = []
        for row in rows[1:]:
            table_data.append([col.text for col in row.find("td")])
        self.table_data = table_data
        self.table_header_names = header_names
        return self.table_data, self.table_header_names
    
    def to_df(self, data=None, columns=None):
        # avoid mutable default arguments; pandas handles None fine
        return pd.DataFrame(data, columns=columns)
    
    def run(self, save=False):
        save = save or self.save
        endpoint = self.get_endpoint()
        html_str, status = self.extract_html_str(endpoint=endpoint)
        if not (200 <= status < 300):
            raise Exception(f"Extraction failed, endpoint status {status} at {endpoint}")
        data, headers = self.parse_html(html_str if html_str is not None else '')
        self.df = self.to_df(data=data, columns=headers)
        if save:
            output_dir = self.get_output_dir()
            output_dir.mkdir(exist_ok=True, parents=True)
            self.df.to_csv(output_dir / f'{self.name}.csv', index=False)
        return self.df

Usage Example

scraper = ScrapeBoxOffice(year=2010, save=True, save_raw=True, output_dir='data')
df = scraper.run()
df.head()
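
The same scraper can be looped to build a multi-year dataset. A minimal sketch, assuming the table layout is identical across years; the year range and the one-second pause are arbitrary choices, the latter just a courtesy to the server:

import time

frames = []
for year in range(2015, 2021):
    scraper = ScrapeBoxOffice(year=year, save=True, output_dir='data')
    # tag each frame with its year so rows stay distinguishable after concat
    frames.append(scraper.run().assign(year=year))
    time.sleep(1)

all_years = pd.concat(frames, ignore_index=True)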
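
Since every cell is scraped as text, money columns come back as strings like "$1,234,567" and need a conversion pass before any numeric analysis. A sketch of one approach; clean_currency_columns is a hypothetical helper, not part of the class above:

def clean_currency_columns(df, threshold=0.9):
    # hypothetical helper: convert "$1,234"-style string columns to numbers
    df = df.copy()
    for col in df.select_dtypes(include="object").columns:
        stripped = df[col].str.replace(r"[$,%]", "", regex=True)
        numeric = pd.to_numeric(stripped, errors="coerce")
        # keep the conversion only if most rows parsed cleanly
        if numeric.notna().mean() >= threshold:
            df[col] = numeric
    return df

clean_df = clean_currency_columns(df)
clean_df.dtypes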
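
Because save_raw=True mirrors each page to disk, a previously saved file can be re-parsed later without hitting the network. A small sketch, assuming the usage example above already ran:

scraper = ScrapeBoxOffice(year=2010, output_dir='data')
html_path = scraper.get_output_dir() / 'html' / f'{scraper.name}.html'
data, headers = scraper.parse_html(html_path.read_text())
offline_df = scraper.to_df(data=data, columns=headers)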