Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save evilensky/d6d0ffb2e761d33f70eb1ec127e49b17 to your computer and use it in GitHub Desktop.
Save evilensky/d6d0ffb2e761d33f70eb1ec127e49b17 to your computer and use it in GitHub Desktop.
Scrape & Save Data from Box Office Mojo (Educational)
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "c0aa51db",
"metadata": {},
"source": [
"### Scrape & Save Data from Box Office Mojo (Educational)\n",
"\n",
"This is a simple and easy way to extract and save data from boxofficemojo.com for further analysis. This method is meant for educational purposes only."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4d1e8293",
"metadata": {},
"outputs": [],
"source": [
"!pip install requests requests-html pandas\n",
"\n",
"# if python < python3.7\n",
"# pip install dataclasses"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6738c0a6",
"metadata": {},
"outputs": [],
"source": [
"from dataclasses import dataclass\n",
"import pathlib\n",
"import pandas as pd\n",
"import requests\n",
"from requests_html import HTML"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "73ebfaaf",
"metadata": {},
"outputs": [],
"source": [
"@dataclass\n",
"class ScrapeBoxOffice:\n",
" base_endpoint:str = \"https://www.boxofficemojo.com/year/world/\"\n",
" year:int = None\n",
" save_raw:bool = False\n",
" save:bool = False\n",
" output_dir: str = \".\"\n",
" table_selector: str = '.imdb-scroll-table'\n",
" table_data = []\n",
" table_header_names = []\n",
" df = pd.DataFrame()\n",
" \n",
" @property\n",
" def name(self):\n",
" return self.year if isinstance(self.year, int) else 'world'\n",
" \n",
" def get_endpoint(self):\n",
" endpoint = self.base_endpoint\n",
" if isinstance(self.year, int):\n",
" endpoint = f\"{endpoint}{self.year}/\"\n",
" return endpoint\n",
" \n",
" def get_output_dir(self):\n",
" return pathlib.Path(self.output_dir)\n",
" \n",
" def extract_html_str(self, endpoint=None):\n",
" url = endpoint if endpoint is not None else self.get_endpoint()\n",
" r = requests.get(url, stream=True)\n",
" html_text = None\n",
" status = r.status_code\n",
" if r.status_code == 200:\n",
" html_text = r.text\n",
" if self.save_raw:\n",
" output_fname = f\"{self.name}.html\"\n",
" raw_output_dir = self.get_output_dir() / 'html'\n",
" raw_output_dir.mkdir(exist_ok=True, parents=True)\n",
" output_fname = raw_output_dir / output_fname\n",
" with open(f\"{output_fname}\", 'w') as f:\n",
" f.write(html_text)\n",
" return html_text, status\n",
" return html_text, status\n",
" \n",
" def parse_html(self, html_str=''):\n",
" r_html = HTML(html=html_str)\n",
" r_table = r_html.find(self.table_selector)\n",
" if len(r_table) == 0:\n",
" return None\n",
" table_data = []\n",
" header_names = []\n",
" parsed_table = r_table[0]\n",
" rows = parsed_table.find(\"tr\")\n",
" header_row = rows[0]\n",
" header_cols = header_row.find('th')\n",
" header_names = [x.text for x in header_cols]\n",
" for row in rows[1:]:\n",
" cols = row.find(\"td\")\n",
" row_data = []\n",
" row_dict_data = {}\n",
" for i, col in enumerate(cols):\n",
" header_name = header_names[i]\n",
" row_data.append(col.text)\n",
" table_data.append(row_data)\n",
" self.table_data = table_data\n",
" self.table_header_names = header_names\n",
" return self.table_data, self.table_header_names\n",
" \n",
" def to_df(self, data=[], columns=[]):\n",
" return pd.DataFrame(data, columns=columns)\n",
" \n",
" def run(self, save=False):\n",
" save = self.save if save is False else save\n",
" endpoint = self.get_endpoint()\n",
" html_str, status = self.extract_html_str(endpoint=endpoint)\n",
" if status not in range(200, 299):\n",
" raise Exception(f\"Extraction failed, endpoint status {status} at {endpoint}\")\n",
" data, headers = self.parse_html(html_str if html_str is not None else '')\n",
" df = self.to_df(data=data, columns=headers)\n",
" self.df = df\n",
" if save:\n",
" filepath = self.get_output_dir() / f'{self.name}.csv'\n",
" df.to_csv(filepath, index=False)\n",
" return self.df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4d54c4b0",
"metadata": {},
"outputs": [],
"source": [
"scraper = ScrapeBoxOffice(year=2010, save=True, save_raw=True, output_dir='data')\n",
"df = scraper.run()\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "57956e7d",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

Scrape & Save Data from Box Office Mojo (Educational)

This is a simple and easy way to extract and save data from boxofficemojo.com for further analysis. This method is meant for educational purposes only.

!pip install requests requests-html pandas

# if python < python3.7
# pip install dataclasses
from dataclasses import dataclass
import pathlib
import pandas as pd
import requests
from requests_html import HTML
@dataclass
class ScrapeBoxOffice:
    """Scrape a worldwide-grosses table from boxofficemojo.com into a DataFrame.

    Educational use only. Optionally saves the raw HTML under
    ``<output_dir>/html/<name>.html`` and the parsed table as
    ``<output_dir>/<name>.csv``, where ``name`` is the year or ``'world'``.
    """
    base_endpoint: str = "https://www.boxofficemojo.com/year/world/"
    year: int = None          # None -> scrape the aggregate "world" page
    save_raw: bool = False    # write fetched HTML to disk
    save: bool = False        # write the parsed table as CSV
    output_dir: str = "."
    table_selector: str = '.imdb-scroll-table'

    def __post_init__(self):
        # Per-instance scratch state. The original bound these as un-annotated
        # *class* attributes, so every instance shared (and mutated) the same
        # list and DataFrame objects — a classic shared-mutable-state bug.
        self.table_data = []
        self.table_header_names = []
        self.df = pd.DataFrame()

    @property
    def name(self):
        """Label used in output filenames: the year, or 'world' when unset."""
        return self.year if isinstance(self.year, int) else 'world'

    def get_endpoint(self):
        """Return the URL to scrape; the year (if any) is appended as a path."""
        endpoint = self.base_endpoint
        if isinstance(self.year, int):
            endpoint = f"{endpoint}{self.year}/"
        return endpoint

    def get_output_dir(self):
        """Output directory as a pathlib.Path (not created here)."""
        return pathlib.Path(self.output_dir)

    def extract_html_str(self, endpoint=None):
        """Fetch the page.

        Returns ``(html_text, status_code)``; ``html_text`` is None on any
        non-200 response. When ``save_raw`` is set, the HTML is also written
        to ``<output_dir>/html/<name>.html``.
        """
        url = endpoint if endpoint is not None else self.get_endpoint()
        r = requests.get(url, stream=True)
        status = r.status_code
        if status != 200:
            return None, status
        html_text = r.text
        if self.save_raw:
            raw_output_dir = self.get_output_dir() / 'html'
            raw_output_dir.mkdir(exist_ok=True, parents=True)
            output_path = raw_output_dir / f"{self.name}.html"
            with open(output_path, 'w') as f:
                f.write(html_text)
        return html_text, status

    def parse_html(self, html_str=''):
        """Parse the grosses table out of ``html_str``.

        Returns ``(rows, header_names)`` and caches them on the instance,
        or None when ``table_selector`` matches nothing.
        """
        r_html = HTML(html=html_str)
        r_table = r_html.find(self.table_selector)
        if len(r_table) == 0:
            return None
        rows = r_table[0].find("tr")
        header_names = [x.text for x in rows[0].find('th')]
        # Dead locals (row_dict_data, header_name) from the original removed.
        table_data = [[col.text for col in row.find("td")] for row in rows[1:]]
        self.table_data = table_data
        self.table_header_names = header_names
        return self.table_data, self.table_header_names

    def to_df(self, data=None, columns=None):
        """Build a DataFrame; None defaults replace mutable default args."""
        return pd.DataFrame(data if data is not None else [],
                            columns=columns if columns is not None else [])

    def run(self, save=False):
        """Fetch, parse, and (optionally) save the table; returns the DataFrame.

        Raises Exception on a non-2xx response or when no table is found.
        """
        save = self.save if save is False else save
        endpoint = self.get_endpoint()
        html_str, status = self.extract_html_str(endpoint=endpoint)
        # Original used range(200, 299), which wrongly excluded status 299.
        if not (200 <= status <= 299):
            raise Exception(f"Extraction failed, endpoint status {status} at {endpoint}")
        parsed = self.parse_html(html_str if html_str is not None else '')
        if parsed is None:
            # Original crashed with TypeError unpacking None here.
            raise Exception(f"No table matching {self.table_selector!r} at {endpoint}")
        data, headers = parsed
        self.df = self.to_df(data=data, columns=headers)
        if save:
            filepath = self.get_output_dir() / f'{self.name}.csv'
            self.df.to_csv(filepath, index=False)
        return self.df

Usage Example

# Scrape the 2010 worldwide table, saving both the raw HTML and the CSV
# under ./data, then preview the first rows of the result.
box_office = ScrapeBoxOffice(year=2010, save=True, save_raw=True, output_dir='data')
result_df = box_office.run()
result_df.head()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment