Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save evilensky/d6d0ffb2e761d33f70eb1ec127e49b17 to your computer and use it in GitHub Desktop.
Save evilensky/d6d0ffb2e761d33f70eb1ec127e49b17 to your computer and use it in GitHub Desktop.
Scrape & Save Data from Box Office Mojo (Educational)
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"id": "c0aa51db",
"metadata": {},
"source": [
"### Scrape & Save Data from Box Office Mojo (Educational)\n",
"\n",
"This is a simple and easy way to extract and save data from boxofficemojo.com for further analysis. This method is meant for educational purposes only."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4d1e8293",
"metadata": {},
"outputs": [],
"source": [
"!pip install requests requests-html pandas\n",
"\n",
"# if python < python3.7\n",
"# pip install dataclasses"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6738c0a6",
"metadata": {},
"outputs": [],
"source": [
"from dataclasses import dataclass\n",
"import pathlib\n",
"import pandas as pd\n",
"import requests\n",
"from requests_html import HTML"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "73ebfaaf",
"metadata": {},
"outputs": [],
"source": [
"@dataclass\n",
"class ScrapeBoxOffice:\n",
" base_endpoint:str = \"https://www.boxofficemojo.com/year/world/\"\n",
" year:int = None\n",
" save_raw:bool = False\n",
" save:bool = False\n",
" output_dir: str = \".\"\n",
" table_selector: str = '.imdb-scroll-table'\n",
" table_data = []\n",
" table_header_names = []\n",
" df = pd.DataFrame()\n",
" \n",
" @property\n",
" def name(self):\n",
" return self.year if isinstance(self.year, int) else 'world'\n",
" \n",
" def get_endpoint(self):\n",
" endpoint = self.base_endpoint\n",
" if isinstance(self.year, int):\n",
" endpoint = f\"{endpoint}{self.year}/\"\n",
" return endpoint\n",
" \n",
" def get_output_dir(self):\n",
" return pathlib.Path(self.output_dir)\n",
" \n",
" def extract_html_str(self, endpoint=None):\n",
" url = endpoint if endpoint is not None else self.get_endpoint()\n",
" r = requests.get(url, stream=True)\n",
" html_text = None\n",
" status = r.status_code\n",
" if r.status_code == 200:\n",
" html_text = r.text\n",
" if self.save_raw:\n",
" output_fname = f\"{self.name}.html\"\n",
" raw_output_dir = self.get_output_dir() / 'html'\n",
" raw_output_dir.mkdir(exist_ok=True, parents=True)\n",
" output_fname = raw_output_dir / output_fname\n",
" with open(f\"{output_fname}\", 'w') as f:\n",
" f.write(html_text)\n",
" return html_text, status\n",
" return html_text, status\n",
" \n",
" def parse_html(self, html_str=''):\n",
" r_html = HTML(html=html_str)\n",
" r_table = r_html.find(self.table_selector)\n",
" if len(r_table) == 0:\n",
" return None\n",
" table_data = []\n",
" header_names = []\n",
" parsed_table = r_table[0]\n",
" rows = parsed_table.find(\"tr\")\n",
" header_row = rows[0]\n",
" header_cols = header_row.find('th')\n",
" header_names = [x.text for x in header_cols]\n",
" for row in rows[1:]:\n",
" cols = row.find(\"td\")\n",
" row_data = []\n",
" row_dict_data = {}\n",
" for i, col in enumerate(cols):\n",
" header_name = header_names[i]\n",
" row_data.append(col.text)\n",
" table_data.append(row_data)\n",
" self.table_data = table_data\n",
" self.table_header_names = header_names\n",
" return self.table_data, self.table_header_names\n",
" \n",
" def to_df(self, data=[], columns=[]):\n",
" return pd.DataFrame(data, columns=columns)\n",
" \n",
" def run(self, save=False):\n",
" save = self.save if save is False else save\n",
" endpoint = self.get_endpoint()\n",
" html_str, status = self.extract_html_str(endpoint=endpoint)\n",
" if status not in range(200, 299):\n",
" raise Exception(f\"Extraction failed, endpoint status {status} at {endpoint}\")\n",
" data, headers = self.parse_html(html_str if html_str is not None else '')\n",
" df = self.to_df(data=data, columns=headers)\n",
" self.df = df\n",
" if save:\n",
" filepath = self.get_output_dir() / f'{self.name}.csv'\n",
" df.to_csv(filepath, index=False)\n",
" return self.df"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "4d54c4b0",
"metadata": {},
"outputs": [],
"source": [
"scraper = ScrapeBoxOffice(year=2010, save=True, save_raw=True, output_dir='data')\n",
"df = scraper.run()\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "57956e7d",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.2"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

Scrape & Save Data from Box Office Mojo (Educational)

This is a simple and easy way to extract and save data from boxofficemojo.com for further analysis. This method is meant for educational purposes only.

!pip install requests requests-html pandas

# if python < python3.7
# pip install dataclasses
from dataclasses import dataclass
import pathlib
import pandas as pd
import requests
from requests_html import HTML
@dataclass
class ScrapeBoxOffice:
    """Scrape a worldwide-grosses table from boxofficemojo.com into a DataFrame.

    Educational use only. Optionally saves the raw HTML under
    ``<output_dir>/html/<name>.html`` and the parsed table as
    ``<output_dir>/<name>.csv``, where ``name`` is the year or ``'world'``.
    """
    base_endpoint: str = "https://www.boxofficemojo.com/year/world/"
    year: int = None          # None -> scrape the aggregate "world" page
    save_raw: bool = False    # write fetched HTML to disk
    save: bool = False        # write the parsed table as CSV
    output_dir: str = "."
    table_selector: str = '.imdb-scroll-table'

    def __post_init__(self):
        # Per-instance scratch state. The original bound these as un-annotated
        # *class* attributes, so every instance shared (and mutated) the same
        # list and DataFrame objects — a classic shared-mutable-state bug.
        self.table_data = []
        self.table_header_names = []
        self.df = pd.DataFrame()

    @property
    def name(self):
        """Label used in output filenames: the year, or 'world' when unset."""
        return self.year if isinstance(self.year, int) else 'world'

    def get_endpoint(self):
        """Return the URL to scrape; the year (if any) is appended as a path."""
        endpoint = self.base_endpoint
        if isinstance(self.year, int):
            endpoint = f"{endpoint}{self.year}/"
        return endpoint

    def get_output_dir(self):
        """Output directory as a pathlib.Path (not created here)."""
        return pathlib.Path(self.output_dir)

    def extract_html_str(self, endpoint=None):
        """Fetch the page.

        Returns ``(html_text, status_code)``; ``html_text`` is None on any
        non-200 response. When ``save_raw`` is set, the HTML is also written
        to ``<output_dir>/html/<name>.html``.
        """
        url = endpoint if endpoint is not None else self.get_endpoint()
        r = requests.get(url, stream=True)
        status = r.status_code
        if status != 200:
            return None, status
        html_text = r.text
        if self.save_raw:
            raw_output_dir = self.get_output_dir() / 'html'
            raw_output_dir.mkdir(exist_ok=True, parents=True)
            output_path = raw_output_dir / f"{self.name}.html"
            with open(output_path, 'w') as f:
                f.write(html_text)
        return html_text, status

    def parse_html(self, html_str=''):
        """Parse the grosses table out of ``html_str``.

        Returns ``(rows, header_names)`` and caches them on the instance,
        or None when ``table_selector`` matches nothing.
        """
        r_html = HTML(html=html_str)
        r_table = r_html.find(self.table_selector)
        if len(r_table) == 0:
            return None
        rows = r_table[0].find("tr")
        header_names = [x.text for x in rows[0].find('th')]
        # Dead locals (row_dict_data, header_name) from the original removed.
        table_data = [[col.text for col in row.find("td")] for row in rows[1:]]
        self.table_data = table_data
        self.table_header_names = header_names
        return self.table_data, self.table_header_names

    def to_df(self, data=None, columns=None):
        """Build a DataFrame; None defaults replace mutable default args."""
        return pd.DataFrame(data if data is not None else [],
                            columns=columns if columns is not None else [])

    def run(self, save=False):
        """Fetch, parse, and (optionally) save the table; returns the DataFrame.

        Raises Exception on a non-2xx response or when no table is found.
        """
        save = self.save if save is False else save
        endpoint = self.get_endpoint()
        html_str, status = self.extract_html_str(endpoint=endpoint)
        # Original used range(200, 299), which wrongly excluded status 299.
        if not (200 <= status <= 299):
            raise Exception(f"Extraction failed, endpoint status {status} at {endpoint}")
        parsed = self.parse_html(html_str if html_str is not None else '')
        if parsed is None:
            # Original crashed with TypeError unpacking None here.
            raise Exception(f"No table matching {self.table_selector!r} at {endpoint}")
        data, headers = parsed
        self.df = self.to_df(data=data, columns=headers)
        if save:
            filepath = self.get_output_dir() / f'{self.name}.csv'
            self.df.to_csv(filepath, index=False)
        return self.df

Usage Example

# Scrape the 2010 worldwide table, saving both the raw HTML and the CSV
# under ./data, then preview the first rows of the result.
box_office = ScrapeBoxOffice(year=2010, save=True, save_raw=True, output_dir='data')
result_df = box_office.run()
result_df.head()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment