This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from time import time | |
def timing(f: Callable) -> None: | |
"""Times a function runtime in minutes. | |
Args: | |
f (callable): a function/method. | |
""" | |
def wrap(*args, **kw): | |
ts = time() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
from selenium import webdriver | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.chrome.service import Service | |
from webdriver_manager.chrome import ChromeDriverManager | |
def make_soup_with_selenium(url: str, driver_service: Service) -> BeautifulSoup: | |
"""Return an HTML body from an URL. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Data manipulation | |
import pandas as pd | |
import re as regex | |
# Scraping | |
import requests | |
from bs4 import BeautifulSoup | |
from selenium import webdriver | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.chrome.service import Service |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re as regex | |
def get_episodes_links(link: str, driver_service) -> List[str]: | |
"""Retrieve links to episodes, from series' season main page. | |
Args: | |
link (str): link to season main page. | |
driver_service (Service): a Chrome web driver. | |
Returns: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def get_ratings_page(episode_page: str, suffix: str = "/ratings/?ref_=tt_ov_rt") -> str:
    """Build the ratings-page URL that corresponds to an episode page URL.

    Everything after the last "/" in ``episode_page`` is discarded (for
    example a trailing ``?ref_=...`` query, or the empty string left by a
    trailing slash) and ``suffix`` is appended to what remains.

    Note: a URL that does not end with "/" or a "/?query" part loses its
    last path segment, because that segment is what follows the final "/".

    Args:
        episode_page (str): URL of the episode page.
        suffix (str): path and query appended to the trimmed URL.

    Returns:
        str: the trimmed URL followed by ``suffix``.
    """
    # rpartition keeps the text before the final "/" and yields "" when the
    # input has no "/" at all, exactly like joining split("/")[:-1].
    base, _, _ = episode_page.rpartition("/")
    return base + suffix
def get_reviews_page(episode_page: str, suffix: str = "/reviews?ref_=tt_urv") -> str:
    """Build the reviews-page URL that corresponds to an episode page URL.

    Everything after the last "/" in ``episode_page`` is discarded (for
    example a trailing ``?ref_=...`` query, or the empty string left by a
    trailing slash) and ``suffix`` is appended to what remains.

    Note: a URL that does not end with "/" or a "/?query" part loses its
    last path segment, because that segment is what follows the final "/".

    Args:
        episode_page (str): URL of the episode page.
        suffix (str): path and query appended to the trimmed URL.

    Returns:
        str: the trimmed URL followed by ``suffix``.
    """
    # rpartition keeps the text before the final "/" and yields "" when the
    # input has no "/" at all, exactly like joining split("/")[:-1].
    base, _, _ = episode_page.rpartition("/")
    return base + suffix
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from typing import Union | |
def fetch_el_if_available(soup: BeautifulSoup, element_type: str, class_type: str) -> Union[str, None]: | |
"""Returns element text if found, otherwise returns None. | |
Args: | |
soup (BeautifulSoup): a bs4 soup. | |
element_type (str): HTML type e.g. 'div'. | |
class_type (str): the class of the desired element. | |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def scrape_reviews_page(reviews_soup: BeautifulSoup) -> pd.DataFrame: | |
"""Scrape IMDB reviews page. | |
Note: Extracts ratings, usernames, review date, titles, review body text, | |
number of reactions, total reactions to review. | |
Args: | |
reviews_soup (BeautifulSoup): soup of the entirely loaded reviews page. | |
Returns: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
@timing | |
def main(season_link: str, show_link: str, driver_service: Service, output_path: str) -> None: | |
"""Main function to scrape an IMDB season's reviews for each episode and also the general reviews. | |
Args: | |
season_link (str): URL pointing to season page. | |
show_link (str): URL pointing to show general reviews. | |
driver_service (Service): a Chrome web driver. | |
output_path (str): path including filename where we want to save the CSV. | |
""" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Data manipulation | |
import pandas as pd | |
import re as regex | |
# Scraping | |
from bs4 import BeautifulSoup | |
from selenium import webdriver | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.chrome.service import Service | |
from webdriver_manager.chrome import ChromeDriverManager |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class ScraperException(Exception):
    """Starting point for Scraper exceptions.

    Adds no behavior of its own on top of ``Exception``; it exists so that
    scraper-specific errors can be caught with a single ``except`` clause.
    """
class ImdbScraperException(ScraperException):
    """Starting point for IMDB scraper exceptions.

    Subclass of ``ScraperException`` that adds no behavior of its own, so
    handlers for the generic scraper error also catch this one.
    """
OlderNewer