Skip to content

Instantly share code, notes, and snippets.

View aleenprd's full-sized avatar
💭
busy bee

Alin Preda aleenprd

💭
busy bee
View GitHub Profile
@aleenprd
aleenprd / timing_function.py
Last active October 20, 2022 17:40
timing_decorator
from time import time
def timing(f: Callable) -> None:
"""Times a function runtime in minutes.
Args:
f (callable): a function/method.
"""
def wrap(*args, **kw):
ts = time()
@aleenprd
aleenprd / make_soup_with_selenium.py
Created October 20, 2022 17:59
Make Soup with Selenium
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
def make_soup_with_selenium(url: str, driver_service: Service) -> BeautifulSoup:
"""Return an HTML body from an URL.
@aleenprd
aleenprd / imports_imdb_scraping.py
Last active October 20, 2022 18:53
IMDB Scraper Imports
# Data manipulation
import pandas as pd
import re as regex
# Scraping
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
@aleenprd
aleenprd / get_episodes_links.py
Created October 20, 2022 18:07
Get Links to Episodes
@aleenprd
aleenprd / get_ratings_and_reviews_page.py
Last active October 20, 2022 18:12
Get Ratings and Reviews Page
def get_ratings_page(episode_page: str, suffix: str = "/ratings/?ref_=tt_ov_rt") -> str:
    """Build the ratings-page URL that corresponds to an episode page URL.

    Everything after the last "/" in ``episode_page`` (for example a trailing
    ``?ref_=...`` query string) is dropped and ``suffix`` is appended instead.

    Args:
        episode_page (str): URL of the episode page.
        suffix (str): path and query string appended to the trimmed URL.

    Returns:
        str: the trimmed URL followed by ``suffix``.
    """
    # rpartition yields the text before the final "/", or "" when the input
    # contains no "/" at all -- the same result as split/join on all but the
    # last segment.
    base_url, _, _ = episode_page.rpartition("/")
    return base_url + suffix
def get_reviews_page(episode_page: str, suffix: str = "/reviews?ref_=tt_urv") -> str:
    """Build the reviews-page URL that corresponds to an episode page URL.

    Everything after the last "/" in ``episode_page`` (for example a trailing
    ``?ref_=...`` query string) is dropped and ``suffix`` is appended instead.

    Args:
        episode_page (str): URL of the episode page.
        suffix (str): path and query string appended to the trimmed URL.

    Returns:
        str: the trimmed URL followed by ``suffix``.
    """
    # rpartition yields the text before the final "/", or "" when the input
    # contains no "/" at all -- the same result as split/join on all but the
    # last segment.
    base_url, _, _ = episode_page.rpartition("/")
    return base_url + suffix
@aleenprd
aleenprd / fetch_el_if_available.py
Last active October 20, 2022 18:55
fetch_el_if_available
from typing import Union
def fetch_el_if_available(soup: BeautifulSoup, element_type: str, class_type: str) -> Union[str, None]:
"""Returns element text if found, otherwise returns None.
Args:
soup (BeautifulSoup): a bs4 soup.
element_type (str): HTML type e.g. 'div'.
class_type (str): the class of the desired element.
@aleenprd
aleenprd / scrape_reviews_page.py
Last active October 21, 2022 19:22
scrape_reviews_page
def scrape_reviews_page(reviews_soup: BeautifulSoup) -> pd.DataFrame:
"""Scrape IMDB reviews page.
Note: Extracts ratings, usernames, review date, titles, review body text,
number of reactions, total reactions to review.
Args:
reviews_soup (BeautifulSoup): soup of the entirely loaded reviews page.
Returns:
@aleenprd
aleenprd / scrape_imdb_reviews_pages_main.py
Last active October 20, 2022 19:30
scrape_imdb_reviews_pages
@timing
def main(season_link: str, show_link: str, driver_service: Service, output_path: str) -> None:
"""Main function to scrape an IMDB season's reviews for each episode and also the general reviews.
Args:
season_link (str): URL pointing to season page.
show_link (str): URL pointing to show general reviews.
driver_service (Service): a Chrome web driver.
output_path (str): path including filename where we want to save the CSV.
"""
@aleenprd
aleenprd / imports_scraper_classes.py
Last active October 24, 2022 18:12
imports_scraper_classes
# Data manipulation
import pandas as pd
import re as regex
# Scraping
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
@aleenprd
aleenprd / scraper_base_class.py
Last active October 24, 2022 18:11
scraper_base_class
class ScraperException(Exception):
    """Base class for all scraper exceptions.

    Carries no extra state; it exists so callers can catch every
    scraper-specific error with a single ``except`` clause.
    """
class ImdbScraperException(ScraperException):
    """Base class for exceptions raised by the IMDB scraper.

    Subclasses ``ScraperException`` and adds no behaviour of its own; it only
    distinguishes IMDB-specific failures from other scraper errors.
    """