Skip to content

Instantly share code, notes, and snippets.

View aleenprd's full-sized avatar
💭
busy bee

Alin Preda aleenprd

💭
busy bee
View GitHub Profile
@aleenprd
aleenprd / WorldPostCodeScraper.py
Last active November 11, 2023 17:19
WorldPostCodeScraper
import pandas as pd
from urllib import request
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
from typing import Union
from time import sleep
class WorldPostCodeScraper:
"""Scraper class for https://worldpostalcode.com/."""
@aleenprd
aleenprd / scrape_imdb_reviews_main.py
Created October 24, 2022 19:47
scrape_imdb_reviews_main
"""Python executable which scrapes IMDB for reviews."""
import argparse
import pandas as pd
from time import sleep
from tqdm import tqdm
from dependencies.general import timing
from dependencies.scrapers import ImdbReviewScraper
@aleenprd
aleenprd / ImdbReviewScraper.py
Created October 24, 2022 19:45
ImdbReviewScraper
class ImdbReviewScraper(Scraper):
"""Implements methods for scraping IMDB.
Inherited Attributes:
chromedriver (chromedriver): a Chrome webdriver for Selenium.
Own Methods:
@staticmethod get_ratings_page
@staticmethod get_reviews_page
get_episodes_links
@aleenprd
aleenprd / ImdbReviewScraper.py
Created October 24, 2022 17:51
ImdbReviewScraper
class ImdbReviewScraper(Scraper):
"""Implements methods for scraping IMDB.
Inherited Attributes:
chromedriver (chromedriver): a Chrome webdriver for Selenium.
Own Methods:
@staticmethod get_ratings_page
@staticmethod get_reviews_page
get_episodes_links
@aleenprd
aleenprd / scraper_base_class.py
Last active October 24, 2022 18:11
scraper_base_class
class ScraperException(Exception):
    """Starting point for Scraper exceptions.

    More specific scraper errors derive from this class, so an
    ``except ScraperException`` clause also handles all of its subclasses.
    """
class ImdbScraperException(ScraperException):
    """Starting point for IMDB scraper exceptions.

    Derives from ``ScraperException``, so handlers written for the generic
    scraper error also catch this one.
    """
@aleenprd
aleenprd / imports_scraper_classes.py
Last active October 24, 2022 18:12
imports_scraper_classes
# Data manipulation
import pandas as pd
import re as regex
# Scraping
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
@aleenprd
aleenprd / scrape_imdb_reviews_pages_main.py
Last active October 20, 2022 19:30
scrape_imdb_reviews_pages
@timing
def main(season_link: str, show_link: str, driver_service: Service, output_path: str) -> None:
"""Main function to scrape an IMDB season's reviews for each episode and also the general reviews.
Args:
season_link (str): URL pointing to season page.
show_link (str): URL pointing to show general reviews.
driver_service (Service): a Chrome web driver.
output_path (str): path including filename where we want to save the CSV.
"""
@aleenprd
aleenprd / scrape_reviews_page.py
Last active October 21, 2022 19:22
scrape_reviews_page
def scrape_reviews_page(reviews_soup: BeautifulSoup) -> pd.DataFrame:
"""Scrape IMDB reviews page.
Note: Extracts ratings, usernames, review date, titles, review body text,
number of reactions, total reactions to review.
Args:
reviews_soup (BeautifulSoup): soup of the entirely loaded reviews page.
Returns:
@aleenprd
aleenprd / fetch_el_if_available.py
Last active October 20, 2022 18:55
fetch_el_if_available
from typing import Union
def fetch_el_if_available(soup: BeautifulSoup, element_type: str, class_type: str) -> Union[str, None]:
"""Returns element text if found, otherwise returns None.
Args:
soup (BeautifulSoup): a bs4 soup.
element_type (str): HTML type e.g. 'div'.
class_type (str): the class of the desired element.
@aleenprd
aleenprd / get_ratings_and_reviews_page.py
Last active October 20, 2022 18:12
Get Ratings and Reviews Page
def get_ratings_page(episode_page: str, suffix: str = "/ratings/?ref_=tt_ov_rt") -> str:
    """Build a ratings-page URL from an episode page URL.

    Everything from the last "/" of ``episode_page`` onwards is discarded and
    ``suffix`` is appended to what remains.

    Example:
        >>> get_ratings_page("https://www.imdb.com/title/tt0000001/?ref_=ttep_ep1")
        'https://www.imdb.com/title/tt0000001/ratings/?ref_=tt_ov_rt'

    Args:
        episode_page (str): URL of the episode page.
        suffix (str): text appended after the trimmed URL.

    Returns:
        str: the trimmed URL followed by ``suffix``.
    """
    # When episode_page contains no "/", the head is empty and the result is
    # just the suffix.
    head, _, _ = episode_page.rpartition("/")
    return head + suffix
def get_reviews_page(episode_page: str, suffix: str = "/reviews?ref_=tt_urv") -> str:
    """Build a reviews-page URL from an episode page URL.

    Everything from the last "/" of ``episode_page`` onwards is discarded and
    ``suffix`` is appended to what remains.

    Example:
        >>> get_reviews_page("https://www.imdb.com/title/tt0000001/?ref_=ttep_ep1")
        'https://www.imdb.com/title/tt0000001/reviews?ref_=tt_urv'

    Args:
        episode_page (str): URL of the episode page.
        suffix (str): text appended after the trimmed URL.

    Returns:
        str: the trimmed URL followed by ``suffix``.
    """
    # When episode_page contains no "/", the head is empty and the result is
    # just the suffix.
    head, _, _ = episode_page.rpartition("/")
    return head + suffix