ImdbReviewScraper
@aleenprd · Created October 24, 2022 17:51
# Imports needed by this class. Scraper and ImdbScraperException are assumed
# to live in this project's own scraper module (name is an assumption).
from time import sleep
from typing import List

import pandas as pd
import regex
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import (
    ElementClickInterceptedException,
    NoSuchElementException,
)
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

from scraper import ImdbScraperException, Scraper  # assumption: project-local module


class ImdbReviewScraper(Scraper):
    """Implements methods for scraping IMDB.

    Inherited Attributes:
        chromedriver (chromedriver): a Chrome webdriver for Selenium.

    Own Methods:
        @staticmethod get_ratings_page
        @staticmethod get_reviews_page
        get_episodes_links
        scrape_reviews_page
        scroll_reviews_and_cook_soup

    Inherited Methods:
        make_soup_with_selenium
        @staticmethod fetch_el_if_available
    """

    def __init__(self):
        driver_service = Service(ChromeDriverManager().install())
        self.chromedriver = webdriver.Chrome(service=driver_service)
    def get_episodes_links(self, link: str) -> List[str]:
        """Retrieve links to episodes from a series' season main page.

        Args:
            link (str): link to the season main page.

        Returns:
            links (List[str]): a list of links to the episodes.
        """
        soup = self.make_soup_with_selenium(link)
        website = "https://www.imdb.com"
        links = []
        for anchor in soup.find_all('a', href=True):  # renamed so it doesn't shadow the `link` argument
            href = anchor["href"]
            # Episode links end with this pattern; \d+ also matches episodes 10 and up
            if bool(regex.search(r'=ttep_ep\d+$', href)):
                links.append(href)
        links = list(set(links))  # deduplicate
        links.sort()
        links = [f"{website}{href}" for href in links]
        return links
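
    # For a hypothetical season page, the method would return links shaped like:
    #   https://www.imdb.com/title/tt0000001/?ref_=ttep_ep1
    #   https://www.imdb.com/title/tt0000002/?ref_=ttep_ep2
    # (the title IDs above are placeholders, not real IMDB entries)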
    @staticmethod
    def get_ratings_page(episode_page, suffix="/ratings/?ref_=tt_ov_rt"):
        """Turn an episode page URL into its ratings page URL."""
        return "/".join(episode_page.split("/")[:-1]) + suffix

    @staticmethod
    def get_reviews_page(episode_page, suffix="/reviews?ref_=tt_urv"):
        """Turn an episode page URL into its reviews page URL."""
        return "/".join(episode_page.split("/")[:-1]) + suffix
    def scroll_reviews_and_cook_soup(self, link: str) -> BeautifulSoup:
        """Scroll the reviews page until the end using Selenium.

        Args:
            link (str): link to page, in this case the reviews page.

        Returns:
            reviews_soup (BeautifulSoup): soup of the fully loaded page.
        """
        self.chromedriver.maximize_window()  # make sure we capture everything on display
        self.chromedriver.get(link)
        sleep(5)  # wait for the page to load
        while True:
            try:
                load_more_button = self.chromedriver.find_element(By.ID, "load-more-trigger")
                load_more_button.click()
                sleep(2)  # let it load, and be friendlier on the server
            except (NoSuchElementException, ElementClickInterceptedException):
                # At some point there is no more button to click, but the
                # browser session continues: the page has fully loaded.
                break
        page_source = self.chromedriver.page_source
        reviews_soup = BeautifulSoup(page_source, 'lxml')
        return reviews_soup
    def scrape_reviews_page(self, reviews_soup: BeautifulSoup) -> pd.DataFrame:
        """Scrape an IMDB reviews page.

        Note: Extracts ratings, usernames, review dates, titles, review body text,
        number of helpful reactions, and total reactions per review.

        Args:
            reviews_soup (BeautifulSoup): soup of the fully loaded reviews page.

        Returns:
            df_out (pd.DataFrame): a Pandas DataFrame with all of the above
                structured as columns.
        """
        # Initialize dataframe columns as empty lists to be populated
        df_out = pd.DataFrame()
        review_ratings = []
        user_names = []
        review_dates = []
        review_titles = []
        review_texts = []
        num_helpful_reactions = []
        num_total_reactions = []
        # Find all review boxes on the page so we can iterate over them
        review_boxes = reviews_soup.find_all('div', {"class": "lister-item"})
        for review in review_boxes:
            # Rating of review; append even when missing so columns stay aligned
            review_rating = Scraper.fetch_el_if_available(review, "div", "ipl-ratings-bar")
            if review_rating is not None:
                review_rating = float(review_rating.replace("\n", "").split("/")[0])
            review_ratings.append(review_rating)
            # User name
            user_name_and_date = Scraper.fetch_el_if_available(review, "div", "display-name-date")
            if user_name_and_date is not None:
                user_name_and_date = user_name_and_date.replace("\n", "").split(" ")
                user_names.append(user_name_and_date[0])
            else:
                user_names.append(None)
            # Review date
            review_date = Scraper.fetch_el_if_available(review, "span", "review-date")
            if review_date is not None:
                review_date = review_date.replace("\n", "").strip()
            review_dates.append(review_date)
            # Title of review
            review_title = Scraper.fetch_el_if_available(review, "a", "title")
            if review_title is not None:
                review_title = review_title.replace("\n", "")
            review_titles.append(review_title)
            # Text of review
            review_text = Scraper.fetch_el_if_available(review, "div", "text")
            if review_text is not None:
                review_text = review_text.replace("\n", "")
            review_texts.append(review_text)
            # Review reactions, e.g. "12 out of 34 found this helpful"
            reactions = Scraper.fetch_el_if_available(review, "div", "actions")
            if reactions is not None:
                reactions = reactions.replace("\n", "").strip().split(" ")
                num_helpful_reactions.append(float(reactions[0].replace(",", "")))
                num_total_reactions.append(float(reactions[3].replace(",", "")))
            else:
                num_helpful_reactions.append(None)
                num_total_reactions.append(None)
df_out["review_rating"] = review_ratings
df_out["user_name"] = user_names
df_out["review_date"] = review_dates
df_out["review_title"] = review_titles
df_out["review_text"] = review_texts
df_out["num_helpful_reactions"] = num_helpful_reactions
df_out["num_total_reactions"] = num_total_reactions
return df_out
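

# Usage sketch (an assumption of typical use, not part of the gist): it relies
# on the Scraper base class providing make_soup_with_selenium and
# fetch_el_if_available, and the season URL below is a hypothetical placeholder.
if __name__ == "__main__":
    scraper = ImdbReviewScraper()
    episode_links = scraper.get_episodes_links(
        "https://www.imdb.com/title/tt0000001/episodes?season=1"  # placeholder title ID
    )
    for episode_link in episode_links:
        reviews_page = ImdbReviewScraper.get_reviews_page(episode_link)
        reviews_soup = scraper.scroll_reviews_and_cook_soup(reviews_page)
        df_reviews = scraper.scrape_reviews_page(reviews_soup)
        print(df_reviews.head())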