Skip to content

Instantly share code, notes, and snippets.

@aleenprd
Last active October 21, 2022 19:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aleenprd/981194b2197a462161d66680525a74c1 to your computer and use it in GitHub Desktop.
Save aleenprd/981194b2197a462161d66680525a74c1 to your computer and use it in GitHub Desktop.
scrape_reviews_page
def scrape_reviews_page(reviews_soup: BeautifulSoup) -> pd.DataFrame:
    """Scrape IMDB reviews page.

    Note: Extracts ratings, usernames, review date, titles, review body text,
    number of helpful reactions, total reactions to review. A field that is
    missing from a review box is stored as None, so every column always has
    exactly one entry per review box.

    Args:
        reviews_soup (BeautifulSoup): soup of the entirely loaded reviews page.

    Returns:
        df_out (pd.DataFrame): a Pandas DataFrame with one row per review box
            and the columns review_rating, user_name, review_date,
            review_title, review_text, num_helpful_reactions and
            num_total_reactions.
    """
    # Initialize dataframe columns as empty lists to be populated
    review_ratings = []
    user_names = []
    review_dates = []
    review_titles = []
    review_texts = []
    num_helpful_reactions = []
    num_total_reactions = []

    # NOTE(review): fetch_el_if_available is defined outside this block; the
    # code below assumes it returns the matched element's text, or None when
    # the element is absent -- confirm against its definition.

    # Find all review boxes on the page so we can iterate over them
    review_boxes = reviews_soup.find_all("div", {"class": "lister-item"})
    for review in review_boxes:
        # Rating of review: keep only the number in front of the "/".
        review_rating = fetch_el_if_available(review, "div", "ipl-ratings-bar")
        if review_rating is not None:
            review_rating = float(review_rating.replace("\n", "").split("/")[0])
        review_ratings.append(review_rating)

        # User name: first space-separated token of the name/date element.
        user_name_and_date = fetch_el_if_available(
            review, "div", "display-name-date"
        )
        if user_name_and_date is not None:
            user_name_and_date = user_name_and_date.replace("\n", "").split(" ")
            user_names.append(user_name_and_date[0])
        else:
            user_names.append(None)

        # Review date
        review_date = fetch_el_if_available(review, "span", "review-date")
        if review_date is not None:
            review_date = review_date.replace("\n", "").strip()
        review_dates.append(review_date)

        # Title of review
        review_title = fetch_el_if_available(review, "a", "title")
        if review_title is not None:
            review_title = review_title.replace("\n", "")
        review_titles.append(review_title)

        # Text of review. The guard must test review_text itself: testing the
        # title (as the code previously did) crashes on a review that has a
        # title but no text, and leaves the text uncleaned when the title is
        # missing.
        review_text = fetch_el_if_available(review, "div", "text")
        if review_text is not None:
            review_text = review_text.replace("\n", "")
        review_texts.append(review_text)

        # Review reactions: the 1st and 4th space-separated tokens are read as
        # the helpful and total counts (thousands separators removed).
        helpful_count = None
        total_count = None
        reactions = fetch_el_if_available(review, "div", "actions")
        if reactions is not None:
            tokens = reactions.replace("\n", "").strip().split(" ")
            try:
                helpful_count = float(tokens[0].replace(",", ""))
                total_count = float(tokens[3].replace(",", ""))
            except (IndexError, ValueError):
                # Unexpected wording: record both counts as missing rather
                # than aborting the scrape of the whole page.
                helpful_count = None
                total_count = None
        num_helpful_reactions.append(helpful_count)
        num_total_reactions.append(total_count)

    # Every list received exactly one append per review box, so the columns
    # are guaranteed to have equal length.
    df_out = pd.DataFrame()
    df_out["review_rating"] = review_ratings
    df_out["user_name"] = user_names
    df_out["review_date"] = review_dates
    df_out["review_title"] = review_titles
    df_out["review_text"] = review_texts
    df_out["num_helpful_reactions"] = num_helpful_reactions
    df_out["num_total_reactions"] = num_total_reactions
    return df_out
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment