Skip to content

Instantly share code, notes, and snippets.

@aleenprd
Last active October 20, 2022 19:30
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aleenprd/090c49fd0fbb1f53afbfcc758dfcbf52 to your computer and use it in GitHub Desktop.
Save aleenprd/090c49fd0fbb1f53afbfcc758dfcbf52 to your computer and use it in GitHub Desktop.
scrape_imdb_reviews_pages
@timing
def main(season_link: str, show_link: str, driver_service: Service, output_path: str) -> None:
    """Scrape an IMDB season's reviews per episode, plus the show's general reviews.

    For each episode link found on the season page, the matching reviews page
    is loaded, parsed into a dataframe and tagged with its episode number. The
    show-level reviews are scraped the same way and tagged with episode number
    0. All dataframes are concatenated and written to a single CSV file.

    Args:
        season_link (str): URL pointing to season page.
        show_link (str): URL pointing to show general reviews.
        driver_service (Service): web driver service handed to the
            page-loading helpers (described by the author as a Chrome driver).
        output_path (str): path including filename where we want to save the CSV.

    Raises:
        ValueError: if the text after the last "ep" in an episode link is not
            an integer (raised by the ``int`` conversion below).
    """
    # One dataframe per scraped page; all are concatenated at the end.
    re_dfs = []

    episodes_links = get_episodes_links(link=season_link, driver_service=driver_service)
    print("Episodes: ", episodes_links)

    # tqdm wraps the iterable so a progress bar shows how far the scraping is.
    for ep in tqdm(episodes_links):
        reviews_page = get_reviews_page(ep)
        print("Parsing Reviews at: ", reviews_page)
        reviews_soup = scroll_reviews_and_cook_soup(
            link=reviews_page, driver_service=driver_service)
        df_temp = scrape_reviews_page(reviews_soup)
        # The episode number is taken from whatever follows the last "ep" in
        # the link. NOTE(review): assumes links end in "ep<N>" -- confirm
        # against what get_episodes_links returns.
        df_temp["episode_number"] = int(ep.split("ep")[-1])
        re_dfs.append(df_temp)
        sleep(5)  # Pause between episodes to not overwhelm the server with requests.

    # Show-level (general) reviews are stored under episode number 0.
    show_reviews_link = get_reviews_page(show_link)
    print("Parsing Reviews at: ", show_reviews_link)
    show_reviews_soup = scroll_reviews_and_cook_soup(
        link=show_reviews_link, driver_service=driver_service)
    df_temp = scrape_reviews_page(show_reviews_soup)
    df_temp["episode_number"] = 0
    re_dfs.append(df_temp)

    season_reviews_df = pd.concat(re_dfs)
    season_reviews_df.to_csv(output_path, header=True, index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment