WordPress Plugin Reviews Scraper
# Script: WordPress Plugin Reviews Scraper
# Author: Frank Corso
# Date Created: 08/16/2019
# Last Modified: 05/23/2020
# Python Version: 3.6.5
# There are waits built in to avoid putting strain on the wp.org servers.
# However, any amount of scraping adds to server load.
# So, please avoid running this script on plugins with lots of reviews
# and try to only run it once, not on any repeating schedule.
# To run, download as `plugin-reviews.py`. Run `python plugin-reviews.py`
# and follow prompts for filename and plugin slug.
# For example, `wp-health-reviews.csv` and `my-wp-health-check`
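# Requires the `requests` and `beautifulsoup4` packages, which can be
# installed with `pip install requests beautifulsoup4`.
# A sample session, using the example values above, would look like:
#   What is the name for the file to create? wp-health-reviews.csv
#   What is the plugin slug? my-wp-health-check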
import csv
import datetime
import os
import requests
from bs4 import BeautifulSoup
from time import sleep
# Identifies this scraper to the wp.org servers and requests fresh, uncached responses.
headers = {'user-agent': 'frank-corso-scripts', 'Cache-Control': 'no-cache', 'cookie': ''}
def clear_screen():
    """Clears the screen"""
    os.system("cls" if os.name == 'nt' else 'clear')
def main_loop():
    """The main function."""
    # Prepares our CSV file.
    # input() always returns a string, so only emptiness needs checking.
    csv_name = input('What is the name for the file to create? ')
    if len(csv_name) == 0:
        raise ValueError("Name must be a string with at least one character.")
    fh = open(csv_name, mode="a", newline='', encoding='utf-8')
    csv_writer = csv.writer(fh)
    csv_writer.writerow(["Title", "Review", "User", "Number of Stars", "Date", "URL", "Tags"])

    # Gets the reviews one page at a time.
    slug = input('What is the plugin slug? ')
    reviews_url = get_reviews_url(slug)
    page = 1
    max_page = 10
    try:
        while page < max_page + 1:
            reviews = get_reviews(reviews_url, page)

            # Saves reviews to file.
            print("Saving current page of reviews to file...")
            for review in reviews:
                csv_writer.writerow(review)
            print("Waiting for a few seconds...")
            sleep(10)
            page += 1
    except ValueError:
        print("Found end at page {}".format(page))
    finally:
        # Closes the file even if an unexpected error occurs.
        fh.close()
def get_reviews(reviews_url, page=1):
    """Crawls the reviews pages and returns a reviews list."""
    reviews = []
    crawl_url = reviews_url
    print("Getting page {}".format(page))
    if page != 1:
        crawl_url += "page/{}/".format(page)

    # Makes sure the URL is to a real plugin.
    try:
        print("Getting reviews on: {}".format(crawl_url))
        r = requests.get(crawl_url, headers=headers, timeout=30)
        r.raise_for_status()
    except requests.exceptions.RequestException:
        raise ValueError("Page not found")

    # Makes sure we are dealing with HTML.
    if 'html' not in r.headers.get('Content-Type', ''):
        raise ValueError("Response is not HTML")
    soup = BeautifulSoup(r.text, 'html.parser')
    elements = soup.find_all('a', {'class': 'bbp-topic-permalink'})
    if len(elements) == 0:
        raise ValueError("No reviews found")

    # Cycles through each review link.
    for tag in elements:
        # Skips any link without an href attribute.
        if not tag.has_attr('href'):
            continue
        try:
            reviews.append(get_review(tag['href']))
        except ValueError as error:
            print("Error '{}' while trying to get and save review at: {}".format(error, tag['href']))
        sleep(5)
    return reviews
def get_review(review_url):
    """Gets the review content from an individual review."""
    title = ''
    review = ''
    stars = ''
    user = ''
    date = ''
    tags = []

    # Makes sure the URL is to a real review.
    try:
        print("Getting review at: {}".format(review_url))
        r = requests.get(review_url, headers=headers, timeout=30)
        r.raise_for_status()
    except requests.exceptions.RequestException:
        raise ValueError("Page not found")

    # Makes sure we are dealing with HTML.
    if 'html' not in r.headers.get('Content-Type', ''):
        raise ValueError("Response is not HTML")
    soup = BeautifulSoup(r.text, 'html.parser')

    # Gets basic data.
    title = soup.find('h1', {'class': 'page-title'}).text
    post_content = soup.find('div', {'class': 'bbp-topic-content'}).contents
    if len(post_content) > 1:
        review = post_content[1].text
    user = soup.find('p', {'class': 'bbp-user-nicename'}).text
    # Each filled star is a separate element, so the star count is the number of matches.
    stars = soup.find_all('span', {'class': 'dashicons-star-filled'})

    # Gets tags, if any.
    tag_list = soup.find('ul', {'class': 'topic-tags'})
    if tag_list is not None:
        for tag in tag_list.contents:
            tags.append(tag.text)
    # Gets the date string (formatted similar to "X months, X days ago").
    date_string = soup.find('a', {'class': 'bbp-topic-permalink'}).text

    # Begins converting the date string to an actual date.
    date_deltas = date_string.split(',')
    time_deltas = {}
    for dateframe_string in date_deltas:
        dateframe = dateframe_string.split()
        # Normalizes singular units ("year", "day", etc.) to the plural
        # keyword names that datetime.timedelta expects.
        unit = dateframe[1]
        if not unit.endswith('s'):
            unit += 's'
        time_deltas[unit] = int(dateframe[0])

    # Switches months to additional days since timedelta has no months.
    if 'months' in time_deltas.keys():
        extra_days = time_deltas['months'] * 30.47
        time_deltas['days'] = time_deltas.get('days', 0) + extra_days
        del time_deltas['months']

    # Switches years to additional days since timedelta has no years.
    if 'years' in time_deltas.keys():
        extra_days = time_deltas['years'] * 365.24
        time_deltas['days'] = time_deltas.get('days', 0) + extra_days
        del time_deltas['years']

    # Gets the actual date by creating a timedelta and subtracting the diff from today.
    diff = datetime.timedelta(**time_deltas)
    review_datetime = datetime.datetime.now() - diff
    date = review_datetime.strftime("%m/%d/%Y")
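    # Worked example: a date string of "2 years, 3 months ago" parses to
    # {'years': 2, 'months': 3}, which converts to {'days': 3 * 30.47 + 2 * 365.24}
    # = {'days': 821.89}, so the review date lands roughly 822 days before today.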
    return title, review, user, len(stars), date, review_url, ','.join(tags)
def get_reviews_url(plugin_slug=''):
    """Gets the URL for the reviews for the plugin."""
    if isinstance(plugin_slug, str):
        return "https://wordpress.org/support/plugin/{}/reviews/".format(plugin_slug)
    else:
        raise ValueError("Plugin slug not a string!")
if __name__ == '__main__':
    clear_screen()
    main_loop()