Last active
December 26, 2023 05:40
-
-
Save fpcorso/a14ffd982ca62b172cef3c80f24c86b9 to your computer and use it in GitHub Desktop.
WordPress Plugin Reviews Scraper
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Script: WordPress Plugin Reviews Scraper
# Author: Frank Corso
# Date Created: 08/16/2019
# Last Modified: 05/23/2020
# Python Version: 3.6.5
# There are waits built in to avoid putting strain on the wp.org servers.
# However, any amount of scraping adds to server load.
# So, please avoid running this script on plugins with lots of reviews
# and try to only run this script once and not on any repeating schedule.
# To run, download as `plugin-reviews.py`. Run `python plugin-reviews.py`
# and follow prompts for filename and plugin slug.
# For example, `wp-health-reviews.csv` and `my-wp-health-check`
import csv | |
import datetime | |
import os | |
import requests | |
from bs4 import BeautifulSoup | |
from time import sleep | |
# Request headers sent with every wp.org fetch: identify the script via a
# custom user-agent and ask intermediaries not to serve cached pages.
headers = {
    'user-agent': 'frank-corso-scripts',
    'Cache-Control': 'no-cache',
    'cookie': '',
}
def clear_screen():
    """Clears the terminal window (`cls` on Windows, `clear` elsewhere)."""
    command = "cls" if os.name == 'nt' else 'clear'
    os.system(command)
def main_loop():
    """Prompts for a filename and plugin slug, then scrapes reviews to a CSV.

    Asks the user for the output CSV filename and the wp.org plugin slug,
    writes a header row, then pages through the plugin's review listing
    (up to 10 pages), appending one row per review. Pauses between pages
    to limit load on the wp.org servers. Stops early when a page cannot
    be fetched or contains no reviews.

    Raises:
        ValueError: If the entered filename is empty.
    """
    csv_name = input('What is the name for the file to create? ')
    # input() always returns str, so only emptiness needs checking.
    if not csv_name:
        raise ValueError("Name must be a string with at least one character.")
    slug = input('What is the plugin slug? ')
    reviews_url = get_reviews_url(slug)
    # 'with' guarantees the file is closed even if an unexpected error
    # occurs mid-scrape (the original leaked the handle in that case).
    with open(csv_name, mode="a", newline='', encoding='utf-8') as fh:
        csv_writer = csv.writer(fh)
        csv_writer.writerow(["Title", "Review", "User", "Number of Stars", "Date", "URL", "Tags"])
        page = 1
        max_page = 10
        try:
            while page <= max_page:
                reviews = get_reviews(reviews_url, page)
                # Saves reviews to file.
                print("Saving current page of reviews to file...")
                for review in reviews:
                    csv_writer.writerow(review)
                # Wait between pages to avoid hammering wp.org.
                print("Waiting for a few seconds...")
                sleep(10)
                page += 1
        except ValueError:
            # get_reviews raises ValueError when a page 404s or is empty:
            # that marks the end of the review listing.
            print("Found end at {}".format(page))
def get_reviews(reviews_url, page=1):
    """Crawls one page of the reviews listing and returns its reviews.

    Args:
        reviews_url: Base URL of the plugin's review listing.
        page: 1-based page number; pages beyond 1 append "page/N/".

    Returns:
        A list of review tuples as produced by get_review(); reviews whose
        individual fetch fails are logged and skipped.

    Raises:
        ValueError: If the page cannot be fetched, is not HTML, or
            contains no review links.
    """
    reviews = []
    crawl_url = reviews_url
    print("Getting page {}".format(page))
    if page != 1:
        crawl_url += "page/{}/".format(page)
    # Makes sure the URL is to a real plugin.
    try:
        print("Getting reviews on: {}".format(crawl_url))
        r = requests.get(crawl_url, headers=headers, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit
        # are not swallowed and re-raised as "Page not found".
        raise ValueError("Page not found")
    # Only parse HTML responses; the original left `soup` undefined here
    # (NameError) and crashed with KeyError when Content-Type was absent.
    if 'html' not in r.headers.get('Content-Type', ''):
        raise ValueError("No reviews found")
    soup = BeautifulSoup(r.text, 'html.parser')
    elements = soup.find_all('a', {'class': 'bbp-topic-permalink'})
    if len(elements) == 0:
        raise ValueError("No reviews found")
    # Cycles through each review link.
    for tag in elements:
        # Skip anchors without an href attribute.
        if not tag.has_attr('href'):
            continue
        try:
            reviews.append(get_review(tag['href']))
        except ValueError as error:
            # One bad review should not abort the whole page.
            print("Error {} trying to get and save review at: {}".format(error, tag['href']))
        # Wait between individual review fetches to limit server load.
        sleep(5)
    return reviews
def get_review(review_url):
    """Gets the review content from an individual review page.

    Args:
        review_url: URL of a single review topic on wordpress.org.

    Returns:
        A tuple of (title, review text, user, star count, date string,
        review_url, comma-joined tags). The date is approximate, derived
        from the page's relative timestamp (e.g. "2 months, 3 days ago").

    Raises:
        ValueError: If the page cannot be fetched or is not HTML.
    """
    review = ''
    tags = []
    # Makes sure the URL is to a real review.
    try:
        print("Getting review at: {}".format(review_url))
        r = requests.get(review_url, headers=headers, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Narrowed from a bare except so real interrupts propagate.
        raise ValueError("Page not found")
    # Only parse HTML responses; the original left `soup` undefined for
    # non-HTML (NameError) and crashed if Content-Type was missing.
    if 'html' not in r.headers.get('Content-Type', ''):
        raise ValueError("Page not found")
    soup = BeautifulSoup(r.text, 'html.parser')
    # Gets basic data. NOTE(review): these .find() calls assume the page
    # always has these elements; a layout change would raise AttributeError.
    title = soup.find('h1', {'class': 'page-title'}).text
    post_content = soup.find('div', {'class': 'bbp-topic-content'}).contents
    if len(post_content) > 1:
        review = post_content[1].text
    user = soup.find('p', {'class': 'bbp-user-nicename'}).text
    stars = soup.find_all('span', {'class': 'dashicons-star-filled'})
    # Gets tags, if any.
    tag_list = soup.find('ul', {'class': 'topic-tags'})
    if tag_list is not None:
        for tag in tag_list.contents:
            tags.append(tag.text)
    # Gets the date string (formatted similar to "X months, X days ago").
    date_string = soup.find('a', {'class': 'bbp-topic-permalink'}).text
    date = _relative_date_to_string(date_string)
    return title, review, user, len(stars), date, review_url, ','.join(tags)


def _relative_date_to_string(date_string):
    """Converts a relative timestamp like "1 year, 2 months ago" to mm/dd/YYYY."""
    time_deltas = {}
    for part in date_string.split(','):
        pieces = part.split()
        count, unit = int(pieces[0]), pieces[1]
        # Normalize any singular unit to its plural timedelta keyword.
        # This also covers "second(s)", which the original if-chain missed
        # and which made timedelta(**...) raise TypeError for new reviews.
        if not unit.endswith('s'):
            unit += 's'
        time_deltas[unit] = count
    # timedelta has no months/years keywords, so approximate both as days.
    if 'months' in time_deltas:
        time_deltas['days'] = time_deltas.get('days', 0) + time_deltas.pop('months') * 30.47
    if 'years' in time_deltas:
        time_deltas['days'] = time_deltas.get('days', 0) + time_deltas.pop('years') * 365.24
    # Gets actual date by creating timedelta and subtracting diff from today.
    diff = datetime.timedelta(**time_deltas)
    review_datetime = datetime.datetime.now() - diff
    return review_datetime.strftime("%m/%d/%Y")
def get_reviews_url(plugin_slug=''):
    """Builds the wordpress.org review-listing URL for a plugin slug.

    Raises:
        ValueError: If plugin_slug is not a string.
    """
    # Guard clause: reject non-string slugs up front.
    if not isinstance(plugin_slug, str):
        raise ValueError("Plugin slug not a string!")
    return "https://wordpress.org/support/plugin/{}/reviews/".format(plugin_slug)
# Entry point: clear the terminal, then run the interactive scraper.
if __name__ == '__main__':
    clear_screen()
    main_loop()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment