Scrapes discussions from a WordPress.org support forum and exports the URLs to CSV files
""" | |
This script scrapes discussions from the Yoast SEO plugin's public forum on | |
WordPress (https://wordpress.org/support/plugin/wordpress-seo/). The script | |
fetches discussion titles and URLs from all available pages. | |
The script also features lazy loading by adding a delay between requests. This | |
helps ensure responsible web scraping by not overwhelming the server. | |
The fetched URLs are then saved to multiple CSV files, each containing a | |
specified number of URLs (default is 500). The CSV files are saved in the 'dist/' | |
directory without any headers. | |
To display the progress of fetching discussions, a progress bar is shown using | |
the tqdm library. | |
Libraries used in this script: | |
- requests | |
- BeautifulSoup (bs4) | |
- pandas | |
- tqdm | |
- time | |
- os | |
""" | |
import os
import time

import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

# Forum landing page and the template used to build per-page URLs.
base_url = "https://wordpress.org/support/plugin/wordpress-seo"
page_url_template = "https://wordpress.org/support/plugin/wordpress-seo/page/{}/"
def get_last_page_number():
    """Return the number of the last page in the forum's topic list."""
    response = requests.get(base_url)
    soup = BeautifulSoup(response.text, "html.parser")
    page_links = soup.select("a.page-numbers")
    if not page_links:
        return 1  # No pagination links rendered: only a single page of topics
    last_page_link = page_links[-2].get("href")  # Second-to-last link points at the last page
    return int(last_page_link.split("/")[-2])
def get_discussions_from_page(page_number):
    """Fetch the title and URL of every discussion listed on a single forum page."""
    page_url = page_url_template.format(page_number)
    response = requests.get(page_url)
    soup = BeautifulSoup(response.text, "html.parser")
    discussions = []
    for discussion in soup.select("ul.topic li.bbp-topic-title"):
        permalink = discussion.select_one("a.bbp-topic-permalink")
        discussions.append({"title": permalink.text, "url": permalink.get("href")})
    time.sleep(1)  # Throttle: wait one second between page requests
    return discussions
def scrape_all_discussions():
    """Scrape every page of the forum and return a list of all discussions found."""
    last_page_number = get_last_page_number()
    all_discussions = []
    for page_number in tqdm(range(1, last_page_number + 1), desc="Scraping pages"):
        discussions = get_discussions_from_page(page_number)
        all_discussions.extend(discussions)
    return all_discussions
def save_urls_to_csvs_with_pandas(urls, urls_per_file=800):
    """Write the scraped URLs to CSV files in 'dist/', urls_per_file rows per file."""
    df = pd.DataFrame(urls)
    url_count = len(df)
    for i in range(0, url_count, urls_per_file):
        chunk = df.iloc[i : i + urls_per_file]
        file_name = f"wp_forum_sitemap_urls_chunk_{(i // urls_per_file) + 1}.csv"
        chunk.to_csv(f"dist/{file_name}", columns=["url"], index=False, header=False)
if __name__ == "__main__": | |
os.makedirs("dist", exist_ok=True) | |
discussions = scrape_all_discussions() | |
save_urls_to_csvs_with_pandas(discussions) |
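
For reference, here is a minimal sketch of how the exported chunks could be read back and combined for a quick sanity check. It assumes the script above has already been run, so the CSV files sit in the 'dist/' directory under the 'wp_forum_sitemap_urls_chunk_*.csv' naming used by save_urls_to_csvs_with_pandas; this helper is illustrative and not part of the original gist.

# Hypothetical helper (not part of the original gist): read the exported
# chunk files back with pandas and report how many URLs were collected.
import glob

import pandas as pd

chunk_paths = sorted(glob.glob("dist/wp_forum_sitemap_urls_chunk_*.csv"))
frames = [pd.read_csv(path, header=None, names=["url"]) for path in chunk_paths]
combined = pd.concat(frames, ignore_index=True)

print(f"{len(chunk_paths)} chunk files, {len(combined)} URLs in total")

If a different chunk size is needed, pass it via the urls_per_file parameter of save_urls_to_csvs_with_pandas when calling the script.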