Scrapes discussions from a WordPress forum and exports the URLs to CSV files
"""
This script scrapes discussions from the Yoast SEO plugin's public forum on
WordPress (https://wordpress.org/support/plugin/wordpress-seo/). The script
fetches discussion titles and URLs from all available pages.
The script also features lazy loading by adding a delay between requests. This
helps ensure responsible web scraping by not overwhelming the server.
The fetched URLs are then saved to multiple CSV files, each containing a
specified number of URLs (default is 500). The CSV files are saved in the 'dist/'
directory without any headers.
To display the progress of fetching discussions, a progress bar is shown using
the tqdm library.
Libraries used in this script:
- requests
- BeautifulSoup (bs4)
- pandas
- tqdm
- time
- os
"""
import os
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

base_url = "https://wordpress.org/support/plugin/wordpress-seo"
page_url_template = "https://wordpress.org/support/plugin/wordpress-seo/page/{}/"

def get_last_page_number():
    response = requests.get(base_url)
    soup = BeautifulSoup(response.text, "html.parser")
    page_links = soup.select("a.page-numbers")
    last_page_link = page_links[-2].get("href")  # Second-to-last link holds the last page number
    return int(last_page_link.split("/")[-2])

def get_discussions_from_page(page_number):
    page_url = page_url_template.format(page_number)
    response = requests.get(page_url)
    soup = BeautifulSoup(response.text, "html.parser")
    discussions = []

    for discussion in soup.select("ul.topic li.bbp-topic-title"):
        title = discussion.select_one("a.bbp-topic-permalink").text
        url = discussion.select_one("a.bbp-topic-permalink").get("href")
        discussions.append({"title": title, "url": url})

    time.sleep(1)  # Delay of 1 second between requests to avoid overwhelming the server

    return discussions

def scrape_all_discussions():
    last_page_number = get_last_page_number()
    all_discussions = []

    for page_number in tqdm(range(1, last_page_number + 1), desc="Scraping pages"):
        discussions = get_discussions_from_page(page_number)
        all_discussions.extend(discussions)

    return all_discussions

def save_urls_to_csvs_with_pandas(urls, urls_per_file=800):
    df = pd.DataFrame(urls)
    url_count = len(df)

    for i in range(0, url_count, urls_per_file):
        chunk = df.iloc[i : i + urls_per_file]
        file_name = f"wp_forum_sitemap_urls_chunk_{(i // urls_per_file) + 1}.csv"
        chunk.to_csv(f"dist/{file_name}", columns=["url"], index=False, header=False)

if __name__ == "__main__":
    os.makedirs("dist", exist_ok=True)
    discussions = scrape_all_discussions()
    save_urls_to_csvs_with_pandas(discussions)
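
For reference, a minimal sketch of reading the exported chunks back into a single pandas DataFrame, assuming the files in dist/ were produced by the script above; header=None with an explicit column name is needed because the CSVs are written without a header row.

import glob

import pandas as pd

# Collect every chunk file written to dist/ by save_urls_to_csvs_with_pandas().
# Each file contains one unnamed column of URLs, so the column name is supplied here.
chunk_files = glob.glob("dist/wp_forum_sitemap_urls_chunk_*.csv")
urls = pd.concat(
    (pd.read_csv(path, header=None, names=["url"]) for path in chunk_files),
    ignore_index=True,
)
print(f"Loaded {len(urls)} URLs from {len(chunk_files)} files")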