Scrapes discussions from a WordPress.org support forum and exports the URLs to CSV files
""" | |
This script scrapes discussions from the Yoast SEO plugin's public forum on | |
WordPress (https://wordpress.org/support/plugin/wordpress-seo/). The script | |
fetches discussion titles and URLs from all available pages. | |
The script also features lazy loading by adding a delay between requests. This | |
helps ensure responsible web scraping by not overwhelming the server. | |
The fetched URLs are then saved to multiple CSV files, each containing a | |
specified number of URLs (default is 500). The CSV files are saved in the 'dist/' | |
directory without any headers. | |
To display the progress of fetching discussions, a progress bar is shown using | |
the tqdm library. | |
Libraries used in this script: | |
- requests | |
- BeautifulSoup (bs4) | |
- pandas | |
- tqdm | |
- time | |
- os | |
""" | |
import os
import time

import requests
import pandas as pd
from bs4 import BeautifulSoup
from tqdm import tqdm

# Forum landing page and the template used to build per-page URLs.
base_url = "https://wordpress.org/support/plugin/wordpress-seo"
page_url_template = "https://wordpress.org/support/plugin/wordpress-seo/page/{}/"
def get_last_page_number():
    """Return the number of the last page in the forum's topic list."""
    response = requests.get(base_url)
    soup = BeautifulSoup(response.text, "html.parser")
    page_links = soup.select("a.page-numbers")
    if not page_links:
        return 1  # No pagination links rendered: only a single page of topics
    last_page_link = page_links[-2].get("href")  # Second-to-last link points at the last page
    return int(last_page_link.split("/")[-2])
def get_discussions_from_page(page_number):
    """Fetch the title and URL of every discussion listed on a single forum page."""
    page_url = page_url_template.format(page_number)
    response = requests.get(page_url)
    soup = BeautifulSoup(response.text, "html.parser")
    discussions = []
    for discussion in soup.select("ul.topic li.bbp-topic-title"):
        permalink = discussion.select_one("a.bbp-topic-permalink")
        discussions.append({"title": permalink.text, "url": permalink.get("href")})
    time.sleep(1)  # Throttle: wait one second between page requests
    return discussions
def scrape_all_discussions():
    """Scrape every page of the forum and return a list of all discussions found."""
    last_page_number = get_last_page_number()
    all_discussions = []
    for page_number in tqdm(range(1, last_page_number + 1), desc="Scraping pages"):
        discussions = get_discussions_from_page(page_number)
        all_discussions.extend(discussions)
    return all_discussions
def save_urls_to_csvs_with_pandas(urls, urls_per_file=800):
    """Write the scraped URLs to CSV files in 'dist/', urls_per_file rows per file."""
    df = pd.DataFrame(urls)
    url_count = len(df)
    for i in range(0, url_count, urls_per_file):
        chunk = df.iloc[i : i + urls_per_file]
        file_name = f"wp_forum_sitemap_urls_chunk_{(i // urls_per_file) + 1}.csv"
        chunk.to_csv(f"dist/{file_name}", columns=["url"], index=False, header=False)
if __name__ == "__main__": | |
os.makedirs("dist", exist_ok=True) | |
discussions = scrape_all_discussions() | |
save_urls_to_csvs_with_pandas(discussions) |
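
For reference, here is a minimal sketch of how the exported chunks could be read back and combined for a quick sanity check. It assumes the script above has already been run, so the CSV files sit in the 'dist/' directory under the 'wp_forum_sitemap_urls_chunk_*.csv' naming used by save_urls_to_csvs_with_pandas; this helper is illustrative and not part of the original gist.

# Hypothetical helper (not part of the original gist): read the exported
# chunk files back with pandas and report how many URLs were collected.
import glob

import pandas as pd

chunk_paths = sorted(glob.glob("dist/wp_forum_sitemap_urls_chunk_*.csv"))
frames = [pd.read_csv(path, header=None, names=["url"]) for path in chunk_paths]
combined = pd.concat(frames, ignore_index=True)

print(f"{len(chunk_paths)} chunk files, {len(combined)} URLs in total")

If a different chunk size is needed, pass it via the urls_per_file parameter of save_urls_to_csvs_with_pandas when calling the script.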