Last active
April 7, 2024 20:35
-
-
Save m-radzikowski/00d92b412ae3f6f944b2eed116b9aed8 to your computer and use it in GitHub Desktop.
Migrate comments from WordPress wpDiscuz (https://wpdiscuz.com/) to Giscus (https://giscus.app/) powered by GitHub Discussions
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import hashlib | |
import re | |
from time import sleep | |
from urllib.parse import urlparse | |
import requests | |
from bs4 import BeautifulSoup | |
from gql import gql, Client | |
from gql.transport.aiohttp import AIOHTTPTransport | |
from markdownify import markdownify as md | |
""" | |
This script migrates comments from WordPress wpDiscuz plugin (https://wpdiscuz.com/) | |
to Giscus (https://giscus.app/) powered by GitHub Discussions. | |
It fetches the WordPress page, parses it to extract comments, | |
and creates new discussions in the GitHub repository using GitHub GraphQL API. | |
This is a one-time script that should be run only once for each post. | |
Otherwise, it will create duplicate discussions and comments. | |
The script is quick and dirty, so it may not handle all edge cases. | |
Review the parsed comments in dry mode first. | |
The script assumes the Giscus discussion mapping mode "pathname". | |
It works with (or without) "strict title matching" option. | |
To run: | |
1. Install the required packages: `pip install requests beautifulsoup4 markdownify gql[all]` | |
2. Set the configuration parameters below. | |
3. Run the script: `python3 migrate-comments.py` | |
Note that the GitHub API has rate limits. The script sleeps for 1 second after each comment creation
to avoid hitting them, but the limits may still be reached if there are too many comments to migrate.
In that case, you can delete the half-migrated discussion and run the script again for the given post.
See more on the rate limits here: https://docs.github.com/en/graphql/overview/rate-limits-and-node-limits-for-the-graphql-api | |
""" | |
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------

# Dry-run switch: when enabled, no discussions or comments are created and
# the parsed comments are written to the console instead.
dryrun = False

# GitHub Personal Access Token with "Discussions" read and write permissions
# on the comments repository; create one at https://github.com/settings/tokens
token = ""

# GitHub repository ID and discussions category ID; both must match the
# parameters used in the Giscus script.
repository_id = ""  # "data-repo-id" attribute in the Giscus script
category_id = ""  # "data-category-id" attribute in the Giscus script

# WordPress post URLs to migrate.
posts = [
    # "https://example.com/wordpress-to-github-discussions-migration/"
]

# ---------------------------------------------------------------------------
# End of configuration
# ---------------------------------------------------------------------------

# Start from the oldest post (assumes the list above is ordered newest-first).
posts.reverse()

# GraphQL client used for every GitHub API call, authenticated with `token`.
transport = AIOHTTPTransport(
    url="https://api.github.com/graphql",
    headers={"Authorization": f"Bearer {token}"},
)
client = Client(transport=transport, fetch_schema_from_transport=True)
def main():
    """Migrate every URL listed in ``posts``, one post at a time."""
    for post_url in posts:
        migrate_post(post_url)
def migrate_post(url):
    """Migrate the wpDiscuz comments of one WordPress post to a GitHub discussion.

    Fetches the page at ``url``, extracts the top-level comments and their
    replies with ``get_comments``, then recreates them through
    ``create_discussion`` and ``create_comment``. In dry-run mode the comment
    texts are printed; the ``create_*`` helpers return without calling the API.

    A post without any comments is skipped and no discussion is created.

    Args:
        url: Full URL of the WordPress post.

    Raises:
        SystemExit: With status 1 when the page does not return HTTP 200.
    """
    # A timeout keeps a single unresponsive server from hanging the whole run.
    page = requests.get(url, timeout=30)
    if page.status_code != 200:
        print('Failed to fetch the page')
        # The bare `exit()` helper is injected by the `site` module and is not
        # guaranteed to exist; raising SystemExit always works.
        raise SystemExit(1)

    soup = BeautifulSoup(page.content, 'html.parser')

    # Tolerate pages that lack a heading or an og:description meta tag
    # instead of crashing on a None lookup.
    heading = soup.find("h1")
    title = heading.get_text().strip() if heading is not None else url
    meta = soup.find("meta", property="og:description")
    description = meta.get("content", "") if meta is not None else ""

    print(f"# Post: {title}")

    # The thread container is absent when the page has no comment list.
    thread = soup.find("div", class_="wpd-thread-list")
    comments = get_comments(thread, deep=False) if thread is not None else []
    if not comments:
        # Nothing to migrate, so do not create an empty discussion.
        return

    discussion_id = create_discussion(url, description)

    for element, text in comments:
        if dryrun:
            print("\n" + text)

        comment_id = create_comment(discussion_id, text, None)

        # All nested comments are attached as replies to the top-level comment.
        for _, reply_text in get_comments(element, deep=True):
            if dryrun:
                print("----------\n")
                print(reply_text)

            create_comment(discussion_id, reply_text, comment_id)

        if dryrun:
            print("--------------------\n--------------------")
def get_comments(parent, deep: bool):
    """Extract comments found under ``parent`` and render each as Markdown.

    Args:
        parent: Parsed HTML element that is searched for ``div.comment`` nodes.
        deep: Passed to ``find_all`` as ``recursive``; ``False`` restricts the
            search to direct children, ``True`` searches all descendants.

    Returns:
        A list of ``(element, text)`` tuples, where ``element`` is the comment
        node and ``text`` is the Markdown to post: a header with author and
        date, the comment content, and a reactions line when the vote result
        is non-zero.
    """
    results = []
    for node in parent.find_all("div", class_="comment", recursive=deep):
        who = node.find("div", class_="wpd-comment-author").get_text().strip()

        # Only the first three space-separated parts of the title are kept.
        raw_date = node.find("div", class_="wpd-comment-date")["title"]
        when = " ".join(raw_date.split(" ")[:3])

        body = convert_to_markdown(
            node.find("div", class_="wpd-comment-text").prettify().strip()
        )

        score = int(node.find("div", class_="wpd-vote-result").get_text().strip())
        if score == 0:
            reactions = ""
        else:
            emoji = "👍" if score > 0 else "👎"
            reactions = f"_Reactions: {abs(score)} x {emoji}_\n"

        header = f"_From **{who}** on {when} (migrated from WordPress):_"
        results.append((node, f"{header}\n\n{body}\n\n{reactions}"))

    return results
def convert_to_markdown(content):
    """Convert an HTML fragment to Markdown.

    Before conversion the HTML is flattened onto a single line: indentation
    after newlines is dropped, newlines become spaces, and the space following
    a ``>`` is removed. The converter is called with asterisk, underscore and
    miscellaneous escaping disabled, and the result is stripped.
    """
    flattened = (
        re.sub(r'\n\s+', '\n', content)
        .replace("\n", " ")
        .replace("> ", ">")
    )
    return md(
        flattened,
        escape_asterisks=False,
        escape_underscores=False,
        escape_misc=False,
    ).strip()
def create_discussion(url, description):
    """Create the GitHub discussion that will hold a post's migrated comments.

    The discussion title is the URL path without its leading slash. The body
    contains that path, the description, the URL and an HTML comment with the
    SHA-1 of the path (presumably the marker Giscus looks for when "strict
    title matching" is enabled -- confirm against the Giscus documentation).

    Args:
        url: Full URL of the WordPress post.
        description: Short description placed in the discussion body.

    Returns:
        The ID of the created discussion, or ``None`` in dry-run mode.
    """
    if dryrun:
        return None

    pathname = urlparse(url).path[1:]
    sha1 = hashlib.sha1(pathname.encode("utf-8")).hexdigest()
    body = f"# {pathname}\n\n{description}\n\n{url}\n\n<!-- sha1: {sha1} -->"

    # Values are sent as GraphQL variables, not spliced into the query text,
    # so scraped text cannot break or alter the mutation.
    query = gql("""
        mutation CreateDiscussion($input: CreateDiscussionInput!) {
            createDiscussion(input: $input) {
                discussion {
                    id
                }
            }
        }
    """)
    variables = {
        "input": {
            "repositoryId": repository_id,
            "categoryId": category_id,
            "title": pathname,
            "body": body,
        }
    }

    result = client.execute(query, variable_values=variables)
    discussion_id = result["createDiscussion"]["discussion"]["id"]
    print(f'Created discussion "{pathname}" with ID: {discussion_id}')

    return discussion_id
def create_comment(discussion_id, body, reply_to_id):
    """Add a comment, or a reply to a comment, to a GitHub discussion.

    Args:
        discussion_id: ID of the discussion to comment on.
        body: Markdown text of the comment.
        reply_to_id: ID of the comment being replied to, or ``None`` for a
            top-level comment.

    Returns:
        The ID of the created comment, or ``None`` in dry-run mode.
    """
    if dryrun:
        return None

    # Values are sent as GraphQL variables, not spliced into the query text,
    # so quotes, backslashes or newlines in a comment cannot break the mutation.
    query = gql("""
        mutation AddDiscussionComment($input: AddDiscussionCommentInput!) {
            addDiscussionComment(input: $input) {
                comment {
                    id
                }
            }
        }
    """)
    variables = {
        "input": {
            "discussionId": discussion_id,
            "body": body,
            # None is sent as null, which creates a top-level comment.
            "replyToId": reply_to_id,
        }
    }

    result = client.execute(query, variable_values=variables)
    comment_id = result["addDiscussionComment"]["comment"]["id"]
    print(f"Created comment with ID: {comment_id}")

    sleep(1)  # sleep for 1 second to avoid hitting rate limits

    return comment_id
def escape_body(body):
    """Escape ``body`` for embedding inside a double-quoted GraphQL string.

    Backslashes, double quotes, newlines and carriage returns are replaced by
    their backslash escape sequences.

    Args:
        body: Raw text to embed.

    Returns:
        The escaped text, without surrounding quotes.
    """
    return (
        # Backslashes go first so the escapes added below are not doubled.
        body.replace("\\", "\\\\")
        .replace('"', '\\"')
        .replace("\n", "\\n")
        # A raw carriage return is a line terminator and would end the literal.
        .replace("\r", "\\r")
    )
# Run the migration only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Sample output comment:
From John on April 7, 2024 (migrated from WordPress):
Much comments, very wow. With formatting.
Reactions: 3 x 👍