m-radzikowski/migrate-comments-to-giscus.py

## migrate-comments-to-giscus.py
import hashlib
import re
from time import sleep
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from gql import gql, Client
from gql.transport.aiohttp import AIOHTTPTransport
from markdownify import markdownify as md

"""
This script migrates comments from WordPress wpDiscuz plugin (https://wpdiscuz.com/)
to Giscus (https://giscus.app/) powered by GitHub Discussions.

It fetches the WordPress page, parses it to extract comments,
and creates new discussions in the GitHub repository using GitHub GraphQL API.

This is a one-time script that should be run only once for each post.
Otherwise, it will create duplicate discussions and comments.

The script is quick and dirty, so it may not handle all edge cases.
Review the parsed comments in dry mode first.

The script assumes the Giscus discussion mapping mode "pathname".
It works with (or without) "strict title matching" option.

To run:

1. Install the required packages: `pip install requests beautifulsoup4 markdownify gql[all]`
2. Set the configuration parameters below.
3. Run the script: `python3 migrate-comments-to-giscus.py`

Note that the GitHub API has rate limits. The script will sleep for 1 second after each comment creation
to avoid hitting the rate limits, but it may still happen if there are too many comments to migrate.
In that case, you can delete the half-migrated discussion and run the script again for the given post.
See more on the rate limits here: https://docs.github.com/en/graphql/overview/rate-limits-and-node-limits-for-the-graphql-api
"""

########## Configuration ##########

# Dry-run switch: when True, the script creates no discussions or comments
# and only prints the parsed comments to the console.
dryrun = False

# GitHub Personal Access Token with "Discussions" read and write permissions on the comments repository.
# Create one at: https://github.com/settings/tokens
token = ""

# GitHub repository ID and discussions category ID, matching the parameters in the Giscus script
repository_id = ""  # "data-repo-id" attribute in the Giscus script
category_id = ""  # "data-category-id" attribute in the Giscus script

# List of WordPress post URLs to migrate
posts = [
    # "https://example.com/wordpress-to-github-discussions-migration/"
]

########## Configuration End ##########

# The list is processed back to front; this starts from the oldest post
# provided `posts` is listed newest first.
posts.reverse()  # start from the oldest post

# Shared GraphQL client for the GitHub API, authenticated with the token above.
# It is created at import time and used by create_discussion() and create_comment().
transport = AIOHTTPTransport(url="https://api.github.com/graphql", headers={"Authorization": f"Bearer {token}"})
client = Client(transport=transport, fetch_schema_from_transport=True)


def main():
    """Migrate every URL in the configured ``posts`` list, one after another."""
    for post_url in posts:
        migrate_post(post_url)


def migrate_post(url):
    """Fetch one WordPress post and migrate its wpDiscuz comments to a GitHub discussion.

    Top-level comments become discussion comments. Every comment nested anywhere
    below a top-level comment is posted as a reply to that top-level comment, so
    the reply tree is flattened to a single level. In dry-run mode the comment
    texts are printed and nothing is created.

    Args:
        url: Full URL of the WordPress post.

    Raises:
        SystemExit: With status 1 if the page does not answer with HTTP 200.
    """
    # Without a timeout a stalled server would block the whole migration forever.
    page = requests.get(url, timeout=30)
    if page.status_code != 200:
        print('Failed to fetch the page')
        # Same effect as exit(1), but does not depend on the `site` module's helper.
        raise SystemExit(1)

    soup = BeautifulSoup(page.content, 'html.parser')

    # The title is only used for console output; fall back to the URL if there is no <h1>.
    heading = soup.find("h1")
    title = heading.get_text().strip() if heading is not None else url

    # A missing og:description must not abort the migration; use an empty description instead.
    meta = soup.find("meta", property="og:description")
    description = meta.get("content", "") if meta is not None else ""

    print(f"# Post: {title}")

    thread = soup.find("div", class_="wpd-thread-list")
    if thread is None:
        # No wpDiscuz thread container on the page, so there is nothing to migrate.
        print("No comment thread found, skipping")
        return

    comments = get_comments(thread, deep=False)
    if not comments:
        return  # do not create an empty discussion

    discussion_id = create_discussion(url, description)

    for element, text in comments:
        if dryrun:
            print("\n" + text)

        comment_id = create_comment(discussion_id, text, None)

        # deep=True collects replies at every nesting level below this comment.
        for _, reply_text in get_comments(element, deep=True):
            if dryrun:
                print("----------\n")
                print(reply_text)

            create_comment(discussion_id, reply_text, comment_id)

        if dryrun:
            print("--------------------\n--------------------")


def get_comments(parent, deep: bool):
    """Extract the wpDiscuz comments found under *parent* and render each as Markdown.

    Args:
        parent: BeautifulSoup element to search in.
        deep: Passed to ``find_all(recursive=...)``. ``False`` looks only at
            direct children of *parent*, ``True`` at all descendants.

    Returns:
        A list of ``(element, text)`` tuples: the comment's ``div`` element and
        the Markdown text to post for it (attribution header, converted body
        and, when the vote count is non-zero, a reactions line).
    """
    results = []

    for comment in parent.find_all("div", class_="comment", recursive=deep):
        author = comment.find("div", class_="wpd-comment-author").get_text().strip()

        # Keep only the first three space-separated tokens of the date tooltip.
        date = comment.find("div", class_="wpd-comment-date")["title"]
        date = " ".join(date.split(" ")[0:3])

        content = comment.find("div", class_="wpd-comment-text").prettify().strip()
        content = convert_to_markdown(content)

        # The vote counter may be absent or not a plain integer; treat that as
        # "no votes" instead of aborting the whole migration.
        vote_element = comment.find("div", class_="wpd-vote-result")
        vote_text = vote_element.get_text().strip() if vote_element is not None else ""
        try:
            upvotes = int(vote_text)
        except ValueError:
            upvotes = 0

        reactions = ""
        if upvotes != 0:
            emoji = "👍" if upvotes > 0 else "👎"
            reactions = f"_Reactions: {abs(upvotes)} x {emoji}_\n"

        text = f"_From **{author}** on {date} (migrated from WordPress):_\n\n{content}\n\n{reactions}"

        results.append((comment, text))

    return results


def convert_to_markdown(content):
    """Convert an HTML fragment to Markdown.

    The whitespace layout of the input is flattened before the conversion:
    indentation after line breaks is dropped, the remaining line breaks become
    spaces, and the space following every ``>`` is removed.

    Args:
        content: HTML string (get_comments() passes ``prettify()`` output).

    Returns:
        The Markdown text, stripped of leading and trailing whitespace.
    """
    without_indent = re.sub(r'\n\s+', '\n', content)
    single_line = without_indent.replace("\n", " ")
    compact = single_line.replace("> ", ">")

    # Escaping is switched off so that asterisks, underscores and other
    # Markdown characters in the comment text are passed through unchanged.
    return md(
        compact,
        escape_asterisks=False,
        escape_underscores=False,
        escape_misc=False,
    ).strip()


def create_discussion(url, description):
    """Create the GitHub discussion for the post at *url*.

    The discussion title is the URL path without its leading slash. The body
    holds that path as a heading, the description, the URL and an HTML comment
    with the SHA-1 of the path (presumably the marker Giscus "strict title
    matching" looks for; confirm against the Giscus documentation).

    Args:
        url: Full URL of the WordPress post.
        description: Text placed in the discussion body below the heading.

    Returns:
        The ID of the created discussion, or ``None`` in dry-run mode.
    """
    if dryrun:
        return None

    pathname = urlparse(url).path[1:]  # drop the leading "/"
    sha1 = hashlib.sha1(pathname.encode("utf-8")).hexdigest()

    body = f"# {pathname}\n\n{description}\n\n{url}\n\n<!-- sha1: {sha1} -->"

    # Values are sent as GraphQL variables rather than spliced into the query
    # text, so quotes, backslashes and newlines in the title or body cannot
    # break or alter the mutation.
    query = gql("""
    mutation ($input: CreateDiscussionInput!) {
      createDiscussion(input: $input) {
        discussion {
          id
        }
      }
    }
    """)
    variables = {
        "input": {
            "repositoryId": repository_id,
            "categoryId": category_id,
            "title": pathname,
            "body": body,
        }
    }

    result = client.execute(query, variable_values=variables)
    discussion_id = result["createDiscussion"]["discussion"]["id"]

    print(f'Created discussion "{pathname}" with ID: {discussion_id}')

    return discussion_id


def create_comment(discussion_id, body, reply_to_id):
    """Add a comment, or a reply to a comment, to a GitHub discussion.

    Args:
        discussion_id: ID of the discussion to comment on.
        body: Markdown text of the comment.
        reply_to_id: ID of the comment to reply to, or ``None`` for a
            top-level comment.

    Returns:
        The ID of the created comment, or ``None`` in dry-run mode.
    """
    if dryrun:
        return None

    # Values are sent as GraphQL variables rather than spliced into the query
    # text, so quotes, backslashes and newlines in the body cannot break or
    # alter the mutation. A reply_to_id of None is sent as null.
    query = gql("""
    mutation ($input: AddDiscussionCommentInput!) {
      addDiscussionComment(input: $input) {
        comment {
          id
        }
      }
    }
    """)
    variables = {
        "input": {
            "discussionId": discussion_id,
            "body": body,
            "replyToId": reply_to_id,
        }
    }

    result = client.execute(query, variable_values=variables)
    comment_id = result["addDiscussionComment"]["comment"]["id"]

    print(f"Created comment with ID: {comment_id}")

    sleep(1)  # sleep for 1 second to avoid hitting rate limits

    return comment_id


def escape_body(body):
    """Escape *body* for use inside a double-quoted GraphQL string literal.

    Double quotes, backslashes and control characters (newline, tab, carriage
    return, ...) are replaced by their backslash escape sequences. Non-ASCII
    characters such as emoji are kept as they are.

    Args:
        body: Text to embed in a query.

    Returns:
        The escaped text, without surrounding quotes.
    """
    import json  # local import so this function does not depend on the file's import block

    # JSON string escaping yields sequences that GraphQL string literals also
    # accept; [1:-1] drops the surrounding quotes json.dumps adds.
    return json.dumps(body, ensure_ascii=False)[1:-1]


# Entry point: start the migration only when this file is executed as a script.
if __name__ == '__main__':
    main()
	import hashlib
	import re
	from time import sleep
	from urllib.parse import urlparse

	import requests
	from bs4 import BeautifulSoup
	from gql import gql, Client
	from gql.transport.aiohttp import AIOHTTPTransport
	from markdownify import markdownify as md

	"""
	This script migrates comments from WordPress wpDiscuz plugin (https://wpdiscuz.com/)
	to Giscus (https://giscus.app/) powered by GitHub Discussions.

	It fetches the WordPress page, parses it to extract comments,
	and creates new discussions in the GitHub repository using GitHub GraphQL API.

	This is a one-time script that should be run only once for each post.
	Otherwise, it will create duplicate discussions and comments.

	The script is quick and dirty, so it may not handle all edge cases.
	Review the parsed comments in dry mode first.

	The script assumes the Giscus discussion mapping mode "pathname".
	It works with (or without) "strict title matching" option.

	To run:

	1. Install the required packages: `pip install requests beautifulsoup4 markdownify gql[all]`
	2. Set the configuration parameters below.
	3. Run the script: `python3 migrate-comments-to-giscus.py`

	Note that the GitHub API has rate limits. The script will sleep for 1 second after each comment creation
	to avoid hitting the rate limits, but it may still happen if there are too many comments to migrate.
	In that case, you can delete the half-migrated discussion and run the script again for the given post.
	See more on the rate limits here: https://docs.github.com/en/graphql/overview/rate-limits-and-node-limits-for-the-graphql-api
	"""

	########## Configuration ##########

	# Dry-run switch: when True, the script creates no discussions or comments
	# and only prints the parsed comments to the console.
	dryrun = False

	# GitHub Personal Access Token with "Discussions" read and write permissions on the comments repository.
	# Create one at: https://github.com/settings/tokens
	token = ""

	# GitHub repository ID and discussions category ID, matching the parameters in the Giscus script
	repository_id = "" # "data-repo-id" attribute in the Giscus script
	category_id = "" # "data-category-id" attribute in the Giscus script

	# List of WordPress post URLs to migrate
	posts = [
	# "https://example.com/wordpress-to-github-discussions-migration/"
	]

	########## Configuration End ##########

	# The list is processed back to front; this starts from the oldest post
	# provided `posts` is listed newest first.
	posts.reverse() # start from the oldest post

	# Shared GraphQL client for the GitHub API, authenticated with the token above.
	# It is created at import time and used by create_discussion() and create_comment().
	transport = AIOHTTPTransport(url="https://api.github.com/graphql", headers={"Authorization": f"Bearer {token}"})
	client = Client(transport=transport, fetch_schema_from_transport=True)


	def main():
	    """Migrate every URL in the configured ``posts`` list, one after another."""
	    for post_url in posts:
	        migrate_post(post_url)


	def migrate_post(url):
	    """Fetch one WordPress post and migrate its wpDiscuz comments to a GitHub discussion.

	    Top-level comments become discussion comments. Every comment nested anywhere
	    below a top-level comment is posted as a reply to that top-level comment, so
	    the reply tree is flattened to a single level. In dry-run mode the comment
	    texts are printed and nothing is created.

	    Args:
	        url: Full URL of the WordPress post.

	    Raises:
	        SystemExit: With status 1 if the page does not answer with HTTP 200.
	    """
	    # Without a timeout a stalled server would block the whole migration forever.
	    page = requests.get(url, timeout=30)
	    if page.status_code != 200:
	        print('Failed to fetch the page')
	        # Same effect as exit(1), but does not depend on the `site` module's helper.
	        raise SystemExit(1)

	    soup = BeautifulSoup(page.content, 'html.parser')

	    # The title is only used for console output; fall back to the URL if there is no <h1>.
	    heading = soup.find("h1")
	    title = heading.get_text().strip() if heading is not None else url

	    # A missing og:description must not abort the migration; use an empty description instead.
	    meta = soup.find("meta", property="og:description")
	    description = meta.get("content", "") if meta is not None else ""

	    print(f"# Post: {title}")

	    thread = soup.find("div", class_="wpd-thread-list")
	    if thread is None:
	        # No wpDiscuz thread container on the page, so there is nothing to migrate.
	        print("No comment thread found, skipping")
	        return

	    comments = get_comments(thread, deep=False)
	    if not comments:
	        return  # do not create an empty discussion

	    discussion_id = create_discussion(url, description)

	    for element, text in comments:
	        if dryrun:
	            print("\n" + text)

	        comment_id = create_comment(discussion_id, text, None)

	        # deep=True collects replies at every nesting level below this comment.
	        for _, reply_text in get_comments(element, deep=True):
	            if dryrun:
	                print("----------\n")
	                print(reply_text)

	            create_comment(discussion_id, reply_text, comment_id)

	        if dryrun:
	            print("--------------------\n--------------------")


	def get_comments(parent, deep: bool):
	    """Extract the wpDiscuz comments found under *parent* and render each as Markdown.

	    Args:
	        parent: BeautifulSoup element to search in.
	        deep: Passed to ``find_all(recursive=...)``. ``False`` looks only at
	            direct children of *parent*, ``True`` at all descendants.

	    Returns:
	        A list of ``(element, text)`` tuples: the comment's ``div`` element and
	        the Markdown text to post for it (attribution header, converted body
	        and, when the vote count is non-zero, a reactions line).
	    """
	    results = []

	    for comment in parent.find_all("div", class_="comment", recursive=deep):
	        author = comment.find("div", class_="wpd-comment-author").get_text().strip()

	        # Keep only the first three space-separated tokens of the date tooltip.
	        date = comment.find("div", class_="wpd-comment-date")["title"]
	        date = " ".join(date.split(" ")[0:3])

	        content = comment.find("div", class_="wpd-comment-text").prettify().strip()
	        content = convert_to_markdown(content)

	        # The vote counter may be absent or not a plain integer; treat that as
	        # "no votes" instead of aborting the whole migration.
	        vote_element = comment.find("div", class_="wpd-vote-result")
	        vote_text = vote_element.get_text().strip() if vote_element is not None else ""
	        try:
	            upvotes = int(vote_text)
	        except ValueError:
	            upvotes = 0

	        reactions = ""
	        if upvotes != 0:
	            emoji = "👍" if upvotes > 0 else "👎"
	            reactions = f"_Reactions: {abs(upvotes)} x {emoji}_\n"

	        # The author name is rendered in bold inside the italic attribution line.
	        text = f"_From **{author}** on {date} (migrated from WordPress):_\n\n{content}\n\n{reactions}"

	        results.append((comment, text))

	    return results


	def convert_to_markdown(content):
	    """Convert an HTML fragment to Markdown.

	    The whitespace layout of the input is flattened before the conversion:
	    indentation after line breaks is dropped, the remaining line breaks become
	    spaces, and the space following every ``>`` is removed.

	    Args:
	        content: HTML string (get_comments() passes ``prettify()`` output).

	    Returns:
	        The Markdown text, stripped of leading and trailing whitespace.
	    """
	    without_indent = re.sub(r'\n\s+', '\n', content)
	    single_line = without_indent.replace("\n", " ")
	    compact = single_line.replace("> ", ">")

	    # Escaping is switched off so that asterisks, underscores and other
	    # Markdown characters in the comment text are passed through unchanged.
	    return md(
	        compact,
	        escape_asterisks=False,
	        escape_underscores=False,
	        escape_misc=False,
	    ).strip()


	def create_discussion(url, description):
	    """Create the GitHub discussion for the post at *url*.

	    The discussion title is the URL path without its leading slash. The body
	    holds that path as a heading, the description, the URL and an HTML comment
	    with the SHA-1 of the path (presumably the marker Giscus "strict title
	    matching" looks for; confirm against the Giscus documentation).

	    Args:
	        url: Full URL of the WordPress post.
	        description: Text placed in the discussion body below the heading.

	    Returns:
	        The ID of the created discussion, or ``None`` in dry-run mode.
	    """
	    if dryrun:
	        return None

	    pathname = urlparse(url).path[1:]  # drop the leading "/"
	    sha1 = hashlib.sha1(pathname.encode("utf-8")).hexdigest()

	    body = f"# {pathname}\n\n{description}\n\n{url}\n\n<!-- sha1: {sha1} -->"

	    # Values are sent as GraphQL variables rather than spliced into the query
	    # text, so quotes, backslashes and newlines in the title or body cannot
	    # break or alter the mutation.
	    query = gql("""
	    mutation ($input: CreateDiscussionInput!) {
	      createDiscussion(input: $input) {
	        discussion {
	          id
	        }
	      }
	    }
	    """)
	    variables = {
	        "input": {
	            "repositoryId": repository_id,
	            "categoryId": category_id,
	            "title": pathname,
	            "body": body,
	        }
	    }

	    result = client.execute(query, variable_values=variables)
	    discussion_id = result["createDiscussion"]["discussion"]["id"]

	    print(f'Created discussion "{pathname}" with ID: {discussion_id}')

	    return discussion_id


	def create_comment(discussion_id, body, reply_to_id):
	    """Add a comment, or a reply to a comment, to a GitHub discussion.

	    Args:
	        discussion_id: ID of the discussion to comment on.
	        body: Markdown text of the comment.
	        reply_to_id: ID of the comment to reply to, or ``None`` for a
	            top-level comment.

	    Returns:
	        The ID of the created comment, or ``None`` in dry-run mode.
	    """
	    if dryrun:
	        return None

	    # Values are sent as GraphQL variables rather than spliced into the query
	    # text, so quotes, backslashes and newlines in the body cannot break or
	    # alter the mutation. A reply_to_id of None is sent as null.
	    query = gql("""
	    mutation ($input: AddDiscussionCommentInput!) {
	      addDiscussionComment(input: $input) {
	        comment {
	          id
	        }
	      }
	    }
	    """)
	    variables = {
	        "input": {
	            "discussionId": discussion_id,
	            "body": body,
	            "replyToId": reply_to_id,
	        }
	    }

	    result = client.execute(query, variable_values=variables)
	    comment_id = result["addDiscussionComment"]["comment"]["id"]

	    print(f"Created comment with ID: {comment_id}")

	    sleep(1)  # sleep for 1 second to avoid hitting rate limits

	    return comment_id


	def escape_body(body):
	    """Escape *body* for use inside a double-quoted GraphQL string literal.

	    Double quotes, backslashes and control characters (newline, tab, carriage
	    return, ...) are replaced by their backslash escape sequences. Non-ASCII
	    characters such as emoji are kept as they are.

	    Args:
	        body: Text to embed in a query.

	    Returns:
	        The escaped text, without surrounding quotes.
	    """
	    import json  # local import so this function does not depend on the file's import block

	    # JSON string escaping yields sequences that GraphQL string literals also
	    # accept; [1:-1] drops the surrounding quotes json.dumps adds.
	    return json.dumps(body, ensure_ascii=False)[1:-1]


	# Entry point: start the migration only when this file is executed as a script.
	if __name__ == '__main__':
	    main()