ColdHeat/htmldiff.py

## htmldiff.py
import re
import subprocess
import tempfile

import cmarkgfm
import mistune
from bs4 import BeautifulSoup
from sqlalchemy import create_engine
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import sessionmaker

from distutils.util import strtobool
import sys


def prompt(query):
    """Ask a yes/no question on the terminal and return the answer as a bool.

    The question is written to stdout followed by " [y/n]: " and one line is
    read from stdin. Accepted answers (case-insensitive, surrounding
    whitespace ignored) are the ones ``distutils.util.strtobool`` recognises:
    y, yes, t, true, on, 1 for True and n, no, f, false, off, 0 for False.
    Any other answer prints a reminder and the question is asked again.

    Args:
        query: The question text to display.

    Returns:
        True for an affirmative answer, False for a negative one.
    """
    # Parsed locally instead of via distutils.util.strtobool: distutils is
    # deprecated (PEP 632) and is not shipped with Python 3.12+.
    truthy = {"y", "yes", "t", "true", "on", "1"}
    falsy = {"n", "no", "f", "false", "off", "0"}

    # Loop rather than recurse so repeated bad answers cannot grow the stack.
    while True:
        sys.stdout.write(f"{query} [y/n]: ")
        answer = input().strip().lower()
        if answer in truthy:
            return True
        if answer in falsy:
            return False
        # Trailing newline keeps the re-asked question on its own line.
        sys.stdout.write("Please answer with y/n\n")


def old(md):
    """Render ``md`` to HTML with a default-configured mistune ``Markdown``."""
    return mistune.Markdown()(md)


def new(md):
    """Render ``md`` to HTML with cmarkgfm.

    The "autolink", "table" and "strikethrough" extensions are enabled.
    """
    enabled = ["autolink", "table", "strikethrough"]
    return cmarkgfm.markdown_to_html_with_extensions(md, extensions=enabled)


def regex_add_newline(match):
    """Return the full text of ``match`` with a newline prepended.

    Used as the replacement callback passed to ``re.sub``.
    """
    return "\n" + match.group()


def fix_html(html):
    """Pretty-print an HTML string with the external ``html-beautify`` CLI.

    The markup is piped to ``html-beautify --max_preserve_newlines=0`` on
    stdin and the tool's stdout is returned.

    Args:
        html: The HTML source to reformat. ``None`` or ``""`` is returned
            unchanged without running the tool.

    Returns:
        The beautified HTML as a string.

    Raises:
        FileNotFoundError: If ``html-beautify`` cannot be found.
        subprocess.CalledProcessError: If ``html-beautify`` exits non-zero.
    """
    if not html:
        # Nothing to reformat. This also avoids calling subprocess.run with
        # input=None, which does not pipe stdin and would leave the tool
        # reading from the inherited stdin.
        return html

    out = subprocess.run(
        ["html-beautify", "--max_preserve_newlines=0"],
        capture_output=True,
        text=True,
        input=html,
        # Fail loudly: without this a failed run yields empty stdout, which
        # the caller would then store as the page's new content.
        check=True,
    )
    return out.stdout


def fix_markdown_with_html(html):
    """Normalise Markdown that embeds HTML so block tags are newline-padded.

    The input is parsed and re-serialised with BeautifulSoup's ``prettify()``
    and then two line-anchored substitutions are applied to the result.

    Args:
        html: The page source (Markdown mixed with HTML).

    Returns:
        The prettified text with the extra newlines inserted.
    """
    # Prettifying is meant to drop extraneous newlines so that HTML blocks
    # do not end prematurely when the text is rendered as Markdown.
    text = BeautifulSoup(html, "html.parser").prettify()

    # Follow every line-leading </div> with an extra newline. Only div is
    # handled so far; other closing tags may need the same treatment.
    text = re.sub(r"^</div>", "</div>\n", text, flags=re.MULTILINE)

    # Prefix each line that begins with an opening tag (no leading
    # indentation, i.e. a parent block tag) with a padding newline.
    return re.sub(
        r"^<[a-z](.*)>",
        lambda found: "\n" + found.group(),
        text,
        flags=re.MULTILINE,
    )


# Connect to the database
# The URL is read interactively, and the mapped classes are generated by
# reflecting the existing schema (automap) rather than being declared here.
Base = automap_base()
# Example URL: "sqlite:///CTFd/CTFd.db"
engine = create_engine(input("Enter database url: "))
Base.prepare(engine, reflect=True)

# One session is used for the whole run; edits made to page objects are only
# persisted if session.commit() is called at the end of the script.
Session = sessionmaker(bind=engine)
session = Session()

# Class that automap generated for the reflected "pages" table.
Pages = Base.classes.pages

# Get all pages
pages = session.query(Pages).all()

# Review every page interactively: show the old and new renderings as a text
# diff and as side-by-side images, then stage the fixed content on the page.
for p in pages:
    # Let the user read the raw content before classifying it. "less" is run
    # directly from an argv list, so no shell is involved.
    subprocess.run(["less"], text=True, input=p.content)
    t = input("Is this html or markdown? ")
    # Tolerate stray whitespace / capitalisation in the answer; anything
    # other than "html" is treated as markdown.
    if t.strip().lower() == "html":
        cmd = fix_html
    else:
        cmd = fix_markdown_with_html

    # Run the chosen fixer once so that the content previewed below is
    # exactly the content that gets stored on the page.
    fixed = cmd(p.content)

    with tempfile.NamedTemporaryFile(
        suffix=".html"
    ) as temp1, tempfile.NamedTemporaryFile(suffix=".html") as temp2:
        # temp1: original content through the old renderer.
        # temp2: fixed content through the new renderer.
        temp1.write(old(p.content).encode("utf-8"))
        temp1.flush()
        temp2.write(new(fixed).encode("utf-8"))
        temp2.flush()

        # Show textual diff
        subprocess.call(["vimdiff", temp1.name, temp2.name])

        # Generate an image of the previous HTML and of the new HTML
        for html_file, png_name in ((temp1, "temp1.png"), (temp2, "temp2.png")):
            subprocess.call(
                [
                    "wkhtmltoimage",
                    "--load-error-handling",
                    "ignore",
                    f"file://{html_file.name}",
                    png_name,
                ]
            )

        # Generate diff image
        subprocess.call(
            [
                "compare",
                "-identify",
                "-metric",
                "MAE",
                "temp1.png",
                "temp2.png",
                "diff.png",
            ]
        )

        # Combine all images together
        subprocess.call(
            ["convert", "+append", "temp1.png", "temp2.png", "diff.png", "temp.png"]
        )

        # Open and show to user
        subprocess.call(["open", "temp.png"])

        # Update content with what was just previewed (html pages get
        # fix_html, markdown pages get fix_markdown_with_html).
        p.content = fixed

        # Pause only; the typed text is not used.
        input("Waiting for input to move to next page...")


# Visually separate the review output from the final question.
print("=" * 20)
print("\n" * 3)

# Nothing is written to the database unless the user explicitly agrees.
confirm = prompt("Are all changes good? This will commit the changes to the db.")
print(confirm)
if not confirm:
    print("okay skipping...")
else:
    session.commit()

## install-js-beautify.sh
# Install js-beautify globally with yarn. htmldiff.py shells out to the
# html-beautify command, which this package is expected to provide.
yarn global add js-beautify

## requirements.txt
mistune==0.8.4
beautifulsoup4==4.9.1
SQLAlchemy==1.3.18
cmarkgfm==0.4.2
	import re
	import subprocess
	import tempfile

	import cmarkgfm
	import mistune
	from bs4 import BeautifulSoup
	from sqlalchemy import create_engine
	from sqlalchemy.ext.automap import automap_base
	from sqlalchemy.orm import sessionmaker

	from distutils.util import strtobool
	import sys


	def prompt(query):
	    """Ask a yes/no question on the terminal and return the answer as a bool.

	    The question is written to stdout followed by " [y/n]: " and one line is
	    read from stdin. Accepted answers (case-insensitive, surrounding
	    whitespace ignored) are the ones ``distutils.util.strtobool`` recognises:
	    y, yes, t, true, on, 1 for True and n, no, f, false, off, 0 for False.
	    Any other answer prints a reminder and the question is asked again.

	    Args:
	        query: The question text to display.

	    Returns:
	        True for an affirmative answer, False for a negative one.
	    """
	    # Parsed locally instead of via distutils.util.strtobool: distutils is
	    # deprecated (PEP 632) and is not shipped with Python 3.12+.
	    truthy = {"y", "yes", "t", "true", "on", "1"}
	    falsy = {"n", "no", "f", "false", "off", "0"}

	    # Loop rather than recurse so repeated bad answers cannot grow the stack.
	    while True:
	        sys.stdout.write(f"{query} [y/n]: ")
	        answer = input().strip().lower()
	        if answer in truthy:
	            return True
	        if answer in falsy:
	            return False
	        # Trailing newline keeps the re-asked question on its own line.
	        sys.stdout.write("Please answer with y/n\n")


	def old(md):
	    """Render ``md`` to HTML with a default-configured mistune ``Markdown``."""
	    # Body indentation restored; the flattened original was not valid Python.
	    markdown = mistune.Markdown()
	    return markdown(md)


	def new(md):
	    """Render ``md`` to HTML with cmarkgfm.

	    The "autolink", "table" and "strikethrough" extensions are enabled.
	    """
	    # Body indentation restored; the flattened original was not valid Python.
	    return cmarkgfm.markdown_to_html_with_extensions(
	        md, extensions=["autolink", "table", "strikethrough"]
	    )


	def regex_add_newline(match):
	    """Return the full text of ``match`` with a newline prepended.

	    Used as the replacement callback passed to ``re.sub``.
	    """
	    # Body indentation restored; the flattened original was not valid Python.
	    return "\n" + match.group()


	def fix_html(html):
	    """Pretty-print an HTML string with the external ``html-beautify`` CLI.

	    The markup is piped to ``html-beautify --max_preserve_newlines=0`` on
	    stdin and the tool's stdout is returned.

	    Args:
	        html: The HTML source to reformat. ``None`` or ``""`` is returned
	            unchanged without running the tool.

	    Returns:
	        The beautified HTML as a string.

	    Raises:
	        FileNotFoundError: If ``html-beautify`` cannot be found.
	        subprocess.CalledProcessError: If ``html-beautify`` exits non-zero.
	    """
	    if not html:
	        # Nothing to reformat. This also avoids calling subprocess.run with
	        # input=None, which does not pipe stdin and would leave the tool
	        # reading from the inherited stdin.
	        return html

	    out = subprocess.run(
	        ["html-beautify", "--max_preserve_newlines=0"],
	        capture_output=True,
	        text=True,
	        input=html,
	        # Fail loudly: without this a failed run yields empty stdout, which
	        # the caller would then store as the page's new content.
	        check=True,
	    )
	    return out.stdout


	def fix_markdown_with_html(html):
	    """Normalise Markdown that embeds HTML so block tags are newline-padded.

	    The input is parsed and re-serialised with BeautifulSoup's ``prettify()``
	    and then two line-anchored substitutions are applied to the result.

	    Args:
	        html: The page source (Markdown mixed with HTML).

	    Returns:
	        The prettified text with the extra newlines inserted.
	    """
	    # Body indentation restored; the flattened original was not valid Python.
	    soup = BeautifulSoup(html, "html.parser")

	    # Prettify the soup. This should remove extraneous newlines to make HTML
	    # blocks not end prematurely.
	    pretty_soup = soup.prettify()

	    # Add a newline after every line-leading </div>. This should be expanded
	    # to other tags as well.
	    pretty_soup = re.sub(r"^</div>", "</div>\n", pretty_soup, flags=re.MULTILINE)

	    # Every line that begins with an opening tag (a parent block tag) gets a
	    # padding newline before it.
	    pretty_soup = re.sub(
	        r"^<[a-z](.*)>",
	        lambda found: "\n" + found.group(),
	        pretty_soup,
	        flags=re.MULTILINE,
	    )
	    return pretty_soup


	# Connect to the database
	# The URL is read interactively, and the mapped classes are generated by
	# reflecting the existing schema (automap) rather than being declared here.
	Base = automap_base()
	# Example URL: "sqlite:///CTFd/CTFd.db"
	engine = create_engine(input("Enter database url: "))
	Base.prepare(engine, reflect=True)

	# One session is used for the whole run; edits made to page objects are only
	# persisted if session.commit() is called at the end of the script.
	Session = sessionmaker(bind=engine)
	session = Session()

	# Class that automap generated for the reflected "pages" table.
	Pages = Base.classes.pages

	# Get all pages
	pages = session.query(Pages).all()

	for p in pages:
	    # Review one page interactively: show the old and new renderings as a
	    # text diff and as side-by-side images, then stage the fixed content.
	    # (Loop-body indentation restored; the flattened original was not valid
	    # Python.)

	    # Let the user read the raw content before classifying it. "less" is run
	    # directly from an argv list, so no shell is involved.
	    subprocess.run(["less"], text=True, input=p.content)
	    t = input("Is this html or markdown? ")
	    # Tolerate stray whitespace / capitalisation in the answer; anything
	    # other than "html" is treated as markdown.
	    if t.strip().lower() == "html":
	        cmd = fix_html
	    else:
	        cmd = fix_markdown_with_html

	    # Run the chosen fixer once so that the content previewed below is
	    # exactly the content that gets stored on the page.
	    fixed = cmd(p.content)

	    with tempfile.NamedTemporaryFile(
	        suffix=".html"
	    ) as temp1, tempfile.NamedTemporaryFile(suffix=".html") as temp2:
	        # temp1: original content through the old renderer.
	        # temp2: fixed content through the new renderer.
	        temp1.write(old(p.content).encode("utf-8"))
	        temp1.flush()
	        temp2.write(new(fixed).encode("utf-8"))
	        temp2.flush()

	        # Show textual diff
	        subprocess.call(["vimdiff", temp1.name, temp2.name])

	        # Generate an image of the previous HTML and of the new HTML
	        for html_file, png_name in ((temp1, "temp1.png"), (temp2, "temp2.png")):
	            subprocess.call(
	                [
	                    "wkhtmltoimage",
	                    "--load-error-handling",
	                    "ignore",
	                    f"file://{html_file.name}",
	                    png_name,
	                ]
	            )

	        # Generate diff image
	        subprocess.call(
	            [
	                "compare",
	                "-identify",
	                "-metric",
	                "MAE",
	                "temp1.png",
	                "temp2.png",
	                "diff.png",
	            ]
	        )

	        # Combine all images together
	        subprocess.call(
	            ["convert", "+append", "temp1.png", "temp2.png", "diff.png", "temp.png"]
	        )

	        # Open and show to user
	        subprocess.call(["open", "temp.png"])

	        # Update content with what was just previewed (html pages get
	        # fix_html, markdown pages get fix_markdown_with_html).
	        p.content = fixed

	        # Pause only; the typed text is not used.
	        input("Waiting for input to move to next page...")


	# Visually separate the review output from the final question.
	print("=" * 20)
	print("\n" * 3)

	# Nothing is written to the database unless the user explicitly agrees.
	# (Branch indentation restored; the flattened if/else was not valid Python.)
	confirm = prompt("Are all changes good? This will commit the changes to the db.")
	print(confirm)
	if confirm is True:
	    session.commit()
	else:
	    print("okay skipping...")
	mistune==0.8.4
	beautifulsoup4==4.9.1
	SQLAlchemy==1.3.18
	cmarkgfm==0.4.2