kkirsche/reserved_words_retriever.py

## reserved_words_retriever.py
#!/usr/bin/env python
"""Usage Example:

$ python reserved_words_retriever.py --help
usage: reserved_words_retriever.py [-h] [-y] [-a] [-w] [-d]

Retrieve the reserved keywords for MySQL and/or MariaDB

options:
  -h, --help     show this help message and exit
  -y, --mysql    Retrieve the keywords for MySQL (default: True)
  -a, --mariadb  Retrieve the keywords for MariaDB (default: True)
  -w, --write    Write the keywords to a file (default: True)
  -d, --diff     Print a diff between the original list and the new list, if available (default: True)

$ python reserved_words_retriever.py
Parsed MariaDB: 249 reserved words
Parsed MySQL: 275 reserved words
No original list set, ignoring diffs...
"""

from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser, Namespace

# python -m pip install -U beautifulsoup4 requests lxml
from typing import List, Text, TypedDict

from bs4 import BeautifulSoup
from requests import Session, get

original_list = set()  # removed for gist
reserved_words_mariadb = set()
reserved_words_mysql = set()


class MysqlDict(TypedDict):
    """The structure used to relate HTML source code to a MySQL version."""

    html: str
    version: str


def get_mysql_word_list_htmls(
    versions: List[str] = ["8.0", "5.7", "5.6"]
) -> List[MysqlDict]:
    """Retrieve the HTML of the keywords page, for each version.

    Parameters:
        versions: A list of version numbers with documented keywords.
            (default: ["8.0", "5.7", "5.6"])

    Returns:
        List[MySQLDict]: A list of dictionaries containing the HTML and the
            version that was used to retrieve it.

    Raises:
        requests.HTTPError: If a non-successful status code is received
        requests.Timeout: If the request times out.
    """
    responses = []
    session = Session()  # use a session to keepalive
    try:
        for version in versions:
            url = f"https://dev.mysql.com/doc/refman/{version}/en/keywords.html"
            response = session.get(url=url, timeout=10)
            response.raise_for_status()
            responses.append(MysqlDict(html=response.text, version=version))
    finally:
        session.close()
    return responses


def parse_mysql_word_list_htmls(version_contents: List[MysqlDict]) -> None:
    """Parses the HTML source code to extract the reserved keywords.

    Parameters:
        version_contents: A list of MysqlDict's containing the HTML and
            the version number used to retrieve it.

    Raises:
        ValueError: Raised if the MySQL keywords page has an unexpected number
            of segments in it, indicating the structure of the page has changed
            since this was written.
    """
    for version in version_contents:
        soup = BeautifulSoup(version["html"], "lxml")
        sects = soup.find_all("div", {"class": "simplesect"})
        if len(sects) != 3:
            raise ValueError("Unexpected number of sections")

        # section 1: MySQL x.0 Keywords and Reserved Words
        # section 2: MySQL x.0 New Keywords and Reserved Words
        # section 3: MySQL x.0 Removed Keywords and Reserved Words
        for sect_num, sect in enumerate(sects):
            if sect_num > 0:
                # sect_num is zero based
                # skip the new / removed diff
                continue
            for row in sect.find_all("li", {"class": "listitem"}):
                paragraph = row.find("p")
                paragraph_text = paragraph.get_text()
                word = row.find("code", {"class": "literal"}).get_text()
                if (
                    paragraph_text.endswith("(R)")
                    or "(reserved)" in paragraph_text
                    or "(R);"
                    in paragraph_text  # used to handle FUNCTION (R); reserved in 8.0.1
                    or "became nonreserved in"
                    in paragraph_text  # used to handle PERSIST; became nonreserved in 8.0.16
                ):
                    reserved_words_mysql.add(word.lower())
    print(f"Parsed MySQL: {len(reserved_words_mysql)} reserved words")


def get_mariadb_word_list_html(
    url: Text | bytes = "https://mariadb.com/kb/en/reserved-words/",
) -> str:
    """Retrieve the HTML of the reserved words page.

    Parameters:
        url: The url of the MariaDB reserved word list website.
            (default: https://mariadb.com/kb/en/reserved-words)

    Returns:
        str: The HTML of the MariaDB reserved word list website.

    Raises:
        requests.HTTPError: If a non-successful status code is received
        requests.Timeout: If the request times out.
    """
    response = get(url=url, timeout=10)
    response.raise_for_status()
    return response.text


def parse_mariadb_word_list_html(html: str) -> None:
    """Parses the HTML source code to extract the reserved keywords.

    Parameters:
        html: The HTML source code of the MariaDB reserved word list website.

    Raises:
        ValueError: Raised if the MariaDB reserved words page has an unexpected number
            of tables in it, indicating the structure of the page has changed
            since this was written.
    """
    soup = BeautifulSoup(html, "lxml")
    tables = soup.find_all("table")
    if len(tables) != 3:
        raise ValueError("Unexpected number of tables")

    # table 1: reserved words
    # table 2: exceptions
    # table 3: oracle mode resered words
    for table_num, table in enumerate(tables):
        if table_num > 0:
            # table_num is zero-based, skip exceptions
            # exceptions are not actually reserved
            continue
        for row in table.find_all("tr"):
            cells = row.find_all("td")
            if len(cells):
                word = cells[0].find(text=True)
                reserved_words_mariadb.add(word.lower())
    print(f"Parsed MariaDB: {len(reserved_words_mariadb)} reserved words")


def print_diffs() -> None:
    """Prints the additions and removals between the original list and new lists."""
    if len(original_list) == 0:
        print("No original list set, ignoring diffs...")
        return
    mariadb_additions = reserved_words_mariadb - original_list
    mysql_additions = reserved_words_mysql - original_list
    mariadb_removals = original_list - reserved_words_mariadb
    mysql_removals = original_list - reserved_words_mysql
    print("# MariaDB")
    for word in mariadb_additions:
        print(f"+ {word}")
    for word in mariadb_removals:
        print(f"- {word}")
    print("# MySQL")
    for word in mysql_additions:
        print(f"+ {word}")
    for word in mysql_removals:
        print(f"- {word}")


def write_reserved_words_to_file(
    parsed_args: Namespace, filename: str = "reserved_words.py", encoding="utf-8"
) -> None:
    """Writes the MariaDB and MySQL word lists to a file.

    Parameters:
        parsed_args: The arguments that were parsed from argparse.
        filename: The filename of the file to write to. Note, this will overwrite the
            file. (default: reserved_words.py)
        encoding: The encoding of the file when it is opened. (default: utf-8)
    """
    with open(filename, "w", encoding=encoding) as f:
        f.write(
            "\n".join(
                [
                    "# generated using:",
                    "# https://gist.github.com/kkirsche/4f31f2153ed7a3248be1ec44ca6ddbc9",
                    "#\n",  # include a trailing new line
                ]
            )
        )
        if parsed_args.retrieve_mariadb:
            mariadb_headers = [
                "# https://mariadb.com/kb/en/reserved-words/",
                "# includes: Reserved Words",
                "# excludes: Exceptions, Oracle Mode, Function Names",
                "RESERVED_WORDS_MARIADB = set([",
            ]
            for header in mariadb_headers:
                f.write(f"{header}\n")
            for word in sorted(reserved_words_mariadb, key=str.lower):
                f.write(f'    "{word.lower()}",\n')
            f.write("])\n\n")

        if parsed_args.retrieve_mysql:
            mysql_headers = [
                "# https://dev.mysql.com/doc/refman/8.0/en/keywords.html",
                "# https://dev.mysql.com/doc/refman/5.7/en/keywords.html",
                "# https://dev.mysql.com/doc/refman/5.6/en/keywords.html",
                "# includes: MySQL x.0 Keywords and Reserved Words",
                "# excludes: MySQL x.0 New Keywords and Reserved Words,",
                "#       MySQL x.0 Removed Keywords and Reserved Words",
                "RESERVED_WORDS_MYSQL = set([",
            ]
            for header in mysql_headers:
                f.write(f"{header}\n")
            for word in sorted(reserved_words_mysql, key=str.lower):
                f.write(f'    "{word.lower()}",\n')
            f.write("])\n")


def main() -> None:
    """The entrypoint of the script."""
    parser = ArgumentParser(
        description="Retrieve the reserved keywords for MySQL and/or MariaDB",
        formatter_class=ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "-y",
        "--mysql",
        action="store_false",
        default=True,
        dest="retrieve_mysql",
        help="Retrieve the keywords for MySQL",
    )
    parser.add_argument(
        "-a",
        "--mariadb",
        action="store_false",
        default=True,
        dest="retrieve_mariadb",
        help="Retrieve the keywords for MariaDB",
    )
    parser.add_argument(
        "-w",
        "--write",
        action="store_false",
        default=True,
        dest="write_changes",
        help="Write the keywords to a file",
    )
    parser.add_argument(
        "-d",
        "--diff",
        action="store_false",
        default=True,
        dest="show_diff",
        help="Print a diff between the original list and the new list, if available",
    )
    args = parser.parse_args()
    if args.retrieve_mariadb:
        html = get_mariadb_word_list_html()
        parse_mariadb_word_list_html(html=html)
    if args.retrieve_mysql:
        version_contents = get_mysql_word_list_htmls()
        parse_mysql_word_list_htmls(version_contents=version_contents)
    if args.write_changes:
        write_reserved_words_to_file(parsed_args=args)
    if args.show_diff:
        print_diffs()


if __name__ == "__main__":
    main()
	#!/usr/bin/env python
	"""Usage Example:

	$ python reserved_words_retriever.py --help
	usage: reserved_words_retriever.py [-h] [-y] [-a] [-w] [-d]

	Retrieve the reserved keywords for MySQL and/or MariaDB

	options:
	-h, --help show this help message and exit
	-y, --mysql Retrieve the keywords for MySQL (default: True)
	-a, --mariadb Retrieve the keywords for MariaDB (default: True)
	-w, --write Write the keywords to a file (default: True)
	-d, --diff Print a diff between the original list and the new list, if available (default: True)

	$ python reserved_words_retriever.py
	Parsed MariaDB: 249 reserved words
	Parsed MySQL: 275 reserved words
	No original list set, ignoring diffs...
	"""

	from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser, Namespace

	# python -m pip install -U beautifulsoup4 requests lxml
	from typing import List, Text, TypedDict

	from bs4 import BeautifulSoup
	from requests import Session, get

	original_list = set() # removed for gist
	reserved_words_mariadb = set()
	reserved_words_mysql = set()


	class MysqlDict(TypedDict):
	"""The structure used to relate HTML source code to a MySQL version."""

	html: str
	version: str


	def get_mysql_word_list_htmls(
	versions: List[str] = ["8.0", "5.7", "5.6"]
	) -> List[MysqlDict]:
	"""Retrieve the HTML of the keywords page, for each version.

	Parameters:
	versions: A list of version numbers with documented keywords.
	(default: ["8.0", "5.7", "5.6"])

	Returns:
	List[MySQLDict]: A list of dictionaries containing the HTML and the
	version that was used to retrieve it.

	Raises:
	requests.HTTPError: If a non-successful status code is received
	requests.Timeout: If the request times out.
	"""
	responses = []
	session = Session() # use a session to keepalive
	try:
	for version in versions:
	url = f"https://dev.mysql.com/doc/refman/{version}/en/keywords.html"
	response = session.get(url=url, timeout=10)
	response.raise_for_status()
	responses.append(MysqlDict(html=response.text, version=version))
	finally:
	session.close()
	return responses


	def parse_mysql_word_list_htmls(version_contents: List[MysqlDict]) -> None:
	"""Parses the HTML source code to extract the reserved keywords.

	Parameters:
	version_contents: A list of MysqlDict's containing the HTML and
	the version number used to retrieve it.

	Raises:
	ValueError: Raised if the MySQL keywords page has an unexpected number
	of segments in it, indicating the structure of the page has changed
	since this was written.
	"""
	for version in version_contents:
	soup = BeautifulSoup(version["html"], "lxml")
	sects = soup.find_all("div", {"class": "simplesect"})
	if len(sects) != 3:
	raise ValueError("Unexpected number of sections")

	# section 1: MySQL x.0 Keywords and Reserved Words
	# section 2: MySQL x.0 New Keywords and Reserved Words
	# section 3: MySQL x.0 Removed Keywords and Reserved Words
	for sect_num, sect in enumerate(sects):
	if sect_num > 0:
	# sect_num is zero based
	# skip the new / removed diff
	continue
	for row in sect.find_all("li", {"class": "listitem"}):
	paragraph = row.find("p")
	paragraph_text = paragraph.get_text()
	word = row.find("code", {"class": "literal"}).get_text()
	if (
	paragraph_text.endswith("(R)")
	or "(reserved)" in paragraph_text
	or "(R);"
	in paragraph_text # used to handle FUNCTION (R); reserved in 8.0.1
	or "became nonreserved in"
	in paragraph_text # used to handle PERSIST; became nonreserved in 8.0.16
	):
	reserved_words_mysql.add(word.lower())
	print(f"Parsed MySQL: {len(reserved_words_mysql)} reserved words")


	def get_mariadb_word_list_html(
	url: Text \| bytes = "https://mariadb.com/kb/en/reserved-words/",
	) -> str:
	"""Retrieve the HTML of the reserved words page.

	Parameters:
	url: The url of the MariaDB reserved word list website.
	(default: https://mariadb.com/kb/en/reserved-words)

	Returns:
	str: The HTML of the MariaDB reserved word list website.

	Raises:
	requests.HTTPError: If a non-successful status code is received
	requests.Timeout: If the request times out.
	"""
	response = get(url=url, timeout=10)
	response.raise_for_status()
	return response.text


	def parse_mariadb_word_list_html(html: str) -> None:
	"""Parses the HTML source code to extract the reserved keywords.

	Parameters:
	html: The HTML source code of the MariaDB reserved word list website.

	Raises:
	ValueError: Raised if the MariaDB reserved words page has an unexpected number
	of tables in it, indicating the structure of the page has changed
	since this was written.
	"""
	soup = BeautifulSoup(html, "lxml")
	tables = soup.find_all("table")
	if len(tables) != 3:
	raise ValueError("Unexpected number of tables")

	# table 1: reserved words
	# table 2: exceptions
	# table 3: oracle mode resered words
	for table_num, table in enumerate(tables):
	if table_num > 0:
	# table_num is zero-based, skip exceptions
	# exceptions are not actually reserved
	continue
	for row in table.find_all("tr"):
	cells = row.find_all("td")
	if len(cells):
	word = cells[0].find(text=True)
	reserved_words_mariadb.add(word.lower())
	print(f"Parsed MariaDB: {len(reserved_words_mariadb)} reserved words")


	def print_diffs() -> None:
	"""Prints the additions and removals between the original list and new lists."""
	if len(original_list) == 0:
	print("No original list set, ignoring diffs...")
	return
	mariadb_additions = reserved_words_mariadb - original_list
	mysql_additions = reserved_words_mysql - original_list
	mariadb_removals = original_list - reserved_words_mariadb
	mysql_removals = original_list - reserved_words_mysql
	print("# MariaDB")
	for word in mariadb_additions:
	print(f"+ {word}")
	for word in mariadb_removals:
	print(f"- {word}")
	print("# MySQL")
	for word in mysql_additions:
	print(f"+ {word}")
	for word in mysql_removals:
	print(f"- {word}")


	def write_reserved_words_to_file(
	parsed_args: Namespace, filename: str = "reserved_words.py", encoding="utf-8"
	) -> None:
	"""Writes the MariaDB and MySQL word lists to a file.

	Parameters:
	parsed_args: The arguments that were parsed from argparse.
	filename: The filename of the file to write to. Note, this will overwrite the
	file. (default: reserved_words.py)
	encoding: The encoding of the file when it is opened. (default: utf-8)
	"""
	with open(filename, "w", encoding=encoding) as f:
	f.write(
	"\n".join(
	[
	"# generated using:",
	"# https://gist.github.com/kkirsche/4f31f2153ed7a3248be1ec44ca6ddbc9",
	"#\n", # include a trailing new line
	]
	)
	)
	if parsed_args.retrieve_mariadb:
	mariadb_headers = [
	"# https://mariadb.com/kb/en/reserved-words/",
	"# includes: Reserved Words",
	"# excludes: Exceptions, Oracle Mode, Function Names",
	"RESERVED_WORDS_MARIADB = set([",
	]
	for header in mariadb_headers:
	f.write(f"{header}\n")
	for word in sorted(reserved_words_mariadb, key=str.lower):
	f.write(f' "{word.lower()}",\n')
	f.write("])\n\n")

	if parsed_args.retrieve_mysql:
	mysql_headers = [
	"# https://dev.mysql.com/doc/refman/8.0/en/keywords.html",
	"# https://dev.mysql.com/doc/refman/5.7/en/keywords.html",
	"# https://dev.mysql.com/doc/refman/5.6/en/keywords.html",
	"# includes: MySQL x.0 Keywords and Reserved Words",
	"# excludes: MySQL x.0 New Keywords and Reserved Words,",
	"# MySQL x.0 Removed Keywords and Reserved Words",
	"RESERVED_WORDS_MYSQL = set([",
	]
	for header in mysql_headers:
	f.write(f"{header}\n")
	for word in sorted(reserved_words_mysql, key=str.lower):
	f.write(f' "{word.lower()}",\n')
	f.write("])\n")


	def main() -> None:
	"""The entrypoint of the script."""
	parser = ArgumentParser(
	description="Retrieve the reserved keywords for MySQL and/or MariaDB",
	formatter_class=ArgumentDefaultsHelpFormatter,
	)
	parser.add_argument(
	"-y",
	"--mysql",
	action="store_false",
	default=True,
	dest="retrieve_mysql",
	help="Retrieve the keywords for MySQL",
	)
	parser.add_argument(
	"-a",
	"--mariadb",
	action="store_false",
	default=True,
	dest="retrieve_mariadb",
	help="Retrieve the keywords for MariaDB",
	)
	parser.add_argument(
	"-w",
	"--write",
	action="store_false",
	default=True,
	dest="write_changes",
	help="Write the keywords to a file",
	)
	parser.add_argument(
	"-d",
	"--diff",
	action="store_false",
	default=True,
	dest="show_diff",
	help="Print a diff between the original list and the new list, if available",
	)
	args = parser.parse_args()
	if args.retrieve_mariadb:
	html = get_mariadb_word_list_html()
	parse_mariadb_word_list_html(html=html)
	if args.retrieve_mysql:
	version_contents = get_mysql_word_list_htmls()
	parse_mysql_word_list_htmls(version_contents=version_contents)
	if args.write_changes:
	write_reserved_words_to_file(parsed_args=args)
	if args.show_diff:
	print_diffs()


	if __name__ == "__main__":
	main()