Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Reserved Word Retriever for SQLAlchemy
#!/usr/bin/env python
"""Usage Example:
$ python reserved_words_retriever.py --help
usage: reserved_words_retriever.py [-h] [-y] [-a] [-w] [-d]
Retrieve the reserved keywords for MySQL and/or MariaDB
options:
-h, --help show this help message and exit
-y, --mysql Retrieve the keywords for MySQL (default: True)
-a, --mariadb Retrieve the keywords for MariaDB (default: True)
-w, --write Write the keywords to a file (default: True)
-d, --diff Print a diff between the original list and the new list, if available (default: True)
$ python reserved_words_retriever.py
Parsed MariaDB: 249 reserved words
Parsed MySQL: 275 reserved words
No original list set, ignoring diffs...
"""
from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser, Namespace
# python -m pip install -U beautifulsoup4 requests lxml
from typing import List, Text, TypedDict
from bs4 import BeautifulSoup
from requests import Session, get
original_list = set() # removed for gist
reserved_words_mariadb = set()
reserved_words_mysql = set()
class MysqlDict(TypedDict):
"""The structure used to relate HTML source code to a MySQL version."""
html: str
version: str
def get_mysql_word_list_htmls(
versions: List[str] = ["8.0", "5.7", "5.6"]
) -> List[MysqlDict]:
"""Retrieve the HTML of the keywords page, for each version.
Parameters:
versions: A list of version numbers with documented keywords.
(default: ["8.0", "5.7", "5.6"])
Returns:
List[MySQLDict]: A list of dictionaries containing the HTML and the
version that was used to retrieve it.
Raises:
requests.HTTPError: If a non-successful status code is received
requests.Timeout: If the request times out.
"""
responses = []
session = Session() # use a session to keepalive
try:
for version in versions:
url = f"https://dev.mysql.com/doc/refman/{version}/en/keywords.html"
response = session.get(url=url, timeout=10)
response.raise_for_status()
responses.append(MysqlDict(html=response.text, version=version))
finally:
session.close()
return responses
def parse_mysql_word_list_htmls(version_contents: List[MysqlDict]) -> None:
"""Parses the HTML source code to extract the reserved keywords.
Parameters:
version_contents: A list of MysqlDict's containing the HTML and
the version number used to retrieve it.
Raises:
ValueError: Raised if the MySQL keywords page has an unexpected number
of segments in it, indicating the structure of the page has changed
since this was written.
"""
for version in version_contents:
soup = BeautifulSoup(version["html"], "lxml")
sects = soup.find_all("div", {"class": "simplesect"})
if len(sects) != 3:
raise ValueError("Unexpected number of sections")
# section 1: MySQL x.0 Keywords and Reserved Words
# section 2: MySQL x.0 New Keywords and Reserved Words
# section 3: MySQL x.0 Removed Keywords and Reserved Words
for sect_num, sect in enumerate(sects):
if sect_num > 0:
# sect_num is zero based
# skip the new / removed diff
continue
for row in sect.find_all("li", {"class": "listitem"}):
paragraph = row.find("p")
paragraph_text = paragraph.get_text()
word = row.find("code", {"class": "literal"}).get_text()
if (
paragraph_text.endswith("(R)")
or "(reserved)" in paragraph_text
or "(R);"
in paragraph_text # used to handle FUNCTION (R); reserved in 8.0.1
or "became nonreserved in"
in paragraph_text # used to handle PERSIST; became nonreserved in 8.0.16
):
reserved_words_mysql.add(word.lower())
print(f"Parsed MySQL: {len(reserved_words_mysql)} reserved words")
def get_mariadb_word_list_html(
url: Text | bytes = "https://mariadb.com/kb/en/reserved-words/",
) -> str:
"""Retrieve the HTML of the reserved words page.
Parameters:
url: The url of the MariaDB reserved word list website.
(default: https://mariadb.com/kb/en/reserved-words)
Returns:
str: The HTML of the MariaDB reserved word list website.
Raises:
requests.HTTPError: If a non-successful status code is received
requests.Timeout: If the request times out.
"""
response = get(url=url, timeout=10)
response.raise_for_status()
return response.text
def parse_mariadb_word_list_html(html: str) -> None:
"""Parses the HTML source code to extract the reserved keywords.
Parameters:
html: The HTML source code of the MariaDB reserved word list website.
Raises:
ValueError: Raised if the MariaDB reserved words page has an unexpected number
of tables in it, indicating the structure of the page has changed
since this was written.
"""
soup = BeautifulSoup(html, "lxml")
tables = soup.find_all("table")
if len(tables) != 3:
raise ValueError("Unexpected number of tables")
# table 1: reserved words
# table 2: exceptions
# table 3: oracle mode resered words
for table_num, table in enumerate(tables):
if table_num > 0:
# table_num is zero-based, skip exceptions
# exceptions are not actually reserved
continue
for row in table.find_all("tr"):
cells = row.find_all("td")
if len(cells):
word = cells[0].find(text=True)
reserved_words_mariadb.add(word.lower())
print(f"Parsed MariaDB: {len(reserved_words_mariadb)} reserved words")
def print_diffs() -> None:
"""Prints the additions and removals between the original list and new lists."""
if len(original_list) == 0:
print("No original list set, ignoring diffs...")
return
mariadb_additions = reserved_words_mariadb - original_list
mysql_additions = reserved_words_mysql - original_list
mariadb_removals = original_list - reserved_words_mariadb
mysql_removals = original_list - reserved_words_mysql
print("# MariaDB")
for word in mariadb_additions:
print(f"+ {word}")
for word in mariadb_removals:
print(f"- {word}")
print("# MySQL")
for word in mysql_additions:
print(f"+ {word}")
for word in mysql_removals:
print(f"- {word}")
def write_reserved_words_to_file(
parsed_args: Namespace, filename: str = "reserved_words.py", encoding="utf-8"
) -> None:
"""Writes the MariaDB and MySQL word lists to a file.
Parameters:
parsed_args: The arguments that were parsed from argparse.
filename: The filename of the file to write to. Note, this will overwrite the
file. (default: reserved_words.py)
encoding: The encoding of the file when it is opened. (default: utf-8)
"""
with open(filename, "w", encoding=encoding) as f:
f.write(
"\n".join(
[
"# generated using:",
"# https://gist.github.com/kkirsche/4f31f2153ed7a3248be1ec44ca6ddbc9",
"#\n", # include a trailing new line
]
)
)
if parsed_args.retrieve_mariadb:
mariadb_headers = [
"# https://mariadb.com/kb/en/reserved-words/",
"# includes: Reserved Words",
"# excludes: Exceptions, Oracle Mode, Function Names",
"RESERVED_WORDS_MARIADB = set([",
]
for header in mariadb_headers:
f.write(f"{header}\n")
for word in sorted(reserved_words_mariadb, key=str.lower):
f.write(f' "{word.lower()}",\n')
f.write("])\n\n")
if parsed_args.retrieve_mysql:
mysql_headers = [
"# https://dev.mysql.com/doc/refman/8.0/en/keywords.html",
"# https://dev.mysql.com/doc/refman/5.7/en/keywords.html",
"# https://dev.mysql.com/doc/refman/5.6/en/keywords.html",
"# includes: MySQL x.0 Keywords and Reserved Words",
"# excludes: MySQL x.0 New Keywords and Reserved Words,",
"# MySQL x.0 Removed Keywords and Reserved Words",
"RESERVED_WORDS_MYSQL = set([",
]
for header in mysql_headers:
f.write(f"{header}\n")
for word in sorted(reserved_words_mysql, key=str.lower):
f.write(f' "{word.lower()}",\n')
f.write("])\n")
def main() -> None:
"""The entrypoint of the script."""
parser = ArgumentParser(
description="Retrieve the reserved keywords for MySQL and/or MariaDB",
formatter_class=ArgumentDefaultsHelpFormatter,
)
parser.add_argument(
"-y",
"--mysql",
action="store_false",
default=True,
dest="retrieve_mysql",
help="Retrieve the keywords for MySQL",
)
parser.add_argument(
"-a",
"--mariadb",
action="store_false",
default=True,
dest="retrieve_mariadb",
help="Retrieve the keywords for MariaDB",
)
parser.add_argument(
"-w",
"--write",
action="store_false",
default=True,
dest="write_changes",
help="Write the keywords to a file",
)
parser.add_argument(
"-d",
"--diff",
action="store_false",
default=True,
dest="show_diff",
help="Print a diff between the original list and the new list, if available",
)
args = parser.parse_args()
if args.retrieve_mariadb:
html = get_mariadb_word_list_html()
parse_mariadb_word_list_html(html=html)
if args.retrieve_mysql:
version_contents = get_mysql_word_list_htmls()
parse_mysql_word_list_htmls(version_contents=version_contents)
if args.write_changes:
write_reserved_words_to_file(parsed_args=args)
if args.show_diff:
print_diffs()
if __name__ == "__main__":
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment