Skip to content

Instantly share code, notes, and snippets.

@ChrisVilches
Last active July 6, 2022 16:10
Show Gist options
  • Save ChrisVilches/ab62b0f39a5af57260e7ed8dadaa1e41 to your computer and use it in GitHub Desktop.
Save ChrisVilches/ab62b0f39a5af57260e7ed8dadaa1e41 to your computer and use it in GitHub Desktop.
Clean HTML files generated by the Windows Kindle app when exporting highlights.
import sys
import re
# Literal substitutions, applied in insertion order. Each key is a piece of
# punctuation with stray surrounding whitespace; the value is the same
# punctuation with that whitespace removed.
replacement_rules = {
    " .": ".",
    " ,": ",",
    "( ": "(",
    " )": ")",
    " :": ":",
    " ;": ";",
    " ?": "?",
    " !": "!",
    " - ": "-",
    " / ": "/",
    "“ ": "“",
    " ”": "”",
}

# Regular-expression substitutions (pattern -> replacement), applied after
# the literal rules. Raw strings keep the backslashes readable.
replacement_rules_regex = {
    r"# ([0-9])": r"#\1",
}


def is_kindle_highlight_html(path, data):
    """Return True if *path*/*data* look like a Kindle highlight export.

    The check requires the path to end with ``.html`` and the content to
    contain both the ``class='bookTitle'`` and ``class='noteHeading'``
    markers.
    """
    return (
        path.endswith(".html")
        and "class='bookTitle'" in data
        and "class='noteHeading'" in data
    )


def clean_text(data):
    """Return *data* with every replacement rule applied.

    Literal rules from ``replacement_rules`` run first, in dictionary order,
    followed by the regex rules from ``replacement_rules_regex``. The input
    string is not modified; a new string is returned.
    """
    for old, new in replacement_rules.items():
        data = data.replace(old, new)
    for pattern, replacement in replacement_rules_regex.items():
        data = re.sub(pattern, replacement, data)
    return data


def main(argv=None):
    """Clean the Kindle highlight HTML file named on the command line.

    Args:
        argv: Argument vector; defaults to ``sys.argv``. ``argv[1]`` is the
            path of the file to clean in place.

    Raises:
        SystemExit: If no file path was supplied.
        ValueError: If the file does not look like a Kindle highlight
            HTML export.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit("Usage: pass the path to a Kindle highlight .html file.")

    path = argv[1]
    # Context managers guarantee the handle is closed even if I/O fails.
    with open(path, "r", encoding="utf8") as source:
        original_data = source.read()

    if not is_kindle_highlight_html(path, original_data):
        raise ValueError(
            "It seems this file is not a Kindle highlight HTML file."
        )

    data = clean_text(original_data)

    if original_data == data:
        print("Data didn't change.")
    else:
        print("Data changed.")
        # Only write back when there is something new to persist.
        with open(path, "w", encoding="utf8") as target:
            target.write(data)

    print("OK")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment