Skip to content

Instantly share code, notes, and snippets.

@gujiaxi
Last active February 19, 2021 14:47
Show Gist options
  • Save gujiaxi/d29167ca4aefab679a22fdfc8cf3a807 to your computer and use it in GitHub Desktop.
Save gujiaxi/d29167ca4aefab679a22fdfc8cf3a807 to your computer and use it in GitHub Desktop.
Validate duplicate highlights in Kindle clippings.
import sys
from collections import defaultdict
# This script is used for validating duplicate
# highlights of Kindle's "My Clippings.txt".
#
# Two highlights of the same book count as duplicates when one of them is a
# prefix of the other (which includes being identical), i.e. the same passage
# was highlighted again with a different end point.

# Start of the metadata line that follows a book title in a highlight entry.
HIGHLIGHT_PREFIX = "- Your Highlight on Location "
# Entries in the clippings file are separated by a line of '=' characters.
SEPARATOR_PREFIX = "=" * 10


def find_duplicates(lines):
    """Return the duplicate highlight pairs found in clippings *lines*.

    Args:
        lines: Iterable of text lines from "My Clippings.txt". Surrounding
            whitespace and U+FEFF byte-order marks are stripped from each
            line before it is inspected.

    Returns:
        A list of ``(note_short, note_long)`` tuples in the order they are
        detected, where ``note_long.startswith(note_short)`` and both notes
        were recorded under the same book title.
    """
    cleaned = [raw.strip().strip("\ufeff") for raw in lines]
    notes_by_book = defaultdict(list)
    duplicates = []
    book_title = ""

    for index, line in enumerate(cleaned):
        if line.startswith(HIGHLIGHT_PREFIX):
            # Metadata only; the title was picked up on the previous line.
            continue

        following = cleaned[index + 1] if index + 1 < len(cleaned) else ""
        if following.startswith(HIGHLIGHT_PREFIX):
            # The line directly above a highlight header is the book title.
            # Recognising it here keeps a title from being stored as a
            # highlight of the previous book, which used to produce false
            # duplicates the first time each book appeared.
            book_title = line
            continue

        # Skip blanks and separators. Lines equal to an already known book
        # title are skipped too, so titles of entries that are not
        # highlights are not treated as highlight text.
        if (not line or line.startswith(SEPARATOR_PREFIX)
                or line in notes_by_book):
            continue

        known_notes = notes_by_book[book_title]
        for note in known_notes:
            if len(line) < len(note):
                note_short, note_long = line, note
            else:
                note_short, note_long = note, line
            if note_long.startswith(note_short):
                duplicates.append((note_short, note_long))
        known_notes.append(line)

    return duplicates


def main(argv=None):
    """Print every duplicate highlight found in the file named by argv[1].

    Args:
        argv: Argument vector; defaults to ``sys.argv``.

    Returns:
        Process exit status: 1 when no file name was given, otherwise 0.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) <= 1:
        print("Usage: ./{} \"My Clippings.txt\"".format(argv[0]))
        return 1

    # Decode explicitly instead of relying on the locale default; the
    # "utf-8-sig" codec also drops a byte-order mark at the start of the file.
    with open(argv[1], "r", encoding="utf-8-sig") as clip_file:
        duplicates = find_duplicates(clip_file)

    for num, (note_short, note_long) in enumerate(duplicates, 1):
        print("[DUPLICATE #{}]".format(num))
        print("- {}".format(note_short))
        print("+ {}".format(note_long))
        print("==========")
    return 0


if __name__ == "__main__":
    sys.exit(main())
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment