AlexRiina/anki_similarity.py

## anki_similarity.py
"""
Text clustering of Anki cards to find duplicated concepts. Prints a handful of notes, each followed by a list of similar notes.

```
pip install anki_sqlalchemy beautifulsoup4 scikit-learn
cp ${ANKI_DATABASE:?replace me} backup.db
python anki_similarity.py
```
"""

import re
import bs4
import random
from anki_sqlalchemy import Note, Collection
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer

# Matches one Anki cloze deletion such as ``{{c1::answer}}`` or
# ``{{c1::answer::hint}}``.  The named group ``group`` captures the cloze id
# (``c1``) and ``answer`` the hidden text; the optional trailing group consumes
# the hint so that substituting the ``answer`` group leaves only the answer.
# ``group`` is restricted to ``c<digits>`` so unrelated ``{{`` text cannot
# start a match, DOTALL lets an answer span line breaks, and IGNORECASE
# accepts an upper-case ``C1``.
CLOZE_EXTRACT = re.compile(
    r"{{(?P<group>c\d+)::(?P<answer>.*?)(::.*?)?}}",
    re.DOTALL | re.IGNORECASE,
)

# Open the backup copy of the collection. echo=True makes SQLAlchemy log the
# SQL it issues.
engine = create_engine("sqlite:///backup.db", echo=True)
Session = sessionmaker(bind=engine)
session = Session()

# .one() raises unless the query yields exactly one Collection row.
col = session.query(Collection).one()

# First model named "Cloze"; next() raises StopIteration when none exists.
cloze = next(
    filter(lambda model: model["name"] == "Cloze", col.models.values())
)
cloze_notes = session.query(Note).filter_by(model_id=cloze["id"])

# "ord" of the field named "Text"; used later to index note.fields.
text_ord = next(
    filter(lambda field: field["name"] == "Text", cloze["flds"])
)["ord"]


def flatten(a: str) -> str:
    """Reduce the raw text of a note field to lowercase plain text.

    ``&nbsp;`` entities become ordinary spaces, HTML markup is stripped when
    the text contains a ``<``, and every cloze deletion matched by
    ``CLOZE_EXTRACT`` is replaced by its answer.

    Args:
        a: Raw field content, possibly containing HTML and cloze markers.

    Returns:
        The lower-cased text with markup and cloze wrappers removed.
    """
    a = a.replace("&nbsp;", " ")

    if "<" in a:
        # Name the parser explicitly: without it BeautifulSoup picks whichever
        # parser happens to be installed (and warns), so the extracted text
        # could differ between machines. "html.parser" ships with Python.
        a = bs4.BeautifulSoup(a, "html.parser").get_text()

    return CLOZE_EXTRACT.sub(r"\g<answer>", a).lower()


# Map note id -> flattened text, then split into two parallel lists so a row
# index of the vector matrix can be translated back to its text.
cloze_pairs = {note.id: flatten(note.fields[text_ord]) for note in cloze_notes}
cloze_ids, cloze_texts = list(cloze_pairs.keys()), list(cloze_pairs.values())

# use_idf=False: term-frequency weighting only, English stop words dropped.
tfidf = TfidfVectorizer(use_idf=False, stop_words='english')
text_vectors = tfidf.fit_transform(cloze_texts)

# Densify once and reuse. .toarray() returns an ndarray, whereas .todense()
# returns an np.matrix, which current scikit-learn releases refuse as input.
dense_vectors = text_vectors.toarray()

nbrs = NearestNeighbors(n_neighbors=3, algorithm="ball_tree")
nbrs.fit(dense_vectors)
neighbors_lists = nbrs.radius_neighbors(
    dense_vectors, radius=0.9, return_distance=False
)
# Keep only rows that have a neighbour besides themselves, as
# (row index, other row indices).
neighbors_lists = [
    (index, list(set(neighbors) - {index}))
    for index, neighbors in enumerate(neighbors_lists)
    if len(neighbors) > 1  # more than self
]

# random.sample raises ValueError when asked for more items than exist, so
# cap the sample at the number of groups actually found.
sample_size = min(12, len(neighbors_lists))
for idx, neighbors in random.sample(neighbors_lists, sample_size):
    print(f"{cloze_texts[idx]} is similar to")

    for neighbor in neighbors:
        # First 50 characters of the similar note plus its full length.
        print('\t', cloze_texts[neighbor][:50], len(cloze_texts[neighbor]))
	"""
	Text clustering of Anki cards to find duplicated concepts. Prints a handful of notes, each followed by a list of similar notes.

	```
	pip install anki_sqlalchemy beautifulsoup4 scikit-learn
	cp ${ANKI_DATABASE:?replace me} backup.db
	python anki_similarity.py
	```
	"""

	import re
	import bs4
	import random
	from anki_sqlalchemy import Note, Collection
	from sqlalchemy import create_engine
	from sqlalchemy.orm import sessionmaker

	from sklearn.neighbors import NearestNeighbors
	from sklearn.feature_extraction.text import TfidfVectorizer

	# Matches one Anki cloze deletion such as ``{{c1::answer}}`` or
	# ``{{c1::answer::hint}}``.  The named group ``group`` captures the cloze id
	# (``c1``) and ``answer`` the hidden text; the optional trailing group
	# consumes the hint so that substituting the ``answer`` group leaves only
	# the answer.  The ``*`` quantifiers are required: with a bare ``.?`` the
	# pattern can match at most one character per group and never matches a
	# real cloze.  DOTALL lets an answer span line breaks and IGNORECASE
	# accepts an upper-case ``C1``.
	CLOZE_EXTRACT = re.compile(
	    r"{{(?P<group>c\d+)::(?P<answer>.*?)(::.*?)?}}",
	    re.DOTALL | re.IGNORECASE,
	)

	# Open the backup copy of the collection. echo=True makes SQLAlchemy log
	# the SQL it issues.
	engine = create_engine("sqlite:///backup.db", echo=True)
	Session = sessionmaker(bind=engine)
	session = Session()

	# .one() raises unless the query yields exactly one Collection row.
	col = session.query(Collection).one()

	# First model named "Cloze"; next() raises StopIteration when none exists.
	cloze = next(
	    filter(lambda model: model["name"] == "Cloze", col.models.values())
	)
	cloze_notes = session.query(Note).filter_by(model_id=cloze["id"])

	# "ord" of the field named "Text"; used later to index note.fields.
	text_ord = next(
	    filter(lambda field: field["name"] == "Text", cloze["flds"])
	)["ord"]


	def flatten(a: str) -> str:
	    """Reduce the raw text of a note field to lowercase plain text.

	    ``&nbsp;`` entities become ordinary spaces, HTML markup is stripped
	    when the text contains a ``<``, and every cloze deletion matched by
	    ``CLOZE_EXTRACT`` is replaced by its answer.

	    Args:
	        a: Raw field content, possibly containing HTML and cloze markers.

	    Returns:
	        The lower-cased text with markup and cloze wrappers removed.
	    """
	    # The entity must be spelled out; replacing a space with a space
	    # would be a no-op.
	    a = a.replace("&nbsp;", " ")

	    if "<" in a:
	        # Name the parser explicitly: without it BeautifulSoup picks
	        # whichever parser happens to be installed (and warns), so the
	        # extracted text could differ between machines.
	        a = bs4.BeautifulSoup(a, "html.parser").get_text()

	    return CLOZE_EXTRACT.sub(r"\g<answer>", a).lower()


	# Map note id -> flattened text, then split into two parallel lists so a
	# row index of the vector matrix can be translated back to its text.
	cloze_pairs = {note.id: flatten(note.fields[text_ord]) for note in cloze_notes}
	cloze_ids, cloze_texts = list(cloze_pairs.keys()), list(cloze_pairs.values())

	# use_idf=False: term-frequency weighting only, English stop words dropped.
	tfidf = TfidfVectorizer(use_idf=False, stop_words='english')
	text_vectors = tfidf.fit_transform(cloze_texts)

	# Densify once and reuse. .toarray() returns an ndarray, whereas .todense()
	# returns an np.matrix, which current scikit-learn releases refuse as input.
	dense_vectors = text_vectors.toarray()

	nbrs = NearestNeighbors(n_neighbors=3, algorithm="ball_tree")
	nbrs.fit(dense_vectors)
	neighbors_lists = nbrs.radius_neighbors(
	    dense_vectors, radius=0.9, return_distance=False
	)
	# Keep only rows that have a neighbour besides themselves, as
	# (row index, other row indices).
	neighbors_lists = [
	    (index, list(set(neighbors) - {index}))
	    for index, neighbors in enumerate(neighbors_lists)
	    if len(neighbors) > 1  # more than self
	]

	# random.sample raises ValueError when asked for more items than exist, so
	# cap the sample at the number of groups actually found.
	sample_size = min(12, len(neighbors_lists))
	for idx, neighbors in random.sample(neighbors_lists, sample_size):
	    print(f"{cloze_texts[idx]} is similar to")

	    for neighbor in neighbors:
	        # First 50 characters of the similar note plus its full length.
	        print('\t', cloze_texts[neighbor][:50], len(cloze_texts[neighbor]))