# cryptocoinserver/AnkiCardSimilarity.py

## AnkiCardSimilarity.py
import html
import os
import re
import sqlite3
import zipfile

import numpy as np
import pandas as pd
import spacy
from ankipandas import Collection
from sklearn.feature_extraction.text import TfidfVectorizer
from spacy.lang.de.stop_words import STOP_WORDS
from tqdm import tqdm

tqdm.pandas()


class AnkiCardSimilarity:
    """Find near-duplicate notes in an Anki deck and tag them.

    The text of every note is cleaned (HTML removed, stop words dropped,
    optionally lemmatized) and compared pairwise via TF-IDF cosine
    similarity. Both notes of every pair scoring strictly above
    ``threshold`` receive a ``SimilarityCheck::Score...`` tag; the note of
    the pair with the shorter cleaned text additionally receives
    ``SimilarityCheck::less-information``.
    """

    # Matches one HTML/XML tag such as "<br>" or "<div class='x'>".
    _HTML_TAG = re.compile(r"<[^<]+?>")
    # Matches every character that is not an ASCII letter, umlaut or eszett.
    _NON_LETTER = re.compile(r"[^a-zA-ZäöüÄÖÜß]")

    def __init__(self, deck_name, nlp, stopwords, lemmatization, threshold=0.8):
        """Open the collection and store the configuration.

        Args:
            deck_name: Prefix of the deck name(s) whose notes are compared.
            nlp: Callable turning a string into tokens exposing ``lemma_``
                (only used when ``lemmatization`` is true).
            stopwords: Container of lower-case words to drop.
            lemmatization: Whether to lemmatize the cleaned text.
            threshold: Pairs must score strictly above this value.
        """
        self.col = Collection()
        self.deck_name = deck_name
        self.nlp = nlp
        self.stopwords = stopwords
        self.lemmatization = lemmatization
        self.threshold = threshold

    def _preprocess(self, text):
        """Normalize the raw text of one note for TF-IDF comparison.

        Args:
            text: Raw field text, possibly containing HTML.

        Returns:
            Lower-cased, space-separated words without stop words
            (lemmatized when ``self.lemmatization`` is true).
        """
        # Substitute a space, not "", so that "foo<br>bar" stays two words.
        text = self._HTML_TAG.sub(" ", text)
        # Decode entities only after tag removal so escaped text such as
        # "&lt;b&gt;" is never mistaken for markup; "&auml;" becomes "ä"
        # instead of decaying into the junk token "auml".
        text = html.unescape(text)
        text = self._NON_LETTER.sub(" ", text).lower()
        text = " ".join(
            word for word in text.split() if word not in self.stopwords
        )
        if self.lemmatization:
            text = " ".join(token.lemma_ for token in self.nlp(text))
        return text

    def _preprocess_notes(self, notes):
        """Add a ``preprocced`` column holding the cleaned text of each note.

        Args:
            notes: Frame with an ``nflds`` column of lists of field strings.

        Returns:
            The same frame, modified in place.
        """
        print("preprocessing notes")
        # Join the field list into one string and clean it in a single pass.
        notes["preprocced"] = notes["nflds"].progress_apply(
            lambda fields: self._preprocess(" ".join(fields))
        )
        return notes

    def _get_similar_notes(self, notes):
        """Collect all note pairs whose cosine similarity exceeds the threshold.

        Args:
            notes: Frame with ``nid`` and ``preprocced`` columns.

        Returns:
            List of ``(nid_a, nid_b, score)`` tuples, ordered by row position,
            with ``nid_a`` always located before ``nid_b`` in ``notes``.
        """
        print("calculating similarity")
        # Fewer than two notes cannot form a pair.
        if len(notes) < 2:
            return []
        try:
            tfidf_matrix = TfidfVectorizer().fit_transform(notes["preprocced"])
        except ValueError:
            # scikit-learn raises ValueError when the vocabulary is empty
            # (e.g. every note was reduced to stop words); nothing to compare.
            return []
        # TF-IDF rows are L2-normalized by default, so the Gram matrix holds
        # the cosine similarities. "@" is an explicit sparse matrix product.
        cosine_similarities = tfidf_matrix @ tfidf_matrix.T
        # Hoisted: one list lookup per hit instead of notes.iloc[...] per hit.
        nids = notes["nid"].tolist()
        similar_cards = []
        for i in tqdm(range(cosine_similarities.shape[0])):
            # Densify one row at a time: vectorized comparison, O(n) memory.
            row = cosine_similarities[i].toarray().ravel()
            for offset in np.flatnonzero(row[i + 1 :] > self.threshold):
                j = i + 1 + int(offset)
                similar_cards.append((nids[i], nids[j], row[j]))
        return similar_cards

    @staticmethod
    def _dedupe_tags(tags):
        """Return ``tags`` without duplicates, keeping first-occurrence order."""
        return list(dict.fromkeys(tags))

    def _add_tags(self, notes, similar_notes):
        """Tag both notes of every similar pair and flag the shorter one.

        Args:
            notes: Frame with ``nid``, ``ntags`` (lists) and ``preprocced``.
            similar_notes: Iterable of ``(nid_a, nid_b, score)`` tuples.

        Returns:
            The same frame with an updated ``ntags`` column. Existing tag
            order is preserved, so notes without new tags keep their list
            unchanged (apart from removal of pre-existing duplicates).
        """
        print("adding tags to cards")
        # Length of the cleaned text per nid, built once instead of scanning
        # the frame for every pair (first row wins if an nid repeats).
        text_length = {}
        for nid, text in zip(notes["nid"], notes["preprocced"]):
            text_length.setdefault(nid, len(text))

        new_tags = {}
        for note1, note2, similarity in tqdm(similar_notes):
            pair_tag = f"SimilarityCheck::Score{similarity}-nids{note1}*{note2}"
            new_tags.setdefault(note1, []).append(pair_tag)
            new_tags.setdefault(note2, []).append(pair_tag)
            # On equal length the second note of the pair is flagged.
            if text_length[note1] < text_length[note2]:
                shorter = note1
            else:
                shorter = note2
            new_tags.setdefault(shorter, []).append(
                "SimilarityCheck::less-information"
            )

        # Order-preserving dedupe: a set() would shuffle the tags of every
        # note, including notes that received no new tag at all.
        notes["ntags"] = pd.Series(
            [
                self._dedupe_tags(tags + new_tags.get(nid, []))
                for nid, tags in zip(notes["nid"], notes["ntags"])
            ],
            index=notes.index,
            dtype=object,
        )
        return notes

    def _save_cards(self):
        """Print a summary of the pending changes and write them back."""
        self.col.summarize_changes(output="print")
        self.col.write(modify=True)

    def run(self):
        """Run the whole pipeline: select, preprocess, compare, tag, save."""
        cards = self.col.cards.merge_notes()
        # Keep cards whose deck name starts with the configured prefix.
        cards = cards[cards["cdeck"].str.startswith(self.deck_name)]

        selected_notes = self.col.notes[self.col.notes.nid.isin(cards.nid)]
        # Expose the index as a regular column for the helper methods.
        selected_notes = selected_notes.reset_index()

        selected_notes = self._preprocess_notes(selected_notes)
        similar_notes = self._get_similar_notes(selected_notes)
        selected_notes = self._add_tags(selected_notes, similar_notes)

        selected_notes = selected_notes.set_index("nid")
        # The helper column must not be written back to the collection.
        selected_notes = selected_notes.drop(columns=["preprocced"])

        selected_notes.info()

        # Update the notes in the collection, then persist.
        self.col.notes.update(selected_notes)

        self._save_cards()


if __name__ == "__main__":
    # Script entry point: check the "Statistik" deck using the large German
    # spaCy model, spaCy's German stop-word list, lemmatization enabled and
    # a similarity threshold of 0.8.
    checker = AnkiCardSimilarity(
        deck_name="Statistik",
        nlp=spacy.load("de_core_news_lg"),
        stopwords=STOP_WORDS,
        lemmatization=True,
        threshold=0.8,
    )
    checker.run()
	import os
	import re
	import sqlite3
	import zipfile

	import numpy as np
	import pandas as pd
	import spacy
	from ankipandas import Collection
	from sklearn.feature_extraction.text import TfidfVectorizer
	from spacy.lang.de.stop_words import STOP_WORDS
	from tqdm import tqdm

	tqdm.pandas()


	class AnkiCardSimilarity:
		"""Find near-duplicate notes in an Anki deck and tag them.

		The text of every note is cleaned (HTML removed, stop words dropped,
		optionally lemmatized) and compared pairwise via TF-IDF cosine
		similarity. Both notes of every pair scoring strictly above
		``threshold`` receive a ``SimilarityCheck::Score...`` tag; the note of
		the pair with the shorter cleaned text additionally receives
		``SimilarityCheck::less-information``.
		"""

		# Matches one HTML/XML tag such as "<br>" or "<div class='x'>".
		_HTML_TAG = re.compile(r"<[^<]+?>")
		# Matches every character that is not an ASCII letter, umlaut or eszett.
		_NON_LETTER = re.compile(r"[^a-zA-ZäöüÄÖÜß]")

		def __init__(self, deck_name, nlp, stopwords, lemmatization, threshold=0.8):
			"""Open the collection and store the configuration.

			Args:
				deck_name: Prefix of the deck name(s) whose notes are compared.
				nlp: Callable turning a string into tokens exposing ``lemma_``
					(only used when ``lemmatization`` is true).
				stopwords: Container of lower-case words to drop.
				lemmatization: Whether to lemmatize the cleaned text.
				threshold: Pairs must score strictly above this value.
			"""
			self.col = Collection()
			self.deck_name = deck_name
			self.nlp = nlp
			self.stopwords = stopwords
			self.lemmatization = lemmatization
			self.threshold = threshold

		def _preprocess(self, text):
			"""Normalize the raw text of one note for TF-IDF comparison.

			Args:
				text: Raw field text, possibly containing HTML.

			Returns:
				Lower-cased, space-separated words without stop words
				(lemmatized when ``self.lemmatization`` is true).
			"""
			# Substitute a space, not "", so that "foo<br>bar" stays two words.
			text = self._HTML_TAG.sub(" ", text)
			# Decode entities only after tag removal so escaped text such as
			# "&lt;b&gt;" is never mistaken for markup; "&auml;" becomes "ä"
			# instead of decaying into the junk token "auml".
			text = html.unescape(text)
			text = self._NON_LETTER.sub(" ", text).lower()
			text = " ".join(
				word for word in text.split() if word not in self.stopwords
			)
			if self.lemmatization:
				text = " ".join(token.lemma_ for token in self.nlp(text))
			return text

		def _preprocess_notes(self, notes):
			"""Add a ``preprocced`` column holding the cleaned text of each note.

			Args:
				notes: Frame with an ``nflds`` column of lists of field strings.

			Returns:
				The same frame, modified in place.
			"""
			print("preprocessing notes")
			# Join the field list into one string and clean it in a single pass.
			notes["preprocced"] = notes["nflds"].progress_apply(
				lambda fields: self._preprocess(" ".join(fields))
			)
			return notes

		def _get_similar_notes(self, notes):
			"""Collect all note pairs whose cosine similarity exceeds the threshold.

			Args:
				notes: Frame with ``nid`` and ``preprocced`` columns.

			Returns:
				List of ``(nid_a, nid_b, score)`` tuples, ordered by row position,
				with ``nid_a`` always located before ``nid_b`` in ``notes``.
			"""
			print("calculating similarity")
			# Fewer than two notes cannot form a pair.
			if len(notes) < 2:
				return []
			try:
				tfidf_matrix = TfidfVectorizer().fit_transform(notes["preprocced"])
			except ValueError:
				# scikit-learn raises ValueError when the vocabulary is empty
				# (e.g. every note was reduced to stop words); nothing to compare.
				return []
			# TF-IDF rows are L2-normalized by default, so the Gram matrix holds
			# the cosine similarities. "@" is an explicit sparse matrix product.
			cosine_similarities = tfidf_matrix @ tfidf_matrix.T
			# Hoisted: one list lookup per hit instead of notes.iloc[...] per hit.
			nids = notes["nid"].tolist()
			similar_cards = []
			for i in tqdm(range(cosine_similarities.shape[0])):
				# Densify one row at a time: vectorized comparison, O(n) memory.
				row = cosine_similarities[i].toarray().ravel()
				for offset in np.flatnonzero(row[i + 1 :] > self.threshold):
					j = i + 1 + int(offset)
					similar_cards.append((nids[i], nids[j], row[j]))
			return similar_cards

		@staticmethod
		def _dedupe_tags(tags):
			"""Return ``tags`` without duplicates, keeping first-occurrence order."""
			return list(dict.fromkeys(tags))

		def _add_tags(self, notes, similar_notes):
			"""Tag both notes of every similar pair and flag the shorter one.

			Args:
				notes: Frame with ``nid``, ``ntags`` (lists) and ``preprocced``.
				similar_notes: Iterable of ``(nid_a, nid_b, score)`` tuples.

			Returns:
				The same frame with an updated ``ntags`` column. Existing tag
				order is preserved, so notes without new tags keep their list
				unchanged (apart from removal of pre-existing duplicates).
			"""
			print("adding tags to cards")
			# Length of the cleaned text per nid, built once instead of scanning
			# the frame for every pair (first row wins if an nid repeats).
			text_length = {}
			for nid, text in zip(notes["nid"], notes["preprocced"]):
				text_length.setdefault(nid, len(text))

			new_tags = {}
			for note1, note2, similarity in tqdm(similar_notes):
				pair_tag = f"SimilarityCheck::Score{similarity}-nids{note1}*{note2}"
				new_tags.setdefault(note1, []).append(pair_tag)
				new_tags.setdefault(note2, []).append(pair_tag)
				# On equal length the second note of the pair is flagged.
				if text_length[note1] < text_length[note2]:
					shorter = note1
				else:
					shorter = note2
				new_tags.setdefault(shorter, []).append(
					"SimilarityCheck::less-information"
				)

			# Order-preserving dedupe: a set() would shuffle the tags of every
			# note, including notes that received no new tag at all.
			notes["ntags"] = pd.Series(
				[
					self._dedupe_tags(tags + new_tags.get(nid, []))
					for nid, tags in zip(notes["nid"], notes["ntags"])
				],
				index=notes.index,
				dtype=object,
			)
			return notes

		def _save_cards(self):
			"""Print a summary of the pending changes and write them back."""
			self.col.summarize_changes(output="print")
			self.col.write(modify=True)

		def run(self):
			"""Run the whole pipeline: select, preprocess, compare, tag, save."""
			cards = self.col.cards.merge_notes()
			# Keep cards whose deck name starts with the configured prefix.
			cards = cards[cards["cdeck"].str.startswith(self.deck_name)]

			selected_notes = self.col.notes[self.col.notes.nid.isin(cards.nid)]
			# Expose the index as a regular column for the helper methods.
			selected_notes = selected_notes.reset_index()

			selected_notes = self._preprocess_notes(selected_notes)
			similar_notes = self._get_similar_notes(selected_notes)
			selected_notes = self._add_tags(selected_notes, similar_notes)

			selected_notes = selected_notes.set_index("nid")
			# The helper column must not be written back to the collection.
			selected_notes = selected_notes.drop(columns=["preprocced"])

			selected_notes.info()

			# Update the notes in the collection, then persist.
			self.col.notes.update(selected_notes)

			self._save_cards()


	if __name__ == "__main__":
		# Script entry point: configure the checker for the "Statistik" deck
		# with the large German spaCy model and run the whole pipeline.
		deck_name = "Statistik"
		nlp = spacy.load("de_core_news_lg")
		stopwords = STOP_WORDS
		lemmatization = True
		threshold = 0.8
		anki_card_similarity = AnkiCardSimilarity(
			deck_name, nlp, stopwords, lemmatization, threshold
		)
		anki_card_similarity.run()