# Destaq/graph_comprehension.py

## graph_comprehension.py
# generates a graph of the proportion of known words and characters in a text at every step
import argparse
from LAC import LAC
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from collections import Counter

# Command-line interface.
#
# The text is passed as ``description=``: the first positional parameter of
# ArgumentParser is ``prog``, so passing the sentence positionally made it show
# up as the program name in the usage line instead of as the description.
#
# Numeric options declare ``type=`` so that invalid input is reported by
# argparse as a usage error.  Non-string defaults are left untouched by
# argparse, so the -1 sentinels below keep their meaning.
#
# Parsing happens at module level because the ``__main__`` guard at the bottom
# of the file reads the module-level ``args``.
parser = argparse.ArgumentParser(
    description=(
        "Display a graph of your comprehension every *x* of a Chinese text file."
    )
)
parser.add_argument("-t", "--target", required=True, help="Path to target file.")
parser.add_argument(
    "-s",
    "--step",
    required=False,
    default=1000,
    type=int,
    help="Every x characters/words.",
)
parser.add_argument(
    "-k", "--known", required=False, help="Path to your known word list."
)
parser.add_argument(
    "-l",
    "--learned",
    required=False,
    default=-1,  # -1: every word/character seen in a step becomes known
    type=float,
    help="The number of new words you estimate you will learn for every step (float).",
)
parser.add_argument(
    "-c",
    "--cutoff",
    required=False,
    help="How many characters of the file to read up to",
    default=-1,  # -1: read the whole file
    type=int,
)
args = parser.parse_args()


class GraphAnalyzer:
    """
    Shows the proportion of already-known words and characters every step.

    The target text is split into words (with the LAC segmenter) and into
    single characters.  Each sequence is walked in chunks of ``step`` items.
    For every chunk the proportion of items that were known *before* the chunk
    was read is recorded, then the known vocabulary is grown before moving on
    to the next chunk.  Both series are plotted side by side with plotly.

    Creating an instance runs the whole pipeline (read, analyse, plot).
    """

    # Punctuation/symbols removed from segmented words and skipped when
    # collecting single characters.
    ignore_chars = ",.:()!@[]+/\\！?？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.?;﹔|.-·-*─''\""

    # Shared LAC segmenter in "seg" mode, created once when the class is defined.
    segmenter = LAC(mode="seg")

    def __init__(self, target_path, step, knownfile_path, cutoff, learned_per_step):
        """
        Store the settings and immediately run the analysis.

        target_path: path of the text file to analyse.
        step: chunk size, counted in words for the word series and in
            characters for the character series.
        knownfile_path: path to a whitespace-separated list of known words;
            a falsy value means nothing is known at the start.
        cutoff: number of characters (after whitespace removal) to keep from
            the start of the text; -1 keeps the whole text.
        learned_per_step: how many new items are assumed to be learned after
            each chunk (may be fractional); -1 means every item seen in a
            chunk becomes known.

        Raises ValueError if ``step`` is not positive.
        """
        if step <= 0:
            # A zero step makes range() fail later and a negative one silently
            # produces empty graphs; reject both up front with a clear message.
            raise ValueError("step must be a positive integer")

        self.target_path = target_path
        self.step = step
        self.knownfile_path = knownfile_path
        self.cutoff = cutoff
        self.learned_per_step = learned_per_step

        self.runner()

    def runner(self):
        """Read and tokenise the target text, then compute and plot the data."""
        # Explicit encoding so decoding does not depend on the platform's
        # locale; "utf-8-sig" additionally drops a leading byte-order mark.
        with open(self.target_path, "r", encoding="utf-8-sig") as f:
            # drop all whitespace (spaces, newlines, ...) from the text
            text = "".join(f.read().split())

        if self.cutoff != -1:
            text = text[: self.cutoff]

        # Built once per run instead of once per word.
        strip_table = str.maketrans("", "", self.ignore_chars)
        ignored = set(self.ignore_chars)

        # strip the undesired punctuation from every segmented word and drop
        # the words that end up empty
        stripped_words = (
            word.translate(strip_table) for word in self.segmenter.run(text)
        )
        self.text_words = [word for word in stripped_words if word]

        # single characters of the text, excluding ignore chars
        self.text_chars = [char for char in text if char not in ignored]

        # generate the data
        self.data_generator()

        # graph the data
        self.graph()

    @staticmethod
    def _known_proportions(tokens, known, step, learned_per_step):
        """
        Return, for each chunk of ``step`` tokens, the proportion already known.

        tokens: list of words or characters, in reading order.
        known: set of tokens known at the start; it is updated in place.
        step: chunk size (positive).
        learned_per_step: tokens assumed learned after each chunk, or -1 to
            learn every token of the chunk.

        The proportion of a chunk is computed against the chunk's own length,
        so a shorter final chunk is not overstated.
        """
        proportions = []
        # how often each token has been seen so far (across all chunks)
        seen = Counter()
        # fractional "learning budget" carried from one chunk to the next
        budget = 0.0

        for start in range(0, len(tokens), step):
            chunk = tokens[start : start + step]
            unknown = sum(1 for token in chunk if token not in known)

            if learned_per_step != -1:
                seen.update(chunk)

                # Only whole tokens can be learned; keep the fraction for later.
                budget += learned_per_step
                quota = int(budget)
                budget -= quota

                if quota > 0:
                    # learn the most frequently seen tokens that are still unknown
                    added = 0
                    for token, _count in seen.most_common():
                        if added >= quota:
                            break
                        if token not in known:
                            known.add(token)
                            added += 1
            else:
                # you assume you will learn every token of the chunk
                known.update(chunk)

            proportions.append(1 - unknown / len(chunk))

        return proportions

    def data_generator(self):
        """
        Compute the per-step known proportions for words and characters.

        Results are stored in ``new_words_step_counter`` and
        ``new_chars_step_counter`` (one float per step).
        """
        # Sets give constant-time membership tests; only membership is needed.
        if self.knownfile_path:
            with open(self.knownfile_path, "r", encoding="utf-8-sig") as f:
                known_words = set(f.read().split())
            # Known characters are the individual characters of the known
            # words; whole multi-character words could never match a character.
            ignored = set(self.ignore_chars)
            known_chars = {
                char for word in known_words for char in word if char not in ignored
            }
        else:
            known_words = set()
            known_chars = set()

        self.new_words_step_counter = self._known_proportions(
            self.text_words, known_words, self.step, self.learned_per_step
        )
        self.new_chars_step_counter = self._known_proportions(
            self.text_chars, known_chars, self.step, self.learned_per_step
        )

    def graph(self):
        """Show the word and character series as two side-by-side line plots."""
        fig = make_subplots(rows=1, cols=2)

        fig.add_trace(
            go.Scatter(
                x=list(range(0, len(self.new_words_step_counter))),
                y=self.new_words_step_counter,
                name="New Words",
                mode="lines",
                line=dict(color="blue"),
            ),
            row=1,
            col=1,
        )

        fig.add_trace(
            go.Scatter(
                x=list(range(0, len(self.new_chars_step_counter))),
                y=self.new_chars_step_counter,
                name="New Chars",
                mode="lines",
                line=dict(color="red"),
            ),
            row=1,
            col=2,
        )

        fig.update_layout(
            height=600,
            width=1440,
            # NOTE(review): [:-4] presumably drops a ".txt"-style extension;
            # paths with a different suffix length get a truncated title.
            title_text=f"New Chars and Words by Step - {self.target_path[:-4]}",
        )
        fig.update_yaxes(showgrid=True, row=1, col=1)
        fig["layout"]["xaxis"].update(title=f"Words * {self.step}")
        fig["layout"]["yaxis"].update(title="Known Words Proportion")
        fig["layout"]["xaxis2"].update(title=f"Chars * {self.step}")
        fig["layout"]["yaxis2"].update(title="Known Chars Proportion")
        fig.show()


if __name__ == "__main__":
    # Script entry point: coerce the parsed command-line values and build the
    # analyzer, whose constructor runs the whole analysis.
    GraphAnalyzer(
        target_path=args.target,
        step=int(args.step),
        knownfile_path=args.known,
        cutoff=int(args.cutoff),
        learned_per_step=float(args.learned),
    )
	# generates a graph of the percentage known words of a text at every step
	import argparse
	from LAC import LAC
	from plotly.subplots import make_subplots
	import plotly.graph_objects as go
	from collections import Counter

	# NOTE(review): this block repeats the argparse setup that appears earlier in
	# this file, with every line flattened to a single tab of indentation.
	# It is kept verbatim; only these comments were added.
	# NOTE(review): the sentence below is passed positionally to ArgumentParser.
	parser = argparse.ArgumentParser(
	"Display a graph of your comprehension every x of a Chinese text file."
	)
	parser.add_argument("-t", "--target", required=True, help="Path to target file.")
	parser.add_argument(
	"-s", "--step", required=False, default=1000, help="Every x characters/words."
	)
	parser.add_argument(
	"-k", "--known", required=False, help="Path to your known word list."
	)
	parser.add_argument(
	"-l",
	"--learned",
	required=False,
	default=-1,
	help="The number of new words you estimate you will learn for every step (float).",
	)
	parser.add_argument(
	"-c",
	"--cutoff",
	required=False,
	help="How many characters of the file to read up to",
	default=-1,
	)
	args = parser.parse_args()


	# NOTE(review): flattened duplicate of the GraphAnalyzer class defined earlier
	# in this file. Every line sits at one tab of indentation, so the nesting of
	# methods, loops and branches cannot be read from this text and the block
	# does not parse as written. Kept verbatim; only comments were added.
	class GraphAnalyzer:
	"""
	Shows cumulative new words every step.
	"""

	ignore_chars = ",.:()!@[]+/\\！?？｡。＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､、〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏.?;﹔\|.-·-*─''\""

	segmenter = LAC(mode="seg")

	# Stores the five constructor arguments on the instance, then calls runner().
	def __init__(self, target_path, step, knownfile_path, cutoff, learned_per_step):
	self.target_path = target_path
	self.step = step
	self.knownfile_path = knownfile_path
	self.cutoff = cutoff
	self.learned_per_step = learned_per_step

	self.runner()

	# Reads the target file with all whitespace removed, truncates it to
	# `cutoff` characters unless cutoff is -1, segments it into words, removes
	# ignore_chars, then calls data_generator() and graph().
	def runner(self):
	with open(self.target_path, "r") as f:
	text = "".join(f.read().split())

	if self.cutoff != -1:
	text = text[: self.cutoff]

	text_words = self.segmenter.run(text)

	# now strip the undesired punctuation from every element of text_words
	for i in range(len(text_words)):
	text_words[i] = text_words[i].translate(
	str.maketrans("", "", self.ignore_chars)
	)

	# remove elements that are just empty strings
	self.text_words = [x for x in text_words if x != ""]

	# create a list of single characters from the text, excluding ignore chars
	self.text_chars = [x for x in text if x not in self.ignore_chars]

	# generate the data
	self.data_generator()

	# graph the data
	self.graph()

	# Fills new_words_step_counter and new_chars_step_counter with one value per
	# slice of `step` items, each computed as 1 - (items not yet known) / step.
	def data_generator(self):
	# create a list of known words and chars
	if self.knownfile_path:
	with open(self.knownfile_path, "r") as f:
	known_words = f.read().split()
	known_chars = [x for x in known_words if x not in self.ignore_chars]
	else:
	known_words = []
	known_chars = []

	# trackers for updating known words and chars if they are a float
	word_tracker = 0.0
	char_tracker = 0.0

	# calculate the number of new words and chars every step
	self.new_words_step_counter = []
	current_possible_new_known_words = Counter([])
	for i in range(0, len(self.text_words), self.step):
	current_new_words = 0
	# check for new words
	for word in self.text_words[i : i + self.step]:
	if word not in known_words:
	current_new_words += 1

	# update known words based off of estimated gain in knowledge
	if self.learned_per_step != -1:
	for word in self.text_words[i : i + self.step]:
	current_possible_new_known_words[word] += 1

	# update tracker
	word_tracker += self.learned_per_step
	remainder = word_tracker - int(word_tracker)

	# choose the top learned_amount from the counter to add to known words
	added = 0
	for word, count in current_possible_new_known_words.most_common():
	if added < int(word_tracker):
	if word not in known_words:
	known_words.append(word)
	added += 1
	else:
	break

	word_tracker = remainder
	else:
	# you assume you will learn every word
	for word in self.text_words[i : i + self.step]:
	known_words.append(word)

	# append to the step counter
	self.new_words_step_counter.append(1 - current_new_words / self.step)

	#########

	self.new_chars_step_counter = []
	# now do the same for characters
	current_possible_new_known_chars = Counter([])
	for i in range(0, len(self.text_chars), self.step):
	current_new_chars = 0
	# check for new chars
	for char in self.text_chars[i : i + self.step]:
	if char not in known_chars:
	current_new_chars += 1

	# update known chars based off of estimated gain in knowledge
	if self.learned_per_step != -1:
	for char in self.text_chars[i : i + self.step]:
	current_possible_new_known_chars[char] += 1

	# update tracker
	char_tracker += self.learned_per_step
	remainder = char_tracker - int(char_tracker)

	# choose the top learned_amount from the counter to add to known chars
	added = 0
	for char, count in current_possible_new_known_chars.most_common():
	if added < int(char_tracker):
	if char not in known_chars:
	known_chars.append(char)
	added += 1
	else:
	break

	char_tracker = remainder
	else:
	# you assume you will learn every char
	for char in self.text_chars[i : i + self.step]:
	known_chars.append(char)

	# append to the step counter
	self.new_chars_step_counter.append(1 - current_new_chars / self.step)

	# Adds the two series as line traces to a 1x2 plotly subplot figure, sets
	# the titles and shows the figure.
	def graph(self):
	# create a graph of the new chars and new words per step using plotly express
	# show both figures on the graph

	fig = make_subplots(rows=1, cols=2)

	fig.add_trace(
	go.Scatter(
	x=list(range(0, len(self.new_words_step_counter))),
	y=self.new_words_step_counter,
	name="New Words",
	mode="lines",
	line=dict(color="blue"),
	),
	row=1,
	col=1,
	)


	fig.add_trace(
	go.Scatter(
	x=list(range(0, len(self.new_chars_step_counter))),
	y=self.new_chars_step_counter,
	name="New Chars",
	mode="lines",
	line=dict(color="red"),
	),
	row=1,
	col=2,
	)

	fig.update_layout(
	height=600,
	width=1440,
	title_text=f"New Chars and Words by Step - {self.target_path[:-4]}",
	)
	# create gap between subplots
	fig.update_yaxes(showgrid=True, row=1, col=1)
	fig["layout"]["xaxis"].update(title=f"Words * {self.step}")
	fig["layout"]["yaxis"].update(title=f"Known Words Proportion")
	fig["layout"]["xaxis2"].update(title=f"Chars * {self.step}")
	fig["layout"]["yaxis2"].update(title=f"Known Chars Proportion")
	fig.show()


	# NOTE(review): flattened duplicate of the script entry point that appears
	# earlier in this file; kept verbatim, only this comment was added.
	if __name__ == "__main__":
	GraphAnalyzer(
	args.target, int(args.step), args.known, int(args.cutoff), float(args.learned)
	)