Skip to content

Instantly share code, notes, and snippets.

@Destaq
Last active October 15, 2021 22:18
Show Gist options
  • Save Destaq/06415954e3394527d2351713c53fd769 to your computer and use it in GitHub Desktop.
Save Destaq/06415954e3394527d2351713c53fd769 to your computer and use it in GitHub Desktop.
Graphs your total comprehension of a Chinese text file based on known vocabulary, number of words learnt per step, and other rules.
# Generates a graph of the percentage of known words in a text at every step.
import argparse
from LAC import LAC
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from collections import Counter
# Command-line interface.
# The summary sentence is passed as ``description``: the first positional
# parameter of ArgumentParser is ``prog``, so passing it positionally would
# put the whole sentence where the program name belongs in the usage line.
parser = argparse.ArgumentParser(
    description=(
        "Display a graph of your comprehension every *x* of a Chinese text file."
    )
)
parser.add_argument("-t", "--target", required=True, help="Path to target file.")
# Numeric options declare their type so that a malformed value is reported by
# argparse as a usage error instead of failing later during conversion.
parser.add_argument(
    "-s",
    "--step",
    required=False,
    type=int,
    default=1000,
    help="Every x characters/words.",
)
parser.add_argument(
    "-k", "--known", required=False, help="Path to your known word list."
)
parser.add_argument(
    "-l",
    "--learned",
    required=False,
    type=float,
    default=-1,  # -1 is the sentinel for "every word met in a step is learned"
    help="The number of new words you estimate you will learn for every step (float).",
)
parser.add_argument(
    "-c",
    "--cutoff",
    required=False,
    type=int,
    help="How many characters of the file to read up to",
    default=-1,  # -1 is the sentinel for "read the whole file"
)
# Only parse the command line when executed as a script; importing this module
# must not consume (or exit on) the importing program's sys.argv.
args = parser.parse_args() if __name__ == "__main__" else None
class GraphAnalyzer:
    """
    Graphs the proportion of already-known words and characters per step.

    The target text is read, stripped of whitespace, segmented into words with
    LAC, and then walked in chunks of ``step`` tokens (once word by word, once
    character by character). For every chunk the proportion of tokens that are
    already known is recorded, after which the known set grows: either by every
    token of the chunk (``learned_per_step == -1``) or by the most frequent
    unknown tokens seen so far, limited to ``learned_per_step`` per chunk.

    Constructing an instance runs the whole pipeline, including opening the
    plot via ``fig.show()``.
    """

    # Punctuation stripped from segmented words and skipped when counting
    # characters. Built from adjacent literals so the double quote and the
    # backslash can be escaped explicitly.
    ignore_chars = (
        # ASCII punctuation
        "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
        # full-width / half-width forms used in Chinese text
        "！＂＃＄％＆＇（）＊＋，－．／：；＜＝＞？＠［＼］＾＿｀｛｜｝～"
        "｡。｟｠⦅⦆｢｣､、"
        # CJK brackets, quotation marks and symbols
        "〃《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿"
        # dashes, curly quotes, ellipsis and assorted marks
        "–—‘’‛“”„‟…‧﹏﹔·─"
    )

    # LAC segmenter shared by all instances. It is created on first use (see
    # _get_segmenter) so that defining/importing the class does not load it.
    segmenter = None

    def __init__(self, target_path, step, knownfile_path, cutoff, learned_per_step):
        """
        Store the settings and immediately run the analysis.

        :param target_path: path of the text file to analyse.
        :param step: number of words/characters per measured chunk (> 0).
        :param knownfile_path: optional path to a whitespace-separated list of
            known words; falsy means nothing is known at the start.
        :param cutoff: number of characters of the text to keep, or -1 to
            keep everything.
        :param learned_per_step: how many new tokens become known after each
            chunk (may be fractional), or -1 to learn every token of the chunk.
        :raises ValueError: if ``step`` is not positive.
        """
        if step <= 0:
            raise ValueError("step must be a positive integer")
        self.target_path = target_path
        self.step = step
        self.knownfile_path = knownfile_path
        self.cutoff = cutoff
        self.learned_per_step = learned_per_step
        self.runner()

    def _get_segmenter(self):
        """Return the shared LAC segmenter, creating it on first use."""
        if self.segmenter is None:
            type(self).segmenter = LAC(mode="seg")
        return self.segmenter

    def runner(self):
        """Read and tokenise the target text, then compute and plot the data."""
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(self.target_path, "r", encoding="utf-8") as f:
            # drop every kind of whitespace, including line breaks
            text = "".join(f.read().split())
        if self.cutoff != -1:
            text = text[: self.cutoff]

        # strip the undesired punctuation from every segmented word and drop
        # the words that end up empty
        strip_table = str.maketrans("", "", self.ignore_chars)
        segmented = self._get_segmenter().run(text)
        stripped = (word.translate(strip_table) for word in segmented)
        self.text_words = [word for word in stripped if word]

        # single characters of the text, excluding ignore chars
        ignored = set(self.ignore_chars)
        self.text_chars = [char for char in text if char not in ignored]

        # generate the data
        self.data_generator()
        # graph the data
        self.graph()

    def data_generator(self):
        """
        Fill ``new_words_step_counter`` and ``new_chars_step_counter``.

        Each list holds, per chunk of ``step`` tokens, the proportion of tokens
        in that chunk that were already known when the chunk was reached.
        """
        known_words = set()
        if self.knownfile_path:
            with open(self.knownfile_path, "r", encoding="utf-8") as f:
                known_words.update(f.read().split())

        # Knowing a word implies knowing the characters it is written with.
        ignored = set(self.ignore_chars)
        known_chars = {
            char for word in known_words for char in word if char not in ignored
        }

        self.new_words_step_counter = self._known_proportions(
            self.text_words, known_words
        )
        self.new_chars_step_counter = self._known_proportions(
            self.text_chars, known_chars
        )

    def _known_proportions(self, tokens, known):
        """
        Return the known proportion of every ``step``-sized chunk of *tokens*.

        *known* is a set of known tokens; it is updated in place as tokens are
        learned from one chunk to the next.
        """
        proportions = []
        seen_counts = Counter()  # frequency of every token met so far
        budget = 0.0  # carries the fractional part of learned_per_step

        for start in range(0, len(tokens), self.step):
            chunk = tokens[start : start + self.step]
            # measured before learning, so the chunk is judged on prior knowledge
            unknown = sum(1 for token in chunk if token not in known)

            if self.learned_per_step != -1:
                seen_counts.update(chunk)
                budget += self.learned_per_step
                quota = int(budget)
                budget -= quota
                # learn the most frequent still-unknown tokens, up to the quota
                learned = 0
                for token, _count in seen_counts.most_common():
                    if learned >= quota:
                        break
                    if token not in known:
                        known.add(token)
                        learned += 1
            else:
                # you assume you will learn every token of the chunk
                known.update(chunk)

            # Divide by the real chunk size: the last chunk may be shorter
            # than ``step``.
            proportions.append(1 - unknown / len(chunk))

        return proportions

    def graph(self):
        """Plot the word and character proportions side by side with plotly."""
        import os.path

        # drop the file extension, whatever its length, for the title
        title_name = os.path.splitext(self.target_path)[0]

        fig = make_subplots(rows=1, cols=2)
        fig.add_trace(
            go.Scatter(
                x=list(range(len(self.new_words_step_counter))),
                y=self.new_words_step_counter,
                name="New Words",
                mode="lines",
                line=dict(color="blue"),
            ),
            row=1,
            col=1,
        )
        fig.add_trace(
            go.Scatter(
                x=list(range(len(self.new_chars_step_counter))),
                y=self.new_chars_step_counter,
                name="New Chars",
                mode="lines",
                line=dict(color="red"),
            ),
            row=1,
            col=2,
        )
        fig.update_layout(
            height=600,
            width=1440,
            title_text=f"New Chars and Words by Step - {title_name}",
        )
        fig.update_yaxes(showgrid=True, row=1, col=1)
        fig["layout"]["xaxis"].update(title=f"Words * {self.step}")
        fig["layout"]["yaxis"].update(title="Known Words Proportion")
        fig["layout"]["xaxis2"].update(title=f"Chars * {self.step}")
        fig["layout"]["yaxis2"].update(title="Known Chars Proportion")
        fig.show()
if __name__ == "__main__":
    # Script entry point: convert the command-line values to the numeric types
    # the analyzer works with and run it (construction performs the analysis).
    GraphAnalyzer(
        target_path=args.target,
        step=int(args.step),
        knownfile_path=args.known,
        cutoff=int(args.cutoff),
        learned_per_step=float(args.learned),
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment