Skip to content

Instantly share code, notes, and snippets.

@FrankGrimm
Created November 10, 2020 08:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save FrankGrimm/1ac8cfbb0cd035692f319c492757a885 to your computer and use it in GitHub Desktop.
Save FrankGrimm/1ac8cfbb0cd035692f319c492757a885 to your computer and use it in GitHub Desktop.
from collections import defaultdict
import numpy as np
# Scratch counter: nothing else in this file reads or updates it.
choices = defaultdict(int)
# sentence = [["[SOS]", "lorem"], ["lorem", "ipsum"], ["ipsum", "dolor"], ...]
# [SOS] "lorem" [EOS]
#
# 0 1 2 3 4 5 6
# "[SOS]", "lorem", "ipsum", "dolor", "silir", "amet", "[EOS]"
#
# 0_[SOS], 1_lorem, 2_ipsum,
#
# -----
#
# 0 1 2 3 4 5 6 7 ...
# ["[SOS]", "lorem", "ipsum", "dolor", "silir", "amet", "[EOS]", "[PAD]", "[PAD]", "[PAD]", "[PAD]"]
#
# -3 -2 -1
# ["[SOS]", "lorem", "ipsum", <sample>
#
# -2 -1 | i | +1 +2
# ["[SOS]", "lorem", "ipsum", "dolor" | "silir" | "amet", "[EOS]", "[PAD]", "[PAD]", "[PAD]", "[PAD]"]
# ^--- current
#
#
# {[SOS], [EOS], [PAD], lorem, ipsum, dolor, silir, amet}
# Toy example: count how often each token occurs in a single sentence.
# defaultdict(int) makes a missing token start at 0, so "+= 1" needs no
# explicit initialisation.
unigrams = defaultdict(int)
sentence = ["[SOS]", "lorem", "ipsum", "dolor", "silir", "amet", "[EOS]"]
for token in sentence:
    unigrams[token] += 1
print(unigrams)

# Declared alongside the unigram counter but never filled in by this file;
# the corpus pass further down uses `bigram_counts` instead.
bigrams = defaultdict(int)
# corpus:
# - doc_0
# - doc_1
# - doc_2
# - doc_3
# - doc_4
#
# ------------ freeze and evaluate
# [UNK]
# - doc_5
# - doc_6
class Vocabulary:
    """Two-way mapping between tokens and dense integer ids.

    Ids are handed out in insertion order starting at 0. After
    :meth:`freeze` has been called the mapping is read-only and
    :meth:`add` raises.
    """

    def __init__(self):
        self._frozen = False
        # The two dicts are mirror images of each other; add() is the only
        # place that writes to them.
        self.id2token = {}
        self.token2id = {}

    @property
    def frozen(self):
        """True once :meth:`freeze` has been called."""
        return self._frozen

    def freeze(self):
        """Make the vocabulary read-only."""
        self._frozen = True

    def add(self, token):
        """Register *token* under the next free id.

        Returns:
            True if the token was newly added, False if it was already
            known (in which case nothing changes).

        Raises:
            RuntimeError: if the vocabulary is frozen. The check happens
                before the duplicate check, so even re-adding a known
                token raises once frozen.
        """
        if self.frozen:
            raise RuntimeError("cannot modify frozen vocabulary")
        if token in self.token2id:
            return False
        # Ids are dense, so the current size is the next unused id.
        newid = len(self.token2id)
        self.token2id[token] = newid
        self.id2token[newid] = token
        return True

    def get(self, token):
        """Return the id assigned to *token*, or None if it is unknown."""
        return self.token2id.get(token)
import string

import numpy as np

corpus = "./snlp_worksheets/worksheet1/corpus.txt"

# Translation table that deletes every character in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def _iter_bigrams(path):
    """Yield each adjacent-token bigram of the text file at *path*.

    Every line is stripped of punctuation, lower-cased and split on
    whitespace; each pair of neighbouring tokens on the same line is
    yielded as the string "left_right". Bigrams never span two lines.
    """
    with open(path, "rt") as infile:
        for line in infile:
            # split() without an argument collapses runs of whitespace, so
            # a removed punctuation mark surrounded by spaces does not
            # produce an empty-string token (and thus bogus "a_" / "_b"
            # bigrams).
            tokens = line.translate(_PUNCTUATION_TABLE).lower().split()
            for pair in zip(tokens, tokens[1:]):
                yield "_".join(pair)


# Pass 1: give every distinct bigram an id, then lock the vocabulary.
vocab = Vocabulary()
for bigram in _iter_bigrams(corpus):
    vocab.add(bigram)
vocab.freeze()

# Pass 2: count occurrences, once in a dense array indexed by bigram id and
# once in a dict keyed by the bigram string.
bigram_counts = defaultdict(int)
counts = np.zeros((len(vocab.id2token), ))
print(counts)
for bigram in _iter_bigrams(corpus):
    bigram_index = vocab.get(bigram)
    if bigram_index is not None:
        counts[bigram_index] += 1
    bigram_counts[bigram] += 1
print(counts)

# The selected entries are all 1.0, so their sum is the number of bigrams
# that occur exactly once.
print(counts[counts==1].sum())

# Preview of the first entries. The break is checked after printing, so ids
# 0 through 101 are shown when the vocabulary is large enough.
for bigram_id, bigram in vocab.id2token.items():
    print(bigram_id, bigram, "count:", counts[bigram_id])
    if bigram_id > 100:
        break
print(len(vocab.id2token))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment