# FrankGrimm/bernoulli.py

## bernoulli.py
from collections import defaultdict
import numpy as np

# Scratch counter; nothing in the visible code reads or updates it.
choices = defaultdict(int)

# ---------------------------------------------------------------------------
# Design sketch (working notes, not executed).
#
# A sentence viewed as a list of adjacent token pairs, wrapped in
# start/end-of-sentence markers:
#
# sentence = [["[SOS]", "lorem"], ["lorem", "ipsum"], ["ipsum", "dolor"], ...]
# [SOS] "lorem" [EOS]
#
# Token positions:
#
#     0       1       2       3           4       5       6
# "[SOS]", "lorem", "ipsum", "dolor", "silir", "amet", "[EOS]"
#
# Position-tagged tokens:
#
# 0_[SOS], 1_lorem, 2_ipsum,
#
# -----
#
# The same sentence followed by [PAD] tokens:
#
#     0       1           2       3       4       5       6       7 ...
# ["[SOS]", "lorem", "ipsum", "dolor", "silir", "amet", "[EOS]", "[PAD]", "[PAD]", "[PAD]", "[PAD]"]
#
# Offsets of the preceding tokens relative to the slot marked <sample>:
#
#     -3       -2       -1
# ["[SOS]", "lorem", "ipsum", <sample>
#
# Offsets on both sides of the current token i:
#
#                     -2         -1   |    i    |   +1      +2
# ["[SOS]", "lorem", "ipsum", "dolor" | "silir" | "amet", "[EOS]", "[PAD]", "[PAD]", "[PAD]", "[PAD]"]
#                                         ^--- current
#
#
# The distinct symbols appearing above:
#
# {[SOS], [EOS], [PAD], lorem, ipsum, dolor, silir, amet}
# ---------------------------------------------------------------------------

# Toy sentence used to try out the counting below.
sentence = ["[SOS]", "lorem", "ipsum", "dolor", "silir", "amet", "[EOS]"]

# Unigram frequencies: a missing key starts at 0 thanks to defaultdict(int),
# so every token can simply be bumped by one.
unigrams = defaultdict(int)
for token in sentence:
    unigrams[token] = unigrams[token] + 1

print(unigrams)

# Empty bigram counter; it is not filled in by the statements shown here.
bigrams = defaultdict(int)


# corpus:
#     - doc_0
#     - doc_1
#     - doc_2
#     - doc_3
#     - doc_4
#
#     ------------ freeze and evaluate
#     [UNK]
#     - doc_5
#     - doc_6


class Vocabulary:
    """Two-way mapping between tokens and consecutive integer ids.

    Ids are assigned in insertion order, starting at 0.  After
    :meth:`freeze` has been called, :meth:`add` refuses any further
    modification while :meth:`get` keeps working.
    """

    def __init__(self):
        """Create an empty, unfrozen vocabulary."""
        self._frozen = False
        # id -> token and token -> id; add() keeps the two in sync.
        self.id2token = {}
        self.token2id = {}

    @property
    def frozen(self):
        """Whether :meth:`freeze` has been called on this vocabulary."""
        return self._frozen

    def freeze(self):
        """Disallow all further calls to :meth:`add`."""
        self._frozen = True

    def add(self, token):
        """Register *token* under the next free id.

        Args:
            token: Hashable key to register.

        Returns:
            ``True`` if the token was newly added, ``False`` if it was
            already present (its existing id is left untouched).

        Raises:
            RuntimeError: If the vocabulary is frozen.  The check comes
                first, so it is raised even for tokens already present.
        """
        if self.frozen:
            raise RuntimeError("cannot modify frozen vocabulary")
        if token in self.token2id:
            return False
        # The current size is the next unused id because ids are dense.
        newid = len(self.token2id)
        self.token2id[token] = newid
        self.id2token[newid] = token
        return True

    def get(self, token):
        """Return the id of *token*, or ``None`` if it is not registered."""
        return self.token2id.get(token)

import string

import numpy as np

corpus = "./snlp_worksheets/worksheet1/corpus.txt"

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def _tokenize(line):
    """Return the lower-cased, punctuation-free tokens of *line*."""
    # split() without an argument collapses whitespace runs.  Splitting on a
    # single " " would emit empty tokens wherever a removed punctuation mark
    # left two spaces behind ("a - b" -> "a  b"), creating bogus bigrams.
    return line.translate(_PUNCTUATION_TABLE).lower().split()


def _iter_bigrams(path):
    """Yield each adjacent token pair of every line in *path* as "left_right"."""
    # NOTE(review): assumes the corpus is UTF-8 encoded -- confirm.  Without
    # an explicit encoding the decoding depends on the platform locale.
    with open(path, "rt", encoding="utf-8") as infile:
        for line in infile:
            tokens = _tokenize(line)
            for left, right in zip(tokens, tokens[1:]):
                yield "_".join((left, right))


vocab = Vocabulary()

# Pass 1: register every bigram, then freeze the vocabulary so the id space
# (and therefore the size of the count vector) is fixed.
for bigram in _iter_bigrams(corpus):
    vocab.add(bigram)

vocab.freeze()

bigram_counts = defaultdict(int)
counts = np.zeros((len(vocab.id2token), ))
print(counts)

# Pass 2: count occurrences, both in the id-indexed vector and in a dict
# keyed by the bigram string.
for bigram in _iter_bigrams(corpus):
    bigram_index = vocab.get(bigram)
    # Bigrams unknown to the frozen vocabulary have no slot in `counts`.
    if bigram_index is not None:
        counts[bigram_index] += 1

    bigram_counts[bigram] += 1

print(counts)
# Every selected entry equals 1, so the sum is the number of bigrams that
# occur exactly once.
print(counts[counts == 1].sum())

# Preview of the first entries.  The cut-off is tested after printing, so
# ids 0 through 101 are shown.
for bigram_id, bigram in vocab.id2token.items():
    print(bigram_id, bigram, "count:", counts[bigram_id])

    if bigram_id > 100:
        break

print(len(vocab.id2token))
	from collections import defaultdict
	import numpy as np

	# Scratch counter; nothing in the visible code reads or updates it.
	choices = defaultdict(int)

	# Design sketch (working notes, not executed).
	#
	# sentence = [["[SOS]", "lorem"], ["lorem", "ipsum"], ["ipsum", "dolor"], ...]
	# [SOS] "lorem" [EOS]
	#
	#     0       1       2       3           4       5       6
	# "[SOS]", "lorem", "ipsum", "dolor", "silir", "amet", "[EOS]"
	#
	# 0_[SOS], 1_lorem, 2_ipsum,
	#
	# -----
	#
	#     0       1           2       3       4       5       6       7 ...
	# ["[SOS]", "lorem", "ipsum", "dolor", "silir", "amet", "[EOS]", "[PAD]", "[PAD]", "[PAD]", "[PAD]"]
	#
	#     -3       -2       -1
	# ["[SOS]", "lorem", "ipsum", <sample>
	#
	#                     -2         -1   |    i    |   +1      +2
	# ["[SOS]", "lorem", "ipsum", "dolor" | "silir" | "amet", "[EOS]", "[PAD]", "[PAD]", "[PAD]", "[PAD]"]
	#                                         ^--- current
	#
	#
	# {[SOS], [EOS], [PAD], lorem, ipsum, dolor, silir, amet}

	# Unigram frequencies; defaultdict(int) starts every missing key at 0.
	unigrams = defaultdict(int)

	# Toy sentence used to try out the counting below.
	sentence = ["[SOS]", "lorem", "ipsum", "dolor", "silir", "amet", "[EOS]"]

	for token in sentence:
	    unigrams[token] += 1

	print(unigrams)

	# Empty bigram counter; it is not filled in by the statements shown here.
	bigrams = defaultdict(int)


	# corpus:
	# - doc_0
	# - doc_1
	# - doc_2
	# - doc_3
	# - doc_4
	#
	# ------------ freeze and evaluate
	# [UNK]
	# - doc_5
	# - doc_6


	class Vocabulary:
	    """Two-way mapping between tokens and consecutive integer ids.

	    Ids are assigned in insertion order, starting at 0.  After
	    :meth:`freeze` has been called, :meth:`add` refuses any further
	    modification while :meth:`get` keeps working.
	    """

	    def __init__(self):
	        """Create an empty, unfrozen vocabulary."""
	        self._frozen = False
	        # id -> token and token -> id; add() keeps the two in sync.
	        self.id2token = {}
	        self.token2id = {}

	    @property
	    def frozen(self):
	        """Whether :meth:`freeze` has been called on this vocabulary."""
	        return self._frozen

	    def freeze(self):
	        """Disallow all further calls to :meth:`add`."""
	        self._frozen = True

	    def add(self, token):
	        """Register *token* under the next free id.

	        Args:
	            token: Hashable key to register.

	        Returns:
	            ``True`` if the token was newly added, ``False`` if it was
	            already present (its existing id is left untouched).

	        Raises:
	            RuntimeError: If the vocabulary is frozen.  The check comes
	                first, so it is raised even for tokens already present.
	        """
	        if self.frozen:
	            raise RuntimeError("cannot modify frozen vocabulary")
	        if token in self.token2id:
	            return False
	        # The current size is the next unused id because ids are dense.
	        newid = len(self.token2id)
	        self.token2id[token] = newid
	        self.id2token[newid] = token
	        return True

	    def get(self, token):
	        """Return the id of *token*, or ``None`` if it is not registered."""
	        return self.token2id.get(token)

	import string

	import numpy as np

	corpus = "./snlp_worksheets/worksheet1/corpus.txt"

	# Translation table that deletes every ASCII punctuation character.
	_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


	def _tokenize(line):
	    """Return the lower-cased, punctuation-free tokens of *line*."""
	    # split() without an argument collapses whitespace runs.  Splitting on
	    # a single " " would emit empty tokens wherever a removed punctuation
	    # mark left two spaces behind ("a - b" -> "a  b").
	    return line.translate(_PUNCTUATION_TABLE).lower().split()


	def _iter_bigrams(path):
	    """Yield each adjacent token pair of every line in *path* as "left_right"."""
	    # NOTE(review): assumes the corpus is UTF-8 encoded -- confirm.  Without
	    # an explicit encoding the decoding depends on the platform locale.
	    with open(path, "rt", encoding="utf-8") as infile:
	        for line in infile:
	            tokens = _tokenize(line)
	            for left, right in zip(tokens, tokens[1:]):
	                yield "_".join((left, right))


	vocab = Vocabulary()

	# Pass 1: register every bigram, then freeze the vocabulary so the id
	# space (and therefore the size of the count vector) is fixed.
	for bigram in _iter_bigrams(corpus):
	    vocab.add(bigram)

	vocab.freeze()

	bigram_counts = defaultdict(int)
	counts = np.zeros((len(vocab.id2token), ))
	print(counts)

	# Pass 2: count occurrences, both in the id-indexed vector and in a dict
	# keyed by the bigram string.
	for bigram in _iter_bigrams(corpus):
	    bigram_index = vocab.get(bigram)
	    # Bigrams unknown to the frozen vocabulary have no slot in `counts`.
	    if bigram_index is not None:
	        counts[bigram_index] += 1

	    bigram_counts[bigram] += 1

	print(counts)
	# Every selected entry equals 1, so the sum is the number of bigrams that
	# occur exactly once.
	print(counts[counts == 1].sum())

	# Preview of the first entries.  The cut-off is tested after printing, so
	# ids 0 through 101 are shown.
	for bigram_id, bigram in vocab.id2token.items():
	    print(bigram_id, bigram, "count:", counts[bigram_id])

	    if bigram_id > 100:
	        break

	print(len(vocab.id2token))