Last active
March 30, 2024 20:37
-
-
Save joegr/45a47c10ea23cb5a393cbd5d29e5161a to your computer and use it in GitHub Desktop.
Big Little Language Model Code
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class Entity:
    """A uniquely-identified entity with a human-readable name and description."""

    def __init__(self, name, desc):
        self.id = uuid4()  # random UUID assigned once at creation
        self.name = name
        self.desc = desc

    def __str__(self):
        # Bug fix: __str__ must return a str; self.id is a uuid.UUID object,
        # so the original raised TypeError on str(entity).
        return str(self.id)

    def __repr__(self):
        # Bug fix: __repr__ must also return a str, not a UUID instance.
        return str(self.id)

    def _get(self):
        """Return the (id, name, desc) triple used by KB.add."""
        return self.id, self.name, self.desc
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import deque | |
def list_base_tokens(path=None):
    """Read a whitespace-separated word file and return its tokens as a list.

    Args:
        path: file to read. Defaults to the module-level ``words`` name the
            original code relied on (NOTE(review): no global ``words`` is
            defined anywhere visible — TODO confirm where it comes from).

    Returns:
        List of tokens, split on any whitespace (spaces and newlines alike,
        matching the original ``txt.split()`` behavior).
    """
    if path is None:
        path = words  # original behavior: read from the global `words`
    # The original appended each token of txt.split() to a list one by one;
    # split() already returns exactly that list.
    with open(path) as f:
        return f.read().split()
class KB:
    """In-memory knowledge base.

    Holds an entid -> name index, a token-occurrence vocabulary built from
    entity descriptions, and two tiers (primary / secondary) of per-entity
    alias sets, each capped at ``pool_lim`` aliases.
    """

    def __init__(self, vocab=None):
        self.pool_lim = 100  # max aliases stored per entity in each tier
        self.vx = CountVectorizer(analyzer="word", ngram_range=(6, 6))
        self.I = {}      # entid -> entity name
        self.nm = {}     # entity name -> vectorizer matrix (creation disabled below)
        self.vocab = {}  # token -> count of mentions in training descriptions
        self.pmem = {}   # entid -> set of 1st-level aliases
        self.smem = {}   # entid -> set of 2nd-level aliases
        # Bug fix: the original default `vocab=list_base_tokens()` performed
        # file I/O once at import time and then ignored the parameter anyway
        # (it re-called list_base_tokens() in the loop). Lazy-load instead.
        if vocab is None:
            vocab = list_base_tokens()
        for word in vocab:
            self.vocab[word] = 0

    def add(self, ent):
        """Register an entity and fold its name/description tokens into vocab.

        Entities with an empty name or empty description are ignored.
        """
        i, n, d = ent._get()
        print(i, n, d)
        if n == "" or d == "":
            return
        # Bug fix: the original joined name+desc with no separator, merging
        # the last word of the name into the first word of the description.
        new_toks = word_tokenize(" ".join([n, d]))
        print(new_toks)
        for token in new_toks:
            if token in stops:  # skip stopwords
                continue
            # Bug fix: a first sighting now counts as 1; the original stored 0,
            # so tokens seen exactly once never surfaced in __repr__.
            self.vocab[token] = self.vocab.get(token, 0) + 1
        # Index resolution: first writer wins for an entid.
        if i not in self.I:
            self.I[i] = n
        else:
            print("entid exists")
        # Name-matrix creation was intentionally disabled in the original:
        # if n not in self.nm:
        #     self.nm[n] = self.vx.fit_transform([n, d])

    def addL1_alias(self, entid, a):
        """Attach alias `a` to `entid` in primary (1st-level) memory."""
        if not self.idcheck(entid):
            print("entid does not exist, please add ent before alias")
            return
        self.memcheck()
        # Bug fix: np.array(dict.values()).flatten() does not yield the alias
        # strings in Python 3; test membership across the stored sets directly.
        if any(a in aliases for aliases in self.pmem.values()):
            entid = self.find(a)
            return "existing alias in memory tied to different entid ", entid
        # Bug fix: setdefault preserves aliases already stored for this entity
        # (the original reset the set to empty on every call).
        self.pmem.setdefault(entid, set()).add(a)
        return ("alias does not exist adding", a, " to ent ", entid)

    def addL2_alias(self, entid, a):
        """Attach alias `a` to `entid` in secondary (2nd-level) memory.

        Returns False if the alias already exists anywhere in secondary memory.
        """
        if not self.idcheck(entid):
            return None
        self.memcheck(l2=True)
        # Bug fix: the original wrote np.array(self.smem.values) — the bound
        # method object, missing call parens — so the check never worked.
        if any(a in aliases for aliases in self.smem.values()):
            return False
        self.smem.setdefault(entid, set()).add(a)
        return str(self.smem)

    def learn_ents_from_csv(self, path):
        """Create an Entity from each (name, desc) row of a CSV and add it."""
        df = pd.read_csv(path)
        df = df.fillna("")
        for tup in df.itertuples():
            # tup[0] is the DataFrame index; data columns start at tup[1].
            n = tup[1]
            d = tup[2]
            self.add(Entity(n, d))
        return "learned ", df.head()

    def ents(self):
        """String view of the entid -> name index."""
        return str(self.I)

    def aliases(self):
        """String view of every stored alias set, both tiers."""
        return str(list(self.pmem.values()) + list(self.smem.values()))

    def learn_aliases_from_csv(self, path):
        """Load (entid, alias) rows from a CSV, trying L1 then L2 storage."""
        df = pd.read_csv(path)
        # Bug fix: fillna returns a new DataFrame; the original discarded it.
        df = df.fillna("")
        for tup in df.itertuples():
            # NOTE(review): tup[0] is the DataFrame index, not the first CSV
            # column — confirm the entid really lives on the index (the
            # sibling learn_ents_from_csv reads columns from tup[1] onward).
            entid = tup[0]
            alias = tup[1]
            try:
                self.addL1_alias(entid, alias)
            except Exception as e:
                try:
                    self.addL2_alias(entid, alias)
                except Exception as ex:
                    return e, ex
        # Bug fix: DataFrame has no .rows() method; report the row count.
        return "learned, ", len(df)

    def summary(self):
        return "In Memory Knowledge Base initialized on Google's first 10,000 words and MDD linkages"

    def __repr__(self):
        # NOTE(review): plotting inside __repr__ is a side effect kept from
        # the original (handy in notebooks); consider a dedicated plot() method.
        plt.plot(np.array(list(self.vocab.values())))
        # Show only tokens actually observed in training descriptions.
        return str({k: v for k, v in self.vocab.items() if v > 0})

    def memcheck(self, l2=False):
        """Report the first entity whose alias set in the chosen tier is full.

        Returns a (False, message, entid) tuple when a set has reached
        pool_lim, otherwise None — matching the original's implicit return.
        """
        mem = self.smem if l2 else self.pmem
        tier = "secondary" if l2 else "primary"
        for eid, aliases in mem.items():
            if len(aliases) >= self.pool_lim:
                return False, "ERR: alias " + tier + " memory full at", eid

    def find(self, string):
        """Return the entid owning alias `string` (primary tier first)."""
        # Bug fix: the original did pmem.get(string) — a key lookup by alias
        # on an entid-keyed dict — and its second try block was unreachable
        # because dict.get never raises.
        for eid, aliases in self.pmem.items():
            if string in aliases:
                return eid
        for eid, aliases in self.smem.items():
            if string in aliases:
                return eid
        return "Token not found in primary alias memory"

    def idcheck(self, entid):
        """True iff `entid` has been added to the index."""
        return entid in self.I
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment